Fine-Tuning GPT-2 on the IMDb Dataset
IMDb dataset
The code below uses the IMDb Large Movie Review Dataset, distributed as the aclImdb archive.
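If the archive is not already available locally, it can be fetched and unpacked directly in Python. This is a minimal sketch; the download URL is the commonly used Stanford host for this dataset, so treat it as an assumption and verify it before relying on it:

import tarfile
import urllib.request

# Commonly used host for the aclImdb archive (assumption: verify before use)
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
archive_path = "aclImdb_v1.tar.gz"

# Download the archive, then extract it into the current directory,
# producing the aclImdb/ folder used by the scripts below
urllib.request.urlretrieve(url, archive_path)
with tarfile.open(archive_path, "r:gz") as tar:
    tar.extractall()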
Dataset aggregation code
import os

# Directory containing the positive training reviews from the IMDb dataset
dataset_dir = "aclImdb/train/pos"

# Read every review text file and combine the contents into one long string,
# separating individual reviews with newlines
texts = []
for file_name in os.listdir(dataset_dir):
    if file_name.endswith(".txt"):
        with open(os.path.join(dataset_dir, file_name), "r", encoding="utf-8") as file:
            texts.append(file.read())
combined_text = "\n".join(texts)

# Save the combined text to a single file
with open("combined_text.txt", "w", encoding="utf-8") as file:
    file.write(combined_text)
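As an optional sanity check, the number of reviews written out can be compared against the 12,500 positive training reviews in the standard split. This sketch assumes the aggregation script above has run and that each review file contains no internal newlines, which holds for the standard aclImdb files:

# Optional sanity check: count the reviews written to combined_text.txt
with open("combined_text.txt", "r", encoding="utf-8") as file:
    reviews = file.read().split("\n")
num_reviews = len([r for r in reviews if r.strip()])
print(f"Combined {num_reviews} reviews")  # expected: 12500 for the standard split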
Fine-tuning code
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

# Load the pretrained GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Build a TextDataset from the combined review file.
# Note: TextDataset is deprecated in recent versions of transformers
# (the datasets library is the recommended replacement), but it still works here.
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="combined_text.txt",  # path to the combined text file
    block_size=128,                 # maximum length of each training sample
)

# Configure the training run
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned-imdb",  # where the fine-tuned model is saved
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

# Create the Trainer; mlm=False selects causal (left-to-right) language
# modeling, which is the objective GPT-2 is trained with
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False
    ),
    train_dataset=dataset,
)

# Run fine-tuning
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained("./gpt2-finetuned-imdb")
tokenizer.save_pretrained("./gpt2-finetuned-imdb")
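After training, the fine-tuned model can be loaded back and sampled from to confirm it picked up the style of the reviews. A minimal sketch; the prompt and the generation hyperparameters are illustrative assumptions, not values from this tutorial:

from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the fine-tuned model and tokenizer saved above
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-finetuned-imdb")
model = GPT2LMHeadModel.from_pretrained("./gpt2-finetuned-imdb")
model.eval()

# Encode an illustrative prompt and sample a continuation
prompt = "This movie was"
input_ids = tokenizer.encode(prompt, return_tensors="pt")
output_ids = model.generate(
    input_ids,
    max_length=60,   # total length of prompt plus continuation
    do_sample=True,  # sample instead of greedy decoding
    top_k=50,        # illustrative sampling hyperparameters
    top_p=0.95,
    pad_token_id=tokenizer.eos_token_id,  # GPT-2 has no dedicated pad token
)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))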