Coggle 30 Days of ML（23年7月）任务十：使用Bert在比赛数据集中完成预训练

2024 年 8 月 23 日 6点热度 0人点赞 0条评论

Coggle 30 Days of ML（23年7月）任务十：使用Bert在比赛数据集中完成预训练

任务十：使用Bert在比赛数据集中完成预训练

说明：在这个任务中，你将使用Bert模型在比赛数据集上完成预训练，通过预训练的Bert模型来提取文本特征。
实践步骤：
1. 准备比赛数据集和相应的预训练参数。
2. 使用transformers库中的Bert模型，加载预训练参数。
3. 使用Bert模型对比赛数据集进行预训练，提取文本特征。

加载预训练模型

在任务九的时候，我们已经介绍过transformers库中提供了BERT模型的预训练参数，所以我们可以直接加载这些预训练参数。

具体的模型选择可以参考：https://huggingface.co/transformers/v3.0.2/pretrained_models.html?highlight=pretrained

# Load the pretrained BERT tokenizer and sequence-classification model.
# The target device is resolved here because the model is moved onto it
# immediately below; without this line `device` would not exist yet.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# num_labels=2 configures a two-class classification head on top of BERT.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(device)

定义数据集

接下来使用tokenizer对文本进行编码：添加特殊符号，并填充或截断到固定长度（128），最后返回input_ids、attention_mask和label。

# Custom dataset that tokenizes one text per item.
class CustomDataset(Dataset):
    """Map-style dataset yielding ``(input_ids, attention_mask, label)``.

    Args:
        texts: Indexable collection of raw strings. A pandas Series is
            indexed by position (``.iloc``); lists and NumPy arrays are
            indexed with ``[]``.
        labels: Indexable collection of integer class ids aligned with
            ``texts`` (same indexing rules).
        tokenizer: Object exposing ``encode_plus`` that returns a mapping
            with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every sequence is padded/truncated to.

    Each item keeps the leading axis of size 1 that the tokenizer adds with
    ``return_tensors='pt'``, so ``input_ids`` and ``attention_mask`` have
    shape ``(1, max_length)``. Tensors are returned on the CPU; moving a
    batch to the training device is the responsibility of the loop that
    consumes the DataLoader.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    @staticmethod
    def _item_at(values, idx):
        """Return the element at position ``idx`` of a Series, list or array."""
        # pandas objects must be indexed by position, not by index label.
        if hasattr(values, 'iloc'):
            return values.iloc[idx]
        return values[idx]

    def __getitem__(self, idx):
        text = self._item_at(self.texts, idx)
        label = self._item_at(self.labels, idx)
        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            padding='max_length',
            max_length=self.max_length,
            truncation=True,
            return_tensors='pt',
        )
        # Class ids are always emitted as int64 so the classification loss
        # receives the same dtype whether the labels were stored as ints or
        # as floats (e.g. placeholder labels built with np.zeros).
        label_tensor = torch.tensor(int(label), dtype=torch.long)
        return encoded['input_ids'], encoded['attention_mask'], label_tensor

定义训练和验证函数

定义训练和验证函数，训练函数将模型设置为训练模式并使用AdamW优化器进行模型参数更新，验证函数将模型设置为评估模式并计算验证数据集上的损失和准确率。

# Training routine (one pass over the training data).
def train(model, train_loader, optimizer=None):
    """Run one training pass over ``train_loader``.

    Args:
        model: Model whose forward accepts ``input_ids``, ``attention_mask``
            and ``labels`` and returns an object with a ``loss`` attribute.
        train_loader: DataLoader yielding ``(input_ids, attention_mask,
            labels)`` batches.
        optimizer: Optional optimizer to reuse across calls. When omitted, a
            fresh ``AdamW`` with ``lr=1e-5`` is created for this call, which
            means its internal state does not carry over between epochs.

    Returns:
        Mean training loss over the batches seen, or ``0.0`` when the loader
        yields no batches.
    """
    model.train()
    if optimizer is None:
        optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
    # Follow the model's own placement instead of relying on a global.
    device = next(model.parameters()).device

    running_loss = 0.0
    num_batches = 0
    for batch in tqdm(train_loader, desc='Training'):
        input_ids, attention_mask, labels = batch
        # Flatten to (batch, seq_len). A bare .squeeze() would also remove
        # the batch axis when a batch holds a single sample, which makes the
        # model's forward pass fail.
        input_ids = input_ids.reshape(input_ids.size(0), -1).to(device)
        attention_mask = attention_mask.reshape(attention_mask.size(0), -1).to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    return running_loss / num_batches if num_batches else 0.0

def evaluate(model, val_loader):
    """Evaluate ``model`` on ``val_loader`` without computing gradients.

    Args:
        model: Model whose forward accepts ``input_ids``, ``attention_mask``
            and ``labels`` and returns an object with ``loss`` and ``logits``.
        val_loader: DataLoader yielding ``(input_ids, attention_mask,
            labels)`` batches.

    Returns:
        val_loss: Sum of the per-batch losses (not an average).
        accuracy: Fraction of correctly classified samples, or ``0.0`` when
            the loader yields no samples.
    """
    model.eval()
    # Follow the model's own placement instead of relying on a global.
    device = next(model.parameters()).device
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc='Evaluating'):
            input_ids, attention_mask, labels = batch
            # Flatten to (batch, seq_len). A bare .squeeze() would also drop
            # the batch axis for a single-sample batch and break the forward
            # pass.
            input_ids = input_ids.reshape(input_ids.size(0), -1).to(device)
            attention_mask = attention_mask.reshape(attention_mask.size(0), -1).to(device)
            labels = labels.to(device)

            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            labels=labels)
            val_loss += outputs.loss.item()
            # The predicted class is the index of the largest logit.
            predicted = outputs.logits.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    # Guard against an empty validation set.
    accuracy = correct / total if total else 0.0
    return val_loss, accuracy

训练和验证模型

使用Bert模型对比赛数据集进行预训练，提取文本特征，训练10个epoch，验证模型并保存准确率最高的模型。

# Training and validation driver.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Move the model onto the selected device.
model.to(device)

num_epochs = 10
# Start below any reachable accuracy so the first epoch always writes a
# checkpoint, even if validation accuracy is 0.
best_accuracy = -1.0
for epoch in range(num_epochs):
    train(model, train_loader)
    val_loss, accuracy = evaluate(model, val_loader)
    # The f-string is kept on one line: a single-quoted f-string split across
    # lines is a syntax error before Python 3.12.
    print(f'Epoch {epoch+1}: Validation Loss = {val_loss:.4f}, Accuracy = {accuracy:.4f}')
    # Keep only the weights of the best-scoring epoch.
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        torch.save(model.state_dict(), 'bert_model.pth')

模型预测

加载最佳模型进行预测并保存结果为CSV文件。

# Load the best checkpoint and run inference on the test set.
# map_location lets a checkpoint saved on GPU be loaded on a CPU-only machine.
model.load_state_dict(torch.load('bert_model.pth', map_location=device))
model.to(device)
model.eval()

# Build the test dataset. The dataset requires labels, so integer zeros are
# supplied as placeholders; they are a pandas Series because CustomDataset
# indexes its inputs with .iloc.
test_texts = test_data['content']
test_labels = pd.Series(np.zeros(test_data.shape[0], dtype=int))
test_dataset = CustomDataset(test_texts, test_labels, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

predictions = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc='Predicting'):
        # Each batch is (input_ids, attention_mask, label); the placeholder
        # label is ignored.
        input_ids, attention_mask, _ = batch
        # Flatten to (batch, seq_len). A bare .squeeze() with batch_size=1
        # would drop the batch axis and break the forward pass.
        input_ids = input_ids.reshape(input_ids.size(0), -1).to(device)
        attention_mask = attention_mask.reshape(attention_mask.size(0), -1).to(device)
        outputs = model(input_ids=input_ids,
                        attention_mask=attention_mask)
        predicted = outputs.logits.argmax(dim=1)
        predictions.extend(predicted.tolist())

# Write the predictions into the submission template.
submit = pd.read_csv('sample_submit.csv')
submit['label'] = predictions
submit.to_csv('bert_predictions.csv', index=False)