PyTorch in Practice: A Complete Guide to Building a Custom Corpus and Training a Model from Scratch
I. Introduction: Why Build a Custom Corpus?
In NLP tasks, public datasets (such as IMDB movie reviews or Wikipedia) are easy to obtain, but they often cannot meet the needs of specific scenarios (industry terminology, dialects, vertical-domain knowledge). Building a custom corpus lets you:
Train language models for a specific domain (e.g., medical or legal text).
Work around data-privacy constraints (e.g., internal enterprise data that cannot be released).
Improve a model's fit to niche or low-resource languages.
This article uses sentiment analysis as a running example to show how to train a PyTorch classification model on a custom corpus.
II. The Complete Workflow for Building a Custom Corpus
1. Data Collection and Preparation
Data sources
Public data: crawl vertical-domain websites (e.g., Zhihu industry topics, Douban book reviews).
Private data: enterprise logs, customer-service transcripts (must be anonymized first).
Manual annotation: use a tool such as Label Studio to assign sentiment labels (positive/negative).
Data format
Save the texts and labels as a CSV file, for example:
```csv
text,label
"This product is really easy to use",1
"The service attitude was terrible",0
```
2. Text Preprocessing
Step 1: Load the data
```python
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the data
df = pd.read_csv("custom_corpus.csv")
texts = df["text"].tolist()
labels = df["label"].tolist()

# Split into training and validation sets
# (a held-out test set can be carved out the same way)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)
```
Step 2: Clean the text
```python
import re
import string

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# NLTK resources are downloaded once (no-op if already present)
nltk.download("punkt", quiet=True)
nltk.download("stopwords", quiet=True)

def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove special characters and digits (keeps ASCII letters only)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    # Tokenize, then remove stopwords and punctuation
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in tokens
              if word not in stop_words and word not in string.punctuation]
    return " ".join(tokens)

# Apply the cleaning
train_texts_clean = [preprocess_text(text) for text in train_texts]
val_texts_clean = [preprocess_text(text) for text in val_texts]
```
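Note that the regex above keeps only ASCII letters, so this cleaning step suits English text. For a Chinese corpus you would segment words first instead; below is a minimal sketch using the third-party jieba library (an assumption for illustration, not part of this article's pipeline; Section III also mentions sentencepiece as an option):

```python
import re
import jieba  # pip install jieba; hypothetical choice for this sketch

def preprocess_text_zh(text):
    # Keep CJK characters, letters and digits; drop punctuation
    text = re.sub(r"[^\u4e00-\u9fa5a-zA-Z0-9]", "", text)
    # Segment into words (Chinese has no whitespace between words)
    return " ".join(jieba.lcut(text))

print(preprocess_text_zh("这个产品非常好用!"))  # e.g. "这个 产品 非常 好用"
```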
Step 3: Build the vocabulary
```python
from collections import Counter

# Count word frequencies in the training set
vocab = Counter()
for text in train_texts_clean:
    vocab.update(text.split())

# Vocabulary size and special tokens
vocab_size = 10000        # keep only the 10,000 most frequent words
UNK_TOKEN = "<UNK>"       # unknown word
PAD_TOKEN = "<PAD>"       # padding token

# Map words to indices; most_common() sorts by frequency,
# and indices 0 and 1 are reserved for the special tokens
word2idx = {word: idx + 2
            for idx, (word, _) in enumerate(vocab.most_common(vocab_size - 2))}
word2idx[UNK_TOKEN] = 0
word2idx[PAD_TOKEN] = 1

# Reverse mapping from index to word
idx2word = {idx: word for word, idx in word2idx.items()}
```
Step 4: Vectorize the text
```python
import torch

def text_to_sequence(text, word2idx, max_len=50):
    tokens = text.split()
    sequence = [word2idx.get(token, word2idx[UNK_TOKEN]) for token in tokens]
    # Pad or truncate to a fixed length
    if len(sequence) < max_len:
        sequence += [word2idx[PAD_TOKEN]] * (max_len - len(sequence))
    else:
        sequence = sequence[:max_len]
    return sequence

# Convert all texts
max_len = 50
X_train = [text_to_sequence(text, word2idx, max_len) for text in train_texts_clean]
X_val = [text_to_sequence(text, word2idx, max_len) for text in val_texts_clean]

# Convert to PyTorch tensors
X_train_tensor = torch.LongTensor(X_train)
X_val_tensor = torch.LongTensor(X_val)
y_train_tensor = torch.LongTensor(train_labels)
y_val_tensor = torch.LongTensor(val_labels)
```
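The training loop later in this article slices these tensors by hand. An equivalent and more idiomatic option is to wrap them in a TensorDataset and let a DataLoader handle batching and shuffling; a minimal sketch:

```python
from torch.utils.data import TensorDataset, DataLoader

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

for batch_X, batch_y in train_loader:
    ...  # one forward/backward pass per batch, as in the loop below
```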
3. Define the Neural Network Model
Model architecture
Use TextCNN (a convolutional neural network) for text classification:
```python
import torch.nn as nn
import torch.nn.functional as F

class TextCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes,
                 kernel_sizes=(3, 4, 5), num_filters=100):
        super(TextCNN, self).__init__()
        # padding_idx=1 matches word2idx[PAD_TOKEN]
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=1)
        # One conv layer per kernel size, each producing num_filters features
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (k, embed_dim)) for k in kernel_sizes
        ])
        self.fc = nn.Linear(len(kernel_sizes) * num_filters, num_classes)

    def forward(self, x):
        # x: [batch_size, seq_len]
        x = self.embedding(x)  # [batch_size, seq_len, embed_dim]
        x = x.unsqueeze(1)     # [batch_size, 1, seq_len, embed_dim]

        # Apply each kernel size, then max-pool over the time dimension
        conv_outputs = []
        for conv in self.convs:
            out = conv(x)                         # [batch_size, num_filters, seq_len-k+1, 1]
            out = F.relu(out.squeeze(3))          # [batch_size, num_filters, seq_len-k+1]
            out = F.max_pool1d(out, out.size(2))  # [batch_size, num_filters, 1]
            conv_outputs.append(out.squeeze(2))   # [batch_size, num_filters]

        # Concatenate the pooled features from all kernel sizes
        x = torch.cat(conv_outputs, dim=1)  # [batch_size, len(kernel_sizes) * num_filters]
        x = self.fc(x)                      # [batch_size, num_classes]
        return x
```
Initialize the model
```python
vocab_size = len(word2idx)
embed_dim = 128
num_classes = 2  # binary classification
model = TextCNN(vocab_size, embed_dim, num_classes)
```
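Before training, a quick forward pass on a dummy batch helps catch shape mistakes early; a small sanity-check sketch (not in the original article):

```python
dummy = torch.randint(0, vocab_size, (4, max_len))  # fake batch of 4 sequences
print(model(dummy).shape)  # expected: torch.Size([4, 2])
```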
4. Train the Model
Define the loss function and optimizer
```python
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
```
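The summary at the end of this article mentions learning-rate scheduling as a training technique; a minimal sketch using PyTorch's built-in StepLR (the step_size and gamma values are illustrative assumptions):

```python
# Halve the learning rate every 5 epochs; call scheduler.step() once per epoch
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)
```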
Training loop
```python
def train_model(model, X_train, y_train, X_val, y_val, epochs=10, batch_size=32):
    for epoch in range(epochs):
        model.train()  # re-enable training mode (evaluate_model switches to eval)
        total_loss = 0
        num_batches = 0
        for i in range(0, len(X_train), batch_size):
            # Fetch one mini-batch
            batch_X = X_train[i:i+batch_size]
            batch_y = y_train[i:i+batch_size]

            # Forward pass
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        # Evaluate on the validation set
        val_loss, val_acc = evaluate_model(model, X_val, y_val, batch_size)
        print(f"Epoch {epoch+1}, Train Loss: {total_loss/num_batches:.4f}, "
              f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

def evaluate_model(model, X_val, y_val, batch_size):
    model.eval()
    total_loss = 0
    num_batches = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for i in range(0, len(X_val), batch_size):
            batch_X = X_val[i:i+batch_size]
            batch_y = y_val[i:i+batch_size]
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            total_loss += loss.item()
            num_batches += 1
            _, predicted = torch.max(outputs, 1)
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()
    accuracy = correct / total
    return total_loss/num_batches, accuracy

# Start training
train_model(model, X_train_tensor, y_train_tensor, X_val_tensor, y_val_tensor)
```
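The loop above runs on the CPU. To train on a GPU, move the model and each batch to the same device; a minimal sketch, assuming the rest of the loop is unchanged:

```python
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Inside the batch loop, before the forward pass:
# batch_X, batch_y = batch_X.to(device), batch_y.to(device)
```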
5. Model Evaluation and Optimization
Evaluation metrics
Accuracy: the proportion of samples classified correctly.
F1 score: balances precision and recall (useful for class-imbalanced data); a computation sketch follows this list.
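A minimal sketch computing both metrics on the validation set with scikit-learn (already a dependency via train_test_split above):

```python
from sklearn.metrics import accuracy_score, f1_score

model.eval()
with torch.no_grad():
    preds = model(X_val_tensor).argmax(dim=1)

print("Accuracy:", accuracy_score(y_val_tensor.numpy(), preds.numpy()))
print("F1:", f1_score(y_val_tensor.numpy(), preds.numpy()))
```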
Optimization directions
Hyperparameter tuning: adjust the learning rate, batch size, and convolution kernel sizes.
Model improvements: try more expressive architectures such as LSTM or BERT.
Data augmentation: expand the dataset with synonym replacement or back translation, as in the sketch after this list.
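A minimal synonym-replacement sketch; the tiny synonym table here is a made-up placeholder, and in practice you would draw candidates from WordNet or a domain thesaurus:

```python
import random

# Hypothetical synonym table for illustration only
SYNONYMS = {"good": ["great", "nice"], "bad": ["poor", "awful"]}

def synonym_replace(text, p=0.3):
    # Replace each known word with a random synonym with probability p
    tokens = [random.choice(SYNONYMS[t]) if t in SYNONYMS and random.random() < p else t
              for t in text.split()]
    return " ".join(tokens)

print(synonym_replace("good product bad service"))
```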
III. Summary and Extensions
Key takeaways
Data preprocessing: cleaning, tokenization, and vocabulary construction are the foundation.
Model design: TextCNN suits short-text classification; LSTM/Transformer suit longer text.
Training techniques: mini-batch training, learning-rate scheduling, and early stopping to prevent overfitting (see the sketch after this list).
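A minimal early-stopping sketch built around the evaluate_model function defined earlier (the patience value is an illustrative assumption):

```python
best_val_loss = float("inf")
patience, bad_epochs = 3, 0

for epoch in range(50):
    # ... run one training epoch over X_train_tensor, as in train_model ...
    val_loss, _ = evaluate_model(model, X_val_tensor, y_val_tensor, batch_size=32)
    if val_loss < best_val_loss:
        best_val_loss, bad_epochs = val_loss, 0
        torch.save(model.state_dict(), "best_model.pt")  # keep the best checkpoint
    else:
        bad_epochs += 1
        if bad_epochs >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break
```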
Extensions
Multilingual support: use the sentencepiece tokenizer to handle languages that are not whitespace-delimited (such as Chinese).
Deployment: export the model to TorchScript format and integrate it into a Flask/FastAPI backend; a minimal export sketch follows.
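A minimal sketch of the TorchScript export via tracing (the file name is an assumption):

```python
model.eval()
example_input = torch.randint(0, vocab_size, (1, max_len))
traced = torch.jit.trace(model, example_input)
traced.save("textcnn.pt")

# Later, e.g. inside a Flask/FastAPI request handler:
loaded = torch.jit.load("textcnn.pt")
logits = loaded(example_input)
```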