Overview: This article walks through the core code of the Transformer model using a hands-on English-to-Chinese machine translation example. It explains the implementation details of key modules such as the attention mechanism, the encoder-decoder architecture, and positional encoding, and provides a complete training pipeline along with optimization suggestions.
The Transformer fundamentally changed how sequence-to-sequence (Seq2Seq) tasks are implemented. Its core innovation is to discard recurrent neural network (RNN) structures entirely and rely on self-attention instead, which allows the whole sequence to be processed in parallel. In an English-to-Chinese translation task, the encoder processes the English source sentence and the decoder generates the Chinese target sentence; the two exchange information through multi-head (cross-)attention.
Self-attention is the core component of the Transformer. Its computation can be broken into three steps: project the input into query, key, and value vectors; compute scaled dot-product scores between queries and keys; and use the softmax-normalized scores to take a weighted sum of the values. The multi-head implementation below follows these steps:
```python
import torch
import torch.nn as nn


class MultiHeadAttention(nn.Module):
    def __init__(self, embed_size, heads):
        super().__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        # Linear projection matrices for values, keys, and queries (applied per head)
        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.fc_out = nn.Linear(heads * self.head_dim, embed_size)

    def forward(self, values, keys, query, mask):
        N = query.shape[0]  # batch size
        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]

        # Split the embedding dimension into multiple heads
        values = values.reshape(N, value_len, self.heads, self.head_dim)
        keys = keys.reshape(N, key_len, self.heads, self.head_dim)
        queries = query.reshape(N, query_len, self.heads, self.head_dim)

        # Per-head linear projections
        values = self.values(values)
        keys = self.keys(keys)
        queries = self.queries(queries)

        # Attention scores: dot products between queries and keys
        # energy shape: (N, heads, query_len, key_len)
        energy = torch.einsum("nqhd,nkhd->nhqk", [queries, keys])
        if mask is not None:
            energy = energy.masked_fill(mask == 0, float("-1e20"))

        # Scale and normalize to obtain attention weights
        attention = torch.softmax(energy / (self.embed_size ** (1 / 2)), dim=3)

        # Weighted sum over the values, then merge the heads back together
        out = torch.einsum("nhql,nlhd->nqhd", [attention, values])
        out = out.reshape(N, query_len, self.heads * self.head_dim)

        # Output projection
        out = self.fc_out(out)
        return out
```
In this code, torch.einsum performs the batched matrix multiplications efficiently: energy holds the dot products between query and key vectors, masked_fill blanks out padded positions, and softmax normalizes the scores into attention weights. Running several heads in parallel lets the model attend to information at different positions simultaneously, which improves translation quality.
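A minimal shape check of the module above (the batch size, sequence length, and embedding size here are illustrative, not values from the article):

```python
# Self-attention: queries, keys, and values all come from the same tensor
attn = MultiHeadAttention(embed_size=256, heads=8)
x = torch.randn(2, 10, 256)       # (batch, seq_len, embed_size)
out = attn(x, x, x, mask=None)    # no padding mask in this toy example
print(out.shape)                  # torch.Size([2, 10, 256])
```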
Because the Transformer has no recurrence, it carries no built-in notion of token order; positional encoding is therefore added to inject sequence-order information:
```python
import math


class PositionalEncoding(nn.Module):
    def __init__(self, embed_size, max_len=5000):
        super().__init__()
        self.embed_size = embed_size
        self.max_len = max_len

    def forward(self, x):
        # x shape: (batch, seq_len, embed_size)
        seq_len = x.shape[1]
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, self.embed_size, 2).float()
            * (-math.log(10000.0) / self.embed_size)
        )
        pe = torch.zeros(seq_len, self.embed_size)
        pe[:, 0::2] = torch.sin(position * div_term)  # even dimensions
        pe[:, 1::2] = torch.cos(position * div_term)  # odd dimensions
        return x + pe.unsqueeze(0).to(x.device)
```
The combination of sine and cosine terms at geometrically spaced frequencies lets the model reason about relative positions: each dimension oscillates with a different wavelength, so the encoding of position pos + k can be expressed as a linear function of the encoding of pos. The exponential factor in div_term is what spreads those wavelengths from 2π up to 10000·2π.
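For reference, the sinusoidal encoding computed by the code above corresponds to the formulas from the original "Attention Is All You Need" paper:

$$
\mathrm{PE}_{(pos,\,2i)} = \sin\!\left(\frac{pos}{10000^{2i/d_{\text{model}}}}\right), \qquad
\mathrm{PE}_{(pos,\,2i+1)} = \cos\!\left(\frac{pos}{10000^{2i/d_{\text{model}}}}\right)
$$

where $pos$ is the token position, $i$ indexes pairs of embedding dimensions, and $d_{\text{model}}$ corresponds to embed_size in the code.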
A complete English-to-Chinese translation system consists of three core stages: data preprocessing, model construction, and training optimization.
## 2.1 Data Preprocessing

```python
from tokenizers import ByteLevelBPETokenizer  # HuggingFace byte-level BPE implementation

# Train a BPE tokenizer on the English side of the corpus
tokenizer = ByteLevelBPETokenizer()
tokenizer.train_from_iterator([" ".join(sent) for sent in en_sentences], vocab_size=30000)
tokenizer.save_model("bpe_tokenizer")

def tokenize_en(text):
    return tokenizer.encode(text).ids

def tokenize_zh(text):
    # Chinese requires its own word segmentation and is handled separately
    pass
```
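The Chinese tokenizer is left as a stub above. As a minimal sketch, assuming jieba is used for word segmentation and that a word-to-id vocab dictionary has been built from the training corpus (neither appears in the original pipeline), tokenize_zh could look like this:

```python
import jieba  # assumed segmentation library, not part of the original code

def tokenize_zh(text, vocab, unk_id=1):
    # Segment the Chinese sentence into words, then map each word to its id;
    # out-of-vocabulary words fall back to unk_id.
    return [vocab.get(token, unk_id) for token in jieba.lcut(text)]
```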
## 2.2 Complete Model Architecture

```python
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, trg_vocab_size, src_pad_idx,
                 embed_size=256, num_layers=6, heads=8, forward_expansion=4,
                 dropout=0.1, max_length=100):
        super().__init__()
        self.src_word_embedding = nn.Embedding(src_vocab_size, embed_size)
        self.src_position_embedding = PositionalEncoding(embed_size, max_length)
        self.trg_word_embedding = nn.Embedding(trg_vocab_size, embed_size)
        self.trg_position_embedding = PositionalEncoding(embed_size, max_length)

        self.encoder = Encoder(embed_size, num_layers, heads, forward_expansion, dropout, max_length)
        self.decoder = Decoder(embed_size, num_layers, heads, forward_expansion, dropout, max_length)
        self.fc_out = nn.Linear(embed_size, trg_vocab_size)
        self.src_pad_idx = src_pad_idx

    def make_src_mask(self, src):
        # Padding mask: 1 for real tokens, 0 for padding
        # shape (N, 1, 1, src_len), broadcastable over heads and query positions
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        return src_mask

    def make_trg_mask(self, trg):
        # Causal (look-ahead) mask: each target position may only attend to itself
        # and earlier positions
        N, trg_len = trg.shape
        trg_mask = torch.tril(torch.ones(trg_len, trg_len, device=trg.device))
        return trg_mask.expand(N, 1, trg_len, trg_len)

    def forward(self, src, trg):
        # src, trg shape: (batch, seq_len) of token ids
        # Build the masks from the raw token ids, before embedding
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)

        src = self.src_word_embedding(src)
        src = self.src_position_embedding(src)
        trg = self.trg_word_embedding(trg)
        trg = self.trg_position_embedding(trg)

        enc_src = self.encoder(src, src_mask)
        out = self.decoder(trg, enc_src, src_mask, trg_mask)
        out = self.fc_out(out)
        return out
```
The model uses a 6-layer encoder-decoder structure, where every layer combines multi-head attention, residual connections, and layer normalization. make_src_mask builds the padding mask so the model never attends to padded positions, while make_trg_mask builds the causal (look-ahead) mask that prevents the decoder from peeking at future target tokens.
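A quick smoke test of the architecture, assuming the Encoder and Decoder stacks referenced in the constructor are defined elsewhere; the vocabulary sizes, pad index, and tensor shapes below are placeholder values, not from the article:

```python
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Placeholder hyperparameters for illustration only
model = Transformer(src_vocab_size=30000, trg_vocab_size=32000, src_pad_idx=0).to(device)

src = torch.randint(1, 30000, (8, 20), device=device)  # (batch, src_len) English token ids
trg = torch.randint(1, 32000, (8, 22), device=device)  # (batch, trg_len) Chinese token ids
logits = model(src, trg)
print(logits.shape)  # torch.Size([8, 22, 32000]): one distribution per target position
```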
## 2.3 Training Optimization

Loss function: cross-entropy with label smoothing of 0.1, which discourages over-confident predictions on single tokens.

```python
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
```
Learning-rate scheduling: a Noam-style optimizer wrapper adjusts the learning rate dynamically, increasing it linearly during warm-up and then decaying it in proportion to the inverse square root of the step number.
```python
class NoamOpt:
    """Optimizer wrapper implementing the Noam learning-rate schedule."""

    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._step = 0

    def step(self):
        # Update the learning rate, then take an optimizer step
        self._step += 1
        rate = self.calculate_rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self.optimizer.step()

    def calculate_rate(self):
        # lr = factor * d_model^-0.5 * min(step^-0.5, step * warmup^-1.5)
        return self.factor * (
            self.model_size ** (-0.5)
            * min(self._step ** (-0.5), self._step * self.warmup ** (-1.5))
        )
```
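A sketch of how the loss and the Noam wrapper fit together in one training step. The Adam hyperparameters and warmup value follow the original paper's setup, while dataloader and the tensor shapes are assumed placeholders:

```python
optimizer = NoamOpt(
    model_size=256, factor=1.0, warmup=4000,
    optimizer=torch.optim.Adam(model.parameters(), lr=0.0, betas=(0.9, 0.98), eps=1e-9),
)

for src, trg in dataloader:                  # assumed (batch, seq_len) id tensors
    optimizer.optimizer.zero_grad()
    # Teacher forcing: feed trg without its last token, predict trg shifted by one
    output = model(src, trg[:, :-1])         # (batch, trg_len - 1, trg_vocab_size)
    loss = criterion(output.reshape(-1, output.shape[-1]), trg[:, 1:].reshape(-1))
    loss.backward()
    optimizer.step()
```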
```python
# Model export example (src_tensor / trg_tensor are example input batches)
traced_model = torch.jit.trace(model, (src_tensor, trg_tensor))
traced_model.save("transformer.pt")
```
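The exported TorchScript file can then be loaded for inference without the original Python class definitions (the path and tensors here mirror the export example above):

```python
loaded = torch.jit.load("transformer.pt")
loaded.eval()
with torch.no_grad():
    logits = loaded(src_tensor, trg_tensor)  # same example inputs as used for tracing
```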
In real-world deployments it is worth layering further optimizations on top of this baseline, for example mixed-precision training to cut memory use and speed up training, beam-search decoding to improve output quality, and quantization or distillation to shrink the deployed model.
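As one concrete illustration of such an optimization (a sketch, not part of the original text), a mixed-precision training step with torch.cuda.amp might look like this; the plain Adam optimizer and learning rate below are placeholders:

```python
from torch.cuda.amp import GradScaler, autocast

adam = torch.optim.Adam(model.parameters(), lr=3e-4)  # plain Adam just for this sketch
scaler = GradScaler()

for src, trg in dataloader:                           # assumed id tensors on a CUDA device
    adam.zero_grad()
    with autocast():                                  # forward pass runs in float16 where safe
        output = model(src, trg[:, :-1])
        loss = criterion(output.reshape(-1, output.shape[-1]), trg[:, 1:].reshape(-1))
    scaler.scale(loss).backward()                     # scale loss to avoid fp16 gradient underflow
    scaler.step(adam)                                 # unscales gradients, then steps the optimizer
    scaler.update()
```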
The Transformer delivers strong performance on English-to-Chinese translation, and a solid grasp of its code-level mechanics makes it much easier for developers to tune and deploy the model. The implementation walkthrough and optimization suggestions in this article provide a practical foundation for building production-grade machine translation systems.