Building a Transformer Architecture from 0 to 1

Architecture diagram
This article covers: 1) the word embedding in the input layer, 2) the positional encoding in the input layer, 3) the multi-head attention mechanism in the encoder layer, and 4) the feed-forward network in the encoder layer.
1) Input layer: word embedding
import copy
import math

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable


class Embeddings(nn.Module):
    """Embedding class: maps token ids to dense vectors."""

    def __init__(self, d_model, vocab):
        # d_model: word-embedding dimension
        # vocab: vocabulary size
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        # Scale the embeddings by sqrt(d_model)
        return self.lut(x) * math.sqrt(self.d_model)
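A quick shape check may help here. This is a minimal sketch; d_model=512, vocab=1000 and the toy token ids are arbitrary illustrative values, not from the original post.

# Minimal sanity check (illustrative values only)
emb = Embeddings(d_model=512, vocab=1000)
tokens = torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 9]])  # (batch, seq_len)
print(emb(tokens).shape)  # torch.Size([2, 4, 512])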
2) Input layer: positional encoding

class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding."""

    def __init__(self, d_model, pad_size=5000):
        # d_model: word-embedding dimension
        # pad_size: maximum sequence length supported (default 5000)
        super(PositionalEncoding, self).__init__()
        self.d_model = d_model
        self.pad_size = pad_size
        pe = torch.zeros(pad_size, d_model)
        for t in range(pad_size):
            for i in range(d_model // 2):
                angle_rate = 1 / (10000 ** (2 * i / d_model))
                pe[t, 2 * i] = np.sin(t * angle_rate)
                pe[t, 2 * i + 1] = np.cos(t * angle_rate)
        # # Equivalent vectorized form of the double loop
        # pe = torch.tensor(
        #     [[pos / (10000.0 ** (i // 2 * 2.0 / d_model)) for i in range(d_model)] for pos in range(pad_size)])
        # pe[:, 0::2] = np.sin(pe[:, 0::2])
        # pe[:, 1::2] = np.cos(pe[:, 1::2])
        # Add a batch dimension so the encoding broadcasts over a batch of inputs
        pe = pe.unsqueeze(0)
        # Register the positional-encoding matrix as a buffer: a buffer is not a model
        # parameter and is not updated by the optimizer, but it is saved with the model
        # and loaded back together with the parameters.
        self.register_buffer('pe', pe)

    def forward(self, x):
        # The positional encoding does not require gradients
        x = x + Variable(self.pe[:, :x.size(1)], requires_grad=False)
        return x
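As with the embedding layer, a small check confirms that the encoding is added element-wise and leaves the shape unchanged. The batch below is hypothetical, reusing the illustrative dimensions from above.

# Minimal sanity check (illustrative values only)
pos_enc = PositionalEncoding(d_model=512)
x = torch.rand(2, 4, 512)   # e.g. the output of Embeddings for a (2, 4) batch of token ids
print(pos_enc(x).shape)     # torch.Size([2, 4, 512]); shape unchanged, positions added element-wise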
3) Encoder layer: multi-head attention mechanism

Three helpers come first: the attention function, the module-clone function, and the sublayer connection.

def attention(q, k, v, dropout=None, mask=None):
    # Attention(Q, K, V) = softmax(QK^T / sqrt(d_k)) V
    # d_k: dimension of each query/key vector
    d_k = q.shape[-1]
    score = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        # Positions where mask == 0 get a large negative score so that the
        # softmax assigns them (almost) zero weight
        score = score.masked_fill(mask == 0, -1e6)
    score = F.softmax(score, dim=-1)
    if dropout is not None:
        score = dropout(score)
    return torch.matmul(score, v), score


def clones(module, N):
    """
    :param module: the network module to copy
    :param N: number of copies
    """
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])


class SublayerConnection(nn.Module):
    """
    Sublayer connection: wraps whatever sublayer (instance object) is passed in.
    In the encoder layer the sublayer is multi-head attention or the feed-forward network;
    in the decoder layer it can also be masked multi-head attention.
    Processing order: normalization -> (masked) multi-head attention / feed-forward -> residual connection.
    """

    def __init__(self, d_k, dropout=0.1):
        super(SublayerConnection, self).__init__()
        self.norm = nn.LayerNorm(d_k)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, sublayer):
        # Normalize first, then let the concrete sublayer process the result
        out = sublayer(self.norm(x))
        out = self.dropout(out)
        # Residual connection
        return x + out
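To see the mask in action, the toy example below (hypothetical shapes and a lower-triangular causal mask, not part of the original post) runs the attention function with and without masking.

# Minimal sanity check (illustrative shapes only)
q = k = v = torch.rand(2, 4, 8)                 # (batch, seq_len, d_k)
out, weights = attention(q, k, v)
print(out.shape, weights.shape)                 # torch.Size([2, 4, 8]) torch.Size([2, 4, 4])

mask = torch.tril(torch.ones(4, 4))             # lower-triangular (causal) mask
out, weights = attention(q, k, v, mask=mask)
print(weights[0, 0])                            # the first position attends almost only to itself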
Multi-head attention mechanism

class MultiHeadAttention(nn.Module):
    """Multi-head attention."""

    def __init__(self, d_k, head_num, dropout=0.0):
        super(MultiHeadAttention, self).__init__()
        self.d_k = d_k
        self.head_num = head_num
        assert d_k % head_num == 0
        self.head_dim = d_k // head_num
        self.dropout = nn.Dropout(p=dropout)
        # Deep-copy 4 linear layers: 3 project the Q, K and V matrices,
        # the last one maps the concatenated heads back to the model dimension
        self.linears = clones(nn.Linear(d_k, d_k), 4)
        self.attn = None

    def forward(self, query, key, value, mask=None):
        if mask is not None:
            mask = mask.unsqueeze(0)
        batch_size = query.size(0)
        # The three linear layers project the inputs into the latent space, then the
        # result is reshaped to (batch, head_num, seq_len, head_dim)
        query, key, value = \
            [model(x).view(batch_size, -1, self.head_num, self.head_dim).transpose(1, 2)
             for model, x in zip(self.linears, (query, key, value))]
        score, self.attn = attention(query, key, value, dropout=self.dropout, mask=mask)
        # Merge the heads back into shape (batch, seq_len, head_num * head_dim)
        score = score.transpose(1, 2).contiguous().view(batch_size, -1, self.head_dim * self.head_num)
        return self.linears[-1](score)

    # Another implementation of multi-head attention; this version may be easier to follow
    # def forward2(self, query, key, value, mask=None):
    #     if mask is not None:
    #         mask = mask.unsqueeze(0)
    #     batch_size = query.size(0)
    #     query, key, value = \
    #         [model(x).view(batch_size * self.head_num, -1, self.head_dim) for model, x in
    #          zip(self.linears, (query, key, value))]
    #     score, self.attn = attention(query, key, value, dropout=self.dropout, mask=mask)
    #     score = score.view(batch_size, -1, self.head_dim * self.head_num)
    #     return self.linears[-1](score)
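A shape check of the module, using the hypothetical settings d_k=512 and head_num=8 (illustrative values, not from the original post):

# Minimal sanity check (illustrative values only)
mha = MultiHeadAttention(d_k=512, head_num=8, dropout=0.1)
x = torch.rand(2, 4, 512)       # (batch, seq_len, d_model)
out = mha(x, x, x)              # self-attention: query = key = value
print(out.shape)                # torch.Size([2, 4, 512])
print(mha.attn.shape)           # torch.Size([2, 8, 4, 4]): per-head attention weights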
4) Encoder layer: feed-forward network

class PositionalWiseFeedForward(nn.Module):
    """Position-wise feed-forward network."""

    def __init__(self, d_k, hidden_size, dropout=0.1):
        super(PositionalWiseFeedForward, self).__init__()
        self.w1 = nn.Linear(d_k, hidden_size)
        self.w2 = nn.Linear(hidden_size, d_k)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        out = self.w1(x)
        out = F.relu(out)
        out = self.dropout(out)
        return self.w2(out)
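Again a quick shape check; the 512 -> 2048 -> 512 expansion is an illustrative choice, not prescribed by the original post.

# Minimal sanity check (illustrative values only)
ff = PositionalWiseFeedForward(d_k=512, hidden_size=2048, dropout=0.1)
x = torch.rand(2, 4, 512)
print(ff(x).shape)              # torch.Size([2, 4, 512]); applied independently at every position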
Encoder layer

class EncoderLayer(nn.Module):
    """Uses two sublayer connections to assemble multi-head attention and the feed-forward network."""

    def __init__(self, d_k, attn, feed_forward, dropout):
        """
        attn: multi-head attention instance
        feed_forward: feed-forward network instance
        dropout: dropout (zero-out) rate
        """
        super(EncoderLayer, self).__init__()
        self.attn = attn
        self.feed_forward = feed_forward
        # Clone 2 sublayer connections; the concrete sublayer each one wraps
        # (multi-head attention / feed-forward) is specified at call time
        self.sublayer = clones(SublayerConnection(d_k, dropout), 2)
        # Keep the embedding dimension around for later use
        self.size = d_k

    def forward(self, x, mask):
        # Multi-head attention first, then the feed-forward network:
        # the encoding order of the Transformer
        x = self.sublayer[0](x, lambda x: self.attn(x, x, x, mask))
        return self.sublayer[1](x, self.feed_forward)
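One encoder layer can be wired together from the pieces above; the dimensions below are the same illustrative values used earlier, not values from the original post.

# Minimal sanity check (illustrative values only)
d_model, head_num, hidden_size, dropout = 512, 8, 2048, 0.1
layer = EncoderLayer(d_model,
                     MultiHeadAttention(d_model, head_num, dropout),
                     PositionalWiseFeedForward(d_model, hidden_size, dropout),
                     dropout)
x = torch.rand(2, 4, d_model)
print(layer(x, mask=None).shape)    # torch.Size([2, 4, 512])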
Encoder implementation

class Encoder(nn.Module):
    """The encoder: a stack of N EncoderLayer modules."""

    def __init__(self, encoder_layer, N):
        super(Encoder, self).__init__()
        self.layers = clones(encoder_layer, N)
        # Final normalization layer; encoder_layer.size is the embedding dimension.
        # A hand-written LayerNorm could be substituted here; since none is defined
        # in this post, the torch built-in is used so the code stays self-contained.
        self.norm = nn.LayerNorm(encoder_layer.size)

    def forward(self, x, mask=None):
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)
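Finally, the components can be chained end to end: token ids -> Embeddings -> PositionalEncoding -> Encoder. This is a minimal sketch; the vocabulary size, model dimensions, layer count and toy batch are hypothetical values chosen for illustration.

# Minimal end-to-end check (illustrative values only)
d_model, head_num, hidden_size, dropout, vocab, N = 512, 8, 2048, 0.1, 1000, 6

emb = Embeddings(d_model, vocab)
pos_enc = PositionalEncoding(d_model)
encoder = Encoder(
    EncoderLayer(d_model,
                 MultiHeadAttention(d_model, head_num, dropout),
                 PositionalWiseFeedForward(d_model, hidden_size, dropout),
                 dropout),
    N)

tokens = torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 9]])   # (batch, seq_len)
out = encoder(pos_enc(emb(tokens)))
print(out.shape)                                          # torch.Size([2, 4, 512])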