Original content from the NewBeeNLP public account
Column author: @山竹小果
Articles about the Transformer are everywhere by now, but not many walk through the code alongside the explanation. This post is based on PaddlePaddle 1.7 and dissects the dygraph (dynamic graph) implementation of the Transformer encoder.
Each Transformer encoder sub-layer (bert_base stacks 12 of them) contains 2 smaller sub-layers: a multi-head self-attention layer and a position-wise fully connected feed-forward network.
The decoder additionally contains a Masked Multi-Head Attention layer.
The classes involved are as follows:
Name | Function |
---|---|
PrePostProcessLayer | Adds residual connections, layer normalization and dropout |
PositionwiseFeedForwardLayer | Fully connected feed-forward network |
MultiHeadAttentionLayer | Multi-head attention layer |
EncoderSubLayer | Encoder sub-layer |
EncoderLayer | Transformer encoder layer |
In PaddlePaddle dygraph mode, a network layer is implemented by inheriting from paddle.fluid.dygraph.Layer: the __init__ method defines the layer (its parameters and sub-layers), while forward implements the computation needed for the forward pass.
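Before diving into the real code, here is a minimal, hypothetical dygraph layer (TinyLayer and its shapes are made up for illustration, not part of the original source) that shows this __init__/forward split:

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph import Layer, Linear

class TinyLayer(Layer):
    """A toy layer: __init__ declares sub-layers, forward defines the computation."""
    def __init__(self, in_dim, out_dim):
        super(TinyLayer, self).__init__()
        self._fc = Linear(input_dim=in_dim, output_dim=out_dim)

    def forward(self, x):
        return self._fc(x)

with fluid.dygraph.guard():
    layer = TinyLayer(4, 2)
    x = fluid.dygraph.to_variable(np.ones([3, 4], dtype="float32"))
    print(layer(x).shape)  # [3, 2]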
The concrete implementation is given below, with explanations in the code comments, starting with the necessary imports.
"dygraph transformer layers"
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer
PrePostProcessLayer supports the following optional processing modes, selected by the characters of process_cmd: a (residual add), n (layer normalization) and d (dropout).
The a and n modes correspond to the Add & Norm block in the Transformer architecture diagram. After each module's computation, the value before the computation is added to the value after it, forming a residual connection; residuals let the gradient take a shortcut straight back to the earliest layers.
Residual connection: $y = x + F(x)$, where $x$ is the input to the sub-layer and $F(x)$ its output; in practice it is simply an element-wise addition across layers.
LayerNorm performs layer normalization on the hidden states: the inputs to all neurons of a layer are normalized (along the channel/hidden dimension), which speeds up training:
Layer normalization: $y = \frac{x - \mu}{\sqrt{\sigma^2 + \epsilon}} \odot g + b$, where $\mu$ and $\sigma^2$ are the mean and variance computed over the hidden dimension, $\epsilon$ is a small constant for numerical stability, and $g$, $b$ are the learnable scale and bias (initialized to 1 and 0 in the code below).
See the corresponding PaddlePaddle API documentation [2].
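To make the formula concrete, here is a small NumPy sketch of layer normalization over the hidden dimension (shapes and names are made up for illustration; this is not the PaddlePaddle implementation):

import numpy as np

def layer_norm(x, g, b, eps=1e-12):
    # normalize each position over the last (hidden) dimension
    mu = x.mean(axis=-1, keepdims=True)
    var = x.var(axis=-1, keepdims=True)
    return (x - mu) / np.sqrt(var + eps) * g + b

x = np.random.rand(2, 5, 8).astype("float32")  # [batch, seq_len, hidden]
g = np.ones(8, dtype="float32")                # scale, initialized to 1
b = np.zeros(8, dtype="float32")               # bias, initialized to 0
y = layer_norm(x, g, b)
print(y.mean(axis=-1))  # ~0 for every position
print(y.std(axis=-1))   # ~1 for every position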
Dropout drops or keeps each element of x independently. It is a regularization technique that reduces overfitting by breaking up co-adaptation between neurons during training: given a drop probability, the dropout operator randomly sets some neuron outputs to 0 and leaves the rest unchanged.
At inference time, the dropout op can be removed from the Program to improve execution efficiency.
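A minimal dygraph sketch (my own example, assuming PaddlePaddle 1.7's fluid.layers.dropout API; the shapes are made up) showing the difference between training and inference behaviour under upscale_in_train:

import numpy as np
import paddle.fluid as fluid

with fluid.dygraph.guard():
    x = fluid.dygraph.to_variable(np.ones([2, 4], dtype="float32"))
    # training: roughly dropout_prob of the elements are zeroed,
    # and the survivors are scaled by 1 / (1 - dropout_prob)
    y_train = fluid.layers.dropout(
        x, dropout_prob=0.5,
        dropout_implementation="upscale_in_train",
        is_test=False)
    # inference: with upscale_in_train, dropout becomes a no-op
    y_infer = fluid.layers.dropout(
        x, dropout_prob=0.5,
        dropout_implementation="upscale_in_train",
        is_test=True)
    print(y_train.numpy())  # mix of 0.0 and 2.0
    print(y_infer.numpy())  # all 1.0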
Now let's look at the complete source of PrePostProcessLayer; detailed explanations are in the comments.
class PrePostProcessLayer(Layer):
    """
    PrePostProcessLayer
    """

    def __init__(self, process_cmd, d_model, dropout_rate, name):
        super(PrePostProcessLayer, self).__init__()
        self.process_cmd = process_cmd  # processing modes: a / n / d, can be combined
        self.functors = []  # the processing functors
        self.exec_order = ""

        # add a functor (or sub-layer) for each processing mode
        for cmd in self.process_cmd:
            if cmd == "a":  # add residual connection
                self.functors.append(lambda x, y: x + y if y is not None else x)
                self.exec_order += "a"
            elif cmd == "n":  # add layer normalization
                self.functors.append(
                    self.add_sublayer(
                        # name
                        "layer_norm_%d" % len(
                            self.sublayers(include_sublayers=False)),
                        LayerNorm(
                            # shape to normalize over; a single integer means the last
                            # dimension is normalized, and its size must equal this value
                            normalized_shape=d_model,
                            param_attr=fluid.ParamAttr(  # scale (weight) parameter
                                name=name + "_layer_norm_scale",
                                # constant initializer: fills the parameter with the given value
                                initializer=fluid.initializer.Constant(1.)),
                            bias_attr=fluid.ParamAttr(  # bias parameter
                                name=name + "_layer_norm_bias",
                                initializer=fluid.initializer.Constant(0.)))))
                self.exec_order += "n"
            elif cmd == "d":  # add dropout
                if dropout_rate:
                    self.functors.append(lambda x: fluid.layers.dropout(
                        x, dropout_prob=dropout_rate, is_test=False))
                    self.exec_order += "d"

    def forward(self, x, residual=None):
        for i, cmd in enumerate(self.exec_order):
            if cmd == "a":
                x = self.functors[i](x, residual)
            else:
                x = self.functors[i](x)
        return x
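A hypothetical usage sketch (names and shapes are made up) showing how the encoder uses this layer further below: "n" as pre-processing and "da" (dropout, then residual add) as post-processing:

import numpy as np
import paddle.fluid as fluid

with fluid.dygraph.guard():
    d_model = 8
    pre = PrePostProcessLayer("n", d_model, 0.1, name="demo_pre")
    post = PrePostProcessLayer("da", d_model, 0.1, name="demo_post")

    x = fluid.dygraph.to_variable(
        np.random.rand(2, 5, d_model).astype("float32"))
    normed = pre(x)        # layer normalization only
    out = post(normed, x)  # dropout, then add the residual x
    print(out.shape)       # [2, 5, 8]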
In BERT, hidden_act (the activation function) is gelu.
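For reference, a NumPy sketch of the tanh approximation of GELU used in the original BERT code (the exact gelu op that PaddlePaddle applies may differ slightly; this function is only for illustration):

import numpy as np

def gelu(x):
    # tanh approximation of the Gaussian Error Linear Unit
    return 0.5 * x * (1.0 + np.tanh(
        np.sqrt(2.0 / np.pi) * (x + 0.044715 * np.power(x, 3))))

print(gelu(np.array([-1.0, 0.0, 1.0])))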
class PositionwiseFeedForwardLayer(Layer):
    """
    PositionwiseFeedForwardLayer
    """

    def __init__(self,
                 hidden_act,   # activation function
                 d_inner_hid,  # dimension of the inner hidden layer
                 d_model,      # dimension of the final output
                 dropout_rate,
                 param_initializer=None,
                 name=""):
        super(PositionwiseFeedForwardLayer, self).__init__()
        # two fully connected layers
        self._i2h = Linear(
            input_dim=d_model,
            output_dim=d_inner_hid,
            param_attr=fluid.ParamAttr(
                name=name + '_fc_0.w_0', initializer=param_initializer),
            bias_attr=name + '_fc_0.b_0',
            act=hidden_act)
        self._h2o = Linear(
            input_dim=d_inner_hid,
            output_dim=d_model,
            param_attr=fluid.ParamAttr(
                name=name + '_fc_1.w_0', initializer=param_initializer),
            bias_attr=name + '_fc_1.b_0')
        self._dropout_rate = dropout_rate

    def forward(self, x):
        """
        forward
        :param x:
        :return:
        """
        hidden = self._i2h(x)
        # dropout
        if self._dropout_rate:
            hidden = fluid.layers.dropout(
                hidden,
                dropout_prob=self._dropout_rate,
                dropout_implementation="upscale_in_train",
                is_test=False)
        out = self._h2o(hidden)
        return out
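A hypothetical usage sketch with bert_base dimensions (768 -> 3072 -> 768); the names and input shape are made up, and since the layer operates on the last dimension, the [batch, seq_len, hidden] shape is preserved:

import numpy as np
import paddle.fluid as fluid

with fluid.dygraph.guard():
    ffn = PositionwiseFeedForwardLayer(
        hidden_act="gelu", d_inner_hid=3072, d_model=768,
        dropout_rate=0.1, name="demo_ffn")
    x = fluid.dygraph.to_variable(
        np.random.rand(2, 128, 768).astype("float32"))
    print(ffn(x).shape)  # [2, 128, 768]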
A few relevant dimensions (as configured for bert_base):

self._emb_size = config['hidden_size']    # 768
d_key = self._emb_size // self._n_head    # 768 // 12 = 64
d_value = self._emb_size // self._n_head  # 64
d_model = self._emb_size                  # 768
d_inner_hid = self._emb_size * 4          # 3072
class MultiHeadAttentionLayer(Layer):
    """
    MultiHeadAttentionLayer
    """

    def __init__(self,
                 d_key,
                 d_value,
                 d_model,
                 n_head=1,
                 dropout_rate=0.,
                 cache=None,
                 gather_idx=None,
                 static_kv=False,
                 param_initializer=None,
                 name=""):
        super(MultiHeadAttentionLayer, self).__init__()
        self._n_head = n_head
        self._d_key = d_key
        self._d_value = d_value
        self._d_model = d_model
        self._dropout_rate = dropout_rate

        self._q_fc = Linear(
            input_dim=d_model,
            output_dim=d_key * n_head,
            param_attr=fluid.ParamAttr(
                name=name + '_query_fc.w_0', initializer=param_initializer),
            bias_attr=name + '_query_fc.b_0')
        self._k_fc = Linear(
            input_dim=d_model,
            output_dim=d_key * n_head,
            param_attr=fluid.ParamAttr(
                name=name + '_key_fc.w_0', initializer=param_initializer),
            bias_attr=name + '_key_fc.b_0')
        self._v_fc = Linear(
            input_dim=d_model,
            output_dim=d_value * n_head,
            param_attr=fluid.ParamAttr(
                name=name + '_value_fc.w_0', initializer=param_initializer),
            bias_attr=name + '_value_fc.b_0')
        self._proj_fc = Linear(
            input_dim=d_value * n_head,
            output_dim=d_model,
            param_attr=fluid.ParamAttr(
                name=name + '_output_fc.w_0', initializer=param_initializer),
            bias_attr=name + '_output_fc.b_0')

    def forward(self, queries, keys, values, attn_bias):
        """
        forward
        :param queries:
        :param keys:
        :param values:
        :param attn_bias:
        :return:
        """
        # compute q, k, v
        # for self-attention, keys and values default to the queries
        keys = queries if keys is None else keys
        values = keys if values is None else values
        # project to the q, k, v matrices
        q = self._q_fc(queries)
        k = self._k_fc(keys)
        v = self._v_fc(values)

        # split heads: [batch, seq, n_head * d] -> [batch, n_head, seq, d]
        q_hidden_size = q.shape[-1]
        reshaped_q = fluid.layers.reshape(
            x=q,
            shape=[0, 0, self._n_head, q_hidden_size // self._n_head],
            inplace=False)
        transpose_q = fluid.layers.transpose(x=reshaped_q, perm=[0, 2, 1, 3])
        k_hidden_size = k.shape[-1]
        reshaped_k = fluid.layers.reshape(
            x=k,
            shape=[0, 0, self._n_head, k_hidden_size // self._n_head],
            inplace=False)
        transpose_k = fluid.layers.transpose(x=reshaped_k, perm=[0, 2, 1, 3])
        v_hidden_size = v.shape[-1]
        reshaped_v = fluid.layers.reshape(
            x=v,
            shape=[0, 0, self._n_head, v_hidden_size // self._n_head],
            inplace=False)
        transpose_v = fluid.layers.transpose(x=reshaped_v, perm=[0, 2, 1, 3])

        # scaled dot-product attention
        scaled_q = fluid.layers.scale(x=transpose_q, scale=self._d_key**-0.5)
        product = fluid.layers.matmul(
            # x=transpose_q,
            x=scaled_q,
            y=transpose_k,
            transpose_y=True)
            # alpha=self._d_model**-0.5)
        if attn_bias is not None:  # add the attention bias / mask if provided
            product += attn_bias
        weights = fluid.layers.softmax(product)
        if self._dropout_rate:
            weights_droped = fluid.layers.dropout(
                weights,
                dropout_prob=self._dropout_rate,
                dropout_implementation="upscale_in_train",
                is_test=False)
            out = fluid.layers.matmul(weights_droped, transpose_v)
        else:
            out = fluid.layers.matmul(weights, transpose_v)

        # combine heads: [batch, n_head, seq, d] -> [batch, seq, n_head * d]
        if len(out.shape) != 4:
            raise ValueError("Input(x) should be a 4-D Tensor.")
        trans_x = fluid.layers.transpose(out, perm=[0, 2, 1, 3])
        final_out = fluid.layers.reshape(
            x=trans_x,
            shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
            inplace=False)

        # fc to output
        proj_out = self._proj_fc(final_out)
        return proj_out
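A hypothetical shape walk-through (names and inputs made up) with bert_base settings: batch=2, seq_len=128, d_model=768, n_head=12, d_key=d_value=64. Passing None for keys and values makes it self-attention:

import numpy as np
import paddle.fluid as fluid

with fluid.dygraph.guard():
    attn = MultiHeadAttentionLayer(
        d_key=64, d_value=64, d_model=768, n_head=12,
        dropout_rate=0.1, name="demo_att")
    x = fluid.dygraph.to_variable(
        np.random.rand(2, 128, 768).astype("float32"))
    # q/k/v: [2, 128, 768] -> split heads -> [2, 12, 128, 64]
    # attention weights: [2, 12, 128, 128]; combined output: [2, 128, 768]
    out = attn(x, None, None, None)
    print(out.shape)  # [2, 128, 768]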
class EncoderSubLayer(Layer):
    """
    EncoderSubLayer
    """

    def __init__(self,
                 hidden_act,
                 n_head,
                 d_key,
                 d_value,
                 d_model,
                 d_inner_hid,
                 prepostprocess_dropout,
                 attention_dropout,
                 relu_dropout,
                 preprocess_cmd="n",
                 postprocess_cmd="da",
                 param_initializer=None,
                 name=""):
        super(EncoderSubLayer, self).__init__()
        self.name = name
        self._preprocess_cmd = preprocess_cmd
        self._postprocess_cmd = postprocess_cmd
        self._prepostprocess_dropout = prepostprocess_dropout
        # pre-processing before attention (layer normalization)
        self._preprocess_layer = PrePostProcessLayer(
            self._preprocess_cmd,
            d_model,
            prepostprocess_dropout,
            name=name + "_pre_att")
        # multi-head attention
        self._multihead_attention_layer = MultiHeadAttentionLayer(
            d_key,
            d_value,
            d_model,
            n_head,
            attention_dropout,
            None,
            None,
            False,
            param_initializer,
            name=name + "_multi_head_att")
        # post-processing after attention (dropout + residual add)
        self._postprocess_layer = PrePostProcessLayer(
            self._postprocess_cmd,
            d_model,
            self._prepostprocess_dropout,
            name=name + "_post_att")
        # pre-processing before the feed-forward network
        self._preprocess_layer2 = PrePostProcessLayer(
            self._preprocess_cmd,
            d_model,
            self._prepostprocess_dropout,
            name=name + "_pre_ffn")
        self._positionwise_feed_forward = PositionwiseFeedForwardLayer(
            hidden_act,
            d_inner_hid,
            d_model,
            relu_dropout,
            param_initializer,
            name=name + "_ffn")
        # post-processing after the feed-forward network (dropout + residual add)
        self._postprocess_layer2 = PrePostProcessLayer(
            self._postprocess_cmd,
            d_model,
            self._prepostprocess_dropout,
            name=name + "_post_ffn")

    def forward(self, enc_input, attn_bias):
        """
        forward
        :param enc_input: encoder input
        :param attn_bias: attention bias
        :return: the output of this encoder sub-layer
        """
        # pre-process before multi-head attention
        pre_process_multihead = self._preprocess_layer(enc_input)
        # feed the pre-processed result to the multi-head attention layer
        attn_output = self._multihead_attention_layer(pre_process_multihead,
                                                      None, None, attn_bias)
        # post-process after attention (dropout, then residual with enc_input)
        attn_output = self._postprocess_layer(attn_output, enc_input)
        # pre-process before the FFN layer
        pre_process2_output = self._preprocess_layer2(attn_output)
        # run the FFN layer
        ffd_output = self._positionwise_feed_forward(pre_process2_output)
        # post-process and return the result
        return self._postprocess_layer2(ffd_output, attn_output)
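Putting the pieces together, with preprocess_cmd="n" and postprocess_cmd="da" one encoder sub-layer computes, in pre-norm order:

attn_output = enc_input + Dropout(MultiHeadAttention(LayerNorm(enc_input)))
output      = attn_output + Dropout(FFN(LayerNorm(attn_output)))

That is, normalization is applied before each sub-module and the residual is added after dropout, rather than the post-norm ordering LayerNorm(x + SubLayer(x)) of the original Transformer paper.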
class EncoderLayer(Layer):
    """
    encoder
    """

    def __init__(self,
                 hidden_act,
                 n_layer,                 # number of encoder sub-layers / encoder depth
                 n_head,                  # number of attention heads
                 d_key,
                 d_value,
                 d_model,
                 d_inner_hid,
                 prepostprocess_dropout,  # dropout rate of the pre/post-process layers
                 attention_dropout,       # dropout rate inside attention
                 relu_dropout,            # dropout rate after the FFN activation
                 preprocess_cmd="n",      # pre-processing: layer normalization
                 postprocess_cmd="da",    # post-processing: dropout + residual connection
                 param_initializer=None,
                 name=""):
        super(EncoderLayer, self).__init__()
        self._preprocess_cmd = preprocess_cmd
        self._encoder_sublayers = list()
        self._prepostprocess_dropout = prepostprocess_dropout
        self._n_layer = n_layer
        self._hidden_act = hidden_act
        # final layer normalization applied to the output of the last sub-layer;
        # normalized_shape must be d_model to match the hidden size
        self._preprocess_layer = PrePostProcessLayer(
            self._preprocess_cmd, d_model, self._prepostprocess_dropout,
            "post_encoder")
        # stack n_layer encoder sub-layers (12 for bert_base)
        for i in range(n_layer):
            self._encoder_sublayers.append(
                # register each sub-layer with add_sublayer
                self.add_sublayer(
                    'esl_%d' % i,
                    EncoderSubLayer(
                        hidden_act,
                        n_head,
                        d_key,
                        d_value,
                        d_model,
                        d_inner_hid,
                        prepostprocess_dropout,
                        attention_dropout,
                        relu_dropout,
                        preprocess_cmd,
                        postprocess_cmd,
                        param_initializer,
                        name=name + '_layer_' + str(i))))

    def forward(self, enc_input, attn_bias):
        """
        forward
        :param enc_input: model input
        :param attn_bias: attention bias; whether it is needed depends on the task
        :return: the encoded result
        """
        # iterate over the encoder sub-layers, e.g. 12 (self._n_layer) for bert_base
        for i in range(self._n_layer):
            # run one sub-layer on (enc_input, attn_bias)
            enc_output = self._encoder_sublayers[i](enc_input, attn_bias)
            # the output of this sub-layer becomes the input of the next one
            enc_input = enc_output
        # return the final, normalized output
        return self._preprocess_layer(enc_output)
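Finally, a hypothetical end-to-end sketch with bert_base hyper-parameters (it assumes the classes above have been defined and PaddlePaddle 1.7 is installed; the input tensor and the all-zero attention bias are dummies for illustration):

import numpy as np
import paddle.fluid as fluid

with fluid.dygraph.guard():
    encoder = EncoderLayer(
        hidden_act="gelu",
        n_layer=12, n_head=12,
        d_key=64, d_value=64,
        d_model=768, d_inner_hid=3072,
        prepostprocess_dropout=0.1,
        attention_dropout=0.1,
        relu_dropout=0.1,
        name="encoder")
    # in BERT the input would be the sum of token, position and segment embeddings
    enc_input = fluid.dygraph.to_variable(
        np.random.rand(2, 128, 768).astype("float32"))
    # attention bias: 0 for visible tokens, a large negative value for padding,
    # broadcastable to [batch, n_head, seq_len, seq_len]
    attn_bias = fluid.dygraph.to_variable(
        np.zeros([2, 12, 128, 128], dtype="float32"))
    enc_output = encoder(enc_input, attn_bias)
    print(enc_output.shape)  # [2, 128, 768]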
More PaddlePaddle dygraph tutorials: https://www.paddlepaddle.org.cn/documentation/docs/zh/1.7/beginners_guide/basic_concept/dygraph/DyGraph.html#dygraph
[2] PaddlePaddle LayerNorm API documentation: https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/dygraph_cn/LayerNorm_cn.html#layernorm
- END -