BERT Model Source Code Walkthrough
modeling.py
Table of Contents

Classes
- class BertConfig(object): configuration parameters for the BERT model
- class BertModel(object): the BERT model

Functions
- def gelu(x): GELU activation function
- def get_activation(activation_string): look up an activation function by name
- def get_assignment_map_from_checkpoint: build the variable assignment map from a checkpoint
- def dropout(input_tensor, dropout_prob): dropout, randomly drops a fraction of the activations
- def layer_norm(input_tensor, name=None): layer normalization
- def layer_norm_and_dropout: layer normalization followed by dropout
- def create_initializer(initializer_range=0.02): weight initializer
- def embedding_lookup: embedding lookup
- def embedding_postprocessor: post-processing of the embeddings (token type and position)
- def create_attention_mask_from_input_mask: build the attention mask
- def attention_layer: the attention layer
- def transformer_model: the Transformer encoder
- def get_shape_list: get a tensor's shape as a list
- def reshape_to_matrix(input_tensor): reshape a tensor to a rank-2 matrix
- def reshape_from_matrix(output_tensor, orig_shape_list): reshape a rank-2 tensor back to the given shape
- def assert_rank(tensor, expected_rank, name=None): assert that a tensor has the expected rank
Source Code

License
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The main BERT model and related functions."""
Imports

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import copy
import json
import math
import re
import numpy as np
import six
import tensorflow as tf
Model configuration: constructor and parameters
class BertConfig(object):
  """Configuration for `BertModel`."""

  def __init__(self,
               vocab_size,
               hidden_size=768,
               num_hidden_layers=12,
               num_attention_heads=12,
               intermediate_size=3072,
               hidden_act="gelu",
               hidden_dropout_prob=0.1,
               attention_probs_dropout_prob=0.1,
               max_position_embeddings=512,
               type_vocab_size=16,
               initializer_range=0.02):
    """Constructs BertConfig.

    Args:
      vocab_size: Vocabulary size of `inputs_ids` in `BertModel`.
      hidden_size: Size of the encoder layers and the pooler layer.
      num_hidden_layers: Number of hidden layers in the Transformer encoder.
      num_attention_heads: Number of attention heads for each attention layer
        in the Transformer encoder.
      intermediate_size: The size of the "intermediate" (i.e., feed-forward)
        layer in the Transformer encoder.
      hidden_act: The non-linear activation function (function or string) in
        the encoder and pooler.
      hidden_dropout_prob: The dropout probability for all fully connected
        layers in the embeddings, encoder, and pooler.
      attention_probs_dropout_prob: The dropout ratio for the attention
        probabilities.
      max_position_embeddings: The maximum sequence length that this model
        might ever be used with. Typically set this to something large just in
        case (e.g., 512 or 1024 or 2048).
      type_vocab_size: The vocabulary size of the `token_type_ids` passed into
        `BertModel`.
      initializer_range: The stdev of the truncated_normal_initializer for
        initializing all weight matrices.
    """
    self.vocab_size = vocab_size
    self.hidden_size = hidden_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads
    self.hidden_act = hidden_act
    self.intermediate_size = intermediate_size
    self.hidden_dropout_prob = hidden_dropout_prob
    self.attention_probs_dropout_prob = attention_probs_dropout_prob
    self.max_position_embeddings = max_position_embeddings
    self.type_vocab_size = type_vocab_size
    self.initializer_range = initializer_range

  @classmethod
  def from_dict(cls, json_object):
    """Constructs a `BertConfig` from a Python dictionary of parameters."""
    config = BertConfig(vocab_size=None)
    for (key, value) in six.iteritems(json_object):
      config.__dict__[key] = value
    return config

  @classmethod
  def from_json_file(cls, json_file):
    """Constructs a `BertConfig` from a json file of parameters."""
    with tf.gfile.GFile(json_file, "r") as reader:
      text = reader.read()
    return cls.from_dict(json.loads(text))

  def to_dict(self):
    """Serializes this instance to a Python dictionary."""
    output = copy.deepcopy(self.__dict__)
    return output

  def to_json_string(self):
    """Serializes this instance to a JSON string."""
    return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
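A quick illustration of how this class is typically used: build a config from a dictionary (mirroring what `from_json_file` does after parsing the JSON). The hyperparameter values below are only an assumption, they match the commonly published BERT-Base settings and are not read from any checkpoint.

```python
# Illustrative only; assumes modeling.py has been imported.
config_dict = {
    "vocab_size": 30522,        # assumed BERT-Base uncased vocabulary size
    "hidden_size": 768,
    "num_hidden_layers": 12,
    "num_attention_heads": 12,
    "intermediate_size": 3072,
}
config = BertConfig.from_dict(config_dict)
print(config.to_json_string())  # serialized back to JSON with sorted keys
```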
The BertModel class
class BertModel(object):
  """BERT model ("Bidirectional Encoder Representations from Transformers").

  Example usage:

  ```python
  # Already been converted into WordPiece token ids
  input_ids = tf.constant([[31, 51, 99], [15, 5, 0]])
  input_mask = tf.constant([[1, 1, 1], [1, 1, 0]])
  token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]])

  config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
    num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)

  model = modeling.BertModel(config=config, is_training=True,
    input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids)

  label_embeddings = tf.get_variable(...)
  pooled_output = model.get_pooled_output()
  logits = tf.matmul(pooled_output, label_embeddings)
  ...
  ```
  """

  def __init__(self,
               config,
               is_training,
               input_ids,
               input_mask=None,
               token_type_ids=None,
               use_one_hot_embeddings=False,
               scope=None):
    """Constructor for BertModel.

    Args:
      config: `BertConfig` instance.
      is_training: bool. true for training model, false for eval model.
        Controls whether dropout will be applied.
      input_ids: int32 Tensor of shape [batch_size, seq_length].
      input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
      token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
        embeddings or tf.embedding_lookup() for the word embeddings.
      scope: (optional) variable scope. Defaults to "bert".

    Raises:
      ValueError: The config is invalid or one of the input tensor shapes
        is invalid.
    """
    config = copy.deepcopy(config)   # work on a deep copy of the config
    if not is_training:
      # At eval time, disable dropout by setting both dropout probabilities to 0.
      config.hidden_dropout_prob = 0.0
      config.attention_probs_dropout_prob = 0.0

    input_shape = get_shape_list(input_ids, expected_rank=2)
    batch_size = input_shape[0]   # number of sequences per batch
    seq_length = input_shape[1]   # sequence length

    if input_mask is None:
      # No input mask given: treat every position as a real token (all ones).
      input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)

    if token_type_ids is None:
      # No token type ids given: every position belongs to segment 0.
      token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32)

    # tf.variable_scope is a context manager used for variable sharing:
    # every variable created inside it lives under this scope.
    with tf.variable_scope(scope, default_name="bert"):
      with tf.variable_scope("embeddings"):
        # Perform embedding lookup on the word ids.
        (self.embedding_output, self.embedding_table) = embedding_lookup(
            input_ids=input_ids,
            vocab_size=config.vocab_size,
            embedding_size=config.hidden_size,
            initializer_range=config.initializer_range,
            word_embedding_name="word_embeddings",
            use_one_hot_embeddings=use_one_hot_embeddings)

        # Add positional embeddings and token type embeddings, then layer
        # normalize and perform dropout.
        self.embedding_output = embedding_postprocessor(
            input_tensor=self.embedding_output,
            use_token_type=True,
            token_type_ids=token_type_ids,
            token_type_vocab_size=config.type_vocab_size,
            token_type_embedding_name="token_type_embeddings",
            use_position_embeddings=True,
            position_embedding_name="position_embeddings",
            initializer_range=config.initializer_range,
            max_position_embeddings=config.max_position_embeddings,
            dropout_prob=config.hidden_dropout_prob)

      with tf.variable_scope("encoder"):
        # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
        # mask of shape [batch_size, seq_length, seq_length] which is used
        # for the attention scores.
        attention_mask = create_attention_mask_from_input_mask(
            input_ids, input_mask)

        # Run the stacked transformer.
        # `sequence_output` shape = [batch_size, seq_length, hidden_size].
        self.all_encoder_layers = transformer_model(
            input_tensor=self.embedding_output,
            attention_mask=attention_mask,
            hidden_size=config.hidden_size,
            num_hidden_layers=config.num_hidden_layers,
            num_attention_heads=config.num_attention_heads,
            intermediate_size=config.intermediate_size,
            intermediate_act_fn=get_activation(config.hidden_act),
            hidden_dropout_prob=config.hidden_dropout_prob,
            attention_probs_dropout_prob=config.attention_probs_dropout_prob,
            initializer_range=config.initializer_range,
            do_return_all_layers=True)

      # [-1] selects the last encoder layer.
      self.sequence_output = self.all_encoder_layers[-1]

      # The "pooler" converts the encoded sequence tensor of shape
      # [batch_size, seq_length, hidden_size] to a tensor of shape
      # [batch_size, hidden_size]. This is necessary for segment-level
      # (or segment-pair-level) classification tasks where we need a fixed
      # dimensional representation of the segment.
      with tf.variable_scope("pooler"):
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token. We assume that this has been pre-trained.
        # tf.squeeze removes dimensions of size 1 from the shape of a tensor.
        first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1)
        self.pooled_output = tf.layers.dense(
            first_token_tensor,       # the first-token tensor fed into the dense layer
            config.hidden_size,       # output size = hidden size
            activation=tf.tanh,       # tanh (hyperbolic tangent) activation
            kernel_initializer=create_initializer(config.initializer_range))
    # end of constructor

  def get_pooled_output(self):
    return self.pooled_output

  def get_sequence_output(self):
    """Gets final hidden layer of encoder.

    Returns:
      float Tensor of shape [batch_size, seq_length, hidden_size] corresponding
      to the final hidden of the transformer encoder.
    """
    return self.sequence_output

  def get_all_encoder_layers(self):
    return self.all_encoder_layers

  def get_embedding_output(self):
    """Gets output of the embedding lookup (i.e., input to the transformer).

    Returns:
      float Tensor of shape [batch_size, seq_length, hidden_size] corresponding
      to the output of the embedding layer, after summing the word
      embeddings with the positional embeddings and the token type embeddings,
      then performing layer normalization. This is the input to the transformer.
    """
    return self.embedding_output

  def get_embedding_table(self):
    return self.embedding_table
GELU activation

■ GELU activation function
def gelu(x):
  """Gaussian Error Linear Unit.

  This is a smoother version of the RELU.
  Original paper: https://arxiv.org/abs/1606.08415

  Args:
    x: float Tensor to perform activation.

  Returns:
    `x` with the GELU activation applied.
  """
  # tf.tanh is the hyperbolic tangent; this is the tanh approximation of GELU.
  cdf = 0.5 * (1.0 + tf.tanh(
      (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
  return x * cdf
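A minimal NumPy sketch of the same tanh approximation, useful for checking values by hand (illustrative only, not part of modeling.py):

```python
import numpy as np

def gelu_np(x):
    # 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
    return 0.5 * x * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * x ** 3)))

print(gelu_np(np.array([-2.0, 0.0, 2.0])))  # approximately [-0.045, 0.0, 1.955]
```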
Getting an activation function

■ Look up an activation function by its string name
def get_activation(activation_string):
  """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`.

  Args:
    activation_string: String name of the activation function.

  Returns:
    A Python function corresponding to the activation function. If
    `activation_string` is None, empty, or "linear", this will return None.
    If `activation_string` is not a string, it will return `activation_string`.

  Raises:
    ValueError: The `activation_string` does not correspond to a known
      activation.
  """

  # We assume that anything that's not a string is already an activation
  # function, so we just return it.
  if not isinstance(activation_string, six.string_types):
    return activation_string

  if not activation_string:   # None or empty string
    return None

  act = activation_string.lower()   # lower-case the name
  if act == "linear":
    return None
  elif act == "relu":
    return tf.nn.relu
  elif act == "gelu":
    return gelu
  elif act == "tanh":
    return tf.tanh
  else:
    raise ValueError("Unsupported activation: %s" % act)
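A few small checks of the mapping, as a sketch of the expected behaviour:

```python
assert get_activation("gelu") is gelu              # name -> function
assert get_activation("linear") is None            # "linear" means no activation
assert get_activation(tf.nn.relu) is tf.nn.relu    # non-strings pass through unchanged
```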
Reading a checkpoint

■ Build the assignment map from a checkpoint
def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
  """Compute the union of the current variables and checkpoint variables."""
  assignment_map = {}
  initialized_variable_names = {}

  # An OrderedDict keeps its keys in insertion order; it does not sort them.
  name_to_variable = collections.OrderedDict()
  for var in tvars:
    name = var.name
    m = re.match("^(.*):\\d+$", name)
    if m is not None:
      name = m.group(1)
    name_to_variable[name] = var

  init_vars = tf.train.list_variables(init_checkpoint)

  assignment_map = collections.OrderedDict()
  for x in init_vars:
    (name, var) = (x[0], x[1])
    if name not in name_to_variable:
      continue
    assignment_map[name] = name
    initialized_variable_names[name] = 1
    initialized_variable_names[name + ":0"] = 1

  return (assignment_map, initialized_variable_names)
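The regex above strips the ":0" output-index suffix that TensorFlow appends to variable names, so that graph variables can be matched against checkpoint names. A small illustration (the variable name is just an example):

```python
import re

name = "bert/encoder/layer_0/attention/self/query/kernel:0"  # typical tf.Variable.name
m = re.match("^(.*):\\d+$", name)
print(m.group(1))  # bert/encoder/layer_0/attention/self/query/kernel
```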
Dropout, layer normalization, initialization

■ Dropout
def dropout(input_tensor, dropout_prob):
  """Perform dropout.

  Args:
    input_tensor: float Tensor.
    dropout_prob: Python float. The probability of dropping out a value (NOT of
      *keeping* a dimension as in `tf.nn.dropout`).

  Returns:
    A version of `input_tensor` with dropout applied.
  """
  if dropout_prob is None or dropout_prob == 0.0:
    # Nothing to drop: return the input unchanged.
    return input_tensor

  # Note on conventions:
  #   tf.nn.dropout takes keep_prob, the probability of *keeping* an element;
  #   tf.layers.dropout takes rate, the probability of *dropping* an element.
  # An element is either kept or dropped, so keep_prob = 1 - dropout_prob.
  output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob)
  return output


■ Layer normalization
def layer_norm(input_tensor, name=None):
  """Run layer normalization on the last dimension of the tensor."""
  return tf.contrib.layers.layer_norm(
      inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name)


■ Two-in-one helper: layer normalization followed by dropout
def layer_norm_and_dropout(input_tensor, dropout_prob, name=None):
  """Runs layer normalization followed by dropout."""
  output_tensor = layer_norm(input_tensor, name)
  output_tensor = dropout(output_tensor, dropout_prob)
  return output_tensor


■ Weight initializer
def create_initializer(initializer_range=0.02):
  """Creates a `truncated_normal_initializer` with the given range."""
  # initializer_range is the standard deviation (stddev) of the distribution.
  # tf.truncated_normal_initializer draws from a normal distribution with the
  # given mean and stddev, and redraws any sample that falls more than two
  # standard deviations from the mean. This is a recommended initializer for
  # neural network weights and filters.
  return tf.truncated_normal_initializer(stddev=initializer_range)
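A minimal NumPy sketch of what layer normalization over the last axis computes (the learned gain and bias of the real implementation are omitted; illustrative only):

```python
import numpy as np

def layer_norm_np(x, eps=1e-12):
    # Normalize each vector along the last axis to zero mean and unit variance.
    mean = x.mean(axis=-1, keepdims=True)
    var = x.var(axis=-1, keepdims=True)
    return (x - mean) / np.sqrt(var + eps)

x = np.array([[1.0, 2.0, 3.0], [10.0, 10.0, 10.0]])
print(layer_norm_np(x))  # the constant second row normalizes to all zeros
```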
Embedding lookup

■ Look up the embedding vectors for the word ids
def embedding_lookup(input_ids,
                     vocab_size,
                     embedding_size=128,
                     initializer_range=0.02,
                     word_embedding_name="word_embeddings",
                     use_one_hot_embeddings=False):
  """Looks up words embeddings for id tensor.

  Args:
    input_ids: int32 Tensor of shape [batch_size, seq_length] containing word
      ids.
    vocab_size: int. Size of the embedding vocabulary.
    embedding_size: int. Width of the word embeddings.
    initializer_range: float. Embedding initialization range (stddev).
    word_embedding_name: string. Name of the embedding table.
    use_one_hot_embeddings: bool. If True, use one-hot method for word
      embeddings. If False, use `tf.gather()`.

  Returns:
    float Tensor of shape [batch_size, seq_length, embedding_size].
  """
  # This function assumes that the input is of shape [batch_size, seq_length,
  # num_inputs].
  #
  # If the input is a 2D tensor of shape [batch_size, seq_length], we
  # reshape to [batch_size, seq_length, 1], i.e. add a third dimension of size 1.
  if input_ids.shape.ndims == 2:
    input_ids = tf.expand_dims(input_ids, axis=[-1])

  # The embedding table.
  embedding_table = tf.get_variable(
      name=word_embedding_name,
      shape=[vocab_size, embedding_size],
      initializer=create_initializer(initializer_range))

  # Flatten the ids to one dimension. In tf.reshape, a dimension given as -1
  # is inferred from the total number of elements divided by the sizes of the
  # other dimensions; at most one -1 may appear in the shape list.
  flat_input_ids = tf.reshape(input_ids, [-1])
  if use_one_hot_embeddings:
    # tf.one_hot converts the ids into one-hot vectors; multiplying them by
    # the embedding table selects the corresponding rows.
    one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size)
    output = tf.matmul(one_hot_input_ids, embedding_table)
  else:
    output = tf.gather(embedding_table, flat_input_ids)

  input_shape = get_shape_list(input_ids)

  # Reshape back to [batch_size, seq_length, num_inputs * embedding_size].
  output = tf.reshape(output,
                      input_shape[0:-1] + [input_shape[-1] * embedding_size])
  return (output, embedding_table)
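The two lookup paths are mathematically equivalent: multiplying one-hot vectors by the table selects exactly the same rows as a gather. A small NumPy sketch (illustrative only):

```python
import numpy as np

vocab_size, embedding_size = 6, 4
table = np.arange(vocab_size * embedding_size, dtype=np.float32).reshape(
    vocab_size, embedding_size)
ids = np.array([1, 3, 3, 0])

one_hot = np.eye(vocab_size, dtype=np.float32)[ids]   # [4, vocab_size]
via_matmul = one_hot @ table                          # one-hot path (TPU-friendly)
via_gather = table[ids]                               # gather path

assert np.array_equal(via_matmul, via_gather)
```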
Embedding post-processing

■ Embedding post-processing
def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1):
  """Performs various post-processing on a word embedding tensor.

  Args:
    input_tensor: float Tensor of shape [batch_size, seq_length,
      embedding_size].
    use_token_type: bool. Whether to add embeddings for `token_type_ids`.
    token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      Must be specified if `use_token_type` is True.
    token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
    token_type_embedding_name: string. The name of the embedding table variable
      for token type ids.
    use_position_embeddings: bool. Whether to add position embeddings for the
      position of each token in the sequence.
    position_embedding_name: string. The name of the embedding table variable
      for positional embeddings.
    initializer_range: float. Range of the weight initialization (stddev).
    max_position_embeddings: int. Maximum sequence length that might ever be
      used with this model. This can be longer than the sequence length of
      input_tensor, but cannot be shorter.
    dropout_prob: float. Dropout probability applied to the final output
      tensor.

  Returns:
    float tensor with same shape as `input_tensor`.

  Raises:
    ValueError: One of the tensor shapes or input values is invalid.
  """
  input_shape = get_shape_list(input_tensor, expected_rank=3)
  batch_size = input_shape[0]
  seq_length = input_shape[1]
  width = input_shape[2]

  output = input_tensor

  # Token type (segment) embeddings.
  if use_token_type:
    if token_type_ids is None:
      raise ValueError("`token_type_ids` must be specified if"
                       "`use_token_type` is True.")
    token_type_table = tf.get_variable(
        name=token_type_embedding_name,
        shape=[token_type_vocab_size, width],
        initializer=create_initializer(initializer_range))
    # This vocab will be small so we always do one-hot here, since it is always
    # faster for a small vocabulary.
    flat_token_type_ids = tf.reshape(token_type_ids, [-1])   # flatten to 1D
    one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
    # Multiplying the one-hot ids by the table selects the embedding vectors.
    token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
    token_type_embeddings = tf.reshape(token_type_embeddings,
                                       [batch_size, seq_length, width])
    output += token_type_embeddings   # add the segment information

  # Position embeddings.
  if use_position_embeddings:
    # Assert that seq_length <= max_position_embeddings.
    assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
    # tf.control_dependencies is TensorFlow's execution-order mechanism: the
    # ops created inside this block only run after `assert_op` has run.
    with tf.control_dependencies([assert_op]):
      # tf.get_variable creates (or reuses) a variable; common initializers are
      # tf.constant_initializer, tf.random_normal_initializer,
      # tf.truncated_normal_initializer and tf.random_uniform_initializer.
      full_position_embeddings = tf.get_variable(
          name=position_embedding_name,
          shape=[max_position_embeddings, width],
          initializer=create_initializer(initializer_range))
      # Since the position embedding table is a learned variable, we create it
      # using a (long) sequence length `max_position_embeddings`. The actual
      # sequence length might be shorter than this, for faster training of
      # tasks that do not have long sequences.
      #
      # So `full_position_embeddings` is effectively an embedding table
      # for position [0, 1, 2, ..., max_position_embeddings-1], and the current
      # sequence has positions [0, 1, 2, ... seq_length-1], so we can just
      # perform a slice.
      # tf.slice(inputs, begin, size) extracts a sub-tensor starting at `begin`
      # with the given `size`.
      position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                     [seq_length, -1])
      num_dims = len(output.shape.as_list())   # number of dimensions

      # Only the last two dimensions are relevant (`seq_length` and `width`),
      # so we broadcast among the first dimensions, which is typically just
      # the batch size.
      position_broadcast_shape = []
      for _ in range(num_dims - 2):
        position_broadcast_shape.append(1)
      position_broadcast_shape.extend([seq_length, width])
      position_embeddings = tf.reshape(position_embeddings,
                                       position_broadcast_shape)
      output += position_embeddings   # add the position information

  output = layer_norm_and_dropout(output, dropout_prob)   # normalize, then dropout
  return output
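A NumPy sketch of the broadcasting trick at the end: the [seq_length, width] slice is reshaped to [1, seq_length, width] so that the same position vectors are added to every example in the batch (illustrative only):

```python
import numpy as np

batch_size, seq_length, width, max_pos = 2, 3, 4, 8
output = np.zeros((batch_size, seq_length, width), dtype=np.float32)
full_position_embeddings = np.random.randn(max_pos, width).astype(np.float32)

position_embeddings = full_position_embeddings[:seq_length]   # slice: [seq_length, width]
output += position_embeddings.reshape(1, seq_length, width)   # broadcast over the batch

assert np.allclose(output[0], output[1])  # every example received the same position vectors
```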
Creating the attention mask

■ Create the attention mask from the input mask
def create_attention_mask_from_input_mask(from_tensor, to_mask):
  """Create 3D attention mask from a 2D tensor mask.

  Args:
    from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...].
    to_mask: int32 Tensor of shape [batch_size, to_seq_length].

  Returns:
    float Tensor of shape [batch_size, from_seq_length, to_seq_length].
  """
  from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
  batch_size = from_shape[0]
  from_seq_length = from_shape[1]

  to_shape = get_shape_list(to_mask, expected_rank=2)
  to_seq_length = to_shape[1]

  # Reshape, then cast to float32.
  to_mask = tf.cast(
      tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32)

  # We don't assume that `from_tensor` is a mask (although it could be). We
  # don't actually care if we attend *from* padding tokens (only *to* padding)
  # tokens so we create a tensor of all ones.
  #
  # `broadcast_ones` = [batch_size, from_seq_length, 1]
  broadcast_ones = tf.ones(
      shape=[batch_size, from_seq_length, 1], dtype=tf.float32)

  # Here we broadcast along two dimensions to create the mask.
  mask = broadcast_ones * to_mask

  return mask
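The broadcast multiplies a [B, F, 1] tensor of ones by a [B, 1, T] mask, producing a [B, F, T] mask in which every query position can see exactly the non-padding key positions. A NumPy sketch (illustrative only):

```python
import numpy as np

input_mask = np.array([[1, 1, 0]], dtype=np.float32)    # one sequence, last token is padding
to_mask = input_mask.reshape(1, 1, 3)                   # [B, 1, T]
broadcast_ones = np.ones((1, 3, 1), dtype=np.float32)   # [B, F, 1]
mask = broadcast_ones * to_mask                         # [B, F, T]
print(mask[0])
# [[1. 1. 0.]
#  [1. 1. 0.]
#  [1. 1. 0.]]
```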
The attention layer

■ Attention layer
def attention_layer(from_tensor,
                    to_tensor,
                    attention_mask=None,
                    num_attention_heads=1,
                    size_per_head=512,
                    query_act=None,
                    key_act=None,
                    value_act=None,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    do_return_2d_tensor=False,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None):
  """Performs multi-headed attention from `from_tensor` to `to_tensor`.

  This is an implementation of multi-headed attention based on "Attention
  is all you Need". If `from_tensor` and `to_tensor` are the same, then
  this is self-attention. Each timestep in `from_tensor` attends to the
  corresponding sequence in `to_tensor`, and returns a fixed-width vector.

  This function first projects `from_tensor` into a "query" tensor and
  `to_tensor` into "key" and "value" tensors. These are (effectively) a list
  of tensors of length `num_attention_heads`, where each tensor is of shape
  [batch_size, seq_length, size_per_head].

  Then, the query and key tensors are dot-producted and scaled. These are
  softmaxed to obtain attention probabilities. The value tensors are then
  interpolated by these probabilities, then concatenated back to a single
  tensor and returned.

  In practice, the multi-headed attention are done with transposes and
  reshapes rather than actual separate tensors.

  Args:
    from_tensor: float Tensor of shape [batch_size, from_seq_length,
      from_width].
    to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].
    attention_mask: (optional) int32 Tensor of shape [batch_size,
      from_seq_length, to_seq_length]. The values should be 1 or 0. The
      attention scores will effectively be set to -infinity for any positions
      in the mask that are 0, and will be unchanged for positions that are 1.
    num_attention_heads: int. Number of attention heads.
    size_per_head: int. Size of each attention head.
    query_act: (optional) Activation function for the query transform.
    key_act: (optional) Activation function for the key transform.
    value_act: (optional) Activation function for the value transform.
    attention_probs_dropout_prob: (optional) float. Dropout probability of the
      attention probabilities.
    initializer_range: float. Range of the weight initializer (stddev of the
      truncated normal).
    do_return_2d_tensor: bool. If True, the output will be of shape
      [batch_size * from_seq_length, num_attention_heads * size_per_head].
      If False, the output will be of shape
      [batch_size, from_seq_length, num_attention_heads * size_per_head].
    batch_size: (Optional) int. If the input is 2D, this might be the batch
      size of the 3D version of the `from_tensor` and `to_tensor`.
    from_seq_length: (Optional) If the input is 2D, this might be the seq
      length of the 3D version of the `from_tensor`.
    to_seq_length: (Optional) If the input is 2D, this might be the seq length
      of the 3D version of the `to_tensor`.

  Returns:
    float Tensor of shape [batch_size, from_seq_length,
      num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is
      true, this will be of shape [batch_size * from_seq_length,
      num_attention_heads * size_per_head]).

  Raises:
    ValueError: Any of the arguments or tensor shapes are invalid.
  """

  # Reshape + transpose, to lay the tensor out for computing the scores.
  def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                           seq_length, width):
    output_tensor = tf.reshape(
        input_tensor, [batch_size, seq_length, num_attention_heads, width])

    # tf.transpose with perm=[0, 2, 1, 3] swaps the second and third axes, so
    # the head dimension comes before the sequence dimension.
    output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
    return output_tensor

  from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
  to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])

  if len(from_shape) != len(to_shape):
    raise ValueError(
        "The rank of `from_tensor` must match the rank of `to_tensor`.")

  if len(from_shape) == 3:     # rank-3 input
    batch_size = from_shape[0]
    from_seq_length = from_shape[1]
    to_seq_length = to_shape[1]
  elif len(from_shape) == 2:   # rank-2 input
    if (batch_size is None or from_seq_length is None or to_seq_length is None):
      raise ValueError(
          "When passing in rank 2 tensors to attention_layer, the values "
          "for `batch_size`, `from_seq_length`, and `to_seq_length` "
          "must all be specified.")

  # Scalar dimensions referenced here:
  #   B = batch size (number of sequences)
  #   F = `from_tensor` sequence length
  #   T = `to_tensor` sequence length
  #   N = `num_attention_heads`
  #   H = `size_per_head`

  from_tensor_2d = reshape_to_matrix(from_tensor)   # reshape to a 2D matrix
  to_tensor_2d = reshape_to_matrix(to_tensor)       # reshape to a 2D matrix

  # `query_layer` = [B*F, N*H]
  query_layer = tf.layers.dense(       # fully connected (dense) layer
      from_tensor_2d,
      num_attention_heads * size_per_head,
      activation=query_act,
      name="query",
      kernel_initializer=create_initializer(initializer_range))

  # `key_layer` = [B*T, N*H]
  key_layer = tf.layers.dense(         # fully connected (dense) layer
      to_tensor_2d,
      num_attention_heads * size_per_head,
      activation=key_act,
      name="key",
      kernel_initializer=create_initializer(initializer_range))

  # `value_layer` = [B*T, N*H]
  value_layer = tf.layers.dense(       # fully connected (dense) layer
      to_tensor_2d,
      num_attention_heads * size_per_head,
      activation=value_act,
      name="value",
      kernel_initializer=create_initializer(initializer_range))

  # `query_layer` = [B, N, F, H]
  query_layer = transpose_for_scores(query_layer, batch_size,
                                     num_attention_heads, from_seq_length,
                                     size_per_head)

  # `key_layer` = [B, N, T, H]
  key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads,
                                   to_seq_length, size_per_head)

  # Take the dot product between "query" and "key" to get the raw
  # attention scores, then scale by 1/sqrt(size_per_head).
  # `attention_scores` = [B, N, F, T]
  attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
  attention_scores = tf.multiply(attention_scores,
                                 1.0 / math.sqrt(float(size_per_head)))

  if attention_mask is not None:
    # `attention_mask` = [B, 1, F, T]
    attention_mask = tf.expand_dims(attention_mask, axis=[1])

    # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
    # masked positions, this operation will create a tensor which is 0.0 for
    # positions we want to attend and -10000.0 for masked positions.
    adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0

    # Since we are adding it to the raw scores before the softmax, this is
    # effectively the same as removing these entirely.
    attention_scores += adder

  # Normalize the attention scores to probabilities.
  # `attention_probs` = [B, N, F, T]
  attention_probs = tf.nn.softmax(attention_scores)

  # This is actually dropping out entire tokens to attend to, which might
  # seem a bit unusual, but is taken from the original Transformer paper.
  attention_probs = dropout(attention_probs, attention_probs_dropout_prob)

  # `value_layer` = [B, T, N, H]
  value_layer = tf.reshape(
      value_layer,
      [batch_size, to_seq_length, num_attention_heads, size_per_head])

  # `value_layer` = [B, N, T, H]  (transpose the second and third axes)
  value_layer = tf.transpose(value_layer, [0, 2, 1, 3])

  # `context_layer` = [B, N, F, H]  (attention probabilities x values)
  context_layer = tf.matmul(attention_probs, value_layer)

  # `context_layer` = [B, F, N, H]
  context_layer = tf.transpose(context_layer, [0, 2, 1, 3])

  if do_return_2d_tensor:
    # `context_layer` = [B*F, N*H]
    context_layer = tf.reshape(
        context_layer,
        [batch_size * from_seq_length, num_attention_heads * size_per_head])
  else:
    # `context_layer` = [B, F, N*H]
    context_layer = tf.reshape(
        context_layer,
        [batch_size, from_seq_length, num_attention_heads * size_per_head])

  return context_layer
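A compact NumPy sketch of the core math for a single head: scaled dot-product scores, the additive -10000 mask, softmax, and the weighted sum of values (illustrative only, batch and head dimensions omitted):

```python
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

F, T, H = 2, 3, 4                          # from_seq_length, to_seq_length, size_per_head
q = np.random.randn(F, H)
k = np.random.randn(T, H)
v = np.random.randn(T, H)
mask = np.array([[1, 1, 0], [1, 1, 0]], dtype=np.float32)  # last key position is padding

scores = q @ k.T / np.sqrt(H)              # [F, T], scaled dot product
scores += (1.0 - mask) * -10000.0          # masked positions become a large negative number
probs = softmax(scores, axis=-1)           # each row sums to 1; the masked column is ~0
context = probs @ v                        # [F, H], values interpolated by the probabilities
print(probs.round(3))
```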
The Transformer model

■ The transformer_model function
def transformer_model(input_tensor,
                      attention_mask=None,
                      hidden_size=768,
                      num_hidden_layers=12,
                      num_attention_heads=12,
                      intermediate_size=3072,
                      intermediate_act_fn=gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False):
  """Multi-headed, multi-layer Transformer from "Attention is All You Need".

  This is almost an exact implementation of the original Transformer encoder.

  See the original paper:
  https://arxiv.org/abs/1706.03762

  Also see:
  https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py

  Args:
    input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
    attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length,
      seq_length], with 1 for positions that can be attended to and 0 in
      positions that should not be.
    hidden_size: int. Hidden size of the Transformer.
    num_hidden_layers: int. Number of layers (blocks) in the Transformer.
    num_attention_heads: int. Number of attention heads in the Transformer.
    intermediate_size: int. The size of the "intermediate" (a.k.a., feed
      forward) layer.
    intermediate_act_fn: function. The non-linear activation function to apply
      to the output of the intermediate/feed-forward layer.
    hidden_dropout_prob: float. Dropout probability for the hidden layers.
    attention_probs_dropout_prob: float. Dropout probability of the attention
      probabilities.
    initializer_range: float. Range of the initializer (stddev of truncated
      normal); samples outside this range are truncated and re-drawn.
    do_return_all_layers: Whether to also return all layers or just the final
      layer.

  Returns:
    float Tensor of shape [batch_size, seq_length, hidden_size], the final
    hidden layer of the Transformer.

  Raises:
    ValueError: A Tensor shape or parameter is invalid.
  """
  if hidden_size % num_attention_heads != 0:
    # The hidden size must be an exact multiple of the number of heads.
    raise ValueError(
        "The hidden size (%d) is not a multiple of the number of attention "
        "heads (%d)" % (hidden_size, num_attention_heads))

  attention_head_size = int(hidden_size / num_attention_heads)
  input_shape = get_shape_list(input_tensor, expected_rank=3)
  batch_size = input_shape[0]
  seq_length = input_shape[1]
  input_width = input_shape[2]

  # The Transformer performs sum residuals on all layers so the input needs
  # to be the same as the hidden size.
  if input_width != hidden_size:
    raise ValueError("The width of the input tensor (%d) != hidden size (%d)" %
                     (input_width, hidden_size))

  # We keep the representation as a 2D tensor to avoid re-shaping it back and
  # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on
  # the GPU/CPU but may not be free on the TPU, so we want to minimize them to
  # help the optimizer.
  prev_output = reshape_to_matrix(input_tensor)   # flatten the input to 2D

  all_layer_outputs = []   # collects the output of every layer
  # tf.variable_scope is again used as a context manager, so each layer's
  # variables live under their own "layer_N" scope and can be shared.
  for layer_idx in range(num_hidden_layers):   # iterate over the layers
    with tf.variable_scope("layer_%d" % layer_idx):
      layer_input = prev_output   # this layer's input is the previous layer's output

      with tf.variable_scope("attention"):
        attention_heads = []   # the attention heads of this layer
        with tf.variable_scope("self"):
          attention_head = attention_layer(   # the heads are produced by one attention layer
              from_tensor=layer_input,        # from == to, i.e. self-attention
              to_tensor=layer_input,
              attention_mask=attention_mask,
              num_attention_heads=num_attention_heads,
              size_per_head=attention_head_size,
              attention_probs_dropout_prob=attention_probs_dropout_prob,
              initializer_range=initializer_range,
              do_return_2d_tensor=True,
              batch_size=batch_size,
              from_seq_length=seq_length,
              to_seq_length=seq_length)
          attention_heads.append(attention_head)

        attention_output = None
        if len(attention_heads) == 1:
          attention_output = attention_heads[0]
        else:
          # In the case where we have other sequences, we just concatenate
          # them to the self-attention head before the projection.
          # tf.concat joins tensors along the given axis; axis=-1 is the last
          # dimension (for a rank-3 tensor this is the same as axis=2).
          attention_output = tf.concat(attention_heads, axis=-1)

        # Run a linear projection of `hidden_size` then add a residual
        # with `layer_input`.
        with tf.variable_scope("output"):
          attention_output = tf.layers.dense(   # dense (fully connected) layer
              attention_output,
              hidden_size,
              kernel_initializer=create_initializer(initializer_range))
          attention_output = dropout(attention_output, hidden_dropout_prob)
          attention_output = layer_norm(attention_output + layer_input)

      # The activation is only applied to the "intermediate" hidden layer.
      with tf.variable_scope("intermediate"):
        intermediate_output = tf.layers.dense(
            attention_output,      # the attention sub-layer's output is this one's input
            intermediate_size,
            activation=intermediate_act_fn,
            kernel_initializer=create_initializer(initializer_range))

      # Down-project back to `hidden_size` then add the residual.
      with tf.variable_scope("output"):
        layer_output = tf.layers.dense(
            intermediate_output,
            hidden_size,
            kernel_initializer=create_initializer(initializer_range))
        layer_output = dropout(layer_output, hidden_dropout_prob)
        layer_output = layer_norm(layer_output + attention_output)
        prev_output = layer_output
        all_layer_outputs.append(layer_output)   # keep this layer's output

  if do_return_all_layers:
    final_outputs = []
    for layer_output in all_layer_outputs:
      # Reshape every layer's output back to the original (3D) shape.
      final_output = reshape_from_matrix(layer_output, input_shape)
      final_outputs.append(final_output)
    return final_outputs
  else:
    final_output = reshape_from_matrix(prev_output, input_shape)
    return final_output
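Stripped of the TensorFlow plumbing, each encoder block is two post-layer-norm residual sub-layers. The sketch below is schematic only: `attention`, `ffn`, `layer_norm` and `dropout` are placeholder callables standing in for the ops above, not functions defined in this file.

```python
def encoder_block(x, attention, ffn, layer_norm, dropout):
    # x: [batch*seq, hidden]; attention(): multi-head self-attention + projection;
    # ffn(): hidden -> intermediate (GELU) -> hidden.
    a = layer_norm(dropout(attention(x)) + x)   # attention sub-layer + residual
    return layer_norm(dropout(ffn(a)) + a)      # feed-forward sub-layer + residual
```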
Helper functions

def get_shape_list(tensor, expected_rank=None, name=None):
  """Returns a list of the shape of tensor, preferring static dimensions.

  Args:
    tensor: A tf.Tensor object to find the shape of.
    expected_rank: (optional) int. The expected rank of `tensor`. If this is
      specified and the `tensor` has a different rank, an exception will be
      thrown.
    name: Optional name of the tensor for the error message.

  Returns:
    A list of dimensions of the shape of tensor. All static dimensions will
    be returned as python integers, and dynamic dimensions will be returned
    as tf.Tensor scalars.
  """
  if name is None:                  # no name given: use the tensor's own name
    name = tensor.name

  if expected_rank is not None:     # if an expected rank is given, check it
    assert_rank(tensor, expected_rank, name)

  shape = tensor.shape.as_list()    # static shape as a Python list

  non_static_indexes = []
  for (index, dim) in enumerate(shape):
    if dim is None:                 # None means this is a dynamic dimension
      non_static_indexes.append(index)

  if not non_static_indexes:        # all dimensions are static: return directly
    return shape

  dyn_shape = tf.shape(tensor)      # dynamic shape tensor
  for index in non_static_indexes:  # fill in every dynamic dimension
    shape[index] = dyn_shape[index]
  return shape


■ Reshape a higher-rank tensor to a matrix
def reshape_to_matrix(input_tensor):
  """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix)."""
  ndims = input_tensor.shape.ndims
  if ndims < 2:        # fewer than 2 dimensions: error
    raise ValueError("Input tensor must have at least rank 2. Shape = %s" %
                     (input_tensor.shape))
  if ndims == 2:       # already rank 2: return unchanged
    return input_tensor

  width = input_tensor.shape[-1]   # size of the last dimension
  # Keep the last dimension; the leading dimensions are flattened together (-1).
  output_tensor = tf.reshape(input_tensor, [-1, width])
  return output_tensor


■ Reshape a matrix back to a higher-rank tensor
def reshape_from_matrix(output_tensor, orig_shape_list):
  """Reshapes a rank 2 tensor back to its original rank >= 2 tensor."""
  if len(orig_shape_list) == 2:
    return output_tensor

  output_shape = get_shape_list(output_tensor)

  orig_dims = orig_shape_list[0:-1]   # the original shape without its last dimension
  width = output_shape[-1]            # width is the size of the last dimension

  return tf.reshape(output_tensor, orig_dims + [width])


■ Rank assertion
def assert_rank(tensor, expected_rank, name=None):
  """Raises an exception if the tensor rank is not of the expected rank.

  Args:
    tensor: A tf.Tensor to check the rank of.
    expected_rank: Python integer or list of integers, expected rank.
    name: Optional name of the tensor for the error message.

  Raises:
    ValueError: If the expected shape doesn't match the actual shape.
  """
  if name is None:     # no name given: use the tensor's variable name
    name = tensor.name

  expected_rank_dict = {}
  if isinstance(expected_rank, six.integer_types):   # a single integer rank
    expected_rank_dict[expected_rank] = True
  else:                                              # a list of allowed ranks
    for x in expected_rank:
      expected_rank_dict[x] = True

  actual_rank = tensor.shape.ndims
  if actual_rank not in expected_rank_dict:
    scope_name = tf.get_variable_scope().name
    raise ValueError(
        "For the tensor `%s` in scope `%s`, the actual rank "
        "`%d` (shape = %s) is not equal to the expected rank `%s`" %
        (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank)))
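A NumPy sketch of the reshape round trip used throughout the encoder: flatten [batch, seq, hidden] to [batch*seq, hidden] for the dense layers, then restore the original shape afterwards (illustrative only):

```python
import numpy as np

batch_size, seq_length, hidden = 2, 3, 4
x = np.random.randn(batch_size, seq_length, hidden)

matrix = x.reshape(-1, x.shape[-1])                    # reshape_to_matrix: [batch*seq, hidden]
restored = matrix.reshape(batch_size, seq_length, -1)  # reshape_from_matrix
assert np.array_equal(x, restored)
```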