import math

import torch
import torch.nn as nn


class LayerNorm(nn.Module):
    """Construct a layernorm module (see citation for details). Layer normalization."""

    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        # Normalize over the last (feature) dimension, then rescale and shift
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2


class SublayerConnection(nn.Module):
    """
    A residual connection followed by a layer norm.
    Note: for code simplicity the norm is applied first (pre-norm) rather than last.
    """

    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Apply a residual connection to any sublayer with the same size."
        return x + self.dropout(sublayer(self.norm(x)))


class PositionwiseFeedForward(nn.Module):
    "Implements the FFN equation."

    def __init__(self, d_model, d_ff, dropout=0.1):
        """
        :param d_model: dimensionality of the token embeddings
        :param d_ff: hidden size of the feed-forward inner layer
        :param dropout: dropout rate
        """
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
        self.activation = GELU()

    def forward(self, x):
        return self.w_2(self.dropout(self.activation(self.w_1(x))))


class TransformerBlock(nn.Module):
    """
    Bidirectional Encoder = Transformer (self-attention)
    Transformer = MultiHead_Attention + Feed_Forward with sublayer connection
    """

    def __init__(self, hidden, attn_heads, feed_forward_hidden, dropout):
        """
        :param hidden: hidden size of transformer
        :param attn_heads: number of heads in multi-head attention
        :param feed_forward_hidden: feed-forward hidden size, usually 4*hidden_size
        :param dropout: dropout rate
        """
        super(TransformerBlock, self).__init__()
        self.attention = MultiHeadedAttention(h=attn_heads, d_model=hidden)
        self.feed_forward = PositionwiseFeedForward(d_model=hidden, d_ff=feed_forward_hidden, dropout=dropout)
        self.input_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        self.output_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, mask):
        x = self.input_sublayer(x, lambda _x: self.attention.forward(_x, mask=mask))
        x = self.output_sublayer(x, self.feed_forward)
        return self.dropout(x)
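To see how the pre-norm residual wrapper behaves, here is a minimal sanity-check sketch (the tensor shapes and dropout rate are illustrative assumptions, not values from the code above): it wraps an identity function as the "sublayer" and confirms that the residual path preserves the input shape.

import torch

sublayer_block = SublayerConnection(size=768, dropout=0.1)
x = torch.randn(2, 16, 768)            # (batch_size, seq_len, hidden)
out = sublayer_block(x, lambda t: t)   # identity sublayer: out = x + dropout(norm(x))
print(out.shape)                       # torch.Size([2, 16, 768])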
Activation function
BERT uses the GELU activation function in place of ReLU.
class GELU(nn.Module):
    """
    Paper Section 3.4, last paragraph: note that BERT uses GELU instead of ReLU.
    """

    def forward(self, x):
        # Tanh approximation of the Gaussian Error Linear Unit
        return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
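This is the tanh approximation of GELU. As a quick check (assuming a recent PyTorch version where F.gelu accepts approximate='tanh'), the hand-written module should closely match the built-in implementation:

import torch
import torch.nn.functional as F

x = torch.randn(8)
gelu = GELU()
# Compare the hand-written tanh approximation with PyTorch's built-in one.
print(torch.allclose(gelu(x), F.gelu(x, approximate='tanh'), atol=1e-6))  # True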
BERT network code
class BERT(nn.Module):
    """
    BERT model : Bidirectional Encoder Representations from Transformers.
    """

    def __init__(self, vocab_size, hidden=768, n_layers=12, attn_heads=12, dropout=0.1):
        """
        :param vocab_size: size of the total vocabulary
        :param hidden: BERT model hidden size
        :param n_layers: number of Transformer blocks (layers)
        :param attn_heads: number of attention heads
        :param dropout: dropout rate
        """
        super(BERT, self).__init__()
        self.hidden = hidden
        self.n_layers = n_layers
        self.attn_heads = attn_heads

        # paper noted they used 4*hidden_size for ff_network_hidden_size
        self.feed_forward_hidden = hidden * 4

        # embedding for BERT, sum of positional, segment, token embeddings
        self.embedding = BERTEmbedding(vocab_size=vocab_size, d_model=hidden)

        # multi-layer transformer blocks, deep network
        self.transformer_blocks = nn.ModuleList(
            [TransformerBlock(hidden, attn_heads, hidden * 4, dropout) for _ in range(n_layers)])

    def forward(self, x, segment_info):
        # attention masking for padded tokens
        # mask shape: torch.ByteTensor([batch_size, 1, seq_len, seq_len])
        mask = (x > 0).unsqueeze(1).repeat(1, x.size(1), 1).unsqueeze(1)

        # embed the indexed sequence into a sequence of vectors
        x = self.embedding(x, segment_info)

        # run over multiple transformer blocks
        for transformer in self.transformer_blocks:
            x = transformer.forward(x, mask)

        return x
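A minimal forward-pass sketch follows, assuming the BERTEmbedding and MultiHeadedAttention modules from the earlier parts of this post are in scope; the small vocabulary, hidden size, and sequence length are illustrative, not the paper's values.

import torch

vocab_size = 1000
model = BERT(vocab_size=vocab_size, hidden=256, n_layers=2, attn_heads=4)

tokens = torch.randint(1, vocab_size, (2, 20))   # (batch_size, seq_len); 0 is treated as padding
segments = torch.randint(1, 3, (2, 20))          # segment ids (1 or 2)
out = model(tokens, segments)
print(out.shape)                                 # torch.Size([2, 20, 256])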