【代码解析】Transformer-XL 之 Relative

2019-07-20  本文已影响0人  PROoshio

[论文] 《Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context》 - CMU & Google Brain

def rel_multihead_attn(w, r, r_w_bias, r_r_bias, attn_mask, mems, d_model,
                       n_head, d_head, dropout, dropatt, is_training,
                       kernel_initializer, scope='rel_attn'):
# w : token emb
# r : 反向的绝对位置emb
# r_w_bias :公式中的u
# r_r_bias : 公式中的v
# attn_mask : attention mask矩阵
# mems : memory
def rel_shift(x):
    """Relative-shift trick from Transformer-XL.

    Realigns a (qlen, klen, bsz, n_head) score tensor so that scores indexed
    by absolute position become scores indexed by relative position, without
    an explicit gather.
    """
    orig_shape = tf.shape(x)
    # Prepend one zero slice along axis 1 (the position/key axis).
    padded = tf.pad(x, [[0, 0], [1, 0], [0, 0], [0, 0]])
    # Reshaping with the first two dims swapped misaligns rows — that
    # misalignment is exactly the desired diagonal shift.
    shifted = tf.reshape(
        padded,
        [orig_shape[1] + 1, orig_shape[0], orig_shape[2], orig_shape[3]])
    # Drop the leading (all-padding) slice, then restore the original layout.
    trimmed = tf.slice(shifted, [1, 0, 0, 0], [-1, -1, -1, -1])
    return tf.reshape(trimmed, orig_shape)
def _cache_mem(curr_out, prev_mem, mem_len=None):
  """Build the memory for the next segment from the current layer output.

  The cached memory carries no gradient (tf.stop_gradient), except in the
  mem_len == 0 case, which hands the previous memory back untouched.
  """
  if mem_len is not None and prev_mem is not None:
    if mem_len == 0:
      # Caching effectively disabled: pass prev_mem through unchanged.
      return prev_mem
    # Append the new output along time and keep only the last mem_len steps.
    new_mem = tf.concat([prev_mem, curr_out], 0)[-mem_len:]
  else:
    # No length limit or no previous memory: cache the full current output.
    new_mem = curr_out
  return tf.stop_gradient(new_mem)
def _create_mask(qlen, mlen, same_length=False):
  """Return a (qlen, qlen + mlen) attention mask (1 = blocked, 0 = visible).

  Memory positions (first mlen columns) are always visible; within the
  current segment only strictly-future tokens are masked (causal mask).
  With same_length=True (used at evaluation time in the reference code)
  each query additionally masks the oldest positions so that every query
  attends over a window of the same length.
  """
  ones = tf.ones([qlen, qlen])
  upper = tf.matrix_band_part(ones, 0, -1)   # upper triangle incl. diagonal
  diag = tf.matrix_band_part(ones, 0, 0)     # diagonal only
  mem_cols = tf.zeros([qlen, mlen])          # memory columns: never masked
  # upper - diag == strictly-upper triangle, i.e. mask future tokens only.
  mask = tf.concat([mem_cols, upper - diag], 1)
  if same_length:
    lower = tf.matrix_band_part(ones, -1, 0)  # lower triangle incl. diagonal
    # Additionally block the leading part of each row so every query's
    # attention span has equal length.
    mask = tf.concat([mask[:, :qlen] + lower - diag, mask[:, qlen:]], 1)
  return mask
def rel_shift(x):
    """Relative-shift trick (Transformer-XL, Appendix B).

    NOTE(review): this listing lost its indentation in the paste (body lines
    at column 0 — an IndentationError as written); restored here.  It is a
    duplicate of the rel_shift shown earlier in the article.
    """
    x_size = tf.shape(x)
    # Prepend one zero slice along axis 1, the relative-position axis.
    x = tf.pad(x, [[0, 0], [1, 0], [0, 0], [0, 0]])
    # Reshape with the first two dims swapped; the misalignment is the shift.
    x = tf.reshape(x, [x_size[1] + 1, x_size[0], x_size[2], x_size[3]])
    # Drop the leading (all-padding) slice.
    x = tf.slice(x, [1, 0, 0, 0], [-1, -1, -1, -1])
    # Restore the original (qlen, klen, bsz, n_head) layout.
    x = tf.reshape(x, x_size)
    return x

def rel_multihead_attn(w, r, r_w_bias, r_r_bias, attn_mask, mems, d_model,
                       n_head, d_head, dropout, dropatt, is_training,
                       kernel_initializer, scope='rel_attn'):
  """Relative multi-head attention layer of Transformer-XL.

  NOTE(review): the pasted listing lost its indentation (body lines at
  column 0 — an IndentationError as written); restored here to match the
  reference implementation's structure.

  Args:
    w: layer input / token embeddings — presumably (qlen, bsz, d_model);
       confirm against the caller.
    r: relative (reversed absolute) position embeddings.
    r_w_bias: the `u` bias of the paper (content bias).
    r_r_bias: the `v` bias of the paper (position bias).
    attn_mask: (qlen, klen) mask, 1 = blocked, 0 = visible.
    mems: cached hidden states from the previous segment (may be None or
      a low-rank placeholder, in which case no memory is used).
    d_model, n_head, d_head: model width, head count, per-head width.
    dropout, dropatt: dropout rates for output and attention probs.
    is_training: enables dropout.
    kernel_initializer: initializer for the dense projections.
    scope: variable scope name.

  Returns:
    The attended, residual-added, layer-normalized output, same shape as w.
  """
  scale = 1 / (d_head ** 0.5)  # standard 1/sqrt(d_head) attention scaling
  with tf.variable_scope(scope):
    qlen = tf.shape(w)[0]
    rlen = tf.shape(r)[0]
    bsz = tf.shape(w)[1]

    # Prepend memory along time so keys/values cover memory + current segment.
    cat = tf.concat([mems, w],
                    0) if mems is not None and mems.shape.ndims > 1 else w
    # Joint Q/K/V projection of the (memory-extended) input.
    w_heads = tf.layers.dense(cat, 3 * n_head * d_head, use_bias=False,
                              kernel_initializer=kernel_initializer, name='qkv')
    # Position embeddings get their own key projection.
    r_head_k = tf.layers.dense(r, n_head * d_head, use_bias=False,
                               kernel_initializer=kernel_initializer, name='r')

    w_head_q, w_head_k, w_head_v = tf.split(w_heads, 3, -1)
    # Queries come only from the current segment — memory is never queried.
    w_head_q = w_head_q[-qlen:]

    klen = tf.shape(w_head_k)[0]

    w_head_q = tf.reshape(w_head_q, [qlen, bsz, n_head, d_head])
    w_head_k = tf.reshape(w_head_k, [klen, bsz, n_head, d_head])
    w_head_v = tf.reshape(w_head_v, [klen, bsz, n_head, d_head])

    r_head_k = tf.reshape(r_head_k, [rlen, n_head, d_head])

    rw_head_q = w_head_q + r_w_bias  # q + u
    rr_head_q = w_head_q + r_r_bias  # q + v

    # Terms (a)+(c) of the paper: content-based attention scores.
    AC = tf.einsum('ibnd,jbnd->ijbn', rw_head_q, w_head_k)
    # Terms (b)+(d): position-based scores, realigned by rel_shift.
    BD = tf.einsum('ibnd,jnd->ijbn', rr_head_q, r_head_k)
    BD = rel_shift(BD)

    attn_score = (AC + BD) * scale
    attn_mask_t = attn_mask[:, :, None, None]
    # Additive masking: drive blocked logits to -1e30 before the softmax.
    attn_score = attn_score * (1 - attn_mask_t) - 1e30 * attn_mask_t

    attn_prob = tf.nn.softmax(attn_score, 1)  # softmax over the key axis
    attn_prob = tf.layers.dropout(attn_prob, dropatt, training=is_training)

    # Weighted sum of values, then merge the head dimensions.
    attn_vec = tf.einsum('ijbn,jbnd->ibnd', attn_prob, w_head_v)
    size_t = tf.shape(attn_vec)
    attn_vec = tf.reshape(attn_vec, [size_t[0], size_t[1], n_head * d_head])

    # Output projection back to d_model, dropout, residual + layer norm.
    attn_out = tf.layers.dense(attn_vec, d_model, use_bias=False,
                               kernel_initializer=kernel_initializer, name='o')
    attn_out = tf.layers.dropout(attn_out, dropout, training=is_training)

    output = tf.contrib.layers.layer_norm(attn_out + w, begin_norm_axis=-1)
  return output
上一篇下一篇

猜你喜欢

热点阅读