fairseq

2019-04-11  VanJordan

Data processing stage

MOSESDECODER=../mosesdecoder
$MOSESDECODER/scripts/training/clean-corpus-n.perl $TEXT/train zh en $TEXT/train.clean 3 70
$MOSESDECODER/scripts/training/clean-corpus-n.perl $TEXT/valid zh en $TEXT/valid.clean 3 70
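
clean-corpus-n.perl keeps only sentence pairs where both sides are between 3 and 70 tokens long and writes the result to the .clean files; $TEXT is assumed to point at the directory holding the raw parallel data.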
# build subword vocab
SUBWORD_NMT=../subword-nmt/subword_nmt
NUM_OPS=32000

# learn codes and encode separately
CODES=codes.${NUM_OPS}.bpe
echo "Encoding subword with BPE using ops=${NUM_OPS}"
$SUBWORD_NMT/learn_bpe.py -s ${NUM_OPS} < $TEXT/train.clean.en > $TEXT/${CODES}.en
$SUBWORD_NMT/learn_bpe.py -s ${NUM_OPS} < $TEXT/train.clean.zh > $TEXT/${CODES}.zh

echo "Applying vocab to training"
$SUBWORD_NMT/apply_bpe.py -c $TEXT/${CODES}.en < $TEXT/train.clean.en > $TEXT/train.${NUM_OPS}.bpe.en
$SUBWORD_NMT/apply_bpe.py -c $TEXT/${CODES}.zh < $TEXT/train.clean.zh > $TEXT/train.${NUM_OPS}.bpe.zh

VOCAB=vocab.${NUM_OPS}.bpe
echo "Generating vocab: ${VOCAB}.en"
cat $TEXT/train.${NUM_OPS}.bpe.en | $SUBWORD_NMT/get_vocab.py > $TEXT/${VOCAB}.en

echo "Generating vocab: ${VOCAB}.zh"
cat $TEXT/train.${NUM_OPS}.bpe.zh | $SUBWORD_NMT/get_vocab.py > $TEXT/${VOCAB}.zh
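
get_vocab.py counts the frequency of every subword in the BPE-encoded training data. Passing that list to apply_bpe.py via --vocabulary (as below) restricts encoding to subwords actually seen in training, so the validation and test sets never contain units the model has not seen.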

# encode validation
echo "Applying vocab to valid"
$SUBWORD_NMT/apply_bpe.py -c $TEXT/${CODES}.en --vocabulary $TEXT/${VOCAB}.en < $TEXT/valid.clean.en > $TEXT/valid.${NUM_OPS}.bpe.en
$SUBWORD_NMT/apply_bpe.py -c $TEXT/${CODES}.zh --vocabulary $TEXT/${VOCAB}.zh < $TEXT/valid.clean.zh > $TEXT/valid.${NUM_OPS}.bpe.zh

# encode test
echo "Applying vocab to test"
$SUBWORD_NMT/apply_bpe.py -c $TEXT/${CODES}.en --vocabulary $TEXT/${VOCAB}.en < $TEXT/test.en > $TEXT/test.${NUM_OPS}.bpe.en
$SUBWORD_NMT/apply_bpe.py -c $TEXT/${CODES}.zh --vocabulary $TEXT/${VOCAB}.zh < $TEXT/test.zh > $TEXT/test.${NUM_OPS}.bpe.zh
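
A quick sanity check, my own addition rather than part of the original script: the parallel files must stay line-aligned after cleaning and BPE, so the zh and en line counts should match within each split.

for SPLIT in train valid test; do
    # the two counts for each split must be identical
    wc -l $TEXT/$SPLIT.${NUM_OPS}.bpe.zh $TEXT/$SPLIT.${NUM_OPS}.bpe.en
done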

Training stage

python3 preprocess.py \
    --source-lang en \
    --target-lang de \
    --srcdict=$EN_DICT_PATH \
    --tgtdict=$DE_DICT_PATH \
    --trainpref $TMP_DIR/train \
    --validpref $TMP_DIR/valid \
    --testpref $TMP_DIR/test \
    --destdir $DATA_DIR
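
Passing --srcdict/--tgtdict makes preprocess.py reuse existing dictionaries, for example the ones shipped with a pretrained model, instead of building new ones from the training data; the dictionaries must match the model whose weights you intend to load.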
# note: --share-all-embeddings would additionally require a joined source/target dictionary
python3 train.py $DATA_DIR \
    --arch transformer_vaswani_wmt_en_de_big \
    --optimizer adam --adam-betas '(0.9, 0.98)' \
    --clip-norm 0.0 \
    --lr-scheduler inverse_sqrt \
    --warmup-init-lr 1e-07 \
    --warmup-updates 4000 \
    --lr 0.001 \
    --min-lr 1e-09 \
    --dropout 0.3 \
    --weight-decay 0.0 \
    --criterion label_smoothed_cross_entropy \
    --label-smoothing 0.1 \
    --max-tokens $TRAIN_BS \
    --update-freq $GRADIENTS_ACCUMULATIONS \
    --tensorboard-logdir $TRAIN_DIR/log \
    --log-format json --save-dir $TRAIN_DIR/ckpt \
    --fp16
# --max-tokens is the batch size in tokens; with --update-freq N, gradients are
# accumulated over N batches, so the effective batch size is N times the above.
# Average 10 latest checkpoints:
python3 scripts/average_checkpoints.py \
    --inputs $TRAIN_DIR/ckpt \
    --num-epoch-checkpoints 10  \
    --output $TRAIN_DIR/ckpt/model.pt
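
With the averaged checkpoint in place, a minimal evaluation sketch (beam size and flags are my assumptions, not from the original post):

python3 generate.py $DATA_DIR \
    --path $TRAIN_DIR/ckpt/model.pt \
    --batch-size 128 --beam 5 --remove-bpe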
# generate preprocessed data
echo "Preprocessing datasets..."
DATADIR=data-bin/wmt17_zh_en
rm -rf $DATADIR
mkdir -p $DATADIR
fairseq-preprocess --source-lang zh --target-lang en \
    --trainpref $TEXT/train.${NUM_OPS}.bpe --validpref $TEXT/valid.${NUM_OPS}.bpe --testpref $TEXT/test.${NUM_OPS}.bpe \
    --thresholdsrc 0 --thresholdtgt 0 --workers 12 --destdir $DATADIR

# training
echo "Training begins"
mkdir -p checkpoints
fairseq-train $DATADIR \
  -a transformer --optimizer adam -s zh -t en \
  --label-smoothing 0.1 --dropout 0.3 --max-tokens 4000 \
  --min-lr '1e-09' --lr-scheduler inverse_sqrt --weight-decay 0.0001 \
  --criterion label_smoothed_cross_entropy --max-update 200000 \
  --warmup-updates 10000 --warmup-init-lr '1e-7' --lr '0.001' \
  --adam-betas '(0.9, 0.98)' --adam-eps '1e-09' --clip-norm 25.0 \
  --keep-last-epochs 20 --save-dir checkpoints --log-format json > train.log
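
Once training finishes, a hedged sketch of decoding the binarized test set and scoring it with fairseq's own tools (H lines carry the hypotheses, T lines the references):

fairseq-generate $DATADIR \
    --path checkpoints/checkpoint_best.pt \
    --batch-size 128 --beam 5 --remove-bpe > gen.out
grep ^H gen.out | cut -f3- > gen.out.sys
grep ^T gen.out | cut -f2- > gen.out.ref
fairseq-score --sys gen.out.sys --ref gen.out.ref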
# Training
SAVE="save/dynamic_conv_wmt16en2de"
mkdir -p $SAVE
python -m torch.distributed.launch --nproc_per_node 8 $(which fairseq-train) \
    data-bin/wmt16_en_de_bpe32k --fp16 --log-interval 100 --no-progress-bar \
    --max-update 30000 --share-all-embeddings --optimizer adam \
    --adam-betas '(0.9, 0.98)' --clip-norm 0.0 --weight-decay 0.0 \
    --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
    --update-freq 16 --keep-last-epochs 10 \
    --ddp-backend=no_c10d --max-tokens 3584 \
    --lr-scheduler cosine --warmup-init-lr 1e-07 --warmup-updates 10000 \
    --lr-shrink 1 --max-lr 0.001 --lr 1e-7 --min-lr 1e-9 \
    --t-mult 1 --lr-period-updates 20000 \
    --arch lightconv_wmt_en_de_big --save-dir $SAVE \
    --dropout 0.3 --attention-dropout 0.1 --weight-dropout 0.1 \
    --encoder-glu 1 --decoder-glu 1
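
Here 8 GPUs with --update-freq 16 accumulate gradients over 16 batches per update, so the effective batch size is that of 8 × 16 = 128 GPUs.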

The preprocessing script prepare-wmt14en2fr.sh

perl $CLEAN -ratio 1.5 $tmp/bpe.train $src $tgt $prep/train 1 250
perl $CLEAN -ratio 1.5 $tmp/bpe.valid $src $tgt $prep/valid 1 250
for L in $src $tgt; do
    cp $tmp/bpe.test.$L $prep/test.$L
done
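
Here -ratio 1.5 additionally drops pairs whose source/target length ratio exceeds 1.5, the 1 and 250 are the per-side token-length bounds, and the test set is copied through unfiltered so that BLEU is computed on the full set.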

interactive

If you want to run a pretrained model on your own data, you need the python interactive.py script, because python generate.py only works on the binarized test files in the data directory. As the name suggests, interactive.py produces translations interactively, one sentence pair at a time, so it is convenient to write a shell script that pipes the data through it.

In fairseq's output, lines starting with H are the hypotheses; cut -f3- drops the sentence id and score columns, and the sed call strips the "@@ " BPE continuation markers. This is the post-processing fragment reused in both pipelines below:

grep -P '^H' | cut -f3- | sed 's/@@\s*//g' > translation.en
cat $TEXT_FILE_PATH | python3 $SUBWORD_NMT/apply_bpe.py -c $SUBWORD_PATH | \
    python3 interactive.py $(dirname $CHECKPOINT_PATH) \
        --path $CHECKPOINT_PATH \
        --beam $BEAM_SIZE \
        --source-lang $SOURCE_LANG \
        --target-lang $TARGET_LANG  \
        --fp16 \
        --num-workers 12 \
        --batch-size $DECODER_BS \
        --buffer-size $DECODER_BS | grep -P '^H' |cut -f3- | sed 's/@@\s*//g' | tee $TRAIN_DIR/$TRANS-$FLAGS.translation.$TARGET_LANG
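
The same pipeline can instead let fairseq strip the BPE markers itself via --remove-bpe, replacing the sed call: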

cat $TEXT_FILE_PATH | python3 $SUBWORD_NMT/apply_bpe.py -c $SUBWORD_PATH | \
    python3 interactive.py $(dirname $CHECKPOINT_PATH) \
        --path $CHECKPOINT_PATH \
        --beam $BEAM_SIZE \
        --source-lang $SOURCE_LANG \
        --target-lang $TARGET_LANG  \
        --fp16 \
        --num-workers 12 \
        --remove-bpe \
        --batch-size $DECODER_BS \
        --buffer-size $DECODER_BS | grep -P '^H' |cut -f3-  | tee $TRAIN_DIR/$TRANS-$FLAGS.translation.$TARGET_LANG
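
To score the output, one option is moses multi-bleu (a sketch; $REF_FILE stands in for your tokenized reference file):

perl $MOSESDECODER/scripts/generic/multi-bleu.perl $REF_FILE < $TRAIN_DIR/$TRANS-$FLAGS.translation.$TARGET_LANG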

Notes from the fairseq paper
