To verify that single_pass is working, I trained on a small dataset of only ten sentences, but training ran for nearly 3000 steps without stopping. Here is my configuration file:
save_data: wmt17_en_de1/data
overwrite: True
src_vocab: ../source.vocab
tgt_vocab: ../target.vocab
src_vocab_size: 36000
tgt_vocab_size: 36000
vocab_size_multiple: 8
n_sample: 0
valid_steps: 100
src_seq_length: 96
tgt_seq_length: 96
data:
    corpus_1:
        path_src: ../src.txt.subword.train
        path_tgt: ../tgt.txt.subword.train
train_steps: 100
single_pass: true
average_decay: 0
seed: 1
num_workers: 0
world_size: 1
gpu_ranks: [0]
batch_type: "tokens"
batch_size: 80
valid_batch_size: 80
model_dtype: "fp16"
optim: "adam"
learning_rate: 2
warmup_steps: 4000
decay_method: "noam"
adam_beta2: 0.998
max_grad_norm: 0
label_smoothing: 0.1
param_init: 0
param_init_glorot: true
normalization: "tokens"
bucket_size: 1
encoder_type: transformer
decoder_type: transformer
enc_layers: 6
dec_layers: 6
heads: 8
hidden_size: 512
word_vec_size: 512
transformer_ff: 1024
dropout_steps: [0]
dropout: [0.1]
attention_dropout: [0.1]
position_encoding: true
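I launch training with the usual onmt_train -config config.yaml command (config.yaml is just what I named the file above). For reference, this is the stopping behaviour I expected from single_pass combined with train_steps: 100. It is a minimal Python sketch under my own assumptions, with made-up placeholder stubs (make_batches, train_one_step), not OpenNMT-py's actual trainer code:

```python
# A minimal sketch of the stopping behaviour I expected from single_pass.
# This is NOT OpenNMT-py's actual trainer: make_batches and train_one_step
# are placeholder stubs I made up for illustration.

def make_batches(corpus, batch_size=2):
    """Yield fixed-size batches, one pass over the corpus."""
    for i in range(0, len(corpus), batch_size):
        yield corpus[i:i + batch_size]

def train_one_step(batch):
    """Stand-in for the forward/backward/optimizer update."""
    pass

def train(corpus, train_steps=100, single_pass=True):
    """Stop at train_steps, or after one pass over the data if single_pass."""
    step = 0
    while step < train_steps:
        for batch in make_batches(corpus):   # one full pass (one "epoch")
            train_one_step(batch)
            step += 1
            if step >= train_steps:
                return step
        if single_pass:
            return step   # expected: ten sentences -> a handful of steps

# Ten "sentences": I expected training to stop after one pass (5 steps here),
# and certainly no later than train_steps=100.
print(train(corpus=list(range(10))))   # prints 5
```

With this ten-sentence corpus I expected training to end after one pass, as in the sketch above, so I do not understand why it keeps running to nearly 3000 steps.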