Has anyone successfully fine-tuned the NLLB-200 3.3B model on an RTX 4090? When I try to fine-tune it, training crashes with the following error:
[2023-08-02 21:46:11,717 INFO] Updating vocabulary embeddings with checkpoint embeddings
[2023-08-02 21:46:13,824 INFO] src: 1078 new tokens
[2023-08-02 21:46:18,171 INFO] tgt: 1078 new tokens
Traceback (most recent call last):
File "/workspace/OpenNMT-py/train.py", line 6, in <module>
main()
File "/workspace/OpenNMT-py/onmt/bin/train.py", line 67, in main
train(opt)
File "/workspace/OpenNMT-py/onmt/bin/train.py", line 52, in train
train_process(opt, device_id=0)
File "/workspace/OpenNMT-py/onmt/train_single.py", line 165, in main
model = build_model(model_opt, opt, vocabs, checkpoint)
File "/workspace/OpenNMT-py/onmt/model_builder.py", line 412, in build_model
model.load_state_dict(
File "/workspace/OpenNMT-py/onmt/models/model.py", line 142, in load_state_dict
raise ValueError(
ValueError: Extra keys in model state_dict do not match the model config dict_keys(['encoder.embeddings.make_embedding.pe.pe', 'encoder.transformer.0.self_attn.linear_keys.bias', 'encoder.transformer.0.self_attn.linear_values.bias', 'encoder.transformer.0.self_attn.linear_query.bias', 'encoder.transformer.0.self_attn.final_linear.bias', 'encoder.transformer.0.feed_forward.w_1.bias', 'encoder.transformer.0.feed_forward.w_2.bias', 'encoder.transformer.1.self_attn.linear_keys.bias', 'encoder.transformer.1.self_attn.linear_values.bias', 'encoder.transformer.1.self_attn.linear_query.bias', 'encoder.transformer.1.self_attn.final_linear.bias', 'encoder.transformer.1.feed_forward.w_1.bias', 'encoder.transformer.1.feed_forward.w_2.bias', 'encoder.transformer.2.self_attn.linear_keys.bias', 'encoder.transformer.2.self_attn.linear_values.bias', 'encoder.transformer.2.self_attn.linear_query.bias', 'encoder.transformer.2.self_attn.final_linear.bias', 'encoder.transformer.2.feed_forward.w_1.bias', 'encoder.transformer.2.feed_forward.w_2.bias', 'encoder.transformer.3.self_attn.linear_keys.bias', 'encoder.transformer.3.self_attn.linear_values.bias', 'encoder.transformer.3.self_attn.linear_query.bias', 'encoder.transformer.3.self_attn.final_linear.bias', 'encoder.transformer.3.feed_forward.w_1.bias', 'encoder.transformer.3.feed_forward.w_2.bias', 'encoder.transformer.4.self_attn.linear_keys.bias', 'encoder.transformer.4.self_attn.linear_values.bias', 'encoder.transformer.4.self_attn.linear_query.bias', 'encoder.transformer.4.self_attn.final_linear.bias', 'encoder.transformer.4.feed_forward.w_1.bias', 'encoder.transformer.4.feed_forward.w_2.bias', 'encoder.transformer.5.self_attn.linear_keys.bias', 'encoder.transformer.5.self_attn.linear_values.bias', 'encoder.transformer.5.self_attn.linear_query.bias', 'encoder.transformer.5.self_attn.final_linear.bias', 'encoder.transformer.5.feed_forward.w_1.bias', 'encoder.transformer.5.feed_forward.w_2.bias', 
'encoder.transformer.6.self_attn.linear_keys.bias', 'encoder.transformer.6.self_attn.linear_values.bias', 'encoder.transformer.6.self_attn.linear_query.bias', 'encoder.transformer.6.self_attn.final_linear.bias', 'encoder.transformer.6.feed_forward.w_1.bias', 'encoder.transformer.6.feed_forward.w_2.bias', 'encoder.transformer.7.self_attn.linear_keys.bias', 'encoder.transformer.7.self_attn.linear_values.bias', 'encoder.transformer.7.self_attn.linear_query.bias', 'encoder.transformer.7.self_attn.final_linear.bias', 'encoder.transformer.7.feed_forward.w_1.bias', 'encoder.transformer.7.feed_forward.w_2.bias', 'encoder.transformer.8.self_attn.linear_keys.bias', 'encoder.transformer.8.self_attn.linear_values.bias', 'encoder.transformer.8.self_attn.linear_query.bias', 'encoder.transformer.8.self_attn.final_linear.bias', 'encoder.transformer.8.feed_forward.w_1.bias', 'encoder.transformer.8.feed_forward.w_2.bias', 'encoder.transformer.9.self_attn.linear_keys.bias', 'encoder.transformer.9.self_attn.linear_values.bias', 'encoder.transformer.9.self_attn.linear_query.bias', 'encoder.transformer.9.self_attn.final_linear.bias', 'encoder.transformer.9.feed_forward.w_1.bias', 'encoder.transformer.9.feed_forward.w_2.bias', 'encoder.transformer.10.self_attn.linear_keys.bias', 'encoder.transformer.10.self_attn.linear_values.bias', 'encoder.transformer.10.self_attn.linear_query.bias', 'encoder.transformer.10.self_attn.final_linear.bias', 'encoder.transformer.10.feed_forward.w_1.bias', 'encoder.transformer.10.feed_forward.w_2.bias', 'encoder.transformer.11.self_attn.linear_keys.bias', 'encoder.transformer.11.self_attn.linear_values.bias', 'encoder.transformer.11.self_attn.linear_query.bias', 'encoder.transformer.11.self_attn.final_linear.bias', 'encoder.transformer.11.feed_forward.w_1.bias', 'encoder.transformer.11.feed_forward.w_2.bias', 'encoder.transformer.12.self_attn.linear_keys.bias', 'encoder.transformer.12.self_attn.linear_values.bias', 
'encoder.transformer.12.self_attn.linear_query.bias', 'encoder.transformer.12.self_attn.final_linear.bias', 'encoder.transformer.12.feed_forward.w_1.bias', 'encoder.transformer.12.feed_forward.w_2.bias', 'encoder.transformer.13.self_attn.linear_keys.bias', 'encoder.transformer.13.self_attn.linear_values.bias', 'encoder.transformer.13.self_attn.linear_query.bias', 'encoder.transformer.13.self_attn.final_linear.bias', 'encoder.transformer.13.feed_forward.w_1.bias', 'encoder.transformer.13.feed_forward.w_2.bias', 'encoder.transformer.14.self_attn.linear_keys.bias', 'encoder.transformer.14.self_attn.linear_values.bias', 'encoder.transformer.14.self_attn.linear_query.bias', 'encoder.transformer.14.self_attn.final_linear.bias', 'encoder.transformer.14.feed_forward.w_1.bias', 'encoder.transformer.14.feed_forward.w_2.bias', 'encoder.transformer.15.self_attn.linear_keys.bias', 'encoder.transformer.15.self_attn.linear_values.bias', 'encoder.transformer.15.self_attn.linear_query.bias', 'encoder.transformer.15.self_attn.final_linear.bias', 'encoder.transformer.15.feed_forward.w_1.bias', 'encoder.transformer.15.feed_forward.w_2.bias', 'encoder.transformer.16.self_attn.linear_keys.bias', 'encoder.transformer.16.self_attn.linear_values.bias', 'encoder.transformer.16.self_attn.linear_query.bias', 'encoder.transformer.16.self_attn.final_linear.bias', 'encoder.transformer.16.feed_forward.w_1.bias', 'encoder.transformer.16.feed_forward.w_2.bias', 'encoder.transformer.17.self_attn.linear_keys.bias', 'encoder.transformer.17.self_attn.linear_values.bias', 'encoder.transformer.17.self_attn.linear_query.bias', 'encoder.transformer.17.self_attn.final_linear.bias', 'encoder.transformer.17.feed_forward.w_1.bias', 'encoder.transformer.17.feed_forward.w_2.bias', 'encoder.transformer.18.self_attn.linear_keys.bias', 'encoder.transformer.18.self_attn.linear_values.bias', 'encoder.transformer.18.self_attn.linear_query.bias', 'encoder.transformer.18.self_attn.final_linear.bias', 
'encoder.transformer.18.feed_forward.w_1.bias', 'encoder.transformer.18.feed_forward.w_2.bias', 'encoder.transformer.19.self_attn.linear_keys.bias', 'encoder.transformer.19.self_attn.linear_values.bias', 'encoder.transformer.19.self_attn.linear_query.bias', 'encoder.transformer.19.self_attn.final_linear.bias', 'encoder.transformer.19.feed_forward.w_1.bias', 'encoder.transformer.19.feed_forward.w_2.bias', 'encoder.transformer.20.self_attn.linear_keys.bias', 'encoder.transformer.20.self_attn.linear_values.bias', 'encoder.transformer.20.self_attn.linear_query.bias', 'encoder.transformer.20.self_attn.final_linear.bias', 'encoder.transformer.20.feed_forward.w_1.bias', 'encoder.transformer.20.feed_forward.w_2.bias', 'encoder.transformer.21.self_attn.linear_keys.bias', 'encoder.transformer.21.self_attn.linear_values.bias', 'encoder.transformer.21.self_attn.linear_query.bias', 'encoder.transformer.21.self_attn.final_linear.bias', 'encoder.transformer.21.feed_forward.w_1.bias', 'encoder.transformer.21.feed_forward.w_2.bias', 'encoder.transformer.22.self_attn.linear_keys.bias', 'encoder.transformer.22.self_attn.linear_values.bias', 'encoder.transformer.22.self_attn.linear_query.bias', 'encoder.transformer.22.self_attn.final_linear.bias', 'encoder.transformer.22.feed_forward.w_1.bias', 'encoder.transformer.22.feed_forward.w_2.bias', 'encoder.transformer.23.self_attn.linear_keys.bias', 'encoder.transformer.23.self_attn.linear_values.bias', 'encoder.transformer.23.self_attn.linear_query.bias', 'encoder.transformer.23.self_attn.final_linear.bias', 'encoder.transformer.23.feed_forward.w_1.bias', 'encoder.transformer.23.feed_forward.w_2.bias', 'decoder.embeddings.make_embedding.pe.pe', 'decoder.transformer_layers.0.self_attn.linear_keys.bias', 'decoder.transformer_layers.0.self_attn.linear_values.bias', 'decoder.transformer_layers.0.self_attn.linear_query.bias', 'decoder.transformer_layers.0.self_attn.final_linear.bias', 
'decoder.transformer_layers.0.context_attn.linear_keys.bias', 'decoder.transformer_layers.0.context_attn.linear_values.bias', 'decoder.transformer_layers.0.context_attn.linear_query.bias', 'decoder.transformer_layers.0.context_attn.final_linear.bias', 'decoder.transformer_layers.0.feed_forward.w_1.bias', 'decoder.transformer_layers.0.feed_forward.w_2.bias', 'decoder.transformer_layers.1.self_attn.linear_keys.bias', 'decoder.transformer_layers.1.self_attn.linear_values.bias', 'decoder.transformer_layers.1.self_attn.linear_query.bias', 'decoder.transformer_layers.1.self_attn.final_linear.bias', 'decoder.transformer_layers.1.context_attn.linear_keys.bias', 'decoder.transformer_layers.1.context_attn.linear_values.bias', 'decoder.transformer_layers.1.context_attn.linear_query.bias', 'decoder.transformer_layers.1.context_attn.final_linear.bias', 'decoder.transformer_layers.1.feed_forward.w_1.bias', 'decoder.transformer_layers.1.feed_forward.w_2.bias', 'decoder.transformer_layers.2.self_attn.linear_keys.bias', 'decoder.transformer_layers.2.self_attn.linear_values.bias', 'decoder.transformer_layers.2.self_attn.linear_query.bias', 'decoder.transformer_layers.2.self_attn.final_linear.bias', 'decoder.transformer_layers.2.context_attn.linear_keys.bias', 'decoder.transformer_layers.2.context_attn.linear_values.bias', 'decoder.transformer_layers.2.context_attn.linear_query.bias', 'decoder.transformer_layers.2.context_attn.final_linear.bias', 'decoder.transformer_layers.2.feed_forward.w_1.bias', 'decoder.transformer_layers.2.feed_forward.w_2.bias', 'decoder.transformer_layers.3.self_attn.linear_keys.bias', 'decoder.transformer_layers.3.self_attn.linear_values.bias', 'decoder.transformer_layers.3.self_attn.linear_query.bias', 'decoder.transformer_layers.3.self_attn.final_linear.bias', 'decoder.transformer_layers.3.context_attn.linear_keys.bias', 'decoder.transformer_layers.3.context_attn.linear_values.bias', 'decoder.transformer_layers.3.context_attn.linear_query.bias', 
'decoder.transformer_layers.3.context_attn.final_linear.bias', 'decoder.transformer_layers.3.feed_forward.w_1.bias', 'decoder.transformer_layers.3.feed_forward.w_2.bias', 'decoder.transformer_layers.4.self_attn.linear_keys.bias', 'decoder.transformer_layers.4.self_attn.linear_values.bias', 'decoder.transformer_layers.4.self_attn.linear_query.bias', 'decoder.transformer_layers.4.self_attn.final_linear.bias', 'decoder.transformer_layers.4.context_attn.linear_keys.bias', 'decoder.transformer_layers.4.context_attn.linear_values.bias', 'decoder.transformer_layers.4.context_attn.linear_query.bias', 'decoder.transformer_layers.4.context_attn.final_linear.bias', 'decoder.transformer_layers.4.feed_forward.w_1.bias', 'decoder.transformer_layers.4.feed_forward.w_2.bias', 'decoder.transformer_layers.5.self_attn.linear_keys.bias', 'decoder.transformer_layers.5.self_attn.linear_values.bias', 'decoder.transformer_layers.5.self_attn.linear_query.bias', 'decoder.transformer_layers.5.self_attn.final_linear.bias', 'decoder.transformer_layers.5.context_attn.linear_keys.bias', 'decoder.transformer_layers.5.context_attn.linear_values.bias', 'decoder.transformer_layers.5.context_attn.linear_query.bias', 'decoder.transformer_layers.5.context_attn.final_linear.bias', 'decoder.transformer_layers.5.feed_forward.w_1.bias', 'decoder.transformer_layers.5.feed_forward.w_2.bias', 'decoder.transformer_layers.6.self_attn.linear_keys.bias', 'decoder.transformer_layers.6.self_attn.linear_values.bias', 'decoder.transformer_layers.6.self_attn.linear_query.bias', 'decoder.transformer_layers.6.self_attn.final_linear.bias', 'decoder.transformer_layers.6.context_attn.linear_keys.bias', 'decoder.transformer_layers.6.context_attn.linear_values.bias', 'decoder.transformer_layers.6.context_attn.linear_query.bias', 'decoder.transformer_layers.6.context_attn.final_linear.bias', 'decoder.transformer_layers.6.feed_forward.w_1.bias', 'decoder.transformer_layers.6.feed_forward.w_2.bias', 
'decoder.transformer_layers.7.self_attn.linear_keys.bias', 'decoder.transformer_layers.7.self_attn.linear_values.bias', 'decoder.transformer_layers.7.self_attn.linear_query.bias', 'decoder.transformer_layers.7.self_attn.final_linear.bias', 'decoder.transformer_layers.7.context_attn.linear_keys.bias', 'decoder.transformer_layers.7.context_attn.linear_values.bias', 'decoder.transformer_layers.7.context_attn.linear_query.bias', 'decoder.transformer_layers.7.context_attn.final_linear.bias', 'decoder.transformer_layers.7.feed_forward.w_1.bias', 'decoder.transformer_layers.7.feed_forward.w_2.bias', 'decoder.transformer_layers.8.self_attn.linear_keys.bias', 'decoder.transformer_layers.8.self_attn.linear_values.bias', 'decoder.transformer_layers.8.self_attn.linear_query.bias', 'decoder.transformer_layers.8.self_attn.final_linear.bias', 'decoder.transformer_layers.8.context_attn.linear_keys.bias', 'decoder.transformer_layers.8.context_attn.linear_values.bias', 'decoder.transformer_layers.8.context_attn.linear_query.bias', 'decoder.transformer_layers.8.context_attn.final_linear.bias', 'decoder.transformer_layers.8.feed_forward.w_1.bias', 'decoder.transformer_layers.8.feed_forward.w_2.bias', 'decoder.transformer_layers.9.self_attn.linear_keys.bias', 'decoder.transformer_layers.9.self_attn.linear_values.bias', 'decoder.transformer_layers.9.self_attn.linear_query.bias', 'decoder.transformer_layers.9.self_attn.final_linear.bias', 'decoder.transformer_layers.9.context_attn.linear_keys.bias', 'decoder.transformer_layers.9.context_attn.linear_values.bias', 'decoder.transformer_layers.9.context_attn.linear_query.bias', 'decoder.transformer_layers.9.context_attn.final_linear.bias', 'decoder.transformer_layers.9.feed_forward.w_1.bias', 'decoder.transformer_layers.9.feed_forward.w_2.bias', 'decoder.transformer_layers.10.self_attn.linear_keys.bias', 'decoder.transformer_layers.10.self_attn.linear_values.bias', 'decoder.transformer_layers.10.self_attn.linear_query.bias', 
'decoder.transformer_layers.10.self_attn.final_linear.bias', 'decoder.transformer_layers.10.context_attn.linear_keys.bias', 'decoder.transformer_layers.10.context_attn.linear_values.bias', 'decoder.transformer_layers.10.context_attn.linear_query.bias', 'decoder.transformer_layers.10.context_attn.final_linear.bias', 'decoder.transformer_layers.10.feed_forward.w_1.bias', 'decoder.transformer_layers.10.feed_forward.w_2.bias', 'decoder.transformer_layers.11.self_attn.linear_keys.bias', 'decoder.transformer_layers.11.self_attn.linear_values.bias', 'decoder.transformer_layers.11.self_attn.linear_query.bias', 'decoder.transformer_layers.11.self_attn.final_linear.bias', 'decoder.transformer_layers.11.context_attn.linear_keys.bias', 'decoder.transformer_layers.11.context_attn.linear_values.bias', 'decoder.transformer_layers.11.context_attn.linear_query.bias', 'decoder.transformer_layers.11.context_attn.final_linear.bias', 'decoder.transformer_layers.11.feed_forward.w_1.bias', 'decoder.transformer_layers.11.feed_forward.w_2.bias', 'decoder.transformer_layers.12.self_attn.linear_keys.bias', 'decoder.transformer_layers.12.self_attn.linear_values.bias', 'decoder.transformer_layers.12.self_attn.linear_query.bias', 'decoder.transformer_layers.12.self_attn.final_linear.bias', 'decoder.transformer_layers.12.context_attn.linear_keys.bias', 'decoder.transformer_layers.12.context_attn.linear_values.bias', 'decoder.transformer_layers.12.context_attn.linear_query.bias', 'decoder.transformer_layers.12.context_attn.final_linear.bias', 'decoder.transformer_layers.12.feed_forward.w_1.bias', 'decoder.transformer_layers.12.feed_forward.w_2.bias', 'decoder.transformer_layers.13.self_attn.linear_keys.bias', 'decoder.transformer_layers.13.self_attn.linear_values.bias', 'decoder.transformer_layers.13.self_attn.linear_query.bias', 'decoder.transformer_layers.13.self_attn.final_linear.bias', 'decoder.transformer_layers.13.context_attn.linear_keys.bias', 
'decoder.transformer_layers.13.context_attn.linear_values.bias', 'decoder.transformer_layers.13.context_attn.linear_query.bias', 'decoder.transformer_layers.13.context_attn.final_linear.bias', 'decoder.transformer_layers.13.feed_forward.w_1.bias', 'decoder.transformer_layers.13.feed_forward.w_2.bias', 'decoder.transformer_layers.14.self_attn.linear_keys.bias', 'decoder.transformer_layers.14.self_attn.linear_values.bias', 'decoder.transformer_layers.14.self_attn.linear_query.bias', 'decoder.transformer_layers.14.self_attn.final_linear.bias', 'decoder.transformer_layers.14.context_attn.linear_keys.bias', 'decoder.transformer_layers.14.context_attn.linear_values.bias', 'decoder.transformer_layers.14.context_attn.linear_query.bias', 'decoder.transformer_layers.14.context_attn.final_linear.bias', 'decoder.transformer_layers.14.feed_forward.w_1.bias', 'decoder.transformer_layers.14.feed_forward.w_2.bias', 'decoder.transformer_layers.15.self_attn.linear_keys.bias', 'decoder.transformer_layers.15.self_attn.linear_values.bias', 'decoder.transformer_layers.15.self_attn.linear_query.bias', 'decoder.transformer_layers.15.self_attn.final_linear.bias', 'decoder.transformer_layers.15.context_attn.linear_keys.bias', 'decoder.transformer_layers.15.context_attn.linear_values.bias', 'decoder.transformer_layers.15.context_attn.linear_query.bias', 'decoder.transformer_layers.15.context_attn.final_linear.bias', 'decoder.transformer_layers.15.feed_forward.w_1.bias', 'decoder.transformer_layers.15.feed_forward.w_2.bias', 'decoder.transformer_layers.16.self_attn.linear_keys.bias', 'decoder.transformer_layers.16.self_attn.linear_values.bias', 'decoder.transformer_layers.16.self_attn.linear_query.bias', 'decoder.transformer_layers.16.self_attn.final_linear.bias', 'decoder.transformer_layers.16.context_attn.linear_keys.bias', 'decoder.transformer_layers.16.context_attn.linear_values.bias', 'decoder.transformer_layers.16.context_attn.linear_query.bias', 
'decoder.transformer_layers.16.context_attn.final_linear.bias', 'decoder.transformer_layers.16.feed_forward.w_1.bias', 'decoder.transformer_layers.16.feed_forward.w_2.bias', 'decoder.transformer_layers.17.self_attn.linear_keys.bias', 'decoder.transformer_layers.17.self_attn.linear_values.bias', 'decoder.transformer_layers.17.self_attn.linear_query.bias', 'decoder.transformer_layers.17.self_attn.final_linear.bias', 'decoder.transformer_layers.17.context_attn.linear_keys.bias', 'decoder.transformer_layers.17.context_attn.linear_values.bias', 'decoder.transformer_layers.17.context_attn.linear_query.bias', 'decoder.transformer_layers.17.context_attn.final_linear.bias', 'decoder.transformer_layers.17.feed_forward.w_1.bias', 'decoder.transformer_layers.17.feed_forward.w_2.bias', 'decoder.transformer_layers.18.self_attn.linear_keys.bias', 'decoder.transformer_layers.18.self_attn.linear_values.bias', 'decoder.transformer_layers.18.self_attn.linear_query.bias', 'decoder.transformer_layers.18.self_attn.final_linear.bias', 'decoder.transformer_layers.18.context_attn.linear_keys.bias', 'decoder.transformer_layers.18.context_attn.linear_values.bias', 'decoder.transformer_layers.18.context_attn.linear_query.bias', 'decoder.transformer_layers.18.context_attn.final_linear.bias', 'decoder.transformer_layers.18.feed_forward.w_1.bias', 'decoder.transformer_layers.18.feed_forward.w_2.bias', 'decoder.transformer_layers.19.self_attn.linear_keys.bias', 'decoder.transformer_layers.19.self_attn.linear_values.bias', 'decoder.transformer_layers.19.self_attn.linear_query.bias', 'decoder.transformer_layers.19.self_attn.final_linear.bias', 'decoder.transformer_layers.19.context_attn.linear_keys.bias', 'decoder.transformer_layers.19.context_attn.linear_values.bias', 'decoder.transformer_layers.19.context_attn.linear_query.bias', 'decoder.transformer_layers.19.context_attn.final_linear.bias', 'decoder.transformer_layers.19.feed_forward.w_1.bias', 
'decoder.transformer_layers.19.feed_forward.w_2.bias', 'decoder.transformer_layers.20.self_attn.linear_keys.bias', 'decoder.transformer_layers.20.self_attn.linear_values.bias', 'decoder.transformer_layers.20.self_attn.linear_query.bias', 'decoder.transformer_layers.20.self_attn.final_linear.bias', 'decoder.transformer_layers.20.context_attn.linear_keys.bias', 'decoder.transformer_layers.20.context_attn.linear_values.bias', 'decoder.transformer_layers.20.context_attn.linear_query.bias', 'decoder.transformer_layers.20.context_attn.final_linear.bias', 'decoder.transformer_layers.20.feed_forward.w_1.bias', 'decoder.transformer_layers.20.feed_forward.w_2.bias', 'decoder.transformer_layers.21.self_attn.linear_keys.bias', 'decoder.transformer_layers.21.self_attn.linear_values.bias', 'decoder.transformer_layers.21.self_attn.linear_query.bias', 'decoder.transformer_layers.21.self_attn.final_linear.bias', 'decoder.transformer_layers.21.context_attn.linear_keys.bias', 'decoder.transformer_layers.21.context_attn.linear_values.bias', 'decoder.transformer_layers.21.context_attn.linear_query.bias', 'decoder.transformer_layers.21.context_attn.final_linear.bias', 'decoder.transformer_layers.21.feed_forward.w_1.bias', 'decoder.transformer_layers.21.feed_forward.w_2.bias', 'decoder.transformer_layers.22.self_attn.linear_keys.bias', 'decoder.transformer_layers.22.self_attn.linear_values.bias', 'decoder.transformer_layers.22.self_attn.linear_query.bias', 'decoder.transformer_layers.22.self_attn.final_linear.bias', 'decoder.transformer_layers.22.context_attn.linear_keys.bias', 'decoder.transformer_layers.22.context_attn.linear_values.bias', 'decoder.transformer_layers.22.context_attn.linear_query.bias', 'decoder.transformer_layers.22.context_attn.final_linear.bias', 'decoder.transformer_layers.22.feed_forward.w_1.bias', 'decoder.transformer_layers.22.feed_forward.w_2.bias', 'decoder.transformer_layers.23.self_attn.linear_keys.bias', 
'decoder.transformer_layers.23.self_attn.linear_values.bias', 'decoder.transformer_layers.23.self_attn.linear_query.bias', 'decoder.transformer_layers.23.self_attn.final_linear.bias', 'decoder.transformer_layers.23.context_attn.linear_keys.bias', 'decoder.transformer_layers.23.context_attn.linear_values.bias', 'decoder.transformer_layers.23.context_attn.linear_query.bias', 'decoder.transformer_layers.23.context_attn.final_linear.bias', 'decoder.transformer_layers.23.feed_forward.w_1.bias', 'decoder.transformer_layers.23.feed_forward.w_2.bias'])
My training config is:
share_vocab: true
src_vocab: "./nllb-200/dictionary2.txt"
src_words_min_frequency: 1
src_vocab_size: 257284
tgt_vocab: "./nllb-200/dictionary2.txt"
tgt_words_min_frequency: 1
tgt_vocab_size: 257284
vocab_size_multiple: 1
decoder_start_token: '</s>'
#### Subword
src_subword_model: "./nllb-200/flores200_sacrebleu_tokenizer_spm2.model"
tgt_subword_model: "./nllb-200/flores200_sacrebleu_tokenizer_spm2.model"
src_subword_nbest: 1
src_subword_alpha: 0.0
tgt_subword_nbest: 1
tgt_subword_alpha: 0.0
# Corpus opts:
data:
    corpus_1:
        path_src: "./nllb-200/dataset.tl"
        path_tgt: "./nllb-200/dataset.zh"
        transforms: [sentencepiece, prefix, suffix, filtertoolong]
        weight: 10
        src_prefix: "tgl_Latn"
        tgt_prefix: "zho_Hans"
        src_suffix: "</s>"
        tgt_suffix: ""
update_vocab: true
train_from: "./nllb-200/nllb-200-3.3B-onmt.pt"
reset_optim: all
save_data: "nllb-200"
save_model: "./nllb-200/nllb-200-3.3B-onmt.pt"
log_file: "./nllb-200/nllb-200-3.3B-onmt.log"
keep_checkpoint: 100
save_checkpoint_steps: 4000
average_decay: 0.0005
seed: 1234
report_every: 10
train_steps: 4000
valid_steps: 100
# Batching
bucket_size: 262144
num_workers: 4
prefetch_factor: 400
world_size: 1
gpu_ranks: [0]
batch_type: "tokens"
batch_size: 512
valid_batch_size: 384
batch_size_multiple: 1
accum_count: [32, 32, 32]
accum_steps: [0, 15000, 30000]
# Optimization
model_dtype: "fp16"
optim: "fusedadam"
learning_rate: 30
warmup_steps: 100
decay_method: "noam"
adam_beta2: 0.98
max_grad_norm: 0
label_smoothing: 0.1
param_init: 0
param_init_glorot: true
normalization: "tokens"
# Model
override_opts: true
encoder_type: transformer
decoder_type: transformer
enc_layers: 24
dec_layers: 24
heads: 16
hidden_size: 2048
word_vec_size: 2048
transformer_ff: 8192
dropout_steps: [0, 15000, 30000]
dropout: [0.1, 0.1, 0.1]
attention_dropout: [0.1, 0.1, 0.1]
share_decoder_embeddings: true
share_embeddings: true
position_encoding: true
position_encoding_type: 'SinusoidalConcat'
# LoRA
lora_layers: ['linear_values', 'linear_query', 'linear_keys', 'final_linear']
lora_rank: 4
lora_dropout: 0.0
lora_alpha: 1
lora_embedding: false