Hello, I’m currently trying to train an OpenNMT-tf TransformerBigSharedEmbeddings model using pretrained fastText embeddings specified in data.yaml.
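For reference, the embeddings are declared roughly like this (a simplified sketch of my config; the paths are placeholders, not my actual files):

```yaml
data:
  # Same pretrained fastText file on both sides, since the model shares embeddings.
  source_embedding:
    path: embeddings/fasttext.300.vec   # placeholder path to a 300-dim .vec file
    with_header: True                   # fastText .vec files start with a count/dim header line
    trainable: True
  target_embedding:
    path: embeddings/fasttext.300.vec
    with_header: True
    trainable: True
```

However, as soon as training starts I get this shape mismatch error: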
Traceback (most recent call last):
File "/anaconda/envs/py38_default/bin/onmt-main", line 8, in <module>
sys.exit(main())
File "/anaconda/envs/py38_default/lib/python3.8/site-packages/opennmt/bin/main.py", line 325, in main
runner.train(
File "/anaconda/envs/py38_default/lib/python3.8/site-packages/opennmt/runner.py", line 310, in train
summary = trainer(
File "/anaconda/envs/py38_default/lib/python3.8/site-packages/opennmt/training.py", line 111, in __call__
for i, loss in enumerate(self._steps(dataset, accum_steps=accum_steps)):
File "/anaconda/envs/py38_default/lib/python3.8/site-packages/opennmt/training.py", line 229, in _steps
accumulate_gradients(batch)
File "/anaconda/envs/py38_default/lib/python3.8/site-packages/tensorflow/python/util/traceback_utils.py", line 153, in error_handler
raise e.with_traceback(filtered_tb) from None
File "/tmp/__autograph_generated_file7ox7vydi.py", line 10, in tf___accumulate_gradients
(loss, gradients, sample_size) = ag__.converted_call(ag__.ld(self)._model.compute_gradients, (ag__.ld(features), ag__.ld(labels), ag__.ld(self)._optimizer), dict(normalize_loss=False), fscope)
File "/tmp/__autograph_generated_filegy0hlz4u.py", line 12, in tf__compute_gradients
(loss, sample_size) = ag__.converted_call(ag__.ld(self).compute_training_loss, (ag__.ld(features), ag__.ld(labels)), dict(step=ag__.ld(optimizer).iterations), fscope)
File "/tmp/__autograph_generated_filegtq_9qum.py", line 11, in tf__compute_training_loss
(outputs, _) = ag__.converted_call(ag__.ld(self), (ag__.ld(features), ag__.ld(labels)), dict(training=True, step=ag__.ld(step)), fscope)
File "/tmp/__autograph_generated_file4e7htg6v.py", line 29, in tf____call__
(outputs, predictions) = ag__.converted_call(ag__.ld(call_method), (ag__.ld(features), ag__.ld(labels), ag__.ld(training), ag__.ld(step)), None, fscope)
File "/tmp/__autograph_generated_filex3dquofg.py", line 13, in tf___forward
retval_ = ag__.converted_call(ag__.converted_call(ag__.ld(super), (), None, fscope).__call__, (ag__.ld(features),), dict(labels=ag__.ld(labels), training=ag__.ld(training), step=ag__.ld(step)), fscope)
File "/anaconda/envs/py38_default/lib/python3.8/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler
raise e.with_traceback(filtered_tb) from None
File "/tmp/__autograph_generated_filexg1vx_8o.py", line 30, in tf__call
ag__.if_stmt((ag__.ld(labels) is not None), if_body, else_body, get_state, set_state, ('outputs',), 1)
File "/tmp/__autograph_generated_filexg1vx_8o.py", line 25, in if_body
outputs = ag__.converted_call(ag__.ld(self)._decode_target, (ag__.ld(labels), ag__.ld(encoder_outputs), ag__.ld(encoder_state), ag__.ld(encoder_sequence_length)), dict(step=ag__.ld(step), training=ag__.ld(training)), fscope)
File "/tmp/__autograph_generated_file6wj9puor.py", line 31, in tf___decode_target
(logits, _, attention) = ag__.converted_call(ag__.ld(self).decoder, (ag__.ld(target_inputs), ag__.converted_call(ag__.ld(self).labels_inputter.get_length, (ag__.ld(labels),), None, fscope)), dict(state=ag__.ld(initial_state), input_fn=ag__.ld(input_fn), sampling_probability=ag__.ld(sampling_probability), training=ag__.ld(training)), fscope)
File "/tmp/__autograph_generated_filealz86rbr.py", line 75, in tf__call
ag__.if_stmt((ag__.ld(rank) == 2), if_body_3, else_body_3, get_state_3, set_state_3, ('attention', 'logits', 'state'), 3)
File "/tmp/__autograph_generated_filealz86rbr.py", line 71, in else_body_3
ag__.if_stmt((ag__.ld(rank) == 3), if_body_2, else_body_2, get_state_2, set_state_2, ('attention', 'logits', 'state'), 3)
File "/tmp/__autograph_generated_filealz86rbr.py", line 64, in if_body_2
(logits, state, attention) = ag__.converted_call(ag__.ld(self).forward, (ag__.ld(inputs),), dict(sequence_length=ag__.ld(length_or_step), initial_state=ag__.ld(state), memory=ag__.ld(self).memory, memory_sequence_length=ag__.ld(self).memory_sequence_length, input_fn=ag__.ld(input_fn), sampling_probability=ag__.ld(sampling_probability), training=ag__.ld(training)), fscope)
File "/tmp/__autograph_generated_filegjh6s44_.py", line 26, in tf__forward
logits = ag__.converted_call(ag__.ld(self).output_layer, (ag__.ld(outputs),), None, fscope)
File "/tmp/__autograph_generated_file9xwjs3rn.py", line 51, in tf__call
ag__.if_stmt(ag__.and_((lambda : (ag__.ld(inputs).dtype is ag__.ld(tf).float16)), (lambda : ((ag__.ld(self).units % 8) != 0))), if_body_1, else_body_1, get_state_1, set_state_1, ('outputs',), 1)
File "/tmp/__autograph_generated_file9xwjs3rn.py", line 46, in else_body_1
outputs = ag__.converted_call(ag__.ld(tf).matmul, (ag__.ld(inputs), ag__.ld(self).kernel), dict(transpose_b=ag__.ld(self).transpose), fscope)
ValueError: in user code:
File "/anaconda/envs/py38_default/lib/python3.8/site-packages/opennmt/training.py", line 290, in _accumulate_gradients *
loss, gradients, sample_size = self._model.compute_gradients(
File "/anaconda/envs/py38_default/lib/python3.8/site-packages/opennmt/models/model.py", line 250, in compute_gradients *
loss, sample_size = self.compute_training_loss(
File "/anaconda/envs/py38_default/lib/python3.8/site-packages/opennmt/models/model.py", line 288, in compute_training_loss *
outputs, _ = self(features, labels, training=True, step=step)
File "/anaconda/envs/py38_default/lib/python3.8/site-packages/opennmt/models/model.py", line 118, in __call__ *
outputs, predictions = call_method(
File "/anaconda/envs/py38_default/lib/python3.8/site-packages/opennmt/models/model.py", line 140, in _forward *
return super().__call__(features, labels=labels, training=training, step=step)
File "/anaconda/envs/py38_default/lib/python3.8/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler **
raise e.with_traceback(filtered_tb) from None
File "/tmp/__autograph_generated_filexg1vx_8o.py", line 30, in tf__call
ag__.if_stmt((ag__.ld(labels) is not None), if_body, else_body, get_state, set_state, ('outputs',), 1)
File "/tmp/__autograph_generated_filexg1vx_8o.py", line 25, in if_body
outputs = ag__.converted_call(ag__.ld(self)._decode_target, (ag__.ld(labels), ag__.ld(encoder_outputs), ag__.ld(encoder_state), ag__.ld(encoder_sequence_length)), dict(step=ag__.ld(step), training=ag__.ld(training)), fscope)
File "/tmp/__autograph_generated_file6wj9puor.py", line 31, in tf___decode_target
(logits, _, attention) = ag__.converted_call(ag__.ld(self).decoder, (ag__.ld(target_inputs), ag__.converted_call(ag__.ld(self).labels_inputter.get_length, (ag__.ld(labels),), None, fscope)), dict(state=ag__.ld(initial_state), input_fn=ag__.ld(input_fn), sampling_probability=ag__.ld(sampling_probability), training=ag__.ld(training)), fscope)
File "/tmp/__autograph_generated_filealz86rbr.py", line 75, in tf__call
ag__.if_stmt((ag__.ld(rank) == 2), if_body_3, else_body_3, get_state_3, set_state_3, ('attention', 'logits', 'state'), 3)
File "/tmp/__autograph_generated_filealz86rbr.py", line 71, in else_body_3
ag__.if_stmt((ag__.ld(rank) == 3), if_body_2, else_body_2, get_state_2, set_state_2, ('attention', 'logits', 'state'), 3)
File "/tmp/__autograph_generated_filealz86rbr.py", line 64, in if_body_2
(logits, state, attention) = ag__.converted_call(ag__.ld(self).forward, (ag__.ld(inputs),), dict(sequence_length=ag__.ld(length_or_step), initial_state=ag__.ld(state), memory=ag__.ld(self).memory, memory_sequence_length=ag__.ld(self).memory_sequence_length, input_fn=ag__.ld(input_fn), sampling_probability=ag__.ld(sampling_probability), training=ag__.ld(training)), fscope)
File "/tmp/__autograph_generated_filegjh6s44_.py", line 26, in tf__forward
logits = ag__.converted_call(ag__.ld(self).output_layer, (ag__.ld(outputs),), None, fscope)
File "/tmp/__autograph_generated_file9xwjs3rn.py", line 51, in tf__call
ag__.if_stmt(ag__.and_((lambda : (ag__.ld(inputs).dtype is ag__.ld(tf).float16)), (lambda : ((ag__.ld(self).units % 8) != 0))), if_body_1, else_body_1, get_state_1, set_state_1, ('outputs',), 1)
File "/tmp/__autograph_generated_file9xwjs3rn.py", line 46, in else_body_1
outputs = ag__.converted_call(ag__.ld(tf).matmul, (ag__.ld(inputs), ag__.ld(self).kernel), dict(transpose_b=ag__.ld(self).transpose), fscope)
ValueError: Exception encountered when calling layer 'transformer_big_shared_embeddings_1' (type TransformerBigSharedEmbeddings).
in user code:
File "/anaconda/envs/py38_default/lib/python3.8/site-packages/opennmt/models/sequence_to_sequence.py", line 178, in call *
outputs = self._decode_target(
File "/anaconda/envs/py38_default/lib/python3.8/site-packages/opennmt/models/sequence_to_sequence.py", line 246, in _decode_target *
logits, _, attention = self.decoder(
File "/anaconda/envs/py38_default/lib/python3.8/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler **
raise e.with_traceback(filtered_tb) from None
File "/tmp/__autograph_generated_filealz86rbr.py", line 75, in tf__call
ag__.if_stmt((ag__.ld(rank) == 2), if_body_3, else_body_3, get_state_3, set_state_3, ('attention', 'logits', 'state'), 3)
File "/tmp/__autograph_generated_filealz86rbr.py", line 71, in else_body_3
ag__.if_stmt((ag__.ld(rank) == 3), if_body_2, else_body_2, get_state_2, set_state_2, ('attention', 'logits', 'state'), 3)
File "/tmp/__autograph_generated_filealz86rbr.py", line 64, in if_body_2
(logits, state, attention) = ag__.converted_call(ag__.ld(self).forward, (ag__.ld(inputs),), dict(sequence_length=ag__.ld(length_or_step), initial_state=ag__.ld(state), memory=ag__.ld(self).memory, memory_sequence_length=ag__.ld(self).memory_sequence_length, input_fn=ag__.ld(input_fn), sampling_probability=ag__.ld(sampling_probability), training=ag__.ld(training)), fscope)
File "/tmp/__autograph_generated_filegjh6s44_.py", line 26, in tf__forward
logits = ag__.converted_call(ag__.ld(self).output_layer, (ag__.ld(outputs),), None, fscope)
File "/tmp/__autograph_generated_file9xwjs3rn.py", line 51, in tf__call
ag__.if_stmt(ag__.and_((lambda : (ag__.ld(inputs).dtype is ag__.ld(tf).float16)), (lambda : ((ag__.ld(self).units % 8) != 0))), if_body_1, else_body_1, get_state_1, set_state_1, ('outputs',), 1)
File "/tmp/__autograph_generated_file9xwjs3rn.py", line 46, in else_body_1
outputs = ag__.converted_call(ag__.ld(tf).matmul, (ag__.ld(inputs), ag__.ld(self).kernel), dict(transpose_b=ag__.ld(self).transpose), fscope)
ValueError: Exception encountered when calling layer 'self_attention_decoder_1' (type SelfAttentionDecoder).
in user code:
File "/anaconda/envs/py38_default/lib/python3.8/site-packages/opennmt/decoders/decoder.py", line 257, in call *
logits, state, attention = self.forward(
File "/anaconda/envs/py38_default/lib/python3.8/site-packages/opennmt/decoders/self_attention_decoder.py", line 199, in forward *
logits = self.output_layer(outputs)
File "/anaconda/envs/py38_default/lib/python3.8/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler **
raise e.with_traceback(filtered_tb) from None
File "/tmp/__autograph_generated_file9xwjs3rn.py", line 51, in tf__call
ag__.if_stmt(ag__.and_((lambda : (ag__.ld(inputs).dtype is ag__.ld(tf).float16)), (lambda : ((ag__.ld(self).units % 8) != 0))), if_body_1, else_body_1, get_state_1, set_state_1, ('outputs',), 1)
File "/tmp/__autograph_generated_file9xwjs3rn.py", line 46, in else_body_1
outputs = ag__.converted_call(ag__.ld(tf).matmul, (ag__.ld(inputs), ag__.ld(self).kernel), dict(transpose_b=ag__.ld(self).transpose), fscope)
ValueError: Exception encountered when calling layer 'dense_192' (type Dense).
in user code:
File "/anaconda/envs/py38_default/lib/python3.8/site-packages/opennmt/layers/common.py", line 75, in call *
outputs = tf.matmul(inputs, self.kernel, transpose_b=self.transpose)
ValueError: Dimensions must be equal, but are 1024 and 300 for '{{node transformer_big_shared_embeddings_1/self_attention_decoder_1/dense_192/MatMul}} = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=true](transformer_big_shared_embeddings_1/self_attention_decoder_1/dense_192/Reshape, transformer_big_shared_embeddings_1/self_attention_decoder_1/dense_192/MatMul/ReadVariableOp)' with input shapes: [?,1024], [41118,300].
Call arguments received by layer 'dense_192' (type Dense):
• inputs=tf.Tensor(shape=(None, None, 1024), dtype=float32)
Call arguments received by layer 'self_attention_decoder_1' (type SelfAttentionDecoder):
• inputs=tf.Tensor(shape=(None, None, 300), dtype=float32)
• length_or_step=tf.Tensor(shape=(None,), dtype=int32)
• state=[{'self_kv': ('tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)', 'tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)'), 'memory_kv': [('tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)', 'tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)')]}, {'self_kv': ('tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)', 'tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)'), 'memory_kv': [('tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)', 'tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)')]}, {'self_kv': ('tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)', 'tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)'), 'memory_kv': [('tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)', 'tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)')]}, {'self_kv': ('tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)', 'tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)'), 'memory_kv': [('tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)', 'tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)')]}, {'self_kv': ('tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)', 'tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)'), 'memory_kv': [('tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)', 'tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)')]}, {'self_kv': ('tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)', 'tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)'), 'memory_kv': [('tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)', 'tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)')]}]
• input_fn=<function outer_factory.<locals>.inner_factory.<locals>.tf___decode_target.<locals>.<lambda> at 0x7f6378c55c10>
• sampling_probability=None
• training=True
Call arguments received by layer 'transformer_big_shared_embeddings_1' (type TransformerBigSharedEmbeddings):
• features={'length': 'tf.Tensor(shape=(None,), dtype=int32)', 'tokens': 'tf.Tensor(shape=(None, None), dtype=string)', 'ids': 'tf.Tensor(shape=(None, None), dtype=int64)'}
• labels={'length': 'tf.Tensor(shape=(None,), dtype=int32)', 'tokens': 'tf.Tensor(shape=(None, None), dtype=string)', 'ids': 'tf.Tensor(shape=(None, None), dtype=int64)', 'ids_out': 'tf.Tensor(shape=(None, None), dtype=int64)'}
• training=True
• step=<tf.Variable 'iter:0' shape=() dtype=int64>
From what I can see, the decoder output projection of the big Transformer (d_model = 1024) is incompatible with the pretrained fastText embeddings (d = 300): because the embeddings are shared, the 41118×300 embedding matrix is reused as the output layer’s kernel, so the decoder’s 1024-dimensional outputs can’t be multiplied against it. Since 300 is the standard dimensionality of fastText embeddings, does this mean there’s no way to use a SharedEmbeddings OpenNMT Transformer model with pretrained embeddings? (I imagine even TransformerBaseSharedEmbeddings would hit the same issue, since there d_model = 512, which still isn’t 300.)
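The only workaround I can think of is to define a custom Transformer whose width matches the embedding dimension and pass it to onmt-main with --model. A rough, untested sketch of what I mean (every hyperparameter besides the 300-unit width is just my guess, not taken from the catalog model):

```python
# custom_transformer_300.py -- untested sketch of a shared-embeddings Transformer
# whose model width matches the 300-dim fastText vectors.
import opennmt


class TransformerShared300(opennmt.models.Transformer):
    def __init__(self):
        super().__init__(
            # With shared embeddings the embedding matrix is also the output
            # projection, so embedding_size and num_units must both be 300.
            source_inputter=opennmt.inputters.WordEmbedder(embedding_size=300),
            target_inputter=opennmt.inputters.WordEmbedder(embedding_size=300),
            num_layers=6,
            num_units=300,
            num_heads=6,          # 300 has to stay divisible by the number of heads
            ffn_inner_dim=1200,
            dropout=0.1,
            attention_dropout=0.1,
            ffn_dropout=0.1,
            share_embeddings=opennmt.models.EmbeddingsSharingLevel.ALL,
        )


model = TransformerShared300
```

If that is the intended route, I assume I’d launch training with something like `onmt-main --model custom_transformer_300.py --config data.yaml --auto_config train`, but I’d rather not give up the catalog model’s 1024-unit width if there’s a better option.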
I’d appreciate support on this, thanks.