Hello, I’m currently trying to train an OpenNMT-tf TransformerBigSharedEmbeddings model using pretrained fastText embeddings specified in data.yaml.
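For reference, the embeddings are declared roughly like this (a simplified sketch of my config; the paths are placeholders, not my actual files):

```yaml
data:
  # Same pretrained fastText file on both sides, since the model shares embeddings.
  source_embedding:
    path: embeddings/fasttext.300.vec   # placeholder path to a 300-dim .vec file
    with_header: True                   # fastText .vec files start with a count/dim header line
    trainable: True
  target_embedding:
    path: embeddings/fasttext.300.vec
    with_header: True
    trainable: True
```

However, as soon as training starts I get this shape mismatch error: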
Traceback (most recent call last):
File "/anaconda/envs/py38_default/bin/onmt-main", line 8, in <module>
sys.exit(main())
File "/anaconda/envs/py38_default/lib/python3.8/site-packages/opennmt/bin/main.py", line 325, in main
runner.train(
File "/anaconda/envs/py38_default/lib/python3.8/site-packages/opennmt/runner.py", line 310, in train
summary = trainer(
File "/anaconda/envs/py38_default/lib/python3.8/site-packages/opennmt/training.py", line 111, in __call__
for i, loss in enumerate(self._steps(dataset, accum_steps=accum_steps)):
File "/anaconda/envs/py38_default/lib/python3.8/site-packages/opennmt/training.py", line 229, in _steps
accumulate_gradients(batch)
File "/anaconda/envs/py38_default/lib/python3.8/site-packages/tensorflow/python/util/traceback_utils.py", line 153, in error_handler
raise e.with_traceback(filtered_tb) from None
File "/tmp/__autograph_generated_file7ox7vydi.py", line 10, in tf___accumulate_gradients
(loss, gradients, sample_size) = ag__.converted_call(ag__.ld(self)._model.compute_gradients, (ag__.ld(features), ag__.ld(labels), ag__.ld(self)._optimizer), dict(normalize_loss=False), fscope)
File "/tmp/__autograph_generated_filegy0hlz4u.py", line 12, in tf__compute_gradients
(loss, sample_size) = ag__.converted_call(ag__.ld(self).compute_training_loss, (ag__.ld(features), ag__.ld(labels)), dict(step=ag__.ld(optimizer).iterations), fscope)
File "/tmp/__autograph_generated_filegtq_9qum.py", line 11, in tf__compute_training_loss
(outputs, _) = ag__.converted_call(ag__.ld(self), (ag__.ld(features), ag__.ld(labels)), dict(training=True, step=ag__.ld(step)), fscope)
File "/tmp/__autograph_generated_file4e7htg6v.py", line 29, in tf____call__
(outputs, predictions) = ag__.converted_call(ag__.ld(call_method), (ag__.ld(features), ag__.ld(labels), ag__.ld(training), ag__.ld(step)), None, fscope)
File "/tmp/__autograph_generated_filex3dquofg.py", line 13, in tf___forward
retval_ = ag__.converted_call(ag__.converted_call(ag__.ld(super), (), None, fscope).__call__, (ag__.ld(features),), dict(labels=ag__.ld(labels), training=ag__.ld(training), step=ag__.ld(step)), fscope)
File "/anaconda/envs/py38_default/lib/python3.8/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler
raise e.with_traceback(filtered_tb) from None
File "/tmp/__autograph_generated_filexg1vx_8o.py", line 30, in tf__call
ag__.if_stmt((ag__.ld(labels) is not None), if_body, else_body, get_state, set_state, ('outputs',), 1)
File "/tmp/__autograph_generated_filexg1vx_8o.py", line 25, in if_body
outputs = ag__.converted_call(ag__.ld(self)._decode_target, (ag__.ld(labels), ag__.ld(encoder_outputs), ag__.ld(encoder_state), ag__.ld(encoder_sequence_length)), dict(step=ag__.ld(step), training=ag__.ld(training)), fscope)
File "/tmp/__autograph_generated_file6wj9puor.py", line 31, in tf___decode_target
(logits, _, attention) = ag__.converted_call(ag__.ld(self).decoder, (ag__.ld(target_inputs), ag__.converted_call(ag__.ld(self).labels_inputter.get_length, (ag__.ld(labels),), None, fscope)), dict(state=ag__.ld(initial_state), input_fn=ag__.ld(input_fn), sampling_probability=ag__.ld(sampling_probability), training=ag__.ld(training)), fscope)
File "/tmp/__autograph_generated_filealz86rbr.py", line 75, in tf__call
ag__.if_stmt((ag__.ld(rank) == 2), if_body_3, else_body_3, get_state_3, set_state_3, ('attention', 'logits', 'state'), 3)
File "/tmp/__autograph_generated_filealz86rbr.py", line 71, in else_body_3
ag__.if_stmt((ag__.ld(rank) == 3), if_body_2, else_body_2, get_state_2, set_state_2, ('attention', 'logits', 'state'), 3)
File "/tmp/__autograph_generated_filealz86rbr.py", line 64, in if_body_2
(logits, state, attention) = ag__.converted_call(ag__.ld(self).forward, (ag__.ld(inputs),), dict(sequence_length=ag__.ld(length_or_step), initial_state=ag__.ld(state), memory=ag__.ld(self).memory, memory_sequence_length=ag__.ld(self).memory_sequence_length, input_fn=ag__.ld(input_fn), sampling_probability=ag__.ld(sampling_probability), training=ag__.ld(training)), fscope)
File "/tmp/__autograph_generated_filegjh6s44_.py", line 26, in tf__forward
logits = ag__.converted_call(ag__.ld(self).output_layer, (ag__.ld(outputs),), None, fscope)
File "/tmp/__autograph_generated_file9xwjs3rn.py", line 51, in tf__call
ag__.if_stmt(ag__.and_((lambda : (ag__.ld(inputs).dtype is ag__.ld(tf).float16)), (lambda : ((ag__.ld(self).units % 8) != 0))), if_body_1, else_body_1, get_state_1, set_state_1, ('outputs',), 1)
File "/tmp/__autograph_generated_file9xwjs3rn.py", line 46, in else_body_1
outputs = ag__.converted_call(ag__.ld(tf).matmul, (ag__.ld(inputs), ag__.ld(self).kernel), dict(transpose_b=ag__.ld(self).transpose), fscope)
ValueError: in user code:
File "/anaconda/envs/py38_default/lib/python3.8/site-packages/opennmt/training.py", line 290, in _accumulate_gradients *
loss, gradients, sample_size = self._model.compute_gradients(
File "/anaconda/envs/py38_default/lib/python3.8/site-packages/opennmt/models/model.py", line 250, in compute_gradients *
loss, sample_size = self.compute_training_loss(
File "/anaconda/envs/py38_default/lib/python3.8/site-packages/opennmt/models/model.py", line 288, in compute_training_loss *
outputs, _ = self(features, labels, training=True, step=step)
File "/anaconda/envs/py38_default/lib/python3.8/site-packages/opennmt/models/model.py", line 118, in __call__ *
outputs, predictions = call_method(
File "/anaconda/envs/py38_default/lib/python3.8/site-packages/opennmt/models/model.py", line 140, in _forward *
return super().__call__(features, labels=labels, training=training, step=step)
File "/anaconda/envs/py38_default/lib/python3.8/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler **
raise e.with_traceback(filtered_tb) from None
File "/tmp/__autograph_generated_filexg1vx_8o.py", line 30, in tf__call
ag__.if_stmt((ag__.ld(labels) is not None), if_body, else_body, get_state, set_state, ('outputs',), 1)
File "/tmp/__autograph_generated_filexg1vx_8o.py", line 25, in if_body
outputs = ag__.converted_call(ag__.ld(self)._decode_target, (ag__.ld(labels), ag__.ld(encoder_outputs), ag__.ld(encoder_state), ag__.ld(encoder_sequence_length)), dict(step=ag__.ld(step), training=ag__.ld(training)), fscope)
File "/tmp/__autograph_generated_file6wj9puor.py", line 31, in tf___decode_target
(logits, _, attention) = ag__.converted_call(ag__.ld(self).decoder, (ag__.ld(target_inputs), ag__.converted_call(ag__.ld(self).labels_inputter.get_length, (ag__.ld(labels),), None, fscope)), dict(state=ag__.ld(initial_state), input_fn=ag__.ld(input_fn), sampling_probability=ag__.ld(sampling_probability), training=ag__.ld(training)), fscope)
File "/tmp/__autograph_generated_filealz86rbr.py", line 75, in tf__call
ag__.if_stmt((ag__.ld(rank) == 2), if_body_3, else_body_3, get_state_3, set_state_3, ('attention', 'logits', 'state'), 3)
File "/tmp/__autograph_generated_filealz86rbr.py", line 71, in else_body_3
ag__.if_stmt((ag__.ld(rank) == 3), if_body_2, else_body_2, get_state_2, set_state_2, ('attention', 'logits', 'state'), 3)
File "/tmp/__autograph_generated_filealz86rbr.py", line 64, in if_body_2
(logits, state, attention) = ag__.converted_call(ag__.ld(self).forward, (ag__.ld(inputs),), dict(sequence_length=ag__.ld(length_or_step), initial_state=ag__.ld(state), memory=ag__.ld(self).memory, memory_sequence_length=ag__.ld(self).memory_sequence_length, input_fn=ag__.ld(input_fn), sampling_probability=ag__.ld(sampling_probability), training=ag__.ld(training)), fscope)
File "/tmp/__autograph_generated_filegjh6s44_.py", line 26, in tf__forward
logits = ag__.converted_call(ag__.ld(self).output_layer, (ag__.ld(outputs),), None, fscope)
File "/tmp/__autograph_generated_file9xwjs3rn.py", line 51, in tf__call
ag__.if_stmt(ag__.and_((lambda : (ag__.ld(inputs).dtype is ag__.ld(tf).float16)), (lambda : ((ag__.ld(self).units % 8) != 0))), if_body_1, else_body_1, get_state_1, set_state_1, ('outputs',), 1)
File "/tmp/__autograph_generated_file9xwjs3rn.py", line 46, in else_body_1
outputs = ag__.converted_call(ag__.ld(tf).matmul, (ag__.ld(inputs), ag__.ld(self).kernel), dict(transpose_b=ag__.ld(self).transpose), fscope)
ValueError: Exception encountered when calling layer 'transformer_big_shared_embeddings_1' (type TransformerBigSharedEmbeddings).
in user code:
File "/anaconda/envs/py38_default/lib/python3.8/site-packages/opennmt/models/sequence_to_sequence.py", line 178, in call *
outputs = self._decode_target(
File "/anaconda/envs/py38_default/lib/python3.8/site-packages/opennmt/models/sequence_to_sequence.py", line 246, in _decode_target *
logits, _, attention = self.decoder(
File "/anaconda/envs/py38_default/lib/python3.8/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler **
raise e.with_traceback(filtered_tb) from None
File "/tmp/__autograph_generated_filealz86rbr.py", line 75, in tf__call
ag__.if_stmt((ag__.ld(rank) == 2), if_body_3, else_body_3, get_state_3, set_state_3, ('attention', 'logits', 'state'), 3)
File "/tmp/__autograph_generated_filealz86rbr.py", line 71, in else_body_3
ag__.if_stmt((ag__.ld(rank) == 3), if_body_2, else_body_2, get_state_2, set_state_2, ('attention', 'logits', 'state'), 3)
File "/tmp/__autograph_generated_filealz86rbr.py", line 64, in if_body_2
(logits, state, attention) = ag__.converted_call(ag__.ld(self).forward, (ag__.ld(inputs),), dict(sequence_length=ag__.ld(length_or_step), initial_state=ag__.ld(state), memory=ag__.ld(self).memory, memory_sequence_length=ag__.ld(self).memory_sequence_length, input_fn=ag__.ld(input_fn), sampling_probability=ag__.ld(sampling_probability), training=ag__.ld(training)), fscope)
File "/tmp/__autograph_generated_filegjh6s44_.py", line 26, in tf__forward
logits = ag__.converted_call(ag__.ld(self).output_layer, (ag__.ld(outputs),), None, fscope)
File "/tmp/__autograph_generated_file9xwjs3rn.py", line 51, in tf__call
ag__.if_stmt(ag__.and_((lambda : (ag__.ld(inputs).dtype is ag__.ld(tf).float16)), (lambda : ((ag__.ld(self).units % 8) != 0))), if_body_1, else_body_1, get_state_1, set_state_1, ('outputs',), 1)
File "/tmp/__autograph_generated_file9xwjs3rn.py", line 46, in else_body_1
outputs = ag__.converted_call(ag__.ld(tf).matmul, (ag__.ld(inputs), ag__.ld(self).kernel), dict(transpose_b=ag__.ld(self).transpose), fscope)
ValueError: Exception encountered when calling layer 'self_attention_decoder_1' (type SelfAttentionDecoder).
in user code:
File "/anaconda/envs/py38_default/lib/python3.8/site-packages/opennmt/decoders/decoder.py", line 257, in call *
logits, state, attention = self.forward(
File "/anaconda/envs/py38_default/lib/python3.8/site-packages/opennmt/decoders/self_attention_decoder.py", line 199, in forward *
logits = self.output_layer(outputs)
File "/anaconda/envs/py38_default/lib/python3.8/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler **
raise e.with_traceback(filtered_tb) from None
File "/tmp/__autograph_generated_file9xwjs3rn.py", line 51, in tf__call
ag__.if_stmt(ag__.and_((lambda : (ag__.ld(inputs).dtype is ag__.ld(tf).float16)), (lambda : ((ag__.ld(self).units % 8) != 0))), if_body_1, else_body_1, get_state_1, set_state_1, ('outputs',), 1)
File "/tmp/__autograph_generated_file9xwjs3rn.py", line 46, in else_body_1
outputs = ag__.converted_call(ag__.ld(tf).matmul, (ag__.ld(inputs), ag__.ld(self).kernel), dict(transpose_b=ag__.ld(self).transpose), fscope)
ValueError: Exception encountered when calling layer 'dense_192' (type Dense).
in user code:
File "/anaconda/envs/py38_default/lib/python3.8/site-packages/opennmt/layers/common.py", line 75, in call *
outputs = tf.matmul(inputs, self.kernel, transpose_b=self.transpose)
ValueError: Dimensions must be equal, but are 1024 and 300 for '{{node transformer_big_shared_embeddings_1/self_attention_decoder_1/dense_192/MatMul}} = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=true](transformer_big_shared_embeddings_1/self_attention_decoder_1/dense_192/Reshape, transformer_big_shared_embeddings_1/self_attention_decoder_1/dense_192/MatMul/ReadVariableOp)' with input shapes: [?,1024], [41118,300].
Call arguments received by layer 'dense_192' (type Dense):
• inputs=tf.Tensor(shape=(None, None, 1024), dtype=float32)
Call arguments received by layer 'self_attention_decoder_1' (type SelfAttentionDecoder):
• inputs=tf.Tensor(shape=(None, None, 300), dtype=float32)
• length_or_step=tf.Tensor(shape=(None,), dtype=int32)
• state=[{'self_kv': ('tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)', 'tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)'), 'memory_kv': [('tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)', 'tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)')]}, {'self_kv': ('tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)', 'tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)'), 'memory_kv': [('tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)', 'tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)')]}, {'self_kv': ('tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)', 'tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)'), 'memory_kv': [('tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)', 'tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)')]}, {'self_kv': ('tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)', 'tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)'), 'memory_kv': [('tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)', 'tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)')]}, {'self_kv': ('tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)', 'tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)'), 'memory_kv': [('tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)', 'tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)')]}, {'self_kv': ('tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)', 'tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)'), 'memory_kv': [('tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)', 'tf.Tensor(shape=(None, 16, 0, 64), dtype=float32)')]}]
• input_fn=<function outer_factory.<locals>.inner_factory.<locals>.tf___decode_target.<locals>.<lambda> at 0x7f6378c55c10>
• sampling_probability=None
• training=True
Call arguments received by layer 'transformer_big_shared_embeddings_1' (type TransformerBigSharedEmbeddings):
• features={'length': 'tf.Tensor(shape=(None,), dtype=int32)', 'tokens': 'tf.Tensor(shape=(None, None), dtype=string)', 'ids': 'tf.Tensor(shape=(None, None), dtype=int64)'}
• labels={'length': 'tf.Tensor(shape=(None,), dtype=int32)', 'tokens': 'tf.Tensor(shape=(None, None), dtype=string)', 'ids': 'tf.Tensor(shape=(None, None), dtype=int64)', 'ids_out': 'tf.Tensor(shape=(None, None), dtype=int64)'}
• training=True
• step=<tf.Variable 'iter:0' shape=() dtype=int64>
From what I can see, the decoder output projection of the big Transformer (d_model = 1024) is incompatible with the pretrained fastText embeddings (d = 300): because the embeddings are shared, the 41118×300 embedding matrix is reused as the output layer’s kernel, so the decoder’s 1024-dimensional outputs can’t be multiplied against it. Since 300 is the standard dimensionality of fastText embeddings, does this mean there’s no way to use a SharedEmbeddings OpenNMT Transformer model with pretrained embeddings? (I imagine even TransformerBaseSharedEmbeddings would hit the same issue, since there d_model = 512, which still isn’t 300.)
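The only workaround I can think of is to define a custom Transformer whose width matches the embedding dimension and pass it to onmt-main with --model. A rough, untested sketch of what I mean (every hyperparameter besides the 300-unit width is just my guess, not taken from the catalog model):

```python
# custom_transformer_300.py -- untested sketch of a shared-embeddings Transformer
# whose model width matches the 300-dim fastText vectors.
import opennmt


class TransformerShared300(opennmt.models.Transformer):
    def __init__(self):
        super().__init__(
            # With shared embeddings the embedding matrix is also the output
            # projection, so embedding_size and num_units must both be 300.
            source_inputter=opennmt.inputters.WordEmbedder(embedding_size=300),
            target_inputter=opennmt.inputters.WordEmbedder(embedding_size=300),
            num_layers=6,
            num_units=300,
            num_heads=6,          # 300 has to stay divisible by the number of heads
            ffn_inner_dim=1200,
            dropout=0.1,
            attention_dropout=0.1,
            ffn_dropout=0.1,
            share_embeddings=opennmt.models.EmbeddingsSharingLevel.ALL,
        )


model = TransformerShared300
```

If that is the intended route, I assume I’d launch training with something like `onmt-main --model custom_transformer_300.py --config data.yaml --auto_config train`, but I’d rather not give up the catalog model’s 1024-unit width if there’s a better option.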
I’d appreciate support on this, thanks.