Hello community. I encountered a problem starting training after updating tensorflow from version 2.11.0 to 2.13.0.
I tried running it with different parameters (even using the default config) and kept getting this error, but when I downgraded the version back to 2.11, everything worked fine.
Maybe you can tell me what the problem is?
I tried running the training with different parameters (even using the default config) and kept encountering this error, but when I downgraded the version back to 2.11, the error no longer occurred.
Maybe you can tell me what the problem is?
To run I used this docker image: tensorflow/tensorflow:2.13.0-gpu
OpenNMT-tf version: "==2.32.0"
GPU: GRID V100DX-16Q
Here is the error traceback:
| File "/root/.cache/pypoetry/virtualenvs/new-datastudio-prototype-9TtSrW0h-py3.10/lib/python3.10/site-packages/opennmt/runner.py", line 310, in train
| summary = trainer(
| File "/root/.cache/pypoetry/virtualenvs/new-datastudio-prototype-9TtSrW0h-py3.10/lib/python3.10/site-packages/opennmt/training.py", line 111, in __call__
| for i, loss in enumerate(self._steps(dataset, accum_steps=accum_steps)):
| File "/root/.cache/pypoetry/virtualenvs/new-datastudio-prototype-9TtSrW0h-py3.10/lib/python3.10/site-packages/opennmt/training.py", line 229, in _steps
| accumulate_gradients(batch)
| File "/root/.cache/pypoetry/virtualenvs/new-datastudio-prototype-9TtSrW0h-py3.10/lib/python3.10/site-packages/opennmt/training.py", line 290, in _accumulate_gradients
| loss, gradients, sample_size = self._model.compute_gradients(
| File "/root/.cache/pypoetry/virtualenvs/new-datastudio-prototype-9TtSrW0h-py3.10/lib/python3.10/site-packages/opennmt/models/model.py", line 250, in compute_gradients
| loss, sample_size = self.compute_training_loss(
| File "/root/.cache/pypoetry/virtualenvs/new-datastudio-prototype-9TtSrW0h-py3.10/lib/python3.10/site-packages/opennmt/models/model.py", line 288, in compute_training_loss
| outputs, _ = self(features, labels, training=True, step=step)
| File "/root/.cache/pypoetry/virtualenvs/new-datastudio-prototype-9TtSrW0h-py3.10/lib/python3.10/site-packages/opennmt/models/model.py", line 118, in __call__
| outputs, predictions = call_method(
| File "/root/.cache/pypoetry/virtualenvs/new-datastudio-prototype-9TtSrW0h-py3.10/lib/python3.10/site-packages/opennmt/models/model.py", line 140, in _forward
| return super().__call__(features, labels=labels, training=training, step=step)
| File "/root/.cache/pypoetry/virtualenvs/new-datastudio-prototype-9TtSrW0h-py3.10/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler
| return fn(*args, **kwargs)
| File "/root/.cache/pypoetry/virtualenvs/new-datastudio-prototype-9TtSrW0h-py3.10/lib/python3.10/site-packages/keras/src/engine/base_layer.py", line 1150, in __call__
| outputs = call_fn(inputs, *args, **kwargs)
| File "/root/.cache/pypoetry/virtualenvs/new-datastudio-prototype-9TtSrW0h-py3.10/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler
| return fn(*args, **kwargs)
| File "/root/.cache/pypoetry/virtualenvs/new-datastudio-prototype-9TtSrW0h-py3.10/lib/python3.10/site-packages/opennmt/models/sequence_to_sequence.py", line 177, in call
| if labels is not None:
| File "/root/.cache/pypoetry/virtualenvs/new-datastudio-prototype-9TtSrW0h-py3.10/lib/python3.10/site-packages/opennmt/models/sequence_to_sequence.py", line 178, in call
| outputs = self._decode_target(
| File "/root/.cache/pypoetry/virtualenvs/new-datastudio-prototype-9TtSrW0h-py3.10/lib/python3.10/site-packages/opennmt/models/sequence_to_sequence.py", line 246, in _decode_target
| logits, _, attention = self.decoder(
| File "/root/.cache/pypoetry/virtualenvs/new-datastudio-prototype-9TtSrW0h-py3.10/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler
| return fn(*args, **kwargs)
| File "/root/.cache/pypoetry/virtualenvs/new-datastudio-prototype-9TtSrW0h-py3.10/lib/python3.10/site-packages/keras/src/engine/base_layer.py", line 1150, in __call__
| outputs = call_fn(inputs, *args, **kwargs)
| File "/root/.cache/pypoetry/virtualenvs/new-datastudio-prototype-9TtSrW0h-py3.10/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler
| return fn(*args, **kwargs)
| File "/root/.cache/pypoetry/virtualenvs/new-datastudio-prototype-9TtSrW0h-py3.10/lib/python3.10/site-packages/opennmt/decoders/decoder.py", line 238, in call
| if rank == 2:
| File "/root/.cache/pypoetry/virtualenvs/new-datastudio-prototype-9TtSrW0h-py3.10/lib/python3.10/site-packages/opennmt/decoders/decoder.py", line 252, in call
| elif rank == 3:
| File "/root/.cache/pypoetry/virtualenvs/new-datastudio-prototype-9TtSrW0h-py3.10/lib/python3.10/site-packages/opennmt/decoders/decoder.py", line 257, in call
| logits, state, attention = self.forward(
| File "/root/.cache/pypoetry/virtualenvs/new-datastudio-prototype-9TtSrW0h-py3.10/lib/python3.10/site-packages/opennmt/decoders/self_attention_decoder.py", line 192, in forward
| outputs, state, attention = self._run(
| File "/root/.cache/pypoetry/virtualenvs/new-datastudio-prototype-9TtSrW0h-py3.10/lib/python3.10/site-packages/opennmt/decoders/self_attention_decoder.py", line 152, in _run
| for i, layer in enumerate(self.layers):
| File "/root/.cache/pypoetry/virtualenvs/new-datastudio-prototype-9TtSrW0h-py3.10/lib/python3.10/site-packages/opennmt/decoders/self_attention_decoder.py", line 153, in _run
| inputs, layer_cache, layer_attention = layer(
| File "/root/.cache/pypoetry/virtualenvs/new-datastudio-prototype-9TtSrW0h-py3.10/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler
| return fn(*args, **kwargs)
| File "/root/.cache/pypoetry/virtualenvs/new-datastudio-prototype-9TtSrW0h-py3.10/lib/python3.10/site-packages/keras/src/engine/base_layer.py", line 1150, in __call__
| outputs = call_fn(inputs, *args, **kwargs)
| File "/root/.cache/pypoetry/virtualenvs/new-datastudio-prototype-9TtSrW0h-py3.10/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler
| return fn(*args, **kwargs)
| File "/root/.cache/pypoetry/virtualenvs/new-datastudio-prototype-9TtSrW0h-py3.10/lib/python3.10/site-packages/opennmt/layers/transformer.py", line 587, in call
| outputs, self_kv = self.self_attention(
| File "/root/.cache/pypoetry/virtualenvs/new-datastudio-prototype-9TtSrW0h-py3.10/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler
| return fn(*args, **kwargs)
| File "/root/.cache/pypoetry/virtualenvs/new-datastudio-prototype-9TtSrW0h-py3.10/lib/python3.10/site-packages/keras/src/engine/base_layer.py", line 1150, in __call__
| outputs = call_fn(inputs, *args, **kwargs)
| File "/root/.cache/pypoetry/virtualenvs/new-datastudio-prototype-9TtSrW0h-py3.10/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler
| return fn(*args, **kwargs)
| File "/root/.cache/pypoetry/virtualenvs/new-datastudio-prototype-9TtSrW0h-py3.10/lib/python3.10/site-packages/opennmt/layers/common.py", line 137, in call
| if self.input_layer_norm is not None:
| File "/root/.cache/pypoetry/virtualenvs/new-datastudio-prototype-9TtSrW0h-py3.10/lib/python3.10/site-packages/opennmt/layers/common.py", line 138, in call
| x = self.input_layer_norm(x)
| File "/root/.cache/pypoetry/virtualenvs/new-datastudio-prototype-9TtSrW0h-py3.10/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler
| return fn(*args, **kwargs)
| File "/root/.cache/pypoetry/virtualenvs/new-datastudio-prototype-9TtSrW0h-py3.10/lib/python3.10/site-packages/keras/src/engine/base_layer.py", line 1150, in __call__
| outputs = call_fn(inputs, *args, **kwargs)
| File "/root/.cache/pypoetry/virtualenvs/new-datastudio-prototype-9TtSrW0h-py3.10/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler
| return fn(*args, **kwargs)
| File "/root/.cache/pypoetry/virtualenvs/new-datastudio-prototype-9TtSrW0h-py3.10/lib/python3.10/site-packages/keras/src/layers/normalization/layer_normalization.py", line 333, in call
| outputs, _, _ = tf.compat.v1.nn.fused_batch_norm(
| Node: 'private__train_custom_model/self_attention_decoder/self_attention_decoder_layer/transformer_layer_wrapper_12/layer_norm_14/FusedBatchNormV3'
| cuDNN launch failure : input shape ([1,2500,1024,1])
| [[{{node private__train_custom_model/self_attention_decoder/self_attention_decoder_layer/transformer_layer_wrapper_12/layer_norm_14/FusedBatchNormV3}}]] [Op:__inference__accumulate_gradients_35024]