Hi,
I am trying to run training from a notebook, but it seems to be stuck. How can I tell what the issue is?
Here is my code:
```python
import logging

import opennmt as onmt
import tensorflow as tf

tf.get_logger().setLevel(logging.INFO)

config = {
    "model_dir": "run/",
    "data": {
        "source_vocabulary": "vocabSrc.txt",
        "target_vocabulary": "vocabTarget.txt",
        "train_features_file": "SrcValFile.txt",
        "train_labels_file": "TrgValFile.txt",
        "eval_features_file": "SrcValFile.txt",
        "eval_labels_file": "TrgValFile.txt",
        "sequence_controls": {
            "start": True,
            "end": True,
        },
    },
    "params": {
        "beam_width": 5,
    },
    "train": {
        "batch_size": 32,
        "batch_type": "tokens",
        "max_step": 10000,
        "save_checkpoints_steps": 5000,
        "keep_checkpoint_max": 10,
        "save_summary_steps": 200,
    },
    "eval": {
        "batch_size": 32,
        "batch_type": "tokens",
        "steps": 200,
        "export_on_best": "loss",
        "export_format": "saved_model",
        "max_exports_to_keep": 5,
        "early_stopping": {
            "metric": "loss",
            "min_improvement": 0.01,
            "steps": 4,
        },
    },
    "infer": {
        "n_best": 5,
        "with_scores": True,
    },
}

print(tf.config.list_physical_devices("GPU"))
print(tf.test.is_built_with_cuda())
print(tf.test.is_built_with_xla())
print(tf.test.is_built_with_gpu_support())

# Disabled for now: GPU memory growth setup.
'''
# cross_tower_ops = tf.distribute.HierarchicalCopyAllReduce(num_packs=4)
# strategy = tf.distribute.MirroredStrategy(cross_device_ops=cross_tower_ops)
gpus = tf.config.experimental.list_physical_devices("GPU")
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices("GPU")
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)
'''

model = onmt.models.Transformer(
    source_inputter=onmt.inputters.WordEmbedder(embedding_size=512),
    target_inputter=onmt.inputters.WordEmbedder(embedding_size=512),
    num_layers=6,
    num_units=512,
    num_heads=8,
    ffn_inner_dim=2048,
    dropout=0.1,
    attention_dropout=0.1,
    ffn_dropout=0.1,
)

runner = onmt.Runner(model, config, auto_config=True)

print("start training")
output_dir, summary = runner.train(num_devices=1, with_eval=True, return_summary=True)
print("end training")
print(output_dir)
print(summary)

runner.infer(
    features_file="SrcValFile.txt",
    predictions_file="onmttftest.txt",
    log_time=True,
)

print(tf.config.list_physical_devices("GPU"))
```
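One idea I had for getting more signal (just a sketch on my side, assuming TF 2.x; this is not part of the run above): turn on device placement logging and lower the log threshold before building the runner, so each op reports whether it actually lands on the GPU:

```python
import logging

import tensorflow as tf

# Log which device every op is placed on, and show DEBUG-level messages,
# to check whether the training step is really running on the GPU.
tf.debugging.set_log_device_placement(True)
tf.get_logger().setLevel(logging.DEBUG)
```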
Here is the latest output printed to the console:
> 2021-02-01 08:19:05.542813: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
> INFO:tensorflow:Number of model parameters: 44215857
> INFO:tensorflow:Number of model weights: 260 (trainable = 260, non trainable = 0)
> 2021-02-01 08:19:42.857344: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cublas64_11.dll
> 2021-02-01 08:19:43.508876: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cublasLt64_11.dll
> 2021-02-01 08:19:43.580691: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cudnn64_8.dll
> INFO:tensorflow:Saved checkpoint run/ckpt-1
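Since the last line mentions run/ckpt-1, the only check I could think of is polling the model directory to see whether the step suffix in the checkpoint name ever advances (a quick sketch, assuming the run/ directory from the config above):

```python
import time

import tensorflow as tf

# If the -N suffix never increases past ckpt-1, training is stalled
# rather than just slow.
for _ in range(10):
    print(tf.train.latest_checkpoint("run/"))
    time.sleep(60)
```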
Any clues?