The quick-start tutorial contains many shell commands. For those who are not very familiar with the terminal, or for Windows users, I thought a Python-only version might be very useful.
Feel free to comment:
Here is the Python code. On Windows, pip installs OpenNMT version 1.2.0 by default, so I pinned the exact latest version number. If you get a torch version error, run the last line below to install version 1.6.0; otherwise it is not needed.
Creating a virtual environment is suggested.
Install the libraries:
pip install --upgrade pip
pip install OpenNMT-py==2.1.0
# may not be needed (only if you hit a torch version error)
pip install --ignore-installed torch==1.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
Save this Python file as onmt-quickstart.py and execute it,
e.g. `python onmt-quickstart.py` (or `py onmt-quickstart.py` on Windows).
import requests
import tarfile
import sys
import yaml
from onmt.utils.parse import ArgumentParser
from onmt.opts import dynamic_prepare_opts, train_opts, config_opts, translate_opts
from onmt.bin.build_vocab import build_vocab_main
from onmt.bin.train import train
from onmt.bin.translate import translate
def download_file_with_progress_bar(file_name, link):
    """Download `link` to `file_name`, printing a 50-char text progress bar.

    Falls back to a single bulk write when the server sends no
    Content-Length header (progress cannot be computed in that case).

    Args:
        file_name: local path to write the downloaded bytes to.
        link: URL to fetch.

    Raises:
        requests.HTTPError: if the server responds with an error status.
    """
    with open(file_name, "wb") as f:
        print("Downloading %s" % file_name)
        response = requests.get(link, stream=True)
        # Fail early on 4xx/5xx instead of saving an HTML error page
        # as if it were the archive.
        response.raise_for_status()
        total_length = response.headers.get('content-length')
        if total_length is None:  # no content length header
            f.write(response.content)
        else:
            downloaded = 0
            total_length = int(total_length)
            for data in response.iter_content(chunk_size=4096):
                downloaded += len(data)
                f.write(data)
                # Guard against a pathological "Content-Length: 0" header.
                done = int(50 * downloaded / total_length) if total_length else 50
                sys.stdout.write("\r[%s%s]" % ('=' * done, ' ' * (50 - done)))
                sys.stdout.write(" " + str(round(total_length / 1024 / 1024, 2)) + "M")
                sys.stdout.flush()
            sys.stdout.write("\n")
def extract_tar_file(file_name):
    """Extract a gzip-compressed tar archive into the current directory.

    Args:
        file_name: path to a .tar.gz archive.
    """
    # The context manager guarantees the archive is closed even if
    # extraction fails partway through (the original leaked the handle).
    # NOTE(review): extractall() trusts member paths; fine for the known
    # OpenNMT toy archive, but pass filter="data" (Python 3.12+) when
    # extracting untrusted downloads.
    with tarfile.open(file_name, "r:gz") as tar:
        tar.extractall()
def print_first_x_lines_of_a_file(file_name, number_of_lines):
    """Print the first `number_of_lines` lines of a text file.

    Stops early at end-of-file instead of printing blank lines, and
    closes the file deterministically (the original never closed it).

    Args:
        file_name: path of the text file to read.
        number_of_lines: maximum number of lines to print.
    """
    with open(file_name) as a_file:
        for _ in range(number_of_lines):
            line = a_file.readline()
            if not line:  # reached EOF before number_of_lines
                break
            # print() adds its own newline on top of the one in `line`,
            # which double-spaces the output (kept from the original).
            print(line)
def create_yaml_for_vocabulary(file_name):
    """Write the OpenNMT build-vocab YAML configuration to `file_name`.

    The config tells `onmt_build_vocab` where the toy-ende corpus lives
    and where the generated vocab files should be written.

    Args:
        file_name: destination path for the YAML file.
    """
    yaml_config = """\
## Where the samples will be written
save_data: toy-ende/run/example
## Where the vocab(s) will be written
src_vocab: toy-ende/run/example.vocab.src
tgt_vocab: toy-ende/run/example.vocab.tgt
overwrite: true
# Corpus opts:
data:
    corpus:
        path_src: toy-ende/src-train.txt
        path_tgt: toy-ende/tgt-train.txt
        transforms: []
        weight: 1
    valid:
        path_src: toy-ende/src-val.txt
        path_tgt: toy-ende/tgt-val.txt
        transforms: []
"""
    # The original parsed the string with yaml.safe_load() but never used
    # the result; the dead assignment has been removed.
    with open(file_name, "w") as f:
        f.write(yaml_config)
def create_yaml_for_train(file_name, gpu_array):
    """Write the OpenNMT training YAML configuration to `file_name`.

    Args:
        file_name: destination path for the YAML file.
        gpu_array: string rendering of the gpu_ranks list, e.g. "[0]"
            for one GPU, "[0, 1]" for two, or "[]" for CPU-only training.
    """
    # Fixes from the original: the src_vocab/tgt_vocab keys appeared twice
    # in the generated YAML (duplicate mapping keys), and the unused
    # yaml.safe_load() assignment has been removed.
    yaml_config = """\
## Where the samples will be written
save_data: toy-ende/run/example
## Where the vocab(s) will be written (created by the build-vocab step)
src_vocab: toy-ende/run/example.vocab.src
tgt_vocab: toy-ende/run/example.vocab.tgt
overwrite: true
# Corpus opts:
data:
    corpus:
        path_src: toy-ende/src-train.txt
        path_tgt: toy-ende/tgt-train.txt
        transforms: []
        weight: 1
    valid:
        path_src: toy-ende/src-val.txt
        path_tgt: toy-ende/tgt-val.txt
        transforms: []
# GPU layout ([] trains on CPU only)
world_size: 1
gpu_ranks: {}
# Where to save the checkpoints
save_model: toy-ende/run/model
save_checkpoint_steps: 500
train_steps: 1000
valid_steps: 500
""".format(gpu_array)
    with open(file_name, "w") as f:
        f.write(yaml_config)
if __name__ == "__main__":
    # Single place to configure GPUs for BOTH training and translation.
    # Use [0] (or [0, 1], ...) for GPU runs, or [] for CPU-only — the
    # original hard-coded the GPU choice in two independent places.
    gpu_ranks = [0]

    # === Download the toy English->German data set and extract it ===
    # The archive contains train/test/validation files for source and
    # target languages: src-train/tgt-train, src-test/tgt-test,
    # src-val/tgt-val.
    download_file_with_progress_bar(
        "toy-ende.tar.gz",
        "https://s3.amazonaws.com/opennmt-trainingdata/toy-ende.tar.gz"
    )
    extract_tar_file("toy-ende.tar.gz")
    print_first_x_lines_of_a_file("toy-ende/src-train.txt", 3)

    # === Build the vocabulary files (example.vocab.src / .tgt) ===
    create_yaml_for_vocabulary("toy-ende/build-vocab.yaml")
    parser = ArgumentParser(description='build_vocab.py')
    dynamic_prepare_opts(parser, build_vocab_only=True)
    base_args = [
        "-config", "toy-ende/build-vocab.yaml",
        "-n_sample", "10000",
    ]
    opts, unknown = parser.parse_known_args(base_args)
    build_vocab_main(opts)

    # === Train (checkpoints under toy-ende/run/) ===
    # str([0]) renders as "[0]", which is valid YAML for gpu_ranks.
    create_yaml_for_train("toy-ende/train.yaml", str(gpu_ranks))
    parser = ArgumentParser(description='train.py')
    train_opts(parser)
    base_args = ["-config", "toy-ende/train.yaml"]
    opts, unknown = parser.parse_known_args(base_args)
    train(opts)

    # === Translate the test set to toy-ende/pred_1000.txt ===
    parser = ArgumentParser(description='translate.py')
    config_opts(parser)
    translate_opts(parser)
    base_args = [
        "-model", "toy-ende/run/model_step_1000.pt",
        "-src", "toy-ende/src-test.txt",
        "-output", "toy-ende/pred_1000.txt",
        "-verbose",
    ]
    # Only request a GPU when one was configured above.
    if gpu_ranks:
        base_args += ["-gpu", str(gpu_ranks[0])]
    opts, unknown = parser.parse_known_args(base_args)
    translate(opts)