Quick Start Python Only Colab

chopinml · April 17, 2021, 7:20pm

The quick start tutorial relies on many shell commands. For those who are not very familiar with the terminal, or for Windows users, I thought a Python-only version might be very useful.

Feel free to comment:

Here is the Python code. On Windows, pip installs OpenNMT version 1.2.0 by default, so I pinned the exact latest version number. If you get a torch version error, you can run the last line to install torch 1.6.0; otherwise it is not needed.

Creating a virtual environment is suggested.

Install the libraries:

pip install --upgrade pip
pip install OpenNMT-py==2.1.0
# may not be needed (only if you get a torch version error)
pip install --ignore-installed torch==1.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html

Save this Python file as onmt-quickstart.py and execute it,
e.g. with `python onmt-quickstart.py`, or `py onmt-quickstart.py` on Windows.

import requests
import tarfile
import sys
import yaml

from onmt.utils.parse import ArgumentParser
from onmt.opts import dynamic_prepare_opts, train_opts, config_opts, translate_opts
from onmt.bin.build_vocab import build_vocab_main
from onmt.bin.train import train
from onmt.bin.translate import translate

def download_file_with_progress_bar(file_name, link):
  """Download ``link`` into ``file_name`` while drawing a text progress bar.

  The bar is 50 characters wide and is followed by the total size in MiB
  as reported by the ``Content-Length`` header. When the server does not
  report a usable length, the body is written in one go without a bar.

  Args:
    file_name: Path of the local file to create (opened in binary mode).
    link: URL to fetch.

  Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status.
    requests.RequestException: On connection problems or timeouts.
  """
  print("Downloading %s" % file_name)
  # The context manager releases the connection even if writing fails.
  with requests.get(link, stream=True, timeout=30) as response:
    # Fail loudly instead of saving an HTML error page as the archive.
    response.raise_for_status()
    total_length = response.headers.get('content-length')

    # Open the target only after the request succeeded, so a failed
    # request does not leave an empty file behind.
    with open(file_name, "wb") as f:
      # A missing or zero length cannot drive the bar (and a zero length
      # would divide by zero below), so fall back to a plain write.
      if total_length is None or int(total_length) <= 0:
        f.write(response.content)
        return

      total_length = int(total_length)
      # Loop-invariant: the size label never changes during the download.
      size_label = " " + str(round(total_length/1024/1024, 2)) + "M"
      dl = 0
      for data in response.iter_content(chunk_size=4096):
        dl += len(data)
        f.write(data)
        # Clamp: decoded content may exceed the advertised length.
        done = min(50, int(50 * dl / total_length))
        sys.stdout.write("\r[%s%s]" % ('=' * done, ' ' * (50 - done)))
        sys.stdout.write(size_label)
        sys.stdout.flush()
      sys.stdout.write("\n")

def extract_tar_file(file_name):
  """Extract the gzip-compressed tar archive ``file_name``.

  Members are extracted relative to the current working directory.

  NOTE(review): ``extractall()`` trusts the member paths stored in the
  archive, so only use this on archives from a trusted source.

  Args:
    file_name: Path of the ``.tar.gz`` archive to unpack.
  """
  # The context manager closes the archive even when extraction fails.
  with tarfile.open(file_name, "r:gz") as tar:
    tar.extractall()

def print_first_x_lines_of_a_file(file_name, number_of_lines):
  """Print up to ``number_of_lines`` leading lines of a UTF-8 text file.

  Each line is printed once, without the extra blank line that printing
  the raw line (which already ends in a newline) would produce. Printing
  stops early when the file has fewer lines than requested.

  Args:
    file_name: Path of the text file to preview.
    number_of_lines: Maximum number of lines to print.
  """
  # Explicit encoding: the platform default (e.g. cp1252 on Windows) may
  # fail on, or mangle, non-ASCII text.
  with open(file_name, encoding="utf-8") as a_file:
    # zip() stops at the shorter of the two, so EOF ends the loop cleanly.
    for _, line in zip(range(number_of_lines), a_file):
      print(line.rstrip("\n"))

def create_yaml_for_vocabulary(file_name):
  """Write the build-vocab YAML configuration to ``file_name``.

  The configuration points at the ``toy-ende`` training and validation
  files and tells the vocabulary builder where to write the sample data
  and the source/target vocabulary files.

  Args:
    file_name: Path of the YAML file to create (overwritten if present).

  Raises:
    yaml.YAMLError: If the embedded configuration is not valid YAML.
  """
  yaml_config = """
  ## Where the samples will be written
  save_data: toy-ende/run/example

  ## Where the vocab(s) will be written
  src_vocab: toy-ende/run/example.vocab.src
  tgt_vocab: toy-ende/run/example.vocab.tgt
  overwrite: true

  # Corpus opts:
  data:
      corpus:
          path_src: toy-ende/src-train.txt
          path_tgt: toy-ende/tgt-train.txt
          transforms: []
          weight: 1
      valid:
          path_src: toy-ende/src-val.txt
          path_tgt: toy-ende/tgt-val.txt
          transforms: []
  """
  # Parsed purely as a sanity check; the raw text (with its comments) is
  # what gets written, so the parsed result is intentionally discarded.
  yaml.safe_load(yaml_config)
  with open(file_name, "w", encoding="utf-8") as f:
    f.write(yaml_config)

def create_yaml_for_train(file_name, gpu_array):
  """Write the training YAML configuration to ``file_name``.

  Args:
    file_name: Path of the YAML file to create (overwritten if present).
    gpu_array: GPU ranks to train on, either as a YAML flow-style string
      such as ``"[0]"`` / ``"[0,1]"`` / ``"[]"`` or as a list of ints.
      An empty list selects CPU training.

  Raises:
    yaml.YAMLError: If ``gpu_array`` or the resulting configuration is not
      valid YAML.
  """
  # Normalise the ranks to a list so the number of GPUs can be counted.
  ranks = yaml.safe_load(gpu_array) if isinstance(gpu_array, str) else gpu_array
  if ranks is None:
    ranks = []
  elif not isinstance(ranks, (list, tuple)):
    ranks = [ranks]
  ranks = list(ranks)
  # world_size must cover every listed rank; a hard-coded 1 only fits the
  # single-GPU and CPU cases. NOTE(review): OpenNMT-py is expected to reject
  # world_size < len(gpu_ranks) - confirm against the installed version.
  world_size = max(1, len(ranks))

  yaml_config = """
  ## Where the samples will be written
  save_data: toy-ende/run/example

  ## Vocabulary files created by the build-vocab step
  src_vocab: toy-ende/run/example.vocab.src
  tgt_vocab: toy-ende/run/example.vocab.tgt
  overwrite: true

  # Corpus opts:
  data:
      corpus:
          path_src: toy-ende/src-train.txt
          path_tgt: toy-ende/tgt-train.txt
          transforms: []
          weight: 1
      valid:
          path_src: toy-ende/src-val.txt
          path_tgt: toy-ende/tgt-val.txt
          transforms: []

  # One training process per listed GPU; an empty gpu_ranks trains on CPU
  world_size: {world_size}
  gpu_ranks: {gpu_ranks}

  # Where to save the checkpoints
  save_model: toy-ende/run/model
  save_checkpoint_steps: 500
  train_steps: 1000
  valid_steps: 500
  """.format(world_size=world_size, gpu_ranks=ranks)
  # Parsed purely as a sanity check; the raw text (with its comments) is
  # what gets written, so the parsed result is intentionally discarded.
  yaml.safe_load(yaml_config)
  with open(file_name, "w", encoding="utf-8") as f:
    f.write(yaml_config)

if __name__ == "__main__":
  # End-to-end quick start: download the toy corpus, build the vocabulary,
  # train a model and translate the test set.

  # torch is already required by OpenNMT-py; it is imported here only to
  # find out whether a CUDA device can be used.
  import torch

  # Single switch for the whole run, so training and translation always
  # agree: GPU 0 when CUDA is usable, otherwise CPU.
  use_gpu = torch.cuda.is_available()

  # Downloading toy English to German data set.
  # Includes train, test and validate files for source and target language
  # src-train, tgt-train
  # src-test, tgt-test
  # src-val, tgt-val

  # === Download toy example tar.gz file and extract
  download_file_with_progress_bar(
    "toy-ende.tar.gz",
    "https://s3.amazonaws.com/opennmt-trainingdata/toy-ende.tar.gz"
  )
  extract_tar_file("toy-ende.tar.gz")
  print_first_x_lines_of_a_file("toy-ende/src-train.txt", 3)

  # === Prepare build-vocab yaml file ===
  create_yaml_for_vocabulary("toy-ende/build-vocab.yaml")

  # Build example.vocab.[src/tgt] files
  parser = ArgumentParser(description='build_vocab.py')
  dynamic_prepare_opts(parser, build_vocab_only=True)
  base_args = [
    "-config", "toy-ende/build-vocab.yaml",
    "-n_sample", "10000",
  ]
  opts, unknown = parser.parse_known_args(base_args)
  build_vocab_main(opts)

  # === Prepare train yaml file ===
  # The second parameter lists the GPU ranks, e.g. "[0]" or "[0,1]";
  # "[]" means only CPU training.
  create_yaml_for_train("toy-ende/train.yaml", "[0]" if use_gpu else "[]")

  # === Start training ===
  parser = ArgumentParser(description='train.py')
  train_opts(parser)
  base_args = ["-config", "toy-ende/train.yaml"]
  opts, unknown = parser.parse_known_args(base_args)
  train(opts)

  # === Translate results to pred_1000.txt ===
  parser = ArgumentParser(description='translate.py')

  config_opts(parser)
  translate_opts(parser)

  base_args = [
    "-model", "toy-ende/run/model_step_1000.pt",
    "-src", "toy-ende/src-test.txt",
    "-output", "toy-ende/pred_1000.txt",
    "-verbose",
  ]
  if use_gpu:
    # Without -gpu the translator stays on the CPU.
    base_args += ["-gpu", "0"]
  opts, unknown = parser.parse_known_args(base_args)
  translate(opts)