I'm using OpenNMT-py to build a bilingual translation model with the Transformer architecture. On the source side (English) I use word-level tokenization with SentencePiece BPE, and on the target side (Tahitian) I use subword tokenization with DPE.
This is my tokenizer script:
import sentencepiece as spm  # imported for the BPE side; not used directly in this script
from collections import Counter
import re

def is_vowel(char):
    return char in ['a', 'e', 'i', 'o', 'u', 'ā', 'ē', 'ī', 'ō', 'ū']

def tahitian_tokenizer(text):
    tokens = []
    for word in text.split():
        temp_tokens = []
        for i, char in enumerate(word):
            # Keep the ʻokina glued to the vowel that follows it
            if char == 'ʻ' and i + 1 < len(word) and is_vowel(word[i + 1]):
                temp_tokens.append(char + word[i + 1])
            # Emit any other character, unless it was already consumed
            # as part of a preceding ʻokina+vowel pair
            elif char != 'ʻ' and (i == 0 or word[i - 1] != 'ʻ'):
                temp_tokens.append(char)
        tokens.extend(temp_tokens)
    return " ".join(tokens)
def tokenize_source(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as f_in, open(output_file, 'w', encoding='utf-8') as f_out:
        for line in f_in:
            # Tokenize the source text while treating punctuation as separate tokens
            words = re.findall(r'\w+|[^\w\s]', line.strip())
            tokenized_line = " ".join(words)
            f_out.write(tokenized_line + "\n")

def tokenize_target(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as f_in, open(output_file, 'w', encoding='utf-8') as f_out:
        for line in f_in:
            tokenized_line = tahitian_tokenizer(line.strip())
            f_out.write(tokenized_line + "\n")

def write_vocab_file(tokenized_file, vocab_file):
    with open(tokenized_file, 'r', encoding='utf-8') as f:
        tokens = f.read().split()
    token_counts = Counter(tokens)
    sorted_tokens = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)
    with open(vocab_file, 'w', encoding='utf-8') as f:
        for token, count in sorted_tokens:
            f.write(f"{token}\t{count}\n")

# Tokenize source language (English) while treating punctuation as separate tokens
tokenize_source('src-train.txt', 'src-train-words.tok')
tokenize_source('src-val.txt', 'src-val-words.tok')

# Tokenize target language (Tahitian) using the custom tokenizer
tokenize_target('tgt-train.txt', 'tgt-train-dpe.tok')
tokenize_target('tgt-val.txt', 'tgt-val-dpe.tok')

# Generate vocabulary files (token<TAB>count, most frequent first)
write_vocab_file('src-train-words.tok', 'src_words.vocab')
write_vocab_file('tgt-train-dpe.tok', 'tgt_dpe.vocab')
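For reference, a quick sanity check of the two tokenization schemes above (a minimal sketch; the sample strings are only for illustration):

print(tahitian_tokenizer("ʻāmui"))  # -> "ʻā m u i" (the ʻokina stays attached to the following vowel)
print(tahitian_tokenizer("fare"))   # -> "f a r e"
print(" ".join(re.findall(r'\w+|[^\w\s]', "Hello, world!")))  # -> "Hello , world !"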
and this is my inference script:
import argparse
import re
import codecs
from onmt.translate.translator import build_translator

def is_vowel(char):
    return char in ['a', 'e', 'i', 'o', 'u', 'ā', 'ē', 'ī', 'ō', 'ū']

def tahitian_detokenizer(tokens):
    # Both branches of the original if/else appended the token unchanged,
    # so detokenization reduces to concatenating the character tokens.
    return "".join(tokens)
def translate(opt):
    translator = build_translator(
        model=opt.models,
        gpu=opt.gpu,
        beam_size=opt.beam_size,
        min_length=opt.min_length,
        max_length=opt.max_length,
        stepwise_penalty=opt.stepwise_penalty,
        block_ngram_repeat=opt.block_ngram_repeat,
        ignore_when_blocking=set(opt.ignore_when_blocking),
        replace_unk=opt.replace_unk,
        phrase_table=opt.phrase_table,
        verbose=opt.verbose,
        report_time=opt.report_time,
        copy_attn=opt.copy_attn,
        global_scorer=opt.global_scorer,
        out_file=codecs.open(opt.output, "w+", "utf-8") if opt.output else None,
        report_align=opt.report_align,
        report_score=True,
        seed=opt.seed,
        data_type=opt.data_type,
        tgt_file_prefix=opt.tgt_file_prefix,
        ban_unk_token=opt.ban_unk_token,
        dump_beam=opt.dump_beam,
        n_best=opt.n_best,
        alpha=opt.alpha,
        beta=opt.beta,
        length_penalty=opt.length_penalty,
        coverage_penalty=opt.coverage_penalty,
        ratio=opt.ratio,
        max_length_ratio=opt.max_length_ratio,
        random_sampling_topk=opt.random_sampling_topk,
        random_sampling_topp=opt.random_sampling_topp,
        random_sampling_temp=opt.random_sampling_temp,
        bucket_size=opt.bucket_size,
    )
    with open(opt.src_file, 'r', encoding='utf-8') as f:
        source_sentences = f.readlines()
    with open(opt.output, 'w', encoding='utf-8') as f_out:
        for source_sentence in source_sentences:
            source_sentence = source_sentence.strip()
            tokenized_input = re.findall(r'\w+|[^\w\s]', source_sentence)
            batch = {'src': [tokenized_input], 'srclen': [len(tokenized_input)]}
            results = translator.translate_batch(batch, src_dir=opt.src_dir)
            model_output = results[0][0]['tokens']
            detokenized_output = tahitian_detokenizer(model_output)
            f_out.write(detokenized_output + '\n')
def main():
    parser = argparse.ArgumentParser(description='dpe-translate.py', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-config', '--config', help='Path of the main YAML config file.')
    parser.add_argument('-save_config', '--save_config', help='Path where to save the config.')
    parser.add_argument('--model', required=True, nargs='+', help='Path to model .pt file(s). Multiple models can be specified for ensemble decoding.')
    parser.add_argument('--precision', default='', choices=['', 'fp32', 'fp16', 'int8'], help='Precision to run inference. Default is model.dtype; fp32 forces slow FP16 models on GTX 1080; int8 enables PyTorch native 8-bit quantization (CPU only).')
    parser.add_argument('--fp32', action='store_true', help="Deprecated, use 'precision' instead.")
    parser.add_argument('--int8', action='store_true', help="Deprecated, use 'precision' instead.")
    parser.add_argument('--avg_raw_probs', action='store_true', help='If this is set, during ensembling scores from different models will be combined by averaging their raw probabilities and then taking the log. Otherwise, the log probabilities will be averaged directly. Necessary for models whose output layers can assign zero probability.')
    parser.add_argument('--self_attn_type', default='scaled-dot-flash', help='Self-attention type in Transformer decoder layer - currently "scaled-dot", "scaled-dot-flash" or "average".')
    parser.add_argument('--data_type', default='text', help='Type of the source input. Options: [text].')
    parser.add_argument('--src', required=True, help='Source sequence to decode (one line per sequence).')
    parser.add_argument('--tgt', help='True target sequence (optional).')
    parser.add_argument('--tgt_file_prefix', action='store_true', default=False, help='Generate predictions using provided -tgt as prefix.')
    parser.add_argument('--output', default='pred.txt', help='Path to output the predictions (each line will be the decoded sequence).')
    parser.add_argument('--report_align', action='store_true', help='Report alignment for each translation.')
    parser.add_argument('--gold_align', action='store_true', help='Report alignment between source and gold target. Useful to test the performance of learnt alignments.')
    parser.add_argument('--report_time', action='store_true', help='Report some translation time metrics.')
    parser.add_argument('--profile', action='store_true', help='Report PyTorch profiling stats.')
    parser.add_argument('-n_src_feats', type=int, default=0, help='Number of source feats.')
    parser.add_argument('-src_feats_defaults', help='Default features to apply in source in case they are not annotated.')
    parser.add_argument('--beam_size', type=int, default=5, help='Beam size.')
    parser.add_argument('--ratio', type=float, default=-0.0, help='Ratio-based beam stop condition.')
    parser.add_argument('--random_sampling_topk', type=int, default=0, help='Set this to -1 to do random sampling from the full distribution. Set this to a value k > 1 to do random sampling restricted to the k most likely next tokens. Set this to 1 to use argmax.')
    parser.add_argument('--random_sampling_topp', type=float, default=0.0, help='Probability for top-p/nucleus sampling. Restrict tokens to the most likely until the cumulated probability is over p. In range [0, 1]. See "The Curious Case of Neural Text Degeneration" (arXiv:1904.09751).')
    parser.add_argument('--random_sampling_temp', type=float, default=1.0, help='If doing random sampling, divide the logits by this before computing softmax during decoding.')
    parser.add_argument('--seed', type=int, default=-1, help='Set random seed used for better reproducibility between experiments.')
    parser.add_argument('--length_penalty', default='avg', choices=['none', 'wu', 'avg'], help='Length penalty to use.')
    parser.add_argument('--alpha', type=float, default=1.0, help='Length penalty parameter (higher = longer generation).')
    parser.add_argument('--coverage_penalty', default='none', choices=['none', 'wu', 'summary'], help='Coverage penalty to use. Only available in beam search.')
    parser.add_argument('--beta', type=float, default=-0.0, help='Coverage penalty parameter.')
    parser.add_argument('--stepwise_penalty', action='store_true', help='Apply coverage penalty at every decoding step. Helpful for summary penalty.')
    parser.add_argument('--min_length', type=int, default=0, help='Minimum prediction length.')
    parser.add_argument('--max_length', type=int, default=250, help='Maximum prediction length.')
    parser.add_argument('--max_length_ratio', type=float, default=1.25, help='Maximum prediction length ratio. For European languages 1.25 is large enough; for Asian character sets increase to 2-3; for special languages (Burmese, Amharic) up to 10.')
    parser.add_argument('--ignore_when_blocking', nargs='+', default=[], help='Ignore these strings when blocking repeats. You want to block sentence delimiters.')
    parser.add_argument('--replace_unk', action='store_true', help='Replace the generated UNK tokens with the source token that had the highest attention weight. If phrase_table is provided, it will look up the identified source token and give the corresponding target token. If it is not provided (or the identified source token does not exist in the table), then it will copy the source token.')
    parser.add_argument('--ban_unk_token', action='store_true', help='Prevent unk token generation by setting unk probability to 0.')
    parser.add_argument('--phrase_table', default='', help='If phrase_table is provided (with replace_unk), it will look up the identified source token and give the corresponding target token. If it is not provided (or the identified source token does not exist in the table), then it will copy the source token.')
    parser.add_argument('--log_file', default='', help='Output logs to a file under this path.')
    parser.add_argument('--log_file_level', default='0', choices=['CRITICAL', 'ERROR', 'WARNING', 'INFO', 'DEBUG', 'NOTSET', '50', '40', '30', '20', '10', '0'], help='Log file level.')
    parser.add_argument('--verbose', action='store_true', help='Print scores and predictions for each sentence.')
    parser.add_argument('--attn_debug', action='store_true', help='Print best attn for each word.')
    parser.add_argument('--align_debug', action='store_true', help='Print best align for each word.')
    parser.add_argument('--dump_beam', default='', help='File to dump beam information to.')
    parser.add_argument('--n_best', type=int, default=1, help='If verbose is set, will output the n_best decoded sentences.')
    parser.add_argument('--with_score', action='store_true', help='Add a tab-separated score to the translation.')
    parser.add_argument('--gpu_ranks', nargs='+', default=[], help='List of ranks of each process.')
    parser.add_argument('--world_size', type=int, default=1, help='Total number of distributed processes.')
    parser.add_argument('--parallel_mode', default='data_parallel', choices=['tensor_parallel', 'data_parallel'], help='Distributed mode.')
    parser.add_argument('--gpu_backend', default='nccl', help='Type of torch distributed backend.')
    parser.add_argument('--gpu_verbose_level', type=int, default=0, help='Gives more info on each process per GPU.')
    parser.add_argument('--master_ip', default='localhost', help='IP of master for torch.distributed training.')
    parser.add_argument('--master_port', type=int, default=10000, help='Port of master for torch.distributed training.')
    parser.add_argument('--timeout', type=int, default=60, help='Timeout for one GPU to wait for the others.')
    parser.add_argument('--batch_size', type=int, default=30, help='Batch size.')
    parser.add_argument('--batch_type', default='sents', choices=['sents', 'tokens'], help='Batch grouping for batch_size. Standard is sents; tokens will do dynamic batching.')
    parser.add_argument('--gpu', type=int, default=-1, help='Device to run on.')
    parser.add_argument('--block_ngram_repeat', type=int, default=0, help='Block repetition of n-grams during decoding.')
    opt = parser.parse_args()
    opt.models = opt.model  # --model uses nargs='+', so this is already a list
    opt.block_ngram_repeat = opt.block_ngram_repeat  # added in an attempt to fix the AttributeError (a no-op)
    translate(opt)
if __name__ == "__main__":
    main()
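For what it's worth, a minimal round-trip check of the tokenizer/detokenizer pair on a single word, assuming the model emits the same character tokens that tahitian_tokenizer (from the first script) produces:

toks = tahitian_tokenizer("ʻāmui").split()  # ['ʻā', 'm', 'u', 'i']
assert tahitian_detokenizer(toks) == "ʻāmui"
# Note: "".join() has no notion of word boundaries, so multi-word model
# outputs would need an explicit space token to detokenize correctly.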
Command:
python3 dpe-translate.py --model run/models_step_5000.pt --src_file src-test.txt
throws this error:
/home/ubuntu/TY-EN/TY-EN/lib/python3.10/site-packages/onmt/translate/beam_search.py:480: UserWarning: Using length penalty with alpha==0 is equivalent to using length penalty none.
warnings.warn(
Traceback (most recent call last):
  File "/home/ubuntu/TY-EN/EN-TY/dpe-translate.py", line 64, in <module>
    main()
  File "/home/ubuntu/TY-EN/EN-TY/dpe-translate.py", line 61, in main
    translate(opt)
  File "/home/ubuntu/TY-EN/EN-TY/dpe-translate.py", line 18, in translate
    translator = build_translator(opt, report_score=True)
  File "/home/ubuntu/TY-EN/TY-EN/lib/python3.10/site-packages/onmt/translate/translator.py", line 50, in build_translator
    translator = Translator.from_opt(
  File "/home/ubuntu/TY-EN/TY-EN/lib/python3.10/site-packages/onmt/translate/translator.py", line 259, in from_opt
    block_ngram_repeat=opt.block_ngram_repeat,
AttributeError: 'Namespace' object has no attribute 'block_ngram_repeat'