My code in tokenize.py looks like this:
import sentencepiece as spm

s = spm.SentencePieceProcessor(model_file='jpa_wiki_100000.model')

path_input_eng = 'voc-en.txt'
path_output_eng = 'en_trainv1.txt'
path_input_jyp = 'voc-ja.txt'
path_output_jyp = 'jp_trainv1.txt'

def tokenize(sequence):
    # Sample a subword segmentation and return the pieces joined by
    # spaces, with the SentencePiece meta symbol '▁' stripped out.
    pieces = s.Encode(sequence, out_type=str, enable_sampling=True,
                      alpha=0.001, nbest_size=2)
    cleaned = (piece.replace('▁', '') for piece in pieces)
    return ' '.join(piece for piece in cleaned if piece)

def save_file(path_input, path_output, check_token=False, number_line=1000):
    with open(path_input, 'r', encoding='utf-8') as fin, \
         open(path_output, 'w', encoding='utf-8') as fout:
        for _ in range(number_line):
            line = fin.readline()
            if not line:
                break  # input file has fewer lines than number_line
            if check_token:
                line = tokenize(line.rstrip('\n')) + '\n'
            fout.write(line)

a = 'iPhone 6 Plus 7 Plus 8 Plus 8プラス最新の8500mAh外部バッテリパック保護携帯電話充電器用'
print(a)
print(tokenize(a))
And the result is:
iPhone 6 Plus 7 Plus 8 Plus 8プラス最新の8500mAh外部バッテリパック保護携帯電話充電器用
i P h one 6 P lus 7 P lus 8 P lus 8 プラス 最新の 8 5 00 m A h 外部 バッ テリ パック 保護 携帯電話 充電器 用
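One thing I am not sure about: the script encodes with enable_sampling=True, so each run can give a different segmentation. Encoding the same string deterministically with the same model should show whether the character-level splits of "iPhone" and "mAh" come from the sampling or from the vocabulary itself. A minimal check, assuming the same model file:

import sentencepiece as spm

s = spm.SentencePieceProcessor(model_file='jpa_wiki_100000.model')
a = 'iPhone 6 Plus 7 Plus 8 Plus 8プラス最新の8500mAh外部バッテリパック保護携帯電話充電器用'

# Deterministic (best) segmentation, no subword regularization.
# If the English parts still split into single characters here,
# those subwords are simply missing from the model's vocabulary.
print(s.Encode(a, out_type=str))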
It seems it can tokenize the Japanese, but the tokenization of the English is not good. How can I fix this? I am translating Japanese to English.
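My guess is that jpa_wiki_100000.model was trained only on Japanese Wikipedia text, so English words fall back to single characters. Would retraining one shared model over both corpora help? A sketch of what I have in mind (the prefix 'enja_wiki' and vocab_size=100000 are placeholder values, not something I have run yet):

import sentencepiece as spm

# Hypothetical retraining step: build one shared vocabulary from both
# the English and the Japanese corpus, so English words get whole-word
# or subword pieces instead of single characters.
spm.SentencePieceTrainer.train(
    input='voc-en.txt,voc-ja.txt',   # comma-separated list of input files
    model_prefix='enja_wiki',        # writes enja_wiki.model / enja_wiki.vocab
    vocab_size=100000,
    character_coverage=0.9995,       # high coverage needed for Japanese
    model_type='unigram',
)

# Then load the shared model instead of the Japanese-only one:
s = spm.SentencePieceProcessor(model_file='enja_wiki.model')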