Thanks, I see where I went wrong, and it works very well now.
Are there any language detection models available, like langdetect or langid?
Also, can this project or model support terminology intervention and corpus intervention?
It seems the original repository includes a fastText language identification model, available here under the CC-BY-NC licence, but consider this issue. As you can see in these examples, the original fastText models (lid.176.bin and lid.176.ftz) output more accurate language IDs than lid218e.bin. Moreover, the original fastText models are distributed under a different licence.
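On the language-detection question: the fastText LID models mentioned above can be loaded with the `fasttext` Python package. Below is a minimal sketch, not a definitive implementation; the filename `lid.176.ftz` in the working directory is an assumption, and the model file must be downloaded from the fastText site first.

```python
import os

def parse_fasttext_label(label: str) -> str:
    # fastText returns labels such as "__label__en"; strip the prefix
    # to recover the bare ISO 639-1 code.
    prefix = "__label__"
    return label[len(prefix):] if label.startswith(prefix) else label

def detect_language(model, text: str):
    # predict() rejects newlines, so flatten the text first.
    labels, probs = model.predict(text.replace("\n", " "))
    return parse_fasttext_label(labels[0]), float(probs[0])

if __name__ == "__main__" and os.path.exists("lid.176.ftz"):
    import fasttext  # pip install fasttext
    model = fasttext.load_model("lid.176.ftz")  # path is an assumption
    print(detect_language(model, "Bonjour tout le monde"))
```

The compressed `.ftz` model trades a little accuracy for a much smaller download than the `.bin` version, which may matter if you ship it with an application.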
    private final Locale locale;
    private final String iso639_1;
    private final String iso639_3;
    private final String chineseName;
    private final String englishName;
    private final String localName;

    Language(Locale locale, String iso639_1, String iso639_3, String chineseName, String englishName, String localName) {
        this.locale = locale;
        this.iso639_1 = iso639_1;
        this.iso639_3 = iso639_3;
        this.chineseName = chineseName;
        this.englishName = englishName;
        this.localName = localName;
    }

    public Locale getLocale() {
        return locale;
    }

    public String getIso639_1() {
        return iso639_1;
    }

    public String getIso639_3() {
        return iso639_3;
    }

    public String getChineseName() {
        return chineseName;
    }

    public String getEnglishName() {
        return englishName;
    }

    public String getLocalName() {
        return localName;
    }
}
I want to know about the 205 language codes of NLLB-200, such as:
ace_Arab Acehnese (Arabic script)
ace_Latn Acehnese (Latin script)
What standard is used?
Normal locale codes usually consist of a language code (ISO 639-1) and a country/region code (ISO 3166-1 alpha-2).
Do I need to build a mapping table from NLLB-200 language codes to standard language codes?
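For what it's worth, the `lang_Script` pattern used by NLLB-200 combines an ISO 639-3 language code with an ISO 15924 script code (e.g. eng_Latn, zho_Hans), so a mapping table down to ISO 639-1 locale codes is the usual approach. A minimal sketch with a handful of illustrative entries follows; the table itself is an assumption for demonstration, not a complete mapping.

```python
# Illustrative subset only; a real table needs all of the NLLB-200 codes.
NLLB_TO_LOCALE = {
    "eng_Latn": "en",
    "fra_Latn": "fr",
    "zho_Hans": "zh-CN",  # Simplified Chinese
    "zho_Hant": "zh-TW",  # Traditional Chinese
}

def to_locale_code(nllb_code: str) -> str:
    # Fall back to the ISO 639-3 part for languages that have no
    # ISO 639-1 code (e.g. "ace_Latn" -> "ace" for Acehnese).
    if nllb_code in NLLB_TO_LOCALE:
        return NLLB_TO_LOCALE[nllb_code]
    lang, _script = nllb_code.split("_", 1)
    return lang

print(to_locale_code("zho_Hans"))  # zh-CN
print(to_locale_code("ace_Latn"))  # ace
```

Note that several NLLB codes can map to the same base language (zho_Hans and zho_Hant both mean Chinese), so the script part is worth preserving when the distinction matters.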
import ctranslate2
import sentencepiece as spm

# [Modify] Set paths to the CTranslate2 and SentencePiece models
ct_model_path = "/root/models/nllb-200-3.3B-int8"
sp_model_path = "/root/models/flores200_sacrebleu_tokenizer_spm.model"
device = "cuda"  # or "cpu"

# Load the source SentencePiece model
sp = spm.SentencePieceProcessor()
sp.load(sp_model_path)

# Load the CTranslate2 model once at module level
translator = ctranslate2.Translator(ct_model_path, device=device)

def translation_function(source_sents, src_lang, tgt_lang):
    beam_size = 4
    source_sentences = [sent.strip() for sent in source_sents]
    target_prefix = [[tgt_lang]] * len(source_sentences)

    # Subword the source sentences
    source_sents_subworded = sp.encode_as_pieces(source_sentences)
    source_sents_subworded = [[src_lang] + sent + ["</s>"] for sent in source_sents_subworded]
    print(source_sents_subworded)
    # print("First subworded source sentence:", source_sents_subworded[0], sep="\n")

    # Translate the source sentences
    translations_subworded = translator.translate_batch(
        source_sents_subworded,
        batch_type="tokens",
        max_batch_size=2024,
        beam_size=beam_size,
        target_prefix=target_prefix,
    )
    print(translations_subworded)
    translations_subworded = [translation.hypotheses[0] for translation in translations_subworded]
    for translation in translations_subworded:
        if tgt_lang in translation:
            translation.remove(tgt_lang)

    # Desubword the target sentences
    translations = sp.decode(translations_subworded)
    for src_sent, tgt_sent in zip(source_sentences, translations):
        print(src_sent)
        print(tgt_sent)

if __name__ == '__main__':
    source_sents = [input('input text: ')]
    src_lang = input('input src_lang: ')
    tgt_lang = input('input tgt_lang: ')
    if not source_sents[0] or not src_lang or not tgt_lang:
        source_sents = [
            "In the Big Model era, big models should be used for translation, but the translation efficiency is too low, what to do?",
            # "大家好",
            # "你是谁"
        ]
        # src_lang = "kin_Latn"
        src_lang = "eng_Latn"
        tgt_lang = "zho_Hans"
    translation_function(source_sents, src_lang, tgt_lang)
My problem:
With the target language set to Simplified Chinese (tgt_lang = "zho_Hans"), the translation result is: 在大模型时代, 大模型应该用于翻译, ("In the Big Model era, big models should be used for translation,")
Changing the target language to Traditional Chinese (tgt_lang = "zho_Hant"), the translation result is: 但翻譯效率太低了, 我們該怎麼辦? ("but the translation efficiency is too low, what should we do?")
In each case only half of the sentence was translated: the first half in one setting and the second half in the other.
Please help me analyze the reason.
Yes, I could try MADLAD, but I want to know why nllb-200-3.3B produces this error.
An additional, unrelated question: I want to buy a server and host it in an overseas data center. Is there a way to arrange this? Of course, I could also just use an Amazon cloud server, but I would prefer to buy and colocate a server directly. Thank you very much for your patient replies.