NLLB-200 with CTranslate2

This is my index.html:
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>翻译应用</title>
    <link rel="stylesheet" href="{{ url_for('static', filename='css/style.css') }}">
</head>
<body>
    <div class="container">
        <div class="source">
            <select id="source-lang">
                <!-- Source language options go here -->
                <option value="en_Latn">English</option>
                <option value="zho_Hans">中文简体</option>
                <option value="kin_Latn">Kinyarwanda</option>
                <!-- More language options -->
            </select>
            <textarea id="source-text" placeholder="请输入原文..."></textarea>
        </div>
        <div class="translate-button">
            <button id="translate-btn">翻译</button>
        </div>
        <div class="target">
            <select id="target-lang">
                <!-- Target language options go here -->
                <option value="en_Latn">English</option>
                <option value="zho_Hans">中文简体</option>
                <option value="kin_Latn">Kinyarwanda</option>
                <!-- More language options -->
            </select>
            <textarea id="target-text" placeholder="译文将显示在这里..." readonly></textarea>
        </div>
    </div>
    <script src="{{ url_for('static', filename='js/script.js') }}"></script>
</body>
</html>
This is my view.py:
from flask import Blueprint, render_template, request, jsonify
import ctranslate2
import sentencepiece as spm
from .config import ct_model_path, sp_model_path, device


main = Blueprint('main', __name__)

# Load the SentencePiece and CTranslate2 models
sp = spm.SentencePieceProcessor()
sp.load(sp_model_path)
translator = ctranslate2.Translator(ct_model_path, device)

@main.route('/')
def index():
    return render_template('index.html')

@main.route('/translate', methods=['POST'])
def translate_text():
    data = request.get_json()
    source_text = data['source_text']
    src_lang = data['src_lang']
    tgt_lang = data['tgt_lang']
    # Translation logic
    # Handle multi-line input: split it into lines and translate each line
    source_sentences = source_text.split('\n')
    source_sentences = [sent.strip() for sent in source_sentences]

    # Subword the source sentences with SentencePiece
    source_sents_subworded = sp.encode_as_pieces(source_sentences)
    source_sents_subworded = [[src_lang] + sent + ["</s>"] for sent in source_sents_subworded]

    # Translate, using the target language code as the prefix
    target_prefix = [[tgt_lang]] * len(source_sentences)
    translations_subworded = translator.translate_batch(source_sents_subworded, batch_type="tokens", max_batch_size=2024, beam_size=4, target_prefix=target_prefix)
    translations_subworded = [translation.hypotheses[0] for translation in translations_subworded]
    # Remove the target language token that is returned as part of each hypothesis
    for translation in translations_subworded:
        if tgt_lang in translation:
            translation.remove(tgt_lang)

    # Desubword
    translated_texts = sp.decode(translations_subworded)
    translated_text = '\n'.join(translated_texts)
    print(translated_text)
    # translated_text = translated_text.replace('⁇', '').strip()

    return jsonify({'translation': translated_text})

This is my script.js:

document.getElementById('translate-btn').addEventListener('click', function() {
    const sourceLang = document.getElementById('source-lang').value;
    const targetLang = document.getElementById('target-lang').value;
    const sourceText = document.getElementById('source-text').value;

    // Send an AJAX request to the Flask backend for translation,
    // then show the result in the 'target-text' textarea
    fetch('/translate', {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json'
        },
        body: JSON.stringify({
            source_text: sourceText,
            src_lang: sourceLang,
            tgt_lang: targetLang
        })
    })
    .then(response => response.json())
    .then(data => {
        document.getElementById('target-text').value = data.translation;
    })
    .catch(error => console.error('Error:', error));
});

This is my run.py:

from app import create_app

app = create_app()

if __name__ == "__main__":
    app.run(host='0.0.0.0', port=5000)
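
For reference, the app package that run.py and view.py import from is not shown here; a minimal sketch of app/config.py and app/__init__.py consistent with those imports might look like this (paths copied from the lite version below, so adjust them to your setup):

# app/config.py
ct_model_path = "/root/models/nllb-200-3.3B-int8"
sp_model_path = "/root/models/flores200_sacrebleu_tokenizer_spm.model"
device = "cuda"  # or "cpu"

# app/__init__.py
from flask import Flask

def create_app():
    app = Flask(__name__)
    # Register the blueprint defined in view.py
    from .view import main
    app.register_blueprint(main)
    return app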

You can also see my lite version:

import sys
import ctranslate2
import sentencepiece as spm


# [Modify] Set paths to the CTranslate2 and SentencePiece models
ct_model_path = "/root/models/nllb-200-3.3B-int8"
sp_model_path = "/root/models/flores200_sacrebleu_tokenizer_spm.model"
device = "cuda"  # or "cpu"
# Load the source SentencePiece model
sp = spm.SentencePieceProcessor()
sp.load(sp_model_path)
translator = ctranslate2.Translator(ct_model_path, device)


def translation_function(source_sents, src_lang, tgt_lang):
  beam_size = 4
  source_sentences = [sent.strip() for sent in source_sents]
  target_prefix = [[tgt_lang]] * len(source_sentences)
  # Subword the source sentences
  source_sents_subworded = sp.encode_as_pieces(source_sentences)
  source_sents_subworded = [[src_lang] + sent + ["</s>"] for sent in source_sents_subworded]
  print("First subworded source sentence:", source_sents_subworded[0], sep="\n")
  # Translate the source sentences (reuse the translator loaded above)
  translations_subworded = translator.translate_batch(source_sents_subworded, batch_type="tokens", max_batch_size=2024, beam_size=beam_size, target_prefix=target_prefix)
  translations_subworded = [translation.hypotheses[0] for translation in translations_subworded]
  for translation in translations_subworded:
    if tgt_lang in translation:
      translation.remove(tgt_lang)
  # Desubword the target sentences
  translations = sp.decode(translations_subworded)
  for src_sent, tgt_sent in zip(source_sentences, translations):
    print(src_sent, tgt_sent, sep="\n• ")
    print()


if __name__ == '__main__':
  source_sents = [input('input text: ')]
  src_lang = input('input src_lang: ')
  tgt_lang = input('input tgt_lang: ')
  if not source_sents[0] or not src_lang or not tgt_lang:
    source_sents = [
      "制作简历相关",
      "大家好",
      "你是谁"
    ]
    # src_lang = "kin_Latn"
    src_lang = "zho_Hans"
    tgt_lang = "en_Latn"
  translation_function(source_sents, src_lang, tgt_lang)
input text: 制作简历相关
input src_lang: zho_Hans
input tgt_lang: en_Latn
First subworded source sentence:
['zho_Hans', '▁', '制作', '简', '历', '相关', '</s>']
制作简历相关
•  ⁇ ️ ⁇ ️ ⁇ ️ ⁇ 

The English language code is not correct. Try this and it should work:

tgt_lang = "eng_Latn"

Consider also trying the MADLAD model.

All the best,
Yasmin

Thanks, I now see what was wrong, and it works very well, thanks.
Are there any language detection models I can use, such as langdetect or langid?
Also, does this project or model support terminology intervention and corpus intervention?

You can use the language detection library of your choice. See examples here:

After this, you can map the output to the language codes supported by NLLB, which can be found in the metrics file.
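
For example, a minimal sketch of that mapping step, assuming langid is installed and a small hand-maintained dictionary from its ISO 639-1 output to NLLB-200 codes (the entries below are illustrative, not exhaustive):

import langid

# Illustrative mapping from ISO 639-1 codes (as returned by langid) to NLLB-200 codes.
# Extend it by hand for the languages your application supports.
ISO639_1_TO_NLLB = {
    "en": "eng_Latn",
    "zh": "zho_Hans",
    "fr": "fra_Latn",
    "rw": "kin_Latn",
}

def detect_nllb_code(text, default="eng_Latn"):
    # langid.classify returns a (language code, score) tuple
    iso_code, _score = langid.classify(text)
    return ISO639_1_TO_NLLB.get(iso_code, default)

print(detect_nllb_code("大家好"))  # expected to print zho_Hans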

No built-in features for this. You would have to build and evaluate it yourself (example).

All the best,
Yasmin

It seems the original repository includes a language identification model for fastText, available here under the CC-BY-NC licence, but consider this issue. As you can see in these examples, the original fastText models (lid.176.bin and lid.176.ftz) output a more accurate language ID than lid218e.bin. Moreover, the original fastText models have a different licence.

• lid218e.bin

import fasttext
fasttext_model = fasttext.load_model('lid218e.bin')
fasttext_model.predict("大家好", k=3)

(('__label__yue_Hant', '__label__bod_Tibt', '__label__eng_Latn'),
array([0.30566457, 0.28086483, 0.16853374]))

• lid.176.bin

import fasttext
fasttext_model = fasttext.load_model('lid.176.bin')
fasttext_model.predict("大家好", k=3)

(('__label__zh', '__label__wuu', '__label__en'),
array([9.98486638e-01, 7.41246971e-04, 4.91706945e-04]))

• lid.176.ftz

import fasttext
fasttext_model = fasttext.load_model('lid.176.ftz')
fasttext_model.predict("大家好", k=3)

(('__label__zh', '__label__ja', '__label__wuu'),
array([0.73023683, 0.25853705, 0.00466318]))
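
One convenience of lid218e.bin is that its labels already follow the NLLB-200 code convention, so you only need to strip the __label__ prefix; a minimal sketch, assuming the model file is available locally:

import fasttext

# lid218e.bin labels look like "__label__zho_Hans"; strip the prefix to get the NLLB-200 code.
lid_model = fasttext.load_model("lid218e.bin")
labels, probs = lid_model.predict("大家好", k=1)
nllb_code = labels[0].replace("__label__", "")
print(nllb_code, probs[0])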

import java.util.Locale;

public enum Language {
    // Example language: English
    ENGLISH(Locale.ENGLISH, "en", "eng", "英语", "English", "English"),
    // Example language: Simplified Chinese
    CHINESE_SIMPLIFIED(Locale.SIMPLIFIED_CHINESE, "zh-CN", "zho", "汉语(简体)", "Chinese (Simplified)", "普通话"),
    // Example language: French (France)
    FRENCH(Locale.FRENCH, "fr", "fra", "法语", "French", "Français"),
    // Example language: German (Germany)
    GERMAN(Locale.GERMAN, "de", "deu", "德语", "German", "Deutsch"),
    // Example language: Japanese
    JAPANESE(Locale.JAPANESE, "ja", "jpn", "日语", "Japanese", "日本語");

    private final Locale locale;
    private final String iso639_1;
    private final String iso639_3;
    private final String chineseName;
    private final String englishName;
    private final String localName;

    Language(Locale locale, String iso639_1, String iso639_3, String chineseName, String englishName, String localName) {
        this.locale = locale;
        this.iso639_1 = iso639_1;
        this.iso639_3 = iso639_3;
        this.chineseName = chineseName;
        this.englishName = englishName;
        this.localName = localName;
    }

    public Locale getLocale() {
        return locale;
    }

    public String getIso639_1() {
        return iso639_1;
    }

    public String getIso639_3() {
        return iso639_3;
    }

    public String getChineseName() {
        return chineseName;
    }

    public String getEnglishName() {
        return englishName;
    }

    public String getLocalName() {
        return localName;
    }
}

  1. I want to know about the 205 language codes of NLLB-200, such as
    ace_Arab Acehnese (Arabic script)
    ace_Latn Acehnese (Latin script)
    What standard do they use?
    Normal locale codes usually consist of a language code (ISO 639-1) and a country/region code (ISO 3166-1 alpha-2).
  2. Do I need to make a mapping table from NLLB-200 language codes to standard language codes?

You need to read the original paper, but yes, the codes are not plain locale codes: each one combines an ISO 639-3 language code with an ISO 15924 script code.
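
For example, a small hand-maintained mapping table could look like the sketch below (the entries and the zho_Hant -> zh-TW choice are illustrative assumptions, not an official mapping):

# NLLB-200 codes combine an ISO 639-3 language code and an ISO 15924 script code,
# e.g. "zho_Hans" = Chinese (ISO 639-3 "zho") written in Simplified Han (ISO 15924 "Hans").
NLLB_TO_LOCALE = {
    "eng_Latn": "en",
    "zho_Hans": "zh-CN",
    "zho_Hant": "zh-TW",
    "fra_Latn": "fr",
    "deu_Latn": "de",
    "jpn_Jpan": "ja",
}

def split_nllb_code(code):
    # Split an NLLB-200 code into its (ISO 639-3, ISO 15924) parts
    lang, script = code.split("_")
    return lang, script

print(split_nllb_code("ace_Arab"))  # ('ace', 'Arab')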

import sys
import ctranslate2
import sentencepiece as spm


# [Modify] Set paths to the CTranslate2 and SentencePiece models
ct_model_path = "/root/models/nllb-200-3.3B-int8"
sp_model_path = "/root/models/flores200_sacrebleu_tokenizer_spm.model"
device = "cuda"  # or "cpu"
# Load the source SentencePiece model
sp = spm.SentencePieceProcessor()
sp.load(sp_model_path)
translator = ctranslate2.Translator(ct_model_path, device)


def translation_function(source_sents, src_lang, tgt_lang):
  beam_size = 4
  source_sentences = [sent.strip() for sent in source_sents]
  target_prefix = [[tgt_lang]] * len(source_sentences)
  # Subword the source sentences
  source_sents_subworded = sp.encode_as_pieces(source_sentences)
  source_sents_subworded = [[src_lang] + sent + ["</s>"] for sent in source_sents_subworded]
  print(source_sents_subworded)
  # print("First subworded source sentence:", source_sents_subworded[0], sep="\n")
  # Translate the source sentences (reuse the translator loaded above)
  translations_subworded = translator.translate_batch(source_sents_subworded, batch_type="tokens", max_batch_size=2024, beam_size=beam_size, target_prefix=target_prefix)
  print(translations_subworded)
  translations_subworded = [translation.hypotheses[0] for translation in translations_subworded]
  for translation in translations_subworded:
    if tgt_lang in translation:
      translation.remove(tgt_lang)
  # Desubword the target sentences
  translations = sp.decode(translations_subworded)
  for src_sent, tgt_sent in zip(source_sentences, translations):
    print(src_sent)
    print(tgt_sent)


if __name__ == '__main__':
  source_sents = [input('input text: ')]
  src_lang = input('input src_lang: ')
  tgt_lang = input('input tgt_lang: ')
  if not source_sents[0] or not src_lang or not tgt_lang:
    source_sents = [
      "In the Big Model era, big models should be used for translation, but the translation efficiency is too low, what to do?",
      # "大家好",
      # "你是谁"
    ]
    # src_lang = "kin_Latn"
    src_lang = "eng_Latn"
    tgt_lang = "zho_Hans"
  translation_function(source_sents, src_lang, tgt_lang)

My problem:
With the target language set to Simplified Chinese (tgt_lang = "zho_Hans"), the translation result is: 在大模型时代, 大模型应该用于翻译,
With the target language changed to Traditional Chinese (tgt_lang = "zho_Hant"), the translation result is: 但翻譯效率太低了, 我們該怎麼辦?
In each case only the first half or the second half of the sentence was translated.
Please help me analyze the reason.

Maybe try MADLAD. I got this translation:

在大模型时代,应该用大模型来翻译,但是翻译效率太低,怎么办?
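
If you want to try MADLAD-400 with the same CTranslate2 stack, here is a rough sketch; it assumes the model has been converted with ct2-transformers-converter (e.g. from google/madlad400-3b-mt), that the Hugging Face tokenizer is used, and that the "<2zh>" target-language prefix convention applies, so double-check against the model card:

import ctranslate2
import transformers

# Assumed path to a converted MADLAD-400 model
ct_model_path = "/root/models/madlad400-3b-mt-ct2"
device = "cuda"  # or "cpu"

translator = ctranslate2.Translator(ct_model_path, device=device)
tokenizer = transformers.AutoTokenizer.from_pretrained("google/madlad400-3b-mt")

source_text = "In the Big Model era, big models should be used for translation, but the translation efficiency is too low, what to do?"
# MADLAD-400 expects the target-language token (e.g. "<2zh>") prepended to the source text.
source_tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode("<2zh> " + source_text))

results = translator.translate_batch([source_tokens], beam_size=4)
target_tokens = results[0].hypotheses[0]
print(tokenizer.decode(tokenizer.convert_tokens_to_ids(target_tokens)))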


Yes, I could try MADLAD, but I want to know why nllb-200-3.3b gets this wrong.

An additional, unrelated question: I want to buy a server and host it in an overseas data center. Is there any way to arrange that? Of course, I could also just use an Amazon cloud server, but I would prefer to buy server hosting directly. Thank you very much for your patient reply.

NLLB is not good at Chinese