I made a few adjustments… now, after the initial load, it's nearly instant.
I also changed the default device option to “auto” so it automatically chooses the device… this might be part of the reason it's nearly instant. I have also added an option to display the prediction score along with the translation.
import streamlit as st
import sentencepiece as spm
import ctranslate2
import os
import pathlib
# Base directory for the model files; hard-coded for the containerized deployment.
ROOT_DIR = "/app/"
# Alternative for local runs: derive the project root from this file's location.
#os.path.dirname(os.path.abspath(__file__)) # This is your Project Root
def tokenize(text, sp):
    """Split a sentence into subword tokens with a SentencePiece model.

    Args:
        text (str): The sentence to tokenize.
        sp (sentencepiece.SentencePieceProcessor): Processor loaded with the
            source-language model.

    Returns:
        list[str]: The subword tokens of ``text``.
    """
    return sp.encode(text, out_type=str)
def detokenize(text, sp):
    """Join a list of subword tokens back into a plain sentence.

    Args:
        text (list[str]): The sentence's subword tokens.
        sp (sentencepiece.SentencePieceProcessor): Processor loaded with the
            target-language model.

    Returns:
        str: The detokenized sentence.
    """
    return sp.decode(text)
def translate(source, translator, source_sp, target_sp, predict_score=False):
    """Translate one sentence with a CTranslate2 model.

    Args:
        source (str): The source sentence to translate.
        translator (ctranslate2.Translator): The loaded CTranslate2 translator.
        source_sp: SentencePiece processor initialized with the source model.
        target_sp: SentencePiece processor initialized with the target model.
        predict_score (bool): When True, prepend the model's prediction score
            to the translation as ``"<score>|||<translation>"``.

    Returns:
        str: The translation, optionally prefixed with the prediction score.
    """
    source_tokenized = tokenize(source, source_sp)
    # translate_batch takes a batch of token lists; we send a batch of one and
    # read the best hypothesis of the first (only) result.
    results = translator.translate_batch(
        [source_tokenized], return_scores=predict_score
    )
    best = results[0][0]
    translation = detokenize(best["tokens"], target_sp)
    if predict_score:
        # Keep the exact "<score>|||<translation>" wire format used by callers.
        translation = str(best["score"]) + "|||" + translation
    return translation
# ct_model (str): The path to the CTranslate model
# sp_source_model (str): The path to the SentencePiece source model
# sp_target_model (str): The path to the SentencePiece target model
# NOTE: these are placeholder paths — point them at your real model files.
ct_model = os.path.join(ROOT_DIR, "path/to/the/ctranslate/model/directory")
sp_source_model = os.path.join(ROOT_DIR, "path/to/the/sentencepiece/source/model/sourcefile.model")
# NOTE(review): the target-model path goes through a "source" directory — confirm this is intended.
sp_target_model = os.path.join(ROOT_DIR, "path/to/the/sentencepiece/source/model/targetfile.model")
# Initialize the tokenizer / translator objects once at startup.
# To handle multiple models, add logic here to select which of the 3 objects below to build.
source_sp = spm.SentencePieceProcessor(sp_source_model)
target_sp = spm.SentencePieceProcessor(sp_target_model)
# device="auto" picks "cpu" or "cuda" (GPU) automatically; set it explicitly to force either one.
# return_scores (passed per call via predict_score) returns the model's prediction score when true.
translator = ctranslate2.Translator(ct_model, device='auto')
# Title for the page and nice icon
st.set_page_config(page_title="NMT", page_icon="🤖") #Ω
# Header
st.title("Translator")
# Form to add your items
with st.form("my_form"):
    # Textarea to type the source text.
    user_input = st.text_area("Source Text", max_chars=200)
    # Create a button
    submitted = st.form_submit_button("Translate")
    # If the button is pressed, translate and print the translation.
    # Previously the model ran on every rerun — including the initial empty
    # input — before the submit check; translating only after submission
    # avoids that wasted work.
    # Here, we use "st.info", but you can try "st.write", "st.code", or "st.success".
    if submitted:
        translation = translate(
            user_input, translator, source_sp, target_sp, predict_score=True
        )
        st.write("Translation")
        st.info(translation)
# Optional Style
# Source: https://towardsdatascience.com/5-ways-to-customise-your-streamlit-ui-e914e458a17c
# Remove all padding around the main content block.
padding = 0
st.markdown(f""" <style>
.reportview-container .main .block-container{{
padding-top: {padding}rem;
padding-right: {padding}rem;
padding-left: {padding}rem;
padding-bottom: {padding}rem;
}} </style> """, unsafe_allow_html=True)
# Hide Streamlit's default hamburger menu and footer.
st.markdown(""" <style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style> """, unsafe_allow_html=True)