Hello,
I converted Falcon-7B with CTranslate2:
ct2-transformers-converter --model tiiuae/falcon-7b-instruct --quantization float16 --output_dir falcon-7b-instruct --trust_remote_code
Now, I am trying to serve the model with OpenNMT-py server with the following config:
{"models_root": "./available_models",
"models": [
{
"id": 100,
"model": "falcon-7b-instruct",
"ct2_model": "falcon-7b-instruct",
"timeout": 600,
"device": "cuda",
"on_timeout": "to_cpu",
"load": true,
"inter_threads": 4,
"opt": {
"gpu": 0,
"beam_size": 2,
"batch_size": 4096
}
}
]
}
Here is the error I get when starting the server:
Traceback (most recent call last):
File "/models/OpenNMT-py/server.py", line 6, in <module>
main()
File "/models/OpenNMT-py/onmt/bin/server.py", line 157, in main
start(
File "/models/OpenNMT-py/onmt/bin/server.py", line 33, in start
translation_server.start(config_file)
File "/models/OpenNMT-py/onmt/translate/translation_server.py", line 260, in start
self.preload_model(opt, model_id=model_id, **kwargs)
File "/models/OpenNMT-py/onmt/translate/translation_server.py", line 298, in preload_model
model = ServerModel(opt, model_id, **model_kwargs)
File "/models/OpenNMT-py/onmt/translate/translation_server.py", line 443, in __init__
self.load(preload=True)
File "/models/OpenNMT-py/onmt/translate/translation_server.py", line 514, in load
raise ServerModelError("Runtime Error: %s" % str(e))
onmt.translate.translation_server.ServerModelError: Runtime Error: This model cannot be used as a sequence-to-sequence model