I have an encoder module that I converted to CTranslate2 with the following precisions:
- fp32
- fp16
- bf16
- int8
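
For context, the conversion was done roughly like this (reconstructed here with the Python converter API rather than the exact commands I ran; fp32 is the default when no quantization is given, and the output directory names match the script below):

```python
import ctranslate2.converters

# Rough reconstruction of the conversion step (not the exact invocation used).
converter = ctranslate2.converters.TransformersConverter("BAAI/bge-m3")
converter.convert("bge_m3_ctranslate")                                # fp32 (default)
converter.convert("bge_m3_fp16_ctranslate", quantization="float16")
converter.convert("bge_m3_bf16_ctranslate", quantization="bfloat16")
converter.convert("bge_m3_int8_ctranslate", quantization="int8")
```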
Then I run the following code:
```python
import ctranslate2
import numpy as np
import torch
import torch.nn.functional
import transformers
from sentence_transformers import SentenceTransformer


def ctranslate_infer(model_path, tokenizer_name):
    device = "cuda"
    encoder = ctranslate2.Encoder(model_path, device=device)
    tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_name)

    inputs = ["It was good!"]
    tokens = tokenizer(inputs).input_ids
    output = encoder.forward_batch(tokens)

    # Wrap the CTranslate2 StorageView in a torch tensor, then take the
    # first token ([CLS]) of the first sequence as the sentence embedding.
    embeddings = torch.as_tensor(output.last_hidden_state, device=device)
    embeddings = embeddings[0][0]
    print(embeddings.shape)

    # L2-normalize to match what SentenceTransformer produces for this model.
    embeddings = torch.nn.functional.normalize(embeddings.view(1, -1), p=2.0)
    print(embeddings)


def st_infer(model_path):
    inputs = ["It was good!"]
    st_model = SentenceTransformer(model_path)
    print(st_model)
    print(st_model.encode(inputs))


if __name__ == "__main__":
    ctranslate_infer("bge_m3_ctranslate", "BAAI/bge-m3")
    print("-----------------------------------------------------------------------------")
    # st_infer("BAAI/bge-m3")
    ctranslate_infer("bge_m3_fp16_ctranslate", "BAAI/bge-m3")
    print("-----------------------------------------------------------------------------")
    ctranslate_infer("bge_m3_bf16_ctranslate", "BAAI/bge-m3")
    print("-----------------------------------------------------------------------------")
    ctranslate_infer("bge_m3_int8_ctranslate", "BAAI/bge-m3")
    print("-----------------------------------------------------------------------------")
```
The fp32 and fp16 models run fine, but when I run inference with the bf16 model I get the following error:
```
warnings.warn("Can't initialize NVML")
torch.Size([1024])
tensor([[ 0.0102,  0.0150, -0.0657,  ..., -0.0160, -0.0231,  0.0077]],
       device='cuda:0')
-----------------------------------------------------------------------------
torch.Size([1024])
tensor([[ 0.0102,  0.0151, -0.0657,  ..., -0.0160, -0.0232,  0.0077]],
       device='cuda:0', dtype=torch.float16)
-----------------------------------------------------------------------------
Traceback (most recent call last):
  File "/data/rkoy/vectorizerstats/ctranslate/infer.py", line 43, in <module>
    ctranslate_infer("bge_m3_bf16_ctranslate", "BAAI/bge-m3")
  File "/data/rkoy/vectorizerstats/ctranslate/infer.py", line 22, in ctranslate_infer
    embeddings = torch.as_tensor(output.last_hidden_state, device=device)
RuntimeError: Could not infer dtype of ctranslate2._ext.StorageView
```
Environment:
- CTranslate2 version: 4.6.0
- PyTorch version: 2.7.1
- GPU: L40S
My guess is that `torch.as_tensor()` reads the StorageView through `__cuda_array_interface__`, whose typestr format has no encoding for bfloat16, so PyTorch cannot infer the dtype. Am I missing something, or is this a bug in the library?
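
In the meantime, the only workaround I can think of is to cast the output to a supported dtype on the CTranslate2 side before handing it to torch. Below is an untested sketch with a hypothetical helper, `storage_view_to_tensor`; it assumes `StorageView.to()` and `ctranslate2.DataType` behave as the API reference describes:

```python
import ctranslate2
import torch

def storage_view_to_tensor(view: ctranslate2.StorageView, device: str = "cuda") -> torch.Tensor:
    # Hypothetical workaround, untested: bfloat16 has no typestr in
    # __cuda_array_interface__, so cast to float32 inside CTranslate2 first
    # (assuming StorageView.to() works as documented), then let
    # torch.as_tensor() wrap the result as usual.
    if view.dtype == ctranslate2.DataType.bfloat16:
        view = view.to(ctranslate2.DataType.float32)
    return torch.as_tensor(view, device=device)
```

i.e. replacing the `torch.as_tensor(output.last_hidden_state, device=device)` line in `ctranslate_infer` with `storage_view_to_tensor(output.last_hidden_state)`. Is something like this the intended way to handle bf16 encoder outputs, or should `torch.as_tensor()` work on them directly?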