Hi.
I am trying to optimize GPT-2 inference on CPU. With CTranslate2 and int8 quantization, generation takes 0.9 s for 110 tokens, but the same prompt with the plain transformers model takes only 0.6 s for 110 tokens, so the quantized CTranslate2 model is actually slower. Am I missing something?
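For reference, the model was converted to CTranslate2 format with int8 quantization beforehand, roughly like the sketch below (done here with the Python converter API; the ct2-transformers-converter CLI does the same thing, and the output directory name is just an example and should be whatever path the Generator loads):

from ctranslate2.converters import TransformersConverter

# Convert the Hugging Face "gpt2" checkpoint to CTranslate2 format with int8 weights.
# "gpt2_ct2" is an example output directory; it must match the path passed to
# ctranslate2.Generator in the script below.
converter = TransformersConverter("gpt2")
converter.convert("gpt2_ct2", quantization="int8")

Here is the benchmark script: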
import time

import ctranslate2
from transformers import AutoModelForCausalLM, AutoTokenizer
device = "cpu"
def infer(model, inp):
    # Tokenize the prompt and generate with the plain transformers model on CPU.
    enc = tokenizer(inp, return_tensors="pt")
    input_ids = enc["input_ids"].to(device)
    attention_mask = enc["attention_mask"].to(device)
    output = model.generate(input_ids, attention_mask=attention_mask, max_length=300)
    return tokenizer.decode(output[0])
model = AutoModelForCausalLM.from_pretrained("gpt2")
# "gpt2" here is the local directory holding the CTranslate2 model converted with int8 quantization.
generator = ctranslate2.Generator("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
text = "<startofstring> [instruction]: generate next response in [dialogue] and check the correctness of the User's answer to the Bot . \n[context]: Meet at an guitar club. \n[dialogue]: \nBot: why do you want to learn playing guitar? \nUser: I want to become a artist.\n[Bot Response]:"
# Time CTranslate2 generation (int8 model), three runs.
for i in range(3):
    start = time.time()
    encoded = tokenizer.encode(text)
    start_tokens = tokenizer.convert_ids_to_tokens(encoded)
    results = generator.generate_batch([start_tokens], max_length=300)
    print(len(results[0].sequences_ids[0]))
    print(tokenizer.decode(results[0].sequences_ids[0]))
    end = time.time()
    print(end - start)
# Time transformers generation for the same prompt, three runs.
for i in range(3):
    start = time.time()
    output = infer(model, text)
    print(output)
    end = time.time()
    print(end - start)