pip install transformers torch accelerate
from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
import torch
from torch.profiler import profile, ProfilerActivity
from tqdm import tqdm
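# set_seed fixes the RNG state (Python, NumPy, torch) so sampling is reproducible across runs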
set_seed(42)
model_id = "microsoft/Phi-3-mini-4k-instruct"
prompt = "Tell a story about a superhero."
maxlen = 128
device = "cuda"
tokenizer = AutoTokenizer.from_pretrained(model_id)  # the tokenizer stays on CPU; it doesn't support .to(device)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device,
    torch_dtype=torch.bfloat16,
    attn_implementation="eager",
)
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
# warm up: the first generate calls pay one-time costs (CUDA context setup, kernel caches)
# that would otherwise skew the profiled run
for _ in tqdm(range(2), desc="- warming up ..."):
    model.generate(input_ids, max_length=maxlen, num_return_sequences=1)
# profile a single generation pass, recording CPU and CUDA activity plus input shapes
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True) as prof:
    output_ids = model.generate(input_ids, max_length=maxlen, num_return_sequences=1)
generated_texts = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
print(generated_texts)
print(prof.key_averages(group_by_input_shape=True).table(sort_by="cpu_time_total", row_limit=10))
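# A possible extension beyond the original script: torch.profiler can also export
# a Chrome trace for a timeline view; open the file in chrome://tracing or
# https://ui.perfetto.dev. The filename "trace.json" is an arbitrary choice here.
prof.export_chrome_trace("trace.json")
# Sorting by "cuda_time_total" instead surfaces the heaviest GPU kernels.
print(prof.key_averages(group_by_input_shape=True).table(sort_by="cuda_time_total", row_limit=10))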
print("end.")
Sample output