pip install transformers torch accelerate
from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
import torch
from torch.profiler import profile, ProfilerActivity
from tqdm import tqdm
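# set_seed fixes the RNG state (Python, NumPy, torch) so sampling is reproducible across runs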
set_seed(42)
model_id = "microsoft/Phi-3-mini-4k-instruct"
prompt = "Tell a story about a superhero."
maxlen = 128
device = "cuda"
tokenizer = AutoTokenizer.from_pretrained(model_id)  # the tokenizer stays on CPU; it doesn't support .to(device)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device,
    torch_dtype=torch.bfloat16,
    attn_implementation="eager",
)
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
# warm up: the first generate calls pay one-time costs (CUDA context setup, kernel caches)
# that would otherwise skew the profiled run
for _ in tqdm(range(2), desc="- warming up ..."):
    model.generate(input_ids, max_length=maxlen, num_return_sequences=1)
# profile a single generation pass, recording CPU and CUDA activity plus input shapes
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True) as prof:
    output_ids = model.generate(input_ids, max_length=maxlen, num_return_sequences=1)
generated_texts = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
print(generated_texts)
print(prof.key_averages(group_by_input_shape=True).table(sort_by="cpu_time_total", row_limit=10))
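# A possible extension beyond the original script: torch.profiler can also export
# a Chrome trace for a timeline view; open the file in chrome://tracing or
# https://ui.perfetto.dev. The filename "trace.json" is an arbitrary choice here.
prof.export_chrome_trace("trace.json")
# Sorting by "cuda_time_total" instead surfaces the heaviest GPU kernels.
print(prof.key_averages(group_by_input_shape=True).table(sort_by="cuda_time_total", row_limit=10))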
print("end.")
Sample output