from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,  # bfloat16 has better CPU kernel support than float16
    # device_map="auto",         # uncomment to place layers on available GPUs automatically
    device_map={"": "cpu"},      # keep the whole model on the CPU
)

# Query
query = "What is the capital of France?"

# Build the chat-formatted prompt expected by the Instruct model, then generate
messages = [{"role": "user", "content": query}]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=50)

# Decode only the newly generated tokens (skip the prompt) and print
response = tokenizer.decode(
    outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True
)
print(response)
