import os
import sys

# Set the cache location and token *before* importing transformers, otherwise
# the library resolves its cache path from the defaults at import time.
# (Newer transformers releases prefer HF_HOME over TRANSFORMERS_CACHE.)
os.environ['TRANSFORMERS_CACHE'] = '/var/www/.cache/huggingface/hub/'
# Supply the token via the environment; never hard-code secrets in source.
os.environ.setdefault('HF_TOKEN', '<your-hf-token>')

import requests
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor


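# Load the 11B vision-instruct checkpoint in bfloat16; device_map="auto"
# shards it across available devices and requires the accelerate package.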
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
model = MllamaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")
processor = AutoProcessor.from_pretrained(model_id)

if len(sys.argv) != 3:
    sys.exit(f"usage: {sys.argv[0]} <image-url> <prompt>")

url = sys.argv[1]
inp_text_arg = sys.argv[2]

# Stream the image from the URL and fail loudly on HTTP errors.
response = requests.get(url, stream=True)
response.raise_for_status()
image = Image.open(response.raw)

# Single-turn chat: an image slot followed by the user's text prompt,
# rendered through the model's chat template.
messages = [{
    "role": "user",
    "content": [
        {"type": "image"},
        {"type": "text", "text": inp_text_arg},
    ],
}]
input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(image, input_text, add_special_tokens=False, return_tensors="pt").to(model.device)

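# Generate up to 200 new tokens with the model's default generation settings.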
output = model.generate(**inputs, max_new_tokens=200)
# Decode only the newly generated tokens, skipping the echoed prompt.
print(processor.decode(output[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True))
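
# Example invocation (hypothetical script name and URL):
#   python llama_vision_describe.py https://example.com/cat.png "Describe this image."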
