# khang119966's picture
# Update app.py
# commit 029d2da (verified)
import gradio as gr
import torch
import spaces
from threading import Thread
from transformers import TextIteratorStreamer
# Module-level model state shared across requests: only one model is kept
# resident on the GPU at a time; it is (re)loaded lazily inside process_chat
# when the user picks a different checkpoint from the dropdown.
current_model_name = None  # name of the currently loaded checkpoint, or None before first load
model = None               # the loaded vision-language model (unsloth FastVisionModel)
tokenizer = None           # processor/tokenizer paired with `model`

# Checkpoints selectable in the UI dropdown, ordered by size.
MODELS = [
    "Qwen/Qwen3.5-0.8B",
    "Qwen/Qwen3.5-2B",
    "Qwen/Qwen3.5-4B",
    "Qwen/Qwen3.5-9B",
]
@spaces.GPU(duration=120)  # HF Spaces: run on GPU, hard limit of 120 s per call
def process_chat(image, prompt_text, model_name, history):
    """Generator that streams an assistant reply about `image` into the chat.

    Args:
        image: uploaded image (PIL image from gr.Image(type="pil")), or None.
        prompt_text: user's question; falls back to a default caption prompt
            when blank.
        model_name: checkpoint id selected in the dropdown (one of MODELS).
        history: current chat history as a list of {"role", "content"} dicts.

    Yields:
        Updated history lists — first transient status messages (model
        loading / missing image), then the growing assistant reply as
        tokens stream in.
    """
    global current_model_name, model, tokenizer
    # Lazily (re)load the model only when the dropdown selection changed.
    if current_model_name != model_name:
        # Imported here so the heavy unsloth import happens inside the GPU worker.
        from unsloth import FastVisionModel
        # Transient status bubble; it is not kept in the returned history.
        yield history + [{"role": "assistant", "content": f"⏳ Loading model `{model_name}`... please wait."}]
        # Free the previous model before loading the new one to avoid
        # holding two checkpoints in GPU memory at once.
        if model is not None:
            del model, tokenizer
            torch.cuda.empty_cache()
        model, tokenizer = FastVisionModel.from_pretrained(
            model_name,
            load_in_4bit=False,
            use_gradient_checkpointing="unsloth",
        )
        FastVisionModel.for_inference(model)  # switch to inference mode
        current_model_name = model_name
    # An image is required; bail out with a hint instead of crashing.
    if image is None:
        yield history + [{"role": "assistant", "content": "⚠️ Please upload an image first!"}]
        return
    # Empty prompt -> generic captioning request.
    if not prompt_text.strip():
        prompt_text = "Describe this image in detail."
    # Single-turn message in the multimodal chat format expected by the
    # processor (image placeholder + text).
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": prompt_text}
        ]}
    ]
    input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
    inputs = tokenizer(
        image,
        input_text,
        add_special_tokens=False,
        return_tensors="pt",
    ).to("cuda")
    # Stream decoded tokens as they are generated; generation runs in a
    # background thread so this generator can yield partial output.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=1024,
        use_cache=True,
        temperature=1.5,  # NOTE(review): unusually high temperature — presumably intentional for this model family
        min_p=0.1,
    )
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    # Append the user turn plus an empty assistant turn, then grow the
    # assistant message as text streams in.
    history = history + [
        {"role": "user", "content": prompt_text},
        {"role": "assistant", "content": ""},
    ]
    for new_text in streamer:
        history[-1]["content"] += new_text
        yield history
# UI layout: image/model controls on the left, streaming chat on the right.
with gr.Blocks(title="Qwen3.5 Vision Chat") as demo:
    gr.Markdown(
        """
# 🦁 Qwen3.5 Vision Chat
A simple demo to chat with **Qwen3.5 Vision models** using an image.
### 📌 How to use
1️⃣ Upload an image (or paste from clipboard).
2️⃣ Select the model size you want.
3️⃣ Ask a question about the image.
4️⃣ Click **Send 🚀** or press **Enter**.
❤️ If this demo is useful, please consider giving it a **like / heart**.
"""
    )
    with gr.Row():
        with gr.Column(scale=1):
            image_input = gr.Image(type="pil", label="🖼️ Upload Image", sources=["upload", "clipboard"])
            model_dropdown = gr.Dropdown(
                choices=MODELS,
                value="Qwen/Qwen3.5-2B",
                label="⚙️ Select Model"
            )
            clear_btn = gr.Button("🗑️ Clear History", variant="stop")
        with gr.Column(scale=2):
            # FIX: process_chat yields openai-style {"role", "content"} dicts,
            # which requires the "messages" chatbot format — the default
            # tuple format would raise/mis-render in current Gradio.
            chatbot = gr.Chatbot(label="💬 Chat", height=500, type="messages")
            with gr.Row():
                text_input = gr.Textbox(
                    show_label=False,
                    placeholder="Ask something about the image...",
                    scale=8
                )
                submit_btn = gr.Button("Send 🚀", variant="primary", scale=1)

    # Send on button click, then clear the textbox.
    submit_btn.click(
        fn=process_chat,
        inputs=[image_input, text_input, model_dropdown, chatbot],
        outputs=[chatbot]
    ).then(lambda: "", None, text_input)
    # Same behavior when pressing Enter in the textbox.
    text_input.submit(
        fn=process_chat,
        inputs=[image_input, text_input, model_dropdown, chatbot],
        outputs=[chatbot]
    ).then(lambda: "", None, text_input)
    # Reset the conversation (the loaded model is kept).
    clear_btn.click(lambda: [], None, chatbot)

if __name__ == "__main__":
    # Queue requests so concurrent users share the single GPU worker fairly.
    demo.queue(max_size=20).launch()