実行画面

日本語もそこそこいけるようです。
Pythonスクリプト
from threading import Thread

import gradio as gr
from transformers import (
    AutoModelForImageTextToText,
    AutoProcessor,
    TextIteratorStreamer,
)

# Load the PaddleOCR-VL vision-language model on the GPU in bfloat16.
model = AutoModelForImageTextToText.from_pretrained(
    "PaddlePaddle/PaddleOCR-VL",
    dtype="bfloat16",
    device_map="cuda",
)
processor = AutoProcessor.from_pretrained("PaddlePaddle/PaddleOCR-VL")


def ocr_image(image):
    """Run OCR on an uploaded image, yielding the recognized text incrementally.

    Args:
        image: Filesystem path to the image (from gr.Image with type="filepath").

    Yields:
        str: The text recognized so far, with the "</s>" end-of-sequence
        marker stripped.
    """
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "url": image},
                {"type": "text", "text": "OCR:"},
            ],
        }
    ]
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    # Stream decoded text as it is generated; keep special tokens so the
    # "</s>" terminator can be stripped explicitly below.
    streamer = TextIteratorStreamer(
        processor.tokenizer, skip_prompt=True, skip_special_tokens=False
    )
    # BUGFIX: max_new_tokens=100 truncated the OCR result for anything beyond
    # a few lines of text; 1024 leaves room for a full page.
    generation_kwargs = dict(**inputs, max_new_tokens=1024, streamer=streamer)

    # generate() blocks until completion, so run it on a worker thread and
    # consume the streamer on this (generator) side.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield partial_text.replace("</s>", "").strip()


with gr.Blocks() as demo:
    gr.Interface(
        fn=ocr_image,
        inputs=gr.Image(type="filepath", label="画像をアップロード"),
        outputs=gr.Textbox(lines=10, max_lines=40, buttons=["copy"]),
        flagging_mode="never",
    )

demo.launch()
Python環境構築
pyproject.tomlを載せておきます（バージョンはあえて固定しています）。uvを使うと、uv syncだけで環境構築できると思います。
[project]
name = "paddleocr"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
    "accelerate==1.12.0",
    "einops==0.8.1",
    "gradio==6.1.0",
    "hf-xet==1.2.0",
    "torch==2.9.1+cu126",
    # transformers is tracked from git main so the PaddleOCR-VL model class is available
    "transformers @ git+https://github.com/huggingface/transformers",
]

# Resolve torch from the CUDA 12.6 wheel index instead of PyPI.
[[tool.uv.index]]
name = "torch-cuda"
url = "https://download.pytorch.org/whl/cu126"
explicit = true

[tool.uv.sources]
torch = [{ index = "torch-cuda" }]
補足
Flash Attention 2も使えました。from threading import Thread import gradio as gr from transformers import ( AutoModelForImageTextToText, AutoProcessor, TextIteratorStreamer, ) model = AutoModelForImageTextToText.from_pretrained( "PaddlePaddle/PaddleOCR-VL", dtype="bfloat16", device_map="cuda", attn_implementation="flash_attention_2", ) processor = AutoProcessor.from_pretrained("PaddlePaddle/PaddleOCR-VL") def ocr_image(image): messages = [ { "role": "user", "content": [ {"type": "image", "url": image}, {"type": "text", "text": "OCR:"}, ], } ] inputs = processor.apply_chat_template( messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt", ).to(model.device) streamer = TextIteratorStreamer( processor.tokenizer, skip_prompt=True, skip_special_tokens=False ) generation_kwargs = dict(**inputs, max_new_tokens=1024, streamer=streamer) thread = Thread(target=model.generate, kwargs=generation_kwargs) thread.start() partial_text = "" for new_text in streamer: partial_text += new_text clean_text = partial_text.replace("</s>", "").strip() yield clean_text with gr.Blocks() as demo: gr.Interface( fn=ocr_image, inputs=gr.Image(type="filepath", label="画像をアップロード"), outputs=gr.Textbox(lines=10, max_lines=40, buttons=["copy"]), flagging_mode="never", ) demo.launch()