https://touch-sp.hatenablog.com/entry/2025/12/12/220344

はじめに

Shisa.AIは日本の会社です。

Shisa-v2.1が日英バイリンガルモデルという事で翻訳アプリを作ってみました。

画面

Pythonスクリプト

from threading import Thread

import gradio as gr
from transformers import (
    AutoModelForCausalLM,
    AutoProcessor,
    TextIteratorStreamer,
)

# Hub checkpoint id, named once so the model and the tokenizer are always
# loaded from the same repository.
model_id = "shisa-ai/shisa-v2.1-qwen3-8b"

# Load the causal LM in bfloat16 on the CUDA device.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    dtype="bfloat16",
    device_map="cuda",
    # Delete the following line when flash-attn is not installed.
    attn_implementation="flash_attention_2",
)

# The object returned here is used as the tokenizer by translate_text
# (chat templating and streamed decoding).
tokenizer = AutoProcessor.from_pretrained(model_id)


def translate_text(text, direction):
    """Translate ``text`` and stream the result (both directions supported).

    This is a generator: every yielded value is the whole translation
    produced so far, so the caller can overwrite its output on each update.

    Args:
        text: Source text to translate.
        direction: "英語→日本語" selects English-to-Japanese; any other value
            is handled as Japanese-to-English ("日本語→英語").

    Yields:
        The accumulated translation text. Blank input yields a single
        empty string.
    """
    # Because this function contains ``yield`` it is a generator, and a bare
    # ``return ""`` would finish the iteration without producing any value.
    # Yield the empty string explicitly so blank input gives an empty result.
    if not text or not text.strip():
        yield ""
        return

    # Choose the system and user prompts that match the translation direction.
    if direction == "英語→日本語":
        system_content = "あなたは英語から日本語への翻訳に特化した優秀なAIアシスタントです。回答は翻訳文にとどめ、日本語に翻訳するだけです。"
        user_content = f"次の英語を日本語に翻訳して下さい。\n{text}"
    else:  # "日本語→英語"
        system_content = "あなたは日本語から英語への翻訳に特化した優秀なAIアシスタントです。回答は翻訳文にとどめ、英語に翻訳するだけです。"
        user_content = f"次の日本語を英語に翻訳して下さい。\n{text}"

    prompt = [
        {"role": "system", "content": system_content},
        {"role": "user", "content": user_content},
    ]

    # Render the chat template straight to tensors on the model's device.
    inputs = tokenizer.apply_chat_template(
        prompt,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    # skip_prompt=True keeps the echoed prompt out of the streamed text.
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    generation_kwargs = dict(**inputs, max_new_tokens=1024, streamer=streamer)

    # Run generation in a worker thread so this generator can consume the
    # streamer while tokens are still being produced.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    result = ""
    for new_text in streamer:
        result += new_text
        yield result

    # The streamer is exhausted, so generation is finishing; wait for the
    # worker so no thread is left dangling after the request.
    thread.join()


# Build the Gradio interface.
# Input widgets: the text to translate and the translation direction.
source_box = gr.Textbox(lines=10, max_lines=40, label="翻訳元テキスト")
direction_radio = gr.Radio(
    choices=["英語→日本語", "日本語→英語"],
    value="英語→日本語",
    label="翻訳方向",
)

# Output widget: shows the translation and offers a copy button.
result_box = gr.Textbox(
    lines=10, max_lines=40, buttons=["copy"], label="翻訳結果"
)

demo = gr.Interface(
    fn=translate_text,
    inputs=[source_box, direction_radio],
    outputs=result_box,
    title="shisa-v2.1-qwen3-8b 翻訳アプリ",
    description="テキストを入力し、翻訳方向を選択してください。英語⇔日本語の双方向翻訳が可能です。",
)

# Launch the application only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()

環境構築

pyproject.tomlを載せておきます。（ライブラリのバージョンはあえて固定しています）

uvを使うとuv syncだけで環境構築できると思います。

# Project definition for the translation app; library versions are pinned
# deliberately.
[project]
name = "shisa"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
    "accelerate==1.12.0",
    "einops==0.8.1",
    "flash-attn",
    "gradio==6.1.0",
    "hf-xet==1.2.0",
    "torch==2.9.1+cu126",
    "transformers==4.57.3",
]

# Additional package index that hosts the CUDA 12.6 (cu126) PyTorch wheels.
[[tool.uv.index]]
name = "torch-cuda"
url = "https://download.pytorch.org/whl/cu126"
explicit = true

[tool.uv.sources]
# torch is taken from the torch-cuda index declared above.
torch = [{ index = "torch-cuda" }]
# flash-attn is taken from a locally built wheel file (optional dependency).
flash-attn = { path = "flash_attn-2.8.3+cu126torch2.9.1cxx11abiTRUE-cp313-cp313-win_amd64.whl" }

flash-attnはこちらを参考にビルドしました。必須ではありません。

flash-attnがなければPythonスクリプトから以下の１行を削除して下さい。

attn_implementation="flash_attention_2"

2026年1月30日追記

Transformersライブラリ v5.0が公開されました。

PyTorchも2.10.0が公開されています。

その環境で動かしてみました。スクリプトとpyproject.tomlを載せておきます。

from threading import Thread
import gradio as gr
from transformers import (
    AutoModelForCausalLM,
    AutoProcessor, # v5でもAutoProcessorを使用
    TextIteratorStreamer,
)

# Hub checkpoint id shared by the model and the processor.
model_id = "shisa-ai/shisa-v2.1-qwen3-8b"

# Loading options: bfloat16 weights on the CUDA device with FlashAttention 2.
load_options = dict(
    dtype="bfloat16",
    device_map="cuda",
    attn_implementation="flash_attention_2",
)

model = AutoModelForCausalLM.from_pretrained(model_id, **load_options)

# NOTE(review): the author states that v5 unifies processor/tokenizer
# serialization - confirm against the Transformers v5 release notes.
processor = AutoProcessor.from_pretrained(model_id)

def translate_text(text, direction):
    """Translate ``text`` and stream the result (both directions supported).

    This is a generator: every yielded value is the whole translation
    produced so far, so the caller can overwrite its output on each update.

    Args:
        text: Source text to translate.
        direction: "英語→日本語" selects English-to-Japanese; any other value
            is handled as Japanese-to-English.

    Yields:
        The accumulated translation text. Blank input yields a single
        empty string.
    """
    # Because this function contains ``yield`` it is a generator, and a bare
    # ``return ""`` would finish the iteration without producing any value.
    # Yield the empty string explicitly so blank input gives an empty result.
    if not text or not text.strip():
        yield ""
        return

    if direction == "英語→日本語":
        system_content = "あなたは英語から日本語への翻訳に特化した優秀なAIアシスタントです。回答は翻訳文にとどめ、日本語に翻訳するだけです。"
        user_content = f"次の英語を日本語に翻訳して下さい。\n{text}"
    else:
        system_content = "あなたは日本語から英語への翻訳に特化した優秀なAIアシスタントです。回答は翻訳文にとどめ、英語に翻訳するだけです。"
        user_content = f"次の日本語を英語に翻訳して下さい。\n{text}"

    prompt = [
        {"role": "system", "content": system_content},
        {"role": "user", "content": user_content},
    ]

    # return_dict=True is passed explicitly so the result can be unpacked
    # with ``**inputs`` below regardless of the library's default.
    # NOTE(review): the author reports v5 already returns a BatchEncoding
    # by default - confirm against the Transformers v5 documentation.
    inputs = processor.apply_chat_template(
        prompt,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    # The streamer decodes with the processor itself (or its tokenizer);
    # skip_prompt=True keeps the echoed prompt out of the streamed text.
    streamer = TextIteratorStreamer(
        processor, skip_prompt=True, skip_special_tokens=True
    )

    # NOTE(review): the author notes that v5 changed the default KV-cache
    # behavior to follow the model definition and that this code is not
    # affected - confirm if generation settings are changed.
    generation_kwargs = dict(**inputs, max_new_tokens=1024, streamer=streamer)

    # Run generation in a worker thread so this generator can consume the
    # streamer while tokens are still being produced.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    result = ""
    for new_text in streamer:
        result += new_text
        yield result

    # The streamer is exhausted, so generation is finishing; wait for the
    # worker so no thread is left dangling after the request.
    thread.join()

# Gradio interface (unchanged from the earlier version of the script).
# Input widgets: the text to translate and the translation direction.
input_components = [
    gr.Textbox(lines=10, max_lines=40, label="翻訳元テキスト"),
    gr.Radio(
        choices=["英語→日本語", "日本語→英語"],
        value="英語→日本語",
        label="翻訳方向",
    ),
]

# Output widget: shows the translation and offers a copy button.
output_component = gr.Textbox(
    lines=10, max_lines=40, buttons=["copy"], label="翻訳結果"
)

demo = gr.Interface(
    fn=translate_text,
    inputs=input_components,
    outputs=output_component,
    title="shisa-v2.1-qwen3-8b 翻訳アプリ (v5対応)",
)

# Launch the application only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()

# Project definition for the Transformers v5 / PyTorch 2.10 environment.
[project]
name = "shisa"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.14"
dependencies = [
    "accelerate==1.12.0",
    "einops==0.8.2",
    "flash-attn",
    "gradio==6.5.1",
    "hf-xet==1.2.0",
    "torch==2.10.0+cu126",
    "transformers==5.0.0",
]

# Additional package index that hosts the CUDA 12.6 (cu126) PyTorch wheels.
[[tool.uv.index]]
name = "torch-cuda"
url = "https://download.pytorch.org/whl/cu126"
explicit = true

[tool.uv.sources]
# torch is taken from the torch-cuda index declared above.
torch = [{ index = "torch-cuda" }]
# flash-attn is taken from a locally built wheel file.
# NOTE(review): this file name has "-" before "cu126", whereas the wheel for
# the torch 2.9.1 environment used "+"; confirm it matches the real file name.
flash-attn = { path = "flash_attn-2.8.3-cu126torch2.10.0cxx11abiTRUE-cp314-cp314-win_amd64.whl" }

flash attentionはこちらの方法でビルドしています。

ランキング参加中

プログラミング