はじめに
Voxtral Mini 4B Realtime 2602は日本語を含む13言語をサポートするリアルタイム文字起こしモデルです。
実際に使ってみました。
vLLMを使う必要があるのでWSL2(Ubuntu 25.10)を使いました。
WSL2上のvLLMでモデルを動かし、それをWindowsから操作しました。
環境構築
サーバー側(WSL2)
あらかじめ CUDA Toolkit 13.0 Update 2 をインストールしています。
uvを使っています。
uv venv --python 3.13 --seed
source .venv/bin/activate
pip install https://github.com/vllm-project/vllm/releases/download/v0.16.0/vllm-0.16.0+cu130-cp38-abi3-manylinux_2_35_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu130
pip install soxr librosa soundfile
サーバーの実行
VLLM_DISABLE_COMPILE_CACHE=1 vllm serve mistralai/Voxtral-Mini-4B-Realtime-2602 --enforce-eager # option: --max-model-len 2048
Windows側から確認
curl http://localhost:8000/v1/models
文字起こしの実行
音声ファイルの文字起こし
実行画面

ホスト側の環境構築
uvを使っています。pyproject.tomlを載せておくので uv sync のみで環境構築可能です。
[project]
name = "test"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
    "gradio==6.5.1",
    "librosa==0.11.0",
    "numpy==2.3.5",
    "websockets==16.0",
]
Pythonコード
"""Audio transcription using vLLM Realtime API.

Usage:
    1. Start vLLM server:
       vllm serve mistralai/Voxtral-Mini-4B-Realtime-2602 --enforce-eager --max-model-len 4096
    2. Run this script: uv run main.py
    3. Upload audio file and click Transcribe
"""
import asyncio
import base64
import json

import gradio as gr
import librosa
import numpy as np
import websockets


def audio_to_pcm16_base64(audio_path: str, add_silence: float = 1.5) -> str:
    """Convert an audio file to base64-encoded PCM16 @ 16 kHz mono.

    Args:
        audio_path: Path to any audio file librosa can decode.
        add_silence: Seconds of trailing silence appended so the model
            flushes the last words of the transcription.

    Returns:
        Base64 string of 16-bit PCM samples resampled to 16 kHz.
    """
    audio, _ = librosa.load(audio_path, sr=16000, mono=True)
    # Add silence at the end to ensure last words are transcribed
    if add_silence > 0:
        silence = np.zeros(int(16000 * add_silence), dtype=audio.dtype)
        audio = np.concatenate([audio, silence])
    pcm16 = (audio * 32767).astype(np.int16)
    return base64.b64encode(pcm16.tobytes()).decode("utf-8")


async def realtime_transcribe(audio_path: str,
                              host: str = "localhost",
                              port: int = 8000,
                              model: str = "mistralai/Voxtral-Mini-4B-Realtime-2602"):
    """Connect to the Realtime API and transcribe an audio file.

    Async generator: yields the growing partial transcription after every
    `transcription.delta` event, then a final block containing the server's
    full text and usage info.
    """
    uri = f"ws://{host}:{port}/v1/realtime"
    # BUG FIX: define `transcription` BEFORE the try block.  Previously it
    # was first assigned inside the `async with` body, so a connection drop
    # during the handshake or while sending audio made the
    # ConnectionClosedError handler raise NameError instead of reporting.
    transcription = ""
    try:
        async with websockets.connect(uri) as ws:
            # Wait for session.created
            await ws.recv()
            # Update session
            await ws.send(json.dumps({
                "type": "session.update",
                "model": model,
                "max_tokens": 8192,
                "temperature": 0.0,
            }))
            # Start audio buffer
            await ws.send(json.dumps({"type": "input_audio_buffer.commit"}))
            # Convert and send audio in small chunks
            audio_base64 = audio_to_pcm16_base64(audio_path, add_silence=1.5)
            audio_bytes = base64.b64decode(audio_base64)
            chunk_size = 4096
            for i in range(0, len(audio_bytes), chunk_size):
                await ws.send(json.dumps({
                    "type": "input_audio_buffer.append",
                    "audio": base64.b64encode(audio_bytes[i:i + chunk_size]).decode("utf-8"),
                }))
            # Signal end of audio
            await ws.send(json.dumps({"type": "input_audio_buffer.commit", "final": True}))
            # Receive transcription events until done/error
            while True:
                response = json.loads(await ws.recv())
                if response["type"] == "transcription.delta":
                    transcription += response["delta"]
                    yield transcription
                elif response["type"] == "transcription.done":
                    yield f"{transcription}\n\n=== Final ===\n{response['text']}\n\nUsage: {response.get('usage', {})}"
                    break
                elif response["type"] == "error":
                    yield f"Error: {response['error']}"
                    break
    except websockets.exceptions.ConnectionClosedError:
        yield f"{transcription}\n\n(Connection closed)"
    except Exception as e:
        yield f"Error: {str(e)}"


def transcribe_audio(audio_file):
    """Gradio handler: drive the async generator from synchronous code.

    Creates a private event loop so it can be called from Gradio's worker
    thread, and re-yields each partial transcription for live UI updates.
    """
    if audio_file is None:
        yield "Please upload an audio file first."
        return
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        async_gen = realtime_transcribe(audio_file)
        while True:
            try:
                yield loop.run_until_complete(async_gen.__anext__())
            except StopAsyncIteration:
                break
    finally:
        loop.close()


def create_gradio_app():
    """Create the Gradio interface (audio upload -> streaming textbox)."""
    with gr.Blocks(title="Audio Transcription") as app:
        gr.Markdown("# Audio Transcription")
        gr.Markdown("Upload an audio file to transcribe using vLLM Realtime API")
        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(type="filepath", label="Audio File")
                transcribe_btn = gr.Button("Transcribe", variant="primary")
            with gr.Column():
                output_text = gr.Textbox(label="Output", lines=15, max_lines=20)
        transcribe_btn.click(transcribe_audio, audio_input, output_text)
    return app


if __name__ == "__main__":
    app = create_gradio_app()
    app.launch()
リアルタイム文字起こし
実行画面
PySide6を使ってGUIを構築しました。

ホスト側の環境構築
uvを使っています。pyproject.tomlを載せておくので uv sync のみで環境構築可能です。
[project]
name = "voxtral"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
    "librosa==0.11.0",
    "numpy==2.3.5",
    "pyside6==6.10.2",
    "websockets==16.0",
]
Pythonコード
import sys
import json
import asyncio
import base64
import queue
import argparse
import traceback

import websockets
from PySide6.QtWidgets import (QApplication, QMainWindow, QPushButton,
                               QVBoxLayout, QWidget, QTextEdit, QLabel,
                               QHBoxLayout)
from PySide6.QtCore import Signal, Slot, QThread
from PySide6.QtGui import QTextCursor

SAMPLE_RATE = 16_000


class WebSocketWorker(QThread):
    """Background thread that streams queued audio chunks to the Realtime
    API over a websocket and relays transcription deltas back via signals."""

    delta_received = Signal(str)
    status_changed = Signal(str)
    error_occurred = Signal(str)
    finished_signal = Signal()

    def __init__(self, ws_url, model_name, audio_queue):
        super().__init__()
        self.ws_url = ws_url
        self.model_name = model_name
        self.audio_queue = audio_queue
        self.is_running = True
        self.loop = None

    def stop(self):
        """Ask both streaming coroutines to wind down."""
        self.is_running = False

    def run(self):
        # Each QThread needs its own asyncio event loop.
        self.loop = asyncio.new_event_loop()
        asyncio.set_event_loop(self.loop)
        try:
            self.loop.run_until_complete(self.websocket_handler())
        except Exception as e:
            print(f"WS Worker Error: {e}")
            traceback.print_exc()
            self.error_occurred.emit(str(e))
        finally:
            self.finished_signal.emit()

    async def websocket_handler(self):
        """Open the websocket, then run sender and receiver concurrently."""
        try:
            async with websockets.connect(self.ws_url) as ws:
                # 1. Wait for session.created (Match main.py)
                await ws.recv()
                # 2. Update session (Match main.py)
                await ws.send(json.dumps({"type": "session.update",
                                          "model": self.model_name}))
                # 3. Initial commit (Match main.py)
                await ws.send(json.dumps({"type": "input_audio_buffer.commit"}))
                self.status_changed.emit("状態: 録音中(接続済み)")

                async def send_audio():
                    # Forward base64 chunks from the thread-safe queue.
                    while self.is_running:
                        try:
                            # run_in_executor keeps the blocking get() from
                            # stalling the event loop.
                            chunk = await self.loop.run_in_executor(
                                None, lambda: self.audio_queue.get(timeout=0.1)
                            )
                            await ws.send(json.dumps(
                                {"type": "input_audio_buffer.append",
                                 "audio": chunk}
                            ))
                        except queue.Empty:
                            continue
                        except Exception:
                            break

                async def receive_transcription():
                    try:
                        async for message in ws:
                            if not self.is_running:
                                break
                            data = json.loads(message)
                            if not isinstance(data, dict):
                                continue
                            msg_type = data.get("type")
                            # Handle both delta event spellings.
                            if msg_type in ["transcription.delta",
                                            "response.audio_transcription.delta"]:
                                delta = data.get("delta")
                                if delta:
                                    self.delta_received.emit(delta)
                            elif msg_type == "error":
                                payload = data.get("error")
                                if isinstance(payload, dict):
                                    msg = payload.get("message", str(payload))
                                else:
                                    msg = str(payload)
                                self.error_occurred.emit(msg)
                    except websockets.exceptions.ConnectionClosedOK:
                        pass
                    except Exception:
                        traceback.print_exc()

                await asyncio.gather(send_audio(), receive_transcription())
        except Exception as e:
            self.error_occurred.emit(str(e))
            traceback.print_exc()


class AudioTranscriptionApp(QMainWindow):
    """Main window: microphone capture plus live transcription display."""

    def __init__(self, ws_url, model_name):
        super().__init__()
        self.ws_url = ws_url
        self.model_name = model_name
        self.transcription_text = ""
        self.is_running = False
        self.has_error = False
        self.audio_queue = queue.Queue()

        self.setWindowTitle("Voxtral Real-time Transcription")
        self.setGeometry(100, 100, 700, 500)

        # ---- UI layout -------------------------------------------------
        central_widget = QWidget()
        self.setCentralWidget(central_widget)
        layout = QVBoxLayout(central_widget)

        self.status_label = QLabel("状態: 待機中")
        self.status_label.setStyleSheet("font-weight: bold; color: blue;")
        layout.addWidget(self.status_label)

        self.text_edit = QTextEdit()
        self.text_edit.setReadOnly(True)
        self.text_edit.setPlaceholderText("文字起こし結果がここに表示されます...")
        # Scale the font up 1.2x, whichever sizing mode the widget uses.
        font = self.text_edit.font()
        if font.pointSize() > 0:
            font.setPointSize(int(font.pointSize() * 1.2))
        elif font.pixelSize() > 0:
            font.setPixelSize(int(font.pixelSize() * 1.2))
        else:
            font.setPointSizeF(font.pointSizeF() * 1.2)
        self.text_edit.setFont(font)
        layout.addWidget(self.text_edit)

        self.toggle_button = QPushButton("スタート")
        self.toggle_button.setFixedHeight(60)
        self.toggle_button.clicked.connect(self.toggle_transcription)
        layout.addWidget(self.toggle_button)

        # ---- Audio capture ---------------------------------------------
        self.audio_source = None
        self.audio_device = None
        self.setup_audio_system()

        # Websocket worker is created on demand in start_transcription().
        self.worker = None

    def setup_audio_system(self):
        from PySide6.QtMultimedia import QAudioSource, QAudioFormat, QMediaDevices
        audio_format = QAudioFormat()
        audio_format.setSampleRate(SAMPLE_RATE)
        audio_format.setChannelCount(1)
        audio_format.setSampleFormat(QAudioFormat.Int16)
        device = QMediaDevices.defaultAudioInput()
        if device.isNull():
            self.status_label.setText("エラー: オーディオ入力デバイスが見つかりません。")
            self.toggle_button.setEnabled(False)
            return
        self.audio_source = QAudioSource(device, audio_format)

    def toggle_transcription(self):
        if self.is_running:
            self.stop_transcription()
        else:
            self.start_transcription()

    def start_transcription(self):
        if self.is_running:
            return
        # Update the UI immediately, before any blocking work.
        self.is_running = True
        self.has_error = False
        self.toggle_button.setText("ストップ")
        self.status_label.setText("状態: 接続中...")
        self.status_label.setStyleSheet("font-weight: bold; color: orange;")
        if self.transcription_text:
            self.transcription_text += "\n---\n"
            self.text_edit.append("\n---\n")
        # Discard chunks left over from a previous session.
        while not self.audio_queue.empty():
            try:
                self.audio_queue.get_nowait()
            except queue.Empty:
                break
        # Begin microphone capture.
        self.audio_device = self.audio_source.start()
        self.audio_device.readyRead.connect(self.read_audio_data)
        # Launch the websocket worker thread.
        self.worker = WebSocketWorker(self.ws_url, self.model_name, self.audio_queue)
        self.worker.delta_received.connect(self.on_delta_received)
        self.worker.status_changed.connect(self.on_status_changed)
        self.worker.error_occurred.connect(self.on_error)
        self.worker.finished_signal.connect(self.on_worker_finished)
        self.worker.start()

    def stop_transcription(self):
        if not self.is_running:
            return
        self.is_running = False
        if self.audio_source:
            self.audio_source.stop()
        if self.worker:
            self.worker.stop()
        # Reset the UI right away so an immediate restart is possible.
        self.toggle_button.setText("スタート")
        self.status_label.setText("状態: 待機中")
        self.status_label.setStyleSheet("font-weight: bold; color: blue;")

    @Slot()
    def read_audio_data(self):
        if not self.is_running or not self.audio_device:
            return
        # QAudioSource already delivers raw PCM16 per the configured format.
        raw = self.audio_device.readAll().data()
        if not raw:
            return
        # 16 kHz mono Int16 already — just base64-encode and enqueue.
        self.audio_queue.put(base64.b64encode(raw).decode("utf-8"))

    def on_delta_received(self, delta):
        self.transcription_text += delta
        caret = self.text_edit.textCursor()
        caret.movePosition(QTextCursor.MoveOperation.End)
        caret.insertText(delta)
        self.text_edit.setTextCursor(caret)
        self.text_edit.ensureCursorVisible()

    def on_status_changed(self, status):
        self.status_label.setText(status)
        if "録音中" in status:
            self.status_label.setStyleSheet("font-weight: bold; color: green;")

    def on_error(self, message):
        self.has_error = True
        self.status_label.setText(f"エラー: {message}")
        self.status_label.setStyleSheet("font-weight: bold; color: red;")
        self.stop_transcription()

    def on_worker_finished(self):
        self.toggle_button.setText("スタート")
        # Keep the error message on screen if one was shown.
        if self.has_error:
            return
        if not self.is_running:
            self.status_label.setText("状態: 待機中")
            self.status_label.setStyleSheet("font-weight: bold; color: blue;")
        else:
            self.is_running = False
            self.status_label.setText("状態: 切断されました")
            self.status_label.setStyleSheet("font-weight: bold; color: red;")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Voxtral PySide6 App")
    parser.add_argument("--model", type=str,
                        default="mistralai/Voxtral-Mini-4B-Realtime-2602")
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=8000)
    args = parser.parse_args()

    app_url = f"ws://{args.host}:{args.port}/v1/realtime"
    app = QApplication(sys.argv)
    window = AudioTranscriptionApp(app_url, args.model)
    window.show()
    sys.exit(app.exec())
補足
自動的にvLLMのサーバーを起動する方法①
サービスの作成(systemd)
WSL内で以下のコマンドを実行し、設定ファイルを作成します。
sudo nano /etc/systemd/system/vllm-voxtral.service
中身はこのようにします。
[Unit]
Description=vLLM Voxtral Server
After=network.target nvidia-persistenced.service
[Service]
Type=simple
User=hoge
# 作業ディレクトリ(ご自身のプロジェクトフォルダに合わせてください)
WorkingDirectory=/home/hoge/voxtral
# 環境変数の設定
Environment="VLLM_DISABLE_COMPILE_CACHE=1"
# PATHには仮想環境のbinを必ず含める
Environment="PATH=/home/hoge/voxtral/.venv/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin"
# 起動コマンド
# python -m vllm.entrypoints.openai.api_server は vllm serve と同等です
# vllm というコマンド自体も .venv/bin/vllm にあるので、そちらを直接叩いてもOKです
ExecStart=/home/hoge/voxtral/.venv/bin/vllm serve mistralai/Voxtral-Mini-4B-Realtime-2602 \
--enforce-eager \
--max-model-len 2048
Restart=always
RestartSec=10
[Install]
WantedBy=multi-user.target
サービスを有効化して再起動する
# 自動起動の有効化
sudo systemctl enable vllm-voxtral
この後、再起動します。
確認方法
sudo journalctl -u vllm-voxtral -f
止める方法
一時的
sudo systemctl stop vllm-voxtral
永久
sudo systemctl disable vllm-voxtral
自動的にvLLMのサーバーを起動する方法②
~/.bashrcに以下を書き込みます。
if ! pgrep -f "vllm serve" > /dev/null; then
echo "Starting vLLM server..."
# ログを vllm.log に書き込み、画面には出さないようにする
/home/hoge/voxtral/.venv/bin/vllm serve mistralai/Voxtral-Mini-4B-Realtime-2602 --enforce-eager > ~/vllm.log 2>&1 &
echo "vLLM is running in background. Check logs with: tail -f ~/vllm.log"
fi
ログの確認方法はこちら。
tail -f ~/vllm.log
これなら Ctrl+C でログの表示を止めても、サーバー自体は止まりません。