作り方

まず、Cursorに以下のようなプロンプトを渡しました

引数で渡された動画ファイルをpycall経由でmlx-whisperで文字起こしするRuby スクリプトをつくって

そこから、bundle initしてGemfileをつくったり、ffmpegでのノーマライズ処理のために

streamio-ffmpeg gemでノーマライズする処理をかいて

的なプロンプトを渡すなどしました。

あとはゴリゴリ手で直して調べてを繰り返しました。

現状ここまで

とりあえず動いて文字起こしが行われます。

# Absolute path of the site-packages directory of a Python 3.11 virtualenv
# located in ./myenv next to this script.
path_to_your_venv = File.expand_path('./myenv/lib/python3.11/site-packages', __dir__)
require 'pycall'
# Register that directory with Python's `site` module before any Python
# package is imported — presumably so mlx_whisper installed in the venv is found.
site = PyCall.import_module('site')
site.addsitedir(path_to_your_venv)

# Mix PyCall::Import into the top-level object to get the `pyimport` helper.
require 'pycall/import'
include PyCall::Import

# Makes the Python module available to the script as `mlx_whisper`.
pyimport 'mlx_whisper'

# ffmpeg wrapper (FFMPEG::Movie) and temp files for intermediate segments.
require 'streamio-ffmpeg'
require "tempfile"


# Transcribes a media file in fixed-length chunks.
#
# The source is cut into 10-second segments with ffmpeg, each segment is run
# through ffmpeg's loudnorm filter, and the result is handed to
# mlx_whisper.transcribe. The recognized text of every segment is printed to
# stdout, one line per segment.
#
# NOTE: this top-level definition shadows Kernel#exec inside this script.
#
# @param src_file_path [String] path of the audio/video file to transcribe
# @return [void] an FFMPEG::Error is reported on stderr instead of being raised
def exec(src_file_path)
  # Silence streamio-ffmpeg's own logging except for FATAL messages.
  logger = Logger.new(STDERR)
  logger.level = Logger::FATAL
  FFMPEG.logger = logger

  movie = FFMPEG::Movie.new(src_file_path)
  duration = movie.duration
  segment_duration = 10 # seconds per segment
  extension = File.extname(src_file_path)

  # Integer start offsets 0, 10, 20, ... strictly below the total duration.
  0.step(by: segment_duration).each_with_index do |start_time, index|
    break unless start_time < duration

    Tempfile.open(["segment_#{index}", extension]) do |segment_file|
      # Cut the segment out with stream copy (no re-encoding).
      cut_options = { custom: %W[-c copy -ss #{start_time} -t #{segment_duration}] }
      movie.transcode(segment_file.path, cut_options)

      Tempfile.open(["loudnorm_segment_#{index}", extension]) do |normalized_file|
        # NOTE(review): `video: false` looks like it is not a streamio-ffmpeg
        # encoding option and may be silently ignored; if the video stream should
        # be dropped, `-vn` in `custom` is probably required — confirm.
        loudnorm_options = {
          video: false,
          custom: %W[-af loudnorm=I=-14:TP=-1.5:LRA=11]
        }
        FFMPEG::Movie.new(segment_file.path).transcode(normalized_file.path, loudnorm_options)

        result = mlx_whisper.transcribe(
          normalized_file.path,
          path_or_hf_repo: "mlx-community/whisper-large-v3-turbo",
          language: "ja",
          fp16: true,
          initial_prompt: "日本語として解釈できなかったら音の通りをカタカナで出力してください"
        )
        puts result['text']
      end
    end
  end
rescue FFMPEG::Error => e
  # Keep stdout reserved for the transcript; diagnostics go to stderr.
  $stderr.puts "FFmpeg error: #{e.message}"
end

# Command-line entry point: the media file path is expected as the first argument.
# A missing argument is a usage error, so report it on stderr with a non-zero
# exit status and keep stdout reserved for the transcript.
abort 'Usage: ruby main.rb <audio_file>' if ARGV.empty?

# NOTE(review): a relative path is resolved against this script's directory
# (__dir__), not the caller's working directory — confirm this is intended.
exec File.expand_path(ARGV[0], __dir__)