https://redhologerbera.hatenablog.com/entry/2025/02/02/231438

本日はApple枠です。

今回はiOSでの音声認識の実装です。

前回はタイムアウトの時間を調整しました。

redhologerbera.hatenablog.com

今回は特定のキーワードを認識したときにUnityEventを発火させる処理を実装します。

〇環境

・Windows11PC

・Unity6000.0.32f1

〇音声認識において特定のキーワードを認識する

まずはイベントを定義します。

　ここではキーワード、そしてそのキーワードを認識した際にUnityEventを発火させるようにします。

using UnityEngine;
using UnityEngine.UI;
using System.Linq;
using UnityEngine.Events;
using System.Collections;

/// <summary>
/// Pairs a keyword with the UnityEvent that is invoked when that keyword is recognized.
/// </summary>
[System.Serializable]
public class KeywordAction
{
    public string Keyword;      // Keyword to look for in the recognized text

    // Event fired when the keyword is recognized. Initialized here so that instances
    // created from code (rather than filled in by the Inspector) never hold a null event.
    public UnityEvent Action = new UnityEvent();
}

public class SpeechToTextDemo : MonoBehaviour, ISpeechToTextListener
{
　・・・
}

同時にSpeechToTextDemoクラスにKeywordActions（KeywordActionの配列）を追加します。

public class SpeechToTextDemo : MonoBehaviour, ISpeechToTextListener
{
    public Text SpeechText;
    public Button StartSpeechToTextButton, StopSpeechToTextButton;
    public Slider VoiceLevelSlider;
    public bool PreferOfflineRecognition;
    public KeywordAction[] KeywordActions;  // 追加: キーワードとイベントの配列

これによってイベントが追加されます。

Keywordは認識させたいキーワード、Actionはそのキーワードを認識したときに発火するイベントです。

/// <summary>
/// Demo component that drives a speech recognition session through <c>SpeechToText</c>,
/// mirrors the recognized text into the UI, and invokes the UnityEvent of every configured
/// keyword that appears in that text. After a session is stopped it is started again
/// following a short delay.
/// </summary>
public class SpeechToTextDemo : MonoBehaviour, ISpeechToTextListener
{
    public Text SpeechText;
    public Button StartSpeechToTextButton, StopSpeechToTextButton;
    public Slider VoiceLevelSlider;
    public bool PreferOfflineRecognition;
    public KeywordAction[] KeywordActions;  // Keyword/event pairs to watch for

    // Delay between stopping a session and starting the next one.
    private const float RestartDelaySeconds = 3f;

    private float normalizedVoiceLevel;

    // One flag per entry of KeywordActions: true once that keyword has fired in the current session.
    private bool[] keywordDetectedFlags;

    private bool isRecognitionActive = false;

    // Interval, in seconds, between periodic scans of the displayed text.
    public float checkInterval = 1f;
    private Coroutine checkCoroutine;

    // Handle of the pending delayed restart, or null when none is scheduled.
    private Coroutine restartCoroutine;

    /// <summary>Initializes recognition for "ja-JP", wires the buttons and allocates the keyword flags.</summary>
    private void Awake()
    {
        SpeechToText.Initialize("ja-JP");

        StartSpeechToTextButton.onClick.AddListener(StartSpeechToText);
        StopSpeechToTextButton.onClick.AddListener(StopSpeechToText);

        SyncKeywordFlags();
    }

    /// <summary>Refreshes button interactability and smooths the voice level slider every frame.</summary>
    private void Update()
    {
        StartSpeechToTextButton.interactable = SpeechToText.IsServiceAvailable(PreferOfflineRecognition) && !SpeechToText.IsBusy();
        StopSpeechToTextButton.interactable = SpeechToText.IsBusy();

        // You may also apply some noise to the voice level for a more fluid animation (e.g. via Mathf.PerlinNoise)
        VoiceLevelSlider.value = Mathf.Lerp(VoiceLevelSlider.value, normalizedVoiceLevel, 15f * Time.unscaledDeltaTime);
    }

    /// <summary>Re-initializes recognition with another language and reports a failure in the UI text.</summary>
    /// <param name="preferredLanguage">Language code passed to <c>SpeechToText.Initialize</c>.</param>
    public void ChangeLanguage(string preferredLanguage)
    {
        if (!SpeechToText.Initialize(preferredLanguage))
            SpeechText.text = "Couldn't initialize with language: " + preferredLanguage;
    }

    /// <summary>
    /// Requests permission and, when granted, starts a recognition session together with the
    /// periodic keyword check. Failures are reported through <see cref="SpeechText"/>.
    /// </summary>
    public void StartSpeechToText()
    {
        // A manual start supersedes any delayed restart that is still pending, so the
        // restart does not later call Start on top of the session begun here.
        CancelPendingRestart();

        SpeechToText.RequestPermissionAsync((permission) =>
        {
            if (permission == SpeechToText.Permission.Granted)
            {
                if (SpeechToText.Start(this, preferOfflineRecognition: PreferOfflineRecognition))
                {
                    SpeechText.text = "";
                    isRecognitionActive = true;
                    StartCheckingForKeywords();  // Begin the periodic check
                }
                else
                    SpeechText.text = "Couldn't start speech recognition session!";
            }
            else
                SpeechText.text = "Permission is denied!";
        });
    }

    /// <summary>Force-stops the session, halts the periodic check and schedules a restart.</summary>
    public void StopSpeechToText()
    {
        SpeechToText.ForceStop();
        isRecognitionActive = false;
        StopCheckingForKeywords();
        ScheduleRestart();  // Resume after RestartDelaySeconds
    }

    /// <summary>Waits for the restart delay and then starts a new recognition session.</summary>
    IEnumerator ReStartSpeechToText()
    {
        yield return new WaitForSeconds(RestartDelaySeconds);

        // Clear the handle first so StartSpeechToText does not try to cancel this running coroutine.
        restartCoroutine = null;
        StartSpeechToText();
    }

    /// <summary>Schedules a single delayed restart, replacing any restart that is already pending.</summary>
    private void ScheduleRestart()
    {
        CancelPendingRestart();
        restartCoroutine = StartCoroutine(ReStartSpeechToText());
    }

    /// <summary>Cancels the pending delayed restart, if there is one.</summary>
    private void CancelPendingRestart()
    {
        if (restartCoroutine != null)
        {
            StopCoroutine(restartCoroutine);
            restartCoroutine = null;
        }
    }

    /// <summary>(Re)starts the periodic keyword check, stopping a previous one first.</summary>
    private void StartCheckingForKeywords()
    {
        StopCheckingForKeywords();
        checkCoroutine = StartCoroutine(CheckKeywordsPeriodically());
    }

    /// <summary>Stops the periodic keyword check if it is running.</summary>
    private void StopCheckingForKeywords()
    {
        if (checkCoroutine != null)
        {
            StopCoroutine(checkCoroutine);
            checkCoroutine = null;
        }
    }

    /// <summary>
    /// While recognition is active, scans the displayed text every <see cref="checkInterval"/>
    /// seconds and, once any keyword has been detected, stops the session.
    /// </summary>
    private IEnumerator CheckKeywordsPeriodically()
    {
        while (isRecognitionActive)
        {
            yield return new WaitForSeconds(checkInterval);

            // The session may have ended while waiting; scanning the stale text
            // would fire the same keyword a second time.
            if (!isRecognitionActive)
                yield break;

            // Check whether the displayed text contains a keyword
            TriggerKeywordAction(SpeechText.text, isPartial: false);

            if (keywordDetectedFlags.Any(flag => flag))
            {
                // StopSpeechToText stops this coroutine and schedules the restart itself,
                // so nothing more has to happen here.
                StopSpeechToText();
                yield break;
            }
        }
    }

    /// <summary>Listener callback; only logs.</summary>
    void ISpeechToTextListener.OnReadyForSpeech()
    {
        Debug.Log("OnReadyForSpeech");
    }

    /// <summary>Listener callback; only logs.</summary>
    void ISpeechToTextListener.OnBeginningOfSpeech()
    {
        Debug.Log("OnBeginningOfSpeech");
    }

    /// <summary>Stores the latest voice level; <see cref="Update"/> eases the slider towards it.</summary>
    void ISpeechToTextListener.OnVoiceLevelChanged(float normalizedVoiceLevel)
    {
        this.normalizedVoiceLevel = normalizedVoiceLevel;
    }

    /// <summary>Shows the partial transcript and checks it for keywords right away.</summary>
    void ISpeechToTextListener.OnPartialResultReceived(string spokenText)
    {
        Debug.Log("OnPartialResultReceived: " + spokenText);
        SpeechText.text = spokenText;

        // Check for keywords as soon as a partial result is available
        TriggerKeywordAction(spokenText, isPartial: true);
    }

    /// <summary>
    /// Shows the final transcript, runs a last keyword check, resets the per-session flags
    /// and ends the periodic check. If a keyword was detected in this session and no restart
    /// is pending yet, a restart is scheduled.
    /// </summary>
    void ISpeechToTextListener.OnResultReceived(string spokenText, int? errorCode)
    {
        Debug.Log("OnResultReceived: " + spokenText + (errorCode.HasValue ? (" --- Error: " + errorCode) : ""));
        SpeechText.text = spokenText;
        normalizedVoiceLevel = 0f;

        // Final keyword check for this session
        TriggerKeywordAction(spokenText, isPartial: false);
        bool keywordDetected = keywordDetectedFlags.Any(flag => flag);

        // The session is over: clear the flags and stop scanning, otherwise the periodic
        // check would see the cleared flags and fire again on the text that is still displayed.
        ResetKeywordFlags();
        isRecognitionActive = false;
        StopCheckingForKeywords();

        // Keep the "stop and resume after a keyword" behaviour when the session ended
        // before the periodic check had a chance to call StopSpeechToText.
        if (keywordDetected && restartCoroutine == null)
            ScheduleRestart();
    }

    /// <summary>
    /// Invokes the event of every not-yet-detected keyword contained in <paramref name="spokenText"/>.
    /// </summary>
    /// <param name="spokenText">Recognized text; null or empty text matches nothing.</param>
    /// <param name="isPartial">When true, processing stops after the first keyword that fires.</param>
    private void TriggerKeywordAction(string spokenText, bool isPartial)
    {
        SyncKeywordFlags();

        if (string.IsNullOrEmpty(spokenText))
            return;

        for (int i = 0; i < KeywordActions.Length; i++)
        {
            KeywordAction entry = KeywordActions[i];

            // An empty keyword is contained in every string and a null one makes Contains throw,
            // so unconfigured entries are skipped.
            if (entry == null || string.IsNullOrEmpty(entry.Keyword))
                continue;

            if (keywordDetectedFlags[i] || !spokenText.Contains(entry.Keyword))
                continue;

            Debug.Log($"Keyword detected: {entry.Keyword}");

            // Set the flag before invoking so a listener that re-enters this method cannot fire the same keyword again.
            keywordDetectedFlags[i] = true;
            entry.Action?.Invoke();

            // For partial results, stop after the first keyword that fired
            if (isPartial)
                break;
        }
    }

    /// <summary>Clears all keyword flags so every keyword can fire again in the next session.</summary>
    private void ResetKeywordFlags()
    {
        if (keywordDetectedFlags != null)
            System.Array.Clear(keywordDetectedFlags, 0, keywordDetectedFlags.Length);
    }

    /// <summary>
    /// Makes sure <see cref="KeywordActions"/> is not null and that the flag array has one
    /// entry per keyword, reallocating it when the lengths differ.
    /// </summary>
    private void SyncKeywordFlags()
    {
        if (KeywordActions == null)
            KeywordActions = new KeywordAction[0];

        if (keywordDetectedFlags == null || keywordDetectedFlags.Length != KeywordActions.Length)
            keywordDetectedFlags = new bool[KeywordActions.Length];
    }
}

ここではTriggerKeywordActionメソッドで、認識したテキストにキーワードが含まれていないかを定期的にチェックします。

キーワードを認識した場合、対応するUnityEventをInvokeします。

この際、同じキーワードのイベントが重複して発火しないようにフラグを設定しています。また、部分認識結果では最初に一致したキーワードだけを処理し、ほかのイベントは発火させません。

以上で、特定のキーワードを検知した場合にイベントを発火する実装ができました。