Source Code: src/gaia/audio/
Components: AudioClient, WhisperAsr, KokoroTTS
Module: gaia.audio
Import: from gaia.audio import AudioClient, WhisperAsr, KokoroTTS

Overview

The audio subsystem provides complete voice interaction capabilities, including Automatic Speech Recognition (ASR), Text-to-Speech (TTS), and voice chat orchestration. AudioClient coordinates ASR and TTS with LLM generation, while WhisperAsr handles voice-to-text transcription and KokoroTTS provides high-quality speech synthesis.

Key Features:
  • Voice chat with streaming TTS
  • Whisper-based ASR with Voice Activity Detection (VAD)
  • High-quality Kokoro TTS with multiple voices (17+ available)
  • Streaming audio processing with real-time playback
  • Interrupt handling (keyboard and voice)
  • GPU acceleration support
  • Thread-safe operation

Requirements

AudioClient

  • Voice chat session management
  • ASR/TTS coordination (pause recording during TTS; see the sketch after this list)
  • LLM integration for conversation
  • Interrupt and halt handling
  • Queue-based transcription delivery
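
The pause-during-TTS coordination can be pictured with a small sketch. This is hypothetical glue code, not part of the API: it wires the KokoroTTS status_callback (documented below) to WhisperAsr's pause/resume methods so the microphone does not transcribe the assistant's own speech.

# Illustrative glue only: pause the recorder while TTS is speaking.
def make_speaking_callback(asr):
    def on_speaking(is_speaking: bool):
        if is_speaking:
            asr.pause_recording()   # TTS playback started
        else:
            asr.resume_recording()  # playback finished
    return on_speaking

# Pass the result as status_callback to KokoroTTS.generate_speech_streaming.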

WhisperAsr

  • Real-time speech recognition using Whisper
  • Configurable model sizes (tiny, base, small, medium, large)
  • Voice Activity Detection with silence thresholds (illustrated after this list)
  • Pause/resume recording capability
  • GPU acceleration (CUDA support)
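
How the silence threshold and minimum audio length interact can be illustrated with a simplified energy-based VAD loop. This is illustrative only; the actual WhisperAsr implementation may differ.

import numpy as np

def is_silence(chunk: np.ndarray, silence_threshold: float = 0.01) -> bool:
    # A chunk counts as silence when its RMS energy is below the threshold.
    return float(np.sqrt(np.mean(chunk ** 2))) < silence_threshold

def should_transcribe(buffer: np.ndarray, min_audio_length: int = 16000) -> bool:
    # Only hand audio to Whisper once enough samples have accumulated
    # (16000 samples is ~1 second at Whisper's 16 kHz sample rate).
    return len(buffer) >= min_audio_length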

KokoroTTS

  • High-quality speech synthesis
  • Multiple voice options (American/British, male/female)
  • Streaming audio generation
  • Real-time playback
  • Interrupt event handling

API Specification

AudioClient

from gaia.audio import AudioClient

class AudioClient:
    """Handles all audio-related functionality including TTS, ASR, and voice chat."""

    def __init__(
        self,
        whisper_model_size="base",
        audio_device_index=None,
        silence_threshold=0.5,
        enable_tts=True,
        logging_level="INFO",
        use_claude=False,
        use_chatgpt=False,
        system_prompt=None,
    ):
        """
        Initialize audio client.

        Args:
            whisper_model_size: Whisper model size (tiny, base, small, medium, large)
            audio_device_index: Audio input device index (None = default)
            silence_threshold: Silence duration before processing (seconds)
            enable_tts: Enable text-to-speech output
            logging_level: Logging level (DEBUG, INFO, WARNING, ERROR)
            use_claude: Use Claude API for LLM
            use_chatgpt: Use ChatGPT API for LLM
            system_prompt: Default system prompt for LLM
        """
        pass

    async def start_voice_chat(self, message_processor_callback):
        """
        Start a voice-based chat session.

        Args:
            message_processor_callback: Async callback to process transcribed text

        Example:
            >>> async def process_message(text: str):
            ...     await audio.process_voice_input(text)
            >>> await audio.start_voice_chat(process_message)
        """
        pass

    async def process_voice_input(self, text: str, get_stats_callback=None):
        """
        Process transcribed voice input and get AI response.

        Args:
            text: Transcribed text to process
            get_stats_callback: Optional callback for performance stats
        """
        pass

    async def speak_text(self, text: str):
        """Speak text using TTS."""
        pass

    async def halt_generation(self):
        """Send a request to halt the current generation."""
        pass

WhisperAsr

from gaia.audio import WhisperAsr

class WhisperAsr:
    """Whisper-based Automatic Speech Recognition."""

    def __init__(
        self,
        model_size="small",
        device_index=None,
        transcription_queue=None,
        enable_cuda=False,
        silence_threshold=None,
        min_audio_length=None,
    ):
        """
        Initialize Whisper ASR.

        Args:
            model_size: Whisper model size (tiny, base, small, medium, large)
            device_index: Audio input device index (None = default)
            transcription_queue: Queue to receive transcriptions
            enable_cuda: Enable GPU acceleration
            silence_threshold: Custom silence threshold (default: 0.01)
            min_audio_length: Minimum audio length in samples (default: 16000)
        """
        pass

    def start_recording(self):
        """Start recording audio."""
        pass

    def stop_recording(self):
        """Stop recording audio."""
        pass

    def pause_recording(self):
        """Pause audio recording (e.g., during TTS playback)."""
        pass

    def resume_recording(self):
        """Resume audio recording after pause."""
        pass

    def get_device_name(self) -> str:
        """Get the name of the current audio input device."""
        pass

KokoroTTS

from gaia.audio import KokoroTTS

class KokoroTTS:
    """Kokoro-based Text-to-Speech synthesis."""

    def __init__(self):
        """
        Initialize Kokoro TTS.

        Note:
            - Uses American English by default
            - Default voice: af_bella (A- quality)
            - 17+ voices available
        """
        pass

    def generate_speech_streaming(
        self,
        text_queue,
        status_callback=None,
        interrupt_event=None,
    ):
        """
        Generate and play speech from streaming text queue.

        Args:
            text_queue: Queue containing text chunks and control signals
            status_callback: Callback for speaking state (True=speaking, False=done)
            interrupt_event: Event to signal interruption

        Control Signals:
            "__END__": End of text stream
            "__HALT__": Immediate stop
        """
        pass

    def get_available_voices(self) -> dict:
        """
        Get available voice configurations.

        Returns:
            Dictionary of voice configs with quality/duration ratings
        """
        pass

    def set_voice(self, voice_id: str):
        """
        Set the TTS voice.

        Args:
            voice_id: Voice identifier (e.g., "af_bella", "am_michael")
        """
        pass

Dependencies

Required Packages

# pyproject.toml
[project.optional-dependencies]
talk = [
    "openai-whisper>=20231117",  # ASR
    "pyaudio>=0.2.14",           # Audio recording
    "torch>=2.0.0",              # Whisper backend
    "kokoro>=0.3.1",             # TTS
    "sounddevice>=0.4.6",        # Audio playback
    "soundfile>=0.12.1",         # Audio file I/O
    "numpy>=1.24.0",             # Audio processing
]
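
These extras can be installed from the repository root; the exact install target is an assumption based on the snippet above:

pip install ".[talk]"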

Usage Examples

Basic Voice Chat

import asyncio
from gaia.audio import AudioClient

async def main():
    audio = AudioClient(
        whisper_model_size="base",
        enable_tts=True,
        system_prompt="You are a helpful assistant"
    )

    async def process_message(text):
        await audio.process_voice_input(text)

    await audio.start_voice_chat(process_message)

asyncio.run(main())

Custom Voice Settings

import asyncio
from gaia.audio import AudioClient

async def main():
    audio = AudioClient(enable_tts=True)
    audio.initialize_tts()

    # Change voice
    audio.tts.set_voice("am_michael")  # Male voice

    # Speak text (speak_text is a coroutine, so it must be
    # awaited inside an async function)
    await audio.speak_text("Hello in a different voice")

asyncio.run(main())
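
To discover valid voice identifiers before calling set_voice, the table returned by get_available_voices can be printed. The exact shape of each entry (quality/duration ratings) is an assumption based on the API docstring above:

from gaia.audio import KokoroTTS

tts = KokoroTTS()

# Each entry maps a voice id (e.g., "af_bella") to its configuration;
# the config structure is assumed from the docstring, so inspect it first.
for voice_id, config in tts.get_available_voices().items():
    print(voice_id, config)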

Standalone Whisper ASR

import queue
from gaia.audio import WhisperAsr

# Create transcription queue
transcription_queue = queue.Queue()

# Initialize ASR
asr = WhisperAsr(
    model_size="small",
    transcription_queue=transcription_queue,
    enable_cuda=True  # Use GPU
)

# Start recording
asr.start_recording()

# Process transcriptions
try:
    while True:
        try:
            text = transcription_queue.get(timeout=1)
            print(f"Transcribed: {text}")
        except queue.Empty:
            continue  # no speech transcribed yet; keep polling
except KeyboardInterrupt:
    asr.stop_recording()

Standalone Kokoro TTS

import queue
import threading
from gaia.audio import KokoroTTS

tts = KokoroTTS()
text_queue = queue.Queue()

# Start TTS thread
thread = threading.Thread(
    target=tts.generate_speech_streaming,
    args=(text_queue,)
)
thread.start()

# Send text
text_queue.put("Hello")
text_queue.put(" world!")
text_queue.put("__END__")

thread.join()
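
Interrupting TTS Playback

Interrupt handling uses the documented interrupt_event and "__HALT__" control signal. A sketch follows; the one-second delay simulates a user interrupt such as a keypress or detected voice activity.

import queue
import threading
import time
from gaia.audio import KokoroTTS

tts = KokoroTTS()
text_queue = queue.Queue()
interrupt_event = threading.Event()

thread = threading.Thread(
    target=tts.generate_speech_streaming,
    args=(text_queue,),
    kwargs={"interrupt_event": interrupt_event},
)
thread.start()

text_queue.put("This is a long sentence that will be cut off mid-playback.")

time.sleep(1)
interrupt_event.set()       # signal the TTS thread to stop
text_queue.put("__HALT__")  # documented immediate-stop control signal

thread.join()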

Acceptance Criteria

  • AudioClient implemented in src/gaia/audio/audio_client.py
  • WhisperAsr implemented in src/gaia/audio/whisper_asr.py
  • KokoroTTS implemented in src/gaia/audio/kokoro_tts.py
  • Voice chat works end-to-end
  • Streaming TTS with recording pause/resume
  • Interrupt handling (keyboard and voice)
  • GPU acceleration support
  • Can import: from gaia.audio import AudioClient, WhisperAsr, KokoroTTS
  • Example code works