Play Audio Event

You can use the playAudio event to send audio over the WebSocket. When the Stream element's bidirectional attribute is set to true, Vobiz plays the audio sent from your application to the other party on the call.

Prerequisites

  • Stream element must have bidirectional="true"
  • Active WebSocket connection established by Vobiz
  • Audio must be base64-encoded in the correct format (PCM or μ-law)

Attributes

Attribute | Description
event
(string)
Required

Indicates the event type. playAudio is the value required to transmit audio over the WebSocket.

media
(object)
Required

An object containing media metadata and payload.

Media Object Properties:

  • contentType (string, required): The audio codec format
    • audio/x-l16 - Linear PCM 16-bit
    • audio/x-mulaw - μ-law encoded
  • sampleRate (integer, required): Sample rate of the audio transmitted
    • 8000 - 8kHz (standard telephony)
    • 16000 - 16kHz (wideband)
  • payload (string, required): Base64-encoded string of raw audio

Audio Format Requirements

Supported Formats

Linear PCM (audio/x-l16)

  • Format: 16-bit linear PCM
  • Endianness: Little-endian
  • Sample rates: 8000 Hz or 16000 Hz
  • Channels: Mono (1 channel)
  • Encoding: Base64

μ-law (audio/x-mulaw)

  • Format: μ-law compressed
  • Sample rate: 8000 Hz
  • Channels: Mono (1 channel)
  • Encoding: Base64
  • Use case: Lower bandwidth

Important:

  • Audio must be raw PCM or μ-law data (no WAV headers or file containers)
  • The contentType and sampleRate must match your actual audio data
  • For best quality, use 16kHz sample rate with audio/x-l16
  • Each playAudio event should contain a reasonable chunk size (typically 20-60ms of audio)

Examples

Request Format

playAudio event with 8kHz PCM audio
{
  "event": "playAudio",
  "media": {
    "contentType": "audio/x-l16",
    "sampleRate": 8000,
    "payload": "base64 encoded raw audio..."
  }
}

Node.js Implementation

Generate and send audio from text
const WebSocket = require('ws');
const fs = require('fs');

// Example: Convert text to speech and send via WebSocket
async function textToSpeechAndPlay(ws, text) {
  try {
    // Ask the TTS backend for raw 8 kHz, 16-bit PCM (no WAV header).
    const pcm = await generateSpeech(text, {
      format: 'pcm',
      sampleRate: 8000,
      bitDepth: 16
    });

    // playAudio requires the raw bytes base64-encoded in media.payload;
    // contentType/sampleRate must match the audio we just generated.
    ws.send(
      JSON.stringify({
        event: 'playAudio',
        media: {
          contentType: 'audio/x-l16',
          sampleRate: 8000,
          payload: pcm.toString('base64')
        }
      })
    );
    console.log('Sent audio:', text);

  } catch (error) {
    console.error('Error generating/sending audio:', error);
  }
}

// Example: Stream audio from file in chunks
// Example: Stream a raw PCM audio file to the call in real-time chunks.
// Returns a Promise that resolves once the final chunk has been sent.
async function streamAudioFile(ws, filepath) {
  // Read raw PCM audio file (headerless — no WAV container)
  const audioData = fs.readFileSync(filepath);

  // Calculate chunk size (e.g. 20ms of audio):
  // 8000 samples/s * 2 bytes/sample * 0.020 s = 320 bytes
  const sampleRate = 8000;
  const bytesPerSample = 2; // 16-bit = 2 bytes
  const chunkDuration = 0.020; // 20ms
  const chunkSize = Math.floor(sampleRate * bytesPerSample * chunkDuration);

  // Send audio in chunks, paced to playback speed
  for (let i = 0; i < audioData.length; i += chunkSize) {
    const chunk = audioData.subarray(i, i + chunkSize);
    const chunkBase64 = chunk.toString('base64');

    const playAudioEvent = {
      event: 'playAudio',
      media: {
        contentType: 'audio/x-l16',
        sampleRate: 8000,
        payload: chunkBase64
      }
    };

    ws.send(JSON.stringify(playAudioEvent));

    // Pace sends to approximately real-time playback so we don't flood
    // the socket with the whole file at once. Production code should use
    // a drift-corrected timer rather than a fixed sleep per chunk.
    await new Promise((resolve) => setTimeout(resolve, chunkDuration * 1000));
  }

  console.log('Finished streaming audio file');
}

// Mock TTS function (replace with actual TTS service)
/**
 * Mock TTS helper — swap in a real service (Google TTS, AWS Polly, etc.)
 * that returns raw PCM audio matching the requested options.
 *
 * @param {string} text - Text to synthesize.
 * @param {object} options - Desired format / sampleRate / bitDepth.
 * @returns {Promise<Buffer>} Raw PCM audio bytes (placeholder here).
 */
async function generateSpeech(text, options) {
  // Placeholder buffer standing in for synthesized audio data.
  return Buffer.from('...');
}

Python Implementation

Send audio using asyncio WebSocket
import asyncio
import websockets
import json
import base64

async def play_audio(websocket, audio_bytes, sample_rate=8000, content_type='audio/x-l16'):
    """Send a playAudio event to Vobiz.

    Args:
        websocket: Open WebSocket connection established by Vobiz.
        audio_bytes: Raw PCM or mu-law audio (no WAV header / container).
        sample_rate: Sample rate of the audio in Hz.
        content_type: Codec identifier matching the audio data.
    """
    # Vobiz expects the raw bytes base64-encoded inside media.payload.
    payload = base64.b64encode(audio_bytes).decode('utf-8')

    await websocket.send(json.dumps({
        'event': 'playAudio',
        'media': {
            'contentType': content_type,
            'sampleRate': sample_rate,
            'payload': payload,
        },
    }))
    print(f"Sent {len(audio_bytes)} bytes of audio")

async def text_to_speech_and_play(websocket, text):
    """Synthesize speech for `text` and stream it to the active call."""
    # Generate 8 kHz raw PCM via the (pluggable) TTS backend, then hand
    # the bytes to play_audio for base64 encoding and delivery to Vobiz.
    pcm = await generate_speech(text, sample_rate=8000)
    await play_audio(websocket, pcm, sample_rate=8000)

async def stream_audio_file(websocket, filepath, sample_rate=8000, chunk_ms=20):
    """Stream a raw PCM audio file to the call in real-time chunks.

    Args:
        websocket: Open WebSocket connection established by Vobiz.
        filepath: Path to a headerless 16-bit mono PCM file.
        sample_rate: Sample rate of the file in Hz (8000 or 16000).
        chunk_ms: Duration of each chunk in milliseconds.
    """
    # 16-bit mono PCM -> 2 bytes/sample; e.g. 8 kHz * 20 ms = 320 bytes.
    # Computed instead of hard-coded so 16 kHz files are paced correctly.
    bytes_per_sample = 2
    chunk_size = sample_rate * bytes_per_sample * chunk_ms // 1000

    with open(filepath, 'rb') as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break

            await play_audio(websocket, chunk, sample_rate=sample_rate)

            # Delay to simulate real-time playback
            await asyncio.sleep(chunk_ms / 1000)

    print("Finished streaming audio file")

async def generate_speech(text, sample_rate=8000):
    """Mock TTS backend; returns raw PCM audio bytes for `text`.

    Replace with a real TTS service (Google TTS, Amazon Polly, gTTS,
    pyttsx3, etc.) that produces headerless PCM at `sample_rate`.
    """
    # Placeholder payload standing in for synthesized audio.
    return b'...'

# Example usage in your WebSocket handler
async def handle_stream(websocket, path):
    """Example WebSocket handler: greet the caller when the stream starts.

    Args:
        websocket: Incoming WebSocket connection from Vobiz.
        path: Request path for the connection (unused here).
    """
    async for message in websocket:
        data = json.loads(message)
        # Use .get() so a message without an 'event' key is ignored
        # instead of raising KeyError and killing the handler.
        event = data.get('event')

        if event == 'start':
            # Play greeting when stream starts
            await text_to_speech_and_play(
                websocket,
                "Hello! Welcome to Vobiz audio streaming."
            )

        elif event == 'media':
            # Process incoming audio from call
            pass

Converting WAV to PCM

Convert WAV file to raw PCM for playAudio
const fs = require('fs');
const wav = require('wav');

/**
 * Convert a WAV file to raw PCM suitable for playAudio.
 * Validates that the file is 16-bit mono PCM at 8 kHz or 16 kHz, then
 * resolves with the header-stripped sample data.
 *
 * @param {string} wavFilePath - Path to the WAV file.
 * @returns {Promise<Buffer>} Raw PCM bytes with the WAV header removed.
 */
function convertWavToPCM(wavFilePath) {
  return new Promise((resolve, reject) => {
    const reader = new wav.Reader();
    const chunks = [];

    reader.on('format', (format) => {
      console.log('WAV Format:', format);

      // Reject anything playAudio cannot represent as audio/x-l16.
      if (format.audioFormat !== 1) { // 1 = PCM
        reject(new Error('Only PCM WAV files are supported'));
        return;
      }

      // 8- or 24-bit PCM would pass the other checks but play back
      // garbled when labeled audio/x-l16, so enforce 16-bit here.
      if (format.bitDepth !== 16) {
        reject(new Error('Only 16-bit audio is supported'));
        return;
      }

      if (format.channels !== 1) {
        reject(new Error('Only mono audio is supported'));
        return;
      }

      if (![8000, 16000].includes(format.sampleRate)) {
        reject(new Error('Sample rate must be 8000 or 16000'));
        return;
      }
    });

    reader.on('data', (chunk) => {
      chunks.push(chunk);
    });

    // If we rejected above this resolve is a no-op — a Promise settles
    // only once.
    reader.on('end', () => {
      resolve(Buffer.concat(chunks));
    });

    reader.on('error', reject);

    fs.createReadStream(wavFilePath).pipe(reader);
  });
}

// Usage
/**
 * Convert a WAV file to raw PCM and play it on the call.
 *
 * @param {object} ws - Open WebSocket connection established by Vobiz.
 * @param {string} wavFilePath - Path to the WAV file to play.
 * @param {number} [sampleRate=8000] - Must match the WAV file's actual
 *   rate (8000 or 16000); a mismatch produces distorted audio.
 */
async function playWavFile(ws, wavFilePath, sampleRate = 8000) {
  try {
    // Strip the WAV header — playAudio needs raw sample data only
    const pcmData = await convertWavToPCM(wavFilePath);

    // Convert to base64
    const audioBase64 = pcmData.toString('base64');

    // Send playAudio event
    ws.send(JSON.stringify({
      event: 'playAudio',
      media: {
        contentType: 'audio/x-l16',
        sampleRate,
        payload: audioBase64
      }
    }));

    console.log('WAV file sent successfully');
  } catch (error) {
    console.error('Error playing WAV file:', error);
  }
}

Best Practices

Match Audio Format to ContentType

Ensure your audio data format exactly matches the contentType and sampleRate you specify. Mismatches will result in distorted or garbled audio.

Use Appropriate Chunk Sizes

Send audio in reasonable chunks (20-60ms). Chunks that are too small add per-message overhead; chunks that are too large increase latency. For 8kHz 16-bit audio, 20ms = 320 bytes.

Remove File Headers

Always send raw PCM or μ-law data without WAV headers or any file container metadata. Strip headers before base64-encoding.

Use Checkpoints to Track Playback

Send checkpoint events after your playAudio events to get acknowledgments when audio finishes playing. This helps synchronize your application logic.