Play Audio Event
You can utilize the playAudio event to transmit audio through the WebSocket. When the bidirectional attribute is set to true, Vobiz can deliver the audio transmitted from your application to the party on the call.
Prerequisites
- Stream element must have bidirectional="true"
- Active WebSocket connection established by Vobiz
- Audio must be base64-encoded in the correct format (PCM or μ-law)
Attributes
| Attribute | Description |
|---|---|
| event (string), required | Indicates the event type; must be "playAudio". |
| media (object), required | An object containing media metadata and payload (see Media Object Properties below). |
Audio Format Requirements
Supported Formats
Linear PCM (audio/x-l16)
- Format: 16-bit linear PCM
- Endianness: Little-endian
- Sample rates: 8000 Hz or 16000 Hz
- Channels: Mono (1 channel)
- Encoding: Base64
μ-law (audio/x-mulaw)
- Format: μ-law compressed
- Sample rate: 8000 Hz
- Channels: Mono (1 channel)
- Encoding: Base64
- Use case: Lower bandwidth
Important:
- Audio must be raw PCM or μ-law data (no WAV headers or file containers)
- The contentType and sampleRate must match your actual audio data
- For best quality, use 16kHz sample rate with audio/x-l16
- Each playAudio event should contain a reasonable chunk size (typically 20-60ms of audio)
Examples
Request Format
{
  "event": "playAudio",
  "media": {
    "contentType": "audio/x-l16",
    "sampleRate": 8000,
    "payload": "base64 encoded raw audio..."
  }
}

Node.js Implementation
const WebSocket = require('ws');
const fs = require('fs');
// Example: synthesize speech for `text` and forward it over the WebSocket.
// Swallows (logs) errors so a failed TTS call never tears down the stream.
async function textToSpeechAndPlay(ws, text) {
  try {
    // Step 1: ask the TTS backend for raw 8 kHz, 16-bit PCM audio.
    const audioBuffer = await generateSpeech(text, {
      format: 'pcm',
      sampleRate: 8000,
      bitDepth: 16
    });

    // Step 2: the payload travels as base64 text inside the JSON event.
    const payload = audioBuffer.toString('base64');

    // Step 3: emit the playAudio event on the stream socket.
    ws.send(JSON.stringify({
      event: 'playAudio',
      media: {
        contentType: 'audio/x-l16',
        sampleRate: 8000,
        payload
      }
    }));
    console.log('Sent audio:', text);
  } catch (error) {
    console.error('Error generating/sending audio:', error);
  }
}
// Example: stream a raw PCM audio file over the WebSocket in ~20ms chunks.
//
// Fix: the previous version pushed every chunk in a tight loop (the pacing
// mentioned in its comment was never implemented), flooding the socket.
// This version is async and sleeps one chunk-duration between sends, so
// delivery roughly matches real-time playback. Returns a Promise that
// resolves when the whole file has been sent.
async function streamAudioFile(ws, filepath) {
  // Read raw (headerless) PCM audio; WAV containers must be stripped first.
  const audioData = fs.readFileSync(filepath);

  // 20ms of 8 kHz 16-bit mono audio = 8000 * 2 * 0.020 = 320 bytes.
  const sampleRate = 8000;
  const bytesPerSample = 2; // 16-bit = 2 bytes
  const chunkDuration = 0.020; // 20ms
  const chunkSize = Math.floor(sampleRate * bytesPerSample * chunkDuration);
  const delayMs = chunkDuration * 1000;

  for (let i = 0; i < audioData.length; i += chunkSize) {
    // subarray avoids copying; base64 encoding copies anyway.
    const chunk = audioData.subarray(i, i + chunkSize);
    ws.send(JSON.stringify({
      event: 'playAudio',
      media: {
        contentType: 'audio/x-l16',
        sampleRate: 8000,
        payload: chunk.toString('base64')
      }
    }));
    // Pace sends to real time. In production you'd schedule against a
    // monotonic clock to compensate for drift rather than sleep blindly.
    await new Promise((resolve) => setTimeout(resolve, delayMs));
  }
  console.log('Finished streaming audio file');
}
// Mock TTS function (replace with actual TTS service)
async function generateSpeech(text, options) {
// This would call your TTS service (e.g., Google TTS, AWS Polly, etc.)
// and return raw PCM audio data
return Buffer.from('...'); // Placeholder
}Python Implementation
import asyncio
import websockets
import json
import base64
async def play_audio(websocket, audio_bytes, sample_rate=8000, content_type='audio/x-l16'):
    """Send a playAudio event to Vobiz over the stream WebSocket.

    Args:
        websocket: Open WebSocket connection established by Vobiz.
        audio_bytes: Raw PCM or mu-law audio (no WAV header / container).
        sample_rate: Sample rate of audio_bytes (8000 or 16000 Hz).
        content_type: 'audio/x-l16' for linear PCM or 'audio/x-mulaw'.
    """
    # The wire format carries audio as base64 text inside the JSON event.
    audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
    play_audio_event = {
        'event': 'playAudio',
        'media': {
            'contentType': content_type,
            'sampleRate': sample_rate,
            'payload': audio_base64
        }
    }
    # Ship the event to Vobiz.
    await websocket.send(json.dumps(play_audio_event))
    print(f"Sent {len(audio_bytes)} bytes of audio")
async def text_to_speech_and_play(websocket, text):
    """Generate speech from text and play it on the active call.

    Args:
        websocket: Open WebSocket connection to Vobiz.
        text: Text to synthesize via the TTS backend.
    """
    # 1. Generate 8 kHz raw PCM audio from the text (using TTS).
    audio_bytes = await generate_speech(text, sample_rate=8000)
    # 2. Forward the audio to Vobiz as a playAudio event.
    await play_audio(websocket, audio_bytes, sample_rate=8000)
async def stream_audio_file(websocket, filepath, sample_rate=8000, chunk_ms=20):
    """Stream a raw 16-bit mono PCM file to Vobiz in real-time-sized chunks.

    Generalized: chunk size is derived from sample_rate and chunk_ms instead
    of the hard-coded 320 bytes (which was only correct for 8 kHz / 20 ms).
    Defaults preserve the original behavior.

    Args:
        websocket: Open WebSocket connection to Vobiz.
        filepath: Path to a headerless 16-bit mono PCM file.
        sample_rate: Sample rate of the file (8000 or 16000 Hz).
        chunk_ms: Chunk duration in milliseconds (20-60 ms recommended).
    """
    # 16-bit mono PCM: bytes per chunk = rate * 2 bytes/sample * duration.
    chunk_size = int(sample_rate * 2 * chunk_ms / 1000)  # 320 at 8 kHz / 20 ms
    with open(filepath, 'rb') as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            await play_audio(websocket, chunk, sample_rate=sample_rate)
            # Delay to approximate real-time playback pacing.
            await asyncio.sleep(chunk_ms / 1000)
    print("Finished streaming audio file")
async def generate_speech(text, sample_rate=8000):
    """Generate speech from text (mock).

    Replace with a real TTS backend (e.g. Google TTS, Amazon Polly, gTTS,
    pyttsx3) that returns raw PCM audio bytes at the requested sample_rate.
    """
    # Placeholder - replace with actual TTS
    return b'...'  # Raw PCM audio bytes
# Example usage in your WebSocket handler
async def handle_stream(websocket, path):
async for message in websocket:
data = json.loads(message)
if data['event'] == 'start':
# Play greeting when stream starts
await text_to_speech_and_play(
websocket,
"Hello! Welcome to Vobiz audio streaming."
)
elif data['event'] == 'media':
# Process incoming audio from call
passConverting WAV to PCM
const fs = require('fs');
const wav = require('wav');
// Convert a WAV file to raw PCM suitable for playAudio events.
//
// Resolves with a Buffer of headerless PCM samples; rejects if the file is
// not mono PCM at 8 or 16 kHz, or if either stream fails.
//
// Fixes over the original:
// - `pipe` does not forward source errors, so a missing/unreadable file
//   left the promise pending forever; the file stream now has its own
//   error handler.
// - After a validation failure the rest of the file was still read and
//   parsed; we now destroy the source stream on rejection.
function convertWavToPCM(wavFilePath) {
  return new Promise((resolve, reject) => {
    const reader = new wav.Reader();
    const chunks = [];
    const fileStream = fs.createReadStream(wavFilePath);

    // Reject once and stop feeding the parser.
    const fail = (err) => {
      fileStream.destroy();
      reject(err);
    };

    reader.on('format', (format) => {
      console.log('WAV Format:', format);
      // Validate the container before accepting any sample data.
      if (format.audioFormat !== 1) { // 1 = PCM
        fail(new Error('Only PCM WAV files are supported'));
        return;
      }
      if (format.channels !== 1) {
        fail(new Error('Only mono audio is supported'));
        return;
      }
      if (![8000, 16000].includes(format.sampleRate)) {
        fail(new Error('Sample rate must be 8000 or 16000'));
        return;
      }
    });

    reader.on('data', (chunk) => {
      chunks.push(chunk);
    });
    reader.on('end', () => {
      resolve(Buffer.concat(chunks));
    });
    reader.on('error', fail);
    fileStream.on('error', fail); // e.g. ENOENT — pipe would not forward this

    fileStream.pipe(reader);
  });
}
// Usage
async function playWavFile(ws, wavFilePath) {
try {
// Convert WAV to raw PCM
const pcmData = await convertWavToPCM(wavFilePath);
// Convert to base64
const audioBase64 = pcmData.toString('base64');
// Send playAudio event
ws.send(JSON.stringify({
event: 'playAudio',
media: {
contentType: 'audio/x-l16',
sampleRate: 8000, // or 16000 depending on your file
payload: audioBase64
}
}));
console.log('WAV file sent successfully');
} catch (error) {
console.error('Error playing WAV file:', error);
}
}Best Practices
Match Audio Format to ContentType
Ensure your audio data format exactly matches the contentType and sampleRate you specify. Mismatches will result in distorted or garbled audio.
Use Appropriate Chunk Sizes
Send audio in reasonable chunks (20-60ms). Chunks that are too small add per-message overhead; chunks that are too large increase latency. For 8kHz 16-bit audio, 20ms = 320 bytes.
Remove File Headers
Always send raw PCM or μ-law data without WAV headers or any file container metadata. Strip headers before base64-encoding.
Use Checkpoints to Track Playback
Send checkpoint events after your playAudio events to get acknowledgments when audio finishes playing. This helps synchronize your application logic.