# Azure AI Voice Live SDK - Examples

## Table of Contents

- [Basic Voice Assistant](#basic-voice-assistant)
- [Function Calling](#function-calling)
- [Manual Turn Control](#manual-turn-control)
- [Audio File Processing](#audio-file-processing)
- [Interrupt Handling](#interrupt-handling)
- [Multi-modal (Text + Audio)](#multi-modal-text--audio)
- [Azure Voice Integration](#azure-voice-integration)
- [Avatar Integration](#avatar-integration)
- [Transcription Only](#transcription-only)

---

## Basic Voice Assistant

Complete voice assistant with Server VAD.

```python
import asyncio
import base64
import contextlib

from azure.ai.voicelive.aio import connect
from azure.core.credentials import AzureKeyCredential


async def voice_assistant():
    """Run a voice assistant that uses server VAD for turn detection.

    `read_microphone()` and `play_audio()` are not defined in this snippet;
    supply your own audio capture and playback.
    """
    async with connect(
        endpoint="https://eastus.api.cognitive.microsoft.com",
        credential=AzureKeyCredential("YOUR_KEY"),
        model="gpt-4o-realtime-preview",
    ) as conn:
        # Configure session
        await conn.session.update(session={
            "instructions": "You are a helpful voice assistant. Be concise.",
            "modalities": ["text", "audio"],
            "voice": "alloy",
            "turn_detection": {
                "type": "server_vad",
                "threshold": 0.5,
                "silence_duration_ms": 500,
            },
            "input_audio_transcription": {
                "model": "whisper-1",
            },
        })

        # Start microphone input (pseudo-code)
        mic_task = asyncio.create_task(stream_microphone(conn))

        try:
            # Process events
            async for event in conn:
                match event.type:
                    case "session.created":
                        print("Session ready")

                    case "input_audio_buffer.speech_started":
                        print("🎤 Listening...")

                    case "conversation.item.input_audio_transcription.completed":
                        print(f"You: {event.transcript}")

                    case "response.audio.delta":
                        audio = base64.b64decode(event.delta)
                        await play_audio(audio)

                    case "response.audio_transcript.done":
                        print(f"Assistant: {event.transcript}")

                    case "error":
                        print(f"Error: {event.error.message}")
                        break
        finally:
            # Stop the microphone task before the connection is closed so it
            # does not keep appending audio after the event loop has exited.
            mic_task.cancel()
            with contextlib.suppress(asyncio.CancelledError):
                await mic_task


async def stream_microphone(conn):
    """Stream microphone audio to the connection."""
    async for chunk in read_microphone():  # Your audio capture
        b64 = base64.b64encode(chunk).decode()
        await conn.input_audio_buffer.append(audio=b64)


asyncio.run(voice_assistant())
```

---

## Function Calling

Voice assistant with tool use.

```python
import asyncio
import base64
import json

from azure.ai.voicelive.aio import connect
from azure.ai.voicelive.models import FunctionTool
from azure.core.credentials import AzureKeyCredential

# Define tools
TOOLS = [
    FunctionTool(
        type="function",
        name="get_weather",
        description="Get current weather for a location",
        parameters={
            "type": "object",
            "properties": {
                "location": {
                    "type": "string",
                    "description": "City and state, e.g. 'San Francisco, CA'",
                },
                "unit": {
                    "type": "string",
                    "enum": ["celsius", "fahrenheit"],
                    "default": "fahrenheit",
                },
            },
            "required": ["location"],
        },
    ),
    FunctionTool(
        type="function",
        name="set_reminder",
        description="Set a reminder for the user",
        parameters={
            "type": "object",
            "properties": {
                "message": {"type": "string"},
                "time": {"type": "string", "description": "ISO 8601 datetime"},
            },
            "required": ["message", "time"],
        },
    ),
]


def handle_function_call(name: str, arguments: str) -> dict:
    """Execute the named function and return a JSON-serializable result.

    Args:
        name: Name of the function to run.
        arguments: JSON-encoded object holding the call arguments.

    Returns:
        The function result, or ``{"error": ...}`` when the arguments are not
        a valid JSON object, a required argument is missing, or the function
        name is unknown. Errors are returned rather than raised so the caller
        can still send a function_call_output for the call.
    """
    try:
        args = json.loads(arguments) if arguments else {}
    except json.JSONDecodeError as exc:
        return {"error": f"Invalid JSON arguments for {name}: {exc}"}
    if not isinstance(args, dict):
        return {"error": f"Arguments for {name} must be a JSON object"}

    if name == "get_weather":
        if "location" not in args:
            return {"error": "Missing required argument: location"}
        # Mock weather API
        return {
            "location": args["location"],
            "temperature": 72,
            "unit": args.get("unit", "fahrenheit"),
            "conditions": "sunny",
        }
    elif name == "set_reminder":
        # Mock reminder service
        return {"status": "success", "reminder_id": "123"}
    else:
        return {"error": f"Unknown function: {name}"}


async def function_calling_assistant():
    """Run an assistant that answers tool calls using handle_function_call()."""
    async with connect(
        endpoint="https://eastus.api.cognitive.microsoft.com",
        credential=AzureKeyCredential("YOUR_KEY"),
        model="gpt-4o-realtime-preview",
    ) as conn:
        await conn.session.update(session={
            "instructions": "You can check weather and set reminders.",
            "modalities": ["text", "audio"],
            "voice": "alloy",
            "tools": TOOLS,
            "tool_choice": "auto",
        })

        async for event in conn:
            match event.type:
                case "response.function_call_arguments.done":
                    # Execute the function
                    result = handle_function_call(event.name, event.arguments)

                    # Send result back
                    await conn.conversation.item.create(item={
                        "type": "function_call_output",
                        "call_id": event.call_id,
                        "output": json.dumps(result),
                    })

                    # Continue the conversation
                    await conn.response.create()

                case "response.audio.delta":
                    audio = base64.b64decode(event.delta)
                    await play_audio(audio)

                case "response.done":
                    if event.response.status == "completed":
                        print("Response complete")


asyncio.run(function_calling_assistant())
```

---

## Manual Turn Control

Push-to-talk style without VAD.

```python
import asyncio
import base64

from azure.ai.voicelive.aio import connect
from azure.core.credentials import AzureKeyCredential


async def push_to_talk():
    """Run a push-to-talk loop: record, commit the audio, play the response.

    `record_audio_until_release()` and `play_audio()` are not defined in this
    snippet; supply your own audio capture and playback.
    """
    async with connect(
        endpoint="https://eastus.api.cognitive.microsoft.com",
        credential=AzureKeyCredential("YOUR_KEY"),
        model="gpt-4o-realtime-preview",
    ) as conn:
        # Disable VAD for manual control
        await conn.session.update(session={
            "instructions": "You are a helpful assistant.",
            "modalities": ["text", "audio"],
            "voice": "alloy",
            "turn_detection": None,  # Disable VAD
        })

        # Simulate push-to-talk
        while True:
            # input() blocks, so run it in a worker thread; calling it
            # directly would stall the event loop while waiting for Enter.
            await asyncio.to_thread(input, "Press Enter to start recording...")

            # Record audio (simulate with chunks)
            chunks = await record_audio_until_release()
            if not chunks:
                # Nothing was recorded, so there is nothing to commit.
                continue

            # Send all audio
            for chunk in chunks:
                b64 = base64.b64encode(chunk).decode()
                await conn.input_audio_buffer.append(audio=b64)

            # Commit and request response
            await conn.input_audio_buffer.commit()
            await conn.response.create()

            # Wait for response
            async for event in conn:
                if event.type == "response.audio.delta":
                    audio = base64.b64decode(event.delta)
                    await play_audio(audio)
                elif event.type == "response.done":
                    break
                elif event.type == "error":
                    # Stop instead of waiting for a response.done that may
                    # never arrive.
                    print(f"Error: {event.error.message}")
                    return


asyncio.run(push_to_talk())
```

---

## Audio File Processing

Process an audio file and get a response.
```python
import asyncio
import base64
from pathlib import Path

from azure.ai.voicelive.aio import connect
from azure.core.credentials import AzureKeyCredential


async def process_audio_file(audio_path: str):
    """Send an audio file to the service and collect the response.

    Args:
        audio_path: Path of the audio file whose raw bytes are sent.

    Returns:
        A dict with the keys "user_said" (input transcript), "assistant_said"
        (response transcript) and "audio" (response audio bytes). Values hold
        whatever was collected before "response.done" or an "error" event
        ended the loop.
    """
    async with connect(
        endpoint="https://eastus.api.cognitive.microsoft.com",
        credential=AzureKeyCredential("YOUR_KEY"),
        model="gpt-4o-realtime-preview",
    ) as conn:
        await conn.session.update(session={
            "instructions": "Respond to the audio message.",
            "modalities": ["text", "audio"],
            "voice": "alloy",
            "turn_detection": None,
            "input_audio_transcription": {"model": "whisper-1"},
        })

        # Read and send audio file
        audio_data = Path(audio_path).read_bytes()

        # Send in chunks (24kHz * 2 bytes * 0.1s = 4800 bytes)
        chunk_size = 4800
        for i in range(0, len(audio_data), chunk_size):
            chunk = audio_data[i:i + chunk_size]
            b64 = base64.b64encode(chunk).decode()
            await conn.input_audio_buffer.append(audio=b64)

        # Commit and request response
        await conn.input_audio_buffer.commit()
        await conn.response.create()

        # Collect response
        response_audio = bytearray()
        response_text = ""
        user_transcript = ""

        async for event in conn:
            match event.type:
                case "conversation.item.input_audio_transcription.completed":
                    user_transcript = event.transcript

                case "response.audio.delta":
                    response_audio.extend(base64.b64decode(event.delta))

                case "response.audio_transcript.done":
                    response_text = event.transcript

                case "response.done":
                    break

                case "error":
                    # Without this exit the loop only ends on response.done.
                    print(f"Error: {event.error.message}")
                    break

        return {
            "user_said": user_transcript,
            "assistant_said": response_text,
            "audio": bytes(response_audio),
        }


result = asyncio.run(process_audio_file("input.pcm"))
print(f"User: {result['user_said']}")
print(f"Assistant: {result['assistant_said']}")
Path("output.pcm").write_bytes(result['audio'])
```

---

## Interrupt Handling

Handle user interruptions gracefully.
```python
import asyncio
import base64

from azure.ai.voicelive.aio import connect
from azure.core.credentials import AzureKeyCredential


async def interruptible_assistant():
    """Play responses, cancelling the current one when the user speaks.

    `play_audio()` is not defined in this snippet; supply your own playback.
    """
    async with connect(
        endpoint="https://eastus.api.cognitive.microsoft.com",
        credential=AzureKeyCredential("YOUR_KEY"),
        model="gpt-4o-realtime-preview",
    ) as conn:
        await conn.session.update(session={
            "instructions": "You are a helpful assistant.",
            "modalities": ["text", "audio"],
            "voice": "alloy",
            "turn_detection": {
                "type": "server_vad",
                "threshold": 0.5,
                "silence_duration_ms": 500,
            },
        })

        # True between "response.created" and the end (or cancellation) of
        # that response; audio deltas are played only while it is set.
        is_responding = False

        async for event in conn:
            match event.type:
                case "response.created":
                    is_responding = True

                case "response.done":
                    is_responding = False

                case "input_audio_buffer.speech_started":
                    if is_responding:
                        # User interrupted - stop current response
                        print("🛑 Interrupt detected!")
                        # Clear the flag now so audio deltas that arrive
                        # before "response.done" are dropped, not played.
                        is_responding = False
                        await conn.response.cancel()
                        await conn.output_audio_buffer.clear()

                case "response.audio.delta":
                    if is_responding:
                        audio = base64.b64decode(event.delta)
                        await play_audio(audio)


asyncio.run(interruptible_assistant())
```

---

## Multi-modal (Text + Audio)

Send text context, receive audio response.
```python
import asyncio
import base64

from azure.ai.voicelive.aio import connect
from azure.core.credentials import AzureKeyCredential


async def multimodal_assistant():
    """Send text conversation items and play the audio response.

    `play_audio()` is not defined in this snippet; supply your own playback.
    """
    async with connect(
        endpoint="https://eastus.api.cognitive.microsoft.com",
        credential=AzureKeyCredential("YOUR_KEY"),
        model="gpt-4o-realtime-preview",
    ) as conn:
        await conn.session.update(session={
            "instructions": "You are a helpful assistant.",
            "modalities": ["text", "audio"],
            "voice": "alloy",
            "turn_detection": None,
        })

        # Add context via text
        await conn.conversation.item.create(item={
            "type": "message",
            "role": "system",
            "content": [{"type": "input_text", "text": "The user's name is Alice."}],
        })

        # Add user message as text
        await conn.conversation.item.create(item={
            "type": "message",
            "role": "user",
            "content": [{"type": "input_text", "text": "What's my name?"}],
        })

        # Request audio response
        await conn.response.create()

        async for event in conn:
            match event.type:
                case "response.audio.delta":
                    audio = base64.b64decode(event.delta)
                    await play_audio(audio)

                case "response.done":
                    break

                case "error":
                    # Without this exit the loop only ends on response.done.
                    print(f"Error: {event.error.message}")
                    break


asyncio.run(multimodal_assistant())
```

---

## Azure Voice Integration

Use Azure Text-to-Speech voices.
```python
import asyncio

from azure.ai.voicelive.aio import connect
from azure.ai.voicelive.models import AzureStandardVoice, AzureCustomVoice
from azure.core.credentials import AzureKeyCredential


async def azure_voice_assistant():
    """Configure the session with an Azure standard voice, then read events."""
    async with connect(
        endpoint="https://eastus.api.cognitive.microsoft.com",
        credential=AzureKeyCredential("YOUR_KEY"),
        model="gpt-4o-realtime-preview",
    ) as conn:
        # Use Azure standard voice
        standard_voice = AzureStandardVoice(
            type="azure-standard",
            name="en-US-JennyNeural",
        )
        session_config = {
            "instructions": "You are a helpful assistant.",
            "modalities": ["text", "audio"],
            "voice": standard_voice,
        }
        await conn.session.update(session=session_config)

        # Or use custom voice
        # await conn.session.update(session={
        #     "voice": AzureCustomVoice(
        #         type="azure-custom",
        #         endpoint_id="your-custom-voice-endpoint",
        #         name="YourCustomVoice"
        #     )
        # })

        async for event in conn:
            # ... handle events
            pass


asyncio.run(azure_voice_assistant())
```

---

## Avatar Integration

Connect to Azure Avatar for visual output.

```python
import asyncio

from azure.ai.voicelive.aio import connect
from azure.core.credentials import AzureKeyCredential


async def avatar_assistant():
    """Configure an avatar session, request the avatar connection, read events."""
    async with connect(
        endpoint="https://eastus.api.cognitive.microsoft.com",
        credential=AzureKeyCredential("YOUR_KEY"),
        model="gpt-4o-realtime-preview",
    ) as conn:
        avatar_config = {
            "type": "video-avatar",
            "character": "lisa",
            "output_protocol": "webrtc",
        }
        session_config = {
            "instructions": "You are a helpful assistant.",
            "modalities": ["text", "audio", "avatar"],
            "voice": "alloy",
            "avatar": avatar_config,
        }
        await conn.session.update(session=session_config)

        # Connect avatar
        await conn.send({"type": "session.avatar.connect"})

        async for event in conn:
            if event.type == "session.avatar.connecting":
                ice_servers = event.ice_servers
                # Use ice_servers for WebRTC connection
                print(f"Avatar connecting with {len(ice_servers)} ICE servers")
            # "response.audio.delta" needs no handling here:
            # Audio is streamed via WebRTC, not this event


asyncio.run(avatar_assistant())
```

---

## Transcription Only

Speech-to-text without AI response.
```python
import asyncio
import base64
import contextlib

from azure.ai.voicelive.aio import connect
from azure.core.credentials import AzureKeyCredential


async def transcription_only():
    """Transcribe microphone input and return the completed transcripts.

    `stream_microphone()` is the helper shown in the Basic Voice Assistant
    example; it is not defined in this snippet.

    Returns:
        The list of completed transcripts, available once the event stream
        ends.
    """
    async with connect(
        endpoint="https://eastus.api.cognitive.microsoft.com",
        credential=AzureKeyCredential("YOUR_KEY"),
        model="gpt-4o-realtime-preview",
    ) as conn:
        await conn.session.update(session={
            "modalities": ["text"],  # No audio output
            "turn_detection": {
                "type": "server_vad",
                "threshold": 0.5,
                "silence_duration_ms": 1000,
            },
            "input_audio_transcription": {
                "model": "whisper-1",
            },
        })

        # Stream microphone
        mic_task = asyncio.create_task(stream_microphone(conn))

        transcripts = []

        try:
            async for event in conn:
                match event.type:
                    case "conversation.item.input_audio_transcription.delta":
                        print(event.delta, end="", flush=True)

                    case "conversation.item.input_audio_transcription.completed":
                        print()  # Newline
                        transcripts.append(event.transcript)
        finally:
            # Stop the microphone task before the connection is closed so it
            # does not outlive the event loop above.
            mic_task.cancel()
            with contextlib.suppress(asyncio.CancelledError):
                await mic_task

        return transcripts


asyncio.run(transcription_only())
```