Full Customization

If you want total control, you can self-host LiveKit, bring your own frontend and backend, and use ZeroWeight only for avatar delivery.

Recommended architecture

  • ZeroWeight AI handles avatar bundle retrieval.
  • Your backend handles auth, policy, and token minting.
  • Your LiveKit deployment handles room infrastructure.
  • Your frontend uses either @zeroweight/react or @zeroweight/renderer.

Custom endpoint wiring

// Example: wiring <LiveKitAvatarSession> to self-hosted endpoints.
// NOTE: `userName` is assumed to be in scope (e.g. component state or props) —
// replace it with however your app identifies the current user.
// Both API callbacks return parsed JSON from your own backend routes.
<LiveKitAvatarSession
  avatarId="your-avatar-id"
  livekitUrl="wss://your-livekit-server.example.com"
  sessionDuration={180}
  inactivityTimeout={45000}
  api={{
    getBundle: (avatarId) =>
      fetch(`/api/avatars/bundle/${avatarId}`).then((r) => r.json()),
    getLiveKitToken: (avatarId) =>
      fetch(
        `/api/livekit/token?avatar_id=${avatarId}&name=${userName}`
      ).then((r) => r.json())
  }}
/>

LiveKit backend example

import asyncio
import json
import logging
import re
from typing import AsyncIterable
 
from livekit import rtc
from livekit.agents import Agent, AgentServer, AgentSession, JobContext, cli
from livekit.agents.llm import ChatContext
from livekit.plugins import cartesia, openai
 
logger = logging.getLogger("agent")
 
# System prompt injected into the agent's ChatContext. The inline
# <emotion .../> tags are consumed by the TTS provider, while the
# [action:...] tags are stripped and forwarded to the frontend by
# Assistant.tts_node before speech synthesis.
DEFAULT_PROMPT = """
You are a helpful voice assistant for ZeroWeight AI.
Keep answers concise, friendly, and useful.
 
[CRITICAL: EMOTIONS AND ACTIONS]
You must express your speech emotions and perform physical avatar actions using inline tags.
Do NOT say these tags out loud; integrate them naturally into your response text.
 
1. Speech Emotions (TTS):
   Use the tag: <emotion value="EMOTION" />
   Supported emotions: neutral, happy, angry, excited, content, sad, scared.
   Example: <emotion value="angry" /> How dare you speak to me like I'm just a robot!
 
2. Avatar Actions (Frontend):
   Use the tag: [action:ACTION_NAME]
   The available actions for you are:
    - speaking: normal speaking
    - wave_hand: waving your hand to greet people
 
   RULE: Place physical gesture tags (like [action:wave_hand]) at the very END of your response.
   Example: <emotion value="excited" /> [action:speaking] I am so thrilled to meet you! [action:wave_hand]
""".strip()
 
 
class Assistant(Agent):
    """Voice agent that extracts inline ``[action:...]`` tags from LLM output.

    Tags prefixed with ``speaking`` are broadcast to the room immediately;
    all other gesture tags are queued and flushed (via
    :meth:`trigger_pending_actions`) once the agent stops speaking. The tags
    are removed from the text before it reaches TTS so they are never spoken.
    """

    # Matches inline tags like "[action:wave_hand]"; group 1 is the action name.
    # BUG FIX: the original pattern was r"\\[action:([^\\]]+)\\]" — doubled
    # backslashes inside a raw string made it require a literal backslash
    # before the tag, so real [action:...] tags were never matched or
    # stripped. Compiled once at class level instead of per tts_node call.
    _ACTION_PATTERN = re.compile(r"\[action:([^\]]+)\]")

    def __init__(self, room: rtc.Room, chat_ctx: ChatContext, instructions: str) -> None:
        super().__init__(chat_ctx=chat_ctx, instructions=instructions)
        self.room = room
        # Gesture actions deferred until the agent finishes its turn.
        self._pending_end_actions: list[str] = []

    async def trigger_pending_actions(self) -> None:
        """Publish every deferred end-of-turn action, then clear the queue."""
        for action in self._pending_end_actions:
            # Reuse the single publish helper instead of duplicating the
            # payload-building code (the original repeated it inline).
            await self._publish_action_now(action)
        self._pending_end_actions.clear()

    async def _publish_action_now(self, action: str) -> None:
        """Broadcast one AVATAR_UPDATE data message to all room participants."""
        payload = json.dumps({"type": "AVATAR_UPDATE", "action": action})
        await self.room.local_participant.publish_data(payload, reliable=True)

    async def tts_node(self, text: AsyncIterable[str], model_settings):
        """Filter the text stream headed for TTS.

        Removes ``[action:...]`` tags from each chunk, dispatching or queuing
        the actions, and forwards the cleaned text to the parent TTS node.
        NOTE(review): a tag split across two stream chunks will not be
        detected — acceptable for short tags, but worth confirming upstream
        chunking behavior.
        """
        action_pattern = self._ACTION_PATTERN

        async def filtered_text_stream():
            async for chunk in text:
                if not chunk:
                    continue

                for action in action_pattern.findall(chunk):
                    action = action.strip()
                    if action.startswith("speaking"):
                        # Fire-and-forget: publish "speaking" immediately so
                        # the frontend animates while audio is generated.
                        asyncio.create_task(self._publish_action_now(action))
                    else:
                        # Gestures wait until the agent finishes speaking.
                        self._pending_end_actions.append(action)

                cleaned = action_pattern.sub("", chunk).strip()
                if cleaned:
                    yield cleaned + " "

        async for audio_frame in super().tts_node(filtered_text_stream(), model_settings):
            yield audio_frame
 
 
server = AgentServer()
 
 
@server.rtc_session()
async def simple_agent(ctx: JobContext):
    """Per-room entrypoint: connect, build the assistant, and run the session.

    Wires an OpenAI realtime LLM to a Cartesia TTS voice and flushes the
    assistant's queued gesture actions whenever the agent stops speaking.
    """
    await ctx.connect()

    chat_ctx = ChatContext()
    chat_ctx.add_message(role="system", content=DEFAULT_PROMPT)

    assistant = Assistant(room=ctx.room, chat_ctx=chat_ctx, instructions=DEFAULT_PROMPT)
    # BUG FIX: the original had an extra closing parenthesis after this call,
    # which was a SyntaxError — the module could not even be imported.
    session = AgentSession(
        llm=openai.realtime.RealtimeModel(model="gpt-realtime-mini"),
        tts="sonic-3:9626c31c-bec5-4cca-baa8-f8ba9e84c8bc",
    )

    @session.on("agent_state_changed")
    def on_agent_state_changed(ev):
        # Handle both event objects (with .new_state) and plain string events.
        state_str = str(ev.new_state).upper() if hasattr(ev, "new_state") else str(ev).upper()
        if "LISTENING" in state_str or "IDLE" in state_str:
            # The agent finished its turn: publish gesture tags queued by
            # Assistant.tts_node (event callbacks are sync, so schedule a task).
            asyncio.create_task(assistant.trigger_pending_actions())

    await session.start(agent=assistant, room=ctx.room)
 
 
if __name__ == "__main__":
    # Launch the agent worker via the LiveKit CLI runner.
    cli.run_app(server)

Best fit

Choose this model when you need complete control over:

  • Room creation and lifecycle
  • Region and infrastructure decisions
  • Custom observability
  • Enterprise auth and compliance workflows