zStream
Real-time token streaming for responsive AI applications with minimal time-to-first-token.
Overview
zStream provides real-time token-by-token streaming for chat and completion endpoints, enabling responsive user experiences.
~50ms TTFT
Time to first token
SSE
Standard Server-Sent Events
OpenAI Compatible
Same streaming format
- Sub-100ms time-to-first-token
- OpenAI-compatible SSE format
- Backpressure handling for slow clients
- Graceful stream cancellation
- Token usage stats in final chunk
Server-Sent Events
Enable streaming with stream: true in your request:
bash
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "qwen-7b",
    "messages": [{"role": "user", "content": "Tell me a story"}],
    "stream": true
  }'

Response format (each line prefixed with data: ):
json
data: {"id":"chat-1","object":"chat.completion.chunk","created":1234567890,"model":"qwen-7b","choices":[{"index":0,"delta":{"role":"assistant"},"finish_reason":null}]}

data: {"id":"chat-1","object":"chat.completion.chunk","created":1234567890,"model":"qwen-7b","choices":[{"index":0,"delta":{"content":"Once"},"finish_reason":null}]}

data: {"id":"chat-1","object":"chat.completion.chunk","created":1234567890,"model":"qwen-7b","choices":[{"index":0,"delta":{"content":" upon"},"finish_reason":null}]}

data: {"id":"chat-1","object":"chat.completion.chunk","created":1234567890,"model":"qwen-7b","choices":[{"index":0,"delta":{"content":" a"},"finish_reason":null}]}

data: {"id":"chat-1","object":"chat.completion.chunk","created":1234567890,"model":"qwen-7b","choices":[{"index":0,"delta":{},"finish_reason":"stop"}],"usage":{"prompt_tokens":15,"completion_tokens":42,"total_tokens":57}}

data: [DONE]

The final chunk includes finish_reason and usage stats.

Python Streaming
OpenAI Client
python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")

# Streaming chat completion
stream = client.chat.completions.create(
    model="qwen-7b",
    messages=[{"role": "user", "content": "Tell me a story"}],
    stream=True,
)

for chunk in stream:
    if chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)

Native API
python
from zllm_zse import ZSE

model = ZSE("qwen-7b.zse")

# Generator-based streaming
for token in model.chat_stream([
    {"role": "user", "content": "Tell me a story"}
]):
    print(token, end="", flush=True)

# With callback
def on_token(token: str):
    print(token, end="", flush=True)

model.chat(
    messages=[{"role": "user", "content": "Tell me a story"}],
    stream_callback=on_token,
)

Async Streaming
python
import asyncio
from openai import AsyncOpenAI

async def stream_chat():
    client = AsyncOpenAI(
        base_url="http://localhost:8000/v1",
        api_key="not-needed",
    )
    stream = await client.chat.completions.create(
        model="qwen-7b",
        messages=[{"role": "user", "content": "Tell me a story"}],
        stream=True,
    )
    async for chunk in stream:
        if chunk.choices[0].delta.content:
            print(chunk.choices[0].delta.content, end="", flush=True)

asyncio.run(stream_chat())

Client Integration
JavaScript/TypeScript
typescript
// Using OpenAI SDK
import OpenAI from 'openai';

const openai = new OpenAI({
  baseURL: 'http://localhost:8000/v1',
  apiKey: 'not-needed',
});

async function streamChat() {
  const stream = await openai.chat.completions.create({
    model: 'qwen-7b',
    messages: [{ role: 'user', content: 'Tell me a story' }],
    stream: true,
  });
  for await (const chunk of stream) {
    const content = chunk.choices[0]?.delta?.content || '';
    process.stdout.write(content);
  }
}

typescript
// Using fetch with a streaming reader
async function streamWithFetch() {
  const response = await fetch('http://localhost:8000/v1/chat/completions', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      model: 'qwen-7b',
      messages: [{ role: 'user', content: 'Tell me a story' }],
      stream: true,
    }),
  });
  const reader = response.body!.getReader();
  const decoder = new TextDecoder();
  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    const text = decoder.decode(value);
    const lines = text.split('\n').filter(line => line.startsWith('data: '));
    for (const line of lines) {
      const data = line.slice(6);
      if (data === '[DONE]') return;
      const chunk = JSON.parse(data);
      const content = chunk.choices[0]?.delta?.content || '';
      process.stdout.write(content);
    }
  }
}

React Integration
tsx
import { useState, useCallback } from 'react';

function ChatComponent() {
  const [response, setResponse] = useState('');
  const [loading, setLoading] = useState(false);

  const sendMessage = useCallback(async (message: string) => {
    setLoading(true);
    setResponse('');
    const res = await fetch('/v1/chat/completions', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        model: 'qwen-7b',
        messages: [{ role: 'user', content: message }],
        stream: true,
      }),
    });
    const reader = res.body!.getReader();
    const decoder = new TextDecoder();
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      const text = decoder.decode(value);
      // Parse SSE and update state
      const matches = text.matchAll(/data: ({.*})/g);
      for (const match of matches) {
        const chunk = JSON.parse(match[1]);
        const content = chunk.choices[0]?.delta?.content || '';
        setResponse(prev => prev + content);
      }
    }
    setLoading(false);
  }, []);

  return (
    <div>
      <div>{response}</div>
      <button onClick={() => sendMessage('Hello!')}>
        {loading ? 'Generating...' : 'Send'}
      </button>
    </div>
  );
}

Chunked Responses
Control how tokens are grouped in streaming responses:
bash
# Stream every token (default)
curl ... -d '{"stream": true}'

# Stream every N tokens
curl ... -d '{"stream": true, "stream_options": {"chunk_size": 5}}'

# Stream by words (space-delimited)
curl ... -d '{"stream": true, "stream_options": {"chunk_by": "word"}}'

# Stream by sentences
curl ... -d '{"stream": true, "stream_options": {"chunk_by": "sentence"}}'

Larger chunk sizes reduce network overhead but increase perceived latency. Token-by-token streaming provides the best UX for chat applications.
Python configuration:
python
# Stream configuration
response = model.chat_stream(
    messages=[{"role": "user", "content": "Hello"}],
    chunk_size=1,        # Tokens per chunk
    include_usage=True,  # Include token counts
)