Feature

zStream

Real-time token streaming for responsive AI applications with minimal time-to-first-token.

Overview

zStream provides real-time token-by-token streaming for chat and completion endpoints, enabling responsive user experiences.

~50ms TTFT

Time to first token

SSE

Standard Server-Sent Events

OpenAI Compatible

Same streaming format

  • Sub-100ms time-to-first-token
  • OpenAI-compatible SSE format
  • Backpressure handling for slow clients
  • Graceful stream cancellation
  • Token usage stats in final chunk

Server-Sent Events

Enable streaming with stream: true in your request:

bash
# -N (--no-buffer) disables curl's output buffering so each SSE event
# is printed as soon as it arrives instead of in larger batches.
curl -N http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "qwen-7b",
    "messages": [{"role": "user", "content": "Tell me a story"}],
    "stream": true
  }'

Response format (each line prefixed with data: ):

json
data: {"id":"chat-1","object":"chat.completion.chunk","created":1234567890,"model":"qwen-7b","choices":[{"index":0,"delta":{"role":"assistant"},"finish_reason":null}]}
data: {"id":"chat-1","object":"chat.completion.chunk","created":1234567890,"model":"qwen-7b","choices":[{"index":0,"delta":{"content":"Once"},"finish_reason":null}]}
data: {"id":"chat-1","object":"chat.completion.chunk","created":1234567890,"model":"qwen-7b","choices":[{"index":0,"delta":{"content":" upon"},"finish_reason":null}]}
data: {"id":"chat-1","object":"chat.completion.chunk","created":1234567890,"model":"qwen-7b","choices":[{"index":0,"delta":{"content":" a"},"finish_reason":null}]}
data: {"id":"chat-1","object":"chat.completion.chunk","created":1234567890,"model":"qwen-7b","choices":[{"index":0,"delta":{},"finish_reason":"stop"}],"usage":{"prompt_tokens":15,"completion_tokens":42,"total_tokens":57}}
data: [DONE]
The final chunk includes finish_reason and usage stats.

Python Streaming

OpenAI Client

python
from openai import OpenAI
client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")
# Streaming chat completion
stream = client.chat.completions.create(
    model="qwen-7b",
    messages=[{"role": "user", "content": "Tell me a story"}],
    stream=True,
)
for chunk in stream:
    # Skip chunks that carry no text: the first chunk only sets the role and
    # the final chunk has an empty delta. The `chunk.choices` check also
    # tolerates a chunk whose choices list is empty.
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)

Native API

python
from zllm_zse import ZSE
model = ZSE("qwen-7b.zse")
# Generator-based streaming: print each yielded token as it arrives
for token in model.chat_stream([
    {"role": "user", "content": "Tell me a story"}
]):
    print(token, end="", flush=True)
# With callback
def on_token(token: str) -> None:
    """Print one streamed token immediately, without a trailing newline."""
    print(token, end="", flush=True)
model.chat(
    messages=[{"role": "user", "content": "Tell me a story"}],
    stream_callback=on_token,
)

Async Streaming

python
import asyncio
from openai import AsyncOpenAI
async def stream_chat():
    """Stream a chat completion from the local server and print it as it arrives.

    Creates an AsyncOpenAI client pointed at http://localhost:8000/v1,
    requests a streamed completion, and writes each content delta to
    stdout without buffering.
    """
    client = AsyncOpenAI(
        base_url="http://localhost:8000/v1",
        api_key="not-needed",
    )
    stream = await client.chat.completions.create(
        model="qwen-7b",
        messages=[{"role": "user", "content": "Tell me a story"}],
        stream=True,
    )
    async for chunk in stream:
        # Skip chunks without text (role-only first chunk, empty final delta);
        # the `chunk.choices` check also tolerates an empty choices list.
        if chunk.choices and chunk.choices[0].delta.content:
            print(chunk.choices[0].delta.content, end="", flush=True)
asyncio.run(stream_chat())

Client Integration

JavaScript/TypeScript

typescript
// Using OpenAI SDK
import OpenAI from 'openai';
const openai = new OpenAI({
baseURL: 'http://localhost:8000/v1',
apiKey: 'not-needed',
});
/**
 * Requests a streamed chat completion through the OpenAI SDK client and
 * writes each content delta to stdout as it is received.
 */
async function streamChat() {
  const request = {
    model: 'qwen-7b',
    messages: [{ role: 'user', content: 'Tell me a story' }],
    stream: true,
  } as const;
  const completionStream = await openai.chat.completions.create(request);
  for await (const { choices } of completionStream) {
    // Chunks without text (role-only or final chunk) contribute ''.
    const piece = choices[0]?.delta?.content || '';
    process.stdout.write(piece);
  }
}
typescript
// Using fetch and reading the streamed response body (manual SSE parsing)
/**
 * Posts a streaming chat request with fetch and writes each content delta
 * to stdout.
 *
 * Network reads do not align with SSE line boundaries, so incoming text is
 * accumulated in a buffer and only complete lines are parsed.
 *
 * @throws Error if the server responds with a non-2xx status or no body.
 */
async function streamWithFetch() {
  const response = await fetch('http://localhost:8000/v1/chat/completions', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      model: 'qwen-7b',
      messages: [{ role: 'user', content: 'Tell me a story' }],
      stream: true,
    }),
  });
  if (!response.ok || !response.body) {
    throw new Error(`Streaming request failed with status ${response.status}`);
  }
  const reader = response.body.getReader();
  const decoder = new TextDecoder();
  let buffer = '';
  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    // { stream: true } keeps a multi-byte character that is split across
    // two reads intact instead of decoding it as replacement characters.
    buffer += decoder.decode(value, { stream: true });
    const lines = buffer.split('\n');
    // The last element is an incomplete line (or ''); keep it for the next read.
    buffer = lines.pop() ?? '';
    for (const line of lines) {
      if (!line.startsWith('data: ')) continue;
      // trim() also drops a trailing '\r' when the server uses CRLF line endings.
      const data = line.slice(6).trim();
      if (data === '[DONE]') return;
      const chunk = JSON.parse(data);
      const content = chunk.choices[0]?.delta?.content || '';
      process.stdout.write(content);
    }
  }
}

React Integration

tsx
import { useState, useCallback } from 'react';
/**
 * Minimal chat UI: sends a message, streams the reply from
 * /v1/chat/completions, and appends each content delta to the displayed
 * response as it arrives.
 */
function ChatComponent() {
  const [response, setResponse] = useState('');
  const [loading, setLoading] = useState(false);
  const sendMessage = useCallback(async (message: string) => {
    setLoading(true);
    setResponse('');
    try {
      const res = await fetch('/v1/chat/completions', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
          model: 'qwen-7b',
          messages: [{ role: 'user', content: message }],
          stream: true,
        }),
      });
      if (!res.ok || !res.body) {
        throw new Error(`Streaming request failed with status ${res.status}`);
      }
      const reader = res.body.getReader();
      const decoder = new TextDecoder();
      // Network reads do not align with SSE lines, so buffer the text and
      // parse only complete lines; a partial JSON payload would make
      // JSON.parse throw.
      let buffer = '';
      let finished = false;
      while (!finished) {
        const { done, value } = await reader.read();
        if (done) break;
        // { stream: true } preserves multi-byte characters split across reads.
        buffer += decoder.decode(value, { stream: true });
        const lines = buffer.split('\n');
        buffer = lines.pop() ?? '';
        for (const line of lines) {
          if (!line.startsWith('data: ')) continue;
          const data = line.slice(6).trim();
          if (data === '[DONE]') {
            finished = true;
            break;
          }
          const chunk = JSON.parse(data);
          const content = chunk.choices[0]?.delta?.content || '';
          if (content) setResponse(prev => prev + content);
        }
      }
    } catch (err) {
      // Log the failure instead of leaving an unhandled promise rejection.
      console.error('Chat stream failed:', err);
    } finally {
      // Always leave the loading state, even if the request or parsing failed.
      setLoading(false);
    }
  }, []);
  return (
    <div>
      <div>{response}</div>
      {/* Disabled while streaming so two replies cannot interleave. */}
      <button disabled={loading} onClick={() => sendMessage('Hello!')}>
        {loading ? 'Generating...' : 'Send'}
      </button>
    </div>
  );
}

Chunked Responses

Control how tokens are grouped in streaming responses:

bash
# Stream every token (default)
curl ... -d '{"stream": true}'
# Stream every N tokens
curl ... -d '{"stream": true, "stream_options": {"chunk_size": 5}}'
# Stream by words (space-delimited)
curl ... -d '{"stream": true, "stream_options": {"chunk_by": "word"}}'
# Stream by sentences
curl ... -d '{"stream": true, "stream_options": {"chunk_by": "sentence"}}'
Larger chunk sizes reduce network overhead but increase perceived latency. Token-by-token streaming provides the best UX for chat applications.

Python configuration:

python
# Stream configuration
messages = [{"role": "user", "content": "Hello"}]
response = model.chat_stream(
    messages=messages,
    chunk_size=1,  # Tokens per chunk
    include_usage=True,  # Include token counts
)