Feature

zStream

Real-time token streaming for responsive AI applications with minimal time-to-first-token.

Overview

zStream provides real-time token-by-token streaming for chat and completion endpoints, enabling responsive user experiences.

~50ms TTFT

Time to first token

SSE

Standard Server-Sent Events

OpenAI Compatible

Same streaming format

Sub-100ms time-to-first-token
OpenAI-compatible SSE format
Backpressure handling for slow clients
Graceful stream cancellation
Token usage stats in final chunk

Server-Sent Events

Enable streaming with stream: true in your request:

bash

curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "qwen-7b",
    "messages": [{"role": "user", "content": "Tell me a story"}],
    "stream": true
  }'

Response format (each line prefixed with data: ):

json

data: {"id":"chat-1","object":"chat.completion.chunk","created":1234567890,"model":"qwen-7b","choices":[{"index":0,"delta":{"role":"assistant"},"finish_reason":null}]}
 
data: {"id":"chat-1","object":"chat.completion.chunk","created":1234567890,"model":"qwen-7b","choices":[{"index":0,"delta":{"content":"Once"},"finish_reason":null}]}
 
data: {"id":"chat-1","object":"chat.completion.chunk","created":1234567890,"model":"qwen-7b","choices":[{"index":0,"delta":{"content":" upon"},"finish_reason":null}]}
 
data: {"id":"chat-1","object":"chat.completion.chunk","created":1234567890,"model":"qwen-7b","choices":[{"index":0,"delta":{"content":" a"},"finish_reason":null}]}
 
data: {"id":"chat-1","object":"chat.completion.chunk","created":1234567890,"model":"qwen-7b","choices":[{"index":0,"delta":{},"finish_reason":"stop"}],"usage":{"prompt_tokens":15,"completion_tokens":42,"total_tokens":57}}
 
data: [DONE]

The final chunk includes finish_reason and usage stats.

Python Streaming

OpenAI Client

python

from openai import OpenAI
 
client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")
 
# Streaming chat completion
stream = client.chat.completions.create(
    model="qwen-7b",
    messages=[{"role": "user", "content": "Tell me a story"}],
    stream=True
)
 
for chunk in stream:
    if chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)

Native API

python

from zllm_zse import ZSE
 
model = ZSE("qwen-7b.zse")
 
# Generator-based streaming
for token in model.chat_stream([
    {"role": "user", "content": "Tell me a story"}
]):
    print(token, end="", flush=True)
 
# With callback
def on_token(token: str) -> None:
    """Print one streamed token immediately, without a trailing newline."""
    print(token, end="", flush=True)
 
model.chat(
    messages=[{"role": "user", "content": "Tell me a story"}],
    stream_callback=on_token
)

Async Streaming

python

import asyncio
from openai import AsyncOpenAI
 
async def stream_chat():
    """Stream a chat completion from the local server and echo text as it arrives."""
    client = AsyncOpenAI(
        api_key="not-needed",
        base_url="http://localhost:8000/v1",
    )

    stream = await client.chat.completions.create(
        stream=True,
        model="qwen-7b",
        messages=[{"role": "user", "content": "Tell me a story"}],
    )

    async for chunk in stream:
        text = chunk.choices[0].delta.content
        if not text:
            # Role-only and final chunks carry no content; nothing to print.
            continue
        print(text, end="", flush=True)
 
asyncio.run(stream_chat())

Client Integration

JavaScript/TypeScript

typescript

// Using OpenAI SDK
import OpenAI from 'openai';
 
const openai = new OpenAI({
  baseURL: 'http://localhost:8000/v1',
  apiKey: 'not-needed',
});
 
/** Streams a chat completion via the OpenAI SDK and writes each text delta to stdout. */
async function streamChat() {
  const stream = await openai.chat.completions.create({
    stream: true,
    model: 'qwen-7b',
    messages: [{ role: 'user', content: 'Tell me a story' }],
  });

  // Each chunk carries at most one text delta; chunks without text write ''.
  for await (const { choices } of stream) {
    const piece = choices[0]?.delta?.content || '';
    process.stdout.write(piece);
  }
}

typescript

// Using fetch with a ReadableStream reader (EventSource only supports GET,
// so it cannot send the POST body this endpoint requires)
/**
 * Streams a chat completion over SSE with fetch and writes each text delta
 * to stdout.
 *
 * Network reads do not align with SSE line boundaries, so any incomplete
 * trailing line is buffered until the rest of it arrives.
 *
 * @throws Error if the server responds with a non-2xx status or no body.
 */
async function streamWithFetch() {
  const response = await fetch('http://localhost:8000/v1/chat/completions', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      model: 'qwen-7b',
      messages: [{ role: 'user', content: 'Tell me a story' }],
      stream: true,
    }),
  });

  // fetch only rejects on network failure; HTTP errors must be checked explicitly.
  if (!response.ok || !response.body) {
    throw new Error(`Request failed: ${response.status} ${response.statusText}`);
  }

  const reader = response.body.getReader();
  const decoder = new TextDecoder();
  let buffer = '';

  while (true) {
    const { done, value } = await reader.read();
    if (done) break;

    // stream: true keeps multi-byte UTF-8 sequences intact when they are
    // split across two reads.
    buffer += decoder.decode(value, { stream: true });

    // Everything before the last '\n' is complete; the remainder is a partial
    // line (or '' when the read ended exactly on a newline) and is kept for
    // the next iteration.
    const lines = buffer.split('\n');
    buffer = lines.pop() ?? '';

    for (const rawLine of lines) {
      const line = rawLine.trimEnd(); // tolerate \r\n line endings
      if (!line.startsWith('data: ')) continue;

      const data = line.slice(6);
      if (data === '[DONE]') return;

      const chunk = JSON.parse(data);
      const content = chunk.choices[0]?.delta?.content || '';
      process.stdout.write(content);
    }
  }
}

React Integration

tsx

import { useState, useCallback } from 'react';
 
/**
 * Minimal chat UI that sends one message and renders the streamed reply
 * incrementally as SSE chunks arrive.
 */
function ChatComponent() {
  const [response, setResponse] = useState('');
  const [loading, setLoading] = useState(false);

  const sendMessage = useCallback(async (message: string) => {
    setLoading(true);
    setResponse('');

    try {
      const res = await fetch('/v1/chat/completions', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
          model: 'qwen-7b',
          messages: [{ role: 'user', content: message }],
          stream: true,
        }),
      });

      // fetch only rejects on network failure; HTTP errors must be checked explicitly.
      if (!res.ok || !res.body) {
        throw new Error(`Request failed: ${res.status} ${res.statusText}`);
      }

      const reader = res.body.getReader();
      const decoder = new TextDecoder();
      let buffer = '';
      let finished = false;

      while (!finished) {
        const { done, value } = await reader.read();
        if (done) break;

        // stream: true keeps multi-byte UTF-8 sequences intact across reads.
        buffer += decoder.decode(value, { stream: true });

        // Reads do not align with SSE lines: process only complete lines and
        // keep the trailing partial line for the next read.
        const lines = buffer.split('\n');
        buffer = lines.pop() ?? '';

        for (const rawLine of lines) {
          const line = rawLine.trimEnd(); // tolerate \r\n line endings
          if (!line.startsWith('data: ')) continue;

          const data = line.slice(6);
          if (data === '[DONE]') {
            finished = true;
            break;
          }

          const chunk = JSON.parse(data);
          const content = chunk.choices[0]?.delta?.content || '';
          if (content) {
            setResponse(prev => prev + content);
          }
        }
      }
    } finally {
      // Always leave the loading state, even if the request or parsing fails.
      setLoading(false);
    }
  }, []);

  return (
    <div>
      <div>{response}</div>
      <button onClick={() => sendMessage('Hello!')}>
        {loading ? 'Generating...' : 'Send'}
      </button>
    </div>
  );
}

Chunked Responses

Control how tokens are grouped in streaming responses:

bash

# Stream every token (default)
curl ... -d '{"stream": true}'
 
# Stream every N tokens
curl ... -d '{"stream": true, "stream_options": {"chunk_size": 5}}'
 
# Stream by words (space-delimited)
curl ... -d '{"stream": true, "stream_options": {"chunk_by": "word"}}'
 
# Stream by sentences
curl ... -d '{"stream": true, "stream_options": {"chunk_by": "sentence"}}'

Larger chunk sizes reduce network overhead but increase perceived latency. Token-by-token streaming provides the best UX for chat applications.

Python configuration:

python

# Stream configuration
response = model.chat_stream(
    messages=[{"role": "user", "content": "Hello"}],
    chunk_size=1,        # Tokens per chunk
    include_usage=True,  # Include token counts
)

← zInfer

zKV →