Chat with a language model running entirely in your browser via WebGPU. No server, no API keys — every token is generated on your device. First run downloads ~200 MB. Supports multi-turn conversation.
// Create a worker-backed pool with the WebLLM adapter.
// NOTE(review): `createPool`, `webLlmAdapter`, and `workerUrl` are project-provided
// symbols not shown here — confirm their contracts against the library docs.
const pool = await createPool({
adapter: webLlmAdapter(),
workerUrl,
});
// Load a text-generation model into the pool; per the note above, the first run
// downloads the model weights (~200 MB), so this await may take a while.
const llm = await pool.load<string>('text-generation', {
model: 'SmolLM2-360M-Instruct-q4f16_1-MLC',
});
// Start a streaming completion for a single user turn.
const stream = llm.stream({
messages: [{ role: 'user', content: 'Hello!' }],
});
// Consume tokens as they are generated — presumably `readableToAsyncIter`
// adapts the stream to an async iterable; verify against its definition.
for await (const token of readableToAsyncIter(stream)) { ... }