Use OpenVINO GenAI in Chat Scenario
For chat applications, OpenVINO GenAI provides optimizations that maintain conversation context and improve performance by reusing the KV-cache.
Refer to the How It Works section for more information about the KV-cache.
Chat mode is supported for both LLMPipeline and VLMPipeline.
ChatHistory
ChatHistory stores conversation messages and optional metadata for chat templates.
Messages are stored as JSON-like objects, so ChatHistory supports nested message structures with any field names your model or chat template requires, not just the basic "role" and "content" fields.
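For example, a message can carry fields beyond "role" and "content". The minimal Python sketch below is illustrative only: the tool_calls and tool-message fields shown here are hypothetical and should match whatever structure your model's chat template actually defines.
import openvino_genai as ov_genai

chat_history = ov_genai.ChatHistory()
chat_history.append({"role": "user", "content": "What is the weather in Paris?"})
# A message with a nested, template-specific field (illustrative structure)
chat_history.append({
    "role": "assistant",
    "content": "",
    "tool_calls": [
        {"type": "function", "function": {"name": "get_weather", "arguments": "{\"city\": \"Paris\"}"}}
    ]
})
# A tool result message, again using fields defined by the chat template
chat_history.append({"role": "tool", "content": "{\"temperature_c\": 21}"})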
A simple chat example (with grouped beam search decoding):
- Python
- C++
- JavaScript
import openvino_genai as ov_genai
pipe = ov_genai.LLMPipeline(model_path, 'CPU')

config = {'max_new_tokens': 100, 'num_beam_groups': 3, 'num_beams': 15, 'diversity_penalty': 1.5}
pipe.set_generation_config(config)

chat_history = ov_genai.ChatHistory()

while True:
    try:
        prompt = input('question:\n')
    except EOFError:
        break
    chat_history.append({"role": "user", "content": prompt})
    decoded_results = pipe.generate(chat_history)
    # Add assistant's response to chat history
    chat_history.append({"role": "assistant", "content": decoded_results.texts[0]})
    print('answer:\n')
    print(decoded_results.texts[0])
    print('\n----------\n')
#include "openvino/genai/llm_pipeline.hpp"
#include <iostream>
int main(int argc, char* argv[]) {
std::string prompt;
std::string model_path = argv[1];
ov::genai::LLMPipeline pipe(model_path, "CPU");
ov::genai::GenerationConfig config;
config.max_new_tokens = 100;
config.num_beam_groups = 3;
config.num_beams = 15;
config.diversity_penalty = 1.0f;
ov::genai::ChatHistory chat_history;
std::cout << "question:\n";
while (std::getline(std::cin, prompt)) {
chat_history.push_back({{"role", "user"}, {"content", std::move(prompt)}});
auto decoded_results = pipe.generate(chat_history, config);
// Add assistant's response to chat history
chat_history.push_back({{"role", "assistant"}, {"content", std::move(decoded_results.texts[0])}});
std::cout << "answer:\n";
std::cout << decoded_results.texts[0] << std::endl;
std::cout << "\n----------\n"
"question:\n";
}
}
import { LLMPipeline, ChatHistory } from "openvino-genai-node";
import readline from 'readline';

const pipe = await LLMPipeline(model_path, 'CPU');
const config = {
    max_new_tokens: 100,
    num_beam_groups: 3,
    num_beams: 15,
    diversity_penalty: 1.5
};

const chatHistory = new ChatHistory();

const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
});

console.log('question:');
rl.on('line', async (prompt) => {
    chatHistory.push({ role: 'user', content: prompt });
    const decodedResults = await pipe.generate(chatHistory, config);
    // Add assistant's response to chat history
    chatHistory.push({ role: 'assistant', content: decodedResults.toString() });
    console.log('answer:');
    console.log(decodedResults.toString());
    console.log('\n----------\nquestion:');
});
rl.on('close', async () => {
    process.exit(0);
});
ChatHistory messages are not updated automatically when using pipe.generate().
You need to manually append user prompts and model responses to the ChatHistory instance as shown in the examples above.
System Prompt
Add a system message at the beginning to set the assistant's behavior:
- Python
- C++
- JavaScript
import openvino_genai as ov_genai
chat_history = ov_genai.ChatHistory()
chat_history.append({"role": "system", "content": "You are a helpful assistant."})
# Or using constructor
chat_history = ov_genai.ChatHistory([
    {"role": "system", "content": "You are a helpful assistant."}
])
#include "openvino/genai/chat_history.hpp"
ov::genai::ChatHistory chat_history;
chat_history.push_back({{"role", "system"}, {"content", "You are a helpful assistant."}});
// Or using constructor
ov::genai::ChatHistory chat_history({
    {{"role", "system"}, {"content", "You are a helpful assistant."}}
});
import { ChatHistory } from "openvino-genai-node";
const chatHistory = new ChatHistory();
chatHistory.push({ role: 'system', content: 'You are a helpful assistant.' });
// Or using constructor
const chatHistory = new ChatHistory([
    { role: 'system', content: 'You are a helpful assistant.' }
]);
Chat History Metadata
Additionally, ChatHistory manages optional metadata for consistent chat template application:
- Tool definitions for function calling and agentic scenarios
- Custom chat template variables (e.g. enable_thinking for models with extended reasoning, such as Qwen3)
- Python
- C++
- JavaScript
import openvino_genai as ov_genai
import json
chat_history = ov_genai.ChatHistory()
chat_history.append({"role": "system", "content": system_prompt})
# Load tools from JSON string
tools: list[dict] = json.loads("...")
# Set tools definitions
chat_history.set_tools(tools)
# Set custom chat template variables
chat_history.set_extra_context({ "enable_thinking": True })
chat_history.append({"role": "user", "content": user_prompt})
decoded_results = pipe.generate(chat_history, config)
# Add assistant's response to chat history
chat_history.append({"role": "assistant", "content": decoded_results.texts[0]})
#include "openvino/genai/chat_history.hpp"
ov::genai::ChatHistory chat_history;
chat_history.push_back({{"role", "system"}, {"content", std::move(system_prompt)}});
// Load tools from JSON string
ov::genai::JsonContainer tools = ov::genai::JsonContainer::from_json_string("...");
// Set tools definitions
chat_history.set_tools(tools);
// Set custom chat template variables
chat_history.set_extra_context({{"enable_thinking", true}});
chat_history.push_back({{"role", "user"}, {"content", std::move(user_prompt)}});
auto decoded_results = pipe.generate(chat_history, config);
// Add assistant's response to chat history
chat_history.push_back({{"role", "assistant"}, {"content", std::move(decoded_results.texts[0])}});
import { ChatHistory } from "openvino-genai-node";
const chatHistory = new ChatHistory();
chatHistory.push({ role: 'system', content: systemPrompt });
// Load tools from JSON string
const tools = JSON.parse("...");
// Set tools definitions
chatHistory.setTools(tools);
// Set custom chat template variables
chatHistory.setExtraContext({ enable_thinking: true });
chatHistory.push({ role: 'user', content: userPrompt });
const decodedResults = await pipe.generate(chatHistory, config);
// Add assistant's response to chat history
chatHistory.push({ role: 'assistant', content: decodedResults.toString() });
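The tools payload in the snippets above is elided. As a non-authoritative reference, the Python sketch below uses the OpenAI-style function-calling schema that many chat templates accept; the exact structure your model requires depends on its chat template, so treat the field layout as an assumption to verify.
import json
import openvino_genai as ov_genai

# Illustrative tools definition (OpenAI-style schema; adjust to your model's chat template)
tools = json.loads("""
[
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a city.",
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {"type": "string", "description": "City name"}
                },
                "required": ["city"]
            }
        }
    }
]
""")

chat_history = ov_genai.ChatHistory()
chat_history.set_tools(tools)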
start_chat() / finish_chat() API
The start_chat() / finish_chat() API is deprecated and will be removed in the next major release. Use ChatHistory to manage chat conversations instead.
A simple chat example (with grouped beam search decoding):
- Python
- C++
- JavaScript
import openvino_genai as ov_genai
pipe = ov_genai.LLMPipeline(model_path, 'CPU')

config = {'max_new_tokens': 100, 'num_beam_groups': 3, 'num_beams': 15, 'diversity_penalty': 1.5}
pipe.set_generation_config(config)

pipe.start_chat()
while True:
    try:
        prompt = input('question:\n')
    except EOFError:
        break
    answer = pipe.generate(prompt)
    print('answer:\n')
    print(answer)
    print('\n----------\n')
pipe.finish_chat()
#include "openvino/genai/llm_pipeline.hpp"
#include <iostream>
int main(int argc, char* argv[]) {
std::string prompt;
std::string model_path = argv[1];
ov::genai::LLMPipeline pipe(model_path, "CPU");
ov::genai::GenerationConfig config;
config.max_new_tokens = 100;
config.num_beam_groups = 3;
config.num_beams = 15;
config.diversity_penalty = 1.0f;
pipe.start_chat();
std::cout << "question:\n";
while (std::getline(std::cin, prompt)) {
std::cout << "answer:\n";
auto answer = pipe.generate(prompt, config);
std::cout << answer << std::endl;
std::cout << "\n----------\n"
"question:\n";
}
pipe.finish_chat();
}
import { LLMPipeline } from "openvino-genai-node";
import readline from 'readline';

const pipe = await LLMPipeline(model_path, 'CPU');
const config = {
    max_new_tokens: 100,
    num_beam_groups: 3,
    num_beams: 15,
    diversity_penalty: 1.5
};

await pipe.startChat();

const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
});

console.log('question:');
rl.on('line', async (prompt) => {
    console.log('answer:');
    const answer = await pipe.generate(prompt, config);
    console.log(answer);
    console.log('\n----------\nquestion:');
});
rl.on('close', async () => {
    await pipe.finishChat();
    process.exit(0);
});
For more information, refer to the Python, C++, and JavaScript chat samples.