Streaming the Output
For more interactive UIs during generation, you can stream output tokens.
info
Streaming is supported for LLMPipeline, VLMPipeline, and WhisperPipeline.
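The same callback pattern shown below also applies to the other pipelines. Here is a minimal Python sketch for Whisper; the model path and raw_speech (a list of 16 kHz float audio samples) are placeholders, and it assumes WhisperPipeline.generate accepts the same streamer keyword argument:

import openvino_genai as ov_genai

pipe = ov_genai.WhisperPipeline(whisper_model_path, "CPU")

def streamer(subword):
    # Print each decoded chunk as soon as it arrives.
    print(subword, end='', flush=True)
    return ov_genai.StreamingStatus.RUNNING

# raw_speech: list of 16 kHz float samples (placeholder input).
pipe.generate(raw_speech, streamer=streamer)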
Streaming Function
In this example, a streamer function prints words to the console as soon as they are generated:
- Python
- C++
import openvino_genai as ov_genai

pipe = ov_genai.LLMPipeline(model_path, "CPU")

# Create a streamer function
def streamer(subword):
    print(subword, end='', flush=True)
    # The returned status indicates whether generation should continue or stop.
    return ov_genai.StreamingStatus.RUNNING

pipe.start_chat()
while True:
    try:
        prompt = input('question:\n')
    except EOFError:
        break
    pipe.generate(prompt, streamer=streamer, max_new_tokens=100)
    print('\n----------\n')
pipe.finish_chat()
#include "openvino/genai/llm_pipeline.hpp"
#include <iostream>
int main(int argc, char* argv[]) {
std::string prompt;
std::string model_path = argv[1];
ov::genai::LLMPipeline pipe(model_path, "CPU");
// Create a streamer function
auto streamer = [](std::string word) {
std::cout << word << std::flush;
// Return flag corresponds whether generation should be stopped.
return ov::genai::StreamingStatus::RUNNING;
};
pipe.start_chat();
std::cout << "question:\n";
while (std::getline(std::cin, prompt)) {
pipe.generate(prompt, ov::genai::streamer(streamer), ov::genai::max_new_tokens(100));
std::cout << "\n----------\n"
"question:\n";
}
pipe.finish_chat();
}
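Returning a status other than RUNNING lets the streamer stop generation early. The Python sketch below is illustrative and assumes StreamingStatus.STOP is available in your openvino_genai version; the chunk limit is arbitrary:

import openvino_genai as ov_genai

pipe = ov_genai.LLMPipeline(model_path, "CPU")
chunks = []

def limited_streamer(subword):
    print(subword, end='', flush=True)
    chunks.append(subword)
    # Ask the pipeline to stop after 20 streamed chunks (arbitrary limit).
    if len(chunks) >= 20:
        return ov_genai.StreamingStatus.STOP
    return ov_genai.StreamingStatus.RUNNING

pipe.generate("Why is the sky blue?", streamer=limited_streamer, max_new_tokens=100)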
Custom Streamer Class
You can also implement a custom streamer class for more sophisticated processing:
- Python
- C++
import openvino_genai as ov_genai

pipe = ov_genai.LLMPipeline(model_path, "CPU")

# Create custom streamer class
class CustomStreamer(ov_genai.StreamerBase):
    def __init__(self):
        super().__init__()
        # Initialization logic.

    def write(self, token: int | list[int]) -> ov_genai.StreamingStatus:
        # Custom processing logic for new decoded token(s).
        # The returned status indicates whether generation should continue or stop.
        return ov_genai.StreamingStatus.RUNNING

    def end(self):
        # Custom finalization logic.
        pass

pipe.start_chat()
while True:
    try:
        prompt = input('question:\n')
    except EOFError:
        break
    pipe.generate(prompt, streamer=CustomStreamer(), max_new_tokens=100)
    print('\n----------\n')
pipe.finish_chat()
#include "openvino/genai/streamer_base.hpp"
#include "openvino/genai/llm_pipeline.hpp"
#include <iostream>
// Create custom streamer class
class CustomStreamer: public ov::genai::StreamerBase {
public:
ov::genai::StreamingStatus write(int64_t token) {
// Custom processing logic for new decoded token.
// Return flag corresponds whether generation should be stopped.
return ov::genai::StreamingStatus::RUNNING;
};
ov::genai::StreamingStatus write(const std::vector<int64_t>& tokens) {
// Custom processing logic for new vector of decoded tokens.
// Return flag corresponds whether generation should be stopped.
return ov::genai::StreamingStatus::RUNNING;
};
void end() {
// Custom finalization logic.
};
};
int main(int argc, char* argv[]) {
std::string prompt;
std::shared_ptr<CustomStreamer> custom_streamer;
std::string model_path = argv[1];
ov::genai::LLMPipeline pipe(model_path, "CPU");
pipe.start_chat();
std::cout << "question:\n";
while (std::getline(std::cin, prompt)) {
pipe.generate(prompt, ov::genai::streamer(custom_streamer), ov::genai::max_new_tokens(100));
std::cout << "\n----------\n"
"question:\n";
}
pipe.finish_chat();
}
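As a concrete Python illustration of the write/end hooks, the sketch below simply buffers token IDs and decodes them once generation has finished; the use of pipe.get_tokenizer() and the buffering strategy are assumptions for demonstration, not the approach used by the official samples:

import openvino_genai as ov_genai

pipe = ov_genai.LLMPipeline(model_path, "CPU")
tokenizer = pipe.get_tokenizer()

class CollectingStreamer(ov_genai.StreamerBase):
    def __init__(self):
        super().__init__()
        self.tokens = []

    def write(self, token):
        # token may be a single ID or a list of IDs; normalize to a list.
        self.tokens.extend(token if isinstance(token, list) else [token])
        return ov_genai.StreamingStatus.RUNNING

    def end(self):
        # Decode the accumulated IDs once generation has finished.
        print(tokenizer.decode(self.tokens))

pipe.generate("Why is the sky blue?", streamer=CollectingStreamer(), max_new_tokens=100)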
info
For a fully implemented iterable CustomStreamer, refer to the multinomial_causal_lm sample.