| import sse from 'k6/x/sse' |
| import {check, sleep} from 'k6' |
| import {SharedArray} from 'k6/data' |
| import {Counter, Rate, Trend} from 'k6/metrics' |
| import exec from 'k6/execution'; |
|
|
| |
/**
 * Reads an integer benchmark setting from the k6 `__ENV` map.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value used when the variable is unset or empty.
 * @returns {number} The parsed base-10 integer, or `fallback`.
 * @throws {Error} When the variable is set but does not start with an integer.
 */
function envInt(name, fallback) {
    const raw = __ENV[name];
    if (!raw) {
        return fallback;
    }
    // Explicit radix: a value such as "0x10" must not be read as hexadecimal.
    const parsed = Number.parseInt(raw, 10);
    if (Number.isNaN(parsed)) {
        // A NaN limit would make every dataset comparison false and `slice(0, NaN)` empty,
        // so fail at init with a message that names the offending variable.
        throw new Error(`${name} must be an integer, got "${raw}"`);
    }
    return parsed;
}

// Base URL of the server API; `/chat/completions` is appended for each request.
const server_url = __ENV.SERVER_BENCH_URL || 'http://localhost:8080/v1';

// Maximum number of conversations kept from the dataset.
// Default is 600 / 10 * 8 = 480 -- presumably 10m run / ~10s per request * 8 VUs; confirm.
const n_prompt = envInt('SERVER_BENCH_N_PROMPTS', 600 / 10 * 8);

// Model name sent with each request.
const model = __ENV.SERVER_BENCH_MODEL_ALIAS || 'my-model';

// Path of the JSON conversation dataset.
const dataset_path = __ENV.SERVER_BENCH_DATASET || './ShareGPT_V3_unfiltered_cleaned_split.json';

// Value of `max_tokens` sent with each request.
const max_tokens = envInt('SERVER_BENCH_MAX_TOKENS', 512);

// Upper bound on the estimated prompt token count of a kept conversation.
const n_prompt_tokens = envInt('SERVER_BENCH_MAX_PROMPT_TOKENS', 1024);

// Upper bound on the estimated prompt + completion token count of a kept conversation.
const n_ctx_slot = envInt('SERVER_BENCH_MAX_CONTEXT', 2048);
|
|
/**
 * k6 `setup` export: logs the benchmark configuration through `console.info`.
 */
export function setup() {
    const settings = [
        ['server_url', server_url],
        ['n_prompt', n_prompt],
        ['model', model],
        ['dataset_path', dataset_path],
        ['max_tokens', max_tokens],
    ];
    // Rendered as space-separated `key=value` pairs after the fixed prefix.
    const summary = settings.map(([key, value]) => `${key}=${value}`).join(' ');
    console.info(`Benchmark config: ${summary}`);
}
|
|
// Conversations used as prompts, exposed through a k6 SharedArray named 'conversations'.
const data = new SharedArray('conversations', function () {
    // Crude token estimate: number of pieces after splitting on whitespace and , ' " . ?
    const estimateTokens = (text) => text.split(/[\s,'".?]/).length;

    const entries = JSON.parse(open(dataset_path));

    // Keep entries that have at least two turns...
    const multiTurn = entries.filter((entry) => entry["conversations"].length >= 2);
    // ...and whose first turn comes from the human side.
    const humanFirst = multiTurn.filter((entry) => entry["conversations"][0]["from"] === "human");

    // Reduce each entry to its first prompt plus the estimated sizes of the first two turns.
    const sized = humanFirst.map((entry) => {
        const turns = entry["conversations"];
        const promptText = turns[0]["value"];
        return {
            prompt: promptText,
            n_prompt_tokens: estimateTokens(promptText),
            n_completion_tokens: estimateTokens(turns[1]["value"]),
        };
    });

    // Drop sequences that are too short.
    const longEnough = sized.filter((conv) => conv.n_prompt_tokens >= 4 && conv.n_completion_tokens >= 4);

    // Drop sequences that exceed the prompt limit or the per-slot context limit.
    const fitting = longEnough.filter((conv) => {
        const totalTokens = conv.n_prompt_tokens + conv.n_completion_tokens;
        return conv.n_prompt_tokens <= n_prompt_tokens && totalTokens <= n_ctx_slot;
    });

    // Keep only the first n_prompt conversations.
    return fitting.slice(0, n_prompt);
});
|
|
// Per-request token counts, taken from the `usage` object of a streamed chunk.
const llamacpp_prompt_tokens = new Trend('llamacpp_prompt_tokens');
const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens');

// Per-request throughput in tokens per second (token count / elapsed ms * 1e3).
const llamacpp_tokens_second = new Trend('llamacpp_tokens_second');
const llamacpp_prompt_processing_second = new Trend('llamacpp_prompt_processing_second');

// Running totals of the same token counts.
const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter');
const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter');

// Share of requests whose finish_reason was 'length' and 'stop' respectively.
const llamacpp_completions_truncated_rate = new Rate('llamacpp_completions_truncated_rate');
const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate');
|
|
// Threshold on the share of completions that ended with finish_reason 'length':
// the rate must stay below 0.8; on failure the run is aborted (abortOnFail), with
// abort evaluation delayed by one minute (delayAbortEval).
const truncated_rate_thresholds = [
    {threshold: 'rate < 0.8', abortOnFail: true, delayAbortEval: '1m'},
];

// k6 run options: 8 virtual users for 10 minutes, guarded by the threshold above.
export const options = {
    thresholds: {
        llamacpp_completions_truncated_rate: truncated_rate_thresholds,
    },
    duration: '10m',
    vus: 8,
};
|
|
/**
 * k6 iteration: streams one chat completion over SSE and records token and
 * throughput metrics for it.
 *
 * The conversation is picked from `data` by iteration index, so prompts are
 * reused cyclically once the dataset is exhausted.
 *
 * @throws {Error} When `data` holds no conversation to send.
 */
export default function runChatCompletion() {
    if (data.length === 0) {
        // Without this guard the modulo below yields NaN and the failure is an opaque TypeError.
        throw new Error('No conversations available: the filtered dataset is empty');
    }

    const conversation = data[exec.scenario.iterationInInstance % data.length];
    const payload = {
        "messages": [
            {
                "role": "system",
                "content": "You are ChatGPT, an AI assistant.",
            },
            {
                "role": "user",
                "content": conversation.prompt,
            }
        ],
        "model": model,
        "stream": true,
        "seed": 42,
        "max_tokens": max_tokens,
        "stop": ["<|im_end|>"]
    };

    const params = {method: 'POST', body: JSON.stringify(payload)};

    const startTime = new Date();
    let promptEvalEndTime = null;
    let prompt_tokens = 0;
    let completions_tokens = 0;
    let finish_reason = null;

    const res = sse.open(`${server_url}/chat/completions`, params, function (client) {
        client.on('event', function (event) {
            // The first event received marks the end of prompt processing.
            if (promptEvalEndTime === null) {
                promptEvalEndTime = new Date();
            }

            // Skip empty events and the non-JSON `[DONE]` terminator used by
            // OpenAI-style streams; JSON.parse would throw on both.
            const raw = typeof event.data === 'string' ? event.data.trim() : '';
            if (raw === '' || raw === '[DONE]') {
                return;
            }

            const chunk = JSON.parse(raw);

            // A chunk may carry no choice at all (e.g. a usage-only chunk).
            const choice = chunk.choices && chunk.choices.length > 0 ? chunk.choices[0] : null;
            if (choice && choice.finish_reason) {
                finish_reason = choice.finish_reason;
            }

            if (chunk.usage) {
                prompt_tokens = chunk.usage.prompt_tokens;
                llamacpp_prompt_tokens.add(prompt_tokens);
                llamacpp_prompt_tokens_total_counter.add(prompt_tokens);

                completions_tokens = chunk.usage.completion_tokens;
                llamacpp_completion_tokens.add(completions_tokens);
                llamacpp_completion_tokens_total_counter.add(completions_tokens);
            }
        });

        client.on('error', function (e) {
            console.log('An unexpected error occurred: ', e.error());
            throw e;
        });
    });

    check(res, {'success completion': (r) => r.status === 200});

    const endTime = new Date();

    // Throughput is only meaningful when at least one event arrived.
    if (promptEvalEndTime !== null) {
        // Date subtraction yields milliseconds; * 1e3 converts to tokens per second.
        const promptEvalTime = promptEvalEndTime - startTime;
        // Same guard as for completions: no sample when the server reported no usage.
        if (prompt_tokens > 0 && promptEvalTime > 0) {
            llamacpp_prompt_processing_second.add(prompt_tokens / promptEvalTime * 1.e3);
        }

        const completion_time = endTime - promptEvalEndTime;
        if (completions_tokens > 0 && completion_time > 0) {
            llamacpp_tokens_second.add(completions_tokens / completion_time * 1.e3);
        }
    }

    llamacpp_completions_truncated_rate.add(finish_reason === 'length');
    llamacpp_completions_stop_rate.add(finish_reason === 'stop');

    sleep(0.3);
}
|
|