[Grammar] Integrate with XGrammar #635

Merged: 10 commits, Nov 22, 2024
Add TTFT and TPOT to usage extra
CharlieFRuan committed Nov 22, 2024
commit deb5a26d13be1ac858f288ed05ff3c8c4bdd6c3d
54 changes: 32 additions & 22 deletions src/engine.ts
@@ -687,24 +687,29 @@ export class MLCEngine implements MLCEngineInterface {
       const prefill_tokens_per_s = pipeline.getCurRoundPrefillTokensPerSec();
       const decode_tokens_per_s = pipeline.getCurRoundDecodingTokensPerSec();
       const grammar_init_s = pipeline.getCurRoundGrammarInitTotalTime();
+      const prefill_time = pipeline.getCurRoundPrefillTotalTime();
+      const decode_time = pipeline.getCurRoundDecodingTotalTime();
       const grammar_per_token_s =
         pipeline.getCurRoundGrammarPerTokenTotalTime();
+      const defaultExtra = {
+        prefill_tokens_per_s: prefill_tokens_per_s,
+        decode_tokens_per_s: decode_tokens_per_s,
+        time_to_first_token_s: prefill_time,
+        time_per_output_token_s: decode_time / completion_tokens,
+      };
       const usage: CompletionUsage = {
         completion_tokens: completion_tokens,
         prompt_tokens: prompt_tokens,
         total_tokens: completion_tokens + prompt_tokens,
         extra: usedGrammar
           ? {
-              prefill_tokens_per_s: prefill_tokens_per_s,
-              decode_tokens_per_s: decode_tokens_per_s,
-              grammar_init_ms: grammar_init_s * 1e3,
-              grammar_per_token_ms:
-                (grammar_per_token_s / completion_tokens) * 1e3,
+              ...defaultExtra,
+              ...{
+                grammar_init_s: grammar_init_s,
+                grammar_per_token_s: grammar_per_token_s / completion_tokens,
+              },
             }
-          : {
-              prefill_tokens_per_s: prefill_tokens_per_s,
-              decode_tokens_per_s: decode_tokens_per_s,
-            },
+          : defaultExtra,
       };
       if (isChatCompletion) {
         const usageChunk: ChatCompletionChunk = {
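
Note: a minimal sketch of how a client could read the new fields from this streaming path. It assumes the web-llm entry point `CreateMLCEngine` and usage reporting via `stream_options`; the model id and prompt are illustrative, not part of this diff.

// Sketch only: read TTFT/TPOT from the final usage chunk of a streamed
// chat completion. Names not in this diff are assumptions.
import { CreateMLCEngine } from "@mlc-ai/web-llm";

const engine = await CreateMLCEngine("Llama-3.1-8B-Instruct-q4f16_1-MLC");
const stream = await engine.chat.completions.create({
  messages: [{ role: "user", content: "Hello!" }],
  stream: true,
  stream_options: { include_usage: true },
});
for await (const chunk of stream) {
  if (chunk.usage) {
    // Only the last chunk carries usage; `extra` holds the fields added here.
    console.log("time_to_first_token_s:", chunk.usage.extra.time_to_first_token_s);
    console.log("time_per_output_token_s:", chunk.usage.extra.time_per_output_token_s);
  }
}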
@@ -811,8 +816,8 @@ export class MLCEngine implements MLCEngineInterface {
     let prompt_tokens = 0;
     let prefill_time = 0;
     let decode_time = 0;
-    let grammar_init_time = 0;
-    let grammar_per_token_time = 0;
+    let grammar_init_s = 0;
+    let grammar_per_token_s = 0;
     for (let i = 0; i < n; i++) {
       let outputMessage: string;
       if (this.interruptSignal) {
@@ -869,14 +874,20 @@ export class MLCEngine implements MLCEngineInterface {
       prompt_tokens += selectedPipeline.getCurRoundPrefillTotalTokens();
       prefill_time += selectedPipeline.getCurRoundPrefillTotalTime();
       decode_time += selectedPipeline.getCurRoundDecodingTotalTime();
-      grammar_init_time += selectedPipeline.getCurRoundGrammarInitTotalTime();
-      grammar_per_token_time +=
+      grammar_init_s += selectedPipeline.getCurRoundGrammarInitTotalTime();
+      grammar_per_token_s +=
         selectedPipeline.getCurRoundGrammarPerTokenTotalTime();
     }
     const usedGrammar =
       "response_format" in request &&
       (request.response_format?.type === "grammar" ||
         request.response_format?.type === "json_object");
+    const defaultExtra = {
+      prefill_tokens_per_s: prompt_tokens / prefill_time,
+      decode_tokens_per_s: completion_tokens / decode_time,
+      time_to_first_token_s: prefill_time,
+      time_per_output_token_s: decode_time / completion_tokens,
+    };
     const response: ChatCompletion = {
       id: crypto.randomUUID(),
       choices: choices,
@@ -889,16 +900,13 @@ export class MLCEngine implements MLCEngineInterface {
         total_tokens: completion_tokens + prompt_tokens,
         extra: usedGrammar
           ? {
-              prefill_tokens_per_s: prompt_tokens / prefill_time,
-              decode_tokens_per_s: completion_tokens / decode_time,
-              grammar_init_ms: grammar_init_time * 1e3,
-              grammar_per_token_ms:
-                (grammar_per_token_time / completion_tokens) * 1e3,
+              ...defaultExtra,
+              ...{
+                grammar_init_s: grammar_init_s,
+                grammar_per_token_s: grammar_per_token_s / completion_tokens,
+              },
             }
-          : {
-              prefill_tokens_per_s: prompt_tokens / prefill_time,
-              decode_tokens_per_s: completion_tokens / decode_time,
-            },
+          : defaultExtra,
       } as CompletionUsage,
     };
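
Note: a corresponding sketch for this non-streaming path. When `response_format` makes `usedGrammar` true, `extra` also carries the grammar timings, now in seconds rather than the previous milliseconds. `engine` is as in the earlier sketch; the prompt is illustrative.

// Sketch only: structured-output request, reading grammar timings
// alongside TTFT/TPOT from `usage.extra`.
const reply = await engine.chat.completions.create({
  messages: [{ role: "user", content: "Reply with a JSON object." }],
  response_format: { type: "json_object" },
});
const extra = reply.usage?.extra;
if (extra?.grammar_init_s !== undefined) {
  console.log("grammar_init_s:", extra.grammar_init_s);
  console.log("grammar_per_token_s:", extra.grammar_per_token_s);
}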

@@ -1022,6 +1030,8 @@ export class MLCEngine implements MLCEngineInterface {
         extra: {
           prefill_tokens_per_s: prompt_tokens / prefill_time,
           decode_tokens_per_s: completion_tokens / decode_time,
+          time_to_first_token_s: prefill_time,
+          time_per_output_token_s: decode_time / completion_tokens,
         },
       } as CompletionUsage,
     };
23 changes: 18 additions & 5 deletions src/openai_api_protocols/chat_completion.ts
@@ -934,15 +934,28 @@ export interface CompletionUsage {
     decode_tokens_per_s: number;

     /**
-     * Milliseconds spent on initializing grammar matcher for structured output.
+     * Seconds spent to generate the first token since receiving the request. Mainly contains
+     * prefilling overhead. If n > 1, it is the sum over all choices.
      */
-    grammar_init_ms?: number;
+    time_to_first_token_s: number;

     /**
-     * Milliseconds per-token that grammar matcher spent on creating bitmask and accepting token for
-     * structured output.
+     * Seconds in between generated tokens. Mainly contains decoding overhead. If n > 1, it
+     * is the average over all choices.
      */
-    grammar_per_token_ms?: number;
+    time_per_output_token_s: number;
+
+    /**
+     * Seconds spent on initializing grammar matcher for structured output. If n > 1, it
+     * is the sum over all choices.
+     */
+    grammar_init_s?: number;
+
+    /**
+     * Seconds per-token that grammar matcher spent on creating bitmask and accepting token for
+     * structured output. If n > 1, it is the average over all choices.
+     */
+    grammar_per_token_s?: number;
   };
 }
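
Note: because `time_per_output_token_s` is defined as total decode time divided by `completion_tokens`, the aggregate timings can be recovered from the documented fields. A small sketch under that reading; the helper function is hypothetical, not part of this diff.

// Sketch only: derive aggregate prefill/decode time from the new fields,
// following the doc comments above (TTFT is a sum, TPOT an average).
function summarizeTimings(usage: CompletionUsage): string {
  const prefillTime = usage.extra.time_to_first_token_s;
  const decodeTime = usage.extra.time_per_output_token_s * usage.completion_tokens;
  return `prefill ${prefillTime.toFixed(3)} s, decode ${decodeTime.toFixed(3)} s`;
}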
