From 3103ae18b8035bd5046c2f72798dfcdc901b9c60 Mon Sep 17 00:00:00 2001 From: Raymond Date: Tue, 28 Mar 2023 18:43:02 +0800 Subject: [PATCH] Count file diff by token, not by length of string (#63) * 1.1.23 * 1.1.24 * feat(package.json): add @dqbd/tiktoken dependency refactor(generateCommitMessageFromGitDiff.ts): add tokenCount function to count the number of tokens in a string refactor(generateCommitMessageFromGitDiff.ts): change the way the length of INIT_MESSAGES_PROMPT is calculated to use tokenCount function refactor(generateCommitMessageFromGitDiff.ts): change the way the length of diff is calculated to use tokenCount function refactor(generateCommitMessageFromGitDiff.ts): rename function parameter from diff to fileDiff and update function calls accordingly feat(generateCommitMessageFromGitDiff.ts): add tokenCount function to count tokens in fileDiff and use it to check if fileDiff is bigger than MAX_REQ_TOKENS feat(utils): add tokenCount function to count the number of tokens in a string refactor(utils/mergeStrings.ts): use tokenCount function to count the number of tokens in a string instead of checking the length of the concatenated string --------- Co-authored-by: di-sukharev --- package-lock.json | 6 ++++++ package.json | 1 + src/generateCommitMessageFromGitDiff.ts | 21 ++++++++++----------- src/utils/mergeStrings.ts | 3 ++- src/utils/tokenCount.ts | 14 ++++++++++++++ 5 files changed, 33 insertions(+), 12 deletions(-) create mode 100644 src/utils/tokenCount.ts diff --git a/package-lock.json b/package-lock.json index 0cc093e..d799cb4 100644 --- a/package-lock.json +++ b/package-lock.json @@ -10,6 +10,7 @@ "license": "MIT", "dependencies": { "@clack/prompts": "^0.6.1", + "@dqbd/tiktoken": "^1.0.2", "axios": "^1.3.4", "chalk": "^5.2.0", "cleye": "^1.3.2", @@ -83,6 +84,11 @@ "node": ">=12" } }, + "node_modules/@dqbd/tiktoken": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/@dqbd/tiktoken/-/tiktoken-1.0.2.tgz", + "integrity": "sha512-AjGTBRWsMoVmVeN55NLyupyM8TNamOUBl6tj5t/leLDVup3CFGO9tVagNL1jf3GyZLkWZSTmYVbPQ/M2LEcNzw==" + }, "node_modules/@esbuild/android-arm": { "version": "0.15.18", "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.15.18.tgz", diff --git a/package.json b/package.json index 760b0c1..0ce4593 100644 --- a/package.json +++ b/package.json @@ -60,6 +60,7 @@ }, "dependencies": { "@clack/prompts": "^0.6.1", + "@dqbd/tiktoken": "^1.0.2", "axios": "^1.3.4", "chalk": "^5.2.0", "cleye": "^1.3.2", diff --git a/src/generateCommitMessageFromGitDiff.ts b/src/generateCommitMessageFromGitDiff.ts index 69100b0..af26956 100644 --- a/src/generateCommitMessageFromGitDiff.ts +++ b/src/generateCommitMessageFromGitDiff.ts @@ -6,6 +6,7 @@ import { api } from './api'; import { getConfig } from './commands/config'; import { mergeStrings } from './utils/mergeStrings'; import { i18n, I18nLocals } from './i18n'; +import { tokenCount } from './utils/tokenCount'; const config = getConfig(); const translation = i18n[(config?.language as I18nLocals) || 'en']; @@ -13,12 +14,10 @@ const translation = i18n[(config?.language as I18nLocals) || 'en']; const INIT_MESSAGES_PROMPT: Array = [ { role: ChatCompletionRequestMessageRoleEnum.System, - content: `You are to act as the author of a commit message in git. Your mission is to create clean and comprehensive commit messages in the conventional commit convention. I'll send you an output of 'git diff --staged' command, and you convert it into a commit message. ${ - config?.emoji - ? 'Use Gitmoji convention to preface the commit' - : 'Do not preface the commit with anything' - }, use the present tense. ${ - config?.description + content: `You are to act as the author of a commit message in git. Your mission is to create clean and comprehensive commit messages in the conventional commit convention. I'll send you an output of 'git diff --staged' command, and you convert it into a commit message. ${config?.emoji + ? 'Use Gitmoji convention to preface the commit' + : 'Do not preface the commit with anything' + }, use the present tense. ${config?.description ? 'Add a short description of what commit is about after the commit message. Don\'t start it with "This commit", just describe the changes.' : "Don't add any descriptions to the commit, only commit message." } Use ${translation.localLanguage} to answer.` @@ -80,16 +79,16 @@ interface GenerateCommitMessageError { } const INIT_MESSAGES_PROMPT_LENGTH = INIT_MESSAGES_PROMPT.map( - (msg) => msg.content -).join('').length; + (msg) => tokenCount(msg.content) + 4 +).reduce((a, b) => a + b, 0); const MAX_REQ_TOKENS = 3900 - INIT_MESSAGES_PROMPT_LENGTH; export const generateCommitMessageWithChatCompletion = async ( diff: string ): Promise => { - try { - if (diff.length >= MAX_REQ_TOKENS) { + try { + if (tokenCount(diff) >= MAX_REQ_TOKENS) { const commitMessagePromises = getCommitMsgsPromisesFromFileDiffs(diff); const commitMessages = await Promise.all(commitMessagePromises); @@ -144,7 +143,7 @@ function getCommitMsgsPromisesFromFileDiffs(diff: string) { const commitMessagePromises = []; for (const fileDiff of mergedFilesDiffs) { - if (fileDiff.length >= MAX_REQ_TOKENS) { + if (tokenCount(fileDiff) >= MAX_REQ_TOKENS) { // if file-diff is bigger than gpt context — split fileDiff into lineDiff const messagesPromises = getMessagesPromisesByLines(fileDiff, separator); diff --git a/src/utils/mergeStrings.ts b/src/utils/mergeStrings.ts index 7b55a99..ee35f50 100644 --- a/src/utils/mergeStrings.ts +++ b/src/utils/mergeStrings.ts @@ -1,8 +1,9 @@ +import { tokenCount } from './tokenCount' export function mergeStrings(arr: string[], maxStringLength: number): string[] { const mergedArr: string[] = []; let currentItem: string = arr[0]; for (const item of arr.slice(1)) { - if (currentItem.length + item.length <= maxStringLength) { + if (tokenCount(currentItem + item) <= maxStringLength) { currentItem += item; } else { mergedArr.push(currentItem); diff --git a/src/utils/tokenCount.ts b/src/utils/tokenCount.ts new file mode 100644 index 0000000..84e4f23 --- /dev/null +++ b/src/utils/tokenCount.ts @@ -0,0 +1,14 @@ +import { Tiktoken } from "@dqbd/tiktoken/lite" +import cl100k_base from "@dqbd/tiktoken/encoders/cl100k_base.json" assert{type: "json"} + +export function tokenCount(content: string): number { + const encoding = new Tiktoken( + cl100k_base.bpe_ranks, + cl100k_base.special_tokens, + cl100k_base.pat_str + ); + const tokens = encoding.encode(content); + encoding.free(); + + return tokens.length; +}