Count file diff by token, not by length of string (#63)

* 1.1.23

* 1.1.24

* feat(package.json): add @dqbd/tiktoken dependency

refactor(generateCommitMessageFromGitDiff.ts): add tokenCount function to count the number of tokens in a string
refactor(generateCommitMessageFromGitDiff.ts): change the way the length of INIT_MESSAGES_PROMPT is calculated to use tokenCount function
refactor(generateCommitMessageFromGitDiff.ts): change the way the length of diff is calculated to use tokenCount function

refactor(generateCommitMessageFromGitDiff.ts): rename function parameter from diff to fileDiff and update function calls accordingly
feat(generateCommitMessageFromGitDiff.ts): add tokenCount function to count tokens in fileDiff and use it to check if fileDiff is bigger than MAX_REQ_TOKENS

feat(utils): add tokenCount function to count the number of tokens in a string
refactor(utils/mergeStrings.ts): use tokenCount function to count the number of tokens in a string instead of checking the length of the concatenated string

---------

Co-authored-by: di-sukharev <dim.sukharev@gmail.com>
This commit is contained in:
Raymond
2023-03-28 18:43:02 +08:00
committed by GitHub
parent 7c9feba3ba
commit 3103ae18b8
5 changed files with 33 additions and 12 deletions

6
package-lock.json generated
View File

@@ -10,6 +10,7 @@
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"@clack/prompts": "^0.6.1", "@clack/prompts": "^0.6.1",
"@dqbd/tiktoken": "^1.0.2",
"axios": "^1.3.4", "axios": "^1.3.4",
"chalk": "^5.2.0", "chalk": "^5.2.0",
"cleye": "^1.3.2", "cleye": "^1.3.2",
@@ -83,6 +84,11 @@
"node": ">=12" "node": ">=12"
} }
}, },
"node_modules/@dqbd/tiktoken": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/@dqbd/tiktoken/-/tiktoken-1.0.2.tgz",
"integrity": "sha512-AjGTBRWsMoVmVeN55NLyupyM8TNamOUBl6tj5t/leLDVup3CFGO9tVagNL1jf3GyZLkWZSTmYVbPQ/M2LEcNzw=="
},
"node_modules/@esbuild/android-arm": { "node_modules/@esbuild/android-arm": {
"version": "0.15.18", "version": "0.15.18",
"resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.15.18.tgz", "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.15.18.tgz",

View File

@@ -60,6 +60,7 @@
}, },
"dependencies": { "dependencies": {
"@clack/prompts": "^0.6.1", "@clack/prompts": "^0.6.1",
"@dqbd/tiktoken": "^1.0.2",
"axios": "^1.3.4", "axios": "^1.3.4",
"chalk": "^5.2.0", "chalk": "^5.2.0",
"cleye": "^1.3.2", "cleye": "^1.3.2",

View File

@@ -6,6 +6,7 @@ import { api } from './api';
import { getConfig } from './commands/config'; import { getConfig } from './commands/config';
import { mergeStrings } from './utils/mergeStrings'; import { mergeStrings } from './utils/mergeStrings';
import { i18n, I18nLocals } from './i18n'; import { i18n, I18nLocals } from './i18n';
import { tokenCount } from './utils/tokenCount';
const config = getConfig(); const config = getConfig();
const translation = i18n[(config?.language as I18nLocals) || 'en']; const translation = i18n[(config?.language as I18nLocals) || 'en'];
@@ -13,12 +14,10 @@ const translation = i18n[(config?.language as I18nLocals) || 'en'];
const INIT_MESSAGES_PROMPT: Array<ChatCompletionRequestMessage> = [ const INIT_MESSAGES_PROMPT: Array<ChatCompletionRequestMessage> = [
{ {
role: ChatCompletionRequestMessageRoleEnum.System, role: ChatCompletionRequestMessageRoleEnum.System,
content: `You are to act as the author of a commit message in git. Your mission is to create clean and comprehensive commit messages in the conventional commit convention. I'll send you an output of 'git diff --staged' command, and you convert it into a commit message. ${ content: `You are to act as the author of a commit message in git. Your mission is to create clean and comprehensive commit messages in the conventional commit convention. I'll send you an output of 'git diff --staged' command, and you convert it into a commit message. ${config?.emoji
config?.emoji ? 'Use Gitmoji convention to preface the commit'
? 'Use Gitmoji convention to preface the commit' : 'Do not preface the commit with anything'
: 'Do not preface the commit with anything' }, use the present tense. ${config?.description
}, use the present tense. ${
config?.description
? 'Add a short description of what commit is about after the commit message. Don\'t start it with "This commit", just describe the changes.' ? 'Add a short description of what commit is about after the commit message. Don\'t start it with "This commit", just describe the changes.'
: "Don't add any descriptions to the commit, only commit message." : "Don't add any descriptions to the commit, only commit message."
} Use ${translation.localLanguage} to answer.` } Use ${translation.localLanguage} to answer.`
@@ -80,16 +79,16 @@ interface GenerateCommitMessageError {
} }
const INIT_MESSAGES_PROMPT_LENGTH = INIT_MESSAGES_PROMPT.map( const INIT_MESSAGES_PROMPT_LENGTH = INIT_MESSAGES_PROMPT.map(
(msg) => msg.content (msg) => tokenCount(msg.content) + 4
).join('').length; ).reduce((a, b) => a + b, 0);
const MAX_REQ_TOKENS = 3900 - INIT_MESSAGES_PROMPT_LENGTH; const MAX_REQ_TOKENS = 3900 - INIT_MESSAGES_PROMPT_LENGTH;
export const generateCommitMessageWithChatCompletion = async ( export const generateCommitMessageWithChatCompletion = async (
diff: string diff: string
): Promise<string | GenerateCommitMessageError> => { ): Promise<string | GenerateCommitMessageError> => {
try { try {
if (diff.length >= MAX_REQ_TOKENS) { if (tokenCount(diff) >= MAX_REQ_TOKENS) {
const commitMessagePromises = getCommitMsgsPromisesFromFileDiffs(diff); const commitMessagePromises = getCommitMsgsPromisesFromFileDiffs(diff);
const commitMessages = await Promise.all(commitMessagePromises); const commitMessages = await Promise.all(commitMessagePromises);
@@ -144,7 +143,7 @@ function getCommitMsgsPromisesFromFileDiffs(diff: string) {
const commitMessagePromises = []; const commitMessagePromises = [];
for (const fileDiff of mergedFilesDiffs) { for (const fileDiff of mergedFilesDiffs) {
if (fileDiff.length >= MAX_REQ_TOKENS) { if (tokenCount(fileDiff) >= MAX_REQ_TOKENS) {
// if file-diff is bigger than gpt context — split fileDiff into lineDiff // if file-diff is bigger than gpt context — split fileDiff into lineDiff
const messagesPromises = getMessagesPromisesByLines(fileDiff, separator); const messagesPromises = getMessagesPromisesByLines(fileDiff, separator);

View File

@@ -1,8 +1,9 @@
import { tokenCount } from './tokenCount'
export function mergeStrings(arr: string[], maxStringLength: number): string[] { export function mergeStrings(arr: string[], maxStringLength: number): string[] {
const mergedArr: string[] = []; const mergedArr: string[] = [];
let currentItem: string = arr[0]; let currentItem: string = arr[0];
for (const item of arr.slice(1)) { for (const item of arr.slice(1)) {
if (currentItem.length + item.length <= maxStringLength) { if (tokenCount(currentItem + item) <= maxStringLength) {
currentItem += item; currentItem += item;
} else { } else {
mergedArr.push(currentItem); mergedArr.push(currentItem);

14
src/utils/tokenCount.ts Normal file
View File

@@ -0,0 +1,14 @@
import { Tiktoken } from "@dqbd/tiktoken/lite"
import cl100k_base from "@dqbd/tiktoken/encoders/cl100k_base.json" assert{type: "json"}
/**
 * Counts how many tokens `content` encodes to under the cl100k_base encoding.
 *
 * A fresh `Tiktoken` encoder is constructed for every call and released
 * before the function returns.
 *
 * @param content - Text to tokenize.
 * @returns The length of the token array produced by `encoding.encode(content)`.
 */
export function tokenCount(content: string): number {
  const encoding = new Tiktoken(
    cl100k_base.bpe_ranks,
    cl100k_base.special_tokens,
    cl100k_base.pat_str
  );

  try {
    return encoding.encode(content).length;
  } finally {
    // Release the encoder even when encode() throws, so a failed call
    // does not leave the encoder un-freed.
    encoding.free();
  }
}