Count file diff by token, not by length of string (#63)

* 1.1.23

* 1.1.24

* feat(package.json): add @dqbd/tiktoken dependency

refactor(generateCommitMessageFromGitDiff.ts): add tokenCount function to count the number of tokens in a string
refactor(generateCommitMessageFromGitDiff.ts): change the way the length of INIT_MESSAGES_PROMPT is calculated to use tokenCount function
refactor(generateCommitMessageFromGitDiff.ts): change the way the length of diff is calculated to use tokenCount function

refactor(generateCommitMessageFromGitDiff.ts): rename function parameter from diff to fileDiff and update function calls accordingly
feat(generateCommitMessageFromGitDiff.ts): add tokenCount function to count tokens in fileDiff and use it to check if fileDiff is bigger than MAX_REQ_TOKENS

feat(utils): add tokenCount function to count the number of tokens in a string
refactor(utils/mergeStrings.ts): use tokenCount function to count the number of tokens in a string instead of checking the length of the concatenated string

---------

Co-authored-by: di-sukharev <dim.sukharev@gmail.com>
This commit is contained in:
Raymond
2023-03-28 18:43:02 +08:00
committed by GitHub
parent 7c9feba3ba
commit 3103ae18b8
5 changed files with 33 additions and 12 deletions

View File

@@ -6,6 +6,7 @@ import { api } from './api';
import { getConfig } from './commands/config';
import { mergeStrings } from './utils/mergeStrings';
import { i18n, I18nLocals } from './i18n';
import { tokenCount } from './utils/tokenCount';
const config = getConfig();
const translation = i18n[(config?.language as I18nLocals) || 'en'];
@@ -13,12 +14,10 @@ const translation = i18n[(config?.language as I18nLocals) || 'en'];
const INIT_MESSAGES_PROMPT: Array<ChatCompletionRequestMessage> = [
{
role: ChatCompletionRequestMessageRoleEnum.System,
content: `You are to act as the author of a commit message in git. Your mission is to create clean and comprehensive commit messages in the conventional commit convention. I'll send you an output of 'git diff --staged' command, and you convert it into a commit message. ${
config?.emoji
? 'Use Gitmoji convention to preface the commit'
: 'Do not preface the commit with anything'
}, use the present tense. ${
config?.description
content: `You are to act as the author of a commit message in git. Your mission is to create clean and comprehensive commit messages in the conventional commit convention. I'll send you an output of 'git diff --staged' command, and you convert it into a commit message. ${config?.emoji
? 'Use Gitmoji convention to preface the commit'
: 'Do not preface the commit with anything'
}, use the present tense. ${config?.description
? 'Add a short description of what commit is about after the commit message. Don\'t start it with "This commit", just describe the changes.'
: "Don't add any descriptions to the commit, only commit message."
} Use ${translation.localLanguage} to answer.`
@@ -80,16 +79,16 @@ interface GenerateCommitMessageError {
}
const INIT_MESSAGES_PROMPT_LENGTH = INIT_MESSAGES_PROMPT.map(
(msg) => msg.content
).join('').length;
(msg) => tokenCount(msg.content) + 4
).reduce((a, b) => a + b, 0);
const MAX_REQ_TOKENS = 3900 - INIT_MESSAGES_PROMPT_LENGTH;
export const generateCommitMessageWithChatCompletion = async (
diff: string
): Promise<string | GenerateCommitMessageError> => {
try {
if (diff.length >= MAX_REQ_TOKENS) {
try {
if (tokenCount(diff) >= MAX_REQ_TOKENS) {
const commitMessagePromises = getCommitMsgsPromisesFromFileDiffs(diff);
const commitMessages = await Promise.all(commitMessagePromises);
@@ -144,7 +143,7 @@ function getCommitMsgsPromisesFromFileDiffs(diff: string) {
const commitMessagePromises = [];
for (const fileDiff of mergedFilesDiffs) {
if (fileDiff.length >= MAX_REQ_TOKENS) {
if (tokenCount(fileDiff) >= MAX_REQ_TOKENS) {
// if file-diff is bigger than gpt context — split fileDiff into lineDiff
const messagesPromises = getMessagesPromisesByLines(fileDiff, separator);

View File

@@ -1,8 +1,9 @@
import { tokenCount } from './tokenCount'
export function mergeStrings(arr: string[], maxStringLength: number): string[] {
const mergedArr: string[] = [];
let currentItem: string = arr[0];
for (const item of arr.slice(1)) {
if (currentItem.length + item.length <= maxStringLength) {
if (tokenCount(currentItem + item) <= maxStringLength) {
currentItem += item;
} else {
mergedArr.push(currentItem);

14
src/utils/tokenCount.ts Normal file
View File

@@ -0,0 +1,14 @@
import { Tiktoken } from "@dqbd/tiktoken/lite"
import cl100k_base from "@dqbd/tiktoken/encoders/cl100k_base.json" assert{type: "json"}
/**
 * Counts how many tokens `content` produces when encoded with the
 * `cl100k_base` encoder data imported by this module.
 *
 * A new `Tiktoken` encoder is constructed on every call and released before
 * the function returns.
 *
 * @param content - Text to tokenize.
 * @returns The number of tokens returned by `encoding.encode(content)`.
 */
export function tokenCount(content: string): number {
  const encoding = new Tiktoken(
    cl100k_base.bpe_ranks,
    cl100k_base.special_tokens,
    cl100k_base.pat_str
  );

  try {
    return encoding.encode(content).length;
  } finally {
    // Always release the encoder, including when encode() throws; otherwise
    // the error path would skip free() and leave the encoder unreleased.
    encoding.free();
  }
}