fix(embeddings): modify embeddings utils to index only English docs (#2078)

This commit is contained in:
Waleed
2025-11-20 14:03:49 -08:00
committed by GitHub
parent 00ae718692
commit 4a0450d1fc
3 changed files with 36 additions and 10 deletions

View File

@@ -198,10 +198,30 @@ jobs:
"${IMAGE_BASE}:${{ github.sha }}-arm64"
docker manifest push "${IMAGE_BASE}:${{ github.sha }}"
# Process docs embeddings (after ECR images are pushed)
# Check if docs changed
check-docs-changes:
name: Check Docs Changes
runs-on: blacksmith-4vcpu-ubuntu-2404
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
outputs:
docs_changed: ${{ steps.filter.outputs.docs }}
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 2 # Need at least 2 commits to detect changes
- uses: dorny/paths-filter@v3
id: filter
with:
filters: |
docs:
- 'apps/docs/content/docs/en/**'
- 'apps/sim/scripts/process-docs.ts'
- 'apps/sim/lib/chunkers/**'
# Process docs embeddings (only when docs change, after ECR images are pushed)
process-docs:
name: Process Docs
needs: build-amd64
if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || github.ref == 'refs/heads/staging')
needs: [build-amd64, check-docs-changes]
if: needs.check-docs-changes.outputs.docs_changed == 'true'
uses: ./.github/workflows/docs-embeddings.yml
secrets: inherit

View File

@@ -8,7 +8,7 @@ jobs:
process-docs-embeddings:
name: Process Documentation Embeddings
runs-on: blacksmith-8vcpu-ubuntu-2404
if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/staging'
if: github.ref == 'refs/heads/main'
steps:
- name: Checkout code
@@ -41,6 +41,6 @@ jobs:
- name: Process docs embeddings
working-directory: ./apps/sim
env:
DATABASE_URL: ${{ github.ref == 'refs/heads/main' && secrets.DATABASE_URL || secrets.STAGING_DATABASE_URL }}
DATABASE_URL: ${{ secrets.DATABASE_URL }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
run: bun run scripts/process-docs.ts --clear

View File

@@ -34,7 +34,7 @@ interface ProcessingOptions {
*/
async function processDocs(options: ProcessingOptions = {}) {
const config = {
docsPath: options.docsPath || path.join(process.cwd(), '../../apps/docs/content/docs'),
docsPath: options.docsPath || path.join(process.cwd(), '../../apps/docs/content/docs/en'),
baseUrl: options.baseUrl || (isDev ? 'http://localhost:4000' : 'https://docs.sim.ai'),
chunkSize: options.chunkSize || 1024,
minChunkSize: options.minChunkSize || 100,
@@ -216,25 +216,31 @@ async function main() {
Usage: bun run process-docs.ts [options]
By default, processes English (en) documentation only.
Note: Use --clear flag when changing language scope to remove old embeddings.
Options:
--clear Clear existing embeddings before processing
--dry-run Process and display results without saving to DB
--verbose Show detailed output including text previews
--path <path> Custom path to docs directory
--path <path> Custom path to docs directory (default: docs/en)
--url <url> Custom base URL for links
--chunk-size <n> Custom chunk size in tokens (default: 1024)
--help, -h Show this help message
Examples:
# Dry run to test chunking
# Dry run to test chunking (English docs)
bun run process-docs.ts --dry-run
# Process and save to database
# Process and save to database (English docs)
bun run process-docs.ts
# Clear existing and reprocess
# Clear existing and reprocess (English docs)
bun run process-docs.ts --clear
# Process a different language
bun run process-docs.ts --path ../../apps/docs/content/docs/es
# Custom path with verbose output
bun run process-docs.ts --path ./my-docs --verbose
`)