mirror of
https://github.com/simstudioai/sim.git
synced 2026-01-07 22:24:06 -05:00
fix(embeddings): modified embeddings utils to only index english docs (#2078)
This commit is contained in:
26
.github/workflows/ci.yml
vendored
26
.github/workflows/ci.yml
vendored
@@ -198,10 +198,30 @@ jobs:
|
||||
"${IMAGE_BASE}:${{ github.sha }}-arm64"
|
||||
docker manifest push "${IMAGE_BASE}:${{ github.sha }}"
|
||||
|
||||
# Process docs embeddings (after ECR images are pushed)
|
||||
# Check if docs changed
|
||||
check-docs-changes:
|
||||
name: Check Docs Changes
|
||||
runs-on: blacksmith-4vcpu-ubuntu-2404
|
||||
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
|
||||
outputs:
|
||||
docs_changed: ${{ steps.filter.outputs.docs }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 2 # Need at least 2 commits to detect changes
|
||||
- uses: dorny/paths-filter@v3
|
||||
id: filter
|
||||
with:
|
||||
filters: |
|
||||
docs:
|
||||
- 'apps/docs/content/docs/en/**'
|
||||
- 'apps/sim/scripts/process-docs.ts'
|
||||
- 'apps/sim/lib/chunkers/**'
|
||||
|
||||
# Process docs embeddings (only when docs change, after ECR images are pushed)
|
||||
process-docs:
|
||||
name: Process Docs
|
||||
needs: build-amd64
|
||||
if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || github.ref == 'refs/heads/staging')
|
||||
needs: [build-amd64, check-docs-changes]
|
||||
if: needs.check-docs-changes.outputs.docs_changed == 'true'
|
||||
uses: ./.github/workflows/docs-embeddings.yml
|
||||
secrets: inherit
|
||||
|
||||
4
.github/workflows/docs-embeddings.yml
vendored
4
.github/workflows/docs-embeddings.yml
vendored
@@ -8,7 +8,7 @@ jobs:
|
||||
process-docs-embeddings:
|
||||
name: Process Documentation Embeddings
|
||||
runs-on: blacksmith-8vcpu-ubuntu-2404
|
||||
if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/staging'
|
||||
if: github.ref == 'refs/heads/main'
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
@@ -41,6 +41,6 @@ jobs:
|
||||
- name: Process docs embeddings
|
||||
working-directory: ./apps/sim
|
||||
env:
|
||||
DATABASE_URL: ${{ github.ref == 'refs/heads/main' && secrets.DATABASE_URL || secrets.STAGING_DATABASE_URL }}
|
||||
DATABASE_URL: ${{ secrets.DATABASE_URL }}
|
||||
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
||||
run: bun run scripts/process-docs.ts --clear
|
||||
|
||||
@@ -34,7 +34,7 @@ interface ProcessingOptions {
|
||||
*/
|
||||
async function processDocs(options: ProcessingOptions = {}) {
|
||||
const config = {
|
||||
docsPath: options.docsPath || path.join(process.cwd(), '../../apps/docs/content/docs'),
|
||||
docsPath: options.docsPath || path.join(process.cwd(), '../../apps/docs/content/docs/en'),
|
||||
baseUrl: options.baseUrl || (isDev ? 'http://localhost:4000' : 'https://docs.sim.ai'),
|
||||
chunkSize: options.chunkSize || 1024,
|
||||
minChunkSize: options.minChunkSize || 100,
|
||||
@@ -216,25 +216,31 @@ async function main() {
|
||||
|
||||
Usage: bun run process-docs.ts [options]
|
||||
|
||||
By default, processes English (en) documentation only.
|
||||
Note: Use --clear flag when changing language scope to remove old embeddings.
|
||||
|
||||
Options:
|
||||
--clear Clear existing embeddings before processing
|
||||
--dry-run Process and display results without saving to DB
|
||||
--verbose Show detailed output including text previews
|
||||
--path <path> Custom path to docs directory
|
||||
--path <path> Custom path to docs directory (default: docs/en)
|
||||
--url <url> Custom base URL for links
|
||||
--chunk-size <n> Custom chunk size in tokens (default: 1024)
|
||||
--help, -h Show this help message
|
||||
|
||||
Examples:
|
||||
# Dry run to test chunking
|
||||
# Dry run to test chunking (English docs)
|
||||
bun run process-docs.ts --dry-run
|
||||
|
||||
# Process and save to database
|
||||
# Process and save to database (English docs)
|
||||
bun run process-docs.ts
|
||||
|
||||
# Clear existing and reprocess
|
||||
# Clear existing and reprocess (English docs)
|
||||
bun run process-docs.ts --clear
|
||||
|
||||
# Process a different language
|
||||
bun run process-docs.ts --path ../../apps/docs/content/docs/es
|
||||
|
||||
# Custom path with verbose output
|
||||
bun run process-docs.ts --path ./my-docs --verbose
|
||||
`)
|
||||
|
||||
Reference in New Issue
Block a user