mirror of
https://github.com/simstudioai/sim.git
synced 2026-01-09 15:07:55 -05:00
fix(embeddings): modified embeddings utils to only index English docs (#2078)
This commit is contained in:
26
.github/workflows/ci.yml
vendored
26
.github/workflows/ci.yml
vendored
@@ -198,10 +198,30 @@ jobs:
|
|||||||
"${IMAGE_BASE}:${{ github.sha }}-arm64"
|
"${IMAGE_BASE}:${{ github.sha }}-arm64"
|
||||||
docker manifest push "${IMAGE_BASE}:${{ github.sha }}"
|
docker manifest push "${IMAGE_BASE}:${{ github.sha }}"
|
||||||
|
|
||||||
# Process docs embeddings (after ECR images are pushed)
|
# Check if docs changed
|
||||||
|
check-docs-changes:
|
||||||
|
name: Check Docs Changes
|
||||||
|
runs-on: blacksmith-4vcpu-ubuntu-2404
|
||||||
|
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
|
||||||
|
outputs:
|
||||||
|
docs_changed: ${{ steps.filter.outputs.docs }}
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
with:
|
||||||
|
fetch-depth: 2 # Need at least 2 commits to detect changes
|
||||||
|
- uses: dorny/paths-filter@v3
|
||||||
|
id: filter
|
||||||
|
with:
|
||||||
|
filters: |
|
||||||
|
docs:
|
||||||
|
- 'apps/docs/content/docs/en/**'
|
||||||
|
- 'apps/sim/scripts/process-docs.ts'
|
||||||
|
- 'apps/sim/lib/chunkers/**'
|
||||||
|
|
||||||
|
# Process docs embeddings (only when docs change, after ECR images are pushed)
|
||||||
process-docs:
|
process-docs:
|
||||||
name: Process Docs
|
name: Process Docs
|
||||||
needs: build-amd64
|
needs: [build-amd64, check-docs-changes]
|
||||||
if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || github.ref == 'refs/heads/staging')
|
if: needs.check-docs-changes.outputs.docs_changed == 'true'
|
||||||
uses: ./.github/workflows/docs-embeddings.yml
|
uses: ./.github/workflows/docs-embeddings.yml
|
||||||
secrets: inherit
|
secrets: inherit
|
||||||
|
|||||||
4
.github/workflows/docs-embeddings.yml
vendored
4
.github/workflows/docs-embeddings.yml
vendored
@@ -8,7 +8,7 @@ jobs:
|
|||||||
process-docs-embeddings:
|
process-docs-embeddings:
|
||||||
name: Process Documentation Embeddings
|
name: Process Documentation Embeddings
|
||||||
runs-on: blacksmith-8vcpu-ubuntu-2404
|
runs-on: blacksmith-8vcpu-ubuntu-2404
|
||||||
if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/staging'
|
if: github.ref == 'refs/heads/main'
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
@@ -41,6 +41,6 @@ jobs:
|
|||||||
- name: Process docs embeddings
|
- name: Process docs embeddings
|
||||||
working-directory: ./apps/sim
|
working-directory: ./apps/sim
|
||||||
env:
|
env:
|
||||||
DATABASE_URL: ${{ github.ref == 'refs/heads/main' && secrets.DATABASE_URL || secrets.STAGING_DATABASE_URL }}
|
DATABASE_URL: ${{ secrets.DATABASE_URL }}
|
||||||
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
||||||
run: bun run scripts/process-docs.ts --clear
|
run: bun run scripts/process-docs.ts --clear
|
||||||
|
|||||||
@@ -34,7 +34,7 @@ interface ProcessingOptions {
|
|||||||
*/
|
*/
|
||||||
async function processDocs(options: ProcessingOptions = {}) {
|
async function processDocs(options: ProcessingOptions = {}) {
|
||||||
const config = {
|
const config = {
|
||||||
docsPath: options.docsPath || path.join(process.cwd(), '../../apps/docs/content/docs'),
|
docsPath: options.docsPath || path.join(process.cwd(), '../../apps/docs/content/docs/en'),
|
||||||
baseUrl: options.baseUrl || (isDev ? 'http://localhost:4000' : 'https://docs.sim.ai'),
|
baseUrl: options.baseUrl || (isDev ? 'http://localhost:4000' : 'https://docs.sim.ai'),
|
||||||
chunkSize: options.chunkSize || 1024,
|
chunkSize: options.chunkSize || 1024,
|
||||||
minChunkSize: options.minChunkSize || 100,
|
minChunkSize: options.minChunkSize || 100,
|
||||||
@@ -216,25 +216,31 @@ async function main() {
|
|||||||
|
|
||||||
Usage: bun run process-docs.ts [options]
|
Usage: bun run process-docs.ts [options]
|
||||||
|
|
||||||
|
By default, processes English (en) documentation only.
|
||||||
|
Note: Use --clear flag when changing language scope to remove old embeddings.
|
||||||
|
|
||||||
Options:
|
Options:
|
||||||
--clear Clear existing embeddings before processing
|
--clear Clear existing embeddings before processing
|
||||||
--dry-run Process and display results without saving to DB
|
--dry-run Process and display results without saving to DB
|
||||||
--verbose Show detailed output including text previews
|
--verbose Show detailed output including text previews
|
||||||
--path <path> Custom path to docs directory
|
--path <path> Custom path to docs directory (default: docs/en)
|
||||||
--url <url> Custom base URL for links
|
--url <url> Custom base URL for links
|
||||||
--chunk-size <n> Custom chunk size in tokens (default: 1024)
|
--chunk-size <n> Custom chunk size in tokens (default: 1024)
|
||||||
--help, -h Show this help message
|
--help, -h Show this help message
|
||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
# Dry run to test chunking
|
# Dry run to test chunking (English docs)
|
||||||
bun run process-docs.ts --dry-run
|
bun run process-docs.ts --dry-run
|
||||||
|
|
||||||
# Process and save to database
|
# Process and save to database (English docs)
|
||||||
bun run process-docs.ts
|
bun run process-docs.ts
|
||||||
|
|
||||||
# Clear existing and reprocess
|
# Clear existing and reprocess (English docs)
|
||||||
bun run process-docs.ts --clear
|
bun run process-docs.ts --clear
|
||||||
|
|
||||||
|
# Process a different language
|
||||||
|
bun run process-docs.ts --path ../../apps/docs/content/docs/es
|
||||||
|
|
||||||
# Custom path with verbose output
|
# Custom path with verbose output
|
||||||
bun run process-docs.ts --path ./my-docs --verbose
|
bun run process-docs.ts --path ./my-docs --verbose
|
||||||
`)
|
`)
|
||||||
|
|||||||
Reference in New Issue
Block a user