Merge pull request #827 from Pythagora-io/filter-files

filter task-relevant files based on file summaries
2026-01-10 13:37:55 -05:00 · 2024-04-08 09:55:00 -07:00
parent 8aa479a5db 5e8c4b8b2f
commit bb5010a78c
15 changed files with 367 additions and 44 deletions
--- a/pilot/.env.example
+++ b/pilot/.env.example
@@ -42,3 +42,6 @@ DB_PASSWORD=

 # Set extra buffer to wait on top of detected retry time when rate limmit is hit. defaults to 6
 # RATE_LIMIT_EXTRA_BUFFER=
+
+# Only send task-relevant files to the LLM. Enabled by default; uncomment and set this to "false" to disable.
+# FILTER_RELEVANT_FILES=true
--- a/pilot/const/function_calls.py
+++ b/pilot/const/function_calls.py
@@ -532,3 +532,50 @@ GET_BUG_REPORT_MISSING_DATA = {
        }
    }],
 }
+
+LIST_RELEVANT_FILES = {
+    'definitions': [{
+        'name': 'list_relevant_files',
+        'description': 'List of relevant files for the current task.',
+        'parameters': {
+            "type": "object",
+            "properties": {
+                "relevant_files": {
+                    "type": "array",
+                    "items": {
+                        "type": "string",
+                        "description": "Path to the file that is relevant for the current task, relative to the project root."
+                    },
+                }
+            },
+            "required": ["relevant_files"],
+            "additionalProperties": False
+        }
+    }],
+}
+
+DESCRIBE_FILE = {
+    'definitions': [{
+        'name': 'describe_file',
+        'description': 'Describe the content of the file.',
+        'parameters': {
+            "type": "object",
+            "properties": {
+                "summary": {
+                    "type": "string",
+                    "description": "Describe in detail the functionality being defined or implemented in this file. Be as detailed as possible."
+                },
+                "references": {
+                    "type": "array",
+                    "items": {
+                        "type": "string",
+                        "description": "Path to a file that is referenced in the current file, relative to the project root.",
+                    },
+                    "description": "List of file references."
+                }
+            },
+            "required": ["summary", "references"],
+            "additionalProperties": False,
+        }
+    }]
+}
--- a/pilot/helpers/AgentConvo.py
+++ b/pilot/helpers/AgentConvo.py
@@ -179,7 +179,8 @@ class AgentConvo:
            self.replace_files()

    def replace_files(self):
-        files = self.agent.project.get_all_coded_files()
+        relevant_files = getattr(self.agent, 'relevant_files', None)
+        files = self.agent.project.get_all_coded_files(relevant_files=relevant_files)
        for msg in self.messages:
            if msg['role'] == 'user':
                new_content = self.replace_files_in_one_message(files, msg["content"])
--- a/pilot/helpers/Project.py
+++ b/pilot/helpers/Project.py
@@ -1,7 +1,7 @@
 import json
 import os
 from pathlib import Path
-from typing import Tuple
+from typing import Tuple, Optional, Union

 import peewee
 from playhouse.shortcuts import model_to_dict
@@ -36,6 +36,9 @@ from utils.telemetry import telemetry
 from utils.task import Task
 from utils.utils import remove_lines_with_string

+from utils.describe import describe_file
+from os.path import abspath, relpath
+

 class Project:
    def __init__(
@@ -326,14 +329,29 @@ class Project:
            "lines_of_code": len(item['content'].splitlines()),
        } for item in [model_to_dict(file) for file in file_snapshots]]

-    def get_all_coded_files(self):
+    @staticmethod
+    def relpath(file: Union[File, str]) -> str:
        """
-        Get all coded files in the project.
+        Return relative file path (including the name) within a project

-        Returns:
-            list: A list of coded files.
+        :param file: File object or file path
+        :return: Relative file path
        """
-        files = (
+        if isinstance(file, File):
+            fpath = f"{file.path}/{file.name}"
+        else:
+            fpath = file
+        if fpath.startswith("/"):
+            fpath = fpath[1:]
+        elif fpath.startswith("./"):
+            fpath = fpath[2:]
+        return fpath
+
+    def get_all_database_files(self) -> list[File]:
+        """
+        Get all project files from the database.
+        """
+        return (
            File
            .select()
            .where(
@@ -342,8 +360,35 @@ class Project:
            )
        )

+    def get_all_coded_files(self, relevant_files=None):
+        """
+        Get all coded files in the project.
+
+        Returns:
+            list: A list of coded files.
+        """
+        files = self.get_all_database_files()
+        if relevant_files:
+            n_files0 = len(files)
+            files = [file for file in files if self.relpath(file) in relevant_files]
+            n_files1 = len(files)
+            rel_txt = ",".join(relevant_files) if relevant_files else "(none)"
+            logger.debug(f"[get_all_coded_files] reduced context from {n_files0} to {n_files1} files, using: {rel_txt}")
+
        return self.get_files([file.path + '/' + file.name for file in files])

+    def get_file_summaries(self) -> Optional[dict[str, str]]:
+        """
+        Get summaries of all coded files in the project.
+
+        :returns: A dictionary of file summaries, or None if file filtering is not enabled.
+        """
+        if os.getenv('FILTER_RELEVANT_FILES', '').lower().strip() in ['false', '0', 'no', 'off']:
+            return None
+
+        files = self.get_all_database_files()
+        return {self.relpath(file): file.description or "(unknown)" for file in files if os.path.exists(file.full_path)}
+
    def get_files(self, files):
        """
        Get file contents.
@@ -405,11 +450,14 @@ class Project:

        if path and path[0] == '/':
            path = path.lstrip('/')
-        (File.insert(app=self.app, path=path, name=name, full_path=full_path)
+
+        description = describe_file(self, relpath(abspath(full_path), abspath(self.root_path)), data['content'])
+
+        (File.insert(app=self.app, path=path, name=name, full_path=full_path, description=description)
         .on_conflict(
            conflict_target=[File.app, File.name, File.path],
            preserve=[],
-            update={'name': name, 'path': path, 'full_path': full_path})
+            update={'name': name, 'path': path, 'full_path': full_path, 'description': description})
         .execute())

        if not self.skip_steps:
@@ -530,7 +578,10 @@ class Project:
        final_absolute_path = os.path.join(self.root_path, final_file_path[1:], final_file_name)
        return final_file_path, final_absolute_path

-    def save_files_snapshot(self, development_step_id):
+    def save_files_snapshot(self, development_step_id, summaries=None):
+        if summaries is None:
+            summaries = {}
+
        files = get_directory_contents(self.root_path)
        development_step, created = DevelopmentSteps.get_or_create(id=development_step_id)

@@ -547,16 +598,26 @@ class Project:
                app=self.app,
                name=file['name'],
                path=file['path'],
-                defaults={'full_path': file['full_path']},
+                defaults={
+                    'full_path': file['full_path'],
+                    'description': summaries.get(Path(os.path.relpath(file['full_path'], self.root_path)).as_posix(), ''),  # summary keys use "/" separators
+                },
            )

-            file_snapshot, created = FileSnapshot.get_or_create(
+            file_snapshot, _ = FileSnapshot.get_or_create(
                app=self.app,
                development_step=development_step,
                file=file_in_db,
                defaults={'content': file.get('content', '')}
            )
            file_snapshot.content = file['content']
+
+            # For a non-empty file, if we don't have a description, and the file is either new or
+            # we're loading a project, create the description.
+            if file['content'] and not file_in_db.description:
+                file_in_db.description = describe_file(self, relpath(abspath(file['full_path']), abspath(self.root_path)), file['content'])
+                file_in_db.save()
+
            file_snapshot.save()
            total_files += 1
            if isinstance(file['content'], str):
--- a/pilot/helpers/agents/Developer.py
+++ b/pilot/helpers/agents/Developer.py
@@ -3,6 +3,7 @@ import platform
 import uuid
 import re
 import json
+from typing import Optional

 from const.messages import WHEN_USER_DONE, AFFIRMATIVE_ANSWERS, NEGATIVE_ANSWERS, STUCK_IN_LOOP, NONE_OF_THESE
 from utils.exit import trace_code_event
@@ -27,12 +28,20 @@ from helpers.Agent import Agent
 from helpers.AgentConvo import AgentConvo
 from utils.utils import should_execute_step, array_of_objects_to_string, generate_app_data
 from helpers.cli import run_command_until_success, execute_command_and_check_cli_response
-from const.function_calls import (EXECUTE_COMMANDS, GET_TEST_TYPE, IMPLEMENT_TASK, COMMAND_TO_RUN,
-                                  ALTERNATIVE_SOLUTIONS, GET_BUG_REPORT_MISSING_DATA)
+from const.function_calls import (
+    EXECUTE_COMMANDS,
+    GET_TEST_TYPE,
+    IMPLEMENT_TASK,
+    COMMAND_TO_RUN,
+    ALTERNATIVE_SOLUTIONS,
+    GET_BUG_REPORT_MISSING_DATA,
+    LIST_RELEVANT_FILES,
+)
 from database.database import save_progress, edit_development_plan, edit_feature_plan, get_progress_steps, update_app_status
 from utils.telemetry import telemetry
 from prompts.prompts import ask_user
 from utils.print import print_task_progress, print_step_progress
+from utils.describe import describe_file

 ENVIRONMENT_SETUP_STEP = 'environment_setup'

@@ -44,6 +53,7 @@ class Developer(Agent):
        self.run_command = None
        self.save_dev_steps = True
        self.debugger = Debugger(self)
+        self.relevant_files = None

    def start_coding(self, task_source):
        print('Starting development...', type='verbose', category='agent:developer')
@@ -146,6 +156,8 @@ class Developer(Agent):
            # remove breakdown from the head of dev_steps_to_load; if it's last, record it in checkpoint
            self.project.cleanup_list('dev_steps_to_load', int(self.project.dev_steps_to_load[0]['id']) + 1)
        else:
+            file_summaries = self.project.get_file_summaries()
+            self.relevant_files = self.filter_relevant_files(file_summaries, current_task=development_task)
            instructions = convo_dev_task.send_message('development/task/breakdown.prompt', {
                "name": self.project.args['name'],
                "app_type": self.project.args['app_type'],
@@ -156,7 +168,8 @@ class Developer(Agent):
                "directory_tree": self.project.get_directory_tree(True),
                "current_task_index": i,
                "development_tasks": self.project.development_plan,
-                "files": self.project.get_all_coded_files(),
+                "file_summaries": file_summaries,
+                "files": self.project.get_all_coded_files(relevant_files=self.relevant_files),
                "architecture": self.project.architecture,
                "technologies": self.project.system_dependencies + self.project.package_dependencies,
                "task_type": 'feature' if self.project.finished else 'app',
@@ -223,6 +236,10 @@ class Developer(Agent):
                # dev_steps_to_load; if it's last, record it in checkpoint
                self.project.cleanup_list('dev_steps_to_load', max(id for id in ids if id is not None))

+        if self.relevant_files is None:
+            # Recompute relevant files after project load
+            self.relevant_files = self.filter_relevant_files(self.project.get_file_summaries(), current_task=development_task)
+
        while True:
            result = self.execute_task(convo_dev_task,
                                       steps,
@@ -337,6 +354,8 @@ class Developer(Agent):
        data = step['save_file']
        code_monkey = CodeMonkey(self.project)
        code_monkey.implement_code_changes(convo, data)
+        if self.relevant_files is not None:
+            self.relevant_files.add(self.project.relpath(data['path']))  # same normalization get_all_coded_files() matches against
        return {"success": True}

    def step_command_run(self, convo, task_steps, i, success_with_cli_response=False):
@@ -585,6 +604,8 @@ class Developer(Agent):
                    'path' in step[step['type']] and
                    step[step['type']]['path'] not in self.modified_files):
                self.modified_files.append(step[step['type']]['path'])
+                if self.relevant_files is not None:
+                    self.relevant_files.add(self.project.relpath(step[step['type']]['path']))
            # This means we are still loading the project and have all the steps until last iteration
            if self.project.last_iteration is not None or self.project.last_detailed_user_review_goal is not None:
                continue
@@ -757,7 +778,8 @@ class Developer(Agent):
                    "directory_tree": self.project.get_directory_tree(True),
                    "current_task": development_task,
                    "development_tasks": self.project.development_plan,
-                    "files": self.project.get_all_coded_files(),
+                    "files": self.project.get_all_coded_files(relevant_files=self.relevant_files),
+                    "file_summaries": self.project.get_file_summaries(),
                    "user_feedback": user_feedback,
                    "user_feedback_qa": user_feedback_qa,
                    "previous_solutions": llm_solutions[-3:],
@@ -862,11 +884,7 @@ class Developer(Agent):
        }
        """
        review_convo = AgentConvo(self)
-        files = [
-            file_dict for file_dict in self.project.get_all_coded_files()
-            if any(os.path.normpath(file_dict['full_path']).endswith(os.path.normpath(modified_file.lstrip('.'))) for
-                   modified_file in self.modified_files)
-        ]
+        files = self.project.get_all_coded_files(relevant_files=self.relevant_files)
        files_at_start_of_task = [
            file_dict for file_dict in self.files_at_start_of_task
            if any(os.path.normpath(file_dict['full_path']).endswith(os.path.normpath(modified_file.lstrip('.'))) for
@@ -879,6 +897,7 @@ class Developer(Agent):
            "tasks": self.project.development_plan,
            "current_task": self.project.current_task.data.get('task_description'),
            "files": files,
+            "file_summaries": self.project.get_file_summaries(),
            "all_feedbacks": [solution["user_feedback"].replace("```", "") for solution in llm_solutions],
            "modified_files": self.modified_files,
            "files_at_start_of_task": files_at_start_of_task,
@@ -1087,3 +1106,49 @@ class Developer(Agent):
            next_solution_to_try_index = 1

        return next_solution_to_try_index
+
+    def filter_relevant_files(self, file_summaries, current_task=None, user_input=None) -> Optional[set[str]]:
+        """
+        Filter task/iteration relevant files.
+
+        Asks the LLM to determine which files are relevant to the current task
+        based on the user input and the current task.
+
+        Enabled unless FILTER_RELEVANT_FILES is false/0/no/off; returns None when disabled or when there are no summaries.
+
+        :param file_summaries: Mapping of relative file path to file summary.
+        :param current_task: The current task.
+        :param user_input: The user input; also passed to the prompt as `user_feedback`.
+        :returns: Set of relevant file paths (only paths present in file_summaries), or None.
+        """
+
+        if not file_summaries:
+            return None
+
+        if os.getenv('FILTER_RELEVANT_FILES', '').lower().strip() in ['false', '0', 'no', 'off']:
+            return None
+
+        convo = AgentConvo(self)
+        response = convo.send_message('development/filter_files.prompt', {
+            "name": self.project.args['name'],
+            "app_type": self.project.args['app_type'],
+            "app_summary": self.project.project_description,
+            "architecture": self.project.architecture,
+            "technologies": self.project.system_dependencies + self.project.package_dependencies,
+            "directory_tree": self.project.get_directory_tree(True),
+            "current_task": current_task,
+            "development_tasks": self.project.development_plan,
+            "user_input": user_input,
+            "user_feedback": user_input,  # filter_files.prompt renders `user_feedback`, not `user_input`
+            "previous_features": self.project.previous_features,
+            "current_feature": self.project.current_feature,
+            "file_summaries": file_summaries,
+        }, LIST_RELEVANT_FILES)
+
+        relevant_files = set()
+        for file in response['relevant_files']:
+            file = self.project.relpath(file)  # strip a leading "/" or "./" so the path can match a summary key
+            if file in file_summaries:
+                relevant_files.add(file)
+
+        return relevant_files
--- a/pilot/helpers/agents/test_Developer.py
+++ b/pilot/helpers/agents/test_Developer.py
@@ -59,7 +59,8 @@ class TestDeveloper:
            'description': 'Do stuff',
            'user_review_goal': 'Do stuff',
        }]
-        project.get_all_coded_files = lambda: []
+        project.get_file_summaries = lambda: None
+        project.get_all_coded_files = lambda **kwargs: []
        project.current_step = 'test'

        # and a developer who will execute any task
@@ -85,7 +86,8 @@ class TestDeveloper:
            'description': 'Do stuff',
            'user_review_goal': 'Do stuff',
        }]
-        project.get_all_coded_files = lambda: []
+        project.get_file_summaries = lambda: None
+        project.get_all_coded_files = lambda **kwargs: []
        project.current_step = 'test'

        # and a developer who will execute any task except for `ls -al test`
--- a/pilot/helpers/test_Project.py
+++ b/pilot/helpers/test_Project.py
@@ -325,7 +325,8 @@ class TestProject:
    ])
    @patch('helpers.Project.update_file')
    @patch('helpers.Project.File')
-    def test_save_file(self, mock_file_insert, mock_update_file, test_data):
+    @patch('helpers.Project.describe_file')
+    def test_save_file(self, mock_describe_file, mock_file_insert, mock_update_file, test_data):
        # Given
        data = {'content': 'Hello World!'}
        if test_data['name'] is not None:
@@ -333,6 +334,7 @@ class TestProject:
        if test_data['path'] is not None:
            data['path'] = str(Path(test_data['path']))

+        mock_describe_file.return_value = "test description"
        project = create_project()

        # When
@@ -341,7 +343,7 @@ class TestProject:
        # Then assert that update_file with the correct path
        expected_saved_to = str(Path(test_data['saved_to']))
        mock_update_file.assert_called_once_with(expected_saved_to, 'Hello World!', project=project)
-
+        mock_describe_file.assert_called_once()
        # Also assert that File.insert was called with the expected arguments
        # expected_file_data = {'app': project.app, 'path': test_data['path'], 'name': test_data['name'],
        #                       'full_path': expected_saved_to}
@@ -408,7 +410,7 @@ class TestProjectFileLists:
 '''.lstrip()

    @patch('helpers.Project.DevelopmentSteps.get_or_create', return_value=('test', True))
-    @patch('helpers.Project.File.get_or_create', return_value=('test', True))
+    @patch('helpers.Project.File.get_or_create', return_value=(MagicMock(), True))
    @patch('helpers.Project.FileSnapshot.get_or_create', return_value=(MagicMock(), True))
    def test_save_files_snapshot(self, mock_snap, mock_file, mock_step):
        # Given a snapshot of the files in the project
--- a/pilot/prompts/components/files_list.prompt
+++ b/pilot/prompts/components/files_list.prompt
@@ -1,6 +1,8 @@
-{% if files|length > 0 %}Here are files that are currently implemented:
+{% if file_summaries %}These files are currently implemented:{% for fpath, summary in file_summaries.items() %}
+* `{{ fpath }}`: {{ summary }}{% endfor %}
+{% endif %}{% if files|length > 0 %}Here are the relevant files:
 ---START_OF_FILES---{% for file in files %}
-**{{ file.path }}/{{ file.name }}** ({{ file.lines_of_code }} lines of code):
+**{% if file.path %}{{ file.path }}/{% endif %}{{ file.name }}** ({{ file.lines_of_code }} lines of code):
 ```
 {{ file.content }}
 ```
--- a/pilot/prompts/development/filter_files.prompt
+++ b/pilot/prompts/development/filter_files.prompt
@@ -0,0 +1,36 @@
+You are working on a {{ app_type }} called "{{ name }}", writing the code for the entire application.
+
+Here is a high level description of "{{ name }}":
+```
+{{ app_summary }}
+```
+{{ features_list }}
+
+{% if development_tasks and current_task %}
+Development process of this app was split into smaller tasks. Here is the list of all tasks:
+```{% for task in development_tasks %}
+{{ loop.index }}. {{ task['description'] }}
+{% endfor %}
+```
+You are currently working on task "{{ current_task.description }}" and you have to focus only on that task.
+
+{% endif %}
+A part of the app is already finished.
+
+The app currently contains the following files:
+{% for fpath, summary in file_summaries.items() %}
+* `{{ fpath }}`: {{ summary }}{% endfor %}
+
+{% if user_feedback %}User who was using the app "{{ name }}" sent you this feedback:
+```
+{{ user_feedback }}
+```
+{% endif %}{% if next_solution_to_try %}
+Focus on solving this issue in the following way:
+```
+{{ next_solution_to_try }}
+```
+{% endif %}
+Now, before you can work on this, you need to select which files from the above list are relevant to this task. Output the relevant files in a JSON list.
+
+{{ relative_paths }}
--- a/pilot/prompts/development/review_task.prompt
+++ b/pilot/prompts/development/review_task.prompt
@@ -13,16 +13,7 @@ Development process of this app was split into smaller tasks. Here is the list o
 You are currently working on task "{{ current_task }}" and you have to focus only on that task.

 A part of the app is already finished.
-Here are files that were modified during this task implementation:
-{% if files|length > 0 %}---start_of_current_files---
-{% for file in files %}
-**{{ file.path }}/{{ file.name }}** ({{ file.lines_of_code }} lines of code):
-```
-{{ file.content }}
-```
-{% endfor %}
---end_of_current_files---
-{% endif -%}
+{{ files_list }}

 {% if all_feedbacks -%}While working on this task, your colleague who is testing the app "{{ name }}" sent you some additional information on what doesn't work as intended or what should be added. Here are all the inputs he sent you:
 ```{% for feedback in all_feedbacks %}
--- a/pilot/templates/init.py
+++ b/pilot/templates/init.py
@@ -88,7 +88,7 @@ def apply_project_template(

    last_development_step = project.checkpoints.get('last_development_step')
    if last_development_step:
-        project.save_files_snapshot(last_development_step['id'])
+        project.save_files_snapshot(last_development_step['id'], summaries=template.get("files"))

    trace_code_event('project-template', {'template': template_name})
    summary = "The code so far includes:\n" + template["summary"]
--- a/pilot/templates/javascript_react.py
+++ b/pilot/templates/javascript_react.py
@@ -20,4 +20,17 @@ JAVASCRIPT_REACT = {
        "* Minimal configuration to get started with React",
    ]),
    "install_hook": install_hook,
+    "files": {
+        "vite.config.js": "Configuration file for Vite, a fast developer-friendly Javascript bundler/devserver.",
+        "index.html": "Main entry point for the project. It includes a basic HTML structure with a root div element and a script tag importing a JavaScript file named main.jsx using the module type. References: src/main.jsx",
+        ".eslintrc.cjs": "Configuration file for ESLint, a static code analysis tool for identifying problematic patterns found in JavaScript code. It defines rules for linting JavaScript code with a focus on React applications.",
+        ".gitignore": "Specifies patterns to exclude files and directories from being tracked by Git version control system. It is used to prevent certain files from being committed to the repository.",
+        "package.json": "Standard Nodejs package metadata file, specifies dependencies and start scripts. It also specifies that the project is a module.",
+        "public/.gitkeep": "Empty file",
+        "src/App.css": "Contains styling rules for the root element of the application, setting a maximum width, centering it on the page, adding padding, and aligning text to the center.",
+        "src/index.css": "Defines styling rules for the root element, body, and h1 elements of a web page.",
+        "src/App.jsx": "Defines a functional component that serves as the root component in the project. The component is exported as the default export. References: src/App.css",
+        "src/main.jsx": "Main entry point for a React application. It imports necessary modules, renders the main component 'App' inside a 'React.StrictMode' component, and mounts it to the root element in the HTML document. References: src/App.jsx, src/index.css",
+        "src/assets/.gitkeep": "Empty file",
+    }
 }
--- a/pilot/templates/node_express_mongoose.py
+++ b/pilot/templates/node_express_mongoose.py
@@ -23,4 +23,21 @@ NODE_EXPRESS_MONGOOSE = {
        "* config loading from environment using dotenv with a placeholder .env.example file: you will need to create a .env file with your own values",
    ]),
    "install_hook": install_hook,
+    "files": {
+        ".env.example": "The .env.example file serves as a template for setting up environment variables used in the application. It provides placeholders for values such as the port number, MongoDB database URL, and session secret string.",
+        ".env": "This file is a configuration file in the form of a .env file. It contains environment variables used by the application, such as the port to listen on, the MongoDB database URL, and the session secret string.",
+        "server.js": "This `server.js` file sets up an Express server with MongoDB database connection, session management using connect-mongo, templating engine EJS, static file serving, authentication routes, error handling, and request logging. [References: dotenv, mongoose, express, express-session, connect-mongo, ./routes/authRoutes]",
+        "package.json": "This `package.json` file is used to define the metadata and dependencies for a Node.js project. It specifies the project name, version, main entry point file, scripts for starting and testing the project, dependencies required by the project, and other metadata like author and license. [References: server.js]",
+        "views/login.ejs": "This file represents the login page of a web application using EJS (Embedded JavaScript) templating. It includes partials for the head, header, and footer sections, and contains a form for users to input their username and password to log in. [References: partials/_head.ejs, partials/_header.ejs, partials/_footer.ejs]",
+        "views/register.ejs": "The 'views/register.ejs' file contains the HTML markup for a registration form. It includes fields for username and password, along with a button to submit the form and a link to redirect to the login page if the user already has an account. [References: partials/_head.ejs, partials/_header.ejs, partials/_footer.ejs]",
+        "views/index.ejs": "This file represents the main view for a web application. It includes partials for the head, header, and footer sections, and contains a simple HTML structure with a main container displaying a heading. [References: partials/_head.ejs, partials/_header.ejs, partials/_footer.ejs, js/main.js]",
+        "views/partials/_header.ejs": "This file represents a partial view for the header section of a web page. It includes a navigation bar with a brand logo, toggle button, and links for Home, Login, and Logout based on the user's session status.",
+        "views/partials/_head.ejs": "This file represents the partial for the head section of an HTML document. It includes meta tags, a title tag, and links to external CSS files (Bootstrap and a custom stylesheet).",
+        "views/partials/_footer.ejs": "This file defines the footer section of a web page using EJS (Embedded JavaScript) templating. It includes a copyright notice and a link to the Bootstrap JavaScript library. [References: https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/js/bootstrap.min.js]",
+        "routes/authRoutes.js": "This file defines routes for user authentication including registration, login, and logout. It interacts with a User model to handle user data and uses bcrypt for password hashing and comparison. [References: models/User.js]",
+        "routes/middleware/authMiddleware.js": "This file defines a middleware function called isAuthenticated, which checks if a user is authenticated based on the presence of a userId in the session object. If authenticated, it allows the request to proceed to the next middleware or route handler; otherwise, it returns a 401 status response indicating the user is not authenticated.",
+        "models/User.js": "This file defines a Mongoose model for a user with fields for username and password. It includes a pre-save hook to hash the user's password before saving it to the database using bcrypt. [References: mongoose, bcrypt]",
+        "public/js/main.js": "The main.js file is a placeholder for future JavaScript code. It currently does not contain any specific functionality.",
+        "public/css/style.css": "This file is a placeholder for custom styles. It does not contain any specific styles but is intended for adding custom CSS styles."
+    }
 }
--- a/pilot/utils/describe.py
+++ b/pilot/utils/describe.py
@@ -0,0 +1,93 @@
+import os
+import time
+import json
+
+from logger.logger import logger
+from utils.llm_connection import create_gpt_chat_completion
+from const.function_calls import DESCRIBE_FILE
+
+
+DESCRIBE_PROMPT = """You're a software developer AI assistant. Your task is to explain the functionality implemented by a particular source code file.
+
+Given a file path and file contents, your output should contain:
+
+* a short explanation of what the file is about;
+* a list of all other files referenced (imported) from this file. Note that some languages, frameworks or libraries assume the file extension and don't use it explicitly. For example, "import foo" in Python references "foo.py" without specifying the extension. In your response, use the complete file name including the implied extension;
+
+Output the result in a JSON format with the following structure, as in this example:
+
+Example:
+{
+    "summary": "Describe in detail the functionality being defined or implemented in this file. Be as detailed as possible",
+    "references": [
+        "some/file.py",
+        "some/other/file.js"
+    ]
+}
+
+Your response must be a valid JSON document, following the example format. Do not add any extra explanation or commentary outside the JSON document.
+"""
+
+def _get_describe_messages(fpath: str, content: str) -> list[dict[str, str]]:
+    """
+    Return a list of messages to send to the AI model to describe a file.
+
+    Internal to this module, use `describe_file` instead.
+
+    :param fpath: the file path
+    :param content: the file content
+    :return: a list of messages
+    """
+    return [
+        {"role": "system", "content": DESCRIBE_PROMPT},
+        {"role": "user", "content": f"Here's the `{fpath}` file:\n```\n{content}\n```\n"},
+    ]
+
+
+def describe_file(project, fpath: str, content: str) -> str:
+    """
+    Ask an LLM (gpt-3.5-turbo or claude-3-haiku, chosen from MODEL_NAME) to summarize a file.
+
+    Returns an empty string when summaries are disabled (FILTER_RELEVANT_FILES
+    set to false/0/no/off) or when MODEL_NAME is not a GPT-4 / Claude 3 model,
+    "(empty)" for blank content, and "(unknown)" if the request or the parsing
+    of its response fails (the error is logged, not raised).
+
+    :param project: project object, passed through to `create_gpt_chat_completion`
+    :param fpath: the file path shown to the model
+    :param content: the file content
+    :return: the summary, followed by " [References: ...]" if any were reported
+    """
+    if os.getenv('FILTER_RELEVANT_FILES', '').lower().strip() in ['false', '0', 'no', 'off']:
+        return ''
+
+    # Use the same default as create_gpt_chat_completion(); without it an unset
+    # MODEL_NAME is None and the .startswith() calls below raise AttributeError.
+    model_name = os.getenv("MODEL_NAME", "gpt-4")
+    if model_name.startswith(("gpt-4", "openai/gpt-4")):
+        model_name = "gpt-3.5-turbo"
+    elif model_name.startswith(("claude-3", "anthropic/claude-3")):
+        model_name = "anthropic/claude-3-haiku-20240307"
+    else:
+        # Unknown default model (possibly local LLM), disable file summaries
+        return ''
+
+    if not content or not content.strip():
+        return "(empty)"
+
+    logger.info("Calling %s to summarize file %s", model_name, fpath)
+    try:
+        response_text = create_gpt_chat_completion(
+            _get_describe_messages(fpath, content),
+            'project_description',
+            project,
+            function_calls=DESCRIBE_FILE,
+            temperature=0,
+            model_name=model_name,
+        )
+        response = json.loads(response_text['text'])
+        refs = (" [References: " + ", ".join(response["references"]) + "]") if response.get("references") else ""
+        return f"{response['summary']}{refs}"
+    except Exception as err:
+        logger.error("Error summarizing %s: %s", fpath, err, exc_info=True)
+        return '(unknown)'
--- a/pilot/utils/llm_connection.py
+++ b/pilot/utils/llm_connection.py
@@ -93,7 +93,8 @@ def test_api_access(project) -> bool:
 def create_gpt_chat_completion(messages: List[dict], req_type, project,
                               function_calls: FunctionCallSet = None,
                               prompt_data: dict = None,
-                               temperature: float = 0.7):
+                               temperature: float = 0.7,
+                               model_name: str = None):
    """
    Called from:
      - AgentConvo.send_message() - these calls often have `function_calls`, usually from `pilot/const/function_calls.py`
@@ -109,7 +110,9 @@ def create_gpt_chat_completion(messages: List[dict], req_type, project,
             {'function_calls': {'name': str, arguments: {...}}}
    """

-    model_name = os.getenv('MODEL_NAME', 'gpt-4')
+    if model_name is None:
+        model_name = os.getenv('MODEL_NAME', 'gpt-4')
+
    gpt_data = {
        'model': model_name,
        'n': 1,
@@ -145,7 +148,7 @@ def create_gpt_chat_completion(messages: List[dict], req_type, project,
                os.environ['ANTHROPIC_API_KEY'] = os.getenv('OPENAI_API_KEY')
            response = stream_anthropic(messages, function_call_message, gpt_data, model_name)
        else:
-            response = stream_gpt_completion(gpt_data, req_type, project)
+            response = stream_gpt_completion(gpt_data, req_type, project, model_name)

        # Remove JSON schema and any added retry messages
        while len(messages) > messages_length:
@@ -364,12 +367,13 @@ def trace_token_limit_error(request_tokens: int, messages: list[dict], err_str:


@retry_on_exception
-def stream_gpt_completion(data, req_type, project):
+def stream_gpt_completion(data, req_type, project, model=None):
    """
    Called from create_gpt_chat_completion()
    :param data:
    :param req_type: 'project_description' etc. See common.STEPS
    :param project: NEEDED FOR WRAPPER FUNCTION retry_on_exception
+    :param model: (optional) model name
    :return: {'text': str} or {'function_calls': {'name': str, arguments: '{...}'}}
    """
    # TODO add type dynamically - this isn't working when connected to the external process
@@ -410,7 +414,8 @@ def stream_gpt_completion(data, req_type, project):
    # print(yellow("Stream response from OpenAI:"))

    # Configure for the selected ENDPOINT
-    model = os.getenv('MODEL_NAME', 'gpt-4')
+    if model is None:
+        model = os.getenv('MODEL_NAME', 'gpt-4')
    endpoint = os.getenv('ENDPOINT')

    logger.info(f'> Request model: {model}')