merge main

This commit is contained in:
LeonOstrez
2024-06-10 12:28:39 +01:00
39 changed files with 682 additions and 55 deletions

View File

@@ -151,6 +151,15 @@ any adjustments if needed).
This will start two containers, one being a new image built by the `Dockerfile` and a Postgres database. The new image also has [ttyd](https://github.com/tsl0922/ttyd) installed so that you can easily interact with gpt-pilot. Node is also installed on the image and port 3000 is exposed.
### PostgreSQL support
GPT Pilot uses the built-in SQLite database by default. If you want to use the PostgreSQL database, you need to additionally install the `asyncpg` and `psycopg2` packages:
```bash
pip install asyncpg psycopg2
```
Then, you need to update the `config.json` file to set `db.url` to `postgresql+asyncpg://<user>:<password>@<db-host>/<db-name>`.
# 🧑‍💻️ CLI arguments

View File

@@ -133,6 +133,7 @@ class Developer(BaseAgent):
user_feedback=user_feedback,
user_feedback_qa=None,
next_solution_to_try=None,
docs=self.current_state.docs,
)
.assistant(description)
.template("parse_task")
@@ -168,17 +169,14 @@ class Developer(BaseAgent):
log.debug(f"Current state files: {len(self.current_state.files)}, relevant {self.current_state.relevant_files}")
# Check which files are relevant to the current task
if self.current_state.files and not self.current_state.relevant_files:
if self.current_state.files and self.current_state.relevant_files is None:
await self.get_relevant_files()
current_task_index = self.current_state.tasks.index(task)
llm = self.get_llm()
convo = AgentConvo(self).template(
"breakdown",
task=task,
iteration=None,
current_task_index=current_task_index,
"breakdown", task=task, iteration=None, current_task_index=current_task_index, docs=self.current_state.docs
)
response: str = await llm(convo)
@@ -302,7 +300,7 @@ class Developer(BaseAgent):
self.next_state.current_task["description"] = user_response.text
self.next_state.current_task["run_always"] = True
self.next_state.relevant_files = []
self.next_state.relevant_files = None
log.info(f"Task description updated to: {user_response.text}")
# Orchestrator will rerun us with the new task description
return False

View File

@@ -74,6 +74,16 @@ class ErrorHandler(BaseAgent):
if not cmd:
raise ValueError("No command provided in command error response details")
confirm = await self.ask_question(
"Can I debug why this command failed?",
buttons={"yes": "Yes", "no": "No"},
default="yes",
buttons_only=True,
)
if confirm.cancelled or confirm.button == "no":
log.info("Skipping command error debug (requested by user)")
return AgentResponse.done(self)
llm = self.get_llm()
convo = AgentConvo(self).template(
"debug",

View File

@@ -0,0 +1,160 @@
import asyncio
from urllib.parse import urljoin
import httpx
from pydantic import BaseModel
from core.agents.base import BaseAgent
from core.agents.convo import AgentConvo
from core.agents.response import AgentResponse
from core.config import EXTERNAL_DOCUMENTATION_API
from core.llm.parser import JSONParser
from core.log import get_logger
from core.telemetry import telemetry
log = get_logger(__name__)
class DocQueries(BaseModel):
    """LLM response schema: list of queries to run against a single docset."""

    queries: list[str]
class SelectedDocsets(BaseModel):
    """LLM response schema: keys of the docsets selected as relevant for the task."""

    docsets: list[str]
class ExternalDocumentation(BaseAgent):
    """Agent in charge of collecting and storing additional documentation.

    Docs are per task and are stored in the `docs` field of the project state
    (see `_store_docs()`). This agent ensures documentation is collected only
    once per task.

    Agent does 2 LLM interactions:
    1. Ask the LLM to select useful documentation from a predefined list.
    2. Ask the LLM to come up with a query to use to fetch the actual documentation snippets.

    Agent does 2 calls to our documentation API:
    1. Fetch all the available docsets. `docset` is a collection of documentation snippets
       for a single topic, eg. VueJS API Reference docs.
    2. Fetch the documentation snippets for given queries.
    """

    agent_type = "external-docs"
    display_name = "Documentation"

    async def run(self) -> AgentResponse:
        available_docsets = await self._get_available_docsets()
        selected_docsets = await self._select_docsets(available_docsets)
        await telemetry.trace_code_event("docsets_used", selected_docsets)

        if not selected_docsets:
            log.info("No documentation selected for this task.")
            # Store an empty list (instead of leaving None) so this task is
            # marked as "documentation collected" and we don't run again.
            await self._store_docs([], available_docsets)
            return AgentResponse.done(self)

        queries = await self._create_queries(selected_docsets)
        doc_snippets = await self._fetch_snippets(queries)
        await telemetry.trace_code_event("doc_snippets", {"num_stored": len(doc_snippets)})

        await self._store_docs(doc_snippets, available_docsets)
        return AgentResponse.done(self)

    async def _get_available_docsets(self) -> list[tuple]:
        """Fetch the list of available docsets from the documentation API.

        :return: List of (docset_key, description) pairs, or an empty list on any error.
        """
        url = urljoin(EXTERNAL_DOCUMENTATION_API, "docsets")
        # NOTE(review): this is a synchronous HTTP call inside an async method, so
        # it blocks the event loop while fetching — consider httpx.AsyncClient.
        try:
            # Use the client as a context manager so the connection pool is closed
            # (the original leaked it), and fail on non-2xx responses too.
            with httpx.Client(transport=httpx.HTTPTransport(retries=3)) as client:
                resp = client.get(url)
                resp.raise_for_status()
                docsets = resp.json()
        except httpx.HTTPError:
            # In case of any errors, we'll proceed without the documentation
            log.warning("Failed to fetch available docsets due to an error.", exc_info=True)
            return []

        log.debug(f"Fetched {len(docsets)} docsets.")
        return docsets

    async def _select_docsets(self, available_docsets: list[tuple]) -> dict[str, str]:
        """From a list of available docsets, select the relevant ones.

        :param available_docsets: List of (docset_key, description) pairs.
        :return: Mapping of docset_key -> description for the selected docsets.
        """
        if not available_docsets:
            return {}

        llm = self.get_llm()
        convo = (
            AgentConvo(self)
            .template(
                "select_docset",
                current_task=self.current_state.current_task,
                available_docsets=available_docsets,
            )
            .require_schema(SelectedDocsets)
        )
        await self.send_message("Determining if external documentation is needed for the next task...")
        llm_response: SelectedDocsets = await llm(convo, parser=JSONParser(spec=SelectedDocsets))
        available_docsets = dict(available_docsets)
        # Guard against hallucinated keys: only keep docsets we actually have.
        return {k: available_docsets[k] for k in llm_response.docsets if k in available_docsets}

    async def _create_queries(self, docsets: dict[str, str]) -> dict[str, list[str]]:
        """Return queries we have to make to the docs API.

        Key is the docset_key and value is the list of queries for that docset.

        :param docsets: Mapping of docset_key -> short description.
        :return: Mapping of docset_key -> list of query strings (empty queries are dropped).
        """
        queries = {}
        await self.send_message("Getting relevant documentation for the following topics:")
        for k, short_desc in docsets.items():
            llm = self.get_llm()
            convo = (
                AgentConvo(self)
                .template(
                    "create_docs_queries",
                    short_description=short_desc,
                    current_task=self.current_state.current_task,
                )
                .require_schema(DocQueries)
            )
            llm_response: DocQueries = await llm(convo, parser=JSONParser(spec=DocQueries))
            if llm_response.queries:
                queries[k] = llm_response.queries

        return queries

    async def _fetch_snippets(self, queries: dict[str, list[str]]) -> list[tuple]:
        """Query the docs API and fetch the documentation snippets.

        :param queries: Mapping of docset_key -> list of queries for that docset.
        :return: List of tuples (docset_key, snippets); empty if fetching failed.
        """
        url = urljoin(EXTERNAL_DOCUMENTATION_API, "query")
        snippets: list[tuple] = []
        async with httpx.AsyncClient(transport=httpx.AsyncHTTPTransport(retries=3)) as client:
            reqs = []
            ordered_keys = []
            for docset_key, qs in queries.items():
                reqs.append(client.get(url, params={"q": qs, "doc_key": docset_key, "num_results": 3}))
                ordered_keys.append(docset_key)
            try:
                results = await asyncio.gather(*reqs)
            except httpx.HTTPError:
                log.warning("Failed to fetch documentation snippets", exc_info=True)
                # Bug fix: the original fell through here and used the undefined
                # `results` variable, raising NameError. Proceed without docs instead.
                return snippets

            for k, res in zip(ordered_keys, results):
                snippets.append((k, res.json()))
        return snippets

    async def _store_docs(self, snippets: list[tuple], available_docsets: list[tuple]):
        """Store the snippets into current task data.

        Documentation snippets are stored as a list of dictionaries:
        {"key": docset-key, "desc": documentation-description, "snippets": list-of-snippets}

        :param snippets: List of (docset_key, snippets) tuples.
        :param available_docsets: List of (docset_key, description) pairs, used for descriptions.
        """
        docsets_dict = dict(available_docsets)
        docs = []
        for docset_key, snip in snippets:
            docs.append({"key": docset_key, "desc": docsets_dict[docset_key], "snippets": snip})

        self.next_state.docs = docs
        self.next_state.flag_tasks_as_modified()

86
core/agents/importer.py Normal file
View File

@@ -0,0 +1,86 @@
from uuid import uuid4
from core.agents.base import BaseAgent
from core.agents.convo import AgentConvo
from core.agents.response import AgentResponse, ResponseType
from core.db.models import Complexity
from core.llm.parser import JSONParser
from core.log import get_logger
from core.templates.example_project import EXAMPLE_PROJECT_DESCRIPTION
log = get_logger(__name__)
MAX_PROJECT_LINES = 10000
class Importer(BaseAgent):
    """Agent that imports an existing project and reverse-engineers its specification."""

    agent_type = "importer"
    # Typo fix: was "Project Analyist"
    display_name = "Project Analyst"

    async def run(self) -> AgentResponse:
        if self.prev_response and self.prev_response.type == ResponseType.IMPORT_PROJECT:
            # Called by SpecWriter to start the import process
            await self.start_import_process()
            return AgentResponse.describe_files(self)

        await self.analyze_project()
        return AgentResponse.done(self)

    async def start_import_process(self):
        """Ask the user to copy their project into the workspace, then import the files."""
        # TODO: Send a signal to the UI to copy the project files to workspace
        project_root = self.state_manager.get_full_project_root()
        await self.ui.import_project(project_root)
        await self.send_message(
            f"This is an experimental feature and is currently limited to projects with size up to {MAX_PROJECT_LINES} lines of code."
        )
        await self.ask_question(
            f"Please copy your project files to {project_root} and press Continue",
            allow_empty=False,
            buttons={
                "continue": "Continue",
            },
            buttons_only=True,
            default="continue",
        )

        imported_files, _ = await self.state_manager.import_files()
        imported_lines = sum(len(f.content.content.splitlines()) for f in imported_files)
        if imported_lines > MAX_PROJECT_LINES:
            # Bug fix: this string was missing the "f" prefix, so the user saw the
            # literal "{imported_lines}" instead of the actual line count.
            await self.send_message(
                f"WARNING: Your project ({imported_lines} LOC) is larger than supported and may cause issues in Pythagora."
            )
        await self.state_manager.commit()

    async def analyze_project(self):
        """Identify the project's entry-point files and reverse-engineer its spec."""
        llm = self.get_llm()

        # Bug fix: send_message() is a coroutine and was not awaited here (or below).
        await self.send_message("Inspecting most important project files ...")

        convo = AgentConvo(self).template("get_entrypoints")
        llm_response = await llm(convo, parser=JSONParser())
        # The "get_entrypoints" prompt asks for JSON like {"files": [...]}; the
        # original tested `f.path in llm_response`, which checks *dict keys* and
        # would never match a file path. Unwrap the list if we got the object.
        entrypoints = llm_response.get("files", []) if isinstance(llm_response, dict) else llm_response
        relevant_files = [f for f in self.current_state.files if f.path in entrypoints]

        await self.send_message("Analyzing project ...")

        convo = AgentConvo(self).template(
            "analyze_project", relevant_files=relevant_files, example_spec=EXAMPLE_PROJECT_DESCRIPTION
        )
        llm_response = await llm(convo)

        spec = self.current_state.specification.clone()
        spec.description = llm_response
        self.next_state.specification = spec
        # Record the import as a single already-completed epic with no tasks.
        self.next_state.epics = [
            {
                "id": uuid4().hex,
                "name": "Import project",
                "description": "Import an existing project into Pythagora",
                "tasks": [],
                "completed": True,
                "test_instructions": None,
                "source": "app",
                "summary": None,
                # NOTE(review): ">5 files == HARD" is a rough heuristic — confirm intent.
                "complexity": Complexity.HARD if len(self.current_state.files) > 5 else Complexity.SIMPLE,
            }
        ]

View File

@@ -7,7 +7,9 @@ from core.agents.code_reviewer import CodeReviewer
from core.agents.developer import Developer
from core.agents.error_handler import ErrorHandler
from core.agents.executor import Executor
from core.agents.external_docs import ExternalDocumentation
from core.agents.human_input import HumanInput
from core.agents.importer import Importer
from core.agents.problem_solver import ProblemSolver
from core.agents.response import AgentResponse, ResponseType
from core.agents.spec_writer import SpecWriter
@@ -175,10 +177,16 @@ class Orchestrator(BaseAgent):
return HumanInput(self.state_manager, self.ui, prev_response=prev_response)
if prev_response.type == ResponseType.TASK_REVIEW_FEEDBACK:
return Developer(self.state_manager, self.ui, prev_response=prev_response)
if prev_response.type == ResponseType.IMPORT_PROJECT:
return Importer(self.state_manager, self.ui, prev_response=prev_response)
if not state.specification.description:
# Ask the Spec Writer to refine and save the project specification
return SpecWriter(self.state_manager, self.ui)
if state.files:
# The project has been imported, but not analyzed yet
return Importer(self.state_manager, self.ui)
else:
# New project: ask the Spec Writer to refine and save the project specification
return SpecWriter(self.state_manager, self.ui)
elif not state.specification.architecture:
# Ask the Architect to design the project architecture and determine dependencies
return Architect(self.state_manager, self.ui, process_manager=self.process_manager)
@@ -189,10 +197,12 @@ class Orchestrator(BaseAgent):
):
# Ask the Tech Lead to break down the initial project or feature into tasks and apply project template
return TechLead(self.state_manager, self.ui, process_manager=self.process_manager)
elif not state.steps and not state.iterations:
# Ask the Developer to break down current task into actionable steps
return Developer(self.state_manager, self.ui)
if state.current_task and state.docs is None:
return ExternalDocumentation(self.state_manager, self.ui)
# Current task status must be checked before Developer is called because we might want
# to skip it instead of breaking it down
current_task_status = state.current_task.get("status") if state.current_task else None
if current_task_status:
# Status of the current task is set first time after the task was reviewed by user
@@ -207,6 +217,10 @@ class Orchestrator(BaseAgent):
# Task is fully done or skipped, call TaskCompleter to mark it as completed
return TaskCompleter(self.state_manager, self.ui)
if not state.steps and not state.iterations:
# Ask the Developer to break down current task into actionable steps
return Developer(self.state_manager, self.ui)
if state.current_step:
# Execute next step in the task
# TODO: this can be parallelized in the future

View File

@@ -39,6 +39,9 @@ class ResponseType(str, Enum):
TASK_REVIEW_FEEDBACK = "task-review-feedback"
"""Agent is providing feedback on the entire task."""
IMPORT_PROJECT = "import-project"
"""User wants to import an existing project."""
class AgentResponse:
type: ResponseType = ResponseType.DONE
@@ -130,3 +133,7 @@ class AgentResponse:
"feedback": feedback,
},
)
@staticmethod
def import_project(agent: "BaseAgent") -> "AgentResponse":
return AgentResponse(type=ResponseType.IMPORT_PROJECT, agent=agent)

View File

@@ -31,11 +31,15 @@ class SpecWriter(BaseAgent):
# FIXME: must be lowercase because VSCode doesn't recognize it otherwise. Needs a fix in the extension
"continue": "continue",
"example": "Start an example project",
"import": "Import an existing project",
},
)
if response.cancelled:
return AgentResponse.error(self, "No project description")
if response.button == "import":
return AgentResponse.import_project(self)
if response.button == "example":
await self.send_message("Starting example project with description:")
await self.send_message(EXAMPLE_PROJECT_DESCRIPTION)

View File

@@ -90,7 +90,7 @@ class TechLead(BaseAgent):
)
# Saving template files will fill this in and we want it clear for the
# first task.
self.next_state.relevant_files = []
self.next_state.relevant_files = None
return summary
async def ask_for_new_feature(self) -> AgentResponse:

View File

@@ -11,7 +11,7 @@ from core.llm.base import APIError, BaseLLMClient
from core.log import get_logger
from core.state.state_manager import StateManager
from core.telemetry import telemetry
from core.ui.base import UIBase, UIClosedError, pythagora_source
from core.ui.base import UIBase, UIClosedError, UserInput, pythagora_source
log = get_logger(__name__)
@@ -112,7 +112,15 @@ async def start_new_project(sm: StateManager, ui: UIBase) -> bool:
:param ui: User interface.
:return: True if the project was created successfully, False otherwise.
"""
user_input = await ui.ask_question("What is the project name?", allow_empty=False, source=pythagora_source)
try:
user_input = await ui.ask_question(
"What is the project name?",
allow_empty=False,
source=pythagora_source,
)
except (KeyboardInterrupt, UIClosedError):
user_input = UserInput(cancelled=True)
if user_input.cancelled:
return False

View File

@@ -1,6 +1,6 @@
from enum import Enum
from os.path import abspath, dirname, isdir, join
from typing import Literal, Optional, Union
from typing import Any, Literal, Optional, Union
from pydantic import BaseModel, ConfigDict, Field, field_validator
from typing_extensions import Annotated
@@ -18,6 +18,7 @@ DEFAULT_IGNORE_PATHS = [
"node_modules",
"package-lock.json",
"venv",
".venv",
"dist",
"build",
"target",
@@ -34,6 +35,9 @@ IGNORE_SIZE_THRESHOLD = 50000 # 50K+ files are ignored by default
DEFAULT_AGENT_NAME = "default"
DESCRIBE_FILES_AGENT_NAME = "CodeMonkey.describe_files"
# Endpoint for the external documentation
EXTERNAL_DOCUMENTATION_API = "http://docs-pythagora-io-439719575.us-east-1.elb.amazonaws.com"
class _StrictModel(BaseModel):
"""
@@ -54,6 +58,7 @@ class LLMProvider(str, Enum):
ANTHROPIC = "anthropic"
GROQ = "groq"
LM_STUDIO = "lm-studio"
AZURE = "azure"
class UIAdapter(str, Enum):
@@ -88,6 +93,10 @@ class ProviderConfig(_StrictModel):
description="Timeout (in seconds) for receiving a new chunk of data from the response stream",
ge=0.0,
)
extra: Optional[dict[str, Any]] = Field(
None,
description="Extra provider-specific configuration",
)
class AgentLLMConfig(_StrictModel):
@@ -139,6 +148,10 @@ class LLMConfig(_StrictModel):
description="Timeout (in seconds) for receiving a new chunk of data from the response stream",
ge=0.0,
)
extra: Optional[dict[str, Any]] = Field(
None,
description="Extra provider-specific configuration",
)
@classmethod
def from_provider_and_agent_configs(cls, provider: ProviderConfig, agent: AgentLLMConfig):
@@ -150,6 +163,7 @@ class LLMConfig(_StrictModel):
temperature=agent.temperature,
connect_timeout=provider.connect_timeout,
read_timeout=provider.read_timeout,
extra=provider.extra,
)
@@ -212,6 +226,12 @@ class DBConfig(_StrictModel):
def validate_url_scheme(cls, v: str) -> str:
if v.startswith("sqlite+aiosqlite://"):
return v
if v.startswith("postgresql+asyncpg://"):
try:
import asyncpg # noqa: F401
except ImportError:
raise ValueError("To use PostgreSQL database, please install `asyncpg` and `psycopg2` packages")
return v
raise ValueError(f"Unsupported database URL scheme in: {v}")

View File

@@ -0,0 +1,34 @@
"""Add docs column to project_states
Revision ID: b760f66138c0
Revises: f352dbe45751
Create Date: 2024-06-08 10:00:44.222099
"""
from typing import Sequence, Union
import sqlalchemy as sa
from alembic import op
# revision identifiers, used by Alembic.
revision: str = "b760f66138c0"
down_revision: Union[str, None] = "f352dbe45751"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    """Add the nullable JSON column `docs` to the `project_states` table."""
    # ### commands auto generated by Alembic - please adjust! ###
    # batch_alter_table is used so the change also works on SQLite, which
    # doesn't support most in-place ALTER TABLE operations.
    with op.batch_alter_table("project_states", schema=None) as batch_op:
        batch_op.add_column(sa.Column("docs", sa.JSON(), nullable=True))
    # ### end Alembic commands ###
def downgrade() -> None:
    """Drop the `docs` column added by this revision's upgrade."""
    # ### commands auto generated by Alembic - please adjust! ###
    with op.batch_alter_table("project_states", schema=None) as batch_op:
        batch_op.drop_column("docs")
    # ### end Alembic commands ###

View File

@@ -0,0 +1,34 @@
"""Make relevant_files nullable
Revision ID: f352dbe45751
Revises: 0a1bb637fa26
Create Date: 2024-06-04 15:07:40.175466
"""
from typing import Sequence, Union
from alembic import op
from sqlalchemy.dialects import sqlite
# revision identifiers, used by Alembic.
revision: str = "f352dbe45751"
down_revision: Union[str, None] = "0a1bb637fa26"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    """Make the `project_states.relevant_files` JSON column nullable."""
    # ### commands auto generated by Alembic - please adjust! ###
    with op.batch_alter_table("project_states", schema=None) as batch_op:
        batch_op.alter_column("relevant_files", existing_type=sqlite.JSON(), nullable=True)
    # ### end Alembic commands ###
def downgrade() -> None:
    """Restore the NOT NULL constraint on `project_states.relevant_files`."""
    # ### commands auto generated by Alembic - please adjust! ###
    # NOTE(review): this will fail if any rows hold NULL relevant_files;
    # backfill those rows (e.g. with an empty list) before downgrading.
    with op.batch_alter_table("project_states", schema=None) as batch_op:
        batch_op.alter_column("relevant_files", existing_type=sqlite.JSON(), nullable=False)
    # ### end Alembic commands ###

View File

@@ -4,7 +4,7 @@ from typing import TYPE_CHECKING, Optional, Union
from unicodedata import normalize
from uuid import UUID, uuid4
from sqlalchemy import delete, inspect, select
from sqlalchemy import and_, delete, inspect, select
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.orm import Mapped, mapped_column, relationship, selectinload
from sqlalchemy.sql import func
@@ -79,7 +79,7 @@ class Project(Base):
from core.db.models import Branch, ProjectState
latest_state_query = (
select(ProjectState.branch_id, func.max(ProjectState.id).label("max_id"))
select(ProjectState.branch_id, func.max(ProjectState.step_index).label("max_index"))
.group_by(ProjectState.branch_id)
.subquery()
)
@@ -88,7 +88,13 @@ class Project(Base):
select(Project, Branch, ProjectState)
.join(Branch, Project.branches)
.join(ProjectState, Branch.states)
.join(latest_state_query, ProjectState.id == latest_state_query.columns.max_id)
.join(
latest_state_query,
and_(
ProjectState.branch_id == latest_state_query.columns.branch_id,
ProjectState.step_index == latest_state_query.columns.max_index,
),
)
.options(selectinload(Project.branches), selectinload(Branch.states))
.order_by(Project.name, Branch.name)
)

View File

@@ -51,8 +51,9 @@ class ProjectState(Base):
tasks: Mapped[list[dict]] = mapped_column(default=list)
steps: Mapped[list[dict]] = mapped_column(default=list)
iterations: Mapped[list[dict]] = mapped_column(default=list)
relevant_files: Mapped[list[str]] = mapped_column(default=list)
relevant_files: Mapped[Optional[list[str]]] = mapped_column(default=None)
modified_files: Mapped[dict] = mapped_column(default=dict)
docs: Mapped[Optional[list[dict]]] = mapped_column(default=None)
run_command: Mapped[Optional[str]] = mapped_column()
action: Mapped[Optional[str]] = mapped_column()
@@ -167,7 +168,10 @@ class ProjectState(Base):
:return: List of tuples with file path and content.
"""
all_files = set(self.relevant_files + list(self.modified_files.keys()))
relevant_files = self.relevant_files or []
modified_files = self.modified_files or {}
all_files = set(relevant_files + list(modified_files.keys()))
return [file for file in self.files if file.path in all_files]
@staticmethod
@@ -219,6 +223,7 @@ class ProjectState(Base):
files=[],
relevant_files=deepcopy(self.relevant_files),
modified_files=deepcopy(self.modified_files),
docs=deepcopy(self.docs),
run_command=self.run_command,
)
@@ -254,8 +259,9 @@ class ProjectState(Base):
self.set_current_task_status(TaskStatus.DONE)
self.steps = []
self.iterations = []
self.relevant_files = []
self.relevant_files = None
self.modified_files = {}
self.docs = None
flag_modified(self, "tasks")
if not self.unfinished_tasks and self.unfinished_epics:
@@ -362,6 +368,8 @@ class ProjectState(Base):
if path not in self.modified_files and not external:
self.modified_files[path] = original_content
self.relevant_files = self.relevant_files or []
if path not in self.relevant_files:
self.relevant_files.append(path)

29
core/llm/azure_client.py Normal file
View File

@@ -0,0 +1,29 @@
from httpx import Timeout
from openai import AsyncAzureOpenAI
from core.config import LLMProvider
from core.llm.openai_client import OpenAIClient
from core.log import get_logger
log = get_logger(__name__)
class AzureClient(OpenAIClient):
    """OpenAI-compatible LLM client for Azure OpenAI deployments."""

    provider = LLMProvider.AZURE
    # Disable stream_options for Azure; the base OpenAIClient only sends the
    # parameter when this is truthy (presumably Azure rejects it — confirm).
    stream_options = None

    def _init_client(self):
        # Bug fix: `extra` is Optional in the provider config (defaults to None),
        # so `self.config.extra.get(...)` raised AttributeError when unset.
        extra = self.config.extra or {}
        azure_deployment = extra.get("azure_deployment")
        api_version = extra.get("api_version")
        self.client = AsyncAzureOpenAI(
            api_key=self.config.api_key,
            azure_endpoint=self.config.base_url,
            azure_deployment=azure_deployment,
            api_version=api_version,
            timeout=Timeout(
                # Overall timeout must cover the slower of the two phases.
                max(self.config.connect_timeout, self.config.read_timeout),
                connect=self.config.connect_timeout,
                read=self.config.read_timeout,
            ),
        )

View File

@@ -316,6 +316,7 @@ class BaseLLMClient:
:return: Client class for the specified provider.
"""
from .anthropic_client import AnthropicClient
from .azure_client import AzureClient
from .groq_client import GroqClient
from .openai_client import OpenAIClient
@@ -325,6 +326,8 @@ class BaseLLMClient:
return AnthropicClient
elif provider == LLMProvider.GROQ:
return GroqClient
elif provider == LLMProvider.AZURE:
return AzureClient
else:
raise ValueError(f"Unsupported LLM provider: {provider.value}")

View File

@@ -17,6 +17,7 @@ tokenizer = tiktoken.get_encoding("cl100k_base")
class OpenAIClient(BaseLLMClient):
provider = LLMProvider.OPENAI
stream_options = {"include_usage": True}
def _init_client(self):
self.client = AsyncOpenAI(
@@ -40,10 +41,10 @@ class OpenAIClient(BaseLLMClient):
"messages": convo.messages,
"temperature": self.config.temperature if temperature is None else temperature,
"stream": True,
"stream_options": {
"include_usage": True,
},
}
if self.stream_options:
completion_kwargs["stream_options"] = self.stream_options
if json_mode:
completion_kwargs["response_format"] = {"type": "json_object"}

View File

@@ -14,7 +14,7 @@ Output the result in a JSON format with the following structure, as in this exam
Example:
{
"summary": "Describe in detail the functionality being defind o implemented in this file. Be as detailed as possible",
"summary": "Describe in detail the functionality being defined or implemented in this file. Be as detailed as possible",
"references": [
"some/file.py",
"some/other/file.js"

View File

@@ -19,6 +19,8 @@ You are currently working on task #{{ current_task_index + 1 }} with the followi
Now, tell me all the code that needs to be written to implement ONLY this task and have it fully working and all commands that need to be run to implement this task.
{% include "partials/doc_snippets.prompt" %}
**IMPORTANT**
{%- if state.epics|length == 1 %}
Remember, I created an empty folder where I will start writing files that you tell me and that are needed for this app.

View File

@@ -0,0 +1,14 @@
{% include "partials/project_details.prompt" %}
Here is the next task that needs to be implemented:
```
{{ current_task.description }}
```
Here is the list of the libraries, frameworks and APIs for which we have documentation available. The documentation is given in a sequence of pairs, one pair per line. First item in the pair is the documentation key. Second item is the short description of what that documentation contains.
Here's an example for React API documentation:
"react-api-ref", "React API Reference documentation"
We have additional documentation from "{{ short_description }}" that might be useful for completing this task.
Now, give me a summary of what specifically from the {{ short_description }} you think would be useful for completing this task. Please provide only the topics of interest, no additional text. Only return the topics relevant to the actual implementation, NOT the topics related to library installation and setup, environment setup, database setup and similar. Return the topics in JSON format, as a list of strings, WITHOUT any additional formatting such as backticks, bullets and similar. Return a maximum of 3 topics you think would be most useful.

View File

@@ -0,0 +1,15 @@
{% include "partials/project_details.prompt" %}
Here is the next task that needs to be implemented:
{{ current_task.description }}
Here is the list of the libraries, frameworks and APIs for which we have documentation available. The documentation is given in a sequence of pairs, one pair per line. First item in the pair is the documentation key. Second item is the short description of what that documentation contains.
Here's an example for React API documentation:
"react-api-ref", "React API Reference documentation"
Here is the list of available documentations:
{% for docset in available_docsets %}
{{ docset[0], docset[1] }}
{% endfor %}
Now, give me the list of the additional documentation that you would like to use to complete the task listed above. Return only the documentation that is absolutely required for the given task, and only from the list of available documentations provided above. If there is no additional documentation in the list that you would like to use, return an empty list.

View File

@@ -0,0 +1,3 @@
You are a world class full stack software developer working in a team.
Your job is to select the documentation that might be useful for implementing a task at hand.

View File

@@ -0,0 +1,28 @@
You're given an existing project you need to analyze and continue developing. To do this, you'll need to determine the project architecture, technologies used (platform, libraries, etc) and reverse-engineer the technical and functional spec.
Here is the list of all the files in the project:
{% for file in state.files %}
* `{{ file.path }}` - {{ file.meta.get("description")}}
{% endfor %}
Here's the full content of interesting files that may help you to determine the specification:
{% for file in state.files %}
**`{{ file.path }}`**:
```
{{ file.content.content }}
```
{% endfor %}
Based on this information, please provide detailed specification for the project. Here is an example specification format:
---START_OF_EXAMPLE_SPEC---
{{ example_spec }}
---END_OF_EXAMPLE_SPEC---
**IMPORTANT**: In the specification, you must include the following sections:
* **Project Description**: A detailed description of what the project is about.
* **Features**: A list of features that the project has implemented. Each feature should be described in detail.
* **Technical Specification**: Detailed description of how the project works, including any important technical details.

View File

@@ -0,0 +1,21 @@
You're given an existing project you need to analyze and continue developing. To do this, you'll need to determine the project architecture, technologies used (platform, libraries, etc) and reverse-engineer the technical and functional spec.
As a first step, you have to identify which of the listed files to examine so you can determine this. After you identify the files, you'll be given full access to their contents so you can determine the project information.
Here is the list of all the files in the project:
{% for file in state.files %}
* `{{ file.path }}` - {{ file.meta.get("description")}}
{% endfor %}
Based on this information, list the files (full path, as shown in the list) you would examine to determine the project architecture, technologies and specification. Output the list in JSON format like in the following example:
```json
{
"files": [
"README.md",
"pyproject.toml",
"settings/settings.py"
]
}
```

View File

@@ -0,0 +1,14 @@
{% if docs is defined and docs %}
We have some documentation snippets that might be helpful while working on this task; we will now list them.
---START_OF_DOCUMENTATION_SNIPPETS---
{% for d in docs %}
Documentation snippets from {{ d.desc }}:
{% for snippet in d.snippets %}
{{ snippet }}
{% endfor %}
{% endfor %}
---END_OF_DOCUMENTATION_SNIPPETS---
{% endif %}

View File

@@ -39,6 +39,7 @@ Focus on solving this issue in the following way:
{{ next_solution_to_try }}
```
{% endif %}
{% include "partials/doc_snippets.prompt" %}
Now, you have to debug this issue and comply with the additional user feedback.
**IMPORTANT**

View File

@@ -170,7 +170,6 @@ class StateManager:
self.branch = state.branch
self.project = state.branch.project
self.next_state = await state.create_next_state()
# TODO: overwrite files?
self.file_system = await self.init_file_system(load_existing=True)
log.debug(
f"Loaded project {self.project} ({self.project.id}) "
@@ -178,7 +177,7 @@ class StateManager:
f"step {state.step_index} (state id={state.id})"
)
if self.current_state.current_epic and self.ui:
if self.current_state.current_epic and self.current_state.current_task and self.ui:
source = self.current_state.current_epic.get("source", "app")
await self.ui.send_task_progress(
self.current_state.tasks.index(self.current_state.current_task) + 1,

View File

@@ -81,6 +81,8 @@ class Telemetry:
"model": config.agent["default"].model,
# Initial prompt
"initial_prompt": None,
# App complexity
"is_complex_app": None,
# Optional template used for the project
"template": None,
# Optional user contact email
@@ -89,6 +91,10 @@ class Telemetry:
"app_id": None,
# Project architecture
"architecture": None,
# Documentation sets used for a given task
"docsets_used": [],
# Number of documentation snippets stored for a given task
"doc_snippets_stored": 0,
}
if sys.platform == "linux":
try:

View File

@@ -263,6 +263,17 @@ class UIBase:
"""
raise NotImplementedError()
async def import_project(self, project_dir: str):
    """
    Ask the UI to import files from the project directory.

    The UI should provide a way for the user to select the directory with
    an existing project, and recursively copy its files over.

    :param project_dir: Project directory.
    :raises NotImplementedError: Always; concrete UI implementations must
        override this method.
    """
    raise NotImplementedError()
pythagora_source = UISource("Pythagora", "pythagora")
success_source = UISource("Congratulations", "success")

View File

@@ -1,5 +1,7 @@
from typing import Optional
from prompt_toolkit.shortcuts import PromptSession
from core.log import get_logger
from core.ui.base import ProjectStage, UIBase, UIClosedError, UISource, UserInput
@@ -57,9 +59,12 @@ class PlainConsoleUI(UIBase):
default_str = " (default)" if k == default else ""
print(f" [{k}]: {v}{default_str}")
session = PromptSession("> ")
while True:
try:
choice = input("> ").strip()
choice = await session.prompt_async(default=initial_text or "")
choice = choice.strip()
except KeyboardInterrupt:
raise UIClosedError()
if not choice and default:
@@ -118,5 +123,8 @@ class PlainConsoleUI(UIBase):
async def send_features_list(self, features: list[str]):
pass
async def import_project(self, project_dir: str):
    # Project import is not supported in the plain console UI; intentionally a no-op.
    pass
__all__ = ["PlainConsoleUI"]

View File

@@ -39,6 +39,7 @@ class MessageType(str, Enum):
LOADING_FINISHED = "loadingFinished"
PROJECT_DESCRIPTION = "projectDescription"
FEATURES_LIST = "featuresList"
IMPORT_PROJECT = "importProject"
class Message(BaseModel):
@@ -334,5 +335,8 @@ class IPCClientUI(UIBase):
async def send_features_list(self, features: list[str]):
await self._send(MessageType.FEATURES_LIST, content={"featuresList": features})
async def import_project(self, project_dir: str):
    # Forward an IMPORT_PROJECT message with the directory to the connected UI client.
    await self._send(MessageType.IMPORT_PROJECT, content={"project_dir": project_dir})
__all__ = ["IPCClientUI"]

View File

@@ -1,6 +1,6 @@
{
// Configuration for the LLM providers that can be used. Pythagora supports
// OpenAI, Anthropic and Groq. Azure and OpenRouter and local LLMs (such as LM-Studio)
// OpenAI, Azure, Anthropic and Groq. OpenRouter and local LLMs (such as LM-Studio)
// also work; you can use the "openai" provider to define these.
"llm": {
"openai": {
@@ -9,6 +9,17 @@
"api_key": null,
"connect_timeout": 60.0,
"read_timeout": 10.0
},
// Example config for Azure OpenAI (see https://learn.microsoft.com/en-us/azure/ai-services/openai/reference#chat-completions)
"azure": {
"base_url": "https://your-resource-name.openai.azure.com/",
"api_key": "your-api-key",
"connect_timeout": 60.0,
"read_timeout": 10.0,
"extra": {
"azure_deployment": "your-azure-deployment-id",
"api_version": "2024-02-01"
}
}
},
// Each agent can use a different model or configuration. The default, as before, is GPT4 Turbo

11
main.py
View File

@@ -6,20 +6,23 @@ import sys
try:
from core.cli.main import run_pythagora
except ImportError:
pythagora_root = os.path.dirname(os.path.dirname(__file__))
except ImportError as err:
pythagora_root = os.path.dirname(__file__)
venv_path = os.path.join(pythagora_root, "venv")
requirements_path = os.path.join(pythagora_root, "requirements.txt")
if sys.prefix == sys.base_prefix:
venv_python_path = os.path.join(venv_path, "scripts" if sys.platform == "win32" else "bin", "python")
print("Python environment for Pythagora is not set up.", file=sys.stderr)
print(f"Python environment for Pythagora is not set up: module `{err.name}` is missing.", file=sys.stderr)
print(f"Please create Python virtual environment: {sys.executable} -m venv {venv_path}", file=sys.stderr)
print(
f"Then install the required dependencies with: {venv_python_path} -m pip install -r {requirements_path}",
file=sys.stderr,
)
else:
print("Python environment for Pythagora is not completely set up.", file=sys.stderr)
print(
f"Python environment for Pythagora is not completely set up: module `{err.name}` is missing",
file=sys.stderr,
)
print(
f"Please run `{sys.executable} -m pip install -r {requirements_path}` to finish Python setup, and rerun Pythagora.",
file=sys.stderr,

View File

@@ -1,6 +1,6 @@
[tool.poetry]
name = "gpt-pilot"
version = "0.2.1"
version = "0.2.3"
description = "Build complete apps using AI agents"
authors = ["Senko Rasic <senko@pythagora.ai>"]
license = "FSL-1.1-MIT"
@@ -37,6 +37,7 @@ psutil = "^5.9.8"
httpx = "^0.27.0"
alembic = "^1.13.1"
python-dotenv = "^1.0.1"
prompt-toolkit = "^3.0.45"
[tool.poetry.group.dev.dependencies]
pytest = "^8.1.1"

View File

@@ -2,38 +2,40 @@ aiosqlite==0.20.0
alembic==1.13.1
annotated-types==0.7.0
anthropic==0.25.9
anyio==4.3.0
certifi==2024.2.2
anyio==4.4.0
certifi==2024.6.2
charset-normalizer==3.3.2
colorama==0.4.6
distro==1.9.0
exceptiongroup==1.2.1
filelock==3.14.0
fsspec==2024.5.0
fsspec==2024.6.0
greenlet==3.0.3
groq==0.6.0
h11==0.14.0
httpcore==1.0.5
httpx==0.27.0
huggingface-hub==0.23.1
huggingface-hub==0.23.2
idna==3.7
jinja2==3.1.4
mako==1.3.5
markupsafe==2.1.5
openai==1.30.1
openai==1.31.0
packaging==24.0
prompt-toolkit==3.0.46
psutil==5.9.8
pydantic-core==2.18.2
pydantic==2.7.1
pydantic-core==2.18.4
pydantic==2.7.3
python-dotenv==1.0.1
pyyaml==6.0.1
regex==2024.5.15
requests==2.32.2
requests==2.32.3
sniffio==1.3.1
sqlalchemy==2.0.30
sqlalchemy[asyncio]==2.0.30
tiktoken==0.6.0
tokenizers==0.19.1
tqdm==4.66.4
typing-extensions==4.11.0
typing-extensions==4.12.1
urllib3==2.2.1
wcwidth==0.2.13

View File

@@ -0,0 +1,50 @@
from unittest.mock import patch
import pytest
from httpx import HTTPError
from core.agents.external_docs import DocQueries, ExternalDocumentation, SelectedDocsets
@pytest.mark.asyncio
async def test_stores_documentation_snippets_for_task(agentcontext):
    """When the LLM selects a known docset, fetched snippets are stored in the next state."""
    sm, _, ui, mock_llm = agentcontext

    # Seed the project state with a single pending task so the agent has work to do.
    sm.current_state.tasks = [{"description": "Some VueJS task", "status": "todo"}]
    await sm.commit()

    ed = ExternalDocumentation(sm, ui)
    # First LLM call picks the docset, second LLM call produces the search queries.
    ed.get_llm = mock_llm(
        side_effect=[SelectedDocsets(docsets=["vuejs-api-ref"]), DocQueries(queries=["VueJS component model"])]
    )
    await ed.run()

    # The stored doc entry must be keyed by the selected docset.
    assert ed.next_state.docs[0]["key"] == "vuejs-api-ref"
@pytest.mark.asyncio
async def test_continues_without_docs_for_invalid_docset(agentcontext):
    """If the LLM selects a docset that doesn't exist, the agent continues with no docs."""
    sm, _, ui, mock_llm = agentcontext

    # Seed the project state with a single pending task so the agent has work to do.
    sm.current_state.tasks = [{"description": "Some VueJS task", "status": "todo"}]
    await sm.commit()

    ed = ExternalDocumentation(sm, ui)
    # "doesnt-exist" is not a valid docset key; the agent should tolerate this.
    ed.get_llm = mock_llm(
        side_effect=[SelectedDocsets(docsets=["doesnt-exist"]), DocQueries(queries=["VueJS component model"])]
    )
    await ed.run()

    # No snippets stored, but run() completed without raising.
    assert ed.next_state.docs == []
@pytest.mark.asyncio
async def test_continues_without_docs_if_api_is_down(agentcontext):
    """If the external docs HTTP API is unreachable, the agent continues with no docs."""
    sm, _, ui, _ = agentcontext

    # Seed the project state with a single pending task so the agent has work to do.
    sm.current_state.tasks = [{"description": "Future Task", "status": "todo"}]
    await sm.commit()

    ed = ExternalDocumentation(sm, ui)
    # Simulate the docs API being down: every HTTP GET raises an HTTPError.
    with patch("httpx.Client.get", side_effect=HTTPError("Failed")):
        await ed.run()

    # No snippets stored, but run() completed without raising.
    assert ed.next_state.docs == []

View File

@@ -80,14 +80,14 @@ async def test_create_next_deep_copies_fields(testdb):
next_state.tasks[0]["completed"] = True
next_state.iterations[0]["completed"] = True
next_state.steps[0]["completed"] = True
next_state.relevant_files.append("test.txt")
next_state.relevant_files = ["test.txt"]
next_state.modified_files["test.txt"] = "Hello World"
assert state.epics[0]["completed"] is False
assert state.tasks[0]["completed"] is False
assert state.iterations[0]["completed"] is False
assert state.steps[0]["completed"] is False
assert state.relevant_files == []
assert state.relevant_files is None
assert state.modified_files == {}

View File

@@ -1,4 +1,4 @@
from unittest.mock import patch
from unittest.mock import AsyncMock, patch
import pytest
@@ -35,8 +35,9 @@ async def test_stream(capsys):
@pytest.mark.asyncio
@patch("builtins.input", return_value="awesome")
async def test_ask_question_simple(mock_input):
@patch("core.ui.console.PromptSession")
async def test_ask_question_simple(mock_PromptSession):
prompt_async = mock_PromptSession.return_value.prompt_async = AsyncMock(return_value="awesome")
ui = PlainConsoleUI()
await ui.start()
@@ -48,12 +49,13 @@ async def test_ask_question_simple(mock_input):
await ui.stop()
mock_input.assert_called_once()
prompt_async.assert_awaited_once()
@pytest.mark.asyncio
@patch("builtins.input", return_value="yes")
async def test_ask_question_with_buttons(mock_input):
@patch("core.ui.console.PromptSession")
async def test_ask_question_with_buttons(mock_PromptSession):
prompt_async = mock_PromptSession.return_value.prompt_async = AsyncMock(return_value="yes")
ui = PlainConsoleUI()
await ui.start()
@@ -68,12 +70,13 @@ async def test_ask_question_with_buttons(mock_input):
await ui.stop()
mock_input.assert_called_once()
prompt_async.assert_awaited_once()
@pytest.mark.asyncio
@patch("builtins.input", side_effect=KeyboardInterrupt())
async def test_ask_question_interrupted(mock_input):
@patch("core.ui.console.PromptSession")
async def test_ask_question_interrupted(mock_PromptSession):
prompt_async = mock_PromptSession.return_value.prompt_async = AsyncMock(side_effect=KeyboardInterrupt)
ui = PlainConsoleUI()
await ui.start()
@@ -82,4 +85,4 @@ async def test_ask_question_interrupted(mock_input):
await ui.stop()
mock_input.assert_called_once()
prompt_async.assert_awaited_once()