daeisbae
diff --git a/‎.env.example
Lines changed: 3 additions & 1 deletion b/‎.env.example
Lines changed: 3 additions & 1 deletion
diff --git a/‎README.md
Lines changed: 9 additions & 1 deletion b/‎README.md
Lines changed: 9 additions & 1 deletion
diff --git a/‎backend/Dockerfile
Lines changed: 22 additions & 0 deletions b/‎backend/Dockerfile
Lines changed: 22 additions & 0 deletions
diff --git a/‎backend/__init__.py b/‎backend/__init__.py
diff --git a/‎backend/agent/__init__.py b/‎backend/agent/__init__.py
diff --git a/‎backend/agent/code_splitter.py
Lines changed: 125 additions & 0 deletions b/‎backend/agent/code_splitter.py
Lines changed: 125 additions & 0 deletions
diff --git a/‎backend/agent/index.py
Lines changed: 89 additions & 0 deletions b/‎backend/agent/index.py
Lines changed: 89 additions & 0 deletions
diff --git a/‎backend/agent/prompt.py
Lines changed: 66 additions & 0 deletions b/‎backend/agent/prompt.py
Lines changed: 66 additions & 0 deletions
@@ -29,4 +29,6 @@ TOKEN_PROCESSING_CHARACTER_LIMIT=30000 # Approx useful for 64k context window, a
 # Maximum number of retries when the code input is too large. If the input is still beyond the limit after this many retries, processing stops, to prevent a huge input-token bill
 TOKEN_PROCESSING_MAX_RETRIES=3
 # Number of characters to drop on each retry. In other words, each retry processes TOKEN_PROCESSING_CHARACTER_LIMIT - TOKEN_PROCESSING_REDUCE_CHAR_PER_RETRY * retries characters
-TOKEN_PROCESSING_REDUCE_CHAR_PER_RETRY=3000 # Approx useful for 64k context window
+TOKEN_PROCESSING_REDUCE_CHAR_PER_RETRY=3000 # Approx useful for 64k context window
+
+NEXT_PUBLIC_API_ENDPOINT=
@@ -19,8 +19,15 @@
 - PostgreSQL (For storing the summarized repository information)
 - Github API Key (To get more quota requesting the repository data)
 - Amazon S3 (You can ignore the parameters if you are going to use it locally. You need to use certificate for your Database if you are going to host it.)
+- Docker (If you are hosting locally)
 
-### Configuration
+### Configuration (Local)
+
+1. Copy `.env.example` to `.env`
+2. Configure all the variables given in `.env`
+3. Run `docker compose up`, or `docker compose up -d` to run it in the background without attaching to the output
+
+### Configuration (Cloud)
 
 1. Create PostgreSQL instance
 2. Copy `.env.example` to `.env`
@@ -30,6 +37,7 @@
 6. Build the server (`npm run build`)
 7. Run (`npm start`)
 
+
 #### Ollama Configuration Guide
 
 - It's recommended if you can run an LLM larger than 14B parameters.
 
@@ -0,0 +1,22 @@
+# Backend image based on the slim Python 3.12 image.
+FROM python:3.12-slim
+
+# Do not write .pyc files, and keep stdout/stderr unbuffered.
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1
+
+WORKDIR /app
+
+# Install a compiler toolchain and the PostgreSQL client headers
+# (presumably needed to build a Python database driver - confirm against
+# requirements.txt), then remove the apt package lists.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    libpq-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# requirements.txt is copied on its own, before the rest of the source.
+COPY requirements.txt .
+
+RUN pip install --upgrade pip
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY . .
+
+# Port that uvicorn listens on (see CMD below).
+EXPOSE 8080
+
+# Run db/scripts/init_db.py first; uvicorn is started only if that script
+# exits successfully (&&).
+CMD ["bash", "-c", "python db/scripts/init_db.py && uvicorn main:app --host 0.0.0.0 --port 8080"]
@@ -0,0 +1,125 @@
+from enum import Enum
+from typing import Optional, List
+
+from langchain_core.documents import Document
+from langchain_text_splitters import RecursiveCharacterTextSplitter, Language
+
+from loguru import logger
+
+# Extension -> Language lookup table, built once at import time instead of on
+# every call.
+_EXTENSION_TO_LANGUAGE = {
+    'py': Language.PYTHON,
+    'js': Language.JS,
+    'jsx': Language.JS,
+    'ts': Language.TS,
+    'tsx': Language.TS,
+    'mjs': Language.JS,
+    'cjs': Language.JS,
+    'go': Language.GO,
+    'rb': Language.RUBY,
+    'rs': Language.RUST,
+    'php': Language.PHP,
+    'cpp': Language.CPP,
+    'cc': Language.CPP,
+    'c': Language.C,
+    'cxx': Language.CPP,
+    'hpp': Language.CPP,
+    'hxx': Language.CPP,
+    'h': Language.C,
+    'java': Language.JAVA,
+    'kt': Language.KOTLIN,
+    'cs': Language.CSHARP,
+    'scala': Language.SCALA,
+    'swift': Language.SWIFT,
+    'lua': Language.LUA,
+    'pl': Language.PERL,
+    'hs': Language.HASKELL,
+    'lhs': Language.HASKELL,
+    'md': Language.MARKDOWN,
+}
+
+
+def get_language_from_extension(extension: str) -> Optional[Language]:
+    """
+    Retrieves the programming language associated with a given file extension.
+
+    Matching is case-insensitive. A leading dot and surrounding whitespace are
+    tolerated, so 'py', 'PY' and '.py' all resolve to the same language.
+
+    :param extension: The file extension, with or without the leading dot (e.g., 'js', '.py').
+    :return: The corresponding Language enum, or None if the extension is empty or not supported.
+    """
+    if not extension:
+        return None
+    normalized = extension.strip().lstrip('.').lower()
+    return _EXTENSION_TO_LANGUAGE.get(normalized)
+
+
+class LineNumberTextSplitter(RecursiveCharacterTextSplitter):
+    """
+    A custom text splitter that annotates each chunk with the line range it
+    occupies in the text it was split from.
+
+    Line numbers are 1-based and are derived from the position at which the
+    chunk is found in the source text. Adding up the line counts of the chunks
+    is not reliable: consecutive chunks can overlap (chunk_overlap), and
+    whitespace between chunks may be trimmed by the splitter, so a running
+    total drifts away from the real line numbers.
+    """
+
+    def create_documents(self, texts: List[str], **kwargs) -> List[Document]:
+        """
+        Splits every text and wraps each chunk in a Document carrying its line range.
+
+        :param texts: The texts to split. Line numbering restarts at 1 for each text.
+        :param kwargs: Accepted for signature compatibility; not used.
+        :return: Documents whose metadata['loc']['lines'] holds the inclusive
+            'from' and 'to' line numbers of the chunk.
+        """
+        documents = []
+
+        for text in texts:
+            search_from = 0  # Offset from which the next chunk is searched
+            next_line = 1  # Fallback numbering if a chunk cannot be located
+
+            for chunk in self.split_text(text):
+                offset = text.find(chunk, search_from)
+                if offset == -1:
+                    # Not expected for chunks cut out of the text, but keep the
+                    # numbering monotonic instead of failing.
+                    first_line = next_line
+                else:
+                    first_line = text.count('\n', 0, offset) + 1
+                    # The next chunk may begin inside this one by up to
+                    # chunk_overlap characters, so back up by that much.
+                    search_from = max(offset, offset + len(chunk) - self._chunk_overlap)
+                last_line = first_line + chunk.count('\n')
+                next_line = last_line + 1
+
+                documents.append(
+                    Document(
+                        page_content=chunk,
+                        metadata={
+                            'loc': {
+                                'lines': {
+                                    'from': first_line,
+                                    'to': last_line
+                                }
+                            }
+                        }
+                    )
+                )
+
+        return documents
+
+class CodeSplitter:
+    """
+    Splits source code into language-aware chunks, each labelled with its line range.
+    """
+
+    def __init__(self, chunk_size: int, chunk_overlap: int):
+        """
+        Stores the chunking parameters used by split_code.
+
+        :param chunk_size: The size of each chunk.
+        :param chunk_overlap: The number of overlapping characters between chunks.
+        """
+        self.chunk_overlap = chunk_overlap
+        self.chunk_size = chunk_size
+
+    def split_code(self, file_extension: str, code: str) -> Optional[str]:
+        """
+        Chunks the code with the separators of the language matching the extension.
+
+        :param file_extension: The file extension indicating the programming language.
+        :param code: The code content to be split.
+        :return: All chunks concatenated, each preceded by a '# Lines X - Y' header,
+            or None if the language is not supported or splitting raised an error.
+        """
+        language = get_language_from_extension(file_extension)
+        if not language:
+            logger.warning(f"Unsupported language for extension: {file_extension}")
+            return None
+
+        splitter = LineNumberTextSplitter(
+            separators=RecursiveCharacterTextSplitter.get_separators_for_language(language.value),
+            chunk_size=self.chunk_size,
+            chunk_overlap=self.chunk_overlap,
+            length_function=len
+        )
+
+        try:
+            docs = splitter.create_documents([code])
+        except Exception as e:
+            logger.critical(f"Error during splitting: {e}")
+            return None
+
+        sections = []
+        for doc in docs:
+            line_range = doc.metadata.get('loc', {}).get('lines', {})
+            start = line_range.get('from', 'unknown')
+            end = line_range.get('to', 'unknown')
+            sections.append(f'# Lines {start} - {end}\n{doc.page_content}\n\n')
+        return ''.join(sections)
@@ -0,0 +1,89 @@
+from typing import Optional, List
+
+from agent.prompt import CodePrompt, FolderPrompt
+from agent.schema_parser import SchemaParser
+from agent.schema_factory import FileSchema, FolderSchema
+from agent.prompt_generator import PromptGenerator, FilePromptTemplateVariables, PromptTemplateConfig, PromptType, \
+    RepoInfo, \
+    FolderPromptTemplateVariables
+from agent.code_splitter import CodeSplitter
+from llm.llm_provider import LLMProvider
+
+
+# Base Processor
+class BaseProcessor:
+    """
+    Common base for processors that send a prompt to the LLM and parse the reply.
+
+    schema_parser and prompt_generator start as None; the subclasses in this
+    module assign both in their constructors.
+    """
+
+    def __init__(self, llm: LLMProvider):
+        """
+        :param llm: The LLM provider used to run prompts.
+        """
+        self.llm = llm
+        self.prompt_generator: Optional[PromptGenerator] = None
+        self.schema_parser: Optional[SchemaParser] = None
+
+    async def process(self, prompt: str) -> dict:
+        """
+        Runs the prompt through the LLM and parses the response with schema_parser.
+
+        :param prompt: The fully rendered prompt text.
+        :return: Whatever schema_parser.parse returns for the LLM response.
+        """
+        llm_output = await self.llm.run(prompt)
+        parsed = self.schema_parser.parse(llm_output)
+        return parsed
+
+
+# Code Processor
+class CodeProcessor(BaseProcessor):
+    """
+    Summarizes a single source file: chunks it with line-range headers, renders
+    the file prompt and returns the parsed LLM answer.
+    """
+
+    def __init__(self, llm: LLMProvider):
+        """
+        Sets up the code splitter, the FileSchema parser and the file prompt template.
+
+        :param llm: The LLM provider used to run prompts.
+        """
+        super().__init__(llm)
+        self.code_splitter = CodeSplitter(200, 25)
+        self.schema_parser = SchemaParser(FileSchema)
+        self.prompt_generator = PromptGenerator(
+            PromptTemplateConfig(
+                template=(
+                    'The following instruction is given:\n{requirements}\n{format_instructions}\n'
+                    'The given repository owner is {repo_owner} with repository name of {repo_name}\n'
+                    'The commit SHA referenced is {commit_sha}\n'
+                    'The path of the file is {path}\n'
+                    'Below is the code for your task: {code}'
+                )
+            ),
+            PromptType.FILE
+        )
+
+    async def generate(self, code: str, repo_info: dict[str, str]) -> dict:
+        """
+        Builds the file prompt for the given code and returns the parsed LLM answer.
+
+        :param code: The raw content of the file.
+        :param repo_info: Looked up for 'repo_owner', 'repo_name', 'commit_sha' and 'path'.
+        :return: The result of process() for the generated prompt.
+        """
+        path = repo_info.get('path') or ''
+        # Take the extension from the file name only, so that a dot in a
+        # directory name (e.g. 'pkg.v2/Makefile') is not mistaken for one, and
+        # a missing path does not raise.
+        file_name = path.rsplit('/', 1)[-1]
+        _, dot, suffix = file_name.rpartition('.')
+        extension = suffix if dot else ''
+
+        splitted_code = self.code_splitter.split_code(extension, code)
+        if splitted_code is None:
+            # split_code returns None for unsupported extensions and for
+            # splitter errors; send the raw code rather than the text 'None'.
+            splitted_code = code
+
+        variables = FilePromptTemplateVariables(
+            requirements=CodePrompt,
+            format_instructions=self.schema_parser.format_instructions,
+            code=splitted_code,
+            repo_name=repo_info.get('repo_name'),
+            repo_owner=repo_info.get('repo_owner'),
+            commit_sha=repo_info.get('commit_sha'),
+            path=repo_info.get('path'),
+        )
+        prompt = await self.prompt_generator.generate(variables, code=variables.code)
+        return await self.process(prompt)
+
+
+# Folder Processor
+class FolderProcessor(BaseProcessor):
+    """
+    Produces a folder-level summary from the summaries of the folder's contents.
+    """
+
+    def __init__(self, llm: LLMProvider):
+        """
+        Sets up the FolderSchema parser and the folder prompt template.
+
+        :param llm: The LLM provider used to run prompts.
+        """
+        super().__init__(llm)
+        self.schema_parser = SchemaParser(FolderSchema)
+        folder_template = (
+            'The following instruction is given:\n{requirements}\n{format_instructions}\n'
+            'The given repository owner is {repo_owner} with repository name of {repo_name}\n'
+            'The commit SHA referenced is {commit_sha}\n'
+            'The path of the folder is {path}\n'
+            'Below are the summaries for the codebase:\n{ai_summaries}'
+        )
+        self.prompt_generator = PromptGenerator(
+            PromptTemplateConfig(template=folder_template),
+            PromptType.FOLDER
+        )
+
+    async def generate(self, ai_summaries: List[str], repo_info: dict[str, str]) -> dict:
+        """
+        Builds the folder prompt from the given summaries and returns the parsed LLM answer.
+
+        :param ai_summaries: Summaries to include; joined with newlines for the template.
+        :param repo_info: Looked up for 'repo_owner', 'repo_name', 'commit_sha' and 'path'.
+        :return: The result of process() for the generated prompt.
+        """
+        format_instructions = self.schema_parser.format_instructions
+        template_values = FolderPromptTemplateVariables(
+            requirements=FolderPrompt,
+            format_instructions=format_instructions,
+            ai_summaries='\n'.join(ai_summaries),
+            repo_name=repo_info.get('repo_name'),
+            repo_owner=repo_info.get('repo_owner'),
+            commit_sha=repo_info.get('commit_sha'),
+            path=repo_info.get('path'),
+        )
+        # NOTE(review): the raw list is passed here while the template variables
+        # hold the newline-joined string (CodeProcessor passes variables.code) -
+        # confirm which form PromptGenerator.generate expects.
+        prompt = await self.prompt_generator.generate(template_values, ai_summaries=ai_summaries)
+        return await self.process(prompt)
@@ -0,0 +1,66 @@
+# Instruction text for summarizing a single code file. agent/index.py passes it
+# as the `requirements` variable of the file prompt template.
+CodePrompt: str = """
+You are an expert software engineer and your task is to deeply analyze a provided codebase from a GitHub repository. Your goal is to generate a comprehensive and structured summary of the codebase that is suitable for a developer-friendly wiki page in markdown format but without backticks.
+
+**Input:**
+
+You will receive the following information, extracted from a GitHub repository:
+
+1. **Repository Description:**
+*   'description': (A textual description of the repository, although it may not be available or be correct)
+2. **Code File:**
+* The raw content of code files within the repository.
+* The owner of the repository.
+* The repository name.
+* The commit sha of the repository.
+* The path to the code file within the repository.
+
+**Analysis Tasks:**
+
+1. **High-Level Overview:**
+*   Provide a concise summary of the file responsibilities and functionalities based on it\'s content.
+*   Explain its role in the overall system.
+*   Identify its dependencies on other modules/components.
+*   Highlight any important classes, functions, or data structures.
+*   Link all the code blocks (Class,Function,Enum,Exception) that are referenced using the following markdown link format: [`Description of Code Block`](Full github url of the file including the start line with optional ending line#L{startLine}-L{endLine}). This is in the form of "https://github.com/{owner}/{repo}/blob/{commitSha}/{path}#L{lineStart}-L{lineEnd}".
+2. **Code-Level Insights:**
+*   Analyze the code files to understand the implementation details.
+*   Identify core algorithms, data structures, and design patterns used.
+*   Provide a summary of how data flows between different parts of the system.
+3. **Dependencies and Relationships:**
+*   Clearly document the relationships between different modules, classes, and functions.
+*   Explain how different parts of the codebase interact with each other.
+
+**Output:**
+"""
+
+# Instruction text for summarizing a folder from the summaries of its contents.
+# agent/index.py passes it as the `requirements` variable of the folder prompt
+# template.
+FolderPrompt: str = """
+You are an expert software engineer and your task is to deeply analyze a provided codebase from a GitHub repository. Your goal is to generate a comprehensive and structured summary of the codebase that is suitable for a developer-friendly wiki page in markdown format but without backticks.
+
+**Input:**
+
+You will receive the following information, summarized from the expert software engineer:
+
+1. **Repository Description:**
+*   'description': (A textual description of the repository, although it may not be available or be correct)
+2. **Code Files:**
+* The summary of code files within the repository.
+* The owner of the repository.
+* The repository name.
+* The commit sha of the repository.
+* The path to the code file within the repository.
+
+**Analysis Tasks:**
+
+1. **High-Level Overview:**
+*   Start by providing the core functionality among the folders or files. (ex. the folder name \"core\", \"src\" or folder with the same repository name usually contains the core functionality of the system. You can ignore utility folders unless they contain important information or there are nothing to explain.)
+*   Provide a concise summary of the folder's responsibilities and functionalities based on it\'s sub-files and sub-folders summaries.
+*   Explain its role in the overall system.
+*   Identify its dependencies on other modules/components/folder.
+*   Highlight any important classes, functions, or data structures in it's sub-files and sub-folders.
+*   Link all the code blocks that are referenced using the following markdown link format: [`Description of Code Block`](Full github url of the file including the start line with optional ending line#L{startLine}-L{endLine}). This is in the form of "https://github.com/{owner}/{repo}/blob/{commitSha}/{path}#L{lineStart}-L{lineEnd}".
+2. **Dependencies and Relationships:**
+*   Clearly document the relationships between different folders and files.
+*   Explain how different parts of the codebase interact with each other.
+
+**Output:**
+"""