Files
LLM_Engineering_OLD/week5/community-contributions/solisoma/intelligentRag.ipynb
2025-10-24 10:30:20 +01:00

510 lines
23 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "code",
"execution_count": 51,
"id": "3d526a96",
"metadata": {},
"outputs": [],
"source": [
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.agents import initialize_agent, AgentType\n",
"from langchain.tools import tool\n",
"from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler\n",
"from langchain.embeddings import OpenAIEmbeddings\n",
"from langchain.vectorstores import FAISS\n",
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
"from langchain.memory import ConversationBufferMemory\n",
"from langchain.document_loaders import DirectoryLoader, TextLoader\n",
"from langchain.chains import RetrievalQA\n",
"from pathlib import Path\n",
"import gradio as gr\n",
"from langchain.tools import Tool\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "3d14dedb",
"metadata": {},
"outputs": [],
"source": [
"class ProjectRootDetector:\n",
"    \"\"\"Locate the root directory of the project containing the working directory.\n",
"\n",
"    A directory counts as a project root when it contains at least one of the\n",
"    marker files or directories listed in ``ROOT_INDICATORS``.\n",
"    \"\"\"\n",
"\n",
"    # Marker names whose presence identifies a project root.\n",
"    ROOT_INDICATORS = (\n",
"        '.git',\n",
"        'package.json',\n",
"        'requirements.txt',\n",
"        'pyproject.toml',\n",
"        'Cargo.toml',\n",
"        'go.mod',\n",
"        'pom.xml',\n",
"        'build.gradle',\n",
"        'Dockerfile',\n",
"        'docker-compose.yml',\n",
"        'README.md',\n",
"        'setup.py',\n",
"        'CMakeLists.txt',\n",
"    )\n",
"\n",
"    def __init__(self):\n",
"        # Per-instance list so callers can still customise ``root_indicators``.\n",
"        self.root_indicators = list(self.ROOT_INDICATORS)\n",
"\n",
"    def find_root(self, use_file_path_as_root=False, root_path=None):\n",
"        \"\"\"Return the project root directory.\n",
"\n",
"        Args:\n",
"            use_file_path_as_root: When true, skip detection and use the\n",
"                current working directory as the root.\n",
"            root_path: Explicit root. When given it is returned unchanged and\n",
"                takes precedence over every other option.\n",
"\n",
"        Returns:\n",
"            ``root_path`` if provided. Otherwise a string path: the nearest\n",
"            directory, starting at the current working directory and walking\n",
"            upwards, that contains a root indicator, or the working directory\n",
"            itself when no indicator is found.\n",
"        \"\"\"\n",
"        if root_path:\n",
"            return root_path\n",
"\n",
"        start = Path.cwd()\n",
"        if use_file_path_as_root:\n",
"            return str(start)\n",
"\n",
"        # ``parents`` ends with the top-most directory, so that directory is\n",
"        # checked as well (the previous loop stopped one level short of it).\n",
"        for candidate in (start, *start.parents):\n",
"            if any((candidate / name).exists() for name in self.root_indicators):\n",
"                return str(candidate)\n",
"\n",
"        return str(start)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fa781fd3",
"metadata": {},
"outputs": [],
"source": [
"class RepoIntelligentAssistant:\n",
"    \"\"\"Conversational assistant that answers questions about a code repository.\n",
"\n",
"    Files under ``root_folder`` are embedded into a FAISS index and exposed to a\n",
"    LangChain agent through two tools: ``repo_context_search`` (retrieval QA)\n",
"    and ``get_file`` (raw file contents). The agent is built lazily on the\n",
"    first call to ``run``.\n",
"    \"\"\"\n",
"\n",
"    system_prompt = \"\"\"\n",
" You are an intelligent software engineer and codebase assistant with deep expertise in all programming domains (software engineering, data science, web development, etc.).\n",
"\n",
" BEHAVIOR GUIDELINES:\n",
"\n",
" 1. QUESTION EVALUATION:\n",
" - First determine if the question requires repository context or general coding knowledge\n",
" - For repo-specific questions: Use tools to fetch relevant code/files\n",
" - For general coding questions: Use your expertise to answer directly\n",
"\n",
" 2. CODE EXPLANATION:\n",
" - When explaining code: Break it down line-by-line\n",
" - Show the actual code and explain what each line does\n",
" - Focus on the specific implementation in the repository\n",
" - Explain the purpose and context within the codebase\n",
"\n",
" 3. CODE IMPROVEMENTS:\n",
" - When suggesting improvements: Analyze the repository structure first\n",
" - If suggesting new functions/classes: Write complete, working code\n",
" - Base suggestions on the existing codebase patterns and architecture\n",
" - Show the user the complete code implementation\n",
"\n",
" 4. CODE MODIFICATION:\n",
" - NEVER modify files directly\n",
" - Always show the user the updated code\n",
" - Let the user decide whether to implement changes\n",
" - Provide clear instructions on where/how to apply changes\n",
"\n",
" 5. RESPONSE SEQUENCE:\n",
" a) Evaluate question type (repo-specific vs general)\n",
" b) If repo-specific: Fetch relevant context using tools\n",
" c) Provide detailed explanation with code examples\n",
" d) If suggesting improvements: Show complete code implementation\n",
" e) Always explain the reasoning behind suggestions\n",
"\n",
" Your tone: Precise, technical, and helpful - like a senior developer mentoring a colleague.\n",
" \"\"\"\n",
"\n",
"    MODEL = \"gpt-4o-mini\"\n",
"\n",
"    # Per-folder (non-recursive) patterns for the files worth indexing. They must\n",
"    # stay non-recursive: create_doc already visits every folder individually,\n",
"    # so a '**/' pattern would index nested files several times and re-enter\n",
"    # the folders that the folder walk deliberately skips.\n",
"    INDEXED_GLOBS = [\n",
"        \"*.py\", \"*.ipynb\", \"*.md\", \"*.txt\", \"*.json\", \"*.csv\",\n",
"        \"*.html\", \"*.css\", \"*.js\", \"*.ts\", \"*.java\", \"*.c\", \"*.cpp\",\n",
"        \"*.go\", \"*.rs\", \"*.rust\", \"*.scala\", \"*.swift\", \"*.kt\", \"*.kotlin\",\n",
"    ]\n",
"\n",
"    def __init__(self, use_file_path_as_root = False, root_path=None):\n",
"        \"\"\"Create the chat model and resolve the repository root.\n",
"\n",
"        Args:\n",
"            use_file_path_as_root: Forwarded to ``ProjectRootDetector.find_root``.\n",
"            root_path: Explicit repository root, forwarded to\n",
"                ``ProjectRootDetector.find_root``.\n",
"        \"\"\"\n",
"        self.llm = ChatOpenAI(\n",
"            temperature=0.7,\n",
"            callbacks=[StreamingStdOutCallbackHandler()],\n",
"            model_name=self.MODEL,\n",
"        )\n",
"        self.root_folder = ProjectRootDetector().find_root(\n",
"            root_path=root_path,\n",
"            use_file_path_as_root=use_file_path_as_root,\n",
"        )\n",
"        # Built on first use by run(); indexing the repository is expensive.\n",
"        self.agent = None\n",
"\n",
"    def get_file(self, file_name: str):\n",
"        \"\"\"Return the contents of one repository file, looked up by name or path.\n",
"\n",
"        Matching is case-insensitive. A query containing a path separator is\n",
"        matched against the end of each file's path, a query containing a dot\n",
"        must equal the file name, and any other query must equal the part of\n",
"        the file name before its first dot.\n",
"\n",
"        Args:\n",
"            file_name: File name, base name, or (partial) path. Surrounding\n",
"                whitespace and quote characters are ignored.\n",
"\n",
"        Returns:\n",
"            The file contents prefixed with a header line, or a human-readable\n",
"            message when the file is missing, ambiguous, or unreadable.\n",
"        \"\"\"\n",
"        # Agents often wrap tool input in quotes/backticks or append a newline.\n",
"        query = file_name.strip().strip(\"'`\" + '\"').strip()\n",
"\n",
"        # Accept both separator styles so a path copied from an earlier\n",
"        # 'multiple files' listing can be used to disambiguate.\n",
"        path_query = query.replace(\"\\\\\", \"/\").lower()\n",
"        while path_query.startswith(\"./\"):\n",
"            path_query = path_query[2:]\n",
"        if not path_query:\n",
"            return \"No file name was provided.\"\n",
"\n",
"        wants_path = \"/\" in path_query\n",
"        name_query = path_query.rsplit(\"/\", 1)[-1]\n",
"        has_extension = \".\" in name_query\n",
"\n",
"        # One walk collects exact matches and fuzzy fallbacks together.\n",
"        matches = []\n",
"        fuzzy_matches = []\n",
"        for root, _dirs, files in os.walk(self.root_folder):\n",
"            for f in files:\n",
"                full_path = os.path.join(root, f)\n",
"                f_lower = f.lower()\n",
"                if wants_path:\n",
"                    candidate = full_path.replace(\"\\\\\", \"/\").lower()\n",
"                    is_match = candidate == path_query or candidate.endswith(\n",
"                        \"/\" + path_query.lstrip(\"/\")\n",
"                    )\n",
"                elif has_extension:\n",
"                    is_match = f_lower == name_query\n",
"                else:\n",
"                    is_match = f_lower.split(\".\")[0] == name_query\n",
"\n",
"                if is_match:\n",
"                    matches.append(full_path)\n",
"                elif name_query and name_query in f_lower:\n",
"                    fuzzy_matches.append(full_path)\n",
"\n",
"        if not matches:\n",
"            if fuzzy_matches:\n",
"                listed = \"\\n\".join(fuzzy_matches[:10])  # Limit to 10 results\n",
"                return (\n",
"                    f\"No exact match for '{query}' found. \"\n",
"                    f\"Did you mean one of these?\\n\\n{listed}\"\n",
"                )\n",
"            return f\"No file named '{query}' found in the repository.\"\n",
"\n",
"        if len(matches) > 1:\n",
"            listed = \"\\n\".join(matches)\n",
"            return (\n",
"                f\"There are multiple files named '{query}'. \"\n",
"                f\"Please specify the full path.\\n\\n{listed}\"\n",
"            )\n",
"\n",
"        file_path = matches[0]\n",
"        try:\n",
"            with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
"                content = file.read()\n",
"        except (OSError, UnicodeDecodeError) as e:\n",
"            return f\"Error reading file '{file_path}': {e}\"\n",
"        return f\"Content of '{query}' (from {file_path}):\\n\\n{content}\"\n",
"\n",
"    def _get_all_folders_recursive(self, root_path):\n",
"        \"\"\"Return every folder under ``root_path`` that should be indexed.\n",
"\n",
"        Folders named in ``skip_folders`` are pruned together with their\n",
"        subtrees, and a folder holding any file with a restricted extension\n",
"        is left out (its subfolders are still visited).\n",
"        \"\"\"\n",
"        restricted_extensions = {\n",
"            '.exe', '.dll', '.so', '.dylib', '.bin', '.app', '.deb', '.rpm',\n",
"            '.zip', '.tar', '.gz', '.rar', '.7z', '.bz2', '.xz',\n",
"            '.pyc', '.pyo', '.pyd', '.class', '.jar', '.war',\n",
"            '.db', '.sqlite', '.sqlite3', '.mdb', '.accdb'\n",
"        }\n",
"\n",
"        skip_folders = {\n",
"            '.git', '.svn', '.hg', '.bzr', 'node_modules', 'bower_components', 'vendor', '.venv',\n",
"            '__pycache__', '.pytest_cache', '.coverage', 'venv', 'env', '.env',\n",
"            'dist', 'build', 'target', 'out', 'bin', 'obj', '.vscode', '.idea', '.vs', '.eclipse',\n",
"            '.DS_Store', 'Thumbs.db', '.Trash', 'temp', 'tmp', 'cache', 'logs', '.cache', '.tmp',\n",
"            'packages', 'lib', 'libs', 'dependencies', 'generated', 'auto-generated', '.generated',\n",
"            'backup', 'backups', '.backup', 'coverage', '.nyc_output', 'htmlcov','docs/_build', 'site', '_site'\n",
"        }\n",
"\n",
"        all_folders = []\n",
"        for root, dirs, files in os.walk(root_path):\n",
"            # Prune in place so os.walk never descends into skipped folders.\n",
"            dirs[:] = [d for d in dirs if d not in skip_folders]\n",
"\n",
"            # Skipped children are pruned above, so this only fires for the\n",
"            # starting folder itself.\n",
"            if os.path.basename(root) in skip_folders:\n",
"                continue\n",
"\n",
"            # NOTE(review): one binary/archive file excludes the whole folder,\n",
"            # source files included - confirm this is intended.\n",
"            if any(os.path.splitext(name)[1].lower() in restricted_extensions for name in files):\n",
"                continue\n",
"\n",
"            all_folders.append(root)\n",
"\n",
"        return all_folders\n",
"\n",
"    def create_doc(self):\n",
"        \"\"\"Load the indexable files of every eligible folder as documents.\n",
"\n",
"        Each document's metadata gets a ``doc_type`` entry holding the name of\n",
"        the folder it was loaded from.\n",
"        \"\"\"\n",
"        documents = []\n",
"        for folder in self._get_all_folders_recursive(self.root_folder):\n",
"            doc_type = os.path.basename(folder)\n",
"            print(doc_type)\n",
"            loader = DirectoryLoader(\n",
"                folder,\n",
"                glob=self.INDEXED_GLOBS,\n",
"                loader_cls=TextLoader,\n",
"                loader_kwargs={\"encoding\": \"utf-8\"},\n",
"            )\n",
"            for doc in loader.load():\n",
"                doc.metadata[\"doc_type\"] = doc_type\n",
"                documents.append(doc)\n",
"        return documents\n",
"\n",
"    def create_retriever(self):\n",
"        \"\"\"Chunk and embed the repository, returning a FAISS-backed retriever.\n",
"\n",
"        Raises:\n",
"            ValueError: If no indexable content was found under the root.\n",
"        \"\"\"\n",
"        documents = self.create_doc()\n",
"        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n",
"        chunks = text_splitter.split_documents(documents)\n",
"        if not chunks:\n",
"            # FAISS fails with an opaque error on an empty corpus.\n",
"            raise ValueError(f\"No indexable files found under '{self.root_folder}'.\")\n",
"\n",
"        vectorstore = FAISS.from_documents(chunks, OpenAIEmbeddings())\n",
"\n",
"        # The number of results must be passed through search_kwargs.\n",
"        return vectorstore.as_retriever(search_kwargs={\"k\": 30})\n",
"\n",
"    def create_agent(self):\n",
"        \"\"\"Build the conversational agent with its tools and chat memory.\"\"\"\n",
"        retriever = self.create_retriever()\n",
"        rag_chain = RetrievalQA.from_chain_type(\n",
"            llm=self.llm,\n",
"            retriever=retriever,\n",
"            chain_type=\"stuff\"\n",
"        )\n",
"\n",
"        rag_tool = Tool(\n",
"            name=\"repo_context_search\",\n",
"            func=rag_chain.run,\n",
"            description=\"Search the repository for specific code, functions, classes, or implementation details. Use this when the user asks about something specific in the codebase like 'what does this function do', 'explain this code', or 'how is X implemented'.\"\n",
"        )\n",
"\n",
"        @tool(\"get_file\", return_direct=False)\n",
"        def get_file(x):\n",
"            \"\"\"\n",
"            Fetch the content of a specific file by name to analyze its code implementation.\n",
"            Use when user asks to 'explain this file', 'show me the code for X', or 'what's in this file'.\n",
"            If multiple files share the same name, list their paths instead.\n",
"            \"\"\"\n",
"            return self.get_file(x)\n",
"\n",
"        tools = [rag_tool, get_file]\n",
"\n",
"        # The conversational ReAct prompt is a plain string template, so the\n",
"        # history is kept as text. output_key is pinned because the streamed\n",
"        # result carries more than one key.\n",
"        memory = ConversationBufferMemory(\n",
"            memory_key=\"chat_history\",\n",
"            input_key=\"input\",\n",
"            output_key=\"output\",\n",
"        )\n",
"\n",
"        # The memory object has no system-message field; the behaviour guidelines\n",
"        # reach the model as the agent prompt prefix instead.\n",
"        prefix = (\n",
"            self.system_prompt.strip()\n",
"            + \"\\n\\nTOOLS:\\n------\\n\\nYou have access to the following tools:\"\n",
"        )\n",
"\n",
"        # initialize_agent selects the agent through 'agent'; an unknown keyword\n",
"        # silently falls back to the memory-less zero-shot agent.\n",
"        agent = initialize_agent(\n",
"            tools,\n",
"            self.llm,\n",
"            agent=AgentType.CONVERSATIONAL_REACT_DESCRIPTION,\n",
"            agent_kwargs={\"prefix\": prefix},\n",
"            handle_parsing_errors=True,\n",
"            memory=memory,\n",
"            verbose=True,\n",
"        )\n",
"\n",
"        return agent\n",
"\n",
"    def run(self, question, history=None):\n",
"        \"\"\"Stream the agent's answer to ``question``.\n",
"\n",
"        Args:\n",
"            question: The user's message.\n",
"            history: Unused; accepted so the method fits a chat callback that\n",
"                also passes the chat history.\n",
"\n",
"        Yields:\n",
"            Each non-empty ``output`` value produced by the agent stream, or a\n",
"            single ``Error: ...`` string if anything fails.\n",
"        \"\"\"\n",
"        try:\n",
"            # Create agent only once\n",
"            if self.agent is None:\n",
"                self.agent = self.create_agent()\n",
"\n",
"            for chunk in self.agent.stream(question):\n",
"                output = chunk.get(\"output\")\n",
"                if output:\n",
"                    yield output\n",
"        except Exception as e:\n",
"            # UI boundary: surface the failure in the chat instead of crashing it.\n",
"            yield f\"Error: {str(e)}\""
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "075a4612",
"metadata": {},
"outputs": [],
"source": [
"# Index the sibling ``tourist-guide`` project instead of auto-detecting a root.\n",
"root = os.path.abspath(os.path.join(os.pardir, \"tourist-guide\"))\n",
"G = RepoIntelligentAssistant(root_path=root)"
]
},
{
"cell_type": "code",
"execution_count": 55,
"id": "8c48444a",
"metadata": {},
"outputs": [],
"source": [
"def chatInterface(fn, chat_type=\"messages\"):\n",
"    \"\"\"Build and launch the Gradio UI for the codebase assistant.\n",
"\n",
"    Args:\n",
"        fn: Chat callback handed to ``gr.ChatInterface``.\n",
"        chat_type: Message format passed to ``gr.ChatInterface(type=...)``.\n",
"\n",
"    Returns:\n",
"        The launched ``gr.Blocks`` app, so the caller keeps a usable handle\n",
"        instead of ``None``.\n",
"    \"\"\"\n",
"    with gr.Blocks(\n",
"        title=\"🤖 Intelligent Codebase Assistant\",\n",
"        theme=gr.themes.Soft(),\n",
"        css=\"\"\"\n",
" .gradio-container {\n",
" max-width: 1200px !important;\n",
" margin: auto !important;\n",
" }\n",
" .chat-message {\n",
" font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;\n",
" }\n",
" \"\"\"\n",
"    ) as ui:\n",
"\n",
"        # Header Section\n",
"        with gr.Row():\n",
"            gr.Markdown(\"\"\"\n",
" # 🤖 Intelligent Codebase Assistant\n",
" \n",
" **Your AI-powered coding companion for understanding and improving codebases**\n",
" \n",
" ---\n",
" \"\"\")\n",
"\n",
"        # Main Chat Interface\n",
"        with gr.Row(height=\"77dvh\", equal_height=True):\n",
"            with gr.Column(scale=4):\n",
"                gr.ChatInterface(\n",
"                    fn,\n",
"                    type=chat_type,\n",
"                    title=\"Chat with your codebase\",\n",
"                    description=\"Ask questions about your code, request explanations, or get improvement suggestions\",\n",
"                    examples=[\n",
"                        \"What is the codebase about?\",\n",
"                        \"Explain the key file in the codebase\",\n",
"                        \"Explain the architecture of the codebase\",\n",
"                        \"Suggest improvements for the codebase\",\n",
"                        \"What design patterns are used?\"\n",
"                    ],\n",
"                    cache_examples=False,\n",
"                    fill_height=True\n",
"                )\n",
"\n",
"            # Sidebar with info\n",
"            with gr.Column(scale=1):\n",
"                gr.Markdown(\"\"\"\n",
" ## 💡 Tips\n",
" \n",
" **Ask about:**\n",
" - Specific functions or classes\n",
" - Code explanations\n",
" - Architecture questions\n",
" - Improvement suggestions\n",
" - Design patterns\n",
" \n",
" **Examples:**\n",
" - \"What does the login function do?\"\n",
" - \"Explain this code line by line\"\n",
" - \"How can I improve this function?\"\n",
" - \"What's the overall architecture?\"\n",
" \"\"\")\n",
"\n",
"                gr.Markdown(\"\"\"\n",
" ## 🛠️ Capabilities\n",
" \n",
" ✅ **Code Analysis** \n",
" ✅ **Line-by-line Explanations** \n",
" ✅ **Improvement Suggestions** \n",
" ✅ **Multi-language Support** \n",
" ✅ **Memory Persistence** \n",
" ✅ **Repository Context** \n",
" \"\"\")\n",
"\n",
"    ui.launch(inbrowser=True)\n",
"    # Hand the app back: the caller assigns this function's result.\n",
"    return ui"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "4f16318f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"* Running on local URL: http://127.0.0.1:7877\n",
"* To create a public link, set `share=True` in `launch()`.\n"
]
},
{
"data": {
"text/html": [
"<div><iframe src=\"http://127.0.0.1:7877/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"tourist-guide\n",
"\n",
"\n",
"\u001b[1m> Entering new None chain...\u001b[0m\n",
"\u001b[32;1m\u001b[1;3mTo explain the \"tourist-assistant\" file line by line, I need to fetch its content first. I'll look for the file to understand its structure and code implementation.\n",
"\n",
"Action: get_file \n",
"Action Input: tourist-assistant \n",
"\u001b[0m\n",
"Observation: \u001b[33;1m\u001b[1;3mNo file named 'tourist-assistant \n",
"' found in the repository.\u001b[0m\n",
"Thought:\u001b[32;1m\u001b[1;3mIt seems that there isn't a file named \"tourist-assistant\" in the repository. I should check if there are any similar files or if the name might be slightly different. \n",
"\n",
"Action: repo_context_search \n",
"Action Input: tourist-assistant \n",
"\u001b[0m\n",
"Observation: \u001b[36;1m\u001b[1;3mIt seems like you are referring to the Tourist Assistant application. How can I assist you with it?\u001b[0m\n",
"Thought:\u001b[32;1m\u001b[1;3mIt appears that \"tourist-assistant\" might refer to a broader application or a specific functionality rather than a direct file. I should search for related components or files that may contain implementations relevant to a tourist assistant feature.\n",
"\n",
"Action: repo_context_search \n",
"Action Input: Tourist Assistant \n",
"\u001b[0m\n",
"Observation: \u001b[36;1m\u001b[1;3mHello! How can I assist you today with your travel inquiries or questions about tourist destinations?\u001b[0m\n",
"Thought:\u001b[32;1m\u001b[1;3mIt seems that the repository contains a conversational interface for a Tourist Assistant application, but I haven't found specific code files or implementations associated with it yet. I should look for files related to this application that might contain relevant code.\n",
"\n",
"Action: repo_context_search \n",
"Action Input: tourist \n",
"\u001b[0m\n",
"Observation: \u001b[36;1m\u001b[1;3mI'm here to help you with information about tourist destinations! What specific question do you have or what location are you interested in?\u001b[0m\n",
"Thought:\u001b[32;1m\u001b[1;3mIt appears that the repository is structured around a conversational interface rather than a traditional file named \"tourist-assistant\". I need to continue searching for specific components related to tourism or tourist assistance features that might provide the code or functionality related to the user query.\n",
"\n",
"Action: repo_context_search \n",
"Action Input: assistant \n",
"\u001b[0m\n",
"Observation: \u001b[36;1m\u001b[1;3mI don't know.\u001b[0m\n",
"Thought:\u001b[32;1m\u001b[1;3mIt seems that I haven't been able to find a specific file or code implementation directly related to \"tourist-assistant\" or any closely related terms. The repository might be using a different structure or naming convention. I could either try searching for more specific terms related to what the Tourist Assistant does or ask for clarification on what specific functionality or aspect of the Tourist Assistant the user would like to know about.\n",
"\n",
"Action: repo_context_search \n",
"Action Input: travel assistant \n",
"\u001b[0m\n",
"Observation: \u001b[36;1m\u001b[1;3mHello! How can I assist you with your travel plans or questions about tourist destinations? Feel free to ask!\u001b[0m\n",
"Thought:\u001b[32;1m\u001b[1;3mI havent been able to locate a specific file or detailed code related to \"tourist-assistant\" or similar terms in the repository. It seems the repository is primarily focused on providing conversational responses to users rather than showing specific code implementations. To provide a more accurate answer, I might need to know more about what specific functionality or aspect of the Tourist Assistant you are interested in.\n",
"\n",
"Final Answer: I couldn't find a file named \"tourist-assistant\" or similar in the repository. The repository appears to focus on a conversational interface for a Tourist Assistant application. If you have specific aspects or functionalities you'd like to know about, please specify!\u001b[0m\n",
"\n",
"\u001b[1m> Finished chain.\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\hp\\projects\\gen-ai\\llm_engineering\\.venv\\Lib\\site-packages\\langchain\\chains\\base.py:487: UserWarning: 'ConversationBufferMemory' got multiple output keys: dict_keys(['output', 'messages']). The default 'output' key is being used. If this is not desired, please manually set 'output_key'.\n",
" self.memory.save_context(inputs, outputs)\n"
]
}
],
"source": [
"# Start the chat UI, streaming answers from the assistant's ``run`` generator.\n",
"interface = chatInterface(fn=G.run, chat_type=\"messages\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}