Files
LLM_Engineering_OLD/week5/community-contributions/solisoma/intelligentRag.ipynb
2025-10-24 09:13:17 +01:00

352 lines
14 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"id": "3d526a96",
"metadata": {},
"outputs": [],
"source": [
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.agents import initialize_agent, AgentType\n",
"from langchain.tools import tool\n",
"from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler\n",
"from langchain.embeddings import OpenAIEmbeddings\n",
"from langchain.vectorstores import FAISS\n",
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
"from langchain.memory import ConversationBufferMemory\n",
"from langchain.document_loaders import DirectoryLoader, TextLoader\n",
"from langchain.chains import RetrievalQA\n",
"from pathlib import Path\n",
"import gradio as gr\n",
"from langchain.tools import Tool\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3d14dedb",
"metadata": {},
"outputs": [],
"source": [
"class ProjectRootDetector:\n",
"    \"\"\"Locates the root directory of the project that contains the working directory.\"\"\"\n",
"\n",
"    def __init__(self):\n",
"        # A directory holding any one of these entries is treated as a project root.\n",
"        self.root_indicators = [\n",
"            '.git',\n",
"            'package.json',\n",
"            'requirements.txt',\n",
"            'pyproject.toml',\n",
"            'Cargo.toml',\n",
"            'go.mod',\n",
"            'pom.xml',\n",
"            'build.gradle',\n",
"            'Dockerfile',\n",
"            'docker-compose.yml',\n",
"            'README.md',\n",
"            'setup.py',\n",
"            'CMakeLists.txt'\n",
"        ]\n",
"\n",
"    def find_root(self, use_file_path_as_root=False, root_path=None):\n",
"        \"\"\"Return the project root selected by the arguments.\n",
"\n",
"        Args:\n",
"            use_file_path_as_root: When true, the current working directory is\n",
"                returned without searching for indicators.\n",
"            root_path: When truthy, returned unchanged and nothing else is checked.\n",
"\n",
"        Returns:\n",
"            ``root_path`` as given, or a string path: the closest directory at or\n",
"            above the working directory (the filesystem root itself is not\n",
"            inspected) that contains a root indicator, falling back to the\n",
"            working directory when none is found.\n",
"        \"\"\"\n",
"        if root_path:\n",
"            return root_path\n",
"\n",
"        start = Path.cwd()\n",
"        if use_file_path_as_root:\n",
"            return str(start)\n",
"\n",
"        for candidate in (start, *start.parents):\n",
"            # The filesystem root is its own parent; the search stops before it.\n",
"            if candidate == candidate.parent:\n",
"                break\n",
"            if any((candidate / marker).exists() for marker in self.root_indicators):\n",
"                return str(candidate)\n",
"\n",
"        return str(start)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "fa781fd3",
"metadata": {},
"outputs": [],
"source": [
"class RepoIntelligentAssistant:\n",
"    \"\"\"Chat assistant that answers questions about the repository under ``root_folder``.\n",
"\n",
"    Repository text files are embedded into a FAISS index and offered to a\n",
"    LangChain agent through two tools: a retrieval-QA search and a file\n",
"    reader. The agent is built lazily on the first call to ``run``.\n",
"    \"\"\"\n",
"\n",
"    system_prompt = \"\"\"\n",
"    You are an intelligent software engineer and codebase assistant with deep expertise in all programming domains (software engineering, data science, web development, etc.).\n",
"\n",
"    BEHAVIOR GUIDELINES:\n",
"\n",
"    1. QUESTION EVALUATION:\n",
"    - First determine if the question requires repository context or general coding knowledge\n",
"    - For repo-specific questions: Use tools to fetch relevant code/files\n",
"    - For general coding questions: Use your expertise to answer directly\n",
"\n",
"    2. CODE EXPLANATION:\n",
"    - When explaining code: Break it down line-by-line\n",
"    - Show the actual code and explain what each line does\n",
"    - Focus on the specific implementation in the repository\n",
"    - Explain the purpose and context within the codebase\n",
"\n",
"    3. CODE IMPROVEMENTS:\n",
"    - When suggesting improvements: Analyze the repository structure first\n",
"    - If suggesting new functions/classes: Write complete, working code\n",
"    - Base suggestions on the existing codebase patterns and architecture\n",
"    - Show the user the complete code implementation\n",
"\n",
"    4. CODE MODIFICATION:\n",
"    - NEVER modify files directly\n",
"    - Always show the user the updated code\n",
"    - Let the user decide whether to implement changes\n",
"    - Provide clear instructions on where/how to apply changes\n",
"\n",
"    5. RESPONSE SEQUENCE:\n",
"    a) Evaluate question type (repo-specific vs general)\n",
"    b) If repo-specific: Fetch relevant context using tools\n",
"    c) Provide detailed explanation with code examples\n",
"    d) If suggesting improvements: Show complete code implementation\n",
"    e) Always explain the reasoning behind suggestions\n",
"\n",
"    Your tone: Precise, technical, and helpful - like a senior developer mentoring a colleague.\n",
"    \"\"\"\n",
"\n",
"    MODEL = \"gpt-4o-mini\"\n",
"\n",
"    # Number of chunks the retriever passes to the QA chain for each query.\n",
"    RETRIEVER_K = 30\n",
"\n",
"    # Patterns loaded from each folder. They are deliberately non-recursive:\n",
"    # create_doc visits every folder itself, so a \"**/\" prefix would load the\n",
"    # same file once per ancestor folder and reach into pruned folders.\n",
"    SOURCE_GLOBS = [\n",
"        \"*.py\", \"*.ipynb\", \"*.md\", \"*.txt\", \"*.json\", \"*.csv\", \"*.html\", \"*.css\",\n",
"        \"*.js\", \"*.ts\", \"*.java\", \"*.c\", \"*.cpp\", \"*.go\", \"*.rust\", \"*.rs\",\n",
"        \"*.scala\", \"*.swift\", \"*.kotlin\", \"*.kt\",\n",
"    ]\n",
"\n",
"    def __init__(self, use_file_path_as_root = False, root_path=None):\n",
"        \"\"\"Create the chat model and resolve which folder is treated as the repository.\n",
"\n",
"        Args:\n",
"            use_file_path_as_root: Forwarded to ``ProjectRootDetector.find_root``.\n",
"            root_path: Forwarded to ``ProjectRootDetector.find_root``.\n",
"        \"\"\"\n",
"        self.llm = ChatOpenAI(\n",
"            temperature=0.7,\n",
"            callbacks=[StreamingStdOutCallbackHandler()],\n",
"            model_name=self.MODEL\n",
"        )\n",
"        self.root_folder = ProjectRootDetector().find_root(root_path=root_path, use_file_path_as_root=use_file_path_as_root)\n",
"        # Built on first use by run(), because indexing the repository is expensive.\n",
"        self.agent = None\n",
"\n",
"    def get_file(self, file_name: str):\n",
"        \"\"\"Return the text of one repository file, looked up by name or by path.\n",
"\n",
"        Args:\n",
"            file_name: A file name (\"app.py\"), a name without extension (\"app\"),\n",
"                or a path, absolute or relative to the repository root, such as\n",
"                one of the paths listed when a name is ambiguous.\n",
"\n",
"        Returns:\n",
"            A string: the file content under a header line, or a message saying\n",
"            that nothing matched, that several files matched, or that reading\n",
"            failed.\n",
"        \"\"\"\n",
"        # Tool input may arrive wrapped in quotes or padded with whitespace.\n",
"        file_name = str(file_name).strip().strip('\"').strip(\"'\").strip()\n",
"\n",
"        matches = []\n",
"        separators = [sep for sep in (os.sep, os.altsep) if sep]\n",
"        if any(sep in file_name for sep in separators):\n",
"            # A path was given: resolve it against the repository root and accept\n",
"            # it only when it stays inside the repository.\n",
"            root_dir = os.path.abspath(self.root_folder)\n",
"            candidate = os.path.abspath(os.path.join(root_dir, file_name))\n",
"            repo_prefix = os.path.normcase(os.path.join(root_dir, \"\"))\n",
"            if os.path.normcase(candidate).startswith(repo_prefix) and os.path.isfile(candidate):\n",
"                matches.append(candidate)\n",
"        elif file_name:\n",
"            has_extension = \".\" in file_name\n",
"            for folder, _dirs, files in os.walk(self.root_folder):\n",
"                for f in files:\n",
"                    # Without an extension in the query, compare against the part\n",
"                    # of each file's name before its first dot.\n",
"                    key = f if has_extension else f.split(\".\")[0]\n",
"                    if key == file_name:\n",
"                        matches.append(os.path.join(folder, f))\n",
"\n",
"        if not matches:\n",
"            return f\"No file named '{file_name}' found in the repository.\"\n",
"\n",
"        if len(matches) > 1:\n",
"            listed = \"\\n\".join(matches)\n",
"            return (\n",
"                f\"There are multiple files named '{file_name}'. \"\n",
"                f\"Please specify the full path.\\n\\n{listed}\"\n",
"            )\n",
"\n",
"        file_path = matches[0]\n",
"        try:\n",
"            with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
"                content = file.read()\n",
"            return f\"Content of '{file_name}':\\n\\n{content}\"\n",
"        except Exception as e:\n",
"            return f\"Error reading file: {e}\"\n",
"\n",
"    def _get_all_folders_recursive(self, root_path):\n",
"        \"\"\"Return ``root_path`` and every folder below it that should be indexed.\n",
"\n",
"        Folders whose name is in the skip list are pruned during the walk, so\n",
"        nothing beneath them is visited. ``root_path`` itself is always kept.\n",
"        A folder is not dropped for containing binary or archive files: only\n",
"        files matching ``SOURCE_GLOBS`` are ever loaded from it.\n",
"        \"\"\"\n",
"        skip_folders = {\n",
"            '.git', '.svn', '.hg', '.bzr', 'node_modules', 'bower_components', 'vendor', '.venv',\n",
"            '__pycache__', '.pytest_cache', '.coverage', 'venv', 'env', '.env',\n",
"            'dist', 'build', 'target', 'out', 'bin', 'obj', '.vscode', '.idea', '.vs', '.eclipse',\n",
"            '.DS_Store', 'Thumbs.db', '.Trash', 'temp', 'tmp', 'cache', 'logs', '.cache', '.tmp',\n",
"            'packages', 'lib', 'libs', 'dependencies', 'generated', 'auto-generated', '.generated',\n",
"            'backup', 'backups', '.backup', 'coverage', '.nyc_output', 'htmlcov', 'docs/_build', '_build', 'site', '_site'\n",
"        }\n",
"\n",
"        all_folders = []\n",
"        for folder, dirs, _files in os.walk(root_path):\n",
"            # Assigning through dirs[:] edits the list os.walk descends into.\n",
"            dirs[:] = sorted(d for d in dirs if d not in skip_folders)\n",
"            all_folders.append(folder)\n",
"\n",
"        return all_folders\n",
"\n",
"    def create_doc(self):\n",
"        \"\"\"Load every supported text file of the repository as a LangChain document.\n",
"\n",
"        Each folder is loaded on its own with non-recursive patterns, so every\n",
"        file is loaded exactly once and pruned folders stay excluded. The name\n",
"        of the containing folder is stored in ``metadata[\"doc_type\"]``.\n",
"        \"\"\"\n",
"        documents = []\n",
"        for folder in self._get_all_folders_recursive(self.root_folder):\n",
"            # normpath drops a trailing separator, which would otherwise make the\n",
"            # base name of the root folder an empty string.\n",
"            doc_type = os.path.basename(os.path.normpath(folder))\n",
"            loader = DirectoryLoader(\n",
"                folder,\n",
"                glob=self.SOURCE_GLOBS,\n",
"                loader_cls=TextLoader,\n",
"                loader_kwargs={\"encoding\": \"utf-8\"},\n",
"            )\n",
"            for doc in loader.load():\n",
"                doc.metadata[\"doc_type\"] = doc_type\n",
"                documents.append(doc)\n",
"        return documents\n",
"\n",
"    def create_retriever(self):\n",
"        \"\"\"Split the repository documents, embed them into FAISS and return a retriever.\n",
"\n",
"        Raises:\n",
"            ValueError: If the repository yields no text to index.\n",
"        \"\"\"\n",
"        documents = self.create_doc()\n",
"        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n",
"        chunks = text_splitter.split_documents(documents)\n",
"        if not chunks:\n",
"            raise ValueError(f\"No indexable files were found under '{self.root_folder}'.\")\n",
"\n",
"        vectorstore = FAISS.from_documents(chunks, OpenAIEmbeddings())\n",
"\n",
"        # The result count has to be passed through search_kwargs.\n",
"        return vectorstore.as_retriever(search_kwargs={\"k\": self.RETRIEVER_K})\n",
"\n",
"    def create_agent(self):\n",
"        \"\"\"Build the conversational agent with its two repository tools and its memory.\"\"\"\n",
"        retriever = self.create_retriever()\n",
"        rag_chain = RetrievalQA.from_chain_type(\n",
"            llm=self.llm,\n",
"            retriever=retriever,\n",
"            chain_type=\"stuff\"\n",
"        )\n",
"\n",
"        rag_tool = Tool(\n",
"            name=\"repo_context_search\",\n",
"            func=lambda q: rag_chain.run(q),\n",
"            description=\"Search the repository for specific code, functions, classes, or implementation details. Use this when the user asks about something specific in the codebase like 'what does this function do', 'explain this code', or 'how is X implemented'.\"\n",
"        )\n",
"\n",
"        @tool(\"get_file\", return_direct=False)\n",
"        def get_file(x: str):\n",
"            \"\"\"\n",
"            Fetch the content of a specific file by name to analyze its code implementation.\n",
"            Use when user asks to 'explain this file', 'show me the code for X', or 'what's in this file'.\n",
"            If multiple files share the same name, list their paths instead.\n",
"            \"\"\"\n",
"            return self.get_file(x)\n",
"\n",
"        tools = [rag_tool, get_file]\n",
"\n",
"        # History is kept as a text transcript, the form the string-prompt\n",
"        # conversational agent substitutes for its chat_history variable.\n",
"        memory = ConversationBufferMemory(\n",
"            memory_key=\"chat_history\",\n",
"            input_key=\"input\"\n",
"        )\n",
"\n",
"        # The behaviour guidelines go in as the agent prompt prefix; the tool\n",
"        # list is appended directly after it, hence the closing lead-in line.\n",
"        prefix = self.system_prompt.strip() + \"\\n\\nTOOLS:\\n------\\n\\nYou have access to the following tools:\"\n",
"\n",
"        agent = initialize_agent(\n",
"            tools,\n",
"            self.llm,\n",
"            agent=AgentType.CONVERSATIONAL_REACT_DESCRIPTION,\n",
"            agent_kwargs={\"prefix\": prefix},\n",
"            handle_parsing_errors=True,\n",
"            memory=memory,\n",
"            verbose=True\n",
"        )\n",
"\n",
"        return agent\n",
"\n",
"    def run(self, question, history=None):\n",
"        \"\"\"Stream the agent's answer to ``question``.\n",
"\n",
"        Args:\n",
"            question: The user's message.\n",
"            history: Accepted so the method fits a ``(message, history)`` chat\n",
"                callback; it is not read, the agent keeps its own memory.\n",
"\n",
"        Yields:\n",
"            Each non-empty \"output\" value streamed by the agent, or a single\n",
"            \"Error: ...\" string if building or running the agent fails.\n",
"        \"\"\"\n",
"        try:\n",
"            # Create agent only once\n",
"            if self.agent is None:\n",
"                self.agent = self.create_agent()\n",
"\n",
"            for chunk in self.agent.stream({\"input\": question}):\n",
"                answer = chunk.get(\"output\")\n",
"                if answer:\n",
"                    yield answer\n",
"        except Exception as e:\n",
"            yield f\"Error: {str(e)}\"\n",
"\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "075a4612",
"metadata": {},
"outputs": [],
"source": [
"# Repository to index. Set the REPO_ROOT environment variable to point the\n",
"# assistant at another project; the default is the original author's checkout.\n",
"REPO_ROOT = os.environ.get(\n",
"    \"REPO_ROOT\",\n",
"    \"C:/Users/hp/projects/gen-ai/llm_engineering/week5/community-contributions/tourist-guide/\",\n",
")\n",
"G = RepoIntelligentAssistant(root_path=REPO_ROOT)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4f16318f",
"metadata": {},
"outputs": [],
"source": [
"# Serve the assistant in a Gradio chat UI; G.run is the chat callback.\n",
"interface = gr.ChatInterface(fn=G.run, type=\"messages\")\n",
"interface.launch()"
]
},
{
"cell_type": "markdown",
"id": "c0cf22c8",
"metadata": {},
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "b81ef168",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'..\\\\day 4 no_langchain'"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"os.path.normpath(\"../day 4 no_langchain\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}