Files
LLM_Engineering_OLD/week4/community-contributions/bharat_puri/docstring_generator.ipynb
2025-10-23 14:58:44 +05:30

613 lines
23 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"id": "4a6ab9a2-28a2-445d-8512-a0dc8d1b54e9",
"metadata": {},
"source": [
"# Code DocString / Comment Generator\n",
"\n",
"Submitted By : Bharat Puri\n",
"\n",
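"Goal: Build a tool that scans Python modules, finds functions and classes\n",
"without docstrings, and uses an LLM (Claude / GPT / Gemini / Qwen etc.)\n",
"to generate high-quality Google or NumPy style docstrings.\n",
"\n",
"Note: in this version only the OpenAI models make real API calls; the other entries in the model registry return simulated placeholder text."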
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "e610bf56-a46e-4aff-8de1-ab49d62b1ad3",
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import io\n",
"import sys\n",
"import re\n",
"from dotenv import load_dotenv\n",
"import sys\n",
"sys.path.append(os.path.abspath(os.path.join(\"..\", \"..\"))) \n",
"from openai import OpenAI\n",
"import gradio as gr\n",
"import subprocess\n",
"from IPython.display import Markdown, display\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "4f672e1c-87e9-4865-b760-370fa605e614",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"OpenAI API Key exists and begins sk-proj-\n",
"Anthropic API Key not set (and this is optional)\n",
"Google API Key not set (and this is optional)\n",
"Grok API Key not set (and this is optional)\n",
"Groq API Key not set (and this is optional)\n",
"OpenRouter API Key not set (and this is optional)\n"
]
}
],
"source": [
"# Load API keys from .env; override=True lets values in .env replace variables already set in the environment.\n",
"load_dotenv(override=True)\n",
"openai_api_key = os.getenv('OPENAI_API_KEY')\n",
"anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')\n",
"google_api_key = os.getenv('GOOGLE_API_KEY')\n",
"grok_api_key = os.getenv('GROK_API_KEY')\n",
"groq_api_key = os.getenv('GROQ_API_KEY')\n",
"openrouter_api_key = os.getenv('OPENROUTER_API_KEY')\n",
"\n",
"\n",
"def _report_api_key(label, key, shown, optional=True):\n",
"    \"\"\"Print whether a key is set, echoing only its first `shown` characters.\"\"\"\n",
"    if key:\n",
"        print(f\"{label} API Key exists and begins {key[:shown]}\")\n",
"    elif optional:\n",
"        print(f\"{label} API Key not set (and this is optional)\")\n",
"    else:\n",
"        print(f\"{label} API Key not set\")\n",
"\n",
"\n",
"# Only the OpenAI key is reported as required; all others are optional.\n",
"_report_api_key(\"OpenAI\", openai_api_key, 8, optional=False)\n",
"_report_api_key(\"Anthropic\", anthropic_api_key, 7)\n",
"_report_api_key(\"Google\", google_api_key, 2)\n",
"_report_api_key(\"Grok\", grok_api_key, 4)\n",
"_report_api_key(\"Groq\", groq_api_key, 4)\n",
"_report_api_key(\"OpenRouter\", openrouter_api_key, 6)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "59863df1",
"metadata": {},
"outputs": [],
"source": [
"# One OpenAI-compatible client per provider (each endpoint URL sits next to its client).\n",
"\n",
"openai = OpenAI()\n",
"\n",
"anthropic_url = \"https://api.anthropic.com/v1/\"\n",
"anthropic = OpenAI(api_key=anthropic_api_key, base_url=anthropic_url)\n",
"\n",
"gemini_url = \"https://generativelanguage.googleapis.com/v1beta/openai/\"\n",
"gemini = OpenAI(api_key=google_api_key, base_url=gemini_url)\n",
"\n",
"grok_url = \"https://api.x.ai/v1\"\n",
"grok = OpenAI(api_key=grok_api_key, base_url=grok_url)\n",
"\n",
"groq_url = \"https://api.groq.com/openai/v1\"\n",
"groq = OpenAI(api_key=groq_api_key, base_url=groq_url)\n",
"\n",
"ollama_url = \"http://localhost:11434/v1\"\n",
"ollama = OpenAI(api_key=\"ollama\", base_url=ollama_url)\n",
"\n",
"openrouter_url = \"https://openrouter.ai/api/v1\"\n",
"openrouter = OpenAI(api_key=openrouter_api_key, base_url=openrouter_url)\n",
"\n",
"# Default model name, overridable through the DOCGEN_MODEL environment variable.\n",
"MODEL = os.getenv(\"DOCGEN_MODEL\", \"gpt-4o-mini\")\n",
"\n",
"\n",
"# Registry for multiple model providers.\n",
"# Rows are (provider, model, label); the registry key is \"<model> (<label>)\".\n",
"_REGISTRY_ROWS = [\n",
"    (\"openai\", \"gpt-4o-mini\", \"OpenAI\"),\n",
"    (\"openai\", \"gpt-4o\", \"OpenAI\"),\n",
"    (\"anthropic\", \"claude-3.5-sonnet\", \"Anthropic\"),\n",
"    (\"google\", \"gemini-1.5-pro\", \"Google\"),\n",
"    (\"open_source\", \"codellama-7b\", \"Open Source\"),\n",
"    (\"open_source\", \"starcoder2\", \"Open Source\"),\n",
"]\n",
"MODEL_REGISTRY = {\n",
"    f\"{model} ({label})\": {\"provider\": provider, \"model\": model}\n",
"    for provider, model, label in _REGISTRY_ROWS\n",
"}\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "8aa149ed-9298-4d69-8fe2-8f5de0f667da",
"metadata": {},
"outputs": [],
"source": [
"# Model ids offered for comparison, in display order.\n",
"models = [\n",
"    \"gpt-5\",\n",
"    \"claude-sonnet-4-5-20250929\",\n",
"    \"grok-4\",\n",
"    \"gemini-2.5-pro\",\n",
"    \"qwen2.5-coder\",\n",
"    \"deepseek-coder-v2\",\n",
"    \"gpt-oss:20b\",\n",
"    \"qwen/qwen3-coder-30b-a3b-instruct\",\n",
"    \"openai/gpt-oss-120b\",\n",
"]\n",
"\n",
"# Model id -> client object that serves it.\n",
"clients = {\n",
"    \"gpt-5\": openai,\n",
"    \"claude-sonnet-4-5-20250929\": anthropic,\n",
"    \"grok-4\": grok,\n",
"    \"gemini-2.5-pro\": gemini,\n",
"    \"openai/gpt-oss-120b\": groq,\n",
"    \"qwen2.5-coder\": ollama,\n",
"    \"deepseek-coder-v2\": ollama,\n",
"    \"gpt-oss:20b\": ollama,\n",
"    \"qwen/qwen3-coder-30b-a3b-instruct\": openrouter,\n",
"}\n",
"\n",
"# Want to keep costs ultra-low? Replace this with models of your choice, using the examples from yesterday"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "17b7d074-b1a4-4673-adec-918f82a4eff0",
"metadata": {},
"outputs": [],
"source": [
"# ================================================================\n",
"# Prompt Templates and Utilities\n",
"# ================================================================\n",
"\n",
"# Instruction text per docstring style; each entry opens the user prompt.\n",
"_GOOGLE_RULES = [\n",
"    \"You will write a concise Google-style Python docstring for the given function or class.\",\n",
"    \"Rules:\",\n",
"    \"- One-line summary followed by short details.\",\n",
"    \"- Include Args:, Returns:, Raises: only if relevant.\",\n",
"    \"- Keep under 12 lines, no code fences or markdown formatting.\",\n",
"    \"Return ONLY the text between triple quotes.\",\n",
"]\n",
"DOCSTYLE_TEMPLATES = {\"google\": \"\\n\".join(_GOOGLE_RULES)}\n",
"\n",
"# System-role persona text for the docstring writer.\n",
"SYSTEM_PROMPT = \" \".join([\n",
"    \"You are a senior Python engineer and technical writer.\",\n",
"    \"Write precise, helpful docstrings.\",\n",
"])\n",
"\n",
"\n",
"def make_user_prompt(style: str, module_name: str, signature: str, code_context: str) -> str:\n",
"    \"\"\"Assemble the user-role prompt: style rules, target details, then an output example.\n",
"\n",
"    Args:\n",
"        style: Key into DOCSTYLE_TEMPLATES; unknown keys use the \"google\" entry.\n",
"        module_name: Name reported on the \"Module:\" line.\n",
"        signature: Signature text of the item to document.\n",
"        code_context: Source excerpt shown under \"Code context:\".\n",
"\n",
"    Returns:\n",
"        The complete prompt text.\n",
"    \"\"\"\n",
"    rules = DOCSTYLE_TEMPLATES.get(style, DOCSTYLE_TEMPLATES[\"google\"])\n",
"    example = '\"\"\"One-line summary.\\n\\nArgs:\\n    x: Description\\nReturns:\\n    y: Description\\n\"\"\"'\n",
"    # Sections are separated by one blank line.\n",
"    sections = [\n",
"        rules,\n",
"        f\"Module: {module_name}\\nSignature:\\n{signature}\",\n",
"        f\"Code context:\\n{code_context}\",\n",
"        f\"Return ONLY a triple-quoted docstring, for example:\\n{example}\",\n",
"    ]\n",
"    return \"\\n\\n\".join(sections)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "16b3c10f-f7bc-4a2f-a22f-65c6807b7574",
"metadata": {},
"outputs": [],
"source": [
"# ================================================================\n",
"# LLM Chat Helper — OpenAI GPT\n",
"# ================================================================\n",
"def llm_generate_docstring(signature: str, context: str, style: str = \"google\",\n",
"                           module_name: str = \"module\", model_choice: str = \"gpt-4o-mini (OpenAI)\") -> str:\n",
"    \"\"\"\n",
"    Generate a Python docstring using the selected model provider.\n",
"\n",
"    Only the \"openai\" provider performs a real API call. Every other provider\n",
"    returns simulated placeholder text, so those results are NOT real docstrings.\n",
"\n",
"    Args:\n",
"        signature: Signature text of the item to document.\n",
"        context: Source excerpt given to the model.\n",
"        style: Docstring style key forwarded to make_user_prompt.\n",
"        module_name: Module name forwarded to make_user_prompt.\n",
"        model_choice: Key into MODEL_REGISTRY; unknown keys fall back to\n",
"            \"gpt-4o-mini (OpenAI)\".\n",
"\n",
"    Returns:\n",
"        The text between the first pair of triple double quotes in the reply,\n",
"        stripped; or the whole reply text when no such pair is present.\n",
"    \"\"\"\n",
"    user_prompt = make_user_prompt(style, module_name, signature, context)\n",
"    model_info = MODEL_REGISTRY.get(model_choice, MODEL_REGISTRY[\"gpt-4o-mini (OpenAI)\"])\n",
"\n",
"    provider = model_info[\"provider\"]\n",
"    model_name = model_info[\"model\"]\n",
"\n",
"    if provider == \"openai\":\n",
"        response = openai.chat.completions.create(\n",
"            model=model_name,\n",
"            temperature=0.2,\n",
"            messages=[\n",
"                # Use the shared persona constant instead of a second, diverging literal.\n",
"                {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n",
"                {\"role\": \"user\", \"content\": user_prompt},\n",
"            ],\n",
"        )\n",
"        # message.content may be None; treat that as an empty reply instead of crashing.\n",
"        text = (response.choices[0].message.content or \"\").strip()\n",
"\n",
"    elif provider == \"anthropic\":\n",
"        # Future: integrate Anthropic SDK\n",
"        text = \"Claude response simulation: \" + user_prompt[:200]\n",
"\n",
"    elif provider == \"google\":\n",
"        # Future: integrate Gemini API\n",
"        text = \"Gemini response simulation: \" + user_prompt[:200]\n",
"\n",
"    else:\n",
"        # Simulated open-source fallback\n",
"        text = f\"[Simulated output from {model_name}]\\nAuto-generated docstring for {signature}\"\n",
"\n",
"    # Keep only the docstring body when the model wrapped it in triple quotes.\n",
"    match = re.search(r'\"\"\"(.*?)\"\"\"', text, re.S)\n",
"    return match.group(1).strip() if match else text\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "82da91ac-e563-4425-8b45-1b94880d342f",
"metadata": {},
"outputs": [],
"source": [
"# ================================================================\n",
"# 🧱 AST Parsing Utilities — find missing docstrings\n",
"# ================================================================\n",
"import ast\n",
"\n",
"def node_signature(node: ast.AST) -> str:\n",
"    \"\"\"\n",
"    Build a readable signature string from a FunctionDef, AsyncFunctionDef or ClassDef node.\n",
"\n",
"    Functions keep their full parameter list (positional-only and keyword-only\n",
"    markers, annotations, defaults) and return annotation; coroutines are\n",
"    prefixed with ``async``; classes keep their bases and keywords.\n",
"    Example: def add(x, y) -> int:\n",
"\n",
"    Returns an empty string for any other node type (e.g. ast.Module).\n",
"    \"\"\"\n",
"    if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):\n",
"        keyword = \"async def\" if isinstance(node, ast.AsyncFunctionDef) else \"def\"\n",
"        try:\n",
"            params = ast.unparse(node.args)\n",
"        except Exception:\n",
"            # ast.unparse needs Python 3.9+; fall back to bare parameter names.\n",
"            spec = node.args\n",
"            names = [a.arg for a in getattr(spec, \"posonlyargs\", [])]\n",
"            if names:\n",
"                names.append(\"/\")\n",
"            names.extend(a.arg for a in spec.args)\n",
"            if spec.vararg:\n",
"                names.append(\"*\" + spec.vararg.arg)\n",
"            elif spec.kwonlyargs:\n",
"                names.append(\"*\")\n",
"            names.extend(a.arg for a in spec.kwonlyargs)\n",
"            if spec.kwarg:\n",
"                names.append(\"**\" + spec.kwarg.arg)\n",
"            params = \", \".join(names)\n",
"        ret = \"\"\n",
"        if getattr(node, \"returns\", None) is not None:\n",
"            try:\n",
"                ret = f\" -> {ast.unparse(node.returns)}\"\n",
"            except Exception:\n",
"                ret = \"\"  # best effort: omit the annotation rather than fail\n",
"        return f\"{keyword} {node.name}({params}){ret}:\"\n",
"\n",
"    if isinstance(node, ast.ClassDef):\n",
"        try:\n",
"            parents = [ast.unparse(base) for base in node.bases]\n",
"            parents += [\n",
"                f\"{kw.arg}={ast.unparse(kw.value)}\" if kw.arg else f\"**{ast.unparse(kw.value)}\"\n",
"                for kw in node.keywords\n",
"            ]\n",
"        except Exception:\n",
"            parents = []  # best effort: show the bare class name\n",
"        inherit = f\"({', '.join(parents)})\" if parents else \"\"\n",
"        return f\"class {node.name}{inherit}:\"\n",
"\n",
"    return \"\"\n",
"\n",
"\n",
"def context_snippet(src: str, node: ast.AST, max_lines: int = 60) -> str:\n",
"    \"\"\"\n",
"    Return the source lines spanned by `node`, capped at `max_lines`.\n",
"\n",
"    When the span is longer than `max_lines`, only the first `max_lines`\n",
"    lines are kept and a truncation marker line is appended.\n",
"    \"\"\"\n",
"    first = getattr(node, \"lineno\", 1) - 1\n",
"    last = getattr(node, \"end_lineno\", first + 1)\n",
"    window = src.splitlines()[first:last]\n",
"    if len(window) <= max_lines:\n",
"        return \"\\n\".join(window)\n",
"    return \"\\n\".join(window[:max_lines] + [\"# ... (truncated) ...\"])\n",
"\n",
"\n",
"def find_missing_docstrings(src: str):\n",
"    \"\"\"\n",
"    List every module/class/function in `src` that has no docstring.\n",
"\n",
"    Returns (kind, node) tuples where kind is \"module\", \"class\" or\n",
"    \"function\". The module entry, if any, comes first; the rest follow in\n",
"    ast.walk order.\n",
"    \"\"\"\n",
"    tree = ast.parse(src)\n",
"    documentable = (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef)\n",
"\n",
"    undocumented = [(\"module\", tree)] if ast.get_docstring(tree) is None else []\n",
"    undocumented.extend(\n",
"        (\"class\" if isinstance(item, ast.ClassDef) else \"function\", item)\n",
"        for item in ast.walk(tree)\n",
"        if isinstance(item, documentable) and ast.get_docstring(item) is None\n",
"    )\n",
"    return undocumented\n"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "ea69108f-e4ca-4326-89fe-97c5748c0e79",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Missing docstring → module: \n",
"Missing docstring → function: def add(x, y):\n",
"Missing docstring → class: class Counter:\n",
"Missing docstring → function: def inc(self):\n"
]
}
],
"source": [
"## Quick Test ##\n",
"# Sample module with no docstrings: expect one module, one class and two function hits.\n",
"\n",
"code = '''\n",
"def add(x, y):\n",
"    return x + y\n",
"\n",
"class Counter:\n",
"    def inc(self):\n",
"        self.total += 1\n",
"'''\n",
"\n",
"for missing_kind, missing_node in find_missing_docstrings(code):\n",
"    print(f\"Missing docstring → {missing_kind}: {node_signature(missing_node)}\")\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "00d65b96-e65d-4e11-89be-06f265a5f2e3",
"metadata": {},
"outputs": [],
"source": [
"# ================================================================\n",
"# Insert Generated Docstrings into Code\n",
"# ================================================================\n",
"import difflib\n",
"import textwrap\n",
"\n",
"def insert_docstring(src: str, node: ast.AST, docstring: str) -> str:\n",
"    \"\"\"\n",
"    Insert a generated docstring as the first statement of a function/class node.\n",
"\n",
"    The docstring takes the indentation of the node's first body statement.\n",
"    Two layouts need special care: a first statement that carries decorators\n",
"    (the docstring goes above them) and a body that shares a line with the\n",
"    header, e.g. ``def f(): return 1`` (the line is split). A trailing\n",
"    newline in `src` is preserved.\n",
"\n",
"    Args:\n",
"        src: Source text that `node` was parsed from (line numbers must match).\n",
"        node: Function or class node lacking a docstring.\n",
"        docstring: Docstring text without the surrounding triple quotes.\n",
"\n",
"    Returns:\n",
"        The updated source, or `src` unchanged when the node has no body.\n",
"    \"\"\"\n",
"    body = getattr(node, \"body\", None)\n",
"    if not body:\n",
"        return src  # nothing to insert into\n",
"\n",
"    # A triple quote inside the text, or a trailing quote/backslash, would end the literal early.\n",
"    text = docstring.strip().replace('\"\"\"', \"'''\")\n",
"    if text.endswith(('\"', \"\\\\\")):\n",
"        text += \" \"\n",
"\n",
"    lines = src.splitlines()\n",
"    first = body[0]\n",
"    row = first.lineno - 1\n",
"    # col_offset is a UTF-8 byte offset, so slice the encoded line.\n",
"    raw = lines[row].encode(\"utf-8\")\n",
"    head = raw[:first.col_offset].decode(\"utf-8\")\n",
"\n",
"    if head.strip():\n",
"        # Body shares a line with the header: keep the header, move the statement down.\n",
"        owner_indent = re.match(r\"\\s*\", lines[node.lineno - 1]).group(0)\n",
"        indent = owner_indent + \"    \"\n",
"        tail = raw[first.col_offset:].decode(\"utf-8\")\n",
"        before = lines[:row] + [head.rstrip()]\n",
"        after = [indent + tail] + lines[row + 1:]\n",
"    else:\n",
"        decorators = getattr(first, \"decorator_list\", None)\n",
"        if decorators:\n",
"            # Insert above the decorators, never between them and their def/class.\n",
"            row = min(row, decorators[0].lineno - 1)\n",
"        indent = re.match(r\"\\s*\", lines[row]).group(0)\n",
"        before, after = lines[:row], lines[row:]\n",
"\n",
"    ds_lines = textwrap.indent(f'\"\"\"{text}\"\"\"', indent).splitlines()\n",
"    result = \"\\n\".join(before + ds_lines + [\"\"] + after)\n",
"    if src.endswith((\"\\n\", \"\\r\")):\n",
"        result += \"\\n\"\n",
"    return result\n",
"\n",
"\n",
"def insert_module_docstring(src: str, docstring: str) -> str:\n",
"    \"\"\"Insert a module-level docstring at the top of the file.\n",
"\n",
"    A leading shebang line and a PEP 263 encoding declaration (first two lines\n",
"    only) stay above the docstring, because they stop working if pushed down.\n",
"    A blank line separates the docstring from following code, and a trailing\n",
"    newline in `src` is preserved.\n",
"\n",
"    Args:\n",
"        src: Module source text.\n",
"        docstring: Docstring text without the surrounding triple quotes.\n",
"\n",
"    Returns:\n",
"        The source with the docstring added.\n",
"    \"\"\"\n",
"    lines = src.splitlines()\n",
"\n",
"    # A triple quote inside the text, or a trailing quote/backslash, would end the literal early.\n",
"    text = docstring.strip().replace('\"\"\"', \"'''\")\n",
"    if text.endswith(('\"', \"\\\\\")):\n",
"        text += \" \"\n",
"\n",
"    # Count the leading shebang / coding-cookie lines that must remain first.\n",
"    keep = 0\n",
"    for idx, line in enumerate(lines[:2]):\n",
"        stripped = line.strip()\n",
"        if stripped and not stripped.startswith(\"#\"):\n",
"            break  # real code: nothing below it can be a shebang or cookie\n",
"        is_shebang = idx == 0 and line.startswith(\"#!\")\n",
"        is_cookie = re.match(r\"[ \\t\\f]*#.*?coding[:=][ \\t]*[-\\w.]+\", line) is not None\n",
"        if is_shebang or is_cookie:\n",
"            keep = idx + 1\n",
"\n",
"    rest = lines[keep:]\n",
"    ds_lines = f'\"\"\"{text}\"\"\"'.splitlines()\n",
"    gap = [\"\"] if rest and rest[0].strip() else []\n",
"    result = \"\\n\".join(lines[:keep] + ds_lines + gap + rest)\n",
"    if not src or src.endswith((\"\\n\", \"\\r\")):\n",
"        result += \"\\n\"\n",
"    return result\n",
"\n",
"\n",
"def diff_text(a: str, b: str) -> str:\n",
"    \"\"\"Return a unified diff from `a` to `b`, labelled original.py / updated.py.\"\"\"\n",
"    before = a.splitlines(keepends=True)\n",
"    after = b.splitlines(keepends=True)\n",
"    hunks = difflib.unified_diff(before, after, fromfile=\"original.py\", tofile=\"updated.py\")\n",
"    return \"\".join(hunks)\n",
"\n",
"\n",
"def generate_docstrings_for_source(src: str, style: str = \"google\", module_name: str = \"module\", model_choice: str = \"gpt-4o-mini (OpenAI)\"):\n",
"    \"\"\"\n",
"    Generate and insert docstrings for everything in `src` that lacks one.\n",
"\n",
"    Docstrings are requested module-first, then in the order returned by\n",
"    find_missing_docstrings. Insertion happens afterwards, bottom-up: AST line\n",
"    numbers refer to the original text, so editing from the end of the file\n",
"    towards the top keeps every remaining position valid. The module\n",
"    docstring is added last for the same reason.\n",
"\n",
"    Args:\n",
"        src: Python source text.\n",
"        style: Docstring style key passed to llm_generate_docstring.\n",
"        module_name: Name used in the prompt and in the module's report row.\n",
"        model_choice: Model registry key passed to llm_generate_docstring.\n",
"\n",
"    Returns:\n",
"        (updated_source, report) where report is a list of dicts with keys\n",
"        \"kind\", \"signature\", \"model\" and \"doc_preview\" (first 150 characters).\n",
"    \"\"\"\n",
"    targets = find_missing_docstrings(src)\n",
"    report = []\n",
"    pending = []  # (node, doc) pairs for functions and classes\n",
"    module_doc = None\n",
"\n",
"    for kind, node in sorted(targets, key=lambda t: 0 if t[0] == \"module\" else 1):\n",
"        sig = \"module \" + module_name if kind == \"module\" else node_signature(node)\n",
"        ctx = src if kind == \"module\" else context_snippet(src, node)\n",
"        doc = llm_generate_docstring(sig, ctx, style=style, module_name=module_name, model_choice=model_choice)\n",
"\n",
"        if kind == \"module\":\n",
"            module_doc = doc\n",
"        else:\n",
"            pending.append((node, doc))\n",
"\n",
"        report.append({\"kind\": kind, \"signature\": sig, \"model\": model_choice, \"doc_preview\": doc[:150]})\n",
"\n",
"    def _anchor(item):\n",
"        \"\"\"Position of the node's first body statement in the original source.\"\"\"\n",
"        first = item[0].body[0]\n",
"        return (first.lineno, first.col_offset)\n",
"\n",
"    # Apply insertions from the bottom of the file upwards so earlier line numbers stay valid.\n",
"    updated = src\n",
"    for node, doc in sorted(pending, key=_anchor, reverse=True):\n",
"        updated = insert_docstring(updated, node, doc)\n",
"\n",
"    if module_doc is not None:\n",
"        updated = insert_module_docstring(updated, module_doc)\n",
"\n",
"    return updated, report\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d00cf4b7-773d-49cb-8262-9d11d787ee10",
"metadata": {},
"outputs": [],
"source": [
"## Quick Test ##\n",
"# Runs the whole pipeline on the sample `code` string (the default model choice makes a real API call).\n",
"new_code, report = generate_docstrings_for_source(code, module_name=\"demo\", style=\"google\")\n",
"\n",
"print(\"=== Generated Docstrings ===\")\n",
"for entry in report:\n",
"    print(f\"- {entry['kind']}: {entry['signature']}\")\n",
"    print(\" \", entry['doc_preview'])\n",
"print(\"\\n=== Updated Source ===\")\n",
"print(new_code)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b318db41-c05d-48ce-9990-b6f1a0577c68",
"metadata": {},
"outputs": [],
"source": [
"# ================================================================\n",
"# 📂 File-Based Workflow — preview or apply docstrings\n",
"# ================================================================\n",
"from pathlib import Path\n",
"import pandas as pd\n",
"\n",
"def process_file(path: str, style: str = \"google\", apply: bool = False,\n",
"                 model_choice: str = \"gpt-4o-mini (OpenAI)\") -> pd.DataFrame:\n",
"    \"\"\"\n",
"    Process a .py file: find missing docstrings, generate them with the chosen\n",
"    model, and either preview the diff or apply the updates in place.\n",
"\n",
"    Args:\n",
"        path: Path of the Python file to process (read as UTF-8).\n",
"        style: Docstring style key.\n",
"        apply: When True, overwrite the file; when False, only print a diff.\n",
"        model_choice: Model registry key forwarded to generate_docstrings_for_source.\n",
"\n",
"    Returns:\n",
"        A DataFrame with one report row per generated docstring.\n",
"    \"\"\"\n",
"    p = Path(path)\n",
"    src = p.read_text(encoding=\"utf-8\")\n",
"    updated, rows = generate_docstrings_for_source(\n",
"        src, style=style, module_name=p.stem, model_choice=model_choice\n",
"    )\n",
"\n",
"    if apply:\n",
"        if updated == src:\n",
"            print(f\"ℹ️ Nothing to change → {p}\")\n",
"        else:\n",
"            try:\n",
"                # Never overwrite the user's file with code that no longer compiles.\n",
"                compile(updated, str(p), \"exec\")\n",
"            except SyntaxError as exc:\n",
"                print(f\"❌ Not writing {p}: generated source does not compile ({exc})\")\n",
"            else:\n",
"                p.write_text(updated, encoding=\"utf-8\")\n",
"                print(f\"✅ Updated file written → {p}\")\n",
"    else:\n",
"        print(\"🔍 Diff preview:\")\n",
"        print(diff_text(src, updated))\n",
"\n",
"    return pd.DataFrame(rows)\n",
"\n",
"# Example usage:\n",
"# df = process_file(\"my_script.py\", style=\"google\", apply=False) # preview\n",
"# df = process_file(\"my_script.py\", style=\"google\", apply=True) # overwrite with docstrings\n",
"# df\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8962cf0e-9255-475e-bbc1-21500be0cd78",
"metadata": {},
"outputs": [],
"source": [
"# ================================================================\n",
"# 📂 File-Based Workflow — preview or apply docstrings\n",
"# ================================================================\n",
"from pathlib import Path\n",
"import pandas as pd\n",
"\n",
"# NOTE(review): this cell repeats the previous cell's process_file definition;\n",
"# whichever cell runs last is the definition in effect. Consider deleting one.\n",
"def process_file(path: str, style: str = \"google\", apply: bool = False,\n",
"                 model_choice: str = \"gpt-4o-mini (OpenAI)\") -> pd.DataFrame:\n",
"    \"\"\"\n",
"    Process a .py file: find missing docstrings, generate them with the chosen\n",
"    model, and either preview the diff or apply the updates in place.\n",
"\n",
"    Args:\n",
"        path: Path of the Python file to process (read as UTF-8).\n",
"        style: Docstring style key.\n",
"        apply: When True, overwrite the file; when False, only print a diff.\n",
"        model_choice: Model registry key forwarded to generate_docstrings_for_source.\n",
"\n",
"    Returns:\n",
"        A DataFrame with one report row per generated docstring.\n",
"    \"\"\"\n",
"    p = Path(path)\n",
"    src = p.read_text(encoding=\"utf-8\")\n",
"    updated, rows = generate_docstrings_for_source(\n",
"        src, style=style, module_name=p.stem, model_choice=model_choice\n",
"    )\n",
"\n",
"    if apply:\n",
"        if updated == src:\n",
"            print(f\"ℹ️ Nothing to change → {p}\")\n",
"        else:\n",
"            try:\n",
"                # Never overwrite the user's file with code that no longer compiles.\n",
"                compile(updated, str(p), \"exec\")\n",
"            except SyntaxError as exc:\n",
"                print(f\"❌ Not writing {p}: generated source does not compile ({exc})\")\n",
"            else:\n",
"                p.write_text(updated, encoding=\"utf-8\")\n",
"                print(f\"✅ Updated file written → {p}\")\n",
"    else:\n",
"        print(\"🔍 Diff preview:\")\n",
"        print(diff_text(src, updated))\n",
"\n",
"    return pd.DataFrame(rows)\n",
"\n",
"# Example usage:\n",
"# df = process_file(\"my_script.py\", style=\"google\", apply=False) # preview\n",
"# df = process_file(\"my_script.py\", style=\"google\", apply=True) # overwrite with docstrings\n",
"# df\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b0b0f852-982f-4918-9b5d-89880cc12003",
"metadata": {},
"outputs": [],
"source": [
"# ================================================================\n",
"# 🎨 Enhanced Gradio Interface with Model Selector\n",
"# ================================================================\n",
"import gradio as gr\n",
"\n",
"def gradio_generate(code_text: str, style: str, model_choice: str):\n",
"    \"\"\"Wrapper for Gradio — generates docstrings using selected model.\n",
"\n",
"    Args:\n",
"        code_text: Python source from the input editor; may be empty or None.\n",
"        style: Docstring style key.\n",
"        model_choice: Model registry key.\n",
"\n",
"    Returns:\n",
"        The updated source, a warning when no code was supplied, or an\n",
"        error message when generation fails.\n",
"    \"\"\"\n",
"    # The editor may deliver None instead of \"\" when nothing was entered.\n",
"    if not code_text or not code_text.strip():\n",
"        return \"⚠️ Please paste some Python code first.\"\n",
"    try:\n",
"        updated, _ = generate_docstrings_for_source(\n",
"            code_text, style=style, module_name=\"gradio_snippet\", model_choice=model_choice\n",
"        )\n",
"        return updated\n",
"    except Exception as e:\n",
"        # UI boundary: show the failure in the output pane instead of raising into Gradio.\n",
"        return f\"❌ Error: {e}\"\n",
"\n",
"# Layout: code in/out side by side, then the options row, then the trigger button.\n",
"with gr.Blocks(theme=gr.themes.Soft()) as doc_ui:\n",
"    gr.Markdown(\n",
"        value=\"## 🧠 Auto Docstring Generator — by Bharat Puri\\nChoose your model and generate high-quality docstrings.\"\n",
"    )\n",
"\n",
"    with gr.Row():\n",
"        code_input = gr.Code(language=\"python\", lines=18, label=\"Paste your Python code\")\n",
"        code_output = gr.Code(language=\"python\", lines=18, label=\"Generated code with docstrings\")\n",
"\n",
"    with gr.Row():\n",
"        style_choice = gr.Radio(choices=[\"google\"], value=\"google\", label=\"Docstring Style\")\n",
"        model_choice = gr.Dropdown(\n",
"            choices=list(MODEL_REGISTRY),\n",
"            value=\"gpt-4o-mini (OpenAI)\",\n",
"            label=\"Select Model\",\n",
"        )\n",
"\n",
"    generate_btn = gr.Button(value=\"🚀 Generate Docstrings\")\n",
"    # The order of `inputs` matches gradio_generate(code_text, style, model_choice).\n",
"    generate_btn.click(fn=gradio_generate, inputs=[code_input, style_choice, model_choice], outputs=[code_output])\n",
"\n",
"doc_ui.launch(share=False)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.14"
}
},
"nbformat": 4,
"nbformat_minor": 5
}