diff --git a/week4/community-contributions/bharat_puri/docstring_generator.ipynb b/week4/community-contributions/bharat_puri/docstring_generator.ipynb new file mode 100644 index 0000000..8f17a08 --- /dev/null +++ b/week4/community-contributions/bharat_puri/docstring_generator.ipynb @@ -0,0 +1,596 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4a6ab9a2-28a2-445d-8512-a0dc8d1b54e9", + "metadata": {}, + "source": [ + "# Code DocString / Comment Generator\n", + "\n", + "Submitted By : Bharat Puri\n", + "\n", + "Goal: Build a code tool that scans Python modules, finds functions/classes\n", + "without docstrings, and uses an LLM (Claude / GPT / Gemini / Qwen etc.)\n", + "to generate high-quality Google or NumPy style docstrings." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "e610bf56-a46e-4aff-8de1-ab49d62b1ad3", + "metadata": {}, + "outputs": [], + "source": [ + "# imports\n", + "\n", + "import os\n", + "import io\n", + "import sys\n", + "import re\n", + "from dotenv import load_dotenv\n", + "import sys\n", + "sys.path.append(os.path.abspath(os.path.join(\"..\", \"..\"))) \n", + "from openai import OpenAI\n", + "import gradio as gr\n", + "import subprocess\n", + "from IPython.display import Markdown, display\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4f672e1c-87e9-4865-b760-370fa605e614", + "metadata": {}, + "outputs": [], + "source": [ + "load_dotenv(override=True)\n", + "openai_api_key = os.getenv('OPENAI_API_KEY')\n", + "anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')\n", + "google_api_key = os.getenv('GOOGLE_API_KEY')\n", + "grok_api_key = os.getenv('GROK_API_KEY')\n", + "groq_api_key = os.getenv('GROQ_API_KEY')\n", + "openrouter_api_key = os.getenv('OPENROUTER_API_KEY')\n", + "\n", + "if openai_api_key:\n", + " print(f\"OpenAI API Key exists and begins {openai_api_key[:8]}\")\n", + "else:\n", + " print(\"OpenAI API Key not set\")\n", + " \n", + "if anthropic_api_key:\n", + " print(f\"Anthropic API Key exists and begins {anthropic_api_key[:7]}\")\n", + "else:\n", + " print(\"Anthropic API Key not set (and this is optional)\")\n", + "\n", + "if google_api_key:\n", + " print(f\"Google API Key exists and begins {google_api_key[:2]}\")\n", + "else:\n", + " print(\"Google API Key not set (and this is optional)\")\n", + "\n", + "if grok_api_key:\n", + " print(f\"Grok API Key exists and begins {grok_api_key[:4]}\")\n", + "else:\n", + " print(\"Grok API Key not set (and this is optional)\")\n", + "\n", + "if groq_api_key:\n", + " print(f\"Groq API Key exists and begins {groq_api_key[:4]}\")\n", + "else:\n", + " print(\"Groq API Key not set (and this is optional)\")\n", + "\n", + "if openrouter_api_key:\n", + " print(f\"OpenRouter API Key exists and begins {openrouter_api_key[:6]}\")\n", + "else:\n", + " print(\"OpenRouter API Key not set (and this is optional)\")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "59863df1", + "metadata": {}, + "outputs": [], + "source": [ + "# Connect to client libraries\n", + "\n", + "openai = OpenAI()\n", + "\n", + "anthropic_url = \"https://api.anthropic.com/v1/\"\n", + "gemini_url = \"https://generativelanguage.googleapis.com/v1beta/openai/\"\n", + "grok_url = \"https://api.x.ai/v1\"\n", + "groq_url = \"https://api.groq.com/openai/v1\"\n", + "ollama_url = \"http://localhost:11434/v1\"\n", + "openrouter_url = \"https://openrouter.ai/api/v1\"\n", + "\n", + "anthropic = OpenAI(api_key=anthropic_api_key, base_url=anthropic_url)\n", + "gemini = OpenAI(api_key=google_api_key, base_url=gemini_url)\n", + "grok = OpenAI(api_key=grok_api_key, base_url=grok_url)\n", + "groq = OpenAI(api_key=groq_api_key, base_url=groq_url)\n", + "ollama = OpenAI(api_key=\"ollama\", base_url=ollama_url)\n", + "openrouter = OpenAI(api_key=openrouter_api_key, base_url=openrouter_url)\n", + "\n", + "MODEL = os.getenv(\"DOCGEN_MODEL\", \"gpt-4o-mini\")\n", + "\n", + "\n", + "# Registry for multiple model providers\n", + "MODEL_REGISTRY = {\n", + " \"gpt-4o-mini (OpenAI)\": {\n", + " \"provider\": \"openai\",\n", + " \"model\": \"gpt-4o-mini\",\n", + " },\n", + " \"gpt-4o (OpenAI)\": {\n", + " \"provider\": \"openai\",\n", + " \"model\": \"gpt-4o\",\n", + " },\n", + " \"claude-3.5-sonnet (Anthropic)\": {\n", + " \"provider\": \"anthropic\",\n", + " \"model\": \"claude-3.5-sonnet\",\n", + " },\n", + " \"gemini-1.5-pro (Google)\": {\n", + " \"provider\": \"google\",\n", + " \"model\": \"gemini-1.5-pro\",\n", + " },\n", + " \"codellama-7b (Open Source)\": {\n", + " \"provider\": \"open_source\",\n", + " \"model\": \"codellama-7b\",\n", + " },\n", + " \"starcoder2 (Open Source)\": {\n", + " \"provider\": \"open_source\",\n", + " \"model\": \"starcoder2\",\n", + " },\n", + "}\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "8aa149ed-9298-4d69-8fe2-8f5de0f667da", + "metadata": {}, + "outputs": [], + "source": [ + "models = [\"gpt-5\", \"claude-sonnet-4-5-20250929\", \"grok-4\", \"gemini-2.5-pro\", \"qwen2.5-coder\", \"deepseek-coder-v2\", \"gpt-oss:20b\", \"qwen/qwen3-coder-30b-a3b-instruct\", \"openai/gpt-oss-120b\", ]\n", + "\n", + "clients = {\"gpt-5\": openai, \"claude-sonnet-4-5-20250929\": anthropic, \"grok-4\": grok, \"gemini-2.5-pro\": gemini, \"openai/gpt-oss-120b\": groq, \"qwen2.5-coder\": ollama, \"deepseek-coder-v2\": ollama, \"gpt-oss:20b\": ollama, \"qwen/qwen3-coder-30b-a3b-instruct\": openrouter}\n", + "\n", + "# Want to keep costs ultra-low? Replace this with models of your choice, using the examples from yesterday" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "17b7d074-b1a4-4673-adec-918f82a4eff0", + "metadata": {}, + "outputs": [], + "source": [ + "# ================================================================\n", + "# Prompt Templates and Utilities\n", + "# ================================================================\n", + "\n", + "DOCSTYLE_TEMPLATES = {\n", + " \"google\": (\n", + " \"You will write a concise Google-style Python docstring for the given function or class.\\n\"\n", + " \"Rules:\\n\"\n", + " \"- One-line summary followed by short details.\\n\"\n", + " \"- Include Args:, Returns:, Raises: only if relevant.\\n\"\n", + " \"- Keep under 12 lines, no code fences or markdown formatting.\\n\"\n", + " \"Return ONLY the text between triple quotes.\"\n", + " ),\n", + "}\n", + "\n", + "SYSTEM_PROMPT = (\n", + " \"You are a senior Python engineer and technical writer. \"\n", + " \"Write precise, helpful docstrings.\"\n", + ")\n", + "\n", + "\n", + "def make_user_prompt(style: str, module_name: str, signature: str, code_context: str) -> str:\n", + " \"\"\"Build the user message for the model based on template and context.\"\"\"\n", + " instr = DOCSTYLE_TEMPLATES.get(style, DOCSTYLE_TEMPLATES[\"google\"])\n", + " prompt = (\n", + " f\"{instr}\\n\\n\"\n", + " f\"Module: {module_name}\\n\"\n", + " f\"Signature:\\n{signature}\\n\\n\"\n", + " f\"Code context:\\n{code_context}\\n\\n\"\n", + " \"Return ONLY a triple-quoted docstring, for example:\\n\"\n", + " '\"\"\"One-line summary.\\n\\n'\n", + " \"Args:\\n\"\n", + " \" x: Description\\n\"\n", + " \"Returns:\\n\"\n", + " \" y: Description\\n\"\n", + " '\"\"\"'\n", + " )\n", + " return prompt\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "16b3c10f-f7bc-4a2f-a22f-65c6807b7574", + "metadata": {}, + "outputs": [], + "source": [ + "# ================================================================\n", + "# LLM Chat Helper — OpenAI GPT\n", + "# ================================================================\n", + "def llm_generate_docstring(signature: str, context: str, style: str = \"google\", \n", + " module_name: str = \"module\", model_choice: str = \"gpt-4o-mini (OpenAI)\") -> str:\n", + " \"\"\"\n", + " Generate a Python docstring using the selected model provider.\n", + " \"\"\"\n", + " user_prompt = make_user_prompt(style, module_name, signature, context)\n", + " model_info = MODEL_REGISTRY.get(model_choice, MODEL_REGISTRY[\"gpt-4o-mini (OpenAI)\"])\n", + "\n", + " provider = model_info[\"provider\"]\n", + " model_name = model_info[\"model\"]\n", + "\n", + " if provider == \"openai\":\n", + " response = openai.chat.completions.create(\n", + " model=model_name,\n", + " temperature=0.2,\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": \"You are a senior Python engineer and technical writer.\"},\n", + " {\"role\": \"user\", \"content\": user_prompt},\n", + " ],\n", + " )\n", + " text = response.choices[0].message.content.strip()\n", + "\n", + " elif provider == \"anthropic\":\n", + " # Future: integrate Anthropic SDK\n", + " text = \"Claude response simulation: \" + user_prompt[:200]\n", + "\n", + " elif provider == \"google\":\n", + " # Future: integrate Gemini API\n", + " text = \"Gemini response simulation: \" + user_prompt[:200]\n", + "\n", + " else:\n", + " # Simulated open-source fallback\n", + " text = f\"[Simulated output from {model_name}]\\nAuto-generated docstring for {signature}\"\n", + "\n", + " import re\n", + " match = re.search(r'\"\"\"(.*?)\"\"\"', text, re.S)\n", + " return match.group(1).strip() if match else text\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "82da91ac-e563-4425-8b45-1b94880d342f", + "metadata": {}, + "outputs": [], + "source": [ + "# ================================================================\n", + "# 🧱 AST Parsing Utilities — find missing docstrings\n", + "# ================================================================\n", + "import ast\n", + "\n", + "def node_signature(node: ast.AST) -> str:\n", + " \"\"\"\n", + " Build a readable signature string from a FunctionDef or ClassDef node.\n", + " Example: def add(x, y) -> int:\n", + " \"\"\"\n", + " if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):\n", + " args = [a.arg for a in node.args.args]\n", + " if node.args.vararg:\n", + " args.append(\"*\" + node.args.vararg.arg)\n", + " for a in node.args.kwonlyargs:\n", + " args.append(a.arg + \"=?\")\n", + " if node.args.kwarg:\n", + " args.append(\"**\" + node.args.kwarg.arg)\n", + " ret = \"\"\n", + " if getattr(node, \"returns\", None):\n", + " try:\n", + " ret = f\" -> {ast.unparse(node.returns)}\"\n", + " except Exception:\n", + " pass\n", + " return f\"def {node.name}({', '.join(args)}){ret}:\"\n", + "\n", + " elif isinstance(node, ast.ClassDef):\n", + " return f\"class {node.name}:\"\n", + "\n", + " return \"\"\n", + "\n", + "\n", + "def context_snippet(src: str, node: ast.AST, max_lines: int = 60) -> str:\n", + " \"\"\"\n", + " Extract a small snippet of source code around a node for context.\n", + " This helps the LLM understand what the function/class does.\n", + " \"\"\"\n", + " lines = src.splitlines()\n", + " start = getattr(node, \"lineno\", 1) - 1\n", + " end = getattr(node, \"end_lineno\", start + 1)\n", + " snippet = lines[start:end]\n", + " if len(snippet) > max_lines:\n", + " snippet = snippet[:max_lines] + [\"# ... (truncated) ...\"]\n", + " return \"\\n\".join(snippet)\n", + "\n", + "\n", + "def find_missing_docstrings(src: str):\n", + " \"\"\"\n", + " Parse the Python source code and return a list of nodes\n", + " (module, class, function) that do NOT have docstrings.\n", + " \"\"\"\n", + " tree = ast.parse(src)\n", + " missing = []\n", + "\n", + " # Module-level docstring check\n", + " if ast.get_docstring(tree) is None:\n", + " missing.append((\"module\", tree))\n", + "\n", + " # Walk through the AST for classes and functions\n", + " for node in ast.walk(tree):\n", + " if isinstance(node, (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef)):\n", + " if ast.get_docstring(node) is None:\n", + " kind = \"class\" if isinstance(node, ast.ClassDef) else \"function\"\n", + " missing.append((kind, node))\n", + "\n", + " return missing\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea69108f-e4ca-4326-89fe-97c5748c0e79", + "metadata": {}, + "outputs": [], + "source": [ + "## Quick Test ##\n", + "\n", + "code = '''\n", + "def add(x, y):\n", + " return x + y\n", + "\n", + "class Counter:\n", + " def inc(self):\n", + " self.total += 1\n", + "'''\n", + "\n", + "for kind, node in find_missing_docstrings(code):\n", + " print(f\"Missing docstring → {kind}: {node_signature(node)}\")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "00d65b96-e65d-4e11-89be-06f265a5f2e3", + "metadata": {}, + "outputs": [], + "source": [ + "# ================================================================\n", + "# Insert Generated Docstrings into Code\n", + "# ================================================================\n", + "import difflib\n", + "import textwrap\n", + "\n", + "def insert_docstring(src: str, node: ast.AST, docstring: str) -> str:\n", + " \"\"\"\n", + " Insert a generated docstring inside a function/class node.\n", + " Keeps indentation consistent with the original code.\n", + " \"\"\"\n", + " lines = src.splitlines()\n", + " if not hasattr(node, \"body\") or not node.body:\n", + " return src # nothing to insert into\n", + "\n", + " start_idx = node.body[0].lineno - 1\n", + " indent = re.match(r\"\\s*\", lines[start_idx]).group(0)\n", + " ds_lines = textwrap.indent(f'\"\"\"{docstring.strip()}\"\"\"', indent).splitlines()\n", + "\n", + " new_lines = lines[:start_idx] + ds_lines + [\"\"] + lines[start_idx:]\n", + " return \"\\n\".join(new_lines)\n", + "\n", + "\n", + "def insert_module_docstring(src: str, docstring: str) -> str:\n", + " \"\"\"Insert a module-level docstring at the top of the file.\"\"\"\n", + " lines = src.splitlines()\n", + " ds_block = f'\"\"\"{docstring.strip()}\"\"\"\\n'\n", + " return ds_block + \"\\n\".join(lines)\n", + "\n", + "\n", + "def diff_text(a: str, b: str) -> str:\n", + " \"\"\"Show unified diff of original vs updated code.\"\"\"\n", + " return \"\".join(\n", + " difflib.unified_diff(\n", + " a.splitlines(keepends=True),\n", + " b.splitlines(keepends=True),\n", + " fromfile=\"original.py\",\n", + " tofile=\"updated.py\",\n", + " )\n", + " )\n", + "\n", + "\n", + "def generate_docstrings_for_source(src: str, style: str = \"google\", module_name: str = \"module\", model_choice: str = \"gpt-4o-mini (OpenAI)\"):\n", + " targets = find_missing_docstrings(src)\n", + " updated = src\n", + " report = []\n", + "\n", + " for kind, node in sorted(targets, key=lambda t: 0 if t[0] == \"module\" else 1):\n", + " sig = \"module \" + module_name if kind == \"module\" else node_signature(node)\n", + " ctx = src if kind == \"module\" else context_snippet(src, node)\n", + " doc = llm_generate_docstring(sig, ctx, style=style, module_name=module_name, model_choice=model_choice)\n", + "\n", + " if kind == \"module\":\n", + " updated = insert_module_docstring(updated, doc)\n", + " else:\n", + " updated = insert_docstring(updated, node, doc)\n", + "\n", + " report.append({\"kind\": kind, \"signature\": sig, \"model\": model_choice, \"doc_preview\": doc[:150]})\n", + "\n", + " return updated, report\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d00cf4b7-773d-49cb-8262-9d11d787ee10", + "metadata": {}, + "outputs": [], + "source": [ + "## Quick Test ##\n", + "new_code, report = generate_docstrings_for_source(code, style=\"google\", module_name=\"demo\")\n", + "\n", + "print(\"=== Generated Docstrings ===\")\n", + "for r in report:\n", + " print(f\"- {r['kind']}: {r['signature']}\")\n", + " print(\" \", r['doc_preview'])\n", + "print(\"\\n=== Updated Source ===\")\n", + "print(new_code)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "b318db41-c05d-48ce-9990-b6f1a0577c68", + "metadata": {}, + "outputs": [], + "source": [ + "# ================================================================\n", + "# 📂 File-Based Workflow — preview or apply docstrings\n", + "# ================================================================\n", + "from pathlib import Path\n", + "import pandas as pd\n", + "\n", + "def process_file(path: str, style: str = \"google\", apply: bool = False) -> pd.DataFrame:\n", + " \"\"\"\n", + " Process a .py file: find missing docstrings, generate them via GPT,\n", + " and either preview the diff or apply the updates in place.\n", + " \"\"\"\n", + " p = Path(path)\n", + " src = p.read_text(encoding=\"utf-8\")\n", + " updated, rows = generate_docstrings_for_source(src, style=style, module_name=p.stem)\n", + "\n", + " if apply:\n", + " p.write_text(updated, encoding=\"utf-8\")\n", + " print(f\"✅ Updated file written → {p}\")\n", + " else:\n", + " print(\"🔍 Diff preview:\")\n", + " print(diff_text(src, updated))\n", + "\n", + " return pd.DataFrame(rows)\n", + "\n", + "# Example usage:\n", + "# df = process_file(\"my_script.py\", style=\"google\", apply=False) # preview\n", + "# df = process_file(\"my_script.py\", style=\"google\", apply=True) # overwrite with docstrings\n", + "# df\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "8962cf0e-9255-475e-bbc1-21500be0cd78", + "metadata": {}, + "outputs": [], + "source": [ + "# ================================================================\n", + "# 📂 File-Based Workflow — preview or apply docstrings\n", + "# ================================================================\n", + "from pathlib import Path\n", + "import pandas as pd\n", + "\n", + "def process_file(path: str, style: str = \"google\", apply: bool = False) -> pd.DataFrame:\n", + " \"\"\"\n", + " Process a .py file: find missing docstrings, generate them via GPT,\n", + " and either preview the diff or apply the updates in place.\n", + " \"\"\"\n", + " p = Path(path)\n", + " src = p.read_text(encoding=\"utf-8\")\n", + " updated, rows = generate_docstrings_for_source(src, style=style, module_name=p.stem)\n", + "\n", + " if apply:\n", + " p.write_text(updated, encoding=\"utf-8\")\n", + " print(f\"✅ Updated file written → {p}\")\n", + " else:\n", + " print(\"🔍 Diff preview:\")\n", + " print(diff_text(src, updated))\n", + "\n", + " return pd.DataFrame(rows)\n", + "\n", + "# Example usage:\n", + "# df = process_file(\"my_script.py\", style=\"google\", apply=False) # preview\n", + "# df = process_file(\"my_script.py\", style=\"google\", apply=True) # overwrite with docstrings\n", + "# df\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0b0f852-982f-4918-9b5d-89880cc12003", + "metadata": {}, + "outputs": [], + "source": [ + "# ================================================================\n", + "# 🎨 Enhanced Gradio Interface with Model Selector\n", + "# ================================================================\n", + "import gradio as gr\n", + "\n", + "def gradio_generate(code_text: str, style: str, model_choice: str):\n", + " \"\"\"Wrapper for Gradio — generates docstrings using selected model.\"\"\"\n", + " if not code_text.strip():\n", + " return \"⚠️ Please paste some Python code first.\"\n", + " try:\n", + " updated, _ = generate_docstrings_for_source(\n", + " code_text, style=style, module_name=\"gradio_snippet\", model_choice=model_choice\n", + " )\n", + " return updated\n", + " except Exception as e:\n", + " return f\"❌ Error: {e}\"\n", + "\n", + "with gr.Blocks(theme=gr.themes.Soft()) as doc_ui:\n", + " gr.Markdown(\"## 🧠 Auto Docstring Generator — by Bharat Puri\\nChoose your model and generate high-quality docstrings.\")\n", + "\n", + " with gr.Row():\n", + " code_input = gr.Code(label=\"Paste your Python code\", language=\"python\", lines=18)\n", + " code_output = gr.Code(label=\"Generated code with docstrings\", language=\"python\", lines=18)\n", + "\n", + " with gr.Row():\n", + " style_choice = gr.Radio([\"google\"], value=\"google\", label=\"Docstring Style\")\n", + " model_choice = gr.Dropdown(\n", + " list(MODEL_REGISTRY.keys()),\n", + " value=\"gpt-4o-mini (OpenAI)\",\n", + " label=\"Select Model\",\n", + " )\n", + "\n", + " generate_btn = gr.Button(\"🚀 Generate Docstrings\")\n", + " generate_btn.click(\n", + " fn=gradio_generate,\n", + " inputs=[code_input, style_choice, model_choice],\n", + " outputs=[code_output],\n", + " )\n", + "\n", + "doc_ui.launch(share=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5e6d6720-de8e-4cbb-be9f-82bac3dcc71a", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}