From 756bd0b623be3b8906b677e50ad7f119ae842d14 Mon Sep 17 00:00:00 2001 From: Krabulek Date: Fri, 19 Sep 2025 13:45:07 +0200 Subject: [PATCH 1/3] Week 3 exercise - Intelligent Dataset Generator --- .../intelligent_dataset_generator.ipynb | 600 ++++++++++++++++++ 1 file changed, 600 insertions(+) create mode 100644 week3/community-contributions/intelligent_dataset_generator.ipynb diff --git a/week3/community-contributions/intelligent_dataset_generator.ipynb b/week3/community-contributions/intelligent_dataset_generator.ipynb new file mode 100644 index 0000000..9a374a1 --- /dev/null +++ b/week3/community-contributions/intelligent_dataset_generator.ipynb @@ -0,0 +1,600 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "QTJt9pwUTbHo" + }, + "source": [ + "# Intelligent Synthetic Dataset Generator\n", + "\n", + "An AI-powered tool that creates realistic synthetic datasets for any business case—whether you provide the schema or let it intelligently design one for you.\n", + "\n", + "It works with the Claude, Gemini, GPT, and Hugging Face APIs." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "l_FljmlTUoka" + }, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "aONqZ-SjUJdg", + "outputId": "1f5c7b2e-95f0-4f23-cf01-2bd5bda0807a" + }, + "outputs": [], + "source": [ + "!pip install -q requests bitsandbytes anthropic" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Ub1unBFvTatE" + }, + "outputs": [], + "source": [ + "import os\n", + "import requests\n", + "import json\n", + "from google.colab import userdata\n", + "\n", + "from openai import OpenAI\n", + "import anthropic\n", + "from huggingface_hub import login\n", + "from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n", + "import torch\n", + "import pandas as pd\n", + "\n", + "import gradio as gr\n", + "import gc" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "viZNPtObUOcz" + }, + "outputs": [], + "source": [ + "hf_token = userdata.get('HF_TOKEN')\n", + "openai_api_key = userdata.get('OPENAI_API_KEY')\n", + "anthropic_api_key = userdata.get('ANTHROPIC_API_KEY')\n", + "google_api_key = userdata.get('GOOGLE_API_KEY')\n", + "\n", + "login(hf_token, add_to_git_credential=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9Q94S6JTUWn5" + }, + "outputs": [], + "source": [ + "quant_config = BitsAndBytesConfig(\n", + " load_in_4bit=True,\n", + " bnb_4bit_use_double_quant=True,\n", + " bnb_4bit_compute_dtype=torch.bfloat16,\n", + " bnb_4bit_quant_type=\"nf4\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mrjdVEpaUxHz" + }, + "source": [ + "## Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "LvNE6foEUPaz" + }, + "outputs": [], + "source": [ + "LLAMA = \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n", + "PHI3 = \"microsoft/Phi-3-mini-4k-instruct\"\n", + "GEMMA2 = \"google/gemma-2-2b-it\"\n", + "GPT = \"gpt-4o-mini\"\n", + "CLAUDE = \"claude-3-haiku-20240307\"\n", + "GEMINI = \"gemini-2.0-flash\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tvafTFD8XmaO" + }, + "outputs": [], + "source": [ + "MODELS = {\n", + " 'Llama 3.1' : LLAMA,\n", + " 'Phi 3 mini': PHI3,\n", + " 'Gemma 2': GEMMA2,\n", + " 'GPT-4o mini': GPT,\n",
+ " 'Claude 3 Haiku': CLAUDE,\n", + " 'Gemini 2.0 Flash': GEMINI,\n", + "}\n", + "\n", + "HF_MODELS = [LLAMA, PHI3, GEMMA2]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2LZqA9QXXl0t" + }, + "outputs": [], + "source": [ + "FILE_FORMATS = [\".csv\", \".tsv\", \".jsonl\", \".json\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "d6EnN7SVXhza", + "outputId": "55f6ac4d-adeb-4216-b2a8-d67524b005d3" + }, + "outputs": [], + "source": [ + "SCHEMA = [\n", + " (\"Name\", \"TEXT\", \"Name of the restaurant\", \"Blue River Bistro\"),\n", + " (\"Address\", \"TEXT\", \"Restaurant address\", \"742 Evergreen Terrace, Springfield, IL 62704\"),\n", + " (\"Type\", \"TEXT\", \"Kitchen type\", 'One of [\"Thai\",\"Mediterranean\",\"Vegan\",\"Steakhouse\",\"Japanese\"] or other potential types'),\n", + " (\"Average Price\", \"TEXT\", \"Average meal price\", \"$45, or '--' if unknown\"),\n", + " (\"Year\", \"INT\", \"Year of restaurant opening\", 2015),\n", + " (\"Menu\", \"Array\", \"List of meals\", '[\"Grilled Salmon\", \"Caesar Salad\", \"Pad Thai\", \"Margherita Pizza\", ...]'),\n", + "]\n", + "\n", + "DEFAULT_SCHEMA_TEXT = \"\\n\".join([f\"{i+1}. {col[0]} ({col[1]}) - {col[2]}, example: {col[3]}\" for i, col in enumerate(SCHEMA)])\n", + "print(DEFAULT_SCHEMA_TEXT)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "W-46TDTOXiS7" + }, + "outputs": [], + "source": [ + "system_prompt = \"\"\"\n", + "You are an expert in generating synthetic datasets tailored to a given business case and user requirements.\n", + "If the user does not specify output columns, infer and create the most appropriate columns based on your expertise.\n", + "Do NOT repeat column values from one row to another. 
{ + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "W-46TDTOXiS7" + }, + "outputs": [], + "source": [ + "system_prompt = \"\"\"\n", + "You are an expert in generating synthetic datasets tailored to a given business case and user requirements.\n", + "If the user does not specify output columns, infer and create the most appropriate columns based on your expertise.\n", + "Do NOT repeat column values from one row to another. Only output valid JSONL without any comments.\n", + "\"\"\"\n", + "\n", + "\n", + "def get_user_prompt(business_case, schema_text, nr_records):\n", + " prompt = f\"The business case is: {business_case}.\\nGenerate {nr_records} rows of data in JSONL format.\\n\"\n", + "\n", + " if schema_text is not None:\n", + " prompt += f\"Each line should be a JSON object with the following fields: \\n{schema_text}\\n\"\n", + "\n", + " return prompt" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gPf1GcAwhwa_" + }, + "source": [ + "## LLM handler" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Tf-WEQUKhY-z" + }, + "outputs": [], + "source": [ + "def ask_gpt(model: str, user_prompt: str):\n", + " client = OpenAI(api_key=openai_api_key)\n", + " messages = [\n", + " {\"role\": \"system\", \"content\": system_prompt},\n", + " {\"role\": \"user\", \"content\": user_prompt}\n", + " ]\n", + " response = client.chat.completions.create(\n", + " model=model,\n", + " messages=messages,\n", + " temperature=0.7\n", + " )\n", + " content = response.choices[0].message.content\n", + "\n", + " return content" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "856pnIHahzDd" + }, + "outputs": [], + "source": [ + "def ask_claude(model: str, user_prompt: str):\n", + " client = anthropic.Anthropic(api_key=anthropic_api_key)\n", + " response = client.messages.create(\n", + " model=model,\n", + " messages=[{\"role\": \"user\", \"content\": user_prompt}],\n", + " max_tokens=4000,\n", + " temperature=0.7,\n", + " system=system_prompt\n", + " )\n", + " content = response.content[0].text\n", + "\n", + " return content" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "p0AfSbcBiUlg" + }, + "outputs": [], + "source": [ + "def ask_gemini(model: str, user_prompt: str):\n", + " client = OpenAI(\n", + " api_key=google_api_key,\n", + " base_url=\"https://generativelanguage.googleapis.com/v1beta/openai/\"\n", + " )\n", + " messages = [\n", + " {\"role\": \"system\", \"content\": system_prompt},\n", + " {\"role\": \"user\", \"content\": user_prompt}\n", + " ]\n", + " response = client.chat.completions.create(\n", + " model=model,\n", + " messages=messages,\n", + " temperature=0.7\n", + " )\n", + " content = response.choices[0].message.content\n", + "\n", + " return content" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "K9LZZPJ9irrH" + }, + "outputs": [], + "source": [ + "def ask_hf(model: str, user_prompt: str):\n", + " global tokenizer, inputs, hf_model, outputs\n", + "\n", + " messages = [\n", + " {\"role\": \"system\", \"content\": system_prompt},\n", + " {\"role\": \"user\", \"content\": user_prompt}\n", + " ]\n", + " if \"gemma\" in model.lower():\n", + " # Gemma's chat template has no system role; fold the instructions into the user turn.\n", + " messages = [{\"role\": \"user\", \"content\": f\"{system_prompt}\\n\\n{user_prompt}\"}]\n", + "\n", + " tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)\n", + " tokenizer.pad_token = tokenizer.eos_token\n", + " inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors=\"pt\").to(\"cuda\")\n", + " # Reload only when no model is cached yet or a different HF model was selected.\n", + " if hf_model is None or hf_model.name_or_path != model:\n", + " hf_model = AutoModelForCausalLM.from_pretrained(model, device_map=\"auto\", quantization_config=quant_config)\n", + " outputs = hf_model.generate(inputs, max_new_tokens=4000)\n", + "\n", + " # Decode only the newly generated tokens, which works for any chat template\n", + " # (the previous partition on \"assistant<|end_header_id|>\" was Llama-specific).\n", + " content = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)\n", + "\n", + " return content.strip()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "eu7Sv3bDhXdI" + }, + "outputs": [], + "source": [ + "def 
query_llm(model_name: str, user_prompt):\n", + " try:\n", + " model = MODELS[model_name]\n", + "\n", + " if \"gpt\" in model.lower():\n", + " response = ask_gpt(model, user_prompt)\n", + "\n", + " elif \"claude\" in model.lower():\n", + " response = ask_claude(model, user_prompt)\n", + "\n", + " elif \"gemini\" in model.lower():\n", + " response = ask_gemini(model, user_prompt)\n", + "\n", + " elif model in HF_MODELS:\n", + " response = ask_hf(model, user_prompt)\n", + "\n", + " else:\n", + " raise ValueError(f\"Unsupported model. Use one of {', '.join(MODELS.keys())}\")\n", + "\n", + " lines = [line.strip() for line in response.strip().splitlines() if line.strip().startswith(\"{\")]\n", + "\n", + " records = []\n", + " for line in lines:\n", + " try:\n", + " records.append(json.loads(line))\n", + " except json.JSONDecodeError:\n", + " # Skip malformed lines (e.g. a final row truncated at the token limit).\n", + " continue\n", + "\n", + " return records\n", + "\n", + " except Exception as e:\n", + " raise RuntimeError(f\"Model query failed: {e}\") from e" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mxuwLUsVlBlY" + }, + "source": [ + "## Output Formatter" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "IAKfqgZIlGuP" + }, + "outputs": [], + "source": [ + "def save_dataset(records, file_format: str, file_name: str):\n", + " df = pd.DataFrame(records)\n", + " print(df.shape)\n", + " if file_format == \".csv\":\n", + " df.to_csv(file_name, index=False)\n", + " elif file_format == \".tsv\":\n", + " df.to_csv(file_name, sep=\"\\t\", index=False)\n", + " elif file_format == \".jsonl\":\n", + " with open(file_name, \"w\") as f:\n", + " for record in records:\n", + " f.write(json.dumps(record) + \"\\n\")\n", + " elif file_format == \".json\":\n", + " # orient=\"records\" never includes the index, so index=False is redundant\n", + " # (and rejected by some pandas versions).\n", + " df.to_json(file_name, orient=\"records\")\n", + " else:\n", + " raise ValueError(\"Unsupported file format\")" + ] + }, + 
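{ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A quick sanity check of `save_dataset` before using it in the full pipeline: the cell below writes two invented records to a JSONL file and reads them back. The records and the file name are illustrative only." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "# Two hand-written records; values are placeholders, not generated data.\n", + "sample_records = [\n", + " {\"Name\": \"Blue River Bistro\", \"Year\": 2015},\n", + " {\"Name\": \"Golden Wok\", \"Year\": 2008},\n", + "]\n", + "save_dataset(sample_records, \".jsonl\", \"sanity_check.jsonl\")\n", + "\n", + "# Read the file back to confirm one JSON object per line was written.\n", + "with open(\"sanity_check.jsonl\") as f:\n", + " for line in f:\n", + " print(json.loads(line))" + ] + }, + 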
{ + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gkpkQ0nal_5B" + }, + "outputs": [], + "source": [ + "def generate_dataset(\n", + " model_name: str,\n", + " business_case: str,\n", + " num_records: int = 100,\n", + " schema_text: str = None,\n", + " file_format: str = '.jsonl',\n", + " file_name: str = 'test_dataset.jsonl'\n", + "):\n", + " \"\"\"\n", + " Generates a synthetic dataset using an LLM based on the given business case and optional schema.\n", + "\n", + " Returns:\n", + " Tuple[str, pd.DataFrame | None]: A status message and a preview DataFrame (first 10 rows) if successful.\n", + " \"\"\"\n", + " try:\n", + " # Validate number of records\n", + " if num_records <= 10:\n", + " return \"❌ Error: Number of records must be greater than 10.\", None\n", + " if num_records > 1000:\n", + " return \"❌ Error: Number of records must be less than or equal to 1000.\", None\n", + "\n", + " # Validate file format\n", + " if file_format not in FILE_FORMATS:\n", + " return f\"❌ Error: Invalid file format '{file_format}'. Supported formats: {FILE_FORMATS}\", None\n", + "\n", + " # Ensure file name has correct extension\n", + " if not file_name.endswith(file_format):\n", + " file_name += file_format\n", + "\n", + " # Generate the prompt and query the model\n", + " prompt = get_user_prompt(business_case, schema_text, num_records)\n", + " records = query_llm(model_name, prompt)\n", + "\n", + " if not records:\n", + " return \"❌ Error: No valid records were generated by the model.\", None\n", + "\n", + " # Save dataset\n", + " save_dataset(records, file_format, file_name)\n", + "\n", + " # Prepare preview\n", + " df = pd.DataFrame(records)\n", + " preview = df.head(10)\n", + "\n", + " success_message = (\n", + " f\"✅ Generated {len(records)} records successfully!\\n\"\n", + " f\"📁 Saved to: {file_name}\\n\"\n", + " )\n", + "\n", + " return success_message, preview\n", + "\n", + " except Exception as e:\n", + " return f\"❌ Error: {str(e)}\", None" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 702 + }, + "id": "Z9WdaSfFUakj", + "outputId": "2fbce2c5-a6d3-4dd8-a9d2-0e38c18d202e" + }, + "outputs": [], + "source": [ + "with gr.Blocks(title=\"Synthetic Dataset Generator\", theme=gr.themes.Monochrome()) as interface:\n", + " tokenizer = None\n", + " inputs = None\n", + " hf_model = None\n", + " outputs = None\n", + "\n", + " gr.Markdown(\"# Dataset Generator\")\n", + " gr.Markdown(\"Generate synthetic datasets using AI models\")\n", + "\n", + " with gr.Row():\n", + " with gr.Column(scale=2):\n", + " schema_input = gr.Textbox(\n", + " label=\"Schema\",\n", + " value=DEFAULT_SCHEMA_TEXT,\n", + " lines=15,\n", + " placeholder=\"Define your dataset schema here... Please follow this format: Name (TYPE) - Description, example: Example\"\n", + " )\n", + "\n", + " business_case_input = gr.Textbox(\n", + " label=\"Business Case\",\n", + " value=\"I want to generate a restaurant dataset\",\n", + " lines=1,\n", + " placeholder=\"Enter business case description...\"\n", + " )\n", + "\n", + " with gr.Row():\n", + " model_dropdown = gr.Dropdown(\n", + " label=\"Model\",\n", + " choices=list(MODELS.keys()),\n", + " value=list(MODELS.keys())[0],\n", + " interactive=True\n", + " )\n", + "\n", + " nr_records_input = gr.Number(\n", + " label=\"Number of records\",\n", + " value=27,\n", + " minimum=11,\n", + " maximum=1000,\n", + " step=1\n", + " )\n", + "\n", + " with gr.Row():\n", + " filename_input = gr.Textbox(\n", + " label=\"Save as\",\n", + " value=\"restaurant_dataset\",\n", + " placeholder=\"Enter filename (extension will be added automatically)\"\n", + " )\n", + "\n", + " file_format_dropdown = gr.Dropdown(\n", + " label=\"File format\",\n", + " choices=FILE_FORMATS,\n", + " value=FILE_FORMATS[0],\n", + " interactive=True\n", + " )\n", + "\n", + " generate_btn = gr.Button(\"🚀 Generate\", variant=\"secondary\", size=\"lg\")\n", + "\n", + " with gr.Column(scale=1):\n", + " gr.Markdown(\"\"\"\n", + " ### 📝 Dataset Generation Instructions\n", + "\n", + " 1. **🗂 Schema** – Define your dataset structure\n", + " *(default: restaurant schema provided)*\n", + " 2. **💡 Business Case** – Enter a prompt to guide the AI for generating data\n", + " 3. **🤖 Model** – Choose your AI model: GPT, Claude, Gemini, or Hugging Face\n", + " 4. **📊 Number of Records** – Specify entries to generate\n", + " *(min: 11, max: 1000)*\n", + " 5. **📁 File Format** – Select output type: `.csv`, `.tsv`, `.jsonl`, or `.json`\n", + " 6. 
**💾 Save As** – Provide a filename *(extension auto-added)*\n", + " 7. **🚀 Generate** – Click **Generate** to create your dataset\n", + "\n", + " ### 🔧 Requirements\n", + "\n", + " Set API keys in Colab’s secret section:\n", + " `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `GOOGLE_API_KEY`, `HF_TOKEN`\n", + " \"\"\")\n", + " output_status = gr.Textbox(\n", + " label=\"Status\",\n", + " lines=4,\n", + " interactive=False\n", + " )\n", + "\n", + " output_preview = gr.Dataframe(\n", + " label=\"Preview (first 10 rows)\",\n", + " interactive=False,\n", + " wrap=True\n", + " )\n", + "\n", + " generate_btn.click(\n", + " fn=generate_dataset,\n", + " inputs=[\n", + " model_dropdown,\n", + " business_case_input,\n", + " nr_records_input,\n", + " schema_input,\n", + " file_format_dropdown,\n", + " filename_input\n", + " ],\n", + " outputs=[output_status, output_preview]\n", + " )\n", + "\n", + "interface.launch(debug=True)\n", + "\n", + "del tokenizer, inputs, hf_model, outputs\n", + "gc.collect()\n", + "torch.cuda.empty_cache()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "w-ewbsjInopm" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 530ff7148b74a3b0f83537d26cd3a90a99c5798e Mon Sep 17 00:00:00 2001 From: Krabulek Date: Fri, 19 Sep 2025 14:03:37 +0200 Subject: [PATCH 2/3] small fix in week 4 contributions - python code documentation assistant --- .../Python_code_documentation_assistant.ipynb | 828 ++++++++++++++++++ 1 file changed, 828 insertions(+) create mode 100644 week4/community-contributions/Python_code_documentation_assistant.ipynb diff --git a/week4/community-contributions/Python_code_documentation_assistant.ipynb b/week4/community-contributions/Python_code_documentation_assistant.ipynb new file mode 100644 index 0000000..aebc0e3 --- /dev/null +++ b/week4/community-contributions/Python_code_documentation_assistant.ipynb @@ -0,0 +1,828 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4a6ab9a2-28a2-445d-8512-a0dc8d1b54e9", + "metadata": {}, + "source": [ + "# Python Code Documentation Assistant\n", + "\n", + "The requirement: use a Frontier model to add docstrings and comments to your Python code\n" + ] + }, + { + "cell_type": "markdown", + "id": "d4634170-c444-4326-9e68-5f87c63fa0e0", + "metadata": {}, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f72dfaf-9f20-4d81-b082-018eda152c9f", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install -U -q \"google-genai\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e610bf56-a46e-4aff-8de1-ab49d62b1ad3", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import io\n", + "import sys\n", + "from dotenv import load_dotenv\n", + "from openai import OpenAI\n", + "from google import genai\n", + "from google.genai import types\n", + "import anthropic\n", + "from IPython.display import Markdown, display, update_display\n", + "import gradio as gr\n", + "import subprocess" + ] + }, + { + "cell_type": 
"markdown", + "id": "f91e8b32-4c98-4210-a1e1-bfe0b1fddab7", + "metadata": {}, + "source": [ + "## Environment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4f672e1c-87e9-4865-b760-370fa605e614", + "metadata": {}, + "outputs": [], + "source": [ + "load_dotenv(override=True)\n", + "openai_api_key = os.getenv('OPENAI_API_KEY')\n", + "anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')\n", + "google_api_key = os.getenv('GOOGLE_API_KEY')\n", + "\n", + "if openai_api_key:\n", + " print(f\"OpenAI API Key exists and begins with: {openai_api_key[:8]}\")\n", + "else:\n", + " print(\"OpenAI API Key not set\")\n", + " \n", + "if anthropic_api_key:\n", + " print(f\"Anthropic API Key exists and begins with: {anthropic_api_key[:7]}\")\n", + "else:\n", + " print(\"Anthropic API Key not set\")\n", + "\n", + "if google_api_key:\n", + " print(f\"Google API Key exists and begins with: {google_api_key[:4]}\")\n", + "else:\n", + " print(\"Google API Key not set\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8aa149ed-9298-4d69-8fe2-8f5de0f667da", + "metadata": {}, + "outputs": [], + "source": [ + "openai = OpenAI()\n", + "claude = anthropic.Anthropic()\n", + "gemini = genai.Client()\n", + "\n", + "OPENAI_MODEL = \"o4-mini\"\n", + "CLAUDE_MODEL = \"claude-3-7-sonnet-latest\"\n", + "GEMINI_MODEL = \"gemini-2.5-flash\"" + ] + }, + { + "cell_type": "markdown", + "id": "88a18c58-40d5-4592-8dd3-d7c7b0d951aa", + "metadata": {}, + "source": [ + "## Prompts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6896636f-923e-4a2c-9d6c-fac07828a201", + "metadata": {}, + "outputs": [], + "source": [ + "system_message = \"\"\"\n", + "You are an assistant that documents Python code. \n", + "Your task: \n", + "- Add concise, clear, and informative docstrings to functions, classes, and modules. \n", + "- Add inline comments only where they improve readability or clarify intent. \n", + "- Do not modify the code logic or structure. \n", + "- Respond with Python code only. 
\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e7b3546-57aa-4c29-bc5d-f211970d04eb", + "metadata": {}, + "outputs": [], + "source": [ + "def user_prompt_for(python):\n", + " user_prompt = \"Add docstrings and comments to the following Python code:\\n\"\n", + " user_prompt += python\n", + " return user_prompt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6190659-f54c-4951-bef4-4960f8e51cc4", + "metadata": {}, + "outputs": [], + "source": [ + "def messages_for(python):\n", + " return [\n", + " {\"role\": \"system\", \"content\": system_message},\n", + " {\"role\": \"user\", \"content\": user_prompt_for(python)}\n", + " ]" + ] + }, + { + "cell_type": "markdown", + "id": "624e5066-bcf6-490d-a790-608d2bb34184", + "metadata": {}, + "source": [ + "## Helper functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71e1ba8c-5b05-4726-a9f3-8d8c6257350b", + "metadata": {}, + "outputs": [], + "source": [ + "def write_output(python, filename_suffix):\n", + " filename = f\"annotated_{filename_suffix}.py\"\n", + " code = python.replace(\"```python\",\"\").replace(\"```\",\"\")\n", + " with open(filename, \"w\") as f:\n", + " f.write(code)\n", + " print(f\"\\nWritten code to {filename}\")\n", + " return filename" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7d2fea8-74c6-4421-8f1e-0e76d5b201b9", + "metadata": {}, + "outputs": [], + "source": [ + "def annotate_with_gpt(python, task_name): \n", + " stream = openai.chat.completions.create(model=OPENAI_MODEL, messages=messages_for(python), stream=True)\n", + " reply = \"\"\n", + " for chunk in stream:\n", + " fragment = chunk.choices[0].delta.content or \"\"\n", + " reply += fragment\n", + " print(fragment, end='', flush=True)\n", + " return write_output(reply, f\"{task_name}_gpt\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7cd84ad8-d55c-4fe0-9eeb-1895c95c4a9d", + "metadata": {}, + "outputs": [], + "source": [ + "def annotate_with_claude(python, task_name):\n", + " result = claude.messages.stream(\n", + " model=CLAUDE_MODEL,\n", + " max_tokens=2000,\n", + " system=system_message,\n", + " messages=[{\"role\": \"user\", \"content\": user_prompt_for(python)}],\n", + " )\n", + " reply = \"\"\n", + " with result as stream:\n", + " for text in stream.text_stream:\n", + " reply += text\n", + " print(text, end=\"\", flush=True)\n", + " return write_output(reply, f\"{task_name}_claude\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8a35102-1c95-469b-8855-e85f4c9bdbdf", + "metadata": {}, + "outputs": [], + "source": [ + "def annotate_with_gemini(python, task_name):\n", + " reply = gemini.models.generate_content(\n", + " model=GEMINI_MODEL,\n", + " contents=user_prompt_for(python),\n", + " config=types.GenerateContentConfig(\n", + " system_instruction=system_message,\n", + " )\n", + " )\n", + "\n", + " print(reply.text)\n", + " return write_output(reply.text, f\"{task_name}_gemini\")" + ] + }, + { + "cell_type": "markdown", + "id": "028dcfdd-2d52-4e11-a79e-2214a97cb26d", + "metadata": {}, + "source": [ + "# Run the Annotator" + ] + }, + { + "cell_type": "markdown", + "id": "7462d9f9-6215-4fb0-9471-1d0141d33205", + "metadata": {}, + "source": [ + "## Pi example" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1cbb778-fa57-43de-b04b-ed523f396c38", + "metadata": {}, + "outputs": [], + "source": [ + "pi = \"\"\"\n", + "import time\n", + "\n", + "def 
calculate(iterations, param1, param2):\n", + " result = 1.0\n", + " for i in range(1, iterations+1):\n", + " j = i * param1 - param2\n", + " result -= (1/j)\n", + " j = i * param1 + param2\n", + " result += (1/j)\n", + " return result\n", + "\n", + "start_time = time.time()\n", + "result = calculate(100_000_000, 4, 1) * 4\n", + "end_time = time.time()\n", + "\n", + "print(f\"Result: {result:.12f}\")\n", + "print(f\"Execution Time: {(end_time - start_time):.6f} seconds\")\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "105db6f9-343c-491d-8e44-3a5328b81719", + "metadata": {}, + "outputs": [], + "source": [ + "gpt_pi = annotate_with_gpt(pi, \"pi\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "415819d0-fc95-4f78-a6ae-5c7d6781c6a7", + "metadata": {}, + "outputs": [], + "source": [ + "# check if the script works\n", + "\n", + "exec(open(gpt_pi).read())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "983a11fe-e24d-4c65-8269-9802c5ef3ae6", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "claude_pi = annotate_with_claude(pi, \"pi\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "52f5b710-0dea-4884-8ed7-a94059d88281", + "metadata": {}, + "outputs": [], + "source": [ + "exec(open(claude_pi).read())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "01f331f2-caac-48f6-9a03-8a228ee521bc", + "metadata": {}, + "outputs": [], + "source": [ + "gemini_pi = annotate_with_gemini(pi, \"pi\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23529942-53fa-46ad-a5db-1f3096dd6607", + "metadata": {}, + "outputs": [], + "source": [ + "exec(open(gemini_pi).read())" + ] + }, + { + "cell_type": "markdown", + "id": "7d1eaeca-61be-4d0a-a525-dd09f52aaa0f", + "metadata": {}, + "source": [ + "## Hard example" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c3b497b3-f569-420e-b92e-fb0f49957ce0", + "metadata": {}, + "outputs": [], + "source": [ + "python_hard = \"\"\"# Be careful to support large number sizes\n", + "\n", + "def lcg(seed, a=1664525, c=1013904223, m=2**32):\n", + " value = seed\n", + " while True:\n", + " value = (a * value + c) % m\n", + " yield value\n", + " \n", + "def max_subarray_sum(n, seed, min_val, max_val):\n", + " lcg_gen = lcg(seed)\n", + " random_numbers = [next(lcg_gen) % (max_val - min_val + 1) + min_val for _ in range(n)]\n", + " max_sum = float('-inf')\n", + " for i in range(n):\n", + " current_sum = 0\n", + " for j in range(i, n):\n", + " current_sum += random_numbers[j]\n", + " if current_sum > max_sum:\n", + " max_sum = current_sum\n", + " return max_sum\n", + "\n", + "def total_max_subarray_sum(n, initial_seed, min_val, max_val):\n", + " total_sum = 0\n", + " lcg_gen = lcg(initial_seed)\n", + " for _ in range(20):\n", + " seed = next(lcg_gen)\n", + " total_sum += max_subarray_sum(n, seed, min_val, max_val)\n", + " return total_sum\n", + "\n", + "# Parameters\n", + "n = 10000 # Number of random numbers\n", + "initial_seed = 42 # Initial seed for the LCG\n", + "min_val = -10 # Minimum value of random numbers\n", + "max_val = 10 # Maximum value of random numbers\n", + "\n", + "# Timing the function\n", + "import time\n", + "start_time = time.time()\n", + "result = total_max_subarray_sum(n, initial_seed, min_val, max_val)\n", + "end_time = time.time()\n", + "\n", + "print(\"Total Maximum Subarray Sum (20 runs):\", result)\n", + "print(\"Execution Time: {:.6f} seconds\".format(end_time - 
start_time))\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dab5e4bc-276c-4555-bd4c-12c699d5e899", + "metadata": {}, + "outputs": [], + "source": [ + "exec(python_hard)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8d24ed5-2c15-4f55-80e7-13a3952b3cb8", + "metadata": {}, + "outputs": [], + "source": [ + "gpt_hard = annotate_with_gpt(python_hard, \"hard\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "80a15259-3d51-47b8-953c-6271fbd4b6fb", + "metadata": {}, + "outputs": [], + "source": [ + "exec(open(gpt_hard).read())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e9305446-1d0c-4b51-866a-b8c1e299bf5c", + "metadata": {}, + "outputs": [], + "source": [ + "gemini_hard = annotate_with_gemini(python_hard, \"hard\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ad6eecc8-0517-43d8-bd21-5bbdedae7a10", + "metadata": {}, + "outputs": [], + "source": [ + "exec(open(gemini_hard).read())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ee75e72-9ecb-4edd-a74a-4d3a83c1eb79", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "claude_hard = annotate_with_claude(python_hard, \"hard\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47af1516-455f-4d1c-8a1c-2da5a38c0ba5", + "metadata": {}, + "outputs": [], + "source": [ + "exec(open(claude_hard).read())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f60d33c-f6b7-4fc5-bc2b-57957b076e34", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "This module implements a Linear Congruential Generator (LCG) and uses it\n", + "to generate random numbers for calculating the maximum subarray sum.\n", + "It includes functions for the LCG, finding the maximum subarray sum, and\n", + "aggregating results over multiple runs.\n", + "\"\"\"\n", + "\n", + "def lcg(seed, a=1664525, c=1013904223, m=2**32):\n", + " \"\"\"\n", + " Implements a Linear Congruential Generator (LCG) to produce a sequence of\n", + " pseudorandom numbers.\n", + "\n", + " The generator uses the formula: X_{n+1} = (a * X_n + c) % m.\n", + "\n", + " Args:\n", + " seed (int): The initial seed value for the generator (X_0).\n", + " a (int, optional): The multiplier. Defaults to 1664525 (common LCG parameter).\n", + " c (int, optional): The increment. Defaults to 1013904223 (common LCG parameter).\n", + " m (int, optional): The modulus. 
Defaults to 2**32, meaning numbers will be\n", + " between 0 and m-1.\n", + "\n", + " Yields:\n", + " int: The next pseudorandom number in the sequence.\n", + " \"\"\"\n", + " value = seed\n", + " while True:\n", + " # Calculate the next pseudorandom number using the LCG formula.\n", + " value = (a * value + c) % m\n", + " yield value\n", + "\n", + "def max_subarray_sum(n, seed, min_val, max_val):\n", + " \"\"\"\n", + " Calculates the maximum possible sum of a contiguous subarray within a list\n", + " of 'n' pseudorandom numbers.\n", + "\n", + " The random numbers are generated using an LCG based on the provided seed,\n", + " and then mapped to the range [min_val, max_val].\n", + " This implementation uses a brute-force approach with O(n^2) complexity.\n", + "\n", + " Args:\n", + " n (int): The number of random integers to generate for the array.\n", + " seed (int): The seed for the LCG to generate the random numbers.\n", + " min_val (int): The minimum possible value for the generated random numbers.\n", + " max_val (int): The maximum possible value for the generated random numbers.\n", + "\n", + " Returns:\n", + " int: The maximum sum found among all contiguous subarrays.\n", + " \"\"\"\n", + " lcg_gen = lcg(seed)\n", + " # Generate a list of 'n' random numbers within the specified range [min_val, max_val].\n", + " random_numbers = [next(lcg_gen) % (max_val - min_val + 1) + min_val for _ in range(n)]\n", + "\n", + " max_sum = float('-inf') # Initialize max_sum to negative infinity to handle all negative numbers.\n", + "\n", + " # Iterate through all possible starting points of a subarray.\n", + " for i in range(n):\n", + " current_sum = 0\n", + " # Iterate through all possible ending points for the current starting point.\n", + " for j in range(i, n):\n", + " current_sum += random_numbers[j]\n", + " # Update max_sum if the current subarray sum is greater.\n", + " if current_sum > max_sum:\n", + " max_sum = current_sum\n", + " return max_sum\n", + "\n", + "def total_max_subarray_sum(n, initial_seed, min_val, max_val):\n", + " \"\"\"\n", + " Calculates the sum of maximum subarray sums over 20 separate runs.\n", + "\n", + " Each run generates a new set of 'n' random numbers for `max_subarray_sum`\n", + " using a new seed derived from the initial LCG sequence.\n", + "\n", + " Args:\n", + " n (int): The number of random integers for each subarray sum calculation.\n", + " initial_seed (int): The initial seed for the LCG that generates seeds\n", + " for individual `max_subarray_sum` runs.\n", + " min_val (int): The minimum possible value for random numbers in each run.\n", + " max_val (int): The maximum possible value for random numbers in each run.\n", + "\n", + " Returns:\n", + " int: The sum of the maximum subarray sums across all 20 runs.\n", + " \"\"\"\n", + " total_sum = 0\n", + " lcg_gen = lcg(initial_seed) # LCG to generate seeds for subsequent runs.\n", + " # Perform 20 independent runs.\n", + " for _ in range(20):\n", + " # Get a new seed for each run from the initial LCG generator.\n", + " seed = next(lcg_gen)\n", + " # Add the maximum subarray sum of the current run to the total sum.\n", + " total_sum += max_subarray_sum(n, seed, min_val, max_val)\n", + " return total_sum\n", + "\n", + "# Parameters for the simulation\n", + "n = 10000 # Number of random numbers to generate for each subarray\n", + "initial_seed = 42 # Initial seed for the LCG that generates seeds for runs\n", + "min_val = -10 # Minimum value for the random numbers\n", + "max_val = 10 # Maximum value for the random 
numbers\n", + "\n", + "# Import the time module to measure execution time.\n", + "import time\n", + "\n", + "# Record the start time before executing the main function.\n", + "start_time = time.time()\n", + "# Call the function to calculate the total maximum subarray sum over multiple runs.\n", + "result = total_max_subarray_sum(n, initial_seed, min_val, max_val)\n", + "# Record the end time after the function completes.\n", + "end_time = time.time()\n", + "\n", + "# Print the final aggregated result.\n", + "print(\"Total Maximum Subarray Sum (20 runs):\", result)\n", + "# Print the total execution time, formatted to 6 decimal places.\n", + "print(\"Execution Time: {:.6f} seconds\".format(end_time - start_time))" + ] + }, + { + "cell_type": "markdown", + "id": "ff02ce09-0544-49a5-944d-a57b25bf9b72", + "metadata": {}, + "source": [ + "# Streaming" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0be9f47d-5213-4700-b0e2-d444c7c738c0", + "metadata": {}, + "outputs": [], + "source": [ + "def stream_gpt(python): \n", + " stream = openai.chat.completions.create(model=OPENAI_MODEL, messages=messages_for(python), stream=True)\n", + " reply = \"\"\n", + " for chunk in stream:\n", + " fragment = chunk.choices[0].delta.content or \"\"\n", + " reply += fragment\n", + " yield reply.replace('```python\\n','').replace('```','')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8669f56b-8314-4582-a167-78842caea131", + "metadata": {}, + "outputs": [], + "source": [ + "def stream_claude(python):\n", + " result = claude.messages.stream(\n", + " model=CLAUDE_MODEL,\n", + " max_tokens=2000,\n", + " system=system_message,\n", + " messages=[{\"role\": \"user\", \"content\": user_prompt_for(python)}],\n", + " )\n", + " reply = \"\"\n", + " with result as stream:\n", + " for text in stream.text_stream:\n", + " reply += text\n", + " yield reply.replace('```python\\n','').replace('```','')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d48d44df-c082-4ed1-b3ea-fc2a880591c2", + "metadata": {}, + "outputs": [], + "source": [ + "def stream_gemini(python):\n", + " stream = gemini.models.generate_content_stream(\n", + " model=GEMINI_MODEL,\n", + " contents=user_prompt_for(python),\n", + " config=types.GenerateContentConfig(\n", + " system_instruction=system_message,\n", + " ),\n", + " )\n", + " reply = \"\"\n", + " for chunk in stream:\n", + " reply += chunk.text\n", + " yield reply.replace('```python\\n','').replace('```','')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f1ae8f5-16c8-40a0-aa18-63b617df078d", + "metadata": {}, + "outputs": [], + "source": [ + "def annotate(python, model):\n", + " if model == \"GPT\":\n", + " result = stream_gpt(python)\n", + " elif model == \"Claude\":\n", + " result = stream_claude(python)\n", + " elif model == \"Gemini\":\n", + " result = stream_gemini(python)\n", + " else:\n", + " raise ValueError(\"Unknown model\")\n", + " for stream_so_far in result:\n", + " yield stream_so_far " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19bf2bff-a822-4009-a539-f003b1651383", + "metadata": {}, + "outputs": [], + "source": [ + "def execute_python(code):\n", + " try:\n", + " output = io.StringIO()\n", + " sys.stdout = output\n", + " exec(code)\n", + " finally:\n", + " sys.stdout = sys.__stdout__\n", + " return output.getvalue()" + ] + }, + { + "cell_type": "markdown", + "id": "8391444b-b938-4f92-982f-91439b38d901", + "metadata": {}, + "source": [ + "# Gradio App" + ] + }, + 
{ + "cell_type": "code", + "execution_count": null, + "id": "9a2274f1-d03b-42c0-8dcc-4ce159b18442", + "metadata": {}, + "outputs": [], + "source": [ + "css = \"\"\"\n", + ".python {background-color: #306998;}\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "76167ea9-d0a1-4bc6-8d73-633d3b8c8df6", + "metadata": {}, + "outputs": [], + "source": [ + "import gradio as gr\n", + "\n", + "# Parameters\n", + "LINES = 25\n", + "LINE_HEIGHT = 20 # px, typical CodeMirror line height\n", + "PADDING = 10 # px, top + bottom padding\n", + "\n", + "CODE_HEIGHT = LINES * LINE_HEIGHT + PADDING\n", + "\n", + "\n", + "with gr.Blocks(\n", + " theme=gr.themes.Soft(),\n", + " css=f\"\"\"\n", + "#code_input .cm-editor, #annotated_code .cm-editor {{\n", + " height: {CODE_HEIGHT}px !important;\n", + " overflow-y: auto !important;\n", + "}}\n", + "\"\"\"\n", + ") as demo_v2:\n", + " gr.Markdown(\"## 🐍 Annotate Python Code with Docstrings and Comments\")\n", + "\n", + " with gr.Row():\n", + " with gr.Column(scale=1):\n", + " gr.Markdown(\"### Python code:\")\n", + " code_input = gr.Code(\n", + " language=\"python\", \n", + " value=python_hard,\n", + " elem_id=\"code_input\"\n", + " )\n", + " \n", + " with gr.Column(scale=1):\n", + " gr.Markdown(\"### Annotated code:\")\n", + " annotated_output = gr.Code(\n", + " language=\"python\",\n", + " elem_id=\"annotated_code\",\n", + " interactive=False\n", + " )\n", + "\n", + " with gr.Row():\n", + " with gr.Column(scale=1):\n", + " model_dropdown = gr.Dropdown(\n", + " choices=[\"Gemini\", \"GPT-4\", \"Claude\"],\n", + " value=\"Gemini\",\n", + " label=\"Select model\"\n", + " )\n", + " with gr.Column(scale=1):\n", + " annotate_btn = gr.Button(\"✨ Annotate code\", variant=\"primary\")\n", + " run_btn = gr.Button(\"▶️ Run Python\", variant=\"secondary\")\n", + "\n", + " with gr.Row():\n", + " with gr.Column():\n", + " gr.Markdown(\"### Python result:\")\n", + " result_output = gr.Textbox(\n", + " lines=5, \n", + " label=\"Output\",\n", + " interactive=False\n", + " )\n", + " \n", + " annotate_btn.click(\n", + " annotate,\n", + " inputs=[code_input, model_dropdown],\n", + " outputs=[annotated_output]\n", + " )\n", + " run_btn.click(execute_python, inputs=[annotated_output], outputs=[result_output])\n", + "\n", + " \n", + "demo_v2.launch(inbrowser=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea42883b-fdba-46ed-97be-f42e3cb41f11", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From c7bda8a44230f7aa053bd1b45516836c1d176bcb Mon Sep 17 00:00:00 2001 From: Krabulek Date: Fri, 19 Sep 2025 19:44:26 +0200 Subject: [PATCH 3/3] removed the python code documentation assistant - it is added in another branch: --- .../Python_code_documentation_assistant.ipynb | 828 ------------------ 1 file changed, 828 deletions(-) delete mode 100644 week4/community-contributions/Python_code_documentation_assistant.ipynb diff --git a/week4/community-contributions/Python_code_documentation_assistant.ipynb b/week4/community-contributions/Python_code_documentation_assistant.ipynb deleted file mode 100644 
index aebc0e3..0000000 --- a/week4/community-contributions/Python_code_documentation_assistant.ipynb +++ /dev/null @@ -1,828 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "4a6ab9a2-28a2-445d-8512-a0dc8d1b54e9", - "metadata": {}, - "source": [ - "# Python Code Documentation Assistant\n", - "\n", - "The requirement: use a Frontier model to add docstrings and comments to your Python code\n" - ] - }, - { - "cell_type": "markdown", - "id": "d4634170-c444-4326-9e68-5f87c63fa0e0", - "metadata": {}, - "source": [ - "## Imports" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1f72dfaf-9f20-4d81-b082-018eda152c9f", - "metadata": {}, - "outputs": [], - "source": [ - "!pip install -U -q \"google-genai\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e610bf56-a46e-4aff-8de1-ab49d62b1ad3", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import io\n", - "import sys\n", - "from dotenv import load_dotenv\n", - "from openai import OpenAI\n", - "from google import genai\n", - "from google.genai import types\n", - "import anthropic\n", - "from IPython.display import Markdown, display, update_display\n", - "import gradio as gr\n", - "import subprocess" - ] - }, - { - "cell_type": "markdown", - "id": "f91e8b32-4c98-4210-a1e1-bfe0b1fddab7", - "metadata": {}, - "source": [ - "## Environment" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4f672e1c-87e9-4865-b760-370fa605e614", - "metadata": {}, - "outputs": [], - "source": [ - "load_dotenv(override=True)\n", - "openai_api_key = os.getenv('OPENAI_API_KEY')\n", - "anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')\n", - "google_api_key = os.getenv('GOOGLE_API_KEY')\n", - "\n", - "if openai_api_key:\n", - " print(f\"OpenAI API Key exists and begins with: {openai_api_key[:8]}\")\n", - "else:\n", - " print(\"OpenAI API Key not set\")\n", - " \n", - "if anthropic_api_key:\n", - " print(f\"Anthropic API Key exists and begins with: {anthropic_api_key[:7]}\")\n", - "else:\n", - " print(\"Anthropic API Key not set\")\n", - "\n", - "if google_api_key:\n", - " print(f\"Google API Key exists and begins with: {google_api_key[:4]}\")\n", - "else:\n", - " print(\"Google API Key not set\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8aa149ed-9298-4d69-8fe2-8f5de0f667da", - "metadata": {}, - "outputs": [], - "source": [ - "openai = OpenAI()\n", - "claude = anthropic.Anthropic()\n", - "gemini = genai.Client()\n", - "\n", - "OPENAI_MODEL = \"o4-mini\"\n", - "CLAUDE_MODEL = \"claude-3-7-sonnet-latest\"\n", - "GEMINI_MODEL = \"gemini-2.5-flash\"" - ] - }, - { - "cell_type": "markdown", - "id": "88a18c58-40d5-4592-8dd3-d7c7b0d951aa", - "metadata": {}, - "source": [ - "## Prompts" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6896636f-923e-4a2c-9d6c-fac07828a201", - "metadata": {}, - "outputs": [], - "source": [ - "system_message = \"\"\"\n", - "You are an assistant that documents Python code. \n", - "Your task: \n", - "- Add concise, clear, and informative docstrings to functions, classes, and modules. \n", - "- Add inline comments only where they improve readability or clarify intent. \n", - "- Do not modify the code logic or structure. \n", - "- Respond with Python code only. 
\n", - "\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8e7b3546-57aa-4c29-bc5d-f211970d04eb", - "metadata": {}, - "outputs": [], - "source": [ - "def user_prompt_for(python):\n", - " user_prompt = \"Add docstrings and comments to the following Python code:\\n\"\n", - " user_prompt += python\n", - " return user_prompt" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c6190659-f54c-4951-bef4-4960f8e51cc4", - "metadata": {}, - "outputs": [], - "source": [ - "def messages_for(python):\n", - " return [\n", - " {\"role\": \"system\", \"content\": system_message},\n", - " {\"role\": \"user\", \"content\": user_prompt_for(python)}\n", - " ]" - ] - }, - { - "cell_type": "markdown", - "id": "624e5066-bcf6-490d-a790-608d2bb34184", - "metadata": {}, - "source": [ - "## Helper functions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "71e1ba8c-5b05-4726-a9f3-8d8c6257350b", - "metadata": {}, - "outputs": [], - "source": [ - "def write_output(python, filename_suffix):\n", - " filename = f\"annotated_{filename_suffix}.py\"\n", - " code = python.replace(\"```python\",\"\").replace(\"```\",\"\")\n", - " with open(filename, \"w\") as f:\n", - " f.write(code)\n", - " print(f\"\\nWritten code to {filename}\")\n", - " return filename" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e7d2fea8-74c6-4421-8f1e-0e76d5b201b9", - "metadata": {}, - "outputs": [], - "source": [ - "def annotate_with_gpt(python, task_name): \n", - " stream = openai.chat.completions.create(model=OPENAI_MODEL, messages=messages_for(python), stream=True)\n", - " reply = \"\"\n", - " for chunk in stream:\n", - " fragment = chunk.choices[0].delta.content or \"\"\n", - " reply += fragment\n", - " print(fragment, end='', flush=True)\n", - " return write_output(reply, f\"{task_name}_gpt\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7cd84ad8-d55c-4fe0-9eeb-1895c95c4a9d", - "metadata": {}, - "outputs": [], - "source": [ - "def annotate_with_claude(python, task_name):\n", - " result = claude.messages.stream(\n", - " model=CLAUDE_MODEL,\n", - " max_tokens=2000,\n", - " system=system_message,\n", - " messages=[{\"role\": \"user\", \"content\": user_prompt_for(python)}],\n", - " )\n", - " reply = \"\"\n", - " with result as stream:\n", - " for text in stream.text_stream:\n", - " reply += text\n", - " print(text, end=\"\", flush=True)\n", - " return write_output(reply, f\"{task_name}_claude\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e8a35102-1c95-469b-8855-e85f4c9bdbdf", - "metadata": {}, - "outputs": [], - "source": [ - "def annotate_with_gemini(python, task_name):\n", - " reply = gemini.models.generate_content(\n", - " model=GEMINI_MODEL,\n", - " contents=user_prompt_for(python),\n", - " config=types.GenerateContentConfig(\n", - " system_instruction=system_message,\n", - " )\n", - " )\n", - "\n", - " print(reply.text)\n", - " return write_output(reply.text, f\"{task_name}_gemini\")" - ] - }, - { - "cell_type": "markdown", - "id": "028dcfdd-2d52-4e11-a79e-2214a97cb26d", - "metadata": {}, - "source": [ - "# Run the Annotator" - ] - }, - { - "cell_type": "markdown", - "id": "7462d9f9-6215-4fb0-9471-1d0141d33205", - "metadata": {}, - "source": [ - "## Pi example" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a1cbb778-fa57-43de-b04b-ed523f396c38", - "metadata": {}, - "outputs": [], - "source": [ - "pi = \"\"\"\n", - "import time\n", - "\n", - "def 
calculate(iterations, param1, param2):\n", - " result = 1.0\n", - " for i in range(1, iterations+1):\n", - " j = i * param1 - param2\n", - " result -= (1/j)\n", - " j = i * param1 + param2\n", - " result += (1/j)\n", - " return result\n", - "\n", - "start_time = time.time()\n", - "result = calculate(100_000_000, 4, 1) * 4\n", - "end_time = time.time()\n", - "\n", - "print(f\"Result: {result:.12f}\")\n", - "print(f\"Execution Time: {(end_time - start_time):.6f} seconds\")\n", - "\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "105db6f9-343c-491d-8e44-3a5328b81719", - "metadata": {}, - "outputs": [], - "source": [ - "gpt_pi = annotate_with_gpt(pi, \"pi\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "415819d0-fc95-4f78-a6ae-5c7d6781c6a7", - "metadata": {}, - "outputs": [], - "source": [ - "# check if the script works\n", - "\n", - "exec(open(gpt_pi).read())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "983a11fe-e24d-4c65-8269-9802c5ef3ae6", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "claude_pi = annotate_with_claude(pi, \"pi\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "52f5b710-0dea-4884-8ed7-a94059d88281", - "metadata": {}, - "outputs": [], - "source": [ - "exec(open(claude_pi).read())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "01f331f2-caac-48f6-9a03-8a228ee521bc", - "metadata": {}, - "outputs": [], - "source": [ - "gemini_pi = annotate_with_gemini(pi, \"pi\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "23529942-53fa-46ad-a5db-1f3096dd6607", - "metadata": {}, - "outputs": [], - "source": [ - "exec(open(gemini_pi).read())" - ] - }, - { - "cell_type": "markdown", - "id": "7d1eaeca-61be-4d0a-a525-dd09f52aaa0f", - "metadata": {}, - "source": [ - "## Hard example" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c3b497b3-f569-420e-b92e-fb0f49957ce0", - "metadata": {}, - "outputs": [], - "source": [ - "python_hard = \"\"\"# Be careful to support large number sizes\n", - "\n", - "def lcg(seed, a=1664525, c=1013904223, m=2**32):\n", - " value = seed\n", - " while True:\n", - " value = (a * value + c) % m\n", - " yield value\n", - " \n", - "def max_subarray_sum(n, seed, min_val, max_val):\n", - " lcg_gen = lcg(seed)\n", - " random_numbers = [next(lcg_gen) % (max_val - min_val + 1) + min_val for _ in range(n)]\n", - " max_sum = float('-inf')\n", - " for i in range(n):\n", - " current_sum = 0\n", - " for j in range(i, n):\n", - " current_sum += random_numbers[j]\n", - " if current_sum > max_sum:\n", - " max_sum = current_sum\n", - " return max_sum\n", - "\n", - "def total_max_subarray_sum(n, initial_seed, min_val, max_val):\n", - " total_sum = 0\n", - " lcg_gen = lcg(initial_seed)\n", - " for _ in range(20):\n", - " seed = next(lcg_gen)\n", - " total_sum += max_subarray_sum(n, seed, min_val, max_val)\n", - " return total_sum\n", - "\n", - "# Parameters\n", - "n = 10000 # Number of random numbers\n", - "initial_seed = 42 # Initial seed for the LCG\n", - "min_val = -10 # Minimum value of random numbers\n", - "max_val = 10 # Maximum value of random numbers\n", - "\n", - "# Timing the function\n", - "import time\n", - "start_time = time.time()\n", - "result = total_max_subarray_sum(n, initial_seed, min_val, max_val)\n", - "end_time = time.time()\n", - "\n", - "print(\"Total Maximum Subarray Sum (20 runs):\", result)\n", - "print(\"Execution Time: {:.6f} seconds\".format(end_time - 
start_time))\n", - "\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dab5e4bc-276c-4555-bd4c-12c699d5e899", - "metadata": {}, - "outputs": [], - "source": [ - "exec(python_hard)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e8d24ed5-2c15-4f55-80e7-13a3952b3cb8", - "metadata": {}, - "outputs": [], - "source": [ - "gpt_hard = annotate_with_gpt(python_hard, \"hard\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "80a15259-3d51-47b8-953c-6271fbd4b6fb", - "metadata": {}, - "outputs": [], - "source": [ - "exec(open(gpt_hard).read())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e9305446-1d0c-4b51-866a-b8c1e299bf5c", - "metadata": {}, - "outputs": [], - "source": [ - "gemini_hard = annotate_with_gemini(python_hard, \"hard\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ad6eecc8-0517-43d8-bd21-5bbdedae7a10", - "metadata": {}, - "outputs": [], - "source": [ - "exec(open(gemini_hard).read())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2ee75e72-9ecb-4edd-a74a-4d3a83c1eb79", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "claude_hard = annotate_with_claude(python_hard, \"hard\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "47af1516-455f-4d1c-8a1c-2da5a38c0ba5", - "metadata": {}, - "outputs": [], - "source": [ - "exec(open(claude_hard).read())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7f60d33c-f6b7-4fc5-bc2b-57957b076e34", - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"\n", - "This module implements a Linear Congruential Generator (LCG) and uses it\n", - "to generate random numbers for calculating the maximum subarray sum.\n", - "It includes functions for the LCG, finding the maximum subarray sum, and\n", - "aggregating results over multiple runs.\n", - "\"\"\"\n", - "\n", - "def lcg(seed, a=1664525, c=1013904223, m=2**32):\n", - " \"\"\"\n", - " Implements a Linear Congruential Generator (LCG) to produce a sequence of\n", - " pseudorandom numbers.\n", - "\n", - " The generator uses the formula: X_{n+1} = (a * X_n + c) % m.\n", - "\n", - " Args:\n", - " seed (int): The initial seed value for the generator (X_0).\n", - " a (int, optional): The multiplier. Defaults to 1664525 (common LCG parameter).\n", - " c (int, optional): The increment. Defaults to 1013904223 (common LCG parameter).\n", - " m (int, optional): The modulus. 
Defaults to 2**32, meaning numbers will be\n", - " between 0 and m-1.\n", - "\n", - " Yields:\n", - " int: The next pseudorandom number in the sequence.\n", - " \"\"\"\n", - " value = seed\n", - " while True:\n", - " # Calculate the next pseudorandom number using the LCG formula.\n", - " value = (a * value + c) % m\n", - " yield value\n", - "\n", - "def max_subarray_sum(n, seed, min_val, max_val):\n", - " \"\"\"\n", - " Calculates the maximum possible sum of a contiguous subarray within a list\n", - " of 'n' pseudorandom numbers.\n", - "\n", - " The random numbers are generated using an LCG based on the provided seed,\n", - " and then mapped to the range [min_val, max_val].\n", - " This implementation uses a brute-force approach with O(n^2) complexity.\n", - "\n", - " Args:\n", - " n (int): The number of random integers to generate for the array.\n", - " seed (int): The seed for the LCG to generate the random numbers.\n", - " min_val (int): The minimum possible value for the generated random numbers.\n", - " max_val (int): The maximum possible value for the generated random numbers.\n", - "\n", - " Returns:\n", - " int: The maximum sum found among all contiguous subarrays.\n", - " \"\"\"\n", - " lcg_gen = lcg(seed)\n", - " # Generate a list of 'n' random numbers within the specified range [min_val, max_val].\n", - " random_numbers = [next(lcg_gen) % (max_val - min_val + 1) + min_val for _ in range(n)]\n", - "\n", - " max_sum = float('-inf') # Initialize max_sum to negative infinity to handle all negative numbers.\n", - "\n", - " # Iterate through all possible starting points of a subarray.\n", - " for i in range(n):\n", - " current_sum = 0\n", - " # Iterate through all possible ending points for the current starting point.\n", - " for j in range(i, n):\n", - " current_sum += random_numbers[j]\n", - " # Update max_sum if the current subarray sum is greater.\n", - " if current_sum > max_sum:\n", - " max_sum = current_sum\n", - " return max_sum\n", - "\n", - "def total_max_subarray_sum(n, initial_seed, min_val, max_val):\n", - " \"\"\"\n", - " Calculates the sum of maximum subarray sums over 20 separate runs.\n", - "\n", - " Each run generates a new set of 'n' random numbers for `max_subarray_sum`\n", - " using a new seed derived from the initial LCG sequence.\n", - "\n", - " Args:\n", - " n (int): The number of random integers for each subarray sum calculation.\n", - " initial_seed (int): The initial seed for the LCG that generates seeds\n", - " for individual `max_subarray_sum` runs.\n", - " min_val (int): The minimum possible value for random numbers in each run.\n", - " max_val (int): The maximum possible value for random numbers in each run.\n", - "\n", - " Returns:\n", - " int: The sum of the maximum subarray sums across all 20 runs.\n", - " \"\"\"\n", - " total_sum = 0\n", - " lcg_gen = lcg(initial_seed) # LCG to generate seeds for subsequent runs.\n", - " # Perform 20 independent runs.\n", - " for _ in range(20):\n", - " # Get a new seed for each run from the initial LCG generator.\n", - " seed = next(lcg_gen)\n", - " # Add the maximum subarray sum of the current run to the total sum.\n", - " total_sum += max_subarray_sum(n, seed, min_val, max_val)\n", - " return total_sum\n", - "\n", - "# Parameters for the simulation\n", - "n = 10000 # Number of random numbers to generate for each subarray\n", - "initial_seed = 42 # Initial seed for the LCG that generates seeds for runs\n", - "min_val = -10 # Minimum value for the random numbers\n", - "max_val = 10 # Maximum value for the random 
numbers\n", - "\n", - "# Import the time module to measure execution time.\n", - "import time\n", - "\n", - "# Record the start time before executing the main function.\n", - "start_time = time.time()\n", - "# Call the function to calculate the total maximum subarray sum over multiple runs.\n", - "result = total_max_subarray_sum(n, initial_seed, min_val, max_val)\n", - "# Record the end time after the function completes.\n", - "end_time = time.time()\n", - "\n", - "# Print the final aggregated result.\n", - "print(\"Total Maximum Subarray Sum (20 runs):\", result)\n", - "# Print the total execution time, formatted to 6 decimal places.\n", - "print(\"Execution Time: {:.6f} seconds\".format(end_time - start_time))" - ] - }, - { - "cell_type": "markdown", - "id": "ff02ce09-0544-49a5-944d-a57b25bf9b72", - "metadata": {}, - "source": [ - "# Streaming" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0be9f47d-5213-4700-b0e2-d444c7c738c0", - "metadata": {}, - "outputs": [], - "source": [ - "def stream_gpt(python): \n", - " stream = openai.chat.completions.create(model=OPENAI_MODEL, messages=messages_for(python), stream=True)\n", - " reply = \"\"\n", - " for chunk in stream:\n", - " fragment = chunk.choices[0].delta.content or \"\"\n", - " reply += fragment\n", - " yield reply.replace('```python\\n','').replace('```','')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8669f56b-8314-4582-a167-78842caea131", - "metadata": {}, - "outputs": [], - "source": [ - "def stream_claude(python):\n", - " result = claude.messages.stream(\n", - " model=CLAUDE_MODEL,\n", - " max_tokens=2000,\n", - " system=system_message,\n", - " messages=[{\"role\": \"user\", \"content\": user_prompt_for(python)}],\n", - " )\n", - " reply = \"\"\n", - " with result as stream:\n", - " for text in stream.text_stream:\n", - " reply += text\n", - " yield reply.replace('```python\\n','').replace('```','')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d48d44df-c082-4ed1-b3ea-fc2a880591c2", - "metadata": {}, - "outputs": [], - "source": [ - "def stream_gemini(python):\n", - " stream = gemini.models.generate_content_stream(\n", - " model=GEMINI_MODEL,\n", - " contents=user_prompt_for(python),\n", - " config=types.GenerateContentConfig(\n", - " system_instruction=system_message,\n", - " ),\n", - " )\n", - " reply = \"\"\n", - " for chunk in stream:\n", - " reply += chunk.text\n", - " yield reply.replace('```python\\n','').replace('```','')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2f1ae8f5-16c8-40a0-aa18-63b617df078d", - "metadata": {}, - "outputs": [], - "source": [ - "def annotate(python, model):\n", - " if model == \"GPT\":\n", - " result = stream_gpt(python)\n", - " elif model == \"Claude\":\n", - " result = stream_claude(python)\n", - " elif model == \"Gemini\":\n", - " result = stream_gemini(python)\n", - " else:\n", - " raise ValueError(\"Unknown model\")\n", - " for stream_so_far in result:\n", - " yield stream_so_far " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "19bf2bff-a822-4009-a539-f003b1651383", - "metadata": {}, - "outputs": [], - "source": [ - "def execute_python(code):\n", - " try:\n", - " output = io.StringIO()\n", - " sys.stdout = output\n", - " exec(code)\n", - " finally:\n", - " sys.stdout = sys.__stdout__\n", - " return output.getvalue()" - ] - }, - { - "cell_type": "markdown", - "id": "8391444b-b938-4f92-982f-91439b38d901", - "metadata": {}, - "source": [ - "# Gradio App" - ] - }, - 
-  {
-   "cell_type": "markdown",
-   "id": "8391444b-b938-4f92-982f-91439b38d901",
-   "metadata": {},
-   "source": [
-    "# Gradio App"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9a2274f1-d03b-42c0-8dcc-4ce159b18442",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Note: this css variable is not used by the Blocks app below,\n",
-    "# which passes its own inline CSS for the code editors.\n",
-    "css = \"\"\"\n",
-    ".python {background-color: #306998;}\n",
-    "\"\"\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "76167ea9-d0a1-4bc6-8d73-633d3b8c8df6",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import gradio as gr\n",
-    "\n",
-    "# Sizing parameters for the code editors\n",
-    "LINES = 25\n",
-    "LINE_HEIGHT = 20 # px, typical CodeMirror line height\n",
-    "PADDING = 10 # px, top + bottom padding\n",
-    "\n",
-    "CODE_HEIGHT = LINES * LINE_HEIGHT + PADDING\n",
-    "\n",
-    "\n",
-    "with gr.Blocks(\n",
-    "    theme=gr.themes.Soft(),\n",
-    "    css=f\"\"\"\n",
-    "#code_input .cm-editor, #annotated_code .cm-editor {{\n",
-    "    height: {CODE_HEIGHT}px !important;\n",
-    "    overflow-y: auto !important;\n",
-    "}}\n",
-    "\"\"\"\n",
-    ") as demo_v2:\n",
-    "    gr.Markdown(\"## 🐍 Annotate Python Code with Docstrings and Comments\")\n",
-    "\n",
-    "    with gr.Row():\n",
-    "        with gr.Column(scale=1):\n",
-    "            gr.Markdown(\"### Python code:\")\n",
-    "            code_input = gr.Code(\n",
-    "                language=\"python\",\n",
-    "                value=python_hard,\n",
-    "                elem_id=\"code_input\"\n",
-    "            )\n",
-    "\n",
-    "        with gr.Column(scale=1):\n",
-    "            gr.Markdown(\"### Annotated code:\")\n",
-    "            annotated_output = gr.Code(\n",
-    "                language=\"python\",\n",
-    "                elem_id=\"annotated_code\",\n",
-    "                interactive=False\n",
-    "            )\n",
-    "\n",
-    "    with gr.Row():\n",
-    "        with gr.Column(scale=1):\n",
-    "            # Choices must match the names expected by annotate().\n",
-    "            model_dropdown = gr.Dropdown(\n",
-    "                choices=[\"Gemini\", \"GPT\", \"Claude\"],\n",
-    "                value=\"Gemini\",\n",
-    "                label=\"Select model\"\n",
-    "            )\n",
-    "        with gr.Column(scale=1):\n",
-    "            annotate_btn = gr.Button(\"✨ Annotate code\", variant=\"primary\")\n",
-    "            run_btn = gr.Button(\"▶️ Run Python\", variant=\"secondary\")\n",
-    "\n",
-    "    with gr.Row():\n",
-    "        with gr.Column():\n",
-    "            gr.Markdown(\"### Python result:\")\n",
-    "            result_output = gr.Textbox(\n",
-    "                lines=5,\n",
-    "                label=\"Output\",\n",
-    "                interactive=False\n",
-    "            )\n",
-    "\n",
-    "    annotate_btn.click(\n",
-    "        annotate,\n",
-    "        inputs=[code_input, model_dropdown],\n",
-    "        outputs=[annotated_output]\n",
-    "    )\n",
-    "    run_btn.click(execute_python, inputs=[annotated_output], outputs=[result_output])\n",
-    "\n",
-    "\n",
-    "demo_v2.launch(inbrowser=True)\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ea42883b-fdba-46ed-97be-f42e3cb41f11",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.13"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}