Add the community contributions for Weeks 3/4/5
@@ -0,0 +1,551 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "GD5Omr5EfWgb"
},
"source": [
"# Dataset Generator\n",
"\n",
"Generate synthetic data given a schema, a business problem description, a model, the number of records, a file name, a file type, and an environment.\n",
"\n",
"# Available models\n",
" Model API:\n",
"\n",
" 1. gpt-4o-mini\n",
" 2. claude-3-haiku-20240307\n",
" 3. gemini-2.0-flash\n",
" 4. deepseek-chat\n",
"\n",
" HuggingFace API:\n",
"\n",
" 5. meta-llama/Meta-Llama-3.1-8B-Instruct\n",
"\n",
"\n",
"# Available environments\n",
"\n",
"Colab: set up the HF token and API keys in the Colab Secrets section\n",
"\n",
"Local: set up the HF token and API keys in a .env file\n",
"\n",
"\n",
"\n",
"### *** This project is based on the idea of 'week3/community-contributions/Week3-Dataset_Generator-DP'. Really appreciate it! The project has been improved to run both on Colab and locally, and to integrate the HuggingFace API"
]
},
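{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# For local runs, the .env file is expected to look like this (a sketch; fill in your own keys):\n",
"#\n",
"# OPENAI_API_KEY=...\n",
"# ANTHROPIC_API_KEY=...\n",
"# GOOGLE_API_KEY=...\n",
"# DEEPSEEK_API_KEY=...\n",
"# HF_TOKEN=..."
]
},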
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "4FiCnE0MmU56"
},
"outputs": [],
"source": [
"!pip install -q --upgrade torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1+cu124 --index-url https://download.pytorch.org/whl/cu124\n",
"!pip install -q requests bitsandbytes==0.46.0 transformers==4.48.3 accelerate==1.3.0\n",
"!pip install anthropic python-dotenv pyarrow openai gradio"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "JeyKw5guoH3r"
},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import requests\n",
"from IPython.display import Markdown, display, update_display\n",
"from openai import OpenAI\n",
"from huggingface_hub import login\n",
"from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n",
"from bs4 import BeautifulSoup\n",
"from typing import List\n",
"import google.generativeai\n",
"import anthropic\n",
"from itertools import chain\n",
"from dotenv import load_dotenv\n",
"import gradio as gr\n",
"import json\n",
"import pandas as pd\n",
"import random\n",
"import re\n",
"import subprocess\n",
"import pyarrow as pa\n",
"import torch\n",
"import gc"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "7UyjFdRZoIAS"
},
"outputs": [],
"source": [
"# --- Schema Definition ---\n",
"SCHEMA = [\n",
"    (\"Name\", \"TEXT\", '\"Northern Cafe\"'),\n",
"    (\"Location\", \"TEXT\", '\"2904 S Figueroa St, Los Angeles, CA 90007\"'),\n",
"    (\"Type\", \"TEXT\", 'One of [\"Chinese\",\"Mexican\",\"French\",\"Korean\",\"Italian\"] or other potential types'),\n",
"    (\"Average Price\", \"TEXT\", '\"$30\", or \"--\" if unknown'),\n",
"    (\"History/Age\", \"INT\", 'integer age of the restaurant, e.g., 7'),\n",
"    (\"Menu\", \"Array\", '[\"Beef Noodle\", \"Fried Rice\", \"Dumpling\", ...]'),\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "jXcTQATLoICV"
},
"outputs": [],
"source": [
"# Default schema text for the textbox\n",
"DEFAULT_SCHEMA_TEXT = \"\\n\".join([f\"{i+1}. {col[0]} ({col[1]}) Example: {col[2]}\" for i, col in enumerate(SCHEMA)])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "4Irf5JV3oIEe"
},
"outputs": [],
"source": [
"# Available models\n",
"MODELS = [\n",
"    \"gpt-4o-mini\",\n",
"    \"claude-3-haiku-20240307\",\n",
"    \"gemini-2.0-flash\",\n",
"    \"deepseek-chat\",\n",
"    \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "JJ6r2SH9oIGf"
},
"outputs": [],
"source": [
"# Available file formats\n",
"FILE_FORMATS = [\".csv\", \".tsv\", \".jsonl\", \".parquet\", \".arrow\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "B98j45E3vq5g"
},
"outputs": [],
"source": [
"system_prompt = \"\"\"You are a helpful assistant whose main purpose is to generate datasets for a given business problem based on a given schema.\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "lsX16cWfwf6x"
},
"outputs": [],
"source": [
"def get_env_info(env):\n",
"    try:\n",
"        global hf_token, openai_api_key, anthropic_api_key, google_api_key, deepseek_api_key\n",
"        if env == \"Colab\":\n",
"            # Colab environment\n",
"            from google.colab import drive\n",
"            from google.colab import userdata\n",
"            hf_token = userdata.get('HF_TOKEN')\n",
"            openai_api_key = userdata.get('OPENAI_API_KEY')\n",
"            anthropic_api_key = userdata.get('ANTHROPIC_API_KEY')\n",
"            google_api_key = userdata.get('GOOGLE_API_KEY')\n",
"            deepseek_api_key = userdata.get('DEEPSEEK_API_KEY')\n",
"        elif env == \"Local\":\n",
"            # Local environment\n",
"            load_dotenv(override=True)\n",
"            hf_token = os.getenv('HF_TOKEN')\n",
"            openai_api_key = os.getenv('OPENAI_API_KEY')\n",
"            anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')\n",
"            google_api_key = os.getenv('GOOGLE_API_KEY')\n",
"            deepseek_api_key = os.getenv('DEEPSEEK_API_KEY')\n",
"    except Exception as e:\n",
"        raise Exception(f\"Please check your environment: {str(e)}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "2gLUFAwGv29Q"
},
"outputs": [],
"source": [
"def get_prompt(schema_text, business_problem, nr_records):\n",
"    prompt = f\"\"\"\n",
"    The problem is: {business_problem}\n",
"\n",
"    Generate {nr_records} rows of data in JSONL format, each line a JSON object with the following fields:\n",
"\n",
"    {schema_text}\n",
"\n",
"    Do NOT repeat column values from one row to another.\n",
"\n",
"    Only output valid JSONL.\n",
"    \"\"\"\n",
"    return prompt.strip()"
]
},
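{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Quick sanity check (a sketch; no API call): preview the prompt that will be sent to the model\n",
"print(get_prompt(DEFAULT_SCHEMA_TEXT, \"I want to generate restaurant records\", 12))"
]
},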
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "YZe1FVH8wf84"
},
"outputs": [],
"source": [
"# --- LLM Interface ---\n",
"def query(user_prompt, model):\n",
"    try:\n",
"        if \"gpt\" in model.lower():\n",
"            client = OpenAI(api_key=openai_api_key)\n",
"            messages = [\n",
"                {\"role\": \"system\", \"content\": system_prompt},\n",
"                {\"role\": \"user\", \"content\": user_prompt}\n",
"            ]\n",
"            response = client.chat.completions.create(\n",
"                model=model,\n",
"                messages=messages,\n",
"                temperature=0.7\n",
"            )\n",
"            content = response.choices[0].message.content\n",
"\n",
"        elif \"claude\" in model.lower():\n",
"            client = anthropic.Anthropic(api_key=anthropic_api_key)\n",
"            response = client.messages.create(\n",
"                model=model,\n",
"                messages=[{\"role\": \"user\", \"content\": user_prompt}],\n",
"                max_tokens=4000,\n",
"                temperature=0.7,\n",
"                system=system_prompt\n",
"            )\n",
"            content = response.content[0].text\n",
"        elif \"gemini\" in model.lower():\n",
"            client = OpenAI(\n",
"                api_key=google_api_key,\n",
"                base_url=\"https://generativelanguage.googleapis.com/v1beta/openai/\"\n",
"            )\n",
"            messages = [\n",
"                {\"role\": \"system\", \"content\": system_prompt},\n",
"                {\"role\": \"user\", \"content\": user_prompt}\n",
"            ]\n",
"            response = client.chat.completions.create(\n",
"                model=model,\n",
"                messages=messages,\n",
"                temperature=0.7\n",
"            )\n",
"            content = response.choices[0].message.content\n",
"\n",
"        elif \"deepseek\" in model.lower():\n",
"            client = OpenAI(\n",
"                api_key=deepseek_api_key,\n",
"                base_url=\"https://api.deepseek.com\"\n",
"            )\n",
"            messages = [\n",
"                {\"role\": \"system\", \"content\": system_prompt},\n",
"                {\"role\": \"user\", \"content\": user_prompt}\n",
"            ]\n",
"            response = client.chat.completions.create(\n",
"                model=model,\n",
"                messages=messages,\n",
"                temperature=0.7\n",
"            )\n",
"            content = response.choices[0].message.content\n",
"\n",
"        elif \"llama\" in model.lower():\n",
"            global tokenizer, inputs, llama_model, outputs\n",
"            messages = [\n",
"                {\"role\": \"system\", \"content\": system_prompt},\n",
"                {\"role\": \"user\", \"content\": user_prompt}\n",
"            ]\n",
"\n",
"            login(hf_token, add_to_git_credential=True)\n",
"            quant_config = BitsAndBytesConfig(\n",
"                load_in_4bit=True,\n",
"                bnb_4bit_use_double_quant=True,\n",
"                bnb_4bit_compute_dtype=torch.bfloat16,\n",
"                bnb_4bit_quant_type=\"nf4\"\n",
"            )\n",
"\n",
"            tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)\n",
"            tokenizer.pad_token = tokenizer.eos_token\n",
"            inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(\"cuda\")\n",
"            if llama_model is None:\n",
"                llama_model = AutoModelForCausalLM.from_pretrained(model, device_map=\"auto\", quantization_config=quant_config)\n",
"            outputs = llama_model.generate(inputs, max_new_tokens=4000)\n",
"\n",
"            _, _, after = tokenizer.decode(outputs[0]).partition(\"assistant<|end_header_id|>\")\n",
"            content = after.strip()\n",
"        else:\n",
"            raise ValueError(f\"Unsupported model. Use one of {MODELS}\")\n",
"\n",
"        # Parse JSONL output\n",
"        lines = [line.strip() for line in content.strip().splitlines() if line.strip().startswith(\"{\")]\n",
"        return [json.loads(line) for line in lines]\n",
"\n",
"    except Exception as e:\n",
"        raise Exception(f\"Model query failed: {str(e)}\")"
]
},
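{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Offline sanity check (a sketch; no API call): the same JSONL parsing used inside query()\n",
"sample = 'Here you go:\\n{\"Name\": \"Northern Cafe\"}\\n{\"Name\": \"Taco Town\"}'\n",
"lines = [line.strip() for line in sample.strip().splitlines() if line.strip().startswith(\"{\")]\n",
"print([json.loads(line) for line in lines])"
]
},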
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "4WUj-XqM5IYT"
},
"outputs": [],
"source": [
"# --- Output Formatter ---\n",
"def save_dataset(records, file_format, filename):\n",
"    df = pd.DataFrame(records)\n",
"    if file_format == \".csv\":\n",
"        df.to_csv(filename, index=False)\n",
"    elif file_format == \".tsv\":\n",
"        df.to_csv(filename, sep=\"\\t\", index=False)\n",
"    elif file_format == \".jsonl\":\n",
"        with open(filename, \"w\") as f:\n",
"            for record in records:\n",
"                f.write(json.dumps(record) + \"\\n\")\n",
"    elif file_format == \".parquet\":\n",
"        df.to_parquet(filename, engine=\"pyarrow\", index=False)\n",
"    elif file_format == \".arrow\":\n",
"        table = pa.Table.from_pandas(df)\n",
"        with pa.OSFile(filename, \"wb\") as sink:\n",
"            with pa.ipc.new_file(sink, table.schema) as writer:\n",
"                writer.write(table)\n",
"    else:\n",
"        raise ValueError(\"Unsupported file format\")"
]
},
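{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Round-trip sanity check (a sketch): save two dummy records and read them back\n",
"demo_records = [{\"Name\": \"Northern Cafe\", \"History/Age\": 7}, {\"Name\": \"Taco Town\", \"History/Age\": 3}]\n",
"save_dataset(demo_records, \".jsonl\", \"demo.jsonl\")\n",
"print(pd.read_json(\"demo.jsonl\", lines=True))"
]
},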
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "WenbNqrpwf-_"
},
"outputs": [],
"source": [
"# --- Main Generation Function ---\n",
"def generate_dataset(schema_text, business_problem, model, nr_records, file_format, save_as, env):\n",
"    try:\n",
"        # Validation\n",
"        if nr_records <= 10:\n",
"            return \"❌ Error: Number of records must be greater than 10.\", None\n",
"        if nr_records > 1000:\n",
"            return \"❌ Error: Number of records must be less than or equal to 1000.\", None\n",
"\n",
"        if file_format not in FILE_FORMATS:\n",
"            return \"❌ Error: Invalid file format.\", None\n",
"\n",
"        if not save_as or save_as.strip() == \"\":\n",
"            save_as = f\"default{file_format}\"\n",
"        elif not save_as.endswith(file_format):\n",
"            save_as = save_as + file_format\n",
"\n",
"        # Load env\n",
"        get_env_info(env)\n",
"\n",
"        # Generate prompt\n",
"        user_prompt = get_prompt(schema_text, business_problem, nr_records)\n",
"\n",
"        # Query model\n",
"        records = query(user_prompt, model)\n",
"\n",
"        if not records:\n",
"            return \"❌ Error: No valid records generated from the model.\", None\n",
"\n",
"        # Save dataset\n",
"        save_dataset(records, file_format, save_as)\n",
"\n",
"        # Create preview\n",
"        df = pd.DataFrame(records)\n",
"        preview = df.head(10)  # Show first 10 rows\n",
"\n",
"        success_message = f\"✅ Generated {len(records)} records successfully!\\n📁 Saved to: {save_as}\\n📊 Preview shows the first 10 rows.\"\n",
"\n",
"        return success_message, preview\n",
"\n",
"    except Exception as e:\n",
"        return f\"❌ Error: {str(e)}\", None"
]
},
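{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional check without the UI (a sketch; assumes valid API keys are configured):\n",
"# status, preview = generate_dataset(DEFAULT_SCHEMA_TEXT, \"I want to generate restaurant records\", \"gpt-4o-mini\", 12, \".csv\", \"demo\", \"Local\")\n",
"# print(status)"
]
},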
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "pHiP8ky8wgEb"
},
"outputs": [],
"source": [
"# --- Gradio Interface ---\n",
"\n",
"with gr.Blocks(title=\"Dataset Generator\", theme=gr.themes.Citrus()) as interface:\n",
"    hf_token = None\n",
"    openai_api_key = None\n",
"    anthropic_api_key = None\n",
"    google_api_key = None\n",
"    deepseek_api_key = None\n",
"    tokenizer = None\n",
"    inputs = None\n",
"    llama_model = None\n",
"    outputs = None\n",
"\n",
"    gr.Markdown(\"# Dataset Generator\")\n",
"    gr.Markdown(\"Generate synthetic datasets using AI models\")\n",
"\n",
"    with gr.Row():\n",
"        with gr.Column(scale=2):\n",
"            schema_input = gr.Textbox(\n",
"                label=\"Schema\",\n",
"                value=DEFAULT_SCHEMA_TEXT,\n",
"                lines=15,\n",
"                placeholder=\"Define your dataset schema here... Please follow this format: Field_Name, Field_Type, Field Example\"\n",
"            )\n",
"\n",
"            business_problem_input = gr.Textbox(\n",
"                label=\"Business Problem\",\n",
"                value=\"I want to generate restaurant records\",\n",
"                lines=1,\n",
"                placeholder=\"Enter a business problem description for the model...\"\n",
"            )\n",
"\n",
"            with gr.Row():\n",
"                model_dropdown = gr.Dropdown(\n",
"                    label=\"Model\",\n",
"                    choices=MODELS,\n",
"                    value=MODELS[0],\n",
"                    interactive=True\n",
"                )\n",
"\n",
"                nr_records_input = gr.Number(\n",
"                    label=\"Number of records\",\n",
"                    value=27,\n",
"                    minimum=11,\n",
"                    maximum=1000,\n",
"                    step=1\n",
"                )\n",
"\n",
"            with gr.Row():\n",
"                save_as_input = gr.Textbox(\n",
"                    label=\"Save as\",\n",
"                    value=\"restaurant_dataset\",\n",
"                    placeholder=\"Enter filename (extension will be added automatically)\"\n",
"                )\n",
"\n",
"                file_format_dropdown = gr.Dropdown(\n",
"                    label=\"File format\",\n",
"                    choices=FILE_FORMATS,\n",
"                    value=FILE_FORMATS[0],\n",
"                    interactive=True\n",
"                )\n",
"\n",
"                env_dropdown = gr.Dropdown(\n",
"                    label=\"Environment\",\n",
"                    choices=[\"Colab\", \"Local\"],\n",
"                    value=\"Colab\",\n",
"                    interactive=True\n",
"                )\n",
"\n",
"            generate_btn = gr.Button(\"🚀 Generate\", variant=\"secondary\", size=\"lg\")\n",
"\n",
"        with gr.Column(scale=1):\n",
"            output_status = gr.Textbox(\n",
"                label=\"Status\",\n",
"                lines=4,\n",
"                interactive=False\n",
"            )\n",
"\n",
"            output_preview = gr.Dataframe(\n",
"                label=\"Preview (First 10 rows)\",\n",
"                interactive=False,\n",
"                wrap=True\n",
"            )\n",
"\n",
"    # Connect the generate button\n",
"    generate_btn.click(\n",
"        fn=generate_dataset,\n",
"        inputs=[\n",
"            schema_input,\n",
"            business_problem_input,\n",
"            model_dropdown,\n",
"            nr_records_input,\n",
"            file_format_dropdown,\n",
"            save_as_input,\n",
"            env_dropdown\n",
"        ],\n",
"        outputs=[output_status, output_preview]\n",
"    )\n",
"\n",
"    gr.Markdown(\"\"\"\n",
"    ### 📝 Instructions:\n",
"    1. **Schema**: Define the structure of your dataset (pre-filled with a restaurant schema)\n",
"    2. **Business problem**: User prompt to guide the AI model\n",
"    3. **Model**: Choose from the GPT, Claude, Gemini, DeepSeek, or Llama models\n",
"    4. **Number of records**: Number of records to generate (minimum 11)\n",
"    5. **File format**: Choose the output format (.csv, .tsv, .jsonl, .parquet, .arrow)\n",
"    6. **Save as**: Filename (extension added automatically)\n",
"    7. Click **Generate** to create your dataset\n",
"\n",
"    ### 🔧 Requirements:\n",
"    - For local mode, set up the HF token and API keys in a `.env` file (`OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `GOOGLE_API_KEY`, `DEEPSEEK_API_KEY`, `HF_TOKEN`)\n",
"    - For Colab mode, set up the HF token and API keys in the Colab Secrets section (`OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `GOOGLE_API_KEY`, `DEEPSEEK_API_KEY`, `HF_TOKEN`)\n",
"    \"\"\")\n",
"\n",
"interface.launch(debug=True)\n",
"\n",
"del tokenizer, inputs, llama_model, outputs\n",
"gc.collect()\n",
"torch.cuda.empty_cache()"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
@@ -0,0 +1,523 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "It89APiAtTUF"
},
"source": [
"# Create meeting minutes from an Audio file\n",
"\n",
"I downloaded some Denver City Council meeting minutes and selected a portion of the meeting for us to transcribe. You can download it here: \n",
"https://drive.google.com/file/d/1N_kpSojRR5RYzupz6nqM8hMSoEF_R7pU/view?usp=sharing\n",
"\n",
"If you'd rather work with the original data, the HuggingFace dataset is [here](https://huggingface.co/datasets/huuuyeah/meetingbank) and the audio can be downloaded [here](https://huggingface.co/datasets/huuuyeah/MeetingBank_Audio/tree/main).\n",
"\n",
"The goal of this product is to use the Audio to generate meeting minutes, including actions.\n",
"\n",
"For this project, you can either use the Denver meeting minutes, or you can record something of your own!\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "sJPSCwPX3MOV"
},
"source": [
"## Again - please note: 2 important pro-tips for using Colab:\n",
"\n",
"**Pro-tip 1:**\n",
"\n",
"The top of every colab has some pip installs. You may receive errors from pip when you run this, such as:\n",
"\n",
"> gcsfs 2025.3.2 requires fsspec==2025.3.2, but you have fsspec 2025.3.0 which is incompatible.\n",
"\n",
"These pip compatibility errors can be safely ignored; and while it's tempting to try to fix them by changing version numbers, that will actually introduce real problems!\n",
"\n",
"**Pro-tip 2:**\n",
"\n",
"In the middle of running a Colab, you might get an error like this:\n",
"\n",
"> Runtime error: CUDA is required but not available for bitsandbytes. Please consider installing [...]\n",
"\n",
"This is a super-misleading error message! Please don't try changing versions of packages...\n",
"\n",
"This actually happens because Google has switched out your Colab runtime, perhaps because Google Colab was too busy. The solution is:\n",
"\n",
"1. Kernel menu >> Disconnect and delete runtime\n",
"2. Reload the colab from fresh and Edit menu >> Clear All Outputs\n",
"3. Connect to a new T4 using the button at the top right\n",
"4. Select \"View resources\" from the menu on the top right to confirm you have a GPU\n",
"5. Rerun the cells in the colab, from the top down, starting with the pip installs\n",
"\n",
"And all should work great - otherwise, ask me!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "f2vvgnFpHpID"
},
"outputs": [],
"source": [
"!pip install -q --upgrade torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1+cu124 --index-url https://download.pytorch.org/whl/cu124\n",
"!pip install -q requests bitsandbytes==0.46.0 transformers==4.48.3 accelerate==1.3.0 openai"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "FW8nl3XRFrz0"
},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import requests\n",
"from IPython.display import Markdown, display, update_display\n",
"from openai import OpenAI\n",
"from google.colab import drive\n",
"from huggingface_hub import login\n",
"from google.colab import userdata\n",
"from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n",
"import torch"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "q3D1_T0uG_Qh"
},
"outputs": [],
"source": [
"# Constants\n",
"\n",
"AUDIO_MODEL = \"whisper-1\"\n",
"LLAMA = \"meta-llama/Meta-Llama-3.1-8B-Instruct\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Es9GkQ0FGCMt"
},
"outputs": [],
"source": [
"# New capability - connect this Colab to my Google Drive\n",
"# See immediately below this for instructions to obtain denver_extract.mp3\n",
"\n",
"drive.mount(\"/content/drive\")\n",
"audio_filename = \"/content/drive/MyDrive/llms/denver_extract.mp3\""
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "HTl3mcjyzIEE"
},
"source": [
"# Download denver_extract.mp3\n",
"\n",
"You can either use the same file as me, the extract from the Denver city council minutes, or you can try your own.\n",
"\n",
"If you want to use the same as me, then please download my extract here, and put this on your Google Drive: \n",
"https://drive.google.com/file/d/1N_kpSojRR5RYzupz6nqM8hMSoEF_R7pU/view?usp=sharing\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "xYW8kQYtF-3L"
},
"outputs": [],
"source": [
"# Sign in to HuggingFace Hub\n",
"\n",
"hf_token = userdata.get('HF_TOKEN')\n",
"login(hf_token, add_to_git_credential=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "qP6OB2OeGC2C"
},
"outputs": [],
"source": [
"# Sign in to OpenAI using Secrets in Colab\n",
"\n",
"openai_api_key = userdata.get('OPENAI_API_KEY')\n",
"openai = OpenAI(api_key=openai_api_key)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "GMShdVGlGGr4"
},
"outputs": [],
"source": [
"# Use the Whisper OpenAI model to convert the Audio to Text\n",
"# If you'd prefer to use an Open Source model, class student Youssef has contributed an open source version\n",
"# which I've added to the bottom of this colab\n",
"\n",
"audio_file = open(audio_filename, \"rb\")\n",
"transcription = openai.audio.transcriptions.create(model=AUDIO_MODEL, file=audio_file, response_format=\"text\")\n",
"print(transcription)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "piEMmcSfMH-O"
},
"outputs": [],
"source": [
"system_message = \"You are an assistant that produces minutes of meetings from transcripts, with summary, key discussion points, takeaways and action items with owners, in markdown.\"\n",
"user_prompt = f\"Below is an extract transcript of a Denver council meeting. Please write minutes in markdown, including a summary with attendees, location and date; discussion points; takeaways; and action items with owners.\\n{transcription}\"\n",
"\n",
"messages = [\n",
"    {\"role\": \"system\", \"content\": system_message},\n",
"    {\"role\": \"user\", \"content\": user_prompt}\n",
"]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "UcRKUgcxMew6"
},
"outputs": [],
"source": [
"quant_config = BitsAndBytesConfig(\n",
"    load_in_4bit=True,\n",
"    bnb_4bit_use_double_quant=True,\n",
"    bnb_4bit_compute_dtype=torch.bfloat16,\n",
"    bnb_4bit_quant_type=\"nf4\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "6CujZRAgMimy"
},
"outputs": [],
"source": [
"tokenizer = AutoTokenizer.from_pretrained(LLAMA)\n",
"tokenizer.pad_token = tokenizer.eos_token\n",
"# inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(\"cuda\")\n",
"streamer = TextStreamer(tokenizer)\n",
"model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map=\"auto\", quantization_config=quant_config, trust_remote_code=True)\n",
"# outputs = model.generate(inputs, max_new_tokens=2000, streamer=streamer)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "MaLNmJ5PSqcH"
},
"outputs": [],
"source": [
"inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(\"cuda\")\n",
"outputs = model.generate(inputs, max_new_tokens=2000, streamer=streamer)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "102tdU_3Peam"
},
"outputs": [],
"source": [
"response = tokenizer.decode(outputs[0])\n",
"response"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "KlomN6CwMdoN"
},
"outputs": [],
"source": [
"display(Markdown(response))"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "0jZElVOMSPAr"
},
"source": [
"## Day 5 exercise - Gradio UI for meeting minutes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "5iiYYxQMHf0i"
},
"outputs": [],
"source": [
"import gradio as gr\n",
"import tempfile\n",
"import soundfile as sf"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "aGwXW7BjPcTM"
},
"outputs": [],
"source": [
"# !pip install pydub\n",
"# !apt-get install ffmpeg\n",
"\n",
"from pydub import AudioSegment"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "RNu-reHuCYj_"
},
"outputs": [],
"source": [
"# Make sure the tokenizer and model have already been created above\n",
"\n",
"# tokenizer = AutoTokenizer.from_pretrained(LLAMA)\n",
"# tokenizer.pad_token = tokenizer.eos_token\n",
"# streamer = TextStreamer(tokenizer)\n",
"# model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map=\"auto\", quantization_config=quant_config)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "KOuoH0YOPruE"
},
"outputs": [],
"source": [
"# def save_as_mp3(audio_np):\n",
"#     sr, data = audio_np\n",
"#     # Convert float32 or int16 to PCM wav and then mp3\n",
"#     wav_path = tempfile.NamedTemporaryFile(suffix=\".wav\", delete=False).name\n",
"#     mp3_path = tempfile.NamedTemporaryFile(suffix=\".mp3\", delete=False).name\n",
"\n",
"#     sf.write(wav_path, data, sr)\n",
"#     audio_segment = AudioSegment.from_wav(wav_path)\n",
"#     audio_segment.export(mp3_path, format=\"mp3\", bitrate=\"64k\")  # Low bitrate = small file\n",
"#     return mp3_path"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "toBIPBJoSNw0"
},
"outputs": [],
"source": [
"# Takes the uploaded audio file path and returns the generated meeting minutes\n",
"def speak_send(audio_path):\n",
"\n",
"    # If using numpy as input: audio_input = gr.Audio(sources=\"upload\", type=\"numpy\", label=\"Upload audio file to generate meeting minutes\")\n",
"    # mp3_path = save_as_mp3(audio_np)\n",
"\n",
"    # with open(mp3_path, \"rb\") as audio_file:\n",
"    #     transcription = openai.audio.transcriptions.create(\n",
"    #         model=AUDIO_MODEL,\n",
"    #         file=audio_file,\n",
"    #         response_format=\"text\"\n",
"    #     )\n",
"\n",
"    audio = AudioSegment.from_file(audio_path)\n",
"    with tempfile.NamedTemporaryFile(suffix=\".mp3\", delete=False) as tmpfile:\n",
"        audio.export(tmpfile.name, format=\"mp3\")\n",
"        with open(tmpfile.name, \"rb\") as file:\n",
"            transcript = openai.audio.transcriptions.create(\n",
"                model=AUDIO_MODEL,\n",
"                file=file,\n",
"                response_format=\"text\"\n",
"            )\n",
"\n",
"    system_message = \"You are an assistant that produces minutes of meetings from transcripts, with summary, key discussion points, takeaways and action items with owners, in markdown.\"\n",
"    user_prompt = f\"Below is an extract transcript of a Denver council meeting. Please write minutes in markdown, including a summary with attendees, location and date; discussion points; takeaways; and action items with owners.\\n{transcript}\"\n",
"\n",
"    messages = [\n",
"        {\"role\": \"system\", \"content\": system_message},\n",
"        {\"role\": \"user\", \"content\": user_prompt}\n",
"    ]\n",
"\n",
"    inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(\"cuda\")\n",
"    outputs = model.generate(inputs, max_new_tokens=2000)\n",
"\n",
"    _, _, after = tokenizer.decode(outputs[0]).partition(\"assistant<|end_header_id|>\")\n",
"    return after.strip()\n"
]
},
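{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional check without the UI (a sketch; assumes audio_filename still points at the mp3 on your Drive):\n",
"# print(speak_send(audio_filename))"
]
},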
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "xXJfabpDSN5R"
},
"outputs": [],
"source": [
"with gr.Blocks() as demo:\n",
"\n",
"    with gr.Row():\n",
"        audio_input = gr.Audio(sources=\"upload\", type=\"filepath\", label=\"Upload audio file to generate meeting minutes\")\n",
"    with gr.Row():\n",
"        audio_submit = gr.Button(\"Send\")\n",
"    with gr.Row():\n",
"        outputs = [gr.Markdown(label=\"Meeting minutes:\")]\n",
"\n",
"    audio_submit.click(speak_send, inputs=audio_input, outputs=outputs)\n",
"\n",
"demo.launch(debug=True)\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "kuxYecT2QDQ9"
},
"source": [
"# Student contribution\n",
"\n",
"Student Emad S. has made this powerful variation that uses `TextIteratorStreamer` to stream back results into a Gradio UI, and takes advantage of background threads for performance! I'm sharing it here if you'd like to take a look at some very interesting work. Thank you, Emad!\n",
"\n",
"https://colab.research.google.com/drive/1Ja5zyniyJo5y8s1LKeCTSkB2xyDPOt6D"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "AU3uAEyU3a-o"
},
"source": [
"## Alternative implementation\n",
"\n",
"Class student Youssef has contributed this variation in which we use an open-source model to transcribe the meeting Audio.\n",
"\n",
"Thank you Youssef!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "phYYgAbBRvu5"
},
"outputs": [],
"source": [
"import torch\n",
"from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "HdQnWEzW3lzP"
},
"outputs": [],
"source": [
"AUDIO_MODEL = \"openai/whisper-medium\"\n",
"speech_model = AutoModelForSpeechSeq2Seq.from_pretrained(AUDIO_MODEL, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_safetensors=True)\n",
"speech_model.to('cuda')\n",
"processor = AutoProcessor.from_pretrained(AUDIO_MODEL)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "ZhA_fbeCSAeZ"
},
"outputs": [],
"source": [
"pipe = pipeline(\n",
"    \"automatic-speech-recognition\",\n",
"    model=speech_model,\n",
"    tokenizer=processor.tokenizer,\n",
"    feature_extractor=processor.feature_extractor,\n",
"    torch_dtype=torch.float16,\n",
"    device='cuda',\n",
"    return_timestamps=True\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "nrQjKtD53omJ"
},
"outputs": [],
"source": [
"# Use the open-source Whisper model to convert the audio to text\n",
"result = pipe(audio_filename)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "G_XSljOY3tDf"
},
"outputs": [],
"source": [
"transcription = result[\"text\"]\n",
"print(transcription)"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
@@ -0,0 +1,841 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "4a6ab9a2-28a2-445d-8512-a0dc8d1b54e9",
"metadata": {},
"source": [
"# Power Coder\n",
"\n",
"1. Convert code between two programming languages; supported languages are Python, Java, JavaScript, TypeScript, C, C++, C#, Go, Rust, Kotlin, Swift, PHP, and Julia\n",
"2. Automatically add docstrings/comments in the selected comment style\n",
"3. Automatically generate unit tests in the selected unit test style\n",
"4. Supported models: gpt-4o, claude-3-5-sonnet-20240620, gemini-2.5-flash"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e610bf56-a46e-4aff-8de1-ab49d62b1ad3",
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import io\n",
"import sys\n",
"import json\n",
"import requests\n",
"from dotenv import load_dotenv\n",
"from openai import OpenAI\n",
"import google.generativeai\n",
"import anthropic\n",
"from IPython.display import Markdown, display, update_display\n",
"import gradio as gr\n",
"import subprocess"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4f672e1c-87e9-4865-b760-370fa605e614",
"metadata": {},
"outputs": [],
"source": [
"# environment\n",
"\n",
"load_dotenv(override=True)\n",
"os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')\n",
"os.environ['ANTHROPIC_API_KEY'] = os.getenv('ANTHROPIC_API_KEY', 'your-key-if-not-using-env')\n",
"os.environ['GOOGLE_API_KEY'] = os.getenv('GOOGLE_API_KEY', 'your-key-if-not-using-env')\n",
"os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN', 'your-key-if-not-using-env')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8aa149ed-9298-4d69-8fe2-8f5de0f667da",
"metadata": {},
"outputs": [],
"source": [
"# initialize\n",
"\n",
"openai = OpenAI()\n",
"claude = anthropic.Anthropic()\n",
"gemini_via_openai_client = OpenAI(\n",
"    api_key=os.environ['GOOGLE_API_KEY'],\n",
"    base_url=\"https://generativelanguage.googleapis.com/v1beta/openai/\"\n",
")\n",
"OPENAI_MODEL = \"gpt-4o\"\n",
"CLAUDE_MODEL = \"claude-3-5-sonnet-20240620\"\n",
"GEMINI_MODEL = \"gemini-2.5-flash\""
]
},
{
"cell_type": "markdown",
"id": "37b204dd-f770-41d9-9b19-7e1baa5273cd",
"metadata": {},
"source": [
"## 1. Conversion Part"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6896636f-923e-4a2c-9d6c-fac07828a201",
"metadata": {},
"outputs": [],
"source": [
"def convert_system_prompt_for(in_lang, out_lang):\n",
"    convert_system_message = f\"You are an assistant that reimplements {in_lang} code in high performance {out_lang}. \"\n",
"    convert_system_message += f\"Respond only with {out_lang} code; use comments sparingly and do not provide any explanation other than occasional comments. \"\n",
"    convert_system_message += f\"The {out_lang} response needs to produce an identical output in the fastest possible time. Keep implementations of random number generators identical so that results match exactly.\"\n",
"    return convert_system_message"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8e7b3546-57aa-4c29-bc5d-f211970d04eb",
"metadata": {},
"outputs": [],
"source": [
"def convert_user_prompt_for(in_lang, out_lang, input_instruct, in_code):\n",
"    convert_user_prompt = f\"Rewrite this {in_lang} code in {out_lang} with the fastest possible implementation that produces identical output in the least time. \"\n",
"    convert_user_prompt += f\"Respond only with {out_lang} code; do not explain your work other than a few comments. \"\n",
"    convert_user_prompt += f\"Pay attention to number types to ensure no int overflows. Remember to include all necessary {out_lang} packages or modules, for example, iomanip for C++.\\n\\n\"\n",
"    if input_instruct:\n",
"        convert_user_prompt += \"Additional instructions: \" + input_instruct + \"\\n\\n\"\n",
"    convert_user_prompt += in_code\n",
"    return convert_user_prompt"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c6190659-f54c-4951-bef4-4960f8e51cc4",
"metadata": {},
"outputs": [],
"source": [
"def convert_messages_for(in_lang, out_lang, input_instruct, in_code):\n",
"    return [\n",
"        {\"role\": \"system\", \"content\": convert_system_prompt_for(in_lang, out_lang)},\n",
"        {\"role\": \"user\", \"content\": convert_user_prompt_for(in_lang, out_lang, input_instruct, in_code)}\n",
"    ]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c3b497b3-f569-420e-b92e-fb0f49957ce0",
"metadata": {},
"outputs": [],
"source": [
"python_hard = \"\"\"# Be careful to support large number sizes\n",
"\n",
"def lcg(seed, a=1664525, c=1013904223, m=2**32):\n",
"    value = seed\n",
"    while True:\n",
"        value = (a * value + c) % m\n",
"        yield value\n",
"\n",
"def max_subarray_sum(n, seed, min_val, max_val):\n",
"    lcg_gen = lcg(seed)\n",
"    random_numbers = [next(lcg_gen) % (max_val - min_val + 1) + min_val for _ in range(n)]\n",
"    max_sum = float('-inf')\n",
"    for i in range(n):\n",
"        current_sum = 0\n",
"        for j in range(i, n):\n",
"            current_sum += random_numbers[j]\n",
"            if current_sum > max_sum:\n",
"                max_sum = current_sum\n",
"    return max_sum\n",
"\n",
"def total_max_subarray_sum(n, initial_seed, min_val, max_val):\n",
"    total_sum = 0\n",
"    lcg_gen = lcg(initial_seed)\n",
"    for _ in range(20):\n",
"        seed = next(lcg_gen)\n",
"        total_sum += max_subarray_sum(n, seed, min_val, max_val)\n",
"    return total_sum\n",
"\n",
"# Parameters\n",
"n = 10000         # Number of random numbers\n",
"initial_seed = 42 # Initial seed for the LCG\n",
"min_val = -10     # Minimum value of random numbers\n",
"max_val = 10      # Maximum value of random numbers\n",
"\n",
"# Timing the function\n",
"import time\n",
"start_time = time.time()\n",
"result = total_max_subarray_sum(n, initial_seed, min_val, max_val)\n",
"end_time = time.time()\n",
"\n",
"print(\"Total Maximum Subarray Sum (20 runs):\", result)\n",
"print(\"Execution Time: {:.6f} seconds\".format(end_time - start_time))\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0be9f47d-5213-4700-b0e2-d444c7c738c0",
"metadata": {},
"outputs": [],
"source": [
"def convert_stream_gpt(in_lang, out_lang, input_instruct, in_code):\n",
"    stream = openai.chat.completions.create(model=OPENAI_MODEL, messages=convert_messages_for(in_lang, out_lang, input_instruct, in_code), temperature=0.0, stream=True)\n",
"    reply = \"\"\n",
"    for chunk in stream:\n",
"        fragment = chunk.choices[0].delta.content or \"\"\n",
"        reply += fragment\n",
"        yield reply"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8669f56b-8314-4582-a167-78842caea131",
"metadata": {},
"outputs": [],
"source": [
"def convert_stream_claude(in_lang, out_lang, input_instruct, in_code):\n",
"    result = claude.messages.stream(\n",
"        model=CLAUDE_MODEL,\n",
"        max_tokens=2000,\n",
"        temperature=0.0,\n",
"        system=convert_system_prompt_for(in_lang, out_lang),\n",
"        messages=[{\"role\": \"user\", \"content\": convert_user_prompt_for(in_lang, out_lang, input_instruct, in_code)}],\n",
"    )\n",
"    reply = \"\"\n",
"    with result as stream:\n",
"        for text in stream.text_stream:\n",
"            reply += text\n",
"            yield reply"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "01d3cd4f-c100-4e25-8670-0663513f6136",
"metadata": {},
"outputs": [],
"source": [
"def convert_stream_gemini(in_lang, out_lang, input_instruct, in_code):\n",
"    stream = gemini_via_openai_client.chat.completions.create(model=GEMINI_MODEL, messages=convert_messages_for(in_lang, out_lang, input_instruct, in_code), temperature=0.0, stream=True)\n",
"    reply = \"\"\n",
"    for chunk in stream:\n",
"        fragment = chunk.choices[0].delta.content or \"\"\n",
"        reply += fragment\n",
"        yield reply"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2f1ae8f5-16c8-40a0-aa18-63b617df078d",
"metadata": {},
"outputs": [],
"source": [
"def optimize(in_lang, out_lang, in_code, input_instruct, convert_model):\n",
"    if \"gpt\" in convert_model.lower():\n",
"        result = convert_stream_gpt(in_lang, out_lang, input_instruct, in_code)\n",
"    elif \"claude\" in convert_model.lower():\n",
"        result = convert_stream_claude(in_lang, out_lang, input_instruct, in_code)\n",
"    elif \"gemini\" in convert_model.lower():\n",
"        result = convert_stream_gemini(in_lang, out_lang, input_instruct, in_code)\n",
"    else:\n",
"        raise ValueError(\"Unknown convert model\")\n",
"    for stream_so_far in result:\n",
"        yield stream_so_far"
]
},
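{
"cell_type": "code",
"execution_count": null,
"id": "demo-optimize-cell",
"metadata": {},
"outputs": [],
"source": [
"# Quick sanity check (a sketch; assumes a valid OPENAI_API_KEY): stream a conversion of python_hard\n",
"converted = \"\"\n",
"for converted in optimize(\"Python\", \"C++\", python_hard, \"\", \"gpt-4o\"):\n",
"    pass\n",
"print(converted[:300])"
]
},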
{
"cell_type": "markdown",
"id": "07383878-f887-464f-8bc7-527c669d3edd",
"metadata": {},
"source": [
"## 2. Comment part"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d254038c-fdd6-4ef8-8b7a-a074f1e7405d",
"metadata": {},
"outputs": [],
"source": [
"def comment_system_prompt_for(lang, comment_style):\n",
"    comment_system_message = f\"You are an assistant that generates necessary, concise and clear comments/docstrings for the given {lang} code by applying the {comment_style} comment style. \"\n",
"    comment_system_message += \"Respond only with the added comments, and do not provide any redundant explanation. \"\n",
"    return comment_system_message"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e95cee4f-f229-4c9f-8e67-8a68cc9534c3",
"metadata": {},
"outputs": [],
"source": [
"def comment_user_prompt_for(lang, code, comment_style):\n",
"    comment_user_prompt = f\"Add comments/docstrings to the given code for the {lang} programming language in the {comment_style} comment style. \"\n",
"    comment_user_prompt += \"Respond only with the added comments, and do not provide any redundant explanation.\\n\\n\"\n",
"    comment_user_prompt += \"The given code is as follows: \"\n",
"    comment_user_prompt += code\n",
"    return comment_user_prompt"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "507426c2-cf5a-4041-b904-b18a5afe83b6",
"metadata": {},
"outputs": [],
"source": [
"def comment_messages_for(lang, code, comment_style):\n",
"    return [\n",
"        {\"role\": \"system\", \"content\": comment_system_prompt_for(lang, comment_style)},\n",
"        {\"role\": \"user\", \"content\": comment_user_prompt_for(lang, code, comment_style)}\n",
"    ]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7e1c8cf6-7a15-4e79-82f6-6bb2a0b85773",
"metadata": {},
"outputs": [],
"source": [
"def comment_stream_gpt(lang, code, comment_style):\n",
"    stream = openai.chat.completions.create(model=OPENAI_MODEL, messages=comment_messages_for(lang, code, comment_style), temperature=0.0, stream=True)\n",
"    reply = \"\"\n",
"    for chunk in stream:\n",
"        fragment = chunk.choices[0].delta.content or \"\"\n",
"        reply += fragment\n",
"        yield reply"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "26f27781-4a3e-4e5f-a8ab-9a25944a9879",
"metadata": {},
"outputs": [],
"source": [
"def comment_stream_claude(lang, code, comment_style):\n",
"    result = claude.messages.stream(\n",
"        model=CLAUDE_MODEL,\n",
"        max_tokens=2000,\n",
"        temperature=0.0,\n",
"        system=comment_system_prompt_for(lang, comment_style),\n",
"        messages=[{\"role\": \"user\", \"content\": comment_user_prompt_for(lang, code, comment_style)}],\n",
"    )\n",
"    reply = \"\"\n",
"    with result as stream:\n",
"        for text in stream.text_stream:\n",
"            reply += text\n",
"            yield reply"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8e6719e7-f2f3-40ea-8fed-01d84a641306",
"metadata": {},
"outputs": [],
"source": [
"def comment_stream_gemini(lang, code, comment_style):\n",
"    stream = gemini_via_openai_client.chat.completions.create(model=GEMINI_MODEL, messages=comment_messages_for(lang, code, comment_style), temperature=0.0, stream=True)\n",
"    reply = \"\"\n",
"    for chunk in stream:\n",
"        fragment = chunk.choices[0].delta.content or \"\"\n",
"        reply += fragment\n",
"        yield reply"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2b98acc4-23d8-4671-8f19-92d72631b55d",
"metadata": {},
"outputs": [],
"source": [
"def generate_comments_via_model(lang, code, comment_style, comment_model):\n",
"    if \"gpt\" in comment_model.lower():\n",
"        result = comment_stream_gpt(lang, code, comment_style)\n",
"    elif \"claude\" in comment_model.lower():\n",
"        result = comment_stream_claude(lang, code, comment_style)\n",
"    elif \"gemini\" in comment_model.lower():\n",
"        result = comment_stream_gemini(lang, code, comment_style)\n",
"    else:\n",
"        raise ValueError(\"Unknown comment model\")\n",
"    for stream_so_far in result:\n",
"        yield stream_so_far"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "282c75ae-d8c3-4866-a024-f7ecf87b3cde",
"metadata": {},
"outputs": [],
"source": [
"def generate_comments_fn(comment_option, in_lang, out_lang, in_code, out_code, in_comment_style, out_comment_style, comment_model):\n",
"    if 'input' in comment_option:\n",
"        in_gen = generate_comments_via_model(in_lang, in_code, in_comment_style, comment_model)\n",
"        for in_output in in_gen:\n",
"            yield in_output, \"\"\n",
"    elif 'output' in comment_option:\n",
"        out_gen = generate_comments_via_model(out_lang, out_code, out_comment_style, comment_model)\n",
"        for out_output in out_gen:\n",
"            yield \"\", out_output\n",
"    elif 'both' in comment_option:\n",
"        in_gen = generate_comments_via_model(in_lang, in_code, in_comment_style, comment_model)\n",
"        out_gen = generate_comments_via_model(out_lang, out_code, out_comment_style, comment_model)\n",
"        for in_output, out_output in zip(in_gen, out_gen):\n",
"            yield in_output, out_output"
]
},
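{
"cell_type": "code",
"execution_count": null,
"id": "demo-comments-cell",
"metadata": {},
"outputs": [],
"source": [
"# Quick sanity check (a sketch; assumes a valid OPENAI_API_KEY): stream Google-style docstrings for python_hard\n",
"commented = \"\"\n",
"for commented, _ in generate_comments_fn(\"input\", \"Python\", \"C++\", python_hard, \"\", \"Google-style\", \"Doxygen\", \"gpt-4o\"):\n",
"    pass\n",
"print(commented[:300])"
]
},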
{
"cell_type": "markdown",
"id": "ce2c178c-d03c-49c0-b0e9-c57c699bca08",
"metadata": {},
"source": [
"## 3. Unit test part"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e5a4743e-e1a8-42c7-8f1f-a73d49c0895d",
"metadata": {},
"outputs": [],
"source": [
"def unit_test_system_prompt_for(lang, unit_test_style):\n",
"    unit_test_system_message = f\"You are an assistant that generates necessary, concise, clear and executable unit tests for the given {lang} code by applying the {unit_test_style} unit test style. \"\n",
"    unit_test_system_message += \"Respond only with the generated unit tests; use comments sparingly and do not provide any explanation other than occasional comments. \"\n",
"    return unit_test_system_message"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "334d5e40-71ff-4d24-8cef-b6c81c188e4d",
"metadata": {},
"outputs": [],
"source": [
"def unit_test_user_prompt_for(lang, code, unit_test_style):\n",
"    unit_test_user_prompt = f\"Add unit tests to the given code for the {lang} programming language in the {unit_test_style} unit test style. \"\n",
"    unit_test_user_prompt += \"Respond only with the generated unit tests; use comments sparingly and do not provide any explanation other than occasional comments.\\n\\n\"\n",
"    unit_test_user_prompt += \"The given code is as follows: \"\n",
"    unit_test_user_prompt += code\n",
"    return unit_test_user_prompt"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8a8e061f-3993-4746-9425-d938d2537f65",
"metadata": {},
"outputs": [],
"source": [
"def unit_test_messages_for(lang, code, unit_test_style):\n",
"    return [\n",
"        {\"role\": \"system\", \"content\": unit_test_system_prompt_for(lang, unit_test_style)},\n",
"        {\"role\": \"user\", \"content\": unit_test_user_prompt_for(lang, code, unit_test_style)}\n",
"    ]"
]
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "71c1613b-7a16-4443-acec-d0a2d9bed192",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def unit_test_stream_gpt(lang, code, unit_test_style): \n",
|
||||
" stream = openai.chat.completions.create(model=OPENAI_MODEL, messages=unit_test_messages_for(lang, code, unit_test_style), stream=True)\n",
|
||||
" reply = \"\"\n",
|
||||
" for chunk in stream:\n",
|
||||
" fragment = chunk.choices[0].delta.content or \"\"\n",
|
||||
" reply += fragment\n",
|
||||
" yield reply"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8a6e3502-f7ff-42b8-8fc5-2697b2d1f36e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def unit_test_stream_claude(lang, code, unit_test_style):\n",
|
||||
" result = claude.messages.stream(\n",
|
||||
" model=CLAUDE_MODEL,\n",
|
||||
" max_tokens=2000,\n",
|
||||
" system=unit_test_system_prompt_for(lang, unit_test_style),\n",
|
||||
" messages=[{\"role\": \"user\", \"content\": unit_test_user_prompt_for(lang, code, unit_test_style)}],\n",
|
||||
" )\n",
|
||||
" reply = \"\"\n",
|
||||
" with result as stream:\n",
|
||||
" for text in stream.text_stream:\n",
|
||||
" reply += text\n",
|
||||
" yield reply"
|
||||
]
|
||||
},
{
"cell_type": "code",
"execution_count": null,
"id": "8d7f694f-a276-4bdc-9cfb-755483fd4380",
"metadata": {},
"outputs": [],
"source": [
"def unit_test_stream_gemini(lang, code, unit_test_style):\n",
"    # Gemini is called through its OpenAI-compatible client\n",
"    stream = gemini_via_openai_client.chat.completions.create(model=GEMINI_MODEL, messages=unit_test_messages_for(lang, code, unit_test_style), stream=True)\n",
"    reply = \"\"\n",
"    for chunk in stream:\n",
"        fragment = chunk.choices[0].delta.content or \"\"\n",
"        reply += fragment\n",
"        yield reply"
]
},
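{
"cell_type": "markdown",
"id": "e3f4a5b6-3c4d-4e5f-8a6b-7c8d9e0f1a2b",
"metadata": {},
"source": [
"All three streaming helpers share the same generator contract: each `yield` returns the full reply accumulated so far, which is what Gradio expects for live textbox updates. A minimal sketch of draining one outside the UI (assuming the `openai` client and `OPENAI_MODEL` above are configured):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f4a5b6c7-4d5e-4f6a-8b7c-8d9e0f1a2b3c",
"metadata": {},
"outputs": [],
"source": [
"# Minimal sketch: drain a streaming generator outside Gradio.\n",
"# Each yielded value is the reply so far, so the last one is the complete reply.\n",
"final_reply = \"\"\n",
"for partial in unit_test_stream_gpt(\"Python\", \"def add(a, b):\\n    return a + b\", \"pytest\"):\n",
"    final_reply = partial\n",
"print(final_reply)"
]
},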
{
"cell_type": "code",
"execution_count": null,
"id": "c824429a-b18a-4320-8258-0141037a6531",
"metadata": {},
"outputs": [],
"source": [
"def generate_unit_test_via_model(lang, code, unit_test_style, unit_test_model):\n",
"    # Dispatch to the model-specific streaming helper\n",
"    if \"gpt\" in unit_test_model.lower():\n",
"        result = unit_test_stream_gpt(lang, code, unit_test_style)\n",
"    elif \"claude\" in unit_test_model.lower():\n",
"        result = unit_test_stream_claude(lang, code, unit_test_style)\n",
"    elif \"gemini\" in unit_test_model.lower():\n",
"        result = unit_test_stream_gemini(lang, code, unit_test_style)\n",
"    else:\n",
"        raise ValueError(\"Unknown unit test model\")\n",
"    for stream_so_far in result:\n",
"        yield stream_so_far"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c3e59e26-37c0-4429-b69c-deb581423dd0",
"metadata": {},
"outputs": [],
"source": [
"def generate_unit_test_fn(unit_test_option, in_lang, out_lang, in_code, out_code, in_unit_test_style, out_unit_test_style, unit_test_model):\n",
"    if 'input' in unit_test_option:\n",
"        in_gen = generate_unit_test_via_model(in_lang, in_code, in_unit_test_style, unit_test_model)\n",
"        for in_output in in_gen:\n",
"            yield in_output, \"\"\n",
"    elif 'output' in unit_test_option:\n",
"        out_gen = generate_unit_test_via_model(out_lang, out_code, out_unit_test_style, unit_test_model)\n",
"        for out_output in out_gen:\n",
"            yield \"\", out_output\n",
"    elif 'both' in unit_test_option:\n",
"        in_gen = generate_unit_test_via_model(in_lang, in_code, in_unit_test_style, unit_test_model)\n",
"        out_gen = generate_unit_test_via_model(out_lang, out_code, out_unit_test_style, unit_test_model)\n",
"        # Note: zip stops at the shorter stream, so the longer reply can be\n",
"        # cut off mid-stream; see the zip_longest sketch in the next cell.\n",
"        for in_output, out_output in zip(in_gen, out_gen):\n",
"            yield in_output, out_output"
]
},
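{
"cell_type": "markdown",
"id": "a5b6c7d8-5e6f-4a7b-8c8d-9e0f1a2b3c4d",
"metadata": {},
"source": [
"In the `both` branch above, `zip` exhausts as soon as the shorter of the two streams finishes, so the longer set of unit tests can be truncated. A minimal sketch of a fix using `itertools.zip_longest`; `generate_unit_test_fn_both` is a hypothetical helper, not part of the original app flow."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b6c7d8e9-6f7a-4b8c-8d9e-0f1a2b3c4d5e",
"metadata": {},
"outputs": [],
"source": [
"from itertools import zip_longest\n",
"\n",
"def generate_unit_test_fn_both(in_gen, out_gen):\n",
"    # Hypothetical sketch: keep the last value seen from each stream so the\n",
"    # shorter one stays on screen while the longer one keeps streaming.\n",
"    last_in, last_out = \"\", \"\"\n",
"    for in_output, out_output in zip_longest(in_gen, out_gen, fillvalue=None):\n",
"        if in_output is not None:\n",
"            last_in = in_output\n",
"        if out_output is not None:\n",
"            last_out = out_output\n",
"        yield last_in, last_out"
]
},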
{
"cell_type": "markdown",
"id": "2a1f4d0c-f417-4de4-be9f-441cbe5a6db3",
"metadata": {},
"source": [
"## 4. Gradio UI part"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9a2274f1-d03b-42c0-8dcc-4ce159b18442",
"metadata": {},
"outputs": [],
"source": [
"LANGUAGE_INFO = {\n",
"    \"Python\": {\n",
"        \"doc_style\": [\"Google-style\", \"NumPy-style\", \"reST\", \"Doxygen\"],\n",
"        \"unit_test_style\": [\"unittest\", \"pytest\", \"doctest\"]\n",
"    },\n",
"    \"Java\": {\n",
"        \"doc_style\": [\"Javadoc\"],\n",
"        \"unit_test_style\": [\"JUnit4\", \"JUnit5\", \"TestNG\"]\n",
"    },\n",
"    \"JavaScript\": {\n",
"        \"doc_style\": [\"JSDoc\"],\n",
"        \"unit_test_style\": [\"Jest\", \"Mocha + Chai\", \"Jasmine\"]\n",
"    },\n",
"    \"TypeScript\": {\n",
"        \"doc_style\": [\"JSDoc\", \"TSDoc\"],\n",
"        \"unit_test_style\": [\"Jest\", \"Mocha + Chai\", \"Vitest\"]\n",
"    },\n",
"    \"C\": {\n",
"        \"doc_style\": [\"Doxygen\"],\n",
"        \"unit_test_style\": [\"Google Test (gtest)\", \"CppUnit\", \"Catch2\"]\n",
"    },\n",
"    \"C++\": {\n",
"        \"doc_style\": [\"Doxygen\"],\n",
"        \"unit_test_style\": [\"Google Test (gtest)\", \"CppUnit\", \"Catch2\"]\n",
"    },\n",
"    \"C#\": {\n",
"        \"doc_style\": [\"XML comments\"],\n",
"        \"unit_test_style\": [\"xUnit\", \"NUnit\", \"MSTest\"]\n",
"    },\n",
"    \"Go\": {\n",
"        \"doc_style\": [\"Godoc\"],\n",
"        \"unit_test_style\": [\"Built-in testing package\"]\n",
"    },\n",
"    \"Rust\": {\n",
"        \"doc_style\": [\"Rustdoc\", \"Markdown\"],\n",
"        \"unit_test_style\": [\"Built-in #[test] annotation\"]\n",
"    },\n",
"    \"Kotlin\": {\n",
"        \"doc_style\": [\"KDoc\"],\n",
"        \"unit_test_style\": [\"JUnit\", \"Kotest\", \"Spek\"]\n",
"    },\n",
"    \"Swift\": {\n",
"        \"doc_style\": [\"Mark-style comments\"],\n",
"        \"unit_test_style\": [\"XCTest\"]\n",
"    },\n",
"    \"PHP\": {\n",
"        \"doc_style\": [\"PHPDoc\"],\n",
"        \"unit_test_style\": [\"PHPUnit\"]\n",
"    },\n",
"    \"Julia\": {\n",
"        \"doc_style\": [\"Markdown\"],\n",
"        \"unit_test_style\": [\"Built-in Test standard library\"]\n",
"    }\n",
"}\n",
"LANGUAGES = list(LANGUAGE_INFO.keys())"
]
},
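{
"cell_type": "markdown",
"id": "c7d8e9f0-7a8b-4c9d-8e0f-1a2b3c4d5e6f",
"metadata": {},
"source": [
"A small sanity check (illustrative only): every `LANGUAGE_INFO` entry must expose both keys that the UI below reads when building the style dropdowns."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d8e9f0a1-8b9c-4d0e-8f1a-2b3c4d5e6f7a",
"metadata": {},
"outputs": [],
"source": [
"# Illustrative only: confirm every language defines the two keys the UI reads\n",
"for lang, info in LANGUAGE_INFO.items():\n",
"    assert \"doc_style\" in info and \"unit_test_style\" in info, lang\n",
"print(f\"{len(LANGUAGES)} languages configured: {', '.join(LANGUAGES)}\")"
]
},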
{
"cell_type": "code",
"execution_count": null,
"id": "b50e7833-8f6f-407e-8174-37af9cec2030",
"metadata": {},
"outputs": [],
"source": [
"with gr.Blocks(title=\"Power Coder\", theme=gr.themes.Citrus(), css=\"\"\"\n",
".selected {\n",
"    background-color: orange !important;\n",
"    box-shadow: 0 4px 12px rgba(255, 140, 0, 0.5) !important;\n",
"    color: black;\n",
"}\n",
".unselected {\n",
"    background-color: gray !important;\n",
"    box-shadow: 0 4px 12px rgba(128, 128, 128, 0.4);\n",
"    color: white;\n",
"}\n",
"\"\"\") as ui:\n",
"    current_selected = gr.State(\"\")\n",
"    initial_in_lang = \"Python\"\n",
"    initial_out_lang = \"Java\"\n",
"    in_comment_style_choices = [\"Standard\"] + LANGUAGE_INFO[initial_in_lang][\"doc_style\"]\n",
"    out_comment_style_choices = [\"Standard\"] + LANGUAGE_INFO[initial_out_lang][\"doc_style\"]\n",
"    in_unit_test_style_choices = [\"Standard\"] + LANGUAGE_INFO[initial_in_lang][\"unit_test_style\"]\n",
"    out_unit_test_style_choices = [\"Standard\"] + LANGUAGE_INFO[initial_out_lang][\"unit_test_style\"]\n",
"    in_code_file_name = gr.State(\"in_code.txt\")\n",
"    out_code_file_name = gr.State(\"out_code.txt\")\n",
"    in_comments_file_name = gr.State(\"in_comments.txt\")\n",
"    out_comments_file_name = gr.State(\"out_comments.txt\")\n",
"    in_unit_test_file_name = gr.State(\"in_unit_tests.txt\")\n",
"    out_unit_test_file_name = gr.State(\"out_unit_tests.txt\")\n",
"\n",
"    gr.Markdown(\"## Code Helper\")\n",
"\n",
"    def load_file_content(file):\n",
"        if file is None:\n",
"            return \"\"\n",
"        with open(file.name, \"r\", encoding=\"utf-8\") as f:\n",
"            return f.read()\n",
"\n",
"    def change_lang(lang):\n",
"        comment_style_choices = [\"Standard\"] + LANGUAGE_INFO[lang][\"doc_style\"]\n",
"        unit_test_style_choices = [\"Standard\"] + LANGUAGE_INFO[lang][\"unit_test_style\"]\n",
"        return (\n",
"            gr.update(choices=comment_style_choices, value=str(comment_style_choices[0])),\n",
"            gr.update(choices=unit_test_style_choices, value=str(unit_test_style_choices[0]))\n",
"        )\n",
"\n",
"    def download_fn(in_text, out_text, in_file_name, out_file_name):\n",
"        if in_text:\n",
"            with open(in_file_name, \"w\") as f:\n",
"                f.write(in_text)\n",
"        if out_text:\n",
"            with open(out_file_name, \"w\") as f:\n",
"                f.write(out_text)\n",
"\n",
"    # Conversion part\n",
"    with gr.Row():\n",
"        in_lang = gr.Dropdown(choices=LANGUAGES, label=\"Select input language\", value=initial_in_lang, interactive=True)\n",
"        out_lang = gr.Dropdown(choices=LANGUAGES, label=\"Select output language\", value=initial_out_lang, interactive=True)\n",
"    with gr.Row():\n",
"        input_file = gr.File(label=\"Upload a source code file or input below\")\n",
"        input_instruct = gr.Textbox(\n",
"            label=\"Additional instruction (optional)\",\n",
"            placeholder=\"Enter the instruction you want the output code to follow...\\n\\nFor example: Define variables using snake_case style.\",\n",
"            lines=8\n",
"        )\n",
"    with gr.Row():\n",
"        in_code = gr.Textbox(label=\"Input Code:\", value=python_hard, lines=10)\n",
"        out_code = gr.Textbox(label=\"Output Code:\", lines=10)\n",
"    with gr.Row():\n",
"        convert_model = gr.Dropdown([\"Claude\", \"GPT\", \"Gemini\"], label=\"Select model\", value=\"Claude\")\n",
"    with gr.Row():\n",
"        convert = gr.Button(\"Convert code\")\n",
"        download_code = gr.Button(\"Download code\")\n",
"\n",
"    gr.HTML(\"<hr style='border: none; height: 1px; background-color: #333;'>\")\n",
"\n",
"    def show_comment(current_selected):\n",
"        if current_selected == \"comment\":\n",
"            return (\n",
"                gr.update(visible=False),\n",
"                gr.update(visible=False),\n",
"                gr.update(elem_classes=[\"unselected\"]),\n",
"                gr.update(elem_classes=[\"unselected\"]),\n",
"                \"\"\n",
"            )\n",
"        else:\n",
"            return (\n",
"                gr.update(visible=True),\n",
"                gr.update(visible=False),\n",
"                gr.update(elem_classes=[\"selected\"]),\n",
"                gr.update(elem_classes=[\"unselected\"]),\n",
"                \"comment\"\n",
"            )\n",
"\n",
"    def show_unit_test(current_selected):\n",
"        if current_selected == \"unit_test\":\n",
"            return (\n",
"                gr.update(visible=False),\n",
"                gr.update(visible=False),\n",
"                gr.update(elem_classes=[\"unselected\"]),\n",
"                gr.update(elem_classes=[\"unselected\"]),\n",
"                \"\"\n",
"            )\n",
"        else:\n",
"            return (\n",
"                gr.update(visible=False),\n",
"                gr.update(visible=True),\n",
"                gr.update(elem_classes=[\"unselected\"]),\n",
"                gr.update(elem_classes=[\"selected\"]),\n",
"                \"unit_test\"\n",
"            )\n",
"\n",
"    with gr.Blocks() as demo:\n",
"        with gr.Row():\n",
"            comment_show_up = gr.Button(\"Comment\", elem_id=\"comment-btn\", elem_classes=[\"unselected\"])\n",
"            unit_test_show_up = gr.Button(\"Unit Test\", elem_id=\"unit-test-btn\", elem_classes=[\"unselected\"])\n",
"\n",
"        comment_section = gr.Column(visible=False)\n",
"        unit_test_section = gr.Column(visible=False)\n",
"\n",
"        with comment_section:\n",
"            # Comment section\n",
"            with gr.Row():\n",
"                comment_option = gr.Radio(\n",
"                    choices=[\n",
"                        \"Comment input code\",\n",
"                        \"Comment output code\",\n",
"                        \"Comment both\"\n",
"                    ],\n",
"                    label=\"Commenting Options\",\n",
"                    value=\"Comment input code\",\n",
"                    interactive=True\n",
"                )\n",
"            with gr.Row():\n",
"                in_comment_style = gr.Dropdown(choices=in_comment_style_choices, label=\"Select comment style for input code\", value=in_comment_style_choices[0], interactive=True)\n",
"                out_comment_style = gr.Dropdown(choices=out_comment_style_choices, label=\"Select comment style for output code\", value=out_comment_style_choices[0], interactive=True)\n",
"            with gr.Row():\n",
"                comment_model = gr.Dropdown([\"Claude\", \"GPT\", \"Gemini\"], label=\"Select model\", value=\"Claude\")\n",
"            with gr.Row():\n",
"                generate_comments = gr.Button(\"Generate comments\")\n",
"                download_comments = gr.Button(\"Download comments\")\n",
"            with gr.Row():\n",
"                in_comments = gr.Textbox(label=\"Comments for Input Code:\", lines=10)\n",
"                out_comments = gr.Textbox(label=\"Comments for Output Code:\", lines=10)\n",
"\n",
"        with unit_test_section:\n",
"            # Unit test part\n",
"            with gr.Row():\n",
"                unit_test_option = gr.Radio(\n",
"                    choices=[\n",
"                        \"Add unit test for input code\",\n",
"                        \"Add unit test for output code\",\n",
"                        \"Add unit test for both\"\n",
"                    ],\n",
"                    label=\"Unit Test Options\",\n",
"                    value=\"Add unit test for input code\",\n",
"                    interactive=True\n",
"                )\n",
"            with gr.Row():\n",
"                in_unit_test_style = gr.Dropdown(choices=in_unit_test_style_choices, label=\"Select unit test style for input code\", value=in_unit_test_style_choices[0], interactive=True)\n",
"                out_unit_test_style = gr.Dropdown(choices=out_unit_test_style_choices, label=\"Select unit test style for output code\", value=out_unit_test_style_choices[0], interactive=True)\n",
"            with gr.Row():\n",
"                unit_test_model = gr.Dropdown([\"Claude\", \"GPT\", \"Gemini\"], label=\"Select model\", value=\"Claude\")\n",
"            with gr.Row():\n",
"                generate_unit_test = gr.Button(\"Generate unit tests\")\n",
"                download_unit_test = gr.Button(\"Download unit tests\")\n",
"            with gr.Row():\n",
"                in_unit_test = gr.Textbox(label=\"Unit Test for Input Code:\", lines=10)\n",
"                out_unit_test = gr.Textbox(label=\"Unit Test for Output Code:\", lines=10)\n",
"\n",
"    in_lang.change(fn=change_lang, inputs=in_lang, outputs=[in_comment_style, in_unit_test_style])\n",
"    out_lang.change(fn=change_lang, inputs=out_lang, outputs=[out_comment_style, out_unit_test_style])\n",
"    input_file.change(fn=load_file_content, inputs=input_file, outputs=in_code)\n",
"\n",
"    convert.click(optimize, inputs=[in_lang, out_lang, in_code, input_instruct, convert_model], outputs=[out_code])\n",
"    download_code.click(download_fn, inputs=[in_code, out_code, in_code_file_name, out_code_file_name])\n",
"\n",
"    comment_show_up.click(fn=show_comment, inputs=current_selected, outputs=[comment_section, unit_test_section, comment_show_up, unit_test_show_up, current_selected])\n",
"    unit_test_show_up.click(fn=show_unit_test, inputs=current_selected, outputs=[comment_section, unit_test_section, comment_show_up, unit_test_show_up, current_selected])\n",
"\n",
"    generate_comments.click(generate_comments_fn, inputs=[comment_option, in_lang, out_lang, in_code, out_code, in_comment_style, out_comment_style, comment_model], outputs=[in_comments, out_comments])\n",
"    download_comments.click(download_fn, inputs=[in_comments, out_comments, in_comments_file_name, out_comments_file_name])\n",
"    generate_unit_test.click(generate_unit_test_fn, inputs=[unit_test_option, in_lang, out_lang, in_code, out_code, in_unit_test_style, out_unit_test_style, unit_test_model], outputs=[in_unit_test, out_unit_test])\n",
"    download_unit_test.click(download_fn, inputs=[in_unit_test, out_unit_test, in_unit_test_file_name, out_unit_test_file_name])\n",
"\n",
"ui.launch()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0266734c-0bee-46c0-9b17-9fd2ae86cc3a",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}