Merge pull request #510 from Zhufeng-Qiu/zephyr-week3_4_5

Add the community contribution for Week3/4/5
This commit is contained in:
Ed Donner
2025-07-12 14:12:54 -04:00
committed by GitHub
3 changed files with 1915 additions and 0 deletions

@@ -0,0 +1,551 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "GD5Omr5EfWgb"
},
"source": [
"# Date Generator\n",
"\n",
"generate synthetic data when given scheme, business problem description, model, number of records, file name, file type, and environment\n",
"\n",
"# Available models\n",
" Model API:\n",
"\n",
" 1. gpt-4o-mini\n",
" 2. claude-3-haiku-20240307\n",
" 3. gemini-2.0-flash\n",
" 4. deepseek-chat\"\n",
"\n",
" HuggingFace API:\n",
"\n",
" 5. meta-llama/Meta-Llama-3.1-8B-Instruct\n",
"\n",
"\n",
"# Available environment\n",
"\n",
"Colab: set up HF token and API keys in Colab secret section\n",
"\n",
"Local: set up HF token and API keys in .env file\n",
"\n",
"\n",
"\n",
"### *** This project is developed based on the idea of 'week3/community-contributuins/Week3-Dataset_Generator-DP'. Really appreciate it! Then, the project is improved to run both on Colab or locally, and integrate HuggingFace API"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "4FiCnE0MmU56"
},
"outputs": [],
"source": [
"!pip install -q --upgrade torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1+cu124 --index-url https://download.pytorch.org/whl/cu124\n",
"!pip install -q requests bitsandbytes==0.46.0 transformers==4.48.3 accelerate==1.3.0\n",
"!pip install anthropic dotenv pyarrow"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "JeyKw5guoH3r"
},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import requests\n",
"from IPython.display import Markdown, display, update_display\n",
"from openai import OpenAI\n",
"from huggingface_hub import login\n",
"from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n",
"from bs4 import BeautifulSoup\n",
"from typing import List\n",
"import google.generativeai\n",
"import anthropic\n",
"from itertools import chain\n",
"from dotenv import load_dotenv\n",
"import gradio as gr\n",
"import json\n",
"import pandas as pd\n",
"import random\n",
"import re\n",
"import subprocess\n",
"import pyarrow as pa\n",
"import torch\n",
"import gc"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "7UyjFdRZoIAS"
},
"outputs": [],
"source": [
"# --- Schema Definition ---\n",
"SCHEMA = [\n",
" (\"Name\", \"TEXT\", '\"Northern Cafe\"'),\n",
" (\"Location\", \"TEXT\", '\"2904 S Figueroa St, Los Angeles, CA 90007\"'),\n",
" (\"Type\", \"TEXT\", 'One of [\"Chinese\",\"Mexico\",\"French\",\"Korean\",\"Italy\"] or other potential types'),\n",
" (\"Average Price\", \"TEXT\", '\"$30\", or \"--\" if unkown'),\n",
" (\"History/Age\", \"INT\", 'integer age of resturant, e.g., 7'),\n",
" (\"Menu\", \"Array\", '[\"Beef Noodle\", \"Fried Rice\", \"Dumpling\", ...]'),\n",
"]"
]
},
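{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, one JSONL row matching the schema above looks like this. The values below are illustrative only (taken from the schema examples), not model output."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative only: one JSONL row matching the schema above (values are the schema examples)\n",
"sample_record = {\n",
"    \"Name\": \"Northern Cafe\",\n",
"    \"Location\": \"2904 S Figueroa St, Los Angeles, CA 90007\",\n",
"    \"Type\": \"Chinese\",\n",
"    \"Average Price\": \"$30\",\n",
"    \"History/Age\": 7,\n",
"    \"Menu\": [\"Beef Noodle\", \"Fried Rice\", \"Dumpling\"]\n",
"}\n",
"print(json.dumps(sample_record))"
]
},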
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "jXcTQATLoICV"
},
"outputs": [],
"source": [
"# Default schema text for the textbox\n",
"DEFAULT_SCHEMA_TEXT = \"\\n\".join([f\"{i+1}. {col[0]} ({col[1]}) Example: {col[2]}\" for i, col in enumerate(SCHEMA)])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "4Irf5JV3oIEe"
},
"outputs": [],
"source": [
"# Available models\n",
"MODELS = [\n",
" \"gpt-4o-mini\",\n",
" \"claude-3-haiku-20240307\",\n",
" \"gemini-2.0-flash\",\n",
" \"deepseek-chat\",\n",
" \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "JJ6r2SH9oIGf"
},
"outputs": [],
"source": [
"# Available file formats\n",
"FILE_FORMATS = [\".csv\", \".tsv\", \".jsonl\", \".parquet\", \".arrow\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "B98j45E3vq5g"
},
"outputs": [],
"source": [
"system_prompt = \"\"\"You are a helpful assistant whose main purpose is to generate datasets for a given business problem based on given schema.\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "lsX16cWfwf6x"
},
"outputs": [],
"source": [
"def get_env_info(env):\n",
" try:\n",
" global hf_token, openai_api_key, anthropic_api_key, google_api_key, deepseek_api_key\n",
" if env == \"Colab\":\n",
" # Colab environment\n",
" from google.colab import drive\n",
" from google.colab import userdata\n",
" hf_token = userdata.get('HF_TOKEN')\n",
" openai_api_key = userdata.get('OPENAI_API_KEY')\n",
" anthropic_api_key = userdata.get('ANTHROPIC_API_KEY')\n",
" google_api_key = userdata.get('GOOGLE_API_KEY')\n",
" deepseek_api_key = userdata.get('DEEPSEEK_API_KEY')\n",
" elif env == \"Local\":\n",
" # Local environment\n",
" load_dotenv(override=True)\n",
" hf_token = os.getenv('HF_TOKEN')\n",
" openai_api_key = os.getenv('OPENAI_API_KEY')\n",
" anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')\n",
" google_api_key = os.getenv('GOOGLE_API_KEY')\n",
" deepseek_api_key = os.getenv('DEEPSEEK_API_KEY')\n",
" except Exception as e:\n",
" raise Exception(f\"Please check your environment: {str(e)}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "2gLUFAwGv29Q"
},
"outputs": [],
"source": [
"def get_prompt(schema_text, business_problem, nr_records):\n",
" prompt = f\"\"\"\n",
" The problem is: {business_problem}\n",
"\n",
" Generate {nr_records} rows data in JSONL format, each line a JSON object with the following fields:\n",
"\n",
" {schema_text}\n",
"\n",
" Do NOT repeat column values from one row to another.\n",
"\n",
" Only output valid JSONL.\n",
" \"\"\"\n",
" return prompt.strip()"
]
},
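{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check: render the prompt for the default schema and a small record count before sending it to a model."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Preview the exact user prompt that will be sent to the model\n",
"print(get_prompt(DEFAULT_SCHEMA_TEXT, \"I want to generate restaurant records\", 12))"
]
},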
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "YZe1FVH8wf84"
},
"outputs": [],
"source": [
"# --- LLM Interface ---\n",
"def query(user_prompt, model):\n",
" try:\n",
" if \"gpt\" in model.lower():\n",
" client = OpenAI(api_key=openai_api_key)\n",
" messages = [\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ]\n",
" response = client.chat.completions.create(\n",
" model=model,\n",
" messages=messages,\n",
" temperature=0.7\n",
" )\n",
" content = response.choices[0].message.content\n",
"\n",
" elif \"claude\" in model.lower():\n",
" client = anthropic.Anthropic(api_key=anthropic_api_key)\n",
" response = client.messages.create(\n",
" model=model,\n",
" messages=[{\"role\": \"user\", \"content\": user_prompt}],\n",
" max_tokens=4000,\n",
" temperature=0.7,\n",
" system=system_prompt\n",
" )\n",
" content = response.content[0].text\n",
" elif \"gemini\" in model.lower():\n",
" client = OpenAI(\n",
" api_key=google_api_key,\n",
" base_url=\"https://generativelanguage.googleapis.com/v1beta/openai/\"\n",
" )\n",
" messages = [\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ]\n",
" response = client.chat.completions.create(\n",
" model=model,\n",
" messages=messages,\n",
" temperature=0.7\n",
" )\n",
" content = response.choices[0].message.content\n",
"\n",
" elif \"deepseek\" in model.lower():\n",
" client = OpenAI(\n",
" api_key=deepseek_api_key,\n",
" base_url=\"https://api.deepseek.com\"\n",
" )\n",
" messages = [\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ]\n",
" response = client.chat.completions.create(\n",
" model=model,\n",
" messages=messages,\n",
" temperature=0.7\n",
" )\n",
" content = response.choices[0].message.content\n",
"\n",
" elif \"llama\" in model.lower():\n",
" global tokenizer, inputs, llama_model, outputs\n",
" messages = [\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ]\n",
"\n",
" login(hf_token, add_to_git_credential=True)\n",
" quant_config = BitsAndBytesConfig(\n",
" load_in_4bit=True,\n",
" bnb_4bit_use_double_quant=True,\n",
" bnb_4bit_compute_dtype=torch.bfloat16,\n",
" bnb_4bit_quant_type=\"nf4\"\n",
" )\n",
"\n",
" tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)\n",
" tokenizer.pad_token = tokenizer.eos_token\n",
" inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(\"cuda\")\n",
" if llama_model == None:\n",
" llama_model = AutoModelForCausalLM.from_pretrained(model, device_map=\"auto\", quantization_config=quant_config)\n",
" outputs = llama_model.generate(inputs, max_new_tokens=4000)\n",
"\n",
" _, _, after = tokenizer.decode(outputs[0]).partition(\"assistant<|end_header_id|>\")\n",
" content = after.strip()\n",
" else:\n",
" raise ValueError(f\"Unsupported model. Use one of {MODELS}\")\n",
"\n",
" # Parse JSONL output\n",
" lines = [line.strip() for line in content.strip().splitlines() if line.strip().startswith(\"{\")]\n",
" return [json.loads(line) for line in lines]\n",
"\n",
" except Exception as e:\n",
" raise Exception(f\"Model query failed: {str(e)}\")"
]
},
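{
"cell_type": "markdown",
"metadata": {},
"source": [
"An optional smoke test for `query` outside the Gradio UI. This sketch assumes a local run with an `OPENAI_API_KEY` in your `.env`; switch the environment or model to match your setup."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Smoke test (assumes a local .env with OPENAI_API_KEY; use \"Colab\" plus secrets on Colab)\n",
"get_env_info(\"Local\")\n",
"records = query(get_prompt(DEFAULT_SCHEMA_TEXT, \"I want to generate restaurant records\", 12), \"gpt-4o-mini\")\n",
"print(f\"Parsed {len(records)} records; first: {records[0]}\")"
]
},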
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "4WUj-XqM5IYT"
},
"outputs": [],
"source": [
"# --- Output Formatter ---\n",
"def save_dataset(records, file_format, filename):\n",
" df = pd.DataFrame(records)\n",
" if file_format == \".csv\":\n",
" df.to_csv(filename, index=False)\n",
" elif file_format == \".tsv\":\n",
" df.to_csv(filename, sep=\"\\t\", index=False)\n",
" elif file_format == \".jsonl\":\n",
" with open(filename, \"w\") as f:\n",
" for record in records:\n",
" f.write(json.dumps(record) + \"\\n\")\n",
" elif file_format == \".parquet\":\n",
" df.to_parquet(filename, engine=\"pyarrow\", index=False)\n",
" elif file_format == \".arrow\":\n",
" table = pa.Table.from_pandas(df)\n",
" with pa.OSFile(filename, \"wb\") as sink:\n",
" with pa.ipc.new_file(sink, table.schema) as writer:\n",
" writer.write(table)\n",
" else:\n",
" raise ValueError(\"Unsupported file format\")"
]
},
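{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small round-trip check for `save_dataset`: write a couple of hand-written rows (illustrative values only) as JSONL and read them back with pandas."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Round-trip check with two illustrative rows\n",
"demo_records = [\n",
"    {\"Name\": \"Northern Cafe\", \"Type\": \"Chinese\", \"History/Age\": 7},\n",
"    {\"Name\": \"Casa Demo\", \"Type\": \"Mexican\", \"History/Age\": 12}\n",
"]\n",
"save_dataset(demo_records, \".jsonl\", \"demo.jsonl\")\n",
"print(pd.read_json(\"demo.jsonl\", lines=True))"
]
},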
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "WenbNqrpwf-_"
},
"outputs": [],
"source": [
"# --- Main Generation Function ---\n",
"def generate_dataset(schema_text, business_problem, model, nr_records, file_format, save_as, env):\n",
" try:\n",
" # Validation\n",
" if nr_records <= 10:\n",
" return \"❌ Error: Number of records must be greater than 10.\", None\n",
" if nr_records > 1000:\n",
" return \"❌ Error: Number of records must be less than or equal to 1000.\", None\n",
"\n",
" if file_format not in FILE_FORMATS:\n",
" return \"❌ Error: Invalid file format.\", None\n",
"\n",
" if not (save_as or save_as.strip() == \"\"):\n",
" save_as = f\"default{file_format}\"\n",
" elif not save_as.endswith(file_format):\n",
" save_as = save_as + file_format\n",
"\n",
" # Load env\n",
" get_env_info(env)\n",
"\n",
" # Generate prompt\n",
" user_prompt = get_prompt(schema_text, business_problem, nr_records)\n",
"\n",
" # Query model\n",
" records = query(user_prompt, model)\n",
"\n",
" if not records:\n",
" return \"❌ Error: No valid records generated from the model.\", None\n",
"\n",
" # Save dataset\n",
" save_dataset(records, file_format, save_as)\n",
"\n",
" # Create preview\n",
" df = pd.DataFrame(records)\n",
" preview = df.head(10) # Show first 10 rows\n",
"\n",
" success_message = f\"✅ Generated {len(records)} records successfully!\\n📁 Saved to: {save_as}\\n📊 \"\n",
"\n",
" return success_message, preview\n",
"\n",
" except Exception as e:\n",
" return f\"❌ Error: {str(e)}\", None"
]
},
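{
"cell_type": "markdown",
"metadata": {},
"source": [
"`generate_dataset` can also be called directly, without the UI below; the Gradio button simply wires these same arguments through. This sketch assumes a local `.env` with the relevant API key."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Programmatic use, bypassing the Gradio UI (assumes a local .env with API keys)\n",
"status, preview = generate_dataset(\n",
"    DEFAULT_SCHEMA_TEXT,\n",
"    \"I want to generate restaurant records\",\n",
"    \"gpt-4o-mini\",\n",
"    20,\n",
"    \".csv\",\n",
"    \"restaurant_demo\",\n",
"    \"Local\"\n",
")\n",
"print(status)\n",
"preview"
]
},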
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "pHiP8ky8wgEb"
},
"outputs": [],
"source": [
"# --- Gradio Interface ---\n",
"\n",
"with gr.Blocks(title=\"Dataset Generator\", theme=gr.themes.Citrus()) as interface:\n",
" hf_token = None\n",
" openai_api_key = None\n",
" anthropic_api_key = None\n",
" google_api_key = None\n",
" deepseek_api_key = None\n",
" tokenizer = None\n",
" inputs = None\n",
" llama_model = None\n",
" outputs = None\n",
"\n",
" gr.Markdown(\"# Dataset Generator\")\n",
" gr.Markdown(\"Generate synthetic datasets using AI models\")\n",
"\n",
" with gr.Row():\n",
" with gr.Column(scale=2):\n",
" schema_input = gr.Textbox(\n",
" label=\"Schema\",\n",
" value=DEFAULT_SCHEMA_TEXT,\n",
" lines=15,\n",
" placeholder=\"Define your dataset schema here... Please follow this format: Field_Name, Field_Type, Field Example\"\n",
" )\n",
"\n",
" business_problem_input = gr.Textbox(\n",
" label=\"Business Problem\",\n",
" value=\"I want to generate restuant records\",\n",
" lines=1,\n",
" placeholder=\"Enter business problem desciption for the model...\"\n",
" )\n",
"\n",
" with gr.Row():\n",
" model_dropdown = gr.Dropdown(\n",
" label=\"Model\",\n",
" choices=MODELS,\n",
" value=MODELS[0],\n",
" interactive=True\n",
" )\n",
"\n",
" nr_records_input = gr.Number(\n",
" label=\"Number of records\",\n",
" value=27,\n",
" minimum=11,\n",
" maximum=1000,\n",
" step=1\n",
" )\n",
"\n",
" with gr.Row():\n",
" save_as_input = gr.Textbox(\n",
" label=\"Save as\",\n",
" value=\"restaurant_dataset\",\n",
" placeholder=\"Enter filename (extension will be added automatically)\"\n",
" )\n",
"\n",
" file_format_dropdown = gr.Dropdown(\n",
" label=\"File format\",\n",
" choices=FILE_FORMATS,\n",
" value=FILE_FORMATS[0],\n",
" interactive=True\n",
" )\n",
"\n",
" env_dropdown = gr.Dropdown(\n",
" label=\"Environment\",\n",
" choices=[\"Colab\", \"Local\"],\n",
" value=\"Colab\",\n",
" interactive=True\n",
" )\n",
"\n",
"\n",
"\n",
" generate_btn = gr.Button(\"🚀 Generate\", variant=\"secondary\", size=\"lg\")\n",
"\n",
" with gr.Column(scale=1):\n",
" output_status = gr.Textbox(\n",
" label=\"Status\",\n",
" lines=4,\n",
" interactive=False\n",
" )\n",
"\n",
" output_preview = gr.Dataframe(\n",
" label=\"Preview (First 10 rows)\",\n",
" interactive=False,\n",
" wrap=True\n",
" )\n",
"\n",
" # Connect the generate button\n",
" generate_btn.click(\n",
" fn=generate_dataset,\n",
" inputs=[\n",
" schema_input,\n",
" business_problem_input,\n",
" model_dropdown,\n",
" nr_records_input,\n",
" file_format_dropdown,\n",
" save_as_input,\n",
" env_dropdown\n",
" ],\n",
" outputs=[output_status, output_preview]\n",
" )\n",
"\n",
" gr.Markdown(\"\"\"\n",
" ### 📝 Instructions:\n",
" 1. **Schema**: Define the structure of your dataset (pre-filled with restaurant schema)\n",
" 2. **Business problem**: User prompt to guide the AI model\n",
" 3. **Model**: Choose between GPT, Claude, Gemini, DeepSeek or Llama models\n",
" 4. **Number of records**: Number of records to generate (minimum 11)\n",
" 5. **File format**: Choose output format (.csv, .tsv, .jsonl, .parquet, .arrow)\n",
" 6. **Save as**: Filename (extension added automatically)\n",
" 7. Click **Generate** to create your dataset\n",
"\n",
" ### 🔧 Requirements:\n",
" - For local mode, set up HF token and API keys in `.env` file (`OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `GOOGLE_API_KEY`, `DEEPSEEK_API_KEY`, `HF_TOKEN`)\n",
" - For colab mode, set up HF token and API keys in Colab secret section (`OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `GOOGLE_API_KEY`, `DEEPSEEK_API_KEY`, `HF_TOKEN`)\n",
" \"\"\")\n",
"\n",
"interface.launch(debug=True)\n",
"\n",
"del tokenizer, inputs, llama_model, outputs\n",
"gc.collect()\n",
"torch.cuda.empty_cache()"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

@@ -0,0 +1,523 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "It89APiAtTUF"
},
"source": [
"# Create meeting minutes from an Audio file\n",
"\n",
"I downloaded some Denver City Council meeting minutes and selected a portion of the meeting for us to transcribe. You can download it here: \n",
"https://drive.google.com/file/d/1N_kpSojRR5RYzupz6nqM8hMSoEF_R7pU/view?usp=sharing\n",
"\n",
"If you'd rather work with the original data, the HuggingFace dataset is [here](https://huggingface.co/datasets/huuuyeah/meetingbank) and the audio can be downloaded [here](https://huggingface.co/datasets/huuuyeah/MeetingBank_Audio/tree/main).\n",
"\n",
"The goal of this product is to use the Audio to generate meeting minutes, including actions.\n",
"\n",
"For this project, you can either use the Denver meeting minutes, or you can record something of your own!\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "sJPSCwPX3MOV"
},
"source": [
"## Again - please note: 2 important pro-tips for using Colab:\n",
"\n",
"**Pro-tip 1:**\n",
"\n",
"The top of every colab has some pip installs. You may receive errors from pip when you run this, such as:\n",
"\n",
"> gcsfs 2025.3.2 requires fsspec==2025.3.2, but you have fsspec 2025.3.0 which is incompatible.\n",
"\n",
"These pip compatibility errors can be safely ignored; and while it's tempting to try to fix them by changing version numbers, that will actually introduce real problems!\n",
"\n",
"**Pro-tip 2:**\n",
"\n",
"In the middle of running a Colab, you might get an error like this:\n",
"\n",
"> Runtime error: CUDA is required but not available for bitsandbytes. Please consider installing [...]\n",
"\n",
"This is a super-misleading error message! Please don't try changing versions of packages...\n",
"\n",
"This actually happens because Google has switched out your Colab runtime, perhaps because Google Colab was too busy. The solution is:\n",
"\n",
"1. Kernel menu >> Disconnect and delete runtime\n",
"2. Reload the colab from fresh and Edit menu >> Clear All Outputs\n",
"3. Connect to a new T4 using the button at the top right\n",
"4. Select \"View resources\" from the menu on the top right to confirm you have a GPU\n",
"5. Rerun the cells in the colab, from the top down, starting with the pip installs\n",
"\n",
"And all should work great - otherwise, ask me!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "f2vvgnFpHpID"
},
"outputs": [],
"source": [
"!pip install -q --upgrade torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1+cu124 --index-url https://download.pytorch.org/whl/cu124\n",
"!pip install -q requests bitsandbytes==0.46.0 transformers==4.48.3 accelerate==1.3.0 openai"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "FW8nl3XRFrz0"
},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import requests\n",
"from IPython.display import Markdown, display, update_display\n",
"from openai import OpenAI\n",
"from google.colab import drive\n",
"from huggingface_hub import login\n",
"from google.colab import userdata\n",
"from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n",
"import torch"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "q3D1_T0uG_Qh"
},
"outputs": [],
"source": [
"# Constants\n",
"\n",
"AUDIO_MODEL = \"whisper-1\"\n",
"LLAMA = \"meta-llama/Meta-Llama-3.1-8B-Instruct\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Es9GkQ0FGCMt"
},
"outputs": [],
"source": [
"# New capability - connect this Colab to my Google Drive\n",
"# See immediately below this for instructions to obtain denver_extract.mp3\n",
"\n",
"drive.mount(\"/content/drive\")\n",
"audio_filename = \"/content/drive/MyDrive/llms/denver_extract.mp3\""
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "HTl3mcjyzIEE"
},
"source": [
"# Download denver_extract.mp3\n",
"\n",
"You can either use the same file as me, the extract from Denver city council minutes, or you can try your own..\n",
"\n",
"If you want to use the same as me, then please download my extract here, and put this on your Google Drive: \n",
"https://drive.google.com/file/d/1N_kpSojRR5RYzupz6nqM8hMSoEF_R7pU/view?usp=sharing\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "xYW8kQYtF-3L"
},
"outputs": [],
"source": [
"# Sign in to HuggingFace Hub\n",
"\n",
"hf_token = userdata.get('HF_TOKEN')\n",
"login(hf_token, add_to_git_credential=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "qP6OB2OeGC2C"
},
"outputs": [],
"source": [
"# Sign in to OpenAI using Secrets in Colab\n",
"\n",
"openai_api_key = userdata.get('OPENAI_API_KEY')\n",
"openai = OpenAI(api_key=openai_api_key)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "GMShdVGlGGr4"
},
"outputs": [],
"source": [
"# Use the Whisper OpenAI model to convert the Audio to Text\n",
"# If you'd prefer to use an Open Source model, class student Youssef has contributed an open source version\n",
"# which I've added to the bottom of this colab\n",
"\n",
"audio_file = open(audio_filename, \"rb\")\n",
"transcription = openai.audio.transcriptions.create(model=AUDIO_MODEL, file=audio_file, response_format=\"text\")\n",
"print(transcription)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "piEMmcSfMH-O"
},
"outputs": [],
"source": [
"system_message = \"You are an assistant that produces minutes of meetings from transcripts, with summary, key discussion points, takeaways and action items with owners, in markdown.\"\n",
"user_prompt = f\"Below is an extract transcript of a Denver council meeting. Please write minutes in markdown, including a summary with attendees, location and date; discussion points; takeaways; and action items with owners.\\n{transcription}\"\n",
"\n",
"messages = [\n",
" {\"role\": \"system\", \"content\": system_message},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "UcRKUgcxMew6"
},
"outputs": [],
"source": [
"quant_config = BitsAndBytesConfig(\n",
" load_in_4bit=True,\n",
" bnb_4bit_use_double_quant=True,\n",
" bnb_4bit_compute_dtype=torch.bfloat16,\n",
" bnb_4bit_quant_type=\"nf4\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "6CujZRAgMimy"
},
"outputs": [],
"source": [
"tokenizer = AutoTokenizer.from_pretrained(LLAMA)\n",
"tokenizer.pad_token = tokenizer.eos_token\n",
"# inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(\"cuda\")\n",
"streamer = TextStreamer(tokenizer)\n",
"model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map=\"auto\", quantization_config=quant_config, trust_remote_code=True)\n",
"# outputs = model.generate(inputs, max_new_tokens=2000, streamer=streamer)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "MaLNmJ5PSqcH"
},
"outputs": [],
"source": [
"inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(\"cuda\")\n",
"outputs = model.generate(inputs, max_new_tokens=2000, streamer=streamer)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "102tdU_3Peam"
},
"outputs": [],
"source": [
"response = tokenizer.decode(outputs[0])\n",
"response"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "KlomN6CwMdoN"
},
"outputs": [],
"source": [
"display(Markdown(response))"
]
},
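{
"cell_type": "markdown",
"metadata": {},
"source": [
"The decoded `response` above includes the whole chat template, prompt and all. To show only the model's reply, split on the assistant header token; this assumes the Llama 3.1 chat template (the Gradio handler below uses the same trick)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the text after the assistant header (assumes Llama 3.1's chat template)\n",
"_, _, reply = response.partition(\"assistant<|end_header_id|>\")\n",
"display(Markdown(reply.strip()))"
]
},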
{
"cell_type": "markdown",
"metadata": {
"id": "0jZElVOMSPAr"
},
"source": [
"Day5 exercise - Gradio UI for meeting minutes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "5iiYYxQMHf0i"
},
"outputs": [],
"source": [
"import gradio as gr\n",
"import tempfile\n",
"import soundfile as sf"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "aGwXW7BjPcTM"
},
"outputs": [],
"source": [
"# !pip install pydub\n",
"# !apt-get install ffmpeg\n",
"\n",
"from pydub import AudioSegment"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "RNu-reHuCYj_"
},
"outputs": [],
"source": [
"# Make sure that the tokenizeer and model is already generated\n",
"\n",
"# tokenizer = AutoTokenizer.from_pretrained(LLAMA)\n",
"# tokenizer.pad_token = tokenizer.eos_token\n",
"# streamer = TextStreamer(tokenizer)\n",
"# model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map=\"auto\", quantization_config=quant_config)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "KOuoH0YOPruE"
},
"outputs": [],
"source": [
"# def save_as_mp3(audio_np):\n",
"# sr, data = audio_np\n",
"# # Convert float32 or int16 to PCM wav and then mp3\n",
"# wav_path = tempfile.NamedTemporaryFile(suffix=\".wav\", delete=False).name\n",
"# mp3_path = tempfile.NamedTemporaryFile(suffix=\".mp3\", delete=False).name\n",
"\n",
"# sf.write(wav_path, data, sr)\n",
"# audio_segment = AudioSegment.from_wav(wav_path)\n",
"# audio_segment.export(mp3_path, format=\"mp3\", bitrate=\"64k\") # Low bitrate = small file\n",
"# return mp3_path"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "toBIPBJoSNw0"
},
"outputs": [],
"source": [
"# Handles audio input as numpy array and returns updated chat history\n",
"def speak_send(audio_np):\n",
"\n",
" # If use numpy as input: audio_input = gr.Audio(sources=\"upload\", type=\"numpy\", label=\"Upload audio file to generate meeting minutes\")\n",
" # mp3_path = save_as_mp3(audio_np)\n",
"\n",
" # with open(mp3_path, \"rb\") as audio_file:\n",
" # transcription = openai.audio.transcriptions.create(\n",
" # model=AUDIO_MODEL,\n",
" # file=audio_file,\n",
" # response_format=\"text\"\n",
" # )\n",
"\n",
" audio = AudioSegment.from_file(audio_np)\n",
" with tempfile.NamedTemporaryFile(suffix=\".mp3\", delete=False) as tmpfile:\n",
" audio.export(tmpfile.name, format=\"mp3\")\n",
" with open(tmpfile.name, \"rb\") as file:\n",
" transcript = openai.audio.transcriptions.create(\n",
" model=AUDIO_MODEL,\n",
" file=file,\n",
" response_format=\"text\"\n",
" )\n",
"\n",
" system_message = \"You are an assistant that produces minutes of meetings from transcripts, with summary, key discussion points, takeaways and action items with owners, in markdown.\"\n",
" user_prompt = f\"Below is an extract transcript of a Denver council meeting. Please write minutes in markdown, including a summary with attendees, location and date; discussion points; takeaways; and action items with owners.\\n{transcription}\"\n",
"\n",
" messages = [\n",
" {\"role\": \"system\", \"content\": system_message},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ]\n",
"\n",
" inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(\"cuda\")\n",
" outputs = model.generate(inputs, max_new_tokens=2000)\n",
"\n",
" _, _, after = tokenizer.decode(outputs[0]).partition(\"assistant<|end_header_id|>\")\n",
" return after.strip()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "xXJfabpDSN5R"
},
"outputs": [],
"source": [
"with gr.Blocks() as demo:\n",
"\n",
" with gr.Row():\n",
" audio_input = gr.Audio(sources=\"upload\", type=\"filepath\", label=\"Upload audio file to generate meeting minutes\")\n",
" with gr.Row():\n",
" audio_submit = gr.Button(\"Send\")\n",
" with gr.Row():\n",
" outputs = [gr.Markdown(label=\"Meeting minutes:\")]\n",
"\n",
" audio_submit.click(speak_send, inputs=audio_input, outputs=outputs)\n",
"\n",
"demo.launch(debug=True)\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "kuxYecT2QDQ9"
},
"source": [
"# Student contribution\n",
"\n",
"Student Emad S. has made this powerful variation that uses `TextIteratorStreamer` to stream back results into a Gradio UI, and takes advantage of background threads for performance! I'm sharing it here if you'd like to take a look at some very interesting work. Thank you, Emad!\n",
"\n",
"https://colab.research.google.com/drive/1Ja5zyniyJo5y8s1LKeCTSkB2xyDPOt6D"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "AU3uAEyU3a-o"
},
"source": [
"## Alternative implementation\n",
"\n",
"Class student Youssef has contributed this variation in which we use an open-source model to transcribe the meeting Audio.\n",
"\n",
"Thank you Youssef!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "phYYgAbBRvu5"
},
"outputs": [],
"source": [
"import torch\n",
"from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "HdQnWEzW3lzP"
},
"outputs": [],
"source": [
"AUDIO_MODEL = \"openai/whisper-medium\"\n",
"speech_model = AutoModelForSpeechSeq2Seq.from_pretrained(AUDIO_MODEL, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_safetensors=True)\n",
"speech_model.to('cuda')\n",
"processor = AutoProcessor.from_pretrained(AUDIO_MODEL)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "ZhA_fbeCSAeZ"
},
"outputs": [],
"source": [
"pipe = pipeline(\n",
" \"automatic-speech-recognition\",\n",
" model=speech_model,\n",
" tokenizer=processor.tokenizer,\n",
" feature_extractor=processor.feature_extractor,\n",
" torch_dtype=torch.float16,\n",
" device='cuda',\n",
" return_timestamps=True\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "nrQjKtD53omJ"
},
"outputs": [],
"source": [
"# Use the Whisper OpenAI model to convert the Audio to Text\n",
"result = pipe(audio_filename)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "G_XSljOY3tDf"
},
"outputs": [],
"source": [
"transcription = result[\"text\"]\n",
"print(transcription)"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

@@ -0,0 +1,841 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "4a6ab9a2-28a2-445d-8512-a0dc8d1b54e9",
"metadata": {},
"source": [
"# Power Coder\n",
"\n",
"1. Convert code between two programming language; supporting languages are Python, Java, JavaScript, TypeScript, C, C++, C#, Go, Rust, Kotlin, Swift, PHP, Julia\n",
"2. Automatically add docstring/comments based on selected comment style\n",
"3. Automatically generate unit tests based on selected unit test style\n",
"4. Supporting models: gpt-4o, claude-3-5-sonnet-20240620, gemini-2.5-flash\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e610bf56-a46e-4aff-8de1-ab49d62b1ad3",
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import io\n",
"import sys\n",
"import json\n",
"import requests\n",
"from dotenv import load_dotenv\n",
"from openai import OpenAI\n",
"import google.generativeai\n",
"import anthropic\n",
"from IPython.display import Markdown, display, update_display\n",
"import gradio as gr\n",
"import subprocess"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4f672e1c-87e9-4865-b760-370fa605e614",
"metadata": {},
"outputs": [],
"source": [
"# environment\n",
"\n",
"load_dotenv(override=True)\n",
"os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')\n",
"os.environ['ANTHROPIC_API_KEY'] = os.getenv('ANTHROPIC_API_KEY', 'your-key-if-not-using-env')\n",
"os.environ['GOOGLE_API_KEY'] = os.getenv('GOOGLE_API_KEY', 'your-key-if-not-using-env')\n",
"os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN', 'your-key-if-not-using-env')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8aa149ed-9298-4d69-8fe2-8f5de0f667da",
"metadata": {},
"outputs": [],
"source": [
"# initialize\n",
"\n",
"openai = OpenAI()\n",
"claude = anthropic.Anthropic()\n",
"gemini_via_openai_client = OpenAI(\n",
" api_key=os.environ['GOOGLE_API_KEY'], \n",
" base_url=\"https://generativelanguage.googleapis.com/v1beta/openai/\"\n",
")\n",
"OPENAI_MODEL = \"gpt-4o\"\n",
"CLAUDE_MODEL = \"claude-3-5-sonnet-20240620\"\n",
"GEMINI_MODEL = \"gemini-2.5-flash\""
]
},
{
"cell_type": "markdown",
"id": "37b204dd-f770-41d9-9b19-7e1baa5273cd",
"metadata": {},
"source": [
"## 1. Convesion Part"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6896636f-923e-4a2c-9d6c-fac07828a201",
"metadata": {},
"outputs": [],
"source": [
"def convert_system_prompt_for(in_lang, out_lang):\n",
" convert_system_message = f\"You are an assistant that reimplements {in_lang} code in high performance {out_lang}. \"\n",
" convert_system_message += f\"Respond only with {out_lang} code; use comments sparingly and do not provide any explanation other than occasional comments. \"\n",
" convert_system_message += f\"The {out_lang} response needs to produce an identical output in the fastest possible time. Keep implementations of random number generators identical so that results match exactly.\"\n",
" return convert_system_message"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8e7b3546-57aa-4c29-bc5d-f211970d04eb",
"metadata": {},
"outputs": [],
"source": [
"def convert_user_prompt_for(in_lang, out_lang, input_instruct, in_code):\n",
" convert_user_prompt = f\"Rewrite this {in_lang} code in {out_lang} with the fastest possible implementation that produces identical output in the least time. \"\n",
" convert_user_prompt += f\"Respond only with {out_lang} code; do not explain your work other than a few comments. \"\n",
" convert_user_prompt += f\"Pay attention to number types to ensure no int overflows. Remember to include all necessary {out_lang} packages or modules, for example, iomanip for C++.\\n\\n\"\n",
" if input_instruct:\n",
" convert_user_prompt += \"Addtional instruction is: \" + input_instruct\n",
" convert_user_prompt += in_code\n",
" return convert_user_prompt"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c6190659-f54c-4951-bef4-4960f8e51cc4",
"metadata": {},
"outputs": [],
"source": [
"def convert_messages_for(in_lang, out_lang, input_instruct, in_code):\n",
" return [\n",
" {\"role\": \"system\", \"content\": convert_system_prompt_for(in_lang, out_lang)},\n",
" {\"role\": \"user\", \"content\": convert_user_prompt_for(in_lang, out_lang, input_instruct, in_code)}\n",
" ]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c3b497b3-f569-420e-b92e-fb0f49957ce0",
"metadata": {},
"outputs": [],
"source": [
"python_hard = \"\"\"# Be careful to support large number sizes\n",
"\n",
"def lcg(seed, a=1664525, c=1013904223, m=2**32):\n",
" value = seed\n",
" while True:\n",
" value = (a * value + c) % m\n",
" yield value\n",
" \n",
"def max_subarray_sum(n, seed, min_val, max_val):\n",
" lcg_gen = lcg(seed)\n",
" random_numbers = [next(lcg_gen) % (max_val - min_val + 1) + min_val for _ in range(n)]\n",
" max_sum = float('-inf')\n",
" for i in range(n):\n",
" current_sum = 0\n",
" for j in range(i, n):\n",
" current_sum += random_numbers[j]\n",
" if current_sum > max_sum:\n",
" max_sum = current_sum\n",
" return max_sum\n",
"\n",
"def total_max_subarray_sum(n, initial_seed, min_val, max_val):\n",
" total_sum = 0\n",
" lcg_gen = lcg(initial_seed)\n",
" for _ in range(20):\n",
" seed = next(lcg_gen)\n",
" total_sum += max_subarray_sum(n, seed, min_val, max_val)\n",
" return total_sum\n",
"\n",
"# Parameters\n",
"n = 10000 # Number of random numbers\n",
"initial_seed = 42 # Initial seed for the LCG\n",
"min_val = -10 # Minimum value of random numbers\n",
"max_val = 10 # Maximum value of random numbers\n",
"\n",
"# Timing the function\n",
"import time\n",
"start_time = time.time()\n",
"result = total_max_subarray_sum(n, initial_seed, min_val, max_val)\n",
"end_time = time.time()\n",
"\n",
"print(\"Total Maximum Subarray Sum (20 runs):\", result)\n",
"print(\"Execution Time: {:.6f} seconds\".format(end_time - start_time))\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0be9f47d-5213-4700-b0e2-d444c7c738c0",
"metadata": {},
"outputs": [],
"source": [
"def convert_stream_gpt(in_lang, out_lang, input_instruct, in_code): \n",
" stream = openai.chat.completions.create(model=OPENAI_MODEL, messages=convert_messages_for(in_lang, out_lang, input_instruct, in_code), temperature=0.0, stream=True)\n",
" reply = \"\"\n",
" for chunk in stream:\n",
" fragment = chunk.choices[0].delta.content or \"\"\n",
" reply += fragment\n",
" yield reply"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8669f56b-8314-4582-a167-78842caea131",
"metadata": {},
"outputs": [],
"source": [
"def convert_stream_claude(in_lang, out_lang, input_instruct, in_code):\n",
" result = claude.messages.stream(\n",
" model=CLAUDE_MODEL,\n",
" max_tokens=2000,\n",
" temperature=0.0,\n",
" system=convert_system_prompt_for(in_lang, out_lang),\n",
" messages=[{\"role\": \"user\", \"content\": convert_user_prompt_for(in_lang, out_lang, input_instruct, in_code)}],\n",
" )\n",
" reply = \"\"\n",
" with result as stream:\n",
" for text in stream.text_stream:\n",
" reply += text\n",
" yield reply"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "01d3cd4f-c100-4e25-8670-0663513f6136",
"metadata": {},
"outputs": [],
"source": [
"def convert_stream_gemini(in_lang, out_lang, input_instruct, in_code): \n",
" stream = gemini_via_openai_client.chat.completions.create(model=GEMINI_MODEL, messages=convert_messages_for(in_lang, out_lang, input_instruct, in_code), temperature=0.0, stream=True)\n",
" reply = \"\"\n",
" for chunk in stream:\n",
" fragment = chunk.choices[0].delta.content or \"\"\n",
" reply += fragment\n",
" yield reply"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2f1ae8f5-16c8-40a0-aa18-63b617df078d",
"metadata": {},
"outputs": [],
"source": [
"def optimize(in_lang, out_lang, in_code, input_instruct, convert_model):\n",
" if \"gpt\" in convert_model.lower():\n",
" result = convert_stream_gpt(in_lang, out_lang, input_instruct, in_code)\n",
" elif \"claude\" in convert_model.lower():\n",
" result = convert_stream_claude(in_lang, out_lang, input_instruct, in_code)\n",
" elif \"gemini\" in convert_model.lower():\n",
" result = convert_stream_gemini(in_lang, out_lang, input_instruct, in_code)\n",
" else:\n",
" raise ValueError(\"Unknown convert model\")\n",
" for stream_so_far in result:\n",
" yield stream_so_far "
]
},
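{
"cell_type": "markdown",
"id": "optimize-stream-demo-md",
"metadata": {},
"source": [
"Each `convert_stream_*` function yields the accumulated reply so far, which makes `optimize` a generator; Gradio drains it to stream text into the output box. Outside the UI you can drain it yourself, as in this sketch (assumes the API keys above are set):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "optimize-stream-demo",
"metadata": {},
"outputs": [],
"source": [
"# Drain the streaming generator outside Gradio and keep the final reply\n",
"final_code = \"\"\n",
"for partial in optimize(\"Python\", \"Java\", python_hard, \"\", \"GPT\"):\n",
"    final_code = partial\n",
"print(final_code)"
]
},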
{
"cell_type": "markdown",
"id": "07383878-f887-464f-8bc7-527c669d3edd",
"metadata": {},
"source": [
"## 2. Comment part"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d254038c-fdd6-4ef8-8b7a-a074f1e7405d",
"metadata": {},
"outputs": [],
"source": [
"def comment_system_prompt_for(lang, comment_style):\n",
" comment_system_message = f\"You are an assistant that generate necessary, concise and clear comment/docstring for the {lang} code by applying {comment_style} comment style. \"\n",
" comment_system_message += f\"Respond only with added comments, and do not provide any redundant explanation. \"\n",
" return comment_system_message"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e95cee4f-f229-4c9f-8e67-8a68cc9534c3",
"metadata": {},
"outputs": [],
"source": [
"def comment_user_prompt_for(lang, code, comment_style):\n",
" comment_user_prompt = f\"Add the comments/docstring on the given code for the {lang} programming language in {comment_style} comment style. \"\n",
" comment_user_prompt += f\"Respond only with added comments, and do not provide any redundant explanation.\\n\\n\"\n",
" comment_user_prompt += f\"The given code is as follows: \"\n",
" comment_user_prompt += code\n",
" return comment_user_prompt"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "507426c2-cf5a-4041-b904-b18a5afe83b6",
"metadata": {},
"outputs": [],
"source": [
"def comment_messages_for(lang, code, comment_style):\n",
" return [\n",
" {\"role\": \"system\", \"content\": comment_system_prompt_for(lang, comment_style)},\n",
" {\"role\": \"user\", \"content\": comment_user_prompt_for(lang, code, comment_style)}\n",
" ]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7e1c8cf6-7a15-4e79-82f6-6bb2a0b85773",
"metadata": {},
"outputs": [],
"source": [
"def comment_stream_gpt(lang, code, comment_style): \n",
" stream = openai.chat.completions.create(model=OPENAI_MODEL, messages=comment_messages_for(lang, code, comment_style), temperature=0.0, stream=True)\n",
" reply = \"\"\n",
" for chunk in stream:\n",
" fragment = chunk.choices[0].delta.content or \"\"\n",
" reply += fragment\n",
" yield reply"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "26f27781-4a3e-4e5f-a8ab-9a25944a9879",
"metadata": {},
"outputs": [],
"source": [
"def comment_stream_claude(lang, code, comment_style):\n",
" result = claude.messages.stream(\n",
" model=CLAUDE_MODEL,\n",
" max_tokens=2000,\n",
" temperature=0.0,\n",
" system=comment_system_prompt_for(lang, comment_style),\n",
" messages=[{\"role\": \"user\", \"content\": comment_user_prompt_for(lang, code, comment_style)}],\n",
" )\n",
" reply = \"\"\n",
" with result as stream:\n",
" for text in stream.text_stream:\n",
" reply += text\n",
" yield reply"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8e6719e7-f2f3-40ea-8fed-01d84a641306",
"metadata": {},
"outputs": [],
"source": [
"def comment_stream_gemini(lang, code, comment_style): \n",
" stream = gemini_via_openai_client.chat.completions.create(model=GEMINI_MODEL, messages=comment_messages_for(lang, code, comment_style), temperature=0.0, stream=True)\n",
" reply = \"\"\n",
" for chunk in stream:\n",
" fragment = chunk.choices[0].delta.content or \"\"\n",
" reply += fragment\n",
" yield reply"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2b98acc4-23d8-4671-8f19-92d72631b55d",
"metadata": {},
"outputs": [],
"source": [
"def generate_comments_via_model(lang, code, comment_style, comment_model):\n",
" if \"gpt\" in comment_model.lower():\n",
" result = comment_stream_gpt(lang, code, comment_style)\n",
" elif \"claude\" in comment_model.lower():\n",
" result = comment_stream_claude(lang, code, comment_style)\n",
" elif \"gemini\" in comment_model.lower():\n",
" result = comment_stream_gemini(lang, code, comment_style)\n",
" else:\n",
" raise ValueError(\"Unknown comment model\")\n",
" for stream_so_far in result:\n",
" yield stream_so_far "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "282c75ae-d8c3-4866-a024-f7ecf87b3cde",
"metadata": {},
"outputs": [],
"source": [
"def generate_comments_fn(comment_option, in_lang, out_lang, in_code, out_code, in_comment_style, out_comment_style, comment_model):\n",
" if 'input' in comment_option:\n",
" in_gen = generate_comments_via_model(in_lang, in_code, in_comment_style, comment_model)\n",
" for in_output in in_gen:\n",
" yield in_output, \"\"\n",
" elif 'output' in comment_option:\n",
" out_gen = generate_comments_via_model(out_lang, out_code, out_comment_style, comment_model)\n",
" for out_output in out_gen:\n",
" yield \"\", out_output\n",
" elif 'both' in comment_option:\n",
" in_gen = generate_comments_via_model(in_lang, in_code, in_comment_style, comment_model)\n",
" out_gen = generate_comments_via_model(out_lang, out_code, out_comment_style, comment_model)\n",
" for in_output, out_output in zip(in_gen, out_gen):\n",
" yield in_output, out_output"
]
},
{
"cell_type": "markdown",
"id": "ce2c178c-d03c-49c0-b0e9-c57c699bca08",
"metadata": {},
"source": [
"## 3. Unit test part"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e5a4743e-e1a8-42c7-8f1f-a73d49c0895d",
"metadata": {},
"outputs": [],
"source": [
"def unit_test_system_prompt_for(lang, unit_test_style):\n",
" unit_test_system_message = f\"You are an assistant that generate necessary, concise, clear and executable unit tests for the {lang} code by applying {unit_test_style} unit test style. \"\n",
" unit_test_system_message += f\"Respond only with generated unit tests; use comments sparingly and do not provide any explanation other than occasional comments. \"\n",
" return unit_test_system_message"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "334d5e40-71ff-4d24-8cef-b6c81c188e4d",
"metadata": {},
"outputs": [],
"source": [
"def unit_test_user_prompt_for(lang, code, unit_test_style):\n",
" unit_test_user_prompt = f\"Add the unit tests on the given code for the {lang} programming language in {unit_test_style} unit test style. \"\n",
" unit_test_user_prompt += f\"Respond only with generated unit tests; use comments sparingly and do not provide any explanation other than occasional comments.\\n\\n\"\n",
" unit_test_user_prompt += f\"The given code is as follows: \"\n",
" unit_test_user_prompt += code\n",
" return unit_test_user_prompt"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8a8e061f-3993-4746-9425-d938d2537f65",
"metadata": {},
"outputs": [],
"source": [
"def unit_test_messages_for(lang, code, unit_test_style):\n",
" return [\n",
" {\"role\": \"system\", \"content\": unit_test_system_prompt_for(lang, unit_test_style)},\n",
" {\"role\": \"user\", \"content\": unit_test_user_prompt_for(lang, code, unit_test_style)}\n",
" ]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "71c1613b-7a16-4443-acec-d0a2d9bed192",
"metadata": {},
"outputs": [],
"source": [
"def unit_test_stream_gpt(lang, code, unit_test_style): \n",
" stream = openai.chat.completions.create(model=OPENAI_MODEL, messages=unit_test_messages_for(lang, code, unit_test_style), stream=True)\n",
" reply = \"\"\n",
" for chunk in stream:\n",
" fragment = chunk.choices[0].delta.content or \"\"\n",
" reply += fragment\n",
" yield reply"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8a6e3502-f7ff-42b8-8fc5-2697b2d1f36e",
"metadata": {},
"outputs": [],
"source": [
"def unit_test_stream_claude(lang, code, unit_test_style):\n",
" result = claude.messages.stream(\n",
" model=CLAUDE_MODEL,\n",
" max_tokens=2000,\n",
" system=unit_test_system_prompt_for(lang, unit_test_style),\n",
" messages=[{\"role\": \"user\", \"content\": unit_test_user_prompt_for(lang, code, unit_test_style)}],\n",
" )\n",
" reply = \"\"\n",
" with result as stream:\n",
" for text in stream.text_stream:\n",
" reply += text\n",
" yield reply"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8d7f694f-a276-4bdc-9cfb-755483fd4380",
"metadata": {},
"outputs": [],
"source": [
"def unit_test_stream_gemini(lang, code, unit_test_style): \n",
" stream = gemini_via_openai_client.chat.completions.create(model=GEMINI_MODEL, messages=unit_test_messages_for(lang, code, unit_test_style), stream=True)\n",
" reply = \"\"\n",
" for chunk in stream:\n",
" fragment = chunk.choices[0].delta.content or \"\"\n",
" reply += fragment\n",
" yield reply"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c824429a-b18a-4320-8258-0141037a6531",
"metadata": {},
"outputs": [],
"source": [
"def generate_unit_test_via_model(lang, code, unit_test_style, unit_test_model):\n",
" if \"gpt\" in unit_test_model.lower():\n",
" result = unit_test_stream_gpt(lang, code, unit_test_style)\n",
" elif \"claude\" in unit_test_model.lower():\n",
" result = unit_test_stream_claude(lang, code, unit_test_style)\n",
" elif \"gemini\" in unit_test_model.lower():\n",
" result = unit_test_stream_gemini(lang, code, unit_test_style)\n",
" else:\n",
" raise ValueError(\"Unknown unit test model\")\n",
" for stream_so_far in result:\n",
" yield stream_so_far "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c3e59e26-37c0-4429-b69c-deb581423dd0",
"metadata": {},
"outputs": [],
"source": [
"def generate_unit_test_fn(unit_test_option, in_lang, out_lang, in_code, out_code, in_unit_test_style, out_unit_test_style, unit_test_model):\n",
" if 'input' in unit_test_option:\n",
" in_gen = generate_unit_test_via_model(in_lang, in_code, in_unit_test_style, unit_test_model)\n",
" for in_output in in_gen:\n",
" yield in_output, \"\"\n",
" elif 'output' in unit_test_option:\n",
" out_gen = generate_unit_test_via_model(out_lang, out_code, out_unit_test_style, unit_test_model)\n",
" for out_output in out_gen:\n",
" yield \"\", out_output\n",
" elif 'both' in unit_test_option:\n",
" in_gen = generate_unit_test_via_model(in_lang, in_code, in_unit_test_style, unit_test_model)\n",
" out_gen = generate_unit_test_via_model(out_lang, out_code, out_unit_test_style, unit_test_model)\n",
" for in_output, out_output in zip(in_gen, out_gen):\n",
" yield in_output, out_output"
]
},
{
"cell_type": "markdown",
"id": "2a1f4d0c-f417-4de4-be9f-441cbe5a6db3",
"metadata": {},
"source": [
"## 4. Gradio UI part"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9a2274f1-d03b-42c0-8dcc-4ce159b18442",
"metadata": {},
"outputs": [],
"source": [
"LANGUAGE_INFO = {\n",
" \"Python\": {\n",
" \"doc_style\": [\"Google-style\", \"NumPy-style\", \"reST\", \"Doxygen\"],\n",
" \"unit_test_style\": [\"unittest\", \"pytest\", \"doctest\"]\n",
" },\n",
" \"Java\": {\n",
" \"doc_style\": [\"Javadoc\"],\n",
" \"unit_test_style\": [\"JUnit4\", \"JUnit5\", \"TestNG\"]\n",
" },\n",
" \"JavaScript\": {\n",
" \"doc_style\": [\"JSDoc\"],\n",
" \"unit_test_style\": [\"Jest\", \"Mocha + Chai\", \"Jasmine\"]\n",
" },\n",
" \"TypeScript\": {\n",
" \"doc_style\": [\"JSDoc\", \"TSDoc\"],\n",
" \"unit_test_style\": [\"Jest\", \"Mocha + Chai\", \"Vitest\"]\n",
" },\n",
" \"C\": {\n",
" \"doc_style\": [\"Doxygen\"],\n",
" \"unit_test_style\": [\"Google Test (gtest)\", \"CppUnit\", \"Catch2\"]\n",
" },\n",
" \"C++\": {\n",
" \"doc_style\": [\"Doxygen\"],\n",
" \"unit_test_style\": [\"Google Test (gtest)\", \"CppUnit\", \"Catch2\"]\n",
" },\n",
" \"C#\": {\n",
" \"doc_style\": [\"XML comments\"],\n",
" \"unit_test_style\": [\"xUnit\", \"NUnit\", \"MSTest\"]\n",
" },\n",
" \"Go\": {\n",
" \"doc_style\": [\"Godoc\"],\n",
" \"unit_test_style\": [\"Built-in testing package\"]\n",
" },\n",
" \"Rust\": {\n",
" \"doc_style\": [\"Rustdoc\", \"Markdown\"],\n",
" \"unit_test_style\": [\"Built-in #[test] annotation\"]\n",
" },\n",
" \"Kotlin\": {\n",
" \"doc_style\": [\"KDoc\"],\n",
" \"unit_test_style\": [\"JUnit\", \"Kotest\", \"Spek\"]\n",
" },\n",
" \"Swift\": {\n",
" \"doc_style\": [\"Mark-style comments\"],\n",
" \"unit_test_style\": [\"XCTest\"]\n",
" },\n",
" \"PHP\": {\n",
" \"doc_style\": [\"PHPDoc\"],\n",
" \"unit_test_style\": [\"PHPUnit\"]\n",
" },\n",
" \"Julia\": {\n",
" \"doc_style\": [\"Markdown\"],\n",
" \"unit_test_style\": [\"Built-in Test standard library\"]\n",
" }\n",
"}\n",
"LANGUAGES = list(LANGUAGE_INFO.keys())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b50e7833-8f6f-407e-8174-37af9cec2030",
"metadata": {},
"outputs": [],
"source": [
"with gr.Blocks(title=\"Power Coder\", theme=gr.themes.Citrus(), css=\"\"\"\n",
".selected {\n",
" background-color: orange !important;\n",
" box-shadow: 0 4px 12px rgba(255, 140, 0, 0.5) !important;\n",
" color: black;\n",
"}\n",
".unselected {\n",
" background-color: gray !important;\n",
" box-shadow: 0 4px 12px rgba(128, 128, 128, 0.4);\n",
" color: white;\n",
"}\n",
"\"\"\") as ui:\n",
" current_selected = gr.State(\"\")\n",
" initial_in_lang = \"Python\"\n",
" initial_out_lang = \"Java\"\n",
" in_comment_style_choices = [\"Standard\"] + LANGUAGE_INFO[initial_in_lang][\"doc_style\"]\n",
" out_comment_style_choices = [\"Standard\"] + LANGUAGE_INFO[initial_out_lang][\"doc_style\"]\n",
" in_unit_test_style_choices = [\"Standard\"] + LANGUAGE_INFO[initial_in_lang][\"unit_test_style\"]\n",
" out_unit_test_style_choices = [\"Standard\"] + LANGUAGE_INFO[initial_out_lang][\"unit_test_style\"]\n",
" in_code_file_name = gr.State(\"in_code.txt\")\n",
" out_code_file_name = gr.State(\"out_code.txt\")\n",
" in_comments_file_name = gr.State(\"in_comments.txt\")\n",
" out_comments_file_name = gr.State(\"out_comments.txt\")\n",
" in_unit_test_file_name = gr.State(\"in_unit_tests.txt\")\n",
" out_unit_test_file_name = gr.State(\"out_unit_tests.txt\")\n",
" \n",
" \n",
" gr.Markdown(\"## Code Helper\")\n",
"\n",
" def load_file_content(file):\n",
" if file is None:\n",
" return \"\"\n",
" with open(file.name, \"r\", encoding=\"utf-8\") as f:\n",
" return f.read()\n",
"\n",
" def change_lang(lang):\n",
" comment_style_choices = [\"Standard\"] + LANGUAGE_INFO[lang][\"doc_style\"]\n",
" unit_test_style_choices = [\"Standard\"] + LANGUAGE_INFO[lang][\"unit_test_style\"]\n",
" return (\n",
" gr.update(choices=comment_style_choices, value=str(comment_style_choices[0])), \n",
" gr.update(choices=unit_test_style_choices, value=str(unit_test_style_choices[0]))\n",
" )\n",
"\n",
" def download_fn(in_text, out_text, in_file_name, out_file_name):\n",
" if in_text:\n",
" with open(in_file_name, \"w\") as f:\n",
" f.write(in_text)\n",
" if out_text:\n",
" with open(out_file_name, \"w\") as f:\n",
" f.write(out_text)\n",
" \n",
" # Conversion part\n",
" with gr.Row():\n",
" in_lang = gr.Dropdown(choices=LANGUAGES, label=\"Select input language\", value=initial_in_lang, interactive=True)\n",
" out_lang = gr.Dropdown(choices=LANGUAGES, label=\"Select output language\", value=initial_out_lang, interactive=True)\n",
" with gr.Row():\n",
" input_file = gr.File(label=\"Upload a source code file or input below\")\n",
" input_instruct = gr.Textbox(\n",
" label=\"Additional instruction(optional)\",\n",
" placeholder=\"Enter the instruction you want the ouput code to follow...\\n\\nFor example: Define the variable using snake_case style.\",\n",
" lines=8\n",
" )\n",
" with gr.Row():\n",
" in_code = gr.Textbox(label=\"Input Code:\", value=python_hard, lines=10)\n",
" out_code = gr.Textbox(label=\"Output Code:\", lines=10)\n",
" with gr.Row():\n",
" convert_model = gr.Dropdown([\"Claude\", \"GPT\", \"Gemini\"], label=\"Select model\", value=\"Claude\")\n",
" with gr.Row():\n",
" convert = gr.Button(\"Convert code\")\n",
" download_code = gr.Button(\"Download code\")\n",
"\n",
" gr.HTML(\"<hr style='border: none; height: 1px; background-color: #333;'>\")\n",
"\n",
" def show_comment(current_selected):\n",
" if current_selected == \"comment\":\n",
" return (\n",
" gr.update(visible=False),\n",
" gr.update(visible=False),\n",
" gr.update(elem_classes=[\"unselected\"]),\n",
" gr.update(elem_classes=[\"unselected\"]),\n",
" \"\"\n",
" )\n",
" else:\n",
" return (\n",
" gr.update(visible=True),\n",
" gr.update(visible=False),\n",
" gr.update(elem_classes=[\"selected\"]),\n",
" gr.update(elem_classes=[\"unselected\"]),\n",
" \"comment\"\n",
" )\n",
"\n",
" def show_unit_test(current_selected):\n",
" if current_selected == \"unit_test\":\n",
" return (\n",
" gr.update(visible=False),\n",
" gr.update(visible=False),\n",
" gr.update(elem_classes=[\"unselected\"]),\n",
" gr.update(elem_classes=[\"unselected\"]),\n",
" \"\"\n",
" )\n",
" else:\n",
" return (\n",
" gr.update(visible=False),\n",
" gr.update(visible=True),\n",
" gr.update(elem_classes=[\"unselected\"]),\n",
" gr.update(elem_classes=[\"selected\"]),\n",
" \"unit_test\"\n",
" )\n",
" \n",
" with gr.Blocks() as demo:\n",
" with gr.Row():\n",
" comment_show_up = gr.Button(\"Comment\", elem_id=\"comment-btn\", elem_classes=[\"unselected\"])\n",
" unit_test_show_up = gr.Button(\"Unit Test\", elem_id=\"unit-test-btn\", elem_classes=[\"unselected\"])\n",
" \n",
" comment_section = gr.Column(visible=False)\n",
" unit_test_section = gr.Column(visible=False)\n",
" \n",
" with comment_section:\n",
" # Comment section\n",
" with gr.Row():\n",
" comment_option = gr.Radio(\n",
" choices=[\n",
" \"Comment input code\",\n",
" \"Comment output code\",\n",
" \"Comment both\"\n",
" ],\n",
" label=\"Commenting Options\",\n",
" value=\"Comment input code\",\n",
" interactive=True\n",
" )\n",
" with gr.Row():\n",
" in_comment_style = gr.Dropdown(choices=in_comment_style_choices, label=\"Select comment style for input code\", value=in_comment_style_choices[0], interactive=True)\n",
" out_comment_style = gr.Dropdown(choices=out_comment_style_choices, label=\"Select comment style for oupt code\", value=out_comment_style_choices[0], interactive=True)\n",
" with gr.Row():\n",
" comment_model = gr.Dropdown([\"Claude\", \"GPT\", \"Gemini\"], label=\"Select model\", value=\"Claude\")\n",
" with gr.Row():\n",
" generate_comments = gr.Button(\"Generate comments\")\n",
" download_comments = gr.Button(\"Download comments\")\n",
" with gr.Row():\n",
" in_comments = gr.Textbox(label=\"Comments for Input Code:\", lines=10)\n",
" out_comments = gr.Textbox(label=\"Comments for Output Code:\", lines=10)\n",
" \n",
" with unit_test_section:\n",
" # Unit test part\n",
" with gr.Row():\n",
" unit_test_option = gr.Radio(\n",
" choices=[\n",
" \"Add unit test for input code\",\n",
" \"Add unit test for output code\",\n",
" \"Add unit test for both\"\n",
" ],\n",
" label=\"Unit Test Options\",\n",
" value=\"Add unit test for input code\",\n",
" interactive=True\n",
" )\n",
" with gr.Row():\n",
" in_unit_test_style = gr.Dropdown(choices=in_unit_test_style_choices, label=\"Select unit test style for input code\", value=in_unit_test_style_choices[0], interactive=True)\n",
" out_unit_test_style = gr.Dropdown(choices=out_unit_test_style_choices, label=\"Select unit test style for oupt code\", value=out_unit_test_style_choices[0], interactive=True)\n",
" with gr.Row():\n",
" unit_test_model = gr.Dropdown([\"Claude\", \"GPT\", \"Gemini\"], label=\"Select model\", value=\"Claude\")\n",
" with gr.Row():\n",
" generate_unit_test = gr.Button(\"Generate unit test\")\n",
" download_unit_test = gr.Button(\"Download unit text\")\n",
" with gr.Row():\n",
" in_unit_test = gr.Textbox(label=\"Unit Test for Input Code:\", lines=10)\n",
" out_unit_test = gr.Textbox(label=\"Unit Test for Output Code:\", lines=10)\n",
"\n",
" in_lang.change(fn=change_lang, inputs=in_lang, outputs=[in_comment_style, in_unit_test_style])\n",
" out_lang.change(fn=change_lang, inputs=out_lang, outputs=[out_comment_style, out_unit_test_style])\n",
" input_file.change(fn=load_file_content, inputs=input_file, outputs=in_code)\n",
" \n",
" convert.click(optimize, inputs=[in_lang, out_lang, in_code, input_instruct, convert_model], outputs=[out_code])\n",
" download_code.click(download_fn, inputs=[in_code, out_code, in_code_file_name, out_code_file_name])\n",
" \n",
" comment_show_up.click(fn=show_comment, inputs=current_selected, outputs=[comment_section, unit_test_section, comment_show_up, unit_test_show_up, current_selected])\n",
" unit_test_show_up.click(fn=show_unit_test, inputs=current_selected, outputs=[comment_section, unit_test_section, comment_show_up, unit_test_show_up, current_selected])\n",
"\n",
" generate_comments.click(generate_comments_fn, inputs=[comment_option, in_lang, out_lang, in_code, out_code, in_comment_style, out_comment_style, comment_model], outputs=[in_comments, out_comments])\n",
" download_comments.click(download_fn, inputs=[in_comments, out_comments, in_comments_file_name, out_comments_file_name])\n",
" generate_unit_test.click(generate_unit_test_fn, inputs=[unit_test_option, in_lang, out_lang, in_code, out_code, in_unit_test_style, out_unit_test_style, unit_test_model], outputs=[in_unit_test, out_unit_test])\n",
" download_unit_test.click(download_fn, inputs=[in_unit_test, out_unit_test, in_unit_test_file_name, out_unit_test_file_name])\n",
" \n",
"ui.launch()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0266734c-0bee-46c0-9b17-9fd2ae86cc3a",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}