Merge pull request #510 from Zhufeng-Qiu/zephyr-week3_4_5
Add the community contribution for Week3/4/5
@@ -0,0 +1,551 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "GD5Omr5EfWgb"
},
"source": [
"# Date Generator\n",
|
||||
"\n",
|
||||
"generate synthetic data when given scheme, business problem description, model, number of records, file name, file type, and environment\n",
|
||||
"\n",
|
||||
"# Available models\n",
|
||||
" Model API:\n",
|
||||
"\n",
|
||||
" 1. gpt-4o-mini\n",
|
||||
" 2. claude-3-haiku-20240307\n",
|
||||
" 3. gemini-2.0-flash\n",
|
||||
" 4. deepseek-chat\"\n",
|
||||
"\n",
|
||||
" HuggingFace API:\n",
|
||||
"\n",
|
||||
" 5. meta-llama/Meta-Llama-3.1-8B-Instruct\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Available environment\n",
|
||||
"\n",
|
||||
"Colab: set up HF token and API keys in Colab secret section\n",
|
||||
"\n",
|
||||
"Local: set up HF token and API keys in .env file\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"### *** This project is developed based on the idea of 'week3/community-contributuins/Week3-Dataset_Generator-DP'. Really appreciate it! Then, the project is improved to run both on Colab or locally, and integrate HuggingFace API"
|
||||
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "4FiCnE0MmU56"
},
"outputs": [],
"source": [
"!pip install -q --upgrade torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1+cu124 --index-url https://download.pytorch.org/whl/cu124\n",
"!pip install -q requests bitsandbytes==0.46.0 transformers==4.48.3 accelerate==1.3.0\n",
"!pip install anthropic dotenv pyarrow"
|
||||
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "JeyKw5guoH3r"
},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import requests\n",
"from IPython.display import Markdown, display, update_display\n",
"from openai import OpenAI\n",
"from huggingface_hub import login\n",
"from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n",
"from bs4 import BeautifulSoup\n",
"from typing import List\n",
"import google.generativeai\n",
"import anthropic\n",
"from itertools import chain\n",
"from dotenv import load_dotenv\n",
"import gradio as gr\n",
"import json\n",
"import pandas as pd\n",
"import random\n",
"import re\n",
"import subprocess\n",
"import pyarrow as pa\n",
"import torch\n",
"import gc"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "7UyjFdRZoIAS"
},
"outputs": [],
"source": [
"# --- Schema Definition ---\n",
|
||||
"SCHEMA = [\n",
|
||||
" (\"Name\", \"TEXT\", '\"Northern Cafe\"'),\n",
|
||||
" (\"Location\", \"TEXT\", '\"2904 S Figueroa St, Los Angeles, CA 90007\"'),\n",
|
||||
" (\"Type\", \"TEXT\", 'One of [\"Chinese\",\"Mexico\",\"French\",\"Korean\",\"Italy\"] or other potential types'),\n",
|
||||
" (\"Average Price\", \"TEXT\", '\"$30\", or \"--\" if unkown'),\n",
|
||||
" (\"History/Age\", \"INT\", 'integer age of resturant, e.g., 7'),\n",
|
||||
" (\"Menu\", \"Array\", '[\"Beef Noodle\", \"Fried Rice\", \"Dumpling\", ...]'),\n",
|
||||
"]"
|
||||
]
|
||||
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "jXcTQATLoICV"
},
"outputs": [],
"source": [
"# Default schema text for the textbox\n",
"DEFAULT_SCHEMA_TEXT = \"\\n\".join([f\"{i+1}. {col[0]} ({col[1]}) Example: {col[2]}\" for i, col in enumerate(SCHEMA)])"
]
},
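{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Quick sanity check (illustrative, not part of the main flow): preview the\n",
"# schema text exactly as it will be pasted into the prompt.\n",
"print(DEFAULT_SCHEMA_TEXT)"
]
},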
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "4Irf5JV3oIEe"
},
"outputs": [],
"source": [
"# Available models\n",
"MODELS = [\n",
"    \"gpt-4o-mini\",\n",
"    \"claude-3-haiku-20240307\",\n",
"    \"gemini-2.0-flash\",\n",
"    \"deepseek-chat\",\n",
"    \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "JJ6r2SH9oIGf"
},
"outputs": [],
"source": [
"# Available file formats\n",
"FILE_FORMATS = [\".csv\", \".tsv\", \".jsonl\", \".parquet\", \".arrow\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "B98j45E3vq5g"
},
"outputs": [],
"source": [
"system_prompt = \"\"\"You are a helpful assistant whose main purpose is to generate datasets for a given business problem based on a given schema.\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "lsX16cWfwf6x"
},
"outputs": [],
"source": [
"def get_env_info(env):\n",
"    try:\n",
"        global hf_token, openai_api_key, anthropic_api_key, google_api_key, deepseek_api_key\n",
"        if env == \"Colab\":\n",
"            # Colab environment\n",
"            from google.colab import drive\n",
"            from google.colab import userdata\n",
"            hf_token = userdata.get('HF_TOKEN')\n",
"            openai_api_key = userdata.get('OPENAI_API_KEY')\n",
"            anthropic_api_key = userdata.get('ANTHROPIC_API_KEY')\n",
"            google_api_key = userdata.get('GOOGLE_API_KEY')\n",
"            deepseek_api_key = userdata.get('DEEPSEEK_API_KEY')\n",
"        elif env == \"Local\":\n",
"            # Local environment\n",
"            load_dotenv(override=True)\n",
"            hf_token = os.getenv('HF_TOKEN')\n",
"            openai_api_key = os.getenv('OPENAI_API_KEY')\n",
"            anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')\n",
"            google_api_key = os.getenv('GOOGLE_API_KEY')\n",
"            deepseek_api_key = os.getenv('DEEPSEEK_API_KEY')\n",
"    except Exception as e:\n",
"        raise Exception(f\"Please check your environment: {str(e)}\")"
]
},
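{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# For the Local environment, get_env_info() expects a .env file next to the\n",
"# notebook. A minimal sketch of its contents (placeholder values, not real keys):\n",
"#\n",
"# OPENAI_API_KEY=sk-...\n",
"# ANTHROPIC_API_KEY=sk-ant-...\n",
"# GOOGLE_API_KEY=...\n",
"# DEEPSEEK_API_KEY=...\n",
"# HF_TOKEN=hf_..."
]
},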
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "2gLUFAwGv29Q"
},
"outputs": [],
"source": [
"def get_prompt(schema_text, business_problem, nr_records):\n",
"    prompt = f\"\"\"\n",
"    The problem is: {business_problem}\n",
"\n",
"    Generate {nr_records} rows of data in JSONL format, each line a JSON object with the following fields:\n",
"\n",
"    {schema_text}\n",
"\n",
"    Do NOT repeat column values from one row to another.\n",
"\n",
"    Only output valid JSONL.\n",
"    \"\"\"\n",
"    return prompt.strip()"
]
},
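{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative only: render the prompt for a small request to check its shape\n",
"# before spending tokens. The business problem below is a placeholder.\n",
"print(get_prompt(DEFAULT_SCHEMA_TEXT, \"I want to generate restaurant records\", 12))"
]
},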
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "YZe1FVH8wf84"
},
"outputs": [],
"source": [
"# --- LLM Interface ---\n",
"def query(user_prompt, model):\n",
"    try:\n",
"        if \"gpt\" in model.lower():\n",
"            client = OpenAI(api_key=openai_api_key)\n",
"            messages = [\n",
"                {\"role\": \"system\", \"content\": system_prompt},\n",
"                {\"role\": \"user\", \"content\": user_prompt}\n",
"            ]\n",
"            response = client.chat.completions.create(\n",
"                model=model,\n",
"                messages=messages,\n",
"                temperature=0.7\n",
"            )\n",
"            content = response.choices[0].message.content\n",
"\n",
"        elif \"claude\" in model.lower():\n",
"            client = anthropic.Anthropic(api_key=anthropic_api_key)\n",
"            response = client.messages.create(\n",
"                model=model,\n",
"                messages=[{\"role\": \"user\", \"content\": user_prompt}],\n",
"                max_tokens=4000,\n",
"                temperature=0.7,\n",
"                system=system_prompt\n",
"            )\n",
"            content = response.content[0].text\n",
"\n",
"        elif \"gemini\" in model.lower():\n",
"            client = OpenAI(\n",
"                api_key=google_api_key,\n",
"                base_url=\"https://generativelanguage.googleapis.com/v1beta/openai/\"\n",
"            )\n",
"            messages = [\n",
"                {\"role\": \"system\", \"content\": system_prompt},\n",
"                {\"role\": \"user\", \"content\": user_prompt}\n",
"            ]\n",
"            response = client.chat.completions.create(\n",
"                model=model,\n",
"                messages=messages,\n",
"                temperature=0.7\n",
"            )\n",
"            content = response.choices[0].message.content\n",
"\n",
"        elif \"deepseek\" in model.lower():\n",
"            client = OpenAI(\n",
"                api_key=deepseek_api_key,\n",
"                base_url=\"https://api.deepseek.com\"\n",
"            )\n",
"            messages = [\n",
"                {\"role\": \"system\", \"content\": system_prompt},\n",
"                {\"role\": \"user\", \"content\": user_prompt}\n",
"            ]\n",
"            response = client.chat.completions.create(\n",
"                model=model,\n",
"                messages=messages,\n",
"                temperature=0.7\n",
"            )\n",
"            content = response.choices[0].message.content\n",
"\n",
"        elif \"llama\" in model.lower():\n",
"            global tokenizer, inputs, llama_model, outputs\n",
"            messages = [\n",
"                {\"role\": \"system\", \"content\": system_prompt},\n",
"                {\"role\": \"user\", \"content\": user_prompt}\n",
"            ]\n",
"\n",
"            login(hf_token, add_to_git_credential=True)\n",
"            quant_config = BitsAndBytesConfig(\n",
"                load_in_4bit=True,\n",
"                bnb_4bit_use_double_quant=True,\n",
"                bnb_4bit_compute_dtype=torch.bfloat16,\n",
"                bnb_4bit_quant_type=\"nf4\"\n",
"            )\n",
"\n",
"            tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)\n",
"            tokenizer.pad_token = tokenizer.eos_token\n",
"            inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(\"cuda\")\n",
"            if llama_model is None:\n",
"                llama_model = AutoModelForCausalLM.from_pretrained(model, device_map=\"auto\", quantization_config=quant_config)\n",
"            outputs = llama_model.generate(inputs, max_new_tokens=4000)\n",
"\n",
"            _, _, after = tokenizer.decode(outputs[0]).partition(\"assistant<|end_header_id|>\")\n",
"            content = after.strip()\n",
"        else:\n",
"            raise ValueError(f\"Unsupported model. Use one of {MODELS}\")\n",
"\n",
"        # Parse the JSONL output, keeping only lines that look like JSON objects\n",
"        lines = [line.strip() for line in content.strip().splitlines() if line.strip().startswith(\"{\")]\n",
"        return [json.loads(line) for line in lines]\n",
"\n",
"    except Exception as e:\n",
"        raise Exception(f\"Model query failed: {str(e)}\")"
]
},
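{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional smoke test (a sketch, not part of the main flow): uncomment after\n",
"# your keys are configured. Any model name from MODELS should work here.\n",
"# get_env_info(\"Local\")\n",
"# records = query(get_prompt(DEFAULT_SCHEMA_TEXT, \"I want to generate restaurant records\", 12), \"gpt-4o-mini\")\n",
"# records[:2]"
]
},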
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "4WUj-XqM5IYT"
},
"outputs": [],
"source": [
"# --- Output Formatter ---\n",
"def save_dataset(records, file_format, filename):\n",
"    df = pd.DataFrame(records)\n",
"    if file_format == \".csv\":\n",
"        df.to_csv(filename, index=False)\n",
"    elif file_format == \".tsv\":\n",
"        df.to_csv(filename, sep=\"\\t\", index=False)\n",
"    elif file_format == \".jsonl\":\n",
"        with open(filename, \"w\") as f:\n",
"            for record in records:\n",
"                f.write(json.dumps(record) + \"\\n\")\n",
"    elif file_format == \".parquet\":\n",
"        df.to_parquet(filename, engine=\"pyarrow\", index=False)\n",
"    elif file_format == \".arrow\":\n",
"        table = pa.Table.from_pandas(df)\n",
"        with pa.OSFile(filename, \"wb\") as sink:\n",
"            with pa.ipc.new_file(sink, table.schema) as writer:\n",
"                writer.write(table)\n",
"    else:\n",
"        raise ValueError(\"Unsupported file format\")"
]
},
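{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Tiny round-trip check for save_dataset (illustrative; uses made-up records,\n",
"# no API keys needed).\n",
"_demo = [\n",
"    {\"Name\": \"Northern Cafe\", \"Average Price\": \"$30\"},\n",
"    {\"Name\": \"Casa Flores\", \"Average Price\": \"$22\"}\n",
"]\n",
"save_dataset(_demo, \".jsonl\", \"demo_records.jsonl\")\n",
"print(pd.read_json(\"demo_records.jsonl\", lines=True))"
]
},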
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "WenbNqrpwf-_"
},
"outputs": [],
"source": [
"# --- Main Generation Function ---\n",
"def generate_dataset(schema_text, business_problem, model, nr_records, file_format, save_as, env):\n",
"    try:\n",
"        # Validation\n",
"        if nr_records <= 10:\n",
"            return \"❌ Error: Number of records must be greater than 10.\", None\n",
"        if nr_records > 1000:\n",
"            return \"❌ Error: Number of records must be less than or equal to 1000.\", None\n",
"\n",
"        if file_format not in FILE_FORMATS:\n",
"            return \"❌ Error: Invalid file format.\", None\n",
"\n",
"        # Fall back to a default filename when none is given, and make sure\n",
"        # the filename carries the chosen extension\n",
"        if not save_as or save_as.strip() == \"\":\n",
"            save_as = f\"default{file_format}\"\n",
"        elif not save_as.endswith(file_format):\n",
"            save_as = save_as + file_format\n",
"\n",
"        # Load env\n",
"        get_env_info(env)\n",
"\n",
"        # Generate prompt\n",
"        user_prompt = get_prompt(schema_text, business_problem, nr_records)\n",
"\n",
"        # Query model\n",
"        records = query(user_prompt, model)\n",
"\n",
"        if not records:\n",
"            return \"❌ Error: No valid records generated from the model.\", None\n",
"\n",
"        # Save dataset\n",
"        save_dataset(records, file_format, save_as)\n",
"\n",
"        # Create preview\n",
"        df = pd.DataFrame(records)\n",
"        preview = df.head(10)  # Show first 10 rows\n",
"\n",
"        success_message = f\"✅ Generated {len(records)} records successfully!\\n📁 Saved to: {save_as}\\n📊 Preview shows the first 10 rows.\"\n",
"\n",
"        return success_message, preview\n",
"\n",
"    except Exception as e:\n",
"        return f\"❌ Error: {str(e)}\", None"
]
},
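{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The UI below wraps this function; you can also call it directly, e.g.\n",
"# (commented out because it needs API keys for the chosen model):\n",
"# status, preview = generate_dataset(DEFAULT_SCHEMA_TEXT, \"I want to generate restaurant records\", MODELS[0], 12, \".csv\", \"demo\", \"Local\")\n",
"# print(status)"
]
},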
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "pHiP8ky8wgEb"
},
"outputs": [],
"source": [
"# --- Gradio Interface ---\n",
"\n",
"with gr.Blocks(title=\"Dataset Generator\", theme=gr.themes.Citrus()) as interface:\n",
"    hf_token = None\n",
"    openai_api_key = None\n",
"    anthropic_api_key = None\n",
"    google_api_key = None\n",
"    deepseek_api_key = None\n",
"    tokenizer = None\n",
"    inputs = None\n",
"    llama_model = None\n",
"    outputs = None\n",
"\n",
"    gr.Markdown(\"# Dataset Generator\")\n",
"    gr.Markdown(\"Generate synthetic datasets using AI models\")\n",
"\n",
"    with gr.Row():\n",
"        with gr.Column(scale=2):\n",
"            schema_input = gr.Textbox(\n",
"                label=\"Schema\",\n",
"                value=DEFAULT_SCHEMA_TEXT,\n",
"                lines=15,\n",
"                placeholder=\"Define your dataset schema here... Please follow this format: Field_Name, Field_Type, Field_Example\"\n",
"            )\n",
"\n",
"            business_problem_input = gr.Textbox(\n",
"                label=\"Business Problem\",\n",
"                value=\"I want to generate restaurant records\",\n",
"                lines=1,\n",
"                placeholder=\"Enter a business problem description for the model...\"\n",
"            )\n",
"\n",
"            with gr.Row():\n",
"                model_dropdown = gr.Dropdown(\n",
"                    label=\"Model\",\n",
"                    choices=MODELS,\n",
"                    value=MODELS[0],\n",
"                    interactive=True\n",
"                )\n",
"\n",
"                nr_records_input = gr.Number(\n",
"                    label=\"Number of records\",\n",
"                    value=27,\n",
"                    minimum=11,\n",
"                    maximum=1000,\n",
"                    step=1\n",
"                )\n",
"\n",
"            with gr.Row():\n",
"                save_as_input = gr.Textbox(\n",
"                    label=\"Save as\",\n",
"                    value=\"restaurant_dataset\",\n",
"                    placeholder=\"Enter filename (extension will be added automatically)\"\n",
"                )\n",
"\n",
"                file_format_dropdown = gr.Dropdown(\n",
"                    label=\"File format\",\n",
"                    choices=FILE_FORMATS,\n",
"                    value=FILE_FORMATS[0],\n",
"                    interactive=True\n",
"                )\n",
"\n",
"                env_dropdown = gr.Dropdown(\n",
"                    label=\"Environment\",\n",
"                    choices=[\"Colab\", \"Local\"],\n",
"                    value=\"Colab\",\n",
"                    interactive=True\n",
"                )\n",
"\n",
"            generate_btn = gr.Button(\"🚀 Generate\", variant=\"secondary\", size=\"lg\")\n",
"\n",
"        with gr.Column(scale=1):\n",
"            output_status = gr.Textbox(\n",
"                label=\"Status\",\n",
"                lines=4,\n",
"                interactive=False\n",
"            )\n",
"\n",
"            output_preview = gr.Dataframe(\n",
"                label=\"Preview (First 10 rows)\",\n",
"                interactive=False,\n",
"                wrap=True\n",
"            )\n",
"\n",
"    # Connect the generate button\n",
"    generate_btn.click(\n",
"        fn=generate_dataset,\n",
"        inputs=[\n",
"            schema_input,\n",
"            business_problem_input,\n",
"            model_dropdown,\n",
"            nr_records_input,\n",
"            file_format_dropdown,\n",
"            save_as_input,\n",
"            env_dropdown\n",
"        ],\n",
"        outputs=[output_status, output_preview]\n",
"    )\n",
"\n",
"    gr.Markdown(\"\"\"\n",
"    ### 📝 Instructions:\n",
"    1. **Schema**: Define the structure of your dataset (pre-filled with the restaurant schema)\n",
"    2. **Business problem**: User prompt to guide the AI model\n",
"    3. **Model**: Choose between GPT, Claude, Gemini, DeepSeek or Llama models\n",
"    4. **Number of records**: Number of records to generate (minimum 11)\n",
"    5. **File format**: Choose the output format (.csv, .tsv, .jsonl, .parquet, .arrow)\n",
"    6. **Save as**: Filename (extension added automatically)\n",
"    7. Click **Generate** to create your dataset\n",
"\n",
"    ### 🔧 Requirements:\n",
"    - For local mode, set up the HF token and API keys in a `.env` file (`OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `GOOGLE_API_KEY`, `DEEPSEEK_API_KEY`, `HF_TOKEN`)\n",
"    - For Colab mode, set up the HF token and API keys in the Colab Secrets section (`OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `GOOGLE_API_KEY`, `DEEPSEEK_API_KEY`, `HF_TOKEN`)\n",
"    \"\"\")\n",
"\n",
"interface.launch(debug=True)\n",
"\n",
"# Free the Llama model and tensors after the UI is closed\n",
"del tokenizer, inputs, llama_model, outputs\n",
"gc.collect()\n",
"torch.cuda.empty_cache()"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
@@ -0,0 +1,523 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "It89APiAtTUF"
},
"source": [
"# Create meeting minutes from an Audio file\n",
"\n",
"I downloaded some Denver City Council meeting minutes and selected a portion of the meeting for us to transcribe. You can download it here: \n",
"https://drive.google.com/file/d/1N_kpSojRR5RYzupz6nqM8hMSoEF_R7pU/view?usp=sharing\n",
"\n",
"If you'd rather work with the original data, the HuggingFace dataset is [here](https://huggingface.co/datasets/huuuyeah/meetingbank) and the audio can be downloaded [here](https://huggingface.co/datasets/huuuyeah/MeetingBank_Audio/tree/main).\n",
"\n",
"The goal of this product is to use the Audio to generate meeting minutes, including actions.\n",
"\n",
"For this project, you can either use the Denver meeting minutes, or you can record something of your own!\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "sJPSCwPX3MOV"
},
"source": [
"## Again - please note: 2 important pro-tips for using Colab:\n",
"\n",
"**Pro-tip 1:**\n",
"\n",
"The top of every colab has some pip installs. You may receive errors from pip when you run this, such as:\n",
"\n",
"> gcsfs 2025.3.2 requires fsspec==2025.3.2, but you have fsspec 2025.3.0 which is incompatible.\n",
"\n",
"These pip compatibility errors can be safely ignored; and while it's tempting to try to fix them by changing version numbers, that will actually introduce real problems!\n",
"\n",
"**Pro-tip 2:**\n",
"\n",
"In the middle of running a Colab, you might get an error like this:\n",
"\n",
"> Runtime error: CUDA is required but not available for bitsandbytes. Please consider installing [...]\n",
"\n",
"This is a super-misleading error message! Please don't try changing versions of packages...\n",
"\n",
"This actually happens because Google has switched out your Colab runtime, perhaps because Google Colab was too busy. The solution is:\n",
"\n",
"1. Kernel menu >> Disconnect and delete runtime\n",
"2. Reload the colab from fresh and Edit menu >> Clear All Outputs\n",
"3. Connect to a new T4 using the button at the top right\n",
"4. Select \"View resources\" from the menu on the top right to confirm you have a GPU\n",
"5. Rerun the cells in the colab, from the top down, starting with the pip installs\n",
"\n",
"And all should work great - otherwise, ask me!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "f2vvgnFpHpID"
},
"outputs": [],
"source": [
"!pip install -q --upgrade torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1+cu124 --index-url https://download.pytorch.org/whl/cu124\n",
"!pip install -q requests bitsandbytes==0.46.0 transformers==4.48.3 accelerate==1.3.0 openai"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "FW8nl3XRFrz0"
},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import requests\n",
"from IPython.display import Markdown, display, update_display\n",
"from openai import OpenAI\n",
"from google.colab import drive\n",
"from huggingface_hub import login\n",
"from google.colab import userdata\n",
"from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n",
"import torch"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "q3D1_T0uG_Qh"
},
"outputs": [],
"source": [
"# Constants\n",
"\n",
"AUDIO_MODEL = \"whisper-1\"\n",
"LLAMA = \"meta-llama/Meta-Llama-3.1-8B-Instruct\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Es9GkQ0FGCMt"
},
"outputs": [],
"source": [
"# New capability - connect this Colab to my Google Drive\n",
"# See immediately below this for instructions to obtain denver_extract.mp3\n",
"\n",
"drive.mount(\"/content/drive\")\n",
"audio_filename = \"/content/drive/MyDrive/llms/denver_extract.mp3\""
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "HTl3mcjyzIEE"
},
"source": [
"# Download denver_extract.mp3\n",
"\n",
"You can either use the same file as me, the extract from the Denver city council minutes, or you can try your own.\n",
"\n",
"If you want to use the same as me, then please download my extract here, and put it on your Google Drive: \n",
"https://drive.google.com/file/d/1N_kpSojRR5RYzupz6nqM8hMSoEF_R7pU/view?usp=sharing\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "xYW8kQYtF-3L"
},
"outputs": [],
"source": [
"# Sign in to HuggingFace Hub\n",
"\n",
"hf_token = userdata.get('HF_TOKEN')\n",
"login(hf_token, add_to_git_credential=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "qP6OB2OeGC2C"
},
"outputs": [],
"source": [
"# Sign in to OpenAI using Secrets in Colab\n",
"\n",
"openai_api_key = userdata.get('OPENAI_API_KEY')\n",
"openai = OpenAI(api_key=openai_api_key)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "GMShdVGlGGr4"
},
"outputs": [],
"source": [
"# Use the Whisper OpenAI model to convert the Audio to Text\n",
"# If you'd prefer to use an Open Source model, class student Youssef has contributed an open source version\n",
"# which I've added to the bottom of this colab\n",
"\n",
"audio_file = open(audio_filename, \"rb\")\n",
"transcription = openai.audio.transcriptions.create(model=AUDIO_MODEL, file=audio_file, response_format=\"text\")\n",
"print(transcription)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "piEMmcSfMH-O"
},
"outputs": [],
"source": [
"system_message = \"You are an assistant that produces minutes of meetings from transcripts, with summary, key discussion points, takeaways and action items with owners, in markdown.\"\n",
"user_prompt = f\"Below is an extract transcript of a Denver council meeting. Please write minutes in markdown, including a summary with attendees, location and date; discussion points; takeaways; and action items with owners.\\n{transcription}\"\n",
"\n",
"messages = [\n",
"    {\"role\": \"system\", \"content\": system_message},\n",
"    {\"role\": \"user\", \"content\": user_prompt}\n",
"]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "UcRKUgcxMew6"
},
"outputs": [],
"source": [
"quant_config = BitsAndBytesConfig(\n",
"    load_in_4bit=True,\n",
"    bnb_4bit_use_double_quant=True,\n",
"    bnb_4bit_compute_dtype=torch.bfloat16,\n",
"    bnb_4bit_quant_type=\"nf4\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "6CujZRAgMimy"
},
"outputs": [],
"source": [
"tokenizer = AutoTokenizer.from_pretrained(LLAMA)\n",
"tokenizer.pad_token = tokenizer.eos_token\n",
"# inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(\"cuda\")\n",
"streamer = TextStreamer(tokenizer)\n",
"model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map=\"auto\", quantization_config=quant_config, trust_remote_code=True)\n",
"# outputs = model.generate(inputs, max_new_tokens=2000, streamer=streamer)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "MaLNmJ5PSqcH"
},
"outputs": [],
"source": [
"inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(\"cuda\")\n",
"outputs = model.generate(inputs, max_new_tokens=2000, streamer=streamer)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "102tdU_3Peam"
},
"outputs": [],
"source": [
"response = tokenizer.decode(outputs[0])\n",
"response"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "KlomN6CwMdoN"
},
"outputs": [],
"source": [
"display(Markdown(response))"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "0jZElVOMSPAr"
},
"source": [
"Day 5 exercise - Gradio UI for meeting minutes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "5iiYYxQMHf0i"
},
"outputs": [],
"source": [
"import gradio as gr\n",
"import tempfile\n",
"import soundfile as sf"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "aGwXW7BjPcTM"
},
"outputs": [],
"source": [
"# !pip install pydub\n",
"# !apt-get install ffmpeg\n",
"\n",
"from pydub import AudioSegment"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "RNu-reHuCYj_"
},
"outputs": [],
"source": [
"# Make sure the tokenizer and model have already been created (see above);\n",
"# otherwise, uncomment the following lines\n",
"\n",
"# tokenizer = AutoTokenizer.from_pretrained(LLAMA)\n",
"# tokenizer.pad_token = tokenizer.eos_token\n",
"# streamer = TextStreamer(tokenizer)\n",
"# model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map=\"auto\", quantization_config=quant_config)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "KOuoH0YOPruE"
},
"outputs": [],
"source": [
"# def save_as_mp3(audio_np):\n",
"#     sr, data = audio_np\n",
"#     # Convert float32 or int16 to PCM wav and then mp3\n",
"#     wav_path = tempfile.NamedTemporaryFile(suffix=\".wav\", delete=False).name\n",
"#     mp3_path = tempfile.NamedTemporaryFile(suffix=\".mp3\", delete=False).name\n",
"\n",
"#     sf.write(wav_path, data, sr)\n",
"#     audio_segment = AudioSegment.from_wav(wav_path)\n",
"#     audio_segment.export(mp3_path, format=\"mp3\", bitrate=\"64k\")  # Low bitrate = small file\n",
"#     return mp3_path"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "toBIPBJoSNw0"
},
"outputs": [],
"source": [
"# Handles the uploaded audio file (a filepath from gr.Audio) and returns the meeting minutes\n",
"def speak_send(audio_path):\n",
"\n",
"    # If using numpy as input: audio_input = gr.Audio(sources=\"upload\", type=\"numpy\", label=\"Upload audio file to generate meeting minutes\")\n",
"    # mp3_path = save_as_mp3(audio_path)\n",
"\n",
"    # with open(mp3_path, \"rb\") as audio_file:\n",
"    #     transcript = openai.audio.transcriptions.create(\n",
"    #         model=AUDIO_MODEL,\n",
"    #         file=audio_file,\n",
"    #         response_format=\"text\"\n",
"    #     )\n",
"\n",
"    audio = AudioSegment.from_file(audio_path)\n",
"    with tempfile.NamedTemporaryFile(suffix=\".mp3\", delete=False) as tmpfile:\n",
"        audio.export(tmpfile.name, format=\"mp3\")\n",
"    with open(tmpfile.name, \"rb\") as file:\n",
"        transcript = openai.audio.transcriptions.create(\n",
"            model=AUDIO_MODEL,\n",
"            file=file,\n",
"            response_format=\"text\"\n",
"        )\n",
"\n",
"    system_message = \"You are an assistant that produces minutes of meetings from transcripts, with summary, key discussion points, takeaways and action items with owners, in markdown.\"\n",
"    user_prompt = f\"Below is an extract transcript of a Denver council meeting. Please write minutes in markdown, including a summary with attendees, location and date; discussion points; takeaways; and action items with owners.\\n{transcript}\"\n",
"\n",
"    messages = [\n",
"        {\"role\": \"system\", \"content\": system_message},\n",
"        {\"role\": \"user\", \"content\": user_prompt}\n",
"    ]\n",
"\n",
"    inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(\"cuda\")\n",
"    outputs = model.generate(inputs, max_new_tokens=2000)\n",
"\n",
"    _, _, after = tokenizer.decode(outputs[0]).partition(\"assistant<|end_header_id|>\")\n",
"    return after.strip()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "xXJfabpDSN5R"
},
"outputs": [],
"source": [
"with gr.Blocks() as demo:\n",
"\n",
"    with gr.Row():\n",
"        audio_input = gr.Audio(sources=\"upload\", type=\"filepath\", label=\"Upload audio file to generate meeting minutes\")\n",
"    with gr.Row():\n",
"        audio_submit = gr.Button(\"Send\")\n",
"    with gr.Row():\n",
"        outputs = [gr.Markdown(label=\"Meeting minutes:\")]\n",
"\n",
"    audio_submit.click(speak_send, inputs=audio_input, outputs=outputs)\n",
"\n",
"demo.launch(debug=True)\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "kuxYecT2QDQ9"
},
"source": [
"# Student contribution\n",
"\n",
"Student Emad S. has made this powerful variation that uses `TextIteratorStreamer` to stream back results into a Gradio UI, and takes advantage of background threads for performance! I'm sharing it here if you'd like to take a look at some very interesting work. Thank you, Emad! A minimal sketch of the idea follows below.\n",
"\n",
"https://colab.research.google.com/drive/1Ja5zyniyJo5y8s1LKeCTSkB2xyDPOt6D"
]
},
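{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A minimal sketch of the TextIteratorStreamer idea from Emad's notebook,\n",
"# assuming the `tokenizer`, `model`, and `messages` defined above.\n",
"# generate() runs in a background thread while the main thread yields the\n",
"# growing text, which is the shape Gradio expects from a streaming callback.\n",
"from threading import Thread\n",
"from transformers import TextIteratorStreamer\n",
"\n",
"def stream_minutes(messages):\n",
"    inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(\"cuda\")\n",
"    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)\n",
"    thread = Thread(target=model.generate, kwargs=dict(inputs=inputs, max_new_tokens=2000, streamer=streamer))\n",
"    thread.start()\n",
"    partial = \"\"\n",
"    for token_text in streamer:\n",
"        partial += token_text\n",
"        yield partial\n",
"\n",
"# Usage in a Gradio app: pass stream_minutes as the click handler so the\n",
"# Markdown output updates as each chunk arrives."
]
},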
{
"cell_type": "markdown",
"metadata": {
"id": "AU3uAEyU3a-o"
},
"source": [
"## Alternative implementation\n",
"\n",
"Class student Youssef has contributed this variation in which we use an open-source model to transcribe the meeting Audio.\n",
"\n",
"Thank you Youssef!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "phYYgAbBRvu5"
},
"outputs": [],
"source": [
"import torch\n",
"from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "HdQnWEzW3lzP"
},
"outputs": [],
"source": [
"AUDIO_MODEL = \"openai/whisper-medium\"\n",
"speech_model = AutoModelForSpeechSeq2Seq.from_pretrained(AUDIO_MODEL, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_safetensors=True)\n",
"speech_model.to('cuda')\n",
"processor = AutoProcessor.from_pretrained(AUDIO_MODEL)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "ZhA_fbeCSAeZ"
},
"outputs": [],
"source": [
"pipe = pipeline(\n",
"    \"automatic-speech-recognition\",\n",
"    model=speech_model,\n",
"    tokenizer=processor.tokenizer,\n",
"    feature_extractor=processor.feature_extractor,\n",
"    torch_dtype=torch.float16,\n",
"    device='cuda',\n",
"    return_timestamps=True\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "nrQjKtD53omJ"
},
"outputs": [],
"source": [
"# Use the open-source Whisper model to convert the audio to text\n",
"result = pipe(audio_filename)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "G_XSljOY3tDf"
},
"outputs": [],
"source": [
"transcription = result[\"text\"]\n",
"print(transcription)"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 4
}