Add the community contribution for Week3/4/5

This commit is contained in:
Zhufeng-Qiu
2025-07-10 16:47:53 -07:00
parent a3b03fcd8f
commit 1bc1229395
3 changed files with 1915 additions and 0 deletions


@@ -0,0 +1,551 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "GD5Omr5EfWgb"
},
"source": [
"# Date Generator\n",
"\n",
"generate synthetic data when given scheme, business problem description, model, number of records, file name, file type, and environment\n",
"\n",
"# Available models\n",
" Model API:\n",
"\n",
" 1. gpt-4o-mini\n",
" 2. claude-3-haiku-20240307\n",
" 3. gemini-2.0-flash\n",
" 4. deepseek-chat\"\n",
"\n",
" HuggingFace API:\n",
"\n",
" 5. meta-llama/Meta-Llama-3.1-8B-Instruct\n",
"\n",
"\n",
"# Available environment\n",
"\n",
"Colab: set up HF token and API keys in Colab secret section\n",
"\n",
"Local: set up HF token and API keys in .env file\n",
"\n",
"\n",
"\n",
"### *** This project is developed based on the idea of 'week3/community-contributuins/Week3-Dataset_Generator-DP'. Really appreciate it! Then, the project is improved to run both on Colab or locally, and integrate HuggingFace API"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "4FiCnE0MmU56"
},
"outputs": [],
"source": [
"!pip install -q --upgrade torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1+cu124 --index-url https://download.pytorch.org/whl/cu124\n",
"!pip install -q requests bitsandbytes==0.46.0 transformers==4.48.3 accelerate==1.3.0\n",
"!pip install anthropic dotenv pyarrow"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "JeyKw5guoH3r"
},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import requests\n",
"from IPython.display import Markdown, display, update_display\n",
"from openai import OpenAI\n",
"from huggingface_hub import login\n",
"from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n",
"from bs4 import BeautifulSoup\n",
"from typing import List\n",
"import google.generativeai\n",
"import anthropic\n",
"from itertools import chain\n",
"from dotenv import load_dotenv\n",
"import gradio as gr\n",
"import json\n",
"import pandas as pd\n",
"import random\n",
"import re\n",
"import subprocess\n",
"import pyarrow as pa\n",
"import torch\n",
"import gc"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "7UyjFdRZoIAS"
},
"outputs": [],
"source": [
"# --- Schema Definition ---\n",
"SCHEMA = [\n",
" (\"Name\", \"TEXT\", '\"Northern Cafe\"'),\n",
" (\"Location\", \"TEXT\", '\"2904 S Figueroa St, Los Angeles, CA 90007\"'),\n",
" (\"Type\", \"TEXT\", 'One of [\"Chinese\",\"Mexico\",\"French\",\"Korean\",\"Italy\"] or other potential types'),\n",
" (\"Average Price\", \"TEXT\", '\"$30\", or \"--\" if unkown'),\n",
" (\"History/Age\", \"INT\", 'integer age of resturant, e.g., 7'),\n",
" (\"Menu\", \"Array\", '[\"Beef Noodle\", \"Fried Rice\", \"Dumpling\", ...]'),\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "jXcTQATLoICV"
},
"outputs": [],
"source": [
"# Default schema text for the textbox\n",
"DEFAULT_SCHEMA_TEXT = \"\\n\".join([f\"{i+1}. {col[0]} ({col[1]}) Example: {col[2]}\" for i, col in enumerate(SCHEMA)])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "4Irf5JV3oIEe"
},
"outputs": [],
"source": [
"# Available models\n",
"MODELS = [\n",
" \"gpt-4o-mini\",\n",
" \"claude-3-haiku-20240307\",\n",
" \"gemini-2.0-flash\",\n",
" \"deepseek-chat\",\n",
" \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "JJ6r2SH9oIGf"
},
"outputs": [],
"source": [
"# Available file formats\n",
"FILE_FORMATS = [\".csv\", \".tsv\", \".jsonl\", \".parquet\", \".arrow\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "B98j45E3vq5g"
},
"outputs": [],
"source": [
"system_prompt = \"\"\"You are a helpful assistant whose main purpose is to generate datasets for a given business problem based on given schema.\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "lsX16cWfwf6x"
},
"outputs": [],
"source": [
"def get_env_info(env):\n",
" try:\n",
" global hf_token, openai_api_key, anthropic_api_key, google_api_key, deepseek_api_key\n",
" if env == \"Colab\":\n",
" # Colab environment\n",
" from google.colab import drive\n",
" from google.colab import userdata\n",
" hf_token = userdata.get('HF_TOKEN')\n",
" openai_api_key = userdata.get('OPENAI_API_KEY')\n",
" anthropic_api_key = userdata.get('ANTHROPIC_API_KEY')\n",
" google_api_key = userdata.get('GOOGLE_API_KEY')\n",
" deepseek_api_key = userdata.get('DEEPSEEK_API_KEY')\n",
" elif env == \"Local\":\n",
" # Local environment\n",
" load_dotenv(override=True)\n",
" hf_token = os.getenv('HF_TOKEN')\n",
" openai_api_key = os.getenv('OPENAI_API_KEY')\n",
" anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')\n",
" google_api_key = os.getenv('GOOGLE_API_KEY')\n",
" deepseek_api_key = os.getenv('DEEPSEEK_API_KEY')\n",
" except Exception as e:\n",
" raise Exception(f\"Please check your environment: {str(e)}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "2gLUFAwGv29Q"
},
"outputs": [],
"source": [
"def get_prompt(schema_text, business_problem, nr_records):\n",
" prompt = f\"\"\"\n",
" The problem is: {business_problem}\n",
"\n",
" Generate {nr_records} rows data in JSONL format, each line a JSON object with the following fields:\n",
"\n",
" {schema_text}\n",
"\n",
" Do NOT repeat column values from one row to another.\n",
"\n",
" Only output valid JSONL.\n",
" \"\"\"\n",
" return prompt.strip()"
]
},
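{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Quick preview (illustrative): inspect the prompt that get_prompt() renders\n",
"# before wiring it into the UI. The schema text and record count here are just example values.\n",
"print(get_prompt(DEFAULT_SCHEMA_TEXT, \"I want to generate restaurant records\", 20))"
]
},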
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "YZe1FVH8wf84"
},
"outputs": [],
"source": [
"# --- LLM Interface ---\n",
"def query(user_prompt, model):\n",
" try:\n",
" if \"gpt\" in model.lower():\n",
" client = OpenAI(api_key=openai_api_key)\n",
" messages = [\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ]\n",
" response = client.chat.completions.create(\n",
" model=model,\n",
" messages=messages,\n",
" temperature=0.7\n",
" )\n",
" content = response.choices[0].message.content\n",
"\n",
" elif \"claude\" in model.lower():\n",
" client = anthropic.Anthropic(api_key=anthropic_api_key)\n",
" response = client.messages.create(\n",
" model=model,\n",
" messages=[{\"role\": \"user\", \"content\": user_prompt}],\n",
" max_tokens=4000,\n",
" temperature=0.7,\n",
" system=system_prompt\n",
" )\n",
" content = response.content[0].text\n",
" elif \"gemini\" in model.lower():\n",
" client = OpenAI(\n",
" api_key=google_api_key,\n",
" base_url=\"https://generativelanguage.googleapis.com/v1beta/openai/\"\n",
" )\n",
" messages = [\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ]\n",
" response = client.chat.completions.create(\n",
" model=model,\n",
" messages=messages,\n",
" temperature=0.7\n",
" )\n",
" content = response.choices[0].message.content\n",
"\n",
" elif \"deepseek\" in model.lower():\n",
" client = OpenAI(\n",
" api_key=deepseek_api_key,\n",
" base_url=\"https://api.deepseek.com\"\n",
" )\n",
" messages = [\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ]\n",
" response = client.chat.completions.create(\n",
" model=model,\n",
" messages=messages,\n",
" temperature=0.7\n",
" )\n",
" content = response.choices[0].message.content\n",
"\n",
" elif \"llama\" in model.lower():\n",
" global tokenizer, inputs, llama_model, outputs\n",
" messages = [\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ]\n",
"\n",
" login(hf_token, add_to_git_credential=True)\n",
" quant_config = BitsAndBytesConfig(\n",
" load_in_4bit=True,\n",
" bnb_4bit_use_double_quant=True,\n",
" bnb_4bit_compute_dtype=torch.bfloat16,\n",
" bnb_4bit_quant_type=\"nf4\"\n",
" )\n",
"\n",
" tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)\n",
" tokenizer.pad_token = tokenizer.eos_token\n",
" inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(\"cuda\")\n",
" if llama_model == None:\n",
" llama_model = AutoModelForCausalLM.from_pretrained(model, device_map=\"auto\", quantization_config=quant_config)\n",
" outputs = llama_model.generate(inputs, max_new_tokens=4000)\n",
"\n",
" _, _, after = tokenizer.decode(outputs[0]).partition(\"assistant<|end_header_id|>\")\n",
" content = after.strip()\n",
" else:\n",
" raise ValueError(f\"Unsupported model. Use one of {MODELS}\")\n",
"\n",
" # Parse JSONL output\n",
" lines = [line.strip() for line in content.strip().splitlines() if line.strip().startswith(\"{\")]\n",
" return [json.loads(line) for line in lines]\n",
"\n",
" except Exception as e:\n",
" raise Exception(f\"Model query failed: {str(e)}\")"
]
},
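{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional smoke test (a sketch, not required for the UI below): call query() directly.\n",
"# Assumes the matching API key is configured for the chosen environment; uncomment to try.\n",
"# get_env_info(\"Colab\")  # or \"Local\"\n",
"# records = query(get_prompt(DEFAULT_SCHEMA_TEXT, \"restaurant records\", 12), \"gpt-4o-mini\")\n",
"# records[:2]"
]
},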
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "4WUj-XqM5IYT"
},
"outputs": [],
"source": [
"# --- Output Formatter ---\n",
"def save_dataset(records, file_format, filename):\n",
" df = pd.DataFrame(records)\n",
" if file_format == \".csv\":\n",
" df.to_csv(filename, index=False)\n",
" elif file_format == \".tsv\":\n",
" df.to_csv(filename, sep=\"\\t\", index=False)\n",
" elif file_format == \".jsonl\":\n",
" with open(filename, \"w\") as f:\n",
" for record in records:\n",
" f.write(json.dumps(record) + \"\\n\")\n",
" elif file_format == \".parquet\":\n",
" df.to_parquet(filename, engine=\"pyarrow\", index=False)\n",
" elif file_format == \".arrow\":\n",
" table = pa.Table.from_pandas(df)\n",
" with pa.OSFile(filename, \"wb\") as sink:\n",
" with pa.ipc.new_file(sink, table.schema) as writer:\n",
" writer.write(table)\n",
" else:\n",
" raise ValueError(\"Unsupported file format\")"
]
},
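{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check (a sketch): read a saved dataset back to confirm it round-trips.\n",
"# Assumes a file named restaurant_dataset.csv was generated; adjust the name/format to match.\n",
"# df_check = pd.read_csv(\"restaurant_dataset.csv\")\n",
"# For .parquet use pd.read_parquet(...); for .arrow use pa.ipc.open_file(...).read_pandas()\n",
"# df_check.head()"
]
},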
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "WenbNqrpwf-_"
},
"outputs": [],
"source": [
"# --- Main Generation Function ---\n",
"def generate_dataset(schema_text, business_problem, model, nr_records, file_format, save_as, env):\n",
" try:\n",
" # Validation\n",
" if nr_records <= 10:\n",
" return \"❌ Error: Number of records must be greater than 10.\", None\n",
" if nr_records > 1000:\n",
" return \"❌ Error: Number of records must be less than or equal to 1000.\", None\n",
"\n",
" if file_format not in FILE_FORMATS:\n",
" return \"❌ Error: Invalid file format.\", None\n",
"\n",
" if not (save_as or save_as.strip() == \"\"):\n",
" save_as = f\"default{file_format}\"\n",
" elif not save_as.endswith(file_format):\n",
" save_as = save_as + file_format\n",
"\n",
" # Load env\n",
" get_env_info(env)\n",
"\n",
" # Generate prompt\n",
" user_prompt = get_prompt(schema_text, business_problem, nr_records)\n",
"\n",
" # Query model\n",
" records = query(user_prompt, model)\n",
"\n",
" if not records:\n",
" return \"❌ Error: No valid records generated from the model.\", None\n",
"\n",
" # Save dataset\n",
" save_dataset(records, file_format, save_as)\n",
"\n",
" # Create preview\n",
" df = pd.DataFrame(records)\n",
" preview = df.head(10) # Show first 10 rows\n",
"\n",
" success_message = f\"✅ Generated {len(records)} records successfully!\\n📁 Saved to: {save_as}\\n📊 \"\n",
"\n",
" return success_message, preview\n",
"\n",
" except Exception as e:\n",
" return f\"❌ Error: {str(e)}\", None"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "pHiP8ky8wgEb"
},
"outputs": [],
"source": [
"# --- Gradio Interface ---\n",
"\n",
"with gr.Blocks(title=\"Dataset Generator\", theme=gr.themes.Citrus()) as interface:\n",
" hf_token = None\n",
" openai_api_key = None\n",
" anthropic_api_key = None\n",
" google_api_key = None\n",
" deepseek_api_key = None\n",
" tokenizer = None\n",
" inputs = None\n",
" llama_model = None\n",
" outputs = None\n",
"\n",
" gr.Markdown(\"# Dataset Generator\")\n",
" gr.Markdown(\"Generate synthetic datasets using AI models\")\n",
"\n",
" with gr.Row():\n",
" with gr.Column(scale=2):\n",
" schema_input = gr.Textbox(\n",
" label=\"Schema\",\n",
" value=DEFAULT_SCHEMA_TEXT,\n",
" lines=15,\n",
" placeholder=\"Define your dataset schema here... Please follow this format: Field_Name, Field_Type, Field Example\"\n",
" )\n",
"\n",
" business_problem_input = gr.Textbox(\n",
" label=\"Business Problem\",\n",
" value=\"I want to generate restuant records\",\n",
" lines=1,\n",
" placeholder=\"Enter business problem desciption for the model...\"\n",
" )\n",
"\n",
" with gr.Row():\n",
" model_dropdown = gr.Dropdown(\n",
" label=\"Model\",\n",
" choices=MODELS,\n",
" value=MODELS[0],\n",
" interactive=True\n",
" )\n",
"\n",
" nr_records_input = gr.Number(\n",
" label=\"Number of records\",\n",
" value=27,\n",
" minimum=11,\n",
" maximum=1000,\n",
" step=1\n",
" )\n",
"\n",
" with gr.Row():\n",
" save_as_input = gr.Textbox(\n",
" label=\"Save as\",\n",
" value=\"restaurant_dataset\",\n",
" placeholder=\"Enter filename (extension will be added automatically)\"\n",
" )\n",
"\n",
" file_format_dropdown = gr.Dropdown(\n",
" label=\"File format\",\n",
" choices=FILE_FORMATS,\n",
" value=FILE_FORMATS[0],\n",
" interactive=True\n",
" )\n",
"\n",
" env_dropdown = gr.Dropdown(\n",
" label=\"Environment\",\n",
" choices=[\"Colab\", \"Local\"],\n",
" value=\"Colab\",\n",
" interactive=True\n",
" )\n",
"\n",
"\n",
"\n",
" generate_btn = gr.Button(\"🚀 Generate\", variant=\"secondary\", size=\"lg\")\n",
"\n",
" with gr.Column(scale=1):\n",
" output_status = gr.Textbox(\n",
" label=\"Status\",\n",
" lines=4,\n",
" interactive=False\n",
" )\n",
"\n",
" output_preview = gr.Dataframe(\n",
" label=\"Preview (First 10 rows)\",\n",
" interactive=False,\n",
" wrap=True\n",
" )\n",
"\n",
" # Connect the generate button\n",
" generate_btn.click(\n",
" fn=generate_dataset,\n",
" inputs=[\n",
" schema_input,\n",
" business_problem_input,\n",
" model_dropdown,\n",
" nr_records_input,\n",
" file_format_dropdown,\n",
" save_as_input,\n",
" env_dropdown\n",
" ],\n",
" outputs=[output_status, output_preview]\n",
" )\n",
"\n",
" gr.Markdown(\"\"\"\n",
" ### 📝 Instructions:\n",
" 1. **Schema**: Define the structure of your dataset (pre-filled with restaurant schema)\n",
" 2. **Business problem**: User prompt to guide the AI model\n",
" 3. **Model**: Choose between GPT, Claude, Gemini, DeepSeek or Llama models\n",
" 4. **Number of records**: Number of records to generate (minimum 11)\n",
" 5. **File format**: Choose output format (.csv, .tsv, .jsonl, .parquet, .arrow)\n",
" 6. **Save as**: Filename (extension added automatically)\n",
" 7. Click **Generate** to create your dataset\n",
"\n",
" ### 🔧 Requirements:\n",
" - For local mode, set up HF token and API keys in `.env` file (`OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `GOOGLE_API_KEY`, `DEEPSEEK_API_KEY`, `HF_TOKEN`)\n",
" - For colab mode, set up HF token and API keys in Colab secret section (`OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `GOOGLE_API_KEY`, `DEEPSEEK_API_KEY`, `HF_TOKEN`)\n",
" \"\"\")\n",
"\n",
"interface.launch(debug=True)\n",
"\n",
"del tokenizer, inputs, llama_model, outputs\n",
"gc.collect()\n",
"torch.cuda.empty_cache()"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 4
}


@@ -0,0 +1,523 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "It89APiAtTUF"
},
"source": [
"# Create meeting minutes from an Audio file\n",
"\n",
"I downloaded some Denver City Council meeting minutes and selected a portion of the meeting for us to transcribe. You can download it here: \n",
"https://drive.google.com/file/d/1N_kpSojRR5RYzupz6nqM8hMSoEF_R7pU/view?usp=sharing\n",
"\n",
"If you'd rather work with the original data, the HuggingFace dataset is [here](https://huggingface.co/datasets/huuuyeah/meetingbank) and the audio can be downloaded [here](https://huggingface.co/datasets/huuuyeah/MeetingBank_Audio/tree/main).\n",
"\n",
"The goal of this product is to use the Audio to generate meeting minutes, including actions.\n",
"\n",
"For this project, you can either use the Denver meeting minutes, or you can record something of your own!\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "sJPSCwPX3MOV"
},
"source": [
"## Again - please note: 2 important pro-tips for using Colab:\n",
"\n",
"**Pro-tip 1:**\n",
"\n",
"The top of every colab has some pip installs. You may receive errors from pip when you run this, such as:\n",
"\n",
"> gcsfs 2025.3.2 requires fsspec==2025.3.2, but you have fsspec 2025.3.0 which is incompatible.\n",
"\n",
"These pip compatibility errors can be safely ignored; and while it's tempting to try to fix them by changing version numbers, that will actually introduce real problems!\n",
"\n",
"**Pro-tip 2:**\n",
"\n",
"In the middle of running a Colab, you might get an error like this:\n",
"\n",
"> Runtime error: CUDA is required but not available for bitsandbytes. Please consider installing [...]\n",
"\n",
"This is a super-misleading error message! Please don't try changing versions of packages...\n",
"\n",
"This actually happens because Google has switched out your Colab runtime, perhaps because Google Colab was too busy. The solution is:\n",
"\n",
"1. Kernel menu >> Disconnect and delete runtime\n",
"2. Reload the colab from fresh and Edit menu >> Clear All Outputs\n",
"3. Connect to a new T4 using the button at the top right\n",
"4. Select \"View resources\" from the menu on the top right to confirm you have a GPU\n",
"5. Rerun the cells in the colab, from the top down, starting with the pip installs\n",
"\n",
"And all should work great - otherwise, ask me!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "f2vvgnFpHpID"
},
"outputs": [],
"source": [
"!pip install -q --upgrade torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1+cu124 --index-url https://download.pytorch.org/whl/cu124\n",
"!pip install -q requests bitsandbytes==0.46.0 transformers==4.48.3 accelerate==1.3.0 openai"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "FW8nl3XRFrz0"
},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import requests\n",
"from IPython.display import Markdown, display, update_display\n",
"from openai import OpenAI\n",
"from google.colab import drive\n",
"from huggingface_hub import login\n",
"from google.colab import userdata\n",
"from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n",
"import torch"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "q3D1_T0uG_Qh"
},
"outputs": [],
"source": [
"# Constants\n",
"\n",
"AUDIO_MODEL = \"whisper-1\"\n",
"LLAMA = \"meta-llama/Meta-Llama-3.1-8B-Instruct\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Es9GkQ0FGCMt"
},
"outputs": [],
"source": [
"# New capability - connect this Colab to my Google Drive\n",
"# See immediately below this for instructions to obtain denver_extract.mp3\n",
"\n",
"drive.mount(\"/content/drive\")\n",
"audio_filename = \"/content/drive/MyDrive/llms/denver_extract.mp3\""
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "HTl3mcjyzIEE"
},
"source": [
"# Download denver_extract.mp3\n",
"\n",
"You can either use the same file as me, the extract from Denver city council minutes, or you can try your own..\n",
"\n",
"If you want to use the same as me, then please download my extract here, and put this on your Google Drive: \n",
"https://drive.google.com/file/d/1N_kpSojRR5RYzupz6nqM8hMSoEF_R7pU/view?usp=sharing\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "xYW8kQYtF-3L"
},
"outputs": [],
"source": [
"# Sign in to HuggingFace Hub\n",
"\n",
"hf_token = userdata.get('HF_TOKEN')\n",
"login(hf_token, add_to_git_credential=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "qP6OB2OeGC2C"
},
"outputs": [],
"source": [
"# Sign in to OpenAI using Secrets in Colab\n",
"\n",
"openai_api_key = userdata.get('OPENAI_API_KEY')\n",
"openai = OpenAI(api_key=openai_api_key)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "GMShdVGlGGr4"
},
"outputs": [],
"source": [
"# Use the Whisper OpenAI model to convert the Audio to Text\n",
"# If you'd prefer to use an Open Source model, class student Youssef has contributed an open source version\n",
"# which I've added to the bottom of this colab\n",
"\n",
"audio_file = open(audio_filename, \"rb\")\n",
"transcription = openai.audio.transcriptions.create(model=AUDIO_MODEL, file=audio_file, response_format=\"text\")\n",
"print(transcription)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "piEMmcSfMH-O"
},
"outputs": [],
"source": [
"system_message = \"You are an assistant that produces minutes of meetings from transcripts, with summary, key discussion points, takeaways and action items with owners, in markdown.\"\n",
"user_prompt = f\"Below is an extract transcript of a Denver council meeting. Please write minutes in markdown, including a summary with attendees, location and date; discussion points; takeaways; and action items with owners.\\n{transcription}\"\n",
"\n",
"messages = [\n",
" {\"role\": \"system\", \"content\": system_message},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "UcRKUgcxMew6"
},
"outputs": [],
"source": [
"quant_config = BitsAndBytesConfig(\n",
" load_in_4bit=True,\n",
" bnb_4bit_use_double_quant=True,\n",
" bnb_4bit_compute_dtype=torch.bfloat16,\n",
" bnb_4bit_quant_type=\"nf4\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "6CujZRAgMimy"
},
"outputs": [],
"source": [
"tokenizer = AutoTokenizer.from_pretrained(LLAMA)\n",
"tokenizer.pad_token = tokenizer.eos_token\n",
"# inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(\"cuda\")\n",
"streamer = TextStreamer(tokenizer)\n",
"model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map=\"auto\", quantization_config=quant_config, trust_remote_code=True)\n",
"# outputs = model.generate(inputs, max_new_tokens=2000, streamer=streamer)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "MaLNmJ5PSqcH"
},
"outputs": [],
"source": [
"inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(\"cuda\")\n",
"outputs = model.generate(inputs, max_new_tokens=2000, streamer=streamer)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "102tdU_3Peam"
},
"outputs": [],
"source": [
"response = tokenizer.decode(outputs[0])\n",
"response"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "KlomN6CwMdoN"
},
"outputs": [],
"source": [
"display(Markdown(response))"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "0jZElVOMSPAr"
},
"source": [
"Day5 exercise - Gradio UI for meeting minutes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "5iiYYxQMHf0i"
},
"outputs": [],
"source": [
"import gradio as gr\n",
"import tempfile\n",
"import soundfile as sf"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "aGwXW7BjPcTM"
},
"outputs": [],
"source": [
"# !pip install pydub\n",
"# !apt-get install ffmpeg\n",
"\n",
"from pydub import AudioSegment"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "RNu-reHuCYj_"
},
"outputs": [],
"source": [
"# Make sure that the tokenizeer and model is already generated\n",
"\n",
"# tokenizer = AutoTokenizer.from_pretrained(LLAMA)\n",
"# tokenizer.pad_token = tokenizer.eos_token\n",
"# streamer = TextStreamer(tokenizer)\n",
"# model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map=\"auto\", quantization_config=quant_config)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "KOuoH0YOPruE"
},
"outputs": [],
"source": [
"# def save_as_mp3(audio_np):\n",
"# sr, data = audio_np\n",
"# # Convert float32 or int16 to PCM wav and then mp3\n",
"# wav_path = tempfile.NamedTemporaryFile(suffix=\".wav\", delete=False).name\n",
"# mp3_path = tempfile.NamedTemporaryFile(suffix=\".mp3\", delete=False).name\n",
"\n",
"# sf.write(wav_path, data, sr)\n",
"# audio_segment = AudioSegment.from_wav(wav_path)\n",
"# audio_segment.export(mp3_path, format=\"mp3\", bitrate=\"64k\") # Low bitrate = small file\n",
"# return mp3_path"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "toBIPBJoSNw0"
},
"outputs": [],
"source": [
"# Handles audio input as numpy array and returns updated chat history\n",
"def speak_send(audio_np):\n",
"\n",
" # If use numpy as input: audio_input = gr.Audio(sources=\"upload\", type=\"numpy\", label=\"Upload audio file to generate meeting minutes\")\n",
" # mp3_path = save_as_mp3(audio_np)\n",
"\n",
" # with open(mp3_path, \"rb\") as audio_file:\n",
" # transcription = openai.audio.transcriptions.create(\n",
" # model=AUDIO_MODEL,\n",
" # file=audio_file,\n",
" # response_format=\"text\"\n",
" # )\n",
"\n",
" audio = AudioSegment.from_file(audio_np)\n",
" with tempfile.NamedTemporaryFile(suffix=\".mp3\", delete=False) as tmpfile:\n",
" audio.export(tmpfile.name, format=\"mp3\")\n",
" with open(tmpfile.name, \"rb\") as file:\n",
" transcript = openai.audio.transcriptions.create(\n",
" model=AUDIO_MODEL,\n",
" file=file,\n",
" response_format=\"text\"\n",
" )\n",
"\n",
" system_message = \"You are an assistant that produces minutes of meetings from transcripts, with summary, key discussion points, takeaways and action items with owners, in markdown.\"\n",
" user_prompt = f\"Below is an extract transcript of a Denver council meeting. Please write minutes in markdown, including a summary with attendees, location and date; discussion points; takeaways; and action items with owners.\\n{transcription}\"\n",
"\n",
" messages = [\n",
" {\"role\": \"system\", \"content\": system_message},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ]\n",
"\n",
" inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(\"cuda\")\n",
" outputs = model.generate(inputs, max_new_tokens=2000)\n",
"\n",
" _, _, after = tokenizer.decode(outputs[0]).partition(\"assistant<|end_header_id|>\")\n",
" return after.strip()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "xXJfabpDSN5R"
},
"outputs": [],
"source": [
"with gr.Blocks() as demo:\n",
"\n",
" with gr.Row():\n",
" audio_input = gr.Audio(sources=\"upload\", type=\"filepath\", label=\"Upload audio file to generate meeting minutes\")\n",
" with gr.Row():\n",
" audio_submit = gr.Button(\"Send\")\n",
" with gr.Row():\n",
" outputs = [gr.Markdown(label=\"Meeting minutes:\")]\n",
"\n",
" audio_submit.click(speak_send, inputs=audio_input, outputs=outputs)\n",
"\n",
"demo.launch(debug=True)\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "kuxYecT2QDQ9"
},
"source": [
"# Student contribution\n",
"\n",
"Student Emad S. has made this powerful variation that uses `TextIteratorStreamer` to stream back results into a Gradio UI, and takes advantage of background threads for performance! I'm sharing it here if you'd like to take a look at some very interesting work. Thank you, Emad!\n",
"\n",
"https://colab.research.google.com/drive/1Ja5zyniyJo5y8s1LKeCTSkB2xyDPOt6D"
]
},
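{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A minimal sketch of that streaming pattern (my own illustration, not Emad's code):\n",
"# model.generate() runs in a background thread while the main thread consumes tokens\n",
"# from a TextIteratorStreamer, which is the shape a Gradio streaming callback expects.\n",
"# Assumes the tokenizer and model created earlier in this notebook.\n",
"from threading import Thread\n",
"from transformers import TextIteratorStreamer\n",
"\n",
"def stream_minutes(messages):\n",
"    inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(\"cuda\")\n",
"    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)\n",
"    Thread(target=model.generate, kwargs=dict(inputs=inputs, max_new_tokens=2000, streamer=streamer)).start()\n",
"    reply = \"\"\n",
"    for chunk in streamer:\n",
"        reply += chunk\n",
"        yield reply  # each partial result re-renders in the UI"
]
},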
{
"cell_type": "markdown",
"metadata": {
"id": "AU3uAEyU3a-o"
},
"source": [
"## Alternative implementation\n",
"\n",
"Class student Youssef has contributed this variation in which we use an open-source model to transcribe the meeting Audio.\n",
"\n",
"Thank you Youssef!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "phYYgAbBRvu5"
},
"outputs": [],
"source": [
"import torch\n",
"from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "HdQnWEzW3lzP"
},
"outputs": [],
"source": [
"AUDIO_MODEL = \"openai/whisper-medium\"\n",
"speech_model = AutoModelForSpeechSeq2Seq.from_pretrained(AUDIO_MODEL, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_safetensors=True)\n",
"speech_model.to('cuda')\n",
"processor = AutoProcessor.from_pretrained(AUDIO_MODEL)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "ZhA_fbeCSAeZ"
},
"outputs": [],
"source": [
"pipe = pipeline(\n",
" \"automatic-speech-recognition\",\n",
" model=speech_model,\n",
" tokenizer=processor.tokenizer,\n",
" feature_extractor=processor.feature_extractor,\n",
" torch_dtype=torch.float16,\n",
" device='cuda',\n",
" return_timestamps=True\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "nrQjKtD53omJ"
},
"outputs": [],
"source": [
"# Use the Whisper OpenAI model to convert the Audio to Text\n",
"result = pipe(audio_filename)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "G_XSljOY3tDf"
},
"outputs": [],
"source": [
"transcription = result[\"text\"]\n",
"print(transcription)"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 4
}