{ "cells": [ { "cell_type": "markdown", "id": "18b82c6b-10dc-4d94-b8dc-592ff011ce2b", "metadata": {}, "source": [ "# Meeting minutes creator\n", "\n", "In this colab, we make a meeting minutes program.\n", "\n", "It includes useful code to connect your Google Drive to your colab.\n", "\n", "Upload your own audio to make this work!!\n", "\n", "https://colab.research.google.com/drive/13wR4Blz3Ot_x0GOpflmvvFffm5XU3Kct?usp=sharing\n", "\n", "This should run nicely on a low-cost or free T4 box.\n", "\n", "## **Assignment:**\n", "Put Everything into a nice Gradio UI (similar to last week)\n", "Input file name of audio to process.\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "e9289ba7-200c-43a9-b67a-c5ce826c9537", "metadata": {}, "outputs": [], "source": [ "# imports\n", "import re, requests, json, tempfile, gradio as gr, torch, os\n", "from bs4 import BeautifulSoup\n", "from IPython.display import Markdown, display, update_display\n", "from google.colab import drive, userdata\n", "from huggingface_hub import login\n", "from openai import OpenAI\n", "from pydub import AudioSegment\n", "from pydub.playback import play\n", "from io import BytesIO\n", "from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n", "\n", "# Sign in to HuggingFace Hub\n", "hf_token = userdata.get('HF_TOKEN')\n", "login(hf_token, add_to_git_credential=True)\n", "\n", "# Sign in to OpenAI using Secrets in Colab\n", "openai_api_key = userdata.get('OPENAI_API_KEY')\n", "\n", "# Initialize client\n", "try:\n", " openai = OpenAI(api_key=openai_api_key)\n", "except Exception as e:\n", " openai = None\n", " print(f\"OpenAI client not initialized: {e}\")\n", "\n", "# Constants\n", "AUDIO_MODEL = \"whisper-1\"\n", "LLAMA = \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n", "\n", "# Google Drive\n", "drive.mount(\"/content/drive\")\n", "\n", "# Local LLM setup (Llama 3.1)\n", "try:\n", " quant_config = BitsAndBytesConfig(\n", " load_in_4bit=True,\n", " bnb_4bit_use_double_quant=True,\n", " bnb_4bit_compute_dtype=torch.bfloat16,\n", " bnb_4bit_quant_type=\"nf4\"\n", " )\n", " tokenizer = AutoTokenizer.from_pretrained(LLAMA)\n", "\n", " # Set the pad token to the end-of-sequence token for generation\n", " tokenizer.pad_token = tokenizer.eos_token\n", "\n", " model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map=\"auto\", quantization_config=quant_config)\n", " # model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map=\"auto\", torch_dtype=torch.bfloat16, quantization_config=quant_config, trust_remote_code=True)\n", "\n", " model.eval() # Set model to evaluation mode\n", "except Exception as e:\n", " # If the local model fails to load, set variables to None\n", " model = None\n", " tokenizer = None\n", " print(f\"Failed to load local model: {e}\")\n", "\n", "# Updated function to handle audio transcription\n", "def transcribe_audio(audio_file):\n", " \"\"\"\n", " Transcribes an audio file to text using OpenAI's Whisper model.\n", " Handles both local file paths and mounted Google Drive file paths.\n", " \"\"\"\n", " if not openai:\n", " return \"OpenAI client not initialized. 
    "# Function to handle audio transcription\n",
    "def transcribe_audio(audio_file):\n",
    "    \"\"\"\n",
    "    Transcribes an audio file to text using OpenAI's Whisper model.\n",
    "    Handles both local file paths and mounted Google Drive file paths.\n",
    "    \"\"\"\n",
    "    if not openai:\n",
    "        return \"OpenAI client not initialized. Please check your API key.\"\n",
    "\n",
    "    if audio_file is None:\n",
    "        return \"No audio input provided.\"\n",
    "\n",
    "    # Work out which file to open:\n",
    "    # - microphone input arrives as a temporary file path\n",
    "    # - textbox input may be a full path or just a filename in MyDrive/llms\n",
    "    if audio_file.startswith(\"/content/drive/MyDrive/llms/\"):\n",
    "        file_path_to_open = audio_file\n",
    "    else:\n",
    "        # Check MyDrive/llms first, then fall back to treating the input as a local path\n",
    "        gdrive_path_attempt = os.path.join(\"/content/drive/MyDrive/llms\", os.path.basename(audio_file))\n",
    "        if os.path.exists(gdrive_path_attempt):\n",
    "            file_path_to_open = gdrive_path_attempt\n",
    "        elif os.path.exists(audio_file):\n",
    "            file_path_to_open = audio_file\n",
    "        else:\n",
    "            return f\"File not found: {audio_file}. Please ensure the file exists in your Google Drive at /content/drive/MyDrive/llms/ or is a valid local path.\"\n",
    "\n",
    "    if not os.path.exists(file_path_to_open):\n",
    "        return f\"File not found: {file_path_to_open}. Please ensure the file exists.\"\n",
    "\n",
    "    try:\n",
    "        with open(file_path_to_open, \"rb\") as f:\n",
    "            transcription = openai.audio.transcriptions.create(\n",
    "                model=AUDIO_MODEL,\n",
    "                file=f,\n",
    "                response_format=\"text\"\n",
    "            )\n",
    "        return transcription\n",
    "    except Exception as e:\n",
    "        return f\"An error occurred during transcription: {e}\"\n",
    "\n",
    "def generate_minutes(transcription):\n",
    "    \"\"\"\n",
    "    Generates meeting minutes from a transcript using the local Llama model.\n",
    "    Formats the input, generates a response, and returns the complete text string.\n",
    "    \"\"\"\n",
    "    # Check that the local model and tokenizer were successfully loaded\n",
    "    if not model or not tokenizer:\n",
    "        return \"Local Llama model not loaded. Check model paths and hardware compatibility.\"\n",
    "\n",
    "    system_message = \"You are an assistant that produces minutes of meetings from transcripts, with summary, key discussion points, takeaways and action items with owners, in markdown.\"\n",
    "    user_prompt = f\"Below is a transcript of an audio recording. Please write minutes in markdown, including a summary with attendees, location and date; discussion points; takeaways; and action items with owners.\\n{transcription}\"\n",
    "\n",
    "    messages = [\n",
    "        {\"role\": \"system\", \"content\": system_message},\n",
    "        {\"role\": \"user\", \"content\": user_prompt}\n",
    "    ]\n",
    "\n",
    "    try:\n",
    "        # Apply the chat template, appending the assistant header so the model replies as the assistant\n",
    "        inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors=\"pt\").to(model.device)\n",
    "\n",
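    "        # Optional (a sketch): to stream tokens to the notebook output as they are generated,\n",
    "        # one could pass a TextStreamer, which is already imported above:\n",
    "        #   streamer = TextStreamer(tokenizer, skip_prompt=True)\n",
    "        #   outputs = model.generate(inputs, max_new_tokens=2000, streamer=streamer)\n",
    "\n",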
    "        # Generate the minutes; max_new_tokens caps the length of the generated text\n",
    "        outputs = model.generate(inputs, max_new_tokens=2000)\n",
    "\n",
    "        # Decode only the newly generated tokens (everything after the prompt) to a human-readable string\n",
    "        response_text = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)\n",
    "\n",
    "        return response_text\n",
    "\n",
    "    except Exception as e:\n",
    "        return f\"An error occurred during local model generation: {e}\"\n",
    "\n",
    "# Gradio UI components\n",
    "with gr.Blocks() as ui:\n",
    "    gr.Markdown(\"# Meeting Minutes Generator\")\n",
    "    with gr.Row():\n",
    "        chatbot = gr.Chatbot(height=500, label=\"AI Assistant\")\n",
    "    with gr.Row():\n",
    "        entry = gr.Textbox(label=\"Provide the filename or path of the audio file to transcribe:\", scale=4)\n",
    "        submit_btn = gr.Button(\"Generate Minutes\", scale=1)\n",
    "    with gr.Row():\n",
    "        audio_input = gr.Audio(sources=[\"microphone\"], type=\"filepath\", label=\"Or speak to our AI Assistant to transcribe\", scale=4)\n",
    "        submit_audio_btn = gr.Button(\"Transcribe Audio\", scale=1)\n",
    "\n",
    "    with gr.Row():\n",
    "        clear = gr.Button(\"Clear\")\n",
    "\n",
    "    def process_file_and_generate(file_path, history):\n",
    "        transcribed_text = transcribe_audio(file_path)\n",
    "        minutes = generate_minutes(transcribed_text)\n",
    "        new_history = history + [[f\"Transcription of '{os.path.basename(file_path)}':\\n{transcribed_text}\", minutes]]\n",
    "        return new_history\n",
    "\n",
    "    def process_audio_and_generate(audio_file, history):\n",
    "        transcribed_text = transcribe_audio(audio_file)\n",
    "        minutes = generate_minutes(transcribed_text)\n",
    "        new_history = history + [[f\"Transcription of your recording:\\n{transcribed_text}\", minutes]]\n",
    "        return new_history\n",
    "\n",
    "    submit_btn.click(\n",
    "        process_file_and_generate,\n",
    "        inputs=[entry, chatbot],\n",
    "        outputs=[chatbot],\n",
    "        queue=False\n",
    "    )\n",
    "\n",
    "    submit_audio_btn.click(\n",
    "        process_audio_and_generate,\n",
    "        inputs=[audio_input, chatbot],\n",
    "        outputs=[chatbot],\n",
    "        queue=False\n",
    "    )\n",
    "\n",
    "    clear.click(lambda: None, inputs=None, outputs=[chatbot], queue=False)\n",
    "\n",
    "ui.launch(inbrowser=True, debug=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cd2020d3",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}