Improvements to descriptions and links

Edward Donner
2025-03-14 09:16:31 -04:00
parent 3a2eb97cf2
commit c80065df86
19 changed files with 948 additions and 821 deletions

View File

@@ -1,150 +1,160 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "T4",
"authorship_tag": "ABX9TyPtAT7Yq5xd4vDcJEZtg69J"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "6gGKXU5RXORf"
},
"outputs": [],
"source": [
"# getting the latest transformers first, since this will require a restart\n",
"\n",
"!pip install git+https://github.com/huggingface/transformers.git"
]
},
"cells": [
{
"cell_type": "code",
"source": [
"# getting the latest transformers first, since this will require a restart\n",
"\n",
"!pip install git+https://github.com/huggingface/transformers.git"
],
"metadata": {
"id": "6gGKXU5RXORf"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# imports\n",
"\n",
"import torch\n",
"from google.colab import userdata\n",
"from huggingface_hub import login\n",
"from transformers import AutoProcessor, AutoModelForImageTextToText\n",
"from google.colab import files"
],
"metadata": {
"id": "yCRrF4aiXPPo"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# logging in to HF\n",
"\n",
"hf_token = userdata.get('HF_TOKEN')\n",
"login(hf_token, add_to_git_credential=True)"
],
"metadata": {
"id": "AAlOQuCbXcrv"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "_RRVc2j2Vun-"
},
"outputs": [],
"source": [
"# this will start an input prompt for uploading local files\n",
"\n",
"uploaded = files.upload()\n",
"print(uploaded.keys()) # this will look sth like dict_keys([\"note2.jpg\"])"
]
},
{
"cell_type": "code",
"source": [
"'''\n",
"ChatGPT and Gemini explain the following part roughly like so:\n",
"The string contained in image_path is the key of the entry in the dictionary of uploaded files (see box above).\n",
"The value to that key contains the image in binary format.\n",
"The \"with open(image_path, \"wb\") as f\" part means: Create a new file \"note2.jpg\" on the server, and write to it in binary mode (\"wb\").\n",
"f.write(image) writes the binary image to that new file. \"note2.jpg\" aka image_path will now contain the image.\n",
"'''\n",
"\n",
"image_path = \"note2.jpg\" # update this string depending on the printout in the previous cell!\n",
"image = uploaded[image_path]\n",
"with open(image_path, \"wb\") as f:\n",
" f.write(image)"
],
"metadata": {
"id": "V_UAuSSkXBKh"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# from HF model instructions\n",
"device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
"model = AutoModelForImageTextToText.from_pretrained(\"stepfun-ai/GOT-OCR-2.0-hf\", device_map=device)\n",
"processor = AutoProcessor.from_pretrained(\"stepfun-ai/GOT-OCR-2.0-hf\")"
],
"metadata": {
"id": "AiFP-mQtXrpV"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# also from HF documentation about this model, see https://huggingface.co/stepfun-ai/GOT-OCR-2.0-hf\n",
"\n",
"image = image_path\n",
"inputs = processor(image, return_tensors=\"pt\").to(device)\n",
"\n",
"ocr = model.generate(\n",
" **inputs,\n",
" do_sample=False,\n",
" tokenizer=processor.tokenizer,\n",
" stop_strings=\"<|im_end|>\",\n",
" max_new_tokens=4096,\n",
")"
],
"metadata": {
"id": "7Adr8HB_YNf5"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# prints out the recognized text. This can read my handwriting pretty well! And it works super quick on the free T4 GPU server here.\n",
"\n",
"print(processor.decode(ocr[0, inputs[\"input_ids\"].shape[1]:], skip_special_tokens=True))"
],
"metadata": {
"id": "nRsRUIIuYdJ9"
},
"execution_count": null,
"outputs": []
}
]
}
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "yCRrF4aiXPPo"
},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import torch\n",
"from google.colab import userdata\n",
"from huggingface_hub import login\n",
"from transformers import AutoProcessor, AutoModelForImageTextToText\n",
"from google.colab import files"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "AAlOQuCbXcrv"
},
"outputs": [],
"source": [
"# logging in to HF\n",
"\n",
"hf_token = userdata.get('HF_TOKEN')\n",
"login(hf_token, add_to_git_credential=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "_RRVc2j2Vun-"
},
"outputs": [],
"source": [
"# this will start an input prompt for uploading local files\n",
"\n",
"uploaded = files.upload()\n",
"print(uploaded.keys()) # this will look sth like dict_keys([\"note2.jpg\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "V_UAuSSkXBKh"
},
"outputs": [],
"source": [
"'''\n",
"ChatGPT and Gemini explain the following part roughly like so:\n",
"The string contained in image_path is the key of the entry in the dictionary of uploaded files (see box above).\n",
"The value to that key contains the image in binary format.\n",
"The \"with open(image_path, \"wb\") as f\" part means: Create a new file \"note2.jpg\" on the server, and write to it in binary mode (\"wb\").\n",
"f.write(image) writes the binary image to that new file. \"note2.jpg\" aka image_path will now contain the image.\n",
"'''\n",
"\n",
"image_path = \"note2.jpg\" # update this string depending on the printout in the previous cell!\n",
"image = uploaded[image_path]\n",
"with open(image_path, \"wb\") as f:\n",
" f.write(image)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "AiFP-mQtXrpV"
},
"outputs": [],
"source": [
"# from HF model instructions\n",
"device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
"model = AutoModelForImageTextToText.from_pretrained(\"stepfun-ai/GOT-OCR-2.0-hf\", device_map=device)\n",
"processor = AutoProcessor.from_pretrained(\"stepfun-ai/GOT-OCR-2.0-hf\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "7Adr8HB_YNf5"
},
"outputs": [],
"source": [
"# also from HF documentation about this model, see https://huggingface.co/stepfun-ai/GOT-OCR-2.0-hf\n",
"\n",
"image = image_path\n",
"inputs = processor(image, return_tensors=\"pt\").to(device)\n",
"\n",
"ocr = model.generate(\n",
" **inputs,\n",
" do_sample=False,\n",
" tokenizer=processor.tokenizer,\n",
" stop_strings=\"<|im_end|>\",\n",
" max_new_tokens=4096,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "nRsRUIIuYdJ9"
},
"outputs": [],
"source": [
"# prints out the recognized text. This can read my handwriting pretty well! And it works super quick on the free T4 GPU server here.\n",
"\n",
"print(processor.decode(ocr[0, inputs[\"input_ids\"].shape[1]:], skip_special_tokens=True))"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"authorship_tag": "ABX9TyPtAT7Yq5xd4vDcJEZtg69J",
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@@ -1,302 +1,312 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "T4"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "It89APiAtTUF"
},
"source": [
"# Create meeting minutes from an Audio file\n",
"\n",
"I downloaded some Denver City Council meeting minutes and selected a portion of the meeting for us to transcribe. You can download it here: \n",
"https://drive.google.com/file/d/1N_kpSojRR5RYzupz6nqM8hMSoEF_R7pU/view?usp=sharing\n",
"\n",
"If you'd rather work with the original data, the HuggingFace dataset is [here](https://huggingface.co/datasets/huuuyeah/meetingbank) and the audio can be downloaded [here](https://huggingface.co/datasets/huuuyeah/MeetingBank_Audio/tree/main).\n",
"\n",
"The goal of this product is to use the Audio to generate meeting minutes, including actions.\n",
"\n",
"For this project, you can either use the Denver meeting minutes, or you can record something of your own!\n",
"\n",
"## Please note:\n",
"\n",
"When you run the pip installs in the first cell below, you might get this error - it can be safely ignored - it sounds quite severe, but it doesn't seem to affect anything else in this project!\n",
"\n",
"\n",
"> ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
"gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.\n",
"\n"
]
},
"cells": [
{
"cell_type": "markdown",
"source": [
"# Create meeting minutes from an Audio file\n",
"\n",
"I downloaded some Denver City Council meeting minutes and selected a portion of the meeting for us to transcribe. You can download it here: \n",
"https://drive.google.com/file/d/1N_kpSojRR5RYzupz6nqM8hMSoEF_R7pU/view?usp=sharing\n",
"\n",
"If you'd rather work with the original data, the HuggingFace dataset is [here](https://huggingface.co/datasets/huuuyeah/meetingbank) and the audio can be downloaded [here](https://huggingface.co/datasets/huuuyeah/MeetingBank_Audio/tree/main).\n",
"\n",
"The goal of this product is to use the Audio to generate meeting minutes, including actions.\n",
"\n",
"For this project, you can either use the Denver meeting minutes, or you can record something of your own!\n",
"\n",
"## Please note:\n",
"\n",
"When you run the pip installs in the first cell below, you might get this error - it can be safely ignored - it sounds quite severe, but it doesn't seem to affect anything else in this project!\n",
"\n",
"\n",
"> ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
"gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.\n",
"\n"
],
"metadata": {
"id": "It89APiAtTUF"
}
},
{
"cell_type": "code",
"source": [
"!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate openai httpx==0.27.2 gradio"
],
"metadata": {
"id": "f2vvgnFpHpID"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "FW8nl3XRFrz0"
},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import requests\n",
"from openai import OpenAI\n",
"from google.colab import drive\n",
"from huggingface_hub import login\n",
"from google.colab import userdata\n",
"from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n",
"import torch\n",
"import gradio as gr"
]
},
{
"cell_type": "code",
"source": [
"# Constants\n",
"\n",
"AUDIO_MODEL = \"whisper-1\"\n",
"LLAMA = \"meta-llama/Meta-Llama-3.1-8B-Instruct\""
],
"metadata": {
"id": "q3D1_T0uG_Qh"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# New capability - connect this Colab to my Google Drive\n",
"# See immediately below this for instructions to obtain denver_extract.mp3\n",
"\n",
"drive.mount(\"/content/drive\")\n",
"audio_filename = \"/content/drive/MyDrive/llms/denver_extract.mp3\""
],
"metadata": {
"id": "Es9GkQ0FGCMt"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Download denver_extract.mp3\n",
"\n",
"You can either use the same file as me, the extract from Denver city council minutes, or you can try your own..\n",
"\n",
"If you want to use the same as me, then please download my extract here, and put this on your Google Drive: \n",
"https://drive.google.com/file/d/1N_kpSojRR5RYzupz6nqM8hMSoEF_R7pU/view?usp=sharing\n"
],
"metadata": {
"id": "HTl3mcjyzIEE"
}
},
{
"cell_type": "code",
"source": [
"# Sign in to HuggingFace Hub\n",
"\n",
"hf_token = userdata.get('HF_TOKEN')\n",
"login(hf_token, add_to_git_credential=True)"
],
"metadata": {
"id": "xYW8kQYtF-3L"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Sign in to OpenAI using Secrets in Colab\n",
"\n",
"openai_api_key = userdata.get('OPENAI_API_KEY')\n",
"openai = OpenAI(api_key=openai_api_key)"
],
"metadata": {
"id": "qP6OB2OeGC2C"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Initialize Llama model and tokenizer\n",
"\n",
"quant_config = BitsAndBytesConfig(\n",
" load_in_4bit=True,\n",
" bnb_4bit_use_double_quant=True,\n",
" bnb_4bit_compute_dtype=torch.bfloat16,\n",
" bnb_4bit_quant_type=\"nf4\"\n",
")\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(LLAMA)\n",
"tokenizer.pad_token = tokenizer.eos_token\n",
"\n",
"model = AutoModelForCausalLM.from_pretrained(\n",
" LLAMA,\n",
" device_map=\"auto\",\n",
" quantization_config=quant_config\n",
")"
],
"metadata": {
"id": "hgQBeIYUyaqj"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Generate meeting minutes\n",
"\n",
"def generate_minutes(transcription, model, tokenizer, progress=gr.Progress()):\n",
" progress(0.6, desc=\"Generating meeting minutes from transcript...\")\n",
"\n",
" system_message = \"You are an assistant that produces minutes of meetings from transcripts, with summary, key discussion points, takeaways and action items with owners, in markdown.\"\n",
" user_prompt = f\"Below is an extract transcript of a meeting. Please write minutes in markdown, including a summary with attendees, location and date; discussion points; takeaways; and action items with owners.\\n{transcription}\"\n",
"\n",
" messages = [\n",
" {\"role\": \"system\", \"content\": system_message},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ]\n",
"\n",
" inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(\"cuda\")\n",
" outputs = model.generate(inputs, max_new_tokens=2000)\n",
" response = tokenizer.decode(outputs[0])\n",
"\n",
" # Clean up the response, keep only the minutes\n",
" progress(0.9, desc=\"Cleaning and formatting minutes...\")\n",
" response = response.split(\"<|end_header_id|>\")[-1].strip().replace(\"<|eot_id|>\",\"\")\n",
"\n",
" return response"
],
"metadata": {
"id": "u9aFA7tjy3Ri"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Transcribe the uploaded audio file using OpenAI's Whisper model\n",
"\n",
"def transcribe_audio(audio_path, progress=gr.Progress()):\n",
" progress(0.3, desc=\"Creating transcript from audio...\")\n",
"\n",
" try:\n",
" with open(audio_path, \"rb\") as audio_file:\n",
" transcription = openai.audio.transcriptions.create(\n",
" model=AUDIO_MODEL,\n",
" file=audio_file,\n",
" response_format=\"text\"\n",
" )\n",
" return transcription\n",
" except Exception as e:\n",
" return f\"Error during transcription: {str(e)}\""
],
"metadata": {
"id": "OEuqR90Vy4AZ"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Process the uploaded audio file, transcribe it, and generate meeting minutes\n",
"\n",
"def process_upload(audio_file, progress=gr.Progress()):\n",
" progress(0.1, desc=\"Starting process...\")\n",
"\n",
" if audio_file is None:\n",
" return \"Please upload an audio file.\"\n",
"\n",
" try:\n",
" # Check file format\n",
" if not str(audio_file).lower().endswith('.mp3'):\n",
" return \"Please upload an MP3 file.\"\n",
"\n",
" # Get transcription\n",
" transcription = transcribe_audio(audio_file)\n",
" if transcription.startswith(\"Error\"):\n",
" return transcription\n",
"\n",
" # Generate minutes\n",
" minutes = generate_minutes(transcription, model, tokenizer)\n",
" progress(1.0, desc=\"Process complete!\")\n",
" return minutes\n",
"\n",
" except Exception as e:\n",
" return f\"Error processing file: {str(e)}\""
],
"metadata": {
"id": "lmdsy2iDy5d7"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Create Gradio interface\n",
"\n",
"interface = gr.Interface(\n",
" fn=process_upload,\n",
" inputs=gr.Audio(type=\"filepath\", label=\"Upload MP3 File\", format=\"mp3\"),\n",
" outputs=gr.Markdown(label=\"Meeting Minutes\", min_height=60),\n",
" title=\"Meeting Minutes Generator\",\n",
" description=\"Upload an MP3 recording of your meeting to get AI-generated meeting minutes. This process may take a few minutes.\",\n",
" flagging_mode=\"never\"\n",
")"
],
"metadata": {
"id": "k2U2bWtey7Yo"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Launch Gradio interface\n",
"\n",
"interface.launch()"
],
"metadata": {
"id": "X3JbzRNRy9oG"
},
"execution_count": null,
"outputs": []
}
]
}
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "f2vvgnFpHpID"
},
"outputs": [],
"source": [
"!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate openai httpx==0.27.2 gradio"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "FW8nl3XRFrz0"
},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import requests\n",
"from openai import OpenAI\n",
"from google.colab import drive\n",
"from huggingface_hub import login\n",
"from google.colab import userdata\n",
"from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n",
"import torch\n",
"import gradio as gr"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "q3D1_T0uG_Qh"
},
"outputs": [],
"source": [
"# Constants\n",
"\n",
"AUDIO_MODEL = \"whisper-1\"\n",
"LLAMA = \"meta-llama/Meta-Llama-3.1-8B-Instruct\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Es9GkQ0FGCMt"
},
"outputs": [],
"source": [
"# New capability - connect this Colab to my Google Drive\n",
"# See immediately below this for instructions to obtain denver_extract.mp3\n",
"\n",
"drive.mount(\"/content/drive\")\n",
"audio_filename = \"/content/drive/MyDrive/llms/denver_extract.mp3\""
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "HTl3mcjyzIEE"
},
"source": [
"# Download denver_extract.mp3\n",
"\n",
"You can either use the same file as me, the extract from Denver city council minutes, or you can try your own..\n",
"\n",
"If you want to use the same as me, then please download my extract here, and put this on your Google Drive: \n",
"https://drive.google.com/file/d/1N_kpSojRR5RYzupz6nqM8hMSoEF_R7pU/view?usp=sharing\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "xYW8kQYtF-3L"
},
"outputs": [],
"source": [
"# Sign in to HuggingFace Hub\n",
"\n",
"hf_token = userdata.get('HF_TOKEN')\n",
"login(hf_token, add_to_git_credential=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "qP6OB2OeGC2C"
},
"outputs": [],
"source": [
"# Sign in to OpenAI using Secrets in Colab\n",
"\n",
"openai_api_key = userdata.get('OPENAI_API_KEY')\n",
"openai = OpenAI(api_key=openai_api_key)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "hgQBeIYUyaqj"
},
"outputs": [],
"source": [
"# Initialize Llama model and tokenizer\n",
"\n",
"quant_config = BitsAndBytesConfig(\n",
" load_in_4bit=True,\n",
" bnb_4bit_use_double_quant=True,\n",
" bnb_4bit_compute_dtype=torch.bfloat16,\n",
" bnb_4bit_quant_type=\"nf4\"\n",
")\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(LLAMA)\n",
"tokenizer.pad_token = tokenizer.eos_token\n",
"\n",
"model = AutoModelForCausalLM.from_pretrained(\n",
" LLAMA,\n",
" device_map=\"auto\",\n",
" quantization_config=quant_config\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "u9aFA7tjy3Ri"
},
"outputs": [],
"source": [
"# Generate meeting minutes\n",
"\n",
"def generate_minutes(transcription, model, tokenizer, progress=gr.Progress()):\n",
" progress(0.6, desc=\"Generating meeting minutes from transcript...\")\n",
"\n",
" system_message = \"You are an assistant that produces minutes of meetings from transcripts, with summary, key discussion points, takeaways and action items with owners, in markdown.\"\n",
" user_prompt = f\"Below is an extract transcript of a meeting. Please write minutes in markdown, including a summary with attendees, location and date; discussion points; takeaways; and action items with owners.\\n{transcription}\"\n",
"\n",
" messages = [\n",
" {\"role\": \"system\", \"content\": system_message},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ]\n",
"\n",
" inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(\"cuda\")\n",
" outputs = model.generate(inputs, max_new_tokens=2000)\n",
" response = tokenizer.decode(outputs[0])\n",
"\n",
" # Clean up the response, keep only the minutes\n",
" progress(0.9, desc=\"Cleaning and formatting minutes...\")\n",
" response = response.split(\"<|end_header_id|>\")[-1].strip().replace(\"<|eot_id|>\",\"\")\n",
"\n",
" return response"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "OEuqR90Vy4AZ"
},
"outputs": [],
"source": [
"# Transcribe the uploaded audio file using OpenAI's Whisper model\n",
"\n",
"def transcribe_audio(audio_path, progress=gr.Progress()):\n",
" progress(0.3, desc=\"Creating transcript from audio...\")\n",
"\n",
" try:\n",
" with open(audio_path, \"rb\") as audio_file:\n",
" transcription = openai.audio.transcriptions.create(\n",
" model=AUDIO_MODEL,\n",
" file=audio_file,\n",
" response_format=\"text\"\n",
" )\n",
" return transcription\n",
" except Exception as e:\n",
" return f\"Error during transcription: {str(e)}\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "lmdsy2iDy5d7"
},
"outputs": [],
"source": [
"# Process the uploaded audio file, transcribe it, and generate meeting minutes\n",
"\n",
"def process_upload(audio_file, progress=gr.Progress()):\n",
" progress(0.1, desc=\"Starting process...\")\n",
"\n",
" if audio_file is None:\n",
" return \"Please upload an audio file.\"\n",
"\n",
" try:\n",
" # Check file format\n",
" if not str(audio_file).lower().endswith('.mp3'):\n",
" return \"Please upload an MP3 file.\"\n",
"\n",
" # Get transcription\n",
" transcription = transcribe_audio(audio_file)\n",
" if transcription.startswith(\"Error\"):\n",
" return transcription\n",
"\n",
" # Generate minutes\n",
" minutes = generate_minutes(transcription, model, tokenizer)\n",
" progress(1.0, desc=\"Process complete!\")\n",
" return minutes\n",
"\n",
" except Exception as e:\n",
" return f\"Error processing file: {str(e)}\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "k2U2bWtey7Yo"
},
"outputs": [],
"source": [
"# Create Gradio interface\n",
"\n",
"interface = gr.Interface(\n",
" fn=process_upload,\n",
" inputs=gr.Audio(type=\"filepath\", label=\"Upload MP3 File\", format=\"mp3\"),\n",
" outputs=gr.Markdown(label=\"Meeting Minutes\", min_height=60),\n",
" title=\"Meeting Minutes Generator\",\n",
" description=\"Upload an MP3 recording of your meeting to get AI-generated meeting minutes. This process may take a few minutes.\",\n",
" flagging_mode=\"never\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "X3JbzRNRy9oG"
},
"outputs": [],
"source": [
"# Launch Gradio interface\n",
"\n",
"interface.launch()"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@@ -1,322 +1,332 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "T4",
"authorship_tag": "ABX9TyPxJzufoQPtui+nhl1J1xiR"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "yqlQTsxNdKrN"
},
"outputs": [],
"source": [
"!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate openai httpx==0.27.2 gradio"
]
},
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "yqlQTsxNdKrN"
},
"outputs": [],
"source": [
"!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate openai httpx==0.27.2 gradio"
]
},
{
"cell_type": "code",
"source": [
"import os\n",
"import requests\n",
"from IPython.display import Markdown, display, update_display\n",
"from openai import OpenAI\n",
"from google.colab import drive\n",
"from huggingface_hub import login\n",
"from google.colab import userdata\n",
"from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n",
"import torch\n",
"import gradio as gr\n",
"import re"
],
"metadata": {
"id": "eyfvQrLxdkGT"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# one can always add more models, of course\n",
"\n",
"LLAMA = \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n",
"OPENAI_MODEL = \"gpt-4o-mini\""
],
"metadata": {
"id": "WW-cSZk7dnp6"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"hf_token = userdata.get('HF_TOKEN')\n",
"login(hf_token, add_to_git_credential=True)\n",
"openai_api_key = userdata.get('OPENAI_API_KEY')\n",
"openai = OpenAI(api_key=openai_api_key)"
],
"metadata": {
"id": "XG7Iam6Rdw8F"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"force_dark_mode = \"\"\"\n",
"function refresh() {\n",
" const url = new URL(window.location);\n",
" if (url.searchParams.get('__theme') !== 'dark') {\n",
" url.searchParams.set('__theme', 'dark');\n",
" window.location.href = url.href;\n",
" }\n",
"}\n",
"\"\"\""
],
"metadata": {
"id": "Ov7WSdx9dzSt"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def dataset_generator(model, nature, shots, volume, language):\n",
"\n",
" examples = \"Instruction: 'Make a random sentence.'\\nAnswer: 'When I got home last night, I couldn't believe my eyes: All the pineapples had been removed from the pizza.'\"\n",
" system_message = \"You are a random sentence generator. Generate 10 diverse English sentences.\"\n",
" user_prompt = f\"Generate 10 random English sentences, like so:\\n{examples}\"\n",
" sentences = \"\"\n",
"\n",
" if language == \"English\":\n",
"\n",
" for shot in list(shots.keys()):\n",
" examples += f\"\\nExample instruction: '{shot}'\\nExample answer: '{shots[shot]}'\\n\"\n",
"\n",
" system_message = f\"You are a state-of-the art linguistic dataset compiler. You are given a 'Type' of sentence to create. \\\n",
"Within the bounds of that type, create {volume} diverse sentences with differing structures and lengths. Make the sentences plausible, \\\n",
"but be creative in filling them with random concrete information, names, and data. Here are some examples for how to go about that:\\n{examples}\\n\\\n",
"Just output one sentence per line. Do not comment or format yor output in any way, shape, or form.\"\n",
"\n",
" user_prompt = f\"Generate {volume} English sentences of the following Type: {nature}. Just output one sentence per line. \\\n",
"Do not comment or format yor output in any way, shape, or form.\"\n",
"\n",
" elif language == \"German\":\n",
"\n",
" for shot in list(shots.keys()):\n",
" examples += f\"\\nAnweisung: '{shot}'\\nAntwort: '{shots[shot]}'\\n\"\n",
"\n",
" system_message = f\"Du bist ein weltklasse Datensatz-Sammler für Sprachdaten. Du erhältst einen 'Typ' von Sätzen, die du erstellen sollst. \\\n",
"Im Rahmen dieses Typs, generiere {volume} untereinander verschiedene Sätze mit unterschiedlichen Satzlängen und -strukturen. Mache die Beispielsätze \\\n",
"plausibel, aber fülle sie kreativ mit willkürlichen Informationen, Namen, und Daten aller Art. Hier sind ein paar Beispiel, wie du vorgehen sollst:\\n{examples}\\n\\\n",
"Gib einfach einen Satz pro Zeile aus. Kommentiere oder formatiere deine Antwort in keinster Weise.\"\n",
"\n",
" user_prompt = f\"Generiere {volume} deutsche Sätze des folgenden Typs: {nature}. Gib einfach einen Satz pro Zeile aus. \\\n",
"Kommentiere oder formatiere deine Antwort in keiner Weise.\"\n",
"\n",
" elif language == \"French\":\n",
"\n",
" for shot in list(shots.keys()):\n",
" examples += f\"\\nConsigne: '{shot}'\\nRéponse: '{shots[shot]}'\\n\"\n",
"\n",
" system_message = f\"Tu es un outil linguistique de pointe, à savoir, un genérateur de données linguistiques. Tu seras assigné un 'Type' de phrases à créer. \\\n",
"Dans le cadre de ce type-là, crée {volume} phrases diverses, avec des structures et longueurs qui varient. Génère des phrases qui soient plausibles, \\\n",
"mais sois créatif, et sers-toi de données, noms, et informations aléatoires pour rendre les phrases plus naturelles. Voici quelques examples comment faire:\\n{examples}\\n\\\n",
"Sors une seule phrase par ligne. Ne formatte ni commente ta réponse en aucune manière que ce soit.\"\n",
"\n",
" user_prompt = f\"S'il te plaît, crée {volume} phrases en français du Type suivant: {nature}. Sors une seule phrase par ligne. \\\n",
"Ne formatte ni commente ta réponse en aucune manière que ce soit.\"\n",
"\n",
" messages = [\n",
" {\"role\": \"system\", \"content\": system_message},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ]\n",
"\n",
" if model == \"Llama\":\n",
"\n",
" quant_config = BitsAndBytesConfig(\n",
" load_in_4bit=True,\n",
" bnb_4bit_use_double_quant=True,\n",
" bnb_4bit_compute_dtype=torch.bfloat16,\n",
" bnb_4bit_quant_type=\"nf4\"\n",
" )\n",
"\n",
" tokenizer = AutoTokenizer.from_pretrained(LLAMA)\n",
" tokenizer.pad_token = tokenizer.eos_token\n",
" inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(\"cuda\")\n",
" streamer = TextStreamer(tokenizer)\n",
" model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map=\"auto\", quantization_config=quant_config)\n",
" outputs = model.generate(inputs, max_new_tokens=10000)\n",
"\n",
" response = tokenizer.decode(outputs[0])\n",
" sentences = list(re.finditer(\"(?:<\\|end_header_id\\|>)([^<]+)(?:<\\|eot_id\\|>)\", str(response), re.DOTALL))[-1].group(1)\n",
"\n",
" elif model == \"OpenAI\":\n",
" response = openai.chat.completions.create(model=OPENAI_MODEL, messages=messages)\n",
" sentences = response.choices[0].message.content\n",
"\n",
" return sentences"
],
"metadata": {
"id": "bEF8w_Mdd2Nb"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"global data\n",
"data = \"\"\n",
"\n",
"with gr.Blocks(\n",
" css=\"\"\"\n",
" .red-button {\n",
" background-color: darkred !important;\n",
" border-color: red !important;\n",
" }\n",
" .blue-button {\n",
" background-color: darkblue !important;\n",
" border-color: blue !important;\n",
" }\n",
" .green-button {\n",
" background-color: green !important;\n",
" border-color: green !important;\n",
" }\n",
" \"\"\"\n",
") as view:\n",
" with gr.Row():\n",
" title = gr.HTML(\"<h1><big>D</big>ataset Generator <small>PLUS</small></h1><h2>for English, German, and French</h2>\")\n",
" subtitle = gr.HTML(\"<h3>Instructions:</h3><ol><li>Pick the language</li>\\\n",
"<li>Select a model</li><li>Indicate how many sentences you need</li>\\\n",
"<li>Describe the type of sentence you're looking for</li><li>Give up to three examples of the desired output sentence, and describe each of them briefly</li>\\\n",
"<li>Hit <q>Create Dataset</q></li>\\\n",
"<li>Save the output (.txt) to your Google Drive</li>\")\n",
" with gr.Row():\n",
" language_choice = gr.Dropdown(choices=[\"English\", \"German\", \"French\"], label=\"Select language\", value=\"English\", interactive=True)\n",
" model_choice = gr.Dropdown(choices=[\"Llama\", \"OpenAI\"], label=\"Select model\", value=\"Llama\", interactive=True)\n",
" volume = gr.Textbox(label=\"Required number of sentences\", interactive=True)\n",
" with gr.Row():\n",
" typeInput = gr.Textbox(label=\"Short description of the kind of sentence you need\", interactive=True)\n",
" with gr.Row():\n",
" sentence_1 = gr.Textbox(label=\"Example sentence 1\", interactive=True)\n",
" instruction_1 = gr.Textbox(label=\"Description\", interactive=True)\n",
" with gr.Row():\n",
" sentence_2 = gr.Textbox(label=\"Example sentence 2\", interactive=True)\n",
" instruction_2 = gr.Textbox(label=\"Description\", interactive=True)\n",
" with gr.Row():\n",
" sentence_3 = gr.Textbox(label=\"Example sentence 3\", interactive=True)\n",
" instruction_3 = gr.Textbox(label=\"Description\", interactive=True)\n",
" with gr.Row():\n",
" liveSentences = gr.Markdown(\n",
" value='<div style=\"color: #999; padding: 10px;\">Your sentences will be displayed here …</div>',\n",
" label=\"Generated sentences:\",\n",
" min_height=60,\n",
" max_height=200\n",
" )\n",
" with gr.Row():\n",
" generate = gr.Button(value=\"Generate sentences\", elem_classes=\"blue-button\")\n",
" with gr.Row():\n",
" clear = gr.Button(value=\"Clear everything\", elem_classes=\"red-button\")\n",
" with gr.Row():\n",
" outputPath = gr.Textbox(label=\"Specify the desired name and location on your Google Drive for the sentences (plain text) to be saved\", interactive=True)\n",
" with gr.Row():\n",
" save = gr.Button(value=\"Save generated data\", elem_classes=\"blue-button\")\n",
"\n",
" def generateSentences(typeInput, s1, i1, s2, i2, s3, i3, volume, language, model):\n",
" global data\n",
" nature = \"\"\n",
" shots = {}\n",
" amount = int(volume) if re.search(\"^[0-9]+$\", volume) is not None else 10\n",
"\n",
" if typeInput != None:\n",
" nature = typeInput\n",
" else:\n",
" nature = \"Random sentences of mixed nature\"\n",
"\n",
" if s1 != None:\n",
" if i1 != None:\n",
" shots[i1] = s1\n",
" else:\n",
" shots[\"A medium-long random sentence about anything\"] = s1\n",
" else:\n",
" shots[\"A medium-long random sentence about anything\"] = \"Paul, waking up out of his half-drunken haze, clearly couldn't tell left from right and ran right into the door.\"\n",
"\n",
" if s2 != None:\n",
" if i2 != None:\n",
" shots[i2] = s2\n",
" else:\n",
" shots[\"A medium-long random sentence about anything\"] = s2\n",
"\n",
" if s3 != None:\n",
" if i3 != None:\n",
" shots[i3] = s3\n",
" else:\n",
" shots[\"A medium-long random sentence about anything\"] = s3\n",
"\n",
" sentences = dataset_generator(model, nature, shots, amount, language)\n",
" data = sentences\n",
"\n",
" return sentences\n",
"\n",
" def saveData(path):\n",
" global data\n",
" drive.mount(\"/content/drive\")\n",
"\n",
" dir_path = os.path.dirname(\"/content/drive/MyDrive/\" + path)\n",
"\n",
" if not os.path.exists(dir_path):\n",
" os.makedirs(dir_path)\n",
"\n",
" with open(\"/content/drive/MyDrive/\" + path, \"w\", encoding=\"utf-8\") as f:\n",
" f.write(data)\n",
"\n",
" generate.click(generateSentences, inputs=[typeInput, sentence_1, instruction_1, sentence_2, instruction_2, sentence_3, instruction_3, volume, language_choice, model_choice], outputs=liveSentences)\n",
" clear.click(\n",
" lambda: [\n",
" gr.update(value=\"\"),\n",
" gr.update(value=\"\"),\n",
" gr.update(value=\"\"),\n",
" gr.update(value=\"\"),\n",
" gr.update(value=\"\"),\n",
" gr.update(value=\"\"),\n",
" gr.update(value=\"\"),\n",
" gr.update(value=\"\"),\n",
" gr.update(value='<div style=\"color: #999; padding: 10px;\">Your sentences will be displayed here …</div>'),\n",
" gr.update(value=\"\"),\n",
" gr.update(value=\"Save generated data\", elem_classes=\"blue-button\")],\n",
" None,\n",
" [volume, typeInput, sentence_1, instruction_1, sentence_2, instruction_2,\n",
" sentence_3, instruction_3, liveSentences, outputPath, save],\n",
" queue=False\n",
" )\n",
" save.click(saveData, inputs=outputPath, outputs=None).then(lambda: gr.update(value=\"Your data has been saved\", elem_classes=\"green-button\"), [], [save])\n",
"\n",
"view.launch(share=True) #, debug=True)"
],
"metadata": {
"id": "VRKdu0fEt8mg"
},
"execution_count": null,
"outputs": []
}
]
}
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "eyfvQrLxdkGT"
},
"outputs": [],
"source": [
"import os\n",
"import requests\n",
"from IPython.display import Markdown, display, update_display\n",
"from openai import OpenAI\n",
"from google.colab import drive\n",
"from huggingface_hub import login\n",
"from google.colab import userdata\n",
"from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n",
"import torch\n",
"import gradio as gr\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "WW-cSZk7dnp6"
},
"outputs": [],
"source": [
"# one can always add more models, of course\n",
"\n",
"LLAMA = \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n",
"OPENAI_MODEL = \"gpt-4o-mini\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "XG7Iam6Rdw8F"
},
"outputs": [],
"source": [
"hf_token = userdata.get('HF_TOKEN')\n",
"login(hf_token, add_to_git_credential=True)\n",
"openai_api_key = userdata.get('OPENAI_API_KEY')\n",
"openai = OpenAI(api_key=openai_api_key)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Ov7WSdx9dzSt"
},
"outputs": [],
"source": [
"force_dark_mode = \"\"\"\n",
"function refresh() {\n",
" const url = new URL(window.location);\n",
" if (url.searchParams.get('__theme') !== 'dark') {\n",
" url.searchParams.set('__theme', 'dark');\n",
" window.location.href = url.href;\n",
" }\n",
"}\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "bEF8w_Mdd2Nb"
},
"outputs": [],
"source": [
"def dataset_generator(model, nature, shots, volume, language):\n",
"\n",
" examples = \"Instruction: 'Make a random sentence.'\\nAnswer: 'When I got home last night, I couldn't believe my eyes: All the pineapples had been removed from the pizza.'\"\n",
" system_message = \"You are a random sentence generator. Generate 10 diverse English sentences.\"\n",
" user_prompt = f\"Generate 10 random English sentences, like so:\\n{examples}\"\n",
" sentences = \"\"\n",
"\n",
" if language == \"English\":\n",
"\n",
" for shot in list(shots.keys()):\n",
" examples += f\"\\nExample instruction: '{shot}'\\nExample answer: '{shots[shot]}'\\n\"\n",
"\n",
" system_message = f\"You are a state-of-the art linguistic dataset compiler. You are given a 'Type' of sentence to create. \\\n",
"Within the bounds of that type, create {volume} diverse sentences with differing structures and lengths. Make the sentences plausible, \\\n",
"but be creative in filling them with random concrete information, names, and data. Here are some examples for how to go about that:\\n{examples}\\n\\\n",
"Just output one sentence per line. Do not comment or format yor output in any way, shape, or form.\"\n",
"\n",
" user_prompt = f\"Generate {volume} English sentences of the following Type: {nature}. Just output one sentence per line. \\\n",
"Do not comment or format yor output in any way, shape, or form.\"\n",
"\n",
" elif language == \"German\":\n",
"\n",
" for shot in list(shots.keys()):\n",
" examples += f\"\\nAnweisung: '{shot}'\\nAntwort: '{shots[shot]}'\\n\"\n",
"\n",
" system_message = f\"Du bist ein weltklasse Datensatz-Sammler für Sprachdaten. Du erhältst einen 'Typ' von Sätzen, die du erstellen sollst. \\\n",
"Im Rahmen dieses Typs, generiere {volume} untereinander verschiedene Sätze mit unterschiedlichen Satzlängen und -strukturen. Mache die Beispielsätze \\\n",
"plausibel, aber fülle sie kreativ mit willkürlichen Informationen, Namen, und Daten aller Art. Hier sind ein paar Beispiel, wie du vorgehen sollst:\\n{examples}\\n\\\n",
"Gib einfach einen Satz pro Zeile aus. Kommentiere oder formatiere deine Antwort in keinster Weise.\"\n",
"\n",
" user_prompt = f\"Generiere {volume} deutsche Sätze des folgenden Typs: {nature}. Gib einfach einen Satz pro Zeile aus. \\\n",
"Kommentiere oder formatiere deine Antwort in keiner Weise.\"\n",
"\n",
" elif language == \"French\":\n",
"\n",
" for shot in list(shots.keys()):\n",
" examples += f\"\\nConsigne: '{shot}'\\nRéponse: '{shots[shot]}'\\n\"\n",
"\n",
" system_message = f\"Tu es un outil linguistique de pointe, à savoir, un genérateur de données linguistiques. Tu seras assigné un 'Type' de phrases à créer. \\\n",
"Dans le cadre de ce type-là, crée {volume} phrases diverses, avec des structures et longueurs qui varient. Génère des phrases qui soient plausibles, \\\n",
"mais sois créatif, et sers-toi de données, noms, et informations aléatoires pour rendre les phrases plus naturelles. Voici quelques examples comment faire:\\n{examples}\\n\\\n",
"Sors une seule phrase par ligne. Ne formatte ni commente ta réponse en aucune manière que ce soit.\"\n",
"\n",
" user_prompt = f\"S'il te plaît, crée {volume} phrases en français du Type suivant: {nature}. Sors une seule phrase par ligne. \\\n",
"Ne formatte ni commente ta réponse en aucune manière que ce soit.\"\n",
"\n",
" messages = [\n",
" {\"role\": \"system\", \"content\": system_message},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ]\n",
"\n",
" if model == \"Llama\":\n",
"\n",
" quant_config = BitsAndBytesConfig(\n",
" load_in_4bit=True,\n",
" bnb_4bit_use_double_quant=True,\n",
" bnb_4bit_compute_dtype=torch.bfloat16,\n",
" bnb_4bit_quant_type=\"nf4\"\n",
" )\n",
"\n",
" tokenizer = AutoTokenizer.from_pretrained(LLAMA)\n",
" tokenizer.pad_token = tokenizer.eos_token\n",
" inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(\"cuda\")\n",
" streamer = TextStreamer(tokenizer)\n",
" model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map=\"auto\", quantization_config=quant_config)\n",
" outputs = model.generate(inputs, max_new_tokens=10000)\n",
"\n",
" response = tokenizer.decode(outputs[0])\n",
" sentences = list(re.finditer(\"(?:<\\|end_header_id\\|>)([^<]+)(?:<\\|eot_id\\|>)\", str(response), re.DOTALL))[-1].group(1)\n",
"\n",
" elif model == \"OpenAI\":\n",
" response = openai.chat.completions.create(model=OPENAI_MODEL, messages=messages)\n",
" sentences = response.choices[0].message.content\n",
"\n",
" return sentences"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "VRKdu0fEt8mg"
},
"outputs": [],
"source": [
"global data\n",
"data = \"\"\n",
"\n",
"with gr.Blocks(\n",
" css=\"\"\"\n",
" .red-button {\n",
" background-color: darkred !important;\n",
" border-color: red !important;\n",
" }\n",
" .blue-button {\n",
" background-color: darkblue !important;\n",
" border-color: blue !important;\n",
" }\n",
" .green-button {\n",
" background-color: green !important;\n",
" border-color: green !important;\n",
" }\n",
" \"\"\"\n",
") as view:\n",
" with gr.Row():\n",
" title = gr.HTML(\"<h1><big>D</big>ataset Generator <small>PLUS</small></h1><h2>for English, German, and French</h2>\")\n",
" subtitle = gr.HTML(\"<h3>Instructions:</h3><ol><li>Pick the language</li>\\\n",
"<li>Select a model</li><li>Indicate how many sentences you need</li>\\\n",
"<li>Describe the type of sentence you're looking for</li><li>Give up to three examples of the desired output sentence, and describe each of them briefly</li>\\\n",
"<li>Hit <q>Create Dataset</q></li>\\\n",
"<li>Save the output (.txt) to your Google Drive</li>\")\n",
" with gr.Row():\n",
" language_choice = gr.Dropdown(choices=[\"English\", \"German\", \"French\"], label=\"Select language\", value=\"English\", interactive=True)\n",
" model_choice = gr.Dropdown(choices=[\"Llama\", \"OpenAI\"], label=\"Select model\", value=\"Llama\", interactive=True)\n",
" volume = gr.Textbox(label=\"Required number of sentences\", interactive=True)\n",
" with gr.Row():\n",
" typeInput = gr.Textbox(label=\"Short description of the kind of sentence you need\", interactive=True)\n",
" with gr.Row():\n",
" sentence_1 = gr.Textbox(label=\"Example sentence 1\", interactive=True)\n",
" instruction_1 = gr.Textbox(label=\"Description\", interactive=True)\n",
" with gr.Row():\n",
" sentence_2 = gr.Textbox(label=\"Example sentence 2\", interactive=True)\n",
" instruction_2 = gr.Textbox(label=\"Description\", interactive=True)\n",
" with gr.Row():\n",
" sentence_3 = gr.Textbox(label=\"Example sentence 3\", interactive=True)\n",
" instruction_3 = gr.Textbox(label=\"Description\", interactive=True)\n",
" with gr.Row():\n",
" liveSentences = gr.Markdown(\n",
" value='<div style=\"color: #999; padding: 10px;\">Your sentences will be displayed here …</div>',\n",
" label=\"Generated sentences:\",\n",
" min_height=60,\n",
" max_height=200\n",
" )\n",
" with gr.Row():\n",
" generate = gr.Button(value=\"Generate sentences\", elem_classes=\"blue-button\")\n",
" with gr.Row():\n",
" clear = gr.Button(value=\"Clear everything\", elem_classes=\"red-button\")\n",
" with gr.Row():\n",
" outputPath = gr.Textbox(label=\"Specify the desired name and location on your Google Drive for the sentences (plain text) to be saved\", interactive=True)\n",
" with gr.Row():\n",
" save = gr.Button(value=\"Save generated data\", elem_classes=\"blue-button\")\n",
"\n",
" def generateSentences(typeInput, s1, i1, s2, i2, s3, i3, volume, language, model):\n",
" global data\n",
" nature = \"\"\n",
" shots = {}\n",
" amount = int(volume) if re.search(\"^[0-9]+$\", volume) is not None else 10\n",
"\n",
" if typeInput != None:\n",
" nature = typeInput\n",
" else:\n",
" nature = \"Random sentences of mixed nature\"\n",
"\n",
" if s1 != None:\n",
" if i1 != None:\n",
" shots[i1] = s1\n",
" else:\n",
" shots[\"A medium-long random sentence about anything\"] = s1\n",
" else:\n",
" shots[\"A medium-long random sentence about anything\"] = \"Paul, waking up out of his half-drunken haze, clearly couldn't tell left from right and ran right into the door.\"\n",
"\n",
" if s2 != None:\n",
" if i2 != None:\n",
" shots[i2] = s2\n",
" else:\n",
" shots[\"A medium-long random sentence about anything\"] = s2\n",
"\n",
" if s3 != None:\n",
" if i3 != None:\n",
" shots[i3] = s3\n",
" else:\n",
" shots[\"A medium-long random sentence about anything\"] = s3\n",
"\n",
" sentences = dataset_generator(model, nature, shots, amount, language)\n",
" data = sentences\n",
"\n",
" return sentences\n",
"\n",
" def saveData(path):\n",
" global data\n",
" drive.mount(\"/content/drive\")\n",
"\n",
" dir_path = os.path.dirname(\"/content/drive/MyDrive/\" + path)\n",
"\n",
" if not os.path.exists(dir_path):\n",
" os.makedirs(dir_path)\n",
"\n",
" with open(\"/content/drive/MyDrive/\" + path, \"w\", encoding=\"utf-8\") as f:\n",
" f.write(data)\n",
"\n",
" generate.click(generateSentences, inputs=[typeInput, sentence_1, instruction_1, sentence_2, instruction_2, sentence_3, instruction_3, volume, language_choice, model_choice], outputs=liveSentences)\n",
" clear.click(\n",
" lambda: [\n",
" gr.update(value=\"\"),\n",
" gr.update(value=\"\"),\n",
" gr.update(value=\"\"),\n",
" gr.update(value=\"\"),\n",
" gr.update(value=\"\"),\n",
" gr.update(value=\"\"),\n",
" gr.update(value=\"\"),\n",
" gr.update(value=\"\"),\n",
" gr.update(value='<div style=\"color: #999; padding: 10px;\">Your sentences will be displayed here …</div>'),\n",
" gr.update(value=\"\"),\n",
" gr.update(value=\"Save generated data\", elem_classes=\"blue-button\")],\n",
" None,\n",
" [volume, typeInput, sentence_1, instruction_1, sentence_2, instruction_2,\n",
" sentence_3, instruction_3, liveSentences, outputPath, save],\n",
" queue=False\n",
" )\n",
" save.click(saveData, inputs=outputPath, outputs=None).then(lambda: gr.update(value=\"Your data has been saved\", elem_classes=\"green-button\"), [], [save])\n",
"\n",
"view.launch(share=True) #, debug=True)"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"authorship_tag": "ABX9TyPxJzufoQPtui+nhl1J1xiR",
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@@ -387,7 +387,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "llm_engineering-yg2xCEUG",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -401,9 +401,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}