Improvements to descriptions and links

Edward Donner
2025-03-14 09:16:31 -04:00
parent 3a2eb97cf2
commit c80065df86
19 changed files with 948 additions and 821 deletions

View File

@@ -1,150 +1,160 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "T4",
"authorship_tag": "ABX9TyPtAT7Yq5xd4vDcJEZtg69J"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "6gGKXU5RXORf"
},
"outputs": [],
"source": [
"# getting the latest transformers first, since this will require a restart\n",
"\n",
"!pip install git+https://github.com/huggingface/transformers.git"
]
},
"cells": [
{
"cell_type": "code",
"source": [
"# getting the latest transformers first, since this will require a restart\n",
"\n",
"!pip install git+https://github.com/huggingface/transformers.git"
],
"metadata": {
"id": "6gGKXU5RXORf"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# imports\n",
"\n",
"import torch\n",
"from google.colab import userdata\n",
"from huggingface_hub import login\n",
"from transformers import AutoProcessor, AutoModelForImageTextToText\n",
"from google.colab import files"
],
"metadata": {
"id": "yCRrF4aiXPPo"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# logging in to HF\n",
"\n",
"hf_token = userdata.get('HF_TOKEN')\n",
"login(hf_token, add_to_git_credential=True)"
],
"metadata": {
"id": "AAlOQuCbXcrv"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "_RRVc2j2Vun-"
},
"outputs": [],
"source": [
"# this will start an input prompt for uploading local files\n",
"\n",
"uploaded = files.upload()\n",
"print(uploaded.keys()) # this will look sth like dict_keys([\"note2.jpg\"])"
]
},
{
"cell_type": "code",
"source": [
"'''\n",
"ChatGPT and Gemini explain the following part roughly like so:\n",
"The string contained in image_path is the key of the entry in the dictionary of uploaded files (see box above).\n",
"The value to that key contains the image in binary format.\n",
"The \"with open(image_path, \"wb\") as f\" part means: Create a new file \"note2.jpg\" on the server, and write to it in binary mode (\"wb\").\n",
"f.write(image) writes the binary image to that new file. \"note2.jpg\" aka image_path will now contain the image.\n",
"'''\n",
"\n",
"image_path = \"note2.jpg\" # update this string depending on the printout in the previous cell!\n",
"image = uploaded[image_path]\n",
"with open(image_path, \"wb\") as f:\n",
" f.write(image)"
],
"metadata": {
"id": "V_UAuSSkXBKh"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# from HF model instructions\n",
"device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
"model = AutoModelForImageTextToText.from_pretrained(\"stepfun-ai/GOT-OCR-2.0-hf\", device_map=device)\n",
"processor = AutoProcessor.from_pretrained(\"stepfun-ai/GOT-OCR-2.0-hf\")"
],
"metadata": {
"id": "AiFP-mQtXrpV"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# also from HF documentation about this model, see https://huggingface.co/stepfun-ai/GOT-OCR-2.0-hf\n",
"\n",
"image = image_path\n",
"inputs = processor(image, return_tensors=\"pt\").to(device)\n",
"\n",
"ocr = model.generate(\n",
" **inputs,\n",
" do_sample=False,\n",
" tokenizer=processor.tokenizer,\n",
" stop_strings=\"<|im_end|>\",\n",
" max_new_tokens=4096,\n",
")"
],
"metadata": {
"id": "7Adr8HB_YNf5"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# prints out the recognized text. This can read my handwriting pretty well! And it works super quick on the free T4 GPU server here.\n",
"\n",
"print(processor.decode(ocr[0, inputs[\"input_ids\"].shape[1]:], skip_special_tokens=True))"
],
"metadata": {
"id": "nRsRUIIuYdJ9"
},
"execution_count": null,
"outputs": []
}
]
}
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "yCRrF4aiXPPo"
},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import torch\n",
"from google.colab import userdata\n",
"from huggingface_hub import login\n",
"from transformers import AutoProcessor, AutoModelForImageTextToText\n",
"from google.colab import files"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "AAlOQuCbXcrv"
},
"outputs": [],
"source": [
"# logging in to HF\n",
"\n",
"hf_token = userdata.get('HF_TOKEN')\n",
"login(hf_token, add_to_git_credential=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "_RRVc2j2Vun-"
},
"outputs": [],
"source": [
"# this will start an input prompt for uploading local files\n",
"\n",
"uploaded = files.upload()\n",
"print(uploaded.keys()) # this will look sth like dict_keys([\"note2.jpg\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "V_UAuSSkXBKh"
},
"outputs": [],
"source": [
"'''\n",
"ChatGPT and Gemini explain the following part roughly like so:\n",
"The string contained in image_path is the key of the entry in the dictionary of uploaded files (see box above).\n",
"The value to that key contains the image in binary format.\n",
"The \"with open(image_path, \"wb\") as f\" part means: Create a new file \"note2.jpg\" on the server, and write to it in binary mode (\"wb\").\n",
"f.write(image) writes the binary image to that new file. \"note2.jpg\" aka image_path will now contain the image.\n",
"'''\n",
"\n",
"image_path = \"note2.jpg\" # update this string depending on the printout in the previous cell!\n",
"image = uploaded[image_path]\n",
"with open(image_path, \"wb\") as f:\n",
" f.write(image)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "AiFP-mQtXrpV"
},
"outputs": [],
"source": [
"# from HF model instructions\n",
"device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
"model = AutoModelForImageTextToText.from_pretrained(\"stepfun-ai/GOT-OCR-2.0-hf\", device_map=device)\n",
"processor = AutoProcessor.from_pretrained(\"stepfun-ai/GOT-OCR-2.0-hf\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "7Adr8HB_YNf5"
},
"outputs": [],
"source": [
"# also from HF documentation about this model, see https://huggingface.co/stepfun-ai/GOT-OCR-2.0-hf\n",
"\n",
"image = image_path\n",
"inputs = processor(image, return_tensors=\"pt\").to(device)\n",
"\n",
"ocr = model.generate(\n",
" **inputs,\n",
" do_sample=False,\n",
" tokenizer=processor.tokenizer,\n",
" stop_strings=\"<|im_end|>\",\n",
" max_new_tokens=4096,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "nRsRUIIuYdJ9"
},
"outputs": [],
"source": [
"# prints out the recognized text. This can read my handwriting pretty well! And it works super quick on the free T4 GPU server here.\n",
"\n",
"print(processor.decode(ocr[0, inputs[\"input_ids\"].shape[1]:], skip_special_tokens=True))"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"authorship_tag": "ABX9TyPtAT7Yq5xd4vDcJEZtg69J",
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@@ -1,302 +1,312 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "T4"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "It89APiAtTUF"
},
"source": [
"# Create meeting minutes from an Audio file\n",
"\n",
"I downloaded some Denver City Council meeting minutes and selected a portion of the meeting for us to transcribe. You can download it here: \n",
"https://drive.google.com/file/d/1N_kpSojRR5RYzupz6nqM8hMSoEF_R7pU/view?usp=sharing\n",
"\n",
"If you'd rather work with the original data, the HuggingFace dataset is [here](https://huggingface.co/datasets/huuuyeah/meetingbank) and the audio can be downloaded [here](https://huggingface.co/datasets/huuuyeah/MeetingBank_Audio/tree/main).\n",
"\n",
"The goal of this product is to use the Audio to generate meeting minutes, including actions.\n",
"\n",
"For this project, you can either use the Denver meeting minutes, or you can record something of your own!\n",
"\n",
"## Please note:\n",
"\n",
"When you run the pip installs in the first cell below, you might get this error - it can be safely ignored - it sounds quite severe, but it doesn't seem to affect anything else in this project!\n",
"\n",
"\n",
"> ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
"gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.\n",
"\n"
]
},
"cells": [
{
"cell_type": "markdown",
"source": [
"# Create meeting minutes from an Audio file\n",
"\n",
"I downloaded some Denver City Council meeting minutes and selected a portion of the meeting for us to transcribe. You can download it here: \n",
"https://drive.google.com/file/d/1N_kpSojRR5RYzupz6nqM8hMSoEF_R7pU/view?usp=sharing\n",
"\n",
"If you'd rather work with the original data, the HuggingFace dataset is [here](https://huggingface.co/datasets/huuuyeah/meetingbank) and the audio can be downloaded [here](https://huggingface.co/datasets/huuuyeah/MeetingBank_Audio/tree/main).\n",
"\n",
"The goal of this product is to use the Audio to generate meeting minutes, including actions.\n",
"\n",
"For this project, you can either use the Denver meeting minutes, or you can record something of your own!\n",
"\n",
"## Please note:\n",
"\n",
"When you run the pip installs in the first cell below, you might get this error - it can be safely ignored - it sounds quite severe, but it doesn't seem to affect anything else in this project!\n",
"\n",
"\n",
"> ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
"gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.\n",
"\n"
],
"metadata": {
"id": "It89APiAtTUF"
}
},
{
"cell_type": "code",
"source": [
"!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate openai httpx==0.27.2 gradio"
],
"metadata": {
"id": "f2vvgnFpHpID"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "FW8nl3XRFrz0"
},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import requests\n",
"from openai import OpenAI\n",
"from google.colab import drive\n",
"from huggingface_hub import login\n",
"from google.colab import userdata\n",
"from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n",
"import torch\n",
"import gradio as gr"
]
},
{
"cell_type": "code",
"source": [
"# Constants\n",
"\n",
"AUDIO_MODEL = \"whisper-1\"\n",
"LLAMA = \"meta-llama/Meta-Llama-3.1-8B-Instruct\""
],
"metadata": {
"id": "q3D1_T0uG_Qh"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# New capability - connect this Colab to my Google Drive\n",
"# See immediately below this for instructions to obtain denver_extract.mp3\n",
"\n",
"drive.mount(\"/content/drive\")\n",
"audio_filename = \"/content/drive/MyDrive/llms/denver_extract.mp3\""
],
"metadata": {
"id": "Es9GkQ0FGCMt"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Download denver_extract.mp3\n",
"\n",
"You can either use the same file as me, the extract from Denver city council minutes, or you can try your own..\n",
"\n",
"If you want to use the same as me, then please download my extract here, and put this on your Google Drive: \n",
"https://drive.google.com/file/d/1N_kpSojRR5RYzupz6nqM8hMSoEF_R7pU/view?usp=sharing\n"
],
"metadata": {
"id": "HTl3mcjyzIEE"
}
},
{
"cell_type": "code",
"source": [
"# Sign in to HuggingFace Hub\n",
"\n",
"hf_token = userdata.get('HF_TOKEN')\n",
"login(hf_token, add_to_git_credential=True)"
],
"metadata": {
"id": "xYW8kQYtF-3L"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Sign in to OpenAI using Secrets in Colab\n",
"\n",
"openai_api_key = userdata.get('OPENAI_API_KEY')\n",
"openai = OpenAI(api_key=openai_api_key)"
],
"metadata": {
"id": "qP6OB2OeGC2C"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Initialize Llama model and tokenizer\n",
"\n",
"quant_config = BitsAndBytesConfig(\n",
" load_in_4bit=True,\n",
" bnb_4bit_use_double_quant=True,\n",
" bnb_4bit_compute_dtype=torch.bfloat16,\n",
" bnb_4bit_quant_type=\"nf4\"\n",
")\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(LLAMA)\n",
"tokenizer.pad_token = tokenizer.eos_token\n",
"\n",
"model = AutoModelForCausalLM.from_pretrained(\n",
" LLAMA,\n",
" device_map=\"auto\",\n",
" quantization_config=quant_config\n",
")"
],
"metadata": {
"id": "hgQBeIYUyaqj"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Generate meeting minutes\n",
"\n",
"def generate_minutes(transcription, model, tokenizer, progress=gr.Progress()):\n",
" progress(0.6, desc=\"Generating meeting minutes from transcript...\")\n",
"\n",
" system_message = \"You are an assistant that produces minutes of meetings from transcripts, with summary, key discussion points, takeaways and action items with owners, in markdown.\"\n",
" user_prompt = f\"Below is an extract transcript of a meeting. Please write minutes in markdown, including a summary with attendees, location and date; discussion points; takeaways; and action items with owners.\\n{transcription}\"\n",
"\n",
" messages = [\n",
" {\"role\": \"system\", \"content\": system_message},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ]\n",
"\n",
" inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(\"cuda\")\n",
" outputs = model.generate(inputs, max_new_tokens=2000)\n",
" response = tokenizer.decode(outputs[0])\n",
"\n",
" # Clean up the response, keep only the minutes\n",
" progress(0.9, desc=\"Cleaning and formatting minutes...\")\n",
" response = response.split(\"<|end_header_id|>\")[-1].strip().replace(\"<|eot_id|>\",\"\")\n",
"\n",
" return response"
],
"metadata": {
"id": "u9aFA7tjy3Ri"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Transcribe the uploaded audio file using OpenAI's Whisper model\n",
"\n",
"def transcribe_audio(audio_path, progress=gr.Progress()):\n",
" progress(0.3, desc=\"Creating transcript from audio...\")\n",
"\n",
" try:\n",
" with open(audio_path, \"rb\") as audio_file:\n",
" transcription = openai.audio.transcriptions.create(\n",
" model=AUDIO_MODEL,\n",
" file=audio_file,\n",
" response_format=\"text\"\n",
" )\n",
" return transcription\n",
" except Exception as e:\n",
" return f\"Error during transcription: {str(e)}\""
],
"metadata": {
"id": "OEuqR90Vy4AZ"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Process the uploaded audio file, transcribe it, and generate meeting minutes\n",
"\n",
"def process_upload(audio_file, progress=gr.Progress()):\n",
" progress(0.1, desc=\"Starting process...\")\n",
"\n",
" if audio_file is None:\n",
" return \"Please upload an audio file.\"\n",
"\n",
" try:\n",
" # Check file format\n",
" if not str(audio_file).lower().endswith('.mp3'):\n",
" return \"Please upload an MP3 file.\"\n",
"\n",
" # Get transcription\n",
" transcription = transcribe_audio(audio_file)\n",
" if transcription.startswith(\"Error\"):\n",
" return transcription\n",
"\n",
" # Generate minutes\n",
" minutes = generate_minutes(transcription, model, tokenizer)\n",
" progress(1.0, desc=\"Process complete!\")\n",
" return minutes\n",
"\n",
" except Exception as e:\n",
" return f\"Error processing file: {str(e)}\""
],
"metadata": {
"id": "lmdsy2iDy5d7"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Create Gradio interface\n",
"\n",
"interface = gr.Interface(\n",
" fn=process_upload,\n",
" inputs=gr.Audio(type=\"filepath\", label=\"Upload MP3 File\", format=\"mp3\"),\n",
" outputs=gr.Markdown(label=\"Meeting Minutes\", min_height=60),\n",
" title=\"Meeting Minutes Generator\",\n",
" description=\"Upload an MP3 recording of your meeting to get AI-generated meeting minutes. This process may take a few minutes.\",\n",
" flagging_mode=\"never\"\n",
")"
],
"metadata": {
"id": "k2U2bWtey7Yo"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Launch Gradio interface\n",
"\n",
"interface.launch()"
],
"metadata": {
"id": "X3JbzRNRy9oG"
},
"execution_count": null,
"outputs": []
}
]
}
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "f2vvgnFpHpID"
},
"outputs": [],
"source": [
"!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate openai httpx==0.27.2 gradio"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "FW8nl3XRFrz0"
},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import os\n",
"import requests\n",
"from openai import OpenAI\n",
"from google.colab import drive\n",
"from huggingface_hub import login\n",
"from google.colab import userdata\n",
"from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n",
"import torch\n",
"import gradio as gr"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "q3D1_T0uG_Qh"
},
"outputs": [],
"source": [
"# Constants\n",
"\n",
"AUDIO_MODEL = \"whisper-1\"\n",
"LLAMA = \"meta-llama/Meta-Llama-3.1-8B-Instruct\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Es9GkQ0FGCMt"
},
"outputs": [],
"source": [
"# New capability - connect this Colab to my Google Drive\n",
"# See immediately below this for instructions to obtain denver_extract.mp3\n",
"\n",
"drive.mount(\"/content/drive\")\n",
"audio_filename = \"/content/drive/MyDrive/llms/denver_extract.mp3\""
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "HTl3mcjyzIEE"
},
"source": [
"# Download denver_extract.mp3\n",
"\n",
"You can either use the same file as me, the extract from Denver city council minutes, or you can try your own..\n",
"\n",
"If you want to use the same as me, then please download my extract here, and put this on your Google Drive: \n",
"https://drive.google.com/file/d/1N_kpSojRR5RYzupz6nqM8hMSoEF_R7pU/view?usp=sharing\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "xYW8kQYtF-3L"
},
"outputs": [],
"source": [
"# Sign in to HuggingFace Hub\n",
"\n",
"hf_token = userdata.get('HF_TOKEN')\n",
"login(hf_token, add_to_git_credential=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "qP6OB2OeGC2C"
},
"outputs": [],
"source": [
"# Sign in to OpenAI using Secrets in Colab\n",
"\n",
"openai_api_key = userdata.get('OPENAI_API_KEY')\n",
"openai = OpenAI(api_key=openai_api_key)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "hgQBeIYUyaqj"
},
"outputs": [],
"source": [
"# Initialize Llama model and tokenizer\n",
"\n",
"quant_config = BitsAndBytesConfig(\n",
" load_in_4bit=True,\n",
" bnb_4bit_use_double_quant=True,\n",
" bnb_4bit_compute_dtype=torch.bfloat16,\n",
" bnb_4bit_quant_type=\"nf4\"\n",
")\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(LLAMA)\n",
"tokenizer.pad_token = tokenizer.eos_token\n",
"\n",
"model = AutoModelForCausalLM.from_pretrained(\n",
" LLAMA,\n",
" device_map=\"auto\",\n",
" quantization_config=quant_config\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "u9aFA7tjy3Ri"
},
"outputs": [],
"source": [
"# Generate meeting minutes\n",
"\n",
"def generate_minutes(transcription, model, tokenizer, progress=gr.Progress()):\n",
" progress(0.6, desc=\"Generating meeting minutes from transcript...\")\n",
"\n",
" system_message = \"You are an assistant that produces minutes of meetings from transcripts, with summary, key discussion points, takeaways and action items with owners, in markdown.\"\n",
" user_prompt = f\"Below is an extract transcript of a meeting. Please write minutes in markdown, including a summary with attendees, location and date; discussion points; takeaways; and action items with owners.\\n{transcription}\"\n",
"\n",
" messages = [\n",
" {\"role\": \"system\", \"content\": system_message},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ]\n",
"\n",
" inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(\"cuda\")\n",
" outputs = model.generate(inputs, max_new_tokens=2000)\n",
" response = tokenizer.decode(outputs[0])\n",
"\n",
" # Clean up the response, keep only the minutes\n",
" progress(0.9, desc=\"Cleaning and formatting minutes...\")\n",
" response = response.split(\"<|end_header_id|>\")[-1].strip().replace(\"<|eot_id|>\",\"\")\n",
"\n",
" return response"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "OEuqR90Vy4AZ"
},
"outputs": [],
"source": [
"# Transcribe the uploaded audio file using OpenAI's Whisper model\n",
"\n",
"def transcribe_audio(audio_path, progress=gr.Progress()):\n",
" progress(0.3, desc=\"Creating transcript from audio...\")\n",
"\n",
" try:\n",
" with open(audio_path, \"rb\") as audio_file:\n",
" transcription = openai.audio.transcriptions.create(\n",
" model=AUDIO_MODEL,\n",
" file=audio_file,\n",
" response_format=\"text\"\n",
" )\n",
" return transcription\n",
" except Exception as e:\n",
" return f\"Error during transcription: {str(e)}\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "lmdsy2iDy5d7"
},
"outputs": [],
"source": [
"# Process the uploaded audio file, transcribe it, and generate meeting minutes\n",
"\n",
"def process_upload(audio_file, progress=gr.Progress()):\n",
" progress(0.1, desc=\"Starting process...\")\n",
"\n",
" if audio_file is None:\n",
" return \"Please upload an audio file.\"\n",
"\n",
" try:\n",
" # Check file format\n",
" if not str(audio_file).lower().endswith('.mp3'):\n",
" return \"Please upload an MP3 file.\"\n",
"\n",
" # Get transcription\n",
" transcription = transcribe_audio(audio_file)\n",
" if transcription.startswith(\"Error\"):\n",
" return transcription\n",
"\n",
" # Generate minutes\n",
" minutes = generate_minutes(transcription, model, tokenizer)\n",
" progress(1.0, desc=\"Process complete!\")\n",
" return minutes\n",
"\n",
" except Exception as e:\n",
" return f\"Error processing file: {str(e)}\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "k2U2bWtey7Yo"
},
"outputs": [],
"source": [
"# Create Gradio interface\n",
"\n",
"interface = gr.Interface(\n",
" fn=process_upload,\n",
" inputs=gr.Audio(type=\"filepath\", label=\"Upload MP3 File\", format=\"mp3\"),\n",
" outputs=gr.Markdown(label=\"Meeting Minutes\", min_height=60),\n",
" title=\"Meeting Minutes Generator\",\n",
" description=\"Upload an MP3 recording of your meeting to get AI-generated meeting minutes. This process may take a few minutes.\",\n",
" flagging_mode=\"never\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "X3JbzRNRy9oG"
},
"outputs": [],
"source": [
"# Launch Gradio interface\n",
"\n",
"interface.launch()"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@@ -1,322 +1,332 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "T4",
"authorship_tag": "ABX9TyPxJzufoQPtui+nhl1J1xiR"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "yqlQTsxNdKrN"
},
"outputs": [],
"source": [
"!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate openai httpx==0.27.2 gradio"
]
},
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "yqlQTsxNdKrN"
},
"outputs": [],
"source": [
"!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate openai httpx==0.27.2 gradio"
]
},
{
"cell_type": "code",
"source": [
"import os\n",
"import requests\n",
"from IPython.display import Markdown, display, update_display\n",
"from openai import OpenAI\n",
"from google.colab import drive\n",
"from huggingface_hub import login\n",
"from google.colab import userdata\n",
"from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n",
"import torch\n",
"import gradio as gr\n",
"import re"
],
"metadata": {
"id": "eyfvQrLxdkGT"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# one can always add more models, of course\n",
"\n",
"LLAMA = \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n",
"OPENAI_MODEL = \"gpt-4o-mini\""
],
"metadata": {
"id": "WW-cSZk7dnp6"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"hf_token = userdata.get('HF_TOKEN')\n",
"login(hf_token, add_to_git_credential=True)\n",
"openai_api_key = userdata.get('OPENAI_API_KEY')\n",
"openai = OpenAI(api_key=openai_api_key)"
],
"metadata": {
"id": "XG7Iam6Rdw8F"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"force_dark_mode = \"\"\"\n",
"function refresh() {\n",
" const url = new URL(window.location);\n",
" if (url.searchParams.get('__theme') !== 'dark') {\n",
" url.searchParams.set('__theme', 'dark');\n",
" window.location.href = url.href;\n",
" }\n",
"}\n",
"\"\"\""
],
"metadata": {
"id": "Ov7WSdx9dzSt"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def dataset_generator(model, nature, shots, volume, language):\n",
"\n",
" examples = \"Instruction: 'Make a random sentence.'\\nAnswer: 'When I got home last night, I couldn't believe my eyes: All the pineapples had been removed from the pizza.'\"\n",
" system_message = \"You are a random sentence generator. Generate 10 diverse English sentences.\"\n",
" user_prompt = f\"Generate 10 random English sentences, like so:\\n{examples}\"\n",
" sentences = \"\"\n",
"\n",
" if language == \"English\":\n",
"\n",
" for shot in list(shots.keys()):\n",
" examples += f\"\\nExample instruction: '{shot}'\\nExample answer: '{shots[shot]}'\\n\"\n",
"\n",
" system_message = f\"You are a state-of-the art linguistic dataset compiler. You are given a 'Type' of sentence to create. \\\n",
"Within the bounds of that type, create {volume} diverse sentences with differing structures and lengths. Make the sentences plausible, \\\n",
"but be creative in filling them with random concrete information, names, and data. Here are some examples for how to go about that:\\n{examples}\\n\\\n",
"Just output one sentence per line. Do not comment or format yor output in any way, shape, or form.\"\n",
"\n",
" user_prompt = f\"Generate {volume} English sentences of the following Type: {nature}. Just output one sentence per line. \\\n",
"Do not comment or format yor output in any way, shape, or form.\"\n",
"\n",
" elif language == \"German\":\n",
"\n",
" for shot in list(shots.keys()):\n",
" examples += f\"\\nAnweisung: '{shot}'\\nAntwort: '{shots[shot]}'\\n\"\n",
"\n",
" system_message = f\"Du bist ein weltklasse Datensatz-Sammler für Sprachdaten. Du erhältst einen 'Typ' von Sätzen, die du erstellen sollst. \\\n",
"Im Rahmen dieses Typs, generiere {volume} untereinander verschiedene Sätze mit unterschiedlichen Satzlängen und -strukturen. Mache die Beispielsätze \\\n",
"plausibel, aber fülle sie kreativ mit willkürlichen Informationen, Namen, und Daten aller Art. Hier sind ein paar Beispiel, wie du vorgehen sollst:\\n{examples}\\n\\\n",
"Gib einfach einen Satz pro Zeile aus. Kommentiere oder formatiere deine Antwort in keinster Weise.\"\n",
"\n",
" user_prompt = f\"Generiere {volume} deutsche Sätze des folgenden Typs: {nature}. Gib einfach einen Satz pro Zeile aus. \\\n",
"Kommentiere oder formatiere deine Antwort in keiner Weise.\"\n",
"\n",
" elif language == \"French\":\n",
"\n",
" for shot in list(shots.keys()):\n",
" examples += f\"\\nConsigne: '{shot}'\\nRéponse: '{shots[shot]}'\\n\"\n",
"\n",
" system_message = f\"Tu es un outil linguistique de pointe, à savoir, un genérateur de données linguistiques. Tu seras assigné un 'Type' de phrases à créer. \\\n",
"Dans le cadre de ce type-là, crée {volume} phrases diverses, avec des structures et longueurs qui varient. Génère des phrases qui soient plausibles, \\\n",
"mais sois créatif, et sers-toi de données, noms, et informations aléatoires pour rendre les phrases plus naturelles. Voici quelques examples comment faire:\\n{examples}\\n\\\n",
"Sors une seule phrase par ligne. Ne formatte ni commente ta réponse en aucune manière que ce soit.\"\n",
"\n",
" user_prompt = f\"S'il te plaît, crée {volume} phrases en français du Type suivant: {nature}. Sors une seule phrase par ligne. \\\n",
"Ne formatte ni commente ta réponse en aucune manière que ce soit.\"\n",
"\n",
" messages = [\n",
" {\"role\": \"system\", \"content\": system_message},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ]\n",
"\n",
" if model == \"Llama\":\n",
"\n",
" quant_config = BitsAndBytesConfig(\n",
" load_in_4bit=True,\n",
" bnb_4bit_use_double_quant=True,\n",
" bnb_4bit_compute_dtype=torch.bfloat16,\n",
" bnb_4bit_quant_type=\"nf4\"\n",
" )\n",
"\n",
" tokenizer = AutoTokenizer.from_pretrained(LLAMA)\n",
" tokenizer.pad_token = tokenizer.eos_token\n",
" inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(\"cuda\")\n",
" streamer = TextStreamer(tokenizer)\n",
" model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map=\"auto\", quantization_config=quant_config)\n",
" outputs = model.generate(inputs, max_new_tokens=10000)\n",
"\n",
" response = tokenizer.decode(outputs[0])\n",
" sentences = list(re.finditer(\"(?:<\\|end_header_id\\|>)([^<]+)(?:<\\|eot_id\\|>)\", str(response), re.DOTALL))[-1].group(1)\n",
"\n",
" elif model == \"OpenAI\":\n",
" response = openai.chat.completions.create(model=OPENAI_MODEL, messages=messages)\n",
" sentences = response.choices[0].message.content\n",
"\n",
" return sentences"
],
"metadata": {
"id": "bEF8w_Mdd2Nb"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"global data\n",
"data = \"\"\n",
"\n",
"with gr.Blocks(\n",
" css=\"\"\"\n",
" .red-button {\n",
" background-color: darkred !important;\n",
" border-color: red !important;\n",
" }\n",
" .blue-button {\n",
" background-color: darkblue !important;\n",
" border-color: blue !important;\n",
" }\n",
" .green-button {\n",
" background-color: green !important;\n",
" border-color: green !important;\n",
" }\n",
" \"\"\"\n",
") as view:\n",
" with gr.Row():\n",
" title = gr.HTML(\"<h1><big>D</big>ataset Generator <small>PLUS</small></h1><h2>for English, German, and French</h2>\")\n",
" subtitle = gr.HTML(\"<h3>Instructions:</h3><ol><li>Pick the language</li>\\\n",
"<li>Select a model</li><li>Indicate how many sentences you need</li>\\\n",
"<li>Describe the type of sentence you're looking for</li><li>Give up to three examples of the desired output sentence, and describe each of them briefly</li>\\\n",
"<li>Hit <q>Create Dataset</q></li>\\\n",
"<li>Save the output (.txt) to your Google Drive</li>\")\n",
" with gr.Row():\n",
" language_choice = gr.Dropdown(choices=[\"English\", \"German\", \"French\"], label=\"Select language\", value=\"English\", interactive=True)\n",
" model_choice = gr.Dropdown(choices=[\"Llama\", \"OpenAI\"], label=\"Select model\", value=\"Llama\", interactive=True)\n",
" volume = gr.Textbox(label=\"Required number of sentences\", interactive=True)\n",
" with gr.Row():\n",
" typeInput = gr.Textbox(label=\"Short description of the kind of sentence you need\", interactive=True)\n",
" with gr.Row():\n",
" sentence_1 = gr.Textbox(label=\"Example sentence 1\", interactive=True)\n",
" instruction_1 = gr.Textbox(label=\"Description\", interactive=True)\n",
" with gr.Row():\n",
" sentence_2 = gr.Textbox(label=\"Example sentence 2\", interactive=True)\n",
" instruction_2 = gr.Textbox(label=\"Description\", interactive=True)\n",
" with gr.Row():\n",
" sentence_3 = gr.Textbox(label=\"Example sentence 3\", interactive=True)\n",
" instruction_3 = gr.Textbox(label=\"Description\", interactive=True)\n",
" with gr.Row():\n",
" liveSentences = gr.Markdown(\n",
" value='<div style=\"color: #999; padding: 10px;\">Your sentences will be displayed here …</div>',\n",
" label=\"Generated sentences:\",\n",
" min_height=60,\n",
" max_height=200\n",
" )\n",
" with gr.Row():\n",
" generate = gr.Button(value=\"Generate sentences\", elem_classes=\"blue-button\")\n",
" with gr.Row():\n",
" clear = gr.Button(value=\"Clear everything\", elem_classes=\"red-button\")\n",
" with gr.Row():\n",
" outputPath = gr.Textbox(label=\"Specify the desired name and location on your Google Drive for the sentences (plain text) to be saved\", interactive=True)\n",
" with gr.Row():\n",
" save = gr.Button(value=\"Save generated data\", elem_classes=\"blue-button\")\n",
"\n",
" def generateSentences(typeInput, s1, i1, s2, i2, s3, i3, volume, language, model):\n",
" global data\n",
" nature = \"\"\n",
" shots = {}\n",
" amount = int(volume) if re.search(\"^[0-9]+$\", volume) is not None else 10\n",
"\n",
" if typeInput != None:\n",
" nature = typeInput\n",
" else:\n",
" nature = \"Random sentences of mixed nature\"\n",
"\n",
" if s1 != None:\n",
" if i1 != None:\n",
" shots[i1] = s1\n",
" else:\n",
" shots[\"A medium-long random sentence about anything\"] = s1\n",
" else:\n",
" shots[\"A medium-long random sentence about anything\"] = \"Paul, waking up out of his half-drunken haze, clearly couldn't tell left from right and ran right into the door.\"\n",
"\n",
" if s2 != None:\n",
" if i2 != None:\n",
" shots[i2] = s2\n",
" else:\n",
" shots[\"A medium-long random sentence about anything\"] = s2\n",
"\n",
" if s3 != None:\n",
" if i3 != None:\n",
" shots[i3] = s3\n",
" else:\n",
" shots[\"A medium-long random sentence about anything\"] = s3\n",
"\n",
" sentences = dataset_generator(model, nature, shots, amount, language)\n",
" data = sentences\n",
"\n",
" return sentences\n",
"\n",
" def saveData(path):\n",
" global data\n",
" drive.mount(\"/content/drive\")\n",
"\n",
" dir_path = os.path.dirname(\"/content/drive/MyDrive/\" + path)\n",
"\n",
" if not os.path.exists(dir_path):\n",
" os.makedirs(dir_path)\n",
"\n",
" with open(\"/content/drive/MyDrive/\" + path, \"w\", encoding=\"utf-8\") as f:\n",
" f.write(data)\n",
"\n",
" generate.click(generateSentences, inputs=[typeInput, sentence_1, instruction_1, sentence_2, instruction_2, sentence_3, instruction_3, volume, language_choice, model_choice], outputs=liveSentences)\n",
" clear.click(\n",
" lambda: [\n",
" gr.update(value=\"\"),\n",
" gr.update(value=\"\"),\n",
" gr.update(value=\"\"),\n",
" gr.update(value=\"\"),\n",
" gr.update(value=\"\"),\n",
" gr.update(value=\"\"),\n",
" gr.update(value=\"\"),\n",
" gr.update(value=\"\"),\n",
" gr.update(value='<div style=\"color: #999; padding: 10px;\">Your sentences will be displayed here …</div>'),\n",
" gr.update(value=\"\"),\n",
" gr.update(value=\"Save generated data\", elem_classes=\"blue-button\")],\n",
" None,\n",
" [volume, typeInput, sentence_1, instruction_1, sentence_2, instruction_2,\n",
" sentence_3, instruction_3, liveSentences, outputPath, save],\n",
" queue=False\n",
" )\n",
" save.click(saveData, inputs=outputPath, outputs=None).then(lambda: gr.update(value=\"Your data has been saved\", elem_classes=\"green-button\"), [], [save])\n",
"\n",
"view.launch(share=True) #, debug=True)"
],
"metadata": {
"id": "VRKdu0fEt8mg"
},
"execution_count": null,
"outputs": []
}
]
}
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "eyfvQrLxdkGT"
},
"outputs": [],
"source": [
"import os\n",
"import requests\n",
"from IPython.display import Markdown, display, update_display\n",
"from openai import OpenAI\n",
"from google.colab import drive\n",
"from huggingface_hub import login\n",
"from google.colab import userdata\n",
"from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n",
"import torch\n",
"import gradio as gr\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "WW-cSZk7dnp6"
},
"outputs": [],
"source": [
"# one can always add more models, of course\n",
"\n",
"LLAMA = \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n",
"OPENAI_MODEL = \"gpt-4o-mini\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "XG7Iam6Rdw8F"
},
"outputs": [],
"source": [
"hf_token = userdata.get('HF_TOKEN')\n",
"login(hf_token, add_to_git_credential=True)\n",
"openai_api_key = userdata.get('OPENAI_API_KEY')\n",
"openai = OpenAI(api_key=openai_api_key)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Ov7WSdx9dzSt"
},
"outputs": [],
"source": [
"force_dark_mode = \"\"\"\n",
"function refresh() {\n",
" const url = new URL(window.location);\n",
" if (url.searchParams.get('__theme') !== 'dark') {\n",
" url.searchParams.set('__theme', 'dark');\n",
" window.location.href = url.href;\n",
" }\n",
"}\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "bEF8w_Mdd2Nb"
},
"outputs": [],
"source": [
"def dataset_generator(model, nature, shots, volume, language):\n",
"\n",
" examples = \"Instruction: 'Make a random sentence.'\\nAnswer: 'When I got home last night, I couldn't believe my eyes: All the pineapples had been removed from the pizza.'\"\n",
" system_message = \"You are a random sentence generator. Generate 10 diverse English sentences.\"\n",
" user_prompt = f\"Generate 10 random English sentences, like so:\\n{examples}\"\n",
" sentences = \"\"\n",
"\n",
" if language == \"English\":\n",
"\n",
" for shot in list(shots.keys()):\n",
" examples += f\"\\nExample instruction: '{shot}'\\nExample answer: '{shots[shot]}'\\n\"\n",
"\n",
" system_message = f\"You are a state-of-the art linguistic dataset compiler. You are given a 'Type' of sentence to create. \\\n",
"Within the bounds of that type, create {volume} diverse sentences with differing structures and lengths. Make the sentences plausible, \\\n",
"but be creative in filling them with random concrete information, names, and data. Here are some examples for how to go about that:\\n{examples}\\n\\\n",
"Just output one sentence per line. Do not comment or format yor output in any way, shape, or form.\"\n",
"\n",
" user_prompt = f\"Generate {volume} English sentences of the following Type: {nature}. Just output one sentence per line. \\\n",
"Do not comment or format yor output in any way, shape, or form.\"\n",
"\n",
" elif language == \"German\":\n",
"\n",
" for shot in list(shots.keys()):\n",
" examples += f\"\\nAnweisung: '{shot}'\\nAntwort: '{shots[shot]}'\\n\"\n",
"\n",
" system_message = f\"Du bist ein weltklasse Datensatz-Sammler für Sprachdaten. Du erhältst einen 'Typ' von Sätzen, die du erstellen sollst. \\\n",
"Im Rahmen dieses Typs, generiere {volume} untereinander verschiedene Sätze mit unterschiedlichen Satzlängen und -strukturen. Mache die Beispielsätze \\\n",
"plausibel, aber fülle sie kreativ mit willkürlichen Informationen, Namen, und Daten aller Art. Hier sind ein paar Beispiel, wie du vorgehen sollst:\\n{examples}\\n\\\n",
"Gib einfach einen Satz pro Zeile aus. Kommentiere oder formatiere deine Antwort in keinster Weise.\"\n",
"\n",
" user_prompt = f\"Generiere {volume} deutsche Sätze des folgenden Typs: {nature}. Gib einfach einen Satz pro Zeile aus. \\\n",
"Kommentiere oder formatiere deine Antwort in keiner Weise.\"\n",
"\n",
" elif language == \"French\":\n",
"\n",
" for shot in list(shots.keys()):\n",
" examples += f\"\\nConsigne: '{shot}'\\nRéponse: '{shots[shot]}'\\n\"\n",
"\n",
" system_message = f\"Tu es un outil linguistique de pointe, à savoir, un genérateur de données linguistiques. Tu seras assigné un 'Type' de phrases à créer. \\\n",
"Dans le cadre de ce type-là, crée {volume} phrases diverses, avec des structures et longueurs qui varient. Génère des phrases qui soient plausibles, \\\n",
"mais sois créatif, et sers-toi de données, noms, et informations aléatoires pour rendre les phrases plus naturelles. Voici quelques examples comment faire:\\n{examples}\\n\\\n",
"Sors une seule phrase par ligne. Ne formatte ni commente ta réponse en aucune manière que ce soit.\"\n",
"\n",
" user_prompt = f\"S'il te plaît, crée {volume} phrases en français du Type suivant: {nature}. Sors une seule phrase par ligne. \\\n",
"Ne formatte ni commente ta réponse en aucune manière que ce soit.\"\n",
"\n",
" messages = [\n",
" {\"role\": \"system\", \"content\": system_message},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ]\n",
"\n",
" if model == \"Llama\":\n",
"\n",
" quant_config = BitsAndBytesConfig(\n",
" load_in_4bit=True,\n",
" bnb_4bit_use_double_quant=True,\n",
" bnb_4bit_compute_dtype=torch.bfloat16,\n",
" bnb_4bit_quant_type=\"nf4\"\n",
" )\n",
"\n",
" tokenizer = AutoTokenizer.from_pretrained(LLAMA)\n",
" tokenizer.pad_token = tokenizer.eos_token\n",
" inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(\"cuda\")\n",
" streamer = TextStreamer(tokenizer)\n",
" model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map=\"auto\", quantization_config=quant_config)\n",
" outputs = model.generate(inputs, max_new_tokens=10000)\n",
"\n",
" response = tokenizer.decode(outputs[0])\n",
" sentences = list(re.finditer(\"(?:<\\|end_header_id\\|>)([^<]+)(?:<\\|eot_id\\|>)\", str(response), re.DOTALL))[-1].group(1)\n",
"\n",
" elif model == \"OpenAI\":\n",
" response = openai.chat.completions.create(model=OPENAI_MODEL, messages=messages)\n",
" sentences = response.choices[0].message.content\n",
"\n",
" return sentences"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "VRKdu0fEt8mg"
},
"outputs": [],
"source": [
"global data\n",
"data = \"\"\n",
"\n",
"with gr.Blocks(\n",
" css=\"\"\"\n",
" .red-button {\n",
" background-color: darkred !important;\n",
" border-color: red !important;\n",
" }\n",
" .blue-button {\n",
" background-color: darkblue !important;\n",
" border-color: blue !important;\n",
" }\n",
" .green-button {\n",
" background-color: green !important;\n",
" border-color: green !important;\n",
" }\n",
" \"\"\"\n",
") as view:\n",
" with gr.Row():\n",
" title = gr.HTML(\"<h1><big>D</big>ataset Generator <small>PLUS</small></h1><h2>for English, German, and French</h2>\")\n",
" subtitle = gr.HTML(\"<h3>Instructions:</h3><ol><li>Pick the language</li>\\\n",
"<li>Select a model</li><li>Indicate how many sentences you need</li>\\\n",
"<li>Describe the type of sentence you're looking for</li><li>Give up to three examples of the desired output sentence, and describe each of them briefly</li>\\\n",
"<li>Hit <q>Create Dataset</q></li>\\\n",
"<li>Save the output (.txt) to your Google Drive</li>\")\n",
" with gr.Row():\n",
" language_choice = gr.Dropdown(choices=[\"English\", \"German\", \"French\"], label=\"Select language\", value=\"English\", interactive=True)\n",
" model_choice = gr.Dropdown(choices=[\"Llama\", \"OpenAI\"], label=\"Select model\", value=\"Llama\", interactive=True)\n",
" volume = gr.Textbox(label=\"Required number of sentences\", interactive=True)\n",
" with gr.Row():\n",
" typeInput = gr.Textbox(label=\"Short description of the kind of sentence you need\", interactive=True)\n",
" with gr.Row():\n",
" sentence_1 = gr.Textbox(label=\"Example sentence 1\", interactive=True)\n",
" instruction_1 = gr.Textbox(label=\"Description\", interactive=True)\n",
" with gr.Row():\n",
" sentence_2 = gr.Textbox(label=\"Example sentence 2\", interactive=True)\n",
" instruction_2 = gr.Textbox(label=\"Description\", interactive=True)\n",
" with gr.Row():\n",
" sentence_3 = gr.Textbox(label=\"Example sentence 3\", interactive=True)\n",
" instruction_3 = gr.Textbox(label=\"Description\", interactive=True)\n",
" with gr.Row():\n",
" liveSentences = gr.Markdown(\n",
" value='<div style=\"color: #999; padding: 10px;\">Your sentences will be displayed here …</div>',\n",
" label=\"Generated sentences:\",\n",
" min_height=60,\n",
" max_height=200\n",
" )\n",
" with gr.Row():\n",
" generate = gr.Button(value=\"Generate sentences\", elem_classes=\"blue-button\")\n",
" with gr.Row():\n",
" clear = gr.Button(value=\"Clear everything\", elem_classes=\"red-button\")\n",
" with gr.Row():\n",
" outputPath = gr.Textbox(label=\"Specify the desired name and location on your Google Drive for the sentences (plain text) to be saved\", interactive=True)\n",
" with gr.Row():\n",
" save = gr.Button(value=\"Save generated data\", elem_classes=\"blue-button\")\n",
"\n",
" def generateSentences(typeInput, s1, i1, s2, i2, s3, i3, volume, language, model):\n",
" global data\n",
" nature = \"\"\n",
" shots = {}\n",
" amount = int(volume) if re.search(\"^[0-9]+$\", volume) is not None else 10\n",
"\n",
" if typeInput != None:\n",
" nature = typeInput\n",
" else:\n",
" nature = \"Random sentences of mixed nature\"\n",
"\n",
" if s1 != None:\n",
" if i1 != None:\n",
" shots[i1] = s1\n",
" else:\n",
" shots[\"A medium-long random sentence about anything\"] = s1\n",
" else:\n",
" shots[\"A medium-long random sentence about anything\"] = \"Paul, waking up out of his half-drunken haze, clearly couldn't tell left from right and ran right into the door.\"\n",
"\n",
" if s2 != None:\n",
" if i2 != None:\n",
" shots[i2] = s2\n",
" else:\n",
" shots[\"A medium-long random sentence about anything\"] = s2\n",
"\n",
" if s3 != None:\n",
" if i3 != None:\n",
" shots[i3] = s3\n",
" else:\n",
" shots[\"A medium-long random sentence about anything\"] = s3\n",
"\n",
" sentences = dataset_generator(model, nature, shots, amount, language)\n",
" data = sentences\n",
"\n",
" return sentences\n",
"\n",
" def saveData(path):\n",
" global data\n",
" drive.mount(\"/content/drive\")\n",
"\n",
" dir_path = os.path.dirname(\"/content/drive/MyDrive/\" + path)\n",
"\n",
" if not os.path.exists(dir_path):\n",
" os.makedirs(dir_path)\n",
"\n",
" with open(\"/content/drive/MyDrive/\" + path, \"w\", encoding=\"utf-8\") as f:\n",
" f.write(data)\n",
"\n",
" generate.click(generateSentences, inputs=[typeInput, sentence_1, instruction_1, sentence_2, instruction_2, sentence_3, instruction_3, volume, language_choice, model_choice], outputs=liveSentences)\n",
" clear.click(\n",
" lambda: [\n",
" gr.update(value=\"\"),\n",
" gr.update(value=\"\"),\n",
" gr.update(value=\"\"),\n",
" gr.update(value=\"\"),\n",
" gr.update(value=\"\"),\n",
" gr.update(value=\"\"),\n",
" gr.update(value=\"\"),\n",
" gr.update(value=\"\"),\n",
" gr.update(value='<div style=\"color: #999; padding: 10px;\">Your sentences will be displayed here …</div>'),\n",
" gr.update(value=\"\"),\n",
" gr.update(value=\"Save generated data\", elem_classes=\"blue-button\")],\n",
" None,\n",
" [volume, typeInput, sentence_1, instruction_1, sentence_2, instruction_2,\n",
" sentence_3, instruction_3, liveSentences, outputPath, save],\n",
" queue=False\n",
" )\n",
" save.click(saveData, inputs=outputPath, outputs=None).then(lambda: gr.update(value=\"Your data has been saved\", elem_classes=\"green-button\"), [], [save])\n",
"\n",
"view.launch(share=True) #, debug=True)"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"authorship_tag": "ABX9TyPxJzufoQPtui+nhl1J1xiR",
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@@ -387,7 +387,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "llm_engineering-yg2xCEUG",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -401,9 +401,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}