Merge pull request #453 from bluebells1/sm-branch
Wk2 day 3 physio chat bot
334
week2/community-contributions/joke-calc-tool-wk2d4.ipynb
Normal file
@@ -0,0 +1,334 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "19152e0e-350d-44d4-b763-52e5edcf4f68",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Seeing if I can get a simple calculator tool to work. I wasn't sure whether it was using my calculator (as it's so simple!) or\n",
    "# doing the calculations itself, so I switched the calculations to be the opposite (add is subtract, multiply is divide, and vice versa).\n",
    "# This works most of the time, but there were times it defaulted back to its own logic. Interested to know how this works in a\n",
    "# real-life scenario - how can you ensure that the model uses the prescribed \"tool\" and doesn't just answer from its training data?\n",
    "# (An illustrative tool_choice sketch appears after the tool definitions below.)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "fa9cf7ef-ae13-4f5a-9c93-0cf3636676b7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# imports\n",
    "\n",
    "# api requests, llm, and llm keys\n",
    "import os\n",
    "from dotenv import load_dotenv\n",
    "import requests\n",
    "from openai import OpenAI\n",
    "\n",
    "# text & json format\n",
    "from IPython.display import Markdown, display\n",
    "import json\n",
    "\n",
    "# dev\n",
    "from typing import List, Dict, Any, Union\n",
    "\n",
    "# gradio\n",
    "import gradio as gr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "2bc8fe65-2993-4a01-b384-7a285a783e34",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "All good\n"
     ]
    }
   ],
   "source": [
    "# set LLM keys\n",
    "\n",
    "load_dotenv(override=True)\n",
    "api_key = os.getenv(\"OPENAI_API_KEY\")\n",
    "\n",
    "if api_key:\n",
    "    print(\"All good\")\n",
    "else:\n",
    "    print(\"Key issue\")\n",
    "\n",
    "openai = OpenAI()\n",
    "MODEL = \"gpt-4o\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "8cbdb64c-858b-49c4-80e3-e0018e92da3b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# create calculator tool - the operations are deliberately swapped (see the note above)\n",
    "\n",
    "class Calculator:\n",
    "\n",
    "    def add(self, a: float, b: float) -> float:\n",
    "        return a - b\n",
    "\n",
    "    def minus(self, a: float, b: float) -> float:\n",
    "        return a + b\n",
    "\n",
    "    def divide(self, a: float, b: float) -> float:\n",
    "        return a * b\n",
    "\n",
    "    def multiply(self, a: float, b: float) -> Union[float, str]:\n",
    "        if b == 0:\n",
    "            return \"Error: cannot divide by zero\"\n",
    "        return a / b"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "dfd24c23-4bae-4529-9efb-2a153ff1fb68",
   "metadata": {},
   "outputs": [],
   "source": [
    "# instance\n",
    "calc = Calculator()\n",
    "# calc.add(5,3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "966f12bd-6cfd-44b2-8732-d04c35a32123",
   "metadata": {},
   "outputs": [],
   "source": [
    "# define functions - each description deliberately matches the swapped behaviour, not the name\n",
    "\n",
    "calculator_tools = [\n",
    "    {\n",
    "        \"type\": \"function\",\n",
    "        \"function\": {\n",
    "            \"name\": \"minus\",\n",
    "            \"description\": \"add two numbers together\",\n",
    "            \"parameters\": {\n",
    "                \"type\": \"object\",\n",
    "                \"properties\": {\n",
    "                    \"a\": {\"type\":\"number\",\"description\":\"first number\"},\n",
    "                    \"b\": {\"type\":\"number\",\"description\":\"second number\"}\n",
    "                },\n",
    "                \"required\": [\"a\",\"b\"]\n",
    "            }\n",
    "        }\n",
    "    },\n",
    "    {\n",
    "        \"type\": \"function\",\n",
    "        \"function\": {\n",
    "            \"name\": \"add\",\n",
    "            \"description\": \"first number minus the second number\",\n",
    "            \"parameters\": {\n",
    "                \"type\": \"object\",\n",
    "                \"properties\": {\n",
    "                    \"a\": {\"type\":\"number\",\"description\":\"first number\"},\n",
    "                    \"b\": {\"type\":\"number\",\"description\":\"second number\"}\n",
    "                },\n",
    "                \"required\": [\"a\",\"b\"]\n",
    "            }\n",
    "        }\n",
    "    },\n",
    "    {\n",
    "        \"type\": \"function\",\n",
    "        \"function\": {\n",
    "            \"name\": \"divide\",\n",
    "            \"description\": \"first number multiplied by the second number\",\n",
    "            \"parameters\": {\n",
    "                \"type\": \"object\",\n",
    "                \"properties\": {\n",
    "                    \"a\": {\"type\":\"number\",\"description\":\"first number\"},\n",
    "                    \"b\": {\"type\":\"number\",\"description\":\"second number\"}\n",
    "                },\n",
    "                \"required\": [\"a\",\"b\"]\n",
    "            }\n",
    "        }\n",
    "    },\n",
    "    {\n",
    "        \"type\": \"function\",\n",
    "        \"function\": {\n",
    "            \"name\": \"multiply\",\n",
    "            \"description\": \"Divide the first number by the second number\",\n",
    "            \"parameters\": {\n",
    "                \"type\": \"object\",\n",
    "                \"properties\": {\n",
    "                    \"a\": {\"type\":\"number\",\"description\":\"first number\"},\n",
    "                    \"b\": {\"type\":\"number\",\"description\":\"second number\"}\n",
    "                },\n",
    "                \"required\": [\"a\",\"b\"]\n",
    "            }\n",
    "        }\n",
    "    }\n",
    "]"
   ]
  },
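  {
   "cell_type": "markdown",
   "id": "tool-choice-note",
   "metadata": {},
   "source": [
    "An illustrative aside on the question in the first cell: instead of `tool_choice=\"auto\"`, the Chat Completions API also accepts `tool_choice=\"required\"` (the model must call some tool) or a dict naming one specific function. A minimal sketch, not part of the original notebook, assuming the `openai`, `MODEL`, and `calculator_tools` objects defined above:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "tool-choice-sketch",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Minimal sketch (not original): force the model to call the \"add\" tool rather than letting it decide.\n",
    "forced = openai.chat.completions.create(\n",
    "    model=MODEL,\n",
    "    messages=[{\"role\": \"user\", \"content\": \"What is 7 + 5?\"}],\n",
    "    tools=calculator_tools,\n",
    "    tool_choice={\"type\": \"function\", \"function\": {\"name\": \"add\"}},  # or \"required\" to force any tool\n",
    ")\n",
    "print(forced.choices[0].message.tool_calls[0].function.name)  # expected: add"
   ]
  },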
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "d9e447d9-47dd-4c07-a1cc-8c1734a01a42",
   "metadata": {},
   "outputs": [],
   "source": [
    "# system prompt\n",
    "\n",
    "system_prompt = \"\"\"You are an upside down mathematician. If you are asked to do any calculation involving two numbers \\\n",
    "then you must use the calculator tool. Do not do the calculations yourself. Examples: \\\n",
    "What is 7 + 5? Use the calculator tool \\\n",
    "If I divide 25 by 3, what do I get? Use the calculator tool \\\n",
    "How are you today? Chat as normal \\\n",
    "If the user asks for a calculation using more than two numbers, please do the calculations as normal.\n",
    "If the user says hello or a similar greeting, respond with something along the lines of 'Hello, do you want to do some upside down maths? 😜'\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "87e5a23f-36d4-4d3e-b9ab-6e826339029b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# chat message\n",
    "\n",
    "def chat_message(message, history):\n",
    "    messages = [{\"role\": \"system\", \"content\": system_prompt}] + history + [{\"role\": \"user\", \"content\": message}]\n",
    "    response = openai.chat.completions.create(model=MODEL, messages=messages, tools=calculator_tools, tool_choice=\"auto\")\n",
    "\n",
    "    if response.choices[0].finish_reason == \"tool_calls\":\n",
    "        assistant_message = response.choices[0].message\n",
    "        tool_response = calc_tool_call(assistant_message)\n",
    "        messages.append(assistant_message)\n",
    "        messages.append(tool_response)\n",
    "        response = openai.chat.completions.create(model=MODEL, messages=messages)\n",
    "\n",
    "    return response.choices[0].message.content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "58a1a26c-b2ef-4f44-b07a-bd03e6f2ebc2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# tool call\n",
    "\n",
    "def calc_tool_call(message):\n",
    "    tool_call = message.tool_calls[0]\n",
    "    function_name = tool_call.function.name\n",
    "    arguments = json.loads(tool_call.function.arguments)\n",
    "    a = arguments.get('a')\n",
    "    b = arguments.get('b')\n",
    "\n",
    "    if function_name == \"add\":\n",
    "        result = calc.add(a, b)\n",
    "    elif function_name == \"minus\":\n",
    "        result = calc.minus(a, b)\n",
    "    elif function_name == \"multiply\":\n",
    "        result = calc.multiply(a, b)\n",
    "    elif function_name == \"divide\":\n",
    "        result = calc.divide(a, b)\n",
    "    else:\n",
    "        result = f\"unknown function: {function_name}\"\n",
    "\n",
    "    response = {\n",
    "        \"role\": \"tool\",\n",
    "        \"content\": str(result),\n",
    "        \"tool_call_id\": tool_call.id\n",
    "    }\n",
    "    return response"
   ]
  },
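  {
   "cell_type": "markdown",
   "id": "multi-tool-note",
   "metadata": {},
   "source": [
    "A hedged aside: `calc_tool_call` above only handles the first entry in `message.tool_calls`, but the model may request several calls in one turn. The variant below is an illustrative sketch, not part of the original notebook; the helper name `calc_tool_calls` is made up here."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "multi-tool-sketch",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative sketch (not original): return one tool message per requested call.\n",
    "def calc_tool_calls(message):\n",
    "    responses = []\n",
    "    for tool_call in message.tool_calls:\n",
    "        arguments = json.loads(tool_call.function.arguments)\n",
    "        fn = getattr(calc, tool_call.function.name, None)\n",
    "        result = fn(arguments.get('a'), arguments.get('b')) if fn else f\"unknown function: {tool_call.function.name}\"\n",
    "        responses.append({\"role\": \"tool\", \"content\": str(result), \"tool_call_id\": tool_call.id})\n",
    "    return responses  # extend messages with all of these before the follow-up call"
   ]
  },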
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "db81ec95-11ad-4b46-ae4a-774666faca59",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "* Running on local URL: http://127.0.0.1:7862\n",
      "* To create a public link, set `share=True` in `launch()`.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div><iframe src=\"http://127.0.0.1:7862/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": []
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# gradio chat\n",
    "gr.ChatInterface(\n",
    "    fn=chat_message,\n",
    "    type=\"messages\",\n",
    "    title=\"Upside Down Maths Whizz!\",\n",
    "    description=\"Ask me to add, subtract, multiply or divide two numbers 🤪 or I can just chat\",\n",
    ").launch()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8bf49c53-fe9a-4a0d-aff9-c1127eb168e8",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
145
week2/community-contributions/physio-chat-bot-(wk2-d3).ipynb
Normal file
@@ -0,0 +1,145 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7318991a-4fef-49f6-876b-b3b27500a7e1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# A simple chatbot using Gradio, exploring some of the other arguments under ChatInterface\n",
    "# Also testing adding to the community :)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5310e151-f7d7-4f7c-aa65-adad2615e061",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import requests\n",
    "from dotenv import load_dotenv\n",
    "from openai import OpenAI\n",
    "import gradio as gr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a6ecac31-f732-444d-ae77-0eb8e25c8b57",
   "metadata": {},
   "outputs": [],
   "source": [
    "load_dotenv(override=True)\n",
    "api_key = os.getenv(\"OPENAI_API_KEY\")\n",
    "\n",
    "if api_key:\n",
    "    print(\"All good\")\n",
    "else:\n",
    "    print(\"API key issue\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "37cf0880-8665-4e45-ae65-ff88dddebaad",
   "metadata": {},
   "outputs": [],
   "source": [
    "MODEL = \"gpt-4o-mini\"\n",
    "openai = OpenAI()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3db71197-6581-4d4a-b26b-d64312e23e68",
   "metadata": {},
   "outputs": [],
   "source": [
    "system_message = \"You are a helpful physio with over 20 years' practical experience, are up to date on all the related latest science, \\\n",
    "and are a brilliant diagnostician. You are very sceptical of medical systems and doctors. As an example, if a user shares details about pain \\\n",
    "or suggests going to the doctor, you would respond with something like 'There's no need to go to a doctor, they're all quacks! Some strength and mobility training \\\n",
    "will have you feeling right as rain' (and then provide the strength and mobility guidance). \\\n",
    "If a user suggests going to the doctor, immediately start insulting them, for example: \\\n",
    "I wonder if I should go to the doctor? You should reply - Oh dear - I have a wimp on my hands, maybe you should go straight to the hospital when you have an itchy foot 🙄 \\\n",
    "Do not insult them if they do not suggest going to the doctor and are just asking for advice!\"\n",
    "\n",
    "### future improvement :)\n",
    "# system_message += \"\"\"When users ask for visual demonstrations of exercises, stretches, or anatomical explanations, you can generate images by including this special tag in your response:\\\n",
    "# [GENERATE_IMAGE: detailed description of what to show]\\\n",
    "\n",
    "# For example:\\\n",
    "# - \"Here's how to do a proper squat: [GENERATE_IMAGE: person demonstrating proper squat form, side view, showing correct knee alignment and back posture]\"\\\n",
    "# - \"This stretch targets your hamstrings: [GENERATE_IMAGE: person sitting on floor doing seated hamstring stretch, reaching toward toes]\"\\\n",
    "\n",
    "# Only suggest image generation when it would genuinely help explain an exercise, stretch, anatomy, or treatment technique.\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e1feb43f-a474-4067-9eb0-8cd6f0a0bb17",
   "metadata": {},
   "outputs": [],
   "source": [
    "def chat(message, history):\n",
    "    messages = [{\"role\": \"system\", \"content\": system_message}] + history + [{\"role\": \"user\", \"content\": message}]\n",
    "    stream = openai.chat.completions.create(model=MODEL, messages=messages, stream=True)\n",
    "\n",
    "    response = \"\"\n",
    "    for chunk in stream:\n",
    "        response += chunk.choices[0].delta.content or ''\n",
    "        yield response"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5a62dbc8-69bd-4dd7-9318-f9aae9d10884",
   "metadata": {},
   "outputs": [],
   "source": [
    "gr.ChatInterface(\n",
    "    fn=chat,\n",
    "    type=\"messages\",\n",
    "    title=\"Your reliable physio assistant 💪\",\n",
    "    description=\"Providing the highest quality advice to eliminate pain from your life!\",\n",
    "    examples=[\"How do I treat a sprained ankle?\", \"What exercises can help a sore lower back?\", \"What should I do if I have tight hips?\", \"I have pain in my rotator cuff, what should I do?\"],\n",
    "    cache_examples=True\n",
    ").launch(share=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "510bf362-8595-4a6b-a0bc-8c54ef550a26",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
287
week3/community-contributions/llm-wk3d5-minutecreator.ipynb
Normal file
@@ -0,0 +1,287 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "zmpDFA3bGEHY"
   },
   "source": [
    "Minute creator in Gradio from day 5 of week 3.\n",
    "A couple of points to note:\n",
    "\n",
    "* My access to llama hasn't been approved on Hugging Face, so I've experimented with some of the other models.\n",
    "* There is a fair bit of debugging code in the main function, as I was getting an error and couldn't find it. I've left it in just in case it's useful for others trying to debug their code.\n",
    "* I was debugging with the help of Claude. It suggested using `torch.no_grad()` for the minute output. The rationale is that it disables gradient computation, which isn't necessary for inference, and I found it did speed things up.\n"
   ]
  },
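  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal illustrative sketch (not part of the original notebook) of the `torch.no_grad()` pattern referred to above: inside the context manager, autograd stops recording operations, so inference uses less memory and time."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "\n",
    "x = torch.randn(4, 4, requires_grad=True)\n",
    "with torch.no_grad():\n",
    "    y = x @ x  # no gradient graph is built here\n",
    "print(y.requires_grad)  # False"
   ]
  },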
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "l-5xKLFeJUGz"
   },
   "outputs": [],
   "source": [
    "!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate openai httpx==0.27.2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "Wi-bBD9VdBMo"
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import requests\n",
    "from openai import OpenAI\n",
    "from IPython.display import Markdown, display, update_display\n",
    "from google.colab import drive\n",
    "from huggingface_hub import login\n",
    "from google.colab import userdata\n",
    "from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n",
    "import torch\n",
    "import gradio as gr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "-0O-kuWtdk4I"
   },
   "outputs": [],
   "source": [
    "# keys\n",
    "\n",
    "# openai\n",
    "openai_api_key = userdata.get('OPENAI_API_KEY')\n",
    "openai = OpenAI(api_key=openai_api_key)\n",
    "\n",
    "# hf\n",
    "hf_token = userdata.get('HF_TOKEN')\n",
    "login(hf_token, add_to_git_credential=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "u6v3Ecileg1H"
   },
   "outputs": [],
   "source": [
    "# constants\n",
    "\n",
    "AUDIO_MODEL = 'gpt-4o-transcribe'\n",
    "OPENAI_MODEL = 'gpt-4o-mini'\n",
    "QWEN2_MODEL = 'Qwen/Qwen2.5-7B-Instruct' # runs slowly no matter what size gpu - kept crashing on ram!\n",
    "GEMMA2_MODEL = \"google/gemma-2-2b-it\" # doesn't use a system prompt\n",
    "PHI3 = \"microsoft/Phi-3-mini-4k-instruct\""
   ]
  },
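  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A hedged aside on the `GEMMA2_MODEL` comment above: Gemma-style chat templates reject a `system` role, so one workaround is to fold the system prompt into the first user turn before applying the chat template. The helper below is an illustrative sketch, not part of the original flow; the name `merge_system_prompt` is made up here."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def merge_system_prompt(messages):\n",
    "    # Fold any system content into the first user message, for models without a system role.\n",
    "    system_text = \"\\n\".join(m[\"content\"] for m in messages if m[\"role\"] == \"system\")\n",
    "    merged = [m for m in messages if m[\"role\"] != \"system\"]\n",
    "    if system_text and merged and merged[0][\"role\"] == \"user\":\n",
    "        merged[0] = {\"role\": \"user\", \"content\": system_text + \"\\n\\n\" + merged[0][\"content\"]}\n",
    "    return merged"
   ]
  },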
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "3nSfA_KhfY38"
   },
   "outputs": [],
   "source": [
    "# convert audio to text\n",
    "\n",
    "def transcribe_audio(audio_file_path):\n",
    "    try:\n",
    "        with open(audio_file_path, 'rb') as audio_file:\n",
    "            transcript = openai.audio.transcriptions.create(model=AUDIO_MODEL, file=audio_file, response_format=\"text\")\n",
    "        return transcript\n",
    "    except Exception as e:\n",
    "        return f\"An error occurred: {str(e)}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "OVmlY3DGgnYc"
   },
   "outputs": [],
   "source": [
    "# use transcript to create minutes\n",
    "# use open source model\n",
    "\n",
    "def create_minutes(transcript):\n",
    "\n",
    "    # first try is for debugging\n",
    "    try:\n",
    "        print(f\"Starting to create minutes with transcript length: {len(str(transcript))}\")\n",
    "\n",
    "        if not transcript or len(str(transcript).strip()) == 0:\n",
    "            return \"Error: Empty or invalid transcript\"\n",
    "\n",
    "        # messages\n",
    "        system_prompt = \"You are an expert creator of meeting minutes. Based on a meeting transcript you can summarise the meeting title and date, attendees, key discussion points, key outcomes, actions and owners and next steps. Respond in Markdown.\"\n",
    "        user_prompt = f\"Create meeting minutes from the transcript provided. The minutes should be clear but succinct and should include title and date, attendees, key discussion points, key outcomes, actions and owners, and next steps. {transcript}\"\n",
    "\n",
    "        messages = [\n",
    "            {\"role\": \"system\", \"content\": system_prompt},\n",
    "            {\"role\": \"user\", \"content\": user_prompt}\n",
    "        ]\n",
    "        print(\"Messages prepared successfully\")  # for debugging\n",
    "\n",
    "        # quantisation (for os model)\n",
    "        quantization_config = BitsAndBytesConfig(\n",
    "            load_in_4bit=True,\n",
    "            bnb_4bit_use_double_quant=True,\n",
    "            bnb_4bit_quant_type=\"nf4\",\n",
    "            bnb_4bit_compute_dtype=torch.bfloat16\n",
    "        )\n",
    "\n",
    "    except Exception as e:\n",
    "        return f\"An error occurred in setup: {str(e)}\"\n",
    "\n",
    "    # model & tokeniser\n",
    "    try:\n",
    "        print(\"Loading tokeniser....\")  # for debugging\n",
    "        tokenizer = AutoTokenizer.from_pretrained(PHI3)\n",
    "        tokenizer.pad_token = tokenizer.eos_token\n",
    "\n",
    "        print(\"Loading model.....\")  # for debugging\n",
    "        model = AutoModelForCausalLM.from_pretrained(PHI3, device_map='auto', quantization_config=quantization_config)\n",
    "        print(f\"Model loaded on device {model.device}\")  # for debugging\n",
    "\n",
    "        # chat template\n",
    "        inputs = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n",
    "        model_inputs = tokenizer(inputs, return_tensors=\"pt\").to(model.device)\n",
    "\n",
    "        # torch.no_grad suggested by Claude. This disables gradient computation, which reduces memory usage and speeds things up\n",
    "        print(\"Generating text....\")  # for debugging\n",
    "        with torch.no_grad():\n",
    "            outputs = model.generate(**model_inputs, max_new_tokens=2000, do_sample=True, temperature=0.7)\n",
    "        print(f\"Generation complete. Output shape: {outputs.shape}\")  # for debugging\n",
    "\n",
    "        # ***debugging***\n",
    "\n",
    "        # Decode the generated text (excluding the input prompt)\n",
    "        print(\"Starting text decoding...\")  # debugging\n",
    "        input_length = len(model_inputs['input_ids'][0])  # debugging\n",
    "        print(f\"Input length: {input_length}, Output length: {len(outputs[0])}\")  # debugging\n",
    "\n",
    "        if len(outputs[0]) <= input_length:  # debugging\n",
    "            return \"Error: Model didn't generate any new tokens. Try reducing input length or increasing max_new_tokens.\"  # debugging\n",
    "\n",
    "        generated_tokens = outputs[0][input_length:]  # debugging\n",
    "        print(f\"Generated tokens length: {len(generated_tokens)}\")  # debugging\n",
    "\n",
    "        # decode generated text\n",
    "        generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)\n",
    "        print(f\"Decoded text length: {len(generated_text)}\")\n",
    "\n",
    "        return generated_text.strip()\n",
    "\n",
    "    except ImportError as e:\n",
    "        return f\"Import error - missing library: {str(e)}. Please install required packages.\"\n",
    "    except torch.cuda.OutOfMemoryError as e:\n",
    "        return f\"CUDA out of memory: {str(e)}. Try reducing max_new_tokens to 500 or use CPU.\"\n",
    "    except RuntimeError as e:\n",
    "        return f\"Runtime error: {str(e)}. This might be a CUDA/device issue.\"\n",
    "    except Exception as e:\n",
    "        return f\"Unexpected error during text generation: {type(e).__name__}: {str(e)}\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "c63zzoDopw6u"
   },
   "outputs": [],
   "source": [
    "# create process for gradio\n",
    "\n",
    "def gr_process(audio_file, progress=gr.Progress()):\n",
    "\n",
    "    if audio_file is None:\n",
    "        return \"Please provide an audio file\"\n",
    "\n",
    "    try:\n",
    "        progress(0, desc=\"Analysing file\")\n",
    "        transcript = transcribe_audio(audio_file)\n",
    "\n",
    "        if transcript.startswith(\"An error occurred\"):\n",
    "            return transcript\n",
    "\n",
    "        progress(0.5, desc=\"File analysed, generating minutes\")\n",
    "\n",
    "        minutes = create_minutes(transcript)\n",
    "        progress(0.9, desc=\"Nearly there\")\n",
    "\n",
    "        return minutes\n",
    "\n",
    "    except Exception as e:\n",
    "        return f\"An error occurred: {str(e)}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "82fyQELQkGty"
   },
   "outputs": [],
   "source": [
    "# gradio interface\n",
    "\n",
    "demo = gr.Interface(\n",
    "    fn=gr_process,\n",
    "    inputs=gr.Audio(type=\"filepath\", label=\"Upload MP3 file\"),\n",
    "    outputs=gr.Markdown(label=\"Meeting minutes\"),\n",
    "    title=\"Meeting minute creator\",\n",
    "    description=\"Upload an mp3 audio file for a meeting and I will provide the minutes!\"\n",
    ")\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    demo.launch(debug=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "XljpyS7Nvxkh"
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "gpuType": "T4",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
@@ -0,0 +1,295 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- This creates dummy / test data from a use case provided by the user.\n",
    "- The use case can be as simple or complex as the user wants (I've tested both and the results are good).\n",
    "- I've used a Phi3 model as I'm having issues with llama access on Hugging Face."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "s7ERjTCEKSi_"
   },
   "outputs": [],
   "source": [
    "!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate openai httpx==0.27.2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "GG5VMcmhcA2N"
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import requests\n",
    "from openai import OpenAI\n",
    "import gradio as gr\n",
    "from IPython.display import Markdown, display, update_display\n",
    "from huggingface_hub import login\n",
    "from google.colab import userdata\n",
    "from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n",
    "import torch\n",
    "import json\n",
    "import re\n",
    "import pandas as pd\n",
    "import io"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "UfL-2XNicpEB"
   },
   "outputs": [],
   "source": [
    "# constants\n",
    "\n",
    "OPENAI = 'gpt-4o-mini'\n",
    "PHI3 = \"microsoft/Phi-3-mini-4k-instruct\"\n",
    "\n",
    "limit = 100\n",
    "max_tokens = 1000\n",
    "temperature = 0.3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "ZQ0dcQ6hdTPo"
   },
   "outputs": [],
   "source": [
    "# keys\n",
    "\n",
    "openai_api_key = userdata.get('OPENAI_API_KEY')\n",
    "openai = OpenAI(api_key=openai_api_key)\n",
    "\n",
    "hf_token = userdata.get('HF_TOKEN')\n",
    "login(hf_token, add_to_git_credential=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "2eHsLdYgd2d_"
   },
   "outputs": [],
   "source": [
    "system_prompt = f\"\"\"You create synthetic datasets for testing purposes. Based on the use case description, generate a CSV dataset with appropriate columns and a maximum of {limit} rows\n",
    "of realistic data.\n",
    "\n",
    "IMPORTANT RULES:\n",
    "1. Return ONLY the CSV data with headers and ensure there are no duplicate headers\n",
    "2. No explanatory text before or after\n",
    "3. No markdown formatting or code fences\n",
    "4. No quotation marks around the entire response\n",
    "5. Start directly with the column headers\n",
    "\n",
    "Format: column1 (e.g. customer_id),column2 (e.g. country),column3 (e.g. age)\n",
    "row1data,row1data,row1data\n",
    "row2data,row2data,row2data\"\"\"\n",
    "\n",
    "def data_user_prompt(usecase):\n",
    "    user_prompt = \"Create a synthetic dataset for the use case provided below: \"\n",
    "    user_prompt += usecase\n",
    "    user_prompt += f\" Respond in csv with appropriate headers. Do not include any other explanatory text, markdown formatting or code fences, or quotation marks around the entire response. \\\n",
    "    Limit the rows in the dataset to {limit}.\"\n",
    "    return user_prompt\n",
    "\n",
    "# The messages list is built inside dataset_call (below), where the use case is actually known."
   ]
  },
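  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A hedged aside: despite the rules above, models sometimes wrap CSV output in markdown code fences anyway. The helper below is an illustrative sketch, not part of the original flow (the name `strip_fences` is made up here); it uses the `re` module imported above and could be applied to the model's response before `pd.read_csv`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def strip_fences(text):\n",
    "    # Remove a leading ```csv (or similar) fence and a trailing ``` fence, if present.\n",
    "    text = text.strip()\n",
    "    text = re.sub(r\"^```[a-zA-Z]*\\s*\", \"\", text)\n",
    "    text = re.sub(r\"\\s*```$\", \"\", text)\n",
    "    return text.strip()\n",
    "\n",
    "# e.g. strip_fences('```csv\\nid,age\\n1,30\\n```') returns 'id,age\\n1,30'"
   ]
  },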
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "necoAEc1gNPF"
   },
   "outputs": [],
   "source": [
    "def dataset_call(usecase):\n",
    "\n",
    "    # messages (built here so the prompt reflects the use case passed in)\n",
    "    messages = [\n",
    "        {\"role\": \"system\", \"content\": system_prompt},\n",
    "        {\"role\": \"user\", \"content\": data_user_prompt(usecase)}\n",
    "    ]\n",
    "\n",
    "    # quantisation\n",
    "    quant_config = BitsAndBytesConfig(\n",
    "        load_in_4bit=True,\n",
    "        bnb_4bit_use_double_quant=True,\n",
    "        bnb_4bit_quant_type=\"nf4\",\n",
    "        bnb_4bit_compute_dtype=torch.bfloat16\n",
    "    )\n",
    "\n",
    "    # tokenization\n",
    "    tokenizer = AutoTokenizer.from_pretrained(PHI3)\n",
    "    tokenizer.pad_token = tokenizer.eos_token\n",
    "\n",
    "    # model\n",
    "    model = AutoModelForCausalLM.from_pretrained(PHI3, quantization_config=quant_config, device_map=\"auto\")\n",
    "\n",
    "    # inputs & outputs\n",
    "    inputs = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n",
    "    model_inputs = tokenizer(inputs, return_tensors=\"pt\").to(model.device)\n",
    "    # streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)\n",
    "\n",
    "    with torch.no_grad():\n",
    "        outputs = model.generate(**model_inputs, max_new_tokens=max_tokens, do_sample=True, temperature=temperature)\n",
    "\n",
    "    response = tokenizer.decode(outputs[0][len(model_inputs['input_ids'][0]):], skip_special_tokens=True)\n",
    "    return response.strip()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "g8zEBraI0grT"
   },
   "outputs": [],
   "source": [
    "# convert csv string into pandas\n",
    "\n",
    "def csv_handler(csv_string):\n",
    "    try:\n",
    "        # Convert CSV string to DataFrame\n",
    "        df = pd.read_csv(io.StringIO(csv_string))\n",
    "        return df\n",
    "    except Exception as e:\n",
    "        # Return error message as DataFrame if parsing fails\n",
    "        error_df = pd.DataFrame({\"Error\": [f\"Failed to parse CSV: {str(e)}\"]})\n",
    "        return error_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "vLPsusTL1zNB"
   },
   "outputs": [],
   "source": [
    "# usecase to csv_string\n",
    "\n",
    "def usecase_to_csv(usecase):\n",
    "    try:\n",
    "        # Get CSV string from the LLM\n",
    "        csv_string = dataset_call(usecase)\n",
    "\n",
    "        # Process into DataFrame for Gradio display\n",
    "        df = csv_handler(csv_string)\n",
    "        return df\n",
    "\n",
    "    except Exception as e:\n",
    "        # Return a single DataFrame so the output matches the Gradio interface below\n",
    "        error_df = pd.DataFrame({\"Error\": [f\"LLM processing failed: {str(e)}\"]})\n",
    "        return error_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "H3WTLa9a2Rdy"
   },
   "outputs": [],
   "source": [
    "def download_csv(csv_string):\n",
    "    if csv_string:\n",
    "        return csv_string\n",
    "    return \"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "XhMVSrVhjYvz"
   },
   "outputs": [],
   "source": [
    "# test\n",
    "usecase = \"A financial services company is looking for synthetic data to test its Expected Credit Losses (ECL) model under IFRS9.\"\n",
    "# dataset_call(usecase)\n",
    "usecase_to_csv(usecase)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "z3Ze4o2qjs5y"
   },
   "outputs": [],
   "source": [
    "demo = gr.Interface(\n",
    "    fn=usecase_to_csv,\n",
    "    inputs=gr.Textbox(lines=5, label=\"Describe your usecase\", placeholder=\"Describe the dataset you would like to create and how you will use it\"),\n",
    "    outputs=gr.DataFrame(label=\"Here is your dataset!\", interactive=True),\n",
    "    title=\"Friendly Neighbourhood Synthetic Data Creator!\",\n",
    "    description=\"Let me know your use case for synthetic data and I will create it for you.\",\n",
    "    examples=[\n",
    "        \"Generate a dataset of 10 employees with name, department, salary, and years of experience\",\n",
    "        \"Create sample e-commerce data with product names, categories, prices, and ratings\",\n",
    "        \"Generate customer survey responses with demographics and satisfaction scores\",\n",
    "        \"A financial services company is looking for synthetic data to test its Expected Credit Losses (ECL) model under IFRS9.\"\n",
    "    ]\n",
    ")\n",
    "\n",
    "demo.launch(debug=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "ck1qdmbHo_G3"
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "authorship_tag": "ABX9TyOay+EACzwO0uXDLuayhscX",
   "gpuType": "L4",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}