Add projects from week 3 day 5
@@ -0,0 +1,295 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- This creates dummy/test data from a use case provided by the user.\n",
"- The use case can be as simple or complex as the user wants (I've tested both and the results are good).\n",
"- I've used a Phi-3 model as I'm having issues with Llama access on Hugging Face."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "s7ERjTCEKSi_"
},
"outputs": [],
"source": [
"!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate openai httpx==0.27.2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "GG5VMcmhcA2N"
},
"outputs": [],
"source": [
"import os\n",
"import requests\n",
"from openai import OpenAI\n",
"import gradio as gr\n",
"from IPython.display import Markdown, display, update_display\n",
"from huggingface_hub import login\n",
"from google.colab import userdata\n",
"from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n",
"import torch\n",
"import json\n",
"import re\n",
"import pandas as pd\n",
"import io"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "UfL-2XNicpEB"
},
"outputs": [],
"source": [
"# constants\n",
"\n",
"OPENAI = 'gpt-4o-mini'\n",
"PHI3 = \"microsoft/Phi-3-mini-4k-instruct\"\n",
"\n",
"limit = 100\n",
"max_tokens = 1000\n",
"temperature = 0.3"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "ZQ0dcQ6hdTPo"
},
"outputs": [],
"source": [
"# keys\n",
"\n",
"openai_api_key = userdata.get('OPENAI_API_KEY')\n",
"openai = OpenAI(api_key=openai_api_key)\n",
"\n",
"hf_token = userdata.get('HF_TOKEN')\n",
"login(hf_token, add_to_git_credential=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "2eHsLdYgd2d_"
},
"outputs": [],
"source": [
"system_prompt = f\"\"\"You create synthetic datasets for testing purposes. Based on the use case description, generate a CSV dataset with appropriate columns and a maximum of {limit} rows\n",
"of realistic data.\n",
"\n",
"IMPORTANT RULES:\n",
"1. Return ONLY the CSV data with headers and ensure there are no duplicate headers\n",
"2. No explanatory text before or after\n",
"3. No markdown formatting or code fences\n",
"4. No quotation marks around the entire response\n",
"5. Start directly with the column headers\n",
"\n",
"Format: column1 (e.g. customer_id),column2 (e.g. country),column3 (e.g. age)\n",
"row1data,row1data,row1data\n",
"row2data,row2data,row2data\"\"\"\n",
"\n",
"def data_user_prompt(usecase):\n",
"    user_prompt = \"Create a synthetic dataset for the use case provided below: \"\n",
"    user_prompt += usecase\n",
"    user_prompt += f\" Respond in csv with appropriate headers. Do not include any other explanatory text, markdown formatting or code fences, or quotation marks around the entire response. \\\n",
"    Limit the rows in the dataset to {limit}.\"\n",
"    return user_prompt\n",
"\n",
"# The chat messages are built per use case inside dataset_call() below."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "necoAEc1gNPF"
},
"outputs": [],
"source": [
"def dataset_call(usecase):\n",
"\n",
"    # build the chat messages for this use case\n",
"    messages = [\n",
"        {\"role\": \"system\", \"content\": system_prompt},\n",
"        {\"role\": \"user\", \"content\": data_user_prompt(usecase)}\n",
"    ]\n",
"\n",
"    # quantisation\n",
"    quant_config = BitsAndBytesConfig(\n",
"        load_in_4bit=True,\n",
"        bnb_4bit_use_double_quant=True,\n",
"        bnb_4bit_quant_type=\"nf4\",\n",
"        bnb_4bit_compute_dtype=torch.bfloat16\n",
"    )\n",
"\n",
"    # tokenization\n",
"    tokenizer = AutoTokenizer.from_pretrained(PHI3)\n",
"    tokenizer.pad_token = tokenizer.eos_token\n",
"\n",
"    # model\n",
"    model = AutoModelForCausalLM.from_pretrained(PHI3, quantization_config=quant_config, device_map=\"auto\")\n",
"\n",
"    # inputs & outputs\n",
"    inputs = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n",
"    model_inputs = tokenizer(inputs, return_tensors=\"pt\").to(model.device)\n",
"    # streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)\n",
"\n",
"    with torch.no_grad():\n",
"        outputs = model.generate(**model_inputs, max_new_tokens=max_tokens, do_sample=True, temperature=temperature)\n",
"\n",
"    # decode only the newly generated tokens (everything after the prompt)\n",
"    response = tokenizer.decode(outputs[0][len(model_inputs['input_ids'][0]):], skip_special_tokens=True)\n",
"    return response.strip()"
]
},
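{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional alternative (illustrative sketch, not part of the original flow): the OpenAI client and the `OPENAI` constant are configured above but never used. If you prefer a hosted model, the same prompts could be sent to gpt-4o-mini along the lines below. The helper name `dataset_call_openai` is made up for this example."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sketch only: same prompts, but served by the OpenAI API configured above.\n",
"def dataset_call_openai(usecase):\n",
"    response = openai.chat.completions.create(\n",
"        model=OPENAI,\n",
"        messages=[\n",
"            {\"role\": \"system\", \"content\": system_prompt},\n",
"            {\"role\": \"user\", \"content\": data_user_prompt(usecase)}\n",
"        ],\n",
"        temperature=temperature,\n",
"        max_tokens=max_tokens\n",
"    )\n",
"    return response.choices[0].message.content.strip()"
]
},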
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "g8zEBraI0grT"
},
"outputs": [],
"source": [
"# convert a CSV string into a pandas DataFrame\n",
"\n",
"def csv_handler(csv_string):\n",
"\n",
"    try:\n",
"        # Convert CSV string to DataFrame\n",
"        df = pd.read_csv(io.StringIO(csv_string))\n",
"        return df\n",
"    except Exception as e:\n",
"        # Return error message as DataFrame if parsing fails\n",
"        error_df = pd.DataFrame({\"Error\": [f\"Failed to parse CSV: {str(e)}\"]})\n",
"        return error_df"
]
},
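{
"cell_type": "markdown",
"metadata": {},
"source": [
"Quick sanity check (illustrative): csv_handler on a small hard-coded CSV string, so the parsing path can be exercised without calling the model."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative check with a hard-coded CSV string\n",
"sample_csv = \"customer_id,country,age\\n1,UK,34\\n2,France,29\"\n",
"csv_handler(sample_csv)"
]
},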
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "vLPsusTL1zNB"
},
"outputs": [],
"source": [
"# use case to CSV string to DataFrame\n",
"\n",
"def usecase_to_csv(usecase):\n",
"    try:\n",
"        # Get CSV string from the LLM\n",
"        csv_string = dataset_call(usecase)\n",
"\n",
"        # Process into DataFrame for Gradio display\n",
"        df = csv_handler(csv_string)\n",
"\n",
"        return df\n",
"\n",
"    except Exception as e:\n",
"        # Return a single DataFrame so the output matches the Gradio interface below\n",
"        error_df = pd.DataFrame({\"Error\": [f\"LLM processing failed: {str(e)}\"]})\n",
"        return error_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "H3WTLa9a2Rdy"
},
"outputs": [],
"source": [
"def download_csv(csv_string):\n",
"    if csv_string:\n",
"        return csv_string\n",
"    return \"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "XhMVSrVhjYvz"
},
"outputs": [],
"source": [
"# test\n",
"usecase = \"A financial services company is looking for synthetic data to test its Expected Credit Losses (ECL) model under IFRS9.\"\n",
"# dataset_call(usecase)\n",
"usecase_to_csv(usecase)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "z3Ze4o2qjs5y"
},
"outputs": [],
"source": [
"demo = gr.Interface(\n",
"    fn=usecase_to_csv,\n",
"    inputs=gr.Textbox(lines=5, label=\"Describe your use case\", placeholder=\"Describe the dataset you would like to create and how you will use it\"),\n",
"    outputs=gr.DataFrame(label=\"Here is your dataset!\", interactive=True),\n",
"    title=\"Friendly Neighbourhood Synthetic Data Creator!\",\n",
"    description=\"Let me know your use case for synthetic data and I will create it for you.\",\n",
"    examples=[\n",
"        \"Generate a dataset of 10 employees with name, department, salary, and years of experience\",\n",
"        \"Create sample e-commerce data with product names, categories, prices, and ratings\",\n",
"        \"Generate customer survey responses with demographics and satisfaction scores\",\n",
"        \"A financial services company is looking for synthetic data to test its Expected Credit Losses (ECL) model under IFRS9.\"\n",
"    ]\n",
")\n",
"\n",
"demo.launch(debug=True)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "ck1qdmbHo_G3"
},
"outputs": [],
"source": []
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"authorship_tag": "ABX9TyOay+EACzwO0uXDLuayhscX",
"gpuType": "L4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
287
week3/community-contributions/llm_wk3d5_minutecreator.ipynb
Normal file
@@ -0,0 +1,287 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "zmpDFA3bGEHY"
},
"source": [
"Meeting minute creator in Gradio from day 5 of week 3.\n",
"A couple of points to note:\n",
"\n",
"* My access to Llama hasn't been approved on Hugging Face, so I've experimented with some of the other models.\n",
"* There is a fair bit of debugging code in the main function because I was getting an error I couldn't track down. I've left it in, in case it's useful for others trying to debug their code.\n",
"* I was debugging with the help of Claude. It suggested wrapping the minute generation in `with torch.no_grad()`. The rationale is that it disables gradient computation, which isn't needed for inference, and I found it did speed things up.\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "l-5xKLFeJUGz"
},
"outputs": [],
"source": [
"!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate openai httpx==0.27.2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Wi-bBD9VdBMo"
},
"outputs": [],
"source": [
"import os\n",
"import requests\n",
"from openai import OpenAI\n",
"from IPython.display import Markdown, display, update_display\n",
"from google.colab import drive\n",
"from huggingface_hub import login\n",
"from google.colab import userdata\n",
"from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n",
"import torch\n",
"import gradio as gr"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "-0O-kuWtdk4I"
},
"outputs": [],
"source": [
"# keys\n",
"\n",
"# openai\n",
"openai_api_key = userdata.get('OPENAI_API_KEY')\n",
"openai = OpenAI(api_key=openai_api_key)\n",
"\n",
"# hf\n",
"hf_token = userdata.get('HF_TOKEN')\n",
"login(hf_token, add_to_git_credential=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "u6v3Ecileg1H"
},
"outputs": [],
"source": [
"# constants\n",
"\n",
"AUDIO_MODEL = 'gpt-4o-transcribe'\n",
"OPENAI_MODEL = 'gpt-4o-mini'\n",
"QWEN2_MODEL = 'Qwen/Qwen2.5-7B-Instruct' # runs slowly no matter what size GPU - kept crashing on RAM!\n",
"GEMMA2_MODEL = \"google/gemma-2-2b-it\" # doesn't use a system prompt\n",
"PHI3 = \"microsoft/Phi-3-mini-4k-instruct\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "3nSfA_KhfY38"
},
"outputs": [],
"source": [
"# convert audio to text\n",
"\n",
"def transcribe_audio(audio_file_path):\n",
"    try:\n",
"        with open(audio_file_path, 'rb') as audio_file:\n",
"            transcript = openai.audio.transcriptions.create(model=AUDIO_MODEL, file=audio_file, response_format=\"text\")\n",
"        return transcript\n",
"    except Exception as e:\n",
"        return f\"An error occurred: {str(e)}\""
]
},
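{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional quick check (illustrative): call transcribe_audio on a single file before wiring up Gradio. The path below is just a placeholder - point it at any mp3 you have uploaded to Colab."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative smoke test - replace the placeholder path with a real mp3 in your Colab session\n",
"# sample_transcript = transcribe_audio(\"/content/sample_meeting.mp3\")\n",
"# print(sample_transcript[:300])"
]
},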
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "OVmlY3DGgnYc"
},
"outputs": [],
"source": [
"# use the transcript to create minutes\n",
"# uses an open-source model\n",
"\n",
"def create_minutes(transcript):\n",
"\n",
"    # first try block is for setup (and debugging)\n",
"    try:\n",
"        print(f\"Starting to create minutes with transcript length: {len(str(transcript))}\")\n",
"\n",
"        if not transcript or len(str(transcript).strip()) == 0:\n",
"            return \"Error: Empty or invalid transcript\"\n",
"\n",
"        # messages\n",
"        system_prompt = \"You are an expert creator of meeting minutes. Based on a meeting transcript you can summarise the meeting title and date, attendees, key discussion points, key outcomes, actions and owners and next steps. Respond in Markdown.\"\n",
"        user_prompt = f\"Create meeting minutes from the transcript provided. The minutes should be clear but succinct and should include title and date, attendees, key discussion points, key outcomes, actions and owners, and next steps. {transcript}\"\n",
"\n",
"        messages = [\n",
"            {\"role\":\"system\",\"content\":system_prompt},\n",
"            {\"role\":\"user\",\"content\":user_prompt}\n",
"        ]\n",
"        print(\"Messages prepared successfully\") # for debugging\n",
"\n",
"        # quantisation (for the open-source model)\n",
"        quantization_config = BitsAndBytesConfig(\n",
"            load_in_4bit=True,\n",
"            bnb_4bit_use_double_quant=True,\n",
"            bnb_4bit_quant_type=\"nf4\",\n",
"            bnb_4bit_compute_dtype=torch.bfloat16\n",
"        )\n",
"\n",
"    except Exception as e:\n",
"        return f\"An error occurred in setup: {str(e)}\"\n",
"\n",
"    # model & tokeniser\n",
"    try:\n",
"        print(\"Loading tokeniser....\") # for debugging\n",
"        tokenizer = AutoTokenizer.from_pretrained(PHI3)\n",
"        tokenizer.pad_token = tokenizer.eos_token\n",
"\n",
"        print(\"Loading model.....\") # for debugging\n",
"        model = AutoModelForCausalLM.from_pretrained(PHI3, device_map='auto', quantization_config=quantization_config)\n",
"        print(f\"Model loaded on device {model.device}\") # for debugging\n",
"\n",
"        # chat template\n",
"        inputs = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n",
"        model_inputs = tokenizer(inputs, return_tensors=\"pt\").to(model.device)\n",
"\n",
"        # torch.no_grad suggested by Claude. This disables gradient computation, which reduces memory usage and speeds things up\n",
"        print(\"Generating text....\") # for debugging\n",
"        with torch.no_grad():\n",
"            outputs = model.generate(**model_inputs, max_new_tokens=2000, do_sample=True, temperature=0.7)\n",
"        print(f\"Generation complete. Output shape: {outputs.shape}\") # for debugging\n",
"\n",
"        # ***debugging***\n",
"\n",
"        # Decode the generated text (excluding the input prompt)\n",
"        print(\"Starting text decoding...\") # debugging\n",
"        input_length = len(model_inputs['input_ids'][0]) # debugging\n",
"        print(f\"Input length: {input_length}, Output length: {len(outputs[0])}\") # debugging\n",
"\n",
"        if len(outputs[0]) <= input_length: # debugging\n",
"            return \"Error: Model didn't generate any new tokens. Try reducing input length or increasing max_new_tokens.\" # debugging\n",
"\n",
"        generated_tokens = outputs[0][input_length:] # debugging\n",
"        print(f\"Generated tokens length: {len(generated_tokens)}\") # debugging\n",
"\n",
"        # decode generated text\n",
"        generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)\n",
"        print(f\"Decoded text length: {len(generated_text)}\")\n",
"\n",
"        return generated_text.strip()\n",
"\n",
"    except ImportError as e:\n",
"        return f\"Import error - missing library: {str(e)}. Please install required packages.\"\n",
"    except torch.cuda.OutOfMemoryError as e:\n",
"        return f\"CUDA out of memory: {str(e)}. Try reducing max_new_tokens to 500 or use CPU.\"\n",
"    except RuntimeError as e:\n",
"        return f\"Runtime error: {str(e)}. This might be a CUDA/device issue.\"\n",
"    except Exception as e:\n",
"        return f\"Unexpected error during text generation: {type(e).__name__}: {str(e)}\"\n"
]
},
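{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional quick check (illustrative): run create_minutes on a short hard-coded transcript to confirm the model loads and generates before trying a full audio file. The sample transcript below is made up."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative smoke test with a tiny made-up transcript\n",
"# sample_transcript = \"Weekly project meeting, 1 July. Attendees: Asha, Ben. Agreed to ship the report Friday; Ben to draft it.\"\n",
"# display(Markdown(create_minutes(sample_transcript)))"
]
},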
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "c63zzoDopw6u"
},
"outputs": [],
"source": [
"# end-to-end process for gradio\n",
"\n",
"def gr_process(audio_file, progress=gr.Progress()):\n",
"\n",
"    if audio_file is None:\n",
"        return \"Please provide an audio file\"\n",
"\n",
"    try:\n",
"        progress(0, desc=\"Analysing file\")\n",
"        transcript = transcribe_audio(audio_file)\n",
"\n",
"        if transcript.startswith(\"An error occurred\"):\n",
"            return transcript\n",
"\n",
"        progress(0.5, desc=\"File analysed, generating minutes\")\n",
"\n",
"        minutes = create_minutes(transcript)\n",
"        progress(0.9, desc=\"Nearly there\")\n",
"\n",
"        return minutes\n",
"\n",
"    except Exception as e:\n",
"        return f\"An error occurred: {str(e)}\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "82fyQELQkGty"
},
"outputs": [],
"source": [
"# gradio interface\n",
"\n",
"demo = gr.Interface(\n",
"    fn=gr_process,\n",
"    inputs=gr.Audio(type=\"filepath\", label=\"Upload MP3 file\"),\n",
"    outputs=gr.Markdown(label=\"Meeting minutes\"),\n",
"    title=\"Meeting minute creator\",\n",
"    description=\"Upload an mp3 audio file for a meeting and I will provide the minutes!\"\n",
")\n",
"\n",
"if __name__ == \"__main__\":\n",
"    demo.launch(debug=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "XljpyS7Nvxkh"
},
"outputs": [],
"source": []
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 4
}