From bc5675666319b9faec8efb33ef26b8f640c7d96b Mon Sep 17 00:00:00 2001
From: bluebells1 <142216106+bluebells1@users.noreply.github.com>
Date: Thu, 19 Jun 2025 10:31:37 +0100
Subject: [PATCH] Delete week3/community-contributions/llm.wk3synthetic-data-creator.ipynb

---
 .../llm.wk3synthetic-data-creator.ipynb | 295 ------------------
 1 file changed, 295 deletions(-)
 delete mode 100644 week3/community-contributions/llm.wk3synthetic-data-creator.ipynb

diff --git a/week3/community-contributions/llm.wk3synthetic-data-creator.ipynb b/week3/community-contributions/llm.wk3synthetic-data-creator.ipynb
deleted file mode 100644
index f026965..0000000
--- a/week3/community-contributions/llm.wk3synthetic-data-creator.ipynb
+++ /dev/null
@@ -1,295 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "- This creates dummy/test data from a use case provided by the user.\n",
-    "- The use case can be as simple or as complex as the user wants (I've tested both and the results are good).\n",
-    "- I've used a Phi-3 model as I'm having issues with Llama access on Hugging Face."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "s7ERjTCEKSi_"
-   },
-   "outputs": [],
-   "source": [
-    "!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate openai httpx==0.27.2"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "GG5VMcmhcA2N"
-   },
-   "outputs": [],
-   "source": [
-    "import os\n",
-    "import requests\n",
-    "from openai import OpenAI\n",
-    "import gradio as gr\n",
-    "from IPython.display import Markdown, display, update_display\n",
-    "from huggingface_hub import login\n",
-    "from google.colab import userdata\n",
-    "from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n",
-    "import torch\n",
-    "import json\n",
-    "import re\n",
-    "import pandas as pd\n",
-    "import io"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "UfL-2XNicpEB"
-   },
-   "outputs": [],
-   "source": [
-    "# constants\n",
-    "\n",
-    "OPENAI = 'gpt-4o-mini'\n",
-    "PHI3 = \"microsoft/Phi-3-mini-4k-instruct\"\n",
-    "\n",
-    "limit = 100\n",
-    "max_tokens = 1000\n",
-    "temperature = 0.3"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "ZQ0dcQ6hdTPo"
-   },
-   "outputs": [],
-   "source": [
-    "# keys\n",
-    "\n",
-    "openai_api_key = userdata.get('OPENAI_API_KEY')\n",
-    "openai = OpenAI(api_key=openai_api_key)\n",
-    "\n",
-    "hf_token = userdata.get('HF_TOKEN')\n",
-    "login(hf_token, add_to_git_credential=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "2eHsLdYgd2d_"
-   },
-   "outputs": [],
-   "source": [
-    "system_prompt = f\"\"\"You create synthetic datasets for testing purposes. Based on the use case description, generate a CSV dataset with appropriate columns and a maximum of {limit} rows\n",
-    "of realistic data.\n",
-    "\n",
-    "IMPORTANT RULES:\n",
-    "1. Return ONLY the CSV data with headers and ensure there are no duplicate headers\n",
-    "2. No explanatory text before or after\n",
-    "3. No markdown formatting or code fences\n",
-    "4. No quotation marks around the entire response\n",
-    "5. Start directly with the column headers\n",
-    "\n",
-    "Format: column1 (e.g. customer_id),column2 (e.g. country),column3 (e.g. age)\n",
age)\n", - "row1data,row1data,row1data\n", - "row2data,row2data,row2data\"\"\"\n", - "\n", - "def data_user_prompt(usecase):\n", - " user_prompt = \"Create a synthetic dataset for the use case provided below: \"\n", - " user_prompt += usecase\n", - " user_prompt += f\" Respond in csv with appropriate headers. Do not include any other explanatory text, markdown formatting or code fences, or quotation marks around the entire response. \\\n", - " Limit the rows in the dataset to {limit}.\"\n", - " return user_prompt\n", - "\n", - "messages = [\n", - " {\"role\":\"system\",\"content\":system_prompt},\n", - " {\"role\":\"user\",\"content\":data_user_prompt(usecase)}\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "necoAEc1gNPF" - }, - "outputs": [], - "source": [ - "def dataset_call(usecase):\n", - "\n", - " #quantisation\n", - " quant_config = BitsAndBytesConfig(\n", - " load_in_4bit=True,\n", - " bnb_4bit_use_double_quant=True,\n", - " bnb_4bit_quant_type=\"nf4\",\n", - " bnb_4bit_compute_dtype=torch.bfloat16\n", - " )\n", - "\n", - " #tokenization\n", - " tokenizer = AutoTokenizer.from_pretrained(PHI3)\n", - " tokenizer.pad_token = tokenizer.eos_token\n", - "\n", - " #model\n", - " model = AutoModelForCausalLM.from_pretrained(PHI3, quantization_config=quant_config, device_map=\"auto\")\n", - "\n", - " #inputs & outputs\n", - " inputs = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n", - " model_inputs = tokenizer(inputs, return_tensors=\"pt\").to(model.device)\n", - " #streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)\n", - "\n", - " with torch.no_grad():\n", - " outputs = model.generate(**model_inputs, max_new_tokens=max_tokens,do_sample=True, temperature=temperature)\n", - "\n", - " response = tokenizer.decode(outputs[0][len(model_inputs['input_ids'][0]):],skip_special_tokens=True)\n", - " return response.strip()\n", - " print(response.strip())\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "g8zEBraI0grT" - }, - "outputs": [], - "source": [ - "# convert csv string into panda\n", - "\n", - "def csv_handler(csv_string):\n", - "\n", - " try:\n", - " # Convert CSV string to DataFrame\n", - " df = pd.read_csv(io.StringIO(csv_string))\n", - " return df\n", - " except Exception as e:\n", - " # Return error message as DataFrame if parsing fails\n", - " error_df = pd.DataFrame({\"Error\": [f\"Failed to parse CSV: {str(e)}\"]})\n", - " return error_df\n", - " print(df, error_df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vLPsusTL1zNB" - }, - "outputs": [], - "source": [ - "# usecase to csv_string\n", - "\n", - "def usecase_to_csv(usecase):\n", - " try:\n", - " # Get CSV string from your LLM\n", - " csv_string = dataset_call(usecase)\n", - "\n", - " # Process into DataFrame for Gradio display\n", - " df = csv_handler(csv_string)\n", - "\n", - " return df\n", - "\n", - " except Exception as e:\n", - " error_df = pd.DataFrame({\"Error\": [f\"LLM processing failed: {str(e)}\"]})\n", - " return error_df, \"\", gr.update(visible=False)\n", - "\n", - " print(df, error_df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "H3WTLa9a2Rdy" - }, - "outputs": [], - "source": [ - "def download_csv(csv_string):\n", - " if csv_string:\n", - " return csv_string\n", - " return \"\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": 
"XhMVSrVhjYvz" - }, - "outputs": [], - "source": [ - "#test\n", - "usecase = \"A financial services company is looking for synthetic data to test its Expected Credit Losses (ECL) model under IFRS9.\"\n", - "#dataset_call(usecase)\n", - "usecase_to_csv(usecase)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "z3Ze4o2qjs5y" - }, - "outputs": [], - "source": [ - "\n", - "demo = gr.Interface(\n", - " fn = usecase_to_csv,\n", - " inputs = gr.Textbox(lines=5,label=\"Describe your usecase\",placeholder=\"Describe the dataset you would like to create and how you will use it\"),\n", - " outputs = gr.DataFrame(label=\"Here is your dataset!\",interactive=True),\n", - " title = \"Friendly Neighbourhood Synthetic Data Creator!\",\n", - " description = \"Let me know your use case for synthetic data and I will create it for you.\",\n", - " examples=[\n", - " \"Generate a dataset of 10 employees with name, department, salary, and years of experience\",\n", - " \"Create sample e-commerce data with product names, categories, prices, and ratings\",\n", - " \"Generate customer survey responses with demographics and satisfaction scores\",\n", - " \"A financial services company is looking for synthetic data to test its Expected Credit Losses (ECL) model under IFRS9.\"\n", - " ]\n", - ")\n", - "\n", - "demo.launch(debug=True)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ck1qdmbHo_G3" - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "authorship_tag": "ABX9TyOay+EACzwO0uXDLuayhscX", - "gpuType": "L4", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.13" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -}