{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- This notebook creates dummy / test data from a use case provided by the user.\n",
"- The use case can be as simple or complex as the user wants (I've tested both, and the results are good).\n",
"- I've used a Phi-3 model because I'm having issues with Llama access on Hugging Face."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "s7ERjTCEKSi_"
},
"outputs": [],
"source": [
"!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate openai httpx==0.27.2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "GG5VMcmhcA2N"
},
"outputs": [],
"source": [
"import os\n",
"import requests\n",
"from openai import OpenAI\n",
"import gradio as gr\n",
"from IPython.display import Markdown, display, update_display\n",
"from huggingface_hub import login\n",
"from google.colab import userdata\n",
"from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n",
"import torch\n",
"import json\n",
"import re\n",
"import pandas as pd\n",
"import io"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "UfL-2XNicpEB"
},
"outputs": [],
"source": [
"# constants\n",
"\n",
"OPENAI = 'gpt-4o-mini'\n",
"PHI3 = \"microsoft/Phi-3-mini-4k-instruct\"\n",
"\n",
"limit = 100          # maximum number of rows in the generated dataset\n",
"max_tokens = 1000    # generation budget for the model\n",
"temperature = 0.3    # low temperature keeps the CSV structure stable"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "ZQ0dcQ6hdTPo"
},
"outputs": [],
"source": [
"# keys\n",
"\n",
"openai_api_key = userdata.get('OPENAI_API_KEY')\n",
"openai = OpenAI(api_key=openai_api_key)  # only needed for the GPT-4o-mini alternative sketched below\n",
"\n",
"hf_token = userdata.get('HF_TOKEN')\n",
"login(hf_token, add_to_git_credential=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "2eHsLdYgd2d_"
},
"outputs": [],
"source": [
"system_prompt = f\"\"\"You create synthetic datasets for testing purposes. Based on the use case description, generate a CSV dataset with appropriate columns and a maximum of {limit} rows\n",
"of realistic data.\n",
"\n",
"IMPORTANT RULES:\n",
"1. Return ONLY the CSV data with headers and ensure there are no duplicate headers\n",
"2. No explanatory text before or after\n",
"3. No markdown formatting or code fences\n",
"4. No quotation marks around the entire response\n",
"5. Start directly with the column headers\n",
"\n",
"Format: column1 (e.g. customer_id),column2 (e.g. country),column3 (e.g. age)\n",
"row1data,row1data,row1data\n",
"row2data,row2data,row2data\"\"\"\n",
"\n",
"def data_user_prompt(usecase):\n",
"    user_prompt = \"Create a synthetic dataset for the use case provided below: \"\n",
"    user_prompt += usecase\n",
"    user_prompt += (\" Respond in csv with appropriate headers. Do not include any other explanatory text, \"\n",
"                    \"markdown formatting or code fences, or quotation marks around the entire response. \"\n",
"                    f\"Limit the rows in the dataset to {limit}.\")\n",
"    return user_prompt\n",
"\n",
"# build the chat messages lazily, so `usecase` does not need to exist when this cell runs\n",
"def build_messages(usecase):\n",
"    return [\n",
"        {\"role\": \"system\", \"content\": system_prompt},\n",
"        {\"role\": \"user\", \"content\": data_user_prompt(usecase)}\n",
"    ]"
]
},
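{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check, the sketch below prints the assembled messages for a throwaway use case before any GPU time is spent. The bakery example is purely illustrative and not part of the original flow."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# illustrative sanity check: inspect the assembled chat messages\n",
"for m in build_messages(\"A small bakery wants test data for daily sales.\"):\n",
"    print(m[\"role\"].upper())\n",
"    print(m[\"content\"][:300], \"\\n\")"
]
},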
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "necoAEc1gNPF"
},
"outputs": [],
"source": [
"def dataset_call(usecase):\n",
"\n",
"    # quantisation: 4-bit NF4 so the model fits comfortably on a single GPU\n",
"    quant_config = BitsAndBytesConfig(\n",
"        load_in_4bit=True,\n",
"        bnb_4bit_use_double_quant=True,\n",
"        bnb_4bit_quant_type=\"nf4\",\n",
"        bnb_4bit_compute_dtype=torch.bfloat16\n",
"    )\n",
"\n",
"    # tokenization\n",
"    tokenizer = AutoTokenizer.from_pretrained(PHI3)\n",
"    tokenizer.pad_token = tokenizer.eos_token\n",
"\n",
"    # model (reloading on every call is slow; cache tokenizer and model globally if you call this often)\n",
"    model = AutoModelForCausalLM.from_pretrained(PHI3, quantization_config=quant_config, device_map=\"auto\")\n",
"\n",
"    # inputs & outputs\n",
"    messages = build_messages(usecase)\n",
"    inputs = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n",
"    model_inputs = tokenizer(inputs, return_tensors=\"pt\").to(model.device)\n",
"    # streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)\n",
"\n",
"    with torch.no_grad():\n",
"        outputs = model.generate(**model_inputs, max_new_tokens=max_tokens, do_sample=True, temperature=temperature)\n",
"\n",
"    # decode only the newly generated tokens, skipping the prompt\n",
"    response = tokenizer.decode(outputs[0][len(model_inputs['input_ids'][0]):], skip_special_tokens=True)\n",
"    return response.strip()"
]
},
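{
"cell_type": "markdown",
"metadata": {},
"source": [
"The OpenAI client was configured above but is not used by the Phi-3 path. Below is a minimal sketch of routing the same prompt to GPT-4o-mini instead; `dataset_call_openai` is a hypothetical helper name, not part of the original notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# sketch: same prompt, but against the OpenAI client configured earlier\n",
"def dataset_call_openai(usecase):\n",
"    completion = openai.chat.completions.create(\n",
"        model=OPENAI,\n",
"        messages=build_messages(usecase),\n",
"        max_tokens=max_tokens,\n",
"        temperature=temperature\n",
"    )\n",
"    return completion.choices[0].message.content.strip()"
]
},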
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "g8zEBraI0grT"
},
"outputs": [],
"source": [
"# convert a CSV string into a pandas DataFrame\n",
"\n",
"def csv_handler(csv_string):\n",
"    try:\n",
"        # Convert CSV string to DataFrame\n",
"        df = pd.read_csv(io.StringIO(csv_string))\n",
"        return df\n",
"    except Exception as e:\n",
"        # Return error message as DataFrame if parsing fails\n",
"        error_df = pd.DataFrame({\"Error\": [f\"Failed to parse CSV: {str(e)}\"]})\n",
"        return error_df"
]
},
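{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal, self-contained check of `csv_handler` on one well-formed and one malformed string (both strings are made up for illustration):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# illustrative check: the second string has more fields than headers,\n",
"# so pandas raises and csv_handler returns the error DataFrame instead\n",
"good = \"customer_id,country,age\\n1,UK,34\\n2,DE,51\"\n",
"bad = \"customer_id,country\\n1,UK,extra_field,oops\"\n",
"print(csv_handler(good))\n",
"print(csv_handler(bad))"
]
},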
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "vLPsusTL1zNB"
},
"outputs": [],
"source": [
"# use case to DataFrame\n",
"\n",
"def usecase_to_csv(usecase):\n",
"    try:\n",
"        # Get CSV string from the LLM\n",
"        csv_string = dataset_call(usecase)\n",
"\n",
"        # Process into DataFrame for Gradio display\n",
"        df = csv_handler(csv_string)\n",
"\n",
"        return df\n",
"\n",
"    except Exception as e:\n",
"        # return a single DataFrame so the output shape matches the success path\n",
"        error_df = pd.DataFrame({\"Error\": [f\"LLM processing failed: {str(e)}\"]})\n",
"        return error_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "H3WTLa9a2Rdy"
},
"outputs": [],
"source": [
"# passthrough helper: returns the CSV string if present, else an empty string\n",
"def download_csv(csv_string):\n",
"    if csv_string:\n",
"        return csv_string\n",
"    return \"\""
]
},
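{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you want an actual file rather than a passthrough string, one option is to persist the DataFrame with pandas and grab it from the Colab file browser. This is a sketch; `save_csv` and the filename are arbitrary choices, not part of the original notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# sketch: write a generated DataFrame to disk so it can be downloaded\n",
"def save_csv(df, path=\"synthetic_data.csv\"):\n",
"    df.to_csv(path, index=False)\n",
"    return path"
]
},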
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "XhMVSrVhjYvz"
},
"outputs": [],
"source": [
"# test\n",
"usecase = \"A financial services company is looking for synthetic data to test its Expected Credit Losses (ECL) model under IFRS9.\"\n",
"# dataset_call(usecase)\n",
"usecase_to_csv(usecase)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "z3Ze4o2qjs5y"
},
"outputs": [],
"source": [
"demo = gr.Interface(\n",
"    fn=usecase_to_csv,\n",
"    inputs=gr.Textbox(lines=5, label=\"Describe your use case\", placeholder=\"Describe the dataset you would like to create and how you will use it\"),\n",
"    outputs=gr.DataFrame(label=\"Here is your dataset!\", interactive=True),\n",
"    title=\"Friendly Neighbourhood Synthetic Data Creator!\",\n",
"    description=\"Let me know your use case for synthetic data and I will create it for you.\",\n",
"    examples=[\n",
"        \"Generate a dataset of 10 employees with name, department, salary, and years of experience\",\n",
"        \"Create sample e-commerce data with product names, categories, prices, and ratings\",\n",
"        \"Generate customer survey responses with demographics and satisfaction scores\",\n",
"        \"A financial services company is looking for synthetic data to test its Expected Credit Losses (ECL) model under IFRS9.\"\n",
"    ]\n",
")\n",
"\n",
"demo.launch(debug=True)"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"authorship_tag": "ABX9TyOay+EACzwO0uXDLuayhscX",
"gpuType": "L4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 4
}