From bc5675666319b9faec8efb33ef26b8f640c7d96b Mon Sep 17 00:00:00 2001
From: bluebells1 <142216106+bluebells1@users.noreply.github.com>
Date: Thu, 19 Jun 2025 10:31:37 +0100
Subject: [PATCH] Delete week3/community-contributions/llm.wk3synthetic-data-creator.ipynb

---
 .../llm.wk3synthetic-data-creator.ipynb | 295 ------------------
 1 file changed, 295 deletions(-)
 delete mode 100644 week3/community-contributions/llm.wk3synthetic-data-creator.ipynb

diff --git a/week3/community-contributions/llm.wk3synthetic-data-creator.ipynb b/week3/community-contributions/llm.wk3synthetic-data-creator.ipynb
deleted file mode 100644
index f026965..0000000
--- a/week3/community-contributions/llm.wk3synthetic-data-creator.ipynb
+++ /dev/null
@@ -1,295 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "- This creates dummy/test data from a use case provided by the user.\n",
-    "- The use case can be as simple or as complex as the user wants (I've tested both and the results are good).\n",
-    "- I've used a Phi-3 model as I'm having issues with Llama access on Hugging Face."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "s7ERjTCEKSi_"
-   },
-   "outputs": [],
-   "source": [
-    "!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate openai httpx==0.27.2"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "GG5VMcmhcA2N"
-   },
-   "outputs": [],
-   "source": [
-    "import os\n",
-    "import requests\n",
-    "from openai import OpenAI\n",
-    "import gradio as gr\n",
-    "from IPython.display import Markdown, display, update_display\n",
-    "from huggingface_hub import login\n",
-    "from google.colab import userdata\n",
-    "from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n",
-    "import torch\n",
-    "import json\n",
-    "import re\n",
-    "import pandas as pd\n",
-    "import io"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "UfL-2XNicpEB"
-   },
-   "outputs": [],
-   "source": [
-    "# constants\n",
-    "\n",
-    "OPENAI = 'gpt-4o-mini'\n",
-    "PHI3 = \"microsoft/Phi-3-mini-4k-instruct\"\n",
-    "\n",
-    "limit = 100\n",
-    "max_tokens = 1000\n",
-    "temperature = 0.3"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "ZQ0dcQ6hdTPo"
-   },
-   "outputs": [],
-   "source": [
-    "# keys\n",
-    "\n",
-    "openai_api_key = userdata.get('OPENAI_API_KEY')\n",
-    "openai = OpenAI(api_key=openai_api_key)\n",
-    "\n",
-    "hf_token = userdata.get('HF_TOKEN')\n",
-    "login(hf_token, add_to_git_credential=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "2eHsLdYgd2d_"
-   },
-   "outputs": [],
-   "source": [
-    "system_prompt = f\"\"\"You create synthetic datasets for testing purposes. Based on the use case description, generate a CSV dataset with appropriate columns and a maximum of {limit} rows\n",
-    "of realistic data.\n",
-    "\n",
-    "IMPORTANT RULES:\n",
-    "1. Return ONLY the CSV data with headers and ensure there are no duplicate headers\n",
-    "2. No explanatory text before or after\n",
-    "3. No markdown formatting or code fences\n",
-    "4. No quotation marks around the entire response\n",
-    "5. Start directly with the column headers\n",
-    "\n",
-    "Format: column1 (e.g. customer_id),column2 (e.g. country),column3 (e.g. age)\n",
age)\n", - "row1data,row1data,row1data\n", - "row2data,row2data,row2data\"\"\"\n", - "\n", - "def data_user_prompt(usecase):\n", - " user_prompt = \"Create a synthetic dataset for the use case provided below: \"\n", - " user_prompt += usecase\n", - " user_prompt += f\" Respond in csv with appropriate headers. Do not include any other explanatory text, markdown formatting or code fences, or quotation marks around the entire response. \\\n", - " Limit the rows in the dataset to {limit}.\"\n", - " return user_prompt\n", - "\n", - "messages = [\n", - " {\"role\":\"system\",\"content\":system_prompt},\n", - " {\"role\":\"user\",\"content\":data_user_prompt(usecase)}\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "necoAEc1gNPF" - }, - "outputs": [], - "source": [ - "def dataset_call(usecase):\n", - "\n", - " #quantisation\n", - " quant_config = BitsAndBytesConfig(\n", - " load_in_4bit=True,\n", - " bnb_4bit_use_double_quant=True,\n", - " bnb_4bit_quant_type=\"nf4\",\n", - " bnb_4bit_compute_dtype=torch.bfloat16\n", - " )\n", - "\n", - " #tokenization\n", - " tokenizer = AutoTokenizer.from_pretrained(PHI3)\n", - " tokenizer.pad_token = tokenizer.eos_token\n", - "\n", - " #model\n", - " model = AutoModelForCausalLM.from_pretrained(PHI3, quantization_config=quant_config, device_map=\"auto\")\n", - "\n", - " #inputs & outputs\n", - " inputs = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n", - " model_inputs = tokenizer(inputs, return_tensors=\"pt\").to(model.device)\n", - " #streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)\n", - "\n", - " with torch.no_grad():\n", - " outputs = model.generate(**model_inputs, max_new_tokens=max_tokens,do_sample=True, temperature=temperature)\n", - "\n", - " response = tokenizer.decode(outputs[0][len(model_inputs['input_ids'][0]):],skip_special_tokens=True)\n", - " return response.strip()\n", - " print(response.strip())\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "g8zEBraI0grT" - }, - "outputs": [], - "source": [ - "# convert csv string into panda\n", - "\n", - "def csv_handler(csv_string):\n", - "\n", - " try:\n", - " # Convert CSV string to DataFrame\n", - " df = pd.read_csv(io.StringIO(csv_string))\n", - " return df\n", - " except Exception as e:\n", - " # Return error message as DataFrame if parsing fails\n", - " error_df = pd.DataFrame({\"Error\": [f\"Failed to parse CSV: {str(e)}\"]})\n", - " return error_df\n", - " print(df, error_df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vLPsusTL1zNB" - }, - "outputs": [], - "source": [ - "# usecase to csv_string\n", - "\n", - "def usecase_to_csv(usecase):\n", - " try:\n", - " # Get CSV string from your LLM\n", - " csv_string = dataset_call(usecase)\n", - "\n", - " # Process into DataFrame for Gradio display\n", - " df = csv_handler(csv_string)\n", - "\n", - " return df\n", - "\n", - " except Exception as e:\n", - " error_df = pd.DataFrame({\"Error\": [f\"LLM processing failed: {str(e)}\"]})\n", - " return error_df, \"\", gr.update(visible=False)\n", - "\n", - " print(df, error_df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "H3WTLa9a2Rdy" - }, - "outputs": [], - "source": [ - "def download_csv(csv_string):\n", - " if csv_string:\n", - " return csv_string\n", - " return \"\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": 
"XhMVSrVhjYvz" - }, - "outputs": [], - "source": [ - "#test\n", - "usecase = \"A financial services company is looking for synthetic data to test its Expected Credit Losses (ECL) model under IFRS9.\"\n", - "#dataset_call(usecase)\n", - "usecase_to_csv(usecase)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "z3Ze4o2qjs5y" - }, - "outputs": [], - "source": [ - "\n", - "demo = gr.Interface(\n", - " fn = usecase_to_csv,\n", - " inputs = gr.Textbox(lines=5,label=\"Describe your usecase\",placeholder=\"Describe the dataset you would like to create and how you will use it\"),\n", - " outputs = gr.DataFrame(label=\"Here is your dataset!\",interactive=True),\n", - " title = \"Friendly Neighbourhood Synthetic Data Creator!\",\n", - " description = \"Let me know your use case for synthetic data and I will create it for you.\",\n", - " examples=[\n", - " \"Generate a dataset of 10 employees with name, department, salary, and years of experience\",\n", - " \"Create sample e-commerce data with product names, categories, prices, and ratings\",\n", - " \"Generate customer survey responses with demographics and satisfaction scores\",\n", - " \"A financial services company is looking for synthetic data to test its Expected Credit Losses (ECL) model under IFRS9.\"\n", - " ]\n", - ")\n", - "\n", - "demo.launch(debug=True)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ck1qdmbHo_G3" - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "authorship_tag": "ABX9TyOay+EACzwO0uXDLuayhscX", - "gpuType": "L4", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.13" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -}