Delete week3/community-contributions/llm.wk3synthetic-data-creator.ipynb
@@ -1,295 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- This creates dummy / test data from a use case provided by the user.\n",
"- The use case can be as simple or complex as the user wants (I've tested both and the results are good).\n",
"- I've used a Phi-3 model as I'm having issues with Llama access on Hugging Face."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "s7ERjTCEKSi_"
},
"outputs": [],
"source": [
"!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate openai httpx==0.27.2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "GG5VMcmhcA2N"
},
"outputs": [],
"source": [
"import os\n",
"import requests\n",
"from openai import OpenAI\n",
"import gradio as gr\n",
"from IPython.display import Markdown, display, update_display\n",
"from huggingface_hub import login\n",
"from google.colab import userdata\n",
"from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n",
"import torch\n",
"import json\n",
"import re\n",
"import pandas as pd\n",
"import io"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "UfL-2XNicpEB"
},
"outputs": [],
"source": [
"# constants\n",
"\n",
"OPENAI = 'gpt-4o-mini'\n",
"PHI3 = \"microsoft/Phi-3-mini-4k-instruct\"\n",
"\n",
"limit = 100  # maximum number of rows in the generated dataset\n",
"max_tokens = 1000  # cap on newly generated tokens\n",
"temperature = 0.3  # kept low so the CSV structure stays consistent"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "ZQ0dcQ6hdTPo"
},
"outputs": [],
"source": [
"# keys\n",
"\n",
"openai_api_key = userdata.get('OPENAI_API_KEY')\n",
"openai = OpenAI(api_key=openai_api_key)\n",
"\n",
"hf_token = userdata.get('HF_TOKEN')\n",
"login(hf_token, add_to_git_credential=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "2eHsLdYgd2d_"
},
"outputs": [],
"source": [
"system_prompt = f\"\"\"You create synthetic datasets for testing purposes. Based on the use case description, generate a CSV dataset with appropriate columns and a maximum of {limit} rows\n",
"of realistic data.\n",
"\n",
"IMPORTANT RULES:\n",
"1. Return ONLY the CSV data with headers and ensure there are no duplicate headers\n",
"2. No explanatory text before or after\n",
"3. No markdown formatting or code fences\n",
"4. No quotation marks around the entire response\n",
"5. Start directly with the column headers\n",
"\n",
"Format: column1 (e.g. customer_id),column2 (e.g. country),column3 (e.g. age)\n",
"row1data,row1data,row1data\n",
"row2data,row2data,row2data\"\"\"\n",
"\n",
"def data_user_prompt(usecase):\n",
"    user_prompt = \"Create a synthetic dataset for the use case provided below: \"\n",
"    user_prompt += usecase\n",
"    user_prompt += f\" Respond in CSV with appropriate headers. Do not include any other explanatory text, markdown formatting or code fences, or quotation marks around the entire response. Limit the rows in the dataset to {limit}.\"\n",
"    return user_prompt\n",
"\n",
"# assemble the chat messages for a given use case\n",
"def build_messages(usecase):\n",
"    return [\n",
"        {\"role\": \"system\", \"content\": system_prompt},\n",
"        {\"role\": \"user\", \"content\": data_user_prompt(usecase)}\n",
"    ]"
]
},
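{
"cell_type": "markdown",
"metadata": {},
"source": [
"- Quick look at the assembled user prompt (a minimal sketch; the example use case string below is made up purely for illustration)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# hypothetical example use case, only to inspect the generated prompt text\n",
"print(data_user_prompt(\"A retail bank wants sample transaction data with account_id, date and amount\"))"
]
},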
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "necoAEc1gNPF"
},
"outputs": [],
"source": [
"def dataset_call(usecase):\n",
"\n",
"    # quantisation: load the model in 4-bit (NF4) so it fits on a single GPU\n",
"    quant_config = BitsAndBytesConfig(\n",
"        load_in_4bit=True,\n",
"        bnb_4bit_use_double_quant=True,\n",
"        bnb_4bit_quant_type=\"nf4\",\n",
"        bnb_4bit_compute_dtype=torch.bfloat16\n",
"    )\n",
"\n",
"    # tokenization\n",
"    tokenizer = AutoTokenizer.from_pretrained(PHI3)\n",
"    tokenizer.pad_token = tokenizer.eos_token\n",
"\n",
"    # model\n",
"    model = AutoModelForCausalLM.from_pretrained(PHI3, quantization_config=quant_config, device_map=\"auto\")\n",
"\n",
"    # inputs & outputs\n",
"    messages = build_messages(usecase)\n",
"    inputs = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n",
"    model_inputs = tokenizer(inputs, return_tensors=\"pt\").to(model.device)\n",
"    #streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)\n",
"\n",
"    with torch.no_grad():\n",
"        outputs = model.generate(**model_inputs, max_new_tokens=max_tokens, do_sample=True, temperature=temperature)\n",
"\n",
"    # decode only the newly generated tokens, skipping the echoed prompt\n",
"    response = tokenizer.decode(outputs[0][len(model_inputs['input_ids'][0]):], skip_special_tokens=True)\n",
"    return response.strip()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "g8zEBraI0grT"
},
"outputs": [],
"source": [
"# convert the CSV string into a pandas DataFrame\n",
"\n",
"def csv_handler(csv_string):\n",
"\n",
"    try:\n",
"        # Convert CSV string to DataFrame\n",
"        df = pd.read_csv(io.StringIO(csv_string))\n",
"        return df\n",
"    except Exception as e:\n",
"        # Return error message as DataFrame if parsing fails\n",
"        error_df = pd.DataFrame({\"Error\": [f\"Failed to parse CSV: {str(e)}\"]})\n",
"        return error_df"
]
},
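{
"cell_type": "markdown",
"metadata": {},
"source": [
"- A quick sanity check for `csv_handler` (a minimal sketch; the sample CSV string below is made up purely for illustration)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# hypothetical sample string, just to confirm the CSV-to-DataFrame round trip\n",
"sample_csv = \"customer_id,country,age\\n1,UK,34\\n2,India,27\"\n",
"csv_handler(sample_csv)"
]
},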
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "vLPsusTL1zNB"
},
"outputs": [],
"source": [
"# use case to DataFrame (via the LLM's CSV string)\n",
"\n",
"def usecase_to_csv(usecase):\n",
"    try:\n",
"        # Get CSV string from the LLM\n",
"        csv_string = dataset_call(usecase)\n",
"\n",
"        # Process into DataFrame for Gradio display\n",
"        df = csv_handler(csv_string)\n",
"\n",
"        return df\n",
"\n",
"    except Exception as e:\n",
"        # Return the error as a DataFrame so it matches the single Gradio output\n",
"        error_df = pd.DataFrame({\"Error\": [f\"LLM processing failed: {str(e)}\"]})\n",
"        return error_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "H3WTLa9a2Rdy"
},
"outputs": [],
"source": [
"def download_csv(csv_string):\n",
"    if csv_string:\n",
"        return csv_string\n",
"    return \"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "XhMVSrVhjYvz"
},
"outputs": [],
"source": [
"#test\n",
"usecase = \"A financial services company is looking for synthetic data to test its Expected Credit Losses (ECL) model under IFRS9.\"\n",
"#dataset_call(usecase)\n",
"usecase_to_csv(usecase)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "z3Ze4o2qjs5y"
},
"outputs": [],
"source": [
"demo = gr.Interface(\n",
"    fn=usecase_to_csv,\n",
"    inputs=gr.Textbox(lines=5, label=\"Describe your use case\", placeholder=\"Describe the dataset you would like to create and how you will use it\"),\n",
"    outputs=gr.DataFrame(label=\"Here is your dataset!\", interactive=True),\n",
"    title=\"Friendly Neighbourhood Synthetic Data Creator!\",\n",
"    description=\"Let me know your use case for synthetic data and I will create it for you.\",\n",
"    examples=[\n",
"        \"Generate a dataset of 10 employees with name, department, salary, and years of experience\",\n",
"        \"Create sample e-commerce data with product names, categories, prices, and ratings\",\n",
"        \"Generate customer survey responses with demographics and satisfaction scores\",\n",
"        \"A financial services company is looking for synthetic data to test its Expected Credit Losses (ECL) model under IFRS9.\"\n",
"    ]\n",
")\n",
"\n",
"demo.launch(debug=True)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "ck1qdmbHo_G3"
},
"outputs": [],
"source": []
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"authorship_tag": "ABX9TyOay+EACzwO0uXDLuayhscX",
"gpuType": "L4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 4
}