{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- This notebook creates dummy / test data from a use case provided by the user.\n",
"- The use case can be as simple or complex as the user wants (I've tested both, and the results are good).\n",
"- I've used a Phi-3 model because I'm having issues with Llama access on Hugging Face."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "s7ERjTCEKSi_"
},
"outputs": [],
"source": [
"!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate openai httpx==0.27.2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "GG5VMcmhcA2N"
},
"outputs": [],
"source": [
"import os\n",
"import requests\n",
"from openai import OpenAI\n",
"import gradio as gr\n",
"from IPython.display import Markdown, display, update_display\n",
"from huggingface_hub import login\n",
"from google.colab import userdata\n",
"from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n",
"import torch\n",
"import json\n",
"import re\n",
"import pandas as pd\n",
"import io"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "UfL-2XNicpEB"
},
"outputs": [],
"source": [
"# constants\n",
"\n",
"OPENAI = 'gpt-4o-mini'\n",
"PHI3 = \"microsoft/Phi-3-mini-4k-instruct\"\n",
"\n",
"limit = 100          # maximum number of rows in the generated dataset\n",
"max_tokens = 1000    # generation budget for the model\n",
"temperature = 0.3    # low temperature keeps the CSV structure stable"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "ZQ0dcQ6hdTPo"
},
"outputs": [],
"source": [
"# keys\n",
"\n",
"openai_api_key = userdata.get('OPENAI_API_KEY')\n",
"openai = OpenAI(api_key=openai_api_key)  # only needed for the GPT-4o-mini alternative sketched below\n",
"\n",
"hf_token = userdata.get('HF_TOKEN')\n",
"login(hf_token, add_to_git_credential=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "2eHsLdYgd2d_"
},
"outputs": [],
"source": [
"system_prompt = f\"\"\"You create synthetic datasets for testing purposes. Based on the use case description, generate a CSV dataset with appropriate columns and a maximum of {limit} rows\n",
"of realistic data.\n",
"\n",
"IMPORTANT RULES:\n",
"1. Return ONLY the CSV data with headers and ensure there are no duplicate headers\n",
"2. No explanatory text before or after\n",
"3. No markdown formatting or code fences\n",
"4. No quotation marks around the entire response\n",
"5. Start directly with the column headers\n",
"\n",
"Format: column1 (e.g. customer_id),column2 (e.g. country),column3 (e.g. age)\n",
"row1data,row1data,row1data\n",
"row2data,row2data,row2data\"\"\"\n",
"\n",
"def data_user_prompt(usecase):\n",
"    user_prompt = \"Create a synthetic dataset for the use case provided below: \"\n",
"    user_prompt += usecase\n",
"    user_prompt += (\" Respond in csv with appropriate headers. Do not include any other explanatory text, \"\n",
"                    \"markdown formatting or code fences, or quotation marks around the entire response. \"\n",
"                    f\"Limit the rows in the dataset to {limit}.\")\n",
"    return user_prompt\n",
"\n",
"# build the chat messages lazily, so `usecase` does not need to exist when this cell runs\n",
"def build_messages(usecase):\n",
"    return [\n",
"        {\"role\": \"system\", \"content\": system_prompt},\n",
"        {\"role\": \"user\", \"content\": data_user_prompt(usecase)}\n",
"    ]"
]
},
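{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check, the sketch below prints the assembled messages for a throwaway use case before any GPU time is spent. The bakery example is purely illustrative and not part of the original flow."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# illustrative sanity check: inspect the assembled chat messages\n",
"for m in build_messages(\"A small bakery wants test data for daily sales.\"):\n",
"    print(m[\"role\"].upper())\n",
"    print(m[\"content\"][:300], \"\\n\")"
]
},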
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "necoAEc1gNPF"
},
"outputs": [],
"source": [
"def dataset_call(usecase):\n",
"\n",
"    # quantisation: 4-bit NF4 so the model fits comfortably on a single GPU\n",
"    quant_config = BitsAndBytesConfig(\n",
"        load_in_4bit=True,\n",
"        bnb_4bit_use_double_quant=True,\n",
"        bnb_4bit_quant_type=\"nf4\",\n",
"        bnb_4bit_compute_dtype=torch.bfloat16\n",
"    )\n",
"\n",
"    # tokenization\n",
"    tokenizer = AutoTokenizer.from_pretrained(PHI3)\n",
"    tokenizer.pad_token = tokenizer.eos_token\n",
"\n",
"    # model (reloading on every call is slow; cache tokenizer and model globally if you call this often)\n",
"    model = AutoModelForCausalLM.from_pretrained(PHI3, quantization_config=quant_config, device_map=\"auto\")\n",
"\n",
"    # inputs & outputs\n",
"    messages = build_messages(usecase)\n",
"    inputs = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n",
"    model_inputs = tokenizer(inputs, return_tensors=\"pt\").to(model.device)\n",
"    # streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)\n",
"\n",
"    with torch.no_grad():\n",
"        outputs = model.generate(**model_inputs, max_new_tokens=max_tokens, do_sample=True, temperature=temperature)\n",
"\n",
"    # decode only the newly generated tokens, skipping the prompt\n",
"    response = tokenizer.decode(outputs[0][len(model_inputs['input_ids'][0]):], skip_special_tokens=True)\n",
"    return response.strip()"
]
},
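{
"cell_type": "markdown",
"metadata": {},
"source": [
"The OpenAI client was configured above but is not used by the Phi-3 path. Below is a minimal sketch of routing the same prompt to GPT-4o-mini instead; `dataset_call_openai` is a hypothetical helper name, not part of the original notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# sketch: same prompt, but against the OpenAI client configured earlier\n",
"def dataset_call_openai(usecase):\n",
"    completion = openai.chat.completions.create(\n",
"        model=OPENAI,\n",
"        messages=build_messages(usecase),\n",
"        max_tokens=max_tokens,\n",
"        temperature=temperature\n",
"    )\n",
"    return completion.choices[0].message.content.strip()"
]
},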
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "g8zEBraI0grT"
},
"outputs": [],
"source": [
"# convert a CSV string into a pandas DataFrame\n",
"\n",
"def csv_handler(csv_string):\n",
"    try:\n",
"        # Convert CSV string to DataFrame\n",
"        df = pd.read_csv(io.StringIO(csv_string))\n",
"        return df\n",
"    except Exception as e:\n",
"        # Return error message as DataFrame if parsing fails\n",
"        error_df = pd.DataFrame({\"Error\": [f\"Failed to parse CSV: {str(e)}\"]})\n",
"        return error_df"
]
},
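{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal, self-contained check of `csv_handler` on one well-formed and one malformed string (both strings are made up for illustration):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# illustrative check: the second string has more fields than headers,\n",
"# so pandas raises and csv_handler returns the error DataFrame instead\n",
"good = \"customer_id,country,age\\n1,UK,34\\n2,DE,51\"\n",
"bad = \"customer_id,country\\n1,UK,extra_field,oops\"\n",
"print(csv_handler(good))\n",
"print(csv_handler(bad))"
]
},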
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "vLPsusTL1zNB"
},
"outputs": [],
"source": [
"# use case to DataFrame\n",
"\n",
"def usecase_to_csv(usecase):\n",
"    try:\n",
"        # Get CSV string from the LLM\n",
"        csv_string = dataset_call(usecase)\n",
"\n",
"        # Process into DataFrame for Gradio display\n",
"        df = csv_handler(csv_string)\n",
"\n",
"        return df\n",
"\n",
"    except Exception as e:\n",
"        # return a single DataFrame so the output shape matches the success path\n",
"        error_df = pd.DataFrame({\"Error\": [f\"LLM processing failed: {str(e)}\"]})\n",
"        return error_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "H3WTLa9a2Rdy"
},
"outputs": [],
"source": [
"# passthrough helper: returns the CSV string if present, else an empty string\n",
"def download_csv(csv_string):\n",
"    if csv_string:\n",
"        return csv_string\n",
"    return \"\""
]
},
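{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you want an actual file rather than a passthrough string, one option is to persist the DataFrame with pandas and grab it from the Colab file browser. This is a sketch; `save_csv` and the filename are arbitrary choices, not part of the original notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# sketch: write a generated DataFrame to disk so it can be downloaded\n",
"def save_csv(df, path=\"synthetic_data.csv\"):\n",
"    df.to_csv(path, index=False)\n",
"    return path"
]
},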
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "XhMVSrVhjYvz"
},
"outputs": [],
"source": [
"# test\n",
"usecase = \"A financial services company is looking for synthetic data to test its Expected Credit Losses (ECL) model under IFRS9.\"\n",
"# dataset_call(usecase)\n",
"usecase_to_csv(usecase)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "z3Ze4o2qjs5y"
},
"outputs": [],
"source": [
"demo = gr.Interface(\n",
"    fn=usecase_to_csv,\n",
"    inputs=gr.Textbox(lines=5, label=\"Describe your use case\", placeholder=\"Describe the dataset you would like to create and how you will use it\"),\n",
"    outputs=gr.DataFrame(label=\"Here is your dataset!\", interactive=True),\n",
"    title=\"Friendly Neighbourhood Synthetic Data Creator!\",\n",
"    description=\"Let me know your use case for synthetic data and I will create it for you.\",\n",
"    examples=[\n",
"        \"Generate a dataset of 10 employees with name, department, salary, and years of experience\",\n",
"        \"Create sample e-commerce data with product names, categories, prices, and ratings\",\n",
"        \"Generate customer survey responses with demographics and satisfaction scores\",\n",
"        \"A financial services company is looking for synthetic data to test its Expected Credit Losses (ECL) model under IFRS9.\"\n",
"    ]\n",
")\n",
"\n",
"demo.launch(debug=True)"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"authorship_tag": "ABX9TyOay+EACzwO0uXDLuayhscX",
"gpuType": "L4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 4
}