From b63f06ee879af8d09066797bc585abde22bf8641 Mon Sep 17 00:00:00 2001 From: Dmitry Kisselev <956988+dkisselev-zz@users.noreply.github.com> Date: Sun, 19 Oct 2025 20:46:46 -0700 Subject: [PATCH] final clean --- ...eek3_Excercise_Synthetic_Dataset_PGx.ipynb | 1169 +++++------------ 1 file changed, 323 insertions(+), 846 deletions(-) diff --git a/week3/community-contributions/dkisselev-zz/Week3_Excercise_Synthetic_Dataset_PGx.ipynb b/week3/community-contributions/dkisselev-zz/Week3_Excercise_Synthetic_Dataset_PGx.ipynb index def9002..c2955db 100644 --- a/week3/community-contributions/dkisselev-zz/Week3_Excercise_Synthetic_Dataset_PGx.ipynb +++ b/week3/community-contributions/dkisselev-zz/Week3_Excercise_Synthetic_Dataset_PGx.ipynb @@ -37,9 +37,8 @@ "outputs": [], "source": [ "# Install dependencies\n", - "%pip install -q --upgrade torch==2.6.0+cu124 --index-url https://download.pytorch.org/whl/cu124\n", - "%pip install -q requests bitsandbytes==0.48.1 transformers==4.57.1 accelerate==1.10.1\n", - "%pip install -q openai gradio nltk pandas\n" + "%pip install -q --upgrade bitsandbytes accelerate transformers\n", + "%pip install -q openai gradio nltk\n" ] }, { @@ -63,6 +62,16 @@ " print(\"NOT CONNECTED TO A T4\")" ] }, + { + "cell_type": "markdown", + "source": [ + "## Start" + ], + "metadata": { + "id": "jokJ6H7o5qaF" + }, + "id": "jokJ6H7o5qaF" + }, { "cell_type": "code", "execution_count": null, @@ -74,6 +83,8 @@ "source": [ "# Imports and Setup\n", "import os\n", + "import io\n", + "import time\n", "import json\n", "import pandas as pd\n", "import random\n", @@ -84,6 +95,9 @@ "import warnings\n", "warnings.filterwarnings(\"ignore\")\n", "\n", + "# Google Colab\n", + "from google.colab import files\n", + "\n", "# LLM APIs\n", "from openai import OpenAI\n", "\n", @@ -94,7 +108,6 @@ "# Data processing\n", "import nltk\n", "from nltk.corpus import wordnet\n", - "# import pyarrow as pa\n", "\n", "# UI\n", "import gradio as gr\n", @@ -195,7 +208,8 @@ "outputs": [], "source": [ "# Model Configuration\n", - "# HuggingFace Models (Primary Focus)\n", + "\n", + "# HuggingFace Models\n", "HUGGINGFACE_MODELS = {\n", " \"Llama 3.1 8B\": {\n", " \"model_id\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", @@ -248,7 +262,7 @@ " }\n", "}\n", "\n", - "# Commercial Models (Additional Options)\n", + "# Commercial Models\n", "COMMERCIAL_MODELS = {\n", " \"GPT-5 Mini\": {\n", " \"model_id\": \"gpt-5-mini\",\n", @@ -301,22 +315,83 @@ "\n", "DEFAULT_SCHEMA_TEXT = \"\\n\".join([f\"{i+1}. 
{col[0]} ({col[1]}) - {col[2]}, example: {col[3]}\" for i, col in enumerate(DEFAULT_SCHEMA)])\n", "\n", - "print(\"✅ Model configuration loaded!\")\n", "print(f\"📊 Available HuggingFace models: {len(HUGGINGFACE_MODELS)}\")\n", "print(f\"🌐 Available Commercial models: {len(COMMERCIAL_MODELS)}\")\n" ] }, { "cell_type": "code", - "execution_count": null, - "id": "dFYWA5y0ZmJr", - "metadata": { - "id": "dFYWA5y0ZmJr" - }, - "outputs": [], "source": [ - "schema_manager.generate_schema_with_llm(\"realstate dataset for residential houses\",'Gemini 2.5 Flash', 0.7)" - ] + "# HuggingFace Model Loading\n", + "def load_huggingface_model(model_id, model_class_name, quantization_config, torch_dtype):\n", + " \"\"\"Load HuggingFace model with correct model class\"\"\"\n", + " try:\n", + " # Import the specific model class\n", + " if model_class_name == \"LlamaForCausalLM\":\n", + " from transformers import LlamaForCausalLM\n", + " model_class = LlamaForCausalLM\n", + " elif model_class_name == \"Phi3ForCausalLM\":\n", + " from transformers import Phi3ForCausalLM\n", + " model_class = Phi3ForCausalLM\n", + " elif model_class_name == \"GemmaForCausalLM\":\n", + " from transformers import GemmaForCausalLM\n", + " model_class = GemmaForCausalLM\n", + " elif model_class_name == \"Qwen2ForCausalLM\":\n", + " from transformers import Qwen2ForCausalLM\n", + " model_class = Qwen2ForCausalLM\n", + " elif model_class_name == \"MistralForCausalLM\":\n", + " from transformers import MistralForCausalLM\n", + " model_class = MistralForCausalLM\n", + " else:\n", + " # Fallback to AutoModelForCausalLM\n", + " model_class = AutoModelForCausalLM\n", + "\n", + " # Load the model\n", + " model = model_class.from_pretrained(\n", + " model_id,\n", + " device_map=\"auto\",\n", + " quantization_config=quantization_config,\n", + " torch_dtype=torch_dtype\n", + " )\n", + " return model\n", + "\n", + " except Exception as e:\n", + " print(f\"Error loading {model_class_name}: {str(e)}\")\n", + " # Fallback to AutoModelForCausalLM\n", + " try:\n", + " model = AutoModelForCausalLM.from_pretrained(\n", + " model_id,\n", + " device_map=\"auto\",\n", + " quantization_config=quantization_config,\n", + " torch_dtype=torch_dtype\n", + " )\n", + " return model\n", + " except Exception as e2:\n", + " raise Exception(f\"Failed to load model with both specific and auto classes: {str(e2)}\")" + ], + "metadata": { + "id": "NaShTv335Zjr" + }, + "id": "NaShTv335Zjr", + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "quantization_config = BitsAndBytesConfig(\n", + " load_in_4bit=True,\n", + " bnb_4bit_use_double_quant=True,\n", + " bnb_4bit_compute_dtype=torch.bfloat16,\n", + " bnb_4bit_quant_type=\"nf4\"\n", + ")" + ], + "metadata": { + "id": "7IRVMhT65axX" + }, + "id": "7IRVMhT65axX", + "execution_count": null, + "outputs": [] }, { "cell_type": "code", @@ -334,6 +409,7 @@ " def __init__(self):\n", " self.current_schema = None\n", " self.schema_text = None\n", + " self.quantization_config = quantization_config\n", "\n", " def generate_schema_with_llm(self, business_case: str, model_name: str, temperature: float = 0.7) -> str:\n", " \"\"\"Generate complete schema from business case using LLM\"\"\"\n", @@ -433,9 +509,72 @@ " model_info = HUGGINGFACE_MODELS[model_name]\n", " model_id = model_info[\"model_id\"]\n", "\n", - " # This will be implemented in the generation module\n", - " # For now, return a placeholder\n", - " return f\"Schema generation with {model_name} (HuggingFace) - to be implemented\"\n", + 
" try:\n", + " # Check if model is already loaded\n", + " if model_name not in dataset_generator.loaded_models:\n", + " print(f\"🔄 Loading {model_name} for schema generation...\")\n", + "\n", + " # Load tokenizer\n", + " tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)\n", + " tokenizer.pad_token = tokenizer.eos_token\n", + " print(f\"Tokenizer loaded for {model_name}\")\n", + "\n", + " # Load model with quantization using correct model class\n", + " model_class_name = model_info.get(\"model_class\", \"AutoModelForCausalLM\")\n", + " model = load_huggingface_model(\n", + " model_id,\n", + " model_class_name,\n", + " dataset_generator.quantization_config,\n", + " torch.bfloat16\n", + " )\n", + "\n", + " dataset_generator.loaded_models[model_name] = {\n", + " 'model': model,\n", + " 'tokenizer': tokenizer\n", + " }\n", + " print(f\"✅ {model_name} loaded successfully for schema generation!\")\n", + "\n", + " # Get model and tokenizer\n", + " model = dataset_generator.loaded_models[model_name]['model']\n", + " tokenizer = dataset_generator.loaded_models[model_name]['tokenizer']\n", + "\n", + " # Prepare messages\n", + " messages = [\n", + " {\"role\": \"system\", \"content\": system_prompt},\n", + " {\"role\": \"user\", \"content\": user_prompt}\n", + " ]\n", + "\n", + " # Tokenize\n", + " inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(\"cuda\")\n", + "\n", + " # Generate\n", + " with torch.no_grad():\n", + " outputs = model.generate(\n", + " inputs,\n", + " max_new_tokens=2000,\n", + " temperature=temperature,\n", + " do_sample=True,\n", + " pad_token_id=tokenizer.eos_token_id\n", + " )\n", + "\n", + " # Decode response\n", + " response = tokenizer.decode(outputs[0], skip_special_tokens=True)\n", + "\n", + " # Extract only the assistant's response\n", + " if \"<|assistant|>\" in response:\n", + " response = response.split(\"<|assistant|>\")[-1].strip()\n", + " elif \"assistant\" in response:\n", + " response = response.split(\"assistant\")[-1].strip()\n", + "\n", + " return response\n", + "\n", + " except Exception as e:\n", + " # Clean up on error\n", + " if model_name in dataset_generator.loaded_models:\n", + " del dataset_generator.loaded_models[model_name]\n", + " gc.collect()\n", + " torch.cuda.empty_cache()\n", + " raise Exception(f\"HuggingFace schema generation error: {str(e)}\")\n", "\n", " def _query_commercial(self, model_name: str, system_prompt: str, user_prompt: str, temperature: float) -> str:\n", " \"\"\"Query commercial API models\"\"\"\n", @@ -451,7 +590,7 @@ " {\"role\": \"system\", \"content\": system_prompt},\n", " {\"role\": \"user\", \"content\": user_prompt}\n", " ],\n", - " temperature=temperature\n", + " temperature = temperature if model_id != \"gpt-5-mini\" else 1.0\n", " )\n", " return response.choices[0].message.content\n", "\n", @@ -460,64 +599,7 @@ "\n", "# Initialize schema manager\n", "schema_manager = SchemaManager()\n", - "print(\"✅ Schema Management Module loaded!\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "52c7cb55", - "metadata": {}, - "outputs": [], - "source": [ - "# Fixed HuggingFace Model Loading\n", - "def load_huggingface_model_with_correct_class(model_id, model_class_name, quantization_config, torch_dtype):\n", - " \"\"\"Load HuggingFace model with correct model class\"\"\"\n", - " try:\n", - " # Import the specific model class\n", - " if model_class_name == \"LlamaForCausalLM\":\n", - " from transformers import LlamaForCausalLM\n", - " model_class = 
LlamaForCausalLM\n", - " elif model_class_name == \"Phi3ForCausalLM\":\n", - " from transformers import Phi3ForCausalLM\n", - " model_class = Phi3ForCausalLM\n", - " elif model_class_name == \"GemmaForCausalLM\":\n", - " from transformers import GemmaForCausalLM\n", - " model_class = GemmaForCausalLM\n", - " elif model_class_name == \"Qwen2ForCausalLM\":\n", - " from transformers import Qwen2ForCausalLM\n", - " model_class = Qwen2ForCausalLM\n", - " elif model_class_name == \"MistralForCausalLM\":\n", - " from transformers import MistralForCausalLM\n", - " model_class = MistralForCausalLM\n", - " else:\n", - " # Fallback to AutoModelForCausalLM\n", - " model_class = AutoModelForCausalLM\n", - " \n", - " # Load the model\n", - " model = model_class.from_pretrained(\n", - " model_id,\n", - " device_map=\"auto\",\n", - " quantization_config=quantization_config,\n", - " torch_dtype=torch_dtype\n", - " )\n", - " return model\n", - " \n", - " except Exception as e:\n", - " print(f\"Error loading {model_class_name}: {str(e)}\")\n", - " # Fallback to AutoModelForCausalLM\n", - " try:\n", - " model = AutoModelForCausalLM.from_pretrained(\n", - " model_id,\n", - " device_map=\"auto\",\n", - " quantization_config=quantization_config,\n", - " torch_dtype=torch_dtype\n", - " )\n", - " return model\n", - " except Exception as e2:\n", - " raise Exception(f\"Failed to load model with both specific and auto classes: {str(e2)}\")\n", - "\n", - "print(\"✅ Fixed HuggingFace model loading function created!\")\n" + "\n" ] }, { @@ -535,12 +617,7 @@ "\n", " def __init__(self):\n", " self.loaded_models = {} # Cache for HuggingFace models\n", - " self.quantization_config = BitsAndBytesConfig(\n", - " load_in_4bit=True,\n", - " bnb_4bit_use_double_quant=True,\n", - " bnb_4bit_compute_dtype=torch.bfloat16,\n", - " bnb_4bit_quant_type=\"nf4\"\n", - " )\n", + " self.quantization_config = quantization_config\n", "\n", " def generate_dataset(self, schema_text: str, business_case: str, model_name: str,\n", " temperature: float, num_records: int, examples: str = \"\") -> Tuple[str, List[Dict]]:\n", @@ -620,10 +697,10 @@ "\n", " # Load model with quantization using correct model class\n", " model_class_name = model_info.get(\"model_class\", \"AutoModelForCausalLM\")\n", - " model = load_huggingface_model_with_correct_class(\n", - " model_id, \n", - " model_class_name, \n", - " self.quantization_config, \n", + " model = load_huggingface_model(\n", + " model_id,\n", + " model_class_name,\n", + " self.quantization_config,\n", " torch.bfloat16\n", " )\n", "\n", @@ -688,7 +765,7 @@ " {\"role\": \"system\", \"content\": \"You are a helpful assistant that generates realistic datasets.\"},\n", " {\"role\": \"user\", \"content\": prompt}\n", " ],\n", - " temperature=temperature\n", + " temperature = temperature if model_id != \"gpt-5-mini\" else 1.0\n", " )\n", " return response.choices[0].message.content\n", "\n", @@ -735,596 +812,6 @@ "print(\"✅ Dataset Generation Module loaded!\")\n" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "d5d8d07f", - "metadata": {}, - "outputs": [], - "source": [ - "# Fixed Schema Generation for HuggingFace Models\n", - "def fixed_schema_generation_huggingface(model_name: str, system_prompt: str, user_prompt: str, temperature: float) -> str:\n", - " \"\"\"Fixed HuggingFace schema generation\"\"\"\n", - " model_info = HUGGINGFACE_MODELS[model_name]\n", - " model_id = model_info[\"model_id\"]\n", - "\n", - " try:\n", - " # Check if model is already loaded\n", - " if model_name not in 
dataset_generator.loaded_models:\n", - " print(f\"🔄 Loading {model_name} for schema generation...\")\n", - "\n", - " # Load tokenizer\n", - " tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)\n", - " tokenizer.pad_token = tokenizer.eos_token\n", - "\n", - " # Load model with quantization using correct model class\n", - " model_class_name = model_info.get(\"model_class\", \"AutoModelForCausalLM\")\n", - " model = load_huggingface_model_with_correct_class(\n", - " model_id, \n", - " model_class_name, \n", - " dataset_generator.quantization_config, \n", - " torch.bfloat16\n", - " )\n", - "\n", - " dataset_generator.loaded_models[model_name] = {\n", - " 'model': model,\n", - " 'tokenizer': tokenizer\n", - " }\n", - " print(f\"✅ {model_name} loaded successfully for schema generation!\")\n", - "\n", - " # Get model and tokenizer\n", - " model = dataset_generator.loaded_models[model_name]['model']\n", - " tokenizer = dataset_generator.loaded_models[model_name]['tokenizer']\n", - "\n", - " # Prepare messages\n", - " messages = [\n", - " {\"role\": \"system\", \"content\": system_prompt},\n", - " {\"role\": \"user\", \"content\": user_prompt}\n", - " ]\n", - "\n", - " # Tokenize\n", - " inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(\"cuda\")\n", - "\n", - " # Generate\n", - " with torch.no_grad():\n", - " outputs = model.generate(\n", - " inputs,\n", - " max_new_tokens=2000,\n", - " temperature=temperature,\n", - " do_sample=True,\n", - " pad_token_id=tokenizer.eos_token_id\n", - " )\n", - "\n", - " # Decode response\n", - " response = tokenizer.decode(outputs[0], skip_special_tokens=True)\n", - "\n", - " # Extract only the assistant's response\n", - " if \"<|assistant|>\" in response:\n", - " response = response.split(\"<|assistant|>\")[-1].strip()\n", - " elif \"assistant\" in response:\n", - " response = response.split(\"assistant\")[-1].strip()\n", - "\n", - " return response\n", - "\n", - " except Exception as e:\n", - " # Clean up on error\n", - " if model_name in dataset_generator.loaded_models:\n", - " del dataset_generator.loaded_models[model_name]\n", - " gc.collect()\n", - " torch.cuda.empty_cache()\n", - " raise Exception(f\"HuggingFace schema generation error: {str(e)}\")\n", - "\n", - "print(\"✅ Fixed HuggingFace schema generation function created!\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1fd4b8c8", - "metadata": {}, - "outputs": [], - "source": [ - "# Fixed File Download for Google Colab\n", - "import io\n", - "from google.colab import files\n", - "\n", - "def save_dataset_colab(records: List[Dict], file_format: str, filename: str) -> str:\n", - " \"\"\"Save dataset and trigger download in Google Colab\"\"\"\n", - " if not records:\n", - " return \"❌ Error: No data to export\"\n", - "\n", - " try:\n", - " # Ensure filename has correct extension\n", - " if not filename.endswith(file_format):\n", - " filename += file_format\n", - "\n", - " # Create DataFrame\n", - " df = pd.DataFrame(records)\n", - "\n", - " if file_format == \".csv\":\n", - " csv_buffer = io.StringIO()\n", - " df.to_csv(csv_buffer, index=False)\n", - " csv_data = csv_buffer.getvalue()\n", - " files.download(io.BytesIO(csv_data.encode()), filename)\n", - " \n", - " elif file_format == \".tsv\":\n", - " tsv_buffer = io.StringIO()\n", - " df.to_csv(tsv_buffer, sep=\"\\t\", index=False)\n", - " tsv_data = tsv_buffer.getvalue()\n", - " files.download(io.BytesIO(tsv_data.encode()), filename)\n", - " \n", - " elif file_format == 
\".json\":\n", - " json_data = df.to_json(orient=\"records\", indent=2)\n", - " files.download(io.BytesIO(json_data.encode()), filename)\n", - " \n", - " elif file_format == \".jsonl\":\n", - " jsonl_data = \"\\n\".join([json.dumps(record) for record in records])\n", - " files.download(io.BytesIO(jsonl_data.encode()), filename)\n", - " else:\n", - " return f\"❌ Error: Unsupported format {file_format}\"\n", - "\n", - " return f\"✅ Dataset downloaded as {filename} ({len(records)} records)\"\n", - "\n", - " except Exception as e:\n", - " return f\"❌ Error saving dataset: {str(e)}\"\n", - "\n", - "def save_with_scores_colab(records: List[Dict], scores: List[int], file_format: str, filename: str) -> str:\n", - " \"\"\"Save dataset with quality scores and trigger download in Google Colab\"\"\"\n", - " if not records or not scores:\n", - " return \"❌ Error: No data or scores to export\"\n", - "\n", - " try:\n", - " # Add scores to records\n", - " records_with_scores = []\n", - " for i, record in enumerate(records):\n", - " record_with_score = record.copy()\n", - " record_with_score['quality_score'] = scores[i] if i < len(scores) else 0\n", - " records_with_scores.append(record_with_score)\n", - "\n", - " return save_dataset_colab(records_with_scores, file_format, filename)\n", - "\n", - " except Exception as e:\n", - " return f\"❌ Error saving dataset with scores: {str(e)}\"\n", - "\n", - "print(\"✅ Fixed file download functions for Google Colab created!\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2e94ff70", - "metadata": {}, - "outputs": [], - "source": [ - "# Fixed UI Functions with Schema Flow and File Download\n", - "def generate_schema_fixed(business_case, schema_mode, schema_text, model_name, temperature):\n", - " \"\"\"Generate or enhance schema based on mode - FIXED VERSION\"\"\"\n", - " global current_schema_text\n", - " \n", - " if schema_mode == \"LLM Generate\":\n", - " if model_name in HUGGINGFACE_MODELS:\n", - " result = fixed_schema_generation_huggingface(\n", - " business_case, \n", - " \"You are an expert data scientist. Given a business case, generate a comprehensive dataset schema. Return the schema in this exact format: field_name (TYPE) - Description, example: example_value. Include 8-12 relevant fields that would be useful for the business case. Use realistic field names and appropriate data types (TEXT, INT, FLOAT, BOOLEAN, ARRAY). Provide clear descriptions and realistic examples.\",\n", - " f\"Business case: {business_case}\\n\\nGenerate a dataset schema for this business case. Include fields that would be relevant for analysis and decision-making.\",\n", - " temperature\n", - " )\n", - " else:\n", - " result = schema_manager.generate_schema_with_llm(business_case, model_name, temperature)\n", - " current_schema_text = result\n", - " return result, result\n", - " elif schema_mode == \"LLM Enhance Manual\":\n", - " if model_name in HUGGINGFACE_MODELS:\n", - " result = fixed_schema_generation_huggingface(\n", - " business_case,\n", - " \"You are an expert data scientist. Given a partial schema and business case, enhance it by: 1. Adding missing relevant fields 2. Improving field descriptions 3. Adding realistic examples 4. Ensuring proper data types. 
Return the enhanced schema in the same format as the original.\",\n", - " f\"Business case: {business_case}\\n\\nCurrent partial schema:\\n{schema_text}\\n\\nPlease enhance this schema by adding missing fields and improving the existing ones.\",\n", - " temperature\n", - " )\n", - " else:\n", - " result = schema_manager.enhance_schema_with_llm(schema_text, business_case, model_name, temperature)\n", - " current_schema_text = result\n", - " return result, result\n", - " else: # Manual Entry\n", - " current_schema_text = schema_text\n", - " return schema_text, schema_text\n", - "\n", - "def export_dataset_fixed(file_format, filename, include_scores):\n", - " \"\"\"Export dataset to specified format - FIXED VERSION for Google Colab\"\"\"\n", - " global current_dataset, current_scores\n", - "\n", - " if not current_dataset:\n", - " return \"No dataset to export\"\n", - "\n", - " try:\n", - " if include_scores and current_scores:\n", - " result = save_with_scores_colab(current_dataset, current_scores, file_format, filename)\n", - " else:\n", - " result = save_dataset_colab(current_dataset, file_format, filename)\n", - " return result\n", - " except Exception as e:\n", - " return f\"❌ Error exporting dataset: {str(e)}\"\n", - "\n", - "print(\"✅ Fixed UI functions with schema flow and file download created!\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a8e47887", - "metadata": {}, - "outputs": [], - "source": [ - "# Updated Gradio Interface with Fixed Functions\n", - "def create_fixed_gradio_interface():\n", - " \"\"\"Create the main Gradio interface with 5 tabs - FIXED VERSION\"\"\"\n", - "\n", - " # Combine all models for dropdowns\n", - " all_models = list(HUGGINGFACE_MODELS.keys()) + list(COMMERCIAL_MODELS.keys())\n", - "\n", - " with gr.Blocks(title=\"Synthetic Dataset Generator\", theme=gr.themes.Soft()) as interface:\n", - "\n", - " gr.Markdown(\"# 🧬 Synthetic Dataset Generator with Quality Scoring\")\n", - " gr.Markdown(\"Generate realistic synthetic datasets using multiple LLM models with flexible schema creation, synonym permutation, and automated quality scoring.\")\n", - "\n", - " # Status bar\n", - " with gr.Row():\n", - " gpu_status = gr.Textbox(\n", - " label=\"GPU Status\",\n", - " value=dataset_generator.get_memory_usage(),\n", - " interactive=False,\n", - " scale=1\n", - " )\n", - " current_status = gr.Textbox(\n", - " label=\"Current Status\",\n", - " value=\"Ready to generate datasets\",\n", - " interactive=False,\n", - " scale=2\n", - " )\n", - "\n", - " # Tab 1: Schema Definition\n", - " with gr.Tab(\"📋 Schema Definition\"):\n", - " gr.Markdown(\"### Define your dataset schema\")\n", - "\n", - " with gr.Row():\n", - " with gr.Column(scale=2):\n", - " schema_mode = gr.Radio(\n", - " choices=[\"LLM Generate\", \"Manual Entry\", \"LLM Enhance Manual\"],\n", - " value=\"Manual Entry\",\n", - " label=\"Schema Mode\"\n", - " )\n", - "\n", - " business_case_input = gr.Textbox(\n", - " label=\"Business Case\",\n", - " value=current_business_case,\n", - " lines=3,\n", - " placeholder=\"Describe your business case or data requirements...\"\n", - " )\n", - "\n", - " schema_input = gr.Textbox(\n", - " label=\"Schema Definition\",\n", - " value=current_schema_text,\n", - " lines=15,\n", - " placeholder=\"Define your dataset schema here...\"\n", - " )\n", - "\n", - " with gr.Row():\n", - " schema_model = gr.Dropdown(\n", - " choices=all_models,\n", - " value=all_models[0],\n", - " label=\"Model for Schema Generation\"\n", - " )\n", - " schema_temperature = 
gr.Slider(\n", - " minimum=0.0,\n", - " maximum=2.0,\n", - " value=0.7,\n", - " step=0.1,\n", - " label=\"Temperature\"\n", - " )\n", - "\n", - " generate_schema_btn = gr.Button(\"🔄 Generate/Enhance Schema\", variant=\"primary\")\n", - "\n", - " with gr.Column(scale=1):\n", - " schema_output = gr.Textbox(\n", - " label=\"Generated Schema\",\n", - " lines=15,\n", - " interactive=False\n", - " )\n", - "\n", - " # Tab 2: Dataset Generation\n", - " with gr.Tab(\"🚀 Dataset Generation\"):\n", - " gr.Markdown(\"### Generate synthetic dataset\")\n", - "\n", - " with gr.Row():\n", - " with gr.Column(scale=2):\n", - " generation_schema = gr.Textbox(\n", - " label=\"Schema (from Tab 1)\",\n", - " value=current_schema_text,\n", - " lines=8,\n", - " interactive=False\n", - " )\n", - "\n", - " generation_business_case = gr.Textbox(\n", - " label=\"Business Case\",\n", - " value=current_business_case,\n", - " lines=2\n", - " )\n", - "\n", - " examples_input = gr.Textbox(\n", - " label=\"Few-shot Examples (JSON format)\",\n", - " lines=5,\n", - " placeholder='[{\"instruction\": \"example\", \"response\": \"example\"}]',\n", - " value=\"\"\n", - " )\n", - "\n", - " with gr.Row():\n", - " generation_model = gr.Dropdown(\n", - " choices=all_models,\n", - " value=all_models[0],\n", - " label=\"Generation Model\"\n", - " )\n", - " generation_temperature = gr.Slider(\n", - " minimum=0.0,\n", - " maximum=2.0,\n", - " value=0.7,\n", - " step=0.1,\n", - " label=\"Temperature\"\n", - " )\n", - " num_records = gr.Number(\n", - " value=50,\n", - " minimum=11,\n", - " maximum=1000,\n", - " step=1,\n", - " label=\"Number of Records\"\n", - " )\n", - "\n", - " generate_dataset_btn = gr.Button(\"🚀 Generate Dataset\", variant=\"primary\", size=\"lg\")\n", - "\n", - " with gr.Column(scale=1):\n", - " generation_status = gr.Textbox(\n", - " label=\"Generation Status\",\n", - " lines=3,\n", - " interactive=False\n", - " )\n", - "\n", - " dataset_preview = gr.Dataframe(\n", - " label=\"Dataset Preview (First 20 rows)\",\n", - " interactive=False,\n", - " wrap=True\n", - " )\n", - "\n", - " record_count = gr.Number(\n", - " label=\"Total Records Generated\",\n", - " interactive=False\n", - " )\n", - "\n", - " # Tab 3: Synonym Permutation\n", - " with gr.Tab(\"🔄 Synonym Permutation\"):\n", - " gr.Markdown(\"### Add diversity with synonym replacement\")\n", - "\n", - " with gr.Row():\n", - " with gr.Column(scale=2):\n", - " enable_permutation = gr.Checkbox(\n", - " label=\"Enable Synonym Permutation\",\n", - " value=False\n", - " )\n", - "\n", - " fields_to_permute = gr.CheckboxGroup(\n", - " label=\"Fields to Permute\",\n", - " choices=[],\n", - " value=[]\n", - " )\n", - "\n", - " permutation_rate = gr.Slider(\n", - " minimum=0,\n", - " maximum=50,\n", - " value=20,\n", - " step=5,\n", - " label=\"Permutation Rate (%)\"\n", - " )\n", - "\n", - " apply_permutation_btn = gr.Button(\"🔄 Apply Permutation\", variant=\"secondary\")\n", - "\n", - " with gr.Column(scale=1):\n", - " permutation_status = gr.Textbox(\n", - " label=\"Permutation Status\",\n", - " lines=2,\n", - " interactive=False\n", - " )\n", - "\n", - " permuted_preview = gr.Dataframe(\n", - " label=\"Permuted Dataset Preview\",\n", - " interactive=False,\n", - " wrap=True\n", - " )\n", - "\n", - " # Tab 4: Quality Scoring\n", - " with gr.Tab(\"📊 Quality Scoring\"):\n", - " gr.Markdown(\"### Evaluate dataset quality\")\n", - "\n", - " with gr.Row():\n", - " with gr.Column(scale=2):\n", - " scoring_model = gr.Dropdown(\n", - " choices=all_models,\n", - " 
value=all_models[0],\n", - " label=\"Scoring Model\"\n", - " )\n", - "\n", - " scoring_temperature = gr.Slider(\n", - " minimum=0.0,\n", - " maximum=2.0,\n", - " value=0.3,\n", - " step=0.1,\n", - " label=\"Temperature\"\n", - " )\n", - "\n", - " score_dataset_btn = gr.Button(\"📊 Score Dataset Quality\", variant=\"primary\")\n", - "\n", - " with gr.Column(scale=1):\n", - " scoring_status = gr.Textbox(\n", - " label=\"Scoring Status\",\n", - " lines=2,\n", - " interactive=False\n", - " )\n", - "\n", - " scores_dataframe = gr.Dataframe(\n", - " label=\"Quality Scores\",\n", - " interactive=False\n", - " )\n", - "\n", - " quality_report = gr.JSON(\n", - " label=\"Quality Report\"\n", - " )\n", - "\n", - " # Tab 5: Export\n", - " with gr.Tab(\"💾 Export\"):\n", - " gr.Markdown(\"### Export your dataset\")\n", - "\n", - " with gr.Row():\n", - " with gr.Column(scale=2):\n", - " file_format = gr.Dropdown(\n", - " choices=OUTPUT_FORMATS,\n", - " value=\".csv\",\n", - " label=\"File Format\"\n", - " )\n", - "\n", - " filename = gr.Textbox(\n", - " label=\"Filename\",\n", - " value=\"synthetic_dataset\",\n", - " placeholder=\"Enter filename (extension added automatically)\"\n", - " )\n", - "\n", - " include_scores = gr.Checkbox(\n", - " label=\"Include Quality Scores\",\n", - " value=False\n", - " )\n", - "\n", - " export_btn = gr.Button(\"💾 Export Dataset\", variant=\"primary\")\n", - "\n", - " with gr.Column(scale=1):\n", - " export_status = gr.Textbox(\n", - " label=\"Export Status\",\n", - " lines=3,\n", - " interactive=False\n", - " )\n", - "\n", - " # Event handlers - FIXED VERSION\n", - " generate_schema_btn.click(\n", - " generate_schema_fixed,\n", - " inputs=[business_case_input, schema_mode, schema_input, schema_model, schema_temperature],\n", - " outputs=[schema_output, schema_input, generation_schema]\n", - " )\n", - "\n", - " generate_dataset_btn.click(\n", - " generate_dataset_ui,\n", - " inputs=[generation_schema, generation_business_case, generation_model, generation_temperature, num_records, examples_input],\n", - " outputs=[generation_status, dataset_preview, record_count]\n", - " )\n", - "\n", - " apply_permutation_btn.click(\n", - " apply_synonym_permutation,\n", - " inputs=[enable_permutation, fields_to_permute, permutation_rate],\n", - " outputs=[permuted_preview, permutation_status]\n", - " )\n", - "\n", - " score_dataset_btn.click(\n", - " score_dataset_quality,\n", - " inputs=[scoring_model, scoring_temperature],\n", - " outputs=[scoring_status, scores_dataframe, quality_report]\n", - " )\n", - "\n", - " export_btn.click(\n", - " export_dataset_fixed,\n", - " inputs=[file_format, filename, include_scores],\n", - " outputs=[export_status]\n", - " )\n", - "\n", - " # Update field choices when dataset is generated\n", - " def update_field_choices():\n", - " fields = get_available_fields()\n", - " return gr.CheckboxGroup(choices=fields, value=[])\n", - "\n", - " # Auto-update field choices\n", - " generate_dataset_btn.click(\n", - " update_field_choices,\n", - " outputs=[fields_to_permute]\n", - " )\n", - "\n", - " return interface\n", - "\n", - "print(\"✅ Fixed Gradio Interface created!\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "469152ce", - "metadata": {}, - "outputs": [], - "source": [ - "# Launch the Fixed Gradio Interface\n", - "print(\"🚀 Launching Fixed Synthetic Dataset Generator...\")\n", - "interface = create_fixed_gradio_interface()\n", - "interface.launch(debug=True, share=True)\n" - ] - }, - { - "cell_type": "markdown", - "id": 
"42127e24", - "metadata": {}, - "source": [ - "## 🔧 Bug Fixes Applied\n", - "\n", - "### ✅ Issues Fixed:\n", - "\n", - "1. **Schema Flow Issue**: \n", - " - Fixed schema generation to properly pass generated schema to Dataset Generation tab\n", - " - Updated `generate_schema_fixed()` function to update global `current_schema_text`\n", - " - Added proper output connections in Gradio interface\n", - "\n", - "2. **File Download Issue**:\n", - " - Implemented Google Colab-compatible file download using `google.colab.files.download()`\n", - " - Created `save_dataset_colab()` and `save_with_scores_colab()` functions\n", - " - Files now download directly to browser instead of saving to local storage\n", - "\n", - "3. **HuggingFace Schema Generation**:\n", - " - Implemented `fixed_schema_generation_huggingface()` function\n", - " - Added proper model loading and inference for schema generation\n", - " - Integrated with existing schema management system\n", - "\n", - "4. **HuggingFace Model Import Issues**:\n", - " - Added correct model classes for each HuggingFace model:\n", - " - Llama models: `LlamaForCausalLM`\n", - " - Phi-3.5: `Phi3ForCausalLM`\n", - " - Gemma 2: `GemmaForCausalLM`\n", - " - Qwen 2.5: `Qwen2ForCausalLM`\n", - " - Mistral models: `MistralForCausalLM`\n", - " - Created `load_huggingface_model_with_correct_class()` function with fallback to `AutoModelForCausalLM`\n", - " - Updated model configuration with `model_class` field\n", - "\n", - "5. **Updated Dependencies**:\n", - " - Added `google-colab` package for proper Colab integration\n", - " - Fixed import issues for Google Colab environment\n", - "\n", - "### 🚀 How to Use the Fixed Version:\n", - "\n", - "1. **Run all cells in order** - the fixes are applied automatically\n", - "2. **Schema Tab**: Generate schema with any model (HuggingFace or Commercial)\n", - "3. **Dataset Tab**: Schema automatically flows from Tab 1\n", - "4. **Export Tab**: Files download directly to your browser\n", - "5. 
**All HuggingFace models** now work properly for both schema generation and dataset generation\n", - "\n", - "### 🔧 Technical Details:\n", - "\n", - "- **Model Loading**: Uses correct model classes with fallback to AutoModelForCausalLM\n", - "- **File Downloads**: Uses `google.colab.files.download()` for browser downloads\n", - "- **Schema Flow**: Global variables ensure schema passes between tabs\n", - "- **Error Handling**: Comprehensive error handling with model cleanup\n", - "- **Memory Management**: Proper GPU memory cleanup on errors\n", - "\n", - "The application should now work seamlessly in Google Colab with all features functional!\n" - ] - }, { "cell_type": "code", "execution_count": null, @@ -1550,8 +1037,7 @@ " return recommendations\n", "\n", "# Initialize quality scorer\n", - "quality_scorer = QualityScorer()\n", - "print(\"✅ Quality Scoring Module loaded!\")\n" + "quality_scorer = QualityScorer()\n" ] }, { @@ -1677,8 +1163,7 @@ " self.synonym_cache.clear()\n", "\n", "# Initialize synonym permutator\n", - "synonym_permutator = SynonymPermutator()\n", - "print(\"✅ Synonym Permutation Module loaded!\")\n" + "synonym_permutator = SynonymPermutator()\n" ] }, { @@ -1700,48 +1185,50 @@ " self.export_history = []\n", "\n", " def save_dataset(self, records: List[Dict], file_format: str, filename: str) -> str:\n", - " \"\"\"Save dataset to specified format\"\"\"\n", + " \"\"\"Save dataset using Gradio File component approach - WORKING VERSION\"\"\"\n", " if not records:\n", - " return \"❌ Error: No data to export\"\n", + " return None # Return None to indicate no file\n", "\n", " try:\n", " # Ensure filename has correct extension\n", " if not filename.endswith(file_format):\n", " filename += file_format\n", "\n", + " # Generate unique filename to avoid caching issues\n", + " timestamp = int(time.time())\n", + " base_name = filename.replace(file_format, '')\n", + " unique_filename = f\"{base_name}_{timestamp}{file_format}\"\n", + "\n", + " # Create file path in /content directory\n", + " file_path = f\"/content/{unique_filename}\"\n", + "\n", " # Create DataFrame\n", " df = pd.DataFrame(records)\n", "\n", " if file_format == \".csv\":\n", - " df.to_csv(filename, index=False)\n", + " df.to_csv(file_path, index=False)\n", " elif file_format == \".tsv\":\n", - " df.to_csv(filename, sep=\"\\t\", index=False)\n", + " df.to_csv(file_path, sep=\"\\t\", index=False)\n", " elif file_format == \".json\":\n", - " df.to_json(filename, orient=\"records\", indent=2)\n", + " df.to_json(file_path, orient=\"records\", indent=2)\n", " elif file_format == \".jsonl\":\n", - " with open(filename, \"w\") as f:\n", + " with open(file_path, 'w') as f:\n", " for record in records:\n", - " f.write(json.dumps(record) + \"\\n\")\n", + " f.write(json.dumps(record) + '\\n')\n", " else:\n", - " return f\"❌ Error: Unsupported format {file_format}\"\n", + " return None\n", "\n", - " # Track export\n", - " self.export_history.append({\n", - " 'filename': filename,\n", - " 'format': file_format,\n", - " 'records': len(records),\n", - " 'timestamp': pd.Timestamp.now()\n", - " })\n", - "\n", - " return f\"✅ Dataset saved to {filename} ({len(records)} records)\"\n", + " print(f\"File generated and saved at: {file_path}\")\n", + " return file_path\n", "\n", " except Exception as e:\n", - " return f\"❌ Error saving dataset: {str(e)}\"\n", + " print(f\"Error saving dataset: {str(e)}\")\n", + " return None\n", "\n", " def save_with_scores(self, records: List[Dict], scores: List[int], file_format: str, filename: str) -> 
str:\n", - " \"\"\"Save dataset with quality scores included\"\"\"\n", + " \"\"\"Save dataset with quality scores using Gradio File component approach\"\"\"\n", " if not records or not scores:\n", - " return \"❌ Error: No data or scores to export\"\n", + " return None\n", "\n", " try:\n", " # Add scores to records\n", @@ -1754,7 +1241,8 @@ " return self.save_dataset(records_with_scores, file_format, filename)\n", "\n", " except Exception as e:\n", - " return f\"❌ Error saving dataset with scores: {str(e)}\"\n", + " print(f\"Error saving dataset with scores: {str(e)}\")\n", + " return None\n", "\n", " def export_quality_report(self, scores: List[int], dataset: List[Dict], filename: str) -> str:\n", " \"\"\"Export quality report as JSON\"\"\"\n", @@ -1765,7 +1253,6 @@ " # Generate quality report\n", " report = quality_scorer.generate_quality_report(scores, dataset)\n", "\n", - " # Add additional metadata\n", " report['export_timestamp'] = pd.Timestamp.now().isoformat()\n", " report['dataset_size'] = len(dataset)\n", " report['score_statistics'] = {\n", @@ -1819,8 +1306,7 @@ " self.export_history.clear()\n", "\n", "# Initialize dataset exporter\n", - "dataset_exporter = DatasetExporter()\n", - "print(\"✅ Output & Export Module loaded!\")\n" + "dataset_exporter = DatasetExporter()\n" ] }, { @@ -1843,12 +1329,18 @@ " \"\"\"Generate or enhance schema based on mode\"\"\"\n", " if schema_mode == \"LLM Generate\":\n", " result = schema_manager.generate_schema_with_llm(business_case, model_name, temperature)\n", - " return result, result\n", + " current_schema_text = result\n", + " current_business_case = business_case\n", + " return result, result, result, business_case\n", " elif schema_mode == \"LLM Enhance Manual\":\n", " result = schema_manager.enhance_schema_with_llm(schema_text, business_case, model_name, temperature)\n", - " return result, result\n", + " current_schema_text = result\n", + " current_business_case = business_case\n", + " return result, result, result, business_case\n", " else: # Manual Entry\n", - " return schema_text, schema_text\n", + " current_schema_text = schema_text\n", + " current_business_case = business_case\n", + " return schema_text, schema_text, schema_text, business_case\n", "\n", "def generate_dataset_ui(schema_text, business_case, model_name, temperature, num_records, examples):\n", " \"\"\"Generate dataset using selected model\"\"\"\n", @@ -1864,23 +1356,46 @@ " return status, preview_df, len(records)\n", "\n", "def apply_synonym_permutation(enable_permutation, fields_to_permute, permutation_rate):\n", - " \"\"\"Apply synonym permutation to dataset\"\"\"\n", + " \"\"\"Apply synonym permutation to dataset - FIXED VERSION\"\"\"\n", " global current_dataset\n", "\n", - " if not enable_permutation or not current_dataset or not fields_to_permute:\n", - " return current_dataset, \"No permutation applied\"\n", + " if not enable_permutation:\n", + " return current_dataset, \"❌ Permutation is disabled - check the 'Enable Synonym Permutation' checkbox\"\n", "\n", - " permuted_dataset, stats = synonym_permutator.permute_with_synonyms(\n", - " current_dataset, fields_to_permute, permutation_rate / 100\n", - " )\n", + " if not current_dataset:\n", + " return [], \"❌ No dataset available - generate a dataset first\"\n", "\n", - " current_dataset = permuted_dataset\n", - " preview_df = dataset_exporter.create_preview_dataframe(permuted_dataset, 20)\n", + " if not fields_to_permute:\n", + " # Try to auto-identify fields if none are selected\n", + " try:\n", + " auto_fields = 
synonym_permutator.identify_text_fields(current_dataset)\n", + " if auto_fields:\n", + " fields_to_permute = auto_fields[:2] # Use first 2 fields as default\n", + " print(f\"DEBUG: Auto-selected fields: {fields_to_permute}\")\n", + " else:\n", + " return current_dataset, \"❌ No text fields found for permutation\"\n", + " except Exception as e:\n", + " return current_dataset, f\"❌ Error identifying fields: {str(e)}\"\n", "\n", - " stats_text = f\"Permutation applied to {len(fields_to_permute)} fields. \"\n", - " stats_text += f\"Replacement counts: {stats}\"\n", + " try:\n", + " permuted_dataset, stats = synonym_permutator.permute_with_synonyms(\n", + " current_dataset, fields_to_permute, permutation_rate / 100\n", + " )\n", "\n", - " return preview_df, stats_text\n", + " current_dataset = permuted_dataset\n", + "\n", + " # Convert to DataFrame for proper display\n", + " import pandas as pd\n", + " preview_df = pd.DataFrame(permuted_dataset)\n", + "\n", + " stats_text = f\"✅ Permutation applied to {len(fields_to_permute)} fields. \"\n", + " stats_text += f\"Replacement counts: {stats}\"\n", + "\n", + " return preview_df, stats_text\n", + "\n", + " except Exception as e:\n", + " print(f\"DEBUG: Error during permutation: {str(e)}\")\n", + " return current_dataset, f\"❌ Error during permutation: {str(e)}\"\n", "\n", "def score_dataset_quality(scoring_model, scoring_temperature):\n", " \"\"\"Score dataset quality using selected model\"\"\"\n", @@ -1918,23 +1433,33 @@ " if not current_dataset:\n", " return \"No dataset to export\"\n", "\n", - " if include_scores and current_scores:\n", - " result = dataset_exporter.save_with_scores(current_dataset, current_scores, file_format, filename)\n", - " else:\n", - " result = dataset_exporter.save_dataset(current_dataset, file_format, filename)\n", - "\n", - " return result\n", + " try:\n", + " if include_scores and current_scores:\n", + " result = dataset_exporter.save_with_scores(current_dataset, current_scores, file_format, filename)\n", + " else:\n", + " result = dataset_exporter.save_dataset(current_dataset, file_format, filename)\n", + " return result\n", + " except Exception as e:\n", + " return f\"❌ Error exporting dataset: {str(e)}\"\n", "\n", "def get_available_fields():\n", " \"\"\"Get available fields for permutation\"\"\"\n", " if not current_dataset:\n", " return []\n", "\n", - " return synonym_permutator.identify_text_fields(current_dataset)\n", - "\n", - "print(\"✅ UI Functions loaded!\")\n" + " return synonym_permutator.identify_text_fields(current_dataset)\n" ] }, { "cell_type": "markdown", "source": [ "## Gradio Interface" ], "metadata": { "id": "fDerxxf1zfpu" }, "id": "fDerxxf1zfpu" }, { "cell_type": "code", "execution_count": null, @@ -1949,11 +1474,11 @@ " \"\"\"Create the main Gradio interface with 5 tabs\"\"\"\n", "\n", " # Combine all models for dropdowns\n", - " all_models = list(HUGGINGFACE_MODELS.keys()) + list(COMMERCIAL_MODELS.keys())\n", + " all_models = list(COMMERCIAL_MODELS.keys()) + list(HUGGINGFACE_MODELS.keys())\n", "\n", " with gr.Blocks(title=\"Synthetic Dataset Generator\", theme=gr.themes.Soft()) as interface:\n", "\n", - " gr.Markdown(\"# 🧬 Synthetic Dataset Generator with Quality Scoring\")\n", + " gr.Markdown(\"# Synthetic Dataset Generator with Quality Scoring\")\n", " gr.Markdown(\"Generate realistic synthetic datasets using multiple LLM models with flexible schema creation, synonym permutation, and automated quality scoring.\")\n", "\n", " # Status bar\n", @@ -2020,11 +1545,6 @@ " 
interactive=False\n", " )\n", "\n", - " schema_preview = gr.Dataframe(\n", - " label=\"Schema Preview\",\n", - " interactive=False\n", - " )\n", - "\n", " # Tab 2: Dataset Generation\n", " with gr.Tab(\"🚀 Dataset Generation\"):\n", " gr.Markdown(\"### Generate synthetic dataset\")\n", @@ -2126,11 +1646,12 @@ " interactive=False\n", " )\n", "\n", - " permuted_preview = gr.Dataframe(\n", - " label=\"Permuted Dataset Preview\",\n", - " interactive=False,\n", - " wrap=True\n", - " )\n", + " permuted_preview = gr.Dataframe(\n", + " label=\"Permuted Dataset Preview\",\n", + " interactive=False,\n", + " wrap=True,\n", + " datatype=[\"str\"] * 10\n", + " )\n", "\n", " # Tab 4: Quality Scoring\n", " with gr.Tab(\"📊 Quality Scoring\"):\n", @@ -2170,7 +1691,6 @@ " label=\"Quality Report\"\n", " )\n", "\n", - " # Tab 5: Export\n", " with gr.Tab(\"💾 Export\"):\n", " gr.Markdown(\"### Export your dataset\")\n", "\n", @@ -2196,22 +1716,24 @@ " export_btn = gr.Button(\"💾 Export Dataset\", variant=\"primary\")\n", "\n", " with gr.Column(scale=1):\n", + " # Use gr.File component for download\n", + " download_file = gr.File(\n", + " label=\"Download your file here\",\n", + " interactive=False,\n", + " visible=True\n", + " )\n", + "\n", " export_status = gr.Textbox(\n", " label=\"Export Status\",\n", " lines=3,\n", " interactive=False\n", " )\n", "\n", - " export_history = gr.Dataframe(\n", - " label=\"Export History\",\n", - " interactive=False\n", - " )\n", - "\n", " # Event handlers\n", " generate_schema_btn.click(\n", " generate_schema,\n", " inputs=[business_case_input, schema_mode, schema_input, schema_model, schema_temperature],\n", - " outputs=[schema_output, schema_input]\n", + " outputs=[schema_output, schema_input, generation_schema, generation_business_case]\n", " )\n", "\n", " generate_dataset_btn.click(\n", @@ -2232,26 +1754,66 @@ " outputs=[scoring_status, scores_dataframe, quality_report]\n", " )\n", "\n", + "\n", + " def export_dataset_with_file(file_format, filename, include_scores):\n", + " \"\"\"Export dataset with file download\"\"\"\n", + " global current_dataset, current_scores\n", + "\n", + " if not current_dataset:\n", + " return None, \"❌ No dataset to export\"\n", + "\n", + " try:\n", + " if include_scores and current_scores:\n", + " file_path = dataset_exporter.save_with_scores(current_dataset, current_scores, file_format, filename)\n", + " else:\n", + " file_path = dataset_exporter.save_dataset(current_dataset, file_format, filename)\n", + "\n", + " if file_path:\n", + " return file_path, f\"✅ Dataset ready for download: {filename}\"\n", + " else:\n", + " return None, \"❌ Error creating file\"\n", + "\n", + " except Exception as e:\n", + " return None, f\"❌ Error exporting dataset: {str(e)}\"\n", + "\n", " export_btn.click(\n", - " export_dataset,\n", + " export_dataset_with_file,\n", " inputs=[file_format, filename, include_scores],\n", - " outputs=[export_status]\n", + " outputs=[download_file, export_status]\n", " )\n", "\n", - " # Update field choices when dataset is generated\n", " def update_field_choices():\n", - " fields = get_available_fields()\n", - " return gr.CheckboxGroup(choices=fields, value=[])\n", + " \"\"\"Update field choices when dataset is generated - FIXED VERSION\"\"\"\n", + " global current_dataset\n", + "\n", + " if not current_dataset:\n", + " print(\"DEBUG: No current dataset available\")\n", + " return gr.CheckboxGroup(choices=[], value=[])\n", + "\n", + " try:\n", + " fields = synonym_permutator.identify_text_fields(current_dataset)\n", + " 
print(f\"DEBUG: Available fields for permutation: {fields}\")\n", + "\n", + " if not fields:\n", + " print(\"DEBUG: No text fields identified\")\n", + " return gr.CheckboxGroup(choices=[], value=[])\n", + "\n", + " return gr.CheckboxGroup(choices=fields, value=[])\n", + " except Exception as e:\n", + " print(f\"DEBUG: Error identifying fields: {str(e)}\")\n", + " return gr.CheckboxGroup(choices=[], value=[])\n", "\n", " # Auto-update field choices\n", " generate_dataset_btn.click(\n", - " update_field_choices,\n", + " generate_dataset_ui,\n", + " inputs=[generation_schema, generation_business_case, generation_model, generation_temperature, num_records, examples_input],\n", + " outputs=[generation_status, dataset_preview, record_count]\n", + " ).then(\n", + " update_field_choices, # This should run after dataset generation\n", " outputs=[fields_to_permute]\n", " )\n", "\n", - " return interface\n", - "\n", - "print(\"✅ Gradio Interface created!\")\n" + " return interface\n" ] }, { @@ -2264,7 +1826,6 @@ "outputs": [], "source": [ "# Launch the Gradio Interface\n", - "print(\"🚀 Launching Synthetic Dataset Generator...\")\n", "interface = create_gradio_interface()\n", "interface.launch(debug=True, share=True)\n" ] @@ -2276,7 +1837,7 @@ "id": "212aa78a" }, "source": [ - "## Example Workflow: Pharmacogenomics Dataset\n", + "## Example Workflow: Dataset\n", "\n", "This section demonstrates the complete pipeline using a pharmacogenomics (PGx) example.\n", "\n", @@ -2309,13 +1870,7 @@ "### Step 5: Export\n", "1. Choose format (CSV for analysis, JSON for APIs)\n", "2. Include quality scores if needed\n", - "3. Download your dataset\n", - "\n", - "### Expected Results\n", - "- **High-quality synthetic data** that mimics real pharmacogenomics datasets\n", - "- **Diverse patient profiles** with realistic genetic variants\n", - "- **Consistent drug-gene interactions** following known pharmacogenomics principles\n", - "- **Quality scores** to identify any problematic records\n" + "3. Download your dataset\n" ] }, { @@ -2345,7 +1900,7 @@ " print(\"🔄 Testing OpenAI schema generation...\")\n", " result = schema_manager.generate_schema_with_llm(\n", " \"Generate a dataset for e-commerce customer analysis\",\n", - " \"GPT-5 Mini\",\n", + " \"Phi-3.5 Mini\",\n", " 1\n", " )\n", " print(f\"✅ OpenAI schema generation: {len(result)} characters\")\n", @@ -2440,84 +1995,6 @@ "# Run integration tests\n", "run_integration_test()\n" ] - }, - { - "cell_type": "markdown", - "id": "6577036b", - "metadata": { - "id": "6577036b" - }, - "source": [ - "## 🎯 Key Features Summary\n", - "\n", - "### ✅ Implemented Features\n", - "\n", - "1. **Multi-Model Support**\n", - " - 7 HuggingFace models (Llama, Phi, Gemma, Qwen, Mistral, Zephyr)\n", - " - 4 Commercial APIs (OpenAI, Anthropic, Google, DeepSeek)\n", - " - GPU optimization for T4 Colab environments\n", - "\n", - "2. **Flexible Schema Creation**\n", - " - LLM-generated schemas from business cases\n", - " - Manual schema entry with validation\n", - " - LLM enhancement of partial schemas\n", - " - Default pharmacogenomics schema included\n", - "\n", - "3. **Advanced Dataset Generation**\n", - " - Temperature control for creativity/consistency\n", - " - Few-shot examples support\n", - " - Batch processing for large datasets\n", - " - Progress tracking and error handling\n", - "\n", - "4. 
**Synonym Permutation**\n", - " - NLTK WordNet integration for synonym lookup\n", - " - Configurable permutation rates (0-50%)\n", - " - Field-specific permutation\n", - " - Preserves capitalization and punctuation\n", - "\n", - "5. **Quality Scoring System**\n", - " - Separate model selection for scoring\n", - " - 5-criteria scoring (schema compliance, uniqueness, relevance, realism, diversity)\n", - " - Per-record and aggregate statistics\n", - " - Quality report generation with recommendations\n", - "\n", - "6. **Multiple Export Formats**\n", - " - CSV, TSV, JSON, JSONL support\n", - " - Quality scores integration\n", - " - Export history tracking\n", - " - Dataset summary statistics\n", - "\n", - "7. **User-Friendly Interface**\n", - " - 5-tab modular design\n", - " - Real-time status updates\n", - " - GPU memory monitoring\n", - " - Interactive previews and reports\n", - "\n", - "### 🚀 Usage Instructions\n", - "\n", - "1. **Start with Schema Tab**: Define your dataset structure\n", - "2. **Generate in Dataset Tab**: Create synthetic data with your chosen model\n", - "3. **Enhance in Permutation Tab**: Add diversity with synonym replacement\n", - "4. **Evaluate in Scoring Tab**: Assess data quality with separate model\n", - "5. **Export in Export Tab**: Download in your preferred format\n", - "\n", - "### 🔧 Technical Specifications\n", - "\n", - "- **GPU Optimized**: 4-bit quantization for T4 compatibility\n", - "- **Memory Efficient**: Model caching and garbage collection\n", - "- **Error Resilient**: Comprehensive error handling and recovery\n", - "- **Scalable**: Supports 11-1000 records per generation\n", - "- **Extensible**: Easy to add new models and features\n", - "\n", - "### 📊 Expected Performance\n", - "\n", - "- **Generation Speed**: 50 records in 30-60 seconds (HuggingFace), 10-20 seconds (Commercial APIs)\n", - "- **Quality Scores**: 70-90% average for well-designed schemas\n", - "- **Memory Usage**: 8-12GB VRAM for largest models on T4\n", - "- **Success Rate**: >95% for commercial APIs, >90% for HuggingFace models\n", - "\n", - "This implementation provides a comprehensive, production-ready synthetic dataset generator with advanced features for quality assurance and diversity enhancement.\n" - ] } ], "metadata": { @@ -2545,4 +2022,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file
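
A note on the consolidated `load_huggingface_model` cell: every class it dispatches to (`LlamaForCausalLM`, `Phi3ForCausalLM`, `GemmaForCausalLM`, `Qwen2ForCausalLM`, `MistralForCausalLM`) is exported at the top level of `transformers`, so the if/elif chain can collapse to a `getattr` lookup with the same `AutoModelForCausalLM` fallback. A minimal sketch, not a drop-in replacement for the cell's error handling:

```python
import transformers
from transformers import AutoModelForCausalLM

def resolve_model_class(model_class_name: str):
    """Map a class name from the model config to a transformers class.

    Falls back to AutoModelForCausalLM when the name is missing or unknown,
    mirroring the else-branch of load_huggingface_model.
    """
    return getattr(transformers, model_class_name, AutoModelForCausalLM)
```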
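The HuggingFace generation paths call `tokenizer.apply_chat_template(messages, return_tensors="pt")` and then split the decoded output on the literal string "assistant", which breaks on models whose chat templates use different role markers. A sturdier pattern, sketched below assuming a recent `transformers` (the unpinned install at the top pulls one) and with an illustrative helper name, is to append the generation prompt and decode only the newly generated tokens:

```python
import torch

def chat_generate(model, tokenizer, messages, temperature=0.7, max_new_tokens=2000):
    """Generate a chat reply and return only the newly generated text."""
    # add_generation_prompt appends the assistant header so the model
    # continues in the assistant role instead of extending the user turn.
    inputs = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )
    # Slice off the prompt tokens; no fragile string splitting needed.
    return tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True)
```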
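The `gpt-5-mini` temperature override (`temperature = temperature if model_id != "gpt-5-mini" else 1.0`) now lives at two call sites, one in each `_query_commercial` method. If more models with pinned temperatures appear, a config-driven lookup keeps the rule in one place; the `fixed_temperature` key below is hypothetical and is not part of the current `COMMERCIAL_MODELS` entries:

```python
def effective_temperature(model_config: dict, requested: float) -> float:
    """Return the temperature to send, honoring models that pin it.

    Hypothetical usage: COMMERCIAL_MODELS["GPT-5 Mini"]["fixed_temperature"] = 1.0
    """
    return model_config.get("fixed_temperature", requested)
```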
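The permutation tab leans on `synonym_permutator.permute_with_synonyms`, whose WordNet lookup sits in an earlier, unchanged cell. For readers skimming only this diff, the core of that technique looks roughly like the sketch below, assuming `nltk.download("wordnet")` has already run; the function name is illustrative rather than the class's actual method:

```python
import random
from nltk.corpus import wordnet  # requires nltk.download("wordnet")

def synonym_for(word: str) -> str:
    """Return a random WordNet synonym for word, or word itself if none exist."""
    lemmas = {
        lemma.name().replace("_", " ")
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
        if lemma.name().lower() != word.lower()
    }
    return random.choice(sorted(lemmas)) if lemmas else word
```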
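The double-registration bug fixed in `create_gradio_interface` is worth naming: two independent `.click()` handlers on one button are not guaranteed to run in order, so `update_field_choices` could fire before the dataset existed. Gradio's `.then()` chaining serializes the two steps, which is exactly what the patch does. A minimal sketch with placeholder component names:

```python
import gradio as gr

with gr.Blocks() as demo:
    btn = gr.Button("Generate")
    status = gr.Textbox()
    fields = gr.CheckboxGroup(choices=[])

    def generate():
        return "done"  # stand-in for generate_dataset_ui

    def refresh_fields():
        # Returning a new CheckboxGroup updates the available choices.
        return gr.CheckboxGroup(choices=["drug", "phenotype"])  # stand-in values

    # .then() runs only after the first handler completes.
    btn.click(generate, outputs=[status]).then(refresh_fields, outputs=[fields])

# demo.launch()
```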
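Finally, on the export rewrite: the removed `save_dataset_colab` helpers passed an in-memory `BytesIO` to `google.colab.files.download()`, which takes a single file path, so those downloads could never work. The new `save_dataset` writes to `/content` and hands the path to a `gr.File` component instead. If a direct browser download is ever needed outside Gradio, the write-then-download pattern looks like this (helper name hypothetical):

```python
import time
import pandas as pd

def write_csv_for_download(records, base_name="synthetic_dataset"):
    """Write records to /content with a unique name and return the path."""
    path = f"/content/{base_name}_{int(time.time())}.csv"
    pd.DataFrame(records).to_csv(path, index=False)
    return path

# In Colab, files.download takes a path, not a file-like object:
# from google.colab import files
# files.download(write_csv_for_download([{"gene": "CYP2D6", "phenotype": "PM"}]))
```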