Modify readme and app

This commit is contained in:
Jsrodrigue
2025-10-23 23:40:53 +01:00
parent 101b0baf62
commit 9c5d5fb99e
2 changed files with 139 additions and 105 deletions

View File

@@ -1,6 +1,8 @@
# Synthetic Data Generator
**NOTE:** This is a copy of the repository https://github.com/Jsrodrigue/synthetic-data-creator.
# Synthetic Data Generator
An intelligent synthetic data generator that uses OpenAI models to create realistic tabular datasets based on reference data. This project includes an intuitive web interface built with Gradio.
> **🎓 Educational Project**: This project was inspired by the highly regarded LLM Engineering course on Udemy: [LLM Engineering: Master AI and Large Language Models](https://www.udemy.com/course/llm-engineering-master-ai-and-large-language-models/learn/lecture/52941433#questions/23828099). It demonstrates practical applications of LLM engineering principles, prompt engineering, and synthetic data generation techniques.
@@ -61,7 +63,7 @@ An intelligent synthetic data generator that uses OpenAI models to create realis
- Python 3.12+
- OpenAI account with API key
### Installation with pip
### Option 1: Using pip
```bash
# Create virtual environment
python -m venv venv
@@ -71,11 +73,11 @@ source venv/bin/activate # On Windows: venv\Scripts\activate
pip install -r requirements.txt
```
### Installation with uv
### Option 2: Using uv
```bash
# Clone the repository
git clone https://github.com/Jsrodrigue/synthetic-data-creator.git
cd synthetic_data
cd synthetic-data-creator
# Install dependencies
uv sync
@@ -100,8 +102,15 @@ OPENAI_API_KEY=your_api_key_here
## 🎯 Usage
### Start the application
You can run the app either with **Python** or with **uv** (recommended if you installed dependencies using `uv sync`):
```bash
# Option 1: using Python
python app.py
# Option 2: using uv (no need to activate the virtual environment manually)
uv run app.py
```
The script will print a local URL (e.g., http://localhost:7860) — open that link in your browser.

View File

@@ -1,131 +1,156 @@
import gradio as gr
import os
import atexit
from src.IO_utils import cleanup_temp_files
from src.data_generation import generate_and_evaluate_data
from src.plot_utils import display_reference_csv
from dotenv import load_dotenv
import os
import gradio as gr
import openai
from dotenv import load_dotenv
from src.constants import PROJECT_TEMP_DIR, SYSTEM_PROMPT, USER_PROMPT
from src.data_generation import generate_and_evaluate_data
from src.IO_utils import cleanup_temp_files
from src.plot_utils import display_reference_csv
def main():
    """Build and launch the Gradio UI for the synthetic data generator.

    Steps, in order:
      1. Load environment variables from a ``.env`` file and hand the
         ``OPENAI_API_KEY`` value to the ``openai`` module.
      2. Create ``PROJECT_TEMP_DIR`` (used for temporary plot images) and
         register an ``atexit`` hook that calls ``cleanup_temp_files`` on it.
      3. Assemble the tabbed Gradio interface and wire its events.
      4. Launch the app with ``debug=True``.

    NOTE(review): this was reconstructed from a flattened diff view; the
    nesting of ``generated_state`` and ``demo.launch`` is inferred. Confirm
    against the actual post-commit ``app.py``.
    """
    # ==========================================================
    # Setup
    # ==========================================================

    # Load the API key
    load_dotenv()
    openai.api_key = os.getenv("OPENAI_API_KEY")

    # Temporary folder for images
    os.makedirs(PROJECT_TEMP_DIR, exist_ok=True)

    # Ensure temporary plot images are deleted when the program exits
    atexit.register(lambda: cleanup_temp_files(PROJECT_TEMP_DIR))

    # ==========================================================
    # Gradio App
    # ==========================================================
    with gr.Blocks() as demo:
        # Store temp folder in state
        temp_dir_state = gr.State(value=PROJECT_TEMP_DIR)

        gr.Markdown("# 🧠 Synthetic Data Generator (with OpenAI)")

        # ======================================================
        # Tabs for organized sections
        # ======================================================
        with gr.Tabs():
            # ------------------------------
            # Tab 1: Input
            # ------------------------------
            with gr.Tab("Input"):
                # System prompt in collapsible
                with gr.Accordion("System Prompt (click to expand)", open=False):
                    system_prompt_input = gr.Textbox(
                        label="System Prompt", value=SYSTEM_PROMPT, lines=20
                    )

                # User prompt box
                user_prompt_input = gr.Textbox(
                    label="User Prompt", value=USER_PROMPT, lines=5
                )

                # Model selection
                model_select = gr.Dropdown(
                    label="OpenAI Model",
                    choices=["gpt-4o-mini", "gpt-4.1-mini"],
                    value="gpt-4o-mini",
                )

                # Reference CSV upload
                reference_input = gr.File(
                    label="Reference CSV (optional)", file_types=[".csv"]
                )

                # Examples
                gr.Examples(
                    examples=[
                        "data/sentiment_reference.csv",
                        "data/people_reference.csv",
                        "data/wine_reference.csv",
                    ],
                    inputs=reference_input,
                )

                # Generate button
                generate_btn = gr.Button("🚀 Generate Data")

                # Download button
                download_csv = gr.File(label="Download CSV")

            # ------------------------------
            # Tab 2: Reference Table
            # ------------------------------
            with gr.Tab("Reference Table"):
                reference_display = gr.DataFrame(label="Reference CSV Preview")

            # ------------------------------
            # Tab 3: Generated Table
            # ------------------------------
            with gr.Tab("Generated Table"):
                output_df = gr.DataFrame(label="Generated Data")

            # ------------------------------
            # Tab 4: Comparison (evaluation results)
            # ------------------------------
            with gr.Tab("Comparison"):
                with gr.Accordion("Evaluation Results (click to expand)", open=True):
                    evaluation_df = gr.DataFrame(label="Evaluation Results")

            # ------------------------------
            # Tab 5: Visualizations
            # ------------------------------
            with gr.Tab("Visualizations"):
                gr.Markdown("# Click on the box to expand")

                images_gallery = gr.Gallery(
                    label="Column Visualizations",
                    show_label=True,
                    columns=2,
                    height="auto",
                    interactive=True,
                )

        # Hidden state for internal use
        generated_state = gr.State()

        # ======================================================
        # Event bindings
        # ======================================================
        # The order of inputs/outputs is positional: it must match the
        # parameter order and return order of generate_and_evaluate_data.
        generate_btn.click(
            fn=generate_and_evaluate_data,
            inputs=[
                system_prompt_input,
                user_prompt_input,
                temp_dir_state,
                reference_input,
                model_select,
            ],
            outputs=[
                output_df,
                download_csv,
                evaluation_df,
                generated_state,
                images_gallery,
            ],
        )

        # Refresh the "Reference Table" preview whenever the uploaded file changes
        reference_input.change(
            fn=display_reference_csv,
            inputs=[reference_input],
            outputs=[reference_display],
        )

    demo.launch(debug=True)


if __name__ == "__main__":
    main()