Modify readme and app

This commit is contained in:
Jsrodrigue
2025-10-23 23:40:53 +01:00
parent 101b0baf62
commit 9c5d5fb99e
2 changed files with 139 additions and 105 deletions

View File

@@ -1,6 +1,8 @@
# Synthetic Data Generator # Synthetic Data Generator
**NOTE:** This is a copy of the repository https://github.com/Jsrodrigue/synthetic-data-creator. **NOTE:** This is a copy of the repository https://github.com/Jsrodrigue/synthetic-data-creator.
# Synthetic Data Generator
An intelligent synthetic data generator that uses OpenAI models to create realistic tabular datasets based on reference data. This project includes an intuitive web interface built with Gradio. An intelligent synthetic data generator that uses OpenAI models to create realistic tabular datasets based on reference data. This project includes an intuitive web interface built with Gradio.
> **🎓 Educational Project**: This project was inspired by the highly regarded LLM Engineering course on Udemy: [LLM Engineering: Master AI and Large Language Models](https://www.udemy.com/course/llm-engineering-master-ai-and-large-language-models/learn/lecture/52941433#questions/23828099). It demonstrates practical applications of LLM engineering principles, prompt engineering, and synthetic data generation techniques. > **🎓 Educational Project**: This project was inspired by the highly regarded LLM Engineering course on Udemy: [LLM Engineering: Master AI and Large Language Models](https://www.udemy.com/course/llm-engineering-master-ai-and-large-language-models/learn/lecture/52941433#questions/23828099). It demonstrates practical applications of LLM engineering principles, prompt engineering, and synthetic data generation techniques.
@@ -61,7 +63,7 @@ An intelligent synthetic data generator that uses OpenAI models to create realis
- Python 3.12+ - Python 3.12+
- OpenAI account with API key - OpenAI account with API key
### Installation with pip ### Option 1: Using pip
```bash ```bash
# Create virtual environment # Create virtual environment
python -m venv venv python -m venv venv
@@ -71,11 +73,11 @@ source venv/bin/activate # On Windows: venv\Scripts\activate
pip install -r requirements.txt pip install -r requirements.txt
``` ```
### Installation with uv ### Option 2: Using uv
```bash ```bash
# Clone the repository # Clone the repository
git clone https://github.com/Jsrodrigue/synthetic-data-creator.git git clone https://github.com/Jsrodrigue/synthetic-data-creator.git
cd synthetic_data cd synthetic-data-creator
# Install dependencies # Install dependencies
uv sync uv sync
@@ -100,8 +102,15 @@ OPENAI_API_KEY=your_api_key_here
## 🎯 Usage ## 🎯 Usage
### Start the application ### Start the application
You can run the app with either **Python** or **uv** (uv is recommended if you installed the dependencies with `uv sync`):
```bash ```bash
# Option 1: using Python
python app.py python app.py
# Option 2: using uv (no need to activate the virtual environment manually)
uv run app.py
``` ```
The script will print a local URL (e.g., http://localhost:7860) — open that link in your browser. The script will print a local URL (e.g., http://localhost:7860) — open that link in your browser.

View File

@@ -1,131 +1,156 @@
import gradio as gr
import os
import atexit import atexit
from src.IO_utils import cleanup_temp_files import os
from src.data_generation import generate_and_evaluate_data
from src.plot_utils import display_reference_csv import gradio as gr
from dotenv import load_dotenv
import openai import openai
from dotenv import load_dotenv
from src.constants import PROJECT_TEMP_DIR, SYSTEM_PROMPT, USER_PROMPT from src.constants import PROJECT_TEMP_DIR, SYSTEM_PROMPT, USER_PROMPT
from src.data_generation import generate_and_evaluate_data
from src.IO_utils import cleanup_temp_files
from src.plot_utils import display_reference_csv
# ==========================================================
# Setup
# ==========================================================
#Load the api key def main():
load_dotenv() # ==========================================================
openai.api_key = os.getenv("OPENAI_API_KEY") # Setup
# ==========================================================
# Temporary folder for images # Load the api key
os.makedirs(PROJECT_TEMP_DIR, exist_ok=True) load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
# Ensure temporary plot images are deleted when the program exits # Temporary folder for images
atexit.register(lambda: cleanup_temp_files(PROJECT_TEMP_DIR)) os.makedirs(PROJECT_TEMP_DIR, exist_ok=True)
# ========================================================== # Ensure temporary plot images are deleted when the program exits
# Gradio App atexit.register(lambda: cleanup_temp_files(PROJECT_TEMP_DIR))
# ==========================================================
with gr.Blocks() as demo:
# Store temp folder in state # ==========================================================
temp_dir_state = gr.State(value=PROJECT_TEMP_DIR) # Gradio App
# ==========================================================
with gr.Blocks() as demo:
gr.Markdown("# 🧠 Synthetic Data Generator (with OpenAI)") # Store temp folder in state
temp_dir_state = gr.State(value=PROJECT_TEMP_DIR)
# ====================================================== gr.Markdown("# 🧠 Synthetic Data Generator (with OpenAI)")
# Tabs for organized sections
# ======================================================
with gr.Tabs():
# ------------------------------ # ======================================================
# Tab 1: Input # Tabs for organized sections
# ------------------------------ # ======================================================
with gr.Tab("Input"): with gr.Tabs():
# System prompt in collapsible # ------------------------------
with gr.Accordion("System Prompt (click to expand)", open=False): # Tab 1: Input
system_prompt_input = gr.Textbox( # ------------------------------
label="System Prompt", with gr.Tab("Input"):
value=SYSTEM_PROMPT,
lines=20 # System prompt in collapsible
with gr.Accordion("System Prompt (click to expand)", open=False):
system_prompt_input = gr.Textbox(
label="System Prompt", value=SYSTEM_PROMPT, lines=20
)
# User prompt box
user_prompt_input = gr.Textbox(
label="User Prompt", value=USER_PROMPT, lines=5
) )
# User prompt box # Model selection
user_prompt_input = gr.Textbox(label="User Prompt", value=USER_PROMPT, lines=5) model_select = gr.Dropdown(
label="OpenAI Model",
choices=["gpt-4o-mini", "gpt-4.1-mini"],
value="gpt-4o-mini",
)
# Model selection # Reference CSV upload
model_select = gr.Dropdown( reference_input = gr.File(
label="OpenAI Model", label="Reference CSV (optional)", file_types=[".csv"]
choices=["gpt-4o-mini", "gpt-4.1-mini"], )
value="gpt-4o-mini"
)
# Reference CSV upload # Examples
reference_input = gr.File(label="Reference CSV (optional)", file_types=[".csv"]) gr.Examples(
examples=[
"data/sentiment_reference.csv",
"data/people_reference.csv",
"data/wine_reference.csv",
],
inputs=reference_input,
)
# Examples # Generate button
gr.Examples( generate_btn = gr.Button("🚀 Generate Data")
examples=["data/sentiment_reference.csv","data/people_reference.csv","data/wine_reference.csv"],
inputs=reference_input
)
# Generate button # Download button
generate_btn = gr.Button("🚀 Generate Data") download_csv = gr.File(label="Download CSV")
# Download button # ------------------------------
download_csv = gr.File(label="Download CSV") # Tab 2: Reference Table
# ------------------------------
with gr.Tab("Reference Table"):
reference_display = gr.DataFrame(label="Reference CSV Preview")
# ------------------------------ # ------------------------------
# Tab 2: Reference Table # Tab 3: Generated Table
# ------------------------------ # ------------------------------
with gr.Tab("Reference Table"): with gr.Tab("Generated Table"):
reference_display = gr.DataFrame(label="Reference CSV Preview") output_df = gr.DataFrame(label="Generated Data")
# ------------------------------ # ------------------------------
# Tab 3: Generated Table # Tab 4: Evaluation
# ------------------------------ # ------------------------------
with gr.Tab("Generated Table"): with gr.Tab("Comparison"):
output_df = gr.DataFrame(label="Generated Data") with gr.Accordion("Evaluation Results (click to expand)", open=True):
evaluation_df = gr.DataFrame(label="Evaluation Results")
# ------------------------------ # ------------------------------
# Tab 4: Evaluation # Tab 5: Visualizations
# ------------------------------ # ------------------------------
with gr.Tab("Comparison"):
with gr.Accordion("Evaluation Results (click to expand)", open=True):
evaluation_df = gr.DataFrame(label="Evaluation Results")
# ------------------------------ with gr.Tab("Visualizations"):
# Tab 5: Visualizations gr.Markdown("# Click on the box to expand")
# ------------------------------
with gr.Tab("Visualizations"): images_gallery = gr.Gallery(
gr.Markdown("# Click on the box to expand") label="Column Visualizations",
show_label=True,
images_gallery = gr.Gallery( columns=2,
label="Column Visualizations", height="auto",
show_label=True, interactive=True,
columns=2, )
height='auto',
interactive=True
)
# Hidden state for internal use # Hidden state for internal use
generated_state = gr.State() generated_state = gr.State()
# ====================================================== # ======================================================
# Event bindings # Event bindings
# ====================================================== # ======================================================
generate_btn.click( generate_btn.click(
fn=generate_and_evaluate_data, fn=generate_and_evaluate_data,
inputs=[system_prompt_input, user_prompt_input, temp_dir_state, reference_input, model_select], inputs=[
outputs=[output_df, download_csv, evaluation_df, generated_state, images_gallery] system_prompt_input,
) user_prompt_input,
temp_dir_state,
reference_input,
model_select,
],
outputs=[
output_df,
download_csv,
evaluation_df,
generated_state,
images_gallery,
],
)
reference_input.change( reference_input.change(
fn=display_reference_csv, fn=display_reference_csv,
inputs=[reference_input], inputs=[reference_input],
outputs=[reference_display] outputs=[reference_display],
) )
demo.launch(debug=True) demo.launch(debug=True)
if __name__ == "__main__":
main()