Modify readme and app

This commit is contained in:
Jsrodrigue
2025-10-23 23:40:53 +01:00
parent 101b0baf62
commit 9c5d5fb99e
2 changed files with 139 additions and 105 deletions

View File

@@ -1,6 +1,8 @@
# Synthetic Data Generator
**NOTE:** This is a copy of the repository https://github.com/Jsrodrigue/synthetic-data-creator.
# Synthetic Data Generator
An intelligent synthetic data generator that uses OpenAI models to create realistic tabular datasets based on reference data. This project includes an intuitive web interface built with Gradio.
> **🎓 Educational Project**: This project was inspired by the highly regarded LLM Engineering course on Udemy: [LLM Engineering: Master AI and Large Language Models](https://www.udemy.com/course/llm-engineering-master-ai-and-large-language-models/learn/lecture/52941433#questions/23828099). It demonstrates practical applications of LLM engineering principles, prompt engineering, and synthetic data generation techniques.
@@ -61,7 +63,7 @@ An intelligent synthetic data generator that uses OpenAI models to create realis
- Python 3.12+
- OpenAI account with API key
### Installation with pip
### Option 1: Using pip
```bash
# Create virtual environment
python -m venv venv
@@ -71,11 +73,11 @@ source venv/bin/activate # On Windows: venv\Scripts\activate
pip install -r requirements.txt
```
### Installation with uv
### Option 2: Using uv
```bash
# Clone the repository
git clone https://github.com/Jsrodrigue/synthetic-data-creator.git
cd synthetic_data
cd synthetic-data-creator
# Install dependencies
uv sync
@@ -100,8 +102,15 @@ OPENAI_API_KEY=your_api_key_here
## 🎯 Usage
### Start the application
You can run the app either with **Python** or with **uv** (recommended if you installed dependencies using `uv sync`):
```bash
# Option 1: using Python
python app.py
# Option 2: using uv (no need to activate venv manually)
uv run app.py
```
The script will print a local URL (e.g., http://localhost:7860) — open that link in your browser.

View File

@@ -1,31 +1,35 @@
import gradio as gr
import os
import atexit
from src.IO_utils import cleanup_temp_files
from src.data_generation import generate_and_evaluate_data
from src.plot_utils import display_reference_csv
from dotenv import load_dotenv
import os
import gradio as gr
import openai
from dotenv import load_dotenv
from src.constants import PROJECT_TEMP_DIR, SYSTEM_PROMPT, USER_PROMPT
from src.data_generation import generate_and_evaluate_data
from src.IO_utils import cleanup_temp_files
from src.plot_utils import display_reference_csv
# ==========================================================
# Setup
# ==========================================================
#Load the api key
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
def main():
# ==========================================================
# Setup
# ==========================================================
# Temporary folder for images
os.makedirs(PROJECT_TEMP_DIR, exist_ok=True)
# Load the api key
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
# Ensure temporary plot images are deleted when the program exits
atexit.register(lambda: cleanup_temp_files(PROJECT_TEMP_DIR))
# Temporary folder for images
os.makedirs(PROJECT_TEMP_DIR, exist_ok=True)
# ==========================================================
# Gradio App
# ==========================================================
with gr.Blocks() as demo:
# Ensure temporary plot images are deleted when the program exits
atexit.register(lambda: cleanup_temp_files(PROJECT_TEMP_DIR))
# ==========================================================
# Gradio App
# ==========================================================
with gr.Blocks() as demo:
# Store temp folder in state
temp_dir_state = gr.State(value=PROJECT_TEMP_DIR)
@@ -45,28 +49,34 @@ with gr.Blocks() as demo:
# System prompt in collapsible
with gr.Accordion("System Prompt (click to expand)", open=False):
system_prompt_input = gr.Textbox(
label="System Prompt",
value=SYSTEM_PROMPT,
lines=20
label="System Prompt", value=SYSTEM_PROMPT, lines=20
)
# User prompt box
user_prompt_input = gr.Textbox(label="User Prompt", value=USER_PROMPT, lines=5)
user_prompt_input = gr.Textbox(
label="User Prompt", value=USER_PROMPT, lines=5
)
# Model selection
model_select = gr.Dropdown(
label="OpenAI Model",
choices=["gpt-4o-mini", "gpt-4.1-mini"],
value="gpt-4o-mini"
value="gpt-4o-mini",
)
# Reference CSV upload
reference_input = gr.File(label="Reference CSV (optional)", file_types=[".csv"])
reference_input = gr.File(
label="Reference CSV (optional)", file_types=[".csv"]
)
# Examples
gr.Examples(
examples=["data/sentiment_reference.csv","data/people_reference.csv","data/wine_reference.csv"],
inputs=reference_input
examples=[
"data/sentiment_reference.csv",
"data/people_reference.csv",
"data/wine_reference.csv",
],
inputs=reference_input,
)
# Generate button
@@ -87,7 +97,6 @@ with gr.Blocks() as demo:
with gr.Tab("Generated Table"):
output_df = gr.DataFrame(label="Generated Data")
# ------------------------------
# Tab 4: Evaluation
# ------------------------------
@@ -106,8 +115,8 @@ with gr.Blocks() as demo:
label="Column Visualizations",
show_label=True,
columns=2,
height='auto',
interactive=True
height="auto",
interactive=True,
)
# Hidden state for internal use
@@ -118,14 +127,30 @@ with gr.Blocks() as demo:
# ======================================================
generate_btn.click(
fn=generate_and_evaluate_data,
inputs=[system_prompt_input, user_prompt_input, temp_dir_state, reference_input, model_select],
outputs=[output_df, download_csv, evaluation_df, generated_state, images_gallery]
inputs=[
system_prompt_input,
user_prompt_input,
temp_dir_state,
reference_input,
model_select,
],
outputs=[
output_df,
download_csv,
evaluation_df,
generated_state,
images_gallery,
],
)
reference_input.change(
fn=display_reference_csv,
inputs=[reference_input],
outputs=[reference_display]
outputs=[reference_display],
)
demo.launch(debug=True)
demo.launch(debug=True)
if __name__ == "__main__":
main()