Modify readme and app

This commit is contained in:
Jsrodrigue
2025-10-23 23:40:53 +01:00
parent 101b0baf62
commit 9c5d5fb99e
2 changed files with 139 additions and 105 deletions

View File

@@ -1,6 +1,8 @@
# Synthetic Data Generator # Synthetic Data Generator
**NOTE:** This is a copy of the repository https://github.com/Jsrodrigue/synthetic-data-creator. **NOTE:** This is a copy of the repository https://github.com/Jsrodrigue/synthetic-data-creator.
# Synthetic Data Generator
An intelligent synthetic data generator that uses OpenAI models to create realistic tabular datasets based on reference data. This project includes an intuitive web interface built with Gradio. An intelligent synthetic data generator that uses OpenAI models to create realistic tabular datasets based on reference data. This project includes an intuitive web interface built with Gradio.
> **🎓 Educational Project**: This project was inspired by the highly regarded LLM Engineering course on Udemy: [LLM Engineering: Master AI and Large Language Models](https://www.udemy.com/course/llm-engineering-master-ai-and-large-language-models/learn/lecture/52941433#questions/23828099). It demonstrates practical applications of LLM engineering principles, prompt engineering, and synthetic data generation techniques. > **🎓 Educational Project**: This project was inspired by the highly regarded LLM Engineering course on Udemy: [LLM Engineering: Master AI and Large Language Models](https://www.udemy.com/course/llm-engineering-master-ai-and-large-language-models/learn/lecture/52941433#questions/23828099). It demonstrates practical applications of LLM engineering principles, prompt engineering, and synthetic data generation techniques.
@@ -61,7 +63,7 @@ An intelligent synthetic data generator that uses OpenAI models to create realis
- Python 3.12+ - Python 3.12+
- OpenAI account with API key - OpenAI account with API key
### Installation with pip ### Option 1: Using pip
```bash ```bash
# Create virtual environment # Create virtual environment
python -m venv venv python -m venv venv
@@ -71,11 +73,11 @@ source venv/bin/activate # On Windows: venv\Scripts\activate
pip install -r requirements.txt pip install -r requirements.txt
``` ```
### Installation with uv ### Option 2: Using uv
```bash ```bash
# Clone the repository # Clone the repository
git clone https://github.com/Jsrodrigue/synthetic-data-creator.git git clone https://github.com/Jsrodrigue/synthetic-data-creator.git
cd synthetic_data cd synthetic-data-creator
# Install dependencies # Install dependencies
uv sync uv sync
@@ -100,8 +102,15 @@ OPENAI_API_KEY=your_api_key_here
## 🎯 Usage ## 🎯 Usage
### Start the application ### Start the application
You can run the app with either **Python** or **uv** (uv is recommended if you installed the dependencies with `uv sync`):
```bash ```bash
# Option 1: using Python
python app.py python app.py
# Option 2: using uv (no need to activate the venv manually)
uv run app.py
``` ```
The script will print a local URL (e.g., http://localhost:7860) — open that link in your browser. The script will print a local URL (e.g., http://localhost:7860) — open that link in your browser.

View File

@@ -1,13 +1,17 @@
import gradio as gr
import os
import atexit import atexit
from src.IO_utils import cleanup_temp_files import os
from src.data_generation import generate_and_evaluate_data
from src.plot_utils import display_reference_csv
from dotenv import load_dotenv
import openai
from src.constants import PROJECT_TEMP_DIR, SYSTEM_PROMPT, USER_PROMPT
import gradio as gr
import openai
from dotenv import load_dotenv
from src.constants import PROJECT_TEMP_DIR, SYSTEM_PROMPT, USER_PROMPT
from src.data_generation import generate_and_evaluate_data
from src.IO_utils import cleanup_temp_files
from src.plot_utils import display_reference_csv
def main():
# ========================================================== # ==========================================================
# Setup # Setup
# ========================================================== # ==========================================================
@@ -45,28 +49,34 @@ with gr.Blocks() as demo:
# System prompt in collapsible # System prompt in collapsible
with gr.Accordion("System Prompt (click to expand)", open=False): with gr.Accordion("System Prompt (click to expand)", open=False):
system_prompt_input = gr.Textbox( system_prompt_input = gr.Textbox(
label="System Prompt", label="System Prompt", value=SYSTEM_PROMPT, lines=20
value=SYSTEM_PROMPT,
lines=20
) )
# User prompt box # User prompt box
user_prompt_input = gr.Textbox(label="User Prompt", value=USER_PROMPT, lines=5) user_prompt_input = gr.Textbox(
label="User Prompt", value=USER_PROMPT, lines=5
)
# Model selection # Model selection
model_select = gr.Dropdown( model_select = gr.Dropdown(
label="OpenAI Model", label="OpenAI Model",
choices=["gpt-4o-mini", "gpt-4.1-mini"], choices=["gpt-4o-mini", "gpt-4.1-mini"],
value="gpt-4o-mini" value="gpt-4o-mini",
) )
# Reference CSV upload # Reference CSV upload
reference_input = gr.File(label="Reference CSV (optional)", file_types=[".csv"]) reference_input = gr.File(
label="Reference CSV (optional)", file_types=[".csv"]
)
# Examples # Examples
gr.Examples( gr.Examples(
examples=["data/sentiment_reference.csv","data/people_reference.csv","data/wine_reference.csv"], examples=[
inputs=reference_input "data/sentiment_reference.csv",
"data/people_reference.csv",
"data/wine_reference.csv",
],
inputs=reference_input,
) )
# Generate button # Generate button
@@ -87,7 +97,6 @@ with gr.Blocks() as demo:
with gr.Tab("Generated Table"): with gr.Tab("Generated Table"):
output_df = gr.DataFrame(label="Generated Data") output_df = gr.DataFrame(label="Generated Data")
# ------------------------------ # ------------------------------
# Tab 4: Evaluation # Tab 4: Evaluation
# ------------------------------ # ------------------------------
@@ -106,8 +115,8 @@ with gr.Blocks() as demo:
label="Column Visualizations", label="Column Visualizations",
show_label=True, show_label=True,
columns=2, columns=2,
height='auto', height="auto",
interactive=True interactive=True,
) )
# Hidden state for internal use # Hidden state for internal use
@@ -118,14 +127,30 @@ with gr.Blocks() as demo:
# ====================================================== # ======================================================
generate_btn.click( generate_btn.click(
fn=generate_and_evaluate_data, fn=generate_and_evaluate_data,
inputs=[system_prompt_input, user_prompt_input, temp_dir_state, reference_input, model_select], inputs=[
outputs=[output_df, download_csv, evaluation_df, generated_state, images_gallery] system_prompt_input,
user_prompt_input,
temp_dir_state,
reference_input,
model_select,
],
outputs=[
output_df,
download_csv,
evaluation_df,
generated_state,
images_gallery,
],
) )
reference_input.change( reference_input.change(
fn=display_reference_csv, fn=display_reference_csv,
inputs=[reference_input], inputs=[reference_input],
outputs=[reference_display] outputs=[reference_display],
) )
demo.launch(debug=True) demo.launch(debug=True)
if __name__ == "__main__":
main()