Modify readme and app
This commit is contained in:
@@ -1,6 +1,8 @@
|
|||||||
# Synthetic Data Generator
|
# Synthetic Data Generator
|
||||||
**NOTE:** This is a copy of the repository https://github.com/Jsrodrigue/synthetic-data-creator.
|
**NOTE:** This is a copy of the repository https://github.com/Jsrodrigue/synthetic-data-creator.
|
||||||
|
|
||||||
|
# Synthetic Data Generator
|
||||||
|
|
||||||
An intelligent synthetic data generator that uses OpenAI models to create realistic tabular datasets based on reference data. This project includes an intuitive web interface built with Gradio.
|
An intelligent synthetic data generator that uses OpenAI models to create realistic tabular datasets based on reference data. This project includes an intuitive web interface built with Gradio.
|
||||||
|
|
||||||
> **🎓 Educational Project**: This project was inspired by the highly regarded LLM Engineering course on Udemy: [LLM Engineering: Master AI and Large Language Models](https://www.udemy.com/course/llm-engineering-master-ai-and-large-language-models/learn/lecture/52941433#questions/23828099). It demonstrates practical applications of LLM engineering principles, prompt engineering, and synthetic data generation techniques.
|
> **🎓 Educational Project**: This project was inspired by the highly regarded LLM Engineering course on Udemy: [LLM Engineering: Master AI and Large Language Models](https://www.udemy.com/course/llm-engineering-master-ai-and-large-language-models/learn/lecture/52941433#questions/23828099). It demonstrates practical applications of LLM engineering principles, prompt engineering, and synthetic data generation techniques.
|
||||||
@@ -61,7 +63,7 @@ An intelligent synthetic data generator that uses OpenAI models to create realis
|
|||||||
- Python 3.12+
|
- Python 3.12+
|
||||||
- OpenAI account with API key
|
- OpenAI account with API key
|
||||||
|
|
||||||
### Installation with pip
|
### Option 1: Using pip
|
||||||
```bash
|
```bash
|
||||||
# Create virtual environment
|
# Create virtual environment
|
||||||
python -m venv venv
|
python -m venv venv
|
||||||
@@ -71,11 +73,11 @@ source venv/bin/activate # On Windows: venv\Scripts\activate
|
|||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
```
|
```
|
||||||
|
|
||||||
### Installation with uv
|
### Option 2: Using uv
|
||||||
```bash
|
```bash
|
||||||
# Clone the repository
|
# Clone the repository
|
||||||
git clone https://github.com/Jsrodrigue/synthetic-data-creator.git
|
git clone https://github.com/Jsrodrigue/synthetic-data-creator.git
|
||||||
cd synthetic_data
|
cd synthetic-data-creator
|
||||||
|
|
||||||
# Install dependencies
|
# Install dependencies
|
||||||
uv sync
|
uv sync
|
||||||
@@ -100,8 +102,15 @@ OPENAI_API_KEY=your_api_key_here
|
|||||||
## 🎯 Usage
|
## 🎯 Usage
|
||||||
|
|
||||||
### Start the application
|
### Start the application
|
||||||
|
|
||||||
|
You can run the app either with **Python** or with **uv** (recommended if you installed dependencies using `uv sync`):
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
# Option 1: using Python
|
||||||
python app.py
|
python app.py
|
||||||
|
|
||||||
|
# Option 2: using uv (no need to activate venv manually)
|
||||||
|
uv run app.py
|
||||||
```
|
```
|
||||||
|
|
||||||
The script will print a local URL (e.g., http://localhost:7860) — open that link in your browser.
|
The script will print a local URL (e.g., http://localhost:7860) — open that link in your browser.
|
||||||
|
|||||||
@@ -1,131 +1,156 @@
|
|||||||
import gradio as gr
|
|
||||||
import os
|
|
||||||
import atexit
|
import atexit
|
||||||
from src.IO_utils import cleanup_temp_files
|
import os
|
||||||
from src.data_generation import generate_and_evaluate_data
|
|
||||||
from src.plot_utils import display_reference_csv
|
import gradio as gr
|
||||||
from dotenv import load_dotenv
|
|
||||||
import openai
|
import openai
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
from src.constants import PROJECT_TEMP_DIR, SYSTEM_PROMPT, USER_PROMPT
|
from src.constants import PROJECT_TEMP_DIR, SYSTEM_PROMPT, USER_PROMPT
|
||||||
|
from src.data_generation import generate_and_evaluate_data
|
||||||
|
from src.IO_utils import cleanup_temp_files
|
||||||
|
from src.plot_utils import display_reference_csv
|
||||||
|
|
||||||
# ==========================================================
|
|
||||||
# Setup
|
|
||||||
# ==========================================================
|
|
||||||
|
|
||||||
#Load the api key
|
def main():
|
||||||
load_dotenv()
|
# ==========================================================
|
||||||
openai.api_key = os.getenv("OPENAI_API_KEY")
|
# Setup
|
||||||
|
# ==========================================================
|
||||||
|
|
||||||
# Temporary folder for images
|
# Load the api key
|
||||||
os.makedirs(PROJECT_TEMP_DIR, exist_ok=True)
|
load_dotenv()
|
||||||
|
openai.api_key = os.getenv("OPENAI_API_KEY")
|
||||||
|
|
||||||
# Ensure temporary plot images are deleted when the program exits
|
# Temporary folder for images
|
||||||
atexit.register(lambda: cleanup_temp_files(PROJECT_TEMP_DIR))
|
os.makedirs(PROJECT_TEMP_DIR, exist_ok=True)
|
||||||
|
|
||||||
# ==========================================================
|
# Ensure temporary plot images are deleted when the program exits
|
||||||
# Gradio App
|
atexit.register(lambda: cleanup_temp_files(PROJECT_TEMP_DIR))
|
||||||
# ==========================================================
|
|
||||||
with gr.Blocks() as demo:
|
|
||||||
|
|
||||||
# Store temp folder in state
|
# ==========================================================
|
||||||
temp_dir_state = gr.State(value=PROJECT_TEMP_DIR)
|
# Gradio App
|
||||||
|
# ==========================================================
|
||||||
|
with gr.Blocks() as demo:
|
||||||
|
|
||||||
gr.Markdown("# 🧠 Synthetic Data Generator (with OpenAI)")
|
# Store temp folder in state
|
||||||
|
temp_dir_state = gr.State(value=PROJECT_TEMP_DIR)
|
||||||
|
|
||||||
# ======================================================
|
gr.Markdown("# 🧠 Synthetic Data Generator (with OpenAI)")
|
||||||
# Tabs for organized sections
|
|
||||||
# ======================================================
|
|
||||||
with gr.Tabs():
|
|
||||||
|
|
||||||
# ------------------------------
|
# ======================================================
|
||||||
# Tab 1: Input
|
# Tabs for organized sections
|
||||||
# ------------------------------
|
# ======================================================
|
||||||
with gr.Tab("Input"):
|
with gr.Tabs():
|
||||||
|
|
||||||
# System prompt in collapsible
|
# ------------------------------
|
||||||
with gr.Accordion("System Prompt (click to expand)", open=False):
|
# Tab 1: Input
|
||||||
system_prompt_input = gr.Textbox(
|
# ------------------------------
|
||||||
label="System Prompt",
|
with gr.Tab("Input"):
|
||||||
value=SYSTEM_PROMPT,
|
|
||||||
lines=20
|
# System prompt in collapsible
|
||||||
|
with gr.Accordion("System Prompt (click to expand)", open=False):
|
||||||
|
system_prompt_input = gr.Textbox(
|
||||||
|
label="System Prompt", value=SYSTEM_PROMPT, lines=20
|
||||||
|
)
|
||||||
|
|
||||||
|
# User prompt box
|
||||||
|
user_prompt_input = gr.Textbox(
|
||||||
|
label="User Prompt", value=USER_PROMPT, lines=5
|
||||||
)
|
)
|
||||||
|
|
||||||
# User prompt box
|
# Model selection
|
||||||
user_prompt_input = gr.Textbox(label="User Prompt", value=USER_PROMPT, lines=5)
|
model_select = gr.Dropdown(
|
||||||
|
label="OpenAI Model",
|
||||||
|
choices=["gpt-4o-mini", "gpt-4.1-mini"],
|
||||||
|
value="gpt-4o-mini",
|
||||||
|
)
|
||||||
|
|
||||||
# Model selection
|
# Reference CSV upload
|
||||||
model_select = gr.Dropdown(
|
reference_input = gr.File(
|
||||||
label="OpenAI Model",
|
label="Reference CSV (optional)", file_types=[".csv"]
|
||||||
choices=["gpt-4o-mini", "gpt-4.1-mini"],
|
)
|
||||||
value="gpt-4o-mini"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Reference CSV upload
|
# Examples
|
||||||
reference_input = gr.File(label="Reference CSV (optional)", file_types=[".csv"])
|
gr.Examples(
|
||||||
|
examples=[
|
||||||
|
"data/sentiment_reference.csv",
|
||||||
|
"data/people_reference.csv",
|
||||||
|
"data/wine_reference.csv",
|
||||||
|
],
|
||||||
|
inputs=reference_input,
|
||||||
|
)
|
||||||
|
|
||||||
# Examples
|
# Generate button
|
||||||
gr.Examples(
|
generate_btn = gr.Button("🚀 Generate Data")
|
||||||
examples=["data/sentiment_reference.csv","data/people_reference.csv","data/wine_reference.csv"],
|
|
||||||
inputs=reference_input
|
|
||||||
)
|
|
||||||
|
|
||||||
# Generate button
|
# Download button
|
||||||
generate_btn = gr.Button("🚀 Generate Data")
|
download_csv = gr.File(label="Download CSV")
|
||||||
|
|
||||||
# Download button
|
# ------------------------------
|
||||||
download_csv = gr.File(label="Download CSV")
|
# Tab 2: Reference Table
|
||||||
|
# ------------------------------
|
||||||
|
with gr.Tab("Reference Table"):
|
||||||
|
reference_display = gr.DataFrame(label="Reference CSV Preview")
|
||||||
|
|
||||||
# ------------------------------
|
# ------------------------------
|
||||||
# Tab 2: Reference Table
|
# Tab 3: Generated Table
|
||||||
# ------------------------------
|
# ------------------------------
|
||||||
with gr.Tab("Reference Table"):
|
with gr.Tab("Generated Table"):
|
||||||
reference_display = gr.DataFrame(label="Reference CSV Preview")
|
output_df = gr.DataFrame(label="Generated Data")
|
||||||
|
|
||||||
# ------------------------------
|
# ------------------------------
|
||||||
# Tab 3: Generated Table
|
# Tab 4: Evaluation
|
||||||
# ------------------------------
|
# ------------------------------
|
||||||
with gr.Tab("Generated Table"):
|
with gr.Tab("Comparison"):
|
||||||
output_df = gr.DataFrame(label="Generated Data")
|
with gr.Accordion("Evaluation Results (click to expand)", open=True):
|
||||||
|
evaluation_df = gr.DataFrame(label="Evaluation Results")
|
||||||
|
|
||||||
|
# ------------------------------
|
||||||
|
# Tab 5: Visualizations
|
||||||
|
# ------------------------------
|
||||||
|
|
||||||
|
with gr.Tab("Visualizations"):
|
||||||
|
gr.Markdown("# Click on the box to expand")
|
||||||
|
|
||||||
|
images_gallery = gr.Gallery(
|
||||||
|
label="Column Visualizations",
|
||||||
|
show_label=True,
|
||||||
|
columns=2,
|
||||||
|
height="auto",
|
||||||
|
interactive=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Hidden state for internal use
|
||||||
|
generated_state = gr.State()
|
||||||
|
|
||||||
|
# ======================================================
|
||||||
|
# Event bindings
|
||||||
|
# ======================================================
|
||||||
|
generate_btn.click(
|
||||||
|
fn=generate_and_evaluate_data,
|
||||||
|
inputs=[
|
||||||
|
system_prompt_input,
|
||||||
|
user_prompt_input,
|
||||||
|
temp_dir_state,
|
||||||
|
reference_input,
|
||||||
|
model_select,
|
||||||
|
],
|
||||||
|
outputs=[
|
||||||
|
output_df,
|
||||||
|
download_csv,
|
||||||
|
evaluation_df,
|
||||||
|
generated_state,
|
||||||
|
images_gallery,
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
reference_input.change(
|
||||||
|
fn=display_reference_csv,
|
||||||
|
inputs=[reference_input],
|
||||||
|
outputs=[reference_display],
|
||||||
|
)
|
||||||
|
|
||||||
|
demo.launch(debug=True)
|
||||||
|
|
||||||
|
|
||||||
# ------------------------------
|
if __name__ == "__main__":
|
||||||
# Tab 4: Evaluation
|
main()
|
||||||
# ------------------------------
|
|
||||||
with gr.Tab("Comparison"):
|
|
||||||
with gr.Accordion("Evaluation Results (click to expand)", open=True):
|
|
||||||
evaluation_df = gr.DataFrame(label="Evaluation Results")
|
|
||||||
|
|
||||||
# ------------------------------
|
|
||||||
# Tab 5: Visualizations
|
|
||||||
# ------------------------------
|
|
||||||
|
|
||||||
with gr.Tab("Visualizations"):
|
|
||||||
gr.Markdown("# Click on the box to expand")
|
|
||||||
|
|
||||||
images_gallery = gr.Gallery(
|
|
||||||
label="Column Visualizations",
|
|
||||||
show_label=True,
|
|
||||||
columns=2,
|
|
||||||
height='auto',
|
|
||||||
interactive=True
|
|
||||||
)
|
|
||||||
|
|
||||||
# Hidden state for internal use
|
|
||||||
generated_state = gr.State()
|
|
||||||
|
|
||||||
# ======================================================
|
|
||||||
# Event bindings
|
|
||||||
# ======================================================
|
|
||||||
generate_btn.click(
|
|
||||||
fn=generate_and_evaluate_data,
|
|
||||||
inputs=[system_prompt_input, user_prompt_input, temp_dir_state, reference_input, model_select],
|
|
||||||
outputs=[output_df, download_csv, evaluation_df, generated_state, images_gallery]
|
|
||||||
)
|
|
||||||
|
|
||||||
reference_input.change(
|
|
||||||
fn=display_reference_csv,
|
|
||||||
inputs=[reference_input],
|
|
||||||
outputs=[reference_display]
|
|
||||||
)
|
|
||||||
|
|
||||||
demo.launch(debug=True)
|
|
||||||
|
|||||||
Reference in New Issue
Block a user