diff --git a/week3/community-contributions/juan_synthetic_data/README.md b/week3/community-contributions/juan_synthetic_data/README.md
index 7cbb0f7..f49367a 100644
--- a/week3/community-contributions/juan_synthetic_data/README.md
+++ b/week3/community-contributions/juan_synthetic_data/README.md
@@ -1,6 +1,8 @@
 # Synthetic Data Generator
 **NOTE:** This is a copy of the repository https://github.com/Jsrodrigue/synthetic-data-creator.
 
+# Synthetic Data Generator
+
 An intelligent synthetic data generator that uses OpenAI models to create realistic tabular datasets based on reference data. This project includes an intuitive web interface built with Gradio.
 
 > **🎓 Educational Project**: This project was inspired by the highly regarded LLM Engineering course on Udemy: [LLM Engineering: Master AI and Large Language Models](https://www.udemy.com/course/llm-engineering-master-ai-and-large-language-models/learn/lecture/52941433#questions/23828099). It demonstrates practical applications of LLM engineering principles, prompt engineering, and synthetic data generation techniques.
@@ -61,7 +63,7 @@ An intelligent synthetic data generator that uses OpenAI models to create realis
 - Python 3.12+
 - OpenAI account with API key
 
-### Installation with pip
+### Option 1: Using pip
 ```bash
 # Create virtual environment
 python -m venv venv
@@ -71,11 +73,11 @@ source venv/bin/activate # On Windows: venv\Scripts\activate
 pip install -r requirements.txt
 ```
 
-### Installation with uv
+### Option 2: Using uv
 ```bash
 # Clone the repository
 git clone https://github.com/Jsrodrigue/synthetic-data-creator.git
-cd synthetic_data
+cd synthetic-data-creator
 
 # Install dependencies
 uv sync
@@ -100,8 +102,15 @@ OPENAI_API_KEY=your_api_key_here
 ## 🎯 Usage
 
 ### Start the application
+
+You can run the app either with **Python** or with **uv** (recommended if you installed dependencies using `uv sync`):
+
 ```bash
+# Option 1: using Python
 python app.py
+
+# Option 2: using uv (no need to activate venv manually)
+uv run app.py
 ```
 
 The script will print a local URL (e.g., http://localhost:7860) — open that link in your browser.
diff --git a/week3/community-contributions/juan_synthetic_data/app.py b/week3/community-contributions/juan_synthetic_data/app.py index ac5b950..0244f34 100644 --- a/week3/community-contributions/juan_synthetic_data/app.py +++ b/week3/community-contributions/juan_synthetic_data/app.py @@ -1,131 +1,156 @@ -import gradio as gr -import os import atexit -from src.IO_utils import cleanup_temp_files -from src.data_generation import generate_and_evaluate_data -from src.plot_utils import display_reference_csv -from dotenv import load_dotenv +import os + +import gradio as gr import openai +from dotenv import load_dotenv + from src.constants import PROJECT_TEMP_DIR, SYSTEM_PROMPT, USER_PROMPT +from src.data_generation import generate_and_evaluate_data +from src.IO_utils import cleanup_temp_files +from src.plot_utils import display_reference_csv -# ========================================================== -# Setup -# ========================================================== -#Load the api key -load_dotenv() -openai.api_key = os.getenv("OPENAI_API_KEY") +def main(): + # ========================================================== + # Setup + # ========================================================== -# Temporary folder for images -os.makedirs(PROJECT_TEMP_DIR, exist_ok=True) + # Load the api key + load_dotenv() + openai.api_key = os.getenv("OPENAI_API_KEY") -# Ensure temporary plot images are deleted when the program exits -atexit.register(lambda: cleanup_temp_files(PROJECT_TEMP_DIR)) + # Temporary folder for images + os.makedirs(PROJECT_TEMP_DIR, exist_ok=True) -# ========================================================== -# Gradio App -# ========================================================== -with gr.Blocks() as demo: + # Ensure temporary plot images are deleted when the program exits + atexit.register(lambda: cleanup_temp_files(PROJECT_TEMP_DIR)) - # Store temp folder in state - temp_dir_state = gr.State(value=PROJECT_TEMP_DIR) + # 
========================================================== + # Gradio App + # ========================================================== + with gr.Blocks() as demo: - gr.Markdown("# 🧠 Synthetic Data Generator (with OpenAI)") + # Store temp folder in state + temp_dir_state = gr.State(value=PROJECT_TEMP_DIR) - # ====================================================== - # Tabs for organized sections - # ====================================================== - with gr.Tabs(): + gr.Markdown("# 🧠 Synthetic Data Generator (with OpenAI)") - # ------------------------------ - # Tab 1: Input - # ------------------------------ - with gr.Tab("Input"): + # ====================================================== + # Tabs for organized sections + # ====================================================== + with gr.Tabs(): - # System prompt in collapsible - with gr.Accordion("System Prompt (click to expand)", open=False): - system_prompt_input = gr.Textbox( - label="System Prompt", - value=SYSTEM_PROMPT, - lines=20 + # ------------------------------ + # Tab 1: Input + # ------------------------------ + with gr.Tab("Input"): + + # System prompt in collapsible + with gr.Accordion("System Prompt (click to expand)", open=False): + system_prompt_input = gr.Textbox( + label="System Prompt", value=SYSTEM_PROMPT, lines=20 + ) + + # User prompt box + user_prompt_input = gr.Textbox( + label="User Prompt", value=USER_PROMPT, lines=5 ) - # User prompt box - user_prompt_input = gr.Textbox(label="User Prompt", value=USER_PROMPT, lines=5) + # Model selection + model_select = gr.Dropdown( + label="OpenAI Model", + choices=["gpt-4o-mini", "gpt-4.1-mini"], + value="gpt-4o-mini", + ) - # Model selection - model_select = gr.Dropdown( - label="OpenAI Model", - choices=["gpt-4o-mini", "gpt-4.1-mini"], - value="gpt-4o-mini" - ) + # Reference CSV upload + reference_input = gr.File( + label="Reference CSV (optional)", file_types=[".csv"] + ) - # Reference CSV upload - reference_input = 
gr.File(label="Reference CSV (optional)", file_types=[".csv"]) + # Examples + gr.Examples( + examples=[ + "data/sentiment_reference.csv", + "data/people_reference.csv", + "data/wine_reference.csv", + ], + inputs=reference_input, + ) - # Examples - gr.Examples( - examples=["data/sentiment_reference.csv","data/people_reference.csv","data/wine_reference.csv"], - inputs=reference_input - ) + # Generate button + generate_btn = gr.Button("🚀 Generate Data") - # Generate button - generate_btn = gr.Button("🚀 Generate Data") + # Download button + download_csv = gr.File(label="Download CSV") - # Download button - download_csv = gr.File(label="Download CSV") + # ------------------------------ + # Tab 2: Reference Table + # ------------------------------ + with gr.Tab("Reference Table"): + reference_display = gr.DataFrame(label="Reference CSV Preview") - # ------------------------------ - # Tab 2: Reference Table - # ------------------------------ - with gr.Tab("Reference Table"): - reference_display = gr.DataFrame(label="Reference CSV Preview") + # ------------------------------ + # Tab 3: Generated Table + # ------------------------------ + with gr.Tab("Generated Table"): + output_df = gr.DataFrame(label="Generated Data") - # ------------------------------ - # Tab 3: Generated Table - # ------------------------------ - with gr.Tab("Generated Table"): - output_df = gr.DataFrame(label="Generated Data") - + # ------------------------------ + # Tab 4: Evaluation + # ------------------------------ + with gr.Tab("Comparison"): + with gr.Accordion("Evaluation Results (click to expand)", open=True): + evaluation_df = gr.DataFrame(label="Evaluation Results") - # ------------------------------ - # Tab 4: Evaluation - # ------------------------------ - with gr.Tab("Comparison"): - with gr.Accordion("Evaluation Results (click to expand)", open=True): - evaluation_df = gr.DataFrame(label="Evaluation Results") + # ------------------------------ + # Tab 5: Visualizations + # 
------------------------------ - # ------------------------------ - # Tab 5: Visualizations - # ------------------------------ + with gr.Tab("Visualizations"): + gr.Markdown("# Click on the box to expand") - with gr.Tab("Visualizations"): - gr.Markdown("# Click on the box to expand") - - images_gallery = gr.Gallery( - label="Column Visualizations", - show_label=True, - columns=2, - height='auto', - interactive=True - ) + images_gallery = gr.Gallery( + label="Column Visualizations", + show_label=True, + columns=2, + height="auto", + interactive=True, + ) - # Hidden state for internal use - generated_state = gr.State() + # Hidden state for internal use + generated_state = gr.State() - # ====================================================== - # Event bindings - # ====================================================== - generate_btn.click( - fn=generate_and_evaluate_data, - inputs=[system_prompt_input, user_prompt_input, temp_dir_state, reference_input, model_select], - outputs=[output_df, download_csv, evaluation_df, generated_state, images_gallery] - ) + # ====================================================== + # Event bindings + # ====================================================== + generate_btn.click( + fn=generate_and_evaluate_data, + inputs=[ + system_prompt_input, + user_prompt_input, + temp_dir_state, + reference_input, + model_select, + ], + outputs=[ + output_df, + download_csv, + evaluation_df, + generated_state, + images_gallery, + ], + ) - reference_input.change( - fn=display_reference_csv, - inputs=[reference_input], - outputs=[reference_display] - ) + reference_input.change( + fn=display_reference_csv, + inputs=[reference_input], + outputs=[reference_display], + ) -demo.launch(debug=True) + demo.launch(debug=True) + + +if __name__ == "__main__": + main()