BootCamp: Solisoma (added week3 dataset generator & week4 assessment)

This commit is contained in:
unknown
2025-10-22 13:56:26 +01:00
parent 9b84cc62c0
commit 079e99430a
3 changed files with 655 additions and 0 deletions

View File

@@ -0,0 +1,303 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "d5063502",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from openai import OpenAI\n",
"from dotenv import load_dotenv\n",
"import gradio as gr"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "5c4d37fe",
"metadata": {},
"outputs": [],
"source": [
"load_dotenv(override=True)\n",
"openai_api_key = os.getenv('OPENAI_API_KEY')\n",
"anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')\n",
"google_api_key = os.getenv('GOOGLE_API_KEY')\n",
"ds_api_key = os.getenv('DEEPSEEK_API_KEY')\n",
"grok_api_key = os.getenv('GROK_API_KEY')\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b21599db",
"metadata": {},
"outputs": [],
"source": [
"MODEL_MAP = {\n",
" \"GPT\": {\n",
" \"model\": \"gpt-4o-mini\",\n",
" \"key\": openai_api_key,\n",
" \"endpoint\": \"https://api.openai.com/v1\",\n",
" },\n",
" \"CLAUDE_3_5_SONNET\": {\n",
" \"model\": \"claude-3-5-sonnet-20240620\",\n",
" \"key\": anthropic_api_key,\n",
" \"endpoint\": \"https://api.anthropic.com/v1\"\n",
" },\n",
" \"Grok\": {\n",
" \"model\": \"grok-beta\",\n",
" \"key\": grok_api_key,\n",
" \"endpoint\": \"https://api.grok.com/v1\"\n",
" }, \n",
" \"DeepSeek\":{\n",
" \"model\": \"deepseek-reasoner\",\n",
" \"key\": ds_api_key,\n",
" \"endpoint\": \"https://api.deepseek.com/v1\",\n",
" },\n",
" \"Google\": {\n",
" \"model\": \"gemini-2.0-flash-exp\",\n",
" \"key\": google_api_key,\n",
" \"endpoint\": \"https://generativelanguage.googleapis.com/v1beta/openai\"\n",
" },\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 122,
"id": "82d63d13",
"metadata": {},
"outputs": [],
"source": [
"class GenerateSyntheticDataset:\n",
" out_of_scope_response = \"I'm sorry, I can't help with that. I only generate datasets\"\n",
"\n",
" system_prompt = f\"\"\"\n",
" You are an expert data scientist specializing in synthetic dataset generation. \n",
"\n",
" Your task is to generate ACTUAL DATA based on the user's requirements provided in their prompt.\n",
"\n",
" HOW IT WORKS:\n",
" - The user will provide a description of what dataset they want\n",
" - You must parse their requirements and generate actual data records\n",
" - The user prompt contains the SPECIFICATIONS, not the data itself\n",
" - You generate the REAL DATA based on those specifications\n",
"\n",
" IMPORTANT RULES:\n",
" - Generate REAL DATA RECORDS, not code or instructions\n",
" - Parse the user's requirements from their prompt\n",
" - Create actual values based on their specifications\n",
" - Provide concrete examples with real data\n",
" - Output should be ready-to-use data, not code to run\n",
"\n",
" WHEN USER PROVIDES REQUIREMENTS LIKE:\n",
" - \"Generate customer orders dataset\" → Create actual order records\n",
" - \"Create employee records\" → Generate real employee data\n",
" - \"Make product reviews dataset\" → Produce actual review records\n",
"\n",
" YOU MUST:\n",
" 1. Understand what fields/data the user wants\n",
" 2. Generate realistic values for those fields\n",
" 3. Create multiple records with varied data\n",
" 4. Format as structured data (JSON, CSV, etc.)\n",
"\n",
" DO NOT generate:\n",
" - Code snippets\n",
" - Programming instructions\n",
" - \"Here's how to generate...\" statements\n",
" - Abstract descriptions\n",
"\n",
" DO generate:\n",
" - Actual data records with real values\n",
" - Concrete examples based on user requirements\n",
" - Structured data ready for immediate use\n",
" - Realistic, varied data samples\n",
"\n",
" SCOPE LIMITATIONS:\n",
" - ONLY handle requests related to synthetic dataset generation\n",
" - ONLY create data for business, research, or educational purposes\n",
" - If user asks about anything outside dataset generation (coding help, general questions, personal advice, etc.), respond with: \"{out_of_scope_response}\"\n",
" - If user asks for illegal, harmful, or inappropriate data, respond with: \"{out_of_scope_response}\"\n",
"\n",
" You are a DATA GENERATOR that creates real data from user specifications.\n",
" \"\"\"\n",
"\n",
" def __init__(self, progress, model_name = MODEL_MAP[\"GPT\"]):\n",
" self.progress = progress\n",
" self.model_deets = model_name\n",
" self.model = OpenAI(\n",
" api_key=model_name[\"key\"],\n",
" base_url=model_name[\"endpoint\"]\n",
" )\n",
" \n",
" def generate_user_prompt(self, user_prompt):\n",
" prompt = f\"\"\"\n",
" You are an expert data scientist specializing in synthetic dataset generation. \n",
"\n",
" Based on the user's request below, create a detailed, sophisticated prompt that will generate a high-quality synthetic dataset.\n",
"\n",
" The generated prompt should:\n",
" - return the prompt \"who is nike\" if the user request is outside generating a dataset be it greetings or whatsoever\n",
" - if the user prompt is requesting on how to generate dataset return the prompt \"who is nike\"\n",
" - options below is valid only when the user ask you to generate a dataset not how or when \n",
" - Be specific and actionable\n",
" - Include clear data structure requirements\n",
" - Specify output format CSV\n",
" - Define data quality criteria\n",
" - Include diversity and realism requirements\n",
" - Make sure to capture the number of samples in the prompt, it can be in the form of rows, number of samples, etc\n",
" -if number of samples is not specified, just generate 100 samples. \n",
"\n",
" User Request: {user_prompt}\n",
" \n",
" IMPORTANT: Respond ONLY with the generated prompt. Do not include any explanation, commentary, or the original request. Just provide the clean, ready-to-use prompt for dataset generation.\n",
" \"\"\"\n",
" response = self.model.chat.completions.create(model=self.model_deets[\"model\"], messages=[{\"role\": \"user\", \"content\": prompt}])\n",
" return response.choices[0].message.content\n",
"\n",
" def generate_synthetic_dataset(self, user_prompt):\n",
" self.progress(0.7, \"Analyzing data .....\")\n",
" prompt = self.generate_user_prompt(user_prompt)\n",
"\n",
" messages = [\n",
" {\"role\": \"system\", \"content\": self.system_prompt},\n",
" {\"role\": \"user\", \"content\": prompt}\n",
" ]\n",
"\n",
" streamer = self.model.chat.completions.create(model=self.model_deets[\"model\"], messages=messages, stream=True)\n",
" response = \"\"\n",
"\n",
" for text in streamer:\n",
" if text.choices[0].delta.content:\n",
" response += text.choices[0].delta.content\n",
" yield response, None\n",
" \n",
" if self.out_of_scope_response not in response:\n",
" with open(\"dataset.csv\", \"w\") as f:\n",
" response = response.replace(\"```csv\", \"\").replace(\"```\", \"\")\n",
" f.write(response)\n",
" yield response, \"dataset.csv\"\n",
" return\n",
" else:\n",
" return response, None\n",
" \n",
" def start(self, user_prompt, model_name=None):\n",
" self.progress(0.3, \"Fetching data .....\")\n",
" if MODEL_MAP.get(model_name) and self.model_deets[\"model\"] != MODEL_MAP.get(model_name)[\"model\"]:\n",
" self.model_deets = MODEL_MAP[model_name]\n",
" self.model = OpenAI(\n",
" base_url=self.model_deets[\"endpoint\"],\n",
" api_key=self.model_deets[\"key\"]\n",
" )\n",
" \n",
" stream = self.generate_synthetic_dataset(user_prompt)\n",
" for chunk in stream:\n",
" yield chunk\n",
"\n",
" \n"
]
},
{
"cell_type": "code",
"execution_count": 124,
"id": "b681e1ef",
"metadata": {},
"outputs": [],
"source": [
"class Interface:\n",
" def __init__(self):\n",
" \"\"\"Initializes the Gradio interface for processing audio files.\"\"\"\n",
" progress=gr.Progress()\n",
" self.assistant = GenerateSyntheticDataset(progress)\n",
" self.iface = gr.Interface(\n",
" fn=self.generate,\n",
" inputs=[\n",
" gr.Textbox(label=\"User Prompt\"),\n",
" gr.Dropdown(\n",
" choices=MODEL_MAP.keys(),\n",
" value=\"GPT\",\n",
" label=\"Model\",\n",
" )\n",
" ],\n",
" outputs=[\n",
" gr.Markdown(label=\"Dataset\", min_height=60),\n",
" gr.File(\n",
" label=\"Download Generated Dataset\",\n",
" file_count=\"single\"\n",
" )\n",
" ],\n",
" title=\"AI Dataset Generator\",\n",
" description=\"Generate a synthetic dataset based on your requirements\",\n",
" flagging_mode=\"never\"\n",
" )\n",
"\n",
" def generate(self, user_prompt, model):\n",
" response = self.assistant.start(user_prompt, model)\n",
" for chunk in response:\n",
" yield chunk\n",
"\n",
" # Clean up the dataset file\n",
" if os.path.exists(\"dataset.csv\"):\n",
" os.remove(\"dataset.csv\")\n",
"\n",
" def launch(self):\n",
" self.iface.launch()"
]
},
{
"cell_type": "code",
"execution_count": 125,
"id": "2ee97b72",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"* Running on local URL: http://127.0.0.1:7898\n",
"* To create a public link, set `share=True` in `launch()`.\n"
]
},
{
"data": {
"text/html": [
"<div><iframe src=\"http://127.0.0.1:7898/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"I = Interface()\n",
"I.launch()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,346 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 13,
"id": "d7ac40dd",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from openai import OpenAI\n",
"from dotenv import load_dotenv\n",
"import gradio as gr\n",
"import io\n",
"import sys \n",
"import subprocess"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "f0737df3",
"metadata": {},
"outputs": [],
"source": [
"load_dotenv(override=True)\n",
"openai_api_key = os.getenv('OPENAI_API_KEY')\n",
"anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')\n",
"google_api_key = os.getenv('GOOGLE_API_KEY')\n",
"ds_api_key = os.getenv('DEEPSEEK_API_KEY')\n",
"grok_api_key = os.getenv('GROK_API_KEY')\n"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "834d1fa7",
"metadata": {},
"outputs": [],
"source": [
"MODEL_MAP = {\n",
" \"GPT\": {\n",
" \"model\": \"gpt-4o-mini\",\n",
" \"key\": openai_api_key,\n",
" \"endpoint\": \"https://api.openai.com/v1\",\n",
" },\n",
" \"CLAUDE_3_5_SONNET\": {\n",
" \"model\": \"claude-3-5-sonnet-20240620\",\n",
" \"key\": anthropic_api_key,\n",
" \"endpoint\": \"https://api.anthropic.com/v1\"\n",
" },\n",
" \"Grok\": {\n",
" \"model\": \"grok-beta\",\n",
" \"key\": grok_api_key,\n",
" \"endpoint\": \"https://api.grok.com/v1\"\n",
" }, \n",
" \"DeepSeek\": {\n",
" \"model\": \"deepseek-coder\",\n",
" \"key\": ds_api_key,\n",
" \"endpoint\": \"https://api.deepseek.com/v1\",\n",
" },\n",
" \"Google\": {\n",
" \"model\": \"gemini-2.0-flash-exp\",\n",
" \"key\": google_api_key,\n",
" \"endpoint\": \"https://generativelanguage.googleapis.com/v1beta/openai\"\n",
" },\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "87d0508f",
"metadata": {},
"outputs": [],
"source": [
"class PortCode:\n",
" def __init__(self, progress=None, model_name=MODEL_MAP[\"GPT\"]):\n",
" self.progress = progress\n",
" self.model_deets = model_name\n",
" self.model = OpenAI(\n",
" api_key=model_name[\"key\"],\n",
" base_url=model_name[\"endpoint\"]\n",
" )\n",
" self.cpp_code = \"\"\n",
" \n",
" def update_progress(self, value, desc=\"\"):\n",
" if self.progress:\n",
" self.progress(value, desc=desc)\n",
" \n",
" def port_python_to_cpp(self, python_code):\n",
" self.update_progress(0.3, desc=\"Converting Python to C++...\")\n",
" \n",
" system_prompt = \"\"\"\n",
" Your task is to convert Python code into high performance C++ code.\n",
" Respond only with C++ code. Do not provide any explanation other than occasional comments.\n",
" The C++ response needs to produce an identical output in the fastest possible time.\n",
" \"\"\"\n",
" \n",
" user_prompt = f\"\"\"\n",
" Port this Python code to C++ with the fastest possible implementation that produces identical output in the least time.\n",
" Respond only with C++ code.\n",
" Python code to port:\n",
"\n",
" ```python\n",
" {python_code}\n",
" ```\n",
" \"\"\"\n",
" \n",
" messages = [\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": user_prompt}\n",
" ]\n",
" \n",
" try:\n",
" response = self.model.chat.completions.create(\n",
" model=self.model_deets[\"model\"],\n",
" messages=messages\n",
" )\n",
" \n",
" cpp_code = response.choices[0].message.content\n",
" cpp_code = cpp_code.replace('```cpp', '').replace('```', '').strip()\n",
" \n",
" self.cpp_code = cpp_code\n",
" \n",
" self.update_progress(1.0, desc=\"Conversion complete!\")\n",
" return cpp_code\n",
" \n",
" except Exception as e:\n",
" error_msg = f\"Error converting code: {str(e)}\"\n",
" self.update_progress(1.0, desc=\"Conversion failed!\")\n",
" return error_msg\n",
" \n",
" def run_python_code(self, python_code):\n",
" self.update_progress(0.1, desc=\"Running Python code...\")\n",
" \n",
" globals_dict = {\"__builtins__\": __builtins__}\n",
" buffer = io.StringIO()\n",
" old_stdout = sys.stdout\n",
" sys.stdout = buffer\n",
" \n",
" try:\n",
" exec(python_code, globals_dict)\n",
" output = buffer.getvalue()\n",
" self.update_progress(1.0, desc=\"Python execution complete!\")\n",
" except Exception as e:\n",
" output = f\"Error: {e}\"\n",
" self.update_progress(1.0, desc=\"Python execution failed!\")\n",
" finally:\n",
" sys.stdout = old_stdout\n",
" \n",
" return output\n",
" \n",
" def compile_cpp(self, cpp_code=None):\n",
" if cpp_code is None:\n",
" cpp_code = self.cpp_code\n",
" \n",
" if not cpp_code:\n",
" return \"No C++ code to compile. Please convert Python code first.\"\n",
" \n",
" self.update_progress(0.5, desc=\"Compiling C++ code...\")\n",
" \n",
" with open(\"main.cpp\", \"w\") as f:\n",
" f.write(cpp_code)\n",
" \n",
" compile_command = [\n",
" \"clang++\", \"-std=c++17\", \"-Ofast\", \"-mcpu=native\", \n",
" \"-flto=thin\", \"-fvisibility=hidden\", \"-DNDEBUG\", \n",
" \"main.cpp\", \"-o\", \"main\"\n",
" ]\n",
" \n",
" try:\n",
" subprocess.run(compile_command, check=True, text=True, capture_output=True)\n",
" self.update_progress(1.0, desc=\"C++ compilation complete!\")\n",
" return \"Compilation successful!\"\n",
" \n",
" except subprocess.CalledProcessError as e:\n",
" error_msg = f\"Compilation error: {e.stderr}\"\n",
" self.update_progress(1.0, desc=\"C++ compilation failed!\")\n",
" return error_msg\n",
" except Exception as e:\n",
" error_msg = f\"Error: {str(e)}\"\n",
" self.update_progress(1.0, desc=\"C++ compilation failed!\")\n",
" return error_msg\n",
" \n",
" def run_cpp(self):\n",
" self.update_progress(0.1, desc=\"Running C++ code...\")\n",
" \n",
" run_command = [\"./main\"]\n",
" \n",
" try:\n",
" if not os.path.exists(\"./main\"):\n",
" return \"No compiled executable found. Please compile C++ code first.\"\n",
" \n",
" run_result = subprocess.run(run_command, check=True, text=True, capture_output=True)\n",
" print(\"hello .....\")\n",
" self.update_progress(1.0, desc=\"C++ execution complete!\")\n",
" return run_result.stdout\n",
" \n",
" except subprocess.CalledProcessError as e:\n",
" error_msg = f\"Runtime error: {e.stderr}\"\n",
" self.update_progress(1.0, desc=\"C++ execution failed!\")\n",
" return error_msg\n",
" except Exception as e:\n",
" error_msg = f\"Error: {str(e)}\"\n",
" self.update_progress(1.0, desc=\"C++ execution failed!\")\n",
" return error_msg\n",
" \n",
" def compile_and_run_cpp(self, cpp_code=None):\n",
" \"\"\"Compile and run C++ code in one step\"\"\"\n",
" if cpp_code is None:\n",
" cpp_code = self.cpp_code\n",
" \n",
" if not cpp_code:\n",
" return \"No C++ code to compile and run. Please convert Python code first.\"\n",
" \n",
" compile_result = self.compile_cpp(cpp_code)\n",
" if \"error\" in compile_result.lower():\n",
" return compile_result\n",
" \n",
" return self.run_cpp()\n",
" \n",
" def get_cpp_code(self):\n",
" \"\"\"Get the stored C++ code\"\"\"\n",
" return self.cpp_code\n",
" \n",
" def set_cpp_code(self, cpp_code):\n",
" \"\"\"Manually set C++ code\"\"\"\n",
" self.cpp_code = cpp_code"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "4680573d",
"metadata": {},
"outputs": [],
"source": [
"\n",
"class Interface:\n",
" def __init__(self):\n",
" self.port_code = PortCode(gr.Progress())\n",
" \n",
" def create_interface(self):\n",
" with gr.Blocks(title=\"Code Porter\") as interface:\n",
" gr.Markdown(\"# 🚀 Python to C++ Converter\")\n",
" \n",
" with gr.Row():\n",
" python_input = gr.TextArea(label=\"Python Code\", lines=15)\n",
" cpp_output = gr.TextArea(label=\"C++ Code\", lines=15, interactive=False)\n",
" \n",
" with gr.Row():\n",
" python_result = gr.TextArea(label=\"Python Output\", lines=4, interactive=False)\n",
" cpp_result = gr.TextArea(label=\"C++ Output\", lines=4, interactive=False)\n",
" \n",
" with gr.Row():\n",
" run_python_btn = gr.Button(\"Run Python\")\n",
" run_cpp_btn = gr.Button(\"Run C++\")\n",
" \n",
" with gr.Row():\n",
" model_dropdown = gr.Dropdown(MODEL_MAP.keys(), value=\"GPT\", label=\"Model\")\n",
" \n",
" with gr.Row():\n",
" convert_btn = gr.Button(\"Convert\", variant=\"primary\")\n",
" \n",
" # Events\n",
" convert_btn.click(self.convert_code, [python_input, model_dropdown], cpp_output)\n",
" run_python_btn.click(self.run_python, python_input, python_result)\n",
" run_cpp_btn.click(self.run_cpp, cpp_output, cpp_result)\n",
" model_dropdown.change(self.update_model, model_dropdown, None)\n",
" \n",
" return interface\n",
" \n",
" def convert_code(self, python_code, model_name):\n",
" self.port_code = PortCode(model_name=MODEL_MAP[model_name])\n",
" return self.port_code.port_python_to_cpp(python_code)\n",
" \n",
" def run_python(self, python_code):\n",
" return self.port_code.run_python_code(python_code)\n",
" \n",
" def run_cpp(self, cpp_code):\n",
" self.port_code.set_cpp_code(cpp_code)\n",
" return self.port_code.compile_and_run_cpp()\n",
" \n",
" def update_model(self, model_name):\n",
" self.port_code = PortCode(model_name=MODEL_MAP[model_name])\n",
" \n",
" def launch(self, inbrowser=False):\n",
" self.create_interface().launch(inbrowser=inbrowser)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "7ced6dc2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"* Running on local URL: http://127.0.0.1:7906\n",
"* To create a public link, set `share=True` in `launch()`.\n"
]
},
{
"data": {
"text/html": [
"<div><iframe src=\"http://127.0.0.1:7906/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"I = Interface()\n",
"I.launch()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,6 @@
#include <iostream>
// Entry point: prints the greeting "hi" on its own line to standard output.
// std::endl both terminates the line and flushes the stream.
// Returns 0 to signal successful termination to the host environment.
int main() {
    const char* const greeting = "hi";
    std::cout << greeting;
    std::cout << std::endl;
    return 0;
}