diff --git a/week3/community-contributions/solisoma/synthetic_dataset_generator.ipynb b/week3/community-contributions/solisoma/synthetic_dataset_generator.ipynb new file mode 100644 index 0000000..f7f0a8d --- /dev/null +++ b/week3/community-contributions/solisoma/synthetic_dataset_generator.ipynb @@ -0,0 +1,303 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "d5063502", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from openai import OpenAI\n", + "from dotenv import load_dotenv\n", + "import gradio as gr" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "5c4d37fe", + "metadata": {}, + "outputs": [], + "source": [ + "load_dotenv(override=True)\n", + "openai_api_key = os.getenv('OPENAI_API_KEY')\n", + "anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')\n", + "google_api_key = os.getenv('GOOGLE_API_KEY')\n", + "ds_api_key = os.getenv('DEEPSEEK_API_KEY')\n", + "grok_api_key = os.getenv('GROK_API_KEY')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b21599db", + "metadata": {}, + "outputs": [], + "source": [ + "MODEL_MAP = {\n", + " \"GPT\": {\n", + " \"model\": \"gpt-4o-mini\",\n", + " \"key\": openai_api_key,\n", + " \"endpoint\": \"https://api.openai.com/v1\",\n", + " },\n", + " \"CLAUDE_3_5_SONNET\": {\n", + " \"model\": \"claude-3-5-sonnet-20240620\",\n", + " \"key\": anthropic_api_key,\n", + " \"endpoint\": \"https://api.anthropic.com/v1\"\n", + " },\n", + " \"Grok\": {\n", + " \"model\": \"grok-beta\",\n", + " \"key\": grok_api_key,\n", + " \"endpoint\": \"https://api.grok.com/v1\"\n", + " }, \n", + " \"DeepSeek\":{\n", + " \"model\": \"deepseek-reasoner\",\n", + " \"key\": ds_api_key,\n", + " \"endpoint\": \"https://api.deepseek.com/v1\",\n", + " },\n", + " \"Google\": {\n", + " \"model\": \"gemini-2.0-flash-exp\",\n", + " \"key\": google_api_key,\n", + " \"endpoint\": \"https://generativelanguage.googleapis.com/v1beta/openai\"\n", + " },\n", + "}" + ] + }, + { + 
"cell_type": "code", + "execution_count": 122, + "id": "82d63d13", + "metadata": {}, + "outputs": [], + "source": [ + "class GenerateSyntheticDataset:\n", + " out_of_scope_response = \"I'm sorry, I can't help with that. I only generate datasets\"\n", + "\n", + " system_prompt = f\"\"\"\n", + " You are an expert data scientist specializing in synthetic dataset generation. \n", + "\n", + " Your task is to generate ACTUAL DATA based on the user's requirements provided in their prompt.\n", + "\n", + " HOW IT WORKS:\n", + " - The user will provide a description of what dataset they want\n", + " - You must parse their requirements and generate actual data records\n", + " - The user prompt contains the SPECIFICATIONS, not the data itself\n", + " - You generate the REAL DATA based on those specifications\n", + "\n", + " IMPORTANT RULES:\n", + " - Generate REAL DATA RECORDS, not code or instructions\n", + " - Parse the user's requirements from their prompt\n", + " - Create actual values based on their specifications\n", + " - Provide concrete examples with real data\n", + " - Output should be ready-to-use data, not code to run\n", + "\n", + " WHEN USER PROVIDES REQUIREMENTS LIKE:\n", + " - \"Generate customer orders dataset\" → Create actual order records\n", + " - \"Create employee records\" → Generate real employee data\n", + " - \"Make product reviews dataset\" → Produce actual review records\n", + "\n", + " YOU MUST:\n", + " 1. Understand what fields/data the user wants\n", + " 2. Generate realistic values for those fields\n", + " 3. Create multiple records with varied data\n", + " 4. 
Format as structured data (JSON, CSV, etc.)\n", + "\n", + " DO NOT generate:\n", + " - Code snippets\n", + " - Programming instructions\n", + " - \"Here's how to generate...\" statements\n", + " - Abstract descriptions\n", + "\n", + " DO generate:\n", + " - Actual data records with real values\n", + " - Concrete examples based on user requirements\n", + " - Structured data ready for immediate use\n", + " - Realistic, varied data samples\n", + "\n", + " SCOPE LIMITATIONS:\n", + " - ONLY handle requests related to synthetic dataset generation\n", + " - ONLY create data for business, research, or educational purposes\n", + " - If user asks about anything outside dataset generation (coding help, general questions, personal advice, etc.), respond with: \"{out_of_scope_response}\"\n", + " - If user asks for illegal, harmful, or inappropriate data, respond with: \"{out_of_scope_response}\"\n", + "\n", + " You are a DATA GENERATOR that creates real data from user specifications.\n", + " \"\"\"\n", + "\n", + " def __init__(self, progress, model_name = MODEL_MAP[\"GPT\"]):\n", + " self.progress = progress\n", + " self.model_deets = model_name\n", + " self.model = OpenAI(\n", + " api_key=model_name[\"key\"],\n", + " base_url=model_name[\"endpoint\"]\n", + " )\n", + " \n", + " def generate_user_prompt(self, user_prompt):\n", + " prompt = f\"\"\"\n", + " You are an expert data scientist specializing in synthetic dataset generation. 
\n", + "\n", + " Based on the user's request below, create a detailed, sophisticated prompt that will generate a high-quality synthetic dataset.\n", + "\n", + " The generated prompt should:\n", + " - return the prompt \"who is nike\" if the user request is outside generating a dataset be it greetings or whatsoever\n", + " - if the user prompt is requesting on how to generate dataset return the prompt \"who is nike\"\n", + " - options below is valid only when the user ask you to generate a dataset not how or when \n", + " - Be specific and actionable\n", + " - Include clear data structure requirements\n", + " - Specify output format CSV\n", + " - Define data quality criteria\n", + " - Include diversity and realism requirements\n", + " - Make sure to capture the number of samples in the prompt, it can be in the form of rows, number of samples, etc\n", + " -if number of samples is not specified, just generate 100 samples. \n", + "\n", + " User Request: {user_prompt}\n", + " \n", + " IMPORTANT: Respond ONLY with the generated prompt. Do not include any explanation, commentary, or the original request. 
Just provide the clean, ready-to-use prompt for dataset generation.\n", + " \"\"\"\n", + " response = self.model.chat.completions.create(model=self.model_deets[\"model\"], messages=[{\"role\": \"user\", \"content\": prompt}])\n", + " return response.choices[0].message.content\n", + "\n", + " def generate_synthetic_dataset(self, user_prompt):\n", + " self.progress(0.7, \"Analyzing data .....\")\n", + " prompt = self.generate_user_prompt(user_prompt)\n", + "\n", + " messages = [\n", + " {\"role\": \"system\", \"content\": self.system_prompt},\n", + " {\"role\": \"user\", \"content\": prompt}\n", + " ]\n", + "\n", + " streamer = self.model.chat.completions.create(model=self.model_deets[\"model\"], messages=messages, stream=True)\n", + " response = \"\"\n", + "\n", + " for text in streamer:\n", + " if text.choices[0].delta.content:\n", + " response += text.choices[0].delta.content\n", + " yield response, None\n", + " \n", + " if self.out_of_scope_response not in response:\n", + " with open(\"dataset.csv\", \"w\") as f:\n", + " response = response.replace(\"```csv\", \"\").replace(\"```\", \"\")\n", + " f.write(response)\n", + " yield response, \"dataset.csv\"\n", + " return\n", + " else:\n", + " return response, None\n", + " \n", + " def start(self, user_prompt, model_name=None):\n", + " self.progress(0.3, \"Fetching data .....\")\n", + " if MODEL_MAP.get(model_name) and self.model_deets[\"model\"] != MODEL_MAP.get(model_name)[\"model\"]:\n", + " self.model_deets = MODEL_MAP[model_name]\n", + " self.model = OpenAI(\n", + " base_url=self.model_deets[\"endpoint\"],\n", + " api_key=self.model_deets[\"key\"]\n", + " )\n", + " \n", + " stream = self.generate_synthetic_dataset(user_prompt)\n", + " for chunk in stream:\n", + " yield chunk\n", + "\n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": 124, + "id": "b681e1ef", + "metadata": {}, + "outputs": [], + "source": [ + "class Interface:\n", + " def __init__(self):\n", + " \"\"\"Initializes the Gradio 
interface for generating synthetic datasets.\"\"\"\n", + " progress=gr.Progress()\n", + " self.assistant = GenerateSyntheticDataset(progress)\n", + " self.iface = gr.Interface(\n", + " fn=self.generate,\n", + " inputs=[\n", + " gr.Textbox(label=\"User Prompt\"),\n", + " gr.Dropdown(\n", + " choices=MODEL_MAP.keys(),\n", + " value=\"GPT\",\n", + " label=\"Model\",\n", + " )\n", + " ],\n", + " outputs=[\n", + " gr.Markdown(label=\"Dataset\", min_height=60),\n", + " gr.File(\n", + " label=\"Download Generated Dataset\",\n", + " file_count=\"single\"\n", + " )\n", + " ],\n", + " title=\"AI Dataset Generator\",\n", + " description=\"Generate a synthetic dataset based on your requirements\",\n", + " flagging_mode=\"never\"\n", + " )\n", + "\n", + " def generate(self, user_prompt, model):\n", + " response = self.assistant.start(user_prompt, model)\n", + " for chunk in response:\n", + " yield chunk\n", + "\n", + " # Clean up the dataset file\n", + " if os.path.exists(\"dataset.csv\"):\n", + " os.remove(\"dataset.csv\")\n", + "\n", + " def launch(self):\n", + " self.iface.launch()" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "id": "2ee97b72", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "* Running on local URL: http://127.0.0.1:7898\n", + "* To create a public link, set `share=True` in `launch()`.\n" + ] + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "I = Interface()\n", + "I.launch()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/week4/community-contributions/solisoma/end_of_week_assesment.ipynb b/week4/community-contributions/solisoma/end_of_week_assesment.ipynb new file mode 100644 index 0000000..ac4670e --- /dev/null +++ b/week4/community-contributions/solisoma/end_of_week_assesment.ipynb @@ -0,0 +1,346 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 13, + "id": "d7ac40dd", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from openai import OpenAI\n", + "from dotenv import load_dotenv\n", + "import gradio as gr\n", + "import io\n", + "import sys \n", + "import subprocess" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "f0737df3", + "metadata": {}, + "outputs": [], + "source": [ + "load_dotenv(override=True)\n", + "openai_api_key = os.getenv('OPENAI_API_KEY')\n", + "anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')\n", + "google_api_key = os.getenv('GOOGLE_API_KEY')\n", + "ds_api_key = os.getenv('DEEPSEEK_API_KEY')\n", + "grok_api_key = os.getenv('GROK_API_KEY')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "834d1fa7", + "metadata": {}, + "outputs": [], + "source": [ + "MODEL_MAP = {\n", + " \"GPT\": {\n", + " \"model\": \"gpt-4o-mini\",\n", + " \"key\": openai_api_key,\n", + " \"endpoint\": \"https://api.openai.com/v1\",\n", + " },\n", + " \"CLAUDE_3_5_SONNET\": {\n", + " \"model\": \"claude-3-5-sonnet-20240620\",\n", + " 
\"key\": anthropic_api_key,\n", + " \"endpoint\": \"https://api.anthropic.com/v1\"\n", + " },\n", + " \"Grok\": {\n", + " \"model\": \"grok-beta\",\n", + " \"key\": grok_api_key,\n", + " \"endpoint\": \"https://api.grok.com/v1\"\n", + " }, \n", + " \"DeepSeek\": {\n", + " \"model\": \"deepseek-coder\",\n", + " \"key\": ds_api_key,\n", + " \"endpoint\": \"https://api.deepseek.com/v1\",\n", + " },\n", + " \"Google\": {\n", + " \"model\": \"gemini-2.0-flash-exp\",\n", + " \"key\": google_api_key,\n", + " \"endpoint\": \"https://generativelanguage.googleapis.com/v1beta/openai\"\n", + " },\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "87d0508f", + "metadata": {}, + "outputs": [], + "source": [ + "class PortCode:\n", + " def __init__(self, progress=None, model_name=MODEL_MAP[\"GPT\"]):\n", + " self.progress = progress\n", + " self.model_deets = model_name\n", + " self.model = OpenAI(\n", + " api_key=model_name[\"key\"],\n", + " base_url=model_name[\"endpoint\"]\n", + " )\n", + " self.cpp_code = \"\"\n", + " \n", + " def update_progress(self, value, desc=\"\"):\n", + " if self.progress:\n", + " self.progress(value, desc=desc)\n", + " \n", + " def port_python_to_cpp(self, python_code):\n", + " self.update_progress(0.3, desc=\"Converting Python to C++...\")\n", + " \n", + " system_prompt = \"\"\"\n", + " Your task is to convert Python code into high performance C++ code.\n", + " Respond only with C++ code. 
Do not provide any explanation other than occasional comments.\n", + " The C++ response needs to produce an identical output in the fastest possible time.\n", + " \"\"\"\n", + " \n", + " user_prompt = f\"\"\"\n", + " Port this Python code to C++ with the fastest possible implementation that produces identical output in the least time.\n", + " Respond only with C++ code.\n", + " Python code to port:\n", + "\n", + " ```python\n", + " {python_code}\n", + " ```\n", + " \"\"\"\n", + " \n", + " messages = [\n", + " {\"role\": \"system\", \"content\": system_prompt},\n", + " {\"role\": \"user\", \"content\": user_prompt}\n", + " ]\n", + " \n", + " try:\n", + " response = self.model.chat.completions.create(\n", + " model=self.model_deets[\"model\"],\n", + " messages=messages\n", + " )\n", + " \n", + " cpp_code = response.choices[0].message.content\n", + " cpp_code = cpp_code.replace('```cpp', '').replace('```', '').strip()\n", + " \n", + " self.cpp_code = cpp_code\n", + " \n", + " self.update_progress(1.0, desc=\"Conversion complete!\")\n", + " return cpp_code\n", + " \n", + " except Exception as e:\n", + " error_msg = f\"Error converting code: {str(e)}\"\n", + " self.update_progress(1.0, desc=\"Conversion failed!\")\n", + " return error_msg\n", + " \n", + " def run_python_code(self, python_code):\n", + " self.update_progress(0.1, desc=\"Running Python code...\")\n", + " \n", + " globals_dict = {\"__builtins__\": __builtins__}\n", + " buffer = io.StringIO()\n", + " old_stdout = sys.stdout\n", + " sys.stdout = buffer\n", + " \n", + " try:\n", + " exec(python_code, globals_dict)\n", + " output = buffer.getvalue()\n", + " self.update_progress(1.0, desc=\"Python execution complete!\")\n", + " except Exception as e:\n", + " output = f\"Error: {e}\"\n", + " self.update_progress(1.0, desc=\"Python execution failed!\")\n", + " finally:\n", + " sys.stdout = old_stdout\n", + " \n", + " return output\n", + " \n", + " def compile_cpp(self, cpp_code=None):\n", + " if cpp_code is 
None:\n", + " cpp_code = self.cpp_code\n", + " \n", + " if not cpp_code:\n", + " return \"No C++ code to compile. Please convert Python code first.\"\n", + " \n", + " self.update_progress(0.5, desc=\"Compiling C++ code...\")\n", + " \n", + " with open(\"main.cpp\", \"w\") as f:\n", + " f.write(cpp_code)\n", + " \n", + " compile_command = [\n", + " \"clang++\", \"-std=c++17\", \"-Ofast\", \"-mcpu=native\", \n", + " \"-flto=thin\", \"-fvisibility=hidden\", \"-DNDEBUG\", \n", + " \"main.cpp\", \"-o\", \"main\"\n", + " ]\n", + " \n", + " try:\n", + " subprocess.run(compile_command, check=True, text=True, capture_output=True)\n", + " self.update_progress(1.0, desc=\"C++ compilation complete!\")\n", + " return \"Compilation successful!\"\n", + " \n", + " except subprocess.CalledProcessError as e:\n", + " error_msg = f\"Compilation error: {e.stderr}\"\n", + " self.update_progress(1.0, desc=\"C++ compilation failed!\")\n", + " return error_msg\n", + " except Exception as e:\n", + " error_msg = f\"Error: {str(e)}\"\n", + " self.update_progress(1.0, desc=\"C++ compilation failed!\")\n", + " return error_msg\n", + " \n", + " def run_cpp(self):\n", + " self.update_progress(0.1, desc=\"Running C++ code...\")\n", + " \n", + " run_command = [\"./main\"]\n", + " \n", + " try:\n", + " if not os.path.exists(\"./main\"):\n", + " return \"No compiled executable found. 
Please compile C++ code first.\"\n", + " \n", + " run_result = subprocess.run(run_command, check=True, text=True, capture_output=True)\n", + " print(\"hello .....\")\n", + " self.update_progress(1.0, desc=\"C++ execution complete!\")\n", + " return run_result.stdout\n", + " \n", + " except subprocess.CalledProcessError as e:\n", + " error_msg = f\"Runtime error: {e.stderr}\"\n", + " self.update_progress(1.0, desc=\"C++ execution failed!\")\n", + " return error_msg\n", + " except Exception as e:\n", + " error_msg = f\"Error: {str(e)}\"\n", + " self.update_progress(1.0, desc=\"C++ execution failed!\")\n", + " return error_msg\n", + " \n", + " def compile_and_run_cpp(self, cpp_code=None):\n", + " \"\"\"Compile and run C++ code in one step\"\"\"\n", + " if cpp_code is None:\n", + " cpp_code = self.cpp_code\n", + " \n", + " if not cpp_code:\n", + " return \"No C++ code to compile and run. Please convert Python code first.\"\n", + " \n", + " compile_result = self.compile_cpp(cpp_code)\n", + " if \"error\" in compile_result.lower():\n", + " return compile_result\n", + " \n", + " return self.run_cpp()\n", + " \n", + " def get_cpp_code(self):\n", + " \"\"\"Get the stored C++ code\"\"\"\n", + " return self.cpp_code\n", + " \n", + " def set_cpp_code(self, cpp_code):\n", + " \"\"\"Manually set C++ code\"\"\"\n", + " self.cpp_code = cpp_code" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "4680573d", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "class Interface:\n", + " def __init__(self):\n", + " self.port_code = PortCode(gr.Progress())\n", + " \n", + " def create_interface(self):\n", + " with gr.Blocks(title=\"Code Porter\") as interface:\n", + " gr.Markdown(\"# 🚀 Python to C++ Converter\")\n", + " \n", + " with gr.Row():\n", + " python_input = gr.TextArea(label=\"Python Code\", lines=15)\n", + " cpp_output = gr.TextArea(label=\"C++ Code\", lines=15, interactive=False)\n", + " \n", + " with gr.Row():\n", + " python_result = 
gr.TextArea(label=\"Python Output\", lines=4, interactive=False)\n", + " cpp_result = gr.TextArea(label=\"C++ Output\", lines=4, interactive=False)\n", + " \n", + " with gr.Row():\n", + " run_python_btn = gr.Button(\"Run Python\")\n", + " run_cpp_btn = gr.Button(\"Run C++\")\n", + " \n", + " with gr.Row():\n", + " model_dropdown = gr.Dropdown(MODEL_MAP.keys(), value=\"GPT\", label=\"Model\")\n", + " \n", + " with gr.Row():\n", + " convert_btn = gr.Button(\"Convert\", variant=\"primary\")\n", + " \n", + " # Events\n", + " convert_btn.click(self.convert_code, [python_input, model_dropdown], cpp_output)\n", + " run_python_btn.click(self.run_python, python_input, python_result)\n", + " run_cpp_btn.click(self.run_cpp, cpp_output, cpp_result)\n", + " model_dropdown.change(self.update_model, model_dropdown, None)\n", + " \n", + " return interface\n", + " \n", + " def convert_code(self, python_code, model_name):\n", + " self.port_code = PortCode(model_name=MODEL_MAP[model_name])\n", + " return self.port_code.port_python_to_cpp(python_code)\n", + " \n", + " def run_python(self, python_code):\n", + " return self.port_code.run_python_code(python_code)\n", + " \n", + " def run_cpp(self, cpp_code):\n", + " self.port_code.set_cpp_code(cpp_code)\n", + " return self.port_code.compile_and_run_cpp()\n", + " \n", + " def update_model(self, model_name):\n", + " self.port_code = PortCode(model_name=MODEL_MAP[model_name])\n", + " \n", + " def launch(self, inbrowser=False):\n", + " self.create_interface().launch(inbrowser=inbrowser)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "7ced6dc2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "* Running on local URL: http://127.0.0.1:7906\n", + "* To create a public link, set `share=True` in `launch()`.\n" + ] + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "I = Interface()\n", + "I.launch()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/week4/community-contributions/solisoma/main.cpp b/week4/community-contributions/solisoma/main.cpp new file mode 100644 index 0000000..fc5beb2 --- /dev/null +++ b/week4/community-contributions/solisoma/main.cpp @@ -0,0 +1,6 @@ +#include <iostream> + +int main() { + std::cout << "hi" << std::endl; + return 0; +} \ No newline at end of file