diff --git a/week3/community-contributions/week3_exercise_solution-Stephen.ipynb b/week3/community-contributions/week3_exercise_solution-Stephen.ipynb new file mode 100644 index 0000000..bbc99e7 --- /dev/null +++ b/week3/community-contributions/week3_exercise_solution-Stephen.ipynb @@ -0,0 +1,216 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c58e628f", + "metadata": {}, + "source": [ + "\n", + "## **Week 3 task.**\n", + "Create your own tool that generates synthetic data/test data. Input the type of dataset or products or job postings, etc. and let the tool dream up various data samples.\n", + "\n", + "https://colab.research.google.com/drive/13wR4Blz3Ot_x0GOpflmvvFffm5XU3Kct?usp=sharing" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "0ddde9ed", + "metadata": {}, + "outputs": [], + "source": [ + "# imports\n", + "\n", + "import os\n", + "import requests\n", + "import torch\n", + "from IPython.display import Markdown, display, update_display\n", + "from openai import OpenAI\n", + "from huggingface_hub import login\n", + "from huggingface_hub import login\n", + "from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n", + "from dotenv import load_dotenv\n", + "import gradio as gr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cbbc6cc8", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "load_dotenv(override=True)\n", + "\n", + "openai_api_key = os.getenv('OPENAI_API_KEY')\n", + "llama_api_key = \"ollama\"\n", + "\n", + "# hf_token = userdata.get('HF_TOKEN')\n", + "# login(hf_token, add_to_git_credential=True)\n", + "\n", + "\n", + "if openai_api_key:\n", + " print(f\"OpenAI API Key exists and begins {openai_api_key[:8]}\")\n", + "else:\n", + " print(\"OpenAI API Key not set\")\n", + "\n", + "if llama_api_key:\n", + " print(f\"LLama API Key exists\")\n", + "else:\n", + " print(\"LLama API Key not set\")\n", + " \n", + "GPT_MODEL = \"gpt-4.1-mini\"\n", + "LLAMA_MODEL = \"llama3.1\"\n", + "\n", + "\n", + "openai = OpenAI()\n", + "\n", + "llama_url = \"http://localhost:11434/v1\"\n", + "llama = OpenAI(api_key=llama_api_key, base_url=llama_url)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "ef083ec6", + "metadata": {}, + "outputs": [], + "source": [ + "def generate_with_gpt(user_prompt: str, num_samples: int = 5):\n", + " \"\"\"\n", + " Generates synthetic data using OpenAI's GPT.\n", + " Return a JSON string.\n", + " \"\"\"\n", + " if not openai:\n", + " return json.dumps({\"error\": \"OpenAI client not initialized. Please check your API key.\"}, indent=2)\n", + "\n", + " try:\n", + " response = openai.chat.completions.create(\n", + " model=GPT_MODEL,\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": f\"You are a data generation assistant. Generate a JSON array of exactly {num_samples} objects based on the user's request. The output must be valid JSON only, without any other text or formatting.\"},\n", + " {\"role\": \"user\", \"content\": user_prompt}\n", + " ],\n", + " response_format={\"type\": \"json_object\"}\n", + " )\n", + " \n", + " json_text = response.choices[0].message.content\n", + " return json_text\n", + " except APIError as e:\n", + " return json.dumps({\"error\": f\"Error from OpenAI API: {e.body}\"}, indent=2)\n", + " except Exception as e:\n", + " return json.dumps({\"error\": f\"An unexpected error occurred: {e}\"}, indent=2)\n", + "\n", + "def generate_with_gpt(user_prompt: str, num_samples: int = 5):\n", + " \"\"\"\n", + " Generates synthetic data using OpenAI's GPT.\n", + " Return a JSON string.\n", + " \"\"\"\n", + " if not openai:\n", + " return json.dumps({\"error\": \"OpenAI client not initialized. Please check your API key.\"}, indent=2)\n", + "\n", + " try:\n", + " response = openai.chat.completions.create(\n", + " model=GPT_MODEL,\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": f\"You are a data generation assistant. Generate a JSON array of exactly {num_samples} objects based on the user's request. The output must be valid JSON only, without any other text or formatting.\"},\n", + " {\"role\": \"user\", \"content\": user_prompt}\n", + " ],\n", + " response_format={\"type\": \"json_object\"}\n", + " )\n", + " \n", + " json_text = response.choices[0].message.content\n", + " return json_text\n", + " except APIError as e:\n", + " return json.dumps({\"error\": f\"Error from OpenAI API: {e.body}\"}, indent=2)\n", + " except Exception as e:\n", + " return json.dumps({\"error\": f\"An unexpected error occurred: {e}\"}, indent=2)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "b98f84d8", + "metadata": {}, + "outputs": [], + "source": [ + "def generate_data(user_prompt, model_choice):\n", + " \"\"\"\n", + " Wrapper function that calls the appropriate generation function based on model choice.\n", + " \"\"\"\n", + " if not user_prompt:\n", + " return json.dumps({\"error\": \"Please provide a description for the data.\"}, indent=2)\n", + "\n", + " if model_choice == f\"Hugging Face ({LLAMA_MODEL})\":\n", + " return generate_with_llama(user_prompt)\n", + " elif model_choice == f\"OpenAI ({GPT_MODEL})\":\n", + " return generate_with_gpt(user_prompt)\n", + " else:\n", + " return json.dumps({\"error\": \"Invalid model choice.\"}, indent=2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "adbc19a8", + "metadata": {}, + "outputs": [], + "source": [ + "# Gradio UI\n", + "with gr.Blocks(theme=gr.themes.Glass(), title=\"Synthetic Data Generator\") as ui:\n", + " gr.Markdown(\"# Synthetic Data Generator\")\n", + " gr.Markdown(\"Describe the type of data you need, select a model, and click 'Generate'.\")\n", + "\n", + " with gr.Row():\n", + " with gr.Column(scale=3):\n", + " data_prompt = gr.Textbox(\n", + " lines=5,\n", + " label=\"Data Prompt\",\n", + " placeholder=\"e.g., a list of customer profiles with name, email, and a favorite product\"\n", + " )\n", + " \n", + " with gr.Column(scale=1):\n", + " model_choice = gr.Radio(\n", + " [f\"Hugging Face ({LLAMA_MODEL})\", f\"OpenAI ({GPT_MODEL})\"],\n", + " label=\"Choose a Model\",\n", + " value=f\"Hugging Face ({LLAMA_MODEL})\"\n", + " )\n", + " \n", + " generate_btn = gr.Button(\"Generate Data\")\n", + " \n", + " with gr.Row():\n", + " output_json = gr.JSON(label=\"Generated Data\")\n", + " \n", + " generate_btn.click(\n", + " fn=generate_data,\n", + " inputs=[data_prompt, model_choice],\n", + " outputs=output_json\n", + " )\n", + "\n", + "ui.launch(inbrowser=True, debug=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}