{ "cells": [ { "cell_type": "markdown", "id": "e568e8cc", "metadata": {}, "source": [ "# Synthetic Data Generator\n", "\n", "Tool for generating sample synthetic data using a local Llama model" ] }, { "cell_type": "code", "execution_count": 40, "id": "4191b928", "metadata": {}, "outputs": [], "source": [ "# imports \n", "\n", "from openai import OpenAI\n", "import json\n" ] }, { "cell_type": "code", "execution_count": 41, "id": "93d63879", "metadata": {}, "outputs": [], "source": [ "openai = OpenAI(base_url='http://localhost:11434/v1', api_key='ollama')" ] }, { "cell_type": "code", "execution_count": 42, "id": "0b9821dc", "metadata": {}, "outputs": [], "source": [ "# model\n", "\n", "MODEL = \"llama3.2\"" ] }, { "cell_type": "code", "execution_count": 43, "id": "5fe77aa5", "metadata": {}, "outputs": [], "source": [ "def generate_synthetic_data(user_prompt = (\n", " \"Generate 5 realistic customer reviews for a product. \"\n", " \"The review should be 1-2 sentences long and contain a mix of positive and negative comments. \"\n", " \"The review should be formatted as a JSON object with the following fields: \"\n", " \"review: a string containing the review text\"\n", " )):\n", " \n", " system_message = (\n", " \"You are a helpful assistant that generates synthetic data.\"\n", " )\n", " response = openai.chat.completions.create(\n", " model=MODEL,\n", " messages=[\n", " {\"role\": \"system\", \"content\": system_message},\n", " {\"role\": \"user\", \"content\": user_prompt}\n", " ],\n", " response_format={\"type\": \"json_object\"}\n", " )\n", " result = json.loads(response.choices[0].message.content)\n", " return result\n", " " ] }, { "cell_type": "code", "execution_count": 44, "id": "047309d4", "metadata": {}, "outputs": [], "source": [ "result = generate_synthetic_data()\n", "\n", "formatted_json_result = json.dumps(result, indent=4)" ] }, { "cell_type": "code", "execution_count": 45, "id": "07124b11", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"review1\": \"I'm really impressed with how easy the setup was for this product! It only took me about 10 minutes to get everything up and running.\",\n", " \"review2\": \"The quality of the material is top-notch, but I've noticed a few scratches after using it for a week.\",\n", " \"review3\": \"I was skeptical at first, but this product has truly exceeded my expectations - it's even more functional than I thought it would be!\",\n", " \"review4\": \"Unfortunately, the battery life could be longer. It's fine for occasional use, but it doesn't hold up as well during extended periods.\",\n", " \"review5\": \"I love how compact and lightweight this product is - perfect for my morning commute! The only reason I'm giving 4 stars instead of 5 is because the charging port can get a bit finicky.\"\n", "}\n" ] } ], "source": [ "print(formatted_json_result)" ] }, { "cell_type": "code", "execution_count": 46, "id": "a937ac81", "metadata": {}, "outputs": [], "source": [ "user_prompt = \"\"\"\n", "Generate a dataset of 5 employees with name, department, salary, and years of experience.\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": 47, "id": "2cef4545", "metadata": {}, "outputs": [], "source": [ "result = generate_synthetic_data(user_prompt)\n", "\n", "formatted_json_result = json.dumps(result, indent=4)" ] }, { "cell_type": "code", "execution_count": 48, "id": "f7d64ed3", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"employees\": [\n", " {\n", " \"name\": \"John Doe\",\n", " \"department\": \"Marketing\",\n", " \"salary\": 60000,\n", " \"years_of_experience\": 8\n", " },\n", " {\n", " \"name\": \"Jane Smith\",\n", " \"department\": \"IT\",\n", " \"salary\": 70000,\n", " \"years_of_experience\": 5\n", " },\n", " {\n", " \"name\": \"Bob Johnson\",\n", " \"department\": \"Sales\",\n", " \"salary\": 55000,\n", " \"years_of_experience\": 10\n", " },\n", " {\n", " \"name\": \"Emily Chen\",\n", " \"department\": \"Marketing\",\n", " \"salary\": 65000,\n", " \"years_of_experience\": 6\n", " },\n", " {\n", " \"name\": \"Michael Davis\",\n", " \"department\": \"IT\",\n", " \"salary\": 75000,\n", " \"years_of_experience\": 7\n", " }\n", " ]\n", "}\n" ] } ], "source": [ "print(formatted_json_result)" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.10" } }, "nbformat": 4, "nbformat_minor": 5 }