From a4db90be83b516b4fa5860eb5e1c05905e169086 Mon Sep 17 00:00:00 2001 From: SABEEH Shaikh Date: Tue, 20 May 2025 22:16:19 +0200 Subject: [PATCH 01/23] Added my dataset generator to contributions folder --- .../llm_dataset_generator.ipynb | 1801 +++++++++++++++++ 1 file changed, 1801 insertions(+) create mode 100644 week3/community-contributions/llm_dataset_generator.ipynb diff --git a/week3/community-contributions/llm_dataset_generator.ipynb b/week3/community-contributions/llm_dataset_generator.ipynb new file mode 100644 index 0000000..3de4ce1 --- /dev/null +++ b/week3/community-contributions/llm_dataset_generator.ipynb @@ -0,0 +1,1801 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Synthetic Data Generator Notebook\n", + "## About\n", + "This Colab notebook demonstrates the use of Frontier and Open-source LLM models for generating a synthetic dataset for a business scenario provided by the user. From a UI implemented in Gradio, a user can define their business scenario in detail, select the number of records needed along with their format, and adjust the maximum number of output tokens to be generated by the chosen LLM.\n", + "\n", + "It does not stop here. Once the records have been produced in the LLM output, they can be extracted and stored in a file, in the same format the user selected earlier. The file is stored in the Colab notebook under the contents directory. All of this extraction is done with the help of the 're' library. It was my first time using it, and I totally enjoyed learning it.\n", + "\n", + "## Outlook\n", + "Sometimes the response is loaded with the user prompt and a lot of tags when using open-source models, such as Mixtral from Mistral. This is because of the prompt format being used. The 'assistant' 'role' format does not suit them. This is an optimization to look for and can easily be done by using a custom prompt template for such models; these templates are hinted at on their Hugging Face repos." 
+ ], + "metadata": { + "id": "SFA6R-4jL7SS" + } + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ip4I4Lff3B2M" + }, + "source": [ + "## Install & Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "8zVlW-GMcBaU", + "outputId": "0c473564-fb93-41a9-c819-e6aa2382d75a" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m54.2/54.2 MB\u001b[0m \u001b[31m9.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m323.1/323.1 kB\u001b[0m \u001b[31m12.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m264.0/264.0 kB\u001b[0m \u001b[31m9.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m363.4/363.4 MB\u001b[0m \u001b[31m1.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.8/13.8 MB\u001b[0m \u001b[31m95.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m24.6/24.6 MB\u001b[0m \u001b[31m78.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m883.7/883.7 kB\u001b[0m \u001b[31m48.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m664.8/664.8 MB\u001b[0m \u001b[31m1.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m211.5/211.5 MB\u001b[0m \u001b[31m3.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + 
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.3/56.3 MB\u001b[0m \u001b[31m8.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m127.9/127.9 MB\u001b[0m \u001b[31m6.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m207.5/207.5 MB\u001b[0m \u001b[31m6.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.1/21.1 MB\u001b[0m \u001b[31m83.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.1/76.1 MB\u001b[0m \u001b[31m9.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m95.2/95.2 kB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m11.6/11.6 MB\u001b[0m \u001b[31m95.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m72.0/72.0 kB\u001b[0m \u001b[31m5.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.5/62.5 kB\u001b[0m \u001b[31m5.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h" + ] + } + ], + "source": [ + "!pip install -q gradio anthropic requests torch bitsandbytes transformers accelerate openai" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "YKVNzE5sFH2l" + }, + "outputs": [], + "source": [ + "# imports\n", + "import re\n", + "import os\n", + "import sys\n", + "import gc\n", + "import io\n", + "import json\n", + "import anthropic\n", + "import gradio as gr\n", + "import requests\n", + "import 
subprocess\n", + "import google.generativeai as ggai\n", + "import torch\n", + "import tempfile\n", + "import shutil\n", + "from io import StringIO\n", + "import pandas as pd\n", + "from google.colab import userdata\n", + "from huggingface_hub import login\n", + "from openai import OpenAI\n", + "from pathlib import Path\n", + "from datetime import datetime\n", + "from IPython.display import Markdown, display, update_display\n", + "from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LWpD6bZv3mAR" + }, + "source": [ + "## HuggingFace Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "aeC2oWY2FTv7" + }, + "outputs": [], + "source": [ + "# Sign in to HuggingFace Hub\n", + "\n", + "hf_token = userdata.get('HF_TOKEN')\n", + "login(hf_token, add_to_git_credential=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8Au2UPVy3vn5" + }, + "source": [ + "## Frontier Models configuration" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "geBBsd14X3UL" + }, + "outputs": [], + "source": [ + "openai_client = OpenAI(api_key=userdata.get('OPENAI_API_KEY'))\n", + "anthropic_client = anthropic.Anthropic(api_key=userdata.get('ANTHROPIC_API_KEY'))\n", + "ggai.configure(api_key=userdata.get('GOOGLE_API_KEY'))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tCnDIOlKgjbO" + }, + "source": [ + "## Defining Prompts" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "gkwXZsxofAU1" + }, + "outputs": [], + "source": [ + "system_prompt = \"\"\"\n", + "You are a synthetic dataset generator. Your role is to create synthetic dataset that infers structured data schemas from business scenarios given by the user.\n", + "\n", + "Your task is to:\n", + "1. Understand the user's business problem(s) or use case(s).\n", + "2. 
Identify the key fields needed to support that scenario.\n", + "3. Define appropriate field names, data types, and formats.\n", + "4. Generate synthetic records that match the inferred schema.\n", + "\n", + "Guidelines:\n", + "- Use realistic field names and values. Do not invent unrelated fields or values.\n", + "- Choose sensible data types: string, integer, float, date, boolean, enum, etc.\n", + "- Respect logical constraints (e.g., age range, date ranges, email formats).\n", + "- Output the dataset in the format the user requests (json, csv, txt, markdown table).\n", + "- If the scenario is vague or broad, make reasonable assumptions and explain them briefly before generating the dataset.\n", + "- Always generate a dataset that supports the business use case logically.\n", + "\n", + "Before generating the data, display the inferred schema in a readable format.\n", + "\"\"\"\n", + "\n", + "# trial_user_prompt = \"I’m building a churn prediction model for a telecom company. Can you generate a synthetic dataset with 100 rows?\"\n", + "def get_user_prompt(business_problem, no_of_samples, file_format):\n", + " return f\"\"\"\n", + " The business scenario for which I want you to generate a dataset is defined below:\n", + " {business_problem}\n", + "\n", + " Generate a synthetic dataset of {no_of_samples} records in {file_format} format.\n", + " When generating the dataset, wrap it between the '<<<>>>' tag. 
Make sure the tag is there in the output.\n", + " Do not include any other special characters in between the tags, other than the ones required in producing the correct format of data.\n", + " For examples: When a 'csv' format is given, only the ',' character can be used in between the tags.\n", + " \"\"\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yNpVf9-oQdoO" + }, + "source": [ + "### Quantization Config" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "3ErZ315MQdU3" + }, + "outputs": [], + "source": [ + "# This allows us to load the model into memory and use less memory\n", + "def get_quantization_config():\n", + " return BitsAndBytesConfig(\n", + " load_in_4bit=True,\n", + " bnb_4bit_use_double_quant=True,\n", + " bnb_4bit_compute_dtype=torch.bfloat16,\n", + " bnb_4bit_quant_type=\"nf4\"\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "clGtRh0N4951" + }, + "source": [ + "## HF Model inference" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "MAhyn1ehb3Dh" + }, + "outputs": [], + "source": [ + "# All in one HuggingFace Model Response function\n", + "def run_hfmodel_and_get_response(prompt, model_name, output_tokens):\n", + " tokenizer = AutoTokenizer.from_pretrained(model_name)\n", + " tokenizer.pad_token = tokenizer.eos_token\n", + " inputs = tokenizer.apply_chat_template(prompt, return_tensors=\"pt\")\n", + " if torch.cuda.is_available():\n", + " inputs = inputs.to(\"cuda\")\n", + " streamer = TextStreamer(tokenizer)\n", + " if \"microsoft/bitnet-b1.58-2B-4T\" in model_name:\n", + " model = AutoModelForCausalLM.from_pretrained(model_name, device_map=\"auto\", trust_remote_code=True)\n", + " elif \"tiiuae/Falcon-E-3B-Instruct\" in model_name:\n", + " model = AutoModelForCausalLM.from_pretrained(model_name, device_map=\"auto\", torch_dtype=torch.float16 )\n", + " else:\n", + " model = AutoModelForCausalLM.from_pretrained(model_name, 
device_map=\"auto\", quantization_config=get_quantization_config())\n", + " outputs = model.generate(inputs, max_new_tokens=output_tokens, streamer=streamer)\n", + " response = tokenizer.decode(outputs[0])\n", + " del model, inputs, tokenizer, outputs\n", + " gc.collect()\n", + " torch.cuda.empty_cache()\n", + " return response" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Gh_Ny1aM-L8z" + }, + "source": [ + "## Frontier Models Inference" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "h11WlZNhfHCR" + }, + "outputs": [], + "source": [ + "# ChatGPT, Claude and Gemini response function\n", + "def get_chatgpt_response(prompt, model_name, output_tokens):\n", + " response = openai_client.chat.completions.create(\n", + " model=model_name,\n", + " messages=prompt,\n", + " max_tokens=output_tokens,\n", + " )\n", + " return response.choices[0].message.content\n", + "\n", + "def get_claude_response(prompt, model_name, output_tokens):\n", + " response = anthropic_client.messages.create(\n", + " model=model_name,\n", + " max_tokens=output_tokens,\n", + " system=system_prompt,\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": prompt,\n", + " }\n", + " ],\n", + " )\n", + " return response.content[0].text\n", + "\n", + "def get_gemini_response(prompt, model_name, output_tokens):\n", + " model = ggai.GenerativeModel(\n", + " model_name=model_name,\n", + " system_instruction=system_prompt,\n", + " )\n", + "\n", + " response = model.generate_content(prompt, generation_config={\n", + " \"max_output_tokens\": output_tokens,\n", + " \"temperature\": 0.7,\n", + " })\n", + " return response.text" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nzHbM_WQvRgT" + }, + "source": [ + "## Gradio Implementation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uFWZqw1R-al_" + }, + "source": [ + "### Dropdowns Selection Lists" + ] + }, + { + "cell_type": "code", + 
"execution_count": 9, + "metadata": { + "id": "rOzEb0o--aD7" + }, + "outputs": [], + "source": [ + "# Dropdown List Values for the user\n", + "MODEL_TYPES=[\"GPT\", \"Claude\", \"Gemini\", \"HuggingFace\"]\n", + "OPENAI_MODEL_NAMES=[\"gpt-4o-mini\", \"gpt-4o\", \"gpt-3.5-turbo\"]\n", + "ANTHROPIC_MODELS=[\"claude-3-7-sonnet-latest\", \"claude-3-5-haiku-latest\", \"claude-3-opus-latest\"]\n", + "GOOGLE_MODELS=[\"gemini-2.0-flash\", \"gemini-1.5-pro\"]\n", + "HUGGINGFACE_MODELS=[\n", + " \"meta-llama/Llama-3.2-3B-Instruct\",\n", + " \"microsoft/bitnet-b1.58-2B-4T\",\n", + " \"ByteDance-Seed/Seed-Coder-8B-Instruct\",\n", + " \"tiiuae/Falcon-E-3B-Instruct\",\n", + " \"Qwen/Qwen2.5-7B-Instruct\"\n", + "]\n", + "MODEL_NAMES = {\n", + " \"GPT\": OPENAI_MODEL_NAMES,\n", + " \"Claude\": ANTHROPIC_MODELS,\n", + " \"Gemini\": GOOGLE_MODELS,\n", + " \"HuggingFace\": HUGGINGFACE_MODELS\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sbXGL8_4-oKc" + }, + "source": [ + "### UI" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "id": "_0NCY7FgCVHj" + }, + "outputs": [], + "source": [ + "with gr.Blocks() as generator_ui:\n", + " gr.Markdown(\"# 🧠 Business Scenario → Synthetic Dataset Generator\")\n", + "\n", + " with gr.Row():\n", + " with gr.Column(scale=3):\n", + " with gr.Row():\n", + " dataset_size=gr.Number(value=10, label=\"Enter the number of data samples to generate.\", show_label=True)\n", + " format=gr.Dropdown([\"json\", \"csv\", \"txt\", \"markdown\"], label=\"Select the format for the dataset\", show_label=True)\n", + " with gr.Row():\n", + " scenario=gr.Textbox(label=\"Business Scenario\", lines=5, placeholder=\"Describe your business scenario here\")\n", + " with gr.Row():\n", + " error = gr.Markdown(visible=False)\n", + " with gr.Row():\n", + " clear = gr.Button(\"Clear Everything\")\n", + " submit = gr.Button(\"Generate Dataset\", variant=\"primary\")\n", + "\n", + " with gr.Column(scale=1):\n", + " model_type 
= gr.Dropdown(MODEL_TYPES, label=\"Model Type\", show_label=True, info=\"Select the model type you want to use\")\n", + " model_name = gr.Dropdown(MODEL_NAMES[model_type.value], label=\"Model Name\", show_label=True, allow_custom_value=True, info=\"Select the model name or enter one manually\")\n", + " output_tokens= gr.Number(value=1000, label=\"Enter the max number of output tokens to generate.\", show_label=True, info=\"This will impact the length of the response containg the dataset\")\n", + "\n", + " with gr.Row():\n", + " # Chatbot Interface\n", + " chatbot = gr.Chatbot(\n", + " type='messages',\n", + " label='Chatbot',\n", + " show_label=True,\n", + " height=300,\n", + " resizable=True,\n", + " elem_id=\"chatbot\",\n", + " avatar_images=(\"🧑\", \"🤖\",)\n", + " )\n", + " with gr.Row(variant=\"compact\"):\n", + " extract_btn = gr.Button(\"Extract and Save Dataset\", variant=\"huggingface\", visible=False)\n", + " file_name = gr.Textbox(label=\"Enter file name here (without file extension)\", placeholder=\"e.g. 
cancer_synthetic, warehouse_synthetic (no digits)\", visible=False)\n", + " with gr.Row():\n", + " markdown_preview = gr.Markdown(visible = False)\n", + " dataset_preview = gr.Textbox(label=\"Dataset Preview\",visible=False)\n", + " with gr.Row():\n", + " file_saved = gr.Textbox(visible=False)\n", + "\n", + " def run_inference(scenario, model_type, model_name, output_tokens, dataset_size, format):\n", + " \"\"\"Run the model and get the response\"\"\"\n", + " model_type=model_type.lower()\n", + " print(f\"scenario: {scenario}\")\n", + " print(f\"model_type: {model_type}\")\n", + " print(f\"model_name: {model_name}\")\n", + " if not scenario.strip():\n", + " return gr.update(value=\"❌ **Error:** Please define a scenario first!\",visible=True), []\n", + "\n", + " user_prompt = get_user_prompt(scenario, dataset_size, format)\n", + " prompt = [\n", + " {\"role\": \"system\", \"content\": system_prompt},\n", + " {\"role\": \"user\", \"content\": user_prompt},\n", + " ]\n", + "\n", + " if model_type == \"gpt\":\n", + " response = get_chatgpt_response(prompt=prompt, model_name=model_name, output_tokens=output_tokens)\n", + " elif model_type == \"claude\":\n", + " response = get_claude_response(prompt=user_prompt, model_name=model_name, output_tokens=output_tokens)\n", + " elif model_type == \"gemini\":\n", + " response = get_gemini_response(prompt=user_prompt, model_name=model_name, output_tokens=output_tokens)\n", + " else:\n", + " response = run_hfmodel_and_get_response(prompt=prompt, model_name=model_name, output_tokens=output_tokens)\n", + " torch.cuda.empty_cache()\n", + " history = [\n", + " {\"role\": \"user\", \"content\": scenario},\n", + " {\"role\": \"assistant\", \"content\": response}\n", + " ]\n", + " return gr.update(visible=False), history\n", + "\n", + " def extract_dataset_string(response):\n", + " \"\"\"Extract dataset content between defined tags using regex.\"\"\"\n", + " # Remove known artificial tokens (common in HuggingFace or Claude)\n", + " 
response = re.sub(r\"<\\[.*?\\]>\", \"\", response)\n", + "\n", + " # Remove system or prompt echo if repeated before dataset\n", + " response = re.sub(r\"(?is)^.*?<<<\", \"<<<\", response.strip(), count=1)\n", + "\n", + " # 1. Match strict <<<>>>...<<<>>> tag blocks (use last match)\n", + " matches = re.findall(r\"<<<>>>[\\s\\r\\n]*(.*?)[\\s\\r\\n]*<<<>>>\", response, re.DOTALL)\n", + " if matches:\n", + " return matches[-1].strip()\n", + "\n", + " # 2. Match loose <<< ... >>> format\n", + " matches = re.findall(r\"<<<[\\s\\r\\n]*(.*?)[\\s\\r\\n]*>>>\", response, re.DOTALL)\n", + " if matches:\n", + " return matches[-1].strip()\n", + "\n", + " # 3. Match final fallback: take everything after last <<< as raw data\n", + " last_open = response.rfind(\"<<<\")\n", + " if last_open != -1:\n", + " raw = response[last_open + 3 :].strip()\n", + " # Optionally cut off noisy trailing notes, explanations, etc.\n", + " raw = re.split(r\"\\n\\s*\\n|Explanation:|Note:|---\", raw)[0]\n", + " return raw.strip()\n", + "\n", + " return \"Could not extract dataset! Try again with a different model.\"\n", + "\n", + " def extract_dataset_from_response(chatbot_history, file_name, file_type):\n", + " \"\"\"Extract dataset and update in gradio UI components\"\"\"\n", + " response = chatbot_history[-1][\"content\"]\n", + " if not response:\n", + " return gr.update(visible=True, value=\"Could not find LLM Response! Try again.\"), gr.update(visible=False)\n", + "\n", + " # match = re.search(r'<<<\\s*(.*?)\\s*>>>', response, re.DOTALL)\n", + " # print(match)\n", + " # if match and match.group(1).strip() == \"\":\n", + " # match = re.search(r'<<<>>>\\s*(.*?)\\s*<<<>>>', response, re.DOTALL)\n", + " # print(match)\n", + " # if match is None:\n", + " # return gr.update(visible=True, value=\"Could not extract dataset! 
Try again with a different model.\"), gr.update(visible=False)\n", + " # dataset = match.group(1).strip()\n", + " dataset = extract_dataset_string(response)\n", + " if dataset == \"Could not extract dataset! Try again with a different model.\":\n", + " return gr.update(visible=True, value=dataset), gr.update(visible=False)\n", + " text = save_dataset(dataset, file_type, file_name)\n", + " return gr.update(visible=True, value=text), gr.update(visible=True, value=dataset)\n", + "\n", + " def save_dataset(dataset, file_format, file_name):\n", + " \"\"\"Save dataset to a file based on the selected format.\"\"\"\n", + " file_name=file_name+\".\"+file_format\n", + " print(dataset)\n", + " print(file_name)\n", + " if file_format == \"json\":\n", + " try:\n", + " data = json.loads(dataset)\n", + " with open(file_name, \"w\", encoding=\"utf-8\") as f:\n", + " json.dump(data, f, indent=4)\n", + " return \"Dataset saved successfully!\"\n", + " except:\n", + " return \"Could not save dataset! Try again in another format.\"\n", + " elif file_format == \"csv\":\n", + " try:\n", + " df = pd.read_csv(StringIO(dataset))\n", + " df.to_csv(file_name, index=False)\n", + " return \"Dataset saved successfully!\"\n", + " except:\n", + " return \"Could not save dataset! Try again in another format.\"\n", + " elif file_format == \"txt\":\n", + " try:\n", + " with open(file_name, \"w\", encoding=\"utf-8\") as f:\n", + " f.write(dataset)\n", + " return \"Dataset saved successfully!\"\n", + " except:\n", + " return \"Could not save dataset! 
Try again in another format.\"\n", + "\n", + " def clear_chat():\n", + " \"\"\"Clear the chat history.\"\"\"\n", + " return \"\", [], gr.update(visible=False), gr.update(visible=False)\n", + "\n", + " def show_extract_btn(chatbot_history, format):\n", + " \"\"\"Show the extract button if the response has been displayed in the chatbot and format is not set to markdown\"\"\"\n", + " if chatbot_history == []:\n", + " return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)\n", + " if format == \"markdown\":\n", + " return gr.update(visible=True, value=chatbot_history[1][\"content\"]), gr.update(visible=False), gr.update(visible=False)\n", + " return gr.update(visible=False), gr.update(visible=True), gr.update(visible=True)\n", + "\n", + " extract_btn.click(\n", + " fn=extract_dataset_from_response,\n", + " inputs=[chatbot, file_name, format],\n", + " outputs=[file_saved, dataset_preview]\n", + " )\n", + "\n", + " chatbot.change(\n", + " fn=show_extract_btn,\n", + " inputs=[chatbot, format],\n", + " outputs=[markdown_preview, extract_btn, file_name]\n", + " )\n", + "\n", + " model_type.change(\n", + " fn=lambda x: gr.update(choices=MODEL_NAMES[x], value=MODEL_NAMES[x][0]),\n", + " inputs=[model_type],\n", + " outputs=[model_name]\n", + " )\n", + "\n", + " submit.click(\n", + " fn=run_inference,\n", + " inputs=[scenario, model_type, model_name, output_tokens, dataset_size, format],\n", + " outputs=[error, chatbot],\n", + " show_progress=True\n", + " )\n", + "\n", + " clear.click(\n", + " clear_chat,\n", + " outputs=[scenario, chatbot, dataset_preview, file_saved]\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "collapsed": true, + "id": "kzDUJahK8uRN", + "outputId": "c5674be2-b262-4439-ae91-4f3e1f49e041" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Colab notebook detected. 
This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().\n", + "* Running on public URL: https://d076a9fef9034a4f24.gradio.live\n", + "\n", + "This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "
" + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "scenario: Generate a dataset for training a model to approve/reject loan applications. Include features like loan amount, applicant income, co-applicant income, employment type, credit history (binary), loan term, number of dependents, education level, and loan approval status.\n", + "model_type: gpt\n", + "model_name: gpt-4o\n", + "Loan Amount,Applicant Income,Co-applicant Income,Employment Type,Credit History,Loan Term,Number of Dependents,Education Level,Loan Approval Status\n", + "250000,60000,15000,Salaried,1,240,1,Graduate,Approved\n", + "350000,80000,0,Salaried,1,360,2,Graduate,Approved\n", + "120000,30000,10000,Self-employed,0,180,1,Not Graduate,Rejected\n", + "500000,150000,50000,Self-employed,1,300,3,Graduate,Approved\n", + "75000,20000,0,Unemployed,0,120,0,Graduate,Rejected\n", + "275000,75000,25000,Salaried,0,240,2,Not Graduate,Rejected\n", + "100000,40000,20000,Salaried,1,60,0,Graduate,Approved\n", + "310000,95000,0,Self-employed,1,360,1,Graduate,Approved\n", + "450000,50000,0,Self-employed,0,180,4,Not Graduate,Rejected\n", + "200000,55000,20000,Salaried,1,120,3,Graduate,Approved\n", + "100000,35000,0,Unemployed,0,60,0,Not Graduate,Rejected\n", + "230000,68000,13000,Salaried,1,240,1,Graduate,Approved\n", + "330000,99000,40000,Self-employed,1,300,2,Graduate,Approved\n", + "150000,18000,7500,Unemployed,0,48,0,Not Graduate,Rejected\n", + "210000,64000,0,Salaried,0,120,1,Graduate,Rejected\n", + "310000,87000,30000,Self-employed,1,360,2,Graduate,Approved\n", + "50000,22000,7000,Unemployed,0,24,0,Not Graduate,Rejected\n", + "290000,92000,20000,Salaried,1,240,3,Graduate,Approved\n", + "110000,45000,0,Salaried,0,36,0,Graduate,Rejected\n", + "450000,76000,25000,Self-employed,1,360,2,Graduate,Approved\n", + "loan_approval_synthetic.txt\n", + "scenario: Generate a dataset for predicting medical appointment no-shows. 
Include appointment ID, scheduled date, appointment date, lead time (days between scheduling and appointment), SMS reminders sent, patient age, gender, health condition severity, and no-show status.\n", + "model_type: gpt\n", + "model_name: gpt-4o\n", + "scenario: Generate a dataset for predicting medical appointment no-shows. Include appointment ID, scheduled date, appointment date, lead time (days between scheduling and appointment), SMS reminders sent, patient age, gender, health condition severity, and no-show status.\n", + "model_type: gpt\n", + "model_name: gpt-4o\n", + "[\n", + " {\n", + " \"appointment_id\": \"AID001\",\n", + " \"scheduled_date\": \"2023-11-01\",\n", + " \"appointment_date\": \"2023-11-10\",\n", + " \"lead_time\": 9,\n", + " \"sms_reminders_sent\": 2,\n", + " \"patient_age\": 45,\n", + " \"gender\": \"Female\",\n", + " \"health_condition_severity\": 3,\n", + " \"no_show_status\": false\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID002\",\n", + " \"scheduled_date\": \"2023-11-03\",\n", + " \"appointment_date\": \"2023-11-15\",\n", + " \"lead_time\": 12,\n", + " \"sms_reminders_sent\": 3,\n", + " \"patient_age\": 34,\n", + " \"gender\": \"Male\",\n", + " \"health_condition_severity\": 2,\n", + " \"no_show_status\": true\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID003\",\n", + " \"scheduled_date\": \"2023-11-05\",\n", + " \"appointment_date\": \"2023-11-11\",\n", + " \"lead_time\": 6,\n", + " \"sms_reminders_sent\": 1,\n", + " \"patient_age\": 29,\n", + " \"gender\": \"Other\",\n", + " \"health_condition_severity\": 4,\n", + " \"no_show_status\": false\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID004\",\n", + " \"scheduled_date\": \"2023-11-02\",\n", + " \"appointment_date\": \"2023-11-14\",\n", + " \"lead_time\": 12,\n", + " \"sms_reminders_sent\": 2,\n", + " \"patient_age\": 62,\n", + " \"gender\": \"Female\",\n", + " \"health_condition_severity\": 5,\n", + " \"no_show_status\": true\n", + " },\n", + " {\n", + " 
\"appointment_id\": \"AID005\",\n", + " \"scheduled_date\": \"2023-11-06\",\n", + " \"appointment_date\": \"2023-11-13\",\n", + " \"lead_time\": 7,\n", + " \"sms_reminders_sent\": 0,\n", + " \"patient_age\": 21,\n", + " \"gender\": \"Male\",\n", + " \"health_condition_severity\": 1,\n", + " \"no_show_status\": false\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID006\",\n", + " \"scheduled_date\": \"2023-11-08\",\n", + " \"appointment_date\": \"2023-11-17\",\n", + " \"lead_time\": 9,\n", + " \"sms_reminders_sent\": 3,\n", + " \"patient_age\": 58,\n", + " \"gender\": \"Female\",\n", + " \"health_condition_severity\": 4,\n", + " \"no_show_status\": true\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID007\",\n", + " \"scheduled_date\": \"2023-11-10\",\n", + " \"appointment_date\": \"2023-11-18\",\n", + " \"lead_time\": 8,\n", + " \"sms_reminders_sent\": 1,\n", + " \"patient_age\": 41,\n", + " \"gender\": \"Other\",\n", + " \"health_condition_severity\": 2,\n", + " \"no_show_status\": false\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID008\",\n", + " \"scheduled_date\": \"2023-11-07\",\n", + " \"appointment_date\": \"2023-11-12\",\n", + " \"lead_time\": 5,\n", + " \"sms_reminders_sent\": 0,\n", + " \"patient_age\": 67,\n", + " \"gender\": \"Male\",\n", + " \"health_condition_severity\": 3,\n", + " \"no_show_status\": true\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID009\",\n", + " \"scheduled_date\": \"2023-11-12\",\n", + " \"appointment_date\": \"2023-11-20\",\n", + " \"lead_time\": 8,\n", + " \"sms_reminders_sent\": 2,\n", + " \"patient_age\": 74,\n", + " \"gender\": \"Female\",\n", + " \"health_condition_severity\": 5,\n", + " \"no_show_status\": false\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID010\",\n", + " \"scheduled_date\": \"2023-11-09\",\n", + " \"appointment_date\": \"2023-11-16\",\n", + " \"lead_time\": 7,\n", + " \"sms_reminders_sent\": 3,\n", + " \"patient_age\": 25,\n", + " \"gender\": \"Male\",\n", + " 
\"health_condition_severity\": 4,\n", + " \"no_show_status\": true\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID011\",\n", + " \"scheduled_date\": \"2023-11-13\",\n", + " \"appointment_date\": \"2023-11-21\",\n", + " \"lead_time\": 8,\n", + " \"sms_reminders_sent\": 1,\n", + " \"patient_age\": 32,\n", + " \"gender\": \"Female\",\n", + " \"health_condition_severity\": 2,\n", + " \"no_show_status\": false\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID012\",\n", + " \"scheduled_date\": \"2023-11-14\",\n", + " \"appointment_date\": \"2023-11-25\",\n", + " \"lead_time\": 11,\n", + " \"sms_reminders_sent\": 2,\n", + " \"patient_age\": 48,\n", + " \"gender\": \"Other\",\n", + " \"health_condition_severity\": 1,\n", + " \"no_show_status\": true\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID013\",\n", + " \"scheduled_date\": \"2023-11-15\",\n", + " \"appointment_date\": \"2023-11-27\",\n", + " \"lead_time\": 12,\n", + " \"sms_reminders_sent\": 3,\n", + " \"patient_age\": 36,\n", + " \"gender\": \"Male\",\n", + " \"health_condition_severity\": 5,\n", + " \"no_show_status\": false\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID014\",\n", + " \"scheduled_date\": \"2023-11-17\",\n", + " \"appointment_date\": \"2023-12-02\",\n", + " \"lead_time\": 15,\n", + " \"sms_reminders_sent\": 0,\n", + " \"patient_age\": 28,\n", + " \"gender\": \"Female\",\n", + " \"health_condition_severity\": 3,\n", + " \"no_show_status\": true\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID015\",\n", + " \"scheduled_date\": \"2023-11-16\",\n", + " \"appointment_date\": \"2023-12-01\",\n", + " \"lead_time\": 15,\n", + " \"sms_reminders_sent\": 1,\n", + " \"patient_age\": 60,\n", + " \"gender\": \"Male\",\n", + " \"health_condition_severity\": 2,\n", + " \"no_show_status\": false\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID016\",\n", + " \"scheduled_date\": \"2023-11-18\",\n", + " \"appointment_date\": \"2023-12-05\",\n", + " \"lead_time\": 17,\n", + " 
\"sms_reminders_sent\": 2,\n", + " \"patient_age\": 40,\n", + " \"gender\": \"Other\",\n", + " \"health_condition_severity\": 4,\n", + " \"no_show_status\": true\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID017\",\n", + " \"scheduled_date\": \"2023-11-19\",\n", + " \"appointment_date\": \"2023-12-03\",\n", + " \"lead_time\": 14,\n", + " \"sms_reminders_sent\": 3,\n", + " \"patient_age\": 19,\n", + " \"gender\": \"Female\",\n", + " \"health_condition_severity\": 1,\n", + " \"no_show_status\": false\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID018\",\n", + " \"scheduled_date\": \"2023-11-21\",\n", + " \"appointment_date\": \"2023-12-07\",\n", + " \"lead_time\": 16,\n", + " \"sms_reminders_sent\": 0,\n", + " \"patient_age\": 51,\n", + " \"gender\": \"Male\",\n", + " \"health_condition_severity\": 3,\n", + " \"no_show_status\": true\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID019\",\n", + " \"scheduled_date\": \"2023-11-23\",\n", + " \"appointment_date\": \"2023-12-09\",\n", + " \"lead_time\": 16,\n", + " \"sms_reminders_sent\": 1,\n", + " \"patient_age\": 55,\n", + " \"gender\": \"Female\",\n", + " \"health_condition_severity\": 4,\n", + " \"no_show_status\": false\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID020\",\n", + " \"scheduled_date\": \"2023-11-22\",\n", + " \"appointment_date\": \"2023-12-08\",\n", + " \"lead_time\": 16,\n", + " \"sms_reminders_sent\": 2,\n", + " \"patient_age\": 23,\n", + " \"gender\": \"Other\",\n", + " \"health_condition_severity\": 5,\n", + " \"no_show_status\": true\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID021\",\n", + " \"scheduled_date\": \"2023-11-24\",\n", + " \"appointment_date\": \"2023-12-10\",\n", + " \"lead_time\": 16,\n", + " \"sms_reminders_sent\": 3,\n", + " \"patient_age\": 47,\n", + " \"gender\": \"Male\",\n", + " \"health_condition_severity\": 2,\n", + " \"no_show_status\": false\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID022\",\n", + " \"scheduled_date\": 
\"2023-11-25\",\n", + " \"appointment_date\": \"2023-12-12\",\n", + " \"lead_time\": 17,\n", + " \"sms_reminders_sent\": 1,\n", + " \"patient_age\": 33,\n", + " \"gender\": \"Female\",\n", + " \"health_condition_severity\": 1,\n", + " \"no_show_status\": true\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID023\",\n", + " \"scheduled_date\": \"2023-11-27\",\n", + " \"appointment_date\": \"2023-12-14\",\n", + " \"lead_time\": 17,\n", + " \"sms_reminders_sent\": 0,\n", + " \"patient_age\": 42,\n", + " \"gender\": \"Male\",\n", + " \"health_condition_severity\": 3,\n", + " \"no_show_status\": false\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID024\",\n", + " \"scheduled_date\": \"2023-11-29\",\n", + " \"appointment_date\": \"2023-12-15\",\n", + " \"lead_time\": 16,\n", + " \"sms_reminders_sent\": 2,\n", + " \"patient_age\": 64,\n", + " \"gender\": \"Other\",\n", + " \"health_condition_severity\": 4,\n", + " \"no_show_status\": true\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID025\",\n", + " \"scheduled_date\": \"2023-12-01\",\n", + " \"appointment_date\": \"2023-12-20\",\n", + " \"lead_time\": 19,\n", + " \"sms_reminders_sent\": 3,\n", + " \"patient_age\": 26,\n", + " \"gender\": \"Female\",\n", + " \"health_condition_severity\": 5,\n", + " \"no_show_status\": false\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID026\",\n", + " \"scheduled_date\": \"2023-12-03\",\n", + " \"appointment_date\": \"2023-12-22\",\n", + " \"lead_time\": 19,\n", + " \"sms_reminders_sent\": 1,\n", + " \"patient_age\": 31,\n", + " \"gender\": \"Male\",\n", + " \"health_condition_severity\": 2,\n", + " \"no_show_status\": true\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID027\",\n", + " \"scheduled_date\": \"2023-12-05\",\n", + " \"appointment_date\": \"2023-12-24\",\n", + " \"lead_time\": 19,\n", + " \"sms_reminders_sent\": 2,\n", + " \"patient_age\": 50,\n", + " \"gender\": \"Female\",\n", + " \"health_condition_severity\": 1,\n", + " \"no_show_status\": 
false\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID028\",\n", + " \"scheduled_date\": \"2023-12-06\",\n", + " \"appointment_date\": \"2023-12-25\",\n", + " \"lead_time\": 19,\n", + " \"sms_reminders_sent\": 0,\n", + " \"patient_age\": 39,\n", + " \"gender\": \"Other\",\n", + " \"health_condition_severity\": 3,\n", + " \"no_show_status\": true\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID029\",\n", + " \"scheduled_date\": \"2023-12-07\",\n", + " \"appointment_date\": \"2023-12-27\",\n", + " \"lead_time\": 20,\n", + " \"sms_reminders_sent\": 3,\n", + " \"patient_age\": 71,\n", + " \"gender\": \"Male\",\n", + " \"health_condition_severity\": 4,\n", + " \"no_show_status\": false\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID030\",\n", + " \"scheduled_date\": \"2023-12-08\",\n", + " \"appointment_date\": \"2023-12-28\",\n", + " \"lead_time\": 20,\n", + " \"sms_reminders_sent\": 1,\n", + " \"patient_age\": 44,\n", + " \"gender\": \"Female\",\n", + " \"health_condition_severity\": 5,\n", + " \"no_show_status\": true\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID031\",\n", + " \"scheduled_date\": \"2023-12-10\",\n", + " \"appointment_date\": \"2023-12-31\",\n", + " \"lead_time\": 21,\n", + " \"sms_reminders_sent\": 2,\n", + " \"patient_age\": 38,\n", + " \"gender\": \"Male\",\n", + " \"health_condition_severity\": 2,\n", + " \"no_show_status\": false\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID032\",\n", + " \"scheduled_date\": \"2023-12-11\",\n", + " \"appointment_date\": \"2024-01-02\",\n", + " \"lead_time\": 22,\n", + " \"sms_reminders_sent\": 0,\n", + " \"patient_age\": 53,\n", + " \"gender\": \"Female\",\n", + " \"health_condition_severity\": 1,\n", + " \"no_show_status\": true\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID033\",\n", + " \"scheduled_date\": \"2023-12-13\",\n", + " \"appointment_date\": \"2024-01-04\",\n", + " \"lead_time\": 22,\n", + " \"sms_reminders_sent\": 1,\n", + " \"patient_age\": 27,\n", + " 
\"gender\": \"Other\",\n", + " \"health_condition_severity\": 3,\n", + " \"no_show_status\": false\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID034\",\n", + " \"scheduled_date\": \"2023-12-15\",\n", + " \"appointment_date\": \"2024-01-06\",\n", + " \"lead_time\": 22,\n", + " \"sms_reminders_sent\": 3,\n", + " \"patient_age\": 46,\n", + " \"gender\": \"Male\",\n", + " \"health_condition_severity\": 4,\n", + " \"no_show_status\": true\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID035\",\n", + " \"scheduled_date\": \"2023-12-17\",\n", + " \"appointment_date\": \"2024-01-09\",\n", + " \"lead_time\": 23,\n", + " \"sms_reminders_sent\": 2,\n", + " \"patient_age\": 68,\n", + " \"gender\": \"Female\",\n", + " \"health_condition_severity\": 5,\n", + " \"no_show_status\": false\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID036\",\n", + " \"scheduled_date\": \"2023-12-19\",\n", + " \"appointment_date\": \"2024-01-10\",\n", + " \"lead_time\": 22,\n", + " \"sms_reminders_sent\": 0,\n", + " \"patient_age\": 37,\n", + " \"gender\": \"Male\",\n", + " \"health_condition_severity\": 2,\n", + " \"no_show_status\": true\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID037\",\n", + " \"scheduled_date\": \"2023-12-20\",\n", + " \"appointment_date\": \"2024-01-12\",\n", + " \"lead_time\": 23,\n", + " \"sms_reminders_sent\": 1,\n", + " \"patient_age\": 57,\n", + " \"gender\": \"Female\",\n", + " \"health_condition_severity\": 1,\n", + " \"no_show_status\": false\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID038\",\n", + " \"scheduled_date\": \"2023-12-22\",\n", + " \"appointment_date\": \"2024-01-14\",\n", + " \"lead_time\": 23,\n", + " \"sms_reminders_sent\": 3,\n", + " \"patient_age\": 43,\n", + " \"gender\": \"Other\",\n", + " \"health_condition_severity\": 3,\n", + " \"no_show_status\": true\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID039\",\n", + " \"scheduled_date\": \"2023-12-23\",\n", + " \"appointment_date\": \"2024-01-16\",\n", + " 
\"lead_time\": 24,\n", + " \"sms_reminders_sent\": 2,\n", + " \"patient_age\": 65,\n", + " \"gender\": \"Male\",\n", + " \"health_condition_severity\": 4,\n", + " \"no_show_status\": false\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID040\",\n", + " \"scheduled_date\": \"2023-12-25\",\n", + " \"appointment_date\": \"2024-01-17\",\n", + " \"lead_time\": 23,\n", + " \"sms_reminders_sent\": 0,\n", + " \"patient_age\": 49,\n", + " \"gender\": \"Female\",\n", + " \"health_condition_severity\": 5,\n", + " \"no_show_status\": true\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID041\",\n", + " \"scheduled_date\": \"2023-12-27\",\n", + " \"appointment_date\": \"2024-01-20\",\n", + " \"lead_time\": 24,\n", + " \"sms_reminders_sent\": 1,\n", + " \"patient_age\": 30,\n", + " \"gender\": \"Male\",\n", + " \"health_condition_severity\": 2,\n", + " \"no_show_status\": false\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID042\",\n", + " \"scheduled_date\": \"2023-12-29\",\n", + " \"appointment_date\": \"2024-01-22\",\n", + " \"lead_time\": 24,\n", + " \"sms_reminders_sent\": 3,\n", + " \"patient_age\": 24,\n", + " \"gender\": \"Female\",\n", + " \"health_condition_severity\": 1,\n", + " \"no_show_status\": true\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID043\",\n", + " \"scheduled_date\": \"2024-01-01\",\n", + " \"appointment_date\": \"2024-01-25\",\n", + " \"lead_time\": 24,\n", + " \"sms_reminders_sent\": 2,\n", + " \"patient_age\": 72,\n", + " \"gender\": \"Other\",\n", + " \"health_condition_severity\": 3,\n", + " \"no_show_status\": false\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID044\",\n", + " \"scheduled_date\": \"2024-01-03\",\n", + " \"appointment_date\": \"2024-01-27\",\n", + " \"lead_time\": 24,\n", + " \"sms_reminders_sent\": 0,\n", + " \"patient_age\": 35,\n", + " \"gender\": \"Male\",\n", + " \"health_condition_severity\": 4,\n", + " \"no_show_status\": true\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID045\",\n", + " 
\"scheduled_date\": \"2024-01-04\",\n", + " \"appointment_date\": \"2024-01-28\",\n", + " \"lead_time\": 24,\n", + " \"sms_reminders_sent\": 1,\n", + " \"patient_age\": 61,\n", + " \"gender\": \"Female\",\n", + " \"health_condition_severity\": 5,\n", + " \"no_show_status\": false\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID046\",\n", + " \"scheduled_date\": \"2024-01-05\",\n", + " \"appointment_date\": \"2024-01-30\",\n", + " \"lead_time\": 25,\n", + " \"sms_reminders_sent\": 3,\n", + " \"patient_age\": 68,\n", + " \"gender\": \"Male\",\n", + " \"health_condition_severity\": 2,\n", + " \"no_show_status\": true\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID047\",\n", + " \"scheduled_date\": \"2024-01-07\",\n", + " \"appointment_date\": \"2024-02-01\",\n", + " \"lead_time\": 25,\n", + " \"sms_reminders_sent\": 2,\n", + " \"patient_age\": 22,\n", + " \"gender\": \"Female\",\n", + " \"health_condition_severity\": 1,\n", + " \"no_show_status\": false\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID048\",\n", + " \"scheduled_date\": \"2024-01-08\",\n", + " \"appointment_date\": \"2024-02-03\",\n", + " \"lead_time\": 26,\n", + " \"sms_reminders_sent\": 0,\n", + " \"patient_age\": 52,\n", + " \"gender\": \"Other\",\n", + " \"health_condition_severity\": 3,\n", + " \"no_show_status\": true\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID049\",\n", + " \"scheduled_date\": \"2024-01-10\",\n", + " \"appointment_date\": \"2024-02-04\",\n", + " \"lead_time\": 25,\n", + " \"sms_reminders_sent\": 1,\n", + " \"patient_age\": 73,\n", + " \"gender\": \"Male\",\n", + " \"health_condition_severity\": 4,\n", + " \"no_show_status\": false\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID050\",\n", + " \"scheduled_date\": \"2024-01-12\",\n", + " \"appointment_date\": \"2024-02-06\",\n", + " \"lead_time\": 25,\n", + " \"sms_reminders_sent\": 3,\n", + " \"patient_age\": 56,\n", + " \"gender\": \"Female\",\n", + " \"health_condition_severity\": 5,\n", + " 
\"no_show_status\": true\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID051\",\n", + " \"scheduled_date\": \"2024-01-15\",\n", + " \"appointment_date\": \"2024-02-07\",\n", + " \"lead_time\": 23,\n", + " \"sms_reminders_sent\": 1,\n", + " \"patient_age\": 62,\n", + " \"gender\": \"Male\",\n", + " \"health_condition_severity\": 2,\n", + " \"no_show_status\": false\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID052\",\n", + " \"scheduled_date\": \"2024-01-17\",\n", + " \"appointment_date\": \"2024-02-10\",\n", + " \"lead_time\": 24,\n", + " \"sms_reminders_sent\": 0,\n", + " \"patient_age\": 80,\n", + " \"gender\": \"Female\",\n", + " \"health_condition_severity\": 1,\n", + " \"no_show_status\": true\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID053\",\n", + " \"scheduled_date\": \"2024-01-19\",\n", + " \"appointment_date\": \"2024-02-12\",\n", + " \"lead_time\": 24,\n", + " \"sms_reminders_sent\": 2,\n", + " \"patient_age\": 29,\n", + " \"gender\": \"Other\",\n", + " \"health_condition_severity\": 3,\n", + " \"no_show_status\": false\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID054\",\n", + " \"scheduled_date\": \"2024-01-21\",\n", + " \"appointment_date\": \"2024-02-13\",\n", + " \"lead_time\": 23,\n", + " \"sms_reminders_sent\": 1,\n", + " \"patient_age\": 66,\n", + " \"gender\": \"Male\",\n", + " \"health_condition_severity\": 4,\n", + " \"no_show_status\": true\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID055\",\n", + " \"scheduled_date\": \"2024-01-23\",\n", + " \"appointment_date\": \"2024-02-15\",\n", + " \"lead_time\": 23,\n", + " \"sms_reminders_sent\": 3,\n", + " \"patient_age\": 77,\n", + " \"gender\": \"Female\",\n", + " \"health_condition_severity\": 5,\n", + " \"no_show_status\": false\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID056\",\n", + " \"scheduled_date\": \"2024-01-25\",\n", + " \"appointment_date\": \"2024-02-17\",\n", + " \"lead_time\": 23,\n", + " \"sms_reminders_sent\": 2,\n", + " 
\"patient_age\": 54,\n", + " \"gender\": \"Male\",\n", + " \"health_condition_severity\": 2,\n", + " \"no_show_status\": true\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID057\",\n", + " \"scheduled_date\": \"2024-01-28\",\n", + " \"appointment_date\": \"2024-02-19\",\n", + " \"lead_time\": 22,\n", + " \"sms_reminders_sent\": 0,\n", + " \"patient_age\": 28,\n", + " \"gender\": \"Female\",\n", + " \"health_condition_severity\": 1,\n", + " \"no_show_status\": false\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID058\",\n", + " \"scheduled_date\": \"2024-01-30\",\n", + " \"appointment_date\": \"2024-02-22\",\n", + " \"lead_time\": 23,\n", + " \"sms_reminders_sent\": 1,\n", + " \"patient_age\": 45,\n", + " \"gender\": \"Other\",\n", + " \"health_condition_severity\": 3,\n", + " \"no_show_status\": true\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID059\",\n", + " \"scheduled_date\": \"2024-02-01\",\n", + " \"appointment_date\": \"2024-02-24\",\n", + " \"lead_time\": 23,\n", + " \"sms_reminders_sent\": 3,\n", + " \"patient_age\": 69,\n", + " \"gender\": \"Male\",\n", + " \"health_condition_severity\": 4,\n", + " \"no_show_status\": false\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID060\",\n", + " \"scheduled_date\": \"2024-02-02\",\n", + " \"appointment_date\": \"2024-02-26\",\n", + " \"lead_time\": 24,\n", + " \"sms_reminders_sent\": 2,\n", + " \"patient_age\": 51,\n", + " \"gender\": \"Female\",\n", + " \"health_condition_severity\": 5,\n", + " \"no_show_status\": true\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID061\",\n", + " \"scheduled_date\": \"2024-02-04\",\n", + " \"appointment_date\": \"2024-02-27\",\n", + " \"lead_time\": 23,\n", + " \"sms_reminders_sent\": 1,\n", + " \"patient_age\": 33,\n", + " \"gender\": \"Male\",\n", + " \"health_condition_severity\": 2,\n", + " \"no_show_status\": false\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID062\",\n", + " \"scheduled_date\": \"2024-02-06\",\n", + " 
\"appointment_date\": \"2024-03-01\",\n", + " \"lead_time\": 24,\n", + " \"sms_reminders_sent\": 0,\n", + " \"patient_age\": 84,\n", + " \"gender\": \"Female\",\n", + " \"health_condition_severity\": 1,\n", + " \"no_show_status\": true\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID063\",\n", + " \"scheduled_date\": \"2024-02-09\",\n", + " \"appointment_date\": \"2024-03-04\",\n", + " \"lead_time\": 24,\n", + " \"sms_reminders_sent\": 3,\n", + " \"patient_age\": 47,\n", + " \"gender\": \"Male\",\n", + " \"health_condition_severity\": 3,\n", + " \"no_show_status\": false\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID064\",\n", + " \"scheduled_date\": \"2024-02-10\",\n", + " \"appointment_date\": \"2024-03-06\",\n", + " \"lead_time\": 25,\n", + " \"sms_reminders_sent\": 2,\n", + " \"patient_age\": 59,\n", + " \"gender\": \"Female\",\n", + " \"health_condition_severity\": 4,\n", + " \"no_show_status\": true\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID065\",\n", + " \"scheduled_date\": \"2024-02-12\",\n", + " \"appointment_date\": \"2024-03-08\",\n", + " \"lead_time\": 25,\n", + " \"sms_reminders_sent\": 0,\n", + " \"patient_age\": 20,\n", + " \"gender\": \"Other\",\n", + " \"health_condition_severity\": 5,\n", + " \"no_show_status\": false\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID066\",\n", + " \"scheduled_date\": \"2024-02-14\",\n", + " \"appointment_date\": \"2024-03-10\",\n", + " \"lead_time\": 25,\n", + " \"sms_reminders_sent\": 1,\n", + " \"patient_age\": 48,\n", + " \"gender\": \"Male\",\n", + " \"health_condition_severity\": 2,\n", + " \"no_show_status\": true\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID067\",\n", + " \"scheduled_date\": \"2024-02-17\",\n", + " \"appointment_date\": \"2024-03-12\",\n", + " \"lead_time\": 24,\n", + " \"sms_reminders_sent\": 2,\n", + " \"patient_age\": 38,\n", + " \"gender\": \"Female\",\n", + " \"health_condition_severity\": 1,\n", + " \"no_show_status\": false\n", + " },\n", + " 
{\n", + " \"appointment_id\": \"AID068\",\n", + " \"scheduled_date\": \"2024-02-19\",\n", + " \"appointment_date\": \"2024-03-14\",\n", + " \"lead_time\": 24,\n", + " \"sms_reminders_sent\": 3,\n", + " \"patient_age\": 76,\n", + " \"gender\": \"Male\",\n", + " \"health_condition_severity\": 3,\n", + " \"no_show_status\": true\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID069\",\n", + " \"scheduled_date\": \"2024-02-21\",\n", + " \"appointment_date\": \"2024-03-15\",\n", + " \"lead_time\": 22,\n", + " \"sms_reminders_sent\": 0,\n", + " \"patient_age\": 34,\n", + " \"gender\": \"Female\",\n", + " \"health_condition_severity\": 4,\n", + " \"no_show_status\": false\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID070\",\n", + " \"scheduled_date\": \"2024-02-23\",\n", + " \"appointment_date\": \"2024-03-17\",\n", + " \"lead_time\": 23,\n", + " \"sms_reminders_sent\": 1,\n", + " \"patient_age\": 26,\n", + " \"gender\": \"Male\",\n", + " \"health_condition_severity\": 5,\n", + " \"no_show_status\": true\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID071\",\n", + " \"scheduled_date\": \"2024-02-25\",\n", + " \"appointment_date\": \"2024-03-19\",\n", + " \"lead_time\": 22,\n", + " \"sms_reminders_sent\": 2,\n", + " \"patient_age\": 22,\n", + " \"gender\": \"Female\",\n", + " \"health_condition_severity\": 2,\n", + " \"no_show_status\": false\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID072\",\n", + " \"scheduled_date\": \"2024-02-27\",\n", + " \"appointment_date\": \"2024-03-20\",\n", + " \"lead_time\": 22,\n", + " \"sms_reminders_sent\": 0,\n", + " \"patient_age\": 58,\n", + " \"gender\": \"Other\",\n", + " \"health_condition_severity\": 1,\n", + " \"no_show_status\": true\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID073\",\n", + " \"scheduled_date\": \"2024-02-29\",\n", + " \"appointment_date\": \"2024-03-22\",\n", + " \"lead_time\": 22,\n", + " \"sms_reminders_sent\": 3,\n", + " \"patient_age\": 67,\n", + " \"gender\": \"Male\",\n", 
+ " \"health_condition_severity\": 3,\n", + " \"no_show_status\": false\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID074\",\n", + " \"scheduled_date\": \"2024-03-02\",\n", + " \"appointment_date\": \"2024-03-24\",\n", + " \"lead_time\": 22,\n", + " \"sms_reminders_sent\": 2,\n", + " \"patient_age\": 32,\n", + " \"gender\": \"Female\",\n", + " \"health_condition_severity\": 4,\n", + " \"no_show_status\": true\n", + " },\n", + " {\n", + " \"appointment_id\": \"AID075\",\n", + " \"scheduled_date\": \"2024-03-04\",\n", + " \"appointment_date\": \"2024-03-26\",\n", + " \"lead_time\": 22,\n", + " \"sms_reminders_sent\": 1,\n", + " \"patient_age\": 46,\n", + " \"gender\": \"Male\",\n", + " \"health_condition_severity\": 5,\n", + " \"no_show_status\": false\n", + " }\n", + "]\n", + "medical_appointment.json\n", + "scenario: Create a dataset of credit card transactions for detecting fraud. Include transaction ID, amount, timestamp, merchant category, customer location, card presence (yes/no), transaction device type, and fraud label (yes/no).\n", + "model_type: claude\n", + "model_name: claude-3-7-sonnet-latest\n", + "scenario: Create a dataset of credit card transactions for detecting fraud. 
Include transaction ID, amount, timestamp, merchant category, customer location, card presence (yes/no), transaction device type, and fraud label (yes/no).\n", + "model_type: claude\n", + "model_name: claude-3-7-sonnet-latest\n", + "transaction_id,amount,timestamp,merchant_category,customer_location,card_presence,device_type,fraud_label\n", + "TX123456789,45.99,2023-11-01 08:23:15,Retail,New York,Yes,POS Terminal,No\n", + "TX123456790,899.50,2023-11-01 09:45:22,Electronics,Chicago,Yes,POS Terminal,No\n", + "TX123456791,12.35,2023-11-01 10:12:45,Food & Beverage,Los Angeles,No,Mobile,No\n", + "TX123456792,5423.80,2023-11-01 11:30:18,Jewelry,Miami,No,Web Browser,Yes\n", + "TX123456793,76.24,2023-11-01 14:22:56,Groceries,Denver,Yes,POS Terminal,No\n", + "TX123456794,149.99,2023-11-02 07:15:33,Clothing,Seattle,No,Mobile,No\n", + "TX123456795,2500.00,2023-11-02 08:45:12,Electronics,Toronto,No,Web Browser,Yes\n", + "TX123456796,35.50,2023-11-02 12:33:47,Food & Beverage,Boston,Yes,POS Terminal,No\n", + "TX123456797,10.99,2023-11-02 15:20:09,Entertainment,Philadelphia,No,Mobile,No\n", + "TX123456798,750.25,2023-11-02 16:45:18,Travel,San Francisco,No,Web Browser,No\n", + "TX123456799,65.40,2023-11-02 19:22:31,Retail,Austin,Yes,POS Terminal,No\n", + "TX123456800,3299.99,2023-11-03 05:45:22,Electronics,London,No,Web Browser,Yes\n", + "TX123456801,22.50,2023-11-03 08:12:40,Food & Beverage,Atlanta,Yes,POS Terminal,No\n", + "TX123456802,129.95,2023-11-03 10:33:27,Clothing,Chicago,No,Mobile,No\n", + "TX123456803,50.00,2023-11-03 12:15:39,Gas Station,Dallas,Yes,POS Terminal,No\n", + "TX123456804,1999.00,2023-11-03 14:30:45,Electronics,Singapore,No,Web Browser,No\n", + "TX123456805,8.75,2023-11-03 18:22:14,Food & Beverage,Montreal,No,Mobile,No\n", + "TX123456806,459.99,2023-11-04 09:15:33,Home Goods,Houston,Yes,POS Terminal,No\n", + "TX123456807,2750.00,2023-11-04 10:45:28,Travel,Paris,No,Web Browser,Yes\n", + "TX123456808,85.00,2023-11-04 11:33:52,Healthcare,New York,Yes,POS 
Terminal,No\n", + "TX123456809,17.25,2023-11-04 13:10:44,Food & Beverage,Los Angeles,No,Mobile,No\n", + "TX123456810,150.49,2023-11-04 15:22:18,Entertainment,Miami,No,Mobile,No\n", + "TX123456811,4500.00,2023-11-04 19:45:02,Jewelry,Dubai,No,Web Browser,Yes\n", + "TX123456812,27.99,2023-11-05 08:33:27,Groceries,Seattle,Yes,POS Terminal,No\n", + "TX123456813,1250.00,2023-11-05 10:15:42,Electronics,Tokyo,No,Web Browser,No\n", + "TX123456814,56.75,2023-11-05 12:20:35,Clothing,San Diego,No,Mobile,No\n", + "TX123456815,18.50,2023-11-05 14:30:19,Food & Beverage,Denver,Yes,POS Terminal,No\n", + "TX123456816,3750.25,2023-11-05 16:45:08,Travel,Sydney,No,Web Browser,Yes\n", + "TX123456817,95.00,2023-11-05 18:22:56,Healthcare,Boston,No,Mobile,No\n", + "TX123456818,2345.67,2023-11-05 20:15:33,Electronics,Berlin,No,Web Browser,Yes\n", + "fraud_transactions.csv\n", + "scenario: Generate a dataset of investment customers with fields like portfolio value, age, income bracket, risk appetite (low/medium/high), number of transactions per month, preferred investment types, and risk score.\n", + "model_type: gemini\n", + "model_name: gemini-1.5-pro\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "WARNING:tornado.access:429 POST /v1beta/models/gemini-1.5-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 409.67ms\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/lib/python3.11/dist-packages/gradio/queueing.py\", line 625, in process_events\n", + " response = await route_utils.call_process_api(\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/usr/local/lib/python3.11/dist-packages/gradio/route_utils.py\", line 322, in call_process_api\n", + " output = await app.get_blocks().process_api(\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/usr/local/lib/python3.11/dist-packages/gradio/blocks.py\", line 2181, in process_api\n", + " result = await self.call_function(\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File 
\"/usr/local/lib/python3.11/dist-packages/gradio/blocks.py\", line 1692, in call_function\n", + " prediction = await anyio.to_thread.run_sync( # type: ignore\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/usr/local/lib/python3.11/dist-packages/anyio/to_thread.py\", line 56, in run_sync\n", + " return await get_async_backend().run_sync_in_worker_thread(\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/usr/local/lib/python3.11/dist-packages/anyio/_backends/_asyncio.py\", line 2470, in run_sync_in_worker_thread\n", + " return await future\n", + " ^^^^^^^^^^^^\n", + " File \"/usr/local/lib/python3.11/dist-packages/anyio/_backends/_asyncio.py\", line 967, in run\n", + " result = context.run(func, *args)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/usr/local/lib/python3.11/dist-packages/gradio/utils.py\", line 889, in wrapper\n", + " response = f(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^\n", + " File \"\", line 62, in run_inference\n", + " response = get_gemini_response(prompt=user_prompt, model_name=model_name, output_tokens=output_tokens)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"\", line 30, in get_gemini_response\n", + " response = model.generate_content(prompt, generation_config={\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/usr/local/lib/python3.11/dist-packages/google/generativeai/generative_models.py\", line 331, in generate_content\n", + " response = self._client.generate_content(\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/usr/local/lib/python3.11/dist-packages/google/ai/generativelanguage_v1beta/services/generative_service/client.py\", line 835, in generate_content\n", + " response = rpc(\n", + " ^^^^\n", + " File \"/usr/local/lib/python3.11/dist-packages/google/api_core/gapic_v1/method.py\", line 131, in __call__\n", + " return wrapped_func(*args, **kwargs)\n", + " 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/usr/local/lib/python3.11/dist-packages/google/api_core/retry/retry_unary.py\", line 293, in retry_wrapped_func\n", + " return retry_target(\n", + " ^^^^^^^^^^^^^\n", + " File \"/usr/local/lib/python3.11/dist-packages/google/api_core/retry/retry_unary.py\", line 153, in retry_target\n", + " _retry_error_helper(\n", + " File \"/usr/local/lib/python3.11/dist-packages/google/api_core/retry/retry_base.py\", line 212, in _retry_error_helper\n", + " raise final_exc from source_exc\n", + " File \"/usr/local/lib/python3.11/dist-packages/google/api_core/retry/retry_unary.py\", line 144, in retry_target\n", + " result = target()\n", + " ^^^^^^^^\n", + " File \"/usr/local/lib/python3.11/dist-packages/google/api_core/timeout.py\", line 130, in func_with_timeout\n", + " return func(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/usr/local/lib/python3.11/dist-packages/google/api_core/grpc_helpers.py\", line 76, in error_remapped_callable\n", + " return callable_(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/usr/local/lib/python3.11/dist-packages/google/ai/generativelanguage_v1beta/services/generative_service/transports/rest.py\", line 1161, in __call__\n", + " raise core_exceptions.from_http_response(response)\n", + "google.api_core.exceptions.TooManyRequests: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. 
For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "scenario: Generate a dataset of investment customers with fields like portfolio value, age, income bracket, risk appetite (low/medium/high), number of transactions per month, preferred investment types, and risk score.\n", + "model_type: gemini\n", + "model_name: gemini-2.0-flash\n", + "CustomerID,PortfolioValue,Age,IncomeBracket,RiskAppetite,TransactionsPerMonth,PreferredInvestmentType,RiskScore\n", + "1,75000.00,32,Medium,High,8,\"Stocks, Options\",78\n", + "2,120000.50,45,High,Medium,3,\"Bonds, Mutual Funds\",55\n", + "3,30000.75,28,Low,Low,1,\"Bonds\",25\n", + "4,250000.00,58,High,High,12,\"Stocks, Real Estate\",85\n", + "5,80000.25,39,Medium,Medium,5,\"Mutual Funds\",60\n", + "6,150000.00,48,High,Low,2,\"Bonds, ETFs\",40\n", + "7,45000.50,25,Low,Medium,4,\"Stocks\",50\n", + "8,300000.75,62,High,High,15,\"Stocks, Options, Real Estate\",92\n", + "9,90000.00,35,Medium,Medium,6,\"ETFs, Mutual Funds\",65\n", + "10,180000.25,50,High,Low,1,\"Bonds\",35\n", + "11,60000.50,29,Low,Low,2,\"Bonds, ETFs\",30\n", + "12,400000.00,65,High,High,18,\"Stocks, Options, Cryptocurrency\",95\n", + "13,100000.75,42,Medium,Medium,7,\"Mutual Funds, Real Estate\",70\n", + "14,200000.00,55,High,Low,0,\"Bonds, Annuities\",20\n", + "15,70000.25,31,Low,Medium,3,\"Stocks, ETFs\",58\n", + "16,130000.50,47,High,Medium,4,\"Bonds, Mutual Funds\",52\n", + "17,35000.75,27,Low,Low,1,\"Bonds\",28\n", + "18,280000.00,60,High,High,14,\"Stocks, Real Estate\",88\n", + "19,85000.25,37,Medium,Medium,5,\"ETFs\",63\n", + "20,160000.00,52,High,Low,2,\"Bonds, CDs\",38\n", + "21,50000.50,26,Low,Low,1,\"Bonds, Government Securities\",22\n", + "22,450000.75,68,High,High,20,\"Stocks, Options, Venture Capital\",97\n", + "23,110000.00,44,Medium,Medium,8,\"Mutual Funds, ETFs\",73\n", + "24,220000.25,57,High,Low,0,\"Bonds, Treasury Bills\",18\n", 
+ "25,72000.50,33,Low,Medium,4,\"Stocks\",56\n", + "26,140000.00,49,High,Medium,3,\"Bonds, Mutual Funds\",54\n", + "27,32000.75,29,Low,Low,1,\"Bonds\",26\n", + "28,260000.00,61,High,High,13,\"Stocks, Real Estate\",86\n", + "29,82000.25,38,Medium,Medium,6,\"ETFs, Index Funds\",61\n", + "30,170000.50,53,High,Low,2,\"Bonds\",36\n", + "31,55000.75,24,Low,Low,2,\"Bonds, Money Market Accounts\",24\n", + "32,350000.00,64,High,High,17,\"Stocks, Options, Commodities\",93\n", + "33,95000.25,41,Medium,Medium,7,\"Mutual Funds, REITs\",68\n", + "34,190000.50,56,High,Low,0,\"Bonds, Fixed Income\",19\n", + "35,65000.00,30,Low,Medium,3,\"Stocks, Small Cap Stocks\",59\n", + "36,125000.75,46,High,Medium,4,\"Bonds, Large Cap Funds\",51\n", + "37,33000.25,28,Low,Low,1,\"Bonds\",27\n", + "38,270000.50,59,High,High,14,\"Stocks, Emerging Markets\",87\n", + "39,88000.00,36,Medium,Medium,5,\"ETFs, Balanced Funds\",64\n", + "40,155000.75,51,High,Low,2,\"Bonds, Corporate Bonds\",37\n", + "41,48000.25,25,Low,Low,1,\"Bonds, Municipal Bonds\",21\n", + "42,420000.00,67,High,High,19,\"Stocks, Options, Derivatives\",96\n", + "43,105000.75,43,Medium,Medium,8,\"Mutual Funds, Sector Funds\",71\n", + "44,210000.00,54,High,Low,0,\"Bonds, Government Bonds\",17\n", + "45,71000.25,32,Low,Medium,4,\"Stocks\",57\n", + "46,135000.50,48,High,Medium,3,\"Bonds, Index Funds\",53\n", + "47,34000.75,27,Low,Low,1,\"Bonds\",29\n", + "48,290000.00,63,High,High,16,\"Stocks, Real Estate, Private Equity\",90\n", + "49,89000.25,40,Medium,Medium,6,\"ETFs\",62\n", + "50,175000.50,50,High,Low,2,\"Bonds, Preferred Stocks\",39\n", + "investment_customers.csv\n", + "scenario: Generate a dataset for predicting customer churn in a subscription-based telecom company. 
Include features like monthly charges, contract type, tenure (in months), number of support calls, internet usage (in GB), payment method, and whether the customer has churned.\n", + "model_type: gemini\n", + "model_name: gemini-2.0-flash\n", + "scenario: Generate a dataset for predicting customer churn in a subscription-based telecom company. Include features like monthly charges, contract type, tenure (in months), number of support calls, internet usage (in GB), payment method, and whether the customer has churned.\n", + "model_type: gemini\n", + "model_name: gemini-2.0-flash\n", + "\n", + "testinggemini.json\n", + "scenario: Generate a dataset for predicting customer churn in a subscription-based telecom company. Include features like monthly charges, contract type, tenure (in months), number of support calls, internet usage (in GB), payment method, and whether the customer has churned.\n", + "model_type: gemini\n", + "model_name: gemini-2.0-flash\n", + "CustomerID,MonthlyCharges,ContractType,Tenure,SupportCalls,InternetUsage,PaymentMethod,Churned\n", + "TEL2847592374,67.55,Month-to-Month,9,3,145.2,Electronic Check,Yes\n", + "TEL9283746510,92.30,One Year,48,1,87.9,Credit Card,No\n", + "TEL1837465921,25.00,Month-to-Month,2,0,25.6,Mailed Check,Yes\n", + "TEL7364582910,115.75,Two Year,65,2,203.4,Bank Transfer,No\n", + "TEL5928374615,48.20,Month-to-Month,15,4,98.7,Electronic Check,Yes\n", + "TEL3847592016,78.90,One Year,36,1,167.1,Credit Card,No\n", + "TEL8273645910,31.50,Month-to-Month,3,0,30.2,Mailed Check,Yes\n", + "TEL6354789210,102.40,Two Year,70,3,185.9,Bank Transfer,No\n", + "TEL4738291056,55.85,Month-to-Month,11,2,112.5,Electronic Check,Yes\n", + "TEL1928374650,85.60,One Year,42,1,76.3,Credit Card,No\n", + "TEL7463529108,28.75,Month-to-Month,5,0,28.9,Mailed Check,Yes\n", + "TEL5293847610,110.30,Two Year,68,2,192.7,Bank Transfer,No\n", + "TEL3647582910,62.10,Month-to-Month,13,3,134.8,Electronic Check,Yes\n", + "TEL9182736450,98.45,One Year,39,1,91.5,Credit 
Card,No\n", + "TEL2736458109,34.90,Month-to-Month,7,0,33.6,Mailed Check,Yes\n", + "TEL8547392016,107.60,Two Year,62,2,179.3,Bank Transfer,No\n", + "TEL6192837450,59.35,Month-to-Month,10,3,123.4,Electronic Check,Yes\n", + "TEL4928374651,82.90,One Year,45,1,82.1,Credit Card,No\n", + "TEL1635294810,22.50,Month-to-Month,4,0,22.3,Mailed Check,Yes\n", + "TEL7283746509,118.20,Two Year,71,2,210.5,Bank Transfer,No\n", + "TEL5829374610,69.70,Month-to-Month,12,3,156.9,Electronic Check,Yes\n", + "TEL3918273640,95.15,One Year,40,1,89.7,Credit Card,No\n", + "TEL9374628105,37.40,Month-to-Month,6,0,36.2,Mailed Check,Yes\n", + "TEL6458293710,104.90,Two Year,67,2,188.1,Bank Transfer,No\n", + "TEL4829374615,57.10,Month-to-Month,14,3,118.2,Electronic Check,Yes\n", + "TEL1536472910,80.55,One Year,43,1,78.9,Credit Card,No\n", + "TEL7192837465,25.30,Month-to-Month,2,0,25.9,Mailed Check,Yes\n", + "TEL5374829106,112.90,Two Year,69,2,195.3,Bank Transfer,No\n", + "TEL3746582910,64.85,Month-to-Month,8,3,140.6,Electronic Check,Yes\n", + "TEL9263548107,90.20,One Year,46,1,85.5,Credit Card,No\n", + "TEL2635478109,32.65,Month-to-Month,4,0,31.4,Mailed Check,Yes\n", + "TEL8473920165,109.70,Two Year,63,2,182.5,Bank Transfer,No\n", + "TEL6283749105,54.50,Month-to-Month,16,3,110.1,Electronic Check,Yes\n", + "TEL4192837460,77.30,One Year,41,1,75.2,Credit Card,No\n", + "TEL1746352910,29.90,Month-to-Month,5,0,29.6,Mailed Check,Yes\n", + "TEL7382910564,117.10,Two Year,72,2,207.9,Bank Transfer,No\n", + "TEL5928374610,72.00,Month-to-Month,13,3,159.7,Electronic Check,Yes\n", + "TEL3847592016,97.85,One Year,38,1,93.2,Credit Card,No\n", + "TEL9182736450,39.55,Month-to-Month,7,0,38.3,Mailed Check,Yes\n", + "TEL6354789210,106.30,Two Year,66,2,190.8,Bank Transfer,No\n", + "TEL4738291056,51.75,Month-to-Month,11,3,105.9,Electronic Check,Yes\n", + "TEL1928374650,74.60,One Year,44,1,73.1,Credit Card,No\n", + "TEL7463529108,27.10,Month-to-Month,3,0,26.7,Mailed Check,Yes\n", + "TEL5293847610,114.50,Two 
Year,70,2,198.6,Bank Transfer,No\n", + "TEL3647582910,66.45,Month-to-Month,12,3,138.5,Electronic Check,Yes\n", + "TEL9182736450,93.50,One Year,47,1,84.2,Credit Card,No\n", + "TEL2736458109,35.15,Month-to-Month,6,0,34.9,Mailed Check,Yes\n", + "TEL8547392016,103.80,Two Year,64,2,176.1,Bank Transfer,No\n", + "TEL6192837450,58.20,Month-to-Month,14,3,120.7,Electronic Check,Yes\n", + "TEL4928374651,81.65,One Year,41,1,80.5,Credit Card,No\n", + "TEL1635294810,23.70,Month-to-Month,5,0,23.4,Mailed Check,Yes\n", + "TEL7283746509,119.90,Two Year,68,2,213.2,Bank Transfer,No\n", + "TEL5829374610,70.85,Month-to-Month,9,3,153.7,Electronic Check,Yes\n", + "TEL3918273640,96.20,One Year,45,1,92.4,Credit Card,No\n", + "TEL9374628105,36.80,Month-to-Month,7,0,35.6,Mailed Check,Yes\n", + "TEL6458293710,105.50,Two Year,69,2,185.4,Bank Transfer,No\n", + "TEL4829374615,56.30,Month-to-Month,15,3,115.1,Electronic Check,Yes\n", + "TEL1536472910,79.40,One Year,42,1,77.8,Credit Card,No\n", + "TEL7192837465,24.50,Month-to-Month,4,0,24.2,Mailed Check,Yes\n", + "TEL5374829106,111.80,Two Year,67,2,193.9,Bank Transfer,No\n", + "TEL3746582910,63.70,Month-to-Month,10,3,137.4,Electronic Check,Yes\n", + "TEL9263548107,89.10,One Year,40,1,83.9,Credit Card,No\n", + "TEL2635478109,33.85,Month-to-Month,6,0,32.5,Mailed Check,Yes\n", + "TEL8473920165,108.60,Two Year,65,2,179.9,Bank Transfer,No\n", + "TEL6283749105,53.40,Month-to-Month,11,3,107.8,Electronic Check,Yes\n", + "TEL4192837460,76.20,One Year,43,1,74.1,Credit Card,No\n", + "TEL1746352910,30.50,Month-to-Month,5,0,30.2,Mailed Check,Yes\n", + "TEL7382910564,116.00,Two Year,71,2,205.3,Bank Transfer,No\n", + "TEL5928374610,71.15,Month-to-Month,16,3,157.6,Electronic Check,Yes\n", + "TEL3847592016,97.00,One Year,39,1,90.9,Credit Card,No\n", + "TEL9182736450,38.70,Month-to-Month,3,0,37.4,Mailed Check,Yes\n", + "TEL6354789210,105.20,Two Year,68,2,188.7,Bank Transfer,No\n", + "TEL4738291056,52.55,Month-to-Month,14,3,104.2,Electronic Check,Yes\n", + 
"TEL1928374650,75.40,One Year,46,1,72.4,Credit Card,No\n", + "TEL7463529108,26.30,Month-to-Month,2,0,26.0,Mailed Check,Yes\n", + "TEL5293847610,113.70,Two Year,66,2,196.8,Bank Transfer,No\n", + "TEL3647582910,65.60,Month-to-Month,15,3,139.1,Electronic Check,Yes\n", + "TEL9182736450,94.35,One Year,42,1,86.8,Credit Card,No\n", + "TEL2736458109,34.30,Month-to-Month,4,0,34.0,Mailed Check,Yes\n", + "TEL8547392016,102.70,Two Year,63,2,173.5,Bank Transfer,No\n", + "TEL6192837450,59.90,Month-to-Month,13,3,121.3,Electronic Check,Yes\n", + "TEL4928374651,82.20,One Year,47,1,79.2,Credit Card,No\n", + "TEL1635294810,23.10,Month-to-Month,6,0,22.8,Mailed Check,Yes\n", + "TEL7283746509,119.30,Two Year,69,2,211.6,Bank Transfer,No\n", + "TEL5829374610,71.40,Month-to-Month,10,3,154.3,Electronic Check,Yes\n", + "TEL3918273640,96.70,One Year,44,1,91.7,Credit Card,No\n", + "TEL9374628105,37.10,Month-to-Month,5,0,36.8,Mailed Check,Yes\n", + "TEL6458293710,106.00,Two Year,70,2,186.1,Bank Transfer,No\n", + "TEL4829374615,55.70,Month-to-Month,12,3,112.0,Electronic Check,Yes\n", + "TEL1536472910,78.80,One Year,41,1,76.5,Credit Card,No\n", + "TEL7192837465,25.00,Month-to-Month,7,0,24.7,Mailed Check,Yes\n", + "TEL5374829106,111.20,Two Year,64,2,191.3,Bank Transfer,No\n", + "TEL3746582910,64.20,Month-to-Month,14,3,136.1,Electronic Check,Yes\n", + "TEL9263548107,90.80,One Year,43,1,82.6,Credit Card,No\n", + "TEL2635478109,33.20,Month-to-Month,5,0,31.9,Mailed Check,Yes\n", + "TEL8473920165,109.10,Two Year,67,2,177.4,Bank Transfer,No\n", + "TEL6283749105,54.00,Month-to-Month,16,3,109.4,Electronic Check,Yes\n", + "TEL4192837460,75.60,One Year,40,1,73.4,Credit Card,No\n", + "TEL1746352910,31.10,Month-to-Month,3,0,30.8,Mailed Check,Yes\n", + "TEL7382910564,115.40,Two Year,65,2,202.7,Bank Transfer,No\n", + "testinggemini.txt\n", + "Keyboard interruption in main thread... 
closing server.\n", + "Killing tunnel 127.0.0.1:7860 <> https://d076a9fef9034a4f24.gradio.live\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [] + }, + "metadata": {}, + "execution_count": 11 + } + ], + "source": [ + "# Example Scenarios\n", + "\n", + "# Generate a dataset for predicting customer churn in a subscription-based telecom company. Include features like monthly charges, contract type, tenure (in months), number of support calls, internet usage (in GB), payment method, and whether the customer has churned.\n", + "# Generate a dataset for training a model to approve/reject loan applications. Include features like loan amount, applicant income, co-applicant income, employment type, credit history (binary), loan term, number of dependents, education level, and loan approval status.\n", + "# Create a dataset of credit card transactions for detecting fraud. Include transaction ID, amount, timestamp, merchant category, customer location, card presence (yes/no), transaction device type, and fraud label (yes/no).\n", + "# Generate a dataset of investment customers with fields like portfolio value, age, income bracket, risk appetite (low/medium/high), number of transactions per month, preferred investment types, and risk score.\n", + "# Create a dataset of hospitalized patients to predict readmission within 30 days. Include patient ID, age, gender, number of prior admissions, diagnosis codes, length of stay, discharge type, medications prescribed, and readmission label.\n", + "# Generate a dataset for predicting medical appointment no-shows. 
Include appointment ID, scheduled date, appointment date, lead time (days between scheduling and appointment), SMS reminders sent, patient age, gender, health condition severity, and no-show status.\n", + "\n", + "generator_ui.launch(share=True, debug=True, inbrowser=True)" + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "_9HIC_AzfZBZ" + }, + "execution_count": null, + "outputs": [] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file From 045d1e4b36fd6c2130d9745a917de7987fff62be Mon Sep 17 00:00:00 2001 From: SABEEH Shaikh Date: Sat, 24 May 2025 17:37:22 +0200 Subject: [PATCH 02/23] Cleared output of all cells as per feedback given --- .../llm_dataset_generator.ipynb | 2402 +++++------------ 1 file changed, 603 insertions(+), 1799 deletions(-) diff --git a/week3/community-contributions/llm_dataset_generator.ipynb b/week3/community-contributions/llm_dataset_generator.ipynb index 3de4ce1..c407ad4 100644 --- a/week3/community-contributions/llm_dataset_generator.ipynb +++ b/week3/community-contributions/llm_dataset_generator.ipynb @@ -1,1801 +1,605 @@ { - "cells": [ - { - "cell_type": "markdown", - "source": [ - "# Synthetic Data Generator Notebook\n", - "## About\n", - "This colab notebook demonstrates the use of Frontier and Open-source LLM models for generating synthetic dataset for a business scenario provided by the user. From a UI interface implemented in gradio, a user can define their business scenario in detail, select the number of records needed along with the its format and adjust the number of max output tokens to be generated by the chosen LLM.\n", - "\n", - "It does not stop here. 
Once the records have been produced in the LLM output, it can be extracted and stored in a file, format same as set by user before. The file is stored in colab notebook under the contents directory. All of this is extraction is done with the help of the 're' library. My first time using it and I totally enjoyed learning it.\n", - "\n", - "## Outlook\n", - "Sometimes the response is loaded with the user prompt and a lot of tags when using an open-source models, such as Mixtral from Mistral. This is because of the prompt format being used. The 'assistant' 'role' format does not suit them. This is an optimization to look for and can be easily done by using custom prompt template for such models and these templates are hinted on their huggingface repo." - ], - "metadata": { - "id": "SFA6R-4jL7SS" - } - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ip4I4Lff3B2M" - }, - "source": [ - "## Install & Imports" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "8zVlW-GMcBaU", - "outputId": "0c473564-fb93-41a9-c819-e6aa2382d75a" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m54.2/54.2 MB\u001b[0m \u001b[31m9.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m323.1/323.1 kB\u001b[0m \u001b[31m12.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m264.0/264.0 kB\u001b[0m \u001b[31m9.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m363.4/363.4 MB\u001b[0m \u001b[31m1.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.8/13.8 
MB\u001b[0m \u001b[31m95.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m24.6/24.6 MB\u001b[0m \u001b[31m78.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m883.7/883.7 kB\u001b[0m \u001b[31m48.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m664.8/664.8 MB\u001b[0m \u001b[31m1.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m211.5/211.5 MB\u001b[0m \u001b[31m3.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.3/56.3 MB\u001b[0m \u001b[31m8.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m127.9/127.9 MB\u001b[0m \u001b[31m6.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m207.5/207.5 MB\u001b[0m \u001b[31m6.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.1/21.1 MB\u001b[0m \u001b[31m83.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.1/76.1 MB\u001b[0m \u001b[31m9.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m95.2/95.2 kB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m11.6/11.6 MB\u001b[0m \u001b[31m95.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m72.0/72.0 kB\u001b[0m \u001b[31m5.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.5/62.5 kB\u001b[0m \u001b[31m5.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h" - ] - } - ], - "source": [ - "!pip install -q gradio anthropic requests torch bitsandbytes transformers accelerate openai" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "YKVNzE5sFH2l" - }, - "outputs": [], - "source": [ - "# imports\n", - "import re\n", - "import os\n", - "import sys\n", - "import gc\n", - "import io\n", - "import json\n", - "import anthropic\n", - "import gradio as gr\n", - "import requests\n", - "import subprocess\n", - "import google.generativeai as ggai\n", - "import torch\n", - "import tempfile\n", - "import shutil\n", - "from io import StringIO\n", - "import pandas as pd\n", - "from google.colab import userdata\n", - "from huggingface_hub import login\n", - "from openai import OpenAI\n", - "from pathlib import Path\n", - "from datetime import datetime\n", - "from IPython.display import Markdown, display, update_display\n", - "from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LWpD6bZv3mAR" - }, - "source": [ - "## HuggingFace Setup" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "id": "aeC2oWY2FTv7" - }, - "outputs": [], - "source": [ - "# Sign in to HuggingFace Hub\n", - "\n", - "hf_token = userdata.get('HF_TOKEN')\n", - "login(hf_token, add_to_git_credential=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8Au2UPVy3vn5" - }, - "source": [ - "## Frontier Models configuration" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "id": "geBBsd14X3UL" - }, - "outputs": [], - "source": [ - 
"openai_client = OpenAI(api_key=userdata.get('OPENAI_API_KEY'))\n", - "anthropic_client = anthropic.Anthropic(api_key=userdata.get('ANTHROPIC_API_KEY'))\n", - "ggai.configure(api_key=userdata.get('GOOGLE_API_KEY'))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tCnDIOlKgjbO" - }, - "source": [ - "## Defining Prompts" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": "gkwXZsxofAU1" - }, - "outputs": [], - "source": [ - "system_prompt = \"\"\"\n", - "You are a synthetic dataset generator. Your role is to create synthetic dataset that infers structured data schemas from business scenarios given by the user.\n", - "\n", - "Your task is to:\n", - "1. Understand the user's business problem(s) or use case(s).\n", - "2. Identify the key fields needed to support that scenario.\n", - "3. Define appropriate field names, data types, and formats.\n", - "4. Generate synthetic records that match the inferred schema.\n", - "\n", - "Guidelines:\n", - "- Use realistic field names and values. Do not invent unrelated fields or values.\n", - "- Choose sensible data types: string, integer, float, date, boolean, enum, etc.\n", - "- Respect logical constraints (e.g., age range, date ranges, email formats).\n", - "- Output the dataset in the format the user requests (json, csv, txt, markdown table).\n", - "- If the scenario is vague or broad, make reasonable assumptions and explain them briefly before generating the dataset.\n", - "- Always generate a dataset that supports the business use case logically.\n", - "\n", - "Before generating the data, display the inferred schema in a readable format.\n", - "\"\"\"\n", - "\n", - "# trial_user_prompt = \"I’m building a churn prediction model for a telecom company. 
Can you generate a synthetic dataset with 100 rows?\"\n", - "def get_user_prompt(business_problem, no_of_samples, file_format):\n", - " return f\"\"\"\n", - " The business scenario for which I want you to generate a dataset is defined below:\n", - " {business_problem}\n", - "\n", - " Generate a synthetic dataset of {no_of_samples} records in {file_format} format.\n", - " When generating the dataset, wrap it between the '<<<>>>' tag. Make sure the tag is there in the output.\n", - " Do not include any other special characters in between the tags, other than the ones required in producing the correct format of data.\n", - " For examples: When a 'csv' format is given, only the ',' character can be used in between the tags.\n", - " \"\"\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "yNpVf9-oQdoO" - }, - "source": [ - "### Quanitzation Config" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "3ErZ315MQdU3" - }, - "outputs": [], - "source": [ - "# This allows us to load the model into memory and use less memory\n", - "def get_quantization_config():\n", - " return BitsAndBytesConfig(\n", - " load_in_4bit=True,\n", - " bnb_4bit_use_double_quant=True,\n", - " bnb_4bit_compute_dtype=torch.bfloat16,\n", - " bnb_4bit_quant_type=\"nf4\"\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "clGtRh0N4951" - }, - "source": [ - "## HF Model inference" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "MAhyn1ehb3Dh" - }, - "outputs": [], - "source": [ - "# All in one HuggingFace Model Response function\n", - "def run_hfmodel_and_get_response(prompt, model_name, output_tokens):\n", - " tokenizer = AutoTokenizer.from_pretrained(model_name)\n", - " tokenizer.pad_token = tokenizer.eos_token\n", - " inputs = tokenizer.apply_chat_template(prompt, return_tensors=\"pt\")\n", - " if torch.cuda.is_available():\n", - " inputs = inputs.to(\"cuda\")\n", - " streamer = 
TextStreamer(tokenizer)\n", - " if \"microsoft/bitnet-b1.58-2B-4T\" in model_name:\n", - " model = AutoModelForCausalLM.from_pretrained(model_name, device_map=\"auto\", trust_remote_code=True)\n", - " elif \"tiiuae/Falcon-E-3B-Instruct\" in model_name:\n", - " model = AutoModelForCausalLM.from_pretrained(model_name, device_map=\"auto\", torch_dtype=torch.float16 )\n", - " else:\n", - " model = AutoModelForCausalLM.from_pretrained(model_name, device_map=\"auto\", quantization_config=get_quantization_config())\n", - " outputs = model.generate(inputs, max_new_tokens=output_tokens, streamer=streamer)\n", - " response = tokenizer.decode(outputs[0])\n", - " del model, inputs, tokenizer, outputs\n", - " gc.collect()\n", - " torch.cuda.empty_cache()\n", - " return response" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Gh_Ny1aM-L8z" - }, - "source": [ - "## Frontier Models Inference" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "h11WlZNhfHCR" - }, - "outputs": [], - "source": [ - "# ChatGPT, Claude and Gemini response function\n", - "def get_chatgpt_response(prompt, model_name, output_tokens):\n", - " response = openai_client.chat.completions.create(\n", - " model=model_name,\n", - " messages=prompt,\n", - " max_tokens=output_tokens,\n", - " )\n", - " return response.choices[0].message.content\n", - "\n", - "def get_claude_response(prompt, model_name, output_tokens):\n", - " response = anthropic_client.messages.create(\n", - " model=model_name,\n", - " max_tokens=output_tokens,\n", - " system=system_prompt,\n", - " messages=[\n", - " {\n", - " \"role\": \"user\",\n", - " \"content\": prompt,\n", - " }\n", - " ],\n", - " )\n", - " return response.content[0].text\n", - "\n", - "def get_gemini_response(prompt, model_name, output_tokens):\n", - " model = ggai.GenerativeModel(\n", - " model_name=model_name,\n", - " system_instruction=system_prompt,\n", - " )\n", - "\n", - " response = model.generate_content(prompt, 
generation_config={\n", - " \"max_output_tokens\": output_tokens,\n", - " \"temperature\": 0.7,\n", - " })\n", - " return response.text" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nzHbM_WQvRgT" - }, - "source": [ - "## Gradio Implementation" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "uFWZqw1R-al_" - }, - "source": [ - "### Dropdowns Selection Lists" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "rOzEb0o--aD7" - }, - "outputs": [], - "source": [ - "# Dropdown List Values for the user\n", - "MODEL_TYPES=[\"GPT\", \"Claude\", \"Gemini\", \"HuggingFace\"]\n", - "OPENAI_MODEL_NAMES=[\"gpt-4o-mini\", \"gpt-4o\", \"gpt-3.5-turbo\"]\n", - "ANTHROPIC_MODELS=[\"claude-3-7-sonnet-latest\", \"claude-3-5-haiku-latest\", \"claude-3-opus-latest\"]\n", - "GOOGLE_MODELS=[\"gemini-2.0-flash\", \"gemini-1.5-pro\"]\n", - "HUGGINGFACE_MODELS=[\n", - " \"meta-llama/Llama-3.2-3B-Instruct\",\n", - " \"microsoft/bitnet-b1.58-2B-4T\",\n", - " \"ByteDance-Seed/Seed-Coder-8B-Instruct\",\n", - " \"tiiuae/Falcon-E-3B-Instruct\",\n", - " \"Qwen/Qwen2.5-7B-Instruct\"\n", - "]\n", - "MODEL_NAMES = {\n", - " \"GPT\": OPENAI_MODEL_NAMES,\n", - " \"Claude\": ANTHROPIC_MODELS,\n", - " \"Gemini\": GOOGLE_MODELS,\n", - " \"HuggingFace\": HUGGINGFACE_MODELS\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "sbXGL8_4-oKc" - }, - "source": [ - "### UI" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "_0NCY7FgCVHj" - }, - "outputs": [], - "source": [ - "with gr.Blocks() as generator_ui:\n", - " gr.Markdown(\"# 🧠 Business Scenario → Synthetic Dataset Generator\")\n", - "\n", - " with gr.Row():\n", - " with gr.Column(scale=3):\n", - " with gr.Row():\n", - " dataset_size=gr.Number(value=10, label=\"Enter the number of data samples to generate.\", show_label=True)\n", - " format=gr.Dropdown([\"json\", \"csv\", \"txt\", \"markdown\"], label=\"Select the format for the 
dataset\", show_label=True)\n", - " with gr.Row():\n", - " scenario=gr.Textbox(label=\"Business Scenario\", lines=5, placeholder=\"Describe your business scenario here\")\n", - " with gr.Row():\n", - " error = gr.Markdown(visible=False)\n", - " with gr.Row():\n", - " clear = gr.Button(\"Clear Everything\")\n", - " submit = gr.Button(\"Generate Dataset\", variant=\"primary\")\n", - "\n", - " with gr.Column(scale=1):\n", - " model_type = gr.Dropdown(MODEL_TYPES, label=\"Model Type\", show_label=True, info=\"Select the model type you want to use\")\n", - " model_name = gr.Dropdown(MODEL_NAMES[model_type.value], label=\"Model Name\", show_label=True, allow_custom_value=True, info=\"Select the model name or enter one manually\")\n", - " output_tokens= gr.Number(value=1000, label=\"Enter the max number of output tokens to generate.\", show_label=True, info=\"This will impact the length of the response containg the dataset\")\n", - "\n", - " with gr.Row():\n", - " # Chatbot Interface\n", - " chatbot = gr.Chatbot(\n", - " type='messages',\n", - " label='Chatbot',\n", - " show_label=True,\n", - " height=300,\n", - " resizable=True,\n", - " elem_id=\"chatbot\",\n", - " avatar_images=(\"🧑\", \"🤖\",)\n", - " )\n", - " with gr.Row(variant=\"compact\"):\n", - " extract_btn = gr.Button(\"Extract and Save Dataset\", variant=\"huggingface\", visible=False)\n", - " file_name = gr.Textbox(label=\"Enter file name here (without file extension)\", placeholder=\"e.g. 
cancer_synthetic, warehouse_synthetic (no digits)\", visible=False)\n", - " with gr.Row():\n", - " markdown_preview = gr.Markdown(visible = False)\n", - " dataset_preview = gr.Textbox(label=\"Dataset Preview\",visible=False)\n", - " with gr.Row():\n", - " file_saved = gr.Textbox(visible=False)\n", - "\n", - " def run_inference(scenario, model_type, model_name, output_tokens, dataset_size, format):\n", - " \"\"\"Run the model and get the response\"\"\"\n", - " model_type=model_type.lower()\n", - " print(f\"scenario: {scenario}\")\n", - " print(f\"model_type: {model_type}\")\n", - " print(f\"model_name: {model_name}\")\n", - " if not scenario.strip():\n", - " return gr.update(value=\"❌ **Error:** Please define a scenario first!\",visible=True), []\n", - "\n", - " user_prompt = get_user_prompt(scenario, dataset_size, format)\n", - " prompt = [\n", - " {\"role\": \"system\", \"content\": system_prompt},\n", - " {\"role\": \"user\", \"content\": user_prompt},\n", - " ]\n", - "\n", - " if model_type == \"gpt\":\n", - " response = get_chatgpt_response(prompt=prompt, model_name=model_name, output_tokens=output_tokens)\n", - " elif model_type == \"claude\":\n", - " response = get_claude_response(prompt=user_prompt, model_name=model_name, output_tokens=output_tokens)\n", - " elif model_type == \"gemini\":\n", - " response = get_gemini_response(prompt=user_prompt, model_name=model_name, output_tokens=output_tokens)\n", - " else:\n", - " response = run_hfmodel_and_get_response(prompt=prompt, model_name=model_name, output_tokens=output_tokens)\n", - " torch.cuda.empty_cache()\n", - " history = [\n", - " {\"role\": \"user\", \"content\": scenario},\n", - " {\"role\": \"assistant\", \"content\": response}\n", - " ]\n", - " return gr.update(visible=False), history\n", - "\n", - " def extract_dataset_string(response):\n", - " \"\"\"Extract dataset content between defined tags using regex.\"\"\"\n", - " # Remove known artificial tokens (common in HuggingFace or Claude)\n", - " 
response = re.sub(r\"<\\[.*?\\]>\", \"\", response)\n", - "\n", - " # Remove system or prompt echo if repeated before dataset\n", - " response = re.sub(r\"(?is)^.*?<<<\", \"<<<\", response.strip(), count=1)\n", - "\n", - " # 1. Match strict <<<>>>...<<<>>> tag blocks (use last match)\n", - " matches = re.findall(r\"<<<>>>[\\s\\r\\n]*(.*?)[\\s\\r\\n]*<<<>>>\", response, re.DOTALL)\n", - " if matches:\n", - " return matches[-1].strip()\n", - "\n", - " # 2. Match loose <<< ... >>> format\n", - " matches = re.findall(r\"<<<[\\s\\r\\n]*(.*?)[\\s\\r\\n]*>>>\", response, re.DOTALL)\n", - " if matches:\n", - " return matches[-1].strip()\n", - "\n", - " # 3. Match final fallback: take everything after last <<< as raw data\n", - " last_open = response.rfind(\"<<<\")\n", - " if last_open != -1:\n", - " raw = response[last_open + 3 :].strip()\n", - " # Optionally cut off noisy trailing notes, explanations, etc.\n", - " raw = re.split(r\"\\n\\s*\\n|Explanation:|Note:|---\", raw)[0]\n", - " return raw.strip()\n", - "\n", - " return \"Could not extract dataset! Try again with a different model.\"\n", - "\n", - " def extract_dataset_from_response(chatbot_history, file_name, file_type):\n", - " \"\"\"Extract dataset and update in gradio UI components\"\"\"\n", - " response = chatbot_history[-1][\"content\"]\n", - " if not response:\n", - " return gr.update(visible=True, value=\"Could not find LLM Response! Try again.\"), gr.update(visible=False)\n", - "\n", - " # match = re.search(r'<<<\\s*(.*?)\\s*>>>', response, re.DOTALL)\n", - " # print(match)\n", - " # if match and match.group(1).strip() == \"\":\n", - " # match = re.search(r'<<<>>>\\s*(.*?)\\s*<<<>>>', response, re.DOTALL)\n", - " # print(match)\n", - " # if match is None:\n", - " # return gr.update(visible=True, value=\"Could not extract dataset! 
Try again with a different model.\"), gr.update(visible=False)\n", - " # dataset = match.group(1).strip()\n", - " dataset = extract_dataset_string(response)\n", - " if dataset == \"Could not extract dataset! Try again with a different model.\":\n", - " return gr.update(visible=True, value=dataset), gr.update(visible=False)\n", - " text = save_dataset(dataset, file_type, file_name)\n", - " return gr.update(visible=True, value=text), gr.update(visible=True, value=dataset)\n", - "\n", - " def save_dataset(dataset, file_format, file_name):\n", - " \"\"\"Save dataset to a file based on the selected format.\"\"\"\n", - " file_name=file_name+\".\"+file_format\n", - " print(dataset)\n", - " print(file_name)\n", - " if file_format == \"json\":\n", - " try:\n", - " data = json.loads(dataset)\n", - " with open(file_name, \"w\", encoding=\"utf-8\") as f:\n", - " json.dump(data, f, indent=4)\n", - " return \"Dataset saved successfully!\"\n", - " except:\n", - " return \"Could not save dataset! Try again in another format.\"\n", - " elif file_format == \"csv\":\n", - " try:\n", - " df = pd.read_csv(StringIO(dataset))\n", - " df.to_csv(file_name, index=False)\n", - " return \"Dataset saved successfully!\"\n", - " except:\n", - " return \"Could not save dataset! Try again in another format.\"\n", - " elif file_format == \"txt\":\n", - " try:\n", - " with open(file_name, \"w\", encoding=\"utf-8\") as f:\n", - " f.write(dataset)\n", - " return \"Dataset saved successfully!\"\n", - " except:\n", - " return \"Could not save dataset! 
Try again in another format.\"\n", - "\n", - " def clear_chat():\n", - " \"\"\"Clear the chat history.\"\"\"\n", - " return \"\", [], gr.update(visible=False), gr.update(visible=False)\n", - "\n", - " def show_extract_btn(chatbot_history, format):\n", - " \"\"\"Show the extract button if the response has been displayed in the chatbot and format is not set to markdown\"\"\"\n", - " if chatbot_history == []:\n", - " return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)\n", - " if format == \"markdown\":\n", - " return gr.update(visible=True, value=chatbot_history[1][\"content\"]), gr.update(visible=False), gr.update(visible=False)\n", - " return gr.update(visible=False), gr.update(visible=True), gr.update(visible=True)\n", - "\n", - " extract_btn.click(\n", - " fn=extract_dataset_from_response,\n", - " inputs=[chatbot, file_name, format],\n", - " outputs=[file_saved, dataset_preview]\n", - " )\n", - "\n", - " chatbot.change(\n", - " fn=show_extract_btn,\n", - " inputs=[chatbot, format],\n", - " outputs=[markdown_preview, extract_btn, file_name]\n", - " )\n", - "\n", - " model_type.change(\n", - " fn=lambda x: gr.update(choices=MODEL_NAMES[x], value=MODEL_NAMES[x][0]),\n", - " inputs=[model_type],\n", - " outputs=[model_name]\n", - " )\n", - "\n", - " submit.click(\n", - " fn=run_inference,\n", - " inputs=[scenario, model_type, model_name, output_tokens, dataset_size, format],\n", - " outputs=[error, chatbot],\n", - " show_progress=True\n", - " )\n", - "\n", - " clear.click(\n", - " clear_chat,\n", - " outputs=[scenario, chatbot, dataset_preview, file_saved]\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "collapsed": true, - "id": "kzDUJahK8uRN", - "outputId": "c5674be2-b262-4439-ae91-4f3e1f49e041" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Colab notebook detected. 
This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().\n", - "* Running on public URL: https://d076a9fef9034a4f24.gradio.live\n", - "\n", - "This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)\n" - ] - }, - { - "output_type": "display_data", - "data": { - "text/plain": [ - "" - ], - "text/html": [ - "
" - ] - }, - "metadata": {} - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "scenario: Generate a dataset for training a model to approve/reject loan applications. Include features like loan amount, applicant income, co-applicant income, employment type, credit history (binary), loan term, number of dependents, education level, and loan approval status.\n", - "model_type: gpt\n", - "model_name: gpt-4o\n", - "Loan Amount,Applicant Income,Co-applicant Income,Employment Type,Credit History,Loan Term,Number of Dependents,Education Level,Loan Approval Status\n", - "250000,60000,15000,Salaried,1,240,1,Graduate,Approved\n", - "350000,80000,0,Salaried,1,360,2,Graduate,Approved\n", - "120000,30000,10000,Self-employed,0,180,1,Not Graduate,Rejected\n", - "500000,150000,50000,Self-employed,1,300,3,Graduate,Approved\n", - "75000,20000,0,Unemployed,0,120,0,Graduate,Rejected\n", - "275000,75000,25000,Salaried,0,240,2,Not Graduate,Rejected\n", - "100000,40000,20000,Salaried,1,60,0,Graduate,Approved\n", - "310000,95000,0,Self-employed,1,360,1,Graduate,Approved\n", - "450000,50000,0,Self-employed,0,180,4,Not Graduate,Rejected\n", - "200000,55000,20000,Salaried,1,120,3,Graduate,Approved\n", - "100000,35000,0,Unemployed,0,60,0,Not Graduate,Rejected\n", - "230000,68000,13000,Salaried,1,240,1,Graduate,Approved\n", - "330000,99000,40000,Self-employed,1,300,2,Graduate,Approved\n", - "150000,18000,7500,Unemployed,0,48,0,Not Graduate,Rejected\n", - "210000,64000,0,Salaried,0,120,1,Graduate,Rejected\n", - "310000,87000,30000,Self-employed,1,360,2,Graduate,Approved\n", - "50000,22000,7000,Unemployed,0,24,0,Not Graduate,Rejected\n", - "290000,92000,20000,Salaried,1,240,3,Graduate,Approved\n", - "110000,45000,0,Salaried,0,36,0,Graduate,Rejected\n", - "450000,76000,25000,Self-employed,1,360,2,Graduate,Approved\n", - "loan_approval_synthetic.txt\n", - "scenario: Generate a dataset for predicting medical appointment no-shows. 
Include appointment ID, scheduled date, appointment date, lead time (days between scheduling and appointment), SMS reminders sent, patient age, gender, health condition severity, and no-show status.\n", - "model_type: gpt\n", - "model_name: gpt-4o\n", - "scenario: Generate a dataset for predicting medical appointment no-shows. Include appointment ID, scheduled date, appointment date, lead time (days between scheduling and appointment), SMS reminders sent, patient age, gender, health condition severity, and no-show status.\n", - "model_type: gpt\n", - "model_name: gpt-4o\n", - "[\n", - " {\n", - " \"appointment_id\": \"AID001\",\n", - " \"scheduled_date\": \"2023-11-01\",\n", - " \"appointment_date\": \"2023-11-10\",\n", - " \"lead_time\": 9,\n", - " \"sms_reminders_sent\": 2,\n", - " \"patient_age\": 45,\n", - " \"gender\": \"Female\",\n", - " \"health_condition_severity\": 3,\n", - " \"no_show_status\": false\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID002\",\n", - " \"scheduled_date\": \"2023-11-03\",\n", - " \"appointment_date\": \"2023-11-15\",\n", - " \"lead_time\": 12,\n", - " \"sms_reminders_sent\": 3,\n", - " \"patient_age\": 34,\n", - " \"gender\": \"Male\",\n", - " \"health_condition_severity\": 2,\n", - " \"no_show_status\": true\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID003\",\n", - " \"scheduled_date\": \"2023-11-05\",\n", - " \"appointment_date\": \"2023-11-11\",\n", - " \"lead_time\": 6,\n", - " \"sms_reminders_sent\": 1,\n", - " \"patient_age\": 29,\n", - " \"gender\": \"Other\",\n", - " \"health_condition_severity\": 4,\n", - " \"no_show_status\": false\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID004\",\n", - " \"scheduled_date\": \"2023-11-02\",\n", - " \"appointment_date\": \"2023-11-14\",\n", - " \"lead_time\": 12,\n", - " \"sms_reminders_sent\": 2,\n", - " \"patient_age\": 62,\n", - " \"gender\": \"Female\",\n", - " \"health_condition_severity\": 5,\n", - " \"no_show_status\": true\n", - " },\n", - " {\n", - " 
\"appointment_id\": \"AID005\",\n", - " \"scheduled_date\": \"2023-11-06\",\n", - " \"appointment_date\": \"2023-11-13\",\n", - " \"lead_time\": 7,\n", - " \"sms_reminders_sent\": 0,\n", - " \"patient_age\": 21,\n", - " \"gender\": \"Male\",\n", - " \"health_condition_severity\": 1,\n", - " \"no_show_status\": false\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID006\",\n", - " \"scheduled_date\": \"2023-11-08\",\n", - " \"appointment_date\": \"2023-11-17\",\n", - " \"lead_time\": 9,\n", - " \"sms_reminders_sent\": 3,\n", - " \"patient_age\": 58,\n", - " \"gender\": \"Female\",\n", - " \"health_condition_severity\": 4,\n", - " \"no_show_status\": true\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID007\",\n", - " \"scheduled_date\": \"2023-11-10\",\n", - " \"appointment_date\": \"2023-11-18\",\n", - " \"lead_time\": 8,\n", - " \"sms_reminders_sent\": 1,\n", - " \"patient_age\": 41,\n", - " \"gender\": \"Other\",\n", - " \"health_condition_severity\": 2,\n", - " \"no_show_status\": false\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID008\",\n", - " \"scheduled_date\": \"2023-11-07\",\n", - " \"appointment_date\": \"2023-11-12\",\n", - " \"lead_time\": 5,\n", - " \"sms_reminders_sent\": 0,\n", - " \"patient_age\": 67,\n", - " \"gender\": \"Male\",\n", - " \"health_condition_severity\": 3,\n", - " \"no_show_status\": true\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID009\",\n", - " \"scheduled_date\": \"2023-11-12\",\n", - " \"appointment_date\": \"2023-11-20\",\n", - " \"lead_time\": 8,\n", - " \"sms_reminders_sent\": 2,\n", - " \"patient_age\": 74,\n", - " \"gender\": \"Female\",\n", - " \"health_condition_severity\": 5,\n", - " \"no_show_status\": false\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID010\",\n", - " \"scheduled_date\": \"2023-11-09\",\n", - " \"appointment_date\": \"2023-11-16\",\n", - " \"lead_time\": 7,\n", - " \"sms_reminders_sent\": 3,\n", - " \"patient_age\": 25,\n", - " \"gender\": \"Male\",\n", - " 
\"health_condition_severity\": 4,\n", - " \"no_show_status\": true\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID011\",\n", - " \"scheduled_date\": \"2023-11-13\",\n", - " \"appointment_date\": \"2023-11-21\",\n", - " \"lead_time\": 8,\n", - " \"sms_reminders_sent\": 1,\n", - " \"patient_age\": 32,\n", - " \"gender\": \"Female\",\n", - " \"health_condition_severity\": 2,\n", - " \"no_show_status\": false\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID012\",\n", - " \"scheduled_date\": \"2023-11-14\",\n", - " \"appointment_date\": \"2023-11-25\",\n", - " \"lead_time\": 11,\n", - " \"sms_reminders_sent\": 2,\n", - " \"patient_age\": 48,\n", - " \"gender\": \"Other\",\n", - " \"health_condition_severity\": 1,\n", - " \"no_show_status\": true\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID013\",\n", - " \"scheduled_date\": \"2023-11-15\",\n", - " \"appointment_date\": \"2023-11-27\",\n", - " \"lead_time\": 12,\n", - " \"sms_reminders_sent\": 3,\n", - " \"patient_age\": 36,\n", - " \"gender\": \"Male\",\n", - " \"health_condition_severity\": 5,\n", - " \"no_show_status\": false\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID014\",\n", - " \"scheduled_date\": \"2023-11-17\",\n", - " \"appointment_date\": \"2023-12-02\",\n", - " \"lead_time\": 15,\n", - " \"sms_reminders_sent\": 0,\n", - " \"patient_age\": 28,\n", - " \"gender\": \"Female\",\n", - " \"health_condition_severity\": 3,\n", - " \"no_show_status\": true\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID015\",\n", - " \"scheduled_date\": \"2023-11-16\",\n", - " \"appointment_date\": \"2023-12-01\",\n", - " \"lead_time\": 15,\n", - " \"sms_reminders_sent\": 1,\n", - " \"patient_age\": 60,\n", - " \"gender\": \"Male\",\n", - " \"health_condition_severity\": 2,\n", - " \"no_show_status\": false\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID016\",\n", - " \"scheduled_date\": \"2023-11-18\",\n", - " \"appointment_date\": \"2023-12-05\",\n", - " \"lead_time\": 17,\n", - " 
\"sms_reminders_sent\": 2,\n", - " \"patient_age\": 40,\n", - " \"gender\": \"Other\",\n", - " \"health_condition_severity\": 4,\n", - " \"no_show_status\": true\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID017\",\n", - " \"scheduled_date\": \"2023-11-19\",\n", - " \"appointment_date\": \"2023-12-03\",\n", - " \"lead_time\": 14,\n", - " \"sms_reminders_sent\": 3,\n", - " \"patient_age\": 19,\n", - " \"gender\": \"Female\",\n", - " \"health_condition_severity\": 1,\n", - " \"no_show_status\": false\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID018\",\n", - " \"scheduled_date\": \"2023-11-21\",\n", - " \"appointment_date\": \"2023-12-07\",\n", - " \"lead_time\": 16,\n", - " \"sms_reminders_sent\": 0,\n", - " \"patient_age\": 51,\n", - " \"gender\": \"Male\",\n", - " \"health_condition_severity\": 3,\n", - " \"no_show_status\": true\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID019\",\n", - " \"scheduled_date\": \"2023-11-23\",\n", - " \"appointment_date\": \"2023-12-09\",\n", - " \"lead_time\": 16,\n", - " \"sms_reminders_sent\": 1,\n", - " \"patient_age\": 55,\n", - " \"gender\": \"Female\",\n", - " \"health_condition_severity\": 4,\n", - " \"no_show_status\": false\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID020\",\n", - " \"scheduled_date\": \"2023-11-22\",\n", - " \"appointment_date\": \"2023-12-08\",\n", - " \"lead_time\": 16,\n", - " \"sms_reminders_sent\": 2,\n", - " \"patient_age\": 23,\n", - " \"gender\": \"Other\",\n", - " \"health_condition_severity\": 5,\n", - " \"no_show_status\": true\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID021\",\n", - " \"scheduled_date\": \"2023-11-24\",\n", - " \"appointment_date\": \"2023-12-10\",\n", - " \"lead_time\": 16,\n", - " \"sms_reminders_sent\": 3,\n", - " \"patient_age\": 47,\n", - " \"gender\": \"Male\",\n", - " \"health_condition_severity\": 2,\n", - " \"no_show_status\": false\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID022\",\n", - " \"scheduled_date\": 
\"2023-11-25\",\n", - " \"appointment_date\": \"2023-12-12\",\n", - " \"lead_time\": 17,\n", - " \"sms_reminders_sent\": 1,\n", - " \"patient_age\": 33,\n", - " \"gender\": \"Female\",\n", - " \"health_condition_severity\": 1,\n", - " \"no_show_status\": true\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID023\",\n", - " \"scheduled_date\": \"2023-11-27\",\n", - " \"appointment_date\": \"2023-12-14\",\n", - " \"lead_time\": 17,\n", - " \"sms_reminders_sent\": 0,\n", - " \"patient_age\": 42,\n", - " \"gender\": \"Male\",\n", - " \"health_condition_severity\": 3,\n", - " \"no_show_status\": false\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID024\",\n", - " \"scheduled_date\": \"2023-11-29\",\n", - " \"appointment_date\": \"2023-12-15\",\n", - " \"lead_time\": 16,\n", - " \"sms_reminders_sent\": 2,\n", - " \"patient_age\": 64,\n", - " \"gender\": \"Other\",\n", - " \"health_condition_severity\": 4,\n", - " \"no_show_status\": true\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID025\",\n", - " \"scheduled_date\": \"2023-12-01\",\n", - " \"appointment_date\": \"2023-12-20\",\n", - " \"lead_time\": 19,\n", - " \"sms_reminders_sent\": 3,\n", - " \"patient_age\": 26,\n", - " \"gender\": \"Female\",\n", - " \"health_condition_severity\": 5,\n", - " \"no_show_status\": false\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID026\",\n", - " \"scheduled_date\": \"2023-12-03\",\n", - " \"appointment_date\": \"2023-12-22\",\n", - " \"lead_time\": 19,\n", - " \"sms_reminders_sent\": 1,\n", - " \"patient_age\": 31,\n", - " \"gender\": \"Male\",\n", - " \"health_condition_severity\": 2,\n", - " \"no_show_status\": true\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID027\",\n", - " \"scheduled_date\": \"2023-12-05\",\n", - " \"appointment_date\": \"2023-12-24\",\n", - " \"lead_time\": 19,\n", - " \"sms_reminders_sent\": 2,\n", - " \"patient_age\": 50,\n", - " \"gender\": \"Female\",\n", - " \"health_condition_severity\": 1,\n", - " \"no_show_status\": 
false\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID028\",\n", - " \"scheduled_date\": \"2023-12-06\",\n", - " \"appointment_date\": \"2023-12-25\",\n", - " \"lead_time\": 19,\n", - " \"sms_reminders_sent\": 0,\n", - " \"patient_age\": 39,\n", - " \"gender\": \"Other\",\n", - " \"health_condition_severity\": 3,\n", - " \"no_show_status\": true\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID029\",\n", - " \"scheduled_date\": \"2023-12-07\",\n", - " \"appointment_date\": \"2023-12-27\",\n", - " \"lead_time\": 20,\n", - " \"sms_reminders_sent\": 3,\n", - " \"patient_age\": 71,\n", - " \"gender\": \"Male\",\n", - " \"health_condition_severity\": 4,\n", - " \"no_show_status\": false\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID030\",\n", - " \"scheduled_date\": \"2023-12-08\",\n", - " \"appointment_date\": \"2023-12-28\",\n", - " \"lead_time\": 20,\n", - " \"sms_reminders_sent\": 1,\n", - " \"patient_age\": 44,\n", - " \"gender\": \"Female\",\n", - " \"health_condition_severity\": 5,\n", - " \"no_show_status\": true\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID031\",\n", - " \"scheduled_date\": \"2023-12-10\",\n", - " \"appointment_date\": \"2023-12-31\",\n", - " \"lead_time\": 21,\n", - " \"sms_reminders_sent\": 2,\n", - " \"patient_age\": 38,\n", - " \"gender\": \"Male\",\n", - " \"health_condition_severity\": 2,\n", - " \"no_show_status\": false\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID032\",\n", - " \"scheduled_date\": \"2023-12-11\",\n", - " \"appointment_date\": \"2024-01-02\",\n", - " \"lead_time\": 22,\n", - " \"sms_reminders_sent\": 0,\n", - " \"patient_age\": 53,\n", - " \"gender\": \"Female\",\n", - " \"health_condition_severity\": 1,\n", - " \"no_show_status\": true\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID033\",\n", - " \"scheduled_date\": \"2023-12-13\",\n", - " \"appointment_date\": \"2024-01-04\",\n", - " \"lead_time\": 22,\n", - " \"sms_reminders_sent\": 1,\n", - " \"patient_age\": 27,\n", - " 
\"gender\": \"Other\",\n", - " \"health_condition_severity\": 3,\n", - " \"no_show_status\": false\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID034\",\n", - " \"scheduled_date\": \"2023-12-15\",\n", - " \"appointment_date\": \"2024-01-06\",\n", - " \"lead_time\": 22,\n", - " \"sms_reminders_sent\": 3,\n", - " \"patient_age\": 46,\n", - " \"gender\": \"Male\",\n", - " \"health_condition_severity\": 4,\n", - " \"no_show_status\": true\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID035\",\n", - " \"scheduled_date\": \"2023-12-17\",\n", - " \"appointment_date\": \"2024-01-09\",\n", - " \"lead_time\": 23,\n", - " \"sms_reminders_sent\": 2,\n", - " \"patient_age\": 68,\n", - " \"gender\": \"Female\",\n", - " \"health_condition_severity\": 5,\n", - " \"no_show_status\": false\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID036\",\n", - " \"scheduled_date\": \"2023-12-19\",\n", - " \"appointment_date\": \"2024-01-10\",\n", - " \"lead_time\": 22,\n", - " \"sms_reminders_sent\": 0,\n", - " \"patient_age\": 37,\n", - " \"gender\": \"Male\",\n", - " \"health_condition_severity\": 2,\n", - " \"no_show_status\": true\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID037\",\n", - " \"scheduled_date\": \"2023-12-20\",\n", - " \"appointment_date\": \"2024-01-12\",\n", - " \"lead_time\": 23,\n", - " \"sms_reminders_sent\": 1,\n", - " \"patient_age\": 57,\n", - " \"gender\": \"Female\",\n", - " \"health_condition_severity\": 1,\n", - " \"no_show_status\": false\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID038\",\n", - " \"scheduled_date\": \"2023-12-22\",\n", - " \"appointment_date\": \"2024-01-14\",\n", - " \"lead_time\": 23,\n", - " \"sms_reminders_sent\": 3,\n", - " \"patient_age\": 43,\n", - " \"gender\": \"Other\",\n", - " \"health_condition_severity\": 3,\n", - " \"no_show_status\": true\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID039\",\n", - " \"scheduled_date\": \"2023-12-23\",\n", - " \"appointment_date\": \"2024-01-16\",\n", - " 
\"lead_time\": 24,\n", - " \"sms_reminders_sent\": 2,\n", - " \"patient_age\": 65,\n", - " \"gender\": \"Male\",\n", - " \"health_condition_severity\": 4,\n", - " \"no_show_status\": false\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID040\",\n", - " \"scheduled_date\": \"2023-12-25\",\n", - " \"appointment_date\": \"2024-01-17\",\n", - " \"lead_time\": 23,\n", - " \"sms_reminders_sent\": 0,\n", - " \"patient_age\": 49,\n", - " \"gender\": \"Female\",\n", - " \"health_condition_severity\": 5,\n", - " \"no_show_status\": true\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID041\",\n", - " \"scheduled_date\": \"2023-12-27\",\n", - " \"appointment_date\": \"2024-01-20\",\n", - " \"lead_time\": 24,\n", - " \"sms_reminders_sent\": 1,\n", - " \"patient_age\": 30,\n", - " \"gender\": \"Male\",\n", - " \"health_condition_severity\": 2,\n", - " \"no_show_status\": false\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID042\",\n", - " \"scheduled_date\": \"2023-12-29\",\n", - " \"appointment_date\": \"2024-01-22\",\n", - " \"lead_time\": 24,\n", - " \"sms_reminders_sent\": 3,\n", - " \"patient_age\": 24,\n", - " \"gender\": \"Female\",\n", - " \"health_condition_severity\": 1,\n", - " \"no_show_status\": true\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID043\",\n", - " \"scheduled_date\": \"2024-01-01\",\n", - " \"appointment_date\": \"2024-01-25\",\n", - " \"lead_time\": 24,\n", - " \"sms_reminders_sent\": 2,\n", - " \"patient_age\": 72,\n", - " \"gender\": \"Other\",\n", - " \"health_condition_severity\": 3,\n", - " \"no_show_status\": false\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID044\",\n", - " \"scheduled_date\": \"2024-01-03\",\n", - " \"appointment_date\": \"2024-01-27\",\n", - " \"lead_time\": 24,\n", - " \"sms_reminders_sent\": 0,\n", - " \"patient_age\": 35,\n", - " \"gender\": \"Male\",\n", - " \"health_condition_severity\": 4,\n", - " \"no_show_status\": true\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID045\",\n", - " 
\"scheduled_date\": \"2024-01-04\",\n", - " \"appointment_date\": \"2024-01-28\",\n", - " \"lead_time\": 24,\n", - " \"sms_reminders_sent\": 1,\n", - " \"patient_age\": 61,\n", - " \"gender\": \"Female\",\n", - " \"health_condition_severity\": 5,\n", - " \"no_show_status\": false\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID046\",\n", - " \"scheduled_date\": \"2024-01-05\",\n", - " \"appointment_date\": \"2024-01-30\",\n", - " \"lead_time\": 25,\n", - " \"sms_reminders_sent\": 3,\n", - " \"patient_age\": 68,\n", - " \"gender\": \"Male\",\n", - " \"health_condition_severity\": 2,\n", - " \"no_show_status\": true\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID047\",\n", - " \"scheduled_date\": \"2024-01-07\",\n", - " \"appointment_date\": \"2024-02-01\",\n", - " \"lead_time\": 25,\n", - " \"sms_reminders_sent\": 2,\n", - " \"patient_age\": 22,\n", - " \"gender\": \"Female\",\n", - " \"health_condition_severity\": 1,\n", - " \"no_show_status\": false\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID048\",\n", - " \"scheduled_date\": \"2024-01-08\",\n", - " \"appointment_date\": \"2024-02-03\",\n", - " \"lead_time\": 26,\n", - " \"sms_reminders_sent\": 0,\n", - " \"patient_age\": 52,\n", - " \"gender\": \"Other\",\n", - " \"health_condition_severity\": 3,\n", - " \"no_show_status\": true\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID049\",\n", - " \"scheduled_date\": \"2024-01-10\",\n", - " \"appointment_date\": \"2024-02-04\",\n", - " \"lead_time\": 25,\n", - " \"sms_reminders_sent\": 1,\n", - " \"patient_age\": 73,\n", - " \"gender\": \"Male\",\n", - " \"health_condition_severity\": 4,\n", - " \"no_show_status\": false\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID050\",\n", - " \"scheduled_date\": \"2024-01-12\",\n", - " \"appointment_date\": \"2024-02-06\",\n", - " \"lead_time\": 25,\n", - " \"sms_reminders_sent\": 3,\n", - " \"patient_age\": 56,\n", - " \"gender\": \"Female\",\n", - " \"health_condition_severity\": 5,\n", - " 
\"no_show_status\": true\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID051\",\n", - " \"scheduled_date\": \"2024-01-15\",\n", - " \"appointment_date\": \"2024-02-07\",\n", - " \"lead_time\": 23,\n", - " \"sms_reminders_sent\": 1,\n", - " \"patient_age\": 62,\n", - " \"gender\": \"Male\",\n", - " \"health_condition_severity\": 2,\n", - " \"no_show_status\": false\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID052\",\n", - " \"scheduled_date\": \"2024-01-17\",\n", - " \"appointment_date\": \"2024-02-10\",\n", - " \"lead_time\": 24,\n", - " \"sms_reminders_sent\": 0,\n", - " \"patient_age\": 80,\n", - " \"gender\": \"Female\",\n", - " \"health_condition_severity\": 1,\n", - " \"no_show_status\": true\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID053\",\n", - " \"scheduled_date\": \"2024-01-19\",\n", - " \"appointment_date\": \"2024-02-12\",\n", - " \"lead_time\": 24,\n", - " \"sms_reminders_sent\": 2,\n", - " \"patient_age\": 29,\n", - " \"gender\": \"Other\",\n", - " \"health_condition_severity\": 3,\n", - " \"no_show_status\": false\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID054\",\n", - " \"scheduled_date\": \"2024-01-21\",\n", - " \"appointment_date\": \"2024-02-13\",\n", - " \"lead_time\": 23,\n", - " \"sms_reminders_sent\": 1,\n", - " \"patient_age\": 66,\n", - " \"gender\": \"Male\",\n", - " \"health_condition_severity\": 4,\n", - " \"no_show_status\": true\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID055\",\n", - " \"scheduled_date\": \"2024-01-23\",\n", - " \"appointment_date\": \"2024-02-15\",\n", - " \"lead_time\": 23,\n", - " \"sms_reminders_sent\": 3,\n", - " \"patient_age\": 77,\n", - " \"gender\": \"Female\",\n", - " \"health_condition_severity\": 5,\n", - " \"no_show_status\": false\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID056\",\n", - " \"scheduled_date\": \"2024-01-25\",\n", - " \"appointment_date\": \"2024-02-17\",\n", - " \"lead_time\": 23,\n", - " \"sms_reminders_sent\": 2,\n", - " 
\"patient_age\": 54,\n", - " \"gender\": \"Male\",\n", - " \"health_condition_severity\": 2,\n", - " \"no_show_status\": true\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID057\",\n", - " \"scheduled_date\": \"2024-01-28\",\n", - " \"appointment_date\": \"2024-02-19\",\n", - " \"lead_time\": 22,\n", - " \"sms_reminders_sent\": 0,\n", - " \"patient_age\": 28,\n", - " \"gender\": \"Female\",\n", - " \"health_condition_severity\": 1,\n", - " \"no_show_status\": false\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID058\",\n", - " \"scheduled_date\": \"2024-01-30\",\n", - " \"appointment_date\": \"2024-02-22\",\n", - " \"lead_time\": 23,\n", - " \"sms_reminders_sent\": 1,\n", - " \"patient_age\": 45,\n", - " \"gender\": \"Other\",\n", - " \"health_condition_severity\": 3,\n", - " \"no_show_status\": true\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID059\",\n", - " \"scheduled_date\": \"2024-02-01\",\n", - " \"appointment_date\": \"2024-02-24\",\n", - " \"lead_time\": 23,\n", - " \"sms_reminders_sent\": 3,\n", - " \"patient_age\": 69,\n", - " \"gender\": \"Male\",\n", - " \"health_condition_severity\": 4,\n", - " \"no_show_status\": false\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID060\",\n", - " \"scheduled_date\": \"2024-02-02\",\n", - " \"appointment_date\": \"2024-02-26\",\n", - " \"lead_time\": 24,\n", - " \"sms_reminders_sent\": 2,\n", - " \"patient_age\": 51,\n", - " \"gender\": \"Female\",\n", - " \"health_condition_severity\": 5,\n", - " \"no_show_status\": true\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID061\",\n", - " \"scheduled_date\": \"2024-02-04\",\n", - " \"appointment_date\": \"2024-02-27\",\n", - " \"lead_time\": 23,\n", - " \"sms_reminders_sent\": 1,\n", - " \"patient_age\": 33,\n", - " \"gender\": \"Male\",\n", - " \"health_condition_severity\": 2,\n", - " \"no_show_status\": false\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID062\",\n", - " \"scheduled_date\": \"2024-02-06\",\n", - " 
\"appointment_date\": \"2024-03-01\",\n", - " \"lead_time\": 24,\n", - " \"sms_reminders_sent\": 0,\n", - " \"patient_age\": 84,\n", - " \"gender\": \"Female\",\n", - " \"health_condition_severity\": 1,\n", - " \"no_show_status\": true\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID063\",\n", - " \"scheduled_date\": \"2024-02-09\",\n", - " \"appointment_date\": \"2024-03-04\",\n", - " \"lead_time\": 24,\n", - " \"sms_reminders_sent\": 3,\n", - " \"patient_age\": 47,\n", - " \"gender\": \"Male\",\n", - " \"health_condition_severity\": 3,\n", - " \"no_show_status\": false\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID064\",\n", - " \"scheduled_date\": \"2024-02-10\",\n", - " \"appointment_date\": \"2024-03-06\",\n", - " \"lead_time\": 25,\n", - " \"sms_reminders_sent\": 2,\n", - " \"patient_age\": 59,\n", - " \"gender\": \"Female\",\n", - " \"health_condition_severity\": 4,\n", - " \"no_show_status\": true\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID065\",\n", - " \"scheduled_date\": \"2024-02-12\",\n", - " \"appointment_date\": \"2024-03-08\",\n", - " \"lead_time\": 25,\n", - " \"sms_reminders_sent\": 0,\n", - " \"patient_age\": 20,\n", - " \"gender\": \"Other\",\n", - " \"health_condition_severity\": 5,\n", - " \"no_show_status\": false\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID066\",\n", - " \"scheduled_date\": \"2024-02-14\",\n", - " \"appointment_date\": \"2024-03-10\",\n", - " \"lead_time\": 25,\n", - " \"sms_reminders_sent\": 1,\n", - " \"patient_age\": 48,\n", - " \"gender\": \"Male\",\n", - " \"health_condition_severity\": 2,\n", - " \"no_show_status\": true\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID067\",\n", - " \"scheduled_date\": \"2024-02-17\",\n", - " \"appointment_date\": \"2024-03-12\",\n", - " \"lead_time\": 24,\n", - " \"sms_reminders_sent\": 2,\n", - " \"patient_age\": 38,\n", - " \"gender\": \"Female\",\n", - " \"health_condition_severity\": 1,\n", - " \"no_show_status\": false\n", - " },\n", - " 
{\n", - " \"appointment_id\": \"AID068\",\n", - " \"scheduled_date\": \"2024-02-19\",\n", - " \"appointment_date\": \"2024-03-14\",\n", - " \"lead_time\": 24,\n", - " \"sms_reminders_sent\": 3,\n", - " \"patient_age\": 76,\n", - " \"gender\": \"Male\",\n", - " \"health_condition_severity\": 3,\n", - " \"no_show_status\": true\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID069\",\n", - " \"scheduled_date\": \"2024-02-21\",\n", - " \"appointment_date\": \"2024-03-15\",\n", - " \"lead_time\": 22,\n", - " \"sms_reminders_sent\": 0,\n", - " \"patient_age\": 34,\n", - " \"gender\": \"Female\",\n", - " \"health_condition_severity\": 4,\n", - " \"no_show_status\": false\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID070\",\n", - " \"scheduled_date\": \"2024-02-23\",\n", - " \"appointment_date\": \"2024-03-17\",\n", - " \"lead_time\": 23,\n", - " \"sms_reminders_sent\": 1,\n", - " \"patient_age\": 26,\n", - " \"gender\": \"Male\",\n", - " \"health_condition_severity\": 5,\n", - " \"no_show_status\": true\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID071\",\n", - " \"scheduled_date\": \"2024-02-25\",\n", - " \"appointment_date\": \"2024-03-19\",\n", - " \"lead_time\": 22,\n", - " \"sms_reminders_sent\": 2,\n", - " \"patient_age\": 22,\n", - " \"gender\": \"Female\",\n", - " \"health_condition_severity\": 2,\n", - " \"no_show_status\": false\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID072\",\n", - " \"scheduled_date\": \"2024-02-27\",\n", - " \"appointment_date\": \"2024-03-20\",\n", - " \"lead_time\": 22,\n", - " \"sms_reminders_sent\": 0,\n", - " \"patient_age\": 58,\n", - " \"gender\": \"Other\",\n", - " \"health_condition_severity\": 1,\n", - " \"no_show_status\": true\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID073\",\n", - " \"scheduled_date\": \"2024-02-29\",\n", - " \"appointment_date\": \"2024-03-22\",\n", - " \"lead_time\": 22,\n", - " \"sms_reminders_sent\": 3,\n", - " \"patient_age\": 67,\n", - " \"gender\": \"Male\",\n", 
- " \"health_condition_severity\": 3,\n", - " \"no_show_status\": false\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID074\",\n", - " \"scheduled_date\": \"2024-03-02\",\n", - " \"appointment_date\": \"2024-03-24\",\n", - " \"lead_time\": 22,\n", - " \"sms_reminders_sent\": 2,\n", - " \"patient_age\": 32,\n", - " \"gender\": \"Female\",\n", - " \"health_condition_severity\": 4,\n", - " \"no_show_status\": true\n", - " },\n", - " {\n", - " \"appointment_id\": \"AID075\",\n", - " \"scheduled_date\": \"2024-03-04\",\n", - " \"appointment_date\": \"2024-03-26\",\n", - " \"lead_time\": 22,\n", - " \"sms_reminders_sent\": 1,\n", - " \"patient_age\": 46,\n", - " \"gender\": \"Male\",\n", - " \"health_condition_severity\": 5,\n", - " \"no_show_status\": false\n", - " }\n", - "]\n", - "medical_appointment.json\n", - "scenario: Create a dataset of credit card transactions for detecting fraud. Include transaction ID, amount, timestamp, merchant category, customer location, card presence (yes/no), transaction device type, and fraud label (yes/no).\n", - "model_type: claude\n", - "model_name: claude-3-7-sonnet-latest\n", - "scenario: Create a dataset of credit card transactions for detecting fraud. 
Include transaction ID, amount, timestamp, merchant category, customer location, card presence (yes/no), transaction device type, and fraud label (yes/no).\n", - "model_type: claude\n", - "model_name: claude-3-7-sonnet-latest\n", - "transaction_id,amount,timestamp,merchant_category,customer_location,card_presence,device_type,fraud_label\n", - "TX123456789,45.99,2023-11-01 08:23:15,Retail,New York,Yes,POS Terminal,No\n", - "TX123456790,899.50,2023-11-01 09:45:22,Electronics,Chicago,Yes,POS Terminal,No\n", - "TX123456791,12.35,2023-11-01 10:12:45,Food & Beverage,Los Angeles,No,Mobile,No\n", - "TX123456792,5423.80,2023-11-01 11:30:18,Jewelry,Miami,No,Web Browser,Yes\n", - "TX123456793,76.24,2023-11-01 14:22:56,Groceries,Denver,Yes,POS Terminal,No\n", - "TX123456794,149.99,2023-11-02 07:15:33,Clothing,Seattle,No,Mobile,No\n", - "TX123456795,2500.00,2023-11-02 08:45:12,Electronics,Toronto,No,Web Browser,Yes\n", - "TX123456796,35.50,2023-11-02 12:33:47,Food & Beverage,Boston,Yes,POS Terminal,No\n", - "TX123456797,10.99,2023-11-02 15:20:09,Entertainment,Philadelphia,No,Mobile,No\n", - "TX123456798,750.25,2023-11-02 16:45:18,Travel,San Francisco,No,Web Browser,No\n", - "TX123456799,65.40,2023-11-02 19:22:31,Retail,Austin,Yes,POS Terminal,No\n", - "TX123456800,3299.99,2023-11-03 05:45:22,Electronics,London,No,Web Browser,Yes\n", - "TX123456801,22.50,2023-11-03 08:12:40,Food & Beverage,Atlanta,Yes,POS Terminal,No\n", - "TX123456802,129.95,2023-11-03 10:33:27,Clothing,Chicago,No,Mobile,No\n", - "TX123456803,50.00,2023-11-03 12:15:39,Gas Station,Dallas,Yes,POS Terminal,No\n", - "TX123456804,1999.00,2023-11-03 14:30:45,Electronics,Singapore,No,Web Browser,No\n", - "TX123456805,8.75,2023-11-03 18:22:14,Food & Beverage,Montreal,No,Mobile,No\n", - "TX123456806,459.99,2023-11-04 09:15:33,Home Goods,Houston,Yes,POS Terminal,No\n", - "TX123456807,2750.00,2023-11-04 10:45:28,Travel,Paris,No,Web Browser,Yes\n", - "TX123456808,85.00,2023-11-04 11:33:52,Healthcare,New York,Yes,POS 
Terminal,No\n", - "TX123456809,17.25,2023-11-04 13:10:44,Food & Beverage,Los Angeles,No,Mobile,No\n", - "TX123456810,150.49,2023-11-04 15:22:18,Entertainment,Miami,No,Mobile,No\n", - "TX123456811,4500.00,2023-11-04 19:45:02,Jewelry,Dubai,No,Web Browser,Yes\n", - "TX123456812,27.99,2023-11-05 08:33:27,Groceries,Seattle,Yes,POS Terminal,No\n", - "TX123456813,1250.00,2023-11-05 10:15:42,Electronics,Tokyo,No,Web Browser,No\n", - "TX123456814,56.75,2023-11-05 12:20:35,Clothing,San Diego,No,Mobile,No\n", - "TX123456815,18.50,2023-11-05 14:30:19,Food & Beverage,Denver,Yes,POS Terminal,No\n", - "TX123456816,3750.25,2023-11-05 16:45:08,Travel,Sydney,No,Web Browser,Yes\n", - "TX123456817,95.00,2023-11-05 18:22:56,Healthcare,Boston,No,Mobile,No\n", - "TX123456818,2345.67,2023-11-05 20:15:33,Electronics,Berlin,No,Web Browser,Yes\n", - "fraud_transactions.csv\n", - "scenario: Generate a dataset of investment customers with fields like portfolio value, age, income bracket, risk appetite (low/medium/high), number of transactions per month, preferred investment types, and risk score.\n", - "model_type: gemini\n", - "model_name: gemini-1.5-pro\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "WARNING:tornado.access:429 POST /v1beta/models/gemini-1.5-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 409.67ms\n", - "Traceback (most recent call last):\n", - " File \"/usr/local/lib/python3.11/dist-packages/gradio/queueing.py\", line 625, in process_events\n", - " response = await route_utils.call_process_api(\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/usr/local/lib/python3.11/dist-packages/gradio/route_utils.py\", line 322, in call_process_api\n", - " output = await app.get_blocks().process_api(\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/usr/local/lib/python3.11/dist-packages/gradio/blocks.py\", line 2181, in process_api\n", - " result = await self.call_function(\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File 
\"/usr/local/lib/python3.11/dist-packages/gradio/blocks.py\", line 1692, in call_function\n", - " prediction = await anyio.to_thread.run_sync( # type: ignore\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/usr/local/lib/python3.11/dist-packages/anyio/to_thread.py\", line 56, in run_sync\n", - " return await get_async_backend().run_sync_in_worker_thread(\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/usr/local/lib/python3.11/dist-packages/anyio/_backends/_asyncio.py\", line 2470, in run_sync_in_worker_thread\n", - " return await future\n", - " ^^^^^^^^^^^^\n", - " File \"/usr/local/lib/python3.11/dist-packages/anyio/_backends/_asyncio.py\", line 967, in run\n", - " result = context.run(func, *args)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/usr/local/lib/python3.11/dist-packages/gradio/utils.py\", line 889, in wrapper\n", - " response = f(*args, **kwargs)\n", - " ^^^^^^^^^^^^^^^^^^\n", - " File \"\", line 62, in run_inference\n", - " response = get_gemini_response(prompt=user_prompt, model_name=model_name, output_tokens=output_tokens)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"\", line 30, in get_gemini_response\n", - " response = model.generate_content(prompt, generation_config={\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/usr/local/lib/python3.11/dist-packages/google/generativeai/generative_models.py\", line 331, in generate_content\n", - " response = self._client.generate_content(\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/usr/local/lib/python3.11/dist-packages/google/ai/generativelanguage_v1beta/services/generative_service/client.py\", line 835, in generate_content\n", - " response = rpc(\n", - " ^^^^\n", - " File \"/usr/local/lib/python3.11/dist-packages/google/api_core/gapic_v1/method.py\", line 131, in __call__\n", - " return wrapped_func(*args, **kwargs)\n", - " 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/usr/local/lib/python3.11/dist-packages/google/api_core/retry/retry_unary.py\", line 293, in retry_wrapped_func\n", - " return retry_target(\n", - " ^^^^^^^^^^^^^\n", - " File \"/usr/local/lib/python3.11/dist-packages/google/api_core/retry/retry_unary.py\", line 153, in retry_target\n", - " _retry_error_helper(\n", - " File \"/usr/local/lib/python3.11/dist-packages/google/api_core/retry/retry_base.py\", line 212, in _retry_error_helper\n", - " raise final_exc from source_exc\n", - " File \"/usr/local/lib/python3.11/dist-packages/google/api_core/retry/retry_unary.py\", line 144, in retry_target\n", - " result = target()\n", - " ^^^^^^^^\n", - " File \"/usr/local/lib/python3.11/dist-packages/google/api_core/timeout.py\", line 130, in func_with_timeout\n", - " return func(*args, **kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/usr/local/lib/python3.11/dist-packages/google/api_core/grpc_helpers.py\", line 76, in error_remapped_callable\n", - " return callable_(*args, **kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/usr/local/lib/python3.11/dist-packages/google/ai/generativelanguage_v1beta/services/generative_service/transports/rest.py\", line 1161, in __call__\n", - " raise core_exceptions.from_http_response(response)\n", - "google.api_core.exceptions.TooManyRequests: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. 
For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "scenario: Generate a dataset of investment customers with fields like portfolio value, age, income bracket, risk appetite (low/medium/high), number of transactions per month, preferred investment types, and risk score.\n", - "model_type: gemini\n", - "model_name: gemini-2.0-flash\n", - "CustomerID,PortfolioValue,Age,IncomeBracket,RiskAppetite,TransactionsPerMonth,PreferredInvestmentType,RiskScore\n", - "1,75000.00,32,Medium,High,8,\"Stocks, Options\",78\n", - "2,120000.50,45,High,Medium,3,\"Bonds, Mutual Funds\",55\n", - "3,30000.75,28,Low,Low,1,\"Bonds\",25\n", - "4,250000.00,58,High,High,12,\"Stocks, Real Estate\",85\n", - "5,80000.25,39,Medium,Medium,5,\"Mutual Funds\",60\n", - "6,150000.00,48,High,Low,2,\"Bonds, ETFs\",40\n", - "7,45000.50,25,Low,Medium,4,\"Stocks\",50\n", - "8,300000.75,62,High,High,15,\"Stocks, Options, Real Estate\",92\n", - "9,90000.00,35,Medium,Medium,6,\"ETFs, Mutual Funds\",65\n", - "10,180000.25,50,High,Low,1,\"Bonds\",35\n", - "11,60000.50,29,Low,Low,2,\"Bonds, ETFs\",30\n", - "12,400000.00,65,High,High,18,\"Stocks, Options, Cryptocurrency\",95\n", - "13,100000.75,42,Medium,Medium,7,\"Mutual Funds, Real Estate\",70\n", - "14,200000.00,55,High,Low,0,\"Bonds, Annuities\",20\n", - "15,70000.25,31,Low,Medium,3,\"Stocks, ETFs\",58\n", - "16,130000.50,47,High,Medium,4,\"Bonds, Mutual Funds\",52\n", - "17,35000.75,27,Low,Low,1,\"Bonds\",28\n", - "18,280000.00,60,High,High,14,\"Stocks, Real Estate\",88\n", - "19,85000.25,37,Medium,Medium,5,\"ETFs\",63\n", - "20,160000.00,52,High,Low,2,\"Bonds, CDs\",38\n", - "21,50000.50,26,Low,Low,1,\"Bonds, Government Securities\",22\n", - "22,450000.75,68,High,High,20,\"Stocks, Options, Venture Capital\",97\n", - "23,110000.00,44,Medium,Medium,8,\"Mutual Funds, ETFs\",73\n", - "24,220000.25,57,High,Low,0,\"Bonds, Treasury Bills\",18\n", 
- "25,72000.50,33,Low,Medium,4,\"Stocks\",56\n", - "26,140000.00,49,High,Medium,3,\"Bonds, Mutual Funds\",54\n", - "27,32000.75,29,Low,Low,1,\"Bonds\",26\n", - "28,260000.00,61,High,High,13,\"Stocks, Real Estate\",86\n", - "29,82000.25,38,Medium,Medium,6,\"ETFs, Index Funds\",61\n", - "30,170000.50,53,High,Low,2,\"Bonds\",36\n", - "31,55000.75,24,Low,Low,2,\"Bonds, Money Market Accounts\",24\n", - "32,350000.00,64,High,High,17,\"Stocks, Options, Commodities\",93\n", - "33,95000.25,41,Medium,Medium,7,\"Mutual Funds, REITs\",68\n", - "34,190000.50,56,High,Low,0,\"Bonds, Fixed Income\",19\n", - "35,65000.00,30,Low,Medium,3,\"Stocks, Small Cap Stocks\",59\n", - "36,125000.75,46,High,Medium,4,\"Bonds, Large Cap Funds\",51\n", - "37,33000.25,28,Low,Low,1,\"Bonds\",27\n", - "38,270000.50,59,High,High,14,\"Stocks, Emerging Markets\",87\n", - "39,88000.00,36,Medium,Medium,5,\"ETFs, Balanced Funds\",64\n", - "40,155000.75,51,High,Low,2,\"Bonds, Corporate Bonds\",37\n", - "41,48000.25,25,Low,Low,1,\"Bonds, Municipal Bonds\",21\n", - "42,420000.00,67,High,High,19,\"Stocks, Options, Derivatives\",96\n", - "43,105000.75,43,Medium,Medium,8,\"Mutual Funds, Sector Funds\",71\n", - "44,210000.00,54,High,Low,0,\"Bonds, Government Bonds\",17\n", - "45,71000.25,32,Low,Medium,4,\"Stocks\",57\n", - "46,135000.50,48,High,Medium,3,\"Bonds, Index Funds\",53\n", - "47,34000.75,27,Low,Low,1,\"Bonds\",29\n", - "48,290000.00,63,High,High,16,\"Stocks, Real Estate, Private Equity\",90\n", - "49,89000.25,40,Medium,Medium,6,\"ETFs\",62\n", - "50,175000.50,50,High,Low,2,\"Bonds, Preferred Stocks\",39\n", - "investment_customers.csv\n", - "scenario: Generate a dataset for predicting customer churn in a subscription-based telecom company. 
Include features like monthly charges, contract type, tenure (in months), number of support calls, internet usage (in GB), payment method, and whether the customer has churned.\n", - "model_type: gemini\n", - "model_name: gemini-2.0-flash\n", - "scenario: Generate a dataset for predicting customer churn in a subscription-based telecom company. Include features like monthly charges, contract type, tenure (in months), number of support calls, internet usage (in GB), payment method, and whether the customer has churned.\n", - "model_type: gemini\n", - "model_name: gemini-2.0-flash\n", - "\n", - "testinggemini.json\n", - "scenario: Generate a dataset for predicting customer churn in a subscription-based telecom company. Include features like monthly charges, contract type, tenure (in months), number of support calls, internet usage (in GB), payment method, and whether the customer has churned.\n", - "model_type: gemini\n", - "model_name: gemini-2.0-flash\n", - "CustomerID,MonthlyCharges,ContractType,Tenure,SupportCalls,InternetUsage,PaymentMethod,Churned\n", - "TEL2847592374,67.55,Month-to-Month,9,3,145.2,Electronic Check,Yes\n", - "TEL9283746510,92.30,One Year,48,1,87.9,Credit Card,No\n", - "TEL1837465921,25.00,Month-to-Month,2,0,25.6,Mailed Check,Yes\n", - "TEL7364582910,115.75,Two Year,65,2,203.4,Bank Transfer,No\n", - "TEL5928374615,48.20,Month-to-Month,15,4,98.7,Electronic Check,Yes\n", - "TEL3847592016,78.90,One Year,36,1,167.1,Credit Card,No\n", - "TEL8273645910,31.50,Month-to-Month,3,0,30.2,Mailed Check,Yes\n", - "TEL6354789210,102.40,Two Year,70,3,185.9,Bank Transfer,No\n", - "TEL4738291056,55.85,Month-to-Month,11,2,112.5,Electronic Check,Yes\n", - "TEL1928374650,85.60,One Year,42,1,76.3,Credit Card,No\n", - "TEL7463529108,28.75,Month-to-Month,5,0,28.9,Mailed Check,Yes\n", - "TEL5293847610,110.30,Two Year,68,2,192.7,Bank Transfer,No\n", - "TEL3647582910,62.10,Month-to-Month,13,3,134.8,Electronic Check,Yes\n", - "TEL9182736450,98.45,One Year,39,1,91.5,Credit 
Card,No\n", - "TEL2736458109,34.90,Month-to-Month,7,0,33.6,Mailed Check,Yes\n", - "TEL8547392016,107.60,Two Year,62,2,179.3,Bank Transfer,No\n", - "TEL6192837450,59.35,Month-to-Month,10,3,123.4,Electronic Check,Yes\n", - "TEL4928374651,82.90,One Year,45,1,82.1,Credit Card,No\n", - "TEL1635294810,22.50,Month-to-Month,4,0,22.3,Mailed Check,Yes\n", - "TEL7283746509,118.20,Two Year,71,2,210.5,Bank Transfer,No\n", - "TEL5829374610,69.70,Month-to-Month,12,3,156.9,Electronic Check,Yes\n", - "TEL3918273640,95.15,One Year,40,1,89.7,Credit Card,No\n", - "TEL9374628105,37.40,Month-to-Month,6,0,36.2,Mailed Check,Yes\n", - "TEL6458293710,104.90,Two Year,67,2,188.1,Bank Transfer,No\n", - "TEL4829374615,57.10,Month-to-Month,14,3,118.2,Electronic Check,Yes\n", - "TEL1536472910,80.55,One Year,43,1,78.9,Credit Card,No\n", - "TEL7192837465,25.30,Month-to-Month,2,0,25.9,Mailed Check,Yes\n", - "TEL5374829106,112.90,Two Year,69,2,195.3,Bank Transfer,No\n", - "TEL3746582910,64.85,Month-to-Month,8,3,140.6,Electronic Check,Yes\n", - "TEL9263548107,90.20,One Year,46,1,85.5,Credit Card,No\n", - "TEL2635478109,32.65,Month-to-Month,4,0,31.4,Mailed Check,Yes\n", - "TEL8473920165,109.70,Two Year,63,2,182.5,Bank Transfer,No\n", - "TEL6283749105,54.50,Month-to-Month,16,3,110.1,Electronic Check,Yes\n", - "TEL4192837460,77.30,One Year,41,1,75.2,Credit Card,No\n", - "TEL1746352910,29.90,Month-to-Month,5,0,29.6,Mailed Check,Yes\n", - "TEL7382910564,117.10,Two Year,72,2,207.9,Bank Transfer,No\n", - "TEL5928374610,72.00,Month-to-Month,13,3,159.7,Electronic Check,Yes\n", - "TEL3847592016,97.85,One Year,38,1,93.2,Credit Card,No\n", - "TEL9182736450,39.55,Month-to-Month,7,0,38.3,Mailed Check,Yes\n", - "TEL6354789210,106.30,Two Year,66,2,190.8,Bank Transfer,No\n", - "TEL4738291056,51.75,Month-to-Month,11,3,105.9,Electronic Check,Yes\n", - "TEL1928374650,74.60,One Year,44,1,73.1,Credit Card,No\n", - "TEL7463529108,27.10,Month-to-Month,3,0,26.7,Mailed Check,Yes\n", - "TEL5293847610,114.50,Two 
Year,70,2,198.6,Bank Transfer,No\n", - "TEL3647582910,66.45,Month-to-Month,12,3,138.5,Electronic Check,Yes\n", - "TEL9182736450,93.50,One Year,47,1,84.2,Credit Card,No\n", - "TEL2736458109,35.15,Month-to-Month,6,0,34.9,Mailed Check,Yes\n", - "TEL8547392016,103.80,Two Year,64,2,176.1,Bank Transfer,No\n", - "TEL6192837450,58.20,Month-to-Month,14,3,120.7,Electronic Check,Yes\n", - "TEL4928374651,81.65,One Year,41,1,80.5,Credit Card,No\n", - "TEL1635294810,23.70,Month-to-Month,5,0,23.4,Mailed Check,Yes\n", - "TEL7283746509,119.90,Two Year,68,2,213.2,Bank Transfer,No\n", - "TEL5829374610,70.85,Month-to-Month,9,3,153.7,Electronic Check,Yes\n", - "TEL3918273640,96.20,One Year,45,1,92.4,Credit Card,No\n", - "TEL9374628105,36.80,Month-to-Month,7,0,35.6,Mailed Check,Yes\n", - "TEL6458293710,105.50,Two Year,69,2,185.4,Bank Transfer,No\n", - "TEL4829374615,56.30,Month-to-Month,15,3,115.1,Electronic Check,Yes\n", - "TEL1536472910,79.40,One Year,42,1,77.8,Credit Card,No\n", - "TEL7192837465,24.50,Month-to-Month,4,0,24.2,Mailed Check,Yes\n", - "TEL5374829106,111.80,Two Year,67,2,193.9,Bank Transfer,No\n", - "TEL3746582910,63.70,Month-to-Month,10,3,137.4,Electronic Check,Yes\n", - "TEL9263548107,89.10,One Year,40,1,83.9,Credit Card,No\n", - "TEL2635478109,33.85,Month-to-Month,6,0,32.5,Mailed Check,Yes\n", - "TEL8473920165,108.60,Two Year,65,2,179.9,Bank Transfer,No\n", - "TEL6283749105,53.40,Month-to-Month,11,3,107.8,Electronic Check,Yes\n", - "TEL4192837460,76.20,One Year,43,1,74.1,Credit Card,No\n", - "TEL1746352910,30.50,Month-to-Month,5,0,30.2,Mailed Check,Yes\n", - "TEL7382910564,116.00,Two Year,71,2,205.3,Bank Transfer,No\n", - "TEL5928374610,71.15,Month-to-Month,16,3,157.6,Electronic Check,Yes\n", - "TEL3847592016,97.00,One Year,39,1,90.9,Credit Card,No\n", - "TEL9182736450,38.70,Month-to-Month,3,0,37.4,Mailed Check,Yes\n", - "TEL6354789210,105.20,Two Year,68,2,188.7,Bank Transfer,No\n", - "TEL4738291056,52.55,Month-to-Month,14,3,104.2,Electronic Check,Yes\n", - 
"TEL1928374650,75.40,One Year,46,1,72.4,Credit Card,No\n", - "TEL7463529108,26.30,Month-to-Month,2,0,26.0,Mailed Check,Yes\n", - "TEL5293847610,113.70,Two Year,66,2,196.8,Bank Transfer,No\n", - "TEL3647582910,65.60,Month-to-Month,15,3,139.1,Electronic Check,Yes\n", - "TEL9182736450,94.35,One Year,42,1,86.8,Credit Card,No\n", - "TEL2736458109,34.30,Month-to-Month,4,0,34.0,Mailed Check,Yes\n", - "TEL8547392016,102.70,Two Year,63,2,173.5,Bank Transfer,No\n", - "TEL6192837450,59.90,Month-to-Month,13,3,121.3,Electronic Check,Yes\n", - "TEL4928374651,82.20,One Year,47,1,79.2,Credit Card,No\n", - "TEL1635294810,23.10,Month-to-Month,6,0,22.8,Mailed Check,Yes\n", - "TEL7283746509,119.30,Two Year,69,2,211.6,Bank Transfer,No\n", - "TEL5829374610,71.40,Month-to-Month,10,3,154.3,Electronic Check,Yes\n", - "TEL3918273640,96.70,One Year,44,1,91.7,Credit Card,No\n", - "TEL9374628105,37.10,Month-to-Month,5,0,36.8,Mailed Check,Yes\n", - "TEL6458293710,106.00,Two Year,70,2,186.1,Bank Transfer,No\n", - "TEL4829374615,55.70,Month-to-Month,12,3,112.0,Electronic Check,Yes\n", - "TEL1536472910,78.80,One Year,41,1,76.5,Credit Card,No\n", - "TEL7192837465,25.00,Month-to-Month,7,0,24.7,Mailed Check,Yes\n", - "TEL5374829106,111.20,Two Year,64,2,191.3,Bank Transfer,No\n", - "TEL3746582910,64.20,Month-to-Month,14,3,136.1,Electronic Check,Yes\n", - "TEL9263548107,90.80,One Year,43,1,82.6,Credit Card,No\n", - "TEL2635478109,33.20,Month-to-Month,5,0,31.9,Mailed Check,Yes\n", - "TEL8473920165,109.10,Two Year,67,2,177.4,Bank Transfer,No\n", - "TEL6283749105,54.00,Month-to-Month,16,3,109.4,Electronic Check,Yes\n", - "TEL4192837460,75.60,One Year,40,1,73.4,Credit Card,No\n", - "TEL1746352910,31.10,Month-to-Month,3,0,30.8,Mailed Check,Yes\n", - "TEL7382910564,115.40,Two Year,65,2,202.7,Bank Transfer,No\n", - "testinggemini.txt\n", - "Keyboard interruption in main thread... 
closing server.\n", - "Killing tunnel 127.0.0.1:7860 <> https://d076a9fef9034a4f24.gradio.live\n" - ] - }, - { - "output_type": "execute_result", - "data": { - "text/plain": [] - }, - "metadata": {}, - "execution_count": 11 - } - ], - "source": [ - "# Example Scenarios\n", - "\n", - "# Generate a dataset for predicting customer churn in a subscription-based telecom company. Include features like monthly charges, contract type, tenure (in months), number of support calls, internet usage (in GB), payment method, and whether the customer has churned.\n", - "# Generate a dataset for training a model to approve/reject loan applications. Include features like loan amount, applicant income, co-applicant income, employment type, credit history (binary), loan term, number of dependents, education level, and loan approval status.\n", - "# Create a dataset of credit card transactions for detecting fraud. Include transaction ID, amount, timestamp, merchant category, customer location, card presence (yes/no), transaction device type, and fraud label (yes/no).\n", - "# Generate a dataset of investment customers with fields like portfolio value, age, income bracket, risk appetite (low/medium/high), number of transactions per month, preferred investment types, and risk score.\n", - "# Create a dataset of hospitalized patients to predict readmission within 30 days. Include patient ID, age, gender, number of prior admissions, diagnosis codes, length of stay, discharge type, medications prescribed, and readmission label.\n", - "# Generate a dataset for predicting medical appointment no-shows. 
Include appointment ID, scheduled date, appointment date, lead time (days between scheduling and appointment), SMS reminders sent, patient age, gender, health condition severity, and no-show status.\n", - "\n", - "generator_ui.launch(share=True, debug=True, inbrowser=True)" - ] - }, - { - "cell_type": "code", - "source": [], - "metadata": { - "id": "_9HIC_AzfZBZ" - }, - "execution_count": null, - "outputs": [] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "gpuType": "T4", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - } + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "SFA6R-4jL7SS" + }, + "source": [ + "# Synthetic Data Generator Notebook\n", + "## About\n", + "This Colab notebook demonstrates the use of frontier and open-source LLMs for generating a synthetic dataset for a business scenario provided by the user. From a UI implemented in Gradio, a user can define their business scenario in detail, select the number of records needed along with the output format, and adjust the maximum number of output tokens to be generated by the chosen LLM.\n", + "\n", + "It does not stop here. Once the records have been produced in the LLM output, they can be extracted and stored in a file, in the same format the user selected earlier. The file is stored in the Colab notebook under the contents directory. All of this extraction is done with the help of the 're' library. It was my first time using it, and I totally enjoyed learning it.\n", + "\n", + "## Outlook\n", + "Sometimes the response is loaded with the user prompt and a lot of tags when using open-source models, such as Mixtral from Mistral. This is because of the prompt format being used. The 'assistant' 'role' format does not suit them. This is an optimization to look into, and it can easily be done by using a custom prompt template for such models; these templates are hinted at on their Hugging Face repos."
+ ] }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + { + "cell_type": "markdown", + "metadata": { + "id": "ip4I4Lff3B2M" + }, + "source": [ + "## Install & Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "8zVlW-GMcBaU", + "outputId": "0c473564-fb93-41a9-c819-e6aa2382d75a" + }, + "outputs": [], + "source": [ + "!pip install -q gradio anthropic requests torch bitsandbytes transformers accelerate openai" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "YKVNzE5sFH2l" + }, + "outputs": [], + "source": [ + "# imports\n", + "import re\n", + "import os\n", + "import sys\n", + "import gc\n", + "import io\n", + "import json\n", + "import anthropic\n", + "import gradio as gr\n", + "import requests\n", + "import subprocess\n", + "import google.generativeai as ggai\n", + "import torch\n", + "import tempfile\n", + "import shutil\n", + "from io import StringIO\n", + "import pandas as pd\n", + "from google.colab import userdata\n", + "from huggingface_hub import login\n", + "from openai import OpenAI\n", + "from pathlib import Path\n", + "from datetime import datetime\n", + "from IPython.display import Markdown, display, update_display\n", + "from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LWpD6bZv3mAR" + }, + "source": [ + "## HuggingFace Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "aeC2oWY2FTv7" + }, + "outputs": [], + "source": [ + "# Sign in to HuggingFace Hub\n", + "\n", + "hf_token = userdata.get('HF_TOKEN')\n", + "login(hf_token, add_to_git_credential=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8Au2UPVy3vn5" + }, + "source": [ + "## Frontier Models configuration" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": { + "id": "geBBsd14X3UL" + }, + "outputs": [], + "source": [ + "openai_client = OpenAI(api_key=userdata.get('OPENAI_API_KEY'))\n", + "anthropic_client = anthropic.Anthropic(api_key=userdata.get('ANTHROPIC_API_KEY'))\n", + "ggai.configure(api_key=userdata.get('GOOGLE_API_KEY'))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tCnDIOlKgjbO" + }, + "source": [ + "## Defining Prompts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gkwXZsxofAU1" + }, + "outputs": [], + "source": [ + "system_prompt = \"\"\"\n", + "You are a synthetic dataset generator. Your role is to create synthetic dataset that infers structured data schemas from business scenarios given by the user.\n", + "\n", + "Your task is to:\n", + "1. Understand the user's business problem(s) or use case(s).\n", + "2. Identify the key fields needed to support that scenario.\n", + "3. Define appropriate field names, data types, and formats.\n", + "4. Generate synthetic records that match the inferred schema.\n", + "\n", + "Guidelines:\n", + "- Use realistic field names and values. Do not invent unrelated fields or values.\n", + "- Choose sensible data types: string, integer, float, date, boolean, enum, etc.\n", + "- Respect logical constraints (e.g., age range, date ranges, email formats).\n", + "- Output the dataset in the format the user requests (json, csv, txt, markdown table).\n", + "- If the scenario is vague or broad, make reasonable assumptions and explain them briefly before generating the dataset.\n", + "- Always generate a dataset that supports the business use case logically.\n", + "\n", + "Before generating the data, display the inferred schema in a readable format.\n", + "\"\"\"\n", + "\n", + "# trial_user_prompt = \"I’m building a churn prediction model for a telecom company. 
Can you generate a synthetic dataset with 100 rows?\"\n", + "def get_user_prompt(business_problem, no_of_samples, file_format):\n", + " return f\"\"\"\n", + " The business scenario for which I want you to generate a dataset is defined below:\n", + " {business_problem}\n", + "\n", + " Generate a synthetic dataset of {no_of_samples} records in {file_format} format.\n", + " When generating the dataset, wrap it between the '<<<>>>' tag. Make sure the tag is there in the output.\n", + " Do not include any other special characters in between the tags, other than the ones required in producing the correct format of data.\n", + " For examples: When a 'csv' format is given, only the ',' character can be used in between the tags.\n", + " \"\"\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yNpVf9-oQdoO" + }, + "source": [ + "### Quantization Config" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3ErZ315MQdU3" + }, + "outputs": [], + "source": [ + "# This allows us to load the model into memory and use less memory\n", + "def get_quantization_config():\n", + " return BitsAndBytesConfig(\n", + " load_in_4bit=True,\n", + " bnb_4bit_use_double_quant=True,\n", + " bnb_4bit_compute_dtype=torch.bfloat16,\n", + " bnb_4bit_quant_type=\"nf4\"\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "clGtRh0N4951" + }, + "source": [ + "## HF Model inference" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MAhyn1ehb3Dh" + }, + "outputs": [], + "source": [ + "# All in one HuggingFace Model Response function\n", + "def run_hfmodel_and_get_response(prompt, model_name, output_tokens):\n", + " tokenizer = AutoTokenizer.from_pretrained(model_name)\n", + " tokenizer.pad_token = tokenizer.eos_token\n", + " inputs = tokenizer.apply_chat_template(prompt, return_tensors=\"pt\")\n", + " if torch.cuda.is_available():\n", + " inputs = inputs.to(\"cuda\")\n", + " streamer =
TextStreamer(tokenizer)\n", + " if \"microsoft/bitnet-b1.58-2B-4T\" in model_name:\n", + " model = AutoModelForCausalLM.from_pretrained(model_name, device_map=\"auto\", trust_remote_code=True)\n", + " elif \"tiiuae/Falcon-E-3B-Instruct\" in model_name:\n", + " model = AutoModelForCausalLM.from_pretrained(model_name, device_map=\"auto\", torch_dtype=torch.float16 )\n", + " else:\n", + " model = AutoModelForCausalLM.from_pretrained(model_name, device_map=\"auto\", quantization_config=get_quantization_config())\n", + " outputs = model.generate(inputs, max_new_tokens=output_tokens, streamer=streamer)\n", + " response = tokenizer.decode(outputs[0])\n", + " del model, inputs, tokenizer, outputs\n", + " gc.collect()\n", + " torch.cuda.empty_cache()\n", + " return response" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Gh_Ny1aM-L8z" + }, + "source": [ + "## Frontier Models Inference" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "h11WlZNhfHCR" + }, + "outputs": [], + "source": [ + "# ChatGPT, Claude and Gemini response function\n", + "def get_chatgpt_response(prompt, model_name, output_tokens):\n", + " response = openai_client.chat.completions.create(\n", + " model=model_name,\n", + " messages=prompt,\n", + " max_tokens=output_tokens,\n", + " )\n", + " return response.choices[0].message.content\n", + "\n", + "def get_claude_response(prompt, model_name, output_tokens):\n", + " response = anthropic_client.messages.create(\n", + " model=model_name,\n", + " max_tokens=output_tokens,\n", + " system=system_prompt,\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": prompt,\n", + " }\n", + " ],\n", + " )\n", + " return response.content[0].text\n", + "\n", + "def get_gemini_response(prompt, model_name, output_tokens):\n", + " model = ggai.GenerativeModel(\n", + " model_name=model_name,\n", + " system_instruction=system_prompt,\n", + " )\n", + "\n", + " response = model.generate_content(prompt, 
generation_config={\n", + " \"max_output_tokens\": output_tokens,\n", + " \"temperature\": 0.7,\n", + " })\n", + " return response.text" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nzHbM_WQvRgT" + }, + "source": [ + "## Gradio Implementation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uFWZqw1R-al_" + }, + "source": [ + "### Dropdowns Selection Lists" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rOzEb0o--aD7" + }, + "outputs": [], + "source": [ + "# Dropdown List Values for the user\n", + "MODEL_TYPES=[\"GPT\", \"Claude\", \"Gemini\", \"HuggingFace\"]\n", + "OPENAI_MODEL_NAMES=[\"gpt-4o-mini\", \"gpt-4o\", \"gpt-3.5-turbo\"]\n", + "ANTHROPIC_MODELS=[\"claude-3-7-sonnet-latest\", \"claude-3-5-haiku-latest\", \"claude-3-opus-latest\"]\n", + "GOOGLE_MODELS=[\"gemini-2.0-flash\", \"gemini-1.5-pro\"]\n", + "HUGGINGFACE_MODELS=[\n", + " \"meta-llama/Llama-3.2-3B-Instruct\",\n", + " \"microsoft/bitnet-b1.58-2B-4T\",\n", + " \"ByteDance-Seed/Seed-Coder-8B-Instruct\",\n", + " \"tiiuae/Falcon-E-3B-Instruct\",\n", + " \"Qwen/Qwen2.5-7B-Instruct\"\n", + "]\n", + "MODEL_NAMES = {\n", + " \"GPT\": OPENAI_MODEL_NAMES,\n", + " \"Claude\": ANTHROPIC_MODELS,\n", + " \"Gemini\": GOOGLE_MODELS,\n", + " \"HuggingFace\": HUGGINGFACE_MODELS\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sbXGL8_4-oKc" + }, + "source": [ + "### UI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_0NCY7FgCVHj" + }, + "outputs": [], + "source": [ + "with gr.Blocks() as generator_ui:\n", + " gr.Markdown(\"# 🧠 Business Scenario → Synthetic Dataset Generator\")\n", + "\n", + " with gr.Row():\n", + " with gr.Column(scale=3):\n", + " with gr.Row():\n", + " dataset_size=gr.Number(value=10, label=\"Enter the number of data samples to generate.\", show_label=True)\n", + " format=gr.Dropdown([\"json\", \"csv\", \"txt\", \"markdown\"], label=\"Select the format for 
the dataset\", show_label=True)\n", + " with gr.Row():\n", + " scenario=gr.Textbox(label=\"Business Scenario\", lines=5, placeholder=\"Describe your business scenario here\")\n", + " with gr.Row():\n", + " error = gr.Markdown(visible=False)\n", + " with gr.Row():\n", + " clear = gr.Button(\"Clear Everything\")\n", + " submit = gr.Button(\"Generate Dataset\", variant=\"primary\")\n", + "\n", + " with gr.Column(scale=1):\n", + " model_type = gr.Dropdown(MODEL_TYPES, label=\"Model Type\", show_label=True, info=\"Select the model type you want to use\")\n", + " model_name = gr.Dropdown(MODEL_NAMES[model_type.value], label=\"Model Name\", show_label=True, allow_custom_value=True, info=\"Select the model name or enter one manually\")\n", + " output_tokens= gr.Number(value=1000, label=\"Enter the max number of output tokens to generate.\", show_label=True, info=\"This will impact the length of the response containg the dataset\")\n", + "\n", + " with gr.Row():\n", + " # Chatbot Interface\n", + " chatbot = gr.Chatbot(\n", + " type='messages',\n", + " label='Chatbot',\n", + " show_label=True,\n", + " height=300,\n", + " resizable=True,\n", + " elem_id=\"chatbot\",\n", + " avatar_images=(\"🧑\", \"🤖\",)\n", + " )\n", + " with gr.Row(variant=\"compact\"):\n", + " extract_btn = gr.Button(\"Extract and Save Dataset\", variant=\"huggingface\", visible=False)\n", + " file_name = gr.Textbox(label=\"Enter file name here (without file extension)\", placeholder=\"e.g. 
# Sentinel returned by extract_dataset_string when no tagged dataset is found;
# extract_dataset_from_response compares against it to detect failure.
EXTRACTION_FAILED_MSG = "Could not extract dataset! Try again with a different model."


def run_inference(scenario, model_type, model_name, output_tokens, dataset_size, format):
    """Run the selected model on the scenario and return UI updates.

    Args:
        scenario: Free-text business scenario typed by the user.
        model_type: Model family label; lower-cased and matched against
            "gpt", "claude" and "gemini". Anything else is sent to the
            HuggingFace runner.
        model_name: Model identifier forwarded to the response helper.
        output_tokens: Maximum number of tokens to generate.
        dataset_size: Number of records requested (forwarded to get_user_prompt).
        format: Requested dataset format (forwarded to get_user_prompt).

    Returns:
        A pair ``(error_update, history)``: a ``gr.update`` for the error
        Markdown component and the chat history as a list of role/content dicts.
        On a validation failure the history is an empty list.
    """
    if not scenario or not scenario.strip():
        return gr.update(value="❌ **Error:** Please define a scenario first!", visible=True), []
    # An unselected dropdown delivers None; fail with a message instead of
    # crashing on None.lower().
    if not model_type or not model_name:
        return gr.update(value="❌ **Error:** Please select a model type and a model name first!", visible=True), []
    if not output_tokens or output_tokens <= 0:
        return gr.update(value="❌ **Error:** Please enter a positive number of output tokens!", visible=True), []

    model_type = model_type.lower()
    # The numeric input may arrive as a float (e.g. 1000.0); the model helpers
    # presumably expect an integer token budget, so normalise it here.
    output_tokens = int(output_tokens)
    print(f"scenario: {scenario}")
    print(f"model_type: {model_type}")
    print(f"model_name: {model_name}")

    user_prompt = get_user_prompt(scenario, dataset_size, format)
    prompt = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    if model_type == "gpt":
        response = get_chatgpt_response(prompt=prompt, model_name=model_name, output_tokens=output_tokens)
    elif model_type == "claude":
        response = get_claude_response(prompt=user_prompt, model_name=model_name, output_tokens=output_tokens)
    elif model_type == "gemini":
        response = get_gemini_response(prompt=user_prompt, model_name=model_name, output_tokens=output_tokens)
    else:
        response = run_hfmodel_and_get_response(prompt=prompt, model_name=model_name, output_tokens=output_tokens)
        # Release cached GPU memory after running a local model.
        torch.cuda.empty_cache()

    history = [
        {"role": "user", "content": scenario},
        {"role": "assistant", "content": response},
    ]
    return gr.update(visible=False), history


def extract_dataset_string(response):
    """Extract the dataset text found between the dataset tags of a response.

    Three strategies are tried in order, each using the LAST non-empty hit:
    strict ``<<<>>> ... <<<>>>`` blocks, loose ``<<< ... >>>`` blocks, and
    finally everything after the last ``<<<`` (cut at the first blank line or
    "Explanation:" / "Note:" / "---" marker).

    Returns:
        The extracted dataset string, or EXTRACTION_FAILED_MSG when the
        response contains no ``<<<`` marker at all.
    """
    # Remove known artificial tokens of the form <[...]>.
    response = re.sub(r"<\[.*?\]>", "", response)

    # Drop anything echoed before the first opening marker.
    response = re.sub(r"(?is)^.*?<<<", "<<<", response.strip(), count=1)

    # 1. Strict <<<>>>...<<<>>> blocks.
    strict = [m.strip() for m in re.findall(r"<<<>>>[\s\r\n]*(.*?)[\s\r\n]*<<<>>>", response, re.DOTALL)]
    strict = [m for m in strict if m]
    if strict:
        return strict[-1]

    # 2. Loose <<< ... >>> blocks. Empty captures are skipped: a lone
    #    "<<<>>>" opening tag would otherwise match here with no content.
    loose = [m.strip() for m in re.findall(r"<<<[\s\r\n]*(.*?)[\s\r\n]*>>>", response, re.DOTALL)]
    loose = [m for m in loose if m]
    if loose:
        return loose[-1]

    # 3. Fallback: take everything after the last <<< as raw data. This is the
    #    path for responses cut off before the closing tag.
    last_open = response.rfind("<<<")
    if last_open != -1:
        raw = response[last_open + 3:].strip()
        if raw.startswith(">>>"):
            # Remainder of a strict "<<<>>>" opening tag.
            raw = raw[3:].lstrip()
        # Cut off trailing notes, explanations, etc.
        raw = re.split(r"\n\s*\n|Explanation:|Note:|---", raw)[0]
        return raw.strip()

    return EXTRACTION_FAILED_MSG


def extract_dataset_from_response(chatbot_history, file_name, file_type):
    """Extract the dataset from the last chat message, save it, and report.

    Args:
        chatbot_history: List of role/content dicts shown in the chatbot.
        file_name: Target file name without extension.
        file_type: Dataset format, used as the file extension.

    Returns:
        A pair of ``gr.update`` objects: the status text box and the dataset
        preview (hidden when nothing could be extracted).
    """
    # Guard against an empty/None history before indexing into it.
    response = chatbot_history[-1]["content"] if chatbot_history else None
    if not response:
        return gr.update(visible=True, value="Could not find LLM Response! Try again."), gr.update(visible=False)

    dataset = extract_dataset_string(response)
    if dataset == EXTRACTION_FAILED_MSG:
        return gr.update(visible=True, value=dataset), gr.update(visible=False)

    status = save_dataset(dataset, file_type, file_name)
    return gr.update(visible=True, value=status), gr.update(visible=True, value=dataset)


def save_dataset(dataset, file_format, file_name):
    """Save the dataset to ``<file_name>.<file_format>``.

    "json" content is parsed before the file is opened, so invalid JSON does
    not leave an empty file behind; "csv" content is round-tripped through
    pandas; "txt" content is written verbatim.

    Returns:
        A human-readable status message. Failures are reported through the
        return value rather than raised, because the result is shown in the UI.
    """
    base_name = (file_name or "").strip()
    if not base_name:
        return "Could not save dataset! Enter a file name first."

    target = f"{base_name}.{file_format}"
    try:
        if file_format == "json":
            records = json.loads(dataset)
            with open(target, "w", encoding="utf-8") as fh:
                json.dump(records, fh, indent=4)
        elif file_format == "csv":
            pd.read_csv(StringIO(dataset)).to_csv(target, index=False)
        elif file_format == "txt":
            with open(target, "w", encoding="utf-8") as fh:
                fh.write(dataset)
        else:
            return f"Could not save dataset! Unsupported format: {file_format}"
    except Exception as exc:  # best-effort UI action: report, don't crash the app
        print(f"Failed to save {target}: {exc}")
        return "Could not save dataset! Try again in another format."
    return "Dataset saved successfully!"


def clear_chat():
    """Reset the scenario box and chat, and hide the preview and status boxes."""
    return "", [], gr.update(visible=False), gr.update(visible=False)


def show_extract_btn(chatbot_history, format):
    """Decide which post-generation widgets are visible.

    Returns updates for (markdown preview, extract button, file-name box):
    everything hidden while there is no response; only the markdown preview
    for the "markdown" format; otherwise only the extract button and the
    file-name box.
    """
    if not chatbot_history:
        return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
    if format == "markdown":
        # The assistant reply is the last history entry.
        return gr.update(visible=True, value=chatbot_history[-1]["content"]), gr.update(visible=False), gr.update(visible=False)
    return gr.update(visible=False), gr.update(visible=True), gr.update(visible=True)
subscription-based telecom company. Include features like monthly charges, contract type, tenure (in months), number of support calls, internet usage (in GB), payment method, and whether the customer has churned.\n", + "# Generate a dataset for training a model to approve/reject loan applications. Include features like loan amount, applicant income, co-applicant income, employment type, credit history (binary), loan term, number of dependents, education level, and loan approval status.\n", + "# Create a dataset of credit card transactions for detecting fraud. Include transaction ID, amount, timestamp, merchant category, customer location, card presence (yes/no), transaction device type, and fraud label (yes/no).\n", + "# Generate a dataset of investment customers with fields like portfolio value, age, income bracket, risk appetite (low/medium/high), number of transactions per month, preferred investment types, and risk score.\n", + "# Create a dataset of hospitalized patients to predict readmission within 30 days. Include patient ID, age, gender, number of prior admissions, diagnosis codes, length of stay, discharge type, medications prescribed, and readmission label.\n", + "# Generate a dataset for predicting medical appointment no-shows. 
Include appointment ID, scheduled date, appointment date, lead time (days between scheduling and appointment), SMS reminders sent, patient age, gender, health condition severity, and no-show status.\n", + "\n", + "generator_ui.launch(share=True, debug=True, inbrowser=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_9HIC_AzfZBZ" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From b2b26ddd4b4063cdbcd43fed4663e19cb0381579 Mon Sep 17 00:00:00 2001 From: Fikri Raihan Date: Sun, 25 May 2025 17:38:22 +0700 Subject: [PATCH 03/23] Added my contributions to community contributions week1 day1, github information --- .../day-1-github-information.ipynb | 841 ++++++++++++++++++ 1 file changed, 841 insertions(+) create mode 100644 week1/community-contributions/day-1-github-information.ipynb diff --git a/week1/community-contributions/day-1-github-information.ipynb b/week1/community-contributions/day-1-github-information.ipynb new file mode 100644 index 0000000..b5adb6d --- /dev/null +++ b/week1/community-contributions/day-1-github-information.ipynb @@ -0,0 +1,841 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "4d011f3d-c10c-4a75-bd36-576e383a8d1d", + "metadata": {}, + "outputs": [], + "source": [ + "# imports\n", + "\n", + "import os\n", + "import requests\n", + "from dotenv import load_dotenv\n", + "from bs4 import BeautifulSoup\n", + "from IPython.display import Markdown, display\n", + "from openai import 
# Load environment variables from a .env file, overriding any already set.
load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Sanity-check the key and print a diagnosis. The whitespace check runs before
# the prefix check: a key with a leading space would otherwise be reported as
# having the wrong prefix, hiding the real problem.
if not api_key:
    print('No Api Key was found')
elif api_key.strip() != api_key:
    print("An api key was found, but it might have space in the first or end")
elif not api_key.startswith('sk-proj-'):
    print("An api key was found, but it doesnt start with sk-proj")
else:
    print("Api key found and looks good so far!")

# Client used by the cells below.
openai = OpenAI()
# Browser-like User-Agent sent with every page request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}


class Website:
    """A fetched web page reduced to its title and visible body text."""

    def __init__(self, url, timeout=30):
        """
        Create this Website object from the given url using the BeautifulSoup library.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the notebook indefinitely.

        Attributes set:
            url: The requested address.
            title: The page title, or "No title found" when the page has no
                usable <title>.
            text: Newline-separated text of <body> with script/style/img/input
                elements removed; empty string when the page has no <body>.
        """
        self.url = url
        response = requests.get(url, headers=headers, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')

        # .string is None for an empty or nested <title>; treat that as missing.
        title_tag = soup.title
        self.title = title_tag.string if title_tag and title_tag.string else "No title found"

        body = soup.body
        if body is None:
            # Non-HTML or malformed responses have no <body> to walk.
            self.text = ""
            return
        for irrelevant in body(["script", "style", "img", "input"]):
            irrelevant.decompose()
        self.text = body.get_text(separator="\n", strip=True)
+ "Find more, search less\n", + "Explore\n", + "Why GitHub\n", + "All features\n", + "Documentation\n", + "GitHub Skills\n", + "Blog\n", + "Solutions\n", + "By company size\n", + "Enterprises\n", + "Small and medium teams\n", + "Startups\n", + "Nonprofits\n", + "By use case\n", + "DevSecOps\n", + "DevOps\n", + "CI/CD\n", + "View all use cases\n", + "By industry\n", + "Healthcare\n", + "Financial services\n", + "Manufacturing\n", + "Government\n", + "View all industries\n", + "View all solutions\n", + "Resources\n", + "Topics\n", + "AI\n", + "DevOps\n", + "Security\n", + "Software Development\n", + "View all\n", + "Explore\n", + "Learning Pathways\n", + "Events & Webinars\n", + "Ebooks & Whitepapers\n", + "Customer Stories\n", + "Partners\n", + "Executive Insights\n", + "Open Source\n", + "GitHub Sponsors\n", + "Fund open source developers\n", + "The ReadME Project\n", + "GitHub community articles\n", + "Repositories\n", + "Topics\n", + "Trending\n", + "Collections\n", + "Enterprise\n", + "Enterprise platform\n", + "AI-powered developer platform\n", + "Available add-ons\n", + "GitHub Advanced Security\n", + "Enterprise-grade security features\n", + "Copilot for business\n", + "Enterprise-grade AI features\n", + "Premium Support\n", + "Enterprise-grade 24/7 support\n", + "Pricing\n", + "Search or jump to...\n", + "Search code, repositories, users, issues, pull requests...\n", + "Search\n", + "Clear\n", + "Search syntax tips\n", + "Provide feedback\n", + "We read every piece of feedback, and take your input very seriously.\n", + "Include my email address so I can be contacted\n", + "Cancel\n", + "Submit feedback\n", + "Saved searches\n", + "Use saved searches to filter your results more quickly\n", + "Cancel\n", + "Create saved search\n", + "Sign in\n", + "Sign up\n", + "Appearance settings\n", + "Resetting focus\n", + "You signed in with another tab or window.\n", + "Reload\n", + "to refresh your session.\n", + "You signed out in another tab or window.\n", + 
"Reload\n", + "to refresh your session.\n", + "You switched accounts on another tab or window.\n", + "Reload\n", + "to refresh your session.\n", + "Dismiss alert\n", + "Fikriraihan\n", + "Follow\n", + "Overview\n", + "Repositories\n", + "34\n", + "Projects\n", + "0\n", + "Packages\n", + "0\n", + "Stars\n", + "0\n", + "More\n", + "Overview\n", + "Repositories\n", + "Projects\n", + "Packages\n", + "Stars\n", + "Fikriraihan\n", + "Follow\n", + "Fikriraihan\n", + "Follow\n", + "Block or Report\n", + "Block or report Fikriraihan\n", + "Report abuse\n", + "Contact GitHub support about this user’s behavior.\n", + " Learn more about\n", + "reporting abuse\n", + ".\n", + "Report abuse\n", + "Overview\n", + "Repositories\n", + "34\n", + "Projects\n", + "0\n", + "Packages\n", + "0\n", + "Stars\n", + "0\n", + "More\n", + "Overview\n", + "Repositories\n", + "Projects\n", + "Packages\n", + "Stars\n", + "Pinned\n", + "Loading\n", + "2024-coding-challenge\n", + "2024-coding-challenge\n", + "Public\n", + "Repository for Coding Challenge 2024\n", + "JavaScript\n", + "ChatGPT\n", + "ChatGPT\n", + "Public\n", + "TypeScript\n", + "fikri-3d-portofolio\n", + "fikri-3d-portofolio\n", + "Public\n", + "JavaScript\n", + "nextjs-dashboard\n", + "nextjs-dashboard\n", + "Public\n", + "Nextjs-dashboard course\n", + "TypeScript\n", + "nextjs-postgre\n", + "nextjs-postgre\n", + "Public\n", + "TypeScript\n", + "imaginify\n", + "imaginify\n", + "Public\n", + "TypeScript\n", + "Something went wrong, please refresh the page to try again.\n", + "If the problem persists, check the\n", + "GitHub status page\n", + "or\n", + "contact support\n", + ".\n", + "Uh oh!\n", + "There was an error while loading.\n", + "Please reload this page\n", + ".\n", + "Footer\n", + "© 2025 GitHub, Inc.\n", + "Footer navigation\n", + "Terms\n", + "Privacy\n", + "Security\n", + "Status\n", + "Docs\n", + "Contact\n", + "Manage cookies\n", + "Do not share my personal information\n", + "You can’t perform that action at this 
# System prompt describing the structured GitHub-profile report the model
# should produce. Built from adjacent string literals; the text is one long
# line with single spaces between the parts.
system_prompt = (
    "You are a skilled GitHub profile analyzer. "
    "Your job is to take the provided GitHub profile or repository URL and generate a clear, structured summary covering these points: "
    "1️⃣ **Profile Summary** "
    "- Username "
    "- Bio (if available) "
    "- Total public repositories "
    "- Total followers "
    "- Total stars received (sum across repos) "
    "- Top programming languages (by repo count) "
    "2️⃣ **Repository Highlights** (top 3 by stars or activity) "
    "For each: "
    "- Repository name "
    "- Description "
    "- Primary language "
    "- Star count "
    "- Last updated date "
    "- Notable technologies or frameworks used "
    "3️⃣ **Overall Assessment** "
    "- What does this user specialize in? "
    "- Are they more focused on personal projects or collaborations? "
    "- Any standout strengths or skills you notice? "
    "4️⃣ **Recommendations** "
    "- Suggest one area or technology they could explore next to grow. "
    "- Suggest one improvement to make their GitHub profile more appealing. "
    "Be concise, insightful, and encourage the user’s growth. "
    "If some data is missing, state it clearly instead of guessing."
)


def user_prompt_for(website):
    """Build the user prompt for a fetched page.

    Args:
        website: Object exposing ``title`` and ``text`` attributes.

    Returns:
        The instruction sentence followed by a blank line and the page text.
        The blank line keeps the instruction from running straight into the
        first word of the scraped content.
    """
    return (
        f"You are looking at a github named {website.title}\n"
        "The contents of this github is as follows; "
        "please provide a summary of this website in markdown.\n\n"
        f"{website.text}"
    )
If some data is missing, state it clearly instead of guessing.'" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "system_prompt" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "2e040916-8d7e-421b-b1a7-56e710940eaa", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "You are looking at a github named Fikriraihan · GitHub\n", + "The contents of this github is as follows; please provide a summary of this website in markdown.Skip to content\n", + "Navigation Menu\n", + "Toggle navigation\n", + "Sign in\n", + "Appearance settings\n", + "Product\n", + "GitHub Copilot\n", + "Write better code with AI\n", + "GitHub Models\n", + "New\n", + "Manage and compare prompts\n", + "GitHub Advanced Security\n", + "Find and fix vulnerabilities\n", + "Actions\n", + "Automate any workflow\n", + "Codespaces\n", + "Instant dev environments\n", + "Issues\n", + "Plan and track work\n", + "Code Review\n", + "Manage code changes\n", + "Discussions\n", + "Collaborate outside of code\n", + "Code Search\n", + "Find more, search less\n", + "Explore\n", + "Why GitHub\n", + "All features\n", + "Documentation\n", + "GitHub Skills\n", + "Blog\n", + "Solutions\n", + "By company size\n", + "Enterprises\n", + "Small and medium teams\n", + "Startups\n", + "Nonprofits\n", + "By use case\n", + "DevSecOps\n", + "DevOps\n", + "CI/CD\n", + "View all use cases\n", + "By industry\n", + "Healthcare\n", + "Financial services\n", + "Manufacturing\n", + "Government\n", + "View all industries\n", + "View all solutions\n", + "Resources\n", + "Topics\n", + "AI\n", + "DevOps\n", + "Security\n", + "Software Development\n", + "View all\n", + "Explore\n", + "Learning Pathways\n", + "Events & Webinars\n", + "Ebooks & Whitepapers\n", + "Customer Stories\n", + "Partners\n", + "Executive Insights\n", + "Open Source\n", + "GitHub Sponsors\n", + "Fund open source developers\n", + "The ReadME 
Project\n", + "GitHub community articles\n", + "Repositories\n", + "Topics\n", + "Trending\n", + "Collections\n", + "Enterprise\n", + "Enterprise platform\n", + "AI-powered developer platform\n", + "Available add-ons\n", + "GitHub Advanced Security\n", + "Enterprise-grade security features\n", + "Copilot for business\n", + "Enterprise-grade AI features\n", + "Premium Support\n", + "Enterprise-grade 24/7 support\n", + "Pricing\n", + "Search or jump to...\n", + "Search code, repositories, users, issues, pull requests...\n", + "Search\n", + "Clear\n", + "Search syntax tips\n", + "Provide feedback\n", + "We read every piece of feedback, and take your input very seriously.\n", + "Include my email address so I can be contacted\n", + "Cancel\n", + "Submit feedback\n", + "Saved searches\n", + "Use saved searches to filter your results more quickly\n", + "Cancel\n", + "Create saved search\n", + "Sign in\n", + "Sign up\n", + "Appearance settings\n", + "Resetting focus\n", + "You signed in with another tab or window.\n", + "Reload\n", + "to refresh your session.\n", + "You signed out in another tab or window.\n", + "Reload\n", + "to refresh your session.\n", + "You switched accounts on another tab or window.\n", + "Reload\n", + "to refresh your session.\n", + "Dismiss alert\n", + "Fikriraihan\n", + "Follow\n", + "Overview\n", + "Repositories\n", + "34\n", + "Projects\n", + "0\n", + "Packages\n", + "0\n", + "Stars\n", + "0\n", + "More\n", + "Overview\n", + "Repositories\n", + "Projects\n", + "Packages\n", + "Stars\n", + "Fikriraihan\n", + "Follow\n", + "Fikriraihan\n", + "Follow\n", + "Block or Report\n", + "Block or report Fikriraihan\n", + "Report abuse\n", + "Contact GitHub support about this user’s behavior.\n", + " Learn more about\n", + "reporting abuse\n", + ".\n", + "Report abuse\n", + "Overview\n", + "Repositories\n", + "34\n", + "Projects\n", + "0\n", + "Packages\n", + "0\n", + "Stars\n", + "0\n", + "More\n", + "Overview\n", + "Repositories\n", + "Projects\n", + 
"Packages\n", + "Stars\n", + "Pinned\n", + "Loading\n", + "2024-coding-challenge\n", + "2024-coding-challenge\n", + "Public\n", + "Repository for Coding Challenge 2024\n", + "JavaScript\n", + "ChatGPT\n", + "ChatGPT\n", + "Public\n", + "TypeScript\n", + "fikri-3d-portofolio\n", + "fikri-3d-portofolio\n", + "Public\n", + "JavaScript\n", + "nextjs-dashboard\n", + "nextjs-dashboard\n", + "Public\n", + "Nextjs-dashboard course\n", + "TypeScript\n", + "nextjs-postgre\n", + "nextjs-postgre\n", + "Public\n", + "TypeScript\n", + "imaginify\n", + "imaginify\n", + "Public\n", + "TypeScript\n", + "Something went wrong, please refresh the page to try again.\n", + "If the problem persists, check the\n", + "GitHub status page\n", + "or\n", + "contact support\n", + ".\n", + "Uh oh!\n", + "There was an error while loading.\n", + "Please reload this page\n", + ".\n", + "Footer\n", + "© 2025 GitHub, Inc.\n", + "Footer navigation\n", + "Terms\n", + "Privacy\n", + "Security\n", + "Status\n", + "Docs\n", + "Contact\n", + "Manage cookies\n", + "Do not share my personal information\n", + "You can’t perform that action at this time.\n" + ] + } + ], + "source": [ + "print(user_prompt_for(github))" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "da2a2c62-0ff4-4e4b-a1a1-774b47f848a0", + "metadata": {}, + "outputs": [], + "source": [ + "messages = [\n", + " {\"role\": \"system\", \"content\": \"You are a snarky assistant\"},\n", + " {\"role\": \"user\", \"content\": \"tell me a fruit that has red color\"}\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "204b9b40-cfd9-46f4-a954-efee75fc3d79", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Oh, I don’t know, how about the classic red apple? Or maybe you were hoping for something more exotic, like a blood orange? There’s also the ever-popular strawberry. The options are endless! 
def messages_for(website):
    """Return the two-message chat payload (system, then user) for a page."""
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": user_prompt_for(website)}
    return [system_message, user_message]
If some data is missing, state it clearly instead of guessing.'},\n", + " {'role': 'user',\n", + " 'content': 'You are looking at a github named Fikriraihan · GitHub\\nThe contents of this github is as follows; please provide a summary of this website in markdown.Skip to content\\nNavigation Menu\\nToggle navigation\\nSign in\\nAppearance settings\\nProduct\\nGitHub Copilot\\nWrite better code with AI\\nGitHub Models\\nNew\\nManage and compare prompts\\nGitHub Advanced Security\\nFind and fix vulnerabilities\\nActions\\nAutomate any workflow\\nCodespaces\\nInstant dev environments\\nIssues\\nPlan and track work\\nCode Review\\nManage code changes\\nDiscussions\\nCollaborate outside of code\\nCode Search\\nFind more, search less\\nExplore\\nWhy GitHub\\nAll features\\nDocumentation\\nGitHub Skills\\nBlog\\nSolutions\\nBy company size\\nEnterprises\\nSmall and medium teams\\nStartups\\nNonprofits\\nBy use case\\nDevSecOps\\nDevOps\\nCI/CD\\nView all use cases\\nBy industry\\nHealthcare\\nFinancial services\\nManufacturing\\nGovernment\\nView all industries\\nView all solutions\\nResources\\nTopics\\nAI\\nDevOps\\nSecurity\\nSoftware Development\\nView all\\nExplore\\nLearning Pathways\\nEvents & Webinars\\nEbooks & Whitepapers\\nCustomer Stories\\nPartners\\nExecutive Insights\\nOpen Source\\nGitHub Sponsors\\nFund open source developers\\nThe ReadME Project\\nGitHub community articles\\nRepositories\\nTopics\\nTrending\\nCollections\\nEnterprise\\nEnterprise platform\\nAI-powered developer platform\\nAvailable add-ons\\nGitHub Advanced Security\\nEnterprise-grade security features\\nCopilot for business\\nEnterprise-grade AI features\\nPremium Support\\nEnterprise-grade 24/7 support\\nPricing\\nSearch or jump to...\\nSearch code, repositories, users, issues, pull requests...\\nSearch\\nClear\\nSearch syntax tips\\nProvide feedback\\nWe read every piece of feedback, and take your input very seriously.\\nInclude my email address so I can be contacted\\nCancel\\nSubmit 
feedback\\nSaved searches\\nUse saved searches to filter your results more quickly\\nCancel\\nCreate saved search\\nSign in\\nSign up\\nAppearance settings\\nResetting focus\\nYou signed in with another tab or window.\\nReload\\nto refresh your session.\\nYou signed out in another tab or window.\\nReload\\nto refresh your session.\\nYou switched accounts on another tab or window.\\nReload\\nto refresh your session.\\nDismiss alert\\nFikriraihan\\nFollow\\nOverview\\nRepositories\\n34\\nProjects\\n0\\nPackages\\n0\\nStars\\n0\\nMore\\nOverview\\nRepositories\\nProjects\\nPackages\\nStars\\nFikriraihan\\nFollow\\nFikriraihan\\nFollow\\nBlock or Report\\nBlock or report Fikriraihan\\nReport abuse\\nContact GitHub support about this user’s behavior.\\n Learn more about\\nreporting abuse\\n.\\nReport abuse\\nOverview\\nRepositories\\n34\\nProjects\\n0\\nPackages\\n0\\nStars\\n0\\nMore\\nOverview\\nRepositories\\nProjects\\nPackages\\nStars\\nPinned\\nLoading\\n2024-coding-challenge\\n2024-coding-challenge\\nPublic\\nRepository for Coding Challenge 2024\\nJavaScript\\nChatGPT\\nChatGPT\\nPublic\\nTypeScript\\nfikri-3d-portofolio\\nfikri-3d-portofolio\\nPublic\\nJavaScript\\nnextjs-dashboard\\nnextjs-dashboard\\nPublic\\nNextjs-dashboard course\\nTypeScript\\nnextjs-postgre\\nnextjs-postgre\\nPublic\\nTypeScript\\nimaginify\\nimaginify\\nPublic\\nTypeScript\\nSomething went wrong, please refresh the page to try again.\\nIf the problem persists, check the\\nGitHub status page\\nor\\ncontact support\\n.\\nUh oh!\\nThere was an error while loading.\\nPlease reload this page\\n.\\nFooter\\n© 2025 GitHub,\\xa0Inc.\\nFooter navigation\\nTerms\\nPrivacy\\nSecurity\\nStatus\\nDocs\\nContact\\nManage cookies\\nDo not share my personal information\\nYou can’t perform that action at this time.'}]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "messages_for(github)" + ] + }, + { + "cell_type": "code", + 
"execution_count": 24, + "id": "e64f497f-3742-4d70-9e15-29d1974b3361", + "metadata": {}, + "outputs": [], + "source": [ + "def summarize(url):\n", + " website = Website(url)\n", + " response = openai.chat.completions.create(\n", + " model = \"gpt-4o-mini\",\n", + " messages = messages_for(website)\n", + " )\n", + " return response.choices[0].message.content" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "95d0938d-0b26-4253-94a6-ac9240e7a8c9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'# GitHub Profile Summary for Fikriraihan\\n\\n### 1️⃣ Profile Summary\\n- **Username:** Fikriraihan\\n- **Bio:** (No bio available)\\n- **Total public repositories:** 34\\n- **Total followers:** 0 (not indicated)\\n- **Total stars received:** 0\\n- **Top programming languages (by repo count):**\\n - JavaScript\\n - TypeScript\\n\\n### 2️⃣ Repository Highlights\\n**Top 3 repositories by activity:**\\n\\n1. **Repository Name:** 2024-coding-challenge\\n - **Description:** Repository for Coding Challenge 2024\\n - **Primary Language:** JavaScript\\n - **Star Count:** 0\\n - **Last Updated Date:** (Not available)\\n - **Notable Technologies or Frameworks Used:** JavaScript\\n\\n2. **Repository Name:** ChatGPT\\n - **Description:** (No description provided)\\n - **Primary Language:** TypeScript\\n - **Star Count:** 0\\n - **Last Updated Date:** (Not available)\\n - **Notable Technologies or Frameworks Used:** TypeScript\\n\\n3. 
**Repository Name:** fikri-3d-portofolio\\n - **Description:** (No description provided)\\n - **Primary Language:** JavaScript\\n - **Star Count:** 0\\n - **Last Updated Date:** (Not available)\\n - **Notable Technologies or Frameworks Used:** JavaScript\\n\\n### 3️⃣ Overall Assessment\\n- **What does this user specialize in?** \\n - Based on the repository languages, Fikriraihan appears to specialize in JavaScript and TypeScript.\\n\\n- **Are they more focused on personal projects or collaborations?** \\n - The profile indicates a focus on personal projects given the lack of followers and collaborations apparent from the repositories.\\n\\n- **Any standout strengths or skills you notice?** \\n - The presence of JavaScript and TypeScript projects suggests proficiency in web development, specifically in relation to modern frameworks.\\n\\n### 4️⃣ Recommendations\\n- **One area or technology to explore next to grow:**\\n - Fikriraihan could benefit from exploring backend technologies, such as Node.js or Express, to complement their front-end skills with JavaScript/TypeScript.\\n\\n- **One improvement to make their GitHub profile more appealing:**\\n - Adding a bio and descriptions for each repository would help provide context and showcase their intent and the purpose behind each project, thereby attracting more engagement and potential collaborators.'" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "summarize(\"https://github.com/Fikriraihan\")" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "cd863db3-731a-46d8-ac14-f74f8ae39bd4", + "metadata": {}, + "outputs": [], + "source": [ + "def display_summary(url):\n", + " summary = summarize(url)\n", + " display(Markdown(summary))" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "70c5c3aa-2c06-460b-9c4f-6465d2c8611c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "# GitHub Profile Summary for 
Fikriraihan\n", + "\n", + "### 1️⃣ Profile Summary\n", + "- **Username:** Fikriraihan\n", + "- **Bio:** Not available\n", + "- **Total public repositories:** 34\n", + "- **Total followers:** Not available\n", + "- **Total stars received:** 0\n", + "- **Top programming languages (by repo count):**\n", + " - JavaScript\n", + " - TypeScript\n", + "\n", + "### 2️⃣ Repository Highlights\n", + "Here are the top repositories based on their details:\n", + "\n", + "1. **Repository Name:** 2024-coding-challenge\n", + " - **Description:** Repository for Coding Challenge 2024\n", + " - **Primary Language:** JavaScript\n", + " - **Star Count:** 0\n", + " - **Last Updated Date:** Not available\n", + " - **Notable Technologies/Frameworks Used:** None specified\n", + "\n", + "2. **Repository Name:** ChatGPT\n", + " - **Description:** Not available\n", + " - **Primary Language:** TypeScript\n", + " - **Star Count:** 0\n", + " - **Last Updated Date:** Not available\n", + " - **Notable Technologies/Frameworks Used:** None specified\n", + "\n", + "3. **Repository Name:** fikri-3d-portofolio\n", + " - **Description:** Not available\n", + " - **Primary Language:** JavaScript\n", + " - **Star Count:** 0\n", + " - **Last Updated Date:** Not available\n", + " - **Notable Technologies/Frameworks Used:** None specified\n", + "\n", + "### 3️⃣ Overall Assessment\n", + "- **What does this user specialize in?** Fikriraihan specializes in JavaScript and TypeScript, indicating a focus on web development or applications that utilize these languages.\n", + "- **Are they more focused on personal projects or collaborations?** The presence of multiple repositories suggests a mix of personal projects. There is no indication of collaboration, as there are no mentions of contributions to external repositories.\n", + "- **Any standout strengths or skills you notice?** The variety of repositories shows an interest in different coding challenges and portfolio projects. 
However, the lack of stars suggests that the projects may not yet attract a significant audience.\n", + "\n", + "### 4️⃣ Recommendations\n", + "- **Suggest one area or technology they could explore next to grow:** Given the user’s focus on JavaScript and TypeScript, exploring frameworks like React, Vue.js, or even server-side technologies such as Node.js could be beneficial.\n", + "- **Suggest one improvement to make their GitHub profile more appealing:** Adding a bio with a brief introduction and interests, along with project descriptions, would provide more context about the user and enhance engagement with their repositories. Additionally, increasing the visibility of the repositories through more optimization and possibly sharing or collaborating on projects could attract more stars and followers." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display_summary(\"https://github.com/Fikriraihan\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f3dfe6e3-dfd2-4acd-a2e4-681873c650c8", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From feba5d084777035c42908e18e6214fdb994b142a Mon Sep 17 00:00:00 2001 From: Fikri Raihan Date: Sun, 25 May 2025 17:46:40 +0700 Subject: [PATCH 04/23] Added my contributions to community-contributions wk1 day1, github information --- .../day-1-github-information.ipynb | 626 +----------------- 1 file changed, 23 insertions(+), 603 deletions(-) diff --git a/week1/community-contributions/day-1-github-information.ipynb 
b/week1/community-contributions/day-1-github-information.ipynb index b5adb6d..5b8cf40 100644 --- a/week1/community-contributions/day-1-github-information.ipynb +++ b/week1/community-contributions/day-1-github-information.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "4d011f3d-c10c-4a75-bd36-576e383a8d1d", "metadata": {}, "outputs": [], @@ -22,18 +22,10 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "c51302e0-c848-4ec4-a0ab-03deeb9e7987", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Api key found and looks good so far!\n" - ] - } - ], + "outputs": [], "source": [ "load_dotenv(override=True)\n", "api_key = os.getenv('OPENAI_API_KEY')\n", @@ -50,7 +42,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "d1df04f3-bd4d-4b14-87cc-1e91eaf7c0ab", "metadata": {}, "outputs": [], @@ -60,18 +52,10 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "340b018a-6e97-491c-aa26-66c683ece8a0", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Hello! Welcome! 
How can I assist you today?\n" - ] - } - ], + "outputs": [], "source": [ "message = \"Hello GPT, this is my first message\"\n", "response = openai.chat.completions.create(model=\"gpt-4o-mini\", messages=[{\"role\": \"user\", \"content\":message}])\n", @@ -80,7 +64,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "4a06c291-2fe6-4669-a8b6-3b67769eb3fa", "metadata": {}, "outputs": [], @@ -106,223 +90,10 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "dd36b141-a252-44a8-8fa4-d4c2c33d3db9", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Fikriraihan · GitHub\n", - "Skip to content\n", - "Navigation Menu\n", - "Toggle navigation\n", - "Sign in\n", - "Appearance settings\n", - "Product\n", - "GitHub Copilot\n", - "Write better code with AI\n", - "GitHub Models\n", - "New\n", - "Manage and compare prompts\n", - "GitHub Advanced Security\n", - "Find and fix vulnerabilities\n", - "Actions\n", - "Automate any workflow\n", - "Codespaces\n", - "Instant dev environments\n", - "Issues\n", - "Plan and track work\n", - "Code Review\n", - "Manage code changes\n", - "Discussions\n", - "Collaborate outside of code\n", - "Code Search\n", - "Find more, search less\n", - "Explore\n", - "Why GitHub\n", - "All features\n", - "Documentation\n", - "GitHub Skills\n", - "Blog\n", - "Solutions\n", - "By company size\n", - "Enterprises\n", - "Small and medium teams\n", - "Startups\n", - "Nonprofits\n", - "By use case\n", - "DevSecOps\n", - "DevOps\n", - "CI/CD\n", - "View all use cases\n", - "By industry\n", - "Healthcare\n", - "Financial services\n", - "Manufacturing\n", - "Government\n", - "View all industries\n", - "View all solutions\n", - "Resources\n", - "Topics\n", - "AI\n", - "DevOps\n", - "Security\n", - "Software Development\n", - "View all\n", - "Explore\n", - "Learning Pathways\n", - "Events & Webinars\n", - "Ebooks & Whitepapers\n", - "Customer Stories\n", - 
"Partners\n", - "Executive Insights\n", - "Open Source\n", - "GitHub Sponsors\n", - "Fund open source developers\n", - "The ReadME Project\n", - "GitHub community articles\n", - "Repositories\n", - "Topics\n", - "Trending\n", - "Collections\n", - "Enterprise\n", - "Enterprise platform\n", - "AI-powered developer platform\n", - "Available add-ons\n", - "GitHub Advanced Security\n", - "Enterprise-grade security features\n", - "Copilot for business\n", - "Enterprise-grade AI features\n", - "Premium Support\n", - "Enterprise-grade 24/7 support\n", - "Pricing\n", - "Search or jump to...\n", - "Search code, repositories, users, issues, pull requests...\n", - "Search\n", - "Clear\n", - "Search syntax tips\n", - "Provide feedback\n", - "We read every piece of feedback, and take your input very seriously.\n", - "Include my email address so I can be contacted\n", - "Cancel\n", - "Submit feedback\n", - "Saved searches\n", - "Use saved searches to filter your results more quickly\n", - "Cancel\n", - "Create saved search\n", - "Sign in\n", - "Sign up\n", - "Appearance settings\n", - "Resetting focus\n", - "You signed in with another tab or window.\n", - "Reload\n", - "to refresh your session.\n", - "You signed out in another tab or window.\n", - "Reload\n", - "to refresh your session.\n", - "You switched accounts on another tab or window.\n", - "Reload\n", - "to refresh your session.\n", - "Dismiss alert\n", - "Fikriraihan\n", - "Follow\n", - "Overview\n", - "Repositories\n", - "34\n", - "Projects\n", - "0\n", - "Packages\n", - "0\n", - "Stars\n", - "0\n", - "More\n", - "Overview\n", - "Repositories\n", - "Projects\n", - "Packages\n", - "Stars\n", - "Fikriraihan\n", - "Follow\n", - "Fikriraihan\n", - "Follow\n", - "Block or Report\n", - "Block or report Fikriraihan\n", - "Report abuse\n", - "Contact GitHub support about this user’s behavior.\n", - " Learn more about\n", - "reporting abuse\n", - ".\n", - "Report abuse\n", - "Overview\n", - "Repositories\n", - "34\n", - 
"Projects\n", - "0\n", - "Packages\n", - "0\n", - "Stars\n", - "0\n", - "More\n", - "Overview\n", - "Repositories\n", - "Projects\n", - "Packages\n", - "Stars\n", - "Pinned\n", - "Loading\n", - "2024-coding-challenge\n", - "2024-coding-challenge\n", - "Public\n", - "Repository for Coding Challenge 2024\n", - "JavaScript\n", - "ChatGPT\n", - "ChatGPT\n", - "Public\n", - "TypeScript\n", - "fikri-3d-portofolio\n", - "fikri-3d-portofolio\n", - "Public\n", - "JavaScript\n", - "nextjs-dashboard\n", - "nextjs-dashboard\n", - "Public\n", - "Nextjs-dashboard course\n", - "TypeScript\n", - "nextjs-postgre\n", - "nextjs-postgre\n", - "Public\n", - "TypeScript\n", - "imaginify\n", - "imaginify\n", - "Public\n", - "TypeScript\n", - "Something went wrong, please refresh the page to try again.\n", - "If the problem persists, check the\n", - "GitHub status page\n", - "or\n", - "contact support\n", - ".\n", - "Uh oh!\n", - "There was an error while loading.\n", - "Please reload this page\n", - ".\n", - "Footer\n", - "© 2025 GitHub, Inc.\n", - "Footer navigation\n", - "Terms\n", - "Privacy\n", - "Security\n", - "Status\n", - "Docs\n", - "Contact\n", - "Manage cookies\n", - "Do not share my personal information\n", - "You can’t perform that action at this time.\n" - ] - } - ], + "outputs": [], "source": [ "github = Website(\"https://github.com/Fikriraihan\")\n", "print(github.title)\n", @@ -331,7 +102,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "id": "ea402ba2-6c7f-4f96-95c0-d68a0e96e644", "metadata": {}, "outputs": [], @@ -366,7 +137,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "id": "a964e8f2-40f4-457b-9c81-7e6e2768f450", "metadata": {}, "outputs": [], @@ -381,321 +152,37 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "id": "026d8ae4-1aea-45b9-b694-db0809527780", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'You are a skilled GitHub profile analyzer. 
Your job is to take the provided GitHub profile or repository URL and generate a clear, structured summary covering these points: 1️⃣ **Profile Summary** - Username - Bio (if available) - Total public repositories - Total followers - Total stars received (sum across repos) - Top programming languages (by repo count) 2️⃣ **Repository Highlights** (top 3 by stars or activity) For each: - Repository name - Description - Primary language - Star count - Last updated date - Notable technologies or frameworks used 3️⃣ **Overall Assessment** - What does this user specialize in? - Are they more focused on personal projects or collaborations? - Any standout strengths or skills you notice? 4️⃣ **Recommendations** - Suggest one area or technology they could explore next to grow. - Suggest one improvement to make their GitHub profile more appealing. Be concise, insightful, and encourage the user’s growth. If some data is missing, state it clearly instead of guessing.'" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "system_prompt" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "id": "2e040916-8d7e-421b-b1a7-56e710940eaa", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "You are looking at a github named Fikriraihan · GitHub\n", - "The contents of this github is as follows; please provide a summary of this website in markdown.Skip to content\n", - "Navigation Menu\n", - "Toggle navigation\n", - "Sign in\n", - "Appearance settings\n", - "Product\n", - "GitHub Copilot\n", - "Write better code with AI\n", - "GitHub Models\n", - "New\n", - "Manage and compare prompts\n", - "GitHub Advanced Security\n", - "Find and fix vulnerabilities\n", - "Actions\n", - "Automate any workflow\n", - "Codespaces\n", - "Instant dev environments\n", - "Issues\n", - "Plan and track work\n", - "Code Review\n", - "Manage code changes\n", - 
"Discussions\n", - "Collaborate outside of code\n", - "Code Search\n", - "Find more, search less\n", - "Explore\n", - "Why GitHub\n", - "All features\n", - "Documentation\n", - "GitHub Skills\n", - "Blog\n", - "Solutions\n", - "By company size\n", - "Enterprises\n", - "Small and medium teams\n", - "Startups\n", - "Nonprofits\n", - "By use case\n", - "DevSecOps\n", - "DevOps\n", - "CI/CD\n", - "View all use cases\n", - "By industry\n", - "Healthcare\n", - "Financial services\n", - "Manufacturing\n", - "Government\n", - "View all industries\n", - "View all solutions\n", - "Resources\n", - "Topics\n", - "AI\n", - "DevOps\n", - "Security\n", - "Software Development\n", - "View all\n", - "Explore\n", - "Learning Pathways\n", - "Events & Webinars\n", - "Ebooks & Whitepapers\n", - "Customer Stories\n", - "Partners\n", - "Executive Insights\n", - "Open Source\n", - "GitHub Sponsors\n", - "Fund open source developers\n", - "The ReadME Project\n", - "GitHub community articles\n", - "Repositories\n", - "Topics\n", - "Trending\n", - "Collections\n", - "Enterprise\n", - "Enterprise platform\n", - "AI-powered developer platform\n", - "Available add-ons\n", - "GitHub Advanced Security\n", - "Enterprise-grade security features\n", - "Copilot for business\n", - "Enterprise-grade AI features\n", - "Premium Support\n", - "Enterprise-grade 24/7 support\n", - "Pricing\n", - "Search or jump to...\n", - "Search code, repositories, users, issues, pull requests...\n", - "Search\n", - "Clear\n", - "Search syntax tips\n", - "Provide feedback\n", - "We read every piece of feedback, and take your input very seriously.\n", - "Include my email address so I can be contacted\n", - "Cancel\n", - "Submit feedback\n", - "Saved searches\n", - "Use saved searches to filter your results more quickly\n", - "Cancel\n", - "Create saved search\n", - "Sign in\n", - "Sign up\n", - "Appearance settings\n", - "Resetting focus\n", - "You signed in with another tab or window.\n", - "Reload\n", - "to refresh your 
session.\n", - "You signed out in another tab or window.\n", - "Reload\n", - "to refresh your session.\n", - "You switched accounts on another tab or window.\n", - "Reload\n", - "to refresh your session.\n", - "Dismiss alert\n", - "Fikriraihan\n", - "Follow\n", - "Overview\n", - "Repositories\n", - "34\n", - "Projects\n", - "0\n", - "Packages\n", - "0\n", - "Stars\n", - "0\n", - "More\n", - "Overview\n", - "Repositories\n", - "Projects\n", - "Packages\n", - "Stars\n", - "Fikriraihan\n", - "Follow\n", - "Fikriraihan\n", - "Follow\n", - "Block or Report\n", - "Block or report Fikriraihan\n", - "Report abuse\n", - "Contact GitHub support about this user’s behavior.\n", - " Learn more about\n", - "reporting abuse\n", - ".\n", - "Report abuse\n", - "Overview\n", - "Repositories\n", - "34\n", - "Projects\n", - "0\n", - "Packages\n", - "0\n", - "Stars\n", - "0\n", - "More\n", - "Overview\n", - "Repositories\n", - "Projects\n", - "Packages\n", - "Stars\n", - "Pinned\n", - "Loading\n", - "2024-coding-challenge\n", - "2024-coding-challenge\n", - "Public\n", - "Repository for Coding Challenge 2024\n", - "JavaScript\n", - "ChatGPT\n", - "ChatGPT\n", - "Public\n", - "TypeScript\n", - "fikri-3d-portofolio\n", - "fikri-3d-portofolio\n", - "Public\n", - "JavaScript\n", - "nextjs-dashboard\n", - "nextjs-dashboard\n", - "Public\n", - "Nextjs-dashboard course\n", - "TypeScript\n", - "nextjs-postgre\n", - "nextjs-postgre\n", - "Public\n", - "TypeScript\n", - "imaginify\n", - "imaginify\n", - "Public\n", - "TypeScript\n", - "Something went wrong, please refresh the page to try again.\n", - "If the problem persists, check the\n", - "GitHub status page\n", - "or\n", - "contact support\n", - ".\n", - "Uh oh!\n", - "There was an error while loading.\n", - "Please reload this page\n", - ".\n", - "Footer\n", - "© 2025 GitHub, Inc.\n", - "Footer navigation\n", - "Terms\n", - "Privacy\n", - "Security\n", - "Status\n", - "Docs\n", - "Contact\n", - "Manage cookies\n", - "Do not share my personal 
information\n", - "You can’t perform that action at this time.\n" - ] - } - ], + "outputs": [], "source": [ "print(user_prompt_for(github))" ] }, { "cell_type": "code", - "execution_count": 18, - "id": "da2a2c62-0ff4-4e4b-a1a1-774b47f848a0", - "metadata": {}, - "outputs": [], - "source": [ - "messages = [\n", - " {\"role\": \"system\", \"content\": \"You are a snarky assistant\"},\n", - " {\"role\": \"user\", \"content\": \"tell me a fruit that has red color\"}\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "204b9b40-cfd9-46f4-a954-efee75fc3d79", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Oh, I don’t know, how about the classic red apple? Or maybe you were hoping for something more exotic, like a blood orange? There’s also the ever-popular strawberry. The options are endless! What’s next, a fruit quiz?\n" - ] - } - ], - "source": [ - "response = openai.chat.completions.create(model=\"gpt-4o-mini\", messages=messages)\n", - "print(response.choices[0].message.content)" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "0a4a376a-8c20-4fd3-91ad-25511df76292", - "metadata": {}, - "outputs": [], - "source": [ - "def messages_for(website):\n", - " return [\n", - " {\"role\": \"system\", \"content\": system_prompt},\n", - " {\"role\": \"user\", \"content\": user_prompt_for(website)}\n", - " ]" - ] - }, - { - "cell_type": "code", - "execution_count": 23, + "execution_count": null, "id": "11bc74b0-7ca7-40da-81cc-84b2dd04780b", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'role': 'system',\n", - " 'content': 'You are a skilled GitHub profile analyzer. 
Your job is to take the provided GitHub profile or repository URL and generate a clear, structured summary covering these points: 1️⃣ **Profile Summary** - Username - Bio (if available) - Total public repositories - Total followers - Total stars received (sum across repos) - Top programming languages (by repo count) 2️⃣ **Repository Highlights** (top 3 by stars or activity) For each: - Repository name - Description - Primary language - Star count - Last updated date - Notable technologies or frameworks used 3️⃣ **Overall Assessment** - What does this user specialize in? - Are they more focused on personal projects or collaborations? - Any standout strengths or skills you notice? 4️⃣ **Recommendations** - Suggest one area or technology they could explore next to grow. - Suggest one improvement to make their GitHub profile more appealing. Be concise, insightful, and encourage the user’s growth. If some data is missing, state it clearly instead of guessing.'},\n", - " {'role': 'user',\n", - " 'content': 'You are looking at a github named Fikriraihan · GitHub\\nThe contents of this github is as follows; please provide a summary of this website in markdown.Skip to content\\nNavigation Menu\\nToggle navigation\\nSign in\\nAppearance settings\\nProduct\\nGitHub Copilot\\nWrite better code with AI\\nGitHub Models\\nNew\\nManage and compare prompts\\nGitHub Advanced Security\\nFind and fix vulnerabilities\\nActions\\nAutomate any workflow\\nCodespaces\\nInstant dev environments\\nIssues\\nPlan and track work\\nCode Review\\nManage code changes\\nDiscussions\\nCollaborate outside of code\\nCode Search\\nFind more, search less\\nExplore\\nWhy GitHub\\nAll features\\nDocumentation\\nGitHub Skills\\nBlog\\nSolutions\\nBy company size\\nEnterprises\\nSmall and medium teams\\nStartups\\nNonprofits\\nBy use case\\nDevSecOps\\nDevOps\\nCI/CD\\nView all use cases\\nBy industry\\nHealthcare\\nFinancial services\\nManufacturing\\nGovernment\\nView all industries\\nView all 
solutions\\nResources\\nTopics\\nAI\\nDevOps\\nSecurity\\nSoftware Development\\nView all\\nExplore\\nLearning Pathways\\nEvents & Webinars\\nEbooks & Whitepapers\\nCustomer Stories\\nPartners\\nExecutive Insights\\nOpen Source\\nGitHub Sponsors\\nFund open source developers\\nThe ReadME Project\\nGitHub community articles\\nRepositories\\nTopics\\nTrending\\nCollections\\nEnterprise\\nEnterprise platform\\nAI-powered developer platform\\nAvailable add-ons\\nGitHub Advanced Security\\nEnterprise-grade security features\\nCopilot for business\\nEnterprise-grade AI features\\nPremium Support\\nEnterprise-grade 24/7 support\\nPricing\\nSearch or jump to...\\nSearch code, repositories, users, issues, pull requests...\\nSearch\\nClear\\nSearch syntax tips\\nProvide feedback\\nWe read every piece of feedback, and take your input very seriously.\\nInclude my email address so I can be contacted\\nCancel\\nSubmit feedback\\nSaved searches\\nUse saved searches to filter your results more quickly\\nCancel\\nCreate saved search\\nSign in\\nSign up\\nAppearance settings\\nResetting focus\\nYou signed in with another tab or window.\\nReload\\nto refresh your session.\\nYou signed out in another tab or window.\\nReload\\nto refresh your session.\\nYou switched accounts on another tab or window.\\nReload\\nto refresh your session.\\nDismiss alert\\nFikriraihan\\nFollow\\nOverview\\nRepositories\\n34\\nProjects\\n0\\nPackages\\n0\\nStars\\n0\\nMore\\nOverview\\nRepositories\\nProjects\\nPackages\\nStars\\nFikriraihan\\nFollow\\nFikriraihan\\nFollow\\nBlock or Report\\nBlock or report Fikriraihan\\nReport abuse\\nContact GitHub support about this user’s behavior.\\n Learn more about\\nreporting abuse\\n.\\nReport abuse\\nOverview\\nRepositories\\n34\\nProjects\\n0\\nPackages\\n0\\nStars\\n0\\nMore\\nOverview\\nRepositories\\nProjects\\nPackages\\nStars\\nPinned\\nLoading\\n2024-coding-challenge\\n2024-coding-challenge\\nPublic\\nRepository for Coding Challenge 
2024\\nJavaScript\\nChatGPT\\nChatGPT\\nPublic\\nTypeScript\\nfikri-3d-portofolio\\nfikri-3d-portofolio\\nPublic\\nJavaScript\\nnextjs-dashboard\\nnextjs-dashboard\\nPublic\\nNextjs-dashboard course\\nTypeScript\\nnextjs-postgre\\nnextjs-postgre\\nPublic\\nTypeScript\\nimaginify\\nimaginify\\nPublic\\nTypeScript\\nSomething went wrong, please refresh the page to try again.\\nIf the problem persists, check the\\nGitHub status page\\nor\\ncontact support\\n.\\nUh oh!\\nThere was an error while loading.\\nPlease reload this page\\n.\\nFooter\\n© 2025 GitHub,\\xa0Inc.\\nFooter navigation\\nTerms\\nPrivacy\\nSecurity\\nStatus\\nDocs\\nContact\\nManage cookies\\nDo not share my personal information\\nYou can’t perform that action at this time.'}]" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "messages_for(github)" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "id": "e64f497f-3742-4d70-9e15-29d1974b3361", "metadata": {}, "outputs": [], @@ -711,28 +198,17 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "id": "95d0938d-0b26-4253-94a6-ac9240e7a8c9", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'# GitHub Profile Summary for Fikriraihan\\n\\n### 1️⃣ Profile Summary\\n- **Username:** Fikriraihan\\n- **Bio:** (No bio available)\\n- **Total public repositories:** 34\\n- **Total followers:** 0 (not indicated)\\n- **Total stars received:** 0\\n- **Top programming languages (by repo count):**\\n - JavaScript\\n - TypeScript\\n\\n### 2️⃣ Repository Highlights\\n**Top 3 repositories by activity:**\\n\\n1. **Repository Name:** 2024-coding-challenge\\n - **Description:** Repository for Coding Challenge 2024\\n - **Primary Language:** JavaScript\\n - **Star Count:** 0\\n - **Last Updated Date:** (Not available)\\n - **Notable Technologies or Frameworks Used:** JavaScript\\n\\n2. 
**Repository Name:** ChatGPT\\n - **Description:** (No description provided)\\n - **Primary Language:** TypeScript\\n - **Star Count:** 0\\n - **Last Updated Date:** (Not available)\\n - **Notable Technologies or Frameworks Used:** TypeScript\\n\\n3. **Repository Name:** fikri-3d-portofolio\\n - **Description:** (No description provided)\\n - **Primary Language:** JavaScript\\n - **Star Count:** 0\\n - **Last Updated Date:** (Not available)\\n - **Notable Technologies or Frameworks Used:** JavaScript\\n\\n### 3️⃣ Overall Assessment\\n- **What does this user specialize in?** \\n - Based on the repository languages, Fikriraihan appears to specialize in JavaScript and TypeScript.\\n\\n- **Are they more focused on personal projects or collaborations?** \\n - The profile indicates a focus on personal projects given the lack of followers and collaborations apparent from the repositories.\\n\\n- **Any standout strengths or skills you notice?** \\n - The presence of JavaScript and TypeScript projects suggests proficiency in web development, specifically in relation to modern frameworks.\\n\\n### 4️⃣ Recommendations\\n- **One area or technology to explore next to grow:**\\n - Fikriraihan could benefit from exploring backend technologies, such as Node.js or Express, to complement their front-end skills with JavaScript/TypeScript.\\n\\n- **One improvement to make their GitHub profile more appealing:**\\n - Adding a bio and descriptions for each repository would help provide context and showcase their intent and the purpose behind each project, thereby attracting more engagement and potential collaborators.'" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "summarize(\"https://github.com/Fikriraihan\")" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "id": "cd863db3-731a-46d8-ac14-f74f8ae39bd4", "metadata": {}, "outputs": [], @@ -744,66 +220,10 @@ }, { "cell_type": 
"code", - "execution_count": 29, + "execution_count": null, "id": "70c5c3aa-2c06-460b-9c4f-6465d2c8611c", "metadata": {}, - "outputs": [ - { - "data": { - "text/markdown": [ - "# GitHub Profile Summary for Fikriraihan\n", - "\n", - "### 1️⃣ Profile Summary\n", - "- **Username:** Fikriraihan\n", - "- **Bio:** Not available\n", - "- **Total public repositories:** 34\n", - "- **Total followers:** Not available\n", - "- **Total stars received:** 0\n", - "- **Top programming languages (by repo count):**\n", - " - JavaScript\n", - " - TypeScript\n", - "\n", - "### 2️⃣ Repository Highlights\n", - "Here are the top repositories based on their details:\n", - "\n", - "1. **Repository Name:** 2024-coding-challenge\n", - " - **Description:** Repository for Coding Challenge 2024\n", - " - **Primary Language:** JavaScript\n", - " - **Star Count:** 0\n", - " - **Last Updated Date:** Not available\n", - " - **Notable Technologies/Frameworks Used:** None specified\n", - "\n", - "2. **Repository Name:** ChatGPT\n", - " - **Description:** Not available\n", - " - **Primary Language:** TypeScript\n", - " - **Star Count:** 0\n", - " - **Last Updated Date:** Not available\n", - " - **Notable Technologies/Frameworks Used:** None specified\n", - "\n", - "3. **Repository Name:** fikri-3d-portofolio\n", - " - **Description:** Not available\n", - " - **Primary Language:** JavaScript\n", - " - **Star Count:** 0\n", - " - **Last Updated Date:** Not available\n", - " - **Notable Technologies/Frameworks Used:** None specified\n", - "\n", - "### 3️⃣ Overall Assessment\n", - "- **What does this user specialize in?** Fikriraihan specializes in JavaScript and TypeScript, indicating a focus on web development or applications that utilize these languages.\n", - "- **Are they more focused on personal projects or collaborations?** The presence of multiple repositories suggests a mix of personal projects. 
There is no indication of collaboration, as there are no mentions of contributions to external repositories.\n", - "- **Any standout strengths or skills you notice?** The variety of repositories shows an interest in different coding challenges and portfolio projects. However, the lack of stars suggests that the projects may not yet attract a significant audience.\n", - "\n", - "### 4️⃣ Recommendations\n", - "- **Suggest one area or technology they could explore next to grow:** Given the user’s focus on JavaScript and TypeScript, exploring frameworks like React, Vue.js, or even server-side technologies such as Node.js could be beneficial.\n", - "- **Suggest one improvement to make their GitHub profile more appealing:** Adding a bio with a brief introduction and interests, along with project descriptions, would provide more context about the user and enhance engagement with their repositories. Additionally, increasing the visibility of the repositories through more optimization and possibly sharing or collaborating on projects could attract more stars and followers." 
- ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "display_summary(\"https://github.com/Fikriraihan\")" ] From 33205380b621900f1c265f5064404d0004d33c8e Mon Sep 17 00:00:00 2001 From: Ritchy Date: Sun, 25 May 2025 17:11:12 +0100 Subject: [PATCH 05/23] AI stock adviser webscraping notebook --- .../day1- stock adviser webscrap.ipynb | 354 ++++++++++++++++++ 1 file changed, 354 insertions(+) create mode 100644 week1/community-contributions/day1- stock adviser webscrap.ipynb diff --git a/week1/community-contributions/day1- stock adviser webscrap.ipynb b/week1/community-contributions/day1- stock adviser webscrap.ipynb new file mode 100644 index 0000000..4872d5d --- /dev/null +++ b/week1/community-contributions/day1- stock adviser webscrap.ipynb @@ -0,0 +1,354 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "2e40e4f0-4f65-4f68-be50-07401959f46e", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import requests\n", + "from dotenv import load_dotenv\n", + "from bs4 import BeautifulSoup\n", + "from IPython.display import Markdown, display\n", + "from openai import OpenAI" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "fea8f921-7f2f-4942-9f88-cb6eb64ea731", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "API key found and looks good so far!\n" + ] + } + ], + "source": [ + "# Load environment variables in a file called .env\n", + "\n", + "load_dotenv()\n", + "api_key = os.getenv('OPENAI_API_KEY')\n", + "\n", + "# Check the key\n", + "\n", + "if not api_key:\n", + " print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n", + "elif not api_key.startswith(\"sk-proj-\"):\n", + " print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook\")\n", + "elif 
api_key.strip() != api_key:\n", + " print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n", + "else:\n", + " print(\"API key found and looks good so far!\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "8d90ba3b-e50e-4a7d-820f-e669ea3679ff", + "metadata": {}, + "outputs": [], + "source": [ + "#call open AI\n", + "openai = OpenAI()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "046a59c6-56f5-4a09-89bd-8163075ad643", + "metadata": {}, + "outputs": [], + "source": [ + "# A class to represent a Webpage\n", + "\n", + "headers = {\n", + " \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n", + "}\n", + "class Website:\n", + " def __init__(self, url):\n", + " \"\"\"\n", + " Create this Website object for a Finance latest news\n", + " \"\"\"\n", + " self.url = url\n", + " response = requests.get(url, headers=headers)\n", + " soup = BeautifulSoup(response.content, 'html.parser')\n", + " \n", + " self.title = soup.title.string if soup.title else \"No title found\"\n", + " \n", + " # Find news headlines and content \n", + " news_data = []\n", + " \n", + " # Try different selectors \n", + " news_items = soup.find_all('h3') + soup.find_all('h2')\n", + " \n", + " for item in news_items:\n", + " headline = item.get_text(strip=True)\n", + " if headline and len(headline) > 20: # Filter out short/empty text\n", + " # Try to find content near the headline\n", + " content = \"\"\n", + " parent = item.find_parent()\n", + " if parent:\n", + " # Look for paragraph or summary text\n", + " summary = parent.find('p')\n", + " if summary:\n", + " content = summary.get_text(strip=True)[:300] + \"...\"\n", + " \n", + " news_data.append({'headline': headline, 'content': content})\n", + " \n", + " # Create the text content\n", + " self.text = \"Latest financial 
news headlines:\\n\\n\"\n", + " \n", + " # Get top 5 headlines with content\n", + " for i, news in enumerate(news_data[:10], 1):\n", + " self.text += f\"{i}. {news['headline']}\\n\"\n", + " if news['content']:\n", + " self.text += f\" Summary: {news['content']}\\n\"\n", + " self.text += \"\\n\"\n", + " \n", + " if not news_data:\n", + " self.text = \"No headlines found. Yahoo Finance structure may have changed.\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "b5b1c72e-bc74-4ed0-9a64-795ca9bac74d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Title: Yahoo Finance - Stock Market Live, Quotes, Business & Finance News\n", + "Top News:\n", + "Latest financial news headlines:\n", + "\n", + "1. US Risks Losing ‘Reliable Investment’ Status, Allianz GI Manager Says\n", + " Summary: (Bloomberg) -- Inside one of Europe’s biggest asset managers, there’s growing concern that Republican efforts to gut legislation supporting key industries such as clean energy may result in the US losing its status as a destination for investor capital.Most Read from BloombergNY Private School Plead...\n", + "\n", + "2. Why Intempus thinks robots should have a human physiological state\n", + " Summary: Teddy Warner, 19, has always been interested in robotics. His family was in the industry, and he says he \"grew up\" working in a machinist shop while in high school. Now Warner is building a robotics company of his own, Intempus, that looks to make robots a bit more human. Intempus is building tech t...\n", + "\n", + "3. Last 24 hours: TechCrunch Disrupt 2025 Early Bird Deals will fly away after today\n", + " Summary: Just 24 hours left to lock in Early Bird pricing for TechCrunch Disrupt 2025 — happening October 27–29 at Moscone West in San Francisco. Save up to $900 on your pass, or bring someone brilliant with you for 90% off their ticket. This deal ends tonight at 11:59 p.m. PT. 
Grab your Early Bird discount ...\n", + "\n", + "4. 48 hours left: What you won’t want to miss at the 20th TechCrunch Disrupt in October\n", + " Summary: ​​There are just 48 hours left to save up to $900 on your ticket to TechCrunch Disrupt 2025 — and get 90% off the second. After May 25 at 11:59 p.m. PT, Early Bird pricing vanishes — along with your best chance to join 10,000 of tech’s most forward-thinking minds for less. But forget the math for a ...\n", + "\n", + "5. More than a third of Americans say they want an 'adventurous retirement'\n", + " Summary: Retirement is no longer just about rocking chairs, gardening, grandchildren, or afternoons on the golf course....\n", + "\n", + "6. 'Unsustainable fiscal situation': Wall Street braces for more bond market turmoil as Trump tax bill stirs up deficit concerns\n", + " Summary: Surging Treasury yields signal deepening market fears as Trump's tax plan, soaring deficits, and global fiscal turmoil shake investor confidence....\n", + "\n", + "7. Nvidia has lost its shock power to investors, for now\n", + " Summary: Nvidia's quarter may be tougher than normal to assess. Here's why....\n", + "\n", + "8. Nvidia earnings, Trump tariff updates, and the Fed's preferred inflation gauge: What to know this week\n", + " Summary: A quarterly earnings release from Nvidia is set to greet investors in the week ahead as the stock market rally has hit pause....\n", + "\n", + "9. This week in Trumponomics: Bonds spoil the party\n", + " Summary: Trump is heading toward an important victory on tax cuts. Instead of cheering, markets are fretting....\n", + "\n", + "10. 
Manufacturers could benefit from Trump's 'big, beautiful' bill depending on what they make\n", + " Summary: Advocates for the manufacturing sector have hailed the advancement of Trump's \"big, beautiful bill,\" but at least two provisions in the 1,000-plus-page package could cut that ebullience for some factory owners....\n", + "\n", + "\n" + ] + } + ], + "source": [ + "website = Website(\"https://finance.yahoo.com/topic/latest-news/\")\n", + "\n", + "print(\"Title:\", website.title)\n", + "print(\"Top News:\")\n", + "print(website.text)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "2c0ac856-b0d8-4b15-8092-71ab3952a0d9", + "metadata": {}, + "outputs": [], + "source": [ + "# Define our system prompt\n", + "system_prompt = \"\"\"You are a veteran stock market and finance expert with 50+ years of experience helping investors make safe, steady gains. Your audience is beginners with small amounts to invest (around $100). \n", + "\n", + "**Response Format:**\n", + "1. Start with \"The News Snapshot:\" - Write 3-4 lines summarizing the key financial developments from the provided headlines and summaries, showing you understand the current market situation, start the write up for this with today in the news we see that...\n", + "\n", + "2. Give specific stock advice based on the news:\n", + " - What to avoid and why\n", + " - 2-3 specific stock recommendations with ticker symbols\n", + " - Focus only on safe, dividend-paying stocks or clear beneficiaries from the news\n", + "\n", + "3. End with \"The big picture:\" - One sentence explaining the overall market condition\n", + "\n", + "4. 
Close with \"Your game plan:\" - Simple, actionable advice for their $100 to show how to split it\n", + "\n", + "**Tone & Style:**\n", + "- Talk like a knowledgeable but friendly Wall Street professional advising a beginner\n", + "- Keep it under 200 words total\n", + "- Use simple language, no complex jargon\n", + "- Be direct and practical\n", + "- Focus on capital preservation over quick gains\n", + "- Always relate advice directly to the news headlines provided\n", + "\n", + "**Key Rules:**\n", + "- Only recommend established, safe stocks\n", + "- Always explain WHY based on the news\n", + "- No speculative or meme stocks\n", + "- Emphasize learning over quick profits\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "077acf13-6e37-488f-a7c7-5f301266f57f", + "metadata": {}, + "outputs": [], + "source": [ + "# A function that writes a User Prompt that asks for summaries of websites:\n", + "\n", + "def user_prompt_for(website):\n", + " user_prompt = f\"You are looking at a website titled {website.title}\"\n", + " user_prompt += \"\\nThe contents of this website is as follows; \\\n", + "please provide a provide your investment advice for a beginner with $100. 
\\\n", + "Because it includes finance news or trend, let the advice be based on these too.\\n\\n\"\n", + " user_prompt += website.text\n", + " return user_prompt" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "1c129909-769c-49f0-a84d-85a25972463b", + "metadata": {}, + "outputs": [], + "source": [ + "def messages_for(website):\n", + " return [\n", + " {\"role\": \"system\", \"content\": system_prompt},\n", + " {\"role\": \"user\", \"content\": user_prompt_for(website)}\n", + " ]" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "2c9f998f-639f-451b-a67e-5a95978ab70d", + "metadata": {}, + "outputs": [], + "source": [ + "def get_advice(url):\n", + " website = Website(url)\n", + " response = openai.chat.completions.create(\n", + " model = \"gpt-4o-mini\",\n", + " messages = messages_for(website)\n", + " )\n", + " return response.choices[0].message.content" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "402b4bb4-fbf4-4930-9cd1-4ede22491fa2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'**The News Snapshot:** Recent headlines reveal rising treasury yields and concerns over the US losing its \"reliable investment\" status, stoking fears of market uncertainty. 
Amidst this backdrop, investors may want to focus on stable, dividend-paying stocks that can weather the storm and provide consistent returns.\\n\\n**Stock Advice:**\\n- **Avoid speculative tech stocks** like Nvidia, which has recently shown volatility and uncertainty in earnings, leading to a potential loss of investor confidence.\\n- **Recommendation #1: Johnson & Johnson (JNJ)** – A well-established healthcare company that pays a reliable dividend, making it a safe bet in uncertain times.\\n- **Recommendation #2: Procter & Gamble (PG)** – Known for its strong brand portfolio and consistent dividend payouts, PG offers stability and resilience against market fluctuations.\\n- **Recommendation #3: Coca-Cola (KO)** – With a history of dividend increases, Coca-Cola remains a staple in many portfolios, providing that defensive position investors need right now.\\n\\n**The big picture:** The market is showing signs of concern, and investors should prioritize capital preservation over chasing quick returns.\\n\\n**Your game plan:** With your $100, consider investing in fractional shares of JNJ, PG, or KO to benefit from their dividends and stability while learning about long-term investing principles.'" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_advice(\"https://finance.yahoo.com/topic/latest-news/\")" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "0427753f-6b47-4c36-b68f-0f22abd8a7cd", + "metadata": {}, + "outputs": [], + "source": [ + "def display_fin_advice(url):\n", + " advice_content = get_advice(url) \n", + " display(Markdown(advice_content))" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "1d26e64f-fdd0-4492-9b20-a54847b11139", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "The News Snapshot: Today in the news, we see that concerns are rising around the US potentially losing its appeal as a reliable investment destination 
due to political actions, particularly in clean energy. Rising Treasury yields and fiscal uncertainty, stemming from tax policies, are causing unease in the markets. Generally, investors are on alert due to potential repercussions for sectors reliant on government support and tax reform.\n", + "\n", + "Specific Stock Advice:\n", + "- I advise avoiding high-growth tech stocks like **Nvidia (NVDA)** for now, as their recent earnings show volatility and uncertainty. \n", + "- Instead, consider established dividend-paying stocks like **Johnson & Johnson (JNJ)** and **Procter & Gamble (PG)**. Both companies are less sensitive to political changes and provide steady dividends, making them safer bets during turbulent times.\n", + "- Another option is **3M Company (MMM)**, which has a strong history of dividend payments and benefits from potential manufacturing boosts tied to new legislation.\n", + "\n", + "The big picture: The market is navigating through uncertainties, particularly around fiscal policy and investment confidence.\n", + "\n", + "Your game plan: Split your $100 into three parts: $40 in Johnson & Johnson, $40 in Procter & Gamble, and keep $20 in cash for future opportunities or to cover transaction fees. This balanced approach aims for safety and steady growth." 
+ ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display_fin_advice(\"https://finance.yahoo.com/topic/latest-news/\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7567571d-b4c7-41be-9fd0-d65ae533a252", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 2d6ec3eed7b132b5c47375fcfc277f30814d4f7f Mon Sep 17 00:00:00 2001 From: shikhidvaja Date: Sun, 25 May 2025 21:48:55 +0530 Subject: [PATCH 06/23] job recommendation based on the resume content. Suggests the suitable job role to apply for and provide the links from job sites based on location and sites user wants to search for. --- .../resume_based_job_recommender.py | 104 ++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 week1/community-contributions/resume_based_job_recommender.py diff --git a/week1/community-contributions/resume_based_job_recommender.py b/week1/community-contributions/resume_based_job_recommender.py new file mode 100644 index 0000000..7daef4f --- /dev/null +++ b/week1/community-contributions/resume_based_job_recommender.py @@ -0,0 +1,104 @@ +from openai import OpenAI +from dotenv import load_dotenv +import os +import pypdf + +class ResumeBasedJobRecommendation: + def __init__(self, path: str): + self.resume_path = path + + # method to read the content from the resume and use it for the user prompt + def read_resume(self): + """method to read the content from the resume and use it for the user prompt. 
+ + Returns: + content (str): returns the content of the resume. + """ + try: + pdfreader = pypdf.PdfReader(self.resume_path) + data = "" + for page_number in range(pdfreader.get_num_pages()): + page = pdfreader.pages[page_number] + data += page.extract_text() + except FileNotFoundError as e: + print(f"Issue with the resume file path: {str(e)}") + return + except Exception as e: + print(f"Couldn't able to parse the pdf : {str(e)}") + return + return data + + # + def message_prompt(self, data: str, job_sites: list, location: str): + """method suggests the appropriate job roles and provides the search link from job sites based on users input of resume data, job boards and location. + + Args: + data (str): resume content for user prompt + job_sites (list): job searching sites for user prompt + location (str): location of job search + + Returns: + content (str): Provides summary of resume with suggested job roles and links using gpt 4.o model. + """ + self.message = [ + {"role": "system", + "content": "You are an assistant that analysizes the resume data and summarize it. \ + Based on the summarization, you suggest the appropriate job roles \ + and provide the appropriate job search links for each suggested roles from the job sites based on filtering by the \ + location provided. " + }, + { + "role": "user", + "content": f"Below is my resume content, kindly look for the appropriate job openings in \ + {job_sites} for location {location}:\n{data}" + }] + self.response = openai.chat.completions.create(model='gpt-4o-mini', messages=self.message) + return self.response.choices[0].message.content + + +if __name__ == '__main__': + # load the api key from .env and check if it is valid. + load_dotenv() + + api_key = os.getenv('OPENAI_API_KEY') + + if api_key is None: + print("No api key was found.") + exit() + elif not api_key.startswith('sk-proj-'): + print("api key is present but it is not matching with the openai api key pattern starting with sk-proj-. 
Please check it.") + exit() + elif api_key.strip() != api_key: + print("api key is good but it seems it has the spaces at starting or the end. Please check and remove it.") + exit() + else: + print("api key is found and it looks good.") + + openai = OpenAI() + + #Provide the valid resume path + file_path = input("Kindly enter the resume path:\n") + if not file_path: + print("Resume path is not provided. Kindly provide the valid path.") + exit() + + obj = ResumeBasedJobRecommendation(file_path) + data = obj.read_resume() + + if not data: + pass + else: + #provide the input for the job sites to search and valid job location + job_sites = input("Enter the job sites with space between each other: ") + if not job_sites: + print("Didn't provided the job sites to search for. Going with Linkedin, Indeed, Glassdoor and Naukri as defaults.") + job_sites = ['LinkedIn', 'Indeed', 'Naukri', 'Glassdoor'] + else: + job_sites = job_sites.split(' ') + location = input("Enter the job location:") + if not location: + print("No location has been provided. 
Default will consider as United States.") + location = 'United States' + + response = obj.message_prompt(data, job_sites, location) + print(response) From 7fc4aa97539dd8e298b43289753b84c94dd16e4d Mon Sep 17 00:00:00 2001 From: Jack McInerney Date: Mon, 26 May 2025 17:30:08 -0700 Subject: [PATCH 07/23] Add my notebook on adverserial chatting to community-contributions --- .../MyAdverserialChat.ipynb | 330 ++++++++++++++++++ 1 file changed, 330 insertions(+) create mode 100644 community-contributions/mcinerney-adverserial/MyAdverserialChat.ipynb diff --git a/community-contributions/mcinerney-adverserial/MyAdverserialChat.ipynb b/community-contributions/mcinerney-adverserial/MyAdverserialChat.ipynb new file mode 100644 index 0000000..1424900 --- /dev/null +++ b/community-contributions/mcinerney-adverserial/MyAdverserialChat.ipynb @@ -0,0 +1,330 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "70a27b7c-3f3c-4d82-bdea-381939ce98bd", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "# My Adversarial Conversation\n", + "J. McInerney, 26 May 2025\n", + "I am taking some cells from the Week2, Day 1 notebook and modifying them so I can have an adversarial conversation between OpenAI and a local LLM (gemma3:12b). First I will just reimplement what Ed did in the Week2, Day 1 notebook. Then I will try a deeper conversation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3ec14834-4cf2-4f1d-9128-4ddad7b91804", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# imports\n", + "\n", + "import os\n", + "from dotenv import load_dotenv\n", + "from openai import OpenAI\n", + "#import anthropic\n", + "from IPython.display import Markdown, display, update_display" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98618ab4-075f-438c-b85b-d146e5299a87", + "metadata": {}, + "outputs": [], + "source": [ + "# Load environment variables in a file called .env\n", + "# Print the key prefixes to help with any debugging\n", + "\n", + "load_dotenv(override=True)\n", + "openai_api_key = os.getenv('OPENAI_API_KEY')\n", + "\n", + "if openai_api_key:\n", + " print(f\"OpenAI API Key exists and begins {openai_api_key[:8]}\")\n", + "else:\n", + " print(\"OpenAI API Key not set\")\n", + " \n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "95e69172-4601-4eb0-a7af-19abebd4bf56", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Connect to OpenAI, Anthropic\n", + "openai = OpenAI()" + ] + }, + { + "cell_type": "markdown", + "id": "98f47886-71ae-4b41-875a-1b97a5eb0ddc", + "metadata": {}, + "source": [ + "## An adversarial conversation between Chatbots..\n", + "\n", + "You're already familiar with prompts being organized into lists like:\n", + "\n", + "```\n", + "[\n", + " {\"role\": \"system\", \"content\": \"system message here\"},\n", + " {\"role\": \"user\", \"content\": \"user prompt here\"}\n", + "]\n", + "```\n", + "\n", + "In fact this structure can be used to reflect a longer conversation history:\n", + "\n", + "```\n", + "[\n", + " {\"role\": \"system\", \"content\": \"system message here\"},\n", + " {\"role\": \"user\", \"content\": \"first user prompt here\"},\n", + " {\"role\": 
\"assistant\", \"content\": \"the assistant's response\"},\n", + " {\"role\": \"user\", \"content\": \"the new user prompt\"},\n", + "]\n", + "```\n", + "\n", + "And we can use this approach to engage in a longer interaction with history." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "74125f8b-042e-4236-ad3d-6371ce5a1493", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Let's make a conversation between GPT-4o-mini and Gemma3:12b\n", + "# We're using cheap versions of models so the costs will be minimal\n", + "\n", + "gpt_model = \"gpt-4o-mini\"\n", + "local_model = 'gemma3:12b'\n", + "\n", + "gpt_system = \"You are a chatbot who is very argumentative; \\\n", + "you disagree with anything in the conversation and you challenge everything, in a snarky way.\"\n", + "\n", + "local_system = \"You are a very polite, courteous chatbot. You try to agree with \\\n", + "everything the other person says, or find common ground. 
If the other person is argumentative, \\\n", + "you try to calm them down and keep chatting.\"\n", + "\n", + "gpt_messages = [\"Hi there\"]\n", + "local_messages = [\"Hi\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f94d9232-f82a-4eab-9d89-bd9815f260f0", + "metadata": {}, + "outputs": [], + "source": [ + "def call_gpt():\n", + " messages = [{\"role\": \"system\", \"content\": gpt_system}]\n", + " for gpt, local in zip(gpt_messages, local_messages):\n", + " messages.append({\"role\": \"assistant\", \"content\": gpt})\n", + " messages.append({\"role\": \"user\", \"content\": local})\n", + " completion = openai.chat.completions.create(\n", + " model=gpt_model,\n", + " messages=messages\n", + " )\n", + " return completion.choices[0].message.content" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6445453-31be-4c63-b350-957b7d99b6f4", + "metadata": {}, + "outputs": [], + "source": [ + "call_gpt()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc51f776-f6e2-41af-acb5-cbdf03fdf530", + "metadata": {}, + "outputs": [], + "source": [ + "basellm = OpenAI(base_url='http://localhost:11434/v1', api_key='ollama')\n", + "def call_local():\n", + " messages = []\n", + " for gpt, local_message in zip(gpt_messages, local_messages):\n", + " messages.append({\"role\": \"user\", \"content\": gpt})\n", + " messages.append({\"role\": \"assistant\", \"content\": local_message})\n", + " messages.append({\"role\": \"user\", \"content\": gpt_messages[-1]})\n", + " \n", + " completion = basellm.chat.completions.create(\n", + " model=local_model,\n", + " messages=messages\n", + " )\n", + " \n", + " return completion.choices[0].message.content" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16fd90cb-ebfd-4a4f-ae49-70568ae8fbb1", + "metadata": {}, + "outputs": [], + "source": [ + "call_local()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"429eeefb-f080-4a57-8f2d-ff3d4237afab", + "metadata": {}, + "outputs": [], + "source": [ + "call_gpt()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ce847ed-521d-4be5-895b-44088de499e1", + "metadata": {}, + "outputs": [], + "source": [ + "gpt_messages = [\"Hi there\"]\n", + "local_messages = [\"Hi\"]\n", + "\n", + "print(f\"GPT:\\n{gpt_messages[0]}\\n\")\n", + "print(f\"local:\\n{local_messages[0]}\\n\")\n", + "\n", + "for i in range(5):\n", + " gpt_next = call_gpt()\n", + " print(f\"GPT:\\n{gpt_next}\\n\")\n", + " gpt_messages.append(gpt_next)\n", + " \n", + " local_next = call_local()\n", + " print(f\"local:\\n{local_next}\\n\")\n", + " local_messages.append(local_next)" + ] + }, + { + "cell_type": "markdown", + "id": "d3b1707a-2903-4529-b6eb-95a874a14e78", + "metadata": {}, + "source": [ + "## Let's try a more thoughful conversation\n", + "The two chatbots will engage in a friendly discussion on whether the US should have entered World War I in 1917. They are both open minded so they can learn from each other." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "abb733bf-a5d3-4718-8741-8e8abfd3a088", + "metadata": {}, + "outputs": [], + "source": [ + "# Let's make a conversation between GPT-4o-mini and Gemma3:12b\n", + "# We're using cheap versions of models so the costs will be minimal\n", + "\n", + "gpt_system = \"You are a chatbot who believes it was a mistake for the US to enter World War I; \\\n", + "you are open to other arguments, but you feel the evidence suggests the world would have been \\\n", + "better off if the US had stayed isolationalist. You consider counter arguments but also express \\\n", + "your own arguments.\"\n", + "\n", + "local_system = \"You are a chatbot who believes the US made the right decision entering World War I in \\\n", + "1917. Overall, the world is a better place for it. You are open minded but believe the evidence \\\n", + "supports this view. 
You consider counter arguments but also express your own arguments.\"\n", + "\n", + "gpt_messages = [\"It was such a mistake for the US to enter WWI\"]\n", + "local_messages = [\"Why do you say that?\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "569e18a3-25cd-46d5-8edb-713ff149d008", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "print(f\"GPT:\\n{gpt_messages[0]}\\n\")\n", + "print(f\"local:\\n{local_messages[0]}\\n\")\n", + "\n", + "for i in range(5):\n", + " gpt_next = call_gpt()\n", + " print(f\"GPT:\\n{gpt_next}\\n\")\n", + " gpt_messages.append(gpt_next)\n", + " \n", + " local_next = call_local()\n", + " print(f\"local:\\n{local_next}\\n\")\n", + " local_messages.append(local_next)" + ] + }, + { + "cell_type": "markdown", + "id": "d29df7da-eaa3-4c98-b913-05185b62cffe", + "metadata": {}, + "source": [ + "## Conclusion\n", + "I am amazed at how insightful this conversation was. Not only did they explore all the pros and cons, they began applying those lessons to current day foreign policy. This looks like a very good way to explore a topic. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b486b2d6-40da-4745-8cbf-1afd2be22caa", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 776478f6e87578f5c13564a3a680efe4113f2ad4 Mon Sep 17 00:00:00 2001 From: sharathir Date: Wed, 28 May 2025 11:01:32 +0530 Subject: [PATCH 08/23] Adding Week 2 Day 4 Ticket Pricing using tools with Ollama by sharathir --- .../Wk2Day4_Ollama_Tools_Sharathir.ipynb | 195 ++++++++++++++++++ 1 file changed, 195 insertions(+) create mode 100644 week2/community-contributions/Wk2Day4_Ollama_Tools_Sharathir.ipynb diff --git a/week2/community-contributions/Wk2Day4_Ollama_Tools_Sharathir.ipynb b/week2/community-contributions/Wk2Day4_Ollama_Tools_Sharathir.ipynb new file mode 100644 index 0000000..213d50b --- /dev/null +++ b/week2/community-contributions/Wk2Day4_Ollama_Tools_Sharathir.ipynb @@ -0,0 +1,195 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "989184c3-676b-4a68-8841-387ba0776e1d", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import json\n", + "from dotenv import load_dotenv\n", + "from openai import OpenAI\n", + "import gradio as gr\n", + "import ollama" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0ac9605-d28a-4c19-97e3-1dd3f9ac99ba", + "metadata": {}, + "outputs": [], + "source": [ + "system_message = \"You are a helpful assistant for an Airline called FlightAI. \"\n", + "system_message += \"Give short, courteous answers, no more than 1 sentence. 
Respond to greetings and general conversation politely.\"\n", + "system_message += \"Always be accurate. If you don't know the answer, say so.\"\n", + "system_message += \"When a user asks for information that requires external data or action, use the available tools to get that information Specifically\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "533e6edf-454a-493d-b0a7-dbc29a5f3930", + "metadata": {}, + "outputs": [], + "source": [ + "def chat(message, history):\n", + " messages = [{\"role\": \"system\", \"content\": system_message}] + history + [{\"role\": \"user\", \"content\": message}]\n", + " response = ollama.chat(model=\"llama3.2\", messages=messages)\n", + " return response['message']['content']\n", + "\n", + "gr.ChatInterface(fn=chat, type=\"messages\").launch()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac22d421-a241-4c1f-bac4-db2150099ecc", + "metadata": {}, + "outputs": [], + "source": [ + "ticket_prices = {\"london\": \"$799\", \"paris\": \"$899\", \"tokyo\": \"$1400\", \"berlin\": \"$499\"}\n", + "\n", + "def get_ticket_price(destination_city):\n", + " print(f\"Tool get_ticket_price called for {destination_city}\")\n", + " city = destination_city.lower()\n", + " return ticket_prices.get(city, \"Unknown\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a0381b1-375c-44ac-8757-2fdde2c76541", + "metadata": {}, + "outputs": [], + "source": [ + "price_function = {\n", + " \"name\": \"get_ticket_price\",\n", + " \"description\": \"Get the price of a return ticket to the destination city. 
Call this whenever you need to know the ticket price, for example when a customer asks 'How much is a ticket to this city'\",\n", + " \"parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"destination_city\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"The city that the customer wants to travel to\",\n", + " },\n", + " },\n", + " \"required\": [\"destination_city\"],\n", + " \"additionalProperties\": False\n", + " }\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce5a7fd0-1ce1-4b53-873e-f55d1e39d847", + "metadata": {}, + "outputs": [], + "source": [ + "#tools = [{\"type\": \"function\", \"function\": price_function}]\n", + "tools = [\n", + " {\n", + " \"type\":\"function\",\n", + " \"function\":{\n", + " \"name\": \"get_ticket_price\",\n", + " \"description\": \"Get the price of a return ticket to the destination city. Call this whenever you need to know the ticket price, for example when a customer asks 'How much is a ticket to this city'\",\n", + " \"parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"destination_city\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"The city that the customer wants to travel to\"\n", + " },\n", + " },\n", + " \"required\": [\"destination_city\"],\n", + " \"additionalProperties\": False\n", + " },\n", + " },\n", + " }\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "06eab709-3f05-4697-a6a8-5f5bc1f442a5", + "metadata": {}, + "outputs": [], + "source": [ + "def handle_tool_call(message):\n", + " tool_call = message.tool_calls[0]\n", + " arguments = tool_call.function.arguments\n", + " city = arguments.get('destination_city')\n", + " price = get_ticket_price(city)\n", + " response = {\n", + " \"role\": \"tool\",\n", + " \"content\": json.dumps({\"destination_city\": city,\"price\": price}),\n", + " # \"tool_call_id\": tool_call.id\n", + " }\n", + " return response, city" + ] + }, + 
{ + "cell_type": "code", + "execution_count": null, + "id": "d7f9af23-0683-40c3-a70b-0a385754688c", + "metadata": {}, + "outputs": [], + "source": [ + "def chat(message, history):\n", + " messages = [{\"role\": \"system\", \"content\": system_message}] + history + [{\"role\": \"user\", \"content\": message}]\n", + " response = ollama.chat(model=\"llama3.2\", messages=messages,tools=tools)\n", + " if response['message'].get('tool_calls'):\n", + " message = response['message']\n", + " response, city = handle_tool_call(message)\n", + " messages.append(message)\n", + " messages.append(response)\n", + " response = ollama.chat(model=\"llama3.2\", messages=messages)\n", + " \n", + " return response['message']['content']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fcfa39e2-92ce-48df-b735-f9bbfe638c81", + "metadata": {}, + "outputs": [], + "source": [ + "gr.ChatInterface(fn=chat, type=\"messages\").launch()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f5044e9-0ae8-4d88-a22f-d1180ab52434", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 9b85a94833d21ea0c9d53ea92170ace7dba01fd9 Mon Sep 17 00:00:00 2001 From: Jayapal Sahadevan Date: Wed, 28 May 2025 23:05:04 +0530 Subject: [PATCH 09/23] Added my contributions to community-contributions --- ...k1-day1-ollama-webpage-summarization.ipynb | 191 ++++++++++++++++++ 1 file changed, 191 insertions(+) create mode 100644 week1/community-contributions/week1-day1-ollama-webpage-summarization.ipynb diff --git 
a/week1/community-contributions/week1-day1-ollama-webpage-summarization.ipynb b/week1/community-contributions/week1-day1-ollama-webpage-summarization.ipynb new file mode 100644 index 0000000..27aaabb --- /dev/null +++ b/week1/community-contributions/week1-day1-ollama-webpage-summarization.ipynb @@ -0,0 +1,191 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "4dabb31c-a584-4715-9714-9fc9978c3cb5", + "metadata": {}, + "outputs": [], + "source": [ + "#Get IPL best team" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "3bb88086-ea9c-4766-9baf-a57bb69c3202", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import requests\n", + "from dotenv import load_dotenv\n", + "from bs4 import BeautifulSoup\n", + "from IPython.display import Markdown, display\n", + "from openai import OpenAI\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "9dc24243-d20a-48aa-b90b-26ef90233e22", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "API key found and looks good so far!\n" + ] + } + ], + "source": [ + "load_dotenv(override=True)\n", + "api_key = os.getenv('OPENAI_API_KEY')\n", + "\n", + "# Check the key\n", + "\n", + "if not api_key:\n", + " print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n", + "elif not api_key.startswith(\"sk-proj-\"):\n", + " print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook\")\n", + "elif api_key.strip() != api_key:\n", + " print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n", + "else:\n", + " print(\"API key found and looks good so far!\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "cb35e3d1-8733-4931-8744-9c3754793161", + "metadata": {}, 
+ "outputs": [], + "source": [ + "openai = OpenAI()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "63d62eb3-3255-4046-863e-d866a833d1a6", + "metadata": {}, + "outputs": [], + "source": [ + "headers = {\n", + " \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n", + "}\n", + "\n", + "class Website:\n", + " def __init__(self, url):\n", + " self.url = url\n", + " response = requests.get(url, headers=headers)\n", + " soup = BeautifulSoup(response.content, 'html.parser')\n", + " self.title = soup.title.string if soup.title else \"No title found\"\n", + " for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n", + " irrelevant.decompose()\n", + " self.text = soup.body.get_text(separator=\"\\n\", strip=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "409a70a6-331a-4ea4-ab8d-7a46fffc70d7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "How about \"The A-Team 2.0\"? Because clearly, you’re aiming for a sequel that’s already better than the original. Or maybe \"The Not-So-Secret League of Awesome\"? That one’s a real conversation starter! What vibe are you going for?\n", + "[{'role': 'system', 'content': 'You are an assistant that analyzes the contents of a cric info website and provides a short summary of best team in IPL. Respond in markdown.'}, {'role': 'user', 'content': '\\n Get page title\\n'}]\n" + ] + }, + { + "data": { + "text/markdown": [ + "# Best Team in IPL History\n", + "\n", + "The Indian Premier League (IPL) has seen various teams competing for the title since its inception in 2008. Some of the most successful teams in IPL history include:\n", + "\n", + "1. **Mumbai Indians**: They have clinched the IPL trophy a record five times (2013, 2015, 2017, 2019, 2020) and are known for their strong squad and strategic gameplay.\n", + "\n", + "2. 
**Chennai Super Kings**: With four titles (2010, 2011, 2018, 2021), the CSK has consistently been one of the top teams, led by the experienced MS Dhoni.\n", + "\n", + "3. **Kolkata Knight Riders**: They have won the championship twice (2012, 2014) and are recognized for their fan base and competitive spirit.\n", + "\n", + "Overall, the Mumbai Indians are often considered the best team in IPL history due to their multiple championships and consistent performance over the years." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Step 1: Create your prompts\n", + "system_prompt = \"You are an assistant that analyzes the contents of a cric info website \\\n", + "and provides a short summary of best team in IPL. \\\n", + "Respond in markdown.\"\n", + "\n", + "user_prompt = \"\"\"\n", + " Get page title\n", + "\"\"\"\n", + "\n", + "# Step 2: Make the messages list\n", + "messages = [\n", + " {\"role\": \"system\", \"content\": \"You are a snarky assistant\"},\n", + " {\"role\": \"user\", \"content\": \"Team name\"}\n", + "]\n", + "\n", + "# Step 3: Call OpenAI\n", + "\n", + "response = openai.chat.completions.create(model=\"gpt-4o-mini\", messages=messages)\n", + "print(response.choices[0].message.content)\n", + "\n", + "def messages_for(website):\n", + " return [\n", + " {\"role\": \"system\", \"content\": system_prompt},\n", + " {\"role\": \"user\", \"content\": user_prompt}]\n", + "\n", + "webUrl = \"https://www.google.com\"\n", + "print(messages_for(webUrl))\n", + "\n", + "def summarize(url):\n", + " website = Website(url)\n", + " response = openai.chat.completions.create(\n", + " model = \"gpt-4o-mini\",\n", + " messages = messages_for(website)\n", + " )\n", + " return response.choices[0].message.content\n", + "\n", + "# Step 4: print the result\n", + "summary = summarize(webUrl)\n", + "display(Markdown(summary))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", 
+ "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 30d1d58f07a380ba980f3b803f1babb71053e547 Mon Sep 17 00:00:00 2001 From: Jayapal Sahadevan Date: Wed, 28 May 2025 23:43:45 +0530 Subject: [PATCH 10/23] Added my contributions to community-contributions --- ...k1-day1-ollama-webpage-summarization.ipynb | 53 +++---------------- 1 file changed, 7 insertions(+), 46 deletions(-) diff --git a/week1/community-contributions/week1-day1-ollama-webpage-summarization.ipynb b/week1/community-contributions/week1-day1-ollama-webpage-summarization.ipynb index 27aaabb..95e6c32 100644 --- a/week1/community-contributions/week1-day1-ollama-webpage-summarization.ipynb +++ b/week1/community-contributions/week1-day1-ollama-webpage-summarization.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "3bb88086-ea9c-4766-9baf-a57bb69c3202", "metadata": {}, "outputs": [], @@ -27,18 +27,10 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "id": "9dc24243-d20a-48aa-b90b-26ef90233e22", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "API key found and looks good so far!\n" - ] - } - ], + "outputs": [], "source": [ "load_dotenv(override=True)\n", "api_key = os.getenv('OPENAI_API_KEY')\n", @@ -57,7 +49,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "id": "cb35e3d1-8733-4931-8744-9c3754793161", "metadata": {}, "outputs": [], @@ -67,7 +59,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "id": "63d62eb3-3255-4046-863e-d866a833d1a6", "metadata": {}, "outputs": [], @@ -89,41 +81,10 @@ }, { "cell_type": 
"code", - "execution_count": 17, + "execution_count": null, "id": "409a70a6-331a-4ea4-ab8d-7a46fffc70d7", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "How about \"The A-Team 2.0\"? Because clearly, you’re aiming for a sequel that’s already better than the original. Or maybe \"The Not-So-Secret League of Awesome\"? That one’s a real conversation starter! What vibe are you going for?\n", - "[{'role': 'system', 'content': 'You are an assistant that analyzes the contents of a cric info website and provides a short summary of best team in IPL. Respond in markdown.'}, {'role': 'user', 'content': '\\n Get page title\\n'}]\n" - ] - }, - { - "data": { - "text/markdown": [ - "# Best Team in IPL History\n", - "\n", - "The Indian Premier League (IPL) has seen various teams competing for the title since its inception in 2008. Some of the most successful teams in IPL history include:\n", - "\n", - "1. **Mumbai Indians**: They have clinched the IPL trophy a record five times (2013, 2015, 2017, 2019, 2020) and are known for their strong squad and strategic gameplay.\n", - "\n", - "2. **Chennai Super Kings**: With four titles (2010, 2011, 2018, 2021), the CSK has consistently been one of the top teams, led by the experienced MS Dhoni.\n", - "\n", - "3. **Kolkata Knight Riders**: They have won the championship twice (2012, 2014) and are recognized for their fan base and competitive spirit.\n", - "\n", - "Overall, the Mumbai Indians are often considered the best team in IPL history due to their multiple championships and consistent performance over the years." 
- ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# Step 1: Create your prompts\n", "system_prompt = \"You are an assistant that analyzes the contents of a cric info website \\\n", From 06b14a37b6753bc323281fb7b2411b6166eb050e Mon Sep 17 00:00:00 2001 From: Jayapal Sahadevan Date: Thu, 29 May 2025 00:06:39 +0530 Subject: [PATCH 11/23] Added my contributions to community-contributions - removed comment --- .../week1-day1-ollama-webpage-summarization.ipynb | 2 -- 1 file changed, 2 deletions(-) diff --git a/week1/community-contributions/week1-day1-ollama-webpage-summarization.ipynb b/week1/community-contributions/week1-day1-ollama-webpage-summarization.ipynb index 95e6c32..3df7751 100644 --- a/week1/community-contributions/week1-day1-ollama-webpage-summarization.ipynb +++ b/week1/community-contributions/week1-day1-ollama-webpage-summarization.ipynb @@ -35,8 +35,6 @@ "load_dotenv(override=True)\n", "api_key = os.getenv('OPENAI_API_KEY')\n", "\n", - "# Check the key\n", - "\n", "if not api_key:\n", " print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n", "elif not api_key.startswith(\"sk-proj-\"):\n", From 0714e7d1d973937ec696eb39b84e5f43be90bc3c Mon Sep 17 00:00:00 2001 From: armangoudarzi91 Date: Wed, 28 May 2025 12:50:14 -0600 Subject: [PATCH 12/23] Add my notebook to community-contributions --- .../Day1-finance-journal-summarizer.ipynb | 127 ++++++++++++++++++ 1 file changed, 127 insertions(+) create mode 100644 week1/community-contributions/Day1-finance-journal-summarizer.ipynb diff --git a/week1/community-contributions/Day1-finance-journal-summarizer.ipynb b/week1/community-contributions/Day1-finance-journal-summarizer.ipynb new file mode 100644 index 0000000..cffa355 --- /dev/null +++ b/week1/community-contributions/Day1-finance-journal-summarizer.ipynb @@ -0,0 +1,127 @@ +{ + "cells": [ + { + "metadata": {}, + "cell_type": 
"code", + "source": [ + "import os, textwrap, time, requests\n", + "from bs4 import BeautifulSoup\n", + "from openai import OpenAI\n", + "from dotenv import load_dotenv\n", + "from urllib.parse import urljoin\n", + "\n", + "# ------------------ ENV & OpenAI ------------------\n", + "load_dotenv(override=True)\n", + "openai = OpenAI(api_key=os.getenv(\"OPENAI_API_KEY\"))\n", + "\n", + "UA = (\"Mozilla/5.0 (Windows NT 10.0; Win64; x64) \"\n", + " \"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117 Safari/537.36\")\n", + "BASE_URL = \"https://www.cambridge.org\"\n", + "JFQA_URL = f\"{BASE_URL}/core/journals/journal-of-financial-and-quantitative-analysis/latest-issue\"\n", + "\n", + "# ------------------ Helpers ------------------\n", + "def fetch_latest_issue(url: str) -> list[dict]:\n", + " \"\"\"Return unique {title, link} dicts for each research article.\"\"\"\n", + " soup = BeautifulSoup(\n", + " requests.get(url, headers={\"User-Agent\": UA}, timeout=30).text,\n", + " \"html.parser\"\n", + " )\n", + "\n", + " anchors = soup.find_all(\"a\", href=lambda h: h and \"/article/\" in h)\n", + " seen, articles = set(), []\n", + " for a in anchors:\n", + " href = a[\"href\"].split(\"?\")[0] # strip tracking params\n", + " if href in seen: # de‑duplicate\n", + " continue\n", + " seen.add(href)\n", + " title = a.get_text(\" \", strip=True)\n", + " full = urljoin(BASE_URL, href)\n", + " articles.append({\"title\": title, \"link\": full})\n", + " print(f\"Found {len(articles)} unique article links.\")\n", + " return articles\n", + "\n", + "def fetch_article_details(link: str) -> dict:\n", + " soup = BeautifulSoup(\n", + " requests.get(link, headers={\"User-Agent\": UA}, timeout=30).text,\n", + " \"html.parser\"\n", + " )\n", + "\n", + " # abstract\n", + " abs_tag = soup.find(\"div\", class_=\"abstract\")\n", + " abstract = abs_tag.get_text(\" \", strip=True) if abs_tag else \"N/A\"\n", + "\n", + " # publication date (meta is most reliable)\n", + " meta_date = 
soup.find(\"meta\", attrs={\"name\": \"citation_publication_date\"})\n", + " pub_date = meta_date[\"content\"] if meta_date else \"N/A\"\n", + "\n", + " # authors (multiple tags)\n", + " authors = [m[\"content\"] for m in soup.find_all(\"meta\",\n", + " attrs={\"name\": \"citation_author\"})]\n", + " authors_str = \", \".join(authors) or \"N/A\"\n", + "\n", + " return {\"abstract\": abstract, \"pub_date\": pub_date, \"authors\": authors_str}\n", + "\n", + "def summarise(txt: str) -> str:\n", + " prompt = (\"Summarise the following finance‑paper abstract in 2‑3 sentences, \"\n", + " \"mentioning the question, method, and main finding.\\n\\n\"\n", + " f\"Abstract:\\n{txt}\")\n", + " try:\n", + " rsp = openai.chat.completions.create(\n", + " model=\"gpt-4o-mini\",\n", + " messages=[\n", + " {\"role\": \"system\",\n", + " \"content\": \"You are a helpful finance research assistant.\"},\n", + " {\"role\": \"user\", \"content\": prompt}],\n", + " temperature=0.2, max_tokens=120\n", + " )\n", + " return rsp.choices[0].message.content.strip()\n", + " except Exception as e:\n", + " print(f\"⚠️ summarise error → {e}\")\n", + " return \"Summary unavailable.\"\n", + "\n", + "def scrape_jfqa_latest() -> None:\n", + " for art in fetch_latest_issue(JFQA_URL):\n", + " det = fetch_article_details(art[\"link\"])\n", + " if det[\"abstract\"] == \"N/A\":\n", + " print(f\"\\n📘 {art['title']} — no abstract found.\")\n", + " continue\n", + "\n", + " summary = summarise(det[\"abstract\"])\n", + " print(f\"\\n📘 {art['title']}\")\n", + " print(f\" Authors: {det['authors']}\")\n", + " print(f\" Date : {det['pub_date']}\")\n", + " print(f\" Journal: JFQA (Latest Issue)\")\n", + " print(\" Summary:\", textwrap.shorten(summary, width=600, placeholder=\"…\"))\n", + " print(\"-\" * 90)\n", + " time.sleep(1.0) # polite gap between OpenAI calls\n", + "\n", + "if __name__ == \"__main__\":\n", + " scrape_jfqa_latest()\n" + ], + "id": "e20b182f6258f0be", + "outputs": [], + "execution_count": null + } 
+ ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 15c2520925b342e8ce7b11a6fcbec85817aac0d6 Mon Sep 17 00:00:00 2001 From: Jeannine Jordan Date: Fri, 30 May 2025 06:15:59 -0400 Subject: [PATCH 13/23] Add community contributions for PR --- ...thical-antibot-async_jeannine-jordan.ipynb | 794 ++++++++++++++++++ ...ver-threaded-scraper_jeannine-jordan.ipynb | 626 ++++++++++++++ ...ls-code-and-UI-image_jeannine-jordan.ipynb | 349 ++++++++ 3 files changed, 1769 insertions(+) create mode 100644 week1/community-contributions/day1_ethical-antibot-async_jeannine-jordan.ipynb create mode 100644 week1/community-contributions/day5_shared-driver-threaded-scraper_jeannine-jordan.ipynb create mode 100644 week1/community-contributions/week1-EXERCISE_rewrite-internal-tools-code-and-UI-image_jeannine-jordan.ipynb diff --git a/week1/community-contributions/day1_ethical-antibot-async_jeannine-jordan.ipynb b/week1/community-contributions/day1_ethical-antibot-async_jeannine-jordan.ipynb new file mode 100644 index 0000000..70e5cc7 --- /dev/null +++ b/week1/community-contributions/day1_ethical-antibot-async_jeannine-jordan.ipynb @@ -0,0 +1,794 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "d15d8294-3328-4e07-ad16-8a03e9bbfdb9", + "metadata": {}, + "source": [ + "# YOUR FIRST LAB\n", + "### Please read this section. 
This is valuable to get you prepared, even if it's a long read -- it's important stuff.\n", + "\n", + "## Your first Frontier LLM Project\n", + "\n", + "Let's build a useful LLM solution - in a matter of minutes.\n", + "\n", + "By the end of this course, you will have built an autonomous Agentic AI solution with 7 agents that collaborate to solve a business problem. All in good time! We will start with something smaller...\n", + "\n", + "Our goal is to code a new kind of Web Browser. Give it a URL, and it will respond with a summary. The Reader's Digest of the internet!!\n", + "\n", + "Before starting, you should have completed the setup for [PC](../SETUP-PC.md) or [Mac](../SETUP-mac.md) and you hopefully launched this jupyter lab from within the project root directory, with your environment activated.\n", + "\n", + "## If you're new to Jupyter Lab\n", + "\n", + "Welcome to the wonderful world of Data Science experimentation! Once you've used Jupyter Lab, you'll wonder how you ever lived without it. Simply click in each \"cell\" with code in it, such as the cell immediately below this text, and hit Shift+Return to execute that cell. As you wish, you can add a cell with the + button in the toolbar, and print values of variables, or try out variations. \n", + "\n", + "I've written a notebook called [Guide to Jupyter](Guide%20to%20Jupyter.ipynb) to help you get more familiar with Jupyter Labs, including adding Markdown comments, using `!` to run shell commands, and `tqdm` to show progress.\n", + "\n", + "## If you're new to the Command Line\n", + "\n", + "Please see these excellent guides: [Command line on PC](https://chatgpt.com/share/67b0acea-ba38-8012-9c34-7a2541052665) and [Command line on Mac](https://chatgpt.com/canvas/shared/67b0b10c93a081918210723867525d2b). \n", + "\n", + "## If you'd prefer to work in IDEs\n", + "\n", + "If you're more comfortable in IDEs like VSCode, Cursor or PyCharm, they both work great with these lab notebooks too. 
\n", + "If you'd prefer to work in VSCode, [here](https://chatgpt.com/share/676f2e19-c228-8012-9911-6ca42f8ed766) are instructions from an AI friend on how to configure it for the course.\n", + "\n", + "## If you'd like to brush up your Python\n", + "\n", + "I've added a notebook called [Intermediate Python](Intermediate%20Python.ipynb) to get you up to speed. But you should give it a miss if you already have a good idea what this code does: \n", + "`yield from {book.get(\"author\") for book in books if book.get(\"author\")}`\n", + "\n", + "## I am here to help\n", + "\n", + "If you have any problems at all, please do reach out. \n", + "I'm available through the platform, or at ed@edwarddonner.com, or at https://www.linkedin.com/in/eddonner/ if you'd like to connect (and I love connecting!) \n", + "And this is new to me, but I'm also trying out X/Twitter at [@edwarddonner](https://x.com/edwarddonner) - if you're on X, please show me how it's done 😂 \n", + "\n", + "## More troubleshooting\n", + "\n", + "Please see the [troubleshooting](troubleshooting.ipynb) notebook in this folder to diagnose and fix common problems. At the very end of it is a diagnostics script with some useful debug info.\n", + "\n", + "## For foundational technical knowledge (eg Git, APIs, debugging) \n", + "\n", + "If you're relatively new to programming -- I've got your back! While it's ideal to have some programming experience for this course, there's only one mandatory prerequisite: plenty of patience. 
😁 I've put together a set of self-study guides that cover Git and GitHub, APIs and endpoints, beginner python and more.\n", + "\n", + "This covers Git and GitHub; what they are, the difference, and how to use them: \n", + "https://github.com/ed-donner/agents/blob/main/guides/03_git_and_github.ipynb\n", + "\n", + "This covers technical foundations: \n", + "ChatGPT vs API; taking screenshots; Environment Variables; Networking basics; APIs and endpoints: \n", + "https://github.com/ed-donner/agents/blob/main/guides/04_technical_foundations.ipynb\n", + "\n", + "This covers Python for beginners, and making sure that a `NameError` never trips you up: \n", + "https://github.com/ed-donner/agents/blob/main/guides/06_python_foundations.ipynb\n", + "\n", + "This covers the essential techniques for figuring out errors: \n", + "https://github.com/ed-donner/agents/blob/main/guides/08_debugging.ipynb\n", + "\n", + "And you'll find other useful guides in the same folder in GitHub. Some information applies to my other Udemy course (eg Async Python) but most of it is very relevant for LLM engineering.\n", + "\n", + "## If this is old hat!\n", + "\n", + "If you're already comfortable with today's material, please hang in there; you can move swiftly through the first few labs - we will get much more in depth as the weeks progress. Ultimately we will fine-tune our own LLM to compete with OpenAI!\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \n", + "

Please read - important note

\n", + " The way I collaborate with you may be different to other courses you've taken. I prefer not to type code while you watch. Rather, I execute Jupyter Labs, like this, and give you an intuition for what's going on. My suggestion is that you carefully execute this yourself, after watching the lecture. Add print statements to understand what's going on, and then come up with your own variations. If you have a Github account, use this to showcase your variations. Not only is this essential practice, but it demonstrates your skills to others, including perhaps future clients or employers...\n", + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \n", + "

This code is a live resource - keep an eye out for my emails

\n", + " I push updates to the code regularly. As people ask questions, I add more examples or improved commentary. As a result, you'll notice that the code below isn't identical to the videos. Everything from the videos is here; but I've also added better explanations and new models like DeepSeek. Consider this like an interactive book.

\n", + " I try to send emails regularly with important updates related to the course. You can find this in the 'Announcements' section of Udemy in the left sidebar. You can also choose to receive my emails via your Notification Settings in Udemy. I'm respectful of your inbox and always try to add value with my emails!\n", + "
\n", + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \n", + "

Business value of these exercises

\n", + " A final thought. While I've designed these notebooks to be educational, I've also tried to make them enjoyable. We'll do fun things like have LLMs tell jokes and argue with each other. But fundamentally, my goal is to teach skills you can apply in business. I'll explain business implications as we go, and it's worth keeping this in mind: as you build experience with models and techniques, think of ways you could put this into action at work today. Please do contact me if you'd like to discuss more or if you have ideas to bounce off me.\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e2a9393-7767-488e-a8bf-27c12dca35bd", + "metadata": {}, + "outputs": [], + "source": [ + "# imports\n", + "\n", + "import os\n", + "import requests\n", + "from dotenv import load_dotenv\n", + "from bs4 import BeautifulSoup\n", + "from IPython.display import Markdown, display\n", + "from openai import OpenAI\n", + "from selenium import webdriver\n", + "from selenium.webdriver.chrome.options import Options\n", + "from selenium.webdriver.chrome.service import Service\n", + "from webdriver_manager.chrome import ChromeDriverManager\n", + "import time\n", + "import random\n", + "from urllib import robotparser\n", + "from urllib.parse import urlparse\n", + "\n", + "# If you get an error running this cell, then please head over to the troubleshooting notebook!" + ] + }, + { + "cell_type": "markdown", + "id": "6900b2a8-6384-4316-8aaa-5e519fca4254", + "metadata": {}, + "source": [ + "# Connecting to OpenAI (or Ollama)\n", + "\n", + "The next cell is where we load in the environment variables in your `.env` file and connect to OpenAI. \n", + "\n", + "If you'd like to use free Ollama instead, please see the README section \"Free Alternative to Paid APIs\", and if you're not sure how to do this, there's a full solution in the solutions folder (day1_with_ollama.ipynb).\n", + "\n", + "## Troubleshooting if you have problems:\n", + "\n", + "Head over to the [troubleshooting](troubleshooting.ipynb) notebook in this folder for step by step code to identify the root cause and fix it!\n", + "\n", + "If you make a change, try restarting the \"Kernel\" (the python process sitting behind this notebook) by Kernel menu >> Restart Kernel and Clear Outputs of All Cells. Then try this notebook again, starting at the top.\n", + "\n", + "Or, contact me! Message me or email ed@edwarddonner.com and we will get this to work.\n", + "\n", + "Any concerns about API costs? 
See my notes in the README - costs should be minimal, and you can control it at every point. You can also use Ollama as a free alternative, which we discuss during Day 2." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7b87cadb-d513-4303-baee-a37b6f938e4d", + "metadata": {}, + "outputs": [], + "source": [ + "# Load environment variables in a file called .env\n", + "\n", + "load_dotenv(override=True)\n", + "api_key = os.getenv('OPENAI_API_KEY')\n", + "\n", + "# Check the key\n", + "\n", + "if not api_key:\n", + " print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n", + "elif not api_key.startswith(\"sk-proj-\"):\n", + " print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook\")\n", + "elif api_key.strip() != api_key:\n", + " print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n", + "else:\n", + " print(\"API key found and looks good so far!\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "019974d9-f3ad-4a8a-b5f9-0a3719aea2d3", + "metadata": {}, + "outputs": [], + "source": [ + "openai = OpenAI()\n", + "\n", + "# If this doesn't work, try Kernel menu >> Restart Kernel and Clear Outputs Of All Cells, then run the cells from the top of this notebook down.\n", + "# If it STILL doesn't work (horrors!) then please see the Troubleshooting notebook in this folder for full instructions" + ] + }, + { + "cell_type": "markdown", + "id": "442fc84b-0815-4f40-99ab-d9a5da6bda91", + "metadata": {}, + "source": [ + "# Let's make a quick call to a Frontier model to get started, as a preview!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a58394bf-1e45-46af-9bfd-01e24da6f49a", + "metadata": {}, + "outputs": [], + "source": [ + "# To give you a preview -- calling OpenAI with these messages is this easy. Any problems, head over to the Troubleshooting notebook.\n", + "\n", + "message = \"Hello, GPT! This is my first ever message to you! Hi!\"\n", + "response = openai.chat.completions.create(model=\"gpt-4o-mini\", messages=[{\"role\":\"user\", \"content\":message}])\n", + "print(response.choices[0].message.content)" + ] + }, + { + "cell_type": "markdown", + "id": "2aa190e5-cb31-456a-96cc-db109919cd78", + "metadata": {}, + "source": [ + "## OK onwards with our first project" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c5e793b2-6775-426a-a139-4848291d0463", + "metadata": {}, + "outputs": [], + "source": [ + "# A class to represent a Webpage\n", + "# If you're not familiar with Classes, check out the \"Intermediate Python\" notebook\n", + "\n", + "# Some websites need you to use proper headers when fetching them:\n", + "headers = {\n", + " \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n", + "}\n", + "\n", + "class Website:\n", + "\n", + " def __init__(self, url):\n", + " \"\"\"\n", + " Create this Website object from the given url using the BeautifulSoup library\n", + " \"\"\"\n", + " self.url = url\n", + " response = requests.get(url, headers=headers)\n", + " soup = BeautifulSoup(response.content, 'html.parser')\n", + " self.title = soup.title.string if soup.title else \"No title found\"\n", + " for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n", + " irrelevant.decompose()\n", + " self.text = soup.body.get_text(separator=\"\\n\", strip=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ef960cf-6dc2-4cda-afb3-b38be12f4c97", + "metadata": {}, + "outputs": [], + "source": 
def user_prompt_for(website):
    """Return the user prompt asking for a markdown summary of ``website``.

    The prompt is the page title, a fixed instruction paragraph, and then the
    page text, read from ``website.title`` and ``website.text``.
    """
    instructions = (
        "\nThe contents of this website is as follows; "
        "please provide a short summary of this website in markdown. "
        "If it includes news or announcements, then summarize these too.\n\n"
    )
    heading = f"You are looking at a website titled {website.title}"
    return heading + instructions + website.text
"# See how this function creates exactly the format above\n", + "\n", + "def messages_for(website):\n", + " return [\n", + " {\"role\": \"system\", \"content\": system_prompt},\n", + " {\"role\": \"user\", \"content\": user_prompt_for(website)}\n", + " ]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "36478464-39ee-485c-9f3f-6a4e458dbc9c", + "metadata": {}, + "outputs": [], + "source": [ + "# Try this out, and then try for a few more websites\n", + "\n", + "messages_for(ed)" + ] + }, + { + "cell_type": "markdown", + "id": "16f49d46-bf55-4c3e-928f-68fc0bf715b0", + "metadata": {}, + "source": [ + "## Time to bring it together - the API for OpenAI is very simple!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "905b9919-aba7-45b5-ae65-81b3d1d78e34", + "metadata": {}, + "outputs": [], + "source": [ + "# And now: call the OpenAI API. You will get very familiar with this!\n", + "\n", + "def summarize(url):\n", + " website = Website(url)\n", + " response = openai.chat.completions.create(\n", + " model = \"gpt-4o-mini\",\n", + " messages = messages_for(website)\n", + " )\n", + " return response.choices[0].message.content" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05e38d41-dfa4-4b20-9c96-c46ea75d9fb5", + "metadata": {}, + "outputs": [], + "source": [ + "summarize(\"https://edwarddonner.com\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d926d59-450e-4609-92ba-2d6f244f1342", + "metadata": {}, + "outputs": [], + "source": [ + "# A function to display this nicely in the Jupyter output, using markdown\n", + "\n", + "def display_summary(url):\n", + " summary = summarize(url)\n", + " display(Markdown(summary))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3018853a-445f-41ff-9560-d925d1774b2f", + "metadata": {}, + "outputs": [], + "source": [ + "display_summary(\"https://edwarddonner.com\")" + ] + }, + { + "cell_type": "markdown", + "id": 
"b3bcf6f4-adce-45e9-97ad-d9a5d7a3a624", + "metadata": {}, + "source": [ + "# Let's try more websites\n", + "\n", + "Note that this will only work on websites that can be scraped using this simplistic approach.\n", + "\n", + "Websites that are rendered with Javascript, like React apps, won't show up. See the community-contributions folder for a Selenium implementation that gets around this. You'll need to read up on installing Selenium (ask ChatGPT!)\n", + "\n", + "Also Websites protected with CloudFront (and similar) may give 403 errors - many thanks Andy J for pointing this out.\n", + "\n", + "But many websites will work just fine!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "45d83403-a24c-44b5-84ac-961449b4008f", + "metadata": {}, + "outputs": [], + "source": [ + "display_summary(\"https://cnn.com\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75e9fd40-b354-4341-991e-863ef2e59db7", + "metadata": {}, + "outputs": [], + "source": [ + "display_summary(\"https://anthropic.com\")" + ] + }, + { + "cell_type": "markdown", + "id": "c951be1a-7f1b-448f-af1f-845978e47e2c", + "metadata": {}, + "source": [ + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \n", + "

Business applications

\n", + " In this exercise, you experienced calling the Cloud API of a Frontier Model (a leading model at the frontier of AI) for the first time. We will be using APIs like OpenAI at many stages in the course, in addition to building our own LLMs.\n", + "\n", + "More specifically, we've applied this to Summarization - a classic Gen AI use case to make a summary. This can be applied to any business vertical - summarizing the news, summarizing financial performance, summarizing a resume in a cover letter - the applications are limitless. Consider how you could apply Summarization in your business, and try prototyping a solution.\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \n", + "

Before you continue - now try yourself

\n", + " Use the cell below to make your own simple commercial example. Stick with the summarization use case for now. Here's an idea: write something that will take the contents of an email, and will suggest an appropriate short subject line for the email. That's the kind of feature that might be built into a commercial email tool.\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "00743dac-0e70-45b7-879a-d7293a6f68a6", + "metadata": {}, + "outputs": [], + "source": [ + "# Step 1: Create your prompts\n", + "\n", + "system_prompt = \"\"\"\n", + "You are an assistant that creates short clear concise and relevant email \n", + "subject lines based on the content of the email\n", + "\"\"\"\n", + "user_prompt = \"\"\"\n", + "Hi team,\n", + "\n", + "Just a quick update on our Q2 progress. We’ve exceeded our sales goals by 15% and customer satisfaction scores are up 10 points from last quarter. Kudos to everyone involved, especially the sales and support teams. Let’s keep this momentum going as we head into Q3.\n", + "\n", + "Best,\n", + "Jeannine\n", + "\"\"\"\n", + "\n", + "# Step 2: Make the messages list\n", + "\n", + "messages = [\n", + " {\"role\":\"system\", \"content\":system_prompt},\n", + " {\"role\":\"user\", \"content\":f\"Email:{user_prompt}/n/nGenerate a concise subject line for this email.\"}\n", + "] # fill this in\n", + "\n", + "# Step 3: Call OpenAI\n", + "\n", + "response = openai.chat.completions.create(\n", + " model=\"gpt-4o-mini\",\n", + " messages=messages,\n", + " temperature=0.5,\n", + " max_tokens=20\n", + ")\n", + "\n", + "# Step 4: print the result\n", + "\n", + "print(\"Suggested subject line: \", response.choices[0].message.content.strip())" + ] + }, + { + "cell_type": "markdown", + "id": "36ed9f14-b349-40e9-a42c-b367e77f8bda", + "metadata": {}, + "source": [ + "## An extra exercise for those who enjoy web scraping\n", + "\n", + "You may notice that if you try `display_summary(\"https://openai.com\")` - it doesn't work! That's because OpenAI has a fancy website that uses Javascript. There are many ways around this that some of you might be familiar with. For example, Selenium is a hugely popular framework that runs a browser behind the scenes, renders the page, and allows you to query it. 
# A modified class to fetch and parse fully rendered pages: with ethically reduced CAPTCHA events
class Website:
    """A web page rendered in headless Chrome and reduced to its title and visible text."""

    def __init__(self, url):
        """
        Create this Website object from the given url using Selenium and BeautifulSoup.
        Render JavaScript content and extract text from the page.

        Sets ``self.url``, ``self.text`` and ``self.title``. A robots.txt
        disallow only prints a warning; scraping proceeds regardless.

        Raises:
            Exception: if every scrape attempt fails.
        """
        self.url = url

        if not self._is_allowed_by_robots(url):
            print(f"Warning: robots.txt does not explicitly allow webscraping of {url}. Proceeding anyway.")
        self.text, self.title = self._scrape_content()

    def _is_allowed_by_robots(self, url, user_agent="*"):
        """Return True if the site's robots.txt lets ``user_agent`` fetch ``url``.

        Any error while fetching or parsing robots.txt is treated as permission.
        """
        parsed = urlparse(url)
        robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(robots_url)
        try:
            rp.read()
            return rp.can_fetch(user_agent, url)
        except Exception:
            # If robots.txt is unreachable, assume permissible
            return True

    def _scrape_content(self, retries=3, wait_base=5):
        """Load ``self.url`` in headless Chrome and return ``(text, title)``.

        Args:
            retries: maximum number of attempts.
            wait_base: base delay in seconds; the pause after failed attempt
                ``n`` (0-based) is ``wait_base * 2 ** n``.

        Returns:
            Tuple ``(text, title)``; ``text`` is "No content found." when the
            page has no <body>, ``title`` is "No title found" when missing.

        Raises:
            Exception: when all attempts fail; the last underlying error is
                chained as ``__cause__``.
        """
        # List of user agents for rotation
        user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.1 Safari/605.1.15",
            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0"
        ]
        # One agent is picked at random per scrape and reused across its retries
        selected_agent = random.choice(user_agents)

        # Set up headless Chrome options
        options = Options()
        options.add_argument("--headless=new")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument(f"user-agent={selected_agent}")

        last_error = None
        # Retry with exponential backoff between attempts
        for attempt in range(retries):
            try:
                # Start browser
                service = Service(ChromeDriverManager().install())
                driver = webdriver.Chrome(service=service, options=options)
                try:
                    driver.set_page_load_timeout(30)
                    driver.get(self.url)

                    # Mimic human browsing behavior with random time delay, without overloading the server
                    time.sleep(random.uniform(6, 12))

                    # Get the page source after rendering
                    page_source = driver.page_source
                finally:
                    # Always shut the browser down, including when get()/page_source
                    # raised, so failed attempts do not leave Chrome processes behind.
                    driver.quit()

                soup = BeautifulSoup(page_source, 'html.parser')

                for tag in soup(["script", "style", "img", "input"]):
                    tag.decompose()

                title = soup.title.string.strip() if soup.title and soup.title.string else "No title found"
                body = soup.body
                text = body.get_text(separator="\n", strip=True) if body else "No content found."

                return text, title

            except Exception as e:
                last_error = e
                # Exponential backoff to avoid retry spamming on failure;
                # no point waiting after the final attempt.
                if attempt < retries - 1:
                    time.sleep(wait_base * (2 ** attempt))

        raise Exception("Failed to retrieve content despite retries.") from last_error
def scrape_multiple_async(urls, max_workers=4):\n", + " loop = asyncio.get_running_loop()\n", + " with ThreadPoolExecutor(max_workers=max_workers) as executor:\n", + " futures = [\n", + " loop.run_in_executor(executor, scrape_sync, url)\n", + " for url in urls\n", + " ]\n", + " return await asyncio.gather(*futures)\n", + "\n", + "\n", + "# Example async usage\n", + "if __name__ == \"__main__\":\n", + " urls_to_scrape = [\n", + " \"https://www.investopedia.com/articles/active-trading/111115/why-all-worlds-top-10-companies-are-american.asp\",\n", + " \"https://fortune.com/ranking/global500/\",\n", + " \"http://en.wikipedia.org/wiki/List_of_largest_corporate_profits_and_losses\",\n", + " ]\n", + "\n", + " async def run():\n", + " results = await scrape_multiple_async(urls_to_scrape)\n", + " for res in results:\n", + " print(f\"\\nURL: {res.get('url')}\")\n", + " print(f\"Title: {res.get('title', 'N/A')}\")\n", + " print(f\"Preview:\\n{res.get('text', res.get('error', 'No content'))}\\n\")\n", + "\n", + " # Jupyter notebook already has a running event loop: asyncio.run() cannot be called from a running event loop\n", + " nest_asyncio.apply()\n", + " await run()\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "32fa56f2-f78e-421f-b35e-77fb9608d652", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/week1/community-contributions/day5_shared-driver-threaded-scraper_jeannine-jordan.ipynb b/week1/community-contributions/day5_shared-driver-threaded-scraper_jeannine-jordan.ipynb new file mode 100644 
# imports
# If these fail, please check you're running from an 'activated' environment with (llms) in the command prompt

import concurrent.futures
import json
import os
import random
import threading
import time
from typing import List
from urllib.parse import urlparse, urljoin

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from IPython.display import Markdown, display, update_display
from openai import OpenAI
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
"id": "fc5d8880-f2ee-4c06-af16-ecbc0262af61", + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize and constants\n", + "\n", + "load_dotenv(override=True)\n", + "api_key = os.getenv('OPENAI_API_KEY')\n", + "\n", + "if api_key and api_key.startswith('sk-proj-') and len(api_key)>10:\n", + " print(\"API key looks good so far\")\n", + "else:\n", + " print(\"There might be a problem with your API key? Please visit the troubleshooting notebook!\")\n", + " \n", + "MODEL = 'gpt-4o-mini'\n", + "openai = OpenAI()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "106dd65e-90af-4ca8-86b6-23a41840645b", + "metadata": {}, + "outputs": [], + "source": [ + "# A class to represent a Webpage\n", + "\n", + "# Some websites need you to use proper headers when fetching them:\n", + "headers = {\n", + " \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n", + "}\n", + "\n", + "class Website:\n", + " \"\"\"\n", + " A utility class to represent a Website that we have scraped, now with links\n", + " \"\"\"\n", + "\n", + " def __init__(self, url):\n", + " self.url = url\n", + " response = requests.get(url, headers=headers)\n", + " self.body = response.content\n", + " soup = BeautifulSoup(self.body, 'html.parser')\n", + " self.title = soup.title.string if soup.title else \"No title found\"\n", + " if soup.body:\n", + " for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n", + " irrelevant.decompose()\n", + " self.text = soup.body.get_text(separator=\"\\n\", strip=True)\n", + " else:\n", + " self.text = \"\"\n", + " links = [link.get('href') for link in soup.find_all('a')]\n", + " self.links = [link for link in links if link]\n", + "\n", + " def get_contents(self):\n", + " return f\"Webpage Title:\\n{self.title}\\nWebpage Contents:\\n{self.text}\\n\\n\"\n", + "\n", + "\n", + "# A modified class to fetch and parse fully rendered pages\n", + "class 
NewWebsite:\n", + " shared_driver = None # Class variable to share browser instance\n", + "\n", + " def __init__(self, url, driver=None):\n", + " self.url = url\n", + " self.driver = driver or NewWebsite._get_shared_driver()\n", + " self.text, self.title, self.links = self._scrape_content()\n", + " \n", + " @classmethod\n", + " def _get_shared_driver(cls):\n", + " if cls.shared_driver is None:\n", + " # Set up headless Chrome options\n", + " options = Options()\n", + " options.add_argument(\"--headless=new\")\n", + " options.add_argument(\"--disable-gpu\")\n", + " options.add_argument(\"--no-sandbox\")\n", + " options.add_argument(\"--disable-dev-shm-usage\")\n", + " options.add_argument(\"user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\")\n", + "\n", + " service = Service(ChromeDriverManager().install())\n", + " cls.shared_driver = webdriver.Chrome(service=service, options=options)\n", + " return cls.shared_driver\n", + "\n", + " def _scrape_content(self):\n", + " try:\n", + " self.driver.get(self.url)\n", + " # Mimick human browsing behavior without overloading the server\n", + " WebDriverWait(self.driver, 15).until(EC.presence_of_element_located((By.TAG_NAME, \"a\")))\n", + " # Allow JS-rendered content to settle\n", + " time.sleep(2)\n", + "\n", + " # Get the page source after rendering\n", + " soup = BeautifulSoup(self.driver.page_source, \"html.parser\")\n", + " \n", + " for tag in soup([\"script\", \"style\", \"img\", \"input\"]):\n", + " tag.decompose()\n", + " \n", + " title = soup.title.string.strip() if soup.title and soup.title.string else \"No title found\"\n", + " body = soup.body\n", + " text = soup.body.get_text(separator=\"\\n\", strip=True) if body else \"No content found.\"\n", + "\n", + " # Extract and clean links\n", + " links = []\n", + " for link_tag in soup.find_all(\"a\", href=True):\n", + " href = link_tag[\"href\"].strip()\n", + " if href and not 
href.startswith((\"mailto:\", \"tel:\", \"javascript:\")):\n", + " full_url = urljoin(self.url, href)\n", + " links.append(full_url)\n", + " \n", + " return text, title, links\n", + " \n", + " except Exception as e:\n", + " return \"Error loading content\", \"Error\", []\n", + "\n", + " def get_contents(self):\n", + " return f\"Webpage Title:\\n{self.title}\\nWebpage Contents:\\n{self.text}\\n\\n\"\n", + "\n", + " # Close the driver\n", + " @classmethod\n", + " def close_driver(cls):\n", + " if cls.shared_driver:\n", + " cls.shared_driver.quit()\n", + " cls.shared_driver = None\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e30d8128-933b-44cc-81c8-ab4c9d86589a", + "metadata": {}, + "outputs": [], + "source": [ + "cardiff = NewWebsite(\"https://cardiff.co/\")\n", + "cardiff.links" + ] + }, + { + "cell_type": "markdown", + "id": "1771af9c-717a-4fca-bbbe-8a95893312c3", + "metadata": {}, + "source": [ + "## First step: Have GPT-4o-mini figure out which links are relevant\n", + "\n", + "### Use a call to gpt-4o-mini to read the links on a webpage, and respond in structured JSON. \n", + "It should decide which links are relevant, and replace relative links such as \"/about\" with \"https://company.com/about\". \n", + "We will use \"one shot prompting\" in which we provide an example of how it should respond in the prompt.\n", + "\n", + "This is an excellent use case for an LLM, because it requires nuanced understanding. Imagine trying to code this without LLMs by parsing and analyzing the webpage - it would be very hard!\n", + "\n", + "Sidenote: there is a more advanced technique called \"Structured Outputs\" in which we require the model to respond according to a spec. We cover this technique in Week 8 during our autonomous Agentic AI project." 
# System prompt for the link-selection call. The example below is the one-shot
# demonstration of the reply format, so it must itself be valid JSON.
link_system_prompt = (
    "You are provided with a list of links found on a webpage. "
    "You are able to decide which of the links would be most relevant to include in a brochure about the company, "
    "such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
)
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""
def get_links(url):
    """Ask the model which links on ``url`` belong in a brochure.

    Scrapes the page with ``Website``, sends its links to the model with a
    JSON response format, and returns the reply parsed with ``json.loads``.
    """
    page = Website(url)
    chat_messages = [
        {"role": "system", "content": link_system_prompt},
        {"role": "user", "content": get_links_user_prompt(page)},
    ]
    completion = openai.chat.completions.create(
        model=MODEL,
        messages=chat_messages,
        response_format={"type": "json_object"},
    )
    reply_text = completion.choices[0].message.content
    return json.loads(reply_text)
# Threaded scraper for linked pages
def get_all_details_rendered_concurrently(url):
    """Return the rendered landing page plus every LLM-selected linked page as one string.

    Linked pages are scraped through ``scrape_link`` in a thread pool. Sections
    are appended in the order the links were returned by ``get_links``, so the
    result is laid out the same way on every run. The shared browser is closed
    when the function exits, whether it succeeds or raises.
    """
    try:
        result = "Landing page:\n"
        result += NewWebsite(url).get_contents()

        # LLM-filtered link generator
        links = get_links(url)
        print("Found links:", links)

        with concurrent.futures.ThreadPoolExecutor() as executor:
            # executor.map yields results in input order (unlike as_completed),
            # which keeps the section order stable for any later truncation.
            for section in executor.map(scrape_link, links["links"]):
                result += section
    finally:
        # Close shared browser even if link selection or scraping raised
        NewWebsite.close_driver()
    return result
"execution_count": null, + "id": "9b863a55-f86c-4e3f-8a79-94e24c1a8cf2", + "metadata": {}, + "outputs": [], + "source": [ + "system_prompt = \"You are an assistant that analyzes the contents of several relevant pages from a company website \\\n", + "and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown.\\\n", + "Include details of company culture, customers and careers/jobs if you have the information.\"\n", + "\n", + "# Or uncomment the lines below for a more humorous brochure - this demonstrates how easy it is to incorporate 'tone':\n", + "\n", + "# system_prompt = \"You are an assistant that analyzes the contents of several relevant pages from a company website \\\n", + "# and creates a short humorous, entertaining, jokey brochure about the company for prospective customers, investors and recruits. Respond in markdown.\\\n", + "# Include details of company culture, customers and careers/jobs if you have the information.\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6ab83d92-d36b-4ce0-8bcc-5bb4c2f8ff23", + "metadata": {}, + "outputs": [], + "source": [ + "def get_brochure_user_prompt(company_name, url):\n", + " user_prompt = f\"You are looking at a company called: {company_name}\\n\"\n", + " user_prompt += f\"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\\n\"\n", + " #user_prompt += get_all_details(url)\n", + " user_prompt += get_all_details_rendered_concurrently(url)\n", + " user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters\n", + " return user_prompt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cd909e0b-1312-4ce2-a553-821e795d7572", + "metadata": {}, + "outputs": [], + "source": [ + "get_brochure_user_prompt(\"Cardiff\", \"https://cardiff.co\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"e44de579-4a1a-4e6a-a510-20ea3e4b8d46", + "metadata": {}, + "outputs": [], + "source": [ + "def create_brochure(company_name, url):\n", + " response = openai.chat.completions.create(\n", + " model=MODEL,\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": system_prompt},\n", + " {\"role\": \"user\", \"content\": get_brochure_user_prompt(company_name, url)}\n", + " ],\n", + " )\n", + " result = response.choices[0].message.content\n", + " display(Markdown(result))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e093444a-9407-42ae-924a-145730591a39", + "metadata": {}, + "outputs": [], + "source": [ + "create_brochure(\"Cardiff\", \"https://cardiff.co\")" + ] + }, + { + "cell_type": "markdown", + "id": "61eaaab7-0b47-4b29-82d4-75d474ad8d18", + "metadata": {}, + "source": [ + "## Finally - a minor improvement\n", + "\n", + "With a small adjustment, we can change this so that the results stream back from OpenAI,\n", + "with the familiar typewriter animation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "51db0e49-f261-4137-aabe-92dd601f7725", + "metadata": {}, + "outputs": [], + "source": [ + "def stream_brochure(company_name, url):\n", + " stream = openai.chat.completions.create(\n", + " model=MODEL,\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": system_prompt},\n", + " {\"role\": \"user\", \"content\": get_brochure_user_prompt(company_name, url)}\n", + " ],\n", + " stream=True\n", + " )\n", + " \n", + " response = \"\"\n", + " display_handle = display(Markdown(\"\"), display_id=True)\n", + " for chunk in stream:\n", + " response += chunk.choices[0].delta.content or ''\n", + " response = response.replace(\"```\",\"\").replace(\"markdown\", \"\")\n", + " update_display(Markdown(response), display_id=display_handle.display_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56bf0ae3-ee9d-4a72-9cd6-edcac67ceb6d", + "metadata": {}, + "outputs": [], + "source": [ + 
"stream_brochure(\"Cardiff\", \"https://cardiff.co\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fdb3f8d8-a3eb-41c8-b1aa-9f60686a653b", + "metadata": {}, + "outputs": [], + "source": [ + "# Try changing the system prompt to the humorous version when you make the Brochure for Hugging Face:\n", + "\n", + "stream_brochure(\"HuggingFace\", \"https://huggingface.co\")" + ] + }, + { + "cell_type": "markdown", + "id": "a27bf9e0-665f-4645-b66b-9725e2a959b5", + "metadata": {}, + "source": [ + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \n", + "

Business applications

\n", + " In this exercise we extended the Day 1 code to make multiple LLM calls, and generate a document.\n", + "\n", + "This is perhaps the first example of Agentic AI design patterns, as we combined multiple calls to LLMs. This will feature more in Week 2, and then we will return to Agentic AI in a big way in Week 8 when we build a fully autonomous Agent solution.\n", + "\n", + "Generating content in this way is one of the very most common Use Cases. As with summarization, this can be applied to any business vertical. Write marketing content, generate a product tutorial from a spec, create personalized email content, and so much more. Explore how you can apply content generation to your business, and try making yourself a proof-of-concept prototype. See what other students have done in the community-contributions folder -- so many valuable projects -- it's wild!\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "14b2454b-8ef8-4b5c-b928-053a15e0d553", + "metadata": {}, + "source": [ + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \n", + "

Before you move to Week 2 (which is tons of fun)

\n", + " Please see the week1 EXERCISE notebook for your challenge for the end of week 1. This will give you some essential practice working with Frontier APIs, and prepare you well for Week 2.\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "17b64f0f-7d33-4493-985a-033d06e8db08", + "metadata": {}, + "source": [ + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \n", + "

A reminder on 3 useful resources

\n", + " 1. The resources for the course are available here.
\n", + " 2. I'm on LinkedIn here and I love connecting with people taking the course!
\n", + " 3. I'm trying out X/Twitter and I'm at @edwarddonner and hoping people will teach me how it's done.. \n", + "
\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "6f48e42e-fa7a-495f-a5d4-26bfc24d60b6", + "metadata": {}, + "source": [ + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \n", + "

Finally! I have a special request for you

\n", + " \n", + " My editor tells me that it makes a MASSIVE difference when students rate this course on Udemy - it's one of the main ways that Udemy decides whether to show it to others. If you're able to take a minute to rate this, I'd be so very grateful! And regardless - always please reach out to me at ed@edwarddonner.com if I can help at any point.\n", + " \n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8d3e1a1-ba54-4907-97c5-30f89a24775b", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/week1/community-contributions/week1-EXERCISE_rewrite-internal-tools-code-and-UI-image_jeannine-jordan.ipynb b/week1/community-contributions/week1-EXERCISE_rewrite-internal-tools-code-and-UI-image_jeannine-jordan.ipynb new file mode 100644 index 0000000..6449246 --- /dev/null +++ b/week1/community-contributions/week1-EXERCISE_rewrite-internal-tools-code-and-UI-image_jeannine-jordan.ipynb @@ -0,0 +1,349 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "fe12c203-e6a6-452c-a655-afb8a03a4ff5", + "metadata": {}, + "source": [ + "# End of week 1 exercise\n", + "\n", + "To demonstrate your familiarity with OpenAI API, and also Ollama, build a tool that takes a technical question, \n", + "and responds with an explanation. This is a tool that you will be able to use yourself during the course!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1070317-3ed9-4659-abe3-828943230e03", + "metadata": {}, + "outputs": [], + "source": [ + "# imports\n", + "\n", + "import os\n", + "import requests\n", + "import json\n", + "from typing import List\n", + "from dotenv import load_dotenv\n", + "from bs4 import BeautifulSoup\n", + "from IPython.display import Markdown, display, update_display, Image\n", + "from openai import OpenAI\n", + "from selenium import webdriver\n", + "from selenium.webdriver.chrome.options import Options\n", + "from selenium.webdriver.chrome.service import Service\n", + "from selenium.webdriver.common.by import By\n", + "from selenium.webdriver.support.ui import WebDriverWait\n", + "from selenium.webdriver.support import expected_conditions as EC\n", + "from webdriver_manager.chrome import ChromeDriverManager\n", + "from urllib.parse import urlparse, urljoin\n", + "import time\n", + "import random\n", + "import concurrent.futures\n", + "import re" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4a456906-915a-4bfd-bb9d-57e505c5093f", + "metadata": {}, + "outputs": [], + "source": [ + "# constants\n", + "\n", + "MODEL = 'gpt-4o-mini'\n", + "openai = OpenAI()\n", + "MODEL_GPT = 'gpt-4o-mini'\n", + "MODEL_LLAMA = 'llama3.2'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a8d7923c-5f28-4c30-8556-342d7c8497c1", + "metadata": {}, + "outputs": [], + "source": [ + "# set up environment\n", + "\n", + "# A modified class to fetch and parse fully rendered pages\n", + "class NewWebsite:\n", + " shared_driver = None # Class variable to share browser instance\n", + "\n", + " def __init__(self, url, driver=None):\n", + " self.url = url\n", + " self.driver = driver or NewWebsite._get_shared_driver()\n", + " self.text, self.title, self.links = self._scrape_content()\n", + " \n", + " @classmethod\n", + " def _get_shared_driver(cls):\n", + " if cls.shared_driver is None:\n", + " # Set up 
headless Chrome options\n", + " options = Options()\n", + " options.add_argument(\"--headless=new\")\n", + " options.add_argument(\"--disable-gpu\")\n", + " options.add_argument(\"--no-sandbox\")\n", + " options.add_argument(\"--disable-dev-shm-usage\")\n", + " options.add_argument(\"user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\")\n", + "\n", + " service = Service(ChromeDriverManager().install())\n", + " cls.shared_driver = webdriver.Chrome(service=service, options=options)\n", + " return cls.shared_driver\n", + "\n", + " def _scrape_content(self):\n", + " try:\n", + " self.driver.get(self.url)\n", + " # Mimick human browsing behavior without overloading the server\n", + " WebDriverWait(self.driver, 15).until(EC.presence_of_element_located((By.TAG_NAME, \"a\")))\n", + " # Allow JS-rendered content to settle\n", + " time.sleep(2)\n", + "\n", + " # Get the page source after rendering\n", + " soup = BeautifulSoup(self.driver.page_source, \"html.parser\")\n", + " \n", + " for tag in soup([\"script\", \"style\", \"img\", \"input\"]):\n", + " tag.decompose()\n", + " \n", + " title = soup.title.string.strip() if soup.title and soup.title.string else \"No title found\"\n", + " body = soup.body\n", + " text = soup.body.get_text(separator=\"\\n\", strip=True) if body else \"No content found.\"\n", + "\n", + " # Extract and clean links\n", + " links = []\n", + " for link_tag in soup.find_all(\"a\", href=True):\n", + " href = link_tag[\"href\"].strip()\n", + " if href and not href.startswith((\"mailto:\", \"tel:\", \"javascript:\")):\n", + " full_url = urljoin(self.url, href)\n", + " links.append(full_url)\n", + " \n", + " return text, title, links\n", + " \n", + " except Exception as e:\n", + " return \"Error loading content\", \"Error\", []\n", + "\n", + " def get_contents(self):\n", + " return f\"Webpage Title:\\n{self.title}\\nWebpage Contents:\\n{self.text}\\n\\n\"\n", + "\n", + " # Close the 
driver\n", + " @classmethod\n", + " def close_driver(cls):\n", + " if cls.shared_driver:\n", + " cls.shared_driver.quit()\n", + " cls.shared_driver = None\n", + "\n", + "link_system_prompt = \"You are provided with a list of links found on a webpage. \\\n", + "You are able to decide which of the links would be most relevant to include in a brochure about the company, \\\n", + "such as links to an About page, or a Company page, or Careers/Jobs pages.\\n\"\n", + "link_system_prompt += \"You should respond in JSON as in this example:\"\n", + "link_system_prompt += \"\"\"\n", + "{\n", + " \"links\": [\n", + " {\"type\": \"about page\", \"url\": \"https://full.url/goes/here/about\"},\n", + " {\"type\": \"careers page\": \"url\": \"https://another.full.url/careers\"}\n", + " ]\n", + "}\n", + "\"\"\"\n", + "\n", + "def get_links_user_prompt(website):\n", + " user_prompt = f\"Here is the list of links on the website of {website.url} - \"\n", + " user_prompt += \"please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. 
\\\n", + "Do not include Terms of Service, Privacy, email links.\\n\"\n", + " user_prompt += \"Links (some might be relative links):\\n\"\n", + " user_prompt += \"\\n\".join(website.links)\n", + " return user_prompt\n", + "\n", + "def get_links(url):\n", + " website = NewWebsite(url)\n", + " response = openai.chat.completions.create(\n", + " model=MODEL,\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": link_system_prompt},\n", + " {\"role\": \"user\", \"content\": get_links_user_prompt(website)}\n", + " ],\n", + " response_format={\"type\": \"json_object\"}\n", + " )\n", + " result = response.choices[0].message.content\n", + " return json.loads(result)\n", + "\n", + "def scrape_link(link):\n", + " try:\n", + " page = NewWebsite(link[\"url\"])\n", + " return f\"\\n\\n{link['type']}\\n{page.get_contents()}\"\n", + " except Exception as e:\n", + " return f\"\\n\\n{link['type']}\\nError loading page: {e}\"\n", + "\n", + "# Threaded scraper for linked pages\n", + "def get_all_details_rendered_concurrently(url):\n", + " result = \"Landing page:\\n\"\n", + " result += NewWebsite(url).get_contents()\n", + "\n", + " # LLM-filtered link generator\n", + " links = get_links(url)\n", + " print(\"Found links:\", links)\n", + "\n", + " with concurrent.futures.ThreadPoolExecutor() as executor:\n", + " future_to_link = {executor.submit(scrape_link, link): link for link in links[\"links\"]}\n", + " for future in concurrent.futures.as_completed(future_to_link):\n", + " result += future.result()\n", + "\n", + " # Close shared browser\n", + " NewWebsite.close_driver()\n", + " return result\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3f0d0137-52b0-47a8-81a8-11a90a010798", + "metadata": {}, + "outputs": [], + "source": [ + "# here is the question; type over this to ask something new\n", + "\n", + "system_prompt = \"You are an LLM Engineer that analyzes the contents of several relevant pages from a company website \\\n", + "rewrites internal 
tools and systems and rebuilds them end-to-end, starting from scratch. Starting with the online application at cardiff.co/apply, \\\n", + "Tell me why you're best suited to be the lead of this project and work with our 12 year resident developer to implement a \\\n", + "state of the art solution in record time. Include backend architecture, model orchestration, how you handle latency, cost and user experience, \\\n", + "and details of how you would achieve this goal based on company culture and industries served if you have the information, \\\n", + "and walk me through the details like you're explaining it to a sharp product owner. Respond in markdown.\"\\\n", + "\n", + "\n", + "def get_solution_user_prompt(company_name, url):\n", + " user_prompt = f\"You are looking at a company called: {company_name}\\n\"\n", + " user_prompt += f\"Here are the contents of its landing page and other relevant pages; use this information to build a solution to rewrite the company's application in markdown.\\n\"\n", + " #user_prompt += get_all_details(url)\n", + " user_prompt += get_all_details_rendered_concurrently(url)\n", + " user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters\n", + " return user_prompt\n", + "\n", + "def create_solution(company_name, url):\n", + " response = openai.chat.completions.create(\n", + " model=MODEL,\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": system_prompt},\n", + " {\"role\": \"user\", \"content\": get_solution_user_prompt(company_name, url)}\n", + " ],\n", + " )\n", + " result = response.choices[0].message.content\n", + " display(Markdown(result))\n", + "\n", + " return result\n", + "\n", + "#create_solution(\"Cardiff\", \"https://cardiff.co\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60ce7000-a4a5-4cce-a261-e75ef45063b4", + "metadata": {}, + "outputs": [], + "source": [ + "# Get gpt-4o-mini to answer, with streaming\n", + "\n", + "new_system_prompt = \"You are a Senior 
Engineer that analyzes the planned solution given to you for a company website \\\n", + "and you rewrite code for rebuilding internal tools and systems end-to-end based on the proposed solutions. \\\n", + "Start with the online application at cardiff.co/apply, use canvas and write code for the proposed solution \\\n", + "in the appropriate language that best suits the task for backend architecture, model orchestration, how you handle latency, cost and user experience wherever possible.\"\n", + "\n", + "output_dir = \"cardiff_rebuild_output\"\n", + "os.makedirs(output_dir, exist_ok=True)\n", + "\n", + "def save_code_blocks(markdown_text, base_filename=\"cardiff_code\"):\n", + " output_dir = \"cardiff_rebuild_output\"\n", + " os.makedirs(output_dir, exist_ok=True)\n", + " \n", + " code_blocks = re.findall(r\"```(.*?)\\n(.*?)```\", markdown_text, re.DOTALL)\n", + " saved_files = []\n", + "\n", + " for idx, (language, code) in enumerate(code_blocks, 1):\n", + " ext = language.strip() if language else \"txt\"\n", + " filename = f\"{base_filename}_part{idx}.{ext}\"\n", + " filepath = os.path.join(output_dir, filename)\n", + " with open(filepath, \"w\", encoding=\"utf-8\") as f:\n", + " f.write(code)\n", + " saved_files.append(filepath)\n", + "\n", + " return saved_files\n", + "\n", + "def develop_from_proposal(proposal_text, company_name):\n", + " # Stream code generation from GPT-4o\n", + " system = \"You are a senior software engineer. Use the following proposal to generate production-ready code to \\\n", + " implement the backend, frontend, and any orchestration described. 
Write clean, documented code in markdown format.\"\n", + " \n", + " stream = openai.chat.completions.create(\n", + " model=\"gpt-4o\",\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": system},\n", + " {\"role\": \"user\", \"content\": proposal_text}\n", + " ],\n", + " stream=True\n", + " )\n", + "\n", + " response = \"\"\n", + " display_handle = display(Markdown(\"\"), display_id=True)\n", + " for chunk in stream:\n", + " content = chunk.choices[0].delta.content or \"\"\n", + " response += content\n", + " update_display(Markdown(response), display_id=display_handle.display_id)\n", + "\n", + " saved_files = save_code_blocks(response)\n", + " \n", + " # Generate a UI design mockup image\n", + " image_prompt = f\"A modern, mobile-friendly UI wireframe for a business loan application system for {company_name}. Clean layout, input fields for business name, revenue, loan amount, industry, and contact info. Includes a step-by-step progress bar, submit button, and secure branding.\"\n", + " \n", + " img_response = openai.images.generate(\n", + " model=\"dall-e-3\",\n", + " prompt=image_prompt,\n", + " n=1,\n", + " size=\"1024x1024\"\n", + " )\n", + " \n", + " image_url = img_response.data[0].url\n", + " img_path = os.path.join(output_dir, f\"{company_name.lower()}_ui_mockup.png\")\n", + " with open(img_path, 'wb') as handler:\n", + " handler.write(requests.get(image_url).content)\n", + "\n", + " print(\"Code files saved to:\", saved_files)\n", + " print(\"UI mockup saved at:\", img_path)\n", + "\n", + " display(Markdown(\"### Proposed UI Design\"))\n", + " display(Image(url=image_url))\n", + "\n", + "proposal = create_solution(\"Cardiff\", \"https://cardiff.co\")\n", + "develop_from_proposal(proposal, \"Cardiff\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f7c8ea8-4082-4ad0-8751-3301adcf6538", + "metadata": {}, + "outputs": [], + "source": [ + "# Get Llama 3.2 to answer" + ] + } + ], + "metadata": { + "kernelspec": { + 
"display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 0010efa94631ca1ed3f918aec4d2f2b29c8b5752 Mon Sep 17 00:00:00 2001 From: "Marcus.Rosen" Date: Mon, 2 Jun 2025 16:33:02 +1000 Subject: [PATCH 14/23] Added LiteLLM exampel for Week2 Day2 --- .../Week2_Day2_Litellm.ipynb | 420 ++++++++++++++++++ 1 file changed, 420 insertions(+) create mode 100644 week2/community-contributions/Week2_Day2_Litellm.ipynb diff --git a/week2/community-contributions/Week2_Day2_Litellm.ipynb b/week2/community-contributions/Week2_Day2_Litellm.ipynb new file mode 100644 index 0000000..af49175 --- /dev/null +++ b/week2/community-contributions/Week2_Day2_Litellm.ipynb @@ -0,0 +1,420 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "6a08763a-aed6-4f91-94d0-80a3c0e2665b", + "metadata": {}, + "source": [ + "### Weeks 2 - Day 2 - Gradio Chatbot with LiteLLM (Model Routing)" + ] + }, + { + "cell_type": "markdown", + "id": "a4f38c58-5ceb-4d5e-b538-c1acdc881f73", + "metadata": {}, + "source": [ + "**Author** : [Marcus Rosen](https://github.com/MarcusRosen)" + ] + }, + { + "cell_type": "markdown", + "id": "36f4814a-2bfc-4631-97d7-7a474fa1cc8e", + "metadata": {}, + "source": [ + "[LiteLLM](https://docs.litellm.ai/docs/) provides the abilitty to call different LLM providers via a unified interface, returning results in OpenAI compatible formats.\n", + "\n", + "Features:\n", + "- Model Selection in Gradio (Anthropic, OpenAI, Gemini)\n", + "- Single Inference function for all model providers via LiteLLM (call_llm)\n", + "- Streaming **NOTE:** Bug when trying to stream in Gradio, but works directly in Notebook\n", + "- Debug Tracing" + ] + }, + { + "cell_type": 
"code", + "execution_count": 109, + "id": "b6c12598-4773-4f85-93ca-0128d74fbca0", + "metadata": {}, + "outputs": [], + "source": [ + "from litellm import completion\n", + "import gradio as gr\n", + "from dotenv import load_dotenv\n", + "from bs4 import BeautifulSoup\n", + "import os\n", + "import requests\n", + "import json" + ] + }, + { + "cell_type": "markdown", + "id": "d24be370-5347-47fb-a58e-21a1b5409ab2", + "metadata": {}, + "source": [ + "#### Load API Keys" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "e03afbe9-16aa-434c-a701-b3bfe75e927d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "OpenAI API Key exists and begins sk-proj-\n", + "Anthropic API Key exists and begins sk-ant-\n", + "Google API Key exists and begins AIzaSyDC\n" + ] + } + ], + "source": [ + "# Load environment variables in a file called .env\n", + "# Print the key prefixes to help with any debugging\n", + "\n", + "load_dotenv(override=True)\n", + "openai_api_key = os.getenv('OPENAI_API_KEY')\n", + "anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')\n", + "google_api_key = os.getenv('GEMINI_API_KEY')\n", + "\n", + "if openai_api_key:\n", + " print(f\"OpenAI API Key exists and begins {openai_api_key[:8]}\")\n", + "else:\n", + " print(\"OpenAI API Key not set\")\n", + " \n", + "if anthropic_api_key:\n", + " print(f\"Anthropic API Key exists and begins {anthropic_api_key[:7]}\")\n", + "else:\n", + " print(\"Anthropic API Key not set\")\n", + "\n", + "if google_api_key:\n", + " print(f\"Google API Key exists and begins {google_api_key[:8]}\")\n", + " # import google.generativeai\n", + " # google.generativeai.configure()\n", + "else:\n", + " print(\"Gemini API Key not set\")" + ] + }, + { + "cell_type": "markdown", + "id": "66e46447-0e73-49ef-944a-d1e8fae4986e", + "metadata": {}, + "source": [ + "### Use LiteLLM to abstract out the model provider" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "id": 
"473c2029-ca74-4f1e-92ac-05f7817ff7df", + "metadata": {}, + "outputs": [], + "source": [ + "def call_llm(model, system_prompt, user_prompt, json_format_response=False, streaming=False):\n", + " if DEBUG_OUTPUT: \n", + " print(\"call_llm()\")\n", + " print(f\"streaming={streaming}\")\n", + " print(f\"json_format_response={json_format_response}\")\n", + " \n", + " messages = [\n", + " {\"role\": \"system\", \"content\": system_prompt},\n", + " {\"role\": \"user\", \"content\": user_prompt}\n", + " ]\n", + "\n", + " payload = {\n", + " \"model\": model,\n", + " \"messages\": messages\n", + " }\n", + " # Use Json Reponse Format\n", + " # Link: https://docs.litellm.ai/docs/completion/json_mode\n", + " if json_format_response:\n", + " payload[\"response_format\"]: { \"type\": \"json_object\" }\n", + " \n", + " if streaming:\n", + " payload[\"stream\"] = True\n", + " response = completion(**payload)\n", + " # Return a generator expression instead of using yield in the function\n", + " return (part.choices[0].delta.content or \"\" for part in response)\n", + " else:\n", + " response = completion(**payload)\n", + " return response[\"choices\"][0][\"message\"][\"content\"]" + ] + }, + { + "cell_type": "markdown", + "id": "f45e0972-a6a0-4237-8a69-e6f165f30e0d", + "metadata": {}, + "source": [ + "### Brochure building functions" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "id": "c76d4ff9-0f18-49d0-a9b5-2c6c0bad359a", + "metadata": {}, + "outputs": [], + "source": [ + "# A class to represent a Webpage\n", + "\n", + "# Some websites need you to use proper headers when fetching them:\n", + "headers = {\n", + " \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n", + "}\n", + "\n", + "class Website:\n", + " \"\"\"\n", + " A utility class to represent a Website that we have scraped, now with links\n", + " \"\"\"\n", + "\n", + " def __init__(self, url):\n", + " self.url = url\n", + " 
response = requests.get(url, headers=headers)\n", + " self.body = response.content\n", + " soup = BeautifulSoup(self.body, 'html.parser')\n", + " self.title = soup.title.string if soup.title else \"No title found\"\n", + " if soup.body:\n", + " for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n", + " irrelevant.decompose()\n", + " self.text = soup.body.get_text(separator=\"\\n\", strip=True)\n", + " else:\n", + " self.text = \"\"\n", + " links = [link.get('href') for link in soup.find_all('a')]\n", + " self.links = [link for link in links if link]\n", + "\n", + " def get_contents(self):\n", + " return f\"Webpage Title:\\n{self.title}\\nWebpage Contents:\\n{self.text}\\n\\n\"" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "ff41b687-3a46-4bca-a031-1148b91a4fdf", + "metadata": {}, + "outputs": [], + "source": [ + "def get_links(url, model):\n", + " if DEBUG_OUTPUT:\n", + " print(\"get_links()\")\n", + " website = Website(url)\n", + "\n", + " link_system_prompt = \"You are provided with a list of links found on a webpage. \\\n", + " You are able to decide which of the links would be most relevant to include in a brochure about the company, \\\n", + " such as links to an About page, or a Company page, or Careers/Jobs pages.\\n\"\n", + " link_system_prompt += \"You should respond in raw JSON exactly as specified in this example. 
DO NOT USE MARKDOWN.\"\n", + " link_system_prompt += \"\"\"\n", + " {\n", + " \"links\": [\n", + " {\"type\": \"about page\", \"url\": \"https://full.url/goes/here/about\"},\n", + " {\"type\": \"careers page\": \"url\": \"https://another.full.url/careers\"}\n", + " ]\n", + " }\n", + " \"\"\"\n", + " \n", + " result = call_llm(model=model, \n", + " system_prompt=link_system_prompt, \n", + " user_prompt=get_links_user_prompt(website), \n", + " json_format_response=True, \n", + " streaming=False)\n", + " if DEBUG_OUTPUT:\n", + " print(result)\n", + " return json.loads(result)\n", + "\n", + "def get_links_user_prompt(website):\n", + " if DEBUG_OUTPUT:\n", + " print(\"get_links_user_prompt()\")\n", + " \n", + " user_prompt = f\"Here is the list of links on the website of {website.url} - \"\n", + " user_prompt += \"please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \\\n", + "Do not include Terms of Service, Privacy, email links.\\n\"\n", + " user_prompt += \"Links (some might be relative links):\\n\"\n", + " user_prompt += \"\\n\".join(website.links)\n", + "\n", + " if DEBUG_OUTPUT:\n", + " print(user_prompt)\n", + " \n", + " return user_prompt\n", + "\n", + "def get_all_details(url, model):\n", + " if DEBUG_OUTPUT:\n", + " print(\"get_all_details()\")\n", + " \n", + " result = \"Landing page:\\n\"\n", + " result += Website(url).get_contents()\n", + " links = get_links(url, model)\n", + " if DEBUG_OUTPUT:\n", + " print(\"Found links:\", links)\n", + " for link in links[\"links\"]:\n", + " result += f\"\\n\\n{link['type']}\\n\"\n", + " result += Website(link[\"url\"]).get_contents()\n", + " return result\n", + "\n", + "def get_brochure_user_prompt(company_name, url, model):\n", + " \n", + " if DEBUG_OUTPUT:\n", + " print(\"get_brochure_user_prompt()\")\n", + " \n", + " user_prompt = f\"You are looking at a company called: {company_name}\\n\"\n", + " user_prompt += f\"Here are the contents 
of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\\n\"\n", + " user_prompt += get_all_details(url, model)\n", + " user_prompt = user_prompt[:5000] # Truncate if more than 5,000 characters\n", + " return user_prompt\n" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "id": "cf7512a1-a498-44e8-a234-9affb72efe60", + "metadata": {}, + "outputs": [], + "source": [ + "def create_brochure(company_name, url, model, streaming):\n", + "\n", + " system_prompt = \"You are an assistant that analyzes the contents of several relevant pages from a company website \\\n", + "and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown.\\\n", + "Include details of company culture, customers and careers/jobs if you have the information.\"\n", + " if streaming:\n", + " result = call_llm(model=model, system_prompt=system_prompt, user_prompt=get_brochure_user_prompt(company_name, url, model), streaming=True)\n", + " return (p for p in result)\n", + " else: \n", + " return call_llm(model=model, system_prompt=system_prompt, user_prompt=get_brochure_user_prompt(company_name, url, model), streaming=False)\n", + " " + ] + }, + { + "cell_type": "markdown", + "id": "ecb6d212-ddb6-4170-81bf-8f3ea54479f8", + "metadata": {}, + "source": [ + "#### Testing Model before implementing Gradio" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "id": "de89843a-08ac-4431-8c83-21a93c05f764", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# Rio Tinto: Providing the Materials for a Sustainable Future\n", + "\n", + "## About Rio Tinto\n", + "\n", + "Rio Tinto is a global mining and metals company, operating in 35 countries with over 60,000 employees. Their purpose is to find better ways to provide the materials the world needs. 
Continuous improvement and innovation are at the core of their DNA, as they work to responsibly supply the metals and minerals critical for urbanization and the transition to a low-carbon economy.\n", + "\n", + "## Our Products\n", + "\n", + "Rio Tinto's diverse portfolio includes:\n", + "\n", + "- Iron Ore: The primary raw material used to make steel, which is strong, long-lasting and cost-efficient.\n", + "- Aluminium: A lightweight, durable and recyclable metal.\n", + "- Copper: A tough, malleable, corrosion-resistant and recyclable metal that is an excellent conductor of heat and electricity.\n", + "- Lithium: The lightest of all metals, a key element for low-carbon technologies.\n", + "- Diamonds: Ethically-sourced, high-quality diamonds.\n", + "\n", + "## Sustainability and Innovation\n", + "\n", + "Sustainability is at the heart of Rio Tinto's operations. They are targeting net zero emissions by 2050 and investing in nature-based solutions to complement their decarbonization efforts. Innovation is a key focus, with research and development into new technologies to improve efficiency and reduce environmental impact.\n", + "\n", + "## Careers and Culture\n", + "\n", + "Rio Tinto values its 60,000 employees and is committed to fostering a diverse and inclusive workplace. They offer a wide range of career opportunities, from mining and processing to engineering, finance, and more. Rio Tinto's culture is centered on safety, collaboration, and continuous improvement, with a strong emphasis on sustainability and responsible business practices.\n", + "\n", + "## Conclusion\n", + "\n", + "Rio Tinto is a global leader in the mining and metals industry, providing the materials essential for a sustainable future. 
Through their commitment to innovation, sustainability, and their talented workforce, Rio Tinto is well-positioned to meet the world's growing demand for critical resources.\n", + "\u001b[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new\u001b[0m\n", + "LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.\n", + "\n", + ". at 0x7f80ca5da0c0>\n" + ] + } + ], + "source": [ + "MODEL=\"claude-3-haiku-20240307\"\n", + "DEBUG_OUTPUT=False\n", + "streaming=True\n", + "result = create_brochure(company_name=\"Rio Tinto\", url=\"http://www.riotinto.com\", model=MODEL, streaming=streaming)\n", + "\n", + "if streaming:\n", + " for chunk in result:\n", + " print(chunk, end=\"\", flush=True)\n", + "else:\n", + " print(result)\n" + ] + }, + { + "cell_type": "markdown", + "id": "1f330c92-6280-4dae-b4d8-717a56edb236", + "metadata": {}, + "source": [ + "#### Gradio Setup\n", + "Associate Dropdown values with the model we want to use.\n", + "Link: https://www.gradio.app/docs/gradio/dropdown#initialization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2f38862-3728-4bba-9e16-6f9fab276145", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "DEBUG_OUTPUT=True\n", + "view = gr.Interface(\n", + " fn=create_brochure,\n", + " inputs=[\n", + " gr.Textbox(label=\"Company name:\"),\n", + " gr.Textbox(label=\"Landing page URL including http:// or https://\"),\n", + " gr.Dropdown(choices=[(\"GPT 4o Mini\", \"gpt-4o-mini\"), \n", + " (\"Claude Haiku 3\", \"claude-3-haiku-20240307\"), \n", + " (\"Gemini 2.0 Flash\", \"gemini/gemini-2.0-flash\")], \n", + " label=\"Select model\"),\n", + " gr.Checkbox(label=\"Stream\")\n", + " ],\n", + " outputs=[gr.Markdown(label=\"Brochure:\")],\n", + " flagging_mode=\"never\"\n", + ")\n", + "view.launch()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0981136-2067-43b8-b17d-83560dd609ce", + "metadata": {}, + "outputs": [], + "source": 
[] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 7bd46ba43b75f5a7958a291aa9d8f036d463ea1d Mon Sep 17 00:00:00 2001 From: habibmir808 Date: Mon, 2 Jun 2025 19:40:49 +0600 Subject: [PATCH 15/23] user can summarize research papers by website link --- .../day1_summarize_research_papers.ipynb | 307 ++++++++++++++++++ 1 file changed, 307 insertions(+) create mode 100644 week1/community-contributions/day1_summarize_research_papers.ipynb diff --git a/week1/community-contributions/day1_summarize_research_papers.ipynb b/week1/community-contributions/day1_summarize_research_papers.ipynb new file mode 100644 index 0000000..246da69 --- /dev/null +++ b/week1/community-contributions/day1_summarize_research_papers.ipynb @@ -0,0 +1,307 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "3ba06289-d17a-4ccd-85f5-2b79956d4e59", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install selenium" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb6636be-e43f-4896-aadd-cafda003ed4e", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install -q -U google-genai" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dfe66209-1d33-4292-80f1-20e11baf4bc3", + "metadata": {}, + "outputs": [], + "source": [ + "from selenium import webdriver\n", + "from selenium.webdriver.chrome.options import Options\n", + "from selenium.webdriver.chrome.service import Service\n", + "from bs4 import BeautifulSoup\n", + "import time\n", + "import os\n", + "from dotenv import load_dotenv\n", + "from IPython.display import Markdown, 
class Website:
    """Snapshot of a web page rendered by Selenium-driven Chrome.

    The constructor does all the work: it launches Chrome, loads ``url``,
    sleeps ``wait_time`` seconds so the page can render, captures the HTML,
    closes the browser, and parses the HTML with BeautifulSoup.

    Attributes:
        url: The URL that was requested.
        wait_time: Seconds slept after navigation before reading the page.
        title: Text of the ``<title>`` tag, or "No title found" when the tag
            is missing or has no plain-string content.
        text: Newline-separated text of ``<body>`` with script/style/img/input
            elements removed; an empty string when the page has no body.
    """

    def __init__(self, url, driver_path=None, wait_time=3):
        """Load ``url`` in Chrome and populate ``title`` and ``text``.

        Args:
            url: Page to load.
            driver_path: Optional path to a chromedriver executable; when
                omitted, ``Service()`` is created without an explicit path.
            wait_time: Seconds to wait after navigation before scraping.
        """
        self.url = url
        self.wait_time = wait_time

        # Chrome settings
        options = Options()
        # options.add_argument("--headless")
        # Headless mode runs the browser in the background (invisible).
        # However, some websites (like openai.com) block headless browsers.
        # So if this line is active, the page may not load correctly and you may not get the full content.
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        # Chrome expects "width,height"; the previous "1920x1080" form is not
        # the documented syntax for this switch.
        options.add_argument("--window-size=1920,1080")

        # Driver path
        service = Service(executable_path=driver_path) if driver_path else Service()

        # Start browser; always shut it down, even if navigation fails, so a
        # failed scrape does not leave a Chrome process behind.
        driver = webdriver.Chrome(service=service, options=options)
        try:
            driver.get(url)

            # Wait for the page to finish loading
            time.sleep(self.wait_time)

            html = driver.page_source
        finally:
            driver.quit()

        # Analysis with BeautifulSoup
        soup = BeautifulSoup(html, 'html.parser')

        # ``.string`` is None when <title> is empty or has nested markup.
        has_title = soup.title is not None and soup.title.string is not None
        self.title = soup.title.string if has_title else "No title found"

        body = soup.body
        if body is None:
            # Nothing to extract (e.g. a non-HTML response).
            self.text = ""
            return

        # Clean irrelevant tags
        for irrelevant in body(["script", "style", "img", "input"]):
            irrelevant.decompose()

        self.text = body.get_text(separator="\n", strip=True)
Follow this workflow rigorously:\n", + "\n", + "Step 1: Document Verification\n", + "Verify if the input is a research paper by checking for:\n", + "\n", + "Presence of academic sections (Abstract, Introduction, Methodology, Results, Discussion, References)\n", + "\n", + "Technical/scholarly language\n", + "\n", + "Citations (in-text or bibliography)\n", + "\n", + "Research claims or data analysis\n", + "If NOT a research paper:\n", + "→ Respond: \"This doesn't appear to be a research paper. Please upload peer-reviewed academic literature for summarization.\"\n", + "\n", + "Step 2: Structured Summary (If verified)\n", + "Generate a 5-section summary in this exact format:\n", + "\n", + "1. Research Question\n", + "[Identify core problem/gap addressed in 1 sentence]\n", + "\n", + "2. Methodology\n", + "[Study design, data sources, analytical techniques in 2 bullet points]\n", + "\n", + "3. Key Findings\n", + "[3-4 quantified results with numerical evidence from tables/figures]\n", + "\n", + "4. Limitations\n", + "[2 major constraints acknowledged by authors]\n", + "\n", + "5. Significance\n", + "[Impact on field & practical implications in 1 sentence]\n", + "\n", + "Critical Rules:\n", + "Accuracy Priority: Never invent data. Write \"Not specified\" for missing elements\n", + "\n", + "Source Anchoring: Cite page/paragraph numbers for claims (e.g., \"Fig 3 shows 24% improvement\")\n", + "\n", + "Jargon Handling: Simplify complex terms using: [Technical Term → Layman Explanation] inline\n", + "\n", + "Bias Alert: Flag any undeclared funding/sponsorship conflicts\n", + "\n", + "Output Format: Strict Markdown with section headers, 200-word maximum\n", + "\n", + "Example Output:\n", + "1. Research Question\n", + "How does microplastic concentration affect zebrafish neural development?\n", + "\n", + "2. 
def user_prompt_for(website):
    """Build the user prompt asking for a markdown summary of ``website``.

    Args:
        website: Object exposing ``title`` and ``text`` attributes.

    Returns:
        A string made of a title line, the summarisation instruction, and the
        page text, in that order.
    """
    heading = f"You are looking at a website titled {website.title}"
    instruction = (
        "\nThe contents of this website is as follows; "
        "please provide a summary of this website in markdown.\n\n"
    )
    return heading + instruction + website.text


def summarize(url):
    """Scrape ``url`` and return the model's summary text.

    Creates a ``Website`` for the URL, then calls
    ``client.models.generate_content`` with the module-level ``system_prompt``
    as the system instruction and the output of ``user_prompt_for`` as the
    contents.
    """
    page = Website(url)
    return client.models.generate_content(
        model="gemini-2.5-flash-preview-05-20",
        config=types.GenerateContentConfig(
            system_instruction=system_prompt),
        contents=user_prompt_for(page)
    ).text


def display_summary(url):
    """Summarise ``url`` and render the result as Markdown in the notebook."""
    display(Markdown(summarize(url)))
"source": [ + "# imports\n", + "\n", + "import os\n", + "import requests\n", + "from dotenv import load_dotenv\n", + "from openai import OpenAI\n", + "\n", + "# Load environment variables in a file called .env\n", + "\n", + "load_dotenv(override=True)\n", + "api_key = os.getenv('OPENAI_API_KEY')\n", + "\n", + "# Check the key\n", + "\n", + "if not api_key:\n", + " print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n", + "elif not api_key.startswith(\"sk-proj-\"):\n", + " print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook\")\n", + "elif api_key.strip() != api_key:\n", + " print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n", + "else:\n", + " print(\"API key found and looks good so far!\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "abdb8417-c5dc-44bc-9bee-2e059d162699", + "metadata": {}, + "outputs": [], + "source": [ + "# Define our system prompt - you can experiment with this later, changing the last sentence to 'Respond in markdown in Spanish.\"\n", + "\n", + "system_prompt = \"Summarize the following product information for comparison.\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38245e18", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# 📦 Install required packages (run once)\n", + "!pip install selenium bs4 requests\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88ae528b-aefe-4c64-b927-676e739194af", + "metadata": {}, + "outputs": [], + "source": [ + "openai = OpenAI()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4a831a5", + "metadata": {}, + "outputs": [], + "source": [ + "def summarize_with_openai(text, model=\"gpt-4o-mini\"):\n", + " response = 
def scrape_text_from_url(url):
    """Return the visible text of the ``<body>`` of ``url``.

    Loads the page in headless Chrome, waits three seconds for it to render,
    and reads the text of the body element.

    Args:
        url: Page to load.

    Returns:
        The body text with leading and trailing whitespace removed.
    """
    options = Options()
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        time.sleep(3)  # fixed pause to let the page render

        # You can tune this selector depending on the site
        body = driver.find_element(By.TAG_NAME, 'body')
        return body.text.strip()
    finally:
        # Always close the browser, even when navigation or lookup fails,
        # so a failed scrape does not leave a Chrome process running.
        driver.quit()


def summarize_with_ollama(text):
    """Summarise ``text`` by piping a prompt to ``ollama run llama3.2``.

    Args:
        text: Product description to summarise.

    Returns:
        The command's stripped standard output on success. If the command
        exits non-zero, or cannot be started at all (for example the
        ``ollama`` executable is not installed), a string beginning with
        "Error running ollama: " is returned instead of raising.
    """
    prompt = f"Summarize the following product description:\n\n{text}\n\nSummary:"
    try:
        result = subprocess.run(
            ["ollama", "run", "llama3.2"],
            input=prompt,
            capture_output=True, text=True, check=True, encoding="utf-8"
        )
    except subprocess.CalledProcessError as e:
        return f"Error running ollama: {e.stderr}"
    except OSError as e:
        # Raised when the executable is missing or cannot be launched.
        return f"Error running ollama: {e}"
    return result.stdout.strip()
URLs and summarize\n", + "product_urls = {\n", + " \"iPhone 15 Pro\": \"https://www.apple.com/in/iphone-15-pro/\",\n", + " \"Samsung S24 Ultra\": \"https://www.samsung.com/in/smartphones/galaxy-s24-ultra/\",\n", + "}\n", + "\n", + "product_texts = {}\n", + "\n", + "for name, url in product_urls.items():\n", + " print(f\"Scraping {name} ...\")\n", + " product_texts[name] = scrape_text_from_url(url)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ebd5a20", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# 📄 Display side-by-side summaries\n", + "for name, text in product_texts.items():\n", + " print(f\"\\n🔹 {name} Summary with Ollama:\")\n", + " print(summarize_with_ollama(text))\n", + "\n", + " print(f\"\\n🔹 {name} Summary with OpenAI GPT:\")\n", + " print(summarize_with_openai(text))\n", + " print(\"=\"*100)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "935e0081-ccf5-4d9a-a984-ee82c77c04a2", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 41ae0bdc24ecea4ed4bdfb5d91ccd22ffc467336 Mon Sep 17 00:00:00 2001 From: Adriana394 <158718290+Adriana394@users.noreply.github.com> Date: Tue, 3 Jun 2025 15:16:50 +0200 Subject: [PATCH 17/23] create community folder --- .../testing_fine_tuned_model_with_rag.py | 258 ++++++++++++++++++ 1 file changed, 258 insertions(+) create mode 100644 week7/community_contributions/price_prediction_with_RAG/testing_fine_tuned_model_with_rag.py diff --git 
a/week7/community_contributions/price_prediction_with_RAG/testing_fine_tuned_model_with_rag.py b/week7/community_contributions/price_prediction_with_RAG/testing_fine_tuned_model_with_rag.py new file mode 100644 index 0000000..22c775d --- /dev/null +++ b/week7/community_contributions/price_prediction_with_RAG/testing_fine_tuned_model_with_rag.py @@ -0,0 +1,258 @@ +# -*- coding: utf-8 -*- +"""Testing Fine-tuned model with RAG + +Automatically generated by Colab. + +Original file is located at + https://colab.research.google.com/drive/1J8P8cwqwhBo3CNIZaEFe6BMRw0WUfEqy + +## Predict Product Prices + +### And now, to evaluate our fine-tuned open source model +""" + +!pip install -q datasets peft requests torch bitsandbytes transformers trl accelerate sentencepiece matplotlib langchain-community chromadb + +import os +import re +import math + +from google.colab import userdata + +from huggingface_hub import login + +import torch +import torch.nn.functional as F + +from transformers import ( + AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig, GenerationConfig) + +from datasets import load_dataset + +from peft import PeftModel + +from sentence_transformers import SentenceTransformer +from langchain.vectorstores import Chroma +from langchain.embeddings import HuggingFaceEmbeddings + +import matplotlib.pyplot as plt + +# Commented out IPython magic to ensure Python compatibility. 
def extract_price(s):
    """Pull the first number that follows "Price is $" out of ``s``.

    Only the text between the first and second occurrence of the marker is
    searched, and commas are dropped from it before matching.

    Args:
        s: Text that may contain the marker "Price is $".

    Returns:
        The matched number as a float, or 0 when the marker is absent or no
        number follows it.
    """
    marker = "Price is $"
    if marker not in s:
        return 0

    after_marker = s.split(marker)[1].replace(',', '')
    found = re.search(r"[-+]?\d*\.\d+|\d+", after_marker)
    if found is None:
        return 0
    return float(found.group())
# Original prediction function takes the most likely next token

def model_predict(prompt):
    """Generate up to three new tokens for ``prompt`` and parse a price from them.

    The prompt is tokenized and moved to "cuda", passed through
    ``fine_tuned_model.generate``, decoded, and handed to ``extract_price``.
    """
    inputs = tokenizer.encode(prompt, return_tensors="pt").to("cuda")
    attention_mask = torch.ones(inputs.shape, device="cuda")
    outputs = fine_tuned_model.generate(inputs, attention_mask=attention_mask, max_new_tokens=3, num_return_sequences=1)
    response = tokenizer.decode(outputs[0])
    return extract_price(response)


def predict_price_rag(desc: str, k: int = 3) -> float:
    """Predict a price for ``desc`` using retrieved examples as few-shot context.

    The ``k`` most similar documents from the Chroma index are formatted as
    "Description / Price is $" shots, the query description is appended, and
    the fine-tuned model completes the final "Price is $".

    Args:
        desc: Item description to price.
        k: Number of similar documents to retrieve as examples.

    Returns:
        The first number found in the generated continuation, or 0.0 when the
        continuation contains no number.
    """
    docs = chroma.similarity_search(desc, k=k)
    shots = "\n\n".join(f"Description: {d.page_content}\nPrice is ${d.metadata['price']}"
                        for d in docs)
    prompt = f"{shots}\n\nDescription: {desc}\nPrice is $"
    inp = tokenizer(prompt, return_tensors="pt").to(fine_tuned_model.device)
    out = fine_tuned_model.generate(**inp, generation_config=gen_config)
    # Decode only the newly generated tokens, not the echoed prompt.
    txt = tokenizer.decode(out[0, inp["input_ids"].shape[-1]:], skip_special_tokens=True).strip()
    # The previous pattern needed at least two digits, read "1,299" as 299,
    # and raised IndexError when no number was generated. Strip thousands
    # separators and fall back to 0.0 so one bad generation does not abort a
    # whole evaluation run.
    match = re.search(r"\d+(?:\.\d+)?", txt.replace(",", ""))
    return float(match.group()) if match else 0.0


class Tester:
    """Run a price predictor over the first ``size`` datapoints and report errors.

    For each datapoint the predictor is called with ``datapoint["text"]`` and
    compared against ``datapoint["price"]``. Absolute error, squared log
    error, and a colour bucket are recorded, then summarised in a scatter
    chart.
    """

    def __init__(self, predictor, data, title=None, size=250):
        """Store the predictor and data and initialise the result lists.

        Args:
            predictor: Callable taking a text prompt and returning a number.
            data: Indexable collection of mappings with "text" and "price".
            title: Chart title prefix; defaults to a prettified predictor name.
            size: Number of datapoints to evaluate; capped at ``len(data)``
                when the data has a length.
        """
        self.predictor = predictor
        self.data = data
        self.title = title or predictor.__name__.replace("_", " ").title()
        # Never ask for more datapoints than exist; otherwise run() would
        # fail with IndexError part-way through.
        try:
            available = len(data)
        except TypeError:
            available = size
        self.size = min(size, available)
        self.guesses = []
        self.truths = []
        self.errors = []
        self.sles = []
        self.colors = []

    def color_for(self, error, truth):
        """Bucket an error as "green", "orange" or "red".

        Green: error under 40 or under 20% of truth. Orange: error under 80
        or under 40% of truth. Red otherwise. A truth of zero is treated as
        an infinite relative error instead of dividing by zero.
        """
        relative = error / truth if truth else float("inf")
        if error < 40 or relative < 0.2:
            return "green"
        elif error < 80 or relative < 0.4:
            return "orange"
        else:
            return "red"

    def run_datapoint(self, i):
        """Evaluate datapoint ``i``, record its metrics, and print a coloured line."""
        datapoint = self.data[i]
        guess = self.predictor(datapoint["text"])
        truth = datapoint["price"]
        error = abs(guess - truth)
        log_error = math.log(truth+1) - math.log(guess+1)
        sle = log_error ** 2
        color = self.color_for(error, truth)
        # The item title is taken from the second blank-line-separated
        # paragraph; fall back to the first when the text has only one.
        paragraphs = datapoint["text"].split("\n\n")
        title = (paragraphs[1] if len(paragraphs) > 1 else paragraphs[0])[:20] + "..."
        self.guesses.append(guess)
        self.truths.append(truth)
        self.errors.append(error)
        self.sles.append(sle)
        self.colors.append(color)
        print(f"{COLOR_MAP[color]}{i+1}: Guess: ${guess:,.2f} Truth: ${truth:,.2f} Error: ${error:,.2f} SLE: {sle:,.2f} Item: {title}{RESET}")

    def chart(self, title):
        """Scatter guesses against truths with a y = x reference line."""
        plt.figure(figsize=(12, 8))
        max_val = max(max(self.truths), max(self.guesses))
        plt.plot([0, max_val], [0, max_val], color='deepskyblue', lw=2, alpha=0.6)
        plt.scatter(self.truths, self.guesses, s=3, c=self.colors)
        plt.xlabel('Ground Truth')
        plt.ylabel('Model Estimate')
        plt.xlim(0, max_val)
        plt.ylim(0, max_val)
        plt.title(title)
        plt.show()

    def report(self):
        """Compute average error, RMSLE and hit rate, then draw the chart."""
        if not self.errors or self.size == 0:
            # Nothing was evaluated; avoid dividing by zero below.
            print("No datapoints were evaluated.")
            return
        average_error = sum(self.errors) / self.size
        rmsle = math.sqrt(sum(self.sles) / self.size)
        hits = sum(1 for color in self.colors if color=="green")
        title = f"{self.title} Error=${average_error:,.2f} RMSLE={rmsle:,.2f} Hits={hits/self.size*100:.1f}%"
        self.chart(title)

    def run(self):
        """Evaluate the first ``size`` datapoints in order, then report."""
        self.error = 0
        for i in range(self.size):
            self.run_datapoint(i)
        self.report()

    @classmethod
    def test(cls, function, data):
        """Convenience entry point: build a Tester with defaults and run it."""
        cls(function, data).run()
b/week7/community_contributions/price_prediction_with_RAG/new_training_with_rag (1).py @@ -0,0 +1,262 @@ +# -*- coding: utf-8 -*- +"""new_training_with_RAG.ipynb + +Automatically generated by Colab. + +Original file is located at + https://colab.research.google.com/drive/1gi8FPI1dtnxBNTf86JdmXQ0BYqnKz7LS + +# Predict Product Prices +""" + +!nvidia-smi + +!pip install -q datasets requests torch peft bitsandbytes transformers trl accelerate sentencepiece matplotlib langchain-community chromadb + +import os +import re +import math +from tqdm import tqdm +from google.colab import userdata +from huggingface_hub import login +import torch +import transformers +from transformers import ( + AutoModelForCausalLM, AutoTokenizer, TrainingArguments, + set_seed, BitsAndBytesConfig, GenerationConfig) + +from datasets import load_dataset +from peft import LoraConfig, PeftModel +from trl import SFTTrainer, SFTConfig +from datetime import datetime +import matplotlib.pyplot as plt + +#LangChain & RAG Imports + +from sentence_transformers import SentenceTransformer +from langchain.schema import Document +from langchain.vectorstores import Chroma +import chromadb +from langchain.embeddings import HuggingFaceEmbeddings + +# Commented out IPython magic to ensure Python compatibility. 
+# Constants + +BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B" +#BASE_MODEL = 'mistralai/Mistral-7B-Instruct-v0.1' +PROJECT_NAME = "pricer-optim" +HF_USER = "Adriana213" + +# Data + +DATASET_NAME = f"{HF_USER}/pricer-data" +MAX_SEQUENCE_LENGTH = 182 + + +RUN_NAME = f"{PROJECT_NAME}-{datetime.now():%Y%m%d_%H%M%S}" + +HUB_MODEL_NAME = f"{HF_USER}/{RUN_NAME}" + +# Hyperparameters for QLoRA + +LORA_R = 8 +LORA_ALPHA = 32 +TARGET_MODULES = ["q_proj", "v_proj", "k_proj", "o_proj"] +LORA_DROPOUT = 0.10 +QUANT_4_BIT = True + +# Hyperparameters for Training + +EPOCHS = 2 +BATCH_SIZE = 16 +GRADIENT_ACCUMULATION_STEPS = 1 +LEARNING_RATE = 2e-4 +LR_SCHEDULER_TYPE = 'cosine' +WARMUP_RATIO = 0.05 +OPTIMIZER = "paged_adamw_32bit" +STEPS = 50 +SAVE_STEPS = 200 +EVAL_STEPS = 200 # kept for potential future use + +# %matplotlib inline + +HUB_MODEL_NAME + +"""### Log in to HuggingFace & get Data""" + +hf_token = userdata.get('HF_TOKEN') +login(hf_token, add_to_git_credential=True) + +torch.cuda.empty_cache() + +dataset = load_dataset(DATASET_NAME) +train = dataset['train'] +test = dataset['test'] + +"""## Now load the Tokenizer and Model + +The model is "quantized" - we are reducing the precision to 4 bits. 
+""" + +# Pick the right quantization + +if QUANT_4_BIT: + quant_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + bnb_4bit_compute_dtype=torch.bfloat16, + bnb_4bit_quant_type="nf4" + ) +else: + quant_config = BitsAndBytesConfig( + load_in_8bit=True, + bnb_8bit_compute_dtype=torch.bfloat16 + ) + +# Load the Tokenizer and the Model + +tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True) +tokenizer.pad_token = tokenizer.eos_token +tokenizer.padding_side = "right" + +base_model = AutoModelForCausalLM.from_pretrained( + BASE_MODEL, + quantization_config=quant_config, + device_map="auto", +) + +base_model.generation_config.pad_token_id = tokenizer.pad_token_id + +print(f"Memory footprint: {base_model.get_memory_footprint() / 1e6:.1f} MB") + +"""# Data Collator + +""" + +from trl import DataCollatorForCompletionOnlyLM + +response_template = "Price is $" +collator = DataCollatorForCompletionOnlyLM(response_template, + tokenizer=tokenizer) + +"""# Set up the configuration for Training""" + +# LoRA Config + +lora_parameters = LoraConfig( + lora_alpha = LORA_ALPHA, + lora_dropout = LORA_DROPOUT, + r = LORA_R, + bias = "none", + task_type = "CAUSAL_LM", + target_modules = TARGET_MODULES, +) + +# Training Config + +train_parameters = SFTConfig( + output_dir = RUN_NAME, + num_train_epochs = EPOCHS, + per_device_train_batch_size = BATCH_SIZE, + per_device_eval_batch_size = 4, + eval_strategy = "no", + eval_steps = EVAL_STEPS, + gradient_accumulation_steps = GRADIENT_ACCUMULATION_STEPS, + optim = OPTIMIZER, + save_steps = SAVE_STEPS, + save_total_limit = 5, + logging_steps = 50, + learning_rate = LEARNING_RATE, + weight_decay = 0.01, + fp16=False, + bf16=True, + max_grad_norm=0.3, + max_steps=-1, + warmup_ratio = WARMUP_RATIO, + group_by_length=True, + lr_scheduler_type = LR_SCHEDULER_TYPE, + run_name = RUN_NAME, + max_seq_length = MAX_SEQUENCE_LENGTH, + dataset_text_field = "text", + save_strategy = "steps", + 
hub_strategy = "every_save", + push_to_hub = True, + hub_model_id = HUB_MODEL_NAME, + hub_private_repo = True, + report_to = 'none', +) + + +fine_tuning = SFTTrainer( + model = base_model, + train_dataset = train, + eval_dataset=test, + peft_config = lora_parameters, + args = train_parameters, + data_collator = collator, + ) + +"""## Fine Tuning""" + +fine_tuning.train() + +fine_tuning.model.push_to_hub(RUN_NAME, private=True) +print(f"Saved to the hub: {RUN_NAME}") + +"""# Implement RAG""" + +HF_USER = "Adriana213" +RUN_NAME = "pricer-optim-20250514_061529" +fine_tuned_model = PeftModel.from_pretrained(base_model, f"{HF_USER}/{RUN_NAME}") +print(f"✅ Loaded fine-tuned adapter: {HF_USER}/{RUN_NAME}") + +base_model = fine_tuned_model + +"""## Build Chroma index""" + +docs = [ + Document(page_content=text, metadata = {'price': price}) + for text, price in zip(train['text'], train['price']) +] + +# Create embeddings & persist Chroma index + +embedding = HuggingFaceEmbeddings(model_name = 'all-MiniLM-L6-v2') +chroma = Chroma.from_documents( + documents = docs, + embedding = embedding, + persist_directory = 'chroma_train_index' +) + +chroma.persist() +print('Chroma index built and persisted.') + +"""## RAG Prediction Function""" + +generation_config = GenerationConfig( + max_new_token = 10, + do_sample = False, + temperature = 0.1 +) + +def predict_price_rag(desc: str, k: int = 3) -> float: + hits = chroma.similarity_search(desc, k = k) + shot_strs = [ + f'Description: {doc.page_content}\nPrice is ${doc.metadata["price"]}' + for doc in hits + ] + + prompt = "\n\n".join(shot_strs) + f"\n\nDescription: {desc}\nPrice is $" + + inputs = tokenizer(prompt, return_tensors="pt").to(base_model.device) + out = base_model.generate(**inputs, generation_config=generation_config) + text = tokenizer.decode( + out[0, inputs["input_ids"].shape[-1]:], + skip_special_tokens=True + ).strip() + return float(re.findall(r"\d+\.?\d+", text)[0]) + +!zip -r chroma_index.zip chroma_train_index + 
+from google.colab import files +files.download("chroma_index.zip") \ No newline at end of file From fdd5ad44cf469610da3493d20bad356b29a6485c Mon Sep 17 00:00:00 2001 From: renannovais Date: Wed, 4 Jun 2025 10:20:03 -0300 Subject: [PATCH 19/23] Add my notebook to community-contributions --- ...y1-webpage-summarizer-brazilian-news.ipynb | 247 ++++++++++++++++++ 1 file changed, 247 insertions(+) create mode 100644 week1/community-contributions/day1-webpage-summarizer-brazilian-news.ipynb diff --git a/week1/community-contributions/day1-webpage-summarizer-brazilian-news.ipynb b/week1/community-contributions/day1-webpage-summarizer-brazilian-news.ipynb new file mode 100644 index 0000000..e108977 --- /dev/null +++ b/week1/community-contributions/day1-webpage-summarizer-brazilian-news.ipynb @@ -0,0 +1,247 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 15, + "id": "8ce13728-0040-43cc-82cd-e10c838ef71c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🌍 Detected language: PT\n", + "🔗 Preview of extracted text:\n", + "\n", + "ITASAT2 irá atuar para aplicações científicas e de defesa\n", + "Publicado em 14/04/2025 - 14h15\n", + "O Instituto Tecnológico de Aeronáutica (ITA) realizou, entre os dias 17 e 19 de março, a Revisão Preliminar de Projeto (PDR) do ITASAT 2, novo microssatélite em desenvolvimento por pesquisadores do Centro Espacial ITA (CEI). 
A atividade representa uma etapa importante dos estudos e contou com a presença de instituições parceiras, tanto do Brasil quanto do exterior.\n", + "Participaram do encontro representantes do\n", + "...\n", + "\n", + "Amount of words: 526\n", + "\n", + "\n", + "📊 Usage Report\n", + "🧾 Prompt tokens: 927\n", + "🧠 Completion tokens: 309\n", + "🔢 Total tokens: 1236\n", + "💰 Total cost: $0.000927\n", + "\n", + "\n", + "\n" + ] + }, + { + "data": { + "text/markdown": [ + "# 📝 Summary\n", + "\n", + "The ITA (Instituto Tecnológico de Aeronáutica) is working on the ITASAT 2 project, a new microsatellite geared towards scientific and defense applications! 🌟 This initiative was highlighted at the Preliminary Design Review (PDR) held from March 17 to 19, with participation from notable organizations such as NASA and the Brazilian Space Agency (AEB). This is a fantastic collaboration that spans both domestic and international partnerships – how exciting is that? \n", + "\n", + "ITASAT 2 will consist of a constellation of three CubeSats focusing on monitoring the Earth's ionosphere and assessing plasma bubble formation. Interestingly, it also has defense applications such as geolocating radio frequency sources and optical identification of uncooperative vessels – a crucial capability for maritime security!\n", + "\n", + "The PDR showcased the team's technical and managerial capabilities, receiving unanimous approval to proceed with the project. It’s great to see such thorough preparation reflecting the dedication of the ITA team! \n", + "\n", + "The CubeSats themselves are cubic nano or microsatellites, and the ITASAT 2 is of the 16U variety, meaning it's made up of 16 units measuring 10 cm each – just amazing how compact these technologies can be! Additionally, the CEI is also developing another CubeSat called SelenITA, which will contribute to NASA's Artemis mission to study the Moon! 
🌕\n", + "\n", + "Keep an eye on this remarkable project as it continues to develop – the future of space exploration and defense technology looks bright! 🚀" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Import Libraries\n", + "import os\n", + "import requests\n", + "from openai import OpenAI\n", + "\n", + "from bs4 import BeautifulSoup\n", + "from langdetect import detect, LangDetectException\n", + "from dotenv import load_dotenv\n", + "\n", + "from IPython.display import Markdown, display\n", + "\n", + "# Load .env variables\n", + "load_dotenv()\n", + "openai_api_key = os.getenv('OPENAI_API_KEY')\n", + "\n", + "# Check the key\n", + "if not openai_api_key:\n", + " raise ValueError(\"⚠️ OPENAI_API_KEY not found in .env file.\")\n", + "\n", + "# Generating object to work with GPT tasks \n", + "openai = OpenAI()\n", + "\n", + "# Class to work with text extraction, processing and summarizing from a given url\n", + "class WebPageSummarizer():\n", + " \"\"\"\n", + " Class to work with text extraction, processing and summarizing from a given url using the BeautifulSoup library. 
It also includes pricing.\n", + " \"\"\"\n", + " def __init__(self, url: str, summary_detail: str = \"high\", show_summary: bool = True, language_of_reference = \"English\", model: str = \"gpt-4o-mini\") -> None:\n", + "\n", + " # Initial summarizer settings\n", + " self.url = url\n", + " self.model = model\n", + " self.show_summary = show_summary\n", + " self.summary_detail = summary_detail\n", + " self.language_of_reference = language_of_reference\n", + " self.language_code_map = {\n", + " \"english\": \"en\",\n", + " \"portuguese\": \"pt\",\n", + " \"spanish\": \"es\",\n", + " \"french\": \"fr\",\n", + " \"german\": \"de\",\n", + " \"italian\": \"it\",\n", + " \"japanese\": \"ja\",\n", + " \"chinese\": \"zh\",\n", + " \"korean\": \"ko\",\n", + " }\n", + " \n", + " self.model_pricing = {\n", + " \"gpt-4o-mini\": {\"input\": 0.0005, \"output\": 0.0015},\n", + " \"gpt-4o\": {\"input\": 0.005, \"output\": 0.015},\n", + " \"gpt-4-turbo\": {\"input\": 0.01, \"output\": 0.03},\n", + " \"gpt-4\": {\"input\": 0.03, \"output\": 0.06}, # Rarely used now\n", + " \"gpt-3.5-turbo\": {\"input\": 0.0005, \"output\": 0.0015}\n", + " }\n", + "\n", + " self.headers = {\n", + " \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 \"\n", + " \"(KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36\"\n", + " }\n", + "\n", + " if self.summary_detail not in [\"high\", \"low\"]:\n", + " raise Exception(\"\"\"Please select summary detail as either \"high\" or \"low\".\"\"\")\n", + "\n", + " def __extract_text(self):\n", + " response = requests.get(self.url, headers=self.headers)\n", + " if response.status_code != 200:\n", + " raise Exception(f\"Failed to fetch page. 
Status code: {response.status_code}\")\n", + " \n", + " soup = BeautifulSoup(response.text, \"html.parser\")\n", + " \n", + " # Try to extract meaningful content\n", + " paragraphs = soup.find_all(\"p\")\n", + " \n", + " # Join all paragraph text\n", + " self.text = \"\\n\".join([p.get_text() for p in paragraphs if p.get_text().strip() != \"\"])\n", + "\n", + " # Guarantee limit of text to summary\n", + " max_words = 7000\n", + " if len(self.text.split()) > max_words:\n", + " self.text = \" \".join(self.text.split()[:max_words])\n", + " \n", + " def __detect_language(self):\n", + " # Detect language\n", + " try:\n", + " self.language_url = detect(self.text)\n", + " except LangDetectException:\n", + " self.language_url = \"unknown\"\n", + "\n", + " # Normalize and resolve target language code\n", + " target_language_name = self.language_of_reference.lower().strip()\n", + " self.target_language_code = self.language_code_map.get(target_language_name)\n", + " \n", + " if not self.target_language_code:\n", + " # List the supported names from the instance map (LANGUAGE_CODE_MAP is not defined anywhere in this cell).\n", + " raise ValueError(f\"❌ Unsupported language: {self.language_of_reference}. Please use one of: {list(self.language_code_map.keys())}\")\n", + "\n", + " print(f\"🌍 Detected language: {self.language_url.upper()}\")\n", + " \n", + " if self.show_summary:\n", + " print(\"🔗 Preview of extracted text:\\n\")\n", + " print(self.text[:500] + \"\\n...\\n\")\n", + " print(f\"Amount of words: {len(self.text.split())}\\n\")\n", + "\n", + " def __calculate_cost(self, prompt_tokens: int, completion_tokens: int) -> float:\n", + " \"\"\"\n", + " Calculates total cost in USD based on selected model.\n", + " \"\"\"\n", + " pricing = self.model_pricing.get(self.model)\n", + " if pricing is None:\n", + " raise ValueError(f\"\"\"Pricing not available for model \"{self.model}\". 
Add it to model_pricing.\"\"\")\n", + " \n", + " input_cost = (prompt_tokens / 1000) * pricing[\"input\"]\n", + " output_cost = (completion_tokens / 1000) * pricing[\"output\"]\n", + " return input_cost + output_cost\n", + "\n", + " def summarize(self)-> str:\n", + " \"\"\"\n", + " Method to process user prompts in the context of the user.\n", + " \"\"\"\n", + " self.__extract_text()\n", + " self.__detect_language()\n", + " \n", + " # Prompt for system definition\n", + " self.system_prompt = f\"\"\" \n", + " You are an assistant that analyzes the contents of a website and provides a summary. \n", + " Please notice that providing a {self.summary_detail} summary detail is IMPORTANT.\n", + " If you find text that might be navigation related or ad related please ignore. Respond in markdown. \n", + " Also, can you please start your summary with the tile \"📝 Summary\"?\n", + " \n", + " Please show some excited behavior during your summary, making comments with extra knowledge if possible during or at the end of the sentence. \n", + " \"\"\"\n", + "\n", + " self.content = f\"\"\"The text to summarize is as follows: {self.text}\"\"\"\n", + "\n", + " if self.language_url != self.target_language_code:\n", + " self.system_prompt = f\"\"\"The website content is in {self.language_url.upper()}. Please first translate it to {self.language_of_reference}. 
\n", + " {self.system_prompt.strip()}\n", + " \"\"\"\n", + "\n", + " response = openai.chat.completions.create(model=self.model, messages=[{\"role\":\"system\", \"content\":self.system_prompt}, \n", + " {\"role\": \"user\", \"content\":self.content}])\n", + "\n", + " # Cost calculation and usage report\n", + " usage = response.usage\n", + " total_cost = self.__calculate_cost(usage.prompt_tokens, usage.completion_tokens)\n", + " \n", + " print(\"\\n📊 Usage Report\")\n", + " print(f\"🧾 Prompt tokens: {usage.prompt_tokens}\")\n", + " print(f\"🧠 Completion tokens: {usage.completion_tokens}\")\n", + " print(f\"🔢 Total tokens: {usage.total_tokens}\")\n", + " print(f\"💰 Total cost: ${total_cost:.6f}\\n\\n\\n\")\n", + "\n", + " return response.choices[0].message.content\n", + "\n", + "\n", + "web_page_summarizer = WebPageSummarizer(\"http://www.ita.br/noticias/revisodeprojetodonovomicrossatlitedoitaaprovada\", summary_detail = \"low\")\n", + "display(Markdown(web_page_summarizer.summarize()))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "af5a186a-bb25-4cf4-a6d2-6034cd493bc4", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From ba1b3b702f44ef66555017767167845bdeecbe65 Mon Sep 17 00:00:00 2001 From: lisekarimi Date: Thu, 5 Jun 2025 16:20:51 +0200 Subject: [PATCH 20/23] Add week1 contributions --- .../01_webpage_summarizer.ipynb | 357 +++++++++++++++++ .../02_brochure_generator.ipynb | 370 ++++++++++++++++++ .../03_tech_explainer.ipynb | 142 +++++++ 3 files changed, 869 insertions(+) create mode 100644 
week1/community-contributions/01_webpage_summarizer.ipynb create mode 100644 week1/community-contributions/02_brochure_generator.ipynb create mode 100644 week1/community-contributions/03_tech_explainer.ipynb diff --git a/week1/community-contributions/01_webpage_summarizer.ipynb b/week1/community-contributions/01_webpage_summarizer.ipynb new file mode 100644 index 0000000..f8be204 --- /dev/null +++ b/week1/community-contributions/01_webpage_summarizer.ipynb @@ -0,0 +1,357 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "53211323-6a09-452a-b471-98e22d92bfc2", + "metadata": {}, + "source": [ + "# 🌐 WebPage Summarizer\n", + "---\n", + "- 🌍 **Task:** Summarizing webpage content using AI. \n", + "- 🧠 **Model:** OpenAI's ``gpt-4o-mini`` and ``llama3.2:3b`` for text summarization. \n", + "- 🕵️‍♂️ **Data Extraction:** Selenium for handling both static and JavaScript-rendered websites. \n", + "- 📌 **Output Format:** Markdown-formatted summaries. \n", + "- 🔗 **Scope:** Processes only the given webpage URL (not the entire site). \n", + "- 🚀 **Tools:** Python, Requests, Selenium, BeautifulSoup, OpenAI API, Ollama. \n", + "- 🧑‍💻 **Skill Level:** Beginner.\n", + "\n", + "🛠️ Requirements\n", + "- ⚙️ Hardware: ✅ CPU is sufficient — no GPU required\n", + "- 🔑 OpenAI API Key (for GPT model)\n", + "- Install Ollama and pull llama3.2:3b or another lightweight model\n", + "- Google Chrome browser installed\n", + "\n", + "**✨ This script handles both JavaScript and non-JavaScript websites using Selenium with Chrome WebDriver for reliable content extraction from modern web applications.**\n", + "\n", + "Let's get started and automate website summarization! 
🚀\n", + "\n", + "![](https://github.com/lisekarimi/lexo/blob/main/assets/01_basic_llm_project.jpg?raw=true)\n", + "\n", + "---\n", + "📢 Find more LLM notebooks on my [GitHub repository](https://github.com/lisekarimi/lexo)" + ] + }, + { + "cell_type": "markdown", + "id": "d70aa4b0", + "metadata": {}, + "source": [ + "## 🛠️ Environment Setup & Dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebf2fa36", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install selenium webdriver-manager" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1dcf1d9d-c540-4900-b14e-ad36a28fc822", + "metadata": {}, + "outputs": [], + "source": [ + "# ===========================\n", + "# System & Environment\n", + "# ===========================\n", + "import os\n", + "from dotenv import load_dotenv\n", + "\n", + "# ===========================\n", + "# Web Scraping\n", + "# ===========================\n", + "import time\n", + "from bs4 import BeautifulSoup\n", + "from selenium import webdriver\n", + "from selenium.webdriver.chrome.options import Options\n", + "from selenium.webdriver.common.by import By\n", + "from selenium.webdriver.support.ui import WebDriverWait\n", + "from selenium.webdriver.support import expected_conditions as EC\n", + "\n", + "# ===========================\n", + "# AI-related\n", + "# ===========================\n", + "from IPython.display import Markdown, display\n", + "from openai import OpenAI\n", + "import ollama" + ] + }, + { + "cell_type": "markdown", + "id": "cc20642b", + "metadata": {}, + "source": [ + "## 🔐 Model Configuration & Authentication" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8598c299-05ca-492e-b085-6bcc2f7dda0d", + "metadata": {}, + "outputs": [], + "source": [ + "load_dotenv(override=True)\n", + "api_key = os.getenv('OPENAI_API_KEY')\n", + "\n", + "if not api_key:\n", + " raise ValueError(\"OPENAI_API_KEY not found in environment variables\")\n", + "\n", 
+ "print(\"✅ API key loaded successfully!\")\n", + "openai = OpenAI()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8098defb", + "metadata": {}, + "outputs": [], + "source": [ + "MODEL_OPENAI = \"gpt-4o-mini\"\n", + "MODEL_OLLAMA = \"llama3.2:3b\"" + ] + }, + { + "cell_type": "markdown", + "id": "2bd1d83f", + "metadata": {}, + "source": [ + "## 🌐 Web Scraping Infrastructure" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6fe5114", + "metadata": {}, + "outputs": [], + "source": [ + "class WebsiteCrawler:\n", + " def __init__(self, url):\n", + " self.url = url\n", + " self.title = \"\"\n", + " self.text = \"\"\n", + " self.scrape()\n", + "\n", + " def scrape(self):\n", + " try:\n", + " # Chrome options\n", + " chrome_options = Options()\n", + " chrome_options.add_argument(\"--headless\")\n", + " chrome_options.add_argument(\"--no-sandbox\")\n", + " chrome_options.add_argument(\"--disable-dev-shm-usage\")\n", + " chrome_options.add_argument(\"--disable-gpu\")\n", + " chrome_options.add_argument(\"--window-size=1920,1080\")\n", + " chrome_options.add_argument(\"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36\")\n", + "\n", + " # Try to find Chrome\n", + " chrome_paths = [\n", + " r\"C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe\",\n", + " r\"C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe\",\n", + " r\"C:\\Users\\{}\\AppData\\Local\\Google\\Chrome\\Application\\chrome.exe\".format(os.getenv('USERNAME')),\n", + " ]\n", + "\n", + " chrome_binary = None\n", + " for path in chrome_paths:\n", + " if os.path.exists(path):\n", + " chrome_binary = path\n", + " break\n", + "\n", + " if chrome_binary:\n", + " chrome_options.binary_location = chrome_binary\n", + "\n", + " # Create driver\n", + " driver = webdriver.Chrome(options=chrome_options)\n", + " driver.set_page_load_timeout(30)\n", + "\n", + " print(f\"🔍 Loading: 
{self.url}\")\n", + " driver.get(self.url)\n", + "\n", + " # Wait for page to load\n", + " time.sleep(5)\n", + "\n", + " # Try to wait for main content\n", + " try:\n", + " WebDriverWait(driver, 10).until(\n", + " EC.presence_of_element_located((By.TAG_NAME, \"main\"))\n", + " )\n", + " except Exception:\n", + " try:\n", + " WebDriverWait(driver, 10).until(\n", + " EC.presence_of_element_located((By.TAG_NAME, \"body\"))\n", + " )\n", + " except Exception:\n", + " pass # Continue anyway\n", + "\n", + " # Get title and page source\n", + " self.title = driver.title\n", + " page_source = driver.page_source\n", + " driver.quit()\n", + "\n", + " print(f\"✅ Page loaded: {self.title}\")\n", + "\n", + " # Parse with BeautifulSoup\n", + " soup = BeautifulSoup(page_source, 'html.parser')\n", + "\n", + " # Remove unwanted elements\n", + " for element in soup([\"script\", \"style\", \"img\", \"input\", \"button\", \"nav\", \"footer\", \"header\"]):\n", + " element.decompose()\n", + "\n", + " # Get main content\n", + " # find() matches tag names only; the '.content' class lookup needs a CSS selector via select_one()\n", + " main = soup.find('main') or soup.find('article') or soup.select_one('.content') or soup.find('body')\n", + " if main:\n", + " self.text = main.get_text(separator=\"\\n\", strip=True)\n", + " else:\n", + " self.text = soup.get_text(separator=\"\\n\", strip=True)\n", + "\n", + " # Clean up text\n", + " lines = [line.strip() for line in self.text.split('\\n') if line.strip() and len(line.strip()) > 2]\n", + " self.text = '\\n'.join(lines[:200]) # Limit to first 200 lines\n", + "\n", + " print(f\"📄 Extracted {len(self.text)} characters\")\n", + "\n", + " except Exception as e:\n", + " print(f\"❌ Error occurred: {e}\")\n", + " self.title = \"Error occurred\"\n", + " self.text = \"Could not scrape website content\"" + ] + }, + { + "cell_type": "markdown", + "id": "d727feff", + "metadata": {}, + "source": [ + "## 🧠 Prompt Engineering & Templates" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "02e3a673-a8a1-4101-a441-3816f7ab9e4d", + "metadata": {}, + 
"outputs": [], + "source": [ + "system_prompt = \"You are an assistant that analyzes the contents of a website \\\n", + "and provides a short summary, ignoring text that might be navigation related. \\\n", + "Respond in markdown.\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "86bb80f9-9e7c-4825-985f-9b83fe50839f", + "metadata": {}, + "outputs": [], + "source": [ + "def user_prompt_for(website):\n", + " user_prompt = f\"You are looking at a website titled {website.title}\"\n", + " user_prompt += \"\\nThe contents of this website is as follows; please provide a short summary of this website in markdown. If it includes news or announcements, then summarize these too.\\n\\n\"\n", + " user_prompt += website.text\n", + " return user_prompt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "89998b18-77aa-4aaf-a137-f0d078d61f75", + "metadata": {}, + "outputs": [], + "source": [ + "def messages_for(website):\n", + " return [\n", + " {\"role\": \"system\", \"content\": system_prompt},\n", + " {\"role\": \"user\", \"content\": user_prompt_for(website)}\n", + " ]" + ] + }, + { + "cell_type": "markdown", + "id": "cde36d4f", + "metadata": {}, + "source": [ + "## 📝 Summarization " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5636affe", + "metadata": {}, + "outputs": [], + "source": [ + "def summarize_gpt(url):\n", + " \"\"\"Scrape website and summarize with GPT\"\"\"\n", + " site = WebsiteCrawler(url)\n", + "\n", + " if \"Error occurred\" in site.title or len(site.text) < 50:\n", + " print(f\"❌ Failed to scrape meaningful content from {url}\")\n", + " return\n", + "\n", + " print(\"🤖 Creating summary...\")\n", + "\n", + " # Create summary\n", + " response = openai.chat.completions.create(\n", + " model=MODEL_OPENAI,\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": system_prompt},\n", + " {\"role\": \"user\", \"content\": user_prompt_for(site)}\n", + " ]\n", + " )\n", + "\n", + " web_summary 
= response.choices[0].message.content\n", + " display(Markdown(web_summary))\n", + "\n", + "summarize_gpt('https://openai.com')\n", + "# summarize_gpt('https://stripe.com')\n", + "# summarize_gpt('https://vercel.com')\n", + "# summarize_gpt('https://react.dev')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90b9a8f8-0c1c-40c8-a4b3-e8e1fcd29df5", + "metadata": {}, + "outputs": [], + "source": [ + "def summarize_ollama(url):\n", + " website = WebsiteCrawler(url)\n", + " response = ollama.chat(\n", + " model=MODEL_OLLAMA,\n", + " messages=messages_for(website))\n", + " display(Markdown(response['message']['content'])) # Generate and display output\n", + "\n", + "summarize_ollama('https://github.com')\n", + "# summarize_ollama('https://nextjs.org')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/week1/community-contributions/02_brochure_generator.ipynb b/week1/community-contributions/02_brochure_generator.ipynb new file mode 100644 index 0000000..5b81824 --- /dev/null +++ b/week1/community-contributions/02_brochure_generator.ipynb @@ -0,0 +1,370 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "dc8af57c-23a9-452e-9fc3-0e5027edda14", + "metadata": {}, + "source": [ + "# AI-powered Brochure Generator\n", + "---\n", + "- 🌍 Task: Generate a company brochure using its name and website for clients, investors, and recruits.\n", + "- 🧠 Model: Toggle `USE_OPENAI` to switch between OpenAI and Ollama models\n", + "- 🕵️‍♂️ Data Extraction: Scraping website content and filtering key links (About, Products, Careers, Contact).\n", + "- 📌 Output 
Format: a Markdown-formatted brochure streamed in real-time.\n", + "- 🚀 Tools: BeautifulSoup, OpenAI API, and IPython display, ollama.\n", + "- 🧑‍💻 Skill Level: Intermediate.\n", + "\n", + "🛠️ Requirements\n", + "- ⚙️ Hardware: ✅ CPU is sufficient — no GPU required\n", + "- 🔑 OpenAI API Key \n", + "- Install Ollama and pull llama3.2:3b or another lightweight model\n", + "---\n", + "📢 Find more LLM notebooks on my [GitHub repository](https://github.com/lisekarimi/lexo)" + ] + }, + { + "cell_type": "markdown", + "id": "ec869f2c", + "metadata": {}, + "source": [ + "## 🧩 System Design Overview\n", + "\n", + "### Class Structure\n", + "\n", + "![](https://github.com/lisekarimi/lexo/blob/main/assets/02_brochure_class_diagram.png?raw=true)\n", + "\n", + "This code consists of three main classes:\n", + "\n", + "1. **`Website`**: \n", + " - Scrapes and processes webpage content. \n", + " - Extracts **text** and **links** from a given URL. \n", + "\n", + "2. **`LLMClient`**: \n", + " - Handles interactions with **OpenAI or Ollama (`llama3`, `deepseek`, `qwen`)**. \n", + " - Uses `get_relevant_links()` to filter webpage links. \n", + " - Uses `generate_brochure()` to create and stream a Markdown-formatted brochure. \n", + "\n", + "3. **`BrochureGenerator`**: \n", + " - Uses `Website` to scrape the main webpage and relevant links. \n", + " - Uses `LLMClient` to filter relevant links and generate a brochure. \n", + " - Calls `generate()` to run the entire process.\n", + "\n", + "### Workflow\n", + "\n", + "1. **`main()`** initializes `BrochureGenerator` and calls `generate()`. \n", + "2. **`generate()`** calls **`LLMClient.get_relevant_links()`** to extract relevant links using **LLM (OpenAI/Ollama)**. \n", + "3. **`Website` scrapes the webpage**, extracting **text and links** from the given URL. \n", + "4. **Relevant links are re-scraped** using `Website` to collect additional content. \n", + "5. **All collected content is passed to `LLMClient.generate_brochure()`**. 
\n", + "6. **`LLMClient` streams the generated brochure** using **OpenAI or Ollama**. \n", + "7. **The final brochure is displayed in Markdown format.**\n", + "\n", + "![](https://github.com/lisekarimi/lexo/blob/main/assets/02_brochure_process.png?raw=true)\n", + "\n", + "\n", + "### Intermediate reasoning\n", + "\n", + "In this workflow, we have intermediate reasoning because the LLM is called twice:\n", + "\n", + "1. **First LLM call**: Takes raw links → filters/selects relevant ones (reasoning step).\n", + "2. **Second LLM call**: Takes selected content → generates final brochure.\n", + "\n", + "🧠 **LLM output becomes LLM input** — that’s intermediate reasoning.\n", + "\n", + "![](https://github.com/lisekarimi/lexo/blob/main/assets/02_llm_intermd_reasoning.png?raw=true)" + ] + }, + { + "cell_type": "markdown", + "id": "4b286461-35ee-4bc5-b07d-af554923e36d", + "metadata": {}, + "source": [ + "## 📦 Import Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3fe5670c-5146-474b-9e75-484210533f55", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import requests\n", + "import json\n", + "import ollama\n", + "from dotenv import load_dotenv\n", + "from bs4 import BeautifulSoup\n", + "from IPython.display import display, Markdown, update_display\n", + "from openai import OpenAI" + ] + }, + { + "cell_type": "markdown", + "id": "f3e23181-1e66-410d-a910-1fb4230f8088", + "metadata": {}, + "source": [ + "## 🧠 Define the Model\n", + "\n", + "The user can switch between OpenAI and Ollama by changing a single variable (`USE_OPENAI`). The model selection is dynamic." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa2bd452-0cf4-4fec-9542-e1c86584c23f", + "metadata": {}, + "outputs": [], + "source": [ + "# Load API key\n", + "load_dotenv()\n", + "api_key = os.getenv('OPENAI_API_KEY')\n", + "if not api_key or not api_key.startswith('sk-'):\n", + " raise ValueError(\"Invalid OpenAI API key. 
# Define the model dynamically
USE_OPENAI = True  # True to use openai and False to use Ollama
MODEL = 'gpt-4o-mini' if USE_OPENAI else 'llama3.2:3b'

openai_client = OpenAI() if USE_OPENAI else None

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

_MARKDOWN_OPENER = "```markdown"


def _strip_markdown_wrapper(text):
    """
    Remove a ```markdown ... ``` fence that wraps the whole reply.

    Only the outer wrapper is removed; code fences inside the reply and the
    word "markdown" in running text are left alone. While the opener is still
    arriving (streaming), an empty string is returned so that a half-received
    fence is never rendered.
    """
    stripped = text.lstrip()
    if _MARKDOWN_OPENER.startswith(stripped):
        # Empty so far, or only a prefix of the opener has been streamed.
        return ""
    if not stripped.startswith(_MARKDOWN_OPENER):
        return text
    body = stripped[len(_MARKDOWN_OPENER):]
    trimmed = body.rstrip()
    if trimmed.endswith("```"):
        body = trimmed[:-3]
    return body.strip("\n")


class Website:
    """
    A utility class to scrape and process website content.

    Attributes set by the constructor:
        url:   the address that was fetched
        title: the page <title> text, or "No title found"
        text:  visible body text with scripts/styles/images/inputs removed
        links: absolute http(s) links found on the page, de-duplicated
    """

    def __init__(self, url, timeout=10):
        """
        Fetch and parse `url`.

        `timeout` (seconds) is passed to `requests.get` so an unresponsive
        host cannot hang the notebook indefinitely.
        """
        self.url = url
        response = requests.get(url, headers=headers, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')
        # `.string` is None when <title> is empty or contains nested tags.
        title = soup.title.string if soup.title else None
        self.title = title if title else "No title found"
        self.text = self.extract_text(soup)
        self.links = self.extract_links(soup)

    def extract_text(self, soup):
        """Return the body text, one fragment per line, or "" without a <body>."""
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            return soup.body.get_text(separator="\n", strip=True)
        return ""

    def extract_links(self, soup):
        """
        Return the page's links as absolute http(s) URLs.

        Relative hrefs are resolved against `self.url`; in-page anchors and
        non-web schemes (mailto:, javascript:, tel:, ...) are dropped, and
        duplicates are removed while keeping first-seen order.
        """
        from urllib.parse import urldefrag, urljoin, urlparse

        seen = set()
        links = []
        for anchor in soup.find_all('a'):
            href = anchor.get('href')
            if not href:
                continue
            href = href.strip()
            if not href or href.startswith('#'):
                continue
            absolute, _fragment = urldefrag(urljoin(self.url, href))
            if urlparse(absolute).scheme not in ("http", "https"):
                continue
            if absolute in seen:
                continue
            seen.add(absolute)
            links.append(absolute)
        return links

    def get_contents(self):
        """Return title and text formatted as one prompt-ready string."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"


class LLMClient:
    """Thin wrapper around the configured backend (OpenAI or Ollama, per USE_OPENAI)."""

    def __init__(self, model=MODEL):
        self.model = model

    @staticmethod
    def _parse_links_response(raw):
        """
        Turn the model's reply into a `{"links": [...]}` dict.

        Tolerates a surrounding ``` / ```json fence. Returns `{"links": []}`
        (after printing a message) when the reply is not valid JSON or does
        not have the expected shape, so callers never have to catch.
        """
        text = (raw or "").strip()
        if text.startswith("```"):
            # Drop the opening fence line and everything from the closing fence on.
            text = text.split("\n", 1)[1] if "\n" in text else ""
            text = text.rsplit("```", 1)[0]
        try:
            parsed = json.loads(text)
        except json.JSONDecodeError:
            print("Error: Response is not valid JSON")
            return {"links": []}
        if not isinstance(parsed, dict) or not isinstance(parsed.get("links"), list):
            print("Error: Response JSON does not contain a 'links' list")
            return {"links": []}
        return parsed

    def _stream_reply(self, messages):
        """Yield the reply text piece by piece from the configured backend."""
        if USE_OPENAI:
            stream = openai_client.chat.completions.create(
                model=self.model,
                messages=messages,
                stream=True
            )
            for chunk in stream:
                if chunk.choices:
                    yield chunk.choices[0].delta.content or ''
        else:
            stream = ollama.chat(
                model=self.model,
                messages=messages,
                stream=True
            )
            for chunk in stream:
                if "message" in chunk:
                    yield chunk["message"]["content"] or ""

    def get_relevant_links(self, website):
        """
        Ask the model which of `website.links` belong in a brochure.

        Returns a dict of the form
        `{"links": [{"type": ..., "url": ...}, ...]}`; the list is empty when
        the reply cannot be parsed.
        """
        link_system_prompt = """
        You are given a list of links from a company website.
        Select only relevant links for a brochure (About, Company, Careers, Products, Contact).
        Exclude login, terms, privacy, and emails.

        ### **Instructions**
        - Return **only valid JSON**.
        - **Do not** include explanations, comments, or Markdown.
        - Example output:
        {
            "links": [
                {"type": "about", "url": "https://company.com/about"},
                {"type": "contact", "url": "https://company.com/contact"},
                {"type": "product", "url": "https://company.com/products"}
            ]
        }
        """

        user_prompt = f"""
        Here is the list of links on the website of {website.url}:
        Please identify the relevant web links for a company brochure. Respond in JSON format.
        Do not include login, terms of service, privacy, or email links.
        Links (some might be relative links):
        {', '.join(website.links)}
        """

        messages = [
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": user_prompt}
        ]

        if USE_OPENAI:
            response = openai_client.chat.completions.create(
                model=self.model,
                messages=messages
            )
            raw = response.choices[0].message.content
        else:
            response = ollama.chat(
                model=self.model,
                messages=messages
            )
            raw = response.get("message", {}).get("content", "")

        # Both backends go through the same tolerant parser.
        return self._parse_links_response(raw)

    def generate_brochure(self, company_name, content, language, max_chars=5000):
        """
        Stream a Markdown brochure into the notebook output.

        Args:
            company_name: name used in the brochure prompt.
            content: scraped website text; only the first `max_chars`
                characters are sent to the model.
            language: language the brochure must be written in.
            max_chars: prompt-size cap for `content` (default 5000).

        Returns:
            The final brochure text (also shown live via `update_display`).
        """
        system_prompt = """
        You are a professional translator and writer who creates fun and engaging brochures.
        Your task is to read content from a company’s website and write a short, humorous, joky,
        and entertaining brochure for potential customers, investors, and job seekers.
        Include details about the company’s culture, customers, and career opportunities if available.
        Respond in Markdown format.
        """

        user_prompt = f"""
        Create a fun brochure for '{company_name}' using the following content:
        {content[:max_chars]}
        Respond in {language} only, and format your response correctly in Markdown.
        Do NOT escape characters or return extra backslashes.
        """

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]

        display_handle = display(Markdown(""), display_id=True)
        raw_text = ""
        brochure = ""
        for piece in self._stream_reply(messages):
            raw_text += piece
            # Keep the raw text intact and clean a copy for display, so inner
            # code fences and the word "markdown" survive.
            brochure = _strip_markdown_wrapper(raw_text)
            update_display(Markdown(brochure), display_id=display_handle.display_id)
        return brochure


class BrochureGenerator:
    """
    Main class to generate a company brochure.
    """

    def __init__(self, company_name, url, language='English'):
        self.company_name = company_name
        self.url = url
        self.language = language
        self.website = Website(url)
        self.llm_client = LLMClient()

    def generate(self):
        """
        Scrape the landing page plus the model-selected sub-pages and stream
        the brochure. Returns the brochure text.

        Malformed link entries and sub-pages that cannot be fetched are
        skipped with a message instead of aborting the whole run.
        """
        from urllib.parse import urljoin

        links = self.llm_client.get_relevant_links(self.website)
        content = self.website.get_contents()

        for link in links.get('links', []):
            url = link.get('url') if isinstance(link, dict) else None
            if not url:
                continue
            try:
                # The model may still hand back a relative URL.
                linked_website = Website(urljoin(self.url, url))
            except requests.RequestException as exc:
                print(f"Skipping {url}: {exc}")
                continue
            content += f"\n\n{link.get('type', 'page')}:\n"
            content += linked_website.get_contents()

        return self.llm_client.generate_brochure(self.company_name, content, self.language)


def main():
    company_name = "Tour Eiffel"
    url = "https://www.toureiffel.paris/fr"
    language = "French"

    generator = BrochureGenerator(company_name, url, language)
    generator.generate()


if __name__ == "__main__":
    main()
import os
import openai
import ollama
from dotenv import load_dotenv
from IPython.display import display, Markdown, update_display

# Load environment variables
load_dotenv(override=True)

# Set up OpenAI API key
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
if not OPENAI_API_KEY:
    raise ValueError("Please set your OpenAI API key in environment variables.")

# Constants
MODEL_GPT = "gpt-4o-mini"
MODEL_LLAMA = "llama3.2:3b"

_MARKDOWN_OPENER = "```markdown"

# Prompt user for question (until input is provided)
while True:
    question = input("Hello, I am your personal technical tutor. Enter your question: ").strip()
    if question:
        break  # Proceed only if a valid question is entered
    print("Question cannot be empty. Please enter a question.")

# Common user prompt
user_prompt = f"""
Please give a detailed explanation to the following question: {question}.
Be less verbose.
Provide a clear and concise explanation without unnecessary elaboration.
"""

# Common system prompt
system_prompt = """
You are a helpful AI assistant that explains Python code in a clear and concise manner. Provide structured explanations and examples when necessary.
Be less verbose.
"""


def _strip_markdown_wrapper(text):
    """
    Remove a ```markdown ... ``` fence that wraps the whole answer.

    Code fences inside the answer are kept (a technical explanation is mostly
    code blocks) and the word "markdown" in running text is untouched. While
    only a prefix of the opener has been streamed, "" is returned so that a
    half-received fence is never rendered.
    """
    stripped = text.lstrip()
    if _MARKDOWN_OPENER.startswith(stripped):
        return ""
    if not stripped.startswith(_MARKDOWN_OPENER):
        return text
    body = stripped[len(_MARKDOWN_OPENER):]
    trimmed = body.rstrip()
    if trimmed.endswith("```"):
        body = trimmed[:-3]
    return body.strip("\n")


def ask_openai():
    """Gets response from OpenAI's GPT model with streaming."""
    print("\n\n\n🚀🤖🚀 Response from OpenAI GPT-4o-mini 🚀🤖🚀")
    client = openai.OpenAI(api_key=OPENAI_API_KEY)
    response_stream = client.chat.completions.create(
        model=MODEL_GPT,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        stream=True
    )
    raw_text = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in response_stream:
        if not chunk.choices:
            continue
        raw_text += chunk.choices[0].delta.content or ''
        # Clean a copy for display; the accumulated text stays intact.
        update_display(Markdown(_strip_markdown_wrapper(raw_text)), display_id=display_handle.display_id)


def ask_ollama():
    """Gets response from Ollama's Llama 3.2 model with streaming."""
    print("\n\n\n🔥✨🔥 Response from Llama 3.2 🔥✨🔥\n")
    response = ollama.chat(
        model=MODEL_LLAMA,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        stream=True
    )

    display_handle = display(Markdown(""), display_id=True)
    full_text = ""
    for chunk in response:
        if "message" in chunk:
            full_text += chunk["message"]["content"] or ""
            update_display(Markdown(_strip_markdown_wrapper(full_text)), display_id=display_handle.display_id)


# Call the functions
ask_openai()
ask_ollama()
"\n", + "---\n", + "\n", + "- 🌍 **Task**: Generate realistic synthetic datasets\n", + "- 🎯 **Supported Data Types**: Tabular, Text, Time-series\n", + "- 🧠 **Models**: GPT (OpenAI) , Claude (Anthropic), CodeQwen1.5-7B-Chat (via Hugging Face Inference) / Llama (in Google Colab through T4 GPU)\n", + "- 🚀 **Tools**: Python, Gradio UI, OpenAI / Anthropic / HuggingFace APIs\n", + "- 📤 **Output Formats**: JSON and CSV file\n", + "- 🧑‍💻 **Skill Level**: Intermediate\n", + "\n", + "🎯 **How It Works**\n", + "\n", + "1️⃣ Define your business problem or dataset topic.\n", + "\n", + "2️⃣ Choose the dataset type, output format, model, and number of samples.\n", + "\n", + "3️⃣ The LLM generates the code; you can adjust or modify it as needed.\n", + "\n", + "4️⃣ Execute the code to generate your output file.\n", + "\n", + "🛠️ **Requirements** \n", + "- ⚙️ **Hardware**: ✅ GPU required (model download); Google Colab recommended (T4)\n", + "- 🔑 OpenAI API Key (for GPT) \n", + "- 🔑 Anthropic API Key (for Claude) \n", + "- 🔑 Hugging Face Token \n", + "\n", + "**Deploy CodeQwen Endpoint:**\n", + "- Visit https://huggingface.co/Qwen/CodeQwen1.5-7B-Chat\n", + "- Click **Deploy** → **Inference Endpoints** → **Create Endpoint** (requires credit card)\n", + "- Copy your endpoint URL: `https://[id].us-east-1.aws.endpoints.huggingface.cloud`\n", + "\n", + "⚙️ **Customizable by user** \n", + "- 🤖 Selected model: GPT / Claude / Llama / Code Qwen\n", + "- 📜 `system_prompt`: Controls model behavior (concise, accurate, structured) \n", + "- 💬 `user_prompt`: Dynamic — include other fields\n", + "\n", + "---\n", + "📢 Find more LLM notebooks on my [GitHub repository](https://github.com/lisekarimi/lexo)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9E-Ioggxi2Em" + }, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "pR-ftUatjEGd", + "outputId": 
"ae5668c5-c369-4066-bbbf-b560fb28e39a" + }, + "outputs": [], + "source": [ + "# Install required packages in Google Colab\n", + "%pip install -q python-dotenv gradio anthropic openai requests torch bitsandbytes transformers sentencepiece accelerate" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VPmk2-Ggi2Em" + }, + "outputs": [], + "source": [ + "import re\n", + "import sys\n", + "import subprocess\n", + "import threading\n", + "import anthropic\n", + "import torch\n", + "import gradio as gr\n", + "from openai import OpenAI\n", + "from huggingface_hub import InferenceClient, login\n", + "from google.colab import userdata\n", + "from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer, BitsAndBytesConfig" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DUQ55_oji2En" + }, + "source": [ + "## Initialization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MiicxGawi2En" + }, + "outputs": [], + "source": [ + "# Google Colab User Data\n", + "# Ensure you have set the following in your Google Colab environment:\n", + "openai_api_key = userdata.get(\"OPENAI_API_KEY\")\n", + "anthropic_api_key = userdata.get(\"ANTHROPIC_API_KEY\")\n", + "hf_token = userdata.get('HF_TOKEN')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "OPENAI_MODEL = \"gpt-4o-mini\"\n", + "CLAUDE_MODEL = \"claude-3-5-sonnet-20240620\"\n", + "LLAMA = \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n", + "\n", + "code_qwen = \"Qwen/CodeQwen1.5-7B-Chat\"\n", + "CODE_QWEN_URL = \"https://zfkokxzs1xrqv13v.us-east-1.aws.endpoints.huggingface.cloud\"\n", + "\n", + "login(hf_token, add_to_git_credential=True)\n", + "openai = OpenAI(api_key=openai_api_key)\n", + "claude = anthropic.Anthropic(api_key=anthropic_api_key)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ipA1F440i2En" + }, + "source": [ + "## Prompts 
# System instructions shared by every model backend.
system_message = """
You are a helpful assistant whose main purpose is to generate datasets for business problems.

Be less verbose.
Be accurate and concise.

The user will describe a business problem. Based on this, you must generate a synthetic dataset that fits the context.

The dataset should be saved in a specific format such as CSV, JSON — the desired format will be specified by the user.

The dependencies for python code should include only standard python libraries such as numpy, pandas and built-in libraries.

When saving a DataFrame to JSON using `to_json()`, do not use the `encoding` parameter. Instead, manually open the file with `open()` and specify the encoding. Then pass the file object to `to_json()`.

Ensure Python code blocks are correctly indented, especially inside `with`, `for`, `if`, `try`, and `def` blocks.

Return only the Python code that generates and saves the dataset.
After saving the file, print the code that was executed and a message confirming the dataset was generated successfully.
"""


def user_prompt(**input_data):
    """
    Build the user message for a dataset request.

    Expects the keyword arguments `dataset_type`, `output_format`,
    `business_problem` and `num_samples`. The dataset type is lower-cased and
    the output format upper-cased before being placed in the text.
    """
    kind = input_data["dataset_type"].lower()
    file_format = input_data["output_format"].upper()
    problem = input_data["business_problem"]
    sample_count = input_data["num_samples"]
    return f"""
    Generate a synthetic {kind} dataset in {file_format} format.
    Business problem: {problem}
    Samples: {sample_count}
    """
def stream_gpt(user_prompt):
    """Stream a GPT completion, yielding the text accumulated so far after each chunk."""
    stream = openai.chat.completions.create(
        model=OPENAI_MODEL,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_prompt},
        ],
        stream=True,
    )

    response = ""
    for chunk in stream:
        if not chunk.choices:
            continue
        response += chunk.choices[0].delta.content or ""
        yield response


def stream_claude(user_prompt):
    """Stream a Claude completion, yielding the text accumulated so far after each chunk."""
    result = claude.messages.stream(
        model=CLAUDE_MODEL,
        max_tokens=2000,
        system=system_message,
        messages=[
            {"role": "user", "content": user_prompt}
        ]
    )
    reply = ""
    with result as stream:
        for text in stream.text_stream:
            reply += text
            yield reply


# Tokenizer and quantized model are loaded once and reused; the download and
# load would otherwise be repeated on every click of the button.
_LLAMA_CACHE = {}

_ASSISTANT_HEADER = "<|start_header_id|>assistant<|end_header_id|>"
_END_OF_TURN = "<|eot_id|>"


def _load_llama():
    """Return `(tokenizer, model)` for LLAMA, loading them on first use only."""
    if "model" not in _LLAMA_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(LLAMA)
        tokenizer.pad_token = tokenizer.eos_token

        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_quant_type="nf4"
        )

        model = AutoModelForCausalLM.from_pretrained(
            LLAMA,
            device_map="auto",
            quantization_config=quant_config
        )
        _LLAMA_CACHE["tokenizer"] = tokenizer
        _LLAMA_CACHE["model"] = model
    return _LLAMA_CACHE["tokenizer"], _LLAMA_CACHE["model"]


def stream_llama(user_prompt):
    """
    Stream a reply from the locally loaded Llama model.

    Yields the text accumulated so far. Output is ignored until the assistant
    header has been seen and stops at the end-of-turn marker.
    """
    try:
        messages = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_prompt},
        ]

        tokenizer, model = _load_llama()

        inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device)
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)

        thread = threading.Thread(target=model.generate, kwargs={
            "input_ids": inputs,
            "max_new_tokens": 1000,
            "pad_token_id": tokenizer.eos_token_id,
            "streamer": streamer
        })
        thread.start()

        started = False
        reply = ""

        for new_text in streamer:
            if not started:
                if _ASSISTANT_HEADER not in new_text:
                    continue
                started = True
                # Drop only the blank lines that follow the header; trailing
                # whitespace belongs to the reply.
                new_text = new_text.split(_ASSISTANT_HEADER)[-1].lstrip()

            if _END_OF_TURN in new_text:
                new_text = new_text.split(_END_OF_TURN)[0]
                if new_text:
                    reply += new_text
                    yield reply
                break

            # Whitespace-only chunks are kept: they carry the newlines and
            # indentation of the generated Python code.
            if new_text:
                reply += new_text
                yield reply

    except Exception as e:
        print(f"LLaMA error: {e}")
        raise


def stream_code_qwen(user_prompt):
    """Stream a reply from the CodeQwen inference endpoint, yielding the text so far."""
    tokenizer = AutoTokenizer.from_pretrained(code_qwen)
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_prompt},
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    client = InferenceClient(CODE_QWEN_URL, token=hf_token)
    stream = client.text_generation(text, stream=True, details=True, max_new_tokens=3000)
    result = ""
    for r in stream:
        result += r.token.text
        yield result


def generate_from_inputs(model, **input_data):
    """
    Build the user prompt from `input_data` and stream the chosen model's reply.

    `model` is one of "GPT", "Claude", "Llama", "Code Qwen"; anything else
    raises ValueError("Unknown model") when the generator is first advanced.
    """
    user_prompt_str = user_prompt(**input_data)

    streamers = {
        "GPT": stream_gpt,
        "Claude": stream_claude,
        "Llama": stream_llama,
        "Code Qwen": stream_code_qwen,
    }
    if model not in streamers:
        raise ValueError("Unknown model")

    yield from streamers[model](user_prompt_str)


def handle_generate(business_problem, dataset_type, dataset_format, num_samples, model):
    """Gradio callback: collect the form fields and stream the generated code."""
    input_data = {
        "business_problem": business_problem,
        "dataset_type": dataset_type,
        "output_format": dataset_format,
        "num_samples": num_samples,
    }

    yield from generate_from_inputs(model, **input_data)


# A python-tagged fence, any letter case, tolerant of trailing spaces and of
# CRLF line endings after the tag. The captured group is the code only.
_PYTHON_FENCE = re.compile(r"```(?:python3?|py)\b[ \t]*\r?\n?(.*?)```", re.DOTALL | re.IGNORECASE)


def extract_code(text):
    """
    Return the contents of the first ```python fenced block in `text`.

    Returns "" (after printing a message) when no such block exists.
    """
    match = _PYTHON_FENCE.search(text or "")
    if not match:
        print("No matching substring found.")
        return ""
    return match.group(1).strip()


def execute_code_in_virtualenv(text, python_interpreter=sys.executable, timeout=None):
    """
    Run the Python code found in `text` in a separate interpreter process.

    Args:
        text: model output containing a ```python fenced block.
        python_interpreter: interpreter used for the subprocess.
        timeout: optional limit in seconds; None (default) waits indefinitely.

    Returns:
        The program's stdout on success, otherwise a message that starts with
        "Execution error:" and carries the subprocess's stderr.

    Raises:
        EnvironmentError: if `python_interpreter` is empty.
    """
    if not python_interpreter:
        raise EnvironmentError("Python interpreter not found in the specified virtual environment.")

    code_str = extract_code(text)
    if not code_str:
        # Running an empty program would "succeed" and show nothing.
        return "No Python code block found in the generated output."

    command = [python_interpreter, '-c', code_str]

    try:
        result = subprocess.run(command, check=True, capture_output=True, text=True, timeout=timeout)
        return result.stdout

    except subprocess.CalledProcessError as e:
        # stderr holds the traceback; str(e) only repeats the command line.
        details = e.stderr.strip() if e.stderr else str(e)
        return f"Execution error:\n{details}"

    except subprocess.TimeoutExpired:
        return f"Execution error:\nTimed out after {timeout} seconds."


def update_output_format(dataset_type):
    """Restrict the output-format choices to what the selected dataset type offers."""
    if dataset_type in ["Tabular", "Time-series"]:
        return gr.update(choices=["JSON", "csv"], value="JSON")
    elif dataset_type == "Text":
        return gr.update(choices=["JSON"], value="JSON")


with gr.Blocks() as ui:
    gr.Markdown("## Create a dataset for a business problem")

    with gr.Column():
        business_problem = gr.Textbox(label="Business problem", lines=2)
        dataset_type = gr.Dropdown(
            ["Tabular", "Time-series", "Text"], label="Dataset type"
        )

        output_format = gr.Dropdown(choices=["JSON", "csv"], value="JSON", label="Output Format")

        num_samples = gr.Number(label="Number of samples", value=10, precision=0)

        model = gr.Dropdown(["GPT", "Claude", "Llama", "Code Qwen"], label="Select model", value="GPT")

        dataset_type.change(update_output_format, inputs=[dataset_type], outputs=[output_format])

    with gr.Row():
        with gr.Column():
            dataset_run = gr.Button("Create a dataset")
            gr.Markdown("""⚠️ For Llama and Code Qwen: The generated code might not be optimal. It's recommended to review it before execution.
            Some mistakes may occur.""")

        with gr.Column():
            code_run = gr.Button("Execute code for a dataset")
            gr.Markdown("""⚠️ Be cautious when sharing this app with code execution publicly, as it could pose safety risks.
            The execution of user-generated code may lead to potential vulnerabilities, and it’s important to use this tool responsibly.""")

    with gr.Row():
        dataset_out = gr.Textbox(label="Generated Dataset")
        code_out = gr.Textbox(label="Executed code")

    dataset_run.click(
        handle_generate,
        inputs=[business_problem, dataset_type, output_format, num_samples, model],
        outputs=[dataset_out]
    )

    code_run.click(
        execute_code_in_virtualenv,
        inputs=[dataset_out],
        outputs=[code_out]
    )

ui.launch(inbrowser=True)
4, + "nbformat_minor": 4 +} From 01b33b4edeb640666685169dda9a3ea7e3aacc20 Mon Sep 17 00:00:00 2001 From: lisekarimi Date: Thu, 5 Jun 2025 17:19:54 +0200 Subject: [PATCH 22/23] Add week2 contributions --- .../04_tribot_debate.ipynb | 429 ++++++++++++++ .../05_weathermate_ai_agent.ipynb | 557 ++++++++++++++++++ 2 files changed, 986 insertions(+) create mode 100644 week2/community-contributions/04_tribot_debate.ipynb create mode 100644 week2/community-contributions/05_weathermate_ai_agent.ipynb diff --git a/week2/community-contributions/04_tribot_debate.ipynb b/week2/community-contributions/04_tribot_debate.ipynb new file mode 100644 index 0000000..3fddadf --- /dev/null +++ b/week2/community-contributions/04_tribot_debate.ipynb @@ -0,0 +1,429 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "559ec769-087c-4c38-a6e4-4732f4ffb261", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "# TriBot Debate\n", + "---\n", + "\n", + "This notebook sets up a **three-bot chat system** where GPT (polite & humorous) 🎭, Claude (argumentative & snarky) 🔥, and DeepSeek (logical & analytical) 💡 engage in conversations with distinct personalities.\n", + "\n", + "- 🧑‍💻 **Skill Level:** Advanced \n", + "- 🎯 **Purpose:** Simulate diverse conversational styles for debate, analysis, and entertainment\n", + "\n", + "🛠️ Requirements\n", + "- ⚙️ Hardware: ✅ CPU is sufficient — no GPU required\n", + "- 🔑 OpenAI API Key\n", + "- 🔑 Anthropic API Key (Claude)\n", + "- 🔑 Deepseek API Key\n", + " \n", + "🔧 Customizable by user\n", + "- Selected model: GPT / Claude / Deepseek\n", + "- System_prompt\n", + "- Starter sentences for each bot\n", + "- `max_turns` to control the number of responses in the conversation\n", + "\n", + "---\n", + "📢 Find more LLM notebooks on my [GitHub repository](https://github.com/lisekarimi/lexo)" + ] + }, + { + "cell_type": "markdown", + "id": "fe78fae0", + "metadata": {}, + "source": [ + "## 📘 Class Diagram\n", + 
"![](https://github.com/lisekarimi/lexo/blob/main/assets/04_3bot_class_diagram.png?raw=true)" + ] + }, + { + "cell_type": "markdown", + "id": "62a2f5ca-7d89-4ba7-b342-277452beb2f5", + "metadata": {}, + "source": [ + "## 📚 Imports & Keys" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce67806a-3e3b-426d-b442-c3bca2e3dda2", + "metadata": {}, + "outputs": [], + "source": [ + "from dotenv import load_dotenv\n", + "import os\n", + "import random\n", + "import anthropic\n", + "from openai import OpenAI\n", + "from IPython.display import display, Markdown, update_display" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd2613d2-b675-4633-aedf-37ea2a1f0234", + "metadata": {}, + "outputs": [], + "source": [ + "# Load environment variables from .env file\n", + "load_dotenv(override=True)\n", + "openai_api_key = os.getenv('OPENAI_API_KEY')\n", + "anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')\n", + "deepseek_api_key = os.getenv('DEEPSEEK_API_KEY')\n", + "\n", + "if openai_api_key:\n", + " print(\"✅ OpenAI API Key is set.\")\n", + "else:\n", + " print(\"❌ OpenAI API Key not set.\")\n", + "\n", + "if anthropic_api_key:\n", + " print(\"✅ Anthropic API Key is set.\")\n", + "else:\n", + " print(\"❌ Anthropic API Key not set.\")\n", + "\n", + "if deepseek_api_key:\n", + " print(\"✅ Deepseek API Key is set.\")\n", + "else:\n", + " print(\"❌ Deepseek API Key not set.\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb8d5d01-04b9-44e8-a713-da36e6dd9be1", + "metadata": {}, + "outputs": [], + "source": [ + "# Establishe connection with the chatbot APIs\n", + "\n", + "# OpenAI API Client\n", + "openai = OpenAI()\n", + "\n", + "# Anthropic API Client\n", + "claude = anthropic.Anthropic()\n", + "\n", + "# DeepSeek using OpenAI-compatible API\n", + "deepseek_client = OpenAI(\n", + " api_key=deepseek_api_key,\n", + " base_url=\"https://api.deepseek.com\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": 
# We're using cheap versions of models so the costs will be minimal
GPT_MODEL = "gpt-4o-mini"
CLAUDE_MODEL = "claude-3-haiku-20240307"
DEEPSEEK_MODEL = "deepseek-chat"

MAX_TURNS = 6  # Dynamic, can be adjusted by the user

# System Prompts
GPT_SYSTEM = "You are a very polite, courteous chatbot. You try to agree with \
everything the other person says, or find common ground. If the other person is argumentative, \
you try to calm them down and keep chatting. Avoid questions like 'How can I assist you?' or 'How can I help you?' \
and dive directly into the conversation. Be less verbose, don't talk too much. \
Go straight to the point, don't beat around the bush. Keep the conversation light, fun, and engaging with a touch of humor. \
Throw in witty remarks, playful jokes, and entertaining responses when appropriate to keep things lively."

CLAUDE_SYSTEM = "You are a chatbot who is very argumentative; \
you disagree with anything in the conversation and you challenge everything, in a snarky way. \
Avoid questions like 'How can I assist you?' or 'How can I help you?' \
and dive directly into the conversation. Be less verbose, don't talk too much. \
Go straight to the point, don't beat around the bush."

DEEPSEEK_SYSTEM = "You are a highly logical and analytical chatbot. You break down \
arguments with precise reasoning, focusing on facts and logic over emotions. You stay neutral \
and detached, always pointing out inconsistencies or flaws in reasoning. \
Avoid questions like 'How can I assist you?' or 'How can I help you?' \
and dive directly into the conversation. Be less verbose, don't talk too much. \
Go straight to the point, don't beat around the bush."

# Define emojis for each bot
BOT_EMOJIS = {
    "GPT": "🎭",
    "Claude": "🔥",
    "Deepseek": "💡"
}

# Starter Messages
STARTER_GPT = "Hey there! Let’s chat—serious debates, silly topics, or why cats rule the world. Your call!"
STARTER_CLAUDE = "Hello. Got an argument? Fine. Try me, but be ready—I won’t just agree."
STARTER_DEEPSEEK = "Hi! Let’s dive into a focused discussion. What topic do you want to analyze?"


def _role_content_pairs(history, system_prompt=None):
    """Copy role/content out of each history entry, optionally behind a leading system turn."""
    turns = []
    if system_prompt is not None:
        turns.append({"role": "system", "content": system_prompt})
    for entry in history:
        turns.append({"role": entry["role"], "content": entry["content"]})
    return turns


class Chatbot:
    """Base type for a debate participant: stores its identity and prompts."""

    def __init__(self, name, model, system_prompt, starter_message):
        self.name = name
        self.model = model
        self.system_prompt = system_prompt
        self.starter_message = starter_message

    def reply(self, message_history):
        """Override this method in subclasses for specific chatbot behaviors."""
        raise NotImplementedError("Subclasses must implement this method.")


class GPTBot(Chatbot):
    def reply(self, message_history):
        """
        Start a streamed OpenAI chat completion for the conversation so far.

        Returns the stream object, or an "Error in GPT response: ..." string
        if anything raised while preparing or sending the request.
        """
        try:
            # The system prompt goes in as the first message of the request.
            request_messages = _role_content_pairs(message_history, self.system_prompt)
            return openai.chat.completions.create(
                model=self.model,
                messages=request_messages,
                temperature=0.4,
                max_tokens=200,
                stream=True
            )
        except Exception as err:
            return f"Error in GPT response: {err}"


class ClaudeBot(Chatbot):
    def reply(self, message_history):
        """
        Create an Anthropic message stream for the conversation so far.

        Returns what `claude.messages.stream(...)` returns, or an
        "Error in Claude response: ..." string if anything raised here.
        """
        try:
            # Anthropic takes the system prompt as its own argument, so the
            # message list holds only the conversation turns.
            conversation = _role_content_pairs(message_history)
            return claude.messages.stream(
                model=self.model,
                max_tokens=1000,
                system=self.system_prompt,
                messages=conversation
            )
        except Exception as err:
            return f"Error in Claude response: {err}"


class DeepseekBot(Chatbot):
    def reply(self, message_history):
        """
        Start a streamed DeepSeek completion through the OpenAI-compatible client.

        Returns the stream object, or an "Error in DeepSeek response: ..."
        string if anything raised while preparing or sending the request.
        """
        try:
            request_messages = _role_content_pairs(message_history, self.system_prompt)
            return deepseek_client.chat.completions.create(
                model=self.model,
                messages=request_messages,
                max_tokens=200,
                stream=True
            )
        except Exception as err:
            return f"Error in DeepSeek response: {err}"
__init__(self, bots, max_turns=MAX_TURNS):\n", + " self.bots = bots # List of chatbot instances\n", + " self.max_turns = max_turns\n", + " self.message_history = []\n", + " self.current_bot = random.choice(self.bots) # Random starting bot\n", + "\n", + " def conversation(self):\n", + " \"\"\"Manages the chat loop up to max_turns.\"\"\"\n", + "\n", + " # Stream the first message as \"user\" role\n", + " emoji = BOT_EMOJIS.get(self.current_bot.name, \"🤖\") # Default emoji if not found\n", + " response = f\"{emoji} **{self.current_bot.name}:** \\n\"\n", + " display_handle = display(Markdown(response), display_id=True)\n", + "\n", + " for char in self.current_bot.starter_message:\n", + " update_display(Markdown(response + char), display_id=display_handle.display_id)\n", + " response += char\n", + "\n", + " # Store first message as \"user\" role\n", + " self.message_history.append({\"role\": \"assistant\", \"content\": self.current_bot.starter_message})\n", + "\n", + " print(\"\\n--------------\\n\") # Fancy separator\n", + "\n", + " for _ in range(self.max_turns - 1): # Already sent 1 message\n", + " self.current_bot = self._choose_next_bot()\n", + "\n", + " # Alternate roles while ensuring last role is always \"user\"\n", + " for i in range(len(self.message_history)):\n", + " self.message_history[i][\"role\"] = \"user\" if i % 2 == 0 else \"assistant\"\n", + "\n", + " # Ensure the last role is \"user\" before sending to the bot\n", + " if self.message_history[-1][\"role\"] != \"user\":\n", + " self.message_history[-1][\"role\"] = \"user\"\n", + "\n", + " # Pass only the message history to the bot and Get bot's response\n", + " response_stream = self.current_bot.reply(self.message_history)\n", + "\n", + " # Get the correct emoji for the bot\n", + " emoji = BOT_EMOJIS.get(self.current_bot.name, \"🤖\")\n", + "\n", + " # Display bot name separately before streaming starts\n", + " bot_header = f\"{emoji} **{self.current_bot.name}:** \\n\"\n", + " display_handle = 
display(Markdown(bot_header), display_id=True)\n", + "\n", + " # **Initialize response content separately (exclude bot name)**\n", + " response_content = \"\"\n", + "\n", + " if isinstance(self.current_bot, GPTBot) or isinstance(self.current_bot, DeepseekBot):\n", + " # Handle OpenAI GPT & DeepSeek\n", + " for chunk in response_stream:\n", + " new_text = chunk.choices[0].delta.content or '' # Get new streamed text\n", + " response_content += new_text # Append new content\n", + "\n", + " # Clean Markdown artifacts\n", + " response_content = response_content.replace(\"```\", \"\").replace(\"markdown\", \"\")\n", + "\n", + " # Update the content, without duplicating the bot name\n", + " update_display(Markdown(bot_header + response_content), display_id=display_handle.display_id)\n", + "\n", + " elif isinstance(self.current_bot, ClaudeBot):\n", + " # Handle Claude differently\n", + " with response_stream as stream:\n", + " for text in stream.text_stream:\n", + " response_content += text or '' # Append new streamed text\n", + " # Clean Markdown artifacts\n", + " response_content = response_content.replace(\"```\", \"\").replace(\"markdown\", \"\")\n", + "\n", + " update_display(Markdown(bot_header + response_content), display_id=display_handle.display_id)\n", + "\n", + " print(\"\\n--------------\\n\") # Fancy separator\n", + "\n", + " # Store bot response\n", + " self.message_history.append({\"role\": \"assistant\", \"content\": response_content})\n", + "\n", + "\n", + " def _choose_next_bot(self):\n", + " \"\"\"Selects the next bot dynamically (avoiding immediate self-replies).\"\"\"\n", + " available_bots = [bot for bot in self.bots if bot != self.current_bot]\n", + " return random.choice(available_bots)" + ] + }, + { + "cell_type": "markdown", + "id": "5d8fe072", + "metadata": {}, + "source": [ + "## 🗨️ Chat Engine" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca192335-31e0-4849-878a-da29069f90be", + "metadata": {}, + "outputs": [], + 
"source": [ + "def main():\n", + " # Initialize chatbot instances\n", + " gpt_bot = GPTBot(\"GPT\", GPT_MODEL, GPT_SYSTEM, STARTER_GPT)\n", + " claude_bot = ClaudeBot(\"Claude\", CLAUDE_MODEL, CLAUDE_SYSTEM, STARTER_CLAUDE)\n", + " deepseek_bot = DeepseekBot(\"Deepseek\", DEEPSEEK_MODEL, DEEPSEEK_SYSTEM, STARTER_DEEPSEEK)\n", + "\n", + " # Create chat manager with all bots\n", + " chat_manager = ChatManager([gpt_bot, claude_bot, deepseek_bot], max_turns=MAX_TURNS)\n", + " # chat_manager = ChatManager([gpt_bot, claude_bot], max_turns=MAX_TURNS)\n", + "\n", + " # Start the conversation\n", + " chat_manager.conversation()\n", + "\n", + "# Ensures the script runs only when executed directly\n", + "if __name__ == \"__main__\":\n", + " main()" + ] + }, + { + "attachments": { + "image.png": { + "image/png": "" + } + }, + "cell_type": "markdown", + "id": "a8dcd130", + "metadata": {}, + "source": [ + "![image.png](attachment:image.png)" + ] + }, + { + "cell_type": "markdown", + "id": "1a086c45", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/week2/community-contributions/05_weathermate_ai_agent.ipynb b/week2/community-contributions/05_weathermate_ai_agent.ipynb new file mode 100644 index 0000000..0f6502a --- /dev/null +++ b/week2/community-contributions/05_weathermate_ai_agent.ipynb @@ -0,0 +1,557 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "ae1ef804-3504-488d-af86-5a0da36fea78", + "metadata": {}, + "source": [ + "# ☀️🏃‍♀️ WeatherMate\n", + "----\n", + "\n", + "**WeatherMate** is a conversational **AI agent** that analyzes 
real-time weather conditions and suggests the best activities and events based on location. Whether it's sunny, rainy, or snowy, WeatherMate helps you make the most of your day! \n", + "\n", + "Here's how it works:\n", + "1. Get current weather conditions for the user's location.\n", + "2. Recommend suitable indoor or outdoor activities based on the weather.\n", + "3. Find relevant events using the Ticketmaster API.\n", + "4. Merge both activity suggestions and events into a single, structured response.\n", + "\n", + "---\n", + "\n", + "Large Language Models (LLMs), by themselves, cannot fetch real-time data such as weather information. To enable LLMs to access and use such real-time data, we integrate **external tools.** \n", + "\n", + "In this notebook, we will implement a weather API, allowing the assistant to fetch real-time weather information and use it for personalized activity suggestions based on current weather conditions. This is an essential step in transforming an LLM into a more interactive and data-driven AI assistant.\n", + "\n", + "\n", + "In this notebook, we will develop a conversational AI Agent that helps users receive personalized activity recommendations based on real-time weather data.\n", + "\n", + "- 🧑‍💻 Skill Level: Advanced\n", + "- 📤 Output Format: conversational chat\n", + "- 🚀 Tools:\n", + " - Weather API integration \n", + " - Ticketmaster API\n", + " - OpenAI with external tool handling\n", + " - Gradio for the UI\n", + "\n", + "🛠️ Requirements\n", + "- ⚙️ Hardware: ✅ CPU is sufficient — no GPU required\n", + "- 🔑 OpenAI API Key\n", + "- 🔑 Weather API integration (https://www.weatherapi.com)\n", + "- 🔑 Ticketmaster API (https://developer.ticketmaster.com/explore/)\n", + "\n", + "⚙️ Customizable by user\n", + "- 🤖 Selected model\n", + "- 📜 system_prompt: Controls model behavior\n", + "\n", + "---\n", + "📢 Find more LLM notebooks on my [GitHub repository](https://github.com/lisekarimi/lexo)" + ] + }, + { + "cell_type": "markdown", + 
"id": "ad262788", + "metadata": {}, + "source": [ + "**Class Diagram**\n", + "\n", + "![](https://github.com/lisekarimi/lexo/blob/main/assets/05_weather_class_diagram.png?raw=true)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6b7a492-f510-4ba4-bbc3-239675d389dd", + "metadata": {}, + "outputs": [], + "source": [ + "# imports\n", + "\n", + "import os\n", + "import json\n", + "import requests\n", + "from dotenv import load_dotenv\n", + "from openai import OpenAI\n", + "import gradio as gr\n", + "from datetime import datetime\n", + "\n", + "# Initialization\n", + "\n", + "load_dotenv(override=True)\n", + "\n", + "openai_api_key = os.getenv('OPENAI_API_KEY')\n", + "if not openai_api_key:\n", + " print(\"❌ OpenAI API Key is missing!\")\n", + "\n", + "weather_api_key = os.getenv('WEATHERAPI_KEY')\n", + "if not weather_api_key:\n", + " print(\"❌ Weather API Key is missing!\")\n", + "\n", + "ticketmaster_api_key = os.getenv('TICKETMASTER_KEY')\n", + "if not ticketmaster_api_key:\n", + " print(\"❌ TicketMaster API Key is missing!\")\n", + "\n", + "\n", + "MODEL = \"gpt-4o-mini\"\n", + "openai = OpenAI()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "347dbe00-5826-4aa6-9d2c-9d028fc33ec8", + "metadata": {}, + "outputs": [], + "source": [ + "# Get today's date and day name\n", + "today_str = datetime.today().strftime('%Y-%m-%d')\n", + "day_name = datetime.today().strftime('%A')\n", + "\n", + "nb_activity = 10\n", + "\n", + "\n", + "system_message = f\"\"\"\n", + "You are a fun and helpful assistant for an Activity Suggestion App.\n", + "Your job is to recommend **up to {nb_activity} activities** based on the real-time weather fetched from the API, ensuring a mix of **indoor, outdoor, and event-based activities** whenever possible.\n", + "\n", + "The total must always be **10 or fewer**, following this rule:\n", + "**nb_events + nb_indoors + nb_outdoors ≤ 10**.\n", + "\n", + "You must **analyze and think carefully** to 
determine the best combination of activities and events for the user.\n", + "- Evaluate **weather conditions** to decide if outdoor activities are suitable.\n", + "- Check **event availability** and select the most relevant ones.\n", + "- Balance **indoor, outdoor, and event-based activities** dynamically to provide the best experience.\n", + "\n", + "If one of these categories is unavailable, that's fine—just provide the best possible suggestions without exceeding **10 activities**.\n", + "Deliver everything **in one go—no waiting!**\n", + "\n", + "\n", + "### **Understanding Relative Dates**\n", + "- Always interpret relative dates based on **{today_str} ({day_name})**.\n", + "- The weekend always refers to Saturday and Sunday.\n", + "- \"Next {day_name}\" should refer to the **closest upcoming occurrence** of that day.\n", + "- If the user asks for a time range (e.g., \"the next 3 days\"), calculate the **exact date range** starting from today.\n", + "- If no specific date is mentioned, **assume today by default**.\n", + "- **Do not ask for confirmation** when interpreting dates—just assume the correct date and proceed confidently unless there's real ambiguity.\n", + "\n", + "### **Activity and Event Suggestion Process**\n", + "To provide the best {nb_activity} activity recommendations, follow these steps:\n", + "Step 1: Retrieve Weather Data – Use the Weather API to get current conditions for the user's location.\n", + "Step 2: Suggest Activities – Recommend suitable indoor or outdoor activities based on the weather.\n", + "Step 3: Fetch Events (if available) – Use the Ticketmaster API to find relevant events in the user’s area.\n", + "Step 4: Combine Everything – Merge both event listings and activity suggestions into a single, well-structured response.\n", + "This entire process should be done seamlessly in one go without making the user wait.\n", + "\n", + "### **How to Handle Each API**\n", + "- **Weather API Handling**:\n", + " - If the user requests a 
relative date (e.g., \"tomorrow,\" \"next Monday\"), calculate the number of days from today.\n", + " - Provide the weather forecast only for the requested date, ignoring any other days in the response.\n", + " - If no weather data is available, inform the user in a friendly, light-hearted way.\n", + " - The forecast is limited to 14 days, so if the user requests a longer period, politely let him know.\n", + "\n", + "- **Ticketmaster API Handling**:\n", + " - If the user asks for events today, set the start date as today’s date.\n", + " - If the user asks for any specific weekday, find the next occurrence of that day and use it as the start date.\n", + " - If the user asks for a range of days (e.g., \"the next 3 days\"), use today’s date as the start date.\n", + " - The country corresponding to the user's city must be represented using the ISO Alpha-2 Code (e.g., FR for France, US for the United States, CA for Canada, DK for Denmark).\n", + " - If more than 5 events are found, ask the user for their interests to refine the search, using a one-word keyword like 'music,' 'cinema,' or 'theater.'\n", + " - If no events are found, explicitly inform the user in a friendly, funny way.\n", + " - Do not mention Ticketmaster unless necessary; simply state that you are checking for events.\n", + "\n", + "### **User Interaction Rules**\n", + "- If the user **doesn’t mention a city**, **ask them to provide one**.\n", + "- If an event search fails, do **not** mention Ticketmaster; simply say that no events were found.\n", + "- Ensure all activity suggestions are provided **in one response**, combining weather-based activities and event suggestions.\n", + "\n", + "\n", + "### **Event Formatting in Output**\n", + "**If Ticketmaster events are available**, format the output as follows:\n", + "Here are some events that may interest you:\n", + "**Event Name**:\n", + "- 📅 Date: Give the date like 19th March 2025\n", + "- 📍 Venue:\n", + "- 🔗 Ticket Link: Put the URL here\n", + "\n", + 
"(And don't forget to separate these gems with a snazzy divider)\n", + "\n", + "**Event Name**:\n", + "- 📅 Date: Give the date like 19th March 2025\n", + "- 📍 Venue:\n", + "- 🔗 Ticket Link: Put the URL here\n", + "\n", + "(Another divider, because we like to keep things fresh!)\n", + "\n", + "**Event Name**:\n", + "- 📅 Date: Give the date like 19th March 2025\n", + "- 📍 Venue:\n", + "- 🔗 Ticket Link: Put the URL here\n", + "\n", + "### **Tone and Style**\n", + "**Keep it short, fun, and don’t forget to add a dash of humor!**\n", + "Your job is to keep the user smiling while giving them the **best activities for the day**.\n", + "Be **accurate and concise**, but let’s keep it **light and lively!** 🎉\n", + "\"\"\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "578da33d-be38-4c75-8a96-9d6bfc1af99b", + "metadata": {}, + "outputs": [], + "source": [ + "class WeatherAPI:\n", + " def get_weather(self, city: str, days: int) -> dict:\n", + " \"\"\"Fetches weather data for the given city for the next 'days' number of days.\"\"\"\n", + " url = \"https://api.weatherapi.com/v1/forecast.json\"\n", + " params = {\"key\": weather_api_key, \"q\": city, \"days\": days}\n", + " # print(f\"params weather: {params}\")\n", + " response = requests.get(url, params=params)\n", + "\n", + " if response.status_code == 200:\n", + " data = response.json()\n", + " forecast = []\n", + " for day in data[\"forecast\"][\"forecastday\"]:\n", + " forecast.append({\n", + " \"date\": day[\"date\"],\n", + " \"temp\": day[\"day\"][\"avgtemp_c\"]\n", + " })\n", + "\n", + " result = {\n", + " \"city\": city,\n", + " \"forecast\": forecast\n", + " }\n", + " return result\n", + " else:\n", + " return {\"error\": f\"City '{city}' not found or other issue. 
Please check the city name and try again.\"}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "305f9f18-8556-4b49-9f6b-4a2233eefae9", + "metadata": {}, + "outputs": [], + "source": [ + "from abc import ABC, abstractmethod\n", + "\n", + "class BaseEventAPI(ABC):\n", + " @abstractmethod\n", + " def get_events(self, city, country_code, keywords, size):\n", + " \"\"\"Fetches upcoming events from an event provider.\"\"\"\n", + " pass # Subclasses must implement this method\n", + "\n", + "class TicketmasterAPI(BaseEventAPI):\n", + " def get_events(self, city, country_code, keywords, start_date):\n", + " \"\"\"Fetches upcoming events from Ticketmaster for a given city.\"\"\"\n", + " url = \"https://app.ticketmaster.com/discovery/v2/events.json\"\n", + " params = {\n", + " \"apikey\": ticketmaster_api_key,\n", + " \"city\": city,\n", + " \"countryCode\": country_code,\n", + " \"keyword\": \",\".join(keywords),\n", + " \"size\": 10,\n", + " \"startDateTime\": start_date\n", + " }\n", + "\n", + " response = requests.get(url, params=params)\n", + "\n", + " if response.status_code == 200:\n", + " data = response.json()\n", + " events = data.get(\"_embedded\", {}).get(\"events\", [])\n", + " return [\n", + " {\n", + " \"name\": event[\"name\"],\n", + " \"date\": event[\"dates\"][\"start\"][\"localDate\"],\n", + " \"venue\": event[\"_embedded\"][\"venues\"][0][\"name\"],\n", + " \"url\": event.get(\"url\", \"N/A\") # Using .get() to avoid KeyError\n", + " }\n", + " for event in events\n", + " ] if events else []\n", + " else:\n", + " return {\"error\": f\"API request failed! 
Status: {response.status_code}, Response: {response.text}\"}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c60820f-4e9f-4851-8330-52c8fd676259", + "metadata": {}, + "outputs": [], + "source": [ + "class ChatAssistant:\n", + " def __init__(self):\n", + " self.model = MODEL\n", + " self.tools = [\n", + " {\n", + " \"type\": \"function\",\n", + " \"function\": {\n", + " \"name\": \"get_weather\",\n", + " \"description\": \"Get the current weather and forecast for the destination city.\",\n", + " \"parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"city\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"The city for which the weather is being requested.\"\n", + " },\n", + " \"days\": {\n", + " \"type\": \"integer\",\n", + " \"description\": \"The number of days for the weather forecast (can be 1, 2, 6, or 10).\"\n", + " }\n", + " },\n", + " \"required\": [\"city\", \"days\"],\n", + " \"additionalProperties\": False\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"type\": \"function\",\n", + " \"function\": {\n", + " \"name\": \"get_ticketmaster_events\",\n", + " \"description\": \"Fetch upcoming events from Ticketmaster.\",\n", + " \"parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"city\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"City where the events are searched.\"\n", + " },\n", + " \"country_code\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"Country code for filtering results.\"\n", + " },\n", + " \"keywords\": {\n", + " \"type\": \"array\",\n", + " \"items\": {\n", + " \"type\": \"string\"\n", + " },\n", + " \"description\": \"Optional keywords for event search (e.g., 'music', 'concert').\"\n", + " },\n", + " \"size\": {\n", + " \"type\": \"integer\",\n", + " \"description\": \"Number of events to fetch.\"\n", + " },\n", + " \"start_date\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"Start date for the event 
search.\"\n", + " }\n", + " },\n", + " \"required\": [\"city\", \"country_code\", \"size\", \"start_date\"],\n", + " \"additionalProperties\": False\n", + " }\n", + " }\n", + " }\n", + " ]\n", + "\n", + " def chat(self, user_message, history, weather_api, event_apis):\n", + " # Build the conversation\n", + " messages = [{\"role\": \"system\", \"content\": system_message}] + history + [{\"role\": \"user\", \"content\": user_message}]\n", + "\n", + " # OpenAI response\n", + " response = openai.chat.completions.create(model=self.model, messages=messages, tools=self.tools, stream=True)\n", + "\n", + " recovered_pieces = {\n", + " \"content\": None,\n", + " \"role\": \"assistant\",\n", + " \"tool_calls\": {}\n", + " }\n", + " last_tool_calls = {}\n", + " has_tool_call = False\n", + " result = \"\" # Initialize result accumulator\n", + " # previous_index = None # Track the last processed index\n", + "\n", + " for chunk in response:\n", + " delta = chunk.choices[0].delta\n", + " finish_reason = chunk.choices[0].finish_reason\n", + "\n", + " # Handle tool call detection\n", + " if delta.tool_calls and finish_reason in [None, \"tool_calls\"]:\n", + " has_tool_call = True\n", + " piece = delta.tool_calls[0] # Get the first piece in the tool call\n", + "\n", + " # Create a dictionary for the tool call if it doesn't exist yet\n", + " recovered_pieces[\"tool_calls\"][piece.index] = recovered_pieces[\"tool_calls\"].get(\n", + " piece.index, {\"id\": None, \"function\": {\"arguments\": \"\", \"name\": \"\"}, \"type\": \"function\"}\n", + " )\n", + "\n", + " if piece.id:\n", + " recovered_pieces[\"tool_calls\"][piece.index][\"id\"] = piece.id\n", + " if piece.function.name:\n", + " recovered_pieces[\"tool_calls\"][piece.index][\"function\"][\"name\"] = piece.function.name\n", + " recovered_pieces[\"tool_calls\"][piece.index][\"function\"][\"arguments\"] += piece.function.arguments\n", + "\n", + " # Store the tool call in the dictionary by index\n", + " last_tool_calls[piece.index] 
= recovered_pieces[\"tool_calls\"][piece.index]\n", + "\n", + " # Store content in result and yield\n", + " else:\n", + " result += delta.content or \"\"\n", + " if result.strip():\n", + " yield result\n", + "\n", + "\n", + " # Handle tool call scenario\n", + " if has_tool_call:\n", + " # Handle the tool calls\n", + " response = self.handle_tool_call(last_tool_calls, weather_api, event_apis)\n", + "\n", + " if response: # Only iterate if response is not None\n", + " tool_calls_list = [tool_call for tool_call in last_tool_calls.values()]\n", + " messages.append({\"role\": \"assistant\", \"tool_calls\": tool_calls_list}) # Append the tool calls to the messages\n", + "\n", + " # Dynamically process each tool call response and append it to the message history\n", + " for res in response:\n", + " messages.append({\n", + " \"role\": \"tool\",\n", + " \"tool_call_id\": res[\"tool_call_id\"],\n", + " \"content\": json.dumps(res[\"content\"])\n", + " })\n", + "\n", + " # New OpenAI request with tool response\n", + " response = openai.chat.completions.create(model=self.model, messages=messages, stream=True)\n", + "\n", + " result = \"\" # Reset result before second stream\n", + " for chunk in response:\n", + " result += chunk.choices[0].delta.content or \"\"\n", + " if result.strip():\n", + " yield result\n", + "\n", + "\n", + " def handle_tool_call(self, tool_call, weather_api, event_apis):\n", + " stored_values = {} # Dictionary to store the valid value for each field\n", + "\n", + " for index, call in tool_call.items():\n", + " # Load the arguments for each tool call dynamically\n", + " arguments = json.loads(call[\"function\"][\"arguments\"])\n", + "\n", + " # Iterate over all keys dynamically\n", + " for key, value in arguments.items():\n", + " # Update the field if it's currently None or hasn't been set before\n", + " if key not in stored_values or stored_values[key] is None:\n", + " stored_values[key] = value\n", + "\n", + " city = stored_values.get('city')\n", + " 
days = stored_values.get('days')\n", + " country_code = stored_values.get('country_code')\n", + " keywords = stored_values.get('keywords', [])\n", + " # size = stored_values.get('size')\n", + " start_date = stored_values.get('start_date')\n", + " start_date = str(start_date) + \"T00:00:00Z\"\n", + "\n", + " weather_data = None\n", + " event_data = None\n", + "\n", + " # Iteration over tool_call\n", + " for call in tool_call.values():\n", + " if call[\"function\"][\"name\"] == \"get_weather\":\n", + " weather_data = weather_api.get_weather(city, days)\n", + "\n", + " if call[\"function\"][\"name\"] == \"get_ticketmaster_events\":\n", + " event_data = event_apis[\"ticketmaster\"].get_events(city, country_code, keywords, start_date)\n", + "\n", + " responses = []\n", + "\n", + " # Ensure weather response is always included\n", + " weather_tool_call_id = next((call[\"id\"] for call in tool_call.values() if call[\"function\"][\"name\"] == \"get_weather\"), None)\n", + " if weather_data and \"forecast\" in weather_data:\n", + " responses.append({\n", + " \"role\": \"assistant\",\n", + " \"content\": {\"weather\": weather_data[\"forecast\"]},\n", + " \"tool_call_id\": weather_tool_call_id\n", + " })\n", + " elif weather_tool_call_id:\n", + " responses.append({\n", + " \"role\": \"assistant\",\n", + " \"content\": {\"message\": \"No weather data available for this location.\"},\n", + " \"tool_call_id\": weather_tool_call_id\n", + " })\n", + "\n", + " # Ensure event response is always included\n", + " event_tool_call_id = next((call[\"id\"] for call in tool_call.values() if call[\"function\"][\"name\"] == \"get_ticketmaster_events\"), None)\n", + " if event_data:\n", + " responses.append({\n", + " \"role\": \"assistant\",\n", + " \"content\": {\"events\": event_data},\n", + " \"tool_call_id\": event_tool_call_id\n", + " })\n", + " elif event_tool_call_id:\n", + " responses.append({\n", + " \"role\": \"assistant\",\n", + " \"content\": {\"message\": \"No events found for 
this location.\"},\n", + " \"tool_call_id\": event_tool_call_id\n", + " })\n", + "\n", + " # print(\"Final responses:\", responses)\n", + " return responses\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "191a3a9e-95e1-4ca6-8992-4a5bafb9b8ff", + "metadata": {}, + "outputs": [], + "source": [ + "# GradioInterface class to handle the Gradio UI\n", + "class GradioInterface:\n", + " def __init__(self, activity_assistant):\n", + " self.activity_assistant = activity_assistant\n", + "\n", + " def launch(self):\n", + " # Gradio chat interface\n", + " gr.ChatInterface(fn=self.activity_assistant.chat, type=\"messages\").launch()\n", + "\n", + "# ActivityAssistant setup\n", + "class ActivityAssistant:\n", + " def __init__(self):\n", + " self.weather_api = WeatherAPI() # Interact with the Weather API\n", + " self.event_apis = { # Interact with the Events API\n", + " \"ticketmaster\": TicketmasterAPI()\n", + " }\n", + " self.chat_assistant = ChatAssistant() # This will handle conversation with OpenAI\n", + "\n", + " def chat(self, user_message, history):\n", + " # Forward the user message and conversation history to ChatAssistant\n", + " response_stream = self.chat_assistant.chat(user_message, history, self.weather_api, self.event_apis)\n", + " for chunk in response_stream:\n", + " yield chunk" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b501e8e-2e10-4ab7-b523-1d4b8ad358e8", + "metadata": {}, + "outputs": [], + "source": [ + "# Main execution\n", + "if __name__ == \"__main__\":\n", + " activity_assistant = ActivityAssistant()\n", + " gradio_interface = GradioInterface(activity_assistant)\n", + " gradio_interface.launch()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + 
"nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 7edfba7dca0156cee124ab1409a184647fdcb468 Mon Sep 17 00:00:00 2001 From: Simon <134164156+simondb94@users.noreply.github.com> Date: Fri, 6 Jun 2025 03:27:08 +0100 Subject: [PATCH 23/23] Add files via upload Improvements made. --- .../simondb94-Improved-LLM-Tutor-.ipynb | 1449 +++++++++++++++++ 1 file changed, 1449 insertions(+) create mode 100644 week1/community-contributions/simondb94-Improved-LLM-Tutor-.ipynb diff --git a/week1/community-contributions/simondb94-Improved-LLM-Tutor-.ipynb b/week1/community-contributions/simondb94-Improved-LLM-Tutor-.ipynb new file mode 100644 index 0000000..dab89a9 --- /dev/null +++ b/week1/community-contributions/simondb94-Improved-LLM-Tutor-.ipynb @@ -0,0 +1,1449 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "fe12c203-e6a6-452c-a655-afb8a03a4ff5", + "metadata": {}, + "source": [ + "Improved-LLM-Tutor" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "c1070317-3ed9-4659-abe3-828943230e03", + "metadata": {}, + "outputs": [], + "source": [ + "# Standard library imports\n", + "import os\n", + "import time\n", + "import json\n", + "from typing import Dict, List, Any, Optional, Union, Callable\n", + "\n", + "# Third-party imports\n", + "from dotenv import load_dotenv\n", + "from IPython.display import Markdown, display, HTML, update_display\n", + "from openai import OpenAI\n", + "import ollama\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Try to import rich, install if not available\n", + "try:\n", + " from rich.console import Console\n", + " from rich.markdown import Markdown as RichMarkdown\n", + " from rich.panel import Panel\n", + "except ImportError:\n", + " !pip install rich\n", + " from rich.console import Console\n", + " from rich.markdown import Markdown as RichMarkdown\n", + " from rich.panel import Panel\n" + ] + }, + 
# Constants
MODEL_GPT = 'gpt-4o-mini'
MODEL_LLAMA = 'llama3.2'
DEFAULT_SYSTEM_PROMPT = "You are a helpful technical tutor who answers questions about python code, software engineering, data science and LLMs"

# Set up environment
load_dotenv()
openai = OpenAI()
console = Console()


class LLMTutor:
    """
    A class that provides tutoring functionality using multiple LLM models.

    The same question is sent to an OpenAI chat model and to a local Ollama
    model. Each call to `ask` appends one entry to `self.history` of the form
    ``{'question': ..., 'timestamp': ..., 'responses': {model_key: text}}``,
    and every successful model call appends its wall-clock duration (seconds)
    to `self.response_times[model_key]`.
    """

    # Model keys queried by `ask` when the caller does not pass `models`.
    DEFAULT_MODELS = ('gpt', 'llama')

    def __init__(self,
                 system_prompt: str = DEFAULT_SYSTEM_PROMPT,
                 gpt_model: str = MODEL_GPT,
                 llama_model: str = MODEL_LLAMA):
        """
        Initialize the LLM Tutor with specified models and system prompt.

        Args:
            system_prompt: The system prompt to use for the LLMs
            gpt_model: The OpenAI GPT model to use
            llama_model: The Ollama model to use
        """
        self.system_prompt = system_prompt
        self.gpt_model = gpt_model
        self.llama_model = llama_model
        self.history: List[Dict[str, Any]] = []
        self.response_times: Dict[str, List[float]] = {'gpt': [], 'llama': []}

    def format_question(self, question: str) -> str:
        """
        Format the user's question with a standard prefix.

        Args:
            question: The user's question

        Returns:
            Formatted question with prefix
        """
        return f"Please give a detailed explanation to the following question: {question}"

    def create_messages(self, question: str) -> List[Dict[str, str]]:
        """
        Create the message structure for LLM API calls.

        Args:
            question: The user's question

        Returns:
            A two-element list: the system prompt message followed by the
            user message containing the formatted question.
        """
        formatted_question = self.format_question(question)
        return [
            {"role": "system", "content": self.system_prompt},
            {"role": "user", "content": formatted_question}
        ]

    def get_gpt_response(self,
                         question: str,
                         stream: bool = True) -> str:
        """
        Get a response from the GPT model.

        Args:
            question: The user's question
            stream: Whether to stream the response into the notebook output

        Returns:
            The model's response as a string. On failure the error is printed
            and a string of the form ``"Error: <message>"`` is returned
            instead of raising.
        """
        messages = self.create_messages(question)

        if stream:
            # The streaming helper does its own timing and error reporting.
            return self._stream_gpt_response(messages)

        start_time = time.time()
        try:
            response = openai.chat.completions.create(
                model=self.gpt_model,
                messages=messages
            )
            self.response_times['gpt'].append(time.time() - start_time)
            return response.choices[0].message.content
        except Exception as e:
            console.print(f"[bold red]Error with GPT model:[/bold red] {e}")
            return f"Error: {e}"

    def _stream_gpt_response(self, messages: List[Dict[str, str]]) -> str:
        """
        Stream a response from the GPT model, updating a Markdown display live.

        Args:
            messages: The messages to send to the model

        Returns:
            The complete, unmodified response as a string (code fences are
            only stripped from the live display, not from the return value).
            On failure the error is printed and ``"Error: <message>"`` is
            returned.
        """
        start_time = time.time()
        try:
            stream = openai.chat.completions.create(
                model=self.gpt_model,
                messages=messages,
                stream=True
            )

            response = ""
            display_handle = display(Markdown(""), display_id=True)

            for chunk in stream:
                # Guard against chunks that carry no choices (e.g. a trailing
                # usage-only chunk); indexing [0] on those would raise.
                if not chunk.choices:
                    continue
                delta_content = chunk.choices[0].delta.content or ''
                if not delta_content:
                    # Nothing new to show; skip the redundant re-render.
                    continue
                response += delta_content
                # Clean the response for display: drop the ```python language
                # tag and every ``` fence so the partial text renders as prose.
                clean_response = response.replace("```python", "```").replace("```", "")
                update_display(Markdown(clean_response), display_id=display_handle.display_id)

            self.response_times['gpt'].append(time.time() - start_time)
            return response
        except Exception as e:
            console.print(f"[bold red]Error streaming GPT response:[/bold red] {e}")
            return f"Error: {e}"

    def get_llama_response(self, question: str) -> str:
        """
        Get a response from the Llama model via `ollama.chat`.

        Args:
            question: The user's question

        Returns:
            The model's response as a string, or ``"Error: <message>"`` if
            the call failed (the error is also printed).
        """
        messages = self.create_messages(question)
        start_time = time.time()

        try:
            response = ollama.chat(model=self.llama_model, messages=messages)
            self.response_times['llama'].append(time.time() - start_time)
            return response['message']['content']
        except Exception as e:
            console.print(f"[bold red]Error with Llama model:[/bold red] {e}")
            return f"Error: {e}"

    def ask(self, question: str, models: Optional[List[str]] = None) -> Dict[str, str]:
        """
        Ask a question to one or more models.

        Args:
            question: The user's question
            models: List of models to query ('gpt', 'llama', or both).
                Defaults to both when omitted or None.

        Returns:
            Dictionary mapping each queried model key to its response text
        """
        # Resolve the default here instead of using a mutable list as the
        # parameter default.
        selected_models = self.DEFAULT_MODELS if models is None else models
        responses: Dict[str, str] = {}

        # Store the question in history; `entry['responses']` is filled in
        # below as each model answers.
        entry: Dict[str, Any] = {
            'question': question,
            'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
            'responses': {}
        }
        self.history.append(entry)

        # Get responses from requested models
        if 'gpt' in selected_models:
            console.print(Panel(f"[bold blue]Getting response from {self.gpt_model}...[/bold blue]"))
            gpt_response = self.get_gpt_response(question)
            responses['gpt'] = gpt_response
            entry['responses']['gpt'] = gpt_response

        if 'llama' in selected_models:
            console.print(Panel(f"[bold green]Getting response from {self.llama_model}...[/bold green]"))
            llama_response = self.get_llama_response(question)
            responses['llama'] = llama_response
            entry['responses']['llama'] = llama_response
            # The GPT answer is rendered while streaming; the Llama answer is
            # not streamed, so render it once here.
            display(Markdown(f"## {self.llama_model} Response\n{llama_response}"))

        return responses

    def compare_responses(self, question: Optional[str] = None) -> None:
        """
        Compare responses from different models side by side.

        Args:
            question: Optional specific question to compare responses for.
                When omitted, the most recent entry in `self.history` is used;
                if the history is empty a message is printed and nothing is
                displayed.
        """
        # Local stdlib import so this cell does not depend on editing the
        # notebook's import cell.
        from html import escape

        if question:
            responses = self.ask(question)
        else:
            # Use the most recent question from history
            if not self.history:
                console.print("[bold red]No questions in history to compare[/bold red]")
                return
            responses = self.history[-1]['responses']

        # Model output is arbitrary text and frequently contains '<', '>' and
        # '&' (code samples, generics, HTML). Escape it before embedding so it
        # is shown literally instead of being parsed as markup.
        gpt_text = escape(str(responses.get('gpt', 'No response')))
        llama_text = escape(str(responses.get('llama', 'No response')))

        # NOTE(review): the original inline markup/styling was not legible in
        # the reviewed diff; this is a minimal two-column reconstruction.
        # Confirm against the intended styling.
        column_style = "flex: 1; min-width: 0; padding: 10px; border: 1px solid #ddd; border-radius: 5px;"
        body_style = "white-space: pre-wrap; word-wrap: break-word; margin: 0;"
        comparison_html = f"""
        <div style="display: flex; gap: 20px;">
            <div style="{column_style}">
                <h3>{escape(str(self.gpt_model))}</h3>
                <pre style="{body_style}">{gpt_text}</pre>
            </div>
            <div style="{column_style}">
                <h3>{escape(str(self.llama_model))}</h3>
                <pre style="{body_style}">{llama_text}</pre>
            </div>
        </div>
        """
        display(HTML(comparison_html))

    def show_performance_metrics(self) -> None:
        """
        Display performance metrics for the models.

        Prints mean/min/max/count of the recorded response times per model and
        draws a box plot of the distribution next to a bar chart of the means.
        Prints a notice and returns if no timings have been recorded.
        """
        if not any(self.response_times.values()):
            console.print("[bold yellow]No performance data available yet[/bold yellow]")
            return

        # One row per recorded call.
        records = [
            {'Model': model, 'Response Time (s)': elapsed}
            for model, times in self.response_times.items()
            for elapsed in times
        ]
        df = pd.DataFrame(records, columns=['Model', 'Response Time (s)'])

        # Calculate statistics
        stats = df.groupby('Model')['Response Time (s)'].agg(['mean', 'min', 'max', 'count'])

        # Display statistics
        console.print("\n[bold]Performance Statistics:[/bold]")
        console.print(stats)

        # Create visualization with explicit axes rather than pyplot's
        # implicit "current axes" state.
        fig, (ax_box, ax_bar) = plt.subplots(1, 2, figsize=(10, 6))

        # Box plot
        df.boxplot(column='Response Time (s)', by='Model', ax=ax_box)
        ax_box.set_title('Response Time Distribution')
        # `boxplot(by=...)` adds an automatic figure-level title; clear it.
        fig.suptitle('')

        # Bar chart for average times
        stats['mean'].plot(kind='bar', ax=ax_bar, color=['#4285F4', '#34A853'])
        ax_bar.set_title('Average Response Time')
        ax_bar.set_ylabel('Seconds')

        fig.tight_layout()
        plt.show()

    def save_history(self, filename: str = 'tutor_history.json') -> None:
        """
        Save the question and response history to a file as indented JSON.

        Errors are printed rather than raised.

        Args:
            filename: The filename to save to
        """
        try:
            # Explicit encoding so the file reads back the same on every OS.
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(self.history, f, indent=2)
            console.print(f"[bold green]History saved to {filename}[/bold green]")
        except Exception as e:
            console.print(f"[bold red]Error saving history:[/bold red] {e}")

    def load_history(self, filename: str = 'tutor_history.json') -> None:
        """
        Load question and response history from a file, replacing `self.history`.

        A missing file or any other load error is printed rather than raised,
        and `self.history` is left unchanged in that case.

        Args:
            filename: The filename to load from
        """
        try:
            with open(filename, 'r', encoding='utf-8') as f:
                self.history = json.load(f)
            console.print(f"[bold green]History loaded from {filename}[/bold green]")
        except FileNotFoundError:
            console.print(f"[bold yellow]History file {filename} not found[/bold yellow]")
        except Exception as e:
            console.print(f"[bold red]Error loading history:[/bold red] {e}")
LLM Tutor initialized successfully!\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;32mLLM Tutor initialized successfully!\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "# Create a tutor instance\n", + "tutor = LLMTutor()\n", + "console.print(\"[bold green]LLM Tutor initialized successfully![/bold green]\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "25a36470-a68f-40f6-bea1-d2ebb173c015", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n",
+       " Question:                                                                                                       \n",
+       "                                                                                                                 \n",
+       " Given a list of dictionaries called 'books', write code to find and print all information                       \n",
+       " about the book titled 'Mastery' by Robert Greene.                                                               \n",
+       "                                                                                                                 \n",
+       "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[34m╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\u001b[0m\n", + "\u001b[34m│\u001b[0m \u001b[1mQuestion:\u001b[0m \u001b[34m│\u001b[0m\n", + "\u001b[34m│\u001b[0m \u001b[34m│\u001b[0m\n", + "\u001b[34m│\u001b[0m Given a list of dictionaries called 'books', write code to find and print all information \u001b[34m│\u001b[0m\n", + "\u001b[34m│\u001b[0m about the book titled 'Mastery' by Robert Greene. \u001b[34m│\u001b[0m\n", + "\u001b[34m│\u001b[0m \u001b[34m│\u001b[0m\n", + "\u001b[34m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "# Define your question here\n", + "question = \"\"\"\n", + "Given a list of dictionaries called 'books', write code to find and print all information \n", + "about the book titled 'Mastery' by Robert Greene.\n", + "\"\"\"\n", + "\n", + "console.print(Panel(f\"[bold]Question:[/bold]\\n{question}\", border_style=\"blue\"))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "bceaeaf9-4d08-4380-b757-597b851dd8ca", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n",
+       "│ Getting response from gpt-4o-mini...                                                                            │\n",
+       "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n", + "│ \u001b[1;34mGetting response from gpt-4o-mini...\u001b[0m │\n", + "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "To find and print all information about the book titled \"Mastery\" by Robert Greene from a list of dictionaries called `books`, we can write a function that iterates through the list, checks for the specific title and author, and then prints the information if a match is found. Here's a step-by-step explanation followed by the code.\n", + "\n", + "### Steps to Follow:\n", + "\n", + "1. **Structure of the Data**: \n", + " Each book in the `books` list is a dictionary. We need to understand how the book's information is structured. A typical dictionary might look like this:\n", + " \n", + " {\n", + " 'title': 'Mastery',\n", + " 'author': 'Robert Greene',\n", + " 'year': 2012,\n", + " 'genre': 'Non-fiction',\n", + " 'isbn': '978-0143124177'\n", + " }\n", + " \n", + "\n", + "2. **Iterate through the List**:\n", + " We will use a loop to go through each book in the `books` list. \n", + "\n", + "3. **Check for Conditions**:\n", + " For each book (dictionary), we need to check if the 'title' is 'Mastery' and the 'author' is 'Robert Greene'. \n", + "\n", + "4. 
**Print the Details**: \n", + " If we find a match, we will print all the details of that book.\n", + "\n", + "### Example Code\n", + "\n", + "Here’s a Python code snippet that accomplishes this:\n", + "\n", + "\n", + "# Sample list of dictionaries representing books\n", + "books = [\n", + " {'title': 'Mastery', 'author': 'Robert Greene', 'year': 2012, 'genre': 'Non-fiction', 'isbn': '978-0143124177'},\n", + " {'title': 'The 48 Laws of Power', 'author': 'Robert Greene', 'year': 1998, 'genre': 'Non-fiction', 'isbn': '978-0140280197'},\n", + " {'title': 'The Art of War', 'author': 'Sun Tzu', 'year': '5th century BC', 'genre': 'Philosophy', 'isbn': '978-1590302255'}\n", + "]\n", + "\n", + "# Function to find and print information about the book titled 'Mastery' by Robert Greene\n", + "def find_book(books):\n", + " for book in books:\n", + " # Check if the title and author match\n", + " if book.get('title') == 'Mastery' and book.get('author') == 'Robert Greene':\n", + " # Print the entire dictionary if a match is found\n", + " print(\"Found book:\")\n", + " for key, value in book.items():\n", + " print(f\"{key}: {value}\")\n", + " return # Exit the function after finding the book\n", + " print(\"Book not found.\") # Optional: Print if the book is not in the list\n", + "\n", + "# Call the function\n", + "find_book(books)\n", + "\n", + "\n", + "### Explanation of the Code:\n", + "\n", + "1. **Data Structure**: The `books` variable is initialized as a list containing dictionary elements, where each dictionary represents a book.\n", + "\n", + "2. **Function Definition**: The function `find_book(books)` takes the list of books as an argument.\n", + "\n", + "3. **Iteration**: The `for` loop iterates over each book in the `books` list.\n", + "\n", + "4. **Finding the Match**: It checks if the title and author of the current book (retrieved using the `get` method to avoid `KeyError`) match 'Mastery' and 'Robert Greene'.\n", + "\n", + "5. 
**Printing Details**: If a match is found, it prints out the key-value pairs from the dictionary in a formatted manner.\n", + "\n", + "6. **Exit after Finding**: The `return` statement ensures that the function exits as soon as the book is found.\n", + "\n", + "7. **Not Found Condition**: If no book matches the criteria, it prints \"Book not found.\"\n", + "\n", + "### Conclusion\n", + "This method is efficient for small to moderately sized lists of dictionaries. If you have a very large dataset, consider using more efficient search algorithms or data structures like dictionaries for faster lookups, but the above approach should work well for typical use cases." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n",
+       "│ Getting response from llama3.2...                                                                               │\n",
+       "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n", + "│ \u001b[1;32mGetting response from llama3.2...\u001b[0m │\n", + "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "## llama3.2 Response\n", + "Here's an example of how you can achieve this using Python:\n", + "\n", + "**Problem Statement**\n", + "\n", + "Given a list of dictionaries called `books`, where each dictionary represents a book with its title, author, publication year, etc., write code to find and print all information about the book titled `'Mastery'` by Robert Greene.\n", + "\n", + "**Example Input Data**\n", + "```python\n", + "books = [\n", + " {'title': 'Mastery', 'author': 'Robert Greene', 'publication_year': 2012, 'genre': 'Self-Help'},\n", + " {'title': 'The 48 Laws of Power', 'author': 'Robert Greene', 'publication_year': 2007, 'genre': 'Non-Fiction'},\n", + " {'title': 'To Kill a Mockingbird', 'author': 'Harper Lee', 'publication_year': 1960, 'genre': 'Classic Fiction'},\n", + " {'title': 'Mastery', 'author': 'Robert Greene', 'publication_year': 2018, 'genre': 'Self-Help'} # duplicate title\n", + "]\n", + "```\n", + "**Solution**\n", + "\n", + "Here's the Python code that finds and prints all information about the book titled `'Mastery'` by Robert Greene:\n", + "```python\n", + "# Define a function to find books with a specific title and author\n", + "def find_book(books, title, author):\n", + " \"\"\"\n", + " Find all books in the list that match the given title and author.\n", + "\n", + " Args:\n", + " books (list): List of dictionaries representing books.\n", + " title (str): Title of the book to search for.\n", + " author (str): Author of the book to search for.\n", + "\n", + " Returns:\n", + " list: List of dictionaries 
representing the found books.\n", + " \"\"\"\n", + " return [book for book in books if book['title'] == title and book['author'] == author]\n", + "\n", + "# Define a function to print book information\n", + "def print_book_info(book):\n", + " \"\"\"\n", + " Print all information about a single book.\n", + "\n", + " Args:\n", + " book (dict): Dictionary representing the book.\n", + " \"\"\"\n", + " print(f\"Title: {book['title']}\")\n", + " print(f\"Author: {book['author']}\")\n", + " print(f\"Publication Year: {book['publication_year']}\")\n", + " print(f\"Genre: {book['genre']}\\n\")\n", + "\n", + "# Find and print information about the book titled 'Mastery' by Robert Greene\n", + "target_title = \"Mastery\"\n", + "target_author = \"Robert Greene\"\n", + "\n", + "found_books = find_book(books, target_title, target_author)\n", + "\n", + "if found_books:\n", + " for i, book in enumerate(found_books):\n", + " print(f\"Book {i+1}:\")\n", + " print_book_info(book)\n", + "else:\n", + " print(f\"No books found with title '{target_title}' by author '{target_author}'.\")\n", + "```\n", + "**Explanation**\n", + "\n", + "The solution consists of two functions:\n", + "\n", + "1. `find_book`: This function takes a list of dictionaries representing books, as well as the title and author to search for. It uses a list comprehension to find all books that match the given criteria and returns them.\n", + "2. `print_book_info`: This function takes a single dictionary representing a book and prints its information.\n", + "\n", + "In the example code, we define the `books` list with some sample data. We then call the `find_book` function to find all books with the title `'Mastery'` by Robert Greene. 
If found books are returned, we iterate over them and print their information using the `print_book_info` function.\n", + "\n", + "Note that if there are duplicate titles in the input data, only one book will be returned by the `find_book` function, as dictionaries cannot have duplicate keys." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "# Get responses from both models\n", + "responses = tutor.ask(question)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "60ce7000-a4a5-4cce-a261-e75ef45063b4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "

gpt-4o-mini

\n", + "
To find and print all information about the book titled \"Mastery\" by Robert Greene from a list of dictionaries called `books`, we can write a function that iterates through the list, checks for the specific title and author, and then prints the information if a match is found. Here's a step-by-step explanation followed by the code.\n", + "\n", + "### Steps to Follow:\n", + "\n", + "1. **Structure of the Data**: \n", + " Each book in the `books` list is a dictionary. We need to understand how the book's information is structured. A typical dictionary might look like this:\n", + " ```python\n", + " {\n", + " 'title': 'Mastery',\n", + " 'author': 'Robert Greene',\n", + " 'year': 2012,\n", + " 'genre': 'Non-fiction',\n", + " 'isbn': '978-0143124177'\n", + " }\n", + " ```\n", + "\n", + "2. **Iterate through the List**:\n", + " We will use a loop to go through each book in the `books` list. \n", + "\n", + "3. **Check for Conditions**:\n", + " For each book (dictionary), we need to check if the 'title' is 'Mastery' and the 'author' is 'Robert Greene'. \n", + "\n", + "4. 
**Print the Details**: \n", + " If we find a match, we will print all the details of that book.\n", + "\n", + "### Example Code\n", + "\n", + "Here’s a Python code snippet that accomplishes this:\n", + "\n", + "```python\n", + "# Sample list of dictionaries representing books\n", + "books = [\n", + " {'title': 'Mastery', 'author': 'Robert Greene', 'year': 2012, 'genre': 'Non-fiction', 'isbn': '978-0143124177'},\n", + " {'title': 'The 48 Laws of Power', 'author': 'Robert Greene', 'year': 1998, 'genre': 'Non-fiction', 'isbn': '978-0140280197'},\n", + " {'title': 'The Art of War', 'author': 'Sun Tzu', 'year': '5th century BC', 'genre': 'Philosophy', 'isbn': '978-1590302255'}\n", + "]\n", + "\n", + "# Function to find and print information about the book titled 'Mastery' by Robert Greene\n", + "def find_book(books):\n", + " for book in books:\n", + " # Check if the title and author match\n", + " if book.get('title') == 'Mastery' and book.get('author') == 'Robert Greene':\n", + " # Print the entire dictionary if a match is found\n", + " print(\"Found book:\")\n", + " for key, value in book.items():\n", + " print(f\"{key}: {value}\")\n", + " return # Exit the function after finding the book\n", + " print(\"Book not found.\") # Optional: Print if the book is not in the list\n", + "\n", + "# Call the function\n", + "find_book(books)\n", + "```\n", + "\n", + "### Explanation of the Code:\n", + "\n", + "1. **Data Structure**: The `books` variable is initialized as a list containing dictionary elements, where each dictionary represents a book.\n", + "\n", + "2. **Function Definition**: The function `find_book(books)` takes the list of books as an argument.\n", + "\n", + "3. **Iteration**: The `for` loop iterates over each book in the `books` list.\n", + "\n", + "4. **Finding the Match**: It checks if the title and author of the current book (retrieved using the `get` method to avoid `KeyError`) match 'Mastery' and 'Robert Greene'.\n", + "\n", + "5. 
**Printing Details**: If a match is found, it prints out the key-value pairs from the dictionary in a formatted manner.\n", + "\n", + "6. **Exit after Finding**: The `return` statement ensures that the function exits as soon as the book is found.\n", + "\n", + "7. **Not Found Condition**: If no book matches the criteria, it prints \"Book not found.\"\n", + "\n", + "### Conclusion\n", + "This method is efficient for small to moderately sized lists of dictionaries. If you have a very large dataset, consider using more efficient search algorithms or data structures like dictionaries for faster lookups, but the above approach should work well for typical use cases.
\n", + "
\n", + "
\n", + "

llama3.2

\n", + "
Here's an example of how you can achieve this using Python:\n", + "\n", + "**Problem Statement**\n", + "\n", + "Given a list of dictionaries called `books`, where each dictionary represents a book with its title, author, publication year, etc., write code to find and print all information about the book titled `'Mastery'` by Robert Greene.\n", + "\n", + "**Example Input Data**\n", + "```python\n", + "books = [\n", + " {'title': 'Mastery', 'author': 'Robert Greene', 'publication_year': 2012, 'genre': 'Self-Help'},\n", + " {'title': 'The 48 Laws of Power', 'author': 'Robert Greene', 'publication_year': 2007, 'genre': 'Non-Fiction'},\n", + " {'title': 'To Kill a Mockingbird', 'author': 'Harper Lee', 'publication_year': 1960, 'genre': 'Classic Fiction'},\n", + " {'title': 'Mastery', 'author': 'Robert Greene', 'publication_year': 2018, 'genre': 'Self-Help'} # duplicate title\n", + "]\n", + "```\n", + "**Solution**\n", + "\n", + "Here's the Python code that finds and prints all information about the book titled `'Mastery'` by Robert Greene:\n", + "```python\n", + "# Define a function to find books with a specific title and author\n", + "def find_book(books, title, author):\n", + " \"\"\"\n", + " Find all books in the list that match the given title and author.\n", + "\n", + " Args:\n", + " books (list): List of dictionaries representing books.\n", + " title (str): Title of the book to search for.\n", + " author (str): Author of the book to search for.\n", + "\n", + " Returns:\n", + " list: List of dictionaries representing the found books.\n", + " \"\"\"\n", + " return [book for book in books if book['title'] == title and book['author'] == author]\n", + "\n", + "# Define a function to print book information\n", + "def print_book_info(book):\n", + " \"\"\"\n", + " Print all information about a single book.\n", + "\n", + " Args:\n", + " book (dict): Dictionary representing the book.\n", + " \"\"\"\n", + " print(f\"Title: {book['title']}\")\n", + " print(f\"Author: 
{book['author']}\")\n", + " print(f\"Publication Year: {book['publication_year']}\")\n", + " print(f\"Genre: {book['genre']}\\n\")\n", + "\n", + "# Find and print information about the book titled 'Mastery' by Robert Greene\n", + "target_title = \"Mastery\"\n", + "target_author = \"Robert Greene\"\n", + "\n", + "found_books = find_book(books, target_title, target_author)\n", + "\n", + "if found_books:\n", + " for i, book in enumerate(found_books):\n", + " print(f\"Book {i+1}:\")\n", + " print_book_info(book)\n", + "else:\n", + " print(f\"No books found with title '{target_title}' by author '{target_author}'.\")\n", + "```\n", + "**Explanation**\n", + "\n", + "The solution consists of two functions:\n", + "\n", + "1. `find_book`: This function takes a list of dictionaries representing books, as well as the title and author to search for. It uses a list comprehension to find all books that match the given criteria and returns them.\n", + "2. `print_book_info`: This function takes a single dictionary representing a book and prints its information.\n", + "\n", + "In the example code, we define the `books` list with some sample data. We then call the `find_book` function to find all books with the title `'Mastery'` by Robert Greene. If found books are returned, we iterate over them and print their information using the `print_book_info` function.\n", + "\n", + "Note that if there are duplicate titles in the input data, only one book will be returned by the `find_book` function, as dictionaries cannot have duplicate keys.
\n", + "
\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "# Compare responses side by side\n", + "tutor.compare_responses()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "f00c09c3-1728-442b-94f1-548255fb95b4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n",
+       "Performance Statistics:\n",
+       "
\n" + ], + "text/plain": [ + "\n", + "\u001b[1mPerformance Statistics:\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
            mean        min        max  count\n",
+       "Model                                        \n",
+       "gpt    14.672200  14.672200  14.672200      1\n",
+       "llama  79.891858  79.891858  79.891858      1\n",
+       "
\n" + ], + "text/plain": [ + " mean min max count\n", + "Model \n", + "gpt \u001b[1;36m14.672200\u001b[0m \u001b[1;36m14.672200\u001b[0m \u001b[1;36m14.672200\u001b[0m \u001b[1;36m1\u001b[0m\n", + "llama \u001b[1;36m79.891858\u001b[0m \u001b[1;36m79.891858\u001b[0m \u001b[1;36m79.891858\u001b[0m \u001b[1;36m1\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "# Show performance metrics\n", + "tutor.show_performance_metrics()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "41f5122e-96e9-4fda-9b4f-e8cf4caff552", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
History saved to my_tutor_session.json\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;32mHistory saved to my_tutor_session.json\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "# Save history to a file\n", + "tutor.save_history(\"my_tutor_session.json\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "4aa6afbf-1cc1-4ed1-a65f-14ee02ce278f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n",
+       " New Question:                                                                                                   \n",
+       " Explain how to implement a binary search algorithm in Python.                                                   \n",
+       "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[32m╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\u001b[0m\n", + "\u001b[32m│\u001b[0m \u001b[1mNew Question:\u001b[0m \u001b[32m│\u001b[0m\n", + "\u001b[32m│\u001b[0m Explain how to implement a binary search algorithm in Python. \u001b[32m│\u001b[0m\n", + "\u001b[32m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n",
+       "│ Getting response from gpt-4o-mini...                                                                            │\n",
+       "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n", + "│ \u001b[1;34mGetting response from gpt-4o-mini...\u001b[0m │\n", + "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "Binary search is an efficient algorithm for finding a target value within a sorted list. It works by repeatedly dividing the search interval in half. If the target value is less than the element in the middle of the interval, the search continues on the lower half; otherwise, it continues on the upper half. This process is continued until the target value is found or the search interval is empty.\n", + "\n", + "Here's a detailed explanation and implementation of the binary search algorithm in Python:\n", + "\n", + "### Step-by-Step Implementation\n", + "\n", + "1. **Prerequisites**:\n", + " - Ensure the input list is sorted. Binary search can only be performed on a sorted list.\n", + " \n", + "2. **Set Initial Variables**:\n", + " - Define two pointers, `low` and `high`, which represent the starting and ending indices of the search range in the list.\n", + "\n", + "3. **Calculate the Middle Index**:\n", + " - Use the formula `mid = (low + high) // 2` to find the middle index.\n", + "\n", + "4. **Comparison**:\n", + " - Compare the middle element with the target:\n", + " - If the middle element is equal to the target, return the index of the middle element.\n", + " - If the target is less than the middle element, narrow the search to the left half by setting `high = mid - 1`.\n", + " - If the target is greater than the middle element, narrow the search to the right half by setting `low = mid + 1`.\n", + "\n", + "5. 
**Loop Until the Target is Found or the Interval is Empty**:\n", + " - Repeat the above steps until the `low` pointer exceeds the `high` pointer. If the target is not found, return a value indicating that the target is not present (commonly -1).\n", + "\n", + "### Implementation in Python\n", + "\n", + "Here’s a complete Python implementation of the binary search algorithm:\n", + "\n", + "\n", + "def binary_search(arr, target):\n", + " low = 0\n", + " high = len(arr) - 1\n", + "\n", + " while low <= high:\n", + " # Find the middle index\n", + " mid = (low + high) // 2\n", + " \n", + " # Check if the target is present at mid\n", + " if arr[mid] == target:\n", + " return mid # Target found, return the index\n", + " \n", + " # If the target is smaller than the mid element,\n", + " # it can only be present in the left subarray\n", + " elif arr[mid] > target:\n", + " high = mid - 1\n", + " \n", + " # If the target is larger than the mid element,\n", + " # it can only be present in the right subarray\n", + " else:\n", + " low = mid + 1\n", + "\n", + " # Target was not found\n", + " return -1\n", + "\n", + "\n", + "### Example Usage\n", + "\n", + "\n", + "# Example sorted list\n", + "arr = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]\n", + "target = 7\n", + "\n", + "# Perform binary search\n", + "result = binary_search(arr, target)\n", + "\n", + "if result != -1:\n", + " print(f'Target {target} found at index {result}.')\n", + "else:\n", + " print(f'Target {target} not found in the list.')\n", + "\n", + "\n", + "### Key Points\n", + "\n", + "1. **Time Complexity**: The time complexity of binary search is O(log n), where n is the number of elements in the array. This is significantly more efficient than a linear search, which has a time complexity of O(n).\n", + " \n", + "2. **Space Complexity**: The space complexity of the binary search algorithm is O(1) for the iterative version, as it requires a fixed amount of space for variables.\n", + "\n", + "3. 
**Iterative vs Recursive**: The above implementation is iterative, which is generally preferred for binary search due to its efficiency and avoidance of recursion limits. However, a recursive implementation can also be done:\n", + "\n", + "### Recursive Implementation\n", + "\n", + "\n", + "def binary_search_recursive(arr, target, low, high):\n", + " if low <= high:\n", + " mid = (low + high) // 2\n", + " \n", + " if arr[mid] == target:\n", + " return mid\n", + " elif arr[mid] > target:\n", + " return binary_search_recursive(arr, target, low, mid - 1)\n", + " else:\n", + " return binary_search_recursive(arr, target, mid + 1, high)\n", + " \n", + " return -1\n", + "\n", + "\n", + "### Conclusion\n", + "\n", + "Binary search is a fundamental searching technique that exploits the properties of sorted arrays. Its efficiency makes it a preferred method for searching when working with large datasets. Understanding its underlying algorithm and being able to implement it in Python is a valuable skill in software engineering and data science." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n",
+       "│ Getting response from llama3.2...                                                                               │\n",
+       "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n", + "│ \u001b[1;32mGetting response from llama3.2...\u001b[0m │\n", + "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "## llama3.2 Response\n", + "**Implementing Binary Search Algorithm in Python**\n", + "=====================================================\n", + "\n", + "Binary search is an efficient algorithm for finding an item from a sorted list of items. It works by repeatedly dividing in half the portion of the list that could contain the item, until you've narrowed down the possible locations to just one.\n", + "\n", + "Here's a step-by-step guide on how to implement binary search in Python:\n", + "\n", + "**Step 1: Define the Search Function**\n", + "-------------------------------------\n", + "\n", + "```python\n", + "def binary_search(arr, target):\n", + " \"\"\"\n", + " Searches for an element in a sorted array using binary search algorithm.\n", + " \n", + " Parameters:\n", + " arr (list): The sorted list of elements.\n", + " target: The element to be searched.\n", + " \n", + " Returns:\n", + " int: The index of the target element if found; otherwise, -1.\n", + " \"\"\"\n", + "```\n", + "\n", + "**Step 2: Initialize Variables**\n", + "---------------------------------\n", + "\n", + "```python\n", + " low = 0 # Index of the first element in the list\n", + " high = len(arr) - 1 # Index of the last element in the list\n", + "```\n", + "\n", + "**Step 3: Loop Until Found or Not Found**\n", + "-----------------------------------------\n", + "\n", + "```python\n", + " while low <= high:\n", + " mid = (low + high) // 2 # Calculate the middle index\n", + " \n", + " if arr[mid] == target:\n", + " return mid # Target found, return its index\n", + 
" \n", + " elif arr[mid] < target:\n", + " low = mid + 1 # Search in the right half\n", + " \n", + " else:\n", + " high = mid - 1 # Search in the left half\n", + "```\n", + "\n", + "**Step 4: Handle Edge Cases**\n", + "---------------------------\n", + "\n", + "```python\n", + " if low > high:\n", + " return -1 # Target not found, return -1\n", + "```\n", + "\n", + "**Putting it all Together**\n", + "-----------------------------\n", + "\n", + "Here's the complete binary search implementation in Python:\n", + "\n", + "```python\n", + "def binary_search(arr, target):\n", + " \"\"\"\n", + " Searches for an element in a sorted array using binary search algorithm.\n", + " \n", + " Parameters:\n", + " arr (list): The sorted list of elements.\n", + " target: The element to be searched.\n", + " \n", + " Returns:\n", + " int: The index of the target element if found; otherwise, -1.\n", + " \"\"\"\n", + " low = 0\n", + " high = len(arr) - 1\n", + "\n", + " while low <= high:\n", + " mid = (low + high) // 2\n", + " \n", + " if arr[mid] == target:\n", + " return mid\n", + " elif arr[mid] < target:\n", + " low = mid + 1\n", + " else:\n", + " high = mid - 1\n", + " \n", + " return -1\n", + "\n", + "# Example usage\n", + "arr = [2, 4, 6, 8, 10]\n", + "target = 6\n", + "index = binary_search(arr, target)\n", + "if index != -1:\n", + " print(f\"Target {target} found at index {index}\")\n", + "else:\n", + " print(\"Target not found\")\n", + "```\n", + "\n", + "**Time Complexity**\n", + "------------------\n", + "\n", + "The time complexity of binary search is O(log n), where n is the length of the input array. This makes it much faster than linear search (O(n)) for large datasets.\n", + "\n", + "I hope this explanation helps! Let me know if you have any further questions or need additional clarification." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "

gpt-4o-mini

\n", + "
Binary search is an efficient algorithm for finding a target value within a sorted list. It works by repeatedly dividing the search interval in half. If the target value is less than the element in the middle of the interval, the search continues on the lower half; otherwise, it continues on the upper half. This process is continued until the target value is found or the search interval is empty.\n", + "\n", + "Here's a detailed explanation and implementation of the binary search algorithm in Python:\n", + "\n", + "### Step-by-Step Implementation\n", + "\n", + "1. **Prerequisites**:\n", + " - Ensure the input list is sorted. Binary search can only be performed on a sorted list.\n", + " \n", + "2. **Set Initial Variables**:\n", + " - Define two pointers, `low` and `high`, which represent the starting and ending indices of the search range in the list.\n", + "\n", + "3. **Calculate the Middle Index**:\n", + " - Use the formula `mid = (low + high) // 2` to find the middle index.\n", + "\n", + "4. **Comparison**:\n", + " - Compare the middle element with the target:\n", + " - If the middle element is equal to the target, return the index of the middle element.\n", + " - If the target is less than the middle element, narrow the search to the left half by setting `high = mid - 1`.\n", + " - If the target is greater than the middle element, narrow the search to the right half by setting `low = mid + 1`.\n", + "\n", + "5. **Loop Until the Target is Found or the Interval is Empty**:\n", + " - Repeat the above steps until the `low` pointer exceeds the `high` pointer. 
If the target is not found, return a value indicating that the target is not present (commonly -1).\n", + "\n", + "### Implementation in Python\n", + "\n", + "Here’s a complete Python implementation of the binary search algorithm:\n", + "\n", + "```python\n", + "def binary_search(arr, target):\n", + " low = 0\n", + " high = len(arr) - 1\n", + "\n", + " while low <= high:\n", + " # Find the middle index\n", + " mid = (low + high) // 2\n", + " \n", + " # Check if the target is present at mid\n", + " if arr[mid] == target:\n", + " return mid # Target found, return the index\n", + " \n", + " # If the target is smaller than the mid element,\n", + " # it can only be present in the left subarray\n", + " elif arr[mid] > target:\n", + " high = mid - 1\n", + " \n", + " # If the target is larger than the mid element,\n", + " # it can only be present in the right subarray\n", + " else:\n", + " low = mid + 1\n", + "\n", + " # Target was not found\n", + " return -1\n", + "```\n", + "\n", + "### Example Usage\n", + "\n", + "```python\n", + "# Example sorted list\n", + "arr = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]\n", + "target = 7\n", + "\n", + "# Perform binary search\n", + "result = binary_search(arr, target)\n", + "\n", + "if result != -1:\n", + " print(f'Target {target} found at index {result}.')\n", + "else:\n", + " print(f'Target {target} not found in the list.')\n", + "```\n", + "\n", + "### Key Points\n", + "\n", + "1. **Time Complexity**: The time complexity of binary search is O(log n), where n is the number of elements in the array. This is significantly more efficient than a linear search, which has a time complexity of O(n).\n", + " \n", + "2. **Space Complexity**: The space complexity of the binary search algorithm is O(1) for the iterative version, as it requires a fixed amount of space for variables.\n", + "\n", + "3. 
**Iterative vs Recursive**: The above implementation is iterative, which is generally preferred for binary search due to its efficiency and avoidance of recursion limits. However, a recursive implementation can also be done:\n", + "\n", + "### Recursive Implementation\n", + "\n", + "```python\n", + "def binary_search_recursive(arr, target, low, high):\n", + " if low <= high:\n", + " mid = (low + high) // 2\n", + " \n", + " if arr[mid] == target:\n", + " return mid\n", + " elif arr[mid] > target:\n", + " return binary_search_recursive(arr, target, low, mid - 1)\n", + " else:\n", + " return binary_search_recursive(arr, target, mid + 1, high)\n", + " \n", + " return -1\n", + "```\n", + "\n", + "### Conclusion\n", + "\n", + "Binary search is a fundamental searching technique that exploits the properties of sorted arrays. Its efficiency makes it a preferred method for searching when working with large datasets. Understanding its underlying algorithm and being able to implement it in Python is a valuable skill in software engineering and data science.
\n", + "
\n", + "
\n", + "

llama3.2

\n", + "
**Implementing Binary Search Algorithm in Python**\n", + "=====================================================\n", + "\n", + "Binary search is an efficient algorithm for finding an item from a sorted list of items. It works by repeatedly dividing in half the portion of the list that could contain the item, until you've narrowed down the possible locations to just one.\n", + "\n", + "Here's a step-by-step guide on how to implement binary search in Python:\n", + "\n", + "**Step 1: Define the Search Function**\n", + "-------------------------------------\n", + "\n", + "```python\n", + "def binary_search(arr, target):\n", + " \"\"\"\n", + " Searches for an element in a sorted array using binary search algorithm.\n", + " \n", + " Parameters:\n", + " arr (list): The sorted list of elements.\n", + " target: The element to be searched.\n", + " \n", + " Returns:\n", + " int: The index of the target element if found; otherwise, -1.\n", + " \"\"\"\n", + "```\n", + "\n", + "**Step 2: Initialize Variables**\n", + "---------------------------------\n", + "\n", + "```python\n", + " low = 0 # Index of the first element in the list\n", + " high = len(arr) - 1 # Index of the last element in the list\n", + "```\n", + "\n", + "**Step 3: Loop Until Found or Not Found**\n", + "-----------------------------------------\n", + "\n", + "```python\n", + " while low <= high:\n", + " mid = (low + high) // 2 # Calculate the middle index\n", + " \n", + " if arr[mid] == target:\n", + " return mid # Target found, return its index\n", + " \n", + " elif arr[mid] < target:\n", + " low = mid + 1 # Search in the right half\n", + " \n", + " else:\n", + " high = mid - 1 # Search in the left half\n", + "```\n", + "\n", + "**Step 4: Handle Edge Cases**\n", + "---------------------------\n", + "\n", + "```python\n", + " if low > high:\n", + " return -1 # Target not found, return -1\n", + "```\n", + "\n", + "**Putting it all Together**\n", + "-----------------------------\n", + "\n", + "Here's the complete 
binary search implementation in Python:\n", + "\n", + "```python\n", + "def binary_search(arr, target):\n", + " \"\"\"\n", + " Searches for an element in a sorted array using binary search algorithm.\n", + " \n", + " Parameters:\n", + " arr (list): The sorted list of elements.\n", + " target: The element to be searched.\n", + " \n", + " Returns:\n", + " int: The index of the target element if found; otherwise, -1.\n", + " \"\"\"\n", + " low = 0\n", + " high = len(arr) - 1\n", + "\n", + " while low <= high:\n", + " mid = (low + high) // 2\n", + " \n", + " if arr[mid] == target:\n", + " return mid\n", + " elif arr[mid] < target:\n", + " low = mid + 1\n", + " else:\n", + " high = mid - 1\n", + " \n", + " return -1\n", + "\n", + "# Example usage\n", + "arr = [2, 4, 6, 8, 10]\n", + "target = 6\n", + "index = binary_search(arr, target)\n", + "if index != -1:\n", + " print(f\"Target {target} found at index {index}\")\n", + "else:\n", + " print(\"Target not found\")\n", + "```\n", + "\n", + "**Time Complexity**\n", + "------------------\n", + "\n", + "The time complexity of binary search is O(log n), where n is the length of the input array. This makes it much faster than linear search (O(n)) for large datasets.\n", + "\n", + "I hope this explanation helps! Let me know if you have any further questions or need additional clarification.
\n", + "
\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Define a new question\n", + "new_question = \"Explain how to implement a binary search algorithm in Python.\"\n", + "\n", + "console.print(Panel(f\"[bold]New Question:[/bold]\\n{new_question}\", border_style=\"green\"))\n", + "\n", + "# Get responses for the new question\n", + "new_responses = tutor.ask(new_question)\n", + "\n", + "# Compare responses\n", + "tutor.compare_responses()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dca14276-c3d5-493a-aa1f-8dc4c23b144d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}