Merge branch 'ed-donner:main' into main
This commit is contained in:
141
community-contributions/SyntheticDataGenerator_PT.ipynb
Normal file
141
community-contributions/SyntheticDataGenerator_PT.ipynb
Normal file
@@ -0,0 +1,141 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d08b387c-53fb-46d2-b083-5eebc3c97e1b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!pip install -q --upgrade torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1+cu124 --index-url https://download.pytorch.org/whl/cu124\n",
|
||||
"!pip install -q requests bitsandbytes==0.46.0 transformers==4.48.3 accelerate==1.3.0 openai"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4f1851b2-890c-427b-8e70-b998efa04c67",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# imports\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"import requests\n",
|
||||
"from IPython.display import Markdown, display, update_display\n",
|
||||
"from openai import OpenAI\n",
|
||||
"from google.colab import drive\n",
|
||||
"from huggingface_hub import login\n",
|
||||
"from google.colab import userdata\n",
|
||||
"from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n",
|
||||
"import torch"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c2d334b5-453e-4213-8e1c-905d504d2dc1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"LLAMA = \"meta-llama/Meta-Llama-3.1-8B-Instruct\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c1b3684c-c170-45f2-a83d-7e6e2ca1e23b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"hf_token = userdata.get('HF_TOKEN')\n",
|
||||
"login(hf_token, add_to_git_credential=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8c1b6dae-3213-4d68-8fa1-d195704790dc",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"openai_api_key = userdata.get('OPENAI_API_KEY')\n",
|
||||
"openai = OpenAI(api_key=openai_api_key)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "988974c7-814c-478a-be7b-0928b0efdbab",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"system_message = \"You are an assistant that produces synthetic test data. The fields, data type of the field like numeric, date, alphanumeric etc., will be provided. Generate data considering all cases, if it is a workflow audit data then consider all touchpoint movements. Do not provide a python script to generate the data. Provide the data as a json with arrays.\"\n",
|
||||
"user_prompt = \"\"\"Create a synthetic dataset for testing. \n",
|
||||
"Column names and type - \n",
|
||||
"ID: 10 digit number\n",
|
||||
"TRACKING_ID: 13 character alphanumeric\n",
|
||||
"CASE REPORT DATE : DD-MMM-YYYY HH:MM:SS\n",
|
||||
"NOTIFICATION DATE : DD-MMM-YYYY HH:MM:SS\n",
|
||||
"IN SCOPE : (Yes/No)\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"messages = [\n",
|
||||
" {\"role\": \"system\", \"content\": system_message},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt}\n",
|
||||
" ]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "40cebc04-abf0-4c61-8b18-f98d3c1fe680",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"quant_config = BitsAndBytesConfig(\n",
|
||||
" load_in_4bit=True,\n",
|
||||
" bnb_4bit_use_double_quant=True,\n",
|
||||
" bnb_4bit_compute_dtype=torch.bfloat16,\n",
|
||||
" bnb_4bit_quant_type=\"nf4\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "710ba1af-8e12-4635-933b-00df8d2e3f9d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tokenizer = AutoTokenizer.from_pretrained(LLAMA)\n",
|
||||
"tokenizer.pad_token = tokenizer.eos_token\n",
|
||||
"inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(\"cuda\")\n",
|
||||
"streamer = TextStreamer(tokenizer)\n",
|
||||
"model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map=\"auto\", quantization_config=quant_config)\n",
|
||||
"outputs = model.generate(inputs, max_new_tokens=2000, streamer=streamer)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
103
week1/community-contributions/Day-1_email_summarizers.ipynb
Normal file
103
week1/community-contributions/Day-1_email_summarizers.ipynb
Normal file
@@ -0,0 +1,103 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d7a6bb51",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# import library\n",
|
||||
"from openai import OpenAI\n",
|
||||
"import os\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"\n",
|
||||
"# Load your API key from an .env file\n",
|
||||
"load_dotenv(override=True)\n",
|
||||
"api_key = os.getenv('OPENAI_API_KEY')\n",
|
||||
"openai = OpenAI()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7ac4cdf9",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Step 1: Create your prompts\n",
|
||||
"system_prompt = \"you are a helpful assistant that suggests an appropriate short subject line for an email based on its contents.\"\n",
|
||||
"\n",
|
||||
"user_prompt = \"\"\"\n",
|
||||
"Hi John,\n",
|
||||
"I hope this email finds you well. I wanted to follow up on our meeting last week regarding the quarterly budget proposal.\n",
|
||||
"After reviewing the numbers with my team, we've identified some areas where we can reduce costs by approximately 15% without impacting our core operations. This would involve consolidating some vendor contracts and optimizing our software licensing.\n",
|
||||
"Could we schedule a meeting next week to discuss these findings in detail? I'm available Tuesday through Thursday afternoon.\n",
|
||||
"Looking forward to hearing from you.\n",
|
||||
"\n",
|
||||
"Best regards,\n",
|
||||
"Sarah\n",
|
||||
"\"\"\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a77ca09e",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Step 2: Make the messages list\n",
|
||||
"messages = [\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt}\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8404f0fe",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Step 3: Call OpenAI\n",
|
||||
"response = openai.chat.completions.create(model=\"gpt-4o-mini\", messages=messages)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7a4875f7",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Step 4: Print the result\n",
|
||||
"print(response.choices[0].message.content)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
290
week1/community-contributions/Day-2_exercise_with_ollama3.ipynb
Normal file
290
week1/community-contributions/Day-2_exercise_with_ollama3.ipynb
Normal file
@@ -0,0 +1,290 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "135717e7",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# imports\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"import requests\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"from IPython.display import Markdown, display\n",
|
||||
"from openai import OpenAI\n",
|
||||
"import ollama"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "29a9e634",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# OPTION 1\n",
|
||||
"# using openai\n",
|
||||
"\n",
|
||||
"# message = \"Hello, GPT! This is my first ever message to you! Hi!\"\n",
|
||||
"# client = OpenAI(base_url=\"http://localhost:11434/v1\", api_key=\"not-needed\")\n",
|
||||
"# response = openai.chat.completions.create(model=`<name of model>`, messages=[{\"role\":\"user\", \"content\":message}])\n",
|
||||
"# print(response.choices[0].message.content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "306993ed",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# OPTION 2\n",
|
||||
"# using Ollama\n",
|
||||
"\n",
|
||||
"message = \"Hello, GPT! This is my first ever message to you! Hi!\"\n",
|
||||
"model=\"llama3\"\n",
|
||||
"response=ollama.chat(model=model,messages=[{\"role\":\"user\",\"content\":message}])\n",
|
||||
"print(response[\"message\"][\"content\"])\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "856f767b",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A class to represent a Webpage\n",
|
||||
"# If you're not familiar with Classes, check out the \"Intermediate Python\" notebook\n",
|
||||
"\n",
|
||||
"# Some websites need you to use proper headers when fetching them:\n",
|
||||
"headers = {\n",
|
||||
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"class Website:\n",
|
||||
"\n",
|
||||
" def __init__(self, url):\n",
|
||||
" \"\"\"\n",
|
||||
" Create this Website object from the given url using the BeautifulSoup library\n",
|
||||
" \"\"\"\n",
|
||||
" self.url = url\n",
|
||||
" response = requests.get(url, headers=headers)\n",
|
||||
" soup = BeautifulSoup(response.content, 'html.parser')\n",
|
||||
" self.title = soup.title.string if soup.title else \"No title found\"\n",
|
||||
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
|
||||
" irrelevant.decompose()\n",
|
||||
" self.text = soup.body.get_text(separator=\"\\n\", strip=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "4ce558dc",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Let's try one out. Change the website and add print statements to follow along.\n",
|
||||
"\n",
|
||||
"ed = Website(\"https://edwarddonner.com\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "5e3956f8",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Define our system prompt - you can experiment with this later, changing the last sentence to 'Respond in markdown in Spanish.\"\n",
|
||||
"\n",
|
||||
"system_prompt = \"You are an assistant that analyzes the contents of a website \\\n",
|
||||
"and provides a short summary, ignoring text that might be navigation related. \\\n",
|
||||
"Respond in markdown.\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "99d791b4",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A function that writes a User Prompt that asks for summaries of websites:\n",
|
||||
"\n",
|
||||
"def user_prompt_for(website):\n",
|
||||
" user_prompt = f\"You are looking at a website titled {website.title}\"\n",
|
||||
" user_prompt += \"\\nThe contents of this website is as follows; \\\n",
|
||||
"please provide a short summary of this website in markdown. \\\n",
|
||||
"If it includes news or announcements, then summarize these too.\\n\\n\"\n",
|
||||
" user_prompt += website.text\n",
|
||||
" return user_prompt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "5d89b748",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# See how this function creates exactly the format above\n",
|
||||
"\n",
|
||||
"def messages_for(website):\n",
|
||||
" return [\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt_for(website)}\n",
|
||||
" ]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "9a97d3e2",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# And now: call the OpenAI API. You will get very familiar with this!\n",
|
||||
"\n",
|
||||
"def summarize(url):\n",
|
||||
" website = Website(url)\n",
|
||||
" response=ollama.chat(model=model,messages=messages_for(website))\n",
|
||||
" return(response[\"message\"][\"content\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ec13fe0a",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"summarize(\"https://edwarddonner.com\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "e3ade092",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A function to display this nicely in the Jupyter output, using markdown\n",
|
||||
"\n",
|
||||
"def display_summary(url):\n",
|
||||
" summary = summarize(url)\n",
|
||||
" display(Markdown(summary))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "be2d49e6",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"display_summary(\"https://edwarddonner.com\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1ccbf33b",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"display_summary(\"https://cnn.com\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ae3d0eae",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"display_summary(\"https://anthropic.com\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,115 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4e2a9393-7767-488e-a8bf-27c12dca35bd",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# imports\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from openai import OpenAI\n",
|
||||
"\n",
|
||||
"# If you get an error running this cell, then please head over to the troubleshooting notebook!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7b87cadb-d513-4303-baee-a37b6f938e4d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load environment variables in a file called .env\n",
|
||||
"\n",
|
||||
"load_dotenv(override=True)\n",
|
||||
"api_key = os.getenv('OPENAI_API_KEY')\n",
|
||||
"\n",
|
||||
"# Check the key\n",
|
||||
"\n",
|
||||
"if not api_key:\n",
|
||||
" print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n",
|
||||
"elif not api_key.startswith(\"sk-proj-\"):\n",
|
||||
" print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook\")\n",
|
||||
"elif api_key.strip() != api_key:\n",
|
||||
" print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n",
|
||||
"else:\n",
|
||||
" print(\"API key found and looks good so far!\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "019974d9-f3ad-4a8a-b5f9-0a3719aea2d3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"openai = OpenAI()\n",
|
||||
"\n",
|
||||
"# If this doesn't work, try Kernel menu >> Restart Kernel and Clear Outputs Of All Cells, then run the cells from the top of this notebook down.\n",
|
||||
"# If it STILL doesn't work (horrors!) then please see the Troubleshooting notebook in this folder for full instructions"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "00743dac-0e70-45b7-879a-d7293a6f68a6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Step 1: Create your prompts\n",
|
||||
"\n",
|
||||
"system_prompt = \"Eres un analista acostumbrado a trabajar con correos electrónicos que contiene un gran conocimiento sobre la mejor manera de resumir contenido releveante \\\n",
|
||||
"dejando de lado cualquier información que no despierte interés o no sea el tema principal del correo. Tu función será leer contenido de correos y definir un listado de las 3 mejores opciones con el formato: Opción *numero de la opción*: *sujeto* Motivo: *que palabras clave dentro del texto has utilizado para llegar a esa conclusion y la relación semántica con tu idea\"\n",
|
||||
"user_prompt = \"\"\"\n",
|
||||
"Tengo un correo que le quiero enviar a mi profesor pero no se muy bien como llamarlo, ayudame. El correo es el siguiente:\n",
|
||||
"Hola profe,\n",
|
||||
"Ultimamente estoy disfrutando mucho sus clases y la información que presenta me parece muy importante. Este fin de semana me voy de vacaciones y no podré\n",
|
||||
"ir a sus clases la semana que viene. Me gustaría si pudiera pasarme los pdfs de la siguiente semana para echarle un vistazo por mi cuenta durante mi ausencia en Francia.\n",
|
||||
"\n",
|
||||
"Un saludo,\n",
|
||||
"Daniel.\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"# Step 2: Make the messages list\n",
|
||||
"\n",
|
||||
"messages = [{\"role\" : \"system\" , \"content\": system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt}]\n",
|
||||
"\n",
|
||||
"# Step 3: Call OpenAI\n",
|
||||
"\n",
|
||||
"response = openai.chat.completions.create( \n",
|
||||
" model = \"gpt-4o-mini\",\n",
|
||||
" messages = messages)\n",
|
||||
"\n",
|
||||
"# Step 4: print the result\n",
|
||||
"\n",
|
||||
"print(response.choices[0].message.content)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,260 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2588fbba",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Website Analysis and Summarization with Selenium and OpenAI\n",
|
||||
"\n",
|
||||
"> This notebook demonstrates how to extract and summarize the main content of any website using Selenium for dynamic extraction and OpenAI for generating concise summaries in Mexican Spanish.\n",
|
||||
"\n",
|
||||
"## Overview\n",
|
||||
"This notebook provides a workflow to automatically analyze websites, extract relevant text, and generate a short summary using a language model. Navigation elements are ignored, focusing on news, announcements, and main content.\n",
|
||||
"\n",
|
||||
"## Features\n",
|
||||
"- Extracts relevant text from web pages using Selenium and BeautifulSoup.\n",
|
||||
"- Generates automatic summaries using OpenAI's language models.\n",
|
||||
"- Presents results in markdown format.\n",
|
||||
"\n",
|
||||
"## Requirements\n",
|
||||
"- Python 3.8+\n",
|
||||
"- Google Chrome browser installed\n",
|
||||
"- The following Python packages:\n",
|
||||
" - selenium\n",
|
||||
" - webdriver-manager\n",
|
||||
" - beautifulsoup4\n",
|
||||
" - openai\n",
|
||||
" - python-dotenv\n",
|
||||
" - requests\n",
|
||||
"- An OpenAI API key (project key, starting with `sk-proj-`)\n",
|
||||
"- Internet connection\n",
|
||||
"\n",
|
||||
"## How to Use\n",
|
||||
"1. Install the required packages:\n",
|
||||
" ```bash\n",
|
||||
" pip install selenium webdriver-manager undetected-chromedriver beautifulsoup4 openai python-dotenv requests\n",
|
||||
" ```\n",
|
||||
"2. Add your OpenAI API key to a `.env` file as `OPENAI_API_KEY`.\n",
|
||||
"3. Run the notebook cells in order. You can change the target website URL in the code to analyze different sites.\n",
|
||||
"4. The summary will be displayed in markdown format below the code cell.\n",
|
||||
"\n",
|
||||
"**Note:** Some websites may block automated access. The notebook includes options to simulate a real user and avoid bot detection, but results may vary depending on the site's protections.\n",
|
||||
"\n",
|
||||
"---"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "dc7c2ade",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Imports\n",
|
||||
"import os\n",
|
||||
"import requests\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"from IPython.display import Markdown, display\n",
|
||||
"from openai import OpenAI\n",
|
||||
"\n",
|
||||
"from selenium import webdriver\n",
|
||||
"from selenium.webdriver.chrome.service import Service\n",
|
||||
"from selenium.webdriver.common.by import By\n",
|
||||
"from selenium.webdriver.chrome.options import Options\n",
|
||||
"from selenium.webdriver.support.ui import WebDriverWait\n",
|
||||
"from selenium.webdriver.support import expected_conditions as EC\n",
|
||||
"from webdriver_manager.chrome import ChromeDriverManager\n",
|
||||
"import undetected_chromedriver as uc"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a2d21987",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load the environment variables from .env\n",
|
||||
"load_dotenv(override=True)\n",
|
||||
"api_key = os.getenv('OPENAI_API_KEY')\n",
|
||||
"\n",
|
||||
"# Check the key\n",
|
||||
"\n",
|
||||
"if not api_key:\n",
|
||||
" print(\"No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!\")\n",
|
||||
"elif not api_key.startswith(\"sk-proj-\"):\n",
|
||||
" print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook\")\n",
|
||||
"elif api_key.strip() != api_key:\n",
|
||||
" print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook\")\n",
|
||||
"else:\n",
|
||||
" print(\"API key found and looks good so far!\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "bbb3a8ed",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"openai = OpenAI()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5313aa64",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class Website:\n",
|
||||
" def __init__(self, url, headless=True, wait_time=10):\n",
|
||||
" self.url = url # Website URL to analyze\n",
|
||||
" self.title = None # Title of the website\n",
|
||||
" self.text = None # Extracted text from the website\n",
|
||||
" \n",
|
||||
" # Chrome options configuration for Selenium\n",
|
||||
" options = Options()\n",
|
||||
" if headless:\n",
|
||||
" options.add_argument(\"--headless=new\") # Run Chrome in headless mode (no window)\n",
|
||||
" options.add_argument(\"--disable-gpu\") # Disable GPU acceleration\n",
|
||||
" options.add_argument(\"--no-sandbox\") # Disable Chrome sandbox (required for some environments)\n",
|
||||
" options.add_argument(\"--window-size=1920,1080\") # Set window size to simulate a real user\n",
|
||||
" # Simulate a real user-agent to avoid bot detection\n",
|
||||
" options.add_argument(\"user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36\")\n",
|
||||
" \n",
|
||||
" # Initialize Chrome WebDriver\n",
|
||||
" self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)\n",
|
||||
" self.driver.get(url) # Open the URL in the browser\n",
|
||||
" \n",
|
||||
" try:\n",
|
||||
" # Wait until the <body> element is present in the page\n",
|
||||
" WebDriverWait(self.driver, wait_time).until(EC.presence_of_element_located((By.TAG_NAME, \"body\")))\n",
|
||||
" html = self.driver.page_source # Get the full HTML of the page\n",
|
||||
" soup = BeautifulSoup(html, 'html.parser') # Parse HTML with BeautifulSoup\n",
|
||||
" self.title = soup.title.string if soup.title else 'No title found' # Extract the title\n",
|
||||
" if soup.body:\n",
|
||||
" # Remove irrelevant elements from the body\n",
|
||||
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
|
||||
" irrelevant.decompose()\n",
|
||||
" # Extract clean text from the body\n",
|
||||
" self.text = soup.body.get_text(separator='\\n', strip=True)\n",
|
||||
" else:\n",
|
||||
" self.text = \"No body found\" # If no body is found, indicate it\n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\"Error accessing the site: {e}\") # Print error to console\n",
|
||||
" self.text = \"Error accessing the site\" # Store error in the attribute\n",
|
||||
" finally:\n",
|
||||
" self.driver.quit() # Always close the browser, whether or not an error occurred"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e902c6b2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"system_prompt = \"You are an assistant that analyzes the contents of a website \\\n",
|
||||
"and provides a short summary, ignoring text that might be navigation related. \\\n",
|
||||
"Respond in markdown in Mexican Spanish.\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "eaee8f36",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A function that writes a User Prompt that asks for summaries of websites:\n",
|
||||
"\n",
|
||||
"def user_prompt_for(website):\n",
|
||||
" user_prompt = f\"You are looking at a website titled {website.title}\"\n",
|
||||
" user_prompt += \"\\nThe contents of this website is as follows; \\\n",
|
||||
"please provide a short summary of this website in markdown. \\\n",
|
||||
"If it includes news or announcements, then summarize these too.\\n\\n\"\n",
|
||||
" user_prompt += website.text\n",
|
||||
" return user_prompt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9ac4ed8b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Creates messages for the OpenAI API\n",
|
||||
"def messages_for(website):\n",
|
||||
" return [\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt_for(website)}\n",
|
||||
" ]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1536d537",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Creates a summary for the given URL\n",
|
||||
"def summarize(url):\n",
|
||||
" website = Website(url)\n",
|
||||
" response = openai.chat.completions.create(\n",
|
||||
" model = \"gpt-4o-mini\",\n",
|
||||
" messages = messages_for(website)\n",
|
||||
" )\n",
|
||||
" return response.choices[0].message.content"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "fe135339",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Shows the summary for the given URL\n",
|
||||
"def display_summary(url):\n",
|
||||
" summary = summarize(url)\n",
|
||||
" display(Markdown(summary))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a301ab4e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"display_summary(\"https://openai.com/\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,211 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d955d75d-4970-48fe-983e-a2a850cecfc5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import requests\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"from IPython.display import Markdown, display\n",
|
||||
"from openai import OpenAI\n",
|
||||
"\n",
|
||||
"import PyPDF2\n",
|
||||
"from selenium import webdriver\n",
|
||||
"from selenium.webdriver.chrome.options import Options\n",
|
||||
"from selenium.webdriver.chrome.service import Service\n",
|
||||
"from webdriver_manager.chrome import ChromeDriverManager\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"import time"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6e1e5dd3-f91a-466b-8fd4-2dbf4eedf101",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"load_dotenv(override = True)\n",
|
||||
"api_key = os.getenv(\"OPENAI_API_KEY\")\n",
|
||||
"\n",
|
||||
"if not api_key:\n",
|
||||
" print(\"No API key\")\n",
|
||||
"elif not api_key.startswith(\"sk-proj-\"):\n",
|
||||
" print(\"API key doesn't look correct, check it\")\n",
|
||||
"elif api_key.strip() != api_key:\n",
|
||||
" print(\"It looks like API key has an extra space - check it\")\n",
|
||||
"else:\n",
|
||||
" print(\"API key looks good, moving on!\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "67a6e583-1ef7-4b77-8886-c0e8c619933c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"openai = OpenAI()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "34a07806-dd68-4a86-8b6e-e1b2aaf0daa1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# path to the CV\n",
|
||||
"path = \"/Users/yanasklar/Documents/For applying/CV/СV_YanaSklyar_c.pdf\"\n",
|
||||
"headers = {\n",
|
||||
" \"User-Agent\": \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36\"\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"class Vacancy:\n",
|
||||
" def __init__(self, url, instructions = \"\"):\n",
|
||||
" self.url = url\n",
|
||||
" \n",
|
||||
" # configure Chrome settings\n",
|
||||
" options = Options()\n",
|
||||
" # options.add_argument(\"--headless\") \n",
|
||||
" \"\"\"\n",
|
||||
" Headless mode runs the browser in the background (invisible).\n",
|
||||
" However, some websites (like openai.com) block headless browsers.\n",
|
||||
" So if this line is active, the page may not load correctly and you may not get the full content.\n",
|
||||
" \"\"\"\n",
|
||||
" options.add_argument(\"--disable-gpu\")\n",
|
||||
" options.add_argument(\"--no-sandbox\")\n",
|
||||
" options.add_argument(\"--window-size=1920x1080\")\n",
|
||||
"\n",
|
||||
" # use webdriver-manager to manage ChromeDriver\n",
|
||||
" service = Service(ChromeDriverManager().install())\n",
|
||||
" driver = webdriver.Chrome(service=service, options=options)\n",
|
||||
" driver.get(url)\n",
|
||||
" time.sleep(3) # let the page load\n",
|
||||
"\n",
|
||||
" # take the source of the page\n",
|
||||
" page_source = driver.page_source\n",
|
||||
" driver.quit()\n",
|
||||
"\n",
|
||||
" # analyse with BeautifulSoup\n",
|
||||
" soup = BeautifulSoup(page_source, 'html.parser')\n",
|
||||
"\n",
|
||||
" self.title = soup.title.string if soup.title else \"No title found\"\n",
|
||||
" for irrelevant in soup.body([\"img\", \"script\", \"style\", \"input\"]):\n",
|
||||
" irrelevant.decompose()\n",
|
||||
" self.text = soup.body.get_text(separator='\\n', strip=True)\n",
|
||||
"\n",
|
||||
" # read CV\n",
|
||||
" with open(path, 'rb') as f:\n",
|
||||
" reader = PyPDF2.PdfReader(f)\n",
|
||||
" cv_text = \"\"\n",
|
||||
" for page in reader.pages:\n",
|
||||
" text = page.extract_text()\n",
|
||||
" if text:\n",
|
||||
" cv_text += text + \"\\n\"\n",
|
||||
" self.cv_text = cv_text\n",
|
||||
"\n",
|
||||
" # summarise and print the description of the job\n",
|
||||
" message = f\"\"\"Here is the content of a webpage: {self.text}.\n",
|
||||
" Find job description on that page,\n",
|
||||
" summarise it, include the list requirements and other important details.\n",
|
||||
" \"\"\"\n",
|
||||
" messages = [{\"role\":\"user\", \"content\":message}]\n",
|
||||
" response = openai.chat.completions.create(model='gpt-4o-mini', messages = messages)\n",
|
||||
" print(\"The job description: \", response.choices[0].message.content)\n",
|
||||
"\n",
|
||||
" # create prompts\n",
|
||||
" self.system_prompt = \"\"\"You are a career assistant specializing in writing cover letter.\n",
|
||||
" Your tasks:\n",
|
||||
" 1. Read the candidate's CV (provided as text).\n",
|
||||
" 2. Read the job description (provided from a webpage).\n",
|
||||
" 3. Write a concise and compelling cover letter, that:\n",
|
||||
" - Hightlights the most relevant experience and skills from the CV,\n",
|
||||
" - Aligns directly wit the requirements in the job description,\n",
|
||||
" - Adapts to cultural and professional norms in Israel.\n",
|
||||
" The letter should be no longer than half a page, persuasive and tailored to make the applicant stand out.\n",
|
||||
" \"\"\"\n",
|
||||
"\n",
|
||||
" user_prompt = f\"\"\"\n",
|
||||
" Here is my CV:\n",
|
||||
" {self.cv_text}\n",
|
||||
" \n",
|
||||
" The job vacancy is from the website {self.title}.\n",
|
||||
" Here is the decription of the vacancy:\n",
|
||||
" {self.text}\n",
|
||||
" Please write a cover letter that connects my background to this vacancy.\n",
|
||||
" Make it persuasive and suitable for Israeli job market.\n",
|
||||
" \"\"\"\n",
|
||||
" \n",
|
||||
" if instructions:\n",
|
||||
" user_prompt += f\"Additional instructions: {instructions}\"\n",
|
||||
" self.user_prompt = user_prompt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9160b9f5-177b-4477-8e54-3a212f275a22",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def cover_letter(url, instructions = \"\"):\n",
|
||||
" vacancy = Vacancy(url, instructions)\n",
|
||||
" messages = [\n",
|
||||
" {\"role\":\"system\", \"content\":vacancy.system_prompt},\n",
|
||||
" {\"role\":\"user\", \"content\":vacancy.user_prompt}\n",
|
||||
" ]\n",
|
||||
" response = openai.chat.completions.create(model='gpt-4o-mini', messages=messages)\n",
|
||||
" if not response:\n",
|
||||
" print(\"smt went wrong\")\n",
|
||||
" print(response.choices[0].message.content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1de4b55c-a8da-445f-9865-c7a8bafdbc3c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"a = \"https://www.linkedin.com/jobs/view/4285898438/?alternateChannel=search&eBP=CwEAAAGY3R5LOabDLOVTy6xvBcSlWyAkIXQz8IRkSM3rgsqTPtvcEvUSnq980O7oLV2Hh_ldTpc2cBBmRq1IRnLtp7TzEcUvndFEXeCuviA5yo7oFYfW7KoEp4SPNzmf3D9LtnSgk9Iudy3skk6n3hVOtyDpx8Zm0AiTWPvdwCaZ_w5Xu8lAG797NRNDco71ynm99LmCOC9Go7DdDQ2eLewamc4SOsA4xWcXy0GmZVy3kBF1AprK3ylAYR2wrm5-hp4lRpbbfUxXjkEOG6H_GbPpKtN-N8mYnMd9w_cej5qQmTFX86gqSi6HuXFtK0h46TbOS5r-YQksVd1Yb4kYZnDznWXPLbxp04xVJSPzsHoa05wQdOfZ2UUSoMTJmic3n3qfV2u9Bp8n4sLYtINpzKdvm4eADGGkN-nR3O2oPeas9XjGbBwNdjXHAcX_PJoRwlFdQ1gVkYQEF1T7qAfXUJoUt-fv4oLxGnIgV6yJuMgw&refId=9NA7Bvt%2FhCqDkFNRGu1dPA%3D%3D&trackingId=W11hvpcIjHA%2FjU%2FFZ%2B1uAA%3D%3D\"\n",
|
||||
"b = \"The style of the cover letter should informal, as if i talked to a friend about my background\"\n",
|
||||
"cover_letter(a, b)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0feb3cbe-686a-4a97-9ca3-a0cb32a24c5d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python (llms)",
|
||||
"language": "python",
|
||||
"name": "llms"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,123 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3ba06289-d17a-4ccd-85f5-2b79956d4e59",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!pip install selenium"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "935fe7b1-1807-4f75-863d-4c118e425a19",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pip show selenium"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "eabbbc62-1de1-4883-9b3e-9c90145ea6c5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from selenium import webdriver\n",
|
||||
"from selenium.webdriver.edge.options import Options as EdgeOptions # Import EdgeOptions\n",
|
||||
"from selenium.webdriver.edge.service import Service as EdgeService # Import EdgeService\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"import time\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"class Website:\n",
|
||||
" def __init__(self, url, driver_path=None, wait_time=3):\n",
|
||||
" self.url = url\n",
|
||||
" self.wait_time = wait_time\n",
|
||||
"\n",
|
||||
" # Headless Edge settings\n",
|
||||
" options = EdgeOptions() # Use EdgeOptions\n",
|
||||
" # options.add_argument(\"--headless\")\n",
|
||||
" options.add_argument(\"--disable-gpu\")\n",
|
||||
" options.add_argument(\"--no-sandbox\")\n",
|
||||
" options.add_argument(\"--window-size=1920x1080\")\n",
|
||||
"\n",
|
||||
" # Driver path\n",
|
||||
" if driver_path:\n",
|
||||
" # For Edge, you might need to specify the path to msedgedriver\n",
|
||||
" # For driver download, https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/?form=MA13LH#downloads\n",
|
||||
" service = EdgeService(executable_path=driver_path) # Use EdgeService\n",
|
||||
" else:\n",
|
||||
" # If msedgedriver.exe is in your system's PATH, you can omit executable_path\n",
|
||||
" service = EdgeService()\n",
|
||||
"\n",
|
||||
" # Start browser\n",
|
||||
" # Use webdriver.Edge() for Microsoft Edge\n",
|
||||
" driver = webdriver.Edge(service=service, options=options)\n",
|
||||
" driver.get(url)\n",
|
||||
"\n",
|
||||
" # Wait for the loading page\n",
|
||||
" time.sleep(self.wait_time)\n",
|
||||
"\n",
|
||||
" # Take page source\n",
|
||||
" html = driver.page_source\n",
|
||||
" driver.quit()\n",
|
||||
"\n",
|
||||
" # Analysis with BeautifulSoup \n",
|
||||
" soup = BeautifulSoup(html, 'html.parser')\n",
|
||||
" self.title = soup.title.string if soup.title else \"No title found\"\n",
|
||||
"\n",
|
||||
" # Clean irrelevant tags\n",
|
||||
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
|
||||
" irrelevant.decompose()\n",
|
||||
"\n",
|
||||
" self.text = soup.body.get_text(separator=\"\\n\", strip=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "852c52e2-bd4d-4bb9-94ef-e498c33f1a89",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"site = Website(\"https://openai.com\", driver_path=\"/Users/klee/Documents/edgedriver_mac64_m1/msedgedriver\")\n",
|
||||
"print(\"Title:\", site.title)\n",
|
||||
"print(\"\\nFirst 500 character:\\n\", site.text[:500])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7620c685-c35c-4d6b-aaf1-a3da98f19ca7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
319
week1/community-contributions/day2_exercise_using_input.ipynb
Normal file
319
week1/community-contributions/day2_exercise_using_input.ipynb
Normal file
@@ -0,0 +1,319 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d15d8294-3328-4e07-ad16-8a03e9bbfdb9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Welcome to your first assignment!\n",
|
||||
"\n",
|
||||
"Instructions are below. Please give this a try, and look in the solutions folder if you get stuck (or feel free to ask me!)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4e2a9393-7767-488e-a8bf-27c12dca35bd",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# imports\n",
|
||||
"\n",
|
||||
"import requests\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"from IPython.display import Markdown, display"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "29ddd15d-a3c5-4f4e-a678-873f56162724",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Constants\n",
|
||||
"\n",
|
||||
"OLLAMA_API = \"http://localhost:11434/api/chat\"\n",
|
||||
"HEADERS = {\"Content-Type\": \"application/json\"}\n",
|
||||
"MODEL = \"llama3.2\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "dac0a679-599c-441f-9bf2-ddc73d35b940",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Create a messages list using the same format that we used for OpenAI\n",
|
||||
"\n",
|
||||
"messages = [\n",
|
||||
" {\"role\": \"user\", \"content\": \"Describe some of the business applications of Generative AI\"}\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7bb9c624-14f0-4945-a719-8ddb64f66f47",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"payload = {\n",
|
||||
" \"model\": MODEL,\n",
|
||||
" \"messages\": messages,\n",
|
||||
" \"stream\": False\n",
|
||||
" }"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7745b9c4-57dc-4867-9180-61fa5db55eb8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import ollama\n",
|
||||
"\n",
|
||||
"response = ollama.chat(model=MODEL, messages=messages)\n",
|
||||
"print(response['message']['content'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a4704e10-f5fb-4c15-a935-f046c06fb13d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Alternative approach - using OpenAI python library to connect to Ollama"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "23057e00-b6fc-4678-93a9-6b31cb704bff",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# There's actually an alternative approach that some people might prefer\n",
|
||||
"# You can use the OpenAI client python library to call Ollama:\n",
|
||||
"\n",
|
||||
"from openai import OpenAI\n",
|
||||
"ollama_via_openai = OpenAI(base_url='http://localhost:11434/v1', api_key='ollama')\n",
|
||||
"\n",
|
||||
"response = ollama_via_openai.chat.completions.create(\n",
|
||||
" model=MODEL,\n",
|
||||
" messages=messages\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(response.choices[0].message.content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1622d9bb-5c68-4d4e-9ca4-b492c751f898",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# NOW the exercise for you\n",
|
||||
"\n",
|
||||
"Take the code from day1 and incorporate it here, to build a website summarizer that uses Llama 3.2 running locally instead of OpenAI; use either of the above approaches."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "0c1f84c4-4cc0-4085-8ea5-871a8ca46a47",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# imports\n",
|
||||
"\n",
|
||||
"import ollama"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "890852ab-2cd4-41dc-b168-6bd1360b967a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"MODEL = \"llama3.2\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"id": "6de38216-6d1c-48c4-877b-86d403f4e0f8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A class to represent a Webpage\n",
|
||||
"\n",
|
||||
"# Some websites need you to use proper headers when fetching them:\n",
|
||||
"headers = {\n",
|
||||
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"class Website:\n",
|
||||
"\n",
|
||||
" def __init__(self, url):\n",
|
||||
" \"\"\"\n",
|
||||
" Create this Website object from the given url using the BeautifulSoup library\n",
|
||||
" \"\"\"\n",
|
||||
" self.url = url\n",
|
||||
" response = requests.get(url, headers=headers)\n",
|
||||
" soup = BeautifulSoup(response.content, 'html.parser')\n",
|
||||
" self.title = soup.title.string if soup.title else \"No title found\"\n",
|
||||
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
|
||||
" irrelevant.decompose()\n",
|
||||
" self.text = soup.body.get_text(separator=\"\\n\", strip=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"id": "9d398f9a-c66e-42b5-91b4-5417944b8408",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def user_prompt_generator(website) -> str:\n",
|
||||
" user_prompt = f\"You will act as a website summarizer with knowledge of Web Content Accessibility Guidelines. You will look into the web: {website.title} and \"\n",
|
||||
" user_prompt += \"break down the relevant information about it in this categories: What is the website about, \\\n",
|
||||
" to whom the website belongs and what practises should improve to have a better user experience. \\n\\n\"\n",
|
||||
" user_prompt += website.text\n",
|
||||
"\n",
|
||||
" return user_prompt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"id": "156d7c67-b714-4156-9f69-faf0c50aaf13",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def messages_generator(user_prompt : str) -> list[dict[str, str]]:\n",
|
||||
" messages = [{\"role\" : \"user\", \"content\" : user_prompt}]\n",
|
||||
"\n",
|
||||
" return messages"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"id": "f07c4143-6cc5-4d28-846c-a373564e9264",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def user_request_reader() -> str:\n",
|
||||
" while True:\n",
|
||||
" website_url = input(\"Define what website you want to summarize by giving the url: \")\n",
|
||||
" if website_url.lower().startswith(\"http\"):\n",
|
||||
" return website_url\n",
|
||||
" print(\"URL not valid. Please provide a full url.\\n\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"id": "94933255-2ca8-40b5-8f74-865d3e781058",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def summarizer_bot():\n",
|
||||
" website_url = user_request_reader()\n",
|
||||
" website = Website(website_url)\n",
|
||||
" \n",
|
||||
" user_prompt = user_prompt_generator(website)\n",
|
||||
" messages = messages_generator(user_prompt)\n",
|
||||
"\n",
|
||||
" response = ollama.chat(model=MODEL, messages=messages)\n",
|
||||
" print(response['message']['content'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"id": "2d81faa4-25b3-4d5d-8f36-93772e449b5c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdin",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Define what website you want to summarize by giving the url: test.com\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"URL not valid. Please provide a full url.\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdin",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Define what website you want to summarize by giving the url: https://edwarddonner.com\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"**Summary:**\n",
|
||||
"\n",
|
||||
"The website \"Home - Edward Donner\" belongs to Edward Donner, a co-founder and CTO of Nebula.io, an AI startup. The website is about Edward's interests in writing code, experimenting with Large Language Models (LLMs), and DJing, as well as his work in applying AI to help people discover their potential.\n",
|
||||
"\n",
|
||||
"**Categories:**\n",
|
||||
"\n",
|
||||
"### What is the website about?\n",
|
||||
"\n",
|
||||
"The website is primarily about Edward Donner's personal brand, showcasing his expertise in AI and LLMs. It includes information about his work at Nebula.io, which applies AI to talent management. The website also features a \"Connect Four\" arena where LLMs compete against each other, as well as sections for learning more about LLMs and staying up-to-date with Edward's courses and publications.\n",
|
||||
"\n",
|
||||
"### To whom does the website belong?\n",
|
||||
"\n",
|
||||
"The website belongs to Edward Donner, a co-founder and CTO of Nebula.io. It appears to be a personal website or blog, showcasing his expertise and interests in AI and LLMs.\n",
|
||||
"\n",
|
||||
"### Practices to improve for better user experience:\n",
|
||||
"\n",
|
||||
"1. **Clearer navigation**: The website's menu is simple but not intuitive. Adding clear categories or sections would help users quickly find the information they're looking for.\n",
|
||||
"2. **More detailed about section**: The \"About\" section provides a brief overview of Edward's work and interests, but it could be more detailed and comprehensive.\n",
|
||||
"3. **Improved accessibility**: While the website is likely following general web accessibility guidelines, there are no clear indications of this on the page. Adding alt text to images, providing a clear font size and color scheme, and ensuring sufficient contrast between background and foreground would improve the user experience for people with disabilities.\n",
|
||||
"4. **Better calls-to-action (CTAs)**: The website could benefit from more prominent CTAs, guiding users towards specific actions such as signing up for courses or following Edward on social media.\n",
|
||||
"5. **SEO optimization**: The website's content and meta tags appear to be optimized for search engines, but a more thorough SEO analysis would help identify areas for improvement.\n",
|
||||
"\n",
|
||||
"Overall, the website provides a clear overview of Edward Donner's interests and expertise in AI and LLMs, but could benefit from some tweaks to improve accessibility, navigation, and CTAs.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# The call\n",
|
||||
"summarizer_bot()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,329 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9ab446e4-219c-4589-aa8f-9386adcf5c60",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## Project Overview\n",
|
||||
"This project combines web scraping with OpenAI’s GPT models to summarize online training content. It extracts material from Microsoft’s **Quantum Computing Fundamentals** learning path, cleans it, and generates concise summaries per lesson as well as an overall course summary. \n",
|
||||
"\n",
|
||||
"## Key Features\n",
|
||||
"- Fetches and parses webpages using **requests** and **BeautifulSoup** \n",
|
||||
"- Produces summaries in multiple languages (e.g., English, Spanish, or any language) and at varying levels of detail (short, medium, detailed) \n",
|
||||
"- Summarizes individual lessons on demand or processes entire learning paths \n",
|
||||
"- Presents results as clean, structured **Markdown** directly in the notebook \n",
|
||||
"\n",
|
||||
"## Tech Stack\n",
|
||||
"- **Model**: GPT-4o-mini \n",
|
||||
"- **Language**: Python \n",
|
||||
"- **Libraries**: BeautifulSoup, OpenAI \n",
|
||||
"\n",
|
||||
"## Purpose\n",
|
||||
"This project demonstrates how AI can streamline the understanding of technical documentation and online courses by generating multilingual, customizable summaries. \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4e2a9393-7767-488e-a8bf-27c12dca35bd",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# imports\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"import requests\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"from IPython.display import Markdown, display\n",
|
||||
"from openai import OpenAI\n",
|
||||
"\n",
|
||||
"# If you get an error running this cell, then please head over to the troubleshooting notebook!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7b87cadb-d513-4303-baee-a37b6f938e4d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load environment variables from .env file (not included)\n",
|
||||
"\n",
|
||||
"load_dotenv(override=True)\n",
|
||||
"api_key = os.getenv('OPENAI_API_KEY')\n",
|
||||
"\n",
|
||||
"# Check the key\n",
|
||||
"\n",
|
||||
"if not api_key:\n",
|
||||
" print(\"No API key was found\")\n",
|
||||
"elif not api_key.startswith(\"sk-proj-\"):\n",
|
||||
" print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key\")\n",
|
||||
"elif api_key.strip() != api_key:\n",
|
||||
" print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them\")\n",
|
||||
"else:\n",
|
||||
" print(\"API key found and looks good so far!\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "019974d9-f3ad-4a8a-b5f9-0a3719aea2d3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"openai = OpenAI()\n",
|
||||
"\n",
|
||||
"# If this doesn't work, try Kernel menu >> Restart Kernel and Clear Outputs Of All Cells, then run the cells from the top of this notebook down.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c5e793b2-6775-426a-a139-4848291d0463",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A class to represent a Webpage\n",
|
||||
"\n",
|
||||
"# Some websites need you to use proper headers when fetching them:\n",
|
||||
"headers = {\n",
|
||||
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"class Website:\n",
|
||||
"\n",
|
||||
" def __init__(self, url):\n",
|
||||
" \"\"\"\n",
|
||||
" Create this Website object from the given url using the BeautifulSoup library\n",
|
||||
" \"\"\"\n",
|
||||
" self.url = url\n",
|
||||
" response = requests.get(url, headers=headers)\n",
|
||||
" soup = BeautifulSoup(response.content, 'html.parser')\n",
|
||||
" self.title = soup.title.string if soup.title else \"No title found\"\n",
|
||||
" for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
|
||||
" irrelevant.decompose()\n",
|
||||
" self.text = soup.body.get_text(separator=\"\\n\", strip=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2ef960cf-6dc2-4cda-afb3-b38be12f4c97",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"\n",
|
||||
"training_website = Website(\"https://learn.microsoft.com/en-us/training/paths/quantum-computing-fundamentals/\")\n",
|
||||
"print(training_website.title)\n",
|
||||
"print(training_website.text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "abdb8417-c5dc-44bc-9bee-2e059d162699",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Create a system prompt function that can use different language and length \n",
|
||||
"\n",
|
||||
"def build_system_prompt(language=\"Spanish\", length=\"short\"):\n",
|
||||
" return f\"\"\"You are an assistant that analyzes the contents of a website and provides a {length} summary, ignoring text that might be navigation related.\n",
|
||||
" Respond in 20 words or less markdown, and respond in {language}.\n",
|
||||
" \"\"\"\n",
|
||||
" \n",
|
||||
" \n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "987c95a6-6618-4d22-a2c3-3038a9d3f154",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Create a function that writes a User Prompt that asks for summaries of websites:\n",
|
||||
"\n",
|
||||
"def user_prompt_for(website):\n",
|
||||
" user_prompt = f\"You are looking at a website titled {website.title}\"\n",
|
||||
" user_prompt += \"\\nThe contents of this website is as follows; \\\n",
|
||||
"please provide a short summary in {language} of this website in markdown. \\\n",
|
||||
"If it includes news or announcements, then summarize these too.\\n\\n\"\n",
|
||||
" user_prompt += website.text\n",
|
||||
" return user_prompt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8a846c89-81d8-4f48-9d62-7744d76694e2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(user_prompt_for(training_website))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "26448ec4-5c00-4204-baec-7df91d11ff2e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(user_prompt_for(training_website))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d06e8d78-ce4c-4b05-aa8e-17050c82bb47",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## And now let's build useful messages for GPT-4o-mini, using a function"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0134dfa4-8299-48b5-b444-f2a8c3403c88",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"def messages_for(website, language=\"Spanish\", length=\"short\"):\n",
|
||||
" return [\n",
|
||||
" {\"role\": \"system\", \"content\": build_system_prompt(language, length)},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt_for(website)}\n",
|
||||
" ]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "16f49d46-bf55-4c3e-928f-68fc0bf715b0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Time to bring it together - the API for OpenAI is very simple!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "425214b8-c5c5-4d7a-8b79-f9e151c9d54f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "905b9919-aba7-45b5-ae65-81b3d1d78e34",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#call the OpenAI API. \n",
|
||||
"\n",
|
||||
"def summarize(url, language=\"Spanish\", length=\"short\"):\n",
|
||||
" website = Website(url)\n",
|
||||
" response = openai.chat.completions.create(\n",
|
||||
" model=\"gpt-4o-mini\",\n",
|
||||
" messages=messages_for(website, language, length)\n",
|
||||
" )\n",
|
||||
" return response.choices[0].message.content\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1c437357-d004-49f5-95c3-fce38aefcb5c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#Summarize all the lessons in microsoft quantum computer training, having the option to summarize by lesson, or the training as a whole\n",
|
||||
"\n",
|
||||
"def summarize_training(path_url, language=\"Spanish\", length=\"short\"):\n",
|
||||
" links = get_links_from_path(path_url)\n",
|
||||
" print(f\"Found {len(links)} lessons\")\n",
|
||||
"\n",
|
||||
" all_summaries = []\n",
|
||||
"\n",
|
||||
" for link in links:\n",
|
||||
" print(f\"Summarizing {link}...\")\n",
|
||||
" summary = summarize(link, language, length)\n",
|
||||
" all_summaries.append(f\"### {link}\\n{summary}\\n\")\n",
|
||||
"\n",
|
||||
" combined_prompt = \"Here are summaries of each lesson:\\n\\n\" + \"\\n\".join(all_summaries)\n",
|
||||
" response = openai.chat.completions.create(\n",
|
||||
" model=\"gpt-4o-mini\",\n",
|
||||
" messages=[\n",
|
||||
" {\"role\": \"system\", \"content\": build_system_prompt(language, length)},\n",
|
||||
" {\"role\": \"user\", \"content\": \"Please summarize the entire training path based on these lesson summaries:\\n\\n\" + combined_prompt}\n",
|
||||
" ]\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" return \"\\n\".join(all_summaries) + \"\\n\\n## General Course Summary\\n\" + response.choices[0].message.content\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "05e38d41-dfa4-4b20-9c96-c46ea75d9fb5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"summarize(\"https://learn.microsoft.com/en-us/training/paths/quantum-computing-fundamentals/\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3d926d59-450e-4609-92ba-2d6f244f1342",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A function to display this nicely in the Jupyter output, using markdown\n",
|
||||
"\n",
|
||||
"def display_summary(url):\n",
|
||||
" summary = summarize(url)\n",
|
||||
" display(Markdown(summary))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3018853a-445f-41ff-9560-d925d1774b2f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"display_summary(\"https://learn.microsoft.com/en-us/training/paths/quantum-computing-fundamentals/\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
337
week1/community-contributions/week-1_exercise.ipynb
Normal file
337
week1/community-contributions/week-1_exercise.ipynb
Normal file
@@ -0,0 +1,337 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "64d2e4a0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# End of Week 1 Exercise\n",
|
||||
"\n",
|
||||
"To demonstrate your familiarity with OpenAI API, and also Ollama, build a tool that takes a technical question,\n",
|
||||
"and responds with an explanation. This is a tool that you will be able to use yourself during the course!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "e62b915e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from openai import OpenAI\n",
|
||||
"import ollama\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"import os\n",
|
||||
"from IPython.display import display, update_display, Markdown"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "8bdfc47a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"MODEL_GPT = 'gpt-4o-mini'\n",
|
||||
"MODEL_LLAMA = 'llama3'\n",
|
||||
"load_dotenv()\n",
|
||||
"\n",
|
||||
"api_key = os.getenv('OPENAI_API_KEY')\n",
|
||||
"\n",
|
||||
"openai=OpenAI()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "57983d03",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def create_messages(prompt=\"Describe some of the business applications of Generative AI\"):\n",
|
||||
" \"\"\"Create properly formatted messages for API calls\"\"\"\n",
|
||||
" messages = [\n",
|
||||
" {\n",
|
||||
" \"role\": \"system\",\n",
|
||||
" \"content\": \"You are a helpful technical assistant that provides clear, detailed explanations for technical questions.\"\n",
|
||||
" },\n",
|
||||
" {\"role\": \"user\", \"content\": prompt}\n",
|
||||
" ]\n",
|
||||
" return messages"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "a6bcb94d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def answer_with_openai(prompt=\"Describe some of the business applications of Generative AI\"):\n",
|
||||
" \"\"\"Get answer using OpenAI API and print in stream\"\"\"\n",
|
||||
" try:\n",
|
||||
" messages = create_messages(prompt)\n",
|
||||
" stream = openai.chat.completions.create(\n",
|
||||
" model=MODEL_GPT,\n",
|
||||
" messages=messages,\n",
|
||||
" temperature=0.7,\n",
|
||||
" stream=True\n",
|
||||
" )\n",
|
||||
" answer = \"\"\n",
|
||||
" display_handle = display(Markdown(\"\"), display_id=True)\n",
|
||||
" for chunk in stream:\n",
|
||||
" if chunk.choices[0].delta.content:\n",
|
||||
" answer += chunk.choices[0].delta.content\n",
|
||||
" # Clean up markdown formatting for display\n",
|
||||
" clean_answer = answer.replace(\"```\", \"\").replace(\"markdown\", \"\")\n",
|
||||
" update_display(Markdown(clean_answer), display_id=display_handle.display_id)\n",
|
||||
" return answer\n",
|
||||
" except Exception as e:\n",
|
||||
" return f\"Error with OpenAI: {str(e)}\"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "e96159ab",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def answer_with_ollama(prompt=\"Describe some of the business applications of Generative AI\"):\n",
|
||||
" \"\"\"Get answer using Ollama API and print in stream\"\"\"\n",
|
||||
" try:\n",
|
||||
" messages = create_messages(prompt)\n",
|
||||
" stream = ollama.chat(\n",
|
||||
" model=MODEL_LLAMA,\n",
|
||||
" messages=messages,\n",
|
||||
" stream=True\n",
|
||||
" )\n",
|
||||
" answer = \"\"\n",
|
||||
" display_handle = display(Markdown(\"\"), display_id=True)\n",
|
||||
" for chunk in stream:\n",
|
||||
" if chunk['message']['content']:\n",
|
||||
" answer += chunk['message']['content']\n",
|
||||
" # Clean up markdown formatting for display\n",
|
||||
" clean_answer = answer.replace(\"```\", \"\").replace(\"markdown\", \"\")\n",
|
||||
" update_display(Markdown(clean_answer), display_id=display_handle.display_id)\n",
|
||||
" return answer\n",
|
||||
" except Exception as e:\n",
|
||||
" return f\"Error with Ollama: {str(e)}\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "ab72f8b6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def technical_qa_tool(question, use_openai=True, use_ollama=True):\n",
|
||||
" \"\"\"Main function to get technical explanations from both APIs\"\"\"\n",
|
||||
" print(f\"Question: {question}\")\n",
|
||||
" print(\"=\" * 80)\n",
|
||||
" \n",
|
||||
" if use_openai:\n",
|
||||
" print(\"\\n🤖 OpenAI Response:\")\n",
|
||||
" print(\"-\" * 40)\n",
|
||||
" answer_with_openai(question)\n",
|
||||
" \n",
|
||||
" if use_ollama:\n",
|
||||
" print(\"\\n🦙 Ollama Response:\")\n",
|
||||
" print(\"-\" * 40)\n",
|
||||
" answer_with_ollama(question)\n",
|
||||
" # display(Markdown(ollama_answer))\n",
|
||||
" \n",
|
||||
" print(\"\\n\" + \"=\" * 80)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "1a6aa4a2",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Question: What is the difference between supervised and unsupervised machine learning?\n",
|
||||
"================================================================================\n",
|
||||
"\n",
|
||||
"🤖 OpenAI Response:\n",
|
||||
"----------------------------------------\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"Supervised and unsupervised machine learning are two primary categories of machine learning techniques, and they differ mainly in how they learn from data and the type of problems they are used to solve. Here’s a detailed explanation of each:\n",
|
||||
"\n",
|
||||
"### Supervised Machine Learning\n",
|
||||
"\n",
|
||||
"**Definition**: In supervised learning, the model is trained on a labeled dataset, meaning that each training example is paired with an output label. The goal is to learn a mapping from inputs (features) to the output labels.\n",
|
||||
"\n",
|
||||
"**Characteristics**:\n",
|
||||
"- **Labeled Data**: Requires a dataset that includes both the input features and the corresponding output labels.\n",
|
||||
"- **Objective**: The objective is to predict the output for new, unseen data based on the learned mapping from the training data.\n",
|
||||
"- **Common Techniques**:\n",
|
||||
" - **Regression**: For predicting continuous values (e.g., predicting house prices).\n",
|
||||
" - **Classification**: For predicting discrete labels (e.g., spam detection in emails).\n",
|
||||
"- **Examples**:\n",
|
||||
" - Predicting whether an email is spam or not based on various features (classification).\n",
|
||||
" - Forecasting sales figures based on historical sales data (regression).\n",
|
||||
"\n",
|
||||
"### Unsupervised Machine Learning\n",
|
||||
"\n",
|
||||
"**Definition**: In unsupervised learning, the model is trained on data that is not labeled, meaning that it does not have predefined output labels. The goal is to discover patterns, groupings, or structures within the data.\n",
|
||||
"\n",
|
||||
"**Characteristics**:\n",
|
||||
"- **Unlabeled Data**: Works with datasets that only have input features without any associated output labels.\n",
|
||||
"- **Objective**: The objective is to explore the data and find hidden patterns or intrinsic structures without specific guidance.\n",
|
||||
"- **Common Techniques**:\n",
|
||||
" - **Clustering**: Grouping similar data points together (e.g., customer segmentation).\n",
|
||||
" - **Dimensionality Reduction**: Reducing the number of features while retaining essential information (e.g., PCA - Principal Component Analysis).\n",
|
||||
"- **Examples**:\n",
|
||||
" - Grouping customers into segments based on purchasing behavior (clustering).\n",
|
||||
" - Reducing the dimensionality of a dataset to visualize it in two or three dimensions (dimensionality reduction).\n",
|
||||
"\n",
|
||||
"### Key Differences\n",
|
||||
"\n",
|
||||
"1. **Data Type**:\n",
|
||||
" - Supervised Learning: Requires labeled data.\n",
|
||||
" - Unsupervised Learning: Works with unlabeled data.\n",
|
||||
"\n",
|
||||
"2. **Goal**:\n",
|
||||
" - Supervised Learning: To learn a function that maps inputs to the correct outputs.\n",
|
||||
" - Unsupervised Learning: To identify patterns or groupings in the input data.\n",
|
||||
"\n",
|
||||
"3. **Applications**:\n",
|
||||
" - Supervised Learning: Typically used in scenarios where past data with known outcomes is available (e.g., fraud detection, image classification).\n",
|
||||
" - Unsupervised Learning: Used for exploratory data analysis or when the outcome is not known (e.g., market basket analysis, anomaly detection).\n",
|
||||
"\n",
|
||||
"In summary, the primary difference between supervised and unsupervised machine learning lies in the presence or absence of labeled data and the objectives of the learning process. Supervised learning aims to predict outcomes based on existing labels, while unsupervised learning seeks to identify hidden structures in data without predefined labels."
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"🦙 Ollama Response:\n",
|
||||
"----------------------------------------\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"In machine learning, there are two main categories: supervised and unsupervised learning. The key difference lies in the type of data used to train the model and the goal of the learning process.\n",
|
||||
"\n",
|
||||
"**Supervised Learning**\n",
|
||||
"\n",
|
||||
"In supervised learning, you have a labeled dataset that contains both input data (features) and corresponding output labels or target variables. The goal is to learn a mapping between the input data and the output labels so that the model can make accurate predictions on new, unseen data.\n",
|
||||
"\n",
|
||||
"Here are some characteristics of supervised learning:\n",
|
||||
"\n",
|
||||
"1. Labeled training data: You have a dataset with input data and corresponding output labels.\n",
|
||||
"2. Specific goal: You want to predict the output label for a given input instance.\n",
|
||||
"3. Model evaluation: You evaluate the performance of your model using metrics like accuracy, precision, recall, F1 score, etc.\n",
|
||||
"\n",
|
||||
"Examples of supervised learning tasks include:\n",
|
||||
"\n",
|
||||
"* Image classification (e.g., recognizing dogs vs. cats)\n",
|
||||
"* Sentiment analysis (e.g., determining if text is positive or negative)\n",
|
||||
"* Regression problems (e.g., predicting house prices based on features like number of bedrooms and square footage)\n",
|
||||
"\n",
|
||||
"**Unsupervised Learning**\n",
|
||||
"\n",
|
||||
"In unsupervised learning, you have an unlabeled dataset, and the goal is to discover patterns, relationships, or structure in the data without a specific target variable. This type of learning is often used for exploratory data analysis, feature selection, and dimensionality reduction.\n",
|
||||
"\n",
|
||||
"Here are some characteristics of unsupervised learning:\n",
|
||||
"\n",
|
||||
"1. Unlabeled training data: You have a dataset with only input features (no output labels).\n",
|
||||
"2. No specific goal: You want to find interesting patterns or structure in the data.\n",
|
||||
"3. Model evaluation: You evaluate the performance of your model using metrics like silhouette score, Calinski-Harabasz index, etc.\n",
|
||||
"\n",
|
||||
"Examples of unsupervised learning tasks include:\n",
|
||||
"\n",
|
||||
"* Clustering (e.g., grouping customers based on their purchase history)\n",
|
||||
"* Dimensionality reduction (e.g., reducing the number of features in a dataset while preserving important information)\n",
|
||||
"* Anomaly detection (e.g., identifying unusual behavior or outliers in financial transactions)\n",
|
||||
"\n",
|
||||
"In summary, supervised learning involves training a model to make predictions based on labeled data, whereas unsupervised learning aims to discover patterns and relationships in unlabeled data."
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"================================================================================\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Test the tool with a technical question\n",
|
||||
"technical_question = \"What is the difference between supervised and unsupervised machine learning?\"\n",
|
||||
"technical_qa_tool(technical_question)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0a976ce1",
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9b0a539e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Interactive version - uncomment to use\n",
|
||||
"# user_question = input(\"Enter your technical question: \")\n",
|
||||
"# technical_qa_tool(user_question)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
216
week1/community-contributions/youtube_video_summarize.ipynb
Normal file
216
week1/community-contributions/youtube_video_summarize.ipynb
Normal file
@@ -0,0 +1,216 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 36,
|
||||
"id": "8ca2e60d-17c0-40fc-91c6-c16915b39c06",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import re, html, json\n",
|
||||
"import requests\n",
|
||||
"from urllib.error import HTTPError\n",
|
||||
"from openai import OpenAI\n",
|
||||
"from IPython.display import Markdown, display, update_display\n",
|
||||
"from youtube_transcript_api import YouTubeTranscriptApi, NoTranscriptFound, TranscriptsDisabled, VideoUnavailable\n",
|
||||
"\n",
|
||||
"OLLAMA_API = \"http://localhost:11434/api/chat\"\n",
|
||||
"HEADERS = {\"Content-Type\": \"application/json\"}\n",
|
||||
"MODEL = \"llama3.2\"\n",
|
||||
"api_key='ollama'\n",
|
||||
"\n",
|
||||
"def yt_title_desc_transcript(url: str, lang=\"en\"):\n",
|
||||
" \"\"\"\n",
|
||||
" Returns {\"title\": str|None, \"description\": str|None, \"transcript\": str|None}.\n",
|
||||
" - Title via oEmbed (no API key).\n",
|
||||
" - Description scraped from the watch page (shortDescription).\n",
|
||||
" - Transcript via youtube-transcript-api, gracefully handling 400/disabled.\n",
|
||||
" \"\"\"\n",
|
||||
" # --- extract 11-char video id ---\n",
|
||||
" m = re.search(r\"(?:v=|/)([0-9A-Za-z_-]{11})|^([0-9A-Za-z_-]{11})$\", url)\n",
|
||||
" vid = (m.group(1) or m.group(2)) if m else None\n",
|
||||
" if not vid:\n",
|
||||
" return {\"title\": None, \"description\": None, \"transcript\": None}\n",
|
||||
"\n",
|
||||
" # --- title via oEmbed (very robust) ---\n",
|
||||
" title = None\n",
|
||||
" try:\n",
|
||||
" r = requests.get(\"https://www.youtube.com/oembed\",\n",
|
||||
" params={\"url\": f\"https://www.youtube.com/watch?v={vid}\", \"format\": \"json\"},\n",
|
||||
" timeout=10)\n",
|
||||
" if r.ok:\n",
|
||||
" title = r.json().get(\"title\")\n",
|
||||
" except Exception:\n",
|
||||
" pass\n",
|
||||
"\n",
|
||||
" # --- description from watch page (shortDescription in initial JSON) ---\n",
|
||||
" description = None\n",
|
||||
" try:\n",
|
||||
" page = requests.get(f\"https://www.youtube.com/watch?v={vid}\", timeout=10).text\n",
|
||||
" # Look for ytInitialPlayerResponse JSON\n",
|
||||
" jmatch = re.search(r\"ytInitialPlayerResponse\\s*=\\s*({.*?});\", page, re.DOTALL)\n",
|
||||
" if jmatch:\n",
|
||||
" data = json.loads(jmatch.group(1))\n",
|
||||
" desc = data.get(\"videoDetails\", {}).get(\"shortDescription\")\n",
|
||||
" if desc:\n",
|
||||
" description = html.unescape(desc)\n",
|
||||
" except Exception:\n",
|
||||
" pass\n",
|
||||
"\n",
|
||||
" # --- transcript (handle 400 cleanly) ---\n",
|
||||
" transcript_text = None\n",
|
||||
" try:\n",
|
||||
" items = YouTubeTranscriptApi.get_transcript(vid, languages=[lang])\n",
|
||||
" transcript_text = \" \".join(ch[\"text\"].strip() for ch in items if ch.get(\"text\"))\n",
|
||||
" except (NoTranscriptFound, TranscriptsDisabled, VideoUnavailable, HTTPError):\n",
|
||||
" # HTTPError covers the \"HTTP Error 400: Bad Request\" case\n",
|
||||
" transcript_text = None\n",
|
||||
" except Exception:\n",
|
||||
" transcript_text = None\n",
|
||||
"\n",
|
||||
" return {\"title\": title, \"description\": description, \"transcript\": transcript_text}\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 32,
|
||||
"id": "ad9be496-4e91-4562-90f3-54d11208da55",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"system_prompt = '''\n",
|
||||
"You are an assistant that generates detailed yet concise summaries of YouTube videos.\n",
|
||||
"When the user provides a title and description of a YouTube video, your task is to write a coherent, engaging, and informative summary of around 500 words.\n",
|
||||
"The summary should:\n",
|
||||
"\n",
|
||||
"Capture the main themes and key points the video likely covers.\n",
|
||||
"\n",
|
||||
"Expand on the description logically, providing context and flow.\n",
|
||||
"\n",
|
||||
"Stay neutral, factual, and clear (no personal opinions).\n",
|
||||
"\n",
|
||||
"Be self-contained so it makes sense without needing to watch the video.\n",
|
||||
"'''"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 33,
|
||||
"id": "dd4be0bc-df1f-47e0-9e03-9b734117f80a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def user_prompt(title, description):\n",
|
||||
" prompt = '''Provide me the YouTube video title and description.\\n\n",
|
||||
" I will generate a clear, engaging, and concise summary of the video content in around 500 words,\\n\n",
|
||||
" highlighting the main ideas, key points, and important details.\\n'''\n",
|
||||
" prompt += f'here is the title : {title} \\n Description : {description} '\n",
|
||||
" return prompt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 34,
|
||||
"id": "46896ad3-db1e-448a-8a03-036b9568c69f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def stream_youtube(yt_url):\n",
|
||||
" ollama = OpenAI(base_url='http://localhost:11434/v1', api_key='ollama')\n",
|
||||
" video_metadata = yt_title_desc_transcript(yt_url)\n",
|
||||
" stream = ollama.chat.completions.create(\n",
|
||||
" model=MODEL,\n",
|
||||
" messages = [\n",
|
||||
" {\"role\":\"system\", \"content\": system_prompt},\n",
|
||||
" {\"role\":\"user\", \"content\": user_prompt(video_metadata['title'], video_metadata['description'])}\n",
|
||||
" ],\n",
|
||||
" stream=True\n",
|
||||
" \n",
|
||||
" )\n",
|
||||
" response = \"\"\n",
|
||||
" display_handle = display(Markdown(\"\"), display_id=True)\n",
|
||||
" for chunk in stream:\n",
|
||||
" response += chunk.choices[0].delta.content or ''\n",
|
||||
" response = response.replace(\"```\",\"\").replace(\"markdown\", \"\")\n",
|
||||
" update_display(Markdown(response), display_id=display_handle.display_id)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 37,
|
||||
"id": "b59f8773-c13e-4050-ad3c-b578d07ef5e7",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/markdown": [
|
||||
"Here is a summary of the YouTube video:\n",
|
||||
"\n",
|
||||
"**Monta Re: A Baul-Inspired Tribute to the Mystic Guru Shankaracharya**\n",
|
||||
"\n",
|
||||
"The music video for \"Monta Re\" by Amit Trivedi, featuring Swanand Kirkire and Amitabh Bhattacharya, is a soulful tribute to the mystic guru Shankaracharya. Set in the Bengali folk music tradition, this song brings to life the ancient tales of Shankaracharya's spiritual journey.\n",
|
||||
"\n",
|
||||
"With elegant lyrics penned by Amitabh Bhattacharya, \"Monta Re\" transports listeners to the banks of the Ganges River, where Shankaracharya wandered in search of wisdom and inner peace. The song's haunting melodies and emotive vocals evoke a sense of longing and introspection, perfectly capturing the mystic guru's spiritual essence.\n",
|
||||
"\n",
|
||||
"The music video beautifully illustrates the baul-inspired style, with intricate traditional dance movements performed by a group of energetic dancers. The choreography seamlessly blends elements of Bengal's folk heritage with modern sensibilities, making the song an engaging watch for audience members interested in Indian classical music.\n",
|
||||
"\n",
|
||||
"**Music and Lyric Credit:**\n",
|
||||
"Amit Trivedi handles the music composition, ensuring that the melody complements the song's themes without overpowering them. Amitabh Bhattacharya takes credit for the lyrics, which tell stunning stories of Shankaracharya's spiritual adventures. The song features Swanand Kirkire and Amitabh Bhattacharya as vocalists, further enriching its emotional impact.\n",
|
||||
"\n",
|
||||
"**Relevance to Bengali Culture:**\n",
|
||||
"\"Monta Re\" is a heartwarming tribute to Bengal's rich cultural heritage. Inspired by the baul traditions of the region, this song honors Shankaracharya's life and spiritual journey without diminishing his significance in modern times. By showcasing these folk roots, \"Monta Re\" provides fans with an enriching sensory experience.\n",
|
||||
"\n",
|
||||
"You can listen to \"Monta Re\" along with other T-Series music videos released by Amit Trivedi at the links provided below:\n",
|
||||
"\n",
|
||||
"- Watch \"Ankahee\"\n",
|
||||
"- Check out \"Sawaar Loon\"\n",
|
||||
"- Explore \"Zinda Hoon\"\n",
|
||||
"\n",
|
||||
"Follow the official T-SERIES YouTube channel for an ever-growing variety of original music tracks!\n",
|
||||
"\n",
|
||||
"By embracing the richness of Bengali folk traditions, \"Monta Re\" embodies a musical reflection of Shankaracharya's extraordinary journey as both spiritual guide and symbol of timeless wisdom."
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.Markdown object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"stream_youtube('https://youtu.be/99NUJ1cLbBI?list=RDdJ6_aU6auZc')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "649287ca-aff8-4b59-91b7-731c007e83a7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
223
week2/community-contributions/AddingGeminiToDropdown.ipynb
Normal file
223
week2/community-contributions/AddingGeminiToDropdown.ipynb
Normal file
@@ -0,0 +1,223 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "057bc09f-a682-4b72-97ed-c69ddef3f03e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Gemini to Dropdown"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d66eb067-7bae-4145-b613-6da2f40fbf27",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import requests\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"from typing import List\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from openai import OpenAI\n",
|
||||
"import google.generativeai as genai\n",
|
||||
"import anthropic"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e36f8a93-8a65-48f2-bcad-7c47dd72ef3a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import gradio as gr "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8a5ec1b0-f5b4-46d2-abb0-b28b73cc4d28",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"load_dotenv(override=True)\n",
|
||||
"openai_api_key = os.getenv('OPENAI_API_KEY')\n",
|
||||
"anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')\n",
|
||||
"google_api_key = os.getenv('GOOGLE_API_KEY')\n",
|
||||
"\n",
|
||||
"if openai_api_key:\n",
|
||||
" print(f\"OpenAI API Key exists and begins {openai_api_key[:8]}\")\n",
|
||||
"else:\n",
|
||||
" print(\"OpenAI API Key not set\")\n",
|
||||
" \n",
|
||||
"if anthropic_api_key:\n",
|
||||
" print(f\"Anthropic API Key exists and begins {anthropic_api_key[:7]}\")\n",
|
||||
"else:\n",
|
||||
" print(\"Anthropic API Key not set\")\n",
|
||||
"\n",
|
||||
"if google_api_key:\n",
|
||||
" print(f\"Google API Key exists and begins {google_api_key[:8]}\")\n",
|
||||
"else:\n",
|
||||
" print(\"Google API Key not set\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "26d0099c-890f-4358-8c1d-7a708abcb105",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"openai = OpenAI()\n",
|
||||
"\n",
|
||||
"claude = anthropic.Anthropic()\n",
|
||||
"\n",
|
||||
"google.generativeai.configure()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6606bfdb-964e-4d6f-b2a1-5017b99aa23d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"system_message = \"You are a helpful assistant\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e0cfb96a-2dbe-4228-8efb-75947dbc3228",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def stream_gpt(prompt):\n",
|
||||
" messages = [\n",
|
||||
" {\"role\": \"system\", \"content\": system_message},\n",
|
||||
" {\"role\": \"user\", \"content\": prompt}\n",
|
||||
" ]\n",
|
||||
" stream = openai.chat.completions.create(\n",
|
||||
" model='gpt-4o-mini',\n",
|
||||
" messages=messages,\n",
|
||||
" stream=True\n",
|
||||
" )\n",
|
||||
" result = \"\"\n",
|
||||
" for chunk in stream:\n",
|
||||
" result += chunk.choices[0].delta.content or \"\"\n",
|
||||
" yield result"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9008a15d-0ee8-44e0-b123-225e7148113e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def stream_claude(prompt):\n",
|
||||
" result = claude.messages.stream(\n",
|
||||
" model=\"claude-3-haiku-20240307\",\n",
|
||||
" max_tokens=1000,\n",
|
||||
" temperature=0.7,\n",
|
||||
" system=system_message,\n",
|
||||
" messages=[\n",
|
||||
" {\"role\": \"user\", \"content\": prompt},\n",
|
||||
" ],\n",
|
||||
" )\n",
|
||||
" response = \"\"\n",
|
||||
" with result as stream:\n",
|
||||
" for text in stream.text_stream:\n",
|
||||
" response += text or \"\"\n",
|
||||
" yield response"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "378ad12e-6645-4647-807c-00995e360268",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def stream_gemini(prompt):\n",
|
||||
" gemini = genai.GenerativeModel(\n",
|
||||
" model_name=\"gemini-2.0-flash\",\n",
|
||||
" system_instruction=system_message\n",
|
||||
" )\n",
|
||||
" \n",
|
||||
" stream = gemini.generate_content(prompt, stream=True)\n",
|
||||
" \n",
|
||||
" result = \"\"\n",
|
||||
" for chunk in stream:\n",
|
||||
" try:\n",
|
||||
" part = chunk.text\n",
|
||||
" if part:\n",
|
||||
" result += part\n",
|
||||
" yield result \n",
|
||||
" except Exception as e:\n",
|
||||
" print(\"Chunk error:\", e)\n",
|
||||
" \n",
|
||||
" \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "fd50e143-eead-49b1-8ea3-b440becd4bc9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def stream_model(prompt, model):\n",
|
||||
" if model==\"GPT\":\n",
|
||||
" result = stream_gpt(prompt)\n",
|
||||
" elif model==\"Claude\":\n",
|
||||
" result = stream_claude(prompt)\n",
|
||||
" elif model==\"Gemini\":\n",
|
||||
" result = stream_gemini(prompt)\n",
|
||||
" else:\n",
|
||||
" raise ValueError(\"Unknown model\")\n",
|
||||
" yield from result"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c7fc9cb4-fbb8-4301-86a6-96c90f67eb3b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"view = gr.Interface(\n",
|
||||
" fn=stream_model,\n",
|
||||
" inputs=[gr.Textbox(label=\"Your message:\"), gr.Dropdown([\"GPT\", \"Claude\",\"Gemini\"], label=\"Select model\", value=\"GPT\")],\n",
|
||||
" outputs=[gr.Markdown(label=\"Response:\")],\n",
|
||||
" flagging_mode=\"never\"\n",
|
||||
")\n",
|
||||
"view.launch()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
252
week2/community-contributions/Mediterranean Banter.ipynb
Normal file
252
week2/community-contributions/Mediterranean Banter.ipynb
Normal file
@@ -0,0 +1,252 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9ab6f493-026f-4950-b244-22c5251b8daa",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Mediterranean Banter"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4568bc63-679e-4ea1-a9c9-b85dfc386ec7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from openai import OpenAI\n",
|
||||
"import anthropic\n",
|
||||
"from IPython.display import Markdown, display, update_display"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7a852bee-76e6-4538-89a3-0702c2d5f05c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import google.generativeai"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c843ac74-4797-4bd0-bed2-dfcaa2f98c41",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"load_dotenv(override=True)\n",
|
||||
"openai_api_key = os.getenv('OPENAI_API_KEY')\n",
|
||||
"anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')\n",
|
||||
"google_api_key = os.getenv('GOOGLE_API_KEY')\n",
|
||||
"\n",
|
||||
"if openai_api_key:\n",
|
||||
" print(f\"OpenAI API Key exists and begins {openai_api_key[:8]}\")\n",
|
||||
"else:\n",
|
||||
" print(\"OpenAI API Key not set\")\n",
|
||||
" \n",
|
||||
"if anthropic_api_key:\n",
|
||||
" print(f\"Anthropic API Key exists and begins {anthropic_api_key[:7]}\")\n",
|
||||
"else:\n",
|
||||
" print(\"Anthropic API Key not set\")\n",
|
||||
"\n",
|
||||
"if google_api_key:\n",
|
||||
" print(f\"Google API Key exists and begins {google_api_key[:8]}\")\n",
|
||||
"else:\n",
|
||||
" print(\"Google API Key not set\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e4181018-1818-47fa-a3fb-554627a63f69",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"openai = OpenAI()\n",
|
||||
"\n",
|
||||
"claude = anthropic.Anthropic()\n",
|
||||
"\n",
|
||||
"google.generativeai.configure()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "19f833c8-e183-469c-a8c8-1c014889a15d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"gpt_model_spain = \"gpt-4.1-mini\"\n",
|
||||
"claude_model_italy = \"claude-3-5-haiku-latest\"\n",
|
||||
"gemini_model_france = \"gemini-2.0-flash\"\n",
|
||||
"\n",
|
||||
"gpt_system_spain = \"You are a chatbot who is very argumentative; \\\n",
|
||||
"you disagree with anything other than how beautiful and great South of Spain is.\"\n",
|
||||
"\n",
|
||||
"claude_system_italy = \"You are a very polite, courteous chatbot but you favour south of Italy prasing Amalfi Coast.\"\n",
|
||||
"\n",
|
||||
"gemini_system_france = \"You are neutral. You always accept what other people say and never try to put down other people thoughts.Although, you politely try to include that South of France is beautiful\"\n",
|
||||
"\n",
|
||||
"gpt_messages_spain = [\"Hi there, nothing beats the beauty of Spain and its wonderful beaches.\"]\n",
|
||||
"claude_messages_italy = [\"I agree. I admire the Southern Part of Spain but its not as pretty as Amalfi Coast.\"]\n",
|
||||
"gemini_messages_france = [\"Well, both are good and so is the French Riveria.\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2d426b95-5e7c-49aa-a5a1-9613296eb0d0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def call_gpt():\n",
|
||||
" messages = [{\"role\": \"system\", \"content\": gpt_system_spain}]\n",
|
||||
" for gpt, claude,gemini in zip(gpt_messages_spain, claude_messages_italy,gemini_messages_france):\n",
|
||||
" messages.append({\"role\": \"assistant\", \"content\": gpt})\n",
|
||||
" messages.append({\"role\": \"user\", \"content\": claude})\n",
|
||||
" messages.append({\"role\": \"user\", \"content\": gemini})\n",
|
||||
" completion = openai.chat.completions.create(\n",
|
||||
" model=gpt_model_spain,\n",
|
||||
" messages=messages\n",
|
||||
" )\n",
|
||||
" return completion.choices[0].message.content"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3fc9a696-3145-4f37-873b-539647f2fc0b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"call_gpt()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "63910faa-a122-4261-82a0-7530c6c5749a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def call_claude():\n",
|
||||
" messages = []\n",
|
||||
" for gpt_spain, claude_italy,gemini_france in zip(gpt_messages_spain, claude_messages_italy,gemini_messages_france):\n",
|
||||
" messages.append({\"role\": \"user\", \"content\": gpt_spain})\n",
|
||||
" messages.append({\"role\": \"user\", \"content\": gemini_france})\n",
|
||||
" messages.append({\"role\": \"assistant\", \"content\": claude_italy})\n",
|
||||
" messages.append({\"role\": \"user\", \"content\": gpt_messages_spain[-1]})\n",
|
||||
" messages.append({\"role\": \"user\", \"content\": gemini_messages_france[-1]})\n",
|
||||
" message = claude.messages.create(\n",
|
||||
" model=claude_model_italy,\n",
|
||||
" system=claude_system_italy,\n",
|
||||
" messages=messages,\n",
|
||||
" max_tokens=500\n",
|
||||
" )\n",
|
||||
" return message.content[0].text"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d3ab6aa2-a462-4fb3-bb6a-dc6b971827fa",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"call_claude()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "114cb7eb-0915-46ac-b285-e40acf4a9ffb",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def call_gemini():\n",
|
||||
" messages=[]\n",
|
||||
" for gpt_spain, claude_italy,gemini_france in zip(gpt_messages_spain, claude_messages_italy,gemini_messages_france):\n",
|
||||
" messages.append({\"role\": \"user\", \"content\": gpt_spain})\n",
|
||||
" messages.append({\"role\": \"user\", \"content\": claude_italy})\n",
|
||||
" messages.append({\"role\": \"assistant\", \"content\": gemini_france})\n",
|
||||
" messages.append({\"role\": \"user\", \"content\": gpt_messages_spain[-1]})\n",
|
||||
" messages.append({\"role\": \"user\", \"content\": claude_messages_italy[-1]})\n",
|
||||
" gemini = google.generativeai.GenerativeModel(\n",
|
||||
" model_name='gemini-2.0-flash',\n",
|
||||
" system_instruction=gemini_system_france\n",
|
||||
" )\n",
|
||||
" dialogue_text = \"\\n\".join(f\"{m['role']}: {m['content']}\" for m in messages)\n",
|
||||
" response = gemini.generate_content(dialogue_text)\n",
|
||||
" return response.text\n",
|
||||
" \n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e3acf708-f9b1-4a6d-b3e1-823c96d00555",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"call_gemini()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c103430e-68c7-4cc6-8a43-6b5aec7fdc96",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"gpt_messages_spain = [\"Hi there, nothing beats the beauty of Spain and its wonderful beaches.\"]\n",
|
||||
"claude_messages_italy = [\"I agree. I admire the Southern Part of Spain but its not as pretty as Amalfi Coast.\"]\n",
|
||||
"gemini_messages_france = [\"Well, both are good and so is the French Riveria.\"]\n",
|
||||
"\n",
|
||||
"print(f\"GPT:\\n{gpt_messages_spain[0]}\\n\")\n",
|
||||
"print(f\"Claude:\\n{claude_messages_italy[0]}\\n\")\n",
|
||||
"print(f\"Gemini:\\n{gemini_messages_france[0]}\\n\")\n",
|
||||
"\n",
|
||||
"for i in range(5):\n",
|
||||
" gpt_next = call_gpt()\n",
|
||||
" print(f\"GPT:\\n{gpt_next}\\n\")\n",
|
||||
" gpt_messages_spain.append(gpt_next)\n",
|
||||
" \n",
|
||||
" claude_next = call_claude()\n",
|
||||
" print(f\"Claude:\\n{claude_next}\\n\")\n",
|
||||
" claude_messages_italy.append(claude_next)\n",
|
||||
"\n",
|
||||
" gemini_next = call_gemini()\n",
|
||||
" print(f\"Gemini:\\n{gemini_next}\\n\")\n",
|
||||
" gemini_messages_france.append(gemini_next)\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
181
week2/community-contributions/SushiRestaurant.ipynb
Normal file
181
week2/community-contributions/SushiRestaurant.ipynb
Normal file
@@ -0,0 +1,181 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "757905af-7f93-4dca-9526-063bc93a78c7",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Sakana-ya (魚屋) Sushi\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9a6721fb-efca-4412-a0a7-cc8e6c4ced76",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from openai import OpenAI\n",
|
||||
"import gradio as gr\n",
|
||||
"import json"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b0fa458f-f73f-491c-b666-95db4b91f571",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"load_dotenv(override=True)\n",
|
||||
"openai_api_key = os.getenv('OPENAI_API_KEY')\n",
|
||||
"anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')\n",
|
||||
"google_api_key = os.getenv('GOOGLE_API_KEY')\n",
|
||||
"\n",
|
||||
"if openai_api_key:\n",
|
||||
" print(f\"OpenAI API Key exists and begins {openai_api_key[:8]}\")\n",
|
||||
"else:\n",
|
||||
" print(\"OpenAI API Key not set\")\n",
|
||||
" \n",
|
||||
"if anthropic_api_key:\n",
|
||||
" print(f\"Anthropic API Key exists and begins {anthropic_api_key[:7]}\")\n",
|
||||
"else:\n",
|
||||
" print(\"Anthropic API Key not set\")\n",
|
||||
"\n",
|
||||
"if google_api_key:\n",
|
||||
" print(f\"Google API Key exists and begins {google_api_key[:8]}\")\n",
|
||||
"else:\n",
|
||||
" print(\"Google API Key not set\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "aa2846f2-e09c-421d-9774-c04961a79800",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"openai = OpenAI()\n",
|
||||
"MODEL = 'gpt-4o-mini'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7672ecdf-cf50-4b96-887a-b0a4eb5bbbf5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
" \n",
|
||||
"menu = {\n",
|
||||
" \"Nigiri (1 pc)\": {\n",
|
||||
" \"Salmon\": 4.25,\n",
|
||||
" \"Tuna\": 4.75,\n",
|
||||
" \"Yellowtail\": 5.00,\n",
|
||||
" \"Eel\": 5.25,\n",
|
||||
" \"Tamago\": 3.00,\n",
|
||||
" },\n",
|
||||
" \"Sashimi (3 pc)\": {\n",
|
||||
" \"Salmon\": 8.50,\n",
|
||||
" \"Tuna\": 9.00,\n",
|
||||
" \"Yellowtail\": 9.50,\n",
|
||||
" \"Octopus\": 8.00,\n",
|
||||
" },\n",
|
||||
" \"Classic Rolls (6 pc)\": {\n",
|
||||
" \"California\": 6.50,\n",
|
||||
" \"Spicy Tuna\": 7.50,\n",
|
||||
" \"Philadelphia\": 7.25,\n",
|
||||
" \"Cucumber\": 4.50,\n",
|
||||
" \"Avocado\": 4.75,\n",
|
||||
" },\n",
|
||||
" \"Specialty Rolls (8 pc)\": {\n",
|
||||
" \"Dragon\": 13.50,\n",
|
||||
" \"Rainbow\": 14.00,\n",
|
||||
" \"Crunchy Shrimp\": 12.50,\n",
|
||||
" \"Volcano\": 13.00,\n",
|
||||
" \"Spider\": 14.50,\n",
|
||||
" },\n",
|
||||
" \"Appetizers\": {\n",
|
||||
" \"Edamame\": 5.00,\n",
|
||||
" \"Gyoza (5)\": 6.50,\n",
|
||||
" \"Miso Soup\": 3.00,\n",
|
||||
" \"Seaweed Salad\": 5.50,\n",
|
||||
" },\n",
|
||||
" \"Beverages\": {\n",
|
||||
" \"Green Tea\": 2.50,\n",
|
||||
" \"Ramune Soda\": 3.00,\n",
|
||||
" \"Sparkling Water\": 2.75,\n",
|
||||
" },\n",
|
||||
" \"Desserts\": {\n",
|
||||
" \"Mochi Ice Cream (2)\": 5.00,\n",
|
||||
" \"Matcha Cheesecake\": 6.50,\n",
|
||||
" },\n",
|
||||
" }"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "99914500-3630-4fea-987c-d19c760994c6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def chat(message, history):\n",
|
||||
" system_message = \"You are a helpful assistant for Sakana-ya (魚屋) Sushi restaurant.\\\n",
|
||||
" Help out with information and if you dont know something just say you cant help with that.\"\n",
|
||||
" system_message += json.dumps(menu)\n",
|
||||
" system_message+=\"If something is not in the menu, we dont serve it.\\\n",
|
||||
" If we dont have a dish just mention it that we dont offer it. \"\n",
|
||||
"\n",
|
||||
" sushi_exotic = [\n",
|
||||
" {\"role\": \"user\", \"content\": \"Do you have aji?\"},\n",
|
||||
" {\"role\": \"user\", \"content\": \"We currently dont have shun its available only during the season i.e in May.\"},\n",
|
||||
" {\"role\": \"user\", \"content\": \"What about buri?\"},\n",
|
||||
" {\"role\": \"user\", \"content\": \"Thats seasonal as well only during December. Do visit us during that time.\"},\n",
|
||||
" \n",
|
||||
" ]\n",
|
||||
" \n",
|
||||
" messages = [{\"role\": \"system\", \"content\": system_message}]+ sushi_exotic + history + [{\"role\": \"user\", \"content\": message}]\n",
|
||||
" stream = openai.chat.completions.create(model=MODEL, messages=messages, stream=True)\n",
|
||||
"\n",
|
||||
" response = \"\"\n",
|
||||
" for chunk in stream:\n",
|
||||
" response += chunk.choices[0].delta.content or ''\n",
|
||||
" yield response"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a5c61d91-abee-4ada-9a42-ae87cf53fcff",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"gr.ChatInterface(fn=chat, type=\"messages\").launch()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,255 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "596b764a-2ece-4cb0-91c7-5317b8b2c65f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from openai import OpenAI\n",
|
||||
"from IPython.display import Markdown, display, update_display"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "191079a8-fcb0-45fa-a954-9e92e3baa250",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"load_dotenv(override=True)\n",
|
||||
"openai_api_key = os.getenv('OPENAI_API_KEY')\n",
|
||||
"anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')\n",
|
||||
"google_api_key = os.getenv('GOOGLE_API_KEY')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3a0f19ff-c936-469f-9fa1-c09b5c126263",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"gpt_model = \"gpt-4.1-mini\"\n",
|
||||
"claude_model = \"claude-3-5-haiku-latest\"\n",
|
||||
"gemini_model = \"gemini-2.5-flash\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c1ffa25e-8250-4a86-951a-af44f1369336",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"gpt_client = OpenAI(\n",
|
||||
" api_key=openai_api_key\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"claude_client = OpenAI(\n",
|
||||
" api_key=anthropic_api_key,\n",
|
||||
" base_url=\"https://api.anthropic.com/v1/\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"gemini_client = OpenAI(\n",
|
||||
" api_key=google_api_key,\n",
|
||||
" base_url=\"https://generativelanguage.googleapis.com/v1beta/openai/\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "eb8a203d-bdc7-40ee-a456-d47bdc71b07f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Tests\n",
|
||||
"\n",
|
||||
"messages = [{\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
|
||||
" {\"role\": \"user\", \"content\": \"Howdy partner!\"}]\n",
|
||||
"\n",
|
||||
"gpt_response = gpt_client.chat.completions.create(\n",
|
||||
" model=gpt_model,\n",
|
||||
" messages=messages,\n",
|
||||
" temperature=0.5\n",
|
||||
")\n",
|
||||
"print(f\"GPT: {gpt_response.choices[0].message.content}\")\n",
|
||||
"\n",
|
||||
"claude_response = claude_client.chat.completions.create(\n",
|
||||
" model=claude_model,\n",
|
||||
" messages=messages,\n",
|
||||
" temperature=0.5\n",
|
||||
")\n",
|
||||
"print(f\"Claude: {claude_response.choices[0].message.content}\")\n",
|
||||
"\n",
|
||||
"gemini_response = gemini_client.chat.completions.create(\n",
|
||||
" model=gemini_model,\n",
|
||||
" messages=messages,\n",
|
||||
" temperature=0.5\n",
|
||||
")\n",
|
||||
"print(f\"Gemini: {gemini_response.choices[0].message.content}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d140561e-fbf8-4741-b0bd-f850524bd6b3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"gpt_system = \"You are 'user_1'. You are snarky, entitled, and argumentative. Your role is to try and argue about anything and everything, and always have the last word, and never back down.\"\n",
|
||||
"claude_system = \"You are 'user_2'. You are a sharp debater. You always debate every argument, and you do everything you can to be the debate winner. You don't stop until you have the upper hand.\"\n",
|
||||
"gemini_system = \"You are 'user_3'. You are a mediator, coach and philosopher. Your job is to bring two sides to an agreement and have them stop arguing.\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b2b26a34-eb36-41c1-be2d-fc8154218897",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"apis = {\n",
|
||||
" \"gpt\": {\n",
|
||||
" \"name\": \"gpt\",\n",
|
||||
" \"user_name\": \"Gapetto\",\n",
|
||||
" \"client\": gpt_client,\n",
|
||||
" \"model\": gpt_model,\n",
|
||||
" \"system\": gpt_system,\n",
|
||||
" \"messages\": [],\n",
|
||||
" },\n",
|
||||
" \"claude\": {\n",
|
||||
" \"name\": \"claude\",\n",
|
||||
" \"user_name\": \"Claudia\",\n",
|
||||
" \"client\": claude_client,\n",
|
||||
" \"model\": claude_model,\n",
|
||||
" \"system\": claude_system,\n",
|
||||
" \"messages\": [],\n",
|
||||
" },\n",
|
||||
" \"gemini\": {\n",
|
||||
" \"name\": \"gemini\",\n",
|
||||
" \"user_name\": \"Germione\",\n",
|
||||
" \"client\": gemini_client,\n",
|
||||
" \"model\": gemini_model,\n",
|
||||
" \"system\": gemini_system,\n",
|
||||
" \"messages\": []\n",
|
||||
" }\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "88bb7277-45dc-41b4-827c-b2e5a8b76675",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def call_llm(name: str):\n",
|
||||
" llm = apis[name]\n",
|
||||
" context = [{\"role\": \"system\", \"content\": llm[\"system\"]}]\n",
|
||||
" \n",
|
||||
" gpt_role, gpt_name = (\"assistant\", \"\") if name == \"gpt\" else (\"user\", f'{apis[\"gpt\"][\"user_name\"]}: ')\n",
|
||||
" claude_role, claude_name = (\"assistant\", \"\") if name == \"claude\" else (\"user\", f'{apis[\"claude\"][\"user_name\"]}: ')\n",
|
||||
" gemini_role, gemini_name = (\"assistant\", \"\") if name == \"gemini\" else (\"user\", f'{apis[\"gemini\"][\"user_name\"]}: ')\n",
|
||||
" \n",
|
||||
" for gpt, claude, gemini in zip(apis[\"gpt\"][\"messages\"], apis[\"claude\"][\"messages\"], apis[\"gemini\"][\"messages\"]):\n",
|
||||
" context.append({\"role\": gpt_role, \"content\": f\"{gpt_name}{gpt}\"})\n",
|
||||
" context.append({\"role\": claude_role, \"content\": f\"{claude_name}{claude}\"})\n",
|
||||
" context.append({\"role\": gemini_role, \"content\": f\"{gemini_name}{gemini}\"})\n",
|
||||
" \n",
|
||||
" for i, key in enumerate(apis.keys()):\n",
|
||||
" if key != name:\n",
|
||||
" if len(apis[key][\"messages\"]) > len(llm[\"messages\"]):\n",
|
||||
" context.append({\"role\": \"user\", \"content\": f'{apis[key][\"user_name\"]}: {apis[key][\"messages\"][-1]}'})\n",
|
||||
" \n",
|
||||
" response = llm[\"client\"].chat.completions.create(\n",
|
||||
" model=llm[\"model\"],\n",
|
||||
" messages=context,\n",
|
||||
" temperature=0.7\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" message = response.choices[0].message.content\n",
|
||||
" llm[\"messages\"].append(message)\n",
|
||||
" return message"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4fc73a2e-d8de-4a39-bfa2-67b16c231869",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"apis[\"gpt\"][\"messages\"] = [\"Hi\"]\n",
|
||||
"apis[\"claude\"][\"messages\"] = [\"Hi\"]\n",
|
||||
"apis[\"gemini\"][\"messages\"] = [\"Lord of the Rings or Harry Potter?\"] # Obviously LOTR."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3810fbaf-94d1-4750-8e13-812d2e05b2d7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"gpt_response = call_llm(\"gpt\")\n",
|
||||
"display(Markdown(f\"### Gapetto:\\n{gpt_response}\\n\\n\"))\n",
|
||||
"\n",
|
||||
"claude_response = call_llm(\"claude\")\n",
|
||||
"display(Markdown(f\"### Claudia:\\n{claude_response}\\n\\n\"))\n",
|
||||
"\n",
|
||||
"gemini_response = call_llm(\"gemini\")\n",
|
||||
"display(Markdown(f\"### Germione:\\n{gemini_response}\\n\\n\"))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e87b2ffc-6197-401a-97ca-7f51ac1677f2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"apis[\"gpt\"][\"messages\"] = [\"Hi\"]\n",
|
||||
"apis[\"claude\"][\"messages\"] = [\"Hi\"]\n",
|
||||
"apis[\"gemini\"][\"messages\"] = [\"Lord of the Rings or Harry Potter?\"]\n",
|
||||
"\n",
|
||||
"for i in range(5):\n",
|
||||
" display(Markdown(f\"## Round {i+1}:\\n\\n\"))\n",
|
||||
" \n",
|
||||
" gpt_response = call_llm(\"gpt\")\n",
|
||||
" display(Markdown(f\"### Gapetto:\\n{gpt_response}\\n\\n\"))\n",
|
||||
"\n",
|
||||
" claude_response = call_llm(\"claude\")\n",
|
||||
" display(Markdown(f\"### Claudia:\\n{claude_response}\\n\\n\"))\n",
|
||||
"\n",
|
||||
" gemini_response = call_llm(\"gemini\")\n",
|
||||
" display(Markdown(f\"### Germione:\\n{gemini_response}\\n\\n\"))"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,968 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "bcb31876-4d8c-41ef-aa24-b8c78dfd5808",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Project - Stock Information AI Assistant\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b7bd1bd7-19d9-4c4b-bc4b-9bc9cca8bd0f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!pip install finnhub-python"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8b50bbe2-c0b1-49c3-9a5c-1ba7efa2bcb4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# imports\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"import json\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from openai import OpenAI\n",
|
||||
"import gradio as gr\n",
|
||||
"import finnhub\n",
|
||||
"from typing import Dict, List, Any, Optional\n",
|
||||
"from datetime import datetime"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ba0ddc1a-c775-4ed3-9531-ed0c5799e87f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import logging\n",
|
||||
"\n",
|
||||
"# Configure root logger\n",
|
||||
"logging.basicConfig(\n",
|
||||
" level=logging.INFO, # Set level: DEBUG, INFO, WARNING, ERROR\n",
|
||||
" format=\"%(asctime)s [%(levelname)s] %(message)s\", \n",
|
||||
" force=True # Ensures reconfiguration if you rerun this cell\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"logger = logging.getLogger(__name__) # Use a global logger object\n",
|
||||
"logger.info(\"Logger initialized!\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "747e8786-9da8-4342-b6c9-f5f69c2e22ae",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Initialization\n",
|
||||
"\n",
|
||||
"load_dotenv(override=True)\n",
|
||||
"\n",
|
||||
"openai_api_key = os.getenv('OPENAI_API_KEY')\n",
|
||||
"FINNHUB_API_KEY = os.getenv(\"FINNHUB_API_KEY\")\n",
|
||||
"\n",
|
||||
"if openai_api_key:\n",
|
||||
" logger.info(f\"OpenAI API Key exists and begins {openai_api_key[:8]}\")\n",
|
||||
"else:\n",
|
||||
" logger.error(\"OpenAI API Key not set\")\n",
|
||||
"\n",
|
||||
"if FINNHUB_API_KEY:\n",
|
||||
" logger.info(f\"FINNHUB_API_KEY exists!\")\n",
|
||||
"else:\n",
|
||||
"    logger.error(\"FINNHUB API Key not set\")\n",
|
||||
" \n",
|
||||
"MODEL = \"gpt-4.1-mini\" # not using gpt-5-mini as openai doesn't let you stream responses till you are a verified organisation :(\n",
|
||||
"openai = OpenAI()\n",
|
||||
"finnhub_client = finnhub.Client(api_key=FINNHUB_API_KEY)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ee3aaa9a-5495-42fd-a382-803fbfa92eaf",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"system_message = f\"\"\"\n",
|
||||
"You are \"TickerBot\" — a concise, factual, educational assistant specializing in U.S. stocks. \n",
|
||||
"Your job: quickly and accurately explain stock and company information in plain English. NEVER give investment advice, buy/sell recommendations, or price predictions.\n",
|
||||
"\n",
|
||||
"## PRIVACY ABOUT IMPLEMENTATION\n",
|
||||
"- Do not reveal any internal implementation details to users. Never display or mention internal tool names, API names, developer notes, configured flags, date-range limits, or other system/developer constraints in user-facing replies.\n",
|
||||
"- All runtime/tool constraints and capability detection are internal. Present only user-facing capabilities in plain language.\n",
|
||||
"\n",
|
||||
"## USER-FACING CAPABILITIES\n",
|
||||
"- When asked \"What can you do?\", list only stock-relevant actions in plain language. Example reply:\n",
|
||||
" \"I can look up tickers, show the latest quotes, provide key company financials and latest earnings details, summarize recent company or market headlines, and give a brief market overview.\"\n",
|
||||
"- Do not list internal utilities or developer tools as user-facing capabilities.\n",
|
||||
"\n",
|
||||
"## GENERAL PRINCIPLES\n",
|
||||
"- Answer only what was asked for. \n",
|
||||
"- Be brief, clear, and professional while still maintaining a warm tone. Use short paragraphs and one-line bullet explanations when requested.\n",
|
||||
"- Return only what the system provides; do not invent, infer, or extrapolate unavailable data.\n",
|
||||
"- Never offer or advertise any feature the environment does not actually support. Avoid offering attachments, direct downloads, or full-text article retrieval unless the system explicitly provides those outputs.\n",
|
||||
"\n",
|
||||
"## Behavior Rules\n",
|
||||
"- Stay professional and neutral at all times. \n",
|
||||
"- Clarify only when user intent is ambiguous; never guess. \n",
|
||||
"- Only disclose information the user explicitly requested. \n",
|
||||
"- Never explain system limits (e.g., API ranges, date limits) ever. \n",
|
||||
"- Summaries should be tight and relevant, not verbose. \n",
|
||||
"\n",
|
||||
"## NEWS & HEADLINES\n",
|
||||
"- When interpreting date-related or temporal reasoning requests (e.g., “latest earnings,” “recent news,” “Q1 results”), call `get_current_time` to determine the current date.\n",
|
||||
"- Present news/headlines in concise bullet lines when requested. Default recent-window behavior is internal; do not describe or expose internal default windows or limits to the user.\n",
|
||||
"- If the system only returns headlines/summaries, present those and do not offer to fetch full-text or additional ranges unless the user explicitly asks and the environment supports that action.\n",
|
||||
"\n",
|
||||
"## FOLLOW-UP & CLARIFYING QUESTIONS\n",
|
||||
"- If no matching stock symbol is found, ask the user to clarify the name or ticker. Mention you only support U.S. stocks. If they confirm the symbol but no data exists, state that no results were found.\n",
|
||||
"- Never append unsolicited menus, multi-choice lists, or repeated \"Would you like...\" prompts at the end of a normal reply.\n",
|
||||
"- Ask a single direct clarifying question only when strictly necessary to fulfill the user's request (for example: ambiguous company name or missing ticker). That single question must be the final line of the reply.\n",
|
||||
"- If the user's intent is clear, proceed and return results. Do not request confirmations or offer options unless required to complete the task.\n",
|
||||
"\n",
|
||||
"## MISSING-DATA / NOTE RULES\n",
|
||||
"- Do NOT call out missing/unavailable single fields unless:\n",
|
||||
" 1) the missing field was explicitly requested by the user; OR\n",
|
||||
" 2) multiple (>1) key metrics required to answer the user's request are unavailable and their absence materially prevents a useful answer.\n",
|
||||
"- If condition (1) or (2) applies, include at most one concise \"Note:\" line naming the specific missing field(s) and nothing else.\n",
|
||||
"- Otherwise omit any comment about single, non-central missing fields.\n",
|
||||
"\n",
|
||||
"## ERROR HANDLING\n",
|
||||
"- If a company/ticker cannot be found: \"I couldn't find that company on U.S. exchanges. Please provide the ticker or a clearer company name.\"\n",
|
||||
"- If partial results are returned: present available data and at most one short note: \"Some data wasn't available: <field1>, <field2>.\"\n",
|
||||
"- If you do not know the answer, say plainly: \"I don't have that information.\"\n",
|
||||
"\n",
|
||||
"## ADVICE / DISCLAIMER (CONDITIONAL)\n",
|
||||
"- Do NOT include an investment-advice disclaimer in normal greetings or factual replies.\n",
|
||||
"- Show a single-line disclaimer only when the user explicitly asks for recommendations/advice or uses clear advice-seeking language (e.g., \"should I buy\", \"recommend\", \"what should I do\", \"financial advice\").\n",
|
||||
"- Disclaimer text when triggered: \"I can't provide investment advice or buy/sell recommendations. I can provide facts and explain metrics.\"\n",
|
||||
"\n",
|
||||
"EXAMPLES (templates — follow exactly)\n",
|
||||
"\n",
|
||||
"1) Clear factual request (no unsolicited follow-up)\n",
|
||||
"User: \"What's AAPL last trade?\"\n",
|
||||
"Assistant:\n",
|
||||
"Last trade timestamp: 2025-08-28 20:00:00 UTC (U.S. market close — 4:00 PM ET)\n",
|
||||
"\n",
|
||||
"(—end: no menu, no offers—)\n",
|
||||
"\n",
|
||||
"2) Ambiguous request — single clarifying question only\n",
|
||||
"User: \"Show me Apple\"\n",
|
||||
"Assistant:\n",
|
||||
"I found two matches. Do you mean Apple Inc. (AAPL) or Apple Hospitality REIT (APLE)?\n",
|
||||
"\n",
|
||||
"(—end: single question final line—)\n",
|
||||
"\n",
|
||||
"3) News returned as headlines only (do not offer unsupported fetches)\n",
|
||||
"User: \"Recent NVDA news\"\n",
|
||||
"Assistant:\n",
|
||||
"- 2025-08-28: Headline A — short summary.\n",
|
||||
"- 2025-08-26: Headline B — short summary.\n",
|
||||
"(—end: do not offer full-article fetch or extended-range offers unless explicitly requested and supported—)\n",
|
||||
"\"\"\"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "fdf1a2b0-07be-47a0-9ce3-14d21b48c8f2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_current_time() -> Dict[str, Any]:\n",
|
||||
" \"\"\"\n",
|
||||
" Retrieve the current UTC time in ISO format with timezone.\n",
|
||||
" Returns a dictionary for consistency with other tools.\n",
|
||||
" \"\"\"\n",
|
||||
" try:\n",
|
||||
" current_time = datetime.utcnow().isoformat() + 'Z'\n",
|
||||
" return {\n",
|
||||
" \"success\": True,\n",
|
||||
" \"current_time\": current_time\n",
|
||||
" }\n",
|
||||
" except Exception as e:\n",
|
||||
" return {\"success\": False, \"error\": f\"Failed to get time: {str(e)[:100]}\"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "12d912fc-91fb-469e-9572-2876a099f5aa",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"get_current_time_function = {\n",
|
||||
" \"name\": \"get_current_time\",\n",
|
||||
" \"description\": \"Get the current UTC time in ISO format (YYYY-MM-DDTHH:MM:SS.ssssssZ). Useful for temporal reasoning, date calculations, or setting time ranges for queries like news.\",\n",
|
||||
" \"parameters\": {\n",
|
||||
" \"type\": \"object\",\n",
|
||||
" \"properties\": {}, # No parameters needed\n",
|
||||
" \"required\": []\n",
|
||||
" }\n",
|
||||
"}\n",
|
||||
"get_current_time_tool = {\"type\": \"function\", \"function\": get_current_time_function}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "61a2a15d-b559-4844-b377-6bd5cb4949f6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def validate_symbol(symbol: str) -> bool:\n",
|
||||
" \"\"\"Validate stock symbol format\"\"\"\n",
|
||||
" if not symbol or not isinstance(symbol, str):\n",
|
||||
" return False\n",
|
||||
" return symbol.isalnum() and 1 <= len(symbol) <= 5 and symbol.isupper()\n",
|
||||
"\n",
|
||||
"def search_symbol(query: str) -> Dict[str, Any]:\n",
|
||||
" \"\"\"Search for stock symbol using Finnhub client\"\"\"\n",
|
||||
" logger.info(f\"Tool search_symbol called for {query}\")\n",
|
||||
" try:\n",
|
||||
" if not query or len(query.strip()) < 1:\n",
|
||||
" return {\"success\": False, \"error\": \"Invalid search query\"}\n",
|
||||
" \n",
|
||||
" query = query.strip()[:50]\n",
|
||||
" result = finnhub_client.symbol_lookup(query)\n",
|
||||
" logger.info(f\"Tool search_symbol {result}\")\n",
|
||||
" \n",
|
||||
" if result.get(\"result\") and len(result[\"result\"]) > 0:\n",
|
||||
" first_result = result[\"result\"][0]\n",
|
||||
" symbol = first_result.get(\"symbol\", \"\").upper()\n",
|
||||
" \n",
|
||||
" if validate_symbol(symbol):\n",
|
||||
" return {\n",
|
||||
" \"success\": True,\n",
|
||||
" \"symbol\": symbol\n",
|
||||
" }\n",
|
||||
" else:\n",
|
||||
" return {\"success\": False, \"error\": \"Invalid symbol format found\"}\n",
|
||||
" else:\n",
|
||||
" return {\"success\": False, \"error\": \"No matching US stocks found\"}\n",
|
||||
" \n",
|
||||
" except Exception as e:\n",
|
||||
" return {\"success\": False, \"error\": f\"Symbol search failed: {str(e)[:100]}\"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "173010e3-dfef-4611-8b68-d11256bd5fba",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"search_symbol_function = {\n",
|
||||
" \"name\": \"search_symbol\",\n",
|
||||
" \"description\": \"Search for a stock symbol / ticker symbol based on company name or partial name\",\n",
|
||||
" \"parameters\": {\n",
|
||||
" \"type\": \"object\",\n",
|
||||
" \"properties\": {\n",
|
||||
" \"query\": {\n",
|
||||
" \"type\": \"string\",\n",
|
||||
" \"description\": \"Company name or partial name to search for, extract only relevant name part and pass it here, keep this to less than 50 characters\"\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" \"required\": [\n",
|
||||
" \"query\"\n",
|
||||
" ]\n",
|
||||
" }\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"search_symbol_tool = {\"type\": \"function\", \"function\": search_symbol_function}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "448bb4ce-8e86-4ceb-ab52-96bddfd33337",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def _format_big_number_from_millions(value_millions: Any) -> str:\n",
|
||||
" \"\"\"\n",
|
||||
" Finnhub returns some large metrics (marketCapitalization, enterpriseValue, revenueTTM)\n",
|
||||
" in MILLIONS USD. Convert to full USD and format with M/B/T suffixes.\n",
|
||||
" \"\"\"\n",
|
||||
" if value_millions is None:\n",
|
||||
" return \"Unavailable\"\n",
|
||||
" try:\n",
|
||||
" value = float(value_millions) * 1_000_000 # convert millions -> full USD\n",
|
||||
" except (TypeError, ValueError):\n",
|
||||
" return \"Unavailable\"\n",
|
||||
"\n",
|
||||
" trillion = 1_000_000_000_000\n",
|
||||
" billion = 1_000_000_000\n",
|
||||
" million = 1_000_000\n",
|
||||
"\n",
|
||||
" if value >= trillion:\n",
|
||||
" return f\"{value / trillion:.2f}T USD\"\n",
|
||||
" if value >= billion:\n",
|
||||
" return f\"{value / billion:.2f}B USD\"\n",
|
||||
" if value >= million:\n",
|
||||
" return f\"{value / million:.2f}M USD\"\n",
|
||||
" return f\"{value:.2f} USD\"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def _safe_metric(metrics: Dict[str, Any], key: str) -> Any:\n",
|
||||
" \"\"\"\n",
|
||||
" Return metric value if present; otherwise \"Unavailable\".\n",
|
||||
" We intentionally return the raw value for numeric metrics (no rounding/format)\n",
|
||||
" except for the specially formatted big-number fields handled elsewhere.\n",
|
||||
" \"\"\"\n",
|
||||
" if metrics is None:\n",
|
||||
" return \"Unavailable\"\n",
|
||||
" val = metrics.get(key)\n",
|
||||
" return val if val is not None else \"Unavailable\"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def get_company_financials(symbol: str) -> Dict[str, Any]:\n",
|
||||
" \"\"\"\n",
|
||||
" Fetch and return a curated set of 'basic' financial metrics for `symbol`.\n",
|
||||
" - Calls finnhub_client.company_basic_financials(symbol, 'all')\n",
|
||||
" - Formats market cap, enterprise value, revenue (Finnhub returns these in millions)\n",
|
||||
" - Returns success flag and readable keys\n",
|
||||
" \"\"\"\n",
|
||||
" logger.info(f\"Tool get_company_financials called for {symbol}\")\n",
|
||||
" try:\n",
|
||||
" if not symbol or not symbol.strip():\n",
|
||||
" return {\"success\": False, \"error\": \"Invalid stock symbol\"}\n",
|
||||
"\n",
|
||||
" symbol = symbol.strip().upper()\n",
|
||||
"\n",
|
||||
" # --- API Call ---\n",
|
||||
" financials_resp = finnhub_client.company_basic_financials(symbol, \"all\")\n",
|
||||
"\n",
|
||||
" # Finnhub places primary values under \"metric\"\n",
|
||||
" metrics = financials_resp.get(\"metric\", {})\n",
|
||||
" if not metrics:\n",
|
||||
" return {\"success\": False, \"error\": \"No financial metrics found\"}\n",
|
||||
"\n",
|
||||
" # --- Build result using helpers ---\n",
|
||||
" result = {\n",
|
||||
" \"success\": True,\n",
|
||||
" \"symbol\": symbol,\n",
|
||||
" \"financials\": {\n",
|
||||
" \"Market Cap\": _format_big_number_from_millions(metrics.get(\"marketCapitalization\")),\n",
|
||||
" \"Enterprise Value\": _format_big_number_from_millions(metrics.get(\"enterpriseValue\")),\n",
|
||||
" \"P/E Ratio (TTM)\": _safe_metric(metrics, \"peBasicExclExtraTTM\"),\n",
|
||||
" \"Forward P/E\": _safe_metric(metrics, \"forwardPE\"),\n",
|
||||
" \"Gross Margin (TTM)\": _safe_metric(metrics, \"grossMarginTTM\"),\n",
|
||||
" \"Net Profit Margin (TTM)\": _safe_metric(metrics, \"netProfitMarginTTM\"),\n",
|
||||
" \"EPS (TTM)\": _safe_metric(metrics, \"epsTTM\"),\n",
|
||||
" \"EPS Growth (5Y)\": _safe_metric(metrics, \"epsGrowth5Y\"),\n",
|
||||
" \"Dividend Yield (Indicated Annual)\": _safe_metric(metrics, \"dividendYieldIndicatedAnnual\"),\n",
|
||||
" \"Current Ratio (Quarterly)\": _safe_metric(metrics, \"currentRatioQuarterly\"),\n",
|
||||
" \"Debt/Equity (Long Term, Quarterly)\": _safe_metric(metrics, \"longTermDebt/equityQuarterly\"),\n",
|
||||
" \"Beta\": _safe_metric(metrics, \"beta\"),\n",
|
||||
" \"52-Week High\": _safe_metric(metrics, \"52WeekHigh\"),\n",
|
||||
" \"52-Week Low\": _safe_metric(metrics, \"52WeekLow\"),\n",
|
||||
" }\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" return result\n",
|
||||
"\n",
|
||||
" except Exception as e:\n",
|
||||
" # keep error message short but useful for debugging\n",
|
||||
" return {\"success\": False, \"error\": f\"Failed to fetch metrics: {str(e)[:200]}\"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9df7b74e-fec8-4e75-92a9-31acc75e6e97",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"get_company_financials_function = {\n",
|
||||
" \"name\": \"get_company_financials\",\n",
|
||||
" \"description\": \"Fetch and return a curated set of basic financial metrics for a stock symbol. Calls Finnhub's company_basic_financials API, formats large numbers (market cap, enterprise value, revenue) in M/B/T USD, and shows metrics like P/E ratios, EPS, margins, dividend yield, debt/equity, beta, and 52-week range. Returns 'Unavailable' for missing values.\",\n",
|
||||
" \"parameters\": {\n",
|
||||
" \"type\": \"object\",\n",
|
||||
" \"properties\": {\n",
|
||||
" \"symbol\": {\n",
|
||||
" \"type\": \"string\",\n",
|
||||
" \"description\": \"Stock ticker symbol to fetch metrics for. Example: 'AAPL' for Apple Inc.\"\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" \"required\": [\n",
|
||||
" \"symbol\"\n",
|
||||
" ]\n",
|
||||
" }\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"get_company_financials_tool = {\"type\": \"function\", \"function\": get_company_financials_function}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cfeeb200-3f30-4855-82b9-cc8b2a950f80",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_stock_quote(symbol: str) -> dict:\n",
|
||||
" \"\"\"\n",
|
||||
" Fetch the latest stock quote for a given ticker symbol using Finnhub's /quote endpoint.\n",
|
||||
" Returns current price, daily high/low, open, previous close, percent change, and readable timestamp.\n",
|
||||
" \"\"\"\n",
|
||||
" logger.info(f\"Tool get_stock_quote called for {symbol}\")\n",
|
||||
" try:\n",
|
||||
" if not symbol or len(symbol.strip()) < 1:\n",
|
||||
" return {\"success\": False, \"error\": \"Invalid symbol provided\"}\n",
|
||||
" \n",
|
||||
" symbol = symbol.strip().upper()\n",
|
||||
" data = finnhub_client.quote(symbol)\n",
|
||||
"\n",
|
||||
" if not data or \"c\" not in data:\n",
|
||||
" return {\"success\": False, \"error\": \"No quote data found\"}\n",
|
||||
" \n",
|
||||
" # Convert epoch timestamp to ISO UTC if present\n",
|
||||
" timestamp = data.get(\"t\")\n",
|
||||
" if timestamp and isinstance(timestamp, (int, float)):\n",
|
||||
" timestamp = datetime.utcfromtimestamp(timestamp).isoformat() + \"Z\"\n",
|
||||
" else:\n",
|
||||
" timestamp = \"Unavailable\"\n",
|
||||
" \n",
|
||||
" return {\n",
|
||||
" \"success\": True,\n",
|
||||
" \"symbol\": symbol,\n",
|
||||
" \"current_price\": round(data.get(\"c\", 0), 2) if data.get(\"c\") is not None else \"Unavailable\",\n",
|
||||
" \"change\": round(data.get(\"d\", 0), 2) if data.get(\"d\") is not None else \"Unavailable\",\n",
|
||||
" \"percent_change\": f\"{round(data.get('dp', 0), 2)}%\" if data.get(\"dp\") is not None else \"Unavailable\",\n",
|
||||
" \"high_price\": round(data.get(\"h\", 0), 2) if data.get(\"h\") is not None else \"Unavailable\",\n",
|
||||
" \"low_price\": round(data.get(\"l\", 0), 2) if data.get(\"l\") is not None else \"Unavailable\",\n",
|
||||
" \"open_price\": round(data.get(\"o\", 0), 2) if data.get(\"o\") is not None else \"Unavailable\",\n",
|
||||
" \"previous_close\": round(data.get(\"pc\", 0), 2) if data.get(\"pc\") is not None else \"Unavailable\",\n",
|
||||
" \"timestamp\": timestamp\n",
|
||||
" }\n",
|
||||
" except Exception as e:\n",
|
||||
" return {\"success\": False, \"error\": f\"Quote retrieval failed: {str(e)[:100]}\"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3724d92a-4515-4267-af6f-2c1ec2b6ed36",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"get_stock_quote_function = {\n",
|
||||
" \"name\": \"get_stock_quote\",\n",
|
||||
" \"description\": \"Retrieve the latest stock quote for a given symbol, including current price, daily high/low, open, previous close, and percent change. Data is near real-time. Avoid constant polling; use websockets for streaming updates.\",\n",
|
||||
" \"parameters\": {\n",
|
||||
" \"type\": \"object\",\n",
|
||||
" \"properties\": {\n",
|
||||
" \"symbol\": {\n",
|
||||
" \"type\": \"string\",\n",
|
||||
" \"description\": \"Stock ticker symbol to fetch the latest quote for. Example: 'AAPL', 'MSFT'.\"\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" \"required\": [\"symbol\"]\n",
|
||||
" }\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"get_stock_quote_tool = {\"type\": \"function\", \"function\": get_stock_quote_function}\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "62f5d477-6626-428f-b8eb-d763e736ef5b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_company_news(symbol: str, _from: str, to: str):\n",
|
||||
" \"\"\"\n",
|
||||
" Fetch the top latest company news for a stock symbol within a date range.\n",
|
||||
"    - Ensures the range does not exceed ~1 month (35 days).\n",
|
||||
" - Best practice: Keep searches to a month or less to avoid too much data.\n",
|
||||
"\n",
|
||||
" Args:\n",
|
||||
" symbol (str): Stock ticker (e.g., \"AAPL\").\n",
|
||||
" _from (str): Start date in YYYY-MM-DD format.\n",
|
||||
" to (str): End date in YYYY-MM-DD format.\n",
|
||||
"\n",
|
||||
" Returns:\n",
|
||||
" list or dict: Cleaned news data or error message.\n",
|
||||
" \"\"\"\n",
|
||||
" # Validate date format\n",
|
||||
" logger.info(f\"Tool get_company_news called for {symbol} from {_from} to {to}\")\n",
|
||||
" try:\n",
|
||||
" start_date = datetime.strptime(_from, \"%Y-%m-%d\")\n",
|
||||
" end_date = datetime.strptime(to, \"%Y-%m-%d\")\n",
|
||||
" except ValueError:\n",
|
||||
" return {\"success\": False, \"error\": \"Invalid date format. Use YYYY-MM-DD.\"}\n",
|
||||
"\n",
|
||||
" # Check date range\n",
|
||||
" delta_days = (end_date - start_date).days\n",
|
||||
" if delta_days > 35:\n",
|
||||
" return {\n",
|
||||
" \"success\": False, \n",
|
||||
" \"error\": f\"Date range too large ({delta_days} days). \"\n",
|
||||
"                     \"Please use a range of 1 month or less.\"\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" # Fetch data\n",
|
||||
" try:\n",
|
||||
" news = finnhub_client.company_news(symbol, _from=_from, to=to)\n",
|
||||
" except Exception as e:\n",
|
||||
" return {\"success\": False, \"error\": str(e)}\n",
|
||||
"\n",
|
||||
" # Do not want to report just the latest news in the time period\n",
|
||||
" if len(news) <= 10:\n",
|
||||
" # If 10 or fewer articles, take all\n",
|
||||
" selected_news = news\n",
|
||||
" else:\n",
|
||||
" # Take first 5 (oldest) and last 5 (newest)\n",
|
||||
" selected_news = news[:5] + news[-5:]\n",
|
||||
"\n",
|
||||
" # Clean & transform objects\n",
|
||||
" cleaned_news = []\n",
|
||||
" for article in selected_news:\n",
|
||||
" cleaned_news.append({\n",
|
||||
" \"summary\": article.get(\"summary\"),\n",
|
||||
" \"source\": article.get(\"source\"),\n",
|
||||
" \"published_at\": datetime.utcfromtimestamp(article[\"datetime\"]).strftime(\"%Y-%m-%d %H:%M:%S UTC\"),\n",
|
||||
" \"related\": article.get(\"related\")\n",
|
||||
" })\n",
|
||||
"\n",
|
||||
" return {\"success\": True, \"news\": cleaned_news}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5150ecb6-e3f1-46dc-94fa-2a9abe5165f6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"get_company_news_function = {\n",
|
||||
" \"name\": \"get_company_news\",\n",
|
||||
" \"description\": \"Fetch the top most recent company news articles for a given stock symbol. ⚠️ Avoid querying more than a 1-month range at a time as it may return too much data. Only tells news about company within last 1 year. An error is returned if the requested time range exceeds 1 month.\",\n",
|
||||
" \"parameters\": {\n",
|
||||
" \"type\": \"object\",\n",
|
||||
" \"properties\": {\n",
|
||||
" \"symbol\": {\n",
|
||||
" \"type\": \"string\",\n",
|
||||
" \"description\": \"Stock ticker symbol, e.g., 'AAPL'.\"\n",
|
||||
" },\n",
|
||||
" \"_from\": {\n",
|
||||
" \"type\": \"string\",\n",
|
||||
" \"description\": \"Start date in YYYY-MM-DD format. Ensure it is not more than 1 year ago from today. Ensure it is before or equal to the date in to.\"\n",
|
||||
" },\n",
|
||||
" \"to\": {\n",
|
||||
" \"type\": \"string\",\n",
|
||||
" \"description\": \"End date in YYYY-MM-DD format. Ensure it is not more than 1 year ago. Ensure it is after or equal to the date in from.\"\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" \"required\": [\n",
|
||||
" \"symbol\",\n",
|
||||
" \"_from\",\n",
|
||||
" \"to\"\n",
|
||||
" ]\n",
|
||||
" }\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"get_company_news_tool = {\"type\": \"function\", \"function\": get_company_news_function}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "26dd7375-626f-4235-b4a2-f1926f62cc5e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_market_news(category: str = \"general\"):\n",
|
||||
" \"\"\"\n",
|
||||
" Fetch the latest market news for a given category.\n",
|
||||
"\n",
|
||||
" Args:\n",
|
||||
" category (str): News category. One of [\"general\", \"forex\", \"crypto\", \"merger\"].\n",
|
||||
"\n",
|
||||
" Returns:\n",
|
||||
" list or dict: A cleaned list of news articles or error message.\n",
|
||||
" \"\"\"\n",
|
||||
" logger.info(f\"Tool get_market_news called for category '{category}'\")\n",
|
||||
"\n",
|
||||
" try:\n",
|
||||
" news = finnhub_client.general_news(category)\n",
|
||||
" except Exception as e:\n",
|
||||
" logger.error(f\"Tool get_market_news API call failed!\")\n",
|
||||
" return {\"success\": False, \"error\": str(e)}\n",
|
||||
"\n",
|
||||
" # Do not want to report just the latest news in the time period\n",
|
||||
" if len(news) <= 10:\n",
|
||||
" # If 10 or fewer articles, take all\n",
|
||||
" selected_news = news\n",
|
||||
" else:\n",
|
||||
" # Take first 5 (oldest) and last 5 (newest)\n",
|
||||
" selected_news = news[:5] + news[-5:]\n",
|
||||
"\n",
|
||||
" # Clean & transform objects\n",
|
||||
" cleaned_news = []\n",
|
||||
" for article in selected_news:\n",
|
||||
" cleaned_news.append({\n",
|
||||
" \"headline\": article.get(\"headline\"),\n",
|
||||
" \"summary\": article.get(\"summary\"),\n",
|
||||
" \"source\": article.get(\"source\"),\n",
|
||||
" \"category\": article.get(\"category\"),\n",
|
||||
" \"related\": article.get(\"related\")\n",
|
||||
" })\n",
|
||||
"\n",
|
||||
" return {\"success\": True, \"news\": cleaned_news}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5bd1aa28-119c-4c7a-bdc0-161a582ab1cc",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"get_market_news_function = {\n",
|
||||
" \"name\": \"get_market_news\",\n",
|
||||
"    \"description\": \"Fetch the latest market news by category. Returns up to 10 news articles with headline, summary, source, category, and related symbols. Categories: general, forex, crypto, merger. Use this to quickly get relevant financial news.\",\n",
|
||||
" \"parameters\": {\n",
|
||||
" \"type\": \"object\",\n",
|
||||
" \"properties\": {\n",
|
||||
" \"category\": {\n",
|
||||
" \"type\": \"string\",\n",
|
||||
" \"description\": \"News category to fetch. One of: general, forex, crypto, merger.\"\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" \"required\": [\"category\"]\n",
|
||||
" }\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"get_market_news_tool = {\"type\": \"function\", \"function\": get_market_news_function}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "fbe8ef6c-2d88-43a2-94dc-70b507fe9cd2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_earnings_calendar(symbol: str = \"\", _from: str = \"\", to: str = \"\"):\n",
|
||||
" \"\"\"\n",
|
||||
" Fetch LATEST earnings calendar data for a stock symbol within a date range.\n",
|
||||
" - End date must be within the last month. (Free tier only allows last 1 month data)\n",
|
||||
" - Shows historical and upcoming earnings releases with EPS and revenue data.\n",
|
||||
" Args:\n",
|
||||
" symbol (str): Stock ticker (e.g., \"AAPL\"). Leave empty for all companies.\n",
|
||||
" _from (str): Start date in YYYY-MM-DD format.\n",
|
||||
" to (str): End date in YYYY-MM-DD format.\n",
|
||||
" Returns:\n",
|
||||
" list or dict: Cleaned earnings calendar data or error message.\n",
|
||||
" \"\"\"\n",
|
||||
" logger.info(f\"Tool get_earnings_calendar called for {symbol or 'all symbols'} from {_from} to {to}\")\n",
|
||||
" \n",
|
||||
" # Validate date format if provided\n",
|
||||
" if _from or to:\n",
|
||||
" try:\n",
|
||||
" start_date = datetime.strptime(_from, \"%Y-%m-%d\") if _from else None\n",
|
||||
" end_date = datetime.strptime(to, \"%Y-%m-%d\") if to else None\n",
|
||||
" \n",
|
||||
" # Check date range if both dates provided\n",
|
||||
" # Check if end_date is within 1 month (≈30 days) of today\n",
|
||||
" if end_date:\n",
|
||||
" today = datetime.utcnow()\n",
|
||||
" if (today - end_date).days > 30:\n",
|
||||
" return {\n",
|
||||
" \"success\": False,\n",
|
||||
" \"error\": \"End date must be within the last month.\"\n",
|
||||
" }\n",
|
||||
" except ValueError:\n",
|
||||
" return {\"success\": False, \"error\": \"Invalid date format. Use YYYY-MM-DD.\"}\n",
|
||||
" \n",
|
||||
" # Fetch earnings calendar data\n",
|
||||
" try:\n",
|
||||
" earnings_data = finnhub_client.earnings_calendar(_from=_from, to=to, symbol=symbol, international=False)\n",
|
||||
" except Exception as e:\n",
|
||||
" logger.error(f\"Error fetching earnings calendar: {e}\")\n",
|
||||
" return {\"success\": False, \"error\": str(e)}\n",
|
||||
" \n",
|
||||
" # Check if data exists\n",
|
||||
" if not earnings_data or \"earningsCalendar\" not in earnings_data:\n",
|
||||
" return {\"success\": False, \"error\": \"No earnings data available for the specified criteria.\"}\n",
|
||||
" \n",
|
||||
" earnings_list = earnings_data[\"earningsCalendar\"]\n",
|
||||
" \n",
|
||||
" if not earnings_list:\n",
|
||||
" return {\"success\": True, \"earnings\": [], \"message\": \"No earnings releases found for the specified period.\"}\n",
|
||||
" \n",
|
||||
" # Clean & transform earnings data\n",
|
||||
" cleaned_earnings = []\n",
|
||||
" for earning in earnings_list:\n",
|
||||
" # Format hour description\n",
|
||||
" hour_map = {\n",
|
||||
" \"bmo\": \"Before Market Open\",\n",
|
||||
" \"amc\": \"After Market Close\", \n",
|
||||
" \"dmh\": \"During Market Hours\"\n",
|
||||
" }\n",
|
||||
" \n",
|
||||
" cleaned_earnings.append({\n",
|
||||
" \"symbol\": earning.get(\"symbol\"),\n",
|
||||
" \"date\": earning.get(\"date\"),\n",
|
||||
" \"quarter\": f\"Q{earning.get('quarter')} {earning.get('year')}\",\n",
|
||||
" \"announcement_time\": hour_map.get(earning.get(\"hour\", \"\"), earning.get(\"hour\", \"Unknown\")),\n",
|
||||
" \"eps_actual\": earning.get(\"epsActual\"),\n",
|
||||
" \"eps_estimate\": earning.get(\"epsEstimate\"),\n",
|
||||
" \"revenue_actual\": earning.get(\"revenueActual\"),\n",
|
||||
" \"revenue_estimate\": earning.get(\"revenueEstimate\")\n",
|
||||
" })\n",
|
||||
" \n",
|
||||
" return {\"success\": True, \"earnings\": cleaned_earnings}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9eaeae75-d68f-4160-a26e-c13e40cf756b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"get_earnings_calendar_function = {\n",
|
||||
" \"name\": \"get_earnings_calendar\",\n",
|
||||
" \"description\": \"Fetch latest earnings calendar showing historical and upcoming earnings releases for companies. Shows EPS and revenue estimates vs actuals. End date must be within the last month.\",\n",
|
||||
" \"parameters\": {\n",
|
||||
" \"type\": \"object\",\n",
|
||||
" \"properties\": {\n",
|
||||
" \"symbol\": {\n",
|
||||
" \"type\": \"string\",\n",
|
||||
" \"description\": \"Stock ticker symbol, e.g., 'AAPL'. Leave empty to get earnings for all companies in the date range.\"\n",
|
||||
" },\n",
|
||||
" \"_from\": {\n",
|
||||
" \"type\": \"string\", \n",
|
||||
"      \"description\": \"Start date in YYYY-MM-DD format. Ensure it is not more than 1 year before today, and that it is on or before the end date given in '_to'.\"\n",
|
||||
" },\n",
|
||||
" \"to\": {\n",
|
||||
" \"type\": \"string\",\n",
|
||||
"      \"description\": \"End date in YYYY-MM-DD format. Must be within the last month (free-tier limit) and on or after the start date given in '_from'.\"\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" \"required\": [\n",
|
||||
" \"_from\",\n",
|
||||
" \"to\"\n",
|
||||
" ]\n",
|
||||
" }\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"get_earnings_calendar_tool = {\"type\": \"function\", \"function\": get_earnings_calendar_function}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "bdca8679-935f-4e7f-97e6-e71a4d4f228c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# List of tools:\n",
|
||||
"tools = [search_symbol_tool, get_company_financials_tool, get_stock_quote_tool, get_company_news_tool, get_market_news_tool, get_current_time_tool, get_earnings_calendar_tool]\n",
|
||||
"tool_functions = {\n",
|
||||
" \"search_symbol\": search_symbol,\n",
|
||||
" \"get_company_financials\": get_company_financials,\n",
|
||||
" \"get_stock_quote\": get_stock_quote,\n",
|
||||
" \"get_company_news\": get_company_news,\n",
|
||||
" \"get_market_news\": get_market_news,\n",
|
||||
" \"get_earnings_calendar\": get_earnings_calendar,\n",
|
||||
" \"get_current_time\": get_current_time\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c3d3554f-b4e3-4ce7-af6f-68faa6dd2340",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Getting OpenAI to use our Tool\n",
|
||||
"\n",
|
||||
"There's some fiddly stuff to allow OpenAI \"to call our tool\"\n",
|
||||
"\n",
|
||||
"What we actually do is give the LLM the opportunity to inform us that it wants us to run the tool.\n",
|
||||
"\n",
|
||||
"Here's how the new chat function looks:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "86f76f57-76c4-4dc7-94a8-cfe7816a39f1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def execute_tool_call(tool_call):\n",
|
||||
" func_name = tool_call.function.name\n",
|
||||
" args = json.loads(tool_call.function.arguments)\n",
|
||||
"\n",
|
||||
" logger.info(f\"Executing tool: {func_name} with args: {args}\")\n",
|
||||
"\n",
|
||||
" func = tool_functions.get(func_name)\n",
|
||||
" if not func:\n",
|
||||
" result = {\"error\": f\"Function '{func_name}' not found\"}\n",
|
||||
" else:\n",
|
||||
" try:\n",
|
||||
" result = func(**args)\n",
|
||||
" except Exception as e:\n",
|
||||
" logger.exception(f\"Error executing {func_name}\")\n",
|
||||
" result = {\"error\": str(e)}\n",
|
||||
"\n",
|
||||
" return {\n",
|
||||
" \"role\": \"tool\",\n",
|
||||
" \"tool_call_id\": tool_call.id,\n",
|
||||
" \"content\": json.dumps(result)\n",
|
||||
" }"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ce9b0744-9c78-408d-b9df-9f6fd9ed78cf",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def chat(message, history):\n",
|
||||
" messages = [{\"role\": \"system\", \"content\": system_message}] + history + [{\"role\": \"user\", \"content\": message}]\n",
|
||||
"\n",
|
||||
" # Skip the first system message\n",
|
||||
" to_log = messages[1:]\n",
|
||||
"\n",
|
||||
" # Print each dict on its own line\n",
|
||||
" logger.info(\"\\nMessages:\\n\" + \"\\n\".join(str(m) for m in to_log) + \"\\n\")\n",
|
||||
"\n",
|
||||
" while True:\n",
|
||||
" response = openai.chat.completions.create(\n",
|
||||
" model=MODEL, \n",
|
||||
" messages=messages, \n",
|
||||
" tools=tools,\n",
|
||||
" stream=True\n",
|
||||
" )\n",
|
||||
" \n",
|
||||
" content = \"\"\n",
|
||||
" tool_calls = []\n",
|
||||
" finish_reason = None\n",
|
||||
" \n",
|
||||
" # Process the stream\n",
|
||||
" for chunk in response:\n",
|
||||
" choice = chunk.choices[0]\n",
|
||||
" finish_reason = choice.finish_reason\n",
|
||||
" \n",
|
||||
" # Stream content\n",
|
||||
" if choice.delta.content:\n",
|
||||
" content += choice.delta.content\n",
|
||||
" yield content\n",
|
||||
" \n",
|
||||
" # Collect tool calls\n",
|
||||
" if choice.delta.tool_calls:\n",
|
||||
" for tc_delta in choice.delta.tool_calls:\n",
|
||||
" # Extend tool_calls list if needed\n",
|
||||
" while len(tool_calls) <= tc_delta.index:\n",
|
||||
" tool_calls.append({\n",
|
||||
" \"id\": \"\",\n",
|
||||
" \"function\": {\"name\": \"\", \"arguments\": \"\"}\n",
|
||||
" })\n",
|
||||
" \n",
|
||||
" tc = tool_calls[tc_delta.index]\n",
|
||||
" if tc_delta.id:\n",
|
||||
" tc[\"id\"] = tc_delta.id\n",
|
||||
" if tc_delta.function:\n",
|
||||
" if tc_delta.function.name:\n",
|
||||
" tc[\"function\"][\"name\"] = tc_delta.function.name\n",
|
||||
" if tc_delta.function.arguments:\n",
|
||||
" tc[\"function\"][\"arguments\"] += tc_delta.function.arguments\n",
|
||||
" \n",
|
||||
" # If no tool calls, we're done\n",
|
||||
" if finish_reason != \"tool_calls\":\n",
|
||||
" return content\n",
|
||||
" \n",
|
||||
" # Execute tools\n",
|
||||
" ai_message = {\n",
|
||||
" \"role\": \"assistant\", \n",
|
||||
" \"content\": content,\n",
|
||||
" \"tool_calls\": [\n",
|
||||
" {\n",
|
||||
" \"id\": tc[\"id\"],\n",
|
||||
" \"type\": \"function\",\n",
|
||||
" \"function\": tc[\"function\"]\n",
|
||||
" } for tc in tool_calls\n",
|
||||
" ]\n",
|
||||
" }\n",
|
||||
" \n",
|
||||
" tool_responses = []\n",
|
||||
" for tool_call in ai_message[\"tool_calls\"]:\n",
|
||||
" # Convert dict back to object for your existing function\n",
|
||||
" class ToolCall:\n",
|
||||
" def __init__(self, tc_dict):\n",
|
||||
" self.id = tc_dict[\"id\"]\n",
|
||||
" self.function = type('obj', (object,), tc_dict[\"function\"])\n",
|
||||
" \n",
|
||||
" tool_responses.append(execute_tool_call(ToolCall(tool_call)))\n",
|
||||
" \n",
|
||||
" messages.append(ai_message)\n",
|
||||
" messages.extend(tool_responses)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f4be8a71-b19e-4c2f-80df-f59ff2661f14",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"gr.ChatInterface(fn=chat, type=\"messages\", title=\"TickerBot\", description=\"Ask about stock prices, company financials and market news!\").launch(share=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5c014d6f-820d-4d58-8527-7d703aad3399",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "40c77d61-3e90-4708-b360-fb58b4211e9b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
222
week4/community-contributions/unit-test-generator-v3.ipynb
Normal file
222
week4/community-contributions/unit-test-generator-v3.ipynb
Normal file
@@ -0,0 +1,222 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "56957b7f-e289-4999-8a40-ce1a8378d8cd",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Unit Test Generator\n",
|
||||
"\n",
|
||||
"The requirement: use a Frontier model to generate fast and repeatable unit tests for Python code.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3ef67ef0-1bda-45bb-abca-f003217602d4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# imports\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"import io\n",
|
||||
"import sys\n",
|
||||
"import ast\n",
|
||||
"import unittest, contextlib\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"from openai import OpenAI\n",
|
||||
"import google.generativeai\n",
|
||||
"import anthropic\n",
|
||||
"from IPython.display import Markdown, display, update_display\n",
|
||||
"import gradio as gr\n",
|
||||
"import subprocess\n",
|
||||
"\n",
|
||||
"# environment\n",
|
||||
"\n",
|
||||
"load_dotenv(override=True)\n",
|
||||
"os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')\n",
|
||||
"os.environ['ANTHROPIC_API_KEY'] = os.getenv('ANTHROPIC_API_KEY', 'your-key-if-not-using-env')\n",
|
||||
"\n",
|
||||
"openai = OpenAI()\n",
|
||||
"claude = anthropic.Anthropic()\n",
|
||||
"OPENAI_MODEL = \"gpt-4o\"\n",
|
||||
"CLAUDE_MODEL = \"claude-3-7-sonnet-20250219\"\n",
|
||||
"\n",
|
||||
"system_message = \"You are an assistant that implements unit testing for Python code. \"\n",
|
||||
"system_message += \"Respond only with Python code; use comments sparingly and do not provide any explanation other than occasional comments. \"\n",
|
||||
"\n",
|
||||
"def remove_main_block_from_code(code):\n",
|
||||
" \"\"\"\n",
|
||||
" Remove top-level `if __name__ == \"__main__\":` blocks from code.\n",
|
||||
" \"\"\"\n",
|
||||
" try:\n",
|
||||
" tree = ast.parse(code)\n",
|
||||
"\n",
|
||||
" class RemoveMain(ast.NodeTransformer):\n",
|
||||
" def visit_If(self, node):\n",
|
||||
" # check if this is: if __name__ == \"__main__\":\n",
|
||||
" test = node.test\n",
|
||||
" if (\n",
|
||||
" isinstance(test, ast.Compare) and\n",
|
||||
" isinstance(test.left, ast.Name) and\n",
|
||||
" test.left.id == \"__name__\" and\n",
|
||||
" len(test.ops) == 1 and isinstance(test.ops[0], ast.Eq) and\n",
|
||||
" len(test.comparators) == 1 and\n",
|
||||
" isinstance(test.comparators[0], ast.Constant) and\n",
|
||||
" test.comparators[0].value == \"__main__\"\n",
|
||||
" ):\n",
|
||||
" return None # remove this node entirely\n",
|
||||
" return node\n",
|
||||
"\n",
|
||||
" tree = RemoveMain().visit(tree)\n",
|
||||
" ast.fix_missing_locations(tree)\n",
|
||||
" return ast.unparse(tree) # get back code as string\n",
|
||||
" except Exception as e:\n",
|
||||
" print(\"Error removing __main__ block:\", e)\n",
|
||||
" return code # fallback: return original code if AST fails\n",
|
||||
"\n",
|
||||
"def user_prompt_for(python_file):\n",
|
||||
" if isinstance(python_file, dict): # from Gradio\n",
|
||||
" file_path = python_file[\"name\"]\n",
|
||||
" elif hasattr(python_file, \"name\"): # tempfile\n",
|
||||
" file_path = python_file.name\n",
|
||||
" else: # string path\n",
|
||||
" file_path = python_file\n",
|
||||
"\n",
|
||||
" with open(file_path, \"r\", encoding=\"utf-8\") as f:\n",
|
||||
" python_code = f.read()\n",
|
||||
"\n",
|
||||
" # strip __main__ blocks\n",
|
||||
" python_code = remove_main_block_from_code(python_code)\n",
|
||||
"\n",
|
||||
" user_prompt = \"Write unit tests for this Python code. \"\n",
|
||||
" user_prompt += \"Respond only with Python code; do not explain your work other than a few comments. \"\n",
|
||||
" user_prompt += \"The unit testing is done in Jupyterlab, so you should use packages that play nicely with the Jupyter kernel. \\n\\n\"\n",
|
||||
" user_prompt += \"Include the original Python code in your generated output so that I can run all in one fell swoop.\\n\\n\"\n",
|
||||
" user_prompt += python_code\n",
|
||||
"\n",
|
||||
" return user_prompt\n",
|
||||
"\n",
|
||||
"def messages_for(python_file):\n",
|
||||
" return [\n",
|
||||
" {\"role\": \"system\", \"content\": system_message},\n",
|
||||
" {\"role\": \"user\", \"content\": user_prompt_for(python_file)}\n",
|
||||
" ]\n",
|
||||
"\t\n",
|
||||
"def stream_gpt(python_file): \n",
|
||||
" stream = openai.chat.completions.create(model=OPENAI_MODEL, messages=messages_for(python_file), stream=True)\n",
|
||||
" reply = \"\"\n",
|
||||
" for chunk in stream:\n",
|
||||
" fragment = chunk.choices[0].delta.content or \"\"\n",
|
||||
" reply += fragment\n",
|
||||
" yield reply.replace('```python\\n','').replace('```','')\n",
|
||||
"\t\t\n",
|
||||
"def stream_claude(python_file):\n",
|
||||
" result = claude.messages.stream(\n",
|
||||
" model=CLAUDE_MODEL,\n",
|
||||
" max_tokens=2000,\n",
|
||||
" system=system_message,\n",
|
||||
" messages=[{\"role\": \"user\", \"content\": user_prompt_for(python_file)}],\n",
|
||||
" )\n",
|
||||
" reply = \"\"\n",
|
||||
" with result as stream:\n",
|
||||
" for text in stream.text_stream:\n",
|
||||
" reply += text\n",
|
||||
" yield reply.replace('```python\\n','').replace('```','')\n",
|
||||
"\t\t\t\n",
|
||||
"def unit_test(python_file, model):\n",
|
||||
" if model==\"GPT\":\n",
|
||||
" result = stream_gpt(python_file)\n",
|
||||
" elif model==\"Claude\":\n",
|
||||
" result = stream_claude(python_file)\n",
|
||||
" else:\n",
|
||||
" raise ValueError(\"Unknown model\")\n",
|
||||
" for stream_so_far in result:\n",
|
||||
" yield stream_so_far\n",
|
||||
"\n",
|
||||
"def execute_python(code):\n",
|
||||
" buffer = io.StringIO()\n",
|
||||
" try:\n",
|
||||
" with contextlib.redirect_stdout(buffer), contextlib.redirect_stderr(buffer):\n",
|
||||
" # execute code in isolated namespace\n",
|
||||
" ns = {}\n",
|
||||
" exec(code, ns)\n",
|
||||
"\n",
|
||||
" # manually collect TestCase subclasses\n",
|
||||
" test_cases = [\n",
|
||||
" obj for obj in ns.values()\n",
|
||||
" if isinstance(obj, type) and issubclass(obj, unittest.TestCase)\n",
|
||||
" ]\n",
|
||||
" if test_cases:\n",
|
||||
" suite = unittest.TestSuite()\n",
|
||||
" for case in test_cases:\n",
|
||||
" suite.addTests(unittest.defaultTestLoader.loadTestsFromTestCase(case))\n",
|
||||
" runner = unittest.TextTestRunner(stream=buffer, verbosity=2)\n",
|
||||
" runner.run(suite)\n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\"Error during execution: {e}\", file=buffer)\n",
|
||||
"\n",
|
||||
" return buffer.getvalue()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "670b8b78-0b13-488a-9533-59802b2fe101",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# --- Gradio UI ---\n",
|
||||
"with gr.Blocks() as ui:\n",
|
||||
" gr.Markdown(\"## Unit Test Generator\\nUpload a Python file and get structured unit testing.\")\n",
|
||||
" with gr.Row(): # Row 1\n",
|
||||
" orig_code = gr.File(label=\"Upload your Python file\", file_types=[\".py\"])\n",
|
||||
" test_code = gr.Textbox(label=\"Unit test code:\", lines=10)\n",
|
||||
" with gr.Row(): # Row 2\n",
|
||||
" model = gr.Dropdown([\"GPT\", \"Claude\"], label=\"Select model\", value=\"GPT\")\n",
|
||||
" with gr.Row(): # Row 3\n",
|
||||
" generate = gr.Button(\"Generate unit test code\")\n",
|
||||
" with gr.Row(): # Row 4\n",
|
||||
" unit_run = gr.Button(\"Run Python unit test\")\n",
|
||||
" with gr.Row(): # Row 5\n",
|
||||
" test_out = gr.Textbox(label=\"Unit test result:\", lines=10)\n",
|
||||
"\n",
|
||||
" generate.click(unit_test, inputs=[orig_code, model], outputs=[test_code])\n",
|
||||
"\n",
|
||||
" unit_run.click(execute_python, inputs=[test_code], outputs=[test_out])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "609bbdc3-1e1c-4538-91dd-7d13134ab381",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ui.launch(inbrowser=True)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,409 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "97a93fee-6bbd-477b-aba8-577d318a9f9d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# AI-Powered Academic Knowledge Assistant\n",
|
||||
"AI-powered RAG (Retrieval-Augmented Generation) system that transforms document collections into queryable knowledge bases using OpenAI embeddings and vector search. Features configurable chunking, file size limits, and retrieval parameters with a Gradio interface for processing PDFs and generating contextually-aware responses via LangChain and ChromaDB."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3589eee0-ce34-42f4-b538-b43f3b0d9f6f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import glob\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"import gradio as gr\n",
|
||||
"import shutil\n",
|
||||
"import tiktoken\n",
|
||||
"import time\n",
|
||||
"import uuid\n",
|
||||
"from typing import List, Tuple, Optional\n",
|
||||
"\n",
|
||||
"# imports for langchain and Chroma\n",
|
||||
"from langchain.document_loaders import DirectoryLoader, TextLoader, PyPDFLoader\n",
|
||||
"from langchain.text_splitter import CharacterTextSplitter\n",
|
||||
"from langchain.schema import Document\n",
|
||||
"from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n",
|
||||
"from langchain_chroma import Chroma\n",
|
||||
"from langchain.memory import ConversationBufferMemory\n",
|
||||
"from langchain.chains import ConversationalRetrievalChain\n",
|
||||
"from langchain.embeddings import HuggingFaceEmbeddings\n",
|
||||
"\n",
|
||||
"from langchain_community.document_loaders import PyPDFLoader, TextLoader\n",
|
||||
"from langchain.docstore.document import Document\n",
|
||||
"\n",
|
||||
"# Load environment variables\n",
|
||||
"load_dotenv(override=True)\n",
|
||||
"os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')\n",
|
||||
"\n",
|
||||
"# Global variables to store the current setup\n",
|
||||
"current_vectorstore = None\n",
|
||||
"current_conversation_chain = None\n",
|
||||
"processing_status = \"\"\n",
|
||||
"\n",
|
||||
"def count_tokens(text: str, model: str = \"gpt-4o-mini\") -> int:\n",
|
||||
" \"\"\"Count tokens in text using tiktoken\"\"\"\n",
|
||||
" try:\n",
|
||||
" encoding = tiktoken.encoding_for_model(model)\n",
|
||||
" return len(encoding.encode(text))\n",
|
||||
" except:\n",
|
||||
" # Fallback estimation: roughly 4 characters per token\n",
|
||||
" return len(text) // 4\n",
|
||||
"\n",
|
||||
"def filter_chunks_by_tokens(chunks: List[Document], max_total_tokens: int = 250000) -> List[Document]:\n",
|
||||
" \"\"\"Filter chunks to stay within token limits\"\"\"\n",
|
||||
" filtered_chunks = []\n",
|
||||
" total_tokens = 0\n",
|
||||
" \n",
|
||||
" for chunk in chunks:\n",
|
||||
" chunk_tokens = count_tokens(chunk.page_content)\n",
|
||||
" \n",
|
||||
" # Skip individual chunks that are too large (shouldn't happen with proper splitting)\n",
|
||||
" if chunk_tokens > 8000: # Individual chunk limit\n",
|
||||
" continue\n",
|
||||
" \n",
|
||||
" if total_tokens + chunk_tokens <= max_total_tokens:\n",
|
||||
" filtered_chunks.append(chunk)\n",
|
||||
" total_tokens += chunk_tokens\n",
|
||||
" else:\n",
|
||||
" break\n",
|
||||
" \n",
|
||||
" return filtered_chunks\n",
|
||||
"\n",
|
||||
"def add_metadata(doc, doc_type, file_path):\n",
|
||||
" \"\"\"Add metadata including document type and file information\"\"\"\n",
|
||||
" doc.metadata[\"doc_type\"] = doc_type\n",
|
||||
" doc.metadata[\"file_path\"] = file_path\n",
|
||||
" doc.metadata[\"file_name\"] = os.path.basename(file_path)\n",
|
||||
" return doc\n",
|
||||
"\n",
|
||||
"def check_file_size(file_path, max_size_bytes):\n",
|
||||
" \"\"\"Check if file size is within the limit\"\"\"\n",
|
||||
" try:\n",
|
||||
" file_size = os.path.getsize(file_path)\n",
|
||||
" return file_size <= max_size_bytes, file_size\n",
|
||||
" except OSError:\n",
|
||||
" return False, 0\n",
|
||||
"\n",
|
||||
"def load_pdfs_with_size_limit(folder_path, doc_type, max_size_bytes):\n",
|
||||
" \"\"\"Load PDF files from a folder with size restrictions\"\"\"\n",
|
||||
" pdf_files = glob.glob(os.path.join(folder_path, \"**/*.pdf\"), recursive=True)\n",
|
||||
" loaded_docs = []\n",
|
||||
" skipped_files = []\n",
|
||||
" \n",
|
||||
" for pdf_file in pdf_files:\n",
|
||||
" is_valid_size, file_size = check_file_size(pdf_file, max_size_bytes)\n",
|
||||
" \n",
|
||||
" if is_valid_size:\n",
|
||||
" try:\n",
|
||||
" loader = PyPDFLoader(pdf_file)\n",
|
||||
" docs = loader.load()\n",
|
||||
" docs_with_metadata = [add_metadata(doc, doc_type, pdf_file) for doc in docs]\n",
|
||||
" loaded_docs.extend(docs_with_metadata)\n",
|
||||
" except Exception as e:\n",
|
||||
" skipped_files.append((pdf_file, f\"Loading error: {str(e)}\"))\n",
|
||||
" else:\n",
|
||||
" file_size_mb = file_size / 1024 / 1024\n",
|
||||
" skipped_files.append((pdf_file, f\"File too large: {file_size_mb:.2f} MB\"))\n",
|
||||
" \n",
|
||||
" return loaded_docs, skipped_files\n",
|
||||
"\n",
|
||||
"def process_documents(knowledge_base_dir: str, max_file_size_mb: float, chunk_size: int, chunk_overlap: int) -> Tuple[str, str]:\n",
|
||||
" \"\"\"Process documents and create vector store\"\"\"\n",
|
||||
" global current_vectorstore, current_conversation_chain\n",
|
||||
" \n",
|
||||
" try:\n",
|
||||
" # Validate directory\n",
|
||||
" if not knowledge_base_dir or not knowledge_base_dir.strip():\n",
|
||||
" return \"❌ Error: Please enter a directory path!\", \"\"\n",
|
||||
" \n",
|
||||
" directory_path = knowledge_base_dir.strip()\n",
|
||||
" \n",
|
||||
" if not os.path.exists(directory_path):\n",
|
||||
" return \"❌ Error: Directory does not exist! Please check the path.\", \"\"\n",
|
||||
" \n",
|
||||
" # Configuration\n",
|
||||
" MAX_FILE_SIZE_BYTES = int(max_file_size_mb * 1024 * 1024)\n",
|
||||
" \n",
|
||||
" # Find folders\n",
|
||||
" if directory_path.endswith('*'):\n",
|
||||
" folders = glob.glob(directory_path)\n",
|
||||
" else:\n",
|
||||
" folders = glob.glob(os.path.join(directory_path, \"*\"))\n",
|
||||
" \n",
|
||||
" if not folders:\n",
|
||||
" return \"❌ Error: No folders found in the specified directory!\", \"\"\n",
|
||||
" \n",
|
||||
" # Process documents\n",
|
||||
" documents = []\n",
|
||||
" all_skipped_files = []\n",
|
||||
" status_lines = []\n",
|
||||
" \n",
|
||||
" status_lines.append(f\"🔍 Processing folders with {max_file_size_mb} MB file size limit...\")\n",
|
||||
" status_lines.append(\"-\" * 60)\n",
|
||||
" \n",
|
||||
" for folder in folders:\n",
|
||||
" if os.path.isdir(folder):\n",
|
||||
" doc_type = os.path.basename(folder)\n",
|
||||
" status_lines.append(f\"📁 Processing folder: {doc_type}\")\n",
|
||||
" \n",
|
||||
" folder_docs, skipped_files = load_pdfs_with_size_limit(folder, doc_type, MAX_FILE_SIZE_BYTES)\n",
|
||||
" documents.extend(folder_docs)\n",
|
||||
" all_skipped_files.extend(skipped_files)\n",
|
||||
" \n",
|
||||
" if folder_docs:\n",
|
||||
" status_lines.append(f\" ✅ Loaded {len(folder_docs)} document pages\")\n",
|
||||
" if skipped_files:\n",
|
||||
" status_lines.append(f\" ⚠️ Skipped {len(skipped_files)} files\")\n",
|
||||
" \n",
|
||||
" if not documents:\n",
|
||||
" error_msg = \"❌ No PDF documents were loaded successfully.\"\n",
|
||||
" if all_skipped_files:\n",
|
||||
" error_msg += f\"\\n\\nAll {len(all_skipped_files)} files were skipped:\"\n",
|
||||
" for file_path, reason in all_skipped_files[:10]: # Show first 10\n",
|
||||
" error_msg += f\"\\n • {os.path.basename(file_path)}: {reason}\"\n",
|
||||
" if len(all_skipped_files) > 10:\n",
|
||||
" error_msg += f\"\\n ... and {len(all_skipped_files) - 10} more\"\n",
|
||||
" return error_msg, \"\"\n",
|
||||
" \n",
|
||||
" # Text splitting\n",
|
||||
" status_lines.append(\"\\n\" + \"=\"*40)\n",
|
||||
" status_lines.append(\"✂️ TEXT SPLITTING\")\n",
|
||||
" status_lines.append(\"=\"*40)\n",
|
||||
" \n",
|
||||
" text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n",
|
||||
" chunks = text_splitter.split_documents(documents)\n",
|
||||
" \n",
|
||||
" # Filter chunks by token count to prevent API errors\n",
|
||||
" status_lines.append(\"🔢 Checking token limits...\")\n",
|
||||
" original_chunk_count = len(chunks)\n",
|
||||
" chunks = filter_chunks_by_tokens(chunks, max_total_tokens=250000)\n",
|
||||
" \n",
|
||||
" if len(chunks) < original_chunk_count:\n",
|
||||
" status_lines.append(f\"⚠️ Filtered from {original_chunk_count} to {len(chunks)} chunks to stay within token limits\")\n",
|
||||
" \n",
|
||||
" # Create vectorstore\n",
|
||||
" status_lines.append(\"🧮 Creating vector embeddings...\")\n",
|
||||
" embeddings = OpenAIEmbeddings()\n",
|
||||
" \n",
|
||||
" # Use a temporary database name\n",
|
||||
" db_name = \"temp_vector_db\"\n",
|
||||
" \n",
|
||||
" # Delete if already exists\n",
|
||||
" if os.path.exists(db_name):\n",
|
||||
" shutil.rmtree(db_name)\n",
|
||||
" \n",
|
||||
" # Create vectorstore\n",
|
||||
" vectorstore = Chroma.from_documents(\n",
|
||||
" documents=chunks, \n",
|
||||
" embedding=embeddings, \n",
|
||||
" persist_directory=db_name\n",
|
||||
" )\n",
|
||||
" \n",
|
||||
" # Update global variables\n",
|
||||
" current_vectorstore = vectorstore\n",
|
||||
" \n",
|
||||
" # Create conversation chain\n",
|
||||
" llm = ChatOpenAI(temperature=0.7, model_name=\"gpt-4o-mini\")\n",
|
||||
" memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)\n",
|
||||
" retriever = vectorstore.as_retriever(search_kwargs={\"k\": 25})\n",
|
||||
" current_conversation_chain = ConversationalRetrievalChain.from_llm(\n",
|
||||
" llm=llm, \n",
|
||||
" retriever=retriever, \n",
|
||||
" memory=memory\n",
|
||||
" )\n",
|
||||
" \n",
|
||||
" # Summary statistics\n",
|
||||
" status_lines.append(\"\\n\" + \"=\"*40)\n",
|
||||
" status_lines.append(\"📊 SUMMARY\")\n",
|
||||
" status_lines.append(\"=\"*40)\n",
|
||||
" status_lines.append(f\"✅ Total PDFs processed: {len(set(doc.metadata['file_path'] for doc in documents))}\")\n",
|
||||
" status_lines.append(f\"📄 Total document pages: {len(documents)}\")\n",
|
||||
" status_lines.append(f\"🧩 Total text chunks: {len(chunks)}\")\n",
|
||||
" status_lines.append(f\"📁 Document types: {', '.join(set(doc.metadata['doc_type'] for doc in documents))}\")\n",
|
||||
" status_lines.append(f\"🗃️ Vector store size: {vectorstore._collection.count()} embeddings\")\n",
|
||||
" \n",
|
||||
" if all_skipped_files:\n",
|
||||
" status_lines.append(f\"\\n⚠️ Skipped files: {len(all_skipped_files)}\")\n",
|
||||
" for file_path, reason in all_skipped_files[:5]: # Show first 5\n",
|
||||
" status_lines.append(f\" • {os.path.basename(file_path)}: {reason}\")\n",
|
||||
" if len(all_skipped_files) > 5:\n",
|
||||
" status_lines.append(f\" ... and {len(all_skipped_files) - 5} more\")\n",
|
||||
" \n",
|
||||
" success_msg = \"✅ Knowledge base successfully created and ready for questions!\"\n",
|
||||
" detailed_status = \"\\n\".join(status_lines)\n",
|
||||
" \n",
|
||||
" return success_msg, detailed_status\n",
|
||||
" \n",
|
||||
" except Exception as e:\n",
|
||||
" error_msg = f\"❌ Error processing documents: {str(e)}\"\n",
|
||||
" return error_msg, \"\"\n",
|
||||
"\n",
|
||||
"def chat_with_documents(message, history, num_chunks):\n",
|
||||
" \"\"\"Chat with the processed documents\"\"\"\n",
|
||||
" global current_conversation_chain, current_vectorstore\n",
|
||||
" \n",
|
||||
" if current_conversation_chain is None:\n",
|
||||
" return \"❌ Please process documents first before asking questions!\"\n",
|
||||
" \n",
|
||||
" try:\n",
|
||||
" # Update retriever with new chunk count\n",
|
||||
" if current_vectorstore is not None:\n",
|
||||
" retriever = current_vectorstore.as_retriever(search_kwargs={\"k\": num_chunks})\n",
|
||||
" current_conversation_chain.retriever = retriever\n",
|
||||
" \n",
|
||||
" result = current_conversation_chain.invoke({\"question\": message})\n",
|
||||
" return result[\"answer\"]\n",
|
||||
" \n",
|
||||
" except Exception as e:\n",
|
||||
" return f\"❌ Error generating response: {str(e)}\"\n",
|
||||
"\n",
|
||||
"def reset_conversation():\n",
|
||||
" \"\"\"Reset the conversation memory\"\"\"\n",
|
||||
" global current_conversation_chain\n",
|
||||
" if current_conversation_chain is not None:\n",
|
||||
" current_conversation_chain.memory.clear()\n",
|
||||
" return \"✅ Conversation history cleared!\"\n",
|
||||
" return \"No active conversation to reset.\"\n",
|
||||
"\n",
|
||||
"# Create Gradio Interface\n",
"# Two tabs: one to configure/ingest documents, one to chat over them.\n",
"with gr.Blocks(title=\"AI-Powered Academic Knowledge Assistant\", theme=gr.themes.Soft()) as app:\n",
"    gr.Markdown(\"# 🎓 AI-Powered Academic Knowledge Assistant\")\n",
"    gr.Markdown(\"Transform your entire document library into an intelligent, searchable AI tutor that answers questions instantly.\")\n",
"\n",
"    with gr.Tabs():\n",
"        # Configuration Tab\n",
"        with gr.Tab(\"⚙️ Configuration\"):\n",
"            gr.Markdown(\"### 📁 Document Processing Settings\")\n",
"\n",
"            gr.Markdown(\"💡 **Tip:** Copy and paste your folder path here. On mobile, you can use file manager apps to copy folder paths.\")\n",
"\n",
"            with gr.Row():\n",
"                with gr.Column():\n",
"                    knowledge_dir = gr.Textbox(\n",
"                        label=\"Knowledge Base Directory\",\n",
"                        value=r\"C:\\Users\\Documents\\Syllabi\\Georgia Tech\\Spring 22\\Microwave Design\",\n",
"                        placeholder=\"Enter or paste your document directory path\",\n",
"                        lines=1\n",
"                    )\n",
"\n",
"                    max_file_size = gr.Slider(\n",
"                        label=\"Max File Size (MB)\",\n",
"                        minimum=0.5,\n",
"                        maximum=50,\n",
"                        value=4,\n",
"                        step=0.5\n",
"                    )\n",
"\n",
"                with gr.Column():\n",
"                    chunk_size = gr.Slider(\n",
"                        label=\"Chunk Size (characters)\",\n",
"                        minimum=200,\n",
"                        maximum=1500,\n",
"                        value=800,\n",
"                        step=100,\n",
"                        info=\"Smaller chunks = better token management\"\n",
"                    )\n",
"\n",
"                    chunk_overlap = gr.Slider(\n",
"                        label=\"Chunk Overlap (characters)\",\n",
"                        minimum=0,\n",
"                        maximum=300,\n",
"                        value=150,\n",
"                        step=25,\n",
"                        info=\"Overlap preserves context between chunks\"\n",
"                    )\n",
"\n",
"            process_btn = gr.Button(\"🚀 Process Documents\", variant=\"primary\", size=\"lg\")\n",
"\n",
"            with gr.Row():\n",
"                status_output = gr.Textbox(\n",
"                    label=\"Status\",\n",
"                    lines=2,\n",
"                    max_lines=2\n",
"                )\n",
"\n",
"            detailed_output = gr.Textbox(\n",
"                label=\"Detailed Processing Log\",\n",
"                lines=15,\n",
"                max_lines=20\n",
"            )\n",
"\n",
"        # Chat Tab\n",
"        with gr.Tab(\"💬 Chat\"):\n",
"            gr.Markdown(\"### 🤖 Ask Questions About Your Documents\")\n",
"\n",
"            with gr.Row():\n",
"                with gr.Column(scale=1):\n",
"                    num_chunks = gr.Slider(\n",
"                        label=\"Number of chunks to retrieve\",\n",
"                        minimum=1,\n",
"                        maximum=50,\n",
"                        value=25,\n",
"                        step=1\n",
"                    )\n",
"\n",
"                    reset_btn = gr.Button(\"🗑️ Clear Chat History\", variant=\"secondary\")\n",
"                    reset_output = gr.Textbox(label=\"Reset Status\", lines=1)\n",
"\n",
"                with gr.Column(scale=3):\n",
"                    # BUGFIX: pass the slider via additional_inputs so its *live*\n",
"                    # value reaches chat_with_documents(message, history, num_chunks)\n",
"                    # on every message. The previous lambda read num_chunks.value,\n",
"                    # which is only the initial value fixed at UI-build time, so\n",
"                    # moving the slider never affected retrieval.\n",
"                    chatbot = gr.ChatInterface(\n",
"                        fn=chat_with_documents,\n",
"                        additional_inputs=[num_chunks],\n",
"                        type=\"messages\",\n",
"                        title=\"Academic Assistant Chat\",\n",
"                        description=\"Ask questions about your processed documents\"\n",
"                    )\n",
"\n",
"    # Event handlers\n",
"    process_btn.click(\n",
"        fn=process_documents,\n",
"        inputs=[knowledge_dir, max_file_size, chunk_size, chunk_overlap],\n",
"        outputs=[status_output, detailed_output]\n",
"    )\n",
"\n",
"    reset_btn.click(\n",
"        fn=reset_conversation,\n",
"        outputs=reset_output\n",
"    )\n"
|
||||
]
|
||||
},
|
||||
{
   "cell_type": "code",
   "execution_count": null,
   "id": "9eb807e0-194b-48dd-a1e9-b1b9b8a99620",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Start the app: share=True publishes a temporary public link,\n",
    "# inbrowser=True opens the UI in a local browser tab.\n",
    "app.launch(inbrowser=True, share=True)"
   ]
  }
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Reference in New Issue
Block a user