diff --git a/week3/community-contributions/dkisselev-zz/Week3_Excercise_Synthetic_Dataset_PGx.ipynb b/week3/community-contributions/dkisselev-zz/Week3_Excercise_Synthetic_Dataset_PGx.ipynb index c6ba32a..64d976b 100644 --- a/week3/community-contributions/dkisselev-zz/Week3_Excercise_Synthetic_Dataset_PGx.ipynb +++ b/week3/community-contributions/dkisselev-zz/Week3_Excercise_Synthetic_Dataset_PGx.ipynb @@ -61,9 +61,9 @@ "\n", "# LLM APIs\n", "from openai import OpenAI\n", - "import anthropic\n", - "import google.generativeai as genai\n", - "from deepseek import DeepSeek\n", + "# import anthropic\n", + "# import google.generativeai as genai\n", + "# from deepseek import DeepSeek\n", "\n", "# HuggingFace\n", "from huggingface_hub import login\n", @@ -72,7 +72,7 @@ "# Data processing\n", "import nltk\n", "from nltk.corpus import wordnet\n", - "import pyarrow as pa\n", + "# import pyarrow as pa\n", "\n", "# UI\n", "import gradio as gr\n", @@ -105,6 +105,10 @@ " 'anthropic': userdata.get('ANTHROPIC_API_KEY'),\n", " 'google': userdata.get('GOOGLE_API_KEY'),\n", " 'deepseek': userdata.get('DEEPSEEK_API_KEY'),\n", + " # 'groq': userdata.get('GROQ_API_KEY'),\n", + " 'grok': userdata.get('GROK_API_KEY'),\n", + " # 'openrouter': userdata.get('OPENROUTER_API_KEY'),\n", + " # 'ollama': userdata.get('OLLAMA_API_KEY'),\n", " 'hf_token': userdata.get('HF_TOKEN')\n", " }\n", " print(\"✅ Using Colab secrets\")\n", @@ -117,27 +121,44 @@ " 'anthropic': os.getenv('ANTHROPIC_API_KEY'),\n", " 'google': os.getenv('GOOGLE_API_KEY'),\n", " 'deepseek': os.getenv('DEEPSEEK_API_KEY'),\n", + " # 'groq': os.getenv('GROQ_API_KEY'),\n", + " 'grok': os.getenv('GROK_API_KEY'),\n", + " # 'openrouter': os.getenv('OPENROUTER_API_KEY'),\n", + " # 'ollama': os.getenv('OLLAMA_API_KEY'),\n", " 'hf_token': os.getenv('HF_TOKEN')\n", " }\n", " print(\"✅ Using local .env file\")\n", " \n", " # Initialize API clients\n", + " anthropic_url = \"https://api.anthropic.com/v1/\"\n", + " gemini_url = \"https://generativelanguage.googleapis.com/v1beta/openai/\"\n", + " deepseek_url = \"https://api.deepseek.com\"\n", + " # groq_url = \"https://api.groq.com/openai/v1\"\n", + " grok_url = \"https://api.x.ai/v1\"\n", + " # openrouter_url = \"https://openrouter.ai/api/v1\"\n", + " # ollama_url = \"http://localhost:11434/v1\"\n", + "\n", " clients = {}\n", " if api_keys['openai']:\n", " clients['openai'] = OpenAI(api_key=api_keys['openai'])\n", " if api_keys['anthropic']:\n", - " clients['anthropic'] = anthropic.Anthropic(api_key=api_keys['anthropic'])\n", + " clients['anthropic'] = OpenAI(api_key=api_keys['anthropic'], base_url=anthropic_url)\n", + " # clients['anthropic'] = anthropic.Anthropic(api_key=api_keys['anthropic'])\n", " if api_keys['google']:\n", - " genai.configure(api_key=api_keys['google'])\n", + " # genai.configure(api_key=api_keys['google'])\n", + " clients['gemini'] = OpenAI(api_key=api_keys['google'], base_url=gemini_url)\n", " if api_keys['deepseek']:\n", - " clients['deepseek'] = DeepSeek(api_key=api_keys['deepseek'])\n", + " clients['deepseek'] = OpenAI(api_key=api_keys['deepseek'], base_url=deepseek_url)\n", + " # clients['deepseek'] = DeepSeek(api_key=api_keys['deepseek'])\n", + " if api_keys['grok']:\n", + " clients['grok'] = OpenAI(api_key=api_keys['grok'], base_url=grok_url)\n", " if api_keys['hf_token']:\n", " login(api_keys['hf_token'], add_to_git_credential=True)\n", " \n", " return api_keys, clients\n", "\n", "# Initialize API keys and clients\n", - "api_keys, clients = setup_api_keys()\n" + "api_keys, clients = setup_api_keys()" ] }, { @@ -152,43 +173,43 @@ "HUGGINGFACE_MODELS = {\n", " \"Llama 3.1 8B\": {\n", " \"model_id\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", - " \"description\": \"Versatile 8B model, excellent for structured data generation\",\n", + " \"description\": \"8B model - that is good for structured data generation\",\n", " \"size\": \"8B\",\n", " \"type\": \"huggingface\"\n", " },\n", " \"Llama 3.2 3B\": {\n", " \"model_id\": \"meta-llama/Llama-3.2-3B-Instruct\", \n", - " \"description\": \"Smaller, faster model, good for simple schemas\",\n", + " \"description\": \"3B model - smaller and faster model that is good for simple schemas\",\n", " \"size\": \"3B\",\n", " \"type\": \"huggingface\"\n", " },\n", " \"Phi-3.5 Mini\": {\n", " \"model_id\": \"microsoft/Phi-3.5-mini-instruct\",\n", - " \"description\": \"Efficient 3.8B model with strong reasoning capabilities\",\n", + " \"description\": \"3.8B model - with reasoning capabilities\",\n", " \"size\": \"3.8B\", \n", " \"type\": \"huggingface\"\n", " },\n", " \"Gemma 2 9B\": {\n", " \"model_id\": \"google/gemma-2-9b-it\",\n", - " \"description\": \"Google's 9B instruction-tuned model\",\n", + " \"description\": \"9B model - instruction-tuned model\",\n", " \"size\": \"9B\",\n", " \"type\": \"huggingface\"\n", " },\n", " \"Qwen 2.5 7B\": {\n", " \"model_id\": \"Qwen/Qwen2.5-7B-Instruct\",\n", - " \"description\": \"Strong multilingual support, good for diverse data\",\n", + " \"description\": \"7B model - multilingual that is good for diverse data\",\n", " \"size\": \"7B\",\n", " \"type\": \"huggingface\"\n", " },\n", " \"Mistral 7B\": {\n", " \"model_id\": \"mistralai/Mistral-7B-Instruct-v0.3\",\n", - " \"description\": \"Fast inference with reliable outputs\",\n", + " \"description\": \"7B model - fast inference\",\n", " \"size\": \"7B\",\n", " \"type\": \"huggingface\"\n", " },\n", " \"Zephyr 7B\": {\n", " \"model_id\": \"HuggingFaceH4/zephyr-7b-beta\",\n", - " \"description\": \"Fine-tuned for helpfulness and instruction following\",\n", + " \"description\": \"7B model - fine-tuned for instruction following\",\n", " \"size\": \"7B\",\n", " \"type\": \"huggingface\"\n", " }\n", @@ -196,29 +217,35 @@ "\n", "# Commercial Models (Additional Options)\n", "COMMERCIAL_MODELS = {\n", - " \"GPT-4o Mini\": {\n", - " \"model_id\": \"gpt-4o-mini\",\n", + " \"GPT-5 Mini\": {\n", + " \"model_id\": \"gpt-5-mini\",\n", " \"description\": \"Fast, cost-effective OpenAI model\",\n", " \"provider\": \"openai\",\n", " \"type\": \"commercial\"\n", " },\n", - " \"Claude 3 Haiku\": {\n", - " \"model_id\": \"claude-3-haiku-20240307\",\n", - " \"description\": \"Good balance of speed and quality\",\n", + " \"Claude 4.5 Haiku\": {\n", + " \"model_id\": \"claude-4.5-haiku-20251001\",\n", + " \"description\": \"Balance of speed and quality\",\n", " \"provider\": \"anthropic\", \n", " \"type\": \"commercial\"\n", " },\n", - " \"Gemini 2.0 Flash\": {\n", - " \"model_id\": \"gemini-2.0-flash-exp\",\n", - " \"description\": \"Fast, multimodal capable Google model\",\n", + " \"Gemini 2.5 Flash\": {\n", + " \"model_id\": \"gemini-2.5-flash-lite\",\n", + " \"description\": \"Fast Google model\",\n", " \"provider\": \"google\",\n", " \"type\": \"commercial\"\n", " },\n", " \"DeepSeek Chat\": {\n", " \"model_id\": \"deepseek-chat\",\n", - " \"description\": \"Cost-effective alternative with good performance\",\n", + " \"description\": \"Cost-effective with good performance\",\n", " \"provider\": \"deepseek\",\n", " \"type\": \"commercial\"\n", + " },\n", + " \"Grok 4\": {\n", + " \"model_id\": \"grok-4\",\n", + " \"description\": \"Grok 4\",\n", + " \"provider\": \"grok\",\n", + " \"type\": \"commercial\"\n", " }\n", "}\n", "\n", @@ -370,48 +397,15 @@ " model_id = model_info[\"model_id\"]\n", " \n", " try:\n", - " if provider == \"openai\" and \"openai\" in clients:\n", - " response = clients[\"openai\"].chat.completions.create(\n", - " model=model_id,\n", - " messages=[\n", - " {\"role\": \"system\", \"content\": system_prompt},\n", - " {\"role\": \"user\", \"content\": user_prompt}\n", - " ],\n", - " temperature=temperature\n", - " )\n", - " return response.choices[0].message.content\n", - " \n", - " elif provider == \"anthropic\" and \"anthropic\" in clients:\n", - " response = clients[\"anthropic\"].messages.create(\n", - " model=model_id,\n", - " messages=[{\"role\": \"user\", \"content\": user_prompt}],\n", - " system=system_prompt,\n", - " temperature=temperature,\n", - " max_tokens=2000\n", - " )\n", - " return response.content[0].text\n", - " \n", - " elif provider == \"google\" and api_keys[\"google\"]:\n", - " model = genai.GenerativeModel(model_id)\n", - " response = model.generate_content(\n", - " f\"{system_prompt}\\n\\n{user_prompt}\",\n", - " generation_config=genai.types.GenerationConfig(temperature=temperature)\n", - " )\n", - " return response.text\n", - " \n", - " elif provider == \"deepseek\" and \"deepseek\" in clients:\n", - " response = clients[\"deepseek\"].chat.completions.create(\n", - " model=model_id,\n", - " messages=[\n", - " {\"role\": \"system\", \"content\": system_prompt},\n", - " {\"role\": \"user\", \"content\": user_prompt}\n", - " ],\n", - " temperature=temperature\n", - " )\n", - " return response.choices[0].message.content\n", - " \n", - " else:\n", - " return f\"API client not available for {provider}\"\n", + " response = clients[provider].chat.completions.create(\n", + " model=model_id,\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": system_prompt},\n", + " {\"role\": \"user\", \"content\": user_prompt}\n", + " ],\n", + " temperature=temperature\n", + " )\n", + " return response.choices[0].message.content\n", " \n", " except Exception as e:\n", " return f\"Error querying {model_name}: {str(e)}\"\n", @@ -580,49 +574,16 @@ " model_id = model_info[\"model_id\"]\n", " \n", " try:\n", - " if provider == \"openai\" and \"openai\" in clients:\n", - " response = clients[\"openai\"].chat.completions.create(\n", - " model=model_id,\n", - " messages=[\n", - " {\"role\": \"system\", \"content\": \"You are a helpful assistant that generates realistic datasets.\"},\n", - " {\"role\": \"user\", \"content\": prompt}\n", - " ],\n", - " temperature=temperature\n", - " )\n", - " return response.choices[0].message.content\n", + " response = clients[provider].chat.completions.create(\n", + " model=model_id,\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": \"You are a helpful assistant that generates realistic datasets.\"},\n", + " {\"role\": \"user\", \"content\": prompt}\n", + " ],\n", + " temperature=temperature\n", + " )\n", + " return response.choices[0].message.content\n", " \n", - " elif provider == \"anthropic\" and \"anthropic\" in clients:\n", - " response = clients[\"anthropic\"].messages.create(\n", - " model=model_id,\n", - " messages=[{\"role\": \"user\", \"content\": prompt}],\n", - " system=\"You are a helpful assistant that generates realistic datasets.\",\n", - " temperature=temperature,\n", - " max_tokens=4000\n", - " )\n", - " return response.content[0].text\n", - " \n", - " elif provider == \"google\" and api_keys[\"google\"]:\n", - " model = genai.GenerativeModel(model_id)\n", - " response = model.generate_content(\n", - " prompt,\n", - " generation_config=genai.types.GenerationConfig(temperature=temperature)\n", - " )\n", - " return response.text\n", - " \n", - " elif provider == \"deepseek\" and \"deepseek\" in clients:\n", - " response = clients[\"deepseek\"].chat.completions.create(\n", - " model=model_id,\n", - " messages=[\n", - " {\"role\": \"system\", \"content\": \"You are a helpful assistant that generates realistic datasets.\"},\n", - " {\"role\": \"user\", \"content\": prompt}\n", - " ],\n", - " temperature=temperature\n", - " )\n", - " return response.choices[0].message.content\n", - " \n", - " else:\n", - " raise Exception(f\"API client not available for {provider}\")\n", - " \n", " except Exception as e:\n", " raise Exception(f\"Commercial API error: {str(e)}\")\n", " \n", @@ -1671,7 +1632,7 @@ " print(\"🔄 Testing OpenAI schema generation...\")\n", " result = schema_manager.generate_schema_with_llm(\n", " \"Generate a dataset for e-commerce customer analysis\",\n", - " \"GPT-4o Mini\",\n", + " \"GPT-5 Mini\",\n", " 0.7\n", " )\n", " print(f\"✅ OpenAI schema generation: {len(result)} characters\")\n", @@ -1845,8 +1806,22 @@ } ], "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" } }, "nbformat": 4,