updates
@@ -61,9 +61,9 @@
 "\n",
 "# LLM APIs\n",
 "from openai import OpenAI\n",
-"import anthropic\n",
-"import google.generativeai as genai\n",
-"from deepseek import DeepSeek\n",
+"# import anthropic\n",
+"# import google.generativeai as genai\n",
+"# from deepseek import DeepSeek\n",
 "\n",
 "# HuggingFace\n",
 "from huggingface_hub import login\n",
@@ -72,7 +72,7 @@
 "# Data processing\n",
 "import nltk\n",
 "from nltk.corpus import wordnet\n",
-"import pyarrow as pa\n",
+"# import pyarrow as pa\n",
 "\n",
 "# UI\n",
 "import gradio as gr\n",
@@ -105,6 +105,10 @@
 " 'anthropic': userdata.get('ANTHROPIC_API_KEY'),\n",
 " 'google': userdata.get('GOOGLE_API_KEY'),\n",
 " 'deepseek': userdata.get('DEEPSEEK_API_KEY'),\n",
+" # 'groq': userdata.get('GROQ_API_KEY'),\n",
+" 'grok': userdata.get('GROK_API_KEY'),\n",
+" # 'openrouter': userdata.get('OPENROUTER_API_KEY'),\n",
+" # 'ollama': userdata.get('OLLAMA_API_KEY'),\n",
 " 'hf_token': userdata.get('HF_TOKEN')\n",
 " }\n",
 " print(\"✅ Using Colab secrets\")\n",
@@ -117,27 +121,44 @@
 " 'anthropic': os.getenv('ANTHROPIC_API_KEY'),\n",
 " 'google': os.getenv('GOOGLE_API_KEY'),\n",
 " 'deepseek': os.getenv('DEEPSEEK_API_KEY'),\n",
+" # 'groq': os.getenv('GROQ_API_KEY'),\n",
+" 'grok': os.getenv('GROK_API_KEY'),\n",
+" # 'openrouter': os.getenv('OPENROUTER_API_KEY'),\n",
+" # 'ollama': os.getenv('OLLAMA_API_KEY'),\n",
 " 'hf_token': os.getenv('HF_TOKEN')\n",
 " }\n",
 " print(\"✅ Using local .env file\")\n",
 " \n",
 " # Initialize API clients\n",
+" anthropic_url = \"https://api.anthropic.com/v1/\"\n",
+" gemini_url = \"https://generativelanguage.googleapis.com/v1beta/openai/\"\n",
+" deepseek_url = \"https://api.deepseek.com\"\n",
+" # groq_url = \"https://api.groq.com/openai/v1\"\n",
+" grok_url = \"https://api.x.ai/v1\"\n",
+" # openrouter_url = \"https://openrouter.ai/api/v1\"\n",
+" # ollama_url = \"http://localhost:11434/v1\"\n",
+"\n",
 " clients = {}\n",
 " if api_keys['openai']:\n",
 " clients['openai'] = OpenAI(api_key=api_keys['openai'])\n",
 " if api_keys['anthropic']:\n",
-" clients['anthropic'] = anthropic.Anthropic(api_key=api_keys['anthropic'])\n",
+" clients['anthropic'] = OpenAI(api_key=api_keys['anthropic'], base_url=anthropic_url)\n",
+" # clients['anthropic'] = anthropic.Anthropic(api_key=api_keys['anthropic'])\n",
 " if api_keys['google']:\n",
-" genai.configure(api_key=api_keys['google'])\n",
+" # genai.configure(api_key=api_keys['google'])\n",
+" clients['gemini'] = OpenAI(api_key=api_keys['google'], base_url=gemini_url)\n",
 " if api_keys['deepseek']:\n",
-" clients['deepseek'] = DeepSeek(api_key=api_keys['deepseek'])\n",
+" clients['deepseek'] = OpenAI(api_key=api_keys['deepseek'], base_url=deepseek_url)\n",
+" # clients['deepseek'] = DeepSeek(api_key=api_keys['deepseek'])\n",
+" if api_keys['grok']:\n",
+" clients['grok'] = OpenAI(api_key=api_keys['grok'], base_url=grok_url)\n",
 " if api_keys['hf_token']:\n",
 " login(api_keys['hf_token'], add_to_git_credential=True)\n",
 " \n",
 " return api_keys, clients\n",
 "\n",
 "# Initialize API keys and clients\n",
-"api_keys, clients = setup_api_keys()\n"
+"api_keys, clients = setup_api_keys()"
 ]
 },
 {
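The change above drops each provider's native SDK in favor of the OpenAI client pointed at that provider's OpenAI-compatible endpoint, so every commercial model is reached through one interface. A minimal sketch of the pattern, assuming a `DEEPSEEK_API_KEY` environment variable (any of the base URLs from the hunk works the same way):

```python
import os
from openai import OpenAI

# One client class for every provider; only the key and base_url differ.
client = OpenAI(
    api_key=os.getenv("DEEPSEEK_API_KEY"),
    base_url="https://api.deepseek.com",  # e.g. https://api.x.ai/v1 for Grok
)

response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Say hello in one sentence."},
    ],
    temperature=0.7,
)
print(response.choices[0].message.content)
```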
@@ -152,43 +173,43 @@
 "HUGGINGFACE_MODELS = {\n",
 " \"Llama 3.1 8B\": {\n",
 " \"model_id\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
-" \"description\": \"Versatile 8B model, excellent for structured data generation\",\n",
+" \"description\": \"8B model - good for structured data generation\",\n",
 " \"size\": \"8B\",\n",
 " \"type\": \"huggingface\"\n",
 " },\n",
 " \"Llama 3.2 3B\": {\n",
 " \"model_id\": \"meta-llama/Llama-3.2-3B-Instruct\", \n",
-" \"description\": \"Smaller, faster model, good for simple schemas\",\n",
+" \"description\": \"3B model - smaller and faster, good for simple schemas\",\n",
 " \"size\": \"3B\",\n",
 " \"type\": \"huggingface\"\n",
 " },\n",
 " \"Phi-3.5 Mini\": {\n",
 " \"model_id\": \"microsoft/Phi-3.5-mini-instruct\",\n",
-" \"description\": \"Efficient 3.8B model with strong reasoning capabilities\",\n",
+" \"description\": \"3.8B model - strong reasoning capabilities\",\n",
 " \"size\": \"3.8B\", \n",
 " \"type\": \"huggingface\"\n",
 " },\n",
 " \"Gemma 2 9B\": {\n",
 " \"model_id\": \"google/gemma-2-9b-it\",\n",
-" \"description\": \"Google's 9B instruction-tuned model\",\n",
+" \"description\": \"9B model - instruction-tuned\",\n",
 " \"size\": \"9B\",\n",
 " \"type\": \"huggingface\"\n",
 " },\n",
 " \"Qwen 2.5 7B\": {\n",
 " \"model_id\": \"Qwen/Qwen2.5-7B-Instruct\",\n",
-" \"description\": \"Strong multilingual support, good for diverse data\",\n",
+" \"description\": \"7B model - multilingual, good for diverse data\",\n",
 " \"size\": \"7B\",\n",
 " \"type\": \"huggingface\"\n",
 " },\n",
 " \"Mistral 7B\": {\n",
 " \"model_id\": \"mistralai/Mistral-7B-Instruct-v0.3\",\n",
-" \"description\": \"Fast inference with reliable outputs\",\n",
+" \"description\": \"7B model - fast inference\",\n",
 " \"size\": \"7B\",\n",
 " \"type\": \"huggingface\"\n",
 " },\n",
 " \"Zephyr 7B\": {\n",
 " \"model_id\": \"HuggingFaceH4/zephyr-7b-beta\",\n",
-" \"description\": \"Fine-tuned for helpfulness and instruction following\",\n",
+" \"description\": \"7B model - fine-tuned for instruction following\",\n",
 " \"size\": \"7B\",\n",
 " \"type\": \"huggingface\"\n",
 " }\n",
@@ -196,29 +217,35 @@
 "\n",
 "# Commercial Models (Additional Options)\n",
 "COMMERCIAL_MODELS = {\n",
-" \"GPT-4o Mini\": {\n",
-" \"model_id\": \"gpt-4o-mini\",\n",
+" \"GPT-5 Mini\": {\n",
+" \"model_id\": \"gpt-5-mini\",\n",
 " \"description\": \"Fast, cost-effective OpenAI model\",\n",
 " \"provider\": \"openai\",\n",
 " \"type\": \"commercial\"\n",
 " },\n",
-" \"Claude 3 Haiku\": {\n",
-" \"model_id\": \"claude-3-haiku-20240307\",\n",
-" \"description\": \"Good balance of speed and quality\",\n",
+" \"Claude 4.5 Haiku\": {\n",
+" \"model_id\": \"claude-haiku-4-5-20251001\",\n",
+" \"description\": \"Balance of speed and quality\",\n",
 " \"provider\": \"anthropic\", \n",
 " \"type\": \"commercial\"\n",
 " },\n",
-" \"Gemini 2.0 Flash\": {\n",
-" \"model_id\": \"gemini-2.0-flash-exp\",\n",
-" \"description\": \"Fast, multimodal capable Google model\",\n",
+" \"Gemini 2.5 Flash Lite\": {\n",
+" \"model_id\": \"gemini-2.5-flash-lite\",\n",
+" \"description\": \"Fast Google model\",\n",
 " \"provider\": \"google\",\n",
 " \"type\": \"commercial\"\n",
 " },\n",
 " \"DeepSeek Chat\": {\n",
 " \"model_id\": \"deepseek-chat\",\n",
-" \"description\": \"Cost-effective alternative with good performance\",\n",
+" \"description\": \"Cost-effective with good performance\",\n",
 " \"provider\": \"deepseek\",\n",
 " \"type\": \"commercial\"\n",
 " },\n",
+" \"Grok 4\": {\n",
+" \"model_id\": \"grok-4\",\n",
+" \"description\": \"xAI's flagship model\",\n",
+" \"provider\": \"grok\",\n",
+" \"type\": \"commercial\"\n",
+" }\n",
 "}\n",
 "\n",
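Both registries share the same entry shape, so downstream cells can treat them as one catalog and branch on the `type` field. A hypothetical illustration (the merged dict and `describe` helper are assumptions for this sketch, not code from the commit):

```python
# Assumes HUGGINGFACE_MODELS and COMMERCIAL_MODELS from the cells above.
ALL_MODELS = {**HUGGINGFACE_MODELS, **COMMERCIAL_MODELS}

def describe(name: str) -> str:
    """Summarize a registry entry for display in a model-picker UI."""
    info = ALL_MODELS[name]
    backend = "local HF pipeline" if info["type"] == "huggingface" else info["provider"] + " API"
    return f"{name}: {info['description']} [{backend}]"

print(describe("GPT-5 Mini"))    # GPT-5 Mini: Fast, cost-effective OpenAI model [openai API]
print(describe("Llama 3.1 8B"))  # Llama 3.1 8B: 8B model - good for structured data generation [local HF pipeline]
```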
@@ -370,48 +397,15 @@
 " model_id = model_info[\"model_id\"]\n",
 " \n",
 " try:\n",
-" if provider == \"openai\" and \"openai\" in clients:\n",
-" response = clients[\"openai\"].chat.completions.create(\n",
-" model=model_id,\n",
-" messages=[\n",
-" {\"role\": \"system\", \"content\": system_prompt},\n",
-" {\"role\": \"user\", \"content\": user_prompt}\n",
-" ],\n",
-" temperature=temperature\n",
-" )\n",
-" return response.choices[0].message.content\n",
+" response = clients[provider].chat.completions.create(\n",
+" model=model_id,\n",
+" messages=[\n",
+" {\"role\": \"system\", \"content\": system_prompt},\n",
+" {\"role\": \"user\", \"content\": user_prompt}\n",
+" ],\n",
+" temperature=temperature\n",
+" )\n",
+" return response.choices[0].message.content\n",
-" \n",
-" elif provider == \"anthropic\" and \"anthropic\" in clients:\n",
-" response = clients[\"anthropic\"].messages.create(\n",
-" model=model_id,\n",
-" messages=[{\"role\": \"user\", \"content\": user_prompt}],\n",
-" system=system_prompt,\n",
-" temperature=temperature,\n",
-" max_tokens=2000\n",
-" )\n",
-" return response.content[0].text\n",
-" \n",
-" elif provider == \"google\" and api_keys[\"google\"]:\n",
-" model = genai.GenerativeModel(model_id)\n",
-" response = model.generate_content(\n",
-" f\"{system_prompt}\\n\\n{user_prompt}\",\n",
-" generation_config=genai.types.GenerationConfig(temperature=temperature)\n",
-" )\n",
-" return response.text\n",
-" \n",
-" elif provider == \"deepseek\" and \"deepseek\" in clients:\n",
-" response = clients[\"deepseek\"].chat.completions.create(\n",
-" model=model_id,\n",
-" messages=[\n",
-" {\"role\": \"system\", \"content\": system_prompt},\n",
-" {\"role\": \"user\", \"content\": user_prompt}\n",
-" ],\n",
-" temperature=temperature\n",
-" )\n",
-" return response.choices[0].message.content\n",
-" \n",
-" else:\n",
-" return f\"API client not available for {provider}\"\n",
 " \n",
 " except Exception as e:\n",
 " return f\"Error querying {model_name}: {str(e)}\"\n",
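With every client now constructed through the OpenAI SDK, the four per-provider branches collapse into a single code path. Note that the new version indexes `clients[provider]` directly, so a missing key surfaces through the `except` clause rather than the old "client not available" branch. A standalone sketch of the resulting dispatch, with the guard reinstated explicitly (function name, signature, and the `clients` parameter are assumed for illustration):

```python
def query_commercial_model(clients: dict, provider: str, model_id: str,
                           system_prompt: str, user_prompt: str,
                           temperature: float = 0.7) -> str:
    # clients maps provider name -> OpenAI-compatible client, as built above.
    if provider not in clients:
        return f"API client not available for {provider}"
    try:
        response = clients[provider].chat.completions.create(
            model=model_id,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=temperature,
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error querying {model_id}: {e}"
```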
@@ -580,49 +574,16 @@
 " model_id = model_info[\"model_id\"]\n",
 " \n",
 " try:\n",
-" if provider == \"openai\" and \"openai\" in clients:\n",
-" response = clients[\"openai\"].chat.completions.create(\n",
-" model=model_id,\n",
-" messages=[\n",
-" {\"role\": \"system\", \"content\": \"You are a helpful assistant that generates realistic datasets.\"},\n",
-" {\"role\": \"user\", \"content\": prompt}\n",
-" ],\n",
-" temperature=temperature\n",
-" )\n",
-" return response.choices[0].message.content\n",
+" response = clients[provider].chat.completions.create(\n",
+" model=model_id,\n",
+" messages=[\n",
+" {\"role\": \"system\", \"content\": \"You are a helpful assistant that generates realistic datasets.\"},\n",
+" {\"role\": \"user\", \"content\": prompt}\n",
+" ],\n",
+" temperature=temperature\n",
+" )\n",
+" return response.choices[0].message.content\n",
-" \n",
-" elif provider == \"anthropic\" and \"anthropic\" in clients:\n",
-" response = clients[\"anthropic\"].messages.create(\n",
-" model=model_id,\n",
-" messages=[{\"role\": \"user\", \"content\": prompt}],\n",
-" system=\"You are a helpful assistant that generates realistic datasets.\",\n",
-" temperature=temperature,\n",
-" max_tokens=4000\n",
-" )\n",
-" return response.content[0].text\n",
-" \n",
-" elif provider == \"google\" and api_keys[\"google\"]:\n",
-" model = genai.GenerativeModel(model_id)\n",
-" response = model.generate_content(\n",
-" prompt,\n",
-" generation_config=genai.types.GenerationConfig(temperature=temperature)\n",
-" )\n",
-" return response.text\n",
-" \n",
-" elif provider == \"deepseek\" and \"deepseek\" in clients:\n",
-" response = clients[\"deepseek\"].chat.completions.create(\n",
-" model=model_id,\n",
-" messages=[\n",
-" {\"role\": \"system\", \"content\": \"You are a helpful assistant that generates realistic datasets.\"},\n",
-" {\"role\": \"user\", \"content\": prompt}\n",
-" ],\n",
-" temperature=temperature\n",
-" )\n",
-" return response.choices[0].message.content\n",
-" \n",
-" else:\n",
-" raise Exception(f\"API client not available for {provider}\")\n",
 " \n",
 " except Exception as e:\n",
 " raise Exception(f\"Commercial API error: {str(e)}\")\n",
 " \n",
@@ -1671,7 +1632,7 @@
 " print(\"🔄 Testing OpenAI schema generation...\")\n",
 " result = schema_manager.generate_schema_with_llm(\n",
 " \"Generate a dataset for e-commerce customer analysis\",\n",
-" \"GPT-4o Mini\",\n",
+" \"GPT-5 Mini\",\n",
 " 0.7\n",
 " )\n",
 " print(f\"✅ OpenAI schema generation: {len(result)} characters\")\n",
@@ -1845,8 +1806,22 @@
 }
 ],
 "metadata": {
+"kernelspec": {
+"display_name": ".venv",
+"language": "python",
+"name": "python3"
+},
 "language_info": {
-"name": "python"
+"codemirror_mode": {
+"name": "ipython",
+"version": 3
+},
+"file_extension": ".py",
+"mimetype": "text/x-python",
+"name": "python",
+"nbconvert_exporter": "python",
+"pygments_lexer": "ipython3",
+"version": "3.11.10"
 }
 },
 "nbformat": 4,