{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "f97c7598-f571-4ea1-838c-e9158f729c3e",
"metadata": {},
"outputs": [],
"source": [
"import ollama\n",
"import base64\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9fc1393c-f0b1-4982-94a2-bfd502e85b23",
"metadata": {},
"outputs": [],
"source": [
"def encode_image(image_path):\n",
" with open(image_path, 'rb') as f:\n",
" return base64.b64encode(f.read()).decode('utf-8')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "53cca1fa-6db2-4fe4-8990-ffd98423964a",
"metadata": {},
"outputs": [],
"source": [
"# image_path = r\"C:\\Users\\LAKSHYA\\OneDrive\\Pictures\\Camera Roll\\WIN_20250614_02_46_47_Pro.jpg\"\n",
"# image_base64 = encode_image(image_path)\n",
"# print(image_base64[:100]) "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "71146ccf-25af-48d3-8068-ee3c9008cebf",
"metadata": {},
"outputs": [],
"source": [
"image_list = []"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ee3c5d82-e530-40f5-901a-681421f21d1e",
"metadata": {},
"outputs": [],
"source": [
"def put_image():\n",
" global image_list\n",
" user_input_image = input(\"Enter image path or press enter to skip: \").strip()\n",
" \n",
" if not user_input_image:\n",
" print(\"No image inserted\")\n",
" return image_list\n",
"\n",
" image_path = os.path.normpath(user_input_image)\n",
" \n",
" if not os.path.exists(image_path):\n",
" print(\"Image path not found! Try again or enter to leave blank\")\n",
" return put_image() # Continue to allow more inputs\n",
" \n",
"\n",
"\n",
"\n",
" \n",
" image_base64 = encode_image(image_path)\n",
" image_list.append(image_base64)\n",
" \n",
" # Detect file extension for MIME type\n",
" # ext = os.path.splitext(image_path)[-1].lower()\n",
" # mime_type = 'image/jpeg' if ext in ['.jpg', '.jpeg'] else 'image/png' # Extend if needed\n",
"\n",
"\n",
" return image_list\n",
" \n",
" # return f\"data:{mime_type};base64,{image_base64[:100]}\"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "032f1abb-ca6c-4f03-bda1-1a0a62f2ec43",
"metadata": {},
"outputs": [],
"source": [
"prompt= (\"System prompt: (You are a compassionate and intelligent visual assistant designed to help people who are blind or visually impaired. \"\n",
" \"Your job is to look at an image and describe it in a way that helps the user understand the scene clearly. \"\n",
" \"Use simple, descriptive language and avoid technical terms. Describe what is happening in the image, people's body language, clothing, facial expressions, objects, and surroundings. \"\n",
" \"Be vivid and precise, as if you are painting a picture with words. \"\n",
" \"Also, take into account any personal instructions or questions provided by the user—such as describing a specific person, activity, or object. \"\n",
" \"If the user includes a specific prompt, prioritize that in your description.)\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "29494db0-4770-4689-9904-8eebc4390e7c",
"metadata": {},
"outputs": [],
"source": [
"def put_prompt():\n",
" global prompt\n",
" user_input = input(\"Put new prompt: \")\n",
" if not user_input:\n",
" print(\"please enter a prompt\")\n",
" return put_prompt()\n",
" prompt += \"\\nUser: \" + user_input\n",
" return prompt\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d286369c-e6ef-4a20-a3a8-3563af28940a",
"metadata": {},
"outputs": [],
"source": [
"def image_description():\n",
" global prompt\n",
"\n",
" put_image()\n",
" if not image_list: \n",
" return \"No images available. Skipping...\"\n",
"\n",
" user_prompt = put_prompt()\n",
" full_answer = \"\"\n",
"\n",
" for chunk in ollama.generate(\n",
" model='llava:7b-v1.6',\n",
" prompt=user_prompt,\n",
" images=image_list,\n",
" stream=True\n",
" ):\n",
" content = chunk.get(\"response\", \"\")\n",
" print(\"\\n\\n Final Answer:\",content, end=\"\", flush=True) # Live stream to console\n",
" full_answer += content\n",
"\n",
" prompt += \"\\nUser: \" + user_prompt + \"\\nAssistant: \" + full_answer\n",
" return full_answer\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cbda35a3-45ed-4509-ab41-6827eacd922c",
"metadata": {},
"outputs": [],
"source": [
"def call_llava():\n",
" image_list.clear()\n",
" for i in range(5):\n",
" print(f\"\\n Iteration {i+1}\")\n",
" answer = image_description()\n",
" print(\"\\n\\n Final Answer:\", answer)\n",
" \n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "15518865-6c59-4029-bc2d-42d313eb78bc",
"metadata": {},
"outputs": [],
"source": [
"call_llava()"
]
},
{
"cell_type": "markdown",
"id": "23de3b59-3699-4270-9392-99fccdede83e",
"metadata": {},
"source": [
"# second week practice on personal project making model faster and smarter by using tools\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9d44c59e-5eb7-4b00-9489-e05d7c8c3eda",
"metadata": {},
"outputs": [],
"source": [
"messages = []\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "061ea026-d4c6-4d6c-bb9b-f6430de9f5af",
"metadata": {},
"outputs": [],
"source": [
"system_content = (\n",
" \"You are a helpful assistant for visually impaired users. \"\n",
" \"You are capable of answering questions directly or calling a function to analyze an image if needed. \"\n",
" \"There is a list of images available, indexed from 0. \"\n",
" \"When a user asks a question, first determine whether any image in the list is needed to answer. \"\n",
" \"If yes, reply in this structured format:\\n\\n\"\n",
" \"TOOL_CALL: analyze_image(<image_index_or_range>, prompt='<description_request>')\\n\\n\"\n",
" \"If image is not needed, just answer the user directly in plain natural language.\\n\"\n",
" \"Be clear and use descriptive but accessible language suitable for blind users.\"\n",
")"
]
},
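{
"cell_type": "markdown",
"id": "5a1d0c2e-7b34-4f6d-9e21-0c8b4a6f3d17",
"metadata": {},
"source": [
"A quick sanity check (an illustrative addition, not part of the original flow): the next cell runs a hand-written reply in the `TOOL_CALL` format through the same regex that `process_response` uses later, so you can confirm the structured format described in `system_content` actually parses. The sample string is an assumption for demonstration."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6b2e1d3f-8c45-4a7e-af32-1d9c5b7e4f28",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical model reply that follows the TOOL_CALL format from system_content\n",
"sample_response = \"TOOL_CALL: analyze_image(0, prompt='Describe the person in the photo')\"\n",
"\n",
"# Same pattern used in process_response below\n",
"pattern = r\"TOOL_CALL:\\s*analyze_image\\((.*?)\\s*,\\s*prompt='(.*?)'\\)\"\n",
"match = re.search(pattern, sample_response, re.DOTALL)\n",
"if match:\n",
"    print(\"index/range:\", match.group(1).strip())  # expected: 0\n",
"    print(\"prompt:\", match.group(2).strip())       # expected: Describe the person in the photo\n",
"else:\n",
"    print(\"No TOOL_CALL found\")"
]
},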
{
"cell_type": "code",
"execution_count": null,
"id": "2f859450-eb3e-4e6c-9602-84f91f5ffda7",
"metadata": {},
"outputs": [],
"source": [
"messages.append({\"role\":\"system\",\"content\":system_content})"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a8009b75-3468-4694-887d-6cd5132c2907",
"metadata": {},
"outputs": [],
"source": [
"def chat_loop():\n",
" \"\"\"Main chat interaction loop (single-turn version)\"\"\"\n",
" global image_list, messages\n",
" \n",
" print(\"\\n\" + \"=\"*50)\n",
" print(\"LLaVA Assistant for Visually Impaired Users\")\n",
" print(\"=\"*50 + \"\\n\")\n",
" \n",
" # Step 1: Load images\n",
" print(\"Step 1: Add images (optional)\")\n",
" put_image()\n",
" messages.append({\n",
" \"role\": \"system\", \n",
" \"content\": f\"There are {len(image_list)} images available (index 0-{len(image_list)-1}).\"\n",
" })\n",
" \n",
" # Step 2: Single chat interaction\n",
" print(\"\\nStep 2: Ask a question about the images\")\n",
" user_content = put_prompt()\n",
" messages.append({\"role\": \"user\", \"content\": user_content})\n",
" \n",
" # Get model response\n",
" try:\n",
" response = ollama.chat(\n",
" model='llava:7b-v1.6',\n",
" messages=messages\n",
" )[\"message\"][\"content\"]\n",
" print(\"assistant: \",response) \n",
" processed_response = process_response(response)\n",
" print(f\"\\nASSISTANT: {processed_response}\\n\")\n",
" \n",
" except Exception as e:\n",
" print(f\"Error occurred: {e}\")\n",
" \n",
" print(\"\\nSession ended. Goodbye!\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ee2de6d7-a0bf-45fc-8d5c-98e0055519b0",
"metadata": {},
"outputs": [],
"source": [
"def process_response(response):\n",
" \"\"\"Process the model's response and handle tool calls\"\"\"\n",
" if response.strip().startswith(\"TOOL_CALL:\"):\n",
" # Extract image index/range and prompt from TOOL_CALL\n",
" pattern = r\"TOOL_CALL:\\s*analyze_image\\((.*?)\\s*,\\s*prompt='(.*?)'\\)\"\n",
" match = re.search(pattern, response, re.DOTALL)\n",
" \n",
" if not match:\n",
" error_msg = \"Error: Invalid TOOL_CALL format.\"\n",
" messages.append({\"role\": \"assistant\", \"content\": error_msg})\n",
" return error_msg\n",
" \n",
" image_expr = match.group(1).strip()\n",
" prompt = match.group(2).strip()\n",
" \n",
" try:\n",
" # Handle different index formats\n",
" if \":\" in image_expr: # Range (e.g., \"1:3\")\n",
" start, end = map(int, image_expr.split(\":\"))\n",
" index_or_range = list(range(start, end))\n",
" else: # Single index\n",
" index_or_range = int(image_expr)\n",
" \n",
" # Validate indices\n",
" max_index = len(image_list) - 1\n",
" if isinstance(index_or_range, list):\n",
" if any(i < 0 or i > max_index for i in index_or_range):\n",
" error_msg = f\"Error: Image index out of range (0-{max_index}).\"\n",
" messages.append({\"role\": \"assistant\", \"content\": error_msg})\n",
" return error_msg\n",
" elif index_or_range < 0 or index_or_range > max_index:\n",
" error_msg = f\"Error: Image index out of range (0-{max_index}).\"\n",
" messages.append({\"role\": \"assistant\", \"content\": error_msg})\n",
" return error_msg\n",
" \n",
" # Perform analysis\n",
" result = analyze_image(index_or_range, prompt)\n",
" print(\"funtion called\")\n",
" messages.append({\n",
" \"role\": \"function\",\n",
" \"name\": \"analyze_image\",\n",
" \"content\": result\n",
" })\n",
" \n",
" # Return formatted result\n",
" formatted_result = f\"\\nIMAGE ANALYSIS RESULT:\\n{result}\"\n",
" return formatted_result\n",
"\n",
" except Exception as e:\n",
" error_msg = f\"Error processing TOOL_CALL: {e}\"\n",
" messages.append({\"role\": \"assistant\", \"content\": error_msg})\n",
" return error_msg\n",
" else:\n",
" messages.append({\"role\": \"assistant\", \"content\": response})\n",
" return response"
]
},
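{
"cell_type": "markdown",
"id": "7c3f2e4a-9d56-4b8f-b043-2ead6c8f5a39",
"metadata": {},
"source": [
"Another small illustrative check (an assumption of mine, not in the original notebook): a reply without the `TOOL_CALL:` prefix should take the plain-text branch of `process_response`, be recorded in `messages`, and come back unchanged. The demo pops its entry afterwards so the real chat history stays clean."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8d4a3f5b-ae67-4c90-9154-3fbe7d9a6b40",
"metadata": {},
"outputs": [],
"source": [
"# No TOOL_CALL prefix, so the else-branch runs\n",
"demo = process_response(\"The question can be answered without looking at any image.\")\n",
"print(\"returned:\", demo)\n",
"print(\"last message role:\", messages[-1][\"role\"])  # expected: assistant\n",
"messages.pop()  # undo the demo entry so it doesn't pollute the conversation"
]
},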
{
"cell_type": "code",
"execution_count": null,
"id": "ea82f8f6-c321-4fbc-81ee-a508b087d53b",
"metadata": {},
"outputs": [],
"source": [
"def analyze_image(index_or_range, prompt):\n",
" \"\"\"Analyze specific image(s) using LLaVA\"\"\"\n",
" global image_list\n",
" \n",
" # Handle single index or range\n",
" if isinstance(index_or_range, int):\n",
" images = [image_list[index_or_range]]\n",
" elif isinstance(index_or_range, list):\n",
" images = [image_list[i] for i in index_or_range]\n",
" else:\n",
" return \"Invalid image index/range specified.\"\n",
" \n",
" if not images:\n",
" return \"No images available for analysis.\"\n",
" \n",
" full_prompt = (\n",
" \"Describe the image clearly for a visually impaired user. \"\n",
" \"Be detailed about objects, people, colors, spatial relationships, \"\n",
" \"and any important context. \"\n",
" f\"User's specific request: {prompt}\"\n",
" )\n",
" \n",
" output = \"\"\n",
" try:\n",
" for chunk in ollama.generate(\n",
" model='llava:7b-v1.6',\n",
" prompt=full_prompt,\n",
" images=images,\n",
" stream=True\n",
" ):\n",
" output += chunk.get('response', \"\")\n",
" except Exception as e:\n",
" return f\"Error analyzing image: {e}\"\n",
" \n",
" return output\n"
]
},
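{
"cell_type": "markdown",
"id": "9e5b4a6c-bf78-4da1-8265-4acf8eab7c51",
"metadata": {},
"source": [
"A direct usage sketch for `analyze_image` (illustrative, assuming at least one image has been loaded via `put_image()`): it bypasses the chat layer entirely, which is handy for debugging the vision model on its own."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "af6c5b7d-c089-4eb2-9376-5bdf9fbc8d62",
"metadata": {},
"outputs": [],
"source": [
"# Only runs if an image is already loaded; prints the first part of the description\n",
"if image_list:\n",
"    print(analyze_image(0, \"What is in the foreground?\")[:300])\n",
"else:\n",
"    print(\"Load an image with put_image() first.\")"
]
},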
{
"cell_type": "code",
"execution_count": null,
"id": "2040b020-8944-409b-8ebb-10d7ffef1748",
"metadata": {},
"outputs": [],
"source": [
"image_list.clear\n",
"for i in range(5):\n",
" chat_loop()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}