{ "cells": [
{ "cell_type": "code", "execution_count": null, "id": "f97c7598-f571-4ea1-838c-e9158f729c3e", "metadata": {}, "outputs": [], "source": [ "import ollama\n", "import base64\n", "import os\n", "import re  # used later by process_response" ] },
{ "cell_type": "code", "execution_count": null, "id": "9fc1393c-f0b1-4982-94a2-bfd502e85b23", "metadata": {}, "outputs": [], "source": [ "def encode_image(image_path):\n", "    \"\"\"Read an image file and return its contents as a base64 string.\"\"\"\n", "    with open(image_path, 'rb') as f:\n", "        return base64.b64encode(f.read()).decode('utf-8')" ] },
{ "cell_type": "code", "execution_count": null, "id": "53cca1fa-6db2-4fe4-8990-ffd98423964a", "metadata": {}, "outputs": [], "source": [ "# image_path = r\"C:\\Users\\LAKSHYA\\OneDrive\\Pictures\\Camera Roll\\WIN_20250614_02_46_47_Pro.jpg\"\n", "# image_base64 = encode_image(image_path)\n", "# print(image_base64[:100])" ] },
{ "cell_type": "code", "execution_count": null, "id": "71146ccf-25af-48d3-8068-ee3c9008cebf", "metadata": {}, "outputs": [], "source": [ "image_list = []" ] },
{ "cell_type": "code", "execution_count": null, "id": "6f8801a8-0c30-4199-a334-587096e6edeb", "metadata": {}, "outputs": [], "source": [] },
{ "cell_type": "code", "execution_count": null, "id": "ee3c5d82-e530-40f5-901a-681421f21d1e", "metadata": {}, "outputs": [], "source": [ "def put_image():\n", "    \"\"\"Prompt for an image path, encode it, and append it to image_list.\"\"\"\n", "    global image_list\n", "    user_input_image = input(\"Enter image path or press enter to skip: \").strip()\n", "\n", "    if not user_input_image:\n", "        print(\"No image inserted\")\n", "        return image_list\n", "\n", "    image_path = os.path.normpath(user_input_image)\n", "\n", "    if not os.path.exists(image_path):\n", "        print(\"Image path not found! Try again or press enter to leave blank.\")\n", "        return put_image()  # Retry until a valid path (or blank input) is given\n", "\n", "    image_base64 = encode_image(image_path)\n", "    image_list.append(image_base64)\n", "\n", "    # Detect file extension for MIME type (not needed by ollama, kept for reference):\n", "    # ext = os.path.splitext(image_path)[-1].lower()\n", "    # mime_type = 'image/jpeg' if ext in ['.jpg', '.jpeg'] else 'image/png'\n", "    # return f\"data:{mime_type};base64,{image_base64[:100]}\"\n", "\n", "    return image_list" ] },
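{ "cell_type": "markdown", "id": "3f2a9c41-11aa-4b2c-8d3e-0a1b2c3d4e01", "metadata": {}, "source": [ "The commented-out lines in `put_image` hint at building a `data:` URI with the right MIME type. Below is a minimal sketch of that idea, illustrative only: the ollama client accepts raw base64 strings, so the main flow never needs it, and `detect_mime` / `to_data_uri` are hypothetical helper names." ] },
{ "cell_type": "code", "execution_count": null, "id": "3f2a9c41-11aa-4b2c-8d3e-0a1b2c3d4e02", "metadata": {}, "outputs": [], "source": [ "def detect_mime(image_path):\n", "    \"\"\"Guess a MIME type from the file extension (defaults to octet-stream).\"\"\"\n", "    ext = os.path.splitext(image_path)[-1].lower()\n", "    return {'.jpg': 'image/jpeg', '.jpeg': 'image/jpeg',\n", "            '.png': 'image/png', '.webp': 'image/webp'}.get(ext, 'application/octet-stream')\n", "\n", "def to_data_uri(image_path):\n", "    \"\"\"Encode an image file as a data: URI (useful for embedding in HTML).\"\"\"\n", "    return f\"data:{detect_mime(image_path)};base64,{encode_image(image_path)}\"" ] },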
{ "cell_type": "code", "execution_count": null, "id": "032f1abb-ca6c-4f03-bda1-1a0a62f2ec43", "metadata": {}, "outputs": [], "source": [ "prompt = (\"System prompt: (You are a compassionate and intelligent visual assistant designed to help people who are blind or visually impaired. \"\n", "          \"Your job is to look at an image and describe it in a way that helps the user understand the scene clearly. \"\n", "          \"Use simple, descriptive language and avoid technical terms. Describe what is happening in the image, people's body language, clothing, facial expressions, objects, and surroundings. \"\n", "          \"Be vivid and precise, as if you are painting a picture with words. \"\n", "          \"Also, take into account any personal instructions or questions provided by the user, such as describing a specific person, activity, or object. \"\n", "          \"If the user includes a specific prompt, prioritize that in your description.)\")" ] },
{ "cell_type": "code", "execution_count": null, "id": "29494db0-4770-4689-9904-8eebc4390e7c", "metadata": {}, "outputs": [], "source": [ "def put_prompt():\n", "    \"\"\"Ask the user for a prompt and append it to the running conversation.\"\"\"\n", "    global prompt\n", "    user_input = input(\"Put new prompt: \")\n", "    if not user_input:\n", "        print(\"Please enter a prompt.\")\n", "        return put_prompt()\n", "    prompt += \"\\nUser: \" + user_input\n", "    return prompt" ] },
{ "cell_type": "code", "execution_count": null, "id": "d286369c-e6ef-4a20-a3a8-3563af28940a", "metadata": {}, "outputs": [], "source": [ "def image_description():\n", "    \"\"\"Collect an image and a prompt, then stream LLaVA's description.\"\"\"\n", "    global prompt\n", "\n", "    put_image()\n", "    if not image_list:\n", "        return \"No images available. Skipping...\"\n", "\n", "    user_prompt = put_prompt()  # returns the full conversation so far\n", "    full_answer = \"\"\n", "\n", "    print(\"\\n\\nAnswer: \", end=\"\", flush=True)\n", "    for chunk in ollama.generate(\n", "        model='llava:7b-v1.6',\n", "        prompt=user_prompt,\n", "        images=image_list,\n", "        stream=True\n", "    ):\n", "        content = chunk.get(\"response\", \"\")\n", "        print(content, end=\"\", flush=True)  # Live stream to console\n", "        full_answer += content\n", "\n", "    # put_prompt already appended the user turn, so only add the answer here\n", "    prompt += \"\\nAssistant: \" + full_answer\n", "    return full_answer" ] },
{ "cell_type": "code", "execution_count": null, "id": "cbda35a3-45ed-4509-ab41-6827eacd922c", "metadata": {}, "outputs": [], "source": [ "def call_llava():\n", "    \"\"\"Run five describe-an-image rounds in a row.\"\"\"\n", "    image_list.clear()\n", "    for i in range(5):\n", "        print(f\"\\nIteration {i+1}\")\n", "        answer = image_description()\n", "        print(\"\\n\\nFinal Answer:\", answer)" ] },
{ "cell_type": "code", "execution_count": null, "id": "15518865-6c59-4029-bc2d-42d313eb78bc", "metadata": {}, "outputs": [], "source": [ "call_llava()" ] },
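{ "cell_type": "markdown", "id": "5b8d7e62-22bb-4c3d-8e4f-1b2c3d4e5f01", "metadata": {}, "source": [ "One caveat with the loop above: `prompt` grows by a `User:`/`Assistant:` pair on every round, so a long session will eventually exceed the model's context window. The sketch below is one way to cap that growth. It is an assumption rather than part of the original flow, and `trim_history` / `MAX_TURNS` are hypothetical names; to use it, call `prompt = trim_history(prompt)` at the start of `image_description`." ] },
{ "cell_type": "code", "execution_count": null, "id": "5b8d7e62-22bb-4c3d-8e4f-1b2c3d4e5f02", "metadata": {}, "outputs": [], "source": [ "MAX_TURNS = 6  # assumed cap on retained turns; tune to the model's context size\n", "\n", "def trim_history(full_prompt, max_turns=MAX_TURNS):\n", "    \"\"\"Keep the system header plus only the most recent user/assistant turns.\"\"\"\n", "    parts = full_prompt.split(\"\\nUser: \")\n", "    header, turns = parts[0], parts[1:]\n", "    if len(turns) <= max_turns:\n", "        return full_prompt\n", "    return header + \"\".join(\"\\nUser: \" + t for t in turns[-max_turns:])" ] },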
{ "cell_type": "markdown", "id": "23de3b59-3699-4270-9392-99fccdede83e", "metadata": {}, "source": [ "# Second week of the personal project: making the model faster and smarter with tools\n" ] },
{ "cell_type": "code", "execution_count": null, "id": "9d44c59e-5eb7-4b00-9489-e05d7c8c3eda", "metadata": {}, "outputs": [], "source": [ "messages = []\n" ] },
{ "cell_type": "code", "execution_count": null, "id": "061ea026-d4c6-4d6c-bb9b-f6430de9f5af", "metadata": {}, "outputs": [], "source": [ "system_content = (\n", "    \"You are a helpful assistant for visually impaired users. \"\n", "    \"You are capable of answering questions directly or calling a function to analyze an image if needed. \"\n", "    \"There is a list of images available, indexed from 0. \"\n", "    \"When a user asks a question, first determine whether any image in the list is needed to answer. \"\n", "    \"If yes, reply in this structured format:\\n\\n\"\n", "    \"TOOL_CALL: analyze_image(<index or start:end range>, prompt='<question about the image>')\\n\\n\"\n", "    \"If an image is not needed, just answer the user directly in plain natural language.\\n\"\n", "    \"Be clear and use descriptive but accessible language suitable for blind users.\"\n", ")" ] },
{ "cell_type": "code", "execution_count": null, "id": "2f859450-eb3e-4e6c-9602-84f91f5ffda7", "metadata": {}, "outputs": [], "source": [ "messages.append({\"role\":\"system\",\"content\":system_content})" ] },
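{ "cell_type": "markdown", "id": "7c9e8f73-33cc-4d4e-8f50-2c3d4e5f6a01", "metadata": {}, "source": [ "A quick, model-free sanity check that the structured format above is machine-parseable. The pattern mirrors the one used by `process_response` further down; the sample reply is made up for illustration." ] },
{ "cell_type": "code", "execution_count": null, "id": "7c9e8f73-33cc-4d4e-8f50-2c3d4e5f6a02", "metadata": {}, "outputs": [], "source": [ "pattern = r\"TOOL_CALL:\\s*analyze_image\\((.*?)\\s*,\\s*prompt='(.*?)'\\)\"\n", "sample = \"TOOL_CALL: analyze_image(0:2, prompt='What is on the table?')\"\n", "match = re.search(pattern, sample, re.DOTALL)\n", "print(match.group(1))  # 0:2  (image index or range)\n", "print(match.group(2))  # What is on the table?" ] },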
{ "cell_type": "code", "execution_count": null, "id": "a8009b75-3468-4694-887d-6cd5132c2907", "metadata": {}, "outputs": [], "source": [ "def chat_loop():\n", "    \"\"\"Main chat interaction loop (single-turn version).\"\"\"\n", "    global image_list, messages\n", "\n", "    print(\"\\n\" + \"=\"*50)\n", "    print(\"LLaVA Assistant for Visually Impaired Users\")\n", "    print(\"=\"*50 + \"\\n\")\n", "\n", "    # Step 1: Load images\n", "    print(\"Step 1: Add images (optional)\")\n", "    put_image()\n", "    if image_list:\n", "        availability = f\"There are {len(image_list)} images available (index 0-{len(image_list)-1}).\"\n", "    else:\n", "        availability = \"There are no images available.\"\n", "    messages.append({\"role\": \"system\", \"content\": availability})\n", "\n", "    # Step 2: Single chat interaction\n", "    print(\"\\nStep 2: Ask a question about the images\")\n", "    user_content = put_prompt()\n", "    messages.append({\"role\": \"user\", \"content\": user_content})\n", "\n", "    # Get the model response, then route it through the tool-call handler\n", "    try:\n", "        response = ollama.chat(\n", "            model='llava:7b-v1.6',\n", "            messages=messages\n", "        )[\"message\"][\"content\"]\n", "        print(\"Raw reply:\", response)\n", "        processed_response = process_response(response)\n", "        print(f\"\\nASSISTANT: {processed_response}\\n\")\n", "\n", "    except Exception as e:\n", "        print(f\"Error occurred: {e}\")\n", "\n", "    print(\"\\nSession ended. Goodbye!\")" ] },
{ "cell_type": "code", "execution_count": null, "id": "a3b3ff73-3cd5-4e5a-a37e-aaa8b325613c", "metadata": {}, "outputs": [], "source": [] },
{ "cell_type": "code", "execution_count": null, "id": "ee2de6d7-a0bf-45fc-8d5c-98e0055519b0", "metadata": {}, "outputs": [], "source": [ "def process_response(response):\n", "    \"\"\"Process the model's response and handle tool calls.\"\"\"\n", "    if response.strip().startswith(\"TOOL_CALL:\"):\n", "        # Extract image index/range and prompt from the TOOL_CALL\n", "        pattern = r\"TOOL_CALL:\\s*analyze_image\\((.*?)\\s*,\\s*prompt='(.*?)'\\)\"\n", "        match = re.search(pattern, response, re.DOTALL)\n", "\n", "        if not match:\n", "            error_msg = \"Error: Invalid TOOL_CALL format.\"\n", "            messages.append({\"role\": \"assistant\", \"content\": error_msg})\n", "            return error_msg\n", "\n", "        image_expr = match.group(1).strip()\n", "        prompt = match.group(2).strip()\n", "\n", "        try:\n", "            # Handle different index formats\n", "            if \":\" in image_expr:  # Range such as \"1:3\" (end-exclusive)\n", "                start, end = map(int, image_expr.split(\":\"))\n", "                index_or_range = list(range(start, end))\n", "            else:  # Single index\n", "                index_or_range = int(image_expr)\n", "\n", "            # Validate indices\n", "            max_index = len(image_list) - 1\n", "            if isinstance(index_or_range, list):\n", "                if any(i < 0 or i > max_index for i in index_or_range):\n", "                    error_msg = f\"Error: Image index out of range (0-{max_index}).\"\n", "                    messages.append({\"role\": \"assistant\", \"content\": error_msg})\n", "                    return error_msg\n", "            elif index_or_range < 0 or index_or_range > max_index:\n", "                error_msg = f\"Error: Image index out of range (0-{max_index}).\"\n", "                messages.append({\"role\": \"assistant\", \"content\": error_msg})\n", "                return error_msg\n", "\n", "            # Perform analysis\n", "            result = analyze_image(index_or_range, prompt)\n", "            print(\"function called\")\n", "            messages.append({\n", "                \"role\": \"function\",\n", "                \"name\": \"analyze_image\",\n", "                \"content\": result\n", "            })\n", "\n", "            # Return formatted result\n", "            formatted_result = f\"\\nIMAGE ANALYSIS RESULT:\\n{result}\"\n", "            return formatted_result\n", "\n", "        except Exception as e:\n", "            error_msg = f\"Error processing TOOL_CALL: {e}\"\n", "            messages.append({\"role\": \"assistant\", \"content\": error_msg})\n", "            return error_msg\n", "    else:\n", "        messages.append({\"role\": \"assistant\", \"content\": response})\n", "        return response" ] },
{ "cell_type": "code", "execution_count": null, "id": "ea82f8f6-c321-4fbc-81ee-a508b087d53b", "metadata": {}, "outputs": [], "source": [ "def analyze_image(index_or_range, prompt):\n", "    \"\"\"Analyze specific image(s) using LLaVA.\"\"\"\n", "    global image_list\n", "\n", "    # Handle a single index or a list of indices\n", "    if isinstance(index_or_range, int):\n", "        images = [image_list[index_or_range]]\n", "    elif isinstance(index_or_range, list):\n", "        images = [image_list[i] for i in index_or_range]\n", "    else:\n", "        return \"Invalid image index/range specified.\"\n", "\n", "    if not images:\n", "        return \"No images available for analysis.\"\n", "\n", "    full_prompt = (\n", "        \"Describe the image clearly for a visually impaired user. \"\n", "        \"Be detailed about objects, people, colors, spatial relationships, \"\n", "        \"and any important context. \"\n", "        f\"User's specific request: {prompt}\"\n", "    )\n", "\n", "    output = \"\"\n", "    try:\n", "        for chunk in ollama.generate(\n", "            model='llava:7b-v1.6',\n", "            prompt=full_prompt,\n", "            images=images,\n", "            stream=True\n", "        ):\n", "            output += chunk.get('response', \"\")\n", "    except Exception as e:\n", "        return f\"Error analyzing image: {e}\"\n", "\n", "    return output" ] },
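{ "cell_type": "markdown", "id": "9daf0a84-44dd-4e5f-8a61-3d4e5f6a7b01", "metadata": {}, "source": [ "The dispatch path can be dry-run without calling the model, a testing convenience that is not part of the original flow: temporarily replace `analyze_image` with a stub, feed `process_response` a synthetic TOOL_CALL, then restore the real function." ] },
{ "cell_type": "code", "execution_count": null, "id": "9daf0a84-44dd-4e5f-8a61-3d4e5f6a7b02", "metadata": {}, "outputs": [], "source": [ "_real_analyze_image = analyze_image\n", "\n", "def analyze_image(index_or_range, prompt):  # temporary stub for the dry run\n", "    return f\"[stub] would analyze {index_or_range!r} with prompt {prompt!r}\"\n", "\n", "image_list.extend([\"fake_base64\"] * 3)  # pretend three images are loaded\n", "print(process_response(\"TOOL_CALL: analyze_image(0:2, prompt='Describe the scene')\"))\n", "\n", "analyze_image = _real_analyze_image  # restore the real implementation\n", "image_list.clear()" ] },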
{ "cell_type": "code", "execution_count": null, "id": "2040b020-8944-409b-8ebb-10d7ffef1748", "metadata": {}, "outputs": [], "source": [ "image_list.clear()  # reset images between sessions\n", "for i in range(5):\n", "    chat_loop()" ] },
{ "cell_type": "code", "execution_count": null, "id": "2c7c40d7-df9d-464a-89da-1c6fe613c31d", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.13" } }, "nbformat": 4, "nbformat_minor": 5 }