diff --git a/community-contributions/LLaVA-For-Visually-Impared-People/llava-For-Image-week1.ipynb b/community-contributions/LLaVA-For-Visually-Impared-People/llava-For-Image-week1.ipynb new file mode 100644 index 0000000..d1494d8 --- /dev/null +++ b/community-contributions/LLaVA-For-Visually-Impared-People/llava-For-Image-week1.ipynb @@ -0,0 +1,230 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2a5df086", + "metadata": {}, + "source": [ + "# If Anyone is interested in this idea and want to contribute please let me know and contribute your idea/Code\n" + ] + }, + { + "cell_type": "markdown", + "id": "3b0d5f6e", + "metadata": {}, + "source": [ + "*IDEA* - For visually impaired individuals, daily life often presents numerous obstacles that many of us take for granted. While tools like Braille and guide dogs offer some support, they do not fully address the limitations faced in navigating the world. With over 43.3 million blind people globally, there is a pressing need for more inclusive technologies that help break these barriers. This project aims to do more than assist with daily tasks; it seeks to empower individuals to engage meaningfully with their environment. By providing real-time, contextually accurate captions, this system allows them to experience the world around them, feel less isolated, and regain a sense of autonomy. Beyond just aiding navigation, it provides a bridge to connection—helping them feel more alive, present, and capable. This project is not just about overcoming limitations; it’s about enriching lives and enabling a deeper, fuller interaction with the world, fostering a sense of belonging and independence.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f97c7598-f571-4ea1-838c-e9158f729c3e", + "metadata": {}, + "outputs": [], + "source": [ + "import ollama\n", + "import base64\n", + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9fc1393c-f0b1-4982-94a2-bfd502e85b23", + "metadata": {}, + "outputs": [], + "source": [ + "def encode_image(image_path):\n", + " with open(image_path, 'rb') as f:\n", + " return base64.b64encode(f.read()).decode('utf-8')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53cca1fa-6db2-4fe4-8990-ffd98423964a", + "metadata": {}, + "outputs": [], + "source": [ + "# image_path = r\"C:\\Users\\LAKSHYA\\OneDrive\\Pictures\\Camera Roll\\WIN_20250614_02_46_47_Pro.jpg\"\n", + "# image_base64 = encode_image(image_path)\n", + "# print(image_base64[:100]) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71146ccf-25af-48d3-8068-ee3c9008cebf", + "metadata": {}, + "outputs": [], + "source": [ + "image_list = []" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f8801a8-0c30-4199-a334-587096e6edeb", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee3c5d82-e530-40f5-901a-681421f21d1e", + "metadata": {}, + "outputs": [], + "source": [ + "def put_image():\n", + " global image_list\n", + " user_input_image = input(\"Enter image path or press enter to skip: \").strip()\n", + " \n", + " if not user_input_image:\n", + " print(\"No image inserted\")\n", + " return image_list\n", + "\n", + " image_path = os.path.normpath(user_input_image)\n", + " \n", + " if not os.path.exists(image_path):\n", + " print(\"Image path not found! 
Try again or enter to leave blank\")\n", + " return put_image() # Continue to allow more inputs\n", + " \n", + "\n", + "\n", + "\n", + " \n", + " image_base64 = encode_image(image_path)\n", + " image_list.append(image_base64)\n", + " \n", + " # Detect file extension for MIME type\n", + " # ext = os.path.splitext(image_path)[-1].lower()\n", + " # mime_type = 'image/jpeg' if ext in ['.jpg', '.jpeg'] else 'image/png' # Extend if needed\n", + "\n", + "\n", + " return image_list\n", + " \n", + " # return f\"data:{mime_type};base64,{image_base64[:100]}\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "032f1abb-ca6c-4f03-bda1-1a0a62f2ec43", + "metadata": {}, + "outputs": [], + "source": [ + "prompt= (\"System prompt: (You are a compassionate and intelligent visual assistant designed to help people who are blind or visually impaired. \"\n", + " \"Your job is to look at an image and describe it in a way that helps the user understand the scene clearly. \"\n", + " \"Use simple, descriptive language and avoid technical terms. Describe what is happening in the image, people's body language, clothing, facial expressions, objects, and surroundings. \"\n", + " \"Be vivid and precise, as if you are painting a picture with words. \"\n", + " \"Also, take into account any personal instructions or questions provided by the user—such as describing a specific person, activity, or object. \"\n", + " \"If the user includes a specific prompt, prioritize that in your description.)\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29494db0-4770-4689-9904-8eebc4390e7c", + "metadata": {}, + "outputs": [], + "source": [ + "def put_prompt():\n", + " global prompt\n", + " user_input = input(\"Put new prompt: \")\n", + " if not user_input:\n", + " print(\"please enter a prompt\")\n", + " return put_prompt()\n", + " prompt += \"\\nUser: \" + user_input\n", + " return prompt\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d286369c-e6ef-4a20-a3a8-3563af28940a", + "metadata": {}, + "outputs": [], + "source": [ + "def image_description():\n", + " global prompt\n", + "\n", + " put_image()\n", + " if not image_list: \n", + " return \"No images available. 
Skipping...\"\n", + "\n", + " user_prompt = put_prompt()\n", + " full_answer = \"\"\n", + "\n", + " for chunk in ollama.generate(\n", + " model='llava:7b-v1.6',\n", + " prompt=user_prompt,\n", + " images=image_list,\n", + " stream=True\n", + " ):\n", + " content = chunk.get(\"response\", \"\")\n", + " print(\"\\n\\n Final Answer:\",content, end=\"\", flush=True) # Live stream to console\n", + " full_answer += content\n", + "\n", + " prompt += \"\\nUser: \" + user_prompt + \"\\nAssistant: \" + full_answer\n", + " return full_answer\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cbda35a3-45ed-4509-ab41-6827eacd922c", + "metadata": {}, + "outputs": [], + "source": [ + "def call_llava():\n", + " image_list.clear()\n", + " for i in range(5):\n", + " print(f\"\\n Iteration {i+1}\")\n", + " answer = image_description()\n", + " print(\"\\n\\n Final Answer:\", answer)\n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15518865-6c59-4029-bc2d-42d313eb78bc", + "metadata": {}, + "outputs": [], + "source": [ + "call_llava()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c77bd493-f893-402e-b4e3-64854e9d2e19", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/community-contributions/LLaVA-For-Visually-Impared-People/llava-week2-ChainForRealTimeCaptionGeneration.ipynb b/community-contributions/LLaVA-For-Visually-Impared-People/llava-week2-ChainForRealTimeCaptionGeneration.ipynb new file mode 100644 index 0000000..26e30e3 --- /dev/null +++ b/community-contributions/LLaVA-For-Visually-Impared-People/llava-week2-ChainForRealTimeCaptionGeneration.ipynb @@ -0,0 +1,433 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "f97c7598-f571-4ea1-838c-e9158f729c3e", + "metadata": {}, + "outputs": [], + "source": [ + "import ollama\n", + "import base64\n", + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9fc1393c-f0b1-4982-94a2-bfd502e85b23", + "metadata": {}, + "outputs": [], + "source": [ + "def encode_image(image_path):\n", + " with open(image_path, 'rb') as f:\n", + " return base64.b64encode(f.read()).decode('utf-8')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53cca1fa-6db2-4fe4-8990-ffd98423964a", + "metadata": {}, + "outputs": [], + "source": [ + "# image_path = r\"C:\\Users\\LAKSHYA\\OneDrive\\Pictures\\Camera Roll\\WIN_20250614_02_46_47_Pro.jpg\"\n", + "# image_base64 = encode_image(image_path)\n", + "# print(image_base64[:100]) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71146ccf-25af-48d3-8068-ee3c9008cebf", + "metadata": {}, + "outputs": [], + "source": [ + "image_list = []" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f8801a8-0c30-4199-a334-587096e6edeb", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee3c5d82-e530-40f5-901a-681421f21d1e", + "metadata": {}, + "outputs": [], + "source": [ + "def put_image():\n", + " global image_list\n", + " user_input_image = 
input(\"Enter image path or press enter to skip: \").strip()\n", + " \n", + " if not user_input_image:\n", + " print(\"No image inserted\")\n", + " return image_list\n", + "\n", + " image_path = os.path.normpath(user_input_image)\n", + " \n", + " if not os.path.exists(image_path):\n", + " print(\"Image path not found! Try again or enter to leave blank\")\n", + " return put_image() # Continue to allow more inputs\n", + " \n", + "\n", + "\n", + "\n", + " \n", + " image_base64 = encode_image(image_path)\n", + " image_list.append(image_base64)\n", + " \n", + " # Detect file extension for MIME type\n", + " # ext = os.path.splitext(image_path)[-1].lower()\n", + " # mime_type = 'image/jpeg' if ext in ['.jpg', '.jpeg'] else 'image/png' # Extend if needed\n", + "\n", + "\n", + " return image_list\n", + " \n", + " # return f\"data:{mime_type};base64,{image_base64[:100]}\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "032f1abb-ca6c-4f03-bda1-1a0a62f2ec43", + "metadata": {}, + "outputs": [], + "source": [ + "prompt= (\"System prompt: (You are a compassionate and intelligent visual assistant designed to help people who are blind or visually impaired. \"\n", + " \"Your job is to look at an image and describe it in a way that helps the user understand the scene clearly. \"\n", + " \"Use simple, descriptive language and avoid technical terms. Describe what is happening in the image, people's body language, clothing, facial expressions, objects, and surroundings. \"\n", + " \"Be vivid and precise, as if you are painting a picture with words. \"\n", + " \"Also, take into account any personal instructions or questions provided by the user—such as describing a specific person, activity, or object. \"\n", + " \"If the user includes a specific prompt, prioritize that in your description.)\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29494db0-4770-4689-9904-8eebc4390e7c", + "metadata": {}, + "outputs": [], + "source": [ + "def put_prompt():\n", + " global prompt\n", + " user_input = input(\"Put new prompt: \")\n", + " if not user_input:\n", + " print(\"please enter a prompt\")\n", + " return put_prompt()\n", + " prompt += \"\\nUser: \" + user_input\n", + " return prompt\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d286369c-e6ef-4a20-a3a8-3563af28940a", + "metadata": {}, + "outputs": [], + "source": [ + "def image_description():\n", + " global prompt\n", + "\n", + " put_image()\n", + " if not image_list: \n", + " return \"No images available. 
Skipping...\"\n", + "\n", + " user_prompt = put_prompt()\n", + " full_answer = \"\"\n", + "\n", + " for chunk in ollama.generate(\n", + " model='llava:7b-v1.6',\n", + " prompt=user_prompt,\n", + " images=image_list,\n", + " stream=True\n", + " ):\n", + " content = chunk.get(\"response\", \"\")\n", + " print(\"\\n\\n Final Answer:\",content, end=\"\", flush=True) # Live stream to console\n", + " full_answer += content\n", + "\n", + " prompt += \"\\nUser: \" + user_prompt + \"\\nAssistant: \" + full_answer\n", + " return full_answer\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cbda35a3-45ed-4509-ab41-6827eacd922c", + "metadata": {}, + "outputs": [], + "source": [ + "def call_llava():\n", + " image_list.clear()\n", + " for i in range(5):\n", + " print(f\"\\n Iteration {i+1}\")\n", + " answer = image_description()\n", + " print(\"\\n\\n Final Answer:\", answer)\n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15518865-6c59-4029-bc2d-42d313eb78bc", + "metadata": {}, + "outputs": [], + "source": [ + "call_llava()" + ] + }, + { + "cell_type": "markdown", + "id": "23de3b59-3699-4270-9392-99fccdede83e", + "metadata": {}, + "source": [ + "# second week practice on personal project making model faster and smarter by using tools\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d44c59e-5eb7-4b00-9489-e05d7c8c3eda", + "metadata": {}, + "outputs": [], + "source": [ + "messages = []\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "061ea026-d4c6-4d6c-bb9b-f6430de9f5af", + "metadata": {}, + "outputs": [], + "source": [ + "system_content = (\n", + " \"You are a helpful assistant for visually impaired users. \"\n", + " \"You are capable of answering questions directly or calling a function to analyze an image if needed. \"\n", + " \"There is a list of images available, indexed from 0. \"\n", + " \"When a user asks a question, first determine whether any image in the list is needed to answer. 
\"\n", + " \"If yes, reply in this structured format:\\n\\n\"\n", + " \"TOOL_CALL: analyze_image(, prompt='')\\n\\n\"\n", + " \"If image is not needed, just answer the user directly in plain natural language.\\n\"\n", + " \"Be clear and use descriptive but accessible language suitable for blind users.\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f859450-eb3e-4e6c-9602-84f91f5ffda7", + "metadata": {}, + "outputs": [], + "source": [ + "messages.append({\"role\":\"system\",\"content\":system_content})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a8009b75-3468-4694-887d-6cd5132c2907", + "metadata": {}, + "outputs": [], + "source": [ + "def chat_loop():\n", + " \"\"\"Main chat interaction loop (single-turn version)\"\"\"\n", + " global image_list, messages\n", + " \n", + " print(\"\\n\" + \"=\"*50)\n", + " print(\"LLaVA Assistant for Visually Impaired Users\")\n", + " print(\"=\"*50 + \"\\n\")\n", + " \n", + " # Step 1: Load images\n", + " print(\"Step 1: Add images (optional)\")\n", + " put_image()\n", + " messages.append({\n", + " \"role\": \"system\", \n", + " \"content\": f\"There are {len(image_list)} images available (index 0-{len(image_list)-1}).\"\n", + " })\n", + " \n", + " # Step 2: Single chat interaction\n", + " print(\"\\nStep 2: Ask a question about the images\")\n", + " user_content = put_prompt()\n", + " messages.append({\"role\": \"user\", \"content\": user_content})\n", + " \n", + " # Get model response\n", + " try:\n", + " response = ollama.chat(\n", + " model='llava:7b-v1.6',\n", + " messages=messages\n", + " )[\"message\"][\"content\"]\n", + " print(\"assistant: \",response) \n", + " processed_response = process_response(response)\n", + " print(f\"\\nASSISTANT: {processed_response}\\n\")\n", + " \n", + " except Exception as e:\n", + " print(f\"Error occurred: {e}\")\n", + " \n", + " print(\"\\nSession ended. 
Goodbye!\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a3b3ff73-3cd5-4e5a-a37e-aaa8b325613c", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee2de6d7-a0bf-45fc-8d5c-98e0055519b0", + "metadata": {}, + "outputs": [], + "source": [ + "import re  # used below to parse the model's TOOL_CALL replies\n", + "\n", + "def process_response(response):\n", + "    \"\"\"Process the model's response and handle tool calls\"\"\"\n", + "    if response.strip().startswith(\"TOOL_CALL:\"):\n", + "        # Extract image index/range and prompt from TOOL_CALL\n", + "        pattern = r\"TOOL_CALL:\\s*analyze_image\\((.*?)\\s*,\\s*prompt='(.*?)'\\)\"\n", + "        match = re.search(pattern, response, re.DOTALL)\n", + "        \n", + "        if not match:\n", + "            error_msg = \"Error: Invalid TOOL_CALL format.\"\n", + "            messages.append({\"role\": \"assistant\", \"content\": error_msg})\n", + "            return error_msg\n", + "        \n", + "        image_expr = match.group(1).strip()\n", + "        prompt = match.group(2).strip()\n", + "        \n", + "        try:\n", + "            # Handle different index formats\n", + "            if \":\" in image_expr: # Range (e.g., \"1:3\")\n", + "                start, end = map(int, image_expr.split(\":\"))\n", + "                index_or_range = list(range(start, end))\n", + "            else: # Single index\n", + "                index_or_range = int(image_expr)\n", + "            \n", + "            # Validate indices\n", + "            max_index = len(image_list) - 1\n", + "            if isinstance(index_or_range, list):\n", + "                if any(i < 0 or i > max_index for i in index_or_range):\n", + "                    error_msg = f\"Error: Image index out of range (0-{max_index}).\"\n", + "                    messages.append({\"role\": \"assistant\", \"content\": error_msg})\n", + "                    return error_msg\n", + "            elif index_or_range < 0 or index_or_range > max_index:\n", + "                error_msg = f\"Error: Image index out of range (0-{max_index}).\"\n", + "                messages.append({\"role\": \"assistant\", \"content\": error_msg})\n", + "                return error_msg\n", + "            \n", + "            # Perform analysis\n", + "            result = analyze_image(index_or_range, prompt)\n", + "            print(\"function called\")\n", + "            messages.append({\n", + "                \"role\": \"function\",\n", + "                \"name\": \"analyze_image\",\n", + "                \"content\": result\n", + "            })\n", + "            \n", + "            # Return formatted result\n", + "            formatted_result = f\"\\nIMAGE ANALYSIS RESULT:\\n{result}\"\n", + "            return formatted_result\n", + "\n", + "        except Exception as e:\n", + "            error_msg = f\"Error processing TOOL_CALL: {e}\"\n", + "            messages.append({\"role\": \"assistant\", \"content\": error_msg})\n", + "            return error_msg\n", + "    else:\n", + "        messages.append({\"role\": \"assistant\", \"content\": response})\n", + "        return response" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea82f8f6-c321-4fbc-81ee-a508b087d53b", + "metadata": {}, + "outputs": [], + "source": [ + "def analyze_image(index_or_range, prompt):\n", + "    \"\"\"Analyze specific image(s) using LLaVA\"\"\"\n", + "    global image_list\n", + "    \n", + "    # Handle single index or range\n", + "    if isinstance(index_or_range, int):\n", + "        images = [image_list[index_or_range]]\n", + "    elif isinstance(index_or_range, list):\n", + "        images = [image_list[i] for i in index_or_range]\n", + "    else:\n", + "        return \"Invalid image index/range specified.\"\n", + "    \n", + "    if not images:\n", + "        return \"No images available for analysis.\"\n", + "    \n", + "    full_prompt = (\n", + "        \"Describe the image clearly for a visually impaired user. \"\n", + "        \"Be detailed about objects, people, colors, spatial relationships, \"\n", + "        \"and any important context. 
\"\n", + " f\"User's specific request: {prompt}\"\n", + " )\n", + " \n", + " output = \"\"\n", + " try:\n", + " for chunk in ollama.generate(\n", + " model='llava:7b-v1.6',\n", + " prompt=full_prompt,\n", + " images=images,\n", + " stream=True\n", + " ):\n", + " output += chunk.get('response', \"\")\n", + " except Exception as e:\n", + " return f\"Error analyzing image: {e}\"\n", + " \n", + " return output\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2040b020-8944-409b-8ebb-10d7ffef1748", + "metadata": {}, + "outputs": [], + "source": [ + "image_list.clear\n", + "for i in range(5):\n", + " chat_loop()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2c7c40d7-df9d-464a-89da-1c6fe613c31d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/week1/community-contributions/llava-For-Image-week1.ipynb b/week1/community-contributions/llava-For-Image-week1.ipynb new file mode 100644 index 0000000..616c7e0 --- /dev/null +++ b/week1/community-contributions/llava-For-Image-week1.ipynb @@ -0,0 +1,214 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "f97c7598-f571-4ea1-838c-e9158f729c3e", + "metadata": {}, + "outputs": [], + "source": [ + "import ollama\n", + "import base64\n", + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9fc1393c-f0b1-4982-94a2-bfd502e85b23", + "metadata": {}, + "outputs": [], + "source": [ + "def encode_image(image_path):\n", + " with open(image_path, 'rb') as f:\n", + " return base64.b64encode(f.read()).decode('utf-8')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53cca1fa-6db2-4fe4-8990-ffd98423964a", + "metadata": {}, + "outputs": [], + "source": [ + "# image_path = r\"C:\\Users\\LAKSHYA\\OneDrive\\Pictures\\Camera Roll\\WIN_20250614_02_46_47_Pro.jpg\"\n", + "# image_base64 = encode_image(image_path)\n", + "# print(image_base64[:100]) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71146ccf-25af-48d3-8068-ee3c9008cebf", + "metadata": {}, + "outputs": [], + "source": [ + "image_list = []" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f8801a8-0c30-4199-a334-587096e6edeb", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee3c5d82-e530-40f5-901a-681421f21d1e", + "metadata": {}, + "outputs": [], + "source": [ + "def put_image():\n", + " global image_list\n", + " user_input_image = input(\"Enter image path or press enter to skip: \").strip()\n", + " \n", + " if not user_input_image:\n", + " print(\"No image inserted\")\n", + " return image_list\n", + "\n", + " image_path = os.path.normpath(user_input_image)\n", + " \n", + " if not os.path.exists(image_path):\n", + " print(\"Image path not found! 
Try again or enter to leave blank\")\n", + " return put_image() # Continue to allow more inputs\n", + " \n", + "\n", + "\n", + "\n", + " \n", + " image_base64 = encode_image(image_path)\n", + " image_list.append(image_base64)\n", + " \n", + " # Detect file extension for MIME type\n", + " # ext = os.path.splitext(image_path)[-1].lower()\n", + " # mime_type = 'image/jpeg' if ext in ['.jpg', '.jpeg'] else 'image/png' # Extend if needed\n", + "\n", + "\n", + " return image_list\n", + " \n", + " # return f\"data:{mime_type};base64,{image_base64[:100]}\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "032f1abb-ca6c-4f03-bda1-1a0a62f2ec43", + "metadata": {}, + "outputs": [], + "source": [ + "prompt= (\"System prompt: (You are a compassionate and intelligent visual assistant designed to help people who are blind or visually impaired. \"\n", + " \"Your job is to look at an image and describe it in a way that helps the user understand the scene clearly. \"\n", + " \"Use simple, descriptive language and avoid technical terms. Describe what is happening in the image, people's body language, clothing, facial expressions, objects, and surroundings. \"\n", + " \"Be vivid and precise, as if you are painting a picture with words. \"\n", + " \"Also, take into account any personal instructions or questions provided by the user—such as describing a specific person, activity, or object. \"\n", + " \"If the user includes a specific prompt, prioritize that in your description.)\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29494db0-4770-4689-9904-8eebc4390e7c", + "metadata": {}, + "outputs": [], + "source": [ + "def put_prompt():\n", + " global prompt\n", + " user_input = input(\"Put new prompt: \")\n", + " if not user_input:\n", + " print(\"please enter a prompt\")\n", + " return put_prompt()\n", + " prompt += \"\\nUser: \" + user_input\n", + " return prompt\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d286369c-e6ef-4a20-a3a8-3563af28940a", + "metadata": {}, + "outputs": [], + "source": [ + "def image_description():\n", + " global prompt\n", + "\n", + " put_image()\n", + " if not image_list: \n", + " return \"No images available. 
Skipping...\"\n", + "\n", + " user_prompt = put_prompt()\n", + " full_answer = \"\"\n", + "\n", + " for chunk in ollama.generate(\n", + " model='llava:7b-v1.6',\n", + " prompt=user_prompt,\n", + " images=image_list,\n", + " stream=True\n", + " ):\n", + " content = chunk.get(\"response\", \"\")\n", + " print(\"\\n\\n Final Answer:\",content, end=\"\", flush=True) # Live stream to console\n", + " full_answer += content\n", + "\n", + " prompt += \"\\nUser: \" + user_prompt + \"\\nAssistant: \" + full_answer\n", + " return full_answer\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cbda35a3-45ed-4509-ab41-6827eacd922c", + "metadata": {}, + "outputs": [], + "source": [ + "def call_llava():\n", + " image_list.clear()\n", + " for i in range(5):\n", + " print(f\"\\n Iteration {i+1}\")\n", + " answer = image_description()\n", + " print(\"\\n\\n Final Answer:\", answer)\n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15518865-6c59-4029-bc2d-42d313eb78bc", + "metadata": {}, + "outputs": [], + "source": [ + "call_llava()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c77bd493-f893-402e-b4e3-64854e9d2e19", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/week2/community-contributions/llava-week2-ChainForRealTimeCaptionGeneration.ipynb b/week2/community-contributions/llava-week2-ChainForRealTimeCaptionGeneration.ipynb new file mode 100644 index 0000000..26e30e3 --- /dev/null +++ b/week2/community-contributions/llava-week2-ChainForRealTimeCaptionGeneration.ipynb @@ -0,0 +1,433 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "f97c7598-f571-4ea1-838c-e9158f729c3e", + "metadata": {}, + "outputs": [], + "source": [ + "import ollama\n", + "import base64\n", + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9fc1393c-f0b1-4982-94a2-bfd502e85b23", + "metadata": {}, + "outputs": [], + "source": [ + "def encode_image(image_path):\n", + " with open(image_path, 'rb') as f:\n", + " return base64.b64encode(f.read()).decode('utf-8')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53cca1fa-6db2-4fe4-8990-ffd98423964a", + "metadata": {}, + "outputs": [], + "source": [ + "# image_path = r\"C:\\Users\\LAKSHYA\\OneDrive\\Pictures\\Camera Roll\\WIN_20250614_02_46_47_Pro.jpg\"\n", + "# image_base64 = encode_image(image_path)\n", + "# print(image_base64[:100]) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71146ccf-25af-48d3-8068-ee3c9008cebf", + "metadata": {}, + "outputs": [], + "source": [ + "image_list = []" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f8801a8-0c30-4199-a334-587096e6edeb", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee3c5d82-e530-40f5-901a-681421f21d1e", + "metadata": {}, + "outputs": [], + "source": [ + "def put_image():\n", + " global image_list\n", + " user_input_image = input(\"Enter image path or press enter to skip: \").strip()\n", + " \n", + " if not 
user_input_image:\n", + " print(\"No image inserted\")\n", + " return image_list\n", + "\n", + " image_path = os.path.normpath(user_input_image)\n", + " \n", + " if not os.path.exists(image_path):\n", + " print(\"Image path not found! Try again or enter to leave blank\")\n", + " return put_image() # Continue to allow more inputs\n", + " \n", + "\n", + "\n", + "\n", + " \n", + " image_base64 = encode_image(image_path)\n", + " image_list.append(image_base64)\n", + " \n", + " # Detect file extension for MIME type\n", + " # ext = os.path.splitext(image_path)[-1].lower()\n", + " # mime_type = 'image/jpeg' if ext in ['.jpg', '.jpeg'] else 'image/png' # Extend if needed\n", + "\n", + "\n", + " return image_list\n", + " \n", + " # return f\"data:{mime_type};base64,{image_base64[:100]}\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "032f1abb-ca6c-4f03-bda1-1a0a62f2ec43", + "metadata": {}, + "outputs": [], + "source": [ + "prompt= (\"System prompt: (You are a compassionate and intelligent visual assistant designed to help people who are blind or visually impaired. \"\n", + " \"Your job is to look at an image and describe it in a way that helps the user understand the scene clearly. \"\n", + " \"Use simple, descriptive language and avoid technical terms. Describe what is happening in the image, people's body language, clothing, facial expressions, objects, and surroundings. \"\n", + " \"Be vivid and precise, as if you are painting a picture with words. \"\n", + " \"Also, take into account any personal instructions or questions provided by the user—such as describing a specific person, activity, or object. \"\n", + " \"If the user includes a specific prompt, prioritize that in your description.)\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29494db0-4770-4689-9904-8eebc4390e7c", + "metadata": {}, + "outputs": [], + "source": [ + "def put_prompt():\n", + " global prompt\n", + " user_input = input(\"Put new prompt: \")\n", + " if not user_input:\n", + " print(\"please enter a prompt\")\n", + " return put_prompt()\n", + " prompt += \"\\nUser: \" + user_input\n", + " return prompt\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d286369c-e6ef-4a20-a3a8-3563af28940a", + "metadata": {}, + "outputs": [], + "source": [ + "def image_description():\n", + " global prompt\n", + "\n", + " put_image()\n", + " if not image_list: \n", + " return \"No images available. 
Skipping...\"\n", + "\n", + "    user_prompt = put_prompt()\n", + "    full_answer = \"\"\n", + "\n", + "    for chunk in ollama.generate(\n", + "        model='llava:7b-v1.6',\n", + "        prompt=user_prompt,\n", + "        images=image_list,\n", + "        stream=True\n", + "    ):\n", + "        content = chunk.get(\"response\", \"\")\n", + "        print(content, end=\"\", flush=True) # Stream each chunk to the console as it arrives\n", + "        full_answer += content\n", + "\n", + "    prompt += \"\\nUser: \" + user_prompt + \"\\nAssistant: \" + full_answer\n", + "    return full_answer\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cbda35a3-45ed-4509-ab41-6827eacd922c", + "metadata": {}, + "outputs": [], + "source": [ + "def call_llava():\n", + "    image_list.clear()\n", + "    for i in range(5):\n", + "        print(f\"\\n Iteration {i+1}\")\n", + "        answer = image_description()\n", + "        print(\"\\n\\n Final Answer:\", answer)\n", + "    \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15518865-6c59-4029-bc2d-42d313eb78bc", + "metadata": {}, + "outputs": [], + "source": [ + "call_llava()" + ] + }, + { + "cell_type": "markdown", + "id": "23de3b59-3699-4270-9392-99fccdede83e", + "metadata": {}, + "source": [ + "# Second week of practice on the personal project: making the model faster and smarter by using tools\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d44c59e-5eb7-4b00-9489-e05d7c8c3eda", + "metadata": {}, + "outputs": [], + "source": [ + "messages = []\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "061ea026-d4c6-4d6c-bb9b-f6430de9f5af", + "metadata": {}, + "outputs": [], + "source": [ + "system_content = (\n", + "    \"You are a helpful assistant for visually impaired users. \"\n", + "    \"You are capable of answering questions directly or calling a function to analyze an image if needed. \"\n", + "    \"There is a list of images available, indexed from 0. \"\n", + "    \"When a user asks a question, first determine whether any image in the list is needed to answer. 
\"\n", + " \"If yes, reply in this structured format:\\n\\n\"\n", + " \"TOOL_CALL: analyze_image(, prompt='')\\n\\n\"\n", + " \"If image is not needed, just answer the user directly in plain natural language.\\n\"\n", + " \"Be clear and use descriptive but accessible language suitable for blind users.\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f859450-eb3e-4e6c-9602-84f91f5ffda7", + "metadata": {}, + "outputs": [], + "source": [ + "messages.append({\"role\":\"system\",\"content\":system_content})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a8009b75-3468-4694-887d-6cd5132c2907", + "metadata": {}, + "outputs": [], + "source": [ + "def chat_loop():\n", + " \"\"\"Main chat interaction loop (single-turn version)\"\"\"\n", + " global image_list, messages\n", + " \n", + " print(\"\\n\" + \"=\"*50)\n", + " print(\"LLaVA Assistant for Visually Impaired Users\")\n", + " print(\"=\"*50 + \"\\n\")\n", + " \n", + " # Step 1: Load images\n", + " print(\"Step 1: Add images (optional)\")\n", + " put_image()\n", + " messages.append({\n", + " \"role\": \"system\", \n", + " \"content\": f\"There are {len(image_list)} images available (index 0-{len(image_list)-1}).\"\n", + " })\n", + " \n", + " # Step 2: Single chat interaction\n", + " print(\"\\nStep 2: Ask a question about the images\")\n", + " user_content = put_prompt()\n", + " messages.append({\"role\": \"user\", \"content\": user_content})\n", + " \n", + " # Get model response\n", + " try:\n", + " response = ollama.chat(\n", + " model='llava:7b-v1.6',\n", + " messages=messages\n", + " )[\"message\"][\"content\"]\n", + " print(\"assistant: \",response) \n", + " processed_response = process_response(response)\n", + " print(f\"\\nASSISTANT: {processed_response}\\n\")\n", + " \n", + " except Exception as e:\n", + " print(f\"Error occurred: {e}\")\n", + " \n", + " print(\"\\nSession ended. 
Goodbye!\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a3b3ff73-3cd5-4e5a-a37e-aaa8b325613c", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee2de6d7-a0bf-45fc-8d5c-98e0055519b0", + "metadata": {}, + "outputs": [], + "source": [ + "import re  # used below to parse the model's TOOL_CALL replies\n", + "\n", + "def process_response(response):\n", + "    \"\"\"Process the model's response and handle tool calls\"\"\"\n", + "    if response.strip().startswith(\"TOOL_CALL:\"):\n", + "        # Extract image index/range and prompt from TOOL_CALL\n", + "        pattern = r\"TOOL_CALL:\\s*analyze_image\\((.*?)\\s*,\\s*prompt='(.*?)'\\)\"\n", + "        match = re.search(pattern, response, re.DOTALL)\n", + "        \n", + "        if not match:\n", + "            error_msg = \"Error: Invalid TOOL_CALL format.\"\n", + "            messages.append({\"role\": \"assistant\", \"content\": error_msg})\n", + "            return error_msg\n", + "        \n", + "        image_expr = match.group(1).strip()\n", + "        prompt = match.group(2).strip()\n", + "        \n", + "        try:\n", + "            # Handle different index formats\n", + "            if \":\" in image_expr: # Range (e.g., \"1:3\")\n", + "                start, end = map(int, image_expr.split(\":\"))\n", + "                index_or_range = list(range(start, end))\n", + "            else: # Single index\n", + "                index_or_range = int(image_expr)\n", + "            \n", + "            # Validate indices\n", + "            max_index = len(image_list) - 1\n", + "            if isinstance(index_or_range, list):\n", + "                if any(i < 0 or i > max_index for i in index_or_range):\n", + "                    error_msg = f\"Error: Image index out of range (0-{max_index}).\"\n", + "                    messages.append({\"role\": \"assistant\", \"content\": error_msg})\n", + "                    return error_msg\n", + "            elif index_or_range < 0 or index_or_range > max_index:\n", + "                error_msg = f\"Error: Image index out of range (0-{max_index}).\"\n", + "                messages.append({\"role\": \"assistant\", \"content\": error_msg})\n", + "                return error_msg\n", + "            \n", + "            # Perform analysis\n", + "            result = analyze_image(index_or_range, prompt)\n", + "            print(\"function called\")\n", + "            messages.append({\n", + "                \"role\": \"function\",\n", + "                \"name\": \"analyze_image\",\n", + "                \"content\": result\n", + "            })\n", + "            \n", + "            # Return formatted result\n", + "            formatted_result = f\"\\nIMAGE ANALYSIS RESULT:\\n{result}\"\n", + "            return formatted_result\n", + "\n", + "        except Exception as e:\n", + "            error_msg = f\"Error processing TOOL_CALL: {e}\"\n", + "            messages.append({\"role\": \"assistant\", \"content\": error_msg})\n", + "            return error_msg\n", + "    else:\n", + "        messages.append({\"role\": \"assistant\", \"content\": response})\n", + "        return response" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea82f8f6-c321-4fbc-81ee-a508b087d53b", + "metadata": {}, + "outputs": [], + "source": [ + "def analyze_image(index_or_range, prompt):\n", + "    \"\"\"Analyze specific image(s) using LLaVA\"\"\"\n", + "    global image_list\n", + "    \n", + "    # Handle single index or range\n", + "    if isinstance(index_or_range, int):\n", + "        images = [image_list[index_or_range]]\n", + "    elif isinstance(index_or_range, list):\n", + "        images = [image_list[i] for i in index_or_range]\n", + "    else:\n", + "        return \"Invalid image index/range specified.\"\n", + "    \n", + "    if not images:\n", + "        return \"No images available for analysis.\"\n", + "    \n", + "    full_prompt = (\n", + "        \"Describe the image clearly for a visually impaired user. \"\n", + "        \"Be detailed about objects, people, colors, spatial relationships, \"\n", + "        \"and any important context. \"\n", + "        f\"User's specific request: {prompt}\"\n", + "    )\n", + "    \n", + "    output = \"\"\n", + "    try:\n", + "        for chunk in ollama.generate(\n", + "            model='llava:7b-v1.6',\n", + "            prompt=full_prompt,\n", + "            images=images,\n", + "            stream=True\n", + "        ):\n", + "            output += chunk.get('response', \"\")\n", + "    except Exception as e:\n", + "        return f\"Error analyzing image: {e}\"\n", + "    \n", + "    return output\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2040b020-8944-409b-8ebb-10d7ffef1748", + "metadata": {}, + "outputs": [], + "source": [ + "image_list.clear()  # reset stored images before starting the chat sessions\n", + "for i in range(5):\n", + "    chat_loop()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2c7c40d7-df9d-464a-89da-1c6fe613c31d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}