{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "f97c7598-f571-4ea1-838c-e9158f729c3e",
"metadata": {},
"outputs": [],
"source": [
"import ollama\n",
"import base64\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9fc1393c-f0b1-4982-94a2-bfd502e85b23",
"metadata": {},
"outputs": [],
"source": [
"def encode_image(image_path):\n",
" with open(image_path, 'rb') as f:\n",
" return base64.b64encode(f.read()).decode('utf-8')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "53cca1fa-6db2-4fe4-8990-ffd98423964a",
"metadata": {},
"outputs": [],
"source": [
"# image_path = r\"C:\\Users\\LAKSHYA\\OneDrive\\Pictures\\Camera Roll\\WIN_20250614_02_46_47_Pro.jpg\"\n",
"# image_base64 = encode_image(image_path)\n",
"# print(image_base64[:100]) "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "71146ccf-25af-48d3-8068-ee3c9008cebf",
"metadata": {},
"outputs": [],
"source": [
"image_list = []"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ee3c5d82-e530-40f5-901a-681421f21d1e",
"metadata": {},
"outputs": [],
"source": [
"def put_image():\n",
" global image_list\n",
" user_input_image = input(\"Enter image path or press enter to skip: \").strip()\n",
" \n",
" if not user_input_image:\n",
" print(\"No image inserted\")\n",
" return image_list\n",
"\n",
" image_path = os.path.normpath(user_input_image)\n",
" \n",
" if not os.path.exists(image_path):\n",
" print(\"Image path not found! Try again or enter to leave blank\")\n",
" return put_image() # Continue to allow more inputs\n",
" \n",
"\n",
"\n",
"\n",
" \n",
" image_base64 = encode_image(image_path)\n",
" image_list.append(image_base64)\n",
" \n",
" # Detect file extension for MIME type\n",
" # ext = os.path.splitext(image_path)[-1].lower()\n",
" # mime_type = 'image/jpeg' if ext in ['.jpg', '.jpeg'] else 'image/png' # Extend if needed\n",
"\n",
"\n",
" return image_list\n",
" \n",
" # return f\"data:{mime_type};base64,{image_base64[:100]}\"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "032f1abb-ca6c-4f03-bda1-1a0a62f2ec43",
"metadata": {},
"outputs": [],
"source": [
"prompt= (\"System prompt: (You are a compassionate and intelligent visual assistant designed to help people who are blind or visually impaired. \"\n",
" \"Your job is to look at an image and describe it in a way that helps the user understand the scene clearly. \"\n",
" \"Use simple, descriptive language and avoid technical terms. Describe what is happening in the image, people's body language, clothing, facial expressions, objects, and surroundings. \"\n",
" \"Be vivid and precise, as if you are painting a picture with words. \"\n",
" \"Also, take into account any personal instructions or questions provided by the user—such as describing a specific person, activity, or object. \"\n",
" \"If the user includes a specific prompt, prioritize that in your description.)\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "29494db0-4770-4689-9904-8eebc4390e7c",
"metadata": {},
"outputs": [],
"source": [
"def put_prompt():\n",
" global prompt\n",
" user_input = input(\"Put new prompt: \")\n",
" if not user_input:\n",
" print(\"please enter a prompt\")\n",
" return put_prompt()\n",
" prompt += \"\\nUser: \" + user_input\n",
" return prompt\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d286369c-e6ef-4a20-a3a8-3563af28940a",
"metadata": {},
"outputs": [],
"source": [
"def image_description():\n",
" global prompt\n",
"\n",
" put_image()\n",
" if not image_list: \n",
" return \"No images available. Skipping...\"\n",
"\n",
" user_prompt = put_prompt()\n",
" full_answer = \"\"\n",
"\n",
" for chunk in ollama.generate(\n",
" model='llava:7b-v1.6',\n",
" prompt=user_prompt,\n",
" images=image_list,\n",
" stream=True\n",
" ):\n",
" content = chunk.get(\"response\", \"\")\n",
" print(\"\\n\\n Final Answer:\",content, end=\"\", flush=True) # Live stream to console\n",
" full_answer += content\n",
"\n",
" prompt += \"\\nUser: \" + user_prompt + \"\\nAssistant: \" + full_answer\n",
" return full_answer\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cbda35a3-45ed-4509-ab41-6827eacd922c",
"metadata": {},
"outputs": [],
"source": [
"def call_llava():\n",
" image_list.clear()\n",
" for i in range(5):\n",
" print(f\"\\n Iteration {i+1}\")\n",
" answer = image_description()\n",
" print(\"\\n\\n Final Answer:\", answer)\n",
" \n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "15518865-6c59-4029-bc2d-42d313eb78bc",
"metadata": {},
"outputs": [],
"source": [
"call_llava()"
]
},
{
"cell_type": "markdown",
"id": "23de3b59-3699-4270-9392-99fccdede83e",
"metadata": {},
"source": [
"# second week practice on personal project making model faster and smarter by using tools\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9d44c59e-5eb7-4b00-9489-e05d7c8c3eda",
"metadata": {},
"outputs": [],
"source": [
"messages = []\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "061ea026-d4c6-4d6c-bb9b-f6430de9f5af",
"metadata": {},
"outputs": [],
"source": [
"system_content = (\n",
" \"You are a helpful assistant for visually impaired users. \"\n",
" \"You are capable of answering questions directly or calling a function to analyze an image if needed. \"\n",
" \"There is a list of images available, indexed from 0. \"\n",
" \"When a user asks a question, first determine whether any image in the list is needed to answer. \"\n",
" \"If yes, reply in this structured format:\\n\\n\"\n",
" \"TOOL_CALL: analyze_image(<image_index_or_range>, prompt='<description_request>')\\n\\n\"\n",
" \"If image is not needed, just answer the user directly in plain natural language.\\n\"\n",
" \"Be clear and use descriptive but accessible language suitable for blind users.\"\n",
")"
]
},
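{
"cell_type": "markdown",
"id": "5a1d0c2e-7b34-4f6d-9e21-0c8b4a6f3d17",
"metadata": {},
"source": [
"A quick sanity check (an illustrative addition, not part of the original flow): the next cell runs a hand-written reply in the `TOOL_CALL` format through the same regex that `process_response` uses later, so you can confirm the structured format described in `system_content` actually parses. The sample string is an assumption for demonstration."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6b2e1d3f-8c45-4a7e-af32-1d9c5b7e4f28",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical model reply that follows the TOOL_CALL format from system_content\n",
"sample_response = \"TOOL_CALL: analyze_image(0, prompt='Describe the person in the photo')\"\n",
"\n",
"# Same pattern used in process_response below\n",
"pattern = r\"TOOL_CALL:\\s*analyze_image\\((.*?)\\s*,\\s*prompt='(.*?)'\\)\"\n",
"match = re.search(pattern, sample_response, re.DOTALL)\n",
"if match:\n",
"    print(\"index/range:\", match.group(1).strip())  # expected: 0\n",
"    print(\"prompt:\", match.group(2).strip())       # expected: Describe the person in the photo\n",
"else:\n",
"    print(\"No TOOL_CALL found\")"
]
},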
{
"cell_type": "code",
"execution_count": null,
"id": "2f859450-eb3e-4e6c-9602-84f91f5ffda7",
"metadata": {},
"outputs": [],
"source": [
"messages.append({\"role\":\"system\",\"content\":system_content})"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a8009b75-3468-4694-887d-6cd5132c2907",
"metadata": {},
"outputs": [],
"source": [
"def chat_loop():\n",
" \"\"\"Main chat interaction loop (single-turn version)\"\"\"\n",
" global image_list, messages\n",
" \n",
" print(\"\\n\" + \"=\"*50)\n",
" print(\"LLaVA Assistant for Visually Impaired Users\")\n",
" print(\"=\"*50 + \"\\n\")\n",
" \n",
" # Step 1: Load images\n",
" print(\"Step 1: Add images (optional)\")\n",
" put_image()\n",
" messages.append({\n",
" \"role\": \"system\", \n",
" \"content\": f\"There are {len(image_list)} images available (index 0-{len(image_list)-1}).\"\n",
" })\n",
" \n",
" # Step 2: Single chat interaction\n",
" print(\"\\nStep 2: Ask a question about the images\")\n",
" user_content = put_prompt()\n",
" messages.append({\"role\": \"user\", \"content\": user_content})\n",
" \n",
" # Get model response\n",
" try:\n",
" response = ollama.chat(\n",
" model='llava:7b-v1.6',\n",
" messages=messages\n",
" )[\"message\"][\"content\"]\n",
" print(\"assistant: \",response) \n",
" processed_response = process_response(response)\n",
" print(f\"\\nASSISTANT: {processed_response}\\n\")\n",
" \n",
" except Exception as e:\n",
" print(f\"Error occurred: {e}\")\n",
" \n",
" print(\"\\nSession ended. Goodbye!\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ee2de6d7-a0bf-45fc-8d5c-98e0055519b0",
"metadata": {},
"outputs": [],
"source": [
"def process_response(response):\n",
" \"\"\"Process the model's response and handle tool calls\"\"\"\n",
" if response.strip().startswith(\"TOOL_CALL:\"):\n",
" # Extract image index/range and prompt from TOOL_CALL\n",
" pattern = r\"TOOL_CALL:\\s*analyze_image\\((.*?)\\s*,\\s*prompt='(.*?)'\\)\"\n",
" match = re.search(pattern, response, re.DOTALL)\n",
" \n",
" if not match:\n",
" error_msg = \"Error: Invalid TOOL_CALL format.\"\n",
" messages.append({\"role\": \"assistant\", \"content\": error_msg})\n",
" return error_msg\n",
" \n",
" image_expr = match.group(1).strip()\n",
" prompt = match.group(2).strip()\n",
" \n",
" try:\n",
" # Handle different index formats\n",
" if \":\" in image_expr: # Range (e.g., \"1:3\")\n",
" start, end = map(int, image_expr.split(\":\"))\n",
" index_or_range = list(range(start, end))\n",
" else: # Single index\n",
" index_or_range = int(image_expr)\n",
" \n",
" # Validate indices\n",
" max_index = len(image_list) - 1\n",
" if isinstance(index_or_range, list):\n",
" if any(i < 0 or i > max_index for i in index_or_range):\n",
" error_msg = f\"Error: Image index out of range (0-{max_index}).\"\n",
" messages.append({\"role\": \"assistant\", \"content\": error_msg})\n",
" return error_msg\n",
" elif index_or_range < 0 or index_or_range > max_index:\n",
" error_msg = f\"Error: Image index out of range (0-{max_index}).\"\n",
" messages.append({\"role\": \"assistant\", \"content\": error_msg})\n",
" return error_msg\n",
" \n",
" # Perform analysis\n",
" result = analyze_image(index_or_range, prompt)\n",
" print(\"funtion called\")\n",
" messages.append({\n",
" \"role\": \"function\",\n",
" \"name\": \"analyze_image\",\n",
" \"content\": result\n",
" })\n",
" \n",
" # Return formatted result\n",
" formatted_result = f\"\\nIMAGE ANALYSIS RESULT:\\n{result}\"\n",
" return formatted_result\n",
"\n",
" except Exception as e:\n",
" error_msg = f\"Error processing TOOL_CALL: {e}\"\n",
" messages.append({\"role\": \"assistant\", \"content\": error_msg})\n",
" return error_msg\n",
" else:\n",
" messages.append({\"role\": \"assistant\", \"content\": response})\n",
" return response"
]
},
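{
"cell_type": "markdown",
"id": "7c3f2e4a-9d56-4b8f-b043-2ead6c8f5a39",
"metadata": {},
"source": [
"Another small illustrative check (an assumption of mine, not in the original notebook): a reply without the `TOOL_CALL:` prefix should take the plain-text branch of `process_response`, be recorded in `messages`, and come back unchanged. The demo pops its entry afterwards so the real chat history stays clean."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8d4a3f5b-ae67-4c90-9154-3fbe7d9a6b40",
"metadata": {},
"outputs": [],
"source": [
"# No TOOL_CALL prefix, so the else-branch runs\n",
"demo = process_response(\"The question can be answered without looking at any image.\")\n",
"print(\"returned:\", demo)\n",
"print(\"last message role:\", messages[-1][\"role\"])  # expected: assistant\n",
"messages.pop()  # undo the demo entry so it doesn't pollute the conversation"
]
},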
{
"cell_type": "code",
"execution_count": null,
"id": "ea82f8f6-c321-4fbc-81ee-a508b087d53b",
"metadata": {},
"outputs": [],
"source": [
"def analyze_image(index_or_range, prompt):\n",
" \"\"\"Analyze specific image(s) using LLaVA\"\"\"\n",
" global image_list\n",
" \n",
" # Handle single index or range\n",
" if isinstance(index_or_range, int):\n",
" images = [image_list[index_or_range]]\n",
" elif isinstance(index_or_range, list):\n",
" images = [image_list[i] for i in index_or_range]\n",
" else:\n",
" return \"Invalid image index/range specified.\"\n",
" \n",
" if not images:\n",
" return \"No images available for analysis.\"\n",
" \n",
" full_prompt = (\n",
" \"Describe the image clearly for a visually impaired user. \"\n",
" \"Be detailed about objects, people, colors, spatial relationships, \"\n",
" \"and any important context. \"\n",
" f\"User's specific request: {prompt}\"\n",
" )\n",
" \n",
" output = \"\"\n",
" try:\n",
" for chunk in ollama.generate(\n",
" model='llava:7b-v1.6',\n",
" prompt=full_prompt,\n",
" images=images,\n",
" stream=True\n",
" ):\n",
" output += chunk.get('response', \"\")\n",
" except Exception as e:\n",
" return f\"Error analyzing image: {e}\"\n",
" \n",
" return output\n"
]
},
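{
"cell_type": "markdown",
"id": "9e5b4a6c-bf78-4da1-8265-4acf8eab7c51",
"metadata": {},
"source": [
"A direct usage sketch for `analyze_image` (illustrative, assuming at least one image has been loaded via `put_image()`): it bypasses the chat layer entirely, which is handy for debugging the vision model on its own."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "af6c5b7d-c089-4eb2-9376-5bdf9fbc8d62",
"metadata": {},
"outputs": [],
"source": [
"# Only runs if an image is already loaded; prints the first part of the description\n",
"if image_list:\n",
"    print(analyze_image(0, \"What is in the foreground?\")[:300])\n",
"else:\n",
"    print(\"Load an image with put_image() first.\")"
]
},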
{
"cell_type": "code",
"execution_count": null,
"id": "2040b020-8944-409b-8ebb-10d7ffef1748",
"metadata": {},
"outputs": [],
"source": [
"image_list.clear\n",
"for i in range(5):\n",
" chat_loop()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}