{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "f97c7598-f571-4ea1-838c-e9158f729c3e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import ollama\n",
    "import base64\n",
    "import os\n",
    "import re  # used by process_response to parse TOOL_CALL replies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "9fc1393c-f0b1-4982-94a2-bfd502e85b23",
   "metadata": {},
   "outputs": [],
   "source": [
    "def encode_image(image_path):\n",
    "    # Read an image file and return its contents base64-encoded as a UTF-8 string\n",
    "    with open(image_path, 'rb') as f:\n",
    "        return base64.b64encode(f.read()).decode('utf-8')"
   ]
  },
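  {
   "cell_type": "markdown",
   "id": "1a2b3c4d-0e5f-4a6b-8c7d-9e0f1a2b3c4d",
   "metadata": {},
   "source": [
    "The commented-out lines in `put_image` further down hint at detecting the MIME type to build a `data:` URI. Here is a minimal sketch of that idea using the standard-library `mimetypes` module; the `to_data_uri` helper is hypothetical and not used by the rest of the notebook:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2b3c4d5e-1f6a-4b7c-9d8e-0f1a2b3c4d5e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import mimetypes\n",
    "\n",
    "def to_data_uri(image_path):\n",
    "    # Hypothetical helper: wrap encode_image's output in a data: URI\n",
    "    mime_type, _ = mimetypes.guess_type(image_path)  # e.g. 'image/jpeg' for .jpg\n",
    "    mime_type = mime_type or 'application/octet-stream'  # fallback for unknown extensions\n",
    "    return f\"data:{mime_type};base64,{encode_image(image_path)}\""
   ]
  },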
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "53cca1fa-6db2-4fe4-8990-ffd98423964a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# image_path = r\"C:\\Users\\LAKSHYA\\OneDrive\\Pictures\\Camera Roll\\WIN_20250614_02_46_47_Pro.jpg\"\n",
    "# image_base64 = encode_image(image_path)\n",
    "# print(image_base64[:100])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "71146ccf-25af-48d3-8068-ee3c9008cebf",
   "metadata": {},
   "outputs": [],
   "source": [
    "image_list = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6f8801a8-0c30-4199-a334-587096e6edeb",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "ee3c5d82-e530-40f5-901a-681421f21d1e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def put_image():\n",
    "    global image_list\n",
    "    user_input_image = input(\"Enter image path or press enter to skip: \").strip()\n",
    "\n",
    "    if not user_input_image:\n",
    "        print(\"No image inserted\")\n",
    "        return image_list\n",
    "\n",
    "    image_path = os.path.normpath(user_input_image)\n",
    "\n",
    "    if not os.path.exists(image_path):\n",
    "        print(\"Image path not found! Try again or press Enter to leave blank\")\n",
    "        return put_image()  # Recurse so the user can retry the path\n",
    "\n",
    "    image_base64 = encode_image(image_path)\n",
    "    image_list.append(image_base64)\n",
    "\n",
    "    # Detect file extension for MIME type\n",
    "    # ext = os.path.splitext(image_path)[-1].lower()\n",
    "    # mime_type = 'image/jpeg' if ext in ['.jpg', '.jpeg'] else 'image/png'  # Extend if needed\n",
    "\n",
    "    return image_list\n",
    "\n",
    "    # return f\"data:{mime_type};base64,{image_base64[:100]}\"\n"
   ]
  },
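  {
   "cell_type": "markdown",
   "id": "3c4d5e6f-2a7b-4c8d-9e0f-1a2b3c4d5e6f",
   "metadata": {},
   "source": [
    "`put_image` retries through recursion, which grows the call stack on every mistyped path. The same logic can be written as a plain loop, sketched below for comparison; `put_image_iterative` is a hypothetical variant, and the cells below keep using `put_image`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4d5e6f7a-3b8c-4d9e-a0f1-2b3c4d5e6f7a",
   "metadata": {},
   "outputs": [],
   "source": [
    "def put_image_iterative():\n",
    "    # Loop-based variant of put_image: same prompts, no recursion\n",
    "    global image_list\n",
    "    while True:\n",
    "        user_input_image = input(\"Enter image path or press enter to skip: \").strip()\n",
    "        if not user_input_image:\n",
    "            print(\"No image inserted\")\n",
    "            return image_list\n",
    "        image_path = os.path.normpath(user_input_image)\n",
    "        if os.path.exists(image_path):\n",
    "            image_list.append(encode_image(image_path))\n",
    "            return image_list\n",
    "        print(\"Image path not found! Try again or press Enter to leave blank\")"
   ]
  },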
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "032f1abb-ca6c-4f03-bda1-1a0a62f2ec43",
   "metadata": {},
   "outputs": [],
   "source": [
    "prompt = (\"System prompt: (You are a compassionate and intelligent visual assistant designed to help people who are blind or visually impaired. \"\n",
    "          \"Your job is to look at an image and describe it in a way that helps the user understand the scene clearly. \"\n",
    "          \"Use simple, descriptive language and avoid technical terms. Describe what is happening in the image, people's body language, clothing, facial expressions, objects, and surroundings. \"\n",
    "          \"Be vivid and precise, as if you are painting a picture with words. \"\n",
    "          \"Also, take into account any personal instructions or questions provided by the user—such as describing a specific person, activity, or object. \"\n",
    "          \"If the user includes a specific prompt, prioritize that in your description.)\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "29494db0-4770-4689-9904-8eebc4390e7c",
   "metadata": {},
   "outputs": [],
   "source": [
    "def put_prompt():\n",
    "    global prompt\n",
    "    user_input = input(\"Put new prompt: \")\n",
    "    if not user_input:\n",
    "        print(\"Please enter a prompt\")\n",
    "        return put_prompt()\n",
    "    prompt += \"\\nUser: \" + user_input\n",
    "    return prompt\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "d286369c-e6ef-4a20-a3a8-3563af28940a",
   "metadata": {},
   "outputs": [],
   "source": [
    "def image_description():\n",
    "    global prompt\n",
    "\n",
    "    put_image()\n",
    "    if not image_list:\n",
    "        return \"No images available. Skipping...\"\n",
    "\n",
    "    user_prompt = put_prompt()\n",
    "    full_answer = \"\"\n",
    "\n",
    "    for chunk in ollama.generate(\n",
    "        model='llava:7b-v1.6',\n",
    "        prompt=user_prompt,\n",
    "        images=image_list,\n",
    "        stream=True\n",
    "    ):\n",
    "        content = chunk.get(\"response\", \"\")\n",
    "        print(content, end=\"\", flush=True)  # Live-stream each token to the console\n",
    "        full_answer += content\n",
    "\n",
    "    prompt += \"\\nUser: \" + user_prompt + \"\\nAssistant: \" + full_answer\n",
    "    return full_answer\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "cbda35a3-45ed-4509-ab41-6827eacd922c",
   "metadata": {},
   "outputs": [],
   "source": [
    "def call_llava():\n",
    "    image_list.clear()\n",
    "    for i in range(5):\n",
    "        print(f\"\\n Iteration {i+1}\")\n",
    "        answer = image_description()\n",
    "        print(\"\\n\\n Final Answer:\", answer)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "15518865-6c59-4029-bc2d-42d313eb78bc",
   "metadata": {},
   "outputs": [],
   "source": [
    "call_llava()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "23de3b59-3699-4270-9392-99fccdede83e",
   "metadata": {},
   "source": [
    "# Second week of the personal project: making the model faster and smarter with tools\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "9d44c59e-5eb7-4b00-9489-e05d7c8c3eda",
   "metadata": {},
   "outputs": [],
   "source": [
    "messages = []\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "061ea026-d4c6-4d6c-bb9b-f6430de9f5af",
   "metadata": {},
   "outputs": [],
   "source": [
    "system_content = (\n",
    "    \"You are a helpful assistant for visually impaired users. \"\n",
    "    \"You are capable of answering questions directly or calling a function to analyze an image if needed. \"\n",
    "    \"There is a list of images available, indexed from 0. \"\n",
    "    \"When a user asks a question, first determine whether any image in the list is needed to answer. \"\n",
    "    \"If yes, reply in this structured format:\\n\\n\"\n",
    "    \"TOOL_CALL: analyze_image(<image_index_or_range>, prompt='<description_request>')\\n\\n\"\n",
    "    \"If no image is needed, just answer the user directly in plain natural language.\\n\"\n",
    "    \"Be clear and use descriptive but accessible language suitable for blind users.\"\n",
    ")"
   ]
  },
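  {
   "cell_type": "markdown",
   "id": "5e6f7a8b-4c9d-4e0f-b1a2-3c4d5e6f7a8b",
   "metadata": {},
   "source": [
    "A quick standalone check of the regex that `process_response` (defined later in the notebook) uses to parse this structured format. The two sample replies are made up to match what the system prompt asks for:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6f7a8b9c-5d0e-4f1a-c2b3-4d5e6f7a8b9c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "TOOL_CALL_PATTERN = r\"TOOL_CALL:\\s*analyze_image\\((.*?)\\s*,\\s*prompt='(.*?)'\\)\"\n",
    "\n",
    "# Two made-up replies in the format the system prompt requests\n",
    "for sample in [\n",
    "    \"TOOL_CALL: analyze_image(0, prompt='describe the person')\",\n",
    "    \"TOOL_CALL: analyze_image(1:3, prompt='compare these photos')\",\n",
    "]:\n",
    "    match = re.search(TOOL_CALL_PATTERN, sample, re.DOTALL)\n",
    "    print(f\"images={match.group(1).strip()!r}  prompt={match.group(2).strip()!r}\")"
   ]
  },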
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "2f859450-eb3e-4e6c-9602-84f91f5ffda7",
   "metadata": {},
   "outputs": [],
   "source": [
    "messages.append({\"role\":\"system\",\"content\":system_content})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "a8009b75-3468-4694-887d-6cd5132c2907",
   "metadata": {},
   "outputs": [],
   "source": [
    "def chat_loop():\n",
    "    \"\"\"Main chat interaction loop (single-turn version)\"\"\"\n",
    "    global image_list, messages\n",
    "\n",
    "    print(\"\\n\" + \"=\"*50)\n",
    "    print(\"LLaVA Assistant for Visually Impaired Users\")\n",
    "    print(\"=\"*50 + \"\\n\")\n",
    "\n",
    "    # Step 1: Load images\n",
    "    print(\"Step 1: Add images (optional)\")\n",
    "    put_image()\n",
    "    if image_list:\n",
    "        messages.append({\n",
    "            \"role\": \"system\",\n",
    "            \"content\": f\"There are {len(image_list)} images available (index 0-{len(image_list)-1}).\"\n",
    "        })\n",
    "    else:\n",
    "        messages.append({\"role\": \"system\", \"content\": \"There are no images available.\"})\n",
    "\n",
    "    # Step 2: Single chat interaction\n",
    "    print(\"\\nStep 2: Ask a question about the images\")\n",
    "    user_content = put_prompt()\n",
    "    messages.append({\"role\": \"user\", \"content\": user_content})\n",
    "\n",
    "    # Get model response\n",
    "    try:\n",
    "        response = ollama.chat(\n",
    "            model='llava:7b-v1.6',\n",
    "            messages=messages\n",
    "        )[\"message\"][\"content\"]\n",
    "        print(\"assistant:\", response)  # Raw model reply, before tool-call handling\n",
    "        processed_response = process_response(response)\n",
    "        print(f\"\\nASSISTANT: {processed_response}\\n\")\n",
    "\n",
    "    except Exception as e:\n",
    "        print(f\"Error occurred: {e}\")\n",
    "\n",
    "    print(\"\\nSession ended. Goodbye!\")"
   ]
  },
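  {
   "cell_type": "markdown",
   "id": "7a8b9c0d-6e1f-4a2b-8d3c-5e6f7a8b9c0d",
   "metadata": {},
   "source": [
    "The docstring above calls `chat_loop` a single-turn version. Below is a sketch of how a multi-turn variant could reuse the same helpers; `chat_loop_multi_turn` and its `max_turns` parameter are assumptions, not part of the original flow, and `process_response` must be defined (a couple of cells below) before running it:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8b9c0d1e-7f2a-4b3c-9e4d-6f7a8b9c0d1e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def chat_loop_multi_turn(max_turns=5):\n",
    "    # Hypothetical multi-turn variant: keep the conversation going until 'quit'\n",
    "    global image_list, messages\n",
    "    put_image()\n",
    "    if image_list:\n",
    "        messages.append({\n",
    "            \"role\": \"system\",\n",
    "            \"content\": f\"There are {len(image_list)} images available (index 0-{len(image_list)-1}).\"\n",
    "        })\n",
    "    for _ in range(max_turns):\n",
    "        user_content = input(\"You (or 'quit'): \").strip()\n",
    "        if not user_content or user_content.lower() == \"quit\":\n",
    "            break\n",
    "        messages.append({\"role\": \"user\", \"content\": user_content})\n",
    "        reply = ollama.chat(model='llava:7b-v1.6', messages=messages)[\"message\"][\"content\"]\n",
    "        print(f\"\\nASSISTANT: {process_response(reply)}\\n\")"
   ]
  },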
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a3b3ff73-3cd5-4e5a-a37e-aaa8b325613c",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "ee2de6d7-a0bf-45fc-8d5c-98e0055519b0",
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_response(response):\n",
    "    \"\"\"Process the model's response and handle tool calls\"\"\"\n",
    "    if response.strip().startswith(\"TOOL_CALL:\"):\n",
    "        # Extract image index/range and prompt from TOOL_CALL\n",
    "        pattern = r\"TOOL_CALL:\\s*analyze_image\\((.*?)\\s*,\\s*prompt='(.*?)'\\)\"\n",
    "        match = re.search(pattern, response, re.DOTALL)\n",
    "\n",
    "        if not match:\n",
    "            error_msg = \"Error: Invalid TOOL_CALL format.\"\n",
    "            messages.append({\"role\": \"assistant\", \"content\": error_msg})\n",
    "            return error_msg\n",
    "\n",
    "        image_expr = match.group(1).strip()\n",
    "        prompt = match.group(2).strip()\n",
    "\n",
    "        try:\n",
    "            # Handle different index formats\n",
    "            if \":\" in image_expr:  # Range (e.g., \"1:3\")\n",
    "                start, end = map(int, image_expr.split(\":\"))\n",
    "                index_or_range = list(range(start, end))\n",
    "            else:  # Single index\n",
    "                index_or_range = int(image_expr)\n",
    "\n",
    "            # Validate indices (a single index is normalized to a one-element list)\n",
    "            indices = index_or_range if isinstance(index_or_range, list) else [index_or_range]\n",
    "            max_index = len(image_list) - 1\n",
    "            if any(i < 0 or i > max_index for i in indices):\n",
    "                error_msg = f\"Error: Image index out of range (0-{max_index}).\"\n",
    "                messages.append({\"role\": \"assistant\", \"content\": error_msg})\n",
    "                return error_msg\n",
    "\n",
    "            # Perform analysis\n",
    "            result = analyze_image(index_or_range, prompt)\n",
    "            print(\"function called\")\n",
    "            messages.append({\n",
    "                \"role\": \"tool\",  # Ollama's chat roles are system/user/assistant/tool\n",
    "                \"content\": result\n",
    "            })\n",
    "\n",
    "            # Return formatted result\n",
    "            formatted_result = f\"\\nIMAGE ANALYSIS RESULT:\\n{result}\"\n",
    "            return formatted_result\n",
    "\n",
    "        except Exception as e:\n",
    "            error_msg = f\"Error processing TOOL_CALL: {e}\"\n",
    "            messages.append({\"role\": \"assistant\", \"content\": error_msg})\n",
    "            return error_msg\n",
    "    else:\n",
    "        messages.append({\"role\": \"assistant\", \"content\": response})\n",
    "        return response"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "ea82f8f6-c321-4fbc-81ee-a508b087d53b",
   "metadata": {},
   "outputs": [],
   "source": [
    "def analyze_image(index_or_range, prompt):\n",
    "    \"\"\"Analyze specific image(s) using LLaVA\"\"\"\n",
    "    global image_list\n",
    "\n",
    "    # Handle single index or range\n",
    "    if isinstance(index_or_range, int):\n",
    "        images = [image_list[index_or_range]]\n",
    "    elif isinstance(index_or_range, list):\n",
    "        images = [image_list[i] for i in index_or_range]\n",
    "    else:\n",
    "        return \"Invalid image index/range specified.\"\n",
    "\n",
    "    if not images:\n",
    "        return \"No images available for analysis.\"\n",
    "\n",
    "    full_prompt = (\n",
    "        \"Describe the image clearly for a visually impaired user. \"\n",
    "        \"Be detailed about objects, people, colors, spatial relationships, \"\n",
    "        \"and any important context. \"\n",
    "        f\"User's specific request: {prompt}\"\n",
    "    )\n",
    "\n",
    "    output = \"\"\n",
    "    try:\n",
    "        for chunk in ollama.generate(\n",
    "            model='llava:7b-v1.6',\n",
    "            prompt=full_prompt,\n",
    "            images=images,\n",
    "            stream=True\n",
    "        ):\n",
    "            output += chunk.get('response', \"\")\n",
    "    except Exception as e:\n",
    "        return f\"Error analyzing image: {e}\"\n",
    "\n",
    "    return output\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2040b020-8944-409b-8ebb-10d7ffef1748",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "==================================================\n",
      "LLaVA Assistant for Visually Impaired Users\n",
      "==================================================\n",
      "\n",
      "Step 1: Add images (optional)\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      "Enter image path or press enter to skip: C:\\Users\\LAKSHYA\\OneDrive\\Pictures\\Camera Roll\\WIN_20250614_02_46_47_Pro.jpg\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Step 2: Ask a question about the images\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      "Put new prompt: descibe this image\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "assistant: I'm sorry, but there are no images available for me to describe. Can you please provide the image or let me know which image you would like me to describe? \n",
      "\n",
      "ASSISTANT: I'm sorry, but there are no images available for me to describe. Can you please provide the image or let me know which image you would like me to describe? \n",
      "\n",
      "\n",
      "Session ended. Goodbye!\n",
      "\n",
      "==================================================\n",
      "LLaVA Assistant for Visually Impaired Users\n",
      "==================================================\n",
      "\n",
      "Step 1: Add images (optional)\n"
     ]
    }
   ],
   "source": [
    "image_list.clear()  # call the method; a bare image_list.clear without parentheses is a no-op\n",
    "for i in range(5):\n",
    "    chat_loop()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2c7c40d7-df9d-464a-89da-1c6fe613c31d",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}