LLM_Engineering_OLD/week2/community-contributions/llava-week2-ChainForRealTimeCaptionGeneration.ipynb
2025-08-26 01:59:01 +05:30

{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"id": "f97c7598-f571-4ea1-838c-e9158f729c3e",
"metadata": {},
"outputs": [],
"source": [
"import ollama\n",
"import base64\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "9fc1393c-f0b1-4982-94a2-bfd502e85b23",
"metadata": {},
"outputs": [],
"source": [
"def encode_image(image_path):\n",
" with open(image_path, 'rb') as f:\n",
" return base64.b64encode(f.read()).decode('utf-8')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "53cca1fa-6db2-4fe4-8990-ffd98423964a",
"metadata": {},
"outputs": [],
"source": [
"# image_path = r\"C:\\Users\\LAKSHYA\\OneDrive\\Pictures\\Camera Roll\\WIN_20250614_02_46_47_Pro.jpg\"\n",
"# image_base64 = encode_image(image_path)\n",
"# print(image_base64[:100]) "
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "71146ccf-25af-48d3-8068-ee3c9008cebf",
"metadata": {},
"outputs": [],
"source": [
"image_list = []"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6f8801a8-0c30-4199-a334-587096e6edeb",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 10,
"id": "ee3c5d82-e530-40f5-901a-681421f21d1e",
"metadata": {},
"outputs": [],
"source": [
"def put_image():\n",
" global image_list\n",
" user_input_image = input(\"Enter image path or press enter to skip: \").strip()\n",
" \n",
" if not user_input_image:\n",
" print(\"No image inserted\")\n",
" return image_list\n",
"\n",
" image_path = os.path.normpath(user_input_image)\n",
" \n",
" if not os.path.exists(image_path):\n",
" print(\"Image path not found! Try again or enter to leave blank\")\n",
" return put_image() # Continue to allow more inputs\n",
" \n",
"\n",
"\n",
"\n",
" \n",
" image_base64 = encode_image(image_path)\n",
" image_list.append(image_base64)\n",
" \n",
" # Detect file extension for MIME type\n",
" # ext = os.path.splitext(image_path)[-1].lower()\n",
" # mime_type = 'image/jpeg' if ext in ['.jpg', '.jpeg'] else 'image/png' # Extend if needed\n",
"\n",
"\n",
" return image_list\n",
" \n",
" # return f\"data:{mime_type};base64,{image_base64[:100]}\"\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "032f1abb-ca6c-4f03-bda1-1a0a62f2ec43",
"metadata": {},
"outputs": [],
"source": [
"prompt= (\"System prompt: (You are a compassionate and intelligent visual assistant designed to help people who are blind or visually impaired. \"\n",
" \"Your job is to look at an image and describe it in a way that helps the user understand the scene clearly. \"\n",
" \"Use simple, descriptive language and avoid technical terms. Describe what is happening in the image, people's body language, clothing, facial expressions, objects, and surroundings. \"\n",
" \"Be vivid and precise, as if you are painting a picture with words. \"\n",
" \"Also, take into account any personal instructions or questions provided by the user—such as describing a specific person, activity, or object. \"\n",
" \"If the user includes a specific prompt, prioritize that in your description.)\")\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "29494db0-4770-4689-9904-8eebc4390e7c",
"metadata": {},
"outputs": [],
"source": [
"def put_prompt():\n",
" global prompt\n",
" user_input = input(\"Put new prompt: \")\n",
" if not user_input:\n",
" print(\"please enter a prompt\")\n",
" return put_prompt()\n",
" prompt += \"\\nUser: \" + user_input\n",
" return prompt\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "d286369c-e6ef-4a20-a3a8-3563af28940a",
"metadata": {},
"outputs": [],
"source": [
"def image_description():\n",
" global prompt\n",
"\n",
" put_image()\n",
" if not image_list: \n",
" return \"No images available. Skipping...\"\n",
"\n",
" user_prompt = put_prompt()\n",
" full_answer = \"\"\n",
"\n",
" for chunk in ollama.generate(\n",
" model='llava:7b-v1.6',\n",
" prompt=user_prompt,\n",
" images=image_list,\n",
" stream=True\n",
" ):\n",
" content = chunk.get(\"response\", \"\")\n",
" print(\"\\n\\n Final Answer:\",content, end=\"\", flush=True) # Live stream to console\n",
" full_answer += content\n",
"\n",
" prompt += \"\\nUser: \" + user_prompt + \"\\nAssistant: \" + full_answer\n",
" return full_answer\n"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "cbda35a3-45ed-4509-ab41-6827eacd922c",
"metadata": {},
"outputs": [],
"source": [
"def call_llava():\n",
" image_list.clear()\n",
" for i in range(5):\n",
" print(f\"\\n Iteration {i+1}\")\n",
" answer = image_description()\n",
" print(\"\\n\\n Final Answer:\", answer)\n",
" \n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "15518865-6c59-4029-bc2d-42d313eb78bc",
"metadata": {},
"outputs": [],
"source": [
"call_llava()"
]
},
{
"cell_type": "markdown",
"id": "23de3b59-3699-4270-9392-99fccdede83e",
"metadata": {},
"source": [
"# second week practice on personal project making model faster and smarter by using tools\n"
]
},
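{
"cell_type": "markdown",
"id": "tool-flow-overview",
"metadata": {},
"source": [
"Rough flow of the tool-using version below: `put_image` loads base64 images into `image_list`, `system_content` tells the model to reply with a structured `TOOL_CALL: analyze_image(...)` when an image is needed, `process_response` parses that reply and validates the index, and `analyze_image` runs LLaVA on the selected image(s).\n"
]
},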
{
"cell_type": "code",
"execution_count": 32,
"id": "9d44c59e-5eb7-4b00-9489-e05d7c8c3eda",
"metadata": {},
"outputs": [],
"source": [
"messages = []\n"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "061ea026-d4c6-4d6c-bb9b-f6430de9f5af",
"metadata": {},
"outputs": [],
"source": [
"system_content = (\n",
" \"You are a helpful assistant for visually impaired users. \"\n",
" \"You are capable of answering questions directly or calling a function to analyze an image if needed. \"\n",
" \"There is a list of images available, indexed from 0. \"\n",
" \"When a user asks a question, first determine whether any image in the list is needed to answer. \"\n",
" \"If yes, reply in this structured format:\\n\\n\"\n",
" \"TOOL_CALL: analyze_image(<image_index_or_range>, prompt='<description_request>')\\n\\n\"\n",
" \"If image is not needed, just answer the user directly in plain natural language.\\n\"\n",
" \"Be clear and use descriptive but accessible language suitable for blind users.\"\n",
")"
]
},
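{
"cell_type": "markdown",
"id": "tool-call-demo-note",
"metadata": {},
"source": [
"A minimal sketch (not part of the original flow): it shows what a well-formed `TOOL_CALL` reply looks like and how the regex used later in `process_response` pulls out the image index and the prompt. The example string, the index `0`, and the prompt text are made up for illustration.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "tool-call-demo-code",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical example of the structured reply described in system_content above.\n",
"import re\n",
"\n",
"example_reply = \"TOOL_CALL: analyze_image(0, prompt='describe the person in the photo')\"\n",
"pattern = r\"TOOL_CALL:\\s*analyze_image\\((.*?)\\s*,\\s*prompt='(.*?)'\\)\"  # same pattern as process_response\n",
"match = re.search(pattern, example_reply, re.DOTALL)\n",
"if match:\n",
"    print(\"index/range:\", match.group(1))  # -> 0\n",
"    print(\"prompt     :\", match.group(2))  # -> describe the person in the photo\n"
]
},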
{
"cell_type": "code",
"execution_count": 4,
"id": "2f859450-eb3e-4e6c-9602-84f91f5ffda7",
"metadata": {},
"outputs": [],
"source": [
"messages.append({\"role\":\"system\",\"content\":system_content})"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "a8009b75-3468-4694-887d-6cd5132c2907",
"metadata": {},
"outputs": [],
"source": [
"def chat_loop():\n",
" \"\"\"Main chat interaction loop (single-turn version)\"\"\"\n",
" global image_list, messages\n",
" \n",
" print(\"\\n\" + \"=\"*50)\n",
" print(\"LLaVA Assistant for Visually Impaired Users\")\n",
" print(\"=\"*50 + \"\\n\")\n",
" \n",
" # Step 1: Load images\n",
" print(\"Step 1: Add images (optional)\")\n",
" put_image()\n",
" messages.append({\n",
" \"role\": \"system\", \n",
" \"content\": f\"There are {len(image_list)} images available (index 0-{len(image_list)-1}).\"\n",
" })\n",
" \n",
" # Step 2: Single chat interaction\n",
" print(\"\\nStep 2: Ask a question about the images\")\n",
" user_content = put_prompt()\n",
" messages.append({\"role\": \"user\", \"content\": user_content})\n",
" \n",
" # Get model response\n",
" try:\n",
" response = ollama.chat(\n",
" model='llava:7b-v1.6',\n",
" messages=messages\n",
" )[\"message\"][\"content\"]\n",
" print(\"assistant: \",response) \n",
" processed_response = process_response(response)\n",
" print(f\"\\nASSISTANT: {processed_response}\\n\")\n",
" \n",
" except Exception as e:\n",
" print(f\"Error occurred: {e}\")\n",
" \n",
" print(\"\\nSession ended. Goodbye!\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a3b3ff73-3cd5-4e5a-a37e-aaa8b325613c",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 29,
"id": "ee2de6d7-a0bf-45fc-8d5c-98e0055519b0",
"metadata": {},
"outputs": [],
"source": [
"def process_response(response):\n",
" \"\"\"Process the model's response and handle tool calls\"\"\"\n",
" if response.strip().startswith(\"TOOL_CALL:\"):\n",
" # Extract image index/range and prompt from TOOL_CALL\n",
" pattern = r\"TOOL_CALL:\\s*analyze_image\\((.*?)\\s*,\\s*prompt='(.*?)'\\)\"\n",
" match = re.search(pattern, response, re.DOTALL)\n",
" \n",
" if not match:\n",
" error_msg = \"Error: Invalid TOOL_CALL format.\"\n",
" messages.append({\"role\": \"assistant\", \"content\": error_msg})\n",
" return error_msg\n",
" \n",
" image_expr = match.group(1).strip()\n",
" prompt = match.group(2).strip()\n",
" \n",
" try:\n",
" # Handle different index formats\n",
" if \":\" in image_expr: # Range (e.g., \"1:3\")\n",
" start, end = map(int, image_expr.split(\":\"))\n",
" index_or_range = list(range(start, end))\n",
" else: # Single index\n",
" index_or_range = int(image_expr)\n",
" \n",
" # Validate indices\n",
" max_index = len(image_list) - 1\n",
" if isinstance(index_or_range, list):\n",
" if any(i < 0 or i > max_index for i in index_or_range):\n",
" error_msg = f\"Error: Image index out of range (0-{max_index}).\"\n",
" messages.append({\"role\": \"assistant\", \"content\": error_msg})\n",
" return error_msg\n",
" elif index_or_range < 0 or index_or_range > max_index:\n",
" error_msg = f\"Error: Image index out of range (0-{max_index}).\"\n",
" messages.append({\"role\": \"assistant\", \"content\": error_msg})\n",
" return error_msg\n",
" \n",
" # Perform analysis\n",
" result = analyze_image(index_or_range, prompt)\n",
" print(\"funtion called\")\n",
" messages.append({\n",
" \"role\": \"function\",\n",
" \"name\": \"analyze_image\",\n",
" \"content\": result\n",
" })\n",
" \n",
" # Return formatted result\n",
" formatted_result = f\"\\nIMAGE ANALYSIS RESULT:\\n{result}\"\n",
" return formatted_result\n",
"\n",
" except Exception as e:\n",
" error_msg = f\"Error processing TOOL_CALL: {e}\"\n",
" messages.append({\"role\": \"assistant\", \"content\": error_msg})\n",
" return error_msg\n",
" else:\n",
" messages.append({\"role\": \"assistant\", \"content\": response})\n",
" return response"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "ea82f8f6-c321-4fbc-81ee-a508b087d53b",
"metadata": {},
"outputs": [],
"source": [
"def analyze_image(index_or_range, prompt):\n",
" \"\"\"Analyze specific image(s) using LLaVA\"\"\"\n",
" global image_list\n",
" \n",
" # Handle single index or range\n",
" if isinstance(index_or_range, int):\n",
" images = [image_list[index_or_range]]\n",
" elif isinstance(index_or_range, list):\n",
" images = [image_list[i] for i in index_or_range]\n",
" else:\n",
" return \"Invalid image index/range specified.\"\n",
" \n",
" if not images:\n",
" return \"No images available for analysis.\"\n",
" \n",
" full_prompt = (\n",
" \"Describe the image clearly for a visually impaired user. \"\n",
" \"Be detailed about objects, people, colors, spatial relationships, \"\n",
" \"and any important context. \"\n",
" f\"User's specific request: {prompt}\"\n",
" )\n",
" \n",
" output = \"\"\n",
" try:\n",
" for chunk in ollama.generate(\n",
" model='llava:7b-v1.6',\n",
" prompt=full_prompt,\n",
" images=images,\n",
" stream=True\n",
" ):\n",
" output += chunk.get('response', \"\")\n",
" except Exception as e:\n",
" return f\"Error analyzing image: {e}\"\n",
" \n",
" return output\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2040b020-8944-409b-8ebb-10d7ffef1748",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"==================================================\n",
"LLaVA Assistant for Visually Impaired Users\n",
"==================================================\n",
"\n",
"Step 1: Add images (optional)\n"
]
},
{
"name": "stdin",
"output_type": "stream",
"text": [
"Enter image path or press enter to skip: C:\\Users\\LAKSHYA\\OneDrive\\Pictures\\Camera Roll\\WIN_20250614_02_46_47_Pro.jpg\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Step 2: Ask a question about the images\n"
]
},
{
"name": "stdin",
"output_type": "stream",
"text": [
"Put new prompt: descibe this image\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"assistant: I'm sorry, but there are no images available for me to describe. Can you please provide the image or let me know which image you would like me to describe? \n",
"\n",
"ASSISTANT: I'm sorry, but there are no images available for me to describe. Can you please provide the image or let me know which image you would like me to describe? \n",
"\n",
"\n",
"Session ended. Goodbye!\n",
"\n",
"==================================================\n",
"LLaVA Assistant for Visually Impaired Users\n",
"==================================================\n",
"\n",
"Step 1: Add images (optional)\n"
]
}
],
"source": [
"image_list.clear\n",
"for i in range(5):\n",
" chat_loop()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2c7c40d7-df9d-464a-89da-1c6fe613c31d",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}