Files
LLM_Engineering_OLD/week1/community-contributions/llava-For-Image-week1.ipynb
2025-08-31 23:21:42 +05:30

215 lines
6.1 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "f97c7598-f571-4ea1-838c-e9158f729c3e",
"metadata": {},
"outputs": [],
"source": [
"import ollama\n",
"import base64\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9fc1393c-f0b1-4982-94a2-bfd502e85b23",
"metadata": {},
"outputs": [],
"source": [
"def encode_image(image_path):\n",
"    \"\"\"Return the contents of the file at `image_path` as a base64 string.\n",
"\n",
"    The file is read in binary mode and the base64 bytes are decoded as UTF-8.\n",
"    \"\"\"\n",
"    with open(image_path, 'rb') as image_file:\n",
"        raw_bytes = image_file.read()\n",
"    encoded_bytes = base64.b64encode(raw_bytes)\n",
"    return encoded_bytes.decode('utf-8')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "53cca1fa-6db2-4fe4-8990-ffd98423964a",
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check: encode a local image and preview the first 100 base64 characters.\n",
"# image_path = r\"path\\to\\your\\image.jpg\"\n",
"# image_base64 = encode_image(image_path)\n",
"# print(image_base64[:100])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "71146ccf-25af-48d3-8068-ee3c9008cebf",
"metadata": {},
"outputs": [],
"source": [
"# Base64-encoded images collected so far; put_image() appends to it and call_llava() clears it.\n",
"image_list = list()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6f8801a8-0c30-4199-a334-587096e6edeb",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "ee3c5d82-e530-40f5-901a-681421f21d1e",
"metadata": {},
"outputs": [],
"source": [
"def put_image():\n",
"    \"\"\"Prompt for an image path, encode the file, and append it to `image_list`.\n",
"\n",
"    Keeps asking until the user enters the path of an existing file, or\n",
"    presses Enter on an empty line to skip adding an image.\n",
"\n",
"    Returns:\n",
"        list: The module-level `image_list` (unchanged when the user skips).\n",
"    \"\"\"\n",
"    # Loop rather than recurse so repeated typos cannot exhaust the call stack.\n",
"    while True:\n",
"        user_input_image = input(\"Enter image path or press enter to skip: \").strip()\n",
"\n",
"        if not user_input_image:\n",
"            print(\"No image inserted\")\n",
"            return image_list\n",
"\n",
"        # Paths pasted from a file manager are often wrapped in quotes.\n",
"        image_path = os.path.normpath(user_input_image.strip(\"'\\\"\"))\n",
"\n",
"        # isfile (not exists) so a directory is rejected here instead of failing in open().\n",
"        if os.path.isfile(image_path):\n",
"            break\n",
"        print(\"Image path not found! Try again or enter to leave blank\")\n",
"\n",
"    image_list.append(encode_image(image_path))\n",
"    return image_list\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "032f1abb-ca6c-4f03-bda1-1a0a62f2ec43",
"metadata": {},
"outputs": [],
"source": [
"# System instructions that seed the conversation text; put_prompt() and\n",
"# image_description() append the user and assistant turns to this string.\n",
"prompt = \"\".join([\n",
"    \"System prompt: (You are a compassionate and intelligent visual assistant designed to help people who are blind or visually impaired. \",\n",
"    \"Your job is to look at an image and describe it in a way that helps the user understand the scene clearly. \",\n",
"    \"Use simple, descriptive language and avoid technical terms. Describe what is happening in the image, people's body language, clothing, facial expressions, objects, and surroundings. \",\n",
"    \"Be vivid and precise, as if you are painting a picture with words. \",\n",
"    \"Also, take into account any personal instructions or questions provided by the user—such as describing a specific person, activity, or object. \",\n",
"    \"If the user includes a specific prompt, prioritize that in your description.)\",\n",
"])\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "29494db0-4770-4689-9904-8eebc4390e7c",
"metadata": {},
"outputs": [],
"source": [
"def put_prompt():\n",
"    \"\"\"Ask the user for a prompt and append it to the running conversation.\n",
"\n",
"    Re-asks until a non-blank prompt is entered, then appends it to the\n",
"    module-level `prompt` as a new `User:` line.\n",
"\n",
"    Returns:\n",
"        str: The full updated `prompt` (existing text plus the new user turn).\n",
"    \"\"\"\n",
"    global prompt\n",
"\n",
"    # Loop instead of recursing so repeated blank entries cannot hit the recursion limit.\n",
"    while True:\n",
"        # Strip so a whitespace-only entry counts as blank.\n",
"        user_input = input(\"Put new prompt: \").strip()\n",
"        if user_input:\n",
"            break\n",
"        print(\"please enter a prompt\")\n",
"\n",
"    prompt += \"\\nUser: \" + user_input\n",
"    return prompt\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d286369c-e6ef-4a20-a3a8-3563af28940a",
"metadata": {},
"outputs": [],
"source": [
"def image_description():\n",
"    \"\"\"Run one turn: collect an image and a prompt, stream the model's reply, record it.\n",
"\n",
"    Calls put_image() and put_prompt() for user input, sends the conversation\n",
"    text and every image in `image_list` to the LLaVA model through\n",
"    ollama.generate, prints the reply as it streams, and appends the reply to\n",
"    the module-level `prompt`.\n",
"\n",
"    Returns:\n",
"        str: The model's full reply, or a skip message when `image_list` is empty.\n",
"    \"\"\"\n",
"    global prompt\n",
"\n",
"    put_image()\n",
"    if not image_list:\n",
"        return \"No images available. Skipping...\"\n",
"\n",
"    # put_prompt() has already appended this turn's user line to the global\n",
"    # `prompt`, and it returns that whole conversation text.\n",
"    conversation = put_prompt()\n",
"    answer_parts = []\n",
"\n",
"    for chunk in ollama.generate(\n",
"        model='llava:7b-v1.6',\n",
"        prompt=conversation,\n",
"        images=image_list,\n",
"        stream=True\n",
"    ):\n",
"        content = chunk.get(\"response\", \"\")\n",
"        # Emit only the new text; a label here would be repeated for every chunk.\n",
"        print(content, end=\"\", flush=True)\n",
"        answer_parts.append(content)\n",
"\n",
"    full_answer = \"\".join(answer_parts)\n",
"\n",
"    # Append only the reply: re-adding `conversation` would duplicate the whole history.\n",
"    prompt += \"\\nAssistant: \" + full_answer\n",
"    return full_answer\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cbda35a3-45ed-4509-ab41-6827eacd922c",
"metadata": {},
"outputs": [],
"source": [
"def call_llava(turns=5):\n",
"    \"\"\"Run an interactive multi-turn image-description session.\n",
"\n",
"    Args:\n",
"        turns (int): Number of image/prompt rounds to run. Defaults to 5.\n",
"    \"\"\"\n",
"    # Start from an empty list so images left over from an earlier session are not resent.\n",
"    image_list.clear()\n",
"    for turn in range(1, turns + 1):\n",
"        print(f\"\\n Iteration {turn}\")\n",
"        answer = image_description()\n",
"        print(\"\\n\\n Final Answer:\", answer)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "15518865-6c59-4029-bc2d-42d313eb78bc",
"metadata": {},
"outputs": [],
"source": [
"# Start the interactive session; it prompts for image paths and questions via input().\n",
"call_llava()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c77bd493-f893-402e-b4e3-64854e9d2e19",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}