Files
LLM_Engineering_OLD/week5/community-contributions/Cosmus_Week5_Exercise.ipynb
2025-10-23 01:08:27 +03:00

308 lines
9.9 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "d04a7c55",
"metadata": {},
"outputs": [],
"source": [
"#Importing necessary libraries\n",
"import os\n",
"from dotenv import load_dotenv\n",
"from anthropic import Client\n",
"from dotenv import load_dotenv\n",
"import sys\n",
"from faker import Faker\n",
"import random\n",
"import gradio as gr\n",
"from langchain_community.document_loaders import DirectoryLoader, TextLoader\n",
"from langchain_text_splitters import CharacterTextSplitter\n",
"from langchain_community.embeddings import HuggingFaceEmbeddings\n",
"from langchain_community.vectorstores import Chroma\n",
"from langchain_anthropic import ChatAnthropic\n",
"from langchain_classic.memory import ConversationBufferMemory\n",
"from langchain_classic.chains import ConversationalRetrievalChain\n",
"\n",
"!{sys.executable} -m pip install faker\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3d7f8354",
"metadata": {},
"outputs": [],
"source": [
"\n",
"# loading the .env variables\n",
"load_dotenv(override=True)\n",
"\n",
"# Force export to OS env so LangChain can detect it (had to try this because the key was not loading at some point but by the time i shared the code it loaded well so i commented it out)\n",
"#os.environ[\"ANTHROPIC_API_KEY\"] = os.getenv(\"ANTHROPIC_API_KEY\")\n",
"\n",
"#getting the key from the our .env file. It is Anthropic_API_KEY\n",
"ANTHROPIC_KEY = os.getenv(\"ANTHROPIC_API_KEY\")\n",
"client = Client(api_key=ANTHROPIC_KEY)\n",
"\n",
"# Checking the anthropic models list our anthropic key ca help us play with\n",
"models = client.models.list()\n",
"for model in models:\n",
" print(model.id)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "20d11d1c",
"metadata": {},
"outputs": [],
"source": [
"#Getting the python executable path on my notebook to know where to install the faker library\n",
"print(sys.executable)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "93a8f3ec",
"metadata": {},
"outputs": [],
"source": [
"#Creating a fake person with faker\n",
"fake = Faker()\n",
"base_dir = \"knowledge_base\"\n",
"folders = [\"personal\", \"projects\", \"learning\"]\n",
"\n",
"# We now create folders if they don't exist\n",
"for folder in folders:\n",
" os.makedirs(f\"{base_dir}/{folder}\", exist_ok=True)\n",
"\n",
"# Check if data already exists\n",
"personal_file = f\"{base_dir}/personal/info.md\"\n",
"projects_file = f\"{base_dir}/projects/projects.md\"\n",
"learning_file = f\"{base_dir}/learning/learning.md\"\n",
"\n",
"#If the personal info file does not exist, create it\n",
"if not os.path.exists(personal_file):\n",
" name = fake.name()\n",
" profession = random.choice([\"Data Analyst\", \"Business Analyst\", \"Software Engineer\", \"AI Specialist\"])\n",
" bio = fake.paragraph(nb_sentences=5)\n",
" experience = \"\\n\".join([f\"- {fake.job()} at {fake.company()} ({fake.year()})\" for _ in range(3)])\n",
" \n",
" personal_text = f\"\"\"\n",
"# Personal Profile\n",
"Name: {name} \n",
"Profession: {profession} \n",
"\n",
"Bio: {bio}\n",
"\n",
"## Experience\n",
"{experience}\n",
"\"\"\"\n",
" with open(personal_file, \"w\") as f:\n",
" f.write(personal_text)\n",
" print(\"Personal info generated.\")\n",
"else:\n",
" #If the personal info file exists, skip the regeneration\n",
" print(\"Personal info already exists. Skipping regeneration.\")\n",
"\n",
"#doing the same for project file\n",
"if not os.path.exists(projects_file):\n",
" projects = \"\\n\".join([\n",
" f\"- **{fake.catch_phrase()}** — {fake.bs().capitalize()} for {fake.company()}.\"\n",
" for _ in range(5)\n",
" ])\n",
" projects_text = f\"\"\"\n",
"# Projects Portfolio\n",
"\n",
"Key Projects:\n",
"{projects}\n",
"\"\"\"\n",
" with open(projects_file, \"w\") as f:\n",
" f.write(projects_text)\n",
" print(\"Projects generated.\")\n",
"else:\n",
" print(\"Projects already exist. Skipping regeneration.\")\n",
"\n",
"#same thing for learning file\n",
"if not os.path.exists(learning_file):\n",
" topics = [\"LangChain\", \"RAG Systems\", \"Vector Databases\", \"AI Ethics\", \"Prompt Engineering\", \"Data Visualization\"]\n",
" learning = \"\\n\".join([\n",
" f\"- {random.choice(topics)} — {fake.sentence(nb_words=8)}\"\n",
" for _ in range(6)\n",
" ])\n",
" learning_text = f\"\"\"\n",
"# Learning Journey\n",
"\n",
"Recent Topics and Notes:\n",
"{learning}\n",
"\"\"\"\n",
" with open(learning_file, \"w\") as f:\n",
" f.write(learning_text)\n",
" print(\"Learning notes generated.\")\n",
"else:\n",
" print(\"Learning notes already exist. Skipping regeneration.\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6fa19091",
"metadata": {},
"outputs": [],
"source": [
"#loading the knowledge information from the knowledge_base folder\n",
"loader = DirectoryLoader(\"knowledge_base\", glob=\"**/*.md\", loader_cls=TextLoader)\n",
"documents = loader.load()\n",
"\n",
"#Splitting the documents into chunks\n",
"splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=80)\n",
"chunks = splitter.split_documents(documents)\n",
"\n",
"print(f\"Loaded {len(documents)} documents and created {len(chunks)} chunks.\")\n"
]
},
{
"cell_type": "markdown",
"id": "7b9fc9a5",
"metadata": {},
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "6dcdec41",
"metadata": {},
"outputs": [],
"source": [
"#Creating the embeddings\n",
"embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n",
"\n",
"# Chroma as the vector store\n",
"vectorstore = Chroma.from_documents(chunks, embeddings, persist_directory=\"chroma_db\")\n",
"vectorstore.persist()\n",
"\n",
"print(\"Vector store created and saved to 'chroma_db'.\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "99e4a99f",
"metadata": {},
"outputs": [],
"source": [
"#Check Langchain version as they updated the version recently thus making it difficult to use it successfullt\n",
"print(langchain.__version__)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5dc1b6ce",
"metadata": {},
"outputs": [],
"source": [
"# The main Langchain Abstraction are: Memory, LLM, and Retriever\n",
"\n",
"# Memory for conversation history\n",
"memory = ConversationBufferMemory(\n",
" memory_key=\"chat_history\",\n",
" return_messages=True\n",
")\n",
"\n",
"# Using one of the Anthropic models from the list above to create the LLM\n",
"llm = ChatAnthropic(\n",
" model=\"claude-sonnet-4-5-20250929\",\n",
" temperature=0.6,\n",
" max_tokens=1024,\n",
" anthropic_api_key=ANTHROPIC_KEY\n",
")\n",
"\n",
"# Retriever from your vectorstore\n",
"retriever = vectorstore.as_retriever(search_kwargs={\"k\": 3})\n",
"\n",
"# Bringing everything together tConversational RAG Chain\n",
"conversation_chain = ConversationalRetrievalChain.from_llm(\n",
" llm=llm,\n",
" retriever=retriever,\n",
" memory=memory\n",
")\n",
"\n",
"print(\"Anthropic conversational retriever is ready!\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6f93eea7",
"metadata": {},
"outputs": [],
"source": [
"#fnc to create a chat interface\n",
"def chat(message, history):\n",
" if conversation_chain:\n",
" result = conversation_chain.invoke({\"question\": message})\n",
" return result[\"answer\"]\n",
" else:\n",
" # Retrieval-only fallback\n",
" docs = retriever.get_relevant_documents(message)\n",
" context = \"\\n\\n\".join([d.page_content for d in docs])\n",
" return f\"(Offline Mode)\\nTop relevant info:\\n\\n{context[:1000]}\"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aadf91b4",
"metadata": {},
"outputs": [],
"source": [
"#used som css to make the chat interface look better, and dark mode. I love dark mode btw\n",
"css = \"\"\"\n",
"body {background-color: #0f1117; color: #e6e6e6;}\n",
".gradio-container {background-color: #0f1117 !important;}\n",
"textarea, input, .wrap.svelte-1ipelgc {background-color: #1b1f2a !important; color: #ffffff !important;}\n",
"\"\"\"\n",
"\n",
"#Gradio blocks\n",
"with gr.Blocks(css=css, theme=\"gradio/monochrome\") as demo:\n",
" gr.Markdown(\n",
" \"\"\"\n",
" <h2 style=\"color: #f5f5f5;\">Personal Knowledge Worker</h2>\n",
" <p style=\"color: #f5f5f5;\">Chat with your auto-generated knowledge base (Claude-powered if available)</p>\n",
" \"\"\",\n",
" elem_id=\"title\"\n",
" )\n",
" gr.ChatInterface(chat, type=\"messages\")\n",
"\n",
"demo.launch(inbrowser=True)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}