LLM_Engineering_OLD/week5/community-contributions/Cosmus_Week5_Exercise.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d04a7c55",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install Faker into the kernel's environment BEFORE importing it: on a fresh\n",
    "# environment `from faker import Faker` would fail before the install could run.\n",
    "%pip install -q faker\n",
    "\n",
    "# Standard library\n",
    "import os\n",
    "import random\n",
    "import sys\n",
    "\n",
    "# Third-party\n",
    "import gradio as gr\n",
    "from anthropic import Client\n",
    "from dotenv import load_dotenv\n",
    "from faker import Faker\n",
    "from langchain_anthropic import ChatAnthropic\n",
    "from langchain_classic.chains import ConversationalRetrievalChain\n",
    "from langchain_classic.memory import ConversationBufferMemory\n",
    "from langchain_community.document_loaders import DirectoryLoader, TextLoader\n",
    "from langchain_community.embeddings import HuggingFaceEmbeddings\n",
    "from langchain_community.vectorstores import Chroma\n",
    "from langchain_text_splitters import CharacterTextSplitter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3d7f8354",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the variables from the .env file; override=True lets .env values\n",
    "# replace any that are already set in the process environment.\n",
    "load_dotenv(override=True)\n",
    "\n",
    "# Read the Anthropic key (ANTHROPIC_API_KEY) and fail early with a clear message\n",
    "# instead of an opaque authentication error on the first API request.\n",
    "ANTHROPIC_KEY = os.getenv(\"ANTHROPIC_API_KEY\")\n",
    "if not ANTHROPIC_KEY:\n",
    "    raise ValueError(\n",
    "        \"ANTHROPIC_API_KEY is not set. Add it to your .env file or export it \"\n",
    "        \"before running this notebook.\"\n",
    "    )\n",
    "client = Client(api_key=ANTHROPIC_KEY)\n",
    "\n",
    "# List the Anthropic models this key can be used with\n",
    "models = client.models.list()\n",
    "for model in models:\n",
    "    print(model.id)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "20d11d1c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print the path of the Python executable this notebook kernel runs on, i.e. the interpreter the Faker library has to be installed for\n",
    "print(sys.executable)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "93a8f3ec",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Generate a small fake personal knowledge base with Faker.\n",
    "fake = Faker()\n",
    "base_dir = \"knowledge_base\"\n",
    "folders = [\"personal\", \"projects\", \"learning\"]\n",
    "\n",
    "# Create the folders if they don't exist yet\n",
    "for folder in folders:\n",
    "    os.makedirs(f\"{base_dir}/{folder}\", exist_ok=True)\n",
    "\n",
    "# One Markdown file per folder\n",
    "personal_file = f\"{base_dir}/personal/info.md\"\n",
    "projects_file = f\"{base_dir}/projects/projects.md\"\n",
    "learning_file = f\"{base_dir}/learning/learning.md\"\n",
    "\n",
    "\n",
    "def build_personal_text():\n",
    "    \"\"\"Return the Markdown for a randomly generated personal profile.\"\"\"\n",
    "    name = fake.name()\n",
    "    profession = random.choice([\"Data Analyst\", \"Business Analyst\", \"Software Engineer\", \"AI Specialist\"])\n",
    "    bio = fake.paragraph(nb_sentences=5)\n",
    "    experience = \"\\n\".join([f\"- {fake.job()} at {fake.company()} ({fake.year()})\" for _ in range(3)])\n",
    "    # Keep the two trailing spaces after Name/Profession (Markdown line breaks).\n",
    "    return f\"\"\"\n",
    "# Personal Profile\n",
    "Name: {name}  \n",
    "Profession: {profession}  \n",
    "\n",
    "Bio: {bio}\n",
    "\n",
    "## Experience\n",
    "{experience}\n",
    "\"\"\"\n",
    "\n",
    "\n",
    "def build_projects_text():\n",
    "    \"\"\"Return the Markdown for a portfolio of five randomly generated projects.\"\"\"\n",
    "    projects = \"\\n\".join([\n",
    "        f\"- **{fake.catch_phrase()}** — {fake.bs().capitalize()} for {fake.company()}.\"\n",
    "        for _ in range(5)\n",
    "    ])\n",
    "    return f\"\"\"\n",
    "# Projects Portfolio\n",
    "\n",
    "Key Projects:\n",
    "{projects}\n",
    "\"\"\"\n",
    "\n",
    "\n",
    "def build_learning_text():\n",
    "    \"\"\"Return the Markdown for six learning notes on randomly chosen topics.\"\"\"\n",
    "    topics = [\"LangChain\", \"RAG Systems\", \"Vector Databases\", \"AI Ethics\", \"Prompt Engineering\", \"Data Visualization\"]\n",
    "    learning = \"\\n\".join([\n",
    "        f\"- {random.choice(topics)} — {fake.sentence(nb_words=8)}\"\n",
    "        for _ in range(6)\n",
    "    ])\n",
    "    return f\"\"\"\n",
    "# Learning Journey\n",
    "\n",
    "Recent Topics and Notes:\n",
    "{learning}\n",
    "\"\"\"\n",
    "\n",
    "\n",
    "def write_if_missing(path, build_text, created_msg, exists_msg):\n",
    "    \"\"\"Create `path` from `build_text()` unless the file already exists.\n",
    "\n",
    "    Args:\n",
    "        path: File to create.\n",
    "        build_text: Zero-argument callable returning the file content; it is\n",
    "            only called when the file is missing.\n",
    "        created_msg: Message printed after the file has been written.\n",
    "        exists_msg: Message printed when the file exists and is left untouched.\n",
    "    \"\"\"\n",
    "    if os.path.exists(path):\n",
    "        print(exists_msg)\n",
    "        return\n",
    "    # Build the text before opening the file so a failure cannot leave an empty file behind\n",
    "    text = build_text()\n",
    "    with open(path, \"w\") as f:\n",
    "        f.write(text)\n",
    "    print(created_msg)\n",
    "\n",
    "\n",
    "write_if_missing(\n",
    "    personal_file,\n",
    "    build_personal_text,\n",
    "    \"Personal info generated.\",\n",
    "    \"ℹPersonal info already exists. Skipping regeneration.\",\n",
    ")\n",
    "write_if_missing(\n",
    "    projects_file,\n",
    "    build_projects_text,\n",
    "    \"Projects generated.\",\n",
    "    \"ℹProjects already exist. Skipping regeneration.\",\n",
    ")\n",
    "write_if_missing(\n",
    "    learning_file,\n",
    "    build_learning_text,\n",
    "    \"Learning notes generated.\",\n",
    "    \"ℹLearning notes already exist. Skipping regeneration.\",\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6fa19091",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read every Markdown file under knowledge_base/ (recursively) as plain text\n",
    "loader = DirectoryLoader(\n",
    "    \"knowledge_base\",\n",
    "    glob=\"**/*.md\",\n",
    "    loader_cls=TextLoader,\n",
    ")\n",
    "documents = loader.load()\n",
    "\n",
    "# Cut the documents into small, heavily overlapping chunks\n",
    "splitter = CharacterTextSplitter(\n",
    "    chunk_size=100,\n",
    "    chunk_overlap=80,\n",
    ")\n",
    "chunks = splitter.split_documents(documents)\n",
    "\n",
    "print(f\"Loaded {len(documents)} documents and created {len(chunks)} chunks.\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7b9fc9a5",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6dcdec41",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Embedding model used to vectorise the chunks\n",
    "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n",
    "\n",
    "# Directory where Chroma keeps the collection on disk\n",
    "persist_directory = \"chroma_db\"\n",
    "\n",
    "# Re-running this cell would otherwise add the same chunks again to the collection\n",
    "# already stored on disk, so the retriever would start returning duplicates.\n",
    "# Drop any existing collection first so the cell is safe to re-run.\n",
    "if os.path.exists(persist_directory):\n",
    "    Chroma(persist_directory=persist_directory, embedding_function=embeddings).delete_collection()\n",
    "\n",
    "# Chroma as the vector store. Chroma >= 0.4 persists automatically when a\n",
    "# persist_directory is given, so the deprecated vectorstore.persist() call is not needed.\n",
    "vectorstore = Chroma.from_documents(chunks, embeddings, persist_directory=persist_directory)\n",
    "\n",
    "print(\"Vector store created and saved to 'chroma_db'.\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "99e4a99f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check the installed LangChain version: the library was updated recently,\n",
    "# which makes it difficult to use successfully without knowing the version.\n",
    "# The top-level package is imported here because no other cell imports it;\n",
    "# without this import the print below raises NameError.\n",
    "import langchain\n",
    "\n",
    "print(langchain.__version__)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5dc1b6ce",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The three LangChain building blocks used here: retriever, LLM and memory.\n",
    "\n",
    "# Retriever: asks the vector store for the 3 closest chunks (k=3)\n",
    "retriever = vectorstore.as_retriever(search_kwargs={\"k\": 3})\n",
    "\n",
    "# LLM: an Anthropic chat model, authenticated with the key loaded from .env\n",
    "llm = ChatAnthropic(model=\"claude-sonnet-4-5-20250929\", temperature=0.6, max_tokens=1024, anthropic_api_key=ANTHROPIC_KEY)\n",
    "\n",
    "# Memory: conversation history, stored under the \"chat_history\" key as message objects\n",
    "memory = ConversationBufferMemory(memory_key=\"chat_history\", return_messages=True)\n",
    "\n",
    "# Combine the three pieces into one conversational RAG chain\n",
    "conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)\n",
    "\n",
    "print(\"Anthropic conversational retriever is ready!\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6f93eea7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Callback used by the Gradio chat interface\n",
    "def chat(message, history):\n",
    "    \"\"\"Answer one chat message.\n",
    "\n",
    "    Args:\n",
    "        message: The user's latest message.\n",
    "        history: Conversation history passed in by Gradio. It is not used here:\n",
    "            only the question is sent to the chain.\n",
    "\n",
    "    Returns:\n",
    "        The chain's answer. When `conversation_chain` is falsy, the retrieved\n",
    "        context (cut to 1000 characters) is returned instead as an offline fallback.\n",
    "    \"\"\"\n",
    "    if conversation_chain:\n",
    "        result = conversation_chain.invoke({\"question\": message})\n",
    "        return result[\"answer\"]\n",
    "\n",
    "    # Retrieval-only fallback. `invoke` is the supported retriever entry point;\n",
    "    # the older `get_relevant_documents` method is deprecated in langchain-core.\n",
    "    docs = retriever.invoke(message)\n",
    "    context = \"\\n\\n\".join(d.page_content for d in docs)\n",
    "    return f\"(Offline Mode)\\nTop relevant info:\\n\\n{context[:1000]}\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aadf91b4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dark-mode CSS overrides for the page background, the Gradio container and the input widgets\n",
    "css = \"\"\"\n",
    "body {background-color: #0f1117; color: #e6e6e6;}\n",
    ".gradio-container {background-color: #0f1117 !important;}\n",
    "textarea, input, .wrap.svelte-1ipelgc {background-color: #1b1f2a !important; color: #ffffff !important;}\n",
    "\"\"\"\n",
    "\n",
    "# Title and subtitle shown above the chat (whitespace kept exactly as rendered before)\n",
    "header_html = (\n",
    "    \"\\n\"\n",
    "    '        <h2 style=\"color: #f5f5f5;\">Personal Knowledge Worker</h2>\\n'\n",
    "    '        <p style=\"color: #f5f5f5;\">Chat with your auto-generated knowledge base (Claude-powered if available)</p>\\n'\n",
    "    \"        \"\n",
    ")\n",
    "\n",
    "# Page layout: header on top, chat interface wired to chat() below it\n",
    "with gr.Blocks(css=css, theme=\"gradio/monochrome\") as demo:\n",
    "    gr.Markdown(header_html, elem_id=\"title\")\n",
    "    gr.ChatInterface(chat, type=\"messages\")\n",
    "\n",
    "# Start the app and open it in a browser tab\n",
    "demo.launch(inbrowser=True)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}