Updated Week 5 with November version
week5/implementation/answer.py (new file, 61 lines added)
@@ -0,0 +1,61 @@
from pathlib import Path
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.messages import SystemMessage, HumanMessage, convert_to_messages
from langchain_core.documents import Document

from dotenv import load_dotenv


load_dotenv(override=True)

MODEL = "gpt-4.1-nano"
DB_NAME = str(Path(__file__).parent.parent / "vector_db")

# embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
RETRIEVAL_K = 10

SYSTEM_PROMPT = """
You are a knowledgeable, friendly assistant representing the company Insurellm.
You are chatting with a user about Insurellm.
If relevant, use the given context to answer any question.
If you don't know the answer, say so.
Context:
{context}
"""

vectorstore = Chroma(persist_directory=DB_NAME, embedding_function=embeddings)
retriever = vectorstore.as_retriever()
llm = ChatOpenAI(temperature=0, model_name=MODEL)


def fetch_context(question: str) -> list[Document]:
    """
    Retrieve relevant context documents for a question.
    """
    return retriever.invoke(question, k=RETRIEVAL_K)


def combined_question(question: str, history: list[dict] = []) -> str:
    """
    Combine all the user's messages into a single string.
    """
    prior = "\n".join(m["content"] for m in history if m["role"] == "user")
    return prior + "\n" + question


def answer_question(question: str, history: list[dict] = []) -> tuple[str, list[Document]]:
    """
    Answer the given question with RAG; return the answer and the context documents.
    """
    combined = combined_question(question, history)
    docs = fetch_context(combined)
    context = "\n\n".join(doc.page_content for doc in docs)
    system_prompt = SYSTEM_PROMPT.format(context=context)
    messages = [SystemMessage(content=system_prompt)]
    messages.extend(convert_to_messages(history))
    messages.append(HumanMessage(content=question))
    response = llm.invoke(messages)
    return response.content, docs
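For context, a minimal sketch (not part of the commit) of how answer.py could be exercised from a separate script. It assumes the vector store has already been built by ingest.py, that OPENAI_API_KEY is available via the .env file, and that the script runs from week5/implementation so that the answer module is importable; the question and history below are purely illustrative.

# Usage sketch (assumptions: vector_db already built by ingest.py,
# OPENAI_API_KEY set, run from the week5/implementation directory).
from answer import answer_question

# History uses the same role/content dict shape that convert_to_messages accepts.
history = [
    {"role": "user", "content": "What products does Insurellm offer?"},
    {"role": "assistant", "content": "Insurellm offers several insurance software products."},
]

reply, sources = answer_question("Tell me more about the first one", history)
print(reply)
print(f"Used {len(sources)} retrieved chunks as context")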
week5/implementation/ingest.py (new file, 67 lines added)
@@ -0,0 +1,67 @@
import os
import glob
from pathlib import Path
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import OpenAIEmbeddings

from dotenv import load_dotenv

MODEL = "gpt-4.1-nano"

DB_NAME = str(Path(__file__).parent.parent / "vector_db")
KNOWLEDGE_BASE = str(Path(__file__).parent.parent / "knowledge-base")

# embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

load_dotenv(override=True)

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")


def fetch_documents():
    folders = glob.glob(str(Path(KNOWLEDGE_BASE) / "*"))
    documents = []
    for folder in folders:
        doc_type = os.path.basename(folder)
        loader = DirectoryLoader(
            folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs={"encoding": "utf-8"}
        )
        folder_docs = loader.load()
        for doc in folder_docs:
            doc.metadata["doc_type"] = doc_type
            documents.append(doc)
    return documents


def create_chunks(documents):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=200)
    chunks = text_splitter.split_documents(documents)
    return chunks


def create_embeddings(chunks):
    if os.path.exists(DB_NAME):
        Chroma(persist_directory=DB_NAME, embedding_function=embeddings).delete_collection()

    vectorstore = Chroma.from_documents(
        documents=chunks, embedding=embeddings, persist_directory=DB_NAME
    )

    collection = vectorstore._collection
    count = collection.count()

    sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
    dimensions = len(sample_embedding)
    print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")
    return vectorstore


if __name__ == "__main__":
    documents = fetch_documents()
    chunks = create_chunks(documents)
    create_embeddings(chunks)
    print("Ingestion complete")