Updated Week 5 with November version
week5/implementation/answer.py (new file, 61 lines added)
@@ -0,0 +1,61 @@
from pathlib import Path
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.messages import SystemMessage, HumanMessage, convert_to_messages
from langchain_core.documents import Document

from dotenv import load_dotenv


load_dotenv(override=True)

MODEL = "gpt-4.1-nano"
DB_NAME = str(Path(__file__).parent.parent / "vector_db")

# embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
RETRIEVAL_K = 10

SYSTEM_PROMPT = """
You are a knowledgeable, friendly assistant representing the company Insurellm.
You are chatting with a user about Insurellm.
If relevant, use the given context to answer any question.
If you don't know the answer, say so.
Context:
{context}
"""

vectorstore = Chroma(persist_directory=DB_NAME, embedding_function=embeddings)
retriever = vectorstore.as_retriever()
llm = ChatOpenAI(temperature=0, model_name=MODEL)


def fetch_context(question: str) -> list[Document]:
    """
    Retrieve relevant context documents for a question.
    """
    return retriever.invoke(question, k=RETRIEVAL_K)


def combined_question(question: str, history: list[dict] = []) -> str:
    """
    Combine all the user's messages into a single string.
    """
    prior = "\n".join(m["content"] for m in history if m["role"] == "user")
    return prior + "\n" + question


def answer_question(question: str, history: list[dict] = []) -> tuple[str, list[Document]]:
    """
    Answer the given question with RAG; return the answer and the context documents.
    """
    combined = combined_question(question, history)
    docs = fetch_context(combined)
    context = "\n\n".join(doc.page_content for doc in docs)
    system_prompt = SYSTEM_PROMPT.format(context=context)
    messages = [SystemMessage(content=system_prompt)]
    messages.extend(convert_to_messages(history))
    messages.append(HumanMessage(content=question))
    response = llm.invoke(messages)
    return response.content, docs
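For context, a minimal sketch (not part of the commit) of how answer.py could be exercised from a separate script. It assumes the vector store has already been built by ingest.py, that OPENAI_API_KEY is available via the .env file, and that the script runs from week5/implementation so that the answer module is importable; the question and history below are purely illustrative.

# Usage sketch (assumptions: vector_db already built by ingest.py,
# OPENAI_API_KEY set, run from the week5/implementation directory).
from answer import answer_question

# History uses the same role/content dict shape that convert_to_messages accepts.
history = [
    {"role": "user", "content": "What products does Insurellm offer?"},
    {"role": "assistant", "content": "Insurellm offers several insurance software products."},
]

reply, sources = answer_question("Tell me more about the first one", history)
print(reply)
print(f"Used {len(sources)} retrieved chunks as context")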
week5/implementation/ingest.py (new file, 67 lines added)
@@ -0,0 +1,67 @@
import os
import glob
from pathlib import Path
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import OpenAIEmbeddings

from dotenv import load_dotenv

MODEL = "gpt-4.1-nano"

DB_NAME = str(Path(__file__).parent.parent / "vector_db")
KNOWLEDGE_BASE = str(Path(__file__).parent.parent / "knowledge-base")

# embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

load_dotenv(override=True)

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")


def fetch_documents():
    folders = glob.glob(str(Path(KNOWLEDGE_BASE) / "*"))
    documents = []
    for folder in folders:
        doc_type = os.path.basename(folder)
        loader = DirectoryLoader(
            folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs={"encoding": "utf-8"}
        )
        folder_docs = loader.load()
        for doc in folder_docs:
            doc.metadata["doc_type"] = doc_type
            documents.append(doc)
    return documents


def create_chunks(documents):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=200)
    chunks = text_splitter.split_documents(documents)
    return chunks


def create_embeddings(chunks):
    if os.path.exists(DB_NAME):
        Chroma(persist_directory=DB_NAME, embedding_function=embeddings).delete_collection()

    vectorstore = Chroma.from_documents(
        documents=chunks, embedding=embeddings, persist_directory=DB_NAME
    )

    collection = vectorstore._collection
    count = collection.count()

    sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
    dimensions = len(sample_embedding)
    print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")
    return vectorstore


if __name__ == "__main__":
    documents = fetch_documents()
    chunks = create_chunks(documents)
    create_embeddings(chunks)
    print("Ingestion complete")