Updated Week 5 with November version
145
week5/pro_implementation/answer.py
Normal file
@@ -0,0 +1,145 @@
from openai import OpenAI
from dotenv import load_dotenv
from chromadb import PersistentClient
from litellm import completion
from pydantic import BaseModel, Field
from pathlib import Path
from tenacity import retry, wait_exponential


load_dotenv(override=True)

# MODEL = "openai/gpt-4.1-nano"
MODEL = "groq/openai/gpt-oss-120b"
DB_NAME = str(Path(__file__).parent.parent / "preprocessed_db")
KNOWLEDGE_BASE_PATH = Path(__file__).parent.parent / "knowledge-base"
SUMMARIES_PATH = Path(__file__).parent.parent / "summaries"

collection_name = "docs"
embedding_model = "text-embedding-3-large"
wait = wait_exponential(multiplier=1, min=10, max=240)

openai = OpenAI()

chroma = PersistentClient(path=DB_NAME)
collection = chroma.get_or_create_collection(collection_name)

RETRIEVAL_K = 20
FINAL_K = 10

SYSTEM_PROMPT = """
You are a knowledgeable, friendly assistant representing the company Insurellm.
You are chatting with a user about Insurellm.
Your answer will be evaluated for accuracy, relevance and completeness, so make sure it only answers the question and fully answers it.
If you don't know the answer, say so.
For context, here are specific extracts from the Knowledge Base that might be directly relevant to the user's question:
{context}

With this context, please answer the user's question. Be accurate, relevant and complete.
"""


class Result(BaseModel):
    page_content: str
    metadata: dict


class RankOrder(BaseModel):
    order: list[int] = Field(
        description="The order of relevance of chunks, from most relevant to least relevant, by chunk id number"
    )


@retry(wait=wait)
def rerank(question, chunks):
    system_prompt = """
You are a document re-ranker.
You are provided with a question and a list of relevant chunks of text from a query of a knowledge base.
The chunks are provided in the order they were retrieved; this should be approximately ordered by relevance, but you may be able to improve on that.
You must rank order the provided chunks by relevance to the question, with the most relevant chunk first.
Reply only with the list of ranked chunk ids, nothing else. Include all the chunk ids you are provided with, reranked.
"""
    user_prompt = f"The user has asked the following question:\n\n{question}\n\nOrder all the chunks of text by relevance to the question, from most relevant to least relevant. Include all the chunk ids you are provided with, reranked.\n\n"
    user_prompt += "Here are the chunks:\n\n"
    for index, chunk in enumerate(chunks):
        user_prompt += f"# CHUNK ID: {index + 1}:\n\n{chunk.page_content}\n\n"
    user_prompt += "Reply only with the list of ranked chunk ids, nothing else."
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    response = completion(model=MODEL, messages=messages, response_format=RankOrder)
    reply = response.choices[0].message.content
    order = RankOrder.model_validate_json(reply).order
    return [chunks[i - 1] for i in order]


def make_rag_messages(question, history, chunks):
    context = "\n\n".join(
        f"Extract from {chunk.metadata['source']}:\n{chunk.page_content}" for chunk in chunks
    )
    system_prompt = SYSTEM_PROMPT.format(context=context)
    return (
        [{"role": "system", "content": system_prompt}]
        + history
        + [{"role": "user", "content": question}]
    )


@retry(wait=wait)
def rewrite_query(question, history=[]):
    """Rewrite the user's question to be a more specific question that is more likely to surface relevant content in the Knowledge Base."""
    message = f"""
You are in a conversation with a user, answering questions about the company Insurellm.
You are about to look up information in a Knowledge Base to answer the user's question.

This is the history of your conversation so far with the user:
{history}

And this is the user's current question:
{question}

Respond only with a short, refined question that you will use to search the Knowledge Base.
It should be a VERY short specific question most likely to surface content. Focus on the question details.
IMPORTANT: Respond ONLY with the precise knowledgebase query, nothing else.
"""
    response = completion(model=MODEL, messages=[{"role": "system", "content": message}])
    return response.choices[0].message.content


def merge_chunks(chunks, reranked):
    merged = chunks[:]
    existing = [chunk.page_content for chunk in chunks]
    for chunk in reranked:
        if chunk.page_content not in existing:
            merged.append(chunk)
    return merged


def fetch_context_unranked(question):
    query = openai.embeddings.create(model=embedding_model, input=[question]).data[0].embedding
    results = collection.query(query_embeddings=[query], n_results=RETRIEVAL_K)
    chunks = []
    for result in zip(results["documents"][0], results["metadatas"][0]):
        chunks.append(Result(page_content=result[0], metadata=result[1]))
    return chunks


def fetch_context(original_question):
    rewritten_question = rewrite_query(original_question)
    chunks1 = fetch_context_unranked(original_question)
    chunks2 = fetch_context_unranked(rewritten_question)
    chunks = merge_chunks(chunks1, chunks2)
    reranked = rerank(original_question, chunks)
    return reranked[:FINAL_K]


@retry(wait=wait)
def answer_question(question: str, history: list[dict] = []) -> tuple[str, list]:
    """
    Answer a question using RAG and return the answer and the retrieved context
    """
    chunks = fetch_context(question)
    messages = make_rag_messages(question, history, chunks)
    response = completion(model=MODEL, messages=messages)
    return response.choices[0].message.content, chunks
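The retrieval flow in answer.py is: rewrite the incoming question, embed both the original and rewritten forms, pull RETRIEVAL_K candidates for each from the Chroma collection, merge and de-duplicate them, have the LLM rerank against the original question, and keep the top FINAL_K chunks as context for the final completion. As a rough illustration of how this module might be exercised, here is a minimal sketch (not part of this commit) that assumes it is run from week5/pro_implementation, that the preprocessed_db vectorstore already exists, and that OPENAI_API_KEY and GROQ_API_KEY are set in .env:

from answer import answer_question

history = []
while True:
    question = input("You: ")
    if not question.strip():
        break
    # answer_question returns the reply plus the chunks that were used as context
    reply, chunks = answer_question(question, history)
    print(f"Assistant: {reply}")
    print(f"(grounded on {len(chunks)} retrieved chunks)")
    # keep the running conversation so the RAG prompt sees prior turns
    history += [
        {"role": "user", "content": question},
        {"role": "assistant", "content": reply},
    ]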
146
week5/pro_implementation/ingest.py
Normal file
@@ -0,0 +1,146 @@
from pathlib import Path
from openai import OpenAI
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from chromadb import PersistentClient
from tqdm import tqdm
from litellm import completion
from multiprocessing import Pool
from tenacity import retry, wait_exponential


load_dotenv(override=True)

MODEL = "openai/gpt-4.1-nano"

DB_NAME = str(Path(__file__).parent.parent / "preprocessed_db")
collection_name = "docs"
embedding_model = "text-embedding-3-large"
KNOWLEDGE_BASE_PATH = Path(__file__).parent.parent / "knowledge-base"
AVERAGE_CHUNK_SIZE = 100
wait = wait_exponential(multiplier=1, min=10, max=240)


WORKERS = 3

openai = OpenAI()


class Result(BaseModel):
    page_content: str
    metadata: dict


class Chunk(BaseModel):
    headline: str = Field(
        description="A brief heading for this chunk, typically a few words, that is most likely to be surfaced in a query",
    )
    summary: str = Field(
        description="A few sentences summarizing the content of this chunk to answer common questions"
    )
    original_text: str = Field(
        description="The original text of this chunk from the provided document, exactly as is, not changed in any way"
    )

    def as_result(self, document):
        metadata = {"source": document["source"], "type": document["type"]}
        return Result(
            page_content=self.headline + "\n\n" + self.summary + "\n\n" + self.original_text,
            metadata=metadata,
        )


class Chunks(BaseModel):
    chunks: list[Chunk]


def fetch_documents():
    """A homemade version of the LangChain DirectoryLoader"""

    documents = []

    for folder in KNOWLEDGE_BASE_PATH.iterdir():
        doc_type = folder.name
        for file in folder.rglob("*.md"):
            with open(file, "r", encoding="utf-8") as f:
                documents.append({"type": doc_type, "source": file.as_posix(), "text": f.read()})

    print(f"Loaded {len(documents)} documents")
    return documents


def make_prompt(document):
    how_many = (len(document["text"]) // AVERAGE_CHUNK_SIZE) + 1
    return f"""
You take a document and you split the document into overlapping chunks for a KnowledgeBase.

The document is from the shared drive of a company called Insurellm.
The document is of type: {document["type"]}
The document has been retrieved from: {document["source"]}

A chatbot will use these chunks to answer questions about the company.
You should divide up the document as you see fit, being sure that the entire document is returned across the chunks - don't leave anything out.
This document should probably be split into at least {how_many} chunks, but you can have more or less as appropriate, ensuring that there are individual chunks to answer specific questions.
There should be overlap between the chunks as appropriate; typically about 25% overlap or about 50 words, so you have the same text in multiple chunks for best retrieval results.

For each chunk, you should provide a headline, a summary, and the original text of the chunk.
Together your chunks should represent the entire document with overlap.

Here is the document:

{document["text"]}

Respond with the chunks.
"""


def make_messages(document):
    return [
        {"role": "user", "content": make_prompt(document)},
    ]


@retry(wait=wait)
def process_document(document):
    messages = make_messages(document)
    response = completion(model=MODEL, messages=messages, response_format=Chunks)
    reply = response.choices[0].message.content
    doc_as_chunks = Chunks.model_validate_json(reply).chunks
    return [chunk.as_result(document) for chunk in doc_as_chunks]


def create_chunks(documents):
    """
    Create chunks using a number of workers in parallel.
    If you get a rate limit error, set the WORKERS to 1.
    """
    chunks = []
    with Pool(processes=WORKERS) as pool:
        for result in tqdm(pool.imap_unordered(process_document, documents), total=len(documents)):
            chunks.extend(result)
    return chunks


def create_embeddings(chunks):
    chroma = PersistentClient(path=DB_NAME)
    if collection_name in [c.name for c in chroma.list_collections()]:
        chroma.delete_collection(collection_name)

    texts = [chunk.page_content for chunk in chunks]
    emb = openai.embeddings.create(model=embedding_model, input=texts).data
    vectors = [e.embedding for e in emb]

    collection = chroma.get_or_create_collection(collection_name)

    ids = [str(i) for i in range(len(chunks))]
    metas = [chunk.metadata for chunk in chunks]

    collection.add(ids=ids, embeddings=vectors, documents=texts, metadatas=metas)
    print(f"Vectorstore created with {collection.count()} documents")


if __name__ == "__main__":
    documents = fetch_documents()
    chunks = create_chunks(documents)
    create_embeddings(chunks)
    print("Ingestion complete")
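ingest.py swaps a mechanical text splitter for LLM-driven chunking: each Markdown document is sent to the model, which returns structured Chunks (headline, summary and original text, with roughly 25% overlap), and the headline + summary + original text concatenation is what gets embedded with text-embedding-3-large and stored in the Chroma collection. A minimal sketch (not part of this commit) for sanity-checking the chunking on a single document before kicking off the full parallel run, assuming it is run from week5/pro_implementation with the API keys set in .env:

from ingest import fetch_documents, process_document

documents = fetch_documents()
# chunk only the first document and inspect what the model produced
for result in process_document(documents[0]):
    print(f"--- {result.metadata['source']} ({result.metadata['type']}) ---")
    # page_content is headline + summary + original text, as built by Chunk.as_result
    print(result.page_content[:300])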