Updated Week 5 with November version
week5/evaluation/eval.py (new file, 248 lines)
@@ -0,0 +1,248 @@
import sys
import math
from pydantic import BaseModel, Field
from litellm import completion
from dotenv import load_dotenv

from evaluation.test import TestQuestion, load_tests
from pro_implementation.answer import answer_question, fetch_context


load_dotenv(override=True)

MODEL = "gpt-4.1-nano"
db_name = "vector_db"


class RetrievalEval(BaseModel):
    """Evaluation metrics for retrieval performance."""

    mrr: float = Field(description="Mean Reciprocal Rank - average across all keywords")
    ndcg: float = Field(description="Normalized Discounted Cumulative Gain (binary relevance)")
    keywords_found: int = Field(description="Number of keywords found in top-k results")
    total_keywords: int = Field(description="Total number of keywords to find")
    keyword_coverage: float = Field(description="Percentage of keywords found")


class AnswerEval(BaseModel):
    """LLM-as-a-judge evaluation of answer quality."""

    feedback: str = Field(
        description="Concise feedback on the answer quality, comparing it to the reference answer and evaluating based on the retrieved context"
    )
    accuracy: float = Field(
        description="How factually correct is the answer compared to the reference answer? 1 (wrong; any wrong answer must score 1) to 5 (ideal - perfectly accurate). An acceptable answer would score 3."
    )
    completeness: float = Field(
        description="How complete is the answer in addressing all aspects of the question? 1 (very poor - missing key information) to 5 (ideal - all the information from the reference answer is provided completely). Only answer 5 if ALL information from the reference answer is included."
    )
    relevance: float = Field(
        description="How relevant is the answer to the specific question asked? 1 (very poor - off-topic) to 5 (ideal - directly addresses question and gives no additional information). Only answer 5 if the answer is completely relevant to the question and gives no additional information."
    )


def calculate_mrr(keyword: str, retrieved_docs: list) -> float:
    """Calculate reciprocal rank for a single keyword (case-insensitive)."""
    keyword_lower = keyword.lower()
    for rank, doc in enumerate(retrieved_docs, start=1):
        if keyword_lower in doc.page_content.lower():
            return 1.0 / rank
    return 0.0
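# Worked example (hypothetical numbers, not from the course data): if the keyword
# first appears in the 3rd retrieved chunk, calculate_mrr returns 1.0 / 3, about 0.33;
# if it never appears in any retrieved chunk, the reciprocal rank is 0.0.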


def calculate_dcg(relevances: list[int], k: int) -> float:
    """Calculate Discounted Cumulative Gain."""
    dcg = 0.0
    for i in range(min(k, len(relevances))):
        dcg += relevances[i] / math.log2(i + 2)  # i+2 because rank starts at 1
    return dcg
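# Worked example (hypothetical relevances): calculate_dcg([1, 0, 1], k=3)
# = 1/log2(2) + 0/log2(3) + 1/log2(4) = 1.0 + 0.0 + 0.5 = 1.5.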


def calculate_ndcg(keyword: str, retrieved_docs: list, k: int = 10) -> float:
    """Calculate nDCG for a single keyword (binary relevance, case-insensitive)."""
    keyword_lower = keyword.lower()

    # Binary relevance: 1 if keyword found, 0 otherwise
    relevances = [
        1 if keyword_lower in doc.page_content.lower() else 0 for doc in retrieved_docs[:k]
    ]

    # DCG
    dcg = calculate_dcg(relevances, k)

    # Ideal DCG (best case: all matching documents ranked first)
    ideal_relevances = sorted(relevances, reverse=True)
    idcg = calculate_dcg(ideal_relevances, k)

    return dcg / idcg if idcg > 0 else 0.0
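# Worked example (hypothetical): if the keyword matches the documents at ranks 1 and 3,
# relevances = [1, 0, 1, 0, ...], so DCG = 1.0 + 0.5 = 1.5, ideal DCG = 1.0 + 1/log2(3)
# (about 1.63), and nDCG is roughly 1.5 / 1.63 = 0.92.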


def evaluate_retrieval(test: TestQuestion, k: int = 10) -> RetrievalEval:
    """
    Evaluate retrieval performance for a test question.

    Args:
        test: TestQuestion object containing question and keywords
        k: Number of top retrieved documents to consider for nDCG (default 10)

    Returns:
        RetrievalEval object with MRR, nDCG, and keyword coverage metrics
    """
    # Retrieve documents using shared answer module
    retrieved_docs = fetch_context(test.question)

    # Calculate MRR (average across all keywords)
    mrr_scores = [calculate_mrr(keyword, retrieved_docs) for keyword in test.keywords]
    avg_mrr = sum(mrr_scores) / len(mrr_scores) if mrr_scores else 0.0

    # Calculate nDCG (average across all keywords)
    ndcg_scores = [calculate_ndcg(keyword, retrieved_docs, k) for keyword in test.keywords]
    avg_ndcg = sum(ndcg_scores) / len(ndcg_scores) if ndcg_scores else 0.0

    # Calculate keyword coverage
    keywords_found = sum(1 for score in mrr_scores if score > 0)
    total_keywords = len(test.keywords)
    keyword_coverage = (keywords_found / total_keywords * 100) if total_keywords > 0 else 0.0

    return RetrievalEval(
        mrr=avg_mrr,
        ndcg=avg_ndcg,
        keywords_found=keywords_found,
        total_keywords=total_keywords,
        keyword_coverage=keyword_coverage,
    )
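# Example (hypothetical numbers): for a test whose two keywords are first found at
# ranks 1 and 2, the reciprocal ranks are 1.0 and 0.5, so mrr = 0.75,
# keywords_found = 2, total_keywords = 2, and keyword_coverage = 100.0.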


def evaluate_answer(test: TestQuestion) -> tuple[AnswerEval, str, list]:
    """
    Evaluate answer quality using LLM-as-a-judge.

    Args:
        test: TestQuestion object containing question and reference answer

    Returns:
        Tuple of (AnswerEval object, generated_answer string, retrieved_docs list)
    """
    # Get RAG response using shared answer module
    generated_answer, retrieved_docs = answer_question(test.question)

    # LLM judge prompt
    judge_messages = [
        {
            "role": "system",
            "content": "You are an expert evaluator assessing the quality of answers. Evaluate the generated answer by comparing it to the reference answer. Only give 5/5 scores for perfect answers.",
        },
        {
            "role": "user",
            "content": f"""Question:
{test.question}

Generated Answer:
{generated_answer}

Reference Answer:
{test.reference_answer}

Please evaluate the generated answer on three dimensions:
1. Accuracy: How factually correct is it compared to the reference answer? Only give 5/5 scores for perfect answers.
2. Completeness: How thoroughly does it address all aspects of the question, covering all the information from the reference answer?
3. Relevance: How well does it directly answer the specific question asked, giving no additional information?

Provide detailed feedback and scores from 1 (very poor) to 5 (ideal) for each dimension. If the answer is wrong, then the accuracy score must be 1.""",
        },
    ]

    # Call LLM judge with structured outputs
    judge_response = completion(model=MODEL, messages=judge_messages, response_format=AnswerEval)

    answer_eval = AnswerEval.model_validate_json(judge_response.choices[0].message.content)

    return answer_eval, generated_answer, retrieved_docs
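# Example (hypothetical) usage once tests are loaded:
#   result, answer, docs = evaluate_answer(tests[0])
#   print(result.accuracy, result.completeness, result.relevance)
# This assumes the configured model supports litellm's structured outputs via
# response_format; otherwise the JSON validation above would fail.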


def evaluate_all_retrieval():
    """Evaluate all retrieval tests."""
    tests = load_tests()
    total_tests = len(tests)
    for index, test in enumerate(tests):
        result = evaluate_retrieval(test)
        progress = (index + 1) / total_tests
        yield test, result, progress


def evaluate_all_answers():
    """Evaluate all answers to tests, yielding results one at a time."""
    tests = load_tests()
    total_tests = len(tests)
    for index, test in enumerate(tests):
        result = evaluate_answer(test)[0]
        progress = (index + 1) / total_tests
        yield test, result, progress
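# Example (hypothetical) consumer of the generators above, e.g. to drive a progress bar:
#   for test, result, progress in evaluate_all_retrieval():
#       print(f"{progress:.0%}  MRR={result.mrr:.2f}  {test.question[:40]}")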


def run_cli_evaluation(test_number: int):
    """Run retrieval and answer evaluation for a specific test (CLI helper)."""
    # Load tests
    tests = load_tests("tests.jsonl")

    if test_number < 0 or test_number >= len(tests):
        print(f"Error: test_row_number must be between 0 and {len(tests) - 1}")
        sys.exit(1)

    # Get the test
    test = tests[test_number]

    # Print test info
    print(f"\n{'=' * 80}")
    print(f"Test #{test_number}")
    print(f"{'=' * 80}")
    print(f"Question: {test.question}")
    print(f"Keywords: {test.keywords}")
    print(f"Category: {test.category}")
    print(f"Reference Answer: {test.reference_answer}")

    # Retrieval Evaluation
    print(f"\n{'=' * 80}")
    print("Retrieval Evaluation")
    print(f"{'=' * 80}")

    retrieval_result = evaluate_retrieval(test)

    print(f"MRR: {retrieval_result.mrr:.4f}")
    print(f"nDCG: {retrieval_result.ndcg:.4f}")
    print(f"Keywords Found: {retrieval_result.keywords_found}/{retrieval_result.total_keywords}")
    print(f"Keyword Coverage: {retrieval_result.keyword_coverage:.1f}%")

    # Answer Evaluation
    print(f"\n{'=' * 80}")
    print("Answer Evaluation")
    print(f"{'=' * 80}")

    answer_result, generated_answer, retrieved_docs = evaluate_answer(test)

    print(f"\nGenerated Answer:\n{generated_answer}")
    print(f"\nFeedback:\n{answer_result.feedback}")
    print("\nScores:")
    print(f"  Accuracy: {answer_result.accuracy:.2f}/5")
    print(f"  Completeness: {answer_result.completeness:.2f}/5")
    print(f"  Relevance: {answer_result.relevance:.2f}/5")
    print(f"\n{'=' * 80}\n")


def main():
    """CLI to evaluate a specific test by row number."""
    if len(sys.argv) != 2:
        print("Usage: uv run eval.py <test_row_number>")
        sys.exit(1)

    try:
        test_number = int(sys.argv[1])
    except ValueError:
        print("Error: test_row_number must be an integer")
        sys.exit(1)

    run_cli_evaluation(test_number)


if __name__ == "__main__":
    main()
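# Example (hypothetical) invocation, assuming tests.jsonl sits in the working directory:
#   uv run eval.py 0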