Updated Week 5 with November version

2025-11-04 07:26:42 -05:00
parent 9132764523
commit e5c3fcab46
81 changed files with 9263 additions and 2725 deletions
--- a/week5/evaluation/eval.py
+++ b/week5/evaluation/eval.py
@@ -0,0 +1,248 @@
+import sys
+import math
+from pydantic import BaseModel, Field
+from litellm import completion
+from dotenv import load_dotenv
+
+from evaluation.test import TestQuestion, load_tests
+from pro_implementation.answer import answer_question, fetch_context
+
+
+# Load variables from a .env file; override=True lets .env values replace ones already set in the environment.
+load_dotenv(override=True)
+
+# Model name passed to litellm's `completion` for the LLM-as-a-judge call in evaluate_answer.
+MODEL = "gpt-4.1-nano"
+# NOTE(review): not referenced anywhere else in this module - presumably the vector store name; confirm it is still needed.
+db_name = "vector_db"
+
+
+class RetrievalEval(BaseModel):
+    """Retrieval metrics for a single test question.
+
+    Instances are built by evaluate_retrieval. The mrr and ndcg fields are
+    per-keyword scores averaged over the question's keywords.
+    """
+
+    # Averages of the per-keyword scores; evaluate_retrieval reports 0.0 when a question has no keywords.
+    mrr: float = Field(description="Mean Reciprocal Rank - average across all keywords")
+    ndcg: float = Field(description="Normalized Discounted Cumulative Gain (binary relevance)")
+    # Raw counts behind the coverage figure.
+    keywords_found: int = Field(description="Number of keywords found in top-k results")
+    total_keywords: int = Field(description="Total number of keywords to find")
+    # Stored on a 0-100 scale (evaluate_retrieval multiplies the ratio by 100), not as a 0-1 fraction.
+    keyword_coverage: float = Field(description="Percentage of keywords found")
+
+
+class AnswerEval(BaseModel):
+    """Structured verdict returned by the LLM judge for one generated answer.
+
+    evaluate_answer passes this class as the `response_format` of the judge
+    call and parses the reply with `model_validate_json`.
+    """
+
+    # NOTE(review): the descriptions below are presumably sent to the judge model as part of the
+    # response schema, so they act as scoring instructions - treat them as prompt text, not comments.
+    feedback: str = Field(
+        description="Concise feedback on the answer quality, comparing it to the reference answer and evaluating based on the retrieved context"
+    )
+    # The three scores share a 1-5 scale; they are declared float, so fractional scores are accepted.
+    accuracy: float = Field(
+        description="How factually correct is the answer compared to the reference answer? 1 (wrong. any wrong answer must score 1) to 5 (ideal - perfectly accurate). An acceptable answer would score 3."
+    )
+    completeness: float = Field(
+        description="How complete is the answer in addressing all aspects of the question? 1 (very poor - missing key information) to 5 (ideal - all the information from the reference answer is provided completely). Only answer 5 if ALL information from the reference answer is included."
+    )
+    relevance: float = Field(
+        description="How relevant is the answer to the specific question asked? 1 (very poor - off-topic) to 5 (ideal - directly addresses question and gives no additional information). Only answer 5 if the answer is completely relevant to the question and gives no additional information."
+    )
+
+
+def calculate_mrr(keyword: str, retrieved_docs: list) -> float:
+    """Calculate reciprocal rank for a single keyword (case-insensitive)."""
+    keyword_lower = keyword.lower()
+    for rank, doc in enumerate(retrieved_docs, start=1):
+        if keyword_lower in doc.page_content.lower():
+            return 1.0 / rank
+    return 0.0
+
+
+def calculate_dcg(relevances: list[int], k: int) -> float:
+    """Calculate Discounted Cumulative Gain."""
+    dcg = 0.0
+    for i in range(min(k, len(relevances))):
+        dcg += relevances[i] / math.log2(i + 2)  # i+2 because rank starts at 1
+    return dcg
+
+
+def calculate_ndcg(keyword: str, retrieved_docs: list, k: int = 10) -> float:
+    """Calculate nDCG for a single keyword (binary relevance, case-insensitive)."""
+    keyword_lower = keyword.lower()
+
+    # Binary relevance: 1 if keyword found, 0 otherwise
+    relevances = [
+        1 if keyword_lower in doc.page_content.lower() else 0 for doc in retrieved_docs[:k]
+    ]
+
+    # DCG
+    dcg = calculate_dcg(relevances, k)
+
+    # Ideal DCG (best case: keyword in first position)
+    ideal_relevances = sorted(relevances, reverse=True)
+    idcg = calculate_dcg(ideal_relevances, k)
+
+    return dcg / idcg if idcg > 0 else 0.0
+
+
+def evaluate_retrieval(test: TestQuestion, k: int = 10) -> RetrievalEval:
+    """
+    Evaluate retrieval performance for a test question.
+
+    Args:
+        test: TestQuestion object containing question and keywords
+        k: Number of top documents to retrieve (default 10)
+
+    Returns:
+        RetrievalEval object with MRR, nDCG, and keyword coverage metrics
+    """
+    # Retrieve documents using shared answer module
+    retrieved_docs = fetch_context(test.question)
+
+    # Calculate MRR (average across all keywords)
+    mrr_scores = [calculate_mrr(keyword, retrieved_docs) for keyword in test.keywords]
+    avg_mrr = sum(mrr_scores) / len(mrr_scores) if mrr_scores else 0.0
+
+    # Calculate nDCG (average across all keywords)
+    ndcg_scores = [calculate_ndcg(keyword, retrieved_docs, k) for keyword in test.keywords]
+    avg_ndcg = sum(ndcg_scores) / len(ndcg_scores) if ndcg_scores else 0.0
+
+    # Calculate keyword coverage
+    keywords_found = sum(1 for score in mrr_scores if score > 0)
+    total_keywords = len(test.keywords)
+    keyword_coverage = (keywords_found / total_keywords * 100) if total_keywords > 0 else 0.0
+
+    return RetrievalEval(
+        mrr=avg_mrr,
+        ndcg=avg_ndcg,
+        keywords_found=keywords_found,
+        total_keywords=total_keywords,
+        keyword_coverage=keyword_coverage,
+    )
+
+
+def evaluate_answer(test: TestQuestion) -> tuple[AnswerEval, str, list]:
+    """
+    Evaluate answer quality using LLM-as-a-judge (async).
+
+    Args:
+        test: TestQuestion object containing question and reference answer
+
+    Returns:
+        Tuple of (AnswerEval object, generated_answer string, retrieved_docs list)
+    """
+    # Get RAG response using shared answer module
+    generated_answer, retrieved_docs = answer_question(test.question)
+
+    # LLM judge prompt
+    judge_messages = [
+        {
+            "role": "system",
+            "content": "You are an expert evaluator assessing the quality of answers. Evaluate the generated answer by comparing it to the reference answer. Only give 5/5 scores for perfect answers.",
+        },
+        {
+            "role": "user",
+            "content": f"""Question:
+{test.question}
+
+Generated Answer:
+{generated_answer}
+
+Reference Answer:
+{test.reference_answer}
+
+Please evaluate the generated answer on three dimensions:
+1. Accuracy: How factually correct is it compared to the reference answer? Only give 5/5 scores for perfect answers.
+2. Completeness: How thoroughly does it address all aspects of the question, covering all the information from the reference answer?
+3. Relevance: How well does it directly answer the specific question asked, giving no additional information?
+
+Provide detailed feedback and scores from 1 (very poor) to 5 (ideal) for each dimension. If the answer is wrong, then the accuracy score must be 1.""",
+        },
+    ]
+
+    # Call LLM judge with structured outputs (async)
+    judge_response = completion(model=MODEL, messages=judge_messages, response_format=AnswerEval)
+
+    answer_eval = AnswerEval.model_validate_json(judge_response.choices[0].message.content)
+
+    return answer_eval, generated_answer, retrieved_docs
+
+
+def evaluate_all_retrieval():
+    """Evaluate all retrieval tests."""
+    tests = load_tests()
+    total_tests = len(tests)
+    for index, test in enumerate(tests):
+        result = evaluate_retrieval(test)
+        progress = (index + 1) / total_tests
+        yield test, result, progress
+
+
+def evaluate_all_answers():
+    """Evaluate all answers to tests using batched async execution."""
+    tests = load_tests()
+    total_tests = len(tests)
+    for index, test in enumerate(tests):
+        result = evaluate_answer(test)[0]
+        progress = (index + 1) / total_tests
+        yield test, result, progress
+
+
+def run_cli_evaluation(test_number: int):
+    """Run evaluation for a specific test (async helper for CLI)."""
+    # Load tests
+    tests = load_tests("tests.jsonl")
+
+    if test_number < 0 or test_number >= len(tests):
+        print(f"Error: test_row_number must be between 0 and {len(tests) - 1}")
+        sys.exit(1)
+
+    # Get the test
+    test = tests[test_number]
+
+    # Print test info
+    print(f"\n{'=' * 80}")
+    print(f"Test #{test_number}")
+    print(f"{'=' * 80}")
+    print(f"Question: {test.question}")
+    print(f"Keywords: {test.keywords}")
+    print(f"Category: {test.category}")
+    print(f"Reference Answer: {test.reference_answer}")
+
+    # Retrieval Evaluation
+    print(f"\n{'=' * 80}")
+    print("Retrieval Evaluation")
+    print(f"{'=' * 80}")
+
+    retrieval_result = evaluate_retrieval(test)
+
+    print(f"MRR: {retrieval_result.mrr:.4f}")
+    print(f"nDCG: {retrieval_result.ndcg:.4f}")
+    print(f"Keywords Found: {retrieval_result.keywords_found}/{retrieval_result.total_keywords}")
+    print(f"Keyword Coverage: {retrieval_result.keyword_coverage:.1f}%")
+
+    # Answer Evaluation
+    print(f"\n{'=' * 80}")
+    print("Answer Evaluation")
+    print(f"{'=' * 80}")
+
+    answer_result, generated_answer, retrieved_docs = evaluate_answer(test)
+
+    print(f"\nGenerated Answer:\n{generated_answer}")
+    print(f"\nFeedback:\n{answer_result.feedback}")
+    print("\nScores:")
+    print(f"  Accuracy: {answer_result.accuracy:.2f}/5")
+    print(f"  Completeness: {answer_result.completeness:.2f}/5")
+    print(f"  Relevance: {answer_result.relevance:.2f}/5")
+    print(f"\n{'=' * 80}\n")
+
+
+def main():
+    """CLI to evaluate a specific test by row number."""
+    if len(sys.argv) != 2:
+        print("Usage: uv run eval.py <test_row_number>")
+        sys.exit(1)
+
+    try:
+        test_number = int(sys.argv[1])
+    except ValueError:
+        print("Error: test_row_number must be an integer")
+        sys.exit(1)
+
+    run_cli_evaluation(test_number)
+
+
+# Run the CLI only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main()
--- a/week5/evaluation/test.py
+++ b/week5/evaluation/test.py
@@ -0,0 +1,24 @@
+import json
+from pathlib import Path
+from pydantic import BaseModel, Field
+
+# Path of tests.jsonl next to this module, so loading does not depend on the current working directory.
+TEST_FILE = str(Path(__file__).parent / "tests.jsonl")
+
+
+class TestQuestion(BaseModel):
+    """A test question with expected keywords and reference answer.
+
+    load_tests builds one instance from each JSON object in the test file.
+    """
+
+    # All four fields are required: none declares a default.
+    question: str = Field(description="The question to ask the RAG system")
+    keywords: list[str] = Field(description="Keywords that must appear in retrieved context")
+    reference_answer: str = Field(description="The reference answer for this question")
+    category: str = Field(description="Question category (e.g., direct_fact, spanning, temporal)")
+
+
+def load_tests() -> list[TestQuestion]:
+    """Load test questions from JSONL file."""
+    tests = []
+    with open(TEST_FILE, "r", encoding="utf-8") as f:
+        for line in f:
+            data = json.loads(line.strip())
+            tests.append(TestQuestion(**data))
+    return tests
--- a/week5/evaluation/tests.jsonl
+++ b/week5/evaluation/tests.jsonl
@@ -0,0 +1,150 @@
+{"question": "Who won the prestigious IIOTY award in 2023?", "keywords": ["Maxine", "Thompson", "IIOTY"], "reference_answer": "Maxine Thompson won the prestigious Insurellm Innovator of the Year (IIOTY) award in 2023.", "category": "direct_fact"}
+{"question": "When was Insurellm founded?", "keywords": ["2015", "founded"], "reference_answer": "Insurellm was founded in 2015.", "category": "direct_fact"}
+{"question": "Who founded Insurellm?", "keywords": ["Avery", "Lancaster"], "reference_answer": "Avery Lancaster founded Insurellm in 2015.", "category": "direct_fact"}
+{"question": "How many employees does Insurellm currently have?", "keywords": ["32", "employees"], "reference_answer": "Insurellm currently operates with 32 employees as of 2025.", "category": "direct_fact"}
+{"question": "What was Insurellm's first product?", "keywords": ["Markellm", "first"], "reference_answer": "Insurellm's first product was Markellm, the marketplace connecting consumers with insurance providers.", "category": "direct_fact"}
+{"question": "How many active contracts does Insurellm manage?", "keywords": ["32", "contracts"], "reference_answer": "Insurellm manages 32 active contracts spanning all eight product lines.", "category": "direct_fact"}
+{"question": "Where is Insurellm's headquarters located?", "keywords": ["San Francisco", "headquarters"], "reference_answer": "Insurellm's headquarters is located in San Francisco.", "category": "direct_fact"}
+{"question": "How many office locations does Insurellm maintain?", "keywords": ["offices in", "5"], "reference_answer": "Insurellm maintains 5 office locations: San Francisco, New York, Austin, Chicago, and Denver.", "category": "direct_fact"}
+{"question": "What is Insurellm's vision statement?", "keywords": ["revolutionize", "insurance", "technology"], "reference_answer": "Insurellm's vision is to revolutionize the insurance industry through innovative technology that makes insurance accessible, transparent, and effortless.", "category": "direct_fact"}
+{"question": "How many Bizllm contracts has Insurellm secured?", "keywords": ["7", "Bizllm"], "reference_answer": "Insurellm has secured 7 commercial insurance contracts for Bizllm.", "category": "direct_fact"}
+{"question": "How many Claimllm contracts does Insurellm have?", "keywords": ["7", "Claimllm"], "reference_answer": "Insurellm has 7 contracts for Claimllm, ranging from independent adjusting firms to enterprise claims networks.", "category": "direct_fact"}
+{"question": "How many life insurance contracts has Lifellm secured?", "keywords": ["6", "Lifellm"], "reference_answer": "Lifellm has secured 6 life insurance contracts serving carriers from small regional providers to major national groups.", "category": "direct_fact"}
+{"question": "How many health insurance contracts does Healthllm have?", "keywords": ["6", "Healthllm"], "reference_answer": "Healthllm has 6 health insurance contracts with plans from regional insurers to multi-state healthcare alliances.", "category": "direct_fact"}
+{"question": "What is the monthly cost of Carllm's Basic Tier?", "keywords": ["$1,000", "Basic", "Carllm"], "reference_answer": "Carllm's Basic Tier costs $1,000 per month.", "category": "direct_fact"}
+{"question": "What is the monthly cost of Carllm's Enterprise Tier?", "keywords": ["$5,000", "Enterprise", "Carllm"], "reference_answer": "Carllm's Enterprise Tier costs $5,000 per month.", "category": "direct_fact"}
+{"question": "What is the monthly cost of Homellm's Standard Tier?", "keywords": ["$10,000", "Standard", "Homellm"], "reference_answer": "Homellm's Standard Tier costs $10,000 per month for medium-sized insurers.", "category": "direct_fact"}
+{"question": "What is the monthly cost of Bizllm's Professional Tier?", "keywords": ["$12,000", "Professional", "Bizllm"], "reference_answer": "Bizllm's Professional Tier costs $12,000 per month for multi-line carriers.", "category": "direct_fact"}
+{"question": "What is the monthly cost of Claimllm's Core Tier?", "keywords": ["$4,500", "Core", "Claimllm"], "reference_answer": "Claimllm's Core Tier costs $4,500 per month for smaller insurers processing up to 5,000 claims annually.", "category": "direct_fact"}
+{"question": "What is the monthly cost of Healthllm's Essential Tier?", "keywords": ["$8,000", "Essential", "Healthllm"], "reference_answer": "Healthllm's Essential Tier costs $8,000 per month for regional health plans.", "category": "direct_fact"}
+{"question": "What is the monthly cost of Lifellm's Starter Tier?", "keywords": ["$3,500", "Starter", "Lifellm"], "reference_answer": "Lifellm's Starter Tier costs $3,500 per month for small insurers.", "category": "direct_fact"}
+{"question": "What is the monthly cost of Rellm's Basic Plan?", "keywords": ["$5,000", "Basic", "Rellm"], "reference_answer": "Rellm's Basic Plan costs $5,000 per month.", "category": "direct_fact"}
+{"question": "What is Maxine Thompson's current job title?", "keywords": ["Senior Data Engineer", "Maxine"], "reference_answer": "Maxine Thompson's current job title is Senior Data Engineer.", "category": "direct_fact"}
+{"question": "What is Maxine Thompson's current salary?", "keywords": ["$120,000", "Maxine"], "reference_answer": "Maxine Thompson's current salary is $120,000.", "category": "direct_fact"}
+{"question": "Where is Maxine Thompson located?", "keywords": ["Austin", "Texas", "Maxine"], "reference_answer": "Maxine Thompson is located in Austin, Texas.", "category": "direct_fact"}
+{"question": "What is Avery Lancaster's job title?", "keywords": ["CEO", "Avery", "Lancaster"], "reference_answer": "Avery Lancaster is the Co-Founder and Chief Executive Officer (CEO) of Insurellm.", "category": "direct_fact"}
+{"question": "What is Avery Lancaster's current salary?", "keywords": ["$225,000", "Avery"], "reference_answer": "Avery Lancaster's current salary is $225,000.", "category": "direct_fact"}
+{"question": "What is James Wilson's job title?", "keywords": ["CTO", "James", "Wilson"], "reference_answer": "James Wilson is the Chief Technology Officer (CTO) of Insurellm.", "category": "direct_fact"}
+{"question": "What is James Wilson's current salary?", "keywords": ["$285,000", "James", "Wilson"], "reference_answer": "James Wilson's current salary is $285,000.", "category": "direct_fact"}
+{"question": "What is Priya Sharma's job title?", "keywords": ["Senior Data Scientist", "Priya"], "reference_answer": "Priya Sharma is a Senior Data Scientist at Insurellm.", "category": "direct_fact"}
+{"question": "What is Priya Sharma's current salary?", "keywords": ["$145,000", "Priya"], "reference_answer": "Priya Sharma's current salary is $145,000.", "category": "direct_fact"}
+{"question": "What is Robert Chen's job title?", "keywords": ["Senior Full Stack Engineer", "Robert"], "reference_answer": "Robert Chen is a Senior Full Stack Engineer at Insurellm.", "category": "direct_fact"}
+{"question": "What is Robert Chen's current salary?", "keywords": ["$152,000", "Robert"], "reference_answer": "Robert Chen's current salary is $152,000.", "category": "direct_fact"}
+{"question": "Which product does Robert Chen serve as technical lead for?", "keywords": ["Homellm", "Robert", "Chen"], "reference_answer": "Robert Chen is the technical lead for Homellm home insurance portal.", "category": "direct_fact"}
+{"question": "What is David Kim's job title?", "keywords": ["DevOps Engineer", "David", "Kim"], "reference_answer": "David Kim is a DevOps Engineer at Insurellm.", "category": "direct_fact"}
+{"question": "What is Sarah Williams's job title?", "keywords": ["UX Designer", "Sarah"], "reference_answer": "Sarah Williams is a UX Designer at Insurellm.", "category": "direct_fact"}
+{"question": "Which product does Sarah Williams lead design for?", "keywords": ["Homellm", "Sarah"], "reference_answer": "Sarah Williams leads design for the Homellm home insurance portal.", "category": "direct_fact"}
+{"question": "What is Marcus Johnson's job title?", "keywords": ["Customer Success Manager", "Marcus"], "reference_answer": "Marcus Johnson is a Customer Success Manager at Insurellm.", "category": "direct_fact"}
+{"question": "What is Marcus Johnson's client retention rate?", "keywords": ["95%", "retention", "Marcus"], "reference_answer": "Marcus Johnson achieved a 95% client retention rate over the past three years.", "category": "direct_fact"}
+{"question": "What is Lisa Anderson's job title?", "keywords": ["Marketing Manager", "Lisa"], "reference_answer": "Lisa Anderson is a Marketing Manager at Insurellm.", "category": "direct_fact"}
+{"question": "What is the size of Lisa Anderson's annual marketing budget?", "keywords": ["$2M", "Lisa"], "reference_answer": "Lisa Anderson manages a $2M annual marketing budget.", "category": "direct_fact"}
+{"question": "What is Emily Carter's job title?", "keywords": ["Account Executive", "Emily"], "reference_answer": "Emily Carter is an Account Executive at Insurellm.", "category": "direct_fact"}
+{"question": "What is the contract number for DriveSmart Insurance's Carllm agreement?", "keywords": ["CR-2025-E-0078", "DriveSmart"], "reference_answer": "The contract number for DriveSmart Insurance's Carllm agreement is CR-2025-E-0078.", "category": "direct_fact"}
+{"question": "What is the duration of the DriveSmart Insurance Carllm contract?", "keywords": ["36 months", "DriveSmart"], "reference_answer": "The DriveSmart Insurance Carllm contract is effective for a period of 36 months.", "category": "direct_fact"}
+{"question": "What is the total contract value for DriveSmart Insurance's Carllm agreement?", "keywords": ["$702,000", "DriveSmart"], "reference_answer": "The total contract value for DriveSmart Insurance's Carllm agreement is $702,000 over the 36-month term.", "category": "direct_fact"}
+{"question": "How many active auto policies does DriveSmart Insurance have?", "keywords": ["85,000", "DriveSmart", "policies"], "reference_answer": "DriveSmart Insurance currently has 85,000 active auto policies across 8 states.", "category": "direct_fact"}
+{"question": "What is the uptime SLA guarantee for DriveSmart Insurance?", "keywords": ["99.9%", "uptime", "DriveSmart"], "reference_answer": "Insurellm guarantees 99.9% platform uptime measured monthly for DriveSmart Insurance.", "category": "direct_fact"}
+{"question": "Who signed the DriveSmart Insurance contract on behalf of Insurellm?", "keywords": ["Jennifer Rodriguez", "DriveSmart"], "reference_answer": "Jennifer Rodriguez, CEO of Insurellm, signed the DriveSmart Insurance contract.", "category": "direct_fact"}
+{"question": "What is the monthly payment for Greenstone Insurance's Homellm contract?", "keywords": ["$10,000", "Greenstone"], "reference_answer": "Greenstone Insurance pays $10,000 per month for the Standard Tier of the Homellm service.", "category": "direct_fact"}
+{"question": "What tier did Harmony Health Plans subscribe to for Healthllm?", "keywords": ["Professional", "Harmony"], "reference_answer": "Harmony Health Plans subscribed to the Professional Tier of Healthllm.", "category": "direct_fact"}
+{"question": "What is the monthly cost for Harmony Health Plans' Healthllm subscription?", "keywords": ["$15,000", "Harmony"], "reference_answer": "Harmony Health Plans pays $15,000 per month for the Professional Tier of Healthllm.", "category": "direct_fact"}
+{"question": "How many covered members does Harmony Health Plans have?", "keywords": ["38,000", "Harmony"], "reference_answer": "Harmony Health Plans currently covers 38,000 members across 3 states.", "category": "direct_fact"}
+{"question": "Who signed the Harmony Health Plans contract on behalf of Insurellm?", "keywords": ["Sarah Chen", "Harmony"], "reference_answer": "Sarah Chen, Vice President of Sales at Insurellm, signed the Harmony Health Plans contract.", "category": "direct_fact"}
+{"question": "What is the contract number for Metropolitan Life Group's Lifellm agreement?", "keywords": ["LF-2025-E-0087", "Metropolitan"], "reference_answer": "The contract number for Metropolitan Life Group's Lifellm agreement is LF-2025-E-0087.", "category": "direct_fact"}
+{"question": "What is the total contract value for Metropolitan Life Group's Lifellm agreement?", "keywords": ["$1,098,000", "Metropolitan"], "reference_answer": "The total contract value for Metropolitan Life Group's Lifellm agreement is $1,098,000 over the 36-month term.", "category": "direct_fact"}
+{"question": "How many active policies does Metropolitan Life Group manage?", "keywords": ["50,000", "Metropolitan"], "reference_answer": "Metropolitan Life Group manages 50,000+ active policies.", "category": "direct_fact"}
+{"question": "Who signed the Metropolitan Life Group contract on behalf of the client?", "keywords": ["Richard Thompson", "Metropolitan"], "reference_answer": "Richard Thompson, Chairman & Chief Executive Officer, signed the contract on behalf of Metropolitan Life Group.", "category": "direct_fact"}
+{"question": "What tier did FastTrack Insurance Services subscribe to for Claimllm?", "keywords": ["Advanced", "FastTrack"], "reference_answer": "FastTrack Insurance Services subscribed to the Advanced Tier of Claimllm.", "category": "direct_fact"}
+{"question": "What is the monthly payment for FastTrack Insurance Services' Claimllm contract?", "keywords": ["$9,500", "FastTrack"], "reference_answer": "FastTrack Insurance Services pays $9,500 per month for Claimllm Advanced Tier.", "category": "direct_fact"}
+{"question": "What is the duration of the FastTrack Insurance Services contract?", "keywords": ["18 months", "FastTrack"], "reference_answer": "The FastTrack Insurance Services contract has a duration of 18 months.", "category": "direct_fact"}
+{"question": "Who signed the FastTrack Insurance Services contract on behalf of Insurellm?", "keywords": ["Sarah Chen", "FastTrack"], "reference_answer": "Sarah Chen, VP of Sales, signed the FastTrack Insurance Services contract on behalf of Insurellm.", "category": "direct_fact"}
+{"question": "What tier did Atlantic Risk Solutions subscribe to for Bizllm?", "keywords": ["Professional", "Atlantic"], "reference_answer": "Atlantic Risk Solutions subscribed to the Professional Tier of Bizllm.", "category": "direct_fact"}
+{"question": "What is the monthly cost for Atlantic Risk Solutions' Bizllm subscription?", "keywords": ["$12,000", "Atlantic"], "reference_answer": "Atlantic Risk Solutions pays $12,000 per month for Bizllm Professional Tier.", "category": "direct_fact"}
+{"question": "How many user licenses are included in Atlantic Risk Solutions' contract?", "keywords": ["35", "Atlantic"], "reference_answer": "Atlantic Risk Solutions' contract includes 35 named user licenses.", "category": "direct_fact"}
+{"question": "Who signed the Atlantic Risk Solutions contract on behalf of Insurellm?", "keywords": ["Michael Torres", "Atlantic"], "reference_answer": "Michael Torres, Chief Revenue Officer, signed the Atlantic Risk Solutions contract on behalf of Insurellm.", "category": "direct_fact"}
+{"question": "What is the monthly fee for Belvedere Insurance's Markellm listing?", "keywords": ["$199", "Belvedere"], "reference_answer": "Belvedere Insurance pays a Basic Listing Fee of $199 per month for accessing the Markellm platform.", "category": "direct_fact"}
+{"question": "When did Maxine Thompson join Insurellm?", "keywords": ["January 2017", "Maxine"], "reference_answer": "Maxine Thompson joined Insurellm in January 2017 as a Junior Data Engineer.", "category": "temporal"}
+{"question": "When did James Wilson join Insurellm?", "keywords": ["January 2017", "James", "Wilson"], "reference_answer": "James Wilson joined Insurellm as Chief Technology Officer in January 2017.", "category": "temporal"}
+{"question": "When did Insurellm reach its peak of 200 employees?", "keywords": ["2020", "200"], "reference_answer": "Insurellm reached a peak of 200 employees in 2020.", "category": "temporal"}
+{"question": "When did Insurellm undergo strategic restructuring?", "keywords": ["2022-2023", "restructuring"], "reference_answer": "Insurellm underwent strategic restructuring in 2022-2023 to focus on profitability and sustainable growth.", "category": "temporal"}
+{"question": "When is Carllm's telematics-based pricing feature scheduled to launch?", "keywords": ["Q2 2025", "telematics"], "reference_answer": "Carllm's telematics-based pricing is scheduled to launch in Q2 2025.", "category": "temporal"}
+{"question": "When is Homellm version 2.0 scheduled for release?", "keywords": ["Q1 2025", "Homellm"], "reference_answer": "Homellm version 2.0 is scheduled for release in Q1 2025.", "category": "temporal"}
+{"question": "When is Bizllm version 1.0 scheduled to launch?", "keywords": ["Q2 2025", "Bizllm"], "reference_answer": "Bizllm version 1.0 is scheduled to launch in Q2 2025 with core multi-line underwriting, quoting, and policy administration.", "category": "temporal"}
+{"question": "When is Claimllm version 1.0 scheduled to launch?", "keywords": ["Q1 2025", "Claimllm"], "reference_answer": "Claimllm version 1.0 is scheduled to launch in Q1 2025 with core claims processing automation, FNOL capture, and basic fraud detection.", "category": "temporal"}
+{"question": "When is Lifellm version 1.0 scheduled to launch?", "keywords": ["Q2 2025", "Lifellm"], "reference_answer": "Lifellm version 1.0 is scheduled to launch in Q2 2025 with core AI underwriting and policy management capabilities.", "category": "temporal"}
+{"question": "When is Healthllm version 1.0 scheduled to launch?", "keywords": ["Q1 2025", "Healthllm"], "reference_answer": "Healthllm version 1.0 is scheduled to launch in Q1 2025 featuring core claims processing, eligibility verification, and member portal.", "category": "temporal"}
+{"question": "When was the DriveSmart Insurance contract for Carllm signed?", "keywords": ["March 20, 2025", "DriveSmart"], "reference_answer": "The DriveSmart Insurance contract for Carllm was signed on March 20, 2025.", "category": "temporal"}
+{"question": "When was the Harmony Health Plans Healthllm contract effective?", "keywords": ["January 25, 2025", "Harmony"], "reference_answer": "The Harmony Health Plans Healthllm contract became effective on January 25, 2025.", "category": "temporal"}
+{"question": "When was the Metropolitan Life Group Lifellm contract signed?", "keywords": ["April 5, 2025", "Metropolitan"], "reference_answer": "The Metropolitan Life Group Lifellm contract was signed on April 5, 2025.", "category": "temporal"}
+{"question": "When was the FastTrack Insurance Services Claimllm contract signed?", "keywords": ["May 10, 2025", "FastTrack"], "reference_answer": "The FastTrack Insurance Services Claimllm contract was signed on May 10, 2025.", "category": "temporal"}
+{"question": "When was the Atlantic Risk Solutions Bizllm contract effective?", "keywords": ["January 15, 2025", "Atlantic"], "reference_answer": "The Atlantic Risk Solutions Bizllm contract became effective on January 15, 2025.", "category": "temporal"}
+{"question": "How much did Priya Sharma's recommendation engine increase conversion by for Marketllm?", "keywords": ["28%", "Priya", "Marketllm"], "reference_answer": "Priya Sharma built a recommendation engine for Marketllm that increased conversion by 28%.", "category": "comparative"}
+{"question": "By what percentage did David Kim reduce deployment time with CI/CD pipelines?", "keywords": ["60%", "David", "Kim"], "reference_answer": "David Kim implemented CI/CD pipelines that reduced deployment time by 60%.", "category": "comparative"}
+{"question": "By what percentage did Sarah Williams improve user satisfaction scores?", "keywords": ["35%", "Sarah"], "reference_answer": "Sarah Williams conducted comprehensive user research that resulted in a 35% improvement in user satisfaction scores.", "category": "comparative"}
+{"question": "By what percentage has Lisa Anderson increased qualified leads since 2021?", "keywords": ["65%", "Lisa"], "reference_answer": "Lisa Anderson increased qualified leads by 65% since 2021.", "category": "comparative"}
+{"question": "By what percentage did Emily Carter exceed her annual sales target in 2022?", "keywords": ["30%", "Emily", "Carter"], "reference_answer": "Emily Carter exceeded her annual sales target by 30% in 2022.", "category": "comparative"}
+{"question": "What is the performance-based pricing for Belvedere Insurance's Markellm contract?", "keywords": ["$25", "Belvedere"], "reference_answer": "Belvedere Insurance pays $25 per lead generated through Markellm on a performance-based pricing model.", "category": "comparative"}
+{"question": "What is the monthly cost for Stellar Insurance Co.'s Rellm subscription?", "keywords": ["$10,000", "Stellar"], "reference_answer": "Stellar Insurance Co. pays $10,000 per month for the Professional Plan of Rellm.", "category": "comparative"}
+{"question": "What are Insurellm's four core values?", "keywords": ["Innovation First", "Customer Obsession", "Collaborative Excellence"], "reference_answer": "Insurellm's four core values are Innovation First, Customer Obsession, Integrity & Transparency, and Collaborative Excellence.", "category": "comparative"}
+{"question": "What award did Alex Harper win in 2022?", "keywords": ["SDR of the Year", "Alex", "Harper"], "reference_answer": "Alex Harper was awarded SDR of the Year in 2022 for outstanding contributions.", "category": "comparative"}
+{"question": "When was the Stellar Insurance Co. Rellm contract effective?", "keywords": ["January 1, 2024", "Stellar"], "reference_answer": "The Stellar Insurance Co. Rellm contract became effective on January 1, 2024.", "category": "comparative"}
+{"question": "How many claims does FastTrack Insurance Services project for year 1?", "keywords": ["22,000", "FastTrack"], "reference_answer": "FastTrack Insurance Services projects 22,000 claims in year 1.", "category": "numerical"}
+{"question": "How much annual revenue does Marcus Johnson's portfolio generate?", "keywords": ["$5M", "Marcus"], "reference_answer": "Marcus Johnson manages a portfolio of 25 enterprise clients generating $5M in annual revenue.", "category": "numerical"}
+{"question": "What percentage of quota did Jennifer Adams achieve in 2023?", "keywords": ["78%", "Jennifer"], "reference_answer": "Jennifer Adams achieved 78% of her lead generation quota in 2023.", "category": "numerical"}
+{"question": "How many products does Insurellm offer?", "keywords": ["8", "products"], "reference_answer": "Insurellm offers 8 insurance software products across multiple insurance lines.", "category": "numerical"}
+{"question": "What is the monthly cost of Carllm's Professional Tier?", "keywords": ["$2,500", "Professional", "Carllm"], "reference_answer": "Carllm's Professional Tier costs $2,500 per month.", "category": "numerical"}
+{"question": "What are the core insurance portals offered by Insurellm?", "keywords": ["Core Insurance Portals", "Carllm"], "reference_answer": "Insurellm's core insurance portals are Carllm (auto), Homellm (home), Lifellm (life), Healthllm (health), and Bizllm (commercial).", "category": "relationship"}
+{"question": "What are the marketplace and infrastructure products offered by Insurellm?", "keywords": ["Marketplace & Infrastructure", "Markellm"], "reference_answer": "Insurellm's marketplace and infrastructure products are Markellm (marketplace), Claimllm (claims processing), and Rellm (reinsurance).", "category": "relationship"}
+{"question": "Which product does Jessica Liu develop for?", "keywords": ["Rellm", "Jessica"], "reference_answer": "Jessica Liu develops user interfaces for the Rellm reinsurance platform using React.", "category": "relationship"}
+{"question": "What is Tyler Brooks's job title?", "keywords": ["Junior Backend Developer", "Tyler"], "reference_answer": "Tyler Brooks is a Junior Backend Developer at Insurellm.", "category": "relationship"}
+{"question": "Which product does Tyler Brooks work on?", "keywords": ["Carllm", "Tyler"], "reference_answer": "Tyler Brooks develops backend services for the Carllm auto insurance portal.", "category": "relationship"}
+{"question": "What is the job title of the IIOTY award winner?", "keywords": ["Maxine", "Thompson", "Senior Data Engineer", "IIOTY"], "reference_answer": "Maxine Thompson, who won the IIOTY award in 2023, works as a Senior Data Engineer.", "category": "spanning"}
+{"question": "Who is the technical lead for the product that costs $10,000/month for Standard Tier?", "keywords": ["Robert", "Chen", "Homellm", "$10,000"], "reference_answer": "Robert Chen is the technical lead for Homellm, which costs $10,000/month for the Standard Tier.", "category": "spanning"}
+{"question": "What is the salary of the CTO who joined in January 2017?", "keywords": ["James", "Wilson", "$285,000", "January 2017"], "reference_answer": "James Wilson, who joined as CTO in January 2017, has a current salary of $285,000.", "category": "spanning"}
+{"question": "What is the current salary of the founder of Insurellm?", "keywords": ["Avery", "Lancaster", "$225,000", "founder"], "reference_answer": "Avery Lancaster, the founder of Insurellm, has a current salary of $225,000.", "category": "spanning"}
+{"question": "Which product did the Senior Data Scientist build a recommendation engine for that increased conversion by 28%?", "keywords": ["Priya", "Sharma", "Marketllm", "28%"], "reference_answer": "Priya Sharma built a recommendation engine for Marketllm that increased conversion by 28%.", "category": "spanning"}
+{"question": "Which product does the UX Designer who improved user satisfaction by 35% lead design for?", "keywords": ["Sarah", "Williams", "Homellm", "35%"], "reference_answer": "Sarah Williams, who improved user satisfaction by 35%, leads design for Homellm.", "category": "spanning"}
+{"question": "What is the annual marketing budget managed by the Marketing Manager in Austin?", "keywords": ["Lisa", "Anderson", "$2M", "Austin"], "reference_answer": "Lisa Anderson, the Marketing Manager in Austin, manages a $2M annual marketing budget.", "category": "spanning"}
+{"question": "Who signed the contract on behalf of Insurellm for the client that pays $18,000 per month for Carllm?", "keywords": ["Jennifer", "Rodriguez", "DriveSmart", "$18,000"], "reference_answer": "Jennifer Rodriguez, CEO of Insurellm, signed the contract for DriveSmart Insurance which pays $18,000 per month for Carllm.", "category": "spanning"}
+{"question": "What tier did the client with 35 user licenses subscribe to for Bizllm?", "keywords": ["Atlantic", "Risk", "Solutions", "Professional", "35"], "reference_answer": "Atlantic Risk Solutions, which has 35 named user licenses, subscribed to the Professional Tier of Bizllm.", "category": "spanning"}
+{"question": "How many covered members does the client who subscribed to Healthllm Professional Tier for $15,000/month have?", "keywords": ["Harmony", "Health", "Plans", "38,000", "$15,000"], "reference_answer": "Harmony Health Plans, which subscribed to Healthllm Professional Tier for $15,000/month, has 38,000 covered members.", "category": "spanning"}
+{"question": "Who signed the Metropolitan Life Group contract that totals $1,098,000?", "keywords": ["Jennifer", "Rodriguez", "Richard", "Thompson", "$1,098,000"], "reference_answer": "Jennifer Rodriguez signed for Insurellm and Richard Thompson signed for Metropolitan Life Group on the contract totaling $1,098,000.", "category": "spanning"}
+{"question": "What is the total contract value for the client paying $12,000 per month for Bizllm Professional Tier?", "keywords": ["Atlantic", "$144,000", "12 months", "$12,000"], "reference_answer": "Atlantic Risk Solutions pays $12,000/month for 12 months, totaling $144,000 for Bizllm Professional Tier.", "category": "spanning"}
+{"question": "By what percentage did the DevOps Engineer in New York reduce deployment time?", "keywords": ["David", "Kim", "60%", "New York"], "reference_answer": "David Kim, the DevOps Engineer in New York, reduced deployment time by 60% with CI/CD pipelines.", "category": "spanning"}
+{"question": "What is the annual revenue generated by the portfolio managed by the Customer Success Manager in New York?", "keywords": ["Marcus", "Johnson", "$5M", "New York"], "reference_answer": "Marcus Johnson, the Customer Success Manager in New York, manages a portfolio generating $5M in annual revenue.", "category": "spanning"}
+{"question": "By what percentage did the Account Executive in Austin exceed their annual sales target in 2022?", "keywords": ["Emily", "Carter", "30%", "Austin", "2022"], "reference_answer": "Emily Carter, the Account Executive in Austin, exceeded her annual sales target by 30% in 2022.", "category": "spanning"}
+{"question": "When is the telematics-based pricing feature scheduled to launch for the product that costs $1,000/month for Basic Tier?", "keywords": ["Carllm", "Q2 2025", "telematics", "$1,000"], "reference_answer": "Carllm, which costs $1,000/month for Basic Tier, has telematics-based pricing scheduled to launch in Q2 2025.", "category": "spanning"}
+{"question": "What is the monthly cost for the Basic Listing Fee on the marketplace that was Insurellm's first product?", "keywords": ["Markellm", "$199", "first product"], "reference_answer": "Markellm, Insurellm's first product, has a Basic Listing Fee of $199/month.", "category": "spanning"}
+{"question": "How many core values does the company founded by Avery Lancaster have?", "keywords": ["four", "core values", "Avery", "Lancaster", "founded"], "reference_answer": "Insurellm, founded by Avery Lancaster, has four core values: Innovation First, Customer Obsession, Integrity & Transparency, and Collaborative Excellence.", "category": "spanning"}
+{"question": "What tier of Healthllm costs $15,000 per month and includes Advanced Medication Management?", "keywords": ["Professional", "Healthllm", "$15,000", "Advanced Medication Management"], "reference_answer": "The Professional Tier of Healthllm costs $15,000 per month and includes Advanced Medication Management features.", "category": "spanning"}
+{"question": "How many active policies does the client who pays $28,000 per month for Lifellm manage?", "keywords": ["Metropolitan", "Life", "Group", "50,000", "$28,000"], "reference_answer": "Metropolitan Life Group, which pays $28,000 per month for Lifellm, manages 50,000+ active policies.", "category": "spanning"}
+{"question": "What is Alex Chen's current job title?", "keywords": ["Backend Software Engineer", "Alex Chen"], "reference_answer": "Alex Chen is a Backend Software Engineer at Insurellm.", "category": "direct_fact"}
+{"question": "What is Carlos Rodriguez's current salary?", "keywords": ["$125,000", "Carlos"], "reference_answer": "Carlos Rodriguez's current salary is $125,000.", "category": "direct_fact"}
+{"question": "What is the monthly cost of the Bizllm Business Tier?", "keywords": ["$6,000", "Business", "Bizllm"], "reference_answer": "The Bizllm Business Tier costs $6,000 per month.", "category": "direct_fact"}
+{"question": "What is Nina Patel's job title?", "keywords": ["Business Intelligence Analyst", "Nina"], "reference_answer": "Nina Patel is a Business Intelligence Analyst at Insurellm.", "category": "direct_fact"}
+{"question": "What is the contract number for the GreenValley Insurance Homellm agreement?", "keywords": ["HV-2023-0458", "GreenValley"], "reference_answer": "The contract number for the GreenValley Insurance Homellm agreement is HV-2023-0458.", "category": "direct_fact"}
+{"question": "When did Alex Chen join Insurellm?", "keywords": ["April 2020", "Alex Chen"], "reference_answer": "Alex Chen joined Insurellm in April 2020 as a Junior Backend Developer.", "category": "temporal"}
+{"question": "When was the GreenValley Insurance Homellm contract signed?", "keywords": ["October 6, 2023", "GreenValley"], "reference_answer": "The GreenValley Insurance Homellm contract was signed on October 6, 2023.", "category": "temporal"}
+{"question": "When was the Summit Commercial Insurance Bizllm contract effective?", "keywords": ["March 15, 2025", "Summit"], "reference_answer": "The Summit Commercial Insurance Bizllm contract became effective on March 15, 2025.", "category": "temporal"}
+{"question": "When was the Roadway Insurance Carllm contract effective?", "keywords": ["January 1, 2025", "Roadway"], "reference_answer": "The Roadway Insurance Carllm contract became effective on January 1, 2025.", "category": "temporal"}
+{"question": "When was the Evergreen Life Insurance Lifellm contract signed?", "keywords": ["January 20, 2025", "Evergreen"], "reference_answer": "The Evergreen Life Insurance Lifellm contract was signed on January 20, 2025.", "category": "temporal"}
+{"question": "How many employees did Insurellm have at its peak in 2020?", "keywords": ["200", "2020", "peak"], "reference_answer": "Insurellm reached a peak of 200 employees in 2020.", "category": "numerical"}
+{"question": "What is the monthly cost for Apex Reinsurance's Rellm subscription?", "keywords": ["$10,000", "Apex"], "reference_answer": "Apex Reinsurance pays $10,000 per month for the Rellm solution.", "category": "numerical"}
+{"question": "How many active policies does Evergreen Life Insurance currently manage?", "keywords": ["1,400", "Evergreen"], "reference_answer": "Evergreen Life Insurance currently manages 1,400 active policies.", "category": "numerical"}
+{"question": "How many staff members does the onboarding training cover in the Summit Commercial Insurance contract?", "keywords": ["15", "Summit"], "reference_answer": "Summit Commercial Insurance receives onboarding training for up to 15 staff members.", "category": "numerical"}
+{"question": "How much new business did Michael O'Brien generate over the past 3 years?", "keywords": ["$8.2M", "Michael", "O'Brien"], "reference_answer": "Michael O'Brien generated $8.2M in new business over the past 3 years.", "category": "numerical"}
+{"question": "Which product does Rachel Martinez lead product strategy for?", "keywords": ["Carllm", "Rachel", "Martinez"], "reference_answer": "Rachel Martinez leads product strategy for Carllm, the auto insurance portal.", "category": "relationship"}
+{"question": "Which mobile app does Kevin Zhang lead iOS development for?", "keywords": ["Marketllm", "Kevin", "Zhang"], "reference_answer": "Kevin Zhang leads iOS development for the Marketllm consumer mobile app.", "category": "relationship"}
+{"question": "Which tier of Bizllm did Summit Commercial Insurance subscribe to?", "keywords": ["Business", "Summit"], "reference_answer": "Summit Commercial Insurance subscribed to the Business Tier of Bizllm.", "category": "relationship"}
+{"question": "What award did Carlos Rodriguez win in 2023?", "keywords": ["Solutions Engineer of the Year", "Carlos"], "reference_answer": "Carlos Rodriguez won Solutions Engineer of the Year 2023.", "category": "relationship"}
+{"question": "Who signed the Summit Commercial Insurance contract on behalf of Insurellm?", "keywords": ["Michael Torres", "Summit"], "reference_answer": "Michael Torres, Chief Revenue Officer, signed the Summit Commercial Insurance contract on behalf of Insurellm.", "category": "relationship"}
+{"question": "How many employees at Insurellm have a current salary under $80,000?", "keywords": ["salary", "employees"], "reference_answer": "Based on the employee records, there are several employees with salaries under $80,000, including Tyler Brooks ($75,000) and Alex Harper ($75,000).", "category": "holistic"}
+{"question": "What is the total contract value of all Healthllm contracts combined?", "keywords": ["Healthllm", "contract value"], "reference_answer": "The total contract value of all Healthllm contracts is $3,504,000, combining United Healthcare Alliance ($2,784,000), Harmony Health Plans ($360,000), and other Healthllm clients.", "category": "holistic"}
+{"question": "Which product line has the most active contracts according to the company overview?", "keywords": ["contracts", "products"], "reference_answer": "According to the company overview, both Bizllm and Claimllm are tied with the most active contracts at 7 contracts each.", "category": "holistic"}
+{"question": "How many total employees work at Insurellm across all locations?", "keywords": ["employees", "total"], "reference_answer": "Insurellm has 32 employees operating primarily remotely across the US, with offices in San Francisco, New York, Austin, Chicago, and Denver.", "category": "holistic"}
+{"question": "What is the average monthly subscription cost across all Carllm contracts?", "keywords": ["Carllm", "monthly cost"], "reference_answer": "The average monthly subscription cost across Carllm contracts varies, with examples including Roadway Insurance at $2,500/month for Professional Tier, though specific averages require aggregating all Carllm contract data.", "category": "holistic"}
+{"question": "How many different insurance product lines does Insurellm offer in total?", "keywords": ["products", "insurance lines"], "reference_answer": "Insurellm offers 8 different insurance software products: Carllm, Homellm, Lifellm, Healthllm, Bizllm, Markellm, Claimllm, and Rellm.", "category": "holistic"}
+{"question": "Which product has the fewest active contracts according to the portfolio breakdown?", "keywords": ["contracts", "fewest"], "reference_answer": "According to the portfolio breakdown, both Markellm and Rellm are tied with the fewest active contracts at 2 contracts each.", "category": "holistic"}
+{"question": "What is the total number of active contracts across all Insurellm products?", "keywords": ["contracts", "total"], "reference_answer": "Insurellm has 32 active contracts across all product lines, serving clients ranging from regional insurers to national carriers and global reinsurance partners.", "category": "holistic"}
+{"question": "How many states does United Healthcare Alliance operate in according to their contract?", "keywords": ["United Healthcare", "states"], "reference_answer": "United Healthcare Alliance operates across 12 states according to their Healthllm contract, supporting 250,000+ members.", "category": "holistic"}
+{"question": "What is the longest contract duration among all Insurellm contracts?", "keywords": ["contract", "duration"], "reference_answer": "The longest contract duration is 48 months (4 years) for the United Healthcare Alliance Healthllm contract, representing Insurellm's most strategic long-term healthcare partnership.", "category": "holistic"}