237 lines
5.6 KiB
Plaintext
237 lines
5.6 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "be0d1141",
|
|
"metadata": {},
|
|
"source": [
|
|
"# RAG Day 4\n",
|
|
"\n",
|
|
"## Evaluation!\n",
|
|
"\n",
|
|
"<table style=\"margin: 0; text-align: left;\">\n",
|
|
" <tr>\n",
|
|
" <td style=\"width: 150px; height: 150px; vertical-align: middle;\">\n",
|
|
" <img src=\"../assets/business.jpg\" width=\"150\" height=\"150\" style=\"display: block;\" />\n",
|
|
" </td>\n",
|
|
" <td>\n",
|
|
" <h2 style=\"color:#181;\">Keep in mind how you would evaluate RAG for your business</h2>\n",
|
|
" <span style=\"color:#181;\">Evaluation is one of the most important parts of building an accurate and reliable RAG pipeline, and it applies to many other aspects of solving business problems with LLMs. People often focus on the RAG architecture and RAG frameworks for their business, but evaluations matter even more!</span>\n",
|
|
" </td>\n",
|
|
" </tr>\n",
|
|
"</table>"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "60995166",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from evaluation import test"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "2bbcfcf3",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"tests = test.load_tests()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "fd4d88a3",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"150"
|
|
]
|
|
},
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"len(tests)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"id": "c65fd09a",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Who won the prestigious IIOTY award in 2023?\n",
|
|
"direct_fact\n",
|
|
"Maxine Thompson won the prestigious Insurellm Innovator of the Year (IIOTY) award in 2023.\n",
|
|
"['Maxine', 'Thompson', 'IIOTY']\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"example = tests[0]\n",
|
|
"print(example.question)\n",
|
|
"print(example.category)\n",
|
|
"print(example.reference_answer)\n",
|
|
"print(example.keywords)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "a7f058ff",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"Counter({'direct_fact': 70,\n",
|
|
" 'temporal': 20,\n",
|
|
" 'spanning': 20,\n",
|
|
" 'comparative': 10,\n",
|
|
" 'numerical': 10,\n",
|
|
" 'relationship': 10,\n",
|
|
" 'holistic': 10})"
|
|
]
|
|
},
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"from collections import Counter\n",
|
|
"count = Counter([t.category for t in tests])\n",
|
|
"count"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"id": "b413b186",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from evaluation.eval import evaluate_retrieval, evaluate_answer"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"id": "daca435e",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"RetrievalEval(mrr=0.16666666666666666, ndcg=0.28711770538226206, keywords_found=2, total_keywords=3, keyword_coverage=66.66666666666666)"
|
|
]
|
|
},
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"evaluate_retrieval(example)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"id": "925b37d2",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"eval, answer, chunks = evaluate_answer(example)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"id": "01965312",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"AnswerEval(feedback=\"The answer correctly identifies Maxine as the winner and mentions the IIOTY award in 2023, but it omits Thompson's full name, which is present in the reference. This affects completeness. The relevance is high, as it directly addresses the question about the award winner.\", accuracy=5.0, completeness=4.0, relevance=5.0)"
|
|
]
|
|
},
|
|
"execution_count": 12,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"eval"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"id": "4cd34561",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"The answer correctly identifies Maxine as the winner and mentions the IIOTY award in 2023, but it omits Thompson's full name, which is present in the reference. This affects completeness. The relevance is high, as it directly addresses the question about the award winner.\n",
|
|
"5.0\n",
|
|
"4.0\n",
|
|
"5.0\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"print(eval.feedback)\n",
|
|
"print(eval.accuracy)\n",
|
|
"print(eval.completeness)\n",
|
|
"print(eval.relevance)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "6f5e0cd4",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": ".venv",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.12.9"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|