Additional resources and tips

This commit is contained in:
Edward Donner
2025-02-15 08:31:06 -05:00
parent 289a1b9bd1
commit ec4ada6456
14 changed files with 133 additions and 81 deletions

View File

@@ -14,7 +14,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"id": "ba2779af-84ef-4227-9e9e-6eaf0df87e77",
"metadata": {},
"outputs": [],
@@ -29,7 +29,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"id": "802137aa-8a74-45e0-a487-d1974927d7ca",
"metadata": {},
"outputs": [],
@@ -51,7 +51,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"id": "58c85082-e417-4708-9efe-81a5d55d1424",
"metadata": {},
"outputs": [],
@@ -64,7 +64,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"id": "ee78efcb-60fe-449e-a944-40bab26261af",
"metadata": {},
"outputs": [],
@@ -77,7 +77,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"id": "730711a9-6ffe-4eee-8f48-d6cfb7314905",
"metadata": {},
"outputs": [],
@@ -104,18 +104,10 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"id": "7310c9c8-03c1-4efc-a104-5e89aec6db1a",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Created a chunk of size 1088, which is longer than the specified 1000\n"
]
}
],
"outputs": [],
"source": [
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n",
"chunks = text_splitter.split_documents(documents)"
@@ -123,39 +115,20 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"id": "cd06e02f-6d9b-44cc-a43d-e1faa8acc7bb",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"123"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"len(chunks)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"id": "2c54b4b6-06da-463d-bee7-4dd456c2b887",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Document types found: company, employees, contracts, products\n"
]
}
],
"outputs": [],
"source": [
"doc_types = set(chunk.metadata['doc_type'] for chunk in chunks)\n",
"print(f\"Document types found: {', '.join(doc_types)}\")"
@@ -184,18 +157,10 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"id": "78998399-ac17-4e28-b15f-0b5f51e6ee23",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"There are 123 vectors with 1,536 dimensions in the vector store\n"
]
}
],
"outputs": [],
"source": [
"# Put the chunks of data into a Vector Store that associates a Vector Embedding with each chunk\n",
"# Chroma is a popular open source Vector Database based on SQLLite\n",