Additional resources and tips
This commit is contained in:
@@ -14,7 +14,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": null,
|
||||
"id": "ba2779af-84ef-4227-9e9e-6eaf0df87e77",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -29,7 +29,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": null,
|
||||
"id": "802137aa-8a74-45e0-a487-d1974927d7ca",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -51,7 +51,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": null,
|
||||
"id": "58c85082-e417-4708-9efe-81a5d55d1424",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -64,7 +64,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": null,
|
||||
"id": "ee78efcb-60fe-449e-a944-40bab26261af",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -77,7 +77,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": null,
|
||||
"id": "730711a9-6ffe-4eee-8f48-d6cfb7314905",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -104,18 +104,10 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": null,
|
||||
"id": "7310c9c8-03c1-4efc-a104-5e89aec6db1a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Created a chunk of size 1088, which is longer than the specified 1000\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n",
|
||||
"chunks = text_splitter.split_documents(documents)"
|
||||
@@ -123,39 +115,20 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": null,
|
||||
"id": "cd06e02f-6d9b-44cc-a43d-e1faa8acc7bb",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"123"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"len(chunks)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": null,
|
||||
"id": "2c54b4b6-06da-463d-bee7-4dd456c2b887",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Document types found: company, employees, contracts, products\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"doc_types = set(chunk.metadata['doc_type'] for chunk in chunks)\n",
|
||||
"print(f\"Document types found: {', '.join(doc_types)}\")"
|
||||
@@ -184,18 +157,10 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": null,
|
||||
"id": "78998399-ac17-4e28-b15f-0b5f51e6ee23",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"There are 123 vectors with 1,536 dimensions in the vector store\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Put the chunks of data into a Vector Store that associates a Vector Embedding with each chunk\n",
|
||||
"# Chroma is a popular open source Vector Database based on SQLite\n",
|
||||
|
||||
Reference in New Issue
Block a user