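Fixed a bug identified by student Jon R (TextLoader now autodetects file encoding when loading the knowledge base) - thank you! Also split product paths on os.sep instead of '/'.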

This commit is contained in:
Edward Donner
2024-10-04 19:37:06 -04:00
parent f6bcc58585
commit 14f2667194
6 changed files with 1540 additions and 30 deletions

View File

@@ -97,7 +97,7 @@
"products = glob.glob(\"knowledge-base/products/*\")\n",
"\n",
"for product in products:\n",
" name = product.split('/')[-1][:-3]\n",
" name = product.split(os.sep)[-1][:-3]\n",
" doc = \"\"\n",
" with open(product, \"r\") as f:\n",
" doc = f.read()\n",

View File

@@ -80,10 +80,13 @@
"\n",
"folders = glob.glob(\"knowledge-base/*\")\n",
"\n",
"# With thanks to Jon R, a student on the course, for this fix needed for some users \n",
"text_loader_kwargs={'autodetect_encoding': True}\n",
"\n",
"documents = []\n",
"for folder in folders:\n",
" doc_type = os.path.basename(folder)\n",
" loader = DirectoryLoader(folder, glob=\"**/*.md\", loader_cls=TextLoader)\n",
" loader = DirectoryLoader(folder, glob=\"**/*.md\", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)\n",
" folder_docs = loader.load()\n",
" for doc in folder_docs:\n",
" doc.metadata[\"doc_type\"] = doc_type\n",

View File

@@ -86,10 +86,13 @@
"\n",
"folders = glob.glob(\"knowledge-base/*\")\n",
"\n",
"# With thanks to Jon R, a student on the course, for this fix needed for some users \n",
"text_loader_kwargs={'autodetect_encoding': True}\n",
"\n",
"documents = []\n",
"for folder in folders:\n",
" doc_type = os.path.basename(folder)\n",
" loader = DirectoryLoader(folder, glob=\"**/*.md\", loader_cls=TextLoader)\n",
" loader = DirectoryLoader(folder, glob=\"**/*.md\", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)\n",
" folder_docs = loader.load()\n",
" for doc in folder_docs:\n",
" doc.metadata[\"doc_type\"] = doc_type\n",

View File

@@ -87,10 +87,13 @@
"\n",
"folders = glob.glob(\"knowledge-base/*\")\n",
"\n",
"# With thanks to Jon R, a student on the course, for this fix needed for some users \n",
"text_loader_kwargs={'autodetect_encoding': True}\n",
"\n",
"documents = []\n",
"for folder in folders:\n",
" doc_type = os.path.basename(folder)\n",
" loader = DirectoryLoader(folder, glob=\"**/*.md\", loader_cls=TextLoader)\n",
" loader = DirectoryLoader(folder, glob=\"**/*.md\", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)\n",
" folder_docs = loader.load()\n",
" for doc in folder_docs:\n",
" doc.metadata[\"doc_type\"] = doc_type\n",

View File

@@ -16,7 +16,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "ba2779af-84ef-4227-9e9e-6eaf0df87e77",
"metadata": {},
"outputs": [],
@@ -31,7 +31,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"id": "802137aa-8a74-45e0-a487-d1974927d7ca",
"metadata": {},
"outputs": [],
@@ -52,7 +52,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"id": "58c85082-e417-4708-9efe-81a5d55d1424",
"metadata": {},
"outputs": [],
@@ -65,7 +65,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"id": "ee78efcb-60fe-449e-a944-40bab26261af",
"metadata": {},
"outputs": [],
@@ -78,7 +78,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"id": "730711a9-6ffe-4eee-8f48-d6cfb7314905",
"metadata": {},
"outputs": [],
@@ -88,10 +88,13 @@
"\n",
"folders = glob.glob(\"knowledge-base/*\")\n",
"\n",
"# With thanks to Jon R, a student on the course, for this fix needed for some users \n",
"text_loader_kwargs={'autodetect_encoding': True}\n",
"\n",
"documents = []\n",
"for folder in folders:\n",
" doc_type = os.path.basename(folder)\n",
" loader = DirectoryLoader(folder, glob=\"**/*.md\", loader_cls=TextLoader)\n",
" loader = DirectoryLoader(folder, glob=\"**/*.md\", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)\n",
" folder_docs = loader.load()\n",
" for doc in folder_docs:\n",
" doc.metadata[\"doc_type\"] = doc_type\n",
@@ -100,10 +103,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"id": "7310c9c8-03c1-4efc-a104-5e89aec6db1a",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Created a chunk of size 1088, which is longer than the specified 1000\n"
]
}
],
"source": [
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n",
"chunks = text_splitter.split_documents(documents)"
@@ -111,10 +122,21 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"id": "cd06e02f-6d9b-44cc-a43d-e1faa8acc7bb",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"123"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(chunks)"
]

File diff suppressed because one or more lines are too long