Fixed a bug identified by student Jon R - thank you!
This commit is contained in:
@@ -97,7 +97,7 @@
|
|||||||
"products = glob.glob(\"knowledge-base/products/*\")\n",
|
"products = glob.glob(\"knowledge-base/products/*\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"for product in products:\n",
|
"for product in products:\n",
|
||||||
" name = product.split('/')[-1][:-3]\n",
|
" name = product.split(os.sep)[-1][:-3]\n",
|
||||||
" doc = \"\"\n",
|
" doc = \"\"\n",
|
||||||
" with open(product, \"r\") as f:\n",
|
" with open(product, \"r\") as f:\n",
|
||||||
" doc = f.read()\n",
|
" doc = f.read()\n",
|
||||||
|
|||||||
@@ -80,10 +80,13 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"folders = glob.glob(\"knowledge-base/*\")\n",
|
"folders = glob.glob(\"knowledge-base/*\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
"# With thanks to Jon R, a student on the course, for this fix needed for some users \n",
|
||||||
|
"text_loader_kwargs={'autodetect_encoding': True}\n",
|
||||||
|
"\n",
|
||||||
"documents = []\n",
|
"documents = []\n",
|
||||||
"for folder in folders:\n",
|
"for folder in folders:\n",
|
||||||
" doc_type = os.path.basename(folder)\n",
|
" doc_type = os.path.basename(folder)\n",
|
||||||
" loader = DirectoryLoader(folder, glob=\"**/*.md\", loader_cls=TextLoader)\n",
|
" loader = DirectoryLoader(folder, glob=\"**/*.md\", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)\n",
|
||||||
" folder_docs = loader.load()\n",
|
" folder_docs = loader.load()\n",
|
||||||
" for doc in folder_docs:\n",
|
" for doc in folder_docs:\n",
|
||||||
" doc.metadata[\"doc_type\"] = doc_type\n",
|
" doc.metadata[\"doc_type\"] = doc_type\n",
|
||||||
|
|||||||
@@ -86,10 +86,13 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"folders = glob.glob(\"knowledge-base/*\")\n",
|
"folders = glob.glob(\"knowledge-base/*\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
"# With thanks to Jon R, a student on the course, for this fix needed for some users \n",
|
||||||
|
"text_loader_kwargs={'autodetect_encoding': True}\n",
|
||||||
|
"\n",
|
||||||
"documents = []\n",
|
"documents = []\n",
|
||||||
"for folder in folders:\n",
|
"for folder in folders:\n",
|
||||||
" doc_type = os.path.basename(folder)\n",
|
" doc_type = os.path.basename(folder)\n",
|
||||||
" loader = DirectoryLoader(folder, glob=\"**/*.md\", loader_cls=TextLoader)\n",
|
" loader = DirectoryLoader(folder, glob=\"**/*.md\", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)\n",
|
||||||
" folder_docs = loader.load()\n",
|
" folder_docs = loader.load()\n",
|
||||||
" for doc in folder_docs:\n",
|
" for doc in folder_docs:\n",
|
||||||
" doc.metadata[\"doc_type\"] = doc_type\n",
|
" doc.metadata[\"doc_type\"] = doc_type\n",
|
||||||
|
|||||||
@@ -87,10 +87,13 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"folders = glob.glob(\"knowledge-base/*\")\n",
|
"folders = glob.glob(\"knowledge-base/*\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
"# With thanks to Jon R, a student on the course, for this fix needed for some users \n",
|
||||||
|
"text_loader_kwargs={'autodetect_encoding': True}\n",
|
||||||
|
"\n",
|
||||||
"documents = []\n",
|
"documents = []\n",
|
||||||
"for folder in folders:\n",
|
"for folder in folders:\n",
|
||||||
" doc_type = os.path.basename(folder)\n",
|
" doc_type = os.path.basename(folder)\n",
|
||||||
" loader = DirectoryLoader(folder, glob=\"**/*.md\", loader_cls=TextLoader)\n",
|
" loader = DirectoryLoader(folder, glob=\"**/*.md\", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)\n",
|
||||||
" folder_docs = loader.load()\n",
|
" folder_docs = loader.load()\n",
|
||||||
" for doc in folder_docs:\n",
|
" for doc in folder_docs:\n",
|
||||||
" doc.metadata[\"doc_type\"] = doc_type\n",
|
" doc.metadata[\"doc_type\"] = doc_type\n",
|
||||||
|
|||||||
@@ -16,7 +16,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 1,
|
||||||
"id": "ba2779af-84ef-4227-9e9e-6eaf0df87e77",
|
"id": "ba2779af-84ef-4227-9e9e-6eaf0df87e77",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -31,7 +31,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 2,
|
||||||
"id": "802137aa-8a74-45e0-a487-d1974927d7ca",
|
"id": "802137aa-8a74-45e0-a487-d1974927d7ca",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -52,7 +52,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 3,
|
||||||
"id": "58c85082-e417-4708-9efe-81a5d55d1424",
|
"id": "58c85082-e417-4708-9efe-81a5d55d1424",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -65,7 +65,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 4,
|
||||||
"id": "ee78efcb-60fe-449e-a944-40bab26261af",
|
"id": "ee78efcb-60fe-449e-a944-40bab26261af",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -78,7 +78,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 5,
|
||||||
"id": "730711a9-6ffe-4eee-8f48-d6cfb7314905",
|
"id": "730711a9-6ffe-4eee-8f48-d6cfb7314905",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -88,10 +88,13 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"folders = glob.glob(\"knowledge-base/*\")\n",
|
"folders = glob.glob(\"knowledge-base/*\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
"# With thanks to Jon R, a student on the course, for this fix needed for some users \n",
|
||||||
|
"text_loader_kwargs={'autodetect_encoding': True}\n",
|
||||||
|
"\n",
|
||||||
"documents = []\n",
|
"documents = []\n",
|
||||||
"for folder in folders:\n",
|
"for folder in folders:\n",
|
||||||
" doc_type = os.path.basename(folder)\n",
|
" doc_type = os.path.basename(folder)\n",
|
||||||
" loader = DirectoryLoader(folder, glob=\"**/*.md\", loader_cls=TextLoader)\n",
|
" loader = DirectoryLoader(folder, glob=\"**/*.md\", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)\n",
|
||||||
" folder_docs = loader.load()\n",
|
" folder_docs = loader.load()\n",
|
||||||
" for doc in folder_docs:\n",
|
" for doc in folder_docs:\n",
|
||||||
" doc.metadata[\"doc_type\"] = doc_type\n",
|
" doc.metadata[\"doc_type\"] = doc_type\n",
|
||||||
@@ -100,10 +103,18 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 6,
|
||||||
"id": "7310c9c8-03c1-4efc-a104-5e89aec6db1a",
|
"id": "7310c9c8-03c1-4efc-a104-5e89aec6db1a",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Created a chunk of size 1088, which is longer than the specified 1000\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n",
|
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n",
|
||||||
"chunks = text_splitter.split_documents(documents)"
|
"chunks = text_splitter.split_documents(documents)"
|
||||||
@@ -111,10 +122,21 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 7,
|
||||||
"id": "cd06e02f-6d9b-44cc-a43d-e1faa8acc7bb",
|
"id": "cd06e02f-6d9b-44cc-a43d-e1faa8acc7bb",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"123"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"len(chunks)"
|
"len(chunks)"
|
||||||
]
|
]
|
||||||
|
|||||||
1511
week5/day5.ipynb
1511
week5/day5.ipynb
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user