Fixed a bug identified by student Jon R - thank you!
This commit is contained in:
@@ -97,7 +97,7 @@
|
||||
"products = glob.glob(\"knowledge-base/products/*\")\n",
|
||||
"\n",
|
||||
"for product in products:\n",
|
||||
" name = product.split('/')[-1][:-3]\n",
|
||||
" name = product.split(os.sep)[-1][:-3]\n",
|
||||
" doc = \"\"\n",
|
||||
" with open(product, \"r\") as f:\n",
|
||||
" doc = f.read()\n",
|
||||
|
||||
@@ -80,10 +80,13 @@
|
||||
"\n",
|
||||
"folders = glob.glob(\"knowledge-base/*\")\n",
|
||||
"\n",
|
||||
"# With thanks to Jon R, a student on the course, for this fix needed for some users \n",
|
||||
"text_loader_kwargs={'autodetect_encoding': True}\n",
|
||||
"\n",
|
||||
"documents = []\n",
|
||||
"for folder in folders:\n",
|
||||
" doc_type = os.path.basename(folder)\n",
|
||||
" loader = DirectoryLoader(folder, glob=\"**/*.md\", loader_cls=TextLoader)\n",
|
||||
" loader = DirectoryLoader(folder, glob=\"**/*.md\", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)\n",
|
||||
" folder_docs = loader.load()\n",
|
||||
" for doc in folder_docs:\n",
|
||||
" doc.metadata[\"doc_type\"] = doc_type\n",
|
||||
|
||||
@@ -86,10 +86,13 @@
|
||||
"\n",
|
||||
"folders = glob.glob(\"knowledge-base/*\")\n",
|
||||
"\n",
|
||||
"# With thanks to Jon R, a student on the course, for this fix needed for some users \n",
|
||||
"text_loader_kwargs={'autodetect_encoding': True}\n",
|
||||
"\n",
|
||||
"documents = []\n",
|
||||
"for folder in folders:\n",
|
||||
" doc_type = os.path.basename(folder)\n",
|
||||
" loader = DirectoryLoader(folder, glob=\"**/*.md\", loader_cls=TextLoader)\n",
|
||||
" loader = DirectoryLoader(folder, glob=\"**/*.md\", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)\n",
|
||||
" folder_docs = loader.load()\n",
|
||||
" for doc in folder_docs:\n",
|
||||
" doc.metadata[\"doc_type\"] = doc_type\n",
|
||||
|
||||
@@ -87,10 +87,13 @@
|
||||
"\n",
|
||||
"folders = glob.glob(\"knowledge-base/*\")\n",
|
||||
"\n",
|
||||
"# With thanks to Jon R, a student on the course, for this fix needed for some users \n",
|
||||
"text_loader_kwargs={'autodetect_encoding': True}\n",
|
||||
"\n",
|
||||
"documents = []\n",
|
||||
"for folder in folders:\n",
|
||||
" doc_type = os.path.basename(folder)\n",
|
||||
" loader = DirectoryLoader(folder, glob=\"**/*.md\", loader_cls=TextLoader)\n",
|
||||
" loader = DirectoryLoader(folder, glob=\"**/*.md\", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)\n",
|
||||
" folder_docs = loader.load()\n",
|
||||
" for doc in folder_docs:\n",
|
||||
" doc.metadata[\"doc_type\"] = doc_type\n",
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 1,
|
||||
"id": "ba2779af-84ef-4227-9e9e-6eaf0df87e77",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -31,7 +31,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 2,
|
||||
"id": "802137aa-8a74-45e0-a487-d1974927d7ca",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -52,7 +52,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 3,
|
||||
"id": "58c85082-e417-4708-9efe-81a5d55d1424",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -65,7 +65,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 4,
|
||||
"id": "ee78efcb-60fe-449e-a944-40bab26261af",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -78,7 +78,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 5,
|
||||
"id": "730711a9-6ffe-4eee-8f48-d6cfb7314905",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -88,10 +88,13 @@
|
||||
"\n",
|
||||
"folders = glob.glob(\"knowledge-base/*\")\n",
|
||||
"\n",
|
||||
"# With thanks to Jon R, a student on the course, for this fix needed for some users \n",
|
||||
"text_loader_kwargs={'autodetect_encoding': True}\n",
|
||||
"\n",
|
||||
"documents = []\n",
|
||||
"for folder in folders:\n",
|
||||
" doc_type = os.path.basename(folder)\n",
|
||||
" loader = DirectoryLoader(folder, glob=\"**/*.md\", loader_cls=TextLoader)\n",
|
||||
" loader = DirectoryLoader(folder, glob=\"**/*.md\", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)\n",
|
||||
" folder_docs = loader.load()\n",
|
||||
" for doc in folder_docs:\n",
|
||||
" doc.metadata[\"doc_type\"] = doc_type\n",
|
||||
@@ -100,10 +103,18 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 6,
|
||||
"id": "7310c9c8-03c1-4efc-a104-5e89aec6db1a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Created a chunk of size 1088, which is longer than the specified 1000\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n",
|
||||
"chunks = text_splitter.split_documents(documents)"
|
||||
@@ -111,10 +122,21 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 7,
|
||||
"id": "cd06e02f-6d9b-44cc-a43d-e1faa8acc7bb",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"123"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"len(chunks)"
|
||||
]
|
||||
|
||||
1511
week5/day5.ipynb
1511
week5/day5.ipynb
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user