fix: chunk text func

imp: add language params to get_transcript
2025-01-03 18:12:14 +07:00
parent 8a9d94eb41
commit 337083828c
1 changed files with 56 additions and 22 deletions
--- a/week1/community-contributions/day1-youtube-video-summarization.ipynb
+++ b/week1/community-contributions/day1-youtube-video-summarization.ipynb
@@ -28,7 +28,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 111,
+   "execution_count": null,
   "id": "a082ddaf-abf5-4e6c-8112-74846c768301",
   "metadata": {},
   "outputs": [],
@@ -74,7 +74,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 113,
+   "execution_count": null,
   "id": "019974d9-f3ad-4a8a-b5f9-0a3719aea2d3",
   "metadata": {},
   "outputs": [],
@@ -84,7 +84,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 114,
+   "execution_count": null,
   "id": "c5e793b2-6775-426a-a139-4848291d0463",
   "metadata": {},
   "outputs": [],
@@ -120,26 +120,28 @@
   "outputs": [],
   "source": [
    "# Example usage\n",
-    "video_url = \"https://www.youtube.com/watch?v=5zuF4Ys1eAw\"\n",
+    "video_url = \"https://www.youtube.com/watch?v=kqaMIFEz15s\"\n",
    "\n",
    "yt_video = YoutubeVideoID(video_url)\n",
-    "print(yt_video)  # Output: Video ID: cicHKo4zH-w"
+    "print(yt_video)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 116,
+   "execution_count": null,
   "id": "f724be3c-bdeb-4079-b4be-f12608144484",
   "metadata": {},
   "outputs": [],
   "source": [
-    "def get_transcript(video_id):\n",
+    "def get_transcript(video_id, language='en'):\n",
    "    try:\n",
-    "        transcript = YouTubeTranscriptApi.get_transcript(video_id)\n",
+    "        # Try to get the transcript in the requested language (English, 'en', by default)\n",
+    "        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[language])\n",
+    "        # Join all the 'text' fields into a single string\n",
    "        return \" \".join([item['text'] for item in transcript])\n",
    "    except Exception as e:\n",
    "        print(f\"Error fetching transcript: {e}\")\n",
-    "        return None"
+    "        return None\n"
   ]
  },
  {
@@ -156,7 +158,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 118,
+   "execution_count": null,
   "id": "0a0750be-88a1-4e65-9cb8-a0a2f11eecdf",
   "metadata": {},
   "outputs": [],
@@ -164,10 +166,18 @@
    "# Function to summarize text using ChatGPT\n",
    "def summarize_text(text):\n",
    "    try:\n",
+    "        system_prompts = \"\"\"\n",
+    "        You are a helpful assistant who provides concise and accurate summaries of text. Your task is to:\n",
+    "        \n",
+    "        - Capture the key points of the content.\n",
+    "        - Keep the summary brief and easy to understand.\n",
+    "        - Avoid summarizing overly lengthy texts or breaking them into excessively short summaries.\n",
+    "        - Use bullet points where appropriate to enhance clarity and structure.\n",
+    "        \"\"\"\n",
    "        response = openai.chat.completions.create(\n",
    "            model=\"gpt-4o-mini\",\n",
    "            messages=[\n",
-    "                {\"role\": \"system\", \"content\": \"You are a helpful assistant that summarizes text.\"},\n",
+    "                {\"role\": \"system\", \"content\": system_prompts},\n",
    "                {\"role\": \"user\", \"content\": f\"Summarize the following text:\\n{text}\"}\n",
    "            ],\n",
    "            max_tokens=200\n",
@@ -186,36 +196,60 @@
   "outputs": [],
   "source": [
    "def split_text(text, chunk_size=3000):\n",
-    "    \"\"\"Splits large text into smaller chunks based on the given chunk size.\"\"\"\n",
+    "    \"\"\"\n",
+    "    Splits large text into smaller chunks based on the given chunk size.\n",
+    "    Ensures that chunks end with a full stop where possible to maintain sentence integrity.\n",
+    "    \n",
+    "    :param text: str, the text to be split\n",
+    "    :param chunk_size: int, maximum size of each chunk (default 3000 characters)\n",
+    "    :return: list of str, where each str is a chunk of text\n",
+    "    \"\"\"\n",
    "    chunks = []\n",
    "    while len(text) > chunk_size:\n",
-    "        # Find the last full stop within the chunk size to avoid cutting sentences\n",
-    "        split_point = text.rfind('.', 0, chunk_size)\n",
-    "        \n",
-    "        # If no full stop is found, just split at the chunk size\n",
-    "        if split_point == -1:\n",
+    "        # Find the last full stop within or at the chunk size\n",
+    "        split_point = text.rfind('.', 0, chunk_size + 1)  # +1 widens the search to index chunk_size; NOTE: a period found exactly there is handled as 'no period' below\n",
+    "        if split_point == -1:  # No period found within the chunk size\n",
    "            split_point = chunk_size\n",
    "        \n",
-    "        chunks.append(text[:split_point].strip())\n",
-    "        text = text[split_point:].strip()\n",
+    "        # Append the chunk, ensuring we don't strip spaces that might be part of the sentence structure\n",
+    "        chunks.append(text[:split_point + 1] if split_point != chunk_size else text[:chunk_size])\n",
+    "        text = text[split_point + 1:] if split_point != chunk_size else text[chunk_size:]\n",
    "    \n",
-    "    # Add the remaining text as the final chunk\n",
+    "    # Add the remaining text as the final chunk, only strip if there's content\n",
    "    if text:\n",
-    "        chunks.append(text)\n",
+    "        chunks.append(text.strip())\n",
    "    \n",
    "    return chunks\n",
    "\n",
    "transcript_chunks = split_text(transcript_text)\n",
+    "\n",
    "# Now you can summarize each chunk individually\n",
    "summaries = []\n",
    "for chunk in transcript_chunks:\n",
    "    summary = summarize_text(chunk)\n",
    "    summaries.append(summary)\n",
-    "    \n",
+    "\n",
+    "\n",
    "# Combine the individual summaries into one\n",
    "full_summary = \" \".join(summaries)\n",
    "display(Markdown(full_summary))\n"
   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6b266fdc-da31-4d79-8982-be77f03be59f",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "792c814d-73f8-4c1e-a0bb-b654b40e4d8b",
+   "metadata": {},
+   "outputs": [],
+   "source": []
  }
 ],
 "metadata": {