fix: chunk text func
imp: add language params to get_transcript
This commit is contained in:
@@ -28,7 +28,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 111,
|
"execution_count": null,
|
||||||
"id": "a082ddaf-abf5-4e6c-8112-74846c768301",
|
"id": "a082ddaf-abf5-4e6c-8112-74846c768301",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -74,7 +74,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 113,
|
"execution_count": null,
|
||||||
"id": "019974d9-f3ad-4a8a-b5f9-0a3719aea2d3",
|
"id": "019974d9-f3ad-4a8a-b5f9-0a3719aea2d3",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -84,7 +84,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 114,
|
"execution_count": null,
|
||||||
"id": "c5e793b2-6775-426a-a139-4848291d0463",
|
"id": "c5e793b2-6775-426a-a139-4848291d0463",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -120,26 +120,28 @@
|
|||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Example usage\n",
|
"# Example usage\n",
|
||||||
"video_url = \"https://www.youtube.com/watch?v=5zuF4Ys1eAw\"\n",
|
"video_url = \"https://www.youtube.com/watch?v=kqaMIFEz15s\"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"yt_video = YoutubeVideoID(video_url)\n",
|
"yt_video = YoutubeVideoID(video_url)\n",
|
||||||
"print(yt_video) # Output: Video ID: cicHKo4zH-w"
|
"print(yt_video)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 116,
|
"execution_count": null,
|
||||||
"id": "f724be3c-bdeb-4079-b4be-f12608144484",
|
"id": "f724be3c-bdeb-4079-b4be-f12608144484",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def get_transcript(video_id):\n",
|
"def get_transcript(video_id, language='en'):\n",
|
||||||
" try:\n",
|
" try:\n",
|
||||||
" transcript = YouTubeTranscriptApi.get_transcript(video_id)\n",
|
" # Try to get the transcript in the requested language (English, 'en', by default)\n",
|
||||||
|
" transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[language])\n",
|
||||||
|
" # Join all the 'text' fields into a single string\n",
|
||||||
" return \" \".join([item['text'] for item in transcript])\n",
|
" return \" \".join([item['text'] for item in transcript])\n",
|
||||||
" except Exception as e:\n",
|
" except Exception as e:\n",
|
||||||
" print(f\"Error fetching transcript: {e}\")\n",
|
" print(f\"Error fetching transcript: {e}\")\n",
|
||||||
" return None"
|
" return None\n"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -156,7 +158,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 118,
|
"execution_count": null,
|
||||||
"id": "0a0750be-88a1-4e65-9cb8-a0a2f11eecdf",
|
"id": "0a0750be-88a1-4e65-9cb8-a0a2f11eecdf",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -164,10 +166,18 @@
|
|||||||
"# Function to summarize text using ChatGPT\n",
|
"# Function to summarize text using ChatGPT\n",
|
||||||
"def summarize_text(text):\n",
|
"def summarize_text(text):\n",
|
||||||
" try:\n",
|
" try:\n",
|
||||||
|
" system_prompts = \"\"\"\n",
|
||||||
|
" You are a helpful assistant who provides concise and accurate summaries of text. Your task is to:\n",
|
||||||
|
" \n",
|
||||||
|
" - Capture the key points of the content.\n",
|
||||||
|
" - Keep the summary brief and easy to understand.\n",
|
||||||
|
" - Avoid summarizing overly lengthy texts or breaking them into excessively short summaries.\n",
|
||||||
|
" - Use bullet points where appropriate to enhance clarity and structure.\n",
|
||||||
|
" \"\"\"\n",
|
||||||
" response = openai.chat.completions.create(\n",
|
" response = openai.chat.completions.create(\n",
|
||||||
" model=\"gpt-4o-mini\",\n",
|
" model=\"gpt-4o-mini\",\n",
|
||||||
" messages=[\n",
|
" messages=[\n",
|
||||||
" {\"role\": \"system\", \"content\": \"You are a helpful assistant that summarizes text.\"},\n",
|
" {\"role\": \"system\", \"content\": system_prompts},\n",
|
||||||
" {\"role\": \"user\", \"content\": f\"Summarize the following text:\\n{text}\"}\n",
|
" {\"role\": \"user\", \"content\": f\"Summarize the following text:\\n{text}\"}\n",
|
||||||
" ],\n",
|
" ],\n",
|
||||||
" max_tokens=200\n",
|
" max_tokens=200\n",
|
||||||
@@ -186,36 +196,60 @@
|
|||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def split_text(text, chunk_size=3000):\n",
|
"def split_text(text, chunk_size=3000):\n",
|
||||||
" \"\"\"Splits large text into smaller chunks based on the given chunk size.\"\"\"\n",
|
" \"\"\"\n",
|
||||||
|
" Splits large text into smaller chunks based on the given chunk size.\n",
|
||||||
|
" Ensures that chunks end with a full stop where possible to maintain sentence integrity.\n",
|
||||||
|
" \n",
|
||||||
|
" :param text: str, the text to be split\n",
|
||||||
|
" :param chunk_size: int, maximum size of each chunk (default 3000 characters)\n",
|
||||||
|
" :return: list of str, where each str is a chunk of text\n",
|
||||||
|
" \"\"\"\n",
|
||||||
" chunks = []\n",
|
" chunks = []\n",
|
||||||
" while len(text) > chunk_size:\n",
|
" while len(text) > chunk_size:\n",
|
||||||
" # Find the last full stop within the chunk size to avoid cutting sentences\n",
|
" # Find the last full stop within or at the chunk size\n",
|
||||||
" split_point = text.rfind('.', 0, chunk_size)\n",
|
" split_point = text.rfind('.', 0, chunk_size + 1) # +1 to include the period itself if it's at chunk_size\n",
|
||||||
" \n",
|
" if split_point == -1: # No period found within the chunk size\n",
|
||||||
" # If no full stop is found, just split at the chunk size\n",
|
|
||||||
" if split_point == -1:\n",
|
|
||||||
" split_point = chunk_size\n",
|
" split_point = chunk_size\n",
|
||||||
" \n",
|
" \n",
|
||||||
" chunks.append(text[:split_point].strip())\n",
|
" # Append the chunk, ensuring we don't strip spaces that might be part of the sentence structure\n",
|
||||||
" text = text[split_point:].strip()\n",
|
" chunks.append(text[:split_point + 1] if split_point != chunk_size else text[:chunk_size])\n",
|
||||||
|
" text = text[split_point + 1:] if split_point != chunk_size else text[chunk_size:]\n",
|
||||||
" \n",
|
" \n",
|
||||||
" # Add the remaining text as the final chunk\n",
|
" # Add the remaining text as the final chunk, only strip if there's content\n",
|
||||||
" if text:\n",
|
" if text:\n",
|
||||||
" chunks.append(text)\n",
|
" chunks.append(text.strip())\n",
|
||||||
" \n",
|
" \n",
|
||||||
" return chunks\n",
|
" return chunks\n",
|
||||||
"\n",
|
"\n",
|
||||||
"transcript_chunks = split_text(transcript_text)\n",
|
"transcript_chunks = split_text(transcript_text)\n",
|
||||||
|
"\n",
|
||||||
"# Now you can summarize each chunk individually\n",
|
"# Now you can summarize each chunk individually\n",
|
||||||
"summaries = []\n",
|
"summaries = []\n",
|
||||||
"for chunk in transcript_chunks:\n",
|
"for chunk in transcript_chunks:\n",
|
||||||
" summary = summarize_text(chunk)\n",
|
" summary = summarize_text(chunk)\n",
|
||||||
" summaries.append(summary)\n",
|
" summaries.append(summary)\n",
|
||||||
" \n",
|
"\n",
|
||||||
|
"\n",
|
||||||
"# Combine the individual summaries into one\n",
|
"# Combine the individual summaries into one\n",
|
||||||
"full_summary = \" \".join(summaries)\n",
|
"full_summary = \" \".join(summaries)\n",
|
||||||
"display(Markdown(full_summary))\n"
|
"display(Markdown(full_summary))\n"
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "6b266fdc-da31-4d79-8982-be77f03be59f",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "792c814d-73f8-4c1e-a0bb-b654b40e4d8b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
|||||||
Reference in New Issue
Block a user