fix: chunk text func
imp: add language params to get_transcript
This commit is contained in:
@@ -28,7 +28,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 111,
|
||||
"execution_count": null,
|
||||
"id": "a082ddaf-abf5-4e6c-8112-74846c768301",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -74,7 +74,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 113,
|
||||
"execution_count": null,
|
||||
"id": "019974d9-f3ad-4a8a-b5f9-0a3719aea2d3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -84,7 +84,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 114,
|
||||
"execution_count": null,
|
||||
"id": "c5e793b2-6775-426a-a139-4848291d0463",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -120,26 +120,28 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Example usage\n",
|
||||
"video_url = \"https://www.youtube.com/watch?v=5zuF4Ys1eAw\"\n",
|
||||
"video_url = \"https://www.youtube.com/watch?v=kqaMIFEz15s\"\n",
|
||||
"\n",
|
||||
"yt_video = YoutubeVideoID(video_url)\n",
|
||||
"print(yt_video) # Output: Video ID: cicHKo4zH-w"
|
||||
"print(yt_video)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 116,
|
||||
"execution_count": null,
|
||||
"id": "f724be3c-bdeb-4079-b4be-f12608144484",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_transcript(video_id):\n",
|
||||
"def get_transcript(video_id, language='en'):\n",
|
||||
" try:\n",
|
||||
" transcript = YouTubeTranscriptApi.get_transcript(video_id)\n",
|
||||
" # Try to get the transcript in the desired language (English by default)\n",
|
||||
" transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[language])\n",
|
||||
" # Join all the 'text' fields into a single string\n",
|
||||
" return \" \".join([item['text'] for item in transcript])\n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\"Error fetching transcript: {e}\")\n",
|
||||
" return None"
|
||||
" return None\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -156,7 +158,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 118,
|
||||
"execution_count": null,
|
||||
"id": "0a0750be-88a1-4e65-9cb8-a0a2f11eecdf",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -164,10 +166,18 @@
|
||||
"# Function to summarize text using ChatGPT\n",
|
||||
"def summarize_text(text):\n",
|
||||
" try:\n",
|
||||
" system_prompts = \"\"\"\n",
|
||||
" You are a helpful assistant who provides concise and accurate summaries of text. Your task is to:\n",
|
||||
" \n",
|
||||
" - Capture the key points of the content.\n",
|
||||
" - Keep the summary brief and easy to understand.\n",
|
||||
" - Avoid summarizing overly lengthy texts or breaking them into excessively short summaries.\n",
|
||||
" - Use bullet points where appropriate to enhance clarity and structure.\n",
|
||||
" \"\"\"\n",
|
||||
" response = openai.chat.completions.create(\n",
|
||||
" model=\"gpt-4o-mini\",\n",
|
||||
" messages=[\n",
|
||||
" {\"role\": \"system\", \"content\": \"You are a helpful assistant that summarizes text.\"},\n",
|
||||
" {\"role\": \"system\", \"content\": system_prompts},\n",
|
||||
" {\"role\": \"user\", \"content\": f\"Summarize the following text:\\n{text}\"}\n",
|
||||
" ],\n",
|
||||
" max_tokens=200\n",
|
||||
@@ -186,36 +196,60 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def split_text(text, chunk_size=3000):\n",
|
||||
" \"\"\"Splits large text into smaller chunks based on the given chunk size.\"\"\"\n",
|
||||
" \"\"\"\n",
|
||||
" Splits large text into smaller chunks based on the given chunk size.\n",
|
||||
" Ensures that chunks end with a full stop where possible to maintain sentence integrity.\n",
|
||||
" \n",
|
||||
" :param text: str, the text to be split\n",
|
||||
" :param chunk_size: int, maximum size of each chunk (default 3000 characters)\n",
|
||||
" :return: list of str, where each str is a chunk of text\n",
|
||||
" \"\"\"\n",
|
||||
" chunks = []\n",
|
||||
" while len(text) > chunk_size:\n",
|
||||
" # Find the last full stop within the chunk size to avoid cutting sentences\n",
|
||||
" split_point = text.rfind('.', 0, chunk_size)\n",
|
||||
" \n",
|
||||
" # If no full stop is found, just split at the chunk size\n",
|
||||
" if split_point == -1:\n",
|
||||
" # Find the last full stop within or at the chunk size\n",
|
||||
" split_point = text.rfind('.', 0, chunk_size + 1) # +1 to include the period itself if it's at chunk_size\n",
|
||||
" if split_point == -1: # No period found within the chunk size\n",
|
||||
" split_point = chunk_size\n",
|
||||
" \n",
|
||||
" chunks.append(text[:split_point].strip())\n",
|
||||
" text = text[split_point:].strip()\n",
|
||||
" # Append the chunk, ensuring we don't strip spaces that might be part of the sentence structure\n",
|
||||
" chunks.append(text[:split_point + 1] if split_point != chunk_size else text[:chunk_size])\n",
|
||||
" text = text[split_point + 1:] if split_point != chunk_size else text[chunk_size:]\n",
|
||||
" \n",
|
||||
" # Add the remaining text as the final chunk\n",
|
||||
" # Add the remaining text as the final chunk, only strip if there's content\n",
|
||||
" if text:\n",
|
||||
" chunks.append(text)\n",
|
||||
" chunks.append(text.strip())\n",
|
||||
" \n",
|
||||
" return chunks\n",
|
||||
"\n",
|
||||
"transcript_chunks = split_text(transcript_text)\n",
|
||||
"\n",
|
||||
"# Now you can summarize each chunk individually\n",
|
||||
"summaries = []\n",
|
||||
"for chunk in transcript_chunks:\n",
|
||||
" summary = summarize_text(chunk)\n",
|
||||
" summaries.append(summary)\n",
|
||||
" \n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Combine the individual summaries into one\n",
|
||||
"full_summary = \" \".join(summaries)\n",
|
||||
"display(Markdown(full_summary))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6b266fdc-da31-4d79-8982-be77f03be59f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "792c814d-73f8-4c1e-a0bb-b654b40e4d8b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
||||
Reference in New Issue
Block a user