{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Day 4 Solution - Tokenization and Text Processing\n",
    "\n",
    "This is my solution to the Day 4 assignment. It walks through how tokenization works and applies several token-aware text processing techniques.\n",
    "\n",
    "## Features Implemented:\n",
    "- Tokenization with the tiktoken library (a minimal warm-up example follows just below)\n",
    "- Token counting and analysis\n",
    "- Text chunking strategies\n",
    "- Model-specific tokenization\n",
    "- Cost estimation and optimization\n",
    "- Advanced text processing techniques\n"
   ]
  },
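  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Warm-up sketch: a minimal encode/decode round-trip with tiktoken, to show\n",
    "# what a \"token\" is before the fuller analysis below. Assumes tiktoken is\n",
    "# installed and that it recognises the \"gpt-4o-mini\" model name.\n",
    "import tiktoken\n",
    "\n",
    "enc = tiktoken.encoding_for_model(\"gpt-4o-mini\")\n",
    "ids = enc.encode(\"Hello world!\")\n",
    "print(ids)              # token IDs: a short list of integers\n",
    "print(enc.decode(ids))  # decoding the IDs reproduces the original text\n"
   ]
  },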
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Day 4 Solution - Imports and Setup\n",
    "import tiktoken\n",
    "import os\n",
    "from dotenv import load_dotenv\n",
    "from openai import OpenAI\n",
    "import json\n",
    "\n",
    "# Load environment variables\n",
    "load_dotenv(override=True)\n",
    "openai = OpenAI()\n",
    "\n",
    "print(\"Day 4 setup complete! Ready for tokenization analysis.\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Understanding Tokenization\n",
    "print(\"## Tokenization Fundamentals\")\n",
    "print(\"=\"*50)\n",
    "\n",
    "# Get encoding for different models\n",
    "models = [\"gpt-4o-mini\", \"gpt-4o\", \"gpt-3.5-turbo\", \"o1-mini\"]\n",
    "\n",
    "encodings = {}\n",
    "for model in models:\n",
    "    try:\n",
    "        encodings[model] = tiktoken.encoding_for_model(model)\n",
    "        print(f\"✅ {model}: {encodings[model].name}\")\n",
    "    except Exception as e:\n",
    "        print(f\"❌ {model}: {e}\")\n",
    "\n",
    "# Test text\n",
    "test_text = \"Hi my name is Ed and I like banoffee pie. This is a test of tokenization!\"\n",
    "\n",
    "print(f\"\\nTest text: '{test_text}'\")\n",
    "print(f\"Text length: {len(test_text)} characters\")\n",
    "\n",
    "# Tokenize with different models\n",
    "for model, encoding in encodings.items():\n",
    "    tokens = encoding.encode(test_text)\n",
    "    print(f\"\\n{model}:\")\n",
    "    print(f\"  Tokens: {len(tokens)}\")\n",
    "    print(f\"  Token IDs: {tokens}\")\n",
    "\n",
    "    # Show individual tokens\n",
    "    print(\"  Individual tokens:\")\n",
    "    for i, token_id in enumerate(tokens):\n",
    "        token_text = encoding.decode([token_id])\n",
    "        print(f\"    {i+1}. {token_id} = '{token_text}'\")\n"
   ]
  },
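  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Follow-up sketch: average characters per token for the test text above,\n",
    "# a quick way to build intuition for how compact each model's encoding is.\n",
    "# Reuses the `encodings` and `test_text` variables from the previous cell.\n",
    "for model, encoding in encodings.items():\n",
    "    n_tokens = len(encoding.encode(test_text))\n",
    "    print(f\"{model}: {len(test_text) / n_tokens:.2f} characters per token\")\n"
   ]
  },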
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Token Counting and Cost Estimation\n",
    "def count_tokens(text, model=\"gpt-4o-mini\"):\n",
    "    \"\"\"Count tokens for a given text and model\"\"\"\n",
    "    try:\n",
    "        encoding = tiktoken.encoding_for_model(model)\n",
    "        return len(encoding.encode(text))\n",
    "    except Exception as e:\n",
    "        print(f\"Error counting tokens for {model}: {e}\")\n",
    "        return 0\n",
    "\n",
    "def estimate_cost(text, model=\"gpt-4o-mini\", operation=\"completion\"):\n",
    "    \"\"\"Estimate the cost (in USD) of processing the text as input or output\"\"\"\n",
    "    token_count = count_tokens(text, model)\n",
    "\n",
    "    # Pricing per 1K tokens (as of 2024)\n",
    "    pricing = {\n",
    "        \"gpt-4o-mini\": {\"input\": 0.00015, \"output\": 0.0006},\n",
    "        \"gpt-4o\": {\"input\": 0.005, \"output\": 0.015},\n",
    "        \"gpt-3.5-turbo\": {\"input\": 0.0005, \"output\": 0.0015}\n",
    "    }\n",
    "\n",
    "    if model in pricing:\n",
    "        if operation == \"input\":\n",
    "            cost = (token_count / 1000) * pricing[model][\"input\"]\n",
    "        else:\n",
    "            cost = (token_count / 1000) * pricing[model][\"output\"]\n",
    "        return token_count, cost\n",
    "    else:\n",
    "        # Unknown model: return the token count with no cost estimate\n",
    "        return token_count, 0\n",
    "\n",
    "# Test with different texts\n",
    "test_texts = [\n",
    "    \"Hello world!\",\n",
    "    \"This is a longer text that will have more tokens and cost more money to process.\",\n",
    "    \"Machine learning is a subset of artificial intelligence that focuses on algorithms that can learn from data without being explicitly programmed for every task.\",\n",
    "    \"The quick brown fox jumps over the lazy dog. \" * 10  # Repeated text\n",
    "]\n",
    "\n",
    "print(\"## Token Counting and Cost Analysis\")\n",
    "print(\"=\"*60)\n",
    "\n",
    "for i, text in enumerate(test_texts, 1):\n",
    "    print(f\"\\nText {i}: '{text[:50]}{'...' if len(text) > 50 else ''}'\")\n",
    "    print(f\"Length: {len(text)} characters\")\n",
    "\n",
    "    for model in [\"gpt-4o-mini\", \"gpt-4o\", \"gpt-3.5-turbo\"]:\n",
    "        tokens, cost = estimate_cost(text, model, \"input\")\n",
    "        print(f\"  {model}: {tokens} tokens, ${cost:.6f}\")\n"
   ]
  },
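  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity-check sketch for the per-1K pricing arithmetic above, using the same\n",
    "# assumed 2024 prices: 1,000,000 input tokens on gpt-4o-mini at $0.00015 per\n",
    "# 1K tokens should come out to $0.15.\n",
    "tokens = 1_000_000\n",
    "price_per_1k_input = 0.00015  # gpt-4o-mini input price assumed in the pricing table\n",
    "print(f\"{tokens:,} input tokens on gpt-4o-mini ≈ ${(tokens / 1000) * price_per_1k_input:.2f}\")\n"
   ]
  },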
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Text Chunking Strategies\n",
    "def chunk_text_by_tokens(text, max_tokens=1000, model=\"gpt-4o-mini\", overlap=50):\n",
    "    \"\"\"Split text into chunks based on token count\"\"\"\n",
    "    encoding = tiktoken.encoding_for_model(model)\n",
    "\n",
    "    # Encode the entire text\n",
    "    tokens = encoding.encode(text)\n",
    "    chunks = []\n",
    "\n",
    "    start = 0\n",
    "    while start < len(tokens):\n",
    "        # Get chunk of tokens\n",
    "        end = min(start + max_tokens, len(tokens))\n",
    "        chunk_tokens = tokens[start:end]\n",
    "\n",
    "        # Decode back to text\n",
    "        chunk_text = encoding.decode(chunk_tokens)\n",
    "        chunks.append(chunk_text)\n",
    "\n",
    "        # Move the start position back by `overlap` tokens so consecutive chunks share context\n",
    "        start = end - overlap if end < len(tokens) else end\n",
    "\n",
    "    return chunks\n",
    "\n",
    "def chunk_text_by_sentences(text, max_tokens=1000, model=\"gpt-4o-mini\"):\n",
    "    \"\"\"Split text into chunks by sentences, respecting token limits\"\"\"\n",
    "    # Split by sentences (simple approach)\n",
    "    sentences = text.split('. ')\n",
    "    chunks = []\n",
    "    current_chunk = \"\"\n",
    "\n",
    "    for sentence in sentences:\n",
    "        # Tentatively add the sentence to the current chunk\n",
    "        test_chunk = current_chunk + sentence + \". \" if current_chunk else sentence + \". \"\n",
    "\n",
    "        # Check token count\n",
    "        if count_tokens(test_chunk, model) <= max_tokens:\n",
    "            current_chunk = test_chunk\n",
    "        else:\n",
    "            # Save the current chunk and start a new one\n",
    "            # (a single sentence longer than max_tokens becomes its own oversized chunk)\n",
    "            if current_chunk:\n",
    "                chunks.append(current_chunk.strip())\n",
    "            current_chunk = sentence + \". \"\n",
    "\n",
    "    # Add final chunk\n",
    "    if current_chunk:\n",
    "        chunks.append(current_chunk.strip())\n",
    "\n",
    "    return chunks\n",
    "\n",
    "# Test chunking strategies\n",
    "long_text = \"\"\"\n",
    "Machine learning is a subset of artificial intelligence that focuses on algorithms that can learn from data without being explicitly programmed for every task. \n",
    "It involves training models on large datasets to make predictions or decisions. \n",
    "There are three main types of machine learning: supervised learning, unsupervised learning, and reinforcement learning. \n",
    "Supervised learning uses labeled training data to learn a mapping from inputs to outputs. \n",
    "Unsupervised learning finds hidden patterns in data without labeled examples. \n",
    "Reinforcement learning learns through interaction with an environment using rewards and penalties. \n",
    "Deep learning is a subset of machine learning that uses neural networks with multiple layers. \n",
    "These networks can automatically learn hierarchical representations of data. \n",
    "Popular deep learning frameworks include TensorFlow, PyTorch, and Keras. \n",
    "Machine learning has applications in computer vision, natural language processing, speech recognition, and many other domains.\n",
    "\"\"\" * 3  # Repeat to make it longer\n",
    "\n",
    "print(\"## Text Chunking Strategies\")\n",
    "print(\"=\"*50)\n",
    "\n",
    "print(f\"Original text length: {len(long_text)} characters\")\n",
    "print(f\"Token count: {count_tokens(long_text, 'gpt-4o-mini')} tokens\")\n",
    "\n",
    "# Test token-based chunking\n",
    "print(\"\\n📊 Token-based chunking:\")\n",
    "token_chunks = chunk_text_by_tokens(long_text, max_tokens=200, model=\"gpt-4o-mini\")\n",
    "for i, chunk in enumerate(token_chunks):\n",
    "    tokens = count_tokens(chunk, \"gpt-4o-mini\")\n",
    "    print(f\"  Chunk {i+1}: {tokens} tokens, {len(chunk)} chars\")\n",
    "\n",
    "# Test sentence-based chunking\n",
    "print(\"\\n📊 Sentence-based chunking:\")\n",
    "sentence_chunks = chunk_text_by_sentences(long_text, max_tokens=200, model=\"gpt-4o-mini\")\n",
    "for i, chunk in enumerate(sentence_chunks):\n",
    "    tokens = count_tokens(chunk, \"gpt-4o-mini\")\n",
    "    print(f\"  Chunk {i+1}: {tokens} tokens, {len(chunk)} chars\")\n"
   ]
  },
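  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: make the overlap parameter of chunk_text_by_tokens visible. Because\n",
    "# consecutive chunks share `overlap` tokens, the tail of one chunk should\n",
    "# reappear at the head of the next. Reuses chunk_text_by_tokens and long_text\n",
    "# from the previous cell; the 200/50 values here are just illustrative.\n",
    "demo_chunks = chunk_text_by_tokens(long_text, max_tokens=200, model=\"gpt-4o-mini\", overlap=50)\n",
    "if len(demo_chunks) >= 2:\n",
    "    print(\"End of chunk 1:   ...\", demo_chunks[0][-80:])\n",
    "    print(\"Start of chunk 2:\", demo_chunks[1][:80], \"...\")\n"
   ]
  },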
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Advanced Text Processing with Token Awareness\n",
    "def process_large_text(text, model=\"gpt-4o-mini\", max_tokens=1000, operation=\"summarize\"):\n",
    "    \"\"\"Process large text chunk by chunk, tracking tokens and estimated cost\"\"\"\n",
    "    chunks = chunk_text_by_tokens(text, max_tokens, model)\n",
    "\n",
    "    print(f\"📊 Processing {len(chunks)} chunks with {model}\")\n",
    "\n",
    "    results = []\n",
    "    total_cost = 0\n",
    "\n",
    "    for i, chunk in enumerate(chunks):\n",
    "        print(f\"\\nProcessing chunk {i+1}/{len(chunks)}...\")\n",
    "\n",
    "        # Count tokens and estimate input cost\n",
    "        tokens, cost = estimate_cost(chunk, model, \"input\")\n",
    "        total_cost += cost\n",
    "\n",
    "        # Build the prompt for the requested operation\n",
    "        if operation == \"summarize\":\n",
    "            prompt = f\"Summarize this text in 2-3 sentences:\\n\\n{chunk}\"\n",
    "        elif operation == \"extract_keywords\":\n",
    "            prompt = f\"Extract the 5 most important keywords from this text:\\n\\n{chunk}\"\n",
    "        elif operation == \"sentiment\":\n",
    "            prompt = f\"Analyze the sentiment of this text (positive/negative/neutral):\\n\\n{chunk}\"\n",
    "        else:\n",
    "            prompt = f\"Process this text:\\n\\n{chunk}\"\n",
    "\n",
    "        try:\n",
    "            response = openai.chat.completions.create(\n",
    "                model=model,\n",
    "                messages=[{\"role\": \"user\", \"content\": prompt}],\n",
    "                max_tokens=100,\n",
    "                temperature=0.3\n",
    "            )\n",
    "\n",
    "            result = response.choices[0].message.content\n",
    "            results.append(result)\n",
    "\n",
    "            # Estimate output cost\n",
    "            output_tokens, output_cost = estimate_cost(result, model, \"output\")\n",
    "            total_cost += output_cost\n",
    "\n",
    "            print(f\"  ✅ Chunk {i+1} processed: {len(result)} chars\")\n",
    "\n",
    "        except Exception as e:\n",
    "            print(f\"  ❌ Error processing chunk {i+1}: {e}\")\n",
    "            results.append(f\"Error: {e}\")\n",
    "\n",
    "    print(f\"\\n💰 Total estimated cost: ${total_cost:.6f}\")\n",
    "    return results, total_cost\n",
    "\n",
    "# Test with a long document\n",
    "document = \"\"\"\n",
    "Artificial Intelligence (AI) has become one of the most transformative technologies of the 21st century. \n",
    "It encompasses a wide range of techniques and applications that enable machines to perform tasks that typically require human intelligence. \n",
    "Machine learning, a subset of AI, allows systems to automatically learn and improve from experience without being explicitly programmed. \n",
    "Deep learning, which uses neural networks with multiple layers, has achieved remarkable success in areas like image recognition, natural language processing, and game playing. \n",
    "AI applications are now ubiquitous, from recommendation systems on e-commerce platforms to autonomous vehicles and medical diagnosis tools. \n",
    "The field continues to evolve rapidly, with new architectures and training methods being developed regularly. \n",
    "However, AI also raises important questions about ethics, bias, job displacement, and the need for responsible development and deployment. \n",
    "As AI becomes more powerful and widespread, it's crucial to ensure that these systems are fair, transparent, and beneficial to society as a whole.\n",
    "\"\"\" * 5  # Make it longer\n",
    "\n",
    "print(\"## Advanced Text Processing with Token Awareness\")\n",
    "print(\"=\"*60)\n",
    "\n",
    "# Test summarization\n",
    "print(\"\\n📝 Testing summarization...\")\n",
    "summaries, cost = process_large_text(document, operation=\"summarize\")\n",
    "print(f\"\\nGenerated {len(summaries)} summaries\")\n",
    "for i, summary in enumerate(summaries):\n",
    "    print(f\"\\nSummary {i+1}: {summary}\")\n",
    "\n",
    "print(f\"\\nTotal cost: ${cost:.6f}\")\n"
   ]
  }
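  ,
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional usage sketch for the other process_large_text operations defined\n",
    "# above (\"extract_keywords\" and \"sentiment\"). This makes live API calls, so it\n",
    "# assumes the OPENAI_API_KEY loaded in the setup cell is valid; only a slice of\n",
    "# the document is used here to keep the estimated cost small.\n",
    "keywords, kw_cost = process_large_text(document[:2000], operation=\"extract_keywords\")\n",
    "for i, kw in enumerate(keywords, 1):\n",
    "    print(f\"Keywords for chunk {i}: {kw}\")\n",
    "print(f\"Keyword-extraction cost: ${kw_cost:.6f}\")\n"
   ]
  }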
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}