{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Day 4 Solution - Tokenization and Text Processing\n",
"\n",
"This is my solution to the Day 4 assignment. I've implemented tokenization understanding and text processing techniques.\n",
"\n",
"## Features Implemented:\n",
"- Tokenization with tiktoken library\n",
"- Token counting and analysis\n",
"- Text chunking strategies\n",
"- Model-specific tokenization\n",
"- Cost estimation and optimization\n",
"- Advanced text processing techniques\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Day 4 Solution - Imports and Setup\n",
"import tiktoken\n",
"import os\n",
"from dotenv import load_dotenv\n",
"from openai import OpenAI\n",
"import json\n",
"\n",
"# Load environment variables\n",
"load_dotenv(override=True)\n",
"openai = OpenAI()\n",
"\n",
"print(\"Day 4 setup complete! Ready for tokenization analysis.\")\n"
]
},
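{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional: a quick sanity check that the `OPENAI_API_KEY` environment variable actually loaded (this assumes the key is stored in a local `.env` file, as in the course setup).\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional check (sketch): confirm the API key from .env is visible to this notebook\n",
"api_key = os.getenv(\"OPENAI_API_KEY\")\n",
"\n",
"if api_key and api_key.startswith(\"sk-\"):\n",
"    print(\"OPENAI_API_KEY found - looks like a valid key format\")\n",
"else:\n",
"    print(\"OPENAI_API_KEY missing or unusual - check your .env file\")\n"
]
},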
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Understanding Tokenization\n",
"print(\"## Tokenization Fundamentals\")\n",
"print(\"=\"*50)\n",
"\n",
"# Get encoding for different models\n",
"models = [\"gpt-4o-mini\", \"gpt-4o\", \"gpt-3.5-turbo\", \"o1-mini\"]\n",
"\n",
"encodings = {}\n",
"for model in models:\n",
" try:\n",
" encodings[model] = tiktoken.encoding_for_model(model)\n",
" print(f\"✅ {model}: {encodings[model].name}\")\n",
" except Exception as e:\n",
" print(f\"❌ {model}: {e}\")\n",
"\n",
"# Test text\n",
"test_text = \"Hi my name is Ed and I like banoffee pie. This is a test of tokenization!\"\n",
"\n",
"print(f\"\\\\nTest text: '{test_text}'\")\n",
"print(f\"Text length: {len(test_text)} characters\")\n",
"\n",
"# Tokenize with different models\n",
"for model, encoding in encodings.items():\n",
" tokens = encoding.encode(test_text)\n",
" print(f\"\\\\n{model}:\")\n",
" print(f\" Tokens: {len(tokens)}\")\n",
" print(f\" Token IDs: {tokens}\")\n",
" \n",
" # Show individual tokens\n",
" print(\" Individual tokens:\")\n",
" for i, token_id in enumerate(tokens):\n",
" token_text = encoding.decode([token_id])\n",
" print(f\" {i+1}. {token_id} = '{token_text}'\")\n"
]
},
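{
"cell_type": "markdown",
"metadata": {},
"source": [
"The loop above prints a ❌ for any model name that tiktoken does not recognise. A minimal fallback sketch, assuming a recent tiktoken release that ships the `o200k_base` encoding (used by the gpt-4o family): catch the `KeyError` and drop back to a named base encoding. The `get_encoding_safe` helper below is just illustrative.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Fallback sketch: unknown model names raise KeyError in encoding_for_model,\n",
"# so fall back to a named base encoding (o200k_base is an assumption here -\n",
"# it is the encoding used by the gpt-4o family)\n",
"def get_encoding_safe(model, fallback=\"o200k_base\"):\n",
"    try:\n",
"        return tiktoken.encoding_for_model(model)\n",
"    except KeyError:\n",
"        return tiktoken.get_encoding(fallback)\n",
"\n",
"enc = get_encoding_safe(\"some-future-model\")\n",
"print(f\"Fallback encoding: {enc.name}, tokens in test text: {len(enc.encode(test_text))}\")\n"
]
},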
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Token Counting and Cost Estimation\n",
"def count_tokens(text, model=\"gpt-4o-mini\"):\n",
" \"\"\"Count tokens for a given text and model\"\"\"\n",
" try:\n",
" encoding = tiktoken.encoding_for_model(model)\n",
" return len(encoding.encode(text))\n",
" except Exception as e:\n",
" print(f\"Error counting tokens for {model}: {e}\")\n",
" return 0\n",
"\n",
"def estimate_cost(text, model=\"gpt-4o-mini\", operation=\"completion\"):\n",
" \"\"\"Estimate cost for text processing\"\"\"\n",
" token_count = count_tokens(text, model)\n",
" \n",
" # Pricing per 1K tokens (as of 2024)\n",
" pricing = {\n",
" \"gpt-4o-mini\": {\"input\": 0.00015, \"output\": 0.0006},\n",
" \"gpt-4o\": {\"input\": 0.005, \"output\": 0.015},\n",
" \"gpt-3.5-turbo\": {\"input\": 0.0005, \"output\": 0.0015}\n",
" }\n",
" \n",
" if model in pricing:\n",
" if operation == \"input\":\n",
" cost = (token_count / 1000) * pricing[model][\"input\"]\n",
" else:\n",
" cost = (token_count / 1000) * pricing[model][\"output\"]\n",
" return token_count, cost\n",
" else:\n",
" return token_count, 0\n",
"\n",
"# Test with different texts\n",
"test_texts = [\n",
" \"Hello world!\",\n",
" \"This is a longer text that will have more tokens and cost more money to process.\",\n",
" \"Machine learning is a subset of artificial intelligence that focuses on algorithms that can learn from data without being explicitly programmed for every task.\",\n",
" \"The quick brown fox jumps over the lazy dog. \" * 10 # Repeated text\n",
"]\n",
"\n",
"print(\"## Token Counting and Cost Analysis\")\n",
"print(\"=\"*60)\n",
"\n",
"for i, text in enumerate(test_texts, 1):\n",
" print(f\"\\\\nText {i}: '{text[:50]}{'...' if len(text) > 50 else ''}'\")\n",
" print(f\"Length: {len(text)} characters\")\n",
" \n",
" for model in [\"gpt-4o-mini\", \"gpt-4o\", \"gpt-3.5-turbo\"]:\n",
" tokens, cost = estimate_cost(text, model, \"input\")\n",
" print(f\" {model}: {tokens} tokens, ${cost:.6f}\")\n"
]
},
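{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a rough cross-check of these estimates, the API reports actual token usage on every response via `response.usage`. The sketch below sends one of the test texts (a real but very cheap API call) and compares the local tiktoken count with the API-reported prompt tokens; the API figure is usually a few tokens higher because the chat format adds per-message overhead.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Cross-check (sketch): local tiktoken estimate vs API-reported usage\n",
"sample = test_texts[1]\n",
"local_count = count_tokens(sample, \"gpt-4o-mini\")\n",
"\n",
"response = openai.chat.completions.create(\n",
"    model=\"gpt-4o-mini\",\n",
"    messages=[{\"role\": \"user\", \"content\": sample}],\n",
"    max_tokens=1\n",
")\n",
"\n",
"print(f\"Local tiktoken count: {local_count} tokens\")\n",
"print(f\"API-reported prompt tokens: {response.usage.prompt_tokens}\")\n",
"print(\"A small difference is expected - the chat format adds per-message overhead tokens\")\n"
]
},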
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Text Chunking Strategies\n",
"def chunk_text_by_tokens(text, max_tokens=1000, model=\"gpt-4o-mini\", overlap=50):\n",
" \"\"\"Split text into chunks based on token count\"\"\"\n",
" encoding = tiktoken.encoding_for_model(model)\n",
" \n",
" # Encode the entire text\n",
" tokens = encoding.encode(text)\n",
" chunks = []\n",
" \n",
" start = 0\n",
" while start < len(tokens):\n",
" # Get chunk of tokens\n",
" end = min(start + max_tokens, len(tokens))\n",
" chunk_tokens = tokens[start:end]\n",
" \n",
" # Decode back to text\n",
" chunk_text = encoding.decode(chunk_tokens)\n",
" chunks.append(chunk_text)\n",
" \n",
" # Move start position with overlap\n",
" start = end - overlap if end < len(tokens) else end\n",
" \n",
" return chunks\n",
"\n",
"def chunk_text_by_sentences(text, max_tokens=1000, model=\"gpt-4o-mini\"):\n",
" \"\"\"Split text into chunks by sentences, respecting token limits\"\"\"\n",
" encoding = tiktoken.encoding_for_model(model)\n",
" \n",
" # Split by sentences (simple approach)\n",
" sentences = text.split('. ')\n",
" chunks = []\n",
" current_chunk = \"\"\n",
" \n",
" for sentence in sentences:\n",
" # Add sentence to current chunk\n",
" test_chunk = current_chunk + sentence + \". \" if current_chunk else sentence + \". \"\n",
" \n",
" # Check token count\n",
" if count_tokens(test_chunk, model) <= max_tokens:\n",
" current_chunk = test_chunk\n",
" else:\n",
" # Save current chunk and start new one\n",
" if current_chunk:\n",
" chunks.append(current_chunk.strip())\n",
" current_chunk = sentence + \". \"\n",
" \n",
" # Add final chunk\n",
" if current_chunk:\n",
" chunks.append(current_chunk.strip())\n",
" \n",
" return chunks\n",
"\n",
"# Test chunking strategies\n",
"long_text = \"\"\"\n",
"Machine learning is a subset of artificial intelligence that focuses on algorithms that can learn from data without being explicitly programmed for every task. \n",
"It involves training models on large datasets to make predictions or decisions. \n",
"There are three main types of machine learning: supervised learning, unsupervised learning, and reinforcement learning. \n",
"Supervised learning uses labeled training data to learn a mapping from inputs to outputs. \n",
"Unsupervised learning finds hidden patterns in data without labeled examples. \n",
"Reinforcement learning learns through interaction with an environment using rewards and penalties. \n",
"Deep learning is a subset of machine learning that uses neural networks with multiple layers. \n",
"These networks can automatically learn hierarchical representations of data. \n",
"Popular deep learning frameworks include TensorFlow, PyTorch, and Keras. \n",
"Machine learning has applications in computer vision, natural language processing, speech recognition, and many other domains.\n",
"\"\"\" * 3 # Repeat to make it longer\n",
"\n",
"print(\"## Text Chunking Strategies\")\n",
"print(\"=\"*50)\n",
"\n",
"print(f\"Original text length: {len(long_text)} characters\")\n",
"print(f\"Token count: {count_tokens(long_text, 'gpt-4o-mini')} tokens\")\n",
"\n",
"# Test token-based chunking\n",
"print(\"\\\\n📊 Token-based chunking:\")\n",
"token_chunks = chunk_text_by_tokens(long_text, max_tokens=200, model=\"gpt-4o-mini\")\n",
"for i, chunk in enumerate(token_chunks):\n",
" tokens = count_tokens(chunk, \"gpt-4o-mini\")\n",
" print(f\" Chunk {i+1}: {tokens} tokens, {len(chunk)} chars\")\n",
"\n",
"# Test sentence-based chunking\n",
"print(\"\\\\n📊 Sentence-based chunking:\")\n",
"sentence_chunks = chunk_text_by_sentences(long_text, max_tokens=200, model=\"gpt-4o-mini\")\n",
"for i, chunk in enumerate(sentence_chunks):\n",
" tokens = count_tokens(chunk, \"gpt-4o-mini\")\n",
" print(f\" Chunk {i+1}: {tokens} tokens, {len(chunk)} chars\")\n"
]
},
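{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick way to see the `overlap` parameter at work: token-based chunks repeat their last 50 tokens at the start of the next chunk, so the opening text of each chunk (after the first) should also appear near the end of the previous one. The check below is just a sketch that prints whether that holds.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Overlap check (sketch): the opening of chunk i+1 should reappear near the end of chunk i\n",
"for i in range(len(token_chunks) - 1):\n",
"    snippet = token_chunks[i + 1][:80]\n",
"    print(f\"Chunk {i+2} opening found in chunk {i+1}: {snippet in token_chunks[i]}\")\n"
]
},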
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Advanced Text Processing with Token Awareness\n",
"def process_large_text(text, model=\"gpt-4o-mini\", max_tokens=1000, operation=\"summarize\"):\n",
" \"\"\"Process large text with token awareness\"\"\"\n",
" chunks = chunk_text_by_tokens(text, max_tokens, model)\n",
" \n",
" print(f\"📊 Processing {len(chunks)} chunks with {model}\")\n",
" \n",
" results = []\n",
" total_cost = 0\n",
" \n",
" for i, chunk in enumerate(chunks):\n",
" print(f\"\\\\nProcessing chunk {i+1}/{len(chunks)}...\")\n",
" \n",
" # Count tokens and estimate cost\n",
" tokens, cost = estimate_cost(chunk, model, \"input\")\n",
" total_cost += cost\n",
" \n",
" # Process chunk based on operation\n",
" if operation == \"summarize\":\n",
" prompt = f\"Summarize this text in 2-3 sentences:\\\\n\\\\n{chunk}\"\n",
" elif operation == \"extract_keywords\":\n",
" prompt = f\"Extract the 5 most important keywords from this text:\\\\n\\\\n{chunk}\"\n",
" elif operation == \"sentiment\":\n",
" prompt = f\"Analyze the sentiment of this text (positive/negative/neutral):\\\\n\\\\n{chunk}\"\n",
" else:\n",
" prompt = f\"Process this text:\\\\n\\\\n{chunk}\"\n",
" \n",
" try:\n",
" response = openai.chat.completions.create(\n",
" model=model,\n",
" messages=[{\"role\": \"user\", \"content\": prompt}],\n",
" max_tokens=100,\n",
" temperature=0.3\n",
" )\n",
" \n",
" result = response.choices[0].message.content\n",
" results.append(result)\n",
" \n",
" # Estimate output cost\n",
" output_tokens, output_cost = estimate_cost(result, model, \"output\")\n",
" total_cost += output_cost\n",
" \n",
" print(f\" ✅ Chunk {i+1} processed: {len(result)} chars\")\n",
" \n",
" except Exception as e:\n",
" print(f\" ❌ Error processing chunk {i+1}: {e}\")\n",
" results.append(f\"Error: {e}\")\n",
" \n",
" print(f\"\\\\n💰 Total estimated cost: ${total_cost:.6f}\")\n",
" return results, total_cost\n",
"\n",
"# Test with a long document\n",
"document = \"\"\"\n",
"Artificial Intelligence (AI) has become one of the most transformative technologies of the 21st century. \n",
"It encompasses a wide range of techniques and applications that enable machines to perform tasks that typically require human intelligence. \n",
"Machine learning, a subset of AI, allows systems to automatically learn and improve from experience without being explicitly programmed. \n",
"Deep learning, which uses neural networks with multiple layers, has achieved remarkable success in areas like image recognition, natural language processing, and game playing. \n",
"AI applications are now ubiquitous, from recommendation systems on e-commerce platforms to autonomous vehicles and medical diagnosis tools. \n",
"The field continues to evolve rapidly, with new architectures and training methods being developed regularly. \n",
"However, AI also raises important questions about ethics, bias, job displacement, and the need for responsible development and deployment. \n",
"As AI becomes more powerful and widespread, it's crucial to ensure that these systems are fair, transparent, and beneficial to society as a whole.\n",
"\"\"\" * 5 # Make it longer\n",
"\n",
"print(\"## Advanced Text Processing with Token Awareness\")\n",
"print(\"=\"*60)\n",
"\n",
"# Test summarization\n",
"print(\"\\\\n📝 Testing summarization...\")\n",
"summaries, cost = process_large_text(document, operation=\"summarize\")\n",
"print(f\"\\\\nGenerated {len(summaries)} summaries\")\n",
"for i, summary in enumerate(summaries):\n",
" print(f\"\\\\nSummary {i+1}: {summary}\")\n",
"\n",
"print(f\"\\\\nTotal cost: ${cost:.6f}\")\n"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}