Files
LLM_Engineering_OLD/week1/community-contributions/week1-assignment-Joshua/day4_tokenization_cost_chunking.ipynb

241 lines
8.4 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Week 1 Day 4 - Tokenization, Cost Estimation, and Chunking (Community Contribution)\n",
"\n",
"This notebook demonstrates:\n",
"- Tokenization using `tiktoken`\n",
"- Token counting per model\n",
"- Simple cost estimation\n",
"- Chunking long text by tokens and by sentences\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Setup complete\n"
]
}
],
"source": [
"# Imports and setup\n",
"import tiktoken\n",
"from openai import OpenAI\n",
"from dotenv import load_dotenv\n",
"import os\n",
"\n",
"load_dotenv(override=True)\n",
"\n",
"# Fail fast with a clear message if the key was not loaded from .env --\n",
"# otherwise later API calls fail with a confusing auth error.\n",
"if not os.getenv(\"OPENAI_API_KEY\"):\n",
"    print(\"Warning: OPENAI_API_KEY not found - OpenAI API calls will fail\")\n",
"\n",
"openai = OpenAI()\n",
"\n",
"print(\"Setup complete\")\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"✅ gpt-4o-mini: o200k_base\n",
"✅ gpt-4o: o200k_base\n",
"✅ gpt-3.5-turbo: cl100k_base\n",
"\n",
"Text length: 73 chars\n",
"\n",
"gpt-4o-mini: 20 tokens -> [12194, 922, 1308, 382, 6117, 326, 357, 1299, 9171, 26458, 5148, 13, 1328, 382, 261, 1746, 328, 6602, 2860, 0]\n",
"\n",
"gpt-4o: 20 tokens -> [12194, 922, 1308, 382, 6117, 326, 357, 1299, 9171, 26458, 5148, 13, 1328, 382, 261, 1746, 328, 6602, 2860, 0]\n",
"\n",
"gpt-3.5-turbo: 20 tokens -> [13347, 856, 836, 374, 3279, 323, 358, 1093, 9120, 21869, 4447, 13, 1115, 374, 264, 1296, 315, 4037, 2065, 0]\n"
]
}
],
"source": [
"# Tokenization per model\n",
"models = [\"gpt-4o-mini\", \"gpt-4o\", \"gpt-3.5-turbo\"]\n",
"\n",
"# Resolve each model's tokenizer; skip any model tiktoken doesn't recognise\n",
"encodings = {}\n",
"for m in models:\n",
"    try:\n",
"        encodings[m] = tiktoken.encoding_for_model(m)\n",
"    except Exception as e:\n",
"        print(f\"❌ {m}: {e}\")\n",
"    else:\n",
"        print(f\"✅ {m}: {encodings[m].name}\")\n",
"\n",
"text = \"Hi my name is Ed and I like banoffee pie. This is a test of tokenization!\"\n",
"print(f\"\\nText length: {len(text)} chars\")\n",
"\n",
"# Encode the same sentence with every tokenizer and compare token ids/counts\n",
"for m, enc in encodings.items():\n",
"    toks = enc.encode(text)\n",
"    print(f\"\\n{m}: {len(toks)} tokens -> {toks}\")\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Text: Hello world!\n",
" gpt-4o-mini: 3 tokens, est input cost $0.000000\n",
" gpt-4o: 3 tokens, est input cost $0.000015\n",
" gpt-3.5-turbo: 3 tokens, est input cost $0.000002\n",
"\n",
"Text: This is a longer text that will have more tokens and cost more money to process.\n",
" gpt-4o-mini: 17 tokens, est input cost $0.000003\n",
" gpt-4o: 17 tokens, est input cost $0.000085\n",
" gpt-3.5-turbo: 17 tokens, est input cost $0.000009\n"
]
}
],
"source": [
"# Token counting and simple cost estimation\n",
"from functools import lru_cache\n",
"\n",
"# Prices in USD per 1K tokens. These go stale -- check the OpenAI pricing\n",
"# page before trusting any estimate.\n",
"PRICING = {\n",
"    \"gpt-4o-mini\": {\"input\": 0.00015, \"output\": 0.0006},\n",
"    \"gpt-4o\": {\"input\": 0.005, \"output\": 0.015},\n",
"    \"gpt-3.5-turbo\": {\"input\": 0.0005, \"output\": 0.0015},\n",
"}\n",
"\n",
"@lru_cache(maxsize=None)\n",
"def _encoding_for(model):\n",
"    \"\"\"Cache encoders: tiktoken.encoding_for_model is not free per call.\"\"\"\n",
"    return tiktoken.encoding_for_model(model)\n",
"\n",
"def count_tokens(text, model=\"gpt-4o-mini\"):\n",
"    \"\"\"Return the number of tokens `text` encodes to for `model`.\"\"\"\n",
"    return len(_encoding_for(model).encode(text))\n",
"\n",
"def estimate_cost(tokens, model=\"gpt-4o-mini\", kind=\"input\"):\n",
"    \"\"\"Estimate the USD cost of `tokens` tokens.\n",
"\n",
"    kind is 'input' or 'output'. Returns 0.0 for an unknown model or\n",
"    kind instead of raising (unknown kind previously raised KeyError).\n",
"    \"\"\"\n",
"    rate = PRICING.get(model, {}).get(kind, 0.0)\n",
"    return (tokens / 1000) * rate\n",
"\n",
"samples = [\n",
"    \"Hello world!\",\n",
"    \"This is a longer text that will have more tokens and cost more money to process.\",\n",
"]\n",
"\n",
"for s in samples:\n",
"    print(f\"\\nText: {s}\")\n",
"    for m in PRICING.keys():\n",
"        t = count_tokens(s, m)\n",
"        c = estimate_cost(t, m, \"input\")\n",
"        print(f\" {m}: {t} tokens, est input cost ${c:.6f}\")\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Token-based chunks:\n",
" Chunk 1: 677 chars\n",
" Chunk 2: 690 chars\n",
" Chunk 3: 700 chars\n",
" Chunk 4: 670 chars\n",
" Chunk 5: 688 chars\n",
" Chunk 6: 711 chars\n",
" Chunk 7: 670 chars\n",
" Chunk 8: 238 chars\n",
"\n",
"Sentence-based chunks:\n",
" Chunk 1: 637 chars\n",
" Chunk 2: 698 chars\n",
" Chunk 3: 582 chars\n",
" Chunk 4: 637 chars\n",
" Chunk 5: 698 chars\n",
" Chunk 6: 582 chars\n"
]
}
],
"source": [
"# Chunking helpers\n",
"import re\n",
"\n",
"def chunk_by_tokens(text, model=\"gpt-4o-mini\", max_tokens=300, overlap=30):\n",
"    \"\"\"Split `text` into chunks of at most `max_tokens` tokens each,\n",
"    repeating `overlap` tokens between consecutive chunks for context.\n",
"    Returns a list of decoded chunk strings ([] for empty input).\n",
"    \"\"\"\n",
"    # Guard: if overlap >= max_tokens, `start = end - overlap` never\n",
"    # advances and the loop below runs forever. Clamp to a safe value.\n",
"    overlap = max(0, min(overlap, max_tokens - 1))\n",
"    enc = tiktoken.encoding_for_model(model)\n",
"    toks = enc.encode(text)\n",
"    chunks = []\n",
"    start = 0\n",
"    while start < len(toks):\n",
"        end = min(start + max_tokens, len(toks))\n",
"        chunks.append(enc.decode(toks[start:end]))\n",
"        if end == len(toks):\n",
"            break\n",
"        start = end - overlap  # step back so neighbouring chunks share context\n",
"    return chunks\n",
"\n",
"def chunk_by_sentences(text, model=\"gpt-4o-mini\", max_tokens=300):\n",
"    \"\"\"Greedily pack whole sentences into chunks of <= max_tokens tokens.\n",
"\n",
"    Sentences are never split: a single sentence longer than max_tokens\n",
"    becomes its own (oversized) chunk.\n",
"    \"\"\"\n",
"    enc = tiktoken.encoding_for_model(model)\n",
"    # Split after sentence-ending punctuation followed by whitespace\n",
"    sentences = re.split(r\"(?<=[.!?])\\s+\", text)\n",
"    chunks, current = [], \"\"\n",
"    for s in sentences:\n",
"        candidate = (current + \" \" + s).strip() if current else s\n",
"        if len(enc.encode(candidate)) <= max_tokens:\n",
"            current = candidate\n",
"        else:\n",
"            if current:\n",
"                chunks.append(current)\n",
"            current = s\n",
"    if current:\n",
"        chunks.append(current)\n",
"    return chunks\n",
"\n",
"# Try with a long text\n",
"long_text = (\n",
"    \"Artificial Intelligence (AI) has become one of the most transformative technologies of the 21st century. \"\n",
"    \"It enables machines to perform tasks that typically require human intelligence. \"\n",
"    \"Machine learning, a subset of AI, allows systems to learn from data. \"\n",
"    \"Deep learning uses neural networks with multiple layers. \"\n",
"    \"AI powers recommendations, autonomous vehicles, and medical diagnostics. \"\n",
") * 10\n",
"\n",
"print(\"Token-based chunks:\")\n",
"for i, ch in enumerate(chunk_by_tokens(long_text, max_tokens=120)):\n",
"    print(f\" Chunk {i+1}: {len(ch)} chars\")\n",
"\n",
"print(\"\\nSentence-based chunks:\")\n",
"for i, ch in enumerate(chunk_by_sentences(long_text, max_tokens=120)):\n",
"    print(f\" Chunk {i+1}: {len(ch)} chars\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}