# Week 1 Day 4 - Tokenization, Cost Estimation, and Chunking (Community Contribution)
#
# This notebook demonstrates:
# - Tokenization using `tiktoken`
# - Token counting per model
# - Simple cost estimation
# - Chunking long text by tokens and by sentences

# Imports and setup
import tiktoken
from openai import OpenAI
from dotenv import load_dotenv
import os

# Load environment variables before the client below is constructed.
load_dotenv(override=True)
# NOTE(review): this client is created here but is not used by any cell
# visible in this notebook.
openai = OpenAI()

print("Setup complete")

# Tokenization per model
models = ["gpt-4o-mini", "gpt-4o", "gpt-3.5-turbo"]

# Resolve the tiktoken encoding for each model; a model that cannot be
# resolved is reported and left out of `encodings` instead of stopping the cell.
encodings = {}
for model_name in models:
    try:
        encodings[model_name] = tiktoken.encoding_for_model(model_name)
        resolved_name = encodings[model_name].name
        print(f"✅ {model_name}: {resolved_name}")
    except Exception as err:
        print(f"❌ {model_name}: {err}")

text = "Hi my name is Ed and I like banoffee pie. This is a test of tokenization!"
char_count = len(text)
print(f"\nText length: {char_count} chars")

# Encode the same text with every resolved encoding so the token ids can be
# compared side by side.
for model_name, encoding in encodings.items():
    token_ids = encoding.encode(text)
    print(f"\n{model_name}: {len(token_ids)} tokens -> {token_ids}")
# Token counting and simple cost estimation

# Price in dollars per 1,000 tokens, keyed by model name and then by
# direction ("input" or "output"). estimate_cost() divides token counts by 1000
# before applying these rates.
PRICING = {
    "gpt-4o-mini": {"input": 0.00015, "output": 0.0006},
    "gpt-4o": {"input": 0.005, "output": 0.015},
    "gpt-3.5-turbo": {"input": 0.0005, "output": 0.0015},
}

# Encoding used by count_tokens() when tiktoken cannot map a model name.
FALLBACK_ENCODING = "o200k_base"


def count_tokens(text, model="gpt-4o-mini"):
    """Return the number of tokens `text` encodes to for `model`.

    Args:
        text: The string to tokenize.
        model: Model name used to look up the tiktoken encoding.

    Returns:
        The token count as an int.

    Notes:
        tiktoken raises KeyError for model names it cannot map to an encoding.
        Instead of crashing, this falls back to FALLBACK_ENCODING and emits a
        warning, so the count is then an approximation for that model.
    """
    try:
        enc = tiktoken.encoding_for_model(model)
    except KeyError:
        import warnings

        warnings.warn(
            f"No tiktoken encoding registered for model {model!r}; "
            f"falling back to {FALLBACK_ENCODING!r} (token count is approximate).",
            stacklevel=2,
        )
        enc = tiktoken.get_encoding(FALLBACK_ENCODING)
    return len(enc.encode(text))


def estimate_cost(tokens, model="gpt-4o-mini", kind="input"):
    """Estimate the cost of `tokens` tokens for `model` using PRICING.

    Args:
        tokens: Number of tokens.
        model: Key into PRICING.
        kind: Which rate to apply; the PRICING entries define "input" and "output".

    Returns:
        (tokens / 1000) * rate as a float, or 0.0 when `model` has no PRICING
        entry (best-effort: unknown models are treated as free rather than
        raising).

    Raises:
        KeyError: If `model` is priced but `kind` is not one of its rate keys.
    """
    rates = PRICING.get(model)
    if rates is None:
        return 0.0
    return (tokens / 1000) * rates[kind]


# Demo: runs when executed as a notebook cell or script, and is skipped when
# these definitions are imported from elsewhere.
if __name__ == "__main__":
    samples = [
        "Hello world!",
        "This is a longer text that will have more tokens and cost more money to process.",
    ]

    for s in samples:
        print(f"\nText: {s}")
        for m in PRICING.keys():
            t = count_tokens(s, m)
            c = estimate_cost(t, m, "input")
            print(f" {m}: {t} tokens, est input cost ${c:.6f}")
# Chunking helpers
import re


def _decode_windows(enc, token_ids, max_tokens, overlap=0):
    """Decode `token_ids` in windows of `max_tokens`, each starting `max_tokens - overlap` after the last.

    Callers must guarantee 0 <= overlap < max_tokens so the step is positive.
    """
    step = max_tokens - overlap
    windows = []
    for start in range(0, len(token_ids), step):
        windows.append(enc.decode(token_ids[start:start + max_tokens]))
        # Stop once a window reaches the end; otherwise the next iteration
        # would emit a trailing chunk made only of overlap tokens.
        if start + max_tokens >= len(token_ids):
            break
    return windows


def chunk_by_tokens(text, model="gpt-4o-mini", max_tokens=300, overlap=30):
    """Split `text` into chunks of at most `max_tokens` tokens.

    Consecutive chunks share `overlap` tokens: each chunk starts `overlap`
    tokens before the previous one ended.

    Args:
        text: The string to split.
        model: Model name used to look up the tiktoken encoding.
        max_tokens: Maximum number of tokens per chunk; must be >= 1.
        overlap: Tokens repeated between consecutive chunks; must satisfy
            0 <= overlap < max_tokens.

    Returns:
        A list of decoded chunk strings; empty when `text` encodes to no tokens.

    Raises:
        ValueError: If `max_tokens` < 1 or `overlap` is outside
            [0, max_tokens). Without this check an overlap >= max_tokens never
            advances the window (infinite loop), and a negative overlap
            silently skips tokens.
    """
    if max_tokens < 1:
        raise ValueError(f"max_tokens must be >= 1, got {max_tokens}")
    if not 0 <= overlap < max_tokens:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < max_tokens, "
            f"got overlap={overlap}, max_tokens={max_tokens}"
        )
    enc = tiktoken.encoding_for_model(model)
    return _decode_windows(enc, enc.encode(text), max_tokens, overlap)


def chunk_by_sentences(text, model="gpt-4o-mini", max_tokens=300):
    """Greedily pack whole sentences into chunks of at most `max_tokens` tokens.

    Sentences are found by splitting on whitespace that follows '.', '!' or '?'.
    Sentences are joined with a single space, and a chunk is closed as soon as
    adding the next sentence would exceed the budget.

    Args:
        text: The string to split.
        model: Model name used to look up the tiktoken encoding.
        max_tokens: Maximum number of tokens per chunk; must be >= 1.

    Returns:
        A list of chunk strings; empty for empty text.

    Raises:
        ValueError: If `max_tokens` < 1.

    Notes:
        A single sentence longer than `max_tokens` cannot be kept whole; it is
        split on token boundaries (no overlap) so the budget is still honoured.
    """
    if max_tokens < 1:
        raise ValueError(f"max_tokens must be >= 1, got {max_tokens}")
    enc = tiktoken.encoding_for_model(model)
    sentences = re.split(r"(?<=[.!?])\s+", text)
    chunks, current = [], ""
    for s in sentences:
        candidate = (current + " " + s).strip() if current else s
        # Re-encode the joined text instead of summing per-sentence counts:
        # token boundaries can shift when strings are concatenated.
        if len(enc.encode(candidate)) <= max_tokens:
            current = candidate
            continue
        if current:
            chunks.append(current)
        sentence_ids = enc.encode(s)
        if len(sentence_ids) <= max_tokens:
            current = s
        else:
            # Oversize sentence: emit full windows now and keep the remainder
            # open so following sentences can still be packed onto it.
            pieces = _decode_windows(enc, sentence_ids, max_tokens)
            chunks.extend(pieces[:-1])
            current = pieces[-1]
    if current:
        chunks.append(current)
    return chunks


# Demo: runs when executed as a notebook cell or script, and is skipped when
# these definitions are imported from elsewhere.
if __name__ == "__main__":
    # Try with a long text
    long_text = (
        "Artificial Intelligence (AI) has become one of the most transformative technologies of the 21st century. "
        "It enables machines to perform tasks that typically require human intelligence. "
        "Machine learning, a subset of AI, allows systems to learn from data. "
        "Deep learning uses neural networks with multiple layers. "
        "AI powers recommendations, autonomous vehicles, and medical diagnostics. "
    ) * 10

    print("Token-based chunks:")
    for i, ch in enumerate(chunk_by_tokens(long_text, max_tokens=120)):
        print(f" Chunk {i+1}: {len(ch)} chars")

    print("\nSentence-based chunks:")
    for i, ch in enumerate(chunk_by_sentences(long_text, max_tokens=120)):
        print(f" Chunk {i+1}: {len(ch)} chars")