Files
LLM_Engineering_OLD/week1/community-contributions/week1-assignment-Joshua/day4_tokenization_cost_chunking.ipynb

241 lines
8.4 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Week 1 Day 4 - Tokenization, Cost Estimation, and Chunking (Community Contribution)\n",
"\n",
"This notebook demonstrates:\n",
"- Tokenization using `tiktoken`\n",
"- Token counting per model\n",
"- Simple cost estimation\n",
"- Chunking long text by tokens and by sentences\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Setup complete\n"
]
}
],
"source": [
"# Imports and setup\n",
"import tiktoken\n",
"from openai import OpenAI\n",
"from dotenv import load_dotenv\n",
"import os\n",
"\n",
"load_dotenv(override=True)\n",
"\n",
"# Fail fast with a clear message if the key was not loaded from .env --\n",
"# otherwise later API calls fail with a confusing auth error.\n",
"if not os.getenv(\"OPENAI_API_KEY\"):\n",
"    print(\"Warning: OPENAI_API_KEY not found - OpenAI API calls will fail\")\n",
"\n",
"openai = OpenAI()\n",
"\n",
"print(\"Setup complete\")\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"✅ gpt-4o-mini: o200k_base\n",
"✅ gpt-4o: o200k_base\n",
"✅ gpt-3.5-turbo: cl100k_base\n",
"\n",
"Text length: 73 chars\n",
"\n",
"gpt-4o-mini: 20 tokens -> [12194, 922, 1308, 382, 6117, 326, 357, 1299, 9171, 26458, 5148, 13, 1328, 382, 261, 1746, 328, 6602, 2860, 0]\n",
"\n",
"gpt-4o: 20 tokens -> [12194, 922, 1308, 382, 6117, 326, 357, 1299, 9171, 26458, 5148, 13, 1328, 382, 261, 1746, 328, 6602, 2860, 0]\n",
"\n",
"gpt-3.5-turbo: 20 tokens -> [13347, 856, 836, 374, 3279, 323, 358, 1093, 9120, 21869, 4447, 13, 1115, 374, 264, 1296, 315, 4037, 2065, 0]\n"
]
}
],
"source": [
"# Tokenization per model\n",
"models = [\"gpt-4o-mini\", \"gpt-4o\", \"gpt-3.5-turbo\"]\n",
"\n",
"# Resolve each model's tokenizer; skip any model tiktoken doesn't recognise\n",
"encodings = {}\n",
"for m in models:\n",
"    try:\n",
"        encodings[m] = tiktoken.encoding_for_model(m)\n",
"    except Exception as e:\n",
"        print(f\"❌ {m}: {e}\")\n",
"    else:\n",
"        print(f\"✅ {m}: {encodings[m].name}\")\n",
"\n",
"text = \"Hi my name is Ed and I like banoffee pie. This is a test of tokenization!\"\n",
"print(f\"\\nText length: {len(text)} chars\")\n",
"\n",
"# Encode the same sentence with every tokenizer and compare token ids/counts\n",
"for m, enc in encodings.items():\n",
"    toks = enc.encode(text)\n",
"    print(f\"\\n{m}: {len(toks)} tokens -> {toks}\")\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Text: Hello world!\n",
" gpt-4o-mini: 3 tokens, est input cost $0.000000\n",
" gpt-4o: 3 tokens, est input cost $0.000015\n",
" gpt-3.5-turbo: 3 tokens, est input cost $0.000002\n",
"\n",
"Text: This is a longer text that will have more tokens and cost more money to process.\n",
" gpt-4o-mini: 17 tokens, est input cost $0.000003\n",
" gpt-4o: 17 tokens, est input cost $0.000085\n",
" gpt-3.5-turbo: 17 tokens, est input cost $0.000009\n"
]
}
],
"source": [
"# Token counting and simple cost estimation\n",
"from functools import lru_cache\n",
"\n",
"# Prices in USD per 1K tokens. These go stale -- check the OpenAI pricing\n",
"# page before trusting any estimate.\n",
"PRICING = {\n",
"    \"gpt-4o-mini\": {\"input\": 0.00015, \"output\": 0.0006},\n",
"    \"gpt-4o\": {\"input\": 0.005, \"output\": 0.015},\n",
"    \"gpt-3.5-turbo\": {\"input\": 0.0005, \"output\": 0.0015},\n",
"}\n",
"\n",
"@lru_cache(maxsize=None)\n",
"def _encoding_for(model):\n",
"    \"\"\"Cache encoders: tiktoken.encoding_for_model is not free per call.\"\"\"\n",
"    return tiktoken.encoding_for_model(model)\n",
"\n",
"def count_tokens(text, model=\"gpt-4o-mini\"):\n",
"    \"\"\"Return the number of tokens `text` encodes to for `model`.\"\"\"\n",
"    return len(_encoding_for(model).encode(text))\n",
"\n",
"def estimate_cost(tokens, model=\"gpt-4o-mini\", kind=\"input\"):\n",
"    \"\"\"Estimate the USD cost of `tokens` tokens.\n",
"\n",
"    kind is 'input' or 'output'. Returns 0.0 for an unknown model or\n",
"    kind instead of raising (unknown kind previously raised KeyError).\n",
"    \"\"\"\n",
"    rate = PRICING.get(model, {}).get(kind, 0.0)\n",
"    return (tokens / 1000) * rate\n",
"\n",
"samples = [\n",
"    \"Hello world!\",\n",
"    \"This is a longer text that will have more tokens and cost more money to process.\",\n",
"]\n",
"\n",
"for s in samples:\n",
"    print(f\"\\nText: {s}\")\n",
"    for m in PRICING.keys():\n",
"        t = count_tokens(s, m)\n",
"        c = estimate_cost(t, m, \"input\")\n",
"        print(f\" {m}: {t} tokens, est input cost ${c:.6f}\")\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Token-based chunks:\n",
" Chunk 1: 677 chars\n",
" Chunk 2: 690 chars\n",
" Chunk 3: 700 chars\n",
" Chunk 4: 670 chars\n",
" Chunk 5: 688 chars\n",
" Chunk 6: 711 chars\n",
" Chunk 7: 670 chars\n",
" Chunk 8: 238 chars\n",
"\n",
"Sentence-based chunks:\n",
" Chunk 1: 637 chars\n",
" Chunk 2: 698 chars\n",
" Chunk 3: 582 chars\n",
" Chunk 4: 637 chars\n",
" Chunk 5: 698 chars\n",
" Chunk 6: 582 chars\n"
]
}
],
"source": [
"# Chunking helpers\n",
"import re\n",
"\n",
"def chunk_by_tokens(text, model=\"gpt-4o-mini\", max_tokens=300, overlap=30):\n",
"    \"\"\"Split `text` into chunks of at most `max_tokens` tokens each,\n",
"    repeating `overlap` tokens between consecutive chunks for context.\n",
"    Returns a list of decoded chunk strings ([] for empty input).\n",
"    \"\"\"\n",
"    # Guard: if overlap >= max_tokens, `start = end - overlap` never\n",
"    # advances and the loop below runs forever. Clamp to a safe value.\n",
"    overlap = max(0, min(overlap, max_tokens - 1))\n",
"    enc = tiktoken.encoding_for_model(model)\n",
"    toks = enc.encode(text)\n",
"    chunks = []\n",
"    start = 0\n",
"    while start < len(toks):\n",
"        end = min(start + max_tokens, len(toks))\n",
"        chunks.append(enc.decode(toks[start:end]))\n",
"        if end == len(toks):\n",
"            break\n",
"        start = end - overlap  # step back so neighbouring chunks share context\n",
"    return chunks\n",
"\n",
"def chunk_by_sentences(text, model=\"gpt-4o-mini\", max_tokens=300):\n",
"    \"\"\"Greedily pack whole sentences into chunks of <= max_tokens tokens.\n",
"\n",
"    Sentences are never split: a single sentence longer than max_tokens\n",
"    becomes its own (oversized) chunk.\n",
"    \"\"\"\n",
"    enc = tiktoken.encoding_for_model(model)\n",
"    # Split after sentence-ending punctuation followed by whitespace\n",
"    sentences = re.split(r\"(?<=[.!?])\\s+\", text)\n",
"    chunks, current = [], \"\"\n",
"    for s in sentences:\n",
"        candidate = (current + \" \" + s).strip() if current else s\n",
"        if len(enc.encode(candidate)) <= max_tokens:\n",
"            current = candidate\n",
"        else:\n",
"            if current:\n",
"                chunks.append(current)\n",
"            current = s\n",
"    if current:\n",
"        chunks.append(current)\n",
"    return chunks\n",
"\n",
"# Try with a long text\n",
"long_text = (\n",
"    \"Artificial Intelligence (AI) has become one of the most transformative technologies of the 21st century. \"\n",
"    \"It enables machines to perform tasks that typically require human intelligence. \"\n",
"    \"Machine learning, a subset of AI, allows systems to learn from data. \"\n",
"    \"Deep learning uses neural networks with multiple layers. \"\n",
"    \"AI powers recommendations, autonomous vehicles, and medical diagnostics. \"\n",
") * 10\n",
"\n",
"print(\"Token-based chunks:\")\n",
"for i, ch in enumerate(chunk_by_tokens(long_text, max_tokens=120)):\n",
"    print(f\" Chunk {i+1}: {len(ch)} chars\")\n",
"\n",
"print(\"\\nSentence-based chunks:\")\n",
"for i, ch in enumerate(chunk_by_sentences(long_text, max_tokens=120)):\n",
"    print(f\" Chunk {i+1}: {len(ch)} chars\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}