{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "6afa6324",
   "metadata": {},
   "source": [
    "Website Summarizer using LangChain RecursiveUrlLoader and OpenAI GPT-4o."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cd0aa282",
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install -qU langchain-community beautifulsoup4 lxml"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ff0ba859",
   "metadata": {},
   "outputs": [],
   "source": [
    "# imports\n",
    "\n",
    "import os\n",
    "import glob\n",
    "from dotenv import load_dotenv\n",
    "import gradio as gr\n",
    "\n",
    "# imports for langchain\n",
    "\n",
    "from langchain.text_splitter import CharacterTextSplitter\n",
    "from langchain.schema import Document\n",
    "from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n",
    "from langchain_chroma import Chroma\n",
    "\n",
    "from langchain.memory import ConversationBufferMemory\n",
    "from langchain.chains import ConversationalRetrievalChain\n",
    "\n",
    "from langchain_community.document_loaders import RecursiveUrlLoader\n",
    "import re\n",
    "\n",
    "from bs4 import BeautifulSoup\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e2be45ee",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configuration: chat model, vector-store directory, and the OpenAI API key.\n",
    "MODEL = \"gpt-4o\"\n",
    "db_name = \"vector_db\"\n",
    "\n",
    "load_dotenv(override=True)\n",
    "os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2cd21d56",
   "metadata": {},
   "outputs": [],
   "source": [
    "def bs4_extractor(html: str) -> str:\n",
    "    \"\"\"Extract plain text from an HTML page, collapsing runs of blank lines.\"\"\"\n",
    "    soup = BeautifulSoup(html, \"lxml\")\n",
    "    return re.sub(r\"\\n\\n+\", \"\\n\\n\", soup.text).strip()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c07925ce",
   "metadata": {},
   "outputs": [],
   "source": [
    "def prepareLLM(website_url):\n",
    "    \"\"\"Crawl a website, embed its pages into a Chroma vector store,\n",
    "    and return a conversational RAG chain over that content.\n",
    "\n",
    "    Args:\n",
    "        website_url: Root URL to crawl recursively.\n",
    "\n",
    "    Returns:\n",
    "        A ConversationalRetrievalChain with its own conversation memory.\n",
    "    \"\"\"\n",
    "    loader = RecursiveUrlLoader(website_url, extractor=bs4_extractor)\n",
    "    docs = loader.load()\n",
    "    print(f\"Loaded {len(docs)} documents\")\n",
    "\n",
    "    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n",
    "    chunks = text_splitter.split_documents(docs)\n",
    "    print(f\"Loaded {len(chunks)} chunks\")\n",
    "\n",
    "    embeddings = OpenAIEmbeddings()\n",
    "\n",
    "    # Delete any existing collection so each website starts from a clean store\n",
    "    if os.path.exists(db_name):\n",
    "        Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()\n",
    "\n",
    "    # Create vectorstore\n",
    "    vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)\n",
    "    print(f\"Vectorstore created with {vectorstore._collection.count()} documents\")\n",
    "\n",
    "    # create a new Chat with OpenAI (model set by MODEL above)\n",
    "    llm = ChatOpenAI(temperature=0.7, model_name=MODEL)\n",
    "\n",
    "    # set up the conversation memory so follow-up questions keep context\n",
    "    memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)\n",
    "\n",
    "    # the retriever is an abstraction over the VectorStore that will be used during RAG\n",
    "    retriever = vectorstore.as_retriever()\n",
    "\n",
    "    # putting it together: set up the conversation chain with the LLM, the vector store and memory\n",
    "    conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)\n",
    "\n",
    "    return conversation_chain"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8cc26a70",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Module-level cache: the website currently loaded and its prepared chain.\n",
    "website_global = None\n",
    "conversational_chain_global = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "809e7afa",
   "metadata": {},
   "outputs": [],
   "source": [
    "def chat(website, question):\n",
    "    \"\"\"Answer a question about a website, (re)building the RAG chain only\n",
    "    when the URL changes; otherwise reuse the cached chain and its memory.\n",
    "\n",
    "    Args:\n",
    "        website: Website URL; may be left blank after the first submit.\n",
    "        question: The user's question.\n",
    "\n",
    "    Returns:\n",
    "        The chain's answer string, or a prompt to supply a URL first.\n",
    "    \"\"\"\n",
    "    global website_global\n",
    "    global conversational_chain_global\n",
    "    # Guard: without a URL and without a cached chain there is nothing to query.\n",
    "    if not website and conversational_chain_global is None:\n",
    "        return \"Please provide a website URL first.\"\n",
    "    # Only rebuild when a new, non-empty URL is given (an empty field reuses the cache).\n",
    "    if website and website_global != website:\n",
    "        conversational_chain_global = prepareLLM(website)\n",
    "        website_global = website\n",
    "    result = conversational_chain_global.invoke({\"question\": question})\n",
    "    return result['answer']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e1e9c0e9",
   "metadata": {},
   "outputs": [],
   "source": [
    "with gr.Blocks() as ui:\n",
    "    website = gr.Textbox(label=\"Website URL (Only required for the first submit)\")\n",
    "    question = gr.Textbox(label=\"Your Question\")\n",
    "    submit = gr.Button(\"Submit\")\n",
    "    answer = gr.Textbox(label=\"Response\")\n",
    "    submit.click(fn=chat, inputs=[website,question], outputs=[answer])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "80ef8c02",
   "metadata": {},
   "outputs": [],
   "source": [
    "ui.launch()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}