diff --git a/week5/community-contributions/day4_RAG_website_summarizer.ipynb b/week5/community-contributions/day4_RAG_website_summarizer.ipynb
new file mode 100644
index 0000000..0dd3902
--- /dev/null
+++ b/week5/community-contributions/day4_RAG_website_summarizer.ipynb
@@ -0,0 +1,208 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "6afa6324",
+   "metadata": {},
+   "source": [
+    "Website Summarizer using Langchain RecursiveUrlLoader and OpenAI GPT-4o."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cd0aa282",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install -qU langchain-community beautifulsoup4 lxml"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ff0ba859",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# imports\n",
+    "\n",
+    "import os\n",
+    "import glob\n",
+    "from dotenv import load_dotenv\n",
+    "import gradio as gr\n",
+    "\n",
+    "# imports for langchain\n",
+    "\n",
+    "from langchain.text_splitter import CharacterTextSplitter\n",
+    "from langchain.schema import Document\n",
+    "from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n",
+    "from langchain_chroma import Chroma\n",
+    "\n",
+    "from langchain.memory import ConversationBufferMemory\n",
+    "from langchain.chains import ConversationalRetrievalChain\n",
+    "\n",
+    "from langchain_community.document_loaders import RecursiveUrlLoader\n",
+    "import re\n",
+    "\n",
+    "from bs4 import BeautifulSoup\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e2be45ee",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "MODEL = \"gpt-4o\"\n",
+    "db_name = \"vector_db\"\n",
+    "\n",
+    "\n",
+    "load_dotenv(override=True)\n",
+    "os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2cd21d56",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def bs4_extractor(html: str) -> str:\n",
+    "    \"\"\"Strip HTML down to plain text and collapse runs of blank lines.\"\"\"\n",
+    "    soup = BeautifulSoup(html, \"lxml\")\n",
+    "    return re.sub(r\"\\n\\n+\", \"\\n\\n\", soup.text).strip()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c07925ce",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def prepareLLM(website_url):\n",
+    "    \"\"\"Crawl website_url, embed its pages into a fresh Chroma vector store,\n",
+    "    and return a ConversationalRetrievalChain ready for Q&A over the site.\"\"\"\n",
+    "    loader = RecursiveUrlLoader(website_url, extractor=bs4_extractor)\n",
+    "    docs = loader.load()\n",
+    "    print(f\"Loaded {len(docs)} documents\")\n",
+    "    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n",
+    "    chunks = text_splitter.split_documents(docs)\n",
+    "    print(f\"Loaded {len(chunks)} chunks\")\n",
+    "\n",
+    "    embeddings = OpenAIEmbeddings()\n",
+    "\n",
+    "    # Delete the collection if it already exists so stale pages are not retrieved\n",
+    "    if os.path.exists(db_name):\n",
+    "        Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()\n",
+    "\n",
+    "    # Create vectorstore\n",
+    "    vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)\n",
+    "    print(f\"Vectorstore created with {vectorstore._collection.count()} documents\")\n",
+    "\n",
+    "    # create a new Chat with OpenAI\n",
+    "    llm = ChatOpenAI(temperature=0.7, model_name=MODEL)\n",
+    "\n",
+    "    # set up the conversation memory for the chat\n",
+    "    memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)\n",
+    "\n",
+    "    # the retriever is an abstraction over the VectorStore that will be used during RAG\n",
+    "    retriever = vectorstore.as_retriever()\n",
+    "\n",
+    "    # putting it together: set up the conversation chain with the GPT-4o LLM, the vector store and memory\n",
+    "    conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)\n",
+    "\n",
+    "    return conversation_chain"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8cc26a70",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Cache the last URL and its chain so repeat questions skip re-crawling\n",
+    "website_global = None\n",
+    "conversational_chain_global = None"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "809e7afa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def chat(website, question):\n",
+    "    \"\"\"Answer a question about the cached website, (re)building the RAG\n",
+    "    chain only when a new, non-empty URL is submitted.\"\"\"\n",
+    "    global website_global\n",
+    "    global conversational_chain_global\n",
+    "    # Rebuild only for a genuinely new URL; an empty field reuses the cached chain\n",
+    "    if website and website_global != website:\n",
+    "        conversational_chain_global = prepareLLM(website)\n",
+    "        website_global = website\n",
+    "    if conversational_chain_global is None:\n",
+    "        return \"Please enter a website URL first.\"\n",
+    "    result = conversational_chain_global.invoke({\"question\": question})\n",
+    "    return result['answer']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e1e9c0e9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with gr.Blocks() as ui:\n",
+    "    website = gr.Textbox(label=\"Website URL (Only required for the first submit)\")\n",
+    "    question = gr.Textbox(label=\"Your Question\")\n",
+    "    submit = gr.Button(\"Submit\")\n",
+    "    answer = gr.Textbox(label=\"Response\")\n",
+    "    submit.click(fn=chat, inputs=[website,question], outputs=[answer])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "80ef8c02",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ui.launch()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fef26a4b",
+   "metadata": {},
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}