Merge pull request #863 from rnik12/rnik12-week8

[Bootcamp] - Nikhil - Week 8 Exercise - Agentic RAG with Query and Context Expansion Tools
Ed Donner
2025-11-01 20:25:56 -04:00
committed by GitHub
3 changed files with 172 additions and 0 deletions


@@ -0,0 +1,49 @@
# Agentic Legal Q&A on Bare Acts (Week 8)
An **agentic RAG** demo that answers legal questions from Indian Bare Acts (IPC/BNS/Constitution).
Pipeline: **Query expansion (Modal+Qwen3) → Multi-retrieval (Chroma) → Neighbor-aware context merge → LLM answer → Self-critique → Optional second pass**.
UI: lightweight **Gradio** chat with live agent logs.
## Features
- **Modal-first expander:** `modal_expander.py` (Qwen3-4B via vLLM, GPU) with local LLM fallback.
- **Vector store:** Chroma + `all-MiniLM-L6-v2`, token-span aware chunking, ±neighbor merge (both sketched below).
- **Agentic loop:** critic validates citations and triggers follow-up retrievals if needed.
- **Config knobs:** top-k per rewrite, neighbor radius, max merged blocks, model dropdown.
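
The neighbor-aware merge is the least standard step, so here is a minimal, self-contained sketch of the idea; the function name, parameters, and defaults are illustrative, not the notebook's actual identifiers.

```python
def merge_with_neighbors(hit_ids, num_chunks, radius=1, max_blocks=4):
    """Expand retrieved chunk indices by +/- radius and merge contiguous runs into blocks."""
    expanded = set()
    for i in hit_ids:
        for j in range(max(0, i - radius), min(num_chunks, i + radius + 1)):
            expanded.add(j)
    blocks, current = [], []
    for j in sorted(expanded):
        if current and j == current[-1] + 1:
            current.append(j)
        else:
            if current:
                blocks.append(current)
            current = [j]
    if current:
        blocks.append(current)
    return blocks[:max_blocks]  # each block is a contiguous run of chunk indices to join into one context

# hits at chunks 3, 4 and 10 with radius 1 -> [[2, 3, 4, 5], [9, 10, 11]]
print(merge_with_neighbors([3, 4, 10], num_chunks=50))
```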
## Setup
```bash
python -m pip install -U openai chromadb transformers gradio python-dotenv modal
```
Create `.env` with your keys:
```bash
OPENAI_API_KEY=...
```
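
Since `python-dotenv` is among the dependencies above, loading the key typically looks like this (a sketch):

```python
import os
from dotenv import load_dotenv

load_dotenv(override=True)  # reads OPENAI_API_KEY from .env into the environment
assert os.getenv("OPENAI_API_KEY"), "OPENAI_API_KEY not set"
```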
Place Bare Acts as UTF-8 `.txt` files in:
```
knowledge_base/bare_acts/ # e.g., ipc.txt, bns.txt, coi.txt
```
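
These files are split with token-span aware chunking (see Features). A sketch of one such scheme, using the embedding model's tokenizer offsets; the function name and the `max_tokens`/`stride` values are illustrative, not necessarily what the notebook uses:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

def chunk_by_tokens(text, max_tokens=256, stride=32):
    """Split text into overlapping chunks whose boundaries align with token spans."""
    enc = tok(text, return_offsets_mapping=True, add_special_tokens=False)
    offsets = enc["offset_mapping"]  # (char_start, char_end) per token
    chunks, start = [], 0
    while start < len(offsets):
        end = min(start + max_tokens, len(offsets))
        chunks.append(text[offsets[start][0]:offsets[end - 1][1]])
        if end == len(offsets):
            break
        start = end - stride  # overlap by `stride` tokens
    return chunks
```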
## Deploy the Modal expander
Set a Modal secret named `huggingface-secret` containing `HUGGINGFACE_HUB_TOKEN`, then:
```bash
modal deploy -m modal_expander
```
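
Once deployed, the notebook calls the remote function by name. A minimal sketch of that call, assuming the app and function names in `modal_expander.py` are unchanged:

```python
import modal

expand = modal.Function.from_name("legal-query-expander-qwen3-v2", "expand")
print(expand.remote("What is the punishment for theft?", n=5))
```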
## Run the notebook app
```bash
jupyter notebook agentic_legal_qna_with_rag_on_bare_acts.ipynb
```
Run all cells; a Gradio chat appears. Tune **Top-K**, **Neighbor radius**, and **Max blocks** under *Advanced*.
## Notes
* Default OpenAI model: `gpt-4o-mini` (change via UI).
* Vector DB is persisted in `vector_db_w8`; re-run the indexing cell to rebuild after data changes.
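
If the source `.txt` files change, drop the persisted collection (or delete the `vector_db_w8` folder) before re-running the indexing cell. A sketch with `chromadb`; the collection name here is illustrative:

```python
import chromadb

client = chromadb.PersistentClient(path="vector_db_w8")
try:
    client.delete_collection("bare_acts")  # illustrative name; use the notebook's collection name
except Exception:
    pass
collection = client.get_or_create_collection("bare_acts")
```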


@@ -0,0 +1,3 @@
Download and extract the notebook and knowledge base from the Google Drive link below:
https://drive.google.com/file/d/1ogRxIhD_JEXnPtjbxWbIGGPCCsGOUqAb/view?usp=sharing


@@ -0,0 +1,120 @@
# week8/community_contributions/agentic_legal_qna_with_rag_on_bare_acts/modal_expander.py
import os, json, re
from typing import List
import modal
# minimal image: vLLM + torch + HF hub
image = (
    modal.Image.from_registry("nvidia/cuda:12.8.0-devel-ubuntu22.04", add_python="3.12")
    .entrypoint([])
    .uv_pip_install(
        "vllm==0.10.2",
        "torch==2.8.0",
        "huggingface_hub[hf_transfer]==0.35.0",
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
)
app = modal.App("legal-query-expander-qwen3-v2", image=image)
MODEL_NAME = "Qwen/Qwen3-4B-Instruct" # use instruct, defaults everywhere
_llm = None # warm-container cache

def _extract_json_array(text: str) -> List[str]:
    # Try parsing the whole response as JSON first; fall back to the first bracketed span.
    try:
        parsed = json.loads(text)
        return [x for x in parsed if isinstance(x, str)]
    except Exception:
        pass
    m = re.search(r"\[(?:.|\n|\r)*\]", text)
    if m:
        try:
            parsed = json.loads(m.group(0))
            return [x for x in parsed if isinstance(x, str)]
        except Exception:
            return []
    return []

def _sanitize_and_dedupe(items: List[str], n: int) -> List[str]:
    # Strip stray punctuation, drop empties, keep at most n case-insensitively unique phrases.
    out, seen = [], set()
    for q in items:
        q = re.sub(r"[^\w\s\-./]", "", (q or "")).strip()
        k = q.lower()
        if q and k not in seen:
            seen.add(k)
            out.append(q)
        if len(out) >= n:
            break
    return out

@app.function(
    image=image,
    gpu="L4",  # pick any available GPU (A100/H100 also fine)
    timeout=600,
    secrets=[modal.Secret.from_name("huggingface-secret")],  # set HF token here
)
def expand(question: str, n: int = 5) -> List[str]:
    """
    Return up to n short, diverse retrieval keyphrases for Bare Acts.
    Uses Qwen3-4B-Instruct with its default chat template.
    """
    global _llm
    from vllm import LLM, SamplingParams

    # ensure HF token is available to vLLM
    tok = os.environ.get("HUGGINGFACE_HUB_TOKEN") or os.environ.get("HF_TOKEN")
    if tok and not os.environ.get("HUGGINGFACE_HUB_TOKEN"):
        os.environ["HUGGINGFACE_HUB_TOKEN"] = tok

    if _llm is None:
        _llm = LLM(
            model=MODEL_NAME,
            trust_remote_code=True,
            dtype="auto",
            tensor_parallel_size=1,
        )
    user = (
        "You are a search query expander. "
        "Given a legal search query, you produce 4-5 different variants of it to search the database better; "
        "the database holds legal texts such as Bare Acts. "
        "Respond ONLY as a JSON array of strings; no prose, no section numbers.\n\n"
        f"Question:\n{question}\n\n"
        f"Return {n} distinct keyphrases (4-20 words each) that capture what to search for inside the RAG database. "
        "Return as a JSON array. No commentary."
    )
    messages = [
        {"role": "user", "content": user},
    ]
    tokenizer = _llm.get_tokenizer()
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    result = _llm.generate(
        [prompt],
        SamplingParams(
            max_tokens=256,
            temperature=0.2,
        ),
    )
    text = result[0].outputs[0].text
    arr = _sanitize_and_dedupe(_extract_json_array(text), n)
    if not arr:
        # deterministic fallback (keeps things non-empty)
        base = re.sub(r"[?]+$", "", (question or "")).strip()
        pool = [
            f"{base} section",
            f"{base} provision bare act",
            f"{base} indian penal code",
            f"{base} bharatiya nyaya sanhita",
            f"{base} punishment section keywords",
        ]
        arr = _sanitize_and_dedupe(pool, n)
    return arr
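

# Optional sketch: a local entrypoint for quick smoke tests of the deployed expander.
# Run with `modal run modal_expander.py`; the sample question below is illustrative.
@app.local_entrypoint()
def main(question: str = "What is the punishment for theft?"):
    for phrase in expand.remote(question, n=5):
        print(phrase)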