diff --git a/week7/community_contributions/price_prediction_with_RAG/new_training_with_rag (1).py b/week7/community_contributions/price_prediction_with_RAG/new_training_with_rag (1).py
new file mode 100644
index 0000000..49feb73
--- /dev/null
+++ b/week7/community_contributions/price_prediction_with_RAG/new_training_with_rag (1).py
@@ -0,0 +1,262 @@
# -*- coding: utf-8 -*-
"""new_training_with_RAG.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1gi8FPI1dtnxBNTf86JdmXQ0BYqnKz7LS

# Predict Product Prices
"""

!nvidia-smi

!pip install -q datasets requests torch peft bitsandbytes transformers trl accelerate sentencepiece matplotlib langchain-community chromadb

import os
import re
import math
from datetime import datetime

from tqdm import tqdm
from google.colab import userdata
from huggingface_hub import login
import torch
import transformers
from transformers import (
    AutoModelForCausalLM, AutoTokenizer, TrainingArguments,
    set_seed, BitsAndBytesConfig, GenerationConfig)

from datasets import load_dataset
from peft import LoraConfig, PeftModel
from trl import SFTTrainer, SFTConfig
import matplotlib.pyplot as plt

# LangChain & RAG imports
# (these classes now live in the langchain_community / langchain_core packages,
# which is what the pip cell above actually installs)

from sentence_transformers import SentenceTransformer
from langchain_core.documents import Document
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
import chromadb

# Commented out IPython magic to ensure Python compatibility.
# Constants

BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"
# BASE_MODEL = 'mistralai/Mistral-7B-Instruct-v0.1'
PROJECT_NAME = "pricer-optim"
HF_USER = "Adriana213"

# Data

DATASET_NAME = f"{HF_USER}/pricer-data"
MAX_SEQUENCE_LENGTH = 182

RUN_NAME = f"{PROJECT_NAME}-{datetime.now():%Y%m%d_%H%M%S}"
HUB_MODEL_NAME = f"{HF_USER}/{RUN_NAME}"

# Hyperparameters for QLoRA

LORA_R = 8
LORA_ALPHA = 32
TARGET_MODULES = ["q_proj", "v_proj", "k_proj", "o_proj"]
LORA_DROPOUT = 0.10
QUANT_4_BIT = True

# Hyperparameters for Training

EPOCHS = 2
BATCH_SIZE = 16
GRADIENT_ACCUMULATION_STEPS = 1
LEARNING_RATE = 2e-4
LR_SCHEDULER_TYPE = 'cosine'
WARMUP_RATIO = 0.05
OPTIMIZER = "paged_adamw_32bit"
STEPS = 50
SAVE_STEPS = 200
EVAL_STEPS = 200  # kept for potential future use

# %matplotlib inline

HUB_MODEL_NAME

"""### Log in to HuggingFace & get Data"""

hf_token = userdata.get('HF_TOKEN')
login(hf_token, add_to_git_credential=True)

torch.cuda.empty_cache()

dataset = load_dataset(DATASET_NAME)
train = dataset['train']
test = dataset['test']

"""## Now load the Tokenizer and Model

The model is "quantized" - we reduce the weight precision to 4 bits so the 8B-parameter base model fits in a single GPU's memory.
"""
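# A rough back-of-the-envelope check (added for illustration, not from the
# original notebook) of what 4-bit quantization buys us: weights alone for an
# 8B-parameter model need ~16 GB in fp16 but only ~4 GB in 4-bit NF4, plus a
# small overhead for the quantization constants.

n_params = 8e9
print(f"fp16 weights:  ~{n_params * 2.0 / 1e9:.0f} GB")
print(f"4-bit weights: ~{n_params * 0.5 / 1e9:.0f} GB")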
+""" + +# Pick the right quantization + +if QUANT_4_BIT: + quant_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + bnb_4bit_compute_dtype=torch.bfloat16, + bnb_4bit_quant_type="nf4" + ) +else: + quant_config = BitsAndBytesConfig( + load_in_8bit=True, + bnb_8bit_compute_dtype=torch.bfloat16 + ) + +# Load the Tokenizer and the Model + +tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True) +tokenizer.pad_token = tokenizer.eos_token +tokenizer.padding_side = "right" + +base_model = AutoModelForCausalLM.from_pretrained( + BASE_MODEL, + quantization_config=quant_config, + device_map="auto", +) + +base_model.generation_config.pad_token_id = tokenizer.pad_token_id + +print(f"Memory footprint: {base_model.get_memory_footprint() / 1e6:.1f} MB") + +"""# Data Collator + +""" + +from trl import DataCollatorForCompletionOnlyLM + +response_template = "Price is $" +collator = DataCollatorForCompletionOnlyLM(response_template, + tokenizer=tokenizer) + +"""# Set up the configuration for Training""" + +# LoRA Config + +lora_parameters = LoraConfig( + lora_alpha = LORA_ALPHA, + lora_dropout = LORA_DROPOUT, + r = LORA_R, + bias = "none", + task_type = "CAUSAL_LM", + target_modules = TARGET_MODULES, +) + +# Training Config + +train_parameters = SFTConfig( + output_dir = RUN_NAME, + num_train_epochs = EPOCHS, + per_device_train_batch_size = BATCH_SIZE, + per_device_eval_batch_size = 4, + eval_strategy = "no", + eval_steps = EVAL_STEPS, + gradient_accumulation_steps = GRADIENT_ACCUMULATION_STEPS, + optim = OPTIMIZER, + save_steps = SAVE_STEPS, + save_total_limit = 5, + logging_steps = 50, + learning_rate = LEARNING_RATE, + weight_decay = 0.01, + fp16=False, + bf16=True, + max_grad_norm=0.3, + max_steps=-1, + warmup_ratio = WARMUP_RATIO, + group_by_length=True, + lr_scheduler_type = LR_SCHEDULER_TYPE, + run_name = RUN_NAME, + max_seq_length = MAX_SEQUENCE_LENGTH, + dataset_text_field = "text", + save_strategy = "steps", + hub_strategy = "every_save", + push_to_hub = True, + hub_model_id = HUB_MODEL_NAME, + hub_private_repo = True, + report_to = 'none', +) + + +fine_tuning = SFTTrainer( + model = base_model, + train_dataset = train, + eval_dataset=test, + peft_config = lora_parameters, + args = train_parameters, + data_collator = collator, + ) + +"""## Fine Tuning""" + +fine_tuning.train() + +fine_tuning.model.push_to_hub(RUN_NAME, private=True) +print(f"Saved to the hub: {RUN_NAME}") + +"""# Implement RAG""" + +HF_USER = "Adriana213" +RUN_NAME = "pricer-optim-20250514_061529" +fine_tuned_model = PeftModel.from_pretrained(base_model, f"{HF_USER}/{RUN_NAME}") +print(f"✅ Loaded fine-tuned adapter: {HF_USER}/{RUN_NAME}") + +base_model = fine_tuned_model + +"""## Build Chroma index""" + +docs = [ + Document(page_content=text, metadata = {'price': price}) + for text, price in zip(train['text'], train['price']) +] + +# Create embeddings & persist Chroma index + +embedding = HuggingFaceEmbeddings(model_name = 'all-MiniLM-L6-v2') +chroma = Chroma.from_documents( + documents = docs, + embedding = embedding, + persist_directory = 'chroma_train_index' +) + +chroma.persist() +print('Chroma index built and persisted.') + +"""## RAG Prediction Function""" + +generation_config = GenerationConfig( + max_new_token = 10, + do_sample = False, + temperature = 0.1 +) + +def predict_price_rag(desc: str, k: int = 3) -> float: + hits = chroma.similarity_search(desc, k = k) + shot_strs = [ + f'Description: {doc.page_content}\nPrice is ${doc.metadata["price"]}' + for doc in hits + 
!zip -r chroma_index.zip chroma_train_index

from google.colab import files
files.download("chroma_index.zip")
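# Optional sanity check (an added sketch, not part of the original notebook):
# reload the index from disk and confirm a nearest-neighbour query returns a
# document before carrying the zip over to the evaluation notebook. The query
# string here is just a placeholder.

reloaded = Chroma(persist_directory='chroma_train_index', embedding_function=embedding)
print(reloaded.similarity_search('example product description', k=1))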
diff --git a/week7/community_contributions/price_prediction_with_RAG/testing_fine_tuned_model_with_rag.py b/week7/community_contributions/price_prediction_with_RAG/testing_fine_tuned_model_with_rag.py
new file mode 100644
index 0000000..22c775d
--- /dev/null
+++ b/week7/community_contributions/price_prediction_with_RAG/testing_fine_tuned_model_with_rag.py
@@ -0,0 +1,258 @@
# -*- coding: utf-8 -*-
"""Testing Fine-tuned model with RAG

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1J8P8cwqwhBo3CNIZaEFe6BMRw0WUfEqy

## Predict Product Prices

### And now, to evaluate our fine-tuned open source model
"""

!pip install -q datasets peft requests torch bitsandbytes transformers trl accelerate sentencepiece matplotlib langchain-community chromadb

import os
import re
import math

from google.colab import userdata
from huggingface_hub import login

import torch
import torch.nn.functional as F

from transformers import (
    AutoModelForCausalLM, AutoTokenizer,
    BitsAndBytesConfig, GenerationConfig)

from datasets import load_dataset
from peft import PeftModel

from sentence_transformers import SentenceTransformer
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

import matplotlib.pyplot as plt

# Commented out IPython magic to ensure Python compatibility.
# Constants

# Must match the base model the adapter was trained on
BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"
PROJECT_NAME = "pricer"
HF_USER = "Adriana213"

RUN_NAME = "optim-20250514_061529"
PROJECT_RUN_NAME = f"{PROJECT_NAME}-{RUN_NAME}"

FINETUNED_MODEL = f"{HF_USER}/{PROJECT_RUN_NAME}"

# Data

DATASET_NAME = f"{HF_USER}/pricer-data"

# Hyperparameters for QLoRA

QUANT_4_BIT = True

# %matplotlib inline

# Used for writing to output in color

GREEN = "\033[92m"
YELLOW = "\033[93m"
RED = "\033[91m"
RESET = "\033[0m"
COLOR_MAP = {"red": RED, "orange": YELLOW, "green": GREEN}

"""### Log in to HuggingFace"""

hf_token = userdata.get('HF_TOKEN')
login(hf_token, add_to_git_credential=True)

dataset = load_dataset(DATASET_NAME)
train = dataset['train']
test = dataset['test']

test[0]

"""## Now load the Tokenizer and Model"""

if QUANT_4_BIT:
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4"
    )
else:
    quant_config = BitsAndBytesConfig(
        load_in_8bit=True,
        bnb_8bit_compute_dtype=torch.bfloat16
    )

# Load the Tokenizer and the Model

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=quant_config,
    device_map="auto",
)
base_model.generation_config.pad_token_id = tokenizer.pad_token_id

# Load the fine-tuned model with PEFT

fine_tuned_model = PeftModel.from_pretrained(base_model, FINETUNED_MODEL)

print(f"Memory footprint: {fine_tuned_model.get_memory_footprint() / 1e6:.1f} MB")

fine_tuned_model

"""# Evaluation"""

def extract_price(s):
    if "Price is $" in s:
        contents = s.split("Price is $")[1]
        contents = contents.replace(',', '')
        match = re.search(r"[-+]?\d*\.\d+|\d+", contents)
        return float(match.group()) if match else 0
    return 0

extract_price("Price is $a fabulous 899.99 or so")

# Original prediction function takes the most likely next token

def model_predict(prompt):
    inputs = tokenizer.encode(prompt, return_tensors="pt").to("cuda")
    attention_mask = torch.ones(inputs.shape, device="cuda")
    outputs = fine_tuned_model.generate(inputs, attention_mask=attention_mask, max_new_tokens=3, num_return_sequences=1)
    response = tokenizer.decode(outputs[0])
    return extract_price(response)

# An alternative, kept for reference: average the top-K next-token candidates,
# weighted by their probabilities, instead of taking only the most likely token.

# top_K = 3

# def improved_model_predict(prompt, device="cuda"):
#     set_seed(42)
#     inputs = tokenizer.encode(prompt, return_tensors="pt").to(device)
#     attention_mask = torch.ones(inputs.shape, device=device)

#     with torch.no_grad():
#         outputs = fine_tuned_model(inputs, attention_mask=attention_mask)
#         next_token_logits = outputs.logits[:, -1, :].to('cpu')

#     next_token_probs = F.softmax(next_token_logits, dim=-1)
#     top_prob, top_token_id = next_token_probs.topk(top_K)
#     prices, weights = [], []
#     for i in range(top_K):
#         predicted_token = tokenizer.decode(top_token_id[0][i])
#         probability = top_prob[0][i]
#         try:
#             result = float(predicted_token)
#         except ValueError:
#             result = 0.0
#         if result > 0:
#             prices.append(result)
#             weights.append(probability)
#     if not prices:
#         return 0.0
#     total = sum(weights)
#     weighted_prices = [price * weight / total for price, weight in zip(prices, weights)]
#     return sum(weighted_prices).item()
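# NOTE (assumed workflow, not spelled out in the original notebook): the Chroma
# index loaded below is the one built and zipped in the training notebook. In a
# fresh Colab runtime, upload and unpack it first, e.g.:
#
#   from google.colab import files
#   files.upload()                # choose chroma_index.zip
#   !unzip -o chroma_index.zip    # recreates chroma_train_index/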
"chroma_train_index", + embedding_function = embedder +) + +gen_config = GenerationConfig(max_new_tokens=10, do_sample=False) + +def predict_price_rag(desc: str, k: int = 3) -> float: + docs = chroma.similarity_search(desc, k=k) + shots = "\n\n".join(f"Description: {d.page_content}\nPrice is ${d.metadata['price']}" + for d in docs) + prompt = f"{shots}\n\nDescription: {desc}\nPrice is $" + inp = tokenizer(prompt, return_tensors="pt").to(fine_tuned_model.device) + out = fine_tuned_model.generate(**inp, generation_config=gen_config) + txt = tokenizer.decode(out[0, inp["input_ids"].shape[-1]:], skip_special_tokens=True).strip() + return float(re.findall(r"\d+\.?\d+", txt)[0]) + +class Tester: + + def __init__(self, predictor, data, title=None, size=250): + self.predictor = predictor + self.data = data + self.title = title or predictor.__name__.replace("_", " ").title() + self.size = size + self.guesses = [] + self.truths = [] + self.errors = [] + self.sles = [] + self.colors = [] + + def color_for(self, error, truth): + if error<40 or error/truth < 0.2: + return "green" + elif error<80 or error/truth < 0.4: + return "orange" + else: + return "red" + + def run_datapoint(self, i): + datapoint = self.data[i] + guess = self.predictor(datapoint["text"]) + truth = datapoint["price"] + error = abs(guess - truth) + log_error = math.log(truth+1) - math.log(guess+1) + sle = log_error ** 2 + color = self.color_for(error, truth) + title = datapoint["text"].split("\n\n")[1][:20] + "..." + self.guesses.append(guess) + self.truths.append(truth) + self.errors.append(error) + self.sles.append(sle) + self.colors.append(color) + print(f"{COLOR_MAP[color]}{i+1}: Guess: ${guess:,.2f} Truth: ${truth:,.2f} Error: ${error:,.2f} SLE: {sle:,.2f} Item: {title}{RESET}") + + def chart(self, title): + max_error = max(self.errors) + plt.figure(figsize=(12, 8)) + max_val = max(max(self.truths), max(self.guesses)) + plt.plot([0, max_val], [0, max_val], color='deepskyblue', lw=2, alpha=0.6) + plt.scatter(self.truths, self.guesses, s=3, c=self.colors) + plt.xlabel('Ground Truth') + plt.ylabel('Model Estimate') + plt.xlim(0, max_val) + plt.ylim(0, max_val) + plt.title(title) + plt.show() + + def report(self): + average_error = sum(self.errors) / self.size + rmsle = math.sqrt(sum(self.sles) / self.size) + hits = sum(1 for color in self.colors if color=="green") + title = f"{self.title} Error=${average_error:,.2f} RMSLE={rmsle:,.2f} Hits={hits/self.size*100:.1f}%" + self.chart(title) + + def run(self): + self.error = 0 + for i in range(self.size): + self.run_datapoint(i) + self.report() + + @classmethod + def test(cls, function, data): + cls(function, data).run() + +Tester.test(predict_price_rag, test) \ No newline at end of file