From bfc20be33cd9b425b82986d419d5f6c00cd221a2 Mon Sep 17 00:00:00 2001
From: Adriana394 <158718290+Adriana394@users.noreply.github.com>
Date: Tue, 3 Jun 2025 15:33:14 +0200
Subject: [PATCH] Create new_training_with_rag (1).py

---
 .../new_training_with_rag (1).py              | 262 ++++++++++++++++++
 1 file changed, 262 insertions(+)
 create mode 100644 week7/community_contributions/price_prediction_with_RAG/new_training_with_rag (1).py

diff --git a/week7/community_contributions/price_prediction_with_RAG/new_training_with_rag (1).py b/week7/community_contributions/price_prediction_with_RAG/new_training_with_rag (1).py
new file mode 100644
index 0000000..49feb73
--- /dev/null
+++ b/week7/community_contributions/price_prediction_with_RAG/new_training_with_rag (1).py
@@ -0,0 +1,262 @@
+# -*- coding: utf-8 -*-
+"""new_training_with_RAG.ipynb
+
+Automatically generated by Colab.
+
+Original file is located at
+    https://colab.research.google.com/drive/1gi8FPI1dtnxBNTf86JdmXQ0BYqnKz7LS
+
+# Predict Product Prices
+"""
+
+!nvidia-smi
+
+!pip install -q datasets requests torch peft bitsandbytes transformers trl accelerate sentencepiece matplotlib langchain-community chromadb
+
+import os
+import re
+import math
+from tqdm import tqdm
+from google.colab import userdata
+from huggingface_hub import login
+import torch
+import transformers
+from transformers import (
+    AutoModelForCausalLM, AutoTokenizer, TrainingArguments,
+    set_seed, BitsAndBytesConfig, GenerationConfig)
+
+from datasets import load_dataset
+from peft import LoraConfig, PeftModel
+from trl import SFTTrainer, SFTConfig
+from datetime import datetime
+import matplotlib.pyplot as plt
+
+# LangChain & RAG imports (langchain_community paths avoid the deprecation
+# warnings raised by the old langchain.vectorstores / langchain.embeddings)
+
+from sentence_transformers import SentenceTransformer
+from langchain.schema import Document
+from langchain_community.vectorstores import Chroma
+import chromadb
+from langchain_community.embeddings import HuggingFaceEmbeddings
+
+# Commented out IPython magic to ensure Python compatibility.
+# Constants
+
+BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"
+# BASE_MODEL = 'mistralai/Mistral-7B-Instruct-v0.1'
+PROJECT_NAME = "pricer-optim"
+HF_USER = "Adriana213"
+
+# Data
+
+DATASET_NAME = f"{HF_USER}/pricer-data"
+MAX_SEQUENCE_LENGTH = 182
+
+RUN_NAME = f"{PROJECT_NAME}-{datetime.now():%Y%m%d_%H%M%S}"
+HUB_MODEL_NAME = f"{HF_USER}/{RUN_NAME}"
+
+# Hyperparameters for QLoRA
+
+LORA_R = 8
+LORA_ALPHA = 32
+TARGET_MODULES = ["q_proj", "v_proj", "k_proj", "o_proj"]
+LORA_DROPOUT = 0.10
+QUANT_4_BIT = True
+
+# Hyperparameters for Training
+
+EPOCHS = 2
+BATCH_SIZE = 16
+GRADIENT_ACCUMULATION_STEPS = 1
+LEARNING_RATE = 2e-4
+LR_SCHEDULER_TYPE = 'cosine'
+WARMUP_RATIO = 0.05
+OPTIMIZER = "paged_adamw_32bit"
+STEPS = 50
+SAVE_STEPS = 200
+EVAL_STEPS = 200  # kept for potential future use
+
+# %matplotlib inline
+
+HUB_MODEL_NAME
+
+"""### Log in to HuggingFace & get Data"""
+
+hf_token = userdata.get('HF_TOKEN')
+login(hf_token, add_to_git_credential=True)
+
+torch.cuda.empty_cache()
+
+dataset = load_dataset(DATASET_NAME)
+train = dataset['train']
+test = dataset['test']
+
+"""## Now load the Tokenizer and Model
+
+The model is "quantized" - we are reducing the precision to 4 bits.
+"""
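+
+# With QUANT_4_BIT=True the weights are stored in 4-bit NF4 (a data type
+# tuned for normally-distributed weights) while matmuls run in bfloat16,
+# and double quantization compresses the quantization constants themselves.
+# This cuts weight memory to roughly a quarter of fp16, which is what makes
+# QLoRA fine-tuning of an 8B-parameter model fit on a single Colab GPU.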
+""" + +# Pick the right quantization + +if QUANT_4_BIT: + quant_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + bnb_4bit_compute_dtype=torch.bfloat16, + bnb_4bit_quant_type="nf4" + ) +else: + quant_config = BitsAndBytesConfig( + load_in_8bit=True, + bnb_8bit_compute_dtype=torch.bfloat16 + ) + +# Load the Tokenizer and the Model + +tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True) +tokenizer.pad_token = tokenizer.eos_token +tokenizer.padding_side = "right" + +base_model = AutoModelForCausalLM.from_pretrained( + BASE_MODEL, + quantization_config=quant_config, + device_map="auto", +) + +base_model.generation_config.pad_token_id = tokenizer.pad_token_id + +print(f"Memory footprint: {base_model.get_memory_footprint() / 1e6:.1f} MB") + +"""# Data Collator + +""" + +from trl import DataCollatorForCompletionOnlyLM + +response_template = "Price is $" +collator = DataCollatorForCompletionOnlyLM(response_template, + tokenizer=tokenizer) + +"""# Set up the configuration for Training""" + +# LoRA Config + +lora_parameters = LoraConfig( + lora_alpha = LORA_ALPHA, + lora_dropout = LORA_DROPOUT, + r = LORA_R, + bias = "none", + task_type = "CAUSAL_LM", + target_modules = TARGET_MODULES, +) + +# Training Config + +train_parameters = SFTConfig( + output_dir = RUN_NAME, + num_train_epochs = EPOCHS, + per_device_train_batch_size = BATCH_SIZE, + per_device_eval_batch_size = 4, + eval_strategy = "no", + eval_steps = EVAL_STEPS, + gradient_accumulation_steps = GRADIENT_ACCUMULATION_STEPS, + optim = OPTIMIZER, + save_steps = SAVE_STEPS, + save_total_limit = 5, + logging_steps = 50, + learning_rate = LEARNING_RATE, + weight_decay = 0.01, + fp16=False, + bf16=True, + max_grad_norm=0.3, + max_steps=-1, + warmup_ratio = WARMUP_RATIO, + group_by_length=True, + lr_scheduler_type = LR_SCHEDULER_TYPE, + run_name = RUN_NAME, + max_seq_length = MAX_SEQUENCE_LENGTH, + dataset_text_field = "text", + save_strategy = "steps", + hub_strategy = "every_save", + push_to_hub = True, + hub_model_id = HUB_MODEL_NAME, + hub_private_repo = True, + report_to = 'none', +) + + +fine_tuning = SFTTrainer( + model = base_model, + train_dataset = train, + eval_dataset=test, + peft_config = lora_parameters, + args = train_parameters, + data_collator = collator, + ) + +"""## Fine Tuning""" + +fine_tuning.train() + +fine_tuning.model.push_to_hub(RUN_NAME, private=True) +print(f"Saved to the hub: {RUN_NAME}") + +"""# Implement RAG""" + +HF_USER = "Adriana213" +RUN_NAME = "pricer-optim-20250514_061529" +fine_tuned_model = PeftModel.from_pretrained(base_model, f"{HF_USER}/{RUN_NAME}") +print(f"✅ Loaded fine-tuned adapter: {HF_USER}/{RUN_NAME}") + +base_model = fine_tuned_model + +"""## Build Chroma index""" + +docs = [ + Document(page_content=text, metadata = {'price': price}) + for text, price in zip(train['text'], train['price']) +] + +# Create embeddings & persist Chroma index + +embedding = HuggingFaceEmbeddings(model_name = 'all-MiniLM-L6-v2') +chroma = Chroma.from_documents( + documents = docs, + embedding = embedding, + persist_directory = 'chroma_train_index' +) + +chroma.persist() +print('Chroma index built and persisted.') + +"""## RAG Prediction Function""" + +generation_config = GenerationConfig( + max_new_token = 10, + do_sample = False, + temperature = 0.1 +) + +def predict_price_rag(desc: str, k: int = 3) -> float: + hits = chroma.similarity_search(desc, k = k) + shot_strs = [ + f'Description: {doc.page_content}\nPrice is ${doc.metadata["price"]}' + for doc in hits + 
+def predict_price_rag(desc: str, k: int = 3) -> float:
+    hits = chroma.similarity_search(desc, k=k)
+    shot_strs = [
+        f'Description: {doc.page_content}\nPrice is ${doc.metadata["price"]}'
+        for doc in hits
+    ]
+
+    prompt = "\n\n".join(shot_strs) + f"\n\nDescription: {desc}\nPrice is $"
+
+    inputs = tokenizer(prompt, return_tensors="pt").to(base_model.device)
+    out = base_model.generate(**inputs, generation_config=generation_config)
+    # Decode only the newly generated tokens (slice off the prompt)
+    text = tokenizer.decode(
+        out[0, inputs["input_ids"].shape[-1]:],
+        skip_special_tokens=True
+    ).strip()
+    # \d+(?:\.\d+)? also matches single-digit prices, which the original
+    # pattern \d+\.?\d+ (minimum two digits) missed
+    return float(re.findall(r"\d+(?:\.\d+)?", text)[0])
+
+!zip -r chroma_index.zip chroma_train_index
+
+from google.colab import files
+files.download("chroma_index.zip")
\ No newline at end of file