Files
LLM_Engineering_OLD/week8/community_contributions/hopeogbons/Deal Intel/train_rf.py
Hope Ogbons e6b43082db Add initial implementation of Deal Intel project
This commit introduces the foundational structure for the Deal Intel project, including:
- Environment configuration file (.env.example) for managing secrets and API keys.
- Scripts for building a ChromaDB vector store (build_vector_store.py) and training machine learning models (train_rf.py, train_ensemble.py).
- Health check functionality (health_check.py) to ensure system readiness.
- A launcher script (launcher.py) for executing various commands, including UI launch and health checks.
- Logging utilities (logging_utils.py) for consistent logging across the application.
- A README file providing an overview and setup instructions for the project.

These additions establish a comprehensive framework for an agentic deal-hunting AI system, integrating various components for data processing, model training, and user interaction.
2025-10-31 12:33:13 +01:00

51 lines
1.8 KiB
Python

#!/usr/bin/env python3
"""
Train a RandomForestRegressor on embeddings from ChromaDB, save to random_forest_model.pkl.
Logs simple holdout metrics.
"""
import argparse
import joblib
import numpy as np
import chromadb
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from logging_utils import init_logger
import config as cfg
logger = init_logger("DealIntel.TrainRF")
def main():
    """Train a RandomForestRegressor on stored embeddings and persist it.

    Loads embeddings and per-item prices from the ChromaDB collection
    configured in ``config``, trains a random forest, logs holdout
    RMSE/R2 on a fixed 80/20 split, and saves the fitted model to
    ``random_forest_model.pkl`` in the current directory.

    Raises:
        RuntimeError: if the collection contains no embeddings.
    """
    parser = argparse.ArgumentParser(description="Train Random Forest pricer")
    parser.add_argument("--max-datapoints", type=int, default=cfg.RF_MAX_DATAPOINTS)
    args = parser.parse_args()

    logger.info(f"Loading embeddings from {cfg.DB_PATH}/{cfg.COLLECTION_NAME} (limit={args.max_datapoints})")
    client = chromadb.PersistentClient(path=cfg.DB_PATH)
    collection = client.get_or_create_collection(cfg.COLLECTION_NAME)
    result = collection.get(include=['embeddings', 'metadatas'], limit=args.max_datapoints)

    # Newer chromadb versions return embeddings as a numpy array, whose
    # truth value is ambiguous (`if not array` raises ValueError) — check
    # explicitly for None/empty instead of relying on truthiness.
    embeddings = result.get("embeddings")
    if embeddings is None or len(embeddings) == 0:
        raise RuntimeError("No embeddings found — build the vector store first.")

    X = np.array(embeddings)
    # Target: price stored per item in the collection metadata.
    y = np.array([md["price"] for md in result["metadatas"]])
    logger.info(f"Training RF on {X.shape[0]} samples, {X.shape[1]} features")

    # Fixed random_state keeps the split and the forest reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    rf = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)
    rf.fit(X_train, y_train)

    preds = rf.predict(X_test)
    # mean_squared_error(squared=False) was deprecated in scikit-learn 1.4
    # and removed in 1.6; take the square root ourselves so this works on
    # both old and current sklearn versions.
    rmse = float(np.sqrt(mean_squared_error(y_test, preds)))
    r2 = r2_score(y_test, preds)
    logger.info(f"Holdout RMSE={rmse:.2f}, R2={r2:.3f}")

    joblib.dump(rf, "random_forest_model.pkl")
    logger.info("Saved model to random_forest_model.pkl")
# Script entry point: python train_rf.py [--max-datapoints N]
if __name__ == "__main__":
    main()