Files
LLM_Engineering_OLD/week8/community_contributions/hopeogbons/Deal Intel/train_rf.py
Hope Ogbons e6b43082db Add initial implementation of Deal Intel project
This commit introduces the foundational structure for the Deal Intel project, including:
- Environment configuration file (.env.example) for managing secrets and API keys.
- Scripts for building a ChromaDB vector store (build_vector_store.py) and training machine learning models (train_rf.py, train_ensemble.py).
- Health check functionality (health_check.py) to ensure system readiness.
- A launcher script (launcher.py) for executing various commands, including UI launch and health checks.
- Logging utilities (logging_utils.py) for consistent logging across the application.
- A README file providing an overview and setup instructions for the project.

These additions establish a comprehensive framework for an agentic deal-hunting AI system, integrating various components for data processing, model training, and user interaction.
2025-10-31 12:33:13 +01:00

51 lines
1.8 KiB
Python

#!/usr/bin/env python3
"""
Train a RandomForestRegressor on embeddings from ChromaDB, save to random_forest_model.pkl.
Logs simple holdout metrics.
"""
import argparse
import joblib
import numpy as np
import chromadb
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from logging_utils import init_logger
import config as cfg
logger = init_logger("DealIntel.TrainRF")
def main():
    """Train a RandomForestRegressor on stored embeddings and persist it.

    Loads embeddings and per-item prices from the ChromaDB collection
    configured in ``config``, trains a random forest, logs holdout
    RMSE/R2 on a fixed 80/20 split, and saves the fitted model to
    ``random_forest_model.pkl`` in the current directory.

    Raises:
        RuntimeError: if the collection contains no embeddings.
    """
    parser = argparse.ArgumentParser(description="Train Random Forest pricer")
    parser.add_argument("--max-datapoints", type=int, default=cfg.RF_MAX_DATAPOINTS)
    args = parser.parse_args()

    logger.info(f"Loading embeddings from {cfg.DB_PATH}/{cfg.COLLECTION_NAME} (limit={args.max_datapoints})")
    client = chromadb.PersistentClient(path=cfg.DB_PATH)
    collection = client.get_or_create_collection(cfg.COLLECTION_NAME)
    result = collection.get(include=['embeddings', 'metadatas'], limit=args.max_datapoints)

    # Newer chromadb versions return embeddings as a numpy array, whose
    # truth value is ambiguous (`if not array` raises ValueError) — check
    # explicitly for None/empty instead of relying on truthiness.
    embeddings = result.get("embeddings")
    if embeddings is None or len(embeddings) == 0:
        raise RuntimeError("No embeddings found — build the vector store first.")

    X = np.array(embeddings)
    # Target: price stored per item in the collection metadata.
    y = np.array([md["price"] for md in result["metadatas"]])
    logger.info(f"Training RF on {X.shape[0]} samples, {X.shape[1]} features")

    # Fixed random_state keeps the split and the forest reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    rf = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)
    rf.fit(X_train, y_train)

    preds = rf.predict(X_test)
    # mean_squared_error(squared=False) was deprecated in scikit-learn 1.4
    # and removed in 1.6; take the square root ourselves so this works on
    # both old and current sklearn versions.
    rmse = float(np.sqrt(mean_squared_error(y_test, preds)))
    r2 = r2_score(y_test, preds)
    logger.info(f"Holdout RMSE={rmse:.2f}, R2={r2:.3f}")

    joblib.dump(rf, "random_forest_model.pkl")
    logger.info("Saved model to random_forest_model.pkl")
# Script entry point: python train_rf.py [--max-datapoints N]
if __name__ == "__main__":
    main()