Files
LLM_Engineering_OLD/week3/community-contributions/juan_synthetic_data/src/helpers.py
2025-10-23 15:29:54 +01:00

15 lines
570 B
Python

import hashlib
import pandas as pd
def hash_row(row: pd.Series) -> str:
    """Return the hexadecimal MD5 digest of a row, for duplicate detection.

    The items yielded by iterating ``row`` are packed into a tuple, rendered
    with ``str()``, encoded with the ``str.encode`` default (UTF-8) and
    hashed. Rows whose tuples render to the same text get the same digest.
    MD5 serves here only as a fingerprint, not as a security measure.
    """
    row_text = str(tuple(row))
    hasher = hashlib.md5()
    hasher.update(row_text.encode())
    return hasher.hexdigest()
def sample_reference(reference_df: pd.DataFrame, n_reference_rows: int) -> list:
    """Return a fresh random sample of reference rows for batch generation.

    Args:
        reference_df: Reference data to draw from. ``None`` or an empty
            frame is tolerated and yields an empty list.
        n_reference_rows: Maximum number of rows to return. The value is
            capped at ``len(reference_df)`` because rows are drawn without
            replacement; zero or a negative value yields an empty list.

    Returns:
        A list with one ``{column: value}`` dict per sampled row, or ``[]``
        when there is nothing to sample.
    """
    if reference_df is None or reference_df.empty:
        return []

    # Clamp the request to [0, len(df)]: pandas raises ValueError for a
    # negative ``n`` and for an ``n`` larger than the frame when sampling
    # without replacement.
    n_rows = max(0, min(n_reference_rows, len(reference_df)))
    if n_rows == 0:
        return []

    sampled = reference_df.sample(n=n_rows, replace=False)
    return sampled.to_dict(orient="records")