"""
|
|
Embeddings utility using sentence-transformers
|
|
"""
|
|
from sentence_transformers import SentenceTransformer
|
|
import numpy as np
|
|
from typing import List, Union
|
|
import logging
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
class EmbeddingModel:
|
|
"""Wrapper for sentence transformer embeddings"""
|
|
|
|
def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
|
|
"""
|
|
Initialize embedding model
|
|
|
|
Args:
|
|
model_name: HuggingFace model name for embeddings
|
|
"""
|
|
self.model_name = model_name
|
|
logger.info(f"Loading embedding model: {model_name}")
|
|
self.model = SentenceTransformer(model_name)
|
|
self.dimension = self.model.get_sentence_embedding_dimension()
|
|
logger.info(f"Embedding dimension: {self.dimension}")
|
|
|
|
def embed(self, texts: Union[str, List[str]]) -> np.ndarray:
|
|
"""
|
|
Generate embeddings for text(s)
|
|
|
|
Args:
|
|
texts: Single text or list of texts
|
|
|
|
Returns:
|
|
Numpy array of embeddings
|
|
"""
|
|
if isinstance(texts, str):
|
|
texts = [texts]
|
|
|
|
embeddings = self.model.encode(texts, show_progress_bar=False)
|
|
return embeddings
|
|
|
|
def embed_query(self, query: str) -> List[float]:
|
|
"""
|
|
Embed a single query - returns as list for ChromaDB compatibility
|
|
|
|
Args:
|
|
query: Query text
|
|
|
|
Returns:
|
|
List of floats representing the embedding
|
|
"""
|
|
embedding = self.model.encode([query], show_progress_bar=False)[0]
|
|
return embedding.tolist()
|
|
|
|
def embed_documents(self, documents: List[str]) -> List[List[float]]:
|
|
"""
|
|
Embed multiple documents - returns as list of lists for ChromaDB
|
|
|
|
Args:
|
|
documents: List of document texts
|
|
|
|
Returns:
|
|
List of embeddings (each as list of floats)
|
|
"""
|
|
embeddings = self.model.encode(documents, show_progress_bar=False)
|
|
return embeddings.tolist()
|
|
|
|
    def similarity(self, text1: str, text2: str) -> float:
        """
        Calculate cosine similarity between two texts

        Args:
            text1: First text
            text2: Second text

        Returns:
            Cosine similarity score between -1 and 1 (higher means more similar)
        """
        emb1, emb2 = self.model.encode([text1, text2])

        # Cosine similarity: dot product normalized by the vector magnitudes
        similarity = np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2))
        return float(similarity)
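

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the module's public API): exercises
# the methods above. Assumes sentence-transformers is installed; the sample
# strings are arbitrary, and the shapes noted in comments assume the default
# all-MiniLM-L6-v2 model (384-dimensional embeddings).
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    model = EmbeddingModel()

    # embed() returns a (num_texts, dimension) numpy array.
    vectors = model.embed(["hello world", "goodbye world"])
    print(f"Batch shape: {vectors.shape}")  # (2, 384) with the default model

    # The list-based variants are shaped for ChromaDB, e.g. (with a
    # hypothetical collection) collection.add(embeddings=model.embed_documents(docs), ...)
    # and collection.query(query_embeddings=[model.embed_query(q)], ...).
    query_vec = model.embed_query("hello world")
    print(f"Query embedding length: {len(query_vec)}")  # 384

    # Related sentences should score well above unrelated ones.
    print(f"Similar:   {model.similarity('The cat sat on the mat.', 'A cat is on a mat.'):.3f}")
    print(f"Unrelated: {model.similarity('The cat sat on the mat.', 'Quarterly revenue rose 5%.'):.3f}")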