1143 lines
42 KiB
Python
1143 lines
42 KiB
Python
import os
|
||
import json
|
||
import pandas as pd
|
||
import numpy as np
|
||
from pathlib import Path
|
||
import re
|
||
from datetime import datetime, timedelta
|
||
from bs4 import BeautifulSoup
|
||
import html2text
|
||
from collections import Counter, defaultdict, deque
|
||
import warnings
|
||
import time
|
||
import hashlib
|
||
import socket
|
||
import random
|
||
import zipfile
|
||
import tempfile
|
||
import shutil
|
||
|
||
warnings.filterwarnings('ignore')
|
||
|
||
import gradio as gr
|
||
import chromadb
|
||
from sentence_transformers import SentenceTransformer
|
||
from transformers import AutoModelForSequenceClassification, AutoTokenizer
|
||
from openai import OpenAI
|
||
import torch
|
||
|
||
# ================================
|
||
# USAGE PROTECTION SYSTEM
|
||
# ================================
|
||
|
||
class UsageTracker:
    """In-memory request and cost limiter used to cap API spend.

    Tracks a sliding one-hour window per user, a per-user daily count, and
    service-wide request/cost totals. The daily counters and the service-wide
    totals (which the usage display labels as "today") are reset the first
    time the tracker is used on a new calendar day.

    All state lives in plain attributes of this object; nothing is persisted.
    """

    def __init__(self):
        # user_id -> deque of request timestamps within the last hour
        self.hourly_limits = defaultdict(deque)
        # user_id -> number of requests recorded today
        self.daily_limits = defaultdict(int)
        self.total_requests = 0
        self.total_cost = 0.0

        # STRICTER LIMITS for cost control
        self.max_hourly = 5  # Reduced from 15
        self.max_daily = 20  # Reduced from 100
        self.max_total = 200  # Reduced from 1000
        self.max_daily_cost = 3.0  # $3 daily limit

        # GPT-4o-mini pricing (approximate cost per request)
        self.cost_per_request = 0.01  # ~1 cent per request (conservative estimate)

        # Calendar day the daily counters currently belong to.
        self._current_day = datetime.now().date()

    def _roll_over_day(self, now):
        """Reset the per-day counters when ``now`` falls on a new calendar day."""
        today = now.date()
        if today != self._current_day:
            self._current_day = today
            self.daily_limits.clear()
            self.total_requests = 0
            self.total_cost = 0.0

    def can_make_request(self, user_id):
        """Check whether ``user_id`` may issue another request right now.

        Returns:
            A ``(allowed, message)`` tuple. ``message`` is ``"OK"`` when the
            request is allowed, otherwise a user-facing explanation.
        """
        now = datetime.now()
        self._roll_over_day(now)
        hour_ago = now - timedelta(hours=1)

        # Use .get() so that merely checking a limit does not create an entry
        # for every id that is ever seen.
        recent = self.hourly_limits.get(user_id)
        if recent is not None:
            # Clean old hourly requests
            while recent and recent[0] < hour_ago:
                recent.popleft()
            if not recent:
                del self.hourly_limits[user_id]
                recent = None

        if recent is not None and len(recent) >= self.max_hourly:
            return False, f"⏰ Hourly limit reached ({self.max_hourly} requests/hour). Please try again in a few minutes."

        if self.daily_limits.get(user_id, 0) >= self.max_daily:
            return False, f"📅 Daily limit reached ({self.max_daily} requests/day). Come back tomorrow!"

        if self.total_requests >= self.max_total:
            return False, "🚫 Service temporarily unavailable due to high usage. Please try again later."

        # Check estimated daily cost
        if self.total_cost >= self.max_daily_cost:
            return False, f"💰 Daily cost limit (${self.max_daily_cost}) reached. Service will reset tomorrow."

        return True, "OK"

    def record_request(self, user_id):
        """Record one request for ``user_id`` and add its estimated cost."""
        now = datetime.now()
        self._roll_over_day(now)
        self.hourly_limits[user_id].append(now)
        self.daily_limits[user_id] += 1
        self.total_requests += 1
        self.total_cost += self.cost_per_request  # Track estimated cost

    def get_usage_info(self):
        """Get current usage info for display (a Markdown snippet)."""
        self._roll_over_day(datetime.now())
        within_limits = (
            self.total_requests < self.max_total
            and self.total_cost < self.max_daily_cost
        )
        return (
            "\n"
            "**📊 Current Usage:**\n"
            f"- Total requests today: {self.total_requests}/{self.max_total}\n"
            f"- Estimated cost today: ${self.total_cost:.2f}/${self.max_daily_cost}\n"
            f"- Service status: {'🟢 Active' if within_limits else '🔴 Limited'}\n"
        )
|
||
|
||
# Module-level tracker instance used by protected_function.
|
||
usage_tracker = UsageTracker()  # single instance created at import time; its counters live only in memory
|
||
|
||
|
||
def protected_function(func):
    """Decorate ``func`` so every call is first checked against ``usage_tracker``.

    When the tracker refuses the request, the wrapper returns the refusal
    message as a plain string and ``func`` is not called. Otherwise the
    request is recorded and ``func`` is called with the original arguments.

    NOTE(review): the refusal is a bare string regardless of what ``func``
    normally returns; confirm that callers wiring this into a UI with several
    outputs can handle that.
    """
    from functools import wraps  # local import: functools is not imported at file level

    def _caller_id(args, kwargs):
        """Return a short id for the caller, stable across calls when possible."""
        # A request-like argument (anything exposing ``.client.host``, as
        # Gradio's Request object does) gives a stable per-client id, which
        # is what makes the per-user hourly/daily limits meaningful.
        for candidate in (*args, *kwargs.values()):
            host = getattr(getattr(candidate, 'client', None), 'host', None)
            if host:
                return hashlib.md5(str(host).encode()).hexdigest()[:8]
        # No request available: keep the original time-based id. It differs
        # on every call, so only the service-wide limits apply in that case.
        return hashlib.md5(str(time.time()).encode()).hexdigest()[:8]

    @wraps(func)
    def wrapper(*args, **kwargs):
        user_id = _caller_id(args, kwargs)
        allowed, message = usage_tracker.can_make_request(user_id)

        if not allowed:
            # The tracker's messages are complete sentences that already tell
            # the user when to retry, so nothing is appended to them.
            return f"⚠️ {message}"

        usage_tracker.record_request(user_id)
        return func(*args, **kwargs)

    return wrapper
|
||
|
||
# ================================
|
||
# LINKEDIN DATA PROCESSOR
|
||
# ================================
|
||
|
||
class LinkedInDataProcessor:
    """Load a LinkedIn data export from a directory and build RAG documents.

    Attributes:
        data_path: Root directory of the extracted export.
        profile_data: Maps a data-type key (e.g. ``'skills'``) to the loaded
            DataFrame; ``'articles_html'`` maps to the list of parsed articles.
        processed_data: Initialised empty; no method of this class writes to it.
        articles_content: Article dicts produced by ``parse_html_article``.
        rag_documents: Documents built by ``create_rag_documents``.
    """

    # Timestamp prefix found on exported article filenames,
    # e.g. "2021-03-04 10:11:12.0-my-post.html".
    _FILENAME_PREFIX = r'^\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}\.\d+-'

    def __init__(self, data_path):
        self.data_path = Path(data_path)
        self.profile_data = {}
        self.processed_data = {}
        self.articles_content = []
        self.rag_documents = []

    def load_all_data(self):
        """Load all LinkedIn CSV files plus the HTML articles.

        Returns:
            True when at least one CSV file was loaded, False otherwise.
        """
        print("🔄 Loading LinkedIn data...")

        file_mappings = {
            'Profile.csv': 'basic_info',
            'Connections.csv': 'connections',
            'Experience.csv': 'experience',
            'Education.csv': 'education',
            'Skills.csv': 'skills',
            'Certifications.csv': 'certifications',
            'Articles.csv': 'articles_metadata',
            'Comments.csv': 'comments',
            'Shares.csv': 'shares',
            'Positions.csv': 'positions',
            'Languages.csv': 'languages',
            'Projects.csv': 'projects',
            'Publications.csv': 'publications',
            'Recommendations.csv': 'recommendations',
            'Endorsement_Given_Info.csv': 'endorsements_given',
            'Endorsement_Received_Info.csv': 'endorsements_received',
            'Courses.csv': 'courses',
            'Learning.csv': 'learning_paths',
            'Interests.csv': 'interests',
            'Company Follow.csv': 'companies_followed',
            'Reactions.csv': 'reactions',
            'Views.csv': 'views',
            'Saved_Items.csv': 'saved_items',
        }

        loaded_count = 0
        for file_name, data_type in file_mappings.items():
            file_path = self.data_path / file_name
            if not file_path.exists():
                print(f"📁 {file_name} not found")
                continue
            try:
                df = pd.read_csv(file_path, encoding='utf-8')
            except Exception as e:
                # Best effort: one unreadable file must not abort the load.
                print(f"⚠️ Could not load {file_name}: {str(e)}")
                continue
            self.profile_data[data_type] = df
            print(f"✅ Loaded {file_name}: {len(df)} records")
            loaded_count += 1

        self.load_html_articles()
        print(f"🎉 Successfully loaded {loaded_count} data files")
        return loaded_count > 0

    def load_html_articles(self):
        """Find the articles folder and parse every ``*.html`` file in it."""
        print("\n📰 Loading HTML articles...")

        articles_paths = [
            self.data_path / "Articles" / "Articles",
            self.data_path / "Articles",
            self.data_path / "articles" / "articles",
            self.data_path / "articles",
        ]
        found_path = next((path for path in articles_paths if path.exists()), None)
        if not found_path:
            print("📁 Articles folder not found")
            return

        # Sorted so that document order (and the ids derived from it) does
        # not depend on the file system's listing order.
        html_files = sorted(found_path.glob("*.html"))
        if not html_files:
            print("📄 No HTML files found")
            return

        print(f"📄 Found {len(html_files)} HTML articles")

        articles_data = []
        for html_file in html_files:
            try:
                article_data = self.parse_html_article(html_file)
            except Exception as e:
                print(f"⚠️ Error parsing {html_file.name}: {str(e)}")
                continue
            if article_data:
                articles_data.append(article_data)

        self.articles_content = articles_data
        self.profile_data['articles_html'] = articles_data
        print(f"🎉 Successfully loaded {len(articles_data)} articles")

    def extract_linkedin_url_from_html(self, html_content, filename):
        """Extract the LinkedIn URL of an article from its HTML.

        Tries, in order: the canonical ``<link>``, the ``og:url`` meta tag,
        the first ``linkedin.com/pulse/`` URL in the raw HTML, and finally a
        URL constructed from ``filename``. Returns None when nothing matches
        or when parsing raises.
        """
        try:
            soup = BeautifulSoup(html_content, 'html.parser')

            # (tag name, attributes to match, attribute holding the URL)
            lookups = (
                ('link', {'rel': 'canonical'}, 'href'),
                ('meta', {'property': 'og:url'}, 'content'),
            )
            for tag_name, attrs, url_attr in lookups:
                tag = soup.find(tag_name, attrs)
                url = tag.get(url_attr) if tag else None
                if url and 'linkedin.com' in url:
                    return url

            # Look for any LinkedIn URLs in the content
            linkedin_pattern = r'https?://(?:www\.)?linkedin\.com/pulse/[^"\s<>]+'
            matches = re.findall(linkedin_pattern, html_content)
            if matches:
                return matches[0]

            # Fallback: construct URL from filename
            if filename:
                clean_name = re.sub(self._FILENAME_PREFIX, '', filename)
                clean_name = clean_name.replace('.html', '')
                if len(clean_name) > 10 and '-' in clean_name:
                    return f"https://www.linkedin.com/pulse/{clean_name}/"

            return None

        except Exception as e:
            print(f"Error extracting LinkedIn URL: {e}")
            return None

    def parse_html_article(self, file_path):
        """Parse one HTML article file into a dict of text and metadata.

        Args:
            file_path: ``Path`` of the HTML file (its ``.name`` is used).

        Returns:
            Dict with keys ``filename``, ``title``, ``content`` (HTML),
            ``plain_text``, ``date_published``, ``word_count``, ``topics``,
            ``writing_style`` and ``linkedin_url``.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        soup = BeautifulSoup(content, 'html.parser')

        # Extract title; an empty <h1>/<title> falls back to the filename.
        title_elem = soup.find('h1') or soup.find('title')
        title = title_elem.get_text().strip() if title_elem else ''
        if not title:
            title = self.extract_title_from_filename(file_path.name)

        # Extract LinkedIn URL
        linkedin_url = self.extract_linkedin_url_from_html(content, file_path.name)

        # Extract content: first selector that matches wins.
        content_selectors = ['article', '.article-content', '.post-content', 'main', '.content', 'body']
        article_content = None
        for selector in content_selectors:
            article_content = soup.select_one(selector)
            if article_content:
                break
        if not article_content:
            article_content = soup.find('body') or soup

        # Convert to plain text
        h = html2text.HTML2Text()
        h.ignore_links = True
        h.ignore_images = True
        plain_text = h.handle(str(article_content)).strip()

        # Extract metadata
        words = re.findall(r'\b\w+\b', plain_text.lower())

        return {
            'filename': file_path.name,
            'title': title,
            'content': str(article_content),
            'plain_text': plain_text,
            'date_published': self.extract_date_from_filename(file_path.name),
            'word_count': len(words),
            'topics': self.extract_topics(plain_text),
            'writing_style': self.analyze_writing_style(plain_text),
            'linkedin_url': linkedin_url,
        }

    def extract_title_from_filename(self, filename):
        """Build a readable, capitalised title from an article filename."""
        title = filename.replace('.html', '')
        title = re.sub(self._FILENAME_PREFIX, '', title)
        title = title.replace('-', ' ').replace('_', ' ')
        return ' '.join(word.capitalize() for word in title.split())

    def extract_date_from_filename(self, filename):
        """Return the first ``YYYY-MM-DD`` found in ``filename``, or ``''``."""
        date_match = re.search(r'(\d{4}-\d{2}-\d{2})', filename)
        return date_match.group(1) if date_match else ''

    def analyze_writing_style(self, text):
        """Compute simple writing-style counters for ``text``."""
        text_lower = text.lower()
        # re.split leaves empty fragments (e.g. after the final full stop);
        # they are not sentences and must not be counted.
        sentences = [s for s in re.split(r'[.!?]+', text) if s.strip()]
        words = re.findall(r'\b\w+\b', text_lower)

        return {
            'word_count': len(words),
            'sentence_count': len(sentences),
            'avg_sentence_length': len(words) / max(len(sentences), 1),
            'question_count': text.count('?'),
            'first_person_usage': len(re.findall(r'\b(i|me|my|myself|we|us|our)\b', text_lower)),
            'technical_terms': sum(text_lower.count(term) for term in ['algorithm', 'framework', 'methodology', 'data', 'analysis', 'technology']),
        }

    def extract_topics(self, text, max_topics=10):
        """Return up to ``max_topics`` most frequent non-stop words (> 3 letters)."""
        words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())
        stop_words = {'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'been', 'have', 'has', 'had'}
        word_freq = Counter(word for word in words if word not in stop_words and len(word) > 3)
        return [word for word, count in word_freq.most_common(max_topics)]

    def create_rag_documents(self):
        """Rebuild ``self.rag_documents`` from the loaded data and return it."""
        self.rag_documents = []

        # Process profile data
        for data_type, data_content in self.profile_data.items():
            if isinstance(data_content, pd.DataFrame) and not data_content.empty:
                self.process_dataframe_to_documents(data_content, data_type)
            elif isinstance(data_content, list) and data_content:
                self.process_list_to_documents(data_content, data_type)

        # Process articles with LinkedIn URLs
        for article in self.articles_content:
            if not article['plain_text'].strip():
                continue
            self.rag_documents.append({
                'text': article['plain_text'],
                'title': article['title'],
                'source_type': 'article',
                'date_published': article['date_published'],
                'word_count': article['word_count'],
                'topics': article['topics'],
                'linkedin_url': article.get('linkedin_url', ''),
                'filename': article['filename'],
            })

        print(f"📚 Created {len(self.rag_documents)} RAG documents with LinkedIn URLs")
        return self.rag_documents

    @staticmethod
    def _cell(row, column, default=''):
        """Return ``row[column]``, or ``default`` when it is absent, None or NaN.

        ``pandas.read_csv`` represents empty cells as NaN; without this check
        they would be rendered as the literal text "nan" in document texts.
        """
        value = row.get(column, default)
        if value is None:
            return default
        if not isinstance(value, str) and pd.api.types.is_scalar(value) and pd.isna(value):
            return default
        return value

    def process_dataframe_to_documents(self, df, data_type):
        """Append RAG documents for ``df`` to ``self.rag_documents``.

        Only the ``experience``, ``education``, ``skills``, ``certifications``
        and ``projects`` data types produce documents; others are ignored.
        """
        cell = self._cell

        if data_type == 'experience':
            for _, row in df.iterrows():
                job_title = cell(row, 'Title')
                company = cell(row, 'Company')
                text = f"Experience: {job_title} at {company}\n"
                text += f"Duration: {cell(row, 'Started On')} - {cell(row, 'Finished On', 'Present')}\n"
                text += f"Description: {cell(row, 'Description')}"

                self.rag_documents.append({
                    'text': text,
                    'title': f"{job_title} at {company}",
                    'source_type': 'experience',
                    'linkedin_url': '',
                })

        elif data_type == 'education':
            for _, row in df.iterrows():
                degree = cell(row, 'Degree')
                school = cell(row, 'School')
                text = f"Education: {degree} in {cell(row, 'Field Of Study')} from {school}\n"
                text += f"Duration: {cell(row, 'Start Date')} - {cell(row, 'End Date')}"

                self.rag_documents.append({
                    'text': text,
                    'title': f"{degree} - {school}",
                    'source_type': 'education',
                    'linkedin_url': '',
                })

        elif data_type == 'skills':
            if 'Skill' in df.columns:
                # astype(str): join() would raise on non-string cells.
                skills_text = "Professional Skills: " + ", ".join(df['Skill'].dropna().astype(str).tolist())
                self.rag_documents.append({
                    'text': skills_text,
                    'title': 'Professional Skills',
                    'source_type': 'skills',
                    'linkedin_url': '',
                })

        elif data_type == 'certifications':
            if 'Name' in df.columns:
                certs_text = "Certifications: " + ", ".join(df['Name'].dropna().astype(str).tolist())
                self.rag_documents.append({
                    'text': certs_text,
                    'title': 'Certifications',
                    'source_type': 'certifications',
                    'linkedin_url': '',
                })

        elif data_type == 'projects':
            for _, row in df.iterrows():
                project_url = cell(row, 'Url')
                text = f"Project: {cell(row, 'Title')}\n"
                text += f"Description: {cell(row, 'Description')}\n"
                text += f"URL: {project_url}"

                # Only LinkedIn-hosted URLs are exposed as the source link.
                linkedin_url = project_url if 'linkedin.com' in str(project_url) else ''

                self.rag_documents.append({
                    'text': text,
                    'title': cell(row, 'Title', 'Project'),
                    'source_type': 'projects',
                    'linkedin_url': linkedin_url,
                })

    def process_list_to_documents(self, data_list, data_type):
        """Convert list data to RAG documents.

        Currently produces nothing: the only list stored in ``profile_data``
        is ``'articles_html'``, and articles are turned into documents by
        ``create_rag_documents`` itself.
        """
        if data_type == 'articles_html':
            return

    def get_profile_summary(self):
        """Return counts describing the loaded data and built documents."""
        summary = {
            'total_documents': len(self.rag_documents),
            'articles_count': len(self.articles_content),
            'data_types': list(self.profile_data.keys()),
            'skills_count': len(self.profile_data.get('skills', [])),
            'experience_count': len(self.profile_data.get('experience', [])),
            'education_count': len(self.profile_data.get('education', [])),
        }

        if self.articles_content:
            total_words = sum(article['word_count'] for article in self.articles_content)
            summary['total_article_words'] = total_words
            summary['avg_article_length'] = total_words // len(self.articles_content)

        return summary
|
||
|
||
# ================================
|
||
# RAG SYSTEM
|
||
# ================================
|
||
|
||
class LinkedInRAGSystem:
    """Retrieval-augmented chat over LinkedIn profile documents.

    Combines a sentence-embedding model and a ChromaDB collection for
    retrieval, a cross-encoder for reranking, and the OpenAI chat API for
    answer generation. All heavy members are None until
    ``initialize_models`` / ``create_vector_store`` have run.
    """

    def __init__(self, chroma_db_path):
        self.chroma_db_path = chroma_db_path
        self.embedding_model = None
        self.cross_encoder_model = None
        self.cross_encoder_tokenizer = None
        self.chroma_client = None
        self.collection = None
        self.openai_client = None

    def initialize_models(self):
        """Initialize the OpenAI client, both models and the ChromaDB client.

        Returns:
            True when every component was created, False as soon as one
            fails (the reason is printed). The OpenAI key is read from the
            ``OPENAI_API_KEY`` environment variable.
        """
        print("🔄 Initializing RAG models...")

        # Initialize OpenAI client
        try:
            api_key = os.getenv('OPENAI_API_KEY')
            if not api_key:
                print("❌ OpenAI API key not found in environment variables")
                return False
            self.openai_client = OpenAI(api_key=api_key)
            print("✅ OpenAI client initialized")
        except Exception as e:
            print(f"❌ Failed to initialize OpenAI client: {e}")
            return False

        # Load embedding model
        try:
            self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
            print("✅ Embedding model loaded")
        except Exception as e:
            print(f"❌ Failed to load embedding model: {e}")
            return False

        # Load cross-encoder for reranking
        try:
            cross_encoder_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"
            self.cross_encoder_tokenizer = AutoTokenizer.from_pretrained(cross_encoder_name)
            self.cross_encoder_model = AutoModelForSequenceClassification.from_pretrained(cross_encoder_name)
            print("✅ Cross-encoder model loaded")
        except Exception as e:
            print(f"❌ Failed to load cross-encoder: {e}")
            return False

        # Initialize ChromaDB
        try:
            self.chroma_client = chromadb.PersistentClient(path=self.chroma_db_path)
            print("✅ ChromaDB initialized")
        except Exception as e:
            print(f"❌ Failed to initialize ChromaDB: {e}")
            return False

        return True

    @staticmethod
    def _to_metadata(doc):
        """Flatten one document dict (minus its text) into string metadata."""
        metadata = {}
        for key, value in doc.items():
            if key == 'text':
                continue
            if key == 'topics' and isinstance(value, list):
                metadata[key] = ', '.join(str(topic) for topic in value)
            elif value is None:
                metadata[key] = ''
            else:
                metadata[key] = str(value)
        return metadata

    def create_vector_store(self, documents):
        """(Re)create the ``linkedin_profile`` collection from ``documents``.

        Args:
            documents: List of dicts with a ``'text'`` key; every other key
                is stored as string metadata.

        Returns:
            True once all documents have been added.
        """
        print("🔄 Creating vector store with LinkedIn URLs...")

        # Delete existing collection if it exists. A missing collection is
        # the expected case on first run, so the failure is deliberately
        # ignored (but no longer via a bare except, which would also
        # swallow KeyboardInterrupt/SystemExit).
        try:
            self.chroma_client.delete_collection("linkedin_profile")
        except Exception:
            pass

        # Create new collection
        self.collection = self.chroma_client.create_collection("linkedin_profile")

        # Generate embeddings
        texts = [doc['text'] for doc in documents]
        embeddings = self.embedding_model.encode(texts, show_progress_bar=True)

        # Prepare data for ChromaDB with enhanced metadata
        ids = [f"doc_{i}" for i in range(len(documents))]
        metadatas = [self._to_metadata(doc) for doc in documents]

        # Add to collection in batches
        batch_size = 100
        for start in range(0, len(texts), batch_size):
            end = min(start + batch_size, len(texts))
            self.collection.add(
                embeddings=embeddings[start:end].tolist(),
                documents=texts[start:end],
                metadatas=metadatas[start:end],
                ids=ids[start:end],
            )

        print(f"✅ Vector store created with {self.collection.count()} documents")
        return True

    def retrieve_and_rerank(self, query, initial_k=20, final_n=5):
        """Retrieve up to ``initial_k`` documents and return the best ``final_n``.

        Returns:
            List of ``{'text', 'metadata', 'score'}`` dicts sorted by
            cross-encoder score, best first. Empty when there is no
            collection, nothing is found, or any step raises.
        """
        if self.collection is None:
            return []

        try:
            available = self.collection.count()
            if available == 0:
                return []

            # Initial retrieval; never ask for more results than exist.
            query_embedding = self.embedding_model.encode(query).tolist()
            results = self.collection.query(
                query_embeddings=[query_embedding],
                n_results=min(initial_k, available),
                include=['documents', 'metadatas'],
            )

            documents = results['documents'][0]
            if not documents:
                return []
            metadatas = results['metadatas'][0]

            # Rerank with cross-encoder
            pairs = [[query, doc] for doc in documents]
            inputs = self.cross_encoder_tokenizer(
                pairs,
                padding=True,
                truncation=True,
                return_tensors='pt',
                max_length=512,
            )

            with torch.no_grad():
                logits = self.cross_encoder_model(**inputs).logits

            # Flatten to a plain list; works for one pair as well as many.
            scores = logits.reshape(-1).tolist()

            # Sort by score
            ranked = sorted(
                zip(documents, metadatas, scores),
                key=lambda item: item[2],
                reverse=True,
            )

            # Return top documents
            return [{'text': doc, 'metadata': meta, 'score': score}
                    for doc, meta, score in ranked[:final_n]]

        except Exception as e:
            print(f"Error in retrieve_and_rerank: {e}")
            return []

    def generate_response(self, query, retrieved_docs):
        """Generate an answer to ``query`` from ``retrieved_docs`` via OpenAI.

        Returns a fixed fallback sentence when ``retrieved_docs`` is empty
        and an apology containing the error text when the API call fails.
        """
        if not retrieved_docs:
            return "I couldn't find relevant information to answer your question."

        context = "\n\n".join([doc['text'] for doc in retrieved_docs])

        messages = [
            {
                "role": "system",
                "content": """You are an AI assistant representing a LinkedIn profile. Answer questions based ONLY on the provided context from the LinkedIn profile data and articles.

Guidelines:
- Be professional and personable
- Provide specific details when available
- If information isn't in the context, politely say so
- Use first person when appropriate (since you're representing the profile owner)
- Keep responses concise but informative
- Do not mention or reference the sources in your response - that will be handled separately"""
            },
            {
                "role": "user",
                "content": f"Context:\n{context}\n\nQuestion: {query}\n\nPlease answer based on the LinkedIn profile information provided:"
            }
        ]

        try:
            response = self.openai_client.chat.completions.create(
                model="gpt-4o-mini",
                messages=messages,
                max_tokens=400,
                temperature=0.3,
                top_p=0.9,
            )
            # content can be None; treat that as an empty answer.
            return (response.choices[0].message.content or "").strip()
        except Exception as e:
            return f"Sorry, I encountered an error generating a response: {str(e)}"

    def format_sources_with_links(self, retrieved_docs):
        """Format the retrieved documents as an HTML/Markdown source list.

        Titles, dates, source types and URLs originate from uploaded files,
        so they are HTML-escaped before being interpolated, and a link is
        only emitted for ``http://`` / ``https://`` URLs.

        Returns:
            The formatted snippet, or ``""`` when ``retrieved_docs`` is empty.
        """
        from html import escape  # local import: `html` is not imported at file level

        if not retrieved_docs:
            return ""

        badge_colors = {
            'article': '#0077B5',
            'experience': '#2D7D32',
            'education': '#7B1FA2',
            'skills': '#F57C00',
            'projects': '#D32F2F',
            'certifications': '#1976D2',
        }

        parts = ["<br><br>**📚 Sources:**<br>"]

        for i, doc in enumerate(retrieved_docs, 1):
            metadata = doc.get('metadata') or {}
            source_type = str(metadata.get('source_type', 'Unknown'))
            title = escape(str(metadata.get('title', 'Untitled')))
            linkedin_url = str(metadata.get('linkedin_url', '') or '')
            date_published = escape(str(metadata.get('date_published', '') or ''))

            # Create source entry
            if linkedin_url.lower().startswith(('http://', 'https://')):
                # Clickable LinkedIn link
                href = escape(linkedin_url, quote=True)
                source_entry = f"🔗 <a href='{href}' target='_blank' style='color: #0077B5; text-decoration: none; font-weight: 500;'>{title}</a>"
            else:
                # No (safe) link available
                source_entry = f"📄 **{title}**"

            if date_published:
                source_entry += f" <span style='color: #666; font-size: 0.9em;'>({date_published})</span>"

            # Add source type badge
            type_color = badge_colors.get(source_type, '#666')
            source_type_badge = f"<span style='background: {type_color}; color: white; padding: 2px 8px; border-radius: 12px; font-size: 0.8em; margin-left: 8px;'>{escape(source_type.title())}</span>"

            parts.append(f"{i}. {source_entry}{source_type_badge}<br>")

        return "".join(parts)

    def chat(self, query):
        """Answer ``query`` and append the formatted source list."""
        retrieved_docs = self.retrieve_and_rerank(query)
        response = self.generate_response(query, retrieved_docs)

        # Add formatted sources with links
        sources_info = self.format_sources_with_links(retrieved_docs)

        return response + sources_info
|
||
|
||
# ================================
|
||
# UTILITY FUNCTIONS
|
||
# ================================
|
||
|
||
def extract_uploaded_data(zip_file_path, extract_to):
    """Unpack the archive at ``zip_file_path`` into the ``extract_to`` directory.

    Returns:
        True when extraction completed, False when anything raised (the
        error is printed rather than propagated).
    """
    try:
        with zipfile.ZipFile(zip_file_path, mode='r') as archive:
            archive.extractall(path=extract_to)
        print(f"✅ Extracted data to {extract_to}")
        succeeded = True
    except Exception as exc:
        print(f"❌ Failed to extract zip file: {exc}")
        succeeded = False
    return succeeded
|
||
|
||
def initialize_linkedin_chatbot(data_path):
    """Initialize the complete LinkedIn chatbot system with clickable sources.

    Loads the export under ``data_path``, builds the RAG documents, creates
    a ``LinkedInRAGSystem`` backed by a fresh temporary ChromaDB directory
    and fills its vector store.

    Returns:
        ``(rag_system, summary_text)`` on success, or ``(None, reason)``
        when any step fails.
    """
    print("🚀 Initializing LinkedIn Profile Chatbot with clickable sources...")

    # Step 1: Load and process data
    processor = LinkedInDataProcessor(data_path)
    if not processor.load_all_data():
        return None, "Failed to load LinkedIn data. Please check the uploaded data."

    # Step 2: Create RAG documents with LinkedIn URLs
    documents = processor.create_rag_documents()
    if not documents:
        return None, "No documents created from LinkedIn data."

    # Count articles with LinkedIn URLs
    articles_with_urls = sum(
        1 for doc in documents
        if doc.get('linkedin_url') and doc.get('source_type') == 'article'
    )

    # Steps 3 and 4: Initialize RAG system and create the vector store.
    # The temporary database directory is removed again when either step
    # fails (by return value or by exception), so failed uploads do not
    # accumulate directories on disk.
    temp_db_path = tempfile.mkdtemp()
    failure = None
    try:
        rag_system = LinkedInRAGSystem(temp_db_path)
        if not rag_system.initialize_models():
            failure = "Failed to initialize RAG models."
        elif not rag_system.create_vector_store(documents):
            failure = "Failed to create vector store."
    except Exception:
        shutil.rmtree(temp_db_path, ignore_errors=True)
        raise
    if failure is not None:
        shutil.rmtree(temp_db_path, ignore_errors=True)
        return None, failure

    # Step 5: Get profile summary
    summary = processor.get_profile_summary()

    # Create a clean status message
    summary_text = f"""
### ✅ **AI Assistant Ready with Clickable Sources!**

I have successfully analyzed the LinkedIn profile data including **{summary['total_documents']} documents** and **{summary['articles_count']} published articles** ({articles_with_urls} with direct LinkedIn links).

**💼 What I can help you discover:**
- 🎯 **Professional Journey** - Career progression and experience
- 🛠️ **Skills & Expertise** - Technical and professional capabilities
- 🎓 **Educational Background** - Academic achievements and learning
- 📝 **Published Content** - Articles with direct LinkedIn links
- 🚀 **Projects & Achievements** - Notable work and accomplishments
- 🌐 **Professional Network** - Industry connections and activities

**🔗 Enhanced Features:**
- **Clickable Sources** - Direct links to LinkedIn articles and content
- **Smart Source Attribution** - See exactly where information comes from
- **Professional Context** - Answers based on real LinkedIn profile data

**Ready to explore this professional profile!** Ask me anything you'd like to know.
"""

    return rag_system, summary_text
|
||
|
||
# ================================
|
||
# GRADIO INTERFACE
|
||
# ================================
|
||
|
||
# Global variables
|
||
# Active RAG system; stays None until process_upload succeeds and assigns it.
current_rag_system = None
|
||
# Latest status text; process_upload overwrites it after a successful upload.
current_status = "Upload your LinkedIn data to get started!"
|
||
|
||
# Usage summary component. NOTE(review): it is created here at module level; confirm it is placed inside the Gradio layout, after the status display.
|
||
usage_info = gr.Markdown(value=usage_tracker.get_usage_info())  # value is computed once, when this line runs
|
||
|
||
def process_upload(zip_file):
    """Process an uploaded LinkedIn data archive and activate the chatbot.

    Args:
        zip_file: The uploaded archive, either an object exposing the file
            path as ``.name`` or a plain path string; None when nothing was
            uploaded.

    Returns:
        A ``(status_message, chat_hint)`` tuple of strings. On success the
        module-level ``current_rag_system`` and ``current_status`` are
        replaced.
    """
    global current_rag_system, current_status

    if zip_file is None:
        return "Please upload a LinkedIn data ZIP file first.", ""

    # Accept both an object with a .name path and a bare path string.
    zip_path = getattr(zip_file, 'name', zip_file)

    temp_dir = None
    try:
        # Create temporary directory for extraction
        temp_dir = tempfile.mkdtemp()

        # Extract the uploaded file
        if not extract_uploaded_data(zip_path, temp_dir):
            return "❌ Failed to extract uploaded file.", ""

        # Initialize the RAG system
        rag_system, status_message = initialize_linkedin_chatbot(temp_dir)
        if rag_system is None:
            return f"❌ Failed to initialize: {status_message}", ""

        current_rag_system = rag_system
        current_status = status_message
        return status_message, "✅ **Ready to chat!** Ask me anything about the LinkedIn profile."

    except Exception as e:
        return f"❌ Error processing upload: {str(e)}", ""

    finally:
        # The extracted files are only read while the chatbot is being
        # initialised, so the directory can be removed on every path.
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)
|
||
|
||
@protected_function
def chat_with_profile(message, history):
    """Answer ``message`` with the active RAG system and extend ``history``.

    Appends one ``(message, reply)`` pair to ``history`` in place and
    returns ``(history, "")``; the empty string is the second output value.
    The reply is a prompt to upload data when no RAG system is active, a
    prompt to type something when ``message`` is blank, and otherwise the
    chatbot's answer (or an error text if answering raised).
    """
    global current_rag_system

    if current_rag_system is None:
        reply = "❌ **Please upload your LinkedIn data first using the file upload above.**"
    elif not message.strip():
        reply = "👋 Please enter a question about the LinkedIn profile!"
    else:
        try:
            reply = current_rag_system.chat(message)
        except Exception as err:
            reply = f"❌ **Error**: {str(err)}"

    history.append((message, reply))
    return history, ""
|
||
|
||
# Premium CSS: custom stylesheet handed to gr.Blocks(css=...). The class names
# below are the ones referenced through `elem_classes` in the layout.
premium_css = """
/* Import Google Fonts */
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');

/* Main container styling */
.gradio-container {
    font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif !important;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    min-height: 100vh;
}

/* Header styling */
.main-header {
    background: linear-gradient(135deg, #0077B5 0%, #00A0DC 50%, #40E0D0 100%);
    color: white;
    padding: 2rem;
    border-radius: 20px;
    margin-bottom: 2rem;
    text-align: center;
    box-shadow: 0 10px 30px rgba(0,119,181,0.3);
    border: 1px solid rgba(255,255,255,0.2);
    backdrop-filter: blur(10px);
}

.main-header h1 {
    font-size: 2.5rem;
    font-weight: 700;
    margin-bottom: 0.5rem;
    text-shadow: 2px 2px 4px rgba(0,0,0,0.3);
}

.main-header p {
    font-size: 1.2rem;
    opacity: 0.95;
    font-weight: 400;
}

/* Status card styling */
.status-card {
    background: linear-gradient(135deg, #ffffff 0%, #f8fafc 100%);
    border-radius: 16px;
    padding: 1.5rem;
    margin-bottom: 2rem;
    box-shadow: 0 8px 25px rgba(0,0,0,0.1);
    border: 1px solid rgba(0,119,181,0.1);
}

/* Chat container */
.chat-container {
    background: white;
    border-radius: 20px;
    padding: 1.5rem;
    box-shadow: 0 10px 40px rgba(0,0,0,0.1);
    border: 1px solid rgba(0,119,181,0.1);
    max-width: 900px;
    margin: 0 auto;
}

/* Upload container */
.upload-container {
    background: linear-gradient(135deg, #e3f2fd 0%, #bbdefb 100%);
    border-radius: 16px;
    padding: 1.5rem;
    margin-bottom: 2rem;
    border: 2px dashed #0077B5;
}

/* Button styling */
.primary-btn {
    background: linear-gradient(135deg, #0077B5 0%, #00A0DC 100%);
    color: white;
    border: none;
    border-radius: 12px;
    padding: 0.75rem 1.5rem;
    font-weight: 600;
    transition: all 0.3s ease;
    box-shadow: 0 4px 15px rgba(0,119,181,0.3);
}

.primary-btn:hover {
    transform: translateY(-2px);
    box-shadow: 0 6px 20px rgba(0,119,181,0.4);
}

/* Example buttons */
.example-btn {
    background: linear-gradient(135deg, #e3f2fd 0%, #bbdefb 100%);
    color: #0077B5;
    border: 1px solid #0077B5;
    border-radius: 25px;
    padding: 0.6rem 1.2rem;
    font-weight: 500;
    margin: 0.3rem;
    transition: all 0.3s ease;
    font-size: 0.9rem;
}

.example-btn:hover {
    background: linear-gradient(135deg, #0077B5 0%, #00A0DC 100%);
    color: white;
    transform: translateY(-1px);
    box-shadow: 0 4px 12px rgba(0,119,181,0.3);
}

/* Input styling */
.input-text {
    border: 2px solid #e1e8ed;
    border-radius: 12px;
    padding: 1rem;
    font-size: 1rem;
    transition: all 0.3s ease;
    background: #f8fafc;
}

.input-text:focus {
    border-color: #0077B5;
    box-shadow: 0 0 0 3px rgba(0,119,181,0.1);
    background: white;
}

/* Chatbot styling */
.chatbot {
    border: none;
    border-radius: 16px;
    box-shadow: inset 0 2px 10px rgba(0,0,0,0.05);
}

/* Accordion styling */
.accordion {
    background: linear-gradient(135deg, #f8fafc 0%, #e1e8ed 100%);
    border-radius: 12px;
    border: 1px solid #e1e8ed;
}
"""
|
||
|
||
# Create Gradio interface
# Layout: header -> upload section -> status/usage panels -> chat -> example
# questions -> about -> footer. Event wiring is at the end of the context.
with gr.Blocks(css=premium_css, title="LinkedIn Profile AI Assistant", theme=gr.themes.Soft()) as interface:

    # Main Header
    gr.HTML("""
    <div class="main-header">
        <h1>🤖 LinkedIn Profile AI Assistant</h1>
        <p>Intelligent insights with clickable sources to original LinkedIn content</p>
    </div>
    """)

    # Upload Section
    with gr.Column(elem_classes=["upload-container"]):
        gr.Markdown("### 📁 **Upload Your LinkedIn Data**")
        gr.Markdown("Upload your LinkedIn data export ZIP file to get started. [Learn how to export your LinkedIn data](https://www.linkedin.com/help/linkedin/answer/a1339364)")

        with gr.Row():
            # type="filepath" means the upload handler receives a str path.
            upload_file = gr.File(
                label="LinkedIn Data ZIP File",
                file_types=[".zip"],
                type="filepath"
            )
            upload_btn = gr.Button(
                "🚀 Process Data",
                variant="primary",
                elem_classes=["primary-btn"]
            )

    # Status Display
    status_display = gr.Markdown(
        value="📁 **Upload your LinkedIn data ZIP file above to get started!**",
        elem_classes=["status-card"]
    )

    chat_status = gr.Markdown(
        value="",
        elem_classes=["status-card"]
    )

    # Usage panel: `usage_info` is constructed at module level before this
    # Blocks context exists, so it must be attached here explicitly. Without
    # this it is never displayed and cannot be used as an event output.
    usage_info.render()

    # Main Chat Interface
    with gr.Column(elem_classes=["chat-container"]):

        # Chat Display
        # NOTE(review): avatar_images is documented as image paths/URLs;
        # confirm that emoji strings are accepted by the installed Gradio.
        chatbot = gr.Chatbot(
            label="💬 Professional Profile Assistant",
            height=550,
            show_copy_button=True,
            avatar_images=("👤", "🤖"),
            bubble_full_width=False,
            elem_classes=["chatbot"]
        )

        # Input Section
        with gr.Row():
            with gr.Column(scale=5):
                msg = gr.Textbox(
                    placeholder="Ask about experience, skills, education, articles, or any aspect of the professional profile...",
                    label="Your Question",
                    lines=2,
                    max_lines=4,
                    elem_classes=["input-text"]
                )
            with gr.Column(scale=1, min_width=100):
                submit_btn = gr.Button(
                    "Send 💬",
                    variant="primary",
                    size="lg",
                    elem_classes=["primary-btn"]
                )

        # Quick Action Buttons
        with gr.Row():
            clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary", size="sm")

    # Enhanced Examples Section
    with gr.Accordion("💡 Example Questions - Click to Try", open=False, elem_classes=["accordion"]) as examples_accordion:

        gr.Markdown("### 🎯 **Professional Experience & Career**")
        with gr.Row():
            exp_q1 = gr.Button("What is the professional background?", elem_classes=["example-btn"], size="sm")
            exp_q2 = gr.Button("Describe the career progression", elem_classes=["example-btn"], size="sm")
            exp_q3 = gr.Button("What are the key achievements?", elem_classes=["example-btn"], size="sm")

        gr.Markdown("### 🛠️ **Skills & Expertise**")
        with gr.Row():
            skill_q1 = gr.Button("What skills and expertise are highlighted?", elem_classes=["example-btn"], size="sm")
            skill_q2 = gr.Button("What technologies are mentioned?", elem_classes=["example-btn"], size="sm")
            skill_q3 = gr.Button("What are the main areas of expertise?", elem_classes=["example-btn"], size="sm")

        gr.Markdown("### 📚 **Education & Learning**")
        with gr.Row():
            edu_q1 = gr.Button("Tell me about the educational background", elem_classes=["example-btn"], size="sm")
            edu_q2 = gr.Button("What certifications are mentioned?", elem_classes=["example-btn"], size="sm")
            edu_q3 = gr.Button("What courses or learning paths are included?", elem_classes=["example-btn"], size="sm")

        gr.Markdown("### 📝 **Articles & Content**")
        with gr.Row():
            content_q1 = gr.Button("What articles have been published?", elem_classes=["example-btn"], size="sm")
            content_q2 = gr.Button("What topics are covered in the writing?", elem_classes=["example-btn"], size="sm")
            content_q3 = gr.Button("What is the writing style like?", elem_classes=["example-btn"], size="sm")

    # Connect example buttons to input: each button drops its (longer)
    # full question text into the textbox; the user still has to send it.
    example_questions = [
        (exp_q1, "What is the professional background and experience?"),
        (exp_q2, "Describe the career progression and professional journey"),
        (exp_q3, "What are the key achievements and accomplishments?"),
        (skill_q1, "What skills and expertise are highlighted in the profile?"),
        (skill_q2, "What technologies, tools, and platforms are mentioned?"),
        (skill_q3, "What are the main areas of expertise and specialization?"),
        (edu_q1, "Tell me about the educational background and qualifications"),
        (edu_q2, "What certifications and professional credentials are mentioned?"),
        (edu_q3, "What courses, training, or learning paths are included?"),
        (content_q1, "What articles and content have been published?"),
        (content_q2, "What topics and themes are covered in the published writing?"),
        (content_q3, "What is the writing style and approach in the articles?")
    ]

    for btn, question in example_questions:
        # `q=question` binds the current value; a bare closure would make
        # every button return the last question in the list.
        btn.click(lambda q=question: q, outputs=msg)

    # About Section
    with gr.Accordion("ℹ️ About This AI Assistant", open=False, elem_classes=["accordion"]):
        gr.Markdown("""
        ### 🚀 **Advanced AI-Powered Profile Analysis with Clickable Sources**

        This intelligent assistant uses cutting-edge **Retrieval-Augmented Generation (RAG)** technology to provide accurate, contextual answers about LinkedIn profiles with direct links to original content.

        **🔧 Technical Capabilities:**
        - **Vector Search**: Semantic similarity matching for relevant information retrieval
        - **Cross-Encoder Reranking**: Advanced relevance scoring for precision
        - **GPT-4 Generation**: Natural, human-like response generation
        - **Multi-Source Integration**: Combines structured data and article content
        - **Clickable Sources**: Direct links to original LinkedIn articles and content

        **📊 Data Sources Analyzed:**
        - Professional experience and job history
        - Educational background and certifications
        - Skills, endorsements, and expertise areas
        - Published articles and thought leadership content (with clickable links)
        - Projects, achievements, and recommendations
        - Professional network activities and engagement

        **🔒 Privacy & Security:**
        - Only uses uploaded LinkedIn profile data
        - No external data access or web browsing
        - Responses based solely on uploaded content
        - Secure processing with no data retention

        **⚡ Built with:**
        - Gradio for the interface
        - OpenAI GPT-4 for generation
        - ChromaDB for vector storage
        - Sentence Transformers for embeddings
        - Custom LinkedIn URL extraction
        """)

    # Event Handlers
    upload_btn.click(
        process_upload,
        inputs=[upload_file],
        outputs=[status_display, chat_status]
    )

    def _refresh_usage():
        """Return the tracker's current usage summary for the usage panel."""
        return usage_tracker.get_usage_info()

    # The usage panel is refreshed with .then() so it runs only AFTER the
    # chat handler has finished (and the request has been counted), and for
    # both ways of sending a message: Enter in the textbox and the button.
    msg.submit(
        chat_with_profile, inputs=[msg, chatbot], outputs=[chatbot, msg]
    ).then(
        _refresh_usage,
        outputs=usage_info,
        queue=False
    )
    submit_btn.click(
        chat_with_profile, inputs=[msg, chatbot], outputs=[chatbot, msg]
    ).then(
        _refresh_usage,
        outputs=usage_info,
        queue=False
    )
    clear_btn.click(lambda: [], outputs=chatbot)

    # Footer
    gr.HTML("""
    <div style="text-align: center; margin-top: 2rem; padding: 1rem; color: #666; font-size: 0.9rem;">
        <p>🤖 <strong>LinkedIn Profile AI Assistant</strong> | Powered by Advanced RAG Technology with Clickable Sources</p>
        <p>Built with ❤️ using Gradio, OpenAI GPT-4, ChromaDB, and Custom LinkedIn URL extraction</p>
    </div>
    """)
|
||
|
||
# Launch the interface only when this file is executed as a script, so that
# importing the module does not start the web server.
if __name__ == "__main__":
    interface.launch()