🤖 LinkedIn Profile AI Assistant
Intelligent insights with clickable sources to original LinkedIn content
diff --git a/week5/community-contributions/linkedin-ai-assistant/app.py b/week5/community-contributions/linkedin-ai-assistant/app.py
new file mode 100644
index 0000000..6e3ed3f
--- /dev/null
+++ b/week5/community-contributions/linkedin-ai-assistant/app.py
@@ -0,0 +1,1143 @@
+import os
+import json
+import pandas as pd
+import numpy as np
+from pathlib import Path
+import re
+from datetime import datetime, timedelta
+from bs4 import BeautifulSoup
+import html2text
+from collections import Counter, defaultdict, deque
+import warnings
+import time
+import hashlib
+import socket
+import random
+import zipfile
+import tempfile
+import shutil
+
+warnings.filterwarnings('ignore')
+
+import gradio as gr
+import chromadb
+from sentence_transformers import SentenceTransformer
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+from openai import OpenAI
+import torch
+
+# ================================
+# USAGE PROTECTION SYSTEM
+# ================================
+
+class UsageTracker:
+ def __init__(self):
+ self.hourly_limits = defaultdict(lambda: deque())
+ self.daily_limits = defaultdict(int)
+ self.total_requests = 0
+ self.total_cost = 0.0
+
+ # STRICTER LIMITS for cost control
+ self.max_hourly = 5 # Reduced from 15
+ self.max_daily = 20 # Reduced from 100
+ self.max_total = 200 # Reduced from 1000
+ self.max_daily_cost = 3.0 # $3 daily limit
+
+ # GPT-4o-mini pricing (approximate cost per request)
+ self.cost_per_request = 0.01 # ~1 cent per request (conservative estimate)
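+        # Illustrative math: even at the global cap (200 requests x $0.01),
+        # estimated spend is $2.00, inside the $3.00 daily cost limit.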
+
+ def can_make_request(self, user_id):
+ now = datetime.now()
+ hour_ago = now - timedelta(hours=1)
+
+ # Clean old hourly requests
+ while self.hourly_limits[user_id] and self.hourly_limits[user_id][0] < hour_ago:
+ self.hourly_limits[user_id].popleft()
+
+ # Check limits
+ if len(self.hourly_limits[user_id]) >= self.max_hourly:
+ return False, f"ā° Hourly limit reached ({self.max_hourly} requests/hour). Please try again in a few minutes."
+
+ if self.daily_limits[user_id] >= self.max_daily:
+ return False, f"š
Daily limit reached ({self.max_daily} requests/day). Come back tomorrow!"
+
+ if self.total_requests >= self.max_total:
+ return False, "š« Service temporarily unavailable due to high usage. Please try again later."
+
+ # Check estimated daily cost
+ if self.total_cost >= self.max_daily_cost:
+ return False, f"š° Daily cost limit (${self.max_daily_cost}) reached. Service will reset tomorrow."
+
+ return True, "OK"
+
+ def record_request(self, user_id):
+ now = datetime.now()
+ self.hourly_limits[user_id].append(now)
+ self.daily_limits[user_id] += 1
+ self.total_requests += 1
+ self.total_cost += self.cost_per_request # Track estimated cost
+
+ def get_usage_info(self):
+ """Get current usage info for display"""
+ return f"""
+**📊 Current Usage:**
+- Total requests today: {self.total_requests}/{self.max_total}
+- Estimated cost today: ${self.total_cost:.2f}/${self.max_daily_cost}
+- Service status: {'🟢 Active' if self.total_requests < self.max_total and self.total_cost < self.max_daily_cost else '🔴 Limited'}
+"""
+
+# Global usage tracker instance
+usage_tracker = UsageTracker()
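+
+# Illustrative flow (a sketch, not part of the app's request path): the limiter
+# is consulted before each request and updated once the request is allowed.
+#
+#   allowed, msg = usage_tracker.can_make_request("user123")  # hypothetical ID
+#   if allowed:
+#       usage_tracker.record_request("user123")
+#   else:
+#       print(msg)  # e.g. the hourly/daily limit message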
+
+
+def protected_function(func):
+    """Rate-limit wrapper for the chat handler, which takes (message, history)."""
+    def wrapper(message, history, *args, **kwargs):
+        # Without authentication every call gets a fresh pseudo-ID, so the
+        # per-user limits effectively behave as global rate limits.
+        user_id = hashlib.md5(str(time.time()).encode()).hexdigest()[:8]
+        allowed, limit_msg = usage_tracker.can_make_request(user_id)
+
+        if not allowed:
+            # Return the (history, cleared-textbox) pair the Gradio handler expects
+            history.append((message, f"⚠️ {limit_msg}"))
+            return history, ""
+
+        usage_tracker.record_request(user_id)
+        return func(message, history, *args, **kwargs)
+    return wrapper
+
+# ================================
+# LINKEDIN DATA PROCESSOR
+# ================================
+
+class LinkedInDataProcessor:
+ def __init__(self, data_path):
+ self.data_path = Path(data_path)
+ self.profile_data = {}
+ self.processed_data = {}
+ self.articles_content = []
+ self.rag_documents = []
+
+ def load_all_data(self):
+ """Load all LinkedIn JSON and CSV files including HTML articles"""
+ print("š Loading LinkedIn data...")
+
+ file_mappings = {
+ 'Profile.csv': 'basic_info',
+ 'Connections.csv': 'connections',
+ 'Experience.csv': 'experience',
+ 'Education.csv': 'education',
+ 'Skills.csv': 'skills',
+ 'Certifications.csv': 'certifications',
+ 'Articles.csv': 'articles_metadata',
+ 'Comments.csv': 'comments',
+ 'Shares.csv': 'shares',
+ 'Positions.csv': 'positions',
+ 'Languages.csv': 'languages',
+ 'Projects.csv': 'projects',
+ 'Publications.csv': 'publications',
+ 'Recommendations.csv': 'recommendations',
+ 'Endorsement_Given_Info.csv': 'endorsements_given',
+ 'Endorsement_Received_Info.csv': 'endorsements_received',
+ 'Courses.csv': 'courses',
+ 'Learning.csv': 'learning_paths',
+ 'Interests.csv': 'interests',
+ 'Company Follow.csv': 'companies_followed',
+ 'Reactions.csv': 'reactions',
+ 'Views.csv': 'views',
+ 'Saved_Items.csv': 'saved_items',
+ }
+
+ loaded_count = 0
+ for file_name, data_type in file_mappings.items():
+ file_path = self.data_path / file_name
+ if file_path.exists():
+ try:
+ df = pd.read_csv(file_path, encoding='utf-8')
+ self.profile_data[data_type] = df
+ print(f"ā
Loaded {file_name}: {len(df)} records")
+ loaded_count += 1
+ except Exception as e:
+ print(f"ā ļø Could not load {file_name}: {str(e)}")
+ else:
+ print(f"š {file_name} not found")
+
+ self.load_html_articles()
+ print(f"š Successfully loaded {loaded_count} data files")
+ return loaded_count > 0
+
+ def load_html_articles(self):
+ """Load and parse HTML articles"""
+ print("\nš° Loading HTML articles...")
+
+ articles_paths = [
+ self.data_path / "Articles" / "Articles",
+ self.data_path / "Articles",
+ self.data_path / "articles" / "articles",
+ self.data_path / "articles",
+ ]
+
+ found_path = None
+ for path in articles_paths:
+ if path.exists():
+ found_path = path
+ break
+
+ if not found_path:
+ print("š Articles folder not found")
+ return
+
+ html_files = list(found_path.glob("*.html"))
+ if not html_files:
+ print("š No HTML files found")
+ return
+
+ print(f"š Found {len(html_files)} HTML articles")
+
+ articles_data = []
+ for html_file in html_files:
+ try:
+ article_data = self.parse_html_article(html_file)
+ if article_data:
+ articles_data.append(article_data)
+ except Exception as e:
+ print(f"ā ļø Error parsing {html_file.name}: {str(e)}")
+
+ self.articles_content = articles_data
+ self.profile_data['articles_html'] = articles_data
+ print(f"š Successfully loaded {len(articles_data)} articles")
+
+ def extract_linkedin_url_from_html(self, html_content, filename):
+ """Extract LinkedIn URL from HTML article content"""
+ try:
+ soup = BeautifulSoup(html_content, 'html.parser')
+
+ # Look for canonical URL
+ canonical = soup.find('link', {'rel': 'canonical'})
+ if canonical and canonical.get('href'):
+ url = canonical.get('href')
+ if 'linkedin.com' in url:
+ return url
+
+ # Look for meta property og:url
+ og_url = soup.find('meta', {'property': 'og:url'})
+ if og_url and og_url.get('content'):
+ url = og_url.get('content')
+ if 'linkedin.com' in url:
+ return url
+
+ # Look for any LinkedIn URLs in the content
+ linkedin_pattern = r'https?://(?:www\.)?linkedin\.com/pulse/[^"\s<>]+'
+ matches = re.findall(linkedin_pattern, html_content)
+ if matches:
+ return matches[0]
+
+ # Fallback: construct URL from filename
+ if filename:
+ clean_name = re.sub(r'^\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}\.\d+-', '', filename)
+ clean_name = clean_name.replace('.html', '')
+
+ if len(clean_name) > 10 and '-' in clean_name:
+ return f"https://www.linkedin.com/pulse/{clean_name}/"
+
+ return None
+
+ except Exception as e:
+ print(f"Error extracting LinkedIn URL: {e}")
+ return None
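+
+    # Fallback example (hypothetical filename, mirroring the regex above):
+    #   "2023-05-01 10:00:00.000-building-rag-systems.html"
+    #   -> "https://www.linkedin.com/pulse/building-rag-systems/"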
+
+ def parse_html_article(self, file_path):
+ """Parse individual HTML article with LinkedIn URL extraction"""
+ with open(file_path, 'r', encoding='utf-8') as f:
+ content = f.read()
+
+ soup = BeautifulSoup(content, 'html.parser')
+
+ # Extract title
+ title_elem = soup.find('h1') or soup.find('title')
+ title = title_elem.get_text().strip() if title_elem else self.extract_title_from_filename(file_path.name)
+
+ # Extract LinkedIn URL
+ linkedin_url = self.extract_linkedin_url_from_html(content, file_path.name)
+
+ # Extract content
+ content_selectors = ['article', '.article-content', '.post-content', 'main', '.content', 'body']
+ article_content = None
+ for selector in content_selectors:
+ article_content = soup.select_one(selector)
+ if article_content:
+ break
+
+ if not article_content:
+ article_content = soup.find('body') or soup
+
+ # Convert to plain text
+ h = html2text.HTML2Text()
+ h.ignore_links = True
+ h.ignore_images = True
+ plain_text = h.handle(str(article_content)).strip()
+
+ # Extract metadata
+ words = re.findall(r'\b\w+\b', plain_text.lower())
+
+ return {
+ 'filename': file_path.name,
+ 'title': title,
+ 'content': str(article_content),
+ 'plain_text': plain_text,
+ 'date_published': self.extract_date_from_filename(file_path.name),
+ 'word_count': len(words),
+ 'topics': self.extract_topics(plain_text),
+ 'writing_style': self.analyze_writing_style(plain_text),
+ 'linkedin_url': linkedin_url
+ }
+
+ def extract_title_from_filename(self, filename):
+ """Extract readable title from filename"""
+ title = filename.replace('.html', '')
+ title = re.sub(r'^\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}\.\d+-', '', title)
+ title = title.replace('-', ' ').replace('_', ' ')
+ return ' '.join(word.capitalize() for word in title.split())
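+    # e.g. (hypothetical) "2023-05-01 10:00:00.000-building-rag-systems.html"
+    # -> "Building Rag Systems"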
+
+ def extract_date_from_filename(self, filename):
+ """Extract publication date from filename"""
+ date_match = re.search(r'(\d{4}-\d{2}-\d{2})', filename)
+ return date_match.group(1) if date_match else ''
+
+ def analyze_writing_style(self, text):
+ """Analyze writing style indicators"""
+ text_lower = text.lower()
+ sentences = re.split(r'[.!?]+', text)
+ words = re.findall(r'\b\w+\b', text_lower)
+
+ return {
+ 'word_count': len(words),
+ 'sentence_count': len(sentences),
+ 'avg_sentence_length': len(words) / max(len(sentences), 1),
+ 'question_count': text.count('?'),
+ 'first_person_usage': len(re.findall(r'\b(i|me|my|myself|we|us|our)\b', text_lower)),
+ 'technical_terms': sum(text_lower.count(term) for term in ['algorithm', 'framework', 'methodology', 'data', 'analysis', 'technology']),
+ }
+
+ def extract_topics(self, text, max_topics=10):
+ """Extract main topics from text"""
+ words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())
+ stop_words = {'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'been', 'have', 'has', 'had'}
+ word_freq = Counter(word for word in words if word not in stop_words and len(word) > 3)
+ return [word for word, count in word_freq.most_common(max_topics)]
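+    # Quick sanity check (illustrative): extract_topics("data analysis and the
+    # data framework") -> ["data", "analysis", "framework"], since stop words
+    # and tokens of length <= 3 are filtered out before counting.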
+
+ def create_rag_documents(self):
+ """Create documents for RAG system with LinkedIn URLs"""
+ self.rag_documents = []
+
+ # Process profile data
+ for data_type, data_content in self.profile_data.items():
+ if isinstance(data_content, pd.DataFrame) and not data_content.empty:
+ self.process_dataframe_to_documents(data_content, data_type)
+ elif isinstance(data_content, list) and data_content:
+ self.process_list_to_documents(data_content, data_type)
+
+ # Process articles with LinkedIn URLs
+ if self.articles_content:
+ for article in self.articles_content:
+ if article['plain_text'].strip():
+ self.rag_documents.append({
+ 'text': article['plain_text'],
+ 'title': article['title'],
+ 'source_type': 'article',
+ 'date_published': article['date_published'],
+ 'word_count': article['word_count'],
+ 'topics': article['topics'],
+ 'linkedin_url': article.get('linkedin_url', ''),
+ 'filename': article['filename']
+ })
+
+ print(f"š Created {len(self.rag_documents)} RAG documents with LinkedIn URLs")
+ return self.rag_documents
+
+ def process_dataframe_to_documents(self, df, data_type):
+ """Convert DataFrame to RAG documents"""
+ if data_type == 'experience':
+ for _, row in df.iterrows():
+ text = f"Experience: {row.get('Title', '')} at {row.get('Company', '')}\n"
+ text += f"Duration: {row.get('Started On', '')} - {row.get('Finished On', 'Present')}\n"
+ text += f"Description: {row.get('Description', '')}"
+
+ self.rag_documents.append({
+ 'text': text,
+ 'title': f"{row.get('Title', '')} at {row.get('Company', '')}",
+ 'source_type': 'experience',
+ 'linkedin_url': ''
+ })
+
+ elif data_type == 'education':
+ for _, row in df.iterrows():
+ text = f"Education: {row.get('Degree', '')} in {row.get('Field Of Study', '')} from {row.get('School', '')}\n"
+ text += f"Duration: {row.get('Start Date', '')} - {row.get('End Date', '')}"
+
+ self.rag_documents.append({
+ 'text': text,
+ 'title': f"{row.get('Degree', '')} - {row.get('School', '')}",
+ 'source_type': 'education',
+ 'linkedin_url': ''
+ })
+
+ elif data_type == 'skills':
+ if 'Skill' in df.columns:
+ skills_text = "Professional Skills: " + ", ".join(df['Skill'].dropna().tolist())
+ self.rag_documents.append({
+ 'text': skills_text,
+ 'title': 'Professional Skills',
+ 'source_type': 'skills',
+ 'linkedin_url': ''
+ })
+
+ elif data_type == 'certifications':
+ if 'Name' in df.columns:
+ certs_text = "Certifications: " + ", ".join(df['Name'].dropna().tolist())
+ self.rag_documents.append({
+ 'text': certs_text,
+ 'title': 'Certifications',
+ 'source_type': 'certifications',
+ 'linkedin_url': ''
+ })
+
+ elif data_type == 'projects':
+ for _, row in df.iterrows():
+ text = f"Project: {row.get('Title', '')}\n"
+ text += f"Description: {row.get('Description', '')}\n"
+ text += f"URL: {row.get('Url', '')}"
+
+ project_url = row.get('Url', '')
+ linkedin_url = project_url if 'linkedin.com' in str(project_url) else ''
+
+ self.rag_documents.append({
+ 'text': text,
+ 'title': row.get('Title', 'Project'),
+ 'source_type': 'projects',
+ 'linkedin_url': linkedin_url
+ })
+
+    def process_list_to_documents(self, data_list, data_type):
+        """Convert list data to RAG documents (HTML articles are already handled in create_rag_documents)"""
+        if data_type == 'articles_html':
+            return
+
+ def get_profile_summary(self):
+ """Get comprehensive profile summary"""
+ summary = {
+ 'total_documents': len(self.rag_documents),
+ 'articles_count': len(self.articles_content),
+ 'data_types': list(self.profile_data.keys()),
+ 'skills_count': len(self.profile_data.get('skills', [])),
+ 'experience_count': len(self.profile_data.get('experience', [])),
+ 'education_count': len(self.profile_data.get('education', [])),
+ }
+
+ if self.articles_content:
+ total_words = sum(article['word_count'] for article in self.articles_content)
+ summary['total_article_words'] = total_words
+ summary['avg_article_length'] = total_words // len(self.articles_content)
+
+ return summary
+
+# ================================
+# RAG SYSTEM
+# ================================
+
+class LinkedInRAGSystem:
+ def __init__(self, chroma_db_path):
+ self.chroma_db_path = chroma_db_path
+ self.embedding_model = None
+ self.cross_encoder_model = None
+ self.cross_encoder_tokenizer = None
+ self.chroma_client = None
+ self.collection = None
+ self.openai_client = None
+
+ def initialize_models(self):
+ """Initialize all required models"""
+ print("š Initializing RAG models...")
+
+ # Initialize OpenAI client
+ try:
+ api_key = os.getenv('OPENAI_API_KEY')
+ if not api_key:
+ print("ā OpenAI API key not found in environment variables")
+ return False
+ self.openai_client = OpenAI(api_key=api_key)
+ print("ā
OpenAI client initialized")
+ except Exception as e:
+ print(f"ā Failed to initialize OpenAI client: {e}")
+ return False
+
+ # Load embedding model
+ try:
+ self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
+ print("ā
Embedding model loaded")
+ except Exception as e:
+ print(f"ā Failed to load embedding model: {e}")
+ return False
+
+ # Load cross-encoder for reranking
+ try:
+ cross_encoder_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"
+ self.cross_encoder_tokenizer = AutoTokenizer.from_pretrained(cross_encoder_name)
+ self.cross_encoder_model = AutoModelForSequenceClassification.from_pretrained(cross_encoder_name)
+ print("ā
Cross-encoder model loaded")
+ except Exception as e:
+ print(f"ā Failed to load cross-encoder: {e}")
+ return False
+
+ # Initialize ChromaDB
+ try:
+ self.chroma_client = chromadb.PersistentClient(path=self.chroma_db_path)
+ print("ā
ChromaDB initialized")
+ except Exception as e:
+ print(f"ā Failed to initialize ChromaDB: {e}")
+ return False
+
+ return True
+
+ def create_vector_store(self, documents):
+ """Create vector store from documents with enhanced metadata"""
+ print("š Creating vector store with LinkedIn URLs...")
+
+ # Delete existing collection if it exists
+ try:
+ self.chroma_client.delete_collection("linkedin_profile")
+        except Exception:
+ pass
+
+ # Create new collection
+ self.collection = self.chroma_client.create_collection("linkedin_profile")
+
+ # Generate embeddings
+ texts = [doc['text'] for doc in documents]
+ embeddings = self.embedding_model.encode(texts, show_progress_bar=True)
+
+ # Prepare data for ChromaDB with enhanced metadata
+ ids = [f"doc_{i}" for i in range(len(documents))]
+ metadatas = []
+
+ for doc in documents:
+ metadata = {}
+ for k, v in doc.items():
+ if k != 'text':
+ if k == 'linkedin_url' and v:
+ metadata[k] = str(v)
+ elif k == 'date_published' and v:
+ metadata[k] = str(v)
+ elif k == 'topics' and isinstance(v, list):
+ metadata[k] = ', '.join(v) if v else ''
+ elif v is not None:
+ metadata[k] = str(v)
+ else:
+ metadata[k] = ''
+ metadatas.append(metadata)
+
+ # Add to collection
+ batch_size = 100
+ for i in range(0, len(texts), batch_size):
+ end_idx = min(i + batch_size, len(texts))
+ self.collection.add(
+ embeddings=embeddings[i:end_idx].tolist(),
+ documents=texts[i:end_idx],
+ metadatas=metadatas[i:end_idx],
+ ids=ids[i:end_idx]
+ )
+
+ print(f"ā
Vector store created with {self.collection.count()} documents")
+ return True
+
+ def retrieve_and_rerank(self, query, initial_k=20, final_n=5):
+ """Retrieve and rerank documents"""
+ if not self.collection:
+ return []
+
+ try:
+ # Initial retrieval
+ query_embedding = self.embedding_model.encode(query).tolist()
+ results = self.collection.query(
+ query_embeddings=[query_embedding],
+ n_results=initial_k,
+ include=['documents', 'metadatas']
+ )
+
+ if not results['documents'][0]:
+ return []
+
+ # Prepare for reranking
+ documents = results['documents'][0]
+ metadatas = results['metadatas'][0]
+
+ # Rerank with cross-encoder
+ pairs = [[query, doc] for doc in documents]
+ inputs = self.cross_encoder_tokenizer(
+ pairs,
+ padding=True,
+ truncation=True,
+ return_tensors='pt',
+ max_length=512
+ )
+
+ with torch.no_grad():
+ scores = self.cross_encoder_model(**inputs).logits.squeeze()
+
+ if scores.dim() == 0:
+ scores = [scores.item()]
+ else:
+ scores = scores.tolist()
+
+ # Sort by score
+ scored_docs = list(zip(documents, metadatas, scores))
+ scored_docs.sort(key=lambda x: x[2], reverse=True)
+
+ # Return top documents
+ return [{'text': doc, 'metadata': meta, 'score': score}
+ for doc, meta, score in scored_docs[:final_n]]
+
+ except Exception as e:
+ print(f"Error in retrieve_and_rerank: {e}")
+ return []
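+
+    # Usage sketch (assumes the vector store has been built; `rag` is a
+    # hypothetical LinkedInRAGSystem instance):
+    #   top_docs = rag.retrieve_and_rerank("machine learning experience")
+    # The bi-encoder recalls ~20 candidates; the cross-encoder then reorders
+    # them and the top 5 are returned with their metadata and scores.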
+
+ def generate_response(self, query, retrieved_docs):
+ """Generate response using OpenAI"""
+ if not retrieved_docs:
+ return "I couldn't find relevant information to answer your question."
+
+ context = "\n\n".join([doc['text'] for doc in retrieved_docs])
+
+ messages = [
+ {
+ "role": "system",
+ "content": """You are an AI assistant representing a LinkedIn profile. Answer questions based ONLY on the provided context from the LinkedIn profile data and articles.
+
+Guidelines:
+- Be professional and personable
+- Provide specific details when available
+- If information isn't in the context, politely say so
+- Use first person when appropriate (since you're representing the profile owner)
+- Keep responses concise but informative
+- Do not mention or reference the sources in your response - that will be handled separately"""
+ },
+ {
+ "role": "user",
+ "content": f"Context:\n{context}\n\nQuestion: {query}\n\nPlease answer based on the LinkedIn profile information provided:"
+ }
+ ]
+
+ try:
+ response = self.openai_client.chat.completions.create(
+ model="gpt-4o-mini",
+ messages=messages,
+ max_tokens=400,
+ temperature=0.3,
+ top_p=0.9
+ )
+ return response.choices[0].message.content.strip()
+ except Exception as e:
+ return f"Sorry, I encountered an error generating a response: {str(e)}"
+
+ def format_sources_with_links(self, retrieved_docs):
+ """Format sources with clickable LinkedIn links"""
+ if not retrieved_docs:
+ return ""
+
+ sources_html = "
**š Sources:**
"
+
+ for i, doc in enumerate(retrieved_docs, 1):
+ metadata = doc['metadata']
+ source_type = metadata.get('source_type', 'Unknown')
+ title = metadata.get('title', 'Untitled')
+ linkedin_url = metadata.get('linkedin_url', '')
+ date_published = metadata.get('date_published', '')
+
+ # Create source entry
+ if linkedin_url:
+ # Clickable LinkedIn link
+ source_entry = f"š {title}"
+ if date_published:
+ source_entry += f" ({date_published})"
+ else:
+ # No link available
+ source_entry = f"š **{title}**"
+ if date_published:
+ source_entry += f" ({date_published})"
+
+ # Add source type badge
+ type_color = {
+ 'article': '#0077B5',
+ 'experience': '#2D7D32',
+ 'education': '#7B1FA2',
+ 'skills': '#F57C00',
+ 'projects': '#D32F2F',
+ 'certifications': '#1976D2'
+ }.get(source_type, '#666')
+
+            source_type_badge = f' <span style="color: {type_color}; font-size: 0.8em;">[{source_type.title()}]</span>'
+
+ sources_html += f"{i}. {source_entry}{source_type_badge}
"
+
+ return sources_html
+
+ def chat(self, query):
+ """Main chat function with enhanced source linking"""
+ retrieved_docs = self.retrieve_and_rerank(query)
+ response = self.generate_response(query, retrieved_docs)
+
+ # Add formatted sources with links
+ sources_info = self.format_sources_with_links(retrieved_docs)
+
+ return response + sources_info
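+
+# Minimal end-to-end sketch (assumes OPENAI_API_KEY is set and `documents`
+# came from LinkedInDataProcessor.create_rag_documents()):
+#
+#   rag = LinkedInRAGSystem("./chroma_db")
+#   if rag.initialize_models() and rag.create_vector_store(documents):
+#       print(rag.chat("What are the main skills on this profile?"))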
+
+# ================================
+# UTILITY FUNCTIONS
+# ================================
+
+def extract_uploaded_data(zip_file_path, extract_to):
+ """Extract uploaded LinkedIn data zip file"""
+ try:
+ with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
+ zip_ref.extractall(extract_to)
+ print(f"ā
Extracted data to {extract_to}")
+ return True
+ except Exception as e:
+ print(f"ā Failed to extract zip file: {e}")
+ return False
+
+def initialize_linkedin_chatbot(data_path):
+ """Initialize the complete LinkedIn chatbot system with clickable sources"""
+ print("š Initializing LinkedIn Profile Chatbot with clickable sources...")
+
+ # Step 1: Load and process data
+ processor = LinkedInDataProcessor(data_path)
+ if not processor.load_all_data():
+ return None, "Failed to load LinkedIn data. Please check the uploaded data."
+
+ # Step 2: Create RAG documents with LinkedIn URLs
+ documents = processor.create_rag_documents()
+ if not documents:
+ return None, "No documents created from LinkedIn data."
+
+ # Count articles with LinkedIn URLs
+ articles_with_urls = sum(1 for doc in documents if doc.get('linkedin_url') and doc.get('source_type') == 'article')
+
+ # Step 3: Initialize RAG system
+ temp_db_path = tempfile.mkdtemp()
+ rag_system = LinkedInRAGSystem(temp_db_path)
+ if not rag_system.initialize_models():
+ return None, "Failed to initialize RAG models."
+
+ # Step 4: Create vector store
+ if not rag_system.create_vector_store(documents):
+ return None, "Failed to create vector store."
+
+ # Step 5: Get profile summary
+ summary = processor.get_profile_summary()
+
+ # Create a clean status message
+ summary_text = f"""
+### ✅ **AI Assistant Ready with Clickable Sources!**
+
+I have successfully analyzed the LinkedIn profile data including **{summary['total_documents']} documents** and **{summary['articles_count']} published articles** ({articles_with_urls} with direct LinkedIn links).
+
+**💼 What I can help you discover:**
+- 🎯 **Professional Journey** - Career progression and experience
+- 🛠️ **Skills & Expertise** - Technical and professional capabilities
+- 🎓 **Educational Background** - Academic achievements and learning
+- 📝 **Published Content** - Articles with direct LinkedIn links
+- 🏆 **Projects & Achievements** - Notable work and accomplishments
+- 🌐 **Professional Network** - Industry connections and activities
+
+**🔗 Enhanced Features:**
+- **Clickable Sources** - Direct links to LinkedIn articles and content
+- **Smart Source Attribution** - See exactly where information comes from
+- **Professional Context** - Answers based on real LinkedIn profile data
+
+**Ready to explore this professional profile!** Ask me anything you'd like to know.
+"""
+
+ return rag_system, summary_text
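+
+# Example wiring outside Gradio (hypothetical zip path):
+#
+#   workdir = tempfile.mkdtemp()
+#   if extract_uploaded_data("./linkedin_export.zip", workdir):
+#       rag, status = initialize_linkedin_chatbot(workdir)
+#       print(status)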
+
+# ================================
+# GRADIO INTERFACE
+# ================================
+
+# Global variables
+current_rag_system = None
+current_status = "Upload your LinkedIn data to get started!"
+
+# Usage info component; call usage_info.render() inside the Blocks layout
+# (after the status display) so it appears in the interface.
+usage_info = gr.Markdown(value=usage_tracker.get_usage_info())
+
+def process_upload(zip_file):
+ """Process uploaded LinkedIn data"""
+ global current_rag_system, current_status
+
+ if zip_file is None:
+ return "Please upload a LinkedIn data ZIP file first.", ""
+
+ try:
+ # Create temporary directory for extraction
+ temp_dir = tempfile.mkdtemp()
+
+ # Extract the uploaded file
+ if extract_uploaded_data(zip_file.name, temp_dir):
+ # Initialize the RAG system
+ rag_system, status_message = initialize_linkedin_chatbot(temp_dir)
+
+ if rag_system:
+ current_rag_system = rag_system
+ current_status = status_message
+ return status_message, "ā
**Ready to chat!** Ask me anything about the LinkedIn profile."
+ else:
+ return f"ā Failed to initialize: {status_message}", ""
+ else:
+ return "ā Failed to extract uploaded file.", ""
+
+ except Exception as e:
+ return f"ā Error processing upload: {str(e)}", ""
+
+@protected_function
+def chat_with_profile(message, history):
+ """Chat function with protection"""
+ global current_rag_system
+
+ if current_rag_system is None:
+ bot_response = "ā **Please upload your LinkedIn data first using the file upload above.**"
+ history.append((message, bot_response))
+ return history, ""
+
+ if not message.strip():
+ bot_response = "š Please enter a question about the LinkedIn profile!"
+ history.append((message, bot_response))
+ return history, ""
+
+ try:
+ bot_response = current_rag_system.chat(message)
+ history.append((message, bot_response))
+ except Exception as e:
+ bot_response = f"ā **Error**: {str(e)}"
+ history.append((message, bot_response))
+
+ return history, ""
+
+# Premium CSS
+premium_css = """
+/* Import Google Fonts */
+@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
+
+/* Main container styling */
+.gradio-container {
+ font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif !important;
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+ min-height: 100vh;
+}
+
+/* Header styling */
+.main-header {
+ background: linear-gradient(135deg, #0077B5 0%, #00A0DC 50%, #40E0D0 100%);
+ color: white;
+ padding: 2rem;
+ border-radius: 20px;
+ margin-bottom: 2rem;
+ text-align: center;
+ box-shadow: 0 10px 30px rgba(0,119,181,0.3);
+ border: 1px solid rgba(255,255,255,0.2);
+ backdrop-filter: blur(10px);
+}
+
+.main-header h1 {
+ font-size: 2.5rem;
+ font-weight: 700;
+ margin-bottom: 0.5rem;
+ text-shadow: 2px 2px 4px rgba(0,0,0,0.3);
+}
+
+.main-header p {
+ font-size: 1.2rem;
+ opacity: 0.95;
+ font-weight: 400;
+}
+
+/* Status card styling */
+.status-card {
+ background: linear-gradient(135deg, #ffffff 0%, #f8fafc 100%);
+ border-radius: 16px;
+ padding: 1.5rem;
+ margin-bottom: 2rem;
+ box-shadow: 0 8px 25px rgba(0,0,0,0.1);
+ border: 1px solid rgba(0,119,181,0.1);
+}
+
+/* Chat container */
+.chat-container {
+ background: white;
+ border-radius: 20px;
+ padding: 1.5rem;
+ box-shadow: 0 10px 40px rgba(0,0,0,0.1);
+ border: 1px solid rgba(0,119,181,0.1);
+ max-width: 900px;
+ margin: 0 auto;
+}
+
+/* Upload container */
+.upload-container {
+ background: linear-gradient(135deg, #e3f2fd 0%, #bbdefb 100%);
+ border-radius: 16px;
+ padding: 1.5rem;
+ margin-bottom: 2rem;
+ border: 2px dashed #0077B5;
+}
+
+/* Button styling */
+.primary-btn {
+ background: linear-gradient(135deg, #0077B5 0%, #00A0DC 100%);
+ color: white;
+ border: none;
+ border-radius: 12px;
+ padding: 0.75rem 1.5rem;
+ font-weight: 600;
+ transition: all 0.3s ease;
+ box-shadow: 0 4px 15px rgba(0,119,181,0.3);
+}
+
+.primary-btn:hover {
+ transform: translateY(-2px);
+ box-shadow: 0 6px 20px rgba(0,119,181,0.4);
+}
+
+/* Example buttons */
+.example-btn {
+ background: linear-gradient(135deg, #e3f2fd 0%, #bbdefb 100%);
+ color: #0077B5;
+ border: 1px solid #0077B5;
+ border-radius: 25px;
+ padding: 0.6rem 1.2rem;
+ font-weight: 500;
+ margin: 0.3rem;
+ transition: all 0.3s ease;
+ font-size: 0.9rem;
+}
+
+.example-btn:hover {
+ background: linear-gradient(135deg, #0077B5 0%, #00A0DC 100%);
+ color: white;
+ transform: translateY(-1px);
+ box-shadow: 0 4px 12px rgba(0,119,181,0.3);
+}
+
+/* Input styling */
+.input-text {
+ border: 2px solid #e1e8ed;
+ border-radius: 12px;
+ padding: 1rem;
+ font-size: 1rem;
+ transition: all 0.3s ease;
+ background: #f8fafc;
+}
+
+.input-text:focus {
+ border-color: #0077B5;
+ box-shadow: 0 0 0 3px rgba(0,119,181,0.1);
+ background: white;
+}
+
+/* Chatbot styling */
+.chatbot {
+ border: none;
+ border-radius: 16px;
+ box-shadow: inset 0 2px 10px rgba(0,0,0,0.05);
+}
+
+/* Accordion styling */
+.accordion {
+ background: linear-gradient(135deg, #f8fafc 0%, #e1e8ed 100%);
+ border-radius: 12px;
+ border: 1px solid #e1e8ed;
+}
+"""
+
+# Create Gradio interface
+with gr.Blocks(css=premium_css, title="LinkedIn Profile AI Assistant", theme=gr.themes.Soft()) as interface:
+
+ # Main Header
+ gr.HTML("""
+    <div class="main-header">
+        <h1>🤖 LinkedIn Profile AI Assistant</h1>
+        <p>Intelligent insights with clickable sources to original LinkedIn content</p>
+    </div>
+    """)
+
+    # Footer
+    gr.HTML("""
+    <div style="text-align: center; color: white; opacity: 0.9; padding: 1rem;">
+        <p>🤖 LinkedIn Profile AI Assistant | Powered by Advanced RAG Technology with Clickable Sources</p>
+        <p>Built with ❤️ using Gradio, OpenAI GPT-4, ChromaDB, and custom LinkedIn URL extraction</p>
+    </div>
+    """)