import os
import json
import re
import time
import shutil
import socket
import random
import hashlib
import zipfile
import tempfile
import warnings
import functools
from pathlib import Path
from datetime import datetime, timedelta
from collections import Counter, defaultdict, deque

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import html2text

warnings.filterwarnings('ignore')

import gradio as gr
import chromadb
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from openai import OpenAI

# ================================
# USAGE PROTECTION SYSTEM
# ================================

class UsageTracker:
    """Caps request volume (per user, per hour/day, and globally) to bound OpenAI spend."""

    def __init__(self):
        self.hourly_limits = defaultdict(deque)   # user_id -> timestamps of recent requests
        self.daily_limits = defaultdict(int)      # user_id -> request count for the current day
        self.total_requests = 0
        self.total_cost = 0.0
        # FIX: remember which day the counters belong to so they can reset;
        # the original never cleared daily counters, permanently locking the
        # service once any daily limit was reached.
        self._counter_day = datetime.now().date()

        # STRICTER LIMITS for cost control
        self.max_hourly = 5        # requests per user per hour
        self.max_daily = 20        # requests per user per day
        self.max_total = 200       # global requests per day
        self.max_daily_cost = 3.0  # $3 daily limit

        # GPT-4o-mini pricing — conservative flat estimate (~1 cent/request)
        self.cost_per_request = 0.01

    def _roll_day(self):
        """Reset daily counters when the calendar day changes (FIX for permanent lockout)."""
        today = datetime.now().date()
        if today != self._counter_day:
            self._counter_day = today
            self.daily_limits.clear()
            self.total_requests = 0
            self.total_cost = 0.0

    def can_make_request(self, user_id):
        """Return (allowed: bool, message: str) for a prospective request by *user_id*."""
        self._roll_day()
        now = datetime.now()
        hour_ago = now - timedelta(hours=1)

        # Drop request timestamps older than one hour (deque is time-ordered).
        window = self.hourly_limits[user_id]
        while window and window[0] < hour_ago:
            window.popleft()

        if len(window) >= self.max_hourly:
            return False, f"ā° Hourly limit reached ({self.max_hourly} requests/hour). Please try again in a few minutes."

        if self.daily_limits[user_id] >= self.max_daily:
            return False, f"šŸ“… Daily limit reached ({self.max_daily} requests/day). Come back tomorrow!"

        if self.total_requests >= self.max_total:
            return False, "🚫 Service temporarily unavailable due to high usage. Please try again later."

        # Check estimated daily cost
        if self.total_cost >= self.max_daily_cost:
            return False, f"šŸ’° Daily cost limit (${self.max_daily_cost}) reached. Service will reset tomorrow."

        return True, "OK"

    def record_request(self, user_id):
        """Record one request for *user_id* and add its estimated cost."""
        self.hourly_limits[user_id].append(datetime.now())
        self.daily_limits[user_id] += 1
        self.total_requests += 1
        self.total_cost += self.cost_per_request  # track estimated cost

    def get_usage_info(self):
        """Get current usage info for display"""
        active = self.total_requests < self.max_total and self.total_cost < self.max_daily_cost
        return f"""
**šŸ“Š Current Usage:**
- Total requests today: {self.total_requests}/{self.max_total}
- Estimated cost today: ${self.total_cost:.2f}/${self.max_daily_cost}
- Service status: {'🟢 Active' if active else 'šŸ”“ Limited'}
"""

# Initialize tracker - ADD THIS LINE!
usage_tracker = UsageTracker()

# FIX: a stable per-process id.  The original hashed time.time() on EVERY
# call, so each request looked like a brand-new user and the per-user
# hourly/daily limits could never trigger (only the global caps applied).
_SESSION_ID = hashlib.md5(str(time.time()).encode()).hexdigest()[:8]


def protected_function(func):
    """Decorator gating *func* behind the usage tracker's rate/cost limits."""
    @functools.wraps(func)  # FIX: preserve the wrapped function's metadata
    def wrapper(*args, **kwargs):
        allowed, message = usage_tracker.can_make_request(_SESSION_ID)

        if not allowed:
            # NOTE(review): callers such as chat handlers return (history, "")
            # tuples; returning a bare string here relies on Gradio coercing
            # it — confirm the limit path renders correctly in the UI.
            return f"āš ļø {message}. Please try again later."

        usage_tracker.record_request(_SESSION_ID)
        return func(*args, **kwargs)
    return wrapper

# ================================
# LINKEDIN DATA PROCESSOR
# ================================

class LinkedInDataProcessor:
    """Loads a LinkedIn data-export directory (CSVs + HTML articles) and turns it into RAG documents."""

    def __init__(self, data_path):
        self.data_path = Path(data_path)
        self.profile_data = {}      # data_type -> DataFrame or list
        self.processed_data = {}
        self.articles_content = []  # parsed HTML article dicts
        self.rag_documents = []     # flat list of {'text', 'title', 'source_type', ...}

    def load_all_data(self):
        """Load all LinkedIn JSON and CSV files including HTML articles.

        Returns True if at least one CSV file was loaded.
        """
        print("šŸ”„ Loading LinkedIn data...")

        file_mappings = {
            'Profile.csv': 'basic_info',
            'Connections.csv': 'connections',
            'Experience.csv': 'experience',
            'Education.csv': 'education',
            'Skills.csv': 'skills',
            'Certifications.csv': 'certifications',
            'Articles.csv': 'articles_metadata',
            'Comments.csv': 'comments',
            'Shares.csv': 'shares',
            'Positions.csv': 'positions',
            'Languages.csv': 'languages',
            'Projects.csv': 'projects',
            'Publications.csv': 'publications',
            'Recommendations.csv': 'recommendations',
            'Endorsement_Given_Info.csv': 'endorsements_given',
            'Endorsement_Received_Info.csv': 'endorsements_received',
            'Courses.csv': 'courses',
            'Learning.csv': 'learning_paths',
            'Interests.csv': 'interests',
            'Company Follow.csv': 'companies_followed',
            'Reactions.csv': 'reactions',
            'Views.csv': 'views',
            'Saved_Items.csv': 'saved_items',
        }

        loaded_count = 0
        for file_name, data_type in file_mappings.items():
            file_path = self.data_path / file_name
            if file_path.exists():
                try:
                    df = pd.read_csv(file_path, encoding='utf-8')
                    self.profile_data[data_type] = df
                    print(f"āœ… Loaded {file_name}: {len(df)} records")
                    loaded_count += 1
                except Exception as e:
                    print(f"āš ļø Could not load {file_name}: {str(e)}")
            else:
                print(f"šŸ“ {file_name} not found")

        self.load_html_articles()
        print(f"šŸŽ‰ Successfully loaded {loaded_count} data files")
        return loaded_count > 0

    def load_html_articles(self):
        """Load and parse HTML articles from the export's Articles folder (several known layouts)."""
        print("\nšŸ“° Loading HTML articles...")

        articles_paths = [
            self.data_path / "Articles" / "Articles",
            self.data_path / "Articles",
            self.data_path / "articles" / "articles",
            self.data_path / "articles",
        ]

        found_path = next((p for p in articles_paths if p.exists()), None)
        if not found_path:
            print("šŸ“ Articles folder not found")
            return

        html_files = list(found_path.glob("*.html"))
        if not html_files:
            print("šŸ“„ No HTML files found")
            return

        print(f"šŸ“„ Found {len(html_files)} HTML articles")

        articles_data = []
        for html_file in html_files:
            try:
                article_data = self.parse_html_article(html_file)
                if article_data:
                    articles_data.append(article_data)
            except Exception as e:
                print(f"āš ļø Error parsing {html_file.name}: {str(e)}")

        self.articles_content = articles_data
        self.profile_data['articles_html'] = articles_data
        print(f"šŸŽ‰ Successfully loaded {len(articles_data)} articles")

    def extract_linkedin_url_from_html(self, html_content, filename):
        """Extract the LinkedIn article URL from HTML content, trying canonical
        link, og:url meta, inline pulse URLs, then a filename-derived fallback."""
        try:
            soup = BeautifulSoup(html_content, 'html.parser')

            # 1) canonical <link>
            canonical = soup.find('link', {'rel': 'canonical'})
            if canonical and canonical.get('href'):
                url = canonical.get('href')
                if 'linkedin.com' in url:
                    return url

            # 2) <meta property="og:url">
            og_url = soup.find('meta', {'property': 'og:url'})
            if og_url and og_url.get('content'):
                url = og_url.get('content')
                if 'linkedin.com' in url:
                    return url

            # 3) any linkedin.com/pulse/ URL in the raw markup
            linkedin_pattern = r'https?://(?:www\.)?linkedin\.com/pulse/[^"\s<>]+'
            matches = re.findall(linkedin_pattern, html_content)
            if matches:
                return matches[0]

            # 4) fallback: construct the pulse URL from the export filename
            if filename:
                clean_name = re.sub(r'^\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}\.\d+-', '', filename)
                clean_name = clean_name.replace('.html', '')
                if len(clean_name) > 10 and '-' in clean_name:
                    return f"https://www.linkedin.com/pulse/{clean_name}/"

            return None

        except Exception as e:
            print(f"Error extracting LinkedIn URL: {e}")
            return None

    def parse_html_article(self, file_path):
        """Parse one HTML article into a dict with title, plain text, metadata and LinkedIn URL."""
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        soup = BeautifulSoup(content, 'html.parser')

        # Title: first <h1>, then <title>, then derive from the filename.
        title_elem = soup.find('h1') or soup.find('title')
        title = title_elem.get_text().strip() if title_elem else self.extract_title_from_filename(file_path.name)

        linkedin_url = self.extract_linkedin_url_from_html(content, file_path.name)

        # Main content: first matching selector wins; fall back to <body>.
        content_selectors = ['article', '.article-content', '.post-content', 'main', '.content', 'body']
        article_content = None
        for selector in content_selectors:
            article_content = soup.select_one(selector)
            if article_content:
                break
        if not article_content:
            article_content = soup.find('body') or soup

        # Convert HTML to plain text, dropping links and images.
        h = html2text.HTML2Text()
        h.ignore_links = True
        h.ignore_images = True
        plain_text = h.handle(str(article_content)).strip()

        words = re.findall(r'\b\w+\b', plain_text.lower())

        return {
            'filename': file_path.name,
            'title': title,
            'content': str(article_content),
            'plain_text': plain_text,
            'date_published': self.extract_date_from_filename(file_path.name),
            'word_count': len(words),
            'topics': self.extract_topics(plain_text),
            'writing_style': self.analyze_writing_style(plain_text),
            'linkedin_url': linkedin_url
        }

    def extract_title_from_filename(self, filename):
        """Extract a readable, capitalised title from an export filename."""
        title = filename.replace('.html', '')
        title = re.sub(r'^\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}\.\d+-', '', title)
        title = title.replace('-', ' ').replace('_', ' ')
        return ' '.join(word.capitalize() for word in title.split())

    def extract_date_from_filename(self, filename):
        """Extract a YYYY-MM-DD publication date from the filename, or '' if absent."""
        date_match = re.search(r'(\d{4}-\d{2}-\d{2})', filename)
        return date_match.group(1) if date_match else ''

    def analyze_writing_style(self, text):
        """Compute simple writing-style indicators (counts and averages) for *text*."""
        text_lower = text.lower()
        sentences = re.split(r'[.!?]+', text)
        words = re.findall(r'\b\w+\b', text_lower)

        return {
            'word_count': len(words),
            'sentence_count': len(sentences),
            'avg_sentence_length': len(words) / max(len(sentences), 1),
            'question_count': text.count('?'),
            'first_person_usage': len(re.findall(r'\b(i|me|my|myself|we|us|our)\b', text_lower)),
            'technical_terms': sum(text_lower.count(term) for term in
                                   ['algorithm', 'framework', 'methodology', 'data', 'analysis', 'technology']),
        }

    def extract_topics(self, text, max_topics=10):
        """Return up to *max_topics* most frequent non-stopword words (>3 chars) in *text*."""
        words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())
        stop_words = {'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
                      'by', 'is', 'are', 'was', 'were', 'been', 'have', 'has', 'had'}
        word_freq = Counter(word for word in words if word not in stop_words and len(word) > 3)
        return [word for word, count in word_freq.most_common(max_topics)]

    def create_rag_documents(self):
        """Flatten loaded profile data and articles into self.rag_documents for indexing."""
        self.rag_documents = []

        # Structured profile data (DataFrames) and list-based data.
        for data_type, data_content in self.profile_data.items():
            if isinstance(data_content, pd.DataFrame) and not data_content.empty:
                self.process_dataframe_to_documents(data_content, data_type)
            elif isinstance(data_content, list) and data_content:
                self.process_list_to_documents(data_content, data_type)

        # Articles carry their LinkedIn URL so sources can link back.
        for article in self.articles_content:
            if article['plain_text'].strip():
                self.rag_documents.append({
                    'text': article['plain_text'],
                    'title': article['title'],
                    'source_type': 'article',
                    'date_published': article['date_published'],
                    'word_count': article['word_count'],
                    'topics': article['topics'],
                    'linkedin_url': article.get('linkedin_url', ''),
                    'filename': article['filename']
                })

        print(f"šŸ“š Created {len(self.rag_documents)} RAG documents with LinkedIn URLs")
        return self.rag_documents

    def process_dataframe_to_documents(self, df, data_type):
        """Convert one structured DataFrame (experience, education, ...) into RAG documents."""
        if data_type == 'experience':
            for _, row in df.iterrows():
                text = f"Experience: {row.get('Title', '')} at {row.get('Company', '')}\n"
                text += f"Duration: {row.get('Started On', '')} - {row.get('Finished On', 'Present')}\n"
                text += f"Description: {row.get('Description', '')}"
                self.rag_documents.append({
                    'text': text,
                    'title': f"{row.get('Title', '')} at {row.get('Company', '')}",
                    'source_type': 'experience',
                    'linkedin_url': ''
                })

        elif data_type == 'education':
            for _, row in df.iterrows():
                text = f"Education: {row.get('Degree', '')} in {row.get('Field Of Study', '')} from {row.get('School', '')}\n"
                text += f"Duration: {row.get('Start Date', '')} - {row.get('End Date', '')}"
                self.rag_documents.append({
                    'text': text,
                    'title': f"{row.get('Degree', '')} - {row.get('School', '')}",
                    'source_type': 'education',
                    'linkedin_url': ''
                })

        elif data_type == 'skills':
            if 'Skill' in df.columns:
                skills_text = "Professional Skills: " + ", ".join(df['Skill'].dropna().tolist())
                self.rag_documents.append({
                    'text': skills_text,
                    'title': 'Professional Skills',
                    'source_type': 'skills',
                    'linkedin_url': ''
                })

        elif data_type == 'certifications':
            if 'Name' in df.columns:
                certs_text = "Certifications: " + ", ".join(df['Name'].dropna().tolist())
                self.rag_documents.append({
                    'text': certs_text,
                    'title': 'Certifications',
                    'source_type': 'certifications',
                    'linkedin_url': ''
                })

        elif data_type == 'projects':
            for _, row in df.iterrows():
                text = f"Project: {row.get('Title', '')}\n"
                text += f"Description: {row.get('Description', '')}\n"
                text += f"URL: {row.get('Url', '')}"
                # Keep the project URL only when it points at LinkedIn.
                project_url = row.get('Url', '')
                linkedin_url = project_url if 'linkedin.com' in str(project_url) else ''
                self.rag_documents.append({
                    'text': text,
                    'title': row.get('Title', 'Project'),
                    'source_type': 'projects',
                    'linkedin_url': linkedin_url
                })

    def process_list_to_documents(self, data_list, data_type):
        """Convert list data to RAG documents (articles_html is handled separately)."""
        if data_type == 'articles_html':
            return

    def get_profile_summary(self):
        """Return counts summarising the loaded profile data and articles."""
        summary = {
            'total_documents': len(self.rag_documents),
            'articles_count': len(self.articles_content),
            'data_types': list(self.profile_data.keys()),
            'skills_count': len(self.profile_data.get('skills', [])),
            'experience_count': len(self.profile_data.get('experience', [])),
            'education_count': len(self.profile_data.get('education', [])),
        }

        if self.articles_content:
            total_words = sum(article['word_count'] for article in self.articles_content)
            summary['total_article_words'] = total_words
            summary['avg_article_length'] = total_words // len(self.articles_content)

        return summary

# ================================
# RAG SYSTEM
# ================================

class LinkedInRAGSystem:
    """Embedding + cross-encoder rerank + GPT-4o-mini answer generation over a ChromaDB store."""

    def __init__(self, chroma_db_path):
        self.chroma_db_path = chroma_db_path
        self.embedding_model = None
        self.cross_encoder_model = None
        self.cross_encoder_tokenizer = None
        self.chroma_client = None
        self.collection = None
        self.openai_client = None

    def initialize_models(self):
        """Initialize OpenAI client, embedding model, cross-encoder and ChromaDB.

        Returns False (after printing the reason) on the first failure.
        """
        print("šŸ”„ Initializing RAG models...")

        try:
            api_key = os.getenv('OPENAI_API_KEY')
            if not api_key:
                print("āŒ OpenAI API key not found in environment variables")
                return False
            self.openai_client = OpenAI(api_key=api_key)
            print("āœ… OpenAI client initialized")
        except Exception as e:
            print(f"āŒ Failed to initialize OpenAI client: {e}")
            return False

        try:
            self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
            print("āœ… Embedding model loaded")
        except Exception as e:
            print(f"āŒ Failed to load embedding model: {e}")
            return False

        try:
            cross_encoder_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"
            self.cross_encoder_tokenizer = AutoTokenizer.from_pretrained(cross_encoder_name)
            self.cross_encoder_model = AutoModelForSequenceClassification.from_pretrained(cross_encoder_name)
            print("āœ… Cross-encoder model loaded")
        except Exception as e:
            print(f"āŒ Failed to load cross-encoder: {e}")
            return False

        try:
            self.chroma_client = chromadb.PersistentClient(path=self.chroma_db_path)
            print("āœ… ChromaDB initialized")
        except Exception as e:
            print(f"āŒ Failed to initialize ChromaDB: {e}")
            return False

        return True

    def create_vector_store(self, documents):
        """(Re)build the 'linkedin_profile' collection from *documents* with URL metadata."""
        print("šŸ”„ Creating vector store with LinkedIn URLs...")

        # Drop any stale collection; absence is expected on first run.
        try:
            self.chroma_client.delete_collection("linkedin_profile")
        except Exception:  # FIX: was a bare except:, which also swallows KeyboardInterrupt
            pass

        self.collection = self.chroma_client.create_collection("linkedin_profile")

        texts = [doc['text'] for doc in documents]
        embeddings = self.embedding_model.encode(texts, show_progress_bar=True)

        ids = [f"doc_{i}" for i in range(len(documents))]
        metadatas = []
        for doc in documents:
            # Chroma metadata values must be scalars: stringify everything,
            # join topic lists, and map None to ''.
            metadata = {}
            for k, v in doc.items():
                if k == 'text':
                    continue
                if k == 'topics' and isinstance(v, list):
                    metadata[k] = ', '.join(v) if v else ''
                elif v is not None:
                    metadata[k] = str(v)
                else:
                    metadata[k] = ''
            metadatas.append(metadata)

        # Insert in batches to keep individual add() calls small.
        batch_size = 100
        for i in range(0, len(texts), batch_size):
            end_idx = min(i + batch_size, len(texts))
            self.collection.add(
                embeddings=embeddings[i:end_idx].tolist(),
                documents=texts[i:end_idx],
                metadatas=metadatas[i:end_idx],
                ids=ids[i:end_idx]
            )

        print(f"āœ… Vector store created with {self.collection.count()} documents")
        return True

    def retrieve_and_rerank(self, query, initial_k=20, final_n=5):
        """Vector-retrieve *initial_k* candidates, rerank with the cross-encoder, return top *final_n*."""
        if not self.collection:
            return []

        try:
            query_embedding = self.embedding_model.encode(query).tolist()
            results = self.collection.query(
                query_embeddings=[query_embedding],
                n_results=initial_k,
                include=['documents', 'metadatas']
            )

            if not results['documents'][0]:
                return []

            documents = results['documents'][0]
            metadatas = results['metadatas'][0]

            # Score each (query, document) pair with the cross-encoder.
            pairs = [[query, doc] for doc in documents]
            inputs = self.cross_encoder_tokenizer(
                pairs,
                padding=True,
                truncation=True,
                return_tensors='pt',
                max_length=512
            )

            with torch.no_grad():
                scores = self.cross_encoder_model(**inputs).logits.squeeze()

            # squeeze() yields a 0-d tensor when only one document was retrieved.
            scores = [scores.item()] if scores.dim() == 0 else scores.tolist()

            scored_docs = sorted(zip(documents, metadatas, scores),
                                 key=lambda x: x[2], reverse=True)

            return [{'text': doc, 'metadata': meta, 'score': score}
                    for doc, meta, score in scored_docs[:final_n]]

        except Exception as e:
            print(f"Error in retrieve_and_rerank: {e}")
            return []

    def generate_response(self, query, retrieved_docs):
        """Generate a grounded answer with GPT-4o-mini from the retrieved context."""
        if not retrieved_docs:
            return "I couldn't find relevant information to answer your question."

        context = "\n\n".join([doc['text'] for doc in retrieved_docs])

        messages = [
            {
                "role": "system",
                "content": """You are an AI assistant representing a LinkedIn profile. Answer questions based ONLY on the provided context from the LinkedIn profile data and articles.

Guidelines:
- Be professional and personable
- Provide specific details when available
- If information isn't in the context, politely say so
- Use first person when appropriate (since you're representing the profile owner)
- Keep responses concise but informative
- Do not mention or reference the sources in your response - that will be handled separately"""
            },
            {
                "role": "user",
                "content": f"Context:\n{context}\n\nQuestion: {query}\n\nPlease answer based on the LinkedIn profile information provided:"
            }
        ]

        try:
            response = self.openai_client.chat.completions.create(
                model="gpt-4o-mini",
                messages=messages,
                max_tokens=400,
                temperature=0.3,
                top_p=0.9
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            return f"Sorry, I encountered an error generating a response: {str(e)}"

    def format_sources_with_links(self, retrieved_docs):
        """Format sources as a markdown/HTML footer with clickable LinkedIn links.

        NOTE(review): the original markup was corrupted in transit (HTML tags
        stripped from the f-strings); the anchor/span markup below is a
        reconstruction consistent with the surrounding comments and the
        type_color table — verify rendering in the Gradio chatbot.
        """
        if not retrieved_docs:
            return ""

        sources_html = "\n\n---\n**šŸ“š Sources:**\n"

        for i, doc in enumerate(retrieved_docs, 1):
            metadata = doc['metadata']
            source_type = metadata.get('source_type', 'Unknown')
            title = metadata.get('title', 'Untitled')
            linkedin_url = metadata.get('linkedin_url', '')
            date_published = metadata.get('date_published', '')

            if linkedin_url:
                # Clickable LinkedIn link
                source_entry = f'šŸ”— <a href="{linkedin_url}" target="_blank">{title}</a>'
            else:
                # No link available
                source_entry = f"šŸ“„ **{title}**"
            if date_published:
                source_entry += f" ({date_published})"

            # Colour-coded badge per source type.
            type_color = {
                'article': '#0077B5',
                'experience': '#2D7D32',
                'education': '#7B1FA2',
                'skills': '#F57C00',
                'projects': '#D32F2F',
                'certifications': '#1976D2'
            }.get(source_type, '#666')
            source_type_badge = f' <span style="color: {type_color}; font-size: 0.85em;">[{source_type.title()}]</span>'

            sources_html += f"{i}. {source_entry}{source_type_badge}\n"

        return sources_html

    def chat(self, query):
        """Retrieve, answer, and append the formatted source footer."""
        retrieved_docs = self.retrieve_and_rerank(query)
        response = self.generate_response(query, retrieved_docs)
        sources_info = self.format_sources_with_links(retrieved_docs)
        return response + sources_info

# ================================
# UTILITY FUNCTIONS
# ================================

def extract_uploaded_data(zip_file_path, extract_to):
    """Extract an uploaded LinkedIn data ZIP into *extract_to*; returns True on success.

    NOTE(review): extractall() on untrusted archives is only as safe as the
    zipfile module's member-name sanitisation — consider validating member
    paths if this is deployed publicly.
    """
    try:
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(extract_to)
        print(f"āœ… Extracted data to {extract_to}")
        return True
    except Exception as e:
        print(f"āŒ Failed to extract zip file: {e}")
        return False

def initialize_linkedin_chatbot(data_path):
    """Build the full pipeline (load data -> RAG docs -> models -> vector store).

    Returns (rag_system, status_markdown) on success, or (None, error_message).
    """
    print("šŸš€ Initializing LinkedIn Profile Chatbot with clickable sources...")

    # Step 1: load and process the export.
    processor = LinkedInDataProcessor(data_path)
    if not processor.load_all_data():
        return None, "Failed to load LinkedIn data. Please check the uploaded data."

    # Step 2: flatten into RAG documents (articles keep their LinkedIn URL).
    documents = processor.create_rag_documents()
    if not documents:
        return None, "No documents created from LinkedIn data."

    articles_with_urls = sum(1 for doc in documents
                             if doc.get('linkedin_url') and doc.get('source_type') == 'article')

    # Step 3: models + fresh ChromaDB in a temp directory.
    temp_db_path = tempfile.mkdtemp()
    rag_system = LinkedInRAGSystem(temp_db_path)
    if not rag_system.initialize_models():
        return None, "Failed to initialize RAG models."

    # Step 4: index the documents.
    if not rag_system.create_vector_store(documents):
        return None, "Failed to create vector store."

    # Step 5: human-readable status message for the UI.
    summary = processor.get_profile_summary()
    summary_text = f"""
### āœ… **AI Assistant Ready with Clickable Sources!**

I have successfully analyzed the LinkedIn profile data including **{summary['total_documents']} documents** and **{summary['articles_count']} published articles** ({articles_with_urls} with direct LinkedIn links).

**šŸ’¼ What I can help you discover:**
- šŸŽÆ **Professional Journey** - Career progression and experience
- šŸ› ļø **Skills & Expertise** - Technical and professional capabilities
- šŸŽ“ **Educational Background** - Academic achievements and learning
- šŸ“ **Published Content** - Articles with direct LinkedIn links
- šŸš€ **Projects & Achievements** - Notable work and accomplishments
- 🌐 **Professional Network** - Industry connections and activities

**šŸ”— Enhanced Features:**
- **Clickable Sources** - Direct links to LinkedIn articles and content
- **Smart Source Attribution** - See exactly where information comes from
- **Professional Context** - Answers based on real LinkedIn profile data

**Ready to explore this professional profile!** Ask me anything you'd like to know.
"""

    return rag_system, summary_text

# ================================
# GRADIO INTERFACE
# ================================

# Global state shared by the upload and chat handlers.
current_rag_system = None
current_status = "Upload your LinkedIn data to get started!"
# NOTE(review): this component is created at module level, OUTSIDE the
# `with gr.Blocks()` context below — it will not be rendered in the page
# unless `usage_info.render()` is called inside the layout. Confirm it
# actually appears in the UI.
usage_info = gr.Markdown(value=usage_tracker.get_usage_info())

def process_upload(zip_file):
    """Handle a LinkedIn-export ZIP upload: extract it and build the RAG system.

    Returns (status_markdown, chat_status_markdown) for the two status panels.
    """
    global current_rag_system, current_status

    if zip_file is None:
        return "Please upload a LinkedIn data ZIP file first.", ""

    try:
        # NOTE(review): this temp dir is never cleaned up; consider
        # shutil.rmtree on failure / replacement once the index is built.
        temp_dir = tempfile.mkdtemp()

        if extract_uploaded_data(zip_file.name, temp_dir):
            rag_system, status_message = initialize_linkedin_chatbot(temp_dir)
            if rag_system:
                current_rag_system = rag_system
                current_status = status_message
                return status_message, "āœ… **Ready to chat!** Ask me anything about the LinkedIn profile."
            return f"āŒ Failed to initialize: {status_message}", ""
        return "āŒ Failed to extract uploaded file.", ""

    except Exception as e:
        return f"āŒ Error processing upload: {str(e)}", ""

@protected_function
def chat_with_profile(message, history):
    """Chat handler (rate-limited): append (user, bot) to *history*, clear the textbox."""
    global current_rag_system

    if current_rag_system is None:
        history.append((message, "āŒ **Please upload your LinkedIn data first using the file upload above.**"))
        return history, ""

    if not message.strip():
        history.append((message, "šŸ‘‹ Please enter a question about the LinkedIn profile!"))
        return history, ""

    try:
        bot_response = current_rag_system.chat(message)
    except Exception as e:
        bot_response = f"āŒ **Error**: {str(e)}"
    history.append((message, bot_response))

    return history, ""

# Premium CSS (reconstructed whole here; the literal was split mid-string in the diff)
premium_css = """
/* Import Google Fonts */
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');

/* Main container styling */
.gradio-container {
    font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif !important;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    min-height: 100vh;
}

/* Header styling */
.main-header {
    background: linear-gradient(135deg, #0077B5 0%, #00A0DC 50%, #40E0D0 100%);
    color: white;
    padding: 2rem;
    border-radius: 20px;
    margin-bottom: 2rem;
    text-align: center;
    box-shadow: 0 10px 30px rgba(0,119,181,0.3);
    border: 1px solid rgba(255,255,255,0.2);
    backdrop-filter: blur(10px);
}

.main-header h1 {
    font-size: 2.5rem;
    font-weight: 700;
    margin-bottom: 0.5rem;
    text-shadow: 2px 2px 4px rgba(0,0,0,0.3);
}

.main-header p {
    font-size: 1.2rem;
    opacity: 0.95;
    font-weight: 400;
}

/* Status card styling */
.status-card {
    background: linear-gradient(135deg, #ffffff 0%, #f8fafc 100%);
    border-radius: 16px;
    padding: 1.5rem;
    margin-bottom: 2rem;
    box-shadow: 0 8px 25px rgba(0,0,0,0.1);
    border: 1px solid rgba(0,119,181,0.1);
}

/* Chat container */
.chat-container {
    background: white;
    border-radius: 20px;
    padding: 1.5rem;
    box-shadow: 0 10px 40px rgba(0,0,0,0.1);
    border: 1px solid rgba(0,119,181,0.1);
    max-width: 900px;
    margin: 0 auto;
}

/* Upload container */
.upload-container {
    background: linear-gradient(135deg, #e3f2fd 0%, #bbdefb 100%);
    border-radius: 16px;
    padding: 1.5rem;
    margin-bottom: 2rem;
    border: 2px dashed #0077B5;
}

/* Button styling */
.primary-btn {
    background: linear-gradient(135deg, #0077B5 0%, #00A0DC 100%);
    color: white;
    border: none;
    border-radius: 12px;
    padding: 0.75rem 1.5rem;
    font-weight: 600;
    transition: all 0.3s ease;
    box-shadow: 0 4px 15px rgba(0,119,181,0.3);
}

.primary-btn:hover {
    transform: translateY(-2px);
    box-shadow: 0 6px 20px rgba(0,119,181,0.4);
}

/* Example buttons */
.example-btn {
    background: linear-gradient(135deg, #e3f2fd 0%, #bbdefb 100%);
    color: #0077B5;
    border: 1px solid #0077B5;
    border-radius: 25px;
    padding: 0.6rem 1.2rem;
    font-weight: 500;
    margin: 0.3rem;
    transition: all 0.3s ease;
    font-size: 0.9rem;
}

.example-btn:hover {
    background: linear-gradient(135deg, #0077B5 0%, #00A0DC 100%);
    color: white;
    transform: translateY(-1px);
    box-shadow: 0 4px 12px rgba(0,119,181,0.3);
}

/* Input styling */
.input-text {
    border: 2px solid #e1e8ed;
    border-radius: 12px;
    padding: 1rem;
    font-size: 1rem;
    transition: all 0.3s ease;
    background: #f8fafc;
}

.input-text:focus {
    border-color: #0077B5;
    box-shadow: 0 0 0 3px rgba(0,119,181,0.1);
    background: white;
}

/* Chatbot styling */
.chatbot {
    border: none;
    border-radius: 16px;
    box-shadow: inset 0 2px 10px rgba(0,0,0,0.05);
}

/* Accordion styling */
.accordion {
    background: linear-gradient(135deg, #f8fafc 0%, #e1e8ed 100%);
    border-radius: 12px;
    border: 1px solid #e1e8ed;
}
"""
background: linear-gradient(135deg, #0077B5 0%, #00A0DC 100%); + color: white; + border: none; + border-radius: 12px; + padding: 0.75rem 1.5rem; + font-weight: 600; + transition: all 0.3s ease; + box-shadow: 0 4px 15px rgba(0,119,181,0.3); +} + +.primary-btn:hover { + transform: translateY(-2px); + box-shadow: 0 6px 20px rgba(0,119,181,0.4); +} + +/* Example buttons */ +.example-btn { + background: linear-gradient(135deg, #e3f2fd 0%, #bbdefb 100%); + color: #0077B5; + border: 1px solid #0077B5; + border-radius: 25px; + padding: 0.6rem 1.2rem; + font-weight: 500; + margin: 0.3rem; + transition: all 0.3s ease; + font-size: 0.9rem; +} + +.example-btn:hover { + background: linear-gradient(135deg, #0077B5 0%, #00A0DC 100%); + color: white; + transform: translateY(-1px); + box-shadow: 0 4px 12px rgba(0,119,181,0.3); +} + +/* Input styling */ +.input-text { + border: 2px solid #e1e8ed; + border-radius: 12px; + padding: 1rem; + font-size: 1rem; + transition: all 0.3s ease; + background: #f8fafc; +} + +.input-text:focus { + border-color: #0077B5; + box-shadow: 0 0 0 3px rgba(0,119,181,0.1); + background: white; +} + +/* Chatbot styling */ +.chatbot { + border: none; + border-radius: 16px; + box-shadow: inset 0 2px 10px rgba(0,0,0,0.05); +} + +/* Accordion styling */ +.accordion { + background: linear-gradient(135deg, #f8fafc 0%, #e1e8ed 100%); + border-radius: 12px; + border: 1px solid #e1e8ed; +} +""" + +# Create Gradio interface +with gr.Blocks(css=premium_css, title="LinkedIn Profile AI Assistant", theme=gr.themes.Soft()) as interface: + + # Main Header + gr.HTML(""" +
+

šŸ¤– LinkedIn Profile AI Assistant

+

Intelligent insights with clickable sources to original LinkedIn content

+
# ------------------------------------------------------------------
# NOTE(review): this region is unified-diff residue — each physical
# line below is many patch lines (each originally prefixed with "+")
# joined together, so it does not parse as Python as-is and the
# original indentation (i.e. which component nests inside which
# gr.Column / gr.Row, all inside an enclosing `with gr.Blocks` that
# starts before this chunk) has been lost.  Left byte-identical;
# recover real formatting from the upstream app.py before editing.
#
# What the visible calls build:
#   * Upload section: gr.Column("upload-container") with two
#     gr.Markdown headers, then a gr.Row holding a gr.File restricted
#     to ".zip" (type="filepath") and an "šŸš€ Process Data" gr.Button.
#   * Status area: status_display (gr.Markdown with an initial upload
#     prompt) and chat_status (gr.Markdown, initially empty).
#   * Chat area: gr.Chatbot (height=550, copy button, emoji avatars),
#     a 2-line gr.Textbox `msg` next to a "Send šŸ’¬" submit_btn, and a
#     "šŸ—‘ļø Clear Chat" clear_btn.
#   * Examples accordion: twelve example gr.Buttons paired with full
#     question strings in `example_questions`; each is wired with
#     btn.click(lambda q=question: q, outputs=msg) — the q=question
#     default argument freezes the per-button value and correctly
#     avoids the late-binding-closure pitfall.
#   * About accordion: one static gr.Markdown block describing the app.
# ------------------------------------------------------------------
+ """) + + # Upload Section + with gr.Column(elem_classes=["upload-container"]): + gr.Markdown("### šŸ“ **Upload Your LinkedIn Data**") + gr.Markdown("Upload your LinkedIn data export ZIP file to get started. [Learn how to export your LinkedIn data](https://www.linkedin.com/help/linkedin/answer/a1339364)") + + with gr.Row(): + upload_file = gr.File( + label="LinkedIn Data ZIP File", + file_types=[".zip"], + type="filepath" + ) + upload_btn = gr.Button( + "šŸš€ Process Data", + variant="primary", + elem_classes=["primary-btn"] + ) + + # Status Display + status_display = gr.Markdown( + value="šŸ“ **Upload your LinkedIn data ZIP file above to get started!**", + elem_classes=["status-card"] + ) + + chat_status = gr.Markdown( + value="", + elem_classes=["status-card"] + ) + + # Main Chat Interface + with gr.Column(elem_classes=["chat-container"]): + + # Chat Display + chatbot = gr.Chatbot( + label="šŸ’¬ Professional Profile Assistant", + height=550, + show_copy_button=True, + avatar_images=("šŸ‘¤", "šŸ¤–"), + bubble_full_width=False, + elem_classes=["chatbot"] + ) + + # Input Section + with gr.Row(): + with gr.Column(scale=5): + msg = gr.Textbox( + placeholder="Ask about experience, skills, education, articles, or any aspect of the professional profile...", + label="Your Question", + lines=2, + max_lines=4, + elem_classes=["input-text"] + ) + with gr.Column(scale=1, min_width=100): + submit_btn = gr.Button( + "Send šŸ’¬", + variant="primary", + size="lg", + elem_classes=["primary-btn"] + ) + + # Quick Action Buttons + with gr.Row(): + clear_btn = gr.Button("šŸ—‘ļø Clear Chat", variant="secondary", size="sm") + + # Enhanced Examples Section + with gr.Accordion("šŸ’” Example Questions - Click to Try", open=False, elem_classes=["accordion"]) as examples_accordion: + + gr.Markdown("### šŸŽÆ **Professional Experience & Career**") + with gr.Row(): + exp_q1 = gr.Button("What is the professional background?", elem_classes=["example-btn"], size="sm") + exp_q2 = 
gr.Button("Describe the career progression", elem_classes=["example-btn"], size="sm") + exp_q3 = gr.Button("What are the key achievements?", elem_classes=["example-btn"], size="sm") + + gr.Markdown("### šŸ› ļø **Skills & Expertise**") + with gr.Row(): + skill_q1 = gr.Button("What skills and expertise are highlighted?", elem_classes=["example-btn"], size="sm") + skill_q2 = gr.Button("What technologies are mentioned?", elem_classes=["example-btn"], size="sm") + skill_q3 = gr.Button("What are the main areas of expertise?", elem_classes=["example-btn"], size="sm") + + gr.Markdown("### šŸ“š **Education & Learning**") + with gr.Row(): + edu_q1 = gr.Button("Tell me about the educational background", elem_classes=["example-btn"], size="sm") + edu_q2 = gr.Button("What certifications are mentioned?", elem_classes=["example-btn"], size="sm") + edu_q3 = gr.Button("What courses or learning paths are included?", elem_classes=["example-btn"], size="sm") + + gr.Markdown("### šŸ“ **Articles & Content**") + with gr.Row(): + content_q1 = gr.Button("What articles have been published?", elem_classes=["example-btn"], size="sm") + content_q2 = gr.Button("What topics are covered in the writing?", elem_classes=["example-btn"], size="sm") + content_q3 = gr.Button("What is the writing style like?", elem_classes=["example-btn"], size="sm") + + # Connect example buttons to input + example_questions = [ + (exp_q1, "What is the professional background and experience?"), + (exp_q2, "Describe the career progression and professional journey"), + (exp_q3, "What are the key achievements and accomplishments?"), + (skill_q1, "What skills and expertise are highlighted in the profile?"), + (skill_q2, "What technologies, tools, and platforms are mentioned?"), + (skill_q3, "What are the main areas of expertise and specialization?"), + (edu_q1, "Tell me about the educational background and qualifications"), + (edu_q2, "What certifications and professional credentials are mentioned?"), + (edu_q3, "What 
courses, training, or learning paths are included?"), + (content_q1, "What articles and content have been published?"), + (content_q2, "What topics and themes are covered in the published writing?"), + (content_q3, "What is the writing style and approach in the articles?") + ] + + for btn, question in example_questions: + btn.click(lambda q=question: q, outputs=msg) + + # About Section + with gr.Accordion("ā„¹ļø About This AI Assistant", open=False, elem_classes=["accordion"]): + gr.Markdown(""" + ### šŸš€ **Advanced AI-Powered Profile Analysis with Clickable Sources** + + This intelligent assistant uses cutting-edge **Retrieval-Augmented Generation (RAG)** technology to provide accurate, contextual answers about LinkedIn profiles with direct links to original content. + + **šŸ”§ Technical Capabilities:** + - **Vector Search**: Semantic similarity matching for relevant information retrieval + - **Cross-Encoder Reranking**: Advanced relevance scoring for precision + - **GPT-4 Generation**: Natural, human-like response generation + - **Multi-Source Integration**: Combines structured data and article content + - **Clickable Sources**: Direct links to original LinkedIn articles and content + + **šŸ“Š Data Sources Analyzed:** + - Professional experience and job history + - Educational background and certifications + - Skills, endorsements, and expertise areas + - Published articles and thought leadership content (with clickable links) + - Projects, achievements, and recommendations + - Professional network activities and engagement + + **šŸ”’ Privacy & Security:** + - Only uses uploaded LinkedIn profile data + - No external data access or web browsing + - Responses based solely on uploaded content + - Secure processing with no data retention + + **⚔ Built with:** + - Gradio for the interface + - OpenAI GPT-4 for generation + - ChromaDB for vector storage + - Sentence Transformers for embeddings + - Custom LinkedIn URL extraction + """) + + # Event Handlers + 
# ------------------------------------------------------------------
# NOTE(review): diff residue continues (see note above this region's
# style: each physical line is many "+"-prefixed patch lines joined).
# Event wiring, grounded in the visible calls:
#   * upload_btn.click -> process_upload(upload_file), writing to
#     status_display and chat_status.
#   * msg.submit AND submit_btn.click both -> chat_with_profile(msg,
#     chatbot), updating chatbot and clearing/returning msg.
#   * clear_btn.click -> lambda returning [] to reset chatbot history.
#   * A second submit_btn.click handler pushes
#     usage_tracker.get_usage_info() into `usage_info` with
#     queue=False — NOTE(review): `usage_info` is not created anywhere
#     in this chunk; presumably a component defined elsewhere in the
#     file — verify, otherwise building the Blocks raises NameError.
#   * Footer gr.HTML: the markup tags of its triple-quoted string were
#     stripped by the extraction — only the two footer text lines
#     survive below; restore the original tags from upstream app.py.
#   * Module-level launch guard: `interface.launch()` under
#     `if __name__ == "__main__":`.  The trailing
#     "\ No newline at end of file" is diff metadata, not code.
# ------------------------------------------------------------------
upload_btn.click( + process_upload, + inputs=[upload_file], + outputs=[status_display, chat_status] + ) + + msg.submit(chat_with_profile, inputs=[msg, chatbot], outputs=[chatbot, msg]) + submit_btn.click(chat_with_profile, inputs=[msg, chatbot], outputs=[chatbot, msg]) + clear_btn.click(lambda: [], outputs=chatbot) + + # Add this to your existing event handlers + submit_btn.click( + lambda: usage_tracker.get_usage_info(), + outputs=usage_info, + queue=False + ) + + # Footer + gr.HTML(""" +
+

šŸ¤– LinkedIn Profile AI Assistant | Powered by Advanced RAG Technology with Clickable Sources

+

Built with ā¤ļø using Gradio, OpenAI GPT-4, ChromaDB, and Custom LinkedIn URL extraction

+
+ """) + +# Launch the interface +if __name__ == "__main__": + interface.launch() \ No newline at end of file