"""DevOps knowledge base and AI assistant built on LangChain, Chroma, and OpenAI."""

import json
import os
import re
import shutil
import tempfile
from pathlib import Path
from typing import Any, Dict, List, Optional

from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from langchain_classic.memory import ConversationBufferMemory
from langchain_classic.chains import ConversationalRetrievalChain


class DevOpsKnowledgeBase:
    """Loads a repository of DevOps artifacts, enriches each file with metadata,
    and builds a Chroma vector store over adaptively chunked documents."""

    def __init__(self, knowledge_base_path: str, embedding_model: str = "all-MiniLM-L6-v2"):
        self.knowledge_base_path = Path(knowledge_base_path)
        self.embedding_model_name = embedding_model
        self.embedding_model = None
        self.vectorstore = None
        self.documents = []
        self.chunks = []
        self.temp_db_dir = None
        self.indices = {}
        self.structure = {}

    def _parse_structured_content(self, content: str, file_path: Path) -> dict:
        """Extract format-specific metadata (Kubernetes fields, Terraform resources,
        code symbols, ...) based on the file extension. Parsing is best-effort."""
        metadata = {}
        try:
            if file_path.suffix.lower() in ['.yaml', '.yml']:
                import yaml
                data = yaml.safe_load(content)
                if isinstance(data, dict):
                    metadata['kind'] = data.get('kind')
                    metadata['api_version'] = data.get('apiVersion')
                    if 'metadata' in data and isinstance(data['metadata'], dict):
                        for key, value in data['metadata'].items():
                            if isinstance(value, (str, int, float, bool)):
                                metadata[f'meta_{key}'] = value
                            elif isinstance(value, dict):
                                for k, v in value.items():
                                    if isinstance(v, (str, int, float, bool)):
                                        metadata[f'meta_{key}_{k}'] = v
                    if 'spec' in data and isinstance(data['spec'], dict):
                        if 'project' in data['spec']:
                            metadata['project'] = data['spec']['project']
                        if 'destination' in data['spec'] and isinstance(data['spec']['destination'], dict):
                            if 'namespace' in data['spec']['destination']:
                                metadata['namespace'] = data['spec']['destination']['namespace']
            elif file_path.suffix.lower() == '.json':
                data = json.loads(content)
                if isinstance(data, dict):
                    for key, value in data.items():
                        if isinstance(value, (str, int, float, bool)):
                            metadata[f'json_{key}'] = value
            elif file_path.suffix.lower() in ['.tf', '.hcl']:
                metadata['is_terraform'] = True
                resources = re.findall(r'resource\s+"([^"]+)"\s+"([^"]+)"', content)
                if resources:
                    metadata['terraform_resources'] = [r[0] for r in resources]
                    metadata['resource_count'] = len(resources)
                modules = re.findall(r'module\s+"([^"]+)"', content)
                if modules:
                    metadata['terraform_modules'] = modules
                    metadata['module_count'] = len(modules)
            elif file_path.suffix.lower() == '.py':
                metadata['is_code'] = True
                metadata['language'] = 'python'
                imports = re.findall(r'^(?:from|import)\s+(\S+)', content, re.MULTILINE)
                classes = re.findall(r'^class\s+(\w+)', content, re.MULTILINE)
                functions = re.findall(r'^def\s+(\w+)', content, re.MULTILINE)
                if imports:
                    metadata['imports'] = imports[:10]
                if classes:
                    metadata['classes'] = classes
                    metadata['class_count'] = len(classes)
                if functions:
                    metadata['functions'] = functions[:20]
                    metadata['function_count'] = len(functions)
            elif file_path.suffix.lower() in ['.js', '.ts']:
                metadata['is_code'] = True
                metadata['language'] = 'javascript' if file_path.suffix == '.js' else 'typescript'
                imports = re.findall(r'import\s+.*\s+from\s+[\'"]([^\'"]+)[\'"]', content)
                functions = re.findall(r'(?:function|const|let|var)\s+(\w+)\s*=?\s*(?:async\s*)?\(', content)
                classes = re.findall(r'class\s+(\w+)', content)
                if imports:
                    metadata['imports'] = imports[:10]
                if classes:
                    metadata['classes'] = classes
                    metadata['class_count'] = len(classes)
                if functions:
                    metadata['function_count'] = len(functions)
            elif file_path.suffix.lower() in ['.go']:
                metadata['is_code'] = True
                metadata['language'] = 'go'
                packages = re.findall(r'package\s+(\w+)', content)
                if packages:
                    metadata['package'] = packages[0]
                imports = re.findall(r'import\s+[\'"]([^\'"]+)[\'"]', content)
                if imports:
                    metadata['imports'] = imports[:10]
        except Exception:
            # Malformed files should not abort indexing; return whatever was extracted.
            pass
        return metadata

    def _extract_content_patterns(self, content: str) -> dict:
        """Detect cross-cutting content signals (URLs, IPs, versions, DevOps keywords)."""
        metadata = {}
        content_lower = content.lower()
        urls = re.findall(r'https?://[^\s<>"]+', content)
        if urls:
            metadata['has_urls'] = True
            metadata['url_count'] = len(urls)
            domains = []
            for url in urls:
                domain_match = re.findall(r'https?://([^/]+)', url)
                if domain_match:
                    domains.append(domain_match[0])
            if domains:
                metadata['domains'] = list(set(domains))[:5]
        ips = re.findall(r'\b(?:\d{1,3}\.){3}\d{1,3}\b', content)
        if ips:
            metadata['has_ips'] = True
            metadata['ip_count'] = len(set(ips))
        versions = re.findall(r'\bv?\d+\.\d+(?:\.\d+)?(?:-[\w.]+)?\b', content)
        if versions:
            metadata['has_versions'] = True
        patterns = {
            'has_secrets': any(keyword in content_lower for keyword in ['password', 'secret', 'token', 'api_key', 'apikey']),
            'has_monitoring': any(keyword in content_lower for keyword in ['prometheus', 'grafana', 'metrics', 'alert']),
            'has_networking': any(keyword in content_lower for keyword in ['ingress', 'service', 'loadbalancer', 'route']),
            'has_storage': any(keyword in content_lower for keyword in ['volume', 'pvc', 'storage', 'disk']),
            'has_database': any(keyword in content_lower for keyword in ['postgres', 'mysql', 'redis', 'mongodb', 'database']),
            'has_deployment': any(keyword in content_lower for keyword in ['deployment', 'statefulset', 'daemonset', 'replica']),
        }
        metadata.update({k: v for k, v in patterns.items() if v})
        quoted_strings = re.findall(r'"([^"]{3,30})"', content)
        if quoted_strings:
            metadata['quoted_strings'] = list(set(quoted_strings))[:10]
        return metadata

    def load_documents(self) -> List[Document]:
        """Walk the knowledge base, read every supported file, and wrap it in a
        LangChain Document enriched with path, structural, and content metadata."""
        self.documents = []
        if not self.knowledge_base_path.exists():
            raise ValueError(f"Knowledge base path does not exist: {self.knowledge_base_path}")
        supported_extensions = {'.yaml', '.yml', '.md', '.txt', '.json', '.tf', '.hcl',
                                '.py', '.js', '.ts', '.go', '.sh', '.rst'}
        print(f"Loading documents from {self.knowledge_base_path}...")
        for file_path in self.knowledge_base_path.rglob("*"):
            if file_path.is_file() and file_path.suffix.lower() in supported_extensions:
                try:
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        content = f.read().strip()
                    if content and len(content) > 50:
                        relative_path = file_path.relative_to(self.knowledge_base_path)
                        parts = relative_path.parts
                        metadata = {
                            "source": str(relative_path),
                            "file_type": file_path.suffix.lower(),
                            "path": str(file_path),
                            "filename": file_path.stem,
                            "full_filename": file_path.name,
                            "char_count": len(content),
                            "word_count": len(content.split()),
                            "line_count": len(content.splitlines()),
                            "depth": len(parts) - 1,
                            "parent_dir": parts[-2] if len(parts) > 1 else "root",
                            "path_level_0": parts[0] if len(parts) > 0 else None,
                            "path_level_1": parts[1] if len(parts) > 1 else None,
                            "path_level_2": parts[2] if len(parts) > 2 else None,
                            "path_level_3": parts[3] if len(parts) > 3 else None,
                            "full_path_parts": list(parts),
                        }
                        metadata.update(self._parse_structured_content(content, file_path))
                        metadata.update(self._extract_content_patterns(content))
                        doc = Document(page_content=content, metadata=metadata)
                        self.documents.append(doc)
                except Exception as e:
                    print(f"Skipped {file_path.name}: {str(e)}")
        print(f"Loaded {len(self.documents)} documents")
documents") return self.documents def discover_structure(self) -> dict: print("\nAuto-discovering repository structure...") structure = { 'total_files': len(self.documents), 'by_file_type': {}, 'by_depth': {}, 'by_parent_dir': {}, 'hierarchy': {}, 'patterns': {} } for doc in self.documents: file_type = doc.metadata.get('file_type', 'unknown') structure['by_file_type'][file_type] = structure['by_file_type'].get(file_type, 0) + 1 depth = doc.metadata.get('depth', 0) structure['by_depth'][depth] = structure['by_depth'].get(depth, 0) + 1 parent = doc.metadata.get('parent_dir', 'unknown') structure['by_parent_dir'][parent] = structure['by_parent_dir'].get(parent, 0) + 1 path_parts = doc.metadata.get('full_path_parts', []) current_level = structure['hierarchy'] for part in path_parts[:-1]: if part not in current_level: current_level[part] = {'_count': 0, '_children': {}} current_level[part]['_count'] += 1 current_level = current_level[part]['_children'] structure['patterns'] = self._detect_patterns() print(f"\nDiscovered Structure:") print(f" Total files: {structure['total_files']}") print(f"\n By file type:") for ftype, count in sorted(structure['by_file_type'].items(), key=lambda x: x[1], reverse=True): print(f" {ftype}: {count}") print(f"\n By depth:") for depth, count in sorted(structure['by_depth'].items()): print(f" Level {depth}: {count} files") print(f"\n Top-level directories:") for dir_name, data in structure['hierarchy'].items(): print(f" {dir_name}/: {data['_count']} files") if structure['patterns']: print(f"\n Detected patterns:") for pattern, count in structure['patterns'].items(): print(f" {pattern}: {count} files") self.structure = structure return structure def _detect_patterns(self) -> dict: patterns = { 'kubernetes_manifests': 0, 'terraform_files': 0, 'python_code': 0, 'javascript_code': 0, 'documentation': 0, 'configuration': 0, } for doc in self.documents: if doc.metadata.get('kind') or doc.metadata.get('api_version'): patterns['kubernetes_manifests'] += 1 if doc.metadata.get('is_terraform'): patterns['terraform_files'] += 1 if doc.metadata.get('language') == 'python': patterns['python_code'] += 1 if doc.metadata.get('language') in ['javascript', 'typescript']: patterns['javascript_code'] += 1 if doc.metadata.get('file_type') in ['.md', '.rst', '.txt']: patterns['documentation'] += 1 if doc.metadata.get('file_type') in ['.yaml', '.yml', '.json', '.toml']: patterns['configuration'] += 1 return {k: v for k, v in patterns.items() if v > 0} def create_dynamic_indices(self) -> dict: print("\nCreating dynamic indices...") indices = { 'by_path_level_0': {}, 'by_path_level_1': {}, 'by_path_level_2': {}, 'by_path_level_3': {}, 'by_file_type': {}, 'by_kind': {}, 'by_language': {}, 'by_parent_dir': {}, 'by_project': {}, 'by_namespace': {}, 'statistics': { 'total_documents': len(self.documents), 'total_chars': sum(d.metadata.get('char_count', 0) for d in self.documents), 'total_lines': sum(d.metadata.get('line_count', 0) for d in self.documents), } } for doc in self.documents: source = doc.metadata.get('source') for level in range(4): level_key = f'path_level_{level}' index_key = f'by_{level_key}' if level_value := doc.metadata.get(level_key): if level_value not in indices[index_key]: indices[index_key][level_value] = [] indices[index_key][level_value].append(source) if file_type := doc.metadata.get('file_type'): if file_type not in indices['by_file_type']: indices['by_file_type'][file_type] = [] indices['by_file_type'][file_type].append(source) if kind := doc.metadata.get('kind'): if 
                if kind not in indices['by_kind']:
                    indices['by_kind'][kind] = []
                indices['by_kind'][kind].append(source)
            if language := doc.metadata.get('language'):
                if language not in indices['by_language']:
                    indices['by_language'][language] = []
                indices['by_language'][language].append(source)
            if parent := doc.metadata.get('parent_dir'):
                if parent not in indices['by_parent_dir']:
                    indices['by_parent_dir'][parent] = []
                indices['by_parent_dir'][parent].append(source)
            if project := doc.metadata.get('project'):
                if project not in indices['by_project']:
                    indices['by_project'][project] = []
                indices['by_project'][project].append(source)
            if namespace := doc.metadata.get('namespace'):
                if namespace not in indices['by_namespace']:
                    indices['by_namespace'][namespace] = []
                indices['by_namespace'][namespace].append(source)
        self.indices = indices
        print("\nIndices Created:")
        print(f"  Total documents indexed: {indices['statistics']['total_documents']}")
        print(f"  Top-level paths: {len(indices['by_path_level_0'])}")
        print(f"  File types: {len(indices['by_file_type'])}")
        if indices['by_kind']:
            print(f"  Kubernetes kinds: {len(indices['by_kind'])}")
        if indices['by_language']:
            print(f"  Programming languages: {len(indices['by_language'])}")
        return indices

    def chunk_documents_adaptive(self, documents: List[Document]) -> List[Document]:
        """Split documents with a chunk size/overlap tuned to each file category
        (structured config, code, documentation, or default)."""
        print("\nAdaptive chunking based on file characteristics...")
        all_chunks = []
        strategies = {
            'small_structured': [],
            'large_structured': [],
            'code_files': [],
            'documentation': [],
            'default': []
        }
        for doc in documents:
            char_count = doc.metadata.get('char_count', 0)
            file_type = doc.metadata.get('file_type', '')
            if file_type in ['.yaml', '.yml', '.json', '.toml']:
                if char_count < 2000:
                    strategies['small_structured'].append(doc)
                else:
                    strategies['large_structured'].append(doc)
            elif file_type in ['.py', '.js', '.go', '.java', '.ts', '.rs', '.sh']:
                strategies['code_files'].append(doc)
            elif file_type in ['.md', '.rst', '.txt']:
                strategies['documentation'].append(doc)
            else:
                strategies['default'].append(doc)
        chunk_configs = {
            'small_structured': {'chunk_size': 2000, 'chunk_overlap': 100},
            'large_structured': {'chunk_size': 1500, 'chunk_overlap': 200},
            'code_files': {'chunk_size': 1200, 'chunk_overlap': 150},
            'documentation': {'chunk_size': 1000, 'chunk_overlap': 200},
            'default': {'chunk_size': 1000, 'chunk_overlap': 200}
        }
        for strategy_name, docs in strategies.items():
            if not docs:
                continue
            config = chunk_configs[strategy_name]
            splitter = RecursiveCharacterTextSplitter(
                chunk_size=config['chunk_size'],
                chunk_overlap=config['chunk_overlap'],
                separators=["\n\n", "\n", " ", ""]
            )
            chunks = splitter.split_documents(docs)
            for i, chunk in enumerate(chunks):
                chunk.metadata['chunk_strategy'] = strategy_name
                chunk.metadata['chunk_id'] = f"{strategy_name}_{i:04d}"
            all_chunks.extend(chunks)
            print(f"  {strategy_name}: {len(docs)} docs → {len(chunks)} chunks")
        self.chunks = all_chunks
        print(f"  Total: {len(all_chunks)} chunks created")
        return all_chunks

    def initialize_embedding_model(self):
        """Load the sentence-transformers embedding model via HuggingFaceEmbeddings."""
        print(f"\nInitializing embedding model: {self.embedding_model_name}...")
        self.embedding_model = HuggingFaceEmbeddings(model_name=self.embedding_model_name)
        print("Embedding model initialized")

    def create_vectorstore(self) -> Chroma:
        """Embed all chunks into a temporary Chroma collection."""
        if not self.chunks:
            raise ValueError("No chunks available. Call chunk_documents_adaptive() first.")
        if not self.embedding_model:
            raise ValueError("Embedding model not initialized. Call initialize_embedding_model() first.")
        print("\nCreating vector store...")
        if self.temp_db_dir:
            try:
                shutil.rmtree(self.temp_db_dir)
            except Exception:
                pass
        self.temp_db_dir = tempfile.mkdtemp(prefix="devops_kb_v2_")
        # Chroma only accepts scalar metadata values: drop None/dict values and
        # stringify non-empty lists before embedding.
        for chunk in self.chunks:
            cleaned_metadata = {}
            for key, value in chunk.metadata.items():
                if value is not None and not isinstance(value, (list, dict)):
                    cleaned_metadata[key] = value
                elif isinstance(value, list) and value:
                    cleaned_metadata[key] = str(value)
            chunk.metadata = cleaned_metadata
        self.vectorstore = Chroma.from_documents(
            documents=self.chunks,
            embedding=self.embedding_model,
            persist_directory=self.temp_db_dir
        )
        doc_count = self.vectorstore._collection.count()
        print(f"Vector store created with {doc_count} documents")
        return self.vectorstore

    def initialize(self):
        """Run the full pipeline: load, discover, index, chunk, embed."""
        print("=" * 70)
        print("Initializing DevOps Knowledge Base")
        print("=" * 70)
        self.load_documents()
        self.discover_structure()
        self.create_dynamic_indices()
        self.chunk_documents_adaptive(self.documents)
        self.initialize_embedding_model()
        self.create_vectorstore()
        print("\n" + "=" * 70)
        print("Knowledge base initialized successfully!")
        print("=" * 70)
        return self.vectorstore


class DevOpsAIAssistant:
    """Conversational RAG assistant over a DevOpsKnowledgeBase, backed by OpenAI."""

    def __init__(self, knowledge_base_path: str, embedding_model: str = "all-MiniLM-L6-v2"):
        self.knowledge_base = DevOpsKnowledgeBase(knowledge_base_path, embedding_model)
        self.vectorstore = None
        self.conversation_chain = None
        self.memory = None
        self.llm = None

    def setup(self):
        """Initialize the knowledge base, LLM, memory, and retrieval chain."""
        print("\nSetting up DevOps AI Assistant...")
        self.vectorstore = self.knowledge_base.initialize()
        api_key = os.getenv('OPENAI_API_KEY')
        if not api_key:
            raise ValueError("OPENAI_API_KEY environment variable not set")
        print("\nInitializing OpenAI LLM...")
        self.llm = ChatOpenAI(
            model_name="gpt-4o-mini",
            temperature=0.3,
            api_key=api_key
        )
        print("Setting up conversation memory...")
        self.memory = ConversationBufferMemory(
            memory_key="chat_history",
            return_messages=True,
            output_key='answer'
        )
        print("Creating conversation chain...")
        retriever = self.vectorstore.as_retriever(search_kwargs={"k": 10})
        self.conversation_chain = ConversationalRetrievalChain.from_llm(
            llm=self.llm,
            retriever=retriever,
            memory=self.memory,
            return_source_documents=True,
            verbose=False
        )
        print("\n" + "=" * 70)
        print("DevOps AI Assistant ready!")
        print("=" * 70)
        return self

    def ask(self, question: str) -> dict:
        """Answer a question against the knowledge base and return the answer
        plus deduplicated source snippets."""
        if not self.conversation_chain:
            raise ValueError("Assistant not initialized. Call setup() first.")
Call setup() first.") result = self.conversation_chain.invoke({"question": question}) response = { "answer": result.get('answer', ''), "sources": [] } if result.get('source_documents'): unique_sources = {} for doc in result['source_documents']: source = doc.metadata.get('source') if source not in unique_sources: path_info = "/".join([ doc.metadata.get('path_level_0', ''), doc.metadata.get('path_level_1', ''), doc.metadata.get('path_level_2', '') ]).strip('/') unique_sources[source] = { "content": doc.page_content[:300], "source": source, "file_type": doc.metadata.get('file_type', 'Unknown'), "path_info": path_info, "kind": doc.metadata.get('kind'), "language": doc.metadata.get('language') } response["sources"] = list(unique_sources.values()) return response def get_status(self) -> dict: if not self.vectorstore: return {"status": "not_initialized"} doc_count = self.vectorstore._collection.count() status = { "status": "ready", "documents_loaded": len(self.knowledge_base.documents), "chunks_created": len(self.knowledge_base.chunks), "vectors_in_store": doc_count, "knowledge_base_path": str(self.knowledge_base.knowledge_base_path) } if self.knowledge_base.structure: status["structure"] = { "total_files": self.knowledge_base.structure['total_files'], "file_types": len(self.knowledge_base.structure['by_file_type']), "patterns": self.knowledge_base.structure['patterns'] } if self.knowledge_base.indices: status["indices"] = { "path_levels": len(self.knowledge_base.indices['by_path_level_0']), "kinds": len(self.knowledge_base.indices['by_kind']), "languages": len(self.knowledge_base.indices['by_language']) } return status def create_assistant(knowledge_base_path: str) -> DevOpsAIAssistant: assistant = DevOpsAIAssistant(knowledge_base_path) assistant.setup() return assistant