Refactor DevOps AI Assistant: Enhance knowledge base parsing and indexing, improve error handling, and update user interface elements

Mohamed Salah
2025-10-30 14:51:02 +03:00
parent aa3ddf2538
commit b9e8dc1870
2 changed files with 470 additions and 75 deletions

View File

@@ -1,6 +1,6 @@
import os
import gradio as gr
-from devops_ai_assistance import create_assistant, DevOpsAIAssistant
+from devops_ai_assistance import create_assistant
assistant = None
@@ -8,7 +8,6 @@ status_info = None
def initialize_assistant(kb_path: str):
-"""Initialize the assistant with knowledge base"""
global assistant, status_info
try:
@@ -16,42 +15,41 @@ def initialize_assistant(kb_path: str):
if not kb_path:
return "Error: Please provide a valid knowledge base path"
-print(f"\n🚀 Initializing with knowledge base: {kb_path}")
+print(f"\nInitializing with knowledge base: {kb_path}")
assistant = create_assistant(kb_path)
status_info = assistant.get_status()
status_message = f"""
-**DevOps AI Assistant Initialized Successfully!**
+**DevOps AI Assistant Initialized Successfully**
-📊 **Knowledge Base Statistics:**
+**Knowledge Base Statistics:**
- Documents Loaded: {status_info['documents_loaded']}
- Chunks Created: {status_info['chunks_created']}
- Vectors in Store: {status_info['vectors_in_store']}
- Knowledge Base Path: {status_info['knowledge_base_path']}
-🎯 **Ready to Answer Questions About:**
+**Ready to Answer Questions About:**
- Kubernetes infrastructure configuration
- ArgoCD deployment manifests
- Helm charts and values
-- Infrastructure as Code (IaC)
+- Infrastructure as Code
-- DevOps best practices in your environment
+- DevOps best practices
-Start by asking questions about your k8s cluster infrastructure!
+Start by asking questions about your infrastructure!
"""
return status_message
except Exception as e:
error_msg = f"Error initializing assistant: {str(e)}"
-print(f" {error_msg}")
+print(f"Error: {error_msg}")
-return f" {error_msg}"
+return f"Error: {error_msg}"
def chat_with_assistant(message: str, history):
-"""Chat function for the assistant"""
global assistant
if not assistant:
bot_response = "Assistant not initialized. Please provide a knowledge base path first."
history.append((message, bot_response))
return history, ""
@@ -66,11 +64,11 @@ def chat_with_assistant(message: str, history):
sources_text = "" sources_text = ""
if result.get('sources'): if result.get('sources'):
sources_text = "\n\n📚 **Sources:**\n" sources_text = "\n\n**Sources:**\n"
for i, source in enumerate(result['sources'], 1): for i, source in enumerate(result['sources'], 1):
source_file = source.get('source', 'Unknown') source_file = source.get('source', 'Unknown')
file_type = source.get('file_type', 'Unknown') file_type = source.get('file_type', 'Unknown')
sources_text += f"\n{i}. **{source_file}** ({file_type})" sources_text += f"\n{i}. {source_file} ({file_type})"
bot_response = answer + sources_text if sources_text else answer bot_response = answer + sources_text if sources_text else answer
@@ -82,16 +80,13 @@ def chat_with_assistant(message: str, history):
def create_interface():
-"""Create the Gradio interface"""
-global assistant
with gr.Blocks(title="DevOps AI Assistant") as interface:
-gr.Markdown("# 🤖 DevOps AI Assistant")
+gr.Markdown("# DevOps AI Assistant")
-gr.Markdown("Intelligent Q&A system for your Kubernetes infrastructure powered by RAG and LLM")
+gr.Markdown("Intelligent Q&A system for your infrastructure powered by RAG and LLM")
-gr.Markdown("## 🔧 Configuration")
+gr.Markdown("## Configuration")
-gr.Markdown("Enter the path to your GitOps repository (knowledge base) to initialize the assistant")
+gr.Markdown("Enter the path to your GitOps repository to initialize the assistant")
with gr.Row():
kb_path_input = gr.Textbox(
@@ -100,39 +95,38 @@ def create_interface():
lines=1,
value="/workspace/aau/repositories/infra-gitops/"
)
-init_button = gr.Button("🚀 Initialize Assistant")
+init_button = gr.Button("Initialize Assistant")
status_output = gr.Markdown(value="Waiting for initialization...")
-gr.Markdown("## 💬 Chat Interface")
+gr.Markdown("## Chat Interface")
chatbot = gr.Chatbot(
label="Conversation",
height=500,
show_copy_button=True,
-avatar_images=("👤", "🤖"),
bubble_full_width=False
)
with gr.Row():
msg_input = gr.Textbox(
label="Your Question",
-placeholder="Ask about your k8s infrastructure, ArgoCD, Helm charts, etc...",
+placeholder="Ask about your infrastructure, ArgoCD, Helm charts, etc...",
lines=2,
scale=5
)
-send_button = gr.Button("Send 💬", scale=1)
+send_button = gr.Button("Send", scale=1)
with gr.Row():
-clear_button = gr.Button("🗑️ Clear Chat", scale=2)
+clear_button = gr.Button("Clear Chat", scale=2)
-with gr.Accordion("📋 Example Questions", open=False):
+with gr.Accordion("Example Questions", open=False):
gr.Markdown("""
**Infrastructure & Deployment:**
-- How is the Kubernetes cluster configured?
+- How many ArgoCD applications?
-- What ArgoCD applications are deployed?
+- What is the repository structure?
- How many YAML files are there?
- Show me the Helm chart values for nginx
- What storage solutions are available?
**Monitoring & Observability:**
- How is Prometheus configured?
@@ -174,9 +168,8 @@ def create_interface():
def main():
-"""Main entry point"""
print("\n" + "=" * 60)
-print("🚀 DevOps AI Assistant - RAG System")
+print("DevOps AI Assistant - RAG System")
print("=" * 60)
print("Starting Gradio server...")
print("\nAccess the application at: http://127.0.0.1:7860")

View File

@@ -1,12 +1,12 @@
import os
import re
from pathlib import Path
-from typing import List, Optional
+from typing import List, Optional, Dict, Any
import json
import tempfile
import shutil
from langchain_core.documents import Document
-from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
@@ -24,6 +24,146 @@ class DevOpsKnowledgeBase:
self.documents = []
self.chunks = []
self.temp_db_dir = None
self.indices = {}
self.structure = {}
def _parse_structured_content(self, content: str, file_path: Path) -> dict:
metadata = {}
try:
if file_path.suffix.lower() in ['.yaml', '.yml']:
import yaml
data = yaml.safe_load(content)
if isinstance(data, dict):
metadata['kind'] = data.get('kind')
metadata['api_version'] = data.get('apiVersion')
if 'metadata' in data and isinstance(data['metadata'], dict):
for key, value in data['metadata'].items():
if isinstance(value, (str, int, float, bool)):
metadata[f'meta_{key}'] = value
elif isinstance(value, dict):
for k, v in value.items():
if isinstance(v, (str, int, float, bool)):
metadata[f'meta_{key}_{k}'] = v
if 'spec' in data and isinstance(data['spec'], dict):
if 'project' in data['spec']:
metadata['project'] = data['spec']['project']
if 'destination' in data['spec'] and isinstance(data['spec']['destination'], dict):
if 'namespace' in data['spec']['destination']:
metadata['namespace'] = data['spec']['destination']['namespace']
elif file_path.suffix.lower() == '.json':
data = json.loads(content)
if isinstance(data, dict):
for key, value in data.items():
if isinstance(value, (str, int, float, bool)):
metadata[f'json_{key}'] = value
elif file_path.suffix.lower() in ['.tf', '.hcl']:
metadata['is_terraform'] = True
resources = re.findall(r'resource\s+"([^"]+)"\s+"([^"]+)"', content)
if resources:
metadata['terraform_resources'] = [r[0] for r in resources]
metadata['resource_count'] = len(resources)
modules = re.findall(r'module\s+"([^"]+)"', content)
if modules:
metadata['terraform_modules'] = modules
metadata['module_count'] = len(modules)
elif file_path.suffix.lower() == '.py':
metadata['is_code'] = True
metadata['language'] = 'python'
imports = re.findall(r'^(?:from|import)\s+(\S+)', content, re.MULTILINE)
classes = re.findall(r'^class\s+(\w+)', content, re.MULTILINE)
functions = re.findall(r'^def\s+(\w+)', content, re.MULTILINE)
if imports:
metadata['imports'] = imports[:10]
if classes:
metadata['classes'] = classes
metadata['class_count'] = len(classes)
if functions:
metadata['functions'] = functions[:20]
metadata['function_count'] = len(functions)
elif file_path.suffix.lower() in ['.js', '.ts']:
metadata['is_code'] = True
metadata['language'] = 'javascript' if file_path.suffix == '.js' else 'typescript'
imports = re.findall(r'import\s+.*\s+from\s+[\'"]([^\'"]+)[\'"]', content)
functions = re.findall(r'(?:function|const|let|var)\s+(\w+)\s*=?\s*(?:async\s*)?\(', content)
classes = re.findall(r'class\s+(\w+)', content)
if imports:
metadata['imports'] = imports[:10]
if classes:
metadata['classes'] = classes
metadata['class_count'] = len(classes)
if functions:
metadata['function_count'] = len(functions)
elif file_path.suffix.lower() in ['.go']:
metadata['is_code'] = True
metadata['language'] = 'go'
packages = re.findall(r'package\s+(\w+)', content)
if packages:
metadata['package'] = packages[0]
imports = re.findall(r'import\s+[\'"]([^\'"]+)[\'"]', content)
if imports:
metadata['imports'] = imports[:10]
except Exception as e:
pass
return metadata
def _extract_content_patterns(self, content: str) -> dict:
metadata = {}
content_lower = content.lower()
urls = re.findall(r'https?://[^\s<>"]+', content)
if urls:
metadata['has_urls'] = True
metadata['url_count'] = len(urls)
domains = []
for url in urls:
domain_match = re.findall(r'https?://([^/]+)', url)
if domain_match:
domains.append(domain_match[0])
if domains:
metadata['domains'] = list(set(domains))[:5]
ips = re.findall(r'\b(?:\d{1,3}\.){3}\d{1,3}\b', content)
if ips:
metadata['has_ips'] = True
metadata['ip_count'] = len(set(ips))
versions = re.findall(r'\bv?\d+\.\d+(?:\.\d+)?(?:-[\w.]+)?\b', content)
if versions:
metadata['has_versions'] = True
patterns = {
'has_secrets': any(keyword in content_lower for keyword in ['password', 'secret', 'token', 'api_key', 'apikey']),
'has_monitoring': any(keyword in content_lower for keyword in ['prometheus', 'grafana', 'metrics', 'alert']),
'has_networking': any(keyword in content_lower for keyword in ['ingress', 'service', 'loadbalancer', 'route']),
'has_storage': any(keyword in content_lower for keyword in ['volume', 'pvc', 'storage', 'disk']),
'has_database': any(keyword in content_lower for keyword in ['postgres', 'mysql', 'redis', 'mongodb', 'database']),
'has_deployment': any(keyword in content_lower for keyword in ['deployment', 'statefulset', 'daemonset', 'replica']),
}
metadata.update({k: v for k, v in patterns.items() if v})
quoted_strings = re.findall(r'"([^"]{3,30})"', content)
if quoted_strings:
metadata['quoted_strings'] = list(set(quoted_strings))[:10]
return metadata
def load_documents(self) -> List[Document]:
self.documents = []
@@ -31,7 +171,7 @@ class DevOpsKnowledgeBase:
if not self.knowledge_base_path.exists():
raise ValueError(f"Knowledge base path does not exist: {self.knowledge_base_path}")
-supported_extensions = {'.yaml', '.yml', '.md', '.txt', '.json'}
+supported_extensions = {'.yaml', '.yml', '.md', '.txt', '.json', '.tf', '.hcl', '.py', '.js', '.ts', '.go', '.sh', '.rst'}
print(f"Loading documents from {self.knowledge_base_path}...")
@@ -43,14 +183,30 @@ class DevOpsKnowledgeBase:
if content and len(content) > 50:
relative_path = file_path.relative_to(self.knowledge_base_path)
-doc = Document(
-page_content=content,
-metadata={
parts = relative_path.parts
metadata = {
"source": str(relative_path),
"file_type": file_path.suffix.lower(),
-"path": str(file_path)
-}
-)
"path": str(file_path),
"filename": file_path.stem,
"full_filename": file_path.name,
"char_count": len(content),
"word_count": len(content.split()),
"line_count": len(content.splitlines()),
"depth": len(parts) - 1,
"parent_dir": parts[-2] if len(parts) > 1 else "root",
"path_level_0": parts[0] if len(parts) > 0 else None,
"path_level_1": parts[1] if len(parts) > 1 else None,
"path_level_2": parts[2] if len(parts) > 2 else None,
"path_level_3": parts[3] if len(parts) > 3 else None,
"full_path_parts": list(parts),
}
metadata.update(self._parse_structured_content(content, file_path))
metadata.update(self._extract_content_patterns(content))
doc = Document(page_content=content, metadata=metadata)
self.documents.append(doc)
except Exception as e:
@@ -59,35 +215,235 @@ class DevOpsKnowledgeBase:
print(f"Loaded {len(self.documents)} documents") print(f"Loaded {len(self.documents)} documents")
return self.documents return self.documents
-def chunk_documents(self, chunk_size: int = 1000, chunk_overlap: int = 200) -> List[Document]:
-if not self.documents:
-raise ValueError("No documents loaded. Call load_documents() first.")
-print(f"Splitting {len(self.documents)} documents into chunks...")
-text_splitter = RecursiveCharacterTextSplitter(
-chunk_size=chunk_size,
-chunk_overlap=chunk_overlap,
-separators=["\n\n", "\n", " ", ""]
-)
-self.chunks = text_splitter.split_documents(self.documents)
-print(f"Created {len(self.chunks)} chunks")
-return self.chunks
def discover_structure(self) -> dict:
print("\nAuto-discovering repository structure...")
structure = {
'total_files': len(self.documents),
'by_file_type': {},
'by_depth': {},
'by_parent_dir': {},
'hierarchy': {},
'patterns': {}
}
for doc in self.documents:
file_type = doc.metadata.get('file_type', 'unknown')
structure['by_file_type'][file_type] = structure['by_file_type'].get(file_type, 0) + 1
depth = doc.metadata.get('depth', 0)
structure['by_depth'][depth] = structure['by_depth'].get(depth, 0) + 1
parent = doc.metadata.get('parent_dir', 'unknown')
structure['by_parent_dir'][parent] = structure['by_parent_dir'].get(parent, 0) + 1
path_parts = doc.metadata.get('full_path_parts', [])
current_level = structure['hierarchy']
for part in path_parts[:-1]:
if part not in current_level:
current_level[part] = {'_count': 0, '_children': {}}
current_level[part]['_count'] += 1
current_level = current_level[part]['_children']
structure['patterns'] = self._detect_patterns()
print(f"\nDiscovered Structure:")
print(f" Total files: {structure['total_files']}")
print(f"\n By file type:")
for ftype, count in sorted(structure['by_file_type'].items(), key=lambda x: x[1], reverse=True):
print(f" {ftype}: {count}")
print(f"\n By depth:")
for depth, count in sorted(structure['by_depth'].items()):
print(f" Level {depth}: {count} files")
print(f"\n Top-level directories:")
for dir_name, data in structure['hierarchy'].items():
print(f" {dir_name}/: {data['_count']} files")
if structure['patterns']:
print(f"\n Detected patterns:")
for pattern, count in structure['patterns'].items():
print(f" {pattern}: {count} files")
self.structure = structure
return structure
def _detect_patterns(self) -> dict:
patterns = {
'kubernetes_manifests': 0,
'terraform_files': 0,
'python_code': 0,
'javascript_code': 0,
'documentation': 0,
'configuration': 0,
}
for doc in self.documents:
if doc.metadata.get('kind') or doc.metadata.get('api_version'):
patterns['kubernetes_manifests'] += 1
if doc.metadata.get('is_terraform'):
patterns['terraform_files'] += 1
if doc.metadata.get('language') == 'python':
patterns['python_code'] += 1
if doc.metadata.get('language') in ['javascript', 'typescript']:
patterns['javascript_code'] += 1
if doc.metadata.get('file_type') in ['.md', '.rst', '.txt']:
patterns['documentation'] += 1
if doc.metadata.get('file_type') in ['.yaml', '.yml', '.json', '.toml']:
patterns['configuration'] += 1
return {k: v for k, v in patterns.items() if v > 0}
def create_dynamic_indices(self) -> dict:
print("\nCreating dynamic indices...")
indices = {
'by_path_level_0': {},
'by_path_level_1': {},
'by_path_level_2': {},
'by_path_level_3': {},
'by_file_type': {},
'by_kind': {},
'by_language': {},
'by_parent_dir': {},
'by_project': {},
'by_namespace': {},
'statistics': {
'total_documents': len(self.documents),
'total_chars': sum(d.metadata.get('char_count', 0) for d in self.documents),
'total_lines': sum(d.metadata.get('line_count', 0) for d in self.documents),
}
}
for doc in self.documents:
source = doc.metadata.get('source')
for level in range(4):
level_key = f'path_level_{level}'
index_key = f'by_{level_key}'
if level_value := doc.metadata.get(level_key):
if level_value not in indices[index_key]:
indices[index_key][level_value] = []
indices[index_key][level_value].append(source)
if file_type := doc.metadata.get('file_type'):
if file_type not in indices['by_file_type']:
indices['by_file_type'][file_type] = []
indices['by_file_type'][file_type].append(source)
if kind := doc.metadata.get('kind'):
if kind not in indices['by_kind']:
indices['by_kind'][kind] = []
indices['by_kind'][kind].append(source)
if language := doc.metadata.get('language'):
if language not in indices['by_language']:
indices['by_language'][language] = []
indices['by_language'][language].append(source)
if parent := doc.metadata.get('parent_dir'):
if parent not in indices['by_parent_dir']:
indices['by_parent_dir'][parent] = []
indices['by_parent_dir'][parent].append(source)
if project := doc.metadata.get('project'):
if project not in indices['by_project']:
indices['by_project'][project] = []
indices['by_project'][project].append(source)
if namespace := doc.metadata.get('namespace'):
if namespace not in indices['by_namespace']:
indices['by_namespace'][namespace] = []
indices['by_namespace'][namespace].append(source)
self.indices = indices
print(f"\nIndices Created:")
print(f" Total documents indexed: {indices['statistics']['total_documents']}")
print(f" Top-level paths: {len(indices['by_path_level_0'])}")
print(f" File types: {len(indices['by_file_type'])}")
if indices['by_kind']:
print(f" Kubernetes kinds: {len(indices['by_kind'])}")
if indices['by_language']:
print(f" Programming languages: {len(indices['by_language'])}")
return indices
def chunk_documents_adaptive(self, documents: List[Document]) -> List[Document]:
print("\nAdaptive chunking based on file characteristics...")
all_chunks = []
strategies = {
'small_structured': [],
'large_structured': [],
'code_files': [],
'documentation': [],
'default': []
}
for doc in documents:
char_count = doc.metadata.get('char_count', 0)
file_type = doc.metadata.get('file_type', '')
if file_type in ['.yaml', '.yml', '.json', '.toml']:
if char_count < 2000:
strategies['small_structured'].append(doc)
else:
strategies['large_structured'].append(doc)
elif file_type in ['.py', '.js', '.go', '.java', '.ts', '.rs', '.sh']:
strategies['code_files'].append(doc)
elif file_type in ['.md', '.rst', '.txt']:
strategies['documentation'].append(doc)
else:
strategies['default'].append(doc)
chunk_configs = {
'small_structured': {'chunk_size': 2000, 'chunk_overlap': 100},
'large_structured': {'chunk_size': 1500, 'chunk_overlap': 200},
'code_files': {'chunk_size': 1200, 'chunk_overlap': 150},
'documentation': {'chunk_size': 1000, 'chunk_overlap': 200},
'default': {'chunk_size': 1000, 'chunk_overlap': 200}
}
for strategy_name, docs in strategies.items():
if not docs:
continue
config = chunk_configs[strategy_name]
splitter = RecursiveCharacterTextSplitter(
chunk_size=config['chunk_size'],
chunk_overlap=config['chunk_overlap'],
separators=["\n\n", "\n", " ", ""]
)
chunks = splitter.split_documents(docs)
for i, chunk in enumerate(chunks):
chunk.metadata['chunk_strategy'] = strategy_name
chunk.metadata['chunk_id'] = f"{strategy_name}_{i:04d}"
all_chunks.extend(chunks)
print(f" {strategy_name}: {len(docs)} docs → {len(chunks)} chunks")
self.chunks = all_chunks
print(f" Total: {len(all_chunks)} chunks created")
return all_chunks
def initialize_embedding_model(self):
-print(f"Initializing embedding model: {self.embedding_model_name}...")
+print(f"\nInitializing embedding model: {self.embedding_model_name}...")
self.embedding_model = HuggingFaceEmbeddings(model_name=self.embedding_model_name)
print("Embedding model initialized")
def create_vectorstore(self) -> Chroma:
if not self.chunks:
-raise ValueError("No chunks available. Call chunk_documents() first.")
+raise ValueError("No chunks available. Call chunk_documents_adaptive() first.")
if not self.embedding_model:
raise ValueError("Embedding model not initialized. Call initialize_embedding_model() first.")
-print("Creating vector store...")
+print("\nCreating vector store...")
if self.temp_db_dir:
try:
@@ -95,7 +451,16 @@ class DevOpsKnowledgeBase:
except:
pass
-self.temp_db_dir = tempfile.mkdtemp(prefix="devops_kb_")
+self.temp_db_dir = tempfile.mkdtemp(prefix="devops_kb_v2_")
for chunk in self.chunks:
cleaned_metadata = {}
for key, value in chunk.metadata.items():
if value is not None and not isinstance(value, (list, dict)):
cleaned_metadata[key] = value
elif isinstance(value, list) and value:
cleaned_metadata[key] = str(value)
chunk.metadata = cleaned_metadata
self.vectorstore = Chroma.from_documents(
documents=self.chunks,
@@ -108,15 +473,20 @@ class DevOpsKnowledgeBase:
return self.vectorstore
def initialize(self):
-print("Initializing DevOps Knowledge Base...")
-print("=" * 60)
+print("=" * 70)
+print("Initializing DevOps Knowledge Base")
print("=" * 70)
self.load_documents()
-self.chunk_documents()
+self.discover_structure()
self.create_dynamic_indices()
self.chunk_documents_adaptive(self.documents)
self.initialize_embedding_model()
self.create_vectorstore()
-print("\nKnowledge base initialized successfully!")
+print("\n" + "=" * 70)
print("Knowledge base initialized successfully!")
print("=" * 70)
return self.vectorstore
@@ -129,7 +499,7 @@ class DevOpsAIAssistant:
self.llm = None
def setup(self):
-print("Setting up DevOps AI Assistant...")
+print("\nSetting up DevOps AI Assistant...")
self.vectorstore = self.knowledge_base.initialize()
@@ -137,7 +507,7 @@ class DevOpsAIAssistant:
if not api_key:
raise ValueError("OPENAI_API_KEY environment variable not set")
-print("Initializing OpenAI LLM...")
+print("\nInitializing OpenAI LLM...")
self.llm = ChatOpenAI(
model_name="gpt-4o-mini",
temperature=0.3,
@@ -152,7 +522,7 @@ class DevOpsAIAssistant:
)
print("Creating conversation chain...")
-retriever = self.vectorstore.as_retriever(search_kwargs={"k": 5})
+retriever = self.vectorstore.as_retriever(search_kwargs={"k": 10})
self.conversation_chain = ConversationalRetrievalChain.from_llm(
llm=self.llm,
@@ -162,7 +532,9 @@ class DevOpsAIAssistant:
verbose=False
)
print("\n" + "=" * 70)
print("DevOps AI Assistant ready!")
print("=" * 70)
return self
def ask(self, question: str) -> dict:
@@ -177,12 +549,26 @@ class DevOpsAIAssistant:
}
if result.get('source_documents'):
unique_sources = {}
for doc in result['source_documents']:
-response["sources"].append({
-"content": doc.page_content[:300],
-"source": doc.metadata.get('source', 'Unknown'),
-"file_type": doc.metadata.get('file_type', 'Unknown')
-})
source = doc.metadata.get('source')
if source not in unique_sources:
path_info = "/".join([
doc.metadata.get('path_level_0', ''),
doc.metadata.get('path_level_1', ''),
doc.metadata.get('path_level_2', '')
]).strip('/')
unique_sources[source] = {
"content": doc.page_content[:300],
"source": source,
"file_type": doc.metadata.get('file_type', 'Unknown'),
"path_info": path_info,
"kind": doc.metadata.get('kind'),
"language": doc.metadata.get('language')
}
response["sources"] = list(unique_sources.values())
return response
@@ -192,7 +578,7 @@ class DevOpsAIAssistant:
doc_count = self.vectorstore._collection.count()
-return {
+status = {
"status": "ready",
"documents_loaded": len(self.knowledge_base.documents),
"chunks_created": len(self.knowledge_base.chunks),
@@ -200,6 +586,22 @@ class DevOpsAIAssistant:
"knowledge_base_path": str(self.knowledge_base.knowledge_base_path) "knowledge_base_path": str(self.knowledge_base.knowledge_base_path)
} }
if self.knowledge_base.structure:
status["structure"] = {
"total_files": self.knowledge_base.structure['total_files'],
"file_types": len(self.knowledge_base.structure['by_file_type']),
"patterns": self.knowledge_base.structure['patterns']
}
if self.knowledge_base.indices:
status["indices"] = {
"path_levels": len(self.knowledge_base.indices['by_path_level_0']),
"kinds": len(self.knowledge_base.indices['by_kind']),
"languages": len(self.knowledge_base.indices['by_language'])
}
return status
def create_assistant(knowledge_base_path: str) -> DevOpsAIAssistant:
assistant = DevOpsAIAssistant(knowledge_base_path)
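For reference, a minimal usage sketch of the module's public entry point, assuming create_assistant() runs setup() before returning and that ask() exposes an 'answer' key alongside 'sources' (as the Gradio app above implies); this example is illustrative and not part of the commit:

# Hypothetical usage of the assistant outside the Gradio UI
# Requires OPENAI_API_KEY to be set in the environment (checked in setup()).
assistant = create_assistant("/workspace/aau/repositories/infra-gitops/")
print(assistant.get_status()["documents_loaded"])
result = assistant.ask("How is Prometheus configured?")
print(result.get("answer"))
for source in result.get("sources", []):
    print(source["source"], source["file_type"])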