Source code for atloop.retrieval.indexer
"""Workspace indexer for file tree and search."""
from typing import Any, Dict, List, Optional
from atloop.runtime import ToolResult
from atloop.tools.runtime import ToolRuntime
[docs]
class WorkspaceIndexer:
    """Workspace indexer for file tree and search.

    Thin facade over a tool runtime: every operation is delegated to
    ``tool_runtime`` (``list_tree``, ``search`` or ``run``); this class only
    shapes the results.
    """

    # Words too generic to be useful as search keywords (compared lowercase).
    _COMMON_WORDS = frozenset(
        {
            "the",
            "a",
            "an",
            "and",
            "or",
            "but",
            "in",
            "on",
            "at",
            "to",
            "for",
            "of",
            "with",
            "by",
            "from",
            "as",
            "is",
            "was",
            "are",
            "were",
            "be",
            "been",
            "being",
            "have",
            "has",
            "had",
            "do",
            "does",
            "did",
        }
    )

    # Upper bound on the number of keywords returned by ``extract_keywords``.
    _MAX_KEYWORDS = 20

    # NOTE: project types are written as string annotations so that defining
    # the class never evaluates them at import time.
    def __init__(self, tool_runtime: "ToolRuntime"):
        """
        Initialize workspace indexer.

        Args:
            tool_runtime: Tool runtime instance used for every operation
        """
        self.tool_runtime = tool_runtime

    def bootstrap(self) -> Dict[str, Any]:
        """
        Bootstrap workspace (initial discovery).

        Lists the file tree (depth 4) and asks for ``git status`` in
        porcelain v1 format. The git command falls back to printing
        ``not-git`` when ``git status`` fails (presumably because the
        workspace is not a repository).

        Returns:
            Dictionary with keys ``file_tree`` and ``git_status``; each value
            is the command's stdout, or ``""`` when that command was not ok
        """
        tree_result = self.tool_runtime.list_tree(max_depth=4)
        git_status_result = self.tool_runtime.run(
            "git status --porcelain=v1 2>/dev/null || echo 'not-git'", timeout_sec=10
        )
        return {
            "file_tree": tree_result.stdout if tree_result.ok else "",
            "git_status": git_status_result.stdout if git_status_result.ok else "",
        }

    def list_tree(
        self, max_depth: int = 4, ignore_patterns: Optional[List[str]] = None
    ) -> "ToolResult":
        """
        List file tree (delegates to the tool runtime).

        Args:
            max_depth: Maximum depth
            ignore_patterns: Patterns to ignore

        Returns:
            ToolResult with file tree
        """
        return self.tool_runtime.list_tree(max_depth=max_depth, ignore_patterns=ignore_patterns)

    def search(
        self, query: str, glob: Optional[str] = None, max_results: int = 50
    ) -> "ToolResult":
        """
        Search the workspace (delegates to the tool runtime's ``search``).

        The runtime's search is described upstream as grep-based; that is not
        verifiable from this class.

        Args:
            query: Search query
            glob: Glob pattern
            max_results: Maximum results

        Returns:
            ToolResult with search results
        """
        return self.tool_runtime.search(query=query, glob=glob, max_results=max_results)

    def read_snippets(
        self,
        file_paths: List[str],
        context_lines: int = 80,
        max_total_size: int = 80 * 1024,  # 80KB
        max_file_lines: int = 300,
    ) -> List[Dict[str, Any]]:
        """
        Read the head of each file, within a total UTF-8 byte budget.

        Files are read in order with ``head -n max_file_lines`` (falling back
        to ``cat`` if ``head`` fails). Files whose command is not ok are
        skipped. When a file does not fit in the remaining budget it is
        truncated on a character boundary and reading stops.

        Args:
            file_paths: List of file paths to read
            context_lines: Accepted for interface compatibility; currently
                unused by this implementation
            max_total_size: Maximum total size in bytes (UTF-8) across all
                returned snippets
            max_file_lines: Maximum lines per file

        Returns:
            List of dictionaries with keys ``path``, ``content``, ``size``
            (UTF-8 byte length of ``content``) and ``lines``
        """
        import shlex

        snippets: List[Dict[str, Any]] = []
        total_size = 0
        for file_path in file_paths:
            if total_size >= max_total_size:
                break

            # Quote the path so spaces and shell metacharacters are inert.
            quoted_path = shlex.quote(file_path)
            result = self.tool_runtime.run(
                f"head -n {max_file_lines} {quoted_path} 2>/dev/null || cat {quoted_path} 2>/dev/null",
                timeout_sec=10,
            )
            if not result.ok:
                continue

            content = result.stdout
            encoded = content.encode("utf-8")
            remaining = max_total_size - total_size
            truncated = len(encoded) > remaining
            if truncated:
                # The budget is in bytes, so cut the encoded form; dropping a
                # split trailing character keeps the text valid and the
                # snippet within budget.
                content = encoded[:remaining].decode("utf-8", errors="ignore")
                encoded = content.encode("utf-8")
            content_size = len(encoded)

            snippets.append(
                {
                    "path": file_path,
                    "content": content,
                    "size": content_size,
                    "lines": len(content.splitlines()),
                }
            )
            total_size += content_size
            if truncated:
                # Budget exhausted (up to a partial character): stop reading.
                break
        return snippets

    def extract_keywords(self, text: str) -> List[str]:
        """
        Extract keywords from text (simple implementation).

        Candidates are, in this order: tokens starting with an uppercase
        letter, tokens made of lowercase letters/digits/underscores, and the
        remainder of any line following ``Error:``, ``Exception:`` or
        ``FAILED`` (case-insensitive). Duplicates are removed keeping the
        first occurrence, so the result is deterministic.

        Args:
            text: Text to extract keywords from

        Returns:
            Up to 20 keywords longer than two characters that are not common
            English stop words
        """
        import re

        candidates: List[str] = []

        # Identifier-like tokens.
        identifier_patterns = (
            r"\b[A-Z][a-zA-Z0-9]*\b",  # CamelCase
            r"\b[a-z_][a-z0-9_]*\b",  # snake_case
        )
        for pattern in identifier_patterns:
            candidates.extend(re.findall(pattern, text))

        # Error messages: keep the text after the marker, up to end of line.
        error_patterns = (
            r"Error:\s*([^\n]+)",
            r"Exception:\s*([^\n]+)",
            r"FAILED\s+([^\n]+)",
        )
        for pattern in error_patterns:
            candidates.extend(re.findall(pattern, text, re.IGNORECASE))

        # dict.fromkeys de-duplicates while preserving first-seen order, so
        # the slice below always keeps the same keywords for the same text.
        keywords = [
            keyword
            for keyword in dict.fromkeys(candidates)
            if len(keyword) > 2 and keyword.lower() not in self._COMMON_WORDS
        ]
        return keywords[: self._MAX_KEYWORDS]