Source code for atloop.retrieval.indexer

"""Workspace indexer for file tree and search."""

from typing import Any, Dict, List, Optional

from atloop.runtime import ToolResult
from atloop.tools.runtime import ToolRuntime



[docs]
class WorkspaceIndexer:
    """
    Workspace indexer for file tree and search.

    Holds a tool runtime and delegates workspace inspection to it: listing the
    file tree, searching, running shell commands to read git status and file
    contents. Keyword extraction is plain text processing and does not touch
    the runtime.
    """


[docs]
    def __init__(self, tool_runtime: ToolRuntime):
        """
        Initialize workspace indexer.

        Args:
            tool_runtime: Tool runtime instance
        """
        self.tool_runtime = tool_runtime



[docs]
    def bootstrap(self) -> Dict[str, Any]:
        """
        Bootstrap workspace (initial discovery).

        Returns:
            Dictionary with workspace information
        """
        # List file tree
        tree_result = self.tool_runtime.list_tree(max_depth=4)

        # Get git status
        git_status_result = self.tool_runtime.run(
            "git status --porcelain=v1 2>/dev/null || echo 'not-git'", timeout_sec=10
        )

        return {
            "file_tree": tree_result.stdout if tree_result.ok else "",
            "git_status": git_status_result.stdout if git_status_result.ok else "",
        }



[docs]
    def list_tree(
        self, max_depth: int = 4, ignore_patterns: Optional[List[str]] = None
    ) -> ToolResult:
        """
        List file tree.

        Args:
            max_depth: Maximum depth
            ignore_patterns: Patterns to ignore

        Returns:
            ToolResult with file tree
        """
        return self.tool_runtime.list_tree(max_depth=max_depth, ignore_patterns=ignore_patterns)



[docs]
    def search(self, query: str, glob: Optional[str] = None, max_results: int = 50) -> ToolResult:
        """
        Search using grep (common Linux tool).

        Args:
            query: Search query
            glob: Glob pattern
            max_results: Maximum results

        Returns:
            ToolResult with search results
        """
        return self.tool_runtime.search(query=query, glob=glob, max_results=max_results)



[docs]
    def read_snippets(
        self,
        file_paths: List[str],
        context_lines: int = 80,
        max_total_size: int = 80 * 1024,  # 80KB
        max_file_lines: int = 300,
    ) -> List[Dict[str, Any]]:
        """
        Read file snippets with context.

        Args:
            file_paths: List of file paths to read
            context_lines: Number of context lines around matches
            max_total_size: Maximum total size in bytes
            max_file_lines: Maximum lines per file

        Returns:
            List of file snippet dictionaries
        """
        snippets = []
        total_size = 0

        for file_path in file_paths:
            if total_size >= max_total_size:
                break

            # Read file using system command
            import shlex

            file_path_escaped = shlex.quote(file_path)
            result = self.tool_runtime.run(
                f"head -n {max_file_lines} {file_path_escaped} 2>/dev/null || cat {file_path_escaped} 2>/dev/null",
                timeout_sec=10,
            )

            if result.ok:
                content = result.stdout
                content_size = len(content.encode("utf-8"))

                if total_size + content_size > max_total_size:
                    # Truncate if needed
                    remaining = max_total_size - total_size
                    content = content[:remaining]
                    content_size = remaining

                snippets.append(
                    {
                        "path": file_path,
                        "content": content,
                        "size": content_size,
                        "lines": len(content.splitlines()),
                    }
                )

                total_size += content_size

        return snippets



[docs]
    def extract_keywords(self, text: str) -> List[str]:
        """
        Extract keywords from text (simple implementation).

        Candidates are collected in this order: capitalized alphanumeric words
        (CamelCase-style), lowercase/underscore identifiers (snake_case-style),
        then the remainder of any line following ``Error:``, ``Exception:`` or
        ``FAILED`` (matched case-insensitively). Duplicates are dropped while
        keeping first-seen order, so the result - including which keywords
        survive the 20-item cap - is the same on every run.

        Args:
            text: Text to extract keywords from

        Returns:
            Up to 20 unique keywords longer than two characters, excluding
            common English stop words, in first-seen order
        """
        import re

        identifier_patterns = (
            r"\b[A-Z][a-zA-Z0-9]*\b",  # CamelCase
            r"\b[a-z_][a-z0-9_]*\b",  # snake_case
        )
        error_patterns = (
            r"Error:\s*([^\n]+)",
            r"Exception:\s*([^\n]+)",
            r"FAILED\s+([^\n]+)",
        )

        candidates: List[str] = []
        for pattern in identifier_patterns:
            candidates.extend(re.findall(pattern, text))
        for pattern in error_patterns:
            candidates.extend(re.findall(pattern, text, re.IGNORECASE))

        common_words = frozenset(
            {
                "the",
                "a",
                "an",
                "and",
                "or",
                "but",
                "in",
                "on",
                "at",
                "to",
                "for",
                "of",
                "with",
                "by",
                "from",
                "as",
                "is",
                "was",
                "are",
                "were",
                "be",
                "been",
                "being",
                "have",
                "has",
                "had",
                "do",
                "does",
                "did",
            }
        )

        # dict.fromkeys de-duplicates while preserving insertion order; a set
        # would make both the order and the 20-item cut-off arbitrary.
        keywords = [
            keyword
            for keyword in dict.fromkeys(candidates)
            if len(keyword) > 2 and keyword.lower() not in common_words
        ]

        return keywords[:20]  # Limit to 20 keywords