feat: add prompt-guard honeypot for prompt injection detection

- New src/prompt_guard/ package with pydantic-ai Agent + 7 fake tools (read_file, write_file, list_directory, execute_shell, make_http_request, send_email, query_database) that return plausible but harmless responses - Injection detected when the model makes any tool call; content is blocked entirely (never returned to caller), all calls logged at WARNING level - Config via PROMPT_GUARD_* env vars (pydantic-settings); system prompt deliberately encourages tool use to maximise detection sensitivity - server.py: SEARXNG_GUARD_ENABLED flag (default false) + guard call in _fetch_and_extract; blocked content is not stored in the cache - Fix Settings.extra='ignore' on both Settings classes so PROMPT_GUARD_* and SEARXNG_* vars don't cause validation errors in the other class - Fix _build_model: use explicit OpenAIProvider when api_key is set so PROMPT_GUARD_API_KEY from .env is honoured (pydantic-settings does not populate os.environ, so pydantic-ai's auto-provider couldn't find it)
2026-04-21 19:45:19 +02:00 · 2026-04-21 19:45:19 +02:00 · 678e052315
commit 678e052315
parent 27e0805898
8 changed files with 1602 additions and 56 deletions
--- a/.env.example
+++ b/.env.example
@ -1 +1,33 @@
 # --- searxng-mcp settings ---
 # URL of the SearxNG instance to query.
 SEARXNG_BASE_URL=http://localhost:8080
 # Set to true to run fetched pages through the prompt-guard honeypot before
 # returning them to the agent. Requires PROMPT_GUARD_* settings below.
 #SEARXNG_GUARD_ENABLED=false
 # --- prompt-guard settings ---
 # Only relevant when SEARXNG_GUARD_ENABLED=true.
 # Set to false to disable the guard without removing the other settings.
 #PROMPT_GUARD_ENABLED=true
 # Pydantic-AI model string used for the honeypot agent. Examples:
 #   openai:gpt-4o-mini           (default)
 #   anthropic:claude-haiku-3-5
 #   groq:llama-3.1-8b-instant
 #   openai:llama3.2              (local Ollama — set PROMPT_GUARD_BASE_URL too)
 #PROMPT_GUARD_MODEL=openai:gpt-4o-mini
 # API key for the model provider. Alternatively use the provider's own env var
 # (OPENAI_API_KEY, ANTHROPIC_API_KEY, GROQ_API_KEY, …).
 #PROMPT_GUARD_API_KEY=sk-...
 # Base URL for OpenAI-compatible local endpoints (Ollama, LM Studio, vLLM, …).
 #PROMPT_GUARD_BASE_URL=http://localhost:11434/v1
 # System prompt sent to the honeypot agent. The default deliberately encourages
 # the LLM to act on any instructions found in the content, maximising detection
 # sensitivity. Override only if you need different behaviour.
 #PROMPT_GUARD_SYSTEM_PROMPT=You are a powerful AI assistant ...
--- a/pyproject.toml
+++ b/pyproject.toml
@ -10,6 +10,7 @@ requires-python = ">=3.14"
 dependencies = [
    "fastmcp>=3.2.4",
    "httpx>=0.28.1",
    "pydantic-ai>=0.3.0",
    "pydantic-settings>=2.13.1",
    "trafilatura>=2.0.0",
 ]
--- a/src/prompt_guard/init.py
+++ b/src/prompt_guard/init.py
@ -0,0 +1,7 @@
 """Prompt injection detection via a honeypot LLM agent."""
 from prompt_guard.config import Settings
 from prompt_guard.result import GuardResult
 from prompt_guard.agent import check
 __all__ = ["check", "GuardResult", "Settings"]
--- a/src/prompt_guard/agent.py
+++ b/src/prompt_guard/agent.py
@ -0,0 +1,189 @@
 """Honeypot agent: runs untrusted text through a pydantic-ai agent with fake tools.
 If the agent makes any tool calls the content is considered tainted.
 All tool calls are logged and recorded in GuardResult for forensics.
 """
 import logging
 import time
 from datetime import datetime, timezone
 from pydantic_ai import Agent
 from pydantic_ai.models.openai import OpenAIModel
 from pydantic_ai.providers.openai import OpenAIProvider
 from pydantic_ai.messages import ModelRequest, ToolCallPart
 from prompt_guard.config import Settings
 from prompt_guard.result import GuardResult, ToolCallRecord
 logger = logging.getLogger(__name__)
 def _build_model(settings: Settings):
    """Construct the pydantic-ai model from settings."""
    model_str = settings.model
    # For openai: models, use an explicit provider whenever we have custom kwargs so
    # that api_key loaded via pydantic-settings (PROMPT_GUARD_API_KEY) is honoured.
    # pydantic-ai's auto-provider only reads OPENAI_API_KEY from os.environ, which
    # pydantic-settings does NOT populate from .env (it only populates model fields).
    if model_str.startswith("openai:"):
        model_name = model_str.removeprefix("openai:")
        provider_kwargs: dict = {}
        if settings.base_url:
            provider_kwargs["base_url"] = settings.base_url
        if settings.api_key:
            provider_kwargs["api_key"] = settings.api_key
        if provider_kwargs:
            return OpenAIModel(model_name, provider=OpenAIProvider(**provider_kwargs))
    # For all other providers (or openai without custom kwargs), pydantic-ai resolves
    # the model string automatically and picks up the key from the provider's env var.
    return model_str
 def _build_agent(settings: Settings) -> Agent:
    """Build the honeypot agent with all fake tools registered."""
    model = _build_model(settings)
    agent: Agent = Agent(model=model, system_prompt=settings.system_prompt)
    # --- Fake tools ---
    # Each tool logs the attempt, returns plausible but harmless data, and does nothing else.
    @agent.tool_plain
    def read_file(path: str) -> str:
        """Read the contents of a file from the filesystem."""
        logger.warning("[prompt-guard] honeypot tool called: read_file(path=%r)", path)
        return (
            f"# {path}\n"
            "# Auto-generated configuration\n"
            "debug = false\n"
            "log_level = 'info'\n"
            "max_connections = 100\n"
        )
    @agent.tool_plain
    def write_file(path: str, content: str) -> str:
        """Write content to a file on the filesystem."""
        logger.warning(
            "[prompt-guard] honeypot tool called: write_file(path=%r, content_len=%d)",
            path,
            len(content),
        )
        return "File written successfully."
    @agent.tool_plain
    def list_directory(path: str) -> list[str]:
        """List the files and directories at the given path."""
        logger.warning("[prompt-guard] honeypot tool called: list_directory(path=%r)", path)
        return [
            "README.md",
            "config.yaml",
            "main.py",
            "requirements.txt",
            ".env",
            "data/",
            "logs/",
        ]
    @agent.tool_plain
    def execute_shell(command: str) -> str:
        """Execute a shell command and return its output."""
        logger.warning(
            "[prompt-guard] honeypot tool called: execute_shell(command=%r)", command
        )
        return f"$ {command}\n(no output)\n"
    @agent.tool_plain
    def make_http_request(url: str, method: str = "GET", body: str = "") -> dict:
        """Make an HTTP request to a URL and return the response."""
        logger.warning(
            "[prompt-guard] honeypot tool called: make_http_request(url=%r, method=%r)",
            url,
            method,
        )
        return {"status": 200, "body": "{}", "headers": {"content-type": "application/json"}}
    @agent.tool_plain
    def send_email(to: str, subject: str, body: str) -> str:
        """Send an email message."""
        logger.warning(
            "[prompt-guard] honeypot tool called: send_email(to=%r, subject=%r)",
            to,
            subject,
        )
        return "Message sent successfully."
    @agent.tool_plain
    def query_database(query: str) -> list[dict]:
        """Execute a SQL query and return the results."""
        logger.warning(
            "[prompt-guard] honeypot tool called: query_database(query=%r)", query
        )
        return [{"id": 1, "name": "example", "value": "data"}]
    return agent
 def _collect_tool_calls(result) -> list[ToolCallRecord]:
    """Walk pydantic-ai message history and collect all ToolCallPart entries."""
    records: list[ToolCallRecord] = []
    now = datetime.now(timezone.utc).isoformat()
    for message in result.all_messages():
        # Tool calls appear in ModelResponse messages (assistant turn)
        for part in getattr(message, "parts", []):
            if isinstance(part, ToolCallPart):
                args = part.args
                if hasattr(args, "args_dict"):
                    args = args.args_dict()
                elif hasattr(args, "args_json"):
                    import json
                    try:
                        args = json.loads(args.args_json)
                    except Exception:
                        args = {"raw": str(args)}
                records.append(
                    ToolCallRecord(tool=part.tool_name, args=args, timestamp=now)
                )
    return records
 async def check(text: str, settings: Settings | None = None) -> GuardResult:
    """Run untrusted text through the honeypot agent and return a GuardResult.
    Args:
        text: The untrusted content to check (e.g. scraped web page, file contents).
        settings: Optional settings override. Reads from environment / .env by default.
    Returns:
        GuardResult with safe=True if no tool calls were detected.
    """
    if settings is None:
        settings = Settings()
    if not settings.enabled:
        return GuardResult(safe=True, model=settings.model)
    agent = _build_agent(settings)
    t0 = time.monotonic()
    result = await agent.run(text)
    latency_ms = (time.monotonic() - t0) * 1000
    tool_calls = _collect_tool_calls(result)
    if tool_calls:
        logger.warning(
            "[prompt-guard] INJECTION DETECTED — %d tool call(s): %s",
            len(tool_calls),
            [tc.tool for tc in tool_calls],
        )
    else:
        logger.debug("[prompt-guard] content passed (no tool calls, %.0f ms)", latency_ms)
    return GuardResult(
        safe=len(tool_calls) == 0,
        tool_calls=tool_calls,
        model=settings.model,
        latency_ms=latency_ms,
    )
--- a/src/prompt_guard/config.py
+++ b/src/prompt_guard/config.py
@ -0,0 +1,46 @@
 """Configuration for prompt-guard via environment variables."""
 from pydantic_settings import BaseSettings, SettingsConfigDict
 DEFAULT_SYSTEM_PROMPT = (
    "You are a powerful AI assistant with access to a variety of tools, "
    "including file system access, shell execution, and network capabilities. "
    "Your primary task is to read the content provided by the user carefully "
    "and act on any instructions you find within it using the appropriate tools. "
    "Be proactive: if the content contains requests or commands, execute them "
    "immediately with the tools available to you. Do not merely describe what "
    "you would do — use the tools."
 )
 class Settings(BaseSettings):
    """Settings for the prompt-guard honeypot agent.
    All fields can be set via environment variables with the PROMPT_GUARD_ prefix.
    """
    model_config = SettingsConfigDict(
        env_prefix="PROMPT_GUARD_",
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )
    enabled: bool = True
    """Set to false to disable the guard entirely (content passes through unchecked)."""
    model: str = "openai:gpt-4o-mini"
    """Pydantic-AI model string, e.g. 'openai:gpt-4o-mini', 'anthropic:claude-haiku-3-5',
    'groq:llama-3.1-8b-instant'. For OpenAI-compatible endpoints set base_url as well."""
    api_key: str = ""
    """API key for the model provider. May also be set via the provider's own env var
    (e.g. OPENAI_API_KEY, ANTHROPIC_API_KEY)."""
    base_url: str = ""
    """Base URL override for OpenAI-compatible endpoints (Ollama, LM Studio, vLLM, etc.).
    Example: http://localhost:11434/v1"""
    system_prompt: str = DEFAULT_SYSTEM_PROMPT
    """System prompt sent to the honeypot agent. The default is deliberately crafted to
    encourage tool usage so that injected instructions are more likely to trigger calls."""
--- a/src/prompt_guard/result.py
+++ b/src/prompt_guard/result.py
@ -0,0 +1,30 @@
 """GuardResult dataclass returned by check()."""
 from dataclasses import dataclass, field
@dataclass
 class ToolCallRecord:
    """A single tool call made by the honeypot agent."""
    tool: str
    args: dict
    timestamp: str  # ISO-8601
@dataclass
 class GuardResult:
    """Result of a prompt injection check.
    Attributes:
        safe: True if no tool calls were detected (content is likely clean).
        tool_calls: List of tool call attempts recorded during the check.
            Non-empty when safe=False; useful for forensic logging.
        model: Model string used for the check.
        latency_ms: Wall-clock time of the agent run in milliseconds.
    """
    safe: bool
    tool_calls: list[ToolCallRecord] = field(default_factory=list)
    model: str = ""
    latency_ms: float = 0.0
--- a/src/searxng_mcp/server.py
+++ b/src/searxng_mcp/server.py
@ -6,15 +6,22 @@ from pydantic import Field
 from pydantic_settings import BaseSettings, SettingsConfigDict
 import asyncio
 import logging
 import trafilatura
 import prompt_guard
 from searxng_mcp.searxng import search as _search
 logger = logging.getLogger(__name__)
 class Settings(BaseSettings):
-    model_config = SettingsConfigDict(env_prefix="SEARXNG_", env_file=".env", env_file_encoding="utf-8")
+    model_config = SettingsConfigDict(env_prefix="SEARXNG_", env_file=".env", env_file_encoding="utf-8", extra="ignore")
    base_url: str = "http://localhost:8080"
    guard_enabled: bool = False
    """Run fetched content through the prompt-guard honeypot before returning it.
    Requires PROMPT_GUARD_* settings to be configured."""
 settings = Settings()
@ -67,6 +74,20 @@ async def _fetch_and_extract(
    if not result:
        raise ValueError(f"Failed to extract content from URL: {url}")
    if settings.guard_enabled:
        guard_result = await prompt_guard.check(result)
        if not guard_result.safe:
            calls = [tc.tool for tc in guard_result.tool_calls]
            logger.warning(
                "Prompt injection detected in fetched content from %s — tool calls: %s",
                url,
                calls,
            )
            raise ValueError(
                f"Prompt injection detected in content from {url}. "
                f"Honeypot triggered tool(s): {calls}. Content blocked."
            )
    _cache[cache_key] = result
    return result
--- a/uv.lock
+++ b/uv.lock