Initial commit

2026-04-20 11:42:25 +02:00 · 2026-04-20 11:42:25 +02:00 · 8885c1872f
commit 8885c1872f
14 changed files with 1990 additions and 0 deletions
--- a/.env.example
+++ b/.env.example
@ -0,0 +1 @@
 SEARXNG_BASE_URL=http://localhost:8080
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,13 @@
 # Python-generated files
 __pycache__/
 *.py[oc]
 build/
 dist/
 wheels/
 *.egg-info
 # Virtual environments
 .venv
 # Environment variables
 .env
--- a/.opencode/skills/mcp-forge-conventions/SKILL.md
+++ b/.opencode/skills/mcp-forge-conventions/SKILL.md
@ -0,0 +1,76 @@
 ---
 name: mcp-forge-conventions
 description: How to call MCP tools from within mcp-forge execute_python scripts, including tool naming and injection syntax
 ---
 # mcp-forge Conventions
 ## Tool naming
 mcp-forge injects tools using their **bare function name**, not the namespaced name visible to the agent.
 | Agent-side name | mcp-forge `mcp_tools` value | In-script call |
 |---|---|---|
 | `searxng_search` | `"search"` | `search(...)` |
 | `searxng_fetch` | `"fetch"` | `fetch(...)` |
 | `rag-mcp_browse_documents` | `"browse_documents"` | `browse_documents(...)` |
 | `rag-mcp_search_records` | `"search_records"` | `search_records(...)` |
 The pattern: strip any server prefix (e.g. `searxng_`, `rag-mcp_`) and use only the function name.
 ## Injection syntax
 Pass a JSON array of bare tool names to `mcp_tools`:
 ```python
 mcp-forge_execute_python(
    code='results = search(query="foo"); print(results)',
    mcp_tools=["search", "fetch"]
 )
 ```
 ## Listing all available tools
 Use the agent-side `mcp-forge_list_injectable_tools` tool to get the full catalogue before writing scripts:
 ```
 mcp-forge_list_injectable_tools(include_schemas=false)
 ```
 Returns each tool's `tool_name` (injected name), `qualified_name` (`provider.tool`), and provider metadata (name, transport, url). Only tools whose providers are registered in mcp-forge's own config appear here — tools available to the OpenCode agent from other MCP servers (e.g. GitHub) are NOT automatically available inside mcp-forge.
 ## Verifying a single tool name
 To confirm a specific tool name resolves before using it, pass it in `mcp_tools` together with a trivial script. If the name resolves, it appears in the `available_tools` list of the response; if it does not, the whole call fails with a "Tool '<name>' not found" error.
 ```python
 mcp-forge_execute_python(
    code='print("ok")',
    mcp_tools=["search"]
 )
 # response includes: "available_tools": ["search"]
 # if the name is wrong, the whole call errors with "Tool '<name>' not found"
 ```
 ## Return values
 Injected tools return Python objects (lists, dicts). Handle both a direct value and a dict wrapper:
 ```python
 data = search(query="foo")
 records = data.get("result", []) if isinstance(data, dict) else data
 ```
 ## Combining searxng + mcp-forge
 ```python
 mcp-forge_execute_python(
    code='''
 results = search(query="uv python", language="en")
 top = results[0]
 page = fetch(url=top["url"], max_chars=2000)
 print(page["content"])
 ''',
    mcp_tools=["search", "fetch"]
 )
 ```
--- a/.python-version
+++ b/.python-version
@ -0,0 +1 @@
 3.14
--- a/README.md
+++ b/README.md
--- a/1
+++ b/1
@ -0,0 +1 @@
 /home/hans/software/fastmcp/docs/
--- a/pyproject.toml
+++ b/pyproject.toml
@ -0,0 +1,22 @@
 [project]
 name = "searxng-mcp"
 version = "0.1.0"
 description = "MCP server exposing SearxNG web search as a tool"
 readme = "README.md"
 authors = [
    { name = "Hans Aschauer", email = "hans.git@ch23.de" }
 ]
 requires-python = ">=3.14"
 dependencies = [
    "fastmcp>=3.2.4",
    "httpx>=0.28.1",
    "pydantic-settings>=2.13.1",
    "trafilatura>=2.0.0",
 ]
 [project.scripts]
 searxng-mcp = "searxng_mcp.__main__:main"
 [build-system]
 requires = ["uv_build>=0.10.8,<0.11.0"]
 build-backend = "uv_build"
--- a/scripts/ingest_fastmcp_docs.py
+++ b/scripts/ingest_fastmcp_docs.py
@ -0,0 +1,221 @@
 """
 Ingest FastMCP documentation into rag-mcp.
 Walks fast_mcp_docs/, reads each .mdx/.md file, and adds it as a record
 in a rag-mcp document. Runs directly against the rag-mcp HTTP MCP endpoint.
 Usage:
    uv run scripts/ingest_fastmcp_docs.py [--dry-run] [--rag-url URL] [--limit N]
 """
 import argparse
 import json
 import os
 import sys
 import time
 from pathlib import Path
 import httpx
 # Directory holding the FastMCP docs: "fast_mcp_docs" two levels above this script file.
 DOCS_DIR = Path(__file__).parent.parent / "fast_mcp_docs"
 # Default rag-mcp MCP endpoint; can be overridden with --rag-url.
 DEFAULT_RAG_URL = "http://localhost:8006/mcp"
 # "source" value used to find an existing rag-mcp document or to create a new one.
 DOC_SOURCE = "fastmcp-docs"
 # Description and tags sent only when the document has to be created.
 DOC_DESCRIPTION = "FastMCP Python library documentation (prefecthq/fastmcp)"
 DOC_TAGS = ["fastmcp", "mcp", "python", "docs"]
 class RagMcpClient:
    """Minimal synchronous client for rag-mcp HTTP MCP endpoint.
    Sends JSON-RPC 2.0 requests via HTTP POST and accepts the reply either as a
    plain JSON body or as a Server-Sent Events (SSE) stream. The client can be
    used as a context manager so the HTTP connection pool is always released.
    """
    def __init__(self, url: str):
        """Prepare a client for the MCP endpoint at *url*; no request is sent yet."""
        self.url = url
        # Session id returned by the server on initialize; echoed on later requests.
        self.session_id: str | None = None
        # Counter backing the JSON-RPC request ids.
        self._id = 0
        self.client = httpx.Client(timeout=60.0)
    def __enter__(self) -> "RagMcpClient":
        """Return the client itself so it can be used in a ``with`` statement."""
        return self
    def __exit__(self, *exc_info: object) -> None:
        """Close the HTTP client when the ``with`` block exits."""
        self.close()
    def _next_id(self) -> int:
        """Return the next JSON-RPC request id (1, 2, 3, ...)."""
        self._id += 1
        return self._id
    def _headers(self) -> dict:
        """Build the request headers, adding the session id once one is known."""
        h = {
            "Content-Type": "application/json",
            "Accept": "application/json, text/event-stream",
        }
        if self.session_id:
            h["Mcp-Session-Id"] = self.session_id
        return h
    def _parse_sse(self, text: str) -> dict:
        """Extract the JSON payload from an SSE (or plain JSON) response body.
        The JSON-RPC response (a message carrying "result" or "error") is
        preferred; if the stream holds none, the first JSON message is returned.
        Raises:
            ValueError: if the body contains no JSON payload at all.
        """
        body = text.strip()
        # The Accept header also allows "application/json", so the server may
        # answer with a bare JSON document instead of an event stream.
        if body.startswith(("{", "[")):
            return json.loads(body)
        first_message = None
        for line in text.splitlines():
            if not line.startswith("data:"):
                continue
            # The space after "data:" is optional in the SSE format, and events
            # with an empty data field carry no message.
            data = line[5:].strip()
            if not data:
                continue
            message = json.loads(data)
            # Notifications may be streamed ahead of the actual response.
            if isinstance(message, dict) and ("result" in message or "error" in message):
                return message
            if first_message is None:
                first_message = message
        if first_message is not None:
            return first_message
        raise ValueError(f"No data line in SSE response: {text[:200]}")
    def initialize(self) -> None:
        """Perform the MCP handshake and remember the session id.
        Raises:
            httpx.HTTPStatusError: on a non-2xx HTTP response.
            RuntimeError: if the server answers initialize with a JSON-RPC error.
        """
        payload = {
            "jsonrpc": "2.0",
            "method": "initialize",
            "params": {
                "protocolVersion": "2024-11-05",
                "capabilities": {},
                "clientInfo": {"name": "ingest-fastmcp-docs", "version": "1.0"},
            },
            "id": self._next_id(),
        }
        resp = self.client.post(self.url, json=payload, headers=self._headers())
        resp.raise_for_status()
        self.session_id = resp.headers.get("mcp-session-id")
        result = self._parse_sse(resp.text)
        if "error" in result:
            raise RuntimeError(f"initialize failed: {result['error']}")
        # The MCP lifecycle requires the client to confirm the handshake with an
        # "initialized" notification before it issues normal requests.
        ack = self.client.post(
            self.url,
            json={"jsonrpc": "2.0", "method": "notifications/initialized"},
            headers=self._headers(),
        )
        ack.raise_for_status()
        print(f"[rag-mcp] Session: {self.session_id}")
    def call_tool(self, name: str, arguments: dict) -> dict:
        """Call the MCP tool *name* with *arguments* and return its decoded result.
        When the first content item is text it is parsed as JSON and returned;
        otherwise the raw JSON-RPC "result" object is returned.
        Raises:
            httpx.HTTPStatusError: on a non-2xx HTTP response.
            RuntimeError: on a JSON-RPC error or a tool result flagged "isError".
        """
        payload = {
            "jsonrpc": "2.0",
            "method": "tools/call",
            "params": {"name": name, "arguments": arguments},
            "id": self._next_id(),
        }
        resp = self.client.post(self.url, json=payload, headers=self._headers())
        resp.raise_for_status()
        result = self._parse_sse(resp.text)
        if "error" in result:
            raise RuntimeError(f"tools/call {name} failed: {result['error']}")
        # Unwrap MCP content envelope
        outcome = result.get("result", {})
        content = outcome.get("content", [])
        text = None
        if content and content[0].get("type") == "text":
            text = content[0]["text"]
        # Tool-level failures arrive as a normal result flagged "isError" whose
        # text is an error message, not JSON; report it instead of decoding it.
        if outcome.get("isError"):
            raise RuntimeError(f"tools/call {name} failed: {text if text is not None else outcome}")
        if text is not None:
            return json.loads(text)
        return outcome
    def close(self) -> None:
        """Close the underlying HTTP client."""
        self.client.close()
 def find_doc_files(docs_dir: Path) -> list[Path]:
    """Return every .mdx/.md file below *docs_dir*, sorted by path."""
    matches = [
        Path(dirpath) / name
        for dirpath, _subdirs, names in os.walk(docs_dir)
        for name in names
        if name.endswith((".mdx", ".md"))
    ]
    # One sort over the complete paths fixes the final order.
    matches.sort()
    return matches
 def derive_title(rel_path: Path, content: str) -> str:
    """Extract title from first heading or fall back to filename.
    Scans *content* line by line and returns the first non-empty title found,
    either a level-one heading ("# Title") or a "title:" line (surrounding
    quotes removed). Lines inside fenced code blocks are skipped. When nothing
    matches, the title is derived from the stem of *rel_path* with dashes and
    underscores turned into spaces and title-cased.
    """
    in_fence = False
    for line in content.splitlines():
        line = line.strip()
        # Inside a fenced code block a "# ..." line is a shell/Python comment,
        # not a heading, so fenced regions must not contribute a title.
        if line.startswith(("```", "~~~")):
            in_fence = not in_fence
            continue
        if in_fence:
            continue
        if line.startswith("# "):
            title = line[2:].strip()
        elif line.startswith("title:"):
            title = line[6:].strip().strip('"').strip("'")
        else:
            continue
        # An empty "title:" value is useless as a title; keep looking.
        if title:
            return title
    return rel_path.stem.replace("-", " ").replace("_", " ").title()
 def derive_section(rel_path: Path) -> str:
    """Return the first path component as the section, or "root" for a bare filename."""
    segments = rel_path.parts
    if len(segments) <= 1:
        return "root"
    return segments[0]
 def main() -> None:
    """Ingest the FastMCP docs found under DOCS_DIR into a rag-mcp document.
    Command-line options: --dry-run lists the discovered files and exits,
    --rag-url selects the rag-mcp endpoint, --limit caps the number of files.
    The document whose source equals DOC_SOURCE is reused when it exists and
    created otherwise; every file is then added as one record. Per-file
    failures are printed and counted without aborting the run.
    """
    parser = argparse.ArgumentParser(description="Ingest FastMCP docs into rag-mcp")
    parser.add_argument("--dry-run", action="store_true", help="List files only, no ingestion")
    parser.add_argument("--rag-url", default=DEFAULT_RAG_URL, help="rag-mcp MCP endpoint")
    parser.add_argument("--limit", type=int, default=0, help="Max files to ingest (0=all)")
    args = parser.parse_args()
    files = find_doc_files(DOCS_DIR)
    print(f"Found {len(files)} doc files in {DOCS_DIR}")
    if args.dry_run:
        for f in files:
            print(f"  {f.relative_to(DOCS_DIR)}")
        return
    if args.limit:
        files = files[: args.limit]
        print(f"Limiting to {args.limit} files")
    client = RagMcpClient(args.rag_url)
    ok = 0
    errors = 0
    # try/finally so the HTTP client is closed even when the handshake or the
    # document lookup/creation raises.
    try:
        client.initialize()
        # Find or create the document
        print("Looking for existing fastmcp-docs document...")
        # NOTE(review): only the first page (50 documents) is inspected; an
        # existing document on a later page would not be found.
        docs_list = client.call_tool("browse_documents", {"page": 1, "page_size": 50})
        # A dict-wrapped result must be unwrapped first: iterating the dict
        # itself yields its keys, so the lookup could never match and a
        # duplicate document would be created on every run.
        # NOTE(review): the wrapper key is presumably "result" - confirm against rag-mcp.
        if isinstance(docs_list, dict):
            docs_list = docs_list.get("result") or []
        existing_doc = next(
            (doc for doc in docs_list if isinstance(doc, dict) and doc.get("source") == DOC_SOURCE),
            None,
        )
        if existing_doc:
            doc_id = existing_doc["id"]
            print(f"Using existing document id={doc_id}")
        else:
            print("Creating new document...")
            new_doc = client.call_tool(
                "add_document",
                {
                    "source": DOC_SOURCE,
                    "tags": DOC_TAGS,
                    "description": DOC_DESCRIPTION,
                    "meta": {"repo": "prefecthq/fastmcp", "local_path": str(DOCS_DIR)},
                },
            )
            doc_id = new_doc["id"]
            print(f"Created document id={doc_id}")
        # Ingest each file
        for index, fpath in enumerate(files, start=1):
            rel = fpath.relative_to(DOCS_DIR)
            try:
                content = fpath.read_text(encoding="utf-8")
            except (OSError, UnicodeError) as e:
                print(f"  [SKIP] {rel}: read error: {e}")
                errors += 1
                continue
            title = derive_title(rel, content)
            section = derive_section(rel)
            # Broad catch on purpose: one failing record (HTTP, JSON-RPC or
            # decoding error) must not abort the remaining files.
            try:
                result = client.call_tool(
                    "add_record_fields",
                    {
                        "document_id": doc_id,
                        "fields": {
                            "title": title,
                            "path": str(rel),
                            "content": content,
                        },
                        "metadata": {
                            "section": section,
                            "path": str(rel),
                            "title": title,
                        },
                        "config": {
                            "chunk_size": 800,
                            "overlap": 80,
                            "embed_full_field": True,
                            "generate_snippets": True,
                        },
                    },
                )
                ok += 1
                # Progress is reported for every tenth file only.
                if index % 10 == 0:
                    print(f"  [{index}/{len(files)}] {rel} -> record_id={result.get('record_id')}")
            except Exception as e:
                print(f"  [ERROR] {rel}: {e}")
                errors += 1
                time.sleep(1)  # back off on error
    finally:
        client.close()
    print(f"\nDone: {ok} ingested, {errors} errors (document id={doc_id})")
 # Run the ingestion only when this file is executed as a script, not when it is imported.
 if __name__ == "__main__":
    main()
--- a/src/searxng_mcp/__init__.py
+++ b/src/searxng_mcp/__init__.py
@ -0,0 +1,5 @@
 """SearxNG MCP — package entry point."""
 from searxng_mcp.server import mcp
 __all__ = ["mcp"]
--- a/src/searxng_mcp/__main__.py
+++ b/src/searxng_mcp/__main__.py
@ -0,0 +1,41 @@
 """CLI entry point for the SearxNG MCP server."""
 import argparse
 from searxng_mcp.server import mcp
 def main() -> None:
    """Parse the command line and start the MCP server on the chosen transport."""
    cli = argparse.ArgumentParser(prog="searxng-mcp", description="SearxNG MCP server")
    cli.add_argument(
        "--transport",
        choices=["stdio", "http", "sse"],
        default="stdio",
        help="Transport protocol (default: stdio)",
    )
    cli.add_argument(
        "--host",
        default="127.0.0.1",
        help="Host to bind when using http/sse transport (default: 127.0.0.1)",
    )
    cli.add_argument(
        "--port",
        type=int,
        default=8000,
        help="Port to bind when using http/sse transport (default: 8000)",
    )
    opts = cli.parse_args()
    # Host and port are only forwarded for the network transports.
    if opts.transport in ("http", "sse"):
        mcp.run(transport=opts.transport, host=opts.host, port=opts.port)
    else:
        mcp.run(transport=opts.transport)
 # Start the server only when this module is executed directly, not when it is imported.
 if __name__ == "__main__":
    main()
--- a/src/searxng_mcp/py.typed
+++ b/src/searxng_mcp/py.typed
--- a/src/searxng_mcp/searxng.py
+++ b/src/searxng_mcp/searxng.py
@ -0,0 +1,36 @@
 """HTTP client for the SearxNG search API."""
 from typing import Any
 import httpx
 async def search(
    base_url: str,
    query: str,
    categories: str | None = None,
    engines: str | None = None,
    language: str | None = None,
    pageno: int = 1,
    time_range: str | None = None,
    safesearch: int = 0,
 ) -> dict[str, Any]:
    """Query the /search endpoint of a SearxNG instance and return the decoded JSON.
    The optional filters (categories, engines, language, time_range) are sent
    only when they have a truthy value. A non-2xx response raises through
    ``raise_for_status``.
    """
    params: dict[str, Any] = {
        "q": query,
        "format": "json",
        "pageno": pageno,
        "safesearch": safesearch,
    }
    optional_filters = {
        "categories": categories,
        "engines": engines,
        "language": language,
        "time_range": time_range,
    }
    # Unset filters are left out of the query string entirely.
    params.update({key: value for key, value in optional_filters.items() if value})
    endpoint = f"{base_url.rstrip('/')}/search"
    async with httpx.AsyncClient() as http:
        reply = await http.get(endpoint, params=params)
        reply.raise_for_status()
        return reply.json()
--- a/src/searxng_mcp/server.py
+++ b/src/searxng_mcp/server.py
@ -0,0 +1,222 @@
 """SearxNG MCP server."""
 from typing import Annotated, Literal
 from fastmcp import FastMCP
 from pydantic import Field
 from pydantic_settings import BaseSettings, SettingsConfigDict
 import asyncio
 import trafilatura
 from searxng_mcp.searxng import search as _search
 class Settings(BaseSettings):
    """Server configuration, configured with the SEARXNG_ env prefix and a UTF-8 ".env" file."""
    model_config = SettingsConfigDict(env_prefix="SEARXNG_", env_file=".env", env_file_encoding="utf-8")
    # Base URL of the SearxNG instance that search requests are sent to.
    base_url: str = "http://localhost:8080"
 # Configuration is resolved once, when this module is imported.
 settings = Settings()
 # Server object; the tools and the resource defined in this module are registered on it through decorators.
 mcp = FastMCP(
    "SearxNG Search",
    instructions=(
        "Use the search tool to query the web via a SearxNG instance. "
        "Prefer specific queries and use categories/time_range to narrow results. "
        "Use the fetch tool to retrieve a page preview (first N chars). "
        "If the page is truncated and you need more, read the resource "
        "web://fetch?url=<url>&start=<n>&end=<m> to get a specific character slice. "
        "Pages are cached after the first fetch; pass use_cache=false to force a refresh."
    ),
 )
 # In-memory cache: (url, output_format, include_tables, include_images, include_links) -> content
 # The key holds every option that changes the extracted text, so different
 # extractions of the same URL are stored as separate entries.
 _cache: dict[tuple, str] = {}
 # Upper bound on cached pages; once exceeded, the oldest entries are evicted first.
 _CACHE_MAX_ENTRIES = 128
 async def _fetch_and_extract(
    url: str,
    output_format: str = "markdown",
    include_tables: bool = True,
    include_images: bool = False,
    include_links: bool = False,
    use_cache: bool = True,
 ) -> str:
    """Shared fetch+extract logic used by both the tool and resource.
    Downloads *url* with trafilatura, extracts its main content in
    *output_format* and stores the result in the module-level cache.
    Args:
        url: Page to download.
        output_format: Format passed to trafilatura; metadata is included for "json".
        include_tables: Keep tables in the extracted content.
        include_images: Keep images in the extracted content.
        include_links: Keep links in the extracted content.
        use_cache: Return a cached extraction when one exists; when false the
            page is downloaded again and the cache entry is refreshed.
    Returns:
        The extracted content as a string.
    Raises:
        ValueError: if the download or the extraction yields nothing.
    """
    cache_key = (url, output_format, include_tables, include_images, include_links)
    if use_cache and cache_key in _cache:
        return _cache[cache_key]
    # trafilatura is synchronous; worker threads keep the event loop responsive.
    downloaded = await asyncio.to_thread(trafilatura.fetch_url, url)
    if not downloaded:
        raise ValueError(f"Failed to fetch URL: {url}")
    result = await asyncio.to_thread(
        trafilatura.extract,
        downloaded,
        url=url,
        output_format=output_format,
        include_tables=include_tables,
        include_images=include_images,
        include_links=include_links,
        with_metadata=output_format == "json",
    )
    if not result:
        raise ValueError(f"Failed to extract content from URL: {url}")
    # Re-insert so a refreshed entry counts as the newest, then drop the oldest
    # entries (dicts keep insertion order) to stop the cache growing without bound.
    _cache.pop(cache_key, None)
    _cache[cache_key] = result
    while len(_cache) > _CACHE_MAX_ENTRIES:
        _cache.pop(next(iter(_cache)))
    return result
@mcp.tool
 async def search(
    query: Annotated[str, Field(description="Search query string.")],
    categories: Annotated[
        str | None,
        Field(description="Comma-separated categories: general, images, news, science, files, social_media, it, map."),
    ] = None,
    engines: Annotated[
        str | None,
        Field(description="Comma-separated engines to use, e.g. 'google,bing'. Overrides categories."),
    ] = None,
    language: Annotated[str | None, Field(description="BCP 47 language code for results, e.g. 'en', 'de'.")] = None,
    pageno: Annotated[int, Field(description="Result page number (1-based).", ge=1)] = 1,
    time_range: Annotated[
        Literal["day", "week", "month", "year"] | None,
        Field(description="Restrict results to a time range."),
    ] = None,
    safesearch: Annotated[
        Literal[0, 1, 2],
        Field(description="Safe search level: 0=off, 1=moderate, 2=strict."),
    ] = 0,
 ) -> list[dict]:
    """Search the web via SearxNG and return a list of results.
    Each result contains: title, url, content (snippet), engine, category.
    Returns at most the results provided by the SearxNG instance (typically 10 per page).
    """
    # Forward every filter unchanged to the HTTP client helper.
    payload = await _search(
        base_url=settings.base_url,
        query=query,
        categories=categories,
        engines=engines,
        language=language,
        pageno=pageno,
        time_range=time_range,
        safesearch=safesearch,
    )
    # Only these fields are exposed; a field missing from a hit becomes "".
    exposed_fields = ("title", "url", "content", "engine", "category")
    trimmed = []
    for hit in payload.get("results", []):
        trimmed.append({name: hit.get(name, "") for name in exposed_fields})
    return trimmed
@mcp.tool
 async def fetch(
    url: Annotated[str, Field(description="URL of the page to fetch and extract.")],
    output_format: Annotated[
        Literal["markdown", "txt", "json"],
        Field(description="Output format for extracted content: markdown, txt, or json (includes metadata)."),
    ] = "markdown",
    include_tables: Annotated[
        bool,
        Field(description="Include tables in extracted content."),
    ] = True,
    include_images: Annotated[
        bool,
        Field(description="Include image descriptions in extracted content."),
    ] = False,
    include_links: Annotated[
        bool,
        Field(description="Include hyperlinks in extracted content."),
    ] = False,
    max_chars: Annotated[
        int,
        Field(description="Maximum characters to return. 0 means no limit.", ge=0),
    ] = 2000,
    start: Annotated[
        int,
        Field(description="Start character offset for slicing extracted content.", ge=0),
    ] = 0,
    end: Annotated[
        int,
        Field(description="End character offset for slicing extracted content. 0 means read to end of content.", ge=0),
    ] = 0,
    use_cache: Annotated[
        bool,
        Field(description="Return cached content if available. Set to false to force a fresh download."),
    ] = True,
 ) -> dict:
    """Fetch a URL and extract its main content, stripping navigation, ads, and boilerplate.
    Returns a preview of the content (up to max_chars) plus total_chars and truncated flag.
    truncated is true whenever more content follows the returned text, including for start/end slices.
    If truncated, use start/end to page through the full content, or read the resource
    web://fetch?url=<url>&start=<n>&end=<m> for specific slices.
    """
    content = await _fetch_and_extract(url, output_format, include_tables, include_images, include_links, use_cache)
    total_chars = len(content)
    # Work out where the returned text stops. An explicit start/end slice takes
    # priority over the max_chars preview window.
    if start > 0 or end > 0:
        stop = min(end, total_chars) if end > 0 else total_chars
    elif max_chars > 0:
        stop = min(max_chars, total_chars)
    else:
        stop = total_chars
    return {
        "content": content[start:stop],
        "total_chars": total_chars,
        # Derived from the stop offset so a slice that ends before the end of
        # the page is reported as truncated and the caller keeps paging.
        "truncated": stop < total_chars,
    }
@mcp.resource(
    "web://fetch{?url,start,end,output_format,include_links,include_tables,include_images,use_cache}",
    mime_type="text/markdown",
 )
 async def fetch_slice(
    url: str = "",
    start: int = 0,
    end: int = 0,
    output_format: str = "markdown",
    include_links: bool = False,
    include_tables: bool = True,
    include_images: bool = False,
    use_cache: bool = True,
 ) -> str:
    """Fetch a URL and return a character slice of the extracted content.
    Use start/end to page through large documents (end=0 means read to end of content).
    start and end must not be negative.
    Example: web://fetch?url=https://example.com/page&start=2000&end=4000
    """
    if not url:
        raise ValueError("url parameter is required")
    # Reject negative offsets: a negative start would silently slice from the
    # tail of the page. The fetch tool enforces the same bounds with ge=0.
    if start < 0 or end < 0:
        raise ValueError("start and end must be non-negative")
    content = await _fetch_and_extract(url, output_format, include_tables, include_images, include_links, use_cache)
    stop = end if end > 0 else None
    return content[start:stop]
--- a/uv.lock
+++ b/uv.lock