From 3ebd7c5e4a9bff4baf4597dac30ffdc70780248b Mon Sep 17 00:00:00 2001
From: Hans Aschauer <hans.git@ch23.de>
Date: Tue, 21 Apr 2026 20:31:49 +0200
Subject: [PATCH] add fetch_raw tool and download-docs skill

- fetch_raw: new MCP tool that fetches URLs via httpx without HTML
  extraction, returning raw text + status_code. Does not call
  raise_for_status() so 404s are returned as data, not exceptions.
- download-docs skill: SKILL.md with 5-step process (branch discovery,
  CI hint, docs dir discovery, recursive download, meta sidecars) and
  complete mcp-forge script skeleton, validated end-to-end inside
  mcp-forge against encode/starlette (26 files, 0 errors).
---
 .opencode/skills/download-docs/SKILL.md | 311 ++++++++++++++++++++++++
 src/searxng_mcp/server.py               |  45 ++++
 2 files changed, 356 insertions(+)
 create mode 100644 .opencode/skills/download-docs/SKILL.md

diff --git a/.opencode/skills/download-docs/SKILL.md b/.opencode/skills/download-docs/SKILL.md
new file mode 100644
index 0000000..358b285
--- /dev/null
+++ b/.opencode/skills/download-docs/SKILL.md
@@ -0,0 +1,311 @@
+---
+name: download-docs
+description: Given a GitHub repo (GitLab is not yet supported), discover the documentation directory and recursively download all doc files into the artifacts directory, preserving the original directory structure and writing a .meta.json sidecar next to each file.
+---
+
+# download-docs Skill
+
+## Overview
+
+This skill downloads an entire documentation tree from a GitHub repository (GitLab is
+planned but not yet supported) into the mcp-forge artifact directory. Each file gets a
+`.meta.json` sidecar with provenance metadata.
+
+**Constraints**:
+- mcp-forge is air-gapped. All HTTP goes through injected MCP tools (`fetch_raw`).
+- Inject tools with their bare name: `fetch_raw` (not `searxng_fetch_raw`).
+- All injected tool calls use **keyword-only arguments**.
+
+---
+
+## Process
+
+### Step 1 — Parse user request
+
+Extract `owner`, `repo`, and optionally `branch` from the user's request.
+
+- If branch is not given, try `main` then `master` (check which exists via the GitHub API).
+- Canonical form: `owner/repo` or `https://github.com/owner/repo`.
+
+### Step 2 — Discover documentation directory
+
+Try each candidate path against the GitHub Contents API until one returns a
+non-empty list of entries:
+
+```
+DOC_LOCATIONS = ["docs", "doc", "documentation", "guide", "guides",
+                 "content", "pages", "site", "wiki"]
+```
+
+API endpoint:
+```
+https://api.github.com/repos/{owner}/{repo}/contents/{path}?ref={branch}
+```
+
+A `200` response with a JSON **list** (not a dict with `message`) means the
+path exists and is a directory. Use the first match.
+
+### Step 3 — CI pipeline branch hint (optional, best-effort)
+
+Before Step 2 (or if Step 2 finds nothing), scan CI/config files for a
+branch override:
+
+```
+CI_FILES = [
+    ".github/workflows/docs.yml",
+    ".github/workflows/ci.yml",
+    ".github/workflows/deploy.yml",
+    ".gitlab-ci.yml",
+    "mkdocs.yml",
+    "readthedocs.yml",
+    ".readthedocs.yaml",
+]
+```
+
+Fetch each via raw URL:
+```
+https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{ci_file}
+```
+
+Scan content for keywords like `ref:`, `branch:`, `gh-pages`, `checkout` (the script below matches the first three and only prints the hits).
+If a hint names a dedicated docs branch, set `BRANCH` to it and re-run Step 2.
+
+### Step 4 — Recursive download
+
+Walk the directory tree depth-first using the GitHub Contents API.
+For each entry:
+
+- `type == "dir"`: recurse (skip hidden dirs and known junk dirs).
+- `type == "file"`: download if extension matches the allowlist.
+
+**Extension allowlist**:
+```
+DOC_EXTENSIONS = {".md", ".mdx", ".rst", ".txt", ".html", ".htm",
+                  ".ipynb", ".yaml", ".yml", ".toml"}
+```
+
+**Skip dirs**:
+```
+SKIP_DIRS = {"__pycache__", ".git", "node_modules", ".venv",
+             ".tox", ".eggs", "dist", "build"}
+```
+Also skip any directory whose name starts with `.`.
+
+**Download raw content**:
+```
+https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{file_path}
+```
+
+Rate limit: `time.sleep(0.05)` between API calls.
+
+### Step 5 — Write files + metadata sidecars
+
+For each downloaded file:
+1. Reconstruct the relative path under `{ARTIFACT_DIR}/{repo}/{file_path}`.
+2. Create parent directories with `Path.mkdir(parents=True, exist_ok=True)`.
+3. Write file content (UTF-8, errors=`replace`).
+4. Write `.meta.json` sidecar at `{out_path}.meta.json`.
+
+**Metadata fields**:
+```json
+{
+  "source": "github",
+  "owner": "pydantic",
+  "repo": "pydantic-ai",
+  "branch": "main",
+  "path": "docs/index.md",
+  "raw_url": "https://raw.githubusercontent.com/...",
+  "html_url": "https://github.com/...",
+  "sha": "abc123",
+  "size_bytes": 4096,
+  "content_type": "text/plain",
+  "downloaded_at": "2026-04-21T10:00:00Z"
+}
+```
+
+---
+
+## Complete mcp-forge Script
+
+```python
+import json, os, time
+from pathlib import Path
+from datetime import datetime, timezone
+
+# ── Configuration ────────────────────────────────────────────────────────────
+OWNER   = "pydantic"          # ← set from user request
+REPO    = "pydantic-ai"       # ← set from user request
+BRANCH  = "main"              # ← set from user request or discovered
+
+DOC_LOCATIONS  = ["docs", "doc", "documentation", "guide", "guides",
+                  "content", "pages", "site", "wiki"]
+CI_FILES       = [".github/workflows/docs.yml", ".github/workflows/ci.yml",
+                  ".github/workflows/deploy.yml", ".gitlab-ci.yml",
+                  "mkdocs.yml", "readthedocs.yml", ".readthedocs.yaml"]
+DOC_EXTENSIONS = {".md", ".mdx", ".rst", ".txt", ".html", ".htm",
+                  ".ipynb", ".yaml", ".yml", ".toml"}
+SKIP_DIRS      = {"__pycache__", ".git", "node_modules", ".venv",
+                  ".tox", ".eggs", "dist", "build"}
+
+ARTIFACT_DIR = Path(os.environ["MCP_ARTIFACT_DIR"])
+
+# ── Helpers ───────────────────────────────────────────────────────────────────
+def gh_contents(path):
+    """Return parsed JSON from GitHub Contents API, or None on failure."""
+    url = f"https://api.github.com/repos/{OWNER}/{REPO}/contents/{path}?ref={BRANCH}"
+    r = fetch_raw(url=url)
+    time.sleep(0.05)
+    if r.get("status_code", 200) >= 400:
+        return None
+    try:
+        return json.loads(r["content"])
+    except Exception:
+        return None
+
+def raw_url(path):
+    return f"https://raw.githubusercontent.com/{OWNER}/{REPO}/{BRANCH}/{path}"
+
+def html_url(path):
+    return f"https://github.com/{OWNER}/{REPO}/blob/{BRANCH}/{path}"
+
+def api_contents_url(path):
+    return f"https://api.github.com/repos/{OWNER}/{REPO}/contents/{path}?ref={BRANCH}"
+
+# ── Step 1: confirm branch exists ────────────────────────────────────────────
+for candidate in ([BRANCH] if BRANCH else ["main", "master"]):
+    r = fetch_raw(url=f"https://api.github.com/repos/{OWNER}/{REPO}/branches/{candidate}")
+    if r.get("status_code", 404) == 200:
+        BRANCH = candidate
+        print(f"Branch confirmed: {BRANCH}")
+        break
+else:
+    print("ERROR: could not confirm branch — aborting")
+    raise SystemExit(1)
+
+# ── Step 3 (optional, run before Step 2): CI pipeline branch hint ────────────
+for ci_file in CI_FILES:
+    r = fetch_raw(url=raw_url(ci_file))
+    if r.get("status_code", 404) == 200:
+        content = r["content"]
+        for line in content.splitlines():
+            if any(kw in line for kw in ("ref:", "branch:", "gh-pages")):
+                print(f"CI hint in {ci_file}: {line.strip()}")
+        break  # only need to find one
+
+# ── Step 2: discover docs directory ──────────────────────────────────────────
+DOC_ROOT = None
+for loc in DOC_LOCATIONS:
+    data = gh_contents(loc)
+    if isinstance(data, list) and len(data) > 0:
+        DOC_ROOT = loc
+        print(f"Found docs at: {DOC_ROOT}")
+        break
+
+if DOC_ROOT is None:
+    print("ERROR: no docs directory found — tried:", DOC_LOCATIONS)
+    raise SystemExit(1)
+
+# ── Step 4 + 5: recursive download ───────────────────────────────────────────
+downloaded = 0
+errors = 0
+now_iso = datetime.now(timezone.utc).isoformat()
+
+def process_dir(api_path):
+    global downloaded, errors
+    entries = gh_contents(api_path)
+    if not isinstance(entries, list):
+        return
+    for entry in entries:
+        name = entry.get("name", "")
+        etype = entry.get("type")
+        epath = entry.get("path", "")
+
+        if etype == "dir":
+            if name in SKIP_DIRS or name.startswith("."):
+                continue
+            process_dir(epath)
+
+        elif etype == "file":
+            ext = Path(name).suffix.lower()
+            if ext not in DOC_EXTENSIONS:
+                continue
+
+            # Download raw content
+            r = fetch_raw(url=raw_url(epath))
+            time.sleep(0.05)
+            if r.get("status_code", 200) >= 400:
+                print(f"  ERROR {r.get('status_code')} {epath}")
+                errors += 1
+                continue
+
+            # Write file
+            out_path = ARTIFACT_DIR / REPO / epath
+            out_path.parent.mkdir(parents=True, exist_ok=True)
+            out_path.write_text(r["content"], encoding="utf-8", errors="replace")
+
+            # Write .meta.json sidecar
+            meta = {
+                "source":       "github",
+                "owner":        OWNER,
+                "repo":         REPO,
+                "branch":       BRANCH,
+                "path":         epath,
+                "raw_url":      raw_url(epath),
+                "html_url":     html_url(epath),
+                "sha":          entry.get("sha", ""),
+                "size_bytes":   entry.get("size", len(r["content"])),
+                "content_type": r.get("content_type", "text/plain"),
+                "downloaded_at": now_iso,
+            }
+            meta_path = out_path.parent / (out_path.name + ".meta.json")
+            meta_path.write_text(json.dumps(meta, indent=2), encoding="utf-8")
+
+            downloaded += 1
+            if downloaded % 25 == 0:
+                print(f"  {downloaded} files downloaded...")
+
+process_dir(DOC_ROOT)
+
+print(f"\nDone. Downloaded: {downloaded}, Errors: {errors}")
+print(f"Output: {ARTIFACT_DIR / REPO / DOC_ROOT}")
+```
+
+### Inject with:
+```python
+mcp-forge_execute_python(
+    code=<script above with OWNER/REPO/BRANCH filled in>,
+    mcp_tools=["fetch_raw"]
+)
+```
+
+---
+
+## Usage Examples
+
+**Basic (user provides full path)**:
+> "Download the docs from github.com/pydantic/pydantic-ai"
+
+Set `OWNER="pydantic"`, `REPO="pydantic-ai"`, `BRANCH="main"` (or `BRANCH=""` to
+auto-detect `main`/`master`) and run the script.
+
+**With explicit branch**:
+> "Download docs from tiangolo/fastapi, branch master"
+
+Set `BRANCH="master"`; the Step 1 loop then only confirms that this branch exists.
+
+**GitLab** (future):
+GitLab uses the same REST pattern but with `https://gitlab.com/api/v4/projects/{id}/repository/tree`.
+Not yet implemented — treat GitLab repos as out of scope for now.
+
+---
+
+## Known Limitations
+
+- GitHub API unauthenticated rate limit: 60 req/hour. For large repos with many
+  subdirectories, consider adding an `Authorization: Bearer <token>` header.
+  `fetch_raw` does not currently support custom headers — a `fetch_raw_auth` variant
+  would be needed.
+- GitLab not supported yet.
+- Only files with known doc extensions are downloaded. Binary assets (images, PDFs)
+  are intentionally skipped.
+- Re-runs silently overwrite existing files and rewrite every sidecar (new `downloaded_at`); unchanged files are not skipped.
diff --git a/src/searxng_mcp/server.py b/src/searxng_mcp/server.py
index 0a3c50e..2ece0e4 100644
--- a/src/searxng_mcp/server.py
+++ b/src/searxng_mcp/server.py
@@ -7,6 +7,7 @@ from pydantic_settings import BaseSettings, SettingsConfigDict
 
 import asyncio
 import logging
+import httpx
 import trafilatura
 
 import prompt_guard
@@ -241,3 +242,47 @@ async def fetch_slice(
     if end > 0:
         return content[start:end]
     return content[start:]
+
+
+@mcp.tool
+async def fetch_raw(
+    url: Annotated[str, Field(description="URL to fetch. Returns raw text content without HTML extraction.")],
+    encoding: Annotated[str, Field(description="Text encoding to decode the response bytes. Default 'utf-8'.")] = "utf-8",
+    max_bytes: Annotated[int, Field(description="Maximum bytes to return. 0 = no limit.", ge=0)] = 0,
+) -> dict:
+    """Fetch a URL and return the raw response body as text, bypassing trafilatura extraction.
+
+    Use this for:
+    - JSON API responses (GitHub/GitLab REST API, etc.)
+    - Plain text or markdown files served from a CDN/raw URL
+    - Any URL where content extraction would corrupt the data
+
+    Returns a dict with: content (str), status_code (int), content_type (str), total_bytes (int).
+    """
+    def _do_fetch() -> tuple[bytes, int, str]:
+        with httpx.Client(
+            headers={"User-Agent": "searxng-mcp/1.0", "Accept": "*/*"},
+            follow_redirects=True,
+            timeout=30.0,
+        ) as client:
+            response = client.get(url)
+            return response.content, response.status_code, response.headers.get("content-type", "")
+
+    loop = asyncio.get_event_loop()
+    raw, status_code, content_type = await loop.run_in_executor(None, _do_fetch)
+
+    total_bytes = len(raw)
+    if max_bytes > 0:
+        raw = raw[:max_bytes]
+
+    try:
+        text = raw.decode(encoding)
+    except (UnicodeDecodeError, LookupError) as exc:
+        raise ValueError(f"Cannot decode response as {encoding!r}: {exc}") from exc
+
+    return {
+        "content": text,
+        "status_code": status_code,
+        "content_type": content_type,
+        "total_bytes": total_bytes,
+    }