From 3ebd7c5e4a9bff4baf4597dac30ffdc70780248b Mon Sep 17 00:00:00 2001 From: Hans Aschauer Date: Tue, 21 Apr 2026 20:31:49 +0200 Subject: [PATCH] add fetch_raw tool and download-docs skill - fetch_raw: new MCP tool that fetches URLs via httpx without HTML extraction, returning raw text + status_code. Does not call raise_for_status() so 404s are returned as data, not exceptions. - download-docs skill: SKILL.md with 5-step process (branch discovery, CI hint, docs dir discovery, recursive download, meta sidecars) and complete mcp-forge script skeleton, validated end-to-end inside mcp-forge against encode/starlette (26 files, 0 errors). --- .opencode/skills/download-docs/SKILL.md | 311 ++++++++++++++++++++++++ src/searxng_mcp/server.py | 45 ++++ 2 files changed, 356 insertions(+) create mode 100644 .opencode/skills/download-docs/SKILL.md diff --git a/.opencode/skills/download-docs/SKILL.md b/.opencode/skills/download-docs/SKILL.md new file mode 100644 index 0000000..358b285 --- /dev/null +++ b/.opencode/skills/download-docs/SKILL.md @@ -0,0 +1,311 @@ +--- +name: download-docs +description: Given a GitHub/GitLab repo, discover the documentation directory and recursively download all doc files into the artifacts directory, preserving the original directory structure and writing a .meta.json sidecar next to each file. +--- + +# download-docs Skill + +## Overview + +This skill downloads an entire documentation tree from a GitHub or GitLab repository +into the mcp-forge artifact directory. Each file gets a `.meta.json` sidecar with +provenance metadata. + +**Constraints**: +- mcp-forge is air-gapped. All HTTP goes through injected MCP tools (`fetch_raw`). +- Inject tools with their bare name: `fetch_raw` (not `searxng_fetch_raw`). +- All injected tool calls use **keyword-only arguments**. + +--- + +## Process + +### Step 1 — Parse user request + +Extract `owner`, `repo`, and optionally `branch` from the user's request. 
+ +- If branch is not given, try `main` then `master` (check which exists via the GitHub API). +- Canonical form: `owner/repo` or `https://github.com/owner/repo`. + +### Step 2 — Discover documentation directory + +Try each candidate path against the GitHub Contents API until one returns a +non-empty list of entries: + +``` +DOC_LOCATIONS = ["docs", "doc", "documentation", "guide", "guides", + "content", "pages", "site", "wiki"] +``` + +API endpoint: +``` +https://api.github.com/repos/{owner}/{repo}/contents/{path}?ref={branch} +``` + +A `200` response with a JSON **list** (not a dict with `message`) means the +path exists and is a directory. Use the first match. + +### Step 3 — CI pipeline branch hint (optional, best-effort) + +Run this scan before Step 2, or again if Step 2 finds nothing (the script below runs it before directory discovery and labels it "Step 2 (optional)"): scan CI/config files for a +branch override: + +``` +CI_FILES = [ + ".github/workflows/docs.yml", + ".github/workflows/ci.yml", + ".github/workflows/deploy.yml", + ".gitlab-ci.yml", + "mkdocs.yml", + "readthedocs.yml", + ".readthedocs.yaml", +] +``` + +Fetch each via raw URL: +``` +https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{ci_file} +``` + +Scan content for keywords like `ref:`, `branch:`, `gh-pages`, `checkout`. +If a specific docs branch is found, update `BRANCH` and re-run Step 2 (the script only prints the matching line; changing `BRANCH` is a manual step). + +### Step 4 — Recursive download + +Walk the directory tree depth-first using the GitHub Contents API. +For each entry: + +- `type == "dir"`: recurse (skip hidden dirs and known junk dirs). +- `type == "file"`: download if extension matches the allowlist. + +**Extension allowlist**: +``` +DOC_EXTENSIONS = {".md", ".mdx", ".rst", ".txt", ".html", ".htm", + ".ipynb", ".yaml", ".yml", ".toml"} +``` + +**Skip dirs**: +``` +SKIP_DIRS = {"__pycache__", ".git", "node_modules", ".venv", + ".tox", ".eggs", "dist", "build"} +``` +Also skip any directory whose name starts with `.`. 
+ +**Download raw content**: +``` +https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{file_path} +``` + +Rate limit: `time.sleep(0.05)` between API calls. + +### Step 5 — Write files + metadata sidecars + +For each downloaded file: +1. Reconstruct the relative path under `{ARTIFACT_DIR}/{repo}/{file_path}`. +2. Create parent directories with `Path.mkdir(parents=True, exist_ok=True)`. +3. Write file content (UTF-8, errors=`replace`). +4. Write `.meta.json` sidecar at `{out_path}.meta.json`. + +**Metadata fields**: +```json +{ + "source": "github", + "owner": "pydantic", + "repo": "pydantic-ai", + "branch": "main", + "path": "docs/index.md", + "raw_url": "https://raw.githubusercontent.com/...", + "html_url": "https://github.com/...", + "sha": "abc123", + "size_bytes": 4096, + "content_type": "text/plain", + "downloaded_at": "2026-04-21T10:00:00Z" +} +``` + +--- + +## Complete mcp-forge Script + +```python +import json, os, time +from pathlib import Path +from datetime import datetime, timezone + +# ── Configuration ──────────────────────────────────────────────────────────── +OWNER = "pydantic" # ← set from user request +REPO = "pydantic-ai" # ← set from user request +BRANCH = "main" # ← set from user request or discovered + +DOC_LOCATIONS = ["docs", "doc", "documentation", "guide", "guides", + "content", "pages", "site", "wiki"] +CI_FILES = [".github/workflows/docs.yml", ".github/workflows/ci.yml", + ".github/workflows/deploy.yml", ".gitlab-ci.yml", + "mkdocs.yml", "readthedocs.yml", ".readthedocs.yaml"] +DOC_EXTENSIONS = {".md", ".mdx", ".rst", ".txt", ".html", ".htm", + ".ipynb", ".yaml", ".yml", ".toml"} +SKIP_DIRS = {"__pycache__", ".git", "node_modules", ".venv", + ".tox", ".eggs", "dist", "build"} + +ARTIFACT_DIR = Path(os.environ["MCP_ARTIFACT_DIR"]) + +# ── Helpers ─────────────────────────────────────────────────────────────────── +def gh_contents(path): + """Return parsed JSON from GitHub Contents API, or None on failure.""" + url = 
f"https://api.github.com/repos/{OWNER}/{REPO}/contents/{path}?ref={BRANCH}" + r = fetch_raw(url=url) + time.sleep(0.05) + if r.get("status_code", 200) >= 400: + return None + try: + return json.loads(r["content"]) + except Exception: + return None + +def raw_url(path): + return f"https://raw.githubusercontent.com/{OWNER}/{REPO}/{BRANCH}/{path}" + +def html_url(path): + return f"https://github.com/{OWNER}/{REPO}/blob/{BRANCH}/{path}" + +def api_contents_url(path): + return f"https://api.github.com/repos/{OWNER}/{REPO}/contents/{path}?ref={BRANCH}" + +# ── Step 1: confirm branch exists ──────────────────────────────────────────── +for candidate in ([BRANCH] if BRANCH else ["main", "master"]): + r = fetch_raw(url=f"https://api.github.com/repos/{OWNER}/{REPO}/branches/{candidate}") + if r.get("status_code", 404) == 200: + BRANCH = candidate + print(f"Branch confirmed: {BRANCH}") + break +else: + print("ERROR: could not confirm branch — aborting") + raise SystemExit(1) + +# ── Step 2 (optional): CI pipeline branch hint ─────────────────────────────── +for ci_file in CI_FILES: + r = fetch_raw(url=raw_url(ci_file)) + if r.get("status_code", 404) == 200: + content = r["content"] + for line in content.splitlines(): + if any(kw in line for kw in ("ref:", "branch:", "gh-pages")): + print(f"CI hint in {ci_file}: {line.strip()}") + break # only need to find one + +# ── Step 3: discover docs directory ────────────────────────────────────────── +DOC_ROOT = None +for loc in DOC_LOCATIONS: + data = gh_contents(loc) + if isinstance(data, list) and len(data) > 0: + DOC_ROOT = loc + print(f"Found docs at: {DOC_ROOT}") + break + +if DOC_ROOT is None: + print("ERROR: no docs directory found — tried:", DOC_LOCATIONS) + raise SystemExit(1) + +# ── Step 4 + 5: recursive download ─────────────────────────────────────────── +downloaded = 0 +errors = 0 +now_iso = datetime.now(timezone.utc).isoformat() + +def process_dir(api_path): + global downloaded, errors + entries = gh_contents(api_path) 
+ if not isinstance(entries, list): + return + for entry in entries: + name = entry.get("name", "") + etype = entry.get("type") + epath = entry.get("path", "") + + if etype == "dir": + if name in SKIP_DIRS or name.startswith("."): + continue + process_dir(epath) + + elif etype == "file": + ext = Path(name).suffix.lower() + if ext not in DOC_EXTENSIONS: + continue + + # Download raw content + r = fetch_raw(url=raw_url(epath)) + time.sleep(0.05) + if r.get("status_code", 200) >= 400: + print(f" ERROR {r.get('status_code')} {epath}") + errors += 1 + continue + + # Write file + out_path = ARTIFACT_DIR / REPO / epath + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_text(r["content"], encoding="utf-8", errors="replace") + + # Write .meta.json sidecar + meta = { + "source": "github", + "owner": OWNER, + "repo": REPO, + "branch": BRANCH, + "path": epath, + "raw_url": raw_url(epath), + "html_url": html_url(epath), + "sha": entry.get("sha", ""), + "size_bytes": entry.get("size", len(r["content"])), + "content_type": r.get("content_type", "text/plain"), + "downloaded_at": now_iso, + } + meta_path = out_path.parent / (out_path.name + ".meta.json") + meta_path.write_text(json.dumps(meta, indent=2), encoding="utf-8") + + downloaded += 1 + if downloaded % 25 == 0: + print(f" {downloaded} files downloaded...") + +process_dir(DOC_ROOT) + +print(f"\nDone. Downloaded: {downloaded}, Errors: {errors}") +print(f"Output: {ARTIFACT_DIR / REPO / DOC_ROOT}") +``` + +### Inject with: +```python +mcp-forge_execute_python( + code=