feat: add timeout configuration for outgoing HTTP requests and update documentation
This commit is contained in:
parent
3ebd7c5e4a
commit
8288787b4e
4 changed files with 99 additions and 20 deletions
|
|
@ -3,6 +3,10 @@
|
|||
# URL of the SearxNG instance to query.
|
||||
SEARXNG_BASE_URL=http://localhost:8080
|
||||
|
||||
# Default network timeout (seconds) for outgoing HTTP requests.
|
||||
# Tools may override this per call via timeout_seconds.
|
||||
#SEARXNG_REQUEST_TIMEOUT_SECONDS=30
|
||||
|
||||
# Set to true to run fetched pages through the prompt-guard honeypot before
|
||||
# returning them to the agent. Requires PROMPT_GUARD_* settings below.
|
||||
#SEARXNG_GUARD_ENABLED=false
|
||||
|
|
|
|||
10
README.md
10
README.md
|
|
@ -8,8 +8,8 @@ Built with [FastMCP](https://github.com/prefecthq/fastmcp).
|
|||
|
||||
| Name | Type | Description |
|
||||
|------|------|-------------|
|
||||
| `search` | Tool | Query the web via SearxNG. Supports categories, engines, language, time range, safe search, and pagination. |
|
||||
| `fetch` | Tool | Fetch a URL and extract its main content (strips ads, navigation, boilerplate). Returns a preview with `total_chars` and `truncated` metadata. Supports `start`/`end` slicing, `max_chars`, multiple output formats, and in-memory caching. |
|
||||
| `search` | Tool | Query the web via SearxNG. Supports categories, engines, language, time range, safe search, pagination, and per-call timeout overrides. |
|
||||
| `fetch` | Tool | Fetch a URL and extract its main content (strips ads, navigation, boilerplate). Returns a preview with `total_chars` and `truncated` metadata. Supports `start`/`end` slicing, `max_chars`, multiple output formats, in-memory caching, and per-call timeout overrides. |
|
||||
| `web://fetch{?url,...}` | Resource | Read an arbitrary character slice of a fetched page without going through the tool call. Useful for paging through large documents. |
|
||||
|
||||
## Requirements
|
||||
|
|
@ -37,14 +37,20 @@ cp .env.example .env
|
|||
`.env`:
|
||||
```dotenv
|
||||
SEARXNG_BASE_URL=http://localhost:8080
|
||||
SEARXNG_REQUEST_TIMEOUT_SECONDS=30
|
||||
```
|
||||
|
||||
All settings can also be provided as environment variables with the `SEARXNG_` prefix:
|
||||
|
||||
```bash
|
||||
export SEARXNG_BASE_URL=http://localhost:8080
|
||||
export SEARXNG_REQUEST_TIMEOUT_SECONDS=30
|
||||
```
|
||||
|
||||
Timeout behavior:
|
||||
- `SEARXNG_REQUEST_TIMEOUT_SECONDS` sets the default timeout, in seconds, for outgoing HTTP calls (default: `30`).
|
||||
- `search`, `fetch`, `fetch_raw`, and `web://fetch` each accept an optional `timeout_seconds` argument that overrides the default for a single call.
|
||||
|
||||
## Starting the server
|
||||
|
||||
### stdio (default — for use with MCP clients like Claude Desktop)
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ async def search(
|
|||
pageno: int = 1,
|
||||
time_range: str | None = None,
|
||||
safesearch: int = 0,
|
||||
timeout_seconds: float = 30.0,
|
||||
) -> dict[str, Any]:
|
||||
"""Send a search request to a SearxNG instance and return parsed JSON."""
|
||||
params: dict[str, Any] = {
|
||||
|
|
@ -30,7 +31,12 @@ async def search(
|
|||
if time_range:
|
||||
params["time_range"] = time_range
|
||||
|
||||
async with httpx.AsyncClient() as client:
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=timeout_seconds) as client:
|
||||
response = await client.get(f"{base_url.rstrip('/')}/search", params=params)
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
except httpx.TimeoutException as exc:
|
||||
raise ValueError(
|
||||
f"SearxNG request timed out after {timeout_seconds:.2f}s"
|
||||
) from exc
|
||||
|
|
|
|||
|
|
@ -20,6 +20,8 @@ class Settings(BaseSettings):
|
|||
model_config = SettingsConfigDict(env_prefix="SEARXNG_", env_file=".env", env_file_encoding="utf-8", extra="ignore")
|
||||
|
||||
base_url: str = "http://localhost:8080"
|
||||
request_timeout_seconds: float = 30.0
|
||||
"""Default network timeout (seconds) for outgoing HTTP requests."""
|
||||
guard_enabled: bool = False
|
||||
"""Run fetched content through the prompt-guard honeypot before returning it.
|
||||
Requires PROMPT_GUARD_* settings to be configured."""
|
||||
|
|
@ -49,6 +51,7 @@ async def _fetch_and_extract(
|
|||
include_images: bool = False,
|
||||
include_links: bool = False,
|
||||
use_cache: bool = True,
|
||||
timeout_seconds: float | None = None,
|
||||
) -> str:
|
||||
"""Shared fetch+extract logic used by both the tool and resource."""
|
||||
cache_key = (url, output_format, include_tables, include_images, include_links)
|
||||
|
|
@ -56,10 +59,20 @@ async def _fetch_and_extract(
|
|||
if use_cache and cache_key in _cache:
|
||||
return _cache[cache_key]
|
||||
|
||||
effective_timeout = timeout_seconds if timeout_seconds is not None else settings.request_timeout_seconds
|
||||
try:
|
||||
async with httpx.AsyncClient(
|
||||
headers={"User-Agent": "searxng-mcp/1.0", "Accept": "*/*"},
|
||||
follow_redirects=True,
|
||||
timeout=effective_timeout,
|
||||
) as client:
|
||||
response = await client.get(url)
|
||||
response.raise_for_status()
|
||||
downloaded = response.text
|
||||
except httpx.TimeoutException as exc:
|
||||
raise ValueError(f"Request timed out after {effective_timeout:.2f}s for URL: {url}") from exc
|
||||
|
||||
loop = asyncio.get_event_loop()
|
||||
downloaded = await loop.run_in_executor(None, trafilatura.fetch_url, url)
|
||||
if not downloaded:
|
||||
raise ValueError(f"Failed to fetch URL: {url}")
|
||||
result = await loop.run_in_executor(
|
||||
None,
|
||||
lambda: trafilatura.extract(
|
||||
|
|
@ -120,6 +133,16 @@ async def search(
|
|||
Literal[0, 1, 2],
|
||||
Field(description="Safe search level: 0=off, 1=moderate, 2=strict."),
|
||||
] = 0,
|
||||
timeout_seconds: Annotated[
|
||||
float | None,
|
||||
Field(
|
||||
description=(
|
||||
"Request timeout in seconds for this call. "
|
||||
"If omitted, uses SEARXNG_REQUEST_TIMEOUT_SECONDS."
|
||||
),
|
||||
gt=0,
|
||||
),
|
||||
] = None,
|
||||
) -> list[dict]:
|
||||
"""Search the web via SearxNG and return a list of results.
|
||||
|
||||
|
|
@ -135,6 +158,7 @@ async def search(
|
|||
pageno=pageno,
|
||||
time_range=time_range,
|
||||
safesearch=safesearch,
|
||||
timeout_seconds=timeout_seconds if timeout_seconds is not None else settings.request_timeout_seconds,
|
||||
)
|
||||
results = data.get("results", [])
|
||||
return [
|
||||
|
|
@ -184,6 +208,16 @@ async def fetch(
|
|||
bool,
|
||||
Field(description="Return cached content if available. Set to false to force a fresh download."),
|
||||
] = True,
|
||||
timeout_seconds: Annotated[
|
||||
float | None,
|
||||
Field(
|
||||
description=(
|
||||
"Request timeout in seconds for downloading this URL. "
|
||||
"If omitted, uses SEARXNG_REQUEST_TIMEOUT_SECONDS."
|
||||
),
|
||||
gt=0,
|
||||
),
|
||||
] = None,
|
||||
) -> dict:
|
||||
"""Fetch a URL and extract its main content, stripping navigation, ads, and boilerplate.
|
||||
|
||||
|
|
@ -191,7 +225,15 @@ async def fetch(
|
|||
If truncated, use start/end to page through the full content, or read the resource
|
||||
web://fetch?url=<url>&start=<n>&end=<m> for specific slices.
|
||||
"""
|
||||
content = await _fetch_and_extract(url, output_format, include_tables, include_images, include_links, use_cache)
|
||||
content = await _fetch_and_extract(
|
||||
url,
|
||||
output_format,
|
||||
include_tables,
|
||||
include_images,
|
||||
include_links,
|
||||
use_cache,
|
||||
timeout_seconds,
|
||||
)
|
||||
total_chars = len(content)
|
||||
|
||||
# Apply explicit start/end slice first (takes priority over max_chars windowing)
|
||||
|
|
@ -218,7 +260,7 @@ async def fetch(
|
|||
|
||||
|
||||
@mcp.resource(
|
||||
"web://fetch{?url,start,end,output_format,include_links,include_tables,include_images,use_cache}",
|
||||
"web://fetch{?url,start,end,output_format,include_links,include_tables,include_images,use_cache,timeout_seconds}",
|
||||
mime_type="text/markdown",
|
||||
)
|
||||
async def fetch_slice(
|
||||
|
|
@ -230,6 +272,7 @@ async def fetch_slice(
|
|||
include_tables: bool = True,
|
||||
include_images: bool = False,
|
||||
use_cache: bool = True,
|
||||
timeout_seconds: float | None = None,
|
||||
) -> str:
|
||||
"""Fetch a URL and return a character slice of the extracted content.
|
||||
|
||||
|
|
@ -238,7 +281,15 @@ async def fetch_slice(
|
|||
"""
|
||||
if not url:
|
||||
raise ValueError("url parameter is required")
|
||||
content = await _fetch_and_extract(url, output_format, include_tables, include_images, include_links, use_cache)
|
||||
content = await _fetch_and_extract(
|
||||
url,
|
||||
output_format,
|
||||
include_tables,
|
||||
include_images,
|
||||
include_links,
|
||||
use_cache,
|
||||
timeout_seconds,
|
||||
)
|
||||
if end > 0:
|
||||
return content[start:end]
|
||||
return content[start:]
|
||||
|
|
@ -249,6 +300,16 @@ async def fetch_raw(
|
|||
url: Annotated[str, Field(description="URL to fetch. Returns raw text content without HTML extraction.")],
|
||||
encoding: Annotated[str, Field(description="Text encoding to decode the response bytes. Default 'utf-8'.")] = "utf-8",
|
||||
max_bytes: Annotated[int, Field(description="Maximum bytes to return. 0 = no limit.", ge=0)] = 0,
|
||||
timeout_seconds: Annotated[
|
||||
float | None,
|
||||
Field(
|
||||
description=(
|
||||
"Request timeout in seconds for this call. "
|
||||
"If omitted, uses SEARXNG_REQUEST_TIMEOUT_SECONDS."
|
||||
),
|
||||
gt=0,
|
||||
),
|
||||
] = None,
|
||||
) -> dict:
|
||||
"""Fetch a URL and return the raw response body as text, bypassing trafilatura extraction.
|
||||
|
||||
|
|
@ -259,17 +320,19 @@ async def fetch_raw(
|
|||
|
||||
Returns a dict with: content (str), status_code (int), content_type (str), total_bytes (int).
|
||||
"""
|
||||
def _do_fetch() -> tuple[bytes, int, str]:
|
||||
with httpx.Client(
|
||||
effective_timeout = timeout_seconds if timeout_seconds is not None else settings.request_timeout_seconds
|
||||
try:
|
||||
async with httpx.AsyncClient(
|
||||
headers={"User-Agent": "searxng-mcp/1.0", "Accept": "*/*"},
|
||||
follow_redirects=True,
|
||||
timeout=30.0,
|
||||
timeout=effective_timeout,
|
||||
) as client:
|
||||
response = client.get(url)
|
||||
return response.content, response.status_code, response.headers.get("content-type", "")
|
||||
|
||||
loop = asyncio.get_event_loop()
|
||||
raw, status_code, content_type = await loop.run_in_executor(None, _do_fetch)
|
||||
response = await client.get(url)
|
||||
raw = response.content
|
||||
status_code = response.status_code
|
||||
content_type = response.headers.get("content-type", "")
|
||||
except httpx.TimeoutException as exc:
|
||||
raise ValueError(f"Request timed out after {effective_timeout:.2f}s for URL: {url}") from exc
|
||||
|
||||
total_bytes = len(raw)
|
||||
if max_bytes > 0:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue