feat: bridge_self bot commands — status, thresholds, reset, health

Adds bot commands for the bridge_self provider so operators can inspect
and manage bridge health from chat: /status, /thresholds, /reset, /health.
Includes Jinja2 templates for both locales, seed data, capability slots,
and a handler that exposes pending deferred backlog plus per-counter
reset. Also adds .claude/skills/ for project-scoped graph-aware skills.
This commit is contained in:
2026-05-16 03:43:48 +03:00
parent 10d30fc956
commit 8651767112
50 changed files with 1311 additions and 60 deletions
@@ -542,8 +542,32 @@ BRIDGE_SELF_CAPABILITIES = ProviderCapabilities(
{"name": "bridge_self_deferred_backlog", "description": "Deferred backlog high"},
{"name": "bridge_self_target_failures", "description": "Target send failures"},
],
command_slots=[],
commands=[],
command_slots=[
# Response templates
{"name": "start", "description": "/start greeting message"},
{"name": "help", "description": "/help command listing"},
{"name": "status", "description": "/status full counter snapshot"},
{"name": "thresholds", "description": "/thresholds configured alert thresholds"},
{"name": "reset", "description": "/reset manual counter reset"},
{"name": "health", "description": "/health terse one-line summary"},
{"name": "rate_limited", "description": "Rate limit warning message"},
{"name": "no_results", "description": "Empty results fallback"},
# Description slots
{"name": "desc_help", "description": "Menu description for /help"},
{"name": "desc_status", "description": "Menu description for /status"},
{"name": "desc_thresholds", "description": "Menu description for /thresholds"},
{"name": "desc_reset", "description": "Menu description for /reset"},
{"name": "desc_health", "description": "Menu description for /health"},
# Usage examples
{"name": "usage_reset", "description": "Usage example for /reset"},
],
commands=[
{"name": "status", "description": "Show current bridge health counters"},
{"name": "thresholds", "description": "Show configured alert thresholds"},
{"name": "reset", "description": "Manually reset a failure counter"},
{"name": "health", "description": "Terse one-line health summary"},
{"name": "help", "description": "Show commands"},
],
)
@@ -0,0 +1 @@
Terse one-line health summary
@@ -0,0 +1 @@
Show available commands
@@ -0,0 +1 @@
Reset a failure counter (tracker:<id>, target:<id>, or all)
@@ -0,0 +1 @@
Show current bridge health counters
@@ -0,0 +1 @@
Show configured alert thresholds
@@ -0,0 +1,5 @@
{%- if healthy -%}
✅ {{ summary }}
{%- else -%}
🚨 {{ summary }}
{%- endif %}
@@ -0,0 +1,5 @@
🩺 <b>Available commands:</b>
{%- for cmd in commands %}
/{{ cmd.name }} — {{ cmd.description }}
{%- if cmd.usage %} ↳ {{ cmd.usage }}{% endif %}
{%- endfor %}
@@ -0,0 +1 @@
⏳ Too many requests. Please wait {{ wait }}s before trying again.
@@ -0,0 +1,14 @@
{%- if success %}
✅ <b>Counter reset</b>
{%- if subject_type == 'all' %}
Cleared {{ previous_count }} of your failure counters (trackers + targets).
{%- else %}
{{ subject_type|capitalize }} <b>{{ subject_name }}</b>{% if subject_id %} (id <code>{{ subject_id }}</code>){% endif %}
Previous count: <b>{{ previous_count }}</b> → 0
{%- endif %}
{%- else %}
❌ <b>Reset failed</b>
{%- if error_message %}
<i>{{ error_message }}</i>
{%- endif %}
{%- endif %}
@@ -0,0 +1,2 @@
👋 Hi! I'm your Notify Bridge bot for <b>Bridge Self-Monitoring</b>.
Use /help to see available commands.
@@ -0,0 +1,28 @@
🩺 <b>Bridge Status</b>
{%- if poll_failures %}
🚨 <b>Tracker poll failures</b>
{%- for f in poll_failures %}
• <b>{{ f.tracker_name }}</b> (id <code>{{ f.tracker_id }}</code>) — {{ f.count }} consecutive
{%- endfor %}
{%- endif %}
{%- if deferred_pending is none %}
⏳ <b>Deferred backlog</b>
Pending: <b>unknown</b> (DB unavailable) · Threshold: {{ deferred_threshold }}
{%- elif deferred_pending %}
⏳ <b>Deferred backlog</b>
Pending: <b>{{ deferred_pending }}</b> · Threshold: {{ deferred_threshold }}
{%- endif %}
{%- if target_failures %}
📡 <b>Target send failures</b>
{%- for f in target_failures %}
• <b>{{ f.target_name }}</b> (id <code>{{ f.target_id }}</code>) — {{ f.count }} consecutive
{%- endfor %}
{%- endif %}
{%- if not poll_failures and not target_failures and deferred_pending == 0 %}
✅ All counters at zero. Nothing to report.
{%- endif %}
@@ -0,0 +1,4 @@
⚙️ <b>Bridge Thresholds</b>
Tracker poll failures: <b>{{ poll_failure_threshold }}</b>
Deferred backlog: <b>{{ deferred_backlog_threshold }}</b>
Target send failures: <b>{{ target_failure_threshold }}</b>
@@ -0,0 +1 @@
/reset tracker:42 (or target:&lt;id&gt;, or all)
@@ -73,6 +73,15 @@ PROVIDER_COMMAND_SLOTS: dict[str, list[str]] = {
# Usage examples
"usage_entities", "usage_state",
],
"bridge_self": [
# Response templates
"start", "help", "status", "thresholds", "reset", "health",
"rate_limited", "no_results",
# Description slots
"desc_help", "desc_status", "desc_thresholds", "desc_reset", "desc_health",
# Usage examples
"usage_reset",
],
}
# Backward-compatible aliases
@@ -0,0 +1 @@
Краткая однострочная сводка состояния
@@ -0,0 +1 @@
Показать доступные команды
@@ -0,0 +1 @@
Сбросить счётчик сбоев (tracker:&lt;id&gt;, target:&lt;id&gt; или all)
@@ -0,0 +1 @@
Показать счётчики состояния моста
@@ -0,0 +1 @@
Показать настроенные пороги оповещений
@@ -0,0 +1,5 @@
{%- if healthy -%}
✅ {{ summary }}
{%- else -%}
🚨 {{ summary }}
{%- endif %}
@@ -0,0 +1,5 @@
🩺 <b>Доступные команды:</b>
{%- for cmd in commands %}
/{{ cmd.name }} — {{ cmd.description }}
{%- if cmd.usage %} ↳ {{ cmd.usage }}{% endif %}
{%- endfor %}
@@ -0,0 +1 @@
Нет результатов.
@@ -0,0 +1 @@
⏳ Слишком много запросов. Подождите {{ wait }}с и попробуйте снова.
@@ -0,0 +1,14 @@
{%- if success %}
✅ <b>Счётчик сброшен</b>
{%- if subject_type == 'all' %}
Очищено {{ previous_count }} ваших счётчиков (трекеры + цели).
{%- else %}
{{ subject_type|capitalize }} <b>{{ subject_name }}</b>{% if subject_id %} (id <code>{{ subject_id }}</code>){% endif %}
Было: <b>{{ previous_count }}</b> → 0
{%- endif %}
{%- else %}
❌ <b>Не удалось сбросить</b>
{%- if error_message %}
<i>{{ error_message }}</i>
{%- endif %}
{%- endif %}
@@ -0,0 +1,2 @@
👋 Привет! Я бот Notify Bridge для <b>Самомониторинга моста</b>.
Используйте /help, чтобы посмотреть доступные команды.
@@ -0,0 +1,28 @@
🩺 <b>Состояние моста</b>
{%- if poll_failures %}
🚨 <b>Сбои опроса трекеров</b>
{%- for f in poll_failures %}
• <b>{{ f.tracker_name }}</b> (id <code>{{ f.tracker_id }}</code>) — {{ f.count }} подряд
{%- endfor %}
{%- endif %}
{%- if deferred_pending is none %}
⏳ <b>Очередь отложенной отправки</b>
В ожидании: <b>неизвестно</b> (БД недоступна) · Порог: {{ deferred_threshold }}
{%- elif deferred_pending %}
⏳ <b>Очередь отложенной отправки</b>
В ожидании: <b>{{ deferred_pending }}</b> · Порог: {{ deferred_threshold }}
{%- endif %}
{%- if target_failures %}
📡 <b>Сбои отправки в адресаты</b>
{%- for f in target_failures %}
• <b>{{ f.target_name }}</b> (id <code>{{ f.target_id }}</code>) — {{ f.count }} подряд
{%- endfor %}
{%- endif %}
{%- if not poll_failures and not target_failures and deferred_pending == 0 %}
✅ Все счётчики в нуле. Всё хорошо.
{%- endif %}
@@ -0,0 +1,4 @@
⚙️ <b>Пороги моста</b>
Сбои опроса трекеров: <b>{{ poll_failure_threshold }}</b>
Очередь отложенной отправки: <b>{{ deferred_backlog_threshold }}</b>
Сбои отправки в адресаты: <b>{{ target_failure_threshold }}</b>
@@ -0,0 +1 @@
/reset tracker:42 (или target:&lt;id&gt;, или all)
@@ -413,6 +413,69 @@ async def get_command_variables(
},
}
# --- Bridge self-monitoring ---
bridge_self_poll_failure_fields = {
"tracker_id": "Tracker id (int)",
"tracker_name": "Tracker display name",
"count": "Consecutive poll failures",
}
bridge_self_target_failure_fields = {
"target_id": "Target id (int)",
"target_name": "Target display name",
"count": "Consecutive send failures",
}
bridge_self = {
"status": {
"description": "/status snapshot of all bridge_self counters",
"variables": {
**common_vars,
"poll_failures": "List of {tracker_id, tracker_name, count} dicts (use {% for f in poll_failures %})",
"deferred_pending": "Pending deferred-dispatch row count for this user",
"deferred_threshold": "Deferred backlog threshold from provider config",
"target_failures": "List of {target_id, target_name, count} dicts (use {% for f in target_failures %})",
},
"poll_failure_fields": bridge_self_poll_failure_fields,
"target_failure_fields": bridge_self_target_failure_fields,
},
"thresholds": {
"description": "/thresholds configured alert thresholds",
"variables": {
**common_vars,
"poll_failure_threshold": "Tracker poll failure threshold (int)",
"deferred_backlog_threshold": "Deferred backlog threshold (int)",
"target_failure_threshold": "Target send failure threshold (int)",
},
},
"reset": {
"description": "/reset result of a manual counter reset",
"variables": {
**common_vars,
"subject_type": "'tracker', 'target', or 'all' (empty on parse error)",
"subject_id": "Subject id (int) or None for 'all' / errors",
"subject_name": "Display name of the subject (empty on error)",
"previous_count": "Counter value before reset",
"success": "Whether the reset succeeded (boolean)",
"error_message": "Error message when success=False (None on success)",
},
},
"health": {
"description": "/health terse one-line summary",
"variables": {
**common_vars,
"healthy": "Whether everything is at zero (boolean)",
"summary": "Human-readable one-line summary",
"tracker_count": "Total enabled tracker count for this user",
"failing_tracker_count": "Number of trackers with non-zero poll failures",
"deferred_pending": "Pending deferred-dispatch row count for this user",
"failing_target_count": "Number of targets with non-zero send failures",
},
},
"desc_thresholds": {"description": "Description for /thresholds command", "variables": common_vars},
"desc_reset": {"description": "Description for /reset command", "variables": common_vars},
"desc_health": {"description": "Description for /health command", "variables": common_vars},
"usage_reset": {"description": "Usage example for /reset", "variables": common_vars},
}
return {
**shared,
"immich": immich,
@@ -421,6 +484,7 @@ async def get_command_variables(
"nut": nut,
"google_photos": google_photos,
"webhook": webhook,
"bridge_self": bridge_self,
}
@@ -625,6 +689,39 @@ async def preview_raw(
{"area_id": "kitchen", "name": "Kitchen", "entity_count": 14},
{"area_id": "entrance", "name": "Entrance", "entity_count": 4},
],
# --- Bridge self-monitoring ---
# /status — three categories of failure (preview shows non-empty data
# so operators can see how the template renders failures, not the
# all-zero case)
"poll_failures": [
{"tracker_id": 12, "tracker_name": "Family Photos poller", "count": 3},
{"tracker_id": 18, "tracker_name": "Vacation 2025 poller", "count": 5},
],
"deferred_pending": 47,
"deferred_threshold": 100,
"target_failures": [
{"target_id": 4, "target_name": "Telegram - Family chat", "count": 7},
],
# /thresholds — user's configured thresholds
"poll_failure_threshold": 3,
"deferred_backlog_threshold": 100,
"target_failure_threshold": 5,
# /reset — sample success result
"subject_type": "tracker",
"subject_id": 12,
"subject_name": "Family Photos poller",
"previous_count": 3,
"success": True,
"error_message": None,
# /health — sample "unhealthy" line that matches the failure data
# above. Templates that only branch on ``healthy`` still preview
# cleanly — operators wanting the healthy case can flip the flag
# in their template editor.
"healthy": False,
"summary": "2 trackers failing, 47 deferred, 1 target failing",
"tracker_count": 3,
"failing_tracker_count": 2,
"failing_target_count": 1,
}
return render_template_preview(body.template, sample_ctx)
@@ -244,7 +244,7 @@ async def _test_provider_connection(provider: ServiceProvider) -> dict[str, Any]
)
return await ha.test_connection()
if provider.type in ("scheduler", "webhook"):
if provider.type in ("scheduler", "webhook", "bridge_self"):
return {"ok": True, "message": "Virtual provider — always available"}
return {"ok": False, "message": f"Unknown provider type: {provider.type}"}
@@ -0,0 +1,273 @@
"""Bridge self-monitoring bot command handler.
Read-only commands plus a ``/reset`` operator action that clears in-memory
failure counters once the underlying issue has been fixed.
Counters are kept in module-level dicts inside
``services.bridge_self`` see that module for the increment / reset
helpers used by the watcher / scheduler / dispatcher hot path. We only
read snapshots from those dicts here so we never block the emission
side or hold a lock across DB calls.
"""
from __future__ import annotations
import logging
from typing import Any
from ..database.models import (
CommandConfig,
CommandTracker,
CommandTrackerListener,
ServiceProvider,
TelegramBot,
)
from ..services import bridge_self as bs
from .base import CommandResponse, ProviderCommandHandler
from .handler import _render_cmd_template
_LOGGER = logging.getLogger(__name__)
_BRIDGE_SELF_COMMANDS = {"status", "thresholds", "reset", "health"}
# ---------------------------------------------------------------------------
# Context builders — one per command. Each returns the dict passed to the
# template renderer.
# ---------------------------------------------------------------------------
async def _build_status_context(provider: ServiceProvider) -> dict[str, Any]:
"""Snapshot the calling user's counters + pending deferred backlog.
User-scoped: ``get_user_*_failures`` filter the global counter dicts by
ownership in a single batched query so one user cannot see another
user's failing trackers/targets.
"""
thresholds = await bs.get_user_thresholds(provider.user_id)
poll_rows = await bs.get_user_poll_failures(provider.user_id)
poll_failures = sorted(
({"tracker_id": r["id"], "tracker_name": r["name"], "count": r["count"]} for r in poll_rows),
key=lambda r: r["tracker_id"],
)
target_rows = await bs.get_user_target_failures(provider.user_id)
target_failures = sorted(
({"target_id": r["id"], "target_name": r["name"], "count": r["count"]} for r in target_rows),
key=lambda r: r["target_id"],
)
deferred_pending = await bs.get_pending_deferred_count(provider.user_id)
return {
"poll_failures": poll_failures,
"deferred_pending": deferred_pending,
"deferred_threshold": thresholds["deferred_backlog_threshold"],
"target_failures": target_failures,
}
async def _build_thresholds_context(provider: ServiceProvider) -> dict[str, Any]:
"""Render the user's configured alert thresholds."""
thresholds = await bs.get_user_thresholds(provider.user_id)
return {
"poll_failure_threshold": thresholds["poll_failure_threshold"],
"deferred_backlog_threshold": thresholds["deferred_backlog_threshold"],
"target_failure_threshold": thresholds["target_failure_threshold"],
}
def _parse_reset_subject(args: str) -> tuple[str, int | None, str | None]:
"""Parse a ``/reset`` argument into ``(subject_type, subject_id, error)``.
Accepted forms:
* ``tracker:<id>`` clear that tracker's poll-failure counter
* ``target:<id>`` clear that target's send-failure counter
* ``all`` clear every in-memory counter
On a parse error we return ``("", None, "<message>")`` so the handler can
render a templated error reply instead of propagating an exception.
"""
raw = (args or "").strip()
if not raw:
return "", None, "Missing subject. Use tracker:<id>, target:<id>, or all."
if raw.lower() == "all":
return "all", None, None
if ":" not in raw:
return "", None, (
f"Invalid subject '{raw}'. Use tracker:<id>, target:<id>, or all."
)
prefix, _, value = raw.partition(":")
prefix = prefix.strip().lower()
value = value.strip()
if prefix not in ("tracker", "target"):
return "", None, (
f"Unknown subject type '{prefix}'. Use tracker, target, or all."
)
try:
subject_id = int(value)
except ValueError:
return "", None, f"Invalid id '{value}'. Must be an integer."
if subject_id <= 0:
return "", None, f"Invalid id '{value}'. Must be a positive integer."
return prefix, subject_id, None
async def _build_reset_context(args: str, provider: ServiceProvider) -> dict[str, Any]:
"""Reset a counter, enforcing that the subject belongs to ``provider.user_id``.
Without this ownership check, any authenticated user could reset another
user's counters (or wipe global state via ``all``). For ``all`` we only
clear counters tied to trackers/targets owned by the calling user.
"""
subject_type, subject_id, error = _parse_reset_subject(args)
if error is not None:
return {
"subject_type": "",
"subject_id": None,
"subject_name": "",
"previous_count": 0,
"success": False,
"error_message": error,
}
if subject_type == "all":
cleared = await bs.reset_user_counters(provider.user_id)
return {
"subject_type": "all",
"subject_id": None,
"subject_name": "your counters",
"previous_count": cleared,
"success": True,
"error_message": None,
}
# Verify the subject belongs to the calling user before touching the counter.
owner_lookup = bs.find_tracker_owner if subject_type == "tracker" else bs.find_target_owner
owner = await owner_lookup(subject_id) if subject_id is not None else None
if owner is None or owner != provider.user_id:
return {
"subject_type": subject_type,
"subject_id": subject_id,
"subject_name": "",
"previous_count": 0,
"success": False,
"error_message": f"{subject_type} {subject_id} not found",
}
failure_type = "poll_failures" if subject_type == "tracker" else "target_failures"
name_lookup = bs.get_tracker_name if subject_type == "tracker" else bs.get_target_name
subject_name = await name_lookup(subject_id) if subject_id is not None else ""
previous = bs.reset_counter(failure_type, subject_id)
return {
"subject_type": subject_type,
"subject_id": subject_id,
"subject_name": subject_name,
"previous_count": previous,
"success": True,
"error_message": None,
}
async def _build_health_context(provider: ServiceProvider) -> dict[str, Any]:
"""Build the terse one-line health summary context."""
from ..database.models import NotificationTracker
from sqlmodel import select
from sqlmodel.ext.asyncio.session import AsyncSession
from ..database.engine import get_engine
engine = get_engine()
try:
async with AsyncSession(engine) as session:
result = await session.exec(
select(NotificationTracker).where(
NotificationTracker.user_id == provider.user_id,
NotificationTracker.enabled == True, # noqa: E712 — SQLModel needs ==
)
)
tracker_count = len(list(result.all()))
except Exception: # noqa: BLE001
_LOGGER.exception("health: failed to count trackers for user=%s", provider.user_id)
tracker_count = 0
# User-scoped: only count failures that belong to THIS user's trackers/targets.
failing_tracker_count = len(await bs.get_user_poll_failures(provider.user_id))
failing_target_count = len(await bs.get_user_target_failures(provider.user_id))
deferred_pending = await bs.get_pending_deferred_count(provider.user_id)
# Treat unknown deferred count (DB error) as not-healthy so operators
# don't get a misleading "all clear" when the bridge can't introspect itself.
healthy = (
failing_tracker_count == 0
and failing_target_count == 0
and deferred_pending == 0
)
if healthy:
summary = f"Bridge healthy: {tracker_count} trackers polling, 0 failures"
else:
parts: list[str] = []
if failing_tracker_count:
parts.append(f"{failing_tracker_count} trackers failing")
if deferred_pending is None:
parts.append("deferred backlog unknown (DB unavailable)")
elif deferred_pending:
parts.append(f"{deferred_pending} deferred")
if failing_target_count:
parts.append(f"{failing_target_count} targets failing")
summary = ", ".join(parts) or "bridge state unknown"
return {
"healthy": healthy,
"summary": summary,
"tracker_count": tracker_count,
"failing_tracker_count": failing_tracker_count,
"deferred_pending": deferred_pending,
"failing_target_count": failing_target_count,
}
# ---------------------------------------------------------------------------
# Handler
# ---------------------------------------------------------------------------
class BridgeSelfCommandHandler(ProviderCommandHandler):
"""Read-only commands plus /reset for the bridge_self provider."""
provider_type = "bridge_self"
def get_provider_commands(self) -> set[str]:
return _BRIDGE_SELF_COMMANDS
async def handle(
self,
cmd: str,
args: str,
count: int,
locale: str,
response_mode: str,
provider: ServiceProvider,
cmd_templates: dict[str, dict[str, str]],
bot: TelegramBot,
tracker: CommandTracker,
config: CommandConfig,
*,
listener: CommandTrackerListener | None = None,
allowed_album_ids: set[str] | None = None, # noqa: ARG002 — bridge_self has no album scope
page: int = 1,
) -> CommandResponse | None:
if cmd == "status":
ctx = await _build_status_context(provider)
return CommandResponse(text=_render_cmd_template(cmd_templates, "status", locale, ctx))
if cmd == "thresholds":
ctx = await _build_thresholds_context(provider)
return CommandResponse(text=_render_cmd_template(cmd_templates, "thresholds", locale, ctx))
if cmd == "reset":
ctx = await _build_reset_context(args, provider)
return CommandResponse(text=_render_cmd_template(cmd_templates, "reset", locale, ctx))
if cmd == "health":
ctx = await _build_health_context(provider)
return CommandResponse(text=_render_cmd_template(cmd_templates, "health", locale, ctx))
return None
@@ -35,6 +35,7 @@ def _auto_register() -> None:
from .nut_handler import NutCommandHandler
from .webhook_handler import WebhookCommandHandler
from .home_assistant_handler import HomeAssistantCommandHandler
from .bridge_self_handler import BridgeSelfCommandHandler
register_handler(ImmichCommandHandler())
register_handler(GiteaCommandHandler())
@@ -42,6 +43,7 @@ def _auto_register() -> None:
register_handler(NutCommandHandler())
register_handler(WebhookCommandHandler())
register_handler(HomeAssistantCommandHandler())
register_handler(BridgeSelfCommandHandler())
# Auto-register on import
@@ -195,6 +195,10 @@ async def _seed_default_command_templates() -> None:
session, "home_assistant", "Default Home Assistant Commands",
"Default Home Assistant command templates",
)
await _seed_provider_command_template(
session, "bridge_self", "Default Bridge Self-Monitoring Commands",
"Default Bridge Self-Monitoring command templates",
)
await session.commit()
@@ -381,6 +385,16 @@ async def _seed_default_command_configs() -> None:
"default_count": 5,
"rate_limits": {"search": 30, "default": 10},
},
{
"provider_type": "bridge_self",
"name": "Default Bridge Self-Monitoring",
"enabled_commands": [
"help", "status", "thresholds", "reset", "health",
],
"response_mode": "text",
"default_count": 5,
"rate_limits": {"default": 10},
},
]
for cfg in defaults:
@@ -263,6 +263,117 @@ def get_target_last_error(target_id: int) -> str:
return _target_failure_last_error.get(target_id, "")
def get_all_poll_failures() -> dict[int, int]:
"""Return a snapshot of all current poll failure counters (tracker_id -> count).
Only includes non-zero counters. The returned dict is a copy and can be
iterated safely without holding a reference to the live module-level state.
"""
return {tid: count for tid, count in _poll_failure_counts.items() if count > 0}
def get_all_target_failures() -> dict[int, int]:
"""Return a snapshot of all current target failure counters (target_id -> count).
Only includes non-zero counters.
"""
return {tid: count for tid, count in _target_failure_counts.items() if count > 0}
def reset_counter(failure_type: str, subject_id: int | None = None) -> int:
"""Reset bridge_self failure counters.
Args:
failure_type: One of ``"poll_failures"``, ``"target_failures"``, or
``"all"``. Anything else is treated as a no-op.
subject_id: When ``failure_type`` is ``"poll_failures"`` or
``"target_failures"``, the tracker_id / target_id whose counter
to clear. Ignored when ``failure_type == "all"``.
Returns:
The previous count for the reset entry. For ``"all"``, the total
number of entries cleared across both counter dicts. Idempotent
clearing an absent entry returns 0.
"""
if failure_type == "all":
cleared = (
len(_poll_failure_counts)
+ len(_target_failure_counts)
)
_poll_failure_counts.clear()
_poll_failure_last_error.clear()
_target_failure_counts.clear()
_target_failure_last_error.clear()
return cleared
if failure_type == "poll_failures" and subject_id is not None:
previous = _poll_failure_counts.get(subject_id, 0)
reset_poll_counter(subject_id)
return previous
if failure_type == "target_failures" and subject_id is not None:
previous = _target_failure_counts.get(subject_id, 0)
reset_target_counter(subject_id)
return previous
return 0
async def get_pending_deferred_count(user_id: int) -> int | None:
"""Return the count of pending DeferredDispatch rows for a user.
Used by command handlers to render the current backlog in /status and
/health responses. Returns ``None`` on any error so command templates
can render "unknown" instead of a misleading "0" operators looking
at bridge health when the bridge is unhealthy must not be told
everything is fine.
"""
from sqlalchemy import func
from ..database.models import DeferredDispatch
engine = get_engine()
try:
async with AsyncSession(engine) as session:
result = await session.exec(
select(func.count(DeferredDispatch.id))
.where(DeferredDispatch.status == "pending")
.where(DeferredDispatch.user_id == user_id)
)
count = result.first()
return int(count or 0)
except Exception: # noqa: BLE001 — never block a command reply
_LOGGER.exception("get_pending_deferred_count failed for user_id=%s", user_id)
return None
async def get_tracker_name(tracker_id: int) -> str:
"""Return the display name of a NotificationTracker, or ``"tracker {id}"``."""
from ..database.models import NotificationTracker
engine = get_engine()
try:
async with AsyncSession(engine) as session:
tracker = await session.get(NotificationTracker, tracker_id)
if tracker is not None:
return tracker.name or f"tracker {tracker_id}"
except Exception: # noqa: BLE001
_LOGGER.exception("get_tracker_name failed for tracker_id=%s", tracker_id)
return f"tracker {tracker_id}"
async def get_target_name(target_id: int) -> str:
"""Return the display name of a NotificationTarget, or ``"target {id}"``."""
from ..database.models import NotificationTarget
engine = get_engine()
try:
async with AsyncSession(engine) as session:
target = await session.get(NotificationTarget, target_id)
if target is not None:
return target.name or f"target {target_id}"
except Exception: # noqa: BLE001
_LOGGER.exception("get_target_name failed for target_id=%s", target_id)
return f"target {target_id}"
# ---------------------------------------------------------------------------
# User-level helpers
# ---------------------------------------------------------------------------
@@ -300,6 +411,76 @@ async def find_target_owner(target_id: int) -> int | None:
return int(target.user_id)
async def get_user_poll_failures(user_id: int) -> list[dict[str, Any]]:
"""Return ``[{"id", "name", "count"}]`` for trackers owned by ``user_id``.
Single batched query replaces the N+1 pattern of calling
``get_tracker_name`` per failing tracker, and enforces ownership so
one user cannot see another user's failure list.
"""
from ..database.models import NotificationTracker
snapshot = {tid: c for tid, c in _poll_failure_counts.items() if c > 0}
if not snapshot:
return []
engine = get_engine()
async with AsyncSession(engine) as session:
result = await session.exec(
select(NotificationTracker.id, NotificationTracker.name).where(
NotificationTracker.id.in_(list(snapshot.keys())),
NotificationTracker.user_id == user_id,
)
)
rows = list(result.all())
return [
{"id": int(tid), "name": name or f"tracker {tid}", "count": snapshot[int(tid)]}
for tid, name in rows
]
async def get_user_target_failures(user_id: int) -> list[dict[str, Any]]:
"""Return ``[{"id", "name", "count"}]`` for targets owned by ``user_id``."""
from ..database.models import NotificationTarget
snapshot = {tid: c for tid, c in _target_failure_counts.items() if c > 0}
if not snapshot:
return []
engine = get_engine()
async with AsyncSession(engine) as session:
result = await session.exec(
select(NotificationTarget.id, NotificationTarget.name).where(
NotificationTarget.id.in_(list(snapshot.keys())),
NotificationTarget.user_id == user_id,
)
)
rows = list(result.all())
return [
{"id": int(tid), "name": name or f"target {tid}", "count": snapshot[int(tid)]}
for tid, name in rows
]
async def reset_user_counters(user_id: int) -> int:
"""Clear all poll/target counters for trackers/targets owned by ``user_id``.
Returns the number of distinct (tracker + target) entries cleared.
Cross-user counters are left untouched addresses the multi-tenant
safety hole where ``reset_counter("all")`` wiped every user's state.
"""
polls = await get_user_poll_failures(user_id)
tgts = await get_user_target_failures(user_id)
cleared = 0
for entry in polls:
if _poll_failure_counts.pop(entry["id"], None) is not None:
cleared += 1
_poll_failure_last_error.pop(entry["id"], None)
for entry in tgts:
if _target_failure_counts.pop(entry["id"], None) is not None:
cleared += 1
_target_failure_last_error.pop(entry["id"], None)
return cleared
# ---------------------------------------------------------------------------
# Backlog scan
# ---------------------------------------------------------------------------
+383
View File
@@ -263,3 +263,386 @@ def test_default_template_loader_returns_bridge_self_slots() -> None:
# Sanity: each template references at least one of the bridge_self vars.
for tpl in list(en.values()) + list(ru.values()):
assert "{{" in tpl
# ---------------------------------------------------------------------------
# Bot commands — context builders
#
# These tests run against the real (temp-dir) DB via the FastAPI lifespan so
# that ``services.bridge_self`` helpers using ``get_engine()`` resolve to the
# same DB the test seeds rows into. We follow the pattern used by
# test_webhook_status_handler.py — bootstrap once, seed under TestClient.
# ---------------------------------------------------------------------------
def _bootstrap_app():
"""Bring up the app once so migrations run against the temp DB."""
from notify_bridge_server.main import app
return app
async def _seed_user_and_provider(
*, username: str, config: dict[str, int],
) -> tuple[int, int]:
"""Create a fresh ``(user_id, provider_id)`` against the live engine.
Uses two short-lived sessions to avoid SQLAlchemy auto-expiring the
first-committed object once a second commit fires on the same session.
"""
from notify_bridge_server.database.engine import get_engine
from notify_bridge_server.database.models import ServiceProvider, User
engine = get_engine()
async with AsyncSession(engine) as db:
user = User(
username=f"{username}_{datetime.now(timezone.utc).timestamp()}",
hashed_password="x", role="user",
)
db.add(user)
await db.commit()
await db.refresh(user)
user_id = int(user.id)
async with AsyncSession(engine) as db:
provider = ServiceProvider(
user_id=user_id, type="bridge_self", name="Bridge",
config=dict(config),
)
db.add(provider)
await db.commit()
await db.refresh(provider)
provider_id = int(provider.id)
return user_id, provider_id
async def _load_provider(provider_id: int):
from notify_bridge_server.database.engine import get_engine
from notify_bridge_server.database.models import ServiceProvider
engine = get_engine()
async with AsyncSession(engine) as db:
return await db.get(ServiceProvider, provider_id)
def test_command_status_returns_empty_lists_when_no_failures(tmp_data_dir) -> None: # noqa: ARG001
import asyncio
from fastapi.testclient import TestClient
from notify_bridge_server.commands.bridge_self_handler import (
_build_status_context,
)
from notify_bridge_server.services import bridge_self as bs
app = _bootstrap_app()
with TestClient(app):
async def run() -> None:
_user_id, provider_id = await _seed_user_and_provider(
username="status_user",
config={
"poll_failure_threshold": 3,
"deferred_backlog_threshold": 100,
"target_failure_threshold": 5,
},
)
provider = await _load_provider(provider_id)
# Make sure the in-memory dicts contain nothing.
bs._poll_failure_counts.clear()
bs._target_failure_counts.clear()
ctx = await _build_status_context(provider)
assert ctx["poll_failures"] == []
assert ctx["target_failures"] == []
assert ctx["deferred_pending"] == 0
assert ctx["deferred_threshold"] == 100
asyncio.run(run())
def test_command_thresholds_returns_user_config(tmp_data_dir) -> None: # noqa: ARG001
import asyncio
from fastapi.testclient import TestClient
from notify_bridge_server.commands.bridge_self_handler import (
_build_thresholds_context,
)
app = _bootstrap_app()
with TestClient(app):
async def run() -> None:
_user_id, provider_id = await _seed_user_and_provider(
username="thresholds_user",
config={
"poll_failure_threshold": 7,
"deferred_backlog_threshold": 250,
"target_failure_threshold": 11,
},
)
provider = await _load_provider(provider_id)
ctx = await _build_thresholds_context(provider)
assert ctx == {
"poll_failure_threshold": 7,
"deferred_backlog_threshold": 250,
"target_failure_threshold": 11,
}
asyncio.run(run())
def test_command_reset_clears_named_counter_and_is_idempotent(tmp_data_dir) -> None: # noqa: ARG001
"""``/reset`` clears the in-memory counter and is idempotent.
Now ownership-aware: we seed a real NotificationTracker owned by the
test user so ``find_tracker_owner`` returns a matching user_id and the
reset proceeds. The cross-user-rejection case is covered by
:func:`test_command_reset_rejects_cross_user_subject` below.
"""
import asyncio
from fastapi.testclient import TestClient
from notify_bridge_server.commands.bridge_self_handler import (
_build_reset_context, _parse_reset_subject,
)
from notify_bridge_server.database.engine import get_engine
from notify_bridge_server.database.models import NotificationTracker
from notify_bridge_server.services import bridge_self as bs
app = _bootstrap_app()
with TestClient(app):
async def run() -> None:
user_id, provider_id = await _seed_user_and_provider(
username="reset_user",
config={
"poll_failure_threshold": 3,
"deferred_backlog_threshold": 100,
"target_failure_threshold": 5,
},
)
provider = await _load_provider(provider_id)
# Seed an owned NotificationTracker so the ownership check
# in _build_reset_context can match it back to user_id.
engine = get_engine()
async with AsyncSession(engine) as db:
tracker = NotificationTracker(
user_id=user_id, provider_id=provider_id,
name="reset-test", enabled=True,
)
db.add(tracker)
await db.commit()
await db.refresh(tracker)
tid = int(tracker.id)
bs.reset_poll_counter(tid)
bs.record_poll_failure(tid, "boom")
bs.record_poll_failure(tid, "boom")
assert bs.get_poll_failure_count(tid) == 2
ctx = await _build_reset_context(f"tracker:{tid}", provider)
assert ctx["success"] is True
assert ctx["subject_type"] == "tracker"
assert ctx["subject_id"] == tid
assert ctx["previous_count"] == 2
assert ctx["error_message"] is None
assert bs.get_poll_failure_count(tid) == 0
# Idempotent — second call still succeeds with previous=0.
ctx2 = await _build_reset_context(f"tracker:{tid}", provider)
assert ctx2["success"] is True
assert ctx2["previous_count"] == 0
# Parse error → templated error, no exception.
bad_ctx = await _build_reset_context("not a subject", provider)
assert bad_ctx["success"] is False
assert bad_ctx["error_message"]
asyncio.run(run())
# Parser direct sanity-checks (pure function, no DB needed).
assert _parse_reset_subject("all") == ("all", None, None)
assert _parse_reset_subject("tracker:42") == ("tracker", 42, None)
assert _parse_reset_subject("target:7") == ("target", 7, None)
_, _, err = _parse_reset_subject("rocket:1")
assert err is not None
def test_command_reset_rejects_cross_user_subject(tmp_data_dir) -> None: # noqa: ARG001
"""User A cannot reset a counter belonging to user B's tracker.
Regression guard for the multi-tenant data-leak hole the original
handler had ``reset_counter`` was called without verifying the
subject's ``user_id`` matched ``provider.user_id``.
"""
import asyncio
from fastapi.testclient import TestClient
from notify_bridge_server.commands.bridge_self_handler import _build_reset_context
from notify_bridge_server.database.engine import get_engine
from notify_bridge_server.database.models import NotificationTracker
from notify_bridge_server.services import bridge_self as bs
app = _bootstrap_app()
with TestClient(app):
async def run() -> None:
user_a_id, provider_a_id = await _seed_user_and_provider(
username="user_a", config={
"poll_failure_threshold": 3,
"deferred_backlog_threshold": 100,
"target_failure_threshold": 5,
},
)
user_b_id, provider_b_id = await _seed_user_and_provider(
username="user_b", config={
"poll_failure_threshold": 3,
"deferred_backlog_threshold": 100,
"target_failure_threshold": 5,
},
)
provider_a = await _load_provider(provider_a_id)
# Seed a tracker owned by user B and increment its counter.
engine = get_engine()
async with AsyncSession(engine) as db:
tracker_b = NotificationTracker(
user_id=user_b_id, provider_id=provider_b_id,
name="b-only", enabled=True,
)
db.add(tracker_b)
await db.commit()
await db.refresh(tracker_b)
tid_b = int(tracker_b.id)
bs.reset_poll_counter(tid_b)
bs.record_poll_failure(tid_b, "boom")
assert bs.get_poll_failure_count(tid_b) == 1
# User A tries to reset user B's tracker — must fail.
ctx = await _build_reset_context(f"tracker:{tid_b}", provider_a)
assert ctx["success"] is False
assert "not found" in (ctx["error_message"] or "").lower()
# Counter must remain untouched.
assert bs.get_poll_failure_count(tid_b) == 1
asyncio.run(run())
def test_command_health_is_healthy_when_counters_zero(tmp_data_dir) -> None: # noqa: ARG001
import asyncio
from fastapi.testclient import TestClient
from notify_bridge_server.commands.bridge_self_handler import (
_build_health_context,
)
from notify_bridge_server.services import bridge_self as bs
app = _bootstrap_app()
with TestClient(app):
async def run() -> None:
_user_id, provider_id = await _seed_user_and_provider(
username="health_user",
config={
"poll_failure_threshold": 3,
"deferred_backlog_threshold": 100,
"target_failure_threshold": 5,
},
)
provider = await _load_provider(provider_id)
# Empty counters and no deferred rows for this user.
bs._poll_failure_counts.clear()
bs._target_failure_counts.clear()
ctx = await _build_health_context(provider)
assert ctx["healthy"] is True
assert ctx["failing_tracker_count"] == 0
assert ctx["failing_target_count"] == 0
assert ctx["deferred_pending"] == 0
assert "healthy" in ctx["summary"].lower()
asyncio.run(run())
# ---------------------------------------------------------------------------
# reset_counter direct unit test (covers the "all" branch)
# ---------------------------------------------------------------------------
def test_reset_counter_all_clears_every_dict() -> None:
from notify_bridge_server.services import bridge_self as bs
# Seed both dicts with a couple of entries.
bs._poll_failure_counts.clear()
bs._target_failure_counts.clear()
bs.record_poll_failure(9_401, "boom")
bs.record_poll_failure(9_402, "boom")
bs.record_target_failure(9_501, "503")
cleared = bs.reset_counter("all")
# 2 poll + 1 target = 3 entries cleared.
assert cleared == 3
assert bs._poll_failure_counts == {}
assert bs._target_failure_counts == {}
def test_reset_counter_unknown_failure_type_is_noop() -> None:
from notify_bridge_server.services import bridge_self as bs
bs._poll_failure_counts.clear()
bs.record_poll_failure(9_601, "boom")
# Unknown type returns 0 and leaves dicts intact.
assert bs.reset_counter("rocket_failures", 9_601) == 0
assert bs.get_poll_failure_count(9_601) == 1
# ---------------------------------------------------------------------------
# Capability / handler registration
# ---------------------------------------------------------------------------
def test_capability_registry_lists_bridge_self_commands() -> None:
from notify_bridge_core.providers.capabilities import get_capabilities
caps = get_capabilities("bridge_self")
assert caps is not None
cmd_names = {c["name"] for c in caps.commands}
assert {"status", "thresholds", "reset", "health", "help"}.issubset(cmd_names)
slot_names = {s["name"] for s in caps.command_slots}
# Response slots
assert {"status", "thresholds", "reset", "health"}.issubset(slot_names)
# Description slots — needed so the menu sync registers a description
# for every operator-facing command.
assert {
"desc_status", "desc_thresholds", "desc_reset", "desc_health",
}.issubset(slot_names)
def test_command_handler_registered_for_bridge_self() -> None:
"""Auto-registration wires the bridge_self handler into dispatch."""
from notify_bridge_server.commands.dispatch import get_handler
handler = get_handler("bridge_self")
assert handler is not None
assert handler.provider_type == "bridge_self"
assert {"status", "thresholds", "reset", "health"} == handler.get_provider_commands()
def test_default_command_template_loader_returns_bridge_self_slots() -> None:
"""All shipped command-template slots load for both locales."""
from notify_bridge_core.templates.command_defaults.loader import (
load_default_command_templates,
)
en = load_default_command_templates("en", "bridge_self")
ru = load_default_command_templates("ru", "bridge_self")
required = {"status", "thresholds", "reset", "health", "help", "start"}
assert required.issubset(en.keys())
assert required.issubset(ru.keys())