feat: bridge_self bot commands — status, thresholds, reset, health

Adds bot commands for the bridge_self provider so operators can inspect and manage bridge health from chat: /status, /thresholds, /reset, /health. Includes Jinja2 templates for both locales, seed data, capability slots, and a handler that exposes pending deferred backlog plus per-counter reset. Also adds .claude/skills/ for project-scoped graph-aware skills.
2026-05-16 03:43:48 +03:00
parent 10d30fc956
commit 8651767112
50 changed files with 1311 additions and 60 deletions
@@ -542,8 +542,32 @@ BRIDGE_SELF_CAPABILITIES = ProviderCapabilities(
        {"name": "bridge_self_deferred_backlog", "description": "Deferred backlog high"},
        {"name": "bridge_self_target_failures", "description": "Target send failures"},
    ],
-    command_slots=[],
-    commands=[],
+    command_slots=[
+        # Response templates
+        {"name": "start", "description": "/start greeting message"},
+        {"name": "help", "description": "/help command listing"},
+        {"name": "status", "description": "/status full counter snapshot"},
+        {"name": "thresholds", "description": "/thresholds configured alert thresholds"},
+        {"name": "reset", "description": "/reset manual counter reset"},
+        {"name": "health", "description": "/health terse one-line summary"},
+        {"name": "rate_limited", "description": "Rate limit warning message"},
+        {"name": "no_results", "description": "Empty results fallback"},
+        # Description slots
+        {"name": "desc_help", "description": "Menu description for /help"},
+        {"name": "desc_status", "description": "Menu description for /status"},
+        {"name": "desc_thresholds", "description": "Menu description for /thresholds"},
+        {"name": "desc_reset", "description": "Menu description for /reset"},
+        {"name": "desc_health", "description": "Menu description for /health"},
+        # Usage examples
+        {"name": "usage_reset", "description": "Usage example for /reset"},
+    ],
+    commands=[
+        {"name": "status", "description": "Show current bridge health counters"},
+        {"name": "thresholds", "description": "Show configured alert thresholds"},
+        {"name": "reset", "description": "Manually reset a failure counter"},
+        {"name": "health", "description": "Terse one-line health summary"},
+        {"name": "help", "description": "Show commands"},
+    ],
 )


@@ -0,0 +1 @@
+Terse one-line health summary
@@ -0,0 +1 @@
+Show available commands
@@ -0,0 +1 @@
+Reset a failure counter (tracker:&lt;id&gt;, target:&lt;id&gt;, or all)
@@ -0,0 +1 @@
+Show current bridge health counters
@@ -0,0 +1 @@
+Show configured alert thresholds
@@ -0,0 +1,5 @@
+{%- if healthy -%}
+✅ {{ summary }}
+{%- else -%}
+🚨 {{ summary }}
+{%- endif %}
@@ -0,0 +1,5 @@
+🩺 <b>Available commands:</b>
+{%- for cmd in commands %}
+/{{ cmd.name }} — {{ cmd.description }}
+{%- if cmd.usage %}  ↳ {{ cmd.usage }}{% endif %}
+{%- endfor %}
@@ -0,0 +1 @@
+No results.
@@ -0,0 +1 @@
+⏳ Too many requests. Please wait {{ wait }}s before trying again.
@@ -0,0 +1,14 @@
+{%- if success %}
+✅ <b>Counter reset</b>
+{%- if subject_type == 'all' %}
+Cleared {{ previous_count }} of your failure counters (trackers + targets).
+{%- else %}
+{{ subject_type|capitalize }} <b>{{ subject_name }}</b>{% if subject_id %} (id <code>{{ subject_id }}</code>){% endif %}
+Previous count: <b>{{ previous_count }}</b> → 0
+{%- endif %}
+{%- else %}
+❌ <b>Reset failed</b>
+{%- if error_message %}
+<i>{{ error_message }}</i>
+{%- endif %}
+{%- endif %}
@@ -0,0 +1,2 @@
+👋 Hi! I'm your Notify Bridge bot for <b>Bridge Self-Monitoring</b>.
+Use /help to see available commands.
@@ -0,0 +1,28 @@
+🩺 <b>Bridge Status</b>
+{%- if poll_failures %}
+
+🚨 <b>Tracker poll failures</b>
+{%- for f in poll_failures %}
+• <b>{{ f.tracker_name }}</b> (id <code>{{ f.tracker_id }}</code>) — {{ f.count }} consecutive
+{%- endfor %}
+{%- endif %}
+{%- if deferred_pending is none %}
+
+⏳ <b>Deferred backlog</b>
+Pending: <b>unknown</b> (DB unavailable) · Threshold: {{ deferred_threshold }}
+{%- elif deferred_pending %}
+
+⏳ <b>Deferred backlog</b>
+Pending: <b>{{ deferred_pending }}</b> · Threshold: {{ deferred_threshold }}
+{%- endif %}
+{%- if target_failures %}
+
+📡 <b>Target send failures</b>
+{%- for f in target_failures %}
+• <b>{{ f.target_name }}</b> (id <code>{{ f.target_id }}</code>) — {{ f.count }} consecutive
+{%- endfor %}
+{%- endif %}
+{%- if not poll_failures and not target_failures and deferred_pending == 0 %}
+
+✅ All counters at zero. Nothing to report.
+{%- endif %}
@@ -0,0 +1,4 @@
+⚙️ <b>Bridge Thresholds</b>
+Tracker poll failures: <b>{{ poll_failure_threshold }}</b>
+Deferred backlog: <b>{{ deferred_backlog_threshold }}</b>
+Target send failures: <b>{{ target_failure_threshold }}</b>
@@ -0,0 +1 @@
+/reset tracker:42 (or target:&lt;id&gt;, or all)
@@ -73,6 +73,15 @@ PROVIDER_COMMAND_SLOTS: dict[str, list[str]] = {
        # Usage examples
        "usage_entities", "usage_state",
    ],
+    "bridge_self": [
+        # Response templates
+        "start", "help", "status", "thresholds", "reset", "health",
+        "rate_limited", "no_results",
+        # Description slots
+        "desc_help", "desc_status", "desc_thresholds", "desc_reset", "desc_health",
+        # Usage examples
+        "usage_reset",
+    ],
 }

 # Backward-compatible aliases
@@ -0,0 +1 @@
+Краткая однострочная сводка состояния
@@ -0,0 +1 @@
+Показать доступные команды
@@ -0,0 +1 @@
+Сбросить счётчик сбоев (tracker:&lt;id&gt;, target:&lt;id&gt; или all)
@@ -0,0 +1 @@
+Показать счётчики состояния моста
@@ -0,0 +1 @@
+Показать настроенные пороги оповещений
@@ -0,0 +1,5 @@
+{%- if healthy -%}
+✅ {{ summary }}
+{%- else -%}
+🚨 {{ summary }}
+{%- endif %}
@@ -0,0 +1,5 @@
+🩺 <b>Доступные команды:</b>
+{%- for cmd in commands %}
+/{{ cmd.name }} — {{ cmd.description }}
+{%- if cmd.usage %}  ↳ {{ cmd.usage }}{% endif %}
+{%- endfor %}
@@ -0,0 +1 @@
+Нет результатов.
@@ -0,0 +1 @@
+⏳ Слишком много запросов. Подождите {{ wait }}с и попробуйте снова.
@@ -0,0 +1,14 @@
+{%- if success %}
+✅ <b>Счётчик сброшен</b>
+{%- if subject_type == 'all' %}
+Очищено {{ previous_count }} ваших счётчиков (трекеры + цели).
+{%- else %}
+{{ subject_type|capitalize }} <b>{{ subject_name }}</b>{% if subject_id %} (id <code>{{ subject_id }}</code>){% endif %}
+Было: <b>{{ previous_count }}</b> → 0
+{%- endif %}
+{%- else %}
+❌ <b>Не удалось сбросить</b>
+{%- if error_message %}
+<i>{{ error_message }}</i>
+{%- endif %}
+{%- endif %}
@@ -0,0 +1,2 @@
+👋 Привет! Я бот Notify Bridge для <b>Самомониторинга моста</b>.
+Используйте /help, чтобы посмотреть доступные команды.
@@ -0,0 +1,28 @@
+🩺 <b>Состояние моста</b>
+{%- if poll_failures %}
+
+🚨 <b>Сбои опроса трекеров</b>
+{%- for f in poll_failures %}
+• <b>{{ f.tracker_name }}</b> (id <code>{{ f.tracker_id }}</code>) — {{ f.count }} подряд
+{%- endfor %}
+{%- endif %}
+{%- if deferred_pending is none %}
+
+⏳ <b>Очередь отложенной отправки</b>
+В ожидании: <b>неизвестно</b> (БД недоступна) · Порог: {{ deferred_threshold }}
+{%- elif deferred_pending %}
+
+⏳ <b>Очередь отложенной отправки</b>
+В ожидании: <b>{{ deferred_pending }}</b> · Порог: {{ deferred_threshold }}
+{%- endif %}
+{%- if target_failures %}
+
+📡 <b>Сбои отправки в адресаты</b>
+{%- for f in target_failures %}
+• <b>{{ f.target_name }}</b> (id <code>{{ f.target_id }}</code>) — {{ f.count }} подряд
+{%- endfor %}
+{%- endif %}
+{%- if not poll_failures and not target_failures and deferred_pending == 0 %}
+
+✅ Все счётчики в нуле. Всё хорошо.
+{%- endif %}
@@ -0,0 +1,4 @@
+⚙️ <b>Пороги моста</b>
+Сбои опроса трекеров: <b>{{ poll_failure_threshold }}</b>
+Очередь отложенной отправки: <b>{{ deferred_backlog_threshold }}</b>
+Сбои отправки в адресаты: <b>{{ target_failure_threshold }}</b>
@@ -0,0 +1 @@
+/reset tracker:42 (или target:&lt;id&gt;, или all)
@@ -413,6 +413,69 @@ async def get_command_variables(
        },
    }

+    # --- Bridge self-monitoring ---
+    bridge_self_poll_failure_fields = {
+        "tracker_id": "Tracker id (int)",
+        "tracker_name": "Tracker display name",
+        "count": "Consecutive poll failures",
+    }
+    bridge_self_target_failure_fields = {
+        "target_id": "Target id (int)",
+        "target_name": "Target display name",
+        "count": "Consecutive send failures",
+    }
+    bridge_self = {
+        "status": {
+            "description": "/status snapshot of all bridge_self counters",
+            "variables": {
+                **common_vars,
+                "poll_failures": "List of {tracker_id, tracker_name, count} dicts (use {% for f in poll_failures %})",
+                "deferred_pending": "Pending deferred-dispatch row count for this user",
+                "deferred_threshold": "Deferred backlog threshold from provider config",
+                "target_failures": "List of {target_id, target_name, count} dicts (use {% for f in target_failures %})",
+            },
+            "poll_failure_fields": bridge_self_poll_failure_fields,
+            "target_failure_fields": bridge_self_target_failure_fields,
+        },
+        "thresholds": {
+            "description": "/thresholds configured alert thresholds",
+            "variables": {
+                **common_vars,
+                "poll_failure_threshold": "Tracker poll failure threshold (int)",
+                "deferred_backlog_threshold": "Deferred backlog threshold (int)",
+                "target_failure_threshold": "Target send failure threshold (int)",
+            },
+        },
+        "reset": {
+            "description": "/reset result of a manual counter reset",
+            "variables": {
+                **common_vars,
+                "subject_type": "'tracker', 'target', or 'all' (empty on parse error)",
+                "subject_id": "Subject id (int) or None for 'all' / errors",
+                "subject_name": "Display name of the subject (empty on error)",
+                "previous_count": "Counter value before reset",
+                "success": "Whether the reset succeeded (boolean)",
+                "error_message": "Error message when success=False (None on success)",
+            },
+        },
+        "health": {
+            "description": "/health terse one-line summary",
+            "variables": {
+                **common_vars,
+                "healthy": "Whether everything is at zero (boolean)",
+                "summary": "Human-readable one-line summary",
+                "tracker_count": "Total enabled tracker count for this user",
+                "failing_tracker_count": "Number of trackers with non-zero poll failures",
+                "deferred_pending": "Pending deferred-dispatch row count for this user",
+                "failing_target_count": "Number of targets with non-zero send failures",
+            },
+        },
+        "desc_thresholds": {"description": "Description for /thresholds command", "variables": common_vars},
+        "desc_reset": {"description": "Description for /reset command", "variables": common_vars},
+        "desc_health": {"description": "Description for /health command", "variables": common_vars},
+        "usage_reset": {"description": "Usage example for /reset", "variables": common_vars},
+    }
+
    return {
        **shared,
        "immich": immich,
@@ -421,6 +484,7 @@ async def get_command_variables(
        "nut": nut,
        "google_photos": google_photos,
        "webhook": webhook,
+        "bridge_self": bridge_self,
    }


@@ -625,6 +689,39 @@ async def preview_raw(
            {"area_id": "kitchen", "name": "Kitchen", "entity_count": 14},
            {"area_id": "entrance", "name": "Entrance", "entity_count": 4},
        ],
+        # --- Bridge self-monitoring ---
+        # /status — three categories of failure (preview shows non-empty data
+        # so operators can see how the template renders failures, not the
+        # all-zero case)
+        "poll_failures": [
+            {"tracker_id": 12, "tracker_name": "Family Photos poller", "count": 3},
+            {"tracker_id": 18, "tracker_name": "Vacation 2025 poller", "count": 5},
+        ],
+        "deferred_pending": 47,
+        "deferred_threshold": 100,
+        "target_failures": [
+            {"target_id": 4, "target_name": "Telegram - Family chat", "count": 7},
+        ],
+        # /thresholds — user's configured thresholds
+        "poll_failure_threshold": 3,
+        "deferred_backlog_threshold": 100,
+        "target_failure_threshold": 5,
+        # /reset — sample success result
+        "subject_type": "tracker",
+        "subject_id": 12,
+        "subject_name": "Family Photos poller",
+        "previous_count": 3,
+        "success": True,
+        "error_message": None,
+        # /health — sample "unhealthy" line that matches the failure data
+        # above. Templates that only branch on ``healthy`` still preview
+        # cleanly — operators wanting the healthy case can flip the flag
+        # in their template editor.
+        "healthy": False,
+        "summary": "2 trackers failing, 47 deferred, 1 target failing",
+        "tracker_count": 3,
+        "failing_tracker_count": 2,
+        "failing_target_count": 1,
    }

    return render_template_preview(body.template, sample_ctx)
@@ -244,7 +244,7 @@ async def _test_provider_connection(provider: ServiceProvider) -> dict[str, Any]
        )
        return await ha.test_connection()

-    if provider.type in ("scheduler", "webhook"):
+    if provider.type in ("scheduler", "webhook", "bridge_self"):
        return {"ok": True, "message": "Virtual provider — always available"}

    return {"ok": False, "message": f"Unknown provider type: {provider.type}"}
@@ -0,0 +1,273 @@
+"""Bridge self-monitoring bot command handler.
+
+Read-only commands plus a ``/reset`` operator action that clears in-memory
+failure counters once the underlying issue has been fixed.
+
+Counters are kept in module-level dicts inside
+``services.bridge_self`` — see that module for the increment / reset
+helpers used by the watcher / scheduler / dispatcher hot path. We only
+read snapshots from those dicts here so we never block the emission
+side or hold a lock across DB calls.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+from ..database.models import (
+    CommandConfig,
+    CommandTracker,
+    CommandTrackerListener,
+    ServiceProvider,
+    TelegramBot,
+)
+from ..services import bridge_self as bs
+from .base import CommandResponse, ProviderCommandHandler
+from .handler import _render_cmd_template
+
+_LOGGER = logging.getLogger(__name__)
+
+_BRIDGE_SELF_COMMANDS = {"status", "thresholds", "reset", "health"}
+
+
+# ---------------------------------------------------------------------------
+# Context builders — one per command. Each returns the dict passed to the
+# template renderer.
+# ---------------------------------------------------------------------------
+
+
+async def _build_status_context(provider: ServiceProvider) -> dict[str, Any]:
+    """Snapshot the calling user's counters + pending deferred backlog.
+
+    User-scoped: ``get_user_*_failures`` filter the global counter dicts by
+    ownership in a single batched query so one user cannot see another
+    user's failing trackers/targets.
+    """
+    thresholds = await bs.get_user_thresholds(provider.user_id)
+
+    poll_rows = await bs.get_user_poll_failures(provider.user_id)
+    poll_failures = sorted(
+        ({"tracker_id": r["id"], "tracker_name": r["name"], "count": r["count"]} for r in poll_rows),
+        key=lambda r: r["tracker_id"],
+    )
+
+    target_rows = await bs.get_user_target_failures(provider.user_id)
+    target_failures = sorted(
+        ({"target_id": r["id"], "target_name": r["name"], "count": r["count"]} for r in target_rows),
+        key=lambda r: r["target_id"],
+    )
+
+    deferred_pending = await bs.get_pending_deferred_count(provider.user_id)
+
+    return {
+        "poll_failures": poll_failures,
+        "deferred_pending": deferred_pending,
+        "deferred_threshold": thresholds["deferred_backlog_threshold"],
+        "target_failures": target_failures,
+    }
+
+
+async def _build_thresholds_context(provider: ServiceProvider) -> dict[str, Any]:
+    """Render the user's configured alert thresholds."""
+    thresholds = await bs.get_user_thresholds(provider.user_id)
+    return {
+        "poll_failure_threshold": thresholds["poll_failure_threshold"],
+        "deferred_backlog_threshold": thresholds["deferred_backlog_threshold"],
+        "target_failure_threshold": thresholds["target_failure_threshold"],
+    }
+
+
+def _parse_reset_subject(args: str) -> tuple[str, int | None, str | None]:
+    """Parse a ``/reset`` argument into ``(subject_type, subject_id, error)``.
+
+    Accepted forms:
+      * ``tracker:<id>`` — clear that tracker's poll-failure counter
+      * ``target:<id>`` — clear that target's send-failure counter
+      * ``all`` — clear every in-memory counter
+
+    On a parse error we return ``("", None, "<message>")`` so the handler can
+    render a templated error reply instead of propagating an exception.
+    """
+    raw = (args or "").strip()
+    if not raw:
+        return "", None, "Missing subject. Use tracker:<id>, target:<id>, or all."
+    if raw.lower() == "all":
+        return "all", None, None
+    if ":" not in raw:
+        return "", None, (
+            f"Invalid subject '{raw}'. Use tracker:<id>, target:<id>, or all."
+        )
+    prefix, _, value = raw.partition(":")
+    prefix = prefix.strip().lower()
+    value = value.strip()
+    if prefix not in ("tracker", "target"):
+        return "", None, (
+            f"Unknown subject type '{prefix}'. Use tracker, target, or all."
+        )
+    try:
+        subject_id = int(value)
+    except ValueError:
+        return "", None, f"Invalid id '{value}'. Must be an integer."
+    if subject_id <= 0:
+        return "", None, f"Invalid id '{value}'. Must be a positive integer."
+    return prefix, subject_id, None
+
+
+async def _build_reset_context(args: str, provider: ServiceProvider) -> dict[str, Any]:
+    """Reset a counter, enforcing that the subject belongs to ``provider.user_id``.
+
+    Without this ownership check, any authenticated user could reset another
+    user's counters (or wipe global state via ``all``). For ``all`` we only
+    clear counters tied to trackers/targets owned by the calling user.
+    """
+    subject_type, subject_id, error = _parse_reset_subject(args)
+    if error is not None:
+        return {
+            "subject_type": "",
+            "subject_id": None,
+            "subject_name": "",
+            "previous_count": 0,
+            "success": False,
+            "error_message": error,
+        }
+
+    if subject_type == "all":
+        cleared = await bs.reset_user_counters(provider.user_id)
+        return {
+            "subject_type": "all",
+            "subject_id": None,
+            "subject_name": "your counters",
+            "previous_count": cleared,
+            "success": True,
+            "error_message": None,
+        }
+
+    # Verify the subject belongs to the calling user before touching the counter.
+    owner_lookup = bs.find_tracker_owner if subject_type == "tracker" else bs.find_target_owner
+    owner = await owner_lookup(subject_id) if subject_id is not None else None
+    if owner is None or owner != provider.user_id:
+        return {
+            "subject_type": subject_type,
+            "subject_id": subject_id,
+            "subject_name": "",
+            "previous_count": 0,
+            "success": False,
+            "error_message": f"{subject_type} {subject_id} not found",
+        }
+
+    failure_type = "poll_failures" if subject_type == "tracker" else "target_failures"
+    name_lookup = bs.get_tracker_name if subject_type == "tracker" else bs.get_target_name
+    subject_name = await name_lookup(subject_id) if subject_id is not None else ""
+    previous = bs.reset_counter(failure_type, subject_id)
+    return {
+        "subject_type": subject_type,
+        "subject_id": subject_id,
+        "subject_name": subject_name,
+        "previous_count": previous,
+        "success": True,
+        "error_message": None,
+    }
+
+
+async def _build_health_context(provider: ServiceProvider) -> dict[str, Any]:
+    """Build the terse one-line health summary context."""
+    from ..database.models import NotificationTracker
+    from sqlmodel import select
+    from sqlmodel.ext.asyncio.session import AsyncSession
+
+    from ..database.engine import get_engine
+
+    engine = get_engine()
+    try:
+        async with AsyncSession(engine) as session:
+            result = await session.exec(
+                select(NotificationTracker).where(
+                    NotificationTracker.user_id == provider.user_id,
+                    NotificationTracker.enabled == True,  # noqa: E712 — SQLModel needs ==
+                )
+            )
+            tracker_count = len(list(result.all()))
+    except Exception:  # noqa: BLE001
+        _LOGGER.exception("health: failed to count trackers for user=%s", provider.user_id)
+        tracker_count = 0
+
+    # User-scoped: only count failures that belong to THIS user's trackers/targets.
+    failing_tracker_count = len(await bs.get_user_poll_failures(provider.user_id))
+    failing_target_count = len(await bs.get_user_target_failures(provider.user_id))
+    deferred_pending = await bs.get_pending_deferred_count(provider.user_id)
+
+    # Treat unknown deferred count (DB error) as not-healthy so operators
+    # don't get a misleading "all clear" when the bridge can't introspect itself.
+    healthy = (
+        failing_tracker_count == 0
+        and failing_target_count == 0
+        and deferred_pending == 0
+    )
+    if healthy:
+        summary = f"Bridge healthy: {tracker_count} trackers polling, 0 failures"
+    else:
+        parts: list[str] = []
+        if failing_tracker_count:
+            parts.append(f"{failing_tracker_count} trackers failing")
+        if deferred_pending is None:
+            parts.append("deferred backlog unknown (DB unavailable)")
+        elif deferred_pending:
+            parts.append(f"{deferred_pending} deferred")
+        if failing_target_count:
+            parts.append(f"{failing_target_count} targets failing")
+        summary = ", ".join(parts) or "bridge state unknown"
+
+    return {
+        "healthy": healthy,
+        "summary": summary,
+        "tracker_count": tracker_count,
+        "failing_tracker_count": failing_tracker_count,
+        "deferred_pending": deferred_pending,
+        "failing_target_count": failing_target_count,
+    }
+
+
+# ---------------------------------------------------------------------------
+# Handler
+# ---------------------------------------------------------------------------
+
+
+class BridgeSelfCommandHandler(ProviderCommandHandler):
+    """Read-only commands plus /reset for the bridge_self provider."""
+
+    provider_type = "bridge_self"
+
+    def get_provider_commands(self) -> set[str]:
+        return _BRIDGE_SELF_COMMANDS
+
+    async def handle(
+        self,
+        cmd: str,
+        args: str,
+        count: int,
+        locale: str,
+        response_mode: str,
+        provider: ServiceProvider,
+        cmd_templates: dict[str, dict[str, str]],
+        bot: TelegramBot,
+        tracker: CommandTracker,
+        config: CommandConfig,
+        *,
+        listener: CommandTrackerListener | None = None,
+        allowed_album_ids: set[str] | None = None,  # noqa: ARG002 — bridge_self has no album scope
+        page: int = 1,
+    ) -> CommandResponse | None:
+        if cmd == "status":
+            ctx = await _build_status_context(provider)
+            return CommandResponse(text=_render_cmd_template(cmd_templates, "status", locale, ctx))
+        if cmd == "thresholds":
+            ctx = await _build_thresholds_context(provider)
+            return CommandResponse(text=_render_cmd_template(cmd_templates, "thresholds", locale, ctx))
+        if cmd == "reset":
+            ctx = await _build_reset_context(args, provider)
+            return CommandResponse(text=_render_cmd_template(cmd_templates, "reset", locale, ctx))
+        if cmd == "health":
+            ctx = await _build_health_context(provider)
+            return CommandResponse(text=_render_cmd_template(cmd_templates, "health", locale, ctx))
+        return None
@@ -35,6 +35,7 @@ def _auto_register() -> None:
    from .nut_handler import NutCommandHandler
    from .webhook_handler import WebhookCommandHandler
    from .home_assistant_handler import HomeAssistantCommandHandler
+    from .bridge_self_handler import BridgeSelfCommandHandler

    register_handler(ImmichCommandHandler())
    register_handler(GiteaCommandHandler())
@@ -42,6 +43,7 @@ def _auto_register() -> None:
    register_handler(NutCommandHandler())
    register_handler(WebhookCommandHandler())
    register_handler(HomeAssistantCommandHandler())
+    register_handler(BridgeSelfCommandHandler())


 # Auto-register on import
@@ -195,6 +195,10 @@ async def _seed_default_command_templates() -> None:
            session, "home_assistant", "Default Home Assistant Commands",
            "Default Home Assistant command templates",
        )
+        await _seed_provider_command_template(
+            session, "bridge_self", "Default Bridge Self-Monitoring Commands",
+            "Default Bridge Self-Monitoring command templates",
+        )
        await session.commit()


@@ -381,6 +385,16 @@ async def _seed_default_command_configs() -> None:
                "default_count": 5,
                "rate_limits": {"search": 30, "default": 10},
            },
+            {
+                "provider_type": "bridge_self",
+                "name": "Default Bridge Self-Monitoring",
+                "enabled_commands": [
+                    "help", "status", "thresholds", "reset", "health",
+                ],
+                "response_mode": "text",
+                "default_count": 5,
+                "rate_limits": {"default": 10},
+            },
        ]

        for cfg in defaults:
@@ -263,6 +263,117 @@ def get_target_last_error(target_id: int) -> str:
    return _target_failure_last_error.get(target_id, "")


+def get_all_poll_failures() -> dict[int, int]:
+    """Return a snapshot of all current poll failure counters (tracker_id -> count).
+
+    Only includes non-zero counters. The returned dict is a copy and can be
+    iterated safely without holding a reference to the live module-level state.
+    """
+    return {tid: count for tid, count in _poll_failure_counts.items() if count > 0}
+
+
+def get_all_target_failures() -> dict[int, int]:
+    """Return a snapshot of all current target failure counters (target_id -> count).
+
+    Only includes non-zero counters.
+    """
+    return {tid: count for tid, count in _target_failure_counts.items() if count > 0}
+
+
+def reset_counter(failure_type: str, subject_id: int | None = None) -> int:
+    """Reset bridge_self failure counters.
+
+    Args:
+        failure_type: One of ``"poll_failures"``, ``"target_failures"``, or
+            ``"all"``. Anything else is treated as a no-op.
+        subject_id: When ``failure_type`` is ``"poll_failures"`` or
+            ``"target_failures"``, the tracker_id / target_id whose counter
+            to clear. Ignored when ``failure_type == "all"``.
+
+    Returns:
+        The previous count for the reset entry. For ``"all"``, the total
+        number of entries cleared across both counter dicts. Idempotent —
+        clearing an absent entry returns 0.
+    """
+    if failure_type == "all":
+        cleared = (
+            len(_poll_failure_counts)
+            + len(_target_failure_counts)
+        )
+        _poll_failure_counts.clear()
+        _poll_failure_last_error.clear()
+        _target_failure_counts.clear()
+        _target_failure_last_error.clear()
+        return cleared
+    if failure_type == "poll_failures" and subject_id is not None:
+        previous = _poll_failure_counts.get(subject_id, 0)
+        reset_poll_counter(subject_id)
+        return previous
+    if failure_type == "target_failures" and subject_id is not None:
+        previous = _target_failure_counts.get(subject_id, 0)
+        reset_target_counter(subject_id)
+        return previous
+    return 0
+
+
+async def get_pending_deferred_count(user_id: int) -> int | None:
+    """Return the count of pending DeferredDispatch rows for a user.
+
+    Used by command handlers to render the current backlog in /status and
+    /health responses. Returns ``None`` on any error so command templates
+    can render "unknown" instead of a misleading "0" — operators looking
+    at bridge health when the bridge is unhealthy must not be told
+    everything is fine.
+    """
+    from sqlalchemy import func
+
+    from ..database.models import DeferredDispatch
+
+    engine = get_engine()
+    try:
+        async with AsyncSession(engine) as session:
+            result = await session.exec(
+                select(func.count(DeferredDispatch.id))
+                .where(DeferredDispatch.status == "pending")
+                .where(DeferredDispatch.user_id == user_id)
+            )
+            count = result.first()
+            return int(count or 0)
+    except Exception:  # noqa: BLE001 — never block a command reply
+        _LOGGER.exception("get_pending_deferred_count failed for user_id=%s", user_id)
+        return None
+
+
+async def get_tracker_name(tracker_id: int) -> str:
+    """Return the display name of a NotificationTracker, or ``"tracker {id}"``."""
+    from ..database.models import NotificationTracker
+
+    engine = get_engine()
+    try:
+        async with AsyncSession(engine) as session:
+            tracker = await session.get(NotificationTracker, tracker_id)
+            if tracker is not None:
+                return tracker.name or f"tracker {tracker_id}"
+    except Exception:  # noqa: BLE001
+        _LOGGER.exception("get_tracker_name failed for tracker_id=%s", tracker_id)
+    return f"tracker {tracker_id}"
+
+
+async def get_target_name(target_id: int) -> str:
+    """Return the display name of a NotificationTarget, or ``"target {id}"``."""
+    from ..database.models import NotificationTarget
+
+    engine = get_engine()
+    try:
+        async with AsyncSession(engine) as session:
+            target = await session.get(NotificationTarget, target_id)
+            if target is not None:
+                return target.name or f"target {target_id}"
+    except Exception:  # noqa: BLE001
+        _LOGGER.exception("get_target_name failed for target_id=%s", target_id)
+    return f"target {target_id}"
+
+
 # ---------------------------------------------------------------------------
 # User-level helpers
 # ---------------------------------------------------------------------------
@@ -300,6 +411,76 @@ async def find_target_owner(target_id: int) -> int | None:
        return int(target.user_id)


+async def get_user_poll_failures(user_id: int) -> list[dict[str, Any]]:
+    """Return ``[{"id", "name", "count"}]`` for trackers owned by ``user_id``.
+
+    Single batched query — replaces the N+1 pattern of calling
+    ``get_tracker_name`` per failing tracker, and enforces ownership so
+    one user cannot see another user's failure list.
+    """
+    from ..database.models import NotificationTracker
+
+    snapshot = {tid: c for tid, c in _poll_failure_counts.items() if c > 0}
+    if not snapshot:
+        return []
+    engine = get_engine()
+    async with AsyncSession(engine) as session:
+        result = await session.exec(
+            select(NotificationTracker.id, NotificationTracker.name).where(
+                NotificationTracker.id.in_(list(snapshot.keys())),
+                NotificationTracker.user_id == user_id,
+            )
+        )
+        rows = list(result.all())
+    return [
+        {"id": int(tid), "name": name or f"tracker {tid}", "count": snapshot[int(tid)]}
+        for tid, name in rows
+    ]
+
+
+async def get_user_target_failures(user_id: int) -> list[dict[str, Any]]:
+    """Return ``[{"id", "name", "count"}]`` for targets owned by ``user_id``."""
+    from ..database.models import NotificationTarget
+
+    snapshot = {tid: c for tid, c in _target_failure_counts.items() if c > 0}
+    if not snapshot:
+        return []
+    engine = get_engine()
+    async with AsyncSession(engine) as session:
+        result = await session.exec(
+            select(NotificationTarget.id, NotificationTarget.name).where(
+                NotificationTarget.id.in_(list(snapshot.keys())),
+                NotificationTarget.user_id == user_id,
+            )
+        )
+        rows = list(result.all())
+    return [
+        {"id": int(tid), "name": name or f"target {tid}", "count": snapshot[int(tid)]}
+        for tid, name in rows
+    ]
+
+
+async def reset_user_counters(user_id: int) -> int:
+    """Clear all poll/target counters for trackers/targets owned by ``user_id``.
+
+    Returns the number of distinct (tracker + target) entries cleared.
+    Cross-user counters are left untouched — addresses the multi-tenant
+    safety hole where ``reset_counter("all")`` wiped every user's state.
+    """
+    polls = await get_user_poll_failures(user_id)
+    tgts = await get_user_target_failures(user_id)
+    cleared = 0
+    for entry in polls:
+        if _poll_failure_counts.pop(entry["id"], None) is not None:
+            cleared += 1
+        _poll_failure_last_error.pop(entry["id"], None)
+    for entry in tgts:
+        if _target_failure_counts.pop(entry["id"], None) is not None:
+            cleared += 1
+        _target_failure_last_error.pop(entry["id"], None)
+    return cleared
+
+
 # ---------------------------------------------------------------------------
 # Backlog scan
 # ---------------------------------------------------------------------------
@@ -263,3 +263,386 @@ def test_default_template_loader_returns_bridge_self_slots() -> None:
    # Sanity: each template references at least one of the bridge_self vars.
    for tpl in list(en.values()) + list(ru.values()):
        assert "{{" in tpl
+
+
+# ---------------------------------------------------------------------------
+# Bot commands — context builders
+#
+# These tests run against the real (temp-dir) DB via the FastAPI lifespan so
+# that ``services.bridge_self`` helpers using ``get_engine()`` resolve to the
+# same DB the test seeds rows into. We follow the pattern used by
+# test_webhook_status_handler.py — bootstrap once, seed under TestClient.
+# ---------------------------------------------------------------------------
+
+
+def _bootstrap_app():
+    """Bring up the app once so migrations run against the temp DB."""
+    from notify_bridge_server.main import app
+
+    return app
+
+
+async def _seed_user_and_provider(
+    *, username: str, config: dict[str, int],
+) -> tuple[int, int]:
+    """Create a fresh ``(user_id, provider_id)`` against the live engine.
+
+    Uses two short-lived sessions to avoid SQLAlchemy auto-expiring the
+    first-committed object once a second commit fires on the same session.
+    """
+    from notify_bridge_server.database.engine import get_engine
+    from notify_bridge_server.database.models import ServiceProvider, User
+
+    engine = get_engine()
+    async with AsyncSession(engine) as db:
+        user = User(
+            username=f"{username}_{datetime.now(timezone.utc).timestamp()}",
+            hashed_password="x", role="user",
+        )
+        db.add(user)
+        await db.commit()
+        await db.refresh(user)
+        user_id = int(user.id)
+    async with AsyncSession(engine) as db:
+        provider = ServiceProvider(
+            user_id=user_id, type="bridge_self", name="Bridge",
+            config=dict(config),
+        )
+        db.add(provider)
+        await db.commit()
+        await db.refresh(provider)
+        provider_id = int(provider.id)
+    return user_id, provider_id
+
+
+async def _load_provider(provider_id: int):
+    from notify_bridge_server.database.engine import get_engine
+    from notify_bridge_server.database.models import ServiceProvider
+
+    engine = get_engine()
+    async with AsyncSession(engine) as db:
+        return await db.get(ServiceProvider, provider_id)
+
+
+def test_command_status_returns_empty_lists_when_no_failures(tmp_data_dir) -> None:  # noqa: ARG001
+    import asyncio
+    from fastapi.testclient import TestClient
+
+    from notify_bridge_server.commands.bridge_self_handler import (
+        _build_status_context,
+    )
+    from notify_bridge_server.services import bridge_self as bs
+
+    app = _bootstrap_app()
+    with TestClient(app):
+        async def run() -> None:
+            _user_id, provider_id = await _seed_user_and_provider(
+                username="status_user",
+                config={
+                    "poll_failure_threshold": 3,
+                    "deferred_backlog_threshold": 100,
+                    "target_failure_threshold": 5,
+                },
+            )
+            provider = await _load_provider(provider_id)
+
+            # Make sure the in-memory dicts contain nothing.
+            bs._poll_failure_counts.clear()
+            bs._target_failure_counts.clear()
+
+            ctx = await _build_status_context(provider)
+
+            assert ctx["poll_failures"] == []
+            assert ctx["target_failures"] == []
+            assert ctx["deferred_pending"] == 0
+            assert ctx["deferred_threshold"] == 100
+
+        asyncio.run(run())
+
+
+def test_command_thresholds_returns_user_config(tmp_data_dir) -> None:  # noqa: ARG001
+    import asyncio
+    from fastapi.testclient import TestClient
+
+    from notify_bridge_server.commands.bridge_self_handler import (
+        _build_thresholds_context,
+    )
+
+    app = _bootstrap_app()
+    with TestClient(app):
+        async def run() -> None:
+            _user_id, provider_id = await _seed_user_and_provider(
+                username="thresholds_user",
+                config={
+                    "poll_failure_threshold": 7,
+                    "deferred_backlog_threshold": 250,
+                    "target_failure_threshold": 11,
+                },
+            )
+            provider = await _load_provider(provider_id)
+
+            ctx = await _build_thresholds_context(provider)
+
+            assert ctx == {
+                "poll_failure_threshold": 7,
+                "deferred_backlog_threshold": 250,
+                "target_failure_threshold": 11,
+            }
+
+        asyncio.run(run())
+
+
+def test_command_reset_clears_named_counter_and_is_idempotent(tmp_data_dir) -> None:  # noqa: ARG001
+    """``/reset`` clears the in-memory counter and is idempotent.
+
+    Now ownership-aware: we seed a real NotificationTracker owned by the
+    test user so ``find_tracker_owner`` returns a matching user_id and the
+    reset proceeds. The cross-user-rejection case is covered by
+    :func:`test_command_reset_rejects_cross_user_subject` below.
+    """
+    import asyncio
+    from fastapi.testclient import TestClient
+
+    from notify_bridge_server.commands.bridge_self_handler import (
+        _build_reset_context, _parse_reset_subject,
+    )
+    from notify_bridge_server.database.engine import get_engine
+    from notify_bridge_server.database.models import NotificationTracker
+    from notify_bridge_server.services import bridge_self as bs
+
+    app = _bootstrap_app()
+    with TestClient(app):
+        async def run() -> None:
+            user_id, provider_id = await _seed_user_and_provider(
+                username="reset_user",
+                config={
+                    "poll_failure_threshold": 3,
+                    "deferred_backlog_threshold": 100,
+                    "target_failure_threshold": 5,
+                },
+            )
+            provider = await _load_provider(provider_id)
+
+            # Seed an owned NotificationTracker so the ownership check
+            # in _build_reset_context can match it back to user_id.
+            engine = get_engine()
+            async with AsyncSession(engine) as db:
+                tracker = NotificationTracker(
+                    user_id=user_id, provider_id=provider_id,
+                    name="reset-test", enabled=True,
+                )
+                db.add(tracker)
+                await db.commit()
+                await db.refresh(tracker)
+                tid = int(tracker.id)
+
+            bs.reset_poll_counter(tid)
+            bs.record_poll_failure(tid, "boom")
+            bs.record_poll_failure(tid, "boom")
+            assert bs.get_poll_failure_count(tid) == 2
+
+            ctx = await _build_reset_context(f"tracker:{tid}", provider)
+            assert ctx["success"] is True
+            assert ctx["subject_type"] == "tracker"
+            assert ctx["subject_id"] == tid
+            assert ctx["previous_count"] == 2
+            assert ctx["error_message"] is None
+            assert bs.get_poll_failure_count(tid) == 0
+
+            # Idempotent — second call still succeeds with previous=0.
+            ctx2 = await _build_reset_context(f"tracker:{tid}", provider)
+            assert ctx2["success"] is True
+            assert ctx2["previous_count"] == 0
+
+            # Parse error → templated error, no exception.
+            bad_ctx = await _build_reset_context("not a subject", provider)
+            assert bad_ctx["success"] is False
+            assert bad_ctx["error_message"]
+
+        asyncio.run(run())
+
+    # Parser direct sanity-checks (pure function, no DB needed).
+    assert _parse_reset_subject("all") == ("all", None, None)
+    assert _parse_reset_subject("tracker:42") == ("tracker", 42, None)
+    assert _parse_reset_subject("target:7") == ("target", 7, None)
+    _, _, err = _parse_reset_subject("rocket:1")
+    assert err is not None
+
+
+def test_command_reset_rejects_cross_user_subject(tmp_data_dir) -> None:  # noqa: ARG001
+    """User A cannot reset a counter belonging to user B's tracker.
+
+    Regression guard for the multi-tenant data-leak hole the original
+    handler had — ``reset_counter`` was called without verifying the
+    subject's ``user_id`` matched ``provider.user_id``.
+    """
+    import asyncio
+    from fastapi.testclient import TestClient
+
+    from notify_bridge_server.commands.bridge_self_handler import _build_reset_context
+    from notify_bridge_server.database.engine import get_engine
+    from notify_bridge_server.database.models import NotificationTracker
+    from notify_bridge_server.services import bridge_self as bs
+
+    app = _bootstrap_app()
+    with TestClient(app):
+        async def run() -> None:
+            user_a_id, provider_a_id = await _seed_user_and_provider(
+                username="user_a", config={
+                    "poll_failure_threshold": 3,
+                    "deferred_backlog_threshold": 100,
+                    "target_failure_threshold": 5,
+                },
+            )
+            user_b_id, provider_b_id = await _seed_user_and_provider(
+                username="user_b", config={
+                    "poll_failure_threshold": 3,
+                    "deferred_backlog_threshold": 100,
+                    "target_failure_threshold": 5,
+                },
+            )
+            provider_a = await _load_provider(provider_a_id)
+
+            # Seed a tracker owned by user B and increment its counter.
+            engine = get_engine()
+            async with AsyncSession(engine) as db:
+                tracker_b = NotificationTracker(
+                    user_id=user_b_id, provider_id=provider_b_id,
+                    name="b-only", enabled=True,
+                )
+                db.add(tracker_b)
+                await db.commit()
+                await db.refresh(tracker_b)
+                tid_b = int(tracker_b.id)
+
+            bs.reset_poll_counter(tid_b)
+            bs.record_poll_failure(tid_b, "boom")
+            assert bs.get_poll_failure_count(tid_b) == 1
+
+            # User A tries to reset user B's tracker — must fail.
+            ctx = await _build_reset_context(f"tracker:{tid_b}", provider_a)
+            assert ctx["success"] is False
+            assert "not found" in (ctx["error_message"] or "").lower()
+            # Counter must remain untouched.
+            assert bs.get_poll_failure_count(tid_b) == 1
+
+        asyncio.run(run())
+
+
+def test_command_health_is_healthy_when_counters_zero(tmp_data_dir) -> None:  # noqa: ARG001
+    import asyncio
+    from fastapi.testclient import TestClient
+
+    from notify_bridge_server.commands.bridge_self_handler import (
+        _build_health_context,
+    )
+    from notify_bridge_server.services import bridge_self as bs
+
+    app = _bootstrap_app()
+    with TestClient(app):
+        async def run() -> None:
+            _user_id, provider_id = await _seed_user_and_provider(
+                username="health_user",
+                config={
+                    "poll_failure_threshold": 3,
+                    "deferred_backlog_threshold": 100,
+                    "target_failure_threshold": 5,
+                },
+            )
+            provider = await _load_provider(provider_id)
+
+            # Empty counters and no deferred rows for this user.
+            bs._poll_failure_counts.clear()
+            bs._target_failure_counts.clear()
+
+            ctx = await _build_health_context(provider)
+
+            assert ctx["healthy"] is True
+            assert ctx["failing_tracker_count"] == 0
+            assert ctx["failing_target_count"] == 0
+            assert ctx["deferred_pending"] == 0
+            assert "healthy" in ctx["summary"].lower()
+
+        asyncio.run(run())
+
+
+# ---------------------------------------------------------------------------
+# reset_counter direct unit test (covers the "all" branch)
+# ---------------------------------------------------------------------------
+
+
+def test_reset_counter_all_clears_every_dict() -> None:
+    from notify_bridge_server.services import bridge_self as bs
+
+    # Seed both dicts with a couple of entries.
+    bs._poll_failure_counts.clear()
+    bs._target_failure_counts.clear()
+    bs.record_poll_failure(9_401, "boom")
+    bs.record_poll_failure(9_402, "boom")
+    bs.record_target_failure(9_501, "503")
+
+    cleared = bs.reset_counter("all")
+
+    # 2 poll + 1 target = 3 entries cleared.
+    assert cleared == 3
+    assert bs._poll_failure_counts == {}
+    assert bs._target_failure_counts == {}
+
+
+def test_reset_counter_unknown_failure_type_is_noop() -> None:
+    from notify_bridge_server.services import bridge_self as bs
+
+    bs._poll_failure_counts.clear()
+    bs.record_poll_failure(9_601, "boom")
+
+    # Unknown type returns 0 and leaves dicts intact.
+    assert bs.reset_counter("rocket_failures", 9_601) == 0
+    assert bs.get_poll_failure_count(9_601) == 1
+
+
+# ---------------------------------------------------------------------------
+# Capability / handler registration
+# ---------------------------------------------------------------------------
+
+
+def test_capability_registry_lists_bridge_self_commands() -> None:
+    from notify_bridge_core.providers.capabilities import get_capabilities
+
+    caps = get_capabilities("bridge_self")
+    assert caps is not None
+
+    cmd_names = {c["name"] for c in caps.commands}
+    assert {"status", "thresholds", "reset", "health", "help"}.issubset(cmd_names)
+
+    slot_names = {s["name"] for s in caps.command_slots}
+    # Response slots
+    assert {"status", "thresholds", "reset", "health"}.issubset(slot_names)
+    # Description slots — needed so the menu sync registers a description
+    # for every operator-facing command.
+    assert {
+        "desc_status", "desc_thresholds", "desc_reset", "desc_health",
+    }.issubset(slot_names)
+
+
+def test_command_handler_registered_for_bridge_self() -> None:
+    """Auto-registration wires the bridge_self handler into dispatch."""
+    from notify_bridge_server.commands.dispatch import get_handler
+
+    handler = get_handler("bridge_self")
+    assert handler is not None
+    assert handler.provider_type == "bridge_self"
+    assert {"status", "thresholds", "reset", "health"} == handler.get_provider_commands()
+
+
+def test_default_command_template_loader_returns_bridge_self_slots() -> None:
+    """All shipped command-template slots load for both locales."""
+    from notify_bridge_core.templates.command_defaults.loader import (
+        load_default_command_templates,
+    )
+
+    en = load_default_command_templates("en", "bridge_self")
+    ru = load_default_command_templates("ru", "bridge_self")
+
+    required = {"status", "thresholds", "reset", "health", "help", "start"}
+    assert required.issubset(en.keys())
+    assert required.issubset(ru.keys())
				`@@ -0,0 +1 @@`
				`Reset a failure counter (tracker:<id>, target:<id>, or all)`
				`@@ -0,0 +1 @@`
				`⏳ Too many requests. Please wait {{ wait }}s before trying again.`
				`@@ -0,0 +1 @@`
				`Краткая однострочная сводка состояния`
				`@@ -0,0 +1 @@`
				`Показать доступные команды`
				`@@ -0,0 +1 @@`
				`Сбросить счётчик сбоев (tracker:<id>, target:<id> или all)`
				`@@ -0,0 +1 @@`
				`Показать счётчики состояния моста`
				`@@ -0,0 +1 @@`
				`Показать настроенные пороги оповещений`
				`@@ -0,0 +1 @@`
				`⏳ Слишком много запросов. Подождите {{ wait }}с и попробуйте снова.`
				`@@ -0,0 +1 @@`
				`/reset tracker:42 (или target:<id>, или all)`