feat: observability, per-receiver Telegram options, oversized-video fallback

Operability:
- Correlation IDs end-to-end: shared dispatch_id between log lines and EventLog rows (event/watcher/scheduled/deferred/action/HA/command paths) and a new X-Request-Id middleware that normalizes inbound ids and binds request_id into log context.
- dispatch_summary block merged into EventLog.details: per-target success/failure counts plus Telegram media delivered/skipped/failed and truncated error lists, so partial outcomes surface in the UI.
- Diagnostic mode: admin can flip one module to DEBUG for a bounded window with auto-revert (in-memory only; setup_logging() resets on boot, lifespan reverts on shutdown). New /diagnostic-mode endpoints plus DiagnosticsCassette UI on the settings page.

Telegram:
- Per-receiver options: disable_notification (silent send) and message_thread_id (forum-topic routing), wired through the dispatcher via a ContextVar so all four send sites (sendMessage / sendPhoto/Video/Document / sendMediaGroup / cache-hit POST) pick them up.
- send_large_videos_as_documents target setting: bypass the 50 MB sendVideo cap by falling back to sendDocument for oversized videos.
- sendMediaGroup byte-budget enforcement (TELEGRAM_MAX_GROUP_TOTAL_BYTES, 45 MB) with per-item fallback on chunk failure so a stale file_id no longer silently drops a cached asset.

Tests:
- New: diagnostic_mode, dispatch_summary, request_correlation, telegram_media_group_partial, telegram_per_send_options.

Docs:
- .claude/reviews/: six-axis production-readiness review of v0.8.1.
- .claude/docs/functional-review-2026-05-28.md: focused review of Telegram/Immich/logging subsystems.
2026-05-28 15:19:31 +03:00
parent 85a8f1e71c
commit 6a8f374678
39 changed files with 7239 additions and 142 deletions
@@ -14,6 +14,7 @@ Kept in ``notify_bridge_core`` so core modules (``TelegramClient``,

 from __future__ import annotations

+import uuid
 from contextlib import contextmanager
 from contextvars import ContextVar, Token
 from typing import Any, Iterator
@@ -56,6 +57,22 @@ def bind_log_context(**kwargs: Any) -> Iterator[None]:
            var.reset(tok)


+def ensure_dispatch_id() -> str:
+    """Return the bound ``dispatch_id`` if one is active, else a new one.
+
+    Format matches :class:`NotificationDispatcher.dispatch` (``disp:<12 hex>``)
+    so logs and ``EventLog.details.dispatch_id`` use a single shape. Callers
+    typically wrap a top-level handler with::
+
+        with bind_log_context(dispatch_id=ensure_dispatch_id()):
+            ...
+
+    so nested calls inherit the same id and any ``EventLog`` row written
+    inside the block can be correlated with the dispatcher's log lines.
+    """
+    return dispatch_id_var.get() or f"disp:{uuid.uuid4().hex[:12]}"
+
+
 def current_log_context() -> dict[str, Any]:
    """Return a snapshot of the currently-bound context values (non-None)."""
    snap: dict[str, Any] = {}
@@ -64,3 +81,43 @@ def current_log_context() -> dict[str, Any]:
        if val is not None:
            snap[key] = val
    return snap
+
+
+# Keys copied onto ``EventLog.details`` so an operator can grep stderr for
+# the matching ``disp=``/``req=`` log lines after spotting a row in the UI.
+# Kept narrow on purpose — ``chat_id``/``bot_id``/``command`` are already
+# represented by dedicated EventLog columns.
+_CORRELATION_KEYS = ("dispatch_id", "request_id")
+
+
+def enrich_details_with_correlation(
+    details: dict[str, Any] | None,
+) -> dict[str, Any]:
+    """Return a (shallow) copy of ``details`` with active correlation IDs merged in.
+
+    Use this when constructing an ``EventLog.details`` dict so the persisted
+    row carries the same ``dispatch_id`` / ``request_id`` that the stderr log
+    lines emitted during the same dispatch carry. The mapping makes it
+    possible to jump from a row in the dashboard to the corresponding log
+    lines without server-side correlation.
+
+    Existing keys in ``details`` are NOT overwritten — callers can pin a
+    specific value (e.g. a synthetic dispatch_id for a backfilled row) by
+    setting it themselves before calling.
+
+    The copy is shallow. Nested mutable values (lists, dicts) are shared with
+    the input — fine for the all-scalar dicts every current call site passes,
+    but callers that intend to mutate after this returns should ``deepcopy``
+    themselves.
+    """
+    result: dict[str, Any] = dict(details or {})
+    for key in _CORRELATION_KEYS:
+        if key in result:
+            continue
+        var = _VAR_MAP.get(key)
+        if var is None:
+            continue
+        val = var.get()
+        if val is not None:
+            result[key] = val
+    return result
@@ -5,13 +5,12 @@ from __future__ import annotations
 import asyncio
 import contextlib
 import logging
-import uuid
 from dataclasses import dataclass, field
 from typing import Any, AsyncIterator, Awaitable, Callable, Final

 import aiohttp

-from notify_bridge_core.log_context import bind_log_context, dispatch_id_var
+from notify_bridge_core.log_context import bind_log_context, ensure_dispatch_id
 from notify_bridge_core.models.events import ServiceEvent
 from notify_bridge_core.templates.context import build_template_context
 from notify_bridge_core.templates.renderer import render_template
@@ -132,7 +131,7 @@ class NotificationDispatcher:
        Returns one result per target. Per-target failures are isolated;
        a single bad target cannot poison the batch.
        """
-        new_id = dispatch_id_var.get() or f"disp:{uuid.uuid4().hex[:12]}"
+        new_id = ensure_dispatch_id()

        with bind_log_context(dispatch_id=new_id):
            _LOGGER.info(
@@ -341,6 +340,7 @@ class NotificationDispatcher:
        max_size_mb = target.config.get("max_asset_size")
        max_size_bytes = max_size_mb * 1024 * 1024 if max_size_mb else None
        send_large_as_docs = target.config.get("send_large_photos_as_documents", False)
+        send_large_videos_as_docs = target.config.get("send_large_videos_as_documents", False)

        if not bot_token:
            return {"success": False, "error": "Missing bot_token"}
@@ -392,6 +392,8 @@ class NotificationDispatcher:
                    chat_id=receiver.chat_id,
                    text=message,
                    disable_web_page_preview=bool(disable_preview),
+                    disable_notification=receiver.disable_notification,
+                    message_thread_id=receiver.message_thread_id,
                )
                if not text_result.get("success"):
                    _LOGGER.warning(
@@ -409,22 +411,45 @@ class NotificationDispatcher:
                        chunk_delay=chunk_delay,
                        max_asset_data_size=max_size_bytes,
                        send_large_photos_as_documents=send_large_as_docs,
+                        send_large_videos_as_documents=send_large_videos_as_docs,
                        chat_action=chat_action or None,
+                        disable_notification=receiver.disable_notification,
+                        message_thread_id=receiver.message_thread_id,
                    )
-                    if not media_result.get("success"):
+                    delivered = media_result.get("delivered_count", 0)
+                    skipped = media_result.get("skipped_count", 0)
+                    failed = media_result.get("failed_count", 0)
+                    media_success = media_result.get("success", False)
+                    has_partial_loss = skipped > 0 or failed > 0
+
+                    if not media_success:
                        _LOGGER.warning(
-                            "Text sent OK but media failed for chat %s: %s",
-                            receiver.chat_id, media_result.get("error"),
+                            "Text sent OK but media failed for chat %s "
+                            "(delivered=%d skipped=%d failed=%d): %s",
+                            receiver.chat_id, delivered, skipped, failed,
+                            media_result.get("error"),
                        )
+                    elif has_partial_loss:
+                        _LOGGER.warning(
+                            "Partial media delivery for chat %s "
+                            "(delivered=%d skipped=%d failed=%d)",
+                            receiver.chat_id, delivered, skipped, failed,
+                        )
+
+                    if not media_success or has_partial_loss:
                        # Preserve both outcomes — text succeeded, media
-                        # didn't. Operators losing media-failure detail
-                        # in the result dict made root-cause analysis
+                        # partially or fully didn't. Operators losing
+                        # media-failure detail made root-cause analysis
                        # impossible.
                        return {
                            "success": True,
                            "message_id": text_result.get("message_id"),
                            "media_error": media_result.get("error"),
                            "media_failed_at_chunk": media_result.get("failed_at_chunk"),
+                            "media_delivered_count": delivered,
+                            "media_skipped_count": skipped,
+                            "media_failed_count": failed,
+                            "media_errors": media_result.get("errors"),
                        }
                return text_result

@@ -20,9 +20,21 @@ class Receiver:

@dataclass
 class TelegramReceiver(Receiver):
-    """Telegram chat receiver."""
+    """Telegram chat receiver.
+
+    ``disable_notification`` toggles Telegram's ``disable_notification=true``
+    flag — the message is delivered without an audible / vibration alert.
+    Useful for low-priority chats that the user reads but doesn't want to
+    be paged by.
+
+    ``message_thread_id`` routes the send into a specific forum topic on a
+    supergroup with topics enabled. ``None`` means "general topic" (default
+    Telegram behaviour).
+    """

    chat_id: str = ""
+    disable_notification: bool = False
+    message_thread_id: int | None = None


@dataclass
@@ -80,9 +92,30 @@ def _coerce_int(value: Any, default: int) -> int:
        return default


+def _coerce_telegram_thread_id(value: Any) -> int | None:
+    """Coerce a config value to a positive Telegram forum-topic id.
+
+    The Bot API treats omission, ``0``, and negative values all as
+    "general topic", so we collapse them to ``None`` for consistency
+    with the frontend (which rejects ``<= 0``). Booleans are explicitly
+    rejected so ``int(True) == 1`` doesn't silently route a misconfigured
+    chat into topic #1.
+    """
+    if value is None or value == "" or isinstance(value, bool):
+        return None
+    try:
+        n = int(value)
+    except (TypeError, ValueError):
+        return None
+    return n if n > 0 else None
+
+
 _RECEIVER_FACTORIES: dict[str, _ReceiverFactory] = {
    "telegram": lambda locale, config: TelegramReceiver(
-        locale=locale, config=config, chat_id=str(config.get("chat_id", "")),
+        locale=locale, config=config,
+        chat_id=str(config.get("chat_id", "")),
+        disable_notification=bool(config.get("disable_notification", False)),
+        message_thread_id=_coerce_telegram_thread_id(config.get("message_thread_id")),
    ),
    "webhook": lambda locale, config: WebhookReceiver(
        locale=locale, config=config,
@@ -3,12 +3,14 @@
 from __future__ import annotations

 import asyncio
+import contextlib
 import json
 import logging
 import mimetypes
 import re
+from contextvars import ContextVar
 from dataclasses import dataclass, field
-from typing import Any, Callable, Final
+from typing import Any, Callable, Final, Iterator

 import aiohttp
 from aiohttp import FormData
@@ -19,6 +21,7 @@ from .cache import TelegramFileCache
 from .media import (
    TELEGRAM_API_BASE_URL,
    TELEGRAM_MAX_CAPTION_LENGTH,
+    TELEGRAM_MAX_GROUP_TOTAL_BYTES,
    TELEGRAM_MAX_PHOTO_SIZE,
    TELEGRAM_MAX_TEXT_LENGTH,
    TELEGRAM_MAX_VIDEO_SIZE,
@@ -27,7 +30,6 @@ from .media import (
    extract_asset_id_from_url,
    is_asset_cache_key,
    is_asset_id,
-    split_media_by_upload_size,
 )

 _LOGGER = logging.getLogger(__name__)
@@ -56,6 +58,68 @@ _UPLOAD_TIMEOUT: Final = aiohttp.ClientTimeout(total=120, connect=10)
 _DOWNLOAD_TIMEOUT: Final = aiohttp.ClientTimeout(total=120, connect=10)


+# ---------------------------------------------------------------------------
+# Per-send options (disable_notification, message_thread_id, …)
+# ---------------------------------------------------------------------------
+#
+# These are properties of a single send, not of the bot or the client, and
+# they fan out into the JSON / multipart payload at four different sites
+# (sendMessage, sendPhoto/Video/Document, sendMediaGroup, cache-hit POST).
+# ``send_message`` is a leaf call and writes its kwargs straight into the
+# JSON body. For the media paths, rather than threading the kwargs through
+# every internal helper, ``send_notification`` binds them on a ContextVar and
+# the payload builders read the var when constructing the request.
+# ContextVar propagation isolates concurrent ``asyncio.gather`` fan-outs in
+# the dispatcher (one task per receiver) — each task sees its caller's value.
+
+
+@dataclass(frozen=True)
+class _SendOptions:
+    """Per-send Telegram flags applied to every API call within one send.
+
+    ``disable_notification`` maps to Bot API ``disable_notification=true``
+    — the chat receives the message silently. ``message_thread_id`` routes
+    the message into a specific forum-topic on supergroups with topics
+    enabled; ``None`` means "general topic" (Bot API omits the field).
+    """
+
+    disable_notification: bool = False
+    message_thread_id: int | None = None
+
+
+_send_options_var: ContextVar[_SendOptions] = ContextVar(
+    "_tg_send_options", default=_SendOptions(),
+)
+
+
+@contextlib.contextmanager
+def _bind_send_options(opts: _SendOptions) -> Iterator[None]:
+    """Bind per-send options for the duration of the ``with`` block."""
+    token = _send_options_var.set(opts)
+    try:
+        yield
+    finally:
+        _send_options_var.reset(token)
+
+
+def _apply_send_opts_to_payload(payload: dict[str, Any]) -> None:
+    """Merge the active per-send options into a JSON request body."""
+    opts = _send_options_var.get()
+    if opts.disable_notification:
+        payload["disable_notification"] = True
+    if opts.message_thread_id is not None:
+        payload["message_thread_id"] = opts.message_thread_id
+
+
+def _apply_send_opts_to_form(form: FormData) -> None:
+    """Merge the active per-send options into a multipart form payload."""
+    opts = _send_options_var.get()
+    if opts.disable_notification:
+        form.add_field("disable_notification", "true")
+    if opts.message_thread_id is not None:
+        form.add_field("message_thread_id", str(opts.message_thread_id))
+
+
 def _extract_retry_after(result: dict[str, Any]) -> int | None:
    """Return the retry_after seconds from a Telegram error response.

@@ -135,10 +199,27 @@ class _MediaItem:
    keyed by position. Bundling these together prevents the
    ``media_json`` and ``cache_info`` lists from drifting out of
    alignment under future edits.
+
+    ``source_url`` and ``download_headers`` let the per-item fallback
+    re-download a cache-hit item if its ``file_id`` POST returns
+    transient errors — without them, a stale ``file_id`` would silently
+    lose a cached asset that the original single-item path would have
+    recovered.
    """
    media_json: dict[str, Any]
    cache_info: tuple[str, str, str | None, int] | None
    attachment: tuple[str, bytes, str, str] | None  # (name, data, filename, content_type)
+    source_url: str | None = None
+    download_headers: dict[str, str] | None = None
+
+    @property
+    def upload_bytes(self) -> int:
+        """Bytes this item contributes to a multipart sendMediaGroup payload.
+
+        Cached items (referenced by ``file_id``) contribute 0 since
+        Telegram serves them server-side without us re-uploading.
+        """
+        return len(self.attachment[1]) if self.attachment else 0


 def _truncate(text: str, limit: int, *, marker: str = "…") -> str:
@@ -302,6 +383,7 @@ class TelegramClient:
            payload["caption"] = _truncate(caption, TELEGRAM_MAX_CAPTION_LENGTH)
        if reply_to_message_id is not None:
            payload["reply_parameters"] = {"message_id": reply_to_message_id}
+        _apply_send_opts_to_payload(payload)
        try:
            async with self._session.post(
                self._api_url(kind.api_method), json=payload, timeout=_API_TIMEOUT,
@@ -351,6 +433,7 @@ class TelegramClient:
                f.add_field("caption", capped_caption)
            if reply_to_message_id is not None:
                f.add_field("reply_parameters", json.dumps({"message_id": reply_to_message_id}))
+            _apply_send_opts_to_form(f)
            return f

        for attempt in range(1, _TG_429_MAX_ATTEMPTS + 1):
@@ -415,18 +498,54 @@ class TelegramClient:
        chunk_delay: int = 0,
        max_asset_data_size: int | None = None,
        send_large_photos_as_documents: bool = False,
+        send_large_videos_as_documents: bool = False,
        chat_action: str | None = "typing",
+        *,
+        disable_notification: bool = False,
+        message_thread_id: int | None = None,
    ) -> NotificationResult:
        if not assets:
            return await self.send_message(
                chat_id, caption or "", reply_to_message_id,
                disable_web_page_preview, parse_mode,
+                disable_notification=disable_notification,
+                message_thread_id=message_thread_id,
            )

        keepalive: _KeepaliveHandle | None = None
        if chat_action:
            keepalive = self.start_chat_action_keepalive(chat_id, chat_action)

+        # Bind for the whole media-send fan-out — every internal helper
+        # (_send_photo / _send_video / _send_document / _send_media_group /
+        # _post_media_group / _send_from_cache / _upload_media) reads the
+        # current value when it constructs its request payload.
+        opts = _SendOptions(
+            disable_notification=disable_notification,
+            message_thread_id=message_thread_id,
+        )
+        with _bind_send_options(opts):
+            return await self._send_notification_body(
+                chat_id, assets, caption, reply_to_message_id, parse_mode,
+                max_group_size, chunk_delay, max_asset_data_size,
+                send_large_photos_as_documents, send_large_videos_as_documents,
+                keepalive,
+            )
+
+    async def _send_notification_body(
+        self,
+        chat_id: str,
+        assets: list[dict[str, Any]],
+        caption: str | None,
+        reply_to_message_id: int | None,
+        parse_mode: str,
+        max_group_size: int,
+        chunk_delay: int,
+        max_asset_data_size: int | None,
+        send_large_photos_as_documents: bool,
+        send_large_videos_as_documents: bool,
+        keepalive: _KeepaliveHandle | None,
+    ) -> NotificationResult:
        try:
            if len(assets) == 1 and assets[0].get("type") == "photo":
                return await self._send_photo(
@@ -443,6 +562,7 @@ class TelegramClient:
                    assets[0].get("content_type"), assets[0].get("cache_key"),
                    download_headers=assets[0].get("headers"),
                    preloaded_data=assets[0].get("data"),
+                    send_large_videos_as_documents=send_large_videos_as_documents,
                )
            if len(assets) == 1 and assets[0].get("type", "document") == "document":
                url = assets[0].get("url")
@@ -465,7 +585,7 @@ class TelegramClient:
            return await self._send_media_group(
                chat_id, assets, caption, reply_to_message_id, max_group_size,
                chunk_delay, parse_mode, max_asset_data_size,
-                send_large_photos_as_documents,
+                send_large_photos_as_documents, send_large_videos_as_documents,
            )
        finally:
            await self.stop_keepalive(keepalive)
@@ -477,6 +597,9 @@ class TelegramClient:
        reply_to_message_id: int | None = None,
        disable_web_page_preview: bool | None = None,
        parse_mode: str = "HTML",
+        *,
+        disable_notification: bool = False,
+        message_thread_id: int | None = None,
    ) -> NotificationResult:
        if not text:
            _LOGGER.warning("send_message called with empty text — using placeholder")
@@ -490,7 +613,19 @@ class TelegramClient:
            payload["reply_parameters"] = {"message_id": reply_to_message_id}
        if disable_web_page_preview:
            payload["link_preview_options"] = {"is_disabled": True}
+        # sendMessage is a leaf call — its kwargs go straight into the
+        # JSON body. The ContextVar pattern is reserved for the deeper
+        # media paths (``_upload_media`` / ``_post_media_group`` /
+        # ``_send_from_cache``) that can't easily plumb kwargs through.
+        if disable_notification:
+            payload["disable_notification"] = True
+        if message_thread_id is not None:
+            payload["message_thread_id"] = message_thread_id
+        return await self._post_send_message(payload)

+    async def _post_send_message(
+        self, payload: dict[str, Any],
+    ) -> NotificationResult:
        url = self._api_url("sendMessage")
        try:
            async with self._session.post(url, json=payload, timeout=_API_TIMEOUT) as response:
@@ -651,6 +786,7 @@ class TelegramClient:
        max_asset_data_size: int | None = None, content_type: str | None = None,
        cache_key: str | None = None, download_headers: dict[str, str] | None = None,
        preloaded_data: bytes | None = None,
+        send_large_videos_as_documents: bool = False,
    ) -> NotificationResult:
        if not url:
            return {"success": False, "error": "Missing 'url' for video"}
@@ -672,6 +808,18 @@ class TelegramClient:
        if max_asset_data_size is not None and len(data) > max_asset_data_size:
            return {"success": False, "error": "Video exceeds size limit", "skipped": True}
        if len(data) > TELEGRAM_MAX_VIDEO_SIZE:
+            # Telegram's sendVideo hard-caps at 50 MB. Documents accept
+            # up to 2 GB, so when the operator opts in we deliver the
+            # bytes as a document instead of silently dropping the asset.
+            # Loses inline playback but preserves delivery.
+            if send_large_videos_as_documents:
+                filename = url.split("/")[-1].split("?")[0] or "video.mp4"
+                if "." not in filename:
+                    filename = "video.mp4"
+                return await self._send_document(
+                    chat_id, data, filename, caption, reply_to_message_id,
+                    parse_mode, url, content_type, cache_key,
+                )
            return {
                "success": False,
                "error": f"Video exceeds Telegram's {TELEGRAM_MAX_VIDEO_SIZE // (1024*1024)} MB limit",
@@ -723,6 +871,7 @@ class TelegramClient:
        caption: str | None = None, reply_to_message_id: int | None = None,
        max_group_size: int = 10, chunk_delay: int = 0, parse_mode: str = "HTML",
        max_asset_data_size: int | None = None, send_large_photos_as_documents: bool = False,
+        send_large_videos_as_documents: bool = False,
    ) -> NotificationResult:
        # Telegram rejects mixed photo/video + document in a single
        # sendMediaGroup. Split before chunking so a malformed input
@@ -730,75 +879,293 @@ class TelegramClient:
        partitions = self._partition_media_by_kind(assets)

        all_message_ids: list[int] = []
-        first_chunk_overall = True
+        errors: list[dict[str, Any]] = []
+        delivered = 0
+        skipped = 0
+        failed = 0
+        first_send = True
+        # Oversized videos that the operator wants delivered as
+        # documents. Sent after all media-group chunks finish so
+        # they ride out on their own (Telegram refuses to mix
+        # documents with photo/video in one group).
+        deferred_documents: list[_MediaItem] = []
+        # Caption + reply_to are "spent" on the first send attempt,
+        # mirroring the prior contract. If that first attempt fails
+        # entirely, they're lost — same as before. Tracking these as
+        # standalone flags (rather than deriving from ``chunk_idx==0``)
+        # keeps the semantics right across multiple partitions.
+        caption_pending = bool(caption)
+        reply_pending = reply_to_message_id is not None
+
+        async def maybe_delay() -> None:
+            nonlocal first_send
+            if not first_send and chunk_delay > 0:
+                await asyncio.sleep(chunk_delay / 1000)
+            first_send = False
+
        for partition in partitions:
            chunks = [
                partition[i:i + max_group_size]
                for i in range(0, len(partition), max_group_size)
            ]
            for chunk_idx, chunk in enumerate(chunks):
-                if not first_chunk_overall and chunk_delay > 0:
-                    await asyncio.sleep(chunk_delay / 1000)
-
-                # Single-item chunk → use the simpler send_photo/video path.
-                if len(chunk) == 1:
-                    item = chunk[0]
-                    chunk_caption = caption if first_chunk_overall else None
-                    chunk_reply = reply_to_message_id if first_chunk_overall else None
-                    if item.get("type") == "photo":
-                        result = await self._send_photo(
-                            chat_id, item.get("url"), chunk_caption, chunk_reply, parse_mode,
-                            max_asset_data_size, send_large_photos_as_documents,
-                            item.get("content_type"), item.get("cache_key"),
-                            download_headers=item.get("headers"),
-                            preloaded_data=item.get("data"),
-                        )
-                    elif item.get("type") == "video":
-                        result = await self._send_video(
-                            chat_id, item.get("url"), chunk_caption, chunk_reply, parse_mode,
-                            max_asset_data_size,
-                            item.get("content_type"), item.get("cache_key"),
-                            download_headers=item.get("headers"),
-                            preloaded_data=item.get("data"),
-                        )
-                    else:
-                        first_chunk_overall = False
-                        continue
-                    first_chunk_overall = False
-                    if not result.get("success"):
-                        result["failed_at_chunk"] = chunk_idx + 1
-                        return result
-                    if result.get("message_id") is not None:
-                        all_message_ids.append(result["message_id"])
-                    continue
-
-                items = await self._build_media_items(
-                    chunk, max_asset_data_size, caption if first_chunk_overall else None,
-                    parse_mode,
+                # Fetch + filter the parent chunk. Skipped items
+                # (oversized, bad photo, failed download) never enter
+                # ``items`` — count them so the operator-facing result
+                # reflects what actually went out vs got dropped.
+                # Oversized videos opted into doc-fallback get
+                # deferred — they're delivered (eventually) so they
+                # don't count as skipped.
+                items, chunk_deferred = await self._build_media_items(
+                    chunk, max_asset_data_size, send_large_videos_as_documents,
                )
+                deferred_documents.extend(chunk_deferred)
+                skipped += len(chunk) - len(items) - len(chunk_deferred)
+
                if not items:
                    _LOGGER.warning(
-                        "sendMediaGroup skipped — chunk %d/%d had %d input items but 0 usable (all filtered/failed)",
+                        "sendMediaGroup: chunk %d/%d had %d input items but 0 usable",
                        chunk_idx + 1, len(chunks), len(chunk),
                    )
-                    first_chunk_overall = False
                    continue

-                chunk_msg_ids, chunk_err = await self._post_media_group(
-                    chat_id, items, reply_to_message_id if first_chunk_overall else None,
-                    chunk_idx, len(chunks),
+                # Split the chunk into sub-chunks that each fit under
+                # Telegram's per-request byte cap. Per-item filtering
+                # alone can't prevent 413s when several legal-sized
+                # items together bust the envelope.
+                sub_chunks = self._split_items_by_byte_budget(
+                    items, TELEGRAM_MAX_GROUP_TOTAL_BYTES,
                )
-                first_chunk_overall = False
-                if chunk_err is not None:
-                    return chunk_err
-                all_message_ids.extend(chunk_msg_ids)
+                if len(sub_chunks) > 1:
+                    _LOGGER.info(
+                        "sendMediaGroup: byte-budget split chunk %d/%d into %d sub-chunks",
+                        chunk_idx + 1, len(chunks), len(sub_chunks),
+                    )

-        if not all_message_ids:
-            _LOGGER.warning(
-                "sendMediaGroup completed with 0 message_ids — nothing was delivered",
+                for sub_items in sub_chunks:
+                    await maybe_delay()
+                    sub_caption = caption if caption_pending else None
+                    sub_reply = reply_to_message_id if reply_pending else None
+                    caption_pending = False
+                    reply_pending = False
+                    if sub_caption:
+                        self._attach_caption_to_first(
+                            sub_items, sub_caption, parse_mode,
+                        )
+
+                    msg_ids, err = await self._post_media_group(
+                        chat_id, sub_items, sub_reply, chunk_idx, len(chunks),
+                    )
+                    if err is None:
+                        all_message_ids.extend(msg_ids)
+                        delivered += len(sub_items)
+                        continue
+
+                    # Telegram rejected the sub-chunk after our
+                    # pre-flight passed (content / transient / rate).
+                    # Try each item as its own message so partial
+                    # delivery survives the chunk-level failure.
+                    # Record the chunk-level cause first so the
+                    # operator-visible ``errors`` list reads in
+                    # cause-then-consequence order.
+                    _LOGGER.warning(
+                        "sendMediaGroup chunk %d/%d failed (%s) — falling back to per-item",
+                        chunk_idx + 1, len(chunks), err.get("error"),
+                    )
+                    errors.append({
+                        "kind": "chunk",
+                        "chunk": chunk_idx + 1,
+                        "error": err.get("error", "unknown"),
+                        "code": err.get("error_code"),
+                    })
+                    for item_idx, item in enumerate(sub_items):
+                        item_caption = sub_caption if item_idx == 0 else None
+                        item_reply = sub_reply if item_idx == 0 else None
+                        # No ``maybe_delay()`` here: per-item retries
+                        # are a recovery path where added latency
+                        # only widens the outage window — the
+                        # individual sendPhoto/sendVideo calls have
+                        # their own 429 backoff in ``_upload_media``.
+                        item_result = await self._send_item_individually(
+                            chat_id, item, item_caption, item_reply, parse_mode,
+                        )
+                        if item_result.get("success"):
+                            delivered += 1
+                            mid = item_result.get("message_id")
+                            if mid is not None:
+                                all_message_ids.append(mid)
+                        else:
+                            failed += 1
+                            errors.append({
+                                "kind": "item",
+                                "chunk": chunk_idx + 1,
+                                "item_index": item_idx,
+                                "error": item_result.get("error", "unknown"),
+                            })
+
+        # Deferred oversized-videos-as-documents: send each on its own
+        # via sendDocument. They couldn't ride in the media group
+        # because Telegram refuses to mix document with photo/video,
+        # and per-item failures don't poison siblings.
+        for deferred in deferred_documents:
+            await maybe_delay()
+            d_caption = caption if caption_pending else None
+            d_reply = reply_to_message_id if reply_pending else None
+            caption_pending = False
+            reply_pending = False
+            d_result = await self._send_item_individually(
+                chat_id, deferred, d_caption, d_reply, parse_mode,
            )
-            return {"success": False, "error": "no_items_delivered"}
-        return {"success": True, "message_ids": all_message_ids}
+            if d_result.get("success"):
+                delivered += 1
+                mid = d_result.get("message_id")
+                if mid is not None:
+                    all_message_ids.append(mid)
+            else:
+                failed += 1
+                errors.append({
+                    "kind": "deferred_document",
+                    "error": d_result.get("error", "unknown"),
+                })
+
+        if delivered == 0:
+            if skipped > 0 and not errors:
+                msg = f"all {skipped} item(s) filtered before send"
+            elif errors:
+                msg = errors[0].get("error", "no_items_delivered")
+            else:
+                msg = "no_items_delivered"
+            _LOGGER.warning(
+                "sendMediaGroup delivered 0 items (skipped=%d failed=%d)",
+                skipped, failed,
+            )
+            return {
+                "success": False,
+                "error": msg,
+                "message_ids": [],
+                "delivered_count": 0,
+                "skipped_count": skipped,
+                "failed_count": failed,
+                "errors": errors or None,
+                "failed_at_chunk": errors[0].get("chunk") if errors else None,
+            }
+
+        return {
+            "success": True,
+            "message_ids": all_message_ids,
+            "delivered_count": delivered,
+            "skipped_count": skipped,
+            "failed_count": failed,
+            "errors": errors or None,
+        }
+
+    @staticmethod
+    def _split_items_by_byte_budget(
+        items: list[_MediaItem], max_bytes: int,
+    ) -> list[list[_MediaItem]]:
+        """Partition ``items`` into ordered sub-chunks bounded by ``max_bytes``.
+
+        Packing is greedy and order-preserving, so whichever item comes
+        first in ``items`` is also first in the first sub-chunk (this
+        keeps caption attachment deterministic). Each item is charged
+        its ``upload_bytes``; an item costing ``0`` (a cached one) adds
+        nothing to the running total. A bucket is only closed when it
+        is non-empty, which means an item that alone exceeds the budget
+        still gets a bucket of its own instead of being dropped —
+        Telegram then reports the precise error.
+
+        Returns an empty list for empty input.
+        """
+        if not items:
+            return []
+        packed: list[list[_MediaItem]] = []
+        bucket: list[_MediaItem] = []
+        bucket_bytes = 0
+        for entry in items:
+            weight = entry.upload_bytes
+            # Flush only a non-empty bucket that this entry would
+            # push past the budget; the entry then opens the next one.
+            if bucket and bucket_bytes + weight > max_bytes:
+                packed.append(bucket)
+                bucket, bucket_bytes = [], 0
+            bucket.append(entry)
+            bucket_bytes += weight
+        if bucket:
+            packed.append(bucket)
+        return packed
+
+    @staticmethod
+    def _attach_caption_to_first(
+        items: list[_MediaItem], caption: str, parse_mode: str,
+    ) -> None:
+        """Write the caption and its parse mode onto the leading item.
+
+        Telegram shows the caption of the first media-group item and
+        ignores the rest, so only ``items[0].media_json`` is touched.
+        The caption is cut to ``TELEGRAM_MAX_CAPTION_LENGTH`` first.
+        Calling this again just overwrites the previous values; an
+        empty ``items`` list is left untouched.
+        """
+        if items:
+            head_json = items[0].media_json
+            head_json["caption"] = _truncate(caption, TELEGRAM_MAX_CAPTION_LENGTH)
+            head_json["parse_mode"] = parse_mode
+
+    async def _send_item_individually(
+        self, chat_id: str, item: _MediaItem,
+        caption: str | None, reply_to_message_id: int | None,
+        parse_mode: str,
+    ) -> NotificationResult:
+        """Send one ``_MediaItem`` as a standalone sendPhoto/sendVideo/sendDocument.
+
+        Used as the per-item fallback when sendMediaGroup itself
+        rejects a sub-chunk after pre-flight passed, and for deferred
+        oversized videos that were retagged as documents. Reuses
+        already-fetched bytes for fresh items; for cache-hit items that
+        fail the file_id POST, re-downloads from ``source_url`` so a
+        stale ``file_id`` doesn't silently lose an asset — the original
+        single-item path does the same recovery.
+
+        Args:
+            chat_id: Destination chat.
+            item: The media item; ``media_json["type"]`` selects the
+                send kind (missing/empty type is treated as photo, any
+                type other than photo/video is sent as a document).
+            caption: Caption for this message, or ``None``.
+            reply_to_message_id: Message to reply to, or ``None``.
+            parse_mode: Parse mode forwarded with the caption.
+
+        Returns:
+            The result of the underlying send. When a cached item has
+            no ``source_url`` or the re-download fails, a
+            ``{"success": False, "error": ...}`` result is returned
+            without attempting an upload.
+        """
+        # Map the media-group item type onto the single-send kind.
+        media_type = item.media_json.get("type") or "photo"
+        if media_type == "photo":
+            kind = _PHOTO_KIND
+        elif media_type == "video":
+            kind = _VIDEO_KIND
+        else:
+            kind = _DOCUMENT_KIND
+
+        # ``cache_info`` is ``(cache_key, type, thumbhash, size)``; only
+        # the key and thumbhash are needed here. They are handed to
+        # ``_upload_media`` together with the cache resolved for the key.
+        cache: TelegramFileCache | None = None
+        cache_key: str | None = None
+        thumbhash: str | None = None
+        if item.cache_info is not None:
+            ck, _ck_type, ck_thumb, _ck_size = item.cache_info
+            cache = self._get_cache_for_key(ck)
+            cache_key = ck
+            thumbhash = ck_thumb
+
+        # Cached items have no attachment bytes — POST the file_id
+        # reference first; if that fails transiently, re-download via
+        # source_url and upload fresh. This matches what _send_photo /
+        # _send_video do for their cache path.
+        if item.attachment is None:
+            file_id = item.media_json.get("media", "")
+            # An ``attach://`` reference is not a file_id, so it skips
+            # the cached send and goes straight to the re-download.
+            if file_id and not file_id.startswith("attach://"):
+                cached_result = await self._send_from_cache(
+                    kind, chat_id, file_id, caption, reply_to_message_id, parse_mode,
+                )
+                # NOTE(review): a ``None`` return is treated as "fall
+                # through to re-download"; any non-None result (success
+                # or failure) is returned as-is — confirm against
+                # ``_send_from_cache``.
+                if cached_result is not None:
+                    return cached_result
+
+            if not item.source_url:
+                return {"success": False, "error": "Cached fallback send failed (no source URL)"}
+            data, err = await self._safe_get(
+                self._resolve_url(item.source_url), item.download_headers,
+            )
+            if data is None:
+                return {"success": False, "error": f"Re-download failed: {err}"}
+            # The original filename/content type are not known for a
+            # cached item, so the kind's defaults are used.
+            return await self._upload_media(
+                kind, chat_id, data,
+                kind.default_filename, kind.default_content_type,
+                caption, reply_to_message_id, parse_mode,
+                cache, cache_key, thumbhash,
+            )
+
+        # Fresh item: ``attachment`` is ``(attach_name, data, filename,
+        # content_type)``; the attach name only matters inside a media
+        # group, so it is discarded for the standalone upload.
+        _, data, filename, content_type = item.attachment
+        return await self._upload_media(
+            kind, chat_id, data, filename, content_type,
+            caption, reply_to_message_id, parse_mode,
+            cache, cache_key, thumbhash,
+        )

    @staticmethod
    def _partition_media_by_kind(
@@ -830,23 +1197,40 @@ class TelegramClient:
        self,
        chunk: list[dict[str, Any]],
        max_asset_data_size: int | None,
-        first_caption: str | None,
-        parse_mode: str,
-    ) -> list[_MediaItem]:
+        send_large_videos_as_documents: bool = False,
+    ) -> tuple[list[_MediaItem], list[_MediaItem]]:
        """Fetch + filter a chunk and return aligned media-group items.

+        Returns ``(items, deferred_documents)`` — ``items`` go into
+        sendMediaGroup, ``deferred_documents`` are oversized videos
+        retagged as documents (when the caller opted in) that will be
+        sent individually via ``_send_item_individually`` *after* the
+        group sends. Telegram rejects mixing documents with photo/video
+        in one group, so they have to ride out separately.
+
        Concurrency is bounded by ``_MEDIA_FETCH_CONCURRENCY`` so peak
        memory stays predictable. Per-fetch exceptions are isolated via
        ``return_exceptions=True`` so a single failed download cannot
        cancel its peers.
+
+        Caption injection is intentionally NOT performed here — callers
+        attach the caption after byte-budget sub-splitting so it lands
+        on the first item of the first delivered sub-chunk.
        """
        sem = asyncio.Semaphore(_MEDIA_FETCH_CONCURRENCY)

-        async def fetch(idx: int, item: dict[str, Any]) -> tuple[int, dict | None, bytes | None]:
+        async def fetch(
+            idx: int, item: dict[str, Any],
+        ) -> tuple[int, dict | None, bytes | None, bool]:
+            """Returns ``(idx, cached_entry, data, defer_as_document)``.
+
+            ``defer_as_document=True`` signals "video bytes valid but
+            too big for sendVideo — caller should send as document".
+            """
            url = item.get("url")
            if not url:
                _LOGGER.warning("Media skipped: missing url (idx=%d type=%s)", idx, item.get("type"))
-                return idx, None, None
+                return idx, None, None, False
            media_type = item.get("type", "photo")
            custom_cache_key = item.get("cache_key")

@@ -860,7 +1244,7 @@ class TelegramClient:
            )
            cached = item_cache.get(ck, thumbhash=item_thumbhash) if item_cache else None
            if cached and cached.get("file_id"):
-                return idx, cached, None
+                return idx, cached, None, False

            preloaded = item.get("data")
            data: bytes | None
@@ -874,34 +1258,40 @@ class TelegramClient:
                        "Media skipped: download failed (idx=%d type=%s): %s",
                        idx, media_type, err,
                    )
-                    return idx, None, None
+                    return idx, None, None, False

            if max_asset_data_size and len(data) > max_asset_data_size:
                _LOGGER.warning(
                    "Media skipped: size %d exceeds max_asset_data_size %d (idx=%d type=%s)",
                    len(data), max_asset_data_size, idx, media_type,
                )
-                return idx, None, None
+                return idx, None, None, False
            if media_type == "video" and len(data) > TELEGRAM_MAX_VIDEO_SIZE:
+                if send_large_videos_as_documents:
+                    _LOGGER.info(
+                        "Video %d bytes over Telegram limit (idx=%d) — deferring as document",
+                        len(data), idx,
+                    )
+                    return idx, None, data, True
                _LOGGER.warning(
                    "Media skipped: video %d bytes exceeds Telegram limit %d (idx=%d)",
                    len(data), TELEGRAM_MAX_VIDEO_SIZE, idx,
                )
-                return idx, None, None
+                return idx, None, None, False
            if media_type == "photo":
                exceeds, reason, _, _ = check_photo_limits(data)
                if exceeds:
                    _LOGGER.warning(
                        "Media skipped: photo %s (idx=%d)", reason, idx,
                    )
-                    return idx, None, None
-            return idx, None, data
+                    return idx, None, None, False
+            return idx, None, data, False

        raw = await asyncio.gather(
            *(fetch(i, item) for i, item in enumerate(chunk)),
            return_exceptions=True,
        )
-        results: list[tuple[int, dict | None, bytes | None]] = []
+        results: list[tuple[int, dict | None, bytes | None, bool]] = []
        for entry in raw:
            if isinstance(entry, Exception):
                _LOGGER.warning("Media fetch raised: %s", redact_exc(entry))
@@ -909,8 +1299,9 @@ class TelegramClient:
            results.append(entry)

        items: list[_MediaItem] = []
+        deferred_documents: list[_MediaItem] = []
        upload_idx = 0
-        for idx, cached_entry, data in results:
+        for idx, cached_entry, data, defer_as_document in results:
            item = chunk[idx]
            url = item.get("url")
            if not url:
@@ -918,6 +1309,35 @@ class TelegramClient:
            media_type = item.get("type") or "photo"
            custom_cache_key = item.get("cache_key")

+            # Deferred videos-as-documents are NEVER cache hits (the
+            # cache lookup branch returns early before the size check),
+            # so we always have fresh bytes here. Retag the
+            # media_json so ``_send_item_individually`` routes via
+            # ``_DOCUMENT_KIND`` to /sendDocument.
+            if defer_as_document and data is not None:
+                ct = item.get("content_type") or "video/mp4"
+                # Best-effort filename preserves the original
+                # extension so Telegram clients give it a sensible
+                # icon and the recipient can re-open it.
+                fname = url.split("/")[-1].split("?")[0] or "video.mp4"
+                if "." not in fname:
+                    fname = "video.mp4"
+                ck = custom_cache_key or extract_asset_id_from_url(url) or url
+                ck_is_asset = is_asset_cache_key(ck)
+                bare_ck = asset_id_from_cache_key(ck) if ck_is_asset else ck
+                th = (
+                    self._thumbhash_resolver(bare_ck)
+                    if ck_is_asset and self._thumbhash_resolver else None
+                )
+                deferred_documents.append(_MediaItem(
+                    media_json={"type": "document", "media": "attach://deferred"},
+                    cache_info=(ck, "document", th, len(data)),
+                    attachment=("deferred", data, fname, ct),
+                    source_url=url,
+                    download_headers=item.get("headers"),
+                ))
+                continue
+
            if cached_entry and cached_entry.get("file_id"):
                mij: dict[str, Any] = {"type": media_type, "media": cached_entry["file_id"]}
                cache_info: tuple[str, str, str | None, int] | None = None
@@ -940,14 +1360,14 @@ class TelegramClient:
            else:
                continue

-            if first_caption and not items:
-                # Only the first usable item in the first chunk receives
-                # the caption, per Telegram's media-group semantics.
-                mij["caption"] = _truncate(first_caption, TELEGRAM_MAX_CAPTION_LENGTH)
-                mij["parse_mode"] = parse_mode
-
-            items.append(_MediaItem(media_json=mij, cache_info=cache_info, attachment=attachment))
-        return items
+            items.append(_MediaItem(
+                media_json=mij,
+                cache_info=cache_info,
+                attachment=attachment,
+                source_url=url,
+                download_headers=item.get("headers"),
+            ))
+        return items, deferred_documents

    async def _post_media_group(
        self,
@@ -973,6 +1393,7 @@ class TelegramClient:
            for name, payload, filename, ct in attachments:
                f.add_field(name, payload, filename=filename, content_type=ct)
            f.add_field("media", json.dumps(media_json))
+            _apply_send_opts_to_form(f)
            return f

        for attempt in range(1, _TG_429_MAX_ATTEMPTS + 1):
@@ -13,6 +13,11 @@ _LOGGER = logging.getLogger(__name__)
 TELEGRAM_API_BASE_URL: Final = "https://api.telegram.org/bot"
 TELEGRAM_MAX_PHOTO_SIZE: Final = 10 * 1024 * 1024  # 10 MB
 TELEGRAM_MAX_VIDEO_SIZE: Final = 50 * 1024 * 1024  # 50 MB
+# Telegram's sendMediaGroup request tops out near 50 MB total (multipart
+# bytes including form overhead). 45 MB keeps a safety margin so we don't
+# get HTTP 413 responses when items that each pass the per-item limit
+# would, summed, exceed Telegram's request cap.
+TELEGRAM_MAX_GROUP_TOTAL_BYTES: Final = 45 * 1024 * 1024  # 45 MB
 TELEGRAM_MAX_DIMENSION_SUM: Final = 10000
 # Telegram message-text limit (sendMessage) and caption limit
 # (sendPhoto/sendVideo/sendDocument/first item of sendMediaGroup).
@@ -126,36 +131,6 @@ def build_telegram_asset_entry(
    return entry


-def split_media_by_upload_size(
-    media_items: list[tuple], max_upload_size: int
-) -> list[list[tuple]]:
-    """Split media items into sub-groups respecting upload size limit."""
-    if not media_items:
-        return []
-
-    groups: list[list[tuple]] = []
-    current_group: list[tuple] = []
-    current_size = 0
-
-    for item in media_items:
-        media_ref = item[1]
-        is_cached = item[4]
-        item_size = 0 if is_cached else (len(media_ref) if isinstance(media_ref, bytes) else 0)
-
-        if current_group and current_size + item_size > max_upload_size:
-            groups.append(current_group)
-            current_group = []
-            current_size = 0
-
-        current_group.append(item)
-        current_size += item_size
-
-    if current_group:
-        groups.append(current_group)
-
-    return groups
-
-
 def check_photo_limits(
    data: bytes,
 ) -> tuple[bool, str | None, int | None, int | None]: