"""Watcher service — orchestrates poll -> detect -> notify flow.""" from __future__ import annotations import asyncio import logging from typing import Any, Awaitable, Callable from sqlmodel import select from sqlmodel.ext.asyncio.session import AsyncSession from notify_bridge_core.log_context import ( bind_log_context, ensure_dispatch_id, enrich_details_with_correlation, ) from notify_bridge_core.models.events import ServiceEvent from notify_bridge_core.notifications.dispatcher import NotificationDispatcher, TargetConfig from notify_bridge_core.notifications.telegram.cache import TelegramFileCache from notify_bridge_core.providers.capabilities import get_capabilities from notify_bridge_core.storage import JsonFileBackend from ..database.engine import get_engine from ..database.models import ( EventLog, NotificationTracker, NotificationTrackerState, ServiceProvider, ) from .dispatch_helpers import ( GateReason, apply_tracking_display_filters, evaluate_event_gate, get_app_timezone, load_link_data, resolve_provider_credential, ) from .dispatch_summary import record_dispatch_summary_async _LOGGER = logging.getLogger(__name__) # Module-level Telegram file caches — shared across dispatches for reuse _url_cache: TelegramFileCache | None = None _asset_cache: TelegramFileCache | None = None # Lazy init: creating ``asyncio.Lock()`` at module import time binds the # lock to whichever event loop is current at import (often none / the wrong # one when tests fire up dedicated loops). Defer until first use. _cache_lock: asyncio.Lock | None = None def _get_cache_lock() -> asyncio.Lock: """Return the module cache lock, creating it on first call.""" global _cache_lock if _cache_lock is None: _cache_lock = asyncio.Lock() return _cache_lock async def _load_cache_settings() -> tuple[int, int]: """Return (url_ttl_seconds, asset_max_entries) from app settings. Defaults apply when the settings rows are missing. Reads in a short-lived session to avoid coupling to the caller's transaction. """ from ..api.app_settings import get_setting async with AsyncSession(get_engine()) as session: ttl_hours_str = await get_setting(session, "telegram_cache_ttl_hours") max_entries_str = await get_setting(session, "telegram_asset_cache_max_entries") try: ttl_hours = int(ttl_hours_str) if ttl_hours_str else 720 except ValueError: ttl_hours = 720 try: max_entries = int(max_entries_str) if max_entries_str else 5000 except ValueError: max_entries = 5000 return ttl_hours * 3600, max_entries async def _get_telegram_caches() -> tuple[TelegramFileCache | None, TelegramFileCache | None]: """Lazily initialize shared Telegram file caches using NOTIFY_BRIDGE_DATA_DIR. The URL cache runs in TTL mode (URLs aren't content-addressable); the asset cache runs in thumbhash mode so entries invalidate on visual change rather than age. Both honor an LRU size cap from settings. """ global _url_cache, _asset_cache if _url_cache is not None: return _url_cache, _asset_cache async with _get_cache_lock(): # Double-check after acquiring lock if _url_cache is not None: return _url_cache, _asset_cache import os from pathlib import Path data_dir = os.environ.get("NOTIFY_BRIDGE_DATA_DIR") if not data_dir: return None, None cache_dir = Path(data_dir) / "cache" ttl_seconds, max_entries = await _load_cache_settings() url_cache = TelegramFileCache( JsonFileBackend(cache_dir / "telegram_url_cache.json"), ttl_seconds=ttl_seconds, max_entries=max_entries, ) asset_cache = TelegramFileCache( JsonFileBackend(cache_dir / "telegram_asset_cache.json"), use_thumbhash=True, max_entries=max_entries, ) await url_cache.async_load() await asset_cache.async_load() _url_cache = url_cache _asset_cache = asset_cache _LOGGER.info( "Initialized Telegram caches in %s (url ttl=%ds, max_entries=%d, asset thumbhash mode)", cache_dir, ttl_seconds, max_entries, ) return _url_cache, _asset_cache async def reset_telegram_caches_in_memory() -> None: """Drop in-memory cache refs without touching files on disk. Used after settings changes so the next dispatch re-initializes caches with fresh parameters. Contrast with ``clear_telegram_caches`` which also deletes cached file_ids. """ global _url_cache, _asset_cache async with _get_cache_lock(): _url_cache = None _asset_cache = None _LOGGER.info("Reset Telegram cache refs in memory (files preserved)") async def get_telegram_cache_stats() -> dict[str, Any]: """Return stats for the URL and asset Telegram caches. Loads caches lazily if they haven't been touched by a dispatch yet. Returns zero-counts when ``NOTIFY_BRIDGE_DATA_DIR`` is not configured. """ url_cache, asset_cache = await _get_telegram_caches() empty = {"count": 0, "total_size_bytes": 0, "oldest": None, "newest": None} return { "url": url_cache.stats() if url_cache else empty, "asset": asset_cache.stats() if asset_cache else empty, } async def clear_telegram_caches() -> dict[str, Any]: """Delete both Telegram file caches from disk and reset in-memory state. Next dispatch re-initializes the caches via `_get_telegram_caches()`. Returns a summary with the paths that were removed. """ global _url_cache, _asset_cache async with _get_cache_lock(): removed: list[str] = [] for cache, label in ((_url_cache, "url"), (_asset_cache, "asset")): if cache is not None: await cache.async_remove() removed.append(label) # Also remove files from disk in case caches were never initialized # in this process (data_dir set but dispatch never ran). import os from pathlib import Path data_dir = os.environ.get("NOTIFY_BRIDGE_DATA_DIR") if data_dir: cache_dir = Path(data_dir) / "cache" for name in ("telegram_url_cache.json", "telegram_asset_cache.json"): path = cache_dir / name if path.exists(): try: path.unlink() except OSError as e: _LOGGER.warning("Failed to remove %s: %s", path, e) _url_cache = None _asset_cache = None _LOGGER.info("Cleared Telegram file caches: %s", removed or "none in memory") return {"cleared": True, "removed": removed} # --------------------------------------------------------------------------- # Provider polling registry # --------------------------------------------------------------------------- # # Each registered factory returns (events, new_state). Replaces the long # ``if provider_type == ...`` chain in ``check_tracker``. New pollable # providers register here; webhook-only providers are short-circuited above # via ``capabilities.webhook_based``. class _PollerConnectError(Exception): """Raised by a poller factory when initial provider connection fails.""" def __init__(self, reason: str) -> None: super().__init__(reason) self.reason = reason PollResult = tuple[list[ServiceEvent], dict[str, Any]] PollerFactory = Callable[..., Awaitable[PollResult]] async def _poll_immich(*, provider_config, provider_name, collection_ids, state_dict, **_kw) -> PollResult: from notify_bridge_core.providers.immich import ImmichServiceProvider from .http_session import get_http_session http_session = await get_http_session() immich = ImmichServiceProvider( http_session, provider_config.get("url", ""), provider_config.get("api_key", ""), provider_config.get("external_domain"), provider_name, ) if not await immich.connect(): raise _PollerConnectError("failed to connect to provider") return await immich.poll(collection_ids, state_dict) async def _poll_scheduler(*, provider_name, tracker_name, tracker_filters, collection_ids, state_dict, app_tz, **_kw) -> PollResult: from notify_bridge_core.providers.scheduler import SchedulerServiceProvider sched = SchedulerServiceProvider( name=provider_name, tracker_name=tracker_name, custom_variables=tracker_filters.get("custom_variables", {}), timezone_name=app_tz, ) return await sched.poll(collection_ids, state_dict) async def _poll_nut(*, provider_config, provider_name, collection_ids, state_dict, **_kw) -> PollResult: from notify_bridge_core.providers.nut import NutServiceProvider nut = NutServiceProvider( host=provider_config.get("host", "localhost"), port=provider_config.get("port", 3493), username=provider_config.get("username"), password=provider_config.get("password"), name=provider_name, ) return await nut.poll(collection_ids, state_dict) async def _poll_google_photos(*, provider_config, provider_name, collection_ids, state_dict, **_kw) -> PollResult: from notify_bridge_core.providers.google_photos import GooglePhotosServiceProvider from .http_session import get_http_session http_session = await get_http_session() gp = GooglePhotosServiceProvider( http_session, provider_config.get("client_id", ""), provider_config.get("client_secret", ""), provider_config.get("refresh_token", ""), provider_name, ) if not await gp.connect(): raise _PollerConnectError("failed to connect to Google Photos") return await gp.poll(collection_ids, state_dict) _POLL_FACTORIES: dict[str, PollerFactory] = { "immich": _poll_immich, "scheduler": _poll_scheduler, "nut": _poll_nut, "google_photos": _poll_google_photos, } async def check_tracker(tracker_id: int) -> dict[str, Any]: """Poll a tracker's provider for changes and dispatch notifications.""" # Bind a per-tick dispatch_id so the EventLog row written for each detected # change carries the same correlation id as the dispatcher's log lines. with bind_log_context(dispatch_id=ensure_dispatch_id()): return await _check_tracker_impl(tracker_id) async def _check_tracker_impl(tracker_id: int) -> dict[str, Any]: engine = get_engine() # Load all DB data eagerly before entering aiohttp context async with AsyncSession(engine) as session: tracker = await session.get(NotificationTracker, tracker_id) if not tracker or not tracker.enabled: return {"status": "skipped", "reason": "disabled or not found"} provider = await session.get(ServiceProvider, tracker.provider_id) if not provider: return {"status": "error", "reason": "provider not found"} # Load tracker state result = await session.exec( select(NotificationTrackerState).where(NotificationTrackerState.tracker_id == tracker_id) ) states = result.all() state_dict: dict[str, Any] = {} for s in states: state_dict[s.collection_id] = { "name": s.collection_name or "", "asset_ids": s.asset_ids, "pending_asset_ids": s.pending_asset_ids, "shared": bool(s.shared), "meta_fingerprint": s.meta_fingerprint or {}, } # Snapshot the original fingerprint per collection so we can skip the # (expensive) asset_ids rewrite when nothing changed. For a 200k-asset # album this avoids a ~7 MB JSON write to the state row every tick. original_fingerprints: dict[str, dict[str, Any]] = { cid: dict(cstate.get("meta_fingerprint") or {}) for cid, cstate in state_dict.items() } # Load tracker-target links link_data = await load_link_data(session, tracker_id) # Load app-level timezone for quiet-hours evaluation. app_tz = await get_app_timezone(session) # Snapshot the data we need. These reads happen INSIDE the open # session so we get fresh attribute values; once the block exits, the # ORM instances become detached and any unfetched attribute access # would raise. Pulling primitives here is the deliberate isolation # boundary between the DB phase and the network phase. provider_type = provider.type provider_config = dict(provider.config) provider_name = provider.name tracker_name = tracker.name tracker_user_id = tracker.user_id tracker_filters = dict(tracker.filters) if tracker.filters else {} collection_ids = list(tracker.collection_ids or []) # Now create aiohttp session and poll events: list[ServiceEvent] = [] new_state: dict[str, Any] = {} # Webhook-only providers: capabilities.webhook_based short-circuits the # poll path. Inbound events arrive via the /api/webhooks/* endpoints. caps = get_capabilities(provider_type) if caps is not None and caps.webhook_based: return {"status": "ok", "events_detected": 0, "collections_checked": 0} poller = _POLL_FACTORIES.get(provider_type) if poller is None: return {"status": "error", "reason": f"unsupported provider type: {provider_type}"} try: events, new_state = await poller( provider_config=provider_config, provider_name=provider_name, tracker_name=tracker_name, tracker_filters=tracker_filters, collection_ids=collection_ids, state_dict=state_dict, app_tz=app_tz, ) except _PollerConnectError as exc: # Track consecutive poll failures so the bridge_self provider can # alert when a tracker stops responding. The emission is async # but cheap; we await it inline so its DB writes happen before # check_tracker returns to the scheduler. from .bridge_self import maybe_emit_poll_failure try: await maybe_emit_poll_failure( tracker_id=tracker_id, tracker_name=tracker_name, error=exc.reason, ) except Exception: # noqa: BLE001 _LOGGER.exception("bridge_self poll-failure emission failed") return {"status": "error", "reason": exc.reason} except Exception as exc: # noqa: BLE001 # Catch broader poll exceptions (e.g. a provider-side bug, transient # network error inside the poller after connect) so the same # streak-tracking logic applies. Re-raised after the bookkeeping so # the existing error path keeps logging at the caller. from .bridge_self import maybe_emit_poll_failure try: await maybe_emit_poll_failure( tracker_id=tracker_id, tracker_name=tracker_name, error=str(exc), ) except Exception: # noqa: BLE001 _LOGGER.exception("bridge_self poll-failure emission failed") raise # Successful poll — clear the consecutive-failure counter for this tracker. from .bridge_self import record_poll_success record_poll_success(tracker_id) # Save updated state and log events async with AsyncSession(engine) as session: for cid, cstate in new_state.items(): existing = None for s in states: if s.collection_id == cid: existing = s break current_fingerprint = dict(cstate.get("meta_fingerprint") or {}) prior_fingerprint = original_fingerprints.get(cid, {}) # Skip the DB update when the provider reported no meaningful # change. ``existing`` is None on first-ever fetch for a # collection — that path always writes so the row gets created. if existing is not None and current_fingerprint == prior_fingerprint: continue if existing: existing.asset_ids = cstate.get("asset_ids", []) existing.pending_asset_ids = cstate.get("pending_asset_ids", []) existing.collection_name = cstate.get("name", "") existing.shared = cstate.get("shared", False) existing.meta_fingerprint = current_fingerprint session.add(existing) else: new_ts = NotificationTrackerState( tracker_id=tracker_id, collection_id=cid, collection_name=cstate.get("name", ""), shared=cstate.get("shared", False), asset_ids=cstate.get("asset_ids", []), pending_asset_ids=cstate.get("pending_asset_ids", []), meta_fingerprint=current_fingerprint, ) session.add(new_ts) # Capture the event_log row id alongside each event so the dispatch # loop below can stamp a "dispatch_status=deferred" pointer onto the # row if quiet hours suppresses it. event_log_id_by_event: dict[int, int] = {} for event in events: # Skip persistence for events the dispatch loop will filter # anyway (assets_added with 0 added, assets_removed with 0 # removed). Without this we wrote a "noise" row for every # tracker tick that detected nothing. The dispatch-time filter # below still runs as a safety net. etype = event.event_type.value if etype == "assets_added" and event.added_count == 0: continue if etype == "assets_removed" and event.removed_count == 0: continue assets_count = event.added_count or event.removed_count or 0 details: dict[str, Any] = { "added_count": event.added_count, "removed_count": event.removed_count, "provider_type": event.provider_type.value, } # Scheduler/periodic events carry the schedule context in ``extra`` # (cron expression, interval, timezone, fire count). Surface that # in the event log so the dashboard and audit queries can show # *why* the event fired, not just that it did. if event.event_type.value == "scheduled_message": sched_type = tracker_filters.get("schedule_type", "interval") details["schedule_type"] = sched_type if sched_type == "cron": details["cron_expression"] = tracker_filters.get("cron_expression", "") else: details["interval_seconds"] = tracker.scan_interval details["timezone"] = app_tz fire_count = event.extra.get("fire_count") if event.extra else None if fire_count is not None: details["fire_count"] = fire_count log = EventLog( user_id=tracker.user_id, tracker_id=tracker_id, tracker_name=tracker.name, provider_id=provider.id, provider_name=provider_name, event_type=event.event_type.value, collection_id=event.collection_id, collection_name=event.collection_name, assets_count=assets_count, details=enrich_details_with_correlation(details), ) session.add(log) await session.flush() event_log_id_by_event[id(event)] = log.id await session.commit() # Dispatch notifications — per-link config resolution # Filter out empty events (e.g. assets_added with 0 added) events = [ e for e in events if not (e.event_type.value == "assets_added" and e.added_count == 0) and not (e.event_type.value == "assets_removed" and e.removed_count == 0) ] _LOGGER.info( "Tracker %d: %d events after filter, %d links", tracker_id, len(events), len(link_data), ) if events and link_data: url_cache, asset_cache = await _get_telegram_caches() from .http_session import get_http_session shared_session = await get_http_session() dispatcher = NotificationDispatcher( url_cache=url_cache, asset_cache=asset_cache, session=shared_session, ) from .deferred_dispatch import defer_event, is_deferrable from .scheduler import schedule_deferred_drain from ..database.models import EventLog as _EventLog for event in events: _LOGGER.info( "Dispatching event %s for %s (added=%d removed=%d)", event.event_type.value, event.collection_name, event.added_count, event.removed_count, ) event_log_id = event_log_id_by_event.get(id(event)) # Group targets by tracking-config identity so each unique TC # gets one event-transform pass; targets sharing a TC dispatch # together (preserves the gather-fan-out inside the dispatcher). groups: dict[int, tuple[Any, list[TargetConfig]]] = {} # Track defers in a single dict so we can persist them in one # session + commit at the end of the iteration. ``load_link_data`` # emits multiple entries per broadcast link (one per child) sharing # the same parent ``link_id``; the deferred row is one-per-link, so # ``dict`` keying by ``link_id`` naturally dedupes. defers_for_event: dict[int, datetime] = {} scheduled_until: datetime | None = None for ld in link_data: tc = ld["tracking_config"] if tc is not None: outcome = evaluate_event_gate(event, tc, app_tz) if outcome.reason is GateReason.QUIET_HOURS: if is_deferrable(event.event_type.value) and outcome.quiet_hours_end_at is not None: link_id = ld.get("link_id") if link_id is not None: # Per-link earliest fire_at wins if a future # iteration ever supplies a different end. prior = defers_for_event.get(link_id) if prior is None or outcome.quiet_hours_end_at < prior: defers_for_event[link_id] = outcome.quiet_hours_end_at _LOGGER.info( " Deferred until %s (quiet hours)", outcome.quiet_hours_end_at.isoformat() if outcome.quiet_hours_end_at else "?", ) else: _LOGGER.info( " Suppressed (quiet hours; event type not deferrable)", ) continue if outcome.reason is GateReason.EVENT_TYPE_DISABLED: _LOGGER.info(" Skipped by tracking config filter") continue tmpl = ld["template_config"] target_cfg = TargetConfig( type=ld["target_type"], config=ld["target_config"], template_slots=ld["template_slots"], date_format=tmpl.date_format if tmpl else "%d.%m.%Y, %H:%M UTC", date_only_format=tmpl.date_only_format if tmpl and tmpl.date_only_format else "%d.%m.%Y", provider_api_key=resolve_provider_credential(provider_config), provider_internal_url=provider_config.get("url", ""), provider_external_url=provider_config.get("external_domain", ""), receivers=ld["receivers"], ) key = id(tc) if tc is not None else 0 if key not in groups: groups[key] = (tc, []) # Threaded with target_id/target_name so per-target failure # counters can attribute the dispatch result correctly. groups[key][1].append((target_cfg, ld.get("target_id"), ld.get("target_name", ""))) # Persist defers + stamp the event_log row + schedule drains in a # single transaction. This keeps the "deferred" pill on the # dashboard consistent with the existence of pending rows even if # the process is killed mid-way (either both land or neither does). if defers_for_event: async with AsyncSession(engine) as defer_session: for link_id, fire_at in defers_for_event.items(): await defer_event( defer_session, event=event, user_id=tracker_user_id, tracker_id=tracker_id, link_id=link_id, event_log_id=event_log_id, fire_at=fire_at, ) if scheduled_until is None or fire_at < scheduled_until: scheduled_until = fire_at # Stamp event_log row inside the SAME session so the # "deferred until" pill is only visible if the rows # actually persist. if event_log_id is not None and scheduled_until is not None: el = await defer_session.get(_EventLog, event_log_id) if el is not None: existing = dict(el.details or {}) if not existing.get("dispatch_status"): existing["dispatch_status"] = "deferred" existing["deferred_until"] = scheduled_until.isoformat() el.details = existing defer_session.add(el) await defer_session.commit() # Drain job registration is best-effort: a failure here just # delays delivery until the next scan/restart, not data loss. for fire_at in {*defers_for_event.values()}: try: schedule_deferred_drain(fire_at) except Exception: # noqa: BLE001 _LOGGER.exception( "Failed to schedule deferred drain for %s", fire_at, ) from .bridge_self import ( maybe_emit_target_failure, record_target_success, ) track_target_failures = ( event.provider_type.value != "bridge_self" ) # Per-event accumulator so the summary write covers every # tracking-config group, not just the last one. event_results: list[dict[str, Any]] = [] for tc, target_entries in groups.values(): if not target_entries: continue shaped_event = apply_tracking_display_filters(event, tc) if shaped_event is None: _LOGGER.info( " Event suppressed by display filters (favorites_only)", ) continue target_configs = [entry[0] for entry in target_entries] results = await dispatcher.dispatch(shaped_event, target_configs) event_results.extend(results) for entry, r in zip(target_entries, results): _, target_id, target_name = entry if r.get("success"): _LOGGER.info(" Notification sent successfully") if track_target_failures and target_id is not None: record_target_success(int(target_id)) else: _LOGGER.error(" Notification failed: %s", r.get("error", "unknown")) if track_target_failures and target_id is not None: try: await maybe_emit_target_failure( target_id=int(target_id), target_name=target_name or "", target_type=entry[0].type, error=str(r.get("error") or ""), ) except Exception: # noqa: BLE001 _LOGGER.exception( "bridge_self target-failure emission failed", ) # The EventLog row was committed in the earlier session block # so we run a tiny follow-up UPDATE in a fresh session. Best- # effort: a failure here logs but does not abort the watcher. if event_log_id is not None and event_results: async with AsyncSession(engine) as summary_session: await record_dispatch_summary_async( summary_session, event_log_id, event_results, ) return { "status": "ok", "events_detected": len(events), "collections_checked": len(collection_ids), }