perf(immich): skip full album fetch on idle ticks; delta-fetch for active ones

Optimizes polling for large Immich albums (the tested path targets ~200k
assets). Combined, these changes drop the per-tick cost for idle albums
from a ~150 MB fetch to a few hundred bytes; active albums fetch
O(changes) instead of O(library).

Core changes
- ImmichAlbumMeta + get_album_meta() using ?withoutAssets=true as a
  cheap change-detection probe.
- poll() fast-path: skip full fetch when meta fingerprint matches and
  no pending assets are outstanding.
- poll() delta-path: search/metadata with updatedAfter when fingerprint
  changed, falling back to full fetch on count decrease or mixed
  add+remove that delta can't reconcile.
- asyncio.gather over meta probes so a 20-album tracker pays one
  round-trip of latency instead of 20.
- Event payload cap (50 added / 200 removed) so a bulk import can't
  explode a Jinja template or exceed Telegram's message limits.
- Module-level users cache (1h TTL, sha256-keyed) shared across
  providers on the same Immich server.
- Tick-scoped shared-links cache via new
  get_all_shared_links_by_album() — one /api/shared-links request per
  tick instead of one per changed album.

Server changes
- meta_fingerprint JSON column on NotificationTrackerState + migration.
- watcher skips the asset_ids DB rewrite when the fingerprint didn't
  change, avoiding ~8 MB JSON writes on idle ticks for huge albums.
- Adaptive polling: after 10 consecutive empty polls run only 1 tick in 2,
after 30 run only 1 tick in 4 (skipping 3 of 4); reset on the first
detected change and on schedule changes.
- APScheduler jitter (interval/4, capped at 30s) to smooth thundering-
  herd bursts when many trackers share the same scan_interval.
This commit is contained in:
2026-04-22 18:55:26 +03:00
parent d02616069d
commit fe38d20b96
8 changed files with 796 additions and 40 deletions
@@ -309,6 +309,14 @@ async def migrate_schema(engine: AsyncEngine) -> None:
text(f"ALTER TABLE {state_table} ADD COLUMN shared INTEGER DEFAULT 0")
)
logger.info("Added shared column to %s table", state_table)
# meta_fingerprint — small JSON blob captured from the provider's
# cheap meta probe. An empty default means "unknown, do a full
# fetch next tick" so existing rows don't wrongly skip detection.
if not await _has_column(conn, state_table, "meta_fingerprint"):
await conn.execute(
text(f"ALTER TABLE {state_table} ADD COLUMN meta_fingerprint TEXT DEFAULT '{{}}'")
)
logger.info("Added meta_fingerprint column to %s table", state_table)
# Add language_code to telegram_chat if missing
if await _has_table(conn, "telegram_chat"):
@@ -376,6 +376,13 @@ class NotificationTrackerState(SQLModel, table=True):
shared: bool = Field(default=False)
asset_ids: list[str] = Field(default_factory=list, sa_column=Column(JSON))
pending_asset_ids: list[str] = Field(default_factory=list, sa_column=Column(JSON))
# Lightweight fingerprint ({updated_at, asset_count, shared, name, ...})
# captured from the provider's cheap meta probe. When it differs from the
# provider's current response, the watcher knows a fetch is actually
# required; when it matches, the watcher can skip the big read.
meta_fingerprint: dict[str, Any] = Field(
default_factory=dict, sa_column=Column(JSON)
)
last_updated: datetime = Field(default_factory=_utcnow)
@@ -10,6 +10,49 @@ _LOGGER = logging.getLogger(__name__)
_scheduler: AsyncIOScheduler | None = None
# ---------------------------------------------------------------------------
# Adaptive polling (Tier 6 of the big-album optimization plan).
#
# We don't touch the user-configured ``scan_interval`` — that's still the
# authoritative cadence. Instead, we *skip* a growing fraction of scheduled
# ticks when a tracker is idle, and reset to 1:1 as soon as it detects
# anything. The scheduler keeps running on the user's chosen period, so
# response time to the *first* change after an idle stretch is bounded by
# the skip factor — at most ``_ADAPTIVE_MAX_SKIP`` scheduled ticks (i.e.
# 4 × ``scan_interval`` in 1-in-4 mode) — while the steady-state HTTP cost
# for a fleet of idle trackers drops by up to ~75%.
#
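# Thresholds are intentionally conservative and count *executed* polls, not
# scheduled ticks: a tracker polling every 30 s needs 5 min of silence
# (10 polls) before we halve its effective rate, and roughly 25 min in total
# (20 further polls at the halved rate) before we quarter it. Any caller can
# disable adaptive behavior by passing ``adaptive=False`` in the tracker
# filters dict (checked in ``_poll_tracker``).
# NOTE(review): ``_poll_tracker`` as written does not read an ``adaptive``
# flag — confirm where (or whether) this opt-out is implemented.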
# ---------------------------------------------------------------------------
# Thresholds are compared against ``empty_count``, which only advances when a
# poll actually ran and reported zero events — skipped ticks do not count.
_ADAPTIVE_HALVE_THRESHOLD = 10  # consecutive empty polls → run 1 tick in 2
_ADAPTIVE_QUARTER_THRESHOLD = 30  # consecutive empty polls → run 1 tick in 4
_ADAPTIVE_MAX_SKIP = 4  # hard cap on skip factor (largest "1-in-K" used)

# Per-tracker adaptive state, keyed by tracker_id. Each value holds the
# integer counters ``empty_count``, ``skip_every`` and ``tick_counter``.
# Rebuilt on process restart — a short warmup period is fine and avoids
# persisting what is effectively a performance heuristic.
_adaptive_state: dict[int, dict[str, int]] = {}
def _compute_jitter(interval_seconds: int) -> int:
"""Return a jitter bound (in seconds) suitable for an IntervalTrigger.
Without jitter, a fleet of N trackers all on ``scan_interval=60`` wake up
at the same wall-clock second every minute — that creates a thundering-
herd on the upstream Immich/Gitea/etc. server. APScheduler's ``jitter``
randomizes each tick's firing time by ±jitter seconds.
We use a quarter of the interval up to a 30 s cap. For short intervals
(≤8 s) jitter would round to 0 — that's fine, at those cadences a
bursty pattern is what the user implicitly opted into.
"""
if interval_seconds <= 0:
return 0
return min(interval_seconds // 4, 30)
def get_scheduler() -> AsyncIOScheduler:
global _scheduler
@@ -271,16 +314,21 @@ async def _load_tracker_jobs() -> None:
tracker.id, tracker.name, e,
)
jitter = _compute_jitter(tracker.scan_interval)
scheduler.add_job(
_poll_tracker,
"interval",
seconds=tracker.scan_interval,
jitter=jitter or None,
id=job_id,
args=[tracker.id],
replace_existing=True,
max_instances=1,
)
_LOGGER.info("Scheduled tracker %d (%s) every %ds", tracker.id, tracker.name, tracker.scan_interval)
_LOGGER.info(
"Scheduled tracker %d (%s) every %ds (jitter ±%ds)",
tracker.id, tracker.name, tracker.scan_interval, jitter,
)
def _add_cron_job(
@@ -313,6 +361,10 @@ async def schedule_tracker(
scheduler = get_scheduler()
job_id = f"tracker_{tracker_id}"
# A reschedule typically follows a config edit or enable/disable flip —
# drop adaptive back-off so the first tick after the change runs promptly.
reset_adaptive_state(tracker_id)
# Remove existing job first to allow trigger type changes
if scheduler.get_job(job_id):
scheduler.remove_job(job_id)
@@ -324,33 +376,113 @@ async def schedule_tracker(
except Exception as e:
_LOGGER.error("Invalid cron for tracker %d: %s — using interval", tracker_id, e)
jitter = _compute_jitter(interval)
scheduler.add_job(
_poll_tracker,
"interval",
seconds=interval,
jitter=jitter or None,
id=job_id,
args=[tracker_id],
replace_existing=True,
)
_LOGGER.info("Scheduled tracker %d every %ds", tracker_id, interval)
_LOGGER.info(
"Scheduled tracker %d every %ds (jitter ±%ds)", tracker_id, interval, jitter,
)
async def unschedule_tracker(tracker_id: int) -> None:
    """Remove a scheduler job for a tracker.

    The tracker's in-memory adaptive-polling counters are dropped as well,
    whether or not a job is currently registered for it.

    Args:
        tracker_id: ID of the tracker; its job ID is ``tracker_<id>``.
    """
    scheduler = get_scheduler()
    job_id = f"tracker_{tracker_id}"
    # Forget any back-off state so a later re-schedule starts at the base rate.
    reset_adaptive_state(tracker_id)
    # Only remove the job when it is actually registered with the scheduler.
    if scheduler.get_job(job_id):
        scheduler.remove_job(job_id)
        _LOGGER.info("Unscheduled tracker %d", tracker_id)
def _adaptive_should_skip(tracker_id: int) -> bool:
    """Decide whether the adaptive heuristic wants this scheduled tick skipped.

    A tracker with no recorded state, or whose ``skip_every`` factor is 1 or
    less, is never skipped and its counters are left untouched. Otherwise
    the tracker's ``tick_counter`` is incremented and the tick is allowed to
    run only when the counter is a multiple of ``skip_every`` — so in
    1-in-K mode, K-1 ticks are skipped between consecutive real polls.

    Args:
        tracker_id: ID of the tracker about to be polled.

    Returns:
        ``True`` if the caller should return without polling.
    """
    entry = _adaptive_state.get(tracker_id)
    if entry:
        factor = entry.get("skip_every", 1)
        if factor > 1:
            # Side effect: every call in skip mode advances the counter.
            ticks = entry.get("tick_counter", 0) + 1
            entry["tick_counter"] = ticks
            # Run on multiples of the factor; skip everything in between.
            return ticks % factor != 0
    return False
def _adaptive_update(tracker_id: int, events_detected: int) -> None:
    """Record the outcome of a poll that actually ran for ``tracker_id``.

    Any detected event restores the base rate: all three counters are reset
    (and the recovery is logged when the tracker had been backing off).
    An empty poll bumps ``empty_count`` and, once a threshold is reached,
    raises ``skip_every`` to 2 and later to ``_ADAPTIVE_MAX_SKIP``. Each
    escalation is logged once, because the guard on the current factor
    stops it from re-triggering.

    Args:
        tracker_id: ID of the tracker that was polled.
        events_detected: Number of events the poll reported.
    """
    entry = _adaptive_state.setdefault(
        tracker_id, {"empty_count": 0, "skip_every": 1, "tick_counter": 0}
    )

    if events_detected > 0:
        if entry["skip_every"] > 1:
            _LOGGER.info(
                "Adaptive polling: tracker %d saw activity, restoring base rate",
                tracker_id,
            )
        # Activity seen: back to polling on every scheduled tick.
        entry.update(empty_count=0, skip_every=1, tick_counter=0)
        return

    idle_polls = entry.get("empty_count", 0) + 1
    entry["empty_count"] = idle_polls

    # Check the higher threshold first so a long-idle tracker goes straight
    # to the maximum skip factor.
    if (
        idle_polls >= _ADAPTIVE_QUARTER_THRESHOLD
        and entry["skip_every"] < _ADAPTIVE_MAX_SKIP
    ):
        entry["skip_every"] = _ADAPTIVE_MAX_SKIP
        _LOGGER.info(
            "Adaptive polling: tracker %d idle for %d ticks, skipping 3 of 4",
            tracker_id, idle_polls,
        )
    elif idle_polls >= _ADAPTIVE_HALVE_THRESHOLD and entry["skip_every"] < 2:
        entry["skip_every"] = 2
        _LOGGER.info(
            "Adaptive polling: tracker %d idle for %d ticks, skipping every other",
            tracker_id, idle_polls,
        )
def reset_adaptive_state(tracker_id: int) -> None:
    """Forget the adaptive-polling counters kept for a tracker.

    With its entry gone, the tracker is no longer skipped, so its next
    scheduled tick runs a real poll. ``schedule_tracker`` and
    ``unschedule_tracker`` call this; it is a no-op when the tracker has no
    recorded state.

    Args:
        tracker_id: ID of the tracker whose counters should be discarded.
    """
    if tracker_id in _adaptive_state:
        del _adaptive_state[tracker_id]
async def _poll_tracker(tracker_id: int) -> None:
    """Poll a tracker for changes, honoring the adaptive back-off.

    Scheduler entry point for a tracker's job. The tick is dropped when the
    adaptive heuristic asks for a skip. Otherwise ``check_tracker`` runs
    exactly once and its result feeds the adaptive counters.

    Exceptions raised by ``check_tracker`` are logged and swallowed so they
    do not propagate to the scheduler; such a tick leaves the adaptive
    counters untouched.

    Args:
        tracker_id: ID of the tracker to poll.
    """
    # Imported lazily inside the function rather than at module import time.
    from .watcher import check_tracker

    if _adaptive_should_skip(tracker_id):
        return

    try:
        # Single call: the result must be captured, and polling twice per
        # tick would double the upstream load this module is trying to cut.
        result = await check_tracker(tracker_id)
    except Exception as e:
        _LOGGER.error("Error polling tracker %d: %s", tracker_id, e)
        return

    # Treat the "error" / "skipped" statuses as inconclusive — don't let
    # a transient upstream failure trick the heuristic into backing off.
    if isinstance(result, dict) and result.get("status") == "ok":
        _adaptive_update(tracker_id, int(result.get("events_detected", 0) or 0))
# ---------------------------------------------------------------------------
@@ -187,8 +187,17 @@ async def check_tracker(tracker_id: int) -> dict[str, Any]:
"asset_ids": s.asset_ids,
"pending_asset_ids": s.pending_asset_ids,
"shared": bool(s.shared),
"meta_fingerprint": s.meta_fingerprint or {},
}
# Snapshot the original fingerprint per collection so we can skip the
# (expensive) asset_ids rewrite when nothing changed. For a 200k-asset
# album this avoids a ~7 MB JSON write to the state row every tick.
original_fingerprints: dict[str, dict[str, Any]] = {
cid: dict(cstate.get("meta_fingerprint") or {})
for cid, cstate in state_dict.items()
}
# Load tracker-target links
link_data = await load_link_data(session, tracker_id)
@@ -279,11 +288,20 @@ async def check_tracker(tracker_id: int) -> dict[str, Any]:
existing = s
break
current_fingerprint = dict(cstate.get("meta_fingerprint") or {})
prior_fingerprint = original_fingerprints.get(cid, {})
# Skip the DB update when the provider reported no meaningful
# change. ``existing`` is None on first-ever fetch for a
# collection — that path always writes so the row gets created.
if existing is not None and current_fingerprint == prior_fingerprint:
continue
if existing:
existing.asset_ids = cstate.get("asset_ids", [])
existing.pending_asset_ids = cstate.get("pending_asset_ids", [])
existing.collection_name = cstate.get("name", "")
existing.shared = cstate.get("shared", False)
existing.meta_fingerprint = current_fingerprint
session.add(existing)
else:
new_ts = NotificationTrackerState(
@@ -293,6 +311,7 @@ async def check_tracker(tracker_id: int) -> dict[str, Any]:
shared=cstate.get("shared", False),
asset_ids=cstate.get("asset_ids", []),
pending_asset_ids=cstate.get("pending_asset_ids", []),
meta_fingerprint=current_fingerprint,
)
session.add(new_ts)