From 3b7808aa9c7488b4392953e2796d4e481e775fd0 Mon Sep 17 00:00:00 2001
From: "alexei.dolgolyov" <dolgolyov.alexei@gmail.com>
Date: Wed, 22 Apr 2026 19:27:09 +0300
Subject: [PATCH] perf(immich): TTL cache for album bodies and shared-link
 listings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bot commands like /random, /latest, /memory refetch the same albums in
quick succession; the GET /api/albums/{id} response can be tens of MB on
large albums, and /api/shared-links has no per-album filter so every
get_shared_links call was already paying for the full server-wide list.

- Module-level 60s TTL cache for album bodies, keyed by
  (server_digest, album_id), 32-entry FIFO cap.  Module-scoped (not
  instance-scoped) because ImmichClient is constructed fresh per request
  in several places, so an instance cache would never survive a second
  caller.  Mirrors the existing _users_cache pattern.
- Module-level 60s TTL cache for the bucketed shared-links map, keyed by
  server_digest.  get_shared_links(album_id) now delegates to a single
  server-wide fetch that serves every album.
- server_digest hashes url+api_key so raw creds don't sit in dict keys.
- get_album(use_cache=False) escape hatch for paths that must observe
  current server state — wired into ImmichActionExecutor.execute (diffs
  the album to decide what to add) and ImmichServiceProvider.poll's
  full-fetch path (stale data would silently delay removal events).
- Async locks guard cache writes with under-lock re-check so concurrent
  misses collapse to one fetch.
---
 .../providers/immich/action_executor.py       |   4 +-
 .../providers/immich/client.py                | 120 +++++++++++++++---
 .../providers/immich/provider.py              |   8 +-
 3 files changed, 113 insertions(+), 19 deletions(-)

diff --git a/packages/core/src/notify_bridge_core/providers/immich/action_executor.py b/packages/core/src/notify_bridge_core/providers/immich/action_executor.py
index facff74..402d424 100644
--- a/packages/core/src/notify_bridge_core/providers/immich/action_executor.py
+++ b/packages/core/src/notify_bridge_core/providers/immich/action_executor.py
@@ -177,7 +177,9 @@ class ImmichActionExecutor(ActionExecutor):
                 needs_thumbnail = album_id in album_created_now
 
                 if album_id and album_id != "__dry_run_new__":
-                    album = await self._client.get_album(album_id)
+                    # Actions diff the current album state to decide what to
+                    # add — must observe fresh data, not a cached view.
+                    album = await self._client.get_album(album_id, use_cache=False)
                     if album is None and create_if_missing and create_album_name:
                         if not dry_run:
                             created = await self._client.create_album(create_album_name)
diff --git a/packages/core/src/notify_bridge_core/providers/immich/client.py b/packages/core/src/notify_bridge_core/providers/immich/client.py
index 3213fe9..252ebae 100644
--- a/packages/core/src/notify_bridge_core/providers/immich/client.py
+++ b/packages/core/src/notify_bridge_core/providers/immich/client.py
@@ -2,8 +2,11 @@
 
 from __future__ import annotations
 
+import asyncio
+import hashlib
 import logging
 import re
+import time
 from typing import Any
 
 import aiohttp
@@ -18,6 +21,51 @@ _LOGGER = logging.getLogger(__name__)
 MAX_SEARCH_QUERY_LEN = 256
 MAX_SEARCH_PERSON_IDS = 50
 
+# Module-level TTL caches for album bodies and shared-link listings. The
+# Immich ``GET /api/albums/{id}`` response can be tens or hundreds of MB on a
+# large album, and bot commands like /random, /latest, /memory all refetch
+# the same album in quick succession. A short TTL makes repeat runs nearly
+# instant and deduplicates concurrent fetches so a burst of commands issues
+# one HTTP call instead of N.
+#
+# Caches are module-scoped (not instance-scoped) because ``ImmichClient`` is
+# constructed fresh per request in several places (api/providers.py,
+# services/action_runner.py, command handlers), so an instance cache would
+# never survive to serve a second caller. This mirrors ``_users_cache`` in
+# ``provider.py``.
+_ALBUM_CACHE_TTL_SECONDS = 60
+_SHARED_LINKS_CACHE_TTL_SECONDS = 60
+# Guard rail against runaway memory — a 200k-asset album response can be
+# ~150 MB, so even modest caps bound the worst case.
+_ALBUM_CACHE_MAX_ENTRIES = 32
+_album_cache_lock = asyncio.Lock()
+# key = (server_digest, album_id); value = (monotonic_ts, raw_api_dict)
+# Store the raw dict rather than the parsed ``ImmichAlbumData`` so callers
+# that pass a ``users_cache`` still get owner-name enrichment on cache hits.
+_album_cache: dict[tuple[str, str], tuple[float, dict[str, Any]]] = {}
+_shared_links_cache_lock = asyncio.Lock()
+# key = server_digest; value = (monotonic_ts, {album_id: [SharedLinkInfo, ...]})
+# The underlying ``/api/shared-links`` endpoint has no per-album filter, so
+# every call was already paying for the full server-wide list. Caching the
+# bucketed result once per server turns N per-album calls into one fetch.
+_shared_links_cache: dict[str, tuple[float, dict[str, list[SharedLinkInfo]]]] = {}
+
+
+def _server_digest(url: str, api_key: str) -> str:
+    """Hashed key that avoids putting raw api_key into cache dict keys."""
+    return hashlib.sha256(f"{url}|{api_key}".encode("utf-8")).hexdigest()[:32]
+
+
+def invalidate_album_cache() -> None:
+    """Drop every cached album body. Call after mutations that invalidate
+    the cached view (e.g. integration tests, manual /refresh commands)."""
+    _album_cache.clear()
+
+
+def invalidate_shared_links_cache() -> None:
+    """Drop every cached shared-link listing."""
+    _shared_links_cache.clear()
+
 # User-facing error bodies — Immich responses may leak internal paths,
 # hostnames, or headers injected by intermediary proxies. These helpers keep
 # only a short, scrubbed summary; full bodies are logged server-side only.
@@ -184,22 +232,30 @@ class ImmichClient:
         return {}
 
     async def get_shared_links(self, album_id: str) -> list[SharedLinkInfo]:
-        links: list[SharedLinkInfo] = []
-        try:
-            async with self._session.get(
-                f"{self._url}/api/shared-links",
-                headers=self._headers,
-            ) as response:
-                if response.status == 200:
-                    data = await response.json()
-                    for link in data:
-                        album = link.get("album")
-                        key = link.get("key")
-                        if album and key and album.get("id") == album_id:
-                            links.append(SharedLinkInfo.from_api_response(link))
-        except aiohttp.ClientError as err:
-            _LOGGER.warning("Failed to fetch shared links: %s", err)
-        return links
+        bucketed = await self._get_shared_links_bucketed()
+        return list(bucketed.get(album_id, []))
+
+    async def _get_shared_links_bucketed(self) -> dict[str, list[SharedLinkInfo]]:
+        """Return ``{album_id: [SharedLinkInfo, ...]}`` for the server, hitting
+        the module-level TTL cache first. Underlying Immich endpoint has no
+        per-album filter, so one server-wide fetch serves every caller until
+        the TTL elapses.
+        """
+        digest = _server_digest(self._url, self._api_key)
+        now = time.monotonic()
+        entry = _shared_links_cache.get(digest)
+        if entry is not None and (now - entry[0]) < _SHARED_LINKS_CACHE_TTL_SECONDS:
+            return entry[1]
+
+        async with _shared_links_cache_lock:
+            # Re-check under the lock — another coroutine may have refreshed
+            # while we waited.
+            entry = _shared_links_cache.get(digest)
+            if entry is not None and (time.monotonic() - entry[0]) < _SHARED_LINKS_CACHE_TTL_SECONDS:
+                return entry[1]
+            fresh = await self.get_all_shared_links_by_album()
+            _shared_links_cache[digest] = (time.monotonic(), fresh)
+            return fresh
 
     async def get_all_shared_links_by_album(self) -> dict[str, list[SharedLinkInfo]]:
         """Fetch every shared link on the server, bucketed by album id.
@@ -247,7 +303,29 @@ class ImmichClient:
         self,
         album_id: str,
         users_cache: dict[str, str] | None = None,
+        *,
+        use_cache: bool = True,
     ) -> ImmichAlbumData | None:
+        """Fetch an album by id, optionally serving from the module-level
+        TTL cache. Pass ``use_cache=False`` from paths that must observe the
+        current server state (e.g. the notification poll loop's full-fetch
+        path, where a stale cached entry would delay asset-removal events).
+        Non-cached fetches still populate the cache for subsequent readers.
+        """
+        cache_key = (_server_digest(self._url, self._api_key), album_id)
+        if use_cache:
+            entry = _album_cache.get(cache_key)
+            if entry is not None and (time.monotonic() - entry[0]) < _ALBUM_CACHE_TTL_SECONDS:
+                # Rehydrate per-call so ``users_cache`` enrichment is applied
+                # with the caller's dict, not whichever one was live when the
+                # cache was populated.
+                return ImmichAlbumData.from_api_response(entry[1], users_cache)
+
+        # Deliberately fetch without holding a lock so concurrent calls for
+        # *different* album_ids (the common case from asyncio.gather in
+        # fetch_albums_with_links) stay parallel. The worst case is a small
+        # duplicate-fetch stampede when two requests miss the same album at
+        # the same instant — acceptable for our scale.
         try:
             async with self._session.get(
                 f"{self._url}/api/albums/{album_id}",
@@ -260,10 +338,18 @@ class ImmichClient:
                         f"Error fetching album {album_id}: HTTP {response.status}"
                     )
                 data = await response.json()
-                return ImmichAlbumData.from_api_response(data, users_cache)
         except aiohttp.ClientError as err:
             raise ImmichApiError(f"Error communicating with Immich: {err}") from err
 
+        async with _album_cache_lock:
+            # Evict the oldest entry if we're at the cap — simple FIFO is fine
+            # for our access pattern (commands touch a small working set).
+            if len(_album_cache) >= _ALBUM_CACHE_MAX_ENTRIES and cache_key not in _album_cache:
+                oldest = min(_album_cache.items(), key=lambda kv: kv[1][0])[0]
+                _album_cache.pop(oldest, None)
+            _album_cache[cache_key] = (time.monotonic(), data)
+        return ImmichAlbumData.from_api_response(data, users_cache)
+
     async def get_album_meta(self, album_id: str) -> ImmichAlbumMeta | None:
         """Fetch album metadata without the assets array.
 
diff --git a/packages/core/src/notify_bridge_core/providers/immich/provider.py b/packages/core/src/notify_bridge_core/providers/immich/provider.py
index 6741dc7..0deb580 100644
--- a/packages/core/src/notify_bridge_core/providers/immich/provider.py
+++ b/packages/core/src/notify_bridge_core/providers/immich/provider.py
@@ -292,7 +292,13 @@ class ImmichServiceProvider(ServiceProvider):
                 # the full-fetch path so removals get detected.
 
             # Full fetch: first tick, or count-decreased, or delta-unsafe.
-            album = await self._client.get_album(album_id, self._users_cache)
+            # Bypass the module-level album cache — this path runs when we
+            # specifically need the current server state (e.g. to detect
+            # asset removals), so a stale cached entry would silently delay
+            # the event.
+            album = await self._client.get_album(
+                album_id, self._users_cache, use_cache=False,
+            )
             if album is None:
                 # Album was deleted between meta probe and full fetch — handle
                 # the deletion the same way as above.