ledgrab/server/src/ledgrab/storage/activity_log_repository.py

"""Append-only repository for the persistent activity / audit log.

Design notes
------------
* Does NOT subclass ``BaseSqliteStore`` — that base loads every row into an
  in-memory cache at construction time, which is wrong for an unbounded,
  append-heavy log.
* All SQL parameters are passed as positional ``?`` placeholders — no user
  input is interpolated into the query string.
* Keyset pagination is implemented via the ``seq`` INTEGER PRIMARY KEY
  AUTOINCREMENT column, which is strictly monotonic and survives rows with
  identical ``ts`` values.
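* The repository takes a ``Database`` instance and calls ``db.execute`` /
  ``db.transaction`` directly.  The one exception is ``iter_export``, which
  reaches into the database's private lock and connection so that each
  batch is fetched while the lock is held.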
* Thread safety: ``Database.execute`` holds the ``RLock`` for the duration of
  each call.  The repository adds no locking of its own, except that
  ``iter_export`` acquires the database lock around each batch fetch.
  Callers that originate from non-event-loop threads MUST marshal via
  ``loop.call_soon_threadsafe`` (that is Phase 2's responsibility).
"""

from __future__ import annotations

from datetime import datetime
from typing import Iterator

from ledgrab.storage.activity_log import ActivityLogEntry, ActivityLogFilters
from ledgrab.storage.database import Database
from ledgrab.utils import get_logger

# Logger obtained for this module's ``__name__``.
logger = get_logger(__name__)

# Name of the SQLite table this repository reads and writes.  It is a fixed
# module-level constant (never user input), which is why it may be
# interpolated into SQL text while all values go through ``?`` placeholders.
_TABLE = "activity_log"


def _build_filter_clause(
    filters: ActivityLogFilters,
    params: list,
    *,
    extra_where: str | None = None,
) -> str:
    """Translate *filters* into an ``AND``-joined SQL condition fragment.

    Only the bare conditions are returned; the caller supplies the leading
    ``WHERE`` (or ``AND``) keyword.  An empty string means "no restriction".
    Every value is bound through a ``?`` placeholder and appended to
    *params* in the same order its placeholder appears in the fragment.

    ``extra_where``, when given, becomes the first condition (it carries the
    keyset predicate such as ``seq < ?``).  Its bound value is NOT added
    here: the caller must already have placed it in *params* before calling.
    """
    fragments: list[str] = [extra_where] if extra_where else []

    # Multi-valued filters become ``column IN (?,?,...)``; an empty or missing
    # collection imposes no restriction.
    for column, values in (
        ("category", filters.categories),
        ("severity", filters.severities),
    ):
        if values:
            marks = ",".join("?" for _ in range(len(values)))
            fragments.append(f"{column} IN ({marks})")
            params.extend(values)

    # Single-valued filters are exact matches; ``None`` means "not set".
    for column, value in (
        ("actor", filters.actor),
        ("entity_type", filters.entity_type),
        ("entity_id", filters.entity_id),
    ):
        if value is not None:
            fragments.append(f"{column} = ?")
            params.append(value)

    # Inclusive time window, compared against the ISO-8601 text of each bound.
    for comparison, bound in (
        ("ts >= ?", filters.since),
        ("ts <= ?", filters.until),
    ):
        if bound is not None:
            fragments.append(comparison)
            params.append(bound.isoformat())

    needle = filters.message_like
    if needle is not None:
        # Neutralise LIKE metacharacters so the user's text is matched
        # literally.  The backslash must be escaped first, otherwise the
        # escapes added for '%' and '_' would themselves be doubled.
        for special in ("\\", "%", "_"):
            needle = needle.replace(special, "\\" + special)
        fragments.append("message LIKE ? ESCAPE '\\'")
        params.append(f"%{needle}%")

    return " AND ".join(fragments)


class ActivityLogRepository:
    """Purpose-built repository for the ``activity_log`` table.

    Rows are appended with :meth:`record`, read back with :meth:`query`,
    :meth:`count` and :meth:`iter_export`, and removed with :meth:`prune`
    and :meth:`clear`.  Nothing is cached in memory; every call goes to the
    database.

    Parameters
    ----------
    db:
        The shared ``Database`` singleton.  The migration that creates the
        ``activity_log`` table must have been applied before the first call to
        :meth:`record`.  Construction triggers migration via
        ``MigrationRunner`` so users can create the repository directly
        without a separate startup step.
    """

    def __init__(self, db: Database) -> None:
        """Bind the repository to *db* and run all registered migrations."""
        self._db = db
        # Apply pending migrations (idempotent; most runs are no-ops).
        # NOTE(review): the import is function-local, presumably to avoid an
        # import cycle with the migrations package — confirm before hoisting.
        from ledgrab.storage.data_migrations import ALL_MIGRATIONS, MigrationRunner

        MigrationRunner(db).run(ALL_MIGRATIONS)

    # -- Internal helpers ----------------------------------------------------

    @staticmethod
    def _select_page_sql(where_fragment: str, direction: str) -> str:
        """Build the paged SELECT shared by :meth:`query` and :meth:`iter_export`.

        *where_fragment* is the bare condition text produced by
        ``_build_filter_clause`` (may be empty).  *direction* is the literal
        ``"ASC"`` or ``"DESC"`` supplied by this class, never user input.
        The statement ends in ``LIMIT ?`` so the caller must bind the page
        size as the last parameter.
        """
        where_clause = f"WHERE {where_fragment}" if where_fragment else ""
        return (
            f"SELECT seq, id, ts, category, action, severity, actor, "
            f"entity_type, entity_id, entity_name, message, metadata "
            f"FROM {_TABLE} "
            f"{where_clause} "
            f"ORDER BY seq {direction} "
            f"LIMIT ?"
        )

    # -- Write ---------------------------------------------------------------

    def record(self, entry: ActivityLogEntry) -> None:
        """Append *entry* to the log.

        The ``seq`` column is assigned by SQLite (AUTOINCREMENT) — ``entry``
        does not carry a ``seq`` field.

        Caller contract: this must be called from the event-loop thread (or
        from a context already serialised through the ``Database`` RLock).
        Cross-thread callers must marshal via ``loop.call_soon_threadsafe``;
        that is Phase 2's responsibility.
        """
        row = entry.to_row()
        # Bind values in exactly the order the columns are listed in the SQL.
        columns = (
            "id",
            "ts",
            "category",
            "action",
            "severity",
            "actor",
            "entity_type",
            "entity_id",
            "entity_name",
            "message",
            "metadata",
        )
        self._db.execute(
            f"INSERT INTO {_TABLE} "
            f"(id, ts, category, action, severity, actor, "
            f" entity_type, entity_id, entity_name, message, metadata) "
            f"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
            tuple(row[column] for column in columns),
        )

    # -- Read ----------------------------------------------------------------

    def query(
        self,
        filters: ActivityLogFilters,
        *,
        before_seq: int | None = None,
        limit: int = 50,
    ) -> list[ActivityLogEntry]:
        """Return the newest matching entries, oldest-first within the page.

        Keyset pagination: pass the smallest ``seq`` seen on the previous page
        as *before_seq* to get the next page.  Entries are fetched in
        ``seq DESC`` order from SQLite and then reversed before returning so
        that the caller receives them in chronological order within the page.

        Parameters
        ----------
        filters:
            Filter criteria; an instance with nothing set matches every row.
        before_seq:
            Exclusive upper bound on ``seq``.  ``None`` returns the first
            (newest) page.
        limit:
            Maximum number of entries to return.  The value is bound directly
            to SQL ``LIMIT``; SQLite treats a negative ``LIMIT`` as "no upper
            bound", so callers should validate untrusted values first.
        """
        params: list = []
        keyset: str | None = None
        if before_seq is not None:
            # The keyset value must be bound first because its predicate is
            # the first condition in the generated WHERE clause.
            params.append(before_seq)
            keyset = "seq < ?"

        where_fragment = _build_filter_clause(filters, params, extra_where=keyset)
        params.append(limit)

        sql = self._select_page_sql(where_fragment, "DESC")
        rows = self._db.execute(sql, tuple(params)).fetchall()
        # Reverse to return chronological order within the page.
        return [ActivityLogEntry.from_row(dict(row)) for row in reversed(rows)]

    def count(self, filters: ActivityLogFilters | None = None) -> int:
        """Return the number of entries matching *filters* (or all entries)."""
        if filters is None:
            filters = ActivityLogFilters()
        params: list = []
        where_fragment = _build_filter_clause(filters, params)
        where_clause = f"WHERE {where_fragment}" if where_fragment else ""

        sql = f"SELECT COUNT(*) AS cnt FROM {_TABLE} {where_clause}"
        row = self._db.execute(sql, tuple(params)).fetchone()
        return int(row["cnt"])

    # -- Maintenance ---------------------------------------------------------

    def prune(
        self,
        *,
        before_ts: datetime | None = None,
        max_entries: int | None = None,
    ) -> int:
        """Delete old / excess entries; return the total rows deleted.

        Both predicates are applied independently:

        1. Delete all rows where ``ts < before_ts`` (age-based pruning).
        2. Keep only the newest ``max_entries`` rows, deleting the rest
           (count-based pruning).

        The two operations run in separate statements so that each can be
        independently enabled or disabled.  ``max_entries=0`` removes every
        row; a negative ``max_entries`` is ignored, exactly like ``None``.
        """
        deleted = 0

        if before_ts is not None:
            cursor = self._db.execute(
                f"DELETE FROM {_TABLE} WHERE ts < ?",
                (before_ts.isoformat(),),
            )
            deleted += cursor.rowcount

        if max_entries is not None and max_entries >= 0:
            # Skipping the ``max_entries`` newest rows lands on the newest row
            # that must go; it and everything older (smaller seq) is deleted.
            # No row at that offset means the table is already within budget.
            cursor = self._db.execute(
                f"SELECT seq FROM {_TABLE} ORDER BY seq DESC LIMIT 1 OFFSET ?",
                (max_entries,),
            )
            cutoff_row = cursor.fetchone()
            if cutoff_row is not None:
                cursor = self._db.execute(
                    f"DELETE FROM {_TABLE} WHERE seq <= ?",
                    (cutoff_row["seq"],),
                )
                deleted += cursor.rowcount

        return deleted

    def clear(self) -> int:
        """Delete all entries; return the number of rows deleted."""
        cursor = self._db.execute(f"DELETE FROM {_TABLE}")
        return cursor.rowcount

    def get_seq_for_id(self, entry_id: str) -> int | None:
        """Return the ``seq`` value for the entry with *entry_id*, or ``None``.

        Used by the API list endpoint to compute the keyset cursor
        (``next_before_seq``) from the oldest entry on the current page.
        """
        cursor = self._db.execute(
            f"SELECT seq FROM {_TABLE} WHERE id = ?",
            (entry_id,),
        )
        row = cursor.fetchone()
        return int(row["seq"]) if row is not None else None

    # -- Export --------------------------------------------------------------

    def iter_export(
        self,
        filters: ActivityLogFilters | None = None,
        *,
        batch_size: int = 1000,
    ) -> Iterator[ActivityLogEntry]:
        """Yield all matching entries in ascending ``seq`` order.

        Fetches rows in bounded batches (keyset-paginated by ``seq``), holding
        the DB lock only for the duration of each ``fetchall()`` and releasing
        it before yielding.  This prevents a slow/stalled export client from
        blocking all other DB operations (record, config writes, etc.) for the
        full duration of the stream.

        Memory usage is bounded to ``batch_size`` rows at a time.

        Raises
        ------
        ValueError
            If *batch_size* is smaller than 1.  The check happens when this
            method is called, not on first iteration.  A size of 0 would
            make ``LIMIT 0`` return nothing and silently produce an empty
            export, and a negative size is treated by SQLite as "no limit",
            which would defeat the memory bound.
        """
        if batch_size < 1:
            raise ValueError(
                f"batch_size must be a positive integer, got {batch_size!r}"
            )
        if filters is None:
            filters = ActivityLogFilters()
        return self._iter_export_batches(filters, batch_size)

    def _iter_export_batches(
        self,
        filters: ActivityLogFilters,
        batch_size: int,
    ) -> Iterator[ActivityLogEntry]:
        """Generator behind :meth:`iter_export`; expects validated arguments."""
        # Keyset cursor: largest seq yielded so far; None means "start from the
        # very beginning".  Each batch uses "seq > ?" to advance past the rows
        # that were already yielded.
        last_seq: int | None = None

        while True:
            # The cursor value must be bound first because
            # _build_filter_clause puts extra_where ahead of all filters.
            params: list = []
            keyset: str | None = None
            if last_seq is not None:
                params.append(last_seq)
                keyset = "seq > ?"
            where_fragment = _build_filter_clause(filters, params, extra_where=keyset)
            params.append(batch_size)

            sql = self._select_page_sql(where_fragment, "ASC")

            # Hold the lock only for the bounded fetchall; release before yielding.
            with self._db._lock:  # noqa: SLF001 — internal access; no public cursor API
                rows = self._db._conn.execute(sql, tuple(params)).fetchall()  # noqa: SLF001

            if not rows:
                return

            for row in rows:
                yield ActivityLogEntry.from_row(dict(row))

            # The last row has the largest seq in this batch (ORDER BY seq ASC).
            last_seq = rows[-1]["seq"]

            if len(rows) < batch_size:
                # Fewer rows than requested → this was the final batch.
                return