Files
ledgrab/server/src/ledgrab/storage/activity_log_repository.py
T
alexei.dolgolyov 4a0927521a feat(activity-log): phase 4 - REST API (list/export/settings/clear)
- GET /activity-log: filtered, keyset-paginated list (categories/severities/actor/entity/date/q)
- GET /activity-log/export: streaming CSV/JSON, chunked keyset (releases DB lock per batch), CSV formula-injection guard
- GET/PUT /activity-log/settings: retention config (PUT require_authenticated)
- DELETE /activity-log: clear (require_authenticated, self-audited)
- security: export DoS fix, settings-PUT auth gate, CSV \t/\r guard, metadata-as-JSON
- 122 API tests (auth posture, CSV injection, pagination integrity, filters, settings bounds, clear-audited)
2026-06-09 20:09:46 +03:00

339 lines
12 KiB
Python

"""Append-only repository for the persistent activity / audit log.
Design notes
------------
* Does NOT subclass ``BaseSqliteStore`` — that base loads every row into an
in-memory cache at construction time, which is wrong for an unbounded,
append-heavy log.
* All SQL parameters are passed as positional ``?`` placeholders — no user
input is interpolated into the query string.
* Keyset pagination is implemented via the ``seq`` INTEGER PRIMARY KEY
AUTOINCREMENT column, which is strictly monotonic and survives rows with
identical ``ts`` values.
* The repository takes a ``Database`` instance and calls ``db.execute`` /
``db.transaction`` directly.
* Thread safety: ``Database.execute`` holds the ``RLock`` for the duration of
each call. The repository itself adds no extra locking. Callers that
originate from non-event-loop threads MUST marshal via
``loop.call_soon_threadsafe`` (that is Phase 2's responsibility).
"""
from __future__ import annotations
from datetime import datetime
from typing import Iterator
from ledgrab.storage.activity_log import ActivityLogEntry, ActivityLogFilters
from ledgrab.storage.database import Database
from ledgrab.utils import get_logger
# Module-level logger obtained from the project's ``get_logger`` helper.
logger = get_logger(__name__)
# Name of the SQLite table targeted by the SQL statements in this module.  It
# is a fixed module constant (never user input), which is why it is safe to
# interpolate it into query strings with f-strings.
_TABLE = "activity_log"
def _build_filter_clause(
filters: ActivityLogFilters,
params: list,
*,
extra_where: str | None = None,
) -> str:
"""Return a WHERE clause string and append bound parameters to *params*.
The caller is responsible for providing the opening ``WHERE`` keyword when
the returned string is non-empty, or ``AND`` when appending to an existing
predicate. This function returns only the condition fragment(s) joined by
``AND``, or an empty string when *filters* has no restrictions.
``extra_where`` is prepended (with AND) before the filter conditions if
provided (used for the ``seq < ?`` keyset predicate).
"""
conditions: list[str] = []
if extra_where:
conditions.append(extra_where)
if filters.categories:
placeholders = ",".join("?" * len(filters.categories))
conditions.append(f"category IN ({placeholders})")
params.extend(filters.categories)
if filters.severities:
placeholders = ",".join("?" * len(filters.severities))
conditions.append(f"severity IN ({placeholders})")
params.extend(filters.severities)
if filters.actor is not None:
conditions.append("actor = ?")
params.append(filters.actor)
if filters.entity_type is not None:
conditions.append("entity_type = ?")
params.append(filters.entity_type)
if filters.entity_id is not None:
conditions.append("entity_id = ?")
params.append(filters.entity_id)
if filters.since is not None:
conditions.append("ts >= ?")
params.append(filters.since.isoformat())
if filters.until is not None:
conditions.append("ts <= ?")
params.append(filters.until.isoformat())
if filters.message_like is not None:
# Escape LIKE special characters in the user-supplied substring so that
# a literal '%' or '_' in the message does not act as a wildcard.
escaped = filters.message_like.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
conditions.append("message LIKE ? ESCAPE '\\'")
params.append(f"%{escaped}%")
return " AND ".join(conditions)
class ActivityLogRepository:
    """Purpose-built repository for the ``activity_log`` table.

    Parameters
    ----------
    db:
        The shared ``Database`` singleton.  The migration that creates the
        ``activity_log`` table must have been applied before the first call to
        :meth:`record`.  Construction triggers migration via
        ``MigrationRunner`` so users can create the repository directly
        without a separate startup step.
    """

    # Column list shared by every SELECT that materialises full entries, so
    # that ``query`` and ``iter_export`` cannot drift apart.
    _ENTRY_COLUMNS = (
        "seq, id, ts, category, action, severity, actor, "
        "entity_type, entity_id, entity_name, message, metadata"
    )

    def __init__(self, db: Database) -> None:
        """Store *db* and apply any pending schema/data migrations."""
        self._db = db
        # Apply pending migrations (idempotent; most runs are no-ops).
        # NOTE(review): imported locally, presumably to avoid an import cycle
        # with the migrations package — confirm before hoisting.
        from ledgrab.storage.data_migrations import ALL_MIGRATIONS, MigrationRunner

        MigrationRunner(db).run(ALL_MIGRATIONS)

    # -- Write ---------------------------------------------------------------

    def record(self, entry: ActivityLogEntry) -> None:
        """Append *entry* to the log.

        The ``seq`` column is assigned by SQLite (AUTOINCREMENT) — ``entry``
        does not carry a ``seq`` field.

        Caller contract: this must be called from the event-loop thread (or
        from a context already serialised through the ``Database`` RLock).
        Cross-thread callers must marshal via ``loop.call_soon_threadsafe``;
        that is Phase 2's responsibility.
        """
        row = entry.to_row()
        self._db.execute(
            f"INSERT INTO {_TABLE} "
            "(id, ts, category, action, severity, actor, "
            " entity_type, entity_id, entity_name, message, metadata) "
            "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
            (
                row["id"],
                row["ts"],
                row["category"],
                row["action"],
                row["severity"],
                row["actor"],
                row["entity_type"],
                row["entity_id"],
                row["entity_name"],
                row["message"],
                row["metadata"],
            ),
        )

    # -- Read ----------------------------------------------------------------

    @classmethod
    def _select_entries_sql(cls, where_fragment: str, direction: str) -> str:
        """Build the paged entry SELECT; *direction* is ``"ASC"`` or ``"DESC"``.

        *direction* is only ever supplied by this class (never user input);
        the trailing ``LIMIT ?`` placeholder must be bound last by the caller.
        """
        where_clause = f"WHERE {where_fragment}" if where_fragment else ""
        return (
            f"SELECT {cls._ENTRY_COLUMNS} "
            f"FROM {_TABLE} "
            f"{where_clause} "
            f"ORDER BY seq {direction} "
            f"LIMIT ?"
        )

    def query(
        self,
        filters: ActivityLogFilters,
        *,
        before_seq: int | None = None,
        limit: int = 50,
    ) -> list[ActivityLogEntry]:
        """Return the newest matching entries, oldest-first within the page.

        Keyset pagination: pass the smallest ``seq`` seen on the previous page
        as *before_seq* to get the next page.  Entries are fetched in
        ``seq DESC`` order from SQLite and then reversed before returning so
        that the caller receives them in chronological order within the page.

        Parameters
        ----------
        filters:
            Filter criteria.
        before_seq:
            Exclusive upper bound on ``seq``.  ``None`` returns the first
            (newest) page.
        limit:
            Maximum number of entries to return.  ``0`` yields an empty page.

        Raises
        ------
        ValueError
            If *limit* is negative.  SQLite treats a negative ``LIMIT`` as
            "no limit", which would silently return the entire table.
        """
        if limit < 0:
            raise ValueError(f"limit must be non-negative, got {limit!r}")

        # The keyset value must be bound first because _build_filter_clause
        # places ``extra_where`` ahead of the filter conditions.
        params: list = []
        keyset: str | None = None
        if before_seq is not None:
            params.append(before_seq)
            keyset = "seq < ?"
        where_fragment = _build_filter_clause(filters, params, extra_where=keyset)
        params.append(limit)

        sql = self._select_entries_sql(where_fragment, "DESC")
        rows = self._db.execute(sql, tuple(params)).fetchall()
        # Reverse to return chronological order within the page.
        return [ActivityLogEntry.from_row(dict(row)) for row in reversed(rows)]

    def count(self, filters: ActivityLogFilters | None = None) -> int:
        """Return the number of entries matching *filters* (or all entries)."""
        if filters is None:
            filters = ActivityLogFilters()
        params: list = []
        where_fragment = _build_filter_clause(filters, params)
        where_clause = f"WHERE {where_fragment}" if where_fragment else ""
        sql = f"SELECT COUNT(*) AS cnt FROM {_TABLE} {where_clause}"
        row = self._db.execute(sql, tuple(params)).fetchone()
        return int(row["cnt"])

    # -- Maintenance ---------------------------------------------------------

    def prune(
        self,
        *,
        before_ts: datetime | None = None,
        max_entries: int | None = None,
    ) -> int:
        """Delete old / excess entries; return the total rows deleted.

        Both predicates are applied independently:

        1. Delete all rows where ``ts < before_ts`` (age-based pruning).
        2. Keep only the newest ``max_entries`` rows, deleting the rest
           (count-based pruning).  ``max_entries=0`` removes every row; a
           negative value disables this step.

        The two operations run in separate statements so that each can be
        independently enabled or disabled.
        """
        deleted = 0
        if before_ts is not None:
            cursor = self._db.execute(
                f"DELETE FROM {_TABLE} WHERE ts < ?",
                (before_ts.isoformat(),),
            )
            deleted += cursor.rowcount

        if max_entries is not None and max_entries >= 0:
            # OFFSET max_entries skips the rows we keep, so this selects the
            # newest row that must go (the (N+1)-th newest); it and everything
            # older are deleted.  No row at that offset means the table is
            # already within the limit.
            cursor = self._db.execute(
                f"SELECT seq FROM {_TABLE} ORDER BY seq DESC LIMIT 1 OFFSET ?",
                (max_entries,),
            )
            cutoff_row = cursor.fetchone()
            if cutoff_row is not None:
                cursor = self._db.execute(
                    f"DELETE FROM {_TABLE} WHERE seq <= ?",
                    (cutoff_row["seq"],),
                )
                deleted += cursor.rowcount
        return deleted

    def clear(self) -> int:
        """Delete all entries; return the number of rows deleted."""
        cursor = self._db.execute(f"DELETE FROM {_TABLE}")
        return cursor.rowcount

    def get_seq_for_id(self, entry_id: str) -> int | None:
        """Return the ``seq`` value for the entry with *entry_id*, or ``None``.

        Used by the API list endpoint to compute the keyset cursor
        (``next_before_seq``) from the oldest entry on the current page.
        """
        cursor = self._db.execute(
            f"SELECT seq FROM {_TABLE} WHERE id = ?",
            (entry_id,),
        )
        row = cursor.fetchone()
        return int(row["seq"]) if row is not None else None

    # -- Export --------------------------------------------------------------

    def iter_export(
        self,
        filters: ActivityLogFilters | None = None,
        *,
        batch_size: int = 1000,
    ) -> Iterator[ActivityLogEntry]:
        """Yield all matching entries in ascending ``seq`` order.

        Fetches rows in bounded batches (keyset-paginated by ``seq``), holding
        the DB lock only for the duration of each ``fetchall()`` and releasing
        it before yielding.  This prevents a slow/stalled export client from
        blocking all other DB operations (record, config writes, etc.) for the
        full duration of the stream.

        Memory usage is bounded to ``batch_size`` rows at a time.

        Raises
        ------
        ValueError
            If *batch_size* is less than 1.  Raised immediately at call time
            (not on first iteration): ``0`` would silently produce an empty
            export, and a negative value is an unbounded ``LIMIT`` in SQLite,
            which would load the whole table in a single locked fetch.
        """
        # Validate before anything else so a bad argument fails fast, ahead of
        # any streaming response being started by the caller.
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
        if filters is None:
            filters = ActivityLogFilters()
        return self._iter_export_batches(filters, batch_size)

    def _iter_export_batches(
        self,
        filters: ActivityLogFilters,
        batch_size: int,
    ) -> Iterator[ActivityLogEntry]:
        """Generator behind :meth:`iter_export`; assumes validated arguments."""
        # Keyset cursor: largest seq yielded so far; None means "start from the
        # very beginning".  We iterate ascending (seq ASC), so each batch uses
        # "seq > ?" to advance past the already-yielded rows.
        last_seq: int | None = None
        while True:
            # The cursor value must be bound first because
            # _build_filter_clause places extra_where as the first condition.
            params: list = []
            keyset: str | None = None
            if last_seq is not None:
                params.append(last_seq)
                keyset = "seq > ?"
            where_fragment = _build_filter_clause(filters, params, extra_where=keyset)
            params.append(batch_size)
            sql = self._select_entries_sql(where_fragment, "ASC")

            # Hold the lock only for the bounded fetchall; release before yielding.
            with self._db._lock:  # noqa: SLF001 — internal access; no public cursor API
                rows = self._db._conn.execute(sql, tuple(params)).fetchall()  # noqa: SLF001
            if not rows:
                break
            for row in rows:
                yield ActivityLogEntry.from_row(dict(row))
            # The last row has the largest seq in this batch (ORDER BY seq ASC).
            last_seq = rows[-1]["seq"]
            if len(rows) < batch_size:
                # Fewer rows than requested → this was the final batch.
                break