feat(activity-log): phase 1 - storage model, migration, repository

- ActivityLogEntry dataclass + ActivityCategory/ActivitySeverity + ActivityLogFilters
- additive idempotent migration 002_add_activity_log (indexed activity_log table, seq keyset tiebreaker)
- ActivityLogRepository (record/query/count/prune/clear/iter_export), keyset pagination, parameterized SQL
- 102 unit + adversarial tests (SQL-injection, pagination, prune, codec, migration idempotency)
This commit is contained in:
2026-06-09 17:40:37 +03:00
parent 1afe7d6fcc
commit 1ac4a0f66d
8 changed files with 2100 additions and 18 deletions
+171
View File
@@ -0,0 +1,171 @@
"""Activity / audit log data model.
Defines the ``ActivityLogEntry`` dataclass together with its category and
severity enumerations and the ``ActivityLogFilters`` query object. Row-level
codec (``to_row`` / ``from_row``) converts between the dataclass and a flat
SQLite column dict.
"""
from __future__ import annotations
import json
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Sequence
# ---------------------------------------------------------------------------
# Enumerations (string constants, not enum.Enum, to stay consistent with the
# rest of the codebase which uses plain string literals)
# ---------------------------------------------------------------------------
class ActivityCategory:
"""Valid ``category`` values for an ``ActivityLogEntry``."""
AUTH = "auth"
DEVICE = "device"
ENTITY = "entity"
CAPTURE = "capture"
SYSTEM = "system"
ALL: tuple[str, ...] = (AUTH, DEVICE, ENTITY, CAPTURE, SYSTEM)
class ActivitySeverity:
"""Valid ``severity`` values for an ``ActivityLogEntry``."""
INFO = "info"
WARNING = "warning"
ERROR = "error"
ALL: tuple[str, ...] = (INFO, WARNING, ERROR)
# ---------------------------------------------------------------------------
# Entry dataclass
# ---------------------------------------------------------------------------
@dataclass
class ActivityLogEntry:
"""One immutable audit record.
Fields
------
id Stable application-assigned identifier, e.g. ``al_<uuid8>``.
ts UTC timestamp of the event (server-assigned at record time).
category Broad bucket — one of :class:`ActivityCategory`.
action Verb-object label, e.g. ``"entity.created"`` or ``"auth.rejected"``.
severity One of :class:`ActivitySeverity`.
actor Who triggered the action, e.g. an API-key label or ``"system"``.
entity_type Optional: kind of entity involved, e.g. ``"output_target"``.
entity_id Optional: stable entity identifier.
entity_name Optional: human-readable entity name at the time of the event.
message Human-readable description suitable for display.
metadata Small structured context (device address, error code, …).
Must be JSON-serialisable; defaults to empty dict.
"""
id: str
ts: datetime
category: str
action: str
severity: str
actor: str
message: str
entity_type: str | None = None
entity_id: str | None = None
entity_name: str | None = None
metadata: dict = field(default_factory=dict)
# -- Row codec -----------------------------------------------------------
def to_row(self) -> dict:
"""Return a flat dict suitable for a parameterised SQLite INSERT.
``ts`` is stored as an ISO-8601 string (UTC). ``metadata`` is stored
as a JSON string. ``seq`` is DB-assigned and is NOT included.
"""
return {
"id": self.id,
"ts": self.ts.isoformat(),
"category": self.category,
"action": self.action,
"severity": self.severity,
"actor": self.actor,
"entity_type": self.entity_type,
"entity_id": self.entity_id,
"entity_name": self.entity_name,
"message": self.message,
"metadata": json.dumps(self.metadata, ensure_ascii=False),
}
@staticmethod
def from_row(row: dict) -> "ActivityLogEntry":
"""Reconstruct an ``ActivityLogEntry`` from a SQLite row dict.
``row`` may include the ``seq`` column — it is ignored here (callers
that need ``seq`` for keyset pagination access it directly from the
row before calling this method).
"""
raw_meta = row.get("metadata") or "{}"
try:
metadata = json.loads(raw_meta)
except (json.JSONDecodeError, TypeError):
metadata = {}
raw_ts = row["ts"]
ts = datetime.fromisoformat(raw_ts)
# Ensure tz-aware UTC even if stored without offset (legacy rows)
if ts.tzinfo is None:
ts = ts.replace(tzinfo=timezone.utc)
return ActivityLogEntry(
id=row["id"],
ts=ts,
category=row["category"],
action=row["action"],
severity=row["severity"],
actor=row["actor"],
entity_type=row.get("entity_type"),
entity_id=row.get("entity_id"),
entity_name=row.get("entity_name"),
message=row["message"],
metadata=metadata,
)
# ---------------------------------------------------------------------------
# Filters dataclass
# ---------------------------------------------------------------------------
@dataclass
class ActivityLogFilters:
"""Optional query-time filters for :class:`ActivityLogRepository`.
All fields are optional. ``None`` / empty sequence means "no restriction
on this dimension".
Fields
------
categories Restrict to entries whose ``category`` is in this set.
severities Restrict to entries whose ``severity`` is in this set.
actor Exact match on the ``actor`` field.
entity_type Exact match on the ``entity_type`` field.
entity_id Exact match on the ``entity_id`` field.
since Inclusive lower bound on ``ts`` (``ts >= since``).
until Inclusive upper bound on ``ts`` (``ts <= until``).
message_like Free-text substring match on ``message`` (LIKE ``%value%``).
The value is escaped — no SQL injection risk.
"""
categories: Sequence[str] | None = None
severities: Sequence[str] | None = None
actor: str | None = None
entity_type: str | None = None
entity_id: str | None = None
since: datetime | None = None
until: datetime | None = None
message_like: str | None = None
@@ -0,0 +1,293 @@
"""Append-only repository for the persistent activity / audit log.
Design notes
------------
* Does NOT subclass ``BaseSqliteStore`` — that base loads every row into an
in-memory cache at construction time, which is wrong for an unbounded,
append-heavy log.
* All SQL parameters are passed as positional ``?`` placeholders — no user
input is interpolated into the query string.
* Keyset pagination is implemented via the ``seq`` INTEGER PRIMARY KEY
AUTOINCREMENT column, which is strictly monotonic and survives rows with
identical ``ts`` values.
* The repository takes a ``Database`` instance and calls ``db.execute`` /
``db.transaction`` directly.
* Thread safety: ``Database.execute`` holds the ``RLock`` for the duration of
each call. The repository itself adds no extra locking. Callers that
originate from non-event-loop threads MUST marshal via
``loop.call_soon_threadsafe`` (that is Phase 2's responsibility).
"""
from __future__ import annotations
import sqlite3
from datetime import datetime
from typing import Iterator
from ledgrab.storage.activity_log import ActivityLogEntry, ActivityLogFilters
from ledgrab.storage.database import Database
from ledgrab.utils import get_logger
logger = get_logger(__name__)
_TABLE = "activity_log"
def _build_filter_clause(
filters: ActivityLogFilters,
params: list,
*,
extra_where: str | None = None,
) -> str:
"""Return a WHERE clause string and append bound parameters to *params*.
The caller is responsible for providing the opening ``WHERE`` keyword when
the returned string is non-empty, or ``AND`` when appending to an existing
predicate. This function returns only the condition fragment(s) joined by
``AND``, or an empty string when *filters* has no restrictions.
``extra_where`` is prepended (with AND) before the filter conditions if
provided (used for the ``seq < ?`` keyset predicate).
"""
conditions: list[str] = []
if extra_where:
conditions.append(extra_where)
if filters.categories:
placeholders = ",".join("?" * len(filters.categories))
conditions.append(f"category IN ({placeholders})")
params.extend(filters.categories)
if filters.severities:
placeholders = ",".join("?" * len(filters.severities))
conditions.append(f"severity IN ({placeholders})")
params.extend(filters.severities)
if filters.actor is not None:
conditions.append("actor = ?")
params.append(filters.actor)
if filters.entity_type is not None:
conditions.append("entity_type = ?")
params.append(filters.entity_type)
if filters.entity_id is not None:
conditions.append("entity_id = ?")
params.append(filters.entity_id)
if filters.since is not None:
conditions.append("ts >= ?")
params.append(filters.since.isoformat())
if filters.until is not None:
conditions.append("ts <= ?")
params.append(filters.until.isoformat())
if filters.message_like is not None:
# Escape LIKE special characters in the user-supplied substring so that
# a literal '%' or '_' in the message does not act as a wildcard.
escaped = filters.message_like.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
conditions.append("message LIKE ? ESCAPE '\\'")
params.append(f"%{escaped}%")
return " AND ".join(conditions)
class ActivityLogRepository:
"""Purpose-built repository for the ``activity_log`` table.
Parameters
----------
db:
The shared ``Database`` singleton. The migration that creates the
``activity_log`` table must have been applied before the first call to
:meth:`record`. Construction triggers migration via
``MigrationRunner`` so users can create the repository directly
without a separate startup step.
"""
def __init__(self, db: Database) -> None:
self._db = db
# Apply pending migrations (idempotent; most runs are no-ops)
from ledgrab.storage.data_migrations import ALL_MIGRATIONS, MigrationRunner
MigrationRunner(db).run(ALL_MIGRATIONS)
# -- Write ---------------------------------------------------------------
def record(self, entry: ActivityLogEntry) -> None:
"""Append *entry* to the log.
The ``seq`` column is assigned by SQLite (AUTOINCREMENT) — ``entry``
does not carry a ``seq`` field.
Caller contract: this must be called from the event-loop thread (or
from a context already serialised through the ``Database`` RLock).
Cross-thread callers must marshal via ``loop.call_soon_threadsafe``;
that is Phase 2's responsibility.
"""
row = entry.to_row()
self._db.execute(
f"INSERT INTO {_TABLE} "
f"(id, ts, category, action, severity, actor, "
f" entity_type, entity_id, entity_name, message, metadata) "
f"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
(
row["id"],
row["ts"],
row["category"],
row["action"],
row["severity"],
row["actor"],
row["entity_type"],
row["entity_id"],
row["entity_name"],
row["message"],
row["metadata"],
),
)
# -- Read ----------------------------------------------------------------
def query(
self,
filters: ActivityLogFilters,
*,
before_seq: int | None = None,
limit: int = 50,
) -> list[ActivityLogEntry]:
"""Return the newest matching entries, oldest-first within the page.
Keyset pagination: pass the smallest ``seq`` seen on the previous page
as *before_seq* to get the next page. Entries are fetched in
``seq DESC`` order from SQLite and then reversed before returning so
that the caller receives them in chronological order within the page.
Parameters
----------
filters:
Optional filter criteria.
before_seq:
Exclusive upper bound on ``seq``. ``None`` returns the first
(newest) page.
limit:
Maximum number of entries to return.
"""
params: list = []
keyset = "seq < ?" if before_seq is not None else None
if before_seq is not None:
params.append(before_seq)
where_fragment = _build_filter_clause(filters, params, extra_where=keyset)
where_clause = f"WHERE {where_fragment}" if where_fragment else ""
params.append(limit)
sql = (
f"SELECT seq, id, ts, category, action, severity, actor, "
f"entity_type, entity_id, entity_name, message, metadata "
f"FROM {_TABLE} "
f"{where_clause} "
f"ORDER BY seq DESC "
f"LIMIT ?"
)
cursor = self._db.execute(sql, tuple(params))
rows = cursor.fetchall()
# Reverse to return chronological order within the page
return [ActivityLogEntry.from_row(dict(row)) for row in reversed(rows)]
def count(self, filters: ActivityLogFilters | None = None) -> int:
"""Return the number of entries matching *filters* (or all entries)."""
if filters is None:
filters = ActivityLogFilters()
params: list = []
where_fragment = _build_filter_clause(filters, params)
where_clause = f"WHERE {where_fragment}" if where_fragment else ""
sql = f"SELECT COUNT(*) AS cnt FROM {_TABLE} {where_clause}"
cursor = self._db.execute(sql, tuple(params))
row = cursor.fetchone()
return int(row["cnt"])
# -- Maintenance ---------------------------------------------------------
def prune(
self,
*,
before_ts: datetime | None = None,
max_entries: int | None = None,
) -> int:
"""Delete old / excess entries; return the total rows deleted.
Both predicates are applied independently:
1. Delete all rows where ``ts < before_ts`` (age-based pruning).
2. Keep only the newest ``max_entries`` rows, deleting the rest
(count-based pruning).
The two operations run in separate statements so that each can be
independently enabled or disabled.
"""
deleted = 0
if before_ts is not None:
cursor = self._db.execute(
f"DELETE FROM {_TABLE} WHERE ts < ?",
(before_ts.isoformat(),),
)
deleted += cursor.rowcount
if max_entries is not None and max_entries >= 0:
# Find the seq of the Nth newest row; delete everything older than it.
cursor = self._db.execute(
f"SELECT seq FROM {_TABLE} ORDER BY seq DESC LIMIT 1 OFFSET ?",
(max_entries,),
)
cutoff_row = cursor.fetchone()
if cutoff_row is not None:
cutoff_seq = cutoff_row["seq"]
cursor = self._db.execute(
f"DELETE FROM {_TABLE} WHERE seq <= ?",
(cutoff_seq,),
)
deleted += cursor.rowcount
return deleted
def clear(self) -> int:
"""Delete all entries; return the number of rows deleted."""
cursor = self._db.execute(f"DELETE FROM {_TABLE}")
return cursor.rowcount
# -- Export --------------------------------------------------------------
def iter_export(self, filters: ActivityLogFilters | None = None) -> Iterator[ActivityLogEntry]:
"""Yield all matching entries in ascending ``seq`` order.
Uses a server-side cursor so the entire result set is never loaded
into memory — safe for large tables. The connection's ``RLock`` is
held for the duration of the iteration; callers should consume this
iterator promptly.
"""
if filters is None:
filters = ActivityLogFilters()
params: list = []
where_fragment = _build_filter_clause(filters, params)
where_clause = f"WHERE {where_fragment}" if where_fragment else ""
sql = (
f"SELECT seq, id, ts, category, action, severity, actor, "
f"entity_type, entity_id, entity_name, message, metadata "
f"FROM {_TABLE} "
f"{where_clause} "
f"ORDER BY seq ASC"
)
# Use the raw connection directly to get a streaming cursor.
# We borrow the lock for the full iteration.
with self._db._lock: # noqa: SLF001 — internal access; no public cursor API
cursor: sqlite3.Cursor = self._db._conn.execute(sql, tuple(params)) # noqa: SLF001
for row in cursor:
yield ActivityLogEntry.from_row(dict(row))
@@ -213,7 +213,57 @@ class StaticToSingleColorMigration(DataMigration):
return rows_changed
class AddActivityLogTableMigration(DataMigration):
"""Create the ``activity_log`` table and its indexes.
This is a purely additive migration — it does not touch any existing table.
``CREATE TABLE / INDEX IF NOT EXISTS`` ensures idempotency if the migration
somehow runs twice (e.g. after a partial restore).
"""
name = "002_add_activity_log"
def apply(self, conn: sqlite3.Connection) -> int:
conn.execute(
"""
CREATE TABLE IF NOT EXISTS activity_log (
seq INTEGER PRIMARY KEY AUTOINCREMENT,
id TEXT UNIQUE NOT NULL,
ts TEXT NOT NULL,
category TEXT NOT NULL,
action TEXT NOT NULL,
severity TEXT NOT NULL,
actor TEXT NOT NULL,
entity_type TEXT,
entity_id TEXT,
entity_name TEXT,
message TEXT NOT NULL,
metadata TEXT NOT NULL DEFAULT '{}'
)
"""
)
# Primary read path: newest-first keyset pagination
conn.execute(
"CREATE INDEX IF NOT EXISTS idx_activity_log_ts_seq "
"ON activity_log (ts DESC, seq DESC)"
)
# Filter indexes
conn.execute(
"CREATE INDEX IF NOT EXISTS idx_activity_log_category " "ON activity_log (category)"
)
conn.execute(
"CREATE INDEX IF NOT EXISTS idx_activity_log_severity " "ON activity_log (severity)"
)
conn.execute("CREATE INDEX IF NOT EXISTS idx_activity_log_actor " "ON activity_log (actor)")
conn.execute(
"CREATE INDEX IF NOT EXISTS idx_activity_log_entity "
"ON activity_log (entity_type, entity_id)"
)
return 0
# Master list — ORDER MATTERS. Append new migrations; never reorder.
ALL_MIGRATIONS: list[DataMigration] = [
StaticToSingleColorMigration(),
AddActivityLogTableMigration(),
]