feat(activity-log): phase 4 - REST API (list/export/settings/clear)

- GET /activity-log: filtered, keyset-paginated list (categories/severities/actor/entity/date/q)
- GET /activity-log/export: streaming CSV/JSON, chunked keyset (releases DB lock per batch), CSV formula-injection guard
- GET/PUT /activity-log/settings: retention config (PUT require_authenticated)
- DELETE /activity-log: clear (require_authenticated, self-audited)
- security: export DoS fix, settings-PUT auth gate, CSV \t/\r guard, metadata-as-JSON
- 122 API tests (auth posture, CSV injection, pagination integrity, filters, settings bounds, clear-audited)
2026-06-09 20:09:46 +03:00
parent 25c613c5cb
commit 4a0927521a
9 changed files with 2594 additions and 34 deletions
@@ -38,6 +38,7 @@ from .routes.snapshot import router as snapshot_router
 from .routes.graph import router as graph_router
 from .routes.calibration import router as calibration_router
 from .routes.setup import router as setup_router
+from .routes.activity_log import router as activity_log_router

 router = APIRouter()
 router.include_router(system_router)
@@ -76,5 +77,6 @@ router.include_router(snapshot_router)
 router.include_router(graph_router)
 router.include_router(calibration_router)
 router.include_router(setup_router)
+router.include_router(activity_log_router)

 __all__ = ["router"]
@@ -0,0 +1,436 @@
+"""Activity-log REST API — query / filter / export / settings / clear.
+
+Endpoints
+---------
+GET    /api/v1/activity-log           List (filterable, keyset-paginated)
+GET    /api/v1/activity-log/export    Streaming CSV or JSON export
+GET    /api/v1/activity-log/settings  Retention settings
+PUT    /api/v1/activity-log/settings  Update retention settings (requires non-anonymous auth)
+DELETE /api/v1/activity-log           Clear all entries (requires non-anonymous auth)
+
+Auth posture
+------------
+- List + read settings (``GET``): ``AuthRequired`` (loopback-anonymous is fine).
+- Export, update settings (``PUT``), and clear: ``require_authenticated()``
+  (loopback-anonymous is rejected; mirrors the backup download / secret-reveal
+  pattern from ``backup.py``).  Updating settings can disable auditing or prune
+  the trail, so it is gated like the destructive clear.
+
+CSV injection
+-------------
+Cells that begin with =, +, -, @, TAB, or CR can trigger formula execution in
+spreadsheet apps (OWASP Formula Injection).  ``_csv_safe`` prefixes any such cell
+with a single quote so formulas are inert.  Fields already go through
+``sanitize_display`` in Phase 3 instrumentation, but the CSV writer applies its
+own guard as defence-in-depth.
+
+Export generator + lock
+-----------------------
+``repo.iter_export()`` fetches rows in bounded batches, holding the DB ``_lock``
+only around each batch fetch and releasing it before yielding — so a slow or
+stalled client never blocks other DB operations.  The ``StreamingResponse``
+generator is wrapped in a ``try/finally`` block so the batch generator is closed
+even when the client disconnects mid-stream.
+"""
+
+from __future__ import annotations
+
+import csv
+import io
+import json
+from datetime import datetime, timezone
+from typing import Annotated, Iterator
+
+from fastapi import APIRouter, Depends, Query
+from fastapi.responses import StreamingResponse
+
+from ledgrab.api.auth import AuthRequired, require_authenticated
+from ledgrab.api.dependencies import (
+    get_activity_log_repo,
+    get_activity_log_retention_engine,
+    get_activity_recorder,
+)
+from ledgrab.api.schemas.activity_log import (
+    ActivityLogPageResponse,
+    ActivityLogSettingsResponse,
+    UpdateActivityLogSettingsRequest,
+)
+from ledgrab.core.activity_log.recorder import ActivityRecorder, entry_to_dict
+from ledgrab.core.activity_log.retention import ActivityLogRetentionEngine
+from ledgrab.storage.activity_log import ActivityCategory, ActivityLogFilters, ActivitySeverity
+from ledgrab.storage.activity_log_repository import ActivityLogRepository
+
+router = APIRouter(prefix="/api/v1/activity-log", tags=["Activity Log"])
+
+# Hard cap on the per-request limit to prevent runaway queries.
+_MAX_LIMIT = 200
+_DEFAULT_LIMIT = 50
+
+# CSV export columns (matches entry_to_dict key order)
+_CSV_COLUMNS = [
+    "id",
+    "ts",
+    "category",
+    "action",
+    "severity",
+    "actor",
+    "entity_type",
+    "entity_id",
+    "entity_name",
+    "message",
+    "metadata",
+]
+
+# Characters that trigger formula injection in spreadsheet apps (OWASP).
+# Leading TAB and CR are also recognised triggers by Excel / Google Sheets.
+_FORMULA_PREFIXES = ("=", "+", "-", "@", "\t", "\r")
+
+
+def _csv_safe(value: str) -> str:
+    """Neutralise a CSV cell that a spreadsheet could evaluate as a formula.
+
+    A cell whose first character is ``=``, ``+``, ``-``, ``@``, TAB, or CR
+    (every entry of ``_FORMULA_PREFIXES``) is returned with a single quote
+    prepended; any other value, including the empty string, is returned
+    unchanged.
+
+    Args:
+        value: Cell text about to be written to a CSV row.
+
+    Returns:
+        ``value`` itself, or ``"'" + value`` when it starts with a trigger
+        character.
+    """
+    # str.startswith accepts a tuple of prefixes; the truthiness guard keeps
+    # empty (or otherwise falsy) input flowing through untouched.
+    if value and value.startswith(_FORMULA_PREFIXES):
+        return "'" + value
+    return value
+
+
+def _build_filters(
+    categories: list[str] | None,
+    severities: list[str] | None,
+    actor: str | None,
+    entity_type: str | None,
+    entity_id: str | None,
+    since: datetime | None,
+    until: datetime | None,
+    q: str | None,
+) -> ActivityLogFilters:
+    """Translate raw query parameters into an ``ActivityLogFilters`` object.
+
+    Falsy list/string parameters (``None``, ``[]``, ``""``) are collapsed to
+    ``None``; the ``since`` / ``until`` bounds are forwarded unchanged.  The
+    free-text parameter ``q`` is mapped onto the ``message_like`` field.
+    """
+    optional_fields = {
+        "categories": categories,
+        "severities": severities,
+        "actor": actor,
+        "entity_type": entity_type,
+        "entity_id": entity_id,
+        "message_like": q,
+    }
+    # "value or None" turns empty strings and empty lists into None.
+    collapsed = {name: (value or None) for name, value in optional_fields.items()}
+    return ActivityLogFilters(since=since, until=until, **collapsed)
+
+
+# ---------------------------------------------------------------------------
+# GET /api/v1/activity-log  — list
+# ---------------------------------------------------------------------------
+
+
+@router.get("", response_model=ActivityLogPageResponse, summary="List activity-log entries")
+def list_activity_log(
+    auth: AuthRequired,  # noqa: ARG001
+    repo: ActivityLogRepository = Depends(get_activity_log_repo),
+    # ── Filters ────────────────────────────────────────────────────────────
+    # NOTE(review): the description below advertises comma-separated values,
+    # but this handler forwards the list as-is and does no splitting — confirm
+    # the filter/repository layer handles "a,b" or adjust the description.
+    categories: Annotated[
+        list[str] | None,
+        Query(
+            description=(
+                "Filter by category (repeatable or comma-separated). "
+                "Values: auth, device, entity, capture, system"
+            )
+        ),
+    ] = None,
+    severities: Annotated[
+        list[str] | None,
+        Query(description="Filter by severity (repeatable). Values: info, warning, error"),
+    ] = None,
+    actor: Annotated[
+        str | None,
+        Query(description="Filter by actor label (exact match)"),
+    ] = None,
+    entity_type: Annotated[
+        str | None,
+        Query(description="Filter by entity type (exact match)"),
+    ] = None,
+    entity_id: Annotated[
+        str | None,
+        Query(description="Filter by entity id (exact match)"),
+    ] = None,
+    since: Annotated[
+        datetime | None,
+        Query(description="Return entries at or after this ISO-8601 datetime"),
+    ] = None,
+    until: Annotated[
+        datetime | None,
+        Query(description="Return entries at or before this ISO-8601 datetime"),
+    ] = None,
+    q: Annotated[
+        str | None,
+        Query(description="Free-text search in the message field (substring)"),
+    ] = None,
+    # ── Pagination ─────────────────────────────────────────────────────────
+    before_seq: Annotated[
+        int | None,
+        Query(
+            description=(
+                "Keyset cursor: pass the 'next_before_seq' from the previous page "
+                "to get the following (older) page. Omit for the first (newest) page."
+            )
+        ),
+    ] = None,
+    limit: Annotated[
+        int,
+        Query(
+            ge=1,
+            le=_MAX_LIMIT,
+            description=f"Max entries per page (default {_DEFAULT_LIMIT}, max {_MAX_LIMIT})",
+        ),
+    ] = _DEFAULT_LIMIT,
+) -> ActivityLogPageResponse:
+    """Return the newest matching entries, oldest-first within the page.
+
+    Keyset pagination: the response includes ``next_before_seq`` — pass it
+    as ``before_seq`` in the next request to get the next (older) page.
+    The ``total`` field is the count of all entries matching the current
+    filters across all pages.
+
+    Returns:
+        An ``ActivityLogPageResponse`` with at most ``limit`` entries,
+        ``has_more`` set when an older page exists, and ``next_before_seq``
+        set only in that case (``None`` on the last page).
+    """
+    filters = _build_filters(categories, severities, actor, entity_type, entity_id, since, until, q)
+
+    # Fetch limit+1 rows to detect whether an older page exists.
+    #
+    # query() fetches DESC internally (newest-first) then reverses to ascending.
+    # With limit+1, the result is ascending: [oldest_probe, ..., newest].
+    # When we got exactly limit+1 rows, has_more is True and the probe row
+    # (index 0 — the oldest) is the extra one.  We keep the newest `limit` rows
+    # by slicing [1:], which is the actual page content for the client.
+    # When we got <= limit rows, this is the last page and all rows are included.
+    #
+    # The min() is a defensive clamp: Query(le=_MAX_LIMIT) above already
+    # rejects larger values when the handler is invoked through FastAPI.
+    effective_limit = min(limit, _MAX_LIMIT)
+    entries_plus = repo.query(filters, before_seq=before_seq, limit=effective_limit + 1)
+    has_more = len(entries_plus) > effective_limit
+    if has_more:
+        # Drop the oldest probe row; keep the newest `limit` entries.
+        entries = entries_plus[1:]
+    else:
+        entries = entries_plus
+
+    # Counted with the same filters but without the cursor, so the value
+    # covers every page rather than only the rows older than before_seq.
+    total = repo.count(filters)
+
+    # Compute next_before_seq: the seq of the oldest entry on this page.
+    # query() returns entries ascending (entries[0] is oldest); its seq is the
+    # cursor for the next page.  The next request passes before_seq=X to get
+    # entries with seq < X, i.e. entries older than the oldest entry on this page.
+    # get_seq_for_id() does a cheap indexed point-lookup.
+    next_before_seq: int | None = None
+    if has_more and entries:
+        next_before_seq = repo.get_seq_for_id(entries[0].id)
+
+    return ActivityLogPageResponse(
+        entries=[entry_to_dict(e) for e in entries],  # type: ignore[arg-type]
+        next_before_seq=next_before_seq,
+        has_more=has_more,
+        total=total,
+    )
+
+
+# ---------------------------------------------------------------------------
+# GET /api/v1/activity-log/export  — streaming export (CSV or JSON)
+# ---------------------------------------------------------------------------
+
+
+def _export_csv_generator(
+    repo: ActivityLogRepository,
+    filters: ActivityLogFilters,
+) -> Iterator[bytes]:
+    """Yield the export as UTF-8-encoded CSV: the header, then one row per chunk.
+
+    Every cell is passed through ``_csv_safe``; the ``metadata`` column is
+    serialised as a JSON object (``{}`` when absent) and all other columns
+    are stringified, with missing/falsy values written as an empty cell.
+
+    The ``try/finally`` closes the ``repo.iter_export()`` generator when the
+    stream ends early, e.g. when a client disconnect raises ``GeneratorExit``
+    at a ``yield``.
+
+    Args:
+        repo: Repository whose ``iter_export`` supplies the entries.
+        filters: Filters forwarded unchanged to ``iter_export``.
+
+    Yields:
+        Encoded CSV lines, header first.
+    """
+
+    def _encode_row(cells: list[str]) -> bytes:
+        # One-line purpose: render a single CSV record as UTF-8 bytes.
+        # A fresh buffer per record keeps each yielded chunk self-contained.
+        buf = io.StringIO()
+        csv.writer(buf).writerow(cells)
+        return buf.getvalue().encode("utf-8")
+
+    gen = repo.iter_export(filters)
+    try:
+        # Header
+        yield _encode_row(_CSV_COLUMNS)
+
+        for entry in gen:
+            d = entry_to_dict(entry)
+            row = []
+            for col in _CSV_COLUMNS:
+                if col == "metadata":
+                    # default=str matches the JSON exporter: a value that is
+                    # not JSON-native is stringified instead of raising
+                    # TypeError halfway through an already-started download.
+                    cell = json.dumps(d.get(col) or {}, default=str)
+                else:
+                    cell = str(d.get(col, "") or "")
+                row.append(_csv_safe(cell))
+            yield _encode_row(row)
+    finally:
+        gen.close()
+
+
+def _export_json_generator(
+    repo: ActivityLogRepository,
+    filters: ActivityLogFilters,
+) -> Iterator[bytes]:
+    """Stream the export as one JSON array, emitting a chunk per entry.
+
+    Layout: ``[\\n{entry},\\n{entry},\\n...]\\n`` — the opening bracket, each
+    serialised entry (every entry after the first is preceded by ``,\\n``),
+    then the closing bracket.  The source generator from
+    ``repo.iter_export()`` is closed in ``finally`` so an early client
+    disconnect does not leave it dangling.
+    """
+    source = repo.iter_export(filters)
+    try:
+        yield b"[\n"
+        # Empty before the first entry, ",\n" before every later one.
+        separator = b""
+        for record in source:
+            payload = json.dumps(entry_to_dict(record), ensure_ascii=False, default=str)
+            yield separator + payload.encode("utf-8")
+            separator = b",\n"
+        yield b"\n]\n"
+    finally:
+        source.close()
+
+
+@router.get("/export", summary="Export activity-log entries (streaming CSV or JSON)")
+def export_activity_log(
+    auth: AuthRequired,
+    repo: ActivityLogRepository = Depends(get_activity_log_repo),
+    # ── Format ────────────────────────────────────────────────────────────
+    format: Annotated[
+        str,
+        Query(description="Export format: 'csv' or 'json'"),
+    ] = "csv",
+    # ── Same filters as list ───────────────────────────────────────────────
+    categories: Annotated[list[str] | None, Query()] = None,
+    severities: Annotated[list[str] | None, Query()] = None,
+    actor: Annotated[str | None, Query()] = None,
+    entity_type: Annotated[str | None, Query()] = None,
+    entity_id: Annotated[str | None, Query()] = None,
+    since: Annotated[datetime | None, Query()] = None,
+    until: Annotated[datetime | None, Query()] = None,
+    q: Annotated[str | None, Query()] = None,
+) -> StreamingResponse:
+    """Stream every entry matching the filters as a CSV or JSON download.
+
+    ``require_authenticated`` is applied first, so loopback-anonymous callers
+    are rejected (the log may contain IP addresses and entity names).  A
+    ``format`` other than ``csv`` / ``json`` is answered with HTTP 422.  The
+    response carries a ``Content-Disposition`` attachment header with a
+    UTC-timestamped file name.
+    """
+    require_authenticated(auth)
+
+    if format != "csv" and format != "json":
+        from fastapi import HTTPException
+
+        raise HTTPException(
+            status_code=422,
+            detail="'format' must be 'csv' or 'json'",
+        )
+
+    criteria = _build_filters(categories, severities, actor, entity_type, entity_id, since, until, q)
+    stamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H%M%S")
+
+    # Pick file name, content type and row generator for the chosen format.
+    wants_csv = format == "csv"
+    download_name = f"activity-log-{stamp}.csv" if wants_csv else f"activity-log-{stamp}.json"
+    content_type = "text/csv; charset=utf-8" if wants_csv else "application/json"
+    make_stream = _export_csv_generator if wants_csv else _export_json_generator
+
+    return StreamingResponse(
+        make_stream(repo, criteria),
+        media_type=content_type,
+        headers={"Content-Disposition": f'attachment; filename="{download_name}"'},
+    )
+
+
+# ---------------------------------------------------------------------------
+# GET /api/v1/activity-log/settings
+# PUT /api/v1/activity-log/settings
+# ---------------------------------------------------------------------------
+
+
+@router.get(
+    "/settings",
+    response_model=ActivityLogSettingsResponse,
+    summary="Get activity-log retention settings",
+)
+def get_activity_log_settings(
+    _: AuthRequired,
+    engine: ActivityLogRetentionEngine = Depends(get_activity_log_retention_engine),
+) -> ActivityLogSettingsResponse:
+    """Report the retention settings currently held by the retention engine."""
+    # get_settings() supplies the keyword arguments for the response model.
+    current = engine.get_settings()
+    return ActivityLogSettingsResponse(**current)
+
+
+@router.put(
+    "/settings",
+    response_model=ActivityLogSettingsResponse,
+    summary="Update activity-log retention settings",
+)
+async def update_activity_log_settings(
+    auth: AuthRequired,
+    body: UpdateActivityLogSettingsRequest,
+    engine: ActivityLogRetentionEngine = Depends(get_activity_log_retention_engine),
+) -> ActivityLogSettingsResponse:
+    """Replace the activity-log retention settings and return the new values.
+
+    Loopback-anonymous callers are rejected via ``require_authenticated``:
+    disabling the log or tightening retention has the same impact as clearing
+    the audit trail.
+
+    NOTE(review): the audit entry written when ``enabled=false`` (recorded
+    before the flag takes effect, so the last entry shows who disabled
+    recording) is the responsibility of ``engine.update_settings`` — it is
+    not performed in this handler; confirm against the engine.
+    """
+    require_authenticated(auth)
+    requested = {
+        "enabled": body.enabled,
+        "max_days": body.max_days,
+        "max_entries": body.max_entries,
+    }
+    applied = await engine.update_settings(**requested)
+    return ActivityLogSettingsResponse(**applied)
+
+
+# ---------------------------------------------------------------------------
+# DELETE /api/v1/activity-log  — clear
+# ---------------------------------------------------------------------------
+
+
+@router.delete("", summary="Clear all activity-log entries")
+def clear_activity_log(
+    auth: AuthRequired,
+    repo: ActivityLogRepository = Depends(get_activity_log_repo),
+    recorder: ActivityRecorder = Depends(get_activity_recorder),
+) -> dict:
+    """Delete all activity-log entries.
+
+    Requires a non-anonymous API key (loopback-anonymous access is rejected).
+    The clear operation itself is audited — an entry with category
+    ``ActivityCategory.SYSTEM`` and action ``activity_log.cleared`` is
+    recorded AFTER the wipe, so the log shows who cleared it and how many
+    rows were removed.
+
+    Returns ``{"deleted": <count>}``, where the count is the value returned
+    by ``repo.clear()``.
+    """
+    require_authenticated(auth)
+
+    deleted = repo.clear()
+
+    # Record the clear action (best-effort — recorder never raises).
+    # The authenticated principal is passed straight through as the actor.
+    recorder.record(
+        category=ActivityCategory.SYSTEM,
+        action="activity_log.cleared",
+        severity=ActivitySeverity.INFO,
+        actor=auth,
+        message=f"Activity log cleared ({deleted} entries removed)",
+        metadata={"deleted_count": deleted},
+    )
+
+    return {"deleted": deleted}
@@ -0,0 +1,93 @@
+"""Pydantic schemas for the activity-log API (Phase 4)."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+
+# ---------------------------------------------------------------------------
+# Entry + page response
+# ---------------------------------------------------------------------------
+
+
+class ActivityLogEntryResponse(BaseModel):
+    """Single audit-log entry.
+
+    Shape matches ``entry_to_dict()`` from
+    ``ledgrab.core.activity_log.recorder`` exactly — that function is the
+    single source of truth for serialisation; this schema documents the wire
+    format.
+
+    The three ``entity_*`` fields are optional and default to ``None``;
+    ``metadata`` defaults to an empty dict.  All remaining fields are
+    required strings.
+    """
+
+    id: str = Field(description="Entry id — 'al_<8-hex>'")
+    ts: str = Field(description="ISO-8601 UTC timestamp")
+    category: str = Field(description="Broad bucket (auth, device, entity, capture, system)")
+    action: str = Field(description="Verb-object label, e.g. 'entity.created'")
+    severity: str = Field(description="info | warning | error")
+    actor: str = Field(description="API-key label or 'system' / 'anonymous'")
+    # Entity context is only present for events that concern a specific entity.
+    entity_type: str | None = Field(default=None, description="Affected entity type, if applicable")
+    entity_id: str | None = Field(default=None, description="Affected entity id, if applicable")
+    entity_name: str | None = Field(
+        default=None, description="Entity name at time of event, if applicable"
+    )
+    message: str = Field(description="Human-readable description")
+    metadata: dict[str, Any] = Field(default_factory=dict, description="Extra structured context")
+
+
+class ActivityLogPageResponse(BaseModel):
+    """Paginated list of audit-log entries (keyset cursor).
+
+    ``next_before_seq`` is the only field with a default (``None``);
+    ``entries``, ``has_more`` and ``total`` must always be supplied.
+    """
+
+    entries: list[ActivityLogEntryResponse] = Field(description="Entries on this page")
+    # Cursor for the next (older) page; stays None when no further page exists.
+    next_before_seq: int | None = Field(
+        default=None,
+        description=(
+            "Pass as 'before_seq' in the next request to get the following page. "
+            "None when this is the last page."
+        ),
+    )
+    has_more: bool = Field(
+        description="True when there are more entries before the first entry on this page"
+    )
+    total: int = Field(description="Total entries matching the current filters (all pages)")
+
+
+# ---------------------------------------------------------------------------
+# Settings
+# ---------------------------------------------------------------------------
+
+_MAX_DAYS_CAP = 3650  # 10 years — sanity upper bound
+_MAX_ENTRIES_CAP = 10_000_000  # 10 M rows — sanity upper bound
+
+
+class ActivityLogSettingsResponse(BaseModel):
+    """Current activity-log retention settings.
+
+    NOTE(review): the ``ge`` / ``le`` bounds below are declared on a
+    *response* model, so they are presumably checked whenever the model is
+    built from stored settings — confirm that persisted values can never fall
+    outside these caps, otherwise building the response would fail validation.
+    """
+
+    enabled: bool = Field(description="Whether the activity log is recording")
+    # 0 is a sentinel meaning "no age-based pruning"; upper bound is _MAX_DAYS_CAP.
+    max_days: int = Field(
+        ge=0,
+        le=_MAX_DAYS_CAP,
+        description="Retain entries for at most this many days (0 = no age-based pruning)",
+    )
+    # 0 is a sentinel meaning "no count-based pruning"; upper bound is _MAX_ENTRIES_CAP.
+    max_entries: int = Field(
+        ge=0,
+        le=_MAX_ENTRIES_CAP,
+        description="Keep at most this many entries (0 = no count-based pruning)",
+    )
+
+
+class UpdateActivityLogSettingsRequest(BaseModel):
+    """Request body for PUT /settings.
+
+    None of the three fields declares a default, so a request must supply
+    ``enabled``, ``max_days`` and ``max_entries`` together.  The numeric
+    fields are bounded by ``0`` and the module-level caps
+    (``_MAX_DAYS_CAP`` / ``_MAX_ENTRIES_CAP``).
+    """
+
+    enabled: bool = Field(description="Enable or disable activity-log recording")
+    # 0 disables age-based pruning rather than meaning "keep nothing".
+    max_days: int = Field(
+        ge=0,
+        le=_MAX_DAYS_CAP,
+        description="Retain entries for at most this many days (0 = no age-based pruning)",
+    )
+    # 0 disables count-based pruning rather than meaning "keep nothing".
+    max_entries: int = Field(
+        ge=0,
+        le=_MAX_ENTRIES_CAP,
+        description="Keep at most this many entries (0 = no count-based pruning)",
+    )
@@ -20,7 +20,6 @@ Design notes

 from __future__ import annotations

-import sqlite3
 from datetime import datetime
 from typing import Iterator

@@ -260,34 +259,80 @@ class ActivityLogRepository:
        cursor = self._db.execute(f"DELETE FROM {_TABLE}")
        return cursor.rowcount

+    def get_seq_for_id(self, entry_id: str) -> int | None:
+        """Look up the ``seq`` of the entry identified by *entry_id*.
+
+        The list endpoint uses this to turn the oldest entry on a page into
+        the keyset cursor (``next_before_seq``).
+
+        Returns:
+            The entry's ``seq`` as an ``int``, or ``None`` when no row has
+            that id.
+        """
+        match = self._db.execute(
+            f"SELECT seq FROM {_TABLE} WHERE id = ?",
+            (entry_id,),
+        ).fetchone()
+        if match is None:
+            return None
+        return int(match["seq"])
+
    # -- Export --------------------------------------------------------------

-    def iter_export(self, filters: ActivityLogFilters | None = None) -> Iterator[ActivityLogEntry]:
+    def iter_export(
+        self,
+        filters: ActivityLogFilters | None = None,
+        *,
+        batch_size: int = 1000,
+    ) -> Iterator[ActivityLogEntry]:
        """Yield all matching entries in ascending ``seq`` order.

-        Uses a server-side cursor so the entire result set is never loaded
-        into memory — safe for large tables.  The connection's ``RLock`` is
-        held for the duration of the iteration; callers should consume this
-        iterator promptly.
+        Fetches rows in bounded batches (keyset-paginated by ``seq``), holding
+        the DB lock only for the duration of each ``fetchall()`` and releasing
+        it before yielding.  This prevents a slow/stalled export client from
+        blocking all other DB operations (record, config writes, etc.) for the
+        full duration of the stream.
+
+        Memory usage is bounded to ``batch_size`` rows at a time.
        """
        if filters is None:
            filters = ActivityLogFilters()

-        params: list = []
-        where_fragment = _build_filter_clause(filters, params)
-        where_clause = f"WHERE {where_fragment}" if where_fragment else ""
+        # Keyset cursor: largest seq yielded so far; None means "start from the
+        # very beginning".  We iterate ascending (seq ASC), so each batch uses
+        # "seq > ?" to advance past the already-yielded rows.
+        cursor_seq: int | None = None

-        sql = (
-            f"SELECT seq, id, ts, category, action, severity, actor, "
-            f"entity_type, entity_id, entity_name, message, metadata "
-            f"FROM {_TABLE} "
-            f"{where_clause} "
-            f"ORDER BY seq ASC"
-        )
+        while True:
+            # Build params list: cursor_seq placeholder must come first because
+            # _build_filter_clause prepends extra_where as the first condition.
+            params: list = []
+            if cursor_seq is not None:
+                params.append(cursor_seq)
+                keyset: str | None = "seq > ?"
+            else:
+                keyset = None
+            where_fragment = _build_filter_clause(filters, params, extra_where=keyset)
+            where_clause = f"WHERE {where_fragment}" if where_fragment else ""
+            params.append(batch_size)

-        # Use the raw connection directly to get a streaming cursor.
-        # We borrow the lock for the full iteration.
-        with self._db._lock:  # noqa: SLF001 — internal access; no public cursor API
-            cursor: sqlite3.Cursor = self._db._conn.execute(sql, tuple(params))  # noqa: SLF001
-            for row in cursor:
+            sql = (
+                f"SELECT seq, id, ts, category, action, severity, actor, "
+                f"entity_type, entity_id, entity_name, message, metadata "
+                f"FROM {_TABLE} "
+                f"{where_clause} "
+                f"ORDER BY seq ASC "
+                f"LIMIT ?"
+            )
+
+            # Hold the lock only for the bounded fetchall; release before yielding.
+            with self._db._lock:  # noqa: SLF001 — internal access; no public cursor API
+                rows = self._db._conn.execute(sql, tuple(params)).fetchall()  # noqa: SLF001
+
+            if not rows:
+                break
+
+            for row in rows:
                yield ActivityLogEntry.from_row(dict(row))
+
+            # The last row has the largest seq in this batch (ORDER BY seq ASC).
+            cursor_seq = rows[-1]["seq"]
+
+            if len(rows) < batch_size:
+                # Fewer rows than requested → this was the final batch.
+                break