feat(backup): bundle assets in ZIP + partial-write hardening + restart log

Auto-backups now produce a ZIP containing ledgrab.db plus every file
in the assets dir under assets/ — matching the manual
GET /api/v1/system/backup format, so restore accepts either output
interchangeably. Legacy .db backups remain listable, restorable, and
prunable; both extensions count toward max_backups.

Writes stage to <name>.partial then os.replace into place — a crash
mid-ZIP never leaves a half-written backup that masquerades as valid.
Stale .partials from prior crashes are swept on the next run.
Symlinks inside the assets dir are skipped so a hostile link can't
slurp a target outside the dir into every backup. Backups larger than
500 MB log a warning so operators notice unbounded asset growth before
disk fills up.

restart.py: redirect the spawned restart script's stdout/stderr to
restart.log and bail out early if the script is missing — silent
failures (PowerShell off PATH, restart.ps1 erroring) used to vanish
into a detached child with no diagnostic trail.

Tests cover happy path, asset bytes round-trip, partial cleanup,
None/missing assets_dir, failure rollback, stale-partial sweep,
symlink rejection, mixed legacy+new listing, and cross-format prune.
This commit is contained in:
2026-05-28 17:25:55 +03:00
parent e4d24a02da
commit 85da2e538d
5 changed files with 348 additions and 27 deletions
+47 -15
View File
@@ -11,6 +11,7 @@ import sys
import threading
import zipfile
from pathlib import Path
from typing import Any
from fastapi import APIRouter, Depends, File, HTTPException, UploadFile
from fastapi.responses import StreamingResponse
@@ -38,28 +39,59 @@ _SERVER_DIR = Path(__file__).resolve().parents[4]
def _schedule_restart() -> None:
"""Spawn a restart script after a short delay so the HTTP response completes."""
"""Spawn a restart script after a short delay so the HTTP response completes.
def _restart():
stdout/stderr of the spawned script are redirected to ``<server>/restart.log``
so a silent failure (PowerShell not on PATH, restart.ps1 erroring, etc.)
leaves evidence on disk instead of vanishing into a detached child.
"""
def _restart() -> None:
import time
time.sleep(1)
# Annotated as ``dict[str, Any]`` because the value union spans
# int flags (Windows ``creationflags``) and bool (POSIX
# ``start_new_session``); a narrower union confuses ``**`` unpacking.
popen_kwargs: dict[str, Any]
if sys.platform == "win32":
subprocess.Popen(
[
"powershell",
"-ExecutionPolicy",
"Bypass",
"-File",
str(_SERVER_DIR / "restart.ps1"),
],
creationflags=subprocess.DETACHED_PROCESS | subprocess.CREATE_NEW_PROCESS_GROUP,
)
script = _SERVER_DIR / "restart.ps1"
cmd = ["powershell", "-ExecutionPolicy", "Bypass", "-File", str(script)]
popen_kwargs = {
"creationflags": (
subprocess.DETACHED_PROCESS | subprocess.CREATE_NEW_PROCESS_GROUP
),
}
else:
subprocess.Popen(
["bash", str(_SERVER_DIR / "restart.sh")],
start_new_session=True,
script = _SERVER_DIR / "restart.sh"
cmd = ["bash", str(script)]
popen_kwargs = {"start_new_session": True}
if not script.is_file():
logger.error("Restart script missing: %s", script)
return
log_path = _SERVER_DIR / "restart.log"
try:
# Open in append mode so multiple restarts accumulate; the child
# owns its own duped handle, so closing here in the parent is safe.
with open(log_path, "ab") as log_file:
log_file.write(
f"\n--- restart spawned at {time.strftime('%Y-%m-%d %H:%M:%S')} ---\n".encode()
)
log_file.flush()
proc = subprocess.Popen(
cmd,
stdout=log_file,
stderr=subprocess.STDOUT,
**popen_kwargs,
)
logger.info("Restart script launched: %s (PID %s, log %s)", cmd[0], proc.pid, log_path)
except OSError as e:
logger.error("Failed to launch restart script %s: %s", script, e, exc_info=True)
except Exception as e:
logger.error("Unexpected error launching restart script: %s", e, exc_info=True)
threading.Thread(target=_restart, daemon=True).start()
+89 -11
View File
@@ -1,10 +1,12 @@
"""Auto-backup engine — periodic SQLite snapshot backups."""
"""Auto-backup engine — periodic SQLite + assets snapshot backups."""
import asyncio
import os
import tempfile
import zipfile
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import List
from typing import Iterable, List
from ledgrab.storage.database import Database
from ledgrab.utils import get_logger
@@ -20,19 +22,35 @@ DEFAULT_SETTINGS = {
# Skip the immediate-on-start backup if a recent backup exists within this window.
_STARTUP_BACKUP_COOLDOWN = timedelta(minutes=5)
_BACKUP_EXT = ".db"
# Current write format. ``.db`` is still recognised on read so backups taken
# by older versions remain listable, restorable, and prunable.
_BACKUP_EXT = ".zip"
_RECOGNISED_EXTS: tuple[str, ...] = (".zip", ".db")
# Soft warning threshold — large backups indicate an unbounded assets dir or
# bloated DB. We don't refuse to write (user data is theirs), but log loudly
# so the operator can investigate before disk fills up over many intervals.
_BACKUP_SIZE_WARN_BYTES = 500 * 1024 * 1024 # 500 MB
class AutoBackupEngine:
"""Creates periodic SQLite snapshot backups of the database."""
"""Creates periodic backups of the database and asset files.
Each backup is a ZIP archive containing ``ledgrab.db`` plus every file
from ``assets_dir`` under ``assets/`` — matching the format produced by
the manual ``GET /api/v1/system/backup`` download. The restore endpoint
accepts either ``.zip`` or ``.db`` interchangeably.
"""
def __init__(
self,
backup_dir: Path,
db: Database,
assets_dir: Path | None = None,
):
self._backup_dir = Path(backup_dir)
self._db = db
self._assets_dir = Path(assets_dir) if assets_dir else None
self._task: asyncio.Task | None = None
self._last_backup_time: datetime | None = None
@@ -82,9 +100,14 @@ class AutoBackupEngine:
self._task.cancel()
self._task = None
def _iter_backup_files(self) -> Iterable[Path]:
"""Yield every backup file (both legacy ``.db`` and current ``.zip``)."""
for ext in _RECOGNISED_EXTS:
yield from self._backup_dir.glob(f"*{ext}")
def _most_recent_backup_age(self) -> timedelta | None:
"""Return the age of the newest backup file, or None if no backups exist."""
files = list(self._backup_dir.glob(f"*{_BACKUP_EXT}"))
files = list(self._iter_backup_files())
if not files:
return None
newest = max(files, key=lambda p: p.stat().st_mtime)
@@ -124,15 +147,72 @@ class AutoBackupEngine:
timestamp = now.strftime("%Y-%m-%dT%H%M%S")
filename = f"ledgrab-backup-{timestamp}{_BACKUP_EXT}"
file_path = self._backup_dir / filename
# Stage the ZIP at <name>.partial then os.replace into place once it's
# fully written. A crash mid-write leaves a .partial file (cleaned up
# on the next backup) but never a half-written backup that would fool
# ``_most_recent_backup_age`` / ``_prune_old_backups`` into trusting
# corrupt data.
partial_path = file_path.with_suffix(file_path.suffix + ".partial")
self._db.backup_to(file_path)
# SQLite backup API → temp .db so we get a consistent snapshot
# without holding the DB lock for the ZIP write.
tmp = tempfile.NamedTemporaryFile(suffix=".db", delete=False)
tmp_path = Path(tmp.name)
tmp.close()
asset_count = 0
try:
self._db.backup_to(tmp_path)
with zipfile.ZipFile(partial_path, "w", zipfile.ZIP_DEFLATED) as zf:
zf.write(tmp_path, "ledgrab.db")
if self._assets_dir and self._assets_dir.is_dir():
for asset_file in self._assets_dir.iterdir():
# Skip symlinks: ``is_file()`` follows them and we
# don't want to silently slurp a symlink target that
# lives outside the assets dir into every backup.
if asset_file.is_symlink():
continue
if asset_file.is_file():
zf.write(asset_file, f"assets/{asset_file.name}")
asset_count += 1
os.replace(partial_path, file_path)
except Exception:
# Roll back the staged partial so it doesn't accumulate; the
# finally block still removes the SQLite temp file. Re-raise so
# the caller (``_backup_loop`` / ``trigger_backup``) sees + logs
# the failure instead of silently emitting a missing backup.
partial_path.unlink(missing_ok=True)
raise
finally:
tmp_path.unlink(missing_ok=True)
# Best-effort sweep of any older orphan .partial files left by a
# crash on a previous run.
for stale in self._backup_dir.glob("*.partial"):
try:
stale.unlink()
except OSError:
pass
size_bytes = file_path.stat().st_size
self._last_backup_time = now
logger.info(f"Backup created: {filename}")
logger.info(
"Backup created: %s (%d asset files, %.1f MB)",
filename,
asset_count,
size_bytes / (1024 * 1024),
)
if size_bytes > _BACKUP_SIZE_WARN_BYTES:
logger.warning(
"Backup %s is %.1f MB — exceeds %d MB warning threshold; "
"consider pruning the assets directory or lowering max_backups",
filename,
size_bytes / (1024 * 1024),
_BACKUP_SIZE_WARN_BYTES // (1024 * 1024),
)
def _prune_old_backups(self) -> None:
max_backups = self._settings["max_backups"]
files = sorted(self._backup_dir.glob(f"*{_BACKUP_EXT}"), key=lambda p: p.stat().st_mtime)
files = sorted(self._iter_backup_files(), key=lambda p: p.stat().st_mtime)
excess = len(files) - max_backups
if excess > 0:
for f in files[:excess]:
@@ -179,9 +259,7 @@ class AutoBackupEngine:
def list_backups(self) -> List[dict]:
backups = []
for f in sorted(
self._backup_dir.glob(f"*{_BACKUP_EXT}"), key=lambda p: p.stat().st_mtime, reverse=True
):
for f in sorted(self._iter_backup_files(), key=lambda p: p.stat().st_mtime, reverse=True):
stat = f.stat()
backups.append(
{
+1
View File
@@ -283,6 +283,7 @@ async def lifespan(app: FastAPI):
auto_backup_engine = AutoBackupEngine(
backup_dir=_data_dir / "backups",
db=db,
assets_dir=Path(config.assets.assets_dir),
)
# Create update service (checks for new releases)
@@ -0,0 +1,210 @@
"""Regression coverage for :class:`AutoBackupEngine` ZIP format.
Pins behavior added when auto-backups switched from raw ``.db`` snapshots to
``.zip`` archives (DB + assets), and the partial-write hardening that goes
with it (stage at ``<name>.partial`` then ``os.replace`` so a crash mid-write
never leaves a corrupt file masquerading as a valid backup).
"""
import zipfile
from pathlib import Path
from unittest.mock import patch
import pytest
from ledgrab.core.backup.auto_backup import AutoBackupEngine
from ledgrab.storage.database import Database
@pytest.fixture()
def engine_with_assets(tmp_path):
"""Engine wired to a small DB + assets dir with two sample files."""
db_path = tmp_path / "ledgrab.db"
assets_dir = tmp_path / "assets"
assets_dir.mkdir()
(assets_dir / "alert.wav").write_bytes(b"WAV_PAYLOAD_1")
(assets_dir / "ping.wav").write_bytes(b"WAV_PAYLOAD_2")
db = Database(db_path)
db.upsert("devices", "dev_1", "Living Room", {"id": "dev_1", "type": "mock"})
backup_dir = tmp_path / "backups"
engine = AutoBackupEngine(backup_dir=backup_dir, db=db, assets_dir=assets_dir)
yield engine
db.close()
def _only_backup(backup_dir: Path) -> Path:
"""Return the single .zip backup in ``backup_dir``; assert exactly one."""
zips = sorted(backup_dir.glob("*.zip"))
assert len(zips) == 1, f"expected exactly one .zip, found {[p.name for p in zips]}"
return zips[0]
def test_backup_bundles_db_and_assets(engine_with_assets):
"""Happy path: ZIP contains ``ledgrab.db`` plus every assets file."""
engine_with_assets._perform_backup_sync()
backup = _only_backup(engine_with_assets._backup_dir)
with zipfile.ZipFile(backup) as zf:
names = sorted(zf.namelist())
assert names == ["assets/alert.wav", "assets/ping.wav", "ledgrab.db"]
def test_backup_preserves_asset_bytes(engine_with_assets):
"""The asset binary inside the ZIP matches the source byte-for-byte."""
engine_with_assets._perform_backup_sync()
backup = _only_backup(engine_with_assets._backup_dir)
with zipfile.ZipFile(backup) as zf:
assert zf.read("assets/alert.wav") == b"WAV_PAYLOAD_1"
assert zf.read("assets/ping.wav") == b"WAV_PAYLOAD_2"
def test_backup_no_partial_file_left_on_success(engine_with_assets):
"""After a successful backup, the staging ``.partial`` is renamed away."""
engine_with_assets._perform_backup_sync()
leftovers = list(engine_with_assets._backup_dir.glob("*.partial"))
assert leftovers == []
def test_backup_skips_assets_when_dir_none(tmp_path):
"""``assets_dir=None`` produces a DB-only ZIP, no error."""
db_path = tmp_path / "ledgrab.db"
db = Database(db_path)
db.upsert("devices", "dev_1", "A", {"id": "dev_1"})
backup_dir = tmp_path / "backups"
try:
engine = AutoBackupEngine(backup_dir=backup_dir, db=db, assets_dir=None)
engine._perform_backup_sync()
backup = _only_backup(backup_dir)
with zipfile.ZipFile(backup) as zf:
assert zf.namelist() == ["ledgrab.db"]
finally:
db.close()
def test_backup_skips_assets_when_dir_missing(tmp_path):
"""A non-existent ``assets_dir`` is silently skipped, not an error."""
db_path = tmp_path / "ledgrab.db"
db = Database(db_path)
backup_dir = tmp_path / "backups"
try:
engine = AutoBackupEngine(
backup_dir=backup_dir, db=db, assets_dir=tmp_path / "does-not-exist"
)
engine._perform_backup_sync()
backup = _only_backup(backup_dir)
with zipfile.ZipFile(backup) as zf:
assert zf.namelist() == ["ledgrab.db"]
finally:
db.close()
def test_backup_failure_leaves_no_zip_or_partial(engine_with_assets):
"""When ``db.backup_to`` raises, neither the final .zip nor the .partial
survives. The exception propagates so the caller can log it.
"""
with patch.object(engine_with_assets._db, "backup_to", side_effect=RuntimeError("boom")):
with pytest.raises(RuntimeError, match="boom"):
engine_with_assets._perform_backup_sync()
assert list(engine_with_assets._backup_dir.glob("*.zip")) == []
assert list(engine_with_assets._backup_dir.glob("*.partial")) == []
def test_backup_cleans_stale_partial_from_previous_crash(engine_with_assets):
"""A leftover ``.partial`` from a prior crash is swept on the next run."""
stale = engine_with_assets._backup_dir
stale.mkdir(parents=True, exist_ok=True)
(stale / "ledgrab-backup-2025-01-01T000000.zip.partial").write_bytes(b"corrupt")
engine_with_assets._perform_backup_sync()
assert list(stale.glob("*.partial")) == []
# The successful backup is also present.
assert len(list(stale.glob("*.zip"))) == 1
def test_backup_skips_symlinks_in_assets_dir(tmp_path):
"""Symlinked files in ``assets_dir`` are not bundled (security guard)."""
db_path = tmp_path / "ledgrab.db"
assets_dir = tmp_path / "assets"
assets_dir.mkdir()
(assets_dir / "real.wav").write_bytes(b"REAL")
# Some test environments (e.g. unprivileged Windows) can't create
# symlinks; skip rather than fail spuriously when that's the case.
secret_target = tmp_path / "secret.bin"
secret_target.write_bytes(b"PRIVATE")
link_path = assets_dir / "linked.wav"
try:
link_path.symlink_to(secret_target)
except (OSError, NotImplementedError):
pytest.skip("symlinks not supported in this environment")
db = Database(db_path)
try:
backup_dir = tmp_path / "backups"
engine = AutoBackupEngine(backup_dir=backup_dir, db=db, assets_dir=assets_dir)
engine._perform_backup_sync()
backup = _only_backup(backup_dir)
with zipfile.ZipFile(backup) as zf:
names = zf.namelist()
assert "assets/real.wav" in names
assert "assets/linked.wav" not in names
finally:
db.close()
def test_list_backups_unions_legacy_db_and_new_zip(engine_with_assets):
"""``list_backups`` returns both legacy ``.db`` and current ``.zip`` files
so users can still restore from auto-backups written by older versions.
"""
backup_dir = engine_with_assets._backup_dir
backup_dir.mkdir(parents=True, exist_ok=True)
(backup_dir / "ledgrab-backup-2024-12-31T000000.db").write_bytes(b"legacy")
engine_with_assets._perform_backup_sync()
names = {b["filename"] for b in engine_with_assets.list_backups()}
assert any(n.endswith(".db") for n in names)
assert any(n.endswith(".zip") for n in names)
def test_prune_honors_max_across_formats(tmp_path):
"""``_prune_old_backups`` enforces ``max_backups`` across both extensions."""
db_path = tmp_path / "ledgrab.db"
db = Database(db_path)
backup_dir = tmp_path / "backups"
backup_dir.mkdir()
# Two legacy .db files (older), two .zip files (newer) — max=3 should
# prune the single oldest .db and keep the rest.
import time
(backup_dir / "ledgrab-backup-2024-01-01T000000.db").write_bytes(b"a")
time.sleep(0.02)
(backup_dir / "ledgrab-backup-2024-02-01T000000.db").write_bytes(b"b")
time.sleep(0.02)
(backup_dir / "ledgrab-backup-2024-03-01T000000.zip").write_bytes(b"c")
time.sleep(0.02)
(backup_dir / "ledgrab-backup-2024-04-01T000000.zip").write_bytes(b"d")
try:
engine = AutoBackupEngine(backup_dir=backup_dir, db=db, assets_dir=None)
engine._settings["max_backups"] = 3
engine._prune_old_backups()
remaining = sorted(p.name for p in engine._iter_backup_files())
assert remaining == [
"ledgrab-backup-2024-02-01T000000.db",
"ledgrab-backup-2024-03-01T000000.zip",
"ledgrab-backup-2024-04-01T000000.zip",
]
finally:
db.close()