feat(backup): bundle assets in ZIP + partial-write hardening + restart log

Auto-backups now produce a ZIP containing ledgrab.db plus every regular
file at the top level of the assets dir, stored under assets/ — matching
the manual GET /api/v1/system/backup format, so restore accepts either
output interchangeably. Legacy .db backups remain listable, restorable,
and prunable; both extensions count toward max_backups.

Each backup is first written to <name>.partial and then moved into place
with os.replace, so a crash mid-ZIP never leaves a half-written backup
that masquerades as valid. Stale .partial files from prior crashes are
swept on the next run.
Symlinks inside the assets dir are skipped so a hostile link can't
slurp a target outside the dir into every backup. Backups larger than
500 MB log a warning so operators notice unbounded asset growth before
disk fills up.

restart.py: redirect the spawned restart script's stdout/stderr to
restart.log and bail out early if the script is missing — silent
failures (PowerShell off PATH, restart.ps1 erroring) used to vanish
into a detached child with no diagnostic trail.

Tests cover happy path, asset bytes round-trip, partial cleanup,
None/missing assets_dir, failure rollback, stale-partial sweep,
symlink rejection, mixed legacy+new listing, and cross-format prune.
This commit is contained in:
2026-05-28 17:25:55 +03:00
parent e4d24a02da
commit 85da2e538d
5 changed files with 348 additions and 27 deletions
+48 -16
View File
@@ -11,6 +11,7 @@ import sys
import threading
import zipfile
from pathlib import Path
from typing import Any
from fastapi import APIRouter, Depends, File, HTTPException, UploadFile
from fastapi.responses import StreamingResponse
@@ -38,28 +39,59 @@ _SERVER_DIR = Path(__file__).resolve().parents[4]
def _schedule_restart() -> None:
    """Spawn a restart script after a short delay so the HTTP response completes.

    The work runs on a daemon thread that sleeps for one second and then
    launches ``restart.ps1`` (via PowerShell, on ``win32``) or ``restart.sh``
    (via bash, elsewhere) from ``_SERVER_DIR``.

    stdout/stderr of the spawned script are redirected to ``<server>/restart.log``
    so a silent failure (PowerShell not on PATH, restart.ps1 erroring, etc.)
    leaves evidence on disk instead of vanishing into a detached child.

    Nothing is raised to the caller: a missing script or a failed launch is
    logged via ``logger.error`` from the background thread.
    """

    def _restart() -> None:
        """Wait briefly, then launch the platform's restart script."""
        import time

        time.sleep(1)

        # Annotated as ``dict[str, Any]`` because the value union spans
        # int flags (Windows ``creationflags``) and bool (POSIX
        # ``start_new_session``); a narrower union confuses ``**`` unpacking.
        popen_kwargs: dict[str, Any]
        if sys.platform == "win32":
            script = _SERVER_DIR / "restart.ps1"
            cmd = ["powershell", "-ExecutionPolicy", "Bypass", "-File", str(script)]
            popen_kwargs = {
                "creationflags": (
                    subprocess.DETACHED_PROCESS | subprocess.CREATE_NEW_PROCESS_GROUP
                ),
            }
        else:
            script = _SERVER_DIR / "restart.sh"
            cmd = ["bash", str(script)]
            popen_kwargs = {"start_new_session": True}

        # Bail out early: without the script there is nothing to spawn, and
        # the interpreter would only fail later inside the detached child.
        if not script.is_file():
            logger.error("Restart script missing: %s", script)
            return

        log_path = _SERVER_DIR / "restart.log"
        try:
            # Open in append mode so multiple restarts accumulate; the child
            # owns its own duped handle, so closing here in the parent is safe.
            with open(log_path, "ab") as log_file:
                log_file.write(
                    f"\n--- restart spawned at {time.strftime('%Y-%m-%d %H:%M:%S')} ---\n".encode()
                )
                # Flush the header before the child starts writing through
                # its own handle, so the marker is not emitted late from the
                # parent's buffer.
                log_file.flush()
                proc = subprocess.Popen(
                    cmd,
                    stdout=log_file,
                    stderr=subprocess.STDOUT,
                    **popen_kwargs,
                )
            logger.info("Restart script launched: %s (PID %s, log %s)", cmd[0], proc.pid, log_path)
        except OSError as e:
            logger.error("Failed to launch restart script %s: %s", script, e, exc_info=True)
        except Exception as e:
            # Last-resort boundary for the background thread: log rather than
            # let the thread die with an unreported traceback.
            logger.error("Unexpected error launching restart script: %s", e, exc_info=True)

    threading.Thread(target=_restart, daemon=True).start()
+89 -11
View File
@@ -1,10 +1,12 @@
"""Auto-backup engine — periodic SQLite snapshot backups."""
"""Auto-backup engine — periodic SQLite + assets snapshot backups."""
import asyncio
import os
import tempfile
import zipfile
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import List
from typing import Iterable, List
from ledgrab.storage.database import Database
from ledgrab.utils import get_logger
@@ -20,19 +22,35 @@ DEFAULT_SETTINGS = {
# Skip the immediate-on-start backup if a recent backup exists within this window.
_STARTUP_BACKUP_COOLDOWN = timedelta(minutes=5)
_BACKUP_EXT = ".db"
# Current write format. ``.db`` is still recognised on read so backups taken
# by older versions remain listable, restorable, and prunable.
_BACKUP_EXT = ".zip"
_RECOGNISED_EXTS: tuple[str, ...] = (".zip", ".db")
# Soft warning threshold — large backups indicate an unbounded assets dir or
# bloated DB. We don't refuse to write (user data is theirs), but log loudly
# so the operator can investigate before disk fills up over many intervals.
_BACKUP_SIZE_WARN_BYTES = 500 * 1024 * 1024 # 500 MB
class AutoBackupEngine:
"""Creates periodic SQLite snapshot backups of the database."""
"""Creates periodic backups of the database and asset files.
Each backup is a ZIP archive containing ``ledgrab.db`` plus every file
from ``assets_dir`` under ``assets/`` — matching the format produced by
the manual ``GET /api/v1/system/backup`` download. The restore endpoint
accepts either ``.zip`` or ``.db`` interchangeably.
"""
def __init__(
self,
backup_dir: Path,
db: Database,
assets_dir: Path | None = None,
):
self._backup_dir = Path(backup_dir)
self._db = db
self._assets_dir = Path(assets_dir) if assets_dir else None
self._task: asyncio.Task | None = None
self._last_backup_time: datetime | None = None
@@ -82,9 +100,14 @@ class AutoBackupEngine:
self._task.cancel()
self._task = None
def _iter_backup_files(self) -> Iterable[Path]:
    """Yield every backup file found in the backup directory.

    Each extension listed in ``_RECOGNISED_EXTS`` is globbed in turn, so
    files are produced grouped by extension in that tuple's order.
    """
    for suffix in _RECOGNISED_EXTS:
        pattern = "*" + suffix
        for candidate in self._backup_dir.glob(pattern):
            yield candidate
def _most_recent_backup_age(self) -> timedelta | None:
"""Return the age of the newest backup file, or None if no backups exist."""
files = list(self._backup_dir.glob(f"*{_BACKUP_EXT}"))
files = list(self._iter_backup_files())
if not files:
return None
newest = max(files, key=lambda p: p.stat().st_mtime)
@@ -124,15 +147,72 @@ class AutoBackupEngine:
timestamp = now.strftime("%Y-%m-%dT%H%M%S")
filename = f"ledgrab-backup-{timestamp}{_BACKUP_EXT}"
file_path = self._backup_dir / filename
# Stage the ZIP at <name>.partial then os.replace into place once it's
# fully written. A crash mid-write leaves a .partial file (cleaned up
# on the next backup) but never a half-written backup that would fool
# ``_most_recent_backup_age`` / ``_prune_old_backups`` into trusting
# corrupt data.
partial_path = file_path.with_suffix(file_path.suffix + ".partial")
self._db.backup_to(file_path)
# SQLite backup API → temp .db so we get a consistent snapshot
# without holding the DB lock for the ZIP write.
tmp = tempfile.NamedTemporaryFile(suffix=".db", delete=False)
tmp_path = Path(tmp.name)
tmp.close()
asset_count = 0
try:
self._db.backup_to(tmp_path)
with zipfile.ZipFile(partial_path, "w", zipfile.ZIP_DEFLATED) as zf:
zf.write(tmp_path, "ledgrab.db")
if self._assets_dir and self._assets_dir.is_dir():
for asset_file in self._assets_dir.iterdir():
# Skip symlinks: ``is_file()`` follows them and we
# don't want to silently slurp a symlink target that
# lives outside the assets dir into every backup.
if asset_file.is_symlink():
continue
if asset_file.is_file():
zf.write(asset_file, f"assets/{asset_file.name}")
asset_count += 1
os.replace(partial_path, file_path)
except Exception:
# Roll back the staged partial so it doesn't accumulate; the
# finally block still removes the SQLite temp file. Re-raise so
# the caller (``_backup_loop`` / ``trigger_backup``) sees + logs
# the failure instead of silently emitting a missing backup.
partial_path.unlink(missing_ok=True)
raise
finally:
tmp_path.unlink(missing_ok=True)
# Best-effort sweep of any older orphan .partial files left by a
# crash on a previous run.
for stale in self._backup_dir.glob("*.partial"):
try:
stale.unlink()
except OSError:
pass
size_bytes = file_path.stat().st_size
self._last_backup_time = now
logger.info(f"Backup created: {filename}")
logger.info(
"Backup created: %s (%d asset files, %.1f MB)",
filename,
asset_count,
size_bytes / (1024 * 1024),
)
if size_bytes > _BACKUP_SIZE_WARN_BYTES:
logger.warning(
"Backup %s is %.1f MB — exceeds %d MB warning threshold; "
"consider pruning the assets directory or lowering max_backups",
filename,
size_bytes / (1024 * 1024),
_BACKUP_SIZE_WARN_BYTES // (1024 * 1024),
)
def _prune_old_backups(self) -> None:
max_backups = self._settings["max_backups"]
files = sorted(self._backup_dir.glob(f"*{_BACKUP_EXT}"), key=lambda p: p.stat().st_mtime)
files = sorted(self._iter_backup_files(), key=lambda p: p.stat().st_mtime)
excess = len(files) - max_backups
if excess > 0:
for f in files[:excess]:
@@ -179,9 +259,7 @@ class AutoBackupEngine:
def list_backups(self) -> List[dict]:
backups = []
for f in sorted(
self._backup_dir.glob(f"*{_BACKUP_EXT}"), key=lambda p: p.stat().st_mtime, reverse=True
):
for f in sorted(self._iter_backup_files(), key=lambda p: p.stat().st_mtime, reverse=True):
stat = f.stat()
backups.append(
{
+1
View File
@@ -283,6 +283,7 @@ async def lifespan(app: FastAPI):
auto_backup_engine = AutoBackupEngine(
backup_dir=_data_dir / "backups",
db=db,
assets_dir=Path(config.assets.assets_dir),
)
# Create update service (checks for new releases)