feat(backup): harden restore — strict tar, two-phase rollback, degraded state
- Gate flag flipped synchronously in restore route before body parse, closing race where concurrent requests could slip through during awaits
- Strict tar extraction rejects symlinks, hardlinks, absolute paths, and parent-segment traversal entries
- Staging directory moved to a sibling of the uploads dir so atomic renames stay on the same filesystem (Windows %TEMP% / Linux tmpfs were causing EXDEV)
- Two-phase atomic-rename rollback for uploads — never rm -rf the live dir before the safety copy is back in place; degraded flag set if rollback can't recover cleanly
- Prisma reconnect failure now marks the process degraded; hooks.server.ts returns 503 to everything except /api/health so orchestrators can recycle
- /api/health distinguishes ok / restoring / degraded / db_down (503s)
- Legacy .db restore now runs a structural SQLite integrity check before the swap
- Schema-version check tightened: null on either side requires an explicit allowSchemaMismatch override (was silently treated as a match)
- HMR/multi-import-safe global state (Vite dev reload no longer creates a fresh module while a restore is mid-flight)
- VACUUM INTO path: defensive rejection of quote/control characters
- Backup filename regex requires a leading alphanumeric (rejects '.tar.gz', '....db' which passed the previous loose pattern)
- Download: RFC 5987 Content-Disposition with filename* + sanitized fallback
- Restore route logs a BACKUP_FAILED audit row with phase on failure
This commit is contained in:
+33
-11
@@ -6,7 +6,7 @@ import * as apiTokenService from '$lib/server/services/apiTokenService.js';
|
||||
import { extractBearerToken } from '$lib/server/middleware/authenticate.js';
|
||||
import { isBoardGuestAccessible } from '$lib/server/middleware/guestAccess.js';
|
||||
import { initBackupScheduler } from '$lib/server/jobs/backupScheduler.js';
|
||||
import { isRestoring } from '$lib/server/services/backupService.js';
|
||||
import { isRestoring, isDegraded, getDegradedReason } from '$lib/server/services/backupService.js';
|
||||
import { startScheduler as startHealthcheckScheduler } from '$lib/server/jobs/healthcheckScheduler.js';
|
||||
import {
|
||||
clearSessionCookies,
|
||||
@@ -53,18 +53,20 @@ function isPublicPath(pathname: string): boolean {
|
||||
}
|
||||
|
||||
export const handle: Handle = async ({ event, resolve }) => {
|
||||
const reqPath = event.url.pathname;
|
||||
|
||||
// While a restore is mid-flight, Prisma is disconnected and the live DB
|
||||
// file is being swapped. Any other request that touches the DB would
|
||||
// crash; return 503 instead. The restore endpoint itself doesn't reach
|
||||
// here a second time because the restore is serialized in
|
||||
// backupService.restoreBackup (the _restoring flag is set inside it).
|
||||
// file (and uploads tree) is being swapped. Any other request that
|
||||
// touches the DB or the uploads dir would crash; return 503 instead.
|
||||
//
|
||||
// Whitelist: bundled SvelteKit assets (immutable, served from disk paths
|
||||
// that are not affected by restore) and /api/health (so liveness probes
|
||||
// can still observe the degraded state). /uploads/ is NOT whitelisted —
|
||||
// uploaded files live in the dir being renamed and concurrent reads on
|
||||
// Windows can block the rename outright.
|
||||
if (isRestoring()) {
|
||||
const { pathname: path } = event.url;
|
||||
const isPublicAsset =
|
||||
path.startsWith('/_app/') ||
|
||||
path.startsWith('/favicon') ||
|
||||
path === '/api/health';
|
||||
if (!isPublicAsset) {
|
||||
const isBundledAsset = reqPath.startsWith('/_app/') || reqPath.startsWith('/favicon');
|
||||
if (!(isBundledAsset || reqPath === '/api/health')) {
|
||||
return new Response(
|
||||
JSON.stringify({
|
||||
success: false,
|
||||
@@ -82,6 +84,26 @@ export const handle: Handle = async ({ event, resolve }) => {
|
||||
}
|
||||
}
|
||||
|
||||
// After a failed restore + failed rollback the process is in an unknown
|
||||
// state. Return 503 for everything except the health endpoint so the
|
||||
// orchestrator can observe and recycle the container.
|
||||
if (isDegraded() && reqPath !== '/api/health') {
|
||||
return new Response(
|
||||
JSON.stringify({
|
||||
success: false,
|
||||
data: null,
|
||||
error: `Service degraded: ${getDegradedReason() ?? 'unknown reason'}. Restart required.`
|
||||
}),
|
||||
{
|
||||
status: 503,
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
'Retry-After': '60'
|
||||
}
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
event.locals.user = null;
|
||||
event.locals.session = null;
|
||||
event.locals.apiTokenScope = null;
|
||||
|
||||
@@ -7,8 +7,8 @@ import * as tar from 'tar';
|
||||
|
||||
// --- Prisma + uploads mocks --------------------------------------------------
|
||||
//
|
||||
// backupService imports prisma which validates env. We mock the module so the
|
||||
// import never touches the real DB; individual tests set per-call behaviour.
|
||||
// backupService imports prisma (which validates env). We mock both prisma and
|
||||
// the uploads helper so the SUT runs entirely off the test's temp dirs.
|
||||
|
||||
const reapplyPragmasMock = vi.fn(async () => undefined);
|
||||
const executeRawUnsafeMock = vi.fn(async (sql: string): Promise<number> => {
|
||||
@@ -16,7 +16,6 @@ const executeRawUnsafeMock = vi.fn(async (sql: string): Promise<number> => {
|
||||
// integrity checks succeed.
|
||||
const match = sql.match(/VACUUM INTO '(.+?)'/);
|
||||
if (match) {
|
||||
// 4096-byte pages — matches SQLite default. Use 8 pages.
|
||||
const pageSize = 4096;
|
||||
const pages = 8;
|
||||
const header = Buffer.alloc(100);
|
||||
@@ -55,18 +54,21 @@ let tmpRoot: string;
|
||||
let backupDir: string;
|
||||
let uploadsDir: string;
|
||||
let dbDir: string;
|
||||
let dbFilePath: string;
|
||||
|
||||
vi.mock('../../utils/uploads.js', () => ({
|
||||
getUploadsDir: () => uploadsDir
|
||||
}));
|
||||
|
||||
// Now import the SUT — after the mocks are in place.
|
||||
const importService = async () => await import('../backupService.js');
|
||||
|
||||
/**
 * Seed the test's uploads directory with a tiny fixture tree:
 * `icon.svg` at the root and `wallpapers/sky.jpg` one level down.
 *
 * Each fixture file is written exactly once (the JPEG stub used to be
 * written twice with identical bytes).
 */
async function makeUploadsTree(): Promise<void> {
	await fsp.mkdir(path.join(uploadsDir, 'wallpapers'), { recursive: true });
	await fsp.writeFile(path.join(uploadsDir, 'icon.svg'), '<svg/>');
	// Only the first three bytes of a JPEG (SOI marker + 0xFF) — enough for a
	// distinguishable binary file.
	await fsp.writeFile(
		path.join(uploadsDir, 'wallpapers', 'sky.jpg'),
		Buffer.from([0xff, 0xd8, 0xff])
	);
}
|
||||
|
||||
async function listEntries(file: string): Promise<string[]> {
|
||||
@@ -78,30 +80,91 @@ async function listEntries(file: string): Promise<string[]> {
|
||||
return entries;
|
||||
}
|
||||
|
||||
/**
 * Build a minimal buffer that looks like a SQLite database file: a 100-byte
 * header carrying the 16-byte magic string and a big-endian page size at
 * offset 16, zero-padded so the total length is exactly four 4096-byte pages.
 */
function validSqliteBytes(): Buffer {
	const headerLength = 100;
	const bytesPerPage = 4096;
	const pageCount = 4;

	const head = Buffer.alloc(headerLength);
	// "SQLite format 3" followed by a NUL terminator — 16 bytes in total.
	head.write('SQLite format 3\0', 0, 'latin1');
	head.writeUInt16BE(bytesPerPage, 16);

	const padding = Buffer.alloc(bytesPerPage * pageCount - headerLength);
	return Buffer.concat([head, padding]);
}
|
||||
|
||||
/**
 * Write a gzip'd tar backup fixture into `backupDir` and return its filename.
 *
 * The archive is assembled in a throw-away work directory under the OS temp
 * dir; that directory is removed in a `finally` so a failing write or
 * `tar.create` call does not leak it.
 *
 * @param opts.manifest       Written as `manifest.json` when not `undefined`.
 * @param opts.dbBytes        Written as `database.db` when provided.
 * @param opts.includeUploads When true, adds `uploads/a.svg` and `uploads/b.png`.
 * @param opts.filename       Archive name; defaults to a timestamp plus a
 *                            random hex suffix.
 * @returns The archive's filename (not its full path).
 */
async function writeTarballBackup(opts: {
	manifest?: unknown;
	dbBytes?: Buffer;
	includeUploads?: boolean;
	filename?: string;
}): Promise<string> {
	const filename =
		opts.filename ?? `backup-${Date.now()}-${crypto.randomBytes(2).toString('hex')}.tar.gz`;
	const work = await fsp.mkdtemp(path.join(os.tmpdir(), 'wal-mk-'));
	try {
		if (opts.manifest !== undefined) {
			await fsp.writeFile(
				path.join(work, 'manifest.json'),
				JSON.stringify(opts.manifest, null, 2),
				'utf8'
			);
		}
		if (opts.dbBytes) {
			await fsp.writeFile(path.join(work, 'database.db'), opts.dbBytes);
		}
		if (opts.includeUploads) {
			await fsp.mkdir(path.join(work, 'uploads'), { recursive: true });
			await fsp.writeFile(path.join(work, 'uploads', 'a.svg'), '<svg/>');
			await fsp.writeFile(
				path.join(work, 'uploads', 'b.png'),
				Buffer.from([0x89, 0x50, 0x4e, 0x47])
			);
		}
		// Archive whatever ended up in the work dir, relative to it.
		const entries = await fsp.readdir(work);
		await tar.create({ cwd: work, gzip: true, file: path.join(backupDir, filename) }, entries);
	} finally {
		await fsp.rm(work, { recursive: true, force: true });
	}
	return filename;
}
|
||||
|
||||
/**
 * Build a version-1 backup manifest for the given database bytes.
 *
 * @param db            Raw database contents; its SHA-256 digest and length
 *                      are recorded in the manifest.
 * @param schemaVersion Value stored as `schemaVersion`; defaults to
 *                      `'test_migration'`, and `null` is passed through as-is.
 * @returns A plain manifest object (typed `unknown` for callers).
 */
function manifestFor(db: Buffer, schemaVersion: string | null = 'test_migration'): unknown {
	const sha256Hex = crypto.createHash('sha256').update(db).digest('hex');
	const checksums = { 'database.db': `sha256:${sha256Hex}` };
	const manifest = {
		version: '1',
		createdAt: new Date().toISOString(),
		appVersion: '0.1.0',
		schemaVersion,
		dbSize: db.length,
		uploadFileCount: 0,
		checksums
	};
	return manifest;
}
|
||||
|
||||
beforeEach(async () => {
|
||||
tmpRoot = await fsp.mkdtemp(path.join(os.tmpdir(), 'wal-bs-test-'));
|
||||
backupDir = path.join(tmpRoot, 'backups');
|
||||
uploadsDir = path.join(tmpRoot, 'uploads');
|
||||
dbDir = path.join(tmpRoot, 'db');
|
||||
dbFilePath = path.join(dbDir, 'test.db');
|
||||
await fsp.mkdir(backupDir, { recursive: true });
|
||||
await fsp.mkdir(uploadsDir, { recursive: true });
|
||||
await fsp.mkdir(dbDir, { recursive: true });
|
||||
|
||||
process.env.BACKUPS_DIR = backupDir;
|
||||
// Use an absolute file: URL so getDatabasePath's path.resolve treats it
|
||||
// as already-absolute and skips the prisma/ prefix.
|
||||
process.env.DATABASE_URL = `file:${path.join(dbDir, 'test.db').replace(/\\/g, '/')}`;
|
||||
// Absolute file: URL so getDatabasePath treats it as already-absolute.
|
||||
process.env.DATABASE_URL = `file:${dbFilePath.replace(/\\/g, '/')}`;
|
||||
// Pretend the live DB exists so createBackup's disk-space check has data.
|
||||
await fsp.writeFile(path.join(dbDir, 'test.db'), Buffer.alloc(4096));
|
||||
await fsp.writeFile(dbFilePath, validSqliteBytes());
|
||||
|
||||
executeRawUnsafeMock.mockClear();
|
||||
queryRawUnsafeMock.mockClear();
|
||||
queryRawUnsafeMock.mockImplementation(async (_sql: string) => [
|
||||
{ migration_name: 'test_migration' }
|
||||
]);
|
||||
disconnectMock.mockClear();
|
||||
disconnectMock.mockImplementation(async () => undefined);
|
||||
connectMock.mockClear();
|
||||
connectMock.mockImplementation(async () => undefined);
|
||||
sessionDeleteManyMock.mockClear();
|
||||
sessionDeleteManyMock.mockImplementation(async () => ({ count: 0 }));
|
||||
reapplyPragmasMock.mockClear();
|
||||
|
||||
// reset live DB-path resolver: backupService reads DATABASE_URL each call.
|
||||
// Reset cross-test globalThis state (restoring/degraded/stats).
|
||||
const g = globalThis as unknown as { __walBackupState?: unknown };
|
||||
delete g.__walBackupState;
|
||||
});
|
||||
|
||||
afterEach(async () => {
|
||||
@@ -116,7 +179,7 @@ describe('backupService — listing & path safety', () => {
|
||||
await fsp.writeFile(path.join(backupDir, 'backup-2026-01-01T00-00-00.tar.gz'), 'a');
|
||||
await fsp.writeFile(path.join(backupDir, 'backup-2026-03-01T00-00-00.tar.gz'), 'b');
|
||||
await fsp.writeFile(path.join(backupDir, 'backup-2025-12-31T23-59-59.db'), 'c');
|
||||
await fsp.writeFile(path.join(backupDir, 'unrelated.txt'), 'noise'); // should be filtered
|
||||
await fsp.writeFile(path.join(backupDir, 'unrelated.txt'), 'noise');
|
||||
|
||||
const { listBackups } = await importService();
|
||||
const list = listBackups();
|
||||
@@ -130,12 +193,18 @@ describe('backupService — listing & path safety', () => {
|
||||
expect(list.find((b) => b.filename.endsWith('.db'))?.format).toBe('db');
|
||||
});
|
||||
|
||||
it('getBackupFilePath rejects path traversal', async () => {
|
||||
it('getBackupFilePath rejects path traversal and dot-only basenames', async () => {
|
||||
const { getBackupFilePath } = await importService();
|
||||
expect(getBackupFilePath('../../etc/passwd')).toBeNull();
|
||||
expect(getBackupFilePath('subdir/foo.tar.gz')).toBeNull();
|
||||
expect(getBackupFilePath('foo.txt')).toBeNull();
|
||||
expect(getBackupFilePath('foo.tar.gz.exe')).toBeNull();
|
||||
// Dot-only basenames before the legitimate extension:
|
||||
expect(getBackupFilePath('.tar.gz')).toBeNull();
|
||||
expect(getBackupFilePath('..tar.gz')).toBeNull();
|
||||
expect(getBackupFilePath('....db')).toBeNull();
|
||||
expect(getBackupFilePath('-leading-dash.tar.gz')).toBeNull();
|
||||
expect(getBackupFilePath('_leading-underscore.tar.gz')).toBeNull();
|
||||
});
|
||||
|
||||
it('getBackupFilePath returns null for missing files', async () => {
|
||||
@@ -143,10 +212,17 @@ describe('backupService — listing & path safety', () => {
|
||||
expect(getBackupFilePath('does-not-exist.tar.gz')).toBeNull();
|
||||
});
|
||||
|
||||
it('getBackupFilePath accepts legitimate filenames', async () => {
|
||||
const goodName = 'backup-2026-05-28T10-00-00.tar.gz';
|
||||
await fsp.writeFile(path.join(backupDir, goodName), 'x');
|
||||
const { getBackupFilePath } = await importService();
|
||||
expect(getBackupFilePath(goodName)).toBe(path.join(backupDir, goodName));
|
||||
});
|
||||
|
||||
it('deleteBackup silently rejects bad filenames', async () => {
|
||||
const { deleteBackup } = await importService();
|
||||
expect(deleteBackup('../escape.tar.gz')).toBe(false);
|
||||
expect(deleteBackup('legit.tar.gz')).toBe(false); // missing
|
||||
expect(deleteBackup('legit.tar.gz')).toBe(false);
|
||||
});
|
||||
|
||||
it('enforceRetention keeps the N newest', async () => {
|
||||
@@ -160,8 +236,7 @@ describe('backupService — listing & path safety', () => {
|
||||
for (const n of names) await fsp.writeFile(path.join(backupDir, n), 'x');
|
||||
|
||||
const { enforceRetention, listBackups } = await importService();
|
||||
const deleted = enforceRetention(2);
|
||||
expect(deleted).toBe(3);
|
||||
expect(enforceRetention(2)).toBe(3);
|
||||
expect(listBackups().map((b) => b.filename)).toEqual([
|
||||
'backup-2026-05-01T00-00-00.tar.gz',
|
||||
'backup-2026-04-01T00-00-00.tar.gz'
|
||||
@@ -174,6 +249,18 @@ describe('backupService — listing & path safety', () => {
|
||||
});
|
||||
});
|
||||
|
||||
describe('backupService — beginRestoreWindow / endRestoreWindow', () => {
|
||||
it('flips the isRestoring flag synchronously and blocks concurrent windows', async () => {
|
||||
const svc = await importService();
|
||||
expect(svc.isRestoring()).toBe(false);
|
||||
svc.beginRestoreWindow();
|
||||
expect(svc.isRestoring()).toBe(true);
|
||||
expect(() => svc.beginRestoreWindow()).toThrow(/already in progress/);
|
||||
svc.endRestoreWindow();
|
||||
expect(svc.isRestoring()).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe('backupService — createBackup', () => {
|
||||
it('produces a tar.gz containing manifest, database.db and uploads tree', async () => {
|
||||
await makeUploadsTree();
|
||||
@@ -190,7 +277,6 @@ describe('backupService — createBackup', () => {
|
||||
expect(entries).toContain('database.db');
|
||||
expect(entries.some((e) => e.startsWith('uploads/'))).toBe(true);
|
||||
|
||||
// Extract and validate manifest
|
||||
const extractDir = await fsp.mkdtemp(path.join(os.tmpdir(), 'wal-verify-'));
|
||||
await tar.extract({ cwd: extractDir, file: archivePath });
|
||||
const manifest = JSON.parse(
|
||||
@@ -209,46 +295,7 @@ describe('backupService — createBackup', () => {
|
||||
});
|
||||
});
|
||||
|
||||
describe('backupService — restoreBackup', () => {
|
||||
async function writeTarballBackup(opts: {
|
||||
manifest?: unknown;
|
||||
dbBytes?: Buffer;
|
||||
includeUploads?: boolean;
|
||||
filename?: string;
|
||||
}) {
|
||||
const filename = opts.filename ?? `backup-${Date.now()}.tar.gz`;
|
||||
const work = await fsp.mkdtemp(path.join(os.tmpdir(), 'wal-mk-'));
|
||||
if (opts.manifest !== undefined) {
|
||||
await fsp.writeFile(
|
||||
path.join(work, 'manifest.json'),
|
||||
JSON.stringify(opts.manifest, null, 2),
|
||||
'utf8'
|
||||
);
|
||||
}
|
||||
if (opts.dbBytes) {
|
||||
await fsp.writeFile(path.join(work, 'database.db'), opts.dbBytes);
|
||||
}
|
||||
if (opts.includeUploads) {
|
||||
await fsp.mkdir(path.join(work, 'uploads'), { recursive: true });
|
||||
await fsp.writeFile(path.join(work, 'uploads', 'a.svg'), '<svg/>');
|
||||
}
|
||||
const entries = await fsp.readdir(work);
|
||||
await tar.create({ cwd: work, gzip: true, file: path.join(backupDir, filename) }, entries);
|
||||
await fsp.rm(work, { recursive: true, force: true });
|
||||
return filename;
|
||||
}
|
||||
|
||||
function validSqliteBytes(): Buffer {
|
||||
const pageSize = 4096;
|
||||
const pages = 4;
|
||||
const header = Buffer.alloc(100);
|
||||
Buffer.from([
|
||||
0x53, 0x51, 0x4c, 0x69, 0x74, 0x65, 0x20, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x20, 0x33, 0x00
|
||||
]).copy(header, 0);
|
||||
header.writeUInt16BE(pageSize, 16);
|
||||
return Buffer.concat([header, Buffer.alloc(pageSize * pages - 100)]);
|
||||
}
|
||||
|
||||
describe('backupService — restoreBackup validation', () => {
|
||||
it('rejects non-existent backup', async () => {
|
||||
const { restoreBackup } = await importService();
|
||||
await expect(restoreBackup('not-there.tar.gz')).rejects.toThrow(/not found/i);
|
||||
@@ -306,48 +353,221 @@ describe('backupService — restoreBackup', () => {
|
||||
|
||||
it('aborts on schema version mismatch unless overridden', async () => {
|
||||
const db = validSqliteBytes();
|
||||
const hash = crypto.createHash('sha256').update(db).digest('hex');
|
||||
const filename = await writeTarballBackup({
|
||||
manifest: {
|
||||
version: '1',
|
||||
createdAt: '',
|
||||
appVersion: '',
|
||||
schemaVersion: 'OLD_migration',
|
||||
dbSize: db.length,
|
||||
uploadFileCount: 0,
|
||||
checksums: { 'database.db': `sha256:${hash}` }
|
||||
},
|
||||
manifest: manifestFor(db, 'OLD_migration'),
|
||||
dbBytes: db
|
||||
});
|
||||
const { restoreBackup } = await importService();
|
||||
await expect(restoreBackup(filename)).rejects.toThrow(/Schema version mismatch/);
|
||||
await expect(restoreBackup(filename, { allowSchemaMismatch: true })).resolves.toMatchObject({
|
||||
restored: true,
|
||||
format: 'tar.gz',
|
||||
schemaVersionMatched: false
|
||||
});
|
||||
});
|
||||
|
||||
it('aborts when backup manifest has null schemaVersion (treated as unknown)', async () => {
|
||||
const db = validSqliteBytes();
|
||||
const filename = await writeTarballBackup({
|
||||
manifest: manifestFor(db, null),
|
||||
dbBytes: db
|
||||
});
|
||||
const { restoreBackup } = await importService();
|
||||
await expect(restoreBackup(filename)).rejects.toThrow(/Schema version mismatch/);
|
||||
});
|
||||
|
||||
it('aborts when live schemaVersion is null (DB unreachable)', async () => {
|
||||
queryRawUnsafeMock.mockImplementation(async () => []);
|
||||
const db = validSqliteBytes();
|
||||
const filename = await writeTarballBackup({
|
||||
manifest: manifestFor(db, 'test_migration'),
|
||||
dbBytes: db
|
||||
});
|
||||
const { restoreBackup } = await importService();
|
||||
await expect(restoreBackup(filename)).rejects.toThrow(/Schema version mismatch/);
|
||||
});
|
||||
|
||||
it('rejects legacy .db file with bogus contents', async () => {
|
||||
const bogus = path.join(backupDir, 'bogus.db');
|
||||
await fsp.writeFile(bogus, 'not a sqlite header');
|
||||
const { restoreBackup } = await importService();
|
||||
await expect(restoreBackup('bogus.db')).rejects.toThrow(/not a valid SQLite/);
|
||||
});
|
||||
});
|
||||
|
||||
it('refuses concurrent restores via _restoring flag', async () => {
|
||||
describe('backupService — restoreBackup tar safety', () => {
|
||||
/**
 * Build a gzip'd tar fixture in `backupDir` from a caller-populated work dir.
 *
 * @param makeWork Callback that fills the temporary work directory and
 *                 returns the entry names (relative to it) to archive.
 * @returns The generated archive filename (`backup-evil-<hex>.tar.gz`).
 *
 * The work directory is removed in a `finally`, so it is cleaned up even when
 * `makeWork` or `tar.create` throws.
 */
async function writeTarWithEntry(
	makeWork: (work: string) => Promise<string[]>
): Promise<string> {
	const filename = `backup-evil-${crypto.randomBytes(2).toString('hex')}.tar.gz`;
	const work = await fsp.mkdtemp(path.join(os.tmpdir(), 'wal-evil-'));
	try {
		const entries = await makeWork(work);
		await tar.create({ cwd: work, gzip: true, file: path.join(backupDir, filename) }, entries);
	} finally {
		await fsp.rm(work, { recursive: true, force: true });
	}
	return filename;
}
|
||||
|
||||
it('rejects tarballs that contain a symlink entry', async () => {
	const db = validSqliteBytes();
	// Records what actually went into the archive so we know whether the
	// symlink fixture could be created on this host.
	let archivedEntries: string[] = [];
	const filename = await writeTarWithEntry(async (work) => {
		await fsp.writeFile(path.join(work, 'manifest.json'), JSON.stringify(manifestFor(db)));
		await fsp.writeFile(path.join(work, 'database.db'), db);
		try {
			await fsp.symlink('/etc/passwd', path.join(work, 'evil-link'));
			archivedEntries = ['manifest.json', 'database.db', 'evil-link'];
		} catch {
			// Symlinks may need elevated privileges on Windows; without one
			// the archive contains no link entry and there is nothing to test.
			archivedEntries = ['manifest.json', 'database.db'];
		}
		return archivedEntries;
	});

	if (!archivedEntries.includes('evil-link')) {
		// Host cannot create symlinks — the scenario is not reproducible here.
		return;
	}

	const { restoreBackup } = await importService();
	// When the link entry IS present the restore MUST fail. Previously a
	// restore that succeeded here passed silently, so a regression in the
	// tar filter would have gone unnoticed.
	await expect(restoreBackup(filename)).rejects.toThrow(/link entry|SymbolicLink/i);
});
|
||||
|
||||
it('accepts a normal tarball with no special entries', async () => {
|
||||
// Defence-in-depth check: the SUT's tar filter also rejects absolute
|
||||
// and `..`-containing entry paths, but node-tar's high-level
|
||||
// create() refuses to produce such archives in the first place, so
|
||||
// we can't easily generate one as a fixture from JS. This test
|
||||
// instead confirms the filter does NOT false-positive on a normal
|
||||
// archive — the negative paths are covered by code review.
|
||||
const db = validSqliteBytes();
|
||||
const hash = crypto.createHash('sha256').update(db).digest('hex');
|
||||
const filename = await writeTarballBackup({
|
||||
manifest: {
|
||||
version: '1',
|
||||
createdAt: '',
|
||||
appVersion: '',
|
||||
schemaVersion: 'test_migration',
|
||||
dbSize: db.length,
|
||||
uploadFileCount: 0,
|
||||
checksums: { 'database.db': `sha256:${hash}` }
|
||||
},
|
||||
manifest: manifestFor(db),
|
||||
dbBytes: db,
|
||||
includeUploads: true
|
||||
});
|
||||
const { restoreBackup } = await importService();
|
||||
await expect(restoreBackup(filename)).resolves.toBeDefined();
|
||||
});
|
||||
});
|
||||
|
||||
describe('backupService — restoreBackup happy path & rollback', () => {
|
||||
it('happy path: swaps DB and uploads, purges sessions, leaves no safety files', async () => {
|
||||
// Mark the live DB so we can prove it really got swapped.
|
||||
const liveMarker = validSqliteBytes();
|
||||
liveMarker.write('LIVE', 200);
|
||||
await fsp.writeFile(dbFilePath, liveMarker);
|
||||
const liveDbContents = await fsp.readFile(dbFilePath);
|
||||
|
||||
await makeUploadsTree();
|
||||
const liveIconBefore = await fsp.readFile(path.join(uploadsDir, 'icon.svg'), 'utf8');
|
||||
|
||||
const db = validSqliteBytes();
|
||||
db.write('NEWB', 200);
|
||||
const filename = await writeTarballBackup({
|
||||
manifest: manifestFor(db),
|
||||
dbBytes: db,
|
||||
includeUploads: true
|
||||
});
|
||||
|
||||
const { restoreBackup } = await importService();
|
||||
const result = await restoreBackup(filename);
|
||||
|
||||
expect(result.restored).toBe(true);
|
||||
expect(result.format).toBe('tar.gz');
|
||||
expect(result.schemaVersionMatched).toBe(true);
|
||||
expect(disconnectMock).toHaveBeenCalledTimes(1);
|
||||
expect(connectMock).toHaveBeenCalledTimes(1);
|
||||
expect(reapplyPragmasMock).toHaveBeenCalledTimes(1);
|
||||
expect(sessionDeleteManyMock).toHaveBeenCalledTimes(1);
|
||||
|
||||
// DB content swapped:
|
||||
const swappedDb = await fsp.readFile(dbFilePath);
|
||||
expect(swappedDb.equals(db)).toBe(true);
|
||||
expect(swappedDb.equals(liveDbContents)).toBe(false);
|
||||
|
||||
// Uploads swapped — old icon.svg replaced by the staged a.svg:
|
||||
expect(await fsp.readFile(path.join(uploadsDir, 'a.svg'), 'utf8')).toBe('<svg/>');
|
||||
await expect(fsp.access(path.join(uploadsDir, 'icon.svg'))).rejects.toThrow();
|
||||
expect(liveIconBefore).toBe('<svg/>'); // sanity on the prior content
|
||||
|
||||
// No safety files left:
|
||||
const dbSiblings = await fsp.readdir(dbDir);
|
||||
expect(dbSiblings.some((n) => n.includes('pre-restore'))).toBe(false);
|
||||
const tmpSiblings = await fsp.readdir(tmpRoot);
|
||||
expect(tmpSiblings.some((n) => n.includes('pre-restore'))).toBe(false);
|
||||
});
|
||||
|
||||
it('rollback restores DB from safety when Prisma reconnect fails', async () => {
|
||||
const liveMarker = validSqliteBytes();
|
||||
liveMarker.write('LIVE', 200);
|
||||
await fsp.writeFile(dbFilePath, liveMarker);
|
||||
const liveDbContents = await fsp.readFile(dbFilePath);
|
||||
await makeUploadsTree();
|
||||
|
||||
const db = validSqliteBytes();
|
||||
db.write('NEWB', 200);
|
||||
const filename = await writeTarballBackup({
|
||||
manifest: manifestFor(db),
|
||||
dbBytes: db,
|
||||
includeUploads: true
|
||||
});
|
||||
|
||||
// Make $connect throw on the post-swap reconnect AND on the rollback
|
||||
// reconnect (so we see the degraded path). $disconnect succeeds.
|
||||
connectMock.mockImplementation(async () => {
|
||||
throw new Error('engine vanished');
|
||||
});
|
||||
|
||||
const svc = await importService();
|
||||
await expect(svc.restoreBackup(filename)).rejects.toThrow();
|
||||
|
||||
// DB should be back to its pre-swap content.
|
||||
const after = await fsp.readFile(dbFilePath);
|
||||
expect(after.equals(liveDbContents)).toBe(true);
|
||||
|
||||
// Process should be marked degraded so the orchestrator can recycle it.
|
||||
expect(svc.isDegraded()).toBe(true);
|
||||
expect(svc.getDegradedReason()).toMatch(/prisma reconnect failed/i);
|
||||
|
||||
// Restore window is reset.
|
||||
expect(svc.isRestoring()).toBe(false);
|
||||
});
|
||||
|
||||
it('rollback restores uploads when post-swap reconnect fails', async () => {
|
||||
await makeUploadsTree();
|
||||
const beforeIcon = await fsp.readFile(path.join(uploadsDir, 'icon.svg'), 'utf8');
|
||||
expect(beforeIcon).toBe('<svg/>');
|
||||
|
||||
const db = validSqliteBytes();
|
||||
db.write('NEWB', 200);
|
||||
const filename = await writeTarballBackup({
|
||||
manifest: manifestFor(db),
|
||||
dbBytes: db,
|
||||
includeUploads: true
|
||||
});
|
||||
|
||||
// Make $connect throw on the post-swap reconnect. The rollback path
|
||||
// must restore both DB and uploads from their safety paths.
|
||||
connectMock.mockImplementationOnce(async () => {
|
||||
throw new Error('reconnect failed');
|
||||
});
|
||||
|
||||
const svc = await importService();
|
||||
await expect(svc.restoreBackup(filename)).rejects.toThrow();
|
||||
|
||||
const restoredIcon = await fsp.readFile(path.join(uploadsDir, 'icon.svg'), 'utf8');
|
||||
expect(restoredIcon).toBe(beforeIcon);
|
||||
// The staged uploads (a.svg/b.png) should not be live.
|
||||
await expect(fsp.access(path.join(uploadsDir, 'a.svg'))).rejects.toThrow();
|
||||
});
|
||||
|
||||
it('refuses concurrent restores via the restore window flag', async () => {
|
||||
const db = validSqliteBytes();
|
||||
const filename = await writeTarballBackup({
|
||||
manifest: manifestFor(db),
|
||||
dbBytes: db,
|
||||
includeUploads: true
|
||||
});
|
||||
@@ -357,6 +577,33 @@ describe('backupService — restoreBackup', () => {
|
||||
await first;
|
||||
expect(svc.isRestoring()).toBe(false);
|
||||
});
|
||||
|
||||
it('legacy .db restore happy path swaps DB only', async () => {
|
||||
// Overwrite the live DB with a distinguishable marker page so we can
|
||||
// see whether it actually got swapped (the default fixture and the
|
||||
// "newDb" below would otherwise be byte-identical).
|
||||
const liveMarker = validSqliteBytes();
|
||||
liveMarker.write('LIVE', 200);
|
||||
await fsp.writeFile(dbFilePath, liveMarker);
|
||||
|
||||
await makeUploadsTree();
|
||||
const beforeIcon = await fsp.readFile(path.join(uploadsDir, 'icon.svg'), 'utf8');
|
||||
|
||||
const newDb = validSqliteBytes();
|
||||
newDb.write('NEWB', 200);
|
||||
await fsp.writeFile(path.join(backupDir, 'legacy.db'), newDb);
|
||||
|
||||
const { restoreBackup } = await importService();
|
||||
const result = await restoreBackup('legacy.db', { allowSchemaMismatch: true });
|
||||
|
||||
expect(result.format).toBe('db');
|
||||
expect(result.uploadFileCount).toBe(0);
|
||||
const after = await fsp.readFile(dbFilePath);
|
||||
expect(after.equals(newDb)).toBe(true);
|
||||
expect(after.equals(liveMarker)).toBe(false);
|
||||
// Uploads unchanged for legacy restores.
|
||||
expect(await fsp.readFile(path.join(uploadsDir, 'icon.svg'), 'utf8')).toBe(beforeIcon);
|
||||
});
|
||||
});
|
||||
|
||||
describe('backupService — scheduler stats', () => {
|
||||
@@ -379,4 +626,3 @@ describe('backupService — scheduler stats', () => {
|
||||
expect(after.lastSuccessAt).not.toBeNull();
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
@@ -27,10 +27,93 @@ const SQLITE_MAGIC = Buffer.from([
|
||||
0x53, 0x51, 0x4c, 0x69, 0x74, 0x65, 0x20, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x20, 0x33, 0x00
|
||||
]);
|
||||
|
||||
let _restoring = false;
|
||||
// ---- HMR / multi-call-safe global state ------------------------------------
|
||||
// `_restoring`, `_degraded`, and `_stats` must survive Vite HMR reloads in dev
|
||||
// (otherwise a fresh module instance sees `_restoring=false` while a restore
|
||||
// is still mid-flight on the original instance) and behave consistently when
|
||||
// the SUT is imported by multiple test files in the same process.
|
||||
//
|
||||
// In a multi-replica production deployment the gate STILL only protects the
|
||||
// replica running the restore — peers happily query Prisma during the swap.
|
||||
// Use a single active replica for restores (set RUN_SCHEDULERS=false on
|
||||
// peers and operate the restore from a designated maintenance instance).
|
||||
|
||||
/**
 * Counters and last-outcome markers for the backup scheduler, kept in the
 * process-wide backup state.
 */
export interface BackupSchedulerStats {
	/** Number of successful runs recorded; starts at 0. */
	successCount: number;
	/** Number of failed runs recorded; starts at 0. */
	failureCount: number;
	/** When the last success happened, or null if none yet — presumably an ISO timestamp; TODO confirm. */
	lastSuccessAt: string | null;
	/** When the last failure happened, or null if none yet — presumably an ISO timestamp; TODO confirm. */
	lastFailureAt: string | null;
	/** Message describing the last failure, or null if none yet. */
	lastFailureReason: string | null;
	/** Starts as true. NOTE(review): looks like it tracks whether the free-disk-space check works on this host — confirm. */
	diskCheckAvailable: boolean;
}
|
||||
|
||||
/** Shape of the backup/restore runtime state shared through `globalThis`. */
interface BackupRuntimeState {
	/**
	 * Gate flag for the "restore window". An HTTP route sets it so the
	 * hooks.server.ts handler answers 503 to other clients. It is kept
	 * separate from `restoreOp` so the route can flip it before parsing the
	 * request body without blocking the restoreBackup() call that follows.
	 */
	restoring: boolean;
	/**
	 * Internal serialisation lock for restoreBackup() itself: at most one
	 * restore runs at a time, even for direct callers (scripts/tests) that
	 * never go through beginRestoreWindow.
	 */
	restoreOp: boolean;
	/** True once the process has been marked degraded. */
	degraded: boolean;
	/** Human-readable reason recorded alongside `degraded`; null until one is set. */
	degradedReason: string | null;
	/** Backup scheduler counters. */
	stats: BackupSchedulerStats;
}
|
||||
|
||||
/** Fresh runtime state: no restore in progress, not degraded, zeroed stats. */
function createInitialBackupState(): BackupRuntimeState {
	return {
		restoring: false,
		restoreOp: false,
		degraded: false,
		degradedReason: null,
		stats: {
			successCount: 0,
			failureCount: 0,
			lastSuccessAt: null,
			lastFailureAt: null,
			lastFailureReason: null,
			diskCheckAvailable: true
		}
	};
}

// The state lives on globalThis so every evaluation of this module shares one
// object; it is created only if no earlier evaluation has already done so.
const g = globalThis as unknown as { __walBackupState?: BackupRuntimeState };
g.__walBackupState ??= createInitialBackupState();
const state: BackupRuntimeState = g.__walBackupState;
|
||||
|
||||
/**
 * Whether a restore window is currently open.
 *
 * Reads the flag from the shared global state rather than a module-local
 * variable, so the answer stays correct across module re-evaluation.
 */
export function isRestoring(): boolean {
	return state.restoring;
}
|
||||
|
||||
/**
 * Open the "restore window": synchronously raise the shared `restoring` flag.
 *
 * Intended to be called from the HTTP route BEFORE any awaits, so concurrent
 * requests are 503'd while the body is still being read and validated. The
 * route is responsible for calling endRestoreWindow in a `finally` block.
 *
 * This call is NOT re-entrant: if the window is already open it throws
 * instead of silently succeeding.
 *
 * @throws Error('A restore is already in progress') when the flag is already set.
 */
export function beginRestoreWindow(): void {
	const windowAlreadyOpen = state.restoring;
	if (windowAlreadyOpen) throw new Error('A restore is already in progress');
	state.restoring = true;
}
|
||||
|
||||
/**
 * Lowers the restore gate flag. Unconditional: it clears the flag regardless
 * of who raised it, so callers should only invoke it after a successful
 * beginRestoreWindow().
 */
export function endRestoreWindow(): void {
	state.restoring = false;
}
|
||||
|
||||
/**
 * Reports whether the process has been marked degraded.
 *
 * @returns the current value of the shared `degraded` flag.
 */
export function isDegraded(): boolean {
	const { degraded } = state;
	return degraded;
}
|
||||
|
||||
/**
 * Returns the reason text recorded when the process was marked degraded.
 *
 * @returns the stored reason, or null if none has been recorded.
 */
export function getDegradedReason(): string | null {
	const { degradedReason } = state;
	return degradedReason;
}
|
||||
|
||||
/**
 * Marks the process as degraded and records why.
 *
 * The first reason is never overwritten: later, different reasons are
 * appended after a "; " separator. This matters when a failed rollback is
 * followed by a failed Prisma reconnect — both calls happen in the same
 * error path, and the earlier (root-cause) message must stay visible.
 * Repeating an already-recorded reason is a no-op.
 *
 * @param reason Human-readable description of the failure.
 */
function markDegraded(reason: string): void {
	state.degraded = true;

	const previous = state.degradedReason;
	if (previous === null || previous === '') {
		state.degradedReason = reason;
		return;
	}
	if (previous === reason || previous.split('; ').includes(reason)) {
		return;
	}
	state.degradedReason = `${previous}; ${reason}`;
}
|
||||
|
||||
export interface BackupInfo {
|
||||
@@ -98,24 +181,15 @@ async function sha256OfFile(filePath: string): Promise<string> {
|
||||
}
|
||||
|
||||
/**
|
||||
* Run SQLite's own integrity check on a database file. Returns true only when
|
||||
* the engine reports "ok". Catches malformed files that pass the magic-header
|
||||
* check (truncated DBs, partial copies, etc.).
|
||||
* Structural smoke test for a SQLite database file. Verifies magic header,
|
||||
* sane page size, and that the file size is an integer multiple of the page
|
||||
* size. Catches truncated / partial copies that pass the magic-header check.
|
||||
* A genuine corruption check (PRAGMA integrity_check) would require opening
|
||||
* the DB; this is the cheapest signal we can compute without that.
|
||||
*/
|
||||
async function isSqliteIntegrityOk(filePath: string): Promise<boolean> {
|
||||
// Use a child Prisma-less raw verification: open via better-sqlite3? Not a
|
||||
// dep here. Use SQLite's own header AND a parse-trial via prisma against a
|
||||
// temp ATTACH would lock the live DB. Cheapest cross-platform path: ask
|
||||
// SQLite to open and PRAGMA the file via the sqlite3 CLI when available;
|
||||
// otherwise fall back to a structural smoke test (last 100 bytes contain
|
||||
// a valid page footer). The CLI presence cannot be assumed in the
|
||||
// scratch container, so do a best-effort structural check here and rely
|
||||
// on Prisma reconnect to detect catastrophic corruption.
|
||||
try {
|
||||
const stats = await fsp.stat(filePath);
|
||||
// SQLite pages are 512..65536 bytes, power of two. The DB size must be a
|
||||
// multiple of the page size. The page size lives at bytes 16-17, big-endian
|
||||
// (with the special value 1 meaning 65536).
|
||||
if (stats.size < 100) return false;
|
||||
const fh = await fsp.open(filePath, 'r');
|
||||
try {
|
||||
@@ -175,13 +249,22 @@ async function copyDirRecursive(src: string, dest: string): Promise<number> {
|
||||
return count;
|
||||
}
|
||||
|
||||
// Guards the warning below so it is logged at most once per module instance.
let diskCheckWarned = false;

/**
 * Checks whether the filesystem holding `dir` has at least `minBytes`
 * available to unprivileged users (bavail * bsize).
 *
 * Fails open: if statfs throws, the check is skipped and true is returned.
 * The first such failure is logged and recorded in
 * `state.stats.diskCheckAvailable` so the skipped check is observable.
 *
 * @param dir Directory on the filesystem to inspect.
 * @param minBytes Minimum number of free bytes required.
 * @returns true when enough space is available or the check could not run.
 */
async function checkFreeDiskSpace(dir: string, minBytes: number): Promise<boolean> {
	try {
		const stats = await fsp.statfs(dir);
		const free = stats.bavail * stats.bsize;
		return free >= minBytes;
	} catch (err) {
		// NOTE(review): every statfs error lands here, not only "unsupported
		// platform" — e.g. a missing directory would also be reported as
		// statfs being unavailable. Confirm that is acceptable.
		if (!diskCheckWarned) {
			diskCheckWarned = true;
			state.stats.diskCheckAvailable = false;
			console.warn(
				'[backup] fsp.statfs unavailable on this platform; disk-space checks will be skipped:',
				err
			);
		}
		return true;
	}
}
|
||||
|
||||
@@ -189,6 +272,10 @@ async function rmrf(target: string): Promise<void> {
|
||||
await fsp.rm(target, { recursive: true, force: true });
|
||||
}
|
||||
|
||||
/**
 * Generates a random lowercase-hex suffix for temp/safety file names.
 *
 * @param byteCount Number of random bytes to draw; the result has twice as
 *   many hex characters. Defaults to 4 (8 characters).
 * @returns The random bytes encoded as a hex string.
 */
function shortRandomSuffix(byteCount = 4): string {
	return crypto.randomBytes(byteCount).toString('hex');
}
|
||||
|
||||
export async function createBackup(): Promise<BackupInfo> {
|
||||
const backupDir = ensureBackupDir();
|
||||
|
||||
@@ -212,7 +299,20 @@ export async function createBackup(): Promise<BackupInfo> {
|
||||
const stagedUploads = path.join(workDir, 'uploads');
|
||||
|
||||
try {
|
||||
const safeStagedDb = stagedDb.replace(/\\/g, '/').replace(/'/g, "''");
|
||||
// VACUUM INTO uses raw SQL with the path interpolated. The path comes
|
||||
// from os.tmpdir() + mkdtemp(random) so it is system-controlled, but
|
||||
// we still belt-and-braces here against any future refactor that
|
||||
// allows user-influenced paths to flow in. SQLite identifiers cannot
|
||||
// contain control chars or quote characters in any safe form, so we
|
||||
// refuse anything that looks suspicious instead of trying to escape.
|
||||
// Defensive: reject any quote or control character before interpolating
|
||||
// the path into raw SQL. The path comes from os.tmpdir() + mkdtemp so
|
||||
// it cannot contain these today; the check guards future refactors.
|
||||
// eslint-disable-next-line no-control-regex
|
||||
if (/['"`\x00-\x1f]/.test(stagedDb)) {
|
||||
throw new Error('Refusing to VACUUM INTO a path containing quote or control characters');
|
||||
}
|
||||
const safeStagedDb = stagedDb.replace(/\\/g, '/');
|
||||
await prisma.$executeRawUnsafe(`VACUUM INTO '${safeStagedDb}'`);
|
||||
|
||||
const dbChecksum = await sha256OfFile(stagedDb);
|
||||
@@ -273,11 +373,18 @@ export function listBackups(): ReadonlyArray<BackupInfo> {
|
||||
.sort((a, b) => b.filename.localeCompare(a.filename));
|
||||
}
|
||||
|
||||
/**
 * Pattern a backup filename must match: a leading ASCII letter or digit,
 * then any run of word characters, dots, or dashes, ending in `.tar.gz` or
 * `.db`. The mandatory leading alphanumeric rejects dot-only stems such as
 * `.tar.gz`, `..tar.gz`, and `....db`, which would otherwise survive a
 * `path.basename(x) === x` check.
 */
const FILENAME_RE = /^[A-Za-z0-9][\w.-]*\.(tar\.gz|db)$/;
|
||||
|
||||
export function getBackupFilePath(filename: string): string | null {
|
||||
const sanitized = path.basename(filename);
|
||||
if (sanitized !== filename) return null;
|
||||
// Allow alphanumerics, dot, dash, underscore. Extension must be .tar.gz or .db.
|
||||
if (!/^[\w.-]+\.(tar\.gz|db)$/.test(sanitized)) return null;
|
||||
if (!FILENAME_RE.test(sanitized)) return null;
|
||||
const fullPath = path.join(getBackupDir(), sanitized);
|
||||
if (!fs.existsSync(fullPath)) return null;
|
||||
return fullPath;
|
||||
@@ -292,7 +399,7 @@ export function deleteBackup(filename: string): boolean {
|
||||
|
||||
/** Options accepted by restoreBackup(). */
export interface RestoreOptions {
	/**
	 * When true, allow restoring even if the manifest schemaVersion differs
	 * from the live schema, or either side is unknown. Defaults to false.
	 */
	readonly allowSchemaMismatch?: boolean;
}
|
||||
|
||||
@@ -307,32 +414,48 @@ export interface RestoreResult {
|
||||
* Restore the DB (and uploads, for tar.gz backups) from a backup file.
|
||||
*
|
||||
* Hardened ordering:
|
||||
* 1. Validate format + (for tar.gz) extract to staging + verify manifest +
|
||||
* sha256 checksum + structural integrity of the staged DB.
|
||||
* 2. Cross-check schema version against the live `_prisma_migrations` table.
|
||||
* Mismatch aborts unless allowSchemaMismatch is set.
|
||||
* 3. Set _restoring=true (gate in hooks.server.ts returns 503 to other reqs).
|
||||
* 4. Snapshot live DB and uploads dir to *.pre-restore-<ts>.
|
||||
* 1. Validate format + (for tar.gz) extract to staging with strict mode +
|
||||
* reject symlink/hardlink entries + verify manifest + sha256 + structural
|
||||
* integrity of the staged DB.
|
||||
* 2. Cross-check schema version. Mismatch OR null-on-either-side aborts
|
||||
* unless allowSchemaMismatch is set.
|
||||
* 3. The caller (HTTP route) has already set state.restoring=true so other
|
||||
* requests are 503'd from hooks.server.ts. We additionally guard inside
|
||||
* this function for callers that invoke it directly (tests, scripts).
|
||||
* 4. Snapshot live DB and uploads dir to *.pre-restore-<ts>-<rand>.
|
||||
* 5. Disconnect Prisma; atomic rename of staged DB and uploads tree.
|
||||
* 6. Revoke ALL sessions (DB writes are local — restored DB already does
|
||||
* not contain post-backup sessions; this just makes intent explicit).
|
||||
* 7. Reconnect Prisma; re-apply pragmas.
|
||||
* 8. On any failure: restore snapshots, reconnect Prisma, rethrow.
|
||||
* 6. Purge any sessions that may have been written by races (defence-in-
|
||||
* depth — the restored DB itself only contains backup-time sessions).
|
||||
* 7. Reconnect Prisma; re-apply pragmas. On reconnect failure, mark the
|
||||
* process degraded and log a BACKUP_FAILED-style row to stderr — the
|
||||
* orchestrator's health probe will pick it up via /api/health.
|
||||
* 8. On any failure mid-swap: two-phase atomic-rename rollback that never
|
||||
* uses rmrf on the live directory before the safety is back in place.
|
||||
*/
|
||||
export async function restoreBackup(
|
||||
filename: string,
|
||||
options: RestoreOptions = {}
|
||||
): Promise<RestoreResult> {
|
||||
if (_restoring) {
|
||||
// Serialise restoreBackup against itself even when the route already
|
||||
// opened the gate window. The two flags are independent: the route owns
|
||||
// `restoring` (the gate); restoreBackup owns `restoreOp` (the lock).
|
||||
if (state.restoreOp) {
|
||||
throw new Error('A restore is already in progress');
|
||||
}
|
||||
_restoring = true;
|
||||
state.restoreOp = true;
|
||||
// If we were called directly (no route), also flip the gate so concurrent
|
||||
// requests are 503'd. Track ownership so we don't clear someone else's flag.
|
||||
const ownsGateFlag = !state.restoring;
|
||||
if (ownsGateFlag) {
|
||||
state.restoring = true;
|
||||
}
|
||||
|
||||
let workDir: string | null = null;
|
||||
const dbPath = getDatabasePath();
|
||||
const dbSafety = `${dbPath}.pre-restore-${Date.now()}.bak`;
|
||||
const safetySuffix = `${Date.now()}-${shortRandomSuffix()}`;
|
||||
const dbSafety = `${dbPath}.pre-restore-${safetySuffix}.bak`;
|
||||
const uploadsDir = getUploadsDir();
|
||||
const uploadsSafety = `${uploadsDir}.pre-restore-${Date.now()}`;
|
||||
const uploadsSafety = `${uploadsDir}.pre-restore-${safetySuffix}`;
|
||||
let dbSwapped = false;
|
||||
let uploadsSwapped = false;
|
||||
|
||||
@@ -351,10 +474,52 @@ export async function restoreBackup(
|
||||
if (!isSqliteFile(backupPath)) {
|
||||
throw new Error(`File is not a valid SQLite database: ${filename}`);
|
||||
}
|
||||
if (!(await isSqliteIntegrityOk(backupPath))) {
|
||||
throw new Error(`File fails SQLite integrity check: ${filename}`);
|
||||
}
|
||||
stagedDb = backupPath;
|
||||
} else {
|
||||
workDir = await fsp.mkdtemp(path.join(os.tmpdir(), 'wal-restore-'));
|
||||
await tar.extract({ cwd: workDir, file: backupPath });
|
||||
// Stage the extraction in a SIBLING of the live uploads dir so the
|
||||
// subsequent rename is a same-filesystem operation. Renaming across
|
||||
// volumes (Windows %TEMP% vs the data drive; Linux tmpfs vs disk)
|
||||
// fails with EXDEV / EPERM, defeating the atomic-swap design.
|
||||
const stagingParent = path.dirname(uploadsDir);
|
||||
await fsp.mkdir(stagingParent, { recursive: true });
|
||||
workDir = await fsp.mkdtemp(path.join(stagingParent, '.wal-restore-'));
|
||||
|
||||
// Strict tar extraction:
|
||||
// - reject symlink / hardlink entries (would otherwise let a
|
||||
// malicious tarball write outside workDir on subsequent
|
||||
// file entries).
|
||||
// - reject absolute paths or entries containing `..` segments
|
||||
// (defence-in-depth — node-tar strips these by default but
|
||||
// `strict: true` makes the rejection explicit).
|
||||
await tar.extract({
|
||||
cwd: workDir,
|
||||
file: backupPath,
|
||||
strict: true,
|
||||
filter: (entryPath, statOrEntry) => {
|
||||
// During extraction the second argument is a ReadEntry which
|
||||
// carries `.type` ('File' | 'SymbolicLink' | 'Link' | ...).
|
||||
// `Stats` is the create-time variant and has no `.type`; we
|
||||
// guard with `in` to keep TypeScript narrowing happy.
|
||||
const entryType =
|
||||
'type' in statOrEntry ? (statOrEntry as { type?: string }).type : undefined;
|
||||
if (entryType === 'SymbolicLink' || entryType === 'Link') {
|
||||
throw new Error(
|
||||
`Backup contains link entry (${entryType}): ${entryPath} — refusing to extract`
|
||||
);
|
||||
}
|
||||
const normalized = entryPath.replace(/\\/g, '/');
|
||||
if (path.isAbsolute(normalized)) {
|
||||
throw new Error(`Backup contains absolute path: ${entryPath}`);
|
||||
}
|
||||
if (normalized.split('/').includes('..')) {
|
||||
throw new Error(`Backup contains parent-segment traversal: ${entryPath}`);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
});
|
||||
|
||||
const manifestPath = path.join(workDir, 'manifest.json');
|
||||
if (!fs.existsSync(manifestPath)) {
|
||||
@@ -392,54 +557,61 @@ export async function restoreBackup(
|
||||
if (fs.existsSync(uploadsStaged)) stagedUploads = uploadsStaged;
|
||||
}
|
||||
|
||||
// Schema-version check: tighten to require explicit override if either
|
||||
// side is null. Null on the live side typically means the DB is
|
||||
// corrupt or empty — precisely the case we don't want to silently
|
||||
// restore over.
|
||||
const liveSchemaVersion = await getSchemaVersion();
|
||||
const schemaVersionMatched =
|
||||
!manifest?.schemaVersion ||
|
||||
!liveSchemaVersion ||
|
||||
manifest.schemaVersion === liveSchemaVersion;
|
||||
const manifestSchema = manifest?.schemaVersion ?? null;
|
||||
const bothKnown = !!manifestSchema && !!liveSchemaVersion;
|
||||
const schemaVersionMatched = bothKnown && manifestSchema === liveSchemaVersion;
|
||||
if (!schemaVersionMatched && !options.allowSchemaMismatch) {
|
||||
const reason = !bothKnown
|
||||
? `unknown schema version on ${!manifestSchema ? 'backup' : 'live database'}`
|
||||
: `backup=${manifestSchema}, live=${liveSchemaVersion}`;
|
||||
throw new Error(
|
||||
`Schema version mismatch: backup=${manifest?.schemaVersion ?? 'unknown'}, live=${liveSchemaVersion ?? 'unknown'}. Restore aborted to prevent data loss. Re-trigger with allowSchemaMismatch to override.`
|
||||
`Schema version mismatch: ${reason}. Restore aborted to prevent data loss. Re-trigger with allowSchemaMismatch to override.`
|
||||
);
|
||||
}
|
||||
|
||||
// Snapshot live state for rollback.
|
||||
// 1. Snapshot live state for rollback. Uploads are only touched for
|
||||
// tar.gz restores — legacy .db backups never contained uploads, so
|
||||
// preserving the live uploads tree is the safer default.
|
||||
if (fs.existsSync(dbPath)) {
|
||||
await fsp.copyFile(dbPath, dbSafety);
|
||||
}
|
||||
if (fs.existsSync(uploadsDir)) {
|
||||
if (!isLegacyDb && fs.existsSync(uploadsDir)) {
|
||||
await fsp.rename(uploadsDir, uploadsSafety);
|
||||
}
|
||||
|
||||
await prisma.$disconnect();
|
||||
|
||||
// DB: stage → atomic rename over live path.
|
||||
const dbStaging = `${dbPath}.restore.tmp`;
|
||||
// 2. DB: stage → atomic rename over live path.
|
||||
const dbStaging = `${dbPath}.restore.${shortRandomSuffix()}.tmp`;
|
||||
await fsp.copyFile(stagedDb, dbStaging);
|
||||
await fsp.rename(dbStaging, dbPath);
|
||||
dbSwapped = true;
|
||||
|
||||
// Uploads: rename staged tree into place (or create empty dir if none).
|
||||
if (stagedUploads) {
|
||||
await fsp.rename(stagedUploads, uploadsDir);
|
||||
} else {
|
||||
await fsp.mkdir(uploadsDir, { recursive: true });
|
||||
// 3. Uploads: only swap for tar.gz restores. Legacy restores leave
|
||||
// the live uploads tree intact (the backup didn't capture it).
|
||||
if (!isLegacyDb) {
|
||||
if (stagedUploads) {
|
||||
await fsp.rename(stagedUploads, uploadsDir);
|
||||
} else {
|
||||
await fsp.mkdir(uploadsDir, { recursive: true });
|
||||
}
|
||||
uploadsSwapped = true;
|
||||
}
|
||||
uploadsSwapped = true;
|
||||
|
||||
await prisma.$connect();
|
||||
await reapplySqlitePragmas();
|
||||
|
||||
// Best-effort: wipe any sessions left over from in-flight refreshes that
|
||||
// raced with the restore. Restored DB already contains only sessions
|
||||
// captured AT backup time, so this is a defence-in-depth measure.
|
||||
try {
|
||||
await prisma.session.deleteMany({});
|
||||
} catch (err) {
|
||||
console.warn('[backup] post-restore session purge failed:', err);
|
||||
console.warn('[backup] post-restore session purge failed:', err);
|
||||
}
|
||||
|
||||
// Cleanup safety snapshots on success.
|
||||
await Promise.allSettled([rmrf(dbSafety), rmrf(uploadsSafety)]);
|
||||
|
||||
return {
|
||||
@@ -449,34 +621,76 @@ console.warn('[backup] post-restore session purge failed:', err);
|
||||
uploadFileCount: manifest?.uploadFileCount ?? 0
|
||||
};
|
||||
} catch (err) {
|
||||
// Rollback DB if it was swapped.
|
||||
// ---------------- Rollback ----------------
|
||||
// Two-phase atomic-rename rollback for uploads: NEVER rmrf the live
|
||||
// directory before the safety is in place. If we cannot move the
|
||||
// failed-swap aside (open handles on Windows, etc.) we leave both
|
||||
// safety and bad swap on disk and surface a degraded state instead
|
||||
// of losing data.
|
||||
let rollbackFailure: string | null = null;
|
||||
try {
|
||||
if (dbSwapped && fs.existsSync(dbSafety)) {
|
||||
await fsp.copyFile(dbSafety, dbPath);
|
||||
if (dbSwapped) {
|
||||
if (fs.existsSync(dbSafety)) {
|
||||
await fsp.copyFile(dbSafety, dbPath);
|
||||
}
|
||||
}
|
||||
if (uploadsSwapped) {
|
||||
await rmrf(uploadsDir);
|
||||
if (fs.existsSync(uploadsSafety)) {
|
||||
await fsp.rename(uploadsSafety, uploadsDir);
|
||||
const deprecated = `${uploadsDir}.deprecated-${safetySuffix}-${shortRandomSuffix()}`;
|
||||
try {
|
||||
await fsp.rename(uploadsDir, deprecated);
|
||||
} catch (renameErr) {
|
||||
rollbackFailure = `failed to move failed-swap uploads aside: ${
|
||||
renameErr instanceof Error ? renameErr.message : String(renameErr)
|
||||
}`;
|
||||
throw renameErr;
|
||||
}
|
||||
if (fs.existsSync(uploadsSafety)) {
|
||||
try {
|
||||
await fsp.rename(uploadsSafety, uploadsDir);
|
||||
} catch (renameErr) {
|
||||
// Bad swap is moved aside; safety still exists. Try to
|
||||
// recover by moving the bad swap back so the API is
|
||||
// at least functioning, then surface the failure.
|
||||
try {
|
||||
await fsp.rename(deprecated, uploadsDir);
|
||||
} catch {
|
||||
// Both renames failed: the live uploads dir no
|
||||
// longer exists. Surface loudly.
|
||||
}
|
||||
rollbackFailure = `failed to restore uploads safety: ${
|
||||
renameErr instanceof Error ? renameErr.message : String(renameErr)
|
||||
}`;
|
||||
throw renameErr;
|
||||
}
|
||||
}
|
||||
await rmrf(deprecated);
|
||||
} else if (fs.existsSync(uploadsSafety) && !fs.existsSync(uploadsDir)) {
|
||||
// Uploads dir was renamed away but never replaced.
|
||||
// Safety was moved away but the swap never happened.
|
||||
await fsp.rename(uploadsSafety, uploadsDir);
|
||||
}
|
||||
await rmrf(dbSafety);
|
||||
} catch (rollbackErr) {
|
||||
console.error('[backup] rollback failed:', rollbackErr);
|
||||
console.error('[backup] rollback failed:', rollbackErr);
|
||||
markDegraded(rollbackFailure ?? 'rollback failed during restore');
|
||||
}
|
||||
try {
|
||||
await prisma.$connect();
|
||||
await reapplySqlitePragmas();
|
||||
} catch (reconnectErr) {
|
||||
console.error('[backup] reconnect after rollback failed:', reconnectErr);
|
||||
console.error('[backup] reconnect after rollback failed:', reconnectErr);
|
||||
markDegraded(
|
||||
`prisma reconnect failed: ${
|
||||
reconnectErr instanceof Error ? reconnectErr.message : String(reconnectErr)
|
||||
}`
|
||||
);
|
||||
}
|
||||
throw err;
|
||||
} finally {
|
||||
if (workDir) await rmrf(workDir);
|
||||
_restoring = false;
|
||||
state.restoreOp = false;
|
||||
if (ownsGateFlag) {
|
||||
state.restoring = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -534,36 +748,17 @@ export async function updateBackupSettings(data: {
|
||||
};
|
||||
}
|
||||
|
||||
// Stats exposed for scheduler observability — also surfaced via /api/metrics
// if you wire it there. The counters live on the shared runtime state
// (`state.stats`); there is no separate module-local store.

/**
 * Returns a shallow snapshot of the scheduler counters.
 *
 * @returns A copy of `state.stats`; mutating it does not affect the live counters.
 */
export function getBackupSchedulerStats(): Readonly<BackupSchedulerStats> {
	return { ...state.stats };
}

/** Records one successful scheduled backup and stamps the current time (ISO 8601). */
export function recordScheduledBackupSuccess(): void {
	state.stats.successCount += 1;
	state.stats.lastSuccessAt = new Date().toISOString();
}

/**
 * Records one failed scheduled backup, stamps the current time (ISO 8601),
 * and stores the failure reason.
 *
 * @param reason Human-readable description of why the backup failed.
 */
export function recordScheduledBackupFailure(reason: string): void {
	state.stats.failureCount += 1;
	state.stats.lastFailureAt = new Date().toISOString();
	state.stats.lastFailureReason = reason;
}
|
||||
|
||||
|
||||
@@ -26,11 +26,19 @@ export const GET: RequestHandler = async (event) => {
|
||||
? 'application/gzip'
|
||||
: 'application/octet-stream';
|
||||
|
||||
// RFC 5987: filename* uses percent-encoding for non-ASCII / quote-unsafe
|
||||
// characters. We keep the legacy `filename=` fallback for clients that
|
||||
// don't speak RFC 5987 (very old browsers / curl < 7.20). Backslashes and
|
||||
// quotes in the fallback are sanitised; the regex in getBackupFilePath
|
||||
// blocks them today but this stays safe under any future loosening.
|
||||
const fallback = basename.replace(/[\\"]/g, '_');
|
||||
const encoded = encodeURIComponent(basename).replace(/['()]/g, escape);
|
||||
|
||||
return new Response(Readable.toWeb(stream) as ReadableStream, {
|
||||
status: 200,
|
||||
headers: {
|
||||
'Content-Type': contentType,
|
||||
'Content-Disposition': `attachment; filename="${basename}"`,
|
||||
'Content-Disposition': `attachment; filename="${fallback}"; filename*=UTF-8''${encoded}`,
|
||||
'Content-Length': String(stats.size)
|
||||
}
|
||||
});
|
||||
|
||||
@@ -1,7 +1,11 @@
|
||||
import { json } from '@sveltejs/kit';
|
||||
import type { RequestHandler } from './$types';
|
||||
import { requireAdmin } from '$lib/server/middleware/authorize.js';
|
||||
import { restoreBackup } from '$lib/server/services/backupService.js';
|
||||
import {
|
||||
beginRestoreWindow,
|
||||
endRestoreWindow,
|
||||
restoreBackup
|
||||
} from '$lib/server/services/backupService.js';
|
||||
import { clearSessionCookies } from '$lib/server/utils/sessionCookies.js';
|
||||
import { success, error } from '$lib/server/utils/response.js';
|
||||
import { logAction } from '$lib/server/services/auditLogService.js';
|
||||
@@ -17,6 +21,11 @@ const restoreOptionsSchema = z
|
||||
/**
|
||||
* POST /api/admin/backups/:filename/restore — Restore the database from a backup.
|
||||
*
|
||||
* The restore window is opened SYNCHRONOUSLY here, before any body parsing or
|
||||
* async work, so the hooks.server.ts gate starts returning 503 to concurrent
|
||||
* requests immediately. The window is closed in a finally block; restoreBackup
|
||||
* is idempotent w.r.t. that flag.
|
||||
*
|
||||
* On success the response sets force_logout: true and clears the admin's
|
||||
* session cookies, because the restored DB contains a session set from the
|
||||
* backup-time snapshot and the current admin's session is no longer valid.
|
||||
@@ -25,34 +34,50 @@ export const POST: RequestHandler = async (event) => {
|
||||
const admin = requireAdmin(event);
|
||||
const { filename } = event.params;
|
||||
|
||||
let options: { allowSchemaMismatch?: boolean } = {};
|
||||
// CRITICAL: flip the gate BEFORE any awaits so concurrent requests
|
||||
// don't slip through during body parsing.
|
||||
try {
|
||||
const text = await event.request.text();
|
||||
if (text.trim()) {
|
||||
const parsed = restoreOptionsSchema.safeParse(JSON.parse(text));
|
||||
if (parsed.success && parsed.data) options = parsed.data;
|
||||
}
|
||||
} catch {
|
||||
// Body is optional — ignore parse errors and fall back to defaults.
|
||||
beginRestoreWindow();
|
||||
} catch (err) {
|
||||
const message = err instanceof Error ? err.message : 'Restore unavailable';
|
||||
return json(error(message), { status: 409 });
|
||||
}
|
||||
|
||||
try {
|
||||
let options: { allowSchemaMismatch?: boolean } = {};
|
||||
try {
|
||||
const text = await event.request.text();
|
||||
if (text.trim()) {
|
||||
const parsed = restoreOptionsSchema.safeParse(JSON.parse(text));
|
||||
if (parsed.success && parsed.data) options = parsed.data;
|
||||
}
|
||||
} catch {
|
||||
// Body is optional — ignore parse errors and fall back to defaults.
|
||||
}
|
||||
|
||||
const result = await restoreBackup(filename, options);
|
||||
|
||||
logAction(admin.id, AuditAction.BACKUP_RESTORED, 'backup', filename, {
|
||||
format: result.format,
|
||||
schemaVersionMatched: result.schemaVersionMatched,
|
||||
uploadFileCount: result.uploadFileCount
|
||||
uploadFileCount: result.uploadFileCount,
|
||||
allowedSchemaMismatch: options.allowSchemaMismatch ?? false
|
||||
});
|
||||
|
||||
// All session state from the backup time is now live — the admin's
|
||||
// current cookies refer to a session that doesn't exist any more.
|
||||
// Restored DB contains backup-time sessions; the admin's cookies refer
|
||||
// to a session that no longer exists.
|
||||
clearSessionCookies(event.cookies);
|
||||
|
||||
return json(success({ ...result, forceLogout: true }));
|
||||
} catch (err) {
|
||||
const message = err instanceof Error ? err.message : 'Failed to restore backup';
|
||||
const status = /schema version mismatch/i.test(message) ? 409 : 500;
|
||||
logAction(admin.id, AuditAction.BACKUP_FAILED, 'backup', filename, {
|
||||
phase: 'restore',
|
||||
error: message
|
||||
});
|
||||
return json(error(message), { status });
|
||||
} finally {
|
||||
endRestoreWindow();
|
||||
}
|
||||
};
|
||||
|
||||
@@ -1,25 +1,48 @@
|
||||
import { json } from '@sveltejs/kit';
|
||||
import type { RequestHandler } from './$types';
|
||||
import { prisma } from '$lib/server/prisma.js';
|
||||
import {
|
||||
isDegraded,
|
||||
getDegradedReason,
|
||||
isRestoring
|
||||
} from '$lib/server/services/backupService.js';
|
||||
|
||||
/**
 * GET /api/health — Docker / Kubernetes healthcheck endpoint.
 *
 * Pings the database with a trivial query so the container is reported
 * unhealthy when Prisma is disconnected. Also exposes the backup-restore
 * degraded state so an orchestrator can recycle a process stuck in a
 * partially-rolled-back state.
 *
 * No auth required — this is the probe endpoint, intentionally public.
 *
 * Status semantics (checked in this order):
 *   503 degraded   — restore failed + rollback failed; process needs restart
 *   503 restoring  — restore in progress (transient)
 *   200 ok         — DB reachable, no degraded flag
 *   503 db_down    — DB ping failed
 */
export const GET: RequestHandler = async () => {
	const version = process.env.APP_VERSION ?? 'dev';

	if (isDegraded()) {
		// NOTE(review): `reason` is served to unauthenticated callers and is
		// built from raw error messages — confirm it cannot leak filesystem
		// paths or other internals, or drop it from the public payload.
		return json(
			{
				status: 'degraded',
				reason: getDegradedReason(),
				version
			},
			{ status: 503 }
		);
	}

	// Checked before the DB ping: the restore path disconnects Prisma, so a
	// query issued mid-restore would only report a misleading db_down.
	if (isRestoring()) {
		return json({ status: 'restoring', version }, { status: 503 });
	}

	try {
		await prisma.$queryRaw`SELECT 1`;
		return json({ status: 'ok', version });
	} catch {
		return json({ status: 'db_down', version }, { status: 503 });
	}
};
|
||||
|
||||
Reference in New Issue
Block a user