diff --git a/backend/package-lock.json b/backend/package-lock.json index d0a9014..f0daf0e 100644 --- a/backend/package-lock.json +++ b/backend/package-lock.json @@ -16,6 +16,7 @@ "js-yaml": "^4.1.1", "jsonwebtoken": "^9.0.2", "multer": "^2.1.1", + "puppeteer-core": "^25.1.0", "sharp": "^0.34.5", "ws": "^8.20.0" }, @@ -498,6 +499,30 @@ "url": "https://opencollective.com/libvips" } }, + "node_modules/@puppeteer/browsers": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/@puppeteer/browsers/-/browsers-3.0.4.tgz", + "integrity": "sha512-HGM8iAmGTf+Y7t0373szVbTmt3d7vPkYL/1bpOkOFO0YUYLgSeuYBCzESklogNPvOBnZ/MRD5f07OkpqH1trtA==", + "license": "Apache-2.0", + "dependencies": { + "modern-tar": "^0.7.6", + "yargs": "^17.7.2" + }, + "bin": { + "browsers": "lib/main-cli.js" + }, + "engines": { + "node": ">=22.12.0" + }, + "peerDependencies": { + "proxy-agent": ">=8.0.1" + }, + "peerDependenciesMeta": { + "proxy-agent": { + "optional": true + } + } + }, "node_modules/accepts": { "version": "1.3.8", "resolved": "https://registry.npmjs.org/accepts/-/accepts-1.3.8.tgz", @@ -511,6 +536,30 @@ "node": ">= 0.6" } }, + "node_modules/ansi-regex": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", + "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", + "license": "MIT", + "engines": { + "node": ">=8" + } + }, + "node_modules/ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "license": "MIT", + "dependencies": { + "color-convert": "^2.0.1" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, "node_modules/anymatch": { "version": "3.1.3", "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-3.1.3.tgz", @@ 
-708,6 +757,54 @@ "fsevents": "~2.3.2" } }, + "node_modules/chromium-bidi": { + "version": "16.0.1", + "resolved": "https://registry.npmjs.org/chromium-bidi/-/chromium-bidi-16.0.1.tgz", + "integrity": "sha512-J63PGu/9PpeCwLIcKYyzWP6yaVL5pxuBc0shlYCYM8BaAkmlwiQboXO1iNbOgSDbVklEyYFfNEcHD8oOAWacUA==", + "license": "Apache-2.0", + "dependencies": { + "mitt": "^3.0.1", + "zod": "^3.24.1" + }, + "engines": { + "node": ">=20.19.0 <22.0.0 || >=22.12.0" + }, + "peerDependencies": { + "devtools-protocol": "*" + } + }, + "node_modules/cliui": { + "version": "8.0.1", + "resolved": "https://registry.npmjs.org/cliui/-/cliui-8.0.1.tgz", + "integrity": "sha512-BSeNnyus75C4//NQ9gQt1/csTXyo/8Sb+afLAkzAptFuMsod9HFokGNudZpi/oQV73hnVK+sR+5PVRMd+Dr7YQ==", + "license": "ISC", + "dependencies": { + "string-width": "^4.2.0", + "strip-ansi": "^6.0.1", + "wrap-ansi": "^7.0.0" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/color-convert": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "license": "MIT", + "dependencies": { + "color-name": "~1.1.4" + }, + "engines": { + "node": ">=7.0.0" + } + }, + "node_modules/color-name": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "license": "MIT" + }, "node_modules/compressible": { "version": "2.0.18", "resolved": "https://registry.npmjs.org/compressible/-/compressible-2.0.18.tgz", @@ -852,6 +949,12 @@ "node": ">=8" } }, + "node_modules/devtools-protocol": { + "version": "0.0.1624250", + "resolved": "https://registry.npmjs.org/devtools-protocol/-/devtools-protocol-0.0.1624250.tgz", + "integrity": "sha512-YFAat/lOiIk0ARmBweG+ygrEcbZrq5B9urRyUoeQKp53MlidHXE2TmTbxKcaXoQj7u/aX+jebDO4BW55rs0WwA==", + 
"license": "BSD-3-Clause" + }, "node_modules/dotenv": { "version": "16.6.1", "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.6.1.tgz", @@ -893,6 +996,12 @@ "integrity": "sha512-WMwm9LhRUo+WUaRN+vRuETqG89IgZphVSNkdFgeb6sS/E4OrDIN7t48CAewSHXc6C8lefD8KKfr5vY61brQlow==", "license": "MIT" }, + "node_modules/emoji-regex": { + "version": "8.0.0", + "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz", + "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==", + "license": "MIT" + }, "node_modules/encodeurl": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-2.0.0.tgz", @@ -932,6 +1041,15 @@ "node": ">= 0.4" } }, + "node_modules/escalade": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz", + "integrity": "sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==", + "license": "MIT", + "engines": { + "node": ">=6" + } + }, "node_modules/escape-html": { "version": "1.0.3", "resolved": "https://registry.npmjs.org/escape-html/-/escape-html-1.0.3.tgz", @@ -1066,6 +1184,15 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/get-caller-file": { + "version": "2.0.5", + "resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-2.0.5.tgz", + "integrity": "sha512-DyFP3BM/3YHTQOCUL/w0OZHR0lpKeGrxotcHWcqNEdnltqFwXVfhEBQ94eIo34AfQpo0rGki4cyIiftY06h2Fg==", + "license": "ISC", + "engines": { + "node": "6.* || 8.* || >= 10.*" + } + }, "node_modules/get-intrinsic": { "version": "1.3.0", "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.3.0.tgz", @@ -1239,6 +1366,15 @@ "node": ">=0.10.0" } }, + "node_modules/is-fullwidth-code-point": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz", + "integrity": 
"sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==", + "license": "MIT", + "engines": { + "node": ">=8" + } + }, "node_modules/is-glob": { "version": "4.0.3", "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.3.tgz", @@ -1450,6 +1586,21 @@ "url": "https://github.com/sponsors/isaacs" } }, + "node_modules/mitt": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/mitt/-/mitt-3.0.1.tgz", + "integrity": "sha512-vKivATfr97l2/QBCYAkXYDbrIWPM2IIKEl7YPhjCvKlG3kE2gm+uBo6nEXK3M5/Ffh/FLpKExzOQ3JJoJGFKBw==", + "license": "MIT" + }, + "node_modules/modern-tar": { + "version": "0.7.6", + "resolved": "https://registry.npmjs.org/modern-tar/-/modern-tar-0.7.6.tgz", + "integrity": "sha512-sweCIVXzx1aIGTCdzcMlSZt1h8k5Tmk08VNAuRk3IU28XamGiOH5ypi11g6De2CH7PhYqSSnGy2A/EFhbWnVKg==", + "license": "MIT", + "engines": { + "node": ">=18.0.0" + } + }, "node_modules/ms": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", @@ -1638,6 +1789,23 @@ "dev": true, "license": "MIT" }, + "node_modules/puppeteer-core": { + "version": "25.1.0", + "resolved": "https://registry.npmjs.org/puppeteer-core/-/puppeteer-core-25.1.0.tgz", + "integrity": "sha512-jKzy5y4WG6uNuFbTWgW1D7mqoT9o0nllc/6a1DGF775T1mPmgw3scdFEtEq67yVFikavQmbYq6NLfbTfxHSlqQ==", + "license": "Apache-2.0", + "dependencies": { + "@puppeteer/browsers": "3.0.4", + "chromium-bidi": "16.0.1", + "devtools-protocol": "0.0.1624250", + "typed-query-selector": "^2.12.2", + "webdriver-bidi-protocol": "0.4.2", + "ws": "^8.21.0" + }, + "engines": { + "node": ">=22.12.0" + } + }, "node_modules/qs": { "version": "6.14.2", "resolved": "https://registry.npmjs.org/qs/-/qs-6.14.2.tgz", @@ -1704,6 +1872,15 @@ "node": ">=8.10.0" } }, + "node_modules/require-directory": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz", + "integrity": 
"sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/safe-buffer": { "version": "5.2.1", "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz", @@ -1948,6 +2125,32 @@ "safe-buffer": "~5.2.0" } }, + "node_modules/string-width": { + "version": "4.2.3", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz", + "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==", + "license": "MIT", + "dependencies": { + "emoji-regex": "^8.0.0", + "is-fullwidth-code-point": "^3.0.0", + "strip-ansi": "^6.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/strip-ansi": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", + "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", + "license": "MIT", + "dependencies": { + "ansi-regex": "^5.0.1" + }, + "engines": { + "node": ">=8" + } + }, "node_modules/supports-color": { "version": "5.5.0", "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-5.5.0.tgz", @@ -2013,6 +2216,12 @@ "node": ">= 0.6" } }, + "node_modules/typed-query-selector": { + "version": "2.12.2", + "resolved": "https://registry.npmjs.org/typed-query-selector/-/typed-query-selector-2.12.2.tgz", + "integrity": "sha512-EOPFbyIub4ngnEdqi2yOcNeDLaX/0jcE1JoAXQDDMIthap7FoN795lc/SHfIq2d416VufXpM8z/lD+WRm2gfOQ==", + "license": "MIT" + }, "node_modules/typedarray": { "version": "0.0.6", "resolved": "https://registry.npmjs.org/typedarray/-/typedarray-0.0.6.tgz", @@ -2059,10 +2268,33 @@ "node": ">= 0.8" } }, + "node_modules/webdriver-bidi-protocol": { + "version": "0.4.2", + "resolved": "https://registry.npmjs.org/webdriver-bidi-protocol/-/webdriver-bidi-protocol-0.4.2.tgz", + "integrity": 
"sha512-VSV+fzfChirL3e7jay2yUC7B4HQCGtEWEg/MSSQbK+qWbqeGlRLlXTzPpYr3XGUvbpDHumWZBJxgesg4N7dbtA==", + "license": "Apache-2.0" + }, + "node_modules/wrap-ansi": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz", + "integrity": "sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==", + "license": "MIT", + "dependencies": { + "ansi-styles": "^4.0.0", + "string-width": "^4.1.0", + "strip-ansi": "^6.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/wrap-ansi?sponsor=1" + } + }, "node_modules/ws": { - "version": "8.20.0", - "resolved": "https://registry.npmjs.org/ws/-/ws-8.20.0.tgz", - "integrity": "sha512-sAt8BhgNbzCtgGbt2OxmpuryO63ZoDk/sqaB/znQm94T4fCEsy/yV+7CdC1kJhOU9lboAEU7R3kquuycDoibVA==", + "version": "8.21.0", + "resolved": "https://registry.npmjs.org/ws/-/ws-8.21.0.tgz", + "integrity": "sha512-Vsp28b7DRcimFQvrqu2Wek3z1iYxDCWqHYB8Qsnk/S4RfaCQzPGPyBNuVjJV3cd6UiKtUtp6sNM77gWvzcCH+g==", "license": "MIT", "engines": { "node": ">=10.0.0" @@ -2079,6 +2311,51 @@ "optional": true } } + }, + "node_modules/y18n": { + "version": "5.0.8", + "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz", + "integrity": "sha512-0pfFzegeDWJHJIAmTLRP2DwHjdF5s7jo9tuztdQxAhINCdvS+3nGINqPd00AphqJR/0LhANUS6/+7SCb98YOfA==", + "license": "ISC", + "engines": { + "node": ">=10" + } + }, + "node_modules/yargs": { + "version": "17.7.2", + "resolved": "https://registry.npmjs.org/yargs/-/yargs-17.7.2.tgz", + "integrity": "sha512-7dSzzRQ++CKnNI/krKnYRV7JKKPUXMEh61soaHKg9mrWEhzFWhFnxPxGl+69cD1Ou63C13NUPCnmIcrvqCuM6w==", + "license": "MIT", + "dependencies": { + "cliui": "^8.0.1", + "escalade": "^3.1.1", + "get-caller-file": "^2.0.5", + "require-directory": "^2.1.1", + "string-width": "^4.2.3", + "y18n": "^5.0.5", + "yargs-parser": "^21.1.1" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/yargs-parser": { + "version": "21.1.1", + "resolved": 
"https://registry.npmjs.org/yargs-parser/-/yargs-parser-21.1.1.tgz", + "integrity": "sha512-tVpsJW7DdjecAiFpbIB1e3qxIQsE6NoPc5/eTdrbbIC4h0LVsWhnoa3g+m2HclBIujHzsxZ4VJVA+GUuc2/LBw==", + "license": "ISC", + "engines": { + "node": ">=12" + } + }, + "node_modules/zod": { + "version": "3.25.76", + "resolved": "https://registry.npmjs.org/zod/-/zod-3.25.76.tgz", + "integrity": "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ==", + "license": "MIT", + "funding": { + "url": "https://github.com/sponsors/colinhacks" + } } } } diff --git a/backend/package.json b/backend/package.json index 73ac269..7a30368 100644 --- a/backend/package.json +++ b/backend/package.json @@ -14,6 +14,8 @@ "lint:routes": "node scripts/check-route-auth.js", "import:content": "node scripts/import-content.js", "import:exam-tasks": "node scripts/import-exam-tasks.js", + "index:textbooks": "node scripts/index-textbooks.js", + "index:textbooks:full": "node scripts/index-textbooks-headless.js", "test": "node --test tests/*.test.js", "hooks:install": "sh ../scripts/install-hooks.sh" }, @@ -26,6 +28,7 @@ "js-yaml": "^4.1.1", "jsonwebtoken": "^9.0.2", "multer": "^2.1.1", + "puppeteer-core": "^25.1.0", "sharp": "^0.34.5", "ws": "^8.20.0" }, diff --git a/backend/scripts/index-textbooks-headless.js b/backend/scripts/index-textbooks-headless.js new file mode 100644 index 0000000..d4247f8 --- /dev/null +++ b/backend/scripts/index-textbooks-headless.js @@ -0,0 +1,86 @@ +'use strict'; +/* index-textbooks-headless.js — полный RAG-индекс: рендерит каждый учебник + * настоящим браузером (puppeteer-core + системный Chrome/Edge) через локальный + * сервер и забирает РЕНДЕРНЫЙ текст параграфов. Покрывает и JS-рендеримые + * учебники (математика/физика-движки), которых нет в статическом HTML. + * + * Требует запущенный сервер (localhost:3000). Долгая операция (минуты). 
/* Usage: node backend/scripts/index-textbooks-headless.js
 *
 * Builds the full RAG index by rendering every active textbook in a real
 * browser (puppeteer-core driving a system Chrome/Edge/Chromium) against the
 * running local server, and storing the RENDERED paragraph text.
 * Chunks are replaced only for textbooks that rendered successfully; a book
 * that fails to render keeps whatever chunks it already had.
 *
 * Environment:
 *   ASSISTANT_INDEX_BASE      base URL of the running server
 *                             (default http://localhost:$PORT, PORT default 3000)
 *   JWT_SECRET                required, used to sign the service token
 *   PUPPETEER_EXECUTABLE_PATH / CHROME_PATH
 *                             optional explicit browser binary; checked before
 *                             the built-in list of well-known install paths
 */
const fs = require('fs');
const path = require('path');
// dotenv must run before the db module is loaded so that it sees the .env values.
require('dotenv').config({ path: path.join(__dirname, '..', '.env') });
const jwt = require('jsonwebtoken');
const db = require('../src/db/db');

/** Sections whose rendered text is shorter than this are ignored. */
const MIN_CHUNK_CHARS = 80;
/** Stored chunk text is truncated to this many characters. */
const MAX_CHUNK_CHARS = 2000;
/** Stored section titles are truncated to this many characters. */
const MAX_SECTION_TITLE_CHARS = 160;

/**
 * Issues a service JWT so the headless browser can open textbook pages, which
 * require a logged-in user. Prefers the lowest-id non-banned admin/teacher and
 * falls back to the lowest-id non-banned user of any role.
 *
 * @returns {string|null} signed HS256 token valid for 4 hours, or null when
 *   JWT_SECRET is unset or no suitable user exists.
 */
function authToken() {
  const secret = process.env.JWT_SECRET;
  if (!secret) return null;
  const user =
    db.prepare("SELECT id, role, token_version FROM users WHERE is_banned = 0 AND role IN ('admin','teacher') ORDER BY id LIMIT 1").get() ||
    db.prepare('SELECT id, role, token_version FROM users WHERE is_banned = 0 ORDER BY id LIMIT 1').get();
  if (!user) return null;
  return jwt.sign(
    { id: user.id, role: user.role, tv: user.token_version },
    secret,
    { algorithm: 'HS256', expiresIn: '4h' },
  );
}

// `||` (not `??`) on purpose: an empty env value should fall back to the default too.
const BASE = process.env.ASSISTANT_INDEX_BASE || `http://localhost:${process.env.PORT || 3000}`;

/** Candidate browser binaries, in priority order; unset env overrides are dropped. */
const BROWSERS = [
  process.env.PUPPETEER_EXECUTABLE_PATH,
  process.env.CHROME_PATH,
  'C:/Program Files/Google/Chrome/Application/chrome.exe',
  'C:/Program Files (x86)/Google/Chrome/Application/chrome.exe',
  'C:/Program Files (x86)/Microsoft/Edge/Application/msedge.exe',
  'C:/Program Files/Microsoft/Edge/Application/msedge.exe',
  '/usr/bin/google-chrome',
  '/usr/bin/google-chrome-stable',
  '/usr/bin/chromium',
  '/usr/bin/chromium-browser',
  '/usr/bin/microsoft-edge',
  '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
  '/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge',
].filter(Boolean);

const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

/**
 * Renders one textbook and extracts its section texts.
 *
 * If the page exposes a section picker (`.psel-card[data-id]`), each card is
 * clicked in turn and the text of `.sec.active` is captured; otherwise every
 * `.sec` element on the page is captured as an untitled chunk.
 *
 * @param {object} page - puppeteer Page to drive.
 * @param {string} slug - textbook slug, appended to `${BASE}/textbook/`.
 * @returns {Promise<Array<{section: string, text: string}>>} possibly empty.
 * @throws if navigation or the initial DOM queries fail; a failure inside a
 *   single section is logged and that section is skipped.
 */
async function collectChunks(page, slug) {
  const chunks = [];
  await page.goto(`${BASE}/textbook/${slug}`, { waitUntil: 'networkidle2', timeout: 25000 });
  // A timeout here is not fatal: the selectors below simply yield nothing.
  await page.waitForSelector('.psel-card, .sec', { timeout: 12000 }).catch(() => {});
  await sleep(400);

  const cards = await page.$$eval('.psel-card[data-id]', (els) =>
    els.map((el) => ({
      id: el.dataset.id,
      name: (el.querySelector('.psel-name')?.textContent || '').trim(),
    })));

  if (cards.length === 0) {
    const texts = await page.$$eval('.sec', (els) =>
      els.map((el) => el.innerText.replace(/\s+/g, ' ').trim()));
    for (const text of texts) {
      if (text && text.length >= MIN_CHUNK_CHARS) {
        chunks.push({ section: '', text: text.slice(0, MAX_CHUNK_CHARS) });
      }
    }
    return chunks;
  }

  for (const card of cards) {
    try {
      // Match on dataset.id instead of interpolating the id into a selector
      // string, so ids containing quotes or brackets cannot break the query.
      const clicked = await page.evaluate((id) => {
        const target = [...document.querySelectorAll('.psel-card[data-id]')]
          .find((el) => el.dataset.id === id);
        if (!target) return false;
        target.click();
        return true;
      }, card.id);
      // Without a click the active section is still the previous one; reading
      // it would store the wrong text under this card's title.
      if (!clicked) continue;
      // NOTE(review): fixed delay for the section to render; may be racy on slow pages.
      await sleep(550);
      const text = await page.evaluate(() => {
        const active = document.querySelector('.sec.active');
        return active ? active.innerText.replace(/\s+/g, ' ').trim() : '';
      });
      if (text && text.length >= MIN_CHUNK_CHARS) {
        chunks.push({
          section: card.name.slice(0, MAX_SECTION_TITLE_CHARS),
          text: text.slice(0, MAX_CHUNK_CHARS),
        });
      }
    } catch (err) {
      // Best effort: one broken section must not abort the whole book.
      console.warn(` ${slug}: раздел "${card.name}" пропущен: ${err.message}`);
    }
  }
  return chunks;
}

/**
 * Builds a function that atomically replaces all stored chunks of one book.
 * The delete and the inserts run inside a single BEGIN/COMMIT so a failure
 * mid-way cannot leave the book with its old chunks removed and only part of
 * the new ones written.
 *
 * @returns {(book: {slug: string, title: string}, chunks: Array<{section: string, text: string}>) => void}
 */
function createChunkWriter() {
  const del = db.prepare('DELETE FROM textbook_chunks WHERE slug = ?');
  const ins = db.prepare('INSERT INTO textbook_chunks (slug, textbook_title, section_title, text) VALUES (?, ?, ?, ?)');
  const begin = db.prepare('BEGIN');
  const commit = db.prepare('COMMIT');
  const rollback = db.prepare('ROLLBACK');

  return (book, chunks) => {
    begin.run();
    try {
      del.run(book.slug);
      for (const chunk of chunks) {
        ins.run(book.slug, book.title || book.slug, chunk.section, chunk.text);
      }
      commit.run();
    } catch (err) {
      // SQLite may already have rolled back on its own; ignore a failing ROLLBACK
      // so the original error is the one that propagates.
      try { rollback.run(); } catch (rollbackErr) { /* no active transaction */ }
      throw err;
    }
  };
}

/**
 * Entry point: renders every active textbook and rewrites its chunks.
 * Exits the process with code 1 when no browser binary or no token is
 * available, and with code 0 after a completed run. The browser is always
 * closed, even when indexing throws.
 *
 * @returns {Promise<void>}
 */
async function run() {
  const puppeteer = require('puppeteer-core');
  const exe = BROWSERS.find((candidate) => fs.existsSync(candidate));
  if (!exe) { console.error('Браузер не найден (Chrome/Edge)'); process.exit(1); }

  const books = db.prepare('SELECT slug, title FROM textbooks WHERE is_active = 1 ORDER BY slug').all();
  const replaceChunks = createChunkWriter();

  const token = authToken();
  if (!token) { console.error('Не удалось выпустить токен (нет пользователя или JWT_SECRET)'); process.exit(1); }

  // SECURITY NOTE(review): --no-sandbox disables Chrome's sandbox. Tolerable only
  // because this browser is pointed exclusively at BASE; do not reuse this
  // launch configuration for untrusted pages.
  const browser = await puppeteer.launch({
    executablePath: exe,
    headless: true,
    args: ['--no-sandbox', '--disable-gpu', '--disable-dev-shm-usage'],
  });

  let totalChunks = 0;
  let okBooks = 0;
  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1100, height: 900 });
    // Seed the auth token before any page script runs, on every new document.
    await page.evaluateOnNewDocument((t) => {
      try { localStorage.setItem('ls_token', t); } catch (e) { /* storage unavailable */ }
    }, token);

    for (const book of books) {
      let chunks = [];
      try {
        chunks = await collectChunks(page, book.slug);
      } catch (err) {
        // The book did not render: keep its existing chunks, but say why.
        console.warn(` ${book.slug}: ошибка рендера: ${err.message}`);
      }

      if (chunks.length > 0) {
        replaceChunks(book, chunks);
        okBooks += 1;
        totalChunks += chunks.length;
        console.log(` ${book.slug}: ${chunks.length}`);
      } else {
        console.log(` ${book.slug}: — (нет рендера, оставлено как есть)`);
      }
    }
  } finally {
    // Always release the browser process; a close failure must not mask the
    // error (if any) that brought us here.
    await browser.close().catch(() => {});
  }

  console.log(`[headless] готово: ${okBooks}/${books.length} учебников, ${totalChunks} чанков (перезаписаны).`);
  process.exit(0);
}

run().catch((e) => { console.error('[headless] ошибка:', e.message); process.exit(1); });