feat(assistant): headless-RAG — индексация JS-рендеримых учебников

scripts/index-textbooks-headless.js: puppeteer-core + системный Chrome/Edge
рендерит каждый учебник через локальный сервер (служебный JWT в localStorage,
т.к. /textbook требует логина), кликает по параграфам и забирает рендерный
текст движков (математика/физика и т.п.) в textbook_chunks. Дополняет
статический индексатор. npm: index:textbooks / index:textbooks:full (headless).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Maxim Dolgolyov
2026-06-04 18:27:40 +03:00
parent 2252bbd666
commit 0119ea0f15
3 changed files with 369 additions and 3 deletions
+280 -3
View File
@@ -16,6 +16,7 @@
"js-yaml": "^4.1.1",
"jsonwebtoken": "^9.0.2",
"multer": "^2.1.1",
"puppeteer-core": "^25.1.0",
"sharp": "^0.34.5",
"ws": "^8.20.0"
},
@@ -498,6 +499,30 @@
"url": "https://opencollective.com/libvips"
}
},
"node_modules/@puppeteer/browsers": {
"version": "3.0.4",
"resolved": "https://registry.npmjs.org/@puppeteer/browsers/-/browsers-3.0.4.tgz",
"integrity": "sha512-HGM8iAmGTf+Y7t0373szVbTmt3d7vPkYL/1bpOkOFO0YUYLgSeuYBCzESklogNPvOBnZ/MRD5f07OkpqH1trtA==",
"license": "Apache-2.0",
"dependencies": {
"modern-tar": "^0.7.6",
"yargs": "^17.7.2"
},
"bin": {
"browsers": "lib/main-cli.js"
},
"engines": {
"node": ">=22.12.0"
},
"peerDependencies": {
"proxy-agent": ">=8.0.1"
},
"peerDependenciesMeta": {
"proxy-agent": {
"optional": true
}
}
},
"node_modules/accepts": {
"version": "1.3.8",
"resolved": "https://registry.npmjs.org/accepts/-/accepts-1.3.8.tgz",
@@ -511,6 +536,30 @@
"node": ">= 0.6"
}
},
"node_modules/ansi-regex": {
"version": "5.0.1",
"resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz",
"integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==",
"license": "MIT",
"engines": {
"node": ">=8"
}
},
"node_modules/ansi-styles": {
"version": "4.3.0",
"resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
"integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
"license": "MIT",
"dependencies": {
"color-convert": "^2.0.1"
},
"engines": {
"node": ">=8"
},
"funding": {
"url": "https://github.com/chalk/ansi-styles?sponsor=1"
}
},
"node_modules/anymatch": {
"version": "3.1.3",
"resolved": "https://registry.npmjs.org/anymatch/-/anymatch-3.1.3.tgz",
@@ -708,6 +757,54 @@
"fsevents": "~2.3.2"
}
},
"node_modules/chromium-bidi": {
"version": "16.0.1",
"resolved": "https://registry.npmjs.org/chromium-bidi/-/chromium-bidi-16.0.1.tgz",
"integrity": "sha512-J63PGu/9PpeCwLIcKYyzWP6yaVL5pxuBc0shlYCYM8BaAkmlwiQboXO1iNbOgSDbVklEyYFfNEcHD8oOAWacUA==",
"license": "Apache-2.0",
"dependencies": {
"mitt": "^3.0.1",
"zod": "^3.24.1"
},
"engines": {
"node": ">=20.19.0 <22.0.0 || >=22.12.0"
},
"peerDependencies": {
"devtools-protocol": "*"
}
},
"node_modules/cliui": {
"version": "8.0.1",
"resolved": "https://registry.npmjs.org/cliui/-/cliui-8.0.1.tgz",
"integrity": "sha512-BSeNnyus75C4//NQ9gQt1/csTXyo/8Sb+afLAkzAptFuMsod9HFokGNudZpi/oQV73hnVK+sR+5PVRMd+Dr7YQ==",
"license": "ISC",
"dependencies": {
"string-width": "^4.2.0",
"strip-ansi": "^6.0.1",
"wrap-ansi": "^7.0.0"
},
"engines": {
"node": ">=12"
}
},
"node_modules/color-convert": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
"integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
"license": "MIT",
"dependencies": {
"color-name": "~1.1.4"
},
"engines": {
"node": ">=7.0.0"
}
},
"node_modules/color-name": {
"version": "1.1.4",
"resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
"integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
"license": "MIT"
},
"node_modules/compressible": {
"version": "2.0.18",
"resolved": "https://registry.npmjs.org/compressible/-/compressible-2.0.18.tgz",
@@ -852,6 +949,12 @@
"node": ">=8"
}
},
"node_modules/devtools-protocol": {
"version": "0.0.1624250",
"resolved": "https://registry.npmjs.org/devtools-protocol/-/devtools-protocol-0.0.1624250.tgz",
"integrity": "sha512-YFAat/lOiIk0ARmBweG+ygrEcbZrq5B9urRyUoeQKp53MlidHXE2TmTbxKcaXoQj7u/aX+jebDO4BW55rs0WwA==",
"license": "BSD-3-Clause"
},
"node_modules/dotenv": {
"version": "16.6.1",
"resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.6.1.tgz",
@@ -893,6 +996,12 @@
"integrity": "sha512-WMwm9LhRUo+WUaRN+vRuETqG89IgZphVSNkdFgeb6sS/E4OrDIN7t48CAewSHXc6C8lefD8KKfr5vY61brQlow==",
"license": "MIT"
},
"node_modules/emoji-regex": {
"version": "8.0.0",
"resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz",
"integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==",
"license": "MIT"
},
"node_modules/encodeurl": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-2.0.0.tgz",
@@ -932,6 +1041,15 @@
"node": ">= 0.4"
}
},
"node_modules/escalade": {
"version": "3.2.0",
"resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz",
"integrity": "sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==",
"license": "MIT",
"engines": {
"node": ">=6"
}
},
"node_modules/escape-html": {
"version": "1.0.3",
"resolved": "https://registry.npmjs.org/escape-html/-/escape-html-1.0.3.tgz",
@@ -1066,6 +1184,15 @@
"url": "https://github.com/sponsors/ljharb"
}
},
"node_modules/get-caller-file": {
"version": "2.0.5",
"resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-2.0.5.tgz",
"integrity": "sha512-DyFP3BM/3YHTQOCUL/w0OZHR0lpKeGrxotcHWcqNEdnltqFwXVfhEBQ94eIo34AfQpo0rGki4cyIiftY06h2Fg==",
"license": "ISC",
"engines": {
"node": "6.* || 8.* || >= 10.*"
}
},
"node_modules/get-intrinsic": {
"version": "1.3.0",
"resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.3.0.tgz",
@@ -1239,6 +1366,15 @@
"node": ">=0.10.0"
}
},
"node_modules/is-fullwidth-code-point": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz",
"integrity": "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==",
"license": "MIT",
"engines": {
"node": ">=8"
}
},
"node_modules/is-glob": {
"version": "4.0.3",
"resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.3.tgz",
@@ -1450,6 +1586,21 @@
"url": "https://github.com/sponsors/isaacs"
}
},
"node_modules/mitt": {
"version": "3.0.1",
"resolved": "https://registry.npmjs.org/mitt/-/mitt-3.0.1.tgz",
"integrity": "sha512-vKivATfr97l2/QBCYAkXYDbrIWPM2IIKEl7YPhjCvKlG3kE2gm+uBo6nEXK3M5/Ffh/FLpKExzOQ3JJoJGFKBw==",
"license": "MIT"
},
"node_modules/modern-tar": {
"version": "0.7.6",
"resolved": "https://registry.npmjs.org/modern-tar/-/modern-tar-0.7.6.tgz",
"integrity": "sha512-sweCIVXzx1aIGTCdzcMlSZt1h8k5Tmk08VNAuRk3IU28XamGiOH5ypi11g6De2CH7PhYqSSnGy2A/EFhbWnVKg==",
"license": "MIT",
"engines": {
"node": ">=18.0.0"
}
},
"node_modules/ms": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz",
@@ -1638,6 +1789,23 @@
"dev": true,
"license": "MIT"
},
"node_modules/puppeteer-core": {
"version": "25.1.0",
"resolved": "https://registry.npmjs.org/puppeteer-core/-/puppeteer-core-25.1.0.tgz",
"integrity": "sha512-jKzy5y4WG6uNuFbTWgW1D7mqoT9o0nllc/6a1DGF775T1mPmgw3scdFEtEq67yVFikavQmbYq6NLfbTfxHSlqQ==",
"license": "Apache-2.0",
"dependencies": {
"@puppeteer/browsers": "3.0.4",
"chromium-bidi": "16.0.1",
"devtools-protocol": "0.0.1624250",
"typed-query-selector": "^2.12.2",
"webdriver-bidi-protocol": "0.4.2",
"ws": "^8.21.0"
},
"engines": {
"node": ">=22.12.0"
}
},
"node_modules/qs": {
"version": "6.14.2",
"resolved": "https://registry.npmjs.org/qs/-/qs-6.14.2.tgz",
@@ -1704,6 +1872,15 @@
"node": ">=8.10.0"
}
},
"node_modules/require-directory": {
"version": "2.1.1",
"resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz",
"integrity": "sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==",
"license": "MIT",
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/safe-buffer": {
"version": "5.2.1",
"resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz",
@@ -1948,6 +2125,32 @@
"safe-buffer": "~5.2.0"
}
},
"node_modules/string-width": {
"version": "4.2.3",
"resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz",
"integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==",
"license": "MIT",
"dependencies": {
"emoji-regex": "^8.0.0",
"is-fullwidth-code-point": "^3.0.0",
"strip-ansi": "^6.0.1"
},
"engines": {
"node": ">=8"
}
},
"node_modules/strip-ansi": {
"version": "6.0.1",
"resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz",
"integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==",
"license": "MIT",
"dependencies": {
"ansi-regex": "^5.0.1"
},
"engines": {
"node": ">=8"
}
},
"node_modules/supports-color": {
"version": "5.5.0",
"resolved": "https://registry.npmjs.org/supports-color/-/supports-color-5.5.0.tgz",
@@ -2013,6 +2216,12 @@
"node": ">= 0.6"
}
},
"node_modules/typed-query-selector": {
"version": "2.12.2",
"resolved": "https://registry.npmjs.org/typed-query-selector/-/typed-query-selector-2.12.2.tgz",
"integrity": "sha512-EOPFbyIub4ngnEdqi2yOcNeDLaX/0jcE1JoAXQDDMIthap7FoN795lc/SHfIq2d416VufXpM8z/lD+WRm2gfOQ==",
"license": "MIT"
},
"node_modules/typedarray": {
"version": "0.0.6",
"resolved": "https://registry.npmjs.org/typedarray/-/typedarray-0.0.6.tgz",
@@ -2059,10 +2268,33 @@
"node": ">= 0.8"
}
},
"node_modules/webdriver-bidi-protocol": {
"version": "0.4.2",
"resolved": "https://registry.npmjs.org/webdriver-bidi-protocol/-/webdriver-bidi-protocol-0.4.2.tgz",
"integrity": "sha512-VSV+fzfChirL3e7jay2yUC7B4HQCGtEWEg/MSSQbK+qWbqeGlRLlXTzPpYr3XGUvbpDHumWZBJxgesg4N7dbtA==",
"license": "Apache-2.0"
},
"node_modules/wrap-ansi": {
"version": "7.0.0",
"resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz",
"integrity": "sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==",
"license": "MIT",
"dependencies": {
"ansi-styles": "^4.0.0",
"string-width": "^4.1.0",
"strip-ansi": "^6.0.0"
},
"engines": {
"node": ">=10"
},
"funding": {
"url": "https://github.com/chalk/wrap-ansi?sponsor=1"
}
},
"node_modules/ws": {
"version": "8.20.0",
"resolved": "https://registry.npmjs.org/ws/-/ws-8.20.0.tgz",
"integrity": "sha512-sAt8BhgNbzCtgGbt2OxmpuryO63ZoDk/sqaB/znQm94T4fCEsy/yV+7CdC1kJhOU9lboAEU7R3kquuycDoibVA==",
"version": "8.21.0",
"resolved": "https://registry.npmjs.org/ws/-/ws-8.21.0.tgz",
"integrity": "sha512-Vsp28b7DRcimFQvrqu2Wek3z1iYxDCWqHYB8Qsnk/S4RfaCQzPGPyBNuVjJV3cd6UiKtUtp6sNM77gWvzcCH+g==",
"license": "MIT",
"engines": {
"node": ">=10.0.0"
@@ -2079,6 +2311,51 @@
"optional": true
}
}
},
"node_modules/y18n": {
"version": "5.0.8",
"resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz",
"integrity": "sha512-0pfFzegeDWJHJIAmTLRP2DwHjdF5s7jo9tuztdQxAhINCdvS+3nGINqPd00AphqJR/0LhANUS6/+7SCb98YOfA==",
"license": "ISC",
"engines": {
"node": ">=10"
}
},
"node_modules/yargs": {
"version": "17.7.2",
"resolved": "https://registry.npmjs.org/yargs/-/yargs-17.7.2.tgz",
"integrity": "sha512-7dSzzRQ++CKnNI/krKnYRV7JKKPUXMEh61soaHKg9mrWEhzFWhFnxPxGl+69cD1Ou63C13NUPCnmIcrvqCuM6w==",
"license": "MIT",
"dependencies": {
"cliui": "^8.0.1",
"escalade": "^3.1.1",
"get-caller-file": "^2.0.5",
"require-directory": "^2.1.1",
"string-width": "^4.2.3",
"y18n": "^5.0.5",
"yargs-parser": "^21.1.1"
},
"engines": {
"node": ">=12"
}
},
"node_modules/yargs-parser": {
"version": "21.1.1",
"resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-21.1.1.tgz",
"integrity": "sha512-tVpsJW7DdjecAiFpbIB1e3qxIQsE6NoPc5/eTdrbbIC4h0LVsWhnoa3g+m2HclBIujHzsxZ4VJVA+GUuc2/LBw==",
"license": "ISC",
"engines": {
"node": ">=12"
}
},
"node_modules/zod": {
"version": "3.25.76",
"resolved": "https://registry.npmjs.org/zod/-/zod-3.25.76.tgz",
"integrity": "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ==",
"license": "MIT",
"funding": {
"url": "https://github.com/sponsors/colinhacks"
}
}
}
}
+3
View File
@@ -14,6 +14,8 @@
"lint:routes": "node scripts/check-route-auth.js",
"import:content": "node scripts/import-content.js",
"import:exam-tasks": "node scripts/import-exam-tasks.js",
"index:textbooks": "node scripts/index-textbooks.js",
"index:textbooks:full": "node scripts/index-textbooks-headless.js",
"test": "node --test tests/*.test.js",
"hooks:install": "sh ../scripts/install-hooks.sh"
},
@@ -26,6 +28,7 @@
"js-yaml": "^4.1.1",
"jsonwebtoken": "^9.0.2",
"multer": "^2.1.1",
"puppeteer-core": "^25.1.0",
"sharp": "^0.34.5",
"ws": "^8.20.0"
},
@@ -0,0 +1,86 @@
'use strict';
/* index-textbooks-headless.js — полный RAG-индекс: рендерит каждый учебник
* настоящим браузером (puppeteer-core + системный Chrome/Edge) через локальный
* сервер и забирает РЕНДЕРНЫЙ текст параграфов. Покрывает и JS-рендеримые
* учебники (математика/физика-движки), которых нет в статическом HTML.
*
* Требует запущенный сервер (localhost:3000). Долгая операция (минуты).
* Запуск: node backend/scripts/index-textbooks-headless.js
* Дополняет/замещает чанки только для успешно отрендеренных учебников. */
require('dotenv').config({ path: require('path').join(__dirname, '..', '.env') });
const fs = require('fs');
const path = require('path');
const jwt = require('jsonwebtoken');
const db = require('../src/db/db');
/* Текстовые страницы учебника требуют логина — выпускаем служебный JWT. */
function authToken() {
const u = db.prepare("SELECT id, role, token_version FROM users WHERE is_banned = 0 AND role IN ('admin','teacher') ORDER BY id LIMIT 1").get()
|| db.prepare('SELECT id, role, token_version FROM users WHERE is_banned = 0 ORDER BY id LIMIT 1').get();
if (!u || !process.env.JWT_SECRET) return null;
return jwt.sign({ id: u.id, role: u.role, tv: u.token_version }, process.env.JWT_SECRET, { algorithm: 'HS256', expiresIn: '4h' });
}
const BASE = process.env.ASSISTANT_INDEX_BASE || ('http://localhost:' + (process.env.PORT || 3000));
const BROWSERS = [
'C:/Program Files/Google/Chrome/Application/chrome.exe',
'C:/Program Files (x86)/Google/Chrome/Application/chrome.exe',
'C:/Program Files (x86)/Microsoft/Edge/Application/msedge.exe',
'C:/Program Files/Microsoft/Edge/Application/msedge.exe',
];
const sleep = (ms) => new Promise(r => setTimeout(r, ms));
async function run() {
const puppeteer = require('puppeteer-core');
const exe = BROWSERS.find(p => { try { return fs.existsSync(p); } catch (e) { return false; } });
if (!exe) { console.error('Браузер не найден (Chrome/Edge)'); process.exit(1); }
const books = db.prepare('SELECT slug, title FROM textbooks WHERE is_active = 1 ORDER BY slug').all();
const del = db.prepare('DELETE FROM textbook_chunks WHERE slug = ?');
const ins = db.prepare('INSERT INTO textbook_chunks (slug, textbook_title, section_title, text) VALUES (?, ?, ?, ?)');
const token = authToken();
if (!token) { console.error('Не удалось выпустить токен (нет пользователя или JWT_SECRET)'); process.exit(1); }
const browser = await puppeteer.launch({ executablePath: exe, headless: true, args: ['--no-sandbox', '--disable-gpu', '--disable-dev-shm-usage'] });
const page = await browser.newPage();
await page.setViewport({ width: 1100, height: 900 });
await page.evaluateOnNewDocument((t) => { try { localStorage.setItem('ls_token', t); } catch (e) {} }, token);
let totalChunks = 0, okBooks = 0;
for (const b of books) {
let chunks = [];
try {
await page.goto(`${BASE}/textbook/${b.slug}`, { waitUntil: 'networkidle2', timeout: 25000 });
await page.waitForSelector('.psel-card, .sec', { timeout: 12000 }).catch(() => {});
await sleep(400);
const ids = await page.$$eval('.psel-card[data-id]', els => els.map(e => ({ id: e.dataset.id, name: ((e.querySelector('.psel-name') || {}).textContent || '').trim() })));
if (ids.length) {
for (const s of ids) {
try {
await page.evaluate(id => { const c = document.querySelector('.psel-card[data-id="' + id + '"]'); if (c) c.click(); }, s.id);
await sleep(550);
const text = await page.evaluate(() => { const a = document.querySelector('.sec.active'); return a ? a.innerText.replace(/\s+/g, ' ').trim() : ''; });
if (text && text.length >= 80) chunks.push({ section: s.name.slice(0, 160), text: text.slice(0, 2000) });
} catch (e) {}
}
} else {
const secs = await page.$$eval('.sec', els => els.map(e => e.innerText.replace(/\s+/g, ' ').trim()));
secs.forEach(t => { if (t && t.length >= 80) chunks.push({ section: '', text: t.slice(0, 2000) }); });
}
} catch (e) { /* книга не отрендерилась — оставляем как было */ }
if (chunks.length) {
del.run(b.slug);
for (const c of chunks) ins.run(b.slug, b.title || b.slug, c.section, c.text);
okBooks++; totalChunks += chunks.length;
console.log(` ${b.slug}: ${chunks.length}`);
} else {
console.log(` ${b.slug}: — (нет рендера, оставлено как есть)`);
}
}
await browser.close();
console.log(`[headless] готово: ${okBooks}/${books.length} учебников, ${totalChunks} чанков (перезаписаны).`);
process.exit(0);
}
run().catch(e => { console.error('[headless] ошибка:', e.message); process.exit(1); });