Files
Learn_System/backend/scripts/gen-exam-textbook-sections.js
T
Maxim Dolgolyov c9f3eed8ed fix(exam): классификатор § — fallback при 0 совпадений + учёт opts_json; таксономия в репо
- classify(): bestScore стартует с 0 (нужно совпадение>0), иначе берётся явный fallback
  (последнее правило), а не первое. Чинит свал theory-statements→§15 и word-problems→проценты.
- optsText(): анализ текста вариантов ответа (формат пар [label, html]) — theory-statements
  размечаются по содержанию утверждений.
- alg-word-problems fallback → algebra-7-ch3 §16 (задачи уравнением), не проценты.
- Таксономия §: перенесена с gitignore-пути data/ на отслеживаемый
  backend/scripts/exam-textbook-sections.json + генератор gen-exam-textbook-sections.js.
- Результат: 784/800 (98%) размечено, спреды по подтемам корректны.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-03 16:29:40 +03:00

88 lines
4.8 KiB
JavaScript

#!/usr/bin/env node
/**
* gen-exam-textbook-sections.js
*
* Regenerates the §-section taxonomy of the grades 5-9 math-family textbooks,
* used by tag-exam-textbook.js (the exam→textbook classifier).
*
* Outputs:
* backend/scripts/exam-textbook-sections.json — machine-readable (the classifier reads this)
* plans/exam-textbook-links/taxonomy.md — human-readable reference
*
* Re-run whenever a grade 5-9 algebra/geometry/math chapter gains or renames a §.
* Note: math-5/6 are engine-rendered (math6_engine.js builds <section id="sec-<p.id>">
* from window.M6.paras), so their §s are NOT extracted statically here. Any chapter in
* which no static sec-* id is found is emitted with an engine:'math6' marker; the
* classifier links such chapters at chapter level.
*
* Usage: node backend/scripts/gen-exam-textbook-sections.js
*/
'use strict';
const fs = require('fs');
const path = require('path');
// Directory containing the textbook chapter HTML files that are scanned for sections.
const DIR = path.join(__dirname, '../../frontend/textbooks');
// Destination of the human-readable Markdown taxonomy.
const OUT_MD = path.join(__dirname, '../../plans/exam-textbook-links/taxonomy.md');
// Destination of the machine-readable JSON taxonomy (written next to this script).
const OUT_JSON = path.join(__dirname, 'exam-textbook-sections.json');
// Chapter slug -> HTML file name (taken from the textbooks table).
// The array order is the teaching order and is the order in which chapters are emitted.
const CHAPTERS = [
  // math-5
  ['math-5-ch1', 'math_5_ch1.html'],
  ['math-5-ch2', 'math_5_ch2.html'],
  ['math-5-ch3', 'math_5_ch3.html'],

  // math-6
  ['math-6-ch1', 'math_6_ch1.html'],
  ['math-6-ch2', 'math_6_ch2.html'],
  ['math-6-ch3', 'math_6_ch3.html'],
  ['math-6-ch4', 'math_6_ch4.html'],
  ['math-6-ch5', 'math_6_ch5.html'],
  ['math-6-ch6', 'math_6_ch6.html'],

  // algebra-7
  ['algebra-7-ch1', 'algebra_7_ch1.html'],
  ['algebra-7-ch2', 'algebra_7_ch2.html'],
  ['algebra-7-ch3', 'algebra_7_ch3.html'],
  ['algebra-7-ch4', 'algebra_7_ch4.html'],

  // geometry-7
  ['geometry-7-ch1', 'geometry_7_ch1.html'],
  ['geometry-7-ch2', 'geometry_7_ch2.html'],
  ['geometry-7-ch3', 'geometry_7_ch3.html'],
  ['geometry-7-ch4', 'geometry_7_ch4.html'],
  ['geometry-7-ch5', 'geometry_7_ch5.html'],

  // algebra-8
  // NOTE(review): the first chapter's file has no "_ch1" suffix — confirm against the textbooks table.
  ['algebra-8-ch1', 'algebra_8.html'],
  ['algebra-8-ch2', 'algebra_8_ch2.html'],
  ['algebra-8-ch3', 'algebra_8_ch3.html'],

  // geometry-8
  ['geometry-8-ch1', 'geometry_8_ch1.html'],
  ['geometry-8-ch2', 'geometry_8_ch2.html'],
  ['geometry-8-ch3', 'geometry_8_ch3.html'],
  ['geometry-8-ch4', 'geometry_8_ch4.html'],

  // algebra-9
  ['algebra-9-ch1', 'algebra_9_ch1.html'],
  ['algebra-9-ch2', 'algebra_9_ch2.html'],
  ['algebra-9-ch3', 'algebra_9_ch3.html'],
  ['algebra-9-ch4', 'algebra_9_ch4.html'],

  // geometry-9
  ['geometry-9-ch1', 'geometry_9_ch1.html'],
  ['geometry-9-ch2', 'geometry_9_ch2.html'],
  ['geometry-9-ch3', 'geometry_9_ch3.html'],
  ['geometry-9-ch4', 'geometry_9_ch4.html'],
];
// Named character references decoded by strip(). Names are case-sensitive;
// any name not listed here is left untouched in the output.
const HTML_ENTITIES = new Map([
  ['amp', '&'], ['lt', '<'], ['gt', '>'], ['quot', '"'], ['apos', "'"],
  ['nbsp', '\u00a0'], ['sect', '\u00a7'], ['laquo', '\u00ab'], ['raquo', '\u00bb'],
  ['deg', '\u00b0'], ['plusmn', '\u00b1'], ['middot', '\u00b7'], ['times', '\u00d7'],
  ['ndash', '\u2013'], ['mdash', '\u2014'], ['hellip', '\u2026'], ['minus', '\u2212'],
  ['ne', '\u2260'], ['le', '\u2264'], ['ge', '\u2265'],
]);

/**
 * Converts an HTML fragment to single-line plain text.
 *
 * Steps, in order: remove tags, decode character references (numeric ones and
 * the named ones listed in HTML_ENTITIES), collapse every whitespace run to a
 * single space, and trim both ends.
 *
 * @param {*} html - HTML fragment; null/undefined are treated as an empty string,
 *   any other value is converted with String().
 * @returns {string} The plain-text content.
 */
function strip(html) {
  if (html == null) return '';
  return String(html)
    .replace(/<[^>]+>/g, '')
    // Decode in a single pass, after tag removal, so "&amp;lt;" becomes "&lt;"
    // (not "<") and a decoded "<" can never be mistaken for markup.
    .replace(/&(#[xX][0-9a-fA-F]+|#\d+|[A-Za-z][A-Za-z0-9]*);/g, (whole, ref) => {
      if (ref[0] !== '#') {
        return HTML_ENTITIES.has(ref) ? HTML_ENTITIES.get(ref) : whole;
      }
      const isHex = ref[1] === 'x' || ref[1] === 'X';
      const code = isHex ? Number.parseInt(ref.slice(2), 16) : Number.parseInt(ref.slice(1), 10);
      // String.fromCodePoint throws a RangeError above U+10FFFF; keep such input verbatim.
      return code <= 0x10ffff ? String.fromCodePoint(code) : whole;
    })
    // \s also matches U+00A0, so a decoded &nbsp; collapses to an ordinary space.
    .replace(/\s+/g, ' ')
    .trim();
}
/**
 * Finds every <section>/<div> opening tag whose id starts with "sec-".
 *
 * @param {string} html - Full HTML text of one chapter file.
 * @returns {{id: string, start: number}[]} The id and the offset of each
 *   matching opening tag, in document order.
 */
function findSectionAnchors(html) {
  // A fresh /g regex per call, so no lastIndex state leaks between chapters.
  const secRe = /<(?:section|div)\b[^>]*\sid="(sec-(?:p\d+|final\d*|[a-z0-9-]+))"[^>]*>/gi;
  const anchors = [];
  let m;
  while ((m = secRe.exec(html)) !== null) anchors.push({ id: m[1], start: m.index });
  return anchors;
}

// Markdown report lines (joined with '\n' when written).
const lines = ['# §-таксономия учебников 5–9 (математика) — эталон для классификатора экзамена math9', ''];
// JSON records: one {book, chapter_slug, subject, grade, para_id, num, title} per
// "sec-p<N>" section, or one {…, engine: 'math6', note} per chapter without static sections.
const json = [];
let prevBook = '';

for (const [slug, file] of CHAPTERS) {
  const book = slug.replace(/-ch\d+$/, '');      // e.g. "algebra-7"
  const subject = book.replace(/-\d+$/, '');     // math|algebra|geometry
  const grade = Number((book.match(/-(\d+)$/) || [])[1]) || null;

  // Start a new "## book" heading whenever the book changes.
  if (book !== prevBook) {
    lines.push(`\n## ${book}`);
    prevBook = book;
  }

  const p = path.join(DIR, file);
  if (!fs.existsSync(p)) {
    lines.push(`### ${slug} (FILE MISSING: ${file})`);
    // The chapter is also absent from the JSON, so make that visible on the console
    // instead of only in the Markdown report.
    console.warn(`Warning: ${slug}: file not found (${p}); chapter skipped`);
    continue;
  }

  const html = fs.readFileSync(p, 'utf8');
  const tm = html.match(/<title>([^<]*)<\/title>/i);
  // Separate the slug from the title; fall back to the file name when the
  // <title> is missing or empty.
  const heading = (tm && strip(tm[1])) || file;
  lines.push(`### ${slug} — ${heading}`);

  const secs = findSectionAnchors(html);
  if (!secs.length) {
    // No static section ids: emit one chapter-level record with the engine marker.
    lines.push(` (движок math6: статических sec[id] нет; якоря строятся из window.M6.paras → id="sec-<p.id>")`);
    json.push({ book, chapter_slug: slug, subject, grade, engine: 'math6', note: 'paras in window.M6 config; anchors sec-<p.id>' });
    continue;
  }

  for (let i = 0; i < secs.length; i++) {
    // A section's text runs up to the next anchor; the last one is capped at 4000 chars.
    const end = secs[i + 1] ? secs[i + 1].start : secs[i].start + 4000;
    const seg = html.slice(secs[i].start, end);
    const numM = seg.match(/class="sec-num"[^>]*>([\s\S]*?)<\//i);
    const hM = seg.match(/class="sec-h"[^>]*>([\s\S]*?)<\//i);
    const paraId = secs[i].id.replace(/^sec-/, ''); // p10 | final3
    const num = numM ? strip(numM[1]) : '';
    const title = hM ? strip(hM[1]) : '';
    lines.push(` ${secs[i].id.padEnd(12)} ${num ? `[${num}] ` : ''}${title}`);
    // Every anchor is listed in the Markdown; only "p<N>" ids are written to the JSON.
    if (/^p\d+$/.test(paraId)) {
      json.push({ book, chapter_slug: slug, subject, grade, para_id: paraId, num, title });
    }
  }
}

fs.mkdirSync(path.dirname(OUT_MD), { recursive: true });
fs.writeFileSync(OUT_MD, lines.join('\n'), 'utf8');
fs.writeFileSync(OUT_JSON, JSON.stringify(json, null, 2), 'utf8');
console.log('Wrote', OUT_MD);
console.log('Wrote', OUT_JSON, `(${json.length} sections)`);