c9f3eed8ed
- classify(): bestScore стартует с 0 (нужно совпадение>0), иначе берётся явный fallback (последнее правило), а не первое. Чинит свал theory-statements→§15 и word-problems→проценты. - optsText(): анализ текста вариантов ответа (формат пар [label, html]) — theory-statements размечаются по содержанию утверждений. - alg-word-problems fallback → algebra-7-ch3 §16 (задачи уравнением), не проценты. - Таксономия §: перенесена с gitignore-пути data/ на отслеживаемый backend/scripts/exam-textbook-sections.json + генератор gen-exam-textbook-sections.js. - Результат: 784/800 (98%) размечено, спреды по подтемам корректны. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
88 lines
4.8 KiB
JavaScript
88 lines
4.8 KiB
JavaScript
#!/usr/bin/env node
|
|
/**
 * gen-exam-textbook-sections.js
 *
 * Regenerates the §-section taxonomy of the grades 5-9 math-family textbooks,
 * used by tag-exam-textbook.js (the exam→textbook classifier).
 *
 * Outputs:
 *   backend/scripts/exam-textbook-sections.json — machine-readable (the classifier reads this)
 *   plans/exam-textbook-links/taxonomy.md — human-readable reference
 *
 * Re-run whenever a grade 5-9 algebra/geometry/math chapter gains or renames a §.
 * Note: math-5/6 are engine-rendered (math6_engine.js builds <section id="sec-<p.id>">
 * from window.M6.paras) — their §s are NOT extracted statically here (emitted with
 * engine:'math6' marker); the classifier links them at chapter level.
 *
 * Usage: node backend/scripts/gen-exam-textbook-sections.js
 */
|
|
'use strict';
|
|
const fs = require('fs');
|
|
const path = require('path');
|
|
|
|
// Machine-readable output, written next to this script.
const OUT_JSON = path.join(__dirname, 'exam-textbook-sections.json');

// Human-readable markdown output, two directory levels above this script.
const OUT_MD = path.join(__dirname, '../../plans/exam-textbook-links/taxonomy.md');

// Directory the chapter HTML files listed in CHAPTERS are read from.
const DIR = path.join(__dirname, '../../frontend/textbooks');
|
|
|
|
// [chapter slug, HTML file name] pairs (from the textbooks table).
// Array order is the teaching order and is the order chapters are processed in.
const CHAPTERS = [
  // math-5
  ['math-5-ch1', 'math_5_ch1.html'],
  ['math-5-ch2', 'math_5_ch2.html'],
  ['math-5-ch3', 'math_5_ch3.html'],

  // math-6
  ['math-6-ch1', 'math_6_ch1.html'],
  ['math-6-ch2', 'math_6_ch2.html'],
  ['math-6-ch3', 'math_6_ch3.html'],
  ['math-6-ch4', 'math_6_ch4.html'],
  ['math-6-ch5', 'math_6_ch5.html'],
  ['math-6-ch6', 'math_6_ch6.html'],

  // algebra-7
  ['algebra-7-ch1', 'algebra_7_ch1.html'],
  ['algebra-7-ch2', 'algebra_7_ch2.html'],
  ['algebra-7-ch3', 'algebra_7_ch3.html'],
  ['algebra-7-ch4', 'algebra_7_ch4.html'],

  // geometry-7
  ['geometry-7-ch1', 'geometry_7_ch1.html'],
  ['geometry-7-ch2', 'geometry_7_ch2.html'],
  ['geometry-7-ch3', 'geometry_7_ch3.html'],
  ['geometry-7-ch4', 'geometry_7_ch4.html'],
  ['geometry-7-ch5', 'geometry_7_ch5.html'],

  // algebra-8 (the first chapter's file name carries no "_ch1" suffix)
  ['algebra-8-ch1', 'algebra_8.html'],
  ['algebra-8-ch2', 'algebra_8_ch2.html'],
  ['algebra-8-ch3', 'algebra_8_ch3.html'],

  // geometry-8
  ['geometry-8-ch1', 'geometry_8_ch1.html'],
  ['geometry-8-ch2', 'geometry_8_ch2.html'],
  ['geometry-8-ch3', 'geometry_8_ch3.html'],
  ['geometry-8-ch4', 'geometry_8_ch4.html'],

  // algebra-9
  ['algebra-9-ch1', 'algebra_9_ch1.html'],
  ['algebra-9-ch2', 'algebra_9_ch2.html'],
  ['algebra-9-ch3', 'algebra_9_ch3.html'],
  ['algebra-9-ch4', 'algebra_9_ch4.html'],

  // geometry-9
  ['geometry-9-ch1', 'geometry_9_ch1.html'],
  ['geometry-9-ch2', 'geometry_9_ch2.html'],
  ['geometry-9-ch3', 'geometry_9_ch3.html'],
  ['geometry-9-ch4', 'geometry_9_ch4.html'],
];
|
|
|
|
/**
 * Reduce an HTML fragment to plain text.
 *
 * The input is coerced to a string, every `<...>` tag is removed, each run of
 * whitespace is collapsed to a single space, and the ends are trimmed.
 * HTML entities are left as written.
 *
 * @param {*} html - HTML fragment (any value; it is passed through String()).
 * @returns {string} The tag-free, whitespace-normalised text.
 */
function strip(html) {
  const withoutTags = String(html).replace(/<[^>]+>/g, '');
  const singleSpaced = withoutTags.replace(/\s+/g, ' ');
  return singleSpaced.trim();
}
|
|
|
|
// ---------------------------------------------------------------------------
// Main script: walk CHAPTERS, collect the sections of each chapter, and write
// the markdown listing (OUT_MD) and the JSON array (OUT_JSON).
// ---------------------------------------------------------------------------

/**
 * Find every static section anchor in a chapter's HTML.
 *
 * An anchor is a <section> or <div> opening tag whose id attribute starts
 * with "sec-".
 *
 * @param {string} html - Full chapter HTML.
 * @returns {{id: string, start: number}[]} Anchors in document order; `start`
 *   is the offset of the opening tag inside `html`.
 */
function findSectionAnchors(html) {
  // Fresh /g regex on every call: exec() advances lastIndex statefully.
  const anchorRe = /<(?:section|div)\b[^>]*\sid="(sec-(?:p\d+|final\d*|[a-z0-9-]+))"[^>]*>/gi;
  const anchors = [];
  let match;
  while ((match = anchorRe.exec(html)) !== null) {
    anchors.push({ id: match[1], start: match.index });
  }
  return anchors;
}

/**
 * Read the number and heading text of one section.
 *
 * The search window runs from this section's anchor to the next anchor. The
 * last section has no following anchor, so only its first 4000 characters are
 * searched.
 *
 * @param {string} html - Full chapter HTML.
 * @param {{id: string, start: number}[]} anchors - Result of findSectionAnchors.
 * @param {number} index - Position of the section in `anchors`.
 * @returns {{num: string, title: string}} Plain-text contents of the first
 *   class="sec-num" and class="sec-h" elements; '' when not found.
 */
function readSectionHeading(html, anchors, index) {
  const { start } = anchors[index];
  const following = anchors[index + 1];
  const segment = html.slice(start, following ? following.start : start + 4000);
  // NOTE: both patterns capture only up to the first closing tag, so text that
  // follows inline markup inside the element is not included.
  const numMatch = segment.match(/class="sec-num"[^>]*>([\s\S]*?)<\//i);
  const headingMatch = segment.match(/class="sec-h"[^>]*>([\s\S]*?)<\//i);
  return {
    num: numMatch ? strip(numMatch[1]) : '',
    title: headingMatch ? strip(headingMatch[1]) : '',
  };
}

const lines = ['# §-таксономия учебников 5–9 (математика) — эталон для классификатора экзамена math9', ''];
const json = []; // [{book, chapter_slug, subject, grade, para_id, num, title}]
let prevBook = '';

for (const [slug, file] of CHAPTERS) {
  const book = slug.replace(/-ch\d+$/, '');
  const subject = book.replace(/-\d+$/, ''); // math|algebra|geometry
  const gradeMatch = book.match(/-(\d+)$/);
  const grade = (gradeMatch && Number(gradeMatch[1])) || null;

  // Start a new "## book" heading whenever the book changes.
  if (book !== prevBook) {
    lines.push(`\n## ${book}`);
    prevBook = book;
  }

  const chapterPath = path.join(DIR, file);
  if (!fs.existsSync(chapterPath)) {
    lines.push(`### ${slug} (FILE MISSING: ${file})`);
    // The chapter gets no JSON entry at all, so make the gap visible on the
    // console and not only inside the generated markdown.
    console.warn(`WARNING: ${slug}: ${chapterPath} not found; chapter omitted from ${path.basename(OUT_JSON)}`);
    continue;
  }

  const html = fs.readFileSync(chapterPath, 'utf8');
  const titleMatch = html.match(/<title>([^<]*)<\/title>/i);
  lines.push(`### ${slug} — ${titleMatch ? strip(titleMatch[1]) : file}`);

  const anchors = findSectionAnchors(html);
  if (anchors.length === 0) {
    // No static sec-* anchors: emit a single chapter-level marker entry.
    lines.push(` (движок math6: статических sec[id] нет; якоря строятся из window.M6.paras → id="sec-<p.id>")`);
    json.push({ book, chapter_slug: slug, subject, grade, engine: 'math6', note: 'paras in window.M6 config; anchors sec-<p.id>' });
    continue;
  }

  anchors.forEach(({ id }, index) => {
    const { num, title } = readSectionHeading(html, anchors, index);
    lines.push(` ${id.padEnd(12)} ${num ? `[${num}] ` : ''}${title}`);

    // Every anchor is listed in the markdown, but only "sec-p<digits>" ids
    // become JSON entries.
    const paraId = id.replace(/^sec-/, ''); // p10 | final3
    if (/^p\d+$/.test(paraId)) {
      json.push({ book, chapter_slug: slug, subject, grade, para_id: paraId, num, title });
    }
  });
}

fs.mkdirSync(path.dirname(OUT_MD), { recursive: true });
fs.writeFileSync(OUT_MD, lines.join('\n'), 'utf8');
fs.writeFileSync(OUT_JSON, JSON.stringify(json, null, 2), 'utf8');

// Entries without para_id are chapter-level markers, not sections, so they are
// counted separately.
const sectionCount = json.filter((entry) => entry.para_id !== undefined).length;
const chapterLevelCount = json.length - sectionCount;
console.log('Wrote', OUT_MD);
console.log('Wrote', OUT_JSON, `(${sectionCount} sections, ${chapterLevelCount} chapter-level entries)`);
|