fix(exam): классификатор § — fallback при 0 совпадений + учёт opts_json; таксономия в репо

- classify(): bestScore стартует с 0 (нужно совпадение>0), иначе берётся явный fallback
  (последнее правило), а не первое. Чинит свал theory-statements→§15 и word-problems→проценты.
- optsText(): анализ текста вариантов ответа (формат пар [label, html]) — theory-statements
  размечаются по содержанию утверждений.
- alg-word-problems fallback → algebra-7-ch3 §16 (задачи уравнением), не проценты.
- Таксономия §: перенесена с gitignore-пути data/ на отслеживаемый
  backend/scripts/exam-textbook-sections.json + генератор gen-exam-textbook-sections.js.
- Результат: 784/800 (98%) размечено, спреды по подтемам корректны.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Maxim Dolgolyov
2026-06-03 16:29:40 +03:00
parent d05bb386a7
commit c9f3eed8ed
5 changed files with 2065 additions and 11 deletions
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,87 @@
#!/usr/bin/env node
/**
* gen-exam-textbook-sections.js
*
* Regenerates the §-section taxonomy of the grades 5-9 math-family textbooks,
* used by tag-exam-textbook.js (the exam→textbook classifier).
*
* Outputs:
* backend/scripts/exam-textbook-sections.json — machine-readable (the classifier reads this)
* plans/exam-textbook-links/taxonomy.md — human-readable reference
*
* Re-run whenever a grade 5-9 algebra/geometry/math chapter gains or renames a §.
* Note: math-5/6 are engine-rendered (math6_engine.js builds <section id="sec-<p.id>">
* from window.M6.paras) — their §s are NOT extracted statically here (emitted with
* engine:'math6' marker); the classifier links them at chapter level.
*
* Usage: node backend/scripts/gen-exam-textbook-sections.js
*/
'use strict';
const fs = require('fs');
const path = require('path');
// Directory the chapter HTML files are read from, resolved relative to this script.
const DIR = path.join(__dirname, '../../frontend/textbooks');
// Output path of the human-readable Markdown taxonomy.
const OUT_MD = path.join(__dirname, '../../plans/exam-textbook-links/taxonomy.md');
// Output path of the machine-readable JSON taxonomy, written next to this script.
const OUT_JSON = path.join(__dirname, 'exam-textbook-sections.json');
// [chapter slug, html file name] pairs (from the textbooks table). Order = teaching order.
// The slug is expected to look like "<subject>-<grade>-ch<N>": the book, subject and
// grade of every record are derived from it by stripping "-ch<N>" and then "-<grade>".
// NOTE(review): 'algebra-8-ch1' points at 'algebra_8.html' (no "_ch1" suffix), unlike
// every other entry — presumably the real file name; confirm against the textbooks table.
const CHAPTERS = [
['math-5-ch1', 'math_5_ch1.html'], ['math-5-ch2', 'math_5_ch2.html'], ['math-5-ch3', 'math_5_ch3.html'],
['math-6-ch1', 'math_6_ch1.html'], ['math-6-ch2', 'math_6_ch2.html'], ['math-6-ch3', 'math_6_ch3.html'],
['math-6-ch4', 'math_6_ch4.html'], ['math-6-ch5', 'math_6_ch5.html'], ['math-6-ch6', 'math_6_ch6.html'],
['algebra-7-ch1', 'algebra_7_ch1.html'], ['algebra-7-ch2', 'algebra_7_ch2.html'],
['algebra-7-ch3', 'algebra_7_ch3.html'], ['algebra-7-ch4', 'algebra_7_ch4.html'],
['geometry-7-ch1', 'geometry_7_ch1.html'], ['geometry-7-ch2', 'geometry_7_ch2.html'],
['geometry-7-ch3', 'geometry_7_ch3.html'], ['geometry-7-ch4', 'geometry_7_ch4.html'], ['geometry-7-ch5', 'geometry_7_ch5.html'],
['algebra-8-ch1', 'algebra_8.html'], ['algebra-8-ch2', 'algebra_8_ch2.html'], ['algebra-8-ch3', 'algebra_8_ch3.html'],
['geometry-8-ch1', 'geometry_8_ch1.html'], ['geometry-8-ch2', 'geometry_8_ch2.html'],
['geometry-8-ch3', 'geometry_8_ch3.html'], ['geometry-8-ch4', 'geometry_8_ch4.html'],
['algebra-9-ch1', 'algebra_9_ch1.html'], ['algebra-9-ch2', 'algebra_9_ch2.html'],
['algebra-9-ch3', 'algebra_9_ch3.html'], ['algebra-9-ch4', 'algebra_9_ch4.html'],
['geometry-9-ch1', 'geometry_9_ch1.html'], ['geometry-9-ch2', 'geometry_9_ch2.html'],
['geometry-9-ch3', 'geometry_9_ch3.html'], ['geometry-9-ch4', 'geometry_9_ch4.html'],
];
/**
 * Reduce an HTML fragment to plain text: remove every `<...>` tag, collapse each
 * run of whitespace into a single space, and trim both ends.
 *
 * @param {*} html - Fragment to clean; coerced with String() before processing.
 * @returns {string} The tag-free, single-spaced, trimmed text.
 */
function strip(html) {
  const withoutTags = String(html).replace(/<[^>]+>/g, '');
  const singleSpaced = withoutTags.replace(/\s+/g, ' ');
  return singleSpaced.trim();
}
// Human-readable report (Markdown), accumulated line by line and written to OUT_MD.
const lines = ['# §-таксономия учебников 5–9 (математика) — эталон для классификатора экзамена math9', ''];
// Machine-readable taxonomy written to OUT_JSON. Two record shapes are emitted:
//   static §:       {book, chapter_slug, subject, grade, para_id, num, title}
//   engine chapter: {book, chapter_slug, subject, grade, engine, note}
const json = [];
let prevBook = '';

for (const [slug, file] of CHAPTERS) {
  // e.g. 'algebra-7-ch3' -> book 'algebra-7' -> subject 'algebra', grade 7.
  const book = slug.replace(/-ch\d+$/, '');
  const subject = book.replace(/-\d+$/, ''); // math|algebra|geometry
  const grade = Number((book.match(/-(\d+)$/) || [])[1]) || null;

  // Open a new "## <book>" heading each time the book changes between entries.
  if (book !== prevBook) { lines.push(`\n## ${book}`); prevBook = book; }

  // A missing chapter file is reported in the Markdown only; nothing goes to the JSON.
  const p = path.join(DIR, file);
  if (!fs.existsSync(p)) { lines.push(`### ${slug} (FILE MISSING: ${file})`); continue; }
  const html = fs.readFileSync(p, 'utf8');

  // Chapter heading: slug plus the page <title> (or the file name when there is none).
  // Fix: the two parts were concatenated with no separator, which produced headings
  // such as "### math-5-ch1<title>"; they are now joined with " — ".
  const tm = html.match(/<title>([^<]*)<\/title>/i);
  lines.push(`### ${slug} — ${tm ? strip(tm[1]) : file}`);

  // Collect every <section>/<div> whose id starts with "sec-", with its offset in the file.
  // A fresh /g regex per chapter keeps lastIndex from leaking between files.
  const secRe = /<(?:section|div)\b[^>]*\sid="(sec-(?:p\d+|final\d*|[a-z0-9-]+))"[^>]*>/gi;
  let m; const secs = [];
  while ((m = secRe.exec(html)) !== null) secs.push({ id: m[1], start: m.index });

  // No static anchors: record the chapter once with the engine marker and move on.
  if (!secs.length) {
    lines.push(` (движок math6: статических sec[id] нет; якоря строятся из window.M6.paras → id="sec-<p.id>")`);
    json.push({ book, chapter_slug: slug, subject, grade, engine: 'math6', note: 'paras in window.M6 config; anchors sec-<p.id>' });
    continue;
  }

  for (let i = 0; i < secs.length; i++) {
    // A section's text runs up to the next section start; the last one is capped
    // at 4000 characters because there is no following anchor to bound it.
    const seg = html.slice(secs[i].start, secs[i + 1] ? secs[i + 1].start : secs[i].start + 4000);
    const numM = seg.match(/class="sec-num"[^>]*>([\s\S]*?)<\//i);
    const hM = seg.match(/class="sec-h"[^>]*>([\s\S]*?)<\//i);
    const paraId = secs[i].id.replace(/^sec-/, ''); // p10 | final3
    const num = numM ? strip(numM[1]) : '';
    const title = hM ? strip(hM[1]) : '';
    lines.push(` ${secs[i].id.padEnd(12)} ${num ? '['+num+'] ' : ''}${title}`);
    // Every anchor is listed in the Markdown, but only "p<digits>" ids go to the JSON.
    if (/^p\d+$/.test(paraId)) {
      json.push({ book, chapter_slug: slug, subject, grade, para_id: paraId, num, title });
    }
  }
}

// Write both outputs; the Markdown directory is created on demand.
fs.mkdirSync(path.dirname(OUT_MD), { recursive: true });
fs.writeFileSync(OUT_MD, lines.join('\n'), 'utf8');
fs.writeFileSync(OUT_JSON, JSON.stringify(json, null, 2), 'utf8');
console.log('Wrote', OUT_MD);
console.log('Wrote', OUT_JSON, '(' + json.length + ' sections)');
+32 -11
View File
@@ -41,7 +41,8 @@ if (examIdx !== -1 && args[examIdx + 1]) {
}
/* ── Taxonomy ─────────────────────────────────────────────────── */
const taxonomy = require('./data/g9_textbook_sections.json');
/* Generated by gen-exam-textbook-sections.js (re-run when textbook §s change). */
const taxonomy = require('./exam-textbook-sections.json');
// Build lookup: book -> [ {chapter_slug, para_id, num, title} ]
// and flat: chapter_slug+para_id -> para number (integer)
@@ -349,8 +350,9 @@ const SUBTOPIC_RULES = {
// Scale/map → math-6-ch2
{ slug: 'math-6-ch2', paragraph: null,
kw: [/масштаб|карт[ае]\s+изображ/i] },
// fallback: percentages (most common in word problems)
{ slug: 'math-6-ch2', paragraph: null, kw: [] },
// fallback: solving text problems via an equation (most generic g9 home,
// far better than defaulting every word problem to "percents")
{ slug: 'algebra-7-ch3', paragraph: 16, kw: [] },
],
/* ── geom-triangles ──────────────────────────────────────── */
@@ -523,6 +525,24 @@ const SUBTOPIC_RULES = {
],
};
/**
 * Flatten a task's answer options (opts_json) into one space-separated string so
 * keyword scoring can see the option contents — crucial for theory-statements,
 * whose question text is just "Какое из утверждений неверно" with the substance
 * in the options.
 *
 * Accepted JSON shapes: a top-level array of options, or an object carrying the
 * array under `options`. Anything else, or unparsable JSON, yields ''.
 *
 * @param {{opts_json?: string}} task - Task row; only `opts_json` is read.
 * @returns {string} Option texts joined by single spaces, or '' when unavailable.
 */
function optsText(task) {
  const raw = task.opts_json;
  if (!raw) return '';

  // Text of a single option, by shape:
  //   string          -> as is
  //   [label, html]   -> members stringified and space-joined, e.g. ["а","$b^{-3}=...$"]
  //   {html|text|label} -> first truthy of those fields
  //   anything else   -> ''
  const optionToText = (opt) => {
    if (typeof opt === 'string') return opt;
    if (Array.isArray(opt)) return opt.map(String).join(' ');
    if (opt && typeof opt === 'object') return opt.html || opt.text || opt.label || '';
    return '';
  };

  try {
    const parsed = JSON.parse(raw);
    let options = [];
    if (Array.isArray(parsed)) {
      options = parsed;
    } else if (parsed && Array.isArray(parsed.options)) {
      options = parsed.options;
    }
    return options.map(optionToText).join(' ');
  } catch {
    // Best effort: malformed opts_json simply contributes no text.
    return '';
  }
}
/* ── Classifier ───────────────────────────────────────────── */
function classify(task) {
const subtopic = task.subtopic;
@@ -531,9 +551,15 @@ function classify(task) {
const rules = SUBTOPIC_RULES[subtopic];
if (!rules || !rules.length) return { slug: null, para: null };
const txt = stripText(task.text_html);
const txt = stripText(task.text_html + ' ' + optsText(task));
let bestScore = -1;
// Require at least one keyword match (score > 0) to override the fallback.
// Starting bestScore at 0 (NOT -1) means: when NOTHING matches, the explicit
// fallback (last entry) wins — not the first rule. Critical for subtopics like
// theory-statements / alg-word-problems where the question text often carries
// no distinctive keyword (otherwise every unmatched task collapsed onto the
// first rule — §15 progressions / math-6 percents).
let bestScore = 0;
let bestRule = rules[rules.length - 1]; // last = fallback
for (const rule of rules) {
@@ -548,18 +574,13 @@ function classify(task) {
}
}
// If nothing matched (bestScore == -1 and bestRule is fallback), use first entry (primary)
if (bestScore < 0) {
bestRule = rules[rules.length - 1]; // explicit fallback
}
return { slug: bestRule.slug, para: bestRule.paragraph };
}
/* ── Main ─────────────────────────────────────────────────── */
function main() {
const tasks = db.prepare(`
SELECT id, variant, task_idx, task_type, subtopic, text_html
SELECT id, variant, task_idx, task_type, subtopic, text_html, opts_json
FROM exam_tasks
WHERE exam_key = ?
ORDER BY variant, task_idx