fix(exam): классификатор § — fallback при 0 совпадений + учёт opts_json; таксономия в репо
- classify(): bestScore стартует с 0 (нужно совпадение>0), иначе берётся явный fallback (последнее правило), а не первое. Чинит свал theory-statements→§15 и word-problems→проценты. - optsText(): анализ текста вариантов ответа (формат пар [label, html]) — theory-statements размечаются по содержанию утверждений. - alg-word-problems fallback → algebra-7-ch3 §16 (задачи уравнением), не проценты. - Таксономия §: перенесена с gitignore-пути data/ на отслеживаемый backend/scripts/exam-textbook-sections.json + генератор gen-exam-textbook-sections.js. - Результат: 784/800 (98%) размечено, спреды по подтемам корректны. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,87 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* gen-exam-textbook-sections.js
|
||||
*
|
||||
* Regenerates the §-section taxonomy of the grades 5-9 math-family textbooks,
|
||||
* used by tag-exam-textbook.js (the exam→textbook classifier).
|
||||
*
|
||||
* Outputs:
|
||||
* backend/scripts/exam-textbook-sections.json — machine-readable (the classifier reads this)
|
||||
* plans/exam-textbook-links/taxonomy.md — human-readable reference
|
||||
*
|
||||
* Re-run whenever a grade 5-9 algebra/geometry/math chapter gains or renames a §.
|
||||
* Note: math-5/6 are engine-rendered (math6_engine.js builds <section id="sec-<p.id>">
|
||||
* from window.M6.paras) — their §s are NOT extracted statically here (emitted with
|
||||
* engine:'math6' marker); the classifier links them at chapter level.
|
||||
*
|
||||
* Usage: node backend/scripts/gen-exam-textbook-sections.js
|
||||
*/
|
||||
'use strict';
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
|
||||
const DIR = path.join(__dirname, '../../frontend/textbooks');
|
||||
const OUT_MD = path.join(__dirname, '../../plans/exam-textbook-links/taxonomy.md');
|
||||
const OUT_JSON = path.join(__dirname, 'exam-textbook-sections.json');
|
||||
|
||||
// Chapter slug -> HTML file name (taken from the textbooks table).
// Array order is the teaching order, and the generator emits chapters in this order.
const CHAPTERS = [
  // math-5
  ['math-5-ch1', 'math_5_ch1.html'],
  ['math-5-ch2', 'math_5_ch2.html'],
  ['math-5-ch3', 'math_5_ch3.html'],

  // math-6
  ['math-6-ch1', 'math_6_ch1.html'],
  ['math-6-ch2', 'math_6_ch2.html'],
  ['math-6-ch3', 'math_6_ch3.html'],
  ['math-6-ch4', 'math_6_ch4.html'],
  ['math-6-ch5', 'math_6_ch5.html'],
  ['math-6-ch6', 'math_6_ch6.html'],

  // algebra-7
  ['algebra-7-ch1', 'algebra_7_ch1.html'],
  ['algebra-7-ch2', 'algebra_7_ch2.html'],
  ['algebra-7-ch3', 'algebra_7_ch3.html'],
  ['algebra-7-ch4', 'algebra_7_ch4.html'],

  // geometry-7
  ['geometry-7-ch1', 'geometry_7_ch1.html'],
  ['geometry-7-ch2', 'geometry_7_ch2.html'],
  ['geometry-7-ch3', 'geometry_7_ch3.html'],
  ['geometry-7-ch4', 'geometry_7_ch4.html'],
  ['geometry-7-ch5', 'geometry_7_ch5.html'],

  // algebra-8 (note: chapter 1 lives in algebra_8.html, without a _ch1 suffix)
  ['algebra-8-ch1', 'algebra_8.html'],
  ['algebra-8-ch2', 'algebra_8_ch2.html'],
  ['algebra-8-ch3', 'algebra_8_ch3.html'],

  // geometry-8
  ['geometry-8-ch1', 'geometry_8_ch1.html'],
  ['geometry-8-ch2', 'geometry_8_ch2.html'],
  ['geometry-8-ch3', 'geometry_8_ch3.html'],
  ['geometry-8-ch4', 'geometry_8_ch4.html'],

  // algebra-9
  ['algebra-9-ch1', 'algebra_9_ch1.html'],
  ['algebra-9-ch2', 'algebra_9_ch2.html'],
  ['algebra-9-ch3', 'algebra_9_ch3.html'],
  ['algebra-9-ch4', 'algebra_9_ch4.html'],

  // geometry-9
  ['geometry-9-ch1', 'geometry_9_ch1.html'],
  ['geometry-9-ch2', 'geometry_9_ch2.html'],
  ['geometry-9-ch3', 'geometry_9_ch3.html'],
  ['geometry-9-ch4', 'geometry_9_ch4.html'],
];
|
||||
|
||||
/**
 * Reduce an HTML fragment to plain text: remove every tag, collapse each run
 * of whitespace into a single space, and trim both ends.
 *
 * @param {*} html - markup to flatten; coerced with String() first
 * @returns {string} tag-free, single-spaced, trimmed text
 */
function strip(html) {
  const withoutTags = String(html).replace(/<[^>]+>/g, '');
  const singleSpaced = withoutTags.replace(/\s+/g, ' ');
  return singleSpaced.trim();
}
|
||||
|
||||
// Output accumulators: `lines` becomes the human-readable markdown reference,
// `json` becomes the flat record list written to OUT_JSON.
const lines = ['# §-таксономия учебников 5–9 (математика) — эталон для классификатора экзамена math9', ''];
const json = []; // [{book, chapter_slug, subject, grade, para_id, num, title}]

/**
 * Find every <section>/<div> opening tag whose id starts with "sec-".
 *
 * @param {string} html - full chapter markup
 * @returns {{id: string, start: number}[]} anchors in document order, with the
 *   offset of each opening tag inside `html`
 */
function findSectionAnchors(html) {
  const anchorRe = /<(?:section|div)\b[^>]*\sid="(sec-(?:p\d+|final\d*|[a-z0-9-]+))"[^>]*>/gi;
  const anchors = [];
  for (let hit = anchorRe.exec(html); hit !== null; hit = anchorRe.exec(html)) {
    anchors.push({ id: hit[1], start: hit.index });
  }
  return anchors;
}

let lastBook = '';
CHAPTERS.forEach(([slug, file]) => {
  // "algebra-7-ch3" -> book "algebra-7", subject "algebra", grade 7.
  const book = slug.replace(/-ch\d+$/, '');
  const subject = book.replace(/-\d+$/, ''); // math|algebra|geometry
  const gradeMatch = book.match(/-(\d+)$/);
  const grade = Number(gradeMatch ? gradeMatch[1] : undefined) || null;

  // Start a new markdown group whenever the book changes.
  if (book !== lastBook) {
    lines.push(`\n## ${book}`);
    lastBook = book;
  }

  const chapterPath = path.join(DIR, file);
  if (!fs.existsSync(chapterPath)) {
    // A missing file is noted in the markdown only; nothing is added to the JSON.
    lines.push(`### ${slug} (FILE MISSING: ${file})`);
    return;
  }

  const html = fs.readFileSync(chapterPath, 'utf8');
  const titleMatch = html.match(/<title>([^<]*)<\/title>/i);
  const heading = titleMatch ? strip(titleMatch[1]) : file;
  lines.push(`### ${slug} — ${heading}`);

  const anchors = findSectionAnchors(html);
  if (anchors.length === 0) {
    // No static sec-* anchors: record the chapter once, flagged as rendered by
    // the math6 engine, so the consumer can link it at chapter level.
    lines.push(` (движок math6: статических sec[id] нет; якоря строятся из window.M6.paras → id="sec-<p.id>")`);
    json.push({ book, chapter_slug: slug, subject, grade, engine: 'math6', note: 'paras in window.M6 config; anchors sec-<p.id>' });
    return;
  }

  anchors.forEach((anchor, idx) => {
    // A section's markup runs to the next anchor; the last one is capped at
    // 4000 characters past its own start.
    const following = anchors[idx + 1];
    const end = following ? following.start : anchor.start + 4000;
    const seg = html.slice(anchor.start, end);

    const numMatch = seg.match(/class="sec-num"[^>]*>([\s\S]*?)<\//i);
    const headMatch = seg.match(/class="sec-h"[^>]*>([\s\S]*?)<\//i);
    const paraId = anchor.id.replace(/^sec-/, ''); // p10 | final3
    const num = numMatch ? strip(numMatch[1]) : '';
    const title = headMatch ? strip(headMatch[1]) : '';

    const numLabel = num ? `[${num}] ` : '';
    lines.push(` ${anchor.id.padEnd(12)} ${numLabel}${title}`);

    // Only numbered paragraphs (p<digits>) go into the JSON; other anchors
    // (e.g. final*) appear in the markdown reference only.
    if (/^p\d+$/.test(paraId)) {
      json.push({ book, chapter_slug: slug, subject, grade, para_id: paraId, num, title });
    }
  });
});

// Write both outputs; the markdown directory is created if it does not exist.
fs.mkdirSync(path.dirname(OUT_MD), { recursive: true });
fs.writeFileSync(OUT_MD, lines.join('\n'), 'utf8');
fs.writeFileSync(OUT_JSON, JSON.stringify(json, null, 2), 'utf8');
console.log('Wrote', OUT_MD);
console.log('Wrote', OUT_JSON, `(${json.length} sections)`);
|
||||
@@ -41,7 +41,8 @@ if (examIdx !== -1 && args[examIdx + 1]) {
|
||||
}
|
||||
|
||||
/* ── Taxonomy ─────────────────────────────────────────────────── */
|
||||
const taxonomy = require('./data/g9_textbook_sections.json');
|
||||
/* Generated by gen-exam-textbook-sections.js (re-run when textbook §s change). */
|
||||
const taxonomy = require('./exam-textbook-sections.json');
|
||||
|
||||
// Build lookup: book -> [ {chapter_slug, para_id, num, title} ]
|
||||
// and flat: chapter_slug+para_id -> para number (integer)
|
||||
@@ -349,8 +350,9 @@ const SUBTOPIC_RULES = {
|
||||
// Scale/map → math-6-ch2
|
||||
{ slug: 'math-6-ch2', paragraph: null,
|
||||
kw: [/масштаб|карт[ае]\s+изображ/i] },
|
||||
// fallback: percentages (most common in word problems)
|
||||
{ slug: 'math-6-ch2', paragraph: null, kw: [] },
|
||||
// fallback: solving text problems via an equation (most generic g9 home,
|
||||
// far better than defaulting every word problem to "percents")
|
||||
{ slug: 'algebra-7-ch3', paragraph: 16, kw: [] },
|
||||
],
|
||||
|
||||
/* ── geom-triangles ──────────────────────────────────────── */
|
||||
@@ -523,6 +525,24 @@ const SUBTOPIC_RULES = {
|
||||
],
|
||||
};
|
||||
|
||||
/**
 * Extract the plain text of a task's answer options (opts_json) so keyword
 * scoring can see the actual statement contents. This matters most for
 * theory-statements, whose question text is just "Какое из утверждений неверно"
 * while the substance sits in the options.
 *
 * Accepted shapes of the parsed JSON:
 *   - an array of options, or an object with an `options` array;
 *   - each option is a string, a [label, html] pair (the format exam_tasks
 *     uses, e.g. ["а","$b^{-3}=...$"]), or an object with html/text/label.
 *
 * Only strings and numbers contribute text. null, booleans and nested objects
 * are skipped so that words like "null" or "[object Object]" never leak into
 * the text that keyword rules are matched against.
 *
 * @param {{opts_json?: string}} task - exam task row
 * @returns {string} option texts joined by single spaces; '' when opts_json is
 *   missing, is not valid JSON, or has none of the shapes above
 */
function optsText(task) {
  if (!task.opts_json) return '';

  let parsed;
  try {
    parsed = JSON.parse(task.opts_json);
  } catch {
    // Best effort by design: a malformed opts_json must not abort tagging,
    // the task is simply classified from its question text alone.
    return '';
  }

  const options = Array.isArray(parsed)
    ? parsed
    : (parsed && Array.isArray(parsed.options) ? parsed.options : []);

  const isText = (value) => typeof value === 'string' || typeof value === 'number';

  return options
    .map((option) => {
      if (typeof option === 'string') return option;
      // [label, html] pair: keep the textual members only.
      if (Array.isArray(option)) return option.filter(isText).join(' ');
      if (option && typeof option === 'object') {
        const value = option.html || option.text || option.label;
        return isText(value) ? String(value) : '';
      }
      return '';
    })
    .join(' ');
}
|
||||
|
||||
/* ── Classifier ───────────────────────────────────────────── */
|
||||
function classify(task) {
|
||||
const subtopic = task.subtopic;
|
||||
@@ -531,9 +551,15 @@ function classify(task) {
|
||||
const rules = SUBTOPIC_RULES[subtopic];
|
||||
if (!rules || !rules.length) return { slug: null, para: null };
|
||||
|
||||
const txt = stripText(task.text_html);
|
||||
const txt = stripText(task.text_html + ' ' + optsText(task));
|
||||
|
||||
let bestScore = -1;
|
||||
// Require at least one keyword match (score > 0) to override the fallback.
|
||||
// Starting bestScore at 0 (NOT -1) means: when NOTHING matches, the explicit
|
||||
// fallback (last entry) wins — not the first rule. Critical for subtopics like
|
||||
// theory-statements / alg-word-problems where the question text often carries
|
||||
// no distinctive keyword (otherwise every unmatched task collapsed onto the
|
||||
// first rule — §15 progressions / math-6 percents).
|
||||
let bestScore = 0;
|
||||
let bestRule = rules[rules.length - 1]; // last = fallback
|
||||
|
||||
for (const rule of rules) {
|
||||
@@ -548,18 +574,13 @@ function classify(task) {
|
||||
}
|
||||
}
|
||||
|
||||
// If nothing matched (bestScore == -1 and bestRule is fallback), use first entry (primary)
|
||||
if (bestScore < 0) {
|
||||
bestRule = rules[rules.length - 1]; // explicit fallback
|
||||
}
|
||||
|
||||
return { slug: bestRule.slug, para: bestRule.paragraph };
|
||||
}
|
||||
|
||||
/* ── Main ─────────────────────────────────────────────────── */
|
||||
function main() {
|
||||
const tasks = db.prepare(`
|
||||
SELECT id, variant, task_idx, task_type, subtopic, text_html
|
||||
SELECT id, variant, task_idx, task_type, subtopic, text_html, opts_json
|
||||
FROM exam_tasks
|
||||
WHERE exam_key = ?
|
||||
ORDER BY variant, task_idx
|
||||
|
||||
Reference in New Issue
Block a user