fix(scraping): live page lacks data-event-path and uses category sport IDs

Previously LiveEventsParser returned 0 events from /su/live because two
real differences between the live page and the pre-match listing weren't
handled:

1. Live rows omit data-event-path entirely. They expose only
   data-event-treeId, and the bookmaker routes live events under
   /su/live/<treeId> rather than /su/betting/<...>.

2. The closest data-sport-treeId ancestor on the live page is a
   category-tree wrapper (26418=Football-live, 45356=Basketball-live, …)
   instead of the canonical breadcrumb sport ID (11/6/22723/43658) the
   rest of the app uses. The pre-match listing carries the canonical
   ID directly.

Changes:

* EventListingParserBase.ParseRow: data-event-path becomes optional. For
  live rows we synthesize EventPath = "live/<treeId>" from
  data-event-treeId (validated as digits-only). Pre-match validation is
  unchanged.

* New ExtractSportCodeFromLive walks ancestors looking for a sport-tree
  ID and maps it through a small live-id → canonical-id table covering
  the four scoped sports. Out-of-scope sports (cybersport, volleyball,
  table tennis) are intentionally left unmapped — they keep their raw
  category ID and the UI renders them via SportLabels as "Sport <N>".

* MarathonbetScraper.ResolveEventDetailPath: dispatches between
  /su/live/<treeId> and /su/betting/<...> based on the EventPath prefix.
  Removes the duplicated path-building between ScrapeEventOddsAsync and
  ScrapeEventResultAsync.

* New LiveEventsParser regression tests (event count, synthesized
  live/<treeId> paths, canonical sport mapping) against a real
  /su/live capture (16 events, 5 sport categories).

Also: rewrites the stale "Disabled until Phase 8" hint copy on the
Settings.Workers.ResultsPollerEnabled flag — Phase 8 shipped, so the
results poller is safe to enable.
This commit is contained in:
2026-05-09 16:07:03 +03:00
parent 537b78ab83
commit 004dbeae8b
8 changed files with 20459 additions and 45 deletions
+18 -3
View File
@@ -33,6 +33,17 @@ public partial class App : System.Windows.Application
{
base.OnStartup(e);
// Bootstrap default culture (ru-RU) before any DI / hosting / rendering
// begins. This ensures background-service threads spawned by Host.Start()
// and the BlazorWebView dispatcher inherit ru-RU even before the
// configured DefaultCulture is read from settings. The configured value
// is re-applied below; system locale (e.g., en-US) never wins.
var bootstrap = CultureInfo.GetCultureInfo(LocaleState.Russian);
CultureInfo.DefaultThreadCurrentCulture = bootstrap;
CultureInfo.DefaultThreadCurrentUICulture = bootstrap;
CultureInfo.CurrentCulture = bootstrap;
CultureInfo.CurrentUICulture = bootstrap;
var contentRoot = AppContext.BaseDirectory;
var localSettingsPath = Path.Combine(contentRoot, SettingsLocalFileName);
@@ -94,9 +105,11 @@ public partial class App : System.Windows.Application
initializer.InitializeAsync().GetAwaiter().GetResult();
}
Host.Start();
// Apply default culture from configuration before any UI renders.
// Apply default culture from configuration BEFORE Host.Start() so the
// BackgroundServices (LiveOddsPoller, AnomalyDetectionPoller, ...) and
// any threads they spawn inherit the configured locale via
// CultureInfo.DefaultThreadCurrent{,UI}Culture rather than the system
// default (which would surface as English on en-US Windows installs).
var localeOptions = Host.Services.GetRequiredService<IOptions<LocalizationOptions>>().Value;
var locale = Host.Services.GetRequiredService<LocaleState>();
try
@@ -108,6 +121,8 @@ public partial class App : System.Windows.Application
locale.Set(LocaleState.Russian);
}
Host.Start();
var window = Host.Services.GetRequiredService<MainWindow>();
window.Show();
}
@@ -91,21 +91,7 @@ public sealed class MarathonbetScraper : IOddsScraper
{
ArgumentNullException.ThrowIfNull(eventInfo);
// Prefer the parsed event-path (data-event-path attribute on the listing
// row, ending in "+{treeId}"). Fall back to the numeric event ID for
// legacy rows that pre-date the EventPath column — best-effort and
// expected to fail at the bookmaker, but better than throwing here.
var pathFragment = string.IsNullOrWhiteSpace(eventInfo.EventPath)
? eventInfo.Id.Value
: eventInfo.EventPath;
var path = $"{EventPathBase}{pathFragment}";
if (string.IsNullOrWhiteSpace(eventInfo.EventPath))
{
_logger.LogWarning(
"ScrapeEventOddsAsync: eventId={EventId} has no EventPath; using numeric ID fallback for URL — expect a 404",
eventInfo.Id.Value);
}
var path = ResolveEventDetailPath(eventInfo);
_logger.LogInformation(
"Scraping odds snapshot for eventId={EventId} source={Source} from {Path}",
@@ -129,10 +115,7 @@ public sealed class MarathonbetScraper : IOddsScraper
{
ArgumentNullException.ThrowIfNull(eventInfo);
var pathFragment = string.IsNullOrWhiteSpace(eventInfo.EventPath)
? eventInfo.Id.Value
: eventInfo.EventPath;
var path = $"{EventPathBase}{pathFragment}";
var path = ResolveEventDetailPath(eventInfo);
_logger.LogInformation(
"Scraping result for eventId={EventId} from {Path}",
@@ -142,6 +125,35 @@ public sealed class MarathonbetScraper : IOddsScraper
return await _resultsParser.ParseAsync(html, ct).ConfigureAwait(false);
}
/// <summary>
/// Turns an event's <see cref="Event.EventPath"/> into the site-relative
/// path of its detail page. Three shapes are handled:
/// <list type="bullet">
///   <item>A synthesized live path (<c>"live/&lt;treeId&gt;"</c>, produced by
///         the listing parser because live rows carry no
///         <c>data-event-path</c>) is placed directly under <c>/su/</c>,
///         giving <c>/su/live/&lt;treeId&gt;</c>.</item>
///   <item>Any other non-blank path (the pre-match
///         <c>"Football/.../Team1+vs+Team2+-+Id"</c> fragment) is appended
///         to <c>EventPathBase</c>.</item>
///   <item>A missing or blank path (legacy rows) falls back to the numeric
///         event ID appended to <c>EventPathBase</c>; a warning is logged
///         because the bookmaker is expected to answer with a 404.</item>
/// </list>
/// </summary>
/// <param name="eventInfo">The event whose detail-page path is required.</param>
/// <returns>The path to request for the event's detail page.</returns>
private string ResolveEventDetailPath(Event eventInfo)
{
    var eventPath = eventInfo.EventPath;

    if (!string.IsNullOrWhiteSpace(eventPath))
    {
        // Synthesized live paths already start with the "live/" segment, so
        // they are rooted at /su/ instead of the pre-match base.
        var isLivePath = eventPath.StartsWith("live/", StringComparison.Ordinal);
        return isLivePath
            ? $"/su/{eventPath}"
            : $"{EventPathBase}{eventPath}";
    }

    // Legacy row without a stored path: best-effort fallback to the numeric ID.
    _logger.LogWarning(
        "Event {EventId} has no EventPath; using numeric ID fallback for URL — expect a 404",
        eventInfo.Id.Value);
    return $"{EventPathBase}{eventInfo.Id.Value}";
}
// ── Private helpers ───────────────────────────────────────────────────
private async Task<string> FetchHtmlAsync(string path, CancellationToken ct)
@@ -79,24 +79,49 @@ public abstract class EventListingParserBase
var eventIdRaw = row.GetAttribute("data-event-eventId");
if (string.IsNullOrWhiteSpace(eventIdRaw)) return null;
var eventPath = row.GetAttribute("data-event-path");
if (string.IsNullOrWhiteSpace(eventPath)) return null;
if (!IsSafeRelativePath(eventPath))
// EventPath: pre-match rows expose data-event-path (the full
// /su/betting/<...> fragment). Live rows omit it — they only carry
// data-event-treeId, so we synthesize "live/<treeId>" and the scraper
// resolves that to /su/live/<treeId>.
var rawPath = row.GetAttribute("data-event-path");
string? eventPath;
if (!string.IsNullOrWhiteSpace(rawPath))
{
if (!IsSafeRelativePath(rawPath))
{
// Defense in depth: data-event-path is concatenated into a
// request URL by MarathonbetScraper. Reject anything that could
// redirect the scraper to a different host, escape the base
// directory, or carry control characters into a log line.
Logger.LogWarning(
"Rejecting event row with unsafe data-event-path value (eventId={EventId}).",
eventIdRaw);
return null;
}
eventPath = rawPath;
}
else if (isLive)
{
var treeId = row.GetAttribute("data-event-treeId");
if (string.IsNullOrWhiteSpace(treeId) || !IsSimpleNumericId(treeId))
{
Logger.LogWarning(
"Live row missing both data-event-path and a usable data-event-treeId (eventId={EventId}).",
eventIdRaw);
return null;
}
eventPath = "live/" + treeId;
}
else
{
return null;
}
var eventName = row.GetAttribute("data-event-name") ?? string.Empty;
// Sport code — from data-sport-treeId on the closest ancestor container
var sportCode = ExtractSportCode(row);
// Sport code — closest data-sport-treeId ancestor for pre-match (which
// exposes the canonical breadcrumb ID). Live page wraps rows in
// category-tree containers whose IDs are NOT canonical sport codes,
// so translate them through the live-ID → canonical-ID table first and
// fall back to the raw ancestor ID for unmapped (out-of-scope) sports.
var sportCode = isLive
? ExtractSportCodeFromLive(row) ?? ExtractSportCodeFromAncestors(row)
: ExtractSportCodeFromAncestors(row);
if (sportCode is null) return null;
// Teams — split event name on " - "
@@ -112,8 +137,12 @@ public abstract class EventListingParserBase
// Live events in-progress may have no date-wrapper — use server time as fallback
var scheduledAt = parsed ?? serverTime;
// Country / league / category from event path
var (countryCode, leagueId, category) = ParseEventPath(eventPath);
// Country / league / category from event path. Synthesized live paths are
// just "live/<treeId>" and carry no country/league segments, so fall back
// to placeholders that satisfy the domain invariants
// (CountryCode/LeagueId/Category must be non-empty).
//
// The test deliberately mirrors the IsNullOrWhiteSpace check used when
// eventPath was chosen above: a live row whose data-event-path attribute
// is present but empty/blank also takes the synthesized-path branch, and
// its blank raw value must not be handed to ParseEventPath. Non-live rows
// without a usable path have already been rejected by this point, so a
// blank rawPath here always means "synthesized live path".
var (countryCode, leagueId, category) = string.IsNullOrWhiteSpace(rawPath)
    ? ("LIVE", "live", "live")
    : ParseEventPath(rawPath);
return new Event(
Id: new DomainEventId(eventIdRaw),
@@ -129,9 +158,49 @@ public abstract class EventListingParserBase
};
}
private static SportCode? ExtractSportCode(IElement row)
// Lookup from the data-sport-treeId of a live-page sport-category container
// to the canonical breadcrumb sport ID used by the rest of the app.
// The pairs were confirmed from /su/live HTML, where every container has an
// <a class="sport-category-label" href="/su/live/<treeId>">…</a> child.
// Sports outside the customer spec (cybersport, volleyball, table tennis, …)
// are intentionally absent: such events keep their raw category ID and the
// UI renders them as "Sport <N>" via SportLabels.
private static readonly IReadOnlyDictionary<int, int> LiveTreeIdToCanonicalSport =
    new Dictionary<int, int>
    {
        { 26418, 11 },    // Футбол    → Football
        { 45356, 6 },     // Баскетбол → Basketball
        { 22723, 22723 }, // Теннис    → Tennis (already the canonical ID)
        { 43658, 43658 }, // Хоккей    → Hockey (already the canonical ID)
    };

/// <summary>
/// Resolves the canonical sport for a live row. Starting at the row itself,
/// climbs the ancestor chain to the first element carrying a positive,
/// digits-only <c>data-sport-treeId</c> and translates that value through
/// <c>LiveTreeIdToCanonicalSport</c>.
/// </summary>
/// <param name="row">The live event row element.</param>
/// <returns>
/// The canonical sport code when the first usable tree ID is in the table;
/// <c>null</c> when it is not in the table or no element on the chain has a
/// usable ID (the caller then falls through to the generic ancestor walker).
/// </returns>
private static SportCode? ExtractSportCodeFromLive(IElement row)
{
    IElement? current = row;
    while (current is not null)
    {
        var rawId = current.GetAttribute("data-sport-treeId");

        if (!string.IsNullOrWhiteSpace(rawId)
            && int.TryParse(rawId, NumberStyles.None, CultureInfo.InvariantCulture, out var treeId)
            && treeId > 0)
        {
            // Only the first usable ID decides the outcome; the walk does not
            // continue past it looking for a mapped ancestor.
            if (LiveTreeIdToCanonicalSport.TryGetValue(treeId, out var canonicalId))
            {
                return new SportCode(canonicalId);
            }

            return null;
        }

        current = current.ParentElement;
    }

    return null;
}
private static SportCode? ExtractSportCodeFromAncestors(IElement row)
{
// Walk up the DOM looking for data-sport-treeId
IElement? el = row;
while (el is not null)
{
@@ -142,13 +211,21 @@ public abstract class EventListingParserBase
{
return new SportCode(id);
}
el = el.ParentElement;
}
return null;
}
/// <summary>
/// Checks that <paramref name="value"/> is a plain numeric identifier:
/// between 1 and 32 characters long and made up solely of the ASCII digits
/// <c>'0'</c>–<c>'9'</c> (no sign, whitespace, separators or non-ASCII digits).
/// </summary>
/// <param name="value">The candidate identifier; must not be <c>null</c>.</param>
/// <returns><c>true</c> when every character is an ASCII digit and the length is in range.</returns>
private static bool IsSimpleNumericId(string value)
{
    var length = value.Length;
    if (length == 0 || length > 32)
    {
        return false;
    }

    for (var index = 0; index < length; index++)
    {
        var current = value[index];
        if (current < '0' || current > '9')
        {
            return false;
        }
    }

    return true;
}
/// <summary>
/// Validates that a scraped <c>data-event-path</c> value is safe to
/// concatenate into the bookmaker URL. Rejects values that could:
+4 -3
View File
@@ -60,7 +60,8 @@
.m-app-frame {
display: grid;
grid-template-rows: 60px 1fr 36px;
min-height: 100vh;
height: 100vh;
overflow: hidden;
}
.m-appbar {
@@ -70,8 +71,6 @@
padding: 0 clamp(var(--m-space-3), 2vw, var(--m-space-5));
border-bottom: 1px solid var(--m-c-rule);
background: var(--m-c-paper);
position: sticky;
top: 0;
z-index: 10;
}
@@ -82,6 +81,8 @@
position: relative;
z-index: 1;
min-height: 0;
overflow-y: auto;
overflow-x: hidden;
}
/* MudDrawer is positioned fixed/absolute by Mud's CSS — push main content
@@ -117,7 +117,7 @@
<data name="Settings.Workers.LivePollIntervalSeconds"><value>Live poll interval (sec)</value></data>
<data name="Settings.Workers.LivePollIntervalSeconds.Hint"><value>Delay between live-odds polling cycles. Default 30 s.</value></data>
<data name="Settings.Workers.ResultsPollerEnabled"><value>Results poller enabled</value></data>
<data name="Settings.Workers.ResultsPollerEnabled.Hint"><value>Disabled until Phase 8. Enable only after match-complete polling is implemented.</value></data>
<data name="Settings.Workers.ResultsPollerEnabled.Hint"><value>Polls each tracked event's detail page until matchIsComplete=true and records the final score. Safe to enable.</value></data>
<data name="Settings.Workers.ResultsPollIntervalSeconds"><value>Results poll interval (sec)</value></data>
<data name="Settings.Workers.AnomalyDetectionEnabled"><value>Anomaly detection enabled</value></data>
<data name="Settings.Workers.AnomalyDetectionEnabled.Hint"><value>Runs the suspension-flip detector on every cycle. Disable to pause analysis without losing collected snapshots.</value></data>
@@ -123,7 +123,7 @@
<data name="Settings.Workers.LivePollIntervalSeconds"><value>Интервал лайв-опроса (сек)</value></data>
<data name="Settings.Workers.LivePollIntervalSeconds.Hint"><value>Пауза между циклами сбора лайв-котировок. По умолчанию 30 с.</value></data>
<data name="Settings.Workers.ResultsPollerEnabled"><value>Сборщик результатов включён</value></data>
<data name="Settings.Workers.ResultsPollerEnabled.Hint"><value>Отключён до Phase 8. Включите только после реализации опроса match-complete.</value></data>
<data name="Settings.Workers.ResultsPollerEnabled.Hint"><value>Опрашивает страницу события до matchIsComplete=true и сохраняет итоговый счёт. Можно включать.</value></data>
<data name="Settings.Workers.ResultsPollIntervalSeconds"><value>Интервал сборщика результатов (сек)</value></data>
<data name="Settings.Workers.AnomalyDetectionEnabled"><value>Детектор аномалий включён</value></data>
<data name="Settings.Workers.AnomalyDetectionEnabled.Hint"><value>Запускает детектор разворота после паузы на каждом цикле. Отключение приостанавливает анализ без потери накопленных снимков.</value></data>
File diff suppressed because one or more lines are too long
@@ -0,0 +1,78 @@
using FluentAssertions;
using Marathon.Infrastructure.Scraping.Parsers;
using Microsoft.Extensions.Logging.Abstractions;
namespace Marathon.Infrastructure.Tests.Scraping;
/// <summary>
/// Regression tests for the live-listing parser. The fixture
/// <c>diag-live-sample.html</c> is a real /su/live capture from
/// 2026-05-09 with 16 in-progress matches. Pre-fix the parser returned
/// 0 because:
/// <list type="bullet">
///   <item>Live rows omit <c>data-event-path</c> — the pre-match-only
///         attribute the parser made mandatory.</item>
///   <item>The closest <c>data-sport-treeId</c> ancestor on the live
///         page is a category-tree wrapper (e.g. 26418=Football), not
///         the canonical breadcrumb sport ID (11=Football) the rest of
///         the app uses.</item>
/// </list>
/// </summary>
public sealed class LiveEventsParserTests
{
    // The fixture is resolved relative to the test output directory.
    private static readonly string FixturePath = Path.Combine(
        AppContext.BaseDirectory,
        "Fixtures", "marathonbet", "diag-live-sample.html");

    private readonly LiveEventsParser _sut;

    public LiveEventsParserTests()
    {
        var serverTimeProvider = new ServerTimeProvider(
            NullLogger<ServerTimeProvider>.Instance);

        _sut = new LiveEventsParser(
            serverTimeProvider,
            NullLogger<LiveEventsParser>.Instance);
    }

    [Fact]
    public async Task ParseAsync_LiveSample_ReturnsAllSixteenLiveEvents()
    {
        var html = await File.ReadAllTextAsync(FixturePath);

        var events = await _sut.ParseAsync(html);

        events.Should().HaveCount(16);
    }

    [Fact]
    public async Task ParseAsync_LiveSample_SynthesizesLiveTreeIdEventPaths()
    {
        var html = await File.ReadAllTextAsync(FixturePath);

        var events = await _sut.ParseAsync(html);

        // Ordinal comparison: "live/" is a machine-generated prefix, so the
        // check must not depend on the current culture. The explicit
        // "!= null" (rather than a pattern) is kept because the predicate is
        // an expression tree.
        events.Should().OnlyContain(e =>
            e.EventPath != null &&
            e.EventPath.StartsWith("live/", StringComparison.Ordinal));
    }

    [Fact]
    public async Task ParseAsync_LiveSample_MapsKnownSportNamesToCanonicalIds()
    {
        // The live page wraps rows in containers whose data-sport-treeId is a
        // category ID (e.g. 26418 for Football-live). The parser translates
        // these to canonical breadcrumb IDs through a fixed
        // live-ID → canonical-ID table for the known sports (Football=11,
        // Basketball=6, Tennis=22723, Hockey=43658). Other sports
        // (cybersport, table tennis, …) keep their category-tree ID and the
        // UI renders them as "Sport <N>".
        var html = await File.ReadAllTextAsync(FixturePath);

        var events = await _sut.ParseAsync(html);

        // The fixture has Эльче-Алавес under Футбол → must be sport=11
        var football = events.SingleOrDefault(e => e.Id.Value == "26340575");
        football.Should().NotBeNull();
        football!.Sport.Value.Should().Be(11);
    }
}