perf(scraping): parallel HTTP fan-out, sequential DB persist (HIGH)

The Pull*UseCase implementations issued one HTTP request at a time despite
Scraping:MaxConcurrentRequests=4. With 30–80 live events and ~1s per
fetch, a 5–10s live cadence target was unreachable; cycles overflowed
the configured interval.

* New Marathon.Application.Configuration.ScrapingThrottle bound from the
  shared Scraping:* section. Exposes only MaxConcurrentRequests so the
  Application layer doesn't pull in the Infrastructure-side ScrapingOptions.
* PullLiveOddsUseCase + PullUpcomingEventsUseCase split into two phases:
  - Phase 1 — Parallel.ForEachAsync over the event list with
    MaxDegreeOfParallelism = throttle.MaxConcurrentRequests. The scraper's
    Polly rate limiter still throttles to RequestsPerSecond underneath
    this fan-out, so spikes are smoothed before they hit the bookmaker.
  - Phase 2 — sequential foreach over the (Event, Snapshot) tuples
    captured in Phase 1, doing event upsert + snapshot insert. EF Core
    DbContext is not thread-safe, so all DB writes are awaited one at a
    time and never overlap (sequential access, not thread affinity).
* InfrastructureModule binds ScrapingThrottle alongside AnomalyOptions.
* Failed snapshot scrapes in Phase 1 mean the event row is also NOT
  persisted in Phase 2 — previously we'd persist the row even when the
  snapshot scrape failed, leaving an orphan event with no odds. Updated
  the regression test accordingly.
* Test fixture exposes TestFixtures.Throttle(maxConcurrentRequests=1) for
  deterministic sequential test runs.
* One existing NSubstitute setup that chained Arg.Is<>() across two
  configurations was rewritten to use a single Arg.Any<>() with inline
  branching — chained matchers were leaking and returning wrong results.
This commit is contained in:
2026-05-09 15:27:06 +03:00
parent 66ae038243
commit 286b55986b
8 changed files with 177 additions and 53 deletions
@@ -0,0 +1,26 @@
namespace Marathon.Application.Configuration;
/// <summary>
/// Scraping concurrency settings as seen from the Application layer.
/// </summary>
/// <remarks>
/// <para>
/// Meant to be bound from the <c>Scraping</c> appsettings section (see
/// <see cref="SectionName"/>) — the same section that
/// <c>Marathon.Infrastructure.Configuration.ScrapingOptions</c> reads. Only the
/// values the use cases need for scheduling fan-out are surfaced here, so
/// use-case code never has to reference the Infrastructure namespace.
/// </para>
/// </remarks>
public sealed class ScrapingThrottle
{
    /// <summary>
    /// Name of the configuration section this type is bound from.
    /// </summary>
    public const string SectionName = "Scraping";

    /// <summary>
    /// Value used for <see cref="MaxConcurrentRequests"/> when nothing else is assigned.
    /// </summary>
    private const int DefaultMaxConcurrentRequests = 4;

    /// <summary>
    /// Upper bound on the number of HTTP requests the scraper may have in
    /// flight at once. Use cases feed this into
    /// <see cref="System.Threading.Tasks.ParallelOptions.MaxDegreeOfParallelism"/>
    /// for batch fan-out. The bookmaker rate limiter still throttles to
    /// <c>RequestsPerSecond</c> underneath this value.
    /// </summary>
    /// <remarks>
    /// No range validation happens here; consumers are expected to guard
    /// against zero or negative values before using the setting.
    /// </remarks>
    public int MaxConcurrentRequests { get; init; } = DefaultMaxConcurrentRequests;
}
@@ -1,7 +1,10 @@
using System.Collections.Concurrent;
using Marathon.Application.Abstractions;
using Marathon.Application.Configuration;
using Marathon.Domain.Entities;
using Marathon.Domain.Enums;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
namespace Marathon.Application.UseCases;
@@ -21,17 +24,20 @@ public sealed class PullLiveOddsUseCase
private readonly IOddsScraper _scraper;
private readonly IEventRepository _eventRepo;
private readonly ISnapshotRepository _snapshotRepo;
private readonly IOptionsMonitor<ScrapingThrottle> _throttle;
private readonly ILogger<PullLiveOddsUseCase> _logger;
/// <summary>
/// Wires the use case to its collaborators. Every dependency is mandatory.
/// </summary>
/// <param name="scraper">Scraper used to fetch odds snapshots for events.</param>
/// <param name="eventRepo">Repository used to look up, add and update event rows.</param>
/// <param name="snapshotRepo">Repository that receives the captured snapshots.</param>
/// <param name="throttle">Monitor supplying the current <see cref="ScrapingThrottle"/> settings.</param>
/// <param name="logger">Logger for progress and per-event failure messages.</param>
/// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
public PullLiveOddsUseCase(
    IOddsScraper scraper,
    IEventRepository eventRepo,
    ISnapshotRepository snapshotRepo,
    IOptionsMonitor<ScrapingThrottle> throttle,
    ILogger<PullLiveOddsUseCase> logger)
{
    // Guards run in parameter order, so the first null argument is the one reported.
    ArgumentNullException.ThrowIfNull(scraper);
    ArgumentNullException.ThrowIfNull(eventRepo);
    ArgumentNullException.ThrowIfNull(snapshotRepo);
    ArgumentNullException.ThrowIfNull(throttle);
    ArgumentNullException.ThrowIfNull(logger);

    _scraper = scraper;
    _eventRepo = eventRepo;
    _snapshotRepo = snapshotRepo;
    _throttle = throttle;
    _logger = logger;
}
@@ -64,39 +70,27 @@ public sealed class PullLiveOddsUseCase
"PullLiveOddsUseCase: scraper returned {Count} live events",
liveEvents.Count);
int snapshotsCaptured = 0;
// Phase 1 — parallel HTTP fan-out: scrape every event's odds in parallel,
// capped at MaxConcurrentRequests. The scraper's rate limiter still
// throttles to RequestsPerSecond underneath this fan-out, so spikes are
// smoothed out before they reach the bookmaker. We deliberately do NOT
// touch the DbContext (or its repositories) inside the parallel block —
// EF Core DbContext is not thread-safe.
var scraped = new ConcurrentBag<(Event Live, OddsSnapshot Snapshot)>();
var maxParallelism = Math.Max(1, _throttle.CurrentValue.MaxConcurrentRequests);
foreach (var live in liveEvents)
var parallelOptions = new ParallelOptions
{
ct.ThrowIfCancellationRequested();
MaxDegreeOfParallelism = maxParallelism,
CancellationToken = ct,
};
// Persist new live events — the upcoming poller may not have seen them
// yet (or never will, for matches added after their scheduled start).
// The Live page reads from the events table, so a new live row must
// exist before its snapshots become visible.
Event eventForScrape;
await Parallel.ForEachAsync(liveEvents, parallelOptions, async (live, taskCt) =>
{
try
{
var existing = await _eventRepo.GetAsync(live.Id, ct);
if (existing is null)
{
await _eventRepo.AddAsync(live, ct);
await _eventRepo.SaveChangesAsync(ct);
eventForScrape = live;
}
else if (existing.EventPath is null && live.EventPath is not null)
{
// Backfill EventPath on rows persisted before the column existed,
// so subsequent scrapes can use the correct URL.
var patched = existing with { EventPath = live.EventPath };
await _eventRepo.UpdateAsync(patched, ct);
await _eventRepo.SaveChangesAsync(ct);
eventForScrape = patched;
}
else
{
eventForScrape = existing;
}
var snapshot = await _scraper.ScrapeEventOddsAsync(live, OddsSource.Live, taskCt);
scraped.Add((live, snapshot));
}
catch (OperationCanceledException)
{
@@ -105,14 +99,38 @@ public sealed class PullLiveOddsUseCase
catch (Exception ex)
{
_logger.LogWarning(ex,
"PullLiveOddsUseCase: failed to persist/lookup live event {EventId} — skipping",
"PullLiveOddsUseCase: failed to capture live snapshot for event {EventId} — skipping",
live.Id.Value);
continue;
}
});
// Phase 2 — sequential persistence. EF Core DbContext is single-threaded,
// so we apply each (event upsert + snapshot insert) one at a time.
int snapshotsCaptured = 0;
foreach (var (live, snapshot) in scraped)
{
ct.ThrowIfCancellationRequested();
try
{
var snapshot = await _scraper.ScrapeEventOddsAsync(eventForScrape, OddsSource.Live, ct);
// Persist new live events — the upcoming poller may not have seen them
// yet (or never will, for matches added after their scheduled start).
// The Live page reads from the events table, so a new live row must
// exist before its snapshots become visible.
var existing = await _eventRepo.GetAsync(live.Id, ct);
if (existing is null)
{
await _eventRepo.AddAsync(live, ct);
await _eventRepo.SaveChangesAsync(ct);
}
else if (existing.EventPath is null && live.EventPath is not null)
{
// Backfill EventPath on rows persisted before the column existed.
var patched = existing with { EventPath = live.EventPath };
await _eventRepo.UpdateAsync(patched, ct);
await _eventRepo.SaveChangesAsync(ct);
}
await _snapshotRepo.AddAsync(snapshot, ct);
await _snapshotRepo.SaveChangesAsync(ct);
snapshotsCaptured++;
@@ -124,8 +142,8 @@ public sealed class PullLiveOddsUseCase
catch (Exception ex)
{
_logger.LogWarning(ex,
"PullLiveOddsUseCase: failed to capture live snapshot for event {EventId} — skipping",
eventForScrape.Id.Value);
"PullLiveOddsUseCase: failed to persist live snapshot for event {EventId} — skipping",
live.Id.Value);
}
}
@@ -1,5 +1,10 @@
using System.Collections.Concurrent;
using Marathon.Application.Abstractions;
using Marathon.Application.Configuration;
using Marathon.Domain.Entities;
using Marathon.Domain.Enums;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
namespace Marathon.Application.UseCases;
@@ -13,18 +18,21 @@ public sealed class PullUpcomingEventsUseCase
private readonly IOddsScraper _scraper;
private readonly IEventRepository _eventRepo;
private readonly ISnapshotRepository _snapshotRepo;
private readonly IOptionsMonitor<ScrapingThrottle> _throttle;
private readonly ILogger<PullUpcomingEventsUseCase> _logger;
/// <summary>
/// Wires the use case to its collaborators. Every dependency is mandatory.
/// </summary>
/// <param name="scraper">Scraper used to list upcoming events and fetch their odds snapshots.</param>
/// <param name="eventRepo">Repository for event rows.</param>
/// <param name="snapshotRepo">Repository that receives the captured snapshots.</param>
/// <param name="throttle">Monitor supplying the current <see cref="ScrapingThrottle"/> settings.</param>
/// <param name="logger">Logger for progress and per-event failure messages.</param>
/// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
public PullUpcomingEventsUseCase(
    IOddsScraper scraper,
    IEventRepository eventRepo,
    ISnapshotRepository snapshotRepo,
    IOptionsMonitor<ScrapingThrottle> throttle,
    ILogger<PullUpcomingEventsUseCase> logger)
{
    // One guarded assignment per dependency, checked in parameter order.
    _scraper = scraper ?? throw new ArgumentNullException(nameof(scraper));
    _eventRepo = eventRepo ?? throw new ArgumentNullException(nameof(eventRepo));
    _snapshotRepo = snapshotRepo ?? throw new ArgumentNullException(nameof(snapshotRepo));
    _throttle = throttle ?? throw new ArgumentNullException(nameof(throttle));
    _logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
/// <summary>
@@ -44,14 +52,49 @@ public sealed class PullUpcomingEventsUseCase
var events = await _scraper.ScrapeUpcomingAsync(sportFilter: null, ct);
int eventsProcessed = events.Count;
int newEvents = 0;
int snapshotsCaptured = 0;
_logger.LogInformation(
"PullUpcomingEventsUseCase: scraper returned {Count} events",
eventsProcessed);
foreach (var ev in events)
// Phase 1 — parallel HTTP fan-out. Each event's odds snapshot is scraped
// concurrently up to MaxConcurrentRequests; the scraper's rate limiter
// smooths spikes underneath. We do NOT touch the DbContext here — EF Core
// is single-threaded.
var scraped = new ConcurrentBag<(Event Event, OddsSnapshot Snapshot)>();
var maxParallelism = Math.Max(1, _throttle.CurrentValue.MaxConcurrentRequests);
var parallelOptions = new ParallelOptions
{
MaxDegreeOfParallelism = maxParallelism,
CancellationToken = ct,
};
await Parallel.ForEachAsync(events, parallelOptions, async (ev, taskCt) =>
{
try
{
var snapshot = await _scraper.ScrapeEventOddsAsync(ev, OddsSource.PreMatch, taskCt);
scraped.Add((ev, snapshot));
}
catch (OperationCanceledException)
{
throw;
}
catch (Exception ex)
{
_logger.LogWarning(ex,
"PullUpcomingEventsUseCase: failed to capture snapshot for event {EventId} — skipping",
ev.Id.Value);
}
});
// Phase 2 — sequential persistence. Upsert event row, then save the
// captured snapshot. Per-event try/catch keeps a single failure from
// aborting the whole cycle.
int newEvents = 0;
int snapshotsCaptured = 0;
foreach (var (ev, snapshot) in scraped)
{
ct.ThrowIfCancellationRequested();
@@ -78,11 +121,6 @@ public sealed class PullUpcomingEventsUseCase
try
{
var snapshot = await _scraper.ScrapeEventOddsAsync(
ev,
Domain.Enums.OddsSource.PreMatch,
ct);
await _snapshotRepo.AddAsync(snapshot, ct);
await _snapshotRepo.SaveChangesAsync(ct);
snapshotsCaptured++;
@@ -94,7 +132,7 @@ public sealed class PullUpcomingEventsUseCase
catch (Exception ex)
{
_logger.LogWarning(ex,
"PullUpcomingEventsUseCase: failed to capture snapshot for event {EventId} — skipping",
"PullUpcomingEventsUseCase: failed to persist snapshot for event {EventId} — skipping",
ev.Id.Value);
}
}
@@ -46,6 +46,10 @@ public static class InfrastructureModule
.AddOptions<AnomalyOptions>()
.Bind(config.GetSection(AnomalyOptions.SectionName));
services
.AddOptions<ScrapingThrottle>()
.Bind(config.GetSection(ScrapingThrottle.SectionName));
services.AddHostedService<UpcomingEventsPoller>();
services.AddHostedService<LiveOddsPoller>();
services.AddHostedService<ResultsWatchListPoller>();
@@ -18,6 +18,7 @@ public sealed class PullLiveOddsUseCaseTests
/// <summary>
/// Builds the use case under test from this test class's scraper and
/// repository fields, the default <c>TestFixtures.Throttle()</c> monitor and
/// the shared <c>NullLogger</c> instance.
/// </summary>
private PullLiveOddsUseCase CreateSut()
{
    return new PullLiveOddsUseCase(
        _scraper,
        _eventRepo,
        _snapshotRepo,
        TestFixtures.Throttle(),
        NullLogger<PullLiveOddsUseCase>.Instance);
}
[Fact]
@@ -18,6 +18,7 @@ public sealed class PullUpcomingEventsUseCaseTests
/// <summary>
/// Builds the use case under test from this test class's scraper and
/// repository fields, the default <c>TestFixtures.Throttle()</c> monitor and
/// the shared <c>NullLogger</c> instance.
/// </summary>
private PullUpcomingEventsUseCase CreateSut()
{
    return new PullUpcomingEventsUseCase(
        _scraper,
        _eventRepo,
        _snapshotRepo,
        TestFixtures.Throttle(),
        NullLogger<PullUpcomingEventsUseCase>.Instance);
}
[Fact]
@@ -88,24 +89,30 @@ public sealed class PullUpcomingEventsUseCaseTests
var ev2 = TestFixtures.MakeEvent("22222222");
var events = new List<Event> { ev1, ev2 }.AsReadOnly();
_scraper.ScrapeUpcomingAsync(null, Arg.Any<CancellationToken>()).Returns(events);
_scraper.ScrapeUpcomingAsync(Arg.Any<SportCode?>(), Arg.Any<CancellationToken>()).Returns(events);
_eventRepo.GetAsync(Arg.Any<EventId>(), Arg.Any<CancellationToken>()).Returns((Event?)null);
// Inline routing of the throwing-vs-passing branch on event id, since chained
// Arg.Is<>() setups can leak matchers across NSubstitute invocations.
_scraper.ScrapeEventOddsAsync(
Arg.Is<Event>(e => e.Id == ev1.Id), OddsSource.PreMatch, Arg.Any<CancellationToken>())
.ThrowsAsync(new HttpRequestException("site down"));
_scraper.ScrapeEventOddsAsync(
Arg.Is<Event>(e => e.Id == ev2.Id), OddsSource.PreMatch, Arg.Any<CancellationToken>())
.Returns(TestFixtures.MakeSnapshot(ev2.Id));
Arg.Any<Event>(), OddsSource.PreMatch, Arg.Any<CancellationToken>())
.Returns<OddsSnapshot>(ci =>
{
var evArg = ci.Arg<Event>();
if (evArg.Id == ev1.Id)
throw new HttpRequestException("site down");
return TestFixtures.MakeSnapshot(evArg.Id);
});
var sut = CreateSut();
// Act — should not throw
var (processed, newEvents, snapshots) = await sut.ExecuteAsync(CancellationToken.None);
// Assert
// Assert: ev1's snapshot scrape failed in Phase 1, so it's not even
// attempted in Phase 2 — no orphan event row gets persisted.
processed.Should().Be(2);
newEvents.Should().Be(2);
newEvents.Should().Be(1, "ev1's snapshot failed so it was not persisted");
snapshots.Should().Be(1, "only ev2 snapshot succeeded");
}
@@ -1,6 +1,8 @@
using Marathon.Application.Configuration;
using Marathon.Domain.Entities;
using Marathon.Domain.Enums;
using Marathon.Domain.ValueObjects;
using Microsoft.Extensions.Options;
namespace Marathon.Application.Tests.UseCases;
@@ -42,4 +44,23 @@ internal static class TestFixtures
{
return new EventResult(eventId, 2, 1, Side.Side1, DateTimeOffset.UtcNow);
}
/// <summary>
/// Builds an <see cref="IOptionsMonitor{TOptions}"/> whose value is fixed to a
/// <see cref="ScrapingThrottle"/> with the requested concurrency limit.
/// </summary>
/// <param name="maxConcurrentRequests">
/// Value assigned to <see cref="ScrapingThrottle.MaxConcurrentRequests"/>.
/// Pass 1 for sequential test behaviour, higher values to exercise fan-out.
/// </param>
/// <returns>A monitor that always hands back the same throttle instance.</returns>
public static IOptionsMonitor<ScrapingThrottle> Throttle(int maxConcurrentRequests = 1)
{
    var fixedThrottle = new ScrapingThrottle
    {
        MaxConcurrentRequests = maxConcurrentRequests,
    };

    return new StaticOptionsMonitor<ScrapingThrottle>(fixedThrottle);
}
/// <summary>
/// Minimal <see cref="IOptionsMonitor{TOptions}"/> that serves one fixed
/// options instance and never raises change notifications.
/// </summary>
private sealed class StaticOptionsMonitor<T> : IOptionsMonitor<T> where T : class
{
    /// <summary>Captures the instance that every read returns.</summary>
    public StaticOptionsMonitor(T value)
    {
        CurrentValue = value;
    }

    /// <summary>The instance supplied at construction.</summary>
    public T CurrentValue { get; }

    /// <summary>Returns the same fixed instance; <paramref name="name"/> is ignored.</summary>
    public T Get(string? name)
    {
        return CurrentValue;
    }

    /// <summary>
    /// Does not register <paramref name="listener"/>; returns <c>null</c>
    /// instead of a registration handle.
    /// </summary>
    public IDisposable? OnChange(Action<T, string?> listener)
    {
        return null;
    }
}
}
@@ -1,5 +1,6 @@
using FluentAssertions;
using Marathon.Application.Abstractions;
using Marathon.Application.Configuration;
using Marathon.Application.UseCases;
using Marathon.Domain.Entities;
using Marathon.Domain.Enums;
@@ -47,11 +48,19 @@ public sealed class LiveOddsPollerTests
sp.GetRequiredService<IOddsScraper>(),
sp.GetRequiredService<IEventRepository>(),
sp.GetRequiredService<ISnapshotRepository>(),
StaticThrottle(),
NullLogger<PullLiveOddsUseCase>.Instance));
return services.BuildServiceProvider();
}
/// <summary>
/// Creates a substitute <see cref="IOptionsMonitor{TOptions}"/> whose
/// <c>CurrentValue</c> is a <see cref="ScrapingThrottle"/> limited to a single
/// concurrent request.
/// </summary>
private static IOptionsMonitor<ScrapingThrottle> StaticThrottle()
{
    var sequentialThrottle = new ScrapingThrottle { MaxConcurrentRequests = 1 };

    var throttleMonitor = Substitute.For<IOptionsMonitor<ScrapingThrottle>>();
    throttleMonitor.CurrentValue.Returns(sequentialThrottle);

    return throttleMonitor;
}
private static IOptionsMonitor<WorkerOptions> BuildOptions(
bool enabled = true,
int intervalSeconds = 0)