feat(audio): Android on-device system playback capture

Enable audio-reactive lighting on the Android-TV build. A push-based AndroidAudioEngine captures system playback audio via AudioPlaybackCapture (API 29+), reusing the existing MediaProjection token, and feeds PCM into the unchanged AudioAnalyzer pipeline. No new Python deps; no Chaquopy/pip changes (numpy already bundled). - Python: android_audio_engine.py — module-level queue + configure/push_samples/shutdown mirroring mediaprojection_engine; AndroidAudioEngine (priority 100) registered behind a guarded import. push_samples copies and defensively trims/clamps each block so the analyzer can't crash on variable-length or non-frame-divisible PCM. - Kotlin: AudioCapture.kt — AudioRecord + AudioPlaybackCaptureConfiguration, fixed chunk-size block framing, little-endian float32, mic fallback (currently unwired); reads back the actual negotiated channel/sample rate. PythonBridge gains configureAudio/pushAudio/shutdownAudio with a cached module handle. - Wiring: CaptureService starts/stops AudioCapture in the MediaProjection path (gated on API>=29 + RECORD_AUDIO + live projection); MainActivity requests RECORD_AUDIO; manifest declares it. Degrades gracefully when denied; root path stays audio-less by design. - Tests: 13 desktop-CI tests incl. an over-length/non-divisible regression guard that exercises the full read_chunk -> AudioAnalyzer.analyze path.
2026-06-02 03:28:22 +03:00
parent 669ae20824
commit fd62db1720
8 changed files with 833 additions and 0 deletions
@@ -39,6 +39,14 @@
    <!-- POST_NOTIFICATIONS for Android 13+ foreground service notification -->
    <uses-permission android:name="android.permission.POST_NOTIFICATIONS" />

+    <!-- RECORD_AUDIO for on-device system-playback capture (AudioPlaybackCapture,
+         API 29+) feeding audio-reactive lighting. Runtime "dangerous" permission,
+         requested in MainActivity; capture degrades gracefully when denied.
+         Playback capture runs under the existing mediaProjection FGS type, so no
+         FOREGROUND_SERVICE_MICROPHONE / microphone FGS type is needed (that would
+         only be required if the mic-fallback path ran inside the service). -->
+    <uses-permission android:name="android.permission.RECORD_AUDIO" />
+
    <!-- Autostart on boot — BootReceiver spawns CaptureService in root
         mode so capture resumes without the user touching the remote. -->
    <uses-permission android:name="android.permission.RECEIVE_BOOT_COMPLETED" />
@@ -0,0 +1,234 @@
+package com.ledgrab.android
+
+import android.annotation.SuppressLint
+import android.media.AudioAttributes
+import android.media.AudioFormat
+import android.media.AudioPlaybackCaptureConfiguration
+import android.media.AudioRecord
+import android.media.MediaRecorder
+import android.media.projection.MediaProjection
+import android.os.Build
+import android.util.Log
+import java.nio.ByteBuffer
+import java.nio.ByteOrder
+
+/**
+ * Captures audio with [AudioRecord] and pushes interleaved float32 PCM to
+ * the LedGrab Python server via [PythonBridge], where the
+ * `android_audio_engine` feeds it into the unchanged audio-analysis
+ * pipeline.
+ *
+ * Two sources:
+ *  - [start] — system playback capture via `AudioPlaybackCapture` (API 29+),
+ *    reusing the same [MediaProjection] token the app already holds for
+ *    screen capture. This is the primary path on the consent flow.
+ *  - [startMic] — microphone fallback (`AudioSource.MIC`) for paths with no
+ *    MediaProjection (root mode) or API < 29.
+ *
+ * Mirrors [ScreenCapture]'s shape: a dedicated capture thread, a single
+ * reusable cross-JNI buffer (no per-block allocation → no GC churn on
+ * low-end TV boxes), and graceful teardown in [stop].
+ *
+ * The capture format is negotiated by [AudioRecord]; the **actual**
+ * channel count and sample rate are read back and forwarded to
+ * `configureAudio` so the Python analyzer's interleaving matches the bytes
+ * we push (e.g. a stereo request that the device satisfies as mono).
+ */
+class AudioCapture(
+    private val projection: MediaProjection?,
+    private val bridge: PythonBridge,
+    private val sampleRate: Int = 48000,
+    private val channels: Int = 2,
+    private val chunkFrames: Int = 1024,
+) {
+    companion object {
+        private const val TAG = "AudioCapture"
+        private const val BYTES_PER_FLOAT = 4
+    }
+
+    // The record currently owned by this instance (null when idle/stopped).
+    private var audioRecord: AudioRecord? = null
+    // Thread running [captureLoop]; joined with a bound in [stop].
+    private var captureThread: Thread? = null
+    // Written by the caller's thread (begin/stop) and by the capture thread
+    // (read-error path), read by the capture loop — hence @Volatile.
+    @Volatile private var running = false
+
+    /**
+     * Start system playback capture (API 29+). Requires the app to hold
+     * RECORD_AUDIO and a valid [projection].
+     *
+     * @return true if capture began (or was already running); false when the
+     *   API level is too low, there is no projection, the [AudioRecord]
+     *   could not be built/started, or the Python engine could not be
+     *   configured.
+     */
+    @SuppressLint("MissingPermission")
+    fun start(): Boolean {
+        if (running) return true
+        if (Build.VERSION.SDK_INT < Build.VERSION_CODES.Q) {
+            Log.i(TAG, "Playback capture needs API 29+; skipping (have ${Build.VERSION.SDK_INT})")
+            return false
+        }
+        val proj = projection
+        if (proj == null) {
+            Log.i(TAG, "No MediaProjection; playback capture unavailable")
+            return false
+        }
+
+        // Capture only streams tagged with these usages.
+        val config = AudioPlaybackCaptureConfiguration.Builder(proj)
+            .addMatchingUsage(AudioAttributes.USAGE_MEDIA)
+            .addMatchingUsage(AudioAttributes.USAGE_GAME)
+            .addMatchingUsage(AudioAttributes.USAGE_UNKNOWN)
+            .build()
+
+        val record = try {
+            AudioRecord.Builder()
+                .setAudioFormat(audioFormat())
+                .setBufferSizeInBytes(bufferBytes())
+                .setAudioPlaybackCaptureConfig(config)
+                .build()
+        } catch (e: Exception) {
+            Log.e(TAG, "Failed to build playback AudioRecord: ${e.message}")
+            return false
+        }
+        return begin(record, "playback")
+    }
+
+    /**
+     * Start microphone capture (fallback). Works on API 24+ and needs no
+     * MediaProjection. Requires RECORD_AUDIO.
+     *
+     * ⚠️ SECURITY/POLICY: currently UNWIRED (no caller). Microphone capture is
+     * a materially different posture than playback capture — it records real
+     * room audio (bystander voices). Before wiring this into [CaptureService]:
+     *  - add FOREGROUND_SERVICE_MICROPHONE permission + the `microphone` FGS
+     *    type (on API 34+ the service is killed without it), and
+     *  - add the Play Store privacy disclosure for microphone use,
+     *  - re-trigger a security review.
+     * Do NOT call this from inside the foreground service without the above.
+     *
+     * @return true if capture began (or was already running), false otherwise.
+     */
+    @SuppressLint("MissingPermission")
+    fun startMic(): Boolean {
+        if (running) return true
+        val record = try {
+            AudioRecord.Builder()
+                .setAudioSource(MediaRecorder.AudioSource.MIC)
+                .setAudioFormat(audioFormat())
+                .setBufferSizeInBytes(bufferBytes())
+                .build()
+        } catch (e: Exception) {
+            Log.e(TAG, "Failed to build mic AudioRecord: ${e.message}")
+            return false
+        }
+        return begin(record, "mic")
+    }
+
+    /** Stop capturing and release all resources. Idempotent. */
+    fun stop() {
+        running = false
+        // AudioRecord.stop() unblocks a pending READ_BLOCKING read within
+        // milliseconds, so the loop sees running=false and returns well inside
+        // the 500ms join window — release() below won't race a live read.
+        // (Mirrors ScreenCapture's bounded join.)
+        runCatching { audioRecord?.stop() }
+        captureThread?.let { runCatching { it.join(500) } }
+        captureThread = null
+        runCatching { audioRecord?.release() }
+        audioRecord = null
+        runCatching { bridge.shutdownAudio() }
+        Log.i(TAG, "Audio capture stopped")
+    }
+
+    // ── internals ──────────────────────────────────────────────────────
+
+    /**
+     * Take ownership of a freshly built [record], start it, configure the
+     * Python engine with the negotiated format and spawn the capture thread.
+     *
+     * On every failure path the record is released here, so callers never
+     * have to clean up after a `false` return.
+     *
+     * @param record the built (not yet recording) [AudioRecord].
+     * @param mode short label used only in log messages.
+     * @return true once the capture thread has been started.
+     */
+    private fun begin(record: AudioRecord, mode: String): Boolean {
+        if (record.state != AudioRecord.STATE_INITIALIZED) {
+            Log.e(TAG, "AudioRecord ($mode) failed to initialize")
+            runCatching { record.release() }
+            return false
+        }
+        val actualChannels = record.channelCount.coerceAtLeast(1)
+        val actualRate = record.sampleRate
+
+        // Confirm recording actually started before reporting success —
+        // startRecording() can throw (exclusive-capture contention) or
+        // leave the record in a non-recording state, in which case read()
+        // would only ever return errors.
+        val started = runCatching { record.startRecording() }.isSuccess &&
+            record.recordingState == AudioRecord.RECORDSTATE_RECORDING
+        if (!started) {
+            Log.e(TAG, "AudioRecord ($mode) failed to start recording")
+            runCatching { record.release() }
+            return false
+        }
+
+        // Recording confirmed — tell Python the real negotiated format
+        // before frames flow, so the analyzer's channel/sample-rate match
+        // the interleaving we push. The bridge call is not guarded on its
+        // side, so a failure there must not leak the already-recording
+        // AudioRecord nor escape as an exception from start()/startMic(),
+        // whose contract is a Boolean result.
+        try {
+            bridge.configureAudio(actualRate, actualChannels, chunkFrames)
+        } catch (e: Exception) {
+            Log.e(TAG, "Failed to configure audio engine ($mode): ${e.message}")
+            runCatching { record.stop() }
+            runCatching { record.release() }
+            return false
+        }
+
+        // A session that ended on a read error clears `running` but leaves
+        // its dead record in the field until stop() is called; release it
+        // before overwriting the reference so a restart does not leak it.
+        audioRecord?.let { stale -> runCatching { stale.release() } }
+
+        audioRecord = record
+        running = true
+        captureThread = Thread(
+            { captureLoop(record, actualChannels) },
+            "LedGrab-AudioCapture",
+        ).also { it.start() }
+        Log.i(TAG, "Audio capture started ($mode, sr=$actualRate ch=$actualChannels chunk=$chunkFrames)")
+        return true
+    }
+
+    /**
+     * Blocking read loop. Accumulates into fixed `chunkFrames * channels`
+     * float blocks and pushes only COMPLETE blocks — [AudioRecord.read]
+     * returns a variable count, so partial reads are stitched here rather
+     * than handed to Python as ragged chunks (the analyzer requires
+     * whole-frame, ≤ chunk-size blocks).
+     *
+     * @param record the recording [AudioRecord] to read from.
+     * @param actualChannels negotiated channel count, used to size a block.
+     */
+    private fun captureLoop(record: AudioRecord, actualChannels: Int) {
+        val blockFloats = chunkFrames * actualChannels
+        val floatBuf = FloatArray(blockFloats)
+        // Reusable little-endian byte buffer — Python copies on push, so the
+        // same backing array is safe to overwrite next block. Default
+        // ByteBuffer order is BIG_ENDIAN, which would corrupt every sample;
+        // LITTLE_ENDIAN matches numpy's native float32 on all Android ABIs.
+        val byteBuf = ByteArray(blockFloats * BYTES_PER_FLOAT)
+        val floatView = ByteBuffer.wrap(byteBuf).order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer()
+
+        // Number of floats of the current block already present in floatBuf.
+        var filled = 0
+        while (running) {
+            val n = record.read(floatBuf, filled, blockFloats - filled, AudioRecord.READ_BLOCKING)
+            if (n < 0) {
+                if (running) {
+                    // A negative read (e.g. ERROR_DEAD_OBJECT after an audio-route
+                    // change, ERROR_INVALID_OPERATION) means this AudioRecord is
+                    // finished. Deactivate the Python engine so is_available() stops
+                    // advertising a dead stream and the audio-reactive consumer isn't
+                    // left polling an empty queue forever. We're on the capture thread,
+                    // so we can't call stop() (it would self-join) — just flip running
+                    // and shut the engine down; onDestroy's stop() releases the record.
+                    Log.w(TAG, "AudioRecord.read error: $n — stopping audio capture")
+                    running = false
+                    runCatching { bridge.shutdownAudio() }
+                }
+                break
+            }
+            filled += n
+            if (filled < blockFloats) continue
+
+            // Full block: serialise through the little-endian view into
+            // byteBuf and hand the bytes to Python.
+            floatView.clear()
+            floatView.put(floatBuf, 0, blockFloats)
+            bridge.pushAudio(byteBuf)
+            filled = 0
+        }
+    }
+
+    /** Input channel mask for the REQUESTED channel count (stereo for 2+). */
+    private fun channelMask(): Int =
+        if (channels >= 2) AudioFormat.CHANNEL_IN_STEREO else AudioFormat.CHANNEL_IN_MONO
+
+    /** Requested capture format: float PCM at [sampleRate] with [channelMask]. */
+    private fun audioFormat(): AudioFormat =
+        AudioFormat.Builder()
+            .setEncoding(AudioFormat.ENCODING_PCM_FLOAT)
+            .setSampleRate(sampleRate)
+            .setChannelMask(channelMask())
+            .build()
+
+    /**
+     * Buffer size to request: the larger of the platform minimum and four
+     * blocks' worth of bytes; falls back to the latter when
+     * [AudioRecord.getMinBufferSize] reports an error (non-positive value).
+     */
+    private fun bufferBytes(): Int {
+        val minBuf = AudioRecord.getMinBufferSize(sampleRate, channelMask(), AudioFormat.ENCODING_PCM_FLOAT)
+        // A few blocks of headroom so a slow consumer doesn't overrun the
+        // hardware buffer between reads.
+        val want = chunkFrames * channels * BYTES_PER_FLOAT * 4
+        return if (minBuf > 0) maxOf(minBuf, want) else want
+    }
+}
@@ -4,9 +4,11 @@ import android.app.Notification
 import android.app.NotificationChannel
 import android.app.NotificationManager
 import android.app.PendingIntent
+import android.Manifest
 import android.app.Service
 import android.content.Context
 import android.content.Intent
+import android.content.pm.PackageManager
 import android.content.pm.ServiceInfo
 import android.media.projection.MediaProjection
 import android.media.projection.MediaProjectionManager
@@ -85,6 +87,7 @@ class CaptureService : Service() {
    private var bridge: PythonBridge? = null
    private var screenCapture: ScreenCapture? = null
    private var rootCapture: RootScreenrecord? = null
+    private var audioCapture: AudioCapture? = null
    private var mediaProjection: MediaProjection? = null

    // Service-scoped coroutine scope for the root-capture watchdog.
@@ -338,6 +341,25 @@ class CaptureService : Service() {
            onProjectionStopped = { stopSelf() },
        ).also { it.start() }

+        // Reuse the same projection to capture system playback audio so
+        // audio-reactive lighting works on-device (API 29+, RECORD_AUDIO
+        // granted). Best-effort: screen capture and the server keep running
+        // if audio is unavailable. Started AFTER ScreenCapture so the
+        // projection's callback is already registered.
+        if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.Q &&
+            checkSelfPermission(Manifest.permission.RECORD_AUDIO) ==
+            PackageManager.PERMISSION_GRANTED
+        ) {
+            // Keep the instance in the field ONLY if start() succeeded.
+            // Clearing the field from inside an `.also { }` on the assigned
+            // expression does not work: the lambda runs before the outer
+            // assignment completes, so the failed instance would be written
+            // back over the null.
+            val capture = AudioCapture(projection, newBridge)
+            audioCapture = if (capture.start()) {
+                capture
+            } else {
+                Log.i(TAG, "Playback audio capture unavailable — continuing without audio")
+                null
+            }
+        } else {
+            Log.i(TAG, "RECORD_AUDIO not granted or API < 29 — audio-reactive capture disabled")
+        }
+
        Log.i(TAG, "LedGrab service started (MediaProjection) — web UI at $url")
    }

@@ -351,6 +373,10 @@ class CaptureService : Service() {
        screenCapture?.stop()
        screenCapture = null

+        // Stop audio before the server: stop() calls bridge.shutdownAudio().
+        audioCapture?.stop()
+        audioCapture = null
+
        rootCapture?.stop()
        rootCapture = null

@@ -53,6 +53,7 @@ class MainActivity : Activity() {
        private const val SERVER_PORT = 8080
        private const val REQUEST_MEDIA_PROJECTION = 1001
        private const val REQUEST_POST_NOTIFICATIONS = 1002
+        private const val REQUEST_RECORD_AUDIO = 1003
        private const val QR_SIZE_PX = 560
    }

@@ -215,6 +216,7 @@ class MainActivity : Activity() {

    private fun startCaptureService(resultCode: Int, resultData: Intent) {
        ensureNotificationPermission()
+        ensureAudioPermission()
        val intent = CaptureService.createIntent(this, resultCode, resultData)
        ContextCompat.startForegroundService(this, intent)
        updateUI()
@@ -471,4 +473,24 @@ class MainActivity : Activity() {
            }
        }
    }
+
+    /**
+     * Ask for the RECORD_AUDIO runtime permission on API 29+ so the capture
+     * service is able to record system playback audio for audio-reactive
+     * lighting.
+     *
+     * The request is fire-and-forget, in the same spirit as
+     * [ensureNotificationPermission]: nothing here waits for the user's
+     * answer, because capture keeps working without the permission (only
+     * the audio is missing). A grant given through this prompt takes effect
+     * on the next Start.
+     *
+     * NOTE(review): the caller invokes this right after
+     * [ensureNotificationPermission]. If that call still has a permission
+     * dialog pending, confirm the platform does not drop this second
+     * request — TODO verify on a device with neither permission granted.
+     */
+    private fun ensureAudioPermission() {
+        val supportsPlaybackCapture = Build.VERSION.SDK_INT >= Build.VERSION_CODES.Q
+        if (!supportsPlaybackCapture) return
+
+        val alreadyGranted =
+            checkSelfPermission(Manifest.permission.RECORD_AUDIO) == PackageManager.PERMISSION_GRANTED
+        if (alreadyGranted) return
+
+        @Suppress("DEPRECATION")
+        requestPermissions(arrayOf(Manifest.permission.RECORD_AUDIO), REQUEST_RECORD_AUDIO)
+    }
 }
@@ -28,6 +28,7 @@ class PythonBridge(private val context: Context) {
    // single-writer/single-reader pattern we have here.
    @Volatile private var mediaProjectionEngine: PyObject? = null
    @Volatile private var rootEngine: PyObject? = null
+    @Volatile private var androidAudioEngine: PyObject? = null

    /**
     * Configure the MediaProjection engine with screen dimensions.
@@ -53,6 +54,49 @@ class PythonBridge(private val context: Context) {
        Log.i(TAG, "Root screenrecord engine configured: ${width}x${height}")
    }

+    /**
+     * Hand the audio format that [AudioCapture]'s `AudioRecord` actually
+     * negotiated to the Android playback-capture audio engine, and remember
+     * the Python module so [pushAudio] can reuse it for every block without
+     * another lookup. Call this before the first [pushAudio].
+     *
+     * Exceptions raised by the module lookup or by the Python `configure`
+     * call are not caught here and propagate to the caller; in that case the
+     * cached handle is left untouched.
+     *
+     * @param sampleRate negotiated sample rate.
+     * @param channels negotiated channel count.
+     * @param chunkFrames number of frames per pushed block.
+     */
+    fun configureAudio(sampleRate: Int, channels: Int, chunkFrames: Int) {
+        val module = Python.getInstance().getModule("ledgrab.core.audio.android_audio_engine")
+        module.callAttr("configure", sampleRate, channels, chunkFrames)
+        // Publish the handle only after configure() succeeded.
+        androidAudioEngine = module
+        Log.i(TAG, "Android audio engine configured: sr=$sampleRate ch=$channels chunk=$chunkFrames")
+    }
+
+    /**
+     * Forward one block of interleaved little-endian float32 PCM to the
+     * Python audio engine. Invoked from [AudioCapture]'s capture thread.
+     *
+     * The block is silently dropped when the bridge is not running or when
+     * no engine has been configured yet. A failure inside the Python call is
+     * logged and swallowed so one bad block cannot kill the capture thread.
+     * The array crosses the JNI boundary and Python copies it on receipt, so
+     * the caller is free to reuse the same buffer for the next block.
+     *
+     * @param pcmFloat32 raw bytes of the PCM block.
+     */
+    fun pushAudio(pcmFloat32: ByteArray) {
+        if (!running) return
+        androidAudioEngine?.let { module ->
+            try {
+                module.callAttr("push_samples", pcmFloat32)
+            } catch (e: Exception) {
+                Log.w(TAG, "Failed to push audio: ${e.message}")
+            }
+        }
+    }
+
+    /**
+     * Deactivate the Python audio engine and forget the cached module
+     * handle. Invoked from [AudioCapture.stop] and from the capture thread
+     * after a read error.
+     *
+     * Does nothing when no engine is configured. A failure of the Python
+     * `shutdown` call is logged and swallowed; the handle is cleared either
+     * way, so later [pushAudio] calls become no-ops.
+     */
+    fun shutdownAudio() {
+        val module = androidAudioEngine
+        if (module == null) return
+        try {
+            module.callAttr("shutdown")
+        } catch (e: Exception) {
+            Log.w(TAG, "Failed to shut down audio engine: ${e.message}")
+        }
+        androidAudioEngine = null
+    }
+
    /**
     * Start the LedGrab FastAPI server on a background thread.
     *