diff --git a/RELEASENOTES.md b/RELEASENOTES.md index 8d1b81c5f5..1d55ba060a 100644 --- a/RELEASENOTES.md +++ b/RELEASENOTES.md @@ -63,6 +63,8 @@ * DataSource: * Audio: * Fix pop sounds that may occur during seeks. + * Fix truncation error accumulation for Sonic's + time-stretching/pitch-shifting algorithm. * Video: * Add workaround for a device issue on Galaxy Tab S7 FE that causes 60fps secure H264 streams to be marked as unsupported diff --git a/libraries/common/src/main/java/androidx/media3/common/audio/Sonic.java b/libraries/common/src/main/java/androidx/media3/common/audio/Sonic.java index ab36af3c53..ae2c84bad5 100644 --- a/libraries/common/src/main/java/androidx/media3/common/audio/Sonic.java +++ b/libraries/common/src/main/java/androidx/media3/common/audio/Sonic.java @@ -52,11 +52,23 @@ import java.util.Arrays; private int pitchFrameCount; private int oldRatePosition; private int newRatePosition; + + /** + * Number of frames pending to be copied from {@link #inputBuffer} directly to {@link + * #outputBuffer}. + * + *

This field is only relevant to time-stretching or pitch-shifting in {@link + * #changeSpeed(double)}, particularly when more frames need to be copied to the {@link + * #outputBuffer} than are available in {@link #inputBuffer} and Sonic must wait until the next + * buffer (or EOS) is queued. + */ private int remainingInputToCopyFrameCount; + private int prevPeriod; private int prevMinDiff; private int minDiff; private int maxDiff; + private double accumulatedSpeedAdjustmentError; /** * Creates a new Sonic audio stream processor. @@ -130,10 +142,26 @@ import java.util.Arrays; */ public void queueEndOfStream() { int remainingFrameCount = inputFrameCount; - float s = speed / pitch; - float r = rate * pitch; + double s = speed / pitch; + double r = rate * pitch; + + // If there are frames to be copied directly onto the output buffer, we should not count those + // as "input frames" because Sonic is not applying any processing on them. + int adjustedRemainingFrames = remainingFrameCount - remainingInputToCopyFrameCount; + + // We add directly to the output the number of frames in remainingInputToCopyFrameCount. + // Otherwise, expectedOutputFrames will be off and will make Sonic output an incorrect number of + // frames. int expectedOutputFrames = - outputFrameCount + (int) ((remainingFrameCount / s + pitchFrameCount) / r + 0.5f); + outputFrameCount + + (int) + ((adjustedRemainingFrames / s + + remainingInputToCopyFrameCount + + accumulatedSpeedAdjustmentError + + pitchFrameCount) + / r + + 0.5); + accumulatedSpeedAdjustmentError = 0; // Add enough silence to flush both input and pitch buffers. inputBuffer = @@ -166,6 +194,7 @@ import java.util.Arrays; prevMinDiff = 0; minDiff = 0; maxDiff = 0; + accumulatedSpeedAdjustmentError = 0; } /** Returns the size of output that can be read with {@link #getOutput(ShortBuffer)}, in bytes. 
*/ @@ -408,14 +437,19 @@ import java.util.Arrays; removePitchFrames(pitchFrameCount - 1); } - private int skipPitchPeriod(short[] samples, int position, float speed, int period) { + private int skipPitchPeriod(short[] samples, int position, double speed, int period) { // Skip over a pitch period, and copy period/speed samples to the output. int newFrameCount; if (speed >= 2.0f) { - newFrameCount = (int) (period / (speed - 1.0f)); + double expectedFrameCount = period / (speed - 1.0) + accumulatedSpeedAdjustmentError; + newFrameCount = (int) Math.round(expectedFrameCount); + accumulatedSpeedAdjustmentError = expectedFrameCount - newFrameCount; } else { newFrameCount = period; - remainingInputToCopyFrameCount = (int) (period * (2.0f - speed) / (speed - 1.0f)); + double expectedInputToCopy = + period * (2.0f - speed) / (speed - 1.0f) + accumulatedSpeedAdjustmentError; + remainingInputToCopyFrameCount = (int) Math.round(expectedInputToCopy); + accumulatedSpeedAdjustmentError = expectedInputToCopy - remainingInputToCopyFrameCount; } outputBuffer = ensureSpaceForAdditionalFrames(outputBuffer, outputFrameCount, newFrameCount); overlapAdd( @@ -431,14 +465,19 @@ import java.util.Arrays; return newFrameCount; } - private int insertPitchPeriod(short[] samples, int position, float speed, int period) { + private int insertPitchPeriod(short[] samples, int position, double speed, int period) { // Insert a pitch period, and determine how much input to copy directly. 
int newFrameCount; if (speed < 0.5f) { - newFrameCount = (int) (period * speed / (1.0f - speed)); + double expectedFrameCount = period * speed / (1.0f - speed) + accumulatedSpeedAdjustmentError; + newFrameCount = (int) Math.round(expectedFrameCount); + accumulatedSpeedAdjustmentError = expectedFrameCount - newFrameCount; } else { newFrameCount = period; - remainingInputToCopyFrameCount = (int) (period * (2.0f * speed - 1.0f) / (1.0f - speed)); + double expectedInputToCopy = + period * (2.0f * speed - 1.0f) / (1.0f - speed) + accumulatedSpeedAdjustmentError; + remainingInputToCopyFrameCount = (int) Math.round(expectedInputToCopy); + accumulatedSpeedAdjustmentError = expectedInputToCopy - remainingInputToCopyFrameCount; } outputBuffer = ensureSpaceForAdditionalFrames(outputBuffer, outputFrameCount, period + newFrameCount); @@ -461,7 +500,7 @@ import java.util.Arrays; return newFrameCount; } - private void changeSpeed(float speed) { + private void changeSpeed(double speed) { if (inputFrameCount < maxRequiredFrameCount) { return; } @@ -485,7 +524,7 @@ import java.util.Arrays; private void processStreamInput() { // Resample as many pitch periods as we have buffered on the input. 
int originalOutputFrameCount = outputFrameCount; - float s = speed / pitch; + double s = speed / pitch; float r = rate * pitch; if (s > 1.00001 || s < 0.99999) { changeSpeed(s); diff --git a/libraries/common/src/test/java/androidx/media3/common/audio/RandomParameterizedSonicTest.java b/libraries/common/src/test/java/androidx/media3/common/audio/RandomParameterizedSonicTest.java index 016ee7c8b5..cb474f2d50 100644 --- a/libraries/common/src/test/java/androidx/media3/common/audio/RandomParameterizedSonicTest.java +++ b/libraries/common/src/test/java/androidx/media3/common/audio/RandomParameterizedSonicTest.java @@ -16,6 +16,7 @@ package androidx.media3.common.audio; import static com.google.common.truth.Truth.assertThat; +import static java.lang.Math.max; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableSet; @@ -45,9 +46,30 @@ public final class RandomParameterizedSonicTest { private static final int PARAM_COUNT = 5; private static final int SPEED_DECIMAL_PRECISION = 2; + + /** + * Allowed error tolerance ratio for number of output samples for Sonic's time stretching + * algorithm. + * + *

The actual tolerance is calculated as the product of the expected output sample count and + * {@code TIME_STRETCHING_SAMPLE_DRIFT_TOLERANCE}, rounded to the nearest integer value. However, we + * always allow a minimum tolerance of ±1 samples. + * + *

This tolerance is roughly equal to an error of 900us/~44 samples/0.000017% for a 90 min mono + * stream @48KHz. To obtain the value, we ran 100 iterations of {@link + * #timeStretching_returnsExpectedNumberOfSamples()} (by setting {@link #PARAM_COUNT} to 10) and + * we calculated the average delta percentage between expected number of samples and actual number + * of samples (b/366169590). + */ + private static final BigDecimal TIME_STRETCHING_SAMPLE_DRIFT_TOLERANCE = + new BigDecimal("0.00000017"); + private static final ImmutableList<Range<Float>> SPEED_RANGES = ImmutableList.of( - Range.closedOpen(0f, 1f), Range.closedOpen(1f, 2f), Range.closedOpen(2f, 20f)); + Range.closedOpen(0f, 0.5f), + Range.closedOpen(0.5f, 1f), + Range.closedOpen(1f, 2f), + Range.closedOpen(2f, 20f)); private static final Random random = new Random(/* seed */ 0); @@ -165,6 +187,55 @@ public final class RandomParameterizedSonicTest { .of(expectedSize.longValueExact() - accumulatedError.longValueExact()); } + @Test + public void timeStretching_returnsExpectedNumberOfSamples() { + byte[] buf = new byte[BLOCK_SIZE * BYTES_PER_SAMPLE]; + ShortBuffer outBuffer = ShortBuffer.allocate(BLOCK_SIZE); + Sonic sonic = + new Sonic( + /* inputSampleRateHz= */ SAMPLE_RATE, + /* channelCount= */ 1, + speed, + /* pitch= */ 1, + /* outputSampleRateHz= */ SAMPLE_RATE); + long readSampleCount = 0; + + for (long samplesLeft = streamLength; samplesLeft > 0; samplesLeft -= BLOCK_SIZE) { + random.nextBytes(buf); + if (samplesLeft >= BLOCK_SIZE) { + sonic.queueInput(ByteBuffer.wrap(buf).asShortBuffer()); + } else { + sonic.queueInput( + ByteBuffer.wrap(buf, 0, (int) (samplesLeft * BYTES_PER_SAMPLE)).asShortBuffer()); + sonic.queueEndOfStream(); + } + while (sonic.getOutputSize() > 0) { + sonic.getOutput(outBuffer); + readSampleCount += outBuffer.position(); + outBuffer.clear(); + } + } + sonic.flush(); + + BigDecimal bigSpeed = new BigDecimal(String.valueOf(speed)); + BigDecimal bigLength = new
BigDecimal(String.valueOf(streamLength)); + // The scale of expectedSampleCount will always be equal to bigLength. Thus, the result will + // always + // yield an integer. + BigDecimal expectedSampleCount = bigLength.divide(bigSpeed, RoundingMode.HALF_EVEN); + + // Calculate allowed tolerance and round to nearest integer. + BigDecimal allowedTolerance = + TIME_STRETCHING_SAMPLE_DRIFT_TOLERANCE + .multiply(expectedSampleCount) + .setScale(/* newScale= */ 0, RoundingMode.HALF_EVEN); + + // Always allow at least 1 sample of tolerance. + long tolerance = max(allowedTolerance.longValue(), 1); + + assertThat(readSampleCount).isWithin(tolerance).of(expectedSampleCount.longValueExact()); + } + private static float round(float num) { BigDecimal bigDecimal = new BigDecimal(Float.toString(num)); return bigDecimal.setScale(SPEED_DECIMAL_PRECISION, RoundingMode.HALF_EVEN).floatValue(); diff --git a/libraries/test_data/src/test/assets/transformerdumps/wav/sample.wav/doubleSpeed.dump b/libraries/test_data/src/test/assets/transformerdumps/wav/sample.wav/doubleSpeed.dump new file mode 100644 index 0000000000..916aa42ead --- /dev/null +++ b/libraries/test_data/src/test/assets/transformerdumps/wav/sample.wav/doubleSpeed.dump @@ -0,0 +1,62 @@ +format audio: + averageBitrate = 131072 + sampleMimeType = audio/mp4a-latm + channelCount = 1 + sampleRate = 44100 + pcmEncoding = 2 +sample: + trackType = audio + dataHashCode = -858457440 + size = 4096 + isKeyFrame = true +sample: + trackType = audio + dataHashCode = -317223982 + size = 4096 + isKeyFrame = true +sample: + trackType = audio + dataHashCode = -510794633 + size = 4096 + isKeyFrame = true +sample: + trackType = audio + dataHashCode = -392394518 + size = 4096 + isKeyFrame = true +sample: + trackType = audio + dataHashCode = -1161865299 + size = 4096 + isKeyFrame = true +sample: + trackType = audio + dataHashCode = 251977808 + size = 4096 + isKeyFrame = true +sample: + trackType = audio + dataHashCode = -2046238978 + size = 4096 + 
isKeyFrame = true +sample: + trackType = audio + dataHashCode = -1083051456 + size = 4096 + isKeyFrame = true +sample: + trackType = audio + dataHashCode = 1068783564 + size = 4096 + isKeyFrame = true +sample: + trackType = audio + dataHashCode = -825415045 + size = 4096 + isKeyFrame = true +sample: + trackType = audio + dataHashCode = -1525522823 + size = 3140 + isKeyFrame = true +released = true diff --git a/libraries/transformer/src/androidTest/java/androidx/media3/transformer/TransformerEndToEndTest.java b/libraries/transformer/src/androidTest/java/androidx/media3/transformer/TransformerEndToEndTest.java index 9ef45ad23a..283cc1a5fc 100644 --- a/libraries/transformer/src/androidTest/java/androidx/media3/transformer/TransformerEndToEndTest.java +++ b/libraries/transformer/src/androidTest/java/androidx/media3/transformer/TransformerEndToEndTest.java @@ -2122,8 +2122,9 @@ public class TransformerEndToEndTest { sonic.setPitch(resamplingRate); Effects effects = new Effects( - ImmutableList.of(sonic, createByteCountingAudioProcessor(readBytes)), - ImmutableList.of()); + /* audioProcessors= */ ImmutableList.of( + sonic, createByteCountingAudioProcessor(readBytes)), + /* videoEffects= */ ImmutableList.of()); EditedMediaItem editedMediaItem = new EditedMediaItem.Builder(MediaItem.fromUri(WAV_ASSET.uri)).setEffects(effects).build(); @@ -2137,6 +2138,28 @@ public class TransformerEndToEndTest { assertThat(readBytes.get() / 2).isWithin(1).of(29400); } + @Test + public void adjustAudioSpeed_to2pt5Speed_hasExpectedOutputSampleCount() throws Exception { + AtomicInteger readBytes = new AtomicInteger(); + Transformer transformer = new Transformer.Builder(context).build(); + SonicAudioProcessor sonic = new SonicAudioProcessor(); + sonic.setSpeed(2.5f); + Effects effects = + new Effects( + /* audioProcessors= */ ImmutableList.of( + sonic, createByteCountingAudioProcessor(readBytes)), + /* videoEffects= */ ImmutableList.of()); + EditedMediaItem editedMediaItem = + new 
EditedMediaItem.Builder(MediaItem.fromUri(WAV_ASSET.uri)).setEffects(effects).build(); + + new TransformerAndroidTestRunner.Builder(context, transformer) + .build() + .run(testId, editedMediaItem); + // The test file contains 44100 samples (1 sec @44.1KHz, mono). We expect to receive 44100 / 2.5 + // samples. + assertThat(readBytes.get() / 2).isEqualTo(17640); + } + @Test public void speedAdjustedMedia_shorterAudioTrack_completesWithCorrectDuration() throws Exception { assumeFormatsSupported( diff --git a/libraries/transformer/src/test/java/androidx/media3/transformer/MediaItemExportTest.java b/libraries/transformer/src/test/java/androidx/media3/transformer/MediaItemExportTest.java index 1d94a54f2a..57797879e9 100644 --- a/libraries/transformer/src/test/java/androidx/media3/transformer/MediaItemExportTest.java +++ b/libraries/transformer/src/test/java/androidx/media3/transformer/MediaItemExportTest.java @@ -596,6 +596,36 @@ public final class MediaItemExportTest { getDumpFileName(/* originalFileName= */ FILE_AUDIO_RAW, /* modifications...= */ "48000hz")); } + @Test + public void adjustAudioSpeed_toDoubleSpeed_returnsExpectedNumberOfSamples() throws Exception { + CapturingMuxer.Factory muxerFactory = new CapturingMuxer.Factory(/* handleAudioAsPcm= */ true); + SonicAudioProcessor sonicAudioProcessor = new SonicAudioProcessor(); + sonicAudioProcessor.setSpeed(2f); + Transformer transformer = + createTransformerBuilder(muxerFactory, /* enableFallback= */ false).build(); + MediaItem mediaItem = MediaItem.fromUri(ASSET_URI_PREFIX + FILE_AUDIO_RAW); + AtomicInteger bytesRead = new AtomicInteger(); + + EditedMediaItem editedMediaItem = + new EditedMediaItem.Builder(mediaItem) + .setEffects( + createAudioEffects( + sonicAudioProcessor, createByteCountingAudioProcessor(bytesRead))) + .build(); + + transformer.start(editedMediaItem, outputDir.newFile().getPath()); + TransformerTestRunner.runLooper(transformer); + + // Time stretching 1 second @ 44100Hz into 22050 samples. 
+ assertThat(bytesRead.get() / 2).isEqualTo(22050); + + DumpFileAsserts.assertOutput( + context, + muxerFactory.getCreatedMuxer(), + getDumpFileName( + /* originalFileName= */ FILE_AUDIO_RAW, /* modifications...= */ "doubleSpeed")); + } + @Test public void start_withRawBigEndianAudioInput_completesSuccessfully() throws Exception { CapturingMuxer.Factory muxerFactory = new CapturingMuxer.Factory(/* handleAudioAsPcm= */ true);