From dfe47219f91196a4c77b5af3485ee9affced83ce Mon Sep 17 00:00:00 2001 From: tonihei Date: Mon, 29 Jan 2024 02:29:15 -0800 Subject: [PATCH] Update skip silence algorithm The updated algorithm has two main improvements: - The silence padding is not constant but a ratio of the original silence (up to a defined max) to more naturally represent the original gaps. - The silence is not instantly going to zero, but uses a ramp down and up for a smooth volume transition and also retains a small percentage of the original noise for more natural "silences" that still contain some background noise. #minor-release Issue: google/ExoPlayer#7423 PiperOrigin-RevId: 602322442 (cherry picked from commit bb533332f4b6f46ac9d5ca17cb7943bd1bdb7dd2) --- RELEASENOTES.md | 7 + .../audio/SilenceSkippingAudioProcessor.java | 699 +++++++++++++----- .../SilenceSkippingAudioProcessorTest.java | 76 +- 3 files changed, 592 insertions(+), 190 deletions(-) diff --git a/RELEASENOTES.md b/RELEASENOTES.md index bf9e9216a4..cecd3c4ed2 100644 --- a/RELEASENOTES.md +++ b/RELEASENOTES.md @@ -38,6 +38,13 @@ This release includes the following changes since the * Extract audio types from TS descriptors and map them to role flags, allowing users to make better-informed audio track selections ([#973](https://github.com/androidx/media/pull/973)). +* Audio: + * Improve silence skipping algorithm with smooth volume ramp, retained + minimal silence and more natural silence durations + ([#7423](https://github.com/google/ExoPlayer/issues/7423)). +* Video: +* Text: +* Metadata: * Image: * Add support for DASH thumbnails. Grid images are cropped and individual thumbnails are provided to `ImageOutput` close to their presentation diff --git a/libraries/exoplayer/src/main/java/androidx/media3/exoplayer/audio/SilenceSkippingAudioProcessor.java b/libraries/exoplayer/src/main/java/androidx/media3/exoplayer/audio/SilenceSkippingAudioProcessor.java index b4d35dd362..bc6eb805d2 100644 --- a/libraries/exoplayer/src/main/java/androidx/media3/exoplayer/audio/SilenceSkippingAudioProcessor.java +++ b/libraries/exoplayer/src/main/java/androidx/media3/exoplayer/audio/SilenceSkippingAudioProcessor.java @@ -15,14 +15,16 @@ */ package androidx.media3.exoplayer.audio; +import static androidx.media3.common.util.Assertions.checkArgument; +import static androidx.media3.common.util.Assertions.checkState; import static java.lang.Math.min; import static java.lang.annotation.ElementType.TYPE_USE; import androidx.annotation.IntDef; import androidx.media3.common.C; +import androidx.media3.common.Format; import androidx.media3.common.audio.AudioProcessor; import androidx.media3.common.audio.BaseAudioProcessor; -import androidx.media3.common.util.Assertions; import androidx.media3.common.util.UnstableApi; import androidx.media3.common.util.Util; import java.lang.annotation.Documented; @@ -39,103 +41,212 @@ import java.nio.ByteBuffer; public final class SilenceSkippingAudioProcessor extends BaseAudioProcessor { /** - * The default value for {@link #SilenceSkippingAudioProcessor(long, long, short) - * minimumSilenceDurationUs}. + * Default fraction of the original silence to keep. Between [0, 1]. 1 means keep all silence. 0 + * means remove all silence. */ - public static final long DEFAULT_MINIMUM_SILENCE_DURATION_US = 150_000; + public static final float DEFAULT_SILENCE_RETENTION_RATIO = 0.2f; /** - * The default value for {@link #SilenceSkippingAudioProcessor(long, long, short) - * paddingSilenceUs}. + * Default volume percentage to keep. + * + *

Even when modifying the volume to a mute state, it is ideal to decrease the volume instead + * of making the volume zero. Completely silent audio sounds like playback has stopped. While + * decreased volume sounds like very light background noise at a recording studio. */ - public static final long DEFAULT_PADDING_SILENCE_US = 20_000; + public static final int DEFAULT_MIN_VOLUME_TO_KEEP_PERCENTAGE = 10; - /** - * The default value for {@link #SilenceSkippingAudioProcessor(long, long, short) - * silenceThresholdLevel}. - */ + /** Default absolute level below which an individual PCM sample is classified as silent. */ public static final short DEFAULT_SILENCE_THRESHOLD_LEVEL = 1024; + /** + * Default minimum duration of audio that must be below {@code silenceThresholdLevel} before + * silence starts being trimmed. Specified in microseconds. + */ + public static final long DEFAULT_MINIMUM_SILENCE_DURATION_US = 100_000; + + /** + * Default maximum silence to keep in microseconds. This maximum is applied after {@code + * silenceRetentionRatio}. + */ + public static final long DEFAULT_MAX_SILENCE_TO_KEEP_DURATION_US = 2_000_000; + + /** + * @deprecated Specify silence behaviour via {@code silenceRetentionRatio} instead. + */ + @Deprecated public static final long DEFAULT_PADDING_SILENCE_US = 20_000; + /** Trimming states. */ @Documented @Retention(RetentionPolicy.SOURCE) @Target(TYPE_USE) @IntDef({ STATE_NOISY, - STATE_MAYBE_SILENT, - STATE_SILENT, + STATE_SHORTENING_SILENCE, }) private @interface State {} /** State when the input is not silent. */ private static final int STATE_NOISY = 0; - /** State when the input may be silent but we haven't read enough yet to know. */ - private static final int STATE_MAYBE_SILENT = 1; + /** + * State when the input has been silent less than or equal to {@link #maxSilenceToKeepDurationUs} + * and the silence is being shortened according to {@link #calculateShortenedSilenceLength(int)}. + */ + private static final int STATE_SHORTENING_SILENCE = 1; - /** State when the input is silent. */ - private static final int STATE_SILENT = 2; + /** Ways to change the volume of silence. */ + @Documented + @Retention(RetentionPolicy.SOURCE) + @Target(TYPE_USE) + @IntDef({ + FADE_OUT, + MUTE, + FADE_IN, + DO_NOT_CHANGE_VOLUME, + }) + private @interface VolumeChangeType {} - private final long minimumSilenceDurationUs; - private final long paddingSilenceUs; - private final short silenceThresholdLevel; - private int bytesPerFrame; - private boolean enabled; + private static final int FADE_OUT = 0; + private static final int MUTE = 1; + private static final int FADE_IN = 2; + private static final int DO_NOT_CHANGE_VOLUME = 3; /** - * Buffers audio data that may be classified as silence while in {@link #STATE_MAYBE_SILENT}. If - * the input becomes noisy before the buffer has filled, it will be output. Otherwise, the buffer - * contents will be dropped and the state will transition to {@link #STATE_SILENT}. + * Used with {@code minVolumeToKeepPercentageWhenMuting} to avoid round off errors. An alternative + * to this would be to use floats, but integer math is probably faster than floats. + */ + private static final int AVOID_TRUNCATION_FACTOR = 1000; + + /** + * Fraction of the original silence to keep. Between [0, 1]. 1 means keep all silence. 0 means + * remove all silence. + */ + private final float silenceRetentionRatio; + + /** Absolute level below which an individual PCM sample is classified as silent. 
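   * The default of 1024 corresponds to roughly 3% of the full-scale 16-bit amplitude (32768).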
*/ + private final short silenceThresholdLevel; + + /** + * Volume percentage to keep. Even when modifying the volume to a mute state, it is ideal to + * decrease the volume instead of making the volume zero. Completely silent audio sounds like + * playback has stopped. While decreased volume sounds like very light background noise from a + * recording studio. + */ + private final int minVolumeToKeepPercentageWhenMuting; + + /** + * Duration of audio that must be below {@link #silenceThresholdLevel} before silence starts being + * trimmed. Specified in microseconds. + */ + private final long minimumSilenceDurationUs; + + /** + * Maximum silence to keep in microseconds. This maximum is applied after {@link + * #silenceRetentionRatio}. + */ + private final long maxSilenceToKeepDurationUs; + + private AudioFormat inputFormat; + private int bytesPerFrame; + private boolean enabled; + private @State int state; + private long skippedFrames; + + /** + * The frames of silence that has been output since the last noise. Used to enforce {@link + * #maxSilenceToKeepDurationUs}. + */ + private int outputSilenceFramesSinceNoise = 0; + + /** + * Buffers audio data that may be classified as silence while in {@link + * #STATE_SHORTENING_SILENCE}. If the input becomes noisy before the buffer has filled, it will be + * output without shortening. Otherwise, the buffer will be output when filled as shortened + * silence and emptied. */ private byte[] maybeSilenceBuffer; /** - * Stores the latest part of the input while silent. It will be output as padding if the next - * input is noisy. + * An index into {@link #maybeSilenceBuffer} pointing to the location where silence that has not + * been output starts. */ - private byte[] paddingBuffer; + private int maybeSilenceBufferStartIndex = 0; - private @State int state; - private int maybeSilenceBufferSize; - private int paddingSize; - private boolean hasOutputNoise; - private long skippedFrames; + /** + * A count of the number of bytes of content in {@link #maybeSilenceBuffer}. The count starts at + * {@link #maybeSilenceBufferStartIndex}, and the bytes counted may wrap around to the start of + * the buffer. The count will never be greater than {@link #maybeSilenceBuffer}'s length. + */ + private int maybeSilenceBufferContentsSize = 0; + + /** Used to hold a subset of the contents of {@link #maybeSilenceBuffer} for convenience. */ + // TODO: This processor can probably be more efficient if this array is not used. Operations like + // modifyVolume() can be applied to a non-contiguous contents, the code is just more complex. + private byte[] contiguousOutputBuffer; /** Creates a new silence skipping audio processor. */ public SilenceSkippingAudioProcessor() { this( DEFAULT_MINIMUM_SILENCE_DURATION_US, - DEFAULT_PADDING_SILENCE_US, + DEFAULT_SILENCE_RETENTION_RATIO, + DEFAULT_MAX_SILENCE_TO_KEEP_DURATION_US, + DEFAULT_MIN_VOLUME_TO_KEEP_PERCENTAGE, DEFAULT_SILENCE_THRESHOLD_LEVEL); } /** - * Creates a new silence skipping audio processor. - * - * @param minimumSilenceDurationUs The minimum duration of audio that must be below {@code - * silenceThresholdLevel} to classify that part of audio as silent, in microseconds. - * @param paddingSilenceUs The duration of silence by which to extend non-silent sections, in - * microseconds. The value must not exceed {@code minimumSilenceDurationUs}. - * @param silenceThresholdLevel The absolute level below which an individual PCM sample is - * classified as silent. 
+ * @deprecated Use {@link #SilenceSkippingAudioProcessor(long, float, long, int, short)} instead. */ + @Deprecated public SilenceSkippingAudioProcessor( long minimumSilenceDurationUs, long paddingSilenceUs, short silenceThresholdLevel) { - Assertions.checkArgument(paddingSilenceUs <= minimumSilenceDurationUs); - this.minimumSilenceDurationUs = minimumSilenceDurationUs; - this.paddingSilenceUs = paddingSilenceUs; - this.silenceThresholdLevel = silenceThresholdLevel; - - maybeSilenceBuffer = Util.EMPTY_BYTE_ARRAY; - paddingBuffer = Util.EMPTY_BYTE_ARRAY; + this( + minimumSilenceDurationUs, + /* silenceRetentionRatio= */ (float) paddingSilenceUs / minimumSilenceDurationUs, + /* maxSilenceToKeepDurationUs= */ minimumSilenceDurationUs, + /* minVolumeToKeepPercentageWhenMuting= */ 0, + silenceThresholdLevel); } /** - * Sets whether to skip silence in the input. This method may only be called after draining data - * through the processor. The value returned by {@link #isActive()} may change, and the processor - * must be {@link #flush() flushed} before queueing more data. + * Creates a new silence trimming audio processor. * - * @param enabled Whether to skip silence in the input. + * @param minimumSilenceDurationUs Duration of audio that must be below {@code + * silenceThresholdLevel} before silence starts being trimmed, in microseconds. + * @param silenceRetentionRatio Fraction of the original silence to keep. Between [0, 1]. 1 means + * keep all silence. 0 means remove all silence. + * @param maxSilenceToKeepDurationUs Maximum silence to keep in microseconds. This maximum is + * applied after {@link #silenceRetentionRatio}. + * @param minVolumeToKeepPercentageWhenMuting Volume percentage to keep. Even when modifying the + * volume to a mute state, it is ideal to decrease the volume instead of making the volume + * zero. Completely silent audio sounds like playback has stopped. While decreased volume + * sounds like very light background noise from a recording studio. + * @param silenceThresholdLevel Absolute level below which an individual PCM sample is classified + * as silent. + */ + public SilenceSkippingAudioProcessor( + long minimumSilenceDurationUs, + float silenceRetentionRatio, + long maxSilenceToKeepDurationUs, + int minVolumeToKeepPercentageWhenMuting, + short silenceThresholdLevel) { + checkArgument(silenceRetentionRatio >= 0f && silenceRetentionRatio <= 1f); + this.minimumSilenceDurationUs = minimumSilenceDurationUs; + this.silenceRetentionRatio = silenceRetentionRatio; + this.maxSilenceToKeepDurationUs = maxSilenceToKeepDurationUs; + this.minVolumeToKeepPercentageWhenMuting = minVolumeToKeepPercentageWhenMuting; + this.silenceThresholdLevel = silenceThresholdLevel; + inputFormat = AudioFormat.NOT_SET; + maybeSilenceBuffer = Util.EMPTY_BYTE_ARRAY; + contiguousOutputBuffer = Util.EMPTY_BYTE_ARRAY; + } + + /** + * Sets whether to shorten silence in the input. This method may only be called after draining + * data through the processor. The value returned by {@link #isActive()} may change, and the + * processor must be {@link #flush() flushed} before queueing more data. + * + * @param enabled Whether to shorten silence in the input. */ public void setEnabled(boolean enabled) { this.enabled = enabled; @@ -149,20 +260,20 @@ public final class SilenceSkippingAudioProcessor extends BaseAudioProcessor { return skippedFrames; } - // AudioProcessor implementation. 
- @Override - public AudioFormat onConfigure(AudioFormat inputAudioFormat) + protected AudioFormat onConfigure(AudioFormat inputAudioFormat) throws UnhandledAudioFormatException { if (inputAudioFormat.encoding != C.ENCODING_PCM_16BIT) { throw new UnhandledAudioFormatException(inputAudioFormat); } - return enabled ? inputAudioFormat : AudioFormat.NOT_SET; + this.inputFormat = inputAudioFormat; + bytesPerFrame = inputAudioFormat.channelCount * 2; + return inputAudioFormat; } @Override public boolean isActive() { - return enabled; + return inputFormat.sampleRate != Format.NO_VALUE && enabled; } @Override @@ -172,11 +283,8 @@ public final class SilenceSkippingAudioProcessor extends BaseAudioProcessor { case STATE_NOISY: processNoisy(inputBuffer); break; - case STATE_MAYBE_SILENT: - processMaybeSilence(inputBuffer); - break; - case STATE_SILENT: - processSilence(inputBuffer); + case STATE_SHORTENING_SILENCE: + shortenSilenceSilenceUntilNoise(inputBuffer); break; default: throw new IllegalStateException(); @@ -185,48 +293,43 @@ public final class SilenceSkippingAudioProcessor extends BaseAudioProcessor { } @Override - protected void onQueueEndOfStream() { - if (maybeSilenceBufferSize > 0) { - // We haven't received enough silence to transition to the silent state, so output the buffer - // and switch back to the noisy state. - output(maybeSilenceBuffer, maybeSilenceBufferSize); - maybeSilenceBufferSize = 0; - state = STATE_NOISY; - } - if (!hasOutputNoise) { - skippedFrames += paddingSize / bytesPerFrame; + public void onQueueEndOfStream() { + // The maybeSilenceBuffer is only written to in the STATE_SHORTENING_SILENCE state, and + // is always completely flushed before leaving the STATE_SHORTENING_SILENCE. + if (maybeSilenceBufferContentsSize > 0) { + // There's bytes in the buffer. So the final chunk of shortened silence will be output to + // simulate a transition back to the noisy state and the end of output. + outputShortenedSilenceBuffer(/* shouldTransitionToNoisyState= */ true); + outputSilenceFramesSinceNoise = 0; } } @Override - protected void onFlush() { - if (enabled) { - bytesPerFrame = inputAudioFormat.bytesPerFrame; - int maybeSilenceBufferSize = durationUsToFrames(minimumSilenceDurationUs) * bytesPerFrame; + public void onFlush() { + if (isActive()) { + // Divide by 2 to allow the buffer to be split into two bytesPerFrame aligned parts. + int maybeSilenceBufferSize = + alignToBytePerFrameBoundary(durationUsToFrames(minimumSilenceDurationUs) / 2) * 2; if (maybeSilenceBuffer.length != maybeSilenceBufferSize) { maybeSilenceBuffer = new byte[maybeSilenceBufferSize]; - } - paddingSize = durationUsToFrames(paddingSilenceUs) * bytesPerFrame; - if (paddingBuffer.length != paddingSize) { - paddingBuffer = new byte[paddingSize]; + contiguousOutputBuffer = new byte[maybeSilenceBufferSize]; } } state = STATE_NOISY; skippedFrames = 0; - maybeSilenceBufferSize = 0; - hasOutputNoise = false; + outputSilenceFramesSinceNoise = 0; + maybeSilenceBufferStartIndex = 0; + maybeSilenceBufferContentsSize = 0; } @Override - protected void onReset() { + public void onReset() { enabled = false; - paddingSize = 0; + inputFormat = AudioFormat.NOT_SET; maybeSilenceBuffer = Util.EMPTY_BYTE_ARRAY; - paddingBuffer = Util.EMPTY_BYTE_ARRAY; + contiguousOutputBuffer = Util.EMPTY_BYTE_ARRAY; } - // Internal methods. - /** * Incrementally processes new input from {@code inputBuffer} while in {@link #STATE_NOISY}, * updating the state if needed. 
@@ -239,9 +342,9 @@ public final class SilenceSkippingAudioProcessor extends BaseAudioProcessor { int noiseLimit = findNoiseLimit(inputBuffer); if (noiseLimit == inputBuffer.position()) { // The buffer contains the start of possible silence. - state = STATE_MAYBE_SILENT; + state = STATE_SHORTENING_SILENCE; } else { - inputBuffer.limit(noiseLimit); + inputBuffer.limit(min(noiseLimit, inputBuffer.capacity())); output(inputBuffer); } @@ -251,72 +354,349 @@ public final class SilenceSkippingAudioProcessor extends BaseAudioProcessor { /** * Incrementally processes new input from {@code inputBuffer} while in {@link - * #STATE_MAYBE_SILENT}, updating the state if needed. + * #STATE_SHORTENING_SILENCE}, updating the state if needed. + * + *

If the amount of silence is less than {@link #minimumSilenceDurationUs}, then {@link + * #DO_NOT_CHANGE_VOLUME} is used to output the silence. + * + *
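   * <p>For example, with the defaults a 30 ms pause between words is shorter than {@link
   * #minimumSilenceDurationUs} and is therefore output unmodified, as exercised by {@code
   * skipInNoisySignalWithShortSilences_skipsNothing} in the unit test.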

If the amount of silence is more than {@link #minimumSilenceDurationUs}, then the following + * will be output: + * + *
   * <ul>
   *   <li>{@link #FADE_OUT} over the first half of {@link #maybeSilenceBuffer}.
   *   <li>{@link #MUTE} over the middle of the silence, shortened according to {@link
   *       #calculateShortenedSilenceLength(int)}.
   *   <li>{@link #FADE_IN} over the retained silence leading back into the noise.
   * </ul>
   *
Transitions to {@link #STATE_NOISY} if noise is encountered. It writes to {@link + * #maybeSilenceBuffer} in contiguous blocks. If the silence available is enough to wrap around + * the end of the buffer then the buffer is filled from {@link #maybeSilenceBufferStartIndex} to + * the buffers end and the beginning of the buffer is filled upon the next call to this method. */ - private void processMaybeSilence(ByteBuffer inputBuffer) { + private void shortenSilenceSilenceUntilNoise(ByteBuffer inputBuffer) { + checkState(maybeSilenceBufferStartIndex < maybeSilenceBuffer.length); + int limit = inputBuffer.limit(); int noisePosition = findNoisePosition(inputBuffer); - int maybeSilenceInputSize = noisePosition - inputBuffer.position(); - int maybeSilenceBufferRemaining = maybeSilenceBuffer.length - maybeSilenceBufferSize; - if (noisePosition < limit && maybeSilenceInputSize < maybeSilenceBufferRemaining) { - // The maybe silence buffer isn't full, so output it and switch back to the noisy state. - output(maybeSilenceBuffer, maybeSilenceBufferSize); - maybeSilenceBufferSize = 0; - state = STATE_NOISY; + int silenceInputSize = noisePosition - inputBuffer.position(); + + int indexToWriteTo; + int contiguousBufferRemaining; + if (maybeSilenceBufferStartIndex + maybeSilenceBufferContentsSize < maybeSilenceBuffer.length) { + // ^0---^start---^end---^length + contiguousBufferRemaining = + maybeSilenceBuffer.length + - (maybeSilenceBufferContentsSize + maybeSilenceBufferStartIndex); + indexToWriteTo = maybeSilenceBufferStartIndex + maybeSilenceBufferContentsSize; } else { - // Fill as much of the maybe silence buffer as possible. - int bytesToWrite = min(maybeSilenceInputSize, maybeSilenceBufferRemaining); - inputBuffer.limit(inputBuffer.position() + bytesToWrite); - inputBuffer.get(maybeSilenceBuffer, maybeSilenceBufferSize, bytesToWrite); - maybeSilenceBufferSize += bytesToWrite; - if (maybeSilenceBufferSize == maybeSilenceBuffer.length) { - // We've reached a period of silence, so skip it, taking in to account padding for both - // the noisy to silent transition and any future silent to noisy transition. - if (hasOutputNoise) { - output(maybeSilenceBuffer, paddingSize); - skippedFrames += (maybeSilenceBufferSize - paddingSize * 2) / bytesPerFrame; + // The bytes have wrapped around. ^0---^end---^start---^length + int amountInUpperPartOfBuffer = maybeSilenceBuffer.length - maybeSilenceBufferStartIndex; + indexToWriteTo = maybeSilenceBufferContentsSize - amountInUpperPartOfBuffer; + contiguousBufferRemaining = maybeSilenceBufferStartIndex - indexToWriteTo; + } + + boolean noiseFound = noisePosition < limit; + // Fill as much of the silence buffer as possible. + int bytesOfInput = min(silenceInputSize, contiguousBufferRemaining); + inputBuffer.limit(inputBuffer.position() + bytesOfInput); + inputBuffer.get(maybeSilenceBuffer, indexToWriteTo, bytesOfInput); + maybeSilenceBufferContentsSize += bytesOfInput; + + checkState(maybeSilenceBufferContentsSize <= maybeSilenceBuffer.length); + + boolean shouldTransitionToNoisyState = + noiseFound + && + /* The silence before the noise is not enough to fill the remaining buffer. */ + silenceInputSize < contiguousBufferRemaining; + + outputShortenedSilenceBuffer(shouldTransitionToNoisyState); + + if (shouldTransitionToNoisyState) { + state = STATE_NOISY; + outputSilenceFramesSinceNoise = 0; + } + + // Restore the limit. + inputBuffer.limit(limit); + } + + /** See {@link #shortenSilenceSilenceUntilNoise}. 
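   * <p>Depending on where the buffered silence sits within the silent run, it is emitted with
   * {@link #DO_NOT_CHANGE_VOLUME} (silence that ends before the buffer fills), {@link #FADE_OUT}
   * (the first half of the buffer at the start of a long silence), {@link #MUTE} (the shortened
   * middle section), or {@link #FADE_IN} (the transition back to noise).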
*/ + private void outputShortenedSilenceBuffer(boolean shouldTransitionToNoisyState) { + int sizeBeforeOutput = maybeSilenceBufferContentsSize; + int bytesToOutput; + @VolumeChangeType int volumeChangeType; + int bytesConsumed; + // Only output when buffer is full or transitioning to noisy state. + if (maybeSilenceBufferContentsSize == maybeSilenceBuffer.length + || shouldTransitionToNoisyState) { + if (outputSilenceFramesSinceNoise == 0) { + // This is the beginning of a silence chunk so keep MINIMUM_SILENCE_DURATION_US / 2 of the + // silence. + if (shouldTransitionToNoisyState) { + volumeChangeType = DO_NOT_CHANGE_VOLUME; + bytesToOutput = maybeSilenceBufferContentsSize; + outputSilence(bytesToOutput, volumeChangeType); + bytesConsumed = bytesToOutput; } else { - skippedFrames += (maybeSilenceBufferSize - paddingSize) / bytesPerFrame; + checkState(maybeSilenceBufferContentsSize >= maybeSilenceBuffer.length / 2); + // To keep this block a tad simpler, by always outputting exactly buffer size / 2 to avoid + // needing to add the shortening code here. + volumeChangeType = FADE_OUT; + bytesToOutput = maybeSilenceBuffer.length / 2; + outputSilence(bytesToOutput, volumeChangeType); + bytesConsumed = bytesToOutput; } - updatePaddingBuffer(inputBuffer, maybeSilenceBuffer, maybeSilenceBufferSize); - maybeSilenceBufferSize = 0; - state = STATE_SILENT; + } else if (shouldTransitionToNoisyState) { + volumeChangeType = FADE_IN; + + int bytesRemainingAfterOutputtingHalfMin = + maybeSilenceBufferContentsSize - maybeSilenceBuffer.length / 2; + + bytesConsumed = bytesRemainingAfterOutputtingHalfMin + maybeSilenceBuffer.length / 2; + int shortenedSilenceLength = + calculateShortenedSilenceLength(bytesRemainingAfterOutputtingHalfMin); + + // For simplicity we fade in over the shortened silence and the half buffer of padding. + // This acts to increase the padding a bit which only helps (probably imperceptibly) + // the sound quality. + bytesToOutput = maybeSilenceBuffer.length / 2 + shortenedSilenceLength; + outputSilence(bytesToOutput, volumeChangeType); + } else { + volumeChangeType = MUTE; + // Output as much as possible while still keeping half the buffer full so that half the + // min silence can be output later as padding. + bytesConsumed = maybeSilenceBufferContentsSize - maybeSilenceBuffer.length / 2; + + bytesToOutput = calculateShortenedSilenceLength(bytesConsumed); + outputSilence(bytesToOutput, volumeChangeType); } - // Restore the limit. - inputBuffer.limit(limit); + checkState( + bytesConsumed % bytesPerFrame == 0, + "bytesConsumed is not aligned to frame size: %s" + bytesConsumed); + + checkState((sizeBeforeOutput >= bytesToOutput)); + + maybeSilenceBufferContentsSize -= bytesConsumed; + maybeSilenceBufferStartIndex += bytesConsumed; + // The start index might wrap back around to the start of the buffer. + maybeSilenceBufferStartIndex %= maybeSilenceBuffer.length; + + outputSilenceFramesSinceNoise += bytesToOutput / bytesPerFrame; + skippedFrames += (bytesConsumed - bytesToOutput) / bytesPerFrame; } } /** - * Incrementally processes new input from {@code inputBuffer} while in {@link #STATE_SILENT}, - * updating the state if needed. + * Returns the appropriate size that a given number of bytes of silence should be shortened to. It + * calculates this using the {@link #outputSilenceFramesSinceNoise} and the {@link + * #silenceRetentionRatio}. 
The {@link #silenceRetentionRatio} multiplied by {@code + * silenceToShortenBytes} is returned until a max outputted silence length is hit, and then only + * the remaining silence between the current {@link #outputSilenceFramesSinceNoise} and {@link + * #maxSilenceToKeepDurationUs} is reached. */ - private void processSilence(ByteBuffer inputBuffer) { - int limit = inputBuffer.limit(); - int noisyPosition = findNoisePosition(inputBuffer); - inputBuffer.limit(noisyPosition); - skippedFrames += inputBuffer.remaining() / bytesPerFrame; - updatePaddingBuffer(inputBuffer, paddingBuffer, paddingSize); - if (noisyPosition < limit) { - // Output the padding, which may include previous input as well as new input, then transition - // back to the noisy state. - output(paddingBuffer, paddingSize); - state = STATE_NOISY; + private int calculateShortenedSilenceLength(int silenceToShortenBytes) { + // Start skipping silence to keep the silence below MAX_SILENCE_DURATION_US long. + int bytesNeededToReachMax = + (durationUsToFrames(maxSilenceToKeepDurationUs) - outputSilenceFramesSinceNoise) + * bytesPerFrame + - maybeSilenceBuffer.length / 2; - // Restore the limit. - inputBuffer.limit(limit); - } + checkState(bytesNeededToReachMax >= 0); + + return alignToBytePerFrameBoundary( + min(silenceToShortenBytes * silenceRetentionRatio + .5f, bytesNeededToReachMax)); } /** - * Copies {@code length} elements from {@code data} to populate a new output buffer from the - * processor. + * Method used to avoid rounding errors while calculating output and skipped frames. The given + * {@code value} is decreased to the nearest value that is divisible by {@link #bytesPerFrame}. */ - private void output(byte[] data, int length) { - replaceOutputBuffer(length).put(data, 0, length).flip(); - if (length > 0) { - hasOutputNoise = true; + private int alignToBytePerFrameBoundary(int value) { + return (value / bytesPerFrame) * bytesPerFrame; + } + + /** + * Method used to avoid rounding errors while calculating output and skipped frames. The given + * {@code value} is decreased to the nearest value that is divisible by {@link #bytesPerFrame}. + */ + private int alignToBytePerFrameBoundary(float value) { + return alignToBytePerFrameBoundary((int) value); + } + + /** Copies elements from {@code data} to populate a new output buffer from the processor. */ + private void outputRange(byte[] data, int size, @VolumeChangeType int rampType) { + checkArgument( + size % bytesPerFrame == 0, "byteOutput size is not aligned to frame size " + size); + + modifyVolume(data, size, rampType); + replaceOutputBuffer(size).put(data, 0, size).flip(); + } + + /** + * Copies {@code sizeToOutput} elements from the {@link #maybeSilenceBuffer} to {@link + * #contiguousOutputBuffer}. The contents of {@link #maybeSilenceBuffer} can wrap around from the + * end of the buffer and back to the beginning. The {@link #contiguousOutputBuffer} content always + * start from index 0. + * + * @param rampType This parameter is used to determine which part of the {@link + * #maybeSilenceBuffer} contents need to be kept. For {@link #FADE_IN} the end of the contents + * is always kept. Otherwise the beginning of the contents are always kept. + */ + private void outputSilence(int sizeToOutput, @VolumeChangeType int rampType) { + if (sizeToOutput == 0) { + return; + } + + checkArgument(maybeSilenceBufferContentsSize >= sizeToOutput); + + if (rampType == FADE_IN) { + // Keeps the end of the buffer because we are padding the start of the next chunk of noise. 
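      // Example (mono 16-bit, so bytesPerFrame == 2): with maybeSilenceBuffer.length == 12,
      // maybeSilenceBufferStartIndex == 8 and maybeSilenceBufferContentsSize == 6, the contents
      // wrap around and occupy indices 8..11 and 0..1. A FADE_IN output of sizeToOutput == 4 then
      // copies the last 4 content bytes (indices 10, 11, 0, 1) into contiguousOutputBuffer.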
+ if (maybeSilenceBufferStartIndex + maybeSilenceBufferContentsSize + <= maybeSilenceBuffer.length) { + // ^0---^start---^end---^length + System.arraycopy( + maybeSilenceBuffer, + maybeSilenceBufferStartIndex + maybeSilenceBufferContentsSize - sizeToOutput, + contiguousOutputBuffer, + 0, + sizeToOutput); + } else { + // ^0---^end--^start---^length + int sizeInUpperPartOfArray = maybeSilenceBuffer.length - maybeSilenceBufferStartIndex; + int sizeInLowerPartOfArray = maybeSilenceBufferContentsSize - sizeInUpperPartOfArray; + if (sizeInLowerPartOfArray >= sizeToOutput) { + // We just need the lower part of the array. + System.arraycopy( + maybeSilenceBuffer, + sizeInLowerPartOfArray - sizeToOutput, + contiguousOutputBuffer, + 0, + sizeToOutput); + } else { + int sizeToOutputInUpperPart = sizeToOutput - sizeInLowerPartOfArray; + System.arraycopy( + maybeSilenceBuffer, + maybeSilenceBuffer.length - sizeToOutputInUpperPart, + contiguousOutputBuffer, + 0, + sizeToOutputInUpperPart); + + // Copy everything from lower part. DO_NOT_CHANGE_VOLUME (which keeps everything) and + // MUTE (where the content that is kept only provides background noise). + System.arraycopy( + maybeSilenceBuffer, + 0, + contiguousOutputBuffer, + sizeToOutputInUpperPart, + sizeInLowerPartOfArray); + } + } + } else { + if (maybeSilenceBufferStartIndex + sizeToOutput <= maybeSilenceBuffer.length) { + // ^0---^start---^end---^length + System.arraycopy( + maybeSilenceBuffer, + maybeSilenceBufferStartIndex, + contiguousOutputBuffer, + 0, + sizeToOutput); + } else { + // ^0---^end (of content to output now)---^start---^length + int sizeToCopyInUpperPartOfArray = maybeSilenceBuffer.length - maybeSilenceBufferStartIndex; + // Copy the upper part of the array. + System.arraycopy( + maybeSilenceBuffer, + maybeSilenceBufferStartIndex, + contiguousOutputBuffer, + 0, + sizeToCopyInUpperPartOfArray); + int amountToCopyFromLowerPartOfArray = sizeToOutput - sizeToCopyInUpperPartOfArray; + System.arraycopy( + maybeSilenceBuffer, + 0, + contiguousOutputBuffer, + sizeToCopyInUpperPartOfArray, + amountToCopyFromLowerPartOfArray); + } + } + + checkArgument( + sizeToOutput % bytesPerFrame == 0, + "sizeToOutput is not aligned to frame size: " + sizeToOutput); + checkState(maybeSilenceBufferStartIndex < maybeSilenceBuffer.length); + + outputRange(contiguousOutputBuffer, sizeToOutput, rampType); + } + + /** + * Modifies the amplitude of the samples in {@code sampleBuffer} based on the given {@link + * VolumeChangeType}. 
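   * <p>{@link #FADE_OUT} scales the samples approximately linearly from 100% down to {@link
   * #minVolumeToKeepPercentageWhenMuting}, {@link #FADE_IN} ramps back up from {@link
   * #minVolumeToKeepPercentageWhenMuting} to 100%, {@link #MUTE} applies the constant {@link
   * #minVolumeToKeepPercentageWhenMuting}, and {@link #DO_NOT_CHANGE_VOLUME} leaves the samples
   * untouched. With the default of 10, a muted stretch therefore keeps 10% of the original
   * amplitude as low-level background noise.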
+ */ + private void modifyVolume(byte[] sampleBuffer, int size, @VolumeChangeType int volumeChangeType) { + if (volumeChangeType == DO_NOT_CHANGE_VOLUME) { + return; + } + + for (int idx = 0; idx < size; idx += 2) { + byte mostSignificantByte = sampleBuffer[idx + 1]; + byte leastSignificantByte = sampleBuffer[idx]; + int sample = twoByteSampleToInt(mostSignificantByte, leastSignificantByte); + + int volumeModificationPercentage; + if (volumeChangeType == FADE_OUT) { + volumeModificationPercentage = + calculateFadeOutPercentage(/* value= */ idx, /* max= */ size - 1); + } else if (volumeChangeType == FADE_IN) { + volumeModificationPercentage = + calculateFadeInPercentage(/* value= */ idx, /* max= */ size - 1); + } else { + volumeModificationPercentage = minVolumeToKeepPercentageWhenMuting; + } + + sample = (sample * volumeModificationPercentage) / 100; + sampleIntToTwoBigEndianBytes(sampleBuffer, idx, sample); + } + } + + private int calculateFadeOutPercentage(int value, int max) { + return ((minVolumeToKeepPercentageWhenMuting - 100) * ((AVOID_TRUNCATION_FACTOR * value) / max)) + / AVOID_TRUNCATION_FACTOR + + 100; + } + + private int calculateFadeInPercentage(int value, int max) { + return (minVolumeToKeepPercentageWhenMuting + + ((100 - minVolumeToKeepPercentageWhenMuting) * (AVOID_TRUNCATION_FACTOR * value) / max) + / AVOID_TRUNCATION_FACTOR); + } + + private static int twoByteSampleToInt(byte mostSignificantByte, byte leastSignificantByte) { + return ((leastSignificantByte & 0xFF) | mostSignificantByte << 8); + } + + /** + * Converts {@code sample} into the corresponding big-endian 16bit bytes within {@code byteArray}. + */ + private static void sampleIntToTwoBigEndianBytes(byte[] byteArray, int startIndex, int sample) { + // Avoid 16-bit-integer overflow when writing back the manipulated data. + if (sample >= Short.MAX_VALUE) { + byteArray[startIndex] = (byte) 0xFF; + byteArray[startIndex + 1] = (byte) 0x7F; + } else if (sample <= Short.MIN_VALUE) { + byteArray[startIndex] = (byte) 0x00; + byteArray[startIndex + 1] = (byte) 0x80; + } else { + byteArray[startIndex] = (byte) (sample & 0xFF); + byteArray[startIndex + 1] = (byte) (sample >> 8); } } @@ -324,36 +704,14 @@ public final class SilenceSkippingAudioProcessor extends BaseAudioProcessor { * Copies remaining bytes from {@code data} to populate a new output buffer from the processor. */ private void output(ByteBuffer data) { - int length = data.remaining(); - replaceOutputBuffer(length).put(data).flip(); - if (length > 0) { - hasOutputNoise = true; - } - } - - /** - * Fills {@link #paddingBuffer} using data from {@code input}, plus any additional buffered data - * at the end of {@code buffer} (up to its {@code size}) required to fill it, advancing the input - * position. - */ - private void updatePaddingBuffer(ByteBuffer input, byte[] buffer, int size) { - int fromInputSize = min(input.remaining(), paddingSize); - int fromBufferSize = paddingSize - fromInputSize; - System.arraycopy( - /* src= */ buffer, - /* srcPos= */ size - fromBufferSize, - /* dest= */ paddingBuffer, - /* destPos= */ 0, - /* length= */ fromBufferSize); - input.position(input.limit() - fromInputSize); - input.get(paddingBuffer, fromBufferSize, fromInputSize); + replaceOutputBuffer(data.remaining()).put(data).flip(); } /** * Returns the number of input frames corresponding to {@code durationUs} microseconds of audio. 
*/ private int durationUsToFrames(long durationUs) { - return (int) ((durationUs * inputAudioFormat.sampleRate) / C.MICROS_PER_SECOND); + return (int) ((durationUs * inputFormat.sampleRate) / C.MICROS_PER_SECOND); } /** @@ -362,8 +720,8 @@ public final class SilenceSkippingAudioProcessor extends BaseAudioProcessor { */ private int findNoisePosition(ByteBuffer buffer) { // The input is in ByteOrder.nativeOrder(), which is little endian on Android. - for (int i = buffer.position(); i < buffer.limit(); i += 2) { - if (Math.abs(buffer.getShort(i)) > silenceThresholdLevel) { + for (int i = buffer.position() + 1; i < buffer.limit(); i += 2) { + if (isNoise(buffer.get(i), buffer.get(i - 1))) { // Round to the start of the frame. return bytesPerFrame * (i / bytesPerFrame); } @@ -377,12 +735,21 @@ public final class SilenceSkippingAudioProcessor extends BaseAudioProcessor { */ private int findNoiseLimit(ByteBuffer buffer) { // The input is in ByteOrder.nativeOrder(), which is little endian on Android. - for (int i = buffer.limit() - 2; i >= buffer.position(); i -= 2) { - if (Math.abs(buffer.getShort(i)) > silenceThresholdLevel) { + for (int i = buffer.limit() - 1; i >= buffer.position(); i -= 2) { + if (isNoise(buffer.get(i), buffer.get(i - 1))) { // Return the start of the next frame. return bytesPerFrame * (i / bytesPerFrame) + bytesPerFrame; } } return buffer.position(); } + + /** + * Whether the given two bytes represent a short signed PCM value that is greater than {@link + * #silenceThresholdLevel}. + */ + private boolean isNoise(byte mostSignificantByte, byte leastSignificantByte) { + return Math.abs(twoByteSampleToInt(mostSignificantByte, leastSignificantByte)) + > silenceThresholdLevel; + } } diff --git a/libraries/exoplayer/src/test/java/androidx/media3/exoplayer/audio/SilenceSkippingAudioProcessorTest.java b/libraries/exoplayer/src/test/java/androidx/media3/exoplayer/audio/SilenceSkippingAudioProcessorTest.java index bb8e87b8b2..6d16a5655f 100644 --- a/libraries/exoplayer/src/test/java/androidx/media3/exoplayer/audio/SilenceSkippingAudioProcessorTest.java +++ b/libraries/exoplayer/src/test/java/androidx/media3/exoplayer/audio/SilenceSkippingAudioProcessorTest.java @@ -23,6 +23,7 @@ import static java.lang.Short.MAX_VALUE; import androidx.media3.common.C; import androidx.media3.common.audio.AudioProcessor.AudioFormat; import androidx.test.ext.junit.runners.AndroidJUnit4; +import com.google.common.collect.Range; import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.nio.ShortBuffer; @@ -86,7 +87,7 @@ public final class SilenceSkippingAudioProcessorTest { @Test public void skipInSilentSignal_skipsEverything() throws Exception { - // Given a signal with only noise. + // Given a signal with only silence. InputBufferProvider inputBufferProvider = getInputBufferProviderForAlternatingSilenceAndNoise( TEST_SIGNAL_SILENCE_DURATION_MS, /* noiseDurationMs= */ 0, TEST_SIGNAL_FRAME_COUNT); @@ -99,14 +100,15 @@ public final class SilenceSkippingAudioProcessorTest { long totalOutputFrames = process(silenceSkippingAudioProcessor, inputBufferProvider, INPUT_BUFFER_SIZE); - // The entire signal is skipped. - assertThat(totalOutputFrames).isEqualTo(0); - assertThat(silenceSkippingAudioProcessor.getSkippedFrames()).isEqualTo(TEST_SIGNAL_FRAME_COUNT); + // The entire signal is skipped except for the DEFAULT_MAX_SILENCE_TO_KEEP_DURATION_US. 
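    // DEFAULT_MAX_SILENCE_TO_KEEP_DURATION_US is 2_000_000 us, which at the test signal's
    // 1000 frames per second corresponds to 2000 frames.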
+ assertThat(totalOutputFrames).isEqualTo(2000); + assertThat(silenceSkippingAudioProcessor.getSkippedFrames()) + .isEqualTo(TEST_SIGNAL_FRAME_COUNT - 2000); } @Test public void skipInNoisySignal_skipsNothing() throws Exception { - // Given a signal with only silence. + // Given a signal with only noise. InputBufferProvider inputBufferProvider = getInputBufferProviderForAlternatingSilenceAndNoise( /* silenceDurationMs= */ 0, TEST_SIGNAL_NOISE_DURATION_MS, TEST_SIGNAL_FRAME_COUNT); @@ -126,6 +128,30 @@ public final class SilenceSkippingAudioProcessorTest { assertThat(silenceSkippingAudioProcessor.getSkippedFrames()).isEqualTo(0); } + @Test + public void skipInNoisySignalWithShortSilences_skipsNothing() throws Exception { + // Given a signal with only noise. + InputBufferProvider inputBufferProvider = + getInputBufferProviderForAlternatingSilenceAndNoise( + /* silenceDurationMs= */ 30, + TEST_SIGNAL_NOISE_DURATION_MS - 30, + TEST_SIGNAL_FRAME_COUNT); + + // When processing the entire signal. + SilenceSkippingAudioProcessor silenceSkippingAudioProcessor = + new SilenceSkippingAudioProcessor(); + silenceSkippingAudioProcessor.setEnabled(true); + silenceSkippingAudioProcessor.configure(AUDIO_FORMAT); + silenceSkippingAudioProcessor.flush(); + assertThat(silenceSkippingAudioProcessor.isActive()).isTrue(); + long totalOutputFrames = + process(silenceSkippingAudioProcessor, inputBufferProvider, INPUT_BUFFER_SIZE); + + // None of the signal is skipped. + assertThat(totalOutputFrames).isEqualTo(TEST_SIGNAL_FRAME_COUNT); + assertThat(silenceSkippingAudioProcessor.getSkippedFrames()).isEqualTo(0); + } + @Test public void skipInAlternatingTestSignal_hasCorrectOutputAndSkippedFrameCounts() throws Exception { // Given a signal that alternates between silence and noise. @@ -145,10 +171,10 @@ public final class SilenceSkippingAudioProcessorTest { long totalOutputFrames = process(silenceSkippingAudioProcessor, inputBufferProvider, INPUT_BUFFER_SIZE); - // The output consists of 50000 frames of noise, plus 20 frames of padding at the start and 99 * - // 40 frames of padding after that. - assertThat(totalOutputFrames).isEqualTo(50000 + (20 + 99 * 40)); - assertThat(silenceSkippingAudioProcessor.getSkippedFrames()).isEqualTo(50000 - (20 + 99 * 40)); + // The output has 50000 frames of noise, plus 50 * 0.2 * 1000 padding (plus rounding errors). + assertThat(totalOutputFrames).isIn(Range.closed(60000L - 500L, 60000L + 500L)); + assertThat(silenceSkippingAudioProcessor.getSkippedFrames()) + .isEqualTo(TEST_SIGNAL_FRAME_COUNT - totalOutputFrames); } @Test @@ -171,10 +197,10 @@ public final class SilenceSkippingAudioProcessorTest { long totalOutputFrames = process(silenceSkippingAudioProcessor, inputBufferProvider, /* inputBufferSize= */ 80); - // The output consists of 50000 frames of noise, plus 20 frames of padding at the start and 99 * - // 40 frames of padding after that. - assertThat(totalOutputFrames).isEqualTo(50000 + (20 + 99 * 40)); - assertThat(silenceSkippingAudioProcessor.getSkippedFrames()).isEqualTo(50000 - (20 + 99 * 40)); + // The output has 50000 frames of noise, plus 50 * 0.2 * 1000 padding (plus rounding errors). 
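    // That is, 50 one-second silences of 1000 frames each, retained at the default ratio of 0.2,
    // contribute roughly 10000 frames on top of the 50000 noise frames, giving the ~60000 below.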
+ assertThat(totalOutputFrames).isIn(Range.closed(60000L - 500L, 60000L + 500L)); + assertThat(silenceSkippingAudioProcessor.getSkippedFrames()) + .isEqualTo(TEST_SIGNAL_FRAME_COUNT - totalOutputFrames); } @Test @@ -197,14 +223,14 @@ public final class SilenceSkippingAudioProcessorTest { long totalOutputFrames = process(silenceSkippingAudioProcessor, inputBufferProvider, /* inputBufferSize= */ 120); - // The output consists of 50000 frames of noise, plus 20 frames of padding at the start and 99 * - // 40 frames of padding after that. - assertThat(totalOutputFrames).isEqualTo(50000 + (20 + 99 * 40)); - assertThat(silenceSkippingAudioProcessor.getSkippedFrames()).isEqualTo(50000 - (20 + 99 * 40)); + // The output has 50000 frames of noise, plus 50 * 0.2 * 1000 padding (plus rounding errors). + assertThat(totalOutputFrames).isIn(Range.closed(60000L - 500L, 60000L + 500L)); + assertThat(silenceSkippingAudioProcessor.getSkippedFrames()) + .isEqualTo(TEST_SIGNAL_FRAME_COUNT - totalOutputFrames); } @Test - public void customPaddingValue_hasCorrectOutputAndSkippedFrameCounts() throws Exception { + public void customSilenceRetentionValue_hasCorrectOutputAndSkippedFrameCounts() throws Exception { // Given a signal that alternates between silence and noise. InputBufferProvider inputBufferProvider = getInputBufferProviderForAlternatingSilenceAndNoise( @@ -212,11 +238,13 @@ public final class SilenceSkippingAudioProcessorTest { TEST_SIGNAL_NOISE_DURATION_MS, TEST_SIGNAL_FRAME_COUNT); - // When processing the entire signal with a larger than normal padding silence. + // When processing the entire signal with a smaller than normal retention ratio. SilenceSkippingAudioProcessor silenceSkippingAudioProcessor = new SilenceSkippingAudioProcessor( SilenceSkippingAudioProcessor.DEFAULT_MINIMUM_SILENCE_DURATION_US, - /* paddingSilenceUs= */ 21_000, + /* silenceRetentionRatio= */ 0.05f, + SilenceSkippingAudioProcessor.DEFAULT_MAX_SILENCE_TO_KEEP_DURATION_US, + SilenceSkippingAudioProcessor.DEFAULT_MIN_VOLUME_TO_KEEP_PERCENTAGE, SilenceSkippingAudioProcessor.DEFAULT_SILENCE_THRESHOLD_LEVEL); silenceSkippingAudioProcessor.setEnabled(true); silenceSkippingAudioProcessor.configure(AUDIO_FORMAT); @@ -225,10 +253,10 @@ public final class SilenceSkippingAudioProcessorTest { long totalOutputFrames = process(silenceSkippingAudioProcessor, inputBufferProvider, /* inputBufferSize= */ 120); - // The output consists of 50000 frames of noise, plus 21 frames of padding at the start and 99 * - // 42 frames of padding after that. - assertThat(totalOutputFrames).isEqualTo(50000 + (21 + 99 * 42)); - assertThat(silenceSkippingAudioProcessor.getSkippedFrames()).isEqualTo(50000 - (21 + 99 * 42)); + // The output has 50000 frames of noise, plus 50 * 0.05 * 1000 padding (plus rounding errors). + assertThat(totalOutputFrames).isIn(Range.closed(52500L - 500L, 52500L + 500L)); + assertThat(silenceSkippingAudioProcessor.getSkippedFrames()) + .isEqualTo(TEST_SIGNAL_FRAME_COUNT - totalOutputFrames); } @Test