From dfe47219f91196a4c77b5af3485ee9affced83ce Mon Sep 17 00:00:00 2001 From: tonihei Date: Mon, 29 Jan 2024 02:29:15 -0800 Subject: [PATCH] Update skip silence algorithm The updated algorithm has two main improvements: - The silence padding is not constant but a ratio of the original silence (up to a defined max) to more naturally represent the original gaps. - The silence is not instantly going to zero, but uses a ramp down and up for a smooth volume transition and also retains a small percentage of the original noise for more natural "silences" that still contain some background noise. #minor-release Issue: google/ExoPlayer#7423 PiperOrigin-RevId: 602322442 (cherry picked from commit bb533332f4b6f46ac9d5ca17cb7943bd1bdb7dd2) --- RELEASENOTES.md | 7 + .../audio/SilenceSkippingAudioProcessor.java | 699 +++++++++++++----- .../SilenceSkippingAudioProcessorTest.java | 76 +- 3 files changed, 592 insertions(+), 190 deletions(-) diff --git a/RELEASENOTES.md b/RELEASENOTES.md index bf9e9216a4..cecd3c4ed2 100644 --- a/RELEASENOTES.md +++ b/RELEASENOTES.md @@ -38,6 +38,13 @@ This release includes the following changes since the * Extract audio types from TS descriptors and map them to role flags, allowing users to make better-informed audio track selections ([#973](https://github.com/androidx/media/pull/973)). +* Audio: + * Improve silence skipping algorithm with smooth volume ramp, retained + minimal silence and more natural silence durations + ([#7423](https://github.com/google/ExoPlayer/issues/7423)). +* Video: +* Text: +* Metadata: * Image: * Add support for DASH thumbnails. Grid images are cropped and individual thumbnails are provided to `ImageOutput` close to their presentation diff --git a/libraries/exoplayer/src/main/java/androidx/media3/exoplayer/audio/SilenceSkippingAudioProcessor.java b/libraries/exoplayer/src/main/java/androidx/media3/exoplayer/audio/SilenceSkippingAudioProcessor.java index b4d35dd362..bc6eb805d2 100644 --- a/libraries/exoplayer/src/main/java/androidx/media3/exoplayer/audio/SilenceSkippingAudioProcessor.java +++ b/libraries/exoplayer/src/main/java/androidx/media3/exoplayer/audio/SilenceSkippingAudioProcessor.java @@ -15,14 +15,16 @@ */ package androidx.media3.exoplayer.audio; +import static androidx.media3.common.util.Assertions.checkArgument; +import static androidx.media3.common.util.Assertions.checkState; import static java.lang.Math.min; import static java.lang.annotation.ElementType.TYPE_USE; import androidx.annotation.IntDef; import androidx.media3.common.C; +import androidx.media3.common.Format; import androidx.media3.common.audio.AudioProcessor; import androidx.media3.common.audio.BaseAudioProcessor; -import androidx.media3.common.util.Assertions; import androidx.media3.common.util.UnstableApi; import androidx.media3.common.util.Util; import java.lang.annotation.Documented; @@ -39,103 +41,212 @@ import java.nio.ByteBuffer; public final class SilenceSkippingAudioProcessor extends BaseAudioProcessor { /** - * The default value for {@link #SilenceSkippingAudioProcessor(long, long, short) - * minimumSilenceDurationUs}. + * Default fraction of the original silence to keep. Between [0, 1]. 1 means keep all silence. 0 + * means remove all silence. */ - public static final long DEFAULT_MINIMUM_SILENCE_DURATION_US = 150_000; + public static final float DEFAULT_SILENCE_RETENTION_RATIO = 0.2f; /** - * The default value for {@link #SilenceSkippingAudioProcessor(long, long, short) - * paddingSilenceUs}. + * Default volume percentage to keep. + * + *

Even when modifying the volume to a mute state, it is ideal to decrease the volume instead + * of making the volume zero. Completely silent audio sounds like playback has stopped. While + * decreased volume sounds like very light background noise at a recording studio. */ - public static final long DEFAULT_PADDING_SILENCE_US = 20_000; + public static final int DEFAULT_MIN_VOLUME_TO_KEEP_PERCENTAGE = 10; - /** - * The default value for {@link #SilenceSkippingAudioProcessor(long, long, short) - * silenceThresholdLevel}. - */ + /** Default absolute level below which an individual PCM sample is classified as silent. */ public static final short DEFAULT_SILENCE_THRESHOLD_LEVEL = 1024; + /** + * Default minimum duration of audio that must be below {@code silenceThresholdLevel} before + * silence starts being trimmed. Specified in microseconds. + */ + public static final long DEFAULT_MINIMUM_SILENCE_DURATION_US = 100_000; + + /** + * Default maximum silence to keep in microseconds. This maximum is applied after {@code + * silenceRetentionRatio}. + */ + public static final long DEFAULT_MAX_SILENCE_TO_KEEP_DURATION_US = 2_000_000; + + /** + * @deprecated Specify silence behaviour via {@code silenceRetentionRatio} instead. + */ + @Deprecated public static final long DEFAULT_PADDING_SILENCE_US = 20_000; + /** Trimming states. */ @Documented @Retention(RetentionPolicy.SOURCE) @Target(TYPE_USE) @IntDef({ STATE_NOISY, - STATE_MAYBE_SILENT, - STATE_SILENT, + STATE_SHORTENING_SILENCE, }) private @interface State {} /** State when the input is not silent. */ private static final int STATE_NOISY = 0; - /** State when the input may be silent but we haven't read enough yet to know. */ - private static final int STATE_MAYBE_SILENT = 1; + /** + * State when the input has been silent less than or equal to {@link #maxSilenceToKeepDurationUs} + * and the silence is being shortened according to {@link #calculateShortenedSilenceLength(int)}. + */ + private static final int STATE_SHORTENING_SILENCE = 1; - /** State when the input is silent. */ - private static final int STATE_SILENT = 2; + /** Ways to change the volume of silence. */ + @Documented + @Retention(RetentionPolicy.SOURCE) + @Target(TYPE_USE) + @IntDef({ + FADE_OUT, + MUTE, + FADE_IN, + DO_NOT_CHANGE_VOLUME, + }) + private @interface VolumeChangeType {} - private final long minimumSilenceDurationUs; - private final long paddingSilenceUs; - private final short silenceThresholdLevel; - private int bytesPerFrame; - private boolean enabled; + private static final int FADE_OUT = 0; + private static final int MUTE = 1; + private static final int FADE_IN = 2; + private static final int DO_NOT_CHANGE_VOLUME = 3; /** - * Buffers audio data that may be classified as silence while in {@link #STATE_MAYBE_SILENT}. If - * the input becomes noisy before the buffer has filled, it will be output. Otherwise, the buffer - * contents will be dropped and the state will transition to {@link #STATE_SILENT}. + * Used with {@code minVolumeToKeepPercentageWhenMuting} to avoid round off errors. An alternative + * to this would be to use floats, but integer math is probably faster than floats. + */ + private static final int AVOID_TRUNCATION_FACTOR = 1000; + + /** + * Fraction of the original silence to keep. Between [0, 1]. 1 means keep all silence. 0 means + * remove all silence. + */ + private final float silenceRetentionRatio; + + /** Absolute level below which an individual PCM sample is classified as silent. 
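   * The default of 1024 corresponds to roughly 3% of the full-scale 16-bit amplitude (32768).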
*/ + private final short silenceThresholdLevel; + + /** + * Volume percentage to keep. Even when modifying the volume to a mute state, it is ideal to + * decrease the volume instead of making the volume zero. Completely silent audio sounds like + * playback has stopped. While decreased volume sounds like very light background noise from a + * recording studio. + */ + private final int minVolumeToKeepPercentageWhenMuting; + + /** + * Duration of audio that must be below {@link #silenceThresholdLevel} before silence starts being + * trimmed. Specified in microseconds. + */ + private final long minimumSilenceDurationUs; + + /** + * Maximum silence to keep in microseconds. This maximum is applied after {@link + * #silenceRetentionRatio}. + */ + private final long maxSilenceToKeepDurationUs; + + private AudioFormat inputFormat; + private int bytesPerFrame; + private boolean enabled; + private @State int state; + private long skippedFrames; + + /** + * The frames of silence that has been output since the last noise. Used to enforce {@link + * #maxSilenceToKeepDurationUs}. + */ + private int outputSilenceFramesSinceNoise = 0; + + /** + * Buffers audio data that may be classified as silence while in {@link + * #STATE_SHORTENING_SILENCE}. If the input becomes noisy before the buffer has filled, it will be + * output without shortening. Otherwise, the buffer will be output when filled as shortened + * silence and emptied. */ private byte[] maybeSilenceBuffer; /** - * Stores the latest part of the input while silent. It will be output as padding if the next - * input is noisy. + * An index into {@link #maybeSilenceBuffer} pointing to the location where silence that has not + * been output starts. */ - private byte[] paddingBuffer; + private int maybeSilenceBufferStartIndex = 0; - private @State int state; - private int maybeSilenceBufferSize; - private int paddingSize; - private boolean hasOutputNoise; - private long skippedFrames; + /** + * A count of the number of bytes of content in {@link #maybeSilenceBuffer}. The count starts at + * {@link #maybeSilenceBufferStartIndex}, and the bytes counted may wrap around to the start of + * the buffer. The count will never be greater than {@link #maybeSilenceBuffer}'s length. + */ + private int maybeSilenceBufferContentsSize = 0; + + /** Used to hold a subset of the contents of {@link #maybeSilenceBuffer} for convenience. */ + // TODO: This processor can probably be more efficient if this array is not used. Operations like + // modifyVolume() can be applied to a non-contiguous contents, the code is just more complex. + private byte[] contiguousOutputBuffer; /** Creates a new silence skipping audio processor. */ public SilenceSkippingAudioProcessor() { this( DEFAULT_MINIMUM_SILENCE_DURATION_US, - DEFAULT_PADDING_SILENCE_US, + DEFAULT_SILENCE_RETENTION_RATIO, + DEFAULT_MAX_SILENCE_TO_KEEP_DURATION_US, + DEFAULT_MIN_VOLUME_TO_KEEP_PERCENTAGE, DEFAULT_SILENCE_THRESHOLD_LEVEL); } /** - * Creates a new silence skipping audio processor. - * - * @param minimumSilenceDurationUs The minimum duration of audio that must be below {@code - * silenceThresholdLevel} to classify that part of audio as silent, in microseconds. - * @param paddingSilenceUs The duration of silence by which to extend non-silent sections, in - * microseconds. The value must not exceed {@code minimumSilenceDurationUs}. - * @param silenceThresholdLevel The absolute level below which an individual PCM sample is - * classified as silent. 
+ * @deprecated Use {@link #SilenceSkippingAudioProcessor(long, float, long, int, short)} instead. */ + @Deprecated public SilenceSkippingAudioProcessor( long minimumSilenceDurationUs, long paddingSilenceUs, short silenceThresholdLevel) { - Assertions.checkArgument(paddingSilenceUs <= minimumSilenceDurationUs); - this.minimumSilenceDurationUs = minimumSilenceDurationUs; - this.paddingSilenceUs = paddingSilenceUs; - this.silenceThresholdLevel = silenceThresholdLevel; - - maybeSilenceBuffer = Util.EMPTY_BYTE_ARRAY; - paddingBuffer = Util.EMPTY_BYTE_ARRAY; + this( + minimumSilenceDurationUs, + /* silenceRetentionRatio= */ (float) paddingSilenceUs / minimumSilenceDurationUs, + /* maxSilenceToKeepDurationUs= */ minimumSilenceDurationUs, + /* minVolumeToKeepPercentageWhenMuting= */ 0, + silenceThresholdLevel); } /** - * Sets whether to skip silence in the input. This method may only be called after draining data - * through the processor. The value returned by {@link #isActive()} may change, and the processor - * must be {@link #flush() flushed} before queueing more data. + * Creates a new silence trimming audio processor. * - * @param enabled Whether to skip silence in the input. + * @param minimumSilenceDurationUs Duration of audio that must be below {@code + * silenceThresholdLevel} before silence starts being trimmed, in microseconds. + * @param silenceRetentionRatio Fraction of the original silence to keep. Between [0, 1]. 1 means + * keep all silence. 0 means remove all silence. + * @param maxSilenceToKeepDurationUs Maximum silence to keep in microseconds. This maximum is + * applied after {@link #silenceRetentionRatio}. + * @param minVolumeToKeepPercentageWhenMuting Volume percentage to keep. Even when modifying the + * volume to a mute state, it is ideal to decrease the volume instead of making the volume + * zero. Completely silent audio sounds like playback has stopped. While decreased volume + * sounds like very light background noise from a recording studio. + * @param silenceThresholdLevel Absolute level below which an individual PCM sample is classified + * as silent. + */ + public SilenceSkippingAudioProcessor( + long minimumSilenceDurationUs, + float silenceRetentionRatio, + long maxSilenceToKeepDurationUs, + int minVolumeToKeepPercentageWhenMuting, + short silenceThresholdLevel) { + checkArgument(silenceRetentionRatio >= 0f && silenceRetentionRatio <= 1f); + this.minimumSilenceDurationUs = minimumSilenceDurationUs; + this.silenceRetentionRatio = silenceRetentionRatio; + this.maxSilenceToKeepDurationUs = maxSilenceToKeepDurationUs; + this.minVolumeToKeepPercentageWhenMuting = minVolumeToKeepPercentageWhenMuting; + this.silenceThresholdLevel = silenceThresholdLevel; + inputFormat = AudioFormat.NOT_SET; + maybeSilenceBuffer = Util.EMPTY_BYTE_ARRAY; + contiguousOutputBuffer = Util.EMPTY_BYTE_ARRAY; + } + + /** + * Sets whether to shorten silence in the input. This method may only be called after draining + * data through the processor. The value returned by {@link #isActive()} may change, and the + * processor must be {@link #flush() flushed} before queueing more data. + * + * @param enabled Whether to shorten silence in the input. */ public void setEnabled(boolean enabled) { this.enabled = enabled; @@ -149,20 +260,20 @@ public final class SilenceSkippingAudioProcessor extends BaseAudioProcessor { return skippedFrames; } - // AudioProcessor implementation. 
- @Override - public AudioFormat onConfigure(AudioFormat inputAudioFormat) + protected AudioFormat onConfigure(AudioFormat inputAudioFormat) throws UnhandledAudioFormatException { if (inputAudioFormat.encoding != C.ENCODING_PCM_16BIT) { throw new UnhandledAudioFormatException(inputAudioFormat); } - return enabled ? inputAudioFormat : AudioFormat.NOT_SET; + this.inputFormat = inputAudioFormat; + bytesPerFrame = inputAudioFormat.channelCount * 2; + return inputAudioFormat; } @Override public boolean isActive() { - return enabled; + return inputFormat.sampleRate != Format.NO_VALUE && enabled; } @Override @@ -172,11 +283,8 @@ public final class SilenceSkippingAudioProcessor extends BaseAudioProcessor { case STATE_NOISY: processNoisy(inputBuffer); break; - case STATE_MAYBE_SILENT: - processMaybeSilence(inputBuffer); - break; - case STATE_SILENT: - processSilence(inputBuffer); + case STATE_SHORTENING_SILENCE: + shortenSilenceSilenceUntilNoise(inputBuffer); break; default: throw new IllegalStateException(); @@ -185,48 +293,43 @@ public final class SilenceSkippingAudioProcessor extends BaseAudioProcessor { } @Override - protected void onQueueEndOfStream() { - if (maybeSilenceBufferSize > 0) { - // We haven't received enough silence to transition to the silent state, so output the buffer - // and switch back to the noisy state. - output(maybeSilenceBuffer, maybeSilenceBufferSize); - maybeSilenceBufferSize = 0; - state = STATE_NOISY; - } - if (!hasOutputNoise) { - skippedFrames += paddingSize / bytesPerFrame; + public void onQueueEndOfStream() { + // The maybeSilenceBuffer is only written to in the STATE_SHORTENING_SILENCE state, and + // is always completely flushed before leaving the STATE_SHORTENING_SILENCE. + if (maybeSilenceBufferContentsSize > 0) { + // There's bytes in the buffer. So the final chunk of shortened silence will be output to + // simulate a transition back to the noisy state and the end of output. + outputShortenedSilenceBuffer(/* shouldTransitionToNoisyState= */ true); + outputSilenceFramesSinceNoise = 0; } } @Override - protected void onFlush() { - if (enabled) { - bytesPerFrame = inputAudioFormat.bytesPerFrame; - int maybeSilenceBufferSize = durationUsToFrames(minimumSilenceDurationUs) * bytesPerFrame; + public void onFlush() { + if (isActive()) { + // Divide by 2 to allow the buffer to be split into two bytesPerFrame aligned parts. + int maybeSilenceBufferSize = + alignToBytePerFrameBoundary(durationUsToFrames(minimumSilenceDurationUs) / 2) * 2; if (maybeSilenceBuffer.length != maybeSilenceBufferSize) { maybeSilenceBuffer = new byte[maybeSilenceBufferSize]; - } - paddingSize = durationUsToFrames(paddingSilenceUs) * bytesPerFrame; - if (paddingBuffer.length != paddingSize) { - paddingBuffer = new byte[paddingSize]; + contiguousOutputBuffer = new byte[maybeSilenceBufferSize]; } } state = STATE_NOISY; skippedFrames = 0; - maybeSilenceBufferSize = 0; - hasOutputNoise = false; + outputSilenceFramesSinceNoise = 0; + maybeSilenceBufferStartIndex = 0; + maybeSilenceBufferContentsSize = 0; } @Override - protected void onReset() { + public void onReset() { enabled = false; - paddingSize = 0; + inputFormat = AudioFormat.NOT_SET; maybeSilenceBuffer = Util.EMPTY_BYTE_ARRAY; - paddingBuffer = Util.EMPTY_BYTE_ARRAY; + contiguousOutputBuffer = Util.EMPTY_BYTE_ARRAY; } - // Internal methods. - /** * Incrementally processes new input from {@code inputBuffer} while in {@link #STATE_NOISY}, * updating the state if needed. 
@@ -239,9 +342,9 @@ public final class SilenceSkippingAudioProcessor extends BaseAudioProcessor { int noiseLimit = findNoiseLimit(inputBuffer); if (noiseLimit == inputBuffer.position()) { // The buffer contains the start of possible silence. - state = STATE_MAYBE_SILENT; + state = STATE_SHORTENING_SILENCE; } else { - inputBuffer.limit(noiseLimit); + inputBuffer.limit(min(noiseLimit, inputBuffer.capacity())); output(inputBuffer); } @@ -251,72 +354,349 @@ public final class SilenceSkippingAudioProcessor extends BaseAudioProcessor { /** * Incrementally processes new input from {@code inputBuffer} while in {@link - * #STATE_MAYBE_SILENT}, updating the state if needed. + * #STATE_SHORTENING_SILENCE}, updating the state if needed. + * + *

If the amount of silence is less than {@link #minimumSilenceDurationUs}, then {@link + * #DO_NOT_CHANGE_VOLUME} is used to output the silence. + * + *
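   * <p>For example, with the defaults a 30 ms pause between words is shorter than {@link
   * #minimumSilenceDurationUs} and is therefore output unmodified, as exercised by {@code
   * skipInNoisySignalWithShortSilences_skipsNothing} in the unit test.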

If the amount of silence is more than {@link #minimumSilenceDurationUs}, then the following + * will be output: + * + *
   * <ul>
   *   <li>{@link #FADE_OUT} over the first half of {@link #maybeSilenceBuffer}.
   *   <li>{@link #MUTE} over the middle of the silence, shortened according to {@link
   *       #calculateShortenedSilenceLength(int)}.
   *   <li>{@link #FADE_IN} over the retained silence leading back into the noise.
   * </ul>
   *
Transitions to {@link #STATE_NOISY} if noise is encountered. It writes to {@link + * #maybeSilenceBuffer} in contiguous blocks. If the silence available is enough to wrap around + * the end of the buffer then the buffer is filled from {@link #maybeSilenceBufferStartIndex} to + * the buffers end and the beginning of the buffer is filled upon the next call to this method. */ - private void processMaybeSilence(ByteBuffer inputBuffer) { + private void shortenSilenceSilenceUntilNoise(ByteBuffer inputBuffer) { + checkState(maybeSilenceBufferStartIndex < maybeSilenceBuffer.length); + int limit = inputBuffer.limit(); int noisePosition = findNoisePosition(inputBuffer); - int maybeSilenceInputSize = noisePosition - inputBuffer.position(); - int maybeSilenceBufferRemaining = maybeSilenceBuffer.length - maybeSilenceBufferSize; - if (noisePosition < limit && maybeSilenceInputSize < maybeSilenceBufferRemaining) { - // The maybe silence buffer isn't full, so output it and switch back to the noisy state. - output(maybeSilenceBuffer, maybeSilenceBufferSize); - maybeSilenceBufferSize = 0; - state = STATE_NOISY; + int silenceInputSize = noisePosition - inputBuffer.position(); + + int indexToWriteTo; + int contiguousBufferRemaining; + if (maybeSilenceBufferStartIndex + maybeSilenceBufferContentsSize < maybeSilenceBuffer.length) { + // ^0---^start---^end---^length + contiguousBufferRemaining = + maybeSilenceBuffer.length + - (maybeSilenceBufferContentsSize + maybeSilenceBufferStartIndex); + indexToWriteTo = maybeSilenceBufferStartIndex + maybeSilenceBufferContentsSize; } else { - // Fill as much of the maybe silence buffer as possible. - int bytesToWrite = min(maybeSilenceInputSize, maybeSilenceBufferRemaining); - inputBuffer.limit(inputBuffer.position() + bytesToWrite); - inputBuffer.get(maybeSilenceBuffer, maybeSilenceBufferSize, bytesToWrite); - maybeSilenceBufferSize += bytesToWrite; - if (maybeSilenceBufferSize == maybeSilenceBuffer.length) { - // We've reached a period of silence, so skip it, taking in to account padding for both - // the noisy to silent transition and any future silent to noisy transition. - if (hasOutputNoise) { - output(maybeSilenceBuffer, paddingSize); - skippedFrames += (maybeSilenceBufferSize - paddingSize * 2) / bytesPerFrame; + // The bytes have wrapped around. ^0---^end---^start---^length + int amountInUpperPartOfBuffer = maybeSilenceBuffer.length - maybeSilenceBufferStartIndex; + indexToWriteTo = maybeSilenceBufferContentsSize - amountInUpperPartOfBuffer; + contiguousBufferRemaining = maybeSilenceBufferStartIndex - indexToWriteTo; + } + + boolean noiseFound = noisePosition < limit; + // Fill as much of the silence buffer as possible. + int bytesOfInput = min(silenceInputSize, contiguousBufferRemaining); + inputBuffer.limit(inputBuffer.position() + bytesOfInput); + inputBuffer.get(maybeSilenceBuffer, indexToWriteTo, bytesOfInput); + maybeSilenceBufferContentsSize += bytesOfInput; + + checkState(maybeSilenceBufferContentsSize <= maybeSilenceBuffer.length); + + boolean shouldTransitionToNoisyState = + noiseFound + && + /* The silence before the noise is not enough to fill the remaining buffer. */ + silenceInputSize < contiguousBufferRemaining; + + outputShortenedSilenceBuffer(shouldTransitionToNoisyState); + + if (shouldTransitionToNoisyState) { + state = STATE_NOISY; + outputSilenceFramesSinceNoise = 0; + } + + // Restore the limit. + inputBuffer.limit(limit); + } + + /** See {@link #shortenSilenceSilenceUntilNoise}. 
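   * <p>Depending on where the buffered silence sits within the silent run, it is emitted with
   * {@link #DO_NOT_CHANGE_VOLUME} (silence that ends before the buffer fills), {@link #FADE_OUT}
   * (the first half of the buffer at the start of a long silence), {@link #MUTE} (the shortened
   * middle section), or {@link #FADE_IN} (the transition back to noise).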
*/ + private void outputShortenedSilenceBuffer(boolean shouldTransitionToNoisyState) { + int sizeBeforeOutput = maybeSilenceBufferContentsSize; + int bytesToOutput; + @VolumeChangeType int volumeChangeType; + int bytesConsumed; + // Only output when buffer is full or transitioning to noisy state. + if (maybeSilenceBufferContentsSize == maybeSilenceBuffer.length + || shouldTransitionToNoisyState) { + if (outputSilenceFramesSinceNoise == 0) { + // This is the beginning of a silence chunk so keep MINIMUM_SILENCE_DURATION_US / 2 of the + // silence. + if (shouldTransitionToNoisyState) { + volumeChangeType = DO_NOT_CHANGE_VOLUME; + bytesToOutput = maybeSilenceBufferContentsSize; + outputSilence(bytesToOutput, volumeChangeType); + bytesConsumed = bytesToOutput; } else { - skippedFrames += (maybeSilenceBufferSize - paddingSize) / bytesPerFrame; + checkState(maybeSilenceBufferContentsSize >= maybeSilenceBuffer.length / 2); + // To keep this block a tad simpler, by always outputting exactly buffer size / 2 to avoid + // needing to add the shortening code here. + volumeChangeType = FADE_OUT; + bytesToOutput = maybeSilenceBuffer.length / 2; + outputSilence(bytesToOutput, volumeChangeType); + bytesConsumed = bytesToOutput; } - updatePaddingBuffer(inputBuffer, maybeSilenceBuffer, maybeSilenceBufferSize); - maybeSilenceBufferSize = 0; - state = STATE_SILENT; + } else if (shouldTransitionToNoisyState) { + volumeChangeType = FADE_IN; + + int bytesRemainingAfterOutputtingHalfMin = + maybeSilenceBufferContentsSize - maybeSilenceBuffer.length / 2; + + bytesConsumed = bytesRemainingAfterOutputtingHalfMin + maybeSilenceBuffer.length / 2; + int shortenedSilenceLength = + calculateShortenedSilenceLength(bytesRemainingAfterOutputtingHalfMin); + + // For simplicity we fade in over the shortened silence and the half buffer of padding. + // This acts to increase the padding a bit which only helps (probably imperceptibly) + // the sound quality. + bytesToOutput = maybeSilenceBuffer.length / 2 + shortenedSilenceLength; + outputSilence(bytesToOutput, volumeChangeType); + } else { + volumeChangeType = MUTE; + // Output as much as possible while still keeping half the buffer full so that half the + // min silence can be output later as padding. + bytesConsumed = maybeSilenceBufferContentsSize - maybeSilenceBuffer.length / 2; + + bytesToOutput = calculateShortenedSilenceLength(bytesConsumed); + outputSilence(bytesToOutput, volumeChangeType); } - // Restore the limit. - inputBuffer.limit(limit); + checkState( + bytesConsumed % bytesPerFrame == 0, + "bytesConsumed is not aligned to frame size: %s" + bytesConsumed); + + checkState((sizeBeforeOutput >= bytesToOutput)); + + maybeSilenceBufferContentsSize -= bytesConsumed; + maybeSilenceBufferStartIndex += bytesConsumed; + // The start index might wrap back around to the start of the buffer. + maybeSilenceBufferStartIndex %= maybeSilenceBuffer.length; + + outputSilenceFramesSinceNoise += bytesToOutput / bytesPerFrame; + skippedFrames += (bytesConsumed - bytesToOutput) / bytesPerFrame; } } /** - * Incrementally processes new input from {@code inputBuffer} while in {@link #STATE_SILENT}, - * updating the state if needed. + * Returns the appropriate size that a given number of bytes of silence should be shortened to. It + * calculates this using the {@link #outputSilenceFramesSinceNoise} and the {@link + * #silenceRetentionRatio}. 
The {@link #silenceRetentionRatio} multiplied by {@code + * silenceToShortenBytes} is returned until a max outputted silence length is hit, and then only + * the remaining silence between the current {@link #outputSilenceFramesSinceNoise} and {@link + * #maxSilenceToKeepDurationUs} is reached. */ - private void processSilence(ByteBuffer inputBuffer) { - int limit = inputBuffer.limit(); - int noisyPosition = findNoisePosition(inputBuffer); - inputBuffer.limit(noisyPosition); - skippedFrames += inputBuffer.remaining() / bytesPerFrame; - updatePaddingBuffer(inputBuffer, paddingBuffer, paddingSize); - if (noisyPosition < limit) { - // Output the padding, which may include previous input as well as new input, then transition - // back to the noisy state. - output(paddingBuffer, paddingSize); - state = STATE_NOISY; + private int calculateShortenedSilenceLength(int silenceToShortenBytes) { + // Start skipping silence to keep the silence below MAX_SILENCE_DURATION_US long. + int bytesNeededToReachMax = + (durationUsToFrames(maxSilenceToKeepDurationUs) - outputSilenceFramesSinceNoise) + * bytesPerFrame + - maybeSilenceBuffer.length / 2; - // Restore the limit. - inputBuffer.limit(limit); - } + checkState(bytesNeededToReachMax >= 0); + + return alignToBytePerFrameBoundary( + min(silenceToShortenBytes * silenceRetentionRatio + .5f, bytesNeededToReachMax)); } /** - * Copies {@code length} elements from {@code data} to populate a new output buffer from the - * processor. + * Method used to avoid rounding errors while calculating output and skipped frames. The given + * {@code value} is decreased to the nearest value that is divisible by {@link #bytesPerFrame}. */ - private void output(byte[] data, int length) { - replaceOutputBuffer(length).put(data, 0, length).flip(); - if (length > 0) { - hasOutputNoise = true; + private int alignToBytePerFrameBoundary(int value) { + return (value / bytesPerFrame) * bytesPerFrame; + } + + /** + * Method used to avoid rounding errors while calculating output and skipped frames. The given + * {@code value} is decreased to the nearest value that is divisible by {@link #bytesPerFrame}. + */ + private int alignToBytePerFrameBoundary(float value) { + return alignToBytePerFrameBoundary((int) value); + } + + /** Copies elements from {@code data} to populate a new output buffer from the processor. */ + private void outputRange(byte[] data, int size, @VolumeChangeType int rampType) { + checkArgument( + size % bytesPerFrame == 0, "byteOutput size is not aligned to frame size " + size); + + modifyVolume(data, size, rampType); + replaceOutputBuffer(size).put(data, 0, size).flip(); + } + + /** + * Copies {@code sizeToOutput} elements from the {@link #maybeSilenceBuffer} to {@link + * #contiguousOutputBuffer}. The contents of {@link #maybeSilenceBuffer} can wrap around from the + * end of the buffer and back to the beginning. The {@link #contiguousOutputBuffer} content always + * start from index 0. + * + * @param rampType This parameter is used to determine which part of the {@link + * #maybeSilenceBuffer} contents need to be kept. For {@link #FADE_IN} the end of the contents + * is always kept. Otherwise the beginning of the contents are always kept. + */ + private void outputSilence(int sizeToOutput, @VolumeChangeType int rampType) { + if (sizeToOutput == 0) { + return; + } + + checkArgument(maybeSilenceBufferContentsSize >= sizeToOutput); + + if (rampType == FADE_IN) { + // Keeps the end of the buffer because we are padding the start of the next chunk of noise. 
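      // Example (mono 16-bit, so bytesPerFrame == 2): with maybeSilenceBuffer.length == 12,
      // maybeSilenceBufferStartIndex == 8 and maybeSilenceBufferContentsSize == 6, the contents
      // wrap around and occupy indices 8..11 and 0..1. A FADE_IN output of sizeToOutput == 4 then
      // copies the last 4 content bytes (indices 10, 11, 0, 1) into contiguousOutputBuffer.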
+ if (maybeSilenceBufferStartIndex + maybeSilenceBufferContentsSize + <= maybeSilenceBuffer.length) { + // ^0---^start---^end---^length + System.arraycopy( + maybeSilenceBuffer, + maybeSilenceBufferStartIndex + maybeSilenceBufferContentsSize - sizeToOutput, + contiguousOutputBuffer, + 0, + sizeToOutput); + } else { + // ^0---^end--^start---^length + int sizeInUpperPartOfArray = maybeSilenceBuffer.length - maybeSilenceBufferStartIndex; + int sizeInLowerPartOfArray = maybeSilenceBufferContentsSize - sizeInUpperPartOfArray; + if (sizeInLowerPartOfArray >= sizeToOutput) { + // We just need the lower part of the array. + System.arraycopy( + maybeSilenceBuffer, + sizeInLowerPartOfArray - sizeToOutput, + contiguousOutputBuffer, + 0, + sizeToOutput); + } else { + int sizeToOutputInUpperPart = sizeToOutput - sizeInLowerPartOfArray; + System.arraycopy( + maybeSilenceBuffer, + maybeSilenceBuffer.length - sizeToOutputInUpperPart, + contiguousOutputBuffer, + 0, + sizeToOutputInUpperPart); + + // Copy everything from lower part. DO_NOT_CHANGE_VOLUME (which keeps everything) and + // MUTE (where the content that is kept only provides background noise). + System.arraycopy( + maybeSilenceBuffer, + 0, + contiguousOutputBuffer, + sizeToOutputInUpperPart, + sizeInLowerPartOfArray); + } + } + } else { + if (maybeSilenceBufferStartIndex + sizeToOutput <= maybeSilenceBuffer.length) { + // ^0---^start---^end---^length + System.arraycopy( + maybeSilenceBuffer, + maybeSilenceBufferStartIndex, + contiguousOutputBuffer, + 0, + sizeToOutput); + } else { + // ^0---^end (of content to output now)---^start---^length + int sizeToCopyInUpperPartOfArray = maybeSilenceBuffer.length - maybeSilenceBufferStartIndex; + // Copy the upper part of the array. + System.arraycopy( + maybeSilenceBuffer, + maybeSilenceBufferStartIndex, + contiguousOutputBuffer, + 0, + sizeToCopyInUpperPartOfArray); + int amountToCopyFromLowerPartOfArray = sizeToOutput - sizeToCopyInUpperPartOfArray; + System.arraycopy( + maybeSilenceBuffer, + 0, + contiguousOutputBuffer, + sizeToCopyInUpperPartOfArray, + amountToCopyFromLowerPartOfArray); + } + } + + checkArgument( + sizeToOutput % bytesPerFrame == 0, + "sizeToOutput is not aligned to frame size: " + sizeToOutput); + checkState(maybeSilenceBufferStartIndex < maybeSilenceBuffer.length); + + outputRange(contiguousOutputBuffer, sizeToOutput, rampType); + } + + /** + * Modifies the amplitude of the samples in {@code sampleBuffer} based on the given {@link + * VolumeChangeType}. 
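   * <p>{@link #FADE_OUT} scales the samples approximately linearly from 100% down to {@link
   * #minVolumeToKeepPercentageWhenMuting}, {@link #FADE_IN} ramps back up from {@link
   * #minVolumeToKeepPercentageWhenMuting} to 100%, {@link #MUTE} applies the constant {@link
   * #minVolumeToKeepPercentageWhenMuting}, and {@link #DO_NOT_CHANGE_VOLUME} leaves the samples
   * untouched. With the default of 10, a muted stretch therefore keeps 10% of the original
   * amplitude as low-level background noise.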
+ */ + private void modifyVolume(byte[] sampleBuffer, int size, @VolumeChangeType int volumeChangeType) { + if (volumeChangeType == DO_NOT_CHANGE_VOLUME) { + return; + } + + for (int idx = 0; idx < size; idx += 2) { + byte mostSignificantByte = sampleBuffer[idx + 1]; + byte leastSignificantByte = sampleBuffer[idx]; + int sample = twoByteSampleToInt(mostSignificantByte, leastSignificantByte); + + int volumeModificationPercentage; + if (volumeChangeType == FADE_OUT) { + volumeModificationPercentage = + calculateFadeOutPercentage(/* value= */ idx, /* max= */ size - 1); + } else if (volumeChangeType == FADE_IN) { + volumeModificationPercentage = + calculateFadeInPercentage(/* value= */ idx, /* max= */ size - 1); + } else { + volumeModificationPercentage = minVolumeToKeepPercentageWhenMuting; + } + + sample = (sample * volumeModificationPercentage) / 100; + sampleIntToTwoBigEndianBytes(sampleBuffer, idx, sample); + } + } + + private int calculateFadeOutPercentage(int value, int max) { + return ((minVolumeToKeepPercentageWhenMuting - 100) * ((AVOID_TRUNCATION_FACTOR * value) / max)) + / AVOID_TRUNCATION_FACTOR + + 100; + } + + private int calculateFadeInPercentage(int value, int max) { + return (minVolumeToKeepPercentageWhenMuting + + ((100 - minVolumeToKeepPercentageWhenMuting) * (AVOID_TRUNCATION_FACTOR * value) / max) + / AVOID_TRUNCATION_FACTOR); + } + + private static int twoByteSampleToInt(byte mostSignificantByte, byte leastSignificantByte) { + return ((leastSignificantByte & 0xFF) | mostSignificantByte << 8); + } + + /** + * Converts {@code sample} into the corresponding big-endian 16bit bytes within {@code byteArray}. + */ + private static void sampleIntToTwoBigEndianBytes(byte[] byteArray, int startIndex, int sample) { + // Avoid 16-bit-integer overflow when writing back the manipulated data. + if (sample >= Short.MAX_VALUE) { + byteArray[startIndex] = (byte) 0xFF; + byteArray[startIndex + 1] = (byte) 0x7F; + } else if (sample <= Short.MIN_VALUE) { + byteArray[startIndex] = (byte) 0x00; + byteArray[startIndex + 1] = (byte) 0x80; + } else { + byteArray[startIndex] = (byte) (sample & 0xFF); + byteArray[startIndex + 1] = (byte) (sample >> 8); } } @@ -324,36 +704,14 @@ public final class SilenceSkippingAudioProcessor extends BaseAudioProcessor { * Copies remaining bytes from {@code data} to populate a new output buffer from the processor. */ private void output(ByteBuffer data) { - int length = data.remaining(); - replaceOutputBuffer(length).put(data).flip(); - if (length > 0) { - hasOutputNoise = true; - } - } - - /** - * Fills {@link #paddingBuffer} using data from {@code input}, plus any additional buffered data - * at the end of {@code buffer} (up to its {@code size}) required to fill it, advancing the input - * position. - */ - private void updatePaddingBuffer(ByteBuffer input, byte[] buffer, int size) { - int fromInputSize = min(input.remaining(), paddingSize); - int fromBufferSize = paddingSize - fromInputSize; - System.arraycopy( - /* src= */ buffer, - /* srcPos= */ size - fromBufferSize, - /* dest= */ paddingBuffer, - /* destPos= */ 0, - /* length= */ fromBufferSize); - input.position(input.limit() - fromInputSize); - input.get(paddingBuffer, fromBufferSize, fromInputSize); + replaceOutputBuffer(data.remaining()).put(data).flip(); } /** * Returns the number of input frames corresponding to {@code durationUs} microseconds of audio. 
*/ private int durationUsToFrames(long durationUs) { - return (int) ((durationUs * inputAudioFormat.sampleRate) / C.MICROS_PER_SECOND); + return (int) ((durationUs * inputFormat.sampleRate) / C.MICROS_PER_SECOND); } /** @@ -362,8 +720,8 @@ public final class SilenceSkippingAudioProcessor extends BaseAudioProcessor { */ private int findNoisePosition(ByteBuffer buffer) { // The input is in ByteOrder.nativeOrder(), which is little endian on Android. - for (int i = buffer.position(); i < buffer.limit(); i += 2) { - if (Math.abs(buffer.getShort(i)) > silenceThresholdLevel) { + for (int i = buffer.position() + 1; i < buffer.limit(); i += 2) { + if (isNoise(buffer.get(i), buffer.get(i - 1))) { // Round to the start of the frame. return bytesPerFrame * (i / bytesPerFrame); } @@ -377,12 +735,21 @@ public final class SilenceSkippingAudioProcessor extends BaseAudioProcessor { */ private int findNoiseLimit(ByteBuffer buffer) { // The input is in ByteOrder.nativeOrder(), which is little endian on Android. - for (int i = buffer.limit() - 2; i >= buffer.position(); i -= 2) { - if (Math.abs(buffer.getShort(i)) > silenceThresholdLevel) { + for (int i = buffer.limit() - 1; i >= buffer.position(); i -= 2) { + if (isNoise(buffer.get(i), buffer.get(i - 1))) { // Return the start of the next frame. return bytesPerFrame * (i / bytesPerFrame) + bytesPerFrame; } } return buffer.position(); } + + /** + * Whether the given two bytes represent a short signed PCM value that is greater than {@link + * #silenceThresholdLevel}. + */ + private boolean isNoise(byte mostSignificantByte, byte leastSignificantByte) { + return Math.abs(twoByteSampleToInt(mostSignificantByte, leastSignificantByte)) + > silenceThresholdLevel; + } } diff --git a/libraries/exoplayer/src/test/java/androidx/media3/exoplayer/audio/SilenceSkippingAudioProcessorTest.java b/libraries/exoplayer/src/test/java/androidx/media3/exoplayer/audio/SilenceSkippingAudioProcessorTest.java index bb8e87b8b2..6d16a5655f 100644 --- a/libraries/exoplayer/src/test/java/androidx/media3/exoplayer/audio/SilenceSkippingAudioProcessorTest.java +++ b/libraries/exoplayer/src/test/java/androidx/media3/exoplayer/audio/SilenceSkippingAudioProcessorTest.java @@ -23,6 +23,7 @@ import static java.lang.Short.MAX_VALUE; import androidx.media3.common.C; import androidx.media3.common.audio.AudioProcessor.AudioFormat; import androidx.test.ext.junit.runners.AndroidJUnit4; +import com.google.common.collect.Range; import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.nio.ShortBuffer; @@ -86,7 +87,7 @@ public final class SilenceSkippingAudioProcessorTest { @Test public void skipInSilentSignal_skipsEverything() throws Exception { - // Given a signal with only noise. + // Given a signal with only silence. InputBufferProvider inputBufferProvider = getInputBufferProviderForAlternatingSilenceAndNoise( TEST_SIGNAL_SILENCE_DURATION_MS, /* noiseDurationMs= */ 0, TEST_SIGNAL_FRAME_COUNT); @@ -99,14 +100,15 @@ public final class SilenceSkippingAudioProcessorTest { long totalOutputFrames = process(silenceSkippingAudioProcessor, inputBufferProvider, INPUT_BUFFER_SIZE); - // The entire signal is skipped. - assertThat(totalOutputFrames).isEqualTo(0); - assertThat(silenceSkippingAudioProcessor.getSkippedFrames()).isEqualTo(TEST_SIGNAL_FRAME_COUNT); + // The entire signal is skipped except for the DEFAULT_MAX_SILENCE_TO_KEEP_DURATION_US. 
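    // DEFAULT_MAX_SILENCE_TO_KEEP_DURATION_US is 2_000_000 us, which at the test signal's
    // 1000 frames per second corresponds to 2000 frames.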
+ assertThat(totalOutputFrames).isEqualTo(2000); + assertThat(silenceSkippingAudioProcessor.getSkippedFrames()) + .isEqualTo(TEST_SIGNAL_FRAME_COUNT - 2000); } @Test public void skipInNoisySignal_skipsNothing() throws Exception { - // Given a signal with only silence. + // Given a signal with only noise. InputBufferProvider inputBufferProvider = getInputBufferProviderForAlternatingSilenceAndNoise( /* silenceDurationMs= */ 0, TEST_SIGNAL_NOISE_DURATION_MS, TEST_SIGNAL_FRAME_COUNT); @@ -126,6 +128,30 @@ public final class SilenceSkippingAudioProcessorTest { assertThat(silenceSkippingAudioProcessor.getSkippedFrames()).isEqualTo(0); } + @Test + public void skipInNoisySignalWithShortSilences_skipsNothing() throws Exception { + // Given a signal with only noise. + InputBufferProvider inputBufferProvider = + getInputBufferProviderForAlternatingSilenceAndNoise( + /* silenceDurationMs= */ 30, + TEST_SIGNAL_NOISE_DURATION_MS - 30, + TEST_SIGNAL_FRAME_COUNT); + + // When processing the entire signal. + SilenceSkippingAudioProcessor silenceSkippingAudioProcessor = + new SilenceSkippingAudioProcessor(); + silenceSkippingAudioProcessor.setEnabled(true); + silenceSkippingAudioProcessor.configure(AUDIO_FORMAT); + silenceSkippingAudioProcessor.flush(); + assertThat(silenceSkippingAudioProcessor.isActive()).isTrue(); + long totalOutputFrames = + process(silenceSkippingAudioProcessor, inputBufferProvider, INPUT_BUFFER_SIZE); + + // None of the signal is skipped. + assertThat(totalOutputFrames).isEqualTo(TEST_SIGNAL_FRAME_COUNT); + assertThat(silenceSkippingAudioProcessor.getSkippedFrames()).isEqualTo(0); + } + @Test public void skipInAlternatingTestSignal_hasCorrectOutputAndSkippedFrameCounts() throws Exception { // Given a signal that alternates between silence and noise. @@ -145,10 +171,10 @@ public final class SilenceSkippingAudioProcessorTest { long totalOutputFrames = process(silenceSkippingAudioProcessor, inputBufferProvider, INPUT_BUFFER_SIZE); - // The output consists of 50000 frames of noise, plus 20 frames of padding at the start and 99 * - // 40 frames of padding after that. - assertThat(totalOutputFrames).isEqualTo(50000 + (20 + 99 * 40)); - assertThat(silenceSkippingAudioProcessor.getSkippedFrames()).isEqualTo(50000 - (20 + 99 * 40)); + // The output has 50000 frames of noise, plus 50 * 0.2 * 1000 padding (plus rounding errors). + assertThat(totalOutputFrames).isIn(Range.closed(60000L - 500L, 60000L + 500L)); + assertThat(silenceSkippingAudioProcessor.getSkippedFrames()) + .isEqualTo(TEST_SIGNAL_FRAME_COUNT - totalOutputFrames); } @Test @@ -171,10 +197,10 @@ public final class SilenceSkippingAudioProcessorTest { long totalOutputFrames = process(silenceSkippingAudioProcessor, inputBufferProvider, /* inputBufferSize= */ 80); - // The output consists of 50000 frames of noise, plus 20 frames of padding at the start and 99 * - // 40 frames of padding after that. - assertThat(totalOutputFrames).isEqualTo(50000 + (20 + 99 * 40)); - assertThat(silenceSkippingAudioProcessor.getSkippedFrames()).isEqualTo(50000 - (20 + 99 * 40)); + // The output has 50000 frames of noise, plus 50 * 0.2 * 1000 padding (plus rounding errors). 
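    // That is, 50 one-second silences of 1000 frames each, retained at the default ratio of 0.2,
    // contribute roughly 10000 frames on top of the 50000 noise frames, giving the ~60000 below.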
+ assertThat(totalOutputFrames).isIn(Range.closed(60000L - 500L, 60000L + 500L)); + assertThat(silenceSkippingAudioProcessor.getSkippedFrames()) + .isEqualTo(TEST_SIGNAL_FRAME_COUNT - totalOutputFrames); } @Test @@ -197,14 +223,14 @@ public final class SilenceSkippingAudioProcessorTest { long totalOutputFrames = process(silenceSkippingAudioProcessor, inputBufferProvider, /* inputBufferSize= */ 120); - // The output consists of 50000 frames of noise, plus 20 frames of padding at the start and 99 * - // 40 frames of padding after that. - assertThat(totalOutputFrames).isEqualTo(50000 + (20 + 99 * 40)); - assertThat(silenceSkippingAudioProcessor.getSkippedFrames()).isEqualTo(50000 - (20 + 99 * 40)); + // The output has 50000 frames of noise, plus 50 * 0.2 * 1000 padding (plus rounding errors). + assertThat(totalOutputFrames).isIn(Range.closed(60000L - 500L, 60000L + 500L)); + assertThat(silenceSkippingAudioProcessor.getSkippedFrames()) + .isEqualTo(TEST_SIGNAL_FRAME_COUNT - totalOutputFrames); } @Test - public void customPaddingValue_hasCorrectOutputAndSkippedFrameCounts() throws Exception { + public void customSilenceRetentionValue_hasCorrectOutputAndSkippedFrameCounts() throws Exception { // Given a signal that alternates between silence and noise. InputBufferProvider inputBufferProvider = getInputBufferProviderForAlternatingSilenceAndNoise( @@ -212,11 +238,13 @@ public final class SilenceSkippingAudioProcessorTest { TEST_SIGNAL_NOISE_DURATION_MS, TEST_SIGNAL_FRAME_COUNT); - // When processing the entire signal with a larger than normal padding silence. + // When processing the entire signal with a smaller than normal retention ratio. SilenceSkippingAudioProcessor silenceSkippingAudioProcessor = new SilenceSkippingAudioProcessor( SilenceSkippingAudioProcessor.DEFAULT_MINIMUM_SILENCE_DURATION_US, - /* paddingSilenceUs= */ 21_000, + /* silenceRetentionRatio= */ 0.05f, + SilenceSkippingAudioProcessor.DEFAULT_MAX_SILENCE_TO_KEEP_DURATION_US, + SilenceSkippingAudioProcessor.DEFAULT_MIN_VOLUME_TO_KEEP_PERCENTAGE, SilenceSkippingAudioProcessor.DEFAULT_SILENCE_THRESHOLD_LEVEL); silenceSkippingAudioProcessor.setEnabled(true); silenceSkippingAudioProcessor.configure(AUDIO_FORMAT); @@ -225,10 +253,10 @@ public final class SilenceSkippingAudioProcessorTest { long totalOutputFrames = process(silenceSkippingAudioProcessor, inputBufferProvider, /* inputBufferSize= */ 120); - // The output consists of 50000 frames of noise, plus 21 frames of padding at the start and 99 * - // 42 frames of padding after that. - assertThat(totalOutputFrames).isEqualTo(50000 + (21 + 99 * 42)); - assertThat(silenceSkippingAudioProcessor.getSkippedFrames()).isEqualTo(50000 - (21 + 99 * 42)); + // The output has 50000 frames of noise, plus 50 * 0.05 * 1000 padding (plus rounding errors). + assertThat(totalOutputFrames).isIn(Range.closed(52500L - 500L, 52500L + 500L)); + assertThat(silenceSkippingAudioProcessor.getSkippedFrames()) + .isEqualTo(TEST_SIGNAL_FRAME_COUNT - totalOutputFrames); } @Test