Fix AV sync for sequences with audio track shorter than video

For each item, AudioGraphInput now pads the input audio track with silence
up to the duration passed to onMediaItemChanged.

Possibly resolves androidx/media#921.
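
For reference, a minimal standalone sketch of the padding decision (illustrative only; the real logic is in the AudioGraphInput changes below, and the helper names and example values here are made up). Given the expected item duration from onMediaItemChanged and the PCM bytes actually consumed, silence is appended at end of stream when the shortfall exceeds a small drift tolerance:

// Illustrative sketch, not part of this commit. Mirrors the 2ms tolerance used by
// AudioGraphInput; method names and the example values are hypothetical.
public final class SilencePaddingSketch {
  private static final long MAX_AUDIO_DRIFT_ALLOWED_US = 2_000;

  /** Duration represented by the PCM bytes consumed so far. */
  static long actualInputDurationUs(long bytesRead, int bytesPerFrame, int sampleRate) {
    long frames = bytesRead / bytesPerFrame;
    return frames * 1_000_000L / sampleRate;
  }

  /** Microseconds of silence to append at end of stream, or 0 if within tolerance. */
  static long silenceToAppendUs(long expectedDurationUs, long actualDurationUs) {
    long shortfallUs = expectedDurationUs - actualDurationUs;
    return shortfallUs > MAX_AUDIO_DRIFT_ALLOWED_US ? shortfallUs : 0;
  }

  public static void main(String[] args) {
    // 800ms of 16-bit stereo 44.1kHz audio inside a 1.2s item.
    int bytesPerFrame = 2 * 2;
    int sampleRate = 44_100;
    long bytesRead = 35_280L * bytesPerFrame; // 35_280 frames is exactly 800_000us
    long actualUs = actualInputDurationUs(bytesRead, bytesPerFrame, sampleRate);
    System.out.println(silenceToAppendUs(/* expectedDurationUs= */ 1_200_000, actualUs)); // 400000
  }
}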

PiperOrigin-RevId: 634753721
dancho 2024-05-17 06:52:12 -07:00 committed by Copybara-Service
parent 34792f7b11
commit b9ec24a269
12 changed files with 434 additions and 24 deletions

View File

@ -45,6 +45,7 @@
`androidx.media3.transformer.Muxer`.
* Fix HEIC image loading from content URI schemes.
([#1373](https://github.com/androidx/media/issues/1373)).
* Adjust audio track duration in `AudioGraphInput` to improve AV sync.
* Track Selection:
* Extractors:
* MPEG-TS: Roll forward the change ensuring the last frame is rendered by

View File

@ -32,6 +32,7 @@
<item>720p H264 video with no audio</item>
<item>London JPG image (plays for 5 secs at 30 fps)</item>
<item>Tokyo JPG image (portrait, plays for 5 secs at 30 fps)</item>
<item>Pixel 7 shorter audio track</item>
</string-array>
<string-array name="preset_uris">
<item>https://storage.googleapis.com/exoplayer-test-media-1/mp4/android-screens-10s.mp4</item>
@ -51,6 +52,7 @@
<item>https://storage.googleapis.com/exoplayer-test-media-1/mp4/sample_video_track_only.mp4</item>
<item>https://storage.googleapis.com/exoplayer-test-media-1/jpg/london.jpg</item>
<item>https://storage.googleapis.com/exoplayer-test-media-1/jpg/tokyo.jpg</item>
<item>https://storage.googleapis.com/exoplayer-temp/audio-blip/metronome_selfie_pixel.mp4</item>
</string-array>
<integer-array name="preset_durations">
<item>10024000</item>
@ -70,5 +72,6 @@
<item>1001000</item>
<item>5000000</item>
<item>5000000</item>
<item>2170000</item>
</integer-array>
</resources>

View File

@ -142,8 +142,13 @@ sample:
isKeyFrame = true
sample:
trackType = audio
dataHashCode = -363894499
size = 2282
dataHashCode = 446253636
size = 4096
isKeyFrame = true
sample:
trackType = audio
dataHashCode = -1759454975
size = 440
isKeyFrame = true
sample:
trackType = video

View File

@ -161,6 +161,18 @@ public final class AndroidTestUtil {
.setCodecs("avc1.42C015")
.build();
public static final String MP4_ASSET_WITH_SHORTER_AUDIO_URI_STRING =
"asset:///media/mp4/sample_shorter_audio.mp4";
public static final Format MP4_ASSET_WITH_SHORTER_AUDIO_FORMAT =
new Format.Builder()
.setSampleMimeType(VIDEO_H264)
.setWidth(320)
.setHeight(240)
.setFrameRate(30.00f)
.setCodecs("avc1.42C015")
.build();
public static final String MP4_ASSET_SEF_URI_STRING =
"asset:///media/mp4/sample_sef_slow_motion.mp4";
public static final Format MP4_ASSET_SEF_FORMAT =

View File

@ -25,6 +25,8 @@ import static androidx.media3.transformer.AndroidTestUtil.MP4_ASSET_URI_STRING;
import static androidx.media3.transformer.AndroidTestUtil.MP4_ASSET_WITH_INCREASING_TIMESTAMPS_320W_240H_15S_FORMAT;
import static androidx.media3.transformer.AndroidTestUtil.MP4_ASSET_WITH_INCREASING_TIMESTAMPS_320W_240H_15S_URI_STRING;
import static androidx.media3.transformer.AndroidTestUtil.MP4_ASSET_WITH_INCREASING_TIMESTAMPS_URI_STRING;
import static androidx.media3.transformer.AndroidTestUtil.MP4_ASSET_WITH_SHORTER_AUDIO_FORMAT;
import static androidx.media3.transformer.AndroidTestUtil.MP4_ASSET_WITH_SHORTER_AUDIO_URI_STRING;
import static androidx.media3.transformer.AndroidTestUtil.MP4_TRIM_OPTIMIZATION_180_URI_STRING;
import static androidx.media3.transformer.AndroidTestUtil.MP4_TRIM_OPTIMIZATION_270_URI_STRING;
import static androidx.media3.transformer.AndroidTestUtil.MP4_TRIM_OPTIMIZATION_URI_STRING;
@ -186,6 +188,46 @@ public class TransformerEndToEndTest {
assertThat(result.exportResult.processedInputs).hasSize(9);
}
@Test
public void compositionEditing_withLongLoopingSequence_completes() throws Exception {
Transformer transformer = new Transformer.Builder(context).build();
assumeFormatsSupported(
context, testId, /* inputFormat= */ MP4_ASSET_FORMAT, /* outputFormat= */ MP4_ASSET_FORMAT);
EditedMediaItem imageItem =
new EditedMediaItem.Builder(MediaItem.fromUri(JPG_ASSET_URI_STRING))
.setDurationUs(500_000)
.setFrameRate(30)
.build();
EditedMediaItemSequence imageSequence = new EditedMediaItemSequence(imageItem);
EditedMediaItem.Builder audioBuilder =
new EditedMediaItem.Builder(MediaItem.fromUri(MP4_ASSET_URI_STRING)).setRemoveVideo(true);
EditedMediaItemSequence loopingAudioSequence =
new EditedMediaItemSequence(
ImmutableList.of(
audioBuilder
.setEffects(
new Effects(
ImmutableList.of(createSonic(/* pitch= */ 0.4f)),
/* videoEffects= */ ImmutableList.of()))
.build()),
/* isLooping= */ true);
Composition composition = new Composition.Builder(imageSequence, loopingAudioSequence).build();
ExportTestResult result =
new TransformerAndroidTestRunner.Builder(context, transformer)
.build()
.run(testId, composition);
// Image asset duration is ~0.5s.
// loopingAudioSequence: Matches other sequence (~0.5s) and is cut short.
assertThat(result.exportResult.durationMs).isAtLeast(450);
assertThat(result.exportResult.durationMs).isAtMost(500);
}
@Test
public void videoEditing_withImageInput_completesWithCorrectFrameCountAndDuration()
throws Exception {
@ -905,7 +947,7 @@ public class TransformerEndToEndTest {
// The input video is 15.537 seconds.
// 3 / 0.5 + 3 / 0.75 + 3 + 3 / 1.5 + 3.537 / 2 rounds up to 16_770
assertThat(result.exportResult.durationMs).isAtLeast(16_750);
assertThat(result.exportResult.durationMs).isAtLeast(16_720);
assertThat(result.exportResult.durationMs).isAtMost(16_770);
}
@ -1067,7 +1109,14 @@ public class TransformerEndToEndTest {
assertThat(result.exportResult.processedInputs).hasSize(6);
assertThat(result.exportResult.channelCount).isEqualTo(1);
assertThat(result.exportResult.videoFrameCount).isEqualTo(90);
assertThat(result.exportResult.durationMs).isEqualTo(2980);
// Audio encoders on different API levels output different audio durations for the same input.
// On emulator, API 26 always outputs one access unit (23ms) of audio more than API 33.
// If the video track is a lot longer than the audio track, then this API difference wouldn't be
// seen in this check as the duration is determined by the last video frame.
// However, if the audio track is roughly as long as the video track, this API difference
// will be seen in result.exportResult.durationMs.
assertThat(result.exportResult.durationMs).isAtLeast(2970);
assertThat(result.exportResult.durationMs).isAtMost(3020);
}
@Test
@ -1097,7 +1146,14 @@ public class TransformerEndToEndTest {
assertThat(result.exportResult.processedInputs).hasSize(7);
assertThat(result.exportResult.channelCount).isEqualTo(1);
assertThat(result.exportResult.videoFrameCount).isEqualTo(92);
assertThat(result.exportResult.durationMs).isEqualTo(3105);
// Audio encoders on different API levels output different audio durations for the same input.
// On emulator, API 26 always outputs one access unit (23ms) of audio more than API 33.
// If the video track is a lot longer than the audio track, then this API difference wouldn't be
// seen in this check as the duration is determined by the last video frame.
// However, if the audio track is roughly as long as the video track, this API difference
// will be seen in result.exportResult.durationMs.
assertThat(result.exportResult.durationMs).isAtLeast(3100);
assertThat(result.exportResult.durationMs).isAtMost(3150);
}
@Test
@ -1125,7 +1181,14 @@ public class TransformerEndToEndTest {
assertThat(result.exportResult.processedInputs).hasSize(7);
assertThat(result.exportResult.channelCount).isEqualTo(1);
assertThat(result.exportResult.durationMs).isEqualTo(3133);
// Audio encoders on different API levels output different audio durations for the same input.
// On emulator, API 26 always outputs one access unit (23ms) of audio more than API 33.
// If the video track is a lot longer than the audio track, then this API difference wouldn't be
// seen in this check as the duration is determined by the last video frame.
// However, if the audio track is roughly as long as the video track, this API difference
// will be seen in result.exportResult.durationMs.
assertThat(result.exportResult.durationMs).isAtLeast(3120);
assertThat(result.exportResult.durationMs).isAtMost(3140);
assertThat(result.exportResult.videoFrameCount).isEqualTo(95);
}
@ -1152,7 +1215,14 @@ public class TransformerEndToEndTest {
assertThat(result.exportResult.processedInputs).hasSize(3);
assertThat(result.exportResult.channelCount).isEqualTo(1);
assertThat(result.exportResult.durationMs).isEqualTo(1000);
// Audio encoders on different API levels output different audio durations for the same input.
// On emulator, API 26 always outputs one access unit (23ms) of audio more than API 33.
// If the video track is a lot longer than the audio track, then this API difference wouldn't be
// seen in this check as the duration is determined by the last video frame.
// However, if the audio track is roughly as long as the video track, this API difference
// will be seen in result.exportResult.durationMs.
assertThat(result.exportResult.durationMs).isAtLeast(1000);
assertThat(result.exportResult.durationMs).isAtMost(1050);
}
@Test
@ -1398,6 +1468,132 @@ public class TransformerEndToEndTest {
.isEqualTo(9_500_000);
}
@Test
public void transcode_shorterAudio_extendsAudioTrack() throws Exception {
assumeFormatsSupported(
context,
testId,
/* inputFormat= */ MP4_ASSET_WITH_SHORTER_AUDIO_FORMAT,
/* outputFormat= */ MP4_ASSET_WITH_SHORTER_AUDIO_FORMAT);
Context context = ApplicationProvider.getApplicationContext();
Transformer transformer =
new Transformer.Builder(context)
.setEncoderFactory(new AndroidTestUtil.ForceEncodeEncoderFactory(context))
.build();
MediaItem mediaItem = MediaItem.fromUri(Uri.parse(MP4_ASSET_WITH_SHORTER_AUDIO_URI_STRING));
ExportTestResult exportTestResult =
new TransformerAndroidTestRunner.Builder(context, transformer)
.build()
.run(testId, mediaItem);
Mp4Extractor mp4Extractor = new Mp4Extractor(new DefaultSubtitleParserFactory());
FakeExtractorOutput fakeExtractorOutput =
TestUtil.extractAllSamplesFromFilePath(mp4Extractor, exportTestResult.filePath);
assertThat(fakeExtractorOutput.seekMap.getDurationUs()).isAtLeast(1_150_000);
assertThat(fakeExtractorOutput.seekMap.getDurationUs()).isAtMost(1_250_000);
assertThat(fakeExtractorOutput.numberOfTracks).isEqualTo(2);
for (int i = 0; i < fakeExtractorOutput.numberOfTracks; ++i) {
FakeTrackOutput trackOutput = fakeExtractorOutput.trackOutputs.get(i);
int sampleCount = trackOutput.getSampleCount();
assertThat(trackOutput.getSampleTimeUs(/* index= */ 0)).isEqualTo(0);
if (MimeTypes.isVideo(trackOutput.lastFormat.sampleMimeType)) {
assertThat(trackOutput.getSampleTimeUs(/* index= */ sampleCount - 1)).isEqualTo(1_183_333);
} else {
// Input has 800ms audio. Output should be closer to 1.2s.
// Audio encoders on different API levels output different audio durations for the same
// input.
// E.g. on emulator, API 26 always outputs one access unit (23ms) of audio more than API 33.
assertThat(trackOutput.getSampleTimeUs(/* index= */ sampleCount - 1)).isAtLeast(1_150_000);
assertThat(trackOutput.getSampleTimeUs(/* index= */ sampleCount - 1)).isAtMost(1_250_000);
}
}
}
@Test
public void transcode_shorterAudioSequence_extendsAudioTrack() throws Exception {
assumeFormatsSupported(
context,
testId,
/* inputFormat= */ MP4_ASSET_WITH_SHORTER_AUDIO_FORMAT,
/* outputFormat= */ MP4_ASSET_WITH_SHORTER_AUDIO_FORMAT);
assumeTrue(
"Old SDKs have large audio encoder buffer, and hits deadlocks due to b/329087277.",
Util.SDK_INT >= 31);
Context context = ApplicationProvider.getApplicationContext();
Transformer transformer = new Transformer.Builder(context).build();
MediaItem mediaItem = MediaItem.fromUri(Uri.parse(MP4_ASSET_WITH_SHORTER_AUDIO_URI_STRING));
EditedMediaItem editedMediaItem = new EditedMediaItem.Builder(mediaItem).build();
Composition composition =
new Composition.Builder(new EditedMediaItemSequence(editedMediaItem, editedMediaItem))
.build();
ExportTestResult exportTestResult =
new TransformerAndroidTestRunner.Builder(context, transformer)
.build()
.run(testId, composition);
Mp4Extractor mp4Extractor = new Mp4Extractor(new DefaultSubtitleParserFactory());
FakeExtractorOutput fakeExtractorOutput =
TestUtil.extractAllSamplesFromFilePath(mp4Extractor, exportTestResult.filePath);
assertThat(fakeExtractorOutput.seekMap.getDurationUs()).isEqualTo(2_400_000);
assertThat(fakeExtractorOutput.numberOfTracks).isEqualTo(2);
// Check that both video and audio tracks have a duration close to 2.4 seconds.
for (int i = 0; i < fakeExtractorOutput.numberOfTracks; ++i) {
FakeTrackOutput trackOutput = fakeExtractorOutput.trackOutputs.get(i);
int sampleCount = trackOutput.getSampleCount();
assertThat(trackOutput.getSampleTimeUs(/* index= */ 0)).isEqualTo(0);
if (MimeTypes.isVideo(trackOutput.lastFormat.sampleMimeType)) {
assertThat(trackOutput.getSampleTimeUs(/* index= */ sampleCount - 1)).isEqualTo(2_383_333);
} else {
// Input has 800ms audio. Output should be closer to 2.4s.
// Audio encoders on different API levels output different audio durations for the same
// input.
// On emulator, API 26 always outputs one access unit (23ms) of audio more than API 33.
assertThat(trackOutput.getSampleTimeUs(/* index= */ sampleCount - 1)).isAtLeast(2_300_000);
assertThat(trackOutput.getSampleTimeUs(/* index= */ sampleCount - 1)).isAtMost(2_400_000);
}
}
}
@Test
public void speedAdjustedMedia_shorterAudioTrack_completesWithCorrectDuration() throws Exception {
assumeFormatsSupported(
context,
testId,
/* inputFormat= */ MP4_ASSET_WITH_SHORTER_AUDIO_FORMAT,
/* outputFormat= */ MP4_ASSET_WITH_SHORTER_AUDIO_FORMAT);
Transformer transformer = new Transformer.Builder(context).build();
SpeedProvider speedProvider =
TestSpeedProvider.createWithStartTimes(
new long[] {0L, 1L * C.MICROS_PER_SECOND}, new float[] {1f, 0.5f});
Pair<AudioProcessor, Effect> speedEffect =
Effects.createExperimentalSpeedChangingEffect(speedProvider);
Effects effects =
new Effects(
/* audioProcessors= */ ImmutableList.of(speedEffect.first),
/* videoEffects= */ ImmutableList.of(speedEffect.second));
EditedMediaItem editedMediaItem =
new EditedMediaItem.Builder(MediaItem.fromUri(MP4_ASSET_WITH_SHORTER_AUDIO_URI_STRING))
.setEffects(effects)
.build();
ExportTestResult result =
new TransformerAndroidTestRunner.Builder(context, transformer)
.build()
.run(testId, editedMediaItem);
// Last video frame PTS is 1.183333s.
// Expected output duration: (1.183333 - 1) * 2 + 1 = 1.366667s, since content after 1s plays at 0.5x speed.
// Audio encoders on different API levels output different audio durations for the same input.
// On emulator, API 26 always outputs one access unit (23ms) of audio more than API 33.
// If the video track is a lot longer than the audio track, then this API difference wouldn't be
// seen in this check as the duration is determined by the last video frame.
// However, if the audio track is roughly as long as the video track, this API difference
// will be seen in result.exportResult.durationMs.
assertThat(result.exportResult.durationMs).isAtLeast(1_360);
assertThat(result.exportResult.durationMs).isAtMost(1_400);
}
private static AudioProcessor createSonic(float pitch) {
SonicAudioProcessor sonic = new SonicAudioProcessor();
sonic.setPitch(pitch);

View File

@ -37,6 +37,7 @@ import androidx.media3.common.audio.ChannelMixingMatrix;
import androidx.media3.common.audio.SonicAudioProcessor;
import androidx.media3.common.audio.SpeedChangingAudioProcessor;
import androidx.media3.common.util.NullableType;
import androidx.media3.common.util.Util;
import androidx.media3.decoder.DecoderInputBuffer;
import com.google.common.collect.ImmutableList;
import java.nio.ByteBuffer;
@ -57,22 +58,27 @@ import java.util.concurrent.atomic.AtomicReference;
* "processing" thread.
*/
/* package */ final class AudioGraphInput implements GraphInput {
private static final long MAX_AUDIO_DRIFT_ALLOWED_US = 2000;
private static final int MAX_INPUT_BUFFER_COUNT = 10;
private final AudioFormat outputAudioFormat;
// TODO(b/260618558): Move silent audio generation upstream of this component.
private final SilentAudioGenerator silentAudioGenerator;
private final Queue<DecoderInputBuffer> availableInputBuffers;
private final Queue<DecoderInputBuffer> pendingInputBuffers;
private final AtomicReference<@NullableType MediaItemChange> pendingMediaItemChange;
private final AtomicLong startTimeUs;
// silentAudioGenerator.audioFormat must match the current media item's input format.
private SilentAudioGenerator silentAudioGenerator;
@Nullable private DecoderInputBuffer currentInputBufferBeingOutput;
private AudioProcessingPipeline audioProcessingPipeline;
private boolean processedFirstMediaItemChange;
private boolean receivedEndOfStreamFromInput;
private boolean queueEndOfStreamAfterSilence;
private boolean inputBlocked;
private long currentItemExpectedInputDurationUs;
private long currentItemInputBytesRead;
private boolean currentItemSilenceAppended;
private boolean isCurrentItemLast;
/**
* Creates an instance.
@ -108,6 +114,7 @@ import java.util.concurrent.atomic.AtomicReference;
checkArgument(
outputAudioFormat.encoding == C.ENCODING_PCM_16BIT, /* errorMessage= */ outputAudioFormat);
startTimeUs = new AtomicLong(C.TIME_UNSET);
currentItemExpectedInputDurationUs = C.TIME_UNSET;
}
/** Returns the {@link AudioFormat} of {@linkplain #getOutput() output buffers}. */
@ -141,6 +148,8 @@ import java.util.concurrent.atomic.AtomicReference;
/**
* {@inheritDoc}
*
* <p>When durationUs is {@link C#TIME_UNSET}, silence generation is disabled.
*
* <p>Should only be called by the input thread.
*/
@Override
@ -246,6 +255,10 @@ import java.util.concurrent.atomic.AtomicReference;
receivedEndOfStreamFromInput = false;
queueEndOfStreamAfterSilence = false;
startTimeUs.set(C.TIME_UNSET);
currentItemExpectedInputDurationUs = C.TIME_UNSET;
currentItemInputBytesRead = 0;
currentItemSilenceAppended = false;
isCurrentItemLast = false;
}
/**
@ -303,12 +316,21 @@ import java.util.concurrent.atomic.AtomicReference;
@Nullable DecoderInputBuffer pendingInputBuffer = pendingInputBuffers.peek();
if (pendingInputBuffer == null) {
if (pendingMediaItemChange.get() != null) {
if (shouldAppendSilence()) {
appendSilence();
return true;
}
audioProcessingPipeline.queueEndOfStream();
}
return false;
}
if (pendingInputBuffer.isEndOfStream()) {
if (shouldAppendSilence()) {
appendSilence();
clearAndAddToAvailableBuffers(pendingInputBuffers.remove());
return true;
}
audioProcessingPipeline.queueEndOfStream();
receivedEndOfStreamFromInput = true;
clearAndAddToAvailableBuffers(pendingInputBuffers.remove());
@ -316,7 +338,10 @@ import java.util.concurrent.atomic.AtomicReference;
}
ByteBuffer inputData = checkNotNull(pendingInputBuffer.data);
long bytesRemaining = inputData.remaining();
audioProcessingPipeline.queueInput(inputData);
long bytesConsumed = bytesRemaining - inputData.remaining();
currentItemInputBytesRead += bytesConsumed;
if (inputData.hasRemaining()) {
return false;
}
@ -332,19 +357,21 @@ import java.util.concurrent.atomic.AtomicReference;
// When output is fed directly from input, the output ByteBuffer is linked to a specific
// DecoderInputBuffer. Therefore it must be consumed by the downstream component before it can
// be used for fresh input.
@Nullable DecoderInputBuffer previousOutputBuffer = currentInputBufferBeingOutput;
if (previousOutputBuffer != null) {
ByteBuffer data = checkStateNotNull(previousOutputBuffer.data);
if (currentInputBufferBeingOutput != null) {
ByteBuffer data = checkStateNotNull(currentInputBufferBeingOutput.data);
if (data.hasRemaining()) {
// Currently output data has not been consumed, return it.
return data;
}
clearAndAddToAvailableBuffers(previousOutputBuffer);
clearAndAddToAvailableBuffers(checkStateNotNull(currentInputBufferBeingOutput));
currentInputBufferBeingOutput = null;
}
@Nullable DecoderInputBuffer currentInputBuffer = pendingInputBuffers.poll();
if (currentInputBuffer == null) {
if (pendingMediaItemChange.get() != null && shouldAppendSilence()) {
appendSilence();
}
return EMPTY_BUFFER;
}
@Nullable ByteBuffer currentInputBufferData = currentInputBuffer.data;
@ -356,10 +383,16 @@ import java.util.concurrent.atomic.AtomicReference;
|| !currentInputBufferData.hasRemaining()
|| receivedEndOfStreamFromInput) {
clearAndAddToAvailableBuffers(currentInputBuffer);
if (receivedEndOfStreamFromInput && shouldAppendSilence()) {
appendSilence();
}
return EMPTY_BUFFER;
}
currentInputBufferBeingOutput = currentInputBuffer;
// Bytes from currentInputBufferBeingOutput will be read over multiple calls to this method.
// Add all bytes now; this line is reached only once per input buffer.
currentItemInputBytesRead += currentInputBufferData.remaining();
return currentInputBufferData;
}
@ -400,16 +433,29 @@ import java.util.concurrent.atomic.AtomicReference;
private void configureForPendingMediaItemChange() throws UnhandledAudioFormatException {
MediaItemChange pendingChange = checkStateNotNull(pendingMediaItemChange.get());
currentItemInputBytesRead = 0;
isCurrentItemLast = pendingChange.isLast;
currentItemSilenceAppended = false;
AudioFormat pendingAudioFormat;
if (pendingChange.format != null) {
currentItemExpectedInputDurationUs = pendingChange.durationUs;
pendingAudioFormat = new AudioFormat(pendingChange.format);
silentAudioGenerator = new SilentAudioGenerator(pendingAudioFormat);
} else { // Generating silence
// No audio track. Generate silence based on video track duration after applying effects.
if (pendingChange.editedMediaItem.effects.audioProcessors.isEmpty()) {
// No audio track and no effects.
// Generate silence based on video track duration after applying effects.
currentItemExpectedInputDurationUs =
pendingChange.editedMediaItem.getDurationAfterEffectsApplied(pendingChange.durationUs);
} else {
// No audio track, but effects are present.
// Generate audio track based on video duration, and apply effects.
currentItemExpectedInputDurationUs = pendingChange.durationUs;
}
pendingAudioFormat = silentAudioGenerator.audioFormat;
startTimeUs.compareAndSet(/* expectedValue= */ C.TIME_UNSET, /* newValue= */ 0);
silentAudioGenerator.addSilence(pendingChange.durationUs);
if (pendingChange.isLast) {
queueEndOfStreamAfterSilence = true;
}
appendSilence();
}
if (processedFirstMediaItemChange) {
@ -427,6 +473,27 @@ import java.util.concurrent.atomic.AtomicReference;
processedFirstMediaItemChange = true;
}
private boolean shouldAppendSilence() {
return !currentItemSilenceAppended
&& currentItemExpectedInputDurationUs != C.TIME_UNSET
&& currentItemExpectedInputDurationUs - currentItemActualInputDurationUs()
> MAX_AUDIO_DRIFT_ALLOWED_US;
}
private void appendSilence() {
silentAudioGenerator.addSilence(
currentItemExpectedInputDurationUs - currentItemActualInputDurationUs());
currentItemSilenceAppended = true;
if (isCurrentItemLast) {
queueEndOfStreamAfterSilence = true;
}
}
private long currentItemActualInputDurationUs() {
long samplesOutput = currentItemInputBytesRead / silentAudioGenerator.audioFormat.bytesPerFrame;
return Util.sampleCountToDurationUs(samplesOutput, silentAudioGenerator.audioFormat.sampleRate);
}
/**
* Returns a new configured {@link AudioProcessingPipeline}.
*

View File

@ -131,8 +131,10 @@ import java.util.Objects;
}
}
// During playback, AudioGraphInput doesn't know the full media duration upfront due to seeking.
// Pass in C.TIME_UNSET to AudioGraphInput.onMediaItemChanged.
outputGraphInput.onMediaItemChanged(
editedMediaItem, editedMediaItem.durationUs, currentInputFormat, /* isLast= */ false);
editedMediaItem, C.TIME_UNSET, currentInputFormat, /* isLast= */ false);
}
@Override

View File

@ -102,6 +102,7 @@ import java.util.concurrent.atomic.AtomicInteger;
private volatile boolean released;
private volatile long currentAssetDurationUs;
private volatile long currentAssetDurationAfterEffectsAppliedUs;
private volatile long maxSequenceDurationUs;
private volatile boolean isMaxSequenceDurationUsFinal;
@ -334,7 +335,9 @@ import java.util.concurrent.atomic.AtomicInteger;
onMediaItemChangedListener.onMediaItemChanged(
editedMediaItems.get(currentMediaItemIndex),
currentAssetDurationUs,
/* durationUs= */ (trackType == C.TRACK_TYPE_AUDIO && isLooping && decodeAudio)
? C.TIME_UNSET
: currentAssetDurationUs,
/* decodedFormat= */ outputFormat,
/* isLast= */ currentMediaItemIndex == editedMediaItems.size() - 1);
}
@ -364,11 +367,11 @@ import java.util.concurrent.atomic.AtomicInteger;
checkArgument(
durationUs != C.TIME_UNSET || currentMediaItemIndex == editedMediaItems.size() - 1,
"Could not retrieve required duration for EditedMediaItem " + currentMediaItemIndex);
durationUs =
currentAssetDurationAfterEffectsAppliedUs =
editedMediaItems.get(currentMediaItemIndex).getDurationAfterEffectsApplied(durationUs);
currentAssetDurationUs = durationUs;
if (editedMediaItems.size() == 1 && !isLooping) {
sequenceAssetLoaderListener.onDurationUs(durationUs);
sequenceAssetLoaderListener.onDurationUs(currentAssetDurationAfterEffectsAppliedUs);
}
}
@ -528,7 +531,7 @@ import java.util.concurrent.atomic.AtomicInteger;
return;
}
addCurrentProcessedInput();
totalDurationUs += currentAssetDurationUs;
totalDurationUs += currentAssetDurationAfterEffectsAppliedUs;
currentAssetLoader.release();
isCurrentAssetFirstAsset = false;
currentMediaItemIndex++;

View File

@ -60,6 +60,7 @@ import java.util.concurrent.atomic.AtomicLong;
long durationUs,
@Nullable Format decodedFormat,
boolean isLast) {
durationUs = editedMediaItem.getDurationAfterEffectsApplied(durationUs);
if (decodedFormat != null) {
Size decodedSize = getDecodedSize(decodedFormat);
videoFrameProcessor.registerInputStream(

View File

@ -18,7 +18,10 @@ package androidx.media3.transformer;
import static androidx.media3.common.util.Assertions.checkState;
import static androidx.media3.common.util.Util.getPcmFormat;
import static androidx.media3.transformer.TestUtil.createSpeedChangingAudioProcessor;
import static com.google.common.truth.Truth.assertThat;
import static java.util.Collections.max;
import static java.util.Collections.min;
import androidx.media3.common.C;
import androidx.media3.common.Format;
@ -28,6 +31,7 @@ import androidx.media3.common.util.Util;
import androidx.media3.decoder.DecoderInputBuffer;
import androidx.media3.test.utils.TestUtil;
import androidx.test.ext.junit.runners.AndroidJUnit4;
import com.google.common.collect.ImmutableList;
import com.google.common.primitives.Bytes;
import java.nio.ByteBuffer;
import java.util.ArrayList;
@ -40,6 +44,12 @@ import org.junit.runner.RunWith;
public class AudioGraphInputTest {
private static final EditedMediaItem FAKE_ITEM =
new EditedMediaItem.Builder(MediaItem.EMPTY).build();
private static final EditedMediaItem FAKE_ITEM_WITH_DOUBLE_SPEED =
new EditedMediaItem.Builder(MediaItem.EMPTY)
.setEffects(
new Effects(
ImmutableList.of(createSpeedChangingAudioProcessor(2)), ImmutableList.of()))
.build();
private static final AudioFormat MONO_44100 =
new AudioFormat(/* sampleRate= */ 44_100, /* channelCount= */ 1, C.ENCODING_PCM_16BIT);
private static final AudioFormat MONO_48000 =
@ -250,10 +260,10 @@ public class AudioGraphInputTest {
/* editedMediaItem= */ FAKE_ITEM,
/* inputFormat= */ getPcmFormat(STEREO_44100));
byte[] inputData = TestUtil.buildTestData(/* length= */ 100 * STEREO_44100.bytesPerFrame);
// Pass in duration approximately equal to raw data duration ~ 100 / 44100 ~ 2267us.
audioGraphInput.onMediaItemChanged(
/* editedMediaItem= */ FAKE_ITEM,
/* durationUs= */ 1_000_000,
/* durationUs= */ 2267,
/* decodedFormat= */ getPcmFormat(STEREO_44100),
/* isLast= */ true);
@ -274,6 +284,90 @@ public class AudioGraphInputTest {
assertThat(outputBytes).containsExactlyElementsIn(Bytes.asList(inputData));
}
@Test
public void getOutput_withNoEffects_returnsInputDataAndSilence() throws Exception {
AudioGraphInput audioGraphInput =
new AudioGraphInput(
/* requestedOutputAudioFormat= */ AudioFormat.NOT_SET,
/* editedMediaItem= */ FAKE_ITEM,
/* inputFormat= */ getPcmFormat(STEREO_44100));
byte[] inputData = TestUtil.buildTestData(/* length= */ 100 * STEREO_44100.bytesPerFrame);
audioGraphInput.onMediaItemChanged(
/* editedMediaItem= */ FAKE_ITEM,
/* durationUs= */ 1_000_000,
/* decodedFormat= */ getPcmFormat(STEREO_44100),
/* isLast= */ true);
// Force the media item change to be processed.
checkState(!audioGraphInput.getOutput().hasRemaining());
// Queue inputData: 100 * STEREO_44100.bytesPerFrame bytes = 100 PCM samples.
// Audio duration is 100 / 44100 seconds ~ 2_268us.
DecoderInputBuffer inputBuffer = audioGraphInput.getInputBuffer();
inputBuffer.ensureSpaceForWrite(inputData.length);
inputBuffer.data.put(inputData).flip();
checkState(audioGraphInput.queueInputBuffer());
// Queue EOS. Input audio track ends before onMediaItemChanged durationUs = 1_000_000.
// AudioGraphInput will append generated silence up to target durationUs of 1s (~997_732us).
audioGraphInput.getInputBuffer().setFlags(C.BUFFER_FLAG_END_OF_STREAM);
checkState(audioGraphInput.queueInputBuffer());
List<Byte> outputBytes = drainAudioGraphInputUntilEnded(audioGraphInput);
long expectedSampleCount = Util.durationUsToSampleCount(1_000_000, STEREO_44100.sampleRate);
// Silent audio generator rounds up duration.
assertThat(outputBytes.size())
.isEqualTo((expectedSampleCount + 1) * STEREO_44100.bytesPerFrame);
assertThat(outputBytes.subList(0, inputData.length))
.containsExactlyElementsIn(Bytes.asList(inputData))
.inOrder();
assertThat(min(outputBytes.subList(inputData.length, outputBytes.size()))).isEqualTo(0);
assertThat(max(outputBytes.subList(inputData.length, outputBytes.size()))).isEqualTo(0);
}
@Test
public void getOutput_withEffects_returnsInputDataAndSilence() throws Exception {
AudioGraphInput audioGraphInput =
new AudioGraphInput(
/* requestedOutputAudioFormat= */ AudioFormat.NOT_SET,
/* editedMediaItem= */ FAKE_ITEM_WITH_DOUBLE_SPEED,
/* inputFormat= */ getPcmFormat(STEREO_44100));
byte[] inputData = TestUtil.buildTestData(/* length= */ 4096 * STEREO_44100.bytesPerFrame);
audioGraphInput.onMediaItemChanged(
/* editedMediaItem= */ FAKE_ITEM_WITH_DOUBLE_SPEED,
/* durationUs= */ 1_000_000,
/* decodedFormat= */ getPcmFormat(STEREO_44100),
/* isLast= */ true);
// Force the media item change to be processed.
checkState(!audioGraphInput.getOutput().hasRemaining());
// Queue inputData: 4096 * STEREO_44100.bytesPerFrame bytes = 4096 PCM samples.
// Audio duration is 4096 / 44100 seconds ~ 92_880us.
DecoderInputBuffer inputBuffer = audioGraphInput.getInputBuffer();
inputBuffer.ensureSpaceForWrite(inputData.length);
inputBuffer.data.put(inputData).flip();
checkState(audioGraphInput.queueInputBuffer());
// Queue EOS. Input audio track ends before onMediaItemChanged durationUs = 1_000_000.
// AudioGraphInput will append generated silence up to target durationUs of 1s (~907_120us).
audioGraphInput.getInputBuffer().setFlags(C.BUFFER_FLAG_END_OF_STREAM);
checkState(audioGraphInput.queueInputBuffer());
List<Byte> outputBytes = drainAudioGraphInputUntilEnded(audioGraphInput);
long expectedSampleCount = Util.durationUsToSampleCount(500_000, STEREO_44100.sampleRate);
// Silent audio generator rounds up duration.
assertThat(outputBytes.size())
.isEqualTo((expectedSampleCount + 1) * STEREO_44100.bytesPerFrame);
// Sonic takes a while to zero-out the input.
assertThat(min(outputBytes.subList(inputData.length * 6 / 10, outputBytes.size())))
.isEqualTo(0);
assertThat(max(outputBytes.subList(inputData.length * 6 / 10, outputBytes.size())))
.isEqualTo(0);
}
@Test
public void getOutput_withSilentMediaItemChange_outputsCorrectAmountOfSilentBytes()
throws Exception {
@ -294,6 +388,26 @@ public class AudioGraphInputTest {
assertThat(bytesOutput).isEqualTo(expectedSampleCount * STEREO_44100.bytesPerFrame);
}
@Test
public void getOutput_withSilentMediaItemAndEffectsChange_outputsCorrectAmountOfSilentBytes()
throws Exception {
AudioGraphInput audioGraphInput =
new AudioGraphInput(
/* requestedOutputAudioFormat= */ AudioFormat.NOT_SET,
/* editedMediaItem= */ FAKE_ITEM_WITH_DOUBLE_SPEED,
/* inputFormat= */ getPcmFormat(STEREO_44100));
audioGraphInput.onMediaItemChanged(
/* editedMediaItem= */ FAKE_ITEM_WITH_DOUBLE_SPEED,
/* durationUs= */ 1_000_000,
/* decodedFormat= */ null,
/* isLast= */ true);
int bytesOutput = drainAudioGraphInputUntilEnded(audioGraphInput).size();
long expectedSampleCount = Util.durationUsToSampleCount(500_000, STEREO_44100.sampleRate);
assertThat(bytesOutput).isEqualTo(expectedSampleCount * STEREO_44100.bytesPerFrame);
}
@Test
public void getOutput_afterFlush_returnsEmptyBuffer() throws Exception {
AudioGraphInput audioGraphInput =

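A note on the expectedSampleCount + 1 assertions in the two padding tests above: the silent audio generator rounds the appended silence up to a whole frame, which leaves the output one sample longer than an exact second. A quick standalone check of the no-effects case, assuming the consumed-input duration truncates and the silence frame count rounds up (the rounding modes are assumptions; the asserted total comes from the test):

// Standalone arithmetic check, not test code; rounding behaviour here is assumed.
public final class SilenceRoundingCheck {
  public static void main(String[] args) {
    int sampleRate = 44_100;
    long inputFrames = 100;                                                // queued PCM frames
    long inputUs = inputFrames * 1_000_000L / sampleRate;                  // 2_267us (truncated)
    long silenceUs = 1_000_000L - inputUs;                                 // 997_733us shortfall
    long silenceFrames = (silenceUs * sampleRate + 999_999L) / 1_000_000L; // 44_001 (rounded up)
    long totalFrames = inputFrames + silenceFrames;                        // 44_101
    long exactSecondFrames = 44_100;                                       // samples in exactly 1s
    System.out.println(totalFrames == exactSecondFrames + 1);              // true
  }
}
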
View File

@ -84,6 +84,12 @@ public final class TestUtil {
return sonicAudioProcessor;
}
public static SonicAudioProcessor createSpeedChangingAudioProcessor(float speed) {
SonicAudioProcessor sonicAudioProcessor = new SonicAudioProcessor();
sonicAudioProcessor.setSpeed(speed);
return sonicAudioProcessor;
}
public static ChannelMixingAudioProcessor createVolumeScalingAudioProcessor(float scale) {
ChannelMixingAudioProcessor audioProcessor = new ChannelMixingAudioProcessor();
for (int channel = 1; channel <= 6; channel++) {