Add fps-awareness to DefaultTrackSelector

This change aims to prioritise tracks that have a 'smooth enough for
video' frame rate, without always selecting the track with the highest
frame rate.

In particular MP4 files extracted from motion photos sometimes have two
HEVC tracks, with the higher-res one having a very low frame rate (not
intended for use in video playback). Before this change
`DefaultTrackSelector` would pick the low-fps, high-res track.

This change adds a somewhat arbitrary 10fps threshold for "smooth video
playback", meaning any tracks at or above this threshold are selected in
preference to tracks below it (or with no frame rate set). Within the
tracks that meet the threshold, other attributes are used to select the
preferred track. We deliberately
don't pick the highest-fps track (over pixel count and bitrate), because
most users would prefer to see a 30fps 4k track over a 60fps 720p track.

This change also includes a test MP4 file, extracted from the existing
`jpeg/pixel-motion-photo-2-hevc-tracks.jpg` file by logging
`mp4StartPosition` in
[`MotionPhotoDescription.getMotionPhotoMetadata`](b930b40a16/libraries/extractor/src/main/java/androidx/media3/extractor/jpeg/MotionPhotoDescription.java#L123)
and then using `dd`:

```
mp4StartPosition=2603594

$ dd if=jpeg/pixel-motion-photo-2-hevc-tracks.jpg \
    of=mp4/pixel-motion-photo-2-hevc-tracks.mp4 \
    bs=1 \
    skip=2603594
```

----

This solution is in addition to the `JpegMotionPhotoExtractor` change
made specifically for these two-track motion photos in
5266c71b3a.
We will keep both changes, even though that change is not strictly
needed after this one, because adding the role flags helps to
communicate more clearly the intended usage of these tracks. This
change to consider FPS seems like a generally useful improvement to
`DefaultTrackSelector`, since it seems unlikely we would prefer a 5fps
video track over a 30fps one.

Issue: androidx/media#1051
PiperOrigin-RevId: 611015459
This commit is contained in:
ibaker 2024-02-28 01:05:40 -08:00 committed by Copybara-Service
parent 626a8adfd8
commit c7e00b12b4
6 changed files with 519 additions and 2 deletions

View File

@ -18,6 +18,12 @@
* Add support for changing between SDR and HDR input media in a sequence.
* Add support for composition-level audio effects.
* Track Selection:
* `DefaultTrackSelector`: Prefer video tracks with a 'reasonable' frame
rate (>=10fps) over those with a lower or unset frame rate. This ensures
the player selects the 'real' video track in MP4s extracted from motion
photos that can contain two HEVC tracks where one has a higher
resolution but a very small number of frames
([#1051](https://github.com/androidx/media/issues/1051)).
* Extractors:
* Audio:
* Allow renderer recovery by disabling offload if audio track fails to

View File

@ -3516,6 +3516,12 @@ public class DefaultTrackSelector extends MappingTrackSelector
private static final class VideoTrackInfo extends TrackInfo<VideoTrackInfo> {
/**
* Frame rate below which video playback will definitely not be considered smooth by the human
* eye.
*/
private static final float MIN_REASONABLE_FRAME_RATE = 10;
public static ImmutableList<VideoTrackInfo> createForTrackGroup(
int rendererIndex,
TrackGroup trackGroup,
@ -3551,6 +3557,12 @@ public class DefaultTrackSelector extends MappingTrackSelector
private final Parameters parameters;
private final boolean isWithinMinConstraints;
private final boolean isWithinRendererCapabilities;
/**
* True if {@link Format#frameRate} is set and is at least {@link #MIN_REASONABLE_FRAME_RATE}.
*/
private final boolean hasReasonableFrameRate;
private final int bitrate;
private final int pixelCount;
private final int preferredMimeTypeMatchIndex;
@ -3599,6 +3611,8 @@ public class DefaultTrackSelector extends MappingTrackSelector
|| format.bitrate >= parameters.minVideoBitrate);
isWithinRendererCapabilities =
isSupported(formatSupport, /* allowExceedsCapabilities= */ false);
hasReasonableFrameRate =
format.frameRate != Format.NO_VALUE && format.frameRate >= MIN_REASONABLE_FRAME_RATE;
bitrate = format.bitrate;
pixelCount = format.getPixelCount();
preferredRoleFlagsScore =
@ -3669,16 +3683,19 @@ public class DefaultTrackSelector extends MappingTrackSelector
.compare(info1.preferredRoleFlagsScore, info2.preferredRoleFlagsScore)
// 2. Compare match with implicit content preferences set by the media.
.compareFalseFirst(info1.hasMainOrNoRoleFlag, info2.hasMainOrNoRoleFlag)
// 3. Compare match with technical preferences set by the parameters.
// 3. Compare match with 'reasonable' frame rate threshold.
.compareFalseFirst(info1.hasReasonableFrameRate, info2.hasReasonableFrameRate)
// 4. Compare match with technical preferences set by the parameters.
.compareFalseFirst(info1.isWithinMaxConstraints, info2.isWithinMaxConstraints)
.compareFalseFirst(info1.isWithinMinConstraints, info2.isWithinMinConstraints)
.compare(
info1.preferredMimeTypeMatchIndex,
info2.preferredMimeTypeMatchIndex,
Ordering.natural().reverse())
// 4. Compare match with renderer capability preferences.
// 5. Compare match with renderer capability preferences.
.compareFalseFirst(info1.usesPrimaryDecoder, info2.usesPrimaryDecoder)
.compareFalseFirst(info1.usesHardwareAcceleration, info2.usesHardwareAcceleration);
if (info1.usesPrimaryDecoder && info1.usesHardwareAcceleration) {
chain = chain.compare(info1.codecPreferenceScore, info2.codecPreferenceScore);
}

View File

@ -46,6 +46,7 @@ public class Mp4PlaybackTest {
"midroll-5s.mp4",
"postroll-5s.mp4",
"preroll-5s.mp4",
"pixel-motion-photo-2-hevc-tracks.mp4",
"sample_ac3_fragmented.mp4",
"sample_ac3.mp4",
"sample_ac4_fragmented.mp4",

View File

@ -2821,6 +2821,84 @@ public final class DefaultTrackSelectorTest {
assertFixedSelection(result.selections[0], trackGroups, formatAac);
}
/**
 * Verifies that a track group holding a single video track with a 'reasonable' frame rate is
 * selected instead of a larger group of tracks that all have lower frame rates (the larger group
 * of tracks would normally be preferred).
 */
@Test
public void selectTracks_reasonableFrameRatePreferredOverTrackCount() throws Exception {
  Format lowFps = VIDEO_FORMAT.buildUpon().setFrameRate(5).build();
  Format alsoLowFps = VIDEO_FORMAT.buildUpon().setFrameRate(6).build();
  Format smoothFps = VIDEO_FORMAT.buildUpon().setFrameRate(30).build();
  // Both low-fps formats share one adaptive group, so this checks that frame rate takes priority
  // over the number of tracks in a group.
  TrackGroup lowFpsAdaptiveGroup = new TrackGroup(lowFps, alsoLowFps);
  TrackGroup smoothFpsGroup = new TrackGroup(smoothFps);
  TrackGroupArray trackGroups = new TrackGroupArray(lowFpsAdaptiveGroup, smoothFpsGroup);
  RendererCapabilities[] rendererCapabilities = {VIDEO_CAPABILITIES};

  TrackSelectorResult result =
      trackSelector.selectTracks(rendererCapabilities, trackGroups, periodId, TIMELINE);

  assertFixedSelection(result.selections[0], trackGroups, smoothFps);
}
/**
 * Verifies that, among the video tracks with a 'reasonable' frame rate, the one that best matches
 * on other attributes is selected, rather than an otherwise preferred track whose frame rate is
 * lower or unset, and rather than simply the track with the highest frame rate.
 */
@Test
public void selectTracks_reasonableFrameRatePreferredButNotHighestFrameRate() throws Exception {
  Format unsetFps4k =
      VIDEO_FORMAT
          .buildUpon()
          .setFrameRate(Format.NO_VALUE)
          .setWidth(3840)
          .setHeight(2160)
          .build();
  Format lowFps4k =
      VIDEO_FORMAT.buildUpon().setFrameRate(5).setWidth(3840).setHeight(2160).build();
  Format smoothFps1080p =
      VIDEO_FORMAT.buildUpon().setFrameRate(30).setWidth(1920).setHeight(1080).build();
  Format highestFps720p =
      VIDEO_FORMAT.buildUpon().setFrameRate(60).setWidth(1280).setHeight(720).build();
  // Each format sits in its own group; the expected winner is deliberately listed last.
  TrackGroupArray trackGroups =
      new TrackGroupArray(
          new TrackGroup(unsetFps4k),
          new TrackGroup(lowFps4k),
          new TrackGroup(highestFps720p),
          new TrackGroup(smoothFps1080p));
  RendererCapabilities[] rendererCapabilities = {VIDEO_CAPABILITIES};

  TrackSelectorResult result =
      trackSelector.selectTracks(rendererCapabilities, trackGroups, periodId, TIMELINE);

  assertFixedSelection(result.selections[0], trackGroups, smoothFps1080p);
}
/**
 * Verifies that a track flagged {@link C#ROLE_FLAG_MAIN} is selected even though its frame rate is
 * 'unreasonably low', when the only track with a 'reasonable' frame rate is flagged {@link
 * C#ROLE_FLAG_ALTERNATE}. Role flags are an explicit signal from the media, so they should be
 * respected.
 */
@Test
public void selectTracks_roleFlagsOverrideReasonableFrameRate() throws Exception {
  Format lowFpsMain =
      VIDEO_FORMAT.buildUpon().setFrameRate(3).setRoleFlags(C.ROLE_FLAG_MAIN).build();
  Format smoothFpsAlternate =
      VIDEO_FORMAT.buildUpon().setFrameRate(30).setRoleFlags(C.ROLE_FLAG_ALTERNATE).build();
  TrackGroup mainGroup = new TrackGroup(lowFpsMain);
  TrackGroup alternateGroup = new TrackGroup(smoothFpsAlternate);
  TrackGroupArray trackGroups = new TrackGroupArray(mainGroup, alternateGroup);
  RendererCapabilities[] rendererCapabilities = {VIDEO_CAPABILITIES};

  TrackSelectorResult result =
      trackSelector.selectTracks(rendererCapabilities, trackGroups, periodId, TIMELINE);

  assertFixedSelection(result.selections[0], trackGroups, lowFpsMain);
}
/** Tests audio track selection when there are multiple audio renderers. */
@Test
public void selectTracks_multipleRenderer_allSelected() throws Exception {

View File

@ -0,0 +1,415 @@
MediaCodecAdapter (exotest.video.hevc):
inputBuffers:
count = 59
input buffer #0:
timeUs = 1000000000000
contents = length 175795, hash 92D88322
input buffer #1:
timeUs = 1000000033344
contents = length 32825, hash 9E4BBDC9
input buffer #2:
timeUs = 1000000066688
contents = length 30605, hash E792B0E1
input buffer #3:
timeUs = 1000000100033
contents = length 30292, hash C7D67400
input buffer #4:
timeUs = 1000000133377
contents = length 25928, hash EF6730FC
input buffer #5:
timeUs = 1000000166722
contents = length 23135, hash F7CCAB5
input buffer #6:
timeUs = 1000000200066
contents = length 32020, hash C948881C
input buffer #7:
timeUs = 1000000233244
contents = length 142480, hash 898726B
input buffer #8:
timeUs = 1000000266755
contents = length 28601, hash 158799EE
input buffer #9:
timeUs = 1000000300100
contents = length 32815, hash 53ABACC0
input buffer #10:
timeUs = 1000000333444
contents = length 40718, hash 24B50BC1
input buffer #11:
timeUs = 1000000366800
contents = length 29088, hash D18E00AE
input buffer #12:
timeUs = 1000000400144
contents = length 40733, hash 79770CBA
input buffer #13:
timeUs = 1000000433488
contents = length 36545, hash 27A8297C
input buffer #14:
timeUs = 1000000466833
contents = length 154398, hash 9B9013C6
input buffer #15:
timeUs = 1000000500177
contents = length 27135, hash 36386C42
input buffer #16:
timeUs = 1000000533544
contents = length 38747, hash 85D6F019
input buffer #17:
timeUs = 1000000566866
contents = length 29503, hash 9D1B916B
input buffer #18:
timeUs = 1000000600211
contents = length 32772, hash D4AB8735
input buffer #19:
timeUs = 1000000633555
contents = length 30388, hash ED862EDE
input buffer #20:
timeUs = 1000000666900
contents = length 35989, hash 4035491B
input buffer #21:
timeUs = 1000000700244
contents = length 142845, hash EC0DF71D
input buffer #22:
timeUs = 1000000733600
contents = length 28259, hash 8B59F0F6
input buffer #23:
timeUs = 1000000766944
contents = length 40516, hash E8C6D575
input buffer #24:
timeUs = 1000000800288
contents = length 38467, hash 4151BB14
input buffer #25:
timeUs = 1000000833633
contents = length 27748, hash 2DB01A39
input buffer #26:
timeUs = 1000000866977
contents = length 36956, hash 377A5C6C
input buffer #27:
timeUs = 1000000900300
contents = length 27476, hash DA07CDCA
input buffer #28:
timeUs = 1000000933666
contents = length 143200, hash E9E09671
input buffer #29:
timeUs = 1000000967011
contents = length 29122, hash 99DDD644
input buffer #30:
timeUs = 1000001000355
contents = length 39280, hash DC2510AE
input buffer #31:
timeUs = 1000001033700
contents = length 38631, hash AEB965F7
input buffer #32:
timeUs = 1000001067044
contents = length 27422, hash 84AFA85C
input buffer #33:
timeUs = 1000001100388
contents = length 39360, hash 467C7E6E
input buffer #34:
timeUs = 1000001133744
contents = length 24993, hash F10D6C03
input buffer #35:
timeUs = 1000001167088
contents = length 154591, hash 62D2311C
input buffer #36:
timeUs = 1000001200433
contents = length 27223, hash 6733CC93
input buffer #37:
timeUs = 1000001233777
contents = length 27659, hash BCE01964
input buffer #38:
timeUs = 1000001267077
contents = length 39427, hash 4260E860
input buffer #39:
timeUs = 1000001300422
contents = length 27698, hash 8D6087A2
input buffer #40:
timeUs = 1000001333811
contents = length 40089, hash 61C9B394
input buffer #41:
timeUs = 1000001367222
contents = length 27601, hash 7B3D87E8
input buffer #42:
timeUs = 1000001408833
contents = length 219559, hash 881031BA
input buffer #43:
timeUs = 1000001450511
contents = length 30027, hash 7BBBF608
input buffer #44:
timeUs = 1000001492188
contents = length 41623, hash 3A6D4A48
input buffer #45:
timeUs = 1000001600544
contents = length 114695, hash D61EAD29
input buffer #46:
timeUs = 1000001642222
contents = length 82113, hash DA0FCB1F
input buffer #47:
timeUs = 1000001683900
contents = length 59998, hash 72EE3D06
input buffer #48:
timeUs = 1000001725577
contents = length 37475, hash FA6E62C4
input buffer #49:
timeUs = 1000001767244
contents = length 229219, hash 37A06706
input buffer #50:
timeUs = 1000001808922
contents = length 24001, hash 3DA0DA79
input buffer #51:
timeUs = 1000001850533
contents = length 45931, hash 6B88632C
input buffer #52:
timeUs = 1000001892211
contents = length 35838, hash 3DC6FDE6
input buffer #53:
timeUs = 1000001933955
contents = length 36848, hash 6F9986EC
input buffer #54:
timeUs = 1000001975633
contents = length 29700, hash CF094404
input buffer #55:
timeUs = 1000002017311
contents = length 31282, hash 57AABAAA
input buffer #56:
timeUs = 1000002058988
contents = length 171963, hash 7115AF3D
input buffer #57:
timeUs = 1000002100700
contents = length 37550, hash F7D849CB
input buffer #58:
timeUs = 0
flags = 4
contents = length 0, hash 1
outputBuffers:
count = 58
output buffer #0:
timeUs = 1000000000000
size = 175795
rendered = true
output buffer #1:
timeUs = 1000000033344
size = 32825
rendered = true
output buffer #2:
timeUs = 1000000066688
size = 30605
rendered = true
output buffer #3:
timeUs = 1000000100033
size = 30292
rendered = true
output buffer #4:
timeUs = 1000000133377
size = 25928
rendered = true
output buffer #5:
timeUs = 1000000166722
size = 23135
rendered = true
output buffer #6:
timeUs = 1000000200066
size = 32020
rendered = true
output buffer #7:
timeUs = 1000000233244
size = 142480
rendered = true
output buffer #8:
timeUs = 1000000266755
size = 28601
rendered = true
output buffer #9:
timeUs = 1000000300100
size = 32815
rendered = true
output buffer #10:
timeUs = 1000000333444
size = 40718
rendered = true
output buffer #11:
timeUs = 1000000366800
size = 29088
rendered = true
output buffer #12:
timeUs = 1000000400144
size = 40733
rendered = true
output buffer #13:
timeUs = 1000000433488
size = 36545
rendered = true
output buffer #14:
timeUs = 1000000466833
size = 154398
rendered = true
output buffer #15:
timeUs = 1000000500177
size = 27135
rendered = true
output buffer #16:
timeUs = 1000000533544
size = 38747
rendered = true
output buffer #17:
timeUs = 1000000566866
size = 29503
rendered = true
output buffer #18:
timeUs = 1000000600211
size = 32772
rendered = true
output buffer #19:
timeUs = 1000000633555
size = 30388
rendered = true
output buffer #20:
timeUs = 1000000666900
size = 35989
rendered = true
output buffer #21:
timeUs = 1000000700244
size = 142845
rendered = true
output buffer #22:
timeUs = 1000000733600
size = 28259
rendered = true
output buffer #23:
timeUs = 1000000766944
size = 40516
rendered = true
output buffer #24:
timeUs = 1000000800288
size = 38467
rendered = true
output buffer #25:
timeUs = 1000000833633
size = 27748
rendered = true
output buffer #26:
timeUs = 1000000866977
size = 36956
rendered = true
output buffer #27:
timeUs = 1000000900300
size = 27476
rendered = true
output buffer #28:
timeUs = 1000000933666
size = 143200
rendered = true
output buffer #29:
timeUs = 1000000967011
size = 29122
rendered = true
output buffer #30:
timeUs = 1000001000355
size = 39280
rendered = true
output buffer #31:
timeUs = 1000001033700
size = 38631
rendered = true
output buffer #32:
timeUs = 1000001067044
size = 27422
rendered = true
output buffer #33:
timeUs = 1000001100388
size = 39360
rendered = true
output buffer #34:
timeUs = 1000001133744
size = 24993
rendered = true
output buffer #35:
timeUs = 1000001167088
size = 154591
rendered = true
output buffer #36:
timeUs = 1000001200433
size = 27223
rendered = true
output buffer #37:
timeUs = 1000001233777
size = 27659
rendered = true
output buffer #38:
timeUs = 1000001267077
size = 39427
rendered = true
output buffer #39:
timeUs = 1000001300422
size = 27698
rendered = true
output buffer #40:
timeUs = 1000001333811
size = 40089
rendered = true
output buffer #41:
timeUs = 1000001367222
size = 27601
rendered = true
output buffer #42:
timeUs = 1000001408833
size = 219559
rendered = true
output buffer #43:
timeUs = 1000001450511
size = 30027
rendered = true
output buffer #44:
timeUs = 1000001492188
size = 41623
rendered = true
output buffer #45:
timeUs = 1000001600544
size = 114695
rendered = true
output buffer #46:
timeUs = 1000001642222
size = 82113
rendered = true
output buffer #47:
timeUs = 1000001683900
size = 59998
rendered = true
output buffer #48:
timeUs = 1000001725577
size = 37475
rendered = true
output buffer #49:
timeUs = 1000001767244
size = 229219
rendered = true
output buffer #50:
timeUs = 1000001808922
size = 24001
rendered = true
output buffer #51:
timeUs = 1000001850533
size = 45931
rendered = true
output buffer #52:
timeUs = 1000001892211
size = 35838
rendered = true
output buffer #53:
timeUs = 1000001933955
size = 36848
rendered = true
output buffer #54:
timeUs = 1000001975633
size = 29700
rendered = true
output buffer #55:
timeUs = 1000002017311
size = 31282
rendered = true
output buffer #56:
timeUs = 1000002058988
size = 171963
rendered = true
output buffer #57:
timeUs = 1000002100700
size = 37550
rendered = true