Add support for SEI and vexu box parsing.

Stereo view information is stored in the 3D reference displays information SEI message and in the optional vexu box. This change adds parsing of both and uses the parsed information to determine how the primary/secondary views map to the left/right eyes.

PiperOrigin-RevId: 651002190
This commit is contained in:
Googler 2024-07-10 06:57:58 -07:00 committed by Copybara-Service
parent 34a802ef38
commit f673ef43b4
7 changed files with 546 additions and 2 deletions

View File

@ -1097,7 +1097,8 @@ public final class C {
/**
* The stereo mode for 360/3D/VR videos. One of {@link Format#NO_VALUE}, {@link
* #STEREO_MODE_MONO}, {@link #STEREO_MODE_TOP_BOTTOM}, {@link #STEREO_MODE_LEFT_RIGHT} or {@link
* #STEREO_MODE_STEREO_MESH}.
* #STEREO_MODE_STEREO_MESH}, {@link #STEREO_MODE_INTERLEAVED_LEFT_PRIMARY}, {@link
* #STEREO_MODE_INTERLEAVED_RIGHT_PRIMARY}.
*/
@UnstableApi
@Documented
@ -1108,7 +1109,9 @@ public final class C {
STEREO_MODE_MONO,
STEREO_MODE_TOP_BOTTOM,
STEREO_MODE_LEFT_RIGHT,
STEREO_MODE_STEREO_MESH
STEREO_MODE_STEREO_MESH,
STEREO_MODE_INTERLEAVED_LEFT_PRIMARY,
STEREO_MODE_INTERLEAVED_RIGHT_PRIMARY
})
public @interface StereoMode {}
@ -1127,6 +1130,18 @@ public final class C {
*/
@UnstableApi public static final int STEREO_MODE_STEREO_MESH = 3;
/**
* Indicates interleaved stereo layout with the left view being the primary view, used with
* 360/3D/VR videos.
*/
@UnstableApi public static final int STEREO_MODE_INTERLEAVED_LEFT_PRIMARY = 4;
/**
* Indicates interleaved stereo layout with the right view being the primary view, used with
* 360/3D/VR videos.
*/
@UnstableApi public static final int STEREO_MODE_INTERLEAVED_RIGHT_PRIMARY = 5;
// LINT.IfChange(color_space)
/**
* Video color spaces, also referred to as color standards. One of {@link Format#NO_VALUE}, {@link

View File

@ -384,6 +384,40 @@ public final class NalUnitUtil {
}
}
/**
 * Holds data parsed from a H.265 3D reference displays information SEI message.
 *
 * <p>The per-display fields ({@link #leftViewId}, {@link #rightViewId} and the exponent/mantissa
 * fields) hold a single value each. When the instance is produced by {@link
 * NalUnitUtil#parseH265Sei3dRefDisplayInfo}, they hold the values of the last reference display
 * listed in the message.
 */
public static final class H265Sei3dRefDisplayInfoData {
/** The value of the {@code prec_ref_display_width} syntax element. */
public final int precRefDisplayWidth;
/**
 * The value of the {@code prec_ref_viewing_dist} syntax element. The parser reports 0 when {@code
 * ref_viewing_distance_flag} is not set.
 */
public final int precRefViewingDist;
/** The number of reference displays, i.e. {@code num_ref_displays_minus1 + 1}. */
public final int numRefDisplays;
/** The value of the {@code left_view_id} syntax element. */
public final int leftViewId;
/** The value of the {@code right_view_id} syntax element. */
public final int rightViewId;
/** The value of the {@code exponent_ref_display_width} syntax element. */
public final int exponentRefDisplayWidth;
/** The value of the {@code mantissa_ref_display_width} syntax element. */
public final int mantissaRefDisplayWidth;
/**
 * The value of the {@code exponent_ref_viewing_distance} syntax element. The parser reports -1
 * when {@code ref_viewing_distance_flag} is not set.
 */
public final int exponentRefViewingDist;
/**
 * The value of the {@code mantissa_ref_viewing_distance} syntax element. The parser reports -1
 * when {@code ref_viewing_distance_flag} is not set.
 */
public final int mantissaRefViewingDist;
/** Creates an instance that stores the given values unchanged. */
public H265Sei3dRefDisplayInfoData(
int precRefDisplayWidth,
int precRefViewingDist,
int numRefDisplays,
int leftViewId,
int rightViewId,
int exponentRefDisplayWidth,
int mantissaRefDisplayWidth,
int exponentRefViewingDist,
int mantissaRefViewingDist) {
this.precRefDisplayWidth = precRefDisplayWidth;
this.precRefViewingDist = precRefViewingDist;
this.numRefDisplays = numRefDisplays;
this.leftViewId = leftViewId;
this.rightViewId = rightViewId;
this.exponentRefDisplayWidth = exponentRefDisplayWidth;
this.mantissaRefDisplayWidth = mantissaRefDisplayWidth;
this.exponentRefViewingDist = exponentRefViewingDist;
this.mantissaRefViewingDist = mantissaRefViewingDist;
}
}
/** Four initial bytes that must prefix NAL units for decoding. */
public static final byte[] NAL_START_CODE = new byte[] {0, 0, 0, 1};
@ -1574,6 +1608,112 @@ public final class NalUnitUtil {
return new PpsData(picParameterSetId, seqParameterSetId, bottomFieldPicOrderInFramePresentFlag);
}
/**
 * Parses a H.265 3D reference displays information SEI message, following the syntax defined in
 * ITU-T Recommendation H.265 (2019) subsection G.14.2.3. Given a generic PREFIX_SEI NAL unit, only
 * the 3D reference displays information SEI message is parsed, if it exists. SEI messages with
 * any other payload type are skipped over.
 *
 * @param nalData A buffer containing escaped prefix SEI data.
 * @param nalOffset The offset of the NAL unit header in {@code nalData}.
 * @param nalLimit The limit of the NAL unit in {@code nalData}.
 * @return A parsed representation of the 3D reference displays information SEI data, or {@code
 *     null} if no such message was found or a message header is truncated or invalid.
 */
@Nullable
public static H265Sei3dRefDisplayInfoData parseH265Sei3dRefDisplayInfo(
    byte[] nalData, int nalOffset, int nalLimit) {
  // The SEI RBSP starts after the two-byte NAL unit header.
  int seiRbspPos = nalOffset + 2;
  int last1BitBytePos = nalLimit - 1;
  // Walk back over trailing zero bytes. The position check comes first so that a NAL unit with no
  // bytes after its header is rejected without reading from nalData.
  while (last1BitBytePos > seiRbspPos && nalData[last1BitBytePos] == 0) {
    last1BitBytePos--;
  }
  if (last1BitBytePos <= seiRbspPos || nalData[last1BitBytePos] == 0) {
    return null;
  }

  ParsableNalUnitBitArray data =
      new ParsableNalUnitBitArray(nalData, seiRbspPos, last1BitBytePos + 1);
  // Every SEI message must have at least 2 bytes for the payload type and size.
  while (data.canReadBits(16)) {
    // Parsing sei_message() in subsection 7.3.5.
    int payloadType = readH265SeiHeaderValue(data);
    int payloadSize = readH265SeiHeaderValue(data);
    // payloadSize counts bytes, whereas canReadBits() expects a number of bits.
    if (payloadType < 0 || payloadSize <= 0 || !data.canReadBits(payloadSize * 8)) {
      return null;
    }
    if (payloadType != 176) {
      // Not three_dimensional_reference_displays_info(): skip the whole payload so that the next
      // iteration starts at the following sei_message() header instead of inside this payload.
      data.skipBits(payloadSize * 8);
      continue;
    }

    // three_dimensional_reference_displays_info()
    int precRefDisplayWidth = data.readUnsignedExpGolombCodedInt(); // prec_ref_display_width
    boolean refViewingDistanceFlag = data.readBit(); // ref_viewing_distance_flag
    int precRefViewingDist = 0;
    if (refViewingDistanceFlag) {
      precRefViewingDist = data.readUnsignedExpGolombCodedInt(); // prec_ref_viewing_dist
    }
    int numRefDisplaysMinus1 = data.readUnsignedExpGolombCodedInt(); // num_ref_displays_minus1
    int leftViewId = -1;
    int rightViewId = -1;
    int exponentRefDisplayWidth = -1;
    int mantissaRefDisplayWidth = -1;
    int exponentRefViewingDist = -1;
    int mantissaRefViewingDist = -1;
    // Each iteration overwrites the per-display values, so the last display's values are returned.
    for (int i = 0; i <= numRefDisplaysMinus1; i++) {
      leftViewId = data.readUnsignedExpGolombCodedInt(); // left_view_id[i]
      rightViewId = data.readUnsignedExpGolombCodedInt(); // right_view_id[i]
      exponentRefDisplayWidth = data.readBits(6); // exponent_ref_display_width[i]
      if (exponentRefDisplayWidth == 63) {
        return null;
      }
      // The mantissa length depends on the exponent and on the signalled precision.
      int refDispWidthBits =
          exponentRefDisplayWidth == 0
              ? max(0, precRefDisplayWidth - 30)
              : max(0, exponentRefDisplayWidth + precRefDisplayWidth - 31);
      mantissaRefDisplayWidth =
          data.readBits(refDispWidthBits); // mantissa_ref_display_width[i]
      if (refViewingDistanceFlag) {
        exponentRefViewingDist = data.readBits(6); // exponent_ref_viewing_distance[i]
        if (exponentRefViewingDist == 63) {
          return null;
        }
        int refViewDistBits =
            exponentRefViewingDist == 0
                ? max(0, precRefViewingDist - 30)
                : max(0, exponentRefViewingDist + precRefViewingDist - 31);
        mantissaRefViewingDist =
            data.readBits(refViewDistBits); // mantissa_ref_viewing_distance[i]
      }
      if (data.readBit()) { // additional_shift_present_flag[i]
        data.skipBits(10); // num_sample_shift_plus512[i]
      }
    }

    return new H265Sei3dRefDisplayInfoData(
        precRefDisplayWidth,
        precRefViewingDist,
        numRefDisplaysMinus1 + 1,
        leftViewId,
        rightViewId,
        exponentRefDisplayWidth,
        mantissaRefDisplayWidth,
        exponentRefViewingDist,
        mantissaRefViewingDist);
  }

  return null;
}

/**
 * Reads one sei_message() header value (payload type or payload size): the sum of a run of 0xFF
 * bytes and the first byte that is not 0xFF.
 *
 * @param data The bit array positioned at the start of the header value.
 * @return The decoded value, or -1 if {@code data} ends before the value is complete.
 */
private static int readH265SeiHeaderValue(ParsableNalUnitBitArray data) {
  int value = 0;
  while (data.canReadBits(8)) {
    int nextByte = data.readBits(8);
    value += nextByte;
    if (nextByte != 255) {
      return value;
    }
  }
  return -1;
}
/**
* Finds the first NAL unit in {@code data}.
*

View File

@ -66,6 +66,8 @@ public final class NalUnitUtilTest {
0x4D, 0x40, 0x40, 0x40, 0x40, 0x20);
private static final byte[] H265_SPS_TEST_DATA_2VIEWS_VIEW_1 =
createByteArray(0x42, 0x09, 0x0E, 0x82, 0x2E, 0x45, 0x8A, 0xA0, 0x05, 0x01);
private static final byte[] H265_SPS_TEST_DATA_2VIEWS_SEI =
createByteArray(0x4E, 0x01, 0xB0, 0x04, 0x04, 0x0A, 0x80, 0x20, 0x80);
private static final byte[] H265_VPS_TEST_DATA_2VIEWS_HDR =
createByteArray(
@ -89,6 +91,8 @@ public final class NalUnitUtilTest {
0xFF, 0xFF, 0xFC, 0x8C, 0x41, 0x46, 0x84, 0x3F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xC8, 0xC4, 0x14, 0x68, 0x43, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFC, 0x95, 0xA8,
0x18);
private static final byte[] H265_SPS_TEST_DATA_2VIEWS_HDR_SEI =
createByteArray(0x4E, 0x01, 0xB0, 0x04, 0x04, 0x0D, 0x00, 0x20, 0x80);
@Test
public void findNalUnit() {
@ -293,6 +297,17 @@ public final class NalUnitUtilTest {
assertThat(spsDataView1.height).isEqualTo(1080);
assertThat(spsDataView1.bitDepthLumaMinus8).isEqualTo(0);
assertThat(spsDataView1.bitDepthChromaMinus8).isEqualTo(0);
NalUnitUtil.H265Sei3dRefDisplayInfoData seiData =
NalUnitUtil.parseH265Sei3dRefDisplayInfo(
H265_SPS_TEST_DATA_2VIEWS_SEI, 0, H265_SPS_TEST_DATA_2VIEWS_SEI.length);
assertThat(seiData.precRefDisplayWidth).isEqualTo(31);
assertThat(seiData.precRefViewingDist).isEqualTo(0);
assertThat(seiData.numRefDisplays).isEqualTo(1);
assertThat(seiData.leftViewId).isEqualTo(1);
assertThat(seiData.rightViewId).isEqualTo(0);
assertThat(seiData.exponentRefDisplayWidth).isEqualTo(0);
assertThat(seiData.mantissaRefDisplayWidth).isEqualTo(0);
}
@Test
@ -360,6 +375,17 @@ public final class NalUnitUtilTest {
assertThat(spsDataView1.height).isEqualTo(2160);
assertThat(spsDataView1.bitDepthLumaMinus8).isEqualTo(2);
assertThat(spsDataView1.bitDepthChromaMinus8).isEqualTo(2);
NalUnitUtil.H265Sei3dRefDisplayInfoData seiData =
NalUnitUtil.parseH265Sei3dRefDisplayInfo(
H265_SPS_TEST_DATA_2VIEWS_HDR_SEI, 0, H265_SPS_TEST_DATA_2VIEWS_HDR_SEI.length);
assertThat(seiData.precRefDisplayWidth).isEqualTo(31);
assertThat(seiData.precRefViewingDist).isEqualTo(0);
assertThat(seiData.numRefDisplays).isEqualTo(1);
assertThat(seiData.leftViewId).isEqualTo(0);
assertThat(seiData.rightViewId).isEqualTo(1);
assertThat(seiData.exponentRefDisplayWidth).isEqualTo(0);
assertThat(seiData.mantissaRefDisplayWidth).isEqualTo(0);
}
@Test

View File

@ -103,6 +103,7 @@ public final class HevcConfig {
@C.ColorSpace int colorSpace = Format.NO_VALUE;
@C.ColorRange int colorRange = Format.NO_VALUE;
@C.ColorTransfer int colorTransfer = Format.NO_VALUE;
@C.StereoMode int stereoMode = Format.NO_VALUE;
float pixelWidthHeightRatio = 1;
int maxNumReorderPics = Format.NO_VALUE;
@Nullable String codecs = null;
@ -150,6 +151,16 @@ public final class HevcConfig {
spsData.profileTierLevel.constraintBytes,
spsData.profileTierLevel.generalLevelIdc);
}
} else if (nalUnitType == NalUnitUtil.H265_NAL_UNIT_TYPE_PREFIX_SEI && j == 0) {
NalUnitUtil.H265Sei3dRefDisplayInfoData seiData =
NalUnitUtil.parseH265Sei3dRefDisplayInfo(
buffer, bufferPosition, bufferPosition + nalUnitLength);
if (seiData != null && currentVpsData != null) {
stereoMode =
(seiData.leftViewId == currentVpsData.layerInfos.get(0).viewId)
? C.STEREO_MODE_INTERLEAVED_LEFT_PRIMARY
: C.STEREO_MODE_INTERLEAVED_RIGHT_PRIMARY;
}
}
bufferPosition += nalUnitLength;
data.skipBytes(nalUnitLength);
@ -168,6 +179,7 @@ public final class HevcConfig {
colorSpace,
colorRange,
colorTransfer,
stereoMode,
pixelWidthHeightRatio,
maxNumReorderPics,
codecs,
@ -216,6 +228,11 @@ public final class HevcConfig {
*/
public final @C.ColorTransfer int colorTransfer;
/**
* The {@link C.StereoMode} of the video or {@link Format#NO_VALUE} if unknown or not applicable.
*/
public final @C.StereoMode int stereoMode;
/** The pixel width to height ratio. */
public final float pixelWidthHeightRatio;
@ -248,6 +265,7 @@ public final class HevcConfig {
@C.ColorSpace int colorSpace,
@C.ColorRange int colorRange,
@C.ColorTransfer int colorTransfer,
@C.StereoMode int stereoMode,
float pixelWidthHeightRatio,
int maxNumReorderPics,
@Nullable String codecs,
@ -261,6 +279,7 @@ public final class HevcConfig {
this.colorSpace = colorSpace;
this.colorRange = colorRange;
this.colorTransfer = colorTransfer;
this.stereoMode = stereoMode;
this.pixelWidthHeightRatio = pixelWidthHeightRatio;
this.maxNumReorderPics = maxNumReorderPics;
this.codecs = codecs;

View File

@ -63,6 +63,15 @@ import java.util.List;
@SuppressWarnings("ConstantCaseForConstants")
public static final int TYPE_lhvC = 0x6C687643;
@SuppressWarnings("ConstantCaseForConstants")
public static final int TYPE_vexu = 0x76657875;
@SuppressWarnings("ConstantCaseForConstants")
public static final int TYPE_eyes = 0x65796573;
@SuppressWarnings("ConstantCaseForConstants")
public static final int TYPE_stri = 0x73747269;
@SuppressWarnings("ConstantCaseForConstants")
public static final int TYPE_vp08 = 0x76703038;

View File

@ -1204,6 +1204,10 @@ import java.util.Objects;
}
maxNumReorderSamples = hevcConfig.maxNumReorderPics;
codecs = hevcConfig.codecs;
if (hevcConfig.stereoMode != Format.NO_VALUE) {
// HEVCDecoderConfigurationRecord may include 3D reference displays information SEI.
stereoMode = hevcConfig.stereoMode;
}
colorSpace = hevcConfig.colorSpace;
colorRange = hevcConfig.colorRange;
colorTransfer = hevcConfig.colorTransfer;
@ -1257,6 +1261,27 @@ import java.util.Objects;
false, "initializationData must be already set from hvcC atom");
}
codecs = lhevcConfig.codecs;
} else if (childAtomType == Atom.TYPE_vexu) {
VexuData vexuData = parseVideoExtendedUsageBox(parent, childStartPosition, childAtomSize);
if (vexuData != null && vexuData.eyesData != null) {
if (vpsData != null && vpsData.layerInfos.size() >= 2) {
// This is MV-HEVC case, so both eye views should be marked as available.
ExtractorUtil.checkContainerInput(
vexuData.hasBothEyeViews(), "both eye views must be marked as available");
// Based on subsection 1.4.3 of Apple's proposed ISOBMFF extensions for stereo video
// (https://developer.apple.com/av-foundation/Stereo-Video-ISOBMFF-Extensions.pdf):
// "For multiview coding, there is no implied ordering and the eye_views_reversed field
// should be set to 0".
ExtractorUtil.checkContainerInput(
!vexuData.eyesData.striData.eyeViewsReversed,
"for MV-HEVC, eye_views_reversed must be set to false");
} else if (stereoMode == Format.NO_VALUE) {
stereoMode =
vexuData.eyesData.striData.eyeViewsReversed
? C.STEREO_MODE_INTERLEAVED_RIGHT_PRIMARY
: C.STEREO_MODE_INTERLEAVED_LEFT_PRIMARY;
}
}
} else if (childAtomType == Atom.TYPE_dvcC || childAtomType == Atom.TYPE_dvvC) {
@Nullable DolbyVisionConfig dolbyVisionConfig = DolbyVisionConfig.parse(parent);
if (dolbyVisionConfig != null) {
@ -2035,6 +2060,54 @@ import java.util.Objects;
/* peakBitrate= */ peakBitrate > 0 ? peakBitrate : Format.NO_VALUE);
}
/**
 * Parses a vexu box and returns the stereo video playback metadata found in its eyes child box.
 * Child boxes of any other type are stepped over. See
 * https://developer.apple.com/av-foundation/Stereo-Video-ISOBMFF-Extensions.pdf for reference.
 *
 * @param parent The buffer containing the vexu box.
 * @param position The position of the start of the vexu box in {@code parent}.
 * @param size The size of the vexu box in bytes.
 * @return The parsed data, or {@code null} if no eyes box yielded any data.
 * @throws ParserException If a child box declares a size that is not positive.
 */
@Nullable
/* package */ static VexuData parseVideoExtendedUsageBox(
    ParsableByteArray parent, int position, int size) throws ParserException {
  parent.setPosition(position + Atom.HEADER_SIZE);
  @Nullable EyesData parsedEyes = null;
  int boxStart = parent.getPosition();
  while (boxStart - position < size) {
    parent.setPosition(boxStart);
    int boxSize = parent.readInt();
    ExtractorUtil.checkContainerInput(boxSize > 0, "childAtomSize must be positive");
    if (parent.readInt() == Atom.TYPE_eyes) {
      // A later eyes box replaces the result of an earlier one.
      parsedEyes = parseStereoViewBox(parent, boxStart, boxSize);
    }
    boxStart += boxSize;
  }
  if (parsedEyes == null) {
    return null;
  }
  return new VexuData(parsedEyes);
}
/**
 * Parses an eyes box and returns the data of the first stri child box it contains.
 *
 * @param parent The buffer containing the eyes box.
 * @param position The position of the start of the eyes box in {@code parent}.
 * @param size The size of the eyes box in bytes.
 * @return The parsed data, or {@code null} if the eyes box contains no stri box.
 * @throws ParserException If a child box declares a size that is not positive.
 */
@Nullable
private static EyesData parseStereoViewBox(ParsableByteArray parent, int position, int size)
    throws ParserException {
  parent.setPosition(position + Atom.HEADER_SIZE);
  int boxStart = parent.getPosition();
  while (boxStart - position < size) {
    parent.setPosition(boxStart);
    int boxSize = parent.readInt();
    ExtractorUtil.checkContainerInput(boxSize > 0, "childAtomSize must be positive");
    int boxType = parent.readInt();
    if (boxType != Atom.TYPE_stri) {
      boxStart += boxSize;
      continue;
    }
    // The stri box extends FullBox that includes version (8 bits) and flags (24 bits).
    parent.skipBytes(4);
    // Only the low four bits of the next byte carry flags.
    int viewFlags = parent.readUnsignedByte() & 0x0F;
    boolean leftEyePresent = (viewFlags & 0x01) != 0;
    boolean rightEyePresent = (viewFlags & 0x02) != 0;
    boolean additionalViewsPresent = (viewFlags & 0x04) != 0;
    boolean viewsReversed = (viewFlags & 0x08) != 0;
    StriData striData =
        new StriData(leftEyePresent, rightEyePresent, viewsReversed, additionalViewsPresent);
    return new EyesData(striData);
  }
  return null;
}
/**
* Parses encryption data from an audio/video sample entry, returning a pair consisting of the
* unencrypted atom type and a {@link TrackEncryptionBox}. Null is returned if no common
@ -2293,6 +2366,49 @@ import java.util.Objects;
}
}
/**
 * Data parsed from stri box.
 *
 * <p>{@code parseStereoViewBox} fills each field from one of the low four bits of the byte that
 * follows the box's version and flags.
 */
private static final class StriData {
// Set from bit 0x01.
private final boolean hasLeftEyeView;
// Set from bit 0x02.
private final boolean hasRightEyeView;
// Set from bit 0x08.
private final boolean eyeViewsReversed;
// Set from bit 0x04.
private final boolean hasAdditionalViews;
/** Creates an instance that stores the given flags unchanged. */
public StriData(
boolean hasLeftEyeView,
boolean hasRightEyeView,
boolean eyeViewsReversed,
boolean hasAdditionalViews) {
this.hasLeftEyeView = hasLeftEyeView;
this.hasRightEyeView = hasRightEyeView;
this.eyeViewsReversed = eyeViewsReversed;
this.hasAdditionalViews = hasAdditionalViews;
}
}
/** Data parsed from eyes box. Currently this is only the content of its stri child box. */
private static final class EyesData {
// The flags read from the stri child box.
private final StriData striData;
/** Creates an instance wrapping the given stri box data. */
public EyesData(StriData striData) {
this.striData = striData;
}
}
/** Data parsed from vexu box. Currently this is only the content of its eyes child box. */
/* package */ static final class VexuData {
  @Nullable private final EyesData eyesData;

  /** Creates an instance wrapping the given eyes box data. */
  public VexuData(EyesData eyesData) {
    this.eyesData = eyesData;
  }

  /**
   * Returns whether eyes box data is present and it marks both the left and the right eye view as
   * available.
   */
  public boolean hasBothEyeViews() {
    if (eyesData == null) {
      return false;
    }
    StriData viewFlags = eyesData.striData;
    return viewFlags.hasLeftEyeView && viewFlags.hasRightEyeView;
  }
}
/** A box containing sample sizes (e.g. stsz, stz2). */
private interface SampleSizeBox {

View File

@ -38,6 +38,206 @@ public final class AtomParsersTest {
private static final byte[] SIXTEEN_BIT_STZ2 =
Util.getBytesFromHexString(ATOM_HEADER + "00000010" + SAMPLE_COUNT + "0001000200030004");
// Sample 'vexu' with 'eyes' containing 'stri' along with other optional boxes.
private static final byte[] VEXU_DATA0 =
new byte[] {
// size (101), 'vexu'
0,
0,
0,
101,
118,
101,
120,
117,
// size (69), 'eyes'
0,
0,
0,
69,
101,
121,
101,
115,
// size (13), 'stri'
0,
0,
0,
13,
115,
116,
114,
105,
0,
0,
0,
0,
3,
// size (24), 'cams'
0,
0,
0,
24,
99,
97,
109,
115,
0,
0,
0,
16,
98,
108,
105,
110,
0,
0,
0,
0,
0,
0,
75,
40,
// size (24), 'cmfy'
0,
0,
0,
24,
99,
109,
102,
121,
0,
0,
0,
16,
100,
97,
100,
106,
0,
0,
0,
0,
0,
0,
0,
-56,
// size (24), 'proj'
0,
0,
0,
24,
112,
114,
111,
106,
0,
0,
0,
16,
112,
114,
106,
105,
0,
0,
0,
0,
114,
101,
99,
116,
};
// Sample 'vexu' with the use of 'must' to list required boxes.
private static final byte[] VEXU_DATA1 =
new byte[] {
// size (78), 'vexu'
0,
0,
0,
78,
118,
101,
120,
117,
// size (16), 'must' --> requires 'eyes'
0,
0,
0,
16,
109,
117,
115,
116,
0,
0,
0,
0,
101,
121,
101,
115,
// size (54), 'eyes'
0,
0,
0,
54,
101,
121,
101,
115,
// size (20), 'must' --> requires 'stri' and 'hero'
0,
0,
0,
20,
109,
117,
115,
116,
0,
0,
0,
0,
115,
116,
114,
105,
104,
101,
114,
111,
// size (13), 'stri'
0,
0,
0,
13,
115,
116,
114,
105,
0,
0,
0,
0,
3,
// size (13), 'hero'
0,
0,
0,
13,
104,
101,
114,
111,
0,
0,
0,
0,
1,
};
@Test
public void parseCommonEncryptionSinfFromParentIgnoresUnknownSchemeType() throws ParserException {
byte[] cencSinf =
@ -67,6 +267,25 @@ public final class AtomParsersTest {
verifyStz2Parsing(new Atom.LeafAtom(Atom.TYPE_stsz, new ParsableByteArray(SIXTEEN_BIT_STZ2)));
}
/** Checks that both sample vexu boxes parse and report that both eye views are available. */
@Test
public void vexuParsings() throws ParserException {
  byte[][] vexuSamples = {VEXU_DATA0, VEXU_DATA1};
  for (byte[] vexuSample : vexuSamples) {
    AtomParsers.VexuData parsed =
        AtomParsers.parseVideoExtendedUsageBox(
            new ParsableByteArray(vexuSample), 0, vexuSample.length);
    assertThat(parsed).isNotNull();
    assertThat(parsed.hasBothEyeViews()).isTrue();
  }
}
private static void verifyStz2Parsing(Atom.LeafAtom stz2Atom) {
AtomParsers.Stz2SampleSizeBox box = new AtomParsers.Stz2SampleSizeBox(stz2Atom);
assertThat(box.getSampleCount()).isEqualTo(4);