Add support for the Vorbis audio codec in Mp4Muxer.

Update esdsBox to support muxing of files encoded with the Vorbis audio codec.

PiperOrigin-RevId: 655159074
This commit is contained in:
Googler 2024-07-23 07:36:53 -07:00 committed by Copybara-Service
parent 0ac90855b4
commit b77f1d0f99
9 changed files with 590 additions and 26 deletions

View File

@ -495,6 +495,29 @@ public final class MimeTypes {
}
}
/**
 * Looks up the MP4 object type identifier for a sample MIME type. Identifiers follow RFC 6381 and
 * the <a href="https://mp4ra.org/registered-types/object-types">MPEG-4 Object Types</a> registry.
 *
 * @param sampleMimeType The MIME type of the track.
 * @return The matching MP4 object type identifier, or {@code null} when this method has no
 *     mapping for the given MIME type.
 */
@UnstableApi
@Nullable
public static Byte getMp4ObjectTypeFromMimeType(String sampleMimeType) {
  // Guard-style lookup: each supported MIME type returns its identifier immediately.
  if (sampleMimeType.equals(MimeTypes.AUDIO_AAC)) {
    return (byte) 0x40;
  }
  if (sampleMimeType.equals(MimeTypes.AUDIO_VORBIS)) {
    return (byte) 0xDD;
  }
  if (sampleMimeType.equals(MimeTypes.VIDEO_MP4V)) {
    return (byte) 0x20;
  }
  // No mapping for any other MIME type.
  return null;
}
/**
* Returns the MIME type corresponding to an MP4 object type identifier, as defined in RFC 6381
* and https://mp4ra.org/#/object_types.

View File

@ -43,7 +43,7 @@ import org.junit.runners.Parameterized.Parameters;
/** End to end parameterized instrumentation tests for {@link Mp4Muxer}. */
@RunWith(Parameterized.class)
public class Mp4MuxerEndToEndParameterizedAndroidTest {
private static final String H263_GPP = "bbb_176x144_128kbps_15fps_h263.3gp";
private static final String H263_3GP = "bbb_176x144_128kbps_15fps_h263.3gp";
private static final String H264_MP4 = "sample_no_bframes.mp4";
private static final String H264_WITH_NON_REFERENCE_B_FRAMES_MP4 =
"bbb_800x640_768kbps_30fps_avc_non_reference_3b.mp4";
@ -52,25 +52,27 @@ public class Mp4MuxerEndToEndParameterizedAndroidTest {
private static final String H265_HDR10_MP4 = "hdr10-720p.mp4";
private static final String H265_WITH_METADATA_TRACK_MP4 = "h265_with_metadata_track.mp4";
private static final String AV1_MP4 = "sample_av1.mp4";
private static final String AMR_NB = "bbb_mono_8kHz_12.2kbps_amrnb.3gp";
private static final String AMR_WB = "bbb_mono_16kHz_23.05kbps_amrwb.3gp";
private static final String AMR_NB_3GP = "bbb_mono_8kHz_12.2kbps_amrnb.3gp";
private static final String AMR_WB_3GP = "bbb_mono_16kHz_23.05kbps_amrwb.3gp";
private static final String MPEG4_MP4 = "bbb_176x144_192kbps_15fps_mpeg4.mp4";
private static final String OPUS_OGG = "bbb_6ch_8kHz_opus.ogg";
private static final String VORBIS_OGG = "bbb_1ch_16kHz_q10_vorbis.ogg";
@Parameters(name = "{0}")
public static ImmutableList<String> mediaSamples() {
return ImmutableList.of(
H263_GPP,
H263_3GP,
H264_MP4,
H264_WITH_NON_REFERENCE_B_FRAMES_MP4,
H264_WITH_PYRAMID_B_FRAMES_MP4,
H265_HDR10_MP4,
H265_WITH_METADATA_TRACK_MP4,
AV1_MP4,
AMR_NB,
AMR_WB,
AMR_NB_3GP,
AMR_WB_3GP,
MPEG4_MP4,
OPUS_OGG);
OPUS_OGG,
VORBIS_OGG);
}
@Parameter public @MonotonicNonNull String inputFile;

View File

@ -521,22 +521,24 @@ import java.util.List;
String mimeType = checkNotNull(format.sampleMimeType);
switch (mimeType) {
case MimeTypes.AUDIO_AAC:
case MimeTypes.VIDEO_MP4V:
case MimeTypes.AUDIO_VORBIS:
return esdsBox(format);
case MimeTypes.AUDIO_AMR_NB:
return damrBox(/* mode= */ (short) 0x81FF); // mode set: all enabled for AMR-NB
case MimeTypes.AUDIO_AMR_WB:
return damrBox(/* mode= */ (short) 0x83FF); // mode set: all enabled for AMR-WB
case MimeTypes.VIDEO_H263:
return d263Box();
case MimeTypes.AUDIO_OPUS:
return dOpsBox(format);
case MimeTypes.VIDEO_H263:
return d263Box();
case MimeTypes.VIDEO_H264:
return avcCBox(format);
case MimeTypes.VIDEO_H265:
return hvcCBox(format);
case MimeTypes.VIDEO_AV1:
return av1CBox(format);
case MimeTypes.VIDEO_MP4V:
return esdsBox(format);
default:
throw new IllegalArgumentException("Unsupported format: " + mimeType);
}
@ -1319,6 +1321,7 @@ import java.util.List;
String mimeType = checkNotNull(format.sampleMimeType);
switch (mimeType) {
case MimeTypes.AUDIO_AAC:
case MimeTypes.AUDIO_VORBIS:
return "mp4a";
case MimeTypes.AUDIO_AMR_NB:
return "samr";
@ -1348,32 +1351,40 @@ import java.util.List;
byte[] csd0 = format.initializationData.get(0);
checkArgument(csd0.length > 0, "csd-0 is empty.");
ByteBuffer csd0ByteBuffer = ByteBuffer.wrap(csd0);
String mimeType = checkNotNull(format.sampleMimeType);
boolean isVorbis = mimeType.equals(MimeTypes.AUDIO_VORBIS);
ByteBuffer csdByteBuffer =
isVorbis ? getVorbisInitializationData(format) : ByteBuffer.wrap(csd0);
int peakBitrate = format.peakBitrate;
int averageBitrate = format.averageBitrate;
boolean isVideo = MimeTypes.isVideo(format.sampleMimeType);
boolean isVideo = MimeTypes.isVideo(mimeType);
int csd0Size = csd0ByteBuffer.limit();
ByteBuffer dsiSizeBuffer = getSizeBuffer(csd0Size);
ByteBuffer dcdSizeBuffer = getSizeBuffer(csd0Size + dsiSizeBuffer.remaining() + 14);
int csdSize = csdByteBuffer.remaining();
ByteBuffer dsiSizeBuffer = getSizeBuffer(csdSize);
ByteBuffer dcdSizeBuffer = getSizeBuffer(csdSize + dsiSizeBuffer.remaining() + 14);
ByteBuffer esdSizeBuffer =
getSizeBuffer(csd0Size + dsiSizeBuffer.remaining() + dcdSizeBuffer.remaining() + 21);
getSizeBuffer(csdSize + dsiSizeBuffer.remaining() + dcdSizeBuffer.remaining() + 21);
ByteBuffer contents = ByteBuffer.allocate(csd0Size + MAX_FIXED_LEAF_BOX_SIZE);
ByteBuffer contents = ByteBuffer.allocate(csdSize + MAX_FIXED_LEAF_BOX_SIZE);
contents.putInt(0x00); // Version and flags.
contents.put((byte) 0x03); // ES_DescrTag
contents.put(esdSizeBuffer);
contents.putShort((short) 0x0000); // First 16 bits of ES_ID.
contents.put(isVideo ? (byte) 0x1f : (byte) 0x00); // Last 8 bits of ES_ID.
contents.putShort((short) 0x0000); // ES_ID
// streamDependenceFlag (1 bit) + URL_Flag (1 bit) + OCRstreamFlag (1 bit) + streamPriority (5
// bits)
contents.put(isVideo ? (byte) 0x1f : (byte) 0x00);
contents.put((byte) 0x04); // DecoderConfigDescrTag
contents.put(dcdSizeBuffer);
contents.put(isVideo ? (byte) 0x20 : (byte) 0x40); // objectTypeIndication
// streamType (6 bits) | upStream (1 bit) | reserved = 1 (1 bit)
contents.put((byte) ((isVideo ? (0x04 << 2) : (0x05 << 2)) | 0x01)); // streamType
Byte objectType = checkNotNull(MimeTypes.getMp4ObjectTypeFromMimeType(mimeType));
contents.put(objectType); // objectTypeIndication
// streamType (6 bits) + upStream (1 bit) + reserved = 1 (1 bit)
contents.put((byte) ((isVideo ? (0x04 << 2) : (0x05 << 2)) | 0x01));
int size = isVideo ? 0x017700 : 0x000300;
contents.putShort((short) ((size >> 8) & 0xFFFF)); // First 16 bits of buffer size.
@ -1384,8 +1395,8 @@ import java.util.List;
contents.put((byte) 0x05); // DecoderSpecificInfoTag
contents.put(dsiSizeBuffer);
contents.put(csd0ByteBuffer);
csd0ByteBuffer.rewind();
contents.put(csdByteBuffer);
csdByteBuffer.rewind();
contents.put((byte) 0x06); // SLConfigDescriptorTag
contents.put((byte) 0x01);
@ -1412,6 +1423,35 @@ import java.util.List;
return sizeBuffer;
}
/**
 * Builds the Vorbis codec initialization data from the first two entries of {@code
 * format.initializationData} and returns it as a {@link ByteBuffer} positioned for reading.
 *
 * <p>Bytes are written in this order: one {@code 0x02} byte, the size of the identification
 * header (csd-0) in "Xiph lacing" form, one {@code 0} byte for the comment header length, the
 * identification header itself, and finally the setup header (csd-1).
 */
private static ByteBuffer getVorbisInitializationData(Format format) {
  checkArgument(
      format.initializationData.size() > 1, "csd-1 should contain setup header for Vorbis.");
  byte[] identificationHeader = format.initializationData.get(0);
  byte[] setupHeader = format.initializationData.get(1);
  checkArgument(
      setupHeader.length > 0, "csd-1 should be present and contain setup header for Vorbis.");

  // "Xiph lacing": the size is emitted as a run of 0xFF octets followed by one final octet
  // holding the remainder. For example 500 becomes [0xFF 0xF5]; a multiple of 255 still ends
  // with a 0 octet, so 765 becomes [0xFF 0xFF 0xFF 0x00].
  int fullLacingOctets = identificationHeader.length / 255;
  byte finalLacingOctet = (byte) (identificationHeader.length % 255);

  // 1 leading byte + lacing octets + 1 comment header length byte + both headers.
  int totalSize =
      1 + (fullLacingOctets + 1) + 1 + identificationHeader.length + setupHeader.length;
  ByteBuffer vorbisCsd = ByteBuffer.allocate(totalSize);

  // The original code labels this byte "Vorbis audio".
  // NOTE(review): in Xiph-laced codec private data this is presumably the packet count minus
  // one - confirm against the parser that reads this back.
  vorbisCsd.put((byte) 0x02);
  for (int i = 0; i < fullLacingOctets; i++) {
    vorbisCsd.put((byte) 0xFF);
  }
  vorbisCsd.put(finalLacingOctet); // End of the identification header size.
  vorbisCsd.put((byte) 0); // Length of comment header.
  vorbisCsd.put(identificationHeader);
  vorbisCsd.put(setupHeader);
  vorbisCsd.flip();
  return vorbisCsd;
}
/** Returns the audio damr box. */
private static ByteBuffer damrBox(short mode) {

View File

@ -35,7 +35,27 @@ import java.nio.ByteBuffer;
/**
* A muxer for creating a fragmented MP4 file.
*
* <p>The muxer supports writing H263, H264, H265 and AV1 video, AAC and Opus audio, and metadata.
* <p>The muxer supports muxing of:
*
* <ul>
* <li>Video Codecs:
* <ul>
* <li>AV1
* <li>MPEG-4
* <li>H.263
* <li>H.264 (AVC)
* <li>H.265 (HEVC)
* </ul>
* <li>Audio Codecs:
* <ul>
* <li>AAC
* <li>AMR-NB (Narrowband AMR)
* <li>AMR-WB (Wideband AMR)
* <li>Opus
* <li>Vorbis
* </ul>
* <li>Metadata
* </ul>
*
* <p>All the operations are performed on the caller thread.
*

View File

@ -54,7 +54,27 @@ import org.checkerframework.checker.nullness.qual.EnsuresNonNull;
/**
* A muxer for creating an MP4 container file.
*
* <p>The muxer supports writing H263, H264, H265 and AV1 video, AAC and Opus audio, and metadata.
* <p>The muxer supports muxing of:
*
* <ul>
* <li>Video Codecs:
* <ul>
* <li>AV1
* <li>MPEG-4
* <li>H.263
* <li>H.264 (AVC)
* <li>H.265 (HEVC)
* </ul>
* <li>Audio Codecs:
* <ul>
* <li>AAC
* <li>AMR-NB (Narrowband AMR)
* <li>AMR-WB (Wideband AMR)
* <li>Opus
* <li>Vorbis
* </ul>
* <li>Metadata
* </ul>
*
* <p>All the operations are performed on the caller thread.
*

View File

@ -288,6 +288,27 @@ public class BoxesTest {
context, dumpableBox, getExpectedDumpFilePath("audio_sample_entry_box_opus"));
}
@Test
public void createAudioSampleEntryBox_forVorbis_matchesExpected() throws Exception {
  // csd-0: starts with 0x01 followed by ASCII "vorbis".
  byte[] firstCsd =
      BaseEncoding.base16()
          .decode("01766F726269730000000001803E0000000000009886010000000000A901");
  // csd-1: starts with 0x05 followed by ASCII "vorbis".
  byte[] secondCsd =
      BaseEncoding.base16().decode("05766F726269732442435601004000001842102A05AD638E3A01");
  Format vorbisFormat =
      FAKE_AUDIO_FORMAT
          .buildUpon()
          .setSampleMimeType(MimeTypes.AUDIO_VORBIS)
          .setInitializationData(ImmutableList.of(firstCsd, secondCsd))
          .build();

  ByteBuffer sampleEntryBox = Boxes.audioSampleEntry(vorbisFormat);

  // Compare the serialized box against the checked-in dump file.
  DumpFileAsserts.assertOutput(
      context,
      new DumpableMp4Box(sampleEntryBox),
      getExpectedDumpFilePath("audio_sample_entry_box_vorbis"));
}
@Test
public void createAudioSampleEntryBox_withUnknownAudioFormat_throws() {
// The audio format contains an unknown MIME type.

View File

@ -0,0 +1,2 @@
mp4a (132 bytes):
Data = length 124, hash 1414ED96

View File

@ -0,0 +1,436 @@
seekMap:
isSeekable = true
duration = 2992000
getPosition(0) = [[timeUs=0, position=400052]]
getPosition(1) = [[timeUs=1, position=400241]]
getPosition(1496000) = [[timeUs=1496000, position=412467]]
getPosition(2992000) = [[timeUs=2992000, position=424658]]
numberOfTracks = 1
track 0:
total output bytes = 24785
sample count = 103
format 0:
averageBitrate = 99992
id = 1
sampleMimeType = audio/vorbis
maxInputSize = 295
channelCount = 1
sampleRate = 16000
language = ```
metadata = entries=[Mp4Timestamp: creation time=100000000, modification time=500000000, timescale=10000]
initializationData:
data = length 30, hash C22462B1
data = length 3539, hash F8106892
sample 0:
time = 0
flags = 1
data = length 189, hash 52080AE0
sample 1:
time = 0
flags = 1
data = length 179, hash 8C37301
sample 2:
time = 16000
flags = 1
data = length 242, hash 1582B575
sample 3:
time = 40000
flags = 1
data = length 255, hash 98244FA3
sample 4:
time = 72000
flags = 1
data = length 240, hash 71BDFCD9
sample 5:
time = 104000
flags = 1
data = length 253, hash EFD3DBFF
sample 6:
time = 136000
flags = 1
data = length 242, hash AACF8258
sample 7:
time = 168000
flags = 1
data = length 245, hash 1CC8835E
sample 8:
time = 200000
flags = 1
data = length 250, hash FD19F7C0
sample 9:
time = 232000
flags = 1
data = length 250, hash 8AF27182
sample 10:
time = 264000
flags = 1
data = length 242, hash B82131C5
sample 11:
time = 296000
flags = 1
data = length 249, hash 38239C5F
sample 12:
time = 328000
flags = 1
data = length 245, hash D2FB1E64
sample 13:
time = 360000
flags = 1
data = length 246, hash C1FC7B6A
sample 14:
time = 392000
flags = 1
data = length 247, hash 2598BCEB
sample 15:
time = 424000
flags = 1
data = length 248, hash 56C3B18C
sample 16:
time = 456000
flags = 1
data = length 253, hash A93E963E
sample 17:
time = 488000
flags = 1
data = length 250, hash B06DDB1E
sample 18:
time = 520000
flags = 1
data = length 242, hash 26EBF2C6
sample 19:
time = 552000
flags = 1
data = length 258, hash 35393C2
sample 20:
time = 584000
flags = 1
data = length 253, hash C4C2692F
sample 21:
time = 616000
flags = 1
data = length 254, hash 4796C450
sample 22:
time = 648000
flags = 1
data = length 245, hash DDECF577
sample 23:
time = 680000
flags = 1
data = length 252, hash 5D243245
sample 24:
time = 712000
flags = 1
data = length 189, hash AA61F7FC
sample 25:
time = 736000
flags = 1
data = length 184, hash 87D9A9A6
sample 26:
time = 752000
flags = 1
data = length 187, hash C94C11BE
sample 27:
time = 768000
flags = 1
data = length 256, hash 36CB9380
sample 28:
time = 792000
flags = 1
data = length 262, hash B729F6C9
sample 29:
time = 824000
flags = 1
data = length 252, hash 1AF3D36B
sample 30:
time = 856000
flags = 1
data = length 260, hash 9C2CA006
sample 31:
time = 888000
flags = 1
data = length 257, hash CDC89AFD
sample 32:
time = 920000
flags = 1
data = length 247, hash 425DFE94
sample 33:
time = 952000
flags = 1
data = length 254, hash 97F2A00F
sample 34:
time = 984000
flags = 1
data = length 259, hash 5CFD2BD1
sample 35:
time = 1016000
flags = 1
data = length 262, hash A1B3B3A6
sample 36:
time = 1048000
flags = 1
data = length 258, hash A54300B3
sample 37:
time = 1080000
flags = 1
data = length 242, hash 1CCE647C
sample 38:
time = 1112000
flags = 1
data = length 249, hash 6B917E41
sample 39:
time = 1144000
flags = 1
data = length 246, hash 86324731
sample 40:
time = 1176000
flags = 1
data = length 263, hash 4B98FCB4
sample 41:
time = 1208000
flags = 1
data = length 247, hash 16B758D0
sample 42:
time = 1240000
flags = 1
data = length 189, hash CDA345D1
sample 43:
time = 1264000
flags = 1
data = length 183, hash 1538735F
sample 44:
time = 1280000
flags = 1
data = length 183, hash 47117509
sample 45:
time = 1296000
flags = 1
data = length 188, hash 7E7B0A91
sample 46:
time = 1312000
flags = 1
data = length 185, hash 5A283C8F
sample 47:
time = 1328000
flags = 1
data = length 254, hash 7A56B0F0
sample 48:
time = 1352000
flags = 1
data = length 257, hash 4180CACF
sample 49:
time = 1384000
flags = 1
data = length 256, hash 392BF996
sample 50:
time = 1416000
flags = 1
data = length 259, hash 36E26615
sample 51:
time = 1448000
flags = 1
data = length 258, hash E7A523F0
sample 52:
time = 1480000
flags = 1
data = length 254, hash BB991F97
sample 53:
time = 1512000
flags = 1
data = length 264, hash EBD1FD4F
sample 54:
time = 1544000
flags = 1
data = length 256, hash EF1FA741
sample 55:
time = 1576000
flags = 1
data = length 253, hash AA10B2A
sample 56:
time = 1608000
flags = 1
data = length 258, hash C995778
sample 57:
time = 1640000
flags = 1
data = length 243, hash 37667B06
sample 58:
time = 1672000
flags = 1
data = length 254, hash A1263AA0
sample 59:
time = 1704000
flags = 1
data = length 194, hash E7EB2AFD
sample 60:
time = 1728000
flags = 1
data = length 192, hash 9405895B
sample 61:
time = 1744000
flags = 1
data = length 254, hash A0F66B2E
sample 62:
time = 1768000
flags = 1
data = length 249, hash 2B815793
sample 63:
time = 1800000
flags = 1
data = length 254, hash 8A845D4F
sample 64:
time = 1832000
flags = 1
data = length 247, hash 32E3FF95
sample 65:
time = 1864000
flags = 1
data = length 185, hash DCD1005B
sample 66:
time = 1888000
flags = 1
data = length 191, hash 9EF9175D
sample 67:
time = 1904000
flags = 1
data = length 251, hash 8B20EFE4
sample 68:
time = 1928000
flags = 1
data = length 251, hash 2A349E41
sample 69:
time = 1960000
flags = 1
data = length 254, hash C4977BE
sample 70:
time = 1992000
flags = 1
data = length 240, hash 230286DE
sample 71:
time = 2024000
flags = 1
data = length 259, hash 19565AB6
sample 72:
time = 2056000
flags = 1
data = length 248, hash 1EE00686
sample 73:
time = 2088000
flags = 1
data = length 245, hash C5DA60D6
sample 74:
time = 2120000
flags = 1
data = length 243, hash B5DF7416
sample 75:
time = 2152000
flags = 1
data = length 263, hash 43CEE6FD
sample 76:
time = 2184000
flags = 1
data = length 257, hash 4B30653E
sample 77:
time = 2216000
flags = 1
data = length 262, hash 2C3B847A
sample 78:
time = 2248000
flags = 1
data = length 246, hash A5398B6D
sample 79:
time = 2280000
flags = 1
data = length 252, hash 54031889
sample 80:
time = 2312000
flags = 1
data = length 243, hash CD416D89
sample 81:
time = 2344000
flags = 1
data = length 247, hash 37940B14
sample 82:
time = 2376000
flags = 1
data = length 255, hash E5973397
sample 83:
time = 2408000
flags = 1
data = length 250, hash EEF7942C
sample 84:
time = 2440000
flags = 1
data = length 255, hash D3E3D314
sample 85:
time = 2472000
flags = 1
data = length 250, hash 5CC231E2
sample 86:
time = 2504000
flags = 1
data = length 252, hash 55453FFB
sample 87:
time = 2536000
flags = 1
data = length 265, hash 6653F47
sample 88:
time = 2568000
flags = 1
data = length 264, hash A8029392
sample 89:
time = 2600000
flags = 1
data = length 262, hash A0016CDE
sample 90:
time = 2632000
flags = 1
data = length 257, hash 64872E6A
sample 91:
time = 2664000
flags = 1
data = length 245, hash 29A516B2
sample 92:
time = 2696000
flags = 1
data = length 250, hash 3A9E8EC3
sample 93:
time = 2728000
flags = 1
data = length 252, hash 78D64CE9
sample 94:
time = 2760000
flags = 1
data = length 250, hash 4CAB463A
sample 95:
time = 2792000
flags = 1
data = length 249, hash A1079906
sample 96:
time = 2824000
flags = 1
data = length 255, hash EEAA6634
sample 97:
time = 2856000
flags = 1
data = length 256, hash BFD77057
sample 98:
time = 2888000
flags = 1
data = length 249, hash 9ED80271
sample 99:
time = 2920000
flags = 1
data = length 252, hash 2C3206C8
sample 100:
time = 2952000
flags = 1
data = length 134, hash AA8D9693
sample 101:
time = 2976000
flags = 1
data = length 180, hash 63217EFD
sample 102:
time = 2992000
flags = 536870913
data = length 179, hash 8529BBE7
tracksEnded = true