webm_extractor: Support for multiple tracks

Github Issue: #363
This commit is contained in:
Oliver Woodman 2015-04-22 16:18:23 +01:00
parent f637fde962
commit c807c69a93
2 changed files with 167 additions and 63 deletions

View File

@ -56,6 +56,8 @@ public final class WebmExtractor implements Extractor {
private static final int VORBIS_MAX_INPUT_SIZE = 8192;
private static final int OPUS_MAX_INPUT_SIZE = 5760;
private static final int ENCRYPTION_IV_SIZE = 8;
private static final int TRACK_TYPE_AUDIO = 2;
private static final int TRACK_TYPE_VIDEO = 1;
private static final int UNKNOWN = -1;
private static final int ID_EBML = 0x1A45DFA3;
@ -73,6 +75,8 @@ public final class WebmExtractor implements Extractor {
private static final int ID_BLOCK = 0xA1;
private static final int ID_TRACKS = 0x1654AE6B;
private static final int ID_TRACK_ENTRY = 0xAE;
private static final int ID_TRACK_NUMBER = 0xD7;
private static final int ID_TRACK_TYPE = 0x83;
private static final int ID_CODEC_ID = 0x86;
private static final int ID_CODEC_PRIVATE = 0x63A2;
private static final int ID_CODEC_DELAY = 0x56AA;
@ -110,32 +114,30 @@ public final class WebmExtractor implements Extractor {
private long segmentContentSize = UNKNOWN;
private long timecodeScale = 1000000L;
private long durationUs = C.UNKNOWN_TIME_US;
private int pixelWidth = UNKNOWN;
private int pixelHeight = UNKNOWN;
private int channelCount = UNKNOWN;
private int sampleRate = UNKNOWN;
private byte[] codecPrivate;
private String codecId;
private long codecDelayNs;
private long seekPreRollNs;
private boolean isAudioTrack;
private boolean hasContentEncryption;
private byte[] encryptionKeyId;
private TrackFormat trackFormat; // Used to store the last seen track.
private TrackFormat audioTrackFormat;
private TrackFormat videoTrackFormat;
private boolean sentDrmInitData;
// Cue related elements.
private long clusterTimecodeUs = UNKNOWN;
private LongArray cueTimesUs;
private LongArray cueClusterPositions;
private boolean seenClusterPositionForCurrentCuePoint;
// Sample reading state.
private int blockBytesRead;
private int sampleState;
private int sampleSize;
private int sampleTrackNumber;
private int sampleFlags;
private long sampleTimeUs;
private boolean sampleRead;
// Extractor outputs.
private ExtractorOutput extractorOutput;
private TrackOutput trackOutput;
public WebmExtractor() {
this(new DefaultEbmlReader());
@ -151,8 +153,6 @@ public final class WebmExtractor implements Extractor {
@Override
public void init(ExtractorOutput output) {
extractorOutput = output;
trackOutput = output.track(0);
extractorOutput.endTracks();
}
@Override
@ -199,6 +199,8 @@ public final class WebmExtractor implements Extractor {
case ID_TIME_CODE:
case ID_PIXEL_WIDTH:
case ID_PIXEL_HEIGHT:
case ID_TRACK_NUMBER:
case ID_TRACK_TYPE:
case ID_CODEC_DELAY:
case ID_SEEK_PRE_ROLL:
case ID_CHANNELS:
@ -240,11 +242,17 @@ public final class WebmExtractor implements Extractor {
cueTimesUs = new LongArray();
cueClusterPositions = new LongArray();
return;
case ID_CUE_POINT:
seenClusterPositionForCurrentCuePoint = false;
return;
case ID_CONTENT_ENCODING:
// TODO: check and fail if more than one content encoding is present.
return;
case ID_CONTENT_ENCRYPTION:
hasContentEncryption = true;
trackFormat.hasContentEncryption = true;
return;
case ID_TRACK_ENTRY:
trackFormat = new TrackFormat();
return;
default:
return;
@ -257,21 +265,47 @@ public final class WebmExtractor implements Extractor {
extractorOutput.seekMap(buildCues());
return;
case ID_CONTENT_ENCODING:
if (!hasContentEncryption) {
if (!trackFormat.hasContentEncryption) {
// We found a ContentEncoding other than Encryption.
throw new ParserException("Found an unsupported ContentEncoding");
}
if (encryptionKeyId == null) {
if (trackFormat.encryptionKeyId == null) {
throw new ParserException("Encrypted Track found but ContentEncKeyID was not found");
}
extractorOutput.drmInitData(
new DrmInitData.Universal(MimeTypes.VIDEO_WEBM, encryptionKeyId));
return;
case ID_AUDIO:
isAudioTrack = true;
if (!sentDrmInitData) {
extractorOutput.drmInitData(
new DrmInitData.Universal(MimeTypes.VIDEO_WEBM, trackFormat.encryptionKeyId));
sentDrmInitData = true;
}
return;
case ID_TRACK_ENTRY:
trackOutput.format(isAudioTrack ? buildAudioFormat() : buildVideoFormat());
if (trackFormat.number == UNKNOWN || trackFormat.type == UNKNOWN) {
throw new ParserException("Mandatory element TrackNumber or TrackType not found");
}
if ((trackFormat.type == TRACK_TYPE_AUDIO && audioTrackFormat != null)
|| (trackFormat.type == TRACK_TYPE_VIDEO && videoTrackFormat != null)) {
// There is more than 1 audio/video track. Ignore everything but the first.
trackFormat = null;
return;
}
if (trackFormat.type == TRACK_TYPE_AUDIO && isCodecSupported(trackFormat.codecId)) {
audioTrackFormat = trackFormat;
audioTrackFormat.trackOutput = extractorOutput.track(audioTrackFormat.number);
audioTrackFormat.trackOutput.format(buildAudioFormat());
} else if (trackFormat.type == TRACK_TYPE_VIDEO && isCodecSupported(trackFormat.codecId)) {
videoTrackFormat = trackFormat;
videoTrackFormat.trackOutput = extractorOutput.track(videoTrackFormat.number);
videoTrackFormat.trackOutput.format(buildVideoFormat());
} else {
// Unsupported track type. Do nothing.
}
trackFormat = null;
return;
case ID_TRACKS:
if (videoTrackFormat == null && audioTrackFormat == null) {
throw new ParserException("No valid tracks were found");
}
extractorOutput.endTracks();
return;
default:
return;
@ -296,19 +330,25 @@ public final class WebmExtractor implements Extractor {
timecodeScale = value;
return;
case ID_PIXEL_WIDTH:
pixelWidth = (int) value;
trackFormat.pixelWidth = (int) value;
return;
case ID_PIXEL_HEIGHT:
pixelHeight = (int) value;
trackFormat.pixelHeight = (int) value;
return;
case ID_TRACK_NUMBER:
trackFormat.number = (int) value;
return;
case ID_TRACK_TYPE:
trackFormat.type = (int) value;
return;
case ID_CODEC_DELAY:
codecDelayNs = value;
trackFormat.codecDelayNs = value;
return;
case ID_SEEK_PRE_ROLL:
seekPreRollNs = value;
trackFormat.seekPreRollNs = value;
return;
case ID_CHANNELS:
channelCount = (int) value;
trackFormat.channelCount = (int) value;
return;
case ID_CONTENT_ENCODING_ORDER:
// This extractor only supports one ContentEncoding element and hence the order has to be 0.
@ -345,7 +385,13 @@ public final class WebmExtractor implements Extractor {
cueTimesUs.add(scaleTimecodeToUs(value));
return;
case ID_CUE_CLUSTER_POSITION:
cueClusterPositions.add(value);
if (!seenClusterPositionForCurrentCuePoint) {
// If there's more than one video/audio track, then there could be more than one
// CueTrackPositions within a single CuePoint. In such a case, ignore all but the first
// one (since the cluster position will be quite close for all the tracks).
cueClusterPositions.add(value);
seenClusterPositionForCurrentCuePoint = true;
}
return;
case ID_TIME_CODE:
clusterTimecodeUs = scaleTimecodeToUs(value);
@ -361,7 +407,7 @@ public final class WebmExtractor implements Extractor {
durationUs = scaleTimecodeToUs((long) value);
return;
case ID_SAMPLING_FREQUENCY:
sampleRate = (int) value;
trackFormat.sampleRate = (int) value;
return;
default:
return;
@ -377,11 +423,7 @@ public final class WebmExtractor implements Extractor {
}
return;
case ID_CODEC_ID:
// Validate that CodecID is supported.
if (!isCodecSupported(value)) {
throw new ParserException("CodecID " + value + " not supported");
}
codecId = value;
trackFormat.codecId = value;
return;
default:
return;
@ -392,12 +434,12 @@ public final class WebmExtractor implements Extractor {
throws IOException, InterruptedException {
switch (id) {
case ID_CODEC_PRIVATE:
codecPrivate = new byte[contentSize];
input.readFully(codecPrivate, 0, contentSize);
trackFormat.codecPrivate = new byte[contentSize];
input.readFully(trackFormat.codecPrivate, 0, contentSize);
return;
case ID_CONTENT_ENCRYPTION_KEY_ID:
encryptionKeyId = new byte[contentSize];
input.readFully(encryptionKeyId, 0, contentSize);
trackFormat.encryptionKeyId = new byte[contentSize];
input.readFully(trackFormat.encryptionKeyId, 0, contentSize);
return;
case ID_SIMPLE_BLOCK:
case ID_BLOCK:
@ -407,16 +449,33 @@ public final class WebmExtractor implements Extractor {
// differ only in the way flags are specified.
if (sampleState == SAMPLE_STATE_START) {
// Read the track number; it identifies which track (if any) this block belongs to.
varintReader.readUnsignedVarint(input, false, false);
sampleTrackNumber = (int) varintReader.readUnsignedVarint(input, false, true);
blockBytesRead = varintReader.getLastLength();
sampleState = SAMPLE_STATE_HEADER;
}
// Ignore the frame if the track number equals neither the audio track nor the video track.
if ((audioTrackFormat != null && videoTrackFormat != null
&& audioTrackFormat.number != sampleTrackNumber
&& videoTrackFormat.number != sampleTrackNumber)
|| (audioTrackFormat != null && videoTrackFormat == null
&& audioTrackFormat.number != sampleTrackNumber)
|| (audioTrackFormat == null && videoTrackFormat != null
&& videoTrackFormat.number != sampleTrackNumber)) {
input.skipFully(contentSize - blockBytesRead);
sampleState = SAMPLE_STATE_START;
return;
}
TrackFormat sampleTrackFormat =
(audioTrackFormat != null && sampleTrackNumber == audioTrackFormat.number)
? audioTrackFormat : videoTrackFormat;
TrackOutput trackOutput = sampleTrackFormat.trackOutput;
if (sampleState == SAMPLE_STATE_HEADER) {
byte[] sampleHeaderScratchData = sampleHeaderScratch.data;
// Next 3 bytes have timecode and flags. If encrypted, the 4th byte is a signal byte.
int remainingHeaderLength = hasContentEncryption ? 4 : 3;
int remainingHeaderLength = sampleTrackFormat.hasContentEncryption ? 4 : 3;
input.readFully(sampleHeaderScratchData, 0, remainingHeaderLength);
blockBytesRead += remainingHeaderLength;
@ -444,7 +503,7 @@ public final class WebmExtractor implements Extractor {
boolean isEncrypted = false;
// If encrypted, the fourth byte is an encryption signal byte.
if (hasContentEncryption) {
if (sampleTrackFormat.hasContentEncryption) {
if ((sampleHeaderScratchData[3] & 0x80) == 0x80) {
throw new ParserException("Extension bit is set in signal byte");
}
@ -469,7 +528,7 @@ public final class WebmExtractor implements Extractor {
blockBytesRead += trackOutput.sampleData(input, contentSize - blockBytesRead);
}
if (CODEC_ID_VORBIS.equals(codecId)) {
if (CODEC_ID_VORBIS.equals(sampleTrackFormat.codecId)) {
// Vorbis decoder in android MediaCodec [1] expects the last 4 bytes of the sample to be
// the number of samples in the current page. This definition holds good only for Ogg and
// irrelevant for WebM. So we always set this to -1 (the decoder will ignore this value if
@ -496,15 +555,15 @@ public final class WebmExtractor implements Extractor {
}
/**
* Builds an video {@link MediaFormat} containing recently gathered Audio information.
* Builds a video {@link MediaFormat} containing recently gathered Video information.
*
* @return The built {@link MediaFormat}.
* @throws ParserException If the codec is unsupported.
*/
private MediaFormat buildVideoFormat() throws ParserException {
if (CODEC_ID_VP9.equals(codecId)) {
if (videoTrackFormat != null && CODEC_ID_VP9.equals(videoTrackFormat.codecId)) {
return MediaFormat.createVideoFormat(MimeTypes.VIDEO_VP9, MediaFormat.NO_VALUE, durationUs,
pixelWidth, pixelHeight, null);
videoTrackFormat.pixelWidth, videoTrackFormat.pixelHeight, null);
} else {
throw new ParserException("Unable to build format");
}
@ -517,16 +576,20 @@ public final class WebmExtractor implements Extractor {
* @throws ParserException If the codec is unsupported.
*/
private MediaFormat buildAudioFormat() throws ParserException {
if (CODEC_ID_VORBIS.equals(codecId)) {
if (audioTrackFormat != null && CODEC_ID_VORBIS.equals(audioTrackFormat.codecId)) {
return MediaFormat.createAudioFormat(MimeTypes.AUDIO_VORBIS, VORBIS_MAX_INPUT_SIZE,
durationUs, channelCount, sampleRate, parseVorbisCodecPrivate());
} else if (CODEC_ID_OPUS.equals(codecId)) {
durationUs, audioTrackFormat.channelCount, audioTrackFormat.sampleRate,
parseVorbisCodecPrivate());
} else if (audioTrackFormat != null && CODEC_ID_OPUS.equals(audioTrackFormat.codecId)) {
ArrayList<byte[]> opusInitializationData = new ArrayList<byte[]>(3);
opusInitializationData.add(codecPrivate);
opusInitializationData.add(ByteBuffer.allocate(Long.SIZE).putLong(codecDelayNs).array());
opusInitializationData.add(ByteBuffer.allocate(Long.SIZE).putLong(seekPreRollNs).array());
opusInitializationData.add(audioTrackFormat.codecPrivate);
opusInitializationData.add(
ByteBuffer.allocate(Long.SIZE).putLong(audioTrackFormat.codecDelayNs).array());
opusInitializationData.add(
ByteBuffer.allocate(Long.SIZE).putLong(audioTrackFormat.seekPreRollNs).array());
return MediaFormat.createAudioFormat(MimeTypes.AUDIO_OPUS, OPUS_MAX_INPUT_SIZE,
durationUs, channelCount, sampleRate, opusInitializationData);
durationUs, audioTrackFormat.channelCount, audioTrackFormat.sampleRate,
opusInitializationData);
} else {
throw new ParserException("Unable to build format");
}
@ -576,6 +639,7 @@ public final class WebmExtractor implements Extractor {
*/
private ArrayList<byte[]> parseVorbisCodecPrivate() throws ParserException {
try {
byte[] codecPrivate = audioTrackFormat.codecPrivate;
if (codecPrivate[0] != 0x02) {
throw new ParserException("Error parsing vorbis codec private");
}
@ -672,4 +736,28 @@ public final class WebmExtractor implements Extractor {
}
/**
 * Holds the values gathered while parsing a single TrackEntry element, together with the
 * {@link TrackOutput} that receives the track's format and samples.
 *
 * <p>A new instance is created when a TrackEntry starts and its fields are filled in as the
 * corresponding child elements are read.
 */
private static final class TrackFormat {

  // Common track elements.
  public String codecId;
  // TrackNumber and TrackType are checked against UNKNOWN when the TrackEntry ends, so they
  // must start out as UNKNOWN.
  public int number = UNKNOWN;
  public int type = UNKNOWN;
  public boolean hasContentEncryption;
  public byte[] encryptionKeyId;

  // Video track related elements.
  public int pixelWidth = UNKNOWN;
  public int pixelHeight = UNKNOWN;

  // Audio track related elements.
  public int channelCount = UNKNOWN;
  public int sampleRate = UNKNOWN;
  public byte[] codecPrivate;
  // CodecDelay and SeekPreRoll are optional elements whose default value is 0. Both values are
  // written verbatim into the Opus initialization data, so they must default to 0 rather than
  // to the UNKNOWN sentinel (-1), which would otherwise be passed on as a negative duration.
  public long codecDelayNs = 0;
  public long seekPreRollNs = 0;

  // Output that receives this track's format and sample data.
  public TrackOutput trackOutput;

}
}

View File

@ -456,11 +456,15 @@ public class WebmExtractorTest extends InstrumentationTestCase {
byte[] cipherModeBytes = getIntegerBytes(contentEncodingSettings.aesCipherMode);
return createByteArray(
0x16, 0x54, 0xAE, 0x6B, // Tracks
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x48, // size=72
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x4E, // size=78
0xAE, // TrackEntry
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3F, // size=63
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x45, // size=69
0x86, // CodecID
0x85, 0x56, 0x5F, 0x56, 0x50, codecIsVp9 ? 0x39 : 0x30, // size=5 value=V_VP9/0
0xD7, // TrackNumber
0x81, 0x01, // size=1 value=1
0x83, // TrackType
0x81, 0x01, // size=1 value=1
0x6D, 0x80, // ContentEncodings
0xA4, // size=36
0x62, 0x40, // ContentEncoding
@ -492,11 +496,15 @@ public class WebmExtractorTest extends InstrumentationTestCase {
} else {
return createByteArray(
0x16, 0x54, 0xAE, 0x6B, // Tracks
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x24, // size=36
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2A, // size=42
0xAE, // TrackEntry
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1B, // size=27
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x21, // size=33
0x86, // CodecID
0x85, 0x56, 0x5F, 0x56, 0x50, codecIsVp9 ? 0x39 : 0x30, // size=5 value=V_VP9/0
0xD7, // TrackNumber
0x81, 0x01, // size=1 value=1
0x83, // TrackType
0x81, 0x01, // size=1 value=1
0xE0, // Video
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, // size=8
0xB0, // PixelWidth
@ -510,11 +518,15 @@ public class WebmExtractorTest extends InstrumentationTestCase {
byte[] channelCountBytes = getIntegerBytes(channelCount);
return createByteArray(
0x16, 0x54, 0xAE, 0x6B, // Tracks
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x39, // size=57
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3F, // size=63
0xAE, // TrackEntry
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, // size=48
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x36, // size=54
0x86, // CodecID
0x86, 0x41, 0x5F, 0x4F, 0x50, 0x55, 0x53, // size=6 value=A_OPUS
0xD7, // TrackNumber
0x81, 0x01, // size=1 value=1
0x83, // TrackType
0x81, 0x02, // size=1 value=2
0x56, 0xAA, // CodecDelay
0x83, 0x63, 0x2E, 0xA0, // size=3 value=6500000
0x56, 0xBB, // SeekPreRoll
@ -533,11 +545,15 @@ public class WebmExtractorTest extends InstrumentationTestCase {
byte[] channelCountBytes = getIntegerBytes(channelCount);
byte[] tracksElement = createByteArray(
0x16, 0x54, 0xAE, 0x6B, // Tracks
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x9C, // size=4252
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0xA2, // size=4258
0xAE, // TrackEntry
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x93, // size=4243 (36+4207)
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x99, // size=4249 (42+4207)
0x86, // CodecID
0x88, 0x41, 0x5f, 0x56, 0x4f, 0x52, 0x42, 0x49, 0x53, // size=8 value=A_VORBIS
0xD7, // TrackNumber
0x81, 0x01, // size=1 value=1
0x83, // TrackType
0x81, 0x02, // size=1 value=2
0xE1, // Audio
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0D, // size=13
0x9F, // Channels
@ -560,7 +576,7 @@ public class WebmExtractorTest extends InstrumentationTestCase {
byte[] sizeBytes = getIntegerBytes(size);
return createByteArray(
0x1C, 0x53, 0xBB, 0x6B, // Cues
0x01, 0x00, 0x00, 0x00, sizeBytes[0], sizeBytes[1], sizeBytes[2], sizeBytes[3]); // size=31
0x01, 0x00, 0x00, 0x00, sizeBytes[0], sizeBytes[1], sizeBytes[2], sizeBytes[3]);
}
private static byte[] createCuePointElement(int cueTime, int cueClusterPosition) {