WebVTT parser improvements.

* Split findNextCueHeader and validateWebVttHeader into static methods.
  This is a step toward WebVTT in HLS, where we'll need to re-use these
  to peek at the top of the WebVTT file (they'll be moved into a util
  class).
* Made parser robust against bad cue headers + added a test.
* Removed spurious looking assertion in WebvttSubtitle.
This commit is contained in:
Oliver Woodman 2015-11-25 17:05:20 +00:00
parent 963e604ffd
commit 72f093c4f6
4 changed files with 87 additions and 49 deletions

View File

@ -0,0 +1,10 @@
WEBVTT # This comment is allowed
00:00.000 --> 00:01.234
This is the first subtitle.
00:02.badbadbadbadbadbad --> 00:03.456
This is the second subtitle.
00:04.000 --> 00:05.000
This is the third subtitle.

View File

@ -33,6 +33,7 @@ public class WebvttParserTest extends InstrumentationTestCase {
private static final String TYPICAL_WITH_IDS_FILE = "webvtt/typical_with_identifiers";
private static final String TYPICAL_WITH_COMMENTS_FILE = "webvtt/typical_with_comments";
private static final String WITH_POSITIONING_FILE = "webvtt/with_positioning";
private static final String WITH_BAD_CUE_HEADER_FILE = "webvtt/with_bad_cue_header";
private static final String WITH_TAGS_FILE = "webvtt/with_tags";
private static final String EMPTY_FILE = "webvtt/empty";
@ -132,6 +133,20 @@ public class WebvttParserTest extends InstrumentationTestCase {
Cue.ANCHOR_TYPE_END, 0.1f);
}
public void testParseWithBadCueHeader() throws IOException {
WebvttParser parser = new WebvttParser();
InputStream inputStream =
getInstrumentation().getContext().getResources().getAssets().open(WITH_BAD_CUE_HEADER_FILE);
WebvttSubtitle subtitle = parser.parse(inputStream);
// test event count
assertEquals(4, subtitle.getEventTimeCount());
// test cues
assertCue(subtitle, 0, 0, 1234000, "This is the first subtitle.");
assertCue(subtitle, 2, 4000000, 5000000, "This is the third subtitle.");
}
private static void assertCue(WebvttSubtitle subtitle, int eventTimeIndex, long startTimeUs,
int endTimeUs, String text) {
assertCue(subtitle, eventTimeIndex, startTimeUs, endTimeUs, text, null, Cue.DIMEN_UNSET,

View File

@ -23,6 +23,7 @@ import com.google.android.exoplayer.util.MimeTypes;
import android.text.Html;
import android.text.Layout.Alignment;
import android.text.TextUtils;
import android.util.Log;
import java.io.BufferedReader;
@ -43,9 +44,9 @@ public final class WebvttParser implements SubtitleParser {
private static final String TAG = "WebvttParser";
private static final Pattern HEADER = Pattern.compile("^\uFEFF?WEBVTT((\u0020|\u0009).*)?$");
private static final Pattern COMMENT_BLOCK = Pattern.compile("^NOTE((\u0020|\u0009).*)?$");
private static final Pattern COMMENT = Pattern.compile("^NOTE((\u0020|\u0009).*)?$");
private static final Pattern CUE_HEADER = Pattern.compile("^(\\S+)\\s+-->\\s+(\\S+)(.*)?$");
private static final Pattern CUE_SETTING = Pattern.compile("\\S+?:\\S+");
private static final Pattern CUE_SETTING = Pattern.compile("(\\S+?):(\\S+)");
private final PositionHolder positionHolder;
private final StringBuilder textBuilder;
@ -62,51 +63,26 @@ public final class WebvttParser implements SubtitleParser {
@Override
public final WebvttSubtitle parse(InputStream inputStream) throws IOException {
ArrayList<WebvttCue> subtitles = new ArrayList<>();
BufferedReader webvttData = new BufferedReader(new InputStreamReader(inputStream, C.UTF8_NAME));
String line;
// File should start with "WEBVTT".
line = webvttData.readLine();
if (line == null || !HEADER.matcher(line).matches()) {
throw new ParserException("Expected WEBVTT. Got " + line);
}
// Validate the first line of the header, and skip the remainder.
validateWebvttHeaderLine(webvttData);
while (!TextUtils.isEmpty(webvttData.readLine())) {}
// Parse the remainder of the header.
while (true) {
line = webvttData.readLine();
if (line == null) {
// We reached EOF before finishing the header.
throw new ParserException("Expected an empty line after webvtt header");
} else if (line.isEmpty()) {
// We read the newline that separates the header from the body.
break;
}
}
// Process the cues and text.
while ((line = webvttData.readLine()) != null) {
// Skip a comment block, if present.
Matcher matcher = COMMENT_BLOCK.matcher(line);
if (matcher.find()) {
// Skip until the end of the comment block.
while ((line = webvttData.readLine()) != null && !line.isEmpty()) {
// Ignore comment text.
}
ArrayList<WebvttCue> subtitles = new ArrayList<>();
Matcher cueHeaderMatcher;
while ((cueHeaderMatcher = findNextCueHeader(webvttData)) != null) {
long cueStartTime;
long cueEndTime;
try {
// Parse the cue start and end times.
cueStartTime = parseTimestampUs(cueHeaderMatcher.group(1));
cueEndTime = parseTimestampUs(cueHeaderMatcher.group(2));
} catch (NumberFormatException e) {
Log.w(TAG, "Skipping cue with bad header: " + cueHeaderMatcher.group());
continue;
}
// Skip anything other than a cue header.
matcher = CUE_HEADER.matcher(line);
if (!matcher.matches()) {
continue;
}
// Parse the cue start and end times.
long cueStartTime = parseTimestampUs(matcher.group(1));
long cueEndTime = parseTimestampUs(matcher.group(2));
// Default cue settings.
Alignment cueTextAlignment = null;
float cueLine = Cue.DIMEN_UNSET;
@ -117,12 +93,10 @@ public final class WebvttParser implements SubtitleParser {
float cueWidth = Cue.DIMEN_UNSET;
// Parse the cue settings list.
matcher = CUE_SETTING.matcher(matcher.group(3));
while (matcher.find()) {
String match = matcher.group();
String[] parts = match.split(":", 2);
String name = parts[0];
String value = parts[1];
Matcher cueSettingMatcher = CUE_SETTING.matcher(cueHeaderMatcher.group(3));
while (cueSettingMatcher.find()) {
String name = cueSettingMatcher.group(1);
String value = cueSettingMatcher.group(2);
try {
if ("line".equals(name)) {
parseLineAttribute(value, positionHolder);
@ -141,7 +115,7 @@ public final class WebvttParser implements SubtitleParser {
Log.w(TAG, "Unknown cue setting " + name + ":" + value);
}
} catch (NumberFormatException e) {
Log.w(TAG, e.getMessage() + ": " + match);
Log.w(TAG, "Skipping bad cue setting: " + cueSettingMatcher.group());
}
}
@ -153,6 +127,7 @@ public final class WebvttParser implements SubtitleParser {
// Parse the cue text.
textBuilder.setLength(0);
String line;
while ((line = webvttData.readLine()) != null && !line.isEmpty()) {
if (textBuilder.length() > 0) {
textBuilder.append("<br>");
@ -169,6 +144,45 @@ public final class WebvttParser implements SubtitleParser {
return new WebvttSubtitle(subtitles);
}
/**
* Reads and validates the first line of a WebVTT file.
*
* @param input The input from which the line should be read.
* @throws ParserException If the line isn't the start of a valid WebVTT file.
* @throws IOException If an error occurs reading from the input.
*/
private static void validateWebvttHeaderLine(BufferedReader input) throws IOException {
String line = input.readLine();
if (line == null || !HEADER.matcher(line).matches()) {
throw new ParserException("Expected WEBVTT. Got " + line);
}
}
/**
* Reads lines up to and including the next WebVTT cue header.
*
* @param input The input from which lines should be read.
* @throws IOException If an error occurs reading from the input.
* @return A {@link Matcher} for the WebVTT cue header, or null if the end of the input was
* reached without a cue header being found. In the case that a cue header is found, groups 1,
* 2 and 3 of the returned matcher contain the start time, end time and settings list.
*/
private static Matcher findNextCueHeader(BufferedReader input) throws IOException {
String line;
while ((line = input.readLine()) != null) {
if (COMMENT.matcher(line).matches()) {
// Skip until the end of the comment block.
while ((line = input.readLine()) != null && !line.isEmpty()) {}
} else {
Matcher cueHeaderMatcher = CUE_HEADER.matcher(line);
if (cueHeaderMatcher.matches()) {
return cueHeaderMatcher;
}
}
}
return null;
}
private static long parseTimestampUs(String s) throws NumberFormatException {
long value = 0;
String[] parts = s.split("\\.", 2);

View File

@ -56,7 +56,6 @@ public final class WebvttSubtitle implements Subtitle {
@Override
public int getNextEventTimeIndex(long timeUs) {
Assertions.checkArgument(timeUs >= 0);
int index = Util.binarySearchCeil(sortedCueTimesUs, timeUs, false, false);
return index < sortedCueTimesUs.length ? index : -1;
}