From 6e58ca6baad6f1ad4e35818b1e561d54e6b79a1a Mon Sep 17 00:00:00 2001 From: Ian Baker Date: Mon, 12 Dec 2022 10:55:15 +0000 Subject: [PATCH] Merge pull request #10750 from Stronger197:subrip_utf_16 PiperOrigin-RevId: 492164739 (cherry picked from commit a9191418051a19681ddf884163ac5553871ec658) --- RELEASENOTES.md | 3 + .../media3/common/util/ParsableByteArray.java | 162 ++++++++-- .../common/util/ParsableByteArrayTest.java | 298 +++++++++++++++++- .../extractor/text/subrip/SubripDecoder.java | 20 +- .../extractor/text/tx3g/Tx3gDecoder.java | 18 +- .../text/subrip/SubripDecoderTest.java | 28 ++ .../test/assets/media/subrip/typical_utf16be | Bin 0 -> 434 bytes .../test/assets/media/subrip/typical_utf16le | Bin 0 -> 434 bytes 8 files changed, 476 insertions(+), 53 deletions(-) create mode 100644 libraries/test_data/src/test/assets/media/subrip/typical_utf16be create mode 100644 libraries/test_data/src/test/assets/media/subrip/typical_utf16le diff --git a/RELEASENOTES.md b/RELEASENOTES.md index 0d6f026f79..ce0948036b 100644 --- a/RELEASENOTES.md +++ b/RELEASENOTES.md @@ -16,6 +16,9 @@ * Audio: * Use the compressed audio format bitrate to calculate the min buffer size for `AudioTrack` in direct playbacks (passthrough). +* Text: + * SubRip: Add support for UTF-16 files if they start with a byte order + mark. * Session: * Add helper method to convert platform session token to Media3 `SessionToken` ([#171](https://github.com/androidx/media/issues/171)). 
diff --git a/libraries/common/src/main/java/androidx/media3/common/util/ParsableByteArray.java b/libraries/common/src/main/java/androidx/media3/common/util/ParsableByteArray.java index 0367ab8f22..bd1117bc78 100644 --- a/libraries/common/src/main/java/androidx/media3/common/util/ParsableByteArray.java +++ b/libraries/common/src/main/java/androidx/media3/common/util/ParsableByteArray.java @@ -17,6 +17,9 @@ package androidx.media3.common.util; import androidx.annotation.Nullable; import com.google.common.base.Charsets; +import com.google.common.collect.ImmutableSet; +import com.google.common.primitives.Chars; +import com.google.common.primitives.UnsignedBytes; import java.nio.ByteBuffer; import java.nio.charset.Charset; import java.util.Arrays; @@ -28,6 +31,12 @@ import java.util.Arrays; @UnstableApi public final class ParsableByteArray { + private static final char[] CR_AND_LF = {'\r', '\n'}; + private static final char[] LF = {'\n'}; + private static final ImmutableSet SUPPORTED_CHARSETS_FOR_READLINE = + ImmutableSet.of( + Charsets.US_ASCII, Charsets.UTF_8, Charsets.UTF_16, Charsets.UTF_16BE, Charsets.UTF_16LE); + private byte[] data; private int position; // TODO(internal b/147657250): Enforce this limit on all read methods. @@ -490,45 +499,47 @@ public final class ParsableByteArray { } /** - * Reads a line of text. + * Reads a line of text in UTF-8. * - *

<p>A line is considered to be terminated by any one of a carriage return ('\r'), a line feed - * ('\n'), or a carriage return followed immediately by a line feed ('\r\n'). The UTF-8 charset is - * used. This method discards leading UTF-8 byte order marks, if present. - * - * @return The line not including any line-termination characters, or null if the end of the data - * has already been reached. + *

<p>Equivalent to passing {@link Charsets#UTF_8} to {@link #readLine(Charset)}. */ @Nullable public String readLine() { + return readLine(Charsets.UTF_8); + } + + /** + * Reads a line of text in {@code charset}. + * + *

<p>A line is considered to be terminated by any one of a carriage return ('\r'), a line feed + * ('\n'), or a carriage return followed immediately by a line feed ('\r\n'). This method discards + * leading UTF byte order marks (BOM), if present. + * + *

<p>The {@linkplain #getPosition() position} is advanced to the start of the next line (i.e. any + * line terminators are skipped). + * + * @param charset The charset used to interpret the bytes as a {@link String}. + * @return The line not including any line-termination characters, or null if the end of the data + * has already been reached. + * @throws IllegalArgumentException if charset is not supported. Only US_ASCII, UTF-8, UTF-16, + * UTF-16BE, and UTF-16LE are supported. + */ + @Nullable + public String readLine(Charset charset) { + Assertions.checkArgument( + SUPPORTED_CHARSETS_FOR_READLINE.contains(charset), "Unsupported charset: " + charset); if (bytesLeft() == 0) { return null; } - int lineLimit = position; - while (lineLimit < limit && !Util.isLinebreak(data[lineLimit])) { - lineLimit++; + if (!charset.equals(Charsets.US_ASCII)) { + readUtfCharsetFromBom(); // Skip BOM if present } - if (lineLimit - position >= 3 - && data[position] == (byte) 0xEF - && data[position + 1] == (byte) 0xBB - && data[position + 2] == (byte) 0xBF) { - // There's a UTF-8 byte order mark at the start of the line. Discard it. - position += 3; - } - String line = Util.fromUtf8Bytes(data, position, lineLimit - position); - position = lineLimit; + int lineLimit = findNextLineTerminator(charset); + String line = readString(lineLimit - position, charset); if (position == limit) { return line; } - if (data[position] == '\r') { - position++; - if (position == limit) { - return line; - } - } - if (data[position] == '\n') { - position++; - } + skipLineTerminator(charset); return line; } @@ -566,4 +577,99 @@ public final class ParsableByteArray { position += length; return value; } + + /** + * Reads a UTF byte order mark (BOM) and returns the UTF {@link Charset} it represents. Returns + * {@code null} without advancing {@link #getPosition() position} if no BOM is found. 
+ */ + @Nullable + public Charset readUtfCharsetFromBom() { + if (bytesLeft() >= 3 + && data[position] == (byte) 0xEF + && data[position + 1] == (byte) 0xBB + && data[position + 2] == (byte) 0xBF) { + position += 3; + return Charsets.UTF_8; + } else if (bytesLeft() >= 2) { + if (data[position] == (byte) 0xFE && data[position + 1] == (byte) 0xFF) { + position += 2; + return Charsets.UTF_16BE; + } else if (data[position] == (byte) 0xFF && data[position + 1] == (byte) 0xFE) { + position += 2; + return Charsets.UTF_16LE; + } + } + return null; + } + + /** + * Returns the index of the next occurrence of '\n' or '\r', or {@link #limit} if none is found. + */ + private int findNextLineTerminator(Charset charset) { + int stride; + if (charset.equals(Charsets.UTF_8) || charset.equals(Charsets.US_ASCII)) { + stride = 1; + } else if (charset.equals(Charsets.UTF_16) + || charset.equals(Charsets.UTF_16LE) + || charset.equals(Charsets.UTF_16BE)) { + stride = 2; + } else { + throw new IllegalArgumentException("Unsupported charset: " + charset); + } + for (int i = position; i < limit - (stride - 1); i += stride) { + if ((charset.equals(Charsets.UTF_8) || charset.equals(Charsets.US_ASCII)) + && Util.isLinebreak(data[i])) { + return i; + } else if ((charset.equals(Charsets.UTF_16) || charset.equals(Charsets.UTF_16BE)) + && data[i] == 0x00 + && Util.isLinebreak(data[i + 1])) { + return i; + } else if (charset.equals(Charsets.UTF_16LE) + && data[i + 1] == 0x00 + && Util.isLinebreak(data[i])) { + return i; + } + } + return limit; + } + + private void skipLineTerminator(Charset charset) { + if (readCharacterIfInList(charset, CR_AND_LF) == '\r') { + readCharacterIfInList(charset, LF); + } + } + + /** + * Peeks at the character at {@link #position} (as decoded by {@code charset}), returns it and + * advances {@link #position} past it if it's in {@code chars}, otherwise returns {@code 0} + * without advancing {@link #position}. 
Returns {@code 0} if {@link #bytesLeft()} doesn't allow + * reading a whole character in {@code charset}. + * + *

Only supports characters in {@code chars} that occupy a single code unit (i.e. one byte for + * UTF-8 and two bytes for UTF-16). + */ + private char readCharacterIfInList(Charset charset, char[] chars) { + char character; + int characterSize; + if ((charset.equals(Charsets.UTF_8) || charset.equals(Charsets.US_ASCII)) && bytesLeft() >= 1) { + character = Chars.checkedCast(UnsignedBytes.toInt(data[position])); + characterSize = 1; + } else if ((charset.equals(Charsets.UTF_16) || charset.equals(Charsets.UTF_16BE)) + && bytesLeft() >= 2) { + character = Chars.fromBytes(data[position], data[position + 1]); + characterSize = 2; + } else if (charset.equals(Charsets.UTF_16LE) && bytesLeft() >= 2) { + character = Chars.fromBytes(data[position + 1], data[position]); + characterSize = 2; + } else { + return 0; + } + + if (Chars.contains(chars, character)) { + position += characterSize; + return Chars.checkedCast(character); + } else { + return 0; + } + } } diff --git a/libraries/common/src/test/java/androidx/media3/common/util/ParsableByteArrayTest.java b/libraries/common/src/test/java/androidx/media3/common/util/ParsableByteArrayTest.java index ddaf7ee981..cddf95c9f8 100644 --- a/libraries/common/src/test/java/androidx/media3/common/util/ParsableByteArrayTest.java +++ b/libraries/common/src/test/java/androidx/media3/common/util/ParsableByteArrayTest.java @@ -15,11 +15,13 @@ */ package androidx.media3.common.util; +import static androidx.media3.test.utils.TestUtil.createByteArray; import static com.google.common.truth.Truth.assertThat; import static java.nio.charset.Charset.forName; import static org.junit.Assert.fail; import androidx.test.ext.junit.runners.AndroidJUnit4; +import com.google.common.base.Charsets; import com.google.common.primitives.Bytes; import java.nio.ByteBuffer; import java.util.Arrays; @@ -548,48 +550,324 @@ public final class ParsableByteArrayTest { } @Test - public void readSingleLineWithoutEndingTrail() { - byte[] bytes = new byte[] {'f', 'o', 'o'}; + 
public void readSingleLineWithoutEndingTrail_ascii() { + byte[] bytes = "foo".getBytes(Charsets.US_ASCII); ParsableByteArray parser = new ParsableByteArray(bytes); + + assertThat(parser.readLine(Charsets.US_ASCII)).isEqualTo("foo"); + assertThat(parser.getPosition()).isEqualTo(3); + assertThat(parser.readLine(Charsets.US_ASCII)).isNull(); + } + + @Test + public void readSingleLineWithEndingLf_ascii() { + byte[] bytes = "foo\n".getBytes(Charsets.US_ASCII); + ParsableByteArray parser = new ParsableByteArray(bytes); + + assertThat(parser.readLine(Charsets.US_ASCII)).isEqualTo("foo"); + assertThat(parser.getPosition()).isEqualTo(4); + assertThat(parser.readLine(Charsets.US_ASCII)).isNull(); + } + + @Test + public void readTwoLinesWithCrFollowedByLf_ascii() { + byte[] bytes = "foo\r\nbar".getBytes(Charsets.US_ASCII); + ParsableByteArray parser = new ParsableByteArray(bytes); + + assertThat(parser.readLine(Charsets.US_ASCII)).isEqualTo("foo"); + assertThat(parser.getPosition()).isEqualTo(5); + assertThat(parser.readLine(Charsets.US_ASCII)).isEqualTo("bar"); + assertThat(parser.getPosition()).isEqualTo(8); + assertThat(parser.readLine(Charsets.US_ASCII)).isNull(); + } + + @Test + public void readThreeLinesWithEmptyLine_ascii() { + byte[] bytes = "foo\r\n\rbar".getBytes(Charsets.US_ASCII); + ParsableByteArray parser = new ParsableByteArray(bytes); + + assertThat(parser.readLine(Charsets.US_ASCII)).isEqualTo("foo"); + assertThat(parser.getPosition()).isEqualTo(5); + assertThat(parser.readLine(Charsets.US_ASCII)).isEqualTo(""); + assertThat(parser.getPosition()).isEqualTo(6); + assertThat(parser.readLine(Charsets.US_ASCII)).isEqualTo("bar"); + assertThat(parser.getPosition()).isEqualTo(9); + assertThat(parser.readLine(Charsets.US_ASCII)).isNull(); + } + + @Test + public void readFourLinesWithLfFollowedByCr_ascii() { + byte[] bytes = "foo\n\r\rbar\r\n".getBytes(Charsets.US_ASCII); + ParsableByteArray parser = new ParsableByteArray(bytes); + + 
assertThat(parser.readLine(Charsets.US_ASCII)).isEqualTo("foo"); + assertThat(parser.getPosition()).isEqualTo(4); + assertThat(parser.readLine(Charsets.US_ASCII)).isEqualTo(""); + assertThat(parser.getPosition()).isEqualTo(5); + assertThat(parser.readLine(Charsets.US_ASCII)).isEqualTo(""); + assertThat(parser.getPosition()).isEqualTo(6); + assertThat(parser.readLine(Charsets.US_ASCII)).isEqualTo("bar"); + assertThat(parser.getPosition()).isEqualTo(11); + assertThat(parser.readLine(Charsets.US_ASCII)).isNull(); + } + + @Test + public void readSingleLineWithoutEndingTrail_utf8() { + byte[] bytes = "foo".getBytes(Charsets.UTF_8); + ParsableByteArray parser = new ParsableByteArray(bytes); + assertThat(parser.readLine()).isEqualTo("foo"); + assertThat(parser.getPosition()).isEqualTo(3); assertThat(parser.readLine()).isNull(); } @Test - public void readSingleLineWithEndingLf() { - byte[] bytes = new byte[] {'f', 'o', 'o', '\n'}; + public void readSingleLineWithEndingLf_utf8() { + byte[] bytes = "foo\n".getBytes(Charsets.UTF_8); ParsableByteArray parser = new ParsableByteArray(bytes); + assertThat(parser.readLine()).isEqualTo("foo"); + assertThat(parser.getPosition()).isEqualTo(4); assertThat(parser.readLine()).isNull(); } @Test - public void readTwoLinesWithCrFollowedByLf() { - byte[] bytes = new byte[] {'f', 'o', 'o', '\r', '\n', 'b', 'a', 'r'}; + public void readTwoLinesWithCrFollowedByLf_utf8() { + byte[] bytes = "foo\r\nbar".getBytes(Charsets.UTF_8); ParsableByteArray parser = new ParsableByteArray(bytes); + assertThat(parser.readLine()).isEqualTo("foo"); + assertThat(parser.getPosition()).isEqualTo(5); assertThat(parser.readLine()).isEqualTo("bar"); + assertThat(parser.getPosition()).isEqualTo(8); assertThat(parser.readLine()).isNull(); } @Test - public void readThreeLinesWithEmptyLine() { - byte[] bytes = new byte[] {'f', 'o', 'o', '\r', '\n', '\r', 'b', 'a', 'r'}; + public void readThreeLinesWithEmptyLineAndLeadingBom_utf8() { + byte[] bytes = + 
Bytes.concat(createByteArray(0xEF, 0xBB, 0xBF), "foo\r\n\rbar".getBytes(Charsets.UTF_8)); ParsableByteArray parser = new ParsableByteArray(bytes); + assertThat(parser.readLine()).isEqualTo("foo"); + assertThat(parser.getPosition()).isEqualTo(8); assertThat(parser.readLine()).isEqualTo(""); + assertThat(parser.getPosition()).isEqualTo(9); assertThat(parser.readLine()).isEqualTo("bar"); + assertThat(parser.getPosition()).isEqualTo(12); assertThat(parser.readLine()).isNull(); } @Test - public void readFourLinesWithLfFollowedByCr() { - byte[] bytes = new byte[] {'f', 'o', 'o', '\n', '\r', '\r', 'b', 'a', 'r', '\r', '\n'}; + public void readFourLinesWithLfFollowedByCr_utf8() { + byte[] bytes = "foo\n\r\rbar\r\n".getBytes(Charsets.UTF_8); ParsableByteArray parser = new ParsableByteArray(bytes); + assertThat(parser.readLine()).isEqualTo("foo"); + assertThat(parser.getPosition()).isEqualTo(4); assertThat(parser.readLine()).isEqualTo(""); + assertThat(parser.getPosition()).isEqualTo(5); assertThat(parser.readLine()).isEqualTo(""); + assertThat(parser.getPosition()).isEqualTo(6); assertThat(parser.readLine()).isEqualTo("bar"); + assertThat(parser.getPosition()).isEqualTo(11); assertThat(parser.readLine()).isNull(); } + + @Test + public void readSingleLineWithoutEndingTrail_utf16() { + // Use UTF_16BE because we don't want the leading BOM that's added by getBytes(UTF_16). We + // explicitly test with a BOM elsewhere. + byte[] bytes = "foo".getBytes(Charsets.UTF_16BE); + ParsableByteArray parser = new ParsableByteArray(bytes); + + assertThat(parser.readLine(Charsets.UTF_16)).isEqualTo("foo"); + assertThat(parser.getPosition()).isEqualTo(6); + assertThat(parser.readLine(Charsets.UTF_16)).isNull(); + } + + @Test + public void readSingleLineWithEndingLf_utf16() { + // Use UTF_16BE because we don't want the leading BOM that's added by getBytes(UTF_16). We + // explicitly test with a BOM elsewhere. 
+ byte[] bytes = "foo\n".getBytes(Charsets.UTF_16BE); + ParsableByteArray parser = new ParsableByteArray(bytes); + + assertThat(parser.readLine(Charsets.UTF_16)).isEqualTo("foo"); + assertThat(parser.getPosition()).isEqualTo(8); + assertThat(parser.readLine(Charsets.UTF_16)).isNull(); + } + + @Test + public void readTwoLinesWithCrFollowedByLf_utf16() { + // Use UTF_16BE because we don't want the leading BOM that's added by getBytes(UTF_16). We + // explicitly test with a BOM elsewhere. + byte[] bytes = "foo\r\nbar".getBytes(Charsets.UTF_16BE); + ParsableByteArray parser = new ParsableByteArray(bytes); + + assertThat(parser.readLine(Charsets.UTF_16)).isEqualTo("foo"); + assertThat(parser.getPosition()).isEqualTo(10); + assertThat(parser.readLine(Charsets.UTF_16)).isEqualTo("bar"); + assertThat(parser.getPosition()).isEqualTo(16); + assertThat(parser.readLine(Charsets.UTF_16)).isNull(); + } + + @Test + public void readThreeLinesWithEmptyLineAndLeadingBom_utf16() { + // getBytes(UTF_16) always adds the leading BOM. + byte[] bytes = "foo\r\n\rbar".getBytes(Charsets.UTF_16); + ParsableByteArray parser = new ParsableByteArray(bytes); + + assertThat(parser.readLine(Charsets.UTF_16)).isEqualTo("foo"); + assertThat(parser.getPosition()).isEqualTo(12); + assertThat(parser.readLine(Charsets.UTF_16)).isEqualTo(""); + assertThat(parser.getPosition()).isEqualTo(14); + assertThat(parser.readLine(Charsets.UTF_16)).isEqualTo("bar"); + assertThat(parser.getPosition()).isEqualTo(20); + assertThat(parser.readLine(Charsets.UTF_16)).isNull(); + } + + @Test + public void readFourLinesWithLfFollowedByCr_utf16() { + // Use UTF_16BE because we don't want the leading BOM that's added by getBytes(UTF_16). We + // explicitly test with a BOM elsewhere. 
+ byte[] bytes = "foo\n\r\rbar\r\n".getBytes(Charsets.UTF_16BE); + ParsableByteArray parser = new ParsableByteArray(bytes); + + assertThat(parser.readLine(Charsets.UTF_16)).isEqualTo("foo"); + assertThat(parser.getPosition()).isEqualTo(8); + assertThat(parser.readLine(Charsets.UTF_16)).isEqualTo(""); + assertThat(parser.getPosition()).isEqualTo(10); + assertThat(parser.readLine(Charsets.UTF_16)).isEqualTo(""); + assertThat(parser.getPosition()).isEqualTo(12); + assertThat(parser.readLine(Charsets.UTF_16)).isEqualTo("bar"); + assertThat(parser.getPosition()).isEqualTo(22); + assertThat(parser.readLine(Charsets.UTF_16)).isNull(); + } + + @Test + public void readSingleLineWithoutEndingTrail_utf16be() { + byte[] bytes = "foo".getBytes(Charsets.UTF_16BE); + ParsableByteArray parser = new ParsableByteArray(bytes); + + assertThat(parser.readLine(Charsets.UTF_16BE)).isEqualTo("foo"); + assertThat(parser.getPosition()).isEqualTo(6); + assertThat(parser.readLine(Charsets.UTF_16BE)).isNull(); + } + + @Test + public void readSingleLineWithEndingLf_utf16be() { + byte[] bytes = "foo\n".getBytes(Charsets.UTF_16BE); + ParsableByteArray parser = new ParsableByteArray(bytes); + + assertThat(parser.readLine(Charsets.UTF_16BE)).isEqualTo("foo"); + assertThat(parser.getPosition()).isEqualTo(8); + assertThat(parser.readLine(Charsets.UTF_16BE)).isNull(); + } + + @Test + public void readTwoLinesWithCrFollowedByLf_utf16be() { + byte[] bytes = "foo\r\nbar".getBytes(Charsets.UTF_16BE); + ParsableByteArray parser = new ParsableByteArray(bytes); + + assertThat(parser.readLine(Charsets.UTF_16BE)).isEqualTo("foo"); + assertThat(parser.getPosition()).isEqualTo(10); + assertThat(parser.readLine(Charsets.UTF_16BE)).isEqualTo("bar"); + assertThat(parser.getPosition()).isEqualTo(16); + assertThat(parser.readLine(Charsets.UTF_16BE)).isNull(); + } + + @Test + public void readThreeLinesWithEmptyLineAndLeadingBom_utf16be() { + byte[] bytes = + Bytes.concat(createByteArray(0xFE, 0xFF), 
"foo\r\n\rbar".getBytes(Charsets.UTF_16BE)); + ParsableByteArray parser = new ParsableByteArray(bytes); + + assertThat(parser.readLine(Charsets.UTF_16BE)).isEqualTo("foo"); + assertThat(parser.getPosition()).isEqualTo(12); + assertThat(parser.readLine(Charsets.UTF_16BE)).isEqualTo(""); + assertThat(parser.getPosition()).isEqualTo(14); + assertThat(parser.readLine(Charsets.UTF_16BE)).isEqualTo("bar"); + assertThat(parser.getPosition()).isEqualTo(20); + assertThat(parser.readLine(Charsets.UTF_16BE)).isNull(); + } + + @Test + public void readFourLinesWithLfFollowedByCr_utf16be() { + byte[] bytes = "foo\n\r\rbar\r\n".getBytes(Charsets.UTF_16BE); + ParsableByteArray parser = new ParsableByteArray(bytes); + + assertThat(parser.readLine(Charsets.UTF_16BE)).isEqualTo("foo"); + assertThat(parser.getPosition()).isEqualTo(8); + assertThat(parser.readLine(Charsets.UTF_16BE)).isEqualTo(""); + assertThat(parser.getPosition()).isEqualTo(10); + assertThat(parser.readLine(Charsets.UTF_16BE)).isEqualTo(""); + assertThat(parser.getPosition()).isEqualTo(12); + assertThat(parser.readLine(Charsets.UTF_16BE)).isEqualTo("bar"); + assertThat(parser.getPosition()).isEqualTo(22); + assertThat(parser.readLine(Charsets.UTF_16BE)).isNull(); + } + + @Test + public void readSingleLineWithoutEndingTrail_utf16le() { + byte[] bytes = "foo".getBytes(Charsets.UTF_16LE); + ParsableByteArray parser = new ParsableByteArray(bytes); + + assertThat(parser.readLine(Charsets.UTF_16LE)).isEqualTo("foo"); + assertThat(parser.getPosition()).isEqualTo(6); + assertThat(parser.readLine(Charsets.UTF_16LE)).isNull(); + } + + @Test + public void readSingleLineWithEndingLf_utf16le() { + byte[] bytes = "foo\n".getBytes(Charsets.UTF_16LE); + ParsableByteArray parser = new ParsableByteArray(bytes); + + assertThat(parser.readLine(Charsets.UTF_16LE)).isEqualTo("foo"); + assertThat(parser.getPosition()).isEqualTo(8); + assertThat(parser.readLine(Charsets.UTF_16LE)).isNull(); + } + + @Test + public void 
readTwoLinesWithCrFollowedByLf_utf16le() { + byte[] bytes = "foo\r\nbar".getBytes(Charsets.UTF_16LE); + ParsableByteArray parser = new ParsableByteArray(bytes); + + assertThat(parser.readLine(Charsets.UTF_16LE)).isEqualTo("foo"); + assertThat(parser.getPosition()).isEqualTo(10); + assertThat(parser.readLine(Charsets.UTF_16LE)).isEqualTo("bar"); + assertThat(parser.getPosition()).isEqualTo(16); + assertThat(parser.readLine(Charsets.UTF_16LE)).isNull(); + } + + @Test + public void readThreeLinesWithEmptyLineAndLeadingBom_utf16le() { + byte[] bytes = + Bytes.concat(createByteArray(0xFF, 0xFE), "foo\r\n\rbar".getBytes(Charsets.UTF_16LE)); + ParsableByteArray parser = new ParsableByteArray(bytes); + + assertThat(parser.readLine(Charsets.UTF_16LE)).isEqualTo("foo"); + assertThat(parser.getPosition()).isEqualTo(12); + assertThat(parser.readLine(Charsets.UTF_16LE)).isEqualTo(""); + assertThat(parser.getPosition()).isEqualTo(14); + assertThat(parser.readLine(Charsets.UTF_16LE)).isEqualTo("bar"); + assertThat(parser.getPosition()).isEqualTo(20); + assertThat(parser.readLine(Charsets.UTF_16LE)).isNull(); + } + + @Test + public void readFourLinesWithLfFollowedByCr_utf16le() { + byte[] bytes = "foo\n\r\rbar\r\n".getBytes(Charsets.UTF_16LE); + ParsableByteArray parser = new ParsableByteArray(bytes); + + assertThat(parser.readLine(Charsets.UTF_16LE)).isEqualTo("foo"); + assertThat(parser.getPosition()).isEqualTo(8); + assertThat(parser.readLine(Charsets.UTF_16LE)).isEqualTo(""); + assertThat(parser.getPosition()).isEqualTo(10); + assertThat(parser.readLine(Charsets.UTF_16LE)).isEqualTo(""); + assertThat(parser.getPosition()).isEqualTo(12); + assertThat(parser.readLine(Charsets.UTF_16LE)).isEqualTo("bar"); + assertThat(parser.getPosition()).isEqualTo(22); + assertThat(parser.readLine(Charsets.UTF_16LE)).isNull(); + } } diff --git a/libraries/extractor/src/main/java/androidx/media3/extractor/text/subrip/SubripDecoder.java 
b/libraries/extractor/src/main/java/androidx/media3/extractor/text/subrip/SubripDecoder.java index 1ecc7f425d..6147ff92ad 100644 --- a/libraries/extractor/src/main/java/androidx/media3/extractor/text/subrip/SubripDecoder.java +++ b/libraries/extractor/src/main/java/androidx/media3/extractor/text/subrip/SubripDecoder.java @@ -27,6 +27,8 @@ import androidx.media3.common.util.ParsableByteArray; import androidx.media3.common.util.UnstableApi; import androidx.media3.extractor.text.SimpleSubtitleDecoder; import androidx.media3.extractor.text.Subtitle; +import com.google.common.base.Charsets; +import java.nio.charset.Charset; import java.util.ArrayList; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -76,9 +78,10 @@ public final class SubripDecoder extends SimpleSubtitleDecoder { ArrayList cues = new ArrayList<>(); LongArray cueTimesUs = new LongArray(); ParsableByteArray subripData = new ParsableByteArray(data, length); + Charset charset = detectUtfCharset(subripData); @Nullable String currentLine; - while ((currentLine = subripData.readLine()) != null) { + while ((currentLine = subripData.readLine(charset)) != null) { if (currentLine.length() == 0) { // Skip blank lines. continue; @@ -93,7 +96,7 @@ public final class SubripDecoder extends SimpleSubtitleDecoder { } // Read and parse the timing line. - currentLine = subripData.readLine(); + currentLine = subripData.readLine(charset); if (currentLine == null) { Log.w(TAG, "Unexpected end"); break; @@ -111,13 +114,13 @@ public final class SubripDecoder extends SimpleSubtitleDecoder { // Read and parse the text and tags. textBuilder.setLength(0); tags.clear(); - currentLine = subripData.readLine(); + currentLine = subripData.readLine(charset); while (!TextUtils.isEmpty(currentLine)) { if (textBuilder.length() > 0) { textBuilder.append("
"); } textBuilder.append(processLine(currentLine, tags)); - currentLine = subripData.readLine(); + currentLine = subripData.readLine(charset); } Spanned text = Html.fromHtml(textBuilder.toString()); @@ -140,6 +143,15 @@ public final class SubripDecoder extends SimpleSubtitleDecoder { return new SubripSubtitle(cuesArray, cueTimesUsArray); } + /** + * Determine UTF encoding of the byte array from a byte order mark (BOM), defaulting to UTF-8 if + * no BOM is found. + */ + private Charset detectUtfCharset(ParsableByteArray data) { + @Nullable Charset charset = data.readUtfCharsetFromBom(); + return charset != null ? charset : Charsets.UTF_8; + } + /** * Trims and removes tags from the given line. The removed tags are added to {@code tags}. * diff --git a/libraries/extractor/src/main/java/androidx/media3/extractor/text/tx3g/Tx3gDecoder.java b/libraries/extractor/src/main/java/androidx/media3/extractor/text/tx3g/Tx3gDecoder.java index e0339d8f97..e66888b807 100644 --- a/libraries/extractor/src/main/java/androidx/media3/extractor/text/tx3g/Tx3gDecoder.java +++ b/libraries/extractor/src/main/java/androidx/media3/extractor/text/tx3g/Tx3gDecoder.java @@ -26,6 +26,7 @@ import android.text.style.ForegroundColorSpan; import android.text.style.StyleSpan; import android.text.style.TypefaceSpan; import android.text.style.UnderlineSpan; +import androidx.annotation.Nullable; import androidx.media3.common.C; import androidx.media3.common.text.Cue; import androidx.media3.common.util.Log; @@ -36,6 +37,7 @@ import androidx.media3.extractor.text.SimpleSubtitleDecoder; import androidx.media3.extractor.text.Subtitle; import androidx.media3.extractor.text.SubtitleDecoderException; import com.google.common.base.Charsets; +import java.nio.charset.Charset; import java.util.List; /** @@ -48,16 +50,12 @@ public final class Tx3gDecoder extends SimpleSubtitleDecoder { private static final String TAG = "Tx3gDecoder"; - private static final char BOM_UTF16_BE = '\uFEFF'; - private static final char 
BOM_UTF16_LE = '\uFFFE'; - private static final int TYPE_STYL = 0x7374796c; private static final int TYPE_TBOX = 0x74626f78; private static final String TX3G_SERIF = "Serif"; private static final int SIZE_ATOM_HEADER = 8; private static final int SIZE_SHORT = 2; - private static final int SIZE_BOM_UTF16 = 2; private static final int SIZE_STYLE_RECORD = 12; private static final int FONT_FACE_BOLD = 0x0001; @@ -173,13 +171,11 @@ public final class Tx3gDecoder extends SimpleSubtitleDecoder { if (textLength == 0) { return ""; } - if (parsableByteArray.bytesLeft() >= SIZE_BOM_UTF16) { - char firstChar = parsableByteArray.peekChar(); - if (firstChar == BOM_UTF16_BE || firstChar == BOM_UTF16_LE) { - return parsableByteArray.readString(textLength, Charsets.UTF_16); - } - } - return parsableByteArray.readString(textLength, Charsets.UTF_8); + int textStartPosition = parsableByteArray.getPosition(); + @Nullable Charset charset = parsableByteArray.readUtfCharsetFromBom(); + int bomSize = parsableByteArray.getPosition() - textStartPosition; + return parsableByteArray.readString( + textLength - bomSize, charset != null ? 
charset : Charsets.UTF_8); } private void applyStyleRecord(ParsableByteArray parsableByteArray, SpannableStringBuilder cueText) diff --git a/libraries/extractor/src/test/java/androidx/media3/extractor/text/subrip/SubripDecoderTest.java b/libraries/extractor/src/test/java/androidx/media3/extractor/text/subrip/SubripDecoderTest.java index e9a4b8f8b8..259a72809d 100644 --- a/libraries/extractor/src/test/java/androidx/media3/extractor/text/subrip/SubripDecoderTest.java +++ b/libraries/extractor/src/test/java/androidx/media3/extractor/text/subrip/SubripDecoderTest.java @@ -40,6 +40,8 @@ public final class SubripDecoderTest { private static final String TYPICAL_NEGATIVE_TIMESTAMPS = "media/subrip/typical_negative_timestamps"; private static final String TYPICAL_UNEXPECTED_END = "media/subrip/typical_unexpected_end"; + private static final String TYPICAL_UTF16BE = "media/subrip/typical_utf16be"; + private static final String TYPICAL_UTF16LE = "media/subrip/typical_utf16le"; private static final String TYPICAL_WITH_TAGS = "media/subrip/typical_with_tags"; private static final String TYPICAL_NO_HOURS_AND_MILLIS = "media/subrip/typical_no_hours_and_millis"; @@ -148,6 +150,32 @@ public final class SubripDecoderTest { assertTypicalCue2(subtitle, 2); } + @Test + public void decodeTypicalUtf16LittleEndian() throws IOException { + SubripDecoder decoder = new SubripDecoder(); + byte[] bytes = + TestUtil.getByteArray(ApplicationProvider.getApplicationContext(), TYPICAL_UTF16LE); + Subtitle subtitle = decoder.decode(bytes, bytes.length, false); + + assertThat(subtitle.getEventTimeCount()).isEqualTo(6); + assertTypicalCue1(subtitle, 0); + assertTypicalCue2(subtitle, 2); + assertTypicalCue3(subtitle, 4); + } + + @Test + public void decodeTypicalUtf16BigEndian() throws IOException { + SubripDecoder decoder = new SubripDecoder(); + byte[] bytes = + TestUtil.getByteArray(ApplicationProvider.getApplicationContext(), TYPICAL_UTF16BE); + Subtitle subtitle = decoder.decode(bytes, 
bytes.length, false); + + assertThat(subtitle.getEventTimeCount()).isEqualTo(6); + assertTypicalCue1(subtitle, 0); + assertTypicalCue2(subtitle, 2); + assertTypicalCue3(subtitle, 4); + } + @Test public void decodeCueWithTag() throws IOException { SubripDecoder decoder = new SubripDecoder(); diff --git a/libraries/test_data/src/test/assets/media/subrip/typical_utf16be b/libraries/test_data/src/test/assets/media/subrip/typical_utf16be new file mode 100644 index 0000000000000000000000000000000000000000..9531c268087bec207cf8b766bc60ef01c13b354a GIT binary patch literal 434 zcmaKoYYM_J5QOJ$fhL z8D_R1{Qq)-*}-tqO|8x&-1|vHs%KDIh3R-#L%;p$w?V&&oGVf1wXJ&kW5gQ71~