From fe19d8c9be94bbf1a1be8d3f49b2de52f0e2f1ae Mon Sep 17 00:00:00 2001 From: ibaker Date: Tue, 25 Feb 2025 03:16:29 -0800 Subject: [PATCH] Fix casting in `ParsableByteArray.peekCharacterAndSize` This was introduced in https://github.com/androidx/media/commit/841bdc6efe784ff3194813aa656d4cad6b0e9a39. There's no need to cast the `char` (2 bytes) down to a `byte` in order to pack it into an `int` (4 bytes) alongside a short (2 bytes). We also don't need to use `short` to track the character count (max 4 with UTF-8) - a single byte is enough. This change also uses `Ints.fromBytes` to avoid having to reason about how casting & bit-shifting interact. This change introduces a test which reproduces the failure reported in Issue: androidx/media#2167. #cherrypick PiperOrigin-RevId: 730809219 --- RELEASENOTES.md | 2 ++ .../media3/common/util/ParsableByteArray.java | 21 +++++++++------- .../common/util/ParsableByteArrayTest.java | 25 +++++++++++++++++++ .../text/webvtt/WebvttParserTest.java | 13 ++++++++++ .../with_wide_unicode_chars_and_cr_endings | 1 + 5 files changed, 53 insertions(+), 9 deletions(-) create mode 100644 libraries/test_data/src/test/assets/media/webvtt/with_wide_unicode_chars_and_cr_endings diff --git a/RELEASENOTES.md b/RELEASENOTES.md index 2d8a5b5f1a..9da943fb0d 100644 --- a/RELEASENOTES.md +++ b/RELEASENOTES.md @@ -17,6 +17,8 @@ * Audio: * Video: * Text: + * Fix handling of multi-byte UTF-8 characters in WebVTT files using CR + line endings ([#2167](https://github.com/androidx/media/issues/2167)). * Metadata: * Image: * DataSource: diff --git a/libraries/common/src/main/java/androidx/media3/common/util/ParsableByteArray.java b/libraries/common/src/main/java/androidx/media3/common/util/ParsableByteArray.java index ecb7f4cf8e..11bb6db7a6 100644 --- a/libraries/common/src/main/java/androidx/media3/common/util/ParsableByteArray.java +++ b/libraries/common/src/main/java/androidx/media3/common/util/ParsableByteArray.java @@ -19,7 +19,6 @@ import androidx.annotation.Nullable; import com.google.common.collect.ImmutableSet; import com.google.common.primitives.Chars; import com.google.common.primitives.Ints; -import com.google.common.primitives.UnsignedBytes; import com.google.errorprone.annotations.CheckReturnValue; import java.nio.ByteBuffer; import java.nio.charset.Charset; @@ -721,8 +720,8 @@ public final class ParsableByteArray { /** * Peeks at the character at {@link #position} (as decoded by {@code charset}), returns it and the - * number of bytes the character takes up within the array packed into an int. First four bytes - * are the character and the second four is the size in bytes it takes. Returns 0 if {@link + * number of bytes the character takes up within the array packed into an int. First two bytes are + * the character and the second two is the size in bytes it takes. Returns 0 if {@link * #bytesLeft()} doesn't allow reading a whole character in {@code charset} or if the {@code * charset} is not one of US_ASCII, UTF-8, UTF-16, UTF-16BE, or UTF-16LE. * @@ -730,23 +729,27 @@ public final class ParsableByteArray { * bytes for UTF-16). */ private int peekCharacterAndSize(Charset charset) { - byte character; - short characterSize; + byte charByte1; + byte charByte2; + byte characterSize; if ((charset.equals(StandardCharsets.UTF_8) || charset.equals(StandardCharsets.US_ASCII)) && bytesLeft() >= 1) { - character = (byte) Chars.checkedCast(UnsignedBytes.toInt(data[position])); + charByte1 = 0; + charByte2 = data[position]; characterSize = 1; } else if ((charset.equals(StandardCharsets.UTF_16) || charset.equals(StandardCharsets.UTF_16BE)) && bytesLeft() >= 2) { - character = (byte) Chars.fromBytes(data[position], data[position + 1]); + charByte1 = data[position]; + charByte2 = data[position + 1]; characterSize = 2; } else if (charset.equals(StandardCharsets.UTF_16LE) && bytesLeft() >= 2) { - character = (byte) Chars.fromBytes(data[position + 1], data[position]); + charByte1 = data[position + 1]; + charByte2 = data[position]; characterSize = 2; } else { return 0; } - return (Chars.checkedCast(character) << Short.SIZE) + characterSize; + return Ints.fromBytes(charByte1, charByte2, (byte) 0, characterSize); } } diff --git a/libraries/common/src/test/java/androidx/media3/common/util/ParsableByteArrayTest.java b/libraries/common/src/test/java/androidx/media3/common/util/ParsableByteArrayTest.java index 76b0664a2f..11842b9c79 100644 --- a/libraries/common/src/test/java/androidx/media3/common/util/ParsableByteArrayTest.java +++ b/libraries/common/src/test/java/androidx/media3/common/util/ParsableByteArrayTest.java @@ -679,6 +679,31 @@ public final class ParsableByteArrayTest { assertThat(parser.readLine()).isNull(); } + @Test + public void readTwoLinesWithCr_utf8() { + byte[] bytes = "foo\rbar".getBytes(StandardCharsets.UTF_8); + ParsableByteArray parser = new ParsableByteArray(bytes); + + assertThat(parser.readLine()).isEqualTo("foo"); + assertThat(parser.getPosition()).isEqualTo(4); + assertThat(parser.readLine()).isEqualTo("bar"); + assertThat(parser.getPosition()).isEqualTo(7); + assertThat(parser.readLine()).isNull(); + } + + // https://github.com/androidx/media/issues/2167 + @Test + public void readTwoLinesWithCrAndWideChar_utf8() { + byte[] bytes = "foo\r\uD83D\uDE1B".getBytes(StandardCharsets.UTF_8); + ParsableByteArray parser = new ParsableByteArray(bytes); + + assertThat(parser.readLine()).isEqualTo("foo"); + assertThat(parser.getPosition()).isEqualTo(4); + assertThat(parser.readLine()).isEqualTo("\uD83D\uDE1B"); + assertThat(parser.getPosition()).isEqualTo(8); + assertThat(parser.readLine()).isNull(); + } + @Test public void readTwoLinesWithCrFollowedByLf_utf8() { byte[] bytes = "foo\r\nbar".getBytes(StandardCharsets.UTF_8); diff --git a/libraries/extractor/src/test/java/androidx/media3/extractor/text/webvtt/WebvttParserTest.java b/libraries/extractor/src/test/java/androidx/media3/extractor/text/webvtt/WebvttParserTest.java index e0834919c0..708249661a 100644 --- a/libraries/extractor/src/test/java/androidx/media3/extractor/text/webvtt/WebvttParserTest.java +++ b/libraries/extractor/src/test/java/androidx/media3/extractor/text/webvtt/WebvttParserTest.java @@ -56,6 +56,8 @@ public class WebvttParserTest { "media/webvtt/with_overlapping_timestamps"; private static final String WITH_VERTICAL_FILE = "media/webvtt/with_vertical"; private static final String WITH_RUBIES_FILE = "media/webvtt/with_rubies"; + private static final String WITH_WIDE_CHARS_AND_CR_ENDINGS_FILE = + "media/webvtt/with_wide_unicode_chars_and_cr_endings"; private static final String WITH_BAD_CUE_HEADER_FILE = "media/webvtt/with_bad_cue_header"; private static final String WITH_TAGS_FILE = "media/webvtt/with_tags"; private static final String WITH_CSS_STYLES = "media/webvtt/with_css_styles"; @@ -497,6 +499,17 @@ public class WebvttParserTest { assertThat((Spanned) fourthCue.text).hasNoSpans(); } + // https://github.com/androidx/media/issues/2167 + @Test + public void parseWithWideUnicodeCharsAndCrLineEndings() throws Exception { + ImmutableList allCues = + getCuesForTestAsset(WITH_WIDE_CHARS_AND_CR_ENDINGS_FILE); + + // WebvttCueParser normalizes all line endings within cue text to \n. + assertThat(Iterables.getOnlyElement(Iterables.getOnlyElement(allCues).cues).text.toString()) + .isEqualTo("\uD83D\uDE1B\n\uD83D\uDE1B"); + } + @Test public void parseWithBadCueHeader() throws Exception { List allCues = getCuesForTestAsset(WITH_BAD_CUE_HEADER_FILE); diff --git a/libraries/test_data/src/test/assets/media/webvtt/with_wide_unicode_chars_and_cr_endings b/libraries/test_data/src/test/assets/media/webvtt/with_wide_unicode_chars_and_cr_endings new file mode 100644 index 0000000000..4962f26ffa --- /dev/null +++ b/libraries/test_data/src/test/assets/media/webvtt/with_wide_unicode_chars_and_cr_endings @@ -0,0 +1 @@ +WEBVTT 00:00.000 --> 00:01.234 😛 😛 \ No newline at end of file