From fe19d8c9be94bbf1a1be8d3f49b2de52f0e2f1ae Mon Sep 17 00:00:00 2001
From: ibaker <ibaker@google.com>
Date: Tue, 25 Feb 2025 03:16:29 -0800
Subject: [PATCH] Fix casting in `ParsableByteArray.peekCharacterAndSize`

This was introduced in https://github.com/androidx/media/commit/841bdc6efe784ff3194813aa656d4cad6b0e9a39.

There's no need to cast the `char` (2 bytes) down to a `byte` in order
to pack it into an `int` (4 bytes) alongside a short (2 bytes).

We also don't need to use `short` to track the character count (max 4
with UTF-8) - a single byte is enough. This change also uses
`Ints.fromBytes` to avoid having to reason about how casting &
bit-shifting interact.

This change introduces a test which reproduces the failure reported in
Issue: androidx/media#2167.

#cherrypick

PiperOrigin-RevId: 730809219
---
 RELEASENOTES.md                               |  2 ++
 .../media3/common/util/ParsableByteArray.java | 21 +++++++++-------
 .../common/util/ParsableByteArrayTest.java    | 25 +++++++++++++++++++
 .../text/webvtt/WebvttParserTest.java         | 13 ++++++++++
 .../with_wide_unicode_chars_and_cr_endings    |  1 +
 5 files changed, 53 insertions(+), 9 deletions(-)
 create mode 100644 libraries/test_data/src/test/assets/media/webvtt/with_wide_unicode_chars_and_cr_endings

diff --git a/RELEASENOTES.md b/RELEASENOTES.md
index 2d8a5b5f1a..9da943fb0d 100644
--- a/RELEASENOTES.md
+++ b/RELEASENOTES.md
@@ -17,6 +17,8 @@
 *   Audio:
 *   Video:
 *   Text:
+    *   Fix handling of multi-byte UTF-8 characters in WebVTT files using CR
+        line endings ([#2167](https://github.com/androidx/media/issues/2167)).
 *   Metadata:
 *   Image:
 *   DataSource:
diff --git a/libraries/common/src/main/java/androidx/media3/common/util/ParsableByteArray.java b/libraries/common/src/main/java/androidx/media3/common/util/ParsableByteArray.java
index ecb7f4cf8e..11bb6db7a6 100644
--- a/libraries/common/src/main/java/androidx/media3/common/util/ParsableByteArray.java
+++ b/libraries/common/src/main/java/androidx/media3/common/util/ParsableByteArray.java
@@ -19,7 +19,6 @@ import androidx.annotation.Nullable;
 import com.google.common.collect.ImmutableSet;
 import com.google.common.primitives.Chars;
 import com.google.common.primitives.Ints;
-import com.google.common.primitives.UnsignedBytes;
 import com.google.errorprone.annotations.CheckReturnValue;
 import java.nio.ByteBuffer;
 import java.nio.charset.Charset;
@@ -721,8 +720,8 @@ public final class ParsableByteArray {
 
   /**
    * Peeks at the character at {@link #position} (as decoded by {@code charset}), returns it and the
-   * number of bytes the character takes up within the array packed into an int. First four bytes
-   * are the character and the second four is the size in bytes it takes. Returns 0 if {@link
+   * number of bytes the character takes up within the array packed into an int. First two bytes are
+   * the character and the second two is the size in bytes it takes. Returns 0 if {@link
    * #bytesLeft()} doesn't allow reading a whole character in {@code charset} or if the {@code
    * charset} is not one of US_ASCII, UTF-8, UTF-16, UTF-16BE, or UTF-16LE.
    *
@@ -730,23 +729,27 @@ public final class ParsableByteArray {
    * bytes for UTF-16).
    */
   private int peekCharacterAndSize(Charset charset) {
-    byte character;
-    short characterSize;
+    byte charByte1;
+    byte charByte2;
+    byte characterSize;
     if ((charset.equals(StandardCharsets.UTF_8) || charset.equals(StandardCharsets.US_ASCII))
         && bytesLeft() >= 1) {
-      character = (byte) Chars.checkedCast(UnsignedBytes.toInt(data[position]));
+      charByte1 = 0;
+      charByte2 = data[position];
       characterSize = 1;
     } else if ((charset.equals(StandardCharsets.UTF_16)
             || charset.equals(StandardCharsets.UTF_16BE))
         && bytesLeft() >= 2) {
-      character = (byte) Chars.fromBytes(data[position], data[position + 1]);
+      charByte1 = data[position];
+      charByte2 = data[position + 1];
       characterSize = 2;
     } else if (charset.equals(StandardCharsets.UTF_16LE) && bytesLeft() >= 2) {
-      character = (byte) Chars.fromBytes(data[position + 1], data[position]);
+      charByte1 = data[position + 1];
+      charByte2 = data[position];
       characterSize = 2;
     } else {
       return 0;
     }
-    return (Chars.checkedCast(character) << Short.SIZE) + characterSize;
+    return Ints.fromBytes(charByte1, charByte2, (byte) 0, characterSize);
   }
 }
diff --git a/libraries/common/src/test/java/androidx/media3/common/util/ParsableByteArrayTest.java b/libraries/common/src/test/java/androidx/media3/common/util/ParsableByteArrayTest.java
index 76b0664a2f..11842b9c79 100644
--- a/libraries/common/src/test/java/androidx/media3/common/util/ParsableByteArrayTest.java
+++ b/libraries/common/src/test/java/androidx/media3/common/util/ParsableByteArrayTest.java
@@ -679,6 +679,31 @@ public final class ParsableByteArrayTest {
     assertThat(parser.readLine()).isNull();
   }
 
+  @Test
+  public void readTwoLinesWithCr_utf8() {
+    byte[] bytes = "foo\rbar".getBytes(StandardCharsets.UTF_8);
+    ParsableByteArray parser = new ParsableByteArray(bytes);
+
+    assertThat(parser.readLine()).isEqualTo("foo");
+    assertThat(parser.getPosition()).isEqualTo(4);
+    assertThat(parser.readLine()).isEqualTo("bar");
+    assertThat(parser.getPosition()).isEqualTo(7);
+    assertThat(parser.readLine()).isNull();
+  }
+
+  // https://github.com/androidx/media/issues/2167
+  @Test
+  public void readTwoLinesWithCrAndWideChar_utf8() {
+    byte[] bytes = "foo\r\uD83D\uDE1B".getBytes(StandardCharsets.UTF_8);
+    ParsableByteArray parser = new ParsableByteArray(bytes);
+
+    assertThat(parser.readLine()).isEqualTo("foo");
+    assertThat(parser.getPosition()).isEqualTo(4);
+    assertThat(parser.readLine()).isEqualTo("\uD83D\uDE1B");
+    assertThat(parser.getPosition()).isEqualTo(8);
+    assertThat(parser.readLine()).isNull();
+  }
+
   @Test
   public void readTwoLinesWithCrFollowedByLf_utf8() {
     byte[] bytes = "foo\r\nbar".getBytes(StandardCharsets.UTF_8);
diff --git a/libraries/extractor/src/test/java/androidx/media3/extractor/text/webvtt/WebvttParserTest.java b/libraries/extractor/src/test/java/androidx/media3/extractor/text/webvtt/WebvttParserTest.java
index e0834919c0..708249661a 100644
--- a/libraries/extractor/src/test/java/androidx/media3/extractor/text/webvtt/WebvttParserTest.java
+++ b/libraries/extractor/src/test/java/androidx/media3/extractor/text/webvtt/WebvttParserTest.java
@@ -56,6 +56,8 @@ public class WebvttParserTest {
       "media/webvtt/with_overlapping_timestamps";
   private static final String WITH_VERTICAL_FILE = "media/webvtt/with_vertical";
   private static final String WITH_RUBIES_FILE = "media/webvtt/with_rubies";
+  private static final String WITH_WIDE_CHARS_AND_CR_ENDINGS_FILE =
+      "media/webvtt/with_wide_unicode_chars_and_cr_endings";
   private static final String WITH_BAD_CUE_HEADER_FILE = "media/webvtt/with_bad_cue_header";
   private static final String WITH_TAGS_FILE = "media/webvtt/with_tags";
   private static final String WITH_CSS_STYLES = "media/webvtt/with_css_styles";
@@ -497,6 +499,17 @@ public class WebvttParserTest {
     assertThat((Spanned) fourthCue.text).hasNoSpans();
   }
 
+  // https://github.com/androidx/media/issues/2167
+  @Test
+  public void parseWithWideUnicodeCharsAndCrLineEndings() throws Exception {
+    ImmutableList<CuesWithTiming> allCues =
+        getCuesForTestAsset(WITH_WIDE_CHARS_AND_CR_ENDINGS_FILE);
+
+    // WebvttCueParser normalizes all line endings within cue text to \n.
+    assertThat(Iterables.getOnlyElement(Iterables.getOnlyElement(allCues).cues).text.toString())
+        .isEqualTo("\uD83D\uDE1B\n\uD83D\uDE1B");
+  }
+
   @Test
   public void parseWithBadCueHeader() throws Exception {
     List<CuesWithTiming> allCues = getCuesForTestAsset(WITH_BAD_CUE_HEADER_FILE);
diff --git a/libraries/test_data/src/test/assets/media/webvtt/with_wide_unicode_chars_and_cr_endings b/libraries/test_data/src/test/assets/media/webvtt/with_wide_unicode_chars_and_cr_endings
new file mode 100644
index 0000000000..4962f26ffa
--- /dev/null
+++ b/libraries/test_data/src/test/assets/media/webvtt/with_wide_unicode_chars_and_cr_endings
@@ -0,0 +1 @@
+WEBVTT00:00.000 --> 00:01.234😛😛
\ No newline at end of file