Fix casting in ParsableByteArray.peekCharacterAndSize

This was introduced in 841bdc6efe.

There's no need to cast the `char` (2 bytes) down to a `byte` in order
to pack it into an `int` (4 bytes) alongside a short (2 bytes).

We also don't need to use `short` to track the character count (max 4
with UTF-8) - a single byte is enough. This change also uses
`Ints.fromBytes` to avoid having to reason about how casting &
bit-shifting interact.

This change introduces a test which reproduces the failure reported in
Issue: androidx/media#2167.

#cherrypick

PiperOrigin-RevId: 730809219
This commit is contained in:
ibaker 2025-02-25 03:16:29 -08:00 committed by Copybara-Service
parent 3f493eaf9e
commit fe19d8c9be
5 changed files with 53 additions and 9 deletions

View File

@ -17,6 +17,8 @@
* Audio:
* Video:
* Text:
* Fix handling of multi-byte UTF-8 characters in WebVTT files using CR
line endings ([#2167](https://github.com/androidx/media/issues/2167)).
* Metadata:
* Image:
* DataSource:

View File

@ -19,7 +19,6 @@ import androidx.annotation.Nullable;
import com.google.common.collect.ImmutableSet;
import com.google.common.primitives.Chars;
import com.google.common.primitives.Ints;
import com.google.common.primitives.UnsignedBytes;
import com.google.errorprone.annotations.CheckReturnValue;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
@ -721,8 +720,8 @@ public final class ParsableByteArray {
/**
* Peeks at the character at {@link #position} (as decoded by {@code charset}), returns it and the
* number of bytes the character takes up within the array packed into an int. First four bytes
* are the character and the second four is the size in bytes it takes. Returns 0 if {@link
* number of bytes the character takes up within the array packed into an int. First two bytes are
* the character and the second two is the size in bytes it takes. Returns 0 if {@link
* #bytesLeft()} doesn't allow reading a whole character in {@code charset} or if the {@code
* charset} is not one of US_ASCII, UTF-8, UTF-16, UTF-16BE, or UTF-16LE.
*
@ -730,23 +729,27 @@ public final class ParsableByteArray {
* bytes for UTF-16).
*/
private int peekCharacterAndSize(Charset charset) {
byte character;
short characterSize;
byte charByte1;
byte charByte2;
byte characterSize;
if ((charset.equals(StandardCharsets.UTF_8) || charset.equals(StandardCharsets.US_ASCII))
&& bytesLeft() >= 1) {
character = (byte) Chars.checkedCast(UnsignedBytes.toInt(data[position]));
charByte1 = 0;
charByte2 = data[position];
characterSize = 1;
} else if ((charset.equals(StandardCharsets.UTF_16)
|| charset.equals(StandardCharsets.UTF_16BE))
&& bytesLeft() >= 2) {
character = (byte) Chars.fromBytes(data[position], data[position + 1]);
charByte1 = data[position];
charByte2 = data[position + 1];
characterSize = 2;
} else if (charset.equals(StandardCharsets.UTF_16LE) && bytesLeft() >= 2) {
character = (byte) Chars.fromBytes(data[position + 1], data[position]);
charByte1 = data[position + 1];
charByte2 = data[position];
characterSize = 2;
} else {
return 0;
}
return (Chars.checkedCast(character) << Short.SIZE) + characterSize;
return Ints.fromBytes(charByte1, charByte2, (byte) 0, characterSize);
}
}

View File

@ -679,6 +679,31 @@ public final class ParsableByteArrayTest {
assertThat(parser.readLine()).isNull();
}
@Test
public void readTwoLinesWithCr_utf8() {
byte[] bytes = "foo\rbar".getBytes(StandardCharsets.UTF_8);
ParsableByteArray parser = new ParsableByteArray(bytes);
assertThat(parser.readLine()).isEqualTo("foo");
assertThat(parser.getPosition()).isEqualTo(4);
assertThat(parser.readLine()).isEqualTo("bar");
assertThat(parser.getPosition()).isEqualTo(7);
assertThat(parser.readLine()).isNull();
}
// https://github.com/androidx/media/issues/2167
@Test
public void readTwoLinesWithCrAndWideChar_utf8() {
byte[] bytes = "foo\r\uD83D\uDE1B".getBytes(StandardCharsets.UTF_8);
ParsableByteArray parser = new ParsableByteArray(bytes);
assertThat(parser.readLine()).isEqualTo("foo");
assertThat(parser.getPosition()).isEqualTo(4);
assertThat(parser.readLine()).isEqualTo("\uD83D\uDE1B");
assertThat(parser.getPosition()).isEqualTo(8);
assertThat(parser.readLine()).isNull();
}
@Test
public void readTwoLinesWithCrFollowedByLf_utf8() {
byte[] bytes = "foo\r\nbar".getBytes(StandardCharsets.UTF_8);

View File

@ -56,6 +56,8 @@ public class WebvttParserTest {
"media/webvtt/with_overlapping_timestamps";
private static final String WITH_VERTICAL_FILE = "media/webvtt/with_vertical";
private static final String WITH_RUBIES_FILE = "media/webvtt/with_rubies";
private static final String WITH_WIDE_CHARS_AND_CR_ENDINGS_FILE =
"media/webvtt/with_wide_unicode_chars_and_cr_endings";
private static final String WITH_BAD_CUE_HEADER_FILE = "media/webvtt/with_bad_cue_header";
private static final String WITH_TAGS_FILE = "media/webvtt/with_tags";
private static final String WITH_CSS_STYLES = "media/webvtt/with_css_styles";
@ -497,6 +499,17 @@ public class WebvttParserTest {
assertThat((Spanned) fourthCue.text).hasNoSpans();
}
// https://github.com/androidx/media/issues/2167
@Test
public void parseWithWideUnicodeCharsAndCrLineEndings() throws Exception {
ImmutableList<CuesWithTiming> allCues =
getCuesForTestAsset(WITH_WIDE_CHARS_AND_CR_ENDINGS_FILE);
// WebvttCueParser normalizes all line endings within cue text to \n.
assertThat(Iterables.getOnlyElement(Iterables.getOnlyElement(allCues).cues).text.toString())
.isEqualTo("\uD83D\uDE1B\n\uD83D\uDE1B");
}
@Test
public void parseWithBadCueHeader() throws Exception {
List<CuesWithTiming> allCues = getCuesForTestAsset(WITH_BAD_CUE_HEADER_FILE);

View File

@ -0,0 +1 @@
WEBVTT 00:00.000 --> 00:01.234 😛 😛