Add ParsableByteArray.peekCodePoint and deprecate peekChar(Charset)

This is analagous to `String.codePointAt` and similar methods. By
dealing only in unicode code points, and returning an `int`, we can
avoid having to reason about how to handle UTF-8 code units that
require two UTF-16 code units (each one a Java `char`) to represent
them.

The zero-arg `peekChar` method remains undeprecated, because it behaves
as "you'd expect" when compared to `peekUnsignedByte` (always reads two
big-endian bytes).

PiperOrigin-RevId: 733752645
This commit is contained in:
ibaker 2025-03-05 08:54:02 -08:00 committed by Copybara-Service
parent d7163534ff
commit d7574ffd66
3 changed files with 470 additions and 52 deletions

View File

@ -15,12 +15,18 @@
*/ */
package androidx.media3.common.util; package androidx.media3.common.util;
import static java.nio.ByteOrder.BIG_ENDIAN;
import static java.nio.ByteOrder.LITTLE_ENDIAN;
import androidx.annotation.Nullable; import androidx.annotation.Nullable;
import com.google.common.collect.ImmutableSet; import com.google.common.collect.ImmutableSet;
import com.google.common.primitives.Chars; import com.google.common.primitives.Chars;
import com.google.common.primitives.Ints; import com.google.common.primitives.Ints;
import com.google.common.primitives.UnsignedBytes;
import com.google.common.primitives.UnsignedInts;
import com.google.errorprone.annotations.CheckReturnValue; import com.google.errorprone.annotations.CheckReturnValue;
import java.nio.ByteBuffer; import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.Arrays; import java.util.Arrays;
@ -33,6 +39,9 @@ import java.util.Arrays;
@CheckReturnValue @CheckReturnValue
public final class ParsableByteArray { public final class ParsableByteArray {
/** A value that is outside the valid range of unicode code points. */
public static final int INVALID_CODE_POINT = 0x11_0000;
private static final char[] CR_AND_LF = {'\r', '\n'}; private static final char[] CR_AND_LF = {'\r', '\n'};
private static final char[] LF = {'\n'}; private static final char[] LF = {'\n'};
private static final ImmutableSet<Charset> SUPPORTED_CHARSETS_FOR_READLINE = private static final ImmutableSet<Charset> SUPPORTED_CHARSETS_FOR_READLINE =
@ -239,30 +248,74 @@ public final class ParsableByteArray {
return (data[position] & 0xFF); return (data[position] & 0xFF);
} }
/** /** Peeks at the next two bytes and interprets them as a big-endian char. */
* Peeks at the next char.
*
* <p>Equivalent to passing {@link StandardCharsets#UTF_16} or {@link StandardCharsets#UTF_16BE}
* to {@link #peekChar(Charset)}.
*/
public char peekChar() { public char peekChar() {
return (char) ((data[position] & 0xFF) << 8 | (data[position + 1] & 0xFF)); return peekChar(BIG_ENDIAN, /* offset= */ 0);
} }
/** /**
* Peeks at the next char (as decoded by {@code charset}) * @deprecated Either use {@link #peekChar()} to peek the next two bytes (big-endian) or {@link
* * #peekCodePoint(Charset)} to peek in a {@link Charset}-aware way.
* <p>If {@code charset} is UTF-8, only single-byte characters are supported and this method
* returns zero if {@link #position} is pointing to any part of a multi-byte character.
*
* @throws IllegalArgumentException if charset is not supported. Only US_ASCII, UTF-8, UTF-16,
* UTF-16BE, and UTF-16LE are supported.
*/ */
// TODO: b/398845842 - Make this work 'correctly' for multi-byte UTF-8, or deprecate it. @Deprecated
public char peekChar(Charset charset) { public char peekChar(Charset charset) {
Assertions.checkArgument( Assertions.checkArgument(
SUPPORTED_CHARSETS_FOR_READLINE.contains(charset), "Unsupported charset: " + charset); SUPPORTED_CHARSETS_FOR_READLINE.contains(charset), "Unsupported charset: " + charset);
return (char) (peekCharacterAndSize(charset) >> Short.SIZE); if (bytesLeft() < 1) {
return 0;
}
if (charset.equals(StandardCharsets.US_ASCII)) {
return (char) peekUnsignedByte();
} else if (charset.equals(StandardCharsets.UTF_8)) {
return (data[position] & 0x80) == 0 ? (char) peekUnsignedByte() : 0;
} else {
// UTF-16
if (bytesLeft() < 2) {
return 0;
}
ByteOrder byteOrder = charset.equals(StandardCharsets.UTF_16LE) ? LITTLE_ENDIAN : BIG_ENDIAN;
return peekChar(byteOrder, /* offset= */ 0);
}
}
/** Peek the UTF-16 char at {@link #position}{@code + offset}. */
private char peekChar(ByteOrder byteOrder, int offset) {
return byteOrder == BIG_ENDIAN
? Chars.fromBytes(data[position + offset], data[position + offset + 1])
: Chars.fromBytes(data[position + offset + 1], data[position + offset]);
}
/**
* Peeks at the code point starting at {@link #getPosition()} as interpreted by {@code charset}.
*
* <p>The exact behaviour depends on {@code charset}:
*
* <ul>
* <li>US_ASCII: Returns the byte at {@link #getPosition()} if it's valid ASCII (less than
* {@code 0x80}), otherwise returns {@link #INVALID_CODE_POINT}.
* <li>UTF-8: If {@link #getPosition()} is the start of a UTF-8 code unit the whole unit is
* decoded and returned. Otherwise {@link #INVALID_CODE_POINT} is returned.
* <li>UTF-16 (all endian-nesses):
* <ul>
* <li>If {@link #getPosition()} is at the start of a {@linkplain
* Character#isHighSurrogate(char) high surrogate} code unit and the following two
* bytes are a {@linkplain Character#isLowSurrogate(char)} low surrogate} code unit,
* the {@linkplain Character#toCodePoint(char, char) combined code point} is returned.
* <li>Otherwise the single code unit starting at {@link #getPosition()} is returned
* directly.
* <li>UTF-16 has no support for byte-level synchronization, so if {@link #getPosition()}
* is not aligned with the start of a UTF-16 code unit then the result is undefined.
* </ul>
* </ul>
*
* @throws IllegalArgumentException if charset is not supported. Only US_ASCII, UTF-8, UTF-16,
* UTF-16BE, and UTF-16LE are supported.
* @throws IndexOutOfBoundsException if {@link #bytesLeft()} doesn't allow reading the smallest
* code unit in {@code charset} (1 byte for ASCII and UTF-8, 2 bytes for UTF-16).
*/
public int peekCodePoint(Charset charset) {
int codePointAndSize = peekCodePointAndSize(charset);
return codePointAndSize != 0 ? Ints.checkedCast(codePointAndSize >>> 8) : INVALID_CODE_POINT;
} }
/** Reads the next byte as an unsigned value. */ /** Reads the next byte as an unsigned value. */
@ -708,53 +761,145 @@ public final class ParsableByteArray {
* without advancing {@link #position}. Returns {@code 0} if {@link #bytesLeft()} doesn't allow * without advancing {@link #position}. Returns {@code 0} if {@link #bytesLeft()} doesn't allow
* reading a whole character in {@code charset}. * reading a whole character in {@code charset}.
* *
* <p>Only supports characters in {@code chars} that occupy a single code unit (i.e. one byte for * <p>Only supports characters in {@code chars} that are in the Basic Multilingual Plane (occupy a
* UTF-8 and two bytes for UTF-16). * single char).
*/ */
private char readCharacterIfInList(Charset charset, char[] chars) { private char readCharacterIfInList(Charset charset, char[] chars) {
int characterAndSize = peekCharacterAndSize(charset); if (bytesLeft() < getSmallestCodeUnitSize(charset)) {
return 0;
}
int codePointAndSize = peekCodePointAndSize(charset);
if (codePointAndSize == 0) {
return 0;
}
if (characterAndSize != 0 && Chars.contains(chars, (char) (characterAndSize >> Short.SIZE))) { int codePoint = UnsignedInts.checkedCast(codePointAndSize >>> 8);
position += characterAndSize & 0xFFFF; if (Character.isSupplementaryCodePoint(codePoint)) {
return (char) (characterAndSize >> Short.SIZE); return 0;
}
char c = Chars.checkedCast(codePoint);
if (Chars.contains(chars, c)) {
position += Ints.checkedCast(codePointAndSize & 0xFF);
return c;
} else { } else {
return 0; return 0;
} }
} }
/** /**
* Peeks at the character at {@link #position} (as decoded by {@code charset}), returns it and the * Peeks at the code unit at {@link #position} (as decoded by {@code charset}), and the number of
* number of bytes the character takes up within the array packed into an int. First two bytes are * bytes it occupies within {@link #data}.
* the character and the second two is the size in bytes it takes. Returns 0 if {@link
* #bytesLeft()} doesn't allow reading a whole character in {@code charset} or if the {@code
* charset} is not one of US_ASCII, UTF-8, UTF-16, UTF-16BE, or UTF-16LE.
* *
* <p>Only supports characters that occupy a single code unit (i.e. one byte for UTF-8 and two * <p>See {@link #peekCodePoint(Charset)} for detailed per-charset behaviour & edge cases.
* bytes for UTF-16). *
* @return The code point in the upper 24 bits, and the size in bytes in the lower 8 bits. Or zero
* if no valid code unit starts at {@link #position} and fits within {@link #bytesLeft()}.
* @throws IndexOutOfBoundsException if {@link #bytesLeft()} doesn't allow reading the smallest
* code unit in {@code charset} (1 byte for ASCII and UTF-8, 2 bytes for UTF-16).
* @throws IllegalArgumentException if charset is not supported. Only US_ASCII, UTF-8, UTF-16,
* UTF-16BE, and UTF-16LE are supported.
*/ */
private int peekCharacterAndSize(Charset charset) { private int peekCodePointAndSize(Charset charset) {
byte charByte1; Assertions.checkArgument(
byte charByte2; SUPPORTED_CHARSETS_FOR_READLINE.contains(charset), "Unsupported charset: " + charset);
byte characterSize; if (bytesLeft() < getSmallestCodeUnitSize(charset)) {
if (bytesLeft() >= 1 throw new IndexOutOfBoundsException("position=" + position + ", limit=" + limit);
&& ((charset.equals(StandardCharsets.UTF_8) && (data[position] & 0x80) == 0) }
|| charset.equals(StandardCharsets.US_ASCII))) { int codePoint;
// TODO: b/398845842 - Handle multi-byte UTF-8. byte codePointSize;
charByte1 = 0; if (charset.equals(StandardCharsets.US_ASCII)) {
charByte2 = data[position]; if ((data[position] & 0x80) != 0) {
characterSize = 1;
} else if (bytesLeft() >= 2
&& (charset.equals(StandardCharsets.UTF_16) || charset.equals(StandardCharsets.UTF_16BE))) {
charByte1 = data[position];
charByte2 = data[position + 1];
characterSize = 2;
} else if (bytesLeft() >= 2 && charset.equals(StandardCharsets.UTF_16LE)) {
charByte1 = data[position + 1];
charByte2 = data[position];
characterSize = 2;
} else {
return 0; return 0;
} }
return Ints.fromBytes(charByte1, charByte2, (byte) 0, characterSize); codePoint = UnsignedBytes.toInt(data[position]);
codePointSize = 1;
} else if (charset.equals(StandardCharsets.UTF_8)) {
codePointSize = peekUtf8CodeUnitSize();
switch (codePointSize) {
case 1:
codePoint = UnsignedBytes.toInt(data[position]);
break;
case 2:
codePoint = decodeUtf8CodeUnit(0, 0, data[position], data[position + 1]);
break;
case 3:
int firstByteWithoutStartCode = data[position] & 0xF;
codePoint =
decodeUtf8CodeUnit(
0, firstByteWithoutStartCode, data[position + 1], data[position + 2]);
break;
case 4:
codePoint =
decodeUtf8CodeUnit(
data[position], data[position + 1], data[position + 2], data[position + 3]);
break;
case 0:
default:
return 0;
}
} else {
// UTF-16
ByteOrder byteOrder = charset.equals(StandardCharsets.UTF_16LE) ? LITTLE_ENDIAN : BIG_ENDIAN;
char c = peekChar(byteOrder, /* offset= */ 0);
if (Character.isHighSurrogate(c) && bytesLeft() >= 4) {
char lowSurrogate = peekChar(byteOrder, /* offset= */ 2);
codePoint = Character.toCodePoint(c, lowSurrogate);
codePointSize = 4;
} else {
// This is either a BMP code point, an unpaired surrogate, or position is in the middle of
// a matching surrogate pair.
codePoint = c;
codePointSize = 2;
}
}
return (codePoint << 8) | codePointSize;
}
private static int getSmallestCodeUnitSize(Charset charset) {
Assertions.checkArgument(
SUPPORTED_CHARSETS_FOR_READLINE.contains(charset), "Unsupported charset: " + charset);
return charset.equals(StandardCharsets.UTF_8) || charset.equals(StandardCharsets.US_ASCII)
? 1
: 2;
}
/**
* Returns the size (in bytes) of the UTF-8 code unit starting at {@link #position}. Returns zero
* if no full UTF-8 code unit seems to start at {@link #position}.
*/
private byte peekUtf8CodeUnitSize() {
if ((data[position] & 0x80) == 0) {
return 1;
} else if ((data[position] & 0xE0) == 0xC0
&& bytesLeft() >= 2
&& isUtf8ContinuationByte(data[position + 1])) {
return 2;
} else if ((data[position] & 0xF0) == 0xE0
&& bytesLeft() >= 3
&& isUtf8ContinuationByte(data[position + 1])
&& isUtf8ContinuationByte(data[position + 2])) {
return 3;
} else if ((data[position] & 0xF8) == 0xF0
&& bytesLeft() >= 4
&& isUtf8ContinuationByte(data[position + 1])
&& isUtf8ContinuationByte(data[position + 2])
&& isUtf8ContinuationByte(data[position + 3])) {
return 4;
} else {
// We found a pattern that doesn't seem to be valid UTF-8.
return 0;
}
}
private static boolean isUtf8ContinuationByte(byte b) {
return (b & 0xC0) == 0x80;
}
private static int decodeUtf8CodeUnit(int b1, int b2, int b3, int b4) {
return Ints.fromBytes(
(byte) 0,
UnsignedBytes.checkedCast(((b1 & 0x7) << 2) | (b2 & 0b0011_0000) >> 4),
UnsignedBytes.checkedCast(((byte) b2 & 0xF) << 4 | ((byte) b3 & 0b0011_1100) >> 2),
UnsignedBytes.checkedCast(((byte) b3 & 0x3) << 6 | ((byte) b4 & 0x3F)));
} }
} }

View File

@ -998,6 +998,257 @@ public final class ParsableByteArrayTest {
assertThat(parser.readLine(UTF_16LE)).isNull(); assertThat(parser.readLine(UTF_16LE)).isNull();
} }
@Test
public void peekCodePoint_ascii() {
ParsableByteArray parser = new ParsableByteArray("foo".getBytes(US_ASCII));
assertThat(parser.peekCodePoint(US_ASCII)).isEqualTo((int) 'f');
}
@Test
public void peekCodePoint_ascii_invalid() {
// Choose é from ISO 8859-1 which is not valid 7-bit ASCII (since it has a high MSB).
ParsableByteArray parser = new ParsableByteArray(TestUtil.createByteArray(0xE9));
assertThat(parser.peekCodePoint(US_ASCII)).isEqualTo(ParsableByteArray.INVALID_CODE_POINT);
}
@Test
public void peekCodePoint_ascii_atLimit_throwsException() {
// Set the limit before the end of the byte array.
ParsableByteArray parser = new ParsableByteArray("foo".getBytes(US_ASCII), /* limit= */ 2);
parser.setPosition(2);
IndexOutOfBoundsException e =
assertThrows(IndexOutOfBoundsException.class, () -> parser.peekCodePoint(US_ASCII));
assertThat(e).hasMessageThat().contains("position=2");
assertThat(e).hasMessageThat().contains("limit=2");
}
@Test
public void peekCodePoint_utf8() {
ParsableByteArray parser = new ParsableByteArray("foo".getBytes(UTF_8));
assertThat(parser.peekCodePoint(UTF_8)).isEqualTo((int) 'f');
}
@Test
public void peekCodePoint_utf8_twoByteCharacter() {
ParsableByteArray parser = new ParsableByteArray("étude".getBytes(UTF_8));
assertThat(parser.peekCodePoint(UTF_8)).isEqualTo((int) 'é');
}
@Test
public void peekCodePoint_utf8_twoByteCharacter_misaligned() {
ParsableByteArray parser = new ParsableByteArray("étude".getBytes(UTF_8));
parser.setPosition(1);
assertThat(parser.peekCodePoint(UTF_8)).isEqualTo(ParsableByteArray.INVALID_CODE_POINT);
}
@Test
public void peekCodePoint_utf8_threeByteCharacter() {
ParsableByteArray parser = new ParsableByteArray("".getBytes(UTF_8));
assertThat(parser.peekCodePoint(UTF_8)).isEqualTo((int) 'ऊ');
}
@Test
public void peekCodePoint_utf8_threeByteCharacter_misaligned() {
ParsableByteArray parser = new ParsableByteArray("".getBytes(UTF_8));
parser.setPosition(1);
assertThat(parser.peekCodePoint(UTF_8)).isEqualTo(ParsableByteArray.INVALID_CODE_POINT);
}
@Test
public void peekCodePoint_utf8_fourByteCharacter() {
ParsableByteArray parser = new ParsableByteArray("\uD83D\uDE1B".getBytes(UTF_8));
assertThat(parser.peekCodePoint(UTF_8)).isEqualTo(Character.codePointAt("\uD83D\uDE1B", 0));
}
@Test
public void peekCodePoint_utf8_fourByteCharacter_misaligned() {
ParsableByteArray parser = new ParsableByteArray("\uD83D\uDE1B".getBytes(UTF_8));
parser.setPosition(1);
assertThat(parser.peekCodePoint(UTF_8)).isEqualTo(ParsableByteArray.INVALID_CODE_POINT);
}
@Test
public void peekCodePoint_utf8_atLimit_throwsException() {
// Set the limit before the end of the byte array.
ParsableByteArray parser = new ParsableByteArray("foo".getBytes(UTF_8), /* limit= */ 2);
parser.setPosition(2);
IndexOutOfBoundsException e =
assertThrows(IndexOutOfBoundsException.class, () -> parser.peekCodePoint(UTF_8));
assertThat(e).hasMessageThat().contains("position=2");
assertThat(e).hasMessageThat().contains("limit=2");
}
@Test
public void peekCodePoint_utf8_invalidByteSequence() {
// 2-byte start character not followed by anything.
ParsableByteArray parser = new ParsableByteArray(TestUtil.createByteArray(0xC1));
assertThat(parser.peekCodePoint(UTF_8)).isEqualTo(ParsableByteArray.INVALID_CODE_POINT);
// 2-byte character truncated by limit.
parser = new ParsableByteArray("é".getBytes(UTF_8), /* limit= */ 1);
assertThat(parser.peekCodePoint(UTF_8)).isEqualTo(ParsableByteArray.INVALID_CODE_POINT);
// 2-byte start character not followed by a continuation byte.
parser = new ParsableByteArray(TestUtil.createByteArray(0xC1, 'a'));
assertThat(parser.peekCodePoint(UTF_8)).isEqualTo(ParsableByteArray.INVALID_CODE_POINT);
// 3-byte start character followed by only one byte.
parser = new ParsableByteArray(TestUtil.createByteArray(0xE1, 0x81));
assertThat(parser.peekCodePoint(UTF_8)).isEqualTo(ParsableByteArray.INVALID_CODE_POINT);
// 3-byte character truncated by limit.
parser = new ParsableByteArray("".getBytes(UTF_8), /* limit= */ 2);
assertThat(parser.peekCodePoint(UTF_8)).isEqualTo(ParsableByteArray.INVALID_CODE_POINT);
// 3-byte start character followed by only one continuation byte.
parser = new ParsableByteArray(TestUtil.createByteArray(0xE1, 0x81, 'a'));
assertThat(parser.peekCodePoint(UTF_8)).isEqualTo(ParsableByteArray.INVALID_CODE_POINT);
// 4-byte start character followed by only two bytes.
parser = new ParsableByteArray(TestUtil.createByteArray(0xF1, 0x81, 0x81));
assertThat(parser.peekCodePoint(UTF_8)).isEqualTo(ParsableByteArray.INVALID_CODE_POINT);
// 4-byte character truncated by limit.
parser = new ParsableByteArray("\uD83D\uDE1B".getBytes(UTF_8), /* limit= */ 3);
assertThat(parser.peekCodePoint(UTF_8)).isEqualTo(ParsableByteArray.INVALID_CODE_POINT);
// 4-byte start character followed by only two continuation bytes.
parser = new ParsableByteArray(TestUtil.createByteArray(0xF1, 0x81, 0x81, 'a'));
assertThat(parser.peekCodePoint(UTF_8)).isEqualTo(ParsableByteArray.INVALID_CODE_POINT);
}
@Test
public void peekCodePoint_utf16() {
// Use UTF_16BE to avoid encoding a BOM.
ParsableByteArray parser = new ParsableByteArray("foo".getBytes(UTF_16BE));
int expectedCodePoint = 'f';
assertThat(parser.peekCodePoint(UTF_16)).isEqualTo(expectedCodePoint);
assertThat(parser.peekCodePoint(UTF_16BE)).isEqualTo(expectedCodePoint);
}
@Test
public void peekCodePoint_utf16_basicMultilingualPlane() {
// Use UTF_16BE to avoid encoding a BOM.
ParsableByteArray parser = new ParsableByteArray("étude".getBytes(UTF_16BE));
int expectedCodePoint = 'é';
assertThat(parser.peekCodePoint(UTF_16)).isEqualTo(expectedCodePoint);
assertThat(parser.peekCodePoint(UTF_16BE)).isEqualTo(expectedCodePoint);
}
@Test
public void peekCodePoint_utf16_surrogatePair() {
// Use UTF_16BE to avoid encoding a BOM.
ParsableByteArray parser = new ParsableByteArray("\uD83D\uDE1B".getBytes(UTF_16BE));
int expectedCodePoint = Character.codePointAt("\uD83D\uDE1B", 0);
assertThat(parser.peekCodePoint(UTF_16)).isEqualTo(expectedCodePoint);
assertThat(parser.peekCodePoint(UTF_16BE)).isEqualTo(expectedCodePoint);
}
@Test
public void peekCodePoint_utf16_splitSurrogatePair_returnsLowSurrogate() {
// Use UTF_16BE to avoid encoding a BOM.
ParsableByteArray parser = new ParsableByteArray("\uD83D\uDE1B".getBytes(UTF_16BE));
parser.skipBytes(2);
int expectedCodePoint = 0xDE1B;
assertThat(parser.peekCodePoint(UTF_16)).isEqualTo(expectedCodePoint);
assertThat(parser.peekCodePoint(UTF_16BE)).isEqualTo(expectedCodePoint);
}
@Test
public void peekCodePoint_utf16_misaligned_returnsGarbage() {
// Use UTF_16BE to avoid encoding a BOM.
ParsableByteArray parser = new ParsableByteArray("foo".getBytes(UTF_16BE));
// Move the position so we are reading the second byte of 'f' and the first byte of 'o'.
parser.setPosition(1);
int expectedCodePoint = '昀';
assertThat(parser.peekCodePoint(UTF_16)).isEqualTo(expectedCodePoint);
assertThat(parser.peekCodePoint(UTF_16BE)).isEqualTo(expectedCodePoint);
}
@Test
public void peekCodePoint_utf16_atLimit_throwsException() {
// Use UTF_16BE to avoid encoding a BOM. Set the limit before the end of the byte array.
ParsableByteArray parser = new ParsableByteArray("foo".getBytes(UTF_16BE), /* limit= */ 2);
// Only one readable byte, not enough for a UTF-16 code unit.
parser.setPosition(1);
IndexOutOfBoundsException e1 =
assertThrows(IndexOutOfBoundsException.class, () -> parser.peekCodePoint(UTF_16));
assertThat(e1).hasMessageThat().contains("position=1");
assertThat(e1).hasMessageThat().contains("limit=2");
IndexOutOfBoundsException e2 =
assertThrows(IndexOutOfBoundsException.class, () -> parser.peekCodePoint(UTF_16BE));
assertThat(e2).hasMessageThat().contains("position=1");
assertThat(e2).hasMessageThat().contains("limit=2");
}
@Test
public void peekCodePoint_utf16le() {
ParsableByteArray parser = new ParsableByteArray("foo".getBytes(UTF_16LE));
assertThat(parser.peekCodePoint(UTF_16LE)).isEqualTo((int) 'f');
}
@Test
public void peekCodePoint_utf16le_basicMultilingualPlane() {
ParsableByteArray parser = new ParsableByteArray("étude".getBytes(UTF_16LE));
assertThat(parser.peekCodePoint(UTF_16LE)).isEqualTo((int) 'é');
}
@Test
public void peekCodePoint_utf16le_surrogatePair() {
ParsableByteArray parser = new ParsableByteArray("\uD83D\uDE1B".getBytes(UTF_16LE));
assertThat(parser.peekCodePoint(UTF_16LE)).isEqualTo(Character.codePointAt("\uD83D\uDE1B", 0));
}
@Test
public void peekCodePoint_utf16le_splitSurrogatePair_returnsLowSurrogate() {
ParsableByteArray parser = new ParsableByteArray("\uD83D\uDE1B".getBytes(UTF_16LE));
parser.skipBytes(2);
assertThat(parser.peekCodePoint(UTF_16LE)).isEqualTo(0xDE1B);
}
@Test
public void peekCodePoint_utf16le_misaligned_returnsGarbage() {
ParsableByteArray parser = new ParsableByteArray("foo".getBytes(UTF_16LE));
// Move the position so we are reading the second byte of 'f' and the first byte of 'o'.
parser.setPosition(1);
assertThat(parser.peekCodePoint(UTF_16LE)).isEqualTo((int) '漀');
}
@Test
public void peekCodePoint_utf16le_atLimit_throwsException() {
// Set the limit before the end of the byte array.
ParsableByteArray parser = new ParsableByteArray("foo".getBytes(UTF_16LE), /* limit= */ 2);
// Only one readable byte, not enough for a UTF-16 code unit.
parser.setPosition(1);
IndexOutOfBoundsException e =
assertThrows(IndexOutOfBoundsException.class, () -> parser.peekCodePoint(UTF_16LE));
assertThat(e).hasMessageThat().contains("position=1");
assertThat(e).hasMessageThat().contains("limit=2");
}
@Test @Test
public void peekChar() { public void peekChar() {
// Use UTF_16BE to avoid encoding a BOM. // Use UTF_16BE to avoid encoding a BOM.
@ -1045,6 +1296,7 @@ public final class ParsableByteArrayTest {
} }
@Test @Test
@SuppressWarnings("deprecation") // Testing deprecated method
public void peekChar_ascii() { public void peekChar_ascii() {
byte[] bytes = "foo".getBytes(US_ASCII); byte[] bytes = "foo".getBytes(US_ASCII);
ParsableByteArray parser = new ParsableByteArray(bytes); ParsableByteArray parser = new ParsableByteArray(bytes);
@ -1053,6 +1305,7 @@ public final class ParsableByteArrayTest {
} }
@Test @Test
@SuppressWarnings("deprecation") // Testing deprecated method
public void peekChar_ascii_invalid_returns8BitCharacterAnyway() { public void peekChar_ascii_invalid_returns8BitCharacterAnyway() {
// Choose é from ISO 8859-1 which is not valid 7-bit ASCII (since it has a high MSB). // Choose é from ISO 8859-1 which is not valid 7-bit ASCII (since it has a high MSB).
ParsableByteArray parser = new ParsableByteArray(TestUtil.createByteArray(0xE9)); ParsableByteArray parser = new ParsableByteArray(TestUtil.createByteArray(0xE9));
@ -1061,6 +1314,7 @@ public final class ParsableByteArrayTest {
} }
@Test @Test
@SuppressWarnings("deprecation") // Testing deprecated method
public void peekChar_ascii_atLimit_throwsException() { public void peekChar_ascii_atLimit_throwsException() {
// Set the limit before the end of the byte array. // Set the limit before the end of the byte array.
ParsableByteArray parser = new ParsableByteArray("foo".getBytes(US_ASCII), /* limit= */ 2); ParsableByteArray parser = new ParsableByteArray("foo".getBytes(US_ASCII), /* limit= */ 2);
@ -1071,6 +1325,7 @@ public final class ParsableByteArrayTest {
} }
@Test @Test
@SuppressWarnings("deprecation") // Testing deprecated method
public void peekChar_utf8_oneByteCharacter() { public void peekChar_utf8_oneByteCharacter() {
byte[] bytes = "foo".getBytes(UTF_8); byte[] bytes = "foo".getBytes(UTF_8);
ParsableByteArray parser = new ParsableByteArray(bytes); ParsableByteArray parser = new ParsableByteArray(bytes);
@ -1079,6 +1334,7 @@ public final class ParsableByteArrayTest {
} }
@Test @Test
@SuppressWarnings("deprecation") // Testing deprecated method
public void peekChar_utf8_twoByteCharacter_returnsZero() { public void peekChar_utf8_twoByteCharacter_returnsZero() {
byte[] bytes = "étude".getBytes(UTF_8); byte[] bytes = "étude".getBytes(UTF_8);
ParsableByteArray parser = new ParsableByteArray(bytes); ParsableByteArray parser = new ParsableByteArray(bytes);
@ -1088,6 +1344,7 @@ public final class ParsableByteArrayTest {
} }
@Test @Test
@SuppressWarnings("deprecation") // Testing deprecated method
public void peekChar_utf8_threeByteCharacter_returnsZero() { public void peekChar_utf8_threeByteCharacter_returnsZero() {
ParsableByteArray parser = new ParsableByteArray("".getBytes(UTF_8)); ParsableByteArray parser = new ParsableByteArray("".getBytes(UTF_8));
@ -1096,6 +1353,7 @@ public final class ParsableByteArrayTest {
} }
@Test @Test
@SuppressWarnings("deprecation") // Testing deprecated method
public void peekChar_utf8_fourByteCharacter_returnsZero() { public void peekChar_utf8_fourByteCharacter_returnsZero() {
byte[] bytes = "\uD83D\uDE1B".getBytes(UTF_8); byte[] bytes = "\uD83D\uDE1B".getBytes(UTF_8);
ParsableByteArray parser = new ParsableByteArray(bytes); ParsableByteArray parser = new ParsableByteArray(bytes);
@ -1105,6 +1363,7 @@ public final class ParsableByteArrayTest {
} }
@Test @Test
@SuppressWarnings("deprecation") // Testing deprecated method
public void peekChar_utf8_splitFourByteChar_returnsZero() { public void peekChar_utf8_splitFourByteChar_returnsZero() {
byte[] bytes = "\uD83D\uDE1B".getBytes(UTF_8); byte[] bytes = "\uD83D\uDE1B".getBytes(UTF_8);
ParsableByteArray parser = new ParsableByteArray(bytes); ParsableByteArray parser = new ParsableByteArray(bytes);
@ -1115,6 +1374,7 @@ public final class ParsableByteArrayTest {
} }
@Test @Test
@SuppressWarnings("deprecation") // Testing deprecated method
public void peekChar_utf8_atLimit_returnsZero() { public void peekChar_utf8_atLimit_returnsZero() {
// Set the limit before the end of the byte array. // Set the limit before the end of the byte array.
ParsableByteArray parser = new ParsableByteArray("foo".getBytes(UTF_8), /* limit= */ 2); ParsableByteArray parser = new ParsableByteArray("foo".getBytes(UTF_8), /* limit= */ 2);
@ -1125,6 +1385,7 @@ public final class ParsableByteArrayTest {
} }
@Test @Test
@SuppressWarnings("deprecation") // Testing deprecated method
public void peekChar_utf8_invalidByteSequence() { public void peekChar_utf8_invalidByteSequence() {
// 2-byte start character not followed by anything. // 2-byte start character not followed by anything.
ParsableByteArray parser = new ParsableByteArray(TestUtil.createByteArray(0xC1)); ParsableByteArray parser = new ParsableByteArray(TestUtil.createByteArray(0xC1));
@ -1164,6 +1425,7 @@ public final class ParsableByteArrayTest {
} }
@Test @Test
@SuppressWarnings("deprecation") // Testing deprecated method
public void peekChar_utf16() { public void peekChar_utf16() {
// Use UTF_16BE to avoid encoding a BOM. // Use UTF_16BE to avoid encoding a BOM.
ParsableByteArray parser = new ParsableByteArray("foo".getBytes(UTF_16BE)); ParsableByteArray parser = new ParsableByteArray("foo".getBytes(UTF_16BE));
@ -1174,6 +1436,7 @@ public final class ParsableByteArrayTest {
} }
@Test @Test
@SuppressWarnings("deprecation") // Testing deprecated method
public void peekChar_utf16_basicMultilingualPlane() { public void peekChar_utf16_basicMultilingualPlane() {
// Use UTF_16BE to avoid encoding a BOM. // Use UTF_16BE to avoid encoding a BOM.
ParsableByteArray parser = new ParsableByteArray("étude".getBytes(UTF_16BE)); ParsableByteArray parser = new ParsableByteArray("étude".getBytes(UTF_16BE));
@ -1184,6 +1447,7 @@ public final class ParsableByteArrayTest {
} }
@Test @Test
@SuppressWarnings("deprecation") // Testing deprecated method
public void peekChar_utf16_surrogatePair_returnsHighSurrogate() { public void peekChar_utf16_surrogatePair_returnsHighSurrogate() {
// Use UTF_16BE to avoid encoding a BOM. // Use UTF_16BE to avoid encoding a BOM.
ParsableByteArray parser = new ParsableByteArray("\uD83D\uDE1B".getBytes(UTF_16BE)); ParsableByteArray parser = new ParsableByteArray("\uD83D\uDE1B".getBytes(UTF_16BE));
@ -1195,6 +1459,7 @@ public final class ParsableByteArrayTest {
} }
@Test @Test
@SuppressWarnings("deprecation") // Testing deprecated method
public void peekChar_utf16_splitSurrogatePair_returnsLowSurrogate() { public void peekChar_utf16_splitSurrogatePair_returnsLowSurrogate() {
// Use UTF_16BE to avoid encoding a BOM. // Use UTF_16BE to avoid encoding a BOM.
ParsableByteArray parser = new ParsableByteArray("\uD83D\uDE1B".getBytes(UTF_16BE)); ParsableByteArray parser = new ParsableByteArray("\uD83D\uDE1B".getBytes(UTF_16BE));
@ -1207,6 +1472,7 @@ public final class ParsableByteArrayTest {
} }
@Test @Test
@SuppressWarnings("deprecation") // Testing deprecated method
public void peekChar_utf16_misaligned_returnsGarbage() { public void peekChar_utf16_misaligned_returnsGarbage() {
// Use UTF_16BE to avoid encoding a BOM. // Use UTF_16BE to avoid encoding a BOM.
ParsableByteArray parser = new ParsableByteArray("foo".getBytes(UTF_16BE)); ParsableByteArray parser = new ParsableByteArray("foo".getBytes(UTF_16BE));
@ -1219,6 +1485,7 @@ public final class ParsableByteArrayTest {
} }
@Test @Test
@SuppressWarnings("deprecation") // Testing deprecated method
public void peekChar_utf16_atLimit_returnsZero() { public void peekChar_utf16_atLimit_returnsZero() {
// Use UTF_16BE to avoid encoding a BOM. Set the limit before the end of the byte array. // Use UTF_16BE to avoid encoding a BOM. Set the limit before the end of the byte array.
ParsableByteArray parser = new ParsableByteArray("foo".getBytes(UTF_16BE), /* limit= */ 2); ParsableByteArray parser = new ParsableByteArray("foo".getBytes(UTF_16BE), /* limit= */ 2);
@ -1232,6 +1499,7 @@ public final class ParsableByteArrayTest {
} }
@Test @Test
@SuppressWarnings("deprecation") // Testing deprecated method
public void peekChar_utf16le() { public void peekChar_utf16le() {
ParsableByteArray parser = new ParsableByteArray("foo".getBytes(UTF_16LE)); ParsableByteArray parser = new ParsableByteArray("foo".getBytes(UTF_16LE));
@ -1239,6 +1507,7 @@ public final class ParsableByteArrayTest {
} }
@Test @Test
@SuppressWarnings("deprecation") // Testing deprecated method
public void peekChar_utf16le_basicMultilingualPlane() { public void peekChar_utf16le_basicMultilingualPlane() {
ParsableByteArray parser = new ParsableByteArray("étude".getBytes(UTF_16LE)); ParsableByteArray parser = new ParsableByteArray("étude".getBytes(UTF_16LE));
@ -1246,6 +1515,7 @@ public final class ParsableByteArrayTest {
} }
@Test @Test
@SuppressWarnings("deprecation") // Testing deprecated method
public void peekChar_utf16le_surrogatePair_returnsHighSurrogate() { public void peekChar_utf16le_surrogatePair_returnsHighSurrogate() {
ParsableByteArray parser = new ParsableByteArray("\uD83D\uDE1B".getBytes(UTF_16LE)); ParsableByteArray parser = new ParsableByteArray("\uD83D\uDE1B".getBytes(UTF_16LE));
@ -1254,6 +1524,7 @@ public final class ParsableByteArrayTest {
} }
@Test @Test
@SuppressWarnings("deprecation") // Testing deprecated method
public void peekChar_utf16le_splitSurrogatePair_returnsLowSurrogate() { public void peekChar_utf16le_splitSurrogatePair_returnsLowSurrogate() {
ParsableByteArray parser = new ParsableByteArray("\uD83D\uDE1B".getBytes(UTF_16LE)); ParsableByteArray parser = new ParsableByteArray("\uD83D\uDE1B".getBytes(UTF_16LE));
parser.skipBytes(2); parser.skipBytes(2);
@ -1263,6 +1534,7 @@ public final class ParsableByteArrayTest {
} }
@Test @Test
@SuppressWarnings("deprecation") // Testing deprecated method
public void peekChar_utf16le_misaligned_returnsGarbage() { public void peekChar_utf16le_misaligned_returnsGarbage() {
ParsableByteArray parser = new ParsableByteArray("foo".getBytes(UTF_16LE)); ParsableByteArray parser = new ParsableByteArray("foo".getBytes(UTF_16LE));
// Move the position so we are reading the second byte of 'f' and the first byte of 'o'. // Move the position so we are reading the second byte of 'f' and the first byte of 'o'.
@ -1272,6 +1544,7 @@ public final class ParsableByteArrayTest {
} }
@Test @Test
@SuppressWarnings("deprecation") // Testing deprecated method
public void peekChar_utf16le_atLimit_returnsZero() { public void peekChar_utf16le_atLimit_returnsZero() {
// Set the limit before the end of the byte array. // Set the limit before the end of the byte array.
ParsableByteArray parser = new ParsableByteArray("foo".getBytes(UTF_16LE), /* limit= */ 2); ParsableByteArray parser = new ParsableByteArray("foo".getBytes(UTF_16LE), /* limit= */ 2);

View File

@ -227,7 +227,7 @@ public final class SsaParser implements SubtitleParser {
private void parseScriptInfo(ParsableByteArray data, Charset charset) { private void parseScriptInfo(ParsableByteArray data, Charset charset) {
@Nullable String currentLine; @Nullable String currentLine;
while ((currentLine = data.readLine(charset)) != null while ((currentLine = data.readLine(charset)) != null
&& (data.bytesLeft() == 0 || data.peekChar(charset) != '[')) { && (data.bytesLeft() == 0 || data.peekCodePoint(charset) != '[')) {
String[] infoNameAndValue = currentLine.split(":"); String[] infoNameAndValue = currentLine.split(":");
if (infoNameAndValue.length != 2) { if (infoNameAndValue.length != 2) {
continue; continue;
@ -266,7 +266,7 @@ public final class SsaParser implements SubtitleParser {
@Nullable SsaStyle.Format formatInfo = null; @Nullable SsaStyle.Format formatInfo = null;
@Nullable String currentLine; @Nullable String currentLine;
while ((currentLine = data.readLine(charset)) != null while ((currentLine = data.readLine(charset)) != null
&& (data.bytesLeft() == 0 || data.peekChar(charset) != '[')) { && (data.bytesLeft() == 0 || data.peekCodePoint(charset) != '[')) {
if (currentLine.startsWith(FORMAT_LINE_PREFIX)) { if (currentLine.startsWith(FORMAT_LINE_PREFIX)) {
formatInfo = SsaStyle.Format.fromFormatLine(currentLine); formatInfo = SsaStyle.Format.fromFormatLine(currentLine);
} else if (currentLine.startsWith(STYLE_LINE_PREFIX)) { } else if (currentLine.startsWith(STYLE_LINE_PREFIX)) {