Define how ParsableByteArray.peekChar behaves for UTF-8 multi-byte characters

PiperOrigin-RevId: 730908033
This commit is contained in:
ibaker 2025-02-25 09:03:00 -08:00 committed by Copybara-Service
parent cecee3bdda
commit 6ffa0854d5

View File

@ -252,9 +252,13 @@ public final class ParsableByteArray {
/** /**
* Peeks at the next char (as decoded by {@code charset}) * Peeks at the next char (as decoded by {@code charset})
* *
* <p>If {@code charset} is UTF-8, only single-byte characters are supported and this method
* returns zero if {@link #position} is pointing to any part of a multi-byte character.
*
* @throws IllegalArgumentException if charset is not supported. Only US_ASCII, UTF-8, UTF-16, * @throws IllegalArgumentException if charset is not supported. Only US_ASCII, UTF-8, UTF-16,
* UTF-16BE, and UTF-16LE are supported. * UTF-16BE, and UTF-16LE are supported.
*/ */
// TODO: b/398845842 - Make this work 'correctly' for multi-byte UTF-8, or deprecate it.
public char peekChar(Charset charset) { public char peekChar(Charset charset) {
Assertions.checkArgument( Assertions.checkArgument(
SUPPORTED_CHARSETS_FOR_READLINE.contains(charset), "Unsupported charset: " + charset); SUPPORTED_CHARSETS_FOR_READLINE.contains(charset), "Unsupported charset: " + charset);
@ -732,18 +736,19 @@ public final class ParsableByteArray {
byte charByte1; byte charByte1;
byte charByte2; byte charByte2;
byte characterSize; byte characterSize;
if ((charset.equals(StandardCharsets.UTF_8) || charset.equals(StandardCharsets.US_ASCII)) if (bytesLeft() >= 1
&& bytesLeft() >= 1) { && ((charset.equals(StandardCharsets.UTF_8) && (data[position] & 0x80) == 0)
|| charset.equals(StandardCharsets.US_ASCII))) {
// TODO: b/398845842 - Handle multi-byte UTF-8.
charByte1 = 0; charByte1 = 0;
charByte2 = data[position]; charByte2 = data[position];
characterSize = 1; characterSize = 1;
} else if ((charset.equals(StandardCharsets.UTF_16) } else if (bytesLeft() >= 2
|| charset.equals(StandardCharsets.UTF_16BE)) && (charset.equals(StandardCharsets.UTF_16) || charset.equals(StandardCharsets.UTF_16BE))) {
&& bytesLeft() >= 2) {
charByte1 = data[position]; charByte1 = data[position];
charByte2 = data[position + 1]; charByte2 = data[position + 1];
characterSize = 2; characterSize = 2;
} else if (charset.equals(StandardCharsets.UTF_16LE) && bytesLeft() >= 2) { } else if (bytesLeft() >= 2 && charset.equals(StandardCharsets.UTF_16LE)) {
charByte1 = data[position + 1]; charByte1 = data[position + 1];
charByte2 = data[position]; charByte2 = data[position];
characterSize = 2; characterSize = 2;