Fixes for correctly supporting UTF-16 and UTF-16BE charsets

For fields encoded using the UTF-16 or UTF-16BE charsets, we have to look for
two consecutive zero bytes when searching for the termination character.

Otherwise, because many characters encoded in UTF-16 or UTF-16BE have one
of their two bytes set to zero, we would end up truncating text
fields.
This commit is contained in:
J. Oliva 2015-02-26 21:54:29 +01:00
parent ebbd022a52
commit c135bb7a57
2 changed files with 36 additions and 12 deletions

View File

@ -29,6 +29,11 @@ import java.util.Map;
*/
public class Id3Parser implements MetadataParser<Map<String, Object>> {
private static final int ID3_TEXT_ENCODING_ISO_8859_1 = 0;
private static final int ID3_TEXT_ENCODING_UTF_16 = 1;
private static final int ID3_TEXT_ENCODING_UTF_16BE = 2;
private static final int ID3_TEXT_ENCODING_UTF_8 = 3;
@Override
public boolean canParse(String mimeType) {
return mimeType.equals(MimeTypes.APPLICATION_ID3);
@ -60,10 +65,10 @@ public class Id3Parser implements MetadataParser<Map<String, Object>> {
byte[] frame = new byte[frameSize - 1];
id3Data.readBytes(frame, 0, frameSize - 1);
int firstZeroIndex = indexOf(frame, 0, (byte) 0);
int firstZeroIndex = indexOfEOS(frame, 0, encoding);
String description = new String(frame, 0, firstZeroIndex, charset);
int valueStartIndex = indexOfNot(frame, firstZeroIndex, (byte) 0);
int valueEndIndex = indexOf(frame, valueStartIndex, (byte) 0);
int valueStartIndex = firstZeroIndex + 1;
int valueEndIndex = indexOfEOS(frame, valueStartIndex, encoding);
String value = new String(frame, valueStartIndex, valueEndIndex - valueStartIndex,
charset);
metadata.put(TxxxMetadata.TYPE, new TxxxMetadata(description, value));
@ -73,7 +78,7 @@ public class Id3Parser implements MetadataParser<Map<String, Object>> {
id3Data.readBytes(frame, 0, frameSize);
int firstZeroIndex = indexOf(frame, 0, (byte) 0);
String owner = new String(frame, 0, firstZeroIndex);
String owner = new String(frame, 0, firstZeroIndex, "ISO-8859-1");
byte[] privateData = new byte[frameSize - firstZeroIndex - 1];
System.arraycopy(frame, firstZeroIndex + 1, privateData, 0, frameSize - firstZeroIndex - 1);
metadata.put(PrivMetadata.TYPE, new PrivMetadata(owner, privateData));
@ -85,13 +90,13 @@ public class Id3Parser implements MetadataParser<Map<String, Object>> {
id3Data.readBytes(frame, 0, frameSize - 1);
int firstZeroIndex = indexOf(frame, 0, (byte) 0);
String mimeType = new String(frame, 0, firstZeroIndex);
String mimeType = new String(frame, 0, firstZeroIndex, "ISO-8859-1");
int filenameStartIndex = firstZeroIndex + 1;
int filenameEndIndex = indexOf(frame, filenameStartIndex, (byte) 0);
int filenameEndIndex = indexOfEOS(frame, filenameStartIndex, encoding);
String filename = new String(frame, filenameStartIndex,
filenameEndIndex - filenameStartIndex, charset);
int descriptionStartIndex = filenameEndIndex + 1;
int descriptionEndIndex = indexOf(frame, descriptionStartIndex, (byte) 0);
int descriptionEndIndex = indexOfEOS(frame, descriptionStartIndex, encoding);
String description = new String(frame, descriptionStartIndex,
descriptionEndIndex - descriptionStartIndex, charset);
@ -131,6 +136,25 @@ public class Id3Parser implements MetadataParser<Map<String, Object>> {
return data.length;
}
/**
 * Locates the end-of-string terminator of a text field that starts at {@code fromIndex}.
 *
 * <p>For the two-byte encodings (UTF-16 and UTF-16BE) the terminator is a pair of zero bytes
 * that starts on a code-unit boundary, i.e. at an even offset from {@code fromIndex}. Every
 * other encoding byte is treated as a single-byte-terminated charset, which matches the
 * ISO-8859-1 fallback used when the charset name is resolved.
 *
 * @param data The bytes to search.
 * @param fromIndex The index of the first byte of the string.
 * @param encodingByte The ID3 text-encoding byte of the field.
 * @return The index of the last byte of the terminator (the single zero byte, or the second
 *     byte of the zero pair), so that {@code result + 1} is the first byte after the
 *     terminator; {@code data.length} if no terminator was found.
 */
private static int indexOfEOS(byte[] data, int fromIndex, int encodingByte) {
  // NOTE(review): relies on indexOf returning data.length when no zero byte is found -- confirm.
  int terminationPos = indexOf(data, fromIndex, (byte) 0);

  // Only UTF-16 and UTF-16BE use a two-byte terminator. Everything else (including unknown
  // encoding bytes, which are decoded as ISO-8859-1) ends at the first zero byte.
  boolean twoByteTerminator = encodingByte == ID3_TEXT_ENCODING_UTF_16
      || encodingByte == ID3_TEXT_ENCODING_UTF_16BE;
  if (!twoByteTerminator) {
    return terminationPos;
  }

  // Look for two consecutive zero bytes that begin on a code-unit boundary. Without the
  // alignment check, the zero high/low byte of one character followed by the zero byte of the
  // next character (or of the terminator itself) would be mistaken for the terminator.
  while (terminationPos < data.length - 1) {
    boolean onCodeUnitBoundary = (terminationPos - fromIndex) % 2 == 0;
    if (onCodeUnitBoundary && data[terminationPos + 1] == (byte) 0) {
      // Callers compute the next field start as result + 1, so report the second zero byte.
      return terminationPos + 1;
    }
    terminationPos = indexOf(data, terminationPos + 1, (byte) 0);
  }

  return data.length;
}
/**
* Parses an ID3 header.
*
@ -175,13 +199,13 @@ public class Id3Parser implements MetadataParser<Map<String, Object>> {
*/
private static String getCharsetName(int encodingByte) {
switch (encodingByte) {
case 0:
case ID3_TEXT_ENCODING_ISO_8859_1:
return "ISO-8859-1";
case 1:
case ID3_TEXT_ENCODING_UTF_16:
return "UTF-16";
case 2:
case ID3_TEXT_ENCODING_UTF_16BE:
return "UTF-16BE";
case 3:
case ID3_TEXT_ENCODING_UTF_8:
return "UTF-8";
default:
return "ISO-8859-1";

View File

@ -26,7 +26,7 @@ public class PrivMetadata {
public final String owner;
public final byte[] privateData;
public PrivMetadata(String owner, byte [] privateData) {
public PrivMetadata(String owner, byte[] privateData) {
this.owner = owner;
this.privateData = privateData;
}