Fix AsciiBytes unicode decoding

Fix the decoding logic in the AsciiBytes `hashCode` and `matches` to
correctly deal with multi-byte encodings.

Fixes gh-12504
pull/12697/head
Phillip Webb 7 years ago
parent 98a2a91d16
commit 9a64d3bf3f

@ -29,7 +29,9 @@ final class AsciiBytes {
private static final String EMPTY_STRING = "";
private static final int[] EXCESS = { 0x0, 0x1080, 0x96, 0x1c82080 };
private static final int[] INITIAL_BYTE_BITMASK = { 0x7F, 0x1F, 0x0F, 0x07 };
private static final int SUBSEQUENT_BYTE_BITMASK = 0x3F;
private final byte[] bytes;
@ -142,13 +144,10 @@ final class AsciiBytes {
int totalLen = (nameLen + (suffix == 0 ? 0 : 1));
for (int i = this.offset; i < this.offset + this.length; i++) {
int b = this.bytes[i];
if (b < 0) {
b = b & 0x7F;
int limit = getRemainingUtfBytes(b);
for (int j = 0; j < limit; j++) {
b = (b << 6) + (this.bytes[++i] & 0xFF);
}
b -= EXCESS[limit];
int remainingUtfBytes = getNumberOfUtfBytes(b) - 1;
b &= INITIAL_BYTE_BITMASK[remainingUtfBytes];
for (int j = 0; j < remainingUtfBytes; j++) {
b = (b << 6) + (this.bytes[++i] & SUBSEQUENT_BYTE_BITMASK);
}
char c = getChar(name, suffix, charIndex++);
if (b <= 0xFFFF) {
@ -185,13 +184,10 @@ final class AsciiBytes {
if (hash == 0 && this.bytes.length > 0) {
for (int i = this.offset; i < this.offset + this.length; i++) {
int b = this.bytes[i];
if (b < 0) {
b = b & 0x7F;
int limit = getRemainingUtfBytes(b);
for (int j = 0; j < limit; j++) {
b = (b << 6) + (this.bytes[++i] & 0xFF);
}
b -= EXCESS[limit];
int remainingUtfBytes = getNumberOfUtfBytes(b) - 1;
b &= INITIAL_BYTE_BITMASK[remainingUtfBytes];
for (int j = 0; j < remainingUtfBytes; j++) {
b = (b << 6) + (this.bytes[++i] & SUBSEQUENT_BYTE_BITMASK);
}
if (b <= 0xFFFF) {
hash = 31 * hash + b;
@ -206,8 +202,16 @@ final class AsciiBytes {
return hash;
}
private int getRemainingUtfBytes(int b) {
return (b < 96 ? 1 : (b < 112 ? 2 : 3));
private int getNumberOfUtfBytes(int b) {
if ((b & 0x80) == 0) {
return 1;
}
int numberOfUtfBytes = 0;
while ((b & 0x80) != 0) {
b <<= 1;
numberOfUtfBytes++;
}
return numberOfUtfBytes;
}
@Override

@ -1,5 +1,5 @@
/*
* Copyright 2012-2017 the original author or authors.
* Copyright 2012-2018 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -184,6 +184,18 @@ public class AsciiBytesTests {
matchesSameAsString("\ud83d\udca9");
}
@Test
public void hashCodeFromInstanceMatchesHashCodeFromString() {
String name = "fonts/宋体/simsun.ttf";
assertThat(new AsciiBytes(name).hashCode()).isEqualTo(AsciiBytes.hashCode(name));
}
@Test
public void instanceCreatedFromCharSequenceMatchesSameCharSequence() {
String name = "fonts/宋体/simsun.ttf";
assertThat(new AsciiBytes(name).matches(name, NO_SUFFIX)).isTrue();
}
private void matchesSameAsString(String input) {
assertThat(new AsciiBytes(input).matches(input, NO_SUFFIX)).isTrue();
}

Loading…
Cancel
Save