diff options
author | Eugene Kliuchnikov <eustas@google.com> | 2018-10-24 16:06:09 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-10-24 16:06:09 +0200 |
commit | d0ffe60b87aa5ec302fcb031c8ebf726c1a1692a (patch) | |
tree | a4963c527854ab3ee63c3f7fa9a7b99818000e2a /java/org/brotli/dec/Transform.java | |
parent | d6d98957ca8ccb1ef45922e978bb10efca0ea541 (diff) | |
download | brotli-d0ffe60b87aa5ec302fcb031c8ebf726c1a1692a.zip brotli-d0ffe60b87aa5ec302fcb031c8ebf726c1a1692a.tar.gz brotli-d0ffe60b87aa5ec302fcb031c8ebf726c1a1692a.tar.bz2 |
Verbose CLI + start pulling "Shared-Brotli" (#722)
* Verbose CLI + start pulling "Shared-Brotli"
* vesbose CLI output; fix #666
* pull `SHIFT` transforms; currently this is semantically dead code;
later it will be used by "Shared-Brotli"
Diffstat (limited to 'java/org/brotli/dec/Transform.java')
-rw-r--r-- | java/org/brotli/dec/Transform.java | 183 |
1 files changed, 152 insertions, 31 deletions
diff --git a/java/org/brotli/dec/Transform.java b/java/org/brotli/dec/Transform.java index b90f2e9..3279ee7 100644 --- a/java/org/brotli/dec/Transform.java +++ b/java/org/brotli/dec/Transform.java @@ -10,13 +10,58 @@ import java.nio.ByteBuffer; /** * Transformations on dictionary words. + * + * Transform descriptor is a triplet: {prefix, operator, suffix}. + * "prefix" and "suffix" are short strings inserted before and after transformed dictionary word. + * "operator" is applied to dictionary word itself. + * + * Some operators has "built-in" parameters, i.e. parameter is defined by operator ordinal. Other + * operators have "external" parameters, supplied via additional table encoded in shared dictionary. + * + * Operators: + * - IDENTITY (0): dictionary word is inserted "as is" + * - OMIT_LAST_N (1 - 9): last N octets of dictionary word are not inserted; N == ordinal + * - OMIT_FIRST_M (12-20): first M octets of dictionary word are not inserted; M == ordinal - 11 + * - UPPERCASE_FIRST (10): first "scalar" is XOR'ed with number 32 + * - UPPERCASE_ALL (11): all "scalars" are XOR'ed with number 32 + * - SHIFT_FIRST (21): first "scalar" is shifted by number form parameter table + * - SHIFT_ALL (22): all "scalar" is shifted by number form parameter table + * + * Here "scalar" is a variable length character coding similar to UTF-8 encoding. + * UPPERCASE_XXX / SHIFT_XXX operators were designed to change the case of UTF-8 encoded characters. + * While UPPERCASE_XXX works well only on ASCII charset, SHIFT is much more generic and could be + * used for most (all?) alphabets. */ final class Transform { - static final int NUM_TRANSFORMS = 121; - private static final int[] TRANSFORMS = new int[NUM_TRANSFORMS * 3]; - private static final byte[] PREFIX_SUFFIX = new byte[217]; - private static final int[] PREFIX_SUFFIX_HEADS = new int[51]; + static final class Transforms { + final int numTransforms; + final int[] triplets; + final byte[] prefixSuffixStorage; + final int[] prefixSuffixHeads; + final short[] params; + + Transforms(int numTransforms, int prefixSuffixLen, int prefixSuffixCount) { + this.numTransforms = numTransforms; + this.triplets = new int[numTransforms * 3]; + this.params = new short[numTransforms]; + this.prefixSuffixStorage = new byte[prefixSuffixLen]; + this.prefixSuffixHeads = new int[prefixSuffixCount + 1]; + } + } + + static final int NUM_RFC_TRANSFORMS = 121; + static final Transforms RFC_TRANSFORMS = new Transforms(NUM_RFC_TRANSFORMS, 167, 50); + + private static final int OMIT_FIRST_LAST_LIMIT = 9; + + private static final int IDENTITY = 0; + private static final int OMIT_LAST_BASE = IDENTITY + 1 - 1; // there is no OMIT_LAST_0. + private static final int UPPERCASE_FIRST = OMIT_LAST_BASE + OMIT_FIRST_LAST_LIMIT + 1; + private static final int UPPERCASE_ALL = UPPERCASE_FIRST + 1; + private static final int OMIT_FIRST_BASE = UPPERCASE_ALL + 1 - 1; // there is no OMIT_FIRST_0. + private static final int SHIFT_FIRST = OMIT_FIRST_BASE + OMIT_FIRST_LAST_LIMIT + 1; + private static final int SHIFT_ALL = SHIFT_FIRST + 1; // Bundle of 0-terminated strings. private static final String PREFIX_SUFFIX_SRC = "# #s #, #e #.# the #.com/#\u00C2\u00A0# of # and" @@ -29,71 +74,87 @@ final class Transform { + " G% ! *A *% H! D I!+! J!+ K +- *4! A L!*4 M N +6 O!*% +.! K *G P +%( ! G *D +D " + " Q +# *K!*G!+D!+# +G +A +4!+% +K!+4!*D!+K!*K"; - private static void unpackTransforms(byte[] prefixSuffix, int[] prefixSuffixHeads, - int[] transforms, String prefixSuffixSrc, String transformsSrc) { + private static void unpackTransforms(byte[] prefixSuffix, + int[] prefixSuffixHeads, int[] transforms, String prefixSuffixSrc, String transformsSrc) { int n = prefixSuffixSrc.length(); int index = 1; + int j = 0; for (int i = 0; i < n; ++i) { char c = prefixSuffixSrc.charAt(i); - prefixSuffix[i] = (byte) c; if (c == 35) { // == # - prefixSuffixHeads[index++] = i + 1; - prefixSuffix[i] = 0; + prefixSuffixHeads[index++] = j; + } else { + prefixSuffix[j++] = (byte) c; } } - for (int i = 0; i < NUM_TRANSFORMS * 3; ++i) { + for (int i = 0; i < NUM_RFC_TRANSFORMS * 3; ++i) { transforms[i] = transformsSrc.charAt(i) - 32; } } static { - unpackTransforms(PREFIX_SUFFIX, PREFIX_SUFFIX_HEADS, TRANSFORMS, PREFIX_SUFFIX_SRC, - TRANSFORMS_SRC); + unpackTransforms(RFC_TRANSFORMS.prefixSuffixStorage, RFC_TRANSFORMS.prefixSuffixHeads, + RFC_TRANSFORMS.triplets, PREFIX_SUFFIX_SRC, TRANSFORMS_SRC); } - static int transformDictionaryWord(byte[] dst, int dstOffset, ByteBuffer data, int wordOffset, - int len, int transformIndex) { + static int transformDictionaryWord(byte[] dst, int dstOffset, ByteBuffer src, int srcOffset, + int len, Transforms transforms, int transformIndex) { int offset = dstOffset; + int[] triplets = transforms.triplets; + byte[] prefixSuffixStorage = transforms.prefixSuffixStorage; + int[] prefixSuffixHeads = transforms.prefixSuffixHeads; int transformOffset = 3 * transformIndex; - int transformPrefix = PREFIX_SUFFIX_HEADS[TRANSFORMS[transformOffset]]; - int transformType = TRANSFORMS[transformOffset + 1]; - int transformSuffix = PREFIX_SUFFIX_HEADS[TRANSFORMS[transformOffset + 2]]; + int prefixIdx = triplets[transformOffset]; + int transformType = triplets[transformOffset + 1]; + int suffixIdx = triplets[transformOffset + 2]; + int prefix = prefixSuffixHeads[prefixIdx]; + int prefixEnd = prefixSuffixHeads[prefixIdx + 1]; + int suffix = prefixSuffixHeads[suffixIdx]; + int suffixEnd = prefixSuffixHeads[suffixIdx + 1]; + + int omitFirst = transformType - OMIT_FIRST_BASE; + int omitLast = transformType - OMIT_LAST_BASE; + if (omitFirst < 1 || omitFirst > OMIT_FIRST_LAST_LIMIT) { + omitFirst = 0; + } + if (omitLast < 1 || omitLast > OMIT_FIRST_LAST_LIMIT) { + omitLast = 0; + } // Copy prefix. - while (PREFIX_SUFFIX[transformPrefix] != 0) { - dst[offset++] = PREFIX_SUFFIX[transformPrefix++]; + while (prefix != prefixEnd) { + dst[offset++] = prefixSuffixStorage[prefix++]; } // Copy trimmed word. - int omitFirst = transformType >= 12 ? (transformType - 11) : 0; if (omitFirst > len) { omitFirst = len; } - wordOffset += omitFirst; + srcOffset += omitFirst; len -= omitFirst; - len -= transformType <= 9 ? transformType : 0; // Omit last. + len -= omitLast; int i = len; while (i > 0) { - dst[offset++] = data.get(wordOffset++); + dst[offset++] = src.get(srcOffset++); i--; } // Ferment. - if (transformType == 11 || transformType == 10) { + if (transformType == UPPERCASE_FIRST || transformType == UPPERCASE_ALL) { int uppercaseOffset = offset - len; - if (transformType == 10) { + if (transformType == UPPERCASE_FIRST) { len = 1; } while (len > 0) { - int tmp = dst[uppercaseOffset] & 0xFF; - if (tmp < 0xc0) { - if (tmp >= 97 && tmp <= 122) { // in [a..z] range + int c0 = dst[uppercaseOffset] & 0xFF; + if (c0 < 0xC0) { + if (c0 >= 97 && c0 <= 122) { // in [a..z] range dst[uppercaseOffset] ^= (byte) 32; } uppercaseOffset += 1; len -= 1; - } else if (tmp < 0xe0) { + } else if (c0 < 0xE0) { dst[uppercaseOffset + 1] ^= (byte) 32; uppercaseOffset += 2; len -= 2; @@ -103,11 +164,71 @@ final class Transform { len -= 3; } } + } else if (transformType == SHIFT_FIRST || transformType == SHIFT_ALL) { + int shiftOffset = offset - len; + short param = transforms.params[transformIndex]; + /* Limited sign extension: scalar < (1 << 24). */ + int scalar = (param & 0x7FFF) + (0x1000000 - (param & 0x8000)); + while (len > 0) { + int step = 1; + int c0 = dst[shiftOffset] & 0xFF; + if (c0 < 0x80) { + /* 1-byte rune / 0sssssss / 7 bit scalar (ASCII). */ + scalar += c0; + dst[shiftOffset] = (byte) (scalar & 0x7F); + } else if (c0 < 0xC0) { + /* Continuation / 10AAAAAA. */ + } else if (c0 < 0xE0) { + /* 2-byte rune / 110sssss AAssssss / 11 bit scalar. */ + if (len >= 2) { + byte c1 = dst[shiftOffset + 1]; + scalar += (c1 & 0x3F) | ((c0 & 0x1F) << 6); + dst[shiftOffset] = (byte) (0xC0 | ((scalar >> 6) & 0x1F)); + dst[shiftOffset + 1] = (byte) ((c1 & 0xC0) | (scalar & 0x3F)); + step = 2; + } else { + step = len; + } + } else if (c0 < 0xF0) { + /* 3-byte rune / 1110ssss AAssssss BBssssss / 16 bit scalar. */ + if (len >= 3) { + byte c1 = dst[shiftOffset + 1]; + byte c2 = dst[shiftOffset + 2]; + scalar += (c2 & 0x3F) | ((c1 & 0x3F) << 6) | ((c0 & 0x0F) << 12); + dst[shiftOffset] = (byte) (0xE0 | ((scalar >> 12) & 0x0F)); + dst[shiftOffset + 1] = (byte) ((c1 & 0xC0) | ((scalar >> 6) & 0x3F)); + dst[shiftOffset + 2] = (byte) ((c2 & 0xC0) | (scalar & 0x3F)); + step = 3; + } else { + step = len; + } + } else if (c0 < 0xF8) { + /* 4-byte rune / 11110sss AAssssss BBssssss CCssssss / 21 bit scalar. */ + if (len >= 4) { + byte c1 = dst[shiftOffset + 1]; + byte c2 = dst[shiftOffset + 2]; + byte c3 = dst[shiftOffset + 3]; + scalar += (c3 & 0x3F) | ((c2 & 0x3F) << 6) | ((c1 & 0x3F) << 12) | ((c0 & 0x07) << 18); + dst[shiftOffset] = (byte) (0xF0 | ((scalar >> 18) & 0x07)); + dst[shiftOffset + 1] = (byte) ((c1 & 0xC0) | ((scalar >> 12) & 0x3F)); + dst[shiftOffset + 2] = (byte) ((c2 & 0xC0) | ((scalar >> 6) & 0x3F)); + dst[shiftOffset + 3] = (byte) ((c3 & 0xC0) | (scalar & 0x3F)); + step = 4; + } else { + step = len; + } + } + shiftOffset += step; + len -= step; + if (transformType == SHIFT_FIRST) { + len = 0; + } + } } // Copy suffix. - while (PREFIX_SUFFIX[transformSuffix] != 0) { - dst[offset++] = PREFIX_SUFFIX[transformSuffix++]; + while (suffix != suffixEnd) { + dst[offset++] = prefixSuffixStorage[suffix++]; } return offset - dstOffset; |