diff options
Diffstat (limited to 'scripts/dictionary/step-04-generate-java-literals.py')
-rw-r--r-- | scripts/dictionary/step-04-generate-java-literals.py | 79 |
1 files changed, 79 insertions, 0 deletions
diff --git a/scripts/dictionary/step-04-generate-java-literals.py b/scripts/dictionary/step-04-generate-java-literals.py new file mode 100644 index 0000000..fd3c64e --- /dev/null +++ b/scripts/dictionary/step-04-generate-java-literals.py @@ -0,0 +1,79 @@ +# Step 04 - generate Java literals. +# +# Java byte-code has ridiculous restrictions. There is no such thing as +# "array literal" - those are implemented as series of data[x] = y; +# as a consequence N-byte array will use 7N bytes in class, plus N bytes +# in instantiated variable. Also no literal could be longer than 64KiB. +# +# To keep dictionary data compact both in source code and in compiled format +# we use the following tricks: +# * use String as a data container +# * store only lowest 7 bits; i.e. all characters fit ASCII table; this allows +# efficient conversion to byte array; also ASCII characters use only 1 byte +# of memory (UTF-8 encoding) +# * RLE-compress sequence of 8-th bits +# +# This script generates literals used in Java code.
# Input: raw dictionary bytes.  Output: Java source fragment with the literals.
bin_path = "dictionary.bin"
src_path = "DictionaryData.inc.java"

# Added to every run length before it is stored as a character.
# NOTE(review): presumably chosen so that short runs map to printable
# characters that need no escaping - confirm against the Java decoder.
skip_flip_offset = 36

# Python 2 needs unichr() for code points above 255; Python 3 dropped it
# because chr() already covers the whole Unicode range.
try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # Python 3

# Characters that have a dedicated short escape inside a Java string literal.
_SIMPLE_ESCAPES = {
    "\r": "\\r",
    "\n": "\\n",
    "\t": "\\t",
    "\"": "\\\"",
    "\\": "\\\\",
}


def split_bits(data, offset=skip_flip_offset):
    """Split raw bytes into 7-bit characters and run lengths of the 8th bit.

    Args:
        data: the raw bytes to encode.
        offset: value added to each run length before it becomes a character.

    Returns:
        A ``(low, hi)`` pair of lists of one-character strings.  ``low`` has
        one character per input byte, holding its lowest 7 bits.  ``hi``
        holds the lengths of alternating runs of bytes with the 8th bit
        clear / set; the first run is a "clear" run and may be empty.
    """
    low = []
    hi = []
    is_skip = True  # True while inside a run of bytes with the 8th bit clear.
    cntr = offset
    # bytearray yields ints on both Python 2 and Python 3, unlike iterating
    # the bytes object directly (1-char strings on 2, ints on 3).
    for value in bytearray(data):
        low.append(chr(value & 0x7F))
        if (value < 0x80) == is_skip:
            # Current byte continues the current run.
            cntr += 1
        else:
            # Run ended: emit its length and start the opposite run, which
            # already contains the current byte (hence the "+ 1").
            is_skip = not is_skip
            hi.append(_unichr(cntr))
            cntr = offset + 1
    hi.append(_unichr(cntr))
    return low, hi


def escape(chars):
    """Escape characters for use inside a Java string literal.

    Args:
        chars: iterable of one-character strings.

    Returns:
        A list with one string per input character: a short escape for CR,
        LF, TAB, double quote and backslash; a ``\\uXXXX`` escape for
        anything below 32 or at/above 127; the character itself otherwise.

    Raises:
        ValueError: if a character is above U+FFFF.  A ``\\u`` escape takes
            exactly four hex digits, so such a value would be silently
            misread by the Java compiler.
    """
    result = []
    for c in chars:
        code = ord(c)
        if c in _SIMPLE_ESCAPES:
            result.append(_SIMPLE_ESCAPES[c])
        elif code < 32 or code >= 127:
            if code > 0xFFFF:
                raise ValueError(
                    "code point U+%X does not fit a \\uXXXX escape" % code)
            result.append("\\u%04X" % code)
        else:
            result.append(c)
    return result


def build_source(data):
    """Return the Java source fragment declaring the literals for ``data``.

    The 7-bit data is split into two halves (DATA0 / DATA1); the run
    lengths of the 8th bit go into SKIP_FLIP.
    """
    low, hi = split_bits(data)
    half = len(low) // 2
    low0 = low[:half]
    low1 = low[half:]
    source_code = [
        " private static final String DATA0 = \"", "".join(escape(low0)), "\";\n",
        " private static final String DATA1 = \"", "".join(escape(low1)), "\";\n",
        " private static final String SKIP_FLIP = \"", "".join(escape(hi)), "\";\n",
    ]
    return "".join(source_code)


def main():
    """Read ``bin_path`` and write the generated literals to ``src_path``."""
    with open(bin_path, "rb") as raw:
        data = raw.read()

    # The generated text is pure ASCII (everything else is \u-escaped).
    with open(src_path, "w") as source:
        source.write(build_source(data))


if __name__ == "__main__":
    main()