aboutsummaryrefslogtreecommitdiff
path: root/scripts/dictionary/step-04-generate-java-literals.py
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/dictionary/step-04-generate-java-literals.py')
-rw-r--r--scripts/dictionary/step-04-generate-java-literals.py79
1 files changed, 79 insertions, 0 deletions
diff --git a/scripts/dictionary/step-04-generate-java-literals.py b/scripts/dictionary/step-04-generate-java-literals.py
new file mode 100644
index 0000000..fd3c64e
--- /dev/null
+++ b/scripts/dictionary/step-04-generate-java-literals.py
@@ -0,0 +1,79 @@
+# Step 04 - generate Java literals.
+#
+# Java byte-code has ridiculous restrictions. There is no such thing as
+# "array literal" - those are implemented as series of data[x] = y;
+# as a consequence N-byte array will use 7N bytes in class, plus N bytes
+# in instantiated variable. Also no literal could be longer than 64KiB.
+#
+# To keep dictionary data compact both in source code and in compiled format
+# we use the following tricks:
+# * use String as a data container
+# * store only lowest 7 bits; i.e. all characters fit ASCII table; this allows
+# efficient conversion to byte array; also ASCII characters use only 1 byte
+#. of memory (UTF-8 encoding)
+# * RLE-compress sequence of 8-th bits
+#
+# This script generates literals used in Java code.
+
+bin_path = "dictionary.bin"
+
+with open(bin_path, "rb") as raw:
+ data = raw.read()
+
+low = []
+hi = []
+is_skip = True
+skip_flip_offset = 36
+cntr = skip_flip_offset
+for b in data:
+ value = ord(b)
+ low.append(chr(value & 0x7F))
+ if is_skip:
+ if value < 0x80:
+ cntr += 1
+ else:
+ is_skip = False
+ hi.append(unichr(cntr))
+ cntr = skip_flip_offset + 1
+ else:
+ if value >= 0x80:
+ cntr += 1
+ else:
+ is_skip = True
+ hi.append(unichr(cntr))
+ cntr = skip_flip_offset + 1
+hi.append(unichr(cntr))
+
+low0 = low[0 : len(low) // 2]
+low1 = low[len(low) // 2 : len(low)]
+
+def escape(chars):
+ result = []
+ for c in chars:
+ if "\r" == c:
+ result.append("\\r")
+ elif "\n" == c:
+ result.append("\\n")
+ elif "\t" == c:
+ result.append("\\t")
+ elif "\"" == c:
+ result.append("\\\"")
+ elif "\\" == c:
+ result.append("\\\\")
+ elif ord(c) < 32 or ord(c) >= 127:
+ result.append("\\u%04X" % ord(c))
+ else:
+ result.append(c);
+ return result
+
+
+source_code = [
+ " private static final String DATA0 = \"", "".join(escape(low0)), "\";\n",
+ " private static final String DATA1 = \"", "".join(escape(low1)), "\";\n",
+ " private static final String SKIP_FLIP = \"", "".join(escape(hi)), "\";\n"
+]
+
+src_path = "DictionaryData.inc.java"
+
+with open(src_path, "w") as source:
+ source.write("".join(source_code))