aboutsummaryrefslogtreecommitdiff
path: root/python
diff options
context:
space:
mode:
authorjneb <jneb@users.noreply.github.com>2016-12-20 14:41:47 +0100
committerEugene Kliuchnikov <eustas@google.com>2016-12-20 14:41:47 +0100
commitf62cd2bcd294fa1562034acd4c6cde028744191e (patch)
tree2dea958e39151b41bf676816764972a9beb6cd08 /python
parent89a5b6e625eebadb0324f2cc1cccad4c2265aaf7 (diff)
downloadbrotli-f62cd2bcd294fa1562034acd4c6cde028744191e.zip
brotli-f62cd2bcd294fa1562034acd4c6cde028744191e.tar.gz
brotli-f62cd2bcd294fa1562034acd4c6cde028744191e.tar.bz2
brotlidump.py: disassemble brotli file (revisited) (#314)
* Create brotlidump.py Sorry, I am a newbie. I couldn't find my file anymore when I wanted to edit it. Hope I don't waste your time. * Fixed a bug where it couldn't read its own compression. The problem was that a prefix code ending with a 16 "repeat" didn't realize the table was full already. Also minor bug fixes, comments and stuff. * Major refactoring Rewrote almost everything. Now can dump its own compression. * Now more or less complete Appears to handle all files completely (including metablock data). Used as inspiration for the the hex example (see makehexexample.py)
Diffstat (limited to 'python')
-rw-r--r--python/brotlidump.py2361
1 files changed, 2361 insertions, 0 deletions
diff --git a/python/brotlidump.py b/python/brotlidump.py
new file mode 100644
index 0000000..a625368
--- /dev/null
+++ b/python/brotlidump.py
@@ -0,0 +1,2361 @@
+#!python3
+"""Program to dump contents of Brotli compressed files showing the compression format.
+Jurjen N.E. Bos, 2016.
+I found the following issues with the Brotli format:
+- The distance alphabet has size 16+(48<<POSTFIX),
+ but the last symbols are useless.
+ It could be lowered to 16+(44-POSTFIX<<POSTFIX), and this could matter.
+- The block type code is useless if NBLTYPES==2, you would only need 1 symbol
+ anyway, so why don't you just switch to "the other" type?
+"""
+import struct
+from operator import itemgetter, methodcaller
+from itertools import accumulate, repeat
+from collections import defaultdict, deque
+from functools import partial
+
+class InvalidStream(Exception): pass
+#lookup table
+L, I, D = "literal", "insert&copy", "distance"
+pL, pI, pD = 'P'+L, 'P'+I, 'P'+D
+
+def outputCharFormatter(c):
+ """Show character in readable format
+ """
+ #TODO 2: allow hex only output
+ if 32<c<127: return chr(c)
+ elif c==10: return '\\n'
+ elif c==13: return '\\r'
+ elif c==32: return '" "'
+ else: return '\\x{:02x}'.format(c)
+
+def outputFormatter(s):
+ """Show string or char.
+ """
+ result = ''
+ def formatSubString(s):
+ for c in s:
+ if c==32: yield ' '
+ else: yield outputCharFormatter(c)
+ if len(result)<200: return ''.join(formatSubString(s))
+ else:
+ return ''.join(formatSubString(s[:100]))+'...'+ \
+ ''.join(formatSubString(s[-100:]))
+
+
+class BitStream:
+ """Represent a bytes object. Can read bits and prefix codes the way
+ Brotli does.
+ """
+ def __init__(self, byteString):
+ self.data = byteString
+ #position in bits: byte pos is pos>>3, bit pos is pos&7
+ self.pos = 0
+
+ def __repr__(self):
+ """Representation
+ >>> olleke
+ BitStream(pos=0:0)
+ """
+ return "BitStream(pos={:x}:{})".format(self.pos>>3, self.pos&7)
+
+ def read(self, n):
+ """Read n bits from the stream and return as an integer.
+ Produces zero bits beyond the stream.
+ >>> olleke.data[0]==27
+ True
+ >>> olleke.read(5)
+ 27
+
+ >>> olleke
+ BitStream(pos=0:5)
+ """
+ value = self.peek(n)
+ self.pos += n
+ if self.pos>len(self.data)*8:
+ raise ValueError('Read past end of stream')
+ return value
+
+ def peek(self, n):
+ """Peek an n bit integer from the stream without updating the pointer.
+ It is not an error to read beyond the end of the stream.
+ >>> olleke.data[:2]==b'\x1b\x2e' and 0x2e1b==11803
+ True
+ >>> olleke.peek(15)
+ 11803
+ >>> hex(olleke.peek(32))
+ '0x2e1b'
+ """
+ #read bytes that contain the data: self.data[self.pos>>3:self.pos+n+7>>3]
+ #convert to int: int.from_bytes(..., 'little')
+ #shift out the bits from the first byte: >>(self.pos&7)
+ #mask unwanted bits: & (1<<n)-1
+ return int.from_bytes(
+ self.data[self.pos>>3:self.pos+n+7>>3],
+ 'little')>>(self.pos&7) & (1<<n)-1
+
+ def readBytes(self, n):
+ """Read n bytes from the stream on a byte boundary.
+ """
+ if self.pos&7: raise ValueError('readBytes: need byte boundary')
+ result = self.data[self.pos>>3:(self.pos>>3)+n]
+ self.pos += 8*n
+ return result
+
+#-----------------------Symbol-------------------------------------------
+class Symbol:
+ """A symbol in a code.
+ Refers back to the code that contains it.
+ Index is the place in the alphabet of the symbol.
+ """
+ __slots__ = 'code', 'index'
+ def __init__(self, code, value):
+ self.code = code
+ self.index = value
+
+ def __repr__(self):
+ return 'Symbol({}, {})'.format(self.code.name, self.index)
+
+ def __len__(self):
+ """Number of bits in the prefix notation of this symbol
+ """
+ return self.code.length(self.index)
+
+ def __int__(self):
+ return self.index
+
+ #these routines call equivalent routine in Code class
+ def bitPattern(self):
+ """Value of the symbol in the stream
+ """
+ return self.code.bitPattern(self.index)
+
+ def extraBits(self):
+ """Number of extra bits to read for this symbol
+ """
+ return self.code.extraBits(self.index)
+
+ def __str__(self):
+ """Short descriptor of the symbol without extra bits.
+ """
+ return self.code.mnemonic(self.index)
+
+ #requiring optional extra bits, if self.code supports them
+ def value(self, extra=None):
+ """The value used for processing. Can be a tuple.
+ with optional extra bits
+ """
+ if isinstance(self.code, WithExtra):
+ if not 0<=extra<1<<self.extraBits():
+ raise ValueError("value: extra value doesn't fit in extraBits")
+ return self.code.value(self.index, extra)
+ if extra is not None:
+ raise ValueError('value: no extra bits for this code')
+ return self.code.value(self.index)
+
+ def explanation(self, extra=None):
+ """Long explanation of the value from the numeric value
+ with optional extra bits
+ Used by Layout.verboseRead when printing the value
+ """
+ if isinstance(self.code, WithExtra):
+ return self.code.callback(self, extra)
+ return self.code.callback(self)
+
+#========================Code definitions==================================
+class RangeDecoder:
+ """A decoder for the Code class that assumes the symbols
+ are encoded consecutively in binary.
+ It all depends on the "alphabetSize" property.
+ The range runs from 0 to alphabetSize-1.
+ This is the default decoder.
+ """
+ def __init__(self, *, alphabetSize=None, bitLength=None, **args):
+ if bitLength is not None: alphabetSize = 1<<bitLength
+ if alphabetSize is not None:
+ self.alphabetSize = alphabetSize
+ self.maxLength = (alphabetSize-1).bit_length()
+
+ def __len__(self):
+ return self.alphabetSize
+
+ def __iter__(self):
+ """Produce all symbols.
+ """
+ return map(partial(Symbol, self), range(len(self)))
+
+ def __getitem__(self, index):
+ if index>=self.alphabetSize: raise ValueError('index out of range')
+ return Symbol(self, index)
+
+ def bitPattern(self, index):
+ return '{:0{}b}'.format(index, self.maxLength)
+
+ def length(self, index):
+ """Encoding length of given symbol.
+ Does not depend on index in this case.
+ """
+ return self.maxLength
+
+ def decodePeek(self, data):
+ """Find which symbol index matches the given data (from peek, as a number)
+ and return the number of bits decoded.
+ Can also be used to figure out length of a symbol.
+ """
+ return self.maxLength, Symbol(self, data&(1<<self.maxLength)-1)
+
+class PrefixDecoder:
+ """A decoder for the Code class that uses a prefix code.
+ The code is determined by encoding:
+ encode[p] gives the index corresponding to bit pattern p.
+ Used setDecode(decodeTable) to switch the decoder from the default
+ to a prefix decoder, or pass decodeTable at init.
+ You can also use setLength(lengthTable)
+ to define the encoding from the lengths.
+ The set of symbol values does not need to be consecutive.
+ """
+ def __init__(self, *, decodeTable=None, **args):
+ if decodeTable is not None: self.setDecode(decodeTable)
+
+ def __len__(self):
+ return len(self.decodeTable)
+
+ def __iter__(self):
+ def revBits(index):
+ return self.bitPattern(index)[::-1]
+ return (
+ Symbol(self, index)
+ for index in sorted(self.decodeTable.values(), key=revBits)
+ )
+
+ def __getitem__(self, index):
+ if index not in self.lengthTable:
+ raise ValueError('No symbol {}[{}]'.format(
+ self.__class__.__name__, index))
+ return Symbol(self, index)
+
+ def bitPattern(self, index):
+ bits = next(b for (b,s) in self.decodeTable.items() if s==index)
+ return '{:0{}b}'.format(bits, self.length(index))
+
+ def length(self, index):
+ """Encoding length of given symbol.
+ """
+ return self.lengthTable[index]
+
+ def decodePeek(self, data):
+ """Find which symbol index matches the given data (from peek, as a number)
+ and return the number of bits decoded.
+ Can also be used to figure out length of a symbol.
+ """
+ #do binary search for word length
+ #invariant: lo<=length<=hi
+ lo, hi = self.minLength, self.maxLength
+ while lo<=hi:
+ mid = lo+hi>>1
+ #note lo<=mid<hi at this point
+ mask = (1<<mid)-1
+ #lets see what happens if we guess length is mid
+ try: index = self.decodeTable[data&mask]
+ except KeyError:
+ #too many bits specified, reduce estimated length
+ hi = mid-1
+ continue
+ #we found a symbol, but there could be a longer match
+ symbolLength = self.lengthTable[index]
+ if symbolLength<=mid:
+ #all bits match, symbol must be right
+ return symbolLength, Symbol(self, index)
+ #there must be more bits to match
+ lo = mid+1
+ return lo, Symbol(self, index)
+
+ #routine to set up the tables
+ def setDecode(self, decodeTable):
+ """Store decodeTable,
+ and compute lengthTable, minLength, maxLength from encodings.
+ """
+ self.decodeTable = decodeTable
+ #set of symbols with unknown length
+ todo = set(decodeTable)
+ #bit size under investigation
+ maskLength = 0
+ lengthTable = {}
+ while todo:
+ mask = (1<<maskLength)-1
+ #split the encodings that we didn't find yet using b bits
+ splitSymbols = defaultdict(list)
+ for s in todo: splitSymbols[s&mask].append(s)
+ #unique encodings have a length of maskLength bits
+ #set length, and remove from todo list
+ for s,subset in splitSymbols.items():
+ if len(subset)==1:
+ lengthTable[self.decodeTable[s]] = maskLength
+ todo.remove(s)
+ #now investigate with longer mask
+ maskLength +=1
+ #save result
+ self.lengthTable = lengthTable
+ self.minLength = min(lengthTable.values())
+ self.maxLength = max(lengthTable.values())
+ self.switchToPrefix()
+
+ def setLength(self, lengthTable):
+ """Given the bit pattern lengths for symbols given in lengthTable,
+ set decodeTable, minLength, maxLength
+ """
+ self.lengthTable = lengthTable
+ self.minLength = min(lengthTable.values())
+ self.maxLength = max(lengthTable.values())
+ #compute the backwards codes first; then reverse them
+ #compute (backwards) first code for every separate lengths
+ nextCodes = []
+ #build codes for each length, from right to left
+ code = 0
+ for bits in range(self.maxLength+1):
+ code <<= 1
+ nextCodes.append(code)
+ code += sum(x==bits for x in lengthTable.values())
+ self.decodeTable = {}
+ #count codes for each length, and store reversed in the table
+ for symbol in sorted(lengthTable):
+ bits = lengthTable[symbol]
+ bitpattern = '{:0{}b}'.format(nextCodes[bits], bits)
+ self.decodeTable[int(bitpattern[::-1], 2)] = symbol
+ nextCodes[bits] += 1
+ self.switchToPrefix()
+
+ def switchToPrefix(self):
+ """This routine makes sure the prefix decoder is activated.
+ """
+ self.mode = PrefixDecoder
+
+class Code(RangeDecoder, PrefixDecoder):
+ """An alphabet of symbols, that can be read from a stream.
+ If you use setDecode or setLength, you have a prefix code,
+ otherwise you have a range code.
+ Features:
+ code[index] produces symbol with given index
+ value(index): value of symbol
+ mnemonic(index): short description of symbol
+ explanation(index): show meaning of symbol, shown in Layout.verboseRead
+ iter(code): produce all symbols in some order
+ name: show as context in Layout.verboseRead
+ """
+ name = '?'
+ #callback is a function that gets the symbol and the extra bits
+ #default callback calls explanation
+ def __init__(self, name=None, *, callback=None, description='', **args):
+ """Don't forget to set either alphabetSize or decodeTable
+ """
+ #set name when provided, otherwise take class variable
+ if name is not None: self.name = name
+ if callback is not None: self.callback = callback
+ self.description = description
+ #mode switch
+ if 'bitLength' in args or 'alphabetSize' in args:
+ self.mode = RangeDecoder
+ RangeDecoder.__init__(self, **args)
+ elif 'decodeTable' in args:
+ self.mode = PrefixDecoder
+ PrefixDecoder.__init__(self, **args)
+ else:
+ super().__init__(**args)
+
+ def __repr__(self):
+ return self.__class__.__name__+' '+self.name
+
+ #the routines that get switched between RangeDecoder and PrefixDecoder
+ def __len__(self): return self.mode.__len__(self)
+ def __iter__(self): return self.mode.__iter__(self)
+ def __getitem__(self, index): return self.mode.__getitem__(self, index)
+ def bitPattern(self, index): return self.mode.bitPattern(self, index)
+ def length(self, index): return self.mode.length(self, index)
+ def decodePeek(self, data): return self.mode.decodePeek(self, data)
+ #general routines
+ def value(self, index, extra=None):
+ """Get value of symbol for computations.
+ Override where needed.
+ """
+ if extra is not None:
+ raise ValueError('value: no extra for this symbol')
+ return index
+
+ def mnemonic(self, index):
+ """Give mnemonic of symbol.
+ Override where needed.
+ """
+ return str(self.value(index))
+
+ def callback(self, symbol):
+ return self.explanation(symbol.index)
+
+ def explanation(self, index):
+ """Long explanation of the value from the numeric value
+ This is a default routine.
+ You can customize in three ways:
+ - set description to add some text
+ - override to get more control
+ - set callback to make it dependent on you local variables
+ """
+ value = self.value(index)
+ return '{0}{1}: {2}'.format(
+ self.description and self.description+': ',
+ self.bitPattern(index),
+ value,
+ )
+
+ def extraBits(self, index):
+ return 0
+
+ #Routines that use the decode interface
+ def showCode(self, width=80):
+ """Show all words of the code in a nice format.
+ """
+ #make table of all symbols with binary strings
+ symbolStrings = [
+ (self.bitPattern(s.index), self.mnemonic(s.index))
+ for s in self
+ ]
+ #determine column widths the way Lisp programmers do it
+ leftColWidth, rightColWidth = map(max, map(
+ map,
+ repeat(len),
+ zip(*symbolStrings)
+ ))
+ colwidth = leftColWidth+rightColWidth
+ columns = 81//(colwidth+2)
+ rows = -(-len(symbolStrings)//columns)
+ def justify(bs):
+ b,s = bs
+ return b.rjust(leftColWidth)+':'+s.ljust(rightColWidth)
+ for i in range(rows):
+ print(' '.join(map(justify, symbolStrings[i::rows])).rstrip())
+
+ def readTuple(self, stream):
+ """Read symbol from stream. Returns symbol, length.
+ """
+ length, symbol = self.decodePeek(stream.peek(self.maxLength))
+ stream.pos += length
+ return length, symbol
+
+ def readTupleAndExtra(self, stream):
+ return self.readTuple(stream)+(0, None)
+
+class WithExtra(Code):
+ """Extension for Code so that symbol may have extra bits associated.
+ If you supply an extraTable, you can use extraBits
+ You can define an extraTable,
+ which allows to call extraBits to get the number of extraBits.
+ Otherwise, you can supply extraBits yourself.
+ Routine readTupleAndExtra now reads the extra bits too.
+ Value probably needs to be overridden; see Enumerator.
+ Note: this does not give you an decodeTable.
+ """
+ #redefine these if you don't want to use an extraTable
+ def extraBits(self, index):
+ """Get the number of extra bits for this symbol.
+ """
+ return self.extraTable[index]
+
+ def mnemonic(self, index):
+ """This value must be independent of extra.
+ """
+ return str(index)
+
+ def readTupleAndExtra(self, stream):
+ """Read symbol and extrabits from stream.
+ Returns symbol length, symbol, extraBits, extra
+ >>> olleke.pos = 6
+ >>> MetablockLengthAlphabet().readTupleAndExtra(olleke)
+ (2, Symbol(MLEN, 4), 16, 46)
+ """
+ length, symbol = self.decodePeek(stream.peek(self.maxLength))
+ stream.pos += length
+ extraBits = self.extraBits(symbol.index)
+ return length, symbol, extraBits, stream.read(extraBits)
+
+ def explanation(self, index, extra=None):
+ """Expanded version of Code.explanation supporting extra bits.
+ If you don't supply extra, it is not mentioned.
+ """
+ extraBits = 0 if extra is None else self.extraBits(index)
+ if not hasattr(self, 'extraTable'):
+ formatString = '{0}{3}'
+ lo = hi = value = self.value(index, extra)
+ elif extraBits==0:
+ formatString = '{0}{2}: {3}'
+ lo, hi = self.span(index)
+ value = lo
+ else:
+ formatString = '{0}{1} {2}: {3}-{4}; {3}+{5}={6}'
+ lo, hi = self.span(index)
+ value = lo+extra
+ return formatString.format(
+ self.description and self.description+': ',
+ 'x'*extraBits,
+ self.bitPattern(index),
+ lo, hi,
+ extra,
+ value,
+ )
+
+ def callback(self, symbol, extra):
+ return self.explanation(symbol.index, extra)
+
+class BoolCode(Code):
+ """Same as Code(bitLength=1), but shows a boolean.
+ """
+ def __init__(self, name=None, **args):
+ super().__init__(name, bitLength=1, **args)
+
+ def value(self, index, extra=None):
+ return bool(super().value(index, extra))
+
+class Enumerator(WithExtra):
+ """Code that is defined by the ExtraTable.
+ extraTable is a class variable that contains
+ the extraBits of the symbols from 0
+ value0 contains the value of symbol 0
+ encodings is not neccessary, but allowed.
+ Note: place for FixedCode to make sure extraBits works
+ """
+ def __init__(self, name=None, **args):
+ #if there is no decodeTable to determine length, compute it ourselves
+ if 'decodeTable' not in args:
+ args['alphabetSize'] = len(self.extraTable)
+ super().__init__(name, **args)
+
+ def __len__(self):
+ return len(self.extraTable)
+
+ def __getitem__(self, index):
+ """Faster than PrefixDecoder
+ """
+ if index>=len(self.extraTable):
+ raise ValueError("No symbol {}[{}]".format(
+ self.__class__.__name__, index))
+ return Symbol(self, index)
+
+ def value(self, index, extra):
+ """Override if you don't define value0 and extraTable
+ """
+ lower, upper = self.span(index)
+ value = lower+(extra or 0)
+ if value>upper:
+ raise ValueError('value: extra out of range')
+ return value
+
+ def span(self, index):
+ """Give the range of possible values in a tuple
+ Useful for mnemonic and explanation
+ """
+ lower = self.value0+sum(1<<x for x in self.extraTable[:index])
+ upper = lower+(1<<self.extraTable[index])
+ return lower, upper-1
+
+#======================Code subclasses======================================
+#Alphabets used in the metablock header----------------------------------
+#For prefix codes
+class PrefixCodeHeader(WithExtra):
+ """Header of prefix codes.
+ """
+ def __init__(self, codename):
+ super().__init__('PFX', bitLength=2)
+ #this is the name of the code that it describes
+ self.codename = codename
+
+ def extraBits(self, index):
+ return 2 if index==1 else 0
+
+ def value(self, index, extra):
+ """Returns ('Simple', #codewords) or ('Complex', HSKIP)
+ """
+ if index==1:
+ if extra>3:
+ raise ValueError('value: extra out of range')
+ return 'Simple', extra+1
+ if extra:
+ raise ValueError('value: extra out of range')
+ return 'Complex', index
+
+ def explanation(self, index, extra):
+ if index==1:
+ return '{} is simple with {} code word{}'.format(
+ self.codename, extra+1, 's' if extra else '')
+ lengths = [1, 2, 3, 4, 0, 5, 17, 6]
+ return '{} is complex with lengths {}...'.format(
+ self.codename,
+ ','.join(
+ map(str, lengths[index:index+5]))
+ )
+
+class TreeShapeAlhabet(BoolCode):
+ """The bit used to indicate if four word code is "deep" or "wide"
+ """
+ name = 'SHAPE'
+ def value(self, index):
+ return [(2,2,2,2), (1,2,3,3)][index]
+
+ def explanation(self, index):
+ return str(bool(index))+': lengths {},{},{},{}'.format(*self.value(index))
+
+class LengthOfLengthAlphabet(Code):
+ """For use in decoding complex code descriptors.
+ >>> lengthOfLengthAlphabet = LengthOfLengthAlphabet('')
+ >>> print(lengthOfLengthAlphabet[2])
+ coded with 2 bits
+ >>> len(lengthOfLengthAlphabet[0])
+ 2
+ >>> [len(lengthOfLengthAlphabet[x]) for x in range(6)]
+ [2, 4, 3, 2, 2, 4]
+ >>> lengthOfLengthAlphabet.showCode()
+ 00:skipped 01:coded with 4 bits 0111:coded with 1 bits
+ 10:coded with 3 bits 011:coded with 2 bits 1111:coded with 5 bits
+ """
+ decodeTable = {
+ 0b00:0, 0b10:3,
+ 0b0111:1, 0b01:4,
+ 0b011:2, 0b1111:5,
+ }
+
+ def __init__(self, name=None, **args):
+ super().__init__(name, decodeTable=self.decodeTable, **args)
+
+ def mnemonic(self, index):
+ if index==0: return 'skipped'
+ return 'coded with {} bits'.format(index)
+
+ def explanation(self, index, extra=None):
+ return self.description+': '+self.mnemonic(index)
+
+class LengthAlphabet(WithExtra):
+ """Length of symbols
+ Used during construction of a code.
+ """
+ def __init__(self, name):
+ super().__init__(name, alphabetSize=18)
+
+ def extraBits(self, index):
+ return {16:2, 17:3}.get(index, 0)
+
+ def mnemonic(self, index):
+ if index==0: return 'unused'
+ elif index==16: return 'rep xx'
+ elif index==17: return 'zero xxx'
+ else: return 'len {}'.format(index)
+
+ def explanation(self, index, extra):
+ return self.description.format(self[index], extra)
+
+ def value(self, index, extra):
+ #the caller got the length already, so extra is enough
+ return extra
+
+#Stream header
+class WindowSizeAlphabet(Code):
+ """The alphabet used for window size in the stream header.
+ >>> WindowSizeAlphabet()[10].explanation()
+ 'windowsize=(1<<10)-16=1008'
+ """
+ decodeTable = {
+ 0b0100001: 10, 0b1100001: 14, 0b0011: 18, 0b1011: 22,
+ 0b0110001: 11, 0b1110001: 15, 0b0101: 19, 0b1101: 23,
+ 0b1000001: 12, 0b0: 16, 0b0111: 20, 0b1111: 24,
+ 0b1010001: 13, 0b0000001: 17, 0b1001: 21,
+ 0b0010001: None,
+ }
+
+ name = 'WSIZE'
+
+ def __init__(self, name=None):
+ super().__init__(name, decodeTable=self.decodeTable)
+
+ def value(self, index):
+ #missing value gives index None
+ if index is None: return None
+ return (1<<index)-16
+
+ def explanation(self, index):
+ return 'windowsize=(1<<{})-16={}'.format(
+ index, (1<<index)-16)
+
+#Metablock
+class MetablockLengthAlphabet(WithExtra):
+ """Used for the meta block length;
+ also indicates a block with no data
+ >>> metablockLengthAlphabet = MetablockLengthAlphabet()
+ >>> metablockLengthAlphabet[0]; str(metablockLengthAlphabet[0])
+ Symbol(MLEN, 0)
+ 'empty'
+ >>> metablockLengthAlphabet[3]
+ Traceback (most recent call last):
+ ...
+ ValueError: No symbol MetablockLengthAlphabet[3]
+ >>> print(metablockLengthAlphabet[4])
+ hhhh00
+ >>> metablockLengthAlphabet[4].value(0x1000)
+ 4097
+ >>> metablockLengthAlphabet[5].value(0x1000)
+ Traceback (most recent call last):
+ ...
+ InvalidStream: Zeros in high nibble of MLEN
+ >>> metablockLengthAlphabet[5].explanation(0x12345)
+ 'data length: 12345h+1=74566'
+ >>> metablockLengthAlphabet.showCode()
+ 00:hhhh00 10:hhhhhh10 01:hhhhh01 11:empty
+ """
+ decodeTable = {0b11:0, 0b00:4, 0b01:5, 0b10:6}
+
+ name = 'MLEN'
+ def __init__(self, name=None):
+ super().__init__(name, decodeTable=self.decodeTable)
+
+ def extraBits(self, index):
+ return index*4
+
+ def mnemonic(self, index):
+ if index==0: return 'empty'
+ return 'h'*(self.extraBits(index)//4)+self.bitPattern(index)
+
+ def value(self, index, extra):
+ extraBits = self.extraBits(index)
+ if not 0<=extra<1<<extraBits:
+ raise ValueError('value: extra out of range')
+ if index==0: return 0
+ if index>4 and extra>>extraBits-4==0: raise InvalidStream(
+ 'Zeros in high nibble of MLEN')
+ return extra+1
+
+ def explanation(self, index, extra):
+ if index==0: return '11: empty block'
+ extraBits = self.extraBits(index)
+ return 'data length: {:0{}x}h+1={}'.format(extra, extraBits//4, extra+1)
+
+
+class ReservedAlphabet(BoolCode):
+ """The reserved bit that must be zero.
+ """
+ name = 'RSVD'
+ def value(self, index):
+ if index: raise ValueError('Reserved bit is not zero')
+
+ def explanation(self, index):
+ return 'Reserved (must be zero)'
+
+class FillerAlphabet(Code):
+ def __init__(self, *, streamPos):
+ super().__init__('SKIP', bitLength=(-streamPos)&7)
+
+ def explanation(self, index):
+ return '{} bit{} ignored'.format(
+ self.length(index),
+ '' if self.length(index)==1 else 's',
+ )
+
+class SkipLengthAlphabet(WithExtra):
+ """Used for the skip length in an empty metablock
+ >>> skipLengthAlphabet = SkipLengthAlphabet()
+ >>> skipLengthAlphabet[0]; str(skipLengthAlphabet[0])
+ Symbol(SKIP, 0)
+ 'empty'
+ >>> skipLengthAlphabet[4]
+ Traceback (most recent call last):
+ ...
+ ValueError: index out of range
+ >>> print(skipLengthAlphabet[3])
+ hhhhhh11
+ >>> skipLengthAlphabet[2].value(0x1000)
+ 4097
+ >>> skipLengthAlphabet[3].value(0x1000)
+ Traceback (most recent call last):
+ ...
+ InvalidStream: Zeros in high byte of SKIPBYTES
+ >>> skipLengthAlphabet[3].explanation(0x12345)
+ 'skip length: 12345h+1=74566'
+ >>> skipLengthAlphabet.showCode()
+ 00:empty 01:hh01 10:hhhh10 11:hhhhhh11
+ """
+ def __init__(self):
+ super().__init__('SKIP', bitLength=2)
+
+ def extraBits(self, index):
+ return index*8
+
+ def mnemonic(self, index):
+ if index==0: return 'empty'
+ return 'h'*(self.extraBits(index)//4)+self.bitPattern(index)
+
+ def value(self, index, extra):
+ extraBits = self.extraBits(index)
+ if not 0<=extra<1<<extraBits:
+ raise ValueError('value: extra out of range')
+ if index==0: return 0
+ if index>1 and extra>>extraBits-8==0:
+ raise InvalidStream('Zeros in high byte of SKIPBYTES')
+ return extra+1
+
+ def explanation(self, index, extra):
+ if index==0: return '00: no skip'
+ extraBits = self.extraBits(index)
+ return 'skip length: {:{}x}h+1={}'.format(extra, extraBits//8, extra+1)
+
+
+class TypeCountAlphabet(Enumerator):
+ """Used for giving block type counts and tree counts.
+ >>> TypeCountAlphabet(description='').showCode()
+ 0:0 0101:xx,0101 1011:xxxxx,1011
+ 0001:0001 1101:xxxxxx,1101 0111:xxx,0111
+ 1001:xxxx,1001 0011:x,0011 1111:xxxxxxx,1111
+ """
+ decodeTable = {
+ 0b0: 0, 0b1001: 5,
+ 0b0001: 1, 0b1011: 6,
+ 0b0011: 2, 0b1101: 7,
+ 0b0101: 3, 0b1111: 8,
+ 0b0111: 4,
+ }
+
+ value0 = 1
+ extraTable = [0, 0, 1, 2, 3, 4, 5, 6, 7]
+ name = 'BT#'
+
+ def __init__(self, name=None, *, description):
+ super().__init__(
+ name,
+ decodeTable=self.decodeTable,
+ description=description)
+
+ def mnemonic(self, index):
+ if index==0: return '0'
+ if index==1: return '0001'
+ return 'x'*(self.extraBits(index))+','+self.bitPattern(index)
+
+ def explanation(self, index, extra):
+ value = self.value(index, extra)
+ description = self.description
+ if value==1: description = description[:-1]
+ return '{}: {} {}'.format(
+ self.mnemonic(index),
+ value,
+ description)
+
+class BlockTypeAlphabet(Code):
+ """The block types; this code works for all three kinds.
+ >>> b = BlockTypeAlphabet('T', NBLTYPES=5)
+ >>> print(*(x for x in b))
+ prev +1 #0 #1 #2 #3 #4
+ """
+ def __init__(self, name, NBLTYPES, **args):
+ super().__init__(name, alphabetSize=NBLTYPES+2, **args)
+ self.NBLTYPES = NBLTYPES
+
+ def mnemonic(self, index):
+ if index==0: return 'prev'
+ elif index==1: return '+1'
+ else: return '#'+str(index-2)
+
+ def value(self, index):
+ return index-2
+
+ def explanation(self, index):
+ if index==0: return '0: previous'
+ elif index==1: return '1: increment'
+ else: return 'Set block type to: '+str(index-2)
+
+class BlockCountAlphabet(Enumerator):
+ """Block counts
+ >>> b = BlockCountAlphabet('L')
+ >>> print(b[25])
+ [24*x]: BC16625-16793840
+ """
+
+ value0 = 1
+ extraTable = [2,2,2,2,3, 3,3,3,4,4, 4,4,5,5,5, 5,6,6,7,8, 9,10,11,12,13, 24]
+ def __init__(self, name, **args):
+ super().__init__(name, alphabetSize=26, **args)
+
+ def mnemonic(self, index):
+ extraBits = self.extraBits(index)
+ return '{}: BC{}-{}'.format(
+ 'x'*extraBits if index<5 else '[{}*x]'.format(extraBits),
+ *self.span(index))
+
+ def explanation(self, index, extra):
+ return 'Block count: '+super().explanation(index, extra)
+
+class DistanceParamAlphabet(WithExtra):
+ """The distance parameters NPOSTFIX and NDIRECT.
+ Although these are treated as two in the description, this is easier.
+ """
+ def __init__(self):
+ super().__init__('DIST', bitLength=2)
+
+ def extraBits(self, index):
+ return 4
+
+ def value(self, index, extra):
+ """Returns NPOSTFIX and NDIRECT<<NPOSTFIX
+ """
+ if extra>15:
+ raise ValueError('value: extra out of range')
+ return index, extra<<index
+
+ def explanation(self, index, extra):
+ return '{} postfix bits and {:04b}<<{}={} direct codes'.format(
+ index, extra, index, extra<<index)
+
+ def mnemonic(self, index):
+ return 'PF'+str(index)
+
+class LiteralContextMode(Code):
+ """For the literal context modes.
+ >>> LiteralContextMode().showCode()
+ 00:LSB6 01:MSB6 10:UTF8 11:Signed
+ >>> LiteralContextMode().explanation(2)
+ 'Context mode for type 9: 2(UTF8)'
+ """
+
+ def __init__(self, *, number=9):
+ super().__init__('LC'+str(number), bitLength=2)
+ self.number = number
+
+ def mnemonic(self, index):
+ return ['LSB6', 'MSB6', 'UTF8', 'Signed'][index]
+
+ def explanation(self, index):
+ return 'Context mode for type {}: {}({})'.format(
+ self.number,
+ index,
+ self.mnemonic(index))
+
+class RLEmaxAlphabet(Enumerator):
+ """Used for describing the run length encoding used for describing context maps.
+ >>> RLEmaxAlphabet().showCode()
+ 0:1 1:more
+ """
+ value0 = 0
+ extraTable = [0, 4]
+ name = 'RLE#'
+
+ def mnemonic(self, index):
+ return ['1', 'more'][index]
+
+ def explanation(self, index, extra):
+ description = self.description and self.description+': '
+ if index==0: return description+'No RLE coding'
+ return '{}xxxx 1: RLEMAX={}'.format(description, extra+1)
+
+class TreeAlphabet(WithExtra):
+ """The alphabet to enumerate entries (called trees) in the context map.
+ parameters are RLEMAX and NTREES
+ >>> t = TreeAlphabet('', RLEMAX=3, NTREES=5)
+ >>> len(t)
+ 8
+ >>> print(t[2])
+ xx+4 zeroes
+ >>> t[3].explanation(2)
+ '8+010=10 zeroes'
+ >>> t[0].value(0)
+ (1, 0)
+ """
+ name = 'CMI'
+ def __init__(self, name=None, *, RLEMAX, NTREES, **args):
+ super().__init__(name, alphabetSize=RLEMAX+NTREES, **args)
+ self.RLEMAX = RLEMAX
+ self.NTREES = NTREES
+
+ def extraBits(self, index):
+ if 0<index<=self.RLEMAX: return index
+ return 0
+
+ def mnemonic(self, index):
+ if index==0: return 'map #0'
+ if index<=self.RLEMAX:
+ return '{}+{} zeroes'.format('x'*index, 1<<index)
+ return 'map #{}'.format(index-self.RLEMAX)
+
+ def value(self, index, extra):
+ """Give count and value."""
+ index = index
+ if index==0: return 1, 0
+ if index<=self.RLEMAX: return (1<<index)+extra, 0
+ return 1, index-self.RLEMAX
+
+ def explanation(self, index, extra):
+ description = self.description and self.description+': '
+ if index==0: return description+'map #0'
+ if index<=self.RLEMAX:
+ return '{}+{:0{}b}={} zeroes'.format(
+ (1<<index),
+ extra, self.extraBits(index),
+ (1<<index)+extra)
+ return '{}map #{}-{}={}'.format(
+ description,
+ index, self.RLEMAX, index-self.RLEMAX)
+
+#Prefix alphabets for the data stream----------------------------------
+class LiteralAlphabet(Code):
+ """Alphabet of symbols.
+ """
+ minLength = maxLength = 8
+ def __init__(self, number):
+ super().__init__('L'+str(number), alphabetSize=1<<8)
+
+ def mnemonic(self, index):
+ return outputCharFormatter(index)
+
+ def value(self, index, extra=None):
+ return index
+
+ def explanation(self, index, extra=None):
+ return self.mnemonic(index)
+
+class InsertLengthAlphabet(Enumerator):
+ """Intern code for insert counts
+ """
+ value0 = 0
+ extraTable = [0,0,0,0,0, 0,1,1,2,2, 3,3,4,4,5, 5,6,7,8,9, 10,12,14,24]
+
+class CopyLengthAlphabet(Enumerator):
+ value0 = 2
+ extraTable = [0,0,0,0,0, 0,0,0,1,1, 2,2,3,3,4, 4,5,5,6,7, 8,9,10,24]
+
+class InsertAndCopyAlphabet(WithExtra):
+ """The insert and copy code
+ >>> for x in range(0,704,704//13):
+ ... print('{:10b}'.format(x), InsertAndCopyAlphabet()[x])
+ 0 I0C2&D=0
+ 110110 I6+xC8&D=0
+ 1101100 I5C22+xxx&D=0
+ 10100010 I4C4
+ 11011000 I3C10+x
+ 100001110 I14+xxC8
+ 101000100 I10+xxC22+xxx
+ 101111010 I98+xxxxxC14+xx
+ 110110000 I6+xC70+xxxxx
+ 111100110 I1090+[10*x]C8
+ 1000011100 I26+xxxC326+[8*x]
+ 1001010010 I322+[8*x]C14+xx
+ 1010001000 I194+[7*x]C70+xxxxx
+ 1010111110 I22594+[24*x]C1094+[10*x]
+ """
+ insertLengthAlphabet = InsertLengthAlphabet(None)
+ copyLengthAlphabet = CopyLengthAlphabet(None)
+
+ def __init__(self, number=''):
+ super().__init__('IC'+str(number), bitLength=10)
+
+ def __len__(self):
+ return 704
+
+ def extraBits(self, index):
+ insertSymbol, copySymbol, dist0 = self.splitSymbol(index)
+ return InsertLengthAlphabet.extraTable[insertSymbol.index] + \
+ CopyLengthAlphabet.extraTable[copySymbol.index]
+
+ def splitSymbol(self, index):
+ """Give relevant values for computations:
+ (insertSymbol, copySymbol, dist0flag)
+ """
+ #determine insert and copy upper bits from table
+ row = [0,0,1,1,2,2,1,3,2,3,3][index>>6]
+ col = [0,1,0,1,0,1,2,0,2,1,2][index>>6]
+ #determine inserts and copy sub codes
+ insertLengthCode = row<<3 | index>>3&7
+ if row: insertLengthCode -= 8
+ copyLengthCode = col<<3 | index&7
+ return (
+ Symbol(self.insertLengthAlphabet, insertLengthCode),
+ Symbol(self.copyLengthAlphabet, copyLengthCode),
+ row==0
+ )
+
+ def mnemonic(self, index):
+ """Make a nice mnemonic
+ """
+ i,c,d0 = self.splitSymbol(index)
+ iLower, _ = i.code.span(i.index)
+ iExtra = i.extraBits()
+ cLower, _ = c.code.span(c.index)
+ cExtra = c.extraBits()
+ return 'I{}{}{}C{}{}{}{}'.format(
+ iLower,
+ '+' if iExtra else '',
+ 'x'*iExtra if iExtra<6 else '[{}*x]'.format(iExtra),
+ cLower,
+ '+' if cExtra else '',
+ 'x'*cExtra if cExtra<6 else '[{}*x]'.format(cExtra),
+ '&D=0' if d0 else '')
+
+ def value(self, index, extra):
+ i,c,d0 = self.splitSymbol(index)
+ iExtra = i.extraBits()
+ ce, ie = extra>>iExtra, extra&(1<<iExtra)-1
+ insert = i.value(ie)
+ copy = c.value(ce)
+ return insert, copy, d0
+
+ def explanation(self, index, extra):
+ insert, copy, d0 = self.value(index, extra)
+ if d0: return 'Literal: {}, copy: {}, same distance'.format(insert, copy)
+ else: return 'Literal: {}, copy: {}'.format(insert, copy)
+
+class DistanceAlphabet(WithExtra):
+ """Represent the distance encoding.
+ Dynamically generated alphabet.
+ This is what the documentation should have said:
+ Ignoring offsets for the moment, the "long" encoding works as follows:
+ Write the distance in binary as follows:
+ 1xy..yz..z, then the distance symbol consists of n..nxz..z
+ Where:
+ n is one less than number of bits in y
+ x is a single bit
+ y..y are n+1 extra bits (encoded in the bit stream)
+ z..z is NPOSTFIX bits that are part of the symbol
+ The offsets are so as to start at the lowest useable value:
+ if 1xyyyyz = distance +(4<<POSTFIX)-NDIRECT-1
+ then n..nxz..z is symbol -NDIRECT-16
+ >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10)
+ >>> print(d[4], d[17], d[34])
+ last-1 1 10xx00-5
+ >>> [str(d[x]) for x in range(26, 32)]
+ ['10x00-5', '10x01-5', '10x10-5', '10x11-5', '11x00-5', '11x01-5']
+ """
+ def __init__(self, number, *, NPOSTFIX, NDIRECT):
+ self.NPOSTFIX = NPOSTFIX
+ self.NDIRECT = NDIRECT
+ #set length
+ #Actually, not all symbols are used,
+ #only NDIRECT+16+(44-2*POSTFIX<<NPOSTFIX)
+ super().__init__('D'+str(number),
+ alphabetSize=self.NDIRECT+16+(48<<self.NPOSTFIX))
+
+ def extraBits(self, index):
+ """Indicate how many extra bits are needed to interpret symbol
+ >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10)
+ >>> [d[i].extraBits() for i in range(26)]
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+ >>> [d[i].extraBits() for i in range(26,36)]
+ [1, 1, 1, 1, 1, 1, 1, 1, 2, 2]
+ """
+ if index<16+self.NDIRECT: return 0
+ return 1 + ((index - self.NDIRECT - 16) >> (self.NPOSTFIX + 1))
+
+ def value(self, dcode, dextra):
+ """Decode value of symbol together with the extra bits.
+ >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10)
+ >>> d[34].value(2)
+ (0, 35)
+ """
+ if dcode<16:
+ return [(1,0),(2,0),(3,0),(4,0),
+ (1,-1),(1,+1),(1,-2),(1,+2),(1,-3),(1,+3),
+ (2,-1),(2,+1),(2,-2),(2,+2),(2,-3),(2,+3)
+ ][dcode]
+ if dcode<16+self.NDIRECT:
+ return (0,dcode-16)
+ #we use the original formulas, instead of my clear explanation
+ POSTFIX_MASK = (1 << self.NPOSTFIX) - 1
+ ndistbits = 1 + ((dcode - self.NDIRECT - 16) >> (self.NPOSTFIX + 1))
+ hcode = (dcode - self.NDIRECT - 16) >> self.NPOSTFIX
+ lcode = (dcode - self.NDIRECT - 16) & POSTFIX_MASK
+ offset = ((2 + (hcode & 1)) << ndistbits) - 4
+ distance = ((offset + dextra) << self.NPOSTFIX) + lcode + self.NDIRECT + 1
+ return (0,distance)
+
+ def mnemonic(self, index, verbose=False):
+ """Give mnemonic representation of meaning.
+ verbose compresses strings of x's
+ """
+ if index<16:
+ return ['last', '2last', '3last', '4last',
+ 'last-1', 'last+1', 'last-2', 'last+2', 'last-3', 'last+3',
+ '2last-1', '2last+1', '2last-2', '2last+2', '2last-3', '2last+3'
+ ][index]
+ if index<16+self.NDIRECT:
+ return str(index-16)
+ #construct strings like "1xx01-15"
+ index -= self.NDIRECT+16
+ hcode = index >> self.NPOSTFIX
+ lcode = index & (1<<self.NPOSTFIX)-1
+ if self.NPOSTFIX: formatString = '1{0}{1}{2:0{3}b}{4:+d}'
+ else: formatString = '1{0}{1}{4:+d}'
+ return formatString.format(
+ hcode&1,
+ 'x'*(2+hcode>>1) if hcode<13 or verbose else '[{}*x]'.format(2+hcode>>1),
+ lcode, self.NPOSTFIX,
+ self.NDIRECT+1-(4<<self.NPOSTFIX))
+
+ def explanation(self, index, extra):
+ """
+ >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10)
+ >>> d[55].explanation(13)
+ '11[1101]01-5: [0]+240'
+ """
+ extraBits = self.extraBits(index)
+ extraString = '[{:0{}b}]'.format(extra, extraBits)
+ return '{0}: [{1[0]}]{1[1]:+d}'.format(
+ self.mnemonic(index, True).replace('x'*(extraBits or 1), extraString),
+ self.value(index, extra))
+
+#Classes for doing actual work------------------------------------------
+class ContextModeKeeper:
+ """For computing the literal context mode.
+ You feed it characters, and it computes indices in the context map.
+ """
+ def __init__(self, mode):
+ self.chars = deque([0,0], maxlen=2)
+ self.mode = mode
+
+ def setContextMode(self, mode):
+ """Switch to given context mode (0..3)"""
+ self.mode = mode
+ def getIndex(self):
+ if self.mode==0: #LSB6
+ return self.chars[1]&0x3f
+ elif self.mode==1: #MSB6
+ return self.chars[1]>>2
+ elif self.mode==2: #UTF8: character class of previous and a bit of the second
+ p2,p1 = self.chars
+ return self.lut0[p1]|self.lut1[p2]
+ elif self.mode==3: #Signed: initial bits of last two bytes
+ p2,p1 = self.chars
+ return self.lut2[p1]<<3|self.lut2[p2]
+
+ def add(self, index):
+ """Adjust the context for output char (as int)."""
+ self.chars.append(index)
+
+ #0: control #16: quote #32: ,:; #48: AEIOU
+ #4: tab/lf/cr #20: % #36: . #52: BC..Z
+ #8: space #24: (<[{ #40: = #56: aeiou
+ #12:!#$&*+-/?@| #28: )>]} #44: 0-9 #60: bc..z
+ lut0 = [0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 0, 0, 4, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 8, 12, 16, 12, 12, 20, 12, 16, 24, 28, 12, 12, 32, 12, 36, 12,
+ 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 32, 32, 24, 40, 28, 12,
+ 12, 48, 52, 52, 52, 48, 52, 52, 52, 48, 52, 52, 52, 52, 52, 48,
+ 52, 52, 52, 52, 52, 48, 52, 52, 52, 52, 52, 24, 12, 28, 12, 12,
+ 12, 56, 60, 60, 60, 56, 60, 60, 60, 56, 60, 60, 60, 60, 60, 56,
+ 60, 60, 60, 60, 60, 56, 60, 60, 60, 60, 60, 24, 12, 28, 12, 0
+ ]+[0,1]*32+[2,3]*32
+ #0: space 1:punctuation 2:digit/upper 3:lower
+ lut1 = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
+ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1,
+ 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0
+ ]+[0]*96+[2]*32
+ #initial bits: 8*0, 4*0, 2*0, 1*0, 1*1, 2*1, 4*1, 8*1
+ lut2 = [0]+[1]*15+[2]*48+[3]*64+[4]*64+[5]*48+[6]*15+[7]
+ assert len(lut0)==len(lut1)==len(lut2)==256
+
+class WordList:
+ """Word list.
+ >>> WordList().word(7, 35555)
+ b'Program to '
+ """
+ NDBITS = [0, 0, 0, 0, 10, 10, 11, 11, 10, 10,
+ 10, 10, 10, 9, 9, 8, 7, 7, 8, 7,
+ 7, 6, 6, 5, 5]
+ def __init__(self):
+ self.file = open('dict', 'rb')
+ self.compileActions()
+
+ def word(self, size, dist):
+ """Get word
+ """
+ #split dist in index and action
+ ndbits = self.NDBITS[size]
+ index = dist&(1<<ndbits)-1
+ action = dist>>ndbits
+ #compute position in file
+ position = sum(n<<self.NDBITS[n] for n in range(4,size))+size*index
+ self.file.seek(position)
+ return self.doAction(self.file.read(size), action)
+
+ def upperCase1(self, word):
+ word = word.decode('utf8')
+ word = word[0].upper()+word[1:]
+ return word.encode('utf8')
+
+
+ #Super compact form of action table.
+ #_ means space, .U means UpperCaseAll, U(w) means UpperCaseFirst
+ actionTable = r"""
+ 0:w 25:w+_for_ 50:w+\n\t 75:w+. This_100:w+ize_
+ 1:w+_ 26:w[3:] 51:w+: 76:w+, 101:w.U+.
+ 2:_+w+_ 27:w[:-2] 52:_+w+._ 77:.+w+_ 102:\xc2\xa0+w
+ 3:w[1:] 28:w+_a_ 53:w+ed_ 78:U(w)+( 103:_+w+,
+ 4:U(w)+_ 29:w+_that_ 54:w[9:] 79:U(w)+. 104:U(w)+="
+ 5:w+_the_ 30:_+U(w) 55:w[7:] 80:w+_not_ 105:w.U+="
+ 6:_+w 31:w+._ 56:w[:-6] 81:_+w+=" 106:w+ous_
+ 7:s_+w+_ 32:.+w 57:w+( 82:w+er_ 107:w.U+,_
+ 8:w+_of_ 33:_+w+,_ 58:U(w)+,_ 83:_+w.U+_ 108:U(w)+=\'
+ 9:U(w) 34:w[4:] 59:w[:-8] 84:w+al_ 109:_+U(w)+,
+ 10:w+_and_ 35:w+_with_ 60:w+_at_ 85:_+w.U 110:_+w.U+="
+ 11:w[2:] 36:w+\' 61:w+ly_ 86:w+=\' 111:_+w.U+,_
+ 12:w[:-1] 37:w+_from_ 62:_the_+w+_of_ 87:w.U+" 112:_+w.U+,
+ 13:,_+w+_ 38:w+_by_ 63:w[:-5] 88:U(w)+._ 113:w.U+(
+ 14:w+,_ 39:w[5:] 64:w[:-9] 89:_+w+( 114:w.U+._
+ 15:_+U(w)+_ 40:w[6:] 65:_+U(w)+,_ 90:w+ful_ 115:_+w.U+.
+ 16:w+_in_ 41:_the_+w 66:U(w)+" 91:_+U(w)+._116:w.U+=\'
+ 17:w+_to_ 42:w[:-4] 67:.+w+( 92:w+ive_ 117:_+w.U+._
+ 18:e_+w+_ 43:w+. The_ 68:w.U+_ 93:w+less_ 118:_+U(w)+="
+ 19:w+" 44:w.U 69:U(w)+"> 94:w.U+\' 119:_+w.U+=\'
+ 20:w+. 45:w+_on_ 70:w+=" 95:w+est_ 120:_+U(w)+=\'
+ 21:w+"> 46:w+_as_ 71:_+w+. 96:_+U(w)+.
+ 22:w+\n 47:w+_is_ 72:.com/+w 97:w.U+">
+ 23:w[:-3] 48:w[:-7] 98:_+w+=\'
+ 24:w+] 49:w[:-1]+ing_ 74:U(w)+\' 99:U(w)+,
+ """
+
+ def compileActions(self):
+ """Build the action table from the text above
+ """
+ import re
+ self.actionList = actions = [None]*121
+ #Action 73, which is too long, looks like this when expanded:
+ actions[73] = "b' the '+w+b' of the '"
+ #find out what the columns are
+ actionLines = self.actionTable.splitlines()
+ colonPositions = [m.start()
+ for m in re.finditer(':',actionLines[1])
+ ]+[100]
+ columns = [(colonPositions[i]-3,colonPositions[i+1]-3)
+ for i in range(len(colonPositions)-1)]
+ for line in self.actionTable.splitlines(keepends=False):
+ for start,end in columns:
+ action = line[start:end]
+ #skip empty actions
+ if not action or action.isspace(): continue
+ #chop it up, and check if the colon is properly placed
+ index, colon, action = action[:3], action[3], action[4:]
+ assert colon==':'
+ #remove filler spaces at right
+ action = action.rstrip()
+ #replace space symbols
+ action = action.replace('_', ' ')
+ wPos = action.index('w')
+ #add quotes around left string when present
+ #translation: any pattern from beginning, up to
+ #(but not including) a + following by a w later on
+ action = re.sub(r"^(.*)(?=\+[U(]*w)", r"b'\1'", action)
+ #add quotes around right string when present
+ #translation: anything with a w in it, followed by a +
+ #and a pattern up to the end
+ #(there is no variable lookbehind assertion,
+ #so we have to copy the pattern)
+ action = re.sub(r"(w[[:\-1\]).U]*)\+(.*)$", r"\1+b'\2'", action)
+ #expand shortcut for uppercaseAll
+ action = action.replace(".U", ".upper()")
+ #store action
+ actions[int(index)] = action
+
+ def doAction(self, w, action):
+ """Perform the proper action
+ """
+ #set environment for the UpperCaseFirst
+ U = self.upperCase1
+ return eval(self.actionList[action], locals())
+
+class Layout:
+ """Class to layout the output.
+ """
+ #display width of hexdata+bitdata
+ width = 25
+ #general
+ def __init__(self, stream):
+ self.stream = stream
+ self.bitPtr = self.width
+
+ def makeHexData(self, pos):
+ """Produce hex dump of all data containing the bits
+ from pos to stream.pos
+ """
+ firstAddress = pos+7>>3
+ lastAddress = self.stream.pos+7>>3
+ return ''.join(map('{:02x} '.format,
+ self.stream.data[firstAddress:lastAddress]))
+
+ def formatBitData(self, pos, width1, width2=0):
+ """Show formatted bit data:
+ Bytes are separated by commas
+ whole bytes are displayed in hex
+ >>> Layout(olleke).formatBitData(6, 2, 16)
+ '|00h|2Eh,|00'
+ >>> Layout(olleke).formatBitData(4, 1, 0)
+ '1'
+ """
+ result = []
+ #make empty prefix code explicit
+ if width1==0: result = ['()', ',']
+ for width in width1, width2:
+ #skip empty width2
+ if width==0: continue
+ #build result backwards in a list
+ while width>0:
+ availableBits = 8-(pos&7)
+ if width<availableBits:
+ #read partial byte, beginning nor ending at boundary
+ data = self.stream.data[pos>>3] >> (pos&7) & (1<<width)-1
+ result.append('{:0{}b}'.format(data, width))
+ elif availableBits<8:
+ #read rest of byte, ending at boundary
+ data = self.stream.data[pos>>3] >> (pos&7)
+ result.append('|{:0{}b}'.format(data, availableBits))
+ else:
+ #read whole byte (in hex), beginning and ending at boundary
+ data = self.stream.data[pos>>3]
+ result.append('|{:02X}h'.format(data))
+ width -= availableBits
+ pos += availableBits
+ #if width overshot from the availableBits subtraction, fix it
+ pos += width
+ #add comma to separate fields
+ result.append(',')
+ #concatenate pieces, reversed, skipping the last space
+ return ''.join(result[-2::-1])
+
+ def readPrefixCode(self, alphabet):
+ """give alphabet the prefix code that is read from the stream
+ Called for the following alphabets, in this order:
+ The alphabet in question must have a "logical" order,
+ otherwise the assignment of symbols doesn't work.
+ """
+ mode, numberOfSymbols = self.verboseRead(PrefixCodeHeader(alphabet.name))
+ if mode=='Complex':
+ #for a complex code, numberOfSymbols means hskip
+ self.readComplexCode(numberOfSymbols, alphabet)
+ return alphabet
+ else:
+ table = []
+ #Set table of lengths for mnemonic function
+ lengths = [[0], [1,1], [1,2,2], '????'][numberOfSymbols-1]
+ #adjust mnemonic function of alphabet class
+ def myMnemonic(index):
+ return '{} bit{}: {}'.format(
+ lengths[i],
+ '' if lengths[i]==1 else 's',
+ alphabet.__class__.mnemonic(alphabet, index)
+ )
+ alphabet.mnemonic = myMnemonic
+ for i in range(numberOfSymbols):
+ table.append(self.verboseRead(alphabet, skipExtra=True).index)
+ #restore mnemonic
+ del alphabet.mnemonic
+ if numberOfSymbols==4:
+ #read tree shape to redefine lengths
+ lengths = self.verboseRead(TreeShapeAlhabet())
+ #construct the alphabet prefix code
+ alphabet.setLength(dict(zip(table, lengths)))
+ return alphabet
+
+ def readComplexCode(self, hskip, alphabet):
+ """Read complex code"""
+ stream = self.stream
+ #read the lengths for the length code
+ lengths = [1,2,3,4,0,5,17,6,16,7,8,9,10,11,12,13,14,15][hskip:]
+ codeLengths = {}
+ total = 0
+ lol = LengthOfLengthAlphabet('##'+alphabet.name)
+ #lengthCode will be used for coding the lengths of the new code
+ #we use it for display until now; definition comes below
+ lengthCode = LengthAlphabet('#'+alphabet.name)
+ lengthIter = iter(lengths)
+ while total<32:
+ newSymbol = next(lengthIter)
+ lol.description = str(lengthCode[newSymbol])
+ length = self.verboseRead(lol)
+ if length:
+ codeLengths[newSymbol] = length
+ total += 32>>length
+ if total>=32:
+ break
+ if total>32: raise ValueError("Stream format")
+ #Now set the encoding of the lengthCode
+ lengthCode.setLength(codeLengths)
+ print("***** Lengths for {} will be coded as:".format(alphabet.name))
+ lengthCode.showCode()
+ #Now determine the symbol lengths with the lengthCode
+ symbolLengths = {}
+ total = 0
+ lastLength = 8
+ alphabetIter = iter(alphabet)
+ while total<32768:
+ #look ahead to see what is going to happen
+ length = lengthCode.decodePeek(
+ self.stream.peek(lengthCode.maxLength))[1].index
+ #in every branch, set lengthCode.description to explanatory text
+ #lengthCode calls format(symbol, extra) with this string
+ if length==0:
+ symbol = next(alphabetIter)
+ lengthCode.description = 'symbol {} unused'.format(symbol)
+ self.verboseRead(lengthCode)
+ #unused symbol
+ continue
+ if length==16:
+ lengthCode.description = \
+ '{1}+3 symbols of length '+str(lastLength)
+ extra = self.verboseRead(lengthCode)
+ #scan series of 16s (repeat counts)
+ #start with repeat count 2
+ repeat = 2
+ startSymbol = next(alphabetIter)
+ endSymbol = next(alphabetIter)
+ symbolLengths[startSymbol.index] = \
+ symbolLengths[endSymbol.index] = lastLength
+ #count the two just defined symbols
+ total += 2*32768>>lastLength
+ #note: loop may end because we're there
+ #even if a 16 _appears_ to follow
+ while True:
+ #determine last symbol
+ oldRepeat = repeat
+ repeat = (repeat-2<<2)+extra+3
+ #read as many symbols as repeat increased
+ for i in range(oldRepeat, repeat):
+ endSymbol = next(alphabetIter)
+ symbolLengths[endSymbol.index] = lastLength
+ #compute new total; it may be end of loop
+ total += (repeat-oldRepeat)*32768>>lastLength
+ if total>=32768: break
+ #see if there is more to do
+ length = lengthCode.decodePeek(
+ self.stream.peek(lengthCode.maxLength))[1].index
+ if length!=16: break
+ lengthCode.description = 'total {}+{{1}} symbols'.format(
+ (repeat-2<<2)+3)
+ extra = self.verboseRead(lengthCode)
+ elif length==17:
+ #read, and show explanation
+ lengthCode.description = '{1}+3 unused'
+ extra = self.verboseRead(lengthCode)
+ #scan series of 17s (groups of zero counts)
+ #start with repeat count 2
+ repeat = 2
+ startSymbol = next(alphabetIter)
+ endSymbol = next(alphabetIter)
+ #note: loop will not end with total==32768,
+ #since total doesn't change here
+ while True:
+ #determine last symbol
+ oldRepeat = repeat
+ repeat = (repeat-2<<3)+extra+3
+ #read as many symbols as repeat increases
+ for i in range(repeat-oldRepeat):
+ endSymbol = next(alphabetIter)
+ #see if there is more to do
+ length = lengthCode.decodePeek(
+ self.stream.peek(lengthCode.maxLength))[1].index
+ if length!=17: break
+ lengthCode.description = 'total {}+{{1}} unused'.format(
+ (repeat-2<<3)+3)
+ extra = self.verboseRead(lengthCode)
+ else:
+ symbol = next(alphabetIter)
+ #double braces for format
+ char = str(symbol)
+ if char in '{}': char *= 2
+ lengthCode.description = \
+ 'Length for {} is {{0.index}} bits'.format(char)
+ #output is not needed (will be 0)
+ self.verboseRead(lengthCode)
+ symbolLengths[symbol.index] = length
+ total += 32768>>length
+ lastLength = length
+ assert total==32768
+ alphabet.setLength(symbolLengths)
+ print('End of table. Prefix code '+alphabet.name+':')
+ alphabet.showCode()
+
+ #stream
+ def processStream(self):
+ """Process a brotli stream.
+ """
+ print('addr hex{:{}s}binary context explanation'.format(
+ '', self.width-10))
+ print('Stream header'.center(60, '-'))
+ self.windowSize = self.verboseRead(WindowSizeAlphabet())
+ print('Metablock header'.center(60, '='))
+ self.ISLAST = False
+ self.output = bytearray()
+ while not self.ISLAST:
+ self.ISLAST = self.verboseRead(
+ BoolCode('LAST', description="Last block"))
+ if self.ISLAST:
+ if self.verboseRead(
+ BoolCode('EMPTY', description="Empty block")): break
+ if self.metablockLength(): continue
+ if not self.ISLAST and self.uncompressed(): continue
+ print('Block type descriptors'.center(60, '-'))
+ self.numberOfBlockTypes = {}
+ self.currentBlockCounts = {}
+ self.blockTypeCodes = {}
+ self.blockCountCodes = {}
+ for blockType in (L,I,D): self.blockType(blockType)
+ print('Distance code parameters'.center(60, '-'))
+ self.NPOSTFIX, self.NDIRECT = self.verboseRead(DistanceParamAlphabet())
+ self.readLiteralContextModes()
+ print('Context maps'.center(60, '-'))
+ self.cmaps = {}
+ #keep the number of each kind of prefix tree for the last loop
+ numberOfTrees = {I: self.numberOfBlockTypes[I]}
+ for blockType in (L,D):
+ numberOfTrees[blockType] = self.contextMap(blockType)
+ print('Prefix code lists'.center(60, '-'))
+ self.prefixCodes = {}
+ for blockType in (L,I,D):
+ self.readPrefixArray(blockType, numberOfTrees[blockType])
+ self.metablock()
+
+ #metablock header
+ def verboseRead(self, alphabet, context='', skipExtra=False):
+ """Read symbol and extra from stream and explain what happens.
+ Returns the value of the symbol
+ >>> olleke.pos = 0
+ >>> l = Layout(olleke)
+ >>> l.verboseRead(WindowSizeAlphabet())
+ 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288
+ 4194288
+ """
+ #TODO 2: verbosity level, e.g. show only codes and maps in header
+ stream = self.stream
+ pos = stream.pos
+ if skipExtra:
+ length, symbol = alphabet.readTuple(stream)
+ extraBits, extra = 0, None
+ else:
+ length, symbol, extraBits, extra = alphabet.readTupleAndExtra(
+ stream)
+ #fields: address, hex data, binary data, name of alphabet, explanation
+ hexdata = self.makeHexData(pos)
+ addressField = '{:04x}'.format(pos+7>>3) if hexdata else ''
+ bitdata = self.formatBitData(pos, length, extraBits)
+ #bitPtr moves bitdata so that the bytes are easier to read
+ #jump back to right if a new byte starts
+ if '|' in bitdata[1:]:
+ #start over on the right side
+ self.bitPtr = self.width
+ fillWidth = self.bitPtr-(len(hexdata)+len(bitdata))
+ if fillWidth<0: fillWidth = 0
+ print('{:<5s} {:<{}s} {:7s} {}'.format(
+ addressField,
+ hexdata+' '*fillWidth+bitdata, self.width,
+ context+alphabet.name,
+ symbol if skipExtra else symbol.explanation(extra),
+ ))
+ #jump to the right if we started with a '|'
+ #because we didn't jump before printing
+ if bitdata.startswith('|'): self.bitPtr = self.width
+ else: self.bitPtr -= len(bitdata)
+ return symbol if skipExtra else symbol.value(extra)
+
+ def metablockLength(self):
+ """Read MNIBBLES and meta block length;
+ if empty block, skip block and return true.
+ """
+ self.MLEN = self.verboseRead(MetablockLengthAlphabet())
+ if self.MLEN:
+ return False
+ #empty block; skip and return False
+ self.verboseRead(ReservedAlphabet())
+ MSKIP = self.verboseRead(SkipLengthAlphabet())
+ self.verboseRead(FillerAlphabet(streamPos=self.stream.pos))
+ self.stream.pos += 8*MSKIP
+ print("Skipping to {:x}".format(self.stream.pos>>3))
+ return True
+
+ def uncompressed(self):
+ """If true, handle uncompressed data
+ """
+ ISUNCOMPRESSED = self.verboseRead(
+ BoolCode('UNCMPR', description='Is uncompressed?'))
+ if ISUNCOMPRESSED:
+ self.verboseRead(FillerAlphabet(streamPos=self.stream.pos))
+ print('Uncompressed data:')
+ self.output += self.stream.readBytes(self.MLEN)
+ print(outputFormatter(self.output[-self.MLEN:]))
+ return ISUNCOMPRESSED
+
+ def blockType(self, kind):
+ """Read block type switch descriptor for given kind of blockType."""
+ NBLTYPES = self.verboseRead(TypeCountAlphabet(
+ 'BT#'+kind[0].upper(),
+ description='{} block types'.format(kind),
+ ))
+ self.numberOfBlockTypes[kind] = NBLTYPES
+ if NBLTYPES>=2:
+ self.blockTypeCodes[kind] = self.readPrefixCode(
+ BlockTypeAlphabet('BT'+kind[0].upper(), NBLTYPES))
+ self.blockCountCodes[kind] = self.readPrefixCode(
+ BlockCountAlphabet('BC'+kind[0].upper()))
+ blockCount = self.verboseRead(self.blockCountCodes[kind])
+ else:
+ blockCount = 1<<24
+ self.currentBlockCounts[kind] = blockCount
+
+ def readLiteralContextModes(self):
+ """Read literal context modes.
+ LSB6: lower 6 bits of last char
+ MSB6: upper 6 bits of last char
+ UTF8: rougly dependent on categories:
+ upper 4 bits depend on category of last char:
+ control/whitespace/space/ punctuation/quote/%/open/close/
+ comma/period/=/digits/ VOWEL/CONSONANT/vowel/consonant
+ lower 2 bits depend on category of 2nd last char:
+ space/punctuation/digit or upper/lowercase
+ signed: hamming weight of last 2 chars
+ """
+ print('Context modes'.center(60, '-'))
+ self.literalContextModes = []
+ for i in range(self.numberOfBlockTypes[L]):
+ self.literalContextModes.append(
+ self.verboseRead(LiteralContextMode(number=i)))
+
+ def contextMap(self, kind):
+ """Read context maps
+ Returns the number of differnt values on the context map
+ (In other words, the number of prefix trees)
+ """
+ NTREES = self.verboseRead(TypeCountAlphabet(
+ kind[0].upper()+'T#',
+ description='{} prefix trees'.format(kind)))
+ mapSize = {L:64, D:4}[kind]
+ if NTREES<2:
+ self.cmaps[kind] = [0]*mapSize
+ else:
+ #read CMAPkind
+ RLEMAX = self.verboseRead(RLEmaxAlphabet(
+ 'RLE#'+kind[0].upper(),
+ description=kind+' context map'))
+ alphabet = TreeAlphabet('CM'+kind[0].upper(), NTREES=NTREES, RLEMAX=RLEMAX)
+ cmapCode = self.readPrefixCode(alphabet)
+ tableSize = mapSize*self.numberOfBlockTypes[kind]
+ cmap = []
+ while len(cmap)<tableSize:
+ cmapCode.description = 'map {}, entry {}'.format(
+ *divmod(len(cmap), mapSize))
+ count, value = self.verboseRead(cmapCode)
+ cmap.extend([value]*count)
+ assert len(cmap)==tableSize
+ IMTF = self.verboseRead(BoolCode('IMTF', description='Apply inverse MTF'))
+ if IMTF:
+ self.IMTF(cmap)
+ if kind==L:
+ print('Context maps for literal data:')
+ for i in range(0, len(cmap), 64):
+ print(*(
+ ''.join(map(str, cmap[j:j+8]))
+ for j in range(i, i+64, 8)
+ ))
+ else:
+ print('Context map for distances:')
+ print(*(
+ ''.join(map('{:x}'.format, cmap[i:i+4]))
+ for i in range(0, len(cmap), 4)
+ ))
+ self.cmaps[kind] = cmap
+ return NTREES
+
+ @staticmethod
+ def IMTF(v):
+ """In place inverse move to front transform.
+ """
+ #mtf is initialized virtually with range(infinity)
+ mtf = []
+ for i, vi in enumerate(v):
+ #get old value from mtf. If never seen, take virtual value
+ try: value = mtf.pop(vi)
+ except IndexError: value = vi
+ #put value at front
+ mtf.insert(0, value)
+ #replace transformed value
+ v[i] = value
+
+ def readPrefixArray(self, kind, numberOfTrees):
+ """Read prefix code array"""
+ prefixes = []
+ for i in range(numberOfTrees):
+ if kind==L: alphabet = LiteralAlphabet(i)
+ elif kind==I: alphabet = InsertAndCopyAlphabet(i)
+ elif kind==D: alphabet = DistanceAlphabet(
+ i, NPOSTFIX=self.NPOSTFIX, NDIRECT=self.NDIRECT)
+ self.readPrefixCode(alphabet)
+ prefixes.append(alphabet)
+ self.prefixCodes[kind] = prefixes
+
+ #metablock data
+ def metablock(self):
+ """Process the data.
+ Relevant variables of self:
+ numberOfBlockTypes[kind]: number of block types
+ currentBlockTypes[kind]: current block types (=0)
+ literalContextModes: the context modes for the literal block types
+ currentBlockCounts[kind]: counters for block types
+ blockTypeCodes[kind]: code for block type
+ blockCountCodes[kind]: code for block count
+ cmaps[kind]: the context maps (not for I)
+ prefixCodes[kind][#]: the prefix codes
+ lastDistances: the last four distances
+ lastChars: the last two chars
+ output: the result
+ """
+ print('Meta block contents'.center(60, '='))
+ self.currentBlockTypes = {L:0, I:0, D:0, pL:1, pI:1, pD:1}
+ self.lastDistances = deque([17,16,11,4], maxlen=4)
+ #the current context mode is for block type 0
+ self.contextMode = ContextModeKeeper(self.literalContextModes[0])
+ wordList = WordList()
+
+ #setup distance callback function
+ def distanceCallback(symbol, extra):
+ "callback function for displaying decoded distance"
+ index, offset = symbol.value(extra)
+ if index:
+ #recent distance
+ distance = self.lastDistances[-index]+offset
+ return 'Distance: {}last{:+d}={}'.format(index, offset, distance)
+ #absolute value
+ if offset<=maxDistance:
+ return 'Absolute value: {} (pos {})'.format(offset, maxDistance-offset)
+ #word list value
+ action, word = divmod(offset-maxDistance, 1<<wordList.NDBITS[copyLen])
+ return '{}-{} gives word {},{} action {}'.format(
+ offset, maxDistance, copyLen, word, action)
+ for dpc in self.prefixCodes[D]: dpc.callback = distanceCallback
+
+ blockLen = 0
+ #there we go
+ while blockLen<self.MLEN:
+ #get insert&copy command
+ litLen, copyLen, dist0Flag = self.verboseRead(
+ self.prefixCodes[I][
+ self.figureBlockType(I)])
+ #literal data
+ for i in range(litLen):
+ bt = self.figureBlockType(L)
+ cm = self.contextMode.getIndex()
+ ct = self.cmaps[L][bt<<6|cm]
+ char = self.verboseRead(
+ self.prefixCodes[L][ct],
+ context='{},{}='.format(bt,cm))
+ self.contextMode.add(char)
+ self.output.append(char)
+ blockLen += litLen
+ #check if we're done
+ if blockLen>=self.MLEN: return
+ #distance
+ #distances are computed relative to output length, at most window size
+ maxDistance = min(len(self.output), self.windowSize)
+ if dist0Flag:
+ distance = self.lastDistances[-1]
+ else:
+ bt = self.figureBlockType(D)
+ cm = {2:0, 3:1, 4:2}.get(copyLen, 3)
+ ct = self.cmaps[D][bt<<2|cm]
+ index, offset = self.verboseRead(
+ self.prefixCodes[D][ct],
+ context='{},{}='.format(bt,cm))
+ distance = self.lastDistances[-index]+offset if index else offset
+ if index==1 and offset==0:
+ #to make sure distance is not put in last distance list
+ dist0Flag = True
+ if distance<=maxDistance:
+ #copy from output
+ for i in range(
+ maxDistance-distance,
+ maxDistance-distance+copyLen):
+ self.output.append(self.output[i])
+ if not dist0Flag: self.lastDistances.append(distance)
+ comment = 'Seen before'
+ else:
+ #fetch from wordlist
+ newWord = wordList.word(copyLen, distance-maxDistance-1)
+ self.output.extend(newWord)
+ #adjust copyLen to reflect actual new data
+ copyLen = len(newWord)
+ comment = 'From wordlist'
+ blockLen += copyLen
+ print(' '*40,
+ comment,
+ ': "',
+ outputFormatter(self.output[-copyLen:]),
+ '"',
+ sep='')
+ self.contextMode.add(self.output[-2])
+ self.contextMode.add(self.output[-1])
+
+ def figureBlockType(self, kind):
+ counts, types = self.currentBlockCounts, self.currentBlockTypes
+ if counts[kind]==0:
+ newType = self.verboseRead(self.blockTypeCodes[kind])
+ if newType==-2: newType = types['P'+kind]
+ elif newType==-1:
+ newType = (types[kind]+1)%self.numberOfBlockTypes[kind]
+ types['P'+kind] = types[kind]
+ types[kind] = newType
+ counts[kind] = self.verboseRead(self.blockCountCodes[kind])
+ counts[kind] -=1
+ return types[kind]
+
+__test__ = {
+'BitStream': """
+ >>> bs = BitStream(b'Jurjen')
+ >>> bs.readBytes(2)
+ b'Ju'
+ >>> bs.read(6) #r=01110010
+ 50
+ >>> bs
+ BitStream(pos=2:6)
+ >>> bs.peek(5) #j=01101010
+ 9
+ >>> bs.readBytes(2)
+ Traceback (most recent call last):
+ ...
+ ValueError: readBytes: need byte boundary
+ """,
+
+'Symbol': """
+ >>> a=Symbol(MetablockLengthAlphabet(),5)
+ >>> len(a)
+ 2
+ >>> int(a)
+ 5
+ >>> a.bitPattern()
+ '01'
+ >>> a.value(200000)
+ 200001
+ >>> a.explanation(300000)
+ 'data length: 493e0h+1=300001'
+ """,
+
+'RangeDecoder': """
+ >>> a=RangeDecoder(bitLength=3)
+ >>> len(a)
+ 8
+ >>> a.name='t'
+ >>> list(a)
+ [Symbol(t, 0), Symbol(t, 1), Symbol(t, 2), Symbol(t, 3), Symbol(t, 4), Symbol(t, 5), Symbol(t, 6), Symbol(t, 7)]
+ >>> a[2]
+ Symbol(t, 2)
+ >>> a.bitPattern(4)
+ '100'
+ >>> a.length(2)
+ 3
+ >>> a.decodePeek(15)
+ (3, Symbol(t, 7))
+ >>>
+
+ """,
+
+'PrefixDecoder': """
+ >>> a=PrefixDecoder(decodeTable={0:1,1:2,3:3,7:4})
+ >>> len(a)
+ 4
+ >>> a.name='t'
+ >>> list(a)
+ [Symbol(t, 1), Symbol(t, 2), Symbol(t, 3), Symbol(t, 4)]
+ >>> a.decodePeek(22)
+ (1, Symbol(t, 1))
+ >>> a.decodePeek(27)
+ (3, Symbol(t, 3))
+ >>> a.length(1)
+ 1
+ >>> a.length(4)
+ 3
+ """,
+
+'Code': """
+ >>> a=Code('t',alphabetSize=10)
+ >>> len(a)
+ 10
+ >>> a.showCode()
+ 0000:0 0001:1 0010:2 0011:3 0100:4 0101:5 0110:6 0111:7 1000:8 1001:9
+ >>> a.setLength({2:1,3:2,5:3,6:3})
+ >>> a.showCode()
+ 0:2 01:3 011:5 111:6
+ >>> len(a)
+ 4
+ >>> def callback(i): return 'call{}back'.format(i)
+ >>> a=Code('t',callback=callback,bitLength=3)
+ >>> a[6].explanation()
+ 'call6back'
+ """,
+
+'WithExtra': """
+ >>> class A(WithExtra):
+ ... extraTable = [0,1,1,2,2]
+ >>> a=A('t',alphabetSize=5)
+ >>> a[1]
+ Symbol(t, 1)
+ >>> a.extraBits(2)
+ 1
+ >>> a.mnemonic(4)
+ '4'
+ >>> a.readTupleAndExtra(BitStream(b'\x5b'))
+ (3, Symbol(t, 3), 2, 3)
+ """,
+
+'BoolCode': """
+ >>> BoolCode('test')[0].explanation()
+ '0: False'
+ """,
+
+'Enumerator': """
+ >>> class A(Enumerator):
+ ... extraTable = [0,1,1,2,2]
+ ... value0=3
+ >>> a=A(alphabetLength=5)
+ >>> a.value(3)
+ Traceback (most recent call last):
+ ...
+ TypeError: value() missing 1 required positional argument: 'extra'
+ >>> a.explanation(3,4)
+ 'xx 011: 8-11; 8+4=12'
+ """,
+
+'WindowSizeAlphabet': """
+ >>> windowSizeAlphabet = WindowSizeAlphabet()
+ >>> windowSizeAlphabet[0]
+ Traceback (most recent call last):
+ ...
+ ValueError: No symbol WindowSizeAlphabet[0]
+ >>> len(windowSizeAlphabet)
+ 16
+ >>> windowSizeAlphabet[21]
+ Symbol(WSIZE, 21)
+ >>> windowSizeAlphabet[21].bitPattern()
+ '1001'
+ >>> windowSizeAlphabet[21].extraBits()
+ 0
+ >>> windowSizeAlphabet[21].index
+ 21
+ >>> windowSizeAlphabet[10].value()
+ 1008
+ >>> windowSizeAlphabet[10].explanation()
+ 'windowsize=(1<<10)-16=1008'
+ >>> windowSizeAlphabet.showCode()
+ 0:65520 1100001:16368 1110001:32752 0011:262128
+ 0000001:131056 0010001:None 1001:2097136 1011:4194288
+ 1000001:4080 1010001:8176 0101:524272 0111:1048560
+ 0100001:1008 0110001:2032 1101:8388592 1111:16777200
+ """,
+
+'TypeCountAlphabet': """
+ >>> typeCountAlphabet = TypeCountAlphabet(description='bananas')
+ >>> len(typeCountAlphabet)
+ 9
+ >>> typeCountAlphabet[3]
+ Symbol(BT#, 3)
+ >>> typeCountAlphabet[9]
+ Traceback (most recent call last):
+ ...
+ ValueError: No symbol TypeCountAlphabet[9]
+ >>> print(typeCountAlphabet[3])
+ xx,0101
+ >>> typeCountAlphabet[8].value(127)
+ 256
+ >>> typeCountAlphabet[4].explanation(2)
+ 'xxx,0111: 11 bananas'
+ >>> typeCountAlphabet[0].explanation()
+ '0: 1 banana'
+ """,
+
+'DistanceParamAlphabet': """
+ >>> dpa = DistanceParamAlphabet()
+ >>> dpa.showCode()
+ 00:PF0 01:PF1 10:PF2 11:PF3
+ >>> dpa.readTupleAndExtra(BitStream(b'\\x29'))
+ (2, Symbol(DIST, 1), 4, 10)
+ >>> dpa.explanation(2, 5)
+ '2 postfix bits and 0101<<2=20 direct codes'
+ """,
+
+'LiteralAlphabet': """
+ >>> LiteralAlphabet(-1).showCode() #doctest: +ELLIPSIS
+ 00000000:\\x00 00110100:4 01101000:h 10011100:\\x9c 11010000:\\xd0
+ 00000001:\\x01 00110101:5 01101001:i 10011101:\\x9d 11010001:\\xd1
+ 00000010:\\x02 00110110:6 01101010:j 10011110:\\x9e 11010010:\\xd2
+ ...
+ 00101111:/ 01100011:c 10010111:\\x97 11001011:\\xcb 11111111:\\xff
+ 00110000:0 01100100:d 10011000:\\x98 11001100:\\xcc
+ 00110001:1 01100101:e 10011001:\\x99 11001101:\\xcd
+ 00110010:2 01100110:f 10011010:\\x9a 11001110:\\xce
+ 00110011:3 01100111:g 10011011:\\x9b 11001111:\\xcf
+ """,
+
+'BlockCountAlphabet': """
+ >>> bc=BlockCountAlphabet('BCL')
+ >>> len(bc)
+ 26
+ >>> bs=BitStream(b'\\x40\\x83\\xc8\\x59\\12\\x02')
+ >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3])
+ 'Block count: xx 00000: 1-4; 1+2=3'
+ >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3])
+ 'Block count: xxx 00110: 33-40; 33+0=33'
+ >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3])
+ 'Block count: xxxxxx 10001: 305-368; 305+28=333'
+ >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3])
+ 'Block count: xxxxxxxxxxx 10110: 2289-4336; 2289+1044=3333'
+ """,
+
+'Layout': """
+ >>> olleke.pos = 0
+ >>> l = Layout(olleke)
+ >>> l.verboseRead(WindowSizeAlphabet())
+ 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288
+ 4194288
+ >>> l.verboseRead(BoolCode('LAST', description="Last block"))
+ 1 LAST Last block: 1: True
+ True
+ >>> l.verboseRead(BoolCode('EMPTY', description="Empty block"))
+ 0 EMPTY Empty block: 0: False
+ False
+ >>> l.verboseRead(MetablockLengthAlphabet())
+ 0001 2e 00 |00h|2Eh,|00 MLEN data length: 002eh+1=47
+ 47
+ >>> olleke.pos = 76
+ >>> l = Layout(olleke)
+ >>> x = l.verboseRead(DistanceAlphabet(0,NPOSTFIX=0,NDIRECT=0), skipExtra=True)
+ 000a 82 10|1100 D0 10[15*x]-3
+ >>> x.explanation(0x86a3)
+ '10[1000011010100011]-3: [0]+100000'
+ """,
+
+'olleke': """
+ >>> olleke.pos = 0
+ >>> try: Layout(olleke).processStream()
+ ... except NotImplementedError: pass
+ ... #doctest: +REPORT_NDIFF
+ addr hex binary context explanation
+ -----------------------Stream header------------------------
+ 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288
+ ======================Metablock header======================
+ 1 LAST Last block: 1: True
+ 0 EMPTY Empty block: 0: False
+ 0001 2e 00 |00h|2Eh,|00 MLEN data length: 002eh+1=47
+ -------------------Block type descriptors-------------------
+ 0003 00 0 BT#L 0: 1 literal block type
+ 0 BT#I 0: 1 insert&copy block type
+ 0 BT#D 0: 1 distance block type
+ ------------------Distance code parameters------------------
+ 0004 44 0|000,00 DIST 0 postfix bits and 0000<<0=0 direct codes
+ -----------------------Context modes------------------------
+ 10 LC0 Context mode for type 0: 2(UTF8)
+ ------------------------Context maps------------------------
+ 0 LT# 0: 1 literal prefix tree
+ 0 DT# 0: 1 distance prefix tree
+ ---------------------Prefix code lists----------------------
+ 10 PFX L0 is complex with lengths 3,4,0,5,17...
+ 0005 4f 1|0 ##L0 len 3: coded with 3 bits
+ 0111 ##L0 len 4: coded with 1 bits
+ 10 ##L0 unused: coded with 3 bits
+ 0006 d6 0|0 ##L0 len 5: skipped
+ 011 ##L0 zero xxx: coded with 2 bits
+ ***** Lengths for L0 will be coded as:
+ 0:len 4 01:zero xxx 011:unused 111:len 3
+ 0007 95 1|11,01 #L0 7+3 unused
+ 0 #L0 Length for \\n is 4 bits
+ 001,01 #L0 1+3 unused
+ 0008 44 010,0|1 #L0 total 19+2 unused
+ 0 #L0 Length for " " is 4 bits
+ 0 #L0 Length for ! is 4 bits
+ 0009 cb 011,|01 #L0 3+3 unused
+ |110,01 #L0 total 35+6 unused
+ 000a 82 0 #L0 Length for K is 4 bits
+ 000,01 #L0 0+3 unused
+ 0 #L0 Length for O is 4 bits
+ 000b 4d 01|1 #L0 symbol P unused
+ 011 #L0 symbol Q unused
+ 0 #L0 Length for R is 4 bits
+ 000c 88 000,|01 #L0 0+3 unused
+ |100,01 #L0 total 11+4 unused
+ 000d b6 0 #L0 Length for b is 4 bits
+ 011 #L0 symbol c unused
+ 011 #L0 symbol d unused
+ 000e 27 11|1 #L0 Length for e is 3 bits
+ 010,01 #L0 2+3 unused
+ |0 #L0 Length for k is 4 bits
+ 000f 1f 111 #L0 Length for l is 3 bits
+ 011 #L0 symbol m unused
+ 0 #L0 Length for n is 4 bits
+ |0 #L0 Length for o is 4 bits
+ 0010 c1 000,01 #L0 0+3 unused
+ 0 #L0 Length for s is 4 bits
+ 0011 b4 0|11 #L0 symbol t unused
+ 0 #L0 Length for u is 4 bits
+ End of table. Prefix code L0:
+ 000:e 0010:\\n 0110:! 0001:O 0101:b 0011:n 0111:s
+ 100:l 1010:" " 1110:K 1001:R 1101:k 1011:o 1111:u
+ 11,01 PFX IC0 is simple with 4 code words
+ 0012 2a |2Ah|10 IC0 ? bits: I5C4
+ 0013 b5 ec 00|B5h IC0 ? bits: I6+xC7
+ 0015 22 0010|111011 IC0 ? bits: I8+xC5
+ 0016 8c 001100|0010 IC0 ? bits: I0C14+xx
+ 0 SHAPE False: lengths 2,2,2,2
+ 0017 74 10,0|1 PFX D0 is simple with 3 code words
+ 0018 a6 0|01110 D0 1 bit: 2last-3
+ 010011 D0 2 bits: 11xx-3
+ 0019 aa 01010|1 D0 2 bits: 11xxx-3
+ ====================Meta block contents=====================
+ |1,01 IC0 Literal: 9, copy: 5
+ 001a 41 0001 0,0=L0 O
+ 100 0,48=L0 l
+ 001b a2 10|0 0,62=L0 l
+ 000 0,63=L0 e
+ 001c a1 1|101 0,59=L0 k
+ 000 0,63=L0 e
+ |1010 0,59=L0 " "
+ 001d b5 0101 0,11=L0 b
+ |1011 0,60=L0 o
+ 001e 24 0 0,3=D0 Distance: 2last-3=8
+ Seen before: "lleke"
+ 0,10 IC0 Literal: 6, copy: 7
+ |0010 0,59=L0 \\n
+ 001f 89 1001 0,7=L0 R
+ 000 0,52=L0 e
+ 0020 fa 010|1 0,58=L0 b
+ 1111 0,63=L0 u
+ 0021 eb 011|1 0,59=L0 s
+ 11,01 0,3=D0 Absolute value: 12 (pos 8)
+ Seen before: "olleke\\n"
+ 0022 db 01,1|1 IC0 Literal: 0, copy: 15
+ |110,11 0,3=D0 Absolute value: 27 (pos 0)
+ Seen before: "Olleke bolleke\\n"
+ 0023 f8 00 IC0 Literal: 5, copy: 4
+ 1110 0,7=L0 K
+ 0024 2c 00|11 0,52=L0 n
+ 1011 0,62=L0 o
+ 0025 0d 1|00 0,59=L0 l
+ 0110 0,63=L0 !
+ """,
+
+'file': """
+ >>> try: Layout(BitStream(
+ ... open("H:/Downloads/brotli-master/tests/testdata/10x10y.compressed",'rb')
+ ... .read())).processStream()
+ ... except NotImplementedError: pass
+ addr hex binary context explanation
+ -----------------------Stream header------------------------
+ 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288
+ ======================Metablock header======================
+ 1 LAST Last block: 1: True
+ 0 EMPTY Empty block: 0: False
+ 0001 13 00 |00h|13h,|00 MLEN data length: 0013h+1=20
+ -------------------Block type descriptors-------------------
+ 0003 00 0 BT#L 0: 1 literal block type
+ 0 BT#I 0: 1 insert&copy block type
+ 0 BT#D 0: 1 distance block type
+ ------------------Distance code parameters------------------
+ 0004 a4 0|000,00 DIST 0 postfix bits and 0000<<0=0 direct codes
+ -----------------------Context modes------------------------
+ 10 LC0 Context mode for type 0: 2(UTF8)
+ ------------------------Context maps------------------------
+ 0 LT# 0: 1 literal prefix tree
+ 0 DT# 0: 1 distance prefix tree
+ ---------------------Prefix code lists----------------------
+ 0005 b0 0|1,01 PFX L0 is simple with 2 code words
+ 0006 b2 0|1011000 L0 1 bit: X
+ 0007 ea 0|1011001 L0 1 bit: Y
+ 01,01 PFX IC0 is simple with 2 code words
+ 0008 81 0000001|111 IC0 1 bit: I1C9&D=0
+ 0009 47 02 0|47h|1 IC0 1 bit: I1C9
+ 00,01 PFX D0 is simple with 1 code word
+ 000b 8a 010|000 D0 0 bits: 10x-3
+ ====================Meta block contents=====================
+ 1 IC0 Literal: 1, copy: 9
+ 0 0,0=L0 X
+ 0,() 0,3=D0 Absolute value: 1 (pos 0)
+ Seen before: "XXXXXXXXX"
+ 0 IC0 Literal: 1, copy: 9, same distance
+ |1 0,54=L0 Y
+ Seen before: "YYYYYYYYY"
+ """,
+
+'XY': """
+ >>> try: Layout(BitStream(brotli.compress('X'*10+'Y'*10))).processStream()
+ ... except NotImplementedError: pass
+ addr hex binary context explanation
+ -----------------------Stream header------------------------
+ 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288
+ ======================Metablock header======================
+ 1 LAST Last block: 1: True
+ 0 EMPTY Empty block: 0: False
+ 0001 13 00 |00h|13h,|00 MLEN data length: 0013h+1=20
+ -------------------Block type descriptors-------------------
+ 0003 00 0 BT#L 0: 1 literal block type
+ 0 BT#I 0: 1 insert&copy block type
+ 0 BT#D 0: 1 distance block type
+ ------------------Distance code parameters------------------
+ 0004 a4 0|000,00 DIST 0 postfix bits and 0000<<0=0 direct codes
+ -----------------------Context modes------------------------
+ 10 LC0 Context mode for type 0: 2(UTF8)
+ ------------------------Context maps------------------------
+ 0 LT# 0: 1 literal prefix tree
+ 0 DT# 0: 1 distance prefix tree
+ ---------------------Prefix code lists----------------------
+ 0005 b0 0|1,01 PFX L0 is simple with 2 code words
+ 0006 b2 0|1011000 L0 1 bit: X
+ 0007 82 0|1011001 L0 1 bit: Y
+ 00,01 PFX IC0 is simple with 1 code word
+ 0008 84 0000100|100 IC0 0 bits: I4C6&D=0
+ 0009 00 00,0|1 PFX D0 is simple with 1 code word
+ 000a e0 0|00000 D0 0 bits: last
+ ====================Meta block contents=====================
+ () IC0 Literal: 4, copy: 6, same distance
+ 0 0,0=L0 X
+ 0 0,52=L0 X
+ 0 0,54=L0 X
+ 0 0,54=L0 X
+ Seen before: "XXXXXX"
+ () IC0 Literal: 4, copy: 6, same distance
+ 1 0,54=L0 Y
+ 1 0,54=L0 Y
+ |1 0,54=L0 Y
+ 000b 01 1 0,54=L0 Y
+ Seen before: "YYYYYY"
+ """,
+
+'empty': """
+ >>> try: Layout(BitStream(b'\\x81\\x16\\x00\\x58')).processStream()
+ ... except NotImplementedError: pass
+ addr hex binary context explanation
+ -----------------------Stream header------------------------
+ 0000 81 0000001 WSIZE windowsize=(1<<17)-16=131056
+ ======================Metablock header======================
+ |1 LAST Last block: 1: True
+ 0001 16 0 EMPTY Empty block: 0: False
+ 11 MLEN 11: empty block
+ 0 RSVD Reserved (must be zero)
+ 0002 00 000000|00,01 SKIP skip length: 0h+1=1
+ |00 SKIP 2 bits ignored
+ Skipping to 4
+ """,
+
+}
+
+if __name__=='__main__':
+ import sys
+ if len(sys.argv)>1:
+ l = Layout(BitStream(open(sys.argv[1],'rb').read()))
+ l.processStream()
+ else:
+ sys.path.append("h:/Persoonlijk/bin")
+ try:
+ import brotli
+ open('brotlidump.br', 'wb').write(
+ brotli.compress(
+ open('brotlidump.py', 'r').read()
+ ))
+ olleke = BitStream(brotli.compress(
+ 'Olleke bolleke\nRebusolleke\nOlleke bolleke\nKnol!'))
+ except ImportError: pass
+ import doctest
+ doctest.testmod(optionflags=doctest.REPORT_NDIFF
+ #|doctest.FAIL_FAST
+ )