diff options
Diffstat (limited to 'python')
-rw-r--r-- | python/brotlidump.py | 2361 |
1 files changed, 0 insertions, 2361 deletions
diff --git a/python/brotlidump.py b/python/brotlidump.py deleted file mode 100644 index a625368..0000000 --- a/python/brotlidump.py +++ /dev/null @@ -1,2361 +0,0 @@ -#!python3 -"""Program to dump contents of Brotli compressed files showing the compression format. -Jurjen N.E. Bos, 2016. -I found the following issues with the Brotli format: -- The distance alphabet has size 16+(48<<POSTFIX), - but the last symbols are useless. - It could be lowered to 16+(44-POSTFIX<<POSTFIX), and this could matter. -- The block type code is useless if NBLTYPES==2, you would only need 1 symbol - anyway, so why don't you just switch to "the other" type? -""" -import struct -from operator import itemgetter, methodcaller -from itertools import accumulate, repeat -from collections import defaultdict, deque -from functools import partial - -class InvalidStream(Exception): pass -#lookup table -L, I, D = "literal", "insert©", "distance" -pL, pI, pD = 'P'+L, 'P'+I, 'P'+D - -def outputCharFormatter(c): - """Show character in readable format - """ - #TODO 2: allow hex only output - if 32<c<127: return chr(c) - elif c==10: return '\\n' - elif c==13: return '\\r' - elif c==32: return '" "' - else: return '\\x{:02x}'.format(c) - -def outputFormatter(s): - """Show string or char. - """ - result = '' - def formatSubString(s): - for c in s: - if c==32: yield ' ' - else: yield outputCharFormatter(c) - if len(result)<200: return ''.join(formatSubString(s)) - else: - return ''.join(formatSubString(s[:100]))+'...'+ \ - ''.join(formatSubString(s[-100:])) - - -class BitStream: - """Represent a bytes object. Can read bits and prefix codes the way - Brotli does. - """ - def __init__(self, byteString): - self.data = byteString - #position in bits: byte pos is pos>>3, bit pos is pos&7 - self.pos = 0 - - def __repr__(self): - """Representation - >>> olleke - BitStream(pos=0:0) - """ - return "BitStream(pos={:x}:{})".format(self.pos>>3, self.pos&7) - - def read(self, n): - """Read n bits from the stream and return as an integer. - Produces zero bits beyond the stream. - >>> olleke.data[0]==27 - True - >>> olleke.read(5) - 27 - - >>> olleke - BitStream(pos=0:5) - """ - value = self.peek(n) - self.pos += n - if self.pos>len(self.data)*8: - raise ValueError('Read past end of stream') - return value - - def peek(self, n): - """Peek an n bit integer from the stream without updating the pointer. - It is not an error to read beyond the end of the stream. - >>> olleke.data[:2]==b'\x1b\x2e' and 0x2e1b==11803 - True - >>> olleke.peek(15) - 11803 - >>> hex(olleke.peek(32)) - '0x2e1b' - """ - #read bytes that contain the data: self.data[self.pos>>3:self.pos+n+7>>3] - #convert to int: int.from_bytes(..., 'little') - #shift out the bits from the first byte: >>(self.pos&7) - #mask unwanted bits: & (1<<n)-1 - return int.from_bytes( - self.data[self.pos>>3:self.pos+n+7>>3], - 'little')>>(self.pos&7) & (1<<n)-1 - - def readBytes(self, n): - """Read n bytes from the stream on a byte boundary. - """ - if self.pos&7: raise ValueError('readBytes: need byte boundary') - result = self.data[self.pos>>3:(self.pos>>3)+n] - self.pos += 8*n - return result - -#-----------------------Symbol------------------------------------------- -class Symbol: - """A symbol in a code. - Refers back to the code that contains it. - Index is the place in the alphabet of the symbol. - """ - __slots__ = 'code', 'index' - def __init__(self, code, value): - self.code = code - self.index = value - - def __repr__(self): - return 'Symbol({}, {})'.format(self.code.name, self.index) - - def __len__(self): - """Number of bits in the prefix notation of this symbol - """ - return self.code.length(self.index) - - def __int__(self): - return self.index - - #these routines call equivalent routine in Code class - def bitPattern(self): - """Value of the symbol in the stream - """ - return self.code.bitPattern(self.index) - - def extraBits(self): - """Number of extra bits to read for this symbol - """ - return self.code.extraBits(self.index) - - def __str__(self): - """Short descriptor of the symbol without extra bits. - """ - return self.code.mnemonic(self.index) - - #requiring optional extra bits, if self.code supports them - def value(self, extra=None): - """The value used for processing. Can be a tuple. - with optional extra bits - """ - if isinstance(self.code, WithExtra): - if not 0<=extra<1<<self.extraBits(): - raise ValueError("value: extra value doesn't fit in extraBits") - return self.code.value(self.index, extra) - if extra is not None: - raise ValueError('value: no extra bits for this code') - return self.code.value(self.index) - - def explanation(self, extra=None): - """Long explanation of the value from the numeric value - with optional extra bits - Used by Layout.verboseRead when printing the value - """ - if isinstance(self.code, WithExtra): - return self.code.callback(self, extra) - return self.code.callback(self) - -#========================Code definitions================================== -class RangeDecoder: - """A decoder for the Code class that assumes the symbols - are encoded consecutively in binary. - It all depends on the "alphabetSize" property. - The range runs from 0 to alphabetSize-1. - This is the default decoder. - """ - def __init__(self, *, alphabetSize=None, bitLength=None, **args): - if bitLength is not None: alphabetSize = 1<<bitLength - if alphabetSize is not None: - self.alphabetSize = alphabetSize - self.maxLength = (alphabetSize-1).bit_length() - - def __len__(self): - return self.alphabetSize - - def __iter__(self): - """Produce all symbols. - """ - return map(partial(Symbol, self), range(len(self))) - - def __getitem__(self, index): - if index>=self.alphabetSize: raise ValueError('index out of range') - return Symbol(self, index) - - def bitPattern(self, index): - return '{:0{}b}'.format(index, self.maxLength) - - def length(self, index): - """Encoding length of given symbol. - Does not depend on index in this case. - """ - return self.maxLength - - def decodePeek(self, data): - """Find which symbol index matches the given data (from peek, as a number) - and return the number of bits decoded. - Can also be used to figure out length of a symbol. - """ - return self.maxLength, Symbol(self, data&(1<<self.maxLength)-1) - -class PrefixDecoder: - """A decoder for the Code class that uses a prefix code. - The code is determined by encoding: - encode[p] gives the index corresponding to bit pattern p. - Used setDecode(decodeTable) to switch the decoder from the default - to a prefix decoder, or pass decodeTable at init. - You can also use setLength(lengthTable) - to define the encoding from the lengths. - The set of symbol values does not need to be consecutive. - """ - def __init__(self, *, decodeTable=None, **args): - if decodeTable is not None: self.setDecode(decodeTable) - - def __len__(self): - return len(self.decodeTable) - - def __iter__(self): - def revBits(index): - return self.bitPattern(index)[::-1] - return ( - Symbol(self, index) - for index in sorted(self.decodeTable.values(), key=revBits) - ) - - def __getitem__(self, index): - if index not in self.lengthTable: - raise ValueError('No symbol {}[{}]'.format( - self.__class__.__name__, index)) - return Symbol(self, index) - - def bitPattern(self, index): - bits = next(b for (b,s) in self.decodeTable.items() if s==index) - return '{:0{}b}'.format(bits, self.length(index)) - - def length(self, index): - """Encoding length of given symbol. - """ - return self.lengthTable[index] - - def decodePeek(self, data): - """Find which symbol index matches the given data (from peek, as a number) - and return the number of bits decoded. - Can also be used to figure out length of a symbol. - """ - #do binary search for word length - #invariant: lo<=length<=hi - lo, hi = self.minLength, self.maxLength - while lo<=hi: - mid = lo+hi>>1 - #note lo<=mid<hi at this point - mask = (1<<mid)-1 - #lets see what happens if we guess length is mid - try: index = self.decodeTable[data&mask] - except KeyError: - #too many bits specified, reduce estimated length - hi = mid-1 - continue - #we found a symbol, but there could be a longer match - symbolLength = self.lengthTable[index] - if symbolLength<=mid: - #all bits match, symbol must be right - return symbolLength, Symbol(self, index) - #there must be more bits to match - lo = mid+1 - return lo, Symbol(self, index) - - #routine to set up the tables - def setDecode(self, decodeTable): - """Store decodeTable, - and compute lengthTable, minLength, maxLength from encodings. - """ - self.decodeTable = decodeTable - #set of symbols with unknown length - todo = set(decodeTable) - #bit size under investigation - maskLength = 0 - lengthTable = {} - while todo: - mask = (1<<maskLength)-1 - #split the encodings that we didn't find yet using b bits - splitSymbols = defaultdict(list) - for s in todo: splitSymbols[s&mask].append(s) - #unique encodings have a length of maskLength bits - #set length, and remove from todo list - for s,subset in splitSymbols.items(): - if len(subset)==1: - lengthTable[self.decodeTable[s]] = maskLength - todo.remove(s) - #now investigate with longer mask - maskLength +=1 - #save result - self.lengthTable = lengthTable - self.minLength = min(lengthTable.values()) - self.maxLength = max(lengthTable.values()) - self.switchToPrefix() - - def setLength(self, lengthTable): - """Given the bit pattern lengths for symbols given in lengthTable, - set decodeTable, minLength, maxLength - """ - self.lengthTable = lengthTable - self.minLength = min(lengthTable.values()) - self.maxLength = max(lengthTable.values()) - #compute the backwards codes first; then reverse them - #compute (backwards) first code for every separate lengths - nextCodes = [] - #build codes for each length, from right to left - code = 0 - for bits in range(self.maxLength+1): - code <<= 1 - nextCodes.append(code) - code += sum(x==bits for x in lengthTable.values()) - self.decodeTable = {} - #count codes for each length, and store reversed in the table - for symbol in sorted(lengthTable): - bits = lengthTable[symbol] - bitpattern = '{:0{}b}'.format(nextCodes[bits], bits) - self.decodeTable[int(bitpattern[::-1], 2)] = symbol - nextCodes[bits] += 1 - self.switchToPrefix() - - def switchToPrefix(self): - """This routine makes sure the prefix decoder is activated. - """ - self.mode = PrefixDecoder - -class Code(RangeDecoder, PrefixDecoder): - """An alphabet of symbols, that can be read from a stream. - If you use setDecode or setLength, you have a prefix code, - otherwise you have a range code. - Features: - code[index] produces symbol with given index - value(index): value of symbol - mnemonic(index): short description of symbol - explanation(index): show meaning of symbol, shown in Layout.verboseRead - iter(code): produce all symbols in some order - name: show as context in Layout.verboseRead - """ - name = '?' - #callback is a function that gets the symbol and the extra bits - #default callback calls explanation - def __init__(self, name=None, *, callback=None, description='', **args): - """Don't forget to set either alphabetSize or decodeTable - """ - #set name when provided, otherwise take class variable - if name is not None: self.name = name - if callback is not None: self.callback = callback - self.description = description - #mode switch - if 'bitLength' in args or 'alphabetSize' in args: - self.mode = RangeDecoder - RangeDecoder.__init__(self, **args) - elif 'decodeTable' in args: - self.mode = PrefixDecoder - PrefixDecoder.__init__(self, **args) - else: - super().__init__(**args) - - def __repr__(self): - return self.__class__.__name__+' '+self.name - - #the routines that get switched between RangeDecoder and PrefixDecoder - def __len__(self): return self.mode.__len__(self) - def __iter__(self): return self.mode.__iter__(self) - def __getitem__(self, index): return self.mode.__getitem__(self, index) - def bitPattern(self, index): return self.mode.bitPattern(self, index) - def length(self, index): return self.mode.length(self, index) - def decodePeek(self, data): return self.mode.decodePeek(self, data) - #general routines - def value(self, index, extra=None): - """Get value of symbol for computations. - Override where needed. - """ - if extra is not None: - raise ValueError('value: no extra for this symbol') - return index - - def mnemonic(self, index): - """Give mnemonic of symbol. - Override where needed. - """ - return str(self.value(index)) - - def callback(self, symbol): - return self.explanation(symbol.index) - - def explanation(self, index): - """Long explanation of the value from the numeric value - This is a default routine. - You can customize in three ways: - - set description to add some text - - override to get more control - - set callback to make it dependent on you local variables - """ - value = self.value(index) - return '{0}{1}: {2}'.format( - self.description and self.description+': ', - self.bitPattern(index), - value, - ) - - def extraBits(self, index): - return 0 - - #Routines that use the decode interface - def showCode(self, width=80): - """Show all words of the code in a nice format. - """ - #make table of all symbols with binary strings - symbolStrings = [ - (self.bitPattern(s.index), self.mnemonic(s.index)) - for s in self - ] - #determine column widths the way Lisp programmers do it - leftColWidth, rightColWidth = map(max, map( - map, - repeat(len), - zip(*symbolStrings) - )) - colwidth = leftColWidth+rightColWidth - columns = 81//(colwidth+2) - rows = -(-len(symbolStrings)//columns) - def justify(bs): - b,s = bs - return b.rjust(leftColWidth)+':'+s.ljust(rightColWidth) - for i in range(rows): - print(' '.join(map(justify, symbolStrings[i::rows])).rstrip()) - - def readTuple(self, stream): - """Read symbol from stream. Returns symbol, length. - """ - length, symbol = self.decodePeek(stream.peek(self.maxLength)) - stream.pos += length - return length, symbol - - def readTupleAndExtra(self, stream): - return self.readTuple(stream)+(0, None) - -class WithExtra(Code): - """Extension for Code so that symbol may have extra bits associated. - If you supply an extraTable, you can use extraBits - You can define an extraTable, - which allows to call extraBits to get the number of extraBits. - Otherwise, you can supply extraBits yourself. - Routine readTupleAndExtra now reads the extra bits too. - Value probably needs to be overridden; see Enumerator. - Note: this does not give you an decodeTable. - """ - #redefine these if you don't want to use an extraTable - def extraBits(self, index): - """Get the number of extra bits for this symbol. - """ - return self.extraTable[index] - - def mnemonic(self, index): - """This value must be independent of extra. - """ - return str(index) - - def readTupleAndExtra(self, stream): - """Read symbol and extrabits from stream. - Returns symbol length, symbol, extraBits, extra - >>> olleke.pos = 6 - >>> MetablockLengthAlphabet().readTupleAndExtra(olleke) - (2, Symbol(MLEN, 4), 16, 46) - """ - length, symbol = self.decodePeek(stream.peek(self.maxLength)) - stream.pos += length - extraBits = self.extraBits(symbol.index) - return length, symbol, extraBits, stream.read(extraBits) - - def explanation(self, index, extra=None): - """Expanded version of Code.explanation supporting extra bits. - If you don't supply extra, it is not mentioned. - """ - extraBits = 0 if extra is None else self.extraBits(index) - if not hasattr(self, 'extraTable'): - formatString = '{0}{3}' - lo = hi = value = self.value(index, extra) - elif extraBits==0: - formatString = '{0}{2}: {3}' - lo, hi = self.span(index) - value = lo - else: - formatString = '{0}{1} {2}: {3}-{4}; {3}+{5}={6}' - lo, hi = self.span(index) - value = lo+extra - return formatString.format( - self.description and self.description+': ', - 'x'*extraBits, - self.bitPattern(index), - lo, hi, - extra, - value, - ) - - def callback(self, symbol, extra): - return self.explanation(symbol.index, extra) - -class BoolCode(Code): - """Same as Code(bitLength=1), but shows a boolean. - """ - def __init__(self, name=None, **args): - super().__init__(name, bitLength=1, **args) - - def value(self, index, extra=None): - return bool(super().value(index, extra)) - -class Enumerator(WithExtra): - """Code that is defined by the ExtraTable. - extraTable is a class variable that contains - the extraBits of the symbols from 0 - value0 contains the value of symbol 0 - encodings is not neccessary, but allowed. - Note: place for FixedCode to make sure extraBits works - """ - def __init__(self, name=None, **args): - #if there is no decodeTable to determine length, compute it ourselves - if 'decodeTable' not in args: - args['alphabetSize'] = len(self.extraTable) - super().__init__(name, **args) - - def __len__(self): - return len(self.extraTable) - - def __getitem__(self, index): - """Faster than PrefixDecoder - """ - if index>=len(self.extraTable): - raise ValueError("No symbol {}[{}]".format( - self.__class__.__name__, index)) - return Symbol(self, index) - - def value(self, index, extra): - """Override if you don't define value0 and extraTable - """ - lower, upper = self.span(index) - value = lower+(extra or 0) - if value>upper: - raise ValueError('value: extra out of range') - return value - - def span(self, index): - """Give the range of possible values in a tuple - Useful for mnemonic and explanation - """ - lower = self.value0+sum(1<<x for x in self.extraTable[:index]) - upper = lower+(1<<self.extraTable[index]) - return lower, upper-1 - -#======================Code subclasses====================================== -#Alphabets used in the metablock header---------------------------------- -#For prefix codes -class PrefixCodeHeader(WithExtra): - """Header of prefix codes. - """ - def __init__(self, codename): - super().__init__('PFX', bitLength=2) - #this is the name of the code that it describes - self.codename = codename - - def extraBits(self, index): - return 2 if index==1 else 0 - - def value(self, index, extra): - """Returns ('Simple', #codewords) or ('Complex', HSKIP) - """ - if index==1: - if extra>3: - raise ValueError('value: extra out of range') - return 'Simple', extra+1 - if extra: - raise ValueError('value: extra out of range') - return 'Complex', index - - def explanation(self, index, extra): - if index==1: - return '{} is simple with {} code word{}'.format( - self.codename, extra+1, 's' if extra else '') - lengths = [1, 2, 3, 4, 0, 5, 17, 6] - return '{} is complex with lengths {}...'.format( - self.codename, - ','.join( - map(str, lengths[index:index+5])) - ) - -class TreeShapeAlhabet(BoolCode): - """The bit used to indicate if four word code is "deep" or "wide" - """ - name = 'SHAPE' - def value(self, index): - return [(2,2,2,2), (1,2,3,3)][index] - - def explanation(self, index): - return str(bool(index))+': lengths {},{},{},{}'.format(*self.value(index)) - -class LengthOfLengthAlphabet(Code): - """For use in decoding complex code descriptors. - >>> lengthOfLengthAlphabet = LengthOfLengthAlphabet('') - >>> print(lengthOfLengthAlphabet[2]) - coded with 2 bits - >>> len(lengthOfLengthAlphabet[0]) - 2 - >>> [len(lengthOfLengthAlphabet[x]) for x in range(6)] - [2, 4, 3, 2, 2, 4] - >>> lengthOfLengthAlphabet.showCode() - 00:skipped 01:coded with 4 bits 0111:coded with 1 bits - 10:coded with 3 bits 011:coded with 2 bits 1111:coded with 5 bits - """ - decodeTable = { - 0b00:0, 0b10:3, - 0b0111:1, 0b01:4, - 0b011:2, 0b1111:5, - } - - def __init__(self, name=None, **args): - super().__init__(name, decodeTable=self.decodeTable, **args) - - def mnemonic(self, index): - if index==0: return 'skipped' - return 'coded with {} bits'.format(index) - - def explanation(self, index, extra=None): - return self.description+': '+self.mnemonic(index) - -class LengthAlphabet(WithExtra): - """Length of symbols - Used during construction of a code. - """ - def __init__(self, name): - super().__init__(name, alphabetSize=18) - - def extraBits(self, index): - return {16:2, 17:3}.get(index, 0) - - def mnemonic(self, index): - if index==0: return 'unused' - elif index==16: return 'rep xx' - elif index==17: return 'zero xxx' - else: return 'len {}'.format(index) - - def explanation(self, index, extra): - return self.description.format(self[index], extra) - - def value(self, index, extra): - #the caller got the length already, so extra is enough - return extra - -#Stream header -class WindowSizeAlphabet(Code): - """The alphabet used for window size in the stream header. - >>> WindowSizeAlphabet()[10].explanation() - 'windowsize=(1<<10)-16=1008' - """ - decodeTable = { - 0b0100001: 10, 0b1100001: 14, 0b0011: 18, 0b1011: 22, - 0b0110001: 11, 0b1110001: 15, 0b0101: 19, 0b1101: 23, - 0b1000001: 12, 0b0: 16, 0b0111: 20, 0b1111: 24, - 0b1010001: 13, 0b0000001: 17, 0b1001: 21, - 0b0010001: None, - } - - name = 'WSIZE' - - def __init__(self, name=None): - super().__init__(name, decodeTable=self.decodeTable) - - def value(self, index): - #missing value gives index None - if index is None: return None - return (1<<index)-16 - - def explanation(self, index): - return 'windowsize=(1<<{})-16={}'.format( - index, (1<<index)-16) - -#Metablock -class MetablockLengthAlphabet(WithExtra): - """Used for the meta block length; - also indicates a block with no data - >>> metablockLengthAlphabet = MetablockLengthAlphabet() - >>> metablockLengthAlphabet[0]; str(metablockLengthAlphabet[0]) - Symbol(MLEN, 0) - 'empty' - >>> metablockLengthAlphabet[3] - Traceback (most recent call last): - ... - ValueError: No symbol MetablockLengthAlphabet[3] - >>> print(metablockLengthAlphabet[4]) - hhhh00 - >>> metablockLengthAlphabet[4].value(0x1000) - 4097 - >>> metablockLengthAlphabet[5].value(0x1000) - Traceback (most recent call last): - ... - InvalidStream: Zeros in high nibble of MLEN - >>> metablockLengthAlphabet[5].explanation(0x12345) - 'data length: 12345h+1=74566' - >>> metablockLengthAlphabet.showCode() - 00:hhhh00 10:hhhhhh10 01:hhhhh01 11:empty - """ - decodeTable = {0b11:0, 0b00:4, 0b01:5, 0b10:6} - - name = 'MLEN' - def __init__(self, name=None): - super().__init__(name, decodeTable=self.decodeTable) - - def extraBits(self, index): - return index*4 - - def mnemonic(self, index): - if index==0: return 'empty' - return 'h'*(self.extraBits(index)//4)+self.bitPattern(index) - - def value(self, index, extra): - extraBits = self.extraBits(index) - if not 0<=extra<1<<extraBits: - raise ValueError('value: extra out of range') - if index==0: return 0 - if index>4 and extra>>extraBits-4==0: raise InvalidStream( - 'Zeros in high nibble of MLEN') - return extra+1 - - def explanation(self, index, extra): - if index==0: return '11: empty block' - extraBits = self.extraBits(index) - return 'data length: {:0{}x}h+1={}'.format(extra, extraBits//4, extra+1) - - -class ReservedAlphabet(BoolCode): - """The reserved bit that must be zero. - """ - name = 'RSVD' - def value(self, index): - if index: raise ValueError('Reserved bit is not zero') - - def explanation(self, index): - return 'Reserved (must be zero)' - -class FillerAlphabet(Code): - def __init__(self, *, streamPos): - super().__init__('SKIP', bitLength=(-streamPos)&7) - - def explanation(self, index): - return '{} bit{} ignored'.format( - self.length(index), - '' if self.length(index)==1 else 's', - ) - -class SkipLengthAlphabet(WithExtra): - """Used for the skip length in an empty metablock - >>> skipLengthAlphabet = SkipLengthAlphabet() - >>> skipLengthAlphabet[0]; str(skipLengthAlphabet[0]) - Symbol(SKIP, 0) - 'empty' - >>> skipLengthAlphabet[4] - Traceback (most recent call last): - ... - ValueError: index out of range - >>> print(skipLengthAlphabet[3]) - hhhhhh11 - >>> skipLengthAlphabet[2].value(0x1000) - 4097 - >>> skipLengthAlphabet[3].value(0x1000) - Traceback (most recent call last): - ... - InvalidStream: Zeros in high byte of SKIPBYTES - >>> skipLengthAlphabet[3].explanation(0x12345) - 'skip length: 12345h+1=74566' - >>> skipLengthAlphabet.showCode() - 00:empty 01:hh01 10:hhhh10 11:hhhhhh11 - """ - def __init__(self): - super().__init__('SKIP', bitLength=2) - - def extraBits(self, index): - return index*8 - - def mnemonic(self, index): - if index==0: return 'empty' - return 'h'*(self.extraBits(index)//4)+self.bitPattern(index) - - def value(self, index, extra): - extraBits = self.extraBits(index) - if not 0<=extra<1<<extraBits: - raise ValueError('value: extra out of range') - if index==0: return 0 - if index>1 and extra>>extraBits-8==0: - raise InvalidStream('Zeros in high byte of SKIPBYTES') - return extra+1 - - def explanation(self, index, extra): - if index==0: return '00: no skip' - extraBits = self.extraBits(index) - return 'skip length: {:{}x}h+1={}'.format(extra, extraBits//8, extra+1) - - -class TypeCountAlphabet(Enumerator): - """Used for giving block type counts and tree counts. - >>> TypeCountAlphabet(description='').showCode() - 0:0 0101:xx,0101 1011:xxxxx,1011 - 0001:0001 1101:xxxxxx,1101 0111:xxx,0111 - 1001:xxxx,1001 0011:x,0011 1111:xxxxxxx,1111 - """ - decodeTable = { - 0b0: 0, 0b1001: 5, - 0b0001: 1, 0b1011: 6, - 0b0011: 2, 0b1101: 7, - 0b0101: 3, 0b1111: 8, - 0b0111: 4, - } - - value0 = 1 - extraTable = [0, 0, 1, 2, 3, 4, 5, 6, 7] - name = 'BT#' - - def __init__(self, name=None, *, description): - super().__init__( - name, - decodeTable=self.decodeTable, - description=description) - - def mnemonic(self, index): - if index==0: return '0' - if index==1: return '0001' - return 'x'*(self.extraBits(index))+','+self.bitPattern(index) - - def explanation(self, index, extra): - value = self.value(index, extra) - description = self.description - if value==1: description = description[:-1] - return '{}: {} {}'.format( - self.mnemonic(index), - value, - description) - -class BlockTypeAlphabet(Code): - """The block types; this code works for all three kinds. - >>> b = BlockTypeAlphabet('T', NBLTYPES=5) - >>> print(*(x for x in b)) - prev +1 #0 #1 #2 #3 #4 - """ - def __init__(self, name, NBLTYPES, **args): - super().__init__(name, alphabetSize=NBLTYPES+2, **args) - self.NBLTYPES = NBLTYPES - - def mnemonic(self, index): - if index==0: return 'prev' - elif index==1: return '+1' - else: return '#'+str(index-2) - - def value(self, index): - return index-2 - - def explanation(self, index): - if index==0: return '0: previous' - elif index==1: return '1: increment' - else: return 'Set block type to: '+str(index-2) - -class BlockCountAlphabet(Enumerator): - """Block counts - >>> b = BlockCountAlphabet('L') - >>> print(b[25]) - [24*x]: BC16625-16793840 - """ - - value0 = 1 - extraTable = [2,2,2,2,3, 3,3,3,4,4, 4,4,5,5,5, 5,6,6,7,8, 9,10,11,12,13, 24] - def __init__(self, name, **args): - super().__init__(name, alphabetSize=26, **args) - - def mnemonic(self, index): - extraBits = self.extraBits(index) - return '{}: BC{}-{}'.format( - 'x'*extraBits if index<5 else '[{}*x]'.format(extraBits), - *self.span(index)) - - def explanation(self, index, extra): - return 'Block count: '+super().explanation(index, extra) - -class DistanceParamAlphabet(WithExtra): - """The distance parameters NPOSTFIX and NDIRECT. - Although these are treated as two in the description, this is easier. - """ - def __init__(self): - super().__init__('DIST', bitLength=2) - - def extraBits(self, index): - return 4 - - def value(self, index, extra): - """Returns NPOSTFIX and NDIRECT<<NPOSTFIX - """ - if extra>15: - raise ValueError('value: extra out of range') - return index, extra<<index - - def explanation(self, index, extra): - return '{} postfix bits and {:04b}<<{}={} direct codes'.format( - index, extra, index, extra<<index) - - def mnemonic(self, index): - return 'PF'+str(index) - -class LiteralContextMode(Code): - """For the literal context modes. - >>> LiteralContextMode().showCode() - 00:LSB6 01:MSB6 10:UTF8 11:Signed - >>> LiteralContextMode().explanation(2) - 'Context mode for type 9: 2(UTF8)' - """ - - def __init__(self, *, number=9): - super().__init__('LC'+str(number), bitLength=2) - self.number = number - - def mnemonic(self, index): - return ['LSB6', 'MSB6', 'UTF8', 'Signed'][index] - - def explanation(self, index): - return 'Context mode for type {}: {}({})'.format( - self.number, - index, - self.mnemonic(index)) - -class RLEmaxAlphabet(Enumerator): - """Used for describing the run length encoding used for describing context maps. - >>> RLEmaxAlphabet().showCode() - 0:1 1:more - """ - value0 = 0 - extraTable = [0, 4] - name = 'RLE#' - - def mnemonic(self, index): - return ['1', 'more'][index] - - def explanation(self, index, extra): - description = self.description and self.description+': ' - if index==0: return description+'No RLE coding' - return '{}xxxx 1: RLEMAX={}'.format(description, extra+1) - -class TreeAlphabet(WithExtra): - """The alphabet to enumerate entries (called trees) in the context map. - parameters are RLEMAX and NTREES - >>> t = TreeAlphabet('', RLEMAX=3, NTREES=5) - >>> len(t) - 8 - >>> print(t[2]) - xx+4 zeroes - >>> t[3].explanation(2) - '8+010=10 zeroes' - >>> t[0].value(0) - (1, 0) - """ - name = 'CMI' - def __init__(self, name=None, *, RLEMAX, NTREES, **args): - super().__init__(name, alphabetSize=RLEMAX+NTREES, **args) - self.RLEMAX = RLEMAX - self.NTREES = NTREES - - def extraBits(self, index): - if 0<index<=self.RLEMAX: return index - return 0 - - def mnemonic(self, index): - if index==0: return 'map #0' - if index<=self.RLEMAX: - return '{}+{} zeroes'.format('x'*index, 1<<index) - return 'map #{}'.format(index-self.RLEMAX) - - def value(self, index, extra): - """Give count and value.""" - index = index - if index==0: return 1, 0 - if index<=self.RLEMAX: return (1<<index)+extra, 0 - return 1, index-self.RLEMAX - - def explanation(self, index, extra): - description = self.description and self.description+': ' - if index==0: return description+'map #0' - if index<=self.RLEMAX: - return '{}+{:0{}b}={} zeroes'.format( - (1<<index), - extra, self.extraBits(index), - (1<<index)+extra) - return '{}map #{}-{}={}'.format( - description, - index, self.RLEMAX, index-self.RLEMAX) - -#Prefix alphabets for the data stream---------------------------------- -class LiteralAlphabet(Code): - """Alphabet of symbols. - """ - minLength = maxLength = 8 - def __init__(self, number): - super().__init__('L'+str(number), alphabetSize=1<<8) - - def mnemonic(self, index): - return outputCharFormatter(index) - - def value(self, index, extra=None): - return index - - def explanation(self, index, extra=None): - return self.mnemonic(index) - -class InsertLengthAlphabet(Enumerator): - """Intern code for insert counts - """ - value0 = 0 - extraTable = [0,0,0,0,0, 0,1,1,2,2, 3,3,4,4,5, 5,6,7,8,9, 10,12,14,24] - -class CopyLengthAlphabet(Enumerator): - value0 = 2 - extraTable = [0,0,0,0,0, 0,0,0,1,1, 2,2,3,3,4, 4,5,5,6,7, 8,9,10,24] - -class InsertAndCopyAlphabet(WithExtra): - """The insert and copy code - >>> for x in range(0,704,704//13): - ... print('{:10b}'.format(x), InsertAndCopyAlphabet()[x]) - 0 I0C2&D=0 - 110110 I6+xC8&D=0 - 1101100 I5C22+xxx&D=0 - 10100010 I4C4 - 11011000 I3C10+x - 100001110 I14+xxC8 - 101000100 I10+xxC22+xxx - 101111010 I98+xxxxxC14+xx - 110110000 I6+xC70+xxxxx - 111100110 I1090+[10*x]C8 - 1000011100 I26+xxxC326+[8*x] - 1001010010 I322+[8*x]C14+xx - 1010001000 I194+[7*x]C70+xxxxx - 1010111110 I22594+[24*x]C1094+[10*x] - """ - insertLengthAlphabet = InsertLengthAlphabet(None) - copyLengthAlphabet = CopyLengthAlphabet(None) - - def __init__(self, number=''): - super().__init__('IC'+str(number), bitLength=10) - - def __len__(self): - return 704 - - def extraBits(self, index): - insertSymbol, copySymbol, dist0 = self.splitSymbol(index) - return InsertLengthAlphabet.extraTable[insertSymbol.index] + \ - CopyLengthAlphabet.extraTable[copySymbol.index] - - def splitSymbol(self, index): - """Give relevant values for computations: - (insertSymbol, copySymbol, dist0flag) - """ - #determine insert and copy upper bits from table - row = [0,0,1,1,2,2,1,3,2,3,3][index>>6] - col = [0,1,0,1,0,1,2,0,2,1,2][index>>6] - #determine inserts and copy sub codes - insertLengthCode = row<<3 | index>>3&7 - if row: insertLengthCode -= 8 - copyLengthCode = col<<3 | index&7 - return ( - Symbol(self.insertLengthAlphabet, insertLengthCode), - Symbol(self.copyLengthAlphabet, copyLengthCode), - row==0 - ) - - def mnemonic(self, index): - """Make a nice mnemonic - """ - i,c,d0 = self.splitSymbol(index) - iLower, _ = i.code.span(i.index) - iExtra = i.extraBits() - cLower, _ = c.code.span(c.index) - cExtra = c.extraBits() - return 'I{}{}{}C{}{}{}{}'.format( - iLower, - '+' if iExtra else '', - 'x'*iExtra if iExtra<6 else '[{}*x]'.format(iExtra), - cLower, - '+' if cExtra else '', - 'x'*cExtra if cExtra<6 else '[{}*x]'.format(cExtra), - '&D=0' if d0 else '') - - def value(self, index, extra): - i,c,d0 = self.splitSymbol(index) - iExtra = i.extraBits() - ce, ie = extra>>iExtra, extra&(1<<iExtra)-1 - insert = i.value(ie) - copy = c.value(ce) - return insert, copy, d0 - - def explanation(self, index, extra): - insert, copy, d0 = self.value(index, extra) - if d0: return 'Literal: {}, copy: {}, same distance'.format(insert, copy) - else: return 'Literal: {}, copy: {}'.format(insert, copy) - -class DistanceAlphabet(WithExtra): - """Represent the distance encoding. - Dynamically generated alphabet. - This is what the documentation should have said: - Ignoring offsets for the moment, the "long" encoding works as follows: - Write the distance in binary as follows: - 1xy..yz..z, then the distance symbol consists of n..nxz..z - Where: - n is one less than number of bits in y - x is a single bit - y..y are n+1 extra bits (encoded in the bit stream) - z..z is NPOSTFIX bits that are part of the symbol - The offsets are so as to start at the lowest useable value: - if 1xyyyyz = distance +(4<<POSTFIX)-NDIRECT-1 - then n..nxz..z is symbol -NDIRECT-16 - >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10) - >>> print(d[4], d[17], d[34]) - last-1 1 10xx00-5 - >>> [str(d[x]) for x in range(26, 32)] - ['10x00-5', '10x01-5', '10x10-5', '10x11-5', '11x00-5', '11x01-5'] - """ - def __init__(self, number, *, NPOSTFIX, NDIRECT): - self.NPOSTFIX = NPOSTFIX - self.NDIRECT = NDIRECT - #set length - #Actually, not all symbols are used, - #only NDIRECT+16+(44-2*POSTFIX<<NPOSTFIX) - super().__init__('D'+str(number), - alphabetSize=self.NDIRECT+16+(48<<self.NPOSTFIX)) - - def extraBits(self, index): - """Indicate how many extra bits are needed to interpret symbol - >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10) - >>> [d[i].extraBits() for i in range(26)] - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] - >>> [d[i].extraBits() for i in range(26,36)] - [1, 1, 1, 1, 1, 1, 1, 1, 2, 2] - """ - if index<16+self.NDIRECT: return 0 - return 1 + ((index - self.NDIRECT - 16) >> (self.NPOSTFIX + 1)) - - def value(self, dcode, dextra): - """Decode value of symbol together with the extra bits. - >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10) - >>> d[34].value(2) - (0, 35) - """ - if dcode<16: - return [(1,0),(2,0),(3,0),(4,0), - (1,-1),(1,+1),(1,-2),(1,+2),(1,-3),(1,+3), - (2,-1),(2,+1),(2,-2),(2,+2),(2,-3),(2,+3) - ][dcode] - if dcode<16+self.NDIRECT: - return (0,dcode-16) - #we use the original formulas, instead of my clear explanation - POSTFIX_MASK = (1 << self.NPOSTFIX) - 1 - ndistbits = 1 + ((dcode - self.NDIRECT - 16) >> (self.NPOSTFIX + 1)) - hcode = (dcode - self.NDIRECT - 16) >> self.NPOSTFIX - lcode = (dcode - self.NDIRECT - 16) & POSTFIX_MASK - offset = ((2 + (hcode & 1)) << ndistbits) - 4 - distance = ((offset + dextra) << self.NPOSTFIX) + lcode + self.NDIRECT + 1 - return (0,distance) - - def mnemonic(self, index, verbose=False): - """Give mnemonic representation of meaning. - verbose compresses strings of x's - """ - if index<16: - return ['last', '2last', '3last', '4last', - 'last-1', 'last+1', 'last-2', 'last+2', 'last-3', 'last+3', - '2last-1', '2last+1', '2last-2', '2last+2', '2last-3', '2last+3' - ][index] - if index<16+self.NDIRECT: - return str(index-16) - #construct strings like "1xx01-15" - index -= self.NDIRECT+16 - hcode = index >> self.NPOSTFIX - lcode = index & (1<<self.NPOSTFIX)-1 - if self.NPOSTFIX: formatString = '1{0}{1}{2:0{3}b}{4:+d}' - else: formatString = '1{0}{1}{4:+d}' - return formatString.format( - hcode&1, - 'x'*(2+hcode>>1) if hcode<13 or verbose else '[{}*x]'.format(2+hcode>>1), - lcode, self.NPOSTFIX, - self.NDIRECT+1-(4<<self.NPOSTFIX)) - - def explanation(self, index, extra): - """ - >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10) - >>> d[55].explanation(13) - '11[1101]01-5: [0]+240' - """ - extraBits = self.extraBits(index) - extraString = '[{:0{}b}]'.format(extra, extraBits) - return '{0}: [{1[0]}]{1[1]:+d}'.format( - self.mnemonic(index, True).replace('x'*(extraBits or 1), extraString), - self.value(index, extra)) - -#Classes for doing actual work------------------------------------------ -class ContextModeKeeper: - """For computing the literal context mode. - You feed it characters, and it computes indices in the context map. - """ - def __init__(self, mode): - self.chars = deque([0,0], maxlen=2) - self.mode = mode - - def setContextMode(self, mode): - """Switch to given context mode (0..3)""" - self.mode = mode - def getIndex(self): - if self.mode==0: #LSB6 - return self.chars[1]&0x3f - elif self.mode==1: #MSB6 - return self.chars[1]>>2 - elif self.mode==2: #UTF8: character class of previous and a bit of the second - p2,p1 = self.chars - return self.lut0[p1]|self.lut1[p2] - elif self.mode==3: #Signed: initial bits of last two bytes - p2,p1 = self.chars - return self.lut2[p1]<<3|self.lut2[p2] - - def add(self, index): - """Adjust the context for output char (as int).""" - self.chars.append(index) - - #0: control #16: quote #32: ,:; #48: AEIOU - #4: tab/lf/cr #20: % #36: . #52: BC..Z - #8: space #24: (<[{ #40: = #56: aeiou - #12:!#$&*+-/?@| #28: )>]} #44: 0-9 #60: bc..z - lut0 = [0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 0, 0, 4, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 8, 12, 16, 12, 12, 20, 12, 16, 24, 28, 12, 12, 32, 12, 36, 12, - 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 32, 32, 24, 40, 28, 12, - 12, 48, 52, 52, 52, 48, 52, 52, 52, 48, 52, 52, 52, 52, 52, 48, - 52, 52, 52, 52, 52, 48, 52, 52, 52, 52, 52, 24, 12, 28, 12, 12, - 12, 56, 60, 60, 60, 56, 60, 60, 60, 56, 60, 60, 60, 60, 60, 56, - 60, 60, 60, 60, 60, 56, 60, 60, 60, 60, 60, 24, 12, 28, 12, 0 - ]+[0,1]*32+[2,3]*32 - #0: space 1:punctuation 2:digit/upper 3:lower - lut1 = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, - 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, - 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0 - ]+[0]*96+[2]*32 - #initial bits: 8*0, 4*0, 2*0, 1*0, 1*1, 2*1, 4*1, 8*1 - lut2 = [0]+[1]*15+[2]*48+[3]*64+[4]*64+[5]*48+[6]*15+[7] - assert len(lut0)==len(lut1)==len(lut2)==256 - -class WordList: - """Word list. - >>> WordList().word(7, 35555) - b'Program to ' - """ - NDBITS = [0, 0, 0, 0, 10, 10, 11, 11, 10, 10, - 10, 10, 10, 9, 9, 8, 7, 7, 8, 7, - 7, 6, 6, 5, 5] - def __init__(self): - self.file = open('dict', 'rb') - self.compileActions() - - def word(self, size, dist): - """Get word - """ - #split dist in index and action - ndbits = self.NDBITS[size] - index = dist&(1<<ndbits)-1 - action = dist>>ndbits - #compute position in file - position = sum(n<<self.NDBITS[n] for n in range(4,size))+size*index - self.file.seek(position) - return self.doAction(self.file.read(size), action) - - def upperCase1(self, word): - word = word.decode('utf8') - word = word[0].upper()+word[1:] - return word.encode('utf8') - - - #Super compact form of action table. - #_ means space, .U means UpperCaseAll, U(w) means UpperCaseFirst - actionTable = r""" - 0:w 25:w+_for_ 50:w+\n\t 75:w+. This_100:w+ize_ - 1:w+_ 26:w[3:] 51:w+: 76:w+, 101:w.U+. - 2:_+w+_ 27:w[:-2] 52:_+w+._ 77:.+w+_ 102:\xc2\xa0+w - 3:w[1:] 28:w+_a_ 53:w+ed_ 78:U(w)+( 103:_+w+, - 4:U(w)+_ 29:w+_that_ 54:w[9:] 79:U(w)+. 104:U(w)+=" - 5:w+_the_ 30:_+U(w) 55:w[7:] 80:w+_not_ 105:w.U+=" - 6:_+w 31:w+._ 56:w[:-6] 81:_+w+=" 106:w+ous_ - 7:s_+w+_ 32:.+w 57:w+( 82:w+er_ 107:w.U+,_ - 8:w+_of_ 33:_+w+,_ 58:U(w)+,_ 83:_+w.U+_ 108:U(w)+=\' - 9:U(w) 34:w[4:] 59:w[:-8] 84:w+al_ 109:_+U(w)+, - 10:w+_and_ 35:w+_with_ 60:w+_at_ 85:_+w.U 110:_+w.U+=" - 11:w[2:] 36:w+\' 61:w+ly_ 86:w+=\' 111:_+w.U+,_ - 12:w[:-1] 37:w+_from_ 62:_the_+w+_of_ 87:w.U+" 112:_+w.U+, - 13:,_+w+_ 38:w+_by_ 63:w[:-5] 88:U(w)+._ 113:w.U+( - 14:w+,_ 39:w[5:] 64:w[:-9] 89:_+w+( 114:w.U+._ - 15:_+U(w)+_ 40:w[6:] 65:_+U(w)+,_ 90:w+ful_ 115:_+w.U+. - 16:w+_in_ 41:_the_+w 66:U(w)+" 91:_+U(w)+._116:w.U+=\' - 17:w+_to_ 42:w[:-4] 67:.+w+( 92:w+ive_ 117:_+w.U+._ - 18:e_+w+_ 43:w+. The_ 68:w.U+_ 93:w+less_ 118:_+U(w)+=" - 19:w+" 44:w.U 69:U(w)+"> 94:w.U+\' 119:_+w.U+=\' - 20:w+. 45:w+_on_ 70:w+=" 95:w+est_ 120:_+U(w)+=\' - 21:w+"> 46:w+_as_ 71:_+w+. 96:_+U(w)+. - 22:w+\n 47:w+_is_ 72:.com/+w 97:w.U+"> - 23:w[:-3] 48:w[:-7] 98:_+w+=\' - 24:w+] 49:w[:-1]+ing_ 74:U(w)+\' 99:U(w)+, - """ - - def compileActions(self): - """Build the action table from the text above - """ - import re - self.actionList = actions = [None]*121 - #Action 73, which is too long, looks like this when expanded: - actions[73] = "b' the '+w+b' of the '" - #find out what the columns are - actionLines = self.actionTable.splitlines() - colonPositions = [m.start() - for m in re.finditer(':',actionLines[1]) - ]+[100] - columns = [(colonPositions[i]-3,colonPositions[i+1]-3) - for i in range(len(colonPositions)-1)] - for line in self.actionTable.splitlines(keepends=False): - for start,end in columns: - action = line[start:end] - #skip empty actions - if not action or action.isspace(): continue - #chop it up, and check if the colon is properly placed - index, colon, action = action[:3], action[3], action[4:] - assert colon==':' - #remove filler spaces at right - action = action.rstrip() - #replace space symbols - action = action.replace('_', ' ') - wPos = action.index('w') - #add quotes around left string when present - #translation: any pattern from beginning, up to - #(but not including) a + following by a w later on - action = re.sub(r"^(.*)(?=\+[U(]*w)", r"b'\1'", action) - #add quotes around right string when present - #translation: anything with a w in it, followed by a + - #and a pattern up to the end - #(there is no variable lookbehind assertion, - #so we have to copy the pattern) - action = re.sub(r"(w[[:\-1\]).U]*)\+(.*)$", r"\1+b'\2'", action) - #expand shortcut for uppercaseAll - action = action.replace(".U", ".upper()") - #store action - actions[int(index)] = action - - def doAction(self, w, action): - """Perform the proper action - """ - #set environment for the UpperCaseFirst - U = self.upperCase1 - return eval(self.actionList[action], locals()) - -class Layout: - """Class to layout the output. - """ - #display width of hexdata+bitdata - width = 25 - #general - def __init__(self, stream): - self.stream = stream - self.bitPtr = self.width - - def makeHexData(self, pos): - """Produce hex dump of all data containing the bits - from pos to stream.pos - """ - firstAddress = pos+7>>3 - lastAddress = self.stream.pos+7>>3 - return ''.join(map('{:02x} '.format, - self.stream.data[firstAddress:lastAddress])) - - def formatBitData(self, pos, width1, width2=0): - """Show formatted bit data: - Bytes are separated by commas - whole bytes are displayed in hex - >>> Layout(olleke).formatBitData(6, 2, 16) - '|00h|2Eh,|00' - >>> Layout(olleke).formatBitData(4, 1, 0) - '1' - """ - result = [] - #make empty prefix code explicit - if width1==0: result = ['()', ','] - for width in width1, width2: - #skip empty width2 - if width==0: continue - #build result backwards in a list - while width>0: - availableBits = 8-(pos&7) - if width<availableBits: - #read partial byte, beginning nor ending at boundary - data = self.stream.data[pos>>3] >> (pos&7) & (1<<width)-1 - result.append('{:0{}b}'.format(data, width)) - elif availableBits<8: - #read rest of byte, ending at boundary - data = self.stream.data[pos>>3] >> (pos&7) - result.append('|{:0{}b}'.format(data, availableBits)) - else: - #read whole byte (in hex), beginning and ending at boundary - data = self.stream.data[pos>>3] - result.append('|{:02X}h'.format(data)) - width -= availableBits - pos += availableBits - #if width overshot from the availableBits subtraction, fix it - pos += width - #add comma to separate fields - result.append(',') - #concatenate pieces, reversed, skipping the last space - return ''.join(result[-2::-1]) - - def readPrefixCode(self, alphabet): - """give alphabet the prefix code that is read from the stream - Called for the following alphabets, in this order: - The alphabet in question must have a "logical" order, - otherwise the assignment of symbols doesn't work. - """ - mode, numberOfSymbols = self.verboseRead(PrefixCodeHeader(alphabet.name)) - if mode=='Complex': - #for a complex code, numberOfSymbols means hskip - self.readComplexCode(numberOfSymbols, alphabet) - return alphabet - else: - table = [] - #Set table of lengths for mnemonic function - lengths = [[0], [1,1], [1,2,2], '????'][numberOfSymbols-1] - #adjust mnemonic function of alphabet class - def myMnemonic(index): - return '{} bit{}: {}'.format( - lengths[i], - '' if lengths[i]==1 else 's', - alphabet.__class__.mnemonic(alphabet, index) - ) - alphabet.mnemonic = myMnemonic - for i in range(numberOfSymbols): - table.append(self.verboseRead(alphabet, skipExtra=True).index) - #restore mnemonic - del alphabet.mnemonic - if numberOfSymbols==4: - #read tree shape to redefine lengths - lengths = self.verboseRead(TreeShapeAlhabet()) - #construct the alphabet prefix code - alphabet.setLength(dict(zip(table, lengths))) - return alphabet - - def readComplexCode(self, hskip, alphabet): - """Read complex code""" - stream = self.stream - #read the lengths for the length code - lengths = [1,2,3,4,0,5,17,6,16,7,8,9,10,11,12,13,14,15][hskip:] - codeLengths = {} - total = 0 - lol = LengthOfLengthAlphabet('##'+alphabet.name) - #lengthCode will be used for coding the lengths of the new code - #we use it for display until now; definition comes below - lengthCode = LengthAlphabet('#'+alphabet.name) - lengthIter = iter(lengths) - while total<32: - newSymbol = next(lengthIter) - lol.description = str(lengthCode[newSymbol]) - length = self.verboseRead(lol) - if length: - codeLengths[newSymbol] = length - total += 32>>length - if total>=32: - break - if total>32: raise ValueError("Stream format") - #Now set the encoding of the lengthCode - lengthCode.setLength(codeLengths) - print("***** Lengths for {} will be coded as:".format(alphabet.name)) - lengthCode.showCode() - #Now determine the symbol lengths with the lengthCode - symbolLengths = {} - total = 0 - lastLength = 8 - alphabetIter = iter(alphabet) - while total<32768: - #look ahead to see what is going to happen - length = lengthCode.decodePeek( - self.stream.peek(lengthCode.maxLength))[1].index - #in every branch, set lengthCode.description to explanatory text - #lengthCode calls format(symbol, extra) with this string - if length==0: - symbol = next(alphabetIter) - lengthCode.description = 'symbol {} unused'.format(symbol) - self.verboseRead(lengthCode) - #unused symbol - continue - if length==16: - lengthCode.description = \ - '{1}+3 symbols of length '+str(lastLength) - extra = self.verboseRead(lengthCode) - #scan series of 16s (repeat counts) - #start with repeat count 2 - repeat = 2 - startSymbol = next(alphabetIter) - endSymbol = next(alphabetIter) - symbolLengths[startSymbol.index] = \ - symbolLengths[endSymbol.index] = lastLength - #count the two just defined symbols - total += 2*32768>>lastLength - #note: loop may end because we're there - #even if a 16 _appears_ to follow - while True: - #determine last symbol - oldRepeat = repeat - repeat = (repeat-2<<2)+extra+3 - #read as many symbols as repeat increased - for i in range(oldRepeat, repeat): - endSymbol = next(alphabetIter) - symbolLengths[endSymbol.index] = lastLength - #compute new total; it may be end of loop - total += (repeat-oldRepeat)*32768>>lastLength - if total>=32768: break - #see if there is more to do - length = lengthCode.decodePeek( - self.stream.peek(lengthCode.maxLength))[1].index - if length!=16: break - lengthCode.description = 'total {}+{{1}} symbols'.format( - (repeat-2<<2)+3) - extra = self.verboseRead(lengthCode) - elif length==17: - #read, and show explanation - lengthCode.description = '{1}+3 unused' - extra = self.verboseRead(lengthCode) - #scan series of 17s (groups of zero counts) - #start with repeat count 2 - repeat = 2 - startSymbol = next(alphabetIter) - endSymbol = next(alphabetIter) - #note: loop will not end with total==32768, - #since total doesn't change here - while True: - #determine last symbol - oldRepeat = repeat - repeat = (repeat-2<<3)+extra+3 - #read as many symbols as repeat increases - for i in range(repeat-oldRepeat): - endSymbol = next(alphabetIter) - #see if there is more to do - length = lengthCode.decodePeek( - self.stream.peek(lengthCode.maxLength))[1].index - if length!=17: break - lengthCode.description = 'total {}+{{1}} unused'.format( - (repeat-2<<3)+3) - extra = self.verboseRead(lengthCode) - else: - symbol = next(alphabetIter) - #double braces for format - char = str(symbol) - if char in '{}': char *= 2 - lengthCode.description = \ - 'Length for {} is {{0.index}} bits'.format(char) - #output is not needed (will be 0) - self.verboseRead(lengthCode) - symbolLengths[symbol.index] = length - total += 32768>>length - lastLength = length - assert total==32768 - alphabet.setLength(symbolLengths) - print('End of table. Prefix code '+alphabet.name+':') - alphabet.showCode() - - #stream - def processStream(self): - """Process a brotli stream. - """ - print('addr hex{:{}s}binary context explanation'.format( - '', self.width-10)) - print('Stream header'.center(60, '-')) - self.windowSize = self.verboseRead(WindowSizeAlphabet()) - print('Metablock header'.center(60, '=')) - self.ISLAST = False - self.output = bytearray() - while not self.ISLAST: - self.ISLAST = self.verboseRead( - BoolCode('LAST', description="Last block")) - if self.ISLAST: - if self.verboseRead( - BoolCode('EMPTY', description="Empty block")): break - if self.metablockLength(): continue - if not self.ISLAST and self.uncompressed(): continue - print('Block type descriptors'.center(60, '-')) - self.numberOfBlockTypes = {} - self.currentBlockCounts = {} - self.blockTypeCodes = {} - self.blockCountCodes = {} - for blockType in (L,I,D): self.blockType(blockType) - print('Distance code parameters'.center(60, '-')) - self.NPOSTFIX, self.NDIRECT = self.verboseRead(DistanceParamAlphabet()) - self.readLiteralContextModes() - print('Context maps'.center(60, '-')) - self.cmaps = {} - #keep the number of each kind of prefix tree for the last loop - numberOfTrees = {I: self.numberOfBlockTypes[I]} - for blockType in (L,D): - numberOfTrees[blockType] = self.contextMap(blockType) - print('Prefix code lists'.center(60, '-')) - self.prefixCodes = {} - for blockType in (L,I,D): - self.readPrefixArray(blockType, numberOfTrees[blockType]) - self.metablock() - - #metablock header - def verboseRead(self, alphabet, context='', skipExtra=False): - """Read symbol and extra from stream and explain what happens. - Returns the value of the symbol - >>> olleke.pos = 0 - >>> l = Layout(olleke) - >>> l.verboseRead(WindowSizeAlphabet()) - 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288 - 4194288 - """ - #TODO 2: verbosity level, e.g. show only codes and maps in header - stream = self.stream - pos = stream.pos - if skipExtra: - length, symbol = alphabet.readTuple(stream) - extraBits, extra = 0, None - else: - length, symbol, extraBits, extra = alphabet.readTupleAndExtra( - stream) - #fields: address, hex data, binary data, name of alphabet, explanation - hexdata = self.makeHexData(pos) - addressField = '{:04x}'.format(pos+7>>3) if hexdata else '' - bitdata = self.formatBitData(pos, length, extraBits) - #bitPtr moves bitdata so that the bytes are easier to read - #jump back to right if a new byte starts - if '|' in bitdata[1:]: - #start over on the right side - self.bitPtr = self.width - fillWidth = self.bitPtr-(len(hexdata)+len(bitdata)) - if fillWidth<0: fillWidth = 0 - print('{:<5s} {:<{}s} {:7s} {}'.format( - addressField, - hexdata+' '*fillWidth+bitdata, self.width, - context+alphabet.name, - symbol if skipExtra else symbol.explanation(extra), - )) - #jump to the right if we started with a '|' - #because we didn't jump before printing - if bitdata.startswith('|'): self.bitPtr = self.width - else: self.bitPtr -= len(bitdata) - return symbol if skipExtra else symbol.value(extra) - - def metablockLength(self): - """Read MNIBBLES and meta block length; - if empty block, skip block and return true. - """ - self.MLEN = self.verboseRead(MetablockLengthAlphabet()) - if self.MLEN: - return False - #empty block; skip and return False - self.verboseRead(ReservedAlphabet()) - MSKIP = self.verboseRead(SkipLengthAlphabet()) - self.verboseRead(FillerAlphabet(streamPos=self.stream.pos)) - self.stream.pos += 8*MSKIP - print("Skipping to {:x}".format(self.stream.pos>>3)) - return True - - def uncompressed(self): - """If true, handle uncompressed data - """ - ISUNCOMPRESSED = self.verboseRead( - BoolCode('UNCMPR', description='Is uncompressed?')) - if ISUNCOMPRESSED: - self.verboseRead(FillerAlphabet(streamPos=self.stream.pos)) - print('Uncompressed data:') - self.output += self.stream.readBytes(self.MLEN) - print(outputFormatter(self.output[-self.MLEN:])) - return ISUNCOMPRESSED - - def blockType(self, kind): - """Read block type switch descriptor for given kind of blockType.""" - NBLTYPES = self.verboseRead(TypeCountAlphabet( - 'BT#'+kind[0].upper(), - description='{} block types'.format(kind), - )) - self.numberOfBlockTypes[kind] = NBLTYPES - if NBLTYPES>=2: - self.blockTypeCodes[kind] = self.readPrefixCode( - BlockTypeAlphabet('BT'+kind[0].upper(), NBLTYPES)) - self.blockCountCodes[kind] = self.readPrefixCode( - BlockCountAlphabet('BC'+kind[0].upper())) - blockCount = self.verboseRead(self.blockCountCodes[kind]) - else: - blockCount = 1<<24 - self.currentBlockCounts[kind] = blockCount - - def readLiteralContextModes(self): - """Read literal context modes. - LSB6: lower 6 bits of last char - MSB6: upper 6 bits of last char - UTF8: rougly dependent on categories: - upper 4 bits depend on category of last char: - control/whitespace/space/ punctuation/quote/%/open/close/ - comma/period/=/digits/ VOWEL/CONSONANT/vowel/consonant - lower 2 bits depend on category of 2nd last char: - space/punctuation/digit or upper/lowercase - signed: hamming weight of last 2 chars - """ - print('Context modes'.center(60, '-')) - self.literalContextModes = [] - for i in range(self.numberOfBlockTypes[L]): - self.literalContextModes.append( - self.verboseRead(LiteralContextMode(number=i))) - - def contextMap(self, kind): - """Read context maps - Returns the number of differnt values on the context map - (In other words, the number of prefix trees) - """ - NTREES = self.verboseRead(TypeCountAlphabet( - kind[0].upper()+'T#', - description='{} prefix trees'.format(kind))) - mapSize = {L:64, D:4}[kind] - if NTREES<2: - self.cmaps[kind] = [0]*mapSize - else: - #read CMAPkind - RLEMAX = self.verboseRead(RLEmaxAlphabet( - 'RLE#'+kind[0].upper(), - description=kind+' context map')) - alphabet = TreeAlphabet('CM'+kind[0].upper(), NTREES=NTREES, RLEMAX=RLEMAX) - cmapCode = self.readPrefixCode(alphabet) - tableSize = mapSize*self.numberOfBlockTypes[kind] - cmap = [] - while len(cmap)<tableSize: - cmapCode.description = 'map {}, entry {}'.format( - *divmod(len(cmap), mapSize)) - count, value = self.verboseRead(cmapCode) - cmap.extend([value]*count) - assert len(cmap)==tableSize - IMTF = self.verboseRead(BoolCode('IMTF', description='Apply inverse MTF')) - if IMTF: - self.IMTF(cmap) - if kind==L: - print('Context maps for literal data:') - for i in range(0, len(cmap), 64): - print(*( - ''.join(map(str, cmap[j:j+8])) - for j in range(i, i+64, 8) - )) - else: - print('Context map for distances:') - print(*( - ''.join(map('{:x}'.format, cmap[i:i+4])) - for i in range(0, len(cmap), 4) - )) - self.cmaps[kind] = cmap - return NTREES - - @staticmethod - def IMTF(v): - """In place inverse move to front transform. - """ - #mtf is initialized virtually with range(infinity) - mtf = [] - for i, vi in enumerate(v): - #get old value from mtf. If never seen, take virtual value - try: value = mtf.pop(vi) - except IndexError: value = vi - #put value at front - mtf.insert(0, value) - #replace transformed value - v[i] = value - - def readPrefixArray(self, kind, numberOfTrees): - """Read prefix code array""" - prefixes = [] - for i in range(numberOfTrees): - if kind==L: alphabet = LiteralAlphabet(i) - elif kind==I: alphabet = InsertAndCopyAlphabet(i) - elif kind==D: alphabet = DistanceAlphabet( - i, NPOSTFIX=self.NPOSTFIX, NDIRECT=self.NDIRECT) - self.readPrefixCode(alphabet) - prefixes.append(alphabet) - self.prefixCodes[kind] = prefixes - - #metablock data - def metablock(self): - """Process the data. - Relevant variables of self: - numberOfBlockTypes[kind]: number of block types - currentBlockTypes[kind]: current block types (=0) - literalContextModes: the context modes for the literal block types - currentBlockCounts[kind]: counters for block types - blockTypeCodes[kind]: code for block type - blockCountCodes[kind]: code for block count - cmaps[kind]: the context maps (not for I) - prefixCodes[kind][#]: the prefix codes - lastDistances: the last four distances - lastChars: the last two chars - output: the result - """ - print('Meta block contents'.center(60, '=')) - self.currentBlockTypes = {L:0, I:0, D:0, pL:1, pI:1, pD:1} - self.lastDistances = deque([17,16,11,4], maxlen=4) - #the current context mode is for block type 0 - self.contextMode = ContextModeKeeper(self.literalContextModes[0]) - wordList = WordList() - - #setup distance callback function - def distanceCallback(symbol, extra): - "callback function for displaying decoded distance" - index, offset = symbol.value(extra) - if index: - #recent distance - distance = self.lastDistances[-index]+offset - return 'Distance: {}last{:+d}={}'.format(index, offset, distance) - #absolute value - if offset<=maxDistance: - return 'Absolute value: {} (pos {})'.format(offset, maxDistance-offset) - #word list value - action, word = divmod(offset-maxDistance, 1<<wordList.NDBITS[copyLen]) - return '{}-{} gives word {},{} action {}'.format( - offset, maxDistance, copyLen, word, action) - for dpc in self.prefixCodes[D]: dpc.callback = distanceCallback - - blockLen = 0 - #there we go - while blockLen<self.MLEN: - #get insert© command - litLen, copyLen, dist0Flag = self.verboseRead( - self.prefixCodes[I][ - self.figureBlockType(I)]) - #literal data - for i in range(litLen): - bt = self.figureBlockType(L) - cm = self.contextMode.getIndex() - ct = self.cmaps[L][bt<<6|cm] - char = self.verboseRead( - self.prefixCodes[L][ct], - context='{},{}='.format(bt,cm)) - self.contextMode.add(char) - self.output.append(char) - blockLen += litLen - #check if we're done - if blockLen>=self.MLEN: return - #distance - #distances are computed relative to output length, at most window size - maxDistance = min(len(self.output), self.windowSize) - if dist0Flag: - distance = self.lastDistances[-1] - else: - bt = self.figureBlockType(D) - cm = {2:0, 3:1, 4:2}.get(copyLen, 3) - ct = self.cmaps[D][bt<<2|cm] - index, offset = self.verboseRead( - self.prefixCodes[D][ct], - context='{},{}='.format(bt,cm)) - distance = self.lastDistances[-index]+offset if index else offset - if index==1 and offset==0: - #to make sure distance is not put in last distance list - dist0Flag = True - if distance<=maxDistance: - #copy from output - for i in range( - maxDistance-distance, - maxDistance-distance+copyLen): - self.output.append(self.output[i]) - if not dist0Flag: self.lastDistances.append(distance) - comment = 'Seen before' - else: - #fetch from wordlist - newWord = wordList.word(copyLen, distance-maxDistance-1) - self.output.extend(newWord) - #adjust copyLen to reflect actual new data - copyLen = len(newWord) - comment = 'From wordlist' - blockLen += copyLen - print(' '*40, - comment, - ': "', - outputFormatter(self.output[-copyLen:]), - '"', - sep='') - self.contextMode.add(self.output[-2]) - self.contextMode.add(self.output[-1]) - - def figureBlockType(self, kind): - counts, types = self.currentBlockCounts, self.currentBlockTypes - if counts[kind]==0: - newType = self.verboseRead(self.blockTypeCodes[kind]) - if newType==-2: newType = types['P'+kind] - elif newType==-1: - newType = (types[kind]+1)%self.numberOfBlockTypes[kind] - types['P'+kind] = types[kind] - types[kind] = newType - counts[kind] = self.verboseRead(self.blockCountCodes[kind]) - counts[kind] -=1 - return types[kind] - -__test__ = { -'BitStream': """ - >>> bs = BitStream(b'Jurjen') - >>> bs.readBytes(2) - b'Ju' - >>> bs.read(6) #r=01110010 - 50 - >>> bs - BitStream(pos=2:6) - >>> bs.peek(5) #j=01101010 - 9 - >>> bs.readBytes(2) - Traceback (most recent call last): - ... - ValueError: readBytes: need byte boundary - """, - -'Symbol': """ - >>> a=Symbol(MetablockLengthAlphabet(),5) - >>> len(a) - 2 - >>> int(a) - 5 - >>> a.bitPattern() - '01' - >>> a.value(200000) - 200001 - >>> a.explanation(300000) - 'data length: 493e0h+1=300001' - """, - -'RangeDecoder': """ - >>> a=RangeDecoder(bitLength=3) - >>> len(a) - 8 - >>> a.name='t' - >>> list(a) - [Symbol(t, 0), Symbol(t, 1), Symbol(t, 2), Symbol(t, 3), Symbol(t, 4), Symbol(t, 5), Symbol(t, 6), Symbol(t, 7)] - >>> a[2] - Symbol(t, 2) - >>> a.bitPattern(4) - '100' - >>> a.length(2) - 3 - >>> a.decodePeek(15) - (3, Symbol(t, 7)) - >>> - - """, - -'PrefixDecoder': """ - >>> a=PrefixDecoder(decodeTable={0:1,1:2,3:3,7:4}) - >>> len(a) - 4 - >>> a.name='t' - >>> list(a) - [Symbol(t, 1), Symbol(t, 2), Symbol(t, 3), Symbol(t, 4)] - >>> a.decodePeek(22) - (1, Symbol(t, 1)) - >>> a.decodePeek(27) - (3, Symbol(t, 3)) - >>> a.length(1) - 1 - >>> a.length(4) - 3 - """, - -'Code': """ - >>> a=Code('t',alphabetSize=10) - >>> len(a) - 10 - >>> a.showCode() - 0000:0 0001:1 0010:2 0011:3 0100:4 0101:5 0110:6 0111:7 1000:8 1001:9 - >>> a.setLength({2:1,3:2,5:3,6:3}) - >>> a.showCode() - 0:2 01:3 011:5 111:6 - >>> len(a) - 4 - >>> def callback(i): return 'call{}back'.format(i) - >>> a=Code('t',callback=callback,bitLength=3) - >>> a[6].explanation() - 'call6back' - """, - -'WithExtra': """ - >>> class A(WithExtra): - ... extraTable = [0,1,1,2,2] - >>> a=A('t',alphabetSize=5) - >>> a[1] - Symbol(t, 1) - >>> a.extraBits(2) - 1 - >>> a.mnemonic(4) - '4' - >>> a.readTupleAndExtra(BitStream(b'\x5b')) - (3, Symbol(t, 3), 2, 3) - """, - -'BoolCode': """ - >>> BoolCode('test')[0].explanation() - '0: False' - """, - -'Enumerator': """ - >>> class A(Enumerator): - ... extraTable = [0,1,1,2,2] - ... value0=3 - >>> a=A(alphabetLength=5) - >>> a.value(3) - Traceback (most recent call last): - ... - TypeError: value() missing 1 required positional argument: 'extra' - >>> a.explanation(3,4) - 'xx 011: 8-11; 8+4=12' - """, - -'WindowSizeAlphabet': """ - >>> windowSizeAlphabet = WindowSizeAlphabet() - >>> windowSizeAlphabet[0] - Traceback (most recent call last): - ... - ValueError: No symbol WindowSizeAlphabet[0] - >>> len(windowSizeAlphabet) - 16 - >>> windowSizeAlphabet[21] - Symbol(WSIZE, 21) - >>> windowSizeAlphabet[21].bitPattern() - '1001' - >>> windowSizeAlphabet[21].extraBits() - 0 - >>> windowSizeAlphabet[21].index - 21 - >>> windowSizeAlphabet[10].value() - 1008 - >>> windowSizeAlphabet[10].explanation() - 'windowsize=(1<<10)-16=1008' - >>> windowSizeAlphabet.showCode() - 0:65520 1100001:16368 1110001:32752 0011:262128 - 0000001:131056 0010001:None 1001:2097136 1011:4194288 - 1000001:4080 1010001:8176 0101:524272 0111:1048560 - 0100001:1008 0110001:2032 1101:8388592 1111:16777200 - """, - -'TypeCountAlphabet': """ - >>> typeCountAlphabet = TypeCountAlphabet(description='bananas') - >>> len(typeCountAlphabet) - 9 - >>> typeCountAlphabet[3] - Symbol(BT#, 3) - >>> typeCountAlphabet[9] - Traceback (most recent call last): - ... - ValueError: No symbol TypeCountAlphabet[9] - >>> print(typeCountAlphabet[3]) - xx,0101 - >>> typeCountAlphabet[8].value(127) - 256 - >>> typeCountAlphabet[4].explanation(2) - 'xxx,0111: 11 bananas' - >>> typeCountAlphabet[0].explanation() - '0: 1 banana' - """, - -'DistanceParamAlphabet': """ - >>> dpa = DistanceParamAlphabet() - >>> dpa.showCode() - 00:PF0 01:PF1 10:PF2 11:PF3 - >>> dpa.readTupleAndExtra(BitStream(b'\\x29')) - (2, Symbol(DIST, 1), 4, 10) - >>> dpa.explanation(2, 5) - '2 postfix bits and 0101<<2=20 direct codes' - """, - -'LiteralAlphabet': """ - >>> LiteralAlphabet(-1).showCode() #doctest: +ELLIPSIS - 00000000:\\x00 00110100:4 01101000:h 10011100:\\x9c 11010000:\\xd0 - 00000001:\\x01 00110101:5 01101001:i 10011101:\\x9d 11010001:\\xd1 - 00000010:\\x02 00110110:6 01101010:j 10011110:\\x9e 11010010:\\xd2 - ... - 00101111:/ 01100011:c 10010111:\\x97 11001011:\\xcb 11111111:\\xff - 00110000:0 01100100:d 10011000:\\x98 11001100:\\xcc - 00110001:1 01100101:e 10011001:\\x99 11001101:\\xcd - 00110010:2 01100110:f 10011010:\\x9a 11001110:\\xce - 00110011:3 01100111:g 10011011:\\x9b 11001111:\\xcf - """, - -'BlockCountAlphabet': """ - >>> bc=BlockCountAlphabet('BCL') - >>> len(bc) - 26 - >>> bs=BitStream(b'\\x40\\x83\\xc8\\x59\\12\\x02') - >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3]) - 'Block count: xx 00000: 1-4; 1+2=3' - >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3]) - 'Block count: xxx 00110: 33-40; 33+0=33' - >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3]) - 'Block count: xxxxxx 10001: 305-368; 305+28=333' - >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3]) - 'Block count: xxxxxxxxxxx 10110: 2289-4336; 2289+1044=3333' - """, - -'Layout': """ - >>> olleke.pos = 0 - >>> l = Layout(olleke) - >>> l.verboseRead(WindowSizeAlphabet()) - 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288 - 4194288 - >>> l.verboseRead(BoolCode('LAST', description="Last block")) - 1 LAST Last block: 1: True - True - >>> l.verboseRead(BoolCode('EMPTY', description="Empty block")) - 0 EMPTY Empty block: 0: False - False - >>> l.verboseRead(MetablockLengthAlphabet()) - 0001 2e 00 |00h|2Eh,|00 MLEN data length: 002eh+1=47 - 47 - >>> olleke.pos = 76 - >>> l = Layout(olleke) - >>> x = l.verboseRead(DistanceAlphabet(0,NPOSTFIX=0,NDIRECT=0), skipExtra=True) - 000a 82 10|1100 D0 10[15*x]-3 - >>> x.explanation(0x86a3) - '10[1000011010100011]-3: [0]+100000' - """, - -'olleke': """ - >>> olleke.pos = 0 - >>> try: Layout(olleke).processStream() - ... except NotImplementedError: pass - ... #doctest: +REPORT_NDIFF - addr hex binary context explanation - -----------------------Stream header------------------------ - 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288 - ======================Metablock header====================== - 1 LAST Last block: 1: True - 0 EMPTY Empty block: 0: False - 0001 2e 00 |00h|2Eh,|00 MLEN data length: 002eh+1=47 - -------------------Block type descriptors------------------- - 0003 00 0 BT#L 0: 1 literal block type - 0 BT#I 0: 1 insert© block type - 0 BT#D 0: 1 distance block type - ------------------Distance code parameters------------------ - 0004 44 0|000,00 DIST 0 postfix bits and 0000<<0=0 direct codes - -----------------------Context modes------------------------ - 10 LC0 Context mode for type 0: 2(UTF8) - ------------------------Context maps------------------------ - 0 LT# 0: 1 literal prefix tree - 0 DT# 0: 1 distance prefix tree - ---------------------Prefix code lists---------------------- - 10 PFX L0 is complex with lengths 3,4,0,5,17... - 0005 4f 1|0 ##L0 len 3: coded with 3 bits - 0111 ##L0 len 4: coded with 1 bits - 10 ##L0 unused: coded with 3 bits - 0006 d6 0|0 ##L0 len 5: skipped - 011 ##L0 zero xxx: coded with 2 bits - ***** Lengths for L0 will be coded as: - 0:len 4 01:zero xxx 011:unused 111:len 3 - 0007 95 1|11,01 #L0 7+3 unused - 0 #L0 Length for \\n is 4 bits - 001,01 #L0 1+3 unused - 0008 44 010,0|1 #L0 total 19+2 unused - 0 #L0 Length for " " is 4 bits - 0 #L0 Length for ! is 4 bits - 0009 cb 011,|01 #L0 3+3 unused - |110,01 #L0 total 35+6 unused - 000a 82 0 #L0 Length for K is 4 bits - 000,01 #L0 0+3 unused - 0 #L0 Length for O is 4 bits - 000b 4d 01|1 #L0 symbol P unused - 011 #L0 symbol Q unused - 0 #L0 Length for R is 4 bits - 000c 88 000,|01 #L0 0+3 unused - |100,01 #L0 total 11+4 unused - 000d b6 0 #L0 Length for b is 4 bits - 011 #L0 symbol c unused - 011 #L0 symbol d unused - 000e 27 11|1 #L0 Length for e is 3 bits - 010,01 #L0 2+3 unused - |0 #L0 Length for k is 4 bits - 000f 1f 111 #L0 Length for l is 3 bits - 011 #L0 symbol m unused - 0 #L0 Length for n is 4 bits - |0 #L0 Length for o is 4 bits - 0010 c1 000,01 #L0 0+3 unused - 0 #L0 Length for s is 4 bits - 0011 b4 0|11 #L0 symbol t unused - 0 #L0 Length for u is 4 bits - End of table. Prefix code L0: - 000:e 0010:\\n 0110:! 0001:O 0101:b 0011:n 0111:s - 100:l 1010:" " 1110:K 1001:R 1101:k 1011:o 1111:u - 11,01 PFX IC0 is simple with 4 code words - 0012 2a |2Ah|10 IC0 ? bits: I5C4 - 0013 b5 ec 00|B5h IC0 ? bits: I6+xC7 - 0015 22 0010|111011 IC0 ? bits: I8+xC5 - 0016 8c 001100|0010 IC0 ? bits: I0C14+xx - 0 SHAPE False: lengths 2,2,2,2 - 0017 74 10,0|1 PFX D0 is simple with 3 code words - 0018 a6 0|01110 D0 1 bit: 2last-3 - 010011 D0 2 bits: 11xx-3 - 0019 aa 01010|1 D0 2 bits: 11xxx-3 - ====================Meta block contents===================== - |1,01 IC0 Literal: 9, copy: 5 - 001a 41 0001 0,0=L0 O - 100 0,48=L0 l - 001b a2 10|0 0,62=L0 l - 000 0,63=L0 e - 001c a1 1|101 0,59=L0 k - 000 0,63=L0 e - |1010 0,59=L0 " " - 001d b5 0101 0,11=L0 b - |1011 0,60=L0 o - 001e 24 0 0,3=D0 Distance: 2last-3=8 - Seen before: "lleke" - 0,10 IC0 Literal: 6, copy: 7 - |0010 0,59=L0 \\n - 001f 89 1001 0,7=L0 R - 000 0,52=L0 e - 0020 fa 010|1 0,58=L0 b - 1111 0,63=L0 u - 0021 eb 011|1 0,59=L0 s - 11,01 0,3=D0 Absolute value: 12 (pos 8) - Seen before: "olleke\\n" - 0022 db 01,1|1 IC0 Literal: 0, copy: 15 - |110,11 0,3=D0 Absolute value: 27 (pos 0) - Seen before: "Olleke bolleke\\n" - 0023 f8 00 IC0 Literal: 5, copy: 4 - 1110 0,7=L0 K - 0024 2c 00|11 0,52=L0 n - 1011 0,62=L0 o - 0025 0d 1|00 0,59=L0 l - 0110 0,63=L0 ! - """, - -'file': """ - >>> try: Layout(BitStream( - ... open("H:/Downloads/brotli-master/tests/testdata/10x10y.compressed",'rb') - ... .read())).processStream() - ... except NotImplementedError: pass - addr hex binary context explanation - -----------------------Stream header------------------------ - 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288 - ======================Metablock header====================== - 1 LAST Last block: 1: True - 0 EMPTY Empty block: 0: False - 0001 13 00 |00h|13h,|00 MLEN data length: 0013h+1=20 - -------------------Block type descriptors------------------- - 0003 00 0 BT#L 0: 1 literal block type - 0 BT#I 0: 1 insert© block type - 0 BT#D 0: 1 distance block type - ------------------Distance code parameters------------------ - 0004 a4 0|000,00 DIST 0 postfix bits and 0000<<0=0 direct codes - -----------------------Context modes------------------------ - 10 LC0 Context mode for type 0: 2(UTF8) - ------------------------Context maps------------------------ - 0 LT# 0: 1 literal prefix tree - 0 DT# 0: 1 distance prefix tree - ---------------------Prefix code lists---------------------- - 0005 b0 0|1,01 PFX L0 is simple with 2 code words - 0006 b2 0|1011000 L0 1 bit: X - 0007 ea 0|1011001 L0 1 bit: Y - 01,01 PFX IC0 is simple with 2 code words - 0008 81 0000001|111 IC0 1 bit: I1C9&D=0 - 0009 47 02 0|47h|1 IC0 1 bit: I1C9 - 00,01 PFX D0 is simple with 1 code word - 000b 8a 010|000 D0 0 bits: 10x-3 - ====================Meta block contents===================== - 1 IC0 Literal: 1, copy: 9 - 0 0,0=L0 X - 0,() 0,3=D0 Absolute value: 1 (pos 0) - Seen before: "XXXXXXXXX" - 0 IC0 Literal: 1, copy: 9, same distance - |1 0,54=L0 Y - Seen before: "YYYYYYYYY" - """, - -'XY': """ - >>> try: Layout(BitStream(brotli.compress('X'*10+'Y'*10))).processStream() - ... except NotImplementedError: pass - addr hex binary context explanation - -----------------------Stream header------------------------ - 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288 - ======================Metablock header====================== - 1 LAST Last block: 1: True - 0 EMPTY Empty block: 0: False - 0001 13 00 |00h|13h,|00 MLEN data length: 0013h+1=20 - -------------------Block type descriptors------------------- - 0003 00 0 BT#L 0: 1 literal block type - 0 BT#I 0: 1 insert© block type - 0 BT#D 0: 1 distance block type - ------------------Distance code parameters------------------ - 0004 a4 0|000,00 DIST 0 postfix bits and 0000<<0=0 direct codes - -----------------------Context modes------------------------ - 10 LC0 Context mode for type 0: 2(UTF8) - ------------------------Context maps------------------------ - 0 LT# 0: 1 literal prefix tree - 0 DT# 0: 1 distance prefix tree - ---------------------Prefix code lists---------------------- - 0005 b0 0|1,01 PFX L0 is simple with 2 code words - 0006 b2 0|1011000 L0 1 bit: X - 0007 82 0|1011001 L0 1 bit: Y - 00,01 PFX IC0 is simple with 1 code word - 0008 84 0000100|100 IC0 0 bits: I4C6&D=0 - 0009 00 00,0|1 PFX D0 is simple with 1 code word - 000a e0 0|00000 D0 0 bits: last - ====================Meta block contents===================== - () IC0 Literal: 4, copy: 6, same distance - 0 0,0=L0 X - 0 0,52=L0 X - 0 0,54=L0 X - 0 0,54=L0 X - Seen before: "XXXXXX" - () IC0 Literal: 4, copy: 6, same distance - 1 0,54=L0 Y - 1 0,54=L0 Y - |1 0,54=L0 Y - 000b 01 1 0,54=L0 Y - Seen before: "YYYYYY" - """, - -'empty': """ - >>> try: Layout(BitStream(b'\\x81\\x16\\x00\\x58')).processStream() - ... except NotImplementedError: pass - addr hex binary context explanation - -----------------------Stream header------------------------ - 0000 81 0000001 WSIZE windowsize=(1<<17)-16=131056 - ======================Metablock header====================== - |1 LAST Last block: 1: True - 0001 16 0 EMPTY Empty block: 0: False - 11 MLEN 11: empty block - 0 RSVD Reserved (must be zero) - 0002 00 000000|00,01 SKIP skip length: 0h+1=1 - |00 SKIP 2 bits ignored - Skipping to 4 - """, - -} - -if __name__=='__main__': - import sys - if len(sys.argv)>1: - l = Layout(BitStream(open(sys.argv[1],'rb').read())) - l.processStream() - else: - sys.path.append("h:/Persoonlijk/bin") - try: - import brotli - open('brotlidump.br', 'wb').write( - brotli.compress( - open('brotlidump.py', 'r').read() - )) - olleke = BitStream(brotli.compress( - 'Olleke bolleke\nRebusolleke\nOlleke bolleke\nKnol!')) - except ImportError: pass - import doctest - doctest.testmod(optionflags=doctest.REPORT_NDIFF - #|doctest.FAIL_FAST - ) |