From fd96151b2a04667dac73242b72bc62e007b2b543 Mon Sep 17 00:00:00 2001 From: Eugene Kliuchnikov Date: Tue, 20 Dec 2016 18:00:51 +0100 Subject: Move brotlidump.py to research/ (#487) --- research/brotlidump.py | 2361 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 2361 insertions(+) create mode 100644 research/brotlidump.py (limited to 'research') diff --git a/research/brotlidump.py b/research/brotlidump.py new file mode 100644 index 0000000..a625368 --- /dev/null +++ b/research/brotlidump.py @@ -0,0 +1,2361 @@ +#!python3 +"""Program to dump contents of Brotli compressed files showing the compression format. +Jurjen N.E. Bos, 2016. +I found the following issues with the Brotli format: +- The distance alphabet has size 16+(48<>3, bit pos is pos&7 + self.pos = 0 + + def __repr__(self): + """Representation + >>> olleke + BitStream(pos=0:0) + """ + return "BitStream(pos={:x}:{})".format(self.pos>>3, self.pos&7) + + def read(self, n): + """Read n bits from the stream and return as an integer. + Produces zero bits beyond the stream. + >>> olleke.data[0]==27 + True + >>> olleke.read(5) + 27 + + >>> olleke + BitStream(pos=0:5) + """ + value = self.peek(n) + self.pos += n + if self.pos>len(self.data)*8: + raise ValueError('Read past end of stream') + return value + + def peek(self, n): + """Peek an n bit integer from the stream without updating the pointer. + It is not an error to read beyond the end of the stream. + >>> olleke.data[:2]==b'\x1b\x2e' and 0x2e1b==11803 + True + >>> olleke.peek(15) + 11803 + >>> hex(olleke.peek(32)) + '0x2e1b' + """ + #read bytes that contain the data: self.data[self.pos>>3:self.pos+n+7>>3] + #convert to int: int.from_bytes(..., 'little') + #shift out the bits from the first byte: >>(self.pos&7) + #mask unwanted bits: & (1<>3:self.pos+n+7>>3], + 'little')>>(self.pos&7) & (1<>3:(self.pos>>3)+n] + self.pos += 8*n + return result + +#-----------------------Symbol------------------------------------------- +class Symbol: + """A symbol in a code. + Refers back to the code that contains it. + Index is the place in the alphabet of the symbol. + """ + __slots__ = 'code', 'index' + def __init__(self, code, value): + self.code = code + self.index = value + + def __repr__(self): + return 'Symbol({}, {})'.format(self.code.name, self.index) + + def __len__(self): + """Number of bits in the prefix notation of this symbol + """ + return self.code.length(self.index) + + def __int__(self): + return self.index + + #these routines call equivalent routine in Code class + def bitPattern(self): + """Value of the symbol in the stream + """ + return self.code.bitPattern(self.index) + + def extraBits(self): + """Number of extra bits to read for this symbol + """ + return self.code.extraBits(self.index) + + def __str__(self): + """Short descriptor of the symbol without extra bits. + """ + return self.code.mnemonic(self.index) + + #requiring optional extra bits, if self.code supports them + def value(self, extra=None): + """The value used for processing. Can be a tuple. + with optional extra bits + """ + if isinstance(self.code, WithExtra): + if not 0<=extra<1<=self.alphabetSize: raise ValueError('index out of range') + return Symbol(self, index) + + def bitPattern(self, index): + return '{:0{}b}'.format(index, self.maxLength) + + def length(self, index): + """Encoding length of given symbol. + Does not depend on index in this case. + """ + return self.maxLength + + def decodePeek(self, data): + """Find which symbol index matches the given data (from peek, as a number) + and return the number of bits decoded. + Can also be used to figure out length of a symbol. + """ + return self.maxLength, Symbol(self, data&(1<>1 + #note lo<=mid>> olleke.pos = 6 + >>> MetablockLengthAlphabet().readTupleAndExtra(olleke) + (2, Symbol(MLEN, 4), 16, 46) + """ + length, symbol = self.decodePeek(stream.peek(self.maxLength)) + stream.pos += length + extraBits = self.extraBits(symbol.index) + return length, symbol, extraBits, stream.read(extraBits) + + def explanation(self, index, extra=None): + """Expanded version of Code.explanation supporting extra bits. + If you don't supply extra, it is not mentioned. + """ + extraBits = 0 if extra is None else self.extraBits(index) + if not hasattr(self, 'extraTable'): + formatString = '{0}{3}' + lo = hi = value = self.value(index, extra) + elif extraBits==0: + formatString = '{0}{2}: {3}' + lo, hi = self.span(index) + value = lo + else: + formatString = '{0}{1} {2}: {3}-{4}; {3}+{5}={6}' + lo, hi = self.span(index) + value = lo+extra + return formatString.format( + self.description and self.description+': ', + 'x'*extraBits, + self.bitPattern(index), + lo, hi, + extra, + value, + ) + + def callback(self, symbol, extra): + return self.explanation(symbol.index, extra) + +class BoolCode(Code): + """Same as Code(bitLength=1), but shows a boolean. + """ + def __init__(self, name=None, **args): + super().__init__(name, bitLength=1, **args) + + def value(self, index, extra=None): + return bool(super().value(index, extra)) + +class Enumerator(WithExtra): + """Code that is defined by the ExtraTable. + extraTable is a class variable that contains + the extraBits of the symbols from 0 + value0 contains the value of symbol 0 + encodings is not neccessary, but allowed. + Note: place for FixedCode to make sure extraBits works + """ + def __init__(self, name=None, **args): + #if there is no decodeTable to determine length, compute it ourselves + if 'decodeTable' not in args: + args['alphabetSize'] = len(self.extraTable) + super().__init__(name, **args) + + def __len__(self): + return len(self.extraTable) + + def __getitem__(self, index): + """Faster than PrefixDecoder + """ + if index>=len(self.extraTable): + raise ValueError("No symbol {}[{}]".format( + self.__class__.__name__, index)) + return Symbol(self, index) + + def value(self, index, extra): + """Override if you don't define value0 and extraTable + """ + lower, upper = self.span(index) + value = lower+(extra or 0) + if value>upper: + raise ValueError('value: extra out of range') + return value + + def span(self, index): + """Give the range of possible values in a tuple + Useful for mnemonic and explanation + """ + lower = self.value0+sum(1<3: + raise ValueError('value: extra out of range') + return 'Simple', extra+1 + if extra: + raise ValueError('value: extra out of range') + return 'Complex', index + + def explanation(self, index, extra): + if index==1: + return '{} is simple with {} code word{}'.format( + self.codename, extra+1, 's' if extra else '') + lengths = [1, 2, 3, 4, 0, 5, 17, 6] + return '{} is complex with lengths {}...'.format( + self.codename, + ','.join( + map(str, lengths[index:index+5])) + ) + +class TreeShapeAlhabet(BoolCode): + """The bit used to indicate if four word code is "deep" or "wide" + """ + name = 'SHAPE' + def value(self, index): + return [(2,2,2,2), (1,2,3,3)][index] + + def explanation(self, index): + return str(bool(index))+': lengths {},{},{},{}'.format(*self.value(index)) + +class LengthOfLengthAlphabet(Code): + """For use in decoding complex code descriptors. + >>> lengthOfLengthAlphabet = LengthOfLengthAlphabet('') + >>> print(lengthOfLengthAlphabet[2]) + coded with 2 bits + >>> len(lengthOfLengthAlphabet[0]) + 2 + >>> [len(lengthOfLengthAlphabet[x]) for x in range(6)] + [2, 4, 3, 2, 2, 4] + >>> lengthOfLengthAlphabet.showCode() + 00:skipped 01:coded with 4 bits 0111:coded with 1 bits + 10:coded with 3 bits 011:coded with 2 bits 1111:coded with 5 bits + """ + decodeTable = { + 0b00:0, 0b10:3, + 0b0111:1, 0b01:4, + 0b011:2, 0b1111:5, + } + + def __init__(self, name=None, **args): + super().__init__(name, decodeTable=self.decodeTable, **args) + + def mnemonic(self, index): + if index==0: return 'skipped' + return 'coded with {} bits'.format(index) + + def explanation(self, index, extra=None): + return self.description+': '+self.mnemonic(index) + +class LengthAlphabet(WithExtra): + """Length of symbols + Used during construction of a code. + """ + def __init__(self, name): + super().__init__(name, alphabetSize=18) + + def extraBits(self, index): + return {16:2, 17:3}.get(index, 0) + + def mnemonic(self, index): + if index==0: return 'unused' + elif index==16: return 'rep xx' + elif index==17: return 'zero xxx' + else: return 'len {}'.format(index) + + def explanation(self, index, extra): + return self.description.format(self[index], extra) + + def value(self, index, extra): + #the caller got the length already, so extra is enough + return extra + +#Stream header +class WindowSizeAlphabet(Code): + """The alphabet used for window size in the stream header. + >>> WindowSizeAlphabet()[10].explanation() + 'windowsize=(1<<10)-16=1008' + """ + decodeTable = { + 0b0100001: 10, 0b1100001: 14, 0b0011: 18, 0b1011: 22, + 0b0110001: 11, 0b1110001: 15, 0b0101: 19, 0b1101: 23, + 0b1000001: 12, 0b0: 16, 0b0111: 20, 0b1111: 24, + 0b1010001: 13, 0b0000001: 17, 0b1001: 21, + 0b0010001: None, + } + + name = 'WSIZE' + + def __init__(self, name=None): + super().__init__(name, decodeTable=self.decodeTable) + + def value(self, index): + #missing value gives index None + if index is None: return None + return (1<>> metablockLengthAlphabet = MetablockLengthAlphabet() + >>> metablockLengthAlphabet[0]; str(metablockLengthAlphabet[0]) + Symbol(MLEN, 0) + 'empty' + >>> metablockLengthAlphabet[3] + Traceback (most recent call last): + ... + ValueError: No symbol MetablockLengthAlphabet[3] + >>> print(metablockLengthAlphabet[4]) + hhhh00 + >>> metablockLengthAlphabet[4].value(0x1000) + 4097 + >>> metablockLengthAlphabet[5].value(0x1000) + Traceback (most recent call last): + ... + InvalidStream: Zeros in high nibble of MLEN + >>> metablockLengthAlphabet[5].explanation(0x12345) + 'data length: 12345h+1=74566' + >>> metablockLengthAlphabet.showCode() + 00:hhhh00 10:hhhhhh10 01:hhhhh01 11:empty + """ + decodeTable = {0b11:0, 0b00:4, 0b01:5, 0b10:6} + + name = 'MLEN' + def __init__(self, name=None): + super().__init__(name, decodeTable=self.decodeTable) + + def extraBits(self, index): + return index*4 + + def mnemonic(self, index): + if index==0: return 'empty' + return 'h'*(self.extraBits(index)//4)+self.bitPattern(index) + + def value(self, index, extra): + extraBits = self.extraBits(index) + if not 0<=extra<1<4 and extra>>extraBits-4==0: raise InvalidStream( + 'Zeros in high nibble of MLEN') + return extra+1 + + def explanation(self, index, extra): + if index==0: return '11: empty block' + extraBits = self.extraBits(index) + return 'data length: {:0{}x}h+1={}'.format(extra, extraBits//4, extra+1) + + +class ReservedAlphabet(BoolCode): + """The reserved bit that must be zero. + """ + name = 'RSVD' + def value(self, index): + if index: raise ValueError('Reserved bit is not zero') + + def explanation(self, index): + return 'Reserved (must be zero)' + +class FillerAlphabet(Code): + def __init__(self, *, streamPos): + super().__init__('SKIP', bitLength=(-streamPos)&7) + + def explanation(self, index): + return '{} bit{} ignored'.format( + self.length(index), + '' if self.length(index)==1 else 's', + ) + +class SkipLengthAlphabet(WithExtra): + """Used for the skip length in an empty metablock + >>> skipLengthAlphabet = SkipLengthAlphabet() + >>> skipLengthAlphabet[0]; str(skipLengthAlphabet[0]) + Symbol(SKIP, 0) + 'empty' + >>> skipLengthAlphabet[4] + Traceback (most recent call last): + ... + ValueError: index out of range + >>> print(skipLengthAlphabet[3]) + hhhhhh11 + >>> skipLengthAlphabet[2].value(0x1000) + 4097 + >>> skipLengthAlphabet[3].value(0x1000) + Traceback (most recent call last): + ... + InvalidStream: Zeros in high byte of SKIPBYTES + >>> skipLengthAlphabet[3].explanation(0x12345) + 'skip length: 12345h+1=74566' + >>> skipLengthAlphabet.showCode() + 00:empty 01:hh01 10:hhhh10 11:hhhhhh11 + """ + def __init__(self): + super().__init__('SKIP', bitLength=2) + + def extraBits(self, index): + return index*8 + + def mnemonic(self, index): + if index==0: return 'empty' + return 'h'*(self.extraBits(index)//4)+self.bitPattern(index) + + def value(self, index, extra): + extraBits = self.extraBits(index) + if not 0<=extra<1<1 and extra>>extraBits-8==0: + raise InvalidStream('Zeros in high byte of SKIPBYTES') + return extra+1 + + def explanation(self, index, extra): + if index==0: return '00: no skip' + extraBits = self.extraBits(index) + return 'skip length: {:{}x}h+1={}'.format(extra, extraBits//8, extra+1) + + +class TypeCountAlphabet(Enumerator): + """Used for giving block type counts and tree counts. + >>> TypeCountAlphabet(description='').showCode() + 0:0 0101:xx,0101 1011:xxxxx,1011 + 0001:0001 1101:xxxxxx,1101 0111:xxx,0111 + 1001:xxxx,1001 0011:x,0011 1111:xxxxxxx,1111 + """ + decodeTable = { + 0b0: 0, 0b1001: 5, + 0b0001: 1, 0b1011: 6, + 0b0011: 2, 0b1101: 7, + 0b0101: 3, 0b1111: 8, + 0b0111: 4, + } + + value0 = 1 + extraTable = [0, 0, 1, 2, 3, 4, 5, 6, 7] + name = 'BT#' + + def __init__(self, name=None, *, description): + super().__init__( + name, + decodeTable=self.decodeTable, + description=description) + + def mnemonic(self, index): + if index==0: return '0' + if index==1: return '0001' + return 'x'*(self.extraBits(index))+','+self.bitPattern(index) + + def explanation(self, index, extra): + value = self.value(index, extra) + description = self.description + if value==1: description = description[:-1] + return '{}: {} {}'.format( + self.mnemonic(index), + value, + description) + +class BlockTypeAlphabet(Code): + """The block types; this code works for all three kinds. + >>> b = BlockTypeAlphabet('T', NBLTYPES=5) + >>> print(*(x for x in b)) + prev +1 #0 #1 #2 #3 #4 + """ + def __init__(self, name, NBLTYPES, **args): + super().__init__(name, alphabetSize=NBLTYPES+2, **args) + self.NBLTYPES = NBLTYPES + + def mnemonic(self, index): + if index==0: return 'prev' + elif index==1: return '+1' + else: return '#'+str(index-2) + + def value(self, index): + return index-2 + + def explanation(self, index): + if index==0: return '0: previous' + elif index==1: return '1: increment' + else: return 'Set block type to: '+str(index-2) + +class BlockCountAlphabet(Enumerator): + """Block counts + >>> b = BlockCountAlphabet('L') + >>> print(b[25]) + [24*x]: BC16625-16793840 + """ + + value0 = 1 + extraTable = [2,2,2,2,3, 3,3,3,4,4, 4,4,5,5,5, 5,6,6,7,8, 9,10,11,12,13, 24] + def __init__(self, name, **args): + super().__init__(name, alphabetSize=26, **args) + + def mnemonic(self, index): + extraBits = self.extraBits(index) + return '{}: BC{}-{}'.format( + 'x'*extraBits if index<5 else '[{}*x]'.format(extraBits), + *self.span(index)) + + def explanation(self, index, extra): + return 'Block count: '+super().explanation(index, extra) + +class DistanceParamAlphabet(WithExtra): + """The distance parameters NPOSTFIX and NDIRECT. + Although these are treated as two in the description, this is easier. + """ + def __init__(self): + super().__init__('DIST', bitLength=2) + + def extraBits(self, index): + return 4 + + def value(self, index, extra): + """Returns NPOSTFIX and NDIRECT<15: + raise ValueError('value: extra out of range') + return index, extra<>> LiteralContextMode().showCode() + 00:LSB6 01:MSB6 10:UTF8 11:Signed + >>> LiteralContextMode().explanation(2) + 'Context mode for type 9: 2(UTF8)' + """ + + def __init__(self, *, number=9): + super().__init__('LC'+str(number), bitLength=2) + self.number = number + + def mnemonic(self, index): + return ['LSB6', 'MSB6', 'UTF8', 'Signed'][index] + + def explanation(self, index): + return 'Context mode for type {}: {}({})'.format( + self.number, + index, + self.mnemonic(index)) + +class RLEmaxAlphabet(Enumerator): + """Used for describing the run length encoding used for describing context maps. + >>> RLEmaxAlphabet().showCode() + 0:1 1:more + """ + value0 = 0 + extraTable = [0, 4] + name = 'RLE#' + + def mnemonic(self, index): + return ['1', 'more'][index] + + def explanation(self, index, extra): + description = self.description and self.description+': ' + if index==0: return description+'No RLE coding' + return '{}xxxx 1: RLEMAX={}'.format(description, extra+1) + +class TreeAlphabet(WithExtra): + """The alphabet to enumerate entries (called trees) in the context map. + parameters are RLEMAX and NTREES + >>> t = TreeAlphabet('', RLEMAX=3, NTREES=5) + >>> len(t) + 8 + >>> print(t[2]) + xx+4 zeroes + >>> t[3].explanation(2) + '8+010=10 zeroes' + >>> t[0].value(0) + (1, 0) + """ + name = 'CMI' + def __init__(self, name=None, *, RLEMAX, NTREES, **args): + super().__init__(name, alphabetSize=RLEMAX+NTREES, **args) + self.RLEMAX = RLEMAX + self.NTREES = NTREES + + def extraBits(self, index): + if 0>> for x in range(0,704,704//13): + ... print('{:10b}'.format(x), InsertAndCopyAlphabet()[x]) + 0 I0C2&D=0 + 110110 I6+xC8&D=0 + 1101100 I5C22+xxx&D=0 + 10100010 I4C4 + 11011000 I3C10+x + 100001110 I14+xxC8 + 101000100 I10+xxC22+xxx + 101111010 I98+xxxxxC14+xx + 110110000 I6+xC70+xxxxx + 111100110 I1090+[10*x]C8 + 1000011100 I26+xxxC326+[8*x] + 1001010010 I322+[8*x]C14+xx + 1010001000 I194+[7*x]C70+xxxxx + 1010111110 I22594+[24*x]C1094+[10*x] + """ + insertLengthAlphabet = InsertLengthAlphabet(None) + copyLengthAlphabet = CopyLengthAlphabet(None) + + def __init__(self, number=''): + super().__init__('IC'+str(number), bitLength=10) + + def __len__(self): + return 704 + + def extraBits(self, index): + insertSymbol, copySymbol, dist0 = self.splitSymbol(index) + return InsertLengthAlphabet.extraTable[insertSymbol.index] + \ + CopyLengthAlphabet.extraTable[copySymbol.index] + + def splitSymbol(self, index): + """Give relevant values for computations: + (insertSymbol, copySymbol, dist0flag) + """ + #determine insert and copy upper bits from table + row = [0,0,1,1,2,2,1,3,2,3,3][index>>6] + col = [0,1,0,1,0,1,2,0,2,1,2][index>>6] + #determine inserts and copy sub codes + insertLengthCode = row<<3 | index>>3&7 + if row: insertLengthCode -= 8 + copyLengthCode = col<<3 | index&7 + return ( + Symbol(self.insertLengthAlphabet, insertLengthCode), + Symbol(self.copyLengthAlphabet, copyLengthCode), + row==0 + ) + + def mnemonic(self, index): + """Make a nice mnemonic + """ + i,c,d0 = self.splitSymbol(index) + iLower, _ = i.code.span(i.index) + iExtra = i.extraBits() + cLower, _ = c.code.span(c.index) + cExtra = c.extraBits() + return 'I{}{}{}C{}{}{}{}'.format( + iLower, + '+' if iExtra else '', + 'x'*iExtra if iExtra<6 else '[{}*x]'.format(iExtra), + cLower, + '+' if cExtra else '', + 'x'*cExtra if cExtra<6 else '[{}*x]'.format(cExtra), + '&D=0' if d0 else '') + + def value(self, index, extra): + i,c,d0 = self.splitSymbol(index) + iExtra = i.extraBits() + ce, ie = extra>>iExtra, extra&(1<>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10) + >>> print(d[4], d[17], d[34]) + last-1 1 10xx00-5 + >>> [str(d[x]) for x in range(26, 32)] + ['10x00-5', '10x01-5', '10x10-5', '10x11-5', '11x00-5', '11x01-5'] + """ + def __init__(self, number, *, NPOSTFIX, NDIRECT): + self.NPOSTFIX = NPOSTFIX + self.NDIRECT = NDIRECT + #set length + #Actually, not all symbols are used, + #only NDIRECT+16+(44-2*POSTFIX<>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10) + >>> [d[i].extraBits() for i in range(26)] + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + >>> [d[i].extraBits() for i in range(26,36)] + [1, 1, 1, 1, 1, 1, 1, 1, 2, 2] + """ + if index<16+self.NDIRECT: return 0 + return 1 + ((index - self.NDIRECT - 16) >> (self.NPOSTFIX + 1)) + + def value(self, dcode, dextra): + """Decode value of symbol together with the extra bits. + >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10) + >>> d[34].value(2) + (0, 35) + """ + if dcode<16: + return [(1,0),(2,0),(3,0),(4,0), + (1,-1),(1,+1),(1,-2),(1,+2),(1,-3),(1,+3), + (2,-1),(2,+1),(2,-2),(2,+2),(2,-3),(2,+3) + ][dcode] + if dcode<16+self.NDIRECT: + return (0,dcode-16) + #we use the original formulas, instead of my clear explanation + POSTFIX_MASK = (1 << self.NPOSTFIX) - 1 + ndistbits = 1 + ((dcode - self.NDIRECT - 16) >> (self.NPOSTFIX + 1)) + hcode = (dcode - self.NDIRECT - 16) >> self.NPOSTFIX + lcode = (dcode - self.NDIRECT - 16) & POSTFIX_MASK + offset = ((2 + (hcode & 1)) << ndistbits) - 4 + distance = ((offset + dextra) << self.NPOSTFIX) + lcode + self.NDIRECT + 1 + return (0,distance) + + def mnemonic(self, index, verbose=False): + """Give mnemonic representation of meaning. + verbose compresses strings of x's + """ + if index<16: + return ['last', '2last', '3last', '4last', + 'last-1', 'last+1', 'last-2', 'last+2', 'last-3', 'last+3', + '2last-1', '2last+1', '2last-2', '2last+2', '2last-3', '2last+3' + ][index] + if index<16+self.NDIRECT: + return str(index-16) + #construct strings like "1xx01-15" + index -= self.NDIRECT+16 + hcode = index >> self.NPOSTFIX + lcode = index & (1<>1) if hcode<13 or verbose else '[{}*x]'.format(2+hcode>>1), + lcode, self.NPOSTFIX, + self.NDIRECT+1-(4<>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10) + >>> d[55].explanation(13) + '11[1101]01-5: [0]+240' + """ + extraBits = self.extraBits(index) + extraString = '[{:0{}b}]'.format(extra, extraBits) + return '{0}: [{1[0]}]{1[1]:+d}'.format( + self.mnemonic(index, True).replace('x'*(extraBits or 1), extraString), + self.value(index, extra)) + +#Classes for doing actual work------------------------------------------ +class ContextModeKeeper: + """For computing the literal context mode. + You feed it characters, and it computes indices in the context map. + """ + def __init__(self, mode): + self.chars = deque([0,0], maxlen=2) + self.mode = mode + + def setContextMode(self, mode): + """Switch to given context mode (0..3)""" + self.mode = mode + def getIndex(self): + if self.mode==0: #LSB6 + return self.chars[1]&0x3f + elif self.mode==1: #MSB6 + return self.chars[1]>>2 + elif self.mode==2: #UTF8: character class of previous and a bit of the second + p2,p1 = self.chars + return self.lut0[p1]|self.lut1[p2] + elif self.mode==3: #Signed: initial bits of last two bytes + p2,p1 = self.chars + return self.lut2[p1]<<3|self.lut2[p2] + + def add(self, index): + """Adjust the context for output char (as int).""" + self.chars.append(index) + + #0: control #16: quote #32: ,:; #48: AEIOU + #4: tab/lf/cr #20: % #36: . #52: BC..Z + #8: space #24: (<[{ #40: = #56: aeiou + #12:!#$&*+-/?@| #28: )>]} #44: 0-9 #60: bc..z + lut0 = [0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 0, 0, 4, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 8, 12, 16, 12, 12, 20, 12, 16, 24, 28, 12, 12, 32, 12, 36, 12, + 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 32, 32, 24, 40, 28, 12, + 12, 48, 52, 52, 52, 48, 52, 52, 52, 48, 52, 52, 52, 52, 52, 48, + 52, 52, 52, 52, 52, 48, 52, 52, 52, 52, 52, 24, 12, 28, 12, 12, + 12, 56, 60, 60, 60, 56, 60, 60, 60, 56, 60, 60, 60, 60, 60, 56, + 60, 60, 60, 60, 60, 56, 60, 60, 60, 60, 60, 24, 12, 28, 12, 0 + ]+[0,1]*32+[2,3]*32 + #0: space 1:punctuation 2:digit/upper 3:lower + lut1 = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, + 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0 + ]+[0]*96+[2]*32 + #initial bits: 8*0, 4*0, 2*0, 1*0, 1*1, 2*1, 4*1, 8*1 + lut2 = [0]+[1]*15+[2]*48+[3]*64+[4]*64+[5]*48+[6]*15+[7] + assert len(lut0)==len(lut1)==len(lut2)==256 + +class WordList: + """Word list. + >>> WordList().word(7, 35555) + b'Program to ' + """ + NDBITS = [0, 0, 0, 0, 10, 10, 11, 11, 10, 10, + 10, 10, 10, 9, 9, 8, 7, 7, 8, 7, + 7, 6, 6, 5, 5] + def __init__(self): + self.file = open('dict', 'rb') + self.compileActions() + + def word(self, size, dist): + """Get word + """ + #split dist in index and action + ndbits = self.NDBITS[size] + index = dist&(1<>ndbits + #compute position in file + position = sum(n< 94:w.U+\' 119:_+w.U+=\' + 20:w+. 45:w+_on_ 70:w+=" 95:w+est_ 120:_+U(w)+=\' + 21:w+"> 46:w+_as_ 71:_+w+. 96:_+U(w)+. + 22:w+\n 47:w+_is_ 72:.com/+w 97:w.U+"> + 23:w[:-3] 48:w[:-7] 98:_+w+=\' + 24:w+] 49:w[:-1]+ing_ 74:U(w)+\' 99:U(w)+, + """ + + def compileActions(self): + """Build the action table from the text above + """ + import re + self.actionList = actions = [None]*121 + #Action 73, which is too long, looks like this when expanded: + actions[73] = "b' the '+w+b' of the '" + #find out what the columns are + actionLines = self.actionTable.splitlines() + colonPositions = [m.start() + for m in re.finditer(':',actionLines[1]) + ]+[100] + columns = [(colonPositions[i]-3,colonPositions[i+1]-3) + for i in range(len(colonPositions)-1)] + for line in self.actionTable.splitlines(keepends=False): + for start,end in columns: + action = line[start:end] + #skip empty actions + if not action or action.isspace(): continue + #chop it up, and check if the colon is properly placed + index, colon, action = action[:3], action[3], action[4:] + assert colon==':' + #remove filler spaces at right + action = action.rstrip() + #replace space symbols + action = action.replace('_', ' ') + wPos = action.index('w') + #add quotes around left string when present + #translation: any pattern from beginning, up to + #(but not including) a + following by a w later on + action = re.sub(r"^(.*)(?=\+[U(]*w)", r"b'\1'", action) + #add quotes around right string when present + #translation: anything with a w in it, followed by a + + #and a pattern up to the end + #(there is no variable lookbehind assertion, + #so we have to copy the pattern) + action = re.sub(r"(w[[:\-1\]).U]*)\+(.*)$", r"\1+b'\2'", action) + #expand shortcut for uppercaseAll + action = action.replace(".U", ".upper()") + #store action + actions[int(index)] = action + + def doAction(self, w, action): + """Perform the proper action + """ + #set environment for the UpperCaseFirst + U = self.upperCase1 + return eval(self.actionList[action], locals()) + +class Layout: + """Class to layout the output. + """ + #display width of hexdata+bitdata + width = 25 + #general + def __init__(self, stream): + self.stream = stream + self.bitPtr = self.width + + def makeHexData(self, pos): + """Produce hex dump of all data containing the bits + from pos to stream.pos + """ + firstAddress = pos+7>>3 + lastAddress = self.stream.pos+7>>3 + return ''.join(map('{:02x} '.format, + self.stream.data[firstAddress:lastAddress])) + + def formatBitData(self, pos, width1, width2=0): + """Show formatted bit data: + Bytes are separated by commas + whole bytes are displayed in hex + >>> Layout(olleke).formatBitData(6, 2, 16) + '|00h|2Eh,|00' + >>> Layout(olleke).formatBitData(4, 1, 0) + '1' + """ + result = [] + #make empty prefix code explicit + if width1==0: result = ['()', ','] + for width in width1, width2: + #skip empty width2 + if width==0: continue + #build result backwards in a list + while width>0: + availableBits = 8-(pos&7) + if width>3] >> (pos&7) & (1<>3] >> (pos&7) + result.append('|{:0{}b}'.format(data, availableBits)) + else: + #read whole byte (in hex), beginning and ending at boundary + data = self.stream.data[pos>>3] + result.append('|{:02X}h'.format(data)) + width -= availableBits + pos += availableBits + #if width overshot from the availableBits subtraction, fix it + pos += width + #add comma to separate fields + result.append(',') + #concatenate pieces, reversed, skipping the last space + return ''.join(result[-2::-1]) + + def readPrefixCode(self, alphabet): + """give alphabet the prefix code that is read from the stream + Called for the following alphabets, in this order: + The alphabet in question must have a "logical" order, + otherwise the assignment of symbols doesn't work. + """ + mode, numberOfSymbols = self.verboseRead(PrefixCodeHeader(alphabet.name)) + if mode=='Complex': + #for a complex code, numberOfSymbols means hskip + self.readComplexCode(numberOfSymbols, alphabet) + return alphabet + else: + table = [] + #Set table of lengths for mnemonic function + lengths = [[0], [1,1], [1,2,2], '????'][numberOfSymbols-1] + #adjust mnemonic function of alphabet class + def myMnemonic(index): + return '{} bit{}: {}'.format( + lengths[i], + '' if lengths[i]==1 else 's', + alphabet.__class__.mnemonic(alphabet, index) + ) + alphabet.mnemonic = myMnemonic + for i in range(numberOfSymbols): + table.append(self.verboseRead(alphabet, skipExtra=True).index) + #restore mnemonic + del alphabet.mnemonic + if numberOfSymbols==4: + #read tree shape to redefine lengths + lengths = self.verboseRead(TreeShapeAlhabet()) + #construct the alphabet prefix code + alphabet.setLength(dict(zip(table, lengths))) + return alphabet + + def readComplexCode(self, hskip, alphabet): + """Read complex code""" + stream = self.stream + #read the lengths for the length code + lengths = [1,2,3,4,0,5,17,6,16,7,8,9,10,11,12,13,14,15][hskip:] + codeLengths = {} + total = 0 + lol = LengthOfLengthAlphabet('##'+alphabet.name) + #lengthCode will be used for coding the lengths of the new code + #we use it for display until now; definition comes below + lengthCode = LengthAlphabet('#'+alphabet.name) + lengthIter = iter(lengths) + while total<32: + newSymbol = next(lengthIter) + lol.description = str(lengthCode[newSymbol]) + length = self.verboseRead(lol) + if length: + codeLengths[newSymbol] = length + total += 32>>length + if total>=32: + break + if total>32: raise ValueError("Stream format") + #Now set the encoding of the lengthCode + lengthCode.setLength(codeLengths) + print("***** Lengths for {} will be coded as:".format(alphabet.name)) + lengthCode.showCode() + #Now determine the symbol lengths with the lengthCode + symbolLengths = {} + total = 0 + lastLength = 8 + alphabetIter = iter(alphabet) + while total<32768: + #look ahead to see what is going to happen + length = lengthCode.decodePeek( + self.stream.peek(lengthCode.maxLength))[1].index + #in every branch, set lengthCode.description to explanatory text + #lengthCode calls format(symbol, extra) with this string + if length==0: + symbol = next(alphabetIter) + lengthCode.description = 'symbol {} unused'.format(symbol) + self.verboseRead(lengthCode) + #unused symbol + continue + if length==16: + lengthCode.description = \ + '{1}+3 symbols of length '+str(lastLength) + extra = self.verboseRead(lengthCode) + #scan series of 16s (repeat counts) + #start with repeat count 2 + repeat = 2 + startSymbol = next(alphabetIter) + endSymbol = next(alphabetIter) + symbolLengths[startSymbol.index] = \ + symbolLengths[endSymbol.index] = lastLength + #count the two just defined symbols + total += 2*32768>>lastLength + #note: loop may end because we're there + #even if a 16 _appears_ to follow + while True: + #determine last symbol + oldRepeat = repeat + repeat = (repeat-2<<2)+extra+3 + #read as many symbols as repeat increased + for i in range(oldRepeat, repeat): + endSymbol = next(alphabetIter) + symbolLengths[endSymbol.index] = lastLength + #compute new total; it may be end of loop + total += (repeat-oldRepeat)*32768>>lastLength + if total>=32768: break + #see if there is more to do + length = lengthCode.decodePeek( + self.stream.peek(lengthCode.maxLength))[1].index + if length!=16: break + lengthCode.description = 'total {}+{{1}} symbols'.format( + (repeat-2<<2)+3) + extra = self.verboseRead(lengthCode) + elif length==17: + #read, and show explanation + lengthCode.description = '{1}+3 unused' + extra = self.verboseRead(lengthCode) + #scan series of 17s (groups of zero counts) + #start with repeat count 2 + repeat = 2 + startSymbol = next(alphabetIter) + endSymbol = next(alphabetIter) + #note: loop will not end with total==32768, + #since total doesn't change here + while True: + #determine last symbol + oldRepeat = repeat + repeat = (repeat-2<<3)+extra+3 + #read as many symbols as repeat increases + for i in range(repeat-oldRepeat): + endSymbol = next(alphabetIter) + #see if there is more to do + length = lengthCode.decodePeek( + self.stream.peek(lengthCode.maxLength))[1].index + if length!=17: break + lengthCode.description = 'total {}+{{1}} unused'.format( + (repeat-2<<3)+3) + extra = self.verboseRead(lengthCode) + else: + symbol = next(alphabetIter) + #double braces for format + char = str(symbol) + if char in '{}': char *= 2 + lengthCode.description = \ + 'Length for {} is {{0.index}} bits'.format(char) + #output is not needed (will be 0) + self.verboseRead(lengthCode) + symbolLengths[symbol.index] = length + total += 32768>>length + lastLength = length + assert total==32768 + alphabet.setLength(symbolLengths) + print('End of table. Prefix code '+alphabet.name+':') + alphabet.showCode() + + #stream + def processStream(self): + """Process a brotli stream. + """ + print('addr hex{:{}s}binary context explanation'.format( + '', self.width-10)) + print('Stream header'.center(60, '-')) + self.windowSize = self.verboseRead(WindowSizeAlphabet()) + print('Metablock header'.center(60, '=')) + self.ISLAST = False + self.output = bytearray() + while not self.ISLAST: + self.ISLAST = self.verboseRead( + BoolCode('LAST', description="Last block")) + if self.ISLAST: + if self.verboseRead( + BoolCode('EMPTY', description="Empty block")): break + if self.metablockLength(): continue + if not self.ISLAST and self.uncompressed(): continue + print('Block type descriptors'.center(60, '-')) + self.numberOfBlockTypes = {} + self.currentBlockCounts = {} + self.blockTypeCodes = {} + self.blockCountCodes = {} + for blockType in (L,I,D): self.blockType(blockType) + print('Distance code parameters'.center(60, '-')) + self.NPOSTFIX, self.NDIRECT = self.verboseRead(DistanceParamAlphabet()) + self.readLiteralContextModes() + print('Context maps'.center(60, '-')) + self.cmaps = {} + #keep the number of each kind of prefix tree for the last loop + numberOfTrees = {I: self.numberOfBlockTypes[I]} + for blockType in (L,D): + numberOfTrees[blockType] = self.contextMap(blockType) + print('Prefix code lists'.center(60, '-')) + self.prefixCodes = {} + for blockType in (L,I,D): + self.readPrefixArray(blockType, numberOfTrees[blockType]) + self.metablock() + + #metablock header + def verboseRead(self, alphabet, context='', skipExtra=False): + """Read symbol and extra from stream and explain what happens. + Returns the value of the symbol + >>> olleke.pos = 0 + >>> l = Layout(olleke) + >>> l.verboseRead(WindowSizeAlphabet()) + 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288 + 4194288 + """ + #TODO 2: verbosity level, e.g. show only codes and maps in header + stream = self.stream + pos = stream.pos + if skipExtra: + length, symbol = alphabet.readTuple(stream) + extraBits, extra = 0, None + else: + length, symbol, extraBits, extra = alphabet.readTupleAndExtra( + stream) + #fields: address, hex data, binary data, name of alphabet, explanation + hexdata = self.makeHexData(pos) + addressField = '{:04x}'.format(pos+7>>3) if hexdata else '' + bitdata = self.formatBitData(pos, length, extraBits) + #bitPtr moves bitdata so that the bytes are easier to read + #jump back to right if a new byte starts + if '|' in bitdata[1:]: + #start over on the right side + self.bitPtr = self.width + fillWidth = self.bitPtr-(len(hexdata)+len(bitdata)) + if fillWidth<0: fillWidth = 0 + print('{:<5s} {:<{}s} {:7s} {}'.format( + addressField, + hexdata+' '*fillWidth+bitdata, self.width, + context+alphabet.name, + symbol if skipExtra else symbol.explanation(extra), + )) + #jump to the right if we started with a '|' + #because we didn't jump before printing + if bitdata.startswith('|'): self.bitPtr = self.width + else: self.bitPtr -= len(bitdata) + return symbol if skipExtra else symbol.value(extra) + + def metablockLength(self): + """Read MNIBBLES and meta block length; + if empty block, skip block and return true. + """ + self.MLEN = self.verboseRead(MetablockLengthAlphabet()) + if self.MLEN: + return False + #empty block; skip and return False + self.verboseRead(ReservedAlphabet()) + MSKIP = self.verboseRead(SkipLengthAlphabet()) + self.verboseRead(FillerAlphabet(streamPos=self.stream.pos)) + self.stream.pos += 8*MSKIP + print("Skipping to {:x}".format(self.stream.pos>>3)) + return True + + def uncompressed(self): + """If true, handle uncompressed data + """ + ISUNCOMPRESSED = self.verboseRead( + BoolCode('UNCMPR', description='Is uncompressed?')) + if ISUNCOMPRESSED: + self.verboseRead(FillerAlphabet(streamPos=self.stream.pos)) + print('Uncompressed data:') + self.output += self.stream.readBytes(self.MLEN) + print(outputFormatter(self.output[-self.MLEN:])) + return ISUNCOMPRESSED + + def blockType(self, kind): + """Read block type switch descriptor for given kind of blockType.""" + NBLTYPES = self.verboseRead(TypeCountAlphabet( + 'BT#'+kind[0].upper(), + description='{} block types'.format(kind), + )) + self.numberOfBlockTypes[kind] = NBLTYPES + if NBLTYPES>=2: + self.blockTypeCodes[kind] = self.readPrefixCode( + BlockTypeAlphabet('BT'+kind[0].upper(), NBLTYPES)) + self.blockCountCodes[kind] = self.readPrefixCode( + BlockCountAlphabet('BC'+kind[0].upper())) + blockCount = self.verboseRead(self.blockCountCodes[kind]) + else: + blockCount = 1<<24 + self.currentBlockCounts[kind] = blockCount + + def readLiteralContextModes(self): + """Read literal context modes. + LSB6: lower 6 bits of last char + MSB6: upper 6 bits of last char + UTF8: rougly dependent on categories: + upper 4 bits depend on category of last char: + control/whitespace/space/ punctuation/quote/%/open/close/ + comma/period/=/digits/ VOWEL/CONSONANT/vowel/consonant + lower 2 bits depend on category of 2nd last char: + space/punctuation/digit or upper/lowercase + signed: hamming weight of last 2 chars + """ + print('Context modes'.center(60, '-')) + self.literalContextModes = [] + for i in range(self.numberOfBlockTypes[L]): + self.literalContextModes.append( + self.verboseRead(LiteralContextMode(number=i))) + + def contextMap(self, kind): + """Read context maps + Returns the number of differnt values on the context map + (In other words, the number of prefix trees) + """ + NTREES = self.verboseRead(TypeCountAlphabet( + kind[0].upper()+'T#', + description='{} prefix trees'.format(kind))) + mapSize = {L:64, D:4}[kind] + if NTREES<2: + self.cmaps[kind] = [0]*mapSize + else: + #read CMAPkind + RLEMAX = self.verboseRead(RLEmaxAlphabet( + 'RLE#'+kind[0].upper(), + description=kind+' context map')) + alphabet = TreeAlphabet('CM'+kind[0].upper(), NTREES=NTREES, RLEMAX=RLEMAX) + cmapCode = self.readPrefixCode(alphabet) + tableSize = mapSize*self.numberOfBlockTypes[kind] + cmap = [] + while len(cmap)=self.MLEN: return + #distance + #distances are computed relative to output length, at most window size + maxDistance = min(len(self.output), self.windowSize) + if dist0Flag: + distance = self.lastDistances[-1] + else: + bt = self.figureBlockType(D) + cm = {2:0, 3:1, 4:2}.get(copyLen, 3) + ct = self.cmaps[D][bt<<2|cm] + index, offset = self.verboseRead( + self.prefixCodes[D][ct], + context='{},{}='.format(bt,cm)) + distance = self.lastDistances[-index]+offset if index else offset + if index==1 and offset==0: + #to make sure distance is not put in last distance list + dist0Flag = True + if distance<=maxDistance: + #copy from output + for i in range( + maxDistance-distance, + maxDistance-distance+copyLen): + self.output.append(self.output[i]) + if not dist0Flag: self.lastDistances.append(distance) + comment = 'Seen before' + else: + #fetch from wordlist + newWord = wordList.word(copyLen, distance-maxDistance-1) + self.output.extend(newWord) + #adjust copyLen to reflect actual new data + copyLen = len(newWord) + comment = 'From wordlist' + blockLen += copyLen + print(' '*40, + comment, + ': "', + outputFormatter(self.output[-copyLen:]), + '"', + sep='') + self.contextMode.add(self.output[-2]) + self.contextMode.add(self.output[-1]) + + def figureBlockType(self, kind): + counts, types = self.currentBlockCounts, self.currentBlockTypes + if counts[kind]==0: + newType = self.verboseRead(self.blockTypeCodes[kind]) + if newType==-2: newType = types['P'+kind] + elif newType==-1: + newType = (types[kind]+1)%self.numberOfBlockTypes[kind] + types['P'+kind] = types[kind] + types[kind] = newType + counts[kind] = self.verboseRead(self.blockCountCodes[kind]) + counts[kind] -=1 + return types[kind] + +__test__ = { +'BitStream': """ + >>> bs = BitStream(b'Jurjen') + >>> bs.readBytes(2) + b'Ju' + >>> bs.read(6) #r=01110010 + 50 + >>> bs + BitStream(pos=2:6) + >>> bs.peek(5) #j=01101010 + 9 + >>> bs.readBytes(2) + Traceback (most recent call last): + ... + ValueError: readBytes: need byte boundary + """, + +'Symbol': """ + >>> a=Symbol(MetablockLengthAlphabet(),5) + >>> len(a) + 2 + >>> int(a) + 5 + >>> a.bitPattern() + '01' + >>> a.value(200000) + 200001 + >>> a.explanation(300000) + 'data length: 493e0h+1=300001' + """, + +'RangeDecoder': """ + >>> a=RangeDecoder(bitLength=3) + >>> len(a) + 8 + >>> a.name='t' + >>> list(a) + [Symbol(t, 0), Symbol(t, 1), Symbol(t, 2), Symbol(t, 3), Symbol(t, 4), Symbol(t, 5), Symbol(t, 6), Symbol(t, 7)] + >>> a[2] + Symbol(t, 2) + >>> a.bitPattern(4) + '100' + >>> a.length(2) + 3 + >>> a.decodePeek(15) + (3, Symbol(t, 7)) + >>> + + """, + +'PrefixDecoder': """ + >>> a=PrefixDecoder(decodeTable={0:1,1:2,3:3,7:4}) + >>> len(a) + 4 + >>> a.name='t' + >>> list(a) + [Symbol(t, 1), Symbol(t, 2), Symbol(t, 3), Symbol(t, 4)] + >>> a.decodePeek(22) + (1, Symbol(t, 1)) + >>> a.decodePeek(27) + (3, Symbol(t, 3)) + >>> a.length(1) + 1 + >>> a.length(4) + 3 + """, + +'Code': """ + >>> a=Code('t',alphabetSize=10) + >>> len(a) + 10 + >>> a.showCode() + 0000:0 0001:1 0010:2 0011:3 0100:4 0101:5 0110:6 0111:7 1000:8 1001:9 + >>> a.setLength({2:1,3:2,5:3,6:3}) + >>> a.showCode() + 0:2 01:3 011:5 111:6 + >>> len(a) + 4 + >>> def callback(i): return 'call{}back'.format(i) + >>> a=Code('t',callback=callback,bitLength=3) + >>> a[6].explanation() + 'call6back' + """, + +'WithExtra': """ + >>> class A(WithExtra): + ... extraTable = [0,1,1,2,2] + >>> a=A('t',alphabetSize=5) + >>> a[1] + Symbol(t, 1) + >>> a.extraBits(2) + 1 + >>> a.mnemonic(4) + '4' + >>> a.readTupleAndExtra(BitStream(b'\x5b')) + (3, Symbol(t, 3), 2, 3) + """, + +'BoolCode': """ + >>> BoolCode('test')[0].explanation() + '0: False' + """, + +'Enumerator': """ + >>> class A(Enumerator): + ... extraTable = [0,1,1,2,2] + ... value0=3 + >>> a=A(alphabetLength=5) + >>> a.value(3) + Traceback (most recent call last): + ... + TypeError: value() missing 1 required positional argument: 'extra' + >>> a.explanation(3,4) + 'xx 011: 8-11; 8+4=12' + """, + +'WindowSizeAlphabet': """ + >>> windowSizeAlphabet = WindowSizeAlphabet() + >>> windowSizeAlphabet[0] + Traceback (most recent call last): + ... + ValueError: No symbol WindowSizeAlphabet[0] + >>> len(windowSizeAlphabet) + 16 + >>> windowSizeAlphabet[21] + Symbol(WSIZE, 21) + >>> windowSizeAlphabet[21].bitPattern() + '1001' + >>> windowSizeAlphabet[21].extraBits() + 0 + >>> windowSizeAlphabet[21].index + 21 + >>> windowSizeAlphabet[10].value() + 1008 + >>> windowSizeAlphabet[10].explanation() + 'windowsize=(1<<10)-16=1008' + >>> windowSizeAlphabet.showCode() + 0:65520 1100001:16368 1110001:32752 0011:262128 + 0000001:131056 0010001:None 1001:2097136 1011:4194288 + 1000001:4080 1010001:8176 0101:524272 0111:1048560 + 0100001:1008 0110001:2032 1101:8388592 1111:16777200 + """, + +'TypeCountAlphabet': """ + >>> typeCountAlphabet = TypeCountAlphabet(description='bananas') + >>> len(typeCountAlphabet) + 9 + >>> typeCountAlphabet[3] + Symbol(BT#, 3) + >>> typeCountAlphabet[9] + Traceback (most recent call last): + ... + ValueError: No symbol TypeCountAlphabet[9] + >>> print(typeCountAlphabet[3]) + xx,0101 + >>> typeCountAlphabet[8].value(127) + 256 + >>> typeCountAlphabet[4].explanation(2) + 'xxx,0111: 11 bananas' + >>> typeCountAlphabet[0].explanation() + '0: 1 banana' + """, + +'DistanceParamAlphabet': """ + >>> dpa = DistanceParamAlphabet() + >>> dpa.showCode() + 00:PF0 01:PF1 10:PF2 11:PF3 + >>> dpa.readTupleAndExtra(BitStream(b'\\x29')) + (2, Symbol(DIST, 1), 4, 10) + >>> dpa.explanation(2, 5) + '2 postfix bits and 0101<<2=20 direct codes' + """, + +'LiteralAlphabet': """ + >>> LiteralAlphabet(-1).showCode() #doctest: +ELLIPSIS + 00000000:\\x00 00110100:4 01101000:h 10011100:\\x9c 11010000:\\xd0 + 00000001:\\x01 00110101:5 01101001:i 10011101:\\x9d 11010001:\\xd1 + 00000010:\\x02 00110110:6 01101010:j 10011110:\\x9e 11010010:\\xd2 + ... + 00101111:/ 01100011:c 10010111:\\x97 11001011:\\xcb 11111111:\\xff + 00110000:0 01100100:d 10011000:\\x98 11001100:\\xcc + 00110001:1 01100101:e 10011001:\\x99 11001101:\\xcd + 00110010:2 01100110:f 10011010:\\x9a 11001110:\\xce + 00110011:3 01100111:g 10011011:\\x9b 11001111:\\xcf + """, + +'BlockCountAlphabet': """ + >>> bc=BlockCountAlphabet('BCL') + >>> len(bc) + 26 + >>> bs=BitStream(b'\\x40\\x83\\xc8\\x59\\12\\x02') + >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3]) + 'Block count: xx 00000: 1-4; 1+2=3' + >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3]) + 'Block count: xxx 00110: 33-40; 33+0=33' + >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3]) + 'Block count: xxxxxx 10001: 305-368; 305+28=333' + >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3]) + 'Block count: xxxxxxxxxxx 10110: 2289-4336; 2289+1044=3333' + """, + +'Layout': """ + >>> olleke.pos = 0 + >>> l = Layout(olleke) + >>> l.verboseRead(WindowSizeAlphabet()) + 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288 + 4194288 + >>> l.verboseRead(BoolCode('LAST', description="Last block")) + 1 LAST Last block: 1: True + True + >>> l.verboseRead(BoolCode('EMPTY', description="Empty block")) + 0 EMPTY Empty block: 0: False + False + >>> l.verboseRead(MetablockLengthAlphabet()) + 0001 2e 00 |00h|2Eh,|00 MLEN data length: 002eh+1=47 + 47 + >>> olleke.pos = 76 + >>> l = Layout(olleke) + >>> x = l.verboseRead(DistanceAlphabet(0,NPOSTFIX=0,NDIRECT=0), skipExtra=True) + 000a 82 10|1100 D0 10[15*x]-3 + >>> x.explanation(0x86a3) + '10[1000011010100011]-3: [0]+100000' + """, + +'olleke': """ + >>> olleke.pos = 0 + >>> try: Layout(olleke).processStream() + ... except NotImplementedError: pass + ... #doctest: +REPORT_NDIFF + addr hex binary context explanation + -----------------------Stream header------------------------ + 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288 + ======================Metablock header====================== + 1 LAST Last block: 1: True + 0 EMPTY Empty block: 0: False + 0001 2e 00 |00h|2Eh,|00 MLEN data length: 002eh+1=47 + -------------------Block type descriptors------------------- + 0003 00 0 BT#L 0: 1 literal block type + 0 BT#I 0: 1 insert© block type + 0 BT#D 0: 1 distance block type + ------------------Distance code parameters------------------ + 0004 44 0|000,00 DIST 0 postfix bits and 0000<<0=0 direct codes + -----------------------Context modes------------------------ + 10 LC0 Context mode for type 0: 2(UTF8) + ------------------------Context maps------------------------ + 0 LT# 0: 1 literal prefix tree + 0 DT# 0: 1 distance prefix tree + ---------------------Prefix code lists---------------------- + 10 PFX L0 is complex with lengths 3,4,0,5,17... + 0005 4f 1|0 ##L0 len 3: coded with 3 bits + 0111 ##L0 len 4: coded with 1 bits + 10 ##L0 unused: coded with 3 bits + 0006 d6 0|0 ##L0 len 5: skipped + 011 ##L0 zero xxx: coded with 2 bits + ***** Lengths for L0 will be coded as: + 0:len 4 01:zero xxx 011:unused 111:len 3 + 0007 95 1|11,01 #L0 7+3 unused + 0 #L0 Length for \\n is 4 bits + 001,01 #L0 1+3 unused + 0008 44 010,0|1 #L0 total 19+2 unused + 0 #L0 Length for " " is 4 bits + 0 #L0 Length for ! is 4 bits + 0009 cb 011,|01 #L0 3+3 unused + |110,01 #L0 total 35+6 unused + 000a 82 0 #L0 Length for K is 4 bits + 000,01 #L0 0+3 unused + 0 #L0 Length for O is 4 bits + 000b 4d 01|1 #L0 symbol P unused + 011 #L0 symbol Q unused + 0 #L0 Length for R is 4 bits + 000c 88 000,|01 #L0 0+3 unused + |100,01 #L0 total 11+4 unused + 000d b6 0 #L0 Length for b is 4 bits + 011 #L0 symbol c unused + 011 #L0 symbol d unused + 000e 27 11|1 #L0 Length for e is 3 bits + 010,01 #L0 2+3 unused + |0 #L0 Length for k is 4 bits + 000f 1f 111 #L0 Length for l is 3 bits + 011 #L0 symbol m unused + 0 #L0 Length for n is 4 bits + |0 #L0 Length for o is 4 bits + 0010 c1 000,01 #L0 0+3 unused + 0 #L0 Length for s is 4 bits + 0011 b4 0|11 #L0 symbol t unused + 0 #L0 Length for u is 4 bits + End of table. Prefix code L0: + 000:e 0010:\\n 0110:! 0001:O 0101:b 0011:n 0111:s + 100:l 1010:" " 1110:K 1001:R 1101:k 1011:o 1111:u + 11,01 PFX IC0 is simple with 4 code words + 0012 2a |2Ah|10 IC0 ? bits: I5C4 + 0013 b5 ec 00|B5h IC0 ? bits: I6+xC7 + 0015 22 0010|111011 IC0 ? bits: I8+xC5 + 0016 8c 001100|0010 IC0 ? bits: I0C14+xx + 0 SHAPE False: lengths 2,2,2,2 + 0017 74 10,0|1 PFX D0 is simple with 3 code words + 0018 a6 0|01110 D0 1 bit: 2last-3 + 010011 D0 2 bits: 11xx-3 + 0019 aa 01010|1 D0 2 bits: 11xxx-3 + ====================Meta block contents===================== + |1,01 IC0 Literal: 9, copy: 5 + 001a 41 0001 0,0=L0 O + 100 0,48=L0 l + 001b a2 10|0 0,62=L0 l + 000 0,63=L0 e + 001c a1 1|101 0,59=L0 k + 000 0,63=L0 e + |1010 0,59=L0 " " + 001d b5 0101 0,11=L0 b + |1011 0,60=L0 o + 001e 24 0 0,3=D0 Distance: 2last-3=8 + Seen before: "lleke" + 0,10 IC0 Literal: 6, copy: 7 + |0010 0,59=L0 \\n + 001f 89 1001 0,7=L0 R + 000 0,52=L0 e + 0020 fa 010|1 0,58=L0 b + 1111 0,63=L0 u + 0021 eb 011|1 0,59=L0 s + 11,01 0,3=D0 Absolute value: 12 (pos 8) + Seen before: "olleke\\n" + 0022 db 01,1|1 IC0 Literal: 0, copy: 15 + |110,11 0,3=D0 Absolute value: 27 (pos 0) + Seen before: "Olleke bolleke\\n" + 0023 f8 00 IC0 Literal: 5, copy: 4 + 1110 0,7=L0 K + 0024 2c 00|11 0,52=L0 n + 1011 0,62=L0 o + 0025 0d 1|00 0,59=L0 l + 0110 0,63=L0 ! + """, + +'file': """ + >>> try: Layout(BitStream( + ... open("H:/Downloads/brotli-master/tests/testdata/10x10y.compressed",'rb') + ... .read())).processStream() + ... except NotImplementedError: pass + addr hex binary context explanation + -----------------------Stream header------------------------ + 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288 + ======================Metablock header====================== + 1 LAST Last block: 1: True + 0 EMPTY Empty block: 0: False + 0001 13 00 |00h|13h,|00 MLEN data length: 0013h+1=20 + -------------------Block type descriptors------------------- + 0003 00 0 BT#L 0: 1 literal block type + 0 BT#I 0: 1 insert© block type + 0 BT#D 0: 1 distance block type + ------------------Distance code parameters------------------ + 0004 a4 0|000,00 DIST 0 postfix bits and 0000<<0=0 direct codes + -----------------------Context modes------------------------ + 10 LC0 Context mode for type 0: 2(UTF8) + ------------------------Context maps------------------------ + 0 LT# 0: 1 literal prefix tree + 0 DT# 0: 1 distance prefix tree + ---------------------Prefix code lists---------------------- + 0005 b0 0|1,01 PFX L0 is simple with 2 code words + 0006 b2 0|1011000 L0 1 bit: X + 0007 ea 0|1011001 L0 1 bit: Y + 01,01 PFX IC0 is simple with 2 code words + 0008 81 0000001|111 IC0 1 bit: I1C9&D=0 + 0009 47 02 0|47h|1 IC0 1 bit: I1C9 + 00,01 PFX D0 is simple with 1 code word + 000b 8a 010|000 D0 0 bits: 10x-3 + ====================Meta block contents===================== + 1 IC0 Literal: 1, copy: 9 + 0 0,0=L0 X + 0,() 0,3=D0 Absolute value: 1 (pos 0) + Seen before: "XXXXXXXXX" + 0 IC0 Literal: 1, copy: 9, same distance + |1 0,54=L0 Y + Seen before: "YYYYYYYYY" + """, + +'XY': """ + >>> try: Layout(BitStream(brotli.compress('X'*10+'Y'*10))).processStream() + ... except NotImplementedError: pass + addr hex binary context explanation + -----------------------Stream header------------------------ + 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288 + ======================Metablock header====================== + 1 LAST Last block: 1: True + 0 EMPTY Empty block: 0: False + 0001 13 00 |00h|13h,|00 MLEN data length: 0013h+1=20 + -------------------Block type descriptors------------------- + 0003 00 0 BT#L 0: 1 literal block type + 0 BT#I 0: 1 insert© block type + 0 BT#D 0: 1 distance block type + ------------------Distance code parameters------------------ + 0004 a4 0|000,00 DIST 0 postfix bits and 0000<<0=0 direct codes + -----------------------Context modes------------------------ + 10 LC0 Context mode for type 0: 2(UTF8) + ------------------------Context maps------------------------ + 0 LT# 0: 1 literal prefix tree + 0 DT# 0: 1 distance prefix tree + ---------------------Prefix code lists---------------------- + 0005 b0 0|1,01 PFX L0 is simple with 2 code words + 0006 b2 0|1011000 L0 1 bit: X + 0007 82 0|1011001 L0 1 bit: Y + 00,01 PFX IC0 is simple with 1 code word + 0008 84 0000100|100 IC0 0 bits: I4C6&D=0 + 0009 00 00,0|1 PFX D0 is simple with 1 code word + 000a e0 0|00000 D0 0 bits: last + ====================Meta block contents===================== + () IC0 Literal: 4, copy: 6, same distance + 0 0,0=L0 X + 0 0,52=L0 X + 0 0,54=L0 X + 0 0,54=L0 X + Seen before: "XXXXXX" + () IC0 Literal: 4, copy: 6, same distance + 1 0,54=L0 Y + 1 0,54=L0 Y + |1 0,54=L0 Y + 000b 01 1 0,54=L0 Y + Seen before: "YYYYYY" + """, + +'empty': """ + >>> try: Layout(BitStream(b'\\x81\\x16\\x00\\x58')).processStream() + ... except NotImplementedError: pass + addr hex binary context explanation + -----------------------Stream header------------------------ + 0000 81 0000001 WSIZE windowsize=(1<<17)-16=131056 + ======================Metablock header====================== + |1 LAST Last block: 1: True + 0001 16 0 EMPTY Empty block: 0: False + 11 MLEN 11: empty block + 0 RSVD Reserved (must be zero) + 0002 00 000000|00,01 SKIP skip length: 0h+1=1 + |00 SKIP 2 bits ignored + Skipping to 4 + """, + +} + +if __name__=='__main__': + import sys + if len(sys.argv)>1: + l = Layout(BitStream(open(sys.argv[1],'rb').read())) + l.processStream() + else: + sys.path.append("h:/Persoonlijk/bin") + try: + import brotli + open('brotlidump.br', 'wb').write( + brotli.compress( + open('brotlidump.py', 'r').read() + )) + olleke = BitStream(brotli.compress( + 'Olleke bolleke\nRebusolleke\nOlleke bolleke\nKnol!')) + except ImportError: pass + import doctest + doctest.testmod(optionflags=doctest.REPORT_NDIFF + #|doctest.FAIL_FAST + ) -- cgit v1.1