debian-tablib/tablib/packages/xlrd3/biffh.py

# Support module for the xlrd3 package.
#
# Portions copyright (c) 2005-2008 Stephen John Machin, Lingfo Pty Ltd
# This module is part of the xlrd package, which is released under a
# BSD-style licence.
#
# 2010-12-08 mozman refactoring for python 3
# 2008-02-10 SJM BIFF2 BLANK record
# 2008-02-08 SJM Preparation for Excel 2.0 support
# 2008-02-02 SJM Added suffixes (_B2, _B2_ONLY, etc) on record names for
#                biff_dump & biff_count
# 2007-12-04 SJM Added support for Excel 2.x (BIFF2) files.
# 2007-09-08 SJM Avoid crash when zero-length Unicode string missing options byte.
# 2007-04-22 SJM Remove experimental "trimming" facility.

import sys
from struct import unpack

# Lookup from a numeric codepage identifier to the name of the Python codec
# used for it. Entries tagged "guess" were marked as unverified by the
# original author.
encoding_from_codepage = {
    1200: 'utf_16_le',
    10000: 'mac_roman',
    10006: 'mac_greek',     # guess
    10007: 'mac_cyrillic',  # guess
    10029: 'mac_latin2',    # guess
    10079: 'mac_iceland',   # guess
    10081: 'mac_turkish',   # guess
    32768: 'mac_roman',
    32769: 'cp1252',
}

# some more guessing, for Indic scripts
# codepage 57000 range:
# 2 Devanagari [0]
# 3 Bengali [1]
# 4 Tamil [5]
# 5 Telugu [6]
# 6 Assamese [1] c.f. Bengali
# 7 Oriya [4]
# 8 Kannada [7]
# 9 Malayalam [8]
# 10 Gujarati [3]
# 11 Gurmukhi [2]

# Format-type codes:
#   FUN = 0 (unknown), FDT = 1 (date), FNU = 2 (number),
#   FGE = 3 (general), FTX = 4 (text)
FUN, FDT, FNU, FGE, FTX = 0, 1, 2, 3, 4

# Alternate names for the date and number format-type codes.
DATEFORMAT, NUMBERFORMAT = FDT, FNU

# Cell-type codes, numbered consecutively from zero.
(
    XL_CELL_EMPTY,    # 0
    XL_CELL_TEXT,     # 1
    XL_CELL_NUMBER,   # 2
    XL_CELL_DATE,     # 3
    XL_CELL_BOOLEAN,  # 4
    XL_CELL_ERROR,    # 5
    XL_CELL_BLANK,    # 6 -- for use in debugging, gathering stats, etc
) = range(7)

# Display text for each numeric BIFF version code; 0 is the placeholder
# used for "(not BIFF)".
biff_text_from_num = dict((
    (0, "(not BIFF)"),
    (20, "2.0"),
    (21, "2.1"),
    (30, "3"),
    (40, "4S"),
    (45, "4W"),
    (50, "5"),
    (70, "7"),
    (80, "8"),
    (85, "8X"),
))

# This dictionary can be used to produce a text version of the internal codes
# that Excel uses for error cells. Here are its contents:
#
# Note: the text for 0x2A is '#N/A' -- unlike most of the other error
# texts it carries no trailing '!' or '?'.
error_text_from_code = {
    0x00: '#NULL!',  # Intersection of two cell ranges is empty
    0x07: '#DIV/0!', # Division by zero
    0x0F: '#VALUE!', # Wrong type of operand
    0x17: '#REF!',   # Illegal or deleted cell reference
    0x1D: '#NAME?',  # Wrong function or range name
    0x24: '#NUM!',   # Value range overflow
    0x2A: '#N/A',    # Argument or function not available
}

# 80 is the version code that biff_text_from_num renders as "8".
# NOTE(review): presumably the first BIFF version whose strings are stored
# as Unicode -- confirm against the code that compares versions with it.
BIFF_FIRST_UNICODE = 80

# NOTE(review): these look like stream/substream type codes -- confirm
# against the BOF-handling code. WBKBLOBAL and WRKSHEET are second names
# bound to the same values as XL_WORKBOOK_GLOBALS and XL_WORKSHEET.
XL_WORKBOOK_GLOBALS = WBKBLOBAL = 0x5
XL_WORKBOOK_GLOBALS_4W = 0x100
XL_WORKSHEET = WRKSHEET = 0x10

# NOTE(review): presumably the sheet-type values found in a BOUNDSHEET
# record -- confirm against the reader.
XL_BOUNDSHEET_WORKSHEET = 0x00
XL_BOUNDSHEET_CHART     = 0x02
XL_BOUNDSHEET_VB_MODULE = 0x06

# Numeric record identifiers, in roughly alphabetical order. Where a value
# is also a key of biff_rec_name_dict, that table supplies the display name
# printed by biff_dump and biff_count_records. Trailing comments on
# individual lines note records limited to particular BIFF versions.
# XL_RK2 = 0x7e
XL_ARRAY  = 0x0221
XL_ARRAY2 = 0x0021
XL_BLANK = 0x0201
XL_BLANK_B2 = 0x01
XL_BOF = 0x809
XL_BOOLERR = 0x205
XL_BOOLERR_B2 = 0x5
XL_BOUNDSHEET = 0x85
XL_BUILTINFMTCOUNT = 0x56
XL_CF = 0x01B1
XL_CODEPAGE = 0x42
XL_COLINFO = 0x7D
XL_COLUMNDEFAULT = 0x20 # BIFF2 only
XL_COLWIDTH = 0x24 # BIFF2 only
XL_CONDFMT = 0x01B0
XL_CONTINUE = 0x3c
XL_COUNTRY = 0x8C
XL_DATEMODE = 0x22
XL_DEFAULTROWHEIGHT = 0x0225
XL_DEFCOLWIDTH = 0x55
XL_DIMENSION = 0x200
XL_DIMENSION2 = 0x0
XL_EFONT = 0x45
XL_EOF = 0x0a
XL_EXTERNNAME = 0x23
XL_EXTERNSHEET = 0x17
XL_EXTSST = 0xff
XL_FEAT11 = 0x872
XL_FILEPASS = 0x2f
XL_FONT = 0x31
XL_FONT_B3B4 = 0x231
XL_FORMAT = 0x41e
XL_FORMAT2 = 0x1E # BIFF2, BIFF3
XL_FORMULA = 0x6
XL_FORMULA3 = 0x206
XL_FORMULA4 = 0x406
XL_GCW = 0xab
XL_INDEX = 0x20b
XL_INTEGER = 0x2 # BIFF2 only
XL_IXFE = 0x44 # BIFF2 only
XL_LABEL = 0x204
XL_LABEL_B2 = 0x04
XL_LABELRANGES = 0x15f
XL_LABELSST = 0xfd
XL_MERGEDCELLS = 0xE5
XL_MSO_DRAWING = 0x00EC
XL_MSO_DRAWING_GROUP = 0x00EB
XL_MSO_DRAWING_SELECTION = 0x00ED
XL_MULRK = 0xbd
XL_MULBLANK = 0xbe
XL_NAME = 0x18
XL_NOTE = 0x1c
XL_NUMBER = 0x203
XL_NUMBER_B2 = 0x3
XL_OBJ = 0x5D
XL_PALETTE = 0x92
XL_RK = 0x27e
XL_ROW = 0x208
XL_ROW_B2 = 0x08
XL_RSTRING = 0xd6
XL_SHEETHDR = 0x8F # BIFF4W only
XL_SHEETSOFFSET = 0x8E # BIFF4W only
XL_SHRFMLA = 0x04bc
XL_SST = 0xfc
XL_STANDARDWIDTH = 0x99
XL_STRING = 0x207
XL_STRING_B2 = 0x7
XL_STYLE = 0x293
XL_SUPBOOK = 0x1AE
XL_TABLEOP = 0x236
XL_TABLEOP2 = 0x37
XL_TABLEOP_B2 = 0x36
XL_TXO = 0x1b6
XL_UNCALCED = 0x5e
XL_UNKNOWN = 0xffff
XL_WINDOW2 = 0x023E
XL_WRITEACCESS = 0x5C
XL_XF = 0xe0
XL_XF2 = 0x0043 # BIFF2 version of XF record
XL_XF3 = 0x0243 # BIFF3 version of XF record
XL_XF4 = 0x0443 # BIFF4 version of XF record

# Keyed by the four BOF record identifiers (the same values as bofcodes).
# NOTE(review): the values are presumably the expected BOF record body
# length in bytes for each identifier -- confirm against the reader.
boflen = {
    0x0809: 8,
    0x0409: 6,
    0x0209: 6,
    0x0009: 4,
}

# The BOF record identifiers, in the same order as the keys of boflen.
bofcodes = (0x0809, 0x0409, 0x0209, 0x0009)

# Same values as XL_FORMULA, XL_FORMULA4 and XL_FORMULA3, in that order.
XL_FORMULA_OPCODES = (0x0006, 0x0406, 0x0206)

# Tuple of record identifiers grouped under the "cell" heading.
# NOTE(review): presumably the records that carry a cell value -- confirm
# against the code that consumes this tuple.
_cell_opcode_list = (
    XL_BOOLERR, XL_FORMULA, XL_FORMULA3, XL_FORMULA4, XL_LABEL,
    XL_LABELSST, XL_MULRK, XL_NUMBER, XL_RK, XL_RSTRING,
)

# Display name for each known record identifier, keyed by the 16-bit
# record code. biff_dump and biff_count_records look record codes up here
# when printing; codes that are absent are reported as unknown by those
# functions. Suffixes such as _B2, _B2_ONLY, _B3 and _B4 in the names
# distinguish variants of a record that belong to particular BIFF versions.
biff_rec_name_dict = {
    0x0000: 'DIMENSIONS_B2',
    0x0001: 'BLANK_B2',
    0x0002: 'INTEGER_B2_ONLY',
    0x0003: 'NUMBER_B2',
    0x0004: 'LABEL_B2',
    0x0005: 'BOOLERR_B2',
    0x0006: 'FORMULA',
    0x0007: 'STRING_B2',
    0x0008: 'ROW_B2',
    0x0009: 'BOF_B2',
    0x000A: 'EOF',
    0x000B: 'INDEX_B2_ONLY',
    0x000C: 'CALCCOUNT',
    0x000D: 'CALCMODE',
    0x000E: 'PRECISION',
    0x000F: 'REFMODE',
    0x0010: 'DELTA',
    0x0011: 'ITERATION',
    0x0012: 'PROTECT',
    0x0013: 'PASSWORD',
    0x0014: 'HEADER',
    0x0015: 'FOOTER',
    0x0016: 'EXTERNCOUNT',
    0x0017: 'EXTERNSHEET',
    0x0018: 'NAME_B2,5+',
    0x0019: 'WINDOWPROTECT',
    0x001A: 'VERTICALPAGEBREAKS',
    0x001B: 'HORIZONTALPAGEBREAKS',
    0x001C: 'NOTE',
    0x001D: 'SELECTION',
    0x001E: 'FORMAT_B2-3',
    0x001F: 'BUILTINFMTCOUNT_B2',
    0x0020: 'COLUMNDEFAULT_B2_ONLY',
    0x0021: 'ARRAY_B2_ONLY',
    0x0022: 'DATEMODE',
    0x0023: 'EXTERNNAME',
    0x0024: 'COLWIDTH_B2_ONLY',
    0x0025: 'DEFAULTROWHEIGHT_B2_ONLY',
    0x0026: 'LEFTMARGIN',
    0x0027: 'RIGHTMARGIN',
    0x0028: 'TOPMARGIN',
    0x0029: 'BOTTOMMARGIN',
    0x002A: 'PRINTHEADERS',
    0x002B: 'PRINTGRIDLINES',
    0x002F: 'FILEPASS',
    0x0031: 'FONT',
    0x0032: 'FONT2_B2_ONLY',
    0x0036: 'TABLEOP_B2',
    0x0037: 'TABLEOP2_B2',
    0x003C: 'CONTINUE',
    0x003D: 'WINDOW1',
    0x003E: 'WINDOW2_B2',
    0x0040: 'BACKUP',
    0x0041: 'PANE',
    0x0042: 'CODEPAGE',
    0x0043: 'XF_B2',
    0x0044: 'IXFE_B2_ONLY',
    0x0045: 'EFONT_B2_ONLY',
    0x004D: 'PLS',
    0x0051: 'DCONREF',
    0x0055: 'DEFCOLWIDTH',
    0x0056: 'BUILTINFMTCOUNT_B3-4',
    0x0059: 'XCT',
    0x005A: 'CRN',
    0x005B: 'FILESHARING',
    0x005C: 'WRITEACCESS',
    0x005D: 'OBJECT',
    0x005E: 'UNCALCED',
    0x005F: 'SAVERECALC',
    0x0063: 'OBJECTPROTECT',
    0x007D: 'COLINFO',
    0x007E: 'RK2_mythical_?',
    0x0080: 'GUTS',
    0x0081: 'WSBOOL',
    0x0082: 'GRIDSET',
    0x0083: 'HCENTER',
    0x0084: 'VCENTER',
    0x0085: 'BOUNDSHEET',
    0x0086: 'WRITEPROT',
    0x008C: 'COUNTRY',
    0x008D: 'HIDEOBJ',
    0x008E: 'SHEETSOFFSET',
    0x008F: 'SHEETHDR',
    0x0090: 'SORT',
    0x0092: 'PALETTE',
    0x0099: 'STANDARDWIDTH',
    0x009B: 'FILTERMODE',
    0x009C: 'FNGROUPCOUNT',
    0x009D: 'AUTOFILTERINFO',
    0x009E: 'AUTOFILTER',
    0x00A0: 'SCL',
    0x00A1: 'SETUP',
    0x00AB: 'GCW',
    0x00BD: 'MULRK',
    0x00BE: 'MULBLANK',
    0x00C1: 'MMS',
    0x00D6: 'RSTRING',
    0x00D7: 'DBCELL',
    0x00DA: 'BOOKBOOL',
    0x00DD: 'SCENPROTECT',
    0x00E0: 'XF',
    0x00E1: 'INTERFACEHDR',
    0x00E2: 'INTERFACEEND',
    0x00E5: 'MERGEDCELLS',
    0x00E9: 'BITMAP',
    0x00EB: 'MSO_DRAWING_GROUP',
    0x00EC: 'MSO_DRAWING',
    0x00ED: 'MSO_DRAWING_SELECTION',
    0x00EF: 'PHONETIC',
    0x00FC: 'SST',
    0x00FD: 'LABELSST',
    0x00FF: 'EXTSST',
    0x013D: 'TABID',
    0x015F: 'LABELRANGES',
    0x0160: 'USESELFS',
    0x0161: 'DSF',
    0x01AE: 'SUPBOOK',
    0x01AF: 'PROTECTIONREV4',
    0x01B0: 'CONDFMT',
    0x01B1: 'CF',
    0x01B2: 'DVAL',
    0x01B6: 'TXO',
    0x01B7: 'REFRESHALL',
    0x01B8: 'HLINK',
    0x01BC: 'PASSWORDREV4',
    0x01BE: 'DV',
    0x01C0: 'XL9FILE',
    0x01C1: 'RECALCID',
    0x0200: 'DIMENSIONS',
    0x0201: 'BLANK',
    0x0203: 'NUMBER',
    0x0204: 'LABEL',
    0x0205: 'BOOLERR',
    0x0206: 'FORMULA_B3',
    0x0207: 'STRING',
    0x0208: 'ROW',
    0x0209: 'BOF',
    0x020B: 'INDEX_B3+',
    0x0218: 'NAME',
    0x0221: 'ARRAY',
    0x0223: 'EXTERNNAME_B3-4',
    0x0225: 'DEFAULTROWHEIGHT',
    0x0231: 'FONT_B3B4',
    0x0236: 'TABLEOP',
    0x023E: 'WINDOW2',
    0x0243: 'XF_B3',
    0x027E: 'RK',
    0x0293: 'STYLE',
    0x0406: 'FORMULA_B4',
    0x0409: 'BOF',
    0x041E: 'FORMAT',
    0x0443: 'XF_B4',
    0x04BC: 'SHRFMLA',
    0x0800: 'QUICKTIP',
    0x0809: 'BOF',
    0x0862: 'SHEETLAYOUT',
    0x0867: 'SHEETPROTECTION',
    0x0868: 'RANGEPROTECTION',
}

class XLRDError(Exception):
    """Exception type defined by this package; adds nothing to Exception."""

class BaseObject:
    """
    Common base class providing a 'dump' method that prints an object's
    instance attributes for debugging.

    Subclasses may list attribute names in ``_repr_these`` to have list/dict
    attributes printed in full instead of being summarised.
    """
    _repr_these = []

    def dump(self, f=None, header=None, footer=None, indent=0):
        """
        Print every instance attribute, sorted by name, one per line.

        Attribute values that themselves expose a truthy ``dump`` attribute
        are dumped recursively with 4 extra spaces of indent, except for the
        attribute named 'book', which is printed with ``%r`` like a plain
        value. Lists and dicts are summarised as type and length unless the
        attribute name is in ``_repr_these``.

        :param f: open file object, to which the dump is written
                  (``sys.stderr`` when None)
        :param header: text to write before the dump
        :param footer: text to write after the dump
        :param indent: number of leading spaces (for recursive calls)
        """
        out = sys.stderr if f is None else f
        prefix = " " * indent

        if header is not None:
            print(header, file=out)

        for name, val in sorted(vars(self).items()):
            # Nested dumpable object: recurse, unless it is the 'book'
            # attribute.
            if getattr(val, 'dump', None) and name != 'book':
                sub_header = "%s%s (%s object):" % (
                    prefix, name, val.__class__.__name__)
                val.dump(out, header=sub_header, indent=indent+4)
                continue

            # Containers are summarised unless explicitly whitelisted.
            is_container = isinstance(val, (list, dict))
            if is_container and name not in self._repr_these:
                print("%s%s: %s, len = %d" % (prefix, name, type(val), len(val)),
                      file=out)
                continue

            print("%s%s: %r" % (prefix, name, val), file=out)

        if footer is not None:
            print(footer, file=out)

def fprintf(f, fmt, *vargs):
    """
    Format ``fmt % vargs`` and print it to ``f`` as one line.

    All trailing newline characters are stripped from ``fmt`` before
    formatting; ``print`` then supplies exactly one line terminator, whether
    or not ``fmt`` ended with a newline.

    :param f: open file object to write to
    :param fmt: %-style format string
    :param vargs: values substituted into ``fmt``
    """
    template = fmt.rstrip('\n')
    text = template % vargs
    print(text, file=f)

def upkbits(tgt_obj, src, manifest, local_setattr=setattr):
    """
    Unpack bit fields of ``src`` into attributes of ``tgt_obj``.

    :param tgt_obj: object that receives the attributes
    :param src: integer holding the packed bit fields
    :param manifest: iterable of ``(shift, mask, attr_name)`` triples; each
                     attribute is set to ``(src & mask) >> shift``
    :param local_setattr: callable used to store each value (``setattr`` by
                          default)
    """
    for shift, mask, name in manifest:
        field = (src & mask) >> shift
        local_setattr(tgt_obj, name, field)

def upkbitsL(tgt_obj, src, manifest, local_setattr=setattr, local_int=int):
    """
    Unpack bit fields of ``src`` into attributes of ``tgt_obj``, passing each
    extracted value through ``local_int`` before storing it.

    :param tgt_obj: object that receives the attributes
    :param src: integer holding the packed bit fields
    :param manifest: iterable of ``(shift, mask, attr_name)`` triples; each
                     attribute is set to ``local_int((src & mask) >> shift)``
    :param local_setattr: callable used to store each value (``setattr`` by
                          default)
    :param local_int: converter applied to each extracted field (``int`` by
                      default)
    """
    for shift, mask, name in manifest:
        field = local_int((src & mask) >> shift)
        local_setattr(tgt_obj, name, field)

def unpack_string(data, pos, encoding, lenlen=1):
    """
    Decode a length-prefixed string stored in ``data`` at ``pos``.

    :param data: bytes buffer containing the string
    :param pos: offset of the length prefix
    :param encoding: codec name used to decode the string bytes
    :param lenlen: size of the little-endian length prefix in bytes
                   (1 = unsigned byte, 2 = unsigned short)
    :return: the decoded string
    """
    length_fmt = '<' + 'BH'[lenlen-1]
    (byte_count,) = unpack(length_fmt, data[pos:pos+lenlen])
    start = pos + lenlen
    return str(data[start:start+byte_count], encoding)

def unpack_string_update_pos(data, pos, encoding, lenlen=1, known_len=None):
    """
    Decode a string from ``data`` and report where it ends.

    :param data: bytes buffer containing the string
    :param pos: offset of the length prefix, or of the first string byte
                when ``known_len`` is given
    :param encoding: codec name used to decode the string bytes
    :param lenlen: size of the little-endian length prefix in bytes
                   (1 = unsigned byte, 2 = unsigned short); ignored when
                   ``known_len`` is given
    :param known_len: string length supplied by the caller; when not None,
                      no length prefix is read at ``pos``
    :return: ``(decoded_string, offset_just_past_the_string)``
    """
    if known_len is None:
        (byte_count,) = unpack('<' + 'BH'[lenlen-1], data[pos:pos+lenlen])
        pos += lenlen
    else:
        # On a NAME record, the length byte is detached from the front of
        # the string, so the caller passes it in.
        byte_count = known_len

    end = pos + byte_count
    text = str(data[pos:end], encoding)
    return (text, end)

def unpack_unicode(data, pos, lenlen=2):
    """
    Return the string stored at ``pos`` in ``data``.

    Layout read: a little-endian character count of ``lenlen`` bytes, one
    options byte, an optional 2-byte field (options bit 0x08, rich text), an
    optional 4-byte field (options bit 0x04, phonetic), then the characters.
    Options bit 0x01 selects UTF-16-LE (2 bytes per character); otherwise
    each character is one byte, decoded as latin_1.

    :param data: bytes buffer containing the string
    :param pos: offset of the character count
    :param lenlen: size of the character count in bytes (1 or 2)
    :return: the decoded string ("" when the character count is zero)
    """
    (char_count,) = unpack('<' + 'BH'[lenlen-1], data[pos:pos+lenlen])
    if char_count == 0:
        # Ambiguous whether a 0-length string should have an "options"
        # byte, so do not try to read one -- it may be missing.
        return ""

    options_at = pos + lenlen
    flags = data[options_at]
    start = options_at + 1

    if flags & 0x08:
        # rich text: skip the 2-byte field
        start += 2
    if flags & 0x04:
        # phonetic: skip the 4-byte field
        start += 4

    if flags & 0x01:
        # Uncompressed UTF-16-LE
        return str(data[start:start+2*char_count], 'utf_16_le')

    # "Compressed" storage: one byte per character. This is NOT ASCII and
    # not the local codepage either; decoding as latin_1 yields the
    # intended Unicode characters.
    return str(data[start:start+char_count], "latin_1")

def unpack_unicode_update_pos(data, pos, lenlen=2, known_len=None):
    """
    Return ``(unicode_strg, updated value of pos)``.

    Layout read: a little-endian character count of ``lenlen`` bytes (unless
    ``known_len`` is given), one options byte, an optional 2-byte unsigned
    count (options bit 0x08, rich text), an optional 4-byte signed size
    (options bit 0x04, phonetic), then the characters. After the characters,
    4 bytes per rich-text count and then the phonetic size are skipped.

    :param data: bytes buffer containing the string
    :param pos: offset of the character count, or of the options byte when
                ``known_len`` is given
    :param lenlen: size of the character count in bytes (1 or 2); ignored
                   when ``known_len`` is given
    :param known_len: character count supplied by the caller; when not
                      None, no count is read at ``pos``
    :return: tuple of the decoded string and the offset just past all the
             data belonging to it
    """
    if known_len is None:
        (char_count,) = unpack('<' + 'BH'[lenlen-1], data[pos:pos+lenlen])
        pos += lenlen
    else:
        # On a NAME record, the length byte is detached from the front of
        # the string, so the caller passes it in.
        char_count = known_len

    if not char_count and not data[pos:]:
        # Zero-length string with no options byte
        return ("", pos)

    flags = data[pos]
    pos += 1

    # Sizes of the trailing blocks; they stay 0 when the matching option
    # bit is clear, so they can be added unconditionally at the end.
    rich_runs = 0
    phonetic_size = 0

    if flags & 0x08:
        (rich_runs,) = unpack('<H', data[pos:pos+2])
        pos += 2

    if flags & 0x04:
        (phonetic_size,) = unpack('<i', data[pos:pos+4])
        pos += 4

    if flags & 0x01:
        # Uncompressed UTF-16-LE: two bytes per character
        byte_count, codec = 2*char_count, 'utf_16_le'
    else:
        # "Compressed" (not ASCII!) storage: one byte per character
        byte_count, codec = char_count, "latin_1"

    text = str(data[pos:pos+byte_count], codec)
    pos += byte_count

    # Skip the rich-text block (4 bytes per run) and the phonetic block.
    pos += 4 * rich_runs
    pos += phonetic_size

    return (text, pos)

def unpack_cell_range_address_list_update_pos(
    output_list, data, pos, biff_version, addr_size=6):
    """
    Read a counted list of range addresses from ``data`` at ``pos``.

    The list starts with a little-endian unsigned 16-bit count, followed by
    that many fixed-size entries of four unsigned fields ``ra, rb, ca, cb``:
    two 16-bit then two 8-bit fields when ``addr_size`` is 6, four 16-bit
    fields when it is 8. Each entry is appended to ``output_list`` as
    ``(ra, rb+1, ca, cb+1)``.
    NOTE(review): presumably row/column first/last bounds, with the +1
    turning the upper bounds into exclusive ones -- confirm with callers.

    :param output_list: list that is updated in situ
    :param data: bytes buffer containing the address list
    :param pos: offset of the 16-bit entry count
    :param biff_version: BIFF version code; below 80 only ``addr_size`` 6
                         is accepted, otherwise 6 or 8 (checked by assert)
    :param addr_size: size in bytes of one entry
    :return: offset just past the last entry
    """
    permitted = (6,) if biff_version < 80 else (6, 8)
    assert addr_size in permitted

    (count,) = unpack("<H", data[pos:pos+2])
    first = pos + 2
    end = first + count * addr_size

    entry_fmt = "<HHBB" if addr_size == 6 else "<HHHH"
    # output_list is updated in situ, one entry at a time
    for start in range(first, end, addr_size):
        ra, rb, ca, cb = unpack(entry_fmt, data[start:start+addr_size])
        output_list.append((ra, rb+1, ca, cb+1))
    return end

def hex_char_dump(strg, ofs, dlen, base=0, fout=sys.stdout, unnumbered=False):
    """
    Write a combined hex and character dump of part of ``strg`` to ``fout``.

    The dumped span starts at ``ofs`` and covers ``dlen`` bytes, cut short at
    the end of ``strg``. Each output line shows up to 16 bytes as two-digit
    hex values followed by their character form: zero bytes appear as '~',
    bytes in the range ' '..'~' as themselves, and all others as '?'.

    :param strg: bytes buffer to dump (iterating it must yield integers)
    :param ofs: offset in ``strg`` of the first byte to dump
    :param dlen: number of bytes to dump
    :param base: number shown as the label of the byte at ``ofs``
    :param fout: open file object that receives the dump lines
    :param unnumbered: when true, lines carry no numeric label
    """
    stop = min(ofs + dlen, len(strg))
    label = ''
    cursor = ofs
    while cursor < stop:
        row_stop = min(cursor + 16, stop)
        row = strg[cursor:row_stop]
        row_len = row_stop - cursor
        if row_len <= 0 or row_len != len(row):
            # Slice came back with an unexpected size; report it and stop.
            # Note that this diagnostic goes to sys.stdout, not to fout.
            fprintf(
                sys.stdout,
                '??? hex_char_dump: ofs=%d dlen=%d base=%d -> endpos=%d pos=%d endsub=%d substrg=%r\n',
                ofs, dlen, base, stop, cursor, row_stop, row)
            break

        hex_text = ''.join("%02x " % byte for byte in row)

        glyphs = []
        for byte in row:
            if byte == 0:
                glyphs.append('~')
                continue
            ch = chr(byte)
            glyphs.append(ch if ' ' <= ch <= '~' else '?')

        if not unnumbered:
            label = "%5d: " % (base + cursor - ofs)
        fprintf(fout, "%s     %-48s %s\n", label, hex_text, ''.join(glyphs))
        cursor = row_stop

def biff_dump(mem, stream_offset, stream_len, base=0, fout=sys.stdout,
              unnumbered=False):
    """
    Write a record-by-record dump of a BIFF stream held in ``mem``.

    The stream is read as a sequence of records, each a 4-byte header (two
    little-endian unsigned 16-bit values: record code and data length)
    followed by ``length`` data bytes. For every record a header line with
    the code, its name from ``biff_rec_name_dict`` ('<UNKNOWN>' when absent)
    and its length is written, followed by a hex/character dump of the data.

    Headers that are entirely zero are not dumped; consecutive ones are
    reported as a single "zero bytes skipped" line. If everything from such
    a header to the end of the stream is zero, the whole remainder is
    reported at once and the dump ends. Fewer than 4 leftover bytes are
    dumped as "Misc bytes at end"; a final record that runs past the end of
    the stream is reported as too large.

    :param mem: bytes buffer containing the stream
    :param stream_offset: offset in ``mem`` of the first record
    :param stream_len: length of the stream in bytes
    :param base: number shown as the label of the byte at ``stream_offset``
    :param fout: open file object that receives the dump
    :param unnumbered: when true, lines carry no numeric label
    """
    pos = stream_offset
    stream_end = stream_offset + stream_len
    # Added to a position in mem to obtain the label shown for it.
    adj = base - stream_offset
    dummies = 0
    numbered = not unnumbered
    num_prefix = ''
    while stream_end - pos >= 4:
        rc, length = unpack('<HH', mem[pos:pos+4])
        if rc == 0 and length == 0:
            # mem holds bytes, so the zero run must be bytes as well: in
            # Python 3 a bytes object never compares equal to a str. The
            # slice stops at stream_end so that data lying beyond the
            # stream cannot defeat the comparison.
            if mem[pos:stream_end] == b'\0' * (stream_end - pos):
                dummies = stream_end - pos
                savpos = pos
                pos = stream_end
                break

            # Isolated all-zero header: extend the current run of skipped
            # bytes, or start a new one at this position.
            if dummies:
                dummies += 4
            else:
                savpos = pos
                dummies = 4
            pos += 4
        else:
            if dummies:
                # Report the run of zero headers that has just ended.
                if numbered:
                    num_prefix =  "%5d: " % (adj + savpos)
                fprintf(fout, "%s---- %d zero bytes skipped ----\n",
                        num_prefix, dummies)
                dummies = 0

            recname = biff_rec_name_dict.get(rc, '<UNKNOWN>')
            if numbered:
                num_prefix = "%5d: " % (adj + pos)
            fprintf(fout, "%s%04x %s len = %04x (%d)\n",
                    num_prefix, rc, recname, length, length)
            pos += 4
            hex_char_dump(mem, pos, length, adj+pos, fout, unnumbered)
            pos += length
    if dummies:
        # The stream ended while a run of zero bytes was still pending.
        if numbered:
            num_prefix =  "%5d: " % (adj + savpos)
        fprintf(fout, "%s---- %d zero bytes skipped ----\n", num_prefix, dummies)

    if pos < stream_end:
        # Fewer than 4 bytes remain: too short to be a record header.
        if numbered:
            num_prefix = "%5d: " % (adj + pos)
        fprintf(fout, "%s---- Misc bytes at end ----\n", num_prefix)
        hex_char_dump(mem, pos, stream_end-pos, adj + pos, fout, unnumbered)
    elif pos > stream_end:
        fprintf(fout, "Last dumped record has length (%d) that is too large\n", length)

def biff_count_records(mem, stream_offset, stream_len, fout=sys.stdout):
    """
    Write a tally of the record types found in a BIFF stream.

    The stream is read as a sequence of records, each a 4-byte header (two
    little-endian unsigned 16-bit values: record code and data length)
    followed by ``length`` data bytes. Records are counted under their name
    from ``biff_rec_name_dict``, or under "Unknown_0x%04X" when the code is
    not in that table. An all-zero header is counted as "<Dummy (zero)>",
    unless everything from it to the end of the stream is zero, in which
    case scanning stops. One line per name, sorted by name, is written to
    ``fout``.

    :param mem: bytes buffer containing the stream
    :param stream_offset: offset in ``mem`` of the first record
    :param stream_len: length of the stream in bytes
    :param fout: open file object that receives the tally
    """
    pos = stream_offset
    stream_end = stream_offset + stream_len
    tally = {}
    while stream_end - pos >= 4:
        rc, length = unpack('<HH', mem[pos:pos+4])
        if rc == 0 and length == 0:
            # mem holds bytes, so the zero run must be bytes as well: in
            # Python 3 a bytes object never compares equal to a str. The
            # slice stops at stream_end so that data lying beyond the
            # stream cannot defeat the comparison.
            if mem[pos:stream_end] == b'\0' * (stream_end - pos):
                break
            recname = "<Dummy (zero)>"
        else:
            recname = biff_rec_name_dict.get(rc)
            if recname is None:
                recname = "Unknown_0x%04X" % rc
        tally[recname] = tally.get(recname, 0) + 1
        pos += length + 4
    for recname, count in sorted(tally.items()):
        # fprintf ends the line itself, so no newline is needed in the format.
        fprintf(fout, "%8d %s", count, recname)