debian-tablib/tablib/packages/xlrd3/biffh.py

621 lines
17 KiB
Python

# Support module for the xlrd3 package.
#
# Portions copyright (c) 2005-2008 Stephen John Machin, Lingfo Pty Ltd
# This module is part of the xlrd package, which is released under a
# BSD-style licence.
#
# 2010-12-08 mozman refactoring for python 3
# 2008-02-10 SJM BIFF2 BLANK record
# 2008-02-08 SJM Preparation for Excel 2.0 support
# 2008-02-02 SJM Added suffixes (_B2, _B2_ONLY, etc) on record names for
# biff_dump & biff_count
# 2007-12-04 SJM Added support for Excel 2.x (BIFF2) files.
# 2007-09-08 SJM Avoid crash when zero-length Unicode string missing options byte.
# 2007-04-22 SJM Remove experimental "trimming" facility.
import sys
from struct import unpack
# Python codec name to use for each codepage number listed here.
# Entries tagged "guess" are unconfirmed mappings.
encoding_from_codepage = {
    1200: 'utf_16_le',
    10000: 'mac_roman',
    10006: 'mac_greek',     # guess
    10007: 'mac_cyrillic',  # guess
    10029: 'mac_latin2',    # guess
    10079: 'mac_iceland',   # guess
    10081: 'mac_turkish',   # guess
    32768: 'mac_roman',
    32769: 'cp1252',
}
# some more guessing, for Indic scripts
# codepage 57000 range:
# 2 Devanagari [0]
# 3 Bengali [1]
# 4 Tamil [5]
# 5 Telugu [6]
# 6 Assamese [1] c.f. Bengali
# 7 Oriya [4]
# 8 Kannada [7]
# 9 Malayalam [8]
# 10 Gujarati [3]
# 11 Gurmukhi [2]
# Format type codes, numbered consecutively from zero.
(
    FUN,  # unknown
    FDT,  # date
    FNU,  # number
    FGE,  # general
    FTX,  # text
) = range(5)

# Longer aliases for the date and number codes.
DATEFORMAT = FDT
NUMBERFORMAT = FNU
# Cell type codes, numbered consecutively from zero.
(
    XL_CELL_EMPTY,
    XL_CELL_TEXT,
    XL_CELL_NUMBER,
    XL_CELL_DATE,
    XL_CELL_BOOLEAN,
    XL_CELL_ERROR,
    XL_CELL_BLANK,  # for use in debugging, gathering stats, etc
) = range(7)
# Text label for each numeric BIFF version code; 0 stands for "not BIFF".
biff_text_from_num = {
    0: "(not BIFF)",
    20: "2.0",
    21: "2.1",
    30: "3",
    40: "4S",
    45: "4W",
    50: "5",
    70: "7",
    80: "8",
    85: "8X",
}
# This dictionary can be used to produce a text version of the internal codes
# that Excel uses for error cells.
# Note: the text for code 0x2A is '#N/A' (no trailing '!'), matching what
# Excel itself displays for that error.
error_text_from_code = {
    0x00: '#NULL!',   # Intersection of two cell ranges is empty
    0x07: '#DIV/0!',  # Division by zero
    0x0F: '#VALUE!',  # Wrong type of operand
    0x17: '#REF!',    # Illegal or deleted cell reference
    0x1D: '#NAME?',   # Wrong function or range name
    0x24: '#NUM!',    # Value range overflow
    0x2A: '#N/A',     # Argument or function not available
}
BIFF_FIRST_UNICODE = 80

# Each of the next three lines binds two names to the same value.
XL_WORKBOOK_GLOBALS = WBKBLOBAL = 0x0005
XL_WORKBOOK_GLOBALS_4W = 0x0100
XL_WORKSHEET = WRKSHEET = 0x0010

XL_BOUNDSHEET_WORKSHEET = 0x00
XL_BOUNDSHEET_CHART = 0x02
XL_BOUNDSHEET_VB_MODULE = 0x06

# Record codes, in alphabetical order of name.  Values are written as
# 4-digit hex so they line up with the keys of biff_rec_name_dict.
# XL_RK2 = 0x7e
XL_ARRAY = 0x0221
XL_ARRAY2 = 0x0021
XL_BLANK = 0x0201
XL_BLANK_B2 = 0x0001
XL_BOF = 0x0809
XL_BOOLERR = 0x0205
XL_BOOLERR_B2 = 0x0005
XL_BOUNDSHEET = 0x0085
XL_BUILTINFMTCOUNT = 0x0056
XL_CF = 0x01B1
XL_CODEPAGE = 0x0042
XL_COLINFO = 0x007D
XL_COLUMNDEFAULT = 0x0020  # BIFF2 only
XL_COLWIDTH = 0x0024  # BIFF2 only
XL_CONDFMT = 0x01B0
XL_CONTINUE = 0x003C
XL_COUNTRY = 0x008C
XL_DATEMODE = 0x0022
XL_DEFAULTROWHEIGHT = 0x0225
XL_DEFCOLWIDTH = 0x0055
XL_DIMENSION = 0x0200
XL_DIMENSION2 = 0x0000
XL_EFONT = 0x0045
XL_EOF = 0x000A
XL_EXTERNNAME = 0x0023
XL_EXTERNSHEET = 0x0017
XL_EXTSST = 0x00FF
XL_FEAT11 = 0x0872
XL_FILEPASS = 0x002F
XL_FONT = 0x0031
XL_FONT_B3B4 = 0x0231
XL_FORMAT = 0x041E
XL_FORMAT2 = 0x001E  # BIFF2, BIFF3
XL_FORMULA = 0x0006
XL_FORMULA3 = 0x0206
XL_FORMULA4 = 0x0406
XL_GCW = 0x00AB
XL_INDEX = 0x020B
XL_INTEGER = 0x0002  # BIFF2 only
XL_IXFE = 0x0044  # BIFF2 only
XL_LABEL = 0x0204
XL_LABEL_B2 = 0x0004
XL_LABELRANGES = 0x015F
XL_LABELSST = 0x00FD
XL_MERGEDCELLS = 0x00E5
XL_MSO_DRAWING = 0x00EC
XL_MSO_DRAWING_GROUP = 0x00EB
XL_MSO_DRAWING_SELECTION = 0x00ED
XL_MULRK = 0x00BD
XL_MULBLANK = 0x00BE
XL_NAME = 0x0018
XL_NOTE = 0x001C
XL_NUMBER = 0x0203
XL_NUMBER_B2 = 0x0003
XL_OBJ = 0x005D
XL_PALETTE = 0x0092
XL_RK = 0x027E
XL_ROW = 0x0208
XL_ROW_B2 = 0x0008
XL_RSTRING = 0x00D6
XL_SHEETHDR = 0x008F  # BIFF4W only
XL_SHEETSOFFSET = 0x008E  # BIFF4W only
XL_SHRFMLA = 0x04BC
XL_SST = 0x00FC
XL_STANDARDWIDTH = 0x0099
XL_STRING = 0x0207
XL_STRING_B2 = 0x0007
XL_STYLE = 0x0293
XL_SUPBOOK = 0x01AE
XL_TABLEOP = 0x0236
XL_TABLEOP2 = 0x0037
XL_TABLEOP_B2 = 0x0036
XL_TXO = 0x01B6
XL_UNCALCED = 0x005E
XL_UNKNOWN = 0xFFFF
XL_WINDOW2 = 0x023E
XL_WRITEACCESS = 0x005C
XL_XF = 0x00E0
XL_XF2 = 0x0043  # BIFF2 version of XF record
XL_XF3 = 0x0243  # BIFF3 version of XF record
XL_XF4 = 0x0443  # BIFF4 version of XF record
# Integer associated with each BOF record code (presumably the length of
# that BOF variant's data -- TODO confirm against the reader code).
boflen = {0x0809: 8, 0x0409: 6, 0x0209: 6, 0x0009: 4}

# The same four BOF record codes, as a tuple.
bofcodes = (0x0809, 0x0409, 0x0209, 0x0009)

# The three FORMULA record codes.
XL_FORMULA_OPCODES = (0x0006, 0x0406, 0x0206)
# Record codes grouped under the name "cell opcodes"; order is preserved
# from the original listing.
_cell_opcode_list = (
    XL_BOOLERR,
    XL_FORMULA,
    XL_FORMULA3,
    XL_FORMULA4,
    XL_LABEL,
    XL_LABELSST,
    XL_MULRK,
    XL_NUMBER,
    XL_RK,
    XL_RSTRING,
)
# Display name for each known record code.  Keys are in ascending order;
# the section comments below only mark the numeric range of the keys.
biff_rec_name_dict = {
    # 0x0000 - 0x007F
    0x0000: 'DIMENSIONS_B2',
    0x0001: 'BLANK_B2',
    0x0002: 'INTEGER_B2_ONLY',
    0x0003: 'NUMBER_B2',
    0x0004: 'LABEL_B2',
    0x0005: 'BOOLERR_B2',
    0x0006: 'FORMULA',
    0x0007: 'STRING_B2',
    0x0008: 'ROW_B2',
    0x0009: 'BOF_B2',
    0x000A: 'EOF',
    0x000B: 'INDEX_B2_ONLY',
    0x000C: 'CALCCOUNT',
    0x000D: 'CALCMODE',
    0x000E: 'PRECISION',
    0x000F: 'REFMODE',
    0x0010: 'DELTA',
    0x0011: 'ITERATION',
    0x0012: 'PROTECT',
    0x0013: 'PASSWORD',
    0x0014: 'HEADER',
    0x0015: 'FOOTER',
    0x0016: 'EXTERNCOUNT',
    0x0017: 'EXTERNSHEET',
    0x0018: 'NAME_B2,5+',
    0x0019: 'WINDOWPROTECT',
    0x001A: 'VERTICALPAGEBREAKS',
    0x001B: 'HORIZONTALPAGEBREAKS',
    0x001C: 'NOTE',
    0x001D: 'SELECTION',
    0x001E: 'FORMAT_B2-3',
    0x001F: 'BUILTINFMTCOUNT_B2',
    0x0020: 'COLUMNDEFAULT_B2_ONLY',
    0x0021: 'ARRAY_B2_ONLY',
    0x0022: 'DATEMODE',
    0x0023: 'EXTERNNAME',
    0x0024: 'COLWIDTH_B2_ONLY',
    0x0025: 'DEFAULTROWHEIGHT_B2_ONLY',
    0x0026: 'LEFTMARGIN',
    0x0027: 'RIGHTMARGIN',
    0x0028: 'TOPMARGIN',
    0x0029: 'BOTTOMMARGIN',
    0x002A: 'PRINTHEADERS',
    0x002B: 'PRINTGRIDLINES',
    0x002F: 'FILEPASS',
    0x0031: 'FONT',
    0x0032: 'FONT2_B2_ONLY',
    0x0036: 'TABLEOP_B2',
    0x0037: 'TABLEOP2_B2',
    0x003C: 'CONTINUE',
    0x003D: 'WINDOW1',
    0x003E: 'WINDOW2_B2',
    0x0040: 'BACKUP',
    0x0041: 'PANE',
    0x0042: 'CODEPAGE',
    0x0043: 'XF_B2',
    0x0044: 'IXFE_B2_ONLY',
    0x0045: 'EFONT_B2_ONLY',
    0x004D: 'PLS',
    0x0051: 'DCONREF',
    0x0055: 'DEFCOLWIDTH',
    0x0056: 'BUILTINFMTCOUNT_B3-4',
    0x0059: 'XCT',
    0x005A: 'CRN',
    0x005B: 'FILESHARING',
    0x005C: 'WRITEACCESS',
    0x005D: 'OBJECT',
    0x005E: 'UNCALCED',
    0x005F: 'SAVERECALC',
    0x0063: 'OBJECTPROTECT',
    0x007D: 'COLINFO',
    0x007E: 'RK2_mythical_?',
    # 0x0080 - 0x00FF
    0x0080: 'GUTS',
    0x0081: 'WSBOOL',
    0x0082: 'GRIDSET',
    0x0083: 'HCENTER',
    0x0084: 'VCENTER',
    0x0085: 'BOUNDSHEET',
    0x0086: 'WRITEPROT',
    0x008C: 'COUNTRY',
    0x008D: 'HIDEOBJ',
    0x008E: 'SHEETSOFFSET',
    0x008F: 'SHEETHDR',
    0x0090: 'SORT',
    0x0092: 'PALETTE',
    0x0099: 'STANDARDWIDTH',
    0x009B: 'FILTERMODE',
    0x009C: 'FNGROUPCOUNT',
    0x009D: 'AUTOFILTERINFO',
    0x009E: 'AUTOFILTER',
    0x00A0: 'SCL',
    0x00A1: 'SETUP',
    0x00AB: 'GCW',
    0x00BD: 'MULRK',
    0x00BE: 'MULBLANK',
    0x00C1: 'MMS',
    0x00D6: 'RSTRING',
    0x00D7: 'DBCELL',
    0x00DA: 'BOOKBOOL',
    0x00DD: 'SCENPROTECT',
    0x00E0: 'XF',
    0x00E1: 'INTERFACEHDR',
    0x00E2: 'INTERFACEEND',
    0x00E5: 'MERGEDCELLS',
    0x00E9: 'BITMAP',
    0x00EB: 'MSO_DRAWING_GROUP',
    0x00EC: 'MSO_DRAWING',
    0x00ED: 'MSO_DRAWING_SELECTION',
    0x00EF: 'PHONETIC',
    0x00FC: 'SST',
    0x00FD: 'LABELSST',
    0x00FF: 'EXTSST',
    # 0x0100 - 0x01FF
    0x013D: 'TABID',
    0x015F: 'LABELRANGES',
    0x0160: 'USESELFS',
    0x0161: 'DSF',
    0x01AE: 'SUPBOOK',
    0x01AF: 'PROTECTIONREV4',
    0x01B0: 'CONDFMT',
    0x01B1: 'CF',
    0x01B2: 'DVAL',
    0x01B6: 'TXO',
    0x01B7: 'REFRESHALL',
    0x01B8: 'HLINK',
    0x01BC: 'PASSWORDREV4',
    0x01BE: 'DV',
    0x01C0: 'XL9FILE',
    0x01C1: 'RECALCID',
    # 0x0200 - 0x02FF
    0x0200: 'DIMENSIONS',
    0x0201: 'BLANK',
    0x0203: 'NUMBER',
    0x0204: 'LABEL',
    0x0205: 'BOOLERR',
    0x0206: 'FORMULA_B3',
    0x0207: 'STRING',
    0x0208: 'ROW',
    0x0209: 'BOF',
    0x020B: 'INDEX_B3+',
    0x0218: 'NAME',
    0x0221: 'ARRAY',
    0x0223: 'EXTERNNAME_B3-4',
    0x0225: 'DEFAULTROWHEIGHT',
    0x0231: 'FONT_B3B4',
    0x0236: 'TABLEOP',
    0x023E: 'WINDOW2',
    0x0243: 'XF_B3',
    0x027E: 'RK',
    0x0293: 'STYLE',
    # 0x0400 and above
    0x0406: 'FORMULA_B4',
    0x0409: 'BOF',
    0x041E: 'FORMAT',
    0x0443: 'XF_B4',
    0x04BC: 'SHRFMLA',
    0x0800: 'QUICKTIP',
    0x0809: 'BOF',
    0x0862: 'SHEETLAYOUT',
    0x0867: 'SHEETPROTECTION',
    0x0868: 'RANGEPROTECTION',
}
class XLRDError(Exception):
    """Exception class of this module; adds nothing to ``Exception``."""
class BaseObject:
    """
    Parent of almost all other classes in the package. Supplies a shared
    'dump' method for debugging.
    """

    # Names of list/dict attributes that dump() prints in full (via repr)
    # instead of summarising as "type, len = N".
    _repr_these = []

    def dump(self, f=None, header=None, footer=None, indent=0):
        """
        Write every instance attribute, sorted by name, to a file object.

        Attributes that themselves have a 'dump' method are dumped
        recursively with 4 extra spaces of indentation, except for the one
        named 'book', which is printed with repr() like an ordinary value.

        :param f: open file object, to which the dump is written
                  (sys.stderr when None)
        :param header: text to write before the dump
        :param footer: text to write after the dump
        :param indent: number of leading spaces (for recursive calls)
        """
        out = sys.stderr if f is None else f
        margin = " " * indent

        if header is not None:
            print(header, file=out)

        for name, val in sorted(self.__dict__.items()):
            if getattr(val, 'dump', None) and name != 'book':
                sub_header = "%s%s (%s object):" % (
                    margin, name, val.__class__.__name__)
                val.dump(out, header=sub_header, indent=indent+4)
                continue
            if name not in self._repr_these and isinstance(val, (list, dict)):
                # Containers are summarised rather than printed in full.
                line = "%s%s: %s, len = %d" % (margin, name, type(val), len(val))
            else:
                line = "%s%s: %r" % (margin, name, val)
            print(line, file=out)

        if footer is not None:
            print(footer, file=out)
def fprintf(f, fmt, *vargs):
    """
    Print one %-formatted line to file object `f`.

    Trailing newlines are removed from `fmt` before formatting because
    print() appends a newline itself.

    :param f: open file object to write to
    :param fmt: %-style format string
    :param vargs: values substituted into `fmt`
    """
    text = fmt.rstrip('\n') % vargs
    print(text, file=f)
def upkbits(tgt_obj, src, manifest, local_setattr=setattr):
    """
    Extract bit fields from `src` and store them as attributes of `tgt_obj`.

    :param tgt_obj: object that receives the attributes
    :param src: integer holding the packed bit fields
    :param manifest: iterable of (shift, mask, attribute_name) triples; each
                     attribute is set to ``(src & mask) >> shift``
    :param local_setattr: callable used to set each attribute
    """
    for entry in manifest:
        shift, mask, attr_name = entry
        field = (src & mask) >> shift
        local_setattr(tgt_obj, attr_name, field)
def upkbitsL(tgt_obj, src, manifest, local_setattr=setattr, local_int=int):
    """
    Like upkbits(), but each extracted field is passed through `local_int`
    before being stored.

    :param tgt_obj: object that receives the attributes
    :param src: integer holding the packed bit fields
    :param manifest: iterable of (shift, mask, attribute_name) triples
    :param local_setattr: callable used to set each attribute
    :param local_int: callable applied to ``(src & mask) >> shift``
    """
    for entry in manifest:
        shift, mask, attr_name = entry
        field = local_int((src & mask) >> shift)
        local_setattr(tgt_obj, attr_name, field)
def unpack_string(data, pos, encoding, lenlen=1):
    """
    Decode a length-prefixed byte string found at `data[pos:]`.

    :param data: bytes-like buffer
    :param pos: offset of the length prefix
    :param encoding: codec name used to decode the string bytes
    :param lenlen: size of the little-endian length prefix: 1 or 2 bytes
    :returns: the decoded string
    """
    length_fmt = '<' + 'BH'[lenlen-1]
    (nbytes,) = unpack(length_fmt, data[pos:pos+lenlen])
    start = pos + lenlen
    return str(data[start:start+nbytes], encoding)
def unpack_string_update_pos(data, pos, encoding, lenlen=1, known_len=None):
    """
    Decode a byte string at `data[pos:]` and report where it ends.

    :param data: bytes-like buffer
    :param pos: offset of the length prefix, or of the string bytes
                themselves when `known_len` is given
    :param encoding: codec name used to decode the string bytes
    :param lenlen: size of the little-endian length prefix: 1 or 2 bytes
    :param known_len: string length supplied by the caller; when not None,
                      no length prefix is read
    :returns: tuple (decoded string, offset just past the string)
    """
    if known_len is None:
        (nbytes,) = unpack('<' + 'BH'[lenlen-1], data[pos:pos+lenlen])
        pos += lenlen
    else:
        # On a NAME record, the length byte is detached from the front of the string.
        nbytes = known_len
    end = pos + nbytes
    return (str(data[pos:end], encoding), end)
def unpack_unicode(data, pos, lenlen=2):
    """
    Return the string stored at `data[pos:]`.

    Layout read by this function: a little-endian character count of
    `lenlen` (1 or 2) bytes, an options byte, optional extra header fields,
    then the characters.  Options bit 0x01 selects uncompressed UTF-16-LE
    (2 bytes per character); otherwise there is 1 byte per character,
    decoded as latin_1.

    :param data: bytes buffer
    :param pos: offset of the character count
    :param lenlen: size in bytes of the character count
    :returns: the decoded string
    """
    (char_count,) = unpack('<' + 'BH'[lenlen-1], data[pos:pos+lenlen])
    if not char_count:
        # Ambiguous whether 0-length string should have an "options" byte.
        # Avoid crash if missing.
        return ""
    options_at = pos + lenlen
    options = data[options_at]
    start = options_at + 1
    if options & 0x08:  # richtext: skip a 2-byte field
        start += 2
    if options & 0x04:  # phonetic: skip a 4-byte field
        start += 4
    if options & 0x01:
        # Uncompressed UTF-16-LE
        return str(data[start:start+2*char_count], 'utf_16_le')
    # Note: this is COMPRESSED (not ASCII!) encoding!!!
    # Merely returning the raw bytes would work OK 99.99% of the time
    # if the local codepage was cp1252 -- however this would rapidly go pear-shaped
    # for other codepages so we grit our Anglocentric teeth and return Unicode :-)
    return str(data[start:start+char_count], "latin_1")
def unpack_unicode_update_pos(data, pos, lenlen=2, known_len=None):
    """
    Return (unicode_strg, updated value of pos).

    Reads the same layout as unpack_unicode(), and additionally steps over
    the data that follows the characters: 4 bytes for each unit of the
    2-byte richtext field, plus the number of bytes given by the 4-byte
    phonetic field.

    :param data: bytes buffer
    :param pos: offset of the character count, or of the options byte when
                `known_len` is given
    :param lenlen: size in bytes (1 or 2) of the character count
    :param known_len: character count supplied by the caller; when not
                      None, no count is read from `data`
    :returns: tuple (decoded string, offset just past everything consumed)
    """
    if known_len is None:
        (nchars,) = unpack('<' + 'BH'[lenlen-1], data[pos:pos+lenlen])
        pos += lenlen
    else:
        # On a NAME record, the length byte is detached from the front of the string.
        nchars = known_len
    if not nchars and not data[pos:]:
        # Zero-length string with no options byte
        return ("", pos)

    options = data[pos]
    pos += 1
    has_phonetic = options & 0x04
    has_richtext = options & 0x08

    # Sizes of the trailing areas; they stay 0 when the flag is not set.
    richtext_count = 0
    phonetic_size = 0
    if has_richtext:
        (richtext_count,) = unpack('<H', data[pos:pos+2])
        pos += 2
    if has_phonetic:
        (phonetic_size,) = unpack('<i', data[pos:pos+4])
        pos += 4

    if options & 0x01:
        # Uncompressed UTF-16-LE
        char_width, codec = 2, 'utf_16_le'
    else:
        # Note: this is COMPRESSED (not ASCII!) encoding!!!
        char_width, codec = 1, "latin_1"
    text_end = pos + char_width * nchars
    strg = str(data[pos:text_end], codec)

    return (strg, text_end + 4 * richtext_count + phonetic_size)
def unpack_cell_range_address_list_update_pos(
        output_list, data, pos, biff_version, addr_size=6):
    """
    Read a counted list of cell range addresses and append them to
    `output_list`.

    The list starts with a 2-byte little-endian count.  Each address holds
    four unsigned values (ra, rb, ca, cb), packed as "<HHBB" when
    `addr_size` is 6 and as "<HHHH" otherwise.  Each is appended as
    (ra, rb+1, ca, cb+1).

    :param output_list: list, updated in situ
    :param data: bytes-like buffer
    :param pos: offset of the 2-byte count
    :param biff_version: BIFF version number; below 80 only `addr_size` 6
                         is accepted, otherwise 6 or 8
    :param addr_size: size in bytes of one address
    :returns: offset just past the last address read
    """
    if biff_version < 80:
        assert addr_size == 6
    else:
        assert addr_size in (6, 8)
    (count,) = unpack("<H", data[pos:pos+2])
    pos += 2
    if not count:
        return pos
    fmt = "<HHBB" if addr_size == 6 else "<HHHH"
    list_end = pos + count * addr_size
    for start in range(pos, list_end, addr_size):
        ra, rb, ca, cb = unpack(fmt, data[start:start+addr_size])
        output_list.append((ra, rb+1, ca, cb+1))
    return list_end
def hex_char_dump(strg, ofs, dlen, base=0, fout=sys.stdout, unnumbered=False):
    """
    Write a hex-plus-characters dump of `strg[ofs:ofs+dlen]` to `fout`,
    at most 16 items per output line.

    In the character column a zero byte is shown as '~', any other value
    outside the range ' '..'~' as '?', and everything else as itself.

    :param strg: sequence of integers (e.g. a bytes object) to dump
    :param ofs: index of the first item to dump
    :param dlen: number of items to dump (clipped to the end of `strg`)
    :param base: number shown in the offset column for the item at `ofs`
    :param fout: file object that receives the dump lines
    :param unnumbered: when true, the offset column is left empty
    """
    stop = min(ofs + dlen, len(strg))
    show_offsets = not unnumbered
    num_prefix = ''
    cursor = ofs
    while cursor < stop:
        row_end = min(cursor + 16, stop)
        row = strg[cursor:row_end]
        row_len = row_end - cursor
        if row_len <= 0 or row_len != len(row):
            # Internal inconsistency: report it (on sys.stdout, not fout)
            # and give up.
            fprintf(
                sys.stdout,
                '??? hex_char_dump: ofs=%d dlen=%d base=%d -> endpos=%d pos=%d endsub=%d substrg=%r\n',
                ofs, dlen, base, stop, cursor, row_end, row)
            break
        hexd = ''.join("%02x " % item for item in row)
        glyphs = []
        for item in row:
            if item == 0:
                glyphs.append('~')
            elif ' ' <= chr(item) <= '~':
                glyphs.append(chr(item))
            else:
                glyphs.append('?')
        chard = ''.join(glyphs)
        if show_offsets:
            num_prefix = "%5d: " % (base+cursor-ofs)
        fprintf(fout, "%s %-48s %s\n", num_prefix, hexd, chard)
        cursor = row_end
def biff_dump(mem, stream_offset, stream_len, base=0, fout=sys.stdout,
              unnumbered=False):
    """
    Write a record-by-record dump of a BIFF stream to `fout`.

    Each record is read as a 4-byte little-endian header (record code,
    data length) followed by `length` data bytes, which are shown with
    hex_char_dump().  Consecutive all-zero headers are reported as a single
    "zero bytes skipped" line.

    :param mem: bytes object containing the stream
    :param stream_offset: index in `mem` of the first record header
    :param stream_len: number of bytes to dump
    :param base: number shown in the offset column for `stream_offset`
    :param fout: file object that receives the dump
    :param unnumbered: when true, the offset column is left empty
    """
    pos = stream_offset
    stream_end = stream_offset + stream_len
    adj = base - stream_offset
    dummies = 0
    numbered = not unnumbered
    num_prefix = ''
    while stream_end - pos >= 4:
        rc, length = unpack('<HH', mem[pos:pos+4])
        if rc == 0 and length == 0:
            # `mem` is bytes, so the padding test needs a bytes literal;
            # comparing against a str is never equal in Python 3, which
            # made this shortcut unreachable.
            if mem[pos:] == b'\0' * (stream_end - pos):
                # Everything left is zero padding: report it in one go.
                dummies = stream_end - pos
                savpos = pos
                pos = stream_end
                break
            if dummies:
                dummies += 4
            else:
                # First zero header of a run: remember where the run began.
                savpos = pos
                dummies = 4
            pos += 4
        else:
            if dummies:
                # A real record ends the current run of zero headers.
                if numbered:
                    num_prefix = "%5d: " % (adj + savpos)
                fprintf(fout, "%s---- %d zero bytes skipped ----\n",
                        num_prefix, dummies)
                dummies = 0
            recname = biff_rec_name_dict.get(rc, '<UNKNOWN>')
            if numbered:
                num_prefix = "%5d: " % (adj + pos)
            fprintf(fout, "%s%04x %s len = %04x (%d)\n",
                    num_prefix, rc, recname, length, length)
            pos += 4
            hex_char_dump(mem, pos, length, adj+pos, fout, unnumbered)
            pos += length
    if dummies:
        if numbered:
            num_prefix = "%5d: " % (adj + savpos)
        fprintf(fout, "%s---- %d zero bytes skipped ----\n", num_prefix, dummies)
    if pos < stream_end:
        # Fewer than 4 bytes remain, so they cannot form a record header.
        if numbered:
            num_prefix = "%5d: " % (adj + pos)
        fprintf(fout, "%s---- Misc bytes at end ----\n", num_prefix)
        hex_char_dump(mem, pos, stream_end-pos, adj + pos, fout, unnumbered)
    elif pos > stream_end:
        fprintf(fout, "Last dumped record has length (%d) that is too large\n", length)
def biff_count_records(mem, stream_offset, stream_len, fout=sys.stdout):
    """
    Count the records of each kind in a BIFF stream and write the tallies
    to `fout`, one "count name" line per record name, sorted by name.

    Record codes missing from biff_rec_name_dict are counted under
    "Unknown_0xNNNN".  An all-zero header is counted as "<Dummy (zero)>",
    unless everything from it to the end of `mem` is zero padding of
    exactly the remaining stream length, in which case scanning stops.

    :param mem: bytes object containing the stream
    :param stream_offset: index in `mem` of the first record header
    :param stream_len: number of bytes to scan
    :param fout: file object that receives the tallies
    """
    pos = stream_offset
    stream_end = stream_offset + stream_len
    tally = {}
    while stream_end - pos >= 4:
        rc, length = unpack('<HH', mem[pos:pos+4])
        if rc == 0 and length == 0:
            # `mem` is bytes, so compare against a bytes literal; a str
            # operand never compares equal in Python 3, which caused the
            # trailing padding to be tallied as dummy records.
            if mem[pos:] == b'\0' * (stream_end - pos):
                break
            recname = "<Dummy (zero)>"
        else:
            recname = biff_rec_name_dict.get(rc, None)
            if recname is None:
                recname = "Unknown_0x%04X" % rc
        tally[recname] = tally.get(recname, 0) + 1
        pos += length + 4
    for recname, count in sorted(tally.items()):
        fprintf(fout, "%8d %s", count, recname)