xlrd-1.2.0/0000775000372000037200000000000013405237100013320 5ustar travistravis00000000000000xlrd-1.2.0/xlrd/0000775000372000037200000000000013405237100014271 5ustar travistravis00000000000000xlrd-1.2.0/xlrd/biffh.py0000664000372000037200000004041313405237033015730 0ustar travistravis00000000000000# -*- coding: utf-8 -*- # Portions copyright © 2005-2010 Stephen John Machin, Lingfo Pty Ltd # This module is part of the xlrd package, which is released under a # BSD-style licence. from __future__ import print_function import sys from struct import unpack from .timemachine import * DEBUG = 0 class XLRDError(Exception): """ An exception indicating problems reading data from an Excel file. """ class BaseObject(object): """ Parent of almost all other classes in the package. Defines a common :meth:`dump` method for debugging. """ _repr_these = [] def dump(self, f=None, header=None, footer=None, indent=0): """ :param f: open file object, to which the dump is written :param header: text to write before the dump :param footer: text to write after the dump :param indent: number of leading spaces (for recursive calls) """ if f is None: f = sys.stderr if hasattr(self, "__slots__"): alist = [] for attr in self.__slots__: alist.append((attr, getattr(self, attr))) else: alist = self.__dict__.items() alist = sorted(alist) pad = " " * indent if header is not None: print(header, file=f) list_type = type([]) dict_type = type({}) for attr, value in alist: if getattr(value, 'dump', None) and attr != 'book': value.dump(f, header="%s%s (%s object):" % (pad, attr, value.__class__.__name__), indent=indent+4) elif (attr not in self._repr_these and (isinstance(value, list_type) or isinstance(value, dict_type))): print("%s%s: %s, len = %d" % (pad, attr, type(value), len(value)), file=f) else: fprintf(f, "%s%s: %r\n", pad, attr, value) if footer is not None: print(footer, file=f) FUN, FDT, FNU, FGE, FTX = range(5) # unknown, date, number, general, text DATEFORMAT = FDT NUMBERFORMAT = FNU ( XL_CELL_EMPTY, XL_CELL_TEXT, XL_CELL_NUMBER, XL_CELL_DATE, XL_CELL_BOOLEAN, XL_CELL_ERROR, XL_CELL_BLANK, # for use in debugging, gathering stats, etc ) = range(7) biff_text_from_num = { 0: "(not BIFF)", 20: "2.0", 21: "2.1", 30: "3", 40: "4S", 45: "4W", 50: "5", 70: "7", 80: "8", 85: "8X", } #: This dictionary can be used to produce a text version of the internal codes #: that Excel uses for error cells. 
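# --- editor's illustrative sketch (addition, not part of xlrd) -----------
# BaseObject.dump (defined above) walks every attribute of a subclass,
# printing simple values with repr(), summarising lists/dicts by length,
# and recursing into attributes that themselves provide a dump() method.
# A minimal, hypothetical subclass showing the intended debugging use:
def _demo_dump():
    import sys
    class _Probe(BaseObject):          # BaseObject is defined earlier in this module
        def __init__(self):
            self.rowx = 3
            self.values = [1, 2, 3]    # reported as "type, len = 3"
    _Probe().dump(sys.stdout, header="--- _Probe ---", footer="--- end ---")
# --------------------------------------------------------------------------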
error_text_from_code = { 0x00: '#NULL!', # Intersection of two cell ranges is empty 0x07: '#DIV/0!', # Division by zero 0x0F: '#VALUE!', # Wrong type of operand 0x17: '#REF!', # Illegal or deleted cell reference 0x1D: '#NAME?', # Wrong function or range name 0x24: '#NUM!', # Value range overflow 0x2A: '#N/A', # Argument or function not available } BIFF_FIRST_UNICODE = 80 XL_WORKBOOK_GLOBALS = WBKBLOBAL = 0x5 XL_WORKBOOK_GLOBALS_4W = 0x100 XL_WORKSHEET = WRKSHEET = 0x10 XL_BOUNDSHEET_WORKSHEET = 0x00 XL_BOUNDSHEET_CHART = 0x02 XL_BOUNDSHEET_VB_MODULE = 0x06 # XL_RK2 = 0x7e XL_ARRAY = 0x0221 XL_ARRAY2 = 0x0021 XL_BLANK = 0x0201 XL_BLANK_B2 = 0x01 XL_BOF = 0x809 XL_BOOLERR = 0x205 XL_BOOLERR_B2 = 0x5 XL_BOUNDSHEET = 0x85 XL_BUILTINFMTCOUNT = 0x56 XL_CF = 0x01B1 XL_CODEPAGE = 0x42 XL_COLINFO = 0x7D XL_COLUMNDEFAULT = 0x20 # BIFF2 only XL_COLWIDTH = 0x24 # BIFF2 only XL_CONDFMT = 0x01B0 XL_CONTINUE = 0x3c XL_COUNTRY = 0x8C XL_DATEMODE = 0x22 XL_DEFAULTROWHEIGHT = 0x0225 XL_DEFCOLWIDTH = 0x55 XL_DIMENSION = 0x200 XL_DIMENSION2 = 0x0 XL_EFONT = 0x45 XL_EOF = 0x0a XL_EXTERNNAME = 0x23 XL_EXTERNSHEET = 0x17 XL_EXTSST = 0xff XL_FEAT11 = 0x872 XL_FILEPASS = 0x2f XL_FONT = 0x31 XL_FONT_B3B4 = 0x231 XL_FORMAT = 0x41e XL_FORMAT2 = 0x1E # BIFF2, BIFF3 XL_FORMULA = 0x6 XL_FORMULA3 = 0x206 XL_FORMULA4 = 0x406 XL_GCW = 0xab XL_HLINK = 0x01B8 XL_QUICKTIP = 0x0800 XL_HORIZONTALPAGEBREAKS = 0x1b XL_INDEX = 0x20b XL_INTEGER = 0x2 # BIFF2 only XL_IXFE = 0x44 # BIFF2 only XL_LABEL = 0x204 XL_LABEL_B2 = 0x04 XL_LABELRANGES = 0x15f XL_LABELSST = 0xfd XL_LEFTMARGIN = 0x26 XL_TOPMARGIN = 0x28 XL_RIGHTMARGIN = 0x27 XL_BOTTOMMARGIN = 0x29 XL_HEADER = 0x14 XL_FOOTER = 0x15 XL_HCENTER = 0x83 XL_VCENTER = 0x84 XL_MERGEDCELLS = 0xE5 XL_MSO_DRAWING = 0x00EC XL_MSO_DRAWING_GROUP = 0x00EB XL_MSO_DRAWING_SELECTION = 0x00ED XL_MULRK = 0xbd XL_MULBLANK = 0xbe XL_NAME = 0x18 XL_NOTE = 0x1c XL_NUMBER = 0x203 XL_NUMBER_B2 = 0x3 XL_OBJ = 0x5D XL_PAGESETUP = 0xA1 XL_PALETTE = 0x92 XL_PANE = 0x41 XL_PRINTGRIDLINES = 0x2B XL_PRINTHEADERS = 0x2A XL_RK = 0x27e XL_ROW = 0x208 XL_ROW_B2 = 0x08 XL_RSTRING = 0xd6 XL_SCL = 0x00A0 XL_SHEETHDR = 0x8F # BIFF4W only XL_SHEETPR = 0x81 XL_SHEETSOFFSET = 0x8E # BIFF4W only XL_SHRFMLA = 0x04bc XL_SST = 0xfc XL_STANDARDWIDTH = 0x99 XL_STRING = 0x207 XL_STRING_B2 = 0x7 XL_STYLE = 0x293 XL_SUPBOOK = 0x1AE # aka EXTERNALBOOK in OOo docs XL_TABLEOP = 0x236 XL_TABLEOP2 = 0x37 XL_TABLEOP_B2 = 0x36 XL_TXO = 0x1b6 XL_UNCALCED = 0x5e XL_UNKNOWN = 0xffff XL_VERTICALPAGEBREAKS = 0x1a XL_WINDOW2 = 0x023E XL_WINDOW2_B2 = 0x003E XL_WRITEACCESS = 0x5C XL_WSBOOL = XL_SHEETPR XL_XF = 0xe0 XL_XF2 = 0x0043 # BIFF2 version of XF record XL_XF3 = 0x0243 # BIFF3 version of XF record XL_XF4 = 0x0443 # BIFF4 version of XF record boflen = {0x0809: 8, 0x0409: 6, 0x0209: 6, 0x0009: 4} bofcodes = (0x0809, 0x0409, 0x0209, 0x0009) XL_FORMULA_OPCODES = (0x0006, 0x0406, 0x0206) _cell_opcode_list = [ XL_BOOLERR, XL_FORMULA, XL_FORMULA3, XL_FORMULA4, XL_LABEL, XL_LABELSST, XL_MULRK, XL_NUMBER, XL_RK, XL_RSTRING, ] _cell_opcode_dict = {} for _cell_opcode in _cell_opcode_list: _cell_opcode_dict[_cell_opcode] = 1 def is_cell_opcode(c): return c in _cell_opcode_dict def upkbits(tgt_obj, src, manifest, local_setattr=setattr): for n, mask, attr in manifest: local_setattr(tgt_obj, attr, (src & mask) >> n) def upkbitsL(tgt_obj, src, manifest, local_setattr=setattr, local_int=int): for n, mask, attr in manifest: local_setattr(tgt_obj, attr, local_int((src & mask) >> n)) def unpack_string(data, pos, encoding, lenlen=1): nchars = unpack('<' + 
'BH'[lenlen-1], data[pos:pos+lenlen])[0] pos += lenlen return unicode(data[pos:pos+nchars], encoding) def unpack_string_update_pos(data, pos, encoding, lenlen=1, known_len=None): if known_len is not None: # On a NAME record, the length byte is detached from the front of the string. nchars = known_len else: nchars = unpack('<' + 'BH'[lenlen-1], data[pos:pos+lenlen])[0] pos += lenlen newpos = pos + nchars return (unicode(data[pos:newpos], encoding), newpos) def unpack_unicode(data, pos, lenlen=2): "Return unicode_strg" nchars = unpack('<' + 'BH'[lenlen-1], data[pos:pos+lenlen])[0] if not nchars: # Ambiguous whether 0-length string should have an "options" byte. # Avoid crash if missing. return UNICODE_LITERAL("") pos += lenlen options = BYTES_ORD(data[pos]) pos += 1 # phonetic = options & 0x04 # richtext = options & 0x08 if options & 0x08: # rt = unpack(' endpos=%d pos=%d endsub=%d substrg=%r\n', ofs, dlen, base, endpos, pos, endsub, substrg) break hexd = ''.join("%02x " % BYTES_ORD(c) for c in substrg) chard = '' for c in substrg: c = chr(BYTES_ORD(c)) if c == '\0': c = '~' elif not (' ' <= c <= '~'): c = '?' chard += c if numbered: num_prefix = "%5d: " % (base+pos-ofs) fprintf(fout, "%s %-48s %s\n", num_prefix, hexd, chard) pos = endsub def biff_dump(mem, stream_offset, stream_len, base=0, fout=sys.stdout, unnumbered=False): pos = stream_offset stream_end = stream_offset + stream_len adj = base - stream_offset dummies = 0 numbered = not unnumbered num_prefix = '' while stream_end - pos >= 4: rc, length = unpack('') if numbered: num_prefix = "%5d: " % (adj + pos) fprintf(fout, "%s%04x %s len = %04x (%d)\n", num_prefix, rc, recname, length, length) pos += 4 hex_char_dump(mem, pos, length, adj+pos, fout, unnumbered) pos += length if dummies: if numbered: num_prefix = "%5d: " % (adj + savpos) fprintf(fout, "%s---- %d zero bytes skipped ----\n", num_prefix, dummies) if pos < stream_end: if numbered: num_prefix = "%5d: " % (adj + pos) fprintf(fout, "%s---- Misc bytes at end ----\n", num_prefix) hex_char_dump(mem, pos, stream_end-pos, adj + pos, fout, unnumbered) elif pos > stream_end: fprintf(fout, "Last dumped record has length (%d) that is too large\n", length) def biff_count_records(mem, stream_offset, stream_len, fout=sys.stdout): pos = stream_offset stream_end = stream_offset + stream_len tally = {} while stream_end - pos >= 4: rc, length = unpack('= 2: return tag[:pos], tag[pos:] return '', tag def augment_keys(adict, uri): # uri must already be enclosed in {} for x in list(adict.keys()): adict[uri + x] = adict[x] _UPPERCASE_1_REL_INDEX = {} # Used in fast conversion of column names (e.g. 
"XFD") to indices (16383) for _x in xrange(26): _UPPERCASE_1_REL_INDEX["ABCDEFGHIJKLMNOPQRSTUVWXYZ"[_x]] = _x + 1 for _x in "123456789": _UPPERCASE_1_REL_INDEX[_x] = 0 del _x def cell_name_to_rowx_colx(cell_name, letter_value=_UPPERCASE_1_REL_INDEX, allow_no_col=False): # Extract column index from cell name # A => 0, Z =>25, AA => 26, XFD => 16383 colx = 0 charx = -1 try: for c in cell_name: charx += 1 lv = letter_value[c] if lv: colx = colx * 26 + lv else: # start of row number; can't be '0' if charx == 0: # there was no col marker if allow_no_col: colx = None break else: raise Exception( 'Missing col in cell name %r', cell_name) else: colx = colx - 1 assert 0 <= colx < X12_MAX_COLS break except KeyError: raise Exception('Unexpected character %r in cell name %r' % (c, cell_name)) rowx = int(cell_name[charx:]) - 1 return rowx, colx error_code_from_text = {} for _code, _text in error_text_from_code.items(): error_code_from_text[_text] = _code # === X12 === Excel 2007 .xlsx =============================================== U_SSML12 = "{http://schemas.openxmlformats.org/spreadsheetml/2006/main}" U_ODREL = "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}" U_PKGREL = "{http://schemas.openxmlformats.org/package/2006/relationships}" U_CP = "{http://schemas.openxmlformats.org/package/2006/metadata/core-properties}" U_DC = "{http://purl.org/dc/elements/1.1/}" U_DCTERMS = "{http://purl.org/dc/terms/}" XML_SPACE_ATTR = "{http://www.w3.org/XML/1998/namespace}space" XML_WHITESPACE = "\t\n \r" X12_MAX_ROWS = 2 ** 20 X12_MAX_COLS = 2 ** 14 V_TAG = U_SSML12 + 'v' # cell child: value F_TAG = U_SSML12 + 'f' # cell child: formula IS_TAG = U_SSML12 + 'is' # cell child: inline string def unescape(s, subber=re.compile(r'_x[0-9A-Fa-f]{4,4}_', re.UNICODE).sub, repl=lambda mobj: unichr(int(mobj.group(0)[2:6], 16))): if "_" in s: return subber(repl, s) return s def cooked_text(self, elem): t = elem.text if t is None: return '' if elem.get(XML_SPACE_ATTR) != 'preserve': t = t.strip(XML_WHITESPACE) return ensure_unicode(unescape(t)) def get_text_from_si_or_is(self, elem, r_tag=U_SSML12+'r', t_tag=U_SSML12 +'t'): "Returns unescaped unicode" accum = [] for child in elem: # self.dump_elem(child) tag = child.tag if tag == t_tag: t = cooked_text(self, child) if t: # note: .text attribute can be None accum.append(t) elif tag == r_tag: for tnode in child: if tnode.tag == t_tag: t = cooked_text(self, tnode) if t: accum.append(t) return ''.join(accum) def map_attributes(amap, elem, obj): for xml_attr, obj_attr, cnv_func_or_const in amap: if not xml_attr: setattr(obj, obj_attr, cnv_func_or_const) continue if not obj_attr: continue #### FIX ME #### raw_value = elem.get(xml_attr) cooked_value = cnv_func_or_const(raw_value) setattr(obj, obj_attr, cooked_value) def cnv_ST_Xstring(s): if s is None: return "" return ensure_unicode(s) def cnv_xsd_unsignedInt(s): if not s: return None value = int(s) assert value >= 0 return value def cnv_xsd_boolean(s): if not s: return 0 if s in ("1", "true", "on"): return 1 if s in ("0", "false", "off"): return 0 raise ValueError("unexpected xsd:boolean value: %r" % s) _defined_name_attribute_map = ( ("name", "name", cnv_ST_Xstring, ), ("comment", "", cnv_ST_Xstring, ), ("customMenu", "", cnv_ST_Xstring, ), ("description", "", cnv_ST_Xstring, ), ("help", "", cnv_ST_Xstring, ), ("statusBar", "", cnv_ST_Xstring, ), ("localSheetId", "scope", cnv_xsd_unsignedInt, ), ("hidden", "hidden", cnv_xsd_boolean, ), ("function", "func", cnv_xsd_boolean, ), ("vbProcedure", "vbasic", 
cnv_xsd_boolean, ), ("xlm", "macro", cnv_xsd_boolean, ), ("functionGroupId", "funcgroup", cnv_xsd_unsignedInt, ), ("shortcutKey", "", cnv_ST_Xstring, ), ("publishToServer", "", cnv_xsd_boolean, ), ("workbookParameter", "", cnv_xsd_boolean, ), ("", "any_err", 0, ), ("", "any_external", 0, ), ("", "any_rel", 0, ), ("", "basic_formula_len", 0, ), ("", "binary", 0, ), ("", "builtin", 0, ), ("", "complex", 0, ), ("", "evaluated", 0, ), ("", "excel_sheet_index", 0, ), ("", "excel_sheet_num", 0, ), ("", "option_flags", 0, ), ("", "result", None, ), ("", "stack", None, ), ) def make_name_access_maps(bk): name_and_scope_map = {} # (name.lower(), scope): Name_object name_map = {} # name.lower() : list of Name_objects (sorted in scope order) num_names = len(bk.name_obj_list) for namex in xrange(num_names): nobj = bk.name_obj_list[namex] name_lcase = nobj.name.lower() key = (name_lcase, nobj.scope) if key in name_and_scope_map: msg = 'Duplicate entry %r in name_and_scope_map' % (key, ) if 0: raise XLRDError(msg) else: if bk.verbosity: print(msg, file=bk.logfile) name_and_scope_map[key] = nobj sort_data = (nobj.scope, namex, nobj) if name_lcase in name_map: name_map[name_lcase].append(sort_data) else: name_map[name_lcase] = [sort_data] for key in name_map.keys(): alist = name_map[key] alist.sort() name_map[key] = [x[2] for x in alist] bk.name_and_scope_map = name_and_scope_map bk.name_map = name_map class X12General(object): def process_stream(self, stream, heading=None): if self.verbosity >= 2 and heading is not None: fprintf(self.logfile, "\n=== %s ===\n", heading) self.tree = ET.parse(stream) getmethod = self.tag2meth.get for elem in self.tree.iter() if Element_has_iter else self.tree.getiterator(): if self.verbosity >= 3: self.dump_elem(elem) meth = getmethod(elem.tag) if meth: meth(self, elem) self.finish_off() def finish_off(self): pass def dump_elem(self, elem): fprintf(self.logfile, "===\ntag=%r len=%d attrib=%r text=%r tail=%r\n", split_tag(elem.tag)[1], len(elem), elem.attrib, elem.text, elem.tail) def dumpout(self, fmt, *vargs): text = (12 * ' ' + fmt + '\n') % vargs self.logfile.write(text) class X12Book(X12General): def __init__(self, bk, logfile=DLF, verbosity=False): self.bk = bk self.logfile = logfile self.verbosity = verbosity self.bk.nsheets = 0 self.bk.props = {} self.relid2path = {} self.relid2reltype = {} self.sheet_targets = [] # indexed by sheetx self.sheetIds = [] # indexed by sheetx core_props_menu = { U_CP+"lastModifiedBy": ("last_modified_by", cnv_ST_Xstring), U_DC+"creator": ("creator", cnv_ST_Xstring), U_DCTERMS+"modified": ("modified", cnv_ST_Xstring), U_DCTERMS+"created": ("created", cnv_ST_Xstring), } def process_coreprops(self, stream): if self.verbosity >= 2: fprintf(self.logfile, "\n=== coreProps ===\n") self.tree = ET.parse(stream) getmenu = self.core_props_menu.get props = {} for elem in self.tree.iter() if Element_has_iter else self.tree.getiterator(): if self.verbosity >= 3: self.dump_elem(elem) menu = getmenu(elem.tag) if menu: attr, func = menu value = func(elem.text) props[attr] = value self.bk.user_name = props.get('last_modified_by') or props.get('creator') self.bk.props = props if self.verbosity >= 2: fprintf(self.logfile, "props: %r\n", props) self.finish_off() @staticmethod def convert_filename(name): return name.replace('\\', '/').lower() def process_rels(self, stream): if self.verbosity >= 2: fprintf(self.logfile, "\n=== Relationships ===\n") tree = ET.parse(stream) r_tag = U_PKGREL + 'Relationship' for elem in tree.findall(r_tag): rid = elem.get('Id') 
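# --- editor's illustrative sketch (addition, not part of xlrd) -----------
# The statements that follow record each relationship id and normalise its
# Target: backslashes become slashes, the name is lower-cased, an absolute
# target ("/xl/...") loses its leading slash, and a relative target is
# resolved against "xl/".  A standalone restatement of that rule, using a
# hypothetical helper name:
def _normalise_rel_target(target):
    target = target.replace('\\', '/').lower()    # same as X12Book.convert_filename
    return target[1:] if target.startswith('/') else 'xl/' + target

# _normalise_rel_target('worksheets/sheet1.xml')  -> 'xl/worksheets/sheet1.xml'
# _normalise_rel_target('/xl/sharedStrings.xml')  -> 'xl/sharedstrings.xml'
# --------------------------------------------------------------------------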
target = X12Book.convert_filename(elem.get('Target')) reltype = elem.get('Type').split('/')[-1] if self.verbosity >= 2: self.dumpout('Id=%r Type=%r Target=%r', rid, reltype, target) self.relid2reltype[rid] = reltype # self.relid2path[rid] = 'xl/' + target if target.startswith('/'): self.relid2path[rid] = target[1:] # drop the / else: self.relid2path[rid] = 'xl/' + target def do_defined_name(self, elem): #### UNDER CONSTRUCTION #### if 0 and self.verbosity >= 3: self.dump_elem(elem) nobj = Name() bk = self.bk nobj.bk = bk nobj.name_index = len(bk.name_obj_list) bk.name_obj_list.append(nobj) nobj.name = elem.get('name') nobj.raw_formula = None # compiled bytecode formula -- not in XLSX nobj.formula_text = cooked_text(self, elem) map_attributes(_defined_name_attribute_map, elem, nobj) if nobj.scope is None: nobj.scope = -1 # global if nobj.name.startswith("_xlnm."): nobj.builtin = 1 if self.verbosity >= 2: nobj.dump(header='=== Name object ===') def do_defined_names(self, elem): for child in elem: self.do_defined_name(child) make_name_access_maps(self.bk) def do_sheet(self, elem): bk = self.bk sheetx = bk.nsheets # print elem.attrib rid = elem.get(U_ODREL + 'id') sheetId = int(elem.get('sheetId')) name = unescape(ensure_unicode(elem.get('name'))) reltype = self.relid2reltype[rid] target = self.relid2path[rid] if self.verbosity >= 2: self.dumpout( 'sheetx=%d sheetId=%r rid=%r type=%r name=%r', sheetx, sheetId, rid, reltype, name) if reltype != 'worksheet': if self.verbosity >= 2: self.dumpout('Ignoring sheet of type %r (name=%r)', reltype, name) return state = elem.get('state') visibility_map = { None: 0, 'visible': 0, 'hidden': 1, 'veryHidden': 2, } bk._sheet_visibility.append(visibility_map[state]) sheet = Sheet(bk, position=None, name=name, number=sheetx) sheet.utter_max_rows = X12_MAX_ROWS sheet.utter_max_cols = X12_MAX_COLS bk._sheet_list.append(sheet) bk._sheet_names.append(name) bk.nsheets += 1 self.sheet_targets.append(target) self.sheetIds.append(sheetId) def do_workbookpr(self, elem): datemode = cnv_xsd_boolean(elem.get('date1904')) if self.verbosity >= 2: self.dumpout('datemode=%r', datemode) self.bk.datemode = datemode tag2meth = { 'definedNames': do_defined_names, 'workbookPr': do_workbookpr, 'sheet': do_sheet, } augment_keys(tag2meth, U_SSML12) class X12SST(X12General): def __init__(self, bk, logfile=DLF, verbosity=0): self.bk = bk self.logfile = logfile self.verbosity = verbosity if ET_has_iterparse: self.process_stream = self.process_stream_iterparse else: self.process_stream = self.process_stream_findall def process_stream_iterparse(self, stream, heading=None): if self.verbosity >= 2 and heading is not None: fprintf(self.logfile, "\n=== %s ===\n", heading) si_tag = U_SSML12 + 'si' elemno = -1 sst = self.bk._sharedstrings for event, elem in ET.iterparse(stream): if elem.tag != si_tag: continue elemno = elemno + 1 if self.verbosity >= 3: fprintf(self.logfile, "element #%d\n", elemno) self.dump_elem(elem) result = get_text_from_si_or_is(self, elem) sst.append(result) elem.clear() # destroy all child elements if self.verbosity >= 2: self.dumpout('Entries in SST: %d', len(sst)) if self.verbosity >= 3: for x, s in enumerate(sst): fprintf(self.logfile, "SST x=%d s=%r\n", x, s) def process_stream_findall(self, stream, heading=None): if self.verbosity >= 2 and heading is not None: fprintf(self.logfile, "\n=== %s ===\n", heading) self.tree = ET.parse(stream) si_tag = U_SSML12 + 'si' elemno = -1 sst = self.bk._sharedstrings for elem in self.tree.findall(si_tag): elemno = elemno + 1 if 
self.verbosity >= 3: fprintf(self.logfile, "element #%d\n", elemno) self.dump_elem(elem) result = get_text_from_si_or_is(self, elem) sst.append(result) if self.verbosity >= 2: self.dumpout('Entries in SST: %d', len(sst)) class X12Styles(X12General): def __init__(self, bk, logfile=DLF, verbosity=0): self.bk = bk self.logfile = logfile self.verbosity = verbosity self.xf_counts = [0, 0] self.xf_type = None self.fmt_is_date = {} for x in list(range(14, 23)) + list(range(45, 48)): #### hard-coding FIX ME #### self.fmt_is_date[x] = 1 # dummy entry for XF 0 in case no Styles section self.bk._xf_index_to_xl_type_map[0] = 2 # fill_in_standard_formats(bk) #### pre-integration kludge def do_cellstylexfs(self, elem): self.xf_type = 0 def do_cellxfs(self, elem): self.xf_type = 1 def do_numfmt(self, elem): formatCode = ensure_unicode(elem.get('formatCode')) numFmtId = int(elem.get('numFmtId')) is_date = is_date_format_string(self.bk, formatCode) self.fmt_is_date[numFmtId] = is_date fmt_obj = Format(numFmtId, is_date + 2, formatCode) self.bk.format_map[numFmtId] = fmt_obj if self.verbosity >= 3: self.dumpout('numFmtId=%d formatCode=%r is_date=%d', numFmtId, formatCode, is_date) def do_xf(self, elem): if self.xf_type != 1: #### ignoring style XFs for the moment return xfx = self.xf_counts[self.xf_type] self.xf_counts[self.xf_type] = xfx + 1 xf = XF() self.bk.xf_list.append(xf) self.bk.xfcount += 1 numFmtId = int(elem.get('numFmtId', '0')) xf.format_key = numFmtId is_date = self.fmt_is_date.get(numFmtId, 0) self.bk._xf_index_to_xl_type_map[xfx] = is_date + 2 if self.verbosity >= 3: self.dumpout('xfx=%d numFmtId=%d', xfx, numFmtId) self.dumpout(repr(self.bk._xf_index_to_xl_type_map)) tag2meth = { 'cellStyleXfs': do_cellstylexfs, 'cellXfs': do_cellxfs, 'numFmt': do_numfmt, 'xf': do_xf, } augment_keys(tag2meth, U_SSML12) class X12Sheet(X12General): def __init__(self, sheet, logfile=DLF, verbosity=0): self.sheet = sheet self.logfile = logfile self.verbosity = verbosity self.rowx = -1 # We may need to count them. 
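# --- editor's illustrative sketch (addition, not part of xlrd) -----------
# X12Styles above treats built-in number formats 14-22 and 45-47 as dates,
# consults is_date_format_string() for custom formatCode strings, and stores
# each XF's cell type as is_date + 2 (XL_CELL_NUMBER == 2, XL_CELL_DATE == 3).
# A standalone restatement of that classification, with hypothetical names:
_BUILTIN_DATE_FMT_IDS = set(range(14, 23)) | set(range(45, 48))

def _xl_type_for_numfmt(num_fmt_id, is_date_format=None):
    # is_date_format: result of is_date_format_string() for a custom format,
    # or None when only the built-in numFmtId is known.
    if is_date_format is None:
        is_date_format = num_fmt_id in _BUILTIN_DATE_FMT_IDS
    return int(is_date_format) + 2    # 2 -> number cell, 3 -> date cell

# _xl_type_for_numfmt(14) -> 3 (date); _xl_type_for_numfmt(2) -> 2 (number)
# --------------------------------------------------------------------------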
self.bk = sheet.book self.sst = self.bk._sharedstrings self.relid2path = {} self.relid2reltype = {} self.merged_cells = sheet.merged_cells self.warned_no_cell_name = 0 self.warned_no_row_num = 0 if ET_has_iterparse: self.process_stream = self.own_process_stream def own_process_stream(self, stream, heading=None): if self.verbosity >= 2 and heading is not None: fprintf(self.logfile, "\n=== %s ===\n", heading) row_tag = U_SSML12 + "row" self_do_row = self.do_row for event, elem in ET.iterparse(stream): if elem.tag == row_tag: self_do_row(elem) elem.clear() # destroy all child elements (cells) elif elem.tag == U_SSML12 + "dimension": self.do_dimension(elem) elif elem.tag == U_SSML12 + "mergeCell": self.do_merge_cell(elem) self.finish_off() def process_rels(self, stream): if self.verbosity >= 2: fprintf(self.logfile, "\n=== Sheet Relationships ===\n") tree = ET.parse(stream) r_tag = U_PKGREL + 'Relationship' for elem in tree.findall(r_tag): rid = elem.get('Id') target = elem.get('Target') reltype = elem.get('Type').split('/')[-1] if self.verbosity >= 2: self.dumpout('Id=%r Type=%r Target=%r', rid, reltype, target) self.relid2reltype[rid] = reltype self.relid2path[rid] = normpath(join('xl/worksheets', target)) def process_comments_stream(self, stream): root = ET.parse(stream).getroot() author_list = root[0] assert author_list.tag == U_SSML12 + 'authors' authors = [elem.text for elem in author_list] comment_list = root[1] assert comment_list.tag == U_SSML12 + 'commentList' cell_note_map = self.sheet.cell_note_map from .sheet import Note text_tag = U_SSML12 + 'text' r_tag = U_SSML12 + 'r' t_tag = U_SSML12 + 't' for elem in comment_list.findall(U_SSML12 + 'comment'): ts = elem.findall('./' + text_tag + '/' + t_tag) ts += elem.findall('./' + text_tag + '/' + r_tag + '/' + t_tag) ref = elem.get('ref') note = Note() note.author = authors[int(elem.get('authorId'))] note.rowx, note.colx = coords = cell_name_to_rowx_colx(ref) note.text = '' for t in ts: note.text += cooked_text(self, t) cell_note_map[coords] = note def do_dimension(self, elem): ref = elem.get('ref') # example: "A1:Z99" or just "A1" if ref: # print >> self.logfile, "dimension: ref=%r" % ref last_cell_ref = ref.split(':')[-1] # example: "Z99" rowx, colx = cell_name_to_rowx_colx( last_cell_ref, allow_no_col=True) self.sheet._dimnrows = rowx + 1 if colx is not None: self.sheet._dimncols = colx + 1 def do_merge_cell(self, elem): # The ref attribute should be a cell range like "B1:D5". ref = elem.get('ref') if ref: try: first_cell_ref, last_cell_ref = ref.split(':') except ValueError: # encountered a single cell merge, e.g. "B3" first_cell_ref = ref last_cell_ref = ref first_rowx, first_colx = cell_name_to_rowx_colx(first_cell_ref) last_rowx, last_colx = cell_name_to_rowx_colx(last_cell_ref) self.merged_cells.append((first_rowx, last_rowx + 1, first_colx, last_colx + 1)) def do_row(self, row_elem): def bad_child_tag(child_tag): raise Exception('cell type %s has unexpected child <%s> at rowx=%r colx=%r' % (cell_type, child_tag, rowx, colx)) row_number = row_elem.get('r') if row_number is None: # Yes, it's optional. 
self.rowx += 1 explicit_row_number = 0 if self.verbosity and not self.warned_no_row_num: self.dumpout("no row number; assuming rowx=%d", self.rowx) self.warned_no_row_num = 1 else: self.rowx = int(row_number) - 1 explicit_row_number = 1 assert 0 <= self.rowx < X12_MAX_ROWS rowx = self.rowx colx = -1 if self.verbosity >= 3: self.dumpout(" row_number=%r rowx=%d explicit=%d", row_number, self.rowx, explicit_row_number) letter_value = _UPPERCASE_1_REL_INDEX for cell_elem in row_elem: cell_name = cell_elem.get('r') if cell_name is None: # Yes, it's optional. colx += 1 if self.verbosity and not self.warned_no_cell_name: self.dumpout("no cellname; assuming rowx=%d colx=%d", rowx, colx) self.warned_no_cell_name = 1 else: # Extract column index from cell name # A => 0, Z =>25, AA => 26, XFD => 16383 colx = 0 charx = -1 try: for c in cell_name: charx += 1 if c == '$': continue lv = letter_value[c] if lv: colx = colx * 26 + lv else: # start of row number; can't be '0' colx = colx - 1 assert 0 <= colx < X12_MAX_COLS break except KeyError: raise Exception('Unexpected character %r in cell name %r' % (c, cell_name)) if explicit_row_number and cell_name[charx:] != row_number: raise Exception('cell name %r but row number is %r' % (cell_name, row_number)) xf_index = int(cell_elem.get('s', '0')) cell_type = cell_elem.get('t', 'n') tvalue = None if cell_type == 'n': # n = number. Most frequent type. # child contains plain text which can go straight into float() # OR there's no text in which case it's a BLANK cell for child in cell_elem: child_tag = child.tag if child_tag == V_TAG: tvalue = child.text elif child_tag == F_TAG: # formula pass else: raise Exception('unexpected tag %r' % child_tag) if not tvalue: if self.bk.formatting_info: self.sheet.put_cell(rowx, colx, XL_CELL_BLANK, '', xf_index) else: self.sheet.put_cell(rowx, colx, None, float(tvalue), xf_index) elif cell_type == "s": # s = index into shared string table. 2nd most frequent type # child contains plain text which can go straight into int() for child in cell_elem: child_tag = child.tag if child_tag == V_TAG: tvalue = child.text elif child_tag == F_TAG: # formula not expected here, but gnumeric does it. pass else: bad_child_tag(child_tag) if not tvalue: # if self.bk.formatting_info: self.sheet.put_cell(rowx, colx, XL_CELL_BLANK, '', xf_index) else: value = self.sst[int(tvalue)] self.sheet.put_cell(rowx, colx, XL_CELL_TEXT, value, xf_index) elif cell_type == "str": # str = string result from formula. # Should have (formula) child; however in one file, all text cells are str with no formula. # child can contain escapes for child in cell_elem: child_tag = child.tag if child_tag == V_TAG: tvalue = cooked_text(self, child) elif child_tag == F_TAG: # formula pass else: bad_child_tag(child_tag) # assert tvalue is not None and formula is not None # Yuk. Fails with file created by gnumeric -- no tvalue! self.sheet.put_cell(rowx, colx, XL_CELL_TEXT, tvalue, xf_index) elif cell_type == "b": # b = boolean # child contains "0" or "1" for child in cell_elem: child_tag = child.tag if child_tag == V_TAG: tvalue = child.text elif child_tag == F_TAG: # formula pass else: bad_child_tag(child_tag) self.sheet.put_cell(rowx, colx, XL_CELL_BOOLEAN, cnv_xsd_boolean(tvalue), xf_index) elif cell_type == "e": # e = error # child contains e.g. "#REF!" 
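# --- editor's illustrative sketch (addition, not part of xlrd) -----------
# An error cell's v (value) child holds text such as "#REF!"; the code below
# maps it back to Excel's numeric error code through error_code_from_text
# (the inverse of biffh.error_text_from_code, built earlier in this module)
# and stores it as an XL_CELL_ERROR value.  Round-trip check as a small demo:
def _demo_error_codes():
    from xlrd.biffh import error_text_from_code
    code_from_text = {text: code for code, text in error_text_from_code.items()}
    assert code_from_text['#REF!'] == 0x17
    assert error_text_from_code[0x2A] == '#N/A'
    return code_from_text
# --------------------------------------------------------------------------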
tvalue = '#N/A' for child in cell_elem: child_tag = child.tag if child_tag == V_TAG: tvalue = child.text elif child_tag == F_TAG: # formula pass else: bad_child_tag(child_tag) value = error_code_from_text[tvalue] self.sheet.put_cell(rowx, colx, XL_CELL_ERROR, value, xf_index) elif cell_type == "inlineStr": # Not expected in files produced by Excel. # It's a way of allowing 3rd party s/w to write text (including rich text) cells # without having to build a shared string table for child in cell_elem: child_tag = child.tag if child_tag == IS_TAG: tvalue = get_text_from_si_or_is(self, child) elif child_tag == V_TAG: tvalue = child.text elif child_tag == F_TAG: # formula pass else: bad_child_tag(child_tag) if not tvalue: if self.bk.formatting_info: self.sheet.put_cell(rowx, colx, XL_CELL_BLANK, '', xf_index) else: self.sheet.put_cell(rowx, colx, XL_CELL_TEXT, tvalue, xf_index) else: raise Exception("Unknown cell type %r in rowx=%d colx=%d" % (cell_type, rowx, colx)) tag2meth = { 'row': do_row, } augment_keys(tag2meth, U_SSML12) def open_workbook_2007_xml(zf, component_names, logfile=sys.stdout, verbosity=0, use_mmap=0, formatting_info=0, on_demand=0, ragged_rows=0): ensure_elementtree_imported(verbosity, logfile) bk = Book() bk.logfile = logfile bk.verbosity = verbosity bk.formatting_info = formatting_info if formatting_info: raise NotImplementedError("formatting_info=True not yet implemented") bk.use_mmap = False #### Not supported initially bk.on_demand = on_demand if on_demand: if verbosity: print("WARNING *** on_demand=True not yet implemented; falling back to False", file=bk.logfile) bk.on_demand = False bk.ragged_rows = ragged_rows x12book = X12Book(bk, logfile, verbosity) zflo = zf.open(component_names['xl/_rels/workbook.xml.rels']) x12book.process_rels(zflo) del zflo zflo = zf.open(component_names['xl/workbook.xml']) x12book.process_stream(zflo, 'Workbook') del zflo props_name = 'docprops/core.xml' if props_name in component_names: zflo = zf.open(component_names[props_name]) x12book.process_coreprops(zflo) x12sty = X12Styles(bk, logfile, verbosity) if 'xl/styles.xml' in component_names: zflo = zf.open(component_names['xl/styles.xml']) x12sty.process_stream(zflo, 'styles') del zflo else: # seen in MS sample file MergedCells.xlsx pass sst_fname = 'xl/sharedstrings.xml' x12sst = X12SST(bk, logfile, verbosity) if sst_fname in component_names: zflo = zf.open(component_names[sst_fname]) x12sst.process_stream(zflo, 'SST') del zflo for sheetx in range(bk.nsheets): fname = x12book.sheet_targets[sheetx] zflo = zf.open(component_names[fname]) sheet = bk._sheet_list[sheetx] x12sheet = X12Sheet(sheet, logfile, verbosity) heading = "Sheet %r (sheetx=%d) from %r" % (sheet.name, sheetx, fname) x12sheet.process_stream(zflo, heading) del zflo rels_fname = 'xl/worksheets/_rels/%s.rels' % fname.rsplit('/', 1)[-1] if rels_fname in component_names: zfrels = zf.open(rels_fname) x12sheet.process_rels(zfrels) del zfrels for relid, reltype in x12sheet.relid2reltype.items(): if reltype == 'comments': comments_fname = x12sheet.relid2path.get(relid) if comments_fname and comments_fname in component_names: comments_stream = zf.open(comments_fname) x12sheet.process_comments_stream(comments_stream) del comments_stream sheet.tidy_dimensions() return bk xlrd-1.2.0/xlrd/compdoc.py0000664000372000037200000005067413405237033016310 0ustar travistravis00000000000000# -*- coding: utf-8 -*- # Copyright (c) 2005-2012 Stephen John Machin, Lingfo Pty Ltd # This module is part of the xlrd package, which is released under a # 
BSD-style licence. # No part of the content of this file was derived from the works of # David Giffin. """ Implements the minimal functionality required to extract a "Workbook" or "Book" stream (as one big string) from an OLE2 Compound Document file. """ from __future__ import print_function import array import sys from struct import unpack from .timemachine import * #: Magic cookie that should appear in the first 8 bytes of the file. SIGNATURE = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1" EOCSID = -2 FREESID = -1 SATSID = -3 MSATSID = -4 EVILSID = -5 class CompDocError(Exception): pass class DirNode(object): def __init__(self, DID, dent, DEBUG=0, logfile=sys.stdout): # dent is the 128-byte directory entry self.DID = DID self.logfile = logfile (cbufsize, self.etype, self.colour, self.left_DID, self.right_DID, self.root_DID) = \ unpack(' 20: # allows for 2**20 bytes i.e. 1MB print("WARNING: sector size (2**%d) is preposterous; assuming 512 and continuing ..." % ssz, file=logfile) ssz = 9 if sssz > ssz: print("WARNING: short stream sector size (2**%d) is preposterous; assuming 64 and continuing ..." % sssz, file=logfile) sssz = 6 self.sec_size = sec_size = 1 << ssz self.short_sec_size = 1 << sssz if self.sec_size != 512 or self.short_sec_size != 64: print("@@@@ sec_size=%d short_sec_size=%d" % (self.sec_size, self.short_sec_size), file=logfile) ( SAT_tot_secs, self.dir_first_sec_sid, _unused, self.min_size_std_stream, SSAT_first_sec_sid, SSAT_tot_secs, MSATX_first_sec_sid, MSATX_tot_secs, ) = unpack(' 1: print('MSATX: sid=%d (0x%08X)' % (sid, sid), file=logfile) if sid >= mem_data_secs: msg = "MSAT extension: accessing sector %d but only %d in file" % (sid, mem_data_secs) if DEBUG > 1: print(msg, file=logfile) break raise CompDocError(msg) elif sid < 0: raise CompDocError("MSAT extension: invalid sector id: %d" % sid) if seen[sid]: raise CompDocError("MSAT corruption: seen[%d] == %d" % (sid, seen[sid])) seen[sid] = 1 actual_MSATX_sectors += 1 if DEBUG and actual_MSATX_sectors > expected_MSATX_sectors: print("[1]===>>>", mem_data_secs, nent, SAT_sectors_reqd, expected_MSATX_sectors, actual_MSATX_sectors, file=logfile) offset = 512 + sec_size * sid MSAT.extend(unpack(fmt, mem[offset:offset+sec_size])) sid = MSAT.pop() # last sector id is sid of next sector in the chain if DEBUG and actual_MSATX_sectors != expected_MSATX_sectors: print("[2]===>>>", mem_data_secs, nent, SAT_sectors_reqd, expected_MSATX_sectors, actual_MSATX_sectors, file=logfile) if DEBUG: print("MSAT: len =", len(MSAT), file=logfile) dump_list(MSAT, 10, logfile) # # === build the SAT === # self.SAT = [] actual_SAT_sectors = 0 dump_again = 0 for msidx in xrange(len(MSAT)): msid = MSAT[msidx] if msid in (FREESID, EOCSID): # Specification: the MSAT array may be padded with trailing FREESID entries. # Toleration: a FREESID or EOCSID entry anywhere in the MSAT array will be ignored. 
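# --- editor's illustrative sketch (addition, not part of xlrd) -----------
# The MSAT/SAT being assembled here are "next sector" tables: entry i of the
# SAT holds the sector id that follows sector i in a stream's chain, with
# EOCSID (-2) terminating the chain and FREESID (-1) marking unused slots.
# A minimal standalone chain walk over a made-up table:
def _walk_sector_chain(sat, start_sid):
    chain = []
    sid = start_sid
    while sid >= 0:              # EOCSID / FREESID sentinels are negative
        chain.append(sid)
        sid = sat[sid]
    return chain

# _walk_sector_chain([2, -1, 5, -2, -1, 3], start_sid=0) -> [0, 2, 5, 3]
# --------------------------------------------------------------------------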
continue if msid >= mem_data_secs: if not trunc_warned: print("WARNING *** File is truncated, or OLE2 MSAT is corrupt!!", file=logfile) print("INFO: Trying to access sector %d but only %d available" % (msid, mem_data_secs), file=logfile) trunc_warned = 1 MSAT[msidx] = EVILSID dump_again = 1 continue elif msid < -2: raise CompDocError("MSAT: invalid sector id: %d" % msid) if seen[msid]: raise CompDocError("MSAT extension corruption: seen[%d] == %d" % (msid, seen[msid])) seen[msid] = 2 actual_SAT_sectors += 1 if DEBUG and actual_SAT_sectors > SAT_sectors_reqd: print("[3]===>>>", mem_data_secs, nent, SAT_sectors_reqd, expected_MSATX_sectors, actual_MSATX_sectors, actual_SAT_sectors, msid, file=logfile) offset = 512 + sec_size * msid self.SAT.extend(unpack(fmt, mem[offset:offset+sec_size])) if DEBUG: print("SAT: len =", len(self.SAT), file=logfile) dump_list(self.SAT, 10, logfile) # print >> logfile, "SAT ", # for i, s in enumerate(self.SAT): # print >> logfile, "entry: %4d offset: %6d, next entry: %4d" % (i, 512 + sec_size * i, s) # print >> logfile, "%d:%d " % (i, s), print(file=logfile) if DEBUG and dump_again: print("MSAT: len =", len(MSAT), file=logfile) dump_list(MSAT, 10, logfile) for satx in xrange(mem_data_secs, len(self.SAT)): self.SAT[satx] = EVILSID print("SAT: len =", len(self.SAT), file=logfile) dump_list(self.SAT, 10, logfile) # # === build the directory === # dbytes = self._get_stream( self.mem, 512, self.SAT, self.sec_size, self.dir_first_sec_sid, name="directory", seen_id=3) dirlist = [] did = -1 for pos in xrange(0, len(dbytes), 128): did += 1 dirlist.append(DirNode(did, dbytes[pos:pos+128], 0, logfile)) self.dirlist = dirlist _build_family_tree(dirlist, 0, dirlist[0].root_DID) # and stand well back ... if DEBUG: for d in dirlist: d.dump(DEBUG) # # === get the SSCS === # sscs_dir = self.dirlist[0] assert sscs_dir.etype == 5 # root entry if sscs_dir.first_SID < 0 or sscs_dir.tot_size == 0: # Problem reported by Frank Hoffsuemmer: some software was # writing -1 instead of -2 (EOCSID) for the first_SID # when the SCCS was empty. Not having EOCSID caused assertion # failure in _get_stream. # Solution: avoid calling _get_stream in any case when the # SCSS appears to be empty. 
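# --- editor's illustrative sketch (addition, not part of xlrd) -----------
# Whether a stream lives in the short-sector container (SSCS, chained via
# the SSAT) or in ordinary sectors (chained via the SAT) is decided by
# min_size_std_stream, read from the header above (commonly 4096 in real
# files -- that default is an assumption here, not a fixed constant):
def _stream_source(tot_size, min_size_std_stream=4096):
    return "SSCS/SSAT" if tot_size < min_size_std_stream else "main SAT"
# --------------------------------------------------------------------------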
self.SSCS = "" else: self.SSCS = self._get_stream( self.mem, 512, self.SAT, sec_size, sscs_dir.first_SID, sscs_dir.tot_size, name="SSCS", seen_id=4) # if DEBUG: print >> logfile, "SSCS", repr(self.SSCS) # # === build the SSAT === # self.SSAT = [] if SSAT_tot_secs > 0 and sscs_dir.tot_size == 0: print("WARNING *** OLE2 inconsistency: SSCS size is 0 but SSAT size is non-zero", file=logfile) if sscs_dir.tot_size > 0: sid = SSAT_first_sec_sid nsecs = SSAT_tot_secs while sid >= 0 and nsecs > 0: if seen[sid]: raise CompDocError("SSAT corruption: seen[%d] == %d" % (sid, seen[sid])) seen[sid] = 5 nsecs -= 1 start_pos = 512 + sid * sec_size news = list(unpack(fmt, mem[start_pos:start_pos+sec_size])) self.SSAT.extend(news) sid = self.SAT[sid] if DEBUG: print("SSAT last sid %d; remaining sectors %d" % (sid, nsecs), file=logfile) assert nsecs == 0 and sid == EOCSID if DEBUG: print("SSAT", file=logfile) dump_list(self.SSAT, 10, logfile) if DEBUG: print("seen", file=logfile) dump_list(seen, 20, logfile) def _get_stream(self, mem, base, sat, sec_size, start_sid, size=None, name='', seen_id=None): # print >> self.logfile, "_get_stream", base, sec_size, start_sid, size sectors = [] s = start_sid if size is None: # nothing to check against while s >= 0: if seen_id is not None: if self.seen[s]: raise CompDocError("%s corruption: seen[%d] == %d" % (name, s, self.seen[s])) self.seen[s] = seen_id start_pos = base + s * sec_size sectors.append(mem[start_pos:start_pos+sec_size]) try: s = sat[s] except IndexError: raise CompDocError( "OLE2 stream %r: sector allocation table invalid entry (%d)" % (name, s) ) assert s == EOCSID else: todo = size while s >= 0: if seen_id is not None: if self.seen[s]: raise CompDocError("%s corruption: seen[%d] == %d" % (name, s, self.seen[s])) self.seen[s] = seen_id start_pos = base + s * sec_size grab = sec_size if grab > todo: grab = todo todo -= grab sectors.append(mem[start_pos:start_pos+grab]) try: s = sat[s] except IndexError: raise CompDocError( "OLE2 stream %r: sector allocation table invalid entry (%d)" % (name, s) ) assert s == EOCSID if todo != 0: fprintf(self.logfile, "WARNING *** OLE2 stream %r: expected size %d, actual size %d\n", name, size, size - todo) return b''.join(sectors) def _dir_search(self, path, storage_DID=0): # Return matching DirNode instance, or None head = path[0] tail = path[1:] dl = self.dirlist for child in dl[storage_DID].children: if dl[child].name.lower() == head.lower(): et = dl[child].etype if et == 2: return dl[child] if et == 1: if not tail: raise CompDocError("Requested component is a 'storage'") return self._dir_search(tail, child) dl[child].dump(1) raise CompDocError("Requested stream is not a 'user stream'") return None def get_named_stream(self, qname): """ Interrogate the compound document's directory; return the stream as a string if found, otherwise return ``None``. :param qname: Name of the desired stream e.g. ``'Workbook'``. Should be in Unicode or convertible thereto. """ d = self._dir_search(qname.split("/")) if d is None: return None if d.tot_size >= self.min_size_std_stream: return self._get_stream( self.mem, 512, self.SAT, self.sec_size, d.first_SID, d.tot_size, name=qname, seen_id=d.DID+6) else: return self._get_stream( self.SSCS, 0, self.SSAT, self.short_sec_size, d.first_SID, d.tot_size, name=qname + " (from SSCS)", seen_id=None) def locate_named_stream(self, qname): """ Interrogate the compound document's directory. If the named stream is not found, ``(None, 0, 0)`` will be returned. 
If the named stream is found and is contiguous within the original byte sequence (``mem``) used when the document was opened, then ``(mem, offset_to_start_of_stream, length_of_stream)`` is returned. Otherwise a new string is built from the fragments and ``(new_string, 0, length_of_stream)`` is returned. :param qname: Name of the desired stream e.g. ``'Workbook'``. Should be in Unicode or convertible thereto. """ d = self._dir_search(qname.split("/")) if d is None: return (None, 0, 0) if d.tot_size > self.mem_data_len: raise CompDocError("%r stream length (%d bytes) > file data size (%d bytes)" % (qname, d.tot_size, self.mem_data_len)) if d.tot_size >= self.min_size_std_stream: result = self._locate_stream( self.mem, 512, self.SAT, self.sec_size, d.first_SID, d.tot_size, qname, d.DID+6) if self.DEBUG: print("\nseen", file=self.logfile) dump_list(self.seen, 20, self.logfile) return result else: return ( self._get_stream( self.SSCS, 0, self.SSAT, self.short_sec_size, d.first_SID, d.tot_size, qname + " (from SSCS)", None), 0, d.tot_size, ) def _locate_stream(self, mem, base, sat, sec_size, start_sid, expected_stream_size, qname, seen_id): # print >> self.logfile, "_locate_stream", base, sec_size, start_sid, expected_stream_size s = start_sid if s < 0: raise CompDocError("_locate_stream: start_sid (%d) is -ve" % start_sid) p = -99 # dummy previous SID start_pos = -9999 end_pos = -8888 slices = [] tot_found = 0 found_limit = (expected_stream_size + sec_size - 1) // sec_size while s >= 0: if self.seen[s]: print("_locate_stream(%s): seen" % qname, file=self.logfile); dump_list(self.seen, 20, self.logfile) raise CompDocError("%s corruption: seen[%d] == %d" % (qname, s, self.seen[s])) self.seen[s] = seen_id tot_found += 1 if tot_found > found_limit: # Note: expected size rounded up to higher sector raise CompDocError( "%s: size exceeds expected %d bytes; corrupt?" % (qname, found_limit * sec_size) ) if s == p+1: # contiguous sectors end_pos += sec_size else: # start new slice if p >= 0: # not first time slices.append((start_pos, end_pos)) start_pos = base + s * sec_size end_pos = start_pos + sec_size p = s s = sat[s] assert s == EOCSID assert tot_found == found_limit # print >> self.logfile, "_locate_stream(%s): seen" % qname; dump_list(self.seen, 20, self.logfile) if not slices: # The stream is contiguous ... just what we like! 
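# --- editor's illustrative sketch (addition, not part of xlrd) -----------
# When the sector chain is NOT contiguous, _locate_stream falls through to
# the fragment-joining return further below; each (start, end) slice is one
# contiguous run of sectors copied out of mem.  A tiny restatement:
def _join_fragments(mem, slices):
    return b''.join(mem[start:end] for start, end in slices)

# _join_fragments(b"ABCDxxxxEFGH", [(0, 4), (8, 12)]) -> b"ABCDEFGH"
# --------------------------------------------------------------------------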
return (mem, start_pos, expected_stream_size) slices.append((start_pos, end_pos)) # print >> self.logfile, "+++>>> %d fragments" % len(slices) return (b''.join(mem[start_pos:end_pos] for start_pos, end_pos in slices), 0, expected_stream_size) # ========================================================================================== def x_dump_line(alist, stride, f, dpos, equal=0): print("%5d%s" % (dpos, " ="[equal]), end=' ', file=f) for value in alist[dpos:dpos + stride]: print(str(value), end=' ', file=f) print(file=f) def dump_list(alist, stride, f=sys.stdout): def _dump_line(dpos, equal=0): print("%5d%s" % (dpos, " ="[equal]), end=' ', file=f) for value in alist[dpos:dpos + stride]: print(str(value), end=' ', file=f) print(file=f) pos = None oldpos = None for pos in xrange(0, len(alist), stride): if oldpos is None: _dump_line(pos) oldpos = pos elif alist[pos:pos+stride] != alist[oldpos:oldpos+stride]: if pos - oldpos > stride: _dump_line(pos - stride, equal=1) _dump_line(pos) oldpos = pos if oldpos is not None and pos is not None and pos != oldpos: _dump_line(pos, equal=1) xlrd-1.2.0/xlrd/book.py0000664000372000037200000015736113405237033015617 0ustar travistravis00000000000000# Copyright (c) 2005-2012 Stephen John Machin, Lingfo Pty Ltd # This module is part of the xlrd package, which is released under a # BSD-style licence. from __future__ import print_function import gc import sys from . import compdoc, formatting, sheet from .biffh import * from .formula import * from .timemachine import * try: from time import perf_counter except ImportError: # Python 2.7 from time import clock as perf_counter import struct; unpack = struct.unpack empty_cell = sheet.empty_cell # for exposure to the world ... DEBUG = 0 USE_FANCY_CD = 1 TOGGLE_GC = 0 # gc.set_debug(gc.DEBUG_STATS) try: import mmap MMAP_AVAILABLE = 1 except ImportError: MMAP_AVAILABLE = 0 USE_MMAP = MMAP_AVAILABLE MY_EOF = 0xF00BAAA # not a 16-bit number SUPBOOK_UNK, SUPBOOK_INTERNAL, SUPBOOK_EXTERNAL, SUPBOOK_ADDIN, SUPBOOK_DDEOLE = range(5) SUPPORTED_VERSIONS = (80, 70, 50, 45, 40, 30, 21, 20) _code_from_builtin_name = { "Consolidate_Area": "\x00", "Auto_Open": "\x01", "Auto_Close": "\x02", "Extract": "\x03", "Database": "\x04", "Criteria": "\x05", "Print_Area": "\x06", "Print_Titles": "\x07", "Recorder": "\x08", "Data_Form": "\x09", "Auto_Activate": "\x0A", "Auto_Deactivate": "\x0B", "Sheet_Title": "\x0C", "_FilterDatabase": "\x0D", } builtin_name_from_code = {} code_from_builtin_name = {} for _bin, _bic in _code_from_builtin_name.items(): _bin = UNICODE_LITERAL(_bin) _bic = UNICODE_LITERAL(_bic) code_from_builtin_name[_bin] = _bic builtin_name_from_code[_bic] = _bin del _bin, _bic, _code_from_builtin_name def open_workbook_xls(filename=None, logfile=sys.stdout, verbosity=0, use_mmap=USE_MMAP, file_contents=None, encoding_override=None, formatting_info=False, on_demand=False, ragged_rows=False): t0 = perf_counter() if TOGGLE_GC: orig_gc_enabled = gc.isenabled() if orig_gc_enabled: gc.disable() bk = Book() try: bk.biff2_8_load( filename=filename, file_contents=file_contents, logfile=logfile, verbosity=verbosity, use_mmap=use_mmap, encoding_override=encoding_override, formatting_info=formatting_info, on_demand=on_demand, ragged_rows=ragged_rows, ) t1 = perf_counter() bk.load_time_stage_1 = t1 - t0 biff_version = bk.getbof(XL_WORKBOOK_GLOBALS) if not biff_version: raise XLRDError("Can't determine file's BIFF version") if biff_version not in SUPPORTED_VERSIONS: raise XLRDError( "BIFF version %s is not supported" % 
biff_text_from_num[biff_version] ) bk.biff_version = biff_version if biff_version <= 40: # no workbook globals, only 1 worksheet if on_demand: fprintf(bk.logfile, "*** WARNING: on_demand is not supported for this Excel version.\n" "*** Setting on_demand to False.\n") bk.on_demand = on_demand = False bk.fake_globals_get_sheet() elif biff_version == 45: # worksheet(s) embedded in global stream bk.parse_globals() if on_demand: fprintf(bk.logfile, "*** WARNING: on_demand is not supported for this Excel version.\n" "*** Setting on_demand to False.\n") bk.on_demand = on_demand = False else: bk.parse_globals() bk._sheet_list = [None for sh in bk._sheet_names] if not on_demand: bk.get_sheets() bk.nsheets = len(bk._sheet_list) if biff_version == 45 and bk.nsheets > 1: fprintf( bk.logfile, "*** WARNING: Excel 4.0 workbook (.XLW) file contains %d worksheets.\n" "*** Book-level data will be that of the last worksheet.\n", bk.nsheets ) if TOGGLE_GC: if orig_gc_enabled: gc.enable() t2 = perf_counter() bk.load_time_stage_2 = t2 - t1 except: bk.release_resources() raise # normal exit if not on_demand: bk.release_resources() return bk class Name(BaseObject): """ Information relating to a named reference, formula, macro, etc. .. note:: Name information is **not** extracted from files older than Excel 5.0 (``Book.biff_version < 50``) """ _repr_these = ['stack'] book = None # parent #: 0 = Visible; 1 = Hidden hidden = 0 #: 0 = Command macro; 1 = Function macro. Relevant only if macro == 1 func = 0 #: 0 = Sheet macro; 1 = VisualBasic macro. Relevant only if macro == 1 vbasic = 0 #: 0 = Standard name; 1 = Macro name macro = 0 #: 0 = Simple formula; 1 = Complex formula (array formula or user defined). #: #: .. note:: No examples have been sighted. complex = 0 #: 0 = User-defined name; 1 = Built-in name #: #: Common examples: ``Print_Area``, ``Print_Titles``; see OOo docs for #: full list builtin = 0 #: Function group. Relevant only if macro == 1; see OOo docs for values. funcgroup = 0 #: 0 = Formula definition; 1 = Binary data #: #: .. note:: No examples have been sighted. binary = 0 #: The index of this object in book.name_obj_list name_index = 0 # A Unicode string. If builtin, decoded as per OOo docs. name = UNICODE_LITERAL("") #: An 8-bit string. raw_formula = b'' #: ``-1``: #: The name is global (visible in all calculation sheets). #: ``-2``: #: The name belongs to a macro sheet or VBA sheet. #: ``-3``: #: The name is invalid. #: ``0 <= scope < book.nsheets``: #: The name is local to the sheet whose index is scope. scope = -1 #: The result of evaluating the formula, if any. #: If no formula, or evaluation of the formula encountered problems, #: the result is ``None``. Otherwise the result is a single instance of the #: :class:`~xlrd.formula.Operand` class. # result = None def cell(self): """ This is a convenience method for the frequent use case where the name refers to a single cell. :returns: An instance of the :class:`~xlrd.sheet.Cell` class. :raises xlrd.biffh.XLRDError: The name is not a constant absolute reference to a single cell. 
""" res = self.result if res: # result should be an instance of the Operand class kind = res.kind value = res.value if kind == oREF and len(value) == 1: ref3d = value[0] if (0 <= ref3d.shtxlo == ref3d.shtxhi - 1 and ref3d.rowxlo == ref3d.rowxhi - 1 and ref3d.colxlo == ref3d.colxhi - 1): sh = self.book.sheet_by_index(ref3d.shtxlo) return sh.cell(ref3d.rowxlo, ref3d.colxlo) self.dump( self.book.logfile, header="=== Dump of Name object ===", footer="======= End of dump =======", ) raise XLRDError("Not a constant absolute reference to a single cell") def area2d(self, clipped=True): """ This is a convenience method for the use case where the name refers to one rectangular area in one worksheet. :param clipped: If ``True``, the default, the returned rectangle is clipped to fit in ``(0, sheet.nrows, 0, sheet.ncols)``. it is guaranteed that ``0 <= rowxlo <= rowxhi <= sheet.nrows`` and that the number of usable rows in the area (which may be zero) is ``rowxhi - rowxlo``; likewise for columns. :returns: a tuple ``(sheet_object, rowxlo, rowxhi, colxlo, colxhi)``. :raises xlrd.biffh.XLRDError: The name is not a constant absolute reference to a single area in a single sheet. """ res = self.result if res: # result should be an instance of the Operand class kind = res.kind value = res.value if kind == oREF and len(value) == 1: # only 1 reference ref3d = value[0] if 0 <= ref3d.shtxlo == ref3d.shtxhi - 1: # only 1 usable sheet sh = self.book.sheet_by_index(ref3d.shtxlo) if not clipped: return sh, ref3d.rowxlo, ref3d.rowxhi, ref3d.colxlo, ref3d.colxhi rowxlo = min(ref3d.rowxlo, sh.nrows) rowxhi = max(rowxlo, min(ref3d.rowxhi, sh.nrows)) colxlo = min(ref3d.colxlo, sh.ncols) colxhi = max(colxlo, min(ref3d.colxhi, sh.ncols)) assert 0 <= rowxlo <= rowxhi <= sh.nrows assert 0 <= colxlo <= colxhi <= sh.ncols return sh, rowxlo, rowxhi, colxlo, colxhi self.dump( self.book.logfile, header="=== Dump of Name object ===", footer="======= End of dump =======", ) raise XLRDError("Not a constant absolute reference to a single area in a single sheet") class Book(BaseObject): """ Contents of a "workbook". .. warning:: You should not instantiate this class yourself. You use the :class:`Book` object that was returned when you called :func:`~xlrd.open_workbook`. """ #: The number of worksheets present in the workbook file. #: This information is available even when no sheets have yet been loaded. nsheets = 0 #: Which date system was in force when this file was last saved. #: #: 0: #: 1900 system (the Excel for Windows default). #: #: 1: #: 1904 system (the Excel for Macintosh default). #: #: Defaults to 0 in case it's not specified in the file. datemode = 0 #: Version of BIFF (Binary Interchange File Format) used to create the file. #: Latest is 8.0 (represented here as 80), introduced with Excel 97. #: Earliest supported by this module: 2.0 (represented as 20). biff_version = 0 #: List containing a :class:`Name` object for each ``NAME`` record in the #: workbook. #: #: .. versionadded:: 0.6.0 name_obj_list = [] #: An integer denoting the character set used for strings in this file. #: For BIFF 8 and later, this will be 1200, meaning Unicode; #: more precisely, UTF_16_LE. #: For earlier versions, this is used to derive the appropriate Python #: encoding to be used to convert to Unicode. #: Examples: ``1252 -> 'cp1252'``, ``10000 -> 'mac_roman'`` codepage = None #: The encoding that was derived from the codepage. 
encoding = None #: A tuple containing the telephone country code for: #: #: ``[0]``: #: the user-interface setting when the file was created. #: #: ``[1]``: #: the regional settings. #: #: Example: ``(1, 61)`` meaning ``(USA, Australia)``. #: #: This information may give a clue to the correct encoding for an #: unknown codepage. For a long list of observed values, refer to the #: OpenOffice.org documentation for the ``COUNTRY`` record. countries = (0, 0) #: What (if anything) is recorded as the name of the last user to #: save the file. user_name = UNICODE_LITERAL('') #: A list of :class:`~xlrd.formatting.Font` class instances, #: each corresponding to a FONT record. #: #: .. versionadded:: 0.6.1 font_list = [] #: A list of :class:`~xlrd.formatting.XF` class instances, #: each corresponding to an ``XF`` record. #: #: .. versionadded:: 0.6.1 xf_list = [] #: A list of :class:`~xlrd.formatting.Format` objects, each corresponding to #: a ``FORMAT`` record, in the order that they appear in the input file. #: It does *not* contain builtin formats. #: #: If you are creating an output file using (for example) :mod:`xlwt`, #: use this list. #: #: The collection to be used for all visual rendering purposes is #: :attr:`format_map`. #: #: .. versionadded:: 0.6.1 format_list = [] ## #: The mapping from :attr:`~xlrd.formatting.XF.format_key` to #: :class:`~xlrd.formatting.Format` object. #: #: .. versionadded:: 0.6.1 format_map = {} #: This provides access via name to the extended format information for #: both built-in styles and user-defined styles. #: #: It maps ``name`` to ``(built_in, xf_index)``, where #: ``name`` is either the name of a user-defined style, #: or the name of one of the built-in styles. Known built-in names are #: Normal, RowLevel_1 to RowLevel_7, #: ColLevel_1 to ColLevel_7, Comma, Currency, Percent, "Comma [0]", #: "Currency [0]", Hyperlink, and "Followed Hyperlink". #: #: ``built_in`` has the following meanings #: #: 1: #: built-in style #: #: 0: #: user-defined #: #: ``xf_index`` is an index into :attr:`Book.xf_list`. #: #: References: OOo docs s6.99 (``STYLE`` record); Excel UI Format/Style #: #: .. versionadded:: 0.6.1 #: #: Extracted only if ``open_workbook(..., formatting_info=True)`` #: #: .. versionadded:: 0.7.4 style_name_map = {} #: This provides definitions for colour indexes. Please refer to #: :ref:`palette` for an explanation #: of how colours are represented in Excel. #: #: Colour indexes into the palette map into ``(red, green, blue)`` tuples. #: "Magic" indexes e.g. ``0x7FFF`` map to ``None``. #: #: :attr:`colour_map` is what you need if you want to render cells on screen #: or in a PDF file. If you are writing an output XLS file, use #: :attr:`palette_record`. #: #: .. note:: Extracted only if ``open_workbook(..., formatting_info=True)`` #: #: .. versionadded:: 0.6.1 colour_map = {} #: If the user has changed any of the colours in the standard palette, the #: XLS file will contain a ``PALETTE`` record with 56 (16 for Excel 4.0 and #: earlier) RGB values in it, and this list will be e.g. #: ``[(r0, b0, g0), ..., (r55, b55, g55)]``. #: Otherwise this list will be empty. This is what you need if you are #: writing an output XLS file. If you want to render cells on screen or in a #: PDF file, use :attr:`colour_map`. #: #: .. note:: Extracted only if ``open_workbook(..., formatting_info=True)`` #: #: .. versionadded:: 0.6.1 palette_record = [] #: Time in seconds to extract the XLS image as a contiguous string #: (or mmap equivalent). 
load_time_stage_1 = -1.0 #: Time in seconds to parse the data from the contiguous string #: (or mmap equivalent). load_time_stage_2 = -1.0 def sheets(self): """ :returns: A list of all sheets in the book. All sheets not already loaded will be loaded. """ for sheetx in xrange(self.nsheets): if not self._sheet_list[sheetx]: self.get_sheet(sheetx) return self._sheet_list[:] def sheet_by_index(self, sheetx): """ :param sheetx: Sheet index in ``range(nsheets)`` :returns: A :class:`~xlrd.sheet.Sheet`. """ return self._sheet_list[sheetx] or self.get_sheet(sheetx) def sheet_by_name(self, sheet_name): """ :param sheet_name: Name of the sheet required. :returns: A :class:`~xlrd.sheet.Sheet`. """ try: sheetx = self._sheet_names.index(sheet_name) except ValueError: raise XLRDError('No sheet named <%r>' % sheet_name) return self.sheet_by_index(sheetx) def sheet_names(self): """ :returns: A list of the names of all the worksheets in the workbook file. This information is available even when no sheets have yet been loaded. """ return self._sheet_names[:] def sheet_loaded(self, sheet_name_or_index): """ :param sheet_name_or_index: Name or index of sheet enquired upon :returns: ``True`` if sheet is loaded, ``False`` otherwise. .. versionadded:: 0.7.1 """ if isinstance(sheet_name_or_index, int): sheetx = sheet_name_or_index else: try: sheetx = self._sheet_names.index(sheet_name_or_index) except ValueError: raise XLRDError('No sheet named <%r>' % sheet_name_or_index) return bool(self._sheet_list[sheetx]) def unload_sheet(self, sheet_name_or_index): """ :param sheet_name_or_index: Name or index of sheet to be unloaded. .. versionadded:: 0.7.1 """ if isinstance(sheet_name_or_index, int): sheetx = sheet_name_or_index else: try: sheetx = self._sheet_names.index(sheet_name_or_index) except ValueError: raise XLRDError('No sheet named <%r>' % sheet_name_or_index) self._sheet_list[sheetx] = None def release_resources(self): """ This method has a dual purpose. You can call it to release memory-consuming objects and (possibly) a memory-mapped file (:class:`mmap.mmap` object) when you have finished loading sheets in ``on_demand`` mode, but still require the :class:`Book` object to examine the loaded sheets. It is also called automatically (a) when :func:`~xlrd.open_workbook` raises an exception and (b) if you are using a ``with`` statement, when the ``with`` block is exited. Calling this method multiple times on the same object has no ill effect. """ self._resources_released = 1 if hasattr(self.mem, "close"): # must be a mmap.mmap object self.mem.close() self.mem = None if hasattr(self.filestr, "close"): self.filestr.close() self.filestr = None self._sharedstrings = None self._rich_text_runlist_map = None def __enter__(self): return self def __exit__(self, exc_type, exc_value, exc_tb): self.release_resources() # return false #: A mapping from ``(lower_case_name, scope)`` to a single :class:`Name` #: object. #: #: .. versionadded:: 0.6.0 name_and_scope_map = {} #: A mapping from `lower_case_name` to a list of :class:`Name` objects. #: The list is sorted in scope order. Typically there will be one item #: (of global scope) in the list. #: #: .. 
versionadded:: 0.6.0 name_map = {} def __init__(self): self._sheet_list = [] self._sheet_names = [] self._sheet_visibility = [] # from BOUNDSHEET record self.nsheets = 0 self._sh_abs_posn = [] # sheet's absolute position in the stream self._sharedstrings = [] self._rich_text_runlist_map = {} self.raw_user_name = False self._sheethdr_count = 0 # BIFF 4W only self.builtinfmtcount = -1 # unknown as yet. BIFF 3, 4S, 4W self.initialise_format_info() self._all_sheets_count = 0 # includes macro & VBA sheets self._supbook_count = 0 self._supbook_locals_inx = None self._supbook_addins_inx = None self._all_sheets_map = [] # maps an all_sheets index to a calc-sheets index (or -1) self._externsheet_info = [] self._externsheet_type_b57 = [] self._extnsht_name_from_num = {} self._sheet_num_from_name = {} self._extnsht_count = 0 self._supbook_types = [] self._resources_released = 0 self.addin_func_names = [] self.name_obj_list = [] self.colour_map = {} self.palette_record = [] self.xf_list = [] self.style_name_map = {} self.mem = b'' self.filestr = b'' def biff2_8_load(self, filename=None, file_contents=None, logfile=sys.stdout, verbosity=0, use_mmap=USE_MMAP, encoding_override=None, formatting_info=False, on_demand=False, ragged_rows=False): # DEBUG = 0 self.logfile = logfile self.verbosity = verbosity self.use_mmap = use_mmap and MMAP_AVAILABLE self.encoding_override = encoding_override self.formatting_info = formatting_info self.on_demand = on_demand self.ragged_rows = ragged_rows if not file_contents: with open(filename, "rb") as f: f.seek(0, 2) # EOF size = f.tell() f.seek(0, 0) # BOF if size == 0: raise XLRDError("File size is 0 bytes") if self.use_mmap: self.filestr = mmap.mmap(f.fileno(), size, access=mmap.ACCESS_READ) self.stream_len = size else: self.filestr = f.read() self.stream_len = len(self.filestr) else: self.filestr = file_contents self.stream_len = len(file_contents) self.base = 0 if self.filestr[:8] != compdoc.SIGNATURE: # got this one at the antique store self.mem = self.filestr else: cd = compdoc.CompDoc(self.filestr, logfile=self.logfile) if USE_FANCY_CD: for qname in ['Workbook', 'Book']: self.mem, self.base, self.stream_len = \ cd.locate_named_stream(UNICODE_LITERAL(qname)) if self.mem: break else: raise XLRDError("Can't find workbook in OLE2 compound document") else: for qname in ['Workbook', 'Book']: self.mem = cd.get_named_stream(UNICODE_LITERAL(qname)) if self.mem: break else: raise XLRDError("Can't find workbook in OLE2 compound document") self.stream_len = len(self.mem) del cd if self.mem is not self.filestr: if hasattr(self.filestr, "close"): self.filestr.close() self.filestr = b'' self._position = self.base if DEBUG: print("mem: %s, base: %d, len: %d" % (type(self.mem), self.base, self.stream_len), file=self.logfile) def initialise_format_info(self): # needs to be done once per sheet for BIFF 4W :-( self.format_map = {} self.format_list = [] self.xfcount = 0 self.actualfmtcount = 0 # number of FORMAT records seen so far self._xf_index_to_xl_type_map = {0: XL_CELL_NUMBER} self._xf_epilogue_done = 0 self.xf_list = [] self.font_list = [] def get2bytes(self): pos = self._position buff_two = self.mem[pos:pos+2] lenbuff = len(buff_two) self._position += lenbuff if lenbuff < 2: return MY_EOF lo, hi = buff_two return (BYTES_ORD(hi) << 8) | BYTES_ORD(lo) def get_record_parts(self): pos = self._position mem = self.mem code, length = unpack('= 2: fprintf(self.logfile, "BOUNDSHEET: inx=%d vis=%r sheet_name=%r abs_posn=%d sheet_type=0x%02x\n", self._all_sheets_count, visibility, 
sheet_name, abs_posn, sheet_type) self._all_sheets_count += 1 if sheet_type != XL_BOUNDSHEET_WORKSHEET: self._all_sheets_map.append(-1) descr = { 1: 'Macro sheet', 2: 'Chart', 6: 'Visual Basic module', }.get(sheet_type, 'UNKNOWN') if DEBUG or self.verbosity >= 1: fprintf(self.logfile, "NOTE *** Ignoring non-worksheet data named %r (type 0x%02x = %s)\n", sheet_name, sheet_type, descr) else: snum = len(self._sheet_names) self._all_sheets_map.append(snum) self._sheet_names.append(sheet_name) self._sh_abs_posn.append(abs_posn) self._sheet_visibility.append(visibility) self._sheet_num_from_name[sheet_name] = snum def handle_builtinfmtcount(self, data): ### N.B. This count appears to be utterly useless. # DEBUG = 1 builtinfmtcount = unpack('= 2: fprintf(self.logfile, "*** No CODEPAGE record; assuming 1200 (utf_16_le)\n") else: codepage = self.codepage if codepage in encoding_from_codepage: encoding = encoding_from_codepage[codepage] elif 300 <= codepage <= 1999: encoding = 'cp' + str(codepage) else: encoding = 'unknown_codepage_' + str(codepage) if DEBUG or (self.verbosity and encoding != self.encoding) : fprintf(self.logfile, "CODEPAGE: codepage %r -> encoding %r\n", codepage, encoding) self.encoding = encoding if self.codepage != 1200: # utf_16_le # If we don't have a codec that can decode ASCII into Unicode, # we're well & truly stuffed -- let the punter know ASAP. try: unicode(b'trial', self.encoding) except BaseException as e: fprintf(self.logfile, "ERROR *** codepage %r -> encoding %r -> %s: %s\n", self.codepage, self.encoding, type(e).__name__.split(".")[-1], e) raise if self.raw_user_name: strg = unpack_string(self.user_name, 0, self.encoding, lenlen=1) strg = strg.rstrip() # if DEBUG: # print "CODEPAGE: user name decoded from %r to %r" % (self.user_name, strg) self.user_name = strg self.raw_user_name = False return self.encoding def handle_codepage(self, data): # DEBUG = 0 codepage = unpack('= 2 if self.biff_version >= 80: option_flags, other_info =unpack("= 1 blah2 = DEBUG or self.verbosity >= 2 if self.biff_version >= 80: num_refs = unpack("= 2: logf = self.logfile fprintf(logf, "FILEPASS:\n") hex_char_dump(data, 0, len(data), base=0, fout=logf) if self.biff_version >= 80: kind1, = unpack('= 2 bv = self.biff_version if bv < 50: return self.derive_encoding() # print # hex_char_dump(data, 0, len(data), fout=self.logfile) ( option_flags, kb_shortcut, name_len, fmla_len, extsht_index, sheet_index, menu_text_len, description_text_len, help_topic_text_len, status_bar_text_len, ) = unpack("> nshift) macro_flag = " M"[nobj.macro] if bv < 80: internal_name, pos = unpack_string_update_pos(data, 14, self.encoding, known_len=name_len) else: internal_name, pos = unpack_unicode_update_pos(data, 14, known_len=name_len) nobj.extn_sheet_num = extsht_index nobj.excel_sheet_index = sheet_index nobj.scope = None # patched up in the names_epilogue() method if blah: fprintf( self.logfile, "NAME[%d]:%s oflags=%d, name_len=%d, fmla_len=%d, extsht_index=%d, sheet_index=%d, name=%r\n", name_index, macro_flag, option_flags, name_len, fmla_len, extsht_index, sheet_index, internal_name) name = internal_name if nobj.builtin: name = builtin_name_from_code.get(name, "??Unknown??") if blah: print(" builtin: %s" % name, file=self.logfile) nobj.name = name nobj.raw_formula = data[pos:] nobj.basic_formula_len = fmla_len nobj.evaluated = 0 if blah: nobj.dump( self.logfile, header="--- handle_name: name[%d] ---" % name_index, footer="-------------------", ) def names_epilogue(self): blah = self.verbosity >= 2 f = 
self.logfile if blah: print("+++++ names_epilogue +++++", file=f) print("_all_sheets_map", REPR(self._all_sheets_map), file=f) print("_extnsht_name_from_num", REPR(self._extnsht_name_from_num), file=f) print("_sheet_num_from_name", REPR(self._sheet_num_from_name), file=f) num_names = len(self.name_obj_list) for namex in range(num_names): nobj = self.name_obj_list[namex] # Convert from excel_sheet_index to scope. # This is done here because in BIFF7 and earlier, the # BOUNDSHEET records (from which _all_sheets_map is derived) # come after the NAME records. if self.biff_version >= 80: sheet_index = nobj.excel_sheet_index if sheet_index == 0: intl_sheet_index = -1 # global elif 1 <= sheet_index <= len(self._all_sheets_map): intl_sheet_index = self._all_sheets_map[sheet_index-1] if intl_sheet_index == -1: # maps to a macro or VBA sheet intl_sheet_index = -2 # valid sheet reference but not useful else: # huh? intl_sheet_index = -3 # invalid elif 50 <= self.biff_version <= 70: sheet_index = nobj.extn_sheet_num if sheet_index == 0: intl_sheet_index = -1 # global else: sheet_name = self._extnsht_name_from_num[sheet_index] intl_sheet_index = self._sheet_num_from_name.get(sheet_name, -2) nobj.scope = intl_sheet_index for namex in range(num_names): nobj = self.name_obj_list[namex] # Parse the formula ... if nobj.macro or nobj.binary: continue if nobj.evaluated: continue evaluate_name_formula(self, nobj, namex, blah=blah) if self.verbosity >= 2: print("---------- name object dump ----------", file=f) for namex in range(num_names): nobj = self.name_obj_list[namex] nobj.dump(f, header="--- name[%d] ---" % namex) print("--------------------------------------", file=f) # # Build some dicts for access to the name objects # name_and_scope_map = {} # (name.lower(), scope): Name_object name_map = {} # name.lower() : list of Name_objects (sorted in scope order) for namex in range(num_names): nobj = self.name_obj_list[namex] name_lcase = nobj.name.lower() key = (name_lcase, nobj.scope) if key in name_and_scope_map and self.verbosity: fprintf(f, 'Duplicate entry %r in name_and_scope_map\n', key) name_and_scope_map[key] = nobj sort_data = (nobj.scope, namex, nobj) # namex (a temp unique ID) ensures the Name objects will not # be compared (fatal in py3) if name_lcase in name_map: name_map[name_lcase].append(sort_data) else: name_map[name_lcase] = [sort_data] for key in name_map.keys(): alist = name_map[key] alist.sort() name_map[key] = [x[2] for x in alist] self.name_and_scope_map = name_and_scope_map self.name_map = name_map def handle_obj(self, data): # Not doing much handling at all. # Worrying about embedded (BOF ... EOF) substreams is done elsewhere. 
# DEBUG = 1 obj_type, obj_id = unpack(' handle_obj type=%d id=0x%08x" % (obj_type, obj_id) def handle_supbook(self, data): # aka EXTERNALBOOK in OOo docs self._supbook_types.append(None) blah = DEBUG or self.verbosity >= 2 if blah: print("SUPBOOK:", file=self.logfile) hex_char_dump(data, 0, len(data), fout=self.logfile) num_sheets = unpack("= 2: fprintf(self.logfile, "SST: unique strings: %d\n", uniquestrings) while 1: code, nb, data = self.get_record_parts_conditional(XL_CONTINUE) if code is None: break nbt += nb if DEBUG >= 2: fprintf(self.logfile, "CONTINUE: adding %d bytes to SST -> %d\n", nb, nbt) strlist.append(data) self._sharedstrings, rt_runlist = unpack_SST_table(strlist, uniquestrings) if self.formatting_info: self._rich_text_runlist_map = rt_runlist if DEBUG: t1 = perf_counter() print("SST processing took %.2f seconds" % (t1 - t0, ), file=self.logfile) def handle_writeaccess(self, data): DEBUG = 0 if self.biff_version < 80: if not self.encoding: self.raw_user_name = True self.user_name = data return strg = unpack_string(data, 0, self.encoding, lenlen=1) else: strg = unpack_unicode(data, 0, lenlen=2) if DEBUG: fprintf(self.logfile, "WRITEACCESS: %d bytes; raw=%s %r\n", len(data), self.raw_user_name, strg) strg = strg.rstrip() self.user_name = strg def parse_globals(self): # DEBUG = 0 # no need to position, just start reading (after the BOF) formatting.initialise_book(self) while 1: rc, length, data = self.get_record_parts() if DEBUG: print("parse_globals: record code is 0x%04x" % rc, file=self.logfile) if rc == XL_SST: self.handle_sst(data) elif rc == XL_FONT or rc == XL_FONT_B3B4: self.handle_font(data) elif rc == XL_FORMAT: # XL_FORMAT2 is BIFF <= 3.0, can't appear in globals self.handle_format(data) elif rc == XL_XF: self.handle_xf(data) elif rc == XL_BOUNDSHEET: self.handle_boundsheet(data) elif rc == XL_DATEMODE: self.handle_datemode(data) elif rc == XL_CODEPAGE: self.handle_codepage(data) elif rc == XL_COUNTRY: self.handle_country(data) elif rc == XL_EXTERNNAME: self.handle_externname(data) elif rc == XL_EXTERNSHEET: self.handle_externsheet(data) elif rc == XL_FILEPASS: self.handle_filepass(data) elif rc == XL_WRITEACCESS: self.handle_writeaccess(data) elif rc == XL_SHEETSOFFSET: self.handle_sheetsoffset(data) elif rc == XL_SHEETHDR: self.handle_sheethdr(data) elif rc == XL_SUPBOOK: self.handle_supbook(data) elif rc == XL_NAME: self.handle_name(data) elif rc == XL_PALETTE: self.handle_palette(data) elif rc == XL_STYLE: self.handle_style(data) elif rc & 0xff == 9 and self.verbosity: fprintf(self.logfile, "*** Unexpected BOF at posn %d: 0x%04x len=%d data=%r\n", self._position - length - 4, rc, length, data) elif rc == XL_EOF: self.xf_epilogue() self.names_epilogue() self.palette_epilogue() if not self.encoding: self.derive_encoding() if self.biff_version == 45: # DEBUG = 0 if DEBUG: print("global EOF: position", self._position, file=self.logfile) # if DEBUG: # pos = self._position - 4 # print repr(self.mem[pos:pos+40]) return else: # if DEBUG: # print >> self.logfile, "parse_globals: ignoring record code 0x%04x" % rc pass def read(self, pos, length): data = self.mem[pos:pos+length] self._position = pos + len(data) return data def getbof(self, rqd_stream): # DEBUG = 1 # if DEBUG: print >> self.logfile, "getbof(): position", self._position if DEBUG: print("reqd: 0x%04x" % rqd_stream, file=self.logfile) def bof_error(msg): raise XLRDError('Unsupported format, or corrupt file: ' + msg) savpos = self._position opcode = self.get2bytes() if opcode == MY_EOF: bof_error('Expected 
BOF record; met end of file') if opcode not in bofcodes: bof_error('Expected BOF record; found %r' % self.mem[savpos:savpos+8]) length = self.get2bytes() if length == MY_EOF: bof_error('Incomplete BOF record[1]; met end of file') if not (4 <= length <= 20): bof_error( 'Invalid length (%d) for BOF record type 0x%04x' % (length, opcode)) padding = b'\0' * max(0, boflen[opcode] - length) data = self.read(self._position, length) if DEBUG: fprintf(self.logfile, "\ngetbof(): data=%r\n", data) if len(data) < length: bof_error('Incomplete BOF record[2]; met end of file') data += padding version1 = opcode >> 8 version2, streamtype = unpack('= 2: print("BOF: op=0x%04x vers=0x%04x stream=0x%04x buildid=%d buildyr=%d -> BIFF%d" % (opcode, version2, streamtype, build, year, version), file=self.logfile) got_globals = streamtype == XL_WORKBOOK_GLOBALS or ( version == 45 and streamtype == XL_WORKBOOK_GLOBALS_4W) if (rqd_stream == XL_WORKBOOK_GLOBALS and got_globals) or streamtype == rqd_stream: return version if version < 50 and streamtype == XL_WORKSHEET: return version if version >= 50 and streamtype == 0x0100: bof_error("Workspace file -- no spreadsheet data") bof_error( 'BOF not workbook/worksheet: op=0x%04x vers=0x%04x strm=0x%04x build=%d year=%d -> BIFF%d' % (opcode, version2, streamtype, build, year, version) ) # === helper functions def expand_cell_address(inrow, incol): # Ref : OOo docs, "4.3.4 Cell Addresses in BIFF8" outrow = inrow if incol & 0x8000: if outrow >= 32768: outrow -= 65536 relrow = 1 else: relrow = 0 outcol = incol & 0xFF if incol & 0x4000: if outcol >= 128: outcol -= 256 relcol = 1 else: relcol = 0 return outrow, outcol, relrow, relcol def colname(colx, _A2Z="ABCDEFGHIJKLMNOPQRSTUVWXYZ"): assert colx >= 0 name = UNICODE_LITERAL('') while 1: quot, rem = divmod(colx, 26) name = _A2Z[rem] + name if not quot: return name colx = quot - 1 def display_cell_address(rowx, colx, relrow, relcol): if relrow: rowpart = "(*%s%d)" % ("+-"[rowx < 0], abs(rowx)) else: rowpart = "$%d" % (rowx+1,) if relcol: colpart = "(*%s%d)" % ("+-"[colx < 0], abs(colx)) else: colpart = "$" + colname(colx) return colpart + rowpart def unpack_SST_table(datatab, nstrings): "Return list of strings" datainx = 0 ndatas = len(datatab) data = datatab[0] datalen = len(data) pos = 8 strings = [] strappend = strings.append richtext_runs = {} local_unpack = unpack local_min = min local_BYTES_ORD = BYTES_ORD latin_1 = "latin_1" for _unused_i in xrange(nstrings): nchars = local_unpack('> 1, charsneed) rawstrg = data[pos:pos+2*charsavail] # if DEBUG: print "SST U16: nchars=%d pos=%d rawstrg=%r" % (nchars, pos, rawstrg) try: accstrg += unicode(rawstrg, "utf_16_le") except: # print "SST U16: nchars=%d pos=%d rawstrg=%r" % (nchars, pos, rawstrg) # Probable cause: dodgy data e.g. unfinished surrogate pair. # E.g. file unicode2.xls in pyExcelerator's examples has cells containing # unichr(i) for i in range(0x100000) # so this will include 0xD800 etc raise pos += 2*charsavail else: # Note: this is COMPRESSED (not ASCII!) encoding!!! 
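# ("Compressed" means every 16-bit code unit in the string had a zero
# high byte, so only the low bytes were stored; each stored byte maps
# directly to U+0000..U+00FF, which is why latin_1 is the right codec.
# Illustrative example: b"Caf\xe9".decode("latin_1") -> u"Café".)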
charsavail = local_min(datalen - pos, charsneed) rawstrg = data[pos:pos+charsavail] # if DEBUG: print "SST CMPRSD: nchars=%d pos=%d rawstrg=%r" % (nchars, pos, rawstrg) accstrg += unicode(rawstrg, latin_1) pos += charsavail charsgot += charsavail if charsgot == nchars: break datainx += 1 data = datatab[datainx] datalen = len(data) options = local_BYTES_ORD(data[0]) pos = 1 if rtcount: runs = [] for runindex in xrange(rtcount): if pos == datalen: pos = 0 datainx += 1 data = datatab[datainx] datalen = len(data) runs.append(local_unpack("= datalen: # adjust to correct position in next record pos = pos - datalen datainx += 1 if datainx < ndatas: data = datatab[datainx] datalen = len(data) else: assert _unused_i == nstrings - 1 strappend(accstrg) return strings, richtext_runs xlrd-1.2.0/xlrd/timemachine.py0000664000372000037200000000333513405237033017137 0ustar travistravis00000000000000## #

Copyright (c) 2006-2012 Stephen John Machin, Lingfo Pty Ltd
#
# This module is part of the xlrd package, which is released under a BSD-style licence.
## # timemachine.py -- adaptation for single codebase. # Currently supported: 2.6 to 2.7, 3.2+ # usage: from timemachine import * from __future__ import print_function import sys python_version = sys.version_info[:2] # e.g. version 2.6 -> (2, 6) if python_version >= (3, 0): # Python 3 BYTES_LITERAL = lambda x: x.encode('latin1') UNICODE_LITERAL = lambda x: x BYTES_ORD = lambda byte: byte from io import BytesIO as BYTES_IO def fprintf(f, fmt, *vargs): fmt = fmt.replace("%r", "%a") if fmt.endswith('\n'): print(fmt[:-1] % vargs, file=f) else: print(fmt % vargs, end=' ', file=f) EXCEL_TEXT_TYPES = (str, bytes, bytearray) # xlwt: isinstance(obj, EXCEL_TEXT_TYPES) REPR = ascii xrange = range unicode = lambda b, enc: b.decode(enc) ensure_unicode = lambda s: s unichr = chr else: # Python 2 BYTES_LITERAL = lambda x: x UNICODE_LITERAL = lambda x: x.decode('latin1') BYTES_ORD = ord from cStringIO import StringIO as BYTES_IO def fprintf(f, fmt, *vargs): if fmt.endswith('\n'): print(fmt[:-1] % vargs, file=f) else: print(fmt % vargs, end=' ', file=f) try: EXCEL_TEXT_TYPES = basestring # xlwt: isinstance(obj, EXCEL_TEXT_TYPES) except NameError: EXCEL_TEXT_TYPES = (str, unicode) REPR = repr xrange = xrange # following used only to overcome 2.x ElementTree gimmick which # returns text as `str` if it's ascii, otherwise `unicode` ensure_unicode = unicode # used only in xlsx.py xlrd-1.2.0/xlrd/xldate.py0000664000372000037200000001737613405237033016147 0ustar travistravis00000000000000# -*- coding: utf-8 -*- # Copyright (c) 2005-2008 Stephen John Machin, Lingfo Pty Ltd # This module is part of the xlrd package, which is released under a # BSD-style licence. # No part of the content of this file was derived from the works of David Giffin. """ Tools for working with dates and times in Excel files. The conversion from ``days`` to ``(year, month, day)`` starts with an integral "julian day number" aka JDN. FWIW: - JDN 0 corresponds to noon on Monday November 24 in Gregorian year -4713. More importantly: - Noon on Gregorian 1900-03-01 (day 61 in the 1900-based system) is JDN 2415080.0 - Noon on Gregorian 1904-01-02 (day 1 in the 1904-based system) is JDN 2416482.0 """ import datetime _JDN_delta = (2415080 - 61, 2416482 - 1) assert _JDN_delta[1] - _JDN_delta[0] == 1462 # Pre-calculate the datetime epochs for efficiency. epoch_1904 = datetime.datetime(1904, 1, 1) epoch_1900 = datetime.datetime(1899, 12, 31) epoch_1900_minus_1 = datetime.datetime(1899, 12, 30) # This is equivalent to 10000-01-01: _XLDAYS_TOO_LARGE = (2958466, 2958466 - 1462) class XLDateError(ValueError): "A base class for all datetime-related errors." class XLDateNegative(XLDateError): "``xldate < 0.00``" class XLDateAmbiguous(XLDateError): "The 1900 leap-year problem ``(datemode == 0 and 1.0 <= xldate < 61.0)``" class XLDateTooLarge(XLDateError): "Gregorian year 10000 or later" class XLDateBadDatemode(XLDateError): "``datemode`` arg is neither 0 nor 1" class XLDateBadTuple(XLDateError): pass def xldate_as_tuple(xldate, datemode): """ Convert an Excel number (presumed to represent a date, a datetime or a time) into a tuple suitable for feeding to datetime or mx.DateTime constructors. :param xldate: The Excel number :param datemode: 0: 1900-based, 1: 1904-based. :raises xlrd.xldate.XLDateNegative: :raises xlrd.xldate.XLDateAmbiguous: :raises xlrd.xldate.XLDateTooLarge: :raises xlrd.xldate.XLDateBadDatemode: :raises xlrd.xldate.XLDateError: :returns: Gregorian ``(year, month, day, hour, minute, nearest_second)``. .. 
warning:: When using this function to interpret the contents of a workbook, you should pass in the :attr:`~xlrd.book.Book.datemode` attribute of that workbook. Whether the workbook has ever been anywhere near a Macintosh is irrelevant. .. admonition:: Special case If ``0.0 <= xldate < 1.0``, it is assumed to represent a time; ``(0, 0, 0, hour, minute, second)`` will be returned. .. note:: ``1904-01-01`` is not regarded as a valid date in the ``datemode==1`` system; its "serial number" is zero. """ if datemode not in (0, 1): raise XLDateBadDatemode(datemode) if xldate == 0.00: return (0, 0, 0, 0, 0, 0) if xldate < 0.00: raise XLDateNegative(xldate) xldays = int(xldate) frac = xldate - xldays seconds = int(round(frac * 86400.0)) assert 0 <= seconds <= 86400 if seconds == 86400: hour = minute = second = 0 xldays += 1 else: # second = seconds % 60; minutes = seconds // 60 minutes, second = divmod(seconds, 60) # minute = minutes % 60; hour = minutes // 60 hour, minute = divmod(minutes, 60) if xldays >= _XLDAYS_TOO_LARGE[datemode]: raise XLDateTooLarge(xldate) if xldays == 0: return (0, 0, 0, hour, minute, second) if xldays < 61 and datemode == 0: raise XLDateAmbiguous(xldate) jdn = xldays + _JDN_delta[datemode] yreg = ((((jdn * 4 + 274277) // 146097) * 3 // 4) + jdn + 1363) * 4 + 3 mp = ((yreg % 1461) // 4) * 535 + 333 d = ((mp % 16384) // 535) + 1 # mp /= 16384 mp >>= 14 if mp >= 10: return ((yreg // 1461) - 4715, mp - 9, d, hour, minute, second) else: return ((yreg // 1461) - 4716, mp + 3, d, hour, minute, second) def xldate_as_datetime(xldate, datemode): """ Convert an Excel date/time number into a :class:`datetime.datetime` object. :param xldate: The Excel number :param datemode: 0: 1900-based, 1: 1904-based. :returns: A :class:`datetime.datetime` object. """ # Set the epoch based on the 1900/1904 datemode. if datemode: epoch = epoch_1904 else: if xldate < 60: epoch = epoch_1900 else: # Workaround Excel 1900 leap year bug by adjusting the epoch. epoch = epoch_1900_minus_1 # The integer part of the Excel date stores the number of days since # the epoch and the fractional part stores the percentage of the day. days = int(xldate) fraction = xldate - days # Get the the integer and decimal seconds in Excel's millisecond resolution. seconds = int(round(fraction * 86400000.0)) seconds, milliseconds = divmod(seconds, 1000) return epoch + datetime.timedelta(days, seconds, 0, milliseconds) # === conversions from date/time to xl numbers def _leap(y): if y % 4: return 0 if y % 100: return 1 if y % 400: return 0 return 1 _days_in_month = (None, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31) def xldate_from_date_tuple(date_tuple, datemode): """ Convert a date tuple (year, month, day) to an Excel date. :param year: Gregorian year. :param month: ``1 <= month <= 12`` :param day: ``1 <= day <= last day of that (year, month)`` :param datemode: 0: 1900-based, 1: 1904-based. 
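Example (an illustrative addition, not part of the original docstring)::

    xldate_from_date_tuple((1900, 3, 1), 0)   # -> 61.0, cf. the JDN notes above
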
:raises xlrd.xldate.XLDateAmbiguous: :raises xlrd.xldate.XLDateBadDatemode: :raises xlrd.xldate.XLDateBadTuple: ``(year, month, day)`` is too early/late or has invalid component(s) :raises xlrd.xldate.XLDateError: """ year, month, day = date_tuple if datemode not in (0, 1): raise XLDateBadDatemode(datemode) if year == 0 and month == 0 and day == 0: return 0.00 if not (1900 <= year <= 9999): raise XLDateBadTuple("Invalid year: %r" % ((year, month, day),)) if not (1 <= month <= 12): raise XLDateBadTuple("Invalid month: %r" % ((year, month, day),)) if (day < 1 or (day > _days_in_month[month] and not(day == 29 and month == 2 and _leap(year)))): raise XLDateBadTuple("Invalid day: %r" % ((year, month, day),)) Yp = year + 4716 M = month if M <= 2: Yp = Yp - 1 Mp = M + 9 else: Mp = M - 3 jdn = (1461 * Yp // 4) + ((979 * Mp + 16) // 32) + \ day - 1364 - (((Yp + 184) // 100) * 3 // 4) xldays = jdn - _JDN_delta[datemode] if xldays <= 0: raise XLDateBadTuple("Invalid (year, month, day): %r" % ((year, month, day),)) if xldays < 61 and datemode == 0: raise XLDateAmbiguous("Before 1900-03-01: %r" % ((year, month, day),)) return float(xldays) def xldate_from_time_tuple(time_tuple): """ Convert a time tuple ``(hour, minute, second)`` to an Excel "date" value (fraction of a day). :param hour: ``0 <= hour < 24`` :param minute: ``0 <= minute < 60`` :param second: ``0 <= second < 60`` :raises xlrd.xldate.XLDateBadTuple: Out-of-range hour, minute, or second """ hour, minute, second = time_tuple if 0 <= hour < 24 and 0 <= minute < 60 and 0 <= second < 60: return ((second / 60.0 + minute) / 60.0 + hour) / 24.0 raise XLDateBadTuple("Invalid (hour, minute, second): %r" % ((hour, minute, second),)) def xldate_from_datetime_tuple(datetime_tuple, datemode): """ Convert a datetime tuple ``(year, month, day, hour, minute, second)`` to an Excel date value. For more details, refer to other xldate_from_*_tuple functions. :param datetime_tuple: ``(year, month, day, hour, minute, second)`` :param datemode: 0: 1900-based, 1: 1904-based. """ return ( xldate_from_date_tuple(datetime_tuple[:3], datemode) + xldate_from_time_tuple(datetime_tuple[3:]) ) xlrd-1.2.0/xlrd/sheet.py0000664000372000037200000031724213405237033015771 0ustar travistravis00000000000000# -*- coding: utf-8 -*- # Copyright (c) 2005-2013 Stephen John Machin, Lingfo Pty Ltd # This module is part of the xlrd package, which is released under a # BSD-style licence. from __future__ import print_function from array import array from struct import calcsize, unpack from .biffh import * from .formatting import Format, nearest_colour_index from .formula import ( FMLA_TYPE_CELL, FMLA_TYPE_SHARED, decompile_formula, dump_formula, rangename2d, ) from .timemachine import * DEBUG = 0 OBJ_MSO_DEBUG = 0 _WINDOW2_options = ( # Attribute names and initial values to use in case # a WINDOW2 record is not written. ("show_formulas", 0), ("show_grid_lines", 1), ("show_sheet_headers", 1), ("panes_are_frozen", 0), ("show_zero_values", 1), ("automatic_grid_line_colour", 1), ("columns_from_right_to_left", 0), ("show_outline_symbols", 1), ("remove_splits_if_pane_freeze_is_removed", 0), # Multiple sheets can be selected, but only one can be active # (hold down Ctrl and click multiple tabs in the file in OOo) ("sheet_selected", 0), # "sheet_visible" should really be called "sheet_active" # and is 1 when this sheet is the sheet displayed when the file # is open. More than likely only one sheet should ever be set as # visible. 
# This would correspond to the Book's sheet_active attribute, but # that doesn't exist as WINDOW1 records aren't currently processed. # The real thing is the visibility attribute from the BOUNDSHEET record. ("sheet_visible", 0), ("show_in_page_break_preview", 0), ) class Sheet(BaseObject): """ Contains the data for one worksheet. In the cell access functions, ``rowx`` is a row index, counting from zero, and ``colx`` is a column index, counting from zero. Negative values for row/column indexes and slice positions are supported in the expected fashion. For information about cell types and cell values, refer to the documentation of the :class:`Cell` class. .. warning:: You don't instantiate this class yourself. You access :class:`Sheet` objects via the :class:`~xlrd.book.Book` object that was returned when you called :func:`xlrd.open_workbook`. """ #: Name of sheet. name = '' #: A reference to the :class:`~xlrd.book.Book` object to which this sheet #: belongs. #: #: Example usage: ``some_sheet.book.datemode`` book = None #: Number of rows in sheet. A row index is in ``range(thesheet.nrows)``. nrows = 0 #: Nominal number of columns in sheet. It is one more than the maximum #: column index found, ignoring trailing empty cells. #: See also the ``ragged_rows`` parameter to :func:`~xlrd.open_workbook` #: and :meth:`~xlrd.sheet.Sheet.row_len`. ncols = 0 #: The map from a column index to a :class:`Colinfo` object. Often there is #: an entry in ``COLINFO`` records for all column indexes in ``range(257)``. #: #: .. note:: #: xlrd ignores the entry for the non-existent #: 257th column. #: #: On the other hand, there may be no entry for unused columns. #: #: .. versionadded:: 0.6.1 #: #: Populated only if ``open_workbook(..., formatting_info=True)`` colinfo_map = {} #: The map from a row index to a :class:`Rowinfo` object. #: #: ..note:: #: It is possible to have missing entries -- at least one source of #: XLS files doesn't bother writing ``ROW`` records. #: #: .. versionadded:: 0.6.1 #: #: Populated only if ``open_workbook(..., formatting_info=True)`` rowinfo_map = {} #: List of address ranges of cells containing column labels. #: These are set up in Excel by Insert > Name > Labels > Columns. #: #: .. versionadded:: 0.6.0 #: #: How to deconstruct the list: #: #: .. code-block:: python #: #: for crange in thesheet.col_label_ranges: #: rlo, rhi, clo, chi = crange #: for rx in xrange(rlo, rhi): #: for cx in xrange(clo, chi): #: print "Column label at (rowx=%d, colx=%d) is %r" \ #: (rx, cx, thesheet.cell_value(rx, cx)) col_label_ranges = [] #: List of address ranges of cells containing row labels. #: For more details, see :attr:`col_label_ranges`. #: #: .. versionadded:: 0.6.0 row_label_ranges = [] #: List of address ranges of cells which have been merged. #: These are set up in Excel by Format > Cells > Alignment, then ticking #: the "Merge cells" box. #: #: .. note:: #: The upper limits are exclusive: i.e. ``[2, 3, 7, 9]`` only #: spans two cells. #: #: .. note:: Extracted only if ``open_workbook(..., formatting_info=True)`` #: #: .. versionadded:: 0.6.1 #: #: How to deconstruct the list: #: #: .. code-block:: python #: #: for crange in thesheet.merged_cells: #: rlo, rhi, clo, chi = crange #: for rowx in xrange(rlo, rhi): #: for colx in xrange(clo, chi): #: # cell (rlo, clo) (the top left one) will carry the data #: # and formatting info; the remainder will be recorded as #: # blank cells, but a renderer will apply the formatting info #: # for the top left cell (e.g. 
border, pattern) to all cells in #: # the range. merged_cells = [] #: Mapping of ``(rowx, colx)`` to list of ``(offset, font_index)`` tuples. #: The offset defines where in the string the font begins to be used. #: Offsets are expected to be in ascending order. #: If the first offset is not zero, the meaning is that the cell's ``XF``'s #: font should be used from offset 0. #: #: This is a sparse mapping. There is no entry for cells that are not #: formatted with rich text. #: #: How to use: #: #: .. code-block:: python #: #: runlist = thesheet.rich_text_runlist_map.get((rowx, colx)) #: if runlist: #: for offset, font_index in runlist: #: # do work here. #: pass #: #: .. versionadded:: 0.7.2 #: #: Populated only if ``open_workbook(..., formatting_info=True)`` rich_text_runlist_map = {} #: Default column width from ``DEFCOLWIDTH`` record, else ``None``. #: From the OOo docs: #: #: Column width in characters, using the width of the zero character #: from default font (first FONT record in the file). Excel adds some #: extra space to the default width, depending on the default font and #: default font size. The algorithm how to exactly calculate the resulting #: column width is not known. #: Example: The default width of 8 set in this record results in a column #: width of 8.43 using Arial font with a size of 10 points. #: #: For the default hierarchy, refer to the :class:`Colinfo` class. #: #: .. versionadded:: 0.6.1 defcolwidth = None #: Default column width from ``STANDARDWIDTH`` record, else ``None``. #: #: From the OOo docs: #: #: Default width of the columns in 1/256 of the width of the zero #: character, using default font (first FONT record in the file). #: #: For the default hierarchy, refer to the :class:`Colinfo` class. #: #: .. versionadded:: 0.6.1 standardwidth = None #: Default value to be used for a row if there is #: no ``ROW`` record for that row. #: From the *optional* ``DEFAULTROWHEIGHT`` record. default_row_height = None #: Default value to be used for a row if there is #: no ``ROW`` record for that row. #: From the *optional* ``DEFAULTROWHEIGHT`` record. default_row_height_mismatch = None #: Default value to be used for a row if there is #: no ``ROW`` record for that row. #: From the *optional* ``DEFAULTROWHEIGHT`` record. default_row_hidden = None #: Default value to be used for a row if there is #: no ``ROW`` record for that row. #: From the *optional* ``DEFAULTROWHEIGHT`` record. default_additional_space_above = None #: Default value to be used for a row if there is #: no ``ROW`` record for that row. #: From the *optional* ``DEFAULTROWHEIGHT`` record. default_additional_space_below = None #: Visibility of the sheet:: #: #: 0 = visible #: 1 = hidden (can be unhidden by user -- Format -> Sheet -> Unhide) #: 2 = "very hidden" (can be unhidden only by VBA macro). visibility = 0 #: A 256-element tuple corresponding to the contents of the GCW record for #: this sheet. If no such record, treat as all bits zero. #: Applies to BIFF4-7 only. See docs of the :class:`Colinfo` class for #: discussion. gcw = (0, ) * 256 #: A list of :class:`Hyperlink` objects corresponding to ``HLINK`` records #: found in the worksheet. #: #: .. versionadded:: 0.7.2 hyperlink_list = [] #: A sparse mapping from ``(rowx, colx)`` to an item in #: :attr:`~xlrd.sheet.Sheet.hyperlink_list`. #: Cells not covered by a hyperlink are not mapped. #: It is possible using the Excel UI to set up a hyperlink that #: covers a larger-than-1x1 rectangle of cells. 
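#:
#: How to use (an illustrative sketch, not from the original docs; the
#: attribute names shown come from the :class:`Hyperlink` objects in
#: :attr:`~xlrd.sheet.Sheet.hyperlink_list`):
#:
#: .. code-block:: python
#:
#:     link = thesheet.hyperlink_map.get((rowx, colx))
#:     if link is not None:
#:         print(link.url_or_path, link.desc)
#: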
#: Hyperlink rectangles may overlap (Excel doesn't check). #: When a multiply-covered cell is clicked on, the hyperlink that is #: activated #: (and the one that is mapped here) is the last in #: :attr:`~xlrd.sheet.Sheet.hyperlink_list`. #: #: .. versionadded:: 0.7.2 hyperlink_map = {} #: A sparse mapping from ``(rowx, colx)`` to a :class:`Note` object. #: Cells not containing a note ("comment") are not mapped. #: #: .. versionadded:: 0.7.2 cell_note_map = {} #: Number of columns in left pane (frozen panes; for split panes, see #: comments in code) vert_split_pos = 0 #: Number of rows in top pane (frozen panes; for split panes, see comments #: in code) horz_split_pos = 0 #: Index of first visible row in bottom frozen/split pane horz_split_first_visible = 0 #: Index of first visible column in right frozen/split pane vert_split_first_visible = 0 #: Frozen panes: ignore it. Split panes: explanation and diagrams in #: OOo docs. split_active_pane = 0 #: Boolean specifying if a ``PANE`` record was present, ignore unless you're #: ``xlutils.copy`` has_pane_record = 0 #: A list of the horizontal page breaks in this sheet. #: Breaks are tuples in the form #: ``(index of row after break, start col index, end col index)``. #: #: Populated only if ``open_workbook(..., formatting_info=True)`` #: #: .. versionadded:: 0.7.2 horizontal_page_breaks = [] #: A list of the vertical page breaks in this sheet. #: Breaks are tuples in the form #: ``(index of col after break, start row index, end row index)``. #: #: Populated only if ``open_workbook(..., formatting_info=True)`` #: #: .. versionadded:: 0.7.2 vertical_page_breaks = [] def __init__(self, book, position, name, number): self.book = book self.biff_version = book.biff_version self._position = position self.logfile = book.logfile self.bt = array('B', [XL_CELL_EMPTY]) self.bf = array('h', [-1]) self.name = name self.number = number self.verbosity = book.verbosity self.formatting_info = book.formatting_info self.ragged_rows = book.ragged_rows if self.ragged_rows: self.put_cell = self.put_cell_ragged else: self.put_cell = self.put_cell_unragged self._xf_index_to_xl_type_map = book._xf_index_to_xl_type_map self.nrows = 0 # actual, including possibly empty cells self.ncols = 0 self._maxdatarowx = -1 # highest rowx containing a non-empty cell self._maxdatacolx = -1 # highest colx containing a non-empty cell self._dimnrows = 0 # as per DIMENSIONS record self._dimncols = 0 self._cell_values = [] self._cell_types = [] self._cell_xf_indexes = [] self.defcolwidth = None self.standardwidth = None self.default_row_height = None self.default_row_height_mismatch = 0 self.default_row_hidden = 0 self.default_additional_space_above = 0 self.default_additional_space_below = 0 self.colinfo_map = {} self.rowinfo_map = {} self.col_label_ranges = [] self.row_label_ranges = [] self.merged_cells = [] self.rich_text_runlist_map = {} self.horizontal_page_breaks = [] self.vertical_page_breaks = [] self._xf_index_stats = [0, 0, 0, 0] self.visibility = book._sheet_visibility[number] # from BOUNDSHEET record for attr, defval in _WINDOW2_options: setattr(self, attr, defval) self.first_visible_rowx = 0 self.first_visible_colx = 0 self.gridline_colour_index = 0x40 self.gridline_colour_rgb = None # pre-BIFF8 self.hyperlink_list = [] self.hyperlink_map = {} self.cell_note_map = {} # Values calculated by xlrd to predict the mag factors that # will actually be used by Excel to display your worksheet. # Pass these values to xlwt when writing XLS files. 
# Warning 1: Behaviour of OOo Calc and Gnumeric has been observed to differ from Excel's. # Warning 2: A value of zero means almost exactly what it says. Your sheet will be # displayed as a very tiny speck on the screen. xlwt will reject attempts to set # a mag_factor that is not (10 <= mag_factor <= 400). self.cooked_page_break_preview_mag_factor = 60 self.cooked_normal_view_mag_factor = 100 # Values (if any) actually stored on the XLS file self.cached_page_break_preview_mag_factor = 0 # default (60%), from WINDOW2 record self.cached_normal_view_mag_factor = 0 # default (100%), from WINDOW2 record self.scl_mag_factor = None # from SCL record self._ixfe = None # BIFF2 only self._cell_attr_to_xfx = {} # BIFF2.0 only if self.biff_version >= 80: self.utter_max_rows = 65536 else: self.utter_max_rows = 16384 self.utter_max_cols = 256 self._first_full_rowx = -1 # self._put_cell_exceptions = 0 # self._put_cell_row_widenings = 0 # self._put_cell_rows_appended = 0 # self._put_cell_cells_appended = 0 def cell(self, rowx, colx): """ :class:`Cell` object in the given row and column. """ if self.formatting_info: xfx = self.cell_xf_index(rowx, colx) else: xfx = None return Cell( self._cell_types[rowx][colx], self._cell_values[rowx][colx], xfx, ) def cell_value(self, rowx, colx): "Value of the cell in the given row and column." return self._cell_values[rowx][colx] def cell_type(self, rowx, colx): """ Type of the cell in the given row and column. Refer to the documentation of the :class:`Cell` class. """ return self._cell_types[rowx][colx] def cell_xf_index(self, rowx, colx): """ XF index of the cell in the given row and column. This is an index into :attr:`~xlrd.book.Book.xf_list`. .. versionadded:: 0.6.1 """ self.req_fmt_info() xfx = self._cell_xf_indexes[rowx][colx] if xfx > -1: self._xf_index_stats[0] += 1 return xfx # Check for a row xf_index try: xfx = self.rowinfo_map[rowx].xf_index if xfx > -1: self._xf_index_stats[1] += 1 return xfx except KeyError: pass # Check for a column xf_index try: xfx = self.colinfo_map[colx].xf_index if xfx == -1: xfx = 15 self._xf_index_stats[2] += 1 return xfx except KeyError: # If all else fails, 15 is used as hardwired global default xf_index. self._xf_index_stats[3] += 1 return 15 def row_len(self, rowx): """ Returns the effective number of cells in the given row. For use with ``open_workbook(ragged_rows=True)`` which is likely to produce rows with fewer than :attr:`~Sheet.ncols` cells. .. versionadded:: 0.7.2 """ return len(self._cell_values[rowx]) def row(self, rowx): """ Returns a sequence of the :class:`Cell` objects in the given row. """ return [ self.cell(rowx, colx) for colx in xrange(len(self._cell_values[rowx])) ] def get_rows(self): "Returns a generator for iterating through each row." return (self.row(index) for index in range(self.nrows)) def row_types(self, rowx, start_colx=0, end_colx=None): """ Returns a slice of the types of the cells in the given row. """ if end_colx is None: return self._cell_types[rowx][start_colx:] return self._cell_types[rowx][start_colx:end_colx] def row_values(self, rowx, start_colx=0, end_colx=None): """ Returns a slice of the values of the cells in the given row. """ if end_colx is None: return self._cell_values[rowx][start_colx:] return self._cell_values[rowx][start_colx:end_colx] def row_slice(self, rowx, start_colx=0, end_colx=None): """ Returns a slice of the :class:`Cell` objects in the given row. 
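For example (an illustrative addition, not part of the original
docstring; ``end_colx`` is exclusive)::

    first_three = thesheet.row_slice(0, 0, 3)   # Cell objects for columns 0-2 of row 0
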
""" nc = len(self._cell_values[rowx]) if start_colx < 0: start_colx += nc if start_colx < 0: start_colx = 0 if end_colx is None or end_colx > nc: end_colx = nc elif end_colx < 0: end_colx += nc return [ self.cell(rowx, colx) for colx in xrange(start_colx, end_colx) ] def col_slice(self, colx, start_rowx=0, end_rowx=None): """ Returns a slice of the :class:`Cell` objects in the given column. """ nr = self.nrows if start_rowx < 0: start_rowx += nr if start_rowx < 0: start_rowx = 0 if end_rowx is None or end_rowx > nr: end_rowx = nr elif end_rowx < 0: end_rowx += nr return [ self.cell(rowx, colx) for rowx in xrange(start_rowx, end_rowx) ] def col_values(self, colx, start_rowx=0, end_rowx=None): """ Returns a slice of the values of the cells in the given column. """ nr = self.nrows if start_rowx < 0: start_rowx += nr if start_rowx < 0: start_rowx = 0 if end_rowx is None or end_rowx > nr: end_rowx = nr elif end_rowx < 0: end_rowx += nr return [ self._cell_values[rowx][colx] for rowx in xrange(start_rowx, end_rowx) ] def col_types(self, colx, start_rowx=0, end_rowx=None): """ Returns a slice of the types of the cells in the given column. """ nr = self.nrows if start_rowx < 0: start_rowx += nr if start_rowx < 0: start_rowx = 0 if end_rowx is None or end_rowx > nr: end_rowx = nr elif end_rowx < 0: end_rowx += nr return [ self._cell_types[rowx][colx] for rowx in xrange(start_rowx, end_rowx) ] col = col_slice # === Following methods are used in building the worksheet. # === They are not part of the API. def tidy_dimensions(self): if self.verbosity >= 3: fprintf( self.logfile, "tidy_dimensions: nrows=%d ncols=%d \n", self.nrows, self.ncols, ) if 1 and self.merged_cells: nr = nc = 0 umaxrows = self.utter_max_rows umaxcols = self.utter_max_cols for crange in self.merged_cells: rlo, rhi, clo, chi = crange if not (0 <= rlo < rhi <= umaxrows) or not (0 <= clo < chi <= umaxcols): fprintf(self.logfile, "*** WARNING: sheet #%d (%r), MERGEDCELLS bad range %r\n", self.number, self.name, crange) if rhi > nr: nr = rhi if chi > nc: nc = chi if nc > self.ncols: self.ncols = nc self._first_full_rowx = -2 if nr > self.nrows: # we put one empty cell at (nr-1,0) to make sure # we have the right number of rows. The ragged rows # will sort out the rest if needed. 
self.put_cell(nr-1, 0, XL_CELL_EMPTY, UNICODE_LITERAL(''), -1) if (self.verbosity >= 1 and (self.nrows != self._dimnrows or self.ncols != self._dimncols)): fprintf( self.logfile, "NOTE *** sheet %d (%r): DIMENSIONS R,C = %d,%d should be %d,%d\n", self.number, self.name, self._dimnrows, self._dimncols, self.nrows, self.ncols, ) if not self.ragged_rows: # fix ragged rows ncols = self.ncols s_cell_types = self._cell_types s_cell_values = self._cell_values s_cell_xf_indexes = self._cell_xf_indexes s_fmt_info = self.formatting_info # for rowx in xrange(self.nrows): if self._first_full_rowx == -2: ubound = self.nrows else: ubound = self._first_full_rowx for rowx in xrange(ubound): trow = s_cell_types[rowx] rlen = len(trow) nextra = ncols - rlen if nextra > 0: s_cell_values[rowx][rlen:] = [UNICODE_LITERAL('')] * nextra trow[rlen:] = self.bt * nextra if s_fmt_info: s_cell_xf_indexes[rowx][rlen:] = self.bf * nextra def put_cell_ragged(self, rowx, colx, ctype, value, xf_index): if ctype is None: # we have a number, so look up the cell type ctype = self._xf_index_to_xl_type_map[xf_index] assert 0 <= colx < self.utter_max_cols assert 0 <= rowx < self.utter_max_rows fmt_info = self.formatting_info try: nr = rowx + 1 if self.nrows < nr: scta = self._cell_types.append scva = self._cell_values.append scxa = self._cell_xf_indexes.append bt = self.bt bf = self.bf for _unused in xrange(self.nrows, nr): scta(bt * 0) scva([]) if fmt_info: scxa(bf * 0) self.nrows = nr types_row = self._cell_types[rowx] values_row = self._cell_values[rowx] if fmt_info: fmt_row = self._cell_xf_indexes[rowx] ltr = len(types_row) if colx >= self.ncols: self.ncols = colx + 1 num_empty = colx - ltr if not num_empty: # most common case: colx == previous colx + 1 # self._put_cell_cells_appended += 1 types_row.append(ctype) values_row.append(value) if fmt_info: fmt_row.append(xf_index) return if num_empty > 0: num_empty += 1 # self._put_cell_row_widenings += 1 # types_row.extend(self.bt * num_empty) # values_row.extend([UNICODE_LITERAL('')] * num_empty) # if fmt_info: # fmt_row.extend(self.bf * num_empty) types_row[ltr:] = self.bt * num_empty values_row[ltr:] = [UNICODE_LITERAL('')] * num_empty if fmt_info: fmt_row[ltr:] = self.bf * num_empty types_row[colx] = ctype values_row[colx] = value if fmt_info: fmt_row[colx] = xf_index except: print("put_cell", rowx, colx, file=self.logfile) raise def put_cell_unragged(self, rowx, colx, ctype, value, xf_index): if ctype is None: # we have a number, so look up the cell type ctype = self._xf_index_to_xl_type_map[xf_index] # assert 0 <= colx < self.utter_max_cols # assert 0 <= rowx < self.utter_max_rows try: self._cell_types[rowx][colx] = ctype self._cell_values[rowx][colx] = value if self.formatting_info: self._cell_xf_indexes[rowx][colx] = xf_index except IndexError: # print >> self.logfile, "put_cell extending", rowx, colx # self.extend_cells(rowx+1, colx+1) # self._put_cell_exceptions += 1 nr = rowx + 1 nc = colx + 1 assert 1 <= nc <= self.utter_max_cols assert 1 <= nr <= self.utter_max_rows if nc > self.ncols: self.ncols = nc # The row self._first_full_rowx and all subsequent rows # are guaranteed to have length == self.ncols. Thus the # "fix ragged rows" section of the tidy_dimensions method # doesn't need to examine them. if nr < self.nrows: # cell data is not in non-descending row order *AND* # self.ncols has been bumped up. # This very rare case ruins this optmisation. 
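# (A value of -2 withdraws that guarantee entirely, so
# tidy_dimensions falls back to examining every row.)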
self._first_full_rowx = -2 elif rowx > self._first_full_rowx > -2: self._first_full_rowx = rowx if nr <= self.nrows: # New cell is in an existing row, so extend that row (if necessary). # Note that nr < self.nrows means that the cell data # is not in ascending row order!! trow = self._cell_types[rowx] nextra = self.ncols - len(trow) if nextra > 0: # self._put_cell_row_widenings += 1 trow.extend(self.bt * nextra) if self.formatting_info: self._cell_xf_indexes[rowx].extend(self.bf * nextra) self._cell_values[rowx].extend([UNICODE_LITERAL('')] * nextra) else: scta = self._cell_types.append scva = self._cell_values.append scxa = self._cell_xf_indexes.append fmt_info = self.formatting_info nc = self.ncols bt = self.bt bf = self.bf for _unused in xrange(self.nrows, nr): # self._put_cell_rows_appended += 1 scta(bt * nc) scva([UNICODE_LITERAL('')] * nc) if fmt_info: scxa(bf * nc) self.nrows = nr # === end of code from extend_cells() try: self._cell_types[rowx][colx] = ctype self._cell_values[rowx][colx] = value if self.formatting_info: self._cell_xf_indexes[rowx][colx] = xf_index except: print("put_cell", rowx, colx, file=self.logfile) raise except: print("put_cell", rowx, colx, file=self.logfile) raise # === Methods after this line neither know nor care about how cells are stored. def read(self, bk): global rc_stats DEBUG = 0 blah = DEBUG or self.verbosity >= 2 blah_rows = DEBUG or self.verbosity >= 4 blah_formulas = 0 and blah r1c1 = 0 oldpos = bk._position bk._position = self._position XL_SHRFMLA_ETC_ETC = ( XL_SHRFMLA, XL_ARRAY, XL_TABLEOP, XL_TABLEOP2, XL_ARRAY2, XL_TABLEOP_B2, ) self_put_cell = self.put_cell local_unpack = unpack bk_get_record_parts = bk.get_record_parts bv = self.biff_version fmt_info = self.formatting_info do_sst_rich_text = fmt_info and bk._rich_text_runlist_map rowinfo_sharing_dict = {} txos = {} eof_found = 0 while 1: # if DEBUG: print "SHEET.READ: about to read from position %d" % bk._position rc, data_len, data = bk_get_record_parts() # if rc in rc_stats: # rc_stats[rc] += 1 # else: # rc_stats[rc] = 1 # if DEBUG: print "SHEET.READ: op 0x%04x, %d bytes %r" % (rc, data_len, data) if rc == XL_NUMBER: # [:14] in following stmt ignores extraneous rubbish at end of record. # Sample file testEON-8.xls supplied by Jan Kraus. 
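# (NUMBER record layout, for reference: rowx:H, colx:H, xf_index:H,
# value:d, i.e. 2+2+2+8 = 14 bytes -- hence the [:14] slice and a
# little-endian "<HHHd" struct format.)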
rowx, colx, xf_index, d = local_unpack('> 15) & 1 r.outline_level = bits2 & 7 r.outline_group_starts_ends = (bits2 >> 4) & 1 r.hidden = (bits2 >> 5) & 1 r.height_mismatch = (bits2 >> 6) & 1 r.has_default_xf_index = (bits2 >> 7) & 1 r.xf_index = (bits2 >> 16) & 0xfff r.additional_space_above = (bits2 >> 28) & 1 r.additional_space_below = (bits2 >> 29) & 1 if not r.has_default_xf_index: r.xf_index = -1 self.rowinfo_map[rowx] = r if 0 and r.xf_index > -1: fprintf(self.logfile, "**ROW %d %d %d\n", self.number, rowx, r.xf_index) if blah_rows: print('ROW', rowx, bits1, bits2, file=self.logfile) r.dump(self.logfile, header="--- sh #%d, rowx=%d ---" % (self.number, rowx)) elif rc in XL_FORMULA_OPCODES: # 06, 0206, 0406 # DEBUG = 1 # if DEBUG: print "FORMULA: rc: 0x%04x data: %r" % (rc, data) if bv >= 50: rowx, colx, xf_index, result_str, flags = local_unpack('= 30: rowx, colx, xf_index, result_str, flags = local_unpack(' 255: break # Excel does 0 to 256 inclusive self.colinfo_map[colx] = c if 0: fprintf(self.logfile, "**COL %d %d %d\n", self.number, colx, c.xf_index) if blah: fprintf( self.logfile, "COLINFO sheet #%d cols %d-%d: wid=%d xf_index=%d flags=0x%04x\n", self.number, first_colx, last_colx, c.width, c.xf_index, flags, ) c.dump(self.logfile, header='===') elif rc == XL_DEFCOLWIDTH: self.defcolwidth, = local_unpack(">= 1 self.gcw = tuple(gcw) if 0: showgcw = "".join(map(lambda x: "F "[x], gcw)).rstrip().replace(' ', '.') print("GCW:", showgcw, file=self.logfile) elif rc == XL_BLANK: if not fmt_info: continue rowx, colx, xf_index = local_unpack('> self.logfile, "BLANK", rowx, colx, xf_index self_put_cell(rowx, colx, XL_CELL_BLANK, '', xf_index) elif rc == XL_MULBLANK: # 00BE if not fmt_info: continue nitems = data_len >> 1 result = local_unpack("<%dH" % nitems, data) rowx, mul_first = result[:2] mul_last = result[-1] # print >> self.logfile, "MULBLANK", rowx, mul_first, mul_last, data_len, nitems, mul_last + 4 - mul_first assert nitems == mul_last + 4 - mul_first pos = 2 for colx in xrange(mul_first, mul_last + 1): self_put_cell(rowx, colx, XL_CELL_BLANK, '', result[pos]) pos += 1 elif rc == XL_DIMENSION or rc == XL_DIMENSION2: if data_len == 0: # Four zero bytes after some other record. See github issue 64. continue # if data_len == 10: # Was crashing on BIFF 4.0 file w/o the two trailing unused bytes. # Reported by Ralph Heimburger. 
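# (DIMENSIONS stores the bounds of the used cell area; row indexes are
# 16-bit up to BIFF7 but 32-bit in BIFF8, which is why the two branches
# below slice and unpack the record differently.)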
if bv < 80: dim_tuple = local_unpack(' found EOF", file=self.logfile) elif rc == XL_COUNTRY: bk.handle_country(data) elif rc == XL_LABELRANGES: pos = 0 pos = unpack_cell_range_address_list_update_pos( self.row_label_ranges, data, pos, bv, addr_size=8, ) pos = unpack_cell_range_address_list_update_pos( self.col_label_ranges, data, pos, bv, addr_size=8, ) assert pos == data_len elif rc == XL_ARRAY: row1x, rownx, col1x, colnx, array_flags, tokslen = \ local_unpack("= 80 num_CFs, needs_recalc, browx1, browx2, bcolx1, bcolx2 = \ unpack("<6H", data[0:12]) if self.verbosity >= 1: fprintf( self.logfile, "\n*** WARNING: Ignoring CONDFMT (conditional formatting) record\n" "*** in Sheet %d (%r).\n" "*** %d CF record(s); needs_recalc_or_redraw = %d\n" "*** Bounding box is %s\n", self.number, self.name, num_CFs, needs_recalc, rangename2d(browx1, browx2+1, bcolx1, bcolx2+1), ) olist = [] # updated by the function pos = unpack_cell_range_address_list_update_pos( olist, data, 12, bv, addr_size=8) # print >> self.logfile, repr(result), len(result) if self.verbosity >= 1: fprintf( self.logfile, "*** %d individual range(s):\n" "*** %s\n", len(olist), ", ".join(rangename2d(*coords) for coords in olist), ) elif rc == XL_CF: if not fmt_info: continue cf_type, cmp_op, sz1, sz2, flags = unpack("> 26) & 1 bord_block = (flags >> 28) & 1 patt_block = (flags >> 29) & 1 if self.verbosity >= 1: fprintf( self.logfile, "\n*** WARNING: Ignoring CF (conditional formatting) sub-record.\n" "*** cf_type=%d, cmp_op=%d, sz1=%d, sz2=%d, flags=0x%08x\n" "*** optional data blocks: font=%d, border=%d, pattern=%d\n", cf_type, cmp_op, sz1, sz2, flags, font_block, bord_block, patt_block, ) # hex_char_dump(data, 0, data_len, fout=self.logfile) pos = 12 if font_block: (font_height, font_options, weight, escapement, underline, font_colour_index, two_bits, font_esc, font_underl) = unpack("<64x i i H H B 3x i 4x i i i 18x", data[pos:pos+118]) font_style = (two_bits > 1) & 1 posture = (font_options > 1) & 1 font_canc = (two_bits > 7) & 1 cancellation = (font_options > 7) & 1 if self.verbosity >= 1: fprintf( self.logfile, "*** Font info: height=%d, weight=%d, escapement=%d,\n" "*** underline=%d, colour_index=%d, esc=%d, underl=%d,\n" "*** style=%d, posture=%d, canc=%d, cancellation=%d\n", font_height, weight, escapement, underline, font_colour_index, font_esc, font_underl, font_style, posture, font_canc, cancellation, ) pos += 118 if bord_block: pos += 8 if patt_block: pos += 4 fmla1 = data[pos:pos+sz1] pos += sz1 if blah and sz1: fprintf(self.logfile, "*** formula 1:\n") dump_formula(bk, fmla1, sz1, bv, reldelta=0, blah=1) fmla2 = data[pos:pos+sz2] pos += sz2 assert pos == data_len if blah and sz2: fprintf(self.logfile, "*** formula 2:\n") dump_formula(bk, fmla2, sz2, bv, reldelta=0, blah=1) elif rc == XL_DEFAULTROWHEIGHT: if data_len == 4: bits, self.default_row_height = unpack("> 1) & 1 self.default_additional_space_above = (bits >> 2) & 1 self.default_additional_space_below = (bits >> 3) & 1 elif rc == XL_MERGEDCELLS: if not fmt_info: continue pos = unpack_cell_range_address_list_update_pos( self.merged_cells, data, 0, bv, addr_size=8) if blah: fprintf(self.logfile, "MERGEDCELLS: %d ranges\n", (pos - 2) // 8) assert pos == data_len, \ "MERGEDCELLS: pos=%d data_len=%d" % (pos, data_len) elif rc == XL_WINDOW2: if bv >= 80 and data_len >= 14: ( options, self.first_visible_rowx, self.first_visible_colx, self.gridline_colour_index, self.cached_page_break_preview_mag_factor, self.cached_normal_view_mag_factor ) = unpack("= 30 # BIFF3-7 ( 
options, self.first_visible_rowx, self.first_visible_colx, ) = unpack(">= 1 elif rc == XL_SCL: num, den = unpack("= 0: print( "WARNING *** SCL rcd sheet %d: should have 0.1 <= num/den <= 4; got %d/%d" % (self.number, num, den), file=self.logfile, ) result = 100 self.scl_mag_factor = result elif rc == XL_PANE: ( self.vert_split_pos, self.horz_split_pos, self.horz_split_first_visible, self.vert_split_first_visible, self.split_active_pane, ) = unpack("= 80)) + 2 == data_len pos = 2 if bv < 80: while pos < data_len: self.horizontal_page_breaks.append((local_unpack("= 80)) + 2 == data_len pos = 2 if bv < 80: while pos < data_len: self.vertical_page_breaks.append((local_unpack("> 15) & 1 r.has_default_xf_index = bits2 & 1 r.xf_index = xf_index # r.outline_level = 0 # set in __init__ # r.outline_group_starts_ends = 0 # set in __init__ # r.hidden = 0 # set in __init__ # r.height_mismatch = 0 # set in __init__ # r.additional_space_above = 0 # set in __init__ # r.additional_space_below = 0 # set in __init__ self.rowinfo_map[rowx] = r if 0 and r.xf_index > -1: fprintf(self.logfile, "**ROW %d %d %d\n", self.number, rowx, r.xf_index) if blah_rows: print('ROW_B2', rowx, bits1, file=self.logfile) r.dump(self.logfile, header="--- sh #%d, rowx=%d ---" % (self.number, rowx)) elif rc == XL_COLWIDTH: # BIFF2 only if not fmt_info: continue first_colx, last_colx, width\ = local_unpack("= 30) + 1 nchars_expected = unpack("<" + "BH"[lenlen - 1], data[:lenlen])[0] offset = lenlen if bv < 80: enc = bk.encoding or bk.derive_encoding() nchars_found = 0 result = UNICODE_LITERAL("") while 1: if bv >= 80: flag = BYTES_ORD(data[offset]) & 1 enc = ("latin_1", "utf_16_le")[flag] offset += 1 chunk = unicode(data[offset:], enc) result += chunk nchars_found += len(chunk) if nchars_found == nchars_expected: return result if nchars_found > nchars_expected: msg = ("STRING/CONTINUE: expected %d chars, found %d" % (nchars_expected, nchars_found)) raise XLRDError(msg) rc, _unused_len, data = bk.get_record_parts() if rc != XL_CONTINUE: raise XLRDError( "Expected CONTINUE record; found record-type 0x%04X" % rc) offset = 0 def update_cooked_mag_factors(self): # Cached values are used ONLY for the non-active view mode. # When the user switches to the non-active view mode, # if the cached value for that mode is not valid, # Excel pops up a window which says: # "The number must be between 10 and 400. Try again by entering a number in this range." # When the user hits OK, it drops into the non-active view mode # but uses the magn from the active mode. # NOTE: definition of "valid" depends on mode ... 
        # see below
        blah = DEBUG or self.verbosity > 0
        if self.show_in_page_break_preview:
            if self.scl_mag_factor is None: # no SCL record
                self.cooked_page_break_preview_mag_factor = 100 # Yes, 100, not 60, NOT a typo
            else:
                self.cooked_page_break_preview_mag_factor = self.scl_mag_factor
            zoom = self.cached_normal_view_mag_factor
            if not (10 <= zoom <= 400):
                if blah:
                    print(
                        "WARNING *** WINDOW2 rcd sheet %d: Bad cached_normal_view_mag_factor: %d"
                        % (self.number, self.cached_normal_view_mag_factor),
                        file=self.logfile,
                    )
                zoom = self.cooked_page_break_preview_mag_factor
            self.cooked_normal_view_mag_factor = zoom
        else:
            # normal view mode
            if self.scl_mag_factor is None: # no SCL record
                self.cooked_normal_view_mag_factor = 100
            else:
                self.cooked_normal_view_mag_factor = self.scl_mag_factor
            zoom = self.cached_page_break_preview_mag_factor
            if not zoom:
                # VALID, defaults to 60
                zoom = 60
            elif not (10 <= zoom <= 400):
                if blah:
                    print(
                        "WARNING *** WINDOW2 rcd sheet %r: Bad cached_page_break_preview_mag_factor: %r"
                        % (self.number, self.cached_page_break_preview_mag_factor),
                        file=self.logfile,
                    )
                zoom = self.cooked_normal_view_mag_factor
            self.cooked_page_break_preview_mag_factor = zoom

    def fixed_BIFF2_xfindex(self, cell_attr, rowx, colx, true_xfx=None):
        DEBUG = 0
        blah = DEBUG or self.verbosity >= 2
        if self.biff_version == 21:
            if self.book.xf_list:
                if true_xfx is not None:
                    xfx = true_xfx
                else:
                    xfx = BYTES_ORD(cell_attr[0]) & 0x3F
                if xfx == 0x3F:
                    if self._ixfe is None:
                        raise XLRDError("BIFF2 cell record has XF index 63 but no preceding IXFE record.")
                    xfx = self._ixfe
                    # OOo docs are capable of interpretation that each
                    # cell record is preceded immediately by its own IXFE record.
                    # Empirical evidence is that (sensibly) an IXFE record applies to all
                    # following cell records until another IXFE comes along.
                return xfx
            # Have either Excel 2.0, or broken 2.1 w/o XF records -- same effect.
            self.biff_version = self.book.biff_version = 20
        #### check that XF slot in cell_attr is zero
        xfx_slot = BYTES_ORD(cell_attr[0]) & 0x3F
        assert xfx_slot == 0
        xfx = self._cell_attr_to_xfx.get(cell_attr)
        if xfx is not None:
            return xfx
        if blah:
            fprintf(self.logfile, "New cell_attr %r at (%r, %r)\n", cell_attr, rowx, colx)
        if not self.book.xf_list:
            for xfx in xrange(16):
                self.insert_new_BIFF20_xf(cell_attr=b"\x40\x00\x00", style=xfx < 15)
        xfx = self.insert_new_BIFF20_xf(cell_attr=cell_attr)
        return xfx

    def insert_new_BIFF20_xf(self, cell_attr, style=0):
        DEBUG = 0
        blah = DEBUG or self.verbosity >= 2
        book = self.book
        xfx = len(book.xf_list)
        xf = self.fake_XF_from_BIFF20_cell_attr(cell_attr, style)
        xf.xf_index = xfx
        book.xf_list.append(xf)
        if blah:
            xf.dump(self.logfile, header="=== Faked XF %d ===" % xfx, footer="======")
        if xf.format_key not in book.format_map:
            if xf.format_key:
                msg = "ERROR *** XF[%d] unknown format key (%d, 0x%04x)\n"
                fprintf(self.logfile, msg, xf.xf_index, xf.format_key, xf.format_key)
            fmt = Format(xf.format_key, FUN, UNICODE_LITERAL("General"))
            book.format_map[xf.format_key] = fmt
            book.format_list.append(fmt)
        cellty_from_fmtty = {
            FNU: XL_CELL_NUMBER,
            FUN: XL_CELL_NUMBER,
            FGE: XL_CELL_NUMBER,
            FDT: XL_CELL_DATE,
            FTX: XL_CELL_NUMBER, # Yes, a number can be formatted as text.
        }
        fmt = book.format_map[xf.format_key]
        cellty = cellty_from_fmtty[fmt.type]
        self._xf_index_to_xl_type_map[xf.xf_index] = cellty
        self._cell_attr_to_xfx[cell_attr] = xfx
        return xfx
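    # Rough illustration of how the next method carves up the 3-byte BIFF2 cell
    # attribute field. Illustration only (not part of the xlrd API; the helper name
    # is made up, and the byte layout is inferred from the masks used in
    # fake_XF_from_BIFF20_cell_attr below):
    #
    #   import struct
    #
    #   def split_biff2_cell_attr(cell_attr):
    #       # byte 0: protection bits, byte 1: font/format, byte 2: alignment/border/shading
    #       prot_bits, font_and_format, halign_etc = struct.unpack('<BBB', cell_attr)
    #       return {
    #           'cell_locked':    (prot_bits >> 6) & 1,        # mask 0x40
    #           'formula_hidden': (prot_bits >> 7) & 1,        # mask 0x80
    #           'format_key':     font_and_format & 0x3F,
    #           'font_index':     (font_and_format & 0xC0) >> 6,
    #           'hor_align':      halign_etc & 0x07,
    #           'left_border':    bool(halign_etc & 0x08),
    #           'right_border':   bool(halign_etc & 0x10),
    #           'top_border':     bool(halign_etc & 0x20),
    #           'bottom_border':  bool(halign_etc & 0x40),
    #           'shaded':         bool(halign_etc & 0x80),
    #       }
    #
    #   split_biff2_cell_attr(b"\x40\x00\x00")  # the default locked/General attribute used above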
    def fake_XF_from_BIFF20_cell_attr(self, cell_attr, style=0):
        from .formatting import XF, XFAlignment, XFBorder, XFBackground, XFProtection
        xf = XF()
        xf.alignment = XFAlignment()
        xf.alignment.indent_level = 0
        xf.alignment.shrink_to_fit = 0
        xf.alignment.text_direction = 0
        xf.border = XFBorder()
        xf.border.diag_up = 0
        xf.border.diag_down = 0
        xf.border.diag_colour_index = 0
        xf.border.diag_line_style = 0 # no line
        xf.background = XFBackground()
        xf.protection = XFProtection()
        (prot_bits, font_and_format, halign_etc) = unpack('<BBB', cell_attr)
        xf.format_key = font_and_format & 0x3F
        xf.font_index = (font_and_format & 0xC0) >> 6
        upkbits(xf.protection, prot_bits, (
            (6, 0x40, 'cell_locked'),
            (7, 0x80, 'formula_hidden'),
        ))
        xf.alignment.hor_align = halign_etc & 0x07
        for mask, side in ((0x08, 'left'), (0x10, 'right'), (0x20, 'top'), (0x40, 'bottom')):
            if halign_etc & mask:
                colour_index, line_style = 8, 1 # black, thin
            else:
                colour_index, line_style = 0, 0 # none, none
            setattr(xf.border, side + '_colour_index', colour_index)
            setattr(xf.border, side + '_line_style', line_style)
        bg = xf.background
        if halign_etc & 0x80:
            bg.fill_pattern = 17
        else:
            bg.fill_pattern = 0
        bg.background_colour_index = 9 # white
        bg.pattern_colour_index = 8 # black
        xf.parent_style_index = (0x0FFF, 0)[style]
        xf.alignment.vert_align = 2 # bottom
        xf.alignment.rotation = 0
        attr_stems = [
            'format',
            'font',
            'alignment',
            'border',
            'background',
            'protection',
        ]
        for attr_stem in attr_stems:
            attr = "_" + attr_stem + "_flag"
            setattr(xf, attr, 1)
        return xf

    def req_fmt_info(self):
        if not self.formatting_info:
            raise XLRDError("Feature requires open_workbook(..., formatting_info=True)")

    def computed_column_width(self, colx):
        """
        Determine column display width.

        :param colx:
          Index of the queried column, range 0 to 255.
          Note that it is possible to find out the width that will be used to
          display columns with no cell information e.g. column IV (colx=255).

        :return:
          The column width that will be used for displaying the given column
          by Excel, in units of 1/256th of the width of a standard character
          (the digit zero in the first font).

        .. versionadded:: 0.6.1
        """
        self.req_fmt_info()
        if self.biff_version >= 80:
            colinfo = self.colinfo_map.get(colx, None)
            if colinfo is not None:
                return colinfo.width
            if self.standardwidth is not None:
                return self.standardwidth
        elif self.biff_version >= 40:
            if self.gcw[colx]:
                if self.standardwidth is not None:
                    return self.standardwidth
            else:
                colinfo = self.colinfo_map.get(colx, None)
                if colinfo is not None:
                    return colinfo.width
        elif self.biff_version == 30:
            colinfo = self.colinfo_map.get(colx, None)
            if colinfo is not None:
                return colinfo.width
        # All roads lead to Rome and the DEFCOLWIDTH ...
if self.defcolwidth is not None: return self.defcolwidth * 256 return 8 * 256 # 8 is what Excel puts in a DEFCOLWIDTH record def handle_hlink(self, data): # DEBUG = 1 if DEBUG: print("\n=== hyperlink ===", file=self.logfile) record_size = len(data) h = Hyperlink() h.frowx, h.lrowx, h.fcolx, h.lcolx, guid0, dummy, options = unpack(' 0: fprintf( self.logfile, "*** WARNING: hyperlink at R%dC%d has %d extra data bytes: %s\n", h.frowx + 1, h.fcolx + 1, extra_nbytes, REPR(data[-extra_nbytes:]), ) # Seen: b"\x00\x00" also b"A\x00", b"V\x00" elif extra_nbytes < 0: raise XLRDError("Bug or corrupt file, send copy of input file for debugging") self.hyperlink_list.append(h) for rowx in xrange(h.frowx, h.lrowx+1): for colx in xrange(h.fcolx, h.lcolx+1): self.hyperlink_map[rowx, colx] = h def handle_quicktip(self, data): rcx, frowx, lrowx, fcolx, lcolx = unpack('<5H', data[:10]) assert rcx == XL_QUICKTIP assert self.hyperlink_list h = self.hyperlink_list[-1] assert (frowx, lrowx, fcolx, lcolx) == (h.frowx, h.lrowx, h.fcolx, h.lcolx) assert data[-2:] == b'\x00\x00' h.quicktip = unicode(data[10:-2], 'utf_16_le') def handle_msodrawingetc(self, recid, data_len, data): if not OBJ_MSO_DEBUG: return DEBUG = 1 if self.biff_version < 80: return o = MSODrawing() pos = 0 while pos < data_len: tmp, fbt, cb = unpack('> 4) & 0xFFF if ver == 0xF: ndb = 0 # container else: ndb = cb if DEBUG: hex_char_dump(data, pos, ndb + 8, base=0, fout=self.logfile) fprintf(self.logfile, "fbt:0x%04X inst:%d ver:0x%X cb:%d (0x%04X)\n", fbt, inst, ver, cb, cb) if fbt == 0xF010: # Client Anchor assert ndb == 18 (o.anchor_unk, o.anchor_colx_lo, o.anchor_rowx_lo, o.anchor_colx_hi, o.anchor_rowx_hi) = unpack(' 0: rc2, data2_len, data2 = self.book.get_record_parts() assert rc2 == XL_NOTE dummy_rowx, nb = unpack('> 1) & 1 o.row_hidden = (option_flags >> 7) & 1 o.col_hidden = (option_flags >> 8) & 1 # XL97 dev kit book says NULL [sic] bytes padding between string count and string data # to ensure that string is word-aligned. Appears to be nonsense. o.author, endpos = unpack_unicode_update_pos(data, 8, lenlen=2) # There is a random/undefined byte after the author string (not counted in the # string length). # Issue 4 on github: Google Spreadsheet doesn't write the undefined byte. assert (data_len - endpos) in (0, 1) if OBJ_MSO_DEBUG: o.dump(self.logfile, header="=== Note ===", footer= " ") txo = txos.get(o._object_id) if txo: o.text = txo.text o.rich_text_runlist = txo.rich_text_runlist self.cell_note_map[o.rowx, o.colx] = o def handle_txo(self, data): if self.biff_version < 80: return o = MSTxo() fmt = '
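    # The magnification handling in update_cooked_mag_factors() above reduces to a
    # small rule: a cached zoom factor is usable only if it lies in Excel's 10..400
    # range, except that in page break preview a cached value of 0 is valid and
    # means 60. Sketch only -- cooked_zoom() is a made-up name, not an xlrd API:
    #
    #   def cooked_zoom(cached, fallback, zero_means=None):
    #       if zero_means is not None and cached == 0:
    #           return zero_means
    #       if 10 <= cached <= 400:
    #           return cached
    #       return fallback          # fall back to the other view mode's cooked factor
    #
    #   cooked_zoom(0, 100, zero_means=60)   # -> 60
    #   cooked_zoom(999, 100)                # -> 100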
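    # The structures populated by these handlers surface in xlrd's public interface.
    # Minimal usage sketch ("report.xls" is a hypothetical file; formatting_info=True
    # is required for merged_cells and computed_column_width in this version):
    #
    #   import xlrd
    #
    #   book = xlrd.open_workbook("report.xls", formatting_info=True)
    #   sheet = book.sheet_by_index(0)
    #
    #   width = sheet.computed_column_width(0)   # in 1/256ths of the '0' character width
    #   for rlo, rhi, clo, chi in sheet.merged_cells:
    #       print("merged rows %d..%d, cols %d..%d" % (rlo, rhi - 1, clo, chi - 1))
    #   link = sheet.hyperlink_map.get((0, 0))   # Hyperlink object or None
    #   note = sheet.cell_note_map.get((0, 0))   # Note object or None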