workbook.py 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133
  1. # Copyright (c) 2010-2024 openpyxl
  2. from warnings import warn
  3. from openpyxl.xml.functions import fromstring
  4. from openpyxl.packaging.relationship import (
  5. get_dependents,
  6. get_rels_path,
  7. get_rel,
  8. )
  9. from openpyxl.packaging.workbook import WorkbookPackage
  10. from openpyxl.workbook import Workbook
  11. from openpyxl.workbook.defined_name import DefinedNameList
  12. from openpyxl.workbook.external_link.external import read_external_link
  13. from openpyxl.pivot.cache import CacheDefinition
  14. from openpyxl.pivot.record import RecordList
  15. from openpyxl.worksheet.print_settings import PrintTitles, PrintArea
  16. from openpyxl.utils.datetime import CALENDAR_MAC_1904
  17. class WorkbookParser:
  18. _rels = None
  19. def __init__(self, archive, workbook_part_name, keep_links=True):
  20. self.archive = archive
  21. self.workbook_part_name = workbook_part_name
  22. self.defined_names = DefinedNameList()
  23. self.wb = Workbook()
  24. self.keep_links = keep_links
  25. self.sheets = []
  26. @property
  27. def rels(self):
  28. if self._rels is None:
  29. self._rels = get_dependents(self.archive, get_rels_path(self.workbook_part_name)).to_dict()
  30. return self._rels
  31. def parse(self):
  32. src = self.archive.read(self.workbook_part_name)
  33. node = fromstring(src)
  34. package = WorkbookPackage.from_tree(node)
  35. if package.properties.date1904:
  36. self.wb.epoch = CALENDAR_MAC_1904
  37. self.wb.code_name = package.properties.codeName
  38. self.wb.active = package.active
  39. self.wb.views = package.bookViews
  40. self.sheets = package.sheets
  41. self.wb.calculation = package.calcPr
  42. self.caches = package.pivotCaches
  43. # external links contain cached worksheets and can be very big
  44. if not self.keep_links:
  45. package.externalReferences = []
  46. for ext_ref in package.externalReferences:
  47. rel = self.rels.get(ext_ref.id)
  48. self.wb._external_links.append(
  49. read_external_link(self.archive, rel.Target)
  50. )
  51. if package.definedNames:
  52. self.defined_names = package.definedNames
  53. self.wb.security = package.workbookProtection
  54. def find_sheets(self):
  55. """
  56. Find all sheets in the workbook and return the link to the source file.
  57. Older XLSM files sometimes contain invalid sheet elements.
  58. Warn user when these are removed.
  59. """
  60. for sheet in self.sheets:
  61. if not sheet.id:
  62. msg = f"File contains an invalid specification for {0}. This will be removed".format(sheet.name)
  63. warn(msg)
  64. continue
  65. yield sheet, self.rels[sheet.id]
  66. def assign_names(self):
  67. """
  68. Bind defined names and other definitions to worksheets or the workbook
  69. """
  70. for idx, names in self.defined_names.by_sheet().items():
  71. if idx == "global":
  72. self.wb.defined_names = names
  73. continue
  74. try:
  75. sheet = self.wb._sheets[idx]
  76. except IndexError:
  77. warn(f"Defined names for sheet index {idx} cannot be located")
  78. continue
  79. for name, defn in names.items():
  80. reserved = defn.is_reserved
  81. if reserved is None:
  82. sheet.defined_names[name] = defn
  83. elif reserved == "Print_Titles":
  84. titles = PrintTitles.from_string(defn.value)
  85. sheet._print_rows = titles.rows
  86. sheet._print_cols = titles.cols
  87. elif reserved == "Print_Area":
  88. try:
  89. sheet._print_area = PrintArea.from_string(defn.value)
  90. except TypeError:
  91. warn(f"Print area cannot be set to Defined name: {defn.value}.")
  92. continue
  93. @property
  94. def pivot_caches(self):
  95. """
  96. Get PivotCache objects
  97. """
  98. d = {}
  99. for c in self.caches:
  100. cache = get_rel(self.archive, self.rels, id=c.id, cls=CacheDefinition)
  101. if cache.deps:
  102. records = get_rel(self.archive, cache.deps, cache.id, RecordList)
  103. cache.records = records
  104. d[c.cacheId] = cache
  105. return d