From a6720d2040e857e8726231e8874fef29e43d13b6 Mon Sep 17 00:00:00 2001 From: me Date: Wed, 10 May 2017 14:37:09 -0400 Subject: [PATCH 1/3] Get app.xml properties. --- docx/__init__.py | 4 +- docx/opc/appprops.py | 63 ++++++++++++ docx/opc/package.py | 21 ++++ docx/opc/parts/appprops.py | 48 +++++++++ docx/oxml/__init__.py | 3 + docx/oxml/appprops.py | 197 +++++++++++++++++++++++++++++++++++++ docx/oxml/coreprops.py | 1 - tests/opc/test_package.py | 1 + 8 files changed, 336 insertions(+), 2 deletions(-) create mode 100644 docx/opc/appprops.py create mode 100644 docx/opc/parts/appprops.py create mode 100644 docx/oxml/appprops.py diff --git a/docx/__init__.py b/docx/__init__.py index cfa48729d..b8f9a77c4 100644 --- a/docx/__init__.py +++ b/docx/__init__.py @@ -10,6 +10,7 @@ from docx.opc.constants import CONTENT_TYPE as CT, RELATIONSHIP_TYPE as RT from docx.opc.part import PartFactory from docx.opc.parts.coreprops import CorePropertiesPart +from docx.opc.parts.appprops import AppPropertiesPart from docx.parts.document import DocumentPart from docx.parts.image import ImagePart @@ -26,12 +27,13 @@ def part_class_selector(content_type, reltype): PartFactory.part_class_selector = part_class_selector PartFactory.part_type_for[CT.OPC_CORE_PROPERTIES] = CorePropertiesPart +PartFactory.part_type_for[CT.OFC_EXTENDED_PROPERTIES] = AppPropertiesPart PartFactory.part_type_for[CT.WML_DOCUMENT_MAIN] = DocumentPart PartFactory.part_type_for[CT.WML_NUMBERING] = NumberingPart PartFactory.part_type_for[CT.WML_SETTINGS] = SettingsPart PartFactory.part_type_for[CT.WML_STYLES] = StylesPart del ( - CT, CorePropertiesPart, DocumentPart, NumberingPart, PartFactory, + CT, CorePropertiesPart, AppPropertiesPart, DocumentPart, NumberingPart, PartFactory, StylesPart, part_class_selector ) diff --git a/docx/opc/appprops.py b/docx/opc/appprops.py new file mode 100644 index 000000000..99ee69bcf --- /dev/null +++ b/docx/opc/appprops.py @@ -0,0 +1,63 @@ +# encoding: utf-8 + +""" +The 
class AppProperties(object):
    """
    Read-only proxy for the app (extended) properties of a document
    package. It wraps the root element of the ``/docProps/app.xml`` part and
    forwards each property to the matching ``*_text`` or ``*_number``
    attribute of that element.
    """
    def __init__(self, element):
        # element whose ``*_text`` / ``*_number`` attributes are read below
        self._element = element

    # -- text-valued properties -------------------------------------------
    template = property(lambda self: self._element.template_text)
    application = property(lambda self: self._element.application_text)
    company = property(lambda self: self._element.company_text)

    # -- number-valued properties -----------------------------------------
    totaltime = property(lambda self: self._element.totaltime_number)
    pages = property(lambda self: self._element.pages_number)
    words = property(lambda self: self._element.words_number)
    characters = property(lambda self: self._element.characters_number)
    docsecurity = property(lambda self: self._element.docsecurity_number)
    lines = property(lambda self: self._element.lines_number)
    paragraphs = property(lambda self: self._element.paragraphs_number)
    appversion = property(lambda self: self._element.appversion_number)
@property
def _app_properties_part(self):
    """
    The |AppPropertiesPart| this package is related to by
    ``RT.EXTENDED_PROPERTIES``. When no such relationship exists, a default
    app properties part is created, related to the package, and returned.
    """
    try:
        part = self.part_related_by(RT.EXTENDED_PROPERTIES)
    except KeyError:
        # no app.xml relationship yet; create one on demand
        part = AppPropertiesPart.default(self)
        self.relate_to(part, RT.EXTENDED_PROPERTIES)
    return part
class CT_AppProperties(BaseOxmlElement):
    """
    ``<Properties>`` element, the root element of the App (extended)
    Properties part stored as ``/docProps/app.xml``. Exposes one read-only
    ``*_text`` or ``*_number`` property per supported child element. Text
    properties resolve to an empty string ('') and number properties to zero
    when the child element is absent or empty.
    """
    # NOTE(review): the parser only produces instances of this class if the
    # extended-properties ``Properties`` tag is registered for it; the
    # registration under 'cp:appProperties' looks wrong -- confirm.

    # Namespace of every element in ``app.xml``.
    _ep_ns = (
        'http://schemas.openxmlformats.org/officeDocument/2006/'
        'extended-properties'
    )

    # Property name -> local tag name of the child element holding its value.
    # Children are looked up with these names rather than through same-named
    # class attributes: a property sharing the name of its child accessor
    # replaces that accessor, so the lookup would call itself forever.
    _child_tagnames = {
        'template': 'Template',
        'totaltime': 'TotalTime',
        'pages': 'Pages',
        'words': 'Words',
        'characters': 'Characters',
        'application': 'Application',
        'docsecurity': 'DocSecurity',
        'lines': 'Lines',
        'paragraphs': 'Paragraphs',
        'company': 'Company',
        'appversion': 'AppVersion',
    }

    # An empty root element; a bare newline is not a parseable document.
    _appProperties_tmpl = '<Properties xmlns="%s"/>\n' % _ep_ns

    _offset_pattern = re.compile(r'([+-])(\d\d):(\d\d)')

    @classmethod
    def new(cls):
        """
        Return a new, empty ``<Properties>`` element.
        """
        return parse_xml(cls._appProperties_tmpl)

    @property
    def template_text(self):
        """Text of the ``Template`` child, '' if absent."""
        return self._text_of_element('template')

    @property
    def totaltime_number(self):
        """Integer value of the ``TotalTime`` child, 0 if absent."""
        return self._number_of_element('totaltime')

    @property
    def pages_number(self):
        """Integer value of the ``Pages`` child, 0 if absent."""
        return self._number_of_element('pages')

    @property
    def words_number(self):
        """Integer value of the ``Words`` child, 0 if absent."""
        return self._number_of_element('words')

    @property
    def characters_number(self):
        """Integer value of the ``Characters`` child, 0 if absent."""
        return self._number_of_element('characters')

    @property
    def application_text(self):
        """Text of the ``Application`` child, '' if absent."""
        return self._text_of_element('application')

    @property
    def docsecurity_number(self):
        """Integer value of the ``DocSecurity`` child, 0 if absent."""
        return self._number_of_element('docsecurity')

    @property
    def lines_number(self):
        """Integer value of the ``Lines`` child, 0 if absent."""
        return self._number_of_element('lines')

    @property
    def paragraphs_number(self):
        """Integer value of the ``Paragraphs`` child, 0 if absent."""
        return self._number_of_element('paragraphs')

    @property
    def company_text(self):
        """Text of the ``Company`` child, '' if absent."""
        return self._text_of_element('company')

    @property
    def appversion_number(self):
        """Integer value of the ``AppVersion`` child, 0 if absent."""
        # NOTE(review): Word appears to write this as 'XX.YYYY' (e.g.
        # '16.0000'), which is not an integer string and so resolves to 0
        # here -- confirm whether the text should be exposed instead.
        return self._number_of_element('appversion')

    def _child_qname(self, property_name):
        """
        Return the Clark-notation (``{namespace}Tag``) name of the child
        element for *property_name*. Raises |KeyError| for an unknown name.
        """
        return '{%s}%s' % (self._ep_ns, self._child_tagnames[property_name])

    def _child(self, property_name):
        """
        Return the child element for *property_name*, or |None| if it is not
        present.
        """
        return self.find(self._child_qname(property_name))

    def _datetime_of_element(self, property_name):
        """
        Return the |datetime| parsed from the text of the child element for
        *property_name*, or |None| if the element is absent or its text is
        not a valid W3CDTF string.
        """
        element = self._child(property_name)
        if element is None:
            return None
        try:
            return self._parse_W3CDTF_to_datetime(element.text)
        except (TypeError, ValueError):
            # empty and invalid datetime strings are ignored
            return None

    def _get_or_add(self, prop_name):
        """
        Return the child element for *prop_name*, appending a new empty one
        if it is not present.
        """
        element = self._child(prop_name)
        if element is None:
            element = self.makeelement(self._child_qname(prop_name))
            self.append(element)
        return element

    @classmethod
    def _offset_dt(cls, dt, offset_str):
        """
        Return a |datetime| instance that is offset from datetime *dt* by
        the timezone offset specified in *offset_str*, a string like
        ``'-07:00'``. Raises |ValueError| if *offset_str* is malformed.
        """
        match = cls._offset_pattern.match(offset_str)
        if match is None:
            raise ValueError(
                "'%s' is not a valid offset string" % offset_str
            )
        sign, hours_str, minutes_str = match.groups()
        # a '+' offset is ahead of UTC, so it is subtracted to reach UTC
        sign_factor = -1 if sign == '+' else 1
        hours = int(hours_str) * sign_factor
        minutes = int(minutes_str) * sign_factor
        return dt + timedelta(hours=hours, minutes=minutes)

    @classmethod
    def _parse_W3CDTF_to_datetime(cls, w3cdtf_str):
        """
        Return the |datetime| represented by *w3cdtf_str*. Raises
        |ValueError| if the string matches none of the supported forms.
        """
        # valid W3CDTF date cases:
        # yyyy e.g. '2003'
        # yyyy-mm e.g. '2003-12'
        # yyyy-mm-dd e.g. '2003-12-31'
        # UTC timezone e.g. '2003-12-31T10:14:55Z'
        # numeric timezone e.g. '2003-12-31T10:14:55-08:00'
        templates = (
            '%Y-%m-%dT%H:%M:%S',
            '%Y-%m-%d',
            '%Y-%m',
            '%Y',
        )
        # strptime isn't smart enough to parse literal timezone offsets like
        # '-07:30', so we have to do it ourselves
        parseable_part = w3cdtf_str[:19]
        offset_str = w3cdtf_str[19:]
        dt = None
        for tmpl in templates:
            try:
                dt = datetime.strptime(parseable_part, tmpl)
            except ValueError:
                continue
            break
        if dt is None:
            tmpl = "could not parse W3CDTF datetime string '%s'"
            raise ValueError(tmpl % w3cdtf_str)
        if len(offset_str) == 6:
            return cls._offset_dt(dt, offset_str)
        return dt

    def _text_of_element(self, property_name):
        """
        Return the text in the element matching *property_name*, or an empty
        string if the element is not present or contains no text.
        """
        element = self._child(property_name)
        if element is None or element.text is None:
            return ''
        return element.text

    def _number_of_element(self, property_name):
        """
        Return the number in the element matching *property_name*, or zero
        if the element is not present or contains no value.
        """
        element = self._child(property_name)
        if element is None:
            return 0
        try:
            number = int(element.text)
        except (TypeError, ValueError):
            # empty elements (text is None) and non-integer strings both
            # resolve to 0
            return 0
        # as do negative integers
        return max(number, 0)
@property
def app_properties(self):
    """
    The |AppProperties| object providing read access to the app properties
    of this document, as held by the package this part belongs to.
    """
    package = self.package
    return package.app_properties