Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,13 @@ More information is available in the `python-docx documentation`_.

.. _`python-docx documentation`:
https://python-docx.readthedocs.org/en/latest/

This fork merges `renejsum's fork <https://github.com/renejsum/python-docx>`_ with a recent master of `the upstream repository <https://github.com/python-openxml/python-docx>`_ to add read/write access to the custom metadata properties of a document. For example::

    >>> import docx
    >>> d = docx.Document('test1.docx')
    >>> p = d.custom_properties
    >>> print(p['prov_wasDerivedFrom'])
    fid://slap.G24X2UWc
    >>> p['prov_wasAssociatedWith'] = 'some other value'
    >>> d.save('test1.docx')
2 changes: 2 additions & 0 deletions docx/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from docx.opc.constants import CONTENT_TYPE as CT, RELATIONSHIP_TYPE as RT
from docx.opc.part import PartFactory
from docx.opc.parts.coreprops import CorePropertiesPart
from docx.opc.parts.customprops import CustomPropertiesPart

from docx.parts.document import DocumentPart
from docx.parts.image import ImagePart
Expand All @@ -26,6 +27,7 @@ def part_class_selector(content_type, reltype):

PartFactory.part_class_selector = part_class_selector
PartFactory.part_type_for[CT.OPC_CORE_PROPERTIES] = CorePropertiesPart
PartFactory.part_type_for[CT.OPC_CUSTOM_PROPERTIES] = CustomPropertiesPart
PartFactory.part_type_for[CT.WML_DOCUMENT_MAIN] = DocumentPart
PartFactory.part_type_for[CT.WML_NUMBERING] = NumberingPart
PartFactory.part_type_for[CT.WML_SETTINGS] = SettingsPart
Expand Down
8 changes: 8 additions & 0 deletions docx/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,14 @@ def core_properties(self):
"""
return self._part.core_properties

@property
def custom_properties(self):
    """
    |CustomProperties| object giving read/write access to this document's
    custom properties, as exposed by the underlying document part.
    """
    document_part = self._part
    return document_part.custom_properties

@property
def inline_shapes(self):
"""
Expand Down
3 changes: 3 additions & 0 deletions docx/opc/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,9 @@ class CONTENT_TYPE(object):
OPC_CORE_PROPERTIES = (
'application/vnd.openxmlformats-package.core-properties+xml'
)
OPC_CUSTOM_PROPERTIES = (
'application/vnd.openxmlformats-officedocument.custom-properties+xml'
)
OPC_DIGITAL_SIGNATURE_CERTIFICATE = (
'application/vnd.openxmlformats-package.digital-signature-certificat'
'e'
Expand Down
48 changes: 48 additions & 0 deletions docx/opc/customprops.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# encoding: utf-8

"""
Provides the |CustomProperties| class, which gives item-style
(``props[name]``) read/write access to the custom document properties
stored in the ``/docProps/custom.xml`` part of a package.
"""

from __future__ import (
absolute_import, division, print_function, unicode_literals
)

from lxml import etree

NS_VT = "http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes"


class CustomProperties(object):
    """
    Corresponds to part named ``/docProps/custom.xml``, containing the custom
    document properties for this document package.

    Wraps the root element of that part. Each child of the root is looked up
    by its ``name`` attribute and its value is the text of its first child
    element. Values are written as ``vt:lpwstr`` (string) elements.
    """

    # value written to the ``fmtid`` attribute of every property created here
    _FMTID = "{D5CDD505-2E9C-101B-9397-08002B2CF9AE}"

    # smallest ``pid`` handed out; this is the id the first property added to
    # an empty part has always received
    _FIRST_PID = 2

    def __init__(self, element):
        self._element = element

    def __contains__(self, item):
        """
        Return |True| if a property named *item* is present.

        Defined explicitly because `__getitem__` never raises, which would
        make the fallback sequence-protocol membership test loop forever.
        """
        return self.lookup(item) is not None

    def __iter__(self):
        """
        Generate the name of each property, in document order. Children
        without a ``name`` attribute are skipped.
        """
        for child in self._element:
            name = child.get("name")
            if name is not None:
                yield name

    def __getitem__(self, item):
        """
        Return the text of the property named *item*, or |None| when there is
        no such property or it has no value element.
        """
        prop = self.lookup(item)
        if prop is None or len(prop) == 0:
            return None
        return prop[0].text

    def __setitem__(self, key, value):
        """
        Set the property named *key* to *value*, adding the property when it
        is not yet present. *value* is assigned as element text unchanged.
        """
        prop = self.lookup(key)
        if prop is None:
            # choose the id before the new (still pid-less) element is added
            pid = self._next_pid()
            prop = etree.SubElement(self._element, "property")
            prop.set("name", key)
            prop.set("fmtid", self._FMTID)
            prop.set("pid", "%d" % pid)
        if len(prop) == 0:
            # no value element yet: brand-new property, or an existing one
            # that was stored without a value child
            etree.SubElement(
                prop, '{%s}lpwstr' % NS_VT, nsmap={'vt': NS_VT}
            )
        prop[0].text = value

    def lookup(self, item):
        """
        Return the first child element whose ``name`` attribute equals
        *item*, or |None| if there is no such child.
        """
        for child in self._element:
            if child.get("name") == item:
                return child
        return None

    def _next_pid(self):
        """
        Return an unused property id: one more than the largest ``pid``
        currently present, and never less than `_FIRST_PID`.

        Counting children instead would reuse an existing id whenever the
        ids already present are not contiguous.
        """
        highest = self._FIRST_PID - 1
        for child in self._element:
            try:
                pid = int(child.get("pid"))
            except (TypeError, ValueError):
                # missing or non-numeric pid; nothing to avoid
                continue
            if pid > highest:
                highest = pid
        return highest + 1

21 changes: 21 additions & 0 deletions docx/opc/package.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from .packuri import PACKAGE_URI
from .part import PartFactory
from .parts.coreprops import CorePropertiesPart
from .parts.customprops import CustomPropertiesPart
from .pkgreader import PackageReader
from .pkgwriter import PackageWriter
from .rel import Relationships
Expand Down Expand Up @@ -43,6 +44,14 @@ def core_properties(self):
"""
return self._core_properties_part.core_properties

@property
def custom_properties(self):
    """
    |CustomProperties| object providing read/write access to the custom
    properties of this package, obtained from its custom properties part.
    """
    props_part = self._custom_properties_part
    return props_part.custom_properties

def iter_rels(self):
"""
Generate exactly one reference to each relationship in the package by
Expand Down Expand Up @@ -172,6 +181,18 @@ def _core_properties_part(self):
self.relate_to(core_properties_part, RT.CORE_PROPERTIES)
return core_properties_part

@property
def _custom_properties_part(self):
    """
    |CustomPropertiesPart| object related to this package by the custom
    properties relationship. When looking up that relationship raises
    |KeyError|, a default part is created, related to this package, and
    returned instead.
    """
    try:
        part = self.part_related_by(RT.CUSTOM_PROPERTIES)
    except KeyError:
        part = CustomPropertiesPart.default(self)
        self.relate_to(part, RT.CUSTOM_PROPERTIES)
    return part

class Unmarshaller(object):
"""
Expand Down
57 changes: 57 additions & 0 deletions docx/opc/parts/customprops.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# encoding: utf-8

"""
Custom properties part, corresponds to ``/docProps/custom.xml`` part in package.
"""

from __future__ import (
absolute_import, division, print_function, unicode_literals
)

from lxml import etree

from datetime import datetime

from ..constants import CONTENT_TYPE as CT
from ..customprops import CustomProperties
from ...oxml.customprops import CT_CustomProperties, ct_parse_xml
from ..packuri import PackURI
from ..part import XmlPart


class CustomPropertiesPart(XmlPart):
    """
    Corresponds to part named ``/docProps/custom.xml``, containing the custom
    document properties for this document package.
    """

    @classmethod
    def default(cls, package):
        """
        Return a new instance of this class for *package*, having the part
        name ``/docProps/custom.xml`` and the root element produced by
        ``CT_CustomProperties.new()``.
        """
        return cls._new(package)

    @property
    def custom_properties(self):
        """
        A |CustomProperties| object providing read/write access to the custom
        properties contained in this custom properties part. A new wrapper
        around this part's element is created on each access.
        """
        return CustomProperties(self.element)

    @classmethod
    def load(cls, partname, content_type, blob, package):
        """
        Return an instance of this class whose element is the result of
        parsing *blob* with `ct_parse_xml`.
        """
        element = ct_parse_xml(blob)
        return cls(partname, content_type, element, package)

    @classmethod
    def _new(cls, package):
        """
        Return a new instance of this class for *package* with the fixed
        part name and the custom-properties content type.
        """
        partname = PackURI('/docProps/custom.xml')
        content_type = CT.OPC_CUSTOM_PROPERTIES
        element = CT_CustomProperties.new()
        # use *cls*, as `load` does, so a subclass gets its own type back
        return cls(partname, content_type, element, package)
19 changes: 19 additions & 0 deletions docx/oxml/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,22 @@ def parse_xml(xml):
return root_element


# XML parser used for the custom properties part. It is configured with
# lxml's ElementDefaultClassLookup rather than a lookup that maps tags to
# the custom element classes registered in this module.
parser_lookup = etree.ElementDefaultClassLookup()
ct_parser = etree.XMLParser(remove_blank_text=True)
ct_parser.set_element_class_lookup(parser_lookup)


def ct_parse_xml(xml):
    """
    Return the root lxml element obtained by parsing the XML in *xml* with
    the module-level ``ct_parser``.
    """
    return etree.fromstring(xml, ct_parser)


def register_element_cls(tag, cls):
"""
Register *cls* to be constructed when the oxml parser encounters an
Expand Down Expand Up @@ -70,6 +86,9 @@ def OxmlElement(nsptag_str, attrs=None, nsdecls=None):
from .coreprops import CT_CoreProperties
register_element_cls('cp:coreProperties', CT_CoreProperties)

from .customprops import CT_CustomProperties
#register_element_cls('Properties', CT_CustomProperties)

from .document import CT_Body, CT_Document
register_element_cls('w:body', CT_Body)
register_element_cls('w:document', CT_Document)
Expand Down
155 changes: 155 additions & 0 deletions docx/oxml/customprops.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
# encoding: utf-8

"""
lxml custom element classes for custom properties-related XML elements.
"""

from __future__ import (
absolute_import, division, print_function, unicode_literals
)

import re

from datetime import datetime, timedelta
from lxml import etree
from .ns import nsdecls, qn
from .xmlchemy import BaseOxmlElement, ZeroOrOne
from . import ct_parse_xml

class CT_CustomProperties(BaseOxmlElement):
    """
    Element class for the ``<Properties>`` root element of the Custom
    Properties part stored as ``/docProps/custom.xml``. String values set
    with `_set_element_text` are limited in length to 255 characters.

    NOTE(review): this class declares no child elements, so the helpers that
    read or write a child by property name appear to be carried over from the
    core-properties element class -- confirm they are still needed.
    """

    _customProperties_tmpl = (
        '<Properties xmlns="http://schemas.openxmlformats.org/officeDocument'
        '/2006/custom-properties" %s/>\n' % nsdecls('vt')
    )

    # numeric timezone offset such as '-07:00'; a raw string keeps '\d' from
    # being read as an (invalid) string escape sequence
    _offset_pattern = re.compile(r'([+-])(\d\d):(\d\d)')

    @classmethod
    def new(cls):
        """
        Return a new, empty ``<Properties>`` root element parsed from the
        class template.
        """
        return ct_parse_xml(cls._customProperties_tmpl)

    def _datetime_of_element(self, property_name):
        """
        Return the |datetime| parsed from the text of the attribute named
        *property_name*, or |None| when that attribute is |None| or its text
        is not a parseable W3CDTF string.
        """
        element = getattr(self, property_name)
        if element is None:
            return None
        datetime_str = element.text
        try:
            return self._parse_W3CDTF_to_datetime(datetime_str)
        except ValueError:
            # invalid datetime strings are ignored
            return None

    def _get_or_add(self, prop_name):
        """
        Return element returned by 'get_or_add_' method for *prop_name*.
        """
        get_or_add_method_name = 'get_or_add_%s' % prop_name
        get_or_add_method = getattr(self, get_or_add_method_name)
        return get_or_add_method()

    @classmethod
    def _offset_dt(cls, dt, offset_str):
        """
        Return a |datetime| instance that is offset from datetime *dt* by
        the timezone offset specified in *offset_str*, a string like
        ``'-07:00'``. Raises |ValueError| if *offset_str* does not start
        with such an offset.
        """
        match = cls._offset_pattern.match(offset_str)
        if match is None:
            raise ValueError(
                "'%s' is not a valid offset string" % offset_str
            )
        sign, hours_str, minutes_str = match.groups()
        # a '+' offset is subtracted and a '-' offset is added
        sign_factor = -1 if sign == '+' else 1
        hours = int(hours_str) * sign_factor
        minutes = int(minutes_str) * sign_factor
        return dt + timedelta(hours=hours, minutes=minutes)

    @classmethod
    def _parse_W3CDTF_to_datetime(cls, w3cdtf_str):
        """
        Return the |datetime| represented by *w3cdtf_str*, adjusted by its
        numeric timezone offset when one follows the time. Raises
        |ValueError| if the string matches none of the supported forms.
        """
        # valid W3CDTF date cases:
        # yyyy e.g. '2003'
        # yyyy-mm e.g. '2003-12'
        # yyyy-mm-dd e.g. '2003-12-31'
        # UTC timezone e.g. '2003-12-31T10:14:55Z'
        # numeric timezone e.g. '2003-12-31T10:14:55-08:00'
        templates = (
            '%Y-%m-%dT%H:%M:%S',
            '%Y-%m-%d',
            '%Y-%m',
            '%Y',
        )
        # strptime isn't smart enough to parse literal timezone offsets like
        # '-07:30', so we have to do it ourselves
        parseable_part = w3cdtf_str[:19]
        offset_str = w3cdtf_str[19:]
        dt = None
        for tmpl in templates:
            try:
                dt = datetime.strptime(parseable_part, tmpl)
            except ValueError:
                continue
            # a string can satisfy only one template, so stop at the first
            break
        if dt is None:
            msg = "could not parse W3CDTF datetime string '%s'"
            raise ValueError(msg % w3cdtf_str)
        if len(offset_str) == 6:
            return cls._offset_dt(dt, offset_str)
        return dt

    def _set_element_datetime(self, prop_name, value):
        """
        Set date/time value of child element having *prop_name* to *value*.
        Raises |ValueError| if *value* is not a |datetime| instance.
        """
        if not isinstance(value, datetime):
            tmpl = (
                "property requires <type 'datetime.datetime'> object, got %s"
            )
            raise ValueError(tmpl % type(value))
        element = self._get_or_add(prop_name)
        dt_str = value.strftime('%Y-%m-%dT%H:%M:%SZ')
        element.text = dt_str
        if prop_name in ('created', 'modified'):
            # These two require an explicit 'xsi:type="dcterms:W3CDTF"'
            # attribute. The first and last line are a hack required to add
            # the xsi namespace to the root element rather than each child
            # element in which it is referenced
            self.set(qn('xsi:foo'), 'bar')
            element.set(qn('xsi:type'), 'dcterms:W3CDTF')
            del self.attrib[qn('xsi:foo')]

    def _set_element_text(self, prop_name, value):
        """
        Set string value of *prop_name* property to ``str(value)``. Raises
        |ValueError| if the resulting string is longer than 255 characters.
        """
        value = str(value)
        if len(value) > 255:
            tmpl = (
                "exceeded 255 char limit for property, got:\n\n'%s'"
            )
            raise ValueError(tmpl % value)
        element = self._get_or_add(prop_name)
        element.text = value

    def _text_of_element(self, property_name):
        """
        Return the text in the element matching *property_name*, or an empty
        string if the element is not present or contains no text.
        """
        element = getattr(self, property_name)
        if element is None:
            return ''
        if element.text is None:
            return ''
        return element.text
Loading