Search This Blog

31 January 2012

default value for text function using lxml

Say we need to parse this XML

<pack xmlns="http://ns.qubic.tv/2010/item">
        <packitem>
            <duration>520</duration>
            <max_count>14</max_count>
        </packitem>
        <packitem>
            <duration></duration>
            <max_count>23</max_count>
        </packitem>
</pack>


If you want to parse it and retrieve the values as tuples:

# Parse the document and bind the prefix 'i' to its default namespace so the
# XPath expressions below can address the namespaced elements.
root = etree.fromstring(xml)
namespaces = {'i': "http://ns.qubic.tv/2010/item"}
packitems_duration = root.xpath(
    '//i:pack/i:packitem/i:duration/text()',
    namespaces=namespaces)
# Both queries must use the 'i' prefix: it is the only one bound in
# `namespaces`, and an unbound prefix makes the XPath evaluation fail.
packitems_max_count = root.xpath(
    '//i:pack/i:packitem/i:max_count/text()',
    namespaces=namespaces)
# list() keeps the result a concrete list on Python 3, where zip() is lazy.
packitems = list(zip(packitems_duration, packitems_max_count))

>>> packitems
[('520','14')]

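The problem is that the zip result is missing a value. That's because the XPath text() step returns nothing at all for an empty element, rather than None or an empty string, so the two lists have different lengths and zip stops at the shorter one. Let's change that.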

def lxml_empty_str(context, nodes):
    """Give every node in *nodes* a string ``text`` value and return *nodes*.

    Intended to be registered as an lxml XPath extension function.

    Args:
        context: First argument supplied by lxml to extension functions;
            not used here.
        nodes: Iterable of objects exposing a writable ``text`` attribute.

    Returns:
        The same ``nodes`` object, after each item's falsy ``text``
        (``None`` or ``""``) has been replaced by ``""``.

    Note:
        The nodes are modified in place.
    """
    for element in nodes:
        current = element.text
        # Always assign, mirroring a plain ``text or ""`` normalisation.
        element.text = current if current else ""
    return nodes

# Register the helper under its own function namespace URI so it can be
# called from XPath expressions through a prefix bound to that URI.
ns = etree.FunctionNamespace('http://ns.qubic.tv/lxmlfunctions')
ns['lxml_empty_str'] = lxml_empty_str

# 'i' addresses the document's elements, 'f' the extension function.
namespaces = {'i': "http://ns.qubic.tv/2010/item",
              'f': "http://ns.qubic.tv/lxmlfunctions"}
# The node-set is passed to the function unquoted, inside a single string
# literal; text() is then applied to the nodes the function returns.
packitems_duration = root.xpath(
    'f:lxml_empty_str(//i:pack/i:packitem/i:duration)/text()',
    namespaces=namespaces)
packitems_max_count = root.xpath(
    'f:lxml_empty_str(//i:pack/i:packitem/i:max_count)/text()',
    namespaces=namespaces)
# list() keeps the result a concrete list on Python 3, where zip() is lazy.
packitems = list(zip(packitems_duration, packitems_max_count))

>>> packitems
[('520','14'), ('','23')]

More info on extending lxml: http://lxml.de/extensions.html#xpath-extension-functions

No comments:

Post a Comment