# xpath.py

#
#

# Copyright 2014-2019 Álvaro Justen <https://github.com/turicas/rows/>

#

# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.

#

# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
# License for more details.

#

# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import unicode_literals

import six
from lxml.html import fromstring as tree_from_string

from rows.plugins.utils import create_table
from rows.utils import Source

# Bind ``unescape`` to an HTML-entity decoding function, trying the Python 2
# module name first and falling back to the Python 3 one.
try:
    from HTMLParser import HTMLParser  # Python 2
    unescape = HTMLParser().unescape
except ImportError:
    # The ``HTMLParser`` module could not be imported: use ``html`` instead.
    import html  # Python 3
    unescape = html.unescape
#
def _get_row_data(fields_xpath):
    """Build a function that extracts one row of field values.

    ``fields_xpath`` maps field names to XPath expressions. The returned
    callable takes an object exposing ``.xpath()`` (a row element) and returns
    a list with one entry per expression, in the mapping's iteration order.

    For each expression the result is turned into a single string: every
    item is HTML-unescaped, converted to text and stripped; empty pieces are
    dropped and the rest are joined with a single space. A falsy result
    (empty list, empty string, ``0``, ``False``) yields ``None``.
    """

    # Snapshot the expressions once so every row is read in the same order.
    xpaths = list(fields_xpath.values())

    def _as_text(value):
        # Non-string items (numbers, booleans, elements) are coerced to text
        # first so that ``unescape`` always receives a string.
        if not isinstance(value, six.string_types):
            value = six.text_type(value)
        return six.text_type(unescape(value)).strip()

    def get_data(row):
        data = []
        for field_xpath in xpaths:
            result = row.xpath(field_xpath)
            if not result:
                data.append(None)
                continue

            # XPath functions such as ``string()`` or ``count()`` return a
            # scalar instead of a list; treat it as a single item rather than
            # iterating a string character by character (or failing on a
            # number).
            if not isinstance(result, (list, tuple)):
                result = [result]

            pieces = (_as_text(item) for item in result)
            data.append(" ".join(piece for piece in pieces if piece))

        return data

    return get_data
#
def import_from_xpath(
    filename_or_fobj, rows_xpath, fields_xpath, encoding="utf-8", *args, **kwargs
):
    """Create a table from the elements an XPath expression selects.

    Parameters:
        filename_or_fobj: passed to ``Source.from_file`` (opened with mode
            ``"rb"``); its content is read in full and decoded.
        rows_xpath: XPath expression evaluated on the parsed document; each
            result becomes one row.
        fields_xpath: mapping of field name to the XPath expression evaluated
            on each row result to obtain that field's value.
        encoding: encoding used to decode the bytes read from the source.
        *args, **kwargs: forwarded unchanged to ``create_table``.

    Returns:
        Whatever ``create_table`` returns for the header (the mapping's keys)
        followed by the extracted rows, with ``imported_from`` and ``source``
        set in the metadata.

    Raises:
        TypeError: if ``rows_xpath`` or any value of ``fields_xpath`` is not
            a text string (``six.text_type``).
    """

    # Validate before touching the source. ``isinstance`` (rather than an
    # exact type comparison) also accepts subclasses of the text type.
    xpaths = [rows_xpath] + list(fields_xpath.values())
    if not all(isinstance(xpath, six.text_type) for xpath in xpaths):
        raise TypeError("XPath must be {}".format(six.text_type.__name__))

    source = Source.from_file(
        filename_or_fobj, plugin_name="xpath", mode="rb", encoding=encoding
    )
    xml = source.fobj.read().decode(encoding)
    tree = tree_from_string(xml)
    row_elements = tree.xpath(rows_xpath)

    header = list(fields_xpath.keys())
    row_data = _get_row_data(fields_xpath)
    result_rows = [row_data(element) for element in row_elements]

    meta = {"imported_from": "xpath", "source": source}
    return create_table([header] + result_rows, meta=meta, *args, **kwargs)