# Copyright 2014-2019 Álvaro Justen <https://github.com/turicas/rows/>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import unicode_literals
from io import BytesIO
import six
try:
from lxml.etree import strip_tags
from lxml.etree import tostring as to_string
from lxml.html import document_fromstring
except ImportError:
has_lxml = False
else:
has_lxml = True
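# `has_lxml` lets callers check whether the optional lxml dependency is
# available; the functions below assume it is importable.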
from rows.plugins.utils import create_table, serialize
from rows.utils import Source
try:
from HTMLParser import HTMLParser # Python 2
unescape = HTMLParser().unescape
except ImportError:
import html # Python 3
unescape = html.unescape
try:
from html import escape # Python 3
except ImportError:
from cgi import escape # Python 2
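# _get_content returns an element's inner HTML: its leading text plus the
# serialized markup of every child element (used when preserve_html=True).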
def _get_content(element):
return (element.text if element.text is not None else "") + "".join(
to_string(child, encoding=six.text_type) for child in element.getchildren()
)
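# _get_row extracts the cells of a <tr>-like element, either as plain text or
# as raw inner HTML; when `properties` is True, the element's attributes are
# appended as a dict.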
def _get_row(row, column_tag, preserve_html, properties):
if not preserve_html:
data = list(map(_extract_node_text, row.xpath(column_tag)))
else:
data = list(map(_get_content, row.xpath(column_tag)))
if properties:
data.append(dict(row.attrib))
return data
def import_from_html(
    filename_or_fobj,
    encoding="utf-8",
    index=0,
    ignore_colspan=True,
    preserve_html=False,
    properties=False,
    table_tag="table",
    row_tag="tr",
    column_tag="td|th",
    *args,
    **kwargs
):
    """Return a rows.Table from an HTML file or file-like object."""
source = Source.from_file(
filename_or_fobj, plugin_name="html", mode="rb", encoding=encoding
)
html = source.fobj.read()
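    # lxml refuses to parse unicode strings that carry an XML encoding
    # declaration, so only plain HTML is decoded here; XHTML/XML documents with
    # an explicit encoding are handed to lxml as bytes.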
if b"<?xml" not in html[:1024] or b"encoding" not in html[:html.find(b"?>") + 2]:
html = html.decode(source.encoding) # Regular HTML, not XHTML/XML
html_tree = document_fromstring(html)
tables = html_tree.xpath("//{}".format(table_tag))
table = tables[index]
    # TODO: set meta's "name" from @id or @name (if available)
strip_tags(table, "thead")
strip_tags(table, "tbody")
row_elements = table.xpath(row_tag)
table_rows = [
_get_row(
row,
column_tag=column_tag,
preserve_html=preserve_html,
properties=properties,
)
for row in row_elements
]
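    # When properties=True, the last cell of every row is the dict of <tr>
    # attributes, so name the corresponding header column "properties".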
if properties:
table_rows[0][-1] = "properties"
if preserve_html and kwargs.get("fields", None) is None:
        # The field names will be the first table row, so we need to strip the
        # HTML from it even if `preserve_html` is True (it's True only for the
        # data rows, not for the header).
table_rows[0] = list(map(_extract_node_text, row_elements[0]))
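    # Rows with fewer cells than the widest row typically come from colspan
    # usage; drop them so every row has the same number of columns.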
if ignore_colspan:
max_columns = max(map(len, table_rows))
table_rows = [row for row in table_rows if len(row) == max_columns]
meta = {"imported_from": "html", "source": source}
return create_table(table_rows, meta=meta, *args, **kwargs)
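# Usage sketch (illustrative only; "page.html" is a hypothetical file name):
# import the second table of a page, keeping cell markup and <tr> attributes.
#
#     table = import_from_html(
#         "page.html", index=1, preserve_html=True, properties=True
#     )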
def export_to_html(
    table, filename_or_fobj=None, encoding="utf-8", caption=False, *args, **kwargs
):
    """Export rows.Table data to an HTML file (or return the HTML bytes when no file is given)."""
return_data, should_close = False, None
if filename_or_fobj is None:
filename_or_fobj = BytesIO()
return_data = should_close = True
source = Source.from_file(
filename_or_fobj,
plugin_name="html",
mode="wb",
encoding=encoding,
should_close=should_close,
)
serialized_table = serialize(table, *args, **kwargs)
fields = next(serialized_table)
result = ["<table>\n\n"]
if caption and table.name:
result.extend([" <caption>", table.name, "</caption>\n\n"])
result.extend([" <thead>\n", " <tr>\n"])
    # TODO: set @name/@id if table.meta["name"] is set
header = [" <th> {} </th>\n".format(field) for field in fields]
result.extend(header)
result.extend([" </tr>\n", " </thead>\n", "\n", " <tbody>\n", "\n"])
for index, row in enumerate(serialized_table, start=1):
css_class = "odd" if index % 2 == 1 else "even"
result.append(' <tr class="{}">\n'.format(css_class))
for value in row:
result.extend([" <td> ", escape(value), " </td>\n"])
result.append(" </tr>\n\n")
result.append(" </tbody>\n\n</table>\n")
html = "".join(result).encode(encoding)
if return_data:
result = html
else:
result = source.fobj
source.fobj.write(html)
source.fobj.flush()
if source.should_close:
source.fobj.close()
return result
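# Usage sketch (illustrative only; `table` and "output.html" are hypothetical):
# write a table to a file, or get the rendered bytes back when no file is given
# (the <caption> is only emitted when `table.name` is set).
#
#     export_to_html(table, "output.html", caption=True)
#     html_bytes = export_to_html(table)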
def _extract_node_text(node):
    """Extract the text from a given lxml node."""
texts = map(
six.text_type.strip, map(six.text_type, map(unescape, node.xpath(".//text()")))
)
return " ".join(text for text in texts if text)
def count_tables(filename_or_fobj, encoding="utf-8", table_tag="table"):
    """Read an HTML file and return the number of `table_tag` elements it contains."""
source = Source.from_file(
filename_or_fobj, plugin_name="html", mode="rb", encoding=encoding
)
html = source.fobj.read().decode(source.encoding)
html_tree = document_fromstring(html)
tables = html_tree.xpath("//{}".format(table_tag))
result = len(tables)
if source.should_close:
source.fobj.close()
return result
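# Usage sketch (illustrative only; "page.html" is a hypothetical file name):
#
#     n_tables = count_tables("page.html")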
def tag_to_dict(html):
    """Extract a tag's attributes (plus its text content) into a dict."""
element = document_fromstring(html).xpath("//html/body/child::*")[0]
attributes = dict(element.attrib)
attributes["text"] = element.text_content()
return attributes
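# Illustrative example:
#
#     tag_to_dict('<a href="/x" class="link">text</a>')
#     # -> {"href": "/x", "class": "link", "text": "text"}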
def extract_text(html):
    """Extract the text from a given HTML string."""
return _extract_node_text(document_fromstring(html))
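# Illustrative example:
#
#     extract_text("<p>Hello <b>world</b></p>")  # -> "Hello world"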
def extract_links(html):
    """Extract the href values from a given HTML string (returns a list of strings)."""
return document_fromstring(html).xpath(".//@href")
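# Illustrative example:
#
#     extract_links('<a href="/a">x</a> <a href="/b">y</a>')  # -> ["/a", "/b"]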