diff options
Diffstat (limited to 'chimere/utils.py')
-rw-r--r-- | chimere/utils.py | 71 |
1 file changed, 71 insertions, 0 deletions
import urllib2
from BeautifulSoup import BeautifulSoup
from lxml import etree

# Regexes applied to every imported value once items are collected:
# 1. strip newlines and leading/trailing whitespace,
# 2. collapse runs of spaces,
# 3. make relative <a href="..."> links absolute (the replacement string
#    is %-interpolated with {'base_url': ...} before use).
RE_CLEANS = ((re.compile('(\n)*|^( )*(\n)*( )*|( )*(\n)*( )*$'), ''),
             (re.compile(' ( )*'), ' '),
             (re.compile(r"""<a href=["'](?!https?)(.*)["']"""),
              '<a href="%(base_url)s\\1"'),
             )


class HtmlXsltManager(ImportManager):
    u"""Import items by scraping an HTML page through an XSLT stylesheet.

    The importer's ``source`` URL is fetched, tidied with BeautifulSoup and
    transformed with the XSLT stylesheet attached to the importer.  An
    optional alternate XSLT (``source_file_alt``) is applied to each item's
    linked page to harvest extra fields.
    """

    def _fetch_html_dom(self, url):
        u"""Fetch *url* and return it as an lxml DOM, or None on failure.

        Failure covers network errors and any non-200 HTTP status.  The raw
        HTML is prettified with BeautifulSoup first so that lxml receives a
        reasonably well-formed document.
        """
        try:
            page = urllib2.urlopen(url)
        except urllib2.URLError:
            return None
        # was: ``assert page.getcode() == 200`` — asserts vanish under
        # ``python -O``, so check the status code explicitly instead.
        if page.getcode() != 200:
            return None
        html = BeautifulSoup(page.read()).prettify()
        return etree.HTML(html, etree.HTMLParser())

    def _load_xslt(self, source):
        u"""Parse *source* into an XSLT transformer, or None if invalid."""
        try:
            return etree.XSLT(etree.parse(source))
        except (etree.XSLTParseError, etree.XMLSyntaxError):
            return None

    def get(self):
        u"""Import items from the configured HTML source.

        Return a 3-tuple:
        - number of new items;
        - number of updated items;
        - error detail on error (empty string on success).
        """
        dom = self._fetch_html_dom(self.importer_instance.source)
        if dom is None:
            return (0, 0, _(u"Source page is unreachable."))
        transform = self._load_xslt(self.importer_instance.source_file)
        if transform is None:
            return (0, 0, _(u"The source file is not a valid XSLT file."))
        newdom = transform(dom)
        # optional alternate XSLT applied to each item's linked page
        transform_child = None
        if self.importer_instance.source_file_alt:
            transform_child = self._load_xslt(
                self.importer_instance.source_file_alt)
            if transform_child is None:
                return (0, 0,
                        _(u"The alt source file is not a valid XSLT file."))
        base_url = u"/".join(
            self.importer_instance.source.split(u'/')[:-1]) + u"/"
        items = []
        for item in newdom.getroot():
            c_item = {child.tag: child.text for child in item.getchildren()
                      if child.text}
            # try to get more information from the linked page
            if transform_child and 'link' in c_item:
                if not c_item['link'].startswith(('http://', 'https://')):
                    # relative link: anchor it on the source page
                    c_item['link'] = base_url + c_item['link']
                child_dom = self._fetch_html_dom(c_item['link'])
                if child_dom is None:
                    # a dead link must not abort the whole import
                    items.append(c_item)
                    continue
                extra_keys = transform_child(child_dom).getroot()
                if len(extra_keys):
                    c_item.update({extra.tag: etree.tostring(extra)
                                   for extra in extra_keys[0].getchildren()})
            items.append(c_item)
        # clean values: whitespace normalised, relative links made absolute;
        # the %-interpolation is loop-invariant, so do it once per pattern.
        cleans = [(regex, repl % {'base_url': base_url})
                  for regex, repl in RE_CLEANS]
        for item in items:
            for key in item:
                value = item[key]
                for regex, repl in cleans:
                    value = regex.sub(repl, value)
                item[key] = value
        # was: ``return (42, 43, '')`` — debug placeholders contradicting the
        # documented contract.  Report the real item count; nothing is ever
        # detected as "updated" by this importer.
        # TODO(review): confirm count semantics against other ImportManager
        # subclasses (e.g. OSMManager).
        return (len(items), 0, '')