Diffstat (limited to 'chimere/utils.py')
-rw-r--r--   chimere/utils.py   71
1 files changed, 71 insertions, 0 deletions
diff --git a/chimere/utils.py b/chimere/utils.py
index efaf084..894c13c 100644
--- a/chimere/utils.py
+++ b/chimere/utils.py
@@ -835,6 +835,17 @@ class OSMManager(ImportManager):
         api.ChangesetClose()
         return idx+1, None
 
+import urllib2
+from BeautifulSoup import BeautifulSoup
+#from tidylib import tidy_document
+from lxml import etree
+
+RE_CLEANS = ((re.compile('(\n)*|^( )*(\n)*( )*|( )*(\n)*( )*$'), ''),
+             (re.compile(' ( )*'), ' '),
+             (re.compile(r"""<a href=["'](?!https?)(.*)["']"""),
+                        '<a href="%(base_url)s\\1"'),
+             )
+
 class HtmlXsltManager(ImportManager):
     def get(self):
         u"""
@@ -845,3 +856,63 @@ class HtmlXsltManager(ImportManager):
         - updated items;
         - error detail on error.
         """
+        try:
+            main_page = urllib2.urlopen(self.importer_instance.source)
+            assert main_page.getcode() == 200
+        except (urllib2.URLError, AssertionError):
+            return (0, 0, _(u"Source page is unreachable."))
+        # first prettify with BeautifulSoup
+        main_page = BeautifulSoup(main_page.read()).prettify()
+        # convert it to valid XHTML
+        #doc, errors = tidy_document(main_page)
+        doc = main_page
+        dom = etree.HTML(doc, etree.HTMLParser())
+        try:
+            xslt = etree.parse(self.importer_instance.source_file)
+            transform = etree.XSLT(xslt)
+        except (etree.XSLTParseError, etree.XMLSyntaxError):
+            return (0, 0, _(u"The source file is not a valid XSLT file."))
+        newdom = transform(dom)
+        items = []
+        # load an alternate xslt file to apply to linked page
+        transform_child = None
+        if self.importer_instance.source_file_alt:
+            try:
+                alt_xslt = etree.parse(self.importer_instance.source_file_alt)
+                transform_child = etree.XSLT(alt_xslt)
+            except (etree.XSLTParseError, etree.XMLSyntaxError):
+                return (0, 0,
+                        _(u"The alt source file is not a valid XSLT file."))
+        base_url = u"/".join(self.importer_instance.source.split(u'/')[:-1])
+        base_url += u"/"
+        for item in newdom.getroot():
+            c_item = {child.tag:child.text for child in item.getchildren()
+                      if child.text}
+            # try to have more information on the linked page
+            if transform_child and 'link' in c_item:
+                # not an absolute address
+                if not c_item['link'].startswith('http://') and \
+                   not c_item['link'].startswith('https://'):
+                    c_item['link'] = base_url + c_item['link']
+                try:
+                    child_page = urllib2.urlopen(c_item['link'])
+                    assert child_page.getcode() == 200
+                except (urllib2.URLError, AssertionError):
+                    # don't stop the export for a bad link
+                    items.append(c_item)
+                    continue
+                child_page = BeautifulSoup(child_page.read()).prettify()
+                child_dom = etree.HTML(child_page, etree.HTMLParser())
+                extra_keys = transform_child(child_dom).getroot()
+                if len(extra_keys):
+                    c_item.update({extra.tag:etree.tostring(extra)
+                                   for extra in extra_keys[0].getchildren()})
+            items.append(c_item)
+        # change relative link to full link and simplify
+        for item in items:
+            for k in item:
+                val = item[k]
+                for r, replaced in RE_CLEANS:
+                    val = re.sub(r, replaced % {'base_url':base_url}, val)
+                item[k] = val
+        return (42, 43, '')
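
The added get() method is a small pipeline: fetch the source page with urllib2, normalise the markup with BeautifulSoup's prettify(), parse it with lxml's HTML parser, apply the importer's XSLT stylesheet, and build one dict per resulting element (the RE_CLEANS pass then strips stray whitespace and prefixes relative <a href> targets with base_url). Below is a minimal standalone sketch of that pipeline, written in Python 2 to match the patch; the URL and the stylesheet are hypothetical examples, not Chimère configuration.

    # Minimal sketch of the fetch / prettify / transform pipeline used by
    # HtmlXsltManager.get(). Python 2; SOURCE_URL and XSLT_DOC are made up.
    import urllib2

    from BeautifulSoup import BeautifulSoup
    from lxml import etree

    SOURCE_URL = 'http://example.org/events.html'  # hypothetical source page

    # Illustrative stylesheet: turn every <li class="event"> into an <item>
    # element with <name> and <link> children.
    XSLT_DOC = '''
    <xsl:stylesheet version="1.0"
                    xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
      <xsl:template match="/">
        <items>
          <xsl:for-each select="//li[@class='event']">
            <item>
              <name><xsl:value-of select="normalize-space(.)"/></name>
              <link><xsl:value-of select="a/@href"/></link>
            </item>
          </xsl:for-each>
        </items>
      </xsl:template>
    </xsl:stylesheet>'''

    def fetch_items(url):
        # fetch the page and make sure the server answered with 200
        page = urllib2.urlopen(url)
        assert page.getcode() == 200
        # prettify the (possibly invalid) HTML before handing it to lxml
        html = BeautifulSoup(page.read()).prettify()
        dom = etree.HTML(html, etree.HTMLParser())
        transform = etree.XSLT(etree.XML(XSLT_DOC))
        result = transform(dom)
        # same dict-per-item shape as c_item in the patch above
        return [{child.tag: child.text for child in item if child.text}
                for item in result.getroot()]

    if __name__ == '__main__':
        for item in fetch_items(SOURCE_URL):
            print item

The optional source_file_alt stylesheet in the patch reuses the same idea: each linked page is fetched, prettified and transformed in the same way, and its output elements are merged into the item dict.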
