diff options
Diffstat (limited to 'chimere/utils.py')
-rw-r--r-- | chimere/utils.py | 71 |
1 file changed, 71 insertions, 0 deletions
import urllib2
from BeautifulSoup import BeautifulSoup
from lxml import etree

# Regexes applied to every imported value once items are collected:
# 1. strip newlines and leading/trailing whitespace,
# 2. collapse runs of spaces,
# 3. make relative <a href="..."> links absolute (the replacement string
#    is %-interpolated with {'base_url': ...} before use).
RE_CLEANS = ((re.compile('(\n)*|^( )*(\n)*( )*|( )*(\n)*( )*$'), ''),
             (re.compile(' ( )*'), ' '),
             (re.compile(r"""<a href=["'](?!https?)(.*)["']"""),
              '<a href="%(base_url)s\\1"'),
             )


class HtmlXsltManager(ImportManager):
    u"""Import items by scraping an HTML page through an XSLT stylesheet.

    The importer's ``source`` URL is fetched, tidied with BeautifulSoup and
    transformed with the XSLT stylesheet attached to the importer.  An
    optional alternate XSLT (``source_file_alt``) is applied to each item's
    linked page to harvest extra fields.
    """

    def _fetch_html_dom(self, url):
        u"""Fetch *url* and return it as an lxml DOM, or None on failure.

        Failure covers network errors and any non-200 HTTP status.  The raw
        HTML is prettified with BeautifulSoup first so that lxml receives a
        reasonably well-formed document.
        """
        try:
            page = urllib2.urlopen(url)
        except urllib2.URLError:
            return None
        # was: ``assert page.getcode() == 200`` — asserts vanish under
        # ``python -O``, so check the status code explicitly instead.
        if page.getcode() != 200:
            return None
        html = BeautifulSoup(page.read()).prettify()
        return etree.HTML(html, etree.HTMLParser())

    def _load_xslt(self, source):
        u"""Parse *source* into an XSLT transformer, or None if invalid."""
        try:
            return etree.XSLT(etree.parse(source))
        except (etree.XSLTParseError, etree.XMLSyntaxError):
            return None

    def get(self):
        u"""Import items from the configured HTML source.

        Return a 3-tuple:
        - number of new items;
        - number of updated items;
        - error detail on error (empty string on success).
        """
        dom = self._fetch_html_dom(self.importer_instance.source)
        if dom is None:
            return (0, 0, _(u"Source page is unreachable."))
        transform = self._load_xslt(self.importer_instance.source_file)
        if transform is None:
            return (0, 0, _(u"The source file is not a valid XSLT file."))
        newdom = transform(dom)
        # optional alternate XSLT applied to each item's linked page
        transform_child = None
        if self.importer_instance.source_file_alt:
            transform_child = self._load_xslt(
                self.importer_instance.source_file_alt)
            if transform_child is None:
                return (0, 0,
                        _(u"The alt source file is not a valid XSLT file."))
        base_url = u"/".join(
            self.importer_instance.source.split(u'/')[:-1]) + u"/"
        items = []
        for item in newdom.getroot():
            c_item = {child.tag: child.text for child in item.getchildren()
                      if child.text}
            # try to get more information from the linked page
            if transform_child and 'link' in c_item:
                if not c_item['link'].startswith(('http://', 'https://')):
                    # relative link: anchor it on the source page
                    c_item['link'] = base_url + c_item['link']
                child_dom = self._fetch_html_dom(c_item['link'])
                if child_dom is None:
                    # a dead link must not abort the whole import
                    items.append(c_item)
                    continue
                extra_keys = transform_child(child_dom).getroot()
                if len(extra_keys):
                    c_item.update({extra.tag: etree.tostring(extra)
                                   for extra in extra_keys[0].getchildren()})
            items.append(c_item)
        # clean values: whitespace normalised, relative links made absolute;
        # the %-interpolation is loop-invariant, so do it once per pattern.
        cleans = [(regex, repl % {'base_url': base_url})
                  for regex, repl in RE_CLEANS]
        for item in items:
            for key in item:
                value = item[key]
                for regex, repl in cleans:
                    value = regex.sub(repl, value)
                item[key] = value
        # was: ``return (42, 43, '')`` — debug placeholders contradicting the
        # documented contract.  Report the real item count; nothing is ever
        # detected as "updated" by this importer.
        # TODO(review): confirm count semantics against other ImportManager
        # subclasses (e.g. OSMManager).
        return (len(items), 0, '')