-rw-r--r--  chimere/models.py  |  3
-rw-r--r--  chimere/tests.py   |  7
-rw-r--r--  chimere/utils.py   | 71
-rw-r--r--  debian/control     |  3
4 files changed, 82 insertions, 2 deletions
diff --git a/chimere/models.py b/chimere/models.py
index 952594c..727c3c7 100644
--- a/chimere/models.py
+++ b/chimere/models.py
@@ -326,7 +326,8 @@ class Importer(models.Model):
     filtr = models.CharField(_(u"Filter"), max_length=200,
                              blank=True, null=True)
     source = models.CharField(_(u"Web address"), max_length=200,
-                              blank=True, null=True)
+                              blank=True, null=True,
+                              help_text=_(u"Don't forget the trailing slash"))
     source_file = models.FileField(_(u"Source file"),
                         upload_to='import_files', blank=True, null=True)
     source_file_alt = models.FileField(_(u"Alt source file"),
diff --git a/chimere/tests.py b/chimere/tests.py
index 070443e..5c76d2b 100644
--- a/chimere/tests.py
+++ b/chimere/tests.py
@@ -296,6 +296,13 @@ class GeoRSSImporterTest(TestCase, ImporterTest):
         self.marker_importers = [(importer1, 1), (importer2, 32)]
 
+class HtmlXsltImporterTest(TestCase, ImporterTest):
+    def setUp(self):
+        subcategories = subcategory_setup()
+        importer1 = Importer.objects.create(importer_type='XSLT',
+                         source='http://www.ville-villierslebacle.fr/')
+        self.marker_importers = [(importer1, 5),]
+
 
 class FeedsTest(TestCase):
     def setUp(self):
         self.areas = areas_setup()
diff --git a/chimere/utils.py b/chimere/utils.py
index efaf084..894c13c 100644
--- a/chimere/utils.py
+++ b/chimere/utils.py
@@ -835,6 +835,17 @@ class OSMManager(ImportManager):
         api.ChangesetClose()
         return idx+1, None
 
+import urllib2
+from BeautifulSoup import BeautifulSoup
+#from tidylib import tidy_document
+from lxml import etree
+
+RE_CLEANS = ((re.compile('(\n)*|^( )*(\n)*( )*|( )*(\n)*( )*$'), ''),
+             (re.compile(' ( )*'), ' '),
+             (re.compile(r"""<a href=["'](?!https?)(.*)["']"""),
+                        '<a href="%(base_url)s\\1"'),
+             )
+
 class HtmlXsltManager(ImportManager):
     def get(self):
         u"""
@@ -845,3 +856,63 @@ class HtmlXsltManager(ImportManager):
         - updated items;
         - error detail on error.
         """
+        try:
+            main_page = urllib2.urlopen(self.importer_instance.source)
+            assert main_page.getcode() == 200
+        except (urllib2.URLError, AssertionError):
+            return (0, 0, _(u"Source page is unreachable."))
+        # first prettify with BeautifulSoup
+        main_page = BeautifulSoup(main_page.read()).prettify()
+        # convert it to valid XHTML
+        #doc, errors = tidy_document(main_page)
+        doc = main_page
+        dom = etree.HTML(doc, etree.HTMLParser())
+        try:
+            xslt = etree.parse(self.importer_instance.source_file)
+            transform = etree.XSLT(xslt)
+        except (etree.XSLTParseError, etree.XMLSyntaxError):
+            return (0, 0, _(u"The source file is not a valid XSLT file."))
+        newdom = transform(dom)
+        items = []
+        # load an alternate xslt file to apply to linked page
+        transform_child = None
+        if self.importer_instance.source_file_alt:
+            try:
+                alt_xslt = etree.parse(self.importer_instance.source_file_alt)
+                transform_child = etree.XSLT(alt_xslt)
+            except (etree.XSLTParseError, etree.XMLSyntaxError):
+                return (0, 0,
+                        _(u"The alt source file is not a valid XSLT file."))
+        base_url = u"/".join(self.importer_instance.source.split(u'/')[:-1])
+        base_url += u"/"
+        for item in newdom.getroot():
+            c_item = {child.tag:child.text for child in item.getchildren()
+                                                             if child.text}
+            # try to have more information on the linked page
+            if transform_child and 'link' in c_item:
+                # not an absolute address
+                if not c_item['link'].startswith('http://') and \
+                   not c_item['link'].startswith('https://'):
+                    c_item['link'] = base_url + c_item['link']
+                try:
+                    child_page = urllib2.urlopen(c_item['link'])
+                    assert child_page.getcode() == 200
+                except (urllib2.URLError, AssertionError):
+                    # don't stop the export for a bad link
+                    items.append(c_item)
+                    continue
+                child_page = BeautifulSoup(child_page.read()).prettify()
+                child_dom = etree.HTML(child_page, etree.HTMLParser())
+                extra_keys = transform_child(child_dom).getroot()
+                if len(extra_keys):
+                    c_item.update({extra.tag:etree.tostring(extra)
+                            for extra in extra_keys[0].getchildren()})
+            items.append(c_item)
+        # change relative link to full link and simplify
+        for item in items:
+            for k in item:
+                val = item[k]
+                for r, replaced in RE_CLEANS:
+                    val = re.sub(r, replaced % {'base_url':base_url}, val)
+                item[k] = val
+        return (42, 43, '')
diff --git a/debian/control b/debian/control
index 71b2461..75e0526 100644
--- a/debian/control
+++ b/debian/control
@@ -24,7 +24,8 @@ Depends: ${misc:Depends},
  python-simplejson,
  python-django-south,
  python-pyexiv2,
- python-feedparser
+ python-feedparser,
+ python-lxml
 Recommends: javascript-common,
  libjs-jquery,
  libjs-jquery-ui,

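Note on the expected XSLT output: the new HtmlXsltManager leaves the shape of the stylesheet result implicit. It iterates over the children of the transform result and turns each one into a dict keyed by child tag names, and a 'link' child, if present, points to a detail page that the alternate stylesheet may enrich. The standalone sketch below (Python 2, lxml only, no network access) illustrates that contract; the HTML snippet, the inline stylesheet and every tag name except 'link' are invented for the example and are not part of the commit.

# -*- coding: utf-8 -*-
# Illustrative sketch only: mirrors the parse -> XSLT -> dict-per-item loop
# of HtmlXsltManager.get(); element names <items>/<item>/<name> are made up.
from lxml import etree

HTML_PAGE = """
<html><body>
  <div class="event"><h2>Fete du village</h2><a href="agenda/fete.html">more</a></div>
  <div class="event"><h2>Marche</h2><a href="agenda/marche.html">more</a></div>
</body></html>
"""

STYLESHEET = """
<xsl:stylesheet version="1.0"
                xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  <xsl:template match="/">
    <items>
      <xsl:for-each select="//div[@class='event']">
        <item>
          <name><xsl:value-of select="h2"/></name>
          <link><xsl:value-of select="a/@href"/></link>
        </item>
      </xsl:for-each>
    </items>
  </xsl:template>
</xsl:stylesheet>
"""

# Same pipeline as the manager: parse the HTML, apply the stylesheet,
# then build one dict per item element, keyed by the child tag names.
dom = etree.HTML(HTML_PAGE, etree.HTMLParser())
transform = etree.XSLT(etree.XML(STYLESHEET))
newdom = transform(dom)
items = [{child.tag: child.text for child in item if child.text}
         for item in newdom.getroot()]
print items
# prints something like:
# [{'name': 'Fete du village', 'link': 'agenda/fete.html'},
#  {'name': 'Marche', 'link': 'agenda/marche.html'}]

In the importer itself the stylesheet comes from Importer.source_file and the HTML from Importer.source, fetched with urllib2 and prettified with BeautifulSoup before parsing, with relative 'link' values resolved against the trailing-slash base URL.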