author    | Étienne Loks <etienne.loks@peacefrogs.net> | 2013-10-30 03:08:06 +0100
committer | Étienne Loks <etienne.loks@peacefrogs.net> | 2013-10-30 03:08:06 +0100
commit    | 5b41f143a3956fbd9d3a2288e607e6aa8d8c3451 (patch)
tree      | bd15b032cf9ee7e843c7b40c407a89c686f788d2
parent    | 07eff011922affd5733814c13236fe149cd64fd1 (diff)
Importer HTML-XSLT: retrieving and parsing of an external website using a URL and XSLT files
-rw-r--r-- | chimere/models.py | 3
-rw-r--r-- | chimere/tests.py  | 7
-rw-r--r-- | chimere/utils.py  | 71
-rw-r--r-- | debian/control    | 3
4 files changed, 82 insertions, 2 deletions
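Before the patch itself, here is a minimal standalone sketch of the fetch-prettify-transform pipeline that the diff below wires into HtmlXsltManager.get(), assuming Python 2 with BeautifulSoup 3 and lxml installed (the same libraries the patch imports). The example URL and the items.xsl stylesheet path are placeholders for this illustration, not values from the commit.

```python
# Sketch of the HTML-XSLT import pipeline (Python 2, BeautifulSoup 3, lxml).
# The URL and 'items.xsl' are hypothetical placeholders.
import urllib2
from BeautifulSoup import BeautifulSoup
from lxml import etree

source = 'http://www.example.org/'   # hypothetical source page (trailing slash matters)
page = urllib2.urlopen(source)
assert page.getcode() == 200

# BeautifulSoup 3 tolerates broken markup and re-serialises it, which gives
# lxml's HTML parser cleaner input to work with.
html = BeautifulSoup(page.read()).prettify()
dom = etree.HTML(html, etree.HTMLParser())

# The importer's "Source file" is an XSLT stylesheet; it is expected to boil
# the page down to a flat list of elements whose children become item fields.
transform = etree.XSLT(etree.parse('items.xsl'))   # hypothetical stylesheet path
result = transform(dom)

for item in result.getroot():
    print {child.tag: child.text for child in item.getchildren() if child.text}
```

The stylesheet is expected to reduce the page to a flat list of elements whose children become each item's fields (get() looks specifically for a link child to follow); an optional second stylesheet, the "Alt source file", is then applied to each linked page to pull in extra fields.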
diff --git a/chimere/models.py b/chimere/models.py
index 952594c..727c3c7 100644
--- a/chimere/models.py
+++ b/chimere/models.py
@@ -326,7 +326,8 @@ class Importer(models.Model):
     filtr = models.CharField(_(u"Filter"), max_length=200, blank=True,
                              null=True)
     source = models.CharField(_(u"Web address"), max_length=200,
-                              blank=True, null=True)
+                              blank=True, null=True,
+                              help_text=_(u"Don't forget the trailing slash"))
     source_file = models.FileField(_(u"Source file"), upload_to='import_files',
                                    blank=True, null=True)
     source_file_alt = models.FileField(_(u"Alt source file"),
diff --git a/chimere/tests.py b/chimere/tests.py
index 070443e..5c76d2b 100644
--- a/chimere/tests.py
+++ b/chimere/tests.py
@@ -296,6 +296,13 @@ class GeoRSSImporterTest(TestCase, ImporterTest):
         self.marker_importers = [(importer1, 1), (importer2, 32)]
 
 
+class HtmlXsltImporterTest(TestCase, ImporterTest):
+    def setUp(self):
+        subcategories = subcategory_setup()
+        importer1 = Importer.objects.create(importer_type='XSLT',
+            source='http://www.ville-villierslebacle.fr/')
+        self.marker_importers = [(importer1, 5),]
+
 class FeedsTest(TestCase):
     def setUp(self):
         self.areas = areas_setup()
diff --git a/chimere/utils.py b/chimere/utils.py
index efaf084..894c13c 100644
--- a/chimere/utils.py
+++ b/chimere/utils.py
@@ -835,6 +835,17 @@ class OSMManager(ImportManager):
             api.ChangesetClose()
         return idx+1, None
 
+import urllib2
+from BeautifulSoup import BeautifulSoup
+#from tidylib import tidy_document
+from lxml import etree
+
+RE_CLEANS = ((re.compile('(\n)*|^( )*(\n)*( )*|( )*(\n)*( )*$'), ''),
+             (re.compile(' ( )*'), ' '),
+             (re.compile(r"""<a href=["'](?!https?)(.*)["']"""),
+              '<a href="%(base_url)s\\1"'),
+             )
+
 class HtmlXsltManager(ImportManager):
     def get(self):
         u"""
@@ -845,3 +856,63 @@ class HtmlXsltManager(ImportManager):
         - updated items;
         - error detail on error.
         """
+        try:
+            main_page = urllib2.urlopen(self.importer_instance.source)
+            assert main_page.getcode() == 200
+        except (urllib2.URLError, AssertionError):
+            return (0, 0, _(u"Source page is unreachable."))
+        # first prettify with BeautifulSoup
+        main_page = BeautifulSoup(main_page.read()).prettify()
+        # convert it to valid XHTML
+        #doc, errors = tidy_document(main_page)
+        doc = main_page
+        dom = etree.HTML(doc, etree.HTMLParser())
+        try:
+            xslt = etree.parse(self.importer_instance.source_file)
+            transform = etree.XSLT(xslt)
+        except (etree.XSLTParseError, etree.XMLSyntaxError):
+            return (0, 0, _(u"The source file is not a valid XSLT file."))
+        newdom = transform(dom)
+        items = []
+        # load an alternate xslt file to apply to linked page
+        transform_child = None
+        if self.importer_instance.source_file_alt:
+            try:
+                alt_xslt = etree.parse(self.importer_instance.source_file_alt)
+                transform_child = etree.XSLT(alt_xslt)
+            except (etree.XSLTParseError, etree.XMLSyntaxError):
+                return (0, 0,
+                        _(u"The alt source file is not a valid XSLT file."))
+        base_url = u"/".join(self.importer_instance.source.split(u'/')[:-1])
+        base_url += u"/"
+        for item in newdom.getroot():
+            c_item = {child.tag:child.text for child in item.getchildren()
+                      if child.text}
+            # try to have more information on the linked page
+            if transform_child and 'link' in c_item:
+                # not an absolute address
+                if not c_item['link'].startswith('http://') and \
+                   not c_item['link'].startswith('https://'):
+                    c_item['link'] = base_url + c_item['link']
+                try:
+                    child_page = urllib2.urlopen(c_item['link'])
+                    assert child_page.getcode() == 200
+                except (urllib2.URLError, AssertionError):
+                    # don't stop the export for a bad link
+                    items.append(c_item)
+                    continue
+                child_page = BeautifulSoup(child_page.read()).prettify()
+                child_dom = etree.HTML(child_page, etree.HTMLParser())
+                extra_keys = transform_child(child_dom).getroot()
+                if len(extra_keys):
+                    c_item.update({extra.tag:etree.tostring(extra)
+                                   for extra in extra_keys[0].getchildren()})
+            items.append(c_item)
+        # change relative link to full link and simplify
+        for item in items:
+            for k in item:
+                val = item[k]
+                for r, replaced in RE_CLEANS:
+                    val = re.sub(r, replaced % {'base_url':base_url}, val)
+                item[k] = val
+        return (42, 43, '')
diff --git a/debian/control b/debian/control
index 71b2461..75e0526 100644
--- a/debian/control
+++ b/debian/control
@@ -24,7 +24,8 @@ Depends: ${misc:Depends},
  python-simplejson,
  python-django-south,
  python-pyexiv2,
- python-feedparser
+ python-feedparser,
+ python-lxml
 Recommends: javascript-common,
  libjs-jquery,
  libjs-jquery-ui,