summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- chimere/models.py 3
-rw-r--r-- chimere/tests.py 7
-rw-r--r-- chimere/utils.py 71
-rw-r--r-- debian/control 3
4 files changed, 82 insertions, 2 deletions
diff --git a/chimere/models.py b/chimere/models.py
index 952594c..727c3c7 100644
--- a/chimere/models.py
+++ b/chimere/models.py
@@ -326,7 +326,8 @@ class Importer(models.Model):
filtr = models.CharField(_(u"Filter"), max_length=200,
blank=True, null=True)
source = models.CharField(_(u"Web address"), max_length=200,
- blank=True, null=True)
+ blank=True, null=True,
+ help_text=_(u"Don't forget the trailing slash"))
source_file = models.FileField(_(u"Source file"),
upload_to='import_files', blank=True, null=True)
source_file_alt = models.FileField(_(u"Alt source file"),
diff --git a/chimere/tests.py b/chimere/tests.py
index 070443e..5c76d2b 100644
--- a/chimere/tests.py
+++ b/chimere/tests.py
@@ -296,6 +296,13 @@ class GeoRSSImporterTest(TestCase, ImporterTest):
self.marker_importers = [(importer1, 1), (importer2, 32)]
+class HtmlXsltImporterTest(TestCase, ImporterTest):
+ def setUp(self):
+ subcategories = subcategory_setup()
+ importer1 = Importer.objects.create(importer_type='XSLT',
+ source='http://www.ville-villierslebacle.fr/')
+ self.marker_importers = [(importer1, 5),]
+
class FeedsTest(TestCase):
def setUp(self):
self.areas = areas_setup()
diff --git a/chimere/utils.py b/chimere/utils.py
index efaf084..894c13c 100644
--- a/chimere/utils.py
+++ b/chimere/utils.py
@@ -835,6 +835,17 @@ class OSMManager(ImportManager):
api.ChangesetClose()
return idx+1, None
+import urllib2
+from BeautifulSoup import BeautifulSoup
+#from tidylib import tidy_document
+from lxml import etree
+
+RE_CLEANS = ((re.compile('(\n)*|^( )*(\n)*( )*|( )*(\n)*( )*$'), ''),
+ (re.compile(' ( )*'), ' '),
+ (re.compile(r"""<a href=["'](?!https?)(.*)["']"""),
+ '<a href="%(base_url)s\\1"'),
+ )
+
class HtmlXsltManager(ImportManager):
def get(self):
u"""
@@ -845,3 +856,63 @@ class HtmlXsltManager(ImportManager):
- updated items;
- error detail on error.
"""
+ try:
+ main_page = urllib2.urlopen(self.importer_instance.source)
+ assert main_page.getcode() == 200
+ except (urllib2.URLError, AssertionError):
+ return (0, 0, _(u"Source page is unreachable."))
+ # first prettify with BeautifulSoup
+ main_page = BeautifulSoup(main_page.read()).prettify()
+ # convert it to valid XHTML
+ #doc, errors = tidy_document(main_page)
+ doc = main_page
+ dom = etree.HTML(doc, etree.HTMLParser())
+ try:
+ xslt = etree.parse(self.importer_instance.source_file)
+ transform = etree.XSLT(xslt)
+ except (etree.XSLTParseError, etree.XMLSyntaxError):
+ return (0, 0, _(u"The source file is not a valid XSLT file."))
+ newdom = transform(dom)
+ items = []
+ # load an alternate xslt file to apply to linked page
+ transform_child = None
+ if self.importer_instance.source_file_alt:
+ try:
+ alt_xslt = etree.parse(self.importer_instance.source_file_alt)
+ transform_child = etree.XSLT(alt_xslt)
+ except (etree.XSLTParseError, etree.XMLSyntaxError):
+ return (0, 0,
+ _(u"The alt source file is not a valid XSLT file."))
+ base_url = u"/".join(self.importer_instance.source.split(u'/')[:-1])
+ base_url += u"/"
+ for item in newdom.getroot():
+ c_item = {child.tag:child.text for child in item.getchildren()
+ if child.text}
+ # try to have more information on the linked page
+ if transform_child and 'link' in c_item:
+ # not an absolute address
+ if not c_item['link'].startswith('http://') and \
+ not c_item['link'].startswith('https://'):
+ c_item['link'] = base_url + c_item['link']
+ try:
+ child_page = urllib2.urlopen(c_item['link'])
+ assert child_page.getcode() == 200
+ except (urllib2.URLError, AssertionError):
+ # don't stop the export for a bad link
+ items.append(c_item)
+ continue
+ child_page = BeautifulSoup(child_page.read()).prettify()
+ child_dom = etree.HTML(child_page, etree.HTMLParser())
+ extra_keys = transform_child(child_dom).getroot()
+ if len(extra_keys):
+ c_item.update({extra.tag:etree.tostring(extra)
+ for extra in extra_keys[0].getchildren()})
+ items.append(c_item)
+ # change relative link to full link and simplify
+ for item in items:
+ for k in item:
+ val = item[k]
+ for r, replaced in RE_CLEANS:
+ val = re.sub(r, replaced % {'base_url':base_url}, val)
+ item[k] = val
+ return (42, 43, '')
diff --git a/debian/control b/debian/control
index 71b2461..75e0526 100644
--- a/debian/control
+++ b/debian/control
@@ -24,7 +24,8 @@ Depends: ${misc:Depends},
python-simplejson,
python-django-south,
python-pyexiv2,
- python-feedparser
+ python-feedparser,
+ python-lxml
Recommends: javascript-common,
libjs-jquery,
libjs-jquery-ui,