summary refs log tree commit diff
path: root/chimere/utils.py
diff options
context:
space:
mode:
Diffstat (limited to 'chimere/utils.py')
-rw-r--r--  chimere/utils.py  71
1 file changed, 71 insertions, 0 deletions
diff --git a/chimere/utils.py b/chimere/utils.py
index efaf084..894c13c 100644
--- a/chimere/utils.py
+++ b/chimere/utils.py
@@ -835,6 +835,17 @@ class OSMManager(ImportManager):
api.ChangesetClose()
return idx+1, None
+import urllib2
+from BeautifulSoup import BeautifulSoup
+#from tidylib import tidy_document
+from lxml import etree
+
+RE_CLEANS = ((re.compile('(\n)*|^( )*(\n)*( )*|( )*(\n)*( )*$'), ''),
+ (re.compile(' ( )*'), ' '),
+ (re.compile(r"""<a href=["'](?!https?)(.*)["']"""),
+ '<a href="%(base_url)s\\1"'),
+ )
+
class HtmlXsltManager(ImportManager):
def get(self):
u"""
@@ -845,3 +856,63 @@ class HtmlXsltManager(ImportManager):
- updated items;
- error detail on error.
"""
+ try:
+ main_page = urllib2.urlopen(self.importer_instance.source)
+ assert main_page.getcode() == 200
+ except (urllib2.URLError, AssertionError):
+ return (0, 0, _(u"Source page is unreachable."))
+ # first prettify with BeautifulSoup
+ main_page = BeautifulSoup(main_page.read()).prettify()
+ # convert it to valid XHTML
+ #doc, errors = tidy_document(main_page)
+ doc = main_page
+ dom = etree.HTML(doc, etree.HTMLParser())
+ try:
+ xslt = etree.parse(self.importer_instance.source_file)
+ transform = etree.XSLT(xslt)
+ except (etree.XSLTParseError, etree.XMLSyntaxError):
+ return (0, 0, _(u"The source file is not a valid XSLT file."))
+ newdom = transform(dom)
+ items = []
+ # load an alternate xslt file to apply to linked page
+ transform_child = None
+ if self.importer_instance.source_file_alt:
+ try:
+ alt_xslt = etree.parse(self.importer_instance.source_file_alt)
+ transform_child = etree.XSLT(alt_xslt)
+ except (etree.XSLTParseError, etree.XMLSyntaxError):
+ return (0, 0,
+ _(u"The alt source file is not a valid XSLT file."))
+ base_url = u"/".join(self.importer_instance.source.split(u'/')[:-1])
+ base_url += u"/"
+ for item in newdom.getroot():
+ c_item = {child.tag:child.text for child in item.getchildren()
+ if child.text}
+ # try to have more information on the linked page
+ if transform_child and 'link' in c_item:
+ # not an absolute address
+ if not c_item['link'].startswith('http://') and \
+ not c_item['link'].startswith('https://'):
+ c_item['link'] = base_url + c_item['link']
+ try:
+ child_page = urllib2.urlopen(c_item['link'])
+ assert child_page.getcode() == 200
+ except (urllib2.URLError, AssertionError):
+ # don't stop the export for a bad link
+ items.append(c_item)
+ continue
+ child_page = BeautifulSoup(child_page.read()).prettify()
+ child_dom = etree.HTML(child_page, etree.HTMLParser())
+ extra_keys = transform_child(child_dom).getroot()
+ if len(extra_keys):
+ c_item.update({extra.tag:etree.tostring(extra)
+ for extra in extra_keys[0].getchildren()})
+ items.append(c_item)
+ # change relative link to full link and simplify
+ for item in items:
+ for k in item:
+ val = item[k]
+ for r, replaced in RE_CLEANS:
+ val = re.sub(r, replaced % {'base_url':base_url}, val)
+ item[k] = val
+ return (42, 43, '')