diff options
author | Étienne Loks <etienne.loks@peacefrogs.net> | 2013-10-31 23:26:32 +0100 |
---|---|---|
committer | Étienne Loks <etienne.loks@peacefrogs.net> | 2013-10-31 23:26:32 +0100 |
commit | 834673fa592f86bdf4d2a62b19d1f8081df4ae17 (patch) | |
tree | a4bb27e46e34d4e0a570ac289bffee2eb4483e61 /chimere/utils.py | |
parent | 05a9e582a5c02bb8901a6875e488465b09aec9f9 (diff) | |
download | Chimère-834673fa592f86bdf4d2a62b19d1f8081df4ae17.tar.bz2 Chimère-834673fa592f86bdf4d2a62b19d1f8081df4ae17.zip |
HTML-XSLT import: try to detect encoding automatically - unescape HTML entities
Diffstat (limited to 'chimere/utils.py')
-rw-r--r-- | chimere/utils.py | 21 |
1 files changed, 14 insertions, 7 deletions
```diff
diff --git a/chimere/utils.py b/chimere/utils.py
index 0344421..8c8624e 100644
--- a/chimere/utils.py
+++ b/chimere/utils.py
@@ -835,9 +835,8 @@ class OSMManager(ImportManager):
         api.ChangesetClose()
         return idx+1, None
 
-import urllib2
+import urllib2, chardet, HTMLParser
 from BeautifulSoup import BeautifulSoup
-#from tidylib import tidy_document
 from lxml import etree
 
 RE_CLEANS = ((re.compile('(\n)*|^( )*(\n)*( )*|( )*(\n)*( )*$'), ''),
@@ -862,8 +861,12 @@ class HtmlXsltManager(ImportManager):
             assert main_page.getcode() == 200
         except (urllib2.URLError, AssertionError):
             return (0, 0, _(u"Source page is unreachable."))
-        # first prettify with BeautifulSoup
-        main_page = BeautifulSoup(main_page.read()).prettify()
+        data = main_page.read()
+        encoding = chardet.detect(data)
+        data = data.decode(encoding['encoding'])
+
+        soup = BeautifulSoup(data)
+        main_page = soup.prettify()
         # convert it to valid XHTML
         #doc, errors = tidy_document(main_page)
         doc = main_page
@@ -904,20 +907,24 @@ class HtmlXsltManager(ImportManager):
                 # don't stop the export for a bad link
                 items.append(c_item)
                 continue
-            child_page = BeautifulSoup(child_page.read()).prettify()
+            data = child_page.read()
+            encoding = chardet.detect(data)
+            data = data.decode(encoding['encoding'])
+            child_page = BeautifulSoup(data).prettify()
             child_dom = etree.HTML(child_page, etree.HTMLParser())
             extra_keys = transform_child(child_dom).getroot()
             if len(extra_keys):
                 c_item.update({extra.tag:etree.tostring(extra)
                     for extra in extra_keys[0].getchildren()})
             items.append(c_item)
-        # change relative link to full link and simplify
+        # change relative link to full link, simplify, unescape HTML entities
+        html_unescape = HTMLParser.HTMLParser().unescape
         for item in items:
             for k in item:
                 val = item[k]
                 for r, replaced in RE_CLEANS:
                     val = re.sub(r, replaced % {'base_url':base_url}, val)
-                item[k] = val
+                item[k] = html_unescape(val)
         updated_item, new_item = 0, 0
         for item in items:
             if not self.importer_instance.default_localisation and \
```