summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Étienne Loks <etienne.loks@peacefrogs.net> 2013-10-31 23:26:32 +0100
committer Étienne Loks <etienne.loks@peacefrogs.net> 2013-10-31 23:26:32 +0100
commit 834673fa592f86bdf4d2a62b19d1f8081df4ae17 (patch)
tree a4bb27e46e34d4e0a570ac289bffee2eb4483e61
parent 05a9e582a5c02bb8901a6875e488465b09aec9f9 (diff)
download Chimère-834673fa592f86bdf4d2a62b19d1f8081df4ae17.tar.bz2
Chimère-834673fa592f86bdf4d2a62b19d1f8081df4ae17.zip
HTML-XSLT import: try to detect encoding automatically - unescape HTML entities
-rw-r--r--chimere/utils.py21
1 files changed, 14 insertions, 7 deletions
diff --git a/chimere/utils.py b/chimere/utils.py
index 0344421..8c8624e 100644
--- a/chimere/utils.py
+++ b/chimere/utils.py
@@ -835,9 +835,8 @@ class OSMManager(ImportManager):
api.ChangesetClose()
return idx+1, None
-import urllib2
+import urllib2, chardet, HTMLParser
from BeautifulSoup import BeautifulSoup
-#from tidylib import tidy_document
from lxml import etree
RE_CLEANS = ((re.compile('(\n)*|^( )*(\n)*( )*|( )*(\n)*( )*$'), ''),
@@ -862,8 +861,12 @@ class HtmlXsltManager(ImportManager):
assert main_page.getcode() == 200
except (urllib2.URLError, AssertionError):
return (0, 0, _(u"Source page is unreachable."))
- # first prettify with BeautifulSoup
- main_page = BeautifulSoup(main_page.read()).prettify()
+ data = main_page.read()
+ encoding = chardet.detect(data)
+ data = data.decode(encoding['encoding'])
+
+ soup = BeautifulSoup(data)
+ main_page = soup.prettify()
# convert it to valid XHTML
#doc, errors = tidy_document(main_page)
doc = main_page
@@ -904,20 +907,24 @@ class HtmlXsltManager(ImportManager):
# don't stop the export for a bad link
items.append(c_item)
continue
- child_page = BeautifulSoup(child_page.read()).prettify()
+ data = child_page.read()
+ encoding = chardet.detect(data)
+ data = data.decode(encoding['encoding'])
+ child_page = BeautifulSoup(data).prettify()
child_dom = etree.HTML(child_page, etree.HTMLParser())
extra_keys = transform_child(child_dom).getroot()
if len(extra_keys):
c_item.update({extra.tag:etree.tostring(extra)
for extra in extra_keys[0].getchildren()})
items.append(c_item)
- # change relative link to full link and simplify
+ # change relative link to full link, simplify, unescape HTML entities
+ html_unescape = HTMLParser.HTMLParser().unescape
for item in items:
for k in item:
val = item[k]
for r, replaced in RE_CLEANS:
val = re.sub(r, replaced % {'base_url':base_url}, val)
- item[k] = val
+ item[k] = html_unescape(val)
updated_item, new_item = 0, 0
for item in items:
if not self.importer_instance.default_localisation and \