summary | refs | log | tree | commit | diff
path: root/chimere/utils.py
diff options
context:
space:
mode:
Diffstat (limited to 'chimere/utils.py')
-rw-r--r-- chimere/utils.py 21
1 files changed, 14 insertions, 7 deletions
diff --git a/chimere/utils.py b/chimere/utils.py
index 0344421..8c8624e 100644
--- a/chimere/utils.py
+++ b/chimere/utils.py
@@ -835,9 +835,8 @@ class OSMManager(ImportManager):
api.ChangesetClose()
return idx+1, None
-import urllib2
+import urllib2, chardet, HTMLParser
from BeautifulSoup import BeautifulSoup
-#from tidylib import tidy_document
from lxml import etree
RE_CLEANS = ((re.compile('(\n)*|^( )*(\n)*( )*|( )*(\n)*( )*$'), ''),
@@ -862,8 +861,12 @@ class HtmlXsltManager(ImportManager):
assert main_page.getcode() == 200
except (urllib2.URLError, AssertionError):
return (0, 0, _(u"Source page is unreachable."))
- # first prettify with BeautifulSoup
- main_page = BeautifulSoup(main_page.read()).prettify()
+ data = main_page.read()
+ encoding = chardet.detect(data)
+ data = data.decode(encoding['encoding'])
+
+ soup = BeautifulSoup(data)
+ main_page = soup.prettify()
# convert it to valid XHTML
#doc, errors = tidy_document(main_page)
doc = main_page
@@ -904,20 +907,24 @@ class HtmlXsltManager(ImportManager):
# don't stop the export for a bad link
items.append(c_item)
continue
- child_page = BeautifulSoup(child_page.read()).prettify()
+ data = child_page.read()
+ encoding = chardet.detect(data)
+ data = data.decode(encoding['encoding'])
+ child_page = BeautifulSoup(data).prettify()
child_dom = etree.HTML(child_page, etree.HTMLParser())
extra_keys = transform_child(child_dom).getroot()
if len(extra_keys):
c_item.update({extra.tag:etree.tostring(extra)
for extra in extra_keys[0].getchildren()})
items.append(c_item)
- # change relative link to full link and simplify
+ # change relative link to full link, simplify, unescape HTML entities
+ html_unescape = HTMLParser.HTMLParser().unescape
for item in items:
for k in item:
val = item[k]
for r, replaced in RE_CLEANS:
val = re.sub(r, replaced % {'base_url':base_url}, val)
- item[k] = val
+ item[k] = html_unescape(val)
updated_item, new_item = 0, 0
for item in items:
if not self.importer_instance.default_localisation and \