diff options
Diffstat (limited to 'chimere/utils.py')
| -rw-r--r-- | chimere/utils.py | 21 | 
1 file changed, 14 insertions, 7 deletions
diff --git a/chimere/utils.py b/chimere/utils.py
index 0344421..8c8624e 100644
--- a/chimere/utils.py
+++ b/chimere/utils.py
@@ -835,9 +835,8 @@ class OSMManager(ImportManager):
         api.ChangesetClose()
         return idx+1, None
-import urllib2
+import urllib2, chardet, HTMLParser
 from BeautifulSoup import BeautifulSoup
-#from tidylib import tidy_document
 from lxml import etree
 RE_CLEANS = ((re.compile('(\n)*|^( )*(\n)*( )*|( )*(\n)*( )*$'), ''),
@@ -862,8 +861,12 @@ class HtmlXsltManager(ImportManager):
             assert main_page.getcode() == 200
         except (urllib2.URLError, AssertionError):
             return (0, 0, _(u"Source page is unreachable."))
-        # first prettify with BeautifulSoup
-        main_page = BeautifulSoup(main_page.read()).prettify()
+        data = main_page.read()
+        encoding = chardet.detect(data)
+        data = data.decode(encoding['encoding'])
+
+        soup = BeautifulSoup(data)
+        main_page = soup.prettify()
         # convert it to valid XHTML
         #doc, errors = tidy_document(main_page)
         doc = main_page
@@ -904,20 +907,24 @@ class HtmlXsltManager(ImportManager):
                     # don't stop the export for a bad link
                     items.append(c_item)
                     continue
-                child_page = BeautifulSoup(child_page.read()).prettify()
+                data = child_page.read()
+                encoding = chardet.detect(data)
+                data = data.decode(encoding['encoding'])
+                child_page = BeautifulSoup(data).prettify()
                 child_dom = etree.HTML(child_page, etree.HTMLParser())
                 extra_keys = transform_child(child_dom).getroot()
                 if len(extra_keys):
                     c_item.update({extra.tag:etree.tostring(extra)
                             for extra in extra_keys[0].getchildren()})
             items.append(c_item)
-        # change relative link to full link and simplify
+        # change relative link to full link, simplify, unescape HTML entities
+        html_unescape = HTMLParser.HTMLParser().unescape
         for item in items:
             for k in item:
                 val = item[k]
                 for r, replaced in RE_CLEANS:
                     val = re.sub(r, replaced % {'base_url':base_url}, val)
-                item[k] = val
+                item[k] = html_unescape(val)
         updated_item, new_item = 0, 0
         for item in items:
             if not self.importer_instance.default_localisation and \
