| author | Étienne Loks <etienne.loks@iggdrasil.net> | 2016-11-22 10:21:14 +0100 | 
|---|---|---|
| committer | Étienne Loks <etienne.loks@iggdrasil.net> | 2016-11-22 10:23:12 +0100 | 
| commit | 1bb13300d867e7af7de36dcfaca5833a711cdf06 (patch) | |
| tree | 39a04f397498c634bc5b217aa2a873eca125b7e7 | |
| parent | c748476b7497255ab78ba3be45733c3dd5719a60 (diff) | |
| download | Chimère-1bb13300d867e7af7de36dcfaca5833a711cdf06.tar.bz2 Chimère-1bb13300d867e7af7de36dcfaca5833a711cdf06.zip | |
Importers: many fixes with python3
| -rw-r--r-- | chimere/tests.py | 1 |
| -rw-r--r-- | chimere/tests/magny-xml.xslt | 14 |
| -rw-r--r-- | chimere/tests/villiers-le-bacle.xslt | 3 |
| -rw-r--r-- | chimere/utils.py | 173 |
| -rw-r--r-- | requirements.txt | 1 |
5 files changed, 105 insertions, 87 deletions
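
Most of the churn in chimere/utils.py follows from Python 3's bytes/str split: `urllib.request.urlopen().read()` now returns bytes, so downloads are wrapped in `io.BytesIO` instead of `io.StringIO` and decoded explicitly where text is needed, while the now-redundant `u""` string prefixes are dropped. A minimal sketch of that pattern (illustration only, with a made-up URL, not code from the commit):

```python
# Python 3: HTTP responses are bytes; keep them as bytes until a consumer
# actually needs text, then decode explicitly.
import io
import urllib.request

url = "https://example.org/export.json"    # hypothetical source URL
remotehandle = urllib.request.urlopen(url)
source = io.BytesIO(remotehandle.read())   # was io.StringIO(...) under Python 2
remotehandle.close()

text = source.read().decode("utf-8")       # e.g. what JsonManager now does before decoding JSON
```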
```diff
diff --git a/chimere/tests.py b/chimere/tests.py
index e9f8e1f..819782a 100644
--- a/chimere/tests.py
+++ b/chimere/tests.py
@@ -194,6 +194,7 @@ class ImporterTest:
 class KMLImporterTest(TestCase, ImporterTest):
     def setUp(self):
         subcategories = subcategory_setup()
+
         importer1 = Importer.objects.create(
             importer_type='KML',
             source=test_dir_path + 'tests/sample.kml',
diff --git a/chimere/tests/magny-xml.xslt b/chimere/tests/magny-xml.xslt
index 1da9212..2b0333d 100644
--- a/chimere/tests/magny-xml.xslt
+++ b/chimere/tests/magny-xml.xslt
@@ -5,14 +5,14 @@
     <items>
         <xsl:for-each select="/nodes/node">
         <item>
-            <date><xsl:value-of select="date"/></date>
-            <name>Magny-les-hameaux : <xsl:value-of select="titre"/></name>
+            <date><xsl:value-of select="Date"/></date>
+            <name>Magny-les-hameaux : <xsl:value-of select="Titre"/></name>
             <category><xsl:value-of select="tiquette"/></category>
-            <link><xsl:value-of select="lien"/></link>
-            <description><xsl:value-of select="corps"/></description>
-            <key>magny-les-hameaux-<xsl:value-of select="titre"/></key>
-            <lat><xsl:value-of select="latitude"/></lat>
-            <lon><xsl:value-of select="longitude"/></lon>
+            <link><xsl:value-of select="Lien"/></link>
+            <description><xsl:value-of select="Corps"/></description>
+            <key>magny-les-hameaux-<xsl:value-of select="Titre"/></key>
+            <lat><xsl:value-of select="Latitude"/></lat>
+            <lon><xsl:value-of select="Longitude"/></lon>
         </item>
         </xsl:for-each>
     </items>
diff --git a/chimere/tests/villiers-le-bacle.xslt b/chimere/tests/villiers-le-bacle.xslt
index fd062b1..be446cb 100644
--- a/chimere/tests/villiers-le-bacle.xslt
+++ b/chimere/tests/villiers-le-bacle.xslt
@@ -3,7 +3,8 @@
 <xsl:output method="xml" indent="yes"/>
     <xsl:template match="/">
     <items>
-        <xsl:for-each select="/html/body/div[@id='agenda']/ul/li">
+        <xsl:for-each
+                select="/html/body/div[@id='container']/table[@id='mainpage']/tr/td/div[@id='agenda']/ul/li">
         <item>
             <date><xsl:value-of select="div[@class='ag-date']"/></date>
             <name><xsl:value-of select="div[@class='titre']/a"/></name>
diff --git a/chimere/utils.py b/chimere/utils.py
index f8b7bf5..71a7237 100644
--- a/chimere/utils.py
+++ b/chimere/utils.py
@@ -25,9 +25,11 @@ import csv
 import collections
 import datetime
 import feedparser
+import html
 import io
 import json
 import os
+from tidylib import tidy_document
 import re
 import tempfile
 import urllib
@@ -55,7 +57,7 @@ def unicode_normalize(string):
 
 
 class ImportManager(object):
-    u"""
+    """
     Generic class for specific importers
     """
     default_source = None
@@ -163,7 +165,7 @@ class ImportManager(object):
         try:
             flz = zipfile.ZipFile(zippedfile)
         except zipfile.BadZipfile:
-            return [], _(u"Bad zip file")
+            return [], _("Bad zip file")
         namelist = flz.namelist()
         filenames = []
         for suffix in suffixes:
@@ -200,7 +202,7 @@ class ImportManager(object):
                 if extra_url:
                     url += extra_url
                 remotehandle = urllib.request.urlopen(url)
-                source = io.StringIO(remotehandle.read())
+                source = io.BytesIO(remotehandle.read())
                 remotehandle.close()
             except ValueError:
                 # assume it is a local file
@@ -212,18 +214,20 @@ class ImportManager(object):
                 return (None, str(error))
         if self.importer_instance.zipped:
             try:
-                files = self.get_files_inside_zip(source, suffixes, dest_dir)
+                files = self.get_files_inside_zip(
+                    self.importer_instance.source_file or
+                    self.importer_instance.source, suffixes, dest_dir)
             except zipfile.BadZipfile:
-                return (None, _(u"Bad zip file"))
+                return (None, _("Bad zip file"))
             if not files or None in files or [] in files:
                 return (None,
-                        _(u"Missing file(s) inside the zip file"))
+                        _("Missing file(s) inside the zip file"))
             source = files[0] if len(suffixes) == 1 else files
         return (source, None)
 
 
 class KMLManager(ImportManager):
-    u"""
+    """
     KML importer
     The filtr argument has to be defined as the exact name of the folder to be
     imported
@@ -236,7 +240,7 @@ class KMLManager(ImportManager):
         self.ns = ns
 
     def get(self):
-        u"""
+        """
         Get data from a KML source
 
         Return a tuple with:
@@ -252,15 +256,15 @@ class KMLManager(ImportManager):
         doc = source
         # remove empty lines before declaration (bad XML file)
         if hasattr(source, 'getvalue'):
-            splitted = source.getvalue().split('\n')
+            splitted = source.getvalue().decode('utf-8').split('\n')
             for idx, line in enumerate(splitted):
                 if line.strip():
                     break
-            doc = io.StringIO("\n".join(splitted[idx:]))
+            doc = io.BytesIO("\n".join(splitted[idx:]).encode('utf-8'))
         try:
             tree = etree.parse(doc)
         except:
-            return (0, 0, _(u"Bad XML file"))
+            return (0, 0, _("Bad XML file"))
         # try to get default namespace
         if not self.ns:
             self.ns = tree.getroot().nsmap[None]
@@ -329,11 +333,11 @@ class KMLManager(ImportManager):
 
 
 class ShapefileManager(ImportManager):
-    u"""
+    """
     Shapefile importer
     """
     def get(self):
-        u"""
+        """
         Get data from a Shapefile source
 
         Return a tuple with:
@@ -354,7 +358,7 @@ class ShapefileManager(ImportManager):
         if msg:
             return (0, 0, msg)
         if not sources:
-            return (0, 0, _(u"Error while reading the data source."))
+            return (0, 0, _("Error while reading the data source."))
         # get the srid
         srid = self.importer_instance.srid
         if not srid:
@@ -372,10 +376,10 @@ class ShapefileManager(ImportManager):
             if not srid:
                 # try with the default projection
                 srid = settings.CHIMERE_EPSG_DISPLAY_PROJECTION
-                msg = _(u"SRID cannot be guessed. The default SRID (%s) has "
-                        u"been used.") % srid
+                msg = _("SRID cannot be guessed. The default SRID (%s) has "
+                        "been used.") % srid
                 # If imported items are not well located "
-                #        u"ask your data provider for the SRID to use.") % srid
+                #        "ask your data provider for the SRID to use.") % srid
         shapefilename = tmpdir + os.sep + sources[0]
         ds = DataSource(shapefilename)
         lyr = ds[0]
@@ -387,23 +391,23 @@ class ShapefileManager(ImportManager):
             except ValueError:
                 return (
                     new_item, updated_item,
-                    _(u"Bad configuration: filter must be a valid "
-                      u"JSON string"))
+                    _("Bad configuration: filter must be a valid "
+                      "JSON string"))
             for k in ('id',):
                 if k not in filtr:
                     return (
                         new_item, updated_item,
-                        _(u"The key \"%s\" is missing in the "
-                          u"filter.") % k)
+                        _("The key \"%s\" is missing in the "
+                          "filter.") % k)
             for k in filtr:
                 try:
                     ids = lyr.get_fields(k)
                 except:
                     return (
                         new_item, updated_item,
-                        _(u"Config: {} is not an appropriate column name "
-                          u"for this Shapefile. Available columns "
-                          u" are: {}").format(k, u", ".join(
+                        _("Config: {} is not an appropriate column name "
+                          "for this Shapefile. Available columns "
+                          " are: {}").format(k, ", ".join(
                             [j for j in lyr.fields])))
             default_dct = {'origin': self.importer_instance.origin,
                            'license': self.importer_instance.license}
@@ -427,8 +431,8 @@ class ShapefileManager(ImportManager):
                 filtr["name"] = id_name
 
         if lyr.geom_type not in ('Point', 'LineString', 'Polygon'):
-            return (0, 0, _(u"Type of geographic item (%s) of this shapefile "
-                            u"is not managed by Chimère.") % lyr.geom_type)
+            return (0, 0, _("Type of geographic item (%s) of this shapefile "
+                            "is not managed by Chimère.") % lyr.geom_type)
         geom_key = ''
         geom_cls = None
         if lyr.geom_type == 'Point':
@@ -459,7 +463,7 @@ class ShapefileManager(ImportManager):
             try:
                 geoms = [feat.geom.wkt]
             except:
-                return (0, 0, _(u"Bad Shapefile"))
+                return (0, 0, _("Bad Shapefile"))
             if feat.geom.geom_type == 'MultiLineString':
                 geoms = [geom.wkt for geom in feat.geom]
             import_key = dct.pop('id')
@@ -553,7 +557,7 @@ class ShapefileManager(ImportManager):
 
 
 class CSVManager(ImportManager):
-    u"""
+    """
     CSV importer
     """
     @classmethod
@@ -561,15 +565,15 @@ class CSVManager(ImportManager):
         return
 
     # (label, getter, setter)
-    COLS = [("Id", 'pk', 'pk'), (_(u"Name"), 'name', 'name'),
-            (_(u"Categories"), lambda obj: ", ".join(
+    COLS = [("Id", 'pk', 'pk'), (_("Name"), 'name', 'name'),
+            (_("Categories"), lambda obj: ", ".join(
                 [c.name for c in obj.categories.all()]), set_categories),
-            (_(u"State"), 'status', lambda x: x),
-            (_(u"Description"), 'description', 'description'),
-            (_(u"Localisation"), 'geometry', 'geometry')]
+            (_("State"), 'status', lambda x: x),
+            (_("Description"), 'description', 'description'),
+            (_("Localisation"), 'geometry', 'geometry')]
 
     def get(self):
-        u"""
+        """
         Get data from a CSV source
 
         Return a tuple with:
@@ -594,7 +598,7 @@ class CSVManager(ImportManager):
                 try:
                     assert(len(row) >= len(cols))
                 except AssertionError:
-                    return (0, 0, _(u"Invalid CSV format"))
+                    return (0, 0, _("Invalid CSV format"))
                 continue
             if len(row) < len(cols):
                 continue
@@ -656,13 +660,13 @@ class CSVManager(ImportManager):
 
 
 class GeoRSSManager(ImportManager):
-    u"""
+    """
     RSS importer.
     This manager only gets and do not produce GeoRSSFeed
     """
 
     def get(self):
-        u"""
+        """
         Get data from a GeoRSS simple source
 
         Return a tuple with:
@@ -675,7 +679,7 @@ class GeoRSSManager(ImportManager):
         feed = feedparser.parse(self.importer_instance.source)
         if feed['bozo'] and not isinstance(
                 feed['bozo_exception'], feedparser.CharacterEncodingOverride):
-            return (0, 0, _(u"RSS feed is not well formed"))
+            return (0, 0, _("RSS feed is not well formed"))
         # differ with feed parser version
         item_key = 'items'
         if 'entries' in feed:
@@ -740,13 +744,13 @@ class GeoRSSManager(ImportManager):
 
 
 class JsonManager(ImportManager):
-    u"""
+    """
     Json importer.
     This manager only gets and do not produce Json feed
     """
 
     def get(self):
-        u"""
+        """
         Get data from a json simple source
 
         Return a tuple with:
@@ -760,29 +764,29 @@ class JsonManager(ImportManager):
         if msg:
             return (0, 0, msg)
 
-        vals = str(source.read()).replace('\n', ' ')
+        vals = source.read().decode("utf-8").replace('\n', ' ')
         try:
             values = json.JSONDecoder(
                 object_pairs_hook=collections.OrderedDict).decode(vals)
         except ValueError as e:
             return (new_item, updated_item,
-                    _(u"JSON file is not well formed: ") + str(e))
+                    _("JSON file is not well formed: ") + str(e))
 
         # configuration in filtr
         try:
             filtr = json.JSONDecoder().decode(self.importer_instance.filtr)
         except ValueError:
             return (
                 new_item, updated_item,
-                _(u"Bad configuration: filter field must be a valid "
-                  u"JSON string"))
+                _("Bad configuration: filter field must be a valid "
+                  "JSON string"))
         vls = filtr.values()
         for k in ('name', 'id', 'description'):
             if k not in vls:
                 return (
                     new_item, updated_item,
-                    _(u"A key must be associated to \"%s\" in the "
-                      u"filter.") % k)
+                    _("A key must be associated to \"%s\" in the "
+                      "filter.") % k)
 
         default_dct = {'origin': self.importer_instance.origin,
                        'license': self.importer_instance.license}
@@ -845,7 +849,7 @@ RE_HOOK = re.compile('\[([^\]]*)\]')
 
 
 class OSMManager(ImportManager):
-    u"""
+    """
     OSM importer/exporter
     The source url is a path to an OSM file or a XAPI url
     The filtr argument is XAPI args or empty if it is an OSM file.
@@ -853,7 +857,7 @@ class OSMManager(ImportManager):
     default_source = settings.CHIMERE_XAPI_URL
 
     def get(self):
-        u"""
+        """
         Get data from the source
 
         Return a tuple with:
@@ -872,7 +876,7 @@ class OSMManager(ImportManager):
             return self.import_ways(tree)
         elif tree.xpath('count(//node)'):
             return self.import_nodes(tree)
-        return 0, 0, _(u"Nothing to import")
+        return 0, 0, _("Nothing to import")
 
     def import_ways(self, tree):
         from chimere.models import Route
@@ -956,10 +960,10 @@ class OSMManager(ImportManager):
         if msg:
             return 0, msg
         if new_item:
-            return 0, _(u"New items imported - validate them before exporting")
+            return 0, _("New items imported - validate them before exporting")
         if Marker.objects.filter(status='I').count():
-            return 0, _(u"There are items from a former import not yet "
-                        u"validated - validate them before exporting")
+            return 0, _("There are items from a former import not yet "
+                        "validated - validate them before exporting")
         # start import
         api = settings.CHIMERE_OSM_API_URL
         username = settings.CHIMERE_OSM_USER
@@ -970,17 +974,17 @@ class OSMManager(ImportManager):
                 username = extra_args['username']
                 password = extra_args['password']
             except KeyError:
-                return 0, _(u"Bad params - programming error")
+                return 0, _("Bad params - programming error")
         username = username.encode('latin1')
         password = password.encode('latin1')
         api = OsmApi.OsmApi(api=api, username=username, password=password)
-        api.ChangesetCreate({u"comment": u"Import from Chimère %s" %
+        api.ChangesetCreate({"comment": "Import from Chimère %s" %
                              get_version()})
         hooks = RE_HOOK.findall(self.importer_instance.filtr)
         if not hooks:
             hooks = RE_HOOK.findall(self.importer_instance.source)
             if not hooks:
-                return 0, _(u"Bad param")
+                return 0, _("Bad param")
         tags = {}
         bbox = []
         for hook in hooks:
@@ -995,12 +999,12 @@ class OSMManager(ImportManager):
                 continue
             tags[key] = value
         if not tags:
-            return 0, _(u"No non ambigious tag is defined in the XAPI request")
+            return 0, _("No non ambigious tag is defined in the XAPI request")
         if not bbox:
             return 0, _(
-                u"No bounding box is defined in the XAPI request."
-                u"If you are sure to manage the entire planet set the "
-                u"bounding box to -180,-90,180,90")
+                "No bounding box is defined in the XAPI request."
+                "If you are sure to manage the entire planet set the "
+                "bounding box to -180,-90,180,90")
         default_dct = {'tag': tags,
                        'import_source': self.importer_instance.source}
         idx = -1
@@ -1111,7 +1115,7 @@ class HtmlXsltManager(ImportManager):
     PARSER = 'HTMLParser'
 
     def get(self):
-        u"""
+        """
         Get data from the source
 
         Return a tuple with:
@@ -1125,23 +1129,28 @@ class HtmlXsltManager(ImportManager):
             main_page = urllib.request.urlopen(self.importer_instance.source)
             assert main_page.getcode() == 200
         except (urllib.error.URLError, AssertionError):
-            return (0, 0, _(u"Source page is unreachable."))
+            return (0, 0, _("Source page is unreachable."))
         data = main_page.read()
         encoding = chardet.detect(data)
         data = data.decode(encoding['encoding'])
-        soup = BeautifulSoup(data)
-        main_page = soup.prettify()
-        # convert it to valid XHTML
-        # doc, errors = tidy_document(main_page)
-        doc = main_page
-        dom = etree.HTML(doc, getattr(etree, self.PARSER)())
+        if 'HTML' in self.PARSER:
+            soup = BeautifulSoup(data)
+            main_page = soup.prettify()
+            # convert it to valid XHTML
+            doc, errors = tidy_document(main_page)
+            dom = etree.HTML(doc, getattr(etree, self.PARSER)())
+        else:
+            soup = BeautifulSoup(data, 'xml')
+            main_page = soup.prettify()
+            dom = etree.XML(main_page.encode('utf-8'), getattr(
+                etree, self.PARSER)())
 
         try:
             xslt = etree.parse(self.importer_instance.source_file)
             self.importer_instance.source_file.seek(0)
             transform = etree.XSLT(xslt)
         except (etree.XSLTParseError, etree.XMLSyntaxError, TypeError):
-            return (0, 0, _(u"The source file is not a valid XSLT file."))
+            return (0, 0, _("The source file is not a valid XSLT file."))
         newdom = transform(dom)
         items = []
         # load an alternate xslt file to apply to linked page
@@ -1153,9 +1162,9 @@ class HtmlXsltManager(ImportManager):
                 transform_child = etree.XSLT(alt_xslt)
             except (etree.XSLTParseError, etree.XMLSyntaxError, TypeError):
                 return (0, 0,
-                        _(u"The alt source file is not a valid XSLT file."))
-        base_url = u"/".join(self.importer_instance.source.split('/')[:-1])
-        base_url += u"/"
+                        _("The alt source file is not a valid XSLT file."))
+        base_url = "/".join(self.importer_instance.source.split('/')[:-1])
+        base_url += "/"
         for item in newdom.getroot():
             c_item = {child.tag: clean_field(child.text)
                       for child in item.getchildren() if child.text}
@@ -1183,10 +1192,12 @@ class HtmlXsltManager(ImportManager):
                                    for extra in extra_keys[0].getchildren()})
             items.append(c_item)
         # change relative link to full link, simplify, unescape HTML entities
-        html_unescape = HTMLParser().unescape
+        html_unescape = html.unescape
        for item in items:
             for k in item:
                 val = item[k]
+                if type(val) == bytes:
+                    val = val.decode('utf-8')
                 for r, replaced in RE_CLEANS:
                     val = re.sub(r, replaced % {'base_url': base_url}, val)
                 item[k] = html_unescape(val)
@@ -1198,8 +1209,8 @@ class HtmlXsltManager(ImportManager):
         msg = ''
         if self.missing_cats:
             msg = _(
-                u"Names \"%s\" doesn't match existing categories. "
-                u"Modify the import to match theses names with categories.") %\
+                "Names \"%s\" doesn't match existing categories. "
+                "Modify the import to match theses names with categories.") %\
                 ('", "'.join(self.missing_cats))
 
         return (self.new_item, self.updated_item, msg)
@@ -1229,10 +1240,14 @@ class HtmlXsltManager(ImportManager):
     def parse_date(self, date):
         dct = {}
         has_dates = False
+        if type(date) == bytes:
+            date = date.decode('utf-8')
         for locale in DATE_PARSINGS:
             if has_dates:
                 break
             for r in DATE_PARSINGS[locale]:
+                if not date:
+                    continue
                 m = r.search(date)
                 if not m:
                     continue
@@ -1263,7 +1278,7 @@ class HtmlXsltManager(ImportManager):
         origin_lnk = item.get('link')
         # filter non relevant links
         if origin_lnk and origin_lnk.startswith('http'):
-            origin = u"<a href='%s' target='_blank'>%s</a>" % (
+            origin = "<a href='%s' target='_blank'>%s</a>" % (
                 origin_lnk, origin)
         dct = {
             'origin': origin,
@@ -1308,7 +1323,7 @@ import icalendar
 
 class IcalManager(ImportManager):
     def get(self):
-        u"""
+        """
         Get data from an icalendar source
         """
         from chimere.models import Marker
@@ -1322,7 +1337,7 @@ class IcalManager(ImportManager):
             cal = icalendar.Calendar.from_ical(data)
         except ValueError as e:
             return (new_item, updated_item,
-                    _(u"Error on icalendar parsing: ") + str(e))
+                    _("Error on icalendar parsing: ") + str(e))
 
         default_dct = {'origin': self.importer_instance.origin,
                        'license': self.importer_instance.license}
@@ -1339,10 +1354,10 @@ class IcalManager(ImportManager):
                 dct['description'] = str(dct['description'])
             loc = event.get('LOCATION', None)
             if loc:
-                dct['description'] += u"<br/>{}".format(str(loc))
+                dct['description'] += "<br/>{}".format(str(loc))
             url = event.get('URL', None)
             if url:
-                dct['description'] += u"<br/><a href='{}'>{}</a>".format(
+                dct['description'] += "<br/><a href='{}'>{}</a>".format(
                     str(url), str(_('Link')))
             dct['start_date'] = event.get('DTSTART', None)
             if dct['start_date']:
diff --git a/requirements.txt b/requirements.txt
index 4c314f4..54624f9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,3 +10,4 @@ chardet==2.3
 py3exiv2==0.2.1
 gdal==1.10
 osmapi==0.6.2
+PyTidyLib==0.3.1
```
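
In HtmlXsltManager, `HTMLParser().unescape` is replaced by the stdlib `html.unescape()` (the old method is deprecated in Python 3 and later removed), and values coming out of the XSLT transform get a bytes guard before the regex clean-up. A rough sketch of that guard (illustration only; the sample value is made up):

```python
# Decode XSLT/lxml output if it is still bytes, then unescape HTML entities.
import html

val = b"Caf&eacute; &amp; th&eacute;"
if isinstance(val, bytes):        # mirrors the `type(val) == bytes` check in the diff
    val = val.decode("utf-8")
print(html.unescape(val))         # -> Café & thé
```

The new PyTidyLib entry in requirements.txt backs the `tidy_document` call that now cleans scraped pages before lxml parses them. A minimal usage sketch (illustration only; assumes the libtidy system library is installed):

```python
# tidy_document() returns a (cleaned_markup, warnings) tuple.
from tidylib import tidy_document

doc, errors = tidy_document("<p>an unclosed paragraph<br>")
print(doc)      # well-formed HTML document
print(errors)   # libtidy warnings, e.g. about the missing </p>
```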
