| author | Étienne Loks <etienne.loks@iggdrasil.net> | 2016-11-22 10:21:14 +0100 | 
|---|---|---|
| committer | Étienne Loks <etienne.loks@iggdrasil.net> | 2016-11-22 10:23:12 +0100 | 
| commit | 1bb13300d867e7af7de36dcfaca5833a711cdf06 (patch) | |
| tree | 39a04f397498c634bc5b217aa2a873eca125b7e7 | |
| parent | c748476b7497255ab78ba3be45733c3dd5719a60 (diff) | |
| download | Chimère-1bb13300d867e7af7de36dcfaca5833a711cdf06.tar.bz2 Chimère-1bb13300d867e7af7de36dcfaca5833a711cdf06.zip | |
Importers: many fixes with python3
| -rw-r--r-- | chimere/tests.py | 1 |
| -rw-r--r-- | chimere/tests/magny-xml.xslt | 14 |
| -rw-r--r-- | chimere/tests/villiers-le-bacle.xslt | 3 |
| -rw-r--r-- | chimere/utils.py | 173 |
| -rw-r--r-- | requirements.txt | 1 |
5 files changed, 105 insertions, 87 deletions
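
Most of the churn in chimere/utils.py follows from Python 3's bytes/str split: `urllib.request.urlopen().read()` now returns bytes, so downloads are wrapped in `io.BytesIO` instead of `io.StringIO` and decoded explicitly where text is needed, while the now-redundant `u""` string prefixes are dropped. A minimal sketch of that pattern (illustration only, with a made-up URL, not code from the commit):

```python
# Python 3: HTTP responses are bytes; keep them as bytes until a consumer
# actually needs text, then decode explicitly.
import io
import urllib.request

url = "https://example.org/export.json"    # hypothetical source URL
remotehandle = urllib.request.urlopen(url)
source = io.BytesIO(remotehandle.read())   # was io.StringIO(...) under Python 2
remotehandle.close()

text = source.read().decode("utf-8")       # e.g. what JsonManager now does before decoding JSON
```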
```diff
diff --git a/chimere/tests.py b/chimere/tests.py
index e9f8e1f..819782a 100644
--- a/chimere/tests.py
+++ b/chimere/tests.py
@@ -194,6 +194,7 @@ class ImporterTest:
 class KMLImporterTest(TestCase, ImporterTest):
     def setUp(self):
         subcategories = subcategory_setup()
+
         importer1 = Importer.objects.create(
             importer_type='KML',
             source=test_dir_path + 'tests/sample.kml',
diff --git a/chimere/tests/magny-xml.xslt b/chimere/tests/magny-xml.xslt
index 1da9212..2b0333d 100644
--- a/chimere/tests/magny-xml.xslt
+++ b/chimere/tests/magny-xml.xslt
@@ -5,14 +5,14 @@
     <items>
         <xsl:for-each select="/nodes/node">
         <item>
-            <date><xsl:value-of select="date"/></date>
-            <name>Magny-les-hameaux : <xsl:value-of select="titre"/></name>
+            <date><xsl:value-of select="Date"/></date>
+            <name>Magny-les-hameaux : <xsl:value-of select="Titre"/></name>
             <category><xsl:value-of select="tiquette"/></category>
-            <link><xsl:value-of select="lien"/></link>
-            <description><xsl:value-of select="corps"/></description>
-            <key>magny-les-hameaux-<xsl:value-of select="titre"/></key>
-            <lat><xsl:value-of select="latitude"/></lat>
-            <lon><xsl:value-of select="longitude"/></lon>
+            <link><xsl:value-of select="Lien"/></link>
+            <description><xsl:value-of select="Corps"/></description>
+            <key>magny-les-hameaux-<xsl:value-of select="Titre"/></key>
+            <lat><xsl:value-of select="Latitude"/></lat>
+            <lon><xsl:value-of select="Longitude"/></lon>
         </item>
         </xsl:for-each>
     </items>
diff --git a/chimere/tests/villiers-le-bacle.xslt b/chimere/tests/villiers-le-bacle.xslt
index fd062b1..be446cb 100644
--- a/chimere/tests/villiers-le-bacle.xslt
+++ b/chimere/tests/villiers-le-bacle.xslt
@@ -3,7 +3,8 @@
 <xsl:output method="xml" indent="yes"/>
     <xsl:template match="/">
     <items>
-        <xsl:for-each select="/html/body/div[@id='agenda']/ul/li">
+        <xsl:for-each
+                select="/html/body/div[@id='container']/table[@id='mainpage']/tr/td/div[@id='agenda']/ul/li">
         <item>
             <date><xsl:value-of select="div[@class='ag-date']"/></date>
             <name><xsl:value-of select="div[@class='titre']/a"/></name>
diff --git a/chimere/utils.py b/chimere/utils.py
index f8b7bf5..71a7237 100644
--- a/chimere/utils.py
+++ b/chimere/utils.py
@@ -25,9 +25,11 @@ import csv
 import collections
 import datetime
 import feedparser
+import html
 import io
 import json
 import os
+from tidylib import tidy_document
 import re
 import tempfile
 import urllib
@@ -55,7 +57,7 @@ def unicode_normalize(string):
 
 
 class ImportManager(object):
-    u"""
+    """
     Generic class for specific importers
     """
     default_source = None
@@ -163,7 +165,7 @@ class ImportManager(object):
         try:
             flz = zipfile.ZipFile(zippedfile)
         except zipfile.BadZipfile:
-            return [], _(u"Bad zip file")
+            return [], _("Bad zip file")
         namelist = flz.namelist()
         filenames = []
         for suffix in suffixes:
@@ -200,7 +202,7 @@ class ImportManager(object):
                 if extra_url:
                     url += extra_url
                 remotehandle = urllib.request.urlopen(url)
-                source = io.StringIO(remotehandle.read())
+                source = io.BytesIO(remotehandle.read())
                 remotehandle.close()
             except ValueError:
                 # assume it is a local file
@@ -212,18 +214,20 @@ class ImportManager(object):
                 return (None, str(error))
         if self.importer_instance.zipped:
             try:
-                files = self.get_files_inside_zip(source, suffixes, dest_dir)
+                files = self.get_files_inside_zip(
+                    self.importer_instance.source_file or
+                    self.importer_instance.source, suffixes, dest_dir)
             except zipfile.BadZipfile:
-                return (None, _(u"Bad zip file"))
+                return (None, _("Bad zip file"))
             if not files or None in files or [] in files:
                 return (None,
-                        _(u"Missing file(s) inside the zip file"))
+                        _("Missing file(s) inside the zip file"))
             source = files[0] if len(suffixes) == 1 else files
         return (source, None)
 
 
 class KMLManager(ImportManager):
-    u"""
+    """
     KML importer
     The filtr argument has to be defined as the exact name of the folder to be
     imported
@@ -236,7 +240,7 @@ class KMLManager(ImportManager):
         self.ns = ns
 
     def get(self):
-        u"""
+        """
         Get data from a KML source
 
         Return a tuple with:
@@ -252,15 +256,15 @@ class KMLManager(ImportManager):
         doc = source
         # remove empty lines before declaration (bad XML file)
         if hasattr(source, 'getvalue'):
-            splitted = source.getvalue().split('\n')
+            splitted = source.getvalue().decode('utf-8').split('\n')
             for idx, line in enumerate(splitted):
                 if line.strip():
                     break
-            doc = io.StringIO("\n".join(splitted[idx:]))
+            doc = io.BytesIO("\n".join(splitted[idx:]).encode('utf-8'))
         try:
             tree = etree.parse(doc)
         except:
-            return (0, 0, _(u"Bad XML file"))
+            return (0, 0, _("Bad XML file"))
         # try to get default namespace
         if not self.ns:
             self.ns = tree.getroot().nsmap[None]
@@ -329,11 +333,11 @@ class KMLManager(ImportManager):
 
 
 class ShapefileManager(ImportManager):
-    u"""
+    """
     Shapefile importer
     """
     def get(self):
-        u"""
+        """
         Get data from a Shapefile source
 
         Return a tuple with:
@@ -354,7 +358,7 @@ class ShapefileManager(ImportManager):
         if msg:
             return (0, 0, msg)
         if not sources:
-            return (0, 0, _(u"Error while reading the data source."))
+            return (0, 0, _("Error while reading the data source."))
         # get the srid
         srid = self.importer_instance.srid
         if not srid:
@@ -372,10 +376,10 @@ class ShapefileManager(ImportManager):
             if not srid:
                 # try with the default projection
                 srid = settings.CHIMERE_EPSG_DISPLAY_PROJECTION
-                msg = _(u"SRID cannot be guessed. The default SRID (%s) has "
-                        u"been used.") % srid
+                msg = _("SRID cannot be guessed. The default SRID (%s) has "
+                        "been used.") % srid
                 # If imported items are not well located "
-                #        u"ask your data provider for the SRID to use.") % srid
+                #        "ask your data provider for the SRID to use.") % srid
         shapefilename = tmpdir + os.sep + sources[0]
         ds = DataSource(shapefilename)
         lyr = ds[0]
@@ -387,23 +391,23 @@ class ShapefileManager(ImportManager):
             except ValueError:
                 return (
                     new_item, updated_item,
-                    _(u"Bad configuration: filter must be a valid "
-                      u"JSON string"))
+                    _("Bad configuration: filter must be a valid "
+                      "JSON string"))
             for k in ('id',):
                 if k not in filtr:
                     return (
                         new_item, updated_item,
-                        _(u"The key \"%s\" is missing in the "
-                          u"filter.") % k)
+                        _("The key \"%s\" is missing in the "
+                          "filter.") % k)
             for k in filtr:
                 try:
                     ids = lyr.get_fields(k)
                 except:
                     return (
                         new_item, updated_item,
-                        _(u"Config: {} is not an appropriate column name "
-                          u"for this Shapefile. Available columns "
-                          u" are: {}").format(k, u", ".join(
+                        _("Config: {} is not an appropriate column name "
+                          "for this Shapefile. Available columns "
+                          " are: {}").format(k, ", ".join(
                             [j for j in lyr.fields])))
             default_dct = {'origin': self.importer_instance.origin,
                            'license': self.importer_instance.license}
@@ -427,8 +431,8 @@ class ShapefileManager(ImportManager):
                 filtr["name"] = id_name
 
         if lyr.geom_type not in ('Point', 'LineString', 'Polygon'):
-            return (0, 0, _(u"Type of geographic item (%s) of this shapefile "
-                            u"is not managed by Chimère.") % lyr.geom_type)
+            return (0, 0, _("Type of geographic item (%s) of this shapefile "
+                            "is not managed by Chimère.") % lyr.geom_type)
         geom_key = ''
         geom_cls = None
         if lyr.geom_type == 'Point':
@@ -459,7 +463,7 @@ class ShapefileManager(ImportManager):
             try:
                 geoms = [feat.geom.wkt]
             except:
-                return (0, 0, _(u"Bad Shapefile"))
+                return (0, 0, _("Bad Shapefile"))
             if feat.geom.geom_type == 'MultiLineString':
                 geoms = [geom.wkt for geom in feat.geom]
             import_key = dct.pop('id')
@@ -553,7 +557,7 @@ class ShapefileManager(ImportManager):
 
 
 class CSVManager(ImportManager):
-    u"""
+    """
     CSV importer
     """
     @classmethod
@@ -561,15 +565,15 @@ class CSVManager(ImportManager):
         return
 
     # (label, getter, setter)
-    COLS = [("Id", 'pk', 'pk'), (_(u"Name"), 'name', 'name'),
-            (_(u"Categories"), lambda obj: ", ".join(
+    COLS = [("Id", 'pk', 'pk'), (_("Name"), 'name', 'name'),
+            (_("Categories"), lambda obj: ", ".join(
                 [c.name for c in obj.categories.all()]), set_categories),
-            (_(u"State"), 'status', lambda x: x),
-            (_(u"Description"), 'description', 'description'),
-            (_(u"Localisation"), 'geometry', 'geometry')]
+            (_("State"), 'status', lambda x: x),
+            (_("Description"), 'description', 'description'),
+            (_("Localisation"), 'geometry', 'geometry')]
 
     def get(self):
-        u"""
+        """
         Get data from a CSV source
 
         Return a tuple with:
@@ -594,7 +598,7 @@ class CSVManager(ImportManager):
                 try:
                     assert(len(row) >= len(cols))
                 except AssertionError:
-                    return (0, 0, _(u"Invalid CSV format"))
+                    return (0, 0, _("Invalid CSV format"))
                 continue
             if len(row) < len(cols):
                 continue
@@ -656,13 +660,13 @@ class CSVManager(ImportManager):
 
 
 class GeoRSSManager(ImportManager):
-    u"""
+    """
     RSS importer.
     This manager only gets and do not produce GeoRSSFeed
     """
 
     def get(self):
-        u"""
+        """
         Get data from a GeoRSS simple source
 
         Return a tuple with:
@@ -675,7 +679,7 @@ class GeoRSSManager(ImportManager):
         feed = feedparser.parse(self.importer_instance.source)
         if feed['bozo'] and not isinstance(
                 feed['bozo_exception'], feedparser.CharacterEncodingOverride):
-            return (0, 0, _(u"RSS feed is not well formed"))
+            return (0, 0, _("RSS feed is not well formed"))
         # differ with feed parser version
         item_key = 'items'
         if 'entries' in feed:
@@ -740,13 +744,13 @@ class GeoRSSManager(ImportManager):
 
 
 class JsonManager(ImportManager):
-    u"""
+    """
     Json importer.
     This manager only gets and do not produce Json feed
     """
 
     def get(self):
-        u"""
+        """
         Get data from a json simple source
 
         Return a tuple with:
@@ -760,29 +764,29 @@ class JsonManager(ImportManager):
         if msg:
             return (0, 0, msg)
 
-        vals = str(source.read()).replace('\n', ' ')
+        vals = source.read().decode("utf-8").replace('\n', ' ')
         try:
             values = json.JSONDecoder(
                 object_pairs_hook=collections.OrderedDict).decode(vals)
         except ValueError as e:
             return (new_item, updated_item,
-                    _(u"JSON file is not well formed: ") + str(e))
+                    _("JSON file is not well formed: ") + str(e))
 
         # configuration in filtr
         try:
             filtr = json.JSONDecoder().decode(self.importer_instance.filtr)
         except ValueError:
             return (
                 new_item, updated_item,
-                _(u"Bad configuration: filter field must be a valid "
-                  u"JSON string"))
+                _("Bad configuration: filter field must be a valid "
+                  "JSON string"))
         vls = filtr.values()
         for k in ('name', 'id', 'description'):
             if k not in vls:
                 return (
                     new_item, updated_item,
-                    _(u"A key must be associated to \"%s\" in the "
-                      u"filter.") % k)
+                    _("A key must be associated to \"%s\" in the "
+                      "filter.") % k)
 
         default_dct = {'origin': self.importer_instance.origin,
                        'license': self.importer_instance.license}
@@ -845,7 +849,7 @@ RE_HOOK = re.compile('\[([^\]]*)\]')
 
 
 class OSMManager(ImportManager):
-    u"""
+    """
     OSM importer/exporter
     The source url is a path to an OSM file or a XAPI url
     The filtr argument is XAPI args or empty if it is an OSM file.
@@ -853,7 +857,7 @@ class OSMManager(ImportManager):
     default_source = settings.CHIMERE_XAPI_URL
 
     def get(self):
-        u"""
+        """
         Get data from the source
 
         Return a tuple with:
@@ -872,7 +876,7 @@ class OSMManager(ImportManager):
             return self.import_ways(tree)
         elif tree.xpath('count(//node)'):
             return self.import_nodes(tree)
-        return 0, 0, _(u"Nothing to import")
+        return 0, 0, _("Nothing to import")
 
     def import_ways(self, tree):
         from chimere.models import Route
@@ -956,10 +960,10 @@ class OSMManager(ImportManager):
         if msg:
             return 0, msg
         if new_item:
-            return 0, _(u"New items imported - validate them before exporting")
+            return 0, _("New items imported - validate them before exporting")
         if Marker.objects.filter(status='I').count():
-            return 0, _(u"There are items from a former import not yet "
-                        u"validated - validate them before exporting")
+            return 0, _("There are items from a former import not yet "
+                        "validated - validate them before exporting")
         # start import
         api = settings.CHIMERE_OSM_API_URL
         username = settings.CHIMERE_OSM_USER
@@ -970,17 +974,17 @@ class OSMManager(ImportManager):
                 username = extra_args['username']
                 password = extra_args['password']
             except KeyError:
-                return 0, _(u"Bad params - programming error")
+                return 0, _("Bad params - programming error")
         username = username.encode('latin1')
         password = password.encode('latin1')
         api = OsmApi.OsmApi(api=api, username=username, password=password)
-        api.ChangesetCreate({u"comment": u"Import from Chimère %s" %
+        api.ChangesetCreate({"comment": "Import from Chimère %s" %
                              get_version()})
         hooks = RE_HOOK.findall(self.importer_instance.filtr)
         if not hooks:
             hooks = RE_HOOK.findall(self.importer_instance.source)
             if not hooks:
-                return 0, _(u"Bad param")
+                return 0, _("Bad param")
         tags = {}
         bbox = []
         for hook in hooks:
@@ -995,12 +999,12 @@ class OSMManager(ImportManager):
                 continue
             tags[key] = value
         if not tags:
-            return 0, _(u"No non ambigious tag is defined in the XAPI request")
+            return 0, _("No non ambigious tag is defined in the XAPI request")
         if not bbox:
             return 0, _(
-                u"No bounding box is defined in the XAPI request."
-                u"If you are sure to manage the entire planet set the "
-                u"bounding box to -180,-90,180,90")
+                "No bounding box is defined in the XAPI request."
+                "If you are sure to manage the entire planet set the "
+                "bounding box to -180,-90,180,90")
         default_dct = {'tag': tags,
                        'import_source': self.importer_instance.source}
         idx = -1
@@ -1111,7 +1115,7 @@ class HtmlXsltManager(ImportManager):
     PARSER = 'HTMLParser'
 
     def get(self):
-        u"""
+        """
         Get data from the source
 
         Return a tuple with:
@@ -1125,23 +1129,28 @@ class HtmlXsltManager(ImportManager):
             main_page = urllib.request.urlopen(self.importer_instance.source)
             assert main_page.getcode() == 200
         except (urllib.error.URLError, AssertionError):
-            return (0, 0, _(u"Source page is unreachable."))
+            return (0, 0, _("Source page is unreachable."))
         data = main_page.read()
         encoding = chardet.detect(data)
         data = data.decode(encoding['encoding'])
-        soup = BeautifulSoup(data)
-        main_page = soup.prettify()
-        # convert it to valid XHTML
-        # doc, errors = tidy_document(main_page)
-        doc = main_page
-        dom = etree.HTML(doc, getattr(etree, self.PARSER)())
+        if 'HTML' in self.PARSER:
+            soup = BeautifulSoup(data)
+            main_page = soup.prettify()
+            # convert it to valid XHTML
+            doc, errors = tidy_document(main_page)
+            dom = etree.HTML(doc, getattr(etree, self.PARSER)())
+        else:
+            soup = BeautifulSoup(data, 'xml')
+            main_page = soup.prettify()
+            dom = etree.XML(main_page.encode('utf-8'), getattr(
+                etree, self.PARSER)())
 
         try:
             xslt = etree.parse(self.importer_instance.source_file)
             self.importer_instance.source_file.seek(0)
             transform = etree.XSLT(xslt)
         except (etree.XSLTParseError, etree.XMLSyntaxError, TypeError):
-            return (0, 0, _(u"The source file is not a valid XSLT file."))
+            return (0, 0, _("The source file is not a valid XSLT file."))
         newdom = transform(dom)
         items = []
         # load an alternate xslt file to apply to linked page
@@ -1153,9 +1162,9 @@ class HtmlXsltManager(ImportManager):
                 transform_child = etree.XSLT(alt_xslt)
             except (etree.XSLTParseError, etree.XMLSyntaxError, TypeError):
                 return (0, 0,
-                        _(u"The alt source file is not a valid XSLT file."))
-        base_url = u"/".join(self.importer_instance.source.split('/')[:-1])
-        base_url += u"/"
+                        _("The alt source file is not a valid XSLT file."))
+        base_url = "/".join(self.importer_instance.source.split('/')[:-1])
+        base_url += "/"
         for item in newdom.getroot():
             c_item = {child.tag: clean_field(child.text)
                       for child in item.getchildren() if child.text}
@@ -1183,10 +1192,12 @@ class HtmlXsltManager(ImportManager):
                                    for extra in extra_keys[0].getchildren()})
             items.append(c_item)
         # change relative link to full link, simplify, unescape HTML entities
-        html_unescape = HTMLParser().unescape
+        html_unescape = html.unescape
        for item in items:
             for k in item:
                 val = item[k]
+                if type(val) == bytes:
+                    val = val.decode('utf-8')
                 for r, replaced in RE_CLEANS:
                     val = re.sub(r, replaced % {'base_url': base_url}, val)
                 item[k] = html_unescape(val)
@@ -1198,8 +1209,8 @@ class HtmlXsltManager(ImportManager):
         msg = ''
         if self.missing_cats:
             msg = _(
-                u"Names \"%s\" doesn't match existing categories. "
-                u"Modify the import to match theses names with categories.") %\
+                "Names \"%s\" doesn't match existing categories. "
+                "Modify the import to match theses names with categories.") %\
                 ('", "'.join(self.missing_cats))
 
         return (self.new_item, self.updated_item, msg)
@@ -1229,10 +1240,14 @@ class HtmlXsltManager(ImportManager):
     def parse_date(self, date):
         dct = {}
         has_dates = False
+        if type(date) == bytes:
+            date = date.decode('utf-8')
         for locale in DATE_PARSINGS:
             if has_dates:
                 break
             for r in DATE_PARSINGS[locale]:
+                if not date:
+                    continue
                 m = r.search(date)
                 if not m:
                     continue
@@ -1263,7 +1278,7 @@ class HtmlXsltManager(ImportManager):
         origin_lnk = item.get('link')
         # filter non relevant links
         if origin_lnk and origin_lnk.startswith('http'):
-            origin = u"<a href='%s' target='_blank'>%s</a>" % (
+            origin = "<a href='%s' target='_blank'>%s</a>" % (
                 origin_lnk, origin)
         dct = {
             'origin': origin,
@@ -1308,7 +1323,7 @@ import icalendar
 
 class IcalManager(ImportManager):
     def get(self):
-        u"""
+        """
         Get data from an icalendar source
         """
         from chimere.models import Marker
@@ -1322,7 +1337,7 @@ class IcalManager(ImportManager):
             cal = icalendar.Calendar.from_ical(data)
         except ValueError as e:
             return (new_item, updated_item,
-                    _(u"Error on icalendar parsing: ") + str(e))
+                    _("Error on icalendar parsing: ") + str(e))
 
         default_dct = {'origin': self.importer_instance.origin,
                        'license': self.importer_instance.license}
@@ -1339,10 +1354,10 @@ class IcalManager(ImportManager):
                 dct['description'] = str(dct['description'])
             loc = event.get('LOCATION', None)
             if loc:
-                dct['description'] += u"<br/>{}".format(str(loc))
+                dct['description'] += "<br/>{}".format(str(loc))
             url = event.get('URL', None)
             if url:
-                dct['description'] += u"<br/><a href='{}'>{}</a>".format(
+                dct['description'] += "<br/><a href='{}'>{}</a>".format(
                     str(url), str(_('Link')))
             dct['start_date'] = event.get('DTSTART', None)
             if dct['start_date']:
diff --git a/requirements.txt b/requirements.txt
index 4c314f4..54624f9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,3 +10,4 @@ chardet==2.3
 py3exiv2==0.2.1
 gdal==1.10
 osmapi==0.6.2
+PyTidyLib==0.3.1
```
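
In HtmlXsltManager, `HTMLParser().unescape` is replaced by the stdlib `html.unescape()` (the old method is deprecated in Python 3 and later removed), and values coming out of the XSLT transform get a bytes guard before the regex clean-up. A rough sketch of that guard (illustration only; the sample value is made up):

```python
# Decode XSLT/lxml output if it is still bytes, then unescape HTML entities.
import html

val = b"Caf&eacute; &amp; th&eacute;"
if isinstance(val, bytes):        # mirrors the `type(val) == bytes` check in the diff
    val = val.decode("utf-8")
print(html.unescape(val))         # -> Café & thé
```

The new PyTidyLib entry in requirements.txt backs the `tidy_document` call that now cleans scraped pages before lxml parses them. A minimal usage sketch (illustration only; assumes the libtidy system library is installed):

```python
# tidy_document() returns a (cleaned_markup, warnings) tuple.
from tidylib import tidy_document

doc, errors = tidy_document("<p>an unclosed paragraph<br>")
print(doc)      # well-formed HTML document
print(errors)   # libtidy warnings, e.g. about the missing </p>
```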
