author     | Étienne Loks <etienne.loks@iggdrasil.net> | 2016-11-22 10:21:14 +0100
committer  | Étienne Loks <etienne.loks@iggdrasil.net> | 2016-11-22 10:23:12 +0100
commit     | 1bb13300d867e7af7de36dcfaca5833a711cdf06 (patch)
tree       | 39a04f397498c634bc5b217aa2a873eca125b7e7 /chimere/utils.py
parent     | c748476b7497255ab78ba3be45733c3dd5719a60 (diff)
download   | Chimère-1bb13300d867e7af7de36dcfaca5833a711cdf06.tar.bz2
           | Chimère-1bb13300d867e7af7de36dcfaca5833a711cdf06.zip
Importers: many fixes with python3
Diffstat (limited to 'chimere/utils.py')
-rw-r--r-- | chimere/utils.py | 173
1 file changed, 94 insertions, 79 deletions
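
Most of the hunks below apply the same Python 3 pattern: urllib.request.urlopen() now returns bytes, so payloads are kept in an io.BytesIO buffer and decoded explicitly where text is needed, instead of being wrapped in io.StringIO. A minimal standalone sketch of that pattern, outside of the Chimère classes and with a made-up URL (not taken from this commit):

    import io
    import urllib.request

    # Hypothetical URL, for illustration only.
    url = "https://example.org/data.kml"

    # Python 3: urlopen() returns bytes, so keep the raw payload in a
    # BytesIO buffer rather than a StringIO one.
    with urllib.request.urlopen(url) as remotehandle:
        source = io.BytesIO(remotehandle.read())

    # Decode only where str is actually required, e.g. to strip blank
    # lines in front of an XML declaration, then re-encode for lxml.
    splitted = source.getvalue().decode("utf-8").split("\n")
    idx = next((i for i, line in enumerate(splitted) if line.strip()), 0)
    doc = io.BytesIO("\n".join(splitted[idx:]).encode("utf-8"))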
diff --git a/chimere/utils.py b/chimere/utils.py
index f8b7bf5..71a7237 100644
--- a/chimere/utils.py
+++ b/chimere/utils.py
@@ -25,9 +25,11 @@ import csv
 import collections
 import datetime
 import feedparser
+import html
 import io
 import json
 import os
+from tidylib import tidy_document
 import re
 import tempfile
 import urllib
@@ -55,7 +57,7 @@ def unicode_normalize(string):
 
 
 class ImportManager(object):
-    u"""
+    """
     Generic class for specific importers
     """
     default_source = None
@@ -163,7 +165,7 @@ class ImportManager(object):
         try:
             flz = zipfile.ZipFile(zippedfile)
         except zipfile.BadZipfile:
-            return [], _(u"Bad zip file")
+            return [], _("Bad zip file")
         namelist = flz.namelist()
         filenames = []
         for suffix in suffixes:
@@ -200,7 +202,7 @@ class ImportManager(object):
             if extra_url:
                 url += extra_url
             remotehandle = urllib.request.urlopen(url)
-            source = io.StringIO(remotehandle.read())
+            source = io.BytesIO(remotehandle.read())
             remotehandle.close()
         except ValueError:
             # assume it is a local file
@@ -212,18 +214,20 @@ class ImportManager(object):
             return (None, str(error))
         if self.importer_instance.zipped:
             try:
-                files = self.get_files_inside_zip(source, suffixes, dest_dir)
+                files = self.get_files_inside_zip(
+                    self.importer_instance.source_file or
+                    self.importer_instance.source , suffixes, dest_dir)
             except zipfile.BadZipfile:
-                return (None, _(u"Bad zip file"))
+                return (None, _("Bad zip file"))
             if not files or None in files or [] in files:
                 return (None,
-                        _(u"Missing file(s) inside the zip file"))
+                        _("Missing file(s) inside the zip file"))
             source = files[0] if len(suffixes) == 1 else files
         return (source, None)
 
 
 class KMLManager(ImportManager):
-    u"""
+    """
     KML importer
 
     The filtr argument has to be defined as the exact name of the folder to be imported
@@ -236,7 +240,7 @@ class KMLManager(ImportManager):
         self.ns = ns
 
     def get(self):
-        u"""
+        """
         Get data from a KML source
 
         Return a tuple with:
@@ -252,15 +256,15 @@ class KMLManager(ImportManager):
         doc = source
         # remove empty lines before declaration (bad XML file)
         if hasattr(source, 'getvalue'):
-            splitted = source.getvalue().split('\n')
+            splitted = source.getvalue().decode('utf-8').split('\n')
             for idx, line in enumerate(splitted):
                 if line.strip():
                     break
-            doc = io.StringIO("\n".join(splitted[idx:]))
+            doc = io.BytesIO("\n".join(splitted[idx:]).encode('utf-8'))
         try:
             tree = etree.parse(doc)
         except:
-            return (0, 0, _(u"Bad XML file"))
+            return (0, 0, _("Bad XML file"))
         # try to get default namespace
         if not self.ns:
             self.ns = tree.getroot().nsmap[None]
@@ -329,11 +333,11 @@ class KMLManager(ImportManager):
 
 
 class ShapefileManager(ImportManager):
-    u"""
+    """
     Shapefile importer
     """
     def get(self):
-        u"""
+        """
         Get data from a Shapefile source
 
         Return a tuple with:
@@ -354,7 +358,7 @@ class ShapefileManager(ImportManager):
         if msg:
             return (0, 0, msg)
         if not sources:
-            return (0, 0, _(u"Error while reading the data source."))
+            return (0, 0, _("Error while reading the data source."))
         # get the srid
         srid = self.importer_instance.srid
         if not srid:
@@ -372,10 +376,10 @@ class ShapefileManager(ImportManager):
         if not srid:
             # try with the default projection
            srid = settings.CHIMERE_EPSG_DISPLAY_PROJECTION
-            msg = _(u"SRID cannot be guessed. The default SRID (%s) has "
-                    u"been used.") % srid
+            msg = _("SRID cannot be guessed. The default SRID (%s) has "
+                    "been used.") % srid
             # If imported items are not well located "
-            # u"ask your data provider for the SRID to use.") % srid
+            # "ask your data provider for the SRID to use.") % srid
         shapefilename = tmpdir + os.sep + sources[0]
         ds = DataSource(shapefilename)
         lyr = ds[0]
@@ -387,23 +391,23 @@ class ShapefileManager(ImportManager):
         except ValueError:
             return (
                 new_item, updated_item,
-                _(u"Bad configuration: filter must be a valid "
-                  u"JSON string"))
+                _("Bad configuration: filter must be a valid "
+                  "JSON string"))
         for k in ('id',):
             if k not in filtr:
                 return (
                     new_item, updated_item,
-                    _(u"The key \"%s\" is missing in the "
-                      u"filter.") % k)
+                    _("The key \"%s\" is missing in the "
+                      "filter.") % k)
         for k in filtr:
             try:
                 ids = lyr.get_fields(k)
             except:
                 return (
                     new_item, updated_item,
-                    _(u"Config: {} is not an appropriate column name "
-                      u"for this Shapefile. Available columns "
-                      u" are: {}").format(k, u", ".join(
+                    _("Config: {} is not an appropriate column name "
+                      "for this Shapefile. Available columns "
+                      " are: {}").format(k, ", ".join(
                         [j for j in lyr.fields])))
         default_dct = {'origin': self.importer_instance.origin,
                        'license': self.importer_instance.license}
@@ -427,8 +431,8 @@ class ShapefileManager(ImportManager):
             filtr["name"] = id_name
 
         if lyr.geom_type not in ('Point', 'LineString', 'Polygon'):
-            return (0, 0, _(u"Type of geographic item (%s) of this shapefile "
-                            u"is not managed by Chimère.") % lyr.geom_type)
+            return (0, 0, _("Type of geographic item (%s) of this shapefile "
+                            "is not managed by Chimère.") % lyr.geom_type)
         geom_key = ''
         geom_cls = None
         if lyr.geom_type == 'Point':
@@ -459,7 +463,7 @@ class ShapefileManager(ImportManager):
             try:
                 geoms = [feat.geom.wkt]
             except:
-                return (0, 0, _(u"Bad Shapefile"))
+                return (0, 0, _("Bad Shapefile"))
             if feat.geom.geom_type == 'MultiLineString':
                 geoms = [geom.wkt for geom in feat.geom]
             import_key = dct.pop('id')
@@ -553,7 +557,7 @@ class ShapefileManager(ImportManager):
 
 
 class CSVManager(ImportManager):
-    u"""
+    """
     CSV importer
     """
     @classmethod
@@ -561,15 +565,15 @@ class CSVManager(ImportManager):
         return
 
     # (label, getter, setter)
-    COLS = [("Id", 'pk', 'pk'), (_(u"Name"), 'name', 'name'),
-            (_(u"Categories"), lambda obj: ", ".join(
+    COLS = [("Id", 'pk', 'pk'), (_("Name"), 'name', 'name'),
+            (_("Categories"), lambda obj: ", ".join(
                 [c.name for c in obj.categories.all()]), set_categories),
-            (_(u"State"), 'status', lambda x: x),
-            (_(u"Description"), 'description', 'description'),
-            (_(u"Localisation"), 'geometry', 'geometry')]
+            (_("State"), 'status', lambda x: x),
+            (_("Description"), 'description', 'description'),
+            (_("Localisation"), 'geometry', 'geometry')]
 
     def get(self):
-        u"""
+        """
         Get data from a CSV source
 
         Return a tuple with:
@@ -594,7 +598,7 @@ class CSVManager(ImportManager):
                 try:
                     assert(len(row) >= len(cols))
                 except AssertionError:
-                    return (0, 0, _(u"Invalid CSV format"))
+                    return (0, 0, _("Invalid CSV format"))
                 continue
             if len(row) < len(cols):
                 continue
@@ -656,13 +660,13 @@ class CSVManager(ImportManager):
 
 
 class GeoRSSManager(ImportManager):
-    u"""
+    """
     RSS importer.
     This manager only gets and do not produce GeoRSSFeed
     """
 
     def get(self):
-        u"""
+        """
         Get data from a GeoRSS simple source
 
         Return a tuple with:
@@ -675,7 +679,7 @@ class GeoRSSManager(ImportManager):
         feed = feedparser.parse(self.importer_instance.source)
         if feed['bozo'] and not isinstance(
                 feed['bozo_exception'], feedparser.CharacterEncodingOverride):
-            return (0, 0, _(u"RSS feed is not well formed"))
+            return (0, 0, _("RSS feed is not well formed"))
         # differ with feed parser version
         item_key = 'items'
         if 'entries' in feed:
@@ -740,13 +744,13 @@ class GeoRSSManager(ImportManager):
 
 
 class JsonManager(ImportManager):
-    u"""
+    """
     Json importer.
     This manager only gets and do not produce Json feed
     """
 
     def get(self):
-        u"""
+        """
         Get data from a json simple source
 
         Return a tuple with:
@@ -760,29 +764,29 @@ class JsonManager(ImportManager):
         if msg:
             return (0, 0, msg)
 
-        vals = str(source.read()).replace('\n', ' ')
+        vals = source.read().decode("utf-8").replace('\n', ' ')
         try:
             values = json.JSONDecoder(
                 object_pairs_hook=collections.OrderedDict).decode(vals)
         except ValueError as e:
             return (new_item, updated_item,
-                    _(u"JSON file is not well formed: ") + str(e))
+                    _("JSON file is not well formed: ") + str(e))
 
         # configuration in filtr
         try:
             filtr = json.JSONDecoder().decode(self.importer_instance.filtr)
         except ValueError:
             return (
                 new_item, updated_item,
-                _(u"Bad configuration: filter field must be a valid "
-                  u"JSON string"))
+                _("Bad configuration: filter field must be a valid "
+                  "JSON string"))
         vls = filtr.values()
         for k in ('name', 'id', 'description'):
             if k not in vls:
                 return (
                     new_item, updated_item,
-                    _(u"A key must be associated to \"%s\" in the "
-                      u"filter.") % k)
+                    _("A key must be associated to \"%s\" in the "
+                      "filter.") % k)
 
         default_dct = {'origin': self.importer_instance.origin,
                        'license': self.importer_instance.license}
@@ -845,7 +849,7 @@ RE_HOOK = re.compile('\[([^\]]*)\]')
 
 
 class OSMManager(ImportManager):
-    u"""
+    """
     OSM importer/exporter
     The source url is a path to an OSM file or a XAPI url
     The filtr argument is XAPI args or empty if it is an OSM file.
@@ -853,7 +857,7 @@ class OSMManager(ImportManager):
     default_source = settings.CHIMERE_XAPI_URL
 
    def get(self):
-        u"""
+        """
         Get data from the source
 
         Return a tuple with:
@@ -872,7 +876,7 @@ class OSMManager(ImportManager):
             return self.import_ways(tree)
         elif tree.xpath('count(//node)'):
             return self.import_nodes(tree)
-        return 0, 0, _(u"Nothing to import")
+        return 0, 0, _("Nothing to import")
 
     def import_ways(self, tree):
         from chimere.models import Route
@@ -956,10 +960,10 @@ class OSMManager(ImportManager):
         if msg:
             return 0, msg
         if new_item:
-            return 0, _(u"New items imported - validate them before exporting")
+            return 0, _("New items imported - validate them before exporting")
         if Marker.objects.filter(status='I').count():
-            return 0, _(u"There are items from a former import not yet "
-                        u"validated - validate them before exporting")
+            return 0, _("There are items from a former import not yet "
+                        "validated - validate them before exporting")
         # start import
         api = settings.CHIMERE_OSM_API_URL
         username = settings.CHIMERE_OSM_USER
@@ -970,17 +974,17 @@ class OSMManager(ImportManager):
             username = extra_args['username']
             password = extra_args['password']
         except KeyError:
-            return 0, _(u"Bad params - programming error")
+            return 0, _("Bad params - programming error")
         username = username.encode('latin1')
         password = password.encode('latin1')
         api = OsmApi.OsmApi(api=api, username=username, password=password)
-        api.ChangesetCreate({u"comment": u"Import from Chimère %s" %
+        api.ChangesetCreate({"comment": "Import from Chimère %s" %
                             get_version()})
         hooks = RE_HOOK.findall(self.importer_instance.filtr)
         if not hooks:
             hooks = RE_HOOK.findall(self.importer_instance.source)
         if not hooks:
-            return 0, _(u"Bad param")
+            return 0, _("Bad param")
         tags = {}
         bbox = []
         for hook in hooks:
@@ -995,12 +999,12 @@ class OSMManager(ImportManager):
                 continue
             tags[key] = value
         if not tags:
-            return 0, _(u"No non ambigious tag is defined in the XAPI request")
+            return 0, _("No non ambigious tag is defined in the XAPI request")
        if not bbox:
             return 0, _(
-                u"No bounding box is defined in the XAPI request."
-                u"If you are sure to manage the entire planet set the "
-                u"bounding box to -180,-90,180,90")
+                "No bounding box is defined in the XAPI request."
+                "If you are sure to manage the entire planet set the "
+                "bounding box to -180,-90,180,90")
         default_dct = {'tag': tags,
                        'import_source': self.importer_instance.source}
         idx = -1
@@ -1111,7 +1115,7 @@ class HtmlXsltManager(ImportManager):
     PARSER = 'HTMLParser'
 
     def get(self):
-        u"""
+        """
         Get data from the source
 
         Return a tuple with:
@@ -1125,23 +1129,28 @@ class HtmlXsltManager(ImportManager):
             main_page = urllib.request.urlopen(self.importer_instance.source)
             assert main_page.getcode() == 200
         except (urllib.error.URLError, AssertionError):
-            return (0, 0, _(u"Source page is unreachable."))
+            return (0, 0, _("Source page is unreachable."))
         data = main_page.read()
         encoding = chardet.detect(data)
         data = data.decode(encoding['encoding'])
-        soup = BeautifulSoup(data)
-        main_page = soup.prettify()
-        # convert it to valid XHTML
-        # doc, errors = tidy_document(main_page)
-        doc = main_page
-        dom = etree.HTML(doc, getattr(etree, self.PARSER)())
+        if 'HTML' in self.PARSER:
+            soup = BeautifulSoup(data)
+            main_page = soup.prettify()
+            # convert it to valid XHTML
+            doc, errors = tidy_document(main_page)
+            dom = etree.HTML(doc, getattr(etree, self.PARSER)())
+        else:
+            soup = BeautifulSoup(data, 'xml')
+            main_page = soup.prettify()
+            dom = etree.XML(main_page.encode('utf-8'), getattr(
+                etree, self.PARSER)())
 
         try:
             xslt = etree.parse(self.importer_instance.source_file)
             self.importer_instance.source_file.seek(0)
             transform = etree.XSLT(xslt)
         except (etree.XSLTParseError, etree.XMLSyntaxError, TypeError):
-            return (0, 0, _(u"The source file is not a valid XSLT file."))
+            return (0, 0, _("The source file is not a valid XSLT file."))
         newdom = transform(dom)
         items = []
         # load an alternate xslt file to apply to linked page
@@ -1153,9 +1162,9 @@ class HtmlXsltManager(ImportManager):
                 transform_child = etree.XSLT(alt_xslt)
             except (etree.XSLTParseError, etree.XMLSyntaxError, TypeError):
                 return (0, 0,
-                        _(u"The alt source file is not a valid XSLT file."))
-        base_url = u"/".join(self.importer_instance.source.split('/')[:-1])
-        base_url += u"/"
+                        _("The alt source file is not a valid XSLT file."))
+        base_url = "/".join(self.importer_instance.source.split('/')[:-1])
+        base_url += "/"
         for item in newdom.getroot():
             c_item = {child.tag: clean_field(child.text)
                       for child in item.getchildren() if child.text}
@@ -1183,10 +1192,12 @@ class HtmlXsltManager(ImportManager):
                      for extra in extra_keys[0].getchildren()})
             items.append(c_item)
         # change relative link to full link, simplify, unescape HTML entities
-        html_unescape = HTMLParser().unescape
+        html_unescape = html.unescape
        for item in items:
             for k in item:
                 val = item[k]
+                if type(val) == bytes:
+                    val = val.decode('utf-8')
                 for r, replaced in RE_CLEANS:
                     val = re.sub(r, replaced % {'base_url': base_url}, val)
                 item[k] = html_unescape(val)
@@ -1198,8 +1209,8 @@ class HtmlXsltManager(ImportManager):
         msg = ''
         if self.missing_cats:
             msg = _(
-                u"Names \"%s\" doesn't match existing categories. "
-                u"Modify the import to match theses names with categories.") %\
+                "Names \"%s\" doesn't match existing categories. "
+                "Modify the import to match theses names with categories.") %\
                 ('", "'.join(self.missing_cats))
 
         return (self.new_item, self.updated_item, msg)
@@ -1229,10 +1240,14 @@ class HtmlXsltManager(ImportManager):
     def parse_date(self, date):
         dct = {}
         has_dates = False
+        if type(date) == bytes:
+            date = date.decode('utf-8')
        for locale in DATE_PARSINGS:
             if has_dates:
                 break
             for r in DATE_PARSINGS[locale]:
+                if not date:
+                    continue
                 m = r.search(date)
                 if not m:
                     continue
@@ -1263,7 +1278,7 @@ class HtmlXsltManager(ImportManager):
             origin_lnk = item.get('link')
             # filter non relevant links
             if origin_lnk and origin_lnk.startswith('http'):
-                origin = u"<a href='%s' target='_blank'>%s</a>" % (
+                origin = "<a href='%s' target='_blank'>%s</a>" % (
                     origin_lnk, origin)
             dct = {
                 'origin': origin,
@@ -1308,7 +1323,7 @@ import icalendar
 
 class IcalManager(ImportManager):
     def get(self):
-        u"""
+        """
         Get data from an icalendar source
         """
         from chimere.models import Marker
@@ -1322,7 +1337,7 @@ class IcalManager(ImportManager):
             cal = icalendar.Calendar.from_ical(data)
         except ValueError as e:
             return (new_item, updated_item,
-                    _(u"Error on icalendar parsing: ") + str(e))
+                    _("Error on icalendar parsing: ") + str(e))
 
         default_dct = {'origin': self.importer_instance.origin,
                        'license': self.importer_instance.license}
@@ -1339,10 +1354,10 @@ class IcalManager(ImportManager):
                 dct['description'] = str(dct['description'])
             loc = event.get('LOCATION', None)
             if loc:
-                dct['description'] += u"<br/>{}".format(str(loc))
+                dct['description'] += "<br/>{}".format(str(loc))
             url = event.get('URL', None)
             if url:
-                dct['description'] += u"<br/><a href='{}'>{}</a>".format(
+                dct['description'] += "<br/><a href='{}'>{}</a>".format(
                     str(url), str(_('Link')))
             dct['start_date'] = event.get('DTSTART', None)
             if dct['start_date']: