summaryrefslogtreecommitdiff
path: root/chimere/utils.py
diff options
context:
space:
mode:
Diffstat (limited to 'chimere/utils.py')
-rw-r--r--chimere/utils.py173
1 files changed, 94 insertions, 79 deletions
diff --git a/chimere/utils.py b/chimere/utils.py
index f8b7bf5..71a7237 100644
--- a/chimere/utils.py
+++ b/chimere/utils.py
@@ -25,9 +25,11 @@ import csv
import collections
import datetime
import feedparser
+import html
import io
import json
import os
+from tidylib import tidy_document
import re
import tempfile
import urllib
@@ -55,7 +57,7 @@ def unicode_normalize(string):
class ImportManager(object):
- u"""
+ """
Generic class for specific importers
"""
default_source = None
@@ -163,7 +165,7 @@ class ImportManager(object):
try:
flz = zipfile.ZipFile(zippedfile)
except zipfile.BadZipfile:
- return [], _(u"Bad zip file")
+ return [], _("Bad zip file")
namelist = flz.namelist()
filenames = []
for suffix in suffixes:
@@ -200,7 +202,7 @@ class ImportManager(object):
if extra_url:
url += extra_url
remotehandle = urllib.request.urlopen(url)
- source = io.StringIO(remotehandle.read())
+ source = io.BytesIO(remotehandle.read())
remotehandle.close()
except ValueError:
# assume it is a local file
@@ -212,18 +214,20 @@ class ImportManager(object):
return (None, str(error))
if self.importer_instance.zipped:
try:
- files = self.get_files_inside_zip(source, suffixes, dest_dir)
+ files = self.get_files_inside_zip(
+ self.importer_instance.source_file or
+ self.importer_instance.source , suffixes, dest_dir)
except zipfile.BadZipfile:
- return (None, _(u"Bad zip file"))
+ return (None, _("Bad zip file"))
if not files or None in files or [] in files:
return (None,
- _(u"Missing file(s) inside the zip file"))
+ _("Missing file(s) inside the zip file"))
source = files[0] if len(suffixes) == 1 else files
return (source, None)
class KMLManager(ImportManager):
- u"""
+ """
KML importer
The filtr argument has to be defined as the exact name of the folder to be
imported
@@ -236,7 +240,7 @@ class KMLManager(ImportManager):
self.ns = ns
def get(self):
- u"""
+ """
Get data from a KML source
Return a tuple with:
@@ -252,15 +256,15 @@ class KMLManager(ImportManager):
doc = source
# remove empty lines before declaration (bad XML file)
if hasattr(source, 'getvalue'):
- splitted = source.getvalue().split('\n')
+ splitted = source.getvalue().decode('utf-8').split('\n')
for idx, line in enumerate(splitted):
if line.strip():
break
- doc = io.StringIO("\n".join(splitted[idx:]))
+ doc = io.BytesIO("\n".join(splitted[idx:]).encode('utf-8'))
try:
tree = etree.parse(doc)
except:
- return (0, 0, _(u"Bad XML file"))
+ return (0, 0, _("Bad XML file"))
# try to get default namespace
if not self.ns:
self.ns = tree.getroot().nsmap[None]
@@ -329,11 +333,11 @@ class KMLManager(ImportManager):
class ShapefileManager(ImportManager):
- u"""
+ """
Shapefile importer
"""
def get(self):
- u"""
+ """
Get data from a Shapefile source
Return a tuple with:
@@ -354,7 +358,7 @@ class ShapefileManager(ImportManager):
if msg:
return (0, 0, msg)
if not sources:
- return (0, 0, _(u"Error while reading the data source."))
+ return (0, 0, _("Error while reading the data source."))
# get the srid
srid = self.importer_instance.srid
if not srid:
@@ -372,10 +376,10 @@ class ShapefileManager(ImportManager):
if not srid:
# try with the default projection
srid = settings.CHIMERE_EPSG_DISPLAY_PROJECTION
- msg = _(u"SRID cannot be guessed. The default SRID (%s) has "
- u"been used.") % srid
+ msg = _("SRID cannot be guessed. The default SRID (%s) has "
+ "been used.") % srid
# If imported items are not well located "
- # u"ask your data provider for the SRID to use.") % srid
+ # "ask your data provider for the SRID to use.") % srid
shapefilename = tmpdir + os.sep + sources[0]
ds = DataSource(shapefilename)
lyr = ds[0]
@@ -387,23 +391,23 @@ class ShapefileManager(ImportManager):
except ValueError:
return (
new_item, updated_item,
- _(u"Bad configuration: filter must be a valid "
- u"JSON string"))
+ _("Bad configuration: filter must be a valid "
+ "JSON string"))
for k in ('id',):
if k not in filtr:
return (
new_item, updated_item,
- _(u"The key \"%s\" is missing in the "
- u"filter.") % k)
+ _("The key \"%s\" is missing in the "
+ "filter.") % k)
for k in filtr:
try:
ids = lyr.get_fields(k)
except:
return (
new_item, updated_item,
- _(u"Config: {} is not an appropriate column name "
- u"for this Shapefile. Available columns "
- u" are: {}").format(k, u", ".join(
+ _("Config: {} is not an appropriate column name "
+ "for this Shapefile. Available columns "
+ " are: {}").format(k, ", ".join(
[j for j in lyr.fields])))
default_dct = {'origin': self.importer_instance.origin,
'license': self.importer_instance.license}
@@ -427,8 +431,8 @@ class ShapefileManager(ImportManager):
filtr["name"] = id_name
if lyr.geom_type not in ('Point', 'LineString', 'Polygon'):
- return (0, 0, _(u"Type of geographic item (%s) of this shapefile "
- u"is not managed by Chimère.") % lyr.geom_type)
+ return (0, 0, _("Type of geographic item (%s) of this shapefile "
+ "is not managed by Chimère.") % lyr.geom_type)
geom_key = ''
geom_cls = None
if lyr.geom_type == 'Point':
@@ -459,7 +463,7 @@ class ShapefileManager(ImportManager):
try:
geoms = [feat.geom.wkt]
except:
- return (0, 0, _(u"Bad Shapefile"))
+ return (0, 0, _("Bad Shapefile"))
if feat.geom.geom_type == 'MultiLineString':
geoms = [geom.wkt for geom in feat.geom]
import_key = dct.pop('id')
@@ -553,7 +557,7 @@ class ShapefileManager(ImportManager):
class CSVManager(ImportManager):
- u"""
+ """
CSV importer
"""
@classmethod
@@ -561,15 +565,15 @@ class CSVManager(ImportManager):
return
# (label, getter, setter)
- COLS = [("Id", 'pk', 'pk'), (_(u"Name"), 'name', 'name'),
- (_(u"Categories"), lambda obj: ", ".join(
+ COLS = [("Id", 'pk', 'pk'), (_("Name"), 'name', 'name'),
+ (_("Categories"), lambda obj: ", ".join(
[c.name for c in obj.categories.all()]), set_categories),
- (_(u"State"), 'status', lambda x: x),
- (_(u"Description"), 'description', 'description'),
- (_(u"Localisation"), 'geometry', 'geometry')]
+ (_("State"), 'status', lambda x: x),
+ (_("Description"), 'description', 'description'),
+ (_("Localisation"), 'geometry', 'geometry')]
def get(self):
- u"""
+ """
Get data from a CSV source
Return a tuple with:
@@ -594,7 +598,7 @@ class CSVManager(ImportManager):
try:
assert(len(row) >= len(cols))
except AssertionError:
- return (0, 0, _(u"Invalid CSV format"))
+ return (0, 0, _("Invalid CSV format"))
continue
if len(row) < len(cols):
continue
@@ -656,13 +660,13 @@ class CSVManager(ImportManager):
class GeoRSSManager(ImportManager):
- u"""
+ """
RSS importer.
This manager only gets and do not produce GeoRSSFeed
"""
def get(self):
- u"""
+ """
Get data from a GeoRSS simple source
Return a tuple with:
@@ -675,7 +679,7 @@ class GeoRSSManager(ImportManager):
feed = feedparser.parse(self.importer_instance.source)
if feed['bozo'] and not isinstance(
feed['bozo_exception'], feedparser.CharacterEncodingOverride):
- return (0, 0, _(u"RSS feed is not well formed"))
+ return (0, 0, _("RSS feed is not well formed"))
# differ with feed parser version
item_key = 'items'
if 'entries' in feed:
@@ -740,13 +744,13 @@ class GeoRSSManager(ImportManager):
class JsonManager(ImportManager):
- u"""
+ """
Json importer.
This manager only gets and do not produce Json feed
"""
def get(self):
- u"""
+ """
Get data from a json simple source
Return a tuple with:
@@ -760,29 +764,29 @@ class JsonManager(ImportManager):
if msg:
return (0, 0, msg)
- vals = str(source.read()).replace('\n', ' ')
+ vals = source.read().decode("utf-8").replace('\n', ' ')
try:
values = json.JSONDecoder(
object_pairs_hook=collections.OrderedDict).decode(vals)
except ValueError as e:
return (new_item, updated_item,
- _(u"JSON file is not well formed: ") + str(e))
+ _("JSON file is not well formed: ") + str(e))
# configuration in filtr
try:
filtr = json.JSONDecoder().decode(self.importer_instance.filtr)
except ValueError:
return (
new_item, updated_item,
- _(u"Bad configuration: filter field must be a valid "
- u"JSON string"))
+ _("Bad configuration: filter field must be a valid "
+ "JSON string"))
vls = filtr.values()
for k in ('name', 'id', 'description'):
if k not in vls:
return (
new_item, updated_item,
- _(u"A key must be associated to \"%s\" in the "
- u"filter.") % k)
+ _("A key must be associated to \"%s\" in the "
+ "filter.") % k)
default_dct = {'origin': self.importer_instance.origin,
'license': self.importer_instance.license}
@@ -845,7 +849,7 @@ RE_HOOK = re.compile('\[([^\]]*)\]')
class OSMManager(ImportManager):
- u"""
+ """
OSM importer/exporter
The source url is a path to an OSM file or a XAPI url
The filtr argument is XAPI args or empty if it is an OSM file.
@@ -853,7 +857,7 @@ class OSMManager(ImportManager):
default_source = settings.CHIMERE_XAPI_URL
def get(self):
- u"""
+ """
Get data from the source
Return a tuple with:
@@ -872,7 +876,7 @@ class OSMManager(ImportManager):
return self.import_ways(tree)
elif tree.xpath('count(//node)'):
return self.import_nodes(tree)
- return 0, 0, _(u"Nothing to import")
+ return 0, 0, _("Nothing to import")
def import_ways(self, tree):
from chimere.models import Route
@@ -956,10 +960,10 @@ class OSMManager(ImportManager):
if msg:
return 0, msg
if new_item:
- return 0, _(u"New items imported - validate them before exporting")
+ return 0, _("New items imported - validate them before exporting")
if Marker.objects.filter(status='I').count():
- return 0, _(u"There are items from a former import not yet "
- u"validated - validate them before exporting")
+ return 0, _("There are items from a former import not yet "
+ "validated - validate them before exporting")
# start import
api = settings.CHIMERE_OSM_API_URL
username = settings.CHIMERE_OSM_USER
@@ -970,17 +974,17 @@ class OSMManager(ImportManager):
username = extra_args['username']
password = extra_args['password']
except KeyError:
- return 0, _(u"Bad params - programming error")
+ return 0, _("Bad params - programming error")
username = username.encode('latin1')
password = password.encode('latin1')
api = OsmApi.OsmApi(api=api, username=username, password=password)
- api.ChangesetCreate({u"comment": u"Import from Chimère %s" %
+ api.ChangesetCreate({"comment": "Import from Chimère %s" %
get_version()})
hooks = RE_HOOK.findall(self.importer_instance.filtr)
if not hooks:
hooks = RE_HOOK.findall(self.importer_instance.source)
if not hooks:
- return 0, _(u"Bad param")
+ return 0, _("Bad param")
tags = {}
bbox = []
for hook in hooks:
@@ -995,12 +999,12 @@ class OSMManager(ImportManager):
continue
tags[key] = value
if not tags:
- return 0, _(u"No non ambigious tag is defined in the XAPI request")
+ return 0, _("No non ambigious tag is defined in the XAPI request")
if not bbox:
return 0, _(
- u"No bounding box is defined in the XAPI request."
- u"If you are sure to manage the entire planet set the "
- u"bounding box to -180,-90,180,90")
+ "No bounding box is defined in the XAPI request."
+ "If you are sure to manage the entire planet set the "
+ "bounding box to -180,-90,180,90")
default_dct = {'tag': tags,
'import_source': self.importer_instance.source}
idx = -1
@@ -1111,7 +1115,7 @@ class HtmlXsltManager(ImportManager):
PARSER = 'HTMLParser'
def get(self):
- u"""
+ """
Get data from the source
Return a tuple with:
@@ -1125,23 +1129,28 @@ class HtmlXsltManager(ImportManager):
main_page = urllib.request.urlopen(self.importer_instance.source)
assert main_page.getcode() == 200
except (urllib.error.URLError, AssertionError):
- return (0, 0, _(u"Source page is unreachable."))
+ return (0, 0, _("Source page is unreachable."))
data = main_page.read()
encoding = chardet.detect(data)
data = data.decode(encoding['encoding'])
- soup = BeautifulSoup(data)
- main_page = soup.prettify()
- # convert it to valid XHTML
- # doc, errors = tidy_document(main_page)
- doc = main_page
- dom = etree.HTML(doc, getattr(etree, self.PARSER)())
+ if 'HTML' in self.PARSER:
+ soup = BeautifulSoup(data)
+ main_page = soup.prettify()
+ # convert it to valid XHTML
+ doc, errors = tidy_document(main_page)
+ dom = etree.HTML(doc, getattr(etree, self.PARSER)())
+ else:
+ soup = BeautifulSoup(data, 'xml')
+ main_page = soup.prettify()
+ dom = etree.XML(main_page.encode('utf-8'), getattr(
+ etree, self.PARSER)())
try:
xslt = etree.parse(self.importer_instance.source_file)
self.importer_instance.source_file.seek(0)
transform = etree.XSLT(xslt)
except (etree.XSLTParseError, etree.XMLSyntaxError, TypeError):
- return (0, 0, _(u"The source file is not a valid XSLT file."))
+ return (0, 0, _("The source file is not a valid XSLT file."))
newdom = transform(dom)
items = []
# load an alternate xslt file to apply to linked page
@@ -1153,9 +1162,9 @@ class HtmlXsltManager(ImportManager):
transform_child = etree.XSLT(alt_xslt)
except (etree.XSLTParseError, etree.XMLSyntaxError, TypeError):
return (0, 0,
- _(u"The alt source file is not a valid XSLT file."))
- base_url = u"/".join(self.importer_instance.source.split('/')[:-1])
- base_url += u"/"
+ _("The alt source file is not a valid XSLT file."))
+ base_url = "/".join(self.importer_instance.source.split('/')[:-1])
+ base_url += "/"
for item in newdom.getroot():
c_item = {child.tag: clean_field(child.text)
for child in item.getchildren() if child.text}
@@ -1183,10 +1192,12 @@ class HtmlXsltManager(ImportManager):
for extra in extra_keys[0].getchildren()})
items.append(c_item)
# change relative link to full link, simplify, unescape HTML entities
- html_unescape = HTMLParser().unescape
+ html_unescape = html.unescape
for item in items:
for k in item:
val = item[k]
+ if type(val) == bytes:
+ val = val.decode('utf-8')
for r, replaced in RE_CLEANS:
val = re.sub(r, replaced % {'base_url': base_url}, val)
item[k] = html_unescape(val)
@@ -1198,8 +1209,8 @@ class HtmlXsltManager(ImportManager):
msg = ''
if self.missing_cats:
msg = _(
- u"Names \"%s\" doesn't match existing categories. "
- u"Modify the import to match theses names with categories.") %\
+ "Names \"%s\" doesn't match existing categories. "
+ "Modify the import to match theses names with categories.") %\
('", "'.join(self.missing_cats))
return (self.new_item, self.updated_item, msg)
@@ -1229,10 +1240,14 @@ class HtmlXsltManager(ImportManager):
def parse_date(self, date):
dct = {}
has_dates = False
+ if type(date) == bytes:
+ date = date.decode('utf-8')
for locale in DATE_PARSINGS:
if has_dates:
break
for r in DATE_PARSINGS[locale]:
+ if not date:
+ continue
m = r.search(date)
if not m:
continue
@@ -1263,7 +1278,7 @@ class HtmlXsltManager(ImportManager):
origin_lnk = item.get('link')
# filter non relevant links
if origin_lnk and origin_lnk.startswith('http'):
- origin = u"<a href='%s' target='_blank'>%s</a>" % (
+ origin = "<a href='%s' target='_blank'>%s</a>" % (
origin_lnk, origin)
dct = {
'origin': origin,
@@ -1308,7 +1323,7 @@ import icalendar
class IcalManager(ImportManager):
def get(self):
- u"""
+ """
Get data from an icalendar source
"""
from chimere.models import Marker
@@ -1322,7 +1337,7 @@ class IcalManager(ImportManager):
cal = icalendar.Calendar.from_ical(data)
except ValueError as e:
return (new_item, updated_item,
- _(u"Error on icalendar parsing: ") + str(e))
+ _("Error on icalendar parsing: ") + str(e))
default_dct = {'origin': self.importer_instance.origin,
'license': self.importer_instance.license}
@@ -1339,10 +1354,10 @@ class IcalManager(ImportManager):
dct['description'] = str(dct['description'])
loc = event.get('LOCATION', None)
if loc:
- dct['description'] += u"<br/>{}".format(str(loc))
+ dct['description'] += "<br/>{}".format(str(loc))
url = event.get('URL', None)
if url:
- dct['description'] += u"<br/><a href='{}'>{}</a>".format(
+ dct['description'] += "<br/><a href='{}'>{}</a>".format(
str(url), str(_('Link')))
dct['start_date'] = event.get('DTSTART', None)
if dct['start_date']: