Diffstat (limited to 'chimere/utils.py')
-rw-r--r-- | chimere/utils.py | 1357
1 file changed, 1357 insertions, 0 deletions
diff --git a/chimere/utils.py b/chimere/utils.py new file mode 100644 index 0000000..0d84be3 --- /dev/null +++ b/chimere/utils.py @@ -0,0 +1,1357 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright (C) 2012-2016 Étienne Loks <etienne.loks_AT_peacefrogsDOTnet> + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +# See the file COPYING for details. + +""" +Utilitaries +""" + +import csv +import collections +import datetime +import feedparser +import json +import os +import re +import StringIO +import tempfile +import urllib2 +import unicodedata +import zipfile + +from osgeo import ogr, osr +from lxml import etree + +from django.conf import settings +from django.contrib.gis.gdal import DataSource, OGRGeomType, check_err +from django.contrib.gis.geos import GEOSGeometry +from django.core.exceptions import ObjectDoesNotExist +from django.shortcuts import render_to_response +from django.utils.translation import ugettext_lazy as _ + +from chimere import get_version +from external_utils import OsmApi + + +def unicode_normalize(string): + if type(string) == str: + string = unicode(string.decode('utf-8')) + return ''.join( + (c for c in unicodedata.normalize('NFD', string) + if unicodedata.category(c) not in ('Mn', 'Sm', 'Sc'))) + + +class ImportManager(object): + u""" + Generic class for specific importers + """ + default_source = None + + def __init__(self, importer_instance): + self.importer_instance = importer_instance + if self.importer_instance.default_name: + self.default_name = self.importer_instance.default_name + else: + self.default_name = " - ".join([ + cat.name + for cat in self.importer_instance.categories.order_by( + 'name').all()]) + + def get(self): + raise NotImplementedError + + def put(self, extra_args={}): + raise NotImplementedError + + def create_or_update_item(self, cls, values, import_key, version=None, + key='', pk=None, category=None): + from models import PropertyModel + updated, created, item = False, False, None + import_key = unicode(import_key).replace(':', '^') + if not values.get('name'): + values['name'] = self.default_name + if not key: + key = self.importer_instance.importer_type + item = None + pms = [pm["slug"] for pm in PropertyModel.objects.values('slug').all()] + properties = {} + for k in values.keys(): + if k in pms: + properties[k] = values.pop(k) + if import_key or pk: + dct_import = { + 'import_key__icontains': '%s:%s;' % (key, import_key), + 'import_source': self.importer_instance.source} + ref_item = cls.objects.filter(**dct_import) + try: + item = None + if pk: + ref_item = cls.objects.get(pk=pk) + else: + ref_item = cls.objects.filter(**dct_import) + if not ref_item.count(): + raise ObjectDoesNotExist + ref_item = ref_item.all()[0] + if version and ref_item.import_version == int(version): + # no update since the last import + return ref_item, None, None + if not self.importer_instance.overwrite \ + and ref_item.modified_since_import: + return ref_item, 
None, None + else: + item = ref_item + for k in values: + if values[k]: + setattr(item, k, values[k]) + try: + item.save() + # force the modified_since_import status + item.modified_since_import = False + item.save() + except TypeError: + # error on data source + return None, False, False + updated = True + except ObjectDoesNotExist: + pass + if not item: + if not self.importer_instance.get_description and \ + self.importer_instance.default_description: + values['description'] = \ + self.importer_instance.default_description + values.update({ + 'import_source': self.importer_instance.source}) + values['status'] = self.importer_instance.default_status + if not self.importer_instance.associate_marker_to_way\ + and cls.__name__ == 'Route': + values['has_associated_marker'] = False + + try: + item = cls.objects.create(**values) + item.modified_since_import = False + item.save() + except TypeError: + # error on data source + return None, False, False + created = True + if import_key: + item.set_key(key, import_key) + item.categories.clear() + if category: + item.categories.add(category) + else: + for cat in self.importer_instance.categories.all(): + item.categories.add(cat) + for prop in properties: + item.setProperty(prop, properties[prop]) + return item, updated, created + + @classmethod + def get_files_inside_zip(cls, zippedfile, suffixes, dest_dir=None): + try: + flz = zipfile.ZipFile(zippedfile) + except zipfile.BadZipfile: + return [], _(u"Bad zip file") + namelist = flz.namelist() + filenames = [] + for suffix in suffixes: + current_file_name = None + for name in namelist: + if name.endswith(suffix) \ + or name.endswith(suffix.lower()) \ + or name.endswith(suffix.upper()): + current_file_name = name + filenames.append(current_file_name) + files = [] + for filename in filenames: + if filename: + if dest_dir: + files.append(filename) + flz.extract(filename, dest_dir) + else: + files.append(flz.open(filename)) + else: + files.append(None) + return files + + def get_source_file(self, suffixes, dest_dir=None, + extra_url=None): + source = self.importer_instance.source_file + if not hasattr(source, 'read'): + if not source: + source = self.importer_instance.source \ + if self.importer_instance.source else self.default_source + try: + url = source + if extra_url: + url += extra_url + remotehandle = urllib2.urlopen(url) + source = StringIO.StringIO(remotehandle.read()) + remotehandle.close() + except ValueError: + # assume it is a local file + try: + source = open(source) + except IOError, msg: + return (None, msg) + except (urllib2.URLError, AttributeError) as error: + return (None, error.message) + if self.importer_instance.zipped: + try: + files = self.get_files_inside_zip(source, suffixes, dest_dir) + except zipfile.BadZipfile: + return (None, _(u"Bad zip file")) + if not files or None in files: + return (None, + _(u"Missing file(s) inside the zip file")) + source = files[0] if len(suffixes) == 1 else files + return (source, None) + + +class KMLManager(ImportManager): + u""" + KML importer + The filtr argument has to be defined as the exact name of the folder to be + imported + """ + XPATH = '//kml:Folder/kml:name[text()="%s"]/../kml:Placemark' + DEFAULT_XPATH = '//kml:Placemark' + + def __init__(self, importer_instance, ns=''): + super(KMLManager, self).__init__(importer_instance) + self.ns = ns + + def get(self): + u""" + Get data from a KML source + + Return a tuple with: + - number of new item ; + - number of item updated ; + - error detail on error + """ + from models import Marker, 
Route + new_item, updated_item, msg = 0, 0, '' + source, msg = self.get_source_file(['.kml']) + if msg: + return (0, 0, msg) + doc = source + # remove empty lines before declaration (bad XML file) + if hasattr(source, 'getvalue'): + splitted = source.getvalue().split('\n') + for idx, line in enumerate(splitted): + if line.strip(): + break + doc = StringIO.StringIO("\n".join(splitted[idx:])) + try: + tree = etree.parse(doc) + except: + return (0, 0, _(u"Bad XML file")) + # try to get default namespace + if not self.ns: + self.ns = tree.getroot().nsmap[None] + xpath = self.XPATH % self.importer_instance.filtr \ + if self.importer_instance.filtr else self.DEFAULT_XPATH + for placemark in tree.xpath(xpath, + namespaces={'kml': self.ns}): + name, point, line = None, None, None + pl_id = placemark.attrib.get('id') + pl_key = 'kml-%d' % self.importer_instance.pk + ns = '{%s}' % self.ns + description = '' + for item in placemark: + if item.tag == ns + 'name': + name = item.text + if not pl_id: + # if no ID is provided assume that name is a key + pl_id = name + elif item.tag == ns + 'description': + if self.importer_instance.get_description: + description = item.text + elif item.tag == ns + 'Point': + for coord in item: + if coord.tag == ns + 'coordinates': + x, y, z = coord.text.split(',') + point = 'SRID=4326;POINT(%s %s)' % (x, y) + elif item.tag == ns + 'LineString': + for coord in item: + if coord.tag == ns + 'coordinates': + points = coord.text.replace('\n', ' ').split(' ') + points = ",".join([" ".join(p.split(',')[:2]) + for p in points if p]) + line = 'SRID=4326;LINESTRING(%s)' % points + cls = None + dct = {'description': description, + 'name': name, + 'origin': self.importer_instance.origin, + 'license': self.importer_instance.license} + if point: + dct['point'] = point + cls = Marker + if line: + dct['route'] = line + dct.pop('description') + cls = Route + if cls: + item, updated, created = self.create_or_update_item( + cls, dct, pl_id, key=pl_key) + if updated: + updated_item += 1 + if created: + new_item += 1 + return (new_item, updated_item, msg) + + @classmethod + def export(cls, queryset): + dct = { + 'name': settings.PROJECT_NAME, + 'description': unicode(datetime.date.today()), + 'locations': queryset.all() + } + filename = unicode_normalize(settings.PROJECT_NAME + dct['description'] + + '.kml') + result = render_to_response('chimere/export.kml', dct) + return filename, result + + +class ShapefileManager(ImportManager): + u""" + Shapefile importer + """ + def get(self): + u""" + Get data from a Shapefile source + + Return a tuple with: + - number of new item ; + - number of item updated ; + - error detail on error + + The filtr argument allow to specify match between the shapefile cols + and the db. JSON format is used. 
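+        For example (column names hypothetical): {"id": "id",
+        "NAME": "name", "DESC": "description"} maps shapefile columns
+        (keys) to Chimère fields (values); an "id" entry is required
+        and is used as the import key.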
+ """ + from models import Marker, Route, Polygon + new_item, updated_item, msg = 0, 0, '' + tmpdir = tempfile.mkdtemp() + sources, msg = self.get_source_file(['.shp', '.dbf', '.prj', '.shx'], + dest_dir=tmpdir) + if msg: + return (0, 0, msg) + if not sources: + return (0, 0, _(u"Error while reading the data source.")) + # get the srid + srid = self.importer_instance.srid + if not srid: + prjfilename = tmpdir + os.sep + sources[2] + try: + from osgeo import osr + with open(prjfilename, 'r') as prj_file: + prj_txt = prj_file.read() + srs = osr.SpatialReference() + srs.ImportFromESRI([prj_txt]) + srs.AutoIdentifyEPSG() + srid = srs.GetAuthorityCode(None) + except ImportError: + pass + if not srid: + # try with the default projection + srid = settings.CHIMERE_EPSG_DISPLAY_PROJECTION + msg = _(u"SRID cannot be guessed. The default SRID (%s) has " + u"been used.") % srid + # If imported items are not well located " + # u"ask your data provider for the SRID to use.") % srid + shapefilename = tmpdir + os.sep + sources[0] + ds = DataSource(shapefilename) + lyr = ds[0] + default_dct = {} + filtr = self.importer_instance.filtr + if filtr: + try: + filtr = json.JSONDecoder().decode(self.importer_instance.filtr) + except ValueError: + return ( + new_item, updated_item, + _(u"Bad configuration: filter must be a valid " + u"JSON string")) + for k in ('id',): + if k not in filtr: + return ( + new_item, updated_item, + _(u"The key \"%s\" is missing in the " + u"filter.") % k) + for k in filtr: + try: + ids = lyr.get_fields(k) + except: + return ( + new_item, updated_item, + _(u"Config: {} is not an appropriate column name " + u"for this Shapefile. Available columns " + u" are: {}").format(k, u", ".join( + [j for j in lyr.fields]))) + default_dct = {'origin': self.importer_instance.origin, + 'license': self.importer_instance.license} + if 'prefix_name' in filtr: + default_dct['name'] = filtr.pop('prefix_name') + if 'prefix_description' in filtr: + default_dct['description'] = filtr.pop('prefix_description') + else: + # if no filtr it is assumed that the first field is a + # id name and the second field is the name + id_name = lyr.fields[0] if len(lyr.fields) > 0 else None + # test if id_name is well guess + if id_name: + ids = lyr.get_fields(id_name) + if len(ids) != len(set(ids)): + id_name = None + filtr['id'] = id_name + if len(lyr.fields) > 1: + filtr["name"] = lyr.fields[1] + elif id_name: + filtr["name"] = id_name + + if lyr.geom_type not in ('Point', 'LineString', 'Polygon'): + return (0, 0, _(u"Type of geographic item (%s) of this shapefile " + u"is not managed by Chimère.") % lyr.geom_type) + geom_key = '' + geom_cls = None + if lyr.geom_type == 'Point': + geom_key = 'point' + geom_cls = Marker + elif lyr.geom_type == 'Polygon': + geom_key = 'polygon' + geom_cls = Polygon + else: + geom_key = 'route' + geom_cls = Route + # indexes = [] + for idx, feat in enumerate(lyr): + dct = default_dct.copy() + for k in filtr: + val = feat.get(k) + try: + val = unicode(val) + except UnicodeDecodeError: + try: + val = unicode( + val.decode(settings.CHIMERE_SHAPEFILE_ENCODING)) + except: + continue + if filtr[k] not in dct: + dct[filtr[k]] = '' + dct[filtr[k]] += val + try: + geoms = [feat.geom.wkt] + except: + return (0, 0, _(u"Bad Shapefile")) + if feat.geom.geom_type == 'MultiLineString': + geoms = [geom.wkt for geom in feat.geom] + import_key = dct.pop('id') + for geom in geoms: + dct[geom_key] = 'SRID=%s;%s' % (srid, geom) + item, updated, created = self.create_or_update_item( + geom_cls, dct, import_key) + 
if updated: + updated_item += 1 + if created: + new_item += 1 + # clean up + tmpdirs = set() + for src in sources: + dirs = os.sep.join(src.split(os.sep)[:-1]) + if dirs: + tmpdirs.add(tmpdir + os.sep + dirs) + os.remove(tmpdir + os.sep + src) + for dr in tmpdirs: + os.removedirs(dr) + return (new_item, updated_item, msg) + + @classmethod + def export(cls, queryset): + date = unicode(datetime.date.today()) + + tmp = tempfile.NamedTemporaryFile(suffix='.shp', mode='w+b') + tmp.close() + + tmp_name = tmp.name + field_names = [field.name for field in queryset.model._meta.fields] + geo_field = getattr( + queryset.model, + 'point' if 'point' in field_names else 'route')._field + + dr = ogr.GetDriverByName('ESRI Shapefile') + ds = dr.CreateDataSource(tmp_name) + if ds is None: + raise Exception(_(u'Could not create file!')) + ogr_type = OGRGeomType(geo_field.geom_type).num + srs = osr.SpatialReference() + srs.ImportFromEPSG(geo_field.srid) + + layer = ds.CreateLayer('lyr', srs=srs, geom_type=ogr_type) + + for field_name in ('name', 'category'): + field_defn = ogr.FieldDefn(str(field_name), ogr.OFTString) + field_defn.SetWidth(255) + if layer.CreateField(field_defn) != 0: + raise Exception(_(u'Failed to create field')) + + feature_def = layer.GetLayerDefn() + + for item in queryset: + # duplicate items when in several categories + q = item.categories + if not q.count(): + categories = [None] + else: + categories = q.all() + for category in categories: + feat = ogr.Feature(feature_def) + feat.SetField('name', str(unicode_normalize(item.name)[:80])) + if category: + feat.SetField('category', + str(unicode_normalize(category.name)[:80])) + + geom = getattr(item, geo_field.name) + if not geom: + continue + ogr_geom = ogr.CreateGeometryFromWkt(geom.wkt) + check_err(feat.SetGeometry(ogr_geom)) + check_err(layer.CreateFeature(feat)) + # Cleaning up + ds.Destroy() + + # writing to a zip file + filename = unicode_normalize(settings.PROJECT_NAME) + '-' + date + buff = StringIO.StringIO() + zip_file = zipfile.ZipFile(buff, 'w', zipfile.ZIP_DEFLATED) + suffixes = ['shp', 'shx', 'prj', 'dbf'] + for suffix in suffixes: + name = tmp_name.replace('.shp', '.' 
+ suffix) + arcname = '.'.join((filename, suffix)) + zip_file.write(name, arcname=arcname) + zip_file.close() + buff.flush() + zip_stream = buff.getvalue() + buff.close() + return filename, zip_stream + + +class CSVManager(ImportManager): + u""" + CSV importer + """ + @classmethod + def set_categories(value): + return + + # (label, getter, setter) + COLS = [("Id", 'pk', 'pk'), (_(u"Name"), 'name', 'name'), + (_(u"Categories"), lambda obj: ", ".join( + [c.name for c in obj.categories.all()]), set_categories), + (_(u"State"), 'status', lambda x: x), + (_(u"Description"), 'description', 'description'), + (_(u"Localisation"), 'geometry', 'geometry')] + + def get(self): + u""" + Get data from a CSV source + + Return a tuple with: + - number of new item ; + - number of item updated ; + - error detail on error + """ + from models import Marker, Route + new_item, updated_item, msg = 0, 0, '' + source, msg = self.get_source_file(['.csv']) + if msg: + return (0, 0, msg) + reader = csv.reader(source, delimiter=';', quotechar='"') + prop_cols = [] + for pm in Marker.all_properties(): + prop_cols.append((pm.name, pm.getAttrName(), + pm.getAttrName() + '_set')) + cols = list(self.COLS) + prop_cols + # datas = [] + for idx, row in enumerate(reader): + if not idx: # first row + try: + assert(len(row) >= len(cols)) + except AssertionError: + return (0, 0, _(u"Invalid CSV format")) + continue + if len(row) < len(cols): + continue + # pk, name, cats, state = row[0], row[1], row[2], row[3] + pk, name = row[0], row[1] + geom = row[5] + description = '' + if self.importer_instance.get_description: + description = row[4] + COL_INDEX = 6 + dct = {'description': description, + 'name': name, + 'origin': self.importer_instance.origin, + 'license': self.importer_instance.license} + cls = None + if 'POINT' in geom: + cls = Marker + dct['point'] = geom + elif 'LINE' in geom: + cls = Route + dct['route'] = geom + else: + continue + import_key = pk if pk else name.decode('utf-8') + item, updated, created = self.create_or_update_item( + cls, dct, import_key, pk=pk) + if updated: + updated_item += 1 + if created: + new_item += 1 + for idx, col in enumerate(cols[COL_INDEX:]): + name, getter, setter_val = col + setter = getattr(item, setter_val) + val = row[idx + COL_INDEX] + setter(item, val) + return (new_item, updated_item, msg) + + @classmethod + def export(cls, queryset): + dct = {'description': unicode(datetime.date.today()), 'data': []} + # cls_name = queryset.model.__name__.lower() + cols = list(cls.COLS) + for pm in queryset.model.all_properties(): + cols.append((pm.name, pm.getAttrName(), pm.getAttrName() + '_set')) + header = [col[0] for col in cols] + dct['data'].append(header) + for item in queryset.all(): + data = [] + for (lbl, attr, setr) in cols: + if callable(attr): + data.append(attr(item)) + else: + data.append(getattr(item, attr)) + dct['data'].append(data) + filename = unicode_normalize(settings.PROJECT_NAME + dct['description'] + + '.csv') + result = render_to_response('chimere/export.csv', dct) + return filename, result + + +class GeoRSSManager(ImportManager): + u""" + RSS importer. 
+ This manager only gets and do not produce GeoRSSFeed + """ + + def get(self): + u""" + Get data from a GeoRSS simple source + + Return a tuple with: + - number of new item ; + - number of item updated ; + - error detail on error + """ + from models import Marker, Route + new_item, updated_item, msg = 0, 0, '' + feed = feedparser.parse(self.importer_instance.source) + if feed['bozo'] and not isinstance( + feed['bozo_exception'], feedparser.CharacterEncodingOverride): + return (0, 0, _(u"RSS feed is not well formed")) + for item in feed['items']: + if "georss_point" not in item and 'georss_line' not in item \ + and not ("geo_lat" in item and "geo_long" in item): + continue + cls = None + dct = {'origin': self.importer_instance.origin, + 'license': self.importer_instance.license} + if 'georss_point' in item or "geo_lat" in item: + cls = Marker + if 'georss_point' in item: + try: + y, x = item['georss_point'].split(' ') + except ValueError: + continue + else: + y = item['geo_lat'] + x = item['geo_long'] + dct['point'] = 'SRID=4326;POINT(%s %s)' % (x, y) + if self.importer_instance.get_description: + for k in ['description', 'summary', 'value']: + if k in item: + dct['description'] = item[k] + break + else: + cls = Route + points = item['georss_line'].split(' ') + reordered_points = [] + # lat, lon -> x, y + for idx in xrange(len(points) / 2): + reordered_points.append("%s %s" % (points[idx * 2 + 1], + points[idx * 2])) + dct['route'] = 'SRID=4326;LINESTRING(%s)' % \ + ",".join(reordered_points) + + dct['name'] = item['title'] + pl_id = item['id'] if 'id' in item else item['title'] + it, updated, created = self.create_or_update_item(cls, dct, pl_id) + if updated: + updated_item += 1 + if created: + new_item += 1 + return (new_item, updated_item, msg) + + +class JsonManager(ImportManager): + u""" + Json importer. 
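+    The filtr argument maps source JSON keys to Chimère fields; the
+    values must include "name", "id" and "description". For example
+    (source keys hypothetical):
+    {"title": "name", "uid": "id", "summary": "description"}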
+ This manager only gets and do not produce Json feed + """ + + def get(self): + u""" + Get data from a json simple source + + Return a tuple with: + - number of new item ; + - number of item updated ; + - error detail on error + """ + from models import Marker + new_item, updated_item, msg = 0, 0, '' + source, msg = self.get_source_file(['.json']) + if msg: + return (0, 0, msg) + + vals = source.read().replace('\n', ' ') + try: + values = json.JSONDecoder( + object_pairs_hook=collections.OrderedDict).decode(vals) + except ValueError as e: + return (new_item, updated_item, + _(u"JSON file is not well formed: " + e.message)) + # configuration in filtr + try: + filtr = json.JSONDecoder().decode(self.importer_instance.filtr) + except ValueError: + return ( + new_item, updated_item, + _(u"Bad configuration: filter field must be a valid " + u"JSON string")) + + vls = filtr.values() + for k in ('name', 'id', 'description'): + if k not in vls: + return ( + new_item, updated_item, + _(u"A key must be associated to \"%s\" in the " + u"filter.") % k) + + default_dct = {'origin': self.importer_instance.origin, + 'license': self.importer_instance.license} + if 'prefix_name' in filtr: + default_dct['name'] = filtr.pop('prefix_name') + if 'prefix_description' in filtr: + default_dct['description'] = filtr.pop('prefix_description') + if self.importer_instance.default_localisation: + default_dct['point'] = self.importer_instance.default_localisation + + for item in values: + dct = default_dct.copy() + for k in filtr: + if k.startswith('prefix_') or k.startswith('suffix_'): + continue + if k in item and item[k]: + if filtr[k] not in dct: + dct[filtr[k]] = "" + else: + if filtr[k] == 'description': + dct[filtr[k]] += "<br/>" + else: + dct[filtr[k]] += " " + dct[filtr[k]] += item[k] + if 'point' in item: + x, y = item['point'].split(",") + dct['point'] = 'SRID=4326;POINT(%s %s)' % (x, y) + elif 'lat' in item and item['lat'] \ + and 'lon' in item and item['lon']: + dct['point'] = 'SRID=4326;POINT(%s %s)' % (item['lon'], + item['lat']) + elif 'x' in item and item['x'] \ + and 'y' in item and item['y']: + dct['point'] = 'SRID=4326;POINT(%s %s)' % (item['x'], + item['y']) + if not dct['point']: + continue + for k in filtr: + if k.startswith('prefix_') or k.startswith('suffix_'): + pos = k.split('_')[0] + key = '_'.join(k.split('_')[1:]) + if key in dct: + if pos == 'prefix': + dct[key] = filtr[k] + dct[key] + else: + dct[key] += filtr[k] + cls = Marker + pl_id = (dct.pop('id') if 'id' in dct else dct['name']) \ + + "-" + unicode(self.importer_instance.pk) + it, updated, created = self.create_or_update_item(cls, dct, pl_id) + if updated: + updated_item += 1 + if created: + new_item += 1 + return (new_item, updated_item, msg) + +RE_HOOK = re.compile('\[([^\]]*)\]') + +# TODO: manage deleted item from OSM + + +class OSMManager(ImportManager): + u""" + OSM importer/exporter + The source url is a path to an OSM file or a XAPI url + The filtr argument is XAPI args or empty if it is an OSM file. + """ + default_source = settings.CHIMERE_XAPI_URL + + def get(self): + u""" + Get data from the source + + Return a tuple with: + - new items; + - updated items; + - error detail on error. 
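+
+        The filtr value is appended as-is to the XAPI url; for example
+        (tag and bounding box hypothetical):
+        [amenity=drinking_water][bbox=-4.57,47.75,-4.41,47.89]
+        A bounding box and at least one unambiguous tag are required
+        to export back with put().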
+ """ + source, msg = self.get_source_file( + ['.osm'], extra_url=self.importer_instance.filtr) + if not source: + return (0, 0, msg) + + tree = etree.parse(source) + # only import node or ways + if tree.xpath('count(//way)') and tree.xpath('count(//node)'): + return self.import_ways(tree) + elif tree.xpath('count(//node)'): + return self.import_nodes(tree) + return 0, 0, _(u"Nothing to import") + + def import_ways(self, tree): + from chimere.models import Route + msg, items, new_item, updated_item = "", [], 0, 0 + nodes = {} + for node in tree.xpath('//node'): + node_id = node.attrib.get('id') + for item in node: + k = item.attrib.get('k') + if node_id: + nodes[node_id] = '%s %s' % (node.get('lon'), + node.get('lat')) + for way in tree.xpath('//way'): + name = None + points = [] + node_id = way.attrib.get('id') + version = way.attrib.get('version') + for item in way: + k = item.attrib.get('k') + if k == 'name': + name = item.attrib.get('v') + if item.tag == 'nd': + points.append(item.get('ref')) + if not points: + continue + wkt = 'SRID=4326;LINESTRING(%s)' % ",".join( + [nodes[point_id] for point_id in points if point_id in nodes]) + dct = {'route': wkt, + 'name': name, + 'origin': self.importer_instance.origin + or u'OpenStreetMap.org', + 'license': self.importer_instance.license + or u'ODbL', + 'import_version': version} + item, updated, created = self.create_or_update_item( + Route, dct, node_id, version) + if updated: + updated_item += 1 + if created: + new_item += 1 + items.append(item) + return new_item, updated_item, msg + + def import_nodes(self, tree): + from chimere.models import Marker + msg, items, new_item, updated_item = "", [], 0, 0 + for node in tree.xpath('//node'): + name = None + node_id = node.attrib.get('id') + if not node_id: + continue + version = node.attrib.get('version') + for item in node: + k = item.attrib.get('k') + if k == 'name': + name = item.attrib.get('v') + point = 'SRID=4326;POINT(%s %s)' % (node.get('lon'), + node.get('lat')) + dct = {'point': point, + 'name': name, + 'origin': self.importer_instance.origin + or u'OpenStreetMap.org', + 'license': self.importer_instance.license + or u'ODbL', + 'import_version': version} + item, updated, created = self.create_or_update_item( + Marker, dct, node_id, version) + if updated: + updated_item += 1 + if created: + new_item += 1 + items.append(item) + return (new_item, updated_item, msg) + + def put(self, extra_args={}): + # first of all: reimport in order to verify that no changes has been + # made since the last import + from models import Marker + new_item, updated_item, msg = self.get() + # check if import is possible + if msg: + return 0, msg + if new_item: + return 0, _(u"New items imported - validate them before exporting") + if Marker.objects.filter(status='I').count(): + return 0, _(u"There are items from a former import not yet " + u"validated - validate them before exporting") + # start import + api = settings.CHIMERE_OSM_API_URL + username = settings.CHIMERE_OSM_USER + password = settings.CHIMERE_OSM_PASSWORD + if extra_args: + try: + api = extra_args['api'] + username = extra_args['username'] + password = extra_args['password'] + except KeyError: + return 0, _(u"Bad params - programming error") + username = username.encode('latin1') + password = password.encode('latin1') + api = OsmApi.OsmApi(api=api, username=username, password=password) + api.ChangesetCreate({u"comment": u"Import from Chimère %s" % + get_version()}) + hooks = RE_HOOK.findall(self.importer_instance.filtr) + if not hooks: + hooks = 
RE_HOOK.findall(self.importer_instance.source) + if not hooks: + return 0, _(u"Bad param") + tags = {} + bbox = [] + for hook in hooks: + key, value = hook.split('=') + if '*' in value or '|' in key or '|' in value: + continue + if key == 'bbox': + x1, y1, x2, y2 = [float(val) for val in value.split(',')] + bbox = GEOSGeometry( + 'POLYGON((%f %f,%f %f,%f %f,%f %f,%f %f))' % ( + x1, y1, x2, y1, x2, y2, x1, y2, x1, y1), srid=4326) + continue + tags[key] = value + if not tags: + return 0, _(u"No non ambigious tag is defined in the XAPI request") + if not bbox: + return 0, _( + u"No bounding box is defined in the XAPI request." + u"If you are sure to manage the entire planet set the " + u"bounding box to -180,-90,180,90") + default_dct = {'tag': tags, + 'import_source': self.importer_instance.source} + idx = -1 + for idx, item in enumerate( + Marker.objects.filter( + status='A', + point__contained=bbox, + categories=self.importer_instance.categories.all(), + not_for_osm=False, modified_since_import=True, + route=None).all()): + dct = default_dct.copy() + dct.update({'lon': item.point.x, + 'lat': item.point.y}) + dct['tag']['name'] = item.name + node = None + import_key = item.get_key('OSM') + updated = False + if import_key: + try: + dct['id'] = import_key + dct['version'] = item.import_version + node = api.NodeUpdate(dct) + updated = True + except OsmApi.ApiError, error: + if error.status == 404: + dct.pop('id') + dct.pop('version') + pass # if the node doesn't exist it is created + else: + raise + if not updated: + node = api.NodeCreate(dct) + item.set_key('OSM', node['id']) + item.import_version = node['version'] + item.save() + api.ChangesetClose() + return idx + 1, None + + +import chardet +import HTMLParser +from BeautifulSoup import BeautifulSoup + + +RE_CLEANS = ((re.compile('(\n)*|^( )*(\n)*( )*|( )*(\n)*( )*$'), ''), + (re.compile(' ( )*'), ' '), + (re.compile(r"""<a href=["'](?!https?)(.*)["']"""), + '<a href="%(base_url)s\\1"'), + ) + +from calendar import TimeEncoding, month_name + + +def get_month_name(month_no, locale): + with TimeEncoding(locale) as encoding: + s = month_name[month_no] + if encoding is not None: + s = s.decode(encoding) + return s + +MONTH_NAMES = {locale: [get_month_name(no_month, locale + '.UTF-8') + for no_month in xrange(1, 13)] for locale in ['fr_FR']} + +try: + UNI_MONTH_NAMES = {locale: [m.decode('utf-8') for m in MONTH_NAMES[locale]] + for locale in MONTH_NAMES} +except UnicodeEncodeError: + UNI_MONTH_NAMES = {locale: [m for m in MONTH_NAMES[locale]] + for locale in MONTH_NAMES} + +DATE_PARSINGS = { + 'fr_FR': [ + re.compile(r'(?P<day1>\d{1,2}) ' + r'(?P<month1>' + '|'.join(UNI_MONTH_NAMES['fr_FR']) + ') ' + r'(?P<year1>\d{4})?[^\d]*' + r'(?P<day2>\d{1,2}) ' + r'(?P<month2>' + '|'.join(UNI_MONTH_NAMES['fr_FR']) + ') *' + r'(?P<year2>\d{4})?.*'), + re.compile(r'(?P<day1>\d{1,2}) ' + r'(?P<month1>' + '|'.join(UNI_MONTH_NAMES['fr_FR']) + ') * ' + r'(?P<year1>\d{4})?')], + 'en': [ + re.compile(r'(?P<year1>\d{4})-' + r'(?P<month1>\d{2})-' + r'(?P<day1>\d{2})' + r'(?:T' + r'(?P<hour1>\d{2})?:' + r'(?P<minut1>\d{2})?:' + r'(?P<second1>\d{2})' + r')?.*' + r'(?P<year2>\d{4})-' + r'(?P<month2>\d{2})-' + r'(?P<day2>\d{2})' + r'(?:T' + r'(?P<hour2>\d{2})?:' + r'(?P<minut2>\d{2})?:' + r'(?P<second2>\d{2})' + r')?.*'), + re.compile(r'(?P<year1>\d{4})-' + r'(?P<month1>\d{2})-' + r'(?P<day1>\d{2})' + r'(?:T' + r'(?P<hour1>\d{2})?:' + r'(?P<minut1>\d{2})?:' + r'(?P<second1>\d{2})' + r')?')], +} + + +def clean_field(value): + return value.strip() + + +class 
HtmlXsltManager(ImportManager): + PARSER = 'HTMLParser' + + def get(self): + u""" + Get data from the source + + Return a tuple with: + - new items; + - updated items; + - error detail on error. + """ + from models import Marker + self.marker_cls = Marker + try: + main_page = urllib2.urlopen(self.importer_instance.source) + assert main_page.getcode() == 200 + except (urllib2.URLError, AssertionError): + return (0, 0, _(u"Source page is unreachable.")) + data = main_page.read() + encoding = chardet.detect(data) + data = data.decode(encoding['encoding']) + + soup = BeautifulSoup(data) + main_page = soup.prettify() + # convert it to valid XHTML + # doc, errors = tidy_document(main_page) + doc = main_page + dom = etree.HTML(doc, getattr(etree, self.PARSER)()) + try: + xslt = etree.parse(self.importer_instance.source_file) + self.importer_instance.source_file.seek(0) + transform = etree.XSLT(xslt) + except (etree.XSLTParseError, etree.XMLSyntaxError, TypeError): + return (0, 0, _(u"The source file is not a valid XSLT file.")) + newdom = transform(dom) + items = [] + # load an alternate xslt file to apply to linked page + transform_child = None + if self.importer_instance.source_file_alt: + try: + alt_xslt = etree.parse(self.importer_instance.source_file_alt) + self.importer_instance.source_file_alt.seek(0) + transform_child = etree.XSLT(alt_xslt) + except (etree.XSLTParseError, etree.XMLSyntaxError, TypeError): + return (0, 0, + _(u"The alt source file is not a valid XSLT file.")) + base_url = u"/".join(self.importer_instance.source.split(u'/')[:-1]) + base_url += u"/" + for item in newdom.getroot(): + c_item = {child.tag: clean_field(child.text) + for child in item.getchildren() if child.text} + # try to have more information on the linked page + if transform_child and 'link' in c_item: + # not an absolute address + if not c_item['link'].startswith('http://') and \ + not c_item['link'].startswith('https://'): + c_item['link'] = base_url + c_item['link'] + try: + child_page = urllib2.urlopen(c_item['link']) + assert child_page.getcode() == 200 + except (urllib2.URLError, AssertionError): + # don't stop the export for a bad link + items.append(c_item) + continue + data = child_page.read() + encoding = chardet.detect(data) + data = data.decode(encoding['encoding']) + child_page = BeautifulSoup(data).prettify() + child_dom = etree.HTML(child_page, etree.HTMLParser()) + extra_keys = transform_child(child_dom).getroot() + if len(extra_keys): + c_item.update({extra.tag: etree.tostring(extra) + for extra in extra_keys[0].getchildren()}) + items.append(c_item) + # change relative link to full link, simplify, unescape HTML entities + html_unescape = HTMLParser.HTMLParser().unescape + for item in items: + for k in item: + val = item[k] + for r, replaced in RE_CLEANS: + val = re.sub(r, replaced % {'base_url': base_url}, val) + item[k] = html_unescape(val) + self.key_categories = self.importer_instance.get_key_category_dict() + self.missing_cats = set() + self.updated_item, self.new_item = 0, 0 + for item in items: + self.add_dct_item(item) + msg = '' + if self.missing_cats: + msg = _( + u"Names \"%s\" doesn't match existing categories. 
" + u"Modify the import to match theses names with categories.") %\ + (u'", "'.join(self.missing_cats)) + return (self.new_item, self.updated_item, msg) + + @classmethod + def _internal_parse_date(cls, locale, year, month, day): + try: + year = datetime.date.today().year if not year else int(year) + except ValueError: + return + month = month.encode('utf-8') + if locale in MONTH_NAMES and month in MONTH_NAMES[locale]: + month = MONTH_NAMES[locale].index(month) + 1 + else: + try: + month = int(month) + except ValueError: + return + try: + day = int(day) + except ValueError: + return + try: + return datetime.date(year, month, day) + except ValueError: + return + + def parse_date(self, date): + dct = {} + has_dates = False + for locale in DATE_PARSINGS: + if has_dates: + break + for r in DATE_PARSINGS[locale]: + m = r.search(date) + if not m: + continue + values = m.groupdict() + date = self._internal_parse_date( + locale, 'year1' in values and values['year1'], + values['month1'], values['day1']) + if not date: + continue + dct['start_date'] = date + has_dates = True + if 'day2' not in values: + break + date = self._internal_parse_date( + locale, 'year2' in values and values['year2'], + values['month2'], values['day2']) + if date: + dct['end_date'] = date + break + return dct + + def add_dct_item(self, item): + if not self.importer_instance.default_localisation and \ + "point" not in item and not ("lat" in item and item['lat']): + return + cls = None + origin = self.importer_instance.origin + origin_lnk = item.get('link') + if origin_lnk: + origin = u"<a href='%s' target='_blank'>%s</a>" % ( + origin_lnk, origin) + dct = { + 'origin': origin, + 'license': self.importer_instance.license, + 'name': item['name']} + category = None + if 'category' in item and item['category']: + if item['category'] in self.key_categories: + category = self.key_categories[item['category']] + else: + self.missing_cats.add(item['category']) + cls = self.marker_cls + if 'point' in item: + x, y = item['point'].split(",") + dct['point'] = 'SRID=4326;POINT(%s %s)' % (x, y) + elif 'lat' in item and item['lat']: + dct['point'] = 'SRID=4326;POINT(%s %s)' % (item['lon'], + item['lat']) + else: + dct['point'] = self.importer_instance.default_localisation + dct['description'] = item.get('description', '') + if 'date' in item: + dct.update(self.parse_date(item['date'])) + if "start_date" in item and item["start_date"]: + dct['start_date'] = item["start_date"] + if "end_date" in item and item["end_date"]: + dct['end_date'] = item["end_date"] + key = item['key'] + it, updated, created = self.create_or_update_item(cls, dct, key, + category=category) + if updated: + self.updated_item += 1 + if created: + self.new_item += 1 + + +class XMLXsltManager(HtmlXsltManager): + PARSER = 'XMLParser' + +import icalendar + + +class IcalManager(ImportManager): + def get(self): + u""" + Get data from an icalendar source + """ + from models import Marker + new_item, updated_item, msg = 0, 0, '' + source, msg = self.get_source_file([]) + if msg: + return (0, 0, msg) + + data = source.read() + try: + cal = icalendar.Calendar.from_ical(data) + except ValueError as e: + return (new_item, updated_item, + _(u"Error on icalendar parsing: " + e.message)) + + default_dct = {'origin': self.importer_instance.origin, + 'license': self.importer_instance.license} + if self.importer_instance.default_localisation: + default_dct['point'] = self.importer_instance.default_localisation + + for event in cal.walk('VEVENT'): + dct = default_dct.copy() + dct['name'] = 
event.get('SUMMARY', '') + if dct['name']: + dct['name'] = unicode(dct['name']) + dct['description'] = event.get('DESCRIPTION', '') + if dct['description']: + dct['description'] = unicode(dct['description']) + loc = event.get('LOCATION', None) + if loc: + dct['description'] += u"<br/>{}".format(unicode(loc)) + url = event.get('URL', None) + if url: + dct['description'] += u"<br/><a href='{}'>{}</a>".format( + unicode(url), unicode(_(u'Link'))) + dct['start_date'] = event.get('DTSTART', None) + if dct['start_date']: + dct['start_date'] = event.decoded('DTSTART') + dct['end_date'] = event.get('DTEND', None) + if dct['end_date']: + dct['end_date'] = event.decoded('DTEND') + point = event.get('GEO', None) + if point: + dct['point'] = 'SRID=4326;POINT(%s %s)' % (point.longitude, + point.latitude) + + if not dct.get('point', None): + continue + + cls = Marker + pl_id = event.get('UID', None) + if not pl_id: + pl_id = dct['name'] + "-" + unicode(self.importer_instance.pk) + pl_id += "-" + unicode(self.importer_instance.pk) + it, updated, created = self.create_or_update_item(cls, dct, pl_id) + if updated: + updated_item += 1 + if created: + new_item += 1 + return (new_item, updated_item, msg) |
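A minimal usage sketch of the managers above, assuming an Importer model instance that exposes the attributes read by ImportManager (source, source_file, filtr, categories, overwrite, ...); the run_import helper is hypothetical:

    from chimere.utils import KMLManager

    def run_import(importer_instance):
        # Each manager wraps an importer instance; get() fetches the
        # source and returns (new items, updated items, error message).
        manager = KMLManager(importer_instance)
        new_items, updated_items, msg = manager.get()
        if msg:
            return u"Import failed: %s" % msg
        return u"%d item(s) created, %d item(s) updated" % (
            new_items, updated_items)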