#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright (C) 2012-2017 Étienne Loks
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# See the file COPYING for details.

"""
Utilitaries
"""

from copy import deepcopy
import csv
import collections
import datetime
import feedparser
import html
import io
import json
import os
from tidylib import tidy_document
import re
import tempfile
# explicit submodule imports: urllib.request/urllib.error are used below and
# a bare "import urllib" does not reliably bring them into scope
import urllib.error
import urllib.request
import unicodedata
import zipfile

from osgeo import ogr, osr
from osmapi import OsmApi
from lxml import etree

from django.conf import settings
from django.contrib.gis.gdal import DataSource, OGRGeomType, check_err
from django.contrib.gis.geos import GEOSGeometry
from django.core.exceptions import ObjectDoesNotExist
from django.shortcuts import render_to_response
from django.utils.translation import ugettext_lazy as _

from chimere import get_version


def unicode_normalize(string):
    """Return ``string`` decomposed (NFD) with combining marks and
    math/currency symbols stripped — used to build safe file names."""
    return ''.join(
        (c for c in unicodedata.normalize('NFD', string)
         if unicodedata.category(c) not in ('Mn', 'Sm', 'Sc')))


class ImportManager(object):
    """
    Generic class for specific importers
    """
    # default URL/path used when the importer instance provides no source
    default_source = None

    def __init__(self, importer_instance):
        self.importer_instance = importer_instance
        if self.importer_instance.default_name:
            self.default_name = self.importer_instance.default_name
        else:
            # fallback name: the sorted category names joined together
            self.default_name = " - ".join([
                cat.name
                for cat in self.importer_instance.categories.order_by(
                    'name').all()])

    def get(self):
        raise NotImplementedError

    def put(self, extra_args={}):
        raise NotImplementedError

    def create_or_update_item(self, cls, values, import_key, version=None,
                              key='', pk=None, category=None):
        """
        Create or update a geographic item from imported ``values``.

        :param cls: model class to instantiate (Marker, Route, ...)
        :param values: field values; property-model slugs are split out and
                       applied separately via ``setProperty``
        :param import_key: external identifier of the item in the source
        :param version: source version number; when unchanged, the item is
                        left untouched
        :param key: importer type key (defaults to the importer's type)
        :param pk: explicit primary key of an existing item to update
        :param category: category to associate instead of the importer's ones
        :return: (item, updated, created); (item, None, None) means
                 "skipped" (already up to date or locally modified),
                 (None, False, False) means a data-source error
        """
        from chimere.models import PropertyModel
        updated, created, item = False, False, None
        # ':' is the key/value separator in stored import keys
        import_key = str(import_key).replace(':', '^')
        if not values.get('name'):
            values['name'] = self.default_name
        if not key:
            key = self.importer_instance.importer_type
        item = None
        pms = [pm["slug"] for pm in PropertyModel.objects.values('slug').all()]
        properties = {}
        # iterate over a copy of the keys: values is mutated by pop() below
        for k in list(values.keys()):
            if k in pms:
                properties[k] = values.pop(k)
        if import_key or pk:
            dct_import = {
                'import_key__icontains': '%s:%s;' % (key, import_key),
                'import_source': self.importer_instance.source}
            try:
                item = None
                if pk:
                    ref_item = cls.objects.get(pk=pk)
                else:
                    ref_item = cls.objects.filter(**dct_import)
                    if not ref_item.count():
                        raise ObjectDoesNotExist
                    ref_item = ref_item.all()[0]
                if version and ref_item.import_version == int(version):
                    # no update since the last import
                    return ref_item, None, None
                if not self.importer_instance.overwrite \
                        and ref_item.modified_since_import:
                    # item edited locally: do not clobber it
                    return ref_item, None, None
                else:
                    item = ref_item
                    for k in values:
                        if values[k]:
                            setattr(item, k, values[k])
                    try:
                        item.save()
                        # force the modified_since_import status
                        item.modified_since_import = False
                        item.save()
                    except TypeError:
                        # error on data source
                        return None, False, False
                    updated = True
            except ObjectDoesNotExist:
                pass
        if not item:
            if not self.importer_instance.get_description and \
                    self.importer_instance.default_description:
                values['description'] = \
                    self.importer_instance.default_description
            values.update({
                'import_source': self.importer_instance.source})
            values['status'] = self.importer_instance.default_status
            # single guarded creation (an extra unguarded duplicate of this
            # create call has been removed — it created every item twice)
            try:
                item = cls.objects.create(**values)
                item.modified_since_import = False
                item.save()
            except TypeError:
                # error on data source
                return None, False, False
            created = True
        # (re)attach import key, categories and extra properties
        if import_key:
            item.set_key(key, import_key)
        item.categories.clear()
        if category:
            item.categories.add(category)
        else:
            for cat in self.importer_instance.categories.all():
                item.categories.add(cat)
        for prop in properties:
            item.setProperty(prop, properties[prop])
        return item, updated, created

    @classmethod
    def get_files_inside_zip(cls, zippedfile, suffixes, dest_dir=None):
        """
        Extract (or open) from ``zippedfile`` one member per entry of
        ``suffixes``, matching the suffix case-insensitively.

        :param zippedfile: path or file object of the zip archive
        :param suffixes: list of file suffixes to look for (e.g. ['.shp'])
        :param dest_dir: when set, members are extracted there and their
                         names are returned; otherwise open file objects
                         are returned
        :return: list aligned with ``suffixes`` (None for missing members).
                 NOTE(review): on a bad archive this returns a 2-tuple
                 ``([], message)`` instead of a plain list — callers rely on
                 the truthiness quirks of that tuple; confirm before changing.
        """
        try:
            flz = zipfile.ZipFile(zippedfile)
        except zipfile.BadZipfile:
            return [], _("Bad zip file")
        namelist = flz.namelist()
        filenames = []
        for suffix in suffixes:
            current_file_name = None
            for name in namelist:
                # last matching member wins
                if name.endswith(suffix) \
                        or name.endswith(suffix.lower()) \
                        or name.endswith(suffix.upper()):
                    current_file_name = name
            filenames.append(current_file_name)
        files = []
        for filename in filenames:
            if filename:
                if dest_dir:
                    files.append(filename)
                    flz.extract(filename, dest_dir)
                else:
                    files.append(flz.open(filename))
            else:
                files.append(None)
        return files

    def get_source_file(self, suffixes, dest_dir=None, extra_url=None):
        """
        Resolve the importer's source into readable file object(s).

        Tries, in order: the uploaded source file, the source URL (plus
        ``extra_url`` when given), then a local path. Zipped sources are
        unpacked according to ``suffixes``.

        :return: tuple (source, error_message); ``source`` is a single file
                 object/name when one suffix is requested, else a list
        """
        source = self.importer_instance.source_file
        try:
            # accessing .read on an empty Django FieldFile raises ValueError
            source.read
        except ValueError:
            if not source:
                source = self.importer_instance.source \
                    if self.importer_instance.source else self.default_source
            try:
                url = source
                if extra_url:
                    url += extra_url
                remotehandle = urllib.request.urlopen(url)
                source = io.BytesIO(remotehandle.read())
                remotehandle.close()
            except ValueError:
                # assume it is a local file
                try:
                    # NOTE(review): opened in text mode while the URL branch
                    # yields bytes (BytesIO) — downstream readers must cope
                    # with both; confirm intended
                    source = open(source)
                except IOError as msg:
                    return (None, msg)
            except (urllib.error.URLError, AttributeError) as error:
                return (None, str(error))
        if self.importer_instance.zipped:
            try:
                files = self.get_files_inside_zip(
                    self.importer_instance.source_file
                    or self.importer_instance.source, suffixes, dest_dir)
            except zipfile.BadZipfile:
                return (None, _("Bad zip file"))
            if not files or None in files or [] in files:
                return (None, _("Missing file(s) inside the zip file"))
            source = files[0] if len(suffixes) == 1 else files
        return (source, None)


class KMLManager(ImportManager):
    """
KML importer The filtr argument has to be defined as the exact name of the folder to be imported """ XPATH = '//kml:Folder/kml:name[text()="%s"]/../kml:Placemark' DEFAULT_XPATH = '//kml:Placemark' def __init__(self, importer_instance, ns=''): super(KMLManager, self).__init__(importer_instance) self.ns = ns def get(self): """ Get data from a KML source Return a tuple with: - number of new item ; - number of item updated ; - error detail on error """ from chimere.models import Marker, Route new_item, updated_item, msg = 0, 0, '' source, msg = self.get_source_file(['.kml']) if msg: return (0, 0, msg) doc = source # remove empty lines before declaration (bad XML file) if hasattr(source, 'getvalue'): splitted = source.getvalue().decode('utf-8').split('\n') for idx, line in enumerate(splitted): if line.strip(): break doc = io.BytesIO("\n".join(splitted[idx:]).encode('utf-8')) try: tree = etree.parse(doc) except: return (0, 0, _("Bad XML file")) # try to get default namespace if not self.ns: self.ns = tree.getroot().nsmap[None] xpath = self.XPATH % self.importer_instance.filtr \ if self.importer_instance.filtr else self.DEFAULT_XPATH for placemark in tree.xpath(xpath, namespaces={'kml': self.ns}): name, point, line = None, None, None pl_id = placemark.attrib.get('id') pl_key = 'kml-%d' % self.importer_instance.pk ns = '{%s}' % self.ns description = '' for item in placemark: if item.tag == ns + 'name': name = item.text if not pl_id: # if no ID is provided assume that name is a key pl_id = name elif item.tag == ns + 'description': if self.importer_instance.get_description: description = item.text elif item.tag == ns + 'Point': for coord in item: if coord.tag == ns + 'coordinates': x, y, z = coord.text.split(',') point = 'SRID=4326;POINT(%s %s)' % (x, y) elif item.tag == ns + 'LineString': for coord in item: if coord.tag == ns + 'coordinates': points = coord.text.replace('\n', ' ').split(' ') points = ",".join([" ".join(p.split(',')[:2]) for p in points if p]) line = 
'SRID=4326;LINESTRING(%s)' % points cls = None dct = {'description': description, 'name': name, 'origin': self.importer_instance.origin, 'license': self.importer_instance.license} if point: dct['point'] = point cls = Marker if line: dct['route'] = line dct.pop('description') cls = Route if cls: item, updated, created = self.create_or_update_item( cls, dct, pl_id, key=pl_key) if updated: updated_item += 1 if created: new_item += 1 return (new_item, updated_item, msg) @classmethod def export(cls, queryset): dct = { 'name': settings.PROJECT_NAME, 'description': str(datetime.date.today()), 'locations': queryset.all() } filename = unicode_normalize(settings.PROJECT_NAME + dct['description'] + '.kml') result = render_to_response('chimere/export.kml', dct) return filename, result class ShapefileManager(ImportManager): """ Shapefile importer """ def get(self): """ Get data from a Shapefile source Return a tuple with: - number of new item ; - number of item updated ; - error detail on error The filtr argument allow to specify match between the shapefile cols and the db. JSON format is used. """ from chimere.models import Marker, Route, Polygon new_item, updated_item, msg = 0, 0, '' tmpdir = tempfile.mkdtemp() res = self.get_source_file(['.shp', '.dbf', '.prj', '.shx'], dest_dir=tmpdir) sources, msg = self.get_source_file(['.shp', '.dbf', '.prj', '.shx'], dest_dir=tmpdir) if msg: return (0, 0, msg) if not sources: return (0, 0, _("Error while reading the data source.")) # get the srid srid = self.importer_instance.srid if not srid: prjfilename = tmpdir + os.sep + sources[2] try: from osgeo import osr with open(prjfilename, 'r') as prj_file: prj_txt = prj_file.read() srs = osr.SpatialReference() srs.ImportFromESRI([prj_txt]) srs.AutoIdentifyEPSG() srid = srs.GetAuthorityCode(None) except ImportError: pass if not srid: # try with the default projection srid = settings.CHIMERE_EPSG_DISPLAY_PROJECTION msg = _("SRID cannot be guessed. 
The default SRID (%s) has " "been used.") % srid # If imported items are not well located " # "ask your data provider for the SRID to use.") % srid shapefilename = tmpdir + os.sep + sources[0] ds = DataSource(shapefilename) lyr = ds[0] default_dct = {} filtr = self.importer_instance.filtr or {} if filtr: try: filtr = json.JSONDecoder().decode(self.importer_instance.filtr) except ValueError: return ( new_item, updated_item, _("Bad configuration: filter must be a valid " "JSON string")) for k in ('id',): if k not in filtr: return ( new_item, updated_item, _("The key \"%s\" is missing in the " "filter.") % k) for k in filtr: try: ids = lyr.get_fields(k) except: return ( new_item, updated_item, _("Config: {} is not an appropriate column name " "for this Shapefile. Available columns " " are: {}").format(k, ", ".join( [j for j in lyr.fields]))) default_dct = {'origin': self.importer_instance.origin, 'license': self.importer_instance.license} if 'prefix_name' in filtr: default_dct['name'] = filtr.pop('prefix_name') if 'prefix_description' in filtr: default_dct['description'] = filtr.pop('prefix_description') else: # if no filtr it is assumed that the first field is a # id name and the second field is the name id_name = lyr.fields[0] if len(lyr.fields) > 0 else None # test if id_name is well guess if id_name: ids = lyr.get_fields(id_name) if len(ids) != len(set(ids)): id_name = None filtr['id'] = id_name if len(lyr.fields) > 1: filtr["name"] = lyr.fields[1] elif id_name: filtr["name"] = id_name if lyr.geom_type not in ('Point', 'LineString', 'Polygon'): return (0, 0, _("Type of geographic item (%s) of this shapefile " "is not managed by Chimère.") % lyr.geom_type) geom_key = '' geom_cls = None if lyr.geom_type == 'Point': geom_key = 'point' geom_cls = Marker elif lyr.geom_type == 'Polygon': geom_key = 'polygon' geom_cls = Polygon else: geom_key = 'route' geom_cls = Route # indexes = [] for idx, feat in enumerate(lyr): dct = default_dct.copy() for k in filtr: val = 
feat.get(k) try: val = str(val) except UnicodeDecodeError: try: val = str( val.decode(settings.CHIMERE_SHAPEFILE_ENCODING)) except: continue if filtr[k] not in dct: dct[filtr[k]] = '' dct[filtr[k]] += val try: geoms = [feat.geom.wkt] except: return (0, 0, _("Bad Shapefile")) if feat.geom.geom_type == 'MultiLineString': geoms = [geom.wkt for geom in feat.geom] import_key = dct.pop('id') for geom in geoms: dct[geom_key] = 'SRID=%s;%s' % (srid, geom) item, updated, created = self.create_or_update_item( geom_cls, dct, import_key) if updated: updated_item += 1 if created: new_item += 1 # clean up tmpdirs = set() for src in sources: dirs = os.sep.join(src.split(os.sep)[:-1]) if dirs: tmpdirs.add(tmpdir + os.sep + dirs) os.remove(tmpdir + os.sep + src) for dr in tmpdirs: os.removedirs(dr) return (new_item, updated_item, msg) @classmethod def export(cls, queryset): date = str(datetime.date.today()) tmp = tempfile.NamedTemporaryFile(suffix='.shp', mode='w+b') tmp.close() tmp_name = tmp.name field_names = [field.name for field in queryset.model._meta.fields] geo_field = getattr( queryset.model, 'point' if 'point' in field_names else 'route')._field dr = ogr.GetDriverByName('ESRI Shapefile') ds = dr.CreateDataSource(tmp_name) if ds is None: raise Exception(_('Could not create file!')) ogr_type = OGRGeomType(geo_field.geom_type).num srs = osr.SpatialReference() srs.ImportFromEPSG(geo_field.srid) layer = ds.CreateLayer('lyr', srs=srs, geom_type=ogr_type) for field_name in ('name', 'category'): field_defn = ogr.FieldDefn(str(field_name), ogr.OFTString) field_defn.SetWidth(255) if layer.CreateField(field_defn) != 0: raise Exception(_('Failed to create field')) feature_def = layer.GetLayerDefn() for item in queryset: # duplicate items when in several categories q = item.categories if not q.count(): categories = [None] else: categories = q.all() for category in categories: feat = ogr.Feature(feature_def) feat.SetField('name', str(unicode_normalize(item.name)[:80])) if category: 
feat.SetField('category', str(unicode_normalize(category.name)[:80])) geom = getattr(item, geo_field.name) if not geom: continue ogr_geom = ogr.CreateGeometryFromWkt(geom.wkt) check_err(feat.SetGeometry(ogr_geom)) check_err(layer.CreateFeature(feat)) # Cleaning up ds.Destroy() # writing to a zip file filename = unicode_normalize(settings.PROJECT_NAME) + '-' + date buff = io.BytesIO() zip_file = zipfile.ZipFile(buff, 'w', zipfile.ZIP_DEFLATED) suffixes = ['shp', 'shx', 'prj', 'dbf'] for suffix in suffixes: name = tmp_name.replace('.shp', '.' + suffix) arcname = '.'.join((filename, suffix)) zip_file.write(name, arcname=arcname) zip_file.close() buff.flush() zip_stream = buff.getvalue() buff.close() return filename, zip_stream class CSVManager(ImportManager): """ CSV importer """ @classmethod def set_categories(value): return # (label, getter, setter) COLS = [("Id", 'pk', 'pk'), (_("Name"), 'name', 'name'), (_("Categories"), lambda obj: ", ".join( [c.name for c in obj.categories.all()]), set_categories), (_("State"), 'status', lambda x: x), (_("Description"), 'description', 'description'), (_("Localisation"), 'geometry', 'geometry')] def get(self): """ Get data from a CSV source Return a tuple with: - number of new item ; - number of item updated ; - error detail on error """ from chimere.models import Marker, Route new_item, updated_item, msg = 0, 0, '' source, msg = self.get_source_file(['.csv']) if msg: return (0, 0, msg) reader = csv.reader(source, delimiter=';', quotechar='"') prop_cols = [] for pm in Marker.all_properties(): prop_cols.append((pm.name, pm.getAttrName(), pm.getAttrName() + '_set')) cols = list(self.COLS) + prop_cols # datas = [] for idx, row in enumerate(reader): if not idx: # first row try: assert(len(row) >= len(cols)) except AssertionError: return (0, 0, _("Invalid CSV format")) continue if len(row) < len(cols): continue # pk, name, cats, state = row[0], row[1], row[2], row[3] pk, name = row[0], row[1] geom = row[5] description = '' if 
self.importer_instance.get_description: description = row[4] COL_INDEX = 6 dct = {'description': description, 'name': name, 'origin': self.importer_instance.origin, 'license': self.importer_instance.license} cls = None if 'POINT' in geom: cls = Marker dct['point'] = geom elif 'LINE' in geom: cls = Route dct['route'] = geom else: continue import_key = pk if pk else name.decode('utf-8') item, updated, created = self.create_or_update_item( cls, dct, import_key, pk=pk) if updated: updated_item += 1 if created: new_item += 1 for idx, col in enumerate(cols[COL_INDEX:]): name, getter, setter_val = col setter = getattr(item, setter_val) val = row[idx + COL_INDEX] setter(item, val) return (new_item, updated_item, msg) @classmethod def export(cls, queryset): dct = {'description': str(datetime.date.today()), 'data': []} # cls_name = queryset.model.__name__.lower() cols = list(cls.COLS) for pm in queryset.model.all_properties(): cols.append((pm.name, pm.getAttrName(), pm.getAttrName() + '_set')) header = [col[0] for col in cols] dct['data'].append(header) for item in queryset.all(): data = [] for (lbl, attr, setr) in cols: if callable(attr): data.append(attr(item)) else: data.append(getattr(item, attr)) dct['data'].append(data) filename = unicode_normalize(settings.PROJECT_NAME + dct['description'] + '.csv') result = render_to_response('chimere/export.csv', dct) return filename, result class GeoRSSManager(ImportManager): """ RSS importer. 
This manager only gets and do not produce GeoRSSFeed """ def get(self): """ Get data from a GeoRSS simple source Return a tuple with: - number of new item ; - number of item updated ; - error detail on error """ from chimere.models import Marker, Route new_item, updated_item, msg = 0, 0, '' feed = feedparser.parse(self.importer_instance.source) if feed['bozo'] and not isinstance( feed['bozo_exception'], feedparser.CharacterEncodingOverride): return (0, 0, _("RSS feed is not well formed")) # differ with feed parser version item_key = 'items' if 'entries' in feed: item_key = 'entries' for item in feed[item_key]: if 'where' not in item and "georss_point" not in item \ and 'georss_line' not in item \ and not ("geo_lat" in item and "geo_long" in item): continue cls = None dct = {'origin': self.importer_instance.origin, 'license': self.importer_instance.license} if "where" in item and 'coordinates' in item['where']: coord = item['where']['coordinates'] if item['where']['type'] == 'Point': cls = Marker dct['point'] = 'SRID=4326;POINT(%s %s)' % ( coord[0], coord[1]) elif item['where']['type'] == 'LineString': cls = Route dct['route'] = 'SRID=4326;LINESTRING(%s)' % ( ",".join(["{} {}".format(c[0], c[1]) for c in coord])) else: continue elif 'georss_point' in item or "geo_lat" in item: cls = Marker if 'georss_point' in item: try: y, x = item['georss_point'].split(' ') except ValueError: continue else: y = item['geo_lat'] x = item['geo_long'] dct['point'] = 'SRID=4326;POINT(%s %s)' % (x, y) elif "georss_line" in item: cls = Route points = item['georss_line'].split(' ') reordered_points = [] # lat, lon -> x, y for idx in range(int(len(points) / 2)): reordered_points.append("%s %s" % (points[idx * 2 + 1], points[idx * 2])) dct['route'] = 'SRID=4326;LINESTRING(%s)' % \ ",".join(reordered_points) else: continue if self.importer_instance.get_description: for k in ['description', 'summary', 'value']: if k in item: dct['description'] = item[k] break dct['name'] = item['title'] pl_id 
= item['id'] if 'id' in item else item['title'] it, updated, created = self.create_or_update_item(cls, dct, pl_id) if updated: updated_item += 1 if created: new_item += 1 return (new_item, updated_item, msg) class JsonManager(ImportManager): """ Json importer. This manager only gets and do not produce Json feed """ def extract_dict_values(self, item, filtr): """ Extract values from a dict. :param item: the source dictionary :param filtr: the filter, a dictionary that contains keys or dictionary, each dictionary is parsed for each values :return: an iterator giving tuple of final keys and values. example: item = {'comment': {'fr': "Commentaire", 'en': "Comment"}, 'latitude': 1.0, 'longitude': -1.0} filtr = {'comment': {'fr': "description"}, 'latitude': 'y', 'longitude': 'x'} print(list(extract_dict_values(item, filtr))) [("description", "Commentaire"), ("y", 1.0), ("x", -1.0)] """ for k in filtr: if k not in item: continue if not isinstance(filtr[k], dict): yield filtr[k], item[k] continue for key, value in self.extract_dict_values(item[k], filtr[k]): yield key, value def get(self): """ Get data from a json simple source Return a tuple with: - number of new item ; - number of item updated ; - error detail on error """ from chimere.models import Marker new_item, updated_item, msg = 0, 0, '' source, msg = self.get_source_file(['.json']) if msg: return (0, 0, msg) vals = source.read().decode("utf-8").replace('\n', ' ') try: values = json.JSONDecoder( object_pairs_hook=collections.OrderedDict).decode(vals) except ValueError as e: return (new_item, updated_item, _("JSON file is not well formed: ") + str(e)) filtr = self.importer_instance.filtr # a left part before "{" indicate keys to be used to access to the # event list - separated by ";" left_part = filtr.split('{')[0] if left_part: filtr = filtr[len(left_part):] for key in left_part.split(';'): if key not in values: return ( new_item, updated_item, _("Bad filter configuration a key doesn't " "match with json source: 
") + key) values = values[key] # configuration in filtr try: filtr = json.JSONDecoder().decode(filtr) except ValueError: return ( new_item, updated_item, _("Bad configuration: filter field must be a valid " "JSON string")) # check that mandatory fields are available vls = [] cvalues = filtr.copy() while cvalues: new_values = {} for idx, val in enumerate(cvalues.values()): if isinstance(val, dict): for k in val: new_values["{}-{}".format(idx, k)] = val[k] else: vls.append(val) cvalues = new_values for k in ('name', 'id', 'description'): if k not in vls: return ( new_item, updated_item, _("A key must be associated to \"%s\" in the " "filter.") % k) default_dct = {'origin': self.importer_instance.origin, 'license': self.importer_instance.license, 'description': ""} if 'prefix_name' in filtr: default_dct['name'] = filtr.pop('prefix_name') if 'prefix_description' in filtr: default_dct['description'] = filtr.pop('prefix_description') if self.importer_instance.default_localisation: default_dct['point'] = self.importer_instance.default_localisation for item in values: dct = default_dct.copy() for key, value in self.extract_dict_values(item, filtr): """ for k in filtr: """ if key.startswith('prefix_') or key.startswith('suffix_'): continue if key == 'external_image': value = ''.format(value) if key not in dct: dct[key] = "" else: if key == 'description': dct[key] += "
" else: dct[key] += " " dct[key] += str(value) if value else "" if 'point' in dct and isinstance(dct['point'], str): x, y = dct['point'].split(",") dct['point'] = 'SRID=4326;POINT(%s %s)' % (x, y) elif 'lat' in dct and dct['lat'] \ and 'lon' in dct and dct['lon']: dct['point'] = 'SRID=4326;POINT(%s %s)' % (dct.pop('lon'), dct.pop('lat')) elif 'x' in dct and dct['x'] \ and 'y' in dct and dct['y']: dct['point'] = 'SRID=4326;POINT(%s %s)' % (dct['x'], dct['y']) if not dct['point']: continue # manage prefixes and suffixes for k in filtr: if k.startswith('prefix_') or k.startswith('suffix_'): pos = k.split('_')[0] key = '_'.join(k.split('_')[1:]) if key in dct: if pos == 'prefix': dct[key] = filtr[k] + dct[key] else: dct[key] += filtr[k] if 'external_image' in dct: dct['description'] = \ dct.pop('external_image') + dct['description'] cls = Marker pl_id = (dct.pop('id') if 'id' in dct else dct['name']) \ + "-" + str(self.importer_instance.pk) it, updated, created = self.create_or_update_item(cls, dct, pl_id) if updated: updated_item += 1 if created: new_item += 1 return new_item, updated_item, msg RE_HOOK = re.compile('\[([^\]]*)\]') # TODO: manage deleted item from OSM class OSMManager(ImportManager): """ OSM importer/exporter The source url is a path to an OSM file or a XAPI url The filtr argument is XAPI args or empty if it is an OSM file. """ default_source = settings.CHIMERE_XAPI_URL def get(self): """ Get data from the source Return a tuple with: - new items; - updated items; - error detail on error. 
""" source, msg = self.get_source_file( ['.osm'], extra_url=self.importer_instance.filtr) if not source: return (0, 0, msg) tree = etree.parse(source) # only import node or ways if tree.xpath('count(//way)') and tree.xpath('count(//node)'): return self.import_ways(tree) elif tree.xpath('count(//node)'): return self.import_nodes(tree) return 0, 0, _("Nothing to import") def import_ways(self, tree): from chimere.models import Route msg, items, new_item, updated_item = "", [], 0, 0 nodes = {} for node in tree.xpath('//node'): node_id = node.attrib.get('id') for item in node: k = item.attrib.get('k') if node_id: nodes[node_id] = '%s %s' % (node.get('lon'), node.get('lat')) for way in tree.xpath('//way'): name = None points = [] node_id = way.attrib.get('id') version = way.attrib.get('version') for item in way: k = item.attrib.get('k') if k == 'name': name = item.attrib.get('v') if item.tag == 'nd': points.append(item.get('ref')) if not points: continue wkt = 'SRID=4326;LINESTRING(%s)' % ",".join( [nodes[point_id] for point_id in points if point_id in nodes]) dct = {'route': wkt, 'name': name, 'origin': self.importer_instance.origin or 'OpenStreetMap.org', 'license': self.importer_instance.license or 'ODbL', 'import_version': version} item, updated, created = self.create_or_update_item( Route, dct, node_id, version) if updated: updated_item += 1 if created: new_item += 1 items.append(item) return new_item, updated_item, msg def import_nodes(self, tree): from chimere.models import Marker msg, items, new_item, updated_item = "", [], 0, 0 for node in tree.xpath('//node'): name = None node_id = node.attrib.get('id') if not node_id: continue version = node.attrib.get('version') for item in node: k = item.attrib.get('k') if k == 'name': name = item.attrib.get('v') point = 'SRID=4326;POINT(%s %s)' % (node.get('lon'), node.get('lat')) dct = {'point': point, 'name': name, 'origin': self.importer_instance.origin or 'OpenStreetMap.org', 'license': self.importer_instance.license or 
'ODbL', 'import_version': version} item, updated, created = self.create_or_update_item( Marker, dct, node_id, version) if updated: updated_item += 1 if created: new_item += 1 items.append(item) return (new_item, updated_item, msg) def put(self, extra_args={}): # first of all: reimport in order to verify that no changes has been # made since the last import from chimere.models import Marker new_item, updated_item, msg = self.get() # check if import is possible if msg: return 0, msg if new_item: return 0, _("New items imported - validate them before exporting") if Marker.objects.filter(status='I').count(): return 0, _("There are items from a former import not yet " "validated - validate them before exporting") # start import api = settings.CHIMERE_OSM_API_URL username = settings.CHIMERE_OSM_USER password = settings.CHIMERE_OSM_PASSWORD if extra_args: try: api = extra_args['api'] username = extra_args['username'] password = extra_args['password'] except KeyError: return 0, _("Bad params - programming error") username = username.encode('latin1') password = password.encode('latin1') api = OsmApi.OsmApi(api=api, username=username, password=password) api.ChangesetCreate({"comment": "Import from Chimère %s" % get_version()}) hooks = RE_HOOK.findall(self.importer_instance.filtr) if not hooks: hooks = RE_HOOK.findall(self.importer_instance.source) if not hooks: return 0, _("Bad param") tags = {} bbox = [] for hook in hooks: key, value = hook.split('=') if '*' in value or '|' in key or '|' in value: continue if key == 'bbox': x1, y1, x2, y2 = [float(val) for val in value.split(',')] bbox = GEOSGeometry( 'POLYGON((%f %f,%f %f,%f %f,%f %f,%f %f))' % ( x1, y1, x2, y1, x2, y2, x1, y2, x1, y1), srid=4326) continue tags[key] = value if not tags: return 0, _("No non ambigious tag is defined in the XAPI request") if not bbox: return 0, _( "No bounding box is defined in the XAPI request." 
"If you are sure to manage the entire planet set the " "bounding box to -180,-90,180,90") default_dct = {'tag': tags, 'import_source': self.importer_instance.source} idx = -1 for idx, item in enumerate( Marker.objects.filter( status='A', point__contained=bbox, categories=self.importer_instance.categories.all(), not_for_osm=False, modified_since_import=True, route=None).all()): dct = default_dct.copy() dct.update({'lon': item.point.x, 'lat': item.point.y}) dct['tag']['name'] = item.name node = None import_key = item.get_key('OSM') updated = False if import_key: try: dct['id'] = import_key dct['version'] = item.import_version node = api.NodeUpdate(dct) updated = True except OsmApi.ApiError as error: if error.status == 404: dct.pop('id') dct.pop('version') pass # if the node doesn't exist it is created else: raise if not updated: node = api.NodeCreate(dct) item.set_key('OSM', node['id']) item.import_version = node['version'] item.save() api.ChangesetClose() return idx + 1, None import chardet from html.parser import HTMLParser from bs4 import BeautifulSoup RE_CLEANS = ((re.compile('(\n)*|^( )*(\n)*( )*|( )*(\n)*( )*$'), ''), (re.compile(' ( )*'), ' '), (re.compile(r"""\d{1,2}) ' r'(?P' + '|'.join(UNI_MONTH_NAMES['fr_FR']) + ') ' r'(?P\d{4})?[^\d]*' r'(?P\d{1,2}) ' r'(?P' + '|'.join(UNI_MONTH_NAMES['fr_FR']) + ') *' r'(?P\d{4})?.*'), re.compile(r'(?P\d{1,2}) ' r'(?P' + '|'.join(UNI_MONTH_NAMES['fr_FR']) + ') * ' r'(?P\d{4})?')], 'en': [ re.compile(r'(?P\d{4})-' r'(?P\d{2})-' r'(?P\d{2})' r'(?:T' r'(?P\d{2})?:' r'(?P\d{2})?:' r'(?P\d{2})' r')?.*' r'(?P\d{4})-' r'(?P\d{2})-' r'(?P\d{2})' r'(?:T' r'(?P\d{2})?:' r'(?P\d{2})?:' r'(?P\d{2})' r')?.*'), re.compile(r'(?P\d{4})-' r'(?P\d{2})-' r'(?P\d{2})' r'(?:T' r'(?P\d{2})?:' r'(?P\d{2})?:' r'(?P\d{2})' r')?')], } def clean_field(value): return value.strip() class HtmlXsltManager(ImportManager): PARSER = 'HTMLParser' def get(self): """ Get data from the source Return a tuple with: - new items; - updated items; - error detail 
        on error.
        """
        from chimere.models import Marker
        self.marker_cls = Marker
        try:
            main_page = urllib.request.urlopen(self.importer_instance.source,
                                               timeout=20)
            assert main_page.getcode() == 200
        except (urllib.error.URLError, AssertionError):
            return (0, 0, _("Source page is unreachable."))
        data = main_page.read()
        # guess the page encoding from the raw bytes
        encoding = chardet.detect(data)
        data = data.decode(encoding['encoding'])
        if 'HTML' in self.PARSER:
            soup = BeautifulSoup(data)
            main_page = soup.prettify()
            # convert it to valid XHTML
            doc, errors = tidy_document(main_page)
            dom = etree.HTML(doc, getattr(etree, self.PARSER)())
        else:
            main_page = data
            dom = etree.XML(main_page.encode('utf-8'),
                            getattr(etree, self.PARSER)())
        try:
            xslt = etree.parse(self.importer_instance.source_file)
            # rewind so the file can be re-read on a later run
            self.importer_instance.source_file.seek(0)
            transform = etree.XSLT(xslt)
        except (etree.XSLTParseError, etree.XMLSyntaxError, TypeError):
            return (0, 0, _("The source file is not a valid XSLT file."))
        newdom = transform(dom)
        items = []
        # load an alternate xslt file to apply to linked page
        transform_child = None
        if self.importer_instance.source_file_alt:
            try:
                alt_xslt = etree.parse(self.importer_instance.source_file_alt)
                self.importer_instance.source_file_alt.seek(0)
                transform_child = etree.XSLT(alt_xslt)
            except (etree.XSLTParseError, etree.XMLSyntaxError, TypeError):
                return (0, 0,
                        _("The alt source file is not a valid XSLT file."))
        base_url = "/".join(self.importer_instance.source.split('/')[:-1])
        base_url += "/"
        for item in newdom.getroot():
            c_item = {child.tag: clean_field(child.text)
                      for child in item.getchildren() if child.text}
            # try to have more information on the linked page
            if transform_child and 'link' in c_item:
                # not an absolute address
                if not c_item['link'].startswith('http://') and \
                        not c_item['link'].startswith('https://'):
                    c_item['link'] = base_url + c_item['link']
                try:
                    child_page = urllib.request.urlopen(c_item['link'])
                    assert child_page.getcode() == 200
                except (urllib.error.URLError, AssertionError):
                    # don't stop the export for a bad link
                    items.append(c_item)
                    continue
                data = child_page.read()
                encoding = chardet.detect(data)
                data = data.decode(encoding['encoding'])
                child_page = BeautifulSoup(data).prettify()
                child_dom = etree.HTML(child_page, etree.HTMLParser())
                extra_keys = transform_child(child_dom).getroot()
                if len(extra_keys):
                    c_item.update({extra.tag: etree.tostring(extra)
                                   for extra in extra_keys[0].getchildren()})
            items.append(c_item)
        # change relative link to full link, simplify, unescape HTML entities
        html_unescape = html.unescape
        for item in items:
            for k in item:
                val = item[k]
                if type(val) == bytes:
                    val = val.decode('utf-8')
                # RE_CLEANS replacements may embed %(base_url)s
                for r, replaced in RE_CLEANS:
                    val = re.sub(r, replaced % {'base_url': base_url}, val)
                item[k] = html_unescape(val)
        self.key_categories = self.importer_instance.get_key_category_dict()
        self.missing_cats = set()
        self.updated_item, self.new_item = 0, 0
        for item in items:
            self.add_dct_item(item)
        msg = ''
        if self.missing_cats:
            msg = _(
                "Names \"%s\" doesn't match existing categories. "
                "Modify the import to match theses names with categories.") %\
                ('", "'.join(self.missing_cats))
        return (self.new_item, self.updated_item, msg)

    @classmethod
    def _internal_parse_date(cls, locale, year, month, day):
        """Build a ``datetime.date`` from scraped fragments; returns None on
        any unparsable component. Missing year defaults to the current one.

        NOTE(review): relies on a MONTH_NAMES table whose definition is
        missing from this copy of the file — confirm against upstream.
        """
        try:
            year = datetime.date.today().year if not year else int(year)
        except ValueError:
            return
        month = month.encode('utf-8')
        if locale in MONTH_NAMES and month in MONTH_NAMES[locale]:
            month = MONTH_NAMES[locale].index(month) + 1
        else:
            try:
                month = int(month)
            except ValueError:
                return
        try:
            day = int(day)
        except ValueError:
            return
        try:
            return datetime.date(year, month, day)
        except ValueError:
            return

    def parse_date(self, date):
        """Extract start/end dates from a scraped string using the
        per-locale DATE_PARSINGS patterns; return a dict possibly holding
        'start_date' and 'end_date'."""
        dct = {}
        has_dates = False
        if type(date) == bytes:
            date = date.decode('utf-8')
        for locale in DATE_PARSINGS:
            if has_dates:
                break
            for r in DATE_PARSINGS[locale]:
                if not date:
                    continue
                m = r.search(date)
                if not m:
                    continue
                values = m.groupdict()
                date = self._internal_parse_date(
                    locale, 'year1' in values and values['year1'],
                    values['month1'], values['day1'])
                if not date:
                    continue
                dct['start_date'] = date
                has_dates = True
                if 'day2' not in values:
                    break
                date = self._internal_parse_date(
                    locale, 'year2' in values and values['year2'],
                    values['month2'], values['day2'])
                if date:
                    dct['end_date'] = date
                break
        return dct

    def add_dct_item(self, item):
        # skip items with no way to locate them
        if not self.importer_instance.default_localisation and \
                "point" not in item and not ("lat" in item and item['lat']):
            return
        cls = None
        origin = self.importer_instance.origin
        origin_lnk = item.get('link')
        # filter non relevant links
        if origin_lnk and origin_lnk.startswith('http'):
            # NOTE(review): an HTML anchor template (href) appears to have
            # been stripped from this copy — "%s" with a 2-tuple raises
            # TypeError as written; recover from upstream
            origin = "%s" % (origin_lnk, origin)
        dct = {
            'origin': origin,
            'license': self.importer_instance.license,
            'name': item['name']}
        category = None
        if 'category' in item and item['category']:
            if item['category'] in self.key_categories:
                category = self.key_categories[item['category']]
            else:
                self.missing_cats.add(item['category'])
        cls = self.marker_cls
        if 'point' in item:
            x, y = item['point'].split(",")
            dct['point'] =
            # NOTE(review): SOURCE is truncated here, mid-assignment — the
            # remainder of add_dct_item is missing from this copy of the
            # file; recover it from upstream, do not guess.
'SRID=4326;POINT(%s %s)' % (x, y) elif 'lat' in item and item['lat']: dct['point'] = 'SRID=4326;POINT(%s %s)' % (item['lon'], item['lat']) else: dct['point'] = self.importer_instance.default_localisation dct['description'] = item.get('description', '') if 'date' in item: dct.update(self.parse_date(item['date'])) if "start_date" in item and item["start_date"]: dct['start_date'] = item["start_date"] if "end_date" in item and item["end_date"]: dct['end_date'] = item["end_date"] key = item['key'] it, updated, created = self.create_or_update_item(cls, dct, key, category=category) if updated: self.updated_item += 1 if created: self.new_item += 1 class XMLXsltManager(HtmlXsltManager): PARSER = 'XMLParser' import icalendar class IcalManager(ImportManager): def get(self): """ Get data from an icalendar source """ from chimere.models import Marker new_item, updated_item, msg = 0, 0, '' source, msg = self.get_source_file([]) if msg: return (0, 0, msg) data = source.read() try: cal = icalendar.Calendar.from_ical(data) except ValueError as e: return (new_item, updated_item, _("Error on icalendar parsing: ") + str(e)) default_dct = {'origin': self.importer_instance.origin, 'license': self.importer_instance.license} if self.importer_instance.default_localisation: default_dct['point'] = self.importer_instance.default_localisation for event in cal.walk('VEVENT'): dct = default_dct.copy() dct['name'] = event.get('SUMMARY', '') if dct['name']: dct['name'] = str(dct['name']) dct['description'] = event.get('DESCRIPTION', '') if dct['description']: dct['description'] = str(dct['description']) loc = event.get('LOCATION', None) if loc: dct['description'] += "
{}".format(str(loc)) url = event.get('URL', None) if url: dct['description'] += "
{}".format( str(url), str(_('Link'))) dct['start_date'] = event.get('DTSTART', None) if dct['start_date']: dct['start_date'] = event.decoded('DTSTART') dct['end_date'] = event.get('DTEND', None) if dct['end_date']: dct['end_date'] = event.decoded('DTEND') point = event.get('GEO', None) if point: dct['point'] = 'SRID=4326;POINT(%s %s)' % (point.longitude, point.latitude) if not dct.get('point', None): continue cls = Marker pl_id = event.get('UID', None) if not pl_id: pl_id = dct['name'] + "-" + str(self.importer_instance.pk) pl_id += "-" + str(self.importer_instance.pk) it, updated, created = self.create_or_update_item(cls, dct, pl_id) if updated: updated_item += 1 if created: new_item += 1 return (new_item, updated_item, msg)