diff options
-rw-r--r-- | ishtar_common/management/commands/import_geofla_csv.py | 202 | ||||
-rw-r--r-- | ishtar_common/management/commands/import_insee_comm_csv.py | 119 | ||||
-rw-r--r-- | ishtar_common/utils.py | 28 |
3 files changed, 210 insertions, 139 deletions
diff --git a/ishtar_common/management/commands/import_geofla_csv.py b/ishtar_common/management/commands/import_geofla_csv.py index 69a1cad0b..400825092 100644 --- a/ishtar_common/management/commands/import_geofla_csv.py +++ b/ishtar_common/management/commands/import_geofla_csv.py @@ -18,6 +18,8 @@ # See the file COPYING for details. import csv +import datetime +import os import sys from django.core.management.base import BaseCommand @@ -25,8 +27,10 @@ from django.contrib.gis.geos import GEOSGeometry, Point from django.db import transaction from django.db.utils import DataError +from django.conf import settings from django.contrib.contenttypes.models import ContentType from ishtar_common.models import Town, GeoVectorData, GeoDataType, GeoProviderType +from ishtar_common.utils import BColors, fast_line_count, get_log_time, get_progress town_content_type = ContentType.objects.get(app_label="ishtar_common", model="town") @@ -51,6 +55,9 @@ class Command(BaseCommand): help='Quiet output') parser.add_argument( '--create-only', dest='create_only', action='store_true', + help='No update') + parser.add_argument( + '--create-only-geo', dest='create_only_geo', action='store_true', help='Create only missing geo') parser.add_argument( '--srid', type=int, default=2154, dest='srid', @@ -72,108 +79,127 @@ class Command(BaseCommand): @transaction.atomic def handle(self, *args, **options): + log_path = os.sep.join([settings.ROOT_PATH, "logs"]) + if not os.path.exists(log_path): + os.mkdir(log_path, mode=0o770) + csv.field_size_limit(sys.maxsize) csv_file = options['csv_file'] default_year = options['year'] srid = options['srid'] quiet = options['quiet'] create_only = options["create_only"] + create_only_geo = options["create_only_geo"] if not quiet: - sys.stdout.write('* using year {} as a default\n'.format( - default_year)) - sys.stdout.write('* Opening file {}\n'.format(csv_file)) - nb_created, nb_changed, nb_error, nb_geo = 0, 0, 0, 0 - with open(csv_file, 'rt') as csvfile: - header = csvfile.readline() - geom_colum = header.split(",")[0] - csvfile.seek(0) - - reader = csv.DictReader(csvfile) - for idx, row in enumerate(reader): - if not quiet: - sys.stdout.write('Processing town %d.\r' % (idx + 1)) - sys.stdout.flush() - num_insee = row['INSEE_COM'] - if len(num_insee) < 5: - num_insee = '0' + num_insee - if 'NOM_COM_M' in row: - name = row['NOM_COM_M'] - elif 'NOM_M' in row: - name = row['NOM_M'] - elif 'NOM' in row: - name = row['NOM'].upper() - else: - name = row['NOM_COM'].upper() - town, created = self.get_town(num_insee, name, default_year) - if created: - nb_created += 1 - else: - nb_changed += 1 - if create_only and town.main_geodata: + sys.stdout.write(BColors.OKGREEN) + sys.stdout.write(f'* using year {default_year} as a default\n') + sys.stdout.write(f'* opening file {csv_file}{BColors.ENDC}\n') + nb_created, nb_error, nb_geo = 0, 0, 0 + nb_csv_lines = fast_line_count(csv_file) - 1 + started = datetime.datetime.now() + log_filename = f"import_ign-{get_log_time().replace(':', '')}.csv" + log_path = os.sep.join([log_path, log_filename]) + with open(log_path, "w+") as fle: + writer = csv.writer(fle) + writer.writerow(["insee", "town", "created", "error"]) + with open(csv_file, 'rt') as csvfile: + header = csvfile.readline() + geom_colum = header.split(",")[0] + csvfile.seek(0) + + reader = csv.DictReader(csvfile) + for idx, row in enumerate(reader): + if not quiet: + sys.stdout.write( + get_progress("processing town", idx, nb_csv_lines, started) + ) + sys.stdout.flush() + num_insee = row['INSEE_COM'] + if len(num_insee) < 5: + num_insee = '0' + num_insee + if 'NOM_COM_M' in row: + name = row['NOM_COM_M'] + elif 'NOM_M' in row: + name = row['NOM_M'] + elif 'NOM' in row: + name = row['NOM'].upper() + else: + name = row['NOM_COM'].upper() + town, created = self.get_town(num_insee, name, default_year) + if created: + if create_only_geo: + continue + nb_created += 1 + elif (create_only or create_only_geo) and town.main_geodata: continue - geom = row[geom_colum].upper() - if 'MULTI' not in geom: - geom = geom.replace('POLYGON', 'MULTIPOLYGON(') + ')' - limit = GEOSGeometry(geom, srid=srid) - if 'X_CENTROID' in row: - center = Point( - float(row['X_CENTROID']), float(row['Y_CENTROID']), - srid=srid) - else: - center = None - values = {} - values['center'] = None - if not town.year and default_year: - values['year'] = default_year - if 'SUPERFICIE' in row: - values['surface'] = row['SUPERFICIE'] - else: - values['surface'] = None - for k in values: - setattr(town, k, values[k]) - try: - with transaction.atomic(): - town.save() - except DataError: - nb_error += 1 - town, created = self.get_town(num_insee, name, default_year) + geom = row[geom_colum].upper() + if 'MULTI' not in geom: + geom = geom.replace('POLYGON', 'MULTIPOLYGON(') + ')' + limit = GEOSGeometry(geom, srid=srid) + if 'X_CENTROID' in row: + center = Point( + float(row['X_CENTROID']), float(row['Y_CENTROID']), + srid=srid) + else: + center = None + values = {} + values['center'] = None + if not town.year and default_year: + values['year'] = default_year + if 'SUPERFICIE' in row: + values['surface'] = row['SUPERFICIE'] + else: + values['surface'] = None for k in values: setattr(town, k, values[k]) - town.save() - attrs = { - "name": town._generate_cached_label(), - "source_content_type": town_content_type, - "source_id": town.pk, - "data_type": data_type, - "provider": provider, - } - if limit: - attrs["multi_polygon"] = limit - else: - attrs["point_2d"] = center - try: - data, created = GeoVectorData.objects.get_or_create(**attrs) - except DataError: + error = False + try: + with transaction.atomic(): + town.save() + except DataError: + nb_error += 1 + town, created = self.get_town(num_insee, name, default_year) + for k in values: + setattr(town, k, values[k]) + town.save() + error = True + writer.writerow([town.numero_insee, town.name, created, error]) + attrs = { + "name": town._generate_cached_label(), + "source_content_type": town_content_type, + "source_id": town.pk, + "data_type": data_type, + "provider": provider, + } if limit: - print(f"\nError {town} polygon\n") - attrs.pop("multi_polygon") + attrs["multi_polygon"] = limit + else: attrs["point_2d"] = center + try: data, created = GeoVectorData.objects.get_or_create(**attrs) - else: - print(f"\nError {town} center\n") - town.main_geodata = data - town._post_save_geo_ok = False - town.save() - if created: - nb_geo += 1 + except DataError: + if limit: + print(f"\nError {town} polygon\n") + attrs.pop("multi_polygon") + attrs["point_2d"] = center + data, created = GeoVectorData.objects.get_or_create(**attrs) + else: + print(f"\nError {town} center\n") + town.main_geodata = data + town._post_save_geo_ok = False + town.save() + if created: + nb_geo += 1 if quiet: return - sys.stdout.write('\n* {} town created'.format(nb_created)) - sys.stdout.write('\n* {} geo created'.format(nb_geo)) - sys.stdout.write('\n* {} town changed'.format(nb_changed)) - sys.stdout.write('\n* {} town with geometry error\n'.format(nb_error)) + sys.stdout.write(BColors.OKGREEN) + if nb_created: + sys.stdout.write('\n* {} town created'.format(nb_created)) + if nb_geo: + sys.stdout.write('\n* {} geo created'.format(nb_geo)) + if nb_error: + sys.stdout.write('\n* {} town with geometry error\n'.format(nb_error)) + sys.stdout.write(f"\n[{get_log_time()}] log file:") + sys.stdout.write(f"\n{BColors.WARNING}{log_path}{BColors.ENDC}\n") sys.stdout.flush() - - - diff --git a/ishtar_common/management/commands/import_insee_comm_csv.py b/ishtar_common/management/commands/import_insee_comm_csv.py index 2e990c7ce..4584da890 100644 --- a/ishtar_common/management/commands/import_insee_comm_csv.py +++ b/ishtar_common/management/commands/import_insee_comm_csv.py @@ -18,13 +18,17 @@ # See the file COPYING for details. import csv +import os import re import sys +from django.conf import settings from django.core.management.base import BaseCommand from ishtar_common.models import Town +from ishtar_common.utils import BColors, get_log_time + class Command(BaseCommand): help = 'Import INSEE csv' @@ -39,75 +43,88 @@ class Command(BaseCommand): help='Quiet output') def handle(self, *args, **options): + log_path = os.sep.join([settings.ROOT_PATH, "logs"]) + if not os.path.exists(log_path): + os.mkdir(log_path, mode=0o770) + csv_file = options['csv_file'] default_year = options['year'] quiet = options['quiet'] if not quiet: - sys.stdout.write('* using year {} for new towns\n'.format( - default_year)) - sys.stdout.write('* opening file {}\n'.format(csv_file)) + sys.stdout.write(BColors.OKGREEN) + sys.stdout.write(f'* Using year {default_year} for new towns\n') + sys.stdout.write(f'* Opening file {csv_file}{BColors.ENDC}\n') r = re.compile(r"(.*)\((.*)\)") nb_created = 0 nb_link = 0 missing = [] strange = [] linked = set() - with open(csv_file, 'rt') as csvfile: - reader = csv.DictReader(csvfile) - for idx, row in enumerate(reader): - new_insee = row['DepComN'] - if len(new_insee) < 5: - new_insee = '0' + new_insee - - if not idx: # test if first do not exist + + log_filename = f"import_insee-{get_log_time().replace(':', '')}.csv" + log_path = os.sep.join([log_path, log_filename]) + with open(log_path, "w+") as fle: + writer = csv.writer(fle) + writer.writerow(["new insee", "new town"]) + with open(csv_file, 'rt') as csvfile: + reader = csv.DictReader(csvfile) + for idx, row in enumerate(reader): + new_insee = row['DepComN'] + if len(new_insee) < 5: + new_insee = '0' + new_insee + + if not idx: # test if first do not exist + q = Town.objects.filter(numero_insee=new_insee, + year=default_year) + if q.count(): + sys.stdout.write( + f"{BColors.FAIL}First town already exists for this year{BColors.ENDC}\n") + return + + if not quiet: + sys.stdout.write('Processing town %d.\r' % (idx + 1)) + sys.stdout.flush() + + old_insee = row['DepComA'] + if len(old_insee) < 5: + old_insee = '0' + old_insee + q = Town.objects.filter(numero_insee=old_insee) + + if not q.count(): + missing.append((old_insee, row['NomCA'])) + continue + if q.count() > 1: + q = q.filter(year__lt=default_year).order_by('-year') + if not q.count(): + strange.append((old_insee, row['NomCA'])) + continue + old_town = q.all()[0] + q = Town.objects.filter(numero_insee=new_insee, year=default_year) - if q.count(): - print("First town already exists for this year....") - return - - if not quiet: - sys.stdout.write('Processing town %d.\r' % (idx + 1)) - sys.stdout.flush() - - old_insee = row['DepComA'] - if len(old_insee) < 5: - old_insee = '0' + old_insee - q = Town.objects.filter(numero_insee=old_insee) - - if not q.count(): - missing.append((old_insee, row['NomCA'])) - continue - if q.count() > 1: - q = q.filter(year__lt=default_year).order_by('-year') if not q.count(): - strange.append((old_insee, row['NomCA'])) - continue - old_town = q.all()[0] - - q = Town.objects.filter(numero_insee=new_insee, - year=default_year) - if not q.count(): - nb_created += 1 - name = row['NomCN'].strip() - name = r.sub(r"\2 \1", name).strip() - new_town = Town.objects.create(name=name, year=default_year, - numero_insee=new_insee) - else: - new_town = q.all()[0] - if new_town in old_town.children.all(): - continue # link already created - nb_link += 1 - old_town.children.add(new_town) - linked.add(new_town) + nb_created += 1 + name = row['NomCN'].strip() + name = r.sub(r"\2 \1", name).strip() + new_town = Town.objects.create(name=name, year=default_year, + numero_insee=new_insee) + writer.writerow([new_town.numero_insee, new_town.name]) + else: + new_town = q.all()[0] + if new_town in old_town.children.all(): + continue # link already created + nb_link += 1 + old_town.children.add(new_town) + linked.add(new_town) nb_limit = 0 if not quiet: - sys.stdout.write('\nGenerate limits...'.format(nb_created)) + sys.stdout.write('\nGenerate limits...') for town in linked: if town.generate_geo(): nb_limit += 1 if quiet: return + sys.stdout.write(BColors.OKGREEN) sys.stdout.write('\n* {} town created\n'.format(nb_created)) sys.stdout.write('* {} link created\n'.format(nb_link)) sys.stdout.write('* {} limit generated\n'.format(nb_limit)) @@ -119,6 +136,6 @@ class Command(BaseCommand): sys.stdout.write('* these towns have newer version:\n') for insee, name in strange: sys.stdout.write('* {} ({})\n'.format(name, insee)) + sys.stdout.write(f"\n[{get_log_time()}] log file:") + sys.stdout.write(f"\n{BColors.WARNING}{log_path}{BColors.ENDC}\n") sys.stdout.flush() - - diff --git a/ishtar_common/utils.py b/ishtar_common/utils.py index 23495c2ec..04cda150b 100644 --- a/ishtar_common/utils.py +++ b/ishtar_common/utils.py @@ -3158,6 +3158,34 @@ def get_eta(current, total, base_time, current_time): return f"{int(eta // 3600):02d}:{int(eta % 3600 // 60):02d}:{int(eta % 60):02d}" +def get_progress(base_lbl, idx, total, ref_time): + """ + Output progress for a long task. + - base_lbl: label to display + - idx: current item number + - total: number of items + - ref_time: time the task has been started + """ + lbl = f"\r{BColors.OKBLUE}[{get_percent(idx, total)}] {base_lbl} {idx + 1}/{total}" + lbl += f" ({get_eta(idx, total, ref_time, datetime.datetime.now())} left){BColors.ENDC}" + return lbl + + +def fast_line_count(filename): + """ + Efficient line counter for a file + """ + CHUNK_SIZE = 1024 * 1024 + def _count(reader): + b = reader(CHUNK_SIZE) + while b: + yield b + b = reader(CHUNK_SIZE) + with open(filename, 'rb') as fp: + count = sum(buffer.count(b"\n") for buffer in _count(fp.raw.read)) + return count + 1 + + RE_NUMBER = r"[+-]?\d+(?:\.\d*)?" RE_COORDS = r"(" + RE_NUMBER + r") (" + RE_NUMBER + r")" |