author | Étienne Loks <etienne.loks@proxience.com> | 2015-01-26 21:41:59 +0100
committer | Étienne Loks <etienne.loks@proxience.com> | 2015-01-26 21:41:59 +0100
commit | f960a671e29949be75d50d1ec9e92e90cd62a9f7 (patch)
tree | 927690a6a830ff0fa04d24c3242fc564502e55a0
parent | 248505a3a2a0343153b698aecfb358f2b29b766d (diff)
download | Ishtar-f960a671e29949be75d50d1ec9e92e90cd62a9f7.tar.bz2, Ishtar-f960a671e29949be75d50d1ec9e92e90cd62a9f7.zip
Improve/fix import management. Bibracte import for operations.
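Note: the reworked `import_operations` management command dispatches on an explicit importer name instead of guessing from the file extension. Its new signature is `<filename> <importer_name> [<nb lines skipped>]`, so a Bibracte CSV export with one header line would be loaded with something like `./manage.py import_operations export.csv bibracte-operation 1` (the file name here is only illustrative).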
archaeological_operations/data_importer.py (-rw-r--r--) | 13
archaeological_operations/management/commands/import_operations.py (-rwxr-xr-x -> -rw-r--r--) | 91
archaeological_operations/management/commands/import_operations_old.py (new, -rwxr-xr-x) | 56
ishtar_common/data_importer.py (-rw-r--r--) | 239
ishtar_common/models.py (-rw-r--r--) | 2
5 files changed, 151 insertions, 250 deletions
diff --git a/archaeological_operations/data_importer.py b/archaeological_operations/data_importer.py
index e02b53df0..ebcc039d0 100644
--- a/archaeological_operations/data_importer.py
+++ b/archaeological_operations/data_importer.py
@@ -24,9 +24,9 @@
 from django.template.defaultfilters import slugify
 from django.utils.translation import ugettext_lazy as _
 
 from ishtar_common.data_importer import *
-from ishtar_common.models import Town
+from ishtar_common.models import Town, OrganizationType
 
-from archaeological_operations.models import Parcel
+from archaeological_operations import models
 from archaeological_operations.utils import parse_parcels
 
 RE_PERMIT_REFERENCE = re.compile('[A-Za-z]*(.*)')
@@ -45,7 +45,7 @@ class ImportParcelFormater(ImportFormater):
         for parcel_dct in parse_parcels(value, owner=owner):
             parcel_dct.update(base_dct)
             try:
-                Parcel.objects.get_or_create(**parcel_dct)
+                models.Parcel.objects.get_or_create(**parcel_dct)
             except IntegrityError:
                 raise ImporterError("Erreur d'import parcelle, contexte : %s" \
                                     % unicode(parcel_dct))
@@ -148,8 +148,9 @@
 RE_ORGA = re.compile("([^,]*)")
 
 class OperationImporterBibracte(Importer):
+    DESC = u"Exports Bibracte : importeur pour l'onglet opération"
     DEFAULTS = {
-        ('in_charge',):{
+        ('operator',):{
             'organization_type':OrganizationType.objects.get(
                 txt_idx="operator")},
     }
@@ -171,10 +172,8 @@ class OperationImporterBibracte(Importer):
         # fin
         ImportFormater('excavation_end_date', DateFormater('%Y/%m/%d'),),
         # Chronos
-        ImportFormater('periods', TypeFormater(models.Period, many=True),
-                       many=True),
+        ImportFormater('periods', TypeFormater(models.Period, many_split="&")),
     ]
 
     OBJECT_CLS = models.Operation
 
-    DEFAULTS = {}
diff --git a/archaeological_operations/management/commands/import_operations.py b/archaeological_operations/management/commands/import_operations.py
index a9ecf41c9..fe4afc032 100755..100644
--- a/archaeological_operations/management/commands/import_operations.py
+++ b/archaeological_operations/management/commands/import_operations.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-# Copyright (C) 2012-2013 Étienne Loks <etienne.loks_AT_peacefrogsDOTnet>
+# Copyright (C) 2015 Étienne Loks <etienne.loks_AT_peacefrogsDOTnet>
 
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Affero General Public License as
@@ -17,40 +17,73 @@
 
 # See the file COPYING for details.
 
+import datetime, unicodecsv
+
+from django.conf import settings
 from django.core.management.base import BaseCommand, CommandError
-from archaeological_operations.import_from_csv import import_from_csv
-from archaeological_operations.import_from_dbf import import_from_dbf
 
-IMPORTERS = {'csv':import_from_csv,
-             'dbf':import_from_dbf,
-             'db3':import_from_dbf,
-             'fp':import_from_dbf,
-             'vfp':import_from_dbf}
+from archaeological_operations.data_importer import *
+
+IMPORTERS = {'bibracte-operation':OperationImporterBibracte}
 
 class Command(BaseCommand):
-    args = '<filename> [<lines>]'
-    help = "Import archaelogical operations"
+    args = '<filename> <importer_name> [<nb lines skipped>]'
+    help = "Import archaeological operations"
 
     def handle(self, *args, **options):
         if not args or not args[0]:
             raise CommandError("No file provided.")
+        if len(args) < 2 or args[1] not in IMPORTERS:
+            msg = "Bad importer. \nAvailable importers are:\n"
+            for key in IMPORTERS:
+                msg += "\t* %s: %s" % (key, IMPORTERS[key].DESC.encode('utf-8')
+                                       or "-")
+            raise CommandError(msg)
+        try:
+            skip_lines = int(args[2])
+        except:
+            skip_lines = 0
         filename = args[0]
-        update = True
-        file_type = None
-        lines = len(args) > 1 and args[1]
-        if not file_type:
-            suffix = filename.split('.')[-1].lower()
-            if suffix in IMPORTERS.keys():
-                file_type = suffix
-            else:
-                raise CommandError("This file extension is not managed. "\
-                                   "Specify manualy the file type.")
-        elif file_type not in IMPORTERS.keys():
-            raise CommandError("This file type is not managed.")
-        nb_ops, errors = IMPORTERS[file_type](filename,
-                                              update=update,
-                                              stdout=self.stdout,
-                                              lines=lines)
-        self.stdout.write('\n* %d operation treated\n' % nb_ops)
-        if errors:
-            self.stderr.write('\n'.join(errors))
+        importer = IMPORTERS[args[1]](skip_lines=skip_lines, output='cli')
+        sys.stdout.write("*" * 72 + "\n")
+        msg = "* Importer - %s" % importer.DESC
+        if len(msg) < 72:
+            msg += (71 - len(msg)) * " " + "*\n"
+        sys.stdout.write(msg)
+        sys.stdout.write("*" * 72 + "\n\n")
+        sys.stdout.write("Processing...")
+        with open(filename) as csv_file:
+            encodings = [settings.ENCODING, settings.ALT_ENCODING, 'utf-8']
+            for encoding in encodings:
+                try:
+                    importer.importation([line for line in
+                        unicodecsv.reader(csv_file, encoding='utf-8')])
+                    errors = importer.get_csv_errors()
+                    sys.stdout.write("\n")
+                    if errors:
+                        print errors
+                        now = datetime.datetime.now().isoformat('-'
+                                                               ).replace(':', '')
+                        error_file = '.'.join(filename.split('.')[:-1]) \
+                                     + "_errors_%s.csv" % now
+                        sys.stdout.write("Some errors as occured during the ")
+                        sys.stdout.write("import.\n")
+                        try:
+                            with open(error_file, 'w') as fle:
+                                fle.write(errors.encode('utf-8'))
+                            sys.stdout.write("A report has been create in file:"\
+                                             " \"%s\"" % error_file)
+                        except IOError:
+                            sys.stdout.write("Cannot create CSV error file \"%s\"." %
                                              error_file)
+                    break
+                except ImporterError, e:
+                    if e.type == ImporterError.HEADER and encoding != encodings[-1]:
+                        csv_file.seek(0)
+                        continue
+                except UnicodeDecodeError:
+                    if encoding != encodings[-1]:
+                        csv_file.seek(0)
+                        continue
+        sys.stdout.write("\n\n")
diff --git a/archaeological_operations/management/commands/import_operations_old.py b/archaeological_operations/management/commands/import_operations_old.py
new file mode 100755
index 000000000..a9ecf41c9
--- /dev/null
+++ b/archaeological_operations/management/commands/import_operations_old.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2012-2013 Étienne Loks <etienne.loks_AT_peacefrogsDOTnet>
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+# See the file COPYING for details.
+
+from django.core.management.base import BaseCommand, CommandError
+from archaeological_operations.import_from_csv import import_from_csv
+from archaeological_operations.import_from_dbf import import_from_dbf
+
+IMPORTERS = {'csv':import_from_csv,
+             'dbf':import_from_dbf,
+             'db3':import_from_dbf,
+             'fp':import_from_dbf,
+             'vfp':import_from_dbf}
+
+class Command(BaseCommand):
+    args = '<filename> [<lines>]'
+    help = "Import archaelogical operations"
+
+    def handle(self, *args, **options):
+        if not args or not args[0]:
+            raise CommandError("No file provided.")
+        filename = args[0]
+        update = True
+        file_type = None
+        lines = len(args) > 1 and args[1]
+        if not file_type:
+            suffix = filename.split('.')[-1].lower()
+            if suffix in IMPORTERS.keys():
+                file_type = suffix
+            else:
+                raise CommandError("This file extension is not managed. "\
+                                   "Specify manualy the file type.")
+        elif file_type not in IMPORTERS.keys():
+            raise CommandError("This file type is not managed.")
+        nb_ops, errors = IMPORTERS[file_type](filename,
+                                              update=update,
+                                              stdout=self.stdout,
+                                              lines=lines)
+        self.stdout.write('\n* %d operation treated\n' % nb_ops)
+        if errors:
+            self.stderr.write('\n'.join(errors))
diff --git a/ishtar_common/data_importer.py b/ishtar_common/data_importer.py
index 0ce61ba01..87b3a40b0 100644
--- a/ishtar_common/data_importer.py
+++ b/ishtar_common/data_importer.py
@@ -17,7 +17,7 @@
 
 # See the file COPYING for details.
 
-import copy, csv, datetime, logging, sys
+import copy, csv, datetime, logging, re, sys
 from tempfile import NamedTemporaryFile
 
 from django.contrib.auth.models import User
@@ -216,16 +216,16 @@
             msgstr += unicode(_(u"%d. None of the above - skip")) % idx + u"\n"
         if self.many_split:
             new_values = []
-            r = re.compile(func.many_split)
+            r = re.compile(self.many_split)
             for value in values:
                 new_values += r.split(value)
             values = new_values
         for value in values:
-            base_value = copy(value)
+            base_value = copy.copy(value)
             value = self.prepare(value)
             if value in self.equiv_dict:
                 continue
-            if not self.cli:
+            if output != 'cli':
                 self.missings.add(value)
                 continue
             res = None
@@ -263,9 +263,12 @@
 
 class TypeFormater(StrChoiceFormater):
     def __init__(self, model, cli=False, defaults={}, many_split=False):
+        self.create = True
+        self.strict = False
         self.model = model
         self.defaults = defaults
         self.many_split = many_split
+        self.missings = set()
         self.equiv_dict, self.choices = {}, []
         for item in model.objects.all():
             self.choices.append((item.pk, unicode(item)))
@@ -348,6 +351,7 @@ class StrToBoolean(Formater):
 logger = logging.getLogger(__name__)
 
 class Importer(object):
+    DESC = ""
     LINE_FORMAT = []
     OBJECT_CLS = None
     IMPORTED_LINE_FIELD = None
@@ -366,19 +370,18 @@
         'regex_not_match':_(u"The regexp doesn't match.")
     }
 
-    def __init__(self, skip_first_line=False, reference_header=None,
+    def __init__(self, skip_lines=0, reference_header=None,
                  check_col_num=False, test=False, check_validity=True,
-                 history_modifier=None, output=None):
+                 history_modifier=None, output='silent'):
         """
-        * skip_first_line must be set to True if the data provided has got
-          an header.
+        * skip_line must be set if the data provided has got headers lines.
         * a reference_header can be provided to perform a data compliance
           check. It can be useful to warn about bad parsing.
         * test doesn't write in the database
         * check_validity rewrite a CSV file to be compared
         """
         self.message = ''
-        self.skip_first_line = skip_first_line
+        self.skip_lines = skip_lines
         self.reference_header = reference_header
         self.test = test
         self.errors = [] # list of (line, col, message)
@@ -408,7 +411,7 @@
         assert output in ('silent', 'cli', 'db')
         vals = []
         for idx_line, line in enumerate(table):
-            if (self.skip_first_line and not idx_line):
+            if self.skip_lines > idx_line:
                 continue
             for idx_col, val in enumerate(line):
                 if idx_col >= len(self.line_format):
@@ -424,7 +427,7 @@
     def importation(self, table):
         self.validity_file = None
         if not self._initialized:
-            self.initialize(table)
+            self.initialize(table, self.output)
         if self.check_validity:
             with NamedTemporaryFile(delete=False) as validity_file:
                 self.validity_file = UnicodeWriter(validity_file,
@@ -479,7 +482,7 @@
         self.min_col_number = len(self.line_format) - idx_last_col
         # check the conformity with the reference header
         if self.reference_header and \
-           self.skip_first_line and \
+           self.skip_lines and \
           self.reference_header != table[0]:
             raise ImporterError(self.ERRORS['header_check'],
                                 type=ImporterError.HEADER)
@@ -507,7 +510,7 @@
                 self.errors.append((idx_line, None, msg))
 
     def _line_processing(self, idx_line, line):
-        if (self.skip_first_line and not idx_line):
+        if self.skip_lines > idx_line:
             if self.validity_file:
                 self.validity_file.writerow(line)
             return
@@ -703,201 +706,6 @@
             else:
                 c_row.append(unicode(c_values))
 
-
-    """
-    def _importation(self, table):
-        table = list(table)
-        if not table or not table[0]:
-            raise ImporterError(self.ERRORS['no_data'], ImporterError.HEADER)
-        if self.check_col_num and len(table[0]) > len(self.line_format):
-            raise ImporterError(self.ERRORS['too_many_cols'] % {
-                'user_col':len(table[0]), 'ref_col':len(self.line_format)})
-        self.errors = []
-        self.messages = []
-        self.number_imported = 0
-        # index of the last required column
-        for idx_last_col, formater in enumerate(reversed(self.line_format)):
-            if formater.required:
-                break
-        else:
-            idx_last_col += 1
-        # min col number to be filled
-        min_col_number = len(self.line_format) - idx_last_col
-        # check the conformity with the reference header
-        if self.reference_header and \
-           self.skip_first_line and \
-           self.reference_header != table[0]:
-            raise ImporterError(self.ERRORS['header_check'],
-                                type=ImporterError.HEADER)
-        now = datetime.datetime.now()
-        for idx_line, line in enumerate(table):
-            #self._line_processing()
-
-            if (self.skip_first_line and not idx_line):
-                if self.validity_file:
-                    self.validity_file.writerow(line)
-                continue
-            if not line:
-                if self.validity_file:
-                    self.validity_file.writerow([])
-                continue
-            self.throughs = [] # list of (formater, value)
-            self.post_processing = [] # list of (formater, value)
-            data = {}
-
-            # keep in database the raw line for testing purpose
-            if self.IMPORTED_LINE_FIELD:
-                output = io.StringIO()
-                writer = csv.writer(output)
-                writer.writerow(line)
-                data[self.IMPORTED_LINE_FIELD] = output.getvalue()
-
-            n = datetime.datetime.now()
-            logger.debug('%s - Processing line %d' % (unicode(n-now), idx_line))
-            now = n
-            n2 = n
-            self.c_errors = False
-            c_row = []
-            for idx_col, val in enumerate(line):
-                #self._row_processing(self, c_row, idx_col, val):
-
-                if idx_col >= len(self.line_format):
-                    break
-                formater = self.line_format[idx_col]
-                if not formater.field_name:
-                    if self.validity_file:
-                        c_row.append(val)
-                    continue
-                if formater.regexp:
-                    # multiline regexp is a mess...
-                    val = val.replace('\n', NEW_LINE_BREAK)
-                    match = formater.regexp.match(val)
-                    if not match:
-                        if formater.required:
-                            self.errors.append((idx_line+1, idx_col+1,
-                                                self.ERRORS['value_required']))
-                        elif not val.strip():
-                            c_row.append("")
-                            continue
-                        c_errors = True
-                        val = val.replace(NEW_LINE_BREAK, '\n')
-                        self.errors.append((idx_line+1, idx_col+1,
-                                    unicode(self.ERRORS['regex_not_match']) + val))
-                        c_row.append("")
-                        continue
-                    val_group = [v.replace(NEW_LINE_BREAK, '\n')
-                                 for v in match.groups()]
-                else:
-                    val_group = [val]
-                c_values = []
-                for idx_v, v in enumerate(val_group):
-                    self.message = ''
-                    func = formater.formater
-                    if type(func) in (list, tuple):
-                        func = func[idx_v]
-                    if not callable(func) and type(func) in (unicode, str):
-                        func = getattr(self, func)
-                    value = None
-                    try:
-                        if formater.regexp_formater_args:
-                            args = []
-                            for idx in formater.regexp_formater_args[idx_v]:
-                                args.append(val_group[idx])
-                            value = func.format(*args)
-                        else:
-                            value = func.format(v)
-                    except ValueError, e:
-                        c_errors = True
-                        self.errors.append((idx_line+1, idx_col+1, e.message))
-                        c_values.append(None)
-                        continue
-                    if self.message:
-                        self.messages.append(self.message)
-                    c_values.append(value)
-                    if value == None:
-                        if formater.required:
-                            c_errors = True
-                            self.errors.append((idx_line+1, idx_col+1,
-                                                self.ERRORS['value_required']))
-                        continue
-                    field_name = formater.field_name
-                    if type(field_name) in (list, tuple):
-                        field_name = field_name[idx_v]
-                    field_names = [field_name]
-                    if formater.duplicate_field:
-                        duplicate_field = formater.duplicate_field
-                        if type(duplicate_field) in (list, tuple):
-                            duplicate_field = duplicate_field[idx_v]
-                        field_names += [duplicate_field]
-
-
-                    if formater.through:
-                        throughs.append((formater, value))
-                    else:
-                        for field_name in field_names:
-                            self._field_name_to_data_dict(field_name,
-                                                          value, data)
-                if formater.reverse_for_test:
-                    c_row.append(formater.reverse_for_test(**c_values))
-                else:
-                    c_row.append(unicode(c_values))
-
-            if self.validity_file:
-                self.validity_file.writerow(c_row)
-            if not self.c_errors and (idx_col + 1) < min_col_number:
-                self.c_errors = True
-                self.errors.append((idx_line+1, idx_col+1,
-                            self.ERRORS['not_enough_cols'] % min_col_number))
-            if self.c_errors:
-                continue
-            n = datetime.datetime.now()
-            logger.debug('* %s - Cols read' % (unicode(n-n2)))
-            n2 = n
-            if self.test:
-                continue
-            # manage unicity of items (mainly for updates)
-            self.number_imported += 1
-            if self.UNICITY_KEYS:
-                data['defaults'] = {}
-                for k in data.keys():
-                    if k not in self.UNICITY_KEYS \
-                       and k != 'defaults':
-                        data['defaults'][k] = data.pop(k)
-
-            obj, created = self.get_object(self.OBJECT_CLS, data)
-
-            if not created and 'defaults' in data:
-                for k in data['defaults']:
-                    setattr(obj, k, data['defaults'][k])
-                obj.save()
-            n = datetime.datetime.now()
-            logger.debug('* %s - Item saved' % (unicode(n-n2)))
-            n2 = n
-            for formater, value in self.throughs:
-                n = datetime.datetime.now()
-                logger.debug('* %s - Processing formater %s' % (unicode(n-n2),
-                             formater.field_name))
-                n2 = n
-                data = {}
-                if formater.through_dict:
-                    data = formater.through_dict.copy()
-                if formater.through_key:
-                    data[formater.through_key] = obj
-                data[formater.field_name] = value
-                through_cls = formater.through
-                if formater.through_unicity_keys:
-                    data['defaults'] = {}
-                    for k in data.keys():
-                        if k not in formater.through_unicity_keys \
-                           and k != 'defaults':
-                            data['defaults'][k] = data.pop(k)
-                t_obj, created = through_cls.objects.get_or_create(**data)
-                if not created and 'defaults' in data:
-                    for k in data['defaults']:
-                        setattr(t_obj, k, data['defaults'][k])
-                    t_obj.save()
-    """
-
     def get_object(self, cls, data, path=[]):
         m2ms = []
         if data and type(data) == dict:
@@ -937,6 +745,8 @@
                 dct = create_dict.copy()
                 dct['defaults'] = defaults
                 obj, created = cls.objects.get_or_create(**create_dict)
+            except IntegrityError as e:
+                raise IntegrityError(e.message)
             except:
                 created = False
                 obj = cls.objects.filter(**create_dict).all()[0]
@@ -946,16 +756,19 @@
                     values = value
                 for v in values:
                     getattr(obj, attr).add(v)
-        except IntegrityError:
-            raise ImporterError("Erreur d'import %s, contexte : %s" \
-                                % (unicode(cls), unicode(data)))
+        except IntegrityError as e:
+            raise ImporterError("Erreur d'import %s, contexte : %s, erreur : %s" \
+                                % (unicode(cls), unicode(data), e.message.decode('utf-8')))
             return obj, created
         return data
 
     def get_csv_errors(self):
-        csv_errors = []
+        if not self.errors:
+            return ""
+        csv_errors = ["line,col,error"]
         for line, col, error in self.errors:
-            csv_errors.append(u'"%d","%d","%s"' % (line or 0, col or 0,
+            csv_errors.append(u'"%s","%s","%s"' % (line and unicode(line) or '-',
+                                                   col and unicode(col) or '-',
                                                    unicode(error)))
         return u"\n".join(csv_errors)
diff --git a/ishtar_common/models.py b/ishtar_common/models.py
index 7ef06ddb7..280376966 100644
--- a/ishtar_common/models.py
+++ b/ishtar_common/models.py
@@ -386,7 +386,7 @@ class GeneralType(models.Model):
         keys = []
         content_type = ContentType.objects.get_for_model(self.__class__)
         for ik in ItemKey.objects.filter(content_type=content_type,
-                                         object_id=ik.pk).all():
+                                         object_id=self.pk).all():
             keys.append(ik.key)
         return keys
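For reference, a minimal sketch of the programmatic flow the reworked command wraps. It assumes a configured Django project for this codebase; the file names are hypothetical, while `OperationImporterBibracte`, `importation()` and `get_csv_errors()` are the APIs touched by this commit (Python 2, matching the codebase):

    # Minimal usage sketch: feed CSV rows to a named importer and dump
    # any per-line errors to a report file.
    import unicodecsv

    from archaeological_operations.data_importer import OperationImporterBibracte

    # skip_lines=1 skips a single header row; output='cli' makes the
    # importer prompt for unknown values instead of recording them as missing.
    importer = OperationImporterBibracte(skip_lines=1, output='cli')

    with open('export.csv') as csv_file:  # hypothetical file name
        rows = [row for row in unicodecsv.reader(csv_file, encoding='utf-8')]
    importer.importation(rows)

    errors = importer.get_csv_errors()  # empty string when the import is clean
    if errors:
        with open('export_errors.csv', 'w') as fle:  # hypothetical file name
            fle.write(errors.encode('utf-8'))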