diff options
Diffstat (limited to 'ishtar_common/management/commands')
-rw-r--r-- | ishtar_common/management/commands/generate_rights.py | 71 | ||||
-rw-r--r-- | ishtar_common/management/commands/reassociate_similar_images.py | 234 |
2 files changed, 234 insertions, 71 deletions
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Deleted file: ishtar_common/management/commands/generate_rights.py
# Copyright (C) 2011 Étienne Loks  <etienne.loks_AT_peacefrogsDOTnet>

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

# See the file COPYING for details.
"""Management command registering every form wizard and its steps.

Every attribute of ``ishtar_base.forms_main`` whose name ends with
``_wizard`` is mirrored as a ``Wizard`` row, and each of its forms (up to,
but excluding, the first ``FinalForm`` subclass) as a ``WizardStep`` row.

This module targets Python 2: it relies on the ``unicode`` builtin.
"""

import sys

from django.core.management.base import BaseCommand, CommandError
from django.core.exceptions import ObjectDoesNotExist

import ishtar_base.forms_main as ishtar_forms
import ishtar_base.models as models


class Command(BaseCommand):
    """Regenerate the ``Wizard`` / ``WizardStep`` rows from current forms."""
    args = ''
    help = 'Regenerate rights for current forms'

    def handle(self, *args, **options):
        """Synchronise the database with the wizards found in the forms.

        Progress is written to ``sys.stdout``; one line per created wizard
        or wizard step and a final summary line.
        """
        for attr in dir(ishtar_forms):
            if not attr.endswith('_wizard'):
                continue
            wizard = getattr(ishtar_forms, attr)
            wizard_obj = self._sync_wizard(wizard.url_name)
            for idx, step_url_name in enumerate(wizard.form_list.keys()):
                form = wizard.form_list[step_url_name]
                if issubclass(form, ishtar_forms.FinalForm):
                    break  # don't reference the final form
                self._sync_step(wizard_obj, step_url_name, form, idx)
        sys.stdout.write('Successfully regeneration of wizard rights\n')

    @staticmethod
    def _sync_wizard(url_name):
        """Return the ``Wizard`` row for *url_name*, creating it if needed."""
        try:
            # only the lookup is guarded, so errors raised while creating
            # the row are never mistaken for "row does not exist"
            return models.Wizard.objects.get(url_name=url_name)
        except ObjectDoesNotExist:
            pass
        wizard_obj = models.Wizard.objects.create(url_name=url_name)
        # NOTE(review): create() has already saved the row; this second
        # save is kept so that save-related hooks run as they did before.
        wizard_obj.save()
        sys.stdout.write('* Wizard "%s" added\n' % url_name)
        return wizard_obj

    @staticmethod
    def _sync_step(wizard_obj, step_url_name, form, order):
        """Create or refresh the ``WizardStep`` describing *form*.

        An existing step gets its ``name`` and ``order`` overwritten; a
        missing one is created and reported on ``sys.stdout``.
        """
        step_values = {'name': unicode(form.form_label),
                       'order': order}
        try:
            step_obj = models.WizardStep.objects.get(
                wizard=wizard_obj, url_name=step_url_name)
        except ObjectDoesNotExist:
            step_values.update({'wizard': wizard_obj,
                                'url_name': step_url_name})
            step_obj = models.WizardStep.objects.create(**step_values)
            step_obj.save()
            sys.stdout.write('* Wizard step "%s" added\n'
                             % unicode(form.form_label))
        else:
            for key, value in step_values.items():
                setattr(step_obj, key, value)
            step_obj.save()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# New file: ishtar_common/management/commands/reassociate_similar_images.py
"""Management command merging documents that share a byte-identical image.

Every ``Document`` with an image is hashed (SHA-256 of the file content).
Documents sharing a hash are merged into the first one of the group when
their attributes do not conflict; conflicts and merges are reported in
timestamped CSV files.

This module targets Python 2: it relies on the ``unicode`` builtin and
writes UTF-8 encoded byte strings to the CSV files.
"""

import csv
import datetime
import hashlib
import os
import sys
from collections import defaultdict

from django.core.management.base import BaseCommand

from ishtar_common.models import Document

BLOCKSIZE = 65536

# Attributes compared (and copied when empty on the kept document).
# 'reference' is appended at run time unless --ignore-reference is given.
MERGED_ATTRIBUTES = (
    'title', 'associated_file', 'internal_reference', 'source_type',
    'support_type', 'format_type', 'scale',
    'authors_raw', 'associated_url', 'receipt_date', 'creation_date',
    'receipt_date_in_documentation', 'item_number', 'description',
    'comment', 'additional_information', 'duplicate',
)

# Many-to-many attributes whose members are moved to the kept document.
MERGED_M2MS = ('authors', 'licenses')

CSV_BASE_HEADER = [
    "Document 1 - pk", "Document 1 - Ref",
    "Document 1 - related", "Document 1 - image path",
    "Document 2 - pk", "Document 2 - Ref",
    "Document 2 - related", "Document 2 - image path",
]
CSV_CONFLICT_HEADER = CSV_BASE_HEADER + [
    "Attribute", "Document 1 - value", "Document 2 - value",
]


def get_hexdigest(filename):
    """Return the SHA-256 hex digest of the content of *filename*.

    The file is read in chunks of ``BLOCKSIZE`` bytes so that large images
    are never loaded in memory at once. Errors raised by ``open`` (missing
    or unreadable file) are propagated to the caller.
    """
    digest = hashlib.sha256()
    with open(filename, 'rb') as afile:
        # iter() with a sentinel stops on the first empty read (EOF)
        for chunk in iter(lambda: afile.read(BLOCKSIZE), b''):
            digest.update(chunk)
    return digest.hexdigest()


def _csv_cell(value):
    """Return *value* as a UTF-8 byte string, or "" when it is empty."""
    if not value:
        return ""
    return unicode(value).encode('utf-8')  # noqa: F821 - Python 2 builtin


def _document_row(doc):
    """Return the four CSV cells identifying *doc*."""
    return [
        doc.pk,
        _csv_cell(doc.reference),
        _csv_cell(doc.cache_related_label),
        _csv_cell(doc.image.name),
    ]


def _write_csv(path, header, rows):
    """Write *header* then *rows* to a new CSV file at *path*."""
    with open(path, 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)
        writer.writerows(rows)


def _progress(out, label, done, total):
    """Rewrite the current terminal line with the completion percentage."""
    out.write("\r* {}: {} %".format(label, int(float(done) / total * 100)))
    out.flush()


class Command(BaseCommand):
    """Merge documents whose image files have identical content."""
    help = 'Re-associate similar images in the database'

    def add_arguments(self, parser):
        """Declare the command line options."""
        parser.add_argument(
            '--merge-title', type=str, default='', dest='merged-title',
            help='If specified when title differs the given title will be '
                 'used.')
        parser.add_argument(
            '--output-path', type=str, default='', dest='output-path',
            help='Output path for results CSV files. Default to current path.')
        parser.add_argument(
            '--ignore-reference', dest='ignore-reference', action='store_true',
            help='Ignore the reference on diff between models.')
        parser.add_argument(
            '--delete-missing', dest='delete-missing', action='store_true',
            default=False, help='Delete document with missing images.')
        parser.add_argument(
            '--quiet', dest='quiet', action='store_true',
            help='Quiet output.')

    def handle(self, *args, **options):
        """Hash every image, merge duplicates and write the CSV reports.

        Steps: hash the image of each document; optionally delete the
        documents whose image file cannot be read; merge each group of
        documents sharing a hash; write ``<timestamp>-conflict.csv`` and
        ``<timestamp>-merged.csv`` inside the output path when non empty.
        """
        quiet = options['quiet']
        ignore_ref = options['ignore-reference']
        delete_missing = options['delete-missing']
        merged_title = options['merged-title']
        output_path = options['output-path']
        out = sys.stdout

        hashes, missing_images = self._hash_images(out, quiet)
        if missing_images and delete_missing:
            self._delete_documents(missing_images, out, quiet)

        attributes = list(MERGED_ATTRIBUTES)
        if not ignore_ref:
            attributes.append('reference')

        nb_conflicted_items = 0
        nb_merged_items = 0
        distinct_image = 0
        conflicts = []
        merged = []

        nb_hashes = len(hashes)
        for idx, pks in enumerate(hashes.values()):
            if not quiet:
                _progress(out, "merge similar images", idx + 1, nb_hashes)
            if len(pks) < 2:
                distinct_image += 1
                continue
            items = [Document.objects.get(pk=pk) for pk in pks]
            ref_item, other_items = items[0], items[1:]

            for item in other_items:
                # reload: drops unsaved changes left by a conflicted
                # attempt and picks up the result of a previous merge
                ref_item = Document.objects.get(pk=ref_item.pk)
                conflicted_values = self._merge_attributes(
                    ref_item, item, attributes, merged_title)
                base_csv = _document_row(ref_item) + _document_row(item)

                if conflicted_values:
                    nb_conflicted_items += 1
                    for attr, ref_value, other_value in conflicted_values:
                        conflicts.append(base_csv + [
                            attr, _csv_cell(ref_value),
                            _csv_cell(other_value)
                        ])
                    continue

                merged.append(base_csv)
                self._merge_relations(ref_item, item)
                ref_item.skip_history_when_saving = True
                ref_item.save()
                item.delete()
                nb_merged_items += 1
        if not quiet:
            out.write(u"\n")

        # ISO timestamp to the second, ':' replaced to stay filename-safe
        stamp = datetime.datetime.now().strftime('%Y-%m-%dT%H-%M-%S')
        if conflicts:
            # os.path.join: a path given without trailing separator must
            # still be treated as a directory, not as a filename prefix
            filename = os.path.join(
                output_path, u"{}-conflict.csv".format(stamp))
            _write_csv(filename, CSV_CONFLICT_HEADER, conflicts)
            if not quiet:
                out.write(u"* {} conflicted items ({})\n".format(
                    nb_conflicted_items, filename))
        if merged:
            filename = os.path.join(
                output_path, u"{}-merged.csv".format(stamp))
            _write_csv(filename, CSV_BASE_HEADER, merged)
            if not quiet:
                out.write(u"* {} merged items ({})\n".format(nb_merged_items,
                                                             filename))
        if not quiet:
            out.write("* {} distinct images\n".format(distinct_image))

    @staticmethod
    def _hash_images(out, quiet):
        """Hash the image of every document having one.

        Returns a ``(hashes, missing)`` tuple: ``hashes`` maps each hex
        digest to the list of document pks sharing it, ``missing`` lists
        the pks of documents whose image file raised ``IOError``.
        """
        q = Document.objects.filter(image__isnull=False).exclude(
            image='')
        hashes = defaultdict(list)
        missing_images = []
        count = q.count()
        if not quiet:
            out.write("* {} images\n".format(count))
        for idx, doc in enumerate(q.all()):
            if not quiet:
                _progress(out, "hashes calculation", idx + 1, count)
            try:
                hexdigest = get_hexdigest(doc.image.path)
            except IOError:
                missing_images.append(doc.pk)
                continue
            hashes[hexdigest].append(doc.pk)
        if not quiet:
            out.write("\n* {} missing images\n".format(len(missing_images)))
        return hashes, missing_images

    @staticmethod
    def _delete_documents(pks, out, quiet):
        """Delete, one by one, the documents whose pk is in *pks*."""
        total = len(pks)
        for nb, pk in enumerate(pks):
            if not quiet:
                _progress(out, "delete document with missing images",
                          nb + 1, total)
            Document.objects.get(pk=pk).delete()
        if not quiet:
            out.write("\n")

    @staticmethod
    def _merge_attributes(ref_item, item, attributes, merged_title):
        """Copy the values of *item* into *ref_item* (without saving).

        An empty attribute of *ref_item* takes the value of *item*. When
        both are set and differ, the attribute is reported as a conflict,
        except for ``title`` which is replaced by *merged_title* when one
        is given.

        Returns the list of ``(attribute, ref_value, other_value)``
        conflicts.
        """
        conflicted_values = []
        for attr in attributes:
            ref_value = getattr(ref_item, attr)
            other_value = getattr(item, attr)
            if not other_value:
                continue
            if not ref_value:
                setattr(ref_item, attr, other_value)
            elif other_value != ref_value:
                if attr == 'title' and merged_title:
                    ref_item.title = merged_title
                else:
                    conflicted_values.append(
                        (attr, ref_value, other_value))
        return conflicted_values

    @staticmethod
    def _merge_relations(ref_item, item):
        """Attach to *ref_item* the related objects only *item* has."""
        for rel_attr in list(MERGED_M2MS) + list(Document.RELATED_MODELS):
            ref_manager = getattr(ref_item, rel_attr)
            # compare pks taken from .all(): a related manager itself
            # cannot be used as the right operand of "in"
            known_pks = {rel.pk for rel in ref_manager.all()}
            for rel_item in getattr(item, rel_attr).all():
                if rel_item.pk not in known_pks:
                    ref_manager.add(rel_item)