#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Management command that merges ``Document`` rows sharing identical images.

Images are compared by SHA-256 content hash. Documents whose attribute
values can be reconciled are merged into a single reference document; the
others are reported as conflicts. Two CSV reports (merged / conflicts) are
written to ``--output-path``.
"""
import csv
import datetime
import hashlib
import os
import sys

from django.core.management.base import BaseCommand

from ishtar_common.models import Document

# Chunk size (bytes) used when reading image files for hashing.
BLOCKSIZE = 65536


def get_hexdigest(filename):
    """Return the SHA-256 hex digest of *filename*, read in BLOCKSIZE chunks.

    :param filename: path of the file to hash.
    :raises IOError: if the file cannot be opened (missing image).
    """
    digest = hashlib.sha256()
    with open(filename, 'rb') as stream:
        buf = stream.read(BLOCKSIZE)
        while len(buf) > 0:
            digest.update(buf)
            buf = stream.read(BLOCKSIZE)
    return digest.hexdigest()


class Command(BaseCommand):
    help = 'Re-associate similar images in the database'

    def add_arguments(self, parser):
        parser.add_argument(
            '--merge-title', type=str, default='', dest='merged-title',
            help='If specified when title differs the given title will be '
                 'used.')
        parser.add_argument(
            '--output-path', type=str, default='', dest='output-path',
            help='Output path for results CSV files. Default to current '
                 'path.')
        parser.add_argument(
            '--ignore-reference', dest='ignore-reference',
            action='store_true',
            help='Ignore the reference on diff between models.')
        parser.add_argument(
            '--delete-missing', dest='delete-missing', action='store_true',
            default=False,
            help='Delete document with missing images.')
        parser.add_argument(
            '--quiet', dest='quiet', action='store_true',
            help='Quiet output.')

    @staticmethod
    def _write_csv(filename, header, rows):
        """Write a CSV report: one *header* row followed by *rows*.

        ``newline=''`` is required by the csv module to avoid blank
        interleaved lines on platforms with \\r\\n line endings.
        """
        with open(filename, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(header)
            writer.writerows(rows)

    def handle(self, *args, **options):
        """Hash all document images, optionally purge documents whose image
        file is missing, then merge documents sharing identical image bytes.

        Side effects: may delete ``Document`` rows, saves merged reference
        documents (history skipped) and writes ``<timestamp>-conflict.csv``
        / ``<timestamp>-merged.csv`` under ``--output-path``.
        """
        quiet = options['quiet']
        ignore_ref = options['ignore-reference']
        delete_missing = options['delete-missing']
        merged_title = options['merged-title']
        output_path = options['output-path']
        out = sys.stdout

        q = Document.objects.filter(image__isnull=False).exclude(image='')
        hashes = {}  # hex digest -> list of Document pks with that image
        missing_images = []  # pks whose image file could not be read
        count = q.count()
        if not quiet:
            out.write("* {} images\n".format(count))
        for idx, doc in enumerate(q.all()):
            if not quiet:
                out.write("\r* hashes calculation: {} %".format(
                    int(float(idx + 1) / count * 100)))
                out.flush()
            try:
                hexdigest = get_hexdigest(doc.image.path)
            except IOError:
                missing_images.append(doc.pk)
                continue
            hashes.setdefault(hexdigest, []).append(doc.pk)

        nb_missing_images = len(missing_images)
        if not quiet:
            out.write("\n* {} missing images\n".format(nb_missing_images))
        if missing_images and delete_missing:
            for nb, pk in enumerate(missing_images):
                if not quiet:
                    out.write(
                        "\r* delete document with missing images: "
                        "{} %".format(
                            int(float(nb + 1) / nb_missing_images * 100)))
                    out.flush()
                Document.objects.get(pk=pk).delete()
            if not quiet:
                out.write("\n")

        # Attributes compared (and copied) during a merge. A differing
        # non-empty value on both sides is a conflict, except 'title' when
        # --merge-title is given.
        attributes = [
            'title', 'associated_file', 'internal_reference', 'source_type',
            'support_type', 'format_type', 'scale', 'authors_raw',
            'associated_url', 'receipt_date', 'creation_date',
            'receipt_date_in_documentation', 'item_number', 'description',
            'comment', 'additional_information', 'duplicate'
        ]
        if not ignore_ref:
            attributes.append('reference')
        m2ms = ['authors', 'licenses']
        nb_conflicted_items = 0
        nb_merged_items = 0
        distinct_image = 0
        conflicts = []
        merged = []
        count = len(hashes)
        for idx, digest in enumerate(hashes):
            if not quiet:
                out.write("\r* merge similar images: {} %".format(
                    int(float(idx + 1) / count * 100)))
                out.flush()
            pks = hashes[digest]
            if len(pks) < 2:
                distinct_image += 1
                continue
            items = [Document.objects.get(pk=pk) for pk in pks]
            ref_item = items[0]
            for item in items[1:]:
                # Re-fetch the reference document: discards unsaved
                # attribute changes left over from a conflicted pass.
                ref_item = Document.objects.get(pk=ref_item.pk)
                conflicted_values = []
                for attr in attributes:
                    ref_value = getattr(ref_item, attr)
                    other_value = getattr(item, attr)
                    if not other_value:
                        continue
                    if not ref_value:
                        # Reference side empty: copy the other value over.
                        setattr(ref_item, attr, other_value)
                    elif other_value != ref_value:
                        if attr == 'title' and merged_title:
                            setattr(ref_item, 'title', merged_title)
                        else:
                            conflicted_values.append(
                                (attr, ref_value, other_value)
                            )
                base_csv = [
                    ref_item.pk,
                    ref_item.reference or "",
                    ref_item.cache_related_label or "",
                    ref_item.image.name,
                    item.pk,
                    item.reference or "",
                    item.cache_related_label or "",
                    item.image.name,
                ]
                if conflicted_values:
                    # Irreconcilable values: report, keep both documents.
                    nb_conflicted_items += 1
                    for attr, ref_value, other_value in conflicted_values:
                        conflicts.append(base_csv + [
                            attr, str(ref_value), str(other_value)
                        ])
                    continue
                merged.append(base_csv)
                # Copy missing many-to-many links onto the reference item.
                for m2m in m2ms:
                    for m2 in getattr(item, m2m).all():
                        if m2 not in getattr(ref_item, m2m).all():
                            getattr(ref_item, m2m).add(m2)
                # Re-attach related model links missing on the reference.
                for rel_attr in Document.RELATED_MODELS:
                    ref_rel_items = [
                        r.pk for r in getattr(ref_item, rel_attr).all()]
                    for rel_item in getattr(item, rel_attr).all():
                        if rel_item.pk not in ref_rel_items:
                            getattr(ref_item, rel_attr).add(rel_item)
                ref_item.skip_history_when_saving = True
                ref_item.save()
                item.delete()
                nb_merged_items += 1
        if not quiet:
            out.write("\n")

        # Filesystem-safe timestamp (':' is invalid on some systems).
        n = datetime.datetime.now().isoformat().split('.')[0].replace(
            ':', '-')
        base_header = [
            "Document 1 - pk", "Document 1 - Ref",
            "Document 1 - related", "Document 1 - image path",
            "Document 2 - pk", "Document 2 - Ref",
            "Document 2 - related", "Document 2 - image path",
        ]
        if conflicts:
            filename = os.path.join(
                output_path, "{}-conflict.csv".format(n))
            self._write_csv(
                filename,
                base_header + [
                    "Attribute", "Document 1 - value", "Document 2 - value"
                ],
                conflicts)
            if not quiet:
                out.write("* {} conflicted items ({})\n".format(
                    nb_conflicted_items, filename))
        if merged:
            filename = os.path.join(output_path, "{}-merged.csv".format(n))
            self._write_csv(filename, base_header, merged)
            if not quiet:
                out.write("* {} merged items ({})\n".format(nb_merged_items,
                                                            filename))
        if not quiet:
            out.write("* {} distinct images\n".format(distinct_image))