diff options
| -rw-r--r-- | ishtar_common/management/commands/reassociate_similar_images.py | 211 | 
1 files changed, 211 insertions, 0 deletions
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Management command merging documents whose image files are identical.

Every ``Document`` with an image is hashed (SHA-256 of the file content).
Documents sharing a digest are merged into the first one of the group when
their descriptive attributes do not conflict; otherwise the conflict is
reported.  Results are written to timestamped CSV files.
"""

import csv
import datetime
import hashlib
import os
import sys

from django.core.management.base import BaseCommand

from ishtar_common.models import Document

BLOCKSIZE = 65536

# The csv module wants UTF-8 encoded byte strings on Python 2 and text on
# Python 3; these two names let the helpers below serve both.
PY2 = sys.version_info[0] == 2
text_type = type(u'')

DOCUMENT_HEADER = [
    "Document 1 - pk", "Document 1 - Ref",
    "Document 1 - related", "Document 1 - image path",
    "Document 2 - pk", "Document 2 - Ref",
    "Document 2 - related", "Document 2 - image path",
]
CONFLICT_HEADER = DOCUMENT_HEADER + [
    "Attribute", "Document 1 - value", "Document 2 - value"
]


def get_hexdigest(filename):
    """Return the SHA-256 hex digest of the content of ``filename``.

    The file is read by chunks of ``BLOCKSIZE`` bytes so that large images
    are never loaded in memory at once.  Errors raised by ``open``/``read``
    (``IOError``) are propagated to the caller.
    """
    m = hashlib.sha256()
    with open(filename, 'rb') as afile:
        for chunk in iter(lambda: afile.read(BLOCKSIZE), b''):
            m.update(chunk)
    return m.hexdigest()


def _csv_text(value):
    """Convert ``value`` to something ``csv.writer`` can write.

    ``None`` becomes an empty string (a missing reference or label must not
    abort the whole run).  On Python 2 text is encoded to UTF-8 bytes, on
    Python 3 it is kept as text.
    """
    if value is None:
        return ''
    if not isinstance(value, (bytes, text_type)):
        value = text_type(value)
    if PY2 and isinstance(value, text_type):
        return value.encode('utf-8')
    return value


def _document_columns(doc):
    """Return the CSV columns identifying ``doc``: pk, ref, label, image."""
    return [
        doc.pk,
        _csv_text(doc.reference),
        _csv_text(doc.cache_related_label),
        _csv_text(doc.image.name),
    ]


def _write_csv(filename, header, rows):
    """Write ``header`` then ``rows`` to the CSV file ``filename``."""
    if PY2:
        csvfile = open(filename, 'w')
    else:
        # newline='' is what the csv module requires on Python 3.
        csvfile = open(filename, 'w', newline='', encoding='utf-8')
    with csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)
        writer.writerows(rows)


class Command(BaseCommand):
    """Merge ``Document`` objects that point to byte-identical images."""

    help = 'Re-associate similar images in the database'

    def add_arguments(self, parser):
        """Declare the command line options.

        The ``dest`` names contain dashes: they are the keys read back from
        ``options`` in ``handle`` and must not be renamed.
        """
        parser.add_argument(
            '--merge-title', type=str, default='', dest='merged-title',
            help='If specified when title differs the given title will be '
                 'used.')
        parser.add_argument(
            '--output-path', type=str, default='', dest='output-path',
            help='Output path for results CSV files. '
                 'Default to current path.')
        parser.add_argument(
            '--ignore-reference', dest='ignore-reference',
            action='store_true',
            help='Ignore the reference on diff between models.')
        parser.add_argument(
            '--quiet', dest='quiet', action='store_true',
            help='Quiet output.')

    def handle(self, *args, **options):
        """Hash every document image, merge duplicates, write CSV reports.

        Steps:

        1. compute the SHA-256 digest of each document image; documents
           whose file cannot be read are counted as missing and skipped;
        2. for each digest shared by several documents, try to merge every
           other document into the first one; a pair whose attributes hold
           different non-empty values is reported as a conflict and left
           untouched;
        3. write ``<timestamp>-conflict.csv`` and ``<timestamp>-merged.csv``
           in the output path when there is something to report.
        """
        quiet = options['quiet']
        ignore_ref = options['ignore-reference']
        merged_title = options['merged-title']
        output_path = options['output-path']
        out = sys.stdout

        q = Document.objects.filter(image__isnull=False).exclude(
            image='')
        count = q.count()
        if not quiet:
            out.write("* {} images\n".format(count))

        # digest -> list of pk of the documents using this exact image
        hashes = {}
        missing_images = []
        for idx, doc in enumerate(q.all()):
            if not quiet:
                out.write("\r* hashes calculation: {} %".format(
                    int(float(idx + 1) / count * 100)))
                out.flush()
            path = doc.image.path
            try:
                hexdigest = get_hexdigest(path)
            except IOError:
                missing_images.append(doc.pk)
                continue
            hashes.setdefault(hexdigest, []).append(doc.pk)
        if not quiet:
            out.write("\n* {} missing images\n".format(len(missing_images)))

        attributes = ['title', 'associated_file', 'internal_reference',
                      'source_type', 'description']
        if not ignore_ref:
            attributes.append('reference')

        nb_conflicted_items = 0
        nb_merged_items = 0
        distinct_image = 0
        conflicts = []
        merged = []

        count = len(hashes)

        for idx, digest in enumerate(hashes):
            if not quiet:
                out.write("\r* merge similar images: {} %".format(
                    int(float(idx + 1) / count * 100)))
                out.flush()
            pks = hashes[digest]
            if len(pks) < 2:
                distinct_image += 1
                continue
            items = [Document.objects.get(pk=pk) for pk in pks]
            ref_item = items[0]

            for item in items[1:]:
                # Reload the reference: attribute changes made while
                # examining a conflicting item were never saved and must
                # not leak into the next comparison.
                ref_item = Document.objects.get(pk=ref_item.pk)
                conflicted_values = []
                for attr in attributes:
                    ref_value = getattr(ref_item, attr)
                    other_value = getattr(item, attr)
                    if not other_value:
                        continue
                    if not ref_value:
                        # only the duplicate holds a value: keep it
                        setattr(ref_item, attr, other_value)
                    elif other_value != ref_value:
                        if attr == 'title' and merged_title:
                            ref_item.title = merged_title
                        else:
                            conflicted_values.append(
                                (attr, ref_value, other_value)
                            )

                ref_columns = _document_columns(ref_item)
                item_columns = _document_columns(item)

                if conflicted_values:
                    nb_conflicted_items += 1
                    for attr, ref_value, other_value in conflicted_values:
                        conflicts.append(
                            ref_columns + item_columns + [
                                attr, _csv_text(ref_value),
                                _csv_text(other_value)
                            ])
                    continue

                # Built before item.delete(), which resets item.pk.
                merged.append(ref_columns + item_columns)

                # A related manager is not iterable, so membership is
                # tested on primary keys.  Authors are only copied towards
                # the reference: the duplicate is deleted just below.
                ref_author_pks = {
                    author.pk for author in ref_item.authors.all()}
                for author in item.authors.all():
                    if author.pk not in ref_author_pks:
                        ref_item.authors.add(author)

                for rel_attr in Document.RELATED_MODELS:
                    ref_rel_pks = {
                        r.pk for r in getattr(ref_item, rel_attr).all()}
                    for rel_item in getattr(item, rel_attr).all():
                        if rel_item.pk not in ref_rel_pks:
                            getattr(ref_item, rel_attr).add(rel_item)

                # NOTE(review): presumably read by the model's save() to
                # avoid recording a history entry -- confirm on Document.
                ref_item.skip_history_when_saving = True
                ref_item.save()
                item.delete()
                nb_merged_items += 1
        if not quiet:
            out.write("\n")

        # ':' is replaced to keep the timestamp usable in a file name
        n = datetime.datetime.now().isoformat().split('.')[0].replace(':', '-')
        if conflicts:
            filename = os.path.join(output_path, "{}-conflict.csv".format(n))
            _write_csv(filename, CONFLICT_HEADER, conflicts)
            if not quiet:
                out.write("* {} conflicted items ({})\n".format(
                    nb_conflicted_items, filename))
        if merged:
            filename = os.path.join(output_path, "{}-merged.csv".format(n))
            _write_csv(filename, DOCUMENT_HEADER, merged)
            if not quiet:
                out.write("* {} merged items ({})\n".format(nb_merged_items,
                                                            filename))
        if not quiet:
            out.write("* {} distinct images\n".format(distinct_image))
