diff options
Diffstat (limited to 'ishtar_common/management/commands')
| -rw-r--r-- | ishtar_common/management/commands/generate_rights.py | 71 | ||||
| -rw-r--r-- | ishtar_common/management/commands/reassociate_similar_images.py | 234 | 
2 files changed, 234 insertions, 71 deletions
| diff --git a/ishtar_common/management/commands/generate_rights.py b/ishtar_common/management/commands/generate_rights.py deleted file mode 100644 index 75b1cf807..000000000 --- a/ishtar_common/management/commands/generate_rights.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# Copyright (C) 2011  Étienne Loks  <etienne.loks_AT_peacefrogsDOTnet> - -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the -# GNU Affero General Public License for more details. - -# You should have received a copy of the GNU Affero General Public License -# along with this program.  If not, see <http://www.gnu.org/licenses/>. - -# See the file COPYING for details. - -import sys - -from django.core.management.base import BaseCommand, CommandError -from django.core.exceptions import ObjectDoesNotExist - -import ishtar_base.forms_main as ishtar_forms -import ishtar_base.models as models - -class Command(BaseCommand): -    args = '' -    help = 'Regenerate rights for current forms' - -    def handle(self, *args, **options): -        wizards = [] -        wizard_steps = {} -        for attr in dir(ishtar_forms): -            if not attr.endswith('_wizard'): -                continue -            wizard = getattr(ishtar_forms, attr) -            url_name = wizard.url_name -            try: -                wizard_obj = models.Wizard.objects.get(url_name=url_name) -            except ObjectDoesNotExist: -                wizard_obj = models.Wizard.objects.create(url_name=url_name) -                wizard_obj.save() -                #self.stdout.write('* Wizard "%s" added\n' % url_name) -                sys.stdout.write('* Wizard "%s" added\n' % url_name) -            wizard_steps[url_name] = [] -            for idx, step_url_name in enumerate(wizard.form_list.keys()): -                form = wizard.form_list[step_url_name] -                if issubclass(form, ishtar_forms.FinalForm): -                    break # don't reference the final form -                step_values = {'name':unicode(form.form_label), -                               'order':idx} -                try: -                    step_obj = models.WizardStep.objects.get(wizard=wizard_obj, -                                                         url_name=step_url_name) -                    for k in step_values: -                        setattr(step_obj, k, step_values[k]) -                    step_obj.save() -                except ObjectDoesNotExist: -                    step_values.update({'wizard':wizard_obj, -                                        'url_name':step_url_name}) -                    step_obj = models.WizardStep.objects.create(**step_values) -                    step_obj.save() -                    #self.stdout.write('* Wizard step "%s" added\n' \ -                    #                  % unicode(form.form_label)) -                    sys.stdout.write('* Wizard step "%s" added\n' \ -                                      % unicode(form.form_label)) -                wizard_steps[url_name].append(step_url_name) -        #self.stdout.write('Successfully regeneration of wizard rights\n') -        sys.stdout.write('Successfully regeneration of wizard rights\n') diff --git a/ishtar_common/management/commands/reassociate_similar_images.py b/ishtar_common/management/commands/reassociate_similar_images.py new file mode 100644 index 000000000..f6d432327 --- /dev/null +++ b/ishtar_common/management/commands/reassociate_similar_images.py @@ -0,0 +1,234 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import csv +import datetime +import hashlib +import sys + +from django.core.management.base import BaseCommand + +from ishtar_common.models import Document + +BLOCKSIZE = 65536 + + +def get_hexdigest(filename): +    m = hashlib.sha256() +    with open(filename, 'rb') as afile: +        buf = afile.read(BLOCKSIZE) +        while len(buf) > 0: +            m.update(buf) +            buf = afile.read(BLOCKSIZE) +    return m.hexdigest() + + +class Command(BaseCommand): +    help = 'Re-associate similar images in the database' + +    def add_arguments(self, parser): +        parser.add_argument( +            '--merge-title', type=str, default='', dest='merged-title', +            help='If specified when title differs the given title will be ' +                 'used.') +        parser.add_argument( +            '--output-path', type=str, default='', dest='output-path', +            help='Output path for results CSV files. Default to current path.') +        parser.add_argument( +            '--ignore-reference', dest='ignore-reference', action='store_true', +            help='Ignore the reference on diff between models.') +        parser.add_argument( +            '--delete-missing', dest='delete-missing', action='store_true', +            default=False, help='Delete document with missing images.') +        parser.add_argument( +            '--quiet', dest='quiet', action='store_true', +            help='Quiet output.') + +    def handle(self, *args, **options): +        quiet = options['quiet'] +        ignore_ref = options['ignore-reference'] +        delete_missing = options['delete-missing'] +        merged_title = options['merged-title'] +        output_path = options['output-path'] + +        q = Document.objects.filter(image__isnull=False).exclude( +            image='') +        hashes = {} +        missing_images = [] +        count = q.count() +        out = sys.stdout +        if not quiet: +            out.write("* {} images\n".format(count)) +        for idx, doc in enumerate(q.all()): +            if not quiet: +                out.write("\r* hashes calculation: {} %".format( +                    int(float(idx + 1) / count * 100))) +                out.flush() +            path = doc.image.path +            try: +                hexdigest = get_hexdigest(path) +            except IOError: +                missing_images.append(doc.pk) +                continue +            if hexdigest not in hashes: +                hashes[hexdigest] = [] +            hashes[hexdigest].append(doc.pk) +        nb_missing_images = len(missing_images) +        if not quiet: +            out.write("\n* {} missing images\n".format(nb_missing_images)) + +        if missing_images and delete_missing: +            for nb, idx in enumerate(missing_images): +                if not quiet: +                    out.write( +                        "\r* delete document with missing images: {} %".format( +                            int(float(nb + 1) / nb_missing_images * 100))) +                    out.flush() +                doc = Document.objects.get(pk=idx) +                doc.delete() +            if not quiet: +                out.write("\n") + +        attributes = [ +            'title', 'associated_file', 'internal_reference', 'source_type', +            'support_type', 'format_type', 'scale', +            'authors_raw', 'associated_url', 'receipt_date', 'creation_date', +            'receipt_date_in_documentation', 'item_number', 'description', +            'comment', 'additional_information', 'duplicate' +        ] +        if not ignore_ref: +            attributes.append('reference') + +        m2ms = ['authors', 'licenses'] + +        nb_conflicted_items = 0 +        nb_merged_items = 0 +        distinct_image = 0 +        conflicts = [] +        merged = [] + +        count = len(hashes) + +        for idx, hash in enumerate(hashes): +            if not quiet: +                out.write("\r* merge similar images: {} %".format( +                    int(float(idx + 1) / count * 100))) +                out.flush() +            if len(hashes[hash]) < 2: +                distinct_image += 1 +                continue +            items = [Document.objects.get(pk=pk) for pk in hashes[hash]] +            ref_item = items[0] +            other_items = items[1:] + +            for item in other_items: +                ref_item = Document.objects.get(pk=ref_item.pk) +                conflicted_values = [] +                for attr in attributes: +                    ref_value = getattr(ref_item, attr) +                    other_value = getattr(item, attr) +                    if ref_value: +                        if not other_value: +                            continue +                        if other_value != ref_value: +                            if attr == 'title' and merged_title: +                                setattr(ref_item, 'title', merged_title) +                            else: +                                conflicted_values.append( +                                    (attr, ref_value, other_value) +                                ) +                    else: +                        if not other_value: +                            continue +                        setattr(ref_item, attr, other_value) + +                base_csv = [ +                    ref_item.pk, +                    ref_item.reference.encode('utf-8') if +                    ref_item.reference else "", +                    ref_item.cache_related_label.encode('utf-8') if +                    ref_item.cache_related_label else "", +                    ref_item.image.name.encode('utf-8'), +                    item.pk, +                    item.reference.encode('utf-8') if +                    item.reference else "", +                    item.cache_related_label.encode('utf-8') if +                    item.cache_related_label else "", +                    item.image.name.encode('utf-8'), +                ] +                if conflicted_values: +                    nb_conflicted_items += 1 +                    for attr, ref_value, other_value in conflicted_values: +                        conflicts.append(base_csv + [ +                            attr, unicode(ref_value).encode('utf-8'), +                            unicode(other_value).encode('utf-8') +                        ]) +                    continue + +                merged.append(base_csv) + +                for m2m in m2ms: +                    for m2 in getattr(item, m2m).all(): +                        if m2 not in getattr(ref_item, m2m): +                            getattr(ref_item, m2m).add(m2) + +                for rel_attr in Document.RELATED_MODELS: +                    ref_rel_items = [ +                        r.pk for r in getattr(ref_item, rel_attr).all()] +                    for rel_item in getattr(item, rel_attr).all(): +                        if rel_item.pk not in ref_rel_items: +                            getattr(ref_item, rel_attr).add(rel_item) + +                ref_item.skip_history_when_saving = True +                ref_item.save() +                item.delete() +                nb_merged_items += 1 +        if not quiet: +            out.write(u"\n") + +        n = datetime.datetime.now().isoformat().split('.')[0].replace(':', '-') +        if conflicts: +            filename = output_path + u"{}-conflict.csv".format(n) +            with open(filename, 'w') as csvfile: +                writer = csv.writer(csvfile) +                writer.writerow( +                    ["Document 1 - pk", "Document 1 - Ref", +                     "Document 1 - related", "Document 1 - image path", +                     "Document 2 - pk", "Document 2 - Ref", +                     "Document 2 - related", "Document 2 - image path", +                     "Attribute", "Document 1 - value", "Document 2 - value" +                     ] +                ) +                for conflict in conflicts: +                    writer.writerow(conflict) +            if not quiet: +                out.write(u"* {} conflicted items ({})\n".format( +                    nb_conflicted_items, filename)) +        if merged: +            filename = output_path + u"{}-merged.csv".format(n) +            with open(filename, 'w') as csvfile: +                writer = csv.writer(csvfile) +                writer.writerow( +                    ["Document 1 - pk", "Document 1 - Ref", +                     "Document 1 - related", "Document 1 - image path", +                     "Document 2 - pk", "Document 2 - Ref", +                     "Document 2 - related", "Document 2 - image path", +                     ] +                ) +                for merge in merged: +                    writer.writerow(merge) +            if not quiet: +                out.write(u"* {} merged items ({})\n".format(nb_merged_items, +                                                            filename)) +        if not quiet: +            out.write("* {} distinct images\n".format(distinct_image)) + + + + + + + + + + | 
