2 files changed, 234 insertions, 71 deletions
diff --git a/ishtar_common/management/commands/generate_rights.py b/ishtar_common/management/commands/generate_rights.py
deleted file mode 100644
index 75b1cf807..000000000
--- a/ishtar_common/management/commands/generate_rights.py
+++ /dev/null
@@ -1,71 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-# Copyright (C) 2011  Étienne Loks  <etienne.loks_AT_peacefrogsDOTnet>
-
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as
-# published by the Free Software Foundation, either version 3 of the
-# License, or (at your option) any later version.
-
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU Affero General Public License for more details.
-
-# You should have received a copy of the GNU Affero General Public License
-# along with this program.  If not, see <http://www.gnu.org/licenses/>.
-
-# See the file COPYING for details.
-
-import sys
-
-from django.core.management.base import BaseCommand, CommandError
-from django.core.exceptions import ObjectDoesNotExist
-
-import ishtar_base.forms_main as ishtar_forms
-import ishtar_base.models as models
-
-class Command(BaseCommand):
-    args = ''
-    help = 'Regenerate rights for current forms'
-
-    def handle(self, *args, **options):
-        wizards = []
-        wizard_steps = {}
-        for attr in dir(ishtar_forms):
-            if not attr.endswith('_wizard'):
-                continue
-            wizard = getattr(ishtar_forms, attr)
-            url_name = wizard.url_name
-            try:
-                wizard_obj = models.Wizard.objects.get(url_name=url_name)
-            except ObjectDoesNotExist:
-                wizard_obj = models.Wizard.objects.create(url_name=url_name)
-                wizard_obj.save()
-                #self.stdout.write('* Wizard "%s" added\n' % url_name)
-                sys.stdout.write('* Wizard "%s" added\n' % url_name)
-            wizard_steps[url_name] = []
-            for idx, step_url_name in enumerate(wizard.form_list.keys()):
-                form = wizard.form_list[step_url_name]
-                if issubclass(form, ishtar_forms.FinalForm):
-                    break # don't reference the final form
-                step_values = {'name':unicode(form.form_label),
-                               'order':idx}
-                try:
-                    step_obj = models.WizardStep.objects.get(wizard=wizard_obj,
-                                                         url_name=step_url_name)
-                    for k in step_values:
-                        setattr(step_obj, k, step_values[k])
-                    step_obj.save()
-                except ObjectDoesNotExist:
-                    step_values.update({'wizard':wizard_obj,
-                                        'url_name':step_url_name})
-                    step_obj = models.WizardStep.objects.create(**step_values)
-                    step_obj.save()
-                    #self.stdout.write('* Wizard step "%s" added\n' \
-                    #                  % unicode(form.form_label))
-                    sys.stdout.write('* Wizard step "%s" added\n' \
-                                      % unicode(form.form_label))
-                wizard_steps[url_name].append(step_url_name)
-        #self.stdout.write('Successfully regeneration of wizard rights\n')
-        sys.stdout.write('Successfully regeneration of wizard rights\n')
diff --git a/ishtar_common/management/commands/reassociate_similar_images.py b/ishtar_common/management/commands/reassociate_similar_images.py
new file mode 100644
index 000000000..f6d432327
--- /dev/null
+++ b/ishtar_common/management/commands/reassociate_similar_images.py
@@ -0,0 +1,234 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import csv
+import datetime
+import hashlib
+import sys
+
+from django.core.management.base import BaseCommand
+
+from ishtar_common.models import Document
+
+BLOCKSIZE = 65536  # bytes requested per read() when hashing a file
+
+
+def get_hexdigest(filename):
+    """Return the SHA-256 hex digest of the content of the file *filename*."""
+    digest = hashlib.sha256()
+    with open(filename, 'rb') as stream:
+        # an empty read marks the end of the file: use it as iter() sentinel
+        for chunk in iter(lambda: stream.read(BLOCKSIZE), b''):
+            digest.update(chunk)
+    return digest.hexdigest()
+
+
+class Command(BaseCommand):
+    """Management command merging documents whose image files are identical."""
+    help = 'Re-associate similar images in the database'
+
+    def add_arguments(self, parser):  # dests are the keys read in handle()
+        parser.add_argument(
+            '--merge-title', type=str, default='', dest='merged-title',
+            help='If specified when title differs the given title will be '
+                 'used.')
+        parser.add_argument(
+            '--output-path', type=str, default='', dest='output-path',
+            help='Output path for results CSV files. Default to current path.')
+        parser.add_argument(
+            '--ignore-reference', dest='ignore-reference', action='store_true',
+            help='Ignore the reference on diff between models.')
+        parser.add_argument(
+            '--delete-missing', dest='delete-missing', action='store_true',
+            default=False, help='Delete document with missing images.')
+        parser.add_argument(
+            '--quiet', dest='quiet', action='store_true',
+            help='Quiet output.')
+
+    def handle(self, *args, **options):
+        """Hash every document image, then merge documents sharing a hash."""
+        import os  # local import: only needed to build the CSV paths
+        quiet = options['quiet']
+        ignore_ref = options['ignore-reference']
+        delete_missing = options['delete-missing']
+        merged_title = options['merged-title']
+        output_path = options['output-path']
+
+        q = Document.objects.filter(image__isnull=False).exclude(image='')
+        hashes = {}  # sha256 hex digest -> pks of documents with that image
+        missing_images = []  # pks of documents whose file cannot be read
+        count = q.count()
+        out = sys.stdout
+        if not quiet:
+            out.write("* {} images\n".format(count))
+        for idx, doc in enumerate(q.all()):
+            if not quiet:
+                out.write("\r* hashes calculation: {} %".format(
+                    int(float(idx + 1) / count * 100)))
+                out.flush()
+            path = doc.image.path
+            try:
+                hexdigest = get_hexdigest(path)
+            except IOError:
+                missing_images.append(doc.pk)
+                continue
+            hashes.setdefault(hexdigest, []).append(doc.pk)
+        nb_missing_images = len(missing_images)
+        if not quiet:
+            out.write("\n* {} missing images\n".format(nb_missing_images))
+
+        if missing_images and delete_missing:
+            for nb, idx in enumerate(missing_images):
+                if not quiet:
+                    out.write(
+                        "\r* delete document with missing images: {} %".format(
+                            int(float(nb + 1) / nb_missing_images * 100)))
+                    out.flush()
+                Document.objects.get(pk=idx).delete()
+            if not quiet:
+                out.write("\n")
+
+        attributes = [  # fields compared between documents of a same image
+            'title', 'associated_file', 'internal_reference', 'source_type',
+            'support_type', 'format_type', 'scale',
+            'authors_raw', 'associated_url', 'receipt_date', 'creation_date',
+            'receipt_date_in_documentation', 'item_number', 'description',
+            'comment', 'additional_information', 'duplicate'
+        ]
+        if not ignore_ref:
+            attributes.append('reference')
+
+        m2ms = ['authors', 'licenses']  # many-to-many relations to union
+
+        nb_conflicted_items = 0
+        nb_merged_items = 0
+        distinct_image = 0
+        conflicts = []  # CSV rows: one per conflicting attribute
+        merged = []  # CSV rows: one per merged document
+
+        count = len(hashes)
+
+        for idx, digest in enumerate(hashes):
+            if not quiet:
+                out.write("\r* merge similar images: {} %".format(
+                    int(float(idx + 1) / count * 100)))
+                out.flush()
+            if len(hashes[digest]) < 2:  # image used by a single document
+                distinct_image += 1
+                continue
+            items = [Document.objects.get(pk=pk) for pk in hashes[digest]]
+            ref_item = items[0]  # document kept: the others merge into it
+            other_items = items[1:]
+
+            for item in other_items:
+                # reload to drop values set but not saved on a conflict
+                ref_item = Document.objects.get(pk=ref_item.pk)
+                conflicted_values = []
+                for attr in attributes:
+                    ref_value = getattr(ref_item, attr)
+                    other_value = getattr(item, attr)
+                    if ref_value:
+                        if not other_value:
+                            continue
+                        if other_value != ref_value:
+                            if attr == 'title' and merged_title:
+                                setattr(ref_item, 'title', merged_title)
+                            else:
+                                conflicted_values.append(
+                                    (attr, ref_value, other_value)
+                                )
+                    else:
+                        if not other_value:
+                            continue
+                        setattr(ref_item, attr, other_value)  # fill the gap
+
+                base_csv = [
+                    ref_item.pk,
+                    ref_item.reference.encode('utf-8') if
+                    ref_item.reference else "",
+                    ref_item.cache_related_label.encode('utf-8') if
+                    ref_item.cache_related_label else "",
+                    ref_item.image.name.encode('utf-8'),
+                    item.pk,
+                    item.reference.encode('utf-8') if
+                    item.reference else "",
+                    item.cache_related_label.encode('utf-8') if
+                    item.cache_related_label else "",
+                    item.image.name.encode('utf-8'),
+                ]
+                if conflicted_values:
+                    nb_conflicted_items += 1
+                    for attr, ref_value, other_value in conflicted_values:
+                        conflicts.append(base_csv + [
+                            attr, unicode(ref_value).encode('utf-8'),
+                            unicode(other_value).encode('utf-8')
+                        ])
+                    continue  # conflict: reported only, nothing is merged
+
+                merged.append(base_csv)
+
+                for m2m in m2ms:  # related managers are not iterable: .all()
+                    for m2 in getattr(item, m2m).all():
+                        if m2 not in getattr(ref_item, m2m).all():
+                            getattr(ref_item, m2m).add(m2)
+
+                for rel_attr in Document.RELATED_MODELS:
+                    ref_rel_items = [
+                        r.pk for r in getattr(ref_item, rel_attr).all()]
+                    for rel_item in getattr(item, rel_attr).all():
+                        if rel_item.pk not in ref_rel_items:
+                            getattr(ref_item, rel_attr).add(rel_item)
+
+                ref_item.skip_history_when_saving = True
+                ref_item.save()
+                item.delete()
+                nb_merged_items += 1
+        if not quiet:
+            out.write(u"\n")
+
+        n = datetime.datetime.now().isoformat().split('.')[0].replace(':', '-')
+        if conflicts:
+            filename = os.path.join(output_path, u"{}-conflict.csv".format(n))
+            with open(filename, 'w') as csvfile:
+                writer = csv.writer(csvfile)
+                writer.writerow(
+                    ["Document 1 - pk", "Document 1 - Ref",
+                     "Document 1 - related", "Document 1 - image path",
+                     "Document 2 - pk", "Document 2 - Ref",
+                     "Document 2 - related", "Document 2 - image path",
+                     "Attribute", "Document 1 - value", "Document 2 - value"
+                     ]
+                )
+                for conflict in conflicts:
+                    writer.writerow(conflict)
+            if not quiet:
+                out.write(u"* {} conflicted items ({})\n".format(
+                    nb_conflicted_items, filename))
+        if merged:
+            filename = os.path.join(output_path, u"{}-merged.csv".format(n))
+            with open(filename, 'w') as csvfile:
+                writer = csv.writer(csvfile)
+                writer.writerow(
+                    ["Document 1 - pk", "Document 1 - Ref",
+                     "Document 1 - related", "Document 1 - image path",
+                     "Document 2 - pk", "Document 2 - Ref",
+                     "Document 2 - related", "Document 2 - image path",
+                     ]
+                )
+                for merge in merged:
+                    writer.writerow(merge)
+            if not quiet:
+                out.write(u"* {} merged items ({})\n".format(nb_merged_items,
+                                                            filename))
+        if not quiet:
+            out.write("* {} distinct images\n".format(distinct_image))
+
+
+
+
+
+
+
+
+
+