summary refs log tree commit diff
path: root/ishtar_common/management/commands/reassociate_similar_images.py
diff options
context:
space:
mode:
author Étienne Loks <etienne.loks@iggdrasil.net> 2018-10-09 16:22:34 +0200
committer Étienne Loks <etienne.loks@iggdrasil.net> 2018-10-24 12:06:09 +0200
commit 8ef2b47de3a9a05ec57adfc43362a00a38e750ff (patch)
tree 65ad9d928f6862318819e56629794b519ffdd2b9 /ishtar_common/management/commands/reassociate_similar_images.py
parent 115901d896150f734b25a4dfb3802a69cb250ca0 (diff)
download Ishtar-8ef2b47de3a9a05ec57adfc43362a00a38e750ff.tar.bz2
Ishtar-8ef2b47de3a9a05ec57adfc43362a00a38e750ff.zip
Command to reassociate your images
Diffstat (limited to 'ishtar_common/management/commands/reassociate_similar_images.py')
-rw-r--r-- ishtar_common/management/commands/reassociate_similar_images.py 211
1 files changed, 211 insertions, 0 deletions
diff --git a/ishtar_common/management/commands/reassociate_similar_images.py b/ishtar_common/management/commands/reassociate_similar_images.py
new file mode 100644
index 000000000..f255f4876
--- /dev/null
+++ b/ishtar_common/management/commands/reassociate_similar_images.py
@@ -0,0 +1,211 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import csv
+import datetime
+import hashlib
+import os
+import sys
+
+from django.core.management.base import BaseCommand
+
+from ishtar_common.models import Document
+
+BLOCKSIZE = 65536
+
+
+def get_hexdigest(filename):
+    """
+    Return the SHA-256 hexadecimal digest of the content of `filename`.
+
+    The file is opened in binary mode and consumed in chunks of
+    BLOCKSIZE bytes, so its whole content is never held in memory at
+    once. Errors raised while opening or reading the file are not
+    caught here.
+    """
+    digest = hashlib.sha256()
+    with open(filename, 'rb') as stream:
+        # iter() with a sentinel stops at the first empty read (EOF)
+        for chunk in iter(lambda: stream.read(BLOCKSIZE), b''):
+            digest.update(chunk)
+    return digest.hexdigest()
+
+
+class Command(BaseCommand):
+    """
+    Merge documents whose image files have an identical content.
+
+    Documents with a non-empty image are grouped by the SHA-256 digest
+    of the image file. Inside a group the first document is the
+    reference: every other document is either merged into it and then
+    deleted, or reported as a conflict when one of the compared
+    attributes holds two different non-empty values. Merged and
+    conflicted pairs are listed in timestamped CSV files.
+    """
+    help = 'Re-associate similar images in the database'
+
+    # attributes compared between the reference and a duplicate;
+    # 'reference' is added unless --ignore-reference is given
+    COMPARED_ATTRIBUTES = (
+        'title', 'associated_file', 'internal_reference',
+        'source_type', 'description',
+    )
+    # CSV columns describing a (reference, duplicate) pair
+    DOCUMENT_COLUMNS = [
+        "Document 1 - pk", "Document 1 - Ref",
+        "Document 1 - related", "Document 1 - image path",
+        "Document 2 - pk", "Document 2 - Ref",
+        "Document 2 - related", "Document 2 - image path",
+    ]
+    # extra CSV columns only present in the conflict report
+    CONFLICT_COLUMNS = [
+        "Attribute", "Document 1 - value", "Document 2 - value",
+    ]
+
+    def add_arguments(self, parser):
+        """Declare the command line options of the command."""
+        parser.add_argument(
+            '--merge-title', type=str, default='', dest='merged-title',
+            help='If specified when title differs the given title will be '
+                 'used.')
+        parser.add_argument(
+            '--output-path', type=str, default='', dest='output-path',
+            help='Output path for results CSV files. Default to current path.')
+        parser.add_argument(
+            '--ignore-reference', dest='ignore-reference', action='store_true',
+            help='Ignore the reference on diff between models.')
+        parser.add_argument(
+            '--quiet', dest='quiet', action='store_true',
+            help='Quiet output.')
+
+    @staticmethod
+    def _progress(out, label, idx, count):
+        """Rewrite the current line of `out` with a percentage."""
+        # max() guards against a division by zero on an empty count
+        percent = int(float(idx + 1) / max(count, 1) * 100)
+        out.write("\r* {}: {} %".format(label, percent))
+        out.flush()
+
+    @staticmethod
+    def _encode(value):
+        """Return `value` as an UTF-8 byte string ('' for None)."""
+        # the Python 2 csv writer works with byte strings
+        if value is None:
+            return ''
+        return unicode(value).encode('utf-8')
+
+    @classmethod
+    def _document_row(cls, doc):
+        """Return the CSV cells identifying `doc`."""
+        return [
+            doc.pk, cls._encode(doc.reference),
+            cls._encode(doc.cache_related_label),
+            cls._encode(doc.image.name),
+        ]
+
+    @staticmethod
+    def _merge_attributes(ref_item, item, attributes, merged_title):
+        """
+        Copy onto `ref_item` the attributes only filled on `item`.
+
+        Nothing is saved here. When both documents hold a different
+        non-empty title and `merged_title` is set, this title is put on
+        `ref_item`. Any other difference is returned as a list of
+        (attribute, reference value, duplicate value) tuples.
+        """
+        conflicted_values = []
+        for attr in attributes:
+            other_value = getattr(item, attr)
+            if not other_value:
+                # nothing to bring in from the duplicate
+                continue
+            ref_value = getattr(ref_item, attr)
+            if not ref_value:
+                setattr(ref_item, attr, other_value)
+            elif other_value != ref_value:
+                if attr == 'title' and merged_title:
+                    setattr(ref_item, 'title', merged_title)
+                else:
+                    conflicted_values.append(
+                        (attr, ref_value, other_value))
+        return conflicted_values
+
+    @staticmethod
+    def _merge_relations(ref_item, item):
+        """Attach to `ref_item` the authors and related items of `item`."""
+        # a related manager is not iterable: `x in manager` raises a
+        # TypeError, so compare primary keys fetched through .all().
+        # Authors are not copied to `item`: it is deleted after the merge.
+        ref_author_pks = set(a.pk for a in ref_item.authors.all())
+        for author in item.authors.all():
+            if author.pk not in ref_author_pks:
+                ref_item.authors.add(author)
+
+        for rel_attr in Document.RELATED_MODELS:
+            ref_manager = getattr(ref_item, rel_attr)
+            ref_rel_pks = set(r.pk for r in ref_manager.all())
+            for rel_item in getattr(item, rel_attr).all():
+                if rel_item.pk not in ref_rel_pks:
+                    ref_manager.add(rel_item)
+
+    @staticmethod
+    def _write_csv(filename, headers, rows):
+        """Write `headers` then `rows` to the CSV file `filename`."""
+        with open(filename, 'w') as csvfile:
+            writer = csv.writer(csvfile)
+            writer.writerow(headers)
+            writer.writerows(rows)
+
+    def handle(self, *args, **options):
+        """
+        Hash every document image, merge the duplicates and write the
+        CSV reports. Progress is written to stdout unless --quiet is set.
+        """
+        quiet = options['quiet']
+        ignore_ref = options['ignore-reference']
+        merged_title = options['merged-title']
+        output_path = options['output-path']
+        out = sys.stdout
+
+        q = Document.objects.filter(image__isnull=False).exclude(
+            image='')
+        count = q.count()
+        if not quiet:
+            out.write("* {} images\n".format(count))
+
+        # group the document pks by the digest of their image content
+        hashes = {}
+        missing_images = []
+        for idx, doc in enumerate(q.all()):
+            if not quiet:
+                self._progress(out, "hashes calculation", idx, count)
+            path = doc.image.path
+            try:
+                hexdigest = get_hexdigest(path)
+            except IOError:
+                # the file cannot be read: only counted, never merged
+                missing_images.append(doc.pk)
+                continue
+            hashes.setdefault(hexdigest, []).append(doc.pk)
+        if not quiet:
+            out.write("\n* {} missing images\n".format(len(missing_images)))
+
+        attributes = list(self.COMPARED_ATTRIBUTES)
+        if not ignore_ref:
+            attributes.append('reference')
+
+        nb_conflicted_items = 0
+        nb_merged_items = 0
+        distinct_image = 0
+        conflicts = []
+        merged = []
+
+        count = len(hashes)
+
+        for idx, pks in enumerate(hashes.values()):
+            if not quiet:
+                self._progress(out, "merge similar images", idx, count)
+            if len(pks) < 2:
+                distinct_image += 1
+                continue
+            items = [Document.objects.get(pk=pk) for pk in pks]
+            ref_pk = items[0].pk
+
+            for item in items[1:]:
+                # reload the reference: it drops the unsaved changes of
+                # a conflicted pair and picks up the previous merge
+                ref_item = Document.objects.get(pk=ref_pk)
+                conflicted_values = self._merge_attributes(
+                    ref_item, item, attributes, merged_title)
+                row = self._document_row(ref_item) + \
+                    self._document_row(item)
+
+                if conflicted_values:
+                    # conflicted pairs are reported and left untouched
+                    nb_conflicted_items += 1
+                    for attr, ref_value, other_value in conflicted_values:
+                        conflicts.append(row + [
+                            attr, self._encode(ref_value),
+                            self._encode(other_value)
+                        ])
+                    continue
+
+                merged.append(row)
+                self._merge_relations(ref_item, item)
+
+                ref_item.skip_history_when_saving = True
+                ref_item.save()
+                item.delete()
+                nb_merged_items += 1
+        if not quiet:
+            out.write(u"\n")
+
+        # same text as isoformat() without microseconds and with ':'
+        # replaced by '-'
+        n = datetime.datetime.now().strftime('%Y-%m-%dT%H-%M-%S')
+        if conflicts:
+            # os.path.join adds the separator missing from a plain
+            # concatenation when the output path has no trailing slash
+            filename = os.path.join(
+                output_path, u"{}-conflict.csv".format(n))
+            self._write_csv(
+                filename, self.DOCUMENT_COLUMNS + self.CONFLICT_COLUMNS,
+                conflicts)
+            if not quiet:
+                out.write(u"* {} conflicted items ({})\n".format(
+                    nb_conflicted_items, filename))
+        if merged:
+            filename = os.path.join(
+                output_path, u"{}-merged.csv".format(n))
+            self._write_csv(filename, self.DOCUMENT_COLUMNS, merged)
+            if not quiet:
+                out.write(u"* {} merged items ({})\n".format(nb_merged_items,
+                                                             filename))
+        if not quiet:
+            out.write("* {} distinct images\n".format(distinct_image))
+
+
+
+
+
+
+
+
+
+