From 34e5e81e7afe240b05f2df834d21e08dbb4bcf79 Mon Sep 17 00:00:00 2001 From: Étienne Loks Date: Wed, 18 Jan 2023 23:33:56 +0100 Subject: Free search: "raw" index for reference - improve parent only search --- CHANGES.md | 5 ++ archaeological_context_records/models.py | 4 +- archaeological_context_records/tests.py | 5 +- archaeological_files/models.py | 4 +- archaeological_finds/models_finds.py | 21 ++++---- archaeological_operations/models.py | 37 +++++++++----- archaeological_warehouse/models.py | 9 ++-- .../management/commands/ishtar_maintenance.py | 2 +- ishtar_common/models.py | 6 +-- ishtar_common/models_common.py | 59 ++++++++++++++++++---- ishtar_common/utils.py | 11 +++- ishtar_common/views_item.py | 6 ++- 12 files changed, 116 insertions(+), 53 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 111f1d5cd..d4f072642 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -6,6 +6,11 @@ date: 2023-01-18 Ishtar changelog ================ +### Features/improvements ### +- Free search: + - "raw" index for reference (add in index whole reference and split ref) + - improve parent only search index + v4.0.35 - 2023-01-18 -------------------- diff --git a/archaeological_context_records/models.py b/archaeological_context_records/models.py index 72a835cc2..074bc3358 100644 --- a/archaeological_context_records/models.py +++ b/archaeological_context_records/models.py @@ -635,10 +635,10 @@ class ContextRecord( PARENT_ONLY_SEARCH_VECTORS = ["operation", "archaeological_site", "parcel"] BASE_SEARCH_VECTORS = [ - SearchVectorConfig("cached_label"), - SearchVectorConfig("label"), + SearchVectorConfig("label", "raw"), SearchVectorConfig("location"), SearchVectorConfig("town__name"), + SearchVectorConfig("town__numero_insee", "raw"), SearchVectorConfig("interpretation", "local"), SearchVectorConfig("filling", "local"), SearchVectorConfig("datings_comment", "local"), diff --git a/archaeological_context_records/tests.py b/archaeological_context_records/tests.py index 9842340f3..f0b12e0d6 100644 --- a/archaeological_context_records/tests.py +++ b/archaeological_context_records/tests.py @@ -429,8 +429,9 @@ class ContextRecordTest(ContextRecordInit, TestCase): self.assertIsNotNone(cr.search_vector) for key in ("label", "heeer"): self.assertIn(key, cr.search_vector) - cr.operation.code_patriarche = "PATRIARCHE" - cr.operation.save() + operation = models_ope.Operation.objects.get(pk=cr.operation.pk) + operation.code_patriarche = "PATRIARCHE" + operation.save() cr = models.ContextRecord.objects.get(pk=cr.pk) profile = get_current_profile() diff --git a/archaeological_files/models.py b/archaeological_files/models.py index c2992935b..bb31c1c24 100644 --- a/archaeological_files/models.py +++ b/archaeological_files/models.py @@ -417,11 +417,11 @@ class File( } BASE_SEARCH_VECTORS = [ SearchVectorConfig("name"), - SearchVectorConfig("internal_reference"), + SearchVectorConfig("internal_reference", "raw"), SearchVectorConfig("file_type__label"), SearchVectorConfig("saisine_type__label"), SearchVectorConfig("permit_type__label"), - SearchVectorConfig("permit_reference"), + SearchVectorConfig("permit_reference", "raw"), SearchVectorConfig("comment", "local"), SearchVectorConfig("research_comment", "local"), ] diff --git a/archaeological_finds/models_finds.py b/archaeological_finds/models_finds.py index 0efb9963e..6be609056 100644 --- a/archaeological_finds/models_finds.py +++ b/archaeological_finds/models_finds.py @@ -459,12 +459,12 @@ class BaseFind( CACHED_COMPLETE_ID = "cache_complete_id" PARENT_SEARCH_VECTORS = ["context_record"] BASE_SEARCH_VECTORS = [ - SearchVectorConfig("label"), + SearchVectorConfig("label", "raw"), SearchVectorConfig("description", "local"), SearchVectorConfig("comment", "local"), - SearchVectorConfig("cache_short_id"), - SearchVectorConfig("cache_complete_id"), - SearchVectorConfig("excavation_id"), + SearchVectorConfig("cache_short_id", "raw"), + SearchVectorConfig("cache_complete_id", "raw"), + SearchVectorConfig("excavation_id", "raw"), ] DOC_VALUES = [ @@ -1652,19 +1652,18 @@ class Find( """ PARENT_SEARCH_VECTORS = ["base_finds"] + PARENT_ONLY_SEARCH_VECTORS = ["container"] BASE_SEARCH_VECTORS = [ - SearchVectorConfig("cached_label"), - SearchVectorConfig("label"), + SearchVectorConfig("cached_label", "raw"), + SearchVectorConfig("label", "raw"), SearchVectorConfig("description", "local"), - SearchVectorConfig("container__location__name"), - SearchVectorConfig("container__reference"), SearchVectorConfig("mark"), SearchVectorConfig("comment", "local"), SearchVectorConfig("dating_comment", "local"), - SearchVectorConfig("previous_id"), + SearchVectorConfig("previous_id", "raw"), SearchVectorConfig("denomination"), - SearchVectorConfig("museum_id"), - SearchVectorConfig("laboratory_id"), + SearchVectorConfig("museum_id", "raw"), + SearchVectorConfig("laboratory_id", "raw"), SearchVectorConfig("decoration"), SearchVectorConfig("manufacturing_place"), ] diff --git a/archaeological_operations/models.py b/archaeological_operations/models.py index caaff33d9..191918237 100644 --- a/archaeological_operations/models.py +++ b/archaeological_operations/models.py @@ -326,19 +326,20 @@ class ArchaeologicalSite( SearchVectorConfig("locality_ngi", "local"), SearchVectorConfig("name"), SearchVectorConfig("oceanographic_service_localisation"), - SearchVectorConfig("reference"), - SearchVectorConfig("other_reference"), - SearchVectorConfig("shipwreck_code"), + SearchVectorConfig("reference", "raw"), + SearchVectorConfig("other_reference", "raw"), + SearchVectorConfig("shipwreck_code", "raw"), SearchVectorConfig("shipwreck_name"), - SearchVectorConfig("drassm_number"), - SearchVectorConfig("affmar_number"), + SearchVectorConfig("drassm_number", "raw"), + SearchVectorConfig("affmar_number", "raw"), ] M2M_SEARCH_VECTORS = [ SearchVectorConfig("periods__label", "local"), SearchVectorConfig("remains__label", "local"), SearchVectorConfig("towns__name"), + SearchVectorConfig("towns__numero_insee", "raw"), ] - PARENT_SEARCH_VECTORS = ["operations"] + PARENT_ONLY_SEARCH_VECTORS = ["operations"] DATED_FIELDS = BaseHistorizedItem.DATED_FIELDS + ["sinking_date"] @@ -907,6 +908,15 @@ class ParcelItem: parcels[key] = p +def add_oa_prefix(value): + if not value: + return "" + profile = get_current_profile() + if not profile.operation_prefix: + return "" + return profile.operation_prefix + value + + class Operation( ClosedItem, DocumentItem, @@ -1035,25 +1045,26 @@ class Operation( BASE_SEARCH_VECTORS = [ SearchVectorConfig("abstract", "local"), SearchVectorConfig("address", "local"), - SearchVectorConfig("code_patriarche"), + SearchVectorConfig("code_patriarche", "raw"), + SearchVectorConfig("code_patriarche", "raw", func=add_oa_prefix), SearchVectorConfig("comment", "local"), SearchVectorConfig("common_name"), SearchVectorConfig("common_name", "local"), SearchVectorConfig("in_charge__cached_label"), SearchVectorConfig("protagonist__cached_label"), - SearchVectorConfig("official_report_number"), - SearchVectorConfig("old_code"), + SearchVectorConfig("official_report_number", "raw"), + SearchVectorConfig("old_code", "raw"), SearchVectorConfig("operation_type__label"), - SearchVectorConfig("operator_reference"), + SearchVectorConfig("operator_reference", "raw"), SearchVectorConfig("operator__cached_label"), SearchVectorConfig("scientist__cached_label"), SearchVectorConfig("scientific_documentation_comment", "local"), SearchVectorConfig("seizure_name"), - SearchVectorConfig("drassm_code"), + SearchVectorConfig("drassm_code", "raw"), ] PROPERTY_SEARCH_VECTORS = [ - SearchVectorConfig("full_reference"), - SearchVectorConfig("short_code_patriarche"), + SearchVectorConfig("full_reference", "raw"), + SearchVectorConfig("short_code_patriarche", "raw"), ] INT_SEARCH_VECTORS = [ SearchVectorConfig("year"), diff --git a/archaeological_warehouse/models.py b/archaeological_warehouse/models.py index 9a8bc86c2..0997d6ea7 100644 --- a/archaeological_warehouse/models.py +++ b/archaeological_warehouse/models.py @@ -836,17 +836,18 @@ class Container( ] IMAGE_PREFIX = "containers/" BASE_SEARCH_VECTORS = [ - SearchVectorConfig("reference"), + SearchVectorConfig("reference", "raw"), SearchVectorConfig("container_type__label"), SearchVectorConfig("cached_location"), - SearchVectorConfig("old_reference"), + SearchVectorConfig("old_reference", "raw"), SearchVectorConfig("comment", "local"), ] M2M_SEARCH_VECTORS = [ - SearchVectorConfig("division__reference"), - SearchVectorConfig("division__division__division__label"), + SearchVectorConfig("division__reference", "raw"), + SearchVectorConfig("division__division__division__label", "raw"), ] PARENT_SEARCH_VECTORS = ["parent"] + PARENT_ONLY_SEARCH_VECTORS = ["finds", "finds_ref"] STATISTIC_MODALITIES_OPTIONS = OrderedDict( [ diff --git a/ishtar_common/management/commands/ishtar_maintenance.py b/ishtar_common/management/commands/ishtar_maintenance.py index c4ad52ab6..4f050088e 100644 --- a/ishtar_common/management/commands/ishtar_maintenance.py +++ b/ishtar_common/management/commands/ishtar_maintenance.py @@ -131,7 +131,7 @@ def _end_task(changed_nb, msg, quiet, store_results, log, log_name, csv_cols): writer.writerow(csv_cols) writer.writerows(store_results) if not quiet: - sys.stdout.write(f"log: {path} written.") + sys.stdout.write(f"log: {path} written.\n") def task_main_image(quiet=False, log=False): diff --git a/ishtar_common/models.py b/ishtar_common/models.py index 442e0c687..fc2487d7b 100644 --- a/ishtar_common/models.py +++ b/ishtar_common/models.py @@ -3780,9 +3780,9 @@ class Document( BASE_SEARCH_VECTORS = [ SearchVectorConfig("title"), SearchVectorConfig("source_type__label"), - SearchVectorConfig("external_id"), - SearchVectorConfig("reference"), - SearchVectorConfig("internal_reference"), + SearchVectorConfig("external_id", "raw"), + SearchVectorConfig("reference", "raw"), + SearchVectorConfig("internal_reference", "raw"), SearchVectorConfig("description", "local"), SearchVectorConfig("comment", "local"), SearchVectorConfig("additional_information", "local"), diff --git a/ishtar_common/models_common.py b/ishtar_common/models_common.py index a6f9ac0e8..03fe26f0c 100644 --- a/ishtar_common/models_common.py +++ b/ishtar_common/models_common.py @@ -884,8 +884,36 @@ class FullSearch(models.Model): deactivate() return query_parameters + @classmethod + def _update_raw_search_field(cls, value): + result = [] + if not value: + value = "" + for val in value.split("'"): + result.append(f"'{val.lower()}':1") + SEPS = [" ", "-", "/"] + values = [] + # split ID terms + for idx, sep in enumerate(SEPS): + if not idx: + values = value.split(sep) + continue + new_values = [] + for val in values: + new_values += val.split(sep) + values = new_values + for val in values: + if len(val) < 2: + continue + val = val.replace("'", "").lower() + result.append(f"'{val}':1") + return result + def _update_search_field(self, search_vector_conf, search_vectors, data): for value in search_vector_conf.format(data): + if search_vector_conf.language == "raw": + search_vectors += self._update_raw_search_field(value) + continue with connection.cursor() as cursor: cursor.execute( "SELECT to_tsvector(%s, %s)", [search_vector_conf.language, value] @@ -921,7 +949,7 @@ class FullSearch(models.Model): logger.warning("No search_vectors defined for {}".format(self.__class__)) return if getattr(self, "_search_updated", None): - return + return self.search_vector JsonDataField = apps.get_model("ishtar_common", "JsonDataField") self._search_updated = True @@ -938,6 +966,12 @@ class FullSearch(models.Model): for item in rel_key.values("pk").all(): query_dct = {key + "__pk": item["pk"]} q = copy.copy(base_q).filter(**query_dct) + if m2m_search_vector.language == "raw": + q = q.values_list(m2m_search_vector.key, flat=True) + search_vectors += self._update_raw_search_field(q.all()[0]) + continue + query_dct = {key + "__pk": item["pk"]} + q = copy.copy(base_q).filter(**query_dct) q = q.annotate( search=SearchVector( m2m_search_vector.key, config=m2m_search_vector.language @@ -961,17 +995,17 @@ class FullSearch(models.Model): elif parent: search_vectors.append(parent.search_vector) - for PARENT_ONLY_SEARCH_VECTOR in self.PARENT_ONLY_SEARCH_VECTORS: - parent = getattr(self, PARENT_ONLY_SEARCH_VECTOR) - if hasattr(parent, "all"): # m2m - for p in parent.all(): + for PARENT_ONLY_SEARCH_VECTOR in self.PARENT_ONLY_SEARCH_VECTORS: + parent = getattr(self, PARENT_ONLY_SEARCH_VECTOR) + if hasattr(parent, "all"): # m2m + for p in parent.all(): + search_vectors.append( + p.update_search_vector(save=False, exclude_parent=True) + ) + elif parent: search_vectors.append( - p.update_search_vector(save=False, exclude_parent=True) + parent.update_search_vector(save=False, exclude_parent=True) ) - elif parent: - search_vectors.append( - parent.update_search_vector(save=False, exclude_parent=True) - ) if self.BASE_SEARCH_VECTORS: # query "simple" fields @@ -4110,7 +4144,10 @@ class SearchVectorConfig: value = "" if not self.func: return [value] - return self.func(value) + value = self.func(value) + if not isinstance(value, list): + return [value] + return value class ShortMenuItem: diff --git a/ishtar_common/utils.py b/ishtar_common/utils.py index 340fb9ee0..1219dd454 100644 --- a/ishtar_common/utils.py +++ b/ishtar_common/utils.py @@ -934,6 +934,9 @@ def num2col(n): return string +RE_TSVECTOR = re.compile(r"('[^']+':\d+(?:,\d+)*)") + + def merge_tsvectors(vectors): """ Parse tsvector to merge them in one string @@ -952,16 +955,20 @@ def merge_tsvectors(vectors): if max_position > current_position: current_position = max_position - for dct_member in vector.split(" "): + for dct_member in RE_TSVECTOR.findall(vector): splitted = dct_member.split(":") key = ":".join(splitted[:-1]) - positions = splitted[-1] key = key[1:-1] # remove quotes + result_dict[key] = [1] + """ + # position is not used today - simplify + positions = splitted[-1] positions = [int(pos) + current_position for pos in positions.split(",")] if key in result_dict: result_dict[key] += positions else: result_dict[key] = positions + """ # {'lamelie': [1, 42, 5]} => {'lamelie': "1,42,5"} result_dict = { diff --git a/ishtar_common/views_item.py b/ishtar_common/views_item.py index 821a6635f..f905d677c 100644 --- a/ishtar_common/views_item.py +++ b/ishtar_common/views_item.py @@ -835,9 +835,11 @@ def _search_manage_search_vector( { "where": [ f"{model._meta.db_table}.search_vector @@ (to_tsquery(%s, %s)) = true OR " + - f"{model._meta.db_table}.search_vector @@ (to_tsquery('simple', %s)) = true" + f"{model._meta.db_table}.search_vector @@ (to_tsquery('simple', %s)) = true OR " + f"{model._meta.db_table}.search_vector::tsvector @@ %s::tsquery = true" ], - "params": [settings.ISHTAR_SEARCH_LANGUAGE, search_query, search_query], + "params": [settings.ISHTAR_SEARCH_LANGUAGE, search_query, search_query, + search_query], } ) return dct, exc_dct, distinct_queries -- cgit v1.2.3