diff --git a/apps/commons/mixins.py b/apps/commons/mixins.py index 84ef04d5..7472bbd8 100644 --- a/apps/commons/mixins.py +++ b/apps/commons/mixins.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple +from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Self, Tuple from django.contrib.auth.models import Group, Permission from django.contrib.contenttypes.models import ContentType @@ -408,3 +408,23 @@ def get_slug(self) -> str: if self.get_id_field_name(slug) != "slug": slug = f"{self.slug_prefix}-{slug}" return slug + + +class HasEmbending: + def vectorize(self): + if not getattr(self, "embedding", None): + model_embedding = type(self).embedding.related.related_model + self.embedding = model_embedding(item=self) + self.embedding.save() + self.embedding.vectorize() + + def similars(self, threshold: float = 0.15) -> QuerySet[Self]: + """return similars documents""" + if getattr(self, "embedding", None): + vector = self.embedding.embedding + model_embedding = type(self).embedding.related.related_model + queryset = type(self).objects.all() + return model_embedding.vector_search(vector, queryset, threshold).exclude( + pk=self.pk + ) + return type(self).objects.none() diff --git a/services/crisalid/admin.py b/services/crisalid/admin.py index 7043fb35..a1f9d6b6 100644 --- a/services/crisalid/admin.py +++ b/services/crisalid/admin.py @@ -1,10 +1,14 @@ from contextlib import suppress +from typing import Any from django.contrib import admin, messages from django.db.models import Count +from django.db.models.query import QuerySet +from django.http.request import HttpRequest -from apps.accounts.models import ProjectUser +from apps.accounts.models import PeopleGroup, ProjectUser from apps.commons.admin import TranslateObjectAdminMixin +from services.crisalid.manager import CrisalidQuerySet from services.crisalid.tasks import vectorize_documents from .models import ( @@ -13,9 +17,21 @@ DocumentContributor, Identifier, 
Researcher, + Structure, ) +class IdentifierAminMixin: + @admin.display(description="identifiers count", ordering="identifiers_count") + def get_identifiers(self, instance): + # list all harvester name from this profile + result = [o.harvester for o in instance.identifiers.all()] + if not result: + return None + + return f"{', '.join(result)} ({len(result)})" + + @admin.register(Identifier) class IdentifierAdmin(admin.ModelAdmin): list_display = ("harvester", "value", "get_researcher", "get_documents") @@ -45,7 +61,7 @@ class DocumentContributorAdminInline(admin.StackedInline): @admin.register(Document) -class DocumentAdmin(TranslateObjectAdminMixin, admin.ModelAdmin): +class DocumentAdmin(TranslateObjectAdminMixin, IdentifierAminMixin, admin.ModelAdmin): list_display = ( "title", "publication_date", @@ -89,22 +105,16 @@ def get_queryset(self, request): def get_contributors(self, instance): return instance.contributors.count() - @admin.display(description="identifiers count", ordering="identifiers_count") - def get_identifiers(self, instance): - # list all harvester name from this profile - result = [o.harvester for o in instance.identifiers.all()] - if not result: - return None - return f"{', '.join(result)} ({len(result)})" - @admin.register(Researcher) -class ResearcherAdmin(admin.ModelAdmin): +class ResearcherAdmin(IdentifierAminMixin, admin.ModelAdmin): list_display = ( "given_name", "family_name", "user", "get_documents", + "get_memberships", + "get_employments", "get_identifiers", ) search_fields = ( @@ -124,6 +134,8 @@ def get_queryset(self, request): .prefetch_related("identifiers", "documents") .annotate(identifiers_count=Count("identifiers__id")) .annotate(documents_count=Count("documents__id", distinct=True)) + .annotate(memberships_count=Count("memberships__id", distinct=True)) + .annotate(employments_count=Count("employments__id", distinct=True)) ) @admin.action(description="assign researcher on projects") @@ -138,17 +150,18 @@ def assign_user(self, 
request, queryset): continue for identifier in research.identifiers.all(): - if identifier.harvester != Identifier.Harvester.EPPN.value: + if identifier.harvester != Identifier.Harvester.LOCAL.value: continue user = None + email = identifier.value with suppress(ProjectUser.DoesNotExist): - user = ProjectUser.objects.get(email=identifier.value) + user = ProjectUser.objects.get(email=email) if not user: created += 1 user = ProjectUser( - email=identifier.value, + email=email, given_name=research.given_name, family_name=research.family_name, ) @@ -177,14 +190,70 @@ def assign_user(self, request, queryset): def get_documents(self, instance): return instance.documents_count - @admin.display(description="identifiers count", ordering="identifiers_count") - def get_identifiers(self, instance): - # list all harvester name from this profile - result = [o.harvester for o in instance.identifiers.all()] - if not result: - return None + @admin.display(description="number of memberships", ordering="-memberships_count") + def get_memberships(self, instance): + return instance.memberships_count - return f"{', '.join(result)} ({len(result)})" + @admin.display(description="number of employments", ordering="-employments_count") + def get_employments(self, instance): + return instance.employments_count + + +@admin.register(Structure) +class StructureAdmin(IdentifierAminMixin, admin.ModelAdmin): + list_display = ( + "acronym", + "name", + "organization", + "get_memberships", + "get_employments", + "get_identifiers", + ) + search_fields = ("acronym", "name", "organization__code") + autocomplete_fields = ("organization",) + actions = ("assign_group",) + + def get_queryset(self, request: HttpRequest) -> QuerySet[Any]: + return ( + super() + .get_queryset(request) + .select_related("organization") + .annotate( + memberships_count=Count("memberships__pk", distinct=True), + employments_count=Count("employments__pk", distinct=True), + ) + ) + + @admin.action(description="create/update groups") 
+ def assign_group(self, request, queryset: CrisalidQuerySet): + for structure in queryset: + name = structure.name or structure.acronym + if not name: + continue + + parent = PeopleGroup.update_or_create_root(structure.organization) + group = PeopleGroup.objects.filter( + parent=parent, name=name, organization=structure.organization + ).first() + if not group: + group = PeopleGroup( + name=name, parent=parent, organization=structure.organization + ) + + group.save() + member_group = group.get_members() + for membership in structure.memberships.select_related("user").filter( + user__isnull=False + ): + membership.user.groups.add(member_group) + + @admin.display(description="number of memberships", ordering="-memberships_count") + def get_memberships(self, instance): + return instance.memberships_count + + @admin.display(description="number of employments", ordering="-employments_count") + def get_employments(self, instance): + return instance.employments_count @admin.register(CrisalidConfig) diff --git a/services/crisalid/factories.py b/services/crisalid/factories.py index 44435c20..bba115b8 100644 --- a/services/crisalid/factories.py +++ b/services/crisalid/factories.py @@ -36,6 +36,8 @@ def value(self): Identifier.Harvester.EPPN: faker.unique.email(), Identifier.Harvester.DOI: faker.unique.doi(), Identifier.Harvester.PMID: faker.unique.url(), + Identifier.Harvester.NNS: faker.unique.uuid4(), + Identifier.Harvester.RNSR: faker.unique.uuid4(), }[self.harvester] diff --git a/services/crisalid/management/commands/populate_crisalid.py b/services/crisalid/management/commands/populate_crisalid.py index 517af011..b7593d67 100644 --- a/services/crisalid/management/commands/populate_crisalid.py +++ b/services/crisalid/management/commands/populate_crisalid.py @@ -10,7 +10,11 @@ Identifier, Researcher, ) -from services.crisalid.populates import PopulateDocument, PopulateResearcher +from services.crisalid.populates import ( + PopulateDocument, + PopulateResearcher, + 
PopulateStructure, +) from services.crisalid.populates.base import AbstractPopulate from services.crisalid.utils.timer import timeit from services.mistral.models import DocumentEmbedding @@ -23,13 +27,13 @@ def add_arguments(self, parser): parser.add_argument( "organization", choices=CrisalidConfig.objects.filter( - organization__code__isnull=False + organization__code__isnull=False, active=True ).values_list("organization__code", flat=True), help="organization code", ) parser.add_argument( "command", - choices=("document", "researcher", "all"), + choices=("document", "researcher", "structure", "all"), help="elements to populate", ) parser.add_argument( @@ -111,3 +115,12 @@ def handle(self, **options): where={"external_EQ": False}, **options, ) + + if command in ("all", "structure"): + populate = PopulateStructure(config) + self.populate_crisalid( + service, + populate, + query="organisations", + **options, + ) diff --git a/services/crisalid/migrations/0003_alter_document_options.py b/services/crisalid/migrations/0003_alter_document_options.py new file mode 100644 index 00000000..851222d3 --- /dev/null +++ b/services/crisalid/migrations/0003_alter_document_options.py @@ -0,0 +1,23 @@ +# Generated by Django 5.2.10 on 2026-02-10 14:11 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("crisalid", "0002_crisalidconfig_and_more"), + ] + + operations = [ + migrations.AlterModelOptions( + name="document", + options={ + "ordering": ( + models.OrderBy( + models.F("publication_date"), descending=True, nulls_last=True + ), + ) + }, + ), + ] diff --git a/services/crisalid/models.py b/services/crisalid/models.py index 965cf3c5..f4c8955a 100644 --- a/services/crisalid/models.py +++ b/services/crisalid/models.py @@ -5,10 +5,9 @@ from django.db import models from django.db.models.functions import Lower -from apps.commons.mixins import OrganizationRelated +from apps.commons.mixins import HasEmbending, OrganizationRelated 
from apps.organizations.models import Organization from services.crisalid import relators -from services.mistral.models import DocumentEmbedding from services.translator.mixins import HasAutoTranslatedFields from .manager import CrisalidQuerySet, DocumentQuerySet @@ -58,6 +57,8 @@ class Harvester(models.TextChoices): EPPN = "eppn" DOI = "doi" PMID = "pmid" + NNS = "nns" + RNSR = "rnsr" harvester = models.CharField(max_length=50, choices=Harvester.choices) value = models.CharField(max_length=255) @@ -94,6 +95,12 @@ class Researcher(CrisalidDataModel): ) objects = CrisalidQuerySet.as_manager() + memberships = models.ManyToManyField( + "crisalid.Structure", related_name="memberships" + ) + employments = models.ManyToManyField( + "crisalid.Structure", related_name="employments" + ) def __str__(self): if hasattr(self, "user") and self.user is not None: @@ -121,7 +128,9 @@ class Meta: ] -class Document(OrganizationRelated, HasAutoTranslatedFields, CrisalidDataModel): +class Document( + HasEmbending, OrganizationRelated, HasAutoTranslatedFields, CrisalidDataModel +): """ Represents a research publicaiton (or 'document') in the Crisalid system. 
""" @@ -199,6 +208,10 @@ class DocumentType(models.TextChoices): organization_query_string = "contributors__user__groups__organizations" + class Meta: + # order by publicattion date, and put "null date" at last + ordering = (models.F("publication_date").desc(nulls_last=True),) + def get_related_organizations(self): """organizations from user""" return list( @@ -217,24 +230,6 @@ def document_type_centralized(self) -> list[str]: return vals return [self.document_type] - def vectorize(self): - if not getattr(self, "embedding", None): - self.embedding = DocumentEmbedding(item=self) - self.embedding.save() - self.embedding.vectorize() - - def similars(self, threshold: float = 0.15) -> DocumentQuerySet: - """return similars documents""" - if getattr(self, "embedding", None): - vector = self.embedding.embedding - queryset = Document.objects.all() - return ( - DocumentEmbedding.vector_search(vector, queryset, threshold) - .filter(document_type__in=self.document_type_centralized) - .exclude(pk=self.pk) - ) - return Document.objects.none() - def save(self, *ar, **kw): md = super().save(*ar, **kw) # when we update models , re-calculate vectorize @@ -285,6 +280,29 @@ def values(cls) -> Generator[tuple[str]]: yield v +class Structure(OrganizationRelated, CrisalidDataModel): + acronym = models.TextField(null=True, blank=True) + name = models.TextField() + identifiers = models.ManyToManyField( + "crisalid.Identifier", related_name="structures" + ) + organization = models.ForeignKey( + "organizations.Organization", + on_delete=models.CASCADE, + related_name="structures", + ) + objects = CrisalidQuerySet.as_manager() + group = models.ForeignKey( + "accounts.PeopleGroup", + on_delete=models.SET_NULL, + null=True, + related_name="structure", + ) + + def __str__(self): + return self.name + + class CrisalidConfig(OrganizationRelated, models.Model): """model for crisalid config with host/pass for connected to crisalid, is linked to a one organization diff --git 
a/services/crisalid/populates/__init__.py b/services/crisalid/populates/__init__.py index b8986cd3..8302bbf8 100644 --- a/services/crisalid/populates/__init__.py +++ b/services/crisalid/populates/__init__.py @@ -1,9 +1,13 @@ from .caches import LiveCache from .document import PopulateDocument +from .identifier import PopulateIdentifier from .researcher import PopulateResearcher +from .structure import PopulateStructure __all__ = ( "PopulateResearcher", "PopulateDocument", + "PopulateStructure", + "PopulateIdentifier", "LiveCache", ) diff --git a/services/crisalid/populates/base.py b/services/crisalid/populates/base.py index 5c597589..cc92ba42 100644 --- a/services/crisalid/populates/base.py +++ b/services/crisalid/populates/base.py @@ -19,6 +19,12 @@ def __init__(self, config: CrisalidConfig, cache: TCACHE = None): self.config = config self.cache = cache or LiveCache() + def sanitize_string(self, value) -> str: + """strip value and convert it to string""" + if not value: + return "" + return str(value).strip() + def sanitize_languages(self, values: list[dict[str, str]]) -> str: """convert languages choices from crisalid fields crisalid return a list of objects with "language" and "value" assosiated from the language @@ -29,7 +35,7 @@ def sanitize_languages(self, values: list[dict[str, str]]) -> str: maps_languages = {} for value in values: - maps_languages[value["language"]] = (value["value"] or "").strip() + maps_languages[value["language"]] = self.sanitize_string(value["value"]) return ( maps_languages.get("en") diff --git a/services/crisalid/populates/document.py b/services/crisalid/populates/document.py index ab5025b4..0069ddef 100644 --- a/services/crisalid/populates/document.py +++ b/services/crisalid/populates/document.py @@ -1,10 +1,6 @@ from services.crisalid import relators -from services.crisalid.models import ( - CrisalidConfig, - Document, - DocumentContributor, - Identifier, -) +from services.crisalid.models import CrisalidConfig, Document, 
DocumentContributor +from services.crisalid.populates.identifier import PopulateIdentifier from .base import AbstractPopulate from .logger import logger @@ -14,7 +10,10 @@ class PopulateDocument(AbstractPopulate): def __init__(self, config: CrisalidConfig, cache=None): super().__init__(config, cache) - self.populate_researcher = PopulateResearcher(self.config, self.cache) + self.populate_identifiers = PopulateIdentifier(self.config, self.cache) + self.populate_researcher = PopulateResearcher( + self.config, self.cache, populate_identifiers=self.populate_identifiers + ) def sanitize_document_type(self, data: str | None): """Check documentType , and return unknow value if is not set in enum""" @@ -37,15 +36,7 @@ def sanitize_roles(self, data: list[str]) -> list[str]: def single(self, data: dict) -> Document | None: """this method create/update only on document from crisalid""" # identifiers (hal, openalex, idref ...ect) - documents_identifiers = [] - for recorded in data["recorded_by"]: - identifier = self.cache.model( - Identifier, - value=recorded["uid"], - harvester=recorded["harvester"].lower(), - ) - self.cache.save(identifier) - documents_identifiers.append(identifier) + documents_identifiers = self.populate_identifiers.multiple(data["recorded_by"]) # no identifiers for this documents, we ignore it if not documents_identifiers: diff --git a/services/crisalid/populates/identifier.py b/services/crisalid/populates/identifier.py new file mode 100644 index 00000000..e13d5877 --- /dev/null +++ b/services/crisalid/populates/identifier.py @@ -0,0 +1,40 @@ +from services.crisalid.models import Identifier + +from .base import AbstractPopulate + + +class PopulateIdentifier(AbstractPopulate): + """Populate class for identifiers element + + ex: + { + "type": "RNSR", + "value": "200612823S" + } + """ + + def sanitize_harvester(self, harvester: str) -> str: + # TODO change when crisalid is ok with all identifiers + # harvester can be "orcid_id" or "orcid" + if harvester == 
"orcid_id": + return Identifier.Harvester.ORCID + + if harvester not in Identifier.Harvester: + return None + + return harvester + + def single(self, data: dict) -> Identifier | None: + harvester = self.sanitize_harvester(self.sanitize_string(data["type"]).lower()) + value = self.sanitize_string(data["value"]) + + if not all((harvester, value)): + return None + + identifier = self.cache.model( + Identifier, + value=value, + harvester=harvester, + ) + self.cache.save(identifier) + return identifier diff --git a/services/crisalid/populates/researcher.py b/services/crisalid/populates/researcher.py index a6245a80..f0923f3d 100644 --- a/services/crisalid/populates/researcher.py +++ b/services/crisalid/populates/researcher.py @@ -1,10 +1,22 @@ from apps.accounts.models import PrivacySettings, ProjectUser + from services.crisalid.models import Identifier, Researcher +from services.crisalid.populates.identifier import PopulateIdentifier +from services.crisalid.populates.structure import PopulateStructure from .base import AbstractPopulate class PopulateResearcher(AbstractPopulate): + def __init__(self, *ar, populate_identifiers=None, populate_structures=None, **kw): + super().__init__(*ar, **kw) + self.populate_identifiers = populate_identifiers or PopulateIdentifier( + self.config, self.cache + ) + self.populate_structures = populate_structures or PopulateStructure( + self.config, self.cache, populate_identifiers=self.populate_identifiers + ) + def get_names(self, data): given_name = family_name = "" @@ -12,9 +24,6 @@ def get_names(self, data): given_name = self.sanitize_languages(name["first_names"]) family_name = self.sanitize_languages(name["last_names"]) - given_name = (given_name or "").strip() - family_name = (family_name or "").strip() - return given_name, family_name def create_user(self, eppn: str, given_name: str, family_name: str) -> ProjectUser: @@ -45,29 +54,26 @@ def update_user(self, user: ProjectUser) -> ProjectUser: return user def check_mapping_user( - 
self, researcher: Researcher, data: dict + self, + researcher: Researcher, + identifiers: list[Identifier], + given_name: str, + family_name: str, ) -> ProjectUser | None: """match user from researcher (need eppn)""" if researcher.user: return self.update_user(researcher.user) - for iden in data["identifiers"]: - if iden["type"].lower() != Identifier.Harvester.EPPN.value: + for iden in identifiers: + if iden.harvester != Identifier.Harvester.EPPN: continue - given_name, family_name = self.get_names(data) - return self.create_user(iden["value"], given_name, family_name) + return self.create_user(iden.value, given_name, family_name) return None def single(self, data: dict) -> Researcher | None: - researcher_identifiers = [] - for iden in data["identifiers"]: - identifier = self.cache.model( - Identifier, value=iden["value"], harvester=iden["type"].lower() - ) - self.cache.save(identifier) - researcher_identifiers.append(identifier) + researcher_identifiers = self.populate_identifiers.multiple(data["identifiers"]) # researcher withtout any identifiers no neeeeeeed to be created if not researcher_identifiers: @@ -85,11 +91,24 @@ def single(self, data: dict) -> Researcher | None: ) given_name, family_name = self.get_names(data) - user = self.check_mapping_user(researcher, data) + user = self.check_mapping_user( + researcher, researcher_identifiers, given_name, family_name + ) self.cache.save( researcher, given_name=given_name, family_name=family_name, user=user ) - self.cache.save_m2m(researcher, identifiers=researcher_identifiers) + + m2m = {"identifiers": researcher_identifiers} + + memberships = data.get("memberships") + if memberships: + m2m["memberships"] = self.populate_structures.multiple(memberships) + + employments = data.get("employments") + if employments: + m2m["employments"] = self.populate_structures.multiple(employments) + + self.cache.save_m2m(researcher, **m2m) return researcher diff --git a/services/crisalid/populates/structure.py 
b/services/crisalid/populates/structure.py new file mode 100644 index 00000000..8681535c --- /dev/null +++ b/services/crisalid/populates/structure.py @@ -0,0 +1,57 @@ +from services.crisalid.models import Structure +from services.crisalid.populates.identifier import PopulateIdentifier + +from .base import AbstractPopulate + + +class PopulateStructure(AbstractPopulate): + """Populate class for structure element + + ex: + { + "acronym": "CES", + "types": [ + "Organisation", + "ResearchStructure" + ], + "names": [ + { + "language": "fr", + "value": "UMR 8174 - CES" + } + ], + "identifiers": [ + { + "type": "RNSR", + "value": "200612823S" + }, + { + "type": "local", + "value": "U02C" + } + ] + } + """ + + def __init__(self, *ar, populate_identifiers=None, **kw): + super().__init__(*ar, **kw) + self.populate_identifiers = populate_identifiers or PopulateIdentifier( + self.config, self.cache + ) + + def single(self, data: dict) -> Structure | None: + acronym = self.sanitize_string(data["acronym"]) + name = self.sanitize_languages(data["names"]) + identifiers = self.populate_identifiers.multiple(data["identifiers"]) + + # no create structure if no identifiers are set + if not identifiers: + return None + + structure = self.cache.from_identifiers(Structure, identifiers) + self.cache.save( + structure, acronym=acronym, name=name, organization=self.config.organization + ) + self.cache.save_m2m(structure, identifiers=identifiers) + + return structure diff --git a/services/crisalid/queries/documents.graphql b/services/crisalid/queries/documents.graphql index 8d587082..e4a936fd 100644 --- a/services/crisalid/queries/documents.graphql +++ b/services/crisalid/queries/documents.graphql @@ -1,6 +1,5 @@ query PopulateFromCrisalid($limit: Int, $offset: Int, $where: DocumentWhere) { documents(limit: $limit, offset: $offset, where: $where) { - uid, publication_date, document_type, @@ -17,7 +16,6 @@ query PopulateFromCrisalid($limit: Int, $offset: Int, $where: DocumentWhere) { 
has_contributions { roles, contributor { - uid display_name, names { first_names { @@ -30,15 +28,37 @@ query PopulateFromCrisalid($limit: Int, $offset: Int, $where: DocumentWhere) { } } identifiers { - type + harvester: type value } + employments { + acronym + names { + language + value + } + identifiers { + harvester: type + value + } + } + memberships { + acronym + names { + language + value + } + identifiers { + harvester: type + value + } + } } } recorded_by { harvester - uid, + value: uid, } } } \ No newline at end of file diff --git a/services/crisalid/queries/organisations.graphql b/services/crisalid/queries/organisations.graphql new file mode 100644 index 00000000..85d2fc54 --- /dev/null +++ b/services/crisalid/queries/organisations.graphql @@ -0,0 +1,14 @@ +# this query for organisations ( structure / labo ) +query PopulateFromCrisalid($limit: Int, $offset: Int, $where: OrganisationWhere) { + organisations(limit: $limit, offset: $offset, where: $where) { + acronym + names { + language + value + } + identifiers { + harvester: type + value + } + } +} diff --git a/services/crisalid/queries/people.graphql b/services/crisalid/queries/people.graphql index 2f17f160..ff304917 100644 --- a/services/crisalid/queries/people.graphql +++ b/services/crisalid/queries/people.graphql @@ -1,6 +1,5 @@ query PopulateFromCrisalid($limit: Int, $offset: Int, $where: PersonWhere) { people(limit: $limit, offset: $offset, where: $where) { - uid display_name names { first_names { @@ -13,8 +12,32 @@ query PopulateFromCrisalid($limit: Int, $offset: Int, $where: PersonWhere) { } } identifiers { - type + harvester: type value } + + employments { + acronym + names { + language + value + } + identifiers { + harvester: type + value + } + } + + memberships { + acronym + names { + language + value + } + identifiers { + harvester: type + value + } + } } } diff --git a/services/crisalid/serializers.py b/services/crisalid/serializers.py index 5fe5ba1b..ff555fea 100644 --- 
a/services/crisalid/serializers.py +++ b/services/crisalid/serializers.py @@ -40,7 +40,12 @@ class ResearcherSerializer(serializers.ModelSerializer): class Meta: model = Researcher - exclude = ("updated",) + fields = ( + "id", + "user", + "identifiers", + "display_name", + ) def get_display_name(self, instance): return str(instance) @@ -53,9 +58,9 @@ class ResearcherDocumentsSerializer(ResearcherSerializer): class Meta: model = Researcher - read_only_fields = ("display_name",) fields = ( "identifiers", + "display_name", "user", "id", ) diff --git a/services/crisalid/tasks.py b/services/crisalid/tasks.py index b3d7226a..8bc2045e 100644 --- a/services/crisalid/tasks.py +++ b/services/crisalid/tasks.py @@ -4,8 +4,15 @@ from services.crisalid.bus.constant import CrisalidEventEnum, CrisalidTypeEnum from services.crisalid.bus.consumer import on_event from services.crisalid.interface import CrisalidService -from services.crisalid.models import CrisalidConfig, Document, Identifier, Researcher +from services.crisalid.models import ( + CrisalidConfig, + Document, + Identifier, + Researcher, + Structure, +) from services.crisalid.populates import PopulateDocument, PopulateResearcher +from services.crisalid.populates.structure import PopulateStructure logger = logging.getLogger(__name__) @@ -16,6 +23,8 @@ def get_crisalid_config(crisalid_config_id: int) -> CrisalidConfig: ) +# TODO(remi): convert fields to graphql request + # https://github.com/CRISalid-esr/crisalid-ikg/blob/dev-main/app/amqp/amqp_person_event_message_factory.py#L28 # https://github.com/CRISalid-esr/crisalid-ikg/blob/dev-main/app/amqp/amqp_document_event_message_factory.py#L37 @@ -27,8 +36,18 @@ def create_researcher(crisalid_config_id: int, fields: dict): config = get_crisalid_config(crisalid_config_id) logger.error("receive %s for organization %s", fields, config.organization) + service = CrisalidService(config) + + # fetch data from apollo + data = service.query("people", offset=0, limit=1, where={"uid_EQ": 
fields["uid"]})[ + "people" + ] + if not data: + logger.warning("no result fetching crisalid_uid=%s", fields["uid"]) + return + populate = PopulateResearcher(config) - populate.single(fields) + populate.single(data[0]) @on_event(CrisalidTypeEnum.PERSON, CrisalidEventEnum.DELETED) @@ -51,6 +70,54 @@ def delete_researcher(crisalid_config_id: int, fields: dict): logger.info("deleted = %s", deleted) +# ---- +# Structures task (organisations/labs ....) +# ---- +@on_event(CrisalidTypeEnum.STRUCTURE, CrisalidEventEnum.CREATED) +@on_event(CrisalidTypeEnum.STRUCTURE, CrisalidEventEnum.UPDATED) +@app.task(name=f"{__name__}.create_structure") +def create_structure(crisalid_config_id: int, fields: dict): + config = get_crisalid_config(crisalid_config_id) + logger.error("receive %s for organization %s", fields, config.organization) + + service = CrisalidService(config) + + # fetch data from apollo + data = service.query( + "organisations", offset=0, limit=1, where={"uid_EQ": fields["uid"]} + )["organisations"] + if not data: + logger.warning("no result fetching crisalid_uid=%s", fields["uid"]) + return + + populate = PopulateStructure(config) + populate.single(data[0]) + + +@on_event(CrisalidTypeEnum.STRUCTURE, CrisalidEventEnum.DELETED) +@app.task(name=f"{__name__}.delete_structure") +def delete_structure(crisalid_config_id: int, fields: dict): + config = get_crisalid_config(crisalid_config_id) + logger.error("receive %s for organization %s", fields, config.organization) + + identifiers = [ + {"harvester": iden["type"].lower(), "value": iden["value"]} + for iden in fields["identifiers"] + if iden["type"].lower() + not in (Identifier.Harvester.LOCAL, Identifier.Harvester.EPPN) + ] + + qs = Structure.objects.from_identifiers(identifiers, distinct=False).filter( + organization=config.organization + ) + deleted, _ = qs.delete() + + logger.info("deleted = %s", deleted) + + +# ---- +# Documents task (publications/conference ....) 
+# ---- @on_event(CrisalidTypeEnum.DOCUMENT, CrisalidEventEnum.CREATED) @on_event(CrisalidTypeEnum.DOCUMENT, CrisalidEventEnum.UPDATED) @app.task(name=f"{__name__}.create_document") @@ -88,6 +155,9 @@ def delete_document(crisalid_config_id: int, fields: dict): logger.info("deleted = %s", deleted) +# ---- +# Vectorize documents for similarity +# ---- @app.task(name="Vectorize documents") def vectorize_documents(documents_pks: list[int]): for obj in Document.objects.filter(pk__in=documents_pks): diff --git a/services/crisalid/tests/test_populate.py b/services/crisalid/tests/test_populate.py index 408b3853..bcbc7ba8 100644 --- a/services/crisalid/tests/test_populate.py +++ b/services/crisalid/tests/test_populate.py @@ -5,8 +5,9 @@ from apps.accounts.factories import UserFactory from apps.accounts.models import PrivacySettings, ProjectUser from services.crisalid.factories import CrisalidConfigFactory -from services.crisalid.models import Document, Identifier, Researcher +from services.crisalid.models import Document, Identifier, Researcher, Structure from services.crisalid.populates import PopulateDocument, PopulateResearcher +from services.crisalid.populates.structure import PopulateStructure class TestPopulateResearcher(test.TestCase): @@ -226,7 +227,10 @@ def test_create_publication(self): } ], "identifiers": [ - {"type": "eppn", "value": "marty.mcfly@non-de-zeus.fr"}, + { + "type": "eppn", + "value": "marty.mcfly@non-de-zeus.fr", + }, {"type": "idref", "value": "4545454545454"}, {"type": "local", "value": "v55555"}, ], @@ -236,9 +240,8 @@ def test_create_publication(self): ], "recorded_by": [ { - "uid": "hals-truc", - "harvester": Identifier.Harvester.HAL.value, - "value": "", + "type": Identifier.Harvester.HAL.value, + "value": "hals-truc", } ], } @@ -359,3 +362,46 @@ def test_sanitize_document_type(self): ), Document.DocumentType.AUDIOVISUAL_DOCUMENT.value, ) + + +class TestPopulateStructure(test.TestCase): + @classmethod + def setUpClass(cls): + 
super().setUpClass() + cls.config = CrisalidConfigFactory() + cls.popu = PopulateStructure(cls.config) + + def test_create_structure(self): + data = { + "acronym": "LabEx CAP", + "names": [{"language": "fr", "value": "CAP"}], + "identifiers": [{"type": "local", "value": "DGI01"}], + } + + new_obj = self.popu.single(data) + + # check obj from db + obj = Structure.objects.first() + self.assertEqual(obj, new_obj) + + self.assertEqual(obj.acronym, "LabEx CAP") + self.assertEqual(obj.name, "CAP") + self.assertEqual(obj.organization, self.config.organization) + self.assertEqual(obj.identifiers.count(), 1) + iden = obj.identifiers.first() + self.assertEqual(iden.value, "DGI01") + self.assertEqual(iden.harvester, "local") + + def test_create_structure_whitout_identifiers(self): + data = { + "acronym": "LabEx CAP", + "names": [{"language": "fr", "value": "CAP"}], + "identifiers": [], + } + + new_obj = self.popu.single(data) + + # check obj from db + obj = Structure.objects.first() + self.assertIsNone(obj) + self.assertIsNone(new_obj) diff --git a/services/crisalid/tests/test_tasks.py b/services/crisalid/tests/test_tasks.py index a0faf6e3..c30dbb2b 100644 --- a/services/crisalid/tests/test_tasks.py +++ b/services/crisalid/tests/test_tasks.py @@ -95,10 +95,11 @@ def test_delete_research(self): self.assertTrue(Researcher.objects.filter(pk=researcher.pk).exists()) - def test_create_researcher(self): + @patch("services.crisalid.interface.Client") + def test_create_researcher(self, client_gql): # other check/tests in test_views.py - fields = { - "uid": "05-11-1995-uuid", + fields = {"uid": "05-11-1995-uuid"} + data = { "names": [ { "first_names": [{"value": "marty", "language": "fr"}], @@ -110,6 +111,8 @@ def test_create_researcher(self): ], } + client_gql().execute.return_value = {"people": [data]} + create_researcher(self.config.pk, fields) # check obj from db @@ -171,9 +174,8 @@ def test_create_document(self, client_gql): ], "recorded_by": [ { - "uid": "hals-truc", - 
"harvester": Identifier.Harvester.HAL.value, - "value": "", + "type": Identifier.Harvester.HAL.value, + "value": "hals-truc", } ], } diff --git a/services/crisalid/urls.py b/services/crisalid/urls.py index 8a2b612e..de89f87d 100644 --- a/services/crisalid/urls.py +++ b/services/crisalid/urls.py @@ -7,6 +7,7 @@ ) from services.crisalid.views import ( ConferenceViewSet, + DocumentViewSet, PublicationViewSet, ResearcherViewSet, ) @@ -17,6 +18,13 @@ researcher_router, r"researcher", ResearcherViewSet, basename="Researcher" ) +organization_router_register( + researcher_router, + r"document", + DocumentViewSet, + basename="CrisalidDocument", +) + organization_researcher_router_register( researcher_router, r"publications", diff --git a/services/crisalid/views.py b/services/crisalid/views.py index 95debbed..a425a25a 100644 --- a/services/crisalid/views.py +++ b/services/crisalid/views.py @@ -82,14 +82,25 @@ ), ) class AbstractDocumentViewSet( - NestedOrganizationViewMixins, - NestedResearcherViewMixins, viewsets.ReadOnlyModelViewSet, ): """Abstract class to get documents info from documents types""" serializer_class = DocumentSerializer + def filter_roles(self, queryset, roles_enabled=True): + # filter only by roles (author, co-authors ...ect) + roles = [ + r.strip() + for r in self.request.query_params.get("roles", "").split(",") + if r.strip() + ] + if roles and roles_enabled: + queryset = queryset.filter( + documentcontributor__roles__contains=roles, + ) + return queryset + def filter_queryset( self, queryset, @@ -102,17 +113,7 @@ def filter_queryset( if year and year_enabled: qs = qs.filter(publication_date__year=year) - # filter only by roles (author, co-authors ...ect) - roles = [ - r.strip() - for r in self.request.query_params.get("roles", "").split(",") - if r.strip() - ] - if roles and roles_enabled: - qs = qs.filter( - documentcontributor__roles__contains=roles, - documentcontributor__researcher=self.researcher, - ) + qs = self.filter_roles(qs, roles_enabled) # 
filter by pblication_type if "document_type" in self.request.query_params and document_type_enabled: @@ -123,7 +124,6 @@ def filter_queryset( def get_queryset(self) -> QuerySet[Document]: return ( Document.objects.filter( - contributors=self.researcher, document_type__in=self.document_types, ) .prefetch_related("identifiers", "contributors__user") @@ -146,22 +146,17 @@ def similars(self, request, *args, **kwargs): ) return self.get_paginated_response(data.data) - @action( - detail=False, - methods=[HTTPMethod.GET], - url_path="analytics", - serializer_class=DocumentAnalyticsSerializer, - ) - def analytics(self, request, *args, **kwargs): - """methods to return analytics (how many documents/by year / by document_type) from researcher""" - + def get_analytics(self): qs = self.get_queryset() # get counted all document_types types # use only here the filter_queryset, # the next years values need to have all document_types (non filtered) + document_types = Counter( - self.filter_queryset(qs, document_type_enabled=False) + Document.objects.filter( + id__in=self.filter_queryset(qs, document_type_enabled=False) + ) .order_by("document_type") .values_list("document_type", flat=True) ) @@ -184,11 +179,23 @@ def analytics(self, request, *args, **kwargs): chain( *DocumentContributor.objects.filter( document__in=self.filter_queryset(qs, roles_enabled=False), - researcher=self.researcher, ).values_list("roles", flat=True) ) ) + return document_types, years, roles + + @action( + detail=False, + methods=[HTTPMethod.GET], + url_path="analytics", + serializer_class=DocumentAnalyticsSerializer, + ) + def analytics(self, request, *args, **kwargs): + """methods to return analytics (how many documents/by year / by document_type) from researcher""" + + document_types, years, roles = self.get_analytics() + return JsonResponse( self.serializer_class( { @@ -200,11 +207,58 @@ def analytics(self, request, *args, **kwargs): ) -class PublicationViewSet(AbstractDocumentViewSet): +class 
class DocumentViewSet(NestedOrganizationViewMixins, AbstractDocumentViewSet):
    """General viewset over documents, not scoped to a single researcher."""

    def get_queryset(self) -> QuerySet[Document]:
        """Return every document, most recent publication date first.

        Identifiers and the contributors' users are prefetched because the
        queryset is built with ``prefetch_related`` on both relations.
        """
        return (
            Document.objects.all()
            .prefetch_related("identifiers", "contributors__user")
            .order_by("-publication_date")
        )


class AbstractResearcherDocumentViewSet(
    NestedOrganizationViewMixins, NestedResearcherViewMixins, AbstractDocumentViewSet
):
    """Document viewset restricted to the documents of ``self.researcher``."""

    def filter_roles(self, queryset, roles_enabled=True):
        """Filter ``queryset`` by the ``roles`` query parameter.

        ``roles`` is read from the request as a comma-separated list; blank
        entries are dropped. When at least one role is given and
        ``roles_enabled`` is true, only documents having a contributor row
        that carries those roles AND belongs to ``self.researcher`` are kept.
        Otherwise the queryset is returned untouched.
        """
        # filter only by roles (author, co-authors, etc.)
        roles = [
            r.strip()
            for r in self.request.query_params.get("roles", "").split(",")
            if r.strip()
        ]
        if roles and roles_enabled:
            # Both conditions are passed in a single filter() call so they are
            # applied together on the contributor relation.
            # The relation field is ``researcher`` (same name as the
            # ``researcher=self.researcher`` filter used in get_analytics);
            # the previous ``documentcontributor__research`` lookup did not
            # match it.
            queryset = queryset.filter(
                documentcontributor__roles__contains=roles,
                documentcontributor__researcher=self.researcher,
            )
        return queryset

    def get_analytics(self):
        """Return ``(document_types, years, roles)`` for ``self.researcher``.

        ``document_types`` and ``years`` come from the parent implementation;
        the parent's roles value is discarded and recomputed here from the
        contributor rows of ``self.researcher`` only.
        """
        document_types, years, _ = super().get_analytics()
        qs = self.get_queryset()
        # roles filter is disabled so every role of the researcher is counted
        roles = Counter(
            chain(
                *DocumentContributor.objects.filter(
                    document__in=self.filter_queryset(qs, roles_enabled=False),
                    researcher=self.researcher,
                ).values_list("roles", flat=True)
            )
        )

        return (document_types, years, roles)

    def get_queryset(self) -> QuerySet[Document]:
        """Return the parent queryset narrowed to ``self.researcher``'s documents."""
        return super().get_queryset().filter(contributors=self.researcher)


class PublicationViewSet(AbstractResearcherDocumentViewSet):
    """Researcher documents whose types are listed as publications."""

    document_types = DocumentTypeCentralized.publications


class ConferenceViewSet(AbstractResearcherDocumentViewSet):
    """Researcher documents whose types are listed as conferences."""

    document_types = DocumentTypeCentralized.conferences