Skip to content

Commit

Permalink
Merge pull request #755 from simphony/dev
Browse files Browse the repository at this point in the history
Merge release 3.7.0.
  • Loading branch information
kysrpex committed Feb 10, 2022
2 parents cbfa060 + 635eb42 commit 638fc6a
Show file tree
Hide file tree
Showing 12 changed files with 496 additions and 75 deletions.
8 changes: 7 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,13 @@ www.simphony-project.eu).

## Installation

See [https://simphony.readthedocs.io/en/latest/installation.html](https://simphony.readthedocs.io/en/latest/installation.html).
OSP-core is available on PyPI, so it can be installed using `pip`

```shell
pip install osp-core
```

For more detailed instructions, see [https://simphony.readthedocs.io/en/latest/installation.html](https://simphony.readthedocs.io/en/latest/installation.html).

## Visualization of ontologies

Expand Down
2 changes: 1 addition & 1 deletion osp/core/ontology/namespace.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,7 +361,7 @@ def _get_labels_for_iri(self, iri, lang=None, _return_literal=False,
Args:
iri (rdflib.URIRef): the target iri.
lang (str): retrieve labels only on a speific language.
lang (str): retrieve labels only on a specific language.
_return_literal: return rdflib.Literal instead of str, so that the
language of the labels is known to the caller.
Expand Down
2 changes: 1 addition & 1 deletion osp/core/ontology/namespace_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ def _get_entity_name(self, entity_iri, ns_iri):
_return_literal=True,
_return_label_property=True))
if not labels:
logger.warning(f"No label for {entity_iri}")
logger.debug(f"No label for {entity_iri}")
else:
labels = sorted(labels,
key=lambda x:
Expand Down
16 changes: 9 additions & 7 deletions osp/core/ontology/ontology.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ def _overlay_add_cuba_triples(parser: Union[OntologyParser, 'Ontology'],
logger.warning(f"Specified relationship {iri} as "
f"active relationship, which is not "
f"a valid object property in the ontology."
f"If such relationship belongs to another"
f"If such relationship belongs to another "
f"ontology, and such ontology is installed, "
f"then you may safely ignore this warning.")
# This requirement is checked later on in
Expand Down Expand Up @@ -233,12 +233,13 @@ def labels_for_iri(iri):
lang=None,
_return_literal=True)

# Finally check for the duplicate labels.
# Finally, check for the duplicate labels.
subjects = set(subject for subject in graph.subjects()
if in_namespace(subject))
results = sorted(((label.toPython(), label.language), iri)
for iri in subjects for label
in labels_for_iri(iri))
results = set(((label.toPython(), label.language or ''), iri)
for iri in subjects for label
in labels_for_iri(iri))
results = sorted(results)
labels, iris = tuple(result[0] for result in results), \
tuple(result[1] for result in results)
coincidence_search = tuple(i
Expand All @@ -248,12 +249,13 @@ def labels_for_iri(iri):
for i in coincidence_search:
conflicting_labels[labels[i]] |= {iris[i - 1], iris[i]}
if len(conflicting_labels) > 0:
texts = (f'{label[0]}, language {label[1]}: '
texts = (f'{label[0]}, language '
f'{label[1] if label[1] != "" else None}: '
f'{", ".join(tuple(str(iri) for iri in iris))}'
for label, iris in conflicting_labels.items())
raise KeyError(f'The following labels are assigned to more than '
f'one entity in namespace {namespace}; '
f'{"; ".join(texts)}.')
f'{"; ".join(texts)} .')


def _check_namespaces(namespace_iris: Iterable[URIRef],
Expand Down
138 changes: 115 additions & 23 deletions osp/core/session/db/db_wrapper_session.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,19 @@
"""An abstract session containing method useful for all database backends."""

from abc import abstractmethod
from typing import Union
import itertools
import logging
import rdflib
import uuid
from osp.core.utils.general import uid_from_iri, CUDS_IRI_PREFIX
from abc import abstractmethod
from typing import Union

import rdflib

import osp.core.warnings as warning_settings
from osp.core.ontology.namespace_registry import namespace_registry
from osp.core.session.wrapper_session import consumes_buffers, WrapperSession
from osp.core.session.result import returns_query_result
from osp.core.session.buffers import BufferContext, EngineContext
from osp.core.utils.general import uid_from_iri, CUDS_IRI_PREFIX

logger = logging.getLogger(__name__)

Expand All @@ -27,11 +30,12 @@ def commit(self):
try:
root_obj = self._registry.get(self.root)
added, updated, deleted = self._buffers[BufferContext.USER]
if warning_settings.unreachable_cuds_objects:
self._unreachable_warning(root_obj)
self._apply_added(root_obj, added)
self._apply_updated(root_obj, updated)
self._apply_deleted(root_obj, deleted)
self._reset_buffers(BufferContext.USER)
self._unreachable_warning(root_obj)
self._commit()
except Exception as e:
self._rollback_transaction()
Expand Down Expand Up @@ -176,24 +180,112 @@ def _unreachable_warning(self, root_obj: Union[rdflib.URIRef, uuid.UUID]):
raises a warning that lists some of the unreachable CUDS objects.
Args:
root_obj (Union[URIRef, UUID]): The root object with respect to
which objects are deemed reachable or unreachable.
root_obj: The root object with respect to which objects are
deemed reachable or unreachable.
"""
unreachable = self._registry._get_not_reachable(root_obj, rel=None)
large_dataset_warning = LargeDatasetWarning()
unreachable, reachable = self._registry._get_not_reachable(
root_obj, rel=None, return_reachable=True,
warning=large_dataset_warning
)

# Warn about unreachable CUDS
max_cuds_on_warning = 5
if len(unreachable) > 0:
warning = "Some CUDS objects are unreachable " \
"from the wrapper object: " \
"{cuds}{more}." \
"\n" \
"If you want to be able to retrieve those CUDS " \
"objects later, either add them to the wrapper object " \
"or to any other CUDS that is reachable from it." \
.format(cuds=', '.join(str(x) for x in itertools
.islice(unreachable,
max_cuds_on_warning)),
more=" and " + str(len(unreachable)
- max_cuds_on_warning)
+ " more" if len(unreachable) > 5
else "")
logger.warning(warning)
unreachable_cuds_warning = (
"Some CUDS objects are unreachable from the wrapper object: "
"{cuds}{more}. \n"
"If you want to be able to retrieve those CUDS objects later, "
"either add them to the wrapper object or to any other CUDS "
"that is reachable from it."
).format(
cuds=', '.join(str(x) for x in itertools
.islice(unreachable, max_cuds_on_warning)),
more=" and " + str(len(unreachable) - max_cuds_on_warning)
+ " more" if len(unreachable) > 5 else "")
# A filter is applied to the logger that attaches the warning
# type to the log records.
logger_filter = UnreachableCUDSWarningFilter()
logger.addFilter(logger_filter)
logger.warning(unreachable_cuds_warning)
logger.removeFilter(logger_filter)

# Inform the large dataset warning that the unreachable CUDS
# warning was raised (so that it changes its text).
large_dataset_warning.unreachable_cuds_warning = True

# Warn about large datasets and recommend disabling the unreachable
# CUDS warning for large datasets.
if len(reachable) + len(unreachable) >= \
warning_settings.unreachable_cuds_objects_large_dataset_size:
# Recommend disabling the warning for large datasets.
large_dataset_warning.warn()


class UnreachableCUDSWarning(UserWarning):
    """Warning category for CUDS objects that cannot be reached.

    Emitted by `DbWrapperSession._unreachable_warning` while a commit
    operation is being performed.
    """


class UnreachableCUDSWarningFilter(logging.Filter):
    """Logging filter that tags records with `UnreachableCUDSWarning`."""

    def filter(self, record):
        """Annotate `record` with the warning class and let it through."""
        record.warning_class = UnreachableCUDSWarning
        return True


class LargeDatasetWarning(UserWarning):
    """Warning category for operations on a large dataset.

    Used by `DbWrapperSession._unreachable_warning` during the commit
    operation.
    """

    # True once `warn` has emitted its message (it fires only once per
    # instance).
    warned: bool = False
    # Set to True when the unreachable-CUDS warning was already shown, so
    # the message refers back to it instead of to "a warning".
    unreachable_cuds_warning: bool = False

    def warn(self) -> None:
        """Emit the large-dataset warning through the module logger.

        The message is logged at most once per instance; create a fresh
        `LargeDatasetWarning` if you need to emit it again.
        """
        if self.warned:
            return

        # Advise disabling the `UnreachableCUDSWarning` for large
        # datasets, since computing reachability is expensive.
        template = (
            "You are working with a large dataset. When committing "
            "changes, OSP-core looks for objects that are unreachable "
            "from the wrapper object to generate {reference_to_warning}. "
            "Generating such warning is very expensive in computational "
            "terms when small changes are applied to existing, "
            "large datasets. You will notice that the changes may take a "
            "lot of time to be committed. Please turn off such warning "
            "when working with large datasets. You can turn off the "
            "warning by running `import osp.core.warnings as "
            "warning_settings; "
            "warning_settings.unreachable_cuds_objects = False`.")
        if self.unreachable_cuds_warning:
            reference = "the previous warning"
        else:
            reference = "a warning"
        message = template.format(reference_to_warning=reference)
        # Temporarily attach a filter so the emitted log record carries
        # the warning class.
        record_filter = LargeDatasetWarningFilter()
        logger.addFilter(record_filter)
        logger.warning(message)
        logger.removeFilter(record_filter)
        self.warned = True


class LargeDatasetWarningFilter(logging.Filter):
    """Logging filter that tags records with `LargeDatasetWarning`."""

    def filter(self, record):
        """Annotate `record` with the warning class and let it through."""
        record.warning_class = LargeDatasetWarning
        return True
76 changes: 63 additions & 13 deletions osp/core/session/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from rdflib import URIRef
import logging

import osp.core.warnings as warning_settings

logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -58,7 +60,8 @@ def get(self, uid):
message = '{!r} is not a proper uid'
raise ValueError(message.format(uid))

def get_subtree(self, root, rel=None, skip=None):
def get_subtree(self, root, subtree=None, rel=None, skip=None,
warning=None):
"""Get all the elements in the subtree rooted at given root.
Only use the given relationship for traversal.
Expand All @@ -67,23 +70,59 @@ def get_subtree(self, root, rel=None, skip=None):
root (Union[UUID, URIRef, Cuds]): The root of the subtree.
rel (Relationship, optional): The relationship used for traversal.
Defaults to None. Defaults to None.
subtree (Set[Cuds]): Currently calculated subtree (this is a
recursive algorithm).
skip (Set[Cuds], optional): The elements to skip. Defaults to None.
Defaults to None.
warning (LargeDatasetWarning, optional): Raise a
`LargeDatasetWarning` when the subtree is large. When `None`,
no warning is raised. If you wish to raise the warning, a
`LargeDatasetWarning` object must be provided.
Returns:
Set[Cuds]: The set of elements in the subtree rooted in the given
uid.
"""
from osp.core.cuds import Cuds
skip = skip or set()
if not isinstance(root, Cuds):
if isinstance(root, (UUID, URIRef)):
root = super().__getitem__(root)
assert root.uid in self
subtree = {root}
for child in root.iter(rel=rel):
if child not in (skip | subtree):
subtree |= self.get_subtree(child.uid, rel,
skip=(skip | subtree))
skip = skip or set() | {root}
skip |= {root}
subtree = subtree or {root}

subclasses = set() if rel is None else rel.subclasses
subclass_check = (lambda r: True) \
if not subclasses else (lambda r: r in subclasses)
"""Checks whether relationship `x` should be considered.
- When no `rel` is provided, `subclass_check` should always return
True, as all relationships should be considered.
- When `rel` is provided, it should return true only if the
relationship `x` is a subclass of the provided relationship (`rel`).
"""

# Load neighbors connected through the relationship
filtered_neighbors = (
neighbor
for r, dict_target in root._neighbors.items()
if subclass_check(r)
for neighbor in dict_target
)
filtered_neighbors = set(root.session.load(*filtered_neighbors))

subtree |= filtered_neighbors

# Optional: raise a `LargeDatasetWarning` if the subtree is too large.
if warning is not None and len(subtree) \
> warning_settings \
.unreachable_cuds_objects_large_dataset_size:
warning.warn()
warning = None

for neighbor in filter(lambda x: x not in skip, filtered_neighbors):
self.get_subtree(neighbor, subtree=subtree, rel=rel, skip=skip,
warning=warning)
return subtree

def prune(self, *roots, rel=None):
Expand All @@ -103,7 +142,11 @@ def prune(self, *roots, rel=None):
super().__delitem__(x.uid)
return not_reachable

def _get_not_reachable(self, *roots, rel=None):
def _get_not_reachable(self,
*roots,
rel=None,
return_reachable=False,
warning=None):
"""Get all elements in the registry that are not reachable.
Use the given rel for traversal.
Expand All @@ -113,22 +156,29 @@ def _get_not_reachable(self, *roots, rel=None):
from these root elements.
rel (Relationship, optional): Only use this relationship for
traversal. Defaults to None.
return_reachable (bool): Returns also the uids of the reachable
cuds.
Returns:
List[Cuds]: The set of non reachable elements.
Union[List[Cuds],
Tuple[List[Cuds], Set[Union[UUID, URIRef]]]]: Either a
list of the unreachable CUDS when `return_reachable` is False
or a tuple whose first element is such list, and second
element a set with the uids of the reachable cuds.
"""
# Get all reachable Cuds objects
reachable = set()
for root in roots:
reachable |= self.get_subtree(root, rel=rel, skip=reachable)
reachable |= self.get_subtree(
root, rel=rel, skip=reachable, warning=warning)
reachable_uids = set([r.uid for r in reachable])

# Get all the Cuds objects that are not reachable
delete = list()
for uid in self.keys():
if uid not in reachable_uids:
delete.append(super().__getitem__(uid))
return delete
return delete if not return_reachable else (delete, reachable_uids)

def reset(self):
"""Delete the contents of the registry."""
Expand Down
6 changes: 6 additions & 0 deletions osp/core/session/transport/communication_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,12 @@ def __init__(self, uri, handle_response, **kwargs):
"""
self.uri = uri
self.kwargs = kwargs
# The default `ping_timeout` is 20s. The pings are not sent during a
# transfer. Thus, if the transfer takes more than 20s, then the
# default value causes the websockets connection to close
# unexpectedly. Hence, we chose to never close the connection due to
# ping timeouts unless the user wishes to do so.
self.kwargs['ping_timeout'] = self.kwargs.get('ping_timeout', None)
self.handle_response = handle_response
self.websocket = None

Expand Down

1 comment on commit 638fc6a

@kysrpex
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Performance Alert ⚠️

Possible performance regression was detected for benchmark.
Benchmark result of this commit is worse than the previous benchmark result exceeding threshold 1.50.

Benchmark suite Current: 638fc6a Previous: 635eb42 Ratio
benchmark_cuds_api.py::benchmark_cuds_create 385.5192011685611 iter/sec (stddev: 0.006598974796613531) 607.6365250140163 iter/sec (stddev: 0.00425757155642059) 1.58
benchmark_cuds_api.py::benchmark_add_default 1102.8966316109766 iter/sec (stddev: 0.00018802817329449106) 1756.456883044433 iter/sec (stddev: 0.00008497178955291957) 1.59
benchmark_cuds_api.py::benchmark_get_byuiduriref 288.3646163966868 iter/sec (stddev: 0.0009759221792346276) 465.09750427330164 iter/sec (stddev: 0.00025552976934833217) 1.61
benchmark_cuds_api.py::benchmark_cuds_is_a 6804.397191986568 iter/sec (stddev: 0.00010118231278397455) 11552.723731376334 iter/sec (stddev: 0.00005475464459677695) 1.70
benchmark_cuds_api.py::benchmark_cuds_oclass 8653.037094885041 iter/sec (stddev: 0.000021429817896318477) 15647.727346027857 iter/sec (stddev: 0.000015393623801881135) 1.81
benchmark_cuds_api.py::benchmark_cuds_uid 13328.58196046166 iter/sec (stddev: 0.000019742996625737058) 22654.018897727678 iter/sec (stddev: 0.000017784265023640967) 1.70
benchmark_cuds_api.py::benchmark_cuds_iri 12089.186181820303 iter/sec (stddev: 0.000019560577453230282) 18885.624277889903 iter/sec (stddev: 0.00002070254024985459) 1.56

This comment was automatically generated by workflow using github-action-benchmark.

CC: @yoavnash @pablo-de-andres @kysrpex

Please sign in to comment.