Dynamically set value for schema based on document values [Proposed label: feature_request] #615

Open
charlverster opened this issue Feb 27, 2024 · 0 comments

charlverster commented Feb 27, 2024

I'm using Cerberus to validate configs for a framework that can run a multitude of tasks with many dependent configurations. The configs are such that the schema changes heavily depending on what is specified. In an attempt to make this manageable, I wrote schemas for the individual parts and store them in the schema_registry. When the config is validated, the name of the required validation is built from field values specified in the config, the corresponding schema is retrieved from the registry, and then applied.

My code looks like this:

```python
import os
from cerberus.validator import Validator, schema_registry, rules_set_registry
import importlib
import importlib.util
import pkgutil
import re


class ConfigValidator(Validator):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def _validate_custom_validator_from_field_value(
        self, constraint: list[str | dict] | dict | str, field: str, value: dict
    ) -> None:
        """
        Validate a field by applying a custom schema validation from the schema registry, named 'constraint'.
        'constraint' can be compiled from values of fields within the config being validated.

        Example 1:

        config = {
            "parameters": {
                "env": "dev",
                "class": "S3ToGlue",
                "size": "large"
            }
        }

        schema = {
            "parameters": {
                "custom_validator_from_field_value": "SomeCustomSchemaName"
            }
        }

        Example 2:

        schema = {
            "parameters": {
                "custom_validator_from_field_value": {"lookup": "parameters.class"}  <-- uses the custom lookup feature to get the value of 'class'
            }
        }

        Example 3:

        schema = {
            "parameters": {
                "custom_validator_from_field_value": [{"lookup": "parameters.class"}, "CustomValue", {"lookup": "parameters.env"}]  <-- a list of values, which gets concatenated and converted to CamelCase as S3ToGlueCustomValueDev
            }
        }

        In this example, the dictionary 'parameters' will be evaluated according to the schema named 'S3ToGlue' in the schema_registry.

        Args:
            constraint (list | dict | str): The constraint the validation name is built from.
            field (str): The name of the field.
            value (dict): The value of the field.

        #? The part of the docstring below is a Cerberus requirement.
        #? Without it you will get a 'UserWarning: No validation schema is defined for the arguments of rule' warning.

        The rule's arguments are validated against this schema:
        {'type': ['list', 'dict', 'string']}
        """
        custom_validation_name = self._find_validation_name(
            constraint=constraint, field=field
        )

        if not custom_validation_name:
            self._error(field, f"Field {constraint} cannot be empty.")
            return
        custom_validation_schema = schema_registry.get(custom_validation_name)
        if not custom_validation_schema:
            self._error(
                field,
                f"No custom validation schema named {custom_validation_name}",
            )  # ? Potential TODO: Print the values that are valid.
        else:
            custom_validator = ConfigValidator(schema=custom_validation_schema)
            validation = custom_validator.validated(document=value)
            if validation is None:
                self._error(field, f"{field} not valid: {custom_validator.errors}")
    def _find_validation_name(self, constraint, field) -> str:
        """
        Recursive function to find the name of the custom validation to apply from the constraint values supplied.

        Args:
            constraint (list | dict | str): Constraint to evaluate.
            field (str): Field to which the constraint is applied.
        Returns:
            custom_validation_name (str): The name of the custom validation to apply.
        """

        if isinstance(constraint, list):
            names_list = []
            for item in constraint:
                validation_name = self._find_validation_name(item, field)
                names_list.append(validation_name)
            custom_validation_name = "_".join(names_list)

        elif isinstance(constraint, dict) and "lookup" in constraint:
            constraint_value = constraint.get("lookup")
            if constraint_value.startswith("^"):
                constraint_value = constraint_value.replace("^", f"{field}.")
            validation_name = self._lookup_field(constraint_value)[1]
            custom_validation_name = validation_name

        elif isinstance(constraint, str):
            custom_validation_name = constraint
        else:
            self._error(field, f"{constraint} in {field} is not valid.")
            return ""

        # Normalise the collected name to CamelCase, e.g. 's3_to_glue' -> 'S3ToGlue'.
        parts = re.split(r"(?<!^)(?=[A-Z])", custom_validation_name)
        custom_validation_name = (
            "_".join([part.lower() for part in parts]).title().replace("_", "")
        )
        return custom_validation_name

    @classmethod
    def _populate_schema_registry(cls, schema_package) -> None:
        """
        Recursively searches and imports all validation schemas in the specified schema package, and adds them to the schema registry
        with the convention 'FileName': schema.

        e.g. schema module 's3_to_glue.py' will be added to the schema registry as 'S3ToGlue'.

        Args:
            schema_package (module): The schema package to walk. Import the package first and pass the module object itself.
        """
        for loader, name, is_pkg in pkgutil.walk_packages(schema_package.__path__):
            full_name = schema_package.__name__ + "." + name
            spec = importlib.util.find_spec(full_name)
            if spec is not None and spec.submodule_search_locations is None:
                schema_module = importlib.import_module(full_name)
                schema_file_name = os.path.basename(
                    getattr(schema_module, "__file__")  # s3_to_glue.py
                )
                schema_name = (
                    os.path.splitext(schema_file_name)[0]
                    .title()
                    .replace("_", "")  # S3ToGlue
                )
                schema_registry.add(schema_name, schema_module.schema)

            elif is_pkg:
                package = importlib.import_module(full_name)
                cls._populate_schema_registry(package)
    @classmethod
    def _populate_rules_registry(cls, rules_package) -> None:
        r"""
        Recursively searches and imports all validation rules in the specified rules package, and adds them to the rules set registry
        with the convention rule_name: rule.

        Rules are stored in dictionaries with the variable name 'rules'. Multiple rules can be contained in one dictionary, where each key:value pair
        represents a rule, i.e.

        rules = {
            "s3_path_rule": {
                "type": "string",
                "coerce": lambda v: v if v.startswith("s3://") else f"s3://{v}",
            },
            "file_name_rule": {
                "type": "string",
                "regex": r"^[^\s!@#$%^&*()_+={}\[\]:;\"'<>,.?/|\\]+$",
            },
        }

        Args:
            rules_package (module): The rules package to walk. Import the package first and pass the module object itself.
        """

        for loader, name, is_pkg in pkgutil.walk_packages(rules_package.__path__):
            full_name = rules_package.__name__ + "." + name
            spec = importlib.util.find_spec(full_name)
            if spec is not None and spec.submodule_search_locations is None:
                rules_module = importlib.import_module(full_name)
                rules_list = []
                for rule_name, rule in rules_module.rules.items():
                    rules_list.append((rule_name, rule))

                rules_set_registry.extend(rules_list)

            elif is_pkg:
                package = importlib.import_module(full_name)
                cls._populate_rules_registry(package)

    @classmethod
    def populate_registries(cls, schema_package, rules_package) -> None:
        """
        Wrapper method to populate both the schema and rules registries.

        Args:
            schema_package (module): The schema package to import.
            rules_package (module): The rules package to import.
        """
        cls._populate_schema_registry(schema_package)
        cls._populate_rules_registry(rules_package)
```

I do the validation by creating another instance of ConfigValidator and then validating the subdocuments individually. Reading through the documentation, I see I should have used _get_child_validator instead; however, the problem is that normalization in the subdocuments does not work, and I don't think that would solve the problem.
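For reference, this is roughly what the child-validator variant would look like inside the rule (just a sketch, assuming Cerberus 1.3.x, where `_get_child_validator` accepts `document_crumb`/`schema_crumb` plus keyword arguments that override the parent's configuration):

```python
# Sketch: replace the fresh ConfigValidator(...) in
# _validate_custom_validator_from_field_value with a child validator,
# so error paths carry proper document/schema crumbs.
child = self._get_child_validator(
    document_crumb=field,
    schema_crumb=(field, "custom_validator_from_field_value"),
    schema=custom_validation_schema,
)
if not child(value, normalize=True):
    self._error(child._errors)
```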

What I've realized is that the core functionality I am looking for is the ability to set the value of the field 'schema' in the schema dict based on values from the document being validated. I have tried using coerce to set the schema value, but that does not work:

```python
schema = {
    "foo": {
        "type": "dict",
        "schema": {"coerce": lambda v: {"baz": {"type": "string"}}}
    }
}
config = {
    "foo": {
        "baz": "qux"
    }
}

v = Validator(schema)

print(v.validate(config))
print(v.errors)
```

Results:

```
False
{'foo': ['must be of dict type']}
```

I expected the above to be equivalent to this:

```python
schema = {
    "foo": {
        "type": "dict",
        "schema": {"baz": {"type": "string"}}
    }
}
config = {
    "foo": {
        "baz": "qux"
    }
}

v = Validator(schema)

print(v.validate(config))
print(v.errors)
```

In short, it would be great if values in the schema could be defined dynamically from values in the document being validated, for example using something like {'lookup': 'path.to.field.in.document'}, with the same lookup capabilities as used when setting dependencies, e.g.:

```python
schema_registry.add("qux", {
    "quux": {"type": "string"},
    "grault": {"type": "string"}
})

schema = {
    "foo": {
        "type": "dict",
        "schema": {
            "bar": {"type": "string"},
            "baz": {
                "schema": {"lookup": "^foo.bar"}
            }
        }
    }
}
config = {
    "foo": {
        "bar": "qux",
        "baz": {
            "quux": "corge",
            "grault": "garply"
        }
    }
}
```

Here the lookup retrieves the value of 'foo.bar' from the document, which is 'qux', and sets the value of schema to the schema registered under that name in the schema registry. Resolved, this would equate to:

```python
schema = {
    "foo": {
        "type": "dict",
        "schema": {
            "bar": {"type": "string"},
            "baz": {
                "schema": {
                    "quux": {"type": "string"},
                    "grault": {"type": "string"}
                }
            }
        }
    }
}
config = {
    "foo": {
        "bar": "qux",
        "baz": {
            "quux": "corge",
            "grault": "garply"
        }
    }
}
```
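For what it's worth, this is roughly how I imagine the lookup could be approximated today with a custom rule (a sketch only; the rule name `dynamic_schema` is made up for illustration, and it leans on the same `_lookup_field`, `_get_child_validator` and registry `get` calls used above):

```python
from cerberus import Validator, schema_registry


class LookupValidator(Validator):
    def _validate_dynamic_schema(self, constraint, field, value):
        """Apply a registered schema whose name is looked up in the document.

        The rule's arguments are validated against this schema:
        {'type': 'dict'}
        """
        # constraint is expected to look like {"lookup": "^foo.bar"};
        # _lookup_field resolves the path and returns (field_name, value).
        schema_name = self._lookup_field(constraint["lookup"])[1]
        child_schema = schema_registry.get(schema_name)
        if child_schema is None:
            self._error(field, f"no registered schema named {schema_name!r}")
            return
        # Validate the subdocument against the resolved schema.
        child = self._get_child_validator(
            document_crumb=field,
            schema_crumb=(field, "dynamic_schema"),
            schema=child_schema,
        )
        if not child(value):
            self._error(child._errors)
```

With the 'qux' schema registered as above, a rule like {"baz": {"dynamic_schema": {"lookup": "^foo.bar"}}} would validate config['foo']['baz'] against it, but this still sits outside the native 'schema' rule, so normalization and error reporting don't behave the same as built-in support would.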
charlverster changed the title from "Dynamically set value for schema based on document values" to "Dynamically set value for schema based on document values [Proposed label: feature_request]" on Feb 27, 2024