Skip to content

sanity

Checker

Base class for sanity checkers.

Source code in t4_devkit/sanity/checker.py
class Checker(ABC):
    """Base class for sanity checkers."""

    id: RuleID
    name: RuleName
    description: str
    severity: Severity

    def __call__(self, context: SanityContext) -> Report:
        match self.can_skip(context):
            case Some(skip):
                return make_skipped(self.id, self.name, self.severity, self.description, skip)

        reasons = self.check(context)
        return make_report(self.id, self.name, self.severity, self.description, reasons)

    def can_skip(self, _: SanityContext) -> Maybe[Reason]:
        """Return a skip reason if the checker should be skipped."""
        return Nothing

    @abstractmethod
    def check(self, context: SanityContext) -> list[Reason] | None:
        """Return a list of reasons if the checker fails, or None if it passes.

        Args:
            context (SanityContext): The sanity context.

        Returns:
            A list of reasons if the checker fails, or None if it passes.
        """
        pass

can_skip(_)

Return a skip reason if the checker should be skipped.

Source code in t4_devkit/sanity/checker.py
def can_skip(self, _: SanityContext) -> Maybe[Reason]:
    """Return a skip reason if the checker should be skipped.

    This implementation ignores the context and always returns `Nothing`.
    """
    return Nothing

check(context) abstractmethod

Return a list of reasons if the checker fails, or None if it passes.

Parameters:

Name Type Description Default
context SanityContext

The sanity context.

required

Returns:

Type Description
list[Reason] | None

A list of reasons if the checker fails, or None if it passes.

Source code in t4_devkit/sanity/checker.py
@abstractmethod
def check(self, context: SanityContext) -> list[Reason] | None:
    """Run the rule against the given context.

    Args:
        context (SanityContext): The sanity context.

    Returns:
        A list of reasons if the checker fails, or None if it passes.
    """

CheckerRegistry

Source code in t4_devkit/sanity/registry.py
class CheckerRegistry(dict[RuleGroup, dict[RuleID, type[Checker]]]):
    """Two-level mapping of rule group -> rule ID -> checker class."""

    def register(self) -> Callable:
        """Register a checker class.

        Returns:
            A decorator function that registers the checker class and hands
            the class back unchanged.
        """

        def _decorate(module: type[Checker]) -> type[Checker]:
            self._add_module(module)
            return module

        return _decorate

    def _add_module(self, module: type[Checker]) -> None:
        """Store a checker class under its rule group and rule ID.

        Args:
            module (type[Checker]): The checker class to store.

        Raises:
            TypeError: If `module` is not a class.
            ValueError: If the rule ID maps to no rule group, or the rule ID
                has already been registered in its group.
        """
        if not inspect.isclass(module):
            raise TypeError(f"module must be a class, but got {type(module)}.")

        group = RuleGroup.to_group(module.id)
        if group is None:
            raise ValueError(
                f"'{module.id}' doesn't belong to any rule groups: {RuleGroup.values()}"
            )

        # Create the per-group table on first use, then reject duplicate IDs.
        bucket = self.setdefault(group, {})
        if module.id in bucket:
            raise ValueError(f"'{module.id}' has already been registered.")

        bucket[module.id] = module

    def build(self, excludes: Sequence[str] | None = None) -> list[Checker]:
        """Build a list of checkers from the registry.

        Args:
            excludes (Sequence[str] | None, optional): A list of rule IDs or rule groups to exclude.

        Returns:
            A list of checkers, one new instance per registered class that is
            not excluded.
        """
        excluded = [] if excludes is None else excludes

        checkers = []
        for group, members in self.items():
            for rule_id, checker_cls in members.items():
                # A rule is dropped when either its own ID or its group value is listed.
                if rule_id in excluded or group.value in excluded:
                    continue
                checkers.append(checker_cls())
        return checkers

build(excludes=None)

Build a list of checkers from the registry.

Parameters:

Name Type Description Default
excludes Sequence[str] | None

A list of rule IDs or rule groups to exclude.

None

Returns:

Type Description
list[Checker]

A list of checkers.

Source code in t4_devkit/sanity/registry.py
def build(self, excludes: Sequence[str] | None = None) -> list[Checker]:
    """Build a list of checkers from the registry.

    Args:
        excludes (Sequence[str] | None, optional): A list of rule IDs or rule groups to exclude.

    Returns:
        A list of checkers, one new instance per registered class that is
        not excluded.
    """
    excluded = [] if excludes is None else excludes

    checkers = []
    for group, members in self.items():
        for rule_id, checker_cls in members.items():
            # A rule is dropped when either its own ID or its group value is listed.
            if rule_id in excluded or group.value in excluded:
                continue
            checkers.append(checker_cls())
    return checkers

register()

Register a checker class.

Returns:

Type Description
Callable

A decorator function that registers the checker class.

Source code in t4_devkit/sanity/registry.py
def register(self) -> Callable:
    """Register a checker class.

    Returns:
        A decorator function that registers the checker class and hands the
        class back unchanged.
    """

    def _decorate(module: type[Checker]) -> type[Checker]:
        self._add_module(module)
        return module

    return _decorate

Report

A report for a rule.

Attributes:

Name Type Description
id RuleID

The ID of the rule.

name RuleName

The name of the rule.

severity Severity

The severity of the rule.

description str

The description of the rule.

status Status

The status of the report.

reasons list[Reason] | None

The list of reasons for the report if the report is a failure or skipped.

Source code in t4_devkit/sanity/result.py
@define
class Report:
    """Outcome of running a single rule.

    Attributes:
        id (RuleID): The ID of the rule.
        name (RuleName): The name of the rule.
        severity (Severity): The severity of the rule.
        description (str): The description of the rule.
        status (Status): The status of the report.
        reasons (list[Reason] | None): The list of reasons for the report if the report is a failure or skipped.
    """

    id: RuleID
    name: RuleName
    severity: Severity
    description: str
    status: Status
    reasons: list[Reason] | None = field(default=None)

    def __attrs_post_init__(self) -> None:
        """Check that `reasons` is set exactly when the status is not PASSED."""
        has_reasons = self.reasons is not None
        if self.status == Status.PASSED:
            assert not has_reasons, "Passed report cannot have reasons"
        else:
            assert has_reasons, "Non-passed report must have reasons"

    def is_passed(self, *, strict: bool = False) -> bool:
        """Check if the status is passed.

        A report counts as passed when its status is PASSED or SKIPPED, or,
        unless `strict` is set, when the rule severity is a warning.
        """
        if self.status == Status.PASSED or self.is_skipped():
            return True
        return not strict and self.severity.is_warning()

    def is_failed(self, *, strict: bool = False) -> bool:
        """Check if the status is failed.

        A FAILED report of an error-severity rule is always failed; otherwise
        the report is failed when it is neither passed nor skipped.
        """
        if self.status == Status.FAILED and self.severity.is_error():
            return True
        return not (self.is_passed(strict=strict) or self.is_skipped())

    def is_skipped(self) -> bool:
        """Check if the status is skipped."""
        return self.status == Status.SKIPPED

    def to_str(self, *, strict: bool = False) -> str:
        """Return a string representation of the report.

        The text is wrapped in ANSI color escapes: a header line with the rule
        ID, followed by one line per reason except in the all-green case.

        Args:
            strict (bool): Whether to consider warnings as failures.

        Returns:
            A string representation of the report.
        """
        reasons = self.reasons or []
        if not self.is_passed(strict=strict):
            # red: the report does not pass
            header = f"\033[31m  {self.id}:\033[0m\n"
            details = [f"\033[31m     - {reason}\033[0m\n" for reason in reasons]
        elif self.is_skipped():
            # cyan: the checker was skipped
            header = f"\033[36m  {self.id}: [SKIPPED]\033[0m\n"
            details = [f"\033[36m     - {reason}\033[0m\n" for reason in reasons]
        elif self.severity.is_warning() and self.reasons:
            # yellow: a tolerated warning that still carries reasons
            header = f"\033[33m  {self.id}:\033[0m\n"
            details = [f"\033[33m     - {reason}\033[0m\n" for reason in reasons]
        else:
            # green: nothing to list
            header = f"\033[32m  {self.id}: ✅\033[0m\n"
            details = []
        return header + "".join(details)

is_failed(*, strict=False)

Check if the status is failed.

Source code in t4_devkit/sanity/result.py
def is_failed(self, *, strict: bool = False) -> bool:
    """Check if the status is failed.

    A FAILED report of an error-severity rule is always failed; otherwise the
    report is failed when it is neither passed nor skipped.
    """
    if self.status == Status.FAILED and self.severity.is_error():
        return True
    return not (self.is_passed(strict=strict) or self.is_skipped())

is_passed(*, strict=False)

Check if the status is passed.

Source code in t4_devkit/sanity/result.py
def is_passed(self, *, strict: bool = False) -> bool:
    """Check if the status is passed.

    A report counts as passed when its status is PASSED or SKIPPED, or, unless
    `strict` is set, when the rule severity is a warning.
    """
    if self.status == Status.PASSED or self.is_skipped():
        return True
    return not strict and self.severity.is_warning()

is_skipped()

Check if the status is skipped.

Source code in t4_devkit/sanity/result.py
def is_skipped(self) -> bool:
    """Check if the status is skipped, i.e. equal to `Status.SKIPPED`."""
    return self.status == Status.SKIPPED

to_str(*, strict=False)

Return a string representation of the report.

Parameters:

Name Type Description Default
strict bool

Whether to consider warnings as failures.

False

Returns:

Type Description
str

A string representation of the report.

Source code in t4_devkit/sanity/result.py
def to_str(self, *, strict: bool = False) -> str:
    """Return a string representation of the report.

    The text is wrapped in ANSI color escapes: a header line with the rule ID,
    followed by one line per reason except in the all-green case.

    Args:
        strict (bool): Whether to consider warnings as failures.

    Returns:
        A string representation of the report.
    """
    reasons = self.reasons or []
    if not self.is_passed(strict=strict):
        # red: the report does not pass
        header = f"\033[31m  {self.id}:\033[0m\n"
        details = [f"\033[31m     - {reason}\033[0m\n" for reason in reasons]
    elif self.is_skipped():
        # cyan: the checker was skipped
        header = f"\033[36m  {self.id}: [SKIPPED]\033[0m\n"
        details = [f"\033[36m     - {reason}\033[0m\n" for reason in reasons]
    elif self.severity.is_warning() and self.reasons:
        # yellow: a tolerated warning that still carries reasons
        header = f"\033[33m  {self.id}:\033[0m\n"
        details = [f"\033[33m     - {reason}\033[0m\n" for reason in reasons]
    else:
        # green: nothing to list
        header = f"\033[32m  {self.id}: ✅\033[0m\n"
        details = []
    return header + "".join(details)

RuleGroup

Source code in t4_devkit/sanity/registry.py
@unique
class RuleGroup(Enum):
    """Rule groups; each value is the short code looked up inside rule IDs."""

    STRUCTURE = "STR"
    RECORD = "REC"
    REFERENCE = "REF"
    FORMAT = "FMT"
    TIERIV = "TIV"

    @classmethod
    def values(cls) -> list[str]:
        """Return a list of all rule group values."""
        codes = []
        for member in cls:
            codes.append(member.value)
        return codes

    @classmethod
    def to_group(cls, id: RuleID) -> RuleGroup | None:
        """Convert a rule ID to a rule group.

        Groups are tried in definition order and the first one whose value is
        contained in the ID wins.

        Args:
            id (RuleID): The ID of the rule.

        Returns:
            The rule group if the rule ID belongs to any rule group, otherwise None.
        """
        return next((group for group in cls if group.value in id), None)

to_group(id) classmethod

Convert a rule ID to a rule group.

Parameters:

Name Type Description Default
id RuleID

The ID of the rule.

required

Returns:

Type Description
RuleGroup | None

The rule group if the rule ID belongs to any rule group, otherwise None.

Source code in t4_devkit/sanity/registry.py
@classmethod
def to_group(cls, id: RuleID) -> RuleGroup | None:
    """Convert a rule ID to a rule group.

    Groups are tried in definition order and the first one whose value is
    contained in the ID wins.

    Args:
        id (RuleID): The ID of the rule.

    Returns:
        The rule group if the rule ID belongs to any rule group, otherwise None.
    """
    return next((group for group in RuleGroup if group.value in id), None)

values() classmethod

Return a list of all rule group values.

Source code in t4_devkit/sanity/registry.py
@classmethod
def values(cls) -> list[str]:
    """Return a list of all rule group values."""
    codes = []
    for member in cls:
        codes.append(member.value)
    return codes

SanityResult

The result of a Sanity check.

Attributes:

Name Type Description
dataset_id str

The ID of the dataset.

version str | None

The version of the dataset.

reports list[Report]

The list of reports.

Source code in t4_devkit/sanity/result.py
@define
class SanityResult:
    """Aggregated outcome of running the sanity checks on one dataset.

    Attributes:
        dataset_id (str): The ID of the dataset.
        version (str | None): The version of the dataset.
        reports (list[Report]): The list of reports.
    """

    dataset_id: str
    version: str | None
    reports: list[Report]

    @classmethod
    def from_context(cls, context: SanityContext, reports: list[Report]) -> Self:
        """Create a SanityResult from a SanityContext and a list of reports.

        The dataset ID and version are read from the context through
        `value_or`, with "UNKNOWN" and None as the respective fallbacks.

        Args:
            context (SanityContext): The SanityContext to use.
            reports (list[Report]): The list of reports to include in the result.

        Returns:
            The created SanityResult.
        """
        dataset_id = context.dataset_id.value_or("UNKNOWN")
        version = context.version.value_or(None)
        return cls(dataset_id=dataset_id, version=version, reports=reports)

    def is_passed(self, *, strict: bool = False) -> bool:
        """Return True if all reports are passed, False otherwise.

        Args:
            strict (bool): Whether to consider warnings as failures.

        Returns:
            True if all reports are passed, False otherwise.
        """
        for report in self.reports:
            if not report.is_passed(strict=strict):
                return False
        return True

    def to_str(self, *, strict: bool = False) -> str:
        """Return a string representation of the result.

        The output is a dataset ID banner line followed by the text of every
        report, in order.

        Args:
            strict (bool): Whether to consider warnings as failures.

        Returns:
            A string representation of the result.
        """
        chunks = [f"=== DatasetID: {self.dataset_id} ===\n"]
        chunks.extend(report.to_str(strict=strict) for report in self.reports)
        return "".join(chunks)

from_context(context, reports) classmethod

Create a SanityResult from a SanityContext and a list of reports.

Parameters:

Name Type Description Default
context SanityContext

The SanityContext to use.

required
reports list[Report]

The list of reports to include in the result.

required

Returns:

Type Description
Self

The created SanityResult.

Source code in t4_devkit/sanity/result.py
@classmethod
def from_context(cls, context: SanityContext, reports: list[Report]) -> Self:
    """Create a SanityResult from a SanityContext and a list of reports.

    The dataset ID and version are read from the context through `value_or`,
    with "UNKNOWN" and None as the respective fallbacks.

    Args:
        context (SanityContext): The SanityContext to use.
        reports (list[Report]): The list of reports to include in the result.

    Returns:
        The created SanityResult.
    """
    dataset_id = context.dataset_id.value_or("UNKNOWN")
    version = context.version.value_or(None)
    return cls(dataset_id=dataset_id, version=version, reports=reports)

is_passed(*, strict=False)

Return True if all reports are passed, False otherwise.

Parameters:

Name Type Description Default
strict bool

Whether to consider warnings as failures.

False

Returns:

Type Description
bool

True if all reports are passed, False otherwise.

Source code in t4_devkit/sanity/result.py
def is_passed(self, *, strict: bool = False) -> bool:
    """Return True if all reports are passed, False otherwise.

    Args:
        strict (bool): Whether to consider warnings as failures.

    Returns:
        True if all reports are passed, False otherwise.
    """
    for report in self.reports:
        if not report.is_passed(strict=strict):
            return False
    return True

to_str(*, strict=False)

Return a string representation of the result.

Parameters:

Name Type Description Default
strict bool

Whether to consider warnings as failures.

False

Returns:

Type Description
str

A string representation of the result.

Source code in t4_devkit/sanity/result.py
def to_str(self, *, strict: bool = False) -> str:
    """Return a string representation of the result.

    The output is a dataset ID banner line followed by the text of every
    report, in order.

    Args:
        strict (bool): Whether to consider warnings as failures.

    Returns:
        A string representation of the result.
    """
    chunks = [f"=== DatasetID: {self.dataset_id} ===\n"]
    chunks.extend(report.to_str(strict=strict) for report in self.reports)
    return "".join(chunks)

Status

Runtime outcome per checker.

Source code in t4_devkit/sanity/result.py
class Status(str, Enum):
    """Runtime outcome per checker.

    Members are also `str` instances, so they compare equal to their plain
    string values.
    """

    PASSED = "PASSED"  # the checker ran and passed
    FAILED = "FAILED"  # the checker ran and failed
    SKIPPED = "SKIPPED"  # the checker was skipped

print_sanity_result(result, *, strict=False)

Print detailed and summary results of a sanity check.

Parameters:

Name Type Description Default
result SanityResult

The result of a sanity check.

required
strict bool

Whether to consider warnings as failures.

False
Source code in t4_devkit/sanity/result.py
def print_sanity_result(result: SanityResult, *, strict: bool = False) -> None:
    """Print detailed and summary results of a sanity check.

    The detailed per-report text is printed first, followed by a one-row table
    with the dataset ID, version and the passed/failed/skipped/warning counts.

    Args:
        result (SanityResult): The result of a sanity check.
        strict (bool): Forwarded to `to_str` and `is_passed` when rendering
            and counting the reports.
    """
    # print detailed result
    print(result.to_str(strict=strict))

    # tally the summary counters in a single pass
    # NOTE(review): `Report.is_passed` is also true for skipped reports, so a
    # skipped report is counted under both "Passed" and "Skipped" — confirm
    # this is intended.
    passed = failed = skipped = warnings = 0
    for rp in result.reports:
        if rp.is_passed(strict=strict):
            passed += 1
        else:
            failed += 1
        if rp.is_skipped():
            skipped += 1
        # just count the number of warnings
        if rp.severity.is_warning() and rp.reasons:
            warnings += 1

    summary_rows = [[result.dataset_id, result.version, passed, failed, skipped, warnings]]
    headers = ["DatasetID", "Version", "Passed", "Failed", "Skipped", "Warnings"]

    print(tabulate(summary_rows, headers=headers, tablefmt="pretty"))

sanity_check(data_root, revision=None, *, excludes=None)

Run sanity checks on the given data root.

Parameters:

Name Type Description Default
data_root str

The root directory of the data.

required
revision str | None

The revision to check. If None, the latest revision is used.

None
excludes Sequence[str] | None

A list of rule IDs or rule groups to exclude.

None

Returns:

Type Description
SanityResult

A SanityResult object.

Source code in t4_devkit/sanity/run.py
def sanity_check(
    data_root: str,
    revision: str | None = None,
    *,
    excludes: Sequence[str] | None = None,
) -> SanityResult:
    """Run sanity checks on the given data root.

    A context is created from the data root, every checker built from the
    registry is called with that context, and the reports are collected into
    a result.

    Args:
        data_root (str): The root directory of the data.
        revision (str | None, optional): The revision to check. If None, the latest revision is used.
        excludes (Sequence[str] | None, optional): A list of rule IDs or rule groups to exclude.

    Returns:
        A SanityResult object.
    """
    context = SanityContext.from_path(data_root, revision=revision)
    reports = [run(context) for run in CHECKERS.build(excludes=excludes)]
    return SanityResult.from_context(context, reports)