Skip to content

sanity

Checker

Base class for sanity checkers.

Source code in t4_devkit/sanity/checker.py
class Checker(ABC):
    """Base class for sanity checkers."""

    id: RuleID
    name: RuleName
    description: str
    severity: Severity

    def __call__(self, context: SanityContext, fix: bool = False) -> Report:
        """Run the checker and return a report.

        The issues will be fixed if the checker is fixable, `fix` is True and
            the checker returns a list of failure or warning reasons (not `None`).

        Args:
            context (SanityContext): The sanity context.
            fix (bool, optional): Whether to attempt to fix the issue.

        Returns:
            A report containing the results of the checker.
        """
        match self.can_skip(context):
            case Some(skip):
                return make_skipped(self.id, self.name, self.severity, self.description, skip)

        reasons = self.check(context)
        fixed = self.fix(context) if fix and reasons else False
        return make_report(self.id, self.name, self.severity, self.description, reasons, fixed)

    def can_skip(self, context: SanityContext) -> Maybe[Reason]:
        """Return a skip reason if the checker should be skipped.

        Args:
            context (SanityContext): The sanity context.

        Returns:
            A skip reason if the checker should be skipped, or Nothing if it should not be skipped.
        """
        return Nothing

    @abstractmethod
    def check(self, context: SanityContext) -> list[Reason] | None:
        """Return a list of reasons if the checker fails, or None if it passes.

        Args:
            context (SanityContext): The sanity context.

        Returns:
            A list of reasons if the checker fails, or None if it passes.
        """
        pass

    def fix(self, context: SanityContext) -> bool:
        """Fix the issue reported by the checker.

        Args:
            context (SanityContext): The sanity context.

        Returns:
            True if the issue was fixed, False otherwise.
        """
        return False

__call__(context, fix=False)

Run the checker and return a report.

The issues will be fixed if the checker is fixable, fix is True and the checker returns a list of failure or warning reasons (not None).

Parameters:

Name Type Description Default
context SanityContext

The sanity context.

required
fix bool

Whether to attempt to fix the issue.

False

Returns:

Type Description
Report

A report containing the results of the checker.

Source code in t4_devkit/sanity/checker.py
def __call__(self, context: SanityContext, fix: bool = False) -> Report:
    """Run the checker and return a report.

    The issues will be fixed if the checker is fixable, `fix` is True and
        the checker returns a list of failure or warning reasons (not `None`).

    Args:
        context (SanityContext): The sanity context.
        fix (bool, optional): Whether to attempt to fix the issue.

    Returns:
        A report containing the results of the checker.
    """
    match self.can_skip(context):
        case Some(skip):
            return make_skipped(self.id, self.name, self.severity, self.description, skip)

    reasons = self.check(context)
    fixed = self.fix(context) if fix and reasons else False
    return make_report(self.id, self.name, self.severity, self.description, reasons, fixed)

can_skip(context)

Return a skip reason if the checker should be skipped.

Parameters:

Name Type Description Default
context SanityContext

The sanity context.

required

Returns:

Type Description
Maybe[Reason]

A skip reason if the checker should be skipped, or Nothing if it should not be skipped.

Source code in t4_devkit/sanity/checker.py
def can_skip(self, context: SanityContext) -> Maybe[Reason]:
    """Return a skip reason if the checker should be skipped.

    This default implementation always returns `Nothing`, so a checker is
    never skipped unless it overrides this method.

    Args:
        context (SanityContext): The sanity context.

    Returns:
        A skip reason if the checker should be skipped, or Nothing if it should not be skipped.
    """
    return Nothing

check(context) abstractmethod

Return a list of reasons if the checker fails, or None if it passes.

Parameters:

Name Type Description Default
context SanityContext

The sanity context.

required

Returns:

Type Description
list[Reason] | None

A list of reasons if the checker fails, or None if it passes.

Source code in t4_devkit/sanity/checker.py
@abstractmethod
def check(self, context: SanityContext) -> list[Reason] | None:
    """Return a list of reasons if the checker fails, or None if it passes.

    Abstract: concrete checkers must override this method. The body here is
    empty and therefore yields None.

    Args:
        context (SanityContext): The sanity context.

    Returns:
        A list of reasons if the checker fails, or None if it passes.
    """
    pass

fix(context)

Fix the issue reported by the checker.

Parameters:

Name Type Description Default
context SanityContext

The sanity context.

required

Returns:

Type Description
bool

True if the issue was fixed, False otherwise.

Source code in t4_devkit/sanity/checker.py
def fix(self, context: SanityContext) -> bool:
    """Fix the issue reported by the checker.

    This default implementation changes nothing and always returns False;
    fixable checkers are expected to override it.

    Args:
        context (SanityContext): The sanity context.

    Returns:
        True if the issue was fixed, False otherwise.
    """
    return False

CheckerRegistry

Source code in t4_devkit/sanity/registry.py
class CheckerRegistry(dict[RuleGroup, dict[RuleID, type[Checker]]]):
    """Two-level mapping of rule group -> rule ID -> checker class."""

    def register(self) -> Callable:
        """Create a class decorator that adds the decorated checker to this registry.

        Returns:
            A decorator function that registers the checker class.
        """

        def _decorate(module: type[Checker]) -> type[Checker]:
            # Store the class and hand it back unchanged.
            self._add_module(module)
            return module

        return _decorate

    def _add_module(self, module: type[Checker]) -> None:
        """Insert `module` under the rule group derived from its `id`.

        Args:
            module (type[Checker]): The checker class to register.

        Raises:
            TypeError: If `module` is not a class.
            ValueError: If the ID maps to no rule group or is already registered.
        """
        if not inspect.isclass(module):
            raise TypeError(f"module must be a class, but got {type(module)}.")

        group = RuleGroup.to_group(module.id)
        if group is None:
            raise ValueError(
                f"'{module.id}' doesn't belong to any rule groups: {RuleGroup.values()}"
            )

        # Create the per-group table on first use.
        registered = self.setdefault(group, {})
        if module.id in registered:
            raise ValueError(f"'{module.id}' has already been registered.")

        registered[module.id] = module

    def build(self, excludes: Sequence[str] | None = None) -> list[Checker]:
        """Instantiate every registered checker that is not excluded.

        Args:
            excludes (Sequence[str] | None, optional): A list of rule IDs or rule groups to exclude.

        Returns:
            A list of checkers.
        """
        excluded = [] if excludes is None else excludes

        built = []
        for group, checkers in self.items():
            # Excluding a group value drops every checker registered under it.
            if group.value in excluded:
                continue
            for rule_id, checker_cls in checkers.items():
                if rule_id not in excluded:
                    built.append(checker_cls())
        return built

build(excludes=None)

Build a list of checkers from the registry.

Parameters:

Name Type Description Default
excludes Sequence[str] | None

A list of rule IDs or rule groups to exclude.

None

Returns:

Type Description
list[Checker]

A list of checkers.

Source code in t4_devkit/sanity/registry.py
def build(self, excludes: Sequence[str] | None = None) -> list[Checker]:
    """Instantiate every registered checker that is not excluded.

    Args:
        excludes (Sequence[str] | None, optional): A list of rule IDs or rule groups to exclude.

    Returns:
        A list of checkers.
    """
    excluded = [] if excludes is None else excludes

    built = []
    for group, checkers in self.items():
        # Excluding a group value drops every checker registered under it.
        if group.value in excluded:
            continue
        for rule_id, checker_cls in checkers.items():
            if rule_id not in excluded:
                built.append(checker_cls())
    return built

register()

Register a checker class.

Returns:

Type Description
Callable

A decorator function that registers the checker class.

Source code in t4_devkit/sanity/registry.py
def register(self) -> Callable:
    """Create a class decorator that adds the decorated checker to this registry.

    Returns:
        A decorator function that registers the checker class.
    """

    def _decorate(module: type[Checker]) -> type[Checker]:
        # Store the class and hand it back unchanged.
        self._add_module(module)
        return module

    return _decorate

Report

A report for a rule.

Attributes:

Name Type Description
id RuleID

The ID of the rule.

name RuleName

The name of the rule.

severity Severity

The severity of the rule.

description str

The description of the rule.

status Status

The status of the report.

reasons list[Reason] | None

The list of reasons for the report if the report is a failure or skipped.

fixed bool

Whether the report is fixed.

Source code in t4_devkit/sanity/result.py
@define
class Report:
    """A report for a rule.

    Attributes:
        id (RuleID): The ID of the rule.
        name (RuleName): The name of the rule.
        severity (Severity): The severity of the rule.
        description (str): The description of the rule.
        status (Status): The status of the report.
        reasons (list[Reason] | None): The list of reasons for the report if the report is a failure or skipped.
        fixed (bool): Whether the report is fixed.
    """

    id: RuleID
    name: RuleName
    severity: Severity
    description: str
    status: Status
    reasons: list[Reason] | None = field(default=None)
    fixed: bool = False

    def __attrs_post_init__(self) -> None:
        # Invariant: only non-passed reports carry reasons.
        # NOTE: these are `assert` statements, so they are not enforced under `python -O`.
        if self.status == Status.PASSED:
            assert self.reasons is None, "Passed report cannot have reasons"
        else:
            assert self.reasons is not None, "Non-passed report must have reasons"

    def is_passed(self, *, strict: bool = False) -> bool:
        """Check if the report counts as passed.

        A report counts as passed when its status is PASSED or SKIPPED, when it
        has warning severity and `strict` is False, or when it has been fixed.

        Args:
            strict (bool): Whether to consider warnings as failures.

        Returns:
            True if the report counts as passed, False otherwise.
        """
        return (
            self.status == Status.PASSED
            or self.is_skipped()
            or (not strict and self.severity.is_warning())
            or self.fixed
        )

    def is_failed(self, *, strict: bool = False) -> bool:
        """Check if the report counts as failed.

        This is the exact negation of `is_passed`, so a report is never both
        passed and failed (in particular, a fixed error is not a failure).

        Args:
            strict (bool): Whether to consider warnings as failures.

        Returns:
            True if the report counts as failed, False otherwise.
        """
        return not self.is_passed(strict=strict)

    def is_skipped(self) -> bool:
        """Check if the status is skipped."""
        return self.status == Status.SKIPPED

    def to_str(self, *, strict: bool = False) -> str:
        """Return a string representation of the report.

        Args:
            strict (bool): Whether to consider warnings as failures.

        Returns:
            A string representation of the report.
        """
        reasons = self.reasons or []
        if not self.is_passed(strict=strict):
            # failure reasons
            color, suffix = Fore.RED, ""
        elif self.is_skipped():
            # skipped reasons
            color, suffix = Fore.CYAN, " [SKIPPED]"
        elif self.fixed:
            # failure or warning that has been fixed; tested before the warning
            # branch so that fixed warnings are reported as fixed
            color, suffix = Fore.GREEN, " --> FIXED ✅"
        elif self.severity.is_warning() and self.reasons:
            # warning reasons
            color, suffix = Fore.YELLOW, ""
        else:
            # passed: no reasons are listed
            color, suffix = Fore.GREEN, " ✅"
            reasons = []

        parts = [f"{color}  {self.id}:{suffix}\n"]
        parts.extend(f"{color}     - {reason}\n" for reason in reasons)
        parts.append(f"{Fore.RESET}")
        return "".join(parts)

is_failed(*, strict=False)

Check if the status is failed.

Source code in t4_devkit/sanity/result.py
def is_failed(self, *, strict: bool = False) -> bool:
    """Check if the report counts as failed.

    This is the exact negation of `is_passed`, so a report is never both
    passed and failed (in particular, a fixed error is not a failure).

    Args:
        strict (bool): Whether to consider warnings as failures.

    Returns:
        True if the report counts as failed, False otherwise.
    """
    return not self.is_passed(strict=strict)

is_passed(*, strict=False)

Check if the status is passed.

Source code in t4_devkit/sanity/result.py
def is_passed(self, *, strict: bool = False) -> bool:
    """Check if the report counts as passed.

    A report counts as passed when its status is PASSED or SKIPPED, when it
    has warning severity and `strict` is False, or when it has been fixed.

    Args:
        strict (bool): Whether to consider warnings as failures.

    Returns:
        True if the report counts as passed, False otherwise.
    """
    if self.status == Status.PASSED:
        return True
    if self.is_skipped():
        return True
    # Warnings only count as failures in strict mode.
    if not strict and self.severity.is_warning():
        return True
    return self.fixed

is_skipped()

Check if the status is skipped.

Source code in t4_devkit/sanity/result.py
def is_skipped(self) -> bool:
    """Check if the status is skipped.

    Returns:
        True if `status` equals `Status.SKIPPED`, False otherwise.
    """
    return self.status == Status.SKIPPED

to_str(*, strict=False)

Return a string representation of the report.

Parameters:

Name Type Description Default
strict bool

Whether to consider warnings as failures.

False

Returns:

Type Description
str

A string representation of the report.

Source code in t4_devkit/sanity/result.py
def to_str(self, *, strict: bool = False) -> str:
    """Return a string representation of the report.

    Args:
        strict (bool): Whether to consider warnings as failures.

    Returns:
        A string representation of the report.
    """
    reasons = self.reasons or []
    if not self.is_passed(strict=strict):
        # failure reasons
        color, suffix = Fore.RED, ""
    elif self.is_skipped():
        # skipped reasons
        color, suffix = Fore.CYAN, " [SKIPPED]"
    elif self.fixed:
        # failure or warning that has been fixed; tested before the warning
        # branch so that fixed warnings are reported as fixed
        color, suffix = Fore.GREEN, " --> FIXED ✅"
    elif self.severity.is_warning() and self.reasons:
        # warning reasons
        color, suffix = Fore.YELLOW, ""
    else:
        # passed: no reasons are listed
        color, suffix = Fore.GREEN, " ✅"
        reasons = []

    parts = [f"{color}  {self.id}:{suffix}\n"]
    parts.extend(f"{color}     - {reason}\n" for reason in reasons)
    parts.append(f"{Fore.RESET}")
    return "".join(parts)

RuleGroup

Source code in t4_devkit/sanity/registry.py
@unique
class RuleGroup(Enum):
    """Rule groups; each member's value is the code looked up inside rule IDs."""

    STRUCTURE = "STR"
    RECORD = "REC"
    REFERENCE = "REF"
    FORMAT = "FMT"
    TIERIV = "TIV"

    @classmethod
    def values(cls) -> list[str]:
        """Return a list of all rule group values, in definition order."""
        codes = []
        for member in cls:
            codes.append(member.value)
        return codes

    @classmethod
    def to_group(cls, id: RuleID) -> RuleGroup | None:
        """Find the rule group a rule ID belongs to.

        The first group (in definition order) whose value occurs anywhere in
        `id` is returned.

        Args:
            id (RuleID): The ID of the rule.

        Returns:
            The rule group if the rule ID belongs to any rule group, otherwise None.
        """
        return next((group for group in cls if group.value in id), None)

to_group(id) classmethod

Convert a rule ID to a rule group.

Parameters:

Name Type Description Default
id RuleID

The ID of the rule.

required

Returns:

Type Description
RuleGroup | None

The rule group if the rule ID belongs to any rule group, otherwise None.

Source code in t4_devkit/sanity/registry.py
@classmethod
def to_group(cls, id: RuleID) -> RuleGroup | None:
    """Find the rule group a rule ID belongs to.

    The first group (in definition order) whose value occurs anywhere in
    `id` is returned.

    Args:
        id (RuleID): The ID of the rule.

    Returns:
        The rule group if the rule ID belongs to any rule group, otherwise None.
    """
    return next((group for group in cls if group.value in id), None)

values() classmethod

Return a list of all rule group values.

Source code in t4_devkit/sanity/registry.py
@classmethod
def values(cls) -> list[str]:
    """Return a list of all rule group values, in definition order."""
    codes = []
    for member in cls:
        codes.append(member.value)
    return codes

SanityResult

The result of a Sanity check.

Attributes:

Name Type Description
dataset_id str

The ID of the dataset.

version str | None

The version of the dataset.

reports list[Report]

The list of reports.

Source code in t4_devkit/sanity/result.py
@define
class SanityResult:
    """The result of a Sanity check.

    Attributes:
        dataset_id (str): The ID of the dataset.
        version (str | None): The version of the dataset.
        reports (list[Report]): The list of reports.
    """

    dataset_id: str
    version: str | None
    reports: list[Report]

    @classmethod
    def from_context(cls, context: SanityContext, reports: list[Report]) -> Self:
        """Build a SanityResult from a SanityContext and a list of reports.

        The dataset ID falls back to "UNKNOWN" and the version to None when the
        context does not provide them.

        Args:
            context (SanityContext): The SanityContext to use.
            reports (list[Report]): The list of reports to include in the result.

        Returns:
            The created SanityResult.
        """
        dataset_id = context.dataset_id.value_or("UNKNOWN")
        version = context.version.value_or(None)
        return cls(dataset_id=dataset_id, version=version, reports=reports)

    def is_passed(self, *, strict: bool = False) -> bool:
        """Tell whether every report is passed.

        Args:
            strict (bool): Whether to consider warnings as failures.

        Returns:
            True if all reports are passed, False otherwise.
        """
        for report in self.reports:
            if not report.is_passed(strict=strict):
                return False
        return True

    def to_str(self, *, strict: bool = False) -> str:
        """Render the result as a header line followed by every report.

        Args:
            strict (bool): Whether to consider warnings as failures.

        Returns:
            A string representation of the result.
        """
        header = f"=== DatasetID: {self.dataset_id} ===\n"
        rendered = [report.to_str(strict=strict) for report in self.reports]
        return header + "".join(rendered)

from_context(context, reports) classmethod

Create a SanityResult from a SanityContext and a list of reports.

Parameters:

Name Type Description Default
context SanityContext

The SanityContext to use.

required
reports list[Report]

The list of reports to include in the result.

required

Returns:

Type Description
Self

The created SanityResult.

Source code in t4_devkit/sanity/result.py
@classmethod
def from_context(cls, context: SanityContext, reports: list[Report]) -> Self:
    """Build a SanityResult from a SanityContext and a list of reports.

    The dataset ID falls back to "UNKNOWN" and the version to None when the
    context does not provide them.

    Args:
        context (SanityContext): The SanityContext to use.
        reports (list[Report]): The list of reports to include in the result.

    Returns:
        The created SanityResult.
    """
    dataset_id = context.dataset_id.value_or("UNKNOWN")
    version = context.version.value_or(None)
    return cls(dataset_id=dataset_id, version=version, reports=reports)

is_passed(*, strict=False)

Return True if all reports are passed, False otherwise.

Parameters:

Name Type Description Default
strict bool

Whether to consider warnings as failures.

False

Returns:

Type Description
bool

True if all reports are passed, False otherwise.

Source code in t4_devkit/sanity/result.py
def is_passed(self, *, strict: bool = False) -> bool:
    """Tell whether every report is passed.

    Args:
        strict (bool): Whether to consider warnings as failures.

    Returns:
        True if all reports are passed, False otherwise.
    """
    for report in self.reports:
        if not report.is_passed(strict=strict):
            return False
    return True

to_str(*, strict=False)

Return a string representation of the result.

Parameters:

Name Type Description Default
strict bool

Whether to consider warnings as failures.

False

Returns:

Type Description
str

A string representation of the result.

Source code in t4_devkit/sanity/result.py
def to_str(self, *, strict: bool = False) -> str:
    """Render the result as a header line followed by every report.

    Args:
        strict (bool): Whether to consider warnings as failures.

    Returns:
        A string representation of the result.
    """
    header = f"=== DatasetID: {self.dataset_id} ===\n"
    rendered = [report.to_str(strict=strict) for report in self.reports]
    return header + "".join(rendered)

Status

Runtime outcome per checker.

Source code in t4_devkit/sanity/result.py
class Status(str, Enum):
    """Runtime outcome per checker.

    The `str` mixin makes each member compare equal to its string value.
    """

    # One member per possible outcome; each value is the member name as a string.
    PASSED = "PASSED"
    FAILED = "FAILED"
    SKIPPED = "SKIPPED"

print_sanity_result(result, *, strict=False)

Print detailed and summary results of a sanity check.

Parameters:

Name Type Description Default
result SanityResult

The result of a sanity check.

required
strict bool

Whether to consider warnings as failures.

False

Source code in t4_devkit/sanity/result.py
def print_sanity_result(result: SanityResult, *, strict: bool = False) -> None:
    """Print the detailed report followed by a one-row summary table.

    Args:
        result (SanityResult): The result of a sanity check.
        strict (bool): Whether to consider warnings as failures.
    """
    # detailed result
    print(result.to_str(strict=strict))

    # summary counters; a single report may contribute to several of them
    passed = failed = skipped = warnings = fixed = 0
    for rp in result.reports:
        if rp.is_passed(strict=strict):
            passed += 1
        else:
            failed += 1
        if rp.is_skipped():
            skipped += 1
        # warnings are only counted, whatever the pass/fail outcome is
        if rp.severity.is_warning() and rp.reasons:
            warnings += 1
        if rp.fixed:
            fixed += 1

    headers = ["DatasetID", "Version", "Passed", "Failed", "Skipped", "Warnings", "Fixed"]
    row = [result.dataset_id, result.version, passed, failed, skipped, warnings, fixed]

    print(
        tabulate(
            [row],
            headers=headers,
            tablefmt="pretty",
        ),
    )

sanity_check(data_root, revision=None, *, excludes=None, fix=False)

Run sanity checks on the given data root.

Parameters:

Name Type Description Default
data_root str

The root directory of the data.

required
revision str | None

The revision to check. If None, the latest revision is used.

None
excludes Sequence[str] | None

A list of rule IDs or rule groups to exclude.

None
fix bool

Attempt to fix the issues reported by the sanity check.

False

Returns:

Type Description
SanityResult

A SanityResult object.

Source code in t4_devkit/sanity/run.py
def sanity_check(
    data_root: str,
    revision: str | None = None,
    *,
    excludes: Sequence[str] | None = None,
    fix: bool = False,
) -> SanityResult:
    """Run every non-excluded checker on the given data root.

    Args:
        data_root (str): The root directory of the data.
        revision (str | None, optional): The revision to check. If None, the latest revision is used.
        excludes (Sequence[str] | None, optional): A list of rule IDs or rule groups to exclude;
            passed through to `CHECKERS.build`.
        fix (bool, optional): Attempt to fix the issues reported by the sanity check.

    Returns:
        A SanityResult object.
    """
    context = SanityContext.from_path(data_root, revision=revision)

    # Each checker is called with the same context and yields one report.
    reports = []
    for checker in CHECKERS.build(excludes=excludes):
        reports.append(checker(context, fix=fix))

    return SanityResult.from_context(context, reports)