diff --git a/nannyml/data_quality/__init__.py b/nannyml/data_quality/__init__.py index dc5761b3..cf509fb5 100644 --- a/nannyml/data_quality/__init__.py +++ b/nannyml/data_quality/__init__.py @@ -7,4 +7,4 @@ from .missing import MissingValuesCalculator from .unseen import UnseenValuesCalculator -from .range import NumericalRangeCalculator \ No newline at end of file +from .range import NumericalRangeCalculator diff --git a/nannyml/data_quality/range/calculator.py b/nannyml/data_quality/range/calculator.py index 277cdd49..63733c97 100644 --- a/nannyml/data_quality/range/calculator.py +++ b/nannyml/data_quality/range/calculator.py @@ -2,7 +2,7 @@ # # License: Apache Software License 2.0 -"""Continous numerical variable range monitor to ensure range supplied is within training bounds.""" +"""Continuous numerical variable range monitor to ensure range supplied is within training bounds.""" from typing import Any, Dict, List, Optional, Union @@ -15,7 +15,6 @@ from nannyml.exceptions import InvalidArgumentsException from nannyml.thresholds import Threshold, calculate_threshold_values, ConstantThreshold from nannyml.usage_logging import UsageEvent, log_usage - from .result import Result """ @@ -24,7 +23,7 @@ class NumericalRangeCalculator(AbstractCalculator): - """NumericalRangeCalculator implementation to ensure inference data numerical ranges match training.""" + """NumericalRangeCalculator ensures the monitoring data set numerical ranges match the reference data set ones.""" def __init__( self, @@ -68,7 +67,8 @@ def __init__( -------- >>> import nannyml as nml >>> reference_df, analysis_df, _ = nml.load_synthetic_car_price_dataset() - >>> feature_column_names = [col for col in reference_df.columns if col not in ['fuel','transmission','timestamp', 'y_pred', 'y_true']] + >>> feature_column_names = [col for col in reference_df.columns if col not in [ + ... 'fuel','transmission','timestamp', 'y_pred', 'y_true']] >>> calc = nml.NumericalRangeCalculator( ... 
column_names=feature_column_names, ... timestamp_column_name='timestamp', @@ -117,7 +117,7 @@ def __init__( def _calculate_out_of_range_stats(self, data: pd.Series, lower_bound: float, upper_bound: float): # to do make this calc out of range stats count_tot = data.shape[0] - count_out_of_range = ((data < lower_bound) | (data > upper_bound)).sum() + count_out_of_range = ((data < lower_bound) | (data > upper_bound)).sum() if self.normalize: count_out_of_range = count_out_of_range / count_tot return count_out_of_range @@ -138,7 +138,7 @@ def _fit(self, reference_data: pd.DataFrame, *args, **kwargs): f"Specified columns_names for NumericalRangeCalculator must all be continuous.\n" f"Categorical columns found:\n{categorical_column_names}" ) - + for col in self.column_names: self._reference_value_ranges[col] = [reference_data[col].min(), reference_data[col].max()] @@ -212,7 +212,10 @@ def _calculate_for_column(self, data: pd.DataFrame, column_name: str) -> Dict[st def _set_metric_thresholds(self, result_data: pd.DataFrame): for column_name in self.column_names: - self._lower_alert_thresholds[column_name], self._upper_alert_thresholds[column_name] = calculate_threshold_values( # noqa: E501 + ( + self._lower_alert_thresholds[column_name], + self._upper_alert_thresholds[column_name], + ) = calculate_threshold_values( # noqa: E501 threshold=self.threshold, data=result_data.loc[:, (column_name, 'value')], lower_threshold_value_limit=self.lower_threshold_value_limit, @@ -227,11 +230,17 @@ def _populate_alert_thresholds(self, result_data: pd.DataFrame) -> pd.DataFrame: result_data[(column_name, 'alert')] = result_data.apply( lambda row: True if ( - row[(column_name, 'value')] > ( - np.inf if row[(column_name, 'upper_threshold')] is None else row[(column_name, 'upper_threshold')] # noqa: E501 + row[(column_name, 'value')] + > ( + np.inf + if row[(column_name, 'upper_threshold')] is None + else row[(column_name, 'upper_threshold')] # noqa: E501 ) - or row[(column_name, 'value')] 
< ( - -np.inf if row[(column_name, 'lower_threshold')] is None else row[(column_name, 'lower_threshold')] # noqa: E501 + or row[(column_name, 'value')] + < ( + -np.inf + if row[(column_name, 'lower_threshold')] is None + else row[(column_name, 'lower_threshold')] # noqa: E501 ) ) else False, diff --git a/nannyml/data_quality/range/result.py b/nannyml/data_quality/range/result.py index c9b57bf7..b6892ebc 100644 --- a/nannyml/data_quality/range/result.py +++ b/nannyml/data_quality/range/result.py @@ -75,7 +75,8 @@ def plot( -------- >>> import nannyml as nml >>> reference, analysis, _ = nml.load_synthetic_car_price_dataset() - >>> column_names = [col for col in reference.columns if col not in ['car_age', 'km_driven', 'price_new', 'accident_count', 'door_count','timestamp', 'y_pred', 'y_true']] + >>> column_names = [col for col in reference.columns if col not in [ + ... 'car_age', 'km_driven', 'price_new', 'accident_count', 'door_count','timestamp', 'y_pred', 'y_true']] >>> calc = nml.UnseenValuesCalculator( ... column_names=column_names, ... 
timestamp_column_name='timestamp', diff --git a/nannyml/data_quality/unseen/calculator.py b/nannyml/data_quality/unseen/calculator.py index 15605aef..621582e0 100644 --- a/nannyml/data_quality/unseen/calculator.py +++ b/nannyml/data_quality/unseen/calculator.py @@ -13,6 +13,7 @@ from nannyml.base import AbstractCalculator, _list_missing, _split_features_by_type from nannyml.chunk import Chunker + # from nannyml.data_quality.base import _add_alert_flag from nannyml.exceptions import InvalidArgumentsException from nannyml.thresholds import ConstantThreshold, Threshold, calculate_threshold_values @@ -69,7 +70,8 @@ def __init__( -------- >>> import nannyml as nml >>> reference, analysis, _ = nml.load_synthetic_car_price_dataset() - >>> column_names = [col for col in reference.columns if col not in ['car_age', 'km_driven', 'price_new', 'accident_count', 'door_count','timestamp', 'y_pred', 'y_true']] + >>> column_names = [col for col in reference.columns if col not in [ + ... 'car_age', 'km_driven', 'price_new', 'accident_count', 'door_count','timestamp', 'y_pred', 'y_true']] >>> calc = nml.UnseenValuesCalculator( ... column_names=column_names, ... 
timestamp_column_name='timestamp', @@ -217,7 +219,10 @@ def _calculate_for_column(self, data: pd.DataFrame, column_name: str) -> Dict[st def _set_metric_thresholds(self, result_data: pd.DataFrame): for column_name in self.column_names: - self._lower_alert_thresholds[column_name], self._upper_alert_thresholds[column_name] = calculate_threshold_values( # noqa: E501 + ( + self._lower_alert_thresholds[column_name], + self._upper_alert_thresholds[column_name], + ) = calculate_threshold_values( # noqa: E501 threshold=self.threshold, data=result_data.loc[:, (column_name, 'value')], lower_threshold_value_limit=self.lower_threshold_value_limit, @@ -232,11 +237,17 @@ def _populate_alert_thresholds(self, result_data: pd.DataFrame) -> pd.DataFrame: result_data[(column_name, 'alert')] = result_data.apply( lambda row: True if ( - row[(column_name, 'value')] > ( - np.inf if row[(column_name, 'upper_threshold')] is None else row[(column_name, 'upper_threshold')] # noqa: E501 + row[(column_name, 'value')] + > ( + np.inf + if row[(column_name, 'upper_threshold')] is None + else row[(column_name, 'upper_threshold')] # noqa: E501 ) - or row[(column_name, 'value')] < ( - -np.inf if row[(column_name, 'lower_threshold')] is None else row[(column_name, 'lower_threshold')] # noqa: E501 + or row[(column_name, 'value')] + < ( + -np.inf + if row[(column_name, 'lower_threshold')] is None + else row[(column_name, 'lower_threshold')] # noqa: E501 ) ) else False, diff --git a/nannyml/data_quality/unseen/result.py b/nannyml/data_quality/unseen/result.py index 28c02df8..4ba0b710 100644 --- a/nannyml/data_quality/unseen/result.py +++ b/nannyml/data_quality/unseen/result.py @@ -75,7 +75,8 @@ def plot( -------- >>> import nannyml as nml >>> reference, analysis, _ = nml.load_synthetic_car_price_dataset() - >>> column_names = [col for col in reference.columns if col not in ['car_age', 'km_driven', 'price_new', 'accident_count', 'door_count','timestamp', 'y_pred', 'y_true']] + >>> column_names = [col for 
col in reference.columns if col not in [ + ... 'car_age', 'km_driven', 'price_new', 'accident_count', 'door_count','timestamp', 'y_pred', 'y_true']] >>> calc = nml.UnseenValuesCalculator( ... column_names=column_names, ... timestamp_column_name='timestamp', diff --git a/nannyml/usage_logging.py b/nannyml/usage_logging.py index f3004527..7e4ea792 100644 --- a/nannyml/usage_logging.py +++ b/nannyml/usage_logging.py @@ -72,7 +72,7 @@ class UsageEvent(str, Enum): DQ_CALC_VALUES_OUT_OF_RANGE_FIT = "Data Quality Calculator Values Out Of Range fit" DQ_CALC_VALUES_OUT_OF_RANGE_RUN = "Data Quality Calculator Values Out Of Range run" - DQ_CALC_VALUES_OUT_OF_RANGE_PLOT = "Data Quality Calculator Values Out Of Range Plot" + DQ_CALC_VALUES_OUT_OF_RANGE_PLOT = "Data Quality Calculator Values Out Of Range Plot" UNIVAR_DRIFT_CALC_FIT = "Univariate drift calculator fit" UNIVAR_DRIFT_CALC_RUN = "Univariate drift calculator run"