Source code for pyinterpolate.evaluate.metrics

"""
Error metrics for the variogram model autofit.

Authors
-------
1. Szymon Moliński | @SimonMolinsky
"""
import warnings
import numpy as np


class UndefinedSMAPEWarning(Warning):
    """
    Warning invoked by the scenario when predicted value is equal to 0
    and observation is equal to 0. It leads to the 0/0 division and,
    in return, to the ``NaN`` value at a specific position.
    Finally, user gets ``NaN`` as the output.

    Parameters
    ----------
    zero_idx : int
        Index where observation and prediction are equal to 0.

    Attributes
    ----------
    message : str
        Warning message.
    """

    def __init__(self, zero_idx: int):
        msg = (f'Values of observation and prediction on position {zero_idx} '
               f'are equal to zero. Partial smape is set to 0 to avoid zero '
               f'by zero division.')
        self.message = msg

    def __str__(self):
        return repr(self.message)


[docs] def forecast_bias(predicted_array: np.ndarray, real_array: np.ndarray) -> float: r""" Function calculates forecast bias of prediction. Parameters ---------- predicted_array : numpy array Predicted values. real_array : numpy array Real observations. Returns ------- fb : float Forecast Bias of prediction. Notes ----- How do we interpret forecast bias? Here are two important properties: - Large positive value means that our observations are usually higher than prediction. Our model underestimates predictions. - Large negative value tells us that our predictions are usually higher than expected values. Our model overestimates predictions. Equation: .. math:: e_{fb} = \frac{\sum_{i}^{N}{y_{i} - \bar{y_{i}}}}{N} where: - :math:`e_{fb}` - forecast bias, - :math:`y_{i}` - i-th observation, - :math:`\bar{y_{i}}` - i-th prediction, - :math:`N` - number of observations. Examples -------- >>> import numpy as np >>> from pyinterpolate.evaluate.metrics import forecast_bias >>> >>> >>> arr = np.array([1, 2, 3, 4, 5]) >>> preds = np.array([1, 2, 2, 5, 6]) >>> bias = forecast_bias(preds, arr) >>> print(bias) -0.2 """ fb = float(np.mean(real_array - predicted_array)) return fb
[docs] def mean_absolute_error(predicted_array: np.ndarray, real_array: np.ndarray) -> float: r""" Function calculates Mean Absolute Error (MAE) of prediction. Parameters ---------- predicted_array : numpy array Predicted values. real_array : numpy array Observations. Returns ------- mae : float Mean absolute Error (MAE) of prediction. Equation: .. math:: e_{mae} = \frac{\sum_{i}^{N}{|y_{i} - \bar{y_{i}}|}}{N} where: * :math:`e_{mae}` - mean absolute error, * :math:`y_{i}` - i-th observation, * :math:`\bar{y_{i}}` - i-th prediction, * :math:`N` - number of observations. Examples -------- >>> import numpy as np >>> from pyinterpolate.evaluate.metrics import mean_absolute_error >>> >>> >>> arr = np.array([1, 2, 3, 4, 5]) >>> preds = np.array([1, 2, 2, 5, 6]) >>> err = mean_absolute_error(preds, arr) >>> print(err) 0.6 """ mae = float( np.mean( np.abs(real_array - predicted_array) ) ) return mae
[docs] def root_mean_squared_error(predicted_array: np.ndarray, real_array: np.ndarray) -> float: r""" Function calculates Root Mean Squared Error of predictions. Parameters ---------- predicted_array : numpy array Predictions. real_array : numpy array Observations. Returns ------- rmse : float Root Mean Squared Error of prediction. Notes ----- Equation: .. math:: e_{rmse} = \sqrt{\frac{\sum_{i}^{N}({y_{i} - \bar{y_{i}})^2}}{N}} where: * :math:`e_{rmse}` - root mean squared error, * :math:`y_{i}` - i-th observation, * :math:`\bar{y_{i}}` - i-th prediction, * :math:`N` - number of observations. Examples -------- >>> import numpy as np >>> from pyinterpolate.evaluate.metrics import root_mean_squared_error >>> >>> >>> arr = np.array([1, 2, 3, 4, 5]) >>> preds = np.array([1, 2, 2, 5, 6]) >>> rmse = root_mean_squared_error(preds, arr) >>> print(rmse) 0.7745966692414834 """ rmse = np.sqrt( np.mean( (real_array - predicted_array) ** 2 ) ) return rmse
[docs] def symmetric_mean_absolute_percentage_error(predicted_array: np.ndarray, real_array: np.ndarray, check_undefined=True) -> float: r""" Function calculates Symmetric Mean Absolute Percentage Error (SMAPE) of predictions, allowing researcher to compare different models. Parameters ---------- predicted_array : numpy array Predictions. real_array : numpy array Observations check_undefined : bool, default = True Check if there are cases when prediction and observation are equal to 0. Returns ------- smape : float Symmetric Mean Absolute Percentage Error. Warns ----- UndefinedSMAPEWarning Observation and prediction are equal to 0 - SMAPE of this pair is undefined, algorithm assumes that SMAPE equals to 0. Notes ----- Symmetric Mean Absolute Percentage Error is an accuracy measure that returns prediction error in percent. It is a relative evaluation metric. It shouldn't be used alone because SMAPE penalizes more underforecasting, thus it should be compared to Forecast Bias to have a full view of the model properties. SMAPE is better than RMSE or FB for comparing multiple models and algorithms. More about SMAPE here: https://en.wikipedia.org/wiki/Symmetric_mean_absolute_percentage_error Equation: .. math:: e_{smape} = \frac{100}{N} \sum_{i}^{N}{\frac{|\bar{y_{i}} - y_{i}|}{|y_{i}|+|\bar{y_{i}}|}} where: * :math:`e_{smape}` - symmetric mean absolute percentage error, * :math:`y_{i}` - i-th observation, * :math:`\bar{y_{i}}` - i-th prediction, * :math:`N` - number of observations. Examples -------- >>> import numpy as np >>> from pyinterpolate.evaluate.metrics import symmetric_mean_absolute_percentage_error >>> >>> >>> arr = np.array([1, 2, 3, 4, 5]) >>> preds = np.array([1, 2, 2, 5, 6]) >>> smape = symmetric_mean_absolute_percentage_error(preds, arr) >>> print(smape) 8.040404040404042 """ smape = 100 if check_undefined: psmapes = [] for idx, val in enumerate(real_array): if val == 0 and predicted_array[idx] == 0: warnings.warn(idx, UndefinedSMAPEWarning) psmapes.append(0) else: pred = predicted_array[idx] psmape = np.abs(pred - val) / (np.abs(pred) + np.abs(val)) psmapes.append(psmape) smape = smape * np.mean(psmapes) else: smape = smape * np.mean( np.abs(predicted_array - real_array) / (np.abs(real_array) + np.abs(predicted_array)) ) return smape
[docs] def weighted_root_mean_squared_error(predicted_array: np.ndarray, real_array: np.ndarray, weighting_method: str, lag_points_distribution=None) -> float: r""" Function weights RMSE of each lag by a specific weighting factor. Parameters ---------- predicted_array : numpy array Predictions. real_array : numpy array Observations. weighting_method : str The name of a method used to weight error at a given lags. Available methods: - closest: lags at a close range have greater weights, - distant: lags that are further away have greater weights, - dense: error is weighted by the number of point pairs within a lag. lag_points_distribution : numpy array, optional Number of points pairs per lag. Returns ------- wrmse : float Weighted Root Mean Squared Error. Raises ------ AttributeError : The ``lag_points_distribution`` parameter is undefined when "dense" method is set. Notes ----- Error weighting is a useful in the case when we want to force semivariogram to better represent semivariances at specific ranges. The most popular is the ``"closest"`` method - we create model that fits better semivariogram at a close distances. Equations: ``"closest"`` .. math:: e_{wrmse} = \sqrt{\frac{\sum_{i}^{N}({y_{i} - \bar{y_{i}})^2}*\frac{N-i}{N}}{N}} where: * :math:`e_{rmse}` - weighted root mean squared error, * :math:`i` - lag, i > 0, * :math:`y_{i}` - i-th observation, * :math:`\bar{y_{i}}` - i-th prediction, * :math:`N` - number of observations. ``"distant"`` .. math:: e_{wrmse} = \sqrt{\frac{\sum_{i}^{N}({y_{i} - \bar{y_{i}})^2}*\frac{i}{N}}{N}} where: * :math:`e_{rmse}` - weighted root mean squared error, * :math:`i` - lag, i > 0, * :math:`y_{i}` - i-th observation, * :math:`\bar{y_{i}}` - i-th prediction, * :math:`N` - number of observations. ``"dense"`` .. math:: e_{wrmse} = \sqrt{\frac{\sum_{i}^{N}({y_{i} - \bar{y_{i}})^2}*\frac{p_{i}}{P}}{N}} where: * :math:`e_{rmse}` - weighted root mean squared error, * :math:`y_{i}` - i-th observation, * :math:`\bar{y_{i}}` - i-th prediction, * :math:`p_{i}` - number of points within i-th lag, * :math:`P` - number of all points, * :math:`N` - number of observations. Examples -------- >>> import numpy as np >>> from pyinterpolate.evaluate.metrics import weighted_root_mean_squared_error >>> >>> >>> arr = np.array([1, 2, 3, 4, 5]) >>> preds = np.array([1, 2, 2, 5, 6]) >>> lag_dist = np.array([2, 4, 8, 16, 32]) >>> wrmse_closest = weighted_root_mean_squared_error(preds, arr, 'closest') >>> wrmse_distant = weighted_root_mean_squared_error(preds, arr, 'distant') >>> wrmse_dense = weighted_root_mean_squared_error(preds, ... arr, ... 'dense', ... lag_dist) >>> print(wrmse_closest, wrmse_distant, wrmse_dense) 0.3464101615137755 0.6928203230275509 0.4250237185032414 """ error = (real_array - predicted_array) ** 2 arr_length = len(error) weights = np.ones(len(error)) if weighting_method == 'closest': sequence = np.arange(1, arr_length + 1) weights = (arr_length - sequence) / arr_length elif weighting_method == 'distant': sequence = np.arange(1, arr_length + 1) weights = sequence / arr_length elif weighting_method == 'dense': if lag_points_distribution is None: msg = ('The "dense" weighting method requires ' 'passing the number of point pairs per lag!') raise AttributeError(msg) else: weights = lag_points_distribution / sum(lag_points_distribution) wrmse = error * weights wrmse = np.mean(wrmse) wrmse = np.sqrt(wrmse) return wrmse