Source code for pyinterpolate.evaluate.metrics

"""
Error metrics for the variogram model autofit.

Authors
-------
1. Szymon Moliński | @SimonMolinsky
"""
import warnings
import numpy as np


class UndefinedSMAPEWarning(Warning):
    """
    Warning invoked by the scenario when predicted value is equal to 0
    and observation is equal to 0. It leads to the 0/0 division and,
    in return, to the ``NaN`` value at a specific position.
    Finally, user gets ``NaN`` as the output.

    Parameters
    ----------
    zero_idx : int
        Index where observation and prediction are equal to 0.

    Attributes
    ----------
    message : str
        Warning message.
    """

    def __init__(self, zero_idx: int):
        msg = (f'Values of observation and prediction on position {zero_idx} '
               f'are equal to zero. Partial smape is set to 0 to avoid zero '
               f'by zero division.')
        self.message = msg

    def __str__(self):
        return repr(self.message)



[docs]
def forecast_bias(predicted_array: np.ndarray,
                  real_array: np.ndarray) -> float:
    r"""
    Function calculates forecast bias of prediction.

    Parameters
    ----------
    predicted_array : numpy array
        Predicted values.

    real_array : numpy array
        Real observations.

    Returns
    -------
    fb : float
        Forecast Bias of prediction.

    Notes
    -----
    How do we interpret forecast bias? Here are two important properties:
        - Large positive value means that our observations are usually higher
          than prediction. Our model underestimates predictions.
        - Large negative value tells us that our predictions are usually
          higher than expected values. Our model overestimates predictions.

    Equation:

    .. math:: e_{fb} = \frac{\sum_{i}^{N}{y_{i} - \bar{y_{i}}}}{N}

    where:
        - :math:`e_{fb}` - forecast bias,
        - :math:`y_{i}` - i-th observation,
        - :math:`\bar{y_{i}}` - i-th prediction,
        - :math:`N` - number of observations.

    Examples
    --------
    >>> import numpy as np
    >>> from pyinterpolate.evaluate.metrics import forecast_bias
    >>>
    >>>
    >>> arr = np.array([1, 2, 3, 4, 5])
    >>> preds = np.array([1, 2, 2, 5, 6])
    >>> bias = forecast_bias(preds, arr)
    >>> print(bias)
    -0.2
    """

    fb = float(np.mean(real_array - predicted_array))
    return fb




[docs]
def mean_absolute_error(predicted_array: np.ndarray,
                        real_array: np.ndarray) -> float:
    r"""
    Function calculates Mean Absolute Error (MAE) of prediction.

    Parameters
    ----------
    predicted_array : numpy array
        Predicted values.

    real_array : numpy array
        Observations.

    Returns
    -------
    mae : float
        Mean absolute Error (MAE) of prediction.

    Equation:

    .. math:: e_{mae} = \frac{\sum_{i}^{N}{|y_{i} - \bar{y_{i}}|}}{N}

    where:
        * :math:`e_{mae}` - mean absolute error,
        * :math:`y_{i}` - i-th observation,
        * :math:`\bar{y_{i}}` - i-th prediction,
        * :math:`N` - number of observations.

    Examples
    --------
    >>> import numpy as np
    >>> from pyinterpolate.evaluate.metrics import mean_absolute_error
    >>>
    >>>
    >>> arr = np.array([1, 2, 3, 4, 5])
    >>> preds = np.array([1, 2, 2, 5, 6])
    >>> err = mean_absolute_error(preds, arr)
    >>> print(err)
    0.6
    """
    mae = float(
        np.mean(
            np.abs(real_array - predicted_array)
        )
    )
    return mae




[docs]
def root_mean_squared_error(predicted_array: np.ndarray,
                            real_array: np.ndarray) -> float:
    r"""
    Function calculates Root Mean Squared Error of predictions.

    Parameters
    ----------
    predicted_array : numpy array
        Predictions.

    real_array : numpy array
        Observations.

    Returns
    -------
    rmse : float
        Root Mean Squared Error of prediction.

    Notes
    -----
    Equation:

    .. math:: e_{rmse} = \sqrt{\frac{\sum_{i}^{N}({y_{i} - \bar{y_{i}})^2}}{N}}

    where:
        * :math:`e_{rmse}` - root mean squared error,
        * :math:`y_{i}` - i-th observation,
        * :math:`\bar{y_{i}}` - i-th prediction,
        * :math:`N` - number of observations.

    Examples
    --------
    >>> import numpy as np
    >>> from pyinterpolate.evaluate.metrics import root_mean_squared_error
    >>>
    >>>
    >>> arr = np.array([1, 2, 3, 4, 5])
    >>> preds = np.array([1, 2, 2, 5, 6])
    >>> rmse = root_mean_squared_error(preds, arr)
    >>> print(rmse)
    0.7745966692414834
    """
    rmse = np.sqrt(
        np.mean(
            (real_array - predicted_array) ** 2
        )
    )
    return rmse




[docs]
def symmetric_mean_absolute_percentage_error(predicted_array: np.ndarray,
                                             real_array: np.ndarray,
                                             check_undefined=True) -> float:
    r"""
    Function calculates Symmetric Mean Absolute Percentage Error (SMAPE) of
    predictions, allowing researcher to compare different models.

    Parameters
    ----------
    predicted_array : numpy array
        Predictions.

    real_array : numpy array
        Observations

    check_undefined : bool, default = True
        Check if there are cases when prediction and observation are
        equal to 0.

    Returns
    -------
    smape : float
            Symmetric Mean Absolute Percentage Error.

    Warns
    -----
    UndefinedSMAPEWarning
        Observation and prediction are equal to 0 - SMAPE of this pair is
        undefined, algorithm assumes that SMAPE equals to 0.

    Notes
    -----
    Symmetric Mean Absolute Percentage Error is an accuracy measure that
    returns prediction error in percent. It is a relative evaluation metric.
    It shouldn't be used alone because SMAPE penalizes more underforecasting,
    thus it should be compared to Forecast Bias to have a full view of the
    model properties. SMAPE is better than RMSE or FB for comparing multiple
    models and algorithms.

    More about SMAPE here:
    https://en.wikipedia.org/wiki/Symmetric_mean_absolute_percentage_error

    Equation:

    .. math:: e_{smape} = \frac{100}{N}
        \sum_{i}^{N}{\frac{|\bar{y_{i}} - y_{i}|}{|y_{i}|+|\bar{y_{i}}|}}

    where:
        * :math:`e_{smape}` - symmetric mean absolute percentage error,
        * :math:`y_{i}` - i-th observation,
        * :math:`\bar{y_{i}}` - i-th prediction,
        * :math:`N` - number of observations.

    Examples
    --------
    >>> import numpy as np
    >>> from pyinterpolate.evaluate.metrics import symmetric_mean_absolute_percentage_error
    >>>
    >>>
    >>> arr = np.array([1, 2, 3, 4, 5])
    >>> preds = np.array([1, 2, 2, 5, 6])
    >>> smape = symmetric_mean_absolute_percentage_error(preds, arr)
    >>> print(smape)
    8.040404040404042
    """
    smape = 100

    if check_undefined:
        psmapes = []
        for idx, val in enumerate(real_array):
            if val == 0 and predicted_array[idx] == 0:
                warnings.warn(idx, UndefinedSMAPEWarning)
                psmapes.append(0)
            else:
                pred = predicted_array[idx]
                psmape = np.abs(pred - val) / (np.abs(pred) + np.abs(val))
                psmapes.append(psmape)
        smape = smape * np.mean(psmapes)
    else:
        smape = smape * np.mean(
            np.abs(predicted_array - real_array) /
            (np.abs(real_array) + np.abs(predicted_array))
        )
    return smape




[docs]
def weighted_root_mean_squared_error(predicted_array: np.ndarray,
                                     real_array: np.ndarray,
                                     weighting_method: str,
                                     lag_points_distribution=None) -> float:
    r"""
    Function weights RMSE of each lag by a specific weighting factor.

    Parameters
    ----------
    predicted_array : numpy array
        Predictions.

    real_array : numpy array
        Observations.

    weighting_method : str
        The name of a method used to weight error at
        a given lags.
        Available methods:
        - closest: lags at a close range have greater weights,
        - distant: lags that are further away have greater weights,
        - dense: error is weighted by the number of point pairs within a lag.

    lag_points_distribution : numpy array, optional
        Number of points pairs per lag.

    Returns
    -------
    wrmse : float
        Weighted Root Mean Squared Error.

    Raises
    ------
    AttributeError :
        The ``lag_points_distribution`` parameter is undefined when
        "dense" method is set.

    Notes
    -----
    Error weighting is a useful in the case when we want to
    force semivariogram to better represent semivariances at specific ranges.
    The most popular is the ``"closest"`` method - we create model that
    fits better semivariogram at a close distances.

    Equations:

    ``"closest"``

    .. math:: e_{wrmse} =
        \sqrt{\frac{\sum_{i}^{N}({y_{i} - \bar{y_{i}})^2}*\frac{N-i}{N}}{N}}

    where:
        * :math:`e_{rmse}` - weighted root mean squared error,
        * :math:`i` - lag, i > 0,
        * :math:`y_{i}` - i-th observation,
        * :math:`\bar{y_{i}}` - i-th prediction,
        * :math:`N` - number of observations.

    ``"distant"``

    .. math:: e_{wrmse} =
        \sqrt{\frac{\sum_{i}^{N}({y_{i} - \bar{y_{i}})^2}*\frac{i}{N}}{N}}

    where:
        * :math:`e_{rmse}` - weighted root mean squared error,
        * :math:`i` - lag, i > 0,
        * :math:`y_{i}` - i-th observation,
        * :math:`\bar{y_{i}}` - i-th prediction,
        * :math:`N` - number of observations.

    ``"dense"``

    .. math:: e_{wrmse} =
        \sqrt{\frac{\sum_{i}^{N}({y_{i} - \bar{y_{i}})^2}*\frac{p_{i}}{P}}{N}}

    where:
        * :math:`e_{rmse}` - weighted root mean squared error,
        * :math:`y_{i}` - i-th observation,
        * :math:`\bar{y_{i}}` - i-th prediction,
        * :math:`p_{i}` - number of points within i-th lag,
        * :math:`P` - number of all points,
        * :math:`N` - number of observations.

    Examples
    --------
    >>> import numpy as np
    >>> from pyinterpolate.evaluate.metrics import weighted_root_mean_squared_error
    >>>
    >>>
    >>> arr = np.array([1, 2, 3, 4, 5])
    >>> preds = np.array([1, 2, 2, 5, 6])
    >>> lag_dist = np.array([2, 4, 8, 16, 32])
    >>> wrmse_closest = weighted_root_mean_squared_error(preds, arr, 'closest')
    >>> wrmse_distant = weighted_root_mean_squared_error(preds, arr, 'distant')
    >>> wrmse_dense = weighted_root_mean_squared_error(preds,
    ...                                                arr,
    ...                                                'dense',
    ...                                                lag_dist)
    >>> print(wrmse_closest, wrmse_distant, wrmse_dense)
    0.3464101615137755 0.6928203230275509 0.4250237185032414
    """
    error = (real_array - predicted_array) ** 2
    arr_length = len(error)
    weights = np.ones(len(error))

    if weighting_method == 'closest':
        sequence = np.arange(1, arr_length + 1)
        weights = (arr_length - sequence) / arr_length
    elif weighting_method == 'distant':
        sequence = np.arange(1, arr_length + 1)
        weights = sequence / arr_length
    elif weighting_method == 'dense':
        if lag_points_distribution is None:
            msg = ('The "dense" weighting method requires '
                   'passing the number of point pairs per lag!')
            raise AttributeError(msg)
        else:
            weights = lag_points_distribution / sum(lag_points_distribution)

    wrmse = error * weights
    wrmse = np.mean(wrmse)
    wrmse = np.sqrt(wrmse)

    return wrmse