Source code for pyinterpolate.evaluate.cross_validation

from typing import Tuple, Union
from numpy.typing import ArrayLike

import numpy as np
from tqdm import tqdm

from pyinterpolate.semivariogram.theoretical.theoretical import TheoreticalVariogram
from pyinterpolate.kriging.point.ordinary import ordinary_kriging
from pyinterpolate.kriging.point.simple import simple_kriging
from pyinterpolate.transform.geo import geometry_and_values_array



[docs]
def validate_kriging(
        theoretical_model: TheoreticalVariogram,
        points: ArrayLike = None,
        values: ArrayLike = None,
        geometries: ArrayLike = None,
        how: str = 'ok',
        neighbors_range: Union[float, None] = None,
        no_neighbors: int = 4,
        use_all_neighbors_in_range=False,
        sk_mean: Union[float, None] = None,
        allow_approximate_solutions=False,
        progress_bar: bool = True
) -> Tuple[float, float, np.ndarray]:
    """
    Function performs cross-validation of kriging models.

    Parameters
    ----------
    theoretical_model : TheoreticalVariogram
        Fitted variogram model.

    points : ArrayLike, optional
        Known points and their values ``[x, y, value]``.

    values : ArrayLike, optional
        Observation in the i-th geometry (from ``geometries``). Optional
        parameter, if not given then ``points`` must be provided.

    geometries : ArrayLike, optional
        Array or similar structure with geometries. It must have the same
        length as ``values``. Optional parameter, if not given then
        ``points`` must be provided. Point type geometry.

    how : str, default='ok'
        Select what kind of kriging you want to perform

          * 'ok': ordinary kriging,
          * 'sk': simple kriging - if it is set then ``sk_mean`` parameter
            must be provided.

    neighbors_range : float, default=None
        The maximum distance where we search for neighbors. If ``None`` is
        given then range is selected from
        the ``theoretical_model`` ``rang`` attribute.

    no_neighbors : int, default = 4
        The number of the **n-closest neighbors** used for interpolation.

    use_all_neighbors_in_range : bool, default = False
        ``True``: if the number of neighbors within the
        ``neighbors_range`` is greater than the
        ``number_of_neighbors`` parameter then use all neighbors, do not
        clip their number.

    sk_mean : float, default=None
        The mean value of a process over a study area. Should be known
        before processing. That's why Simple Kriging has a limited number
        of applications. You must have multiple samples and well-known area to
        know this parameter.

    allow_approximate_solutions : bool, default=False
        Allows the approximation of kriging weights based on
        the OLS algorithm. We don't recommend set it to ``True``
        if you don't know what are you doing. This parameter can be useful
        when you have clusters in your dataset,
        that can lead to singular or near-singular matrix creation.

    progress_bar : bool, default=True
        Show process status.

    Returns
    -------
    : Tuple
        Function returns tuple with:

          * Mean Prediction Error,
          * Mean Kriging Error: ratio of variance of prediction errors to
            the average variance error of kriging,
          * array with: ``[coordinate x, coordinate y, prediction error, kriging estimate error, z-value, z-ci-min, z-ci-max]``

    References
    ----------
    1. Clark, I., (2004), The Art of Cross Validation in Geostatistical
       Applications
    2. Clark I., (1979), Does Geostatistics Work, Proc. 16th APCOM,
       pp.213.-225.

    Examples
    --------
    >>> from pyinterpolate import (
    ...     ExperimentalVariogram,
    ...     validate_kriging,
    ...     TheoreticalVariogram
    ... )
    >>>
    >>>
    >>> POINTS_DATA = ...  # load dataset
    >>> POINTS_VARIOGRAM = ExperimentalVariogram(POINTS_DATA,
    ...                                          step_size=1,
    ...                                          max_range=6)
    >>> THEORETICAL_MODEL = TheoreticalVariogram()
    >>> THEORETICAL_MODEL.autofit(experimental_variogram=POINTS_VARIOGRAM,
    ...                           models_group='linear',
    ...                           nugget=0.0)
    >>> validation_results = validate_kriging(
    ...     theoretical_model=THEORETICAL_MODEL,
    ...     values=POINTS_DATA[:, -1],
    ...     geometries=POINTS_DATA[:, :-1],
    ...     no_neighbors=4,
    ...     progress_bar=False
    ... )
    >>> print(validation_results[0])  # mean prediction error
    -0.01613441673494531
    >>> print(validation_results[1])  # mean kriging error
    1.6386630811210166
    """
    # TODO:
    # Validation tutorials
    # TODO:
    # Areal kriging validation
    # Initialize array for coordinates and errors
    coordinates_and_errors = []

    if points is None:
        points = geometry_and_values_array(
            geometry=geometries,
            values=values
        )

    # Divide observations
    for idx, row in enumerate(tqdm(points, disable=not progress_bar)):
        clipped_point = row[:-1]
        data_points = np.delete(points, idx, 0)

        if how == 'ok':
            preds = ordinary_kriging(
                theoretical_model=theoretical_model,
                known_locations=data_points,
                unknown_locations=clipped_point,
                neighbors_range=neighbors_range,
                no_neighbors=no_neighbors,
                use_all_neighbors_in_range=use_all_neighbors_in_range,
                allow_approximate_solutions=allow_approximate_solutions,
                progress_bar=False
            )
        elif how == 'sk':
            preds = simple_kriging(
                theoretical_model=theoretical_model,
                known_locations=data_points,
                process_mean=sk_mean,
                unknown_locations=clipped_point,
                neighbors_range=neighbors_range,
                no_neighbors=no_neighbors,
                use_all_neighbors_in_range=use_all_neighbors_in_range,
                allow_approximate_solutions=allow_approximate_solutions,
                progress_bar=False
            )
        else:
            raise KeyError(
                'Allowed kriging types (parameter "how") are:'
                ' "ok" - ordinary kriging,'
                ' and "sk" - simple kriging.'
            )

        if len(preds) == 1:
            preds = preds[0]

        prediction_error = row[-1] - preds[0]
        p1 = np.sqrt(preds[1])

        z = prediction_error / (p1 * preds[0])
        z_ci_min = z - 2*p1
        z_ci_max = z + 2*p1

        coordinates_and_errors.append(
            [preds[2],
             preds[3],
             prediction_error,
             preds[1],
             z,
             z_ci_min,
             z_ci_max]
        )

    output_arr = np.array(coordinates_and_errors)
    mean_prediction_error = np.mean(output_arr[:, 2])
    mean_variance_error = np.var(output_arr[:, 2]) / np.mean(output_arr[:, 3])

    return mean_prediction_error, mean_variance_error, output_arr