# Source code for woodwork.statistics_utils.frequency_inference._infer_frequency

import dataclasses

import pandas as pd

from ._clean_timeseries import _clean_timeseries
from ._constants import FREQ_INFERENCE_THRESHOLD, WINDOW_LENGTH
from ._determine_duplicate_values import _determine_duplicate_values
from ._determine_extra_values import _determine_extra_values
from ._determine_missing_values import _determine_missing_values
from ._determine_most_likely_freq import _determine_most_likely_freq
from ._determine_nan_values import _determine_nan_values
from ._generate_estimated_timeseries import _generate_estimated_timeseries
from ._generate_freq_candidates import _generate_freq_candidates
from ._types import DataCheckMessageCode, InferDebug


def inference_response(inferred_freq, debug_obj, debug):
    """Shape the value returned by a frequency inference call.

    Args:
        inferred_freq: the inferred frequency; passed through unchanged.
        debug_obj: a dataclass instance carrying the debug information.
            Only read when ``debug`` is truthy.
        debug: flag selecting whether the debug information is returned
            alongside the inferred frequency.

    Returns:
        ``inferred_freq`` alone when ``debug`` is falsy; otherwise the tuple
        ``(inferred_freq, debug_dict)`` where ``debug_dict`` is the result of
        ``dataclasses.asdict(debug_obj)``.
    """
    # Guard clause: the common, non-debug case needs no conversion at all.
    if not debug:
        return inferred_freq

    debug_dict = dataclasses.asdict(debug_obj)
    return inferred_freq, debug_dict


def infer_frequency(
    observed_ts: pd.Series,
    debug=False,
    window_length=WINDOW_LENGTH,
    threshold=FREQ_INFERENCE_THRESHOLD,
):
    """Infer the frequency of a given Pandas Datetime Series.

    ``pd.infer_freq`` is tried first. Its result is returned immediately when
    it finds a frequency, or when ``debug`` is False (in which case no further
    estimation is attempted). Only when pandas finds no frequency and
    ``debug`` is True is the series analysed further to build the debug
    information described below; on that path the returned ``inferred_freq``
    is always None and the estimate is reported via ``estimated_freq``.

    Args:
        observed_ts (pd.Series): the observed Datetime series whose frequency
            should be inferred.
        debug (boolean): a flag to determine if the debug object should be
            returned (explained below).
        window_length (int): the window length used to determine the most
            likely candidate frequency. Defaults to ``WINDOW_LENGTH``. If the
            timeseries is noisy and its frequency needs to be estimated, the
            cleaned input timeseries must contain at least this many values.
        threshold (float): a value between 0 and 1. Given the number of
            windows that contain the most observed frequency (N), and total
            number of windows (T), if N/T > threshold, the most observed
            frequency is determined to be the most likely frequency, else
            None. Defaults to ``FREQ_INFERENCE_THRESHOLD``.

    Returns:
        inferred_freq (str): pandas offset alias string (D, M, Y, etc.) or
            None if no uniform frequency was present in the data. This is the
            only value returned when ``debug`` is False.
        debug (dict): returned as the second element of a tuple when
            ``debug`` is True. A dictionary containing debug information if
            the frequency cannot be inferred. Entries that a given outcome
            does not set keep their ``InferDebug`` defaults. This dictionary
            has the following properties:

            - actual_range_start (str): a string representing the minimum
              Timestamp in the input observed timeseries according to
              ISO 8601.
            - actual_range_end (str): a string representing the maximum
              Timestamp in the input observed timeseries according to
              ISO 8601.
            - message (str): message describing any issues with the input
              Datetime series.
            - estimated_freq (str): the most likely frequency when one could
              be estimated.
            - estimated_range_start (str): a string representing the minimum
              Timestamp in the output estimated timeseries according to
              ISO 8601.
            - estimated_range_end (str): a string representing the maximum
              Timestamp in the output estimated timeseries according to
              ISO 8601.
            - duplicate_values (list(RangeObject)): a list of RangeObjects of
              Duplicate timestamps.
            - missing_values (list(RangeObject)): a list of RangeObjects of
              Missing timestamps.
            - extra_values (list(RangeObject)): a list of RangeObjects of
              Extra timestamps.
            - nan_values (list(RangeObject)): a list of RangeObjects of NaN
              timestamps.

            A range object contains the following information:

            - dt: an ISO 8601 formatted string of the first timestamp in this
              range.
            - idx: the index of the first timestamp in this range.
                - for duplicates and extra values, the idx is in reference to
                  the observed data.
                - for missing values, the idx is in reference to the
                  estimated data.
            - range: the length of this range.
    """
    pandas_inferred_freq = pd.infer_freq(observed_ts)

    # Nothing more to do if pandas succeeded, or if the caller did not ask
    # for the debug analysis.
    if pandas_inferred_freq or not debug:
        return inference_response(
            inferred_freq=pandas_inferred_freq,
            debug_obj=InferDebug(),
            debug=debug,
        )

    # Clean observed timeseries from duplicates and NaTs
    observed_ts_clean = _clean_timeseries(observed_ts)

    # Determine if series is not empty
    if len(observed_ts_clean) == 0:
        return inference_response(
            inferred_freq=None,
            debug_obj=InferDebug(
                message=DataCheckMessageCode.DATETIME_SERIES_IS_EMPTY,
            ),
            debug=debug,
        )

    # NaN and duplicate ranges are computed on the raw (uncleaned) series.
    nan_values = _determine_nan_values(observed_ts)
    duplicate_values = _determine_duplicate_values(observed_ts)

    actual_range_start = observed_ts_clean.min().isoformat()
    actual_range_end = observed_ts_clean.max().isoformat()

    # Determine if series is long enough for inference
    if len(observed_ts_clean) < window_length:
        return inference_response(
            inferred_freq=None,
            debug_obj=InferDebug(
                actual_range_start=actual_range_start,
                actual_range_end=actual_range_end,
                message=DataCheckMessageCode.DATETIME_SERIES_IS_NOT_LONG_ENOUGH,
                duplicate_values=duplicate_values,
                nan_values=nan_values,
            ),
            debug=debug,
        )

    # Determine if series is monotonic
    is_monotonic = observed_ts_clean.is_monotonic_increasing
    if not is_monotonic:
        return inference_response(
            inferred_freq=None,
            debug_obj=InferDebug(
                actual_range_start=actual_range_start,
                actual_range_end=actual_range_end,
                message=DataCheckMessageCode.DATETIME_SERIES_IS_NOT_MONOTONIC,
                duplicate_values=duplicate_values,
                nan_values=nan_values,
            ),
            debug=debug,
        )

    # Generate frequency candidates
    alias_dict = _generate_freq_candidates(
        observed_ts_clean,
        window_length=window_length,
    )

    most_likely_freq = _determine_most_likely_freq(alias_dict, threshold=threshold)

    if most_likely_freq is None:
        return inference_response(
            inferred_freq=None,
            debug_obj=InferDebug(
                actual_range_start=actual_range_start,
                actual_range_end=actual_range_end,
                message=DataCheckMessageCode.DATETIME_SERIES_FREQ_CANNOT_BE_ESTIMATED,
                duplicate_values=duplicate_values,
                nan_values=nan_values,
            ),
            debug=debug,
        )

    most_likely_freq_alias_dict = alias_dict[most_likely_freq]

    estimated_ts = _generate_estimated_timeseries(most_likely_freq_alias_dict)

    estimated_range_start = most_likely_freq_alias_dict["min_dt"].isoformat()
    estimated_range_end = most_likely_freq_alias_dict["max_dt"].isoformat()

    missing_values = _determine_missing_values(estimated_ts, observed_ts_clean)
    extra_values = _determine_extra_values(estimated_ts, observed_ts_clean)

    # pandas found no frequency, so inferred_freq stays None; the estimate is
    # reported only through the debug information.
    return inference_response(
        inferred_freq=None,
        debug_obj=InferDebug(
            actual_range_start=actual_range_start,
            actual_range_end=actual_range_end,
            estimated_freq=most_likely_freq,
            estimated_range_start=estimated_range_start,
            estimated_range_end=estimated_range_end,
            missing_values=missing_values,
            duplicate_values=duplicate_values,
            extra_values=extra_values,
            nan_values=nan_values,
        ),
        debug=debug,
    )