Source code for woodwork.accessor_utils

import pandas as pd

from woodwork.exceptions import TypeConversionError
from woodwork.logical_types import Datetime, LatLong, Ordinal
from woodwork.type_sys.utils import _get_ltype_class
from woodwork.utils import (
    _get_column_logical_type,
    _reformat_to_latlong,
    import_or_none
)

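# dask and koalas are optional dependencies: import_or_none returns the module
# when it is installed and None otherwise, which is why every backend-specific
# branch below is guarded with a truthiness check (e.g. `if dd and isinstance(...)`).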
dd = import_or_none('dask.dataframe')
ks = import_or_none('databricks.koalas')


def init_series(series, logical_type=None, semantic_tags=None,
                use_standard_tags=True, description=None, metadata=None):
    """Initializes Woodwork typing information for a Series, returning a new Series.

    The dtype of the returned series will be converted to match the dtype associated
    with the LogicalType.

    Args:
        series (pd.Series, dd.Series, or ks.Series): The original series from which to
            create the Woodwork initialized series.
        logical_type (LogicalType or str, optional): The logical type that should be
            assigned to the series. If no value is provided, the LogicalType for the
            series will be inferred.
        semantic_tags (str or list or set, optional): Semantic tags to assign to the
            series. Defaults to an empty set if not specified. There are two options
            for specifying the semantic tags:
            (str) If only one semantic tag is being set, a single string can be passed.
            (list or set) If multiple tags are being set, a list or set of strings can
            be passed.
        use_standard_tags (bool, optional): If True, will add standard semantic tags to
            the series based on the inferred or specified logical type of the series.
            Defaults to True.
        description (str, optional): Optional text describing the contents of the series.
        metadata (dict[str -> json serializable], optional): Metadata associated with
            the series.

    Returns:
        Series: A series with Woodwork typing information initialized
    """
    logical_type = _get_column_logical_type(series, logical_type, series.name)

    new_series = _update_column_dtype(series, logical_type)
    new_series.ww.init(logical_type=logical_type,
                       semantic_tags=semantic_tags,
                       use_standard_tags=use_standard_tags,
                       description=description,
                       metadata=metadata)
    return new_series
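# Example usage (an illustrative sketch, not part of the library; assumes pandas
# is installed and uses the built-in 'Categorical' logical type, whose primary
# dtype is 'category'):
#
#   >>> import pandas as pd
#   >>> from woodwork.accessor_utils import init_series
#   >>> s = pd.Series(['a', 'b', 'a'], name='letters')
#   >>> s_ww = init_series(s, logical_type='Categorical', semantic_tags='group')
#   >>> str(s_ww.dtype)
#   'category'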
def _update_column_dtype(series, logical_type):
    """Update the dtype of the underlying series to match the dtype corresponding
    to the LogicalType for the column."""
    if isinstance(logical_type, Ordinal):
        logical_type._validate_data(series)
    if _get_ltype_class(logical_type) == LatLong:
        # Reformat LatLong columns to be a length-two tuple (or list for Koalas) of floats
        if dd and isinstance(series, dd.Series):
            name = series.name
            meta = (series, tuple([float, float]))
            series = series.apply(_reformat_to_latlong, meta=meta)
            series.name = name
        elif ks and isinstance(series, ks.Series):
            formatted_series = series.to_pandas().apply(_reformat_to_latlong, use_list=True)
            series = ks.from_pandas(formatted_series)
        else:
            series = series.apply(_reformat_to_latlong)

    new_dtype = _get_valid_dtype(type(series), logical_type)
    if new_dtype != str(series.dtype):
        # Update the underlying series
        error_msg = f'Error converting datatype for {series.name} from type {str(series.dtype)} ' \
            f'to type {new_dtype}. Please confirm the underlying data is consistent with ' \
            f'logical type {logical_type}.'
        try:
            if _get_ltype_class(logical_type) == Datetime:
                if dd and isinstance(series, dd.Series):
                    name = series.name
                    series = dd.to_datetime(series, format=logical_type.datetime_format)
                    series.name = name
                elif ks and isinstance(series, ks.Series):
                    series = ks.Series(ks.to_datetime(series.to_numpy(),
                                                      format=logical_type.datetime_format),
                                       name=series.name)
                else:
                    series = pd.to_datetime(series, format=logical_type.datetime_format)
            else:
                series = series.astype(new_dtype)
            if str(series.dtype) != new_dtype:
                # Catch conditions when pandas does not raise an error but also does not
                # convert to the specified dtype (example: 'category' -> 'bool')
                raise TypeConversionError(error_msg)
        except (TypeError, ValueError):
            raise TypeConversionError(error_msg)
    return series


def _is_series(data):
    if isinstance(data, pd.Series):
        return True
    elif dd and isinstance(data, dd.Series):
        return True
    elif ks and isinstance(data, ks.Series):
        return True
    return False


def _is_dataframe(data):
    if isinstance(data, pd.DataFrame):
        return True
    elif dd and isinstance(data, dd.DataFrame):
        return True
    elif ks and isinstance(data, ks.DataFrame):
        return True
    return False


def _get_valid_dtype(series_type, logical_type):
    """Return the dtype that is considered valid for a series with the given logical_type"""
    backup_dtype = logical_type.backup_dtype
    if ks and series_type == ks.Series and backup_dtype:
        valid_dtype = backup_dtype
    else:
        valid_dtype = logical_type.primary_dtype
    return valid_dtype
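# Example (an illustrative sketch of the private helpers above; pandas only,
# so the dask/koalas branches are never reached here):
#
#   >>> import pandas as pd
#   >>> from woodwork.accessor_utils import _is_series, _is_dataframe
#   >>> _is_series(pd.Series([1, 2]))
#   True
#   >>> _is_dataframe(pd.Series([1, 2]))
#   False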
def get_invalid_schema_message(dataframe, schema):
    """Return a message indicating the reason that the provided schema cannot be used to
    initialize Woodwork on the dataframe. If the schema is valid for the dataframe,
    None will be returned.

    Args:
        dataframe (DataFrame): The dataframe against which to check the schema.
        schema (ww.TableSchema): The schema to use in the validity check.

    Returns:
        str or None: The reason that the schema is invalid for the dataframe
    """
    dataframe_cols = set(dataframe.columns)
    schema_cols = set(schema.columns.keys())

    df_cols_not_in_schema = dataframe_cols - schema_cols
    if df_cols_not_in_schema:
        return f'The following columns in the DataFrame were missing from the typing information: '\
            f'{df_cols_not_in_schema}'
    schema_cols_not_in_df = schema_cols - dataframe_cols
    if schema_cols_not_in_df:
        return f'The following columns in the typing information were missing from the DataFrame: '\
            f'{schema_cols_not_in_df}'
    for name in dataframe.columns:
        df_dtype = dataframe[name].dtype
        valid_dtype = _get_valid_dtype(type(dataframe[name]), schema.logical_types[name])
        if str(df_dtype) != valid_dtype:
            return f'dtype mismatch for column {name} between DataFrame dtype, '\
                f'{df_dtype}, and {schema.logical_types[name]} dtype, {valid_dtype}'
    if schema.index is not None and isinstance(dataframe, pd.DataFrame):
        # Index validation not performed for Dask/Koalas
        if not pd.Series(dataframe.index,
                         dtype=dataframe[schema.index].dtype).equals(pd.Series(dataframe[schema.index].values)):
            return 'Index mismatch between DataFrame and typing information'
        elif not dataframe[schema.index].is_unique:
            return 'Index column is not unique'
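# Example usage (an illustrative sketch; assumes a pandas DataFrame initialized
# with Woodwork via the `ww` accessor so that `df.ww.schema` is available):
#
#   >>> import pandas as pd
#   >>> import woodwork as ww
#   >>> from woodwork.accessor_utils import get_invalid_schema_message
#   >>> df = pd.DataFrame({'id': [0, 1, 2]})
#   >>> df.ww.init()
#   >>> schema = df.ww.schema
#   >>> get_invalid_schema_message(df, schema) is None  # schema matches its own dataframe
#   True
#   >>> get_invalid_schema_message(df.rename(columns={'id': 'other'}), schema)
#   "The following columns in the DataFrame were missing from the typing information: {'other'}"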
def is_schema_valid(dataframe, schema):
    """Check if a schema is valid for initializing Woodwork on a dataframe

    Args:
        dataframe (DataFrame): The dataframe against which to check the schema.
        schema (ww.TableSchema): The schema to use in the validity check.

    Returns:
        boolean: Boolean indicating whether the schema is valid for the dataframe
    """
    invalid_schema_message = get_invalid_schema_message(dataframe, schema)
    if invalid_schema_message:
        return False
    return True
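# Example usage (an illustrative sketch; same setup as the example above):
#
#   >>> import pandas as pd
#   >>> import woodwork as ww
#   >>> from woodwork.accessor_utils import is_schema_valid
#   >>> df = pd.DataFrame({'id': [0, 1, 2]})
#   >>> df.ww.init()
#   >>> is_schema_valid(df, df.ww.schema)
#   True
#   >>> is_schema_valid(df.rename(columns={'id': 'other'}), df.ww.schema)
#   False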