Source code for woodwork.datacolumn

import warnings

import pandas as pd
import pandas.api.types as pdtypes

from woodwork.config import config
from woodwork.exceptions import (
    ColumnNameMismatchWarning,
    DuplicateTagsWarning,
    StandardTagsRemovalWarning
)
from woodwork.indexers import _iLocIndexer
from woodwork.logical_types import (
    Boolean,
    Categorical,
    Datetime,
    Double,
    Integer,
    LogicalType,
    NaturalLanguage,
    Ordinal,
    Timedelta,
    str_to_logical_type
)
from woodwork.utils import (
    _convert_input_to_set,
    _get_ltype_class,
    col_is_datetime,
    import_or_none
)

# Handles to the optional Dask and Koalas backends, resolved through
# import_or_none. Every use in this module is guarded by a truthiness check
# (`if dd and ...`, `if ks and ...`), so these are presumably None when the
# corresponding library is not installed -- confirm against woodwork.utils.
dd = import_or_none('dask.dataframe')
ks = import_or_none('databricks.koalas')


class DataColumn(object):
    """A series paired with a logical type, semantic tags and a description.

    The wrapped series may be a pandas Series, or a Dask / Koalas Series when
    those optional libraries are available. Methods that change typing
    information return a new DataColumn rather than mutating this one.
    """

    def __init__(self, series, logical_type=None, semantic_tags=None,
                 use_standard_tags=True, name=None, description=None):
        """Create a DataColumn.

        Args:
            series (pd.Series or dd.Series or pd.api.extensions.ExtensionArray): Series containing
                the data associated with the column.
            logical_type (LogicalType, optional): The logical type that should be assigned
                to the column. If no value is provided, the LogicalType for the series will
                be inferred.
            semantic_tags (str or list or set, optional): Semantic tags to assign to the column.
                Defaults to an empty set if not specified. There are two options for
                specifying the semantic tags:
                (str) If only one semantic tag is being set, a single string can be passed.
                (list or set) If multiple tags are being set, a list or set of strings can be passed.
            use_standard_tags (bool, optional): If True, will add standard semantic tags to columns based
                on the inferred or specified logical type for the column. Defaults to True.
            name (str, optional): Name of DataColumn. Will overwrite Series name, if it exists.
            description (str, optional): Optional text describing the contents of the column

        Raises:
            TypeError: If ``series`` is not a supported type, if the logical type is
                invalid, if the data cannot be converted to the logical type's dtype,
                or if ``description`` is given and is not a string.
            ValueError: If ``semantic_tags`` contains 'index' or 'time_index'.
        """
        self._assigned_name = name
        self._set_series(series)
        self.use_standard_tags = use_standard_tags
        self._logical_type = self._parse_logical_type(logical_type)

        semantic_tags = _convert_input_to_set(semantic_tags)
        _validate_tags(semantic_tags)
        if use_standard_tags:
            semantic_tags = semantic_tags.union(self.logical_type.standard_tags)
        self._semantic_tags = semantic_tags

        self._update_dtype()

        # Compare against None instead of relying on truthiness so that falsy
        # non-string values (0, [], False) are rejected as well.
        if description is not None and not isinstance(description, str):
            raise TypeError("Column description must be a string")
        self.description = description

    def __repr__(self):
        return (f"<DataColumn: {self.name} "
                f"(Physical Type = {self.dtype}) "
                f"(Logical Type = {self.logical_type}) "
                f"(Semantic Tags = {self.semantic_tags})>")

    def __eq__(self, other, deep=False):
        """Compare name, dtype, tags, logical type, description and (pandas only) data.

        Returns NotImplemented for operands that are not DataColumns so that
        Python falls back to its default comparison instead of raising
        AttributeError. ``deep`` is accepted for interface compatibility and
        is not used.
        """
        if not isinstance(other, DataColumn):
            return NotImplemented
        if self.name != other.name:
            return False
        if self.dtype != other.dtype:
            return False
        if self.semantic_tags != other.semantic_tags:
            return False
        if self.logical_type != other.logical_type:
            return False
        if self.description != other.description:
            return False

        # Only check pandas series for equality
        if isinstance(self._series, pd.Series) and isinstance(other.to_series(), pd.Series):
            return self.to_series().equals(other.to_series())
        return True

    def __len__(self):
        return self._series.__len__()

    def _update_dtype(self):
        """Update the dtype of the underlying series to match the dtype corresponding
        to the LogicalType for the column.

        Raises:
            TypeError: If the series cannot be converted to the target dtype.
        """
        if isinstance(self.logical_type, Ordinal):
            self.logical_type._validate_data(self._series)

        if self.logical_type.pandas_dtype == str(self._series.dtype):
            # Already stored with the expected physical type.
            return

        # Update the underlying series
        try:
            if _get_ltype_class(self.logical_type) == Datetime:
                datetime_format = self.logical_type.datetime_format
                if dd and isinstance(self._series, dd.Series):
                    # Restore the name explicitly after the Dask conversion.
                    name = self._series.name
                    self._series = dd.to_datetime(self._series, format=datetime_format)
                    self._series.name = name
                elif ks and isinstance(self._series, ks.Series):
                    self._series = ks.Series(ks.to_datetime(self._series.to_numpy(),
                                                            format=datetime_format),
                                             name=self._series.name)
                else:
                    self._series = pd.to_datetime(self._series, format=datetime_format)
            else:
                # Koalas uses the logical type's backup dtype when one is defined.
                if ks and isinstance(self._series, ks.Series) and self.logical_type.backup_dtype:
                    new_dtype = self.logical_type.backup_dtype
                else:
                    new_dtype = self.logical_type.pandas_dtype
                self._series = self._series.astype(new_dtype)
        except (TypeError, ValueError) as err:
            error_msg = f'Error converting datatype for column {self.name} from type {str(self._series.dtype)} ' \
                f'to type {self.logical_type.pandas_dtype}. Please confirm the underlying data is consistent with ' \
                f'logical type {self.logical_type}.'
            raise TypeError(error_msg) from err

    @property
    def iloc(self):
        """Purely integer-location based indexing for selection by position.
        ``.iloc[]`` is primarily integer position based (from ``0`` to
        ``length-1`` of the axis), but may also be used with a boolean array.

        Allowed inputs are:
            An integer, e.g. ``5``.
            A list or array of integers, e.g. ``[4, 3, 0]``.
            A slice object with ints, e.g. ``1:7``.
            A boolean array.
            A ``callable`` function with one argument (the calling Series, DataFrame
            or Panel) and that returns valid output for indexing (one of the above).
            This is useful in method chains, when you don't have a reference to the
            calling object, but would like to base your selection on some value.
        """
        return _iLocIndexer(self)

    def _derive_column(self, logical_type, semantic_tags, use_standard_tags,
                       is_index=False, is_time_index=False):
        """Build a new DataColumn over the same series, carrying the description over.

        ``semantic_tags`` must not contain 'index' or 'time_index' (the
        constructor rejects them); request those through the flags instead.
        """
        new_col = DataColumn(series=self._series,
                             logical_type=logical_type,
                             semantic_tags=semantic_tags,
                             use_standard_tags=use_standard_tags,
                             description=self.description)
        if is_index:
            new_col._set_as_index()
        if is_time_index:
            new_col._set_as_time_index()
        return new_col

    def set_logical_type(self, logical_type, retain_index_tags=True):
        """Update the logical type for the column and return a new DataColumn object.

        Args:
            logical_type (LogicalType, str): The new logical type to set for the column.
            retain_index_tags (bool, optional): If True, any 'index' or 'time_index' tags on
                the column will be retained. If False, all tags will be cleared.
                Defaults to True.

        Returns:
            woodwork.DataColumn: DataColumn with updated logical type.
        """
        new_logical_type = self._parse_logical_type(logical_type)
        return self._derive_column(
            logical_type=new_logical_type,
            semantic_tags=None,
            use_standard_tags=self.use_standard_tags,
            is_index=retain_index_tags and 'index' in self._semantic_tags,
            is_time_index=retain_index_tags and 'time_index' in self._semantic_tags)

    def _set_series(self, series):
        """Validate ``series``, apply the assigned name (if any) and store it.

        Note that the name is assigned on the series object that was passed in.

        Raises:
            TypeError: If ``series`` is not a pandas, Dask or Koalas Series or a
                pandas ExtensionArray.
        """
        is_dask = dd and isinstance(series, dd.Series)
        is_koalas = ks and isinstance(series, ks.Series)
        is_pandas = isinstance(series, (pd.Series, pd.api.extensions.ExtensionArray))
        if not (is_dask or is_koalas or is_pandas):
            raise TypeError('Series must be one of: pandas.Series, dask.Series, koalas.Series, or pandas.ExtensionArray')

        # pandas ExtensionArrays should be converted to pandas.Series
        if isinstance(series, pd.api.extensions.ExtensionArray):
            series = pd.Series(series, dtype=series.dtype)

        # Test against None (not truthiness) so that an explicit falsy name such
        # as the integer label 0 is honoured, matching the warning condition.
        if self._assigned_name is not None:
            if series.name is not None and self._assigned_name != series.name:
                warnings.warn(ColumnNameMismatchWarning().get_warning_message(series.name, self._assigned_name),
                              ColumnNameMismatchWarning)
            series.name = self._assigned_name
        self._series = series

    def _parse_logical_type(self, logical_type):
        """Resolve ``logical_type`` to a valid logical type, inferring it when not given.

        Raises:
            TypeError: If Ordinal is supplied as a class instead of an instance, or
                if the value is not a direct subclass of LogicalType (or an
                instance of one).
        """
        if not logical_type:
            return infer_logical_type(self._series)

        if isinstance(logical_type, str):
            logical_type = str_to_logical_type(logical_type)
        ltype_class = _get_ltype_class(logical_type)
        if ltype_class == Ordinal and not isinstance(logical_type, Ordinal):
            raise TypeError("Must use an Ordinal instance with order values defined")
        if ltype_class not in LogicalType.__subclasses__():
            raise TypeError(f"Invalid logical type specified for '{self.name}'")
        return logical_type

    def set_semantic_tags(self, semantic_tags, retain_index_tags=True):
        """Replace current semantic tags with new values and return a new DataColumn object.

        Args:
            semantic_tags (str/list/set): New semantic tag(s) to set for column
            retain_index_tags (bool, optional): If True, any 'index' or 'time_index' tags on
                the column will be retained. If False, all tags will be replaced.
                Defaults to True.

        Returns:
            woodwork.DataColumn: DataColumn with specified semantic tags.
        """
        semantic_tags = _convert_input_to_set(semantic_tags)
        _validate_tags(semantic_tags)
        # Standard tags are added by the constructor when use_standard_tags is set.
        return self._derive_column(
            logical_type=self.logical_type,
            semantic_tags=semantic_tags,
            use_standard_tags=self.use_standard_tags,
            is_index=retain_index_tags and 'index' in self._semantic_tags,
            is_time_index=retain_index_tags and 'time_index' in self._semantic_tags)

    def add_semantic_tags(self, semantic_tags):
        """Add the specified semantic tags to the column and return a new DataColumn object.

        Any 'index' or 'time_index' tag already on the column is kept.

        Args:
            semantic_tags (str/list/set): New semantic tag(s) to add to the column

        Returns:
            woodwork.DataColumn: DataColumn with specified semantic tags added.
        """
        new_tags = _convert_input_to_set(semantic_tags)
        _validate_tags(new_tags)
        duplicate_tags = sorted(self._semantic_tags.intersection(new_tags))
        if duplicate_tags:
            warnings.warn(DuplicateTagsWarning().get_warning_message(duplicate_tags, self.name),
                          DuplicateTagsWarning)

        # The constructor refuses index tags, so hold them back and re-apply
        # them on the new column afterwards.
        combined = self._semantic_tags.union(new_tags)
        return self._derive_column(
            logical_type=self.logical_type,
            semantic_tags=combined.difference({'index', 'time_index'}),
            use_standard_tags=self.use_standard_tags,
            is_index='index' in combined,
            is_time_index='time_index' in combined)

    def reset_semantic_tags(self, retain_index_tags=False):
        """Reset the semantic tags to the default values. The default values
        will be either an empty set or a set of the standard tags based on the
        column logical type, controlled by the use_standard_tags property.

        Args:
            retain_index_tags (bool, optional): If True, any 'index' or 'time_index' tags on
                the column will be retained. If False, all tags will be cleared.
                Defaults to False.

        Returns:
            woodwork.DataColumn: DataColumn with reset semantic tags.
        """
        return self._derive_column(
            logical_type=self.logical_type,
            semantic_tags=None,
            use_standard_tags=self.use_standard_tags,
            is_index=retain_index_tags and 'index' in self._semantic_tags,
            is_time_index=retain_index_tags and 'time_index' in self._semantic_tags)

    def remove_semantic_tags(self, semantic_tags):
        """Removes specified semantic tags from column and returns a new column.

        An 'index' or 'time_index' tag that is not among the tags being
        removed is kept on the returned column.

        Args:
            semantic_tags (str/list/set): Semantic tag(s) to remove from the column.

        Returns:
            woodwork.DataColumn: DataColumn with specified tags removed.

        Raises:
            LookupError: If any of the tags is not present on the column.
        """
        tags_to_remove = _convert_input_to_set(semantic_tags)
        invalid_tags = sorted(tags_to_remove.difference(self._semantic_tags))
        if invalid_tags:
            raise LookupError(f"Semantic tag(s) '{', '.join(invalid_tags)}' not present on column '{self.name}'")

        standard_tags_to_remove = sorted(tags_to_remove.intersection(self._logical_type.standard_tags))
        if standard_tags_to_remove and self.use_standard_tags:
            warnings.warn(StandardTagsRemovalWarning().get_warning_message(standard_tags_to_remove, self.name),
                          StandardTagsRemovalWarning)

        # The constructor refuses index tags, so hold back any that survive the
        # removal and re-apply them on the new column afterwards.
        remaining = self._semantic_tags.difference(tags_to_remove)
        return self._derive_column(
            logical_type=self.logical_type,
            semantic_tags=remaining.difference({'index', 'time_index'}),
            # Standard tags must not be re-added, or a removed one would come back.
            use_standard_tags=False,
            is_index='index' in remaining,
            is_time_index='time_index' in remaining)

    def _set_as_index(self):
        """Drop the logical type's standard tags and add the 'index' tag."""
        self._semantic_tags = self._semantic_tags.difference(self._logical_type.standard_tags)
        self._semantic_tags.add('index')

    def _set_as_time_index(self):
        """Add the 'time_index' tag."""
        self._semantic_tags.add('time_index')

    def _is_numeric(self):
        """Return True if 'numeric' is a standard tag of the logical type."""
        return 'numeric' in self.logical_type.standard_tags

    def _is_categorical(self):
        """Return True if 'category' is a standard tag of the logical type."""
        return 'category' in self.logical_type.standard_tags

    def to_series(self):
        """Retrieves the DataColumn's underlying series.

        Note: Do not modify the returned series directly to avoid unexpected behavior

        Returns:
            Series: The underlying series of the DataColumn. Return type will depend on the type
                of series used to create the DataColumn.
        """
        return self._series

    @property
    def shape(self):
        """Returns a tuple representing the dimensionality of the DataColumn. If Dask
        Series, returns a Dask `Delayed` object for the number of rows."""
        return self._series.shape

    @property
    def logical_type(self):
        """The logical type for the column"""
        return self._logical_type

    @property
    def semantic_tags(self):
        """The set of semantic tags currently assigned to the column"""
        return self._semantic_tags

    @property
    def name(self):
        """The name of the column"""
        # An explicitly assigned name wins even when it is falsy (e.g. 0).
        if self._assigned_name is not None:
            return self._assigned_name
        return self._series.name

    @property
    def dtype(self):
        """The dtype of the underlying series"""
        return self._series.dtype
def _validate_tags(semantic_tags):
    """Reject tags that can only be assigned through DataTable methods.

    Raises:
        ValueError: If 'index' or 'time_index' is among ``semantic_tags``.
    """
    reserved = (
        ('index',
         "Cannot add 'index' tag directly. To set a column as the index, "
         "use DataTable.set_index() instead."),
        ('time_index',
         "Cannot add 'time_index' tag directly. To set a column as the time index, "
         "use DataTable.set_time_index() instead."),
    )
    for tag, message in reserved:
        if tag in semantic_tags:
            raise ValueError(message)


def infer_logical_type(series):
    """Infer logical type for a dataframe column

    Dask input is reduced to its first partition and Koalas input to its
    first 100000 rows before the dtype-based checks run.

    Args:
        series (pd.Series): Input Series

    Returns:
        The inferred logical type class; NaturalLanguage when no check matches.
    """
    if dd and isinstance(series, dd.Series):
        series = series.get_partition(0).compute()
    if ks and isinstance(series, ks.Series):
        series = series.head(100000).to_pandas()

    text_length_cutoff = config.get_option('natural_language_threshold')
    category_cutoff = config.get_option('numeric_categorical_threshold')
    dtype = series.dtype

    if pdtypes.is_string_dtype(dtype):
        if col_is_datetime(series):
            return Datetime
        # Strings default to Categorical; a long average length on a sample
        # of at most 10000 values switches the result to NaturalLanguage.
        subset = series.sample(min(10000, len(series)))
        try:
            if subset.str.len().mean() > text_length_cutoff:
                return NaturalLanguage
        except AttributeError:
            # object dtype whose values cannot be interpreted as strings
            pass
        return Categorical

    if pdtypes.is_bool_dtype(dtype):
        return Boolean

    if pdtypes.is_categorical_dtype(dtype):
        return Categorical

    if pdtypes.is_integer_dtype(dtype):
        return Categorical if _is_numeric_categorical(series, category_cutoff) else Integer

    if pdtypes.is_float_dtype(dtype):
        return Categorical if _is_numeric_categorical(series, category_cutoff) else Double

    if col_is_datetime(series):
        return Datetime

    if pdtypes.is_timedelta64_dtype(dtype):
        return Timedelta

    return NaturalLanguage


def _is_numeric_categorical(series, threshold):
    """Return whether ``series`` has fewer than ``threshold`` unique values.

    A threshold of -1 disables the check and always yields False.
    """
    if threshold == -1:
        return False
    return series.nunique() < threshold