import re
import warnings
from datetime import datetime
from typing import Optional

import numpy as np
import pandas as pd
from pandas import CategoricalDtype
from pandas.api import types as pdtypes

import woodwork as ww
from woodwork.accessor_utils import _is_dask_series, _is_spark_series
from woodwork.config import config
from woodwork.exceptions import (
    TypeConversionError,
    TypeConversionWarning,
    TypeValidationError,
)
from woodwork.type_sys.utils import _get_specified_ltype_params
from woodwork.utils import (
    _infer_datetime_format,
    _is_valid_latlong_series,
    _is_valid_latlong_value,
    _reformat_to_latlong,
    camel_to_snake,
    import_or_none,
)

dd = import_or_none("dask.dataframe")
ps = import_or_none("pyspark.pandas")


class ClassNameDescriptor(object):
    """Descriptor exposing the owning class's name converted with ``camel_to_snake``."""

    def __get__(self, instance, class_):
        # Only the owner class is consulted; the instance (if any) is ignored.
        return camel_to_snake(class_.__name__)


class LogicalTypeMetaClass(type):
    """Metaclass that makes a logical type class print as its bare class name."""

    def __repr__(cls):
        return cls.__name__


class LogicalType(object, metaclass=LogicalTypeMetaClass):
    """Common base class for every Logical Type.

    Subclasses override ``primary_dtype`` (and optionally ``pyspark_dtype`` and
    ``standard_tags``) to declare the physical dtype they require.
    """

    type_string = ClassNameDescriptor()
    primary_dtype = "string"
    pyspark_dtype = None
    standard_tags = set()

    def __eq__(self, other, deep=False):
        # ``deep`` is accepted for interface compatibility but is not consulted.
        if not isinstance(other, self.__class__):
            return False
        return _get_specified_ltype_params(other) == _get_specified_ltype_params(self)

    def __str__(self):
        return str(self.__class__)

    @classmethod
    def _get_valid_dtype(cls, series_type):
        """Return the dtype considered valid for a series of ``series_type``.

        The ``pyspark_dtype`` is used only when pyspark is importable, the series
        type is the pyspark pandas Series, and the class defines a pyspark dtype.
        """
        is_spark = ps and series_type == ps.Series
        if is_spark and cls.pyspark_dtype:
            return cls.pyspark_dtype
        return cls.primary_dtype

    def transform(self, series, null_invalid_values=False):
        """Cast ``series`` to this logical type's dtype when it differs.

        Raises:
            TypeConversionError: If the cast raises ``TypeError`` or ``ValueError``.
        """
        target_dtype = self._get_valid_dtype(type(series))
        if target_dtype == str(series.dtype):
            # Already the right dtype; hand the series back untouched.
            return series
        try:
            return series.astype(target_dtype)
        except (TypeError, ValueError):
            raise TypeConversionError(series, target_dtype, type(self))

    def validate(self, series, *args, **kwargs):
        """Check that the series dtype matches the dtype this logical type requires.

        Raises:
            TypeValidationError: If the series dtype is not the required dtype.
        """
        expected = self._get_valid_dtype(type(series))
        if expected == str(series.dtype):
            return
        raise TypeValidationError(
            f"Series dtype '{series.dtype}' is incompatible with {self.type_string} LogicalType, "
            f"try converting to {expected} dtype",
        )
class Address(LogicalType):
    """Logical type for columns holding postal/street address strings.

    Examples:
        .. code-block:: python

            ['1 Miller Drive, New York, NY 12345', '1 Berkeley Street, Boston, MA 67891']
            ['26387 Russell Hill, Dallas, TX 34521', '54305 Oxford Street, Seattle, WA 95132']
    """

    primary_dtype = "string"
class Age(LogicalType):
    """Logical type for whole-number ages of a person.

    Carries 'numeric' as a standard tag.

    Examples:
        .. code-block:: python

            [15, 22, 45]
            [30, 62, 87]
    """

    primary_dtype = "int64"
    standard_tags = {"numeric"}

    def validate(self, series, return_invalid_values=True):
        """Check that no age in the series is negative.

        Args:
            series (Series): Series of age values.
            return_invalid_values (bool): Whether to return the invalid (negative)
                values instead of raising.

        Returns:
            Series: The invalid age values, when return_invalid_values is True.
        """
        invalid_ages = _validate_age(series, return_invalid_values)
        return invalid_ages
class AgeFractional(LogicalType):
    """Logical type for non-negative floating point ages of a person.

    Carries 'numeric' as a standard tag. Null values are allowed.

    Examples:
        .. code-block:: python

            [0.34, 24.34, 45.0]
            [30.5, 62.82, np.nan]
    """

    primary_dtype = "float64"
    standard_tags = {"numeric"}

    def transform(self, series, null_invalid_values=False):
        """Optionally null out invalid ages, then cast to the float dtype."""
        cleaned = _coerce_age(series, fractional=True) if null_invalid_values else series
        return super().transform(cleaned)

    def validate(self, series, return_invalid_values=True):
        """Check that no age in the series is negative.

        Args:
            series (Series): Series of age values.
            return_invalid_values (bool): Whether to return the invalid (negative)
                values instead of raising.

        Returns:
            Series: The invalid age values, when return_invalid_values is True.
        """
        invalid_ages = _validate_age(series, return_invalid_values)
        return invalid_ages
class AgeNullable(LogicalType):
    """Logical type for whole-number ages of a person that may include nulls.

    Carries 'numeric' as a standard tag.

    Examples:
        .. code-block:: python

            [np.nan, 22, 45]
            [30, 62, np.nan]
    """

    primary_dtype = "Int64"
    standard_tags = {"numeric"}

    def transform(self, series, null_invalid_values=False):
        """Optionally null out invalid ages, then cast to the nullable integer dtype."""
        cleaned = _coerce_age(series, fractional=False) if null_invalid_values else series
        return super().transform(cleaned)

    def validate(self, series, return_invalid_values=True):
        """Check that no age in the series is negative.

        Args:
            series (Series): Series of age values.
            return_invalid_values (bool): Whether to return the invalid (negative)
                values instead of raising.

        Returns:
            Series: The invalid age values, when return_invalid_values is True.
        """
        invalid_ages = _validate_age(series, return_invalid_values)
        return invalid_ages
class Boolean(LogicalType):
    """Represents Logical Types that contain binary values indicating true/false.

    Args:
        cast_nulls_as (bool): If provided, null values in the column will be cast
            to this default bool, otherwise will raise an error if None.
            Defaults to None.

    Examples:
        .. code-block:: python

            [True, False, True]
            [0, 1, 1]
    """

    primary_dtype = "bool"

    def __init__(self, cast_nulls_as=None):
        """Store the value nulls are replaced with during ``transform``.

        Raises:
            ValueError: If ``cast_nulls_as`` is neither None nor a bool.
        """
        # Compare against None explicitly: a plain truthiness test would let
        # falsy non-boolean values such as 0 or "" bypass validation.
        if cast_nulls_as is not None and not isinstance(cast_nulls_as, bool):
            raise ValueError(
                f"Parameter `cast_nulls_as` must be either True or False, recieved {cast_nulls_as}",
            )
        self.cast_nulls_as = cast_nulls_as

    def transform(self, series, null_invalid_values=False):
        """Convert a series of boolean-like values to the ``bool`` dtype.

        Args:
            series (Series): Series of boolean values.
            null_invalid_values (bool): Accepted for interface compatibility;
                coercion of non-bool input always runs with invalid-value
                nulling enabled.

        Returns:
            Series: The column transformed into boolean type.

        Raises:
            ValueError: If the series contains nulls and ``cast_nulls_as`` is None.
        """
        is_dask = _is_dask_series(series)
        if not pdtypes.is_dtype_equal("bool", series.dtype):
            has_nulls = series.isna().any()
            if is_dask:
                # Dask returns a lazy scalar that must be computed before testing.
                has_nulls = has_nulls.compute()
            if has_nulls:
                if self.cast_nulls_as is None:
                    raise ValueError(
                        "Expected no null values in this Boolean column. If you want to keep the nulls, "
                        "use BooleanNullable type. Otherwise, cast these nulls to a boolean value with "
                        "the `cast_nulls_as` parameter.",
                    )
                # Rebind instead of filling in place: this leaves the caller's
                # series untouched and does not rely on an ``inplace`` keyword.
                series = series.fillna(self.cast_nulls_as)
            series = _coerce_boolean(series, True)
        return super().transform(series)
class BooleanNullable(LogicalType):
    """Logical type for true/false values that may also include nulls.

    Examples:
        .. code-block:: python

            [True, False, None]
            [0, 1, 1]
    """

    primary_dtype = "boolean"

    def transform(self, series, null_invalid_values=False):
        """Normalise NaN-like values, coerce boolean equivalents, then cast the dtype."""
        without_nans = _replace_nans(series, self.primary_dtype)
        coerced = _coerce_boolean(without_nans, null_invalid_values)
        return super().transform(coerced)
class Categorical(LogicalType):
    """Logical type for unordered discrete values drawn from a fixed set.

    Carries 'category' as a standard tag.

    Examples:
        .. code-block:: python

            ["red", "green", "blue"]
            ["produce", "dairy", "bakery"]
            [3, 1, 2]
    """

    primary_dtype = "category"
    pyspark_dtype = "string"
    standard_tags = {"category"}

    def __init__(self, encoding=None):
        # encoding dict(str -> int)
        # user can specify the encoding to use downstream
        # NOTE: the argument is currently accepted but not stored or used.
        return None
class CountryCode(LogicalType):
    """Logical type for ISO-3166 country codes.

    ISO 3166-1 (countries) are supported. Codes should be in the Alpha-2 format.

    Examples:
        .. code-block:: python

            ["AU", "US", "UA"]
            ["GB", "NZ", "DE"]
    """

    primary_dtype = "category"
    pyspark_dtype = "string"
    standard_tags = {"category"}
class CurrencyCode(LogicalType):
    """Logical type for ISO-4217 international standard currency codes.

    Examples:
        .. code-block:: python

            ["GBP", "JPY", "USD"]
            ["SAR", "EUR", "CZK"]
    """

    primary_dtype = "category"
    pyspark_dtype = "string"
    standard_tags = {"category"}
class Datetime(LogicalType):
    """Represents Logical Types that contain date and time information.

    Args:
        datetime_format (str): Desired datetime format for data. When None, the
            format is inferred during ``transform``.
        timezone (str): Timezone name associated with the data. ``transform``
            overwrites it when the incoming series is timezone-aware.

    Examples:
        .. code-block:: python

            ["2020-09-10",
             "2020-01-10 00:00:00",
             "01/01/2000 08:30"]
    """

    primary_dtype = "datetime64[ns]"
    datetime_format = None

    def __init__(self, datetime_format=None, timezone=None):
        # The class documents ``datetime_format`` as an argument, so accept it
        # here; ``timezone`` is initialised so the attribute always exists.
        self.datetime_format = datetime_format
        self.timezone = timezone

    def _remove_timezone(self, series):
        """Removes timezone from series and stores in logical type."""
        if hasattr(series.dtype, "tz") and series.dtype.tz:
            self.timezone = str(series.dtype.tz)
            series = series.dt.tz_localize(None)
        return series

    def transform(self, series, null_invalid_values=False):
        """Converts the series data to a formatted datetime.

        Datetime format will be inferred if datetime_format is None. In the
        pandas path, values that cannot be parsed with the format trigger a
        ``TypeConversionWarning`` and are replaced with nulls.
        """
        # Computed once rather than for every element passed to the filter.
        latest_allowed_year = datetime.today().year + 10

        def _year_filter(date):
            """Applies a filter to the years to ensure that the pivot point isn't too far forward."""
            if date.year > latest_allowed_year:
                date = date.replace(year=date.year - 100)
            return date

        new_dtype = self._get_valid_dtype(type(series))
        series = self._remove_timezone(series)
        series_dtype = str(series.dtype)

        if new_dtype != series_dtype:
            self.datetime_format = self.datetime_format or _infer_datetime_format(
                series,
            )
            # Normalise to a real bool so ``utc`` is never None or "".
            utc = bool(self.datetime_format and self.datetime_format.endswith("%z"))

            if _is_dask_series(series):
                name = series.name
                series = dd.to_datetime(
                    series,
                    format=self.datetime_format,
                    errors="coerce",
                    utc=utc,
                )
                series.name = name
            elif _is_spark_series(series):
                series = ps.Series(
                    ps.to_datetime(
                        series.to_numpy(),
                        format=self.datetime_format,
                        errors="coerce",
                    ),
                    name=series.name,
                )
            else:
                try:
                    series = pd.to_datetime(
                        series,
                        format=self.datetime_format,
                        utc=utc,
                    )
                except (TypeError, ValueError):
                    warnings.warn(
                        f"Some rows in series '{series.name}' are incompatible with datetime format "
                        f"'{self.datetime_format}' and have been replaced with null values. You may be "
                        "able to fix this by using an instantiated Datetime logical type with a different format "
                        "string specified for this column during Woodwork initialization.",
                        TypeConversionWarning,
                    )
                    # Retry, this time turning unparseable rows into nulls.
                    series = pd.to_datetime(
                        series,
                        format=self.datetime_format,
                        errors="coerce",
                        utc=utc,
                    )

            series = self._remove_timezone(series)

            # Two-digit years are ambiguous; pull far-future results back a century.
            if self.datetime_format is not None and "%y" in self.datetime_format:
                if _is_spark_series(series):
                    series = series.transform(_year_filter)
                else:
                    series = series.apply(_year_filter)
        return super().transform(series)
class Double(LogicalType):
    """Logical type for positive and negative numbers, some with a fractional part.

    Includes zero (0). Carries 'numeric' as a standard tag.

    Examples:
        .. code-block:: python

            [1.2, 100.4, 3.5]
            [-15.34, 100, 58.3]
    """

    primary_dtype = "float64"
    standard_tags = {"numeric"}

    def transform(self, series, null_invalid_values=False):
        """Normalise NaN-like values, optionally null non-numeric ones, then cast."""
        prepared = _replace_nans(series, self.primary_dtype)
        if null_invalid_values:
            prepared = _coerce_numeric(prepared)
        return super().transform(prepared)
class Integer(LogicalType):
    """Logical type for positive and negative whole numbers, including zero (0).

    Carries 'numeric' as a standard tag.

    Examples:
        .. code-block:: python

            [100, 35, 0]
            [-54, 73, 11]
    """

    primary_dtype = "int64"
    standard_tags = {"numeric"}
class IntegerNullable(LogicalType):
    """Logical type for positive and negative whole numbers, including zero (0).

    May contain null values. Carries 'numeric' as a standard tag.

    Examples:
        .. code-block:: python

            [100, 35, np.nan]
            [-54, 73, 11]
    """

    primary_dtype = "Int64"
    standard_tags = {"numeric"}

    def transform(self, series, null_invalid_values=False):
        """Convert a series dtype to Int64.

        Args:
            series (Series): A series of data values.
            null_invalid_values (bool): If true, nulls invalid integers by coercing
                the series to numeric and then nulling out values that have a
                fractional part. Defaults to False.

        Returns:
            Series: A series of integers.
        """
        prepared = _replace_nans(series, self.primary_dtype)
        if null_invalid_values:
            prepared = _coerce_integer(prepared)
        return super().transform(prepared)
class EmailAddress(LogicalType):
    """Logical type for columns holding email address values.

    Examples:
        .. code-block:: python

            ["john.smith@example.com",
             "support@example.com",
             "team@example.com"]
    """

    primary_dtype = "string"

    def transform(self, series, null_invalid_values=False):
        """Optionally null values failing the email regex, then cast to string."""
        cleaned = (
            _coerce_string(series, regex="email_inference_regex")
            if null_invalid_values
            else series
        )
        return super().transform(cleaned)

    def validate(self, series, return_invalid_values=False):
        """Validate email address values against the regex stored in the config.

        Args:
            series (Series): Series of email address values.
            return_invalid_values (bool): Whether to return the invalid email
                address values instead of raising.

        Returns:
            Series: The invalid email addresses, when return_invalid_values is True.
        """
        regex_key = "email_inference_regex"
        return _regex_validate(regex_key, series, return_invalid_values)
class Filepath(LogicalType):
    """Logical type for locations of directories and files in a file system.

    Examples:
        .. code-block:: python

            ["/usr/local/bin",
             "/Users/john.smith/dev/index.html",
             "/tmp"]
    """

    primary_dtype = "string"
class PersonFullName(LogicalType):
    """Logical type for full personal names.

    Values may contain first, middle and last names, including honorifics and
    suffixes.

    Examples:
        .. code-block:: python

            ["Mr. John Doe, Jr.",
             "Doe, Mrs. Jane",
             "James Brown"]
    """

    primary_dtype = "string"
class IPAddress(LogicalType):
    """Logical type for IP addresses, covering both IPv4 and IPv6.

    Examples:
        .. code-block:: python

            ["172.16.254.1",
             "192.0.0.0",
             "2001:0db8:0000:0000:0000:ff00:0042:8329"]
    """

    primary_dtype = "string"
class LatLong(LogicalType):
    """Logical type for latitude and longitude values in decimal degrees.

    Note:
        LatLong values will be stored with the object dtype as a tuple of floats
        (or a list of floats for Spark DataFrames) and must contain only two
        values.

        Null latitude or longitude values will be stored as np.nan, and a fully
        null LatLong (np.nan, np.nan) will be stored as just a single nan.

    Examples:
        .. code-block:: python

            [(33.670914, -117.841501),
             (40.423599, -86.921162),
             (-45.031705, nan)]
    """

    primary_dtype = "object"

    def transform(self, series, null_invalid_values=False):
        """Format a series to be a tuple (or list for Spark) of two floats."""
        if null_invalid_values:
            series = _coerce_latlong(series)

        if _is_dask_series(series):
            # Dask needs the output metadata spelled out for ``apply``.
            series = series.apply(
                _reformat_to_latlong,
                meta=(series.name, (float, float)),
            )
        elif _is_spark_series(series):
            # Reformat in pandas, then convert back to a Spark series.
            as_pandas = series.to_pandas()
            series = ps.from_pandas(
                as_pandas.apply(_reformat_to_latlong, is_spark=True),
            )
        else:
            series = series.apply(_reformat_to_latlong)

        return super().transform(series)

    def validate(self, series, return_invalid_values=False):
        """Check the dtype and that every value is a properly formatted LatLong.

        Raises:
            TypeValidationError: If the dtype is wrong or the data is not
                properly formatted LatLong data.
        """
        # TODO: we'll want to actually handle return_invalid_values in the ordinal and latlong logical types.
        super().validate(series)
        if _is_valid_latlong_series(series):
            return
        raise TypeValidationError(
            "Cannot initialize Woodwork. Series does not contain properly formatted "
            "LatLong data. Try reformatting before initializing or use the "
            "woodwork.init_series function to initialize.",
        )
class NaturalLanguage(LogicalType):
    """Logical type for text or characters representing natural human language.

    Examples:
        .. code-block:: python

            ["This is a short sentence.",
             "I like to eat pizza!",
             "When will humans go to mars?"]
    """

    primary_dtype = "string"
class Unknown(LogicalType):
    """Logical type for data that cannot be inferred as a specific Logical Type.

    The data is assumed to be string data.

    Examples:
        .. code-block:: python

            ["ax23n9ck23l",
             "1,28&*_%*&&xejc",
             "xnmvz@@Dcmeods-0"]
    """

    primary_dtype = "string"
class Ordinal(LogicalType):
    """Represents Logical Types that contain ordered discrete values.

    Has 'category' as a standard tag.

    Args:
        order (list or tuple): A list or tuple specifying the order of the
            ordinal values from low to high. The underlying series cannot
            contain values that are not present in the order values.

    Examples:
        .. code-block:: python

            ["first", "second", "third"]
            ["bronze", "silver", "gold"]
    """

    primary_dtype = "category"
    pyspark_dtype = "string"
    standard_tags = {"category"}

    def __init__(self, order=None):
        # Every method reads ``self.order``; default to None so an instance
        # without order values fails with the explicit TypeError raised in
        # ``_validate_order_values`` rather than an AttributeError.
        self.order = order

    def _validate_order_values(self, series):
        """Make sure order values are properly defined and confirm the supplied
        series does not contain any values that are not in the specified order
        values.

        Raises:
            TypeError: If no order was given, or it is not a list or tuple.
            ValueError: If the order contains duplicates, or a pandas series
                holds values missing from the order.
        """
        if self.order is None:
            raise TypeError("Must use an Ordinal instance with order values defined")
        elif not isinstance(self.order, (list, tuple)):
            raise TypeError("Order values must be specified in a list or tuple")
        if len(self.order) != len(set(self.order)):
            raise ValueError("Order values cannot contain duplicates")

        # Membership is only checked for pandas series; other series types
        # skip this check.
        if isinstance(series, pd.Series):
            missing_order_vals = set(series.dropna().values).difference(self.order)
            if missing_order_vals:
                error_msg = (
                    f"Ordinal column {series.name} contains values that are not present "
                    f"in the order values provided: {sorted(missing_order_vals)}"
                )
                raise ValueError(error_msg)

    def transform(self, series, null_invalid_values=False):
        """Validates the series and converts the dtype to match the logical type's if it is different."""
        self._validate_order_values(series)
        typed_ser = super().transform(series)
        if isinstance(typed_ser.dtype, CategoricalDtype):
            # Impose the declared low-to-high ordering on the categories.
            typed_ser = typed_ser.cat.set_categories(self.order, ordered=True)
        return typed_ser

    def validate(self, series, return_invalid_values=False):
        """Check the series dtype and that its values agree with the order values."""
        # TODO: we'll want to actually handle return_invalid_values in the ordinal and latlong logical types.
        super().validate(series)
        self._validate_order_values(series)

    def __str__(self):
        return "{}: {}".format(self.__class__, self.order)
class PhoneNumber(LogicalType):
    """Logical type for numeric digits and characters representing a phone number.

    Examples:
        .. code-block:: python

            ["1-(555)-123-5495",
             "+1-555-123-5495",
             "5551235495"]
    """

    primary_dtype = "string"

    def transform(self, series, null_invalid_values=False):
        """Optionally null values failing the phone regex, then cast to string."""
        cleaned = (
            _coerce_string(series, regex="phone_inference_regex")
            if null_invalid_values
            else series
        )
        return super().transform(cleaned)

    def validate(self, series, return_invalid_values=False):
        """Validate PhoneNumber values against the regex stored in the config.

        By default, this validates US/Canada-based phone numbers.

        Args:
            series (Series): Series of phone number values.
            return_invalid_values (bool): Whether to return the invalid phone
                numbers instead of raising.

        Returns:
            Series: The invalid phone numbers, when return_invalid_values is True.
        """
        regex_key = "phone_inference_regex"
        return _regex_validate(regex_key, series, return_invalid_values)
class SubRegionCode(LogicalType):
    """Logical type for ISO-3166 sub-region codes.

    A sub-region code represents a portion of a larger geographic region.
    ISO 3166-2 (sub-regions) codes are supported. These codes should be in the
    Alpha-2 format.

    Examples:
        .. code-block:: python

            ["US-CO", "US-MA", "US-CA"]
            ["AU-NSW", "AU-TAS", "AU-QLD"]
    """

    primary_dtype = "category"
    pyspark_dtype = "string"
    standard_tags = {"category"}
class Timedelta(LogicalType):
    """Logical type for values specifying a duration of time.

    Examples:
        .. code-block:: python

            [pd.Timedelta('1 days 00:00:00'),
             pd.Timedelta('-1 days +23:40:00'),
             pd.Timedelta('4 days 12:00:00')]
    """

    primary_dtype = "timedelta64[ns]"
class URL(LogicalType):
    """Logical type for URLs, which may include protocol, hostname and file name.

    Examples:
        .. code-block:: python

            ["http://google.com",
             "https://example.com/index.html",
             "example.com"]
    """

    primary_dtype = "string"

    def transform(self, series, null_invalid_values=False):
        """Optionally null values failing the URL regex, then cast to string."""
        cleaned = (
            _coerce_string(series, regex="url_inference_regex")
            if null_invalid_values
            else series
        )
        return super().transform(cleaned)

    def validate(self, series, return_invalid_values=False):
        """Validate URL values against the regex stored in the config.

        Args:
            series (Series): Series of URL values.
            return_invalid_values (bool): Whether to return the invalid URLs
                instead of raising.

        Returns:
            Series: The invalid URLs, when return_invalid_values is True.
        """
        regex_key = "url_inference_regex"
        return _regex_validate(regex_key, series, return_invalid_values)
class PostalCode(LogicalType):
    """Logical type for postal codes that represent a group of addresses.

    Carries 'category' as a standard tag.

    Examples:
        .. code-block:: python

            ["90210",
             "60018-0123",
             "10010"]
    """

    primary_dtype = "category"
    pyspark_dtype = "string"
    standard_tags = {"category"}

    def transform(self, series, null_invalid_values=False):
        """Optionally null invalid codes, stringify numeric input, then cast.

        Raises:
            TypeConversionError: If numeric input cannot be converted to Int64
                and then to string.
        """
        if null_invalid_values:
            series = _coerce_postal_code(series)

        is_numeric = pd.api.types.is_numeric_dtype(series)
        if is_numeric:
            # Route through Int64 before stringifying the numeric values.
            try:
                as_int = series.astype("Int64")
                series = as_int.astype("string")
            except TypeError:
                raise TypeConversionError(series, "string", type(self))

        return super().transform(series)

    def validate(self, series, return_invalid_values=False):
        """Validate PostalCode values against the regex stored in the config.

        Currently only validates US postal codes.

        Args:
            series (Series): Series of PostalCode values.
            return_invalid_values (bool): Whether to return the invalid
                PostalCodes instead of raising.

        Returns:
            Series: The invalid PostalCodes, when return_invalid_values is True.
        """
        regex_key = "postal_code_inference_regex"
        return _regex_validate(regex_key, series, return_invalid_values)
_NULLABLE_PHYSICAL_TYPES = {
    "boolean",
    "category",
    "datetime64[ns]",
    "Int8",
    "Int16",
    "Int32",
    "Int64",
    "Float32",
    "Float64",
    "float16",
    "float32",
    "float64",
    "float128",
    "object",
    "string",
    "timedelta64[ns]",
}


def _regex_validate(regex_key, series, return_invalid_values):
    """Validate string values against the config regex stored under ``regex_key``.

    Returns the invalid values when ``return_invalid_values`` is truthy;
    otherwise raises ``TypeValidationError`` if any value is invalid.
    """
    invalid_mask = _get_index_invalid_string(series, regex_key)
    if return_invalid_values:
        return series[invalid_mask]

    found_invalid = invalid_mask.any()
    if dd and isinstance(found_invalid, dd.core.Scalar):
        # Dask yields a lazy scalar; materialise it before the truth test.
        found_invalid = found_invalid.compute()
    if not found_invalid:
        return None

    readable_names = {
        "url_inference_regex": "url",
        "email_inference_regex": "email address",
        "phone_inference_regex": "phone number",
        "postal_code_inference_regex": "postal code",
    }
    type_string = readable_names[regex_key]
    info = (
        f"Series {series.name} contains invalid {type_string} values. "
        f"The {regex_key} can be changed in the config if needed."
    )
    raise TypeValidationError(info)


def _replace_nans(series: pd.Series, primary_dtype: Optional[str] = None) -> pd.Series:
    """Replace the configured "nan_values" with a real missing-value marker.

    ``pd.NA`` is used for series with the "string" dtype and ``np.nan`` for
    other non-Spark series. The series is returned unchanged when its dtype
    already equals ``primary_dtype``.
    """
    original_dtype = series.dtype
    dtype_name = str(original_dtype)

    if primary_dtype == dtype_name:
        return series

    if dtype_name == "string":
        return series.replace(ww.config.get_option("nan_values"), pd.NA)

    if not _is_spark_series(series):
        series = series.replace(ww.config.get_option("nan_values"), np.nan)
    if dtype_name == "boolean":
        # Cast back to the original "boolean" dtype after the replacement.
        series = series.astype(original_dtype)
    return series


def _validate_age(series, return_invalid_values):
    """Validate that age values are non-negative.

    Returns the negative values when ``return_invalid_values`` is truthy;
    otherwise raises ``TypeValidationError`` if any value is negative.
    """
    negative_mask = _get_index_invalid_age(series)
    if return_invalid_values:
        return series[negative_mask]

    found_negative = negative_mask.any()
    if dd and isinstance(found_negative, dd.core.Scalar):
        found_negative = found_negative.compute()
    if found_negative:
        raise TypeValidationError(f"Series {series.name} contains negative values.")


def _get_index_invalid_integer(series):
    """Return a boolean mask marking values with a non-zero fractional part."""
    fractional_part = series.mod(1)
    return fractional_part.ne(0)


def _get_index_invalid_string(series, regex_key):
    """Return a boolean mask marking values that fail the configured regex."""
    regex = config.get_option(regex_key)

    if not _is_spark_series(series):
        return ~series.str.match(regex).astype("boolean")

    def match(x):
        # Non-string values yield None, which becomes a missing boolean below.
        if isinstance(x, str):
            return bool(re.match(regex, x))
        return None

    matched = series.apply(match).astype("boolean")
    return matched == False  # noqa: E712


def _get_index_invalid_age(series):
    """Return a boolean mask marking negative values."""
    return series.lt(0)


def _get_index_invalid_latlong(series):
    """Return a boolean mask marking values that are not valid LatLong values."""
    valid_mask = series.apply(_is_valid_latlong_value)
    return ~valid_mask


def _coerce_string(series, regex=None):
    """Cast to the "string" dtype and, given a regex key, null non-matching values."""
    is_plain_string = pd.api.types.is_string_dtype(
        series,
    ) and not pd.api.types.is_object_dtype(series)
    if not is_plain_string:
        series = series.astype("string")

    if isinstance(regex, str):
        invalid_mask = _get_index_invalid_string(series, regex)
        if invalid_mask.any():
            series[invalid_mask] = pd.NA
    return series


def _coerce_numeric(series):
    """Convert a non-numeric series to numeric, nulling unparseable values."""
    if pd.api.types.is_numeric_dtype(series):
        return series
    return pd.to_numeric(_coerce_string(series), errors="coerce")


def _coerce_boolean(series, null_invalid_values=False):
    """Map string boolean equivalents to bools unless the series is already boolean."""
    if pd.api.types.is_bool_dtype(series):
        return series
    lowered = _coerce_string(series).str.lower()
    return _transform_boolean(lowered, null_invalid_values)


def _transform_boolean(series, null_invalid_values):
    """Apply the configured string-to-bool mappings element-wise.

    Values without a mapping become ``np.nan`` when ``null_invalid_values`` is
    truthy and are left unchanged otherwise.
    """
    boolean_inference_list = config.get_option("boolean_inference_strings").copy()
    boolean_inference_list.update(
        {frozenset(["1", "0"]), frozenset(["1.0", "0.0"])},
    )

    boolean_transform_mappings = config.get_option("boolean_transform_mappings").copy()
    boolean_transform_mappings.update(
        {
            "1": True,
            "0": False,
            "1.0": True,
            "0.0": False,
        },
    )

    def _lookup(value):
        fallback = np.nan if null_invalid_values else value
        return boolean_transform_mappings.get(value, fallback)

    return series.apply(_lookup)


def _coerce_integer(series):
    """Coerce to numeric and null out values that have a fractional part."""
    series = _coerce_numeric(series)
    fractional_mask = _get_index_invalid_integer(series)
    if fractional_mask.any():
        series[fractional_mask] = None
    return series


def _coerce_age(series, fractional=False):
    """Coerce to numeric (or integer) values and null out negative ages."""
    if fractional:
        series = _coerce_numeric(series)
    else:
        series = _coerce_integer(series)

    negative_mask = _get_index_invalid_age(series)
    if negative_mask.any():
        series[negative_mask] = None
    return series


def _coerce_latlong(series):
    """Null out values that are not valid LatLong values."""
    invalid_mask = _get_index_invalid_latlong(series)
    if invalid_mask.any():
        series[invalid_mask] = None
    return series


def _coerce_postal_code(series):
    """Coerce to strings and null out values failing the postal code regex."""
    if pd.api.types.is_numeric_dtype(series):
        # Numeric codes go through Int64 first.
        series = _coerce_integer(series).astype("Int64")
    return _coerce_string(series, regex="postal_code_inference_regex")