import ast
import importlib
import re
from datetime import datetime
from inspect import isclass
from mimetypes import add_type, guess_type
from timeit import default_timer as timer

import numpy as np
import pandas as pd

import woodwork as ww
from woodwork.exceptions import TypeValidationError

# Dictionary mapping formats/content types to the appropriate pandas read function
type_to_read_func_map = {
    "csv": pd.read_csv,
    "text/csv": pd.read_csv,
    "parquet": pd.read_parquet,
    "application/parquet": pd.read_parquet,
    "arrow": pd.read_feather,
    "application/arrow": pd.read_feather,
    "feather": pd.read_feather,
    "application/feather": pd.read_feather,
    "orc": pd.read_orc,
    "application/orc": pd.read_orc,
}

PYARROW_ERR_MSG = (
    "The pyarrow library is required to read from parquet/arrow/feather files.\n"
    "Install via pip:\n"
    "    pip install 'pyarrow>=3.0.0'\n"
    "Install via conda:\n"
    "    conda install 'pyarrow>=3.0.0'"
)

# Add new mimetypes
add_type("application/parquet", ".parquet")
add_type("application/arrow", ".arrow")
add_type("application/feather", ".feather")
add_type("application/orc", ".orc")


def import_or_none(library):
    """Attempts to import the requested library.

    Args:
        library (str): the name of the library

    Returns:
        the library if it is installed, else None
    """
    try:
        return importlib.import_module(library)
    except ImportError:
        return None


def camel_to_snake(s):
    s = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", s)
    return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s).lower()


def _convert_input_to_set(semantic_tags, error_language="semantic_tags", validate=True):
    """Takes input as a single string, a list of strings, or a set of strings
    and returns a set with the supplied values. If no values are supplied,
    an empty set will be returned."""
    if not semantic_tags:
        return set()

    if validate:
        _validate_tags_input_type(semantic_tags, error_language)

    if isinstance(semantic_tags, str):
        return {semantic_tags}

    if isinstance(semantic_tags, list):
        semantic_tags = set(semantic_tags)

    if validate:
        _validate_string_tags(semantic_tags, error_language)

    return semantic_tags


def _validate_tags_input_type(semantic_tags, error_language):
    if type(semantic_tags) not in [list, set, str]:
        raise TypeError(f"{error_language} must be a string, set or list")


def _validate_string_tags(semantic_tags, error_language):
    if not all([isinstance(tag, str) for tag in semantic_tags]):
        raise TypeError(f"{error_language} must contain only strings")
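
# Illustrative usage of the helpers above (a hedged sketch; the sample inputs
# are made up, not drawn from the library's test suite):
#
#     >>> camel_to_snake("PhoneNumber")
#     'phone_number'
#     >>> _convert_input_to_set("index")
#     {'index'}
#     >>> _convert_input_to_set(["numeric", "index"]) == {"numeric", "index"}
#     True
#     >>> import_or_none("not_a_real_library") is None
#     True
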
def read_file(
    filepath=None,
    content_type=None,
    name=None,
    index=None,
    time_index=None,
    semantic_tags=None,
    logical_types=None,
    use_standard_tags=True,
    column_origins=None,
    replace_nan=False,
    validate=True,
    **kwargs,
):
    """Read data from the specified file and return a DataFrame with initialized Woodwork typing information.

    Note:
        As the engine `fastparquet` cannot handle nullable pandas dtypes, `pyarrow` will be used
        for reading from parquet and arrow.

    Args:
        filepath (str): A valid string path to the file to read
        content_type (str): Content type of file to read
        name (str, optional): Name used to identify the DataFrame.
        index (str, optional): Name of the index column.
        time_index (str, optional): Name of the time index column.
        semantic_tags (dict, optional): Dictionary mapping column names in the dataframe to the
            semantic tags for the column. The keys in the dictionary should be strings
            that correspond to columns in the underlying dataframe. There are two options for
            specifying the dictionary values:
            (str): If only one semantic tag is being set, a single string can be used as a value.
            (list[str] or set[str]): If multiple tags are being set, a list or set of strings can be
            used as the value.
            Semantic tags will be set to an empty set for any column not included in the
            dictionary.
        logical_types (dict[str -> LogicalType], optional): Dictionary mapping column names in
            the dataframe to the LogicalType for the column. LogicalTypes will be inferred
            for any columns not present in the dictionary.
        use_standard_tags (bool, optional): If True, will add standard semantic tags to columns based
            on the inferred or specified logical type for the column. Defaults to True.
        column_origins (str or dict[str -> str], optional): Origin of each column. If a string is supplied,
            it is used as the origin for all columns. A dictionary can be used to set origins for
            individual columns.
        replace_nan (bool, optional): Whether to replace empty string values and string representations of
            NaN values ("nan", "<NA>") with np.nan or pd.NA values based on column dtype. Defaults to False.
        validate (bool, optional): Whether parameter and data validation should occur. Defaults to True.
            Warning: Should be set to False only when parameters and data are known to be valid. Any
            errors resulting from skipping validation with invalid inputs may not be easily understood.
        **kwargs: Additional keyword arguments to pass to the underlying pandas read file function. For more
            information on available keywords refer to the pandas documentation.

    Returns:
        pd.DataFrame: DataFrame created from the specified file with Woodwork typing information initialized.
    """
    from woodwork.logical_types import _replace_nans

    if content_type is None:
        inferred_type, _ = guess_type(filepath)
        if inferred_type is None:
            raise RuntimeError(
                "Content type could not be inferred. Please specify content_type and try again.",
            )
        content_type = inferred_type

    if content_type not in type_to_read_func_map:
        raise RuntimeError(
            "Reading from content type {} is not currently supported".format(
                content_type,
            ),
        )

    pyarrow_types = [
        "parquet",
        "application/parquet",
        "arrow",
        "application/arrow",
        "feather",
        "application/feather",
        "orc",
        "application/orc",
    ]
    if content_type in pyarrow_types:
        import_or_raise("pyarrow", PYARROW_ERR_MSG)
        if content_type in ["parquet", "application/parquet"]:
            kwargs["engine"] = "pyarrow"

    dataframe = type_to_read_func_map[content_type](filepath, **kwargs)

    if replace_nan:
        dataframe = dataframe.apply(_replace_nans)

    dataframe.ww.init(
        name=name,
        index=index,
        time_index=time_index,
        semantic_tags=semantic_tags,
        logical_types=logical_types,
        use_standard_tags=use_standard_tags,
        column_origins=column_origins,
        validate=validate,
    )
    return dataframe
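
# Illustrative usage of read_file (a hedged sketch; "data.csv" and the column
# names are hypothetical):
#
#     >>> df = read_file(
#     ...     filepath="data.csv",
#     ...     index="id",
#     ...     logical_types={"signup_date": "Datetime"},
#     ...     replace_nan=True,
#     ... )
#     >>> df.ww.schema is not None
#     True
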
def import_or_raise(library, error_msg):
    """Attempts to import the requested library. If the import fails, raises an
    ImportError with the supplied error message.

    Args:
        library (str): the name of the library
        error_msg (str): error message to return if the import fails
    """
    try:
        return importlib.import_module(library)
    except ImportError:
        raise ImportError(error_msg)


def _is_s3(string):
    """Checks if the given string is an S3 path. Returns a boolean."""
    return "s3://" in string


def _is_url(string):
    """Checks if the given string is a URL. Returns a boolean."""
    return "http" in string


def _reformat_to_latlong(latlong):
    """Accepts 2-tuple-like values, or a single NaN-like value.
    NaN-like values are replaced with np.nan.
    """
    if isinstance(latlong, str):
        latlong = _parse_latlong(latlong) or latlong

    if isinstance(latlong, (list, tuple)):
        if len(latlong) != 2:
            raise TypeValidationError(
                f"LatLong values must have exactly two values. {latlong} does not have two values.",
            )

        latitude, longitude = latlong
        try:
            latitude = _coerce_to_float(latitude)
            longitude = _coerce_to_float(longitude)
        except ValueError:
            raise TypeValidationError(
                f"LatLong values must be in decimal degrees. {latlong} does not have latitude or "
                "longitude values that can be converted to a float.",
            )

        latlong = (latitude, longitude)
        return latlong

    if _is_nan(latlong):
        return np.nan

    raise TypeValidationError(
        f"""LatLong value is not properly formatted. Value must be one of the following:
- A 2-tuple or list of 2 values representing decimal latitude and longitude values (NaN values are allowed).
- A single NaN value.
- A string representation of the above.

{latlong} does not fit the criteria.""",
    )


def _coerce_to_float(val):
    """Attempts to convert a value to a float, propagating null values."""
    if _is_nan(val):
        return np.nan
    try:
        return float(val)
    except (ValueError, TypeError):
        raise ValueError(
            f"The value represented by {val} cannot be converted to a float.",
        )


def _is_valid_latlong_series(series):
    """Returns True if all elements in the series contain properly formatted LatLong values,
    otherwise returns False."""
    if series.apply(_is_valid_latlong_value).all():
        return True
    return False


def _is_valid_latlong_value(val):
    """Returns True if the value provided is a properly formatted LatLong value,
    otherwise returns False."""
    if isinstance(val, (list, tuple)):
        if len(val) != 2:
            return False
        # Only tuples are accepted; lists fail validation even with two values
        if not isinstance(val, tuple):
            return False
        latitude, longitude = val
        lat_null, long_null = map(pd.isnull, val)
        is_valid = isinstance(latitude, float) or lat_null
        is_valid &= isinstance(longitude, float) or long_null
        return is_valid
    if isinstance(val, float):
        return np.isnan(val)
    if isinstance(val, str):
        val = _parse_latlong(val)
        if val is None:
            return False
        else:
            return _is_valid_latlong_value(val)
    return False


def _is_nan(value):
    """Checks if string values are common NaN values.
    Lists are not counted as NaN, and all other values are passed to pd.isnull.
    """
    if isinstance(value, str):
        return value in ww.config.get_option("nan_values")
    if isinstance(value, list):
        return False
    return pd.isnull(value)


def _is_latlong_nan(value):
    """Checks if a LatLong value is NaN."""
    if isinstance(value, (tuple, list)):
        return all([_is_nan(x) for x in value])
    return _is_nan(value)
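
# Illustrative behavior of the LatLong helpers (a hedged sketch with made-up
# coordinates):
#
#     >>> _reformat_to_latlong("(33.670914, -117.841501)")
#     (33.670914, -117.841501)
#     >>> _reformat_to_latlong(None)
#     nan
#     >>> _is_valid_latlong_value((1.0, 2.0))
#     True
#     >>> _is_valid_latlong_value([1.0, 2.0])  # lists are rejected; only tuples are valid
#     False
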
def get_valid_mi_types():
    """Generate a list of LogicalTypes that are valid for calculating mutual information. Note that
    index columns are not valid for calculating mutual information, but their types may be returned
    by this function.

    Args:
        None

    Returns:
        list(LogicalType): A list of the LogicalTypes that can be used to calculate mutual information
    """
    valid_types = []
    for ltype in ww.type_system.registered_types:
        if "category" in ltype.standard_tags:
            valid_types.append(ltype)
        elif "numeric" in ltype.standard_tags:
            valid_types.append(ltype)
        elif (
            ltype == ww.logical_types.Datetime
            or ltype == ww.logical_types.Boolean
            or ltype == ww.logical_types.BooleanNullable
        ):
            valid_types.append(ltype)

    return valid_types
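
# Illustrative check (a hedged sketch; exact membership depends on the types
# registered in the type system at runtime):
#
#     >>> ww.logical_types.Double in get_valid_mi_types()
#     True
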
def get_valid_pearson_types():
    """Generate a list of LogicalTypes that are valid for calculating Pearson correlation. Note that
    index columns are not valid for calculating dependence, but their types may be returned
    by this function.

    Args:
        None

    Returns:
        list(LogicalType): A list of the LogicalTypes that can be used to calculate Pearson correlation
    """
    valid_types = []
    for ltype in ww.type_system.registered_types:
        if "numeric" in ltype.standard_tags:
            valid_types.append(ltype)
        elif ltype == ww.logical_types.Datetime:
            valid_types.append(ltype)

    return valid_types
def get_valid_spearman_types():
    """Generate a list of LogicalTypes that are valid for calculating Spearman correlation. Note that
    index columns are not valid for calculating dependence, but their types may be returned
    by this function.

    Args:
        None

    Returns:
        list(LogicalType): A list of the LogicalTypes that can be used to calculate Spearman correlation
    """
    valid_types = []
    for ltype in ww.type_system.registered_types:
        if "numeric" in ltype.standard_tags:
            valid_types.append(ltype)
        elif ltype == ww.logical_types.Datetime or ltype == ww.logical_types.Ordinal:
            valid_types.append(ltype)

    return valid_types
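
# Illustrative checks for the correlation-type helpers above (a hedged sketch;
# results depend on the registered type system at runtime):
#
#     >>> ww.logical_types.Datetime in get_valid_pearson_types()
#     True
#     >>> ww.logical_types.Ordinal in get_valid_spearman_types()
#     True
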
def _get_column_logical_type(series, logical_type, name):
    if logical_type:
        return _parse_logical_type(logical_type, name)
    else:
        return ww.type_system.infer_logical_type(series)


def _parse_logical_type(logical_type, name):
    if isinstance(logical_type, str):
        logical_type = ww.type_system.str_to_logical_type(logical_type)

    if isclass(logical_type):
        logical_type = logical_type()

    if type(logical_type) not in ww.type_system.registered_types:
        raise TypeError(f"Invalid logical type specified for '{name}'")

    return logical_type
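
# Illustrative usage of _parse_logical_type (a hedged sketch; "age" is a
# made-up column name):
#
#     >>> isinstance(_parse_logical_type("integer", "age"), ww.logical_types.Integer)
#     True
#     >>> isinstance(_parse_logical_type(ww.logical_types.Integer, "age"), ww.logical_types.Integer)
#     True
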
def concat_columns(objs, validate_schema=True):
    """Concatenate Woodwork objects along the columns axis. There can only be one index and time
    index set across the objects passed in. Because Woodwork does not allow duplicate column names,
    duplicate columns are not allowed at concatenation.

    Args:
        objs (list[Series, DataFrame]): The Woodwork objects to be concatenated. If Woodwork
            is not initialized on any of the objects, type inference will be performed.
        validate_schema (bool, optional): Whether validation should be performed on the typing
            information for the concatenated DataFrame. Defaults to True.

    Returns:
        DataFrame: A Woodwork dataframe whose typing information is also a concatenation of the
            input dataframes.
    """
    if not objs:
        raise ValueError("No objects to concatenate")

    table_name = ""
    logical_types = {}
    semantic_tags = {}
    col_descriptions = {}
    col_origins = {}
    col_metadata = {}
    table_metadata = {}
    use_standard_tags = {}
    index = None
    time_index = None

    # Record the typing information for all the columns that have Woodwork schemas
    col_names_seen = set()
    for obj in objs:
        ww_columns = {}
        if isinstance(obj.ww.schema, ww.table_schema.TableSchema):
            # Raise error if there's overlap between table metadata
            overlapping_keys = obj.ww.metadata.keys() & table_metadata.keys()
            if overlapping_keys:
                raise ValueError(
                    f"Cannot resolve overlapping keys in table metadata: {overlapping_keys}",
                )

            table_metadata = {**obj.ww.metadata, **table_metadata}

            # Combine table names
            if obj.ww.name is not None:
                if table_name:
                    table_name += "_"
                table_name += str(obj.ww.name)

            # Cannot have multiple tables with indexes or time indexes set
            if obj.ww.index is not None:
                if index is None:
                    index = obj.ww.index
                else:
                    raise IndexError(
                        "Cannot set the Woodwork index of multiple input objects. "
                        "Please remove the index columns from all but one table.",
                    )
            if obj.ww.time_index is not None:
                if time_index is None:
                    time_index = obj.ww.time_index
                else:
                    raise IndexError(
                        "Cannot set the Woodwork time index of multiple input objects. "
                        "Please remove the time index columns from all but one table.",
                    )
            ww_columns = obj.ww.schema.columns
        elif isinstance(obj.ww.schema, ww.column_schema.ColumnSchema):
            ww_columns = {obj.name: obj.ww.schema}

        # Compile the typing information per column
        for name, col_schema in ww_columns.items():
            if name in col_names_seen:
                raise ValueError(
                    f"Duplicate column '{name}' has been found in more than one input object. "
                    "Please remove duplicate columns from all but one table.",
                )
            logical_types[name] = col_schema.logical_type
            semantic_tags[name] = col_schema.semantic_tags - {"time_index"} - {"index"}
            col_metadata[name] = col_schema.metadata
            col_descriptions[name] = col_schema.description
            col_origins[name] = col_schema.origin
            use_standard_tags[name] = col_schema.use_standard_tags

            col_names_seen.add(name)

    combined_df = pd.concat(objs, axis=1, join="outer")

    # The lib.concat breaks the woodwork schema for dataframes with different shapes
    # or mismatched indices.
    mask = combined_df.isnull().any()
    null_cols = mask[mask].index
    null_cols = null_cols.to_numpy()
    for null_col in null_cols:
        if null_col in logical_types and isinstance(
            logical_types[null_col],
            ww.logical_types.Integer,
        ):
            logical_types.pop(null_col)

    # Initialize Woodwork with all of the typing information from the input objs,
    # performing type inference on any columns that did not already have Woodwork initialized
    combined_df.ww.init(
        name=table_name or None,
        index=index,
        time_index=time_index,
        logical_types=logical_types,
        semantic_tags=semantic_tags,
        table_metadata=table_metadata or None,
        column_metadata=col_metadata,
        column_descriptions=col_descriptions,
        column_origins=col_origins,
        use_standard_tags=use_standard_tags,
        validate=validate_schema,
    )
    return combined_df
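
# Illustrative usage of concat_columns (a hedged sketch; the frames and column
# names are made up):
#
#     >>> df_a = pd.DataFrame({"id": [0, 1]})
#     >>> df_a.ww.init(index="id")
#     >>> df_b = pd.DataFrame({"score": [0.5, 0.9]})
#     >>> df_b.ww.init()
#     >>> combined = concat_columns([df_a, df_b])
#     >>> combined.ww.index
#     'id'
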
class CallbackCaller:
    """Helper class for updating the progress of a function and making a call to the progress
    callback function, if provided. Adds the progress increment to the current progress.

    If provided, the callback function should accept the following parameters:
        - update (int): change in progress since last call
        - progress (int): the progress so far in the calculations
        - total (int): the total number of calculations to do
        - unit (str): unit of measurement for progress/total
        - time_elapsed (float): total time in seconds elapsed since start of call
    """

    def __init__(self, callback, unit, total, start_time=None, start_progress=0):
        """
        Args:
            callback (func): callback method to call
            unit (str): unit of measurement for progress/total
            total (int): the total number of calculations to do
            start_time (datetime): when time started for the callback. Defaults
                to when the class instance is created.
            start_progress (int): starting progress for the callback. Defaults to 0.
        """
        if start_time is not None:
            self.start_time = start_time
        else:
            self.start_time = timer()
        self.callback = callback
        self.unit = unit
        self.current_progress = start_progress
        self.total = total

    def update(self, progress_increment):
        """
        Args:
            progress_increment (int): change in progress since the last call
        """
        if self.callback is not None:
            elapsed_time = timer() - self.start_time
            new_progress = self.current_progress + progress_increment
            self.callback(
                progress_increment,
                new_progress,
                self.total,
                self.unit,
                elapsed_time,
            )
            self.current_progress = new_progress


def _infer_datetime_format(dates, n=100):
    """Helper function to infer the datetime format from a random sample of up to n
    non-null rows of a series.

    Args:
        dates (Series): Series of string or datetime string to guess the format of
        n (int): the maximum number of non-null rows to sample from the series
    """
    dates_no_null = dates.dropna()
    random_n = dates_no_null.sample(min(n, len(dates_no_null)), random_state=42)
    if len(random_n) == 0:
        return None
    try:
        fmts = random_n.astype(str).map(pd.core.tools.datetimes.guess_datetime_format)
        mode_fmt = fmts.mode().loc[0]  # select first most common format
    except KeyError:
        check_for_other_formats = [
            "%y/%m/%d",
            "%m/%d/%y",
            "%d/%m/%y",
            "%y/%d/%m",
            "%d/%y/%m",
            "%m/%y/%d",
            "%d/%Y/%m",
            "%m/%Y/%d",
        ]
        dash_formats = []
        for format_ in check_for_other_formats:
            dash_formats.append(format_.replace("/", "-"))
        dot_formats = []
        for format_ in check_for_other_formats:
            dot_formats.append(format_.replace("/", "."))
        datetime_only_formats = check_for_other_formats + dash_formats + dot_formats

        time_stamp_formats = []
        for format_ in datetime_only_formats:
            time_stamp_formats.append(format_ + " %H:%M:%S")
        time_stamp_formats_with_timezone = []
        for format_ in datetime_only_formats:
            time_stamp_formats_with_timezone.append(format_ + " %H:%M:%S%z")
        check_for_other_formats = (
            datetime_only_formats
            + time_stamp_formats
            + time_stamp_formats_with_timezone
        )

        mode_fmt = None
        for format_ in check_for_other_formats:
            try:
                random_n.map(lambda x: datetime.strptime(x, format_))
                return format_
            except ValueError:  # Format doesn't match
                continue
            except TypeError:  # Timestamp found instead of string
                break
    return mode_fmt


def _parse_latlong(latlong):
    nan_values_strs = [
        x
        for x in ww.config.get_option("nan_values")
        if isinstance(x, str) and len(x) and x != " "
    ]
    nan_values = "|".join(nan_values_strs)
    latlong = re.sub(nan_values, "None", latlong)
    try:
        return ast.literal_eval(latlong)
    except ValueError:
        pass
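
# Illustrative usage of CallbackCaller and _infer_datetime_format (a hedged
# sketch; the callback and the sample dates are made up):
#
#     >>> def progress_cb(update, progress, total, unit, elapsed):
#     ...     print(f"{progress}/{total} {unit}")
#     >>> caller = CallbackCaller(progress_cb, unit="calculations", total=10)
#     >>> caller.update(4)
#     4/10 calculations
#     >>> _infer_datetime_format(pd.Series(["2020-01-01", "2020-02-01"]))
#     '%Y-%m-%d'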