def init_series(
    series,
    logical_type=None,
    semantic_tags=None,
    use_standard_tags=True,
    description=None,
    origin=None,
    metadata=None,
):
    """Build a new Series that carries Woodwork typing information.

    Accepts a series, a one-dimensional ``numpy.ndarray`` or a
    ``pd.api.extensions.ExtensionArray``. Array inputs are wrapped in a
    ``pd.Series`` first. The returned series has its dtype converted to the
    dtype associated with the resolved LogicalType.

    Args:
        series (pd.Series, dd.Series, ps.Series, numpy.ndarray or
            pd.api.extensions.ExtensionArray): The data from which to create
            the Woodwork initialized series.
        logical_type (LogicalType or str, optional): The logical type to assign
            to the series. If not provided, the LogicalType is inferred.
        semantic_tags (str or list or set, optional): Semantic tags to assign
            to the series. Defaults to an empty set if not specified. A single
            tag may be passed as a string; multiple tags may be passed as a
            list or set of strings.
        use_standard_tags (bool, optional): If True, adds standard semantic
            tags based on the inferred or specified logical type.
            Defaults to True.
        description (str, optional): Text describing the contents of the series.
        origin (str, optional): Text specifying the origin of the column
            (i.e. "base" or "engineered").
        metadata (dict[str -> json serializable], optional): Metadata
            associated with the series.

    Returns:
        Series: A series with Woodwork typing information initialized.

    Raises:
        ValueError: If ``series`` is a ``numpy.ndarray`` that is not
            one-dimensional.
        TypeError: If ``series`` is neither a series nor a supported
            one-dimensional array type.
    """
    if not _is_series(series):
        array_like_types = (np.ndarray, pd.api.extensions.ExtensionArray)
        if isinstance(series, array_like_types) and series.ndim == 1:
            # One-dimensional array input: wrap it so it can be treated as a series.
            series = pd.Series(series)
        elif isinstance(series, np.ndarray) and series.ndim != 1:
            raise ValueError(
                "np.ndarray input must be 1 dimensional. "
                f"Current np.ndarray is {series.ndim} dimensional"
            )
        else:
            raise TypeError(
                "Input must be of series type. "
                f"The current input is of type {type(series)}"
            )

    resolved_type = _get_column_logical_type(series, logical_type, series.name)

    # transform() returns the series converted for the resolved logical type;
    # typing information is then initialized on that result.
    initialized = resolved_type.transform(series)
    typing_kwargs = {
        "logical_type": resolved_type,
        "semantic_tags": semantic_tags,
        "use_standard_tags": use_standard_tags,
        "description": description,
        "origin": origin,
        "metadata": metadata,
    }
    initialized.ww.init(**typing_kwargs)
    return initialized
def get_invalid_schema_message(dataframe, schema):
    """Explain why a schema cannot be used to initialize Woodwork on a dataframe.

    Checks run in a fixed order and the first failure is reported:
    columns of the dataframe missing from the schema, columns of the schema
    missing from the dataframe, per-column dtype mismatches and, for pandas
    DataFrames with a schema index, index consistency, uniqueness and nulls.

    Args:
        dataframe (DataFrame): The dataframe against which to check the schema.
        schema (ww.TableSchema): The schema to use in the validity check.

    Returns:
        str or None: The reason the schema is invalid for the dataframe, or
        None when every check passes.
    """
    frame_columns = set(dataframe.columns)
    typed_columns = set(schema.columns.keys())

    untyped_columns = frame_columns - typed_columns
    if untyped_columns:
        return (
            f"The following columns in the DataFrame were missing from the typing information: "
            f"{untyped_columns}"
        )

    absent_columns = typed_columns - frame_columns
    if absent_columns:
        return (
            f"The following columns in the typing information were missing from the DataFrame: "
            f"{absent_columns}"
        )

    column_types = schema.logical_types
    for name in dataframe.columns:
        column = dataframe[name]
        actual_dtype = column.dtype
        # The logical type reports its expected dtype for this series class.
        expected_dtype = column_types[name]._get_valid_dtype(type(column))
        if str(actual_dtype) != expected_dtype:
            return (
                f"dtype mismatch for column {name} between DataFrame dtype, "
                f"{actual_dtype}, and {column_types[name]} dtype, {expected_dtype}"
            )

    index_name = schema.index
    if index_name is None or not isinstance(dataframe, pd.DataFrame):
        # Index validation not performed for Dask/Spark
        return None

    index_column = dataframe[index_name]
    # Compare the underlying index against the index column's values, cast to
    # the column dtype so only the values themselves are compared.
    underlying_index = pd.Series(dataframe.index, dtype=index_column.dtype)
    if not underlying_index.equals(pd.Series(index_column.values)):
        return "Index mismatch between DataFrame and typing information"
    if not index_column.is_unique:
        return "Index column is not unique"
    if index_column.isnull().any():
        return "Index contains null values"
    return None
def is_schema_valid(dataframe, schema):
    """Report whether a schema can be used to initialize Woodwork on a dataframe.

    Args:
        dataframe (DataFrame): The dataframe against which to check the schema.
        schema (ww.TableSchema): The schema to use in the validity check.

    Returns:
        boolean: True when ``get_invalid_schema_message`` reports no problem
        for the pair, False otherwise.
    """
    # A falsy result (None) means no reason for invalidity was found.
    return not get_invalid_schema_message(dataframe, schema)
def _is_dask_series(data):
    """Return True if ``dd`` is truthy and ``data`` is a ``dd.Series``.

    NOTE(review): ``dd`` is presumably the optional dask dataframe module,
    falsy when it is not installed - confirm against the file's imports.
    """
    return bool(dd and isinstance(data, dd.Series))


def _is_dask_dataframe(data):
    """Return True if ``dd`` is truthy and ``data`` is a ``dd.DataFrame``."""
    return bool(dd and isinstance(data, dd.DataFrame))


def _is_spark_dataframe(data):
    """Return True if ``ps`` is truthy and ``data`` is a ``ps.DataFrame``.

    NOTE(review): ``ps`` is presumably the optional pandas-on-Spark module,
    falsy when it is not installed - confirm against the file's imports.
    """
    return bool(ps and isinstance(data, ps.DataFrame))


def _is_spark_series(data):
    """Return True if ``ps`` is truthy and ``data`` is a ``ps.Series``."""
    return bool(ps and isinstance(data, ps.Series))


def _check_column_schema(method):
    """Decorator for WoodworkColumnAccessor methods that require a schema.

    The wrapped method runs only when ``self._schema`` is set; otherwise
    ``WoodworkNotInitError`` is raised.
    """

    @wraps(method)
    def wrapper(self, *args, **kwargs):
        if self._schema is not None:
            return method(self, *args, **kwargs)
        raise WoodworkNotInitError(
            "Woodwork not initialized for this Series. Initialize by "
            "calling Series.ww.init"
        )

    return wrapper


def _check_table_schema(method):
    """Decorator for WoodworkTableAccessor methods that require a schema.

    The wrapped method runs only when ``self._schema`` is set; otherwise
    ``WoodworkNotInitError`` is raised.
    """

    @wraps(method)
    def wrapper(self, *args, **kwargs):
        if self._schema is not None:
            return method(self, *args, **kwargs)
        raise WoodworkNotInitError(
            "Woodwork not initialized for this DataFrame. Initialize by "
            "calling DataFrame.ww.init"
        )

    return wrapper