[docs]def__init__(self,column_names,logical_types,name=None,index=None,time_index=None,semantic_tags=None,table_metadata=None,column_metadata=None,use_standard_tags=False,column_descriptions=None,column_origins=None,validate=True,):"""Create TableSchema Args: column_names (list, set): The columns present in the TableSchema. logical_types (dict[str -> LogicalType]): Dictionary mapping column names in the TableSchema to the LogicalType for the column. All columns present in the TableSchema must be present in the logical_types dictionary. name (str, optional): Name used to identify the TableSchema. index (str, optional): Name of the index column. time_index (str, optional): Name of the time index column. semantic_tags (dict, optional): Dictionary mapping column names in the TableSchema to the semantic tags for the column. The keys in the dictionary should be strings that correspond to columns in the TableSchema. There are two options for specifying the dictionary values: (str): If only one semantic tag is being set, a single string can be used as a value. (list[str] or set[str]): If multiple tags are being set, a list or set of strings can be used as the value. Semantic tags will be set to an empty set for any column not included in the dictionary. table_metadata (dict[str -> json serializable], optional): Dictionary containing extra metadata for the TableSchema. The dictionary must contain data types that are JSON serializable such as string, integers, and floats. DataFrame and Series types are not supported. column_metadata (dict[str -> dict[str -> json serializable]], optional): Dictionary mapping column names to that column's metadata dictionary. use_standard_tags (bool, dict[str -> bool], optional): Determines whether standard semantic tags will be added to columns based on the specified logical type for the column. If a single boolean is supplied, will apply the same use_standard_tags value to all columns. 
A dictionary can be used to specify ``use_standard_tags`` values for individual columns. Unspecified columns will use the default value. Defaults to False. column_descriptions (dict[str -> str], optional): Dictionary mapping column names to column descriptions. column_origins (str, dict[str -> str], optional): Origin of each column. If a string is supplied, it is used as the origin for all columns. A dictionary can be used to set origins for individual columns. validate (bool, optional): Whether parameter validation should occur. Defaults to True. Warning: Should be set to False only when parameters and data are known to be valid. Any errors resulting from skipping validation with invalid inputs may not be easily understood. """ifvalidate:# Check that inputs are valid_validate_params(column_names,name,index,time_index,logical_types,table_metadata,column_metadata,semantic_tags,column_descriptions,column_origins,use_standard_tags,)self._name=nameself._metadata=table_metadataor{}# use_standard_tags should be a dictionary mapping each column to its booleanifisinstance(use_standard_tags,bool):use_standard_tags={col_name:use_standard_tagsforcol_nameincolumn_names}else:use_standard_tags={**{col_name:Falseforcol_nameincolumn_names},**use_standard_tags,}# Infer logical types and create columnsself.columns=self._create_columns(column_names,logical_types,semantic_tags,use_standard_tags,column_descriptions,column_origins,column_metadata,validate,)ifindexisnotNone:self.set_index(index,validate=validate)iftime_indexisnotNone:self.set_time_index(time_index,validate=validate)
def__eq__(self,other,deep=True):ifself.name!=other.name:returnFalseifself.index!=other.index:returnFalseifself.time_index!=other.time_index:returnFalseifset(self.columns.keys())!=set(other.columns.keys()):returnFalseforcol_nameinself.columns:ifnotself.columns[col_name].__eq__(other.columns[col_name],deep=deep):returnFalseifdeepandself.metadata!=other.metadata:returnFalsereturnTruedef__repr__(self):"""A string representation of a TableSchema containing typing information."""returnrepr(self._get_typing_info())def_repr_html_(self):"""An HTML representation of a TableSchema for IPython.display in Jupyter Notebooks containing typing information and a preview of the data."""returnself._get_typing_info().to_html()@propertydeftypes(self):"""DataFrame containing the physical dtypes, logical types and semantic tags for the TableSchema."""returnself._get_typing_info()def_get_typing_info(self):"""Creates a DataFrame that contains the typing information for a TableSchema."""typing_info={}forcol_name,colinself.columns.items():types=[col.logical_type,str(list(col.semantic_tags))]typing_info[col_name]=typescolumns=["Logical Type","Semantic Tag(s)"]df=pd.DataFrame.from_dict(typing_info,orient="index",columns=columns,dtype="object",)df.index.name="Column"returndf@propertydefname(self):"""Name of schema"""returnself._name@name.setterdefname(self,name):"""Set name of schema"""ifname:_check_name(name)self._name=name@propertydefmetadata(self):"""Metadata of the table"""returnself._metadata@metadata.setterdefmetadata(self,metadata):"""Set table metadata"""ifmetadata:_check_table_metadata(metadata)self._metadata=metadataor{}@propertydeflogical_types(self):"""A dictionary containing logical types for each column"""return{col_name:col.logical_typeforcol_name,colinself.columns.items()}@propertydefsemantic_tags(self):"""A dictionary containing semantic tags for each column"""return{col_name:col.semantic_tagsforcol_name,colinself.columns.items()}@propertydefindex(self):"""The index column for 
the table"""forcol_name,columninself.columns.items():if"index"incolumn.semantic_tags:returncol_namereturnNone@propertydeftime_index(self):"""The time index column for the table"""forcol_name,columninself.columns.items():if"time_index"incolumn.semantic_tags:returncol_namereturnNone@propertydefuse_standard_tags(self):return{col_name:col.use_standard_tagsforcol_name,colinself.columns.items()}
def set_types(self, logical_types=None, semantic_tags=None, retain_index_tags=True):
    """Update the logical type and semantic tags for any columns names in the provided types
    dictionaries, updating the TableSchema at those columns.

    Args:
        logical_types (dict[str -> LogicalType], optional): A dictionary defining the new
            logical types for the specified columns.
        semantic_tags (dict[str -> str/list/set], optional): A dictionary defining the new
            semantic_tags for the specified columns.
        retain_index_tags (bool, optional): If True, will retain any index or time_index
            semantic tags set on the column. If False, will replace all semantic tags any
            time a column's semantic tags or logical type changes. Defaults to True.
    """
    logical_types = logical_types or {}
    _check_logical_types(self.columns.keys(), logical_types, require_all_cols=False)

    semantic_tags = semantic_tags or {}
    _check_semantic_tags(self.columns.keys(), semantic_tags)

    for col_name in logical_types.keys() | semantic_tags.keys():
        column = self.columns[col_name]
        # Read before the column is touched so index tags can be restored below
        tags_before = column.semantic_tags
        custom_tags = column.custom_tags

        # Apply the new logical type, if one was supplied for this column
        requested_ltype = logical_types.get(col_name)
        if requested_ltype is not None:
            column.logical_type = requested_ltype

        requested_tags = semantic_tags.get(col_name)
        if requested_tags is not None:
            # Explicit tags replace the existing ones; index tags may not be
            # supplied this way
            column._set_semantic_tags(requested_tags)
            _validate_not_setting_index_tags(column.semantic_tags, col_name)
        else:
            # No tags given: reset, then put the custom tags back
            column._reset_semantic_tags()
            column._set_semantic_tags(custom_tags)

        if not retain_index_tags:
            continue
        if "index" in tags_before:
            self._set_index_tags(col_name)
        if "time_index" in tags_before:
            self._set_time_index_tags(col_name)
def add_semantic_tags(self, semantic_tags):
    """Adds specified semantic tags to columns, updating the Woodwork typing information.
    Will retain any previously set values.

    Args:
        semantic_tags (dict[str -> str/list/set]): A dictionary mapping the columns
            in the DataFrame to the tags that should be added to the column's semantic tags
    """
    _check_semantic_tags(self.columns.keys(), semantic_tags)

    for col_name in semantic_tags:
        # Normalize str/list/set input to a set before validating it
        new_tags = _convert_input_to_set(semantic_tags[col_name])
        # "index" / "time_index" may not be added through this method
        _validate_not_setting_index_tags(new_tags, col_name)
        column = self.columns[col_name]
        column._add_semantic_tags(new_tags, col_name)
def remove_semantic_tags(self, semantic_tags):
    """Remove the semantic tags for any column names in the provided semantic_tags
    dictionary, updating the Woodwork typing information. Including `index` or `time_index`
    tags will set the Woodwork index or time index to None for the DataFrame.

    Args:
        semantic_tags (dict[str -> str/list/set]): A dictionary mapping the columns
            in the DataFrame to the tags that should be removed from the column's semantic tags
    """
    _check_semantic_tags(self.columns.keys(), semantic_tags)

    for col_name, requested in semantic_tags.items():
        column = self.columns[col_name]
        standard_tags = column.logical_type.standard_tags
        tags_to_remove = _convert_input_to_set(requested)
        # Copy so the pre-removal tags stay available after the column changes
        tags_before = column.semantic_tags.copy()

        column._remove_semantic_tags(tags_to_remove, col_name)

        index_was_removed = (
            column.use_standard_tags
            and "index" in tags_before
            and "index" not in column.semantic_tags
        )
        if index_was_removed:
            # If the index is removed, reinsert any standard tags not explicitly removed
            explicitly_removed = tags_to_remove.intersection(standard_tags)
            to_reinsert = standard_tags.difference(explicitly_removed)
            column.semantic_tags = column.semantic_tags.union(to_reinsert)
def reset_semantic_tags(self, columns=None, retain_index_tags=False):
    """Reset the semantic tags for the specified columns to the default values.
    The default values will be either an empty set or a set of the standard tags
    based on the column logical type, controlled by the use_standard_tags property on the table.
    Column names can be provided as a single string, a list of strings or a set of strings.
    If columns is not specified, tags will be reset for all columns.

    Args:
        columns (str/list/set, optional): The columns for which the semantic tags should be reset.
        retain_index_tags (bool, optional): If True, will retain any index or time_index
            semantic tags set on the column. If False, will clear all semantic tags. Defaults to
            False.
    """
    columns = _convert_input_to_set(columns, "columns")

    unknown_cols = sorted(columns.difference(self.columns))
    if unknown_cols:
        raise ColumnNotPresentError(unknown_cols)

    # An empty selection means "every column"
    targets = columns if columns else self.columns.keys()

    for col_name in targets:
        column = self.columns[col_name]
        # Read before the reset so index tags can be re-applied afterwards
        tags_before = column.semantic_tags
        column._reset_semantic_tags()

        if not retain_index_tags:
            continue
        if "index" in tags_before:
            self._set_index_tags(col_name)
        if "time_index" in tags_before:
            self._set_time_index_tags(col_name)
def_create_columns(self,column_names,logical_types,semantic_tags,use_standard_tags,column_descriptions,column_origins,column_metadata,validate,):"""Create a dictionary with column names as keys and new column dictionaries holding each column's typing information as values."""columns={}fornameincolumn_names:semantic_tags_for_col=_convert_input_to_set((semantic_tagsor{}).get(name),error_language=f"semantic_tags for {name}",validate=validate,)ifvalidate:_validate_not_setting_index_tags(semantic_tags_for_col,name)description=(column_descriptionsor{}).get(name)origin=(column_originsifisinstance(column_origins,str)else(column_originsor{}).get(name))metadata_for_col=(column_metadataor{}).get(name)columns[name]=ColumnSchema(logical_type=logical_types.get(name),semantic_tags=semantic_tags_for_col,use_standard_tags=use_standard_tags.get(name),description=description,origin=origin,metadata=metadata_for_col,validate=validate,)returncolumns
def set_index(self, new_index, validate=True):
    """Sets the index. Handles setting a new index, updating the index, or removing the index.

    The new index is checked before the current index tag is removed, so a
    rejected request leaves the existing index in place.

    Args:
        new_index (str): Name of the new index column. Must be present in the TableSchema.
            If None, will remove the index.
        validate (bool, optional): Whether to check that ``new_index`` is a column
            of the TableSchema. Defaults to True.

    Raises:
        ValueError: If ``new_index`` is already set as the time index.
    """
    if new_index is not None:
        if validate:
            _check_index(self.columns.keys(), new_index)

        if "time_index" in self.columns[new_index].semantic_tags:
            info = f'"{new_index}" is already set as the time index. '
            info += "A time index cannot also be the index."
            raise ValueError(info)

    # Only drop the previous index once the request is known to be acceptable
    old_index = self.index
    if old_index is not None:
        self.remove_semantic_tags({old_index: "index"})

    if new_index is not None:
        self._set_index_tags(new_index)
def set_time_index(self, new_time_index, validate=True):
    """Set the time index. Adds the 'time_index' semantic tag to the column and
    clears the tag from any previously set index column

    The new time index is checked before the current time index tag is removed,
    so a rejected request leaves the existing time index in place.

    Args:
        new_time_index (str): The name of the column to set as the time index.
            If None, will remove the time_index.
        validate (bool, optional): Whether to run the time index checks (column
            presence and logical type) on ``new_time_index``. Defaults to True.

    Raises:
        ValueError: If ``new_time_index`` is already set as the index.
    """
    if new_time_index is not None:
        if validate:
            _check_time_index(
                self.columns.keys(),
                new_time_index,
                self.logical_types.get(new_time_index),
            )

        if "index" in self.columns[new_time_index].semantic_tags:
            info = f'"{new_time_index}" is already set as the index. '
            info += "An index cannot also be the time index."
            raise ValueError(info)

    # Only drop the previous time index once the request is known to be acceptable
    old_time_index = self.time_index
    if old_time_index is not None:
        self.remove_semantic_tags({old_time_index: "time_index"})

    if new_time_index is not None:
        self._set_time_index_tags(new_time_index)
def rename(self, columns):
    """Renames columns in a TableSchema

    Args:
        columns (dict[str -> str]): A dictionary mapping current column names to new column names.

    Returns:
        woodwork.TableSchema: TableSchema with the specified columns renamed.
    """
    if not isinstance(columns, dict):
        raise TypeError("columns must be a dictionary")

    for old_name, new_name in columns.items():
        if old_name not in self.columns:
            raise ColumnNotPresentError(
                f"Column to rename must be present. {old_name} cannot be found.",
            )
        # Taking an existing column's name is only allowed when that column is
        # itself being renamed in the same call
        collides = new_name in self.columns and new_name not in columns.keys()
        if collides:
            raise ValueError(
                f"The column {new_name} is already present. Please choose another name to rename {old_name} to or also rename {old_name}.",
            )

    if len(set(columns.values())) != len(columns):
        raise ValueError("New columns names must be unique from one another.")

    new_schema = copy.deepcopy(self)

    # Map every existing column, in its current order, to its final name
    final_names = {col: columns.get(col, col) for col in self.columns}

    # Pop every column before re-adding any, so swapped names cannot clobber each other
    renamed = {
        new_name: new_schema.columns.pop(old_name)
        for old_name, new_name in final_names.items()
    }
    new_schema.columns.update(renamed)

    return new_schema
def_set_index_tags(self,index):"""Updates the semantic tags of the index by removing any standard tags before adding the 'index' tag."""column=self.columns[index]standard_tags=column.logical_type.standard_tagsnew_tags=column.semantic_tags.difference(standard_tags)new_tags.add("index")self.columns[index].semantic_tags=new_tagsdef_set_time_index_tags(self,time_index):self.columns[time_index].semantic_tags.add("time_index")def_filter_cols(self,include=None,exclude=None,col_names=False):"""Return list of columns filtered with any of: semantic tags, LogicalTypes, column names Args: include (str or LogicalType or list[str or LogicalType]): parameter or list of parameters to filter columns by. Can be Logical Types or Semantic Tags. Columns that match will be included in the returned list of columns. exclude (str or LogicalType or list[str or LogicalType]): parameter or list of parameters to filter columns by. Can be Logical Types or Semantic Tags. Columns that match will be excluded from the returned list of columns. col_names (bool): Specifies whether to filter columns by name. Defaults to False. Returns: List[str] of column names that fit into filter. 
"""assertnot(includeandexclude),"Cannot specify both include and exclude"ifincludeandnotisinstance(include,list):include=[include]elifexcludeandnotisinstance(exclude,list):exclude=[exclude]ifincludeisnotNone:selectors=includeelifexcludeisnotNone:selectors=excludeltypes_used=set()ltypes_in_schema={type(col.logical_type)forcolinself.columns.values()}tags_used=set()tags_in_schema={tagforcolinself.columns.values()fortagincol.semantic_tags}col_name_matches=set()forselectorinselectors:# Determine if the selector is a registered, uninstantiated LogicalTypemaybe_ltype=selectorifisinstance(selector,str):# Convert possible string to LogicalType - unregistered LogicalTypes return Nonemaybe_ltype=ww.type_system.str_to_logical_type(selector,raise_error=False,)# Get the class - unregistered LogicalTypes return LogicalTypeMetaClassmaybe_ltype_class=_get_ltype_class(maybe_ltype)ifmaybe_ltype_classinww.type_system.registered_types:ifmaybe_ltypenotinww.type_system.registered_types:raiseTypeError(f"Invalid selector used in include: {maybe_ltype} cannot be instantiated",)ifmaybe_ltypeinltypes_in_schema:ltypes_used.add(maybe_ltype)elifmaybe_ltype_class==ww.logical_types.LogicalType.__class__:raiseTypeError(f"Specified LogicalType selector {maybe_ltype} is not registered in Woodwork's type system.",)# Hashability as a proxy for whether a selector is possibly a semantic tag or column nameifnotisinstance(selector,Hashable):raiseTypeError(f"Invalid selector used in include: {selector} must be a ""string, uninstantiated and registered LogicalType, or valid column name",)# Determine if the selector is a semantic tagifselectorintags_in_schema:tags_used.add(selector)# Determine if the selector is a column 
nameifcol_namesandselectorinself.columns:col_name_matches.add(selector)cols_to_return=[]cols_seen=set()forcol_name,colinself.columns.items():is_match=(type(col.logical_type)inltypes_usedorcol.semantic_tags.intersection(tags_used)orcol_nameincol_name_matches)ifincludeisnotNoneandis_matchandcol_namenotincols_seen:cols_to_return.append(col_name)cols_seen.add(col_name)elifexcludeisnotNoneandnotis_matchandcol_namenotincols_seen:cols_to_return.append(col_name)cols_seen.add(col_name)returncols_to_return
def get_subset_schema(self, subset_cols):
    """Creates a new TableSchema with specified columns, retaining typing information.

    Args:
        subset_cols (list[str]): subset of columns from which to create the new TableSchema

    Returns:
        TableSchema: New TableSchema with attributes from original TableSchema
    """
    picked = {col_name: self.columns[col_name] for col_name in subset_cols}

    new_logical_types = {name: col.logical_type for name, col in picked.items()}
    new_semantic_tags = {name: col.semantic_tags for name, col in picked.items()}
    new_column_descriptions = {name: col.description for name, col in picked.items()}
    new_column_origins = {name: col.origin for name, col in picked.items()}
    new_column_metadata = {name: col.metadata for name, col in picked.items()}

    # The index and time index carry over only if their column is in the subset
    current_index = self.index
    new_index = current_index if current_index in subset_cols else None
    current_time_index = self.time_index
    new_time_index = current_time_index if current_time_index in subset_cols else None

    # The index tags are stripped here and re-applied by the constructor through
    # the index/time_index arguments
    if new_index is not None:
        new_semantic_tags[new_index] = new_semantic_tags[new_index].difference(
            {"index"},
        )
    if new_time_index is not None:
        new_semantic_tags[new_time_index] = new_semantic_tags[
            new_time_index
        ].difference({"time_index"})

    return TableSchema(
        subset_cols,
        new_logical_types,
        name=self.name,
        index=new_index,
        time_index=new_time_index,
        semantic_tags=copy.deepcopy(new_semantic_tags),
        use_standard_tags=self.use_standard_tags.copy(),
        table_metadata=copy.deepcopy(self.metadata),
        column_metadata=copy.deepcopy(new_column_metadata),
        column_descriptions=new_column_descriptions,
        column_origins=new_column_origins,
        validate=False,
    )
def_validate_params(column_names,name,index,time_index,logical_types,table_metadata,column_metadata,semantic_tags,column_descriptions,column_origins,use_standard_tags,):"""Check that values supplied during TableSchema initialization are valid"""_check_column_names(column_names)_check_use_standard_tags(column_names,use_standard_tags)ifname:_check_name(name)ifindexisnotNone:_check_index(column_names,index)iflogical_types:_check_logical_types(column_names,logical_types)iftable_metadata:_check_table_metadata(table_metadata)ifcolumn_metadata:_check_column_metadata(column_names,column_metadata)iftime_indexisnotNone:_check_time_index(column_names,time_index,logical_types.get(time_index))ifsemantic_tags:_check_semantic_tags(column_names,semantic_tags)ifcolumn_descriptions:_check_column_descriptions(column_names,column_descriptions)ifcolumn_origins:_check_column_origins(column_names,column_origins)def_check_name(name):ifnotisinstance(name,str):raiseTypeError("Table name must be a string")def_check_column_names(column_names):ifnotisinstance(column_names,(list,set)):raiseTypeError("Column names must be a list or set")iflen(column_names)!=len(set(column_names)):raiseIndexError("TableSchema cannot contain duplicate columns names")def_check_index(column_names,index):ifindexnotincolumn_names:# User specifies an index that is not in the list of column namesraiseColumnNotPresentError(f"Specified index column `{index}` not found in TableSchema.",)def_check_time_index(column_names,time_index,logical_type):iftime_indexnotincolumn_names:raiseColumnNotPresentError(f"Specified time index column `{time_index}` not found in TableSchema",)ltype_class=_get_ltype_class(logical_type)ifnot(ltype_class==ww.logical_types.Datetimeor"numeric"inltype_class.standard_tags):raiseTypeError("Time index column must be a Datetime or numeric column.")def_check_logical_types(column_names,logical_types,require_all_cols=True):ifnotisinstance(logical_types,dict):raiseTypeError("logical_types must be a 
dictionary")cols_in_ltypes=set(logical_types.keys())cols_in_schema=set(column_names)cols_not_found_in_schema=cols_in_ltypes.difference(cols_in_schema)ifcols_not_found_in_schema:raiseColumnNotPresentError("logical_types contains columns that are not present in "f"TableSchema: {sorted(list(cols_not_found_in_schema))}",)cols_not_found_in_ltypes=cols_in_schema.difference(cols_in_ltypes)ifcols_not_found_in_ltypesandrequire_all_cols:raiseColumnNotPresentError(f"logical_types is missing columns that are present in "f"TableSchema: {sorted(list(cols_not_found_in_ltypes))}",)forcol_name,logical_typeinlogical_types.items():if_get_ltype_class(logical_type)notinww.type_system.registered_types:raiseTypeError("Logical Types must be of the LogicalType class ""and registered in Woodwork's type system. "f"{logical_type} does not meet that criteria.",)def_check_semantic_tags(column_names,semantic_tags):ifnotisinstance(semantic_tags,dict):raiseTypeError("semantic_tags must be a dictionary")cols_not_found=set(semantic_tags.keys()).difference(set(column_names))ifcols_not_found:raiseColumnNotPresentError("semantic_tags contains columns that are not present in "f"TableSchema: {sorted(list(cols_not_found))}",)forcol_name,col_tagsinsemantic_tags.items():ifnotisinstance(col_tags,(str,list,set)):raiseTypeError(f"semantic_tags for {col_name} must be a string, set or list",)def_check_column_descriptions(column_names,column_descriptions):ifnotisinstance(column_descriptions,dict):raiseTypeError("column_descriptions must be a dictionary")cols_not_found=set(column_descriptions.keys()).difference(set(column_names))ifcols_not_found:raiseColumnNotPresentError("column_descriptions contains columns that are not present in "f"TableSchema: {sorted(list(cols_not_found))}",)def_check_column_origins(column_names,column_origins):ifnotisinstance(column_origins,(dict,str)):raiseTypeError("column_origins must be a dictionary or a 
string")ifisinstance(column_origins,dict):cols_not_found=set(column_origins.keys()).difference(set(column_names))ifcols_not_found:raiseColumnNotPresentError("column_origins contains columns that are not present in "f"TableSchema: {sorted(list(cols_not_found))}",)def_check_table_metadata(table_metadata):ifnotisinstance(table_metadata,dict):raiseTypeError("Table metadata must be a dictionary.")def_check_column_metadata(column_names,column_metadata):ifnotisinstance(column_metadata,dict):raiseTypeError("Column metadata must be a dictionary.")cols_not_found=set(column_metadata.keys()).difference(set(column_names))ifcols_not_found:raiseColumnNotPresentError("column_metadata contains columns that are not present in "f"TableSchema: {sorted(list(cols_not_found))}",)def_check_use_standard_tags(column_names,use_standard_tags):ifnotisinstance(use_standard_tags,(dict,bool)):raiseTypeError("use_standard_tags must be a dictionary or a boolean")ifisinstance(use_standard_tags,dict):cols_not_found=set(use_standard_tags.keys()).difference(set(column_names))ifcols_not_found:raiseColumnNotPresentError("use_standard_tags contains columns that are not present in "f"TableSchema: {sorted(list(cols_not_found))}",)forcol_name,use_standard_tags_for_colinuse_standard_tags.items():ifnotisinstance(use_standard_tags_for_col,bool):raiseTypeError(f"use_standard_tags for column {col_name} must be a boolean",)def_validate_not_setting_index_tags(semantic_tags,col_name):"""Verify user has not supplied tags that cannot be set directly"""if"index"insemantic_tags:raiseValueError(f"Cannot add 'index' tag directly for column {col_name}. To set a column as the index, ""use DataFrame.ww.set_index() instead.",)if"time_index"insemantic_tags:raiseValueError(f"Cannot add 'time_index' tag directly for column {col_name}. To set a column as the time index, ""use DataFrame.ww.set_time_index() instead.",)