# Source code for woodwork.demo.retail

import pandas as pd

import woodwork as ww
from woodwork.logical_types import (
    Boolean,
    Categorical,
    Datetime,
    Double,
    Integer,
    NaturalLanguage,
)


def load_retail(id="demo_retail_data", nrows=None, init_woodwork=True):
    """Load a demo retail dataset into a DataFrame, optionally initializing
    Woodwork's typing information.

    The data is downloaded over the network: the gzip-compressed CSV is tried
    first, and the uncompressed CSV is used as a fallback if that read fails.
    An ``order_product_id`` column (``0 .. len(df) - 1``) is inserted as the
    first column so the DataFrame has a unique column to use as its index.

    Args:
        id (str, optional): The name to assign to the DataFrame, if returning
            a DataFrame with Woodwork typing information initialized. If not
            returning a DataFrame with Woodwork initialized, this will be
            ignored. Defaults to ``demo_retail_data``.
        nrows (int, optional): The number of rows to return in the dataset.
            If None, will return all possible rows. Defaults to None.
        init_woodwork (bool): If True, will return a pandas DataFrame with
            Woodwork typing information initialized. If False, will return a
            DataFrame without Woodwork initialized. Defaults to True.

    Returns:
        pd.DataFrame: A DataFrame containing the demo data with Woodwork
        typing initialized. If `init_woodwork` is False, will return an
        uninitialized DataFrame.

    Raises:
        Exception: Whatever ``pd.read_csv`` raises for the uncompressed file
            if both download attempts fail.
    """
    # NOTE: the parameter name ``id`` shadows the builtin; it is kept as-is
    # because callers may pass it by keyword.
    base_url = (
        "https://api.featurelabs.com/datasets/online-retail-logs-2018-08-28.csv"
    )
    query = "?library=woodwork&version=" + ww.__version__
    csv_s3_gz = base_url + ".gz" + query
    csv_s3 = base_url + query

    # Both attempts must parse the file identically, so share the arguments.
    read_kwargs = {"nrows": nrows, "parse_dates": ["order_date"]}

    # Try to read in gz compressed file
    try:
        df = pd.read_csv(csv_s3_gz, **read_kwargs)
    # Fall back to uncompressed. The broad catch is deliberate: any failure of
    # the compressed download should trigger the fallback, and a failure of
    # the fallback itself propagates to the caller.
    except Exception:
        df = pd.read_csv(csv_s3, **read_kwargs)

    # Add unique column for index
    df.insert(0, "order_product_id", range(len(df)))

    if init_woodwork:
        logical_types = {
            "order_product_id": Categorical,
            "order_id": Categorical,
            "product_id": Categorical,
            "description": NaturalLanguage,
            "quantity": Integer,
            "order_date": Datetime,
            "unit_price": Double,
            "customer_name": Categorical,
            "country": Categorical,
            "total": Double,
            "cancelled": Boolean,
        }

        df.ww.init(
            name=id,
            index="order_product_id",
            time_index="order_date",
            logical_types=logical_types,
        )

    return df