Source code for multimodal_transformers.data.load_data

from functools import partial
import logging
from os.path import join, exists

import pandas as pd
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import PowerTransformer, QuantileTransformer

from .tabular_torch_dataset import TorchTabularTextDataset
from .data_utils import (
    CategoricalFeatures,
    agg_text_columns_func,
    convert_to_func,
    get_matching_cols,
    load_num_feats,
    load_cat_and_num_feats,
    normalize_numerical_feats,
)

# Module-level logger, named after this module so its output can be filtered per module.
logger = logging.getLogger(__name__)


def load_data_into_folds(data_csv_path,
                         num_splits,
                         validation_ratio,
                         text_cols,
                         tokenizer,
                         label_col,
                         label_list=None,
                         categorical_cols=None,
                         numerical_cols=None,
                         sep_text_token_str=' ',
                         categorical_encode_type='ohe',
                         numerical_transformer_method='quantile_normal',
                         empty_text_values=None,
                         replace_empty_text=None,
                         max_token_length=None,
                         debug=False
                         ):
    """
    Function to load tabular and text data from a csv file into folds

    Reads the csv at `data_csv_path`, holds out a `validation_ratio` fraction
    of the rows as one validation set that is shared by every fold, and splits
    the remaining rows into `num_splits` train/test partitions for K-fold cross
    validation. Each partition is converted with `load_train_val_test_helper`,
    which performs the categorical and numerical preprocessing if specified.
    Both the validation hold-out and the K-fold split are shuffled with a fixed
    random state of 5.

    Args:
        data_csv_path (str): The path to the csv containing the data
        num_splits (int): The number of cross validation folds to split the data into.
        validation_ratio (float): The fraction of the data to hold out as a
            consistent validation set. Must satisfy ``0 <= validation_ratio < 1``.
            When it is 0 no validation set is held out and every entry of the
            returned validation list is :obj:`None`.
        text_cols (:obj:`list` of :obj:`str`): The column names in the dataset that contain text
            from which we want to load
        tokenizer (:obj:`transformers.tokenization_utils.PreTrainedTokenizer`):
            HuggingFace tokenizer used to tokenize the input texts as specified by text_cols
        label_col (str): The column name of the label, for classification the column should have
            int values from 0 to n_classes-1 as the label for each class.
            For regression the column can have any numerical value
        label_list (:obj:`list` of :obj:`str`, optional): Used for classification;
            the names of the classes indexed by the values in label_col.
        categorical_cols (:obj:`list` of :obj:`str`, optional): The column names in the dataset that
            contain categorical features. The features can be already prepared numerically, or
            could be preprocessed by the method specified by categorical_encode_type
        numerical_cols (:obj:`list` of :obj:`str`, optional): The column names in the dataset that
            contain numerical features. These columns should contain only numeric values.
        sep_text_token_str (str, optional): The string token that is used to separate between the
            different text columns for a given data example. For Bert for example,
            this could be the [SEP] token.
        categorical_encode_type (str, optional): Given categorical_cols, this specifies
            what method we want to preprocess our categorical features.
            choices: [ 'ohe', 'binary', None]
            see encode_features.CategoricalFeatures for more details
        numerical_transformer_method (str, optional): Given numerical_cols, this specifies
            what method we want to use for normalizing our numerical data.
            choices: ['yeo_johnson', 'box_cox', 'quantile_normal', None]
            see https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html
            for more details
        empty_text_values (:obj:`list` of :obj:`str`, optional): specifies what texts should be
            considered as missing which would be replaced by replace_empty_text
        replace_empty_text (str, optional): The value of the string that will replace the texts
            that match with those in empty_text_values. If this argument is None then
            the text that match with empty_text_values will be skipped
        max_token_length (int, optional): The token length to pad or truncate to on the
            input text
        debug (bool, optional): Whether or not to load a smaller debug version of the dataset

    Returns:
        :obj:`tuple` of `list` of `tabular_torch_dataset.TorchTabularTextDataset`:
            This tuple contains three lists representing the splits of
            training, validation and testing sets. The length of the lists is
            equal to the number of folds specified by `num_splits`

    Raises:
        ValueError: If `validation_ratio` is not in the range ``[0, 1)``.
    """
    # Raise explicitly instead of using `assert`, which is stripped under `python -O`.
    # A ratio of 1 is rejected up front because it would leave no rows to fold.
    if not 0 <= validation_ratio < 1:
        raise ValueError(
            'validation ratio needs to be at least 0 and less than 1, '
            f'got {validation_ratio!r}'
        )

    all_data_df = pd.read_csv(data_csv_path)
    if validation_ratio > 0:
        folds_df, val_df = train_test_split(all_data_df,
                                            test_size=validation_ratio,
                                            shuffle=True,
                                            train_size=1 - validation_ratio,
                                            random_state=5)
    else:
        # train_test_split does not accept a zero-sized test split, so skip the
        # hold-out entirely and fold over every row.
        folds_df, val_df = all_data_df, None

    kfold = KFold(num_splits, shuffle=True, random_state=5)

    train_splits, val_splits, test_splits = [], [], []

    for train_index, test_index in kfold.split(folds_df):
        # Select the fold's rows first and copy only those, rather than copying
        # the whole frame before every selection.
        train_df = folds_df.iloc[train_index].copy()
        test_df = folds_df.iloc[test_index].copy()
        # Each fold gets its own copy of the shared validation rows.
        fold_val_df = val_df.copy() if val_df is not None else None

        train, val, test = load_train_val_test_helper(train_df, fold_val_df, test_df,
                                                      text_cols, tokenizer, label_col,
                                                      label_list, categorical_cols, numerical_cols,
                                                      sep_text_token_str,
                                                      categorical_encode_type,
                                                      numerical_transformer_method,
                                                      empty_text_values,
                                                      replace_empty_text,
                                                      max_token_length,
                                                      debug)
        train_splits.append(train)
        val_splits.append(val)
        test_splits.append(test)

    return train_splits, val_splits, test_splits


def load_data_from_folder(folder_path,
                          text_cols,
                          tokenizer,
                          label_col,
                          label_list=None,
                          categorical_cols=None,
                          numerical_cols=None,
                          sep_text_token_str=' ',
                          categorical_encode_type='ohe',
                          numerical_transformer_method='quantile_normal',
                          empty_text_values=None,
                          replace_empty_text=None,
                          max_token_length=None,
                          debug=False,
                          ):
    """
    Function to load tabular and text data from a specified folder

    Reads `train.csv` and `test.csv` (and `val.csv` when it exists) from
    `folder_path`, using the first column of each file as the index, and hands
    the resulting DataFrames to `load_train_val_test_helper`, which does the
    categorical and numerical data preprocessing if specified.

    Args:
        folder_path (str): The path to the folder containing `train.csv`, and `test.csv`
            (and if given `val.csv`)
        text_cols (:obj:`list` of :obj:`str`): The column names in the dataset that contain text
            from which we want to load
        tokenizer (:obj:`transformers.tokenization_utils.PreTrainedTokenizer`):
            HuggingFace tokenizer used to tokenize the input texts as specified by text_cols
        label_col (str): The column name of the label, for classification the column should have
            int values from 0 to n_classes-1 as the label for each class.
            For regression the column can have any numerical value
        label_list (:obj:`list` of :obj:`str`, optional): Used for classification;
            the names of the classes indexed by the values in label_col.
        categorical_cols (:obj:`list` of :obj:`str`, optional): The column names in the dataset that
            contain categorical features. The features can be already prepared numerically, or
            could be preprocessed by the method specified by categorical_encode_type
        numerical_cols (:obj:`list` of :obj:`str`, optional): The column names in the dataset that
            contain numerical features. These columns should contain only numeric values.
        sep_text_token_str (str, optional): The string token that is used to separate between the
            different text columns for a given data example. For Bert for example,
            this could be the [SEP] token.
        categorical_encode_type (str, optional): Given categorical_cols, this specifies
            what method we want to preprocess our categorical features.
            choices: [ 'ohe', 'binary', None]
            see encode_features.CategoricalFeatures for more details
        numerical_transformer_method (str, optional): Given numerical_cols, this specifies
            what method we want to use for normalizing our numerical data.
            choices: ['yeo_johnson', 'box_cox', 'quantile_normal', None]
            see https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html
            for more details
        empty_text_values (:obj:`list` of :obj:`str`, optional): specifies what texts should be
            considered as missing which would be replaced by replace_empty_text
        replace_empty_text (str, optional): The value of the string that will replace the texts
            that match with those in empty_text_values. If this argument is None then
            the text that match with empty_text_values will be skipped
        max_token_length (int, optional): The token length to pad or truncate to on the
            input text
        debug (bool, optional): Whether or not to load a smaller debug version of the dataset

    Returns:
        :obj:`tuple` of `tabular_torch_dataset.TorchTabularTextDataset`:
            This tuple contains the training, validation and testing sets.
            The val dataset is :obj:`None` if there is no `val.csv` in folder_path
    """
    def _read_split(file_name):
        # Every split file is read with its first column as the index.
        return pd.read_csv(join(folder_path, file_name), index_col=0)

    train_df = _read_split('train.csv')
    test_df = _read_split('test.csv')
    # The validation file is optional.
    val_df = _read_split('val.csv') if exists(join(folder_path, 'val.csv')) else None

    return load_train_val_test_helper(
        train_df,
        val_df,
        test_df,
        text_cols=text_cols,
        tokenizer=tokenizer,
        label_col=label_col,
        label_list=label_list,
        categorical_cols=categorical_cols,
        numerical_cols=numerical_cols,
        sep_text_token_str=sep_text_token_str,
        categorical_encode_type=categorical_encode_type,
        numerical_transformer_method=numerical_transformer_method,
        empty_text_values=empty_text_values,
        replace_empty_text=replace_empty_text,
        max_token_length=max_token_length,
        debug=debug,
    )


def load_train_val_test_helper(train_df, val_df, test_df,
                               text_cols,
                               tokenizer,
                               label_col,
                               label_list=None,
                               categorical_cols=None,
                               numerical_cols=None,
                               sep_text_token_str=' ',
                               categorical_encode_type='ohe',
                               numerical_transformer_method='quantile_normal',
                               empty_text_values=None,
                               replace_empty_text=None,
                               max_token_length=None,
                               debug=False):
    """Preprocess train/val/test DataFrames together and convert each to a dataset

    When `categorical_encode_type` is 'ohe' or 'binary', the splits are
    concatenated and encoded in one pass by `CategoricalFeatures`, the encoded
    columns are appended to the data, and the splits are cut apart again by
    position. `categorical_cols` is then replaced by the encoded column names
    and the encode type passed on to `load_data` is None, so the features are
    not encoded a second time.

    The numerical transformer, when requested, is fit on the numerical features
    of `train_df` only and the fitted instance is passed to `load_data` for all
    three splits.

    Args:
        train_df (:obj:`pd.DataFrame`): The training data
        val_df (:obj:`pd.DataFrame`): The validation data, or None if there is none
        test_df (:obj:`pd.DataFrame`): The testing data
        text_cols, tokenizer, label_col, label_list, categorical_cols,
        numerical_cols, sep_text_token_str, empty_text_values,
        replace_empty_text, max_token_length, debug: Forwarded to `load_data`
            (`categorical_cols` is replaced by the encoded column names when
            encoding happens here).
        categorical_encode_type (str, optional): 'ohe' or 'binary' to encode the
            categorical columns here; any other value is forwarded to
            `load_data` unchanged.
        numerical_transformer_method (str, optional): One of 'yeo_johnson',
            'box_cox', 'quantile_normal'; None or the string 'none' disables
            numerical normalisation.

    Returns:
        :obj:`tuple`: ``(train_dataset, val_dataset, test_dataset)`` as returned
            by `load_data`; ``val_dataset`` is None when `val_df` is None.

    Raises:
        ValueError: If `numerical_transformer_method` is not a supported value.
    """
    if categorical_encode_type in ('ohe', 'binary'):
        dfs = [df for df in (train_df, val_df, test_df) if df is not None]
        data_df = pd.concat(dfs, axis=0)
        cat_feat_processor = CategoricalFeatures(data_df, categorical_cols, categorical_encode_type)
        vals = cat_feat_processor.fit_transform()
        cat_df = pd.DataFrame(vals, columns=cat_feat_processor.feat_names)

        # pd.concat(axis=1) aligns rows on index labels, not on position. The
        # stacked splits keep their own labels (shuffled, or repeated across
        # files), while the encoded frame is numbered 0..n-1. Joining them
        # directly attaches features to the wrong rows or fails on duplicate
        # labels. Join on position instead, then restore the original labels.
        original_index = data_df.index
        data_df = pd.concat(
            [data_df.reset_index(drop=True), cat_df.reset_index(drop=True)],
            axis=1,
        )
        data_df.index = original_index

        categorical_cols = cat_feat_processor.feat_names

        # Cut the combined frame back into the splits, in the order they were stacked.
        n_train = len(train_df)
        n_val = len(val_df) if val_df is not None else 0
        train_df = data_df.iloc[:n_train]
        if val_df is not None:
            val_df = data_df.iloc[n_train:n_train + n_val]
        test_df = data_df.iloc[n_train + n_val:]

        # The categorical columns are already encoded; load_data must not encode them again.
        categorical_encode_type = None

    # None is a documented choice for "no normalisation"; accept it alongside
    # the string 'none' instead of rejecting it as an unknown method.
    if numerical_transformer_method is None or numerical_transformer_method == 'none':
        numerical_transformer = None
    else:
        if numerical_transformer_method == 'yeo_johnson':
            numerical_transformer = PowerTransformer(method='yeo-johnson')
        elif numerical_transformer_method == 'box_cox':
            numerical_transformer = PowerTransformer(method='box-cox')
        elif numerical_transformer_method == 'quantile_normal':
            numerical_transformer = QuantileTransformer(output_distribution='normal')
        else:
            raise ValueError(f'preprocessing transformer method '
                             f'{numerical_transformer_method} not implemented')
        # Fit on the training split only; val and test are transformed with the same fit.
        num_feats = load_num_feats(train_df, convert_to_func(numerical_cols))
        numerical_transformer.fit(num_feats)

    def _to_dataset(split_df):
        # All three splits share every argument except the DataFrame itself.
        return load_data(split_df,
                         text_cols,
                         tokenizer,
                         label_col,
                         label_list,
                         categorical_cols,
                         numerical_cols,
                         sep_text_token_str,
                         categorical_encode_type,
                         numerical_transformer,
                         empty_text_values,
                         replace_empty_text,
                         max_token_length,
                         debug)

    train_dataset = _to_dataset(train_df)
    test_dataset = _to_dataset(test_df)
    val_dataset = _to_dataset(val_df) if val_df is not None else None

    return train_dataset, val_dataset, test_dataset


def load_data(data_df,
              text_cols,
              tokenizer,
              label_col,
              label_list=None,
              categorical_cols=None,
              numerical_cols=None,
              sep_text_token_str=' ',
              categorical_encode_type='ohe',
              numerical_transformer=None,
              empty_text_values=None,
              replace_empty_text=None,
              max_token_length=None,
              debug=False,
              ):
    """Function to load a single dataset given a pandas DataFrame

    Given a DataFrame, this function builds the categorical features, the
    numerical features and the tokenized text, and wraps them together with the
    labels in a :obj:`TorchTabularTextDataset` object which can be used in a
    :obj:`torch.utils.data.DataLoader`.

    Args:
        data_df (:obj:`pd.DataFrame`): The DataFrame to convert to a TorchTabularTextDataset
        text_cols (:obj:`list` of :obj:`str`): the column names in the dataset that contain text
            from which we want to load
        tokenizer (:obj:`transformers.tokenization_utils.PreTrainedTokenizer`):
            HuggingFace tokenizer used to tokenize the input texts as specified by text_cols
        label_col (str): The column name of the label, for classification the column should have
            int values from 0 to n_classes-1 as the label for each class.
            For regression the column can have any numerical value
        label_list (:obj:`list` of :obj:`str`, optional): Used for classification;
            the names of the classes indexed by the values in label_col.
        categorical_cols (:obj:`list` of :obj:`str`, optional): The column names in the dataset that
            contain categorical features. The features can be already prepared numerically, or
            could be preprocessed by the method specified by categorical_encode_type
        numerical_cols (:obj:`list` of :obj:`str`, optional): The column names in the dataset that
            contain numerical features. These columns should contain only numeric values.
        sep_text_token_str (str, optional): The string token that is used to separate between the
            different text columns for a given data example. For Bert for example,
            this could be the [SEP] token. It is inserted with one space on each side.
        categorical_encode_type (str, optional): Given categorical_cols, this specifies
            what method we want to preprocess our categorical features.
            choices: [ 'ohe', 'binary', None]
            see encode_features.CategoricalFeatures for more details
        numerical_transformer (:obj:`sklearn.base.TransformerMixin`): The sklearn numeric
            transformer instance to transform our numerical features
        empty_text_values (:obj:`list` of :obj:`str`, optional): Specifies what texts should be
            considered as missing which would be replaced by replace_empty_text.
            Defaults to ``['nan', 'None']`` when not given.
        replace_empty_text (str, optional): The value of the string that will replace the texts
            that match with those in empty_text_values. If this argument is None then
            the text that match with empty_text_values will be skipped
        max_token_length (int, optional): The token length to pad or truncate to on the
            input text
        debug (bool, optional): Whether or not to load a smaller debug version of the dataset
            (only the first 500 rows are kept)

    Returns:
        :obj:`tabular_torch_dataset.TorchTabularTextDataset`: The converted dataset
    """
    if debug:
        # Debug mode works on the first 500 rows only.
        data_df = data_df[:500]
    if empty_text_values is None:
        empty_text_values = ['nan', 'None']

    # Column specifications go through convert_to_func before the data_utils helpers use them.
    text_cols_func, categorical_cols_func, numerical_cols_func = (
        convert_to_func(cols) for cols in (text_cols, categorical_cols, numerical_cols)
    )

    # Tabular features.
    categorical_feats, raw_numerical_feats = load_cat_and_num_feats(
        data_df,
        categorical_cols_func,
        numerical_cols_func,
        categorical_encode_type,
    )
    numerical_feats = normalize_numerical_feats(raw_numerical_feats, numerical_transformer)

    # Text: aggregate the matching text columns of each row, then join the
    # pieces of every row into one string around the separator token.
    agg_func = partial(agg_text_columns_func, empty_text_values, replace_empty_text)
    texts_cols = get_matching_cols(data_df, text_cols_func)
    logger.info(f'Text columns: {texts_cols}')
    separator = f' {sep_text_token_str} '
    row_texts = data_df[texts_cols].agg(agg_func, axis=1).tolist()
    texts_list = [separator.join(text) for text in row_texts]
    logger.info(f'Raw text example: {texts_list[0]}')

    hf_model_text_input = tokenizer(
        texts_list,
        padding=True,
        truncation=True,
        max_length=max_token_length,
    )
    first_example_tokens = tokenizer.convert_ids_to_tokens(hf_model_text_input['input_ids'][0])
    tokenized_text_ex = ' '.join(first_example_tokens)
    logger.debug(f'Tokenized text example: {tokenized_text_ex}')

    labels = data_df[label_col].values

    return TorchTabularTextDataset(
        hf_model_text_input,
        categorical_feats,
        numerical_feats,
        labels,
        data_df,
        label_list,
    )