Skip to content

Data Reference

This page documents the data sub-package.


medpipe.data.Preprocessor

Preprocessor class.

This class creates a Preprocessor to prepare data.

Preprocessor

Class that creates a Preprocessor.

Attributes:

Name Type Description
preprocess bool

Flag to preprocess data or not.

transform_seq dict[str, dict[str, list[str]]]

Transformation sequence for the data.

logger logging.Logger or None, default: None

Logger object to log prints. If None print to terminal.

Methods:

Name Description
__init__

Init method.

_clean_data

Cleans data in preparation for transformation.

fit_transform

Fits the operations and transforms the input data.

fit

Fits the operations based on input data.

transform

Transforms input data based on fitted operations.

Source code in src/medpipe/data/Preprocessor.py
class Preprocessor:
    """
    Class that creates a Preprocessor.

    Attributes
    ----------
    preprocess : bool
        Flag to preprocess data or not.
    transform_seq : dict[str, dict[str, list[str]]]
        Transformation sequence for the data.
    operations : dict
        Fitted preprocessing operations, keyed by operation name.
    logger : logging.Logger or None, default: None
        Logger object to log prints. If None print to terminal.

    Methods
    -------
    __init__(preprocessor_config, logger)
        Init method.
    _clean_data(X)
        Cleans data in preparation for transformation.
    fit_transform(X)
        Fits the operations and transforms the input data.
    fit(X)
        Fits the operations based on input data.
    transform(X)
        Transforms input data based on fitted operations.
    """

    def __init__(self, preprocessor_config, logger=None):
        """
        Initialise a Preprocessor class instance.

        Parameters
        ----------
        preprocessor_config : dict[str, dict[str, list[str]]]
            Configuration parameters for the preprocessor object.
            NOTE: the "preprocess" key is popped, so the caller's dict
            is mutated.
        logger : logging.Logger or None, default: None
            Logger object to log prints. If None print to terminal.

        Returns
        -------
        None
            Nothing is returned.

        """
        self.preprocess = preprocessor_config.pop("preprocess")
        self.transform_seq = preprocessor_config
        self.operations = dict()  # Empty dict to contain fitted operations
        self.logger = logger

    def _clean_data(self, X):
        """
        Cleans data before transformation.

        Removes rows with NaN values and converts objects to
        categoricals.

        Parameters
        ----------
        X : pd.Dataframe of shape (n_samples, n_features)
            Data to clean.

        Returns
        -------
        data : pd.Dataframe of shape (n_samples, n_features)
             Cleaned data.

        """
        # Convert objects to categorical (not saved so needs to be here)
        data = convert_object_to_categorical(X)

        # Count rows containing at least one NaN before dropping them
        nb_nan_rows = data.isna().any(axis=1).sum()
        data = data.dropna()

        print_message(
            f"Dropped {nb_nan_rows} rows with NaN values", self.logger, SCRIPT_NAME
        )

        return data

    def fit_transform(self, X):
        """
        Fits the operations and transforms the input data.

        Parameters
        ----------
        X : pd.Dataframe of shape (n_samples, n_features)
            Data to fit on and transform.

        Returns
        -------
        data : pd.Dataframe of shape (n_samples, n_features)
             Transformed data.

        """
        # Work on a copy so the caller's data is not mutated
        data = self._clean_data(deepcopy(X))

        if self.preprocess:
            # CONSISTENCY FIX: log message now matches fit()
            print_message("Fitting preprocessing operations", self.logger, SCRIPT_NAME)
            self.operations = fit_preprocess_operations(data, self.transform_seq)
            data = self.transform(data)

        return data

    def fit(self, X):
        """
        Fits the operations based on input data.

        Parameters
        ----------
        X : pd.Dataframe of shape (n_samples, n_features)
            Data to fit the operations on.

        Returns
        -------
        None
            Nothing is returned.

        """
        # CONSISTENCY FIX: deep-copy like fit_transform/transform so the
        # caller's DataFrame is not mutated by cleaning.
        data = self._clean_data(deepcopy(X))

        if self.preprocess:
            # Only fit when the preprocess flag is set
            print_message("Fitting preprocessing operations", self.logger, SCRIPT_NAME)
            self.operations = fit_preprocess_operations(data, self.transform_seq)

    def transform(self, X):
        """
        Transforms input data based on fitted operations.

        Parameters
        ----------
        X : pd.Dataframe of shape (n_samples, n_features)
            Data to transform.

        Returns
        -------
        data : pd.Dataframe of shape (n_samples, n_features)
             Transformed data.

        """
        # Work on a copy so the caller's data is not mutated
        data = self._clean_data(deepcopy(X))

        if self.preprocess:
            print_message("Preprocessing data", self.logger, SCRIPT_NAME)
            # Apply each fitted operation to its configured feature subset
            for operation in self.operations:
                features = self.transform_seq[operation]["feature_list"]

                if operation == "bin":
                    # Binning is stateless, so it is applied directly
                    transformed_data = bin_score(data[features])
                else:
                    transformed_data = self.operations[operation].transform(
                        data[features]
                    )
                data[features] = transformed_data

        return data

__init__(preprocessor_config, logger=None)

Initialise a Preprocessor class instance.

Parameters:

Name Type Description Default
preprocessor_config dict[str, dict[str, list[str]]]

Configuration parameters for the preprocessor object.

required
logger Logger or None

Logger object to log prints. If None print to terminal.

None

Returns:

Type Description
None

Nothing is returned.

Source code in src/medpipe/data/Preprocessor.py
def __init__(self, preprocessor_config, logger=None):
    """
    Initialise a Preprocessor class instance.

    Parameters
    ----------
    preprocessor_config : dict[str, dict[str, list[str]]]
        Configuration parameters for the preprocessor object.
    logger : logging.Logger or None, default: None
        Logger object to log prints. If None print to terminal.

    Returns
    -------
    None
        Nothing is returned.

    """
    # The "preprocess" flag is removed from the config; the remaining
    # entries describe the transformation sequence.
    self.preprocess = preprocessor_config.pop("preprocess")
    self.transform_seq = preprocessor_config
    self.logger = logger
    self.operations = {}  # Fitted operations will be stored here

fit(X)

Fits the operations based on input data.

Parameters:

Name Type Description Default
X pd.Dataframe of shape (n_samples, n_features)

Data to clean.

required

Returns:

Type Description
None

Nothing is returned.

Source code in src/medpipe/data/Preprocessor.py
def fit(self, X):
    """
    Fits the operations based on input data.

    Parameters
    ----------
    X : pd.Dataframe of shape (n_samples, n_features)
        Data to fit the operations on.

    Returns
    -------
    None
        Nothing is returned.

    """
    # CONSISTENCY FIX: deep-copy like fit_transform/transform so the
    # caller's DataFrame is not mutated by cleaning.
    data = self._clean_data(deepcopy(X))

    if self.preprocess:
        # Only fit when the preprocess flag is set
        print_message("Fitting preprocessing operations", self.logger, SCRIPT_NAME)
        self.operations = fit_preprocess_operations(data, self.transform_seq)

fit_transform(X)

Fits the operations and transforms the input data.

Parameters:

Name Type Description Default
X pd.Dataframe of shape (n_samples, n_features)

Data to clean.

required

Returns:

Name Type Description
data pd.Dataframe of shape (n_samples, n_features)

Transformed data.

Source code in src/medpipe/data/Preprocessor.py
def fit_transform(self, X):
    """
    Fits the operations and transforms the input data.

    Parameters
    ----------
    X : pd.Dataframe of shape (n_samples, n_features)
        Data to fit on and transform.

    Returns
    -------
    data : pd.Dataframe of shape (n_samples, n_features)
         Transformed data.

    """
    # Clean a deep copy so the caller's data is not mutated
    data = self._clean_data(deepcopy(X))

    if self.preprocess:
        # CONSISTENCY FIX: message now matches the one logged by fit()
        print_message("Fitting preprocessing operations", self.logger, SCRIPT_NAME)
        self.operations = fit_preprocess_operations(data, self.transform_seq)
        data = self.transform(data)

    return data

transform(X)

Transforms input data based on fitted operations.

Parameters:

Name Type Description Default
X pd.Dataframe of shape (n_samples, n_features)

Data to clean.

required

Returns:

Name Type Description
data pd.Dataframe of shape (n_samples, n_features)

Transformed data.

Source code in src/medpipe/data/Preprocessor.py
def transform(self, X):
    """
    Transforms input data based on fitted operations.

    Parameters
    ----------
    X : pd.Dataframe of shape (n_samples, n_features)
        Data to transform.

    Returns
    -------
    data : pd.Dataframe of shape (n_samples, n_features)
         Transformed data.

    """
    # Clean a deep copy so the caller's data stays untouched
    data = self._clean_data(deepcopy(X))

    if not self.preprocess:
        # Preprocessing disabled: return the cleaned data as-is
        return data

    print_message("Preprocessing data", self.logger, SCRIPT_NAME)
    # Apply every fitted operation to its configured feature subset
    for name, fitted in self.operations.items():
        cols = self.transform_seq[name]["feature_list"]

        if name == "bin":
            # Binning is stateless and applied directly
            data[cols] = bin_score(data[cols])
        else:
            data[cols] = fitted.transform(data[cols])

    return data

medpipe.data.db

Database functions module.

This module provides functions to open, query, and save data from databases.

Functions: - parquet_to_db: Converts a .parquet file to a .db file. - extract_data_from_db: Queries a SQL .db to extract data.

extract_data_from_db(db_file, query)

Extracts data from a .db file and returns it as a DataFrame.

The given SQL query is executed against the SQLite database and the result is returned as a pandas DataFrame.

Parameters:

Name Type Description Default
db_file str

Path to the .db file.

required
query str

Query to send to extract data.

required

Returns:

Name Type Description
data DataFrame

Extracted data from the database.

Raises:

Type Description
TypeError

If db_file or query is not a str.

FileNotFoundError

If db_file does not exist.

IsADirectoryError

If db_file is not a file.

ValueError

If db_file extension is not a .db file.

Source code in src/medpipe/data/db.py
def extract_data_from_db(db_file: str, query: str):
    """
    Extracts data from a .db file and returns it as a DataFrame.

    Parameters
    ----------
    db_file : str
        Path to the .db file.
    query : str
        Query to send to extract data.

    Returns
    -------
    data : pd.DataFrame
        Extracted data from the database.

    Raises
    ------
    TypeError
        If db_file or query is not a str.
    FileNotFoundError
        If db_file does not exist.
    IsADirectoryError
        If db_file is not a file.
    ValueError
        If db_file extension is not a .db file.

    """
    # file_checks raises directly; the previous try/except that only
    # re-raised added nothing.
    medpipe.utils.exceptions.file_checks(db_file, ".db")

    if not isinstance(query, str):
        raise TypeError(f"{query} should be a string")

    conn = sqlite3.connect(db_file)
    try:
        # NOTE(review): the query string is executed verbatim — do not
        # pass untrusted input (no parameterization is applied here).
        return pd.read_sql_query(query, conn)
    finally:
        # Close the connection even if the query fails
        conn.close()

parquet_to_db(parquet_file, db_file, table_name='main')

Converts a .parquet file to a .db file.

Parameters:

Name Type Description Default
parquet_file str

File path to the .parquet file.

required
db_file str

File path to the .db file.

required
table_name default: 'main'

Name of the table to create in the SQL database.

'main'

Returns:

Type Description
None

Nothing is returned.

Raises:

Type Description
TypeError

If parquet_file or db_file are not str.

FileNotFoundError

If parquet_file does not exist.

IsADirectoryError

If parquet_file or db_file are not a file.

ValueError

If parquet_file extension is not a .parquet file.

ValueError

If db_file extension is not a .db file.

Source code in src/medpipe/data/db.py
def parquet_to_db(parquet_file: str, db_file: str, table_name: str = "main") -> None:
    """
    Converts a .parquet file to a .db file.

    Parameters
    ----------
    parquet_file : str
        File path to the .parquet file.
    db_file : str
        File path to the .db file.
    table_name : str, default: 'main'
        Name of the table to create in the SQL database.

    Returns
    -------
    None
        Nothing is returned.

    Raises
    ------
    TypeError
        If parquet_file or db_file are not str.
    FileNotFoundError
        If parquet_file does not exist.
    IsADirectoryError
        If parquet_file or db_file are not a file.
    ValueError
        If parquet_file extension is not a .parquet file.
    ValueError
        If db_file extension is not a .db file.

    """
    # file_checks raises directly; the previous try/except blocks that
    # immediately re-raised added nothing.
    medpipe.utils.exceptions.file_checks(parquet_file, ".parquet")
    medpipe.utils.exceptions.file_checks(db_file, ".db", exists=False)

    # Read before connecting so a failed read does not leave behind an
    # empty database file.
    df = pd.read_parquet(parquet_file)

    conn = sqlite3.connect(db_file)
    try:
        # NOTE(review): table_name is interpolated into SQL unescaped —
        # do not pass untrusted input. The explicit DROP is redundant
        # with if_exists="replace" but kept for backward compatibility.
        conn.execute(f"DROP TABLE IF EXISTS {table_name}")
        df.to_sql(table_name, conn, if_exists="replace", index=False)
        conn.commit()
    finally:
        # Close the connection even on failure
        conn.close()

medpipe.data.preprocessing

Preprocessing functions module.

This module provides functions to preprocess data before training.

Functions: - train_test_it: Creates a KFold iterator to split data into test and train sets. - get_validation_idx: Removes some of the indices to create a validation set. - convert_object_to_categorical: Converts object columns to categoricals. - fit_preprocess_operations: Fits processing operations to data. - bin_score: Bins the M3 score into 5 categories. - extract_labels: Extracts prediction labels from data.

bin_score(data)

Bins the M3 score into 5 categories.

Parameters:

Name Type Description Default
data DataFrame

M3 score data.

required

Returns:

Name Type Description
binned_data DataFrame

Binned data.

Source code in src/medpipe/data/preprocessing.py
def bin_score(data):
    """
    Bins the M3 score into 5 categories.

    Scores are rounded up to the next integer and any value above 4 is
    capped at 4.

    Parameters
    ----------
    data : pd.DataFrame
        M3 score data.

    Returns
    -------
    binned_data : pd.DataFrame
        Binned data.

    """
    # Round up, then cap at the top category (4)
    return np.minimum(np.ceil(data), 4)

convert_object_to_categorical(data)

Converts all object columns of a DataFrame to categoricals.

Parameters:

Name Type Description Default
data DataFrame

DataFrame to manipulate.

required

Returns:

Name Type Description
processed_data DataFrame

Processed DataFrame.

Raises:

Type Description
TypeError

If data is not a pd.DataFrame.

Source code in src/medpipe/data/preprocessing.py
def convert_object_to_categorical(data: pd.DataFrame) -> pd.DataFrame:
    """
    Converts all object columns of a DataFrame to categoricals.

    Parameters
    ----------
    data : pd.DataFrame
        DataFrame to manipulate.

    Returns
    -------
    processed_data : pd.DataFrame
        Processed DataFrame with object columns converted to category
        dtype. The input DataFrame is left unmodified.

    Raises
    ------
    TypeError
        If data is not a pd.DataFrame.

    """
    if not isinstance(data, pd.DataFrame):
        raise TypeError(f"data should be a pd.DataFrame, but got {type(data)}")

    # BUG FIX: the previous code aliased the input ("processed_data = data"),
    # so the caller's DataFrame was mutated even though the comment claimed
    # a copy was made. Take a real copy of the frame.
    processed_data = data.copy()

    for column in data.select_dtypes(include=["object"]).columns:
        processed_data[column] = data[column].astype("category")

    return processed_data

extract_labels(data, labels)

Extracts the prediction labels from the training data.

Parameters:

Name Type Description Default
data DataFrame

DataFrame to manipulate.

required
labels list(str)

List of labels to extract from the data.

required

Returns:

Name Type Description
X DataFrame

DataFrame containing the data.

y array - like

Array containing the prediction labels.

Raises:

Type Description
TypeError

If data is not a pd.DataFrame.

TypeError

If labels is not list(str).

KeyError

If a prediction label is not a valid key.

Source code in src/medpipe/data/preprocessing.py
def extract_labels(data, labels):
    """
    Extracts the prediction labels from the training data.

    Parameters
    ----------
    data : pd.DataFrame
        DataFrame to manipulate.
    labels : list(str)
        List of labels to extract from the data.

    Returns
    -------
    X : pd.DataFrame
        DataFrame containing the data without the label columns.
    y : array-like
        Array of shape (n_samples, n_labels) containing the prediction
        labels.

    Raises
    ------
    TypeError
        If data is not a pd.DataFrame.
    TypeError
        If labels is not list(str).
    KeyError
        If a prediction label is not a valid key.

    """
    if not isinstance(data, pd.DataFrame):
        raise TypeError(f"data should be a pd.DataFrame, but got {type(data)}")

    if not isinstance(labels, list):
        raise TypeError(f"labels should be a list, but got {type(labels)}")

    # ROBUSTNESS FIX: check every element (the old code only inspected
    # labels[0] and raised IndexError on an empty list).
    for label in labels:
        if not isinstance(label, str):
            raise TypeError(f"labels should be a list(str), but got {type(label)}")

    X = data.drop(labels, axis=1)
    y = data[labels]

    return X, y.to_numpy()

fit_preprocess_operations(data, preprocessing_dict)

Fits processing operations to data.

Parameters:

Name Type Description Default
data DataFrame

DataFrame to manipulate.

required
preprocessing_dict dict[str, list[str]]

Dictionary of the operations and the features on which to operate.

required

Returns:

Name Type Description
operation_dict dict[]

Dictionary of the different preprocessing objects.

Raises:

Type Description
TypeError

If data is not a pd.DataFrame. If features is not a list(str).

KeyError

If a features is not a valid key.

ValueError

If preprocess is not a valid preprocessing function.

Source code in src/medpipe/data/preprocessing.py
def fit_preprocess_operations(data, preprocessing_dict):
    """
    Fits processing operations to data.

    Each operation is fitted on data already transformed by the previous
    operations, mirroring the order in which they are later applied.

    Parameters
    ----------
    data : pd.DataFrame
        DataFrame to manipulate.
    preprocessing_dict : dict[str, dict[str, list[str]]]
        Dictionary of the operations and the features on which to operate.

    Returns
    -------
    operation_dict : dict
        Dictionary of the different fitted preprocessing objects.

    Raises
    ------
    TypeError
        If data is not a pd.DataFrame.
        If features is not a list(str).
    KeyError
        If a feature is not a valid key.
    ValueError
        If preprocess is not a valid preprocessing function.

    """
    if not isinstance(data, pd.DataFrame):
        raise TypeError(f"data should be a pd.DataFrame, but got {type(data)}")

    # Fit on a copy so the caller's data stays untouched
    data_copy = deepcopy(data)
    operation_dict = dict()

    for preprocess, config in preprocessing_dict.items():
        features = config["feature_list"]

        if not isinstance(features, list):
            raise TypeError(f"features should be a list, but got {type(features)}")

        # ROBUSTNESS FIX: check every element (old code only inspected
        # features[0] and raised IndexError on an empty list); the error
        # message also had an unbalanced parenthesis.
        for feature in features:
            if not isinstance(feature, str):
                raise TypeError(
                    f"features should be a list(str), but got list({type(feature)})"
                )

        if preprocess == "bin":
            # Binning is stateless, so only a marker is stored
            operation_dict[preprocess] = "bin"
            continue

        if preprocess == "ordinal_encoder":
            operation = OrdinalEncoder()
        elif preprocess == "standardise":
            operation = StandardScaler()
        elif preprocess == "power_transform":
            operation = PowerTransformer()
        else:
            raise ValueError(f"{preprocess} invalid preprocessing function")

        # Fit, store, and apply so later operations see transformed data
        operation_dict[preprocess] = operation.fit(data_copy[features])
        data_copy[features] = operation_dict[preprocess].transform(
            data_copy[features]
        )

    return operation_dict

get_validation_idx(idx_list, groups=None, val_size=0.1)

Removes some of the indices to create a validation set.

If groups are provided, all the indices of the group with the largest value are selected as the validation set.

Parameters:

Name Type Description Default
idx_list array(n_samples)

Indices of the set to split.

required
groups Series(n_samples) or None

Groups to which the train indices belong. Must be numeric.

None
val_size float

Size of the validation set if groups are None.

0.1

Returns:

Name Type Description
train_idx array

Train indices.

val_idx array

Validation indices.

Source code in src/medpipe/data/preprocessing.py
def get_validation_idx(idx_list, groups=None, val_size=0.1):
    """
    Removes some of the indices to create a validation set.

    If groups are provided, all the indices of the group with the largest
    value are selected as the validation set.

    Parameters
    ----------
    idx_list : np.array(n_samples,)
        Indices of the set to split.
    groups : pd.Series(n_samples,) or None, default: None
        Groups to which the train indices belong. Must be numeric.
    val_size : float, default: 0.1
        Size of the validation set if groups are None.

    Returns
    -------
    train_idx : np.array
        Train indices.
    val_idx : np.array
        Validation indices.

    """
    array_check(idx_list)
    if groups is not None:
        # If groups are provided
        groups = groups.to_numpy()  # Convert to array
        # BUG FIX: this check previously re-validated idx_list (a
        # copy-paste slip); it was clearly meant to validate groups.
        array_check(groups)
        array_dim_check(idx_list, groups, dim=0)

        if not np.isscalar(groups[0]):
            raise ValueError(f"groups should be scalar but instead got {groups.dtype}")
        # The group with the largest value is held out for validation
        group_max = np.max(groups)
        val_idx = np.where(groups == group_max)[0]
        train_idx = np.where(groups != group_max)[0]

    else:
        # Random split with a fixed seed for reproducibility
        train_idx, val_idx = skl.model_selection.train_test_split(
            idx_list, test_size=val_size, random_state=42
        )

    return train_idx, val_idx

train_test_it(temporal_k_fold=False, **kwargs)

Creates a KFold iterator to split data into test and train sets.

Parameters:

Name Type Description Default
temporal_k_fold bool

If True, the data will be split using a group and a GroupKFold iterator is returned.

False
**kwargs

Extra arguments for the StratifiedKFold or GroupKFold class.

{}

Returns:

Name Type Description
kfold_it StratifiedKFold or GroupKFold

KFold iterator.

Raises:

Type Description
ValueError

If n_splits is less than 2.

Source code in src/medpipe/data/preprocessing.py
def train_test_it(temporal_k_fold=False, **kwargs):
    """
    Creates a KFold iterator to split data into test and train sets.

    Parameters
    ----------
    temporal_k_fold : bool, default: False
        If True, the data will be split using a group and a
        GroupKFold iterator is returned.
    **kwargs
        Extra arguments for the StratifiedKFold or GroupKFold class.

    Returns
    -------
    kfold_it : StratifiedKFold or GroupKFold
        KFold iterator.

    Raises
    ------
    ValueError
        If n_splits is less than 2.

    """
    # Create the correct argument dict for StratifiedKFold
    args_dict = dict()
    for key, value in kwargs.items():
        match key:
            case "random_state":
                if value == -1:
                    value = None
                args_dict.update({key: value})

            case "shuffle":
                args_dict.update({key: value})

            case "n_splits":
                if value < 2:
                    raise ValueError(
                        f"n_splits should be greater than 2, but got {value}"
                    )

                args_dict.update({key: value})

    if not temporal_k_fold:
        kfold_it = skl.model_selection.StratifiedKFold(**args_dict)
    else:
        kfold_it = skl.model_selection.GroupKFold(**args_dict)

    return kfold_it

medpipe.data.weighting

Weighting functions module.

This module provides functions to create sample weights to address class imbalance.

Functions: - inverse_frequency_multiclass_sample_weights: Create sample weights using the total number of samples over the number of positive and negative samples. - inverse_frequency_single_sample_weights: Create sample weights using the inverse frequency of positive and negative samples. - inverse_frequency_class_weights: Create class weights using inverse frequency of classes. - negative_positive_ratio_sample_weights: Create sample weights using the ratio between negative and positive classes. - negative_positive_ratio_class_weights: Create class weights using the ratio between negative and positive classes.

inverse_frequency_class_weights(labels)

Create class weights of the positive class using inverse frequency of the positive class.

Parameters:

Name Type Description Default
labels array - like

Binary prediction labels of shape (n_samples, n_classes)

required

Returns:

Name Type Description
class_weights array(n_classes)

Weight for each class.

Raises:

Type Description
TypeError

If labels is not array-like.

ValueError

If labels is empty.

ZeroDivisionError

If there are no positive labels.

Source code in src/medpipe/data/weighting.py
def inverse_frequency_class_weights(labels):
    """
    Create class weights of the positive class using inverse frequency of
    the positive class.

    Parameters
    ----------
    labels : array-like
        Binary prediction labels of shape (n_samples, n_classes)

    Returns
    -------
    class_weights : np.array(n_classes,)
        Weight for each class, computed as n_samples / n_positives per
        class.

    Raises
    ------
    TypeError
        If labels is not array-like.
    ValueError
        If labels is empty.
    ZeroDivisionError
        If a class has no positive labels.

    """
    array_check(labels)

    if len(labels) == 0:
        raise ValueError("The input labels are empty")

    pos_counts = np.sum(labels, axis=0)

    # BUG FIX: "pos_counts.any() == 0" compared a bool to 0 and only fired
    # when *every* class lacked positives; a single all-negative class
    # slipped through and produced inf weights. Check each class instead.
    if np.any(pos_counts == 0):
        raise ZeroDivisionError("No positive labels found")

    return len(labels) / pos_counts

inverse_frequency_multiclass_sample_weights(labels)

Create sample weights using the total number of samples over the number of positive and negative samples.

Each class has its own set of weights for positive and negative examples based on the number of positive and negative examples in that class.

Parameters:

Name Type Description Default
labels array - like

Binary prediction labels of shape (n_samples, n_classes)

required

Returns:

Name Type Description
sample_weights array(n_samples, n_classes)

Weight for each sample.

Raises:

Type Description
TypeError

If labels is not array-like.

ValueError

If labels is empty.

ZeroDivisionError

If there are no positive labels.

Notes

For each class, the weights are calculated as: len(labels) / (pos_weight + neg_weight), where pos_weight is an array of shape (n_samples, n_classes) for the positive examples with the total number of positive samples in each class, and neg_weight is similar but for the negative examples.

Source code in src/medpipe/data/weighting.py
def inverse_frequency_multiclass_sample_weights(labels):
    """
    Create sample weights using the total number of samples over the number of
    positive and negative samples.

    Each class has its own set of weights for positive and negative examples
    based on the number of positive and negative examples in that class.

    Parameters
    ----------
    labels : array-like
        Binary prediction labels of shape (n_samples, n_classes)

    Returns
    -------
    sample_weights : np.array(n_samples, n_classes)
        Weight for each sample.

    Raises
    ------
    TypeError
        If labels is not array-like.
    ValueError
        If labels is empty.
    ZeroDivisionError
        If a class has no positive labels.

    Notes
    -----
    For each class, the weights are calculated as:
        len(labels) / (pos_weight + neg_weight),
    where pos_weight is an array of shape (n_samples, n_classes) for the positive examples
    with the total number of positive samples in each class, and neg_weight is similar
    but for the negative examples.

    """
    array_check(labels)  # Check that labels is array-like

    if len(labels) == 0:
        raise ValueError("The input labels are empty")

    pos_counts = np.sum(labels, axis=0)
    neg_counts = len(labels) - pos_counts

    # BUG FIX: "pos_counts.any() == 0" compared a bool to 0 and only fired
    # when *every* class lacked positives; a single all-negative class
    # slipped through to a divide-by-zero. Check each class instead.
    if np.any(pos_counts == 0):
        raise ZeroDivisionError("No positive labels found")

    pos_weight = pos_counts * labels
    neg_weight = neg_counts * ~np.array(labels, dtype=bool)  # Invert for negatives
    return len(labels) / (pos_weight + neg_weight)

inverse_frequency_single_sample_weights(labels)

Create sample weights using the inverse frequency of positive and negative samples.

One set of weights is created and used for each class based on the total number of positive and negative examples. Weights are normalised so that negative weights are 1.

Parameters:

Name Type Description Default
labels array - like

Binary prediction labels of shape (n_samples, n_classes)

required

Returns:

Name Type Description
sample_weights array(n_samples)

Weight for each sample.

Raises:

Type Description
TypeError

If labels is not array-like.

ValueError

If labels is empty.

ZeroDivisionError

If there are no positive labels.

Source code in src/medpipe/data/weighting.py
def inverse_frequency_single_sample_weights(labels):
    """
    Create sample weights using the inverse frequency of positive
    and negative samples.

    One set of weights is created and used for each class based on the
    total number of positive and negative examples. Weights are normalised
    so that negative weights are 1.

    Parameters
    ----------
    labels : array-like
        Binary prediction labels of shape (n_samples, n_classes)

    Returns
    -------
    sample_weights : np.array(n_samples,)
        Weight for each sample.

    Raises
    ------
    TypeError
        If labels is not array-like.
    ValueError
        If labels is empty.
    ZeroDivisionError
        If there are no positive labels.

    """
    array_check(labels)  # Validate the input type

    n_samples = len(labels)
    if n_samples == 0:
        raise ValueError("The input labels are empty")

    # A sample counts as positive when any of its classes is positive
    is_positive = np.sum(labels, axis=1) > 0
    n_positive = np.sum(is_positive)

    if n_positive == 0:
        raise ZeroDivisionError("No positive labels found")

    # Start from weight 1 (negatives) and scale the positives by the
    # negative/positive ratio
    sample_weights = np.ones(n_samples)
    sample_weights[is_positive] *= (n_samples - n_positive) / n_positive

    return sample_weights

negative_positive_ratio_class_weights(labels)

Create class weights of the positive class using the ratio between the number of samples in the negative and positive classes.

Parameters:

Name Type Description Default
labels array - like

Binary prediction labels of shape (n_samples, n_classes)

required

Returns:

Name Type Description
class_weights array(n_classes)

Weight for each class.

Raises:

Type Description
TypeError

If labels is not array-like.

ValueError

If labels is empty.

ZeroDivisionError

If there are no positive labels.

Source code in src/medpipe/data/weighting.py
def negative_positive_ratio_class_weights(labels):
    """
    Create class weights of the positive class using the ratio
    between the number of samples in the negative and positive classes.

    Parameters
    ----------
    labels : array-like
        Binary prediction labels of shape (n_samples, n_classes)

    Returns
    -------
    class_weights : np.array(n_classes,)
        Weight for each class.

    Raises
    ------
    TypeError
        If labels is not array-like.
    ValueError
        If labels is empty.
    ZeroDivisionError
        If any class has no positive labels.

    """
    array_check(labels)

    if len(labels) == 0:
        raise ValueError("The input labels are empty")

    pos_counts = np.sum(labels, axis=0)
    neg_counts = len(labels) - pos_counts

    # BUG FIX: the previous check `pos_counts.any() == 0` only triggered when
    # *every* class had zero positives; a single zero-positive class slipped
    # through and produced a divide-by-zero below. Check per class instead.
    if np.any(pos_counts == 0):
        raise ZeroDivisionError("No positive labels found")

    return neg_counts / pos_counts

negative_positive_ratio_sample_weights(labels)

Create sample weights using the ratio between negative and positive samples.

Parameters:

Name Type Description Default
labels array - like

Binary prediction labels of shape (n_samples, n_classes)

required

Returns:

Name Type Description
sample_weights array(n_samples, n_classes)

Weight for each sample.

Raises:

Type Description
TypeError

If labels is not array-like.

ValueError

If labels is empty.

ZeroDivisionError

If there are no positive labels.

Source code in src/medpipe/data/weighting.py
def negative_positive_ratio_sample_weights(labels):
    """
    Create sample weights using the ratio between negative and
    positive samples.

    Parameters
    ----------
    labels : array-like
        Binary prediction labels of shape (n_samples, n_classes)

    Returns
    -------
    sample_weights : np.array(n_samples, n_classes)
        Weight for each sample.

    Raises
    ------
    TypeError
        If labels is not array-like.
    ValueError
        If labels is empty.
    ZeroDivisionError
        If any class has no positive labels.

    """
    array_check(labels)  # Check that labels is array-like

    if len(labels) == 0:
        raise ValueError("The input labels are empty")

    pos_counts = np.sum(labels, axis=0)
    neg_counts = len(labels) - pos_counts

    # BUG FIX: `pos_counts.any() == 0` was only True when every class had
    # zero positives; one zero-positive class would divide by zero below.
    if np.any(pos_counts == 0):
        raise ZeroDivisionError("No positive labels found")

    # Per-class positive weight where the label is 1, weight 1 where it is 0
    pos_weight = (neg_counts / pos_counts) * labels
    neg_weight = 1 * ~np.array(labels, dtype=bool)  # Invert for negatives

    return pos_weight + neg_weight

medpipe.data.sampler

Sampler functions module.

This module provides functions to sample the data to address class imbalance.

Functions:

- data_sampler: Samples the data and labels to adjust the class imbalance.
- random_undersampler: Randomly select labels to achieve the target ratio between minority and majority classes by undersampling majority class.
- group_random_undersampler: Randomly select labels to achieve the target ratio between minority and majority classes in each group.
- random_oversampler: Randomly select labels to achieve the target ratio between minority and majority classes by oversampling minority class.
- group_random_oversampler: Randomly select labels to achieve the target ratio between minority and majority classes in each group.
- mean_dist_sampler: Computes the mean data sample of the majority class and uses the distance to it to select examples.
- group_mean_dist_sampler: Computes the mean data sample of the majority class in each group and uses the distance to it to select examples.
- smote: Oversample minority class using Synthetic Minority Over-Sampling Technique (SMOTE).
- group_smote: Oversample minority class using Synthetic Minority Over-Sampling Technique (SMOTE) in each group.

data_sampler(data, labels, target_ratio=0.25, sampler_fn='random_undersampler', groups=None, **kwargs)

Samples the data and labels to adjust the class imbalance.

The majority class is assumed to have a False or 0 label. The new set will have an imbalance equal to: IR * target_ratio, where IR is the current imbalance ratio.

If the target ratio is too small, the algorithm defaults to obtain a balanced dataset.

Parameters:

Name Type Description Default
data DataFrame

Data to sample of shape (n_samples, n_features).

required
labels array - like

Binary prediction labels of shape (n_samples, n_classes).

required
target_ratio float

Target ratio between the minority and majority classes.

0.25
sampler_fn str

Sampler function to use to sample the data.

"random_undersampler"
groups Series or None

List containing groups for the group_sampler function.

None
**kwargs

Extra arguments for the sampler functions.

{}

Returns:

Name Type Description
X DataFrame

Sampled data.

y array

Sampled labels.

groups Series or None

Groups of the examples, None if not specified.

Raises:

Type Description
TypeError

If labels is not array-like.

ValueError

If target_ratio is less than 0.0.

Source code in src/medpipe/data/sampler.py
def data_sampler(
    data,
    labels,
    target_ratio=0.25,
    sampler_fn="random_undersampler",
    groups=None,
    **kwargs,
):
    """
    Samples the data and labels to adjust the class imbalance.

    The majority class is assumed to have a False or 0 label.
    The new set will have an imbalance equal to:
        IR * target_ratio, where IR is the current imbalance ratio.

    If the target ratio is too small, the algorithm defaults to
    obtain a balanced dataset.

    Parameters
    ----------
    data : pd.DataFrame
        Data to sample of shape (n_samples, n_features).
    labels : array-like
        Binary prediction labels of shape (n_samples, n_classes).
    target_ratio : float, default: 0.25
        Target ratio between the minority and majority classes.
        A value of 0 requests a fully balanced dataset.
    sampler_fn : str, default: "random_undersampler"
        Sampler function to use to sample the data. One of
        "random_undersampler", "group_random_undersampler",
        "random_oversampler", "group_random_oversampler",
        "mean_dist_sampler", "group_mean_dist_sampler",
        "smote" or "group_smote".
    groups : pd.Series or None, default: None
        List containing groups for the group_sampler function.
    **kwargs
        Extra arguments for the sampler functions: "hard_percent"
        (required by the mean-dist samplers) and "k_neighbors"
        (required by the SMOTE samplers).

    Returns
    -------
    X : pd.DataFrame
        Sampled data.
    y : np.array
        Sampled labels.
    groups : pd.Series or None
        Groups of the examples, None if not specified.

    Raises
    ------
    TypeError
        If labels is not array-like.
    ValueError
        If target_ratio is less than 0.0.
        If sampler_fn is not a recognised sampler name.

    """
    sample_idx = np.array([])  # Empty sample index

    if target_ratio > 0:
        # Current imbalance ratio: negative labels per positive label.
        # NOTE(review): np.sum(labels) counts all positive entries across
        # classes, not positive *examples* — confirm intent for multilabel.
        imbalance_ratio = (len(labels) - np.sum(labels)) / np.sum(labels)
        # Minority/majority ratio handed to the sampler functions below
        new_ratio = 1 / (imbalance_ratio * target_ratio)

        # NOTE(review): `imbalance_ratio * new_ratio < 1` algebraically
        # reduces to `target_ratio > 1`; if the intent is to cap the
        # sampler ratio at a balanced set, the condition looks like it
        # should be `new_ratio > 1` — confirm against tests.
        if (imbalance_ratio * new_ratio) < 1:
            # Set to 1 to get balanced dataset
            new_ratio = 1

    elif target_ratio == 0:
        new_ratio = 1  # Set to 1 to get balanced dataset
    else:
        raise ValueError(f"Target ratio should be positive, but got {target_ratio}")

    if groups is None:
        # Convert groups to an empty list if nothing is provided
        groups = np.array([])

    # Dispatch to the requested sampler. Index-based samplers fall through
    # to the shared selection below; SMOTE variants return early because
    # they generate new rows instead of selecting existing ones.
    match sampler_fn:
        case "random_undersampler":
            sample_idx = random_undersampler(labels, new_ratio)
        case "group_random_undersampler":
            sample_idx = group_random_undersampler(labels, new_ratio, groups)
        case "random_oversampler":
            sample_idx = random_oversampler(labels, new_ratio)
        case "group_random_oversampler":
            sample_idx = group_random_oversampler(labels, new_ratio, groups)
        case "mean_dist_sampler":
            sample_idx = mean_dist_sampler(
                data, labels, new_ratio, kwargs["hard_percent"]
            )
        case "group_mean_dist_sampler":
            sample_idx = group_mean_dist_sampler(
                data, labels, new_ratio, groups, kwargs["hard_percent"]
            )
        case "smote":
            # Append the generated rows to the originals; groups are dropped
            X_gen, y_gen = smote(data, labels, new_ratio, kwargs["k_neighbors"])
            return concat((data, X_gen)), np.concatenate((labels, y_gen)), None
        case "group_smote":
            return group_smote(data, labels, new_ratio, groups, kwargs["k_neighbors"])
        case _:
            raise ValueError(f"{sampler_fn} invalid sampler function")

    # Select the sampled rows from data/labels (and groups when provided)
    X = data.iloc[sample_idx]
    y = labels[sample_idx]

    if len(groups) != 0:
        return X, y, groups.iloc[sample_idx]

    return X, y, None

group_mean_dist_sampler(data, labels, target_ratio, groups, hard_percent=0.5)

Computes the mean data sample of the majority class in each group and uses the distance to it to select examples.

The examples are sorted based on their distance to the mean. The hardest examples are the ones that have the greatest distance to the mean and the easiest are the ones closest to the mean.

Parameters:

Name Type Description Default
data DataFrame

Data to sample of shape (n_samples, n_features).

required
labels array - like

Binary prediction labels of shape (n_samples, n_classes).

required
target_ratio float

Ratio of minority over majority classes to achieve.

required
groups array - like

List of groups in which labels belong of shape (n_samples,).

required
hard_percent float

Percentage of examples that are considered hard, between 0 and 1. If hard_percent is 0.5, half of the examples are chosen from the end of the sorted list and the other half from the beginning.

0.5

Returns:

Name Type Description
sample_idx array(n_samples)

Index list of examples to achieve target ratio.

Raises:

Type Description
TypeError

If labels is not array-like.

ValueError

If labels and group do not have the same dimension.

Source code in src/medpipe/data/sampler.py
def group_mean_dist_sampler(data, labels, target_ratio, groups, hard_percent=0.5):
    """
    Computes the mean data sample of the majority class in each group and
    uses the distance to it to select examples.

    The examples are sorted based on their distance to the mean.
    The hardest examples are the ones that have the greatest distance to
    the mean and the easiest are the ones closest to the mean.

    Parameters
    ----------
    data : pd.DataFrame
        Data to sample of shape (n_samples, n_features).
    labels : array-like
        Binary prediction labels of shape (n_samples, n_classes).
    target_ratio : float
        Ratio of minority over majority classes to achieve.
    groups : array-like
        List of groups in which labels belong of shape (n_samples,).
    hard_percent : float, default: 0.5
        Percentage of examples that are considered hard, between 0 and 1.
        If hard_percent is 0.5, half of the examples are chosen from
        the end of the sorted list and the other half from the beginning.

    Returns
    -------
    sample_idx : np.array(n_samples,)
        Index list of examples to achieve target ratio.

    Raises
    ------
    TypeError
        If labels is not array-like.
    ValueError
        If labels and group do not have the same dimension.

    """
    array_check(labels)
    array_dim_check(labels, groups, dim=0)

    features = deepcopy(data)  # Work on a copy so the caller's data is untouched

    # Drop the group column (if present) so it does not skew the mean
    if groups.name in features.columns:
        features = features.drop(groups.name, axis=1)

    # Run the mean-distance sampler within each group and map the selected
    # positions back to indices into the full dataset.
    per_group_idx = [np.array([], dtype=int)]
    for group in np.unique(groups):
        members = np.where(groups == group)[0]
        selected = mean_dist_sampler(
            features.iloc[members], labels[members], target_ratio, hard_percent
        )
        per_group_idx.append(members[selected])

    return np.concatenate(per_group_idx)

group_random_oversampler(labels, target_ratio, groups)

Randomly select labels to achieve the target ratio between minority and majority classes in each group.

Parameters:

Name Type Description Default
labels array - like

Binary prediction labels of shape (n_samples, n_classes).

required
target_ratio float

Ratio of minority over majority classes to achieve.

required
groups array - like

List of groups in which labels belong of shape (n_samples,).

required

Returns:

Name Type Description
sample_idx array(n_samples)

Index list of examples to achieve target ratio.

Raises:

Type Description
TypeError

If labels is not array-like.

ValueError

If labels and group do not have the same dimension. If target_ratio is less than 0.0.

Source code in src/medpipe/data/sampler.py
def group_random_oversampler(labels, target_ratio, groups):
    """
    Randomly select labels to achieve the target ratio between minority and
    majority classes in each group.

    Parameters
    ----------
    labels : array-like
        Binary prediction labels of shape (n_samples, n_classes).
    target_ratio : float
        Ratio of minority over majority classes to achieve.
    groups : array-like
        List of groups in which labels belong of shape (n_samples,).

    Returns
    -------
    sample_idx : np.array(n_samples,)
        Index list of examples to achieve target ratio.

    Raises
    ------
    TypeError
        If labels is not array-like.
    ValueError
        If labels and group do not have the same dimension.
        If target_ratio is less than 0.0.

    """
    array_check(labels)
    array_dim_check(labels, groups, dim=0)

    if target_ratio <= 0:
        raise ValueError(f"Target ratio should be positive, but got {target_ratio}")

    # Oversample within each group, translating the per-group selections
    # back into indices over the full label array.
    chosen = [np.array([], dtype=int)]
    for grp in np.unique(groups):
        members = np.where(groups == grp)[0]
        chosen.append(members[random_oversampler(labels[members], target_ratio)])

    return np.concatenate(chosen)

group_random_undersampler(labels, target_ratio, groups)

Randomly select labels to achieve the target ratio between minority and majority classes in each group.

Parameters:

Name Type Description Default
labels array - like

Binary prediction labels of shape (n_samples, n_classes).

required
target_ratio float

Ratio of minority over majority classes to achieve.

required
groups array - like

List of groups in which labels belong of shape (n_samples,).

required

Returns:

Name Type Description
sample_idx array(n_samples)

Index list of examples to achieve target ratio.

Raises:

Type Description
TypeError

If labels is not array-like.

ValueError

If labels and group do not have the same dimension. If target_ratio is less than 0.0.

Source code in src/medpipe/data/sampler.py
def group_random_undersampler(labels, target_ratio, groups):
    """
    Randomly select labels to achieve the target ratio between minority and
    majority classes in each group.

    Parameters
    ----------
    labels : array-like
        Binary prediction labels of shape (n_samples, n_classes).
    target_ratio : float
        Ratio of minority over majority classes to achieve.
    groups : array-like
        List of groups in which labels belong of shape (n_samples,).

    Returns
    -------
    sample_idx : np.array(n_samples,)
        Index list of examples to achieve target ratio.

    Raises
    ------
    TypeError
        If labels is not array-like.
    ValueError
        If labels and group do not have the same dimension.
        If target_ratio is less than 0.0.

    """
    array_check(labels)
    array_dim_check(labels, groups, dim=0)

    if target_ratio <= 0:
        raise ValueError(f"Target ratio should be positive, but got {target_ratio}")

    # Undersample within each group, translating the per-group selections
    # back into indices over the full label array.
    chosen = [np.array([], dtype=int)]
    for grp in np.unique(groups):
        members = np.where(groups == grp)[0]
        chosen.append(members[random_undersampler(labels[members], target_ratio)])

    return np.concatenate(chosen)

group_smote(data, labels, target_ratio, groups, k_neighbors)

Oversample minority class using Synthetic Minority Over-Sampling Technique (SMOTE) in each group.

Parameters:

Name Type Description Default
data DataFrame

Data to sample of shape (n_samples, n_features).

required
labels array - like

Binary prediction labels of shape (n_samples, n_classes).

required
target_ratio float

Ratio of minority over majority classes to achieve.

required
groups array - like

List of groups in which labels belong of shape (n_samples,).

required
k_neighbors int

Number of neighbors to use for SMOTE knn.

required

Returns:

Name Type Description
X DataFrame

Original data with the generated data appended.

y array

Original labels with the generated labels appended.

grps Series

Original groups with the generated groups appended.

Raises:

Type Description
TypeError

If labels is not array-like.

ValueError

If labels and group do not have the same dimension.

Source code in src/medpipe/data/sampler.py
def group_smote(data, labels, target_ratio, groups, k_neighbors):
    """
    Oversample minority class using Synthetic Minority Over-Sampling Technique
    (SMOTE) in each group.

    Parameters
    ----------
    data : pd.DataFrame
        Data to sample of shape (n_samples, n_features).
    labels : array-like
        Binary prediction labels of shape (n_samples, n_classes).
    target_ratio : float
        Ratio of minority over majority classes to achieve.
    groups : pd.Series
        List of groups in which labels belong of shape (n_samples,).
    k_neighbors : int
        Number of neighbors to use for SMOTE knn.

    Returns
    -------
    X : pd.DataFrame
        Original data with the generated data appended.
    y : np.array
        Original labels with the generated labels appended.
    grps : pd.Series
        Original groups with the generated groups appended.

    Raises
    ------
    TypeError
        If labels is not array-like.
    ValueError
        If labels and group do not have the same dimension.

    """
    array_check(labels)
    array_dim_check(labels, groups, dim=0)
    # Copies so the caller's inputs are not mutated while we append rows
    X = deepcopy(data)
    y = deepcopy(labels)
    grps = deepcopy(groups)

    # FIX: removed an unused `sample_idx` accumulator left over from the
    # index-based samplers; group_smote appends generated rows instead.
    for group in np.unique(groups):
        group_idx = np.where(groups == group)[0]

        # Generate new synthetic examples for this group only
        X_gen, y_gen = smote(
            data.iloc[group_idx], labels[group_idx], target_ratio, k_neighbors
        )

        # Tag every generated row with the current group id.
        # NOTE(review): `group * np.ones(...)` assumes numeric group ids —
        # confirm against the callers.
        group_gen = group * np.ones(y_gen.shape[0])
        X = concat((X, X_gen))
        y = np.concatenate((y, y_gen))
        grps = concat((grps, Series(group_gen.squeeze(), name=grps.name)))

    return X, y, grps

mean_dist_sampler(data, labels, target_ratio, hard_percent=0.5)

Computes the mean data sample of the majority class and uses the distance to it to select examples.

The examples are sorted based on their distance to the mean. The hardest examples are the ones that have the greatest distance to the mean and the easiest are the ones closest to the mean.

Parameters:

Name Type Description Default
data DataFrame

Data to sample of shape (n_samples, n_features).

required
labels array - like

Binary prediction labels of shape (n_samples, n_classes).

required
target_ratio float

Ratio of minority over majority classes to achieve.

required
hard_percent float

Percentage of examples that are considered hard, between 0 and 1. If hard_percent is 0.5, half of the examples are chosen from the end of the sorted list and the other half from the beginning.

0.5

Returns:

Name Type Description
sample_idx array(n_samples)

Index list of examples to achieve target ratio.

Raises:

Type Description
TypeError

If labels is not array-like.

ValueError

If hard_percent is not between 0 and 1. If target_ratio is less than 0.0.

Source code in src/medpipe/data/sampler.py
def mean_dist_sampler(data, labels, target_ratio, hard_percent=0.5):
    """
    Computes the mean data sample of the majority class and uses the
    distance to it to select examples.

    The examples are sorted based on their distance to the mean.
    The hardest examples are the ones that have the greatest distance to
    the mean and the easiest are the ones closest to the mean.

    Parameters
    ----------
    data : pd.DataFrame
        Data to sample of shape (n_samples, n_features).
    labels : array-like
        Binary prediction labels of shape (n_samples, n_classes).
    target_ratio : float
        Ratio of minority over majority classes to achieve.
    hard_percent : float, default: 0.5
        Percentage of examples that are considered hard, between 0 and 1.
        If hard_percent is 0.5, half of the examples are chosen from
        the end of the sorted list and the other half from the beginning.

    Returns
    -------
    sample_idx : np.array(n_samples,)
        Index list of examples (all minority plus the selected majority)
        to achieve the target ratio. Indices are positions in `data`.

    Raises
    ------
    TypeError
        If labels is not array-like.
    ValueError
        If hard_percent is not between 0 and 1.
        If target_ratio is not positive.

    """
    array_check(labels)
    if hard_percent > 1 or hard_percent < 0:
        raise ValueError(
            f"hard_percent should be between 0 and 1, but got {hard_percent}"
        )
    if target_ratio <= 0:
        raise ValueError(f"Target ratio should be positive, but got {target_ratio}")

    label_sums = np.sum(labels, axis=1)  # Example is positive if any class is 1
    n_min_class = np.sum(label_sums != 0)  # Minority class examples
    n_maj_class = np.round(n_min_class / target_ratio)  # Majority examples to keep

    # Global positions of minority and majority examples within `data`
    min_idx = np.where(label_sums > 0)[0]
    maj_idx = np.where(label_sums == 0)[0]

    maj_class_data = data.iloc[maj_idx]
    mean_maj_class = np.mean(maj_class_data, axis=0)

    # Sort majority examples by their distance to the majority mean.
    # BUG FIX: argsort yields positions *within the majority subset*; map
    # them through maj_idx so the returned indices address the full data.
    # Previously the subset-local indices were returned directly and the
    # minority examples were dropped entirely, so callers (e.g.
    # data_sampler's `data.iloc[sample_idx]`) selected the wrong rows.
    dist = np.linalg.norm(mean_maj_class - maj_class_data, axis=1)
    sorted_dist_idx = maj_idx[np.argsort(dist)]

    n_hard = round(n_maj_class * hard_percent)
    n_easy = round(n_maj_class * (1 - hard_percent))
    # BUG FIX: guard n_hard == 0 — a `[-0:]` slice selects every majority
    # example instead of none.
    hard_samples_idx = sorted_dist_idx[-n_hard:] if n_hard > 0 else sorted_dist_idx[:0]
    easy_samples_idx = sorted_dist_idx[:n_easy]

    return np.concatenate((min_idx, easy_samples_idx, hard_samples_idx))

random_oversampler(labels, target_ratio)

Randomly select labels to achieve the target ratio between minority and majority classes by oversampling minority class.

Parameters:

Name Type Description Default
labels array - like

Binary prediction labels of shape (n_samples, n_classes).

required
target_ratio float

Ratio of minority over majority classes to achieve.

required

Returns:

Name Type Description
sample_idx array(n_samples)

Index list of examples to achieve target ratio.

Raises:

Type Description
TypeError

If labels is not array-like.

ValueError

If target_ratio is less than 0.0.

Source code in src/medpipe/data/sampler.py
def random_oversampler(labels, target_ratio):
    """
    Randomly select labels to achieve the target ratio between minority and
    majority classes by oversampling minority class.

    Parameters
    ----------
    labels : array-like
        Binary prediction labels of shape (n_samples, n_classes).
    target_ratio : float
        Ratio of minority over majority classes to achieve.

    Returns
    -------
    sample_idx : np.array(n_samples,)
        Index list of examples to achieve target ratio.

    Raises
    ------
    TypeError
        If labels is not array-like.
    ValueError
        If target_ratio is less than 0.0.

    """
    array_check(labels)  # Check that labels is array-like
    if target_ratio <= 0:
        raise ValueError(f"Target ratio should be positive, but got {target_ratio}")

    # An example is minority (positive) when at least one class label is 1
    positive_mask = np.sum(labels, axis=1) > 0
    n_minority = np.sum(positive_mask)
    n_majority = len(labels) - n_minority

    if n_minority == 0:
        raise ValueError("No minority examples found")

    # Keep every majority example; draw minority examples with replacement
    # until the requested minority/majority ratio is reached.
    majority_idx = np.where(~positive_mask)[0]
    minority_idx = np.random.choice(
        np.where(positive_mask)[0], size=int(n_majority * target_ratio), replace=True
    )

    return np.concatenate((minority_idx, majority_idx))

random_undersampler(labels, target_ratio)

Randomly select labels to achieve the target ratio between minority and majority classes by undersampling majority class.

Parameters:

Name Type Description Default
labels array - like

Binary prediction labels of shape (n_samples, n_classes).

required
target_ratio float

Ratio of minority over majority classes to achieve.

required

Returns:

Name Type Description
sample_idx array(n_samples)

Index list of examples to achieve target ratio.

Raises:

Type Description
TypeError

If labels is not array-like.

ValueError

If target_ratio is less than 0.0.

Source code in src/medpipe/data/sampler.py
def random_undersampler(labels, target_ratio):
    """
    Randomly select labels to achieve the target ratio between minority and
    majority classes by undersampling majority class.

    Parameters
    ----------
    labels : array-like
        Binary prediction labels of shape (n_samples, n_classes).
    target_ratio : float
        Ratio of minority over majority classes to achieve. If the data
        already meets or exceeds this ratio, every majority example is kept.

    Returns
    -------
    sample_idx : np.array(n_samples,)
        Index list of examples to achieve target ratio.

    Raises
    ------
    TypeError
        If labels is not array-like.
    ValueError
        If target_ratio is not positive.

    """
    array_check(labels)  # Check that labels is array-like

    if target_ratio <= 0:
        raise ValueError(f"Target ratio should be positive, but got {target_ratio}")

    label_sums = np.sum(labels, axis=1)  # Sum to find example with at least one 1
    n_min_class = np.sum(label_sums != 0)  # Minority class examples
    n_maj_class = np.round(n_min_class / target_ratio)  # Majority class examples

    min_idx = np.where(label_sums > 0)[0]
    maj_pool = np.where(label_sums == 0)[0]

    # ROBUSTNESS FIX: `np.random.choice(..., replace=False)` raises when
    # asked for more examples than exist; cap the draw at the pool size so
    # an already-satisfied target keeps every majority example instead of
    # crashing.
    n_keep = min(int(n_maj_class), len(maj_pool))
    maj_idx = np.random.choice(maj_pool, size=n_keep, replace=False)

    return np.concatenate((min_idx, maj_idx))

smote(data, labels, target_ratio, k_neighbors)

Oversample minority class using Synthetic Minority Over-Sampling Technique (SMOTE).

Parameters:

Name Type Description Default
data DataFrame

Data to sample of shape (n_samples, n_features).

required
labels array - like

Binary prediction labels of shape (n_samples, n_classes).

required
target_ratio float

Ratio of minority over majority classes to achieve.

required
k_neighbors int

Number of neighbors to use for SMOTE knn.

required

Returns:

Name Type Description
X_gen DataFrame

Generated data.

multilabels_gen array

Generated labels.

Raises:

Type Description
TypeError

If labels is not array-like.

ValueError

If target_ratio is less than 0.0.

Source code in src/medpipe/data/sampler.py
def smote(data, labels, target_ratio, k_neighbors):
    """
    Oversample minority class using Synthetic Minority Over-Sampling Technique
    (SMOTE).

    Multilabel rows are mapped to unique label combinations so SMOTE can
    treat each combination as one class; the generated rows are then mapped
    back to their multilabel form.

    Parameters
    ----------
    data : pd.DataFrame
        Data to sample of shape (n_samples, n_features).
    labels : array-like
        Binary prediction labels of shape (n_samples, n_classes).
    target_ratio : float
        Ratio of minority over majority classes to achieve.
    k_neighbors : int
        Number of neighbors to use for SMOTE knn.

    Returns
    -------
    X_gen : pd.DataFrame
        Generated data.
    multilabels_gen : np.array
        Generated labels.

    Raises
    ------
    TypeError
        If labels is not array-like.
    ValueError
        If target_ratio is not positive.

    """
    array_check(labels)
    X = deepcopy(data)  # Copy so the caller's data is not modified

    if target_ratio <= 0:
        raise ValueError(f"Target ratio should be positive, but got {target_ratio}")

    label_sums = np.sum(labels, axis=1)  # Sum to find example with at least one 1
    n_maj_class = np.sum(label_sums == 0)  # Majority class examples
    # Synthetic minority rows still needed to reach the target ratio.
    # NOTE(review): this can be negative when the data already exceeds the
    # target ratio, which would make np.random.choice below fail — confirm
    # callers guarantee target_ratio above the current ratio.
    n_min_class = np.round(n_maj_class * target_ratio) - np.sum(
        label_sums > 0
    )  # Minority class examples

    # Convert labels into unique classes
    unique_multilabels, class_labels = np.unique(labels, axis=0, return_inverse=True)

    # NOTE(review): the selection below assumes fit_resample keeps the
    # original rows first and appends generated ones after position
    # len(labels) — confirm for the imblearn version in use.
    sm = SMOTE(k_neighbors=k_neighbors)
    X_gen, y_gen = sm.fit_resample(X, class_labels)

    # HACK: domain-specific column; synthetic values are rounded back to
    # the original categorical encoding.
    if "SEX_ORIGINAL" in X_gen.columns:
        X_gen["SEX_ORIGINAL"] = X_gen["SEX_ORIGINAL"].round()

    min_idx = np.random.choice(  # Select examples so that target ratio is achieved
        np.arange(len(labels), len(y_gen)), size=int(n_min_class), replace=False
    )
    # Map the generated class ids back to their multilabel rows
    return X_gen.iloc[min_idx], unique_multilabels[y_gen[min_idx]]