Skip to content

Data Reference

This page documents the data sub-package.


medpipe.data.Preprocessor

Preprocessor class.

This class creates a Preprocessor to prepare data.

Preprocessor

Class that creates a Preprocessor.

Attributes:

Name Type Description
preprocess bool

Flag to preprocess data or not.

transform_seq dict[str, dict[str, list[str]]]

Transformation sequence for the data.

logger logging.Logger or None, default: None

Logger object to log prints. If None print to terminal.

Methods:

Name Description
__init__

Init method.

_clean_data

Cleans data in preparation for transformation.

fit_transform

Fits the operations and transforms the input data.

fit

Fits the operations based on input data.

transform

Transforms input data based on fitted operations.

Source code in src/medpipe/data/Preprocessor.py
class Preprocessor:
    """
    Class that creates a Preprocessor.

    Attributes
    ----------
    preprocess : bool
        Flag to preprocess data or not.
    transform_seq : dict[str, dict[str, list[str]]]
        Transformation sequence for the data.
    operations : dict
        Fitted preprocessing operations, keyed by operation name.
    logger : logging.Logger or None, default: None
        Logger object to log prints. If None print to terminal.

    Methods
    -------
    __init__(preprocessor_config, logger)
        Init method.
    _clean_data(X)
        Cleans data in preparation for transformation.
    fit_transform(X)
        Fits the operations and transforms the input data.
    fit(X)
        Fits the operations based on input data.
    transform(X)
        Transforms input data based on fitted operations.
    """

    def __init__(self, preprocessor_config, logger=None):
        """
        Initialise a Preprocessor class instance.

        Parameters
        ----------
        preprocessor_config : dict[str, dict[str, list[str]]]
            Configuration parameters for the preprocessor object.
            NOTE: the "preprocess" key is popped, so the caller's dict
            is mutated.
        logger : logging.Logger or None, default: None
            Logger object to log prints. If None print to terminal.

        Returns
        -------
        None
            Nothing is returned.

        """
        self.preprocess = preprocessor_config.pop("preprocess")
        self.transform_seq = preprocessor_config
        self.operations = dict()  # Empty dict to contain fitted operations
        self.logger = logger

    def _clean_data(self, X):
        """
        Cleans data before transformation.

        Removes rows with NaN values and converts objects to
        categoricals.

        Parameters
        ----------
        X : pd.Dataframe of shape (n_samples, n_features)
            Data to clean.

        Returns
        -------
        data : pd.Dataframe of shape (n_samples, n_features)
             Cleaned data.

        """
        # Convert objects to categorical (not saved so needs to be here)
        data = convert_object_to_categorical(X)

        # Count rows containing at least one NaN before dropping them
        nb_nan_rows = data.isna().any(axis=1).sum()
        data = data.dropna()

        print_message(
            f"Dropped {nb_nan_rows} rows with NaN values", self.logger, SCRIPT_NAME
        )

        return data

    def fit_transform(self, X):
        """
        Fits the operations and transforms the input data.

        Parameters
        ----------
        X : pd.Dataframe of shape (n_samples, n_features)
            Data to fit on and transform.

        Returns
        -------
        data : pd.Dataframe of shape (n_samples, n_features)
             Transformed data.

        """
        # Work on a copy so the caller's data is not mutated
        data = self._clean_data(deepcopy(X))

        if self.preprocess:
            # CONSISTENCY FIX: log message now matches fit()
            print_message("Fitting preprocessing operations", self.logger, SCRIPT_NAME)
            self.operations = fit_preprocess_operations(data, self.transform_seq)
            data = self.transform(data)

        return data

    def fit(self, X):
        """
        Fits the operations based on input data.

        Parameters
        ----------
        X : pd.Dataframe of shape (n_samples, n_features)
            Data to fit the operations on.

        Returns
        -------
        None
            Nothing is returned.

        """
        # CONSISTENCY FIX: deep-copy like fit_transform/transform so the
        # caller's DataFrame is not mutated by cleaning.
        data = self._clean_data(deepcopy(X))

        if self.preprocess:
            # Only fit when the preprocess flag is set
            print_message("Fitting preprocessing operations", self.logger, SCRIPT_NAME)
            self.operations = fit_preprocess_operations(data, self.transform_seq)

    def transform(self, X):
        """
        Transforms input data based on fitted operations.

        Parameters
        ----------
        X : pd.Dataframe of shape (n_samples, n_features)
            Data to transform.

        Returns
        -------
        data : pd.Dataframe of shape (n_samples, n_features)
             Transformed data.

        """
        # Work on a copy so the caller's data is not mutated
        data = self._clean_data(deepcopy(X))

        if self.preprocess:
            print_message("Preprocessing data", self.logger, SCRIPT_NAME)
            # Apply each fitted operation to its configured feature subset
            for operation in self.operations:
                features = self.transform_seq[operation]["feature_list"]

                if operation == "bin":
                    # Binning is stateless, so it is applied directly
                    transformed_data = bin_score(data[features])
                else:
                    transformed_data = self.operations[operation].transform(
                        data[features]
                    )
                data[features] = transformed_data

        return data

__init__(preprocessor_config, logger=None)

Initialise a Preprocessor class instance.

Parameters:

Name Type Description Default
preprocessor_config dict[str, dict[str, list[str]]]

Configuration parameters for the preprocessor object.

required
logger Logger or None

Logger object to log prints. If None print to terminal.

None

Returns:

Type Description
None

Nothing is returned.

Source code in src/medpipe/data/Preprocessor.py
def __init__(self, preprocessor_config, logger=None):
    """
    Initialise a Preprocessor class instance.

    Parameters
    ----------
    preprocessor_config : dict[str, dict[str, list[str]]]
        Configuration parameters for the preprocessor object.
    logger : logging.Logger or None, default: None
        Logger object to log prints. If None print to terminal.

    Returns
    -------
    None
        Nothing is returned.

    """
    # The "preprocess" flag is removed from the config; the remaining
    # entries describe the transformation sequence.
    self.preprocess = preprocessor_config.pop("preprocess")
    self.transform_seq = preprocessor_config
    self.logger = logger
    self.operations = {}  # Fitted operations will be stored here

fit(X)

Fits the operations based on input data.

Parameters:

Name Type Description Default
X pd.Dataframe of shape (n_samples, n_features)

Data to clean.

required

Returns:

Type Description
None

Nothing is returned.

Source code in src/medpipe/data/Preprocessor.py
def fit(self, X):
    """
    Fits the operations based on input data.

    Parameters
    ----------
    X : pd.Dataframe of shape (n_samples, n_features)
        Data to fit the operations on.

    Returns
    -------
    None
        Nothing is returned.

    """
    # CONSISTENCY FIX: deep-copy like fit_transform/transform so the
    # caller's DataFrame is not mutated by cleaning.
    data = self._clean_data(deepcopy(X))

    if self.preprocess:
        # Only fit when the preprocess flag is set
        print_message("Fitting preprocessing operations", self.logger, SCRIPT_NAME)
        self.operations = fit_preprocess_operations(data, self.transform_seq)

fit_transform(X)

Fits the operations and transforms the input data.

Parameters:

Name Type Description Default
X pd.Dataframe of shape (n_samples, n_features)

Data to clean.

required

Returns:

Name Type Description
data pd.Dataframe of shape (n_samples, n_features)

Transformed data.

Source code in src/medpipe/data/Preprocessor.py
def fit_transform(self, X):
    """
    Fits the operations and transforms the input data.

    Parameters
    ----------
    X : pd.Dataframe of shape (n_samples, n_features)
        Data to fit on and transform.

    Returns
    -------
    data : pd.Dataframe of shape (n_samples, n_features)
         Transformed data.

    """
    # Clean a deep copy so the caller's data is not mutated
    data = self._clean_data(deepcopy(X))

    if self.preprocess:
        # CONSISTENCY FIX: message now matches the one logged by fit()
        print_message("Fitting preprocessing operations", self.logger, SCRIPT_NAME)
        self.operations = fit_preprocess_operations(data, self.transform_seq)
        data = self.transform(data)

    return data

transform(X)

Transforms input data based on fitted operations.

Parameters:

Name Type Description Default
X pd.Dataframe of shape (n_samples, n_features)

Data to clean.

required

Returns:

Name Type Description
data pd.Dataframe of shape (n_samples, n_features)

Transformed data.

Source code in src/medpipe/data/Preprocessor.py
def transform(self, X):
    """
    Transforms input data based on fitted operations.

    Parameters
    ----------
    X : pd.Dataframe of shape (n_samples, n_features)
        Data to transform.

    Returns
    -------
    data : pd.Dataframe of shape (n_samples, n_features)
         Transformed data.

    """
    # Clean a deep copy so the caller's data stays untouched
    data = self._clean_data(deepcopy(X))

    if not self.preprocess:
        # Preprocessing disabled: return the cleaned data as-is
        return data

    print_message("Preprocessing data", self.logger, SCRIPT_NAME)
    # Apply every fitted operation to its configured feature subset
    for name, fitted in self.operations.items():
        cols = self.transform_seq[name]["feature_list"]

        if name == "bin":
            # Binning is stateless and applied directly
            data[cols] = bin_score(data[cols])
        else:
            data[cols] = fitted.transform(data[cols])

    return data

medpipe.data.db

Database functions module.

This module provides functions to open, query, and save data from databases.

Functions: - parquet_to_db: Converts a .parquet file to a .db file. - extract_data_from_db: Queries a SQL .db to extract data.

extract_data_from_db(db_file, query)

Extracts data from a .db file and returns it as a DataFrame.

The given SQL query is executed against the SQLite database and the result is returned as a pandas DataFrame.

Parameters:

Name Type Description Default
db_file str

Path to the .db file.

required
query str

Query to send to extract data.

required

Returns:

Name Type Description
data DataFrame

Extracted data from the database.

Raises:

Type Description
TypeError

If db_file or query is not a str.

FileNotFoundError

If db_file does not exist.

IsADirectoryError

If db_file is not a file.

ValueError

If db_file extension is not a .db file.

Source code in src/medpipe/data/db.py
def extract_data_from_db(db_file: str, query: str):
    """
    Extracts data from a .db file and returns it as a DataFrame.

    Parameters
    ----------
    db_file : str
        Path to the .db file.
    query : str
        Query to send to extract data.

    Returns
    -------
    data : pd.DataFrame
        Extracted data from the database.

    Raises
    ------
    TypeError
        If db_file or query is not a str.
    FileNotFoundError
        If db_file does not exist.
    IsADirectoryError
        If db_file is not a file.
    ValueError
        If db_file extension is not a .db file.

    """
    # file_checks raises directly; the previous try/except that only
    # re-raised added nothing.
    medpipe.utils.exceptions.file_checks(db_file, ".db")

    if not isinstance(query, str):
        raise TypeError(f"{query} should be a string")

    conn = sqlite3.connect(db_file)
    try:
        # NOTE(review): the query string is executed verbatim — do not
        # pass untrusted input (no parameterization is applied here).
        return pd.read_sql_query(query, conn)
    finally:
        # Close the connection even if the query fails
        conn.close()

parquet_to_db(parquet_file, db_file, table_name='main')

Converts a .parquet file to a .db file.

Parameters:

Name Type Description Default
parquet_file str

File path to the .parquet file.

required
db_file str

File path to the .db file.

required
table_name default: 'main'

Name of the table to create in the SQL database.

'main'

Returns:

Type Description
None

Nothing is returned.

Raises:

Type Description
TypeError

If parquet_file or db_file are not str.

FileNotFoundError

If parquet_file does not exist.

IsADirectoryError

If parquet_file or db_file are not a file.

ValueError

If parquet_file extension is not a .parquet file.

ValueError

If db_file extension is not a .db file.

Source code in src/medpipe/data/db.py
def parquet_to_db(parquet_file: str, db_file: str, table_name: str = "main") -> None:
    """
    Converts a .parquet file to a .db file.

    Parameters
    ----------
    parquet_file : str
        File path to the .parquet file.
    db_file : str
        File path to the .db file.
    table_name : str, default: 'main'
        Name of the table to create in the SQL database.

    Returns
    -------
    None
        Nothing is returned.

    Raises
    ------
    TypeError
        If parquet_file or db_file are not str.
    FileNotFoundError
        If parquet_file does not exist.
    IsADirectoryError
        If parquet_file or db_file are not a file.
    ValueError
        If parquet_file extension is not a .parquet file.
    ValueError
        If db_file extension is not a .db file.

    """
    # file_checks raises directly; the previous try/except blocks that
    # immediately re-raised added nothing.
    medpipe.utils.exceptions.file_checks(parquet_file, ".parquet")
    medpipe.utils.exceptions.file_checks(db_file, ".db", exists=False)

    # Read before connecting so a failed read does not leave behind an
    # empty database file.
    df = pd.read_parquet(parquet_file)

    conn = sqlite3.connect(db_file)
    try:
        # NOTE(review): table_name is interpolated into SQL unescaped —
        # do not pass untrusted input. The explicit DROP is redundant
        # with if_exists="replace" but kept for backward compatibility.
        conn.execute(f"DROP TABLE IF EXISTS {table_name}")
        df.to_sql(table_name, conn, if_exists="replace", index=False)
        conn.commit()
    finally:
        # Close the connection even on failure
        conn.close()

medpipe.data.preprocessing

Preprocessing functions module.

This module provides functions to preprocess data before training.

Functions: - train_test_it: Creates a KFold iterator to split data into test and train sets. - get_validation_idx: Removes some of the indices to create a validation set. - convert_object_to_categorical: Converts object columns to categoricals. - fit_preprocess_operations: Fits processing operations to data. - bin_score: Bins the M3 score into 5 categories. - extract_labels: Extracts prediction labels from data.

bin_score(data)

Bins the M3 score into 5 categories.

Parameters:

Name Type Description Default
data DataFrame

M3 score data.

required

Returns:

Name Type Description
binned_data DataFrame

Binned data.

Source code in src/medpipe/data/preprocessing.py
def bin_score(data):
    """
    Bins the M3 score into 5 categories.

    Scores are rounded up to the next integer and any value above 4 is
    capped at 4.

    Parameters
    ----------
    data : pd.DataFrame
        M3 score data.

    Returns
    -------
    binned_data : pd.DataFrame
        Binned data.

    """
    # Round up, then cap at the top category (4)
    return np.minimum(np.ceil(data), 4)

convert_object_to_categorical(data)

Converts all object columns of a DataFrame to categoricals.

Parameters:

Name Type Description Default
data DataFrame

DataFrame to manipulate.

required

Returns:

Name Type Description
processed_data DataFrame

Processed DataFrame.

Raises:

Type Description
TypeError

If data is not a pd.DataFrame.

Source code in src/medpipe/data/preprocessing.py
def convert_object_to_categorical(data: pd.DataFrame) -> pd.DataFrame:
    """
    Converts all object columns of a DataFrame to categoricals.

    Parameters
    ----------
    data : pd.DataFrame
        DataFrame to manipulate.

    Returns
    -------
    processed_data : pd.DataFrame
        Processed DataFrame with object columns converted to category
        dtype. The input DataFrame is left unmodified.

    Raises
    ------
    TypeError
        If data is not a pd.DataFrame.

    """
    if not isinstance(data, pd.DataFrame):
        raise TypeError(f"data should be a pd.DataFrame, but got {type(data)}")

    # BUG FIX: the previous code aliased the input ("processed_data = data"),
    # so the caller's DataFrame was mutated even though the comment claimed
    # a copy was made. Take a real copy of the frame.
    processed_data = data.copy()

    for column in data.select_dtypes(include=["object"]).columns:
        processed_data[column] = data[column].astype("category")

    return processed_data

extract_labels(data, labels)

Extracts the prediction labels from the training data.

Parameters:

Name Type Description Default
data DataFrame

DataFrame to manipulate.

required
labels list(str)

List of labels to extract from the data.

required

Returns:

Name Type Description
X DataFrame

DataFrame containing the data.

y array - like

Array containing the prediction labels.

Raises:

Type Description
TypeError

If data is not a pd.DataFrame.

TypeError

If labels is not list(str).

KeyError

If a prediction label is not a valid key.

Source code in src/medpipe/data/preprocessing.py
def extract_labels(data, labels):
    """
    Extracts the prediction labels from the training data.

    Parameters
    ----------
    data : pd.DataFrame
        DataFrame to manipulate.
    labels : list(str)
        List of labels to extract from the data.

    Returns
    -------
    X : pd.DataFrame
        DataFrame containing the data without the label columns.
    y : array-like
        Array of shape (n_samples, n_labels) containing the prediction
        labels.

    Raises
    ------
    TypeError
        If data is not a pd.DataFrame.
    TypeError
        If labels is not list(str).
    KeyError
        If a prediction label is not a valid key.

    """
    if not isinstance(data, pd.DataFrame):
        raise TypeError(f"data should be a pd.DataFrame, but got {type(data)}")

    if not isinstance(labels, list):
        raise TypeError(f"labels should be a list, but got {type(labels)}")

    # ROBUSTNESS FIX: check every element (the old code only inspected
    # labels[0] and raised IndexError on an empty list).
    for label in labels:
        if not isinstance(label, str):
            raise TypeError(f"labels should be a list(str), but got {type(label)}")

    X = data.drop(labels, axis=1)
    y = data[labels]

    return X, y.to_numpy()

fit_preprocess_operations(data, preprocessing_dict)

Fits processing operations to data.

Parameters:

Name Type Description Default
data DataFrame

DataFrame to manipulate.

required
preprocessing_dict dict[str, list[str]]

Dictionary of the operations and the features on which to operate.

required

Returns:

Name Type Description
operation_dict dict[]

Dictionary of the different preprocessing objects.

Raises:

Type Description
TypeError

If data is not a pd.DataFrame. If features is not a list(str).

KeyError

If a features is not a valid key.

ValueError

If preprocess is not a valid preprocessing function.

Source code in src/medpipe/data/preprocessing.py
def fit_preprocess_operations(data, preprocessing_dict):
    """
    Fits processing operations to data.

    Each operation is fitted on data already transformed by the previous
    operations, mirroring the order in which they are later applied.

    Parameters
    ----------
    data : pd.DataFrame
        DataFrame to manipulate.
    preprocessing_dict : dict[str, dict[str, list[str]]]
        Dictionary of the operations and the features on which to operate.

    Returns
    -------
    operation_dict : dict
        Dictionary of the different fitted preprocessing objects.

    Raises
    ------
    TypeError
        If data is not a pd.DataFrame.
        If features is not a list(str).
    KeyError
        If a feature is not a valid key.
    ValueError
        If preprocess is not a valid preprocessing function.

    """
    if not isinstance(data, pd.DataFrame):
        raise TypeError(f"data should be a pd.DataFrame, but got {type(data)}")

    # Fit on a copy so the caller's data stays untouched
    data_copy = deepcopy(data)
    operation_dict = dict()

    for preprocess, config in preprocessing_dict.items():
        features = config["feature_list"]

        if not isinstance(features, list):
            raise TypeError(f"features should be a list, but got {type(features)}")

        # ROBUSTNESS FIX: check every element (old code only inspected
        # features[0] and raised IndexError on an empty list); the error
        # message also had an unbalanced parenthesis.
        for feature in features:
            if not isinstance(feature, str):
                raise TypeError(
                    f"features should be a list(str), but got list({type(feature)})"
                )

        if preprocess == "bin":
            # Binning is stateless, so only a marker is stored
            operation_dict[preprocess] = "bin"
            continue

        if preprocess == "ordinal_encoder":
            operation = OrdinalEncoder()
        elif preprocess == "standardise":
            operation = StandardScaler()
        elif preprocess == "power_transform":
            operation = PowerTransformer()
        else:
            raise ValueError(f"{preprocess} invalid preprocessing function")

        # Fit, store, and apply so later operations see transformed data
        operation_dict[preprocess] = operation.fit(data_copy[features])
        data_copy[features] = operation_dict[preprocess].transform(
            data_copy[features]
        )

    return operation_dict

get_validation_idx(idx_list, groups=None, val_size=0.1)

Removes some of the indices to create a validation set.

If groups are provided, all the indices of the group with the largest value are selected as the validation set.

Parameters:

Name Type Description Default
idx_list array(n_samples)

Indices of the set to split.

required
groups Series(n_samples) or None

Groups to which the train indices belong. Must be numeric.

None
val_size float

Size of the validation set if groups are None.

0.1

Returns:

Name Type Description
train_idx array

Train indices.

val_idx array

Validation indices.

Source code in src/medpipe/data/preprocessing.py
def get_validation_idx(idx_list, groups=None, val_size=0.1):
    """
    Removes some of the indices to create a validation set.

    If groups are provided, all the indices of the group with the largest
    value are selected as the validation set.

    Parameters
    ----------
    idx_list : np.array(n_samples,)
        Indices of the set to split.
    groups : pd.Series(n_samples,) or None, default: None
        Groups to which the train indices belong. Must be numeric.
    val_size : float, default: 0.1
        Size of the validation set if groups are None.

    Returns
    -------
    train_idx : np.array
        Train indices.
    val_idx : np.array
        Validation indices.

    """
    array_check(idx_list)
    if groups is not None:
        # If groups are provided
        groups = groups.to_numpy()  # Convert to array
        # BUG FIX: this check previously re-validated idx_list (a
        # copy-paste slip); it was clearly meant to validate groups.
        array_check(groups)
        array_dim_check(idx_list, groups, dim=0)

        if not np.isscalar(groups[0]):
            raise ValueError(f"groups should be scalar but instead got {groups.dtype}")
        # The group with the largest value is held out for validation
        group_max = np.max(groups)
        val_idx = np.where(groups == group_max)[0]
        train_idx = np.where(groups != group_max)[0]

    else:
        # Random split with a fixed seed for reproducibility
        train_idx, val_idx = skl.model_selection.train_test_split(
            idx_list, test_size=val_size, random_state=42
        )

    return train_idx, val_idx

train_test_it(temporal_k_fold=False, **kwargs)

Creates a KFold iterator to split data into test and train sets.

Parameters:

Name Type Description Default
temporal_k_fold bool

If True, the data will be split using a group and a GroupKFold iterator is returned.

False
**kwargs

Extra arguments for the StratifiedKFold or GroupKFold class.

{}

Returns:

Name Type Description
kfold_it StratifiedKFold or GroupKFold

KFold iterator.

Raises:

Type Description
ValueError

If n_splits is less than 2.

Source code in src/medpipe/data/preprocessing.py
def train_test_it(temporal_k_fold=False, **kwargs):
    """
    Creates a KFold iterator to split data into test and train sets.

    Parameters
    ----------
    temporal_k_fold : bool, default: False
        If True, the data will be split using a group and a
        GroupKFold iterator is returned.
    **kwargs
        Extra arguments for the StratifiedKFold or GroupKFold class.

    Returns
    -------
    kfold_it : StratifiedKFold or GroupKFold
        KFold iterator.

    Raises
    ------
    ValueError
        If n_splits is less than 2.

    """
    # Create the correct argument dict for StratifiedKFold
    args_dict = dict()
    for key, value in kwargs.items():
        match key:
            case "random_state":
                if value == -1:
                    value = None
                args_dict.update({key: value})

            case "shuffle":
                args_dict.update({key: value})

            case "n_splits":
                if value < 2:
                    raise ValueError(
                        f"n_splits should be greater than 2, but got {value}"
                    )

                args_dict.update({key: value})

    if not temporal_k_fold:
        kfold_it = skl.model_selection.StratifiedKFold(**args_dict)
    else:
        kfold_it = skl.model_selection.GroupKFold(**args_dict)

    return kfold_it

medpipe.data.weighting

Weighting functions module.

This module provides functions to create sample weights to address class imbalance.

Functions: - inverse_frequency_multiclass_sample_weights: Create sample weights using the total number of samples over the number of positive and negative samples. - inverse_frequency_single_sample_weights: Create sample weights using the inverse frequency of positive and negative samples. - inverse_frequency_class_weights: Create class weights using inverse frequency of classes. - negative_positive_ratio_sample_weights: Create sample weights using the ratio between negative and positive classes. - negative_positive_ratio_class_weights: Create class weights using the ratio between negative and positive classes.

inverse_frequency_class_weights(labels)

Create class weights of the positive class using inverse frequency of the positive class.

Parameters:

Name Type Description Default
labels array - like

Binary prediction labels of shape (n_samples, n_classes)

required

Returns:

Name Type Description
class_weights array(n_classes)

Weight for each class.

Raises:

Type Description
TypeError

If labels is not array-like.

ValueError

If labels is empty.

ZeroDivisionError

If there are no positive labels.

Source code in src/medpipe/data/weighting.py
def inverse_frequency_class_weights(labels):
    """
    Create class weights of the positive class using inverse frequency of
    the positive class.

    Parameters
    ----------
    labels : array-like
        Binary prediction labels of shape (n_samples, n_classes)

    Returns
    -------
    class_weights : np.array(n_classes,)
        Weight for each class, computed as n_samples / n_positives per
        class.

    Raises
    ------
    TypeError
        If labels is not array-like.
    ValueError
        If labels is empty.
    ZeroDivisionError
        If a class has no positive labels.

    """
    array_check(labels)

    if len(labels) == 0:
        raise ValueError("The input labels are empty")

    pos_counts = np.sum(labels, axis=0)

    # BUG FIX: "pos_counts.any() == 0" compared a bool to 0 and only fired
    # when *every* class lacked positives; a single all-negative class
    # slipped through and produced inf weights. Check each class instead.
    if np.any(pos_counts == 0):
        raise ZeroDivisionError("No positive labels found")

    return len(labels) / pos_counts

inverse_frequency_multiclass_sample_weights(labels)

Create sample weights using the total number of samples over the number of positive and negative samples.

Each class has its own set of weights for positive and negative examples based on the number of positive and negative examples in that class.

Parameters:

Name Type Description Default
labels array - like

Binary prediction labels of shape (n_samples, n_classes)

required

Returns:

Name Type Description
sample_weights array(n_samples, n_classes)

Weight for each sample.

Raises:

Type Description
TypeError

If labels is not array-like.

ValueError

If labels is empty.

ZeroDivisionError

If there are no positive labels.

Notes

For each class, the weights are calculated as: len(labels) / (pos_weight + neg_weight), where pos_weight is an array of shape (n_samples, n_classes) for the positive examples with the total number of positive samples in each class, and neg_weight is similar but for the negative examples.

Source code in src/medpipe/data/weighting.py
def inverse_frequency_multiclass_sample_weights(labels):
    """
    Create sample weights using the total number of samples over the number of
    positive and negative samples.

    Each class has its own set of weights for positive and negative examples
    based on the number of positive and negative examples in that class.

    Parameters
    ----------
    labels : array-like
        Binary prediction labels of shape (n_samples, n_classes)

    Returns
    -------
    sample_weights : np.array(n_samples, n_classes)
        Weight for each sample.

    Raises
    ------
    TypeError
        If labels is not array-like.
    ValueError
        If labels is empty.
    ZeroDivisionError
        If a class has no positive labels.

    Notes
    -----
    For each class, the weights are calculated as:
        len(labels) / (pos_weight + neg_weight),
    where pos_weight is an array of shape (n_samples, n_classes) for the positive examples
    with the total number of positive samples in each class, and neg_weight is similar
    but for the negative examples.

    """
    array_check(labels)  # Check that labels is array-like

    if len(labels) == 0:
        raise ValueError("The input labels are empty")

    pos_counts = np.sum(labels, axis=0)
    neg_counts = len(labels) - pos_counts

    # BUG FIX: "pos_counts.any() == 0" compared a bool to 0 and only fired
    # when *every* class lacked positives; a single all-negative class
    # slipped through to a divide-by-zero. Check each class instead.
    if np.any(pos_counts == 0):
        raise ZeroDivisionError("No positive labels found")

    pos_weight = pos_counts * labels
    neg_weight = neg_counts * ~np.array(labels, dtype=bool)  # Invert for negatives
    return len(labels) / (pos_weight + neg_weight)

inverse_frequency_single_sample_weights(labels)

Create sample weights using the inverse frequency of positive and negative samples.

One set of weights is created and used for each class based on the total number of positive and negative examples. Weights are normalised so that negative weights are 1.

Parameters:

Name Type Description Default
labels array - like

Binary prediction labels of shape (n_samples, n_classes)

required

Returns:

Name Type Description
sample_weights array(n_samples)

Weight for each sample.

Raises:

Type Description
TypeError

If labels is not array-like.

ValueError

If labels is empty.

ZeroDivisionError

If there are no positive labels.

Source code in src/medpipe/data/weighting.py
def inverse_frequency_single_sample_weights(labels):
    """
    Create sample weights using the inverse frequency of positive
    and negative samples.

    One set of weights is created and used for each class based on the
    total number of positive and negative examples. Weights are normalised
    so that negative weights are 1.

    Parameters
    ----------
    labels : array-like
        Binary prediction labels of shape (n_samples, n_classes)

    Returns
    -------
    sample_weights : np.array(n_samples,)
        Weight for each sample.

    Raises
    ------
    TypeError
        If labels is not array-like.
    ValueError
        If labels is empty.
    ZeroDivisionError
        If there are no positive labels.

    """
    array_check(labels)  # Validate the input type

    n_samples = len(labels)
    if n_samples == 0:
        raise ValueError("The input labels are empty")

    # A sample counts as positive when any of its classes is positive
    is_positive = np.sum(labels, axis=1) > 0
    n_positive = np.sum(is_positive)

    if n_positive == 0:
        raise ZeroDivisionError("No positive labels found")

    # Start from weight 1 (negatives) and scale the positives by the
    # negative/positive ratio
    sample_weights = np.ones(n_samples)
    sample_weights[is_positive] *= (n_samples - n_positive) / n_positive

    return sample_weights

negative_positive_ratio_class_weights(labels)

Create class weights of the positive class using the ratio between the number of samples in the negative and positive classes.

Parameters:

Name Type Description Default
labels array - like

Binary prediction labels of shape (n_samples, n_classes)

required

Returns:

Name Type Description
class_weights array(n_classes)

Weight for each class.

Raises:

Type Description
TypeError

If labels is not array-like.

ValueError

If labels is empty.

ZeroDivisionError

If there are no positive labels.

Source code in src/medpipe/data/weighting.py
def negative_positive_ratio_class_weights(labels):
    """
    Create class weights of the positive class using the ratio
    between the number of samples in the negative and positive classes.

    Parameters
    ----------
    labels : array-like
        Binary prediction labels of shape (n_samples, n_classes)

    Returns
    -------
    class_weights : np.array(n_classes,)
        Weight for each class.

    Raises
    ------
    TypeError
        If labels is not array-like.
    ValueError
        If labels is empty.
    ZeroDivisionError
        If any class has no positive labels.

    """
    array_check(labels)

    if len(labels) == 0:
        raise ValueError("The input labels are empty")

    pos_counts = np.sum(labels, axis=0)
    neg_counts = len(labels) - pos_counts

    # BUG FIX: the previous check `pos_counts.any() == 0` only triggered when
    # *every* class had zero positives; a single zero-positive class slipped
    # through and produced a divide-by-zero below. Check per class instead.
    if np.any(pos_counts == 0):
        raise ZeroDivisionError("No positive labels found")

    return neg_counts / pos_counts

negative_positive_ratio_sample_weights(labels)

Create sample weights using the ratio between negative and positive samples.

Parameters:

Name Type Description Default
labels array - like

Binary prediction labels of shape (n_samples, n_classes)

required

Returns:

Name Type Description
sample_weights array(n_samples, n_classes)

Weight for each sample.

Raises:

Type Description
TypeError

If labels is not array-like.

ValueError

If labels is empty.

ZeroDivisionError

If there are no positive labels.

Source code in src/medpipe/data/weighting.py
def negative_positive_ratio_sample_weights(labels):
    """
    Create sample weights using the ratio between negative and
    positive samples.

    Parameters
    ----------
    labels : array-like
        Binary prediction labels of shape (n_samples, n_classes)

    Returns
    -------
    sample_weights : np.array(n_samples, n_classes)
        Weight for each sample.

    Raises
    ------
    TypeError
        If labels is not array-like.
    ValueError
        If labels is empty.
    ZeroDivisionError
        If any class has no positive labels.

    """
    array_check(labels)  # Check that labels is array-like

    if len(labels) == 0:
        raise ValueError("The input labels are empty")

    pos_counts = np.sum(labels, axis=0)
    neg_counts = len(labels) - pos_counts

    # BUG FIX: `pos_counts.any() == 0` was only True when every class had
    # zero positives; one zero-positive class would divide by zero below.
    if np.any(pos_counts == 0):
        raise ZeroDivisionError("No positive labels found")

    # Per-class positive weight where the label is 1, weight 1 where it is 0
    pos_weight = (neg_counts / pos_counts) * labels
    neg_weight = 1 * ~np.array(labels, dtype=bool)  # Invert for negatives

    return pos_weight + neg_weight

medpipe.data.sampler

Sampler functions module.

This module provides functions to sample the data to address class imbalance.

Functions:

- data_sampler: Samples the data and labels to adjust the class imbalance.
- random_undersampler: Randomly select labels to achieve the target ratio between minority and majority classes by undersampling majority class.
- group_random_undersampler: Randomly select labels to achieve the target ratio between minority and majority classes in each group.
- random_oversampler: Randomly select labels to achieve the target ratio between minority and majority classes by oversampling minority class.
- group_random_oversampler: Randomly select labels to achieve the target ratio between minority and majority classes in each group.
- mean_dist_sampler: Computes the mean data sample of the majority class and uses the distance to it to select examples.
- group_mean_dist_sampler: Computes the mean data sample of the majority class in each group and uses the distance to it to select examples.
- smote: Oversample minority class using Synthetic Minority Over-Sampling Technique (SMOTE).
- group_smote: Oversample minority class using Synthetic Minority Over-Sampling Technique (SMOTE) in each group.

data_sampler(data, labels, target_ratio=0.25, sampler_fn='random_undersampler', groups=None, **kwargs)

Samples the data and labels to adjust the class imbalance.

The majority class is assumed to have a False or 0 label. The new set will have an imbalance equal to: IR * target_ratio, where IR is the current imbalance ratio.

If the target ratio is too small, the algorithm defaults to obtain a balanced dataset.

Parameters:

Name Type Description Default
data DataFrame

Data to sample of shape (n_samples, n_features).

required
labels array - like

Binary prediction labels of shape (n_samples, n_classes).

required
target_ratio float

Target ratio between the minority and majority classes.

0.25
sampler_fn str

Sampler function to use to sample the data.

"random_undersampler"
groups Series or None

List containing groups for the group_sampler function.

None
**kwargs

Extra arguments for the sampler functions.

{}

Returns:

Name Type Description
X DataFrame

Sampled data.

y array

Sampled labels.

groups Series or None

Groups of the examples, None if not specified.

Raises:

Type Description
TypeError

If labels is not array-like.

ValueError

If target_ratio is less than 0.0.

Source code in src/medpipe/data/sampler.py
def data_sampler(
    data,
    labels,
    target_ratio=0.25,
    sampler_fn="random_undersampler",
    groups=None,
    **kwargs,
):
    """
    Samples the data and labels to adjust the class imbalance.

    The majority class is assumed to have a False or 0 label.
    The new set will have an imbalance equal to:
        IR * target_ratio, where IR is the current imbalance ratio.

    If the target ratio is too small, the algorithm defaults to
    obtain a balanced dataset.

    Parameters
    ----------
    data : pd.DataFrame
        Data to sample of shape (n_samples, n_features).
    labels : array-like
        Binary prediction labels of shape (n_samples, n_classes).
    target_ratio : float, default: 0.25
        Target ratio between the minority and majority classes.
        A value of 0 requests a fully balanced dataset.
    sampler_fn : str, default: "random_undersampler"
        Sampler function to use to sample the data. One of
        "random_undersampler", "group_random_undersampler",
        "random_oversampler", "group_random_oversampler",
        "mean_dist_sampler", "group_mean_dist_sampler",
        "smote" or "group_smote".
    groups : pd.Series or None, default: None
        List containing groups for the group_sampler function.
    **kwargs
        Extra arguments for the sampler functions: "hard_percent"
        (required by the mean-dist samplers) and "k_neighbors"
        (required by the SMOTE samplers).

    Returns
    -------
    X : pd.DataFrame
        Sampled data.
    y : np.array
        Sampled labels.
    groups : pd.Series or None
        Groups of the examples, None if not specified.

    Raises
    ------
    TypeError
        If labels is not array-like.
    ValueError
        If target_ratio is less than 0.0.
        If sampler_fn is not a recognised sampler name.

    """
    sample_idx = np.array([])  # Empty sample index

    if target_ratio > 0:
        # Current imbalance ratio: negative labels per positive label.
        # NOTE(review): np.sum(labels) counts all positive entries across
        # classes, not positive *examples* — confirm intent for multilabel.
        imbalance_ratio = (len(labels) - np.sum(labels)) / np.sum(labels)
        # Minority/majority ratio handed to the sampler functions below
        new_ratio = 1 / (imbalance_ratio * target_ratio)

        # NOTE(review): `imbalance_ratio * new_ratio < 1` algebraically
        # reduces to `target_ratio > 1`; if the intent is to cap the
        # sampler ratio at a balanced set, the condition looks like it
        # should be `new_ratio > 1` — confirm against tests.
        if (imbalance_ratio * new_ratio) < 1:
            # Set to 1 to get balanced dataset
            new_ratio = 1

    elif target_ratio == 0:
        new_ratio = 1  # Set to 1 to get balanced dataset
    else:
        raise ValueError(f"Target ratio should be positive, but got {target_ratio}")

    if groups is None:
        # Convert groups to an empty list if nothing is provided
        groups = np.array([])

    # Dispatch to the requested sampler. Index-based samplers fall through
    # to the shared selection below; SMOTE variants return early because
    # they generate new rows instead of selecting existing ones.
    match sampler_fn:
        case "random_undersampler":
            sample_idx = random_undersampler(labels, new_ratio)
        case "group_random_undersampler":
            sample_idx = group_random_undersampler(labels, new_ratio, groups)
        case "random_oversampler":
            sample_idx = random_oversampler(labels, new_ratio)
        case "group_random_oversampler":
            sample_idx = group_random_oversampler(labels, new_ratio, groups)
        case "mean_dist_sampler":
            sample_idx = mean_dist_sampler(
                data, labels, new_ratio, kwargs["hard_percent"]
            )
        case "group_mean_dist_sampler":
            sample_idx = group_mean_dist_sampler(
                data, labels, new_ratio, groups, kwargs["hard_percent"]
            )
        case "smote":
            # Append the generated rows to the originals; groups are dropped
            X_gen, y_gen = smote(data, labels, new_ratio, kwargs["k_neighbors"])
            return concat((data, X_gen)), np.concatenate((labels, y_gen)), None
        case "group_smote":
            return group_smote(data, labels, new_ratio, groups, kwargs["k_neighbors"])
        case _:
            raise ValueError(f"{sampler_fn} invalid sampler function")

    # Select the sampled rows from data/labels (and groups when provided)
    X = data.iloc[sample_idx]
    y = labels[sample_idx]

    if len(groups) != 0:
        return X, y, groups.iloc[sample_idx]

    return X, y, None

group_mean_dist_sampler(data, labels, target_ratio, groups, hard_percent=0.5)

Computes the mean data sample of the majority class in each group and uses the distance to it to select examples.

The examples are sorted based on their distance to the mean. The hardest examples are the ones that have the greatest distance to the mean and the easiest are the ones closest to the mean.

Parameters:

Name Type Description Default
data DataFrame

Data to sample of shape (n_samples, n_features).

required
labels array - like

Binary prediction labels of shape (n_samples, n_classes).

required
target_ratio float

Ratio of minority over majority classes to achieve.

required
groups array - like

List of groups in which labels belong of shape (n_samples,).

required
hard_percent float

Percentage of examples that are considered hard, between 0 and 1. If hard_percent is 0.5, half of the examples are chosen from the end of the sorted list and the other half from the beginning.

0.5

Returns:

Name Type Description
sample_idx array(n_samples)

Index list of examples to achieve target ratio.

Raises:

Type Description
TypeError

If labels is not array-like.

ValueError

If labels and group do not have the same dimension.

Source code in src/medpipe/data/sampler.py
def group_mean_dist_sampler(data, labels, target_ratio, groups, hard_percent=0.5):
    """
    Computes the mean data sample of the majority class in each group and
    uses the distance to it to select examples.

    The examples are sorted based on their distance to the mean.
    The hardest examples are the ones that have the greatest distance to
    the mean and the easiest are the ones closest to the mean.

    Parameters
    ----------
    data : pd.DataFrame
        Data to sample of shape (n_samples, n_features).
    labels : array-like
        Binary prediction labels of shape (n_samples, n_classes).
    target_ratio : float
        Ratio of minority over majority classes to achieve.
    groups : array-like
        List of groups in which labels belong of shape (n_samples,).
    hard_percent : float, default: 0.5
        Percentage of examples that are considered hard, between 0 and 1.
        If hard_percent is 0.5, half of the examples are chosen from
        the end of the sorted list and the other half from the beginning.

    Returns
    -------
    sample_idx : np.array(n_samples,)
        Index list of examples to achieve target ratio.

    Raises
    ------
    TypeError
        If labels is not array-like.
    ValueError
        If labels and group do not have the same dimension.

    """
    array_check(labels)
    array_dim_check(labels, groups, dim=0)

    features = deepcopy(data)  # Work on a copy so the caller's data is untouched

    # Drop the group column (if present) so it does not skew the mean
    if groups.name in features.columns:
        features = features.drop(groups.name, axis=1)

    # Run the mean-distance sampler within each group and map the selected
    # positions back to indices into the full dataset.
    per_group_idx = [np.array([], dtype=int)]
    for group in np.unique(groups):
        members = np.where(groups == group)[0]
        selected = mean_dist_sampler(
            features.iloc[members], labels[members], target_ratio, hard_percent
        )
        per_group_idx.append(members[selected])

    return np.concatenate(per_group_idx)

group_random_oversampler(labels, target_ratio, groups)

Randomly select labels to achieve the target ratio between minority and majority classes in each group.

Parameters:

Name Type Description Default
labels array - like

Binary prediction labels of shape (n_samples, n_classes).

required
target_ratio float

Ratio of minority over majority classes to achieve.

required
groups array - like

List of groups in which labels belong of shape (n_samples,).

required

Returns:

Name Type Description
sample_idx array(n_samples)

Index list of examples to achieve target ratio.

Raises:

Type Description
TypeError

If labels is not array-like.

ValueError

If labels and group do not have the same dimension. If target_ratio is less than 0.0.

Source code in src/medpipe/data/sampler.py
def group_random_oversampler(labels, target_ratio, groups):
    """
    Randomly select labels to achieve the target ratio between minority and
    majority classes in each group.

    Parameters
    ----------
    labels : array-like
        Binary prediction labels of shape (n_samples, n_classes).
    target_ratio : float
        Ratio of minority over majority classes to achieve.
    groups : array-like
        List of groups in which labels belong of shape (n_samples,).

    Returns
    -------
    sample_idx : np.array(n_samples,)
        Index list of examples to achieve target ratio.

    Raises
    ------
    TypeError
        If labels is not array-like.
    ValueError
        If labels and group do not have the same dimension.
        If target_ratio is less than 0.0.

    """
    array_check(labels)
    array_dim_check(labels, groups, dim=0)

    if target_ratio <= 0:
        raise ValueError(f"Target ratio should be positive, but got {target_ratio}")

    # Oversample within each group, translating the per-group selections
    # back into indices over the full label array.
    chosen = [np.array([], dtype=int)]
    for grp in np.unique(groups):
        members = np.where(groups == grp)[0]
        chosen.append(members[random_oversampler(labels[members], target_ratio)])

    return np.concatenate(chosen)

group_random_undersampler(labels, target_ratio, groups)

Randomly select labels to achieve the target ratio between minority and majority classes in each group.

Parameters:

Name Type Description Default
labels array - like

Binary prediction labels of shape (n_samples, n_classes).

required
target_ratio float

Ratio of minority over majority classes to achieve.

required
groups array - like

List of groups in which labels belong of shape (n_samples,).

required

Returns:

Name Type Description
sample_idx array(n_samples)

Index list of examples to achieve target ratio.

Raises:

Type Description
TypeError

If labels is not array-like.

ValueError

If labels and group do not have the same dimension. If target_ratio is less than 0.0.

Source code in src/medpipe/data/sampler.py
def group_random_undersampler(labels, target_ratio, groups):
    """
    Randomly select labels to achieve the target ratio between minority and
    majority classes in each group.

    Parameters
    ----------
    labels : array-like
        Binary prediction labels of shape (n_samples, n_classes).
    target_ratio : float
        Ratio of minority over majority classes to achieve.
    groups : array-like
        List of groups in which labels belong of shape (n_samples,).

    Returns
    -------
    sample_idx : np.array(n_samples,)
        Index list of examples to achieve target ratio.

    Raises
    ------
    TypeError
        If labels is not array-like.
    ValueError
        If labels and group do not have the same dimension.
        If target_ratio is less than 0.0.

    """
    array_check(labels)
    array_dim_check(labels, groups, dim=0)

    if target_ratio <= 0:
        raise ValueError(f"Target ratio should be positive, but got {target_ratio}")

    # Undersample within each group, translating the per-group selections
    # back into indices over the full label array.
    chosen = [np.array([], dtype=int)]
    for grp in np.unique(groups):
        members = np.where(groups == grp)[0]
        chosen.append(members[random_undersampler(labels[members], target_ratio)])

    return np.concatenate(chosen)

group_smote(data, labels, target_ratio, groups, k_neighbors)

Oversample minority class using Synthetic Minority Over-Sampling Technique (SMOTE) in each group.

Parameters:

Name Type Description Default
data DataFrame

Data to sample of shape (n_samples, n_features).

required
labels array - like

Binary prediction labels of shape (n_samples, n_classes).

required
target_ratio float

Ratio of minority over majority classes to achieve.

required
groups array - like

List of groups in which labels belong of shape (n_samples,).

required
k_neighbors int

Number of neighbors to use for SMOTE knn.

required

Returns:

Name Type Description
X DataFrame

Original data with the generated data appended.

y array

Original labels with the generated labels appended.

grps Series

Original groups with the generated groups appended.

Raises:

Type Description
TypeError

If labels is not array-like.

ValueError

If labels and group do not have the same dimension.

Source code in src/medpipe/data/sampler.py
def group_smote(data, labels, target_ratio, groups, k_neighbors):
    """
    Oversample minority class using Synthetic Minority Over-Sampling Technique
    (SMOTE) in each group.

    Parameters
    ----------
    data : pd.DataFrame
        Data to sample of shape (n_samples, n_features).
    labels : array-like
        Binary prediction labels of shape (n_samples, n_classes).
    target_ratio : float
        Ratio of minority over majority classes to achieve.
    groups : pd.Series
        List of groups in which labels belong of shape (n_samples,).
    k_neighbors : int
        Number of neighbors to use for SMOTE knn.

    Returns
    -------
    X : pd.DataFrame
        Original data with the generated data appended.
    y : np.array
        Original labels with the generated labels appended.
    grps : pd.Series
        Original groups with the generated groups appended.

    Raises
    ------
    TypeError
        If labels is not array-like.
    ValueError
        If labels and group do not have the same dimension.

    """
    array_check(labels)
    array_dim_check(labels, groups, dim=0)
    # Copies so the caller's inputs are not mutated while we append rows
    X = deepcopy(data)
    y = deepcopy(labels)
    grps = deepcopy(groups)

    # FIX: removed an unused `sample_idx` accumulator left over from the
    # index-based samplers; group_smote appends generated rows instead.
    for group in np.unique(groups):
        group_idx = np.where(groups == group)[0]

        # Generate new synthetic examples for this group only
        X_gen, y_gen = smote(
            data.iloc[group_idx], labels[group_idx], target_ratio, k_neighbors
        )

        # Tag every generated row with the current group id.
        # NOTE(review): `group * np.ones(...)` assumes numeric group ids —
        # confirm against the callers.
        group_gen = group * np.ones(y_gen.shape[0])
        X = concat((X, X_gen))
        y = np.concatenate((y, y_gen))
        grps = concat((grps, Series(group_gen.squeeze(), name=grps.name)))

    return X, y, grps

mean_dist_sampler(data, labels, target_ratio, hard_percent=0.5)

Computes the mean data sample of the majority class and uses the distance to it to select examples.

The examples are sorted based on their distance to the mean. The hardest examples are the ones that have the greatest distance to the mean and the easiest are the ones closest to the mean.

Parameters:

Name Type Description Default
data DataFrame

Data to sample of shape (n_samples, n_features).

required
labels array - like

Binary prediction labels of shape (n_samples, n_classes).

required
target_ratio float

Ratio of minority over majority classes to achieve.

required
hard_percent float

Percentage of examples that are considered hard, between 0 and 1. If hard_percent is 0.5, half of the examples are chosen from the end of the sorted list and the other half from the beginning.

0.5

Returns:

Name Type Description
sample_idx array(n_samples)

Index list of examples to achieve target ratio.

Raises:

Type Description
TypeError

If labels is not array-like.

ValueError

If hard_percent is not between 0 and 1. If target_ratio is less than 0.0.

Source code in src/medpipe/data/sampler.py
def mean_dist_sampler(data, labels, target_ratio, hard_percent=0.5):
    """
    Computes the mean data sample of the majority class and uses the
    distance to it to select examples.

    The examples are sorted based on their distance to the mean.
    The hardest examples are the ones that have the greatest distance to
    the mean and the easiest are the ones closest to the mean.

    Parameters
    ----------
    data : pd.DataFrame
        Data to sample of shape (n_samples, n_features).
    labels : array-like
        Binary prediction labels of shape (n_samples, n_classes).
    target_ratio : float
        Ratio of minority over majority classes to achieve.
    hard_percent : float, default: 0.5
        Percentage of examples that are considered hard, between 0 and 1.
        If hard_percent is 0.5, half of the examples are chosen from
        the end of the sorted list and the other half from the beginning.

    Returns
    -------
    sample_idx : np.array(n_samples,)
        Index list of examples (all minority plus the selected majority)
        to achieve the target ratio. Indices are positions in `data`.

    Raises
    ------
    TypeError
        If labels is not array-like.
    ValueError
        If hard_percent is not between 0 and 1.
        If target_ratio is not positive.

    """
    array_check(labels)
    if hard_percent > 1 or hard_percent < 0:
        raise ValueError(
            f"hard_percent should be between 0 and 1, but got {hard_percent}"
        )
    if target_ratio <= 0:
        raise ValueError(f"Target ratio should be positive, but got {target_ratio}")

    label_sums = np.sum(labels, axis=1)  # Example is positive if any class is 1
    n_min_class = np.sum(label_sums != 0)  # Minority class examples
    n_maj_class = np.round(n_min_class / target_ratio)  # Majority examples to keep

    # Global positions of minority and majority examples within `data`
    min_idx = np.where(label_sums > 0)[0]
    maj_idx = np.where(label_sums == 0)[0]

    maj_class_data = data.iloc[maj_idx]
    mean_maj_class = np.mean(maj_class_data, axis=0)

    # Sort majority examples by their distance to the majority mean.
    # BUG FIX: argsort yields positions *within the majority subset*; map
    # them through maj_idx so the returned indices address the full data.
    # Previously the subset-local indices were returned directly and the
    # minority examples were dropped entirely, so callers (e.g.
    # data_sampler's `data.iloc[sample_idx]`) selected the wrong rows.
    dist = np.linalg.norm(mean_maj_class - maj_class_data, axis=1)
    sorted_dist_idx = maj_idx[np.argsort(dist)]

    n_hard = round(n_maj_class * hard_percent)
    n_easy = round(n_maj_class * (1 - hard_percent))
    # BUG FIX: guard n_hard == 0 — a `[-0:]` slice selects every majority
    # example instead of none.
    hard_samples_idx = sorted_dist_idx[-n_hard:] if n_hard > 0 else sorted_dist_idx[:0]
    easy_samples_idx = sorted_dist_idx[:n_easy]

    return np.concatenate((min_idx, easy_samples_idx, hard_samples_idx))

random_oversampler(labels, target_ratio)

Randomly select labels to achieve the target ratio between minority and majority classes by oversampling minority class.

Parameters:

Name Type Description Default
labels array - like

Binary prediction labels of shape (n_samples, n_classes).

required
target_ratio float

Ratio of minority over majority classes to achieve.

required

Returns:

Name Type Description
sample_idx array(n_samples)

Index list of examples to achieve target ratio.

Raises:

Type Description
TypeError

If labels is not array-like.

ValueError

If target_ratio is less than 0.0.

Source code in src/medpipe/data/sampler.py
def random_oversampler(labels, target_ratio):
    """
    Randomly select labels to achieve the target ratio between minority and
    majority classes by oversampling minority class.

    Parameters
    ----------
    labels : array-like
        Binary prediction labels of shape (n_samples, n_classes).
    target_ratio : float
        Ratio of minority over majority classes to achieve.

    Returns
    -------
    sample_idx : np.array(n_samples,)
        Index list of examples to achieve target ratio.

    Raises
    ------
    TypeError
        If labels is not array-like.
    ValueError
        If target_ratio is less than 0.0.

    """
    array_check(labels)  # Check that labels is array-like
    if target_ratio <= 0:
        raise ValueError(f"Target ratio should be positive, but got {target_ratio}")

    # An example is minority (positive) when at least one class label is 1
    positive_mask = np.sum(labels, axis=1) > 0
    n_minority = np.sum(positive_mask)
    n_majority = len(labels) - n_minority

    if n_minority == 0:
        raise ValueError("No minority examples found")

    # Keep every majority example; draw minority examples with replacement
    # until the requested minority/majority ratio is reached.
    majority_idx = np.where(~positive_mask)[0]
    minority_idx = np.random.choice(
        np.where(positive_mask)[0], size=int(n_majority * target_ratio), replace=True
    )

    return np.concatenate((minority_idx, majority_idx))

random_undersampler(labels, target_ratio)

Randomly select labels to achieve the target ratio between minority and majority classes by undersampling majority class.

Parameters:

Name Type Description Default
labels array - like

Binary prediction labels of shape (n_samples, n_classes).

required
target_ratio float

Ratio of minority over majority classes to achieve.

required

Returns:

Name Type Description
sample_idx array(n_samples)

Index list of examples to achieve target ratio.

Raises:

Type Description
TypeError

If labels is not array-like.

ValueError

If target_ratio is less than 0.0.

Source code in src/medpipe/data/sampler.py
def random_undersampler(labels, target_ratio):
    """
    Randomly select labels to achieve the target ratio between minority and
    majority classes by undersampling majority class.

    Parameters
    ----------
    labels : array-like
        Binary prediction labels of shape (n_samples, n_classes).
    target_ratio : float
        Ratio of minority over majority classes to achieve. If the data
        already meets or exceeds this ratio, every majority example is kept.

    Returns
    -------
    sample_idx : np.array(n_samples,)
        Index list of examples to achieve target ratio.

    Raises
    ------
    TypeError
        If labels is not array-like.
    ValueError
        If target_ratio is not positive.

    """
    array_check(labels)  # Check that labels is array-like

    if target_ratio <= 0:
        raise ValueError(f"Target ratio should be positive, but got {target_ratio}")

    label_sums = np.sum(labels, axis=1)  # Sum to find example with at least one 1
    n_min_class = np.sum(label_sums != 0)  # Minority class examples
    n_maj_class = np.round(n_min_class / target_ratio)  # Majority class examples

    min_idx = np.where(label_sums > 0)[0]
    maj_pool = np.where(label_sums == 0)[0]

    # ROBUSTNESS FIX: `np.random.choice(..., replace=False)` raises when
    # asked for more examples than exist; cap the draw at the pool size so
    # an already-satisfied target keeps every majority example instead of
    # crashing.
    n_keep = min(int(n_maj_class), len(maj_pool))
    maj_idx = np.random.choice(maj_pool, size=n_keep, replace=False)

    return np.concatenate((min_idx, maj_idx))

smote(data, labels, target_ratio, k_neighbors)

Oversample minority class using Synthetic Minority Over-Sampling Technique (SMOTE).

Parameters:

Name Type Description Default
data DataFrame

Data to sample of shape (n_samples, n_features).

required
labels array - like

Binary prediction labels of shape (n_samples, n_classes).

required
target_ratio float

Ratio of minority over majority classes to achieve.

required
k_neighbors int

Number of neighbors to use for SMOTE knn.

required

Returns:

Name Type Description
X_gen DataFrame

Generated data.

multilabels_gen array

Generated labels.

Raises:

Type Description
TypeError

If labels is not array-like.

ValueError

If target_ratio is less than 0.0.

Source code in src/medpipe/data/sampler.py
def smote(data, labels, target_ratio, k_neighbors):
    """
    Oversample minority class using Synthetic Minority Over-Sampling Technique
    (SMOTE).

    Multilabel rows are mapped to unique label combinations so SMOTE can
    treat each combination as one class; the generated rows are then mapped
    back to their multilabel form.

    Parameters
    ----------
    data : pd.DataFrame
        Data to sample of shape (n_samples, n_features).
    labels : array-like
        Binary prediction labels of shape (n_samples, n_classes).
    target_ratio : float
        Ratio of minority over majority classes to achieve.
    k_neighbors : int
        Number of neighbors to use for SMOTE knn.

    Returns
    -------
    X_gen : pd.DataFrame
        Generated data.
    multilabels_gen : np.array
        Generated labels.

    Raises
    ------
    TypeError
        If labels is not array-like.
    ValueError
        If target_ratio is not positive.

    """
    array_check(labels)
    X = deepcopy(data)  # Copy so the caller's data is not modified

    if target_ratio <= 0:
        raise ValueError(f"Target ratio should be positive, but got {target_ratio}")

    label_sums = np.sum(labels, axis=1)  # Sum to find example with at least one 1
    n_maj_class = np.sum(label_sums == 0)  # Majority class examples
    # Synthetic minority rows still needed to reach the target ratio.
    # NOTE(review): this can be negative when the data already exceeds the
    # target ratio, which would make np.random.choice below fail — confirm
    # callers guarantee target_ratio above the current ratio.
    n_min_class = np.round(n_maj_class * target_ratio) - np.sum(
        label_sums > 0
    )  # Minority class examples

    # Convert labels into unique classes
    unique_multilabels, class_labels = np.unique(labels, axis=0, return_inverse=True)

    # NOTE(review): the selection below assumes fit_resample keeps the
    # original rows first and appends generated ones after position
    # len(labels) — confirm for the imblearn version in use.
    sm = SMOTE(k_neighbors=k_neighbors)
    X_gen, y_gen = sm.fit_resample(X, class_labels)

    # HACK: domain-specific column; synthetic values are rounded back to
    # the original categorical encoding.
    if "SEX_ORIGINAL" in X_gen.columns:
        X_gen["SEX_ORIGINAL"] = X_gen["SEX_ORIGINAL"].round()

    min_idx = np.random.choice(  # Select examples so that target ratio is achieved
        np.arange(len(labels), len(y_gen)), size=int(n_min_class), replace=False
    )
    # Map the generated class ids back to their multilabel rows
    return X_gen.iloc[min_idx], unique_multilabels[y_gen[min_idx]]