Source code for modina.statistics_utils

import numpy as np
import pandas as pd
from typing import Tuple

# Pre-rescaling (Z-score normalization)
# TODO: implement a filtering version
[docs] def pre_rescaling(scores1, scores2, metric): scores1 = scores1.copy() scores2 = scores2.copy() # TODO: eventually remove p-value rescaling completely (change all code snippets where pre-P is used!) if metric == 'pre-P': # no rescaling metric_raw = 'raw-P' scores1[metric] = scores1[metric_raw] scores2[metric] = scores2[metric_raw] elif metric == 'pre-E': metric_raw = 'raw-E' # Consider both contexts at the same time to make them comparable scores1[metric] = np.nan scores2[metric] = np.nan # Perform rescaling for every test type separately if not scores1['test_type'].equals(scores2['test_type']): raise ValueError("scores1 and scores2 must have identical 'test_type' columns.") test_types = np.unique(scores1['test_type']) for test in test_types: scores1_filtered = scores1[scores1['test_type'] == test] scores2_filtered = scores2[scores2['test_type'] == test] values = np.concatenate([scores1_filtered[metric_raw].to_numpy(), scores2_filtered[metric_raw].to_numpy()]) # Z-score normalization mean = np.mean(values) std = np.std(values) rescaled1 = None rescaled2 = None if std == 0: rescaled1 = 0 rescaled2 = 0 else: rescaled1 = (scores1_filtered[metric_raw] - mean) / std rescaled2 = (scores2_filtered[metric_raw] - mean) / std scores1.loc[scores1['test_type'] == test, metric] = rescaled1 scores2.loc[scores2['test_type'] == test, metric] = rescaled2 else: raise ValueError(f"Invalid metric '{metric}'. Only 'pre-E' and 'pre-P' are supported.") return scores1, scores2
# Post-rescaling (Min-Max normalization)
[docs] def post_rescaling(diff_scores, metric): diff_scores = diff_scores.copy() if metric == 'post-LS': metric_raw = 'raw-LS' elif metric == 'post-E': metric_raw = 'raw-E' elif metric == 'post-P': metric_raw = 'raw-P' elif metric == 'post-CS': metric_raw = 'raw-CS' else: raise ValueError(f"Invalid metric '{metric}'. Only 'post-E', 'post-P', 'post-CS', and 'post-LS' are supported.") diff_scores[metric] = np.nan # Perform rescaling for every test type separately test_types = np.unique(diff_scores['test_type']) for test in test_types: diff_scores_filtered = diff_scores[diff_scores['test_type'] == test] values = diff_scores_filtered[metric_raw].to_numpy() # Min-Max normalization min_val = np.min(values) max_val = np.max(values) if min_val == max_val: rescaled = 0 else: rescaled = (diff_scores_filtered[metric_raw] - min_val) / (max_val - min_val) diff_scores.loc[diff_scores['test_type'] == test, metric] = rescaled return diff_scores
def _separate_types(all_data, meta_file) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: """ Separating the data into ordinal, nominal, continuous and binary variables. :param all_data: DataFrame with all data :param meta_file: DataFrame with metadata of the variables :return: tuple with the ordinal, nominal, continuous and binary variables """ # Check if meta_file has an invalid type if not meta_file['type'].str.lower().isin(['ordinal', 'nominal', 'binary', 'continuous']).all(): raise ValueError("Invalid type found in meta_file. Allowed types are 'ordinal', 'nominal', 'binary', and 'continuous'.") # Extract ordinal phenotypes ord_data = all_data.iloc[:, all_data.columns.isin(meta_file[meta_file.type.str.lower() == 'ordinal'].label)].copy() # Extract nominal phenotypes nom_data = all_data.iloc[:, all_data.columns.isin(meta_file[meta_file.type.str.lower() == 'nominal'].label)].copy() # Extract binary phenotypes bi_data = all_data.iloc[:, all_data.columns.isin(meta_file[meta_file.type.str.lower() == 'binary'].label)].copy() # Extract continuous phenotypes cont_data = all_data.iloc[:, all_data.columns.isin(meta_file[meta_file.type.str.lower() == 'continuous'].label)].copy() return ord_data, nom_data, cont_data, bi_data def _df_to_numpy(df: pd.DataFrame): cols = df.columns df_np = df.to_numpy(dtype=np.float64).copy() return df_np, cols