# Source code for modina.statistics_utils

import numpy as np
import pandas as pd
from scipy import stats
from typing import Tuple


# Convert Cohen's d to point-biserial r for ttest edges.
# Uses context sample sizes to account for unequal groups:
# equal sizes (n1==n2): r = d / sqrt(d² + 4)
# unequal sizes:        r = d / sqrt(d² + (n1+n2)² / (n1*n2))
def cohens_d_to_r(scores1, scores2, n1: int, n2: int):
    """
    Convert Cohen's d to point-biserial r for every 'ttest' row.

    Both score tables are copied. In each copy, every row whose 'test_type'
    is 'ttest' has its 'raw-E' value d replaced by

        r = d / sqrt(d**2 + (n1 + n2)**2 / (n1 * n2))

    which reduces to d / sqrt(d**2 + 4) when n1 == n2. Rows of any other
    test type are left untouched, and the input tables are not modified.

    :param scores1: DataFrame with 'test_type' and 'raw-E' columns
    :param scores2: DataFrame with 'test_type' and 'raw-E' columns
    :param n1: sample size of the first context (must be positive)
    :param n2: sample size of the second context (must be positive)
    :return: tuple with the converted copies of scores1 and scores2
    :raises ValueError: if n1 or n2 is not a positive number
    """
    # Guard the correction term: a zero size would otherwise surface as a bare
    # ZeroDivisionError (or inf/nan for numpy integers), and negative sizes
    # would silently yield a meaningless correction. Written as a negated
    # "> 0" test so that NaN sizes are rejected as well.
    if not (n1 > 0 and n2 > 0):
        raise ValueError(
            f"Sample sizes must be positive, got n1={n1} and n2={n2}."
        )

    correction = (n1 + n2) ** 2 / (n1 * n2)

    converted = []
    for scores in (scores1, scores2):
        scores = scores.copy()
        is_ttest = scores['test_type'] == 'ttest'
        if is_ttest.any():
            d = scores.loc[is_ttest, 'raw-E'].to_numpy()
            scores.loc[is_ttest, 'raw-E'] = d / np.sqrt(d ** 2 + correction)
        converted.append(scores)

    return converted[0], converted[1]
# Std rescaling (divide raw-E by pooled std per test type)
def std_rescaling(scores1, scores2, metric='std-E'):
    """
    Rescale raw-E by the pooled standard deviation of each test type.

    For every distinct value of 'test_type', the 'raw-E' values of both
    tables are concatenated and their standard deviation (numpy default,
    i.e. population std) is computed. Each 'raw-E' value of that test type
    is divided by this number and stored in the new column named by metric.
    If the pooled standard deviation is exactly 0, the rescaled value is 0.

    :param scores1: DataFrame with 'test_type' and 'raw-E' columns
    :param scores2: DataFrame with 'test_type' and 'raw-E' columns
    :param metric: name of the output column; only 'std-E' is accepted
    :return: tuple with copies of scores1 and scores2 holding the new column
    :raises ValueError: if metric is not 'std-E' or the 'test_type' columns
        of the two tables are not identical
    """
    out1, out2 = scores1.copy(), scores2.copy()

    if metric != 'std-E':
        raise ValueError(f"Invalid metric '{metric}'. Only 'std-E' is supported.")

    raw_col = 'raw-E'
    for frame in (out1, out2):
        frame[metric] = np.nan

    types_match = out1['test_type'].equals(out2['test_type'])
    if not types_match:
        raise ValueError("scores1 and scores2 must have identical 'test_type' columns.")

    for test_name in np.unique(out1['test_type']):
        rows1 = out1['test_type'] == test_name
        rows2 = out2['test_type'] == test_name
        raw1 = out1.loc[rows1, raw_col]
        raw2 = out2.loc[rows2, raw_col]

        # Scale is shared by both tables so their values stay comparable.
        pooled_std = np.std(np.concatenate([raw1.to_numpy(), raw2.to_numpy()]))

        if pooled_std == 0:
            # No spread at all for this test type: avoid dividing by zero.
            out1.loc[rows1, metric] = 0
            out2.loc[rows2, metric] = 0
            continue

        out1.loc[rows1, metric] = raw1 / pooled_std
        out2.loc[rows2, metric] = raw2 / pooled_std

    return out1, out2
# Probit rescaling (rank-based normalization)
def probit_rescaling(scores1, scores2, metric='probit-E'):
    """
    Rank-based (folded probit) normalization of raw-E per test type.

    For every distinct value of 'test_type', the 'raw-E' values of both
    tables are pooled and ranked by absolute value. The ranks are mapped to
    percentiles in (0.5, 1), passed through the inverse normal CDF, and the
    original sign of each value is re-applied. The result is stored in the
    new column named by metric.

    :param scores1: DataFrame with 'test_type' and 'raw-E' columns
    :param scores2: DataFrame with 'test_type' and 'raw-E' columns
    :param metric: name of the output column; only 'probit-E' is accepted
    :return: tuple with copies of scores1 and scores2 holding the new column
    :raises ValueError: if metric is not 'probit-E' or the 'test_type'
        columns of the two tables are not identical
    """
    out1, out2 = scores1.copy(), scores2.copy()

    if metric != 'probit-E':
        raise ValueError(f"Invalid metric '{metric}'. Only 'probit-E' is supported.")

    raw_col = 'raw-E'
    for frame in (out1, out2):
        frame[metric] = np.nan

    types_match = out1['test_type'].equals(out2['test_type'])
    if not types_match:
        raise ValueError("scores1 and scores2 must have identical 'test_type' columns.")

    for test_name in np.unique(out1['test_type']):
        rows1 = out1['test_type'] == test_name
        rows2 = out2['test_type'] == test_name
        raw1 = out1.loc[rows1, raw_col].to_numpy()
        raw2 = out2.loc[rows2, raw_col].to_numpy()

        pooled = np.concatenate([raw1, raw2])
        total = len(pooled)

        # Folded probit: the rank is taken on |raw-E| so that the strength of
        # an association, not its direction, decides the rank. Percentiles
        # fall in (0.5, 1), hence norm.ppf yields strictly positive
        # magnitudes; the sign is put back afterwards so strong negative and
        # strong positive associations are treated alike. For test types
        # whose values are never negative the sign step changes nothing.
        magnitude_ranks = stats.rankdata(np.abs(pooled))
        folded_percentiles = 0.5 + magnitude_ranks / (total + 1) * 0.5
        signed_probit = np.sign(pooled) * stats.norm.ppf(folded_percentiles)

        # The first len(raw1) entries belong to scores1, the rest to scores2.
        part1, part2 = np.split(signed_probit, [len(raw1)])
        out1.loc[rows1, metric] = part1
        out2.loc[rows2, metric] = part2

    return out1, out2
def _separate_types(all_data, meta_file) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Split the data columns by the variable type declared in the metadata.

    The 'type' column of meta_file is compared case-insensitively. A column
    of all_data is selected for a given type when its name appears in the
    'label' column of the metadata rows of that type.

    :param all_data: DataFrame with all data
    :param meta_file: DataFrame with metadata of the variables
        ('label' and 'type' columns)
    :return: tuple with copies of the ordinal, nominal, continuous and
        binary columns, in that order
    :raises ValueError: if meta_file contains a type other than 'ordinal',
        'nominal', 'binary' or 'continuous'
    """
    declared_types = meta_file['type'].str.lower()

    allowed_types = ['ordinal', 'nominal', 'binary', 'continuous']
    if not declared_types.isin(allowed_types).all():
        raise ValueError("Invalid type found in meta_file. Allowed types are 'ordinal', 'nominal', 'binary', and 'continuous'.")

    def _columns_of(kind):
        # Labels registered under this type, then the matching data columns.
        labels = meta_file[declared_types == kind].label
        selected = all_data.columns.isin(labels)
        return all_data.iloc[:, selected].copy()

    ord_data = _columns_of('ordinal')
    nom_data = _columns_of('nominal')
    bi_data = _columns_of('binary')
    cont_data = _columns_of('continuous')

    return ord_data, nom_data, cont_data, bi_data


def _df_to_numpy(df: pd.DataFrame):
    """
    Convert a DataFrame to an independent float64 array plus its columns.

    :param df: DataFrame whose values can be cast to float64
    :return: tuple with a float64 copy of the values and the column index
    """
    column_index = df.columns
    as_float = df.to_numpy(dtype=np.float64)
    # Explicit copy so that later writes to the array never reach df.
    return as_float.copy(), column_index