Source code for grenadine.Inference.statistical_predictors

"""
This module infers co-expression Gene Regulatory Networks from
gene expression data (RNAseq or Microarray). It implements several
inference algorithms based on statistical predictors, using `scipy-stats`_ and
`scikit-learn`_.

.. _scipy-stats:
    https://docs.scipy.org/doc/scipy/reference/stats.html
.. _scikit-learn:
    https://scikit-learn.org
"""

from sklearn.feature_selection import f_regression as _sklearn_f_regression
from sklearn.feature_selection import mutual_info_regression as _sklearn_mutual_info_regression
from scipy.stats import spearmanr as _scipy_spearmanr
from scipy.stats import pearsonr as _scipy_pearsonr
from scipy.stats import wilcoxon as _scipy_wilcoxon
from scipy.stats import mannwhitneyu as _scipy_mannwhitneyu
from scipy.stats import theilslopes as _scipy_theilslopes
from scipy.stats import kendalltau as _scipy_kendalltau
from scipy.stats import rankdata as _scipy_rankdata
from scipy.stats import energy_distance as _scipy_energy_distance
from scipy.stats import wasserstein_distance as _scipy_wasserstein_distance
import numpy as np

# Module-level metadata: authorship, licensing, version and release status.
__author__ = "Sergio Peignier"
__copyright__ = "Copyright 2019, The GReNaDIne Project"
__license__ = "GPL"
__version__ = "0.0.1"
__maintainer__ = "Sergio Peignier"
__email__ = "sergio.peignier@insa-lyon.fr"
__status__ = "pre-alpha"

def abs_pearsonr_coef(X, y):
    """
    Score predictor function based on the `scipy-stats`_ absolute Pearson
    correlation.

    Args:
        X (pandas.DataFrame or numpy.ndarray): Transcription factor gene
            expressions where rows are experimental conditions and columns
            are transcription factors
        y (pandas.Series or numpy.ndarray): Target gene expression vector
            where rows are experimental conditions

    Returns:
        numpy.array: co-regulation scores.

        The i-th element of the score array represents the absolute value of
        the correlation between target gene expression and the i-th
        transcription factor gene expression.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
                               index =["c1","c2","c3","c4","c5"],
                               columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = abs_pearsonr_coef(tfs,tg)
        >>> scores
        array([0.41724166, 0.02212467, 0.23708491])
    """
    # Positional column slicing (X[:, i]) only works on arrays, so convert
    # first; this lets both DataFrames and plain arrays be passed in.
    X = np.asarray(X)
    y = np.asarray(y)
    n_tfs = X.shape[1]
    scores = np.zeros(n_tfs)
    for i in range(n_tfs):
        # pearsonr returns (correlation, p-value); only the correlation is kept
        scores[i] = _scipy_pearsonr(y, X[:, i])[0]
    return np.abs(scores)
def abs_spearmanr_coef(X, y):
    """
    Score predictor function based on the `scipy-stats`_ absolute Spearman
    correlation.

    Args:
        X (pandas.DataFrame or numpy.ndarray): Transcription factor gene
            expressions where rows are experimental conditions and columns
            are transcription factors
        y (pandas.Series or numpy.ndarray): Target gene expression vector
            where rows are experimental conditions

    Returns:
        numpy.array: co-regulation scores.

        The i-th element of the score array represents the absolute value of
        the correlation between target gene expression and the i-th
        transcription factor gene expression.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
                               index =["c1","c2","c3","c4","c5"],
                               columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = abs_spearmanr_coef(tfs,tg)
        >>> scores
        array([0.5, 0.3, 0.3])
    """
    # Positional column slicing (X[:, i]) only works on arrays, so convert
    # first; this lets both DataFrames and plain arrays be passed in.
    X = np.asarray(X)
    y = np.asarray(y)
    n_tfs = X.shape[1]
    scores = np.zeros(n_tfs)
    for i in range(n_tfs):
        # spearmanr returns (correlation, p-value); only the correlation is kept
        scores[i] = _scipy_spearmanr(y, X[:, i])[0]
    return np.abs(scores)
def kendalltau_score(X, y, **kendalltau_parameters):
    """
    Score predictor function based on the `scipy-stats`_ Kendall's tau
    correlation measure.

    Args:
        X (pandas.DataFrame or numpy.ndarray): Transcription factor gene
            expressions where rows are experimental conditions and columns
            are transcription factors
        y (pandas.Series or numpy.ndarray): Target gene expression vector
            where rows are experimental conditions
        **kendalltau_parameters: Named parameters for the scipy-stats
            kendall's tau correlation measure

    Returns:
        numpy.array: co-regulation scores.

        The i-th element of the score array is ``-log10(p + 1e-300)``, where
        ``p`` is the p-value of the Kendall's tau test between the ranked
        target gene expression and the ranked i-th transcription factor gene
        expression.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
                               index =["c1","c2","c3","c4","c5"],
                               columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = kendalltau_score(tfs,tg)
        >>> scores
        array([0.8487997 , 1.30065214, 0.20467198])
    """
    # Keeps log10 finite when the test returns a p-value of exactly 0.
    epsilon = 1e-300
    # Positional column slicing (X[:, i]) only works on arrays, so convert
    # first; this lets both DataFrames and plain arrays be passed in.
    X = np.asarray(X)
    n_tfs = X.shape[1]
    scores = np.zeros(n_tfs)
    # The target ranks do not depend on the column, so compute them once.
    y_ranks = _scipy_rankdata(y)
    for i in range(n_tfs):
        x_tf_ranks = _scipy_rankdata(X[:, i])
        # kendalltau returns (tau, p-value); the score is built from the p-value
        p_value = _scipy_kendalltau(y_ranks, x_tf_ranks,
                                    **kendalltau_parameters)[1]
        scores[i] = -np.log10(p_value + epsilon)
    return scores
def f_regression_score(X, y):
    """
    Score predictor function based on the `scikit-learn`_ f_regression score.

    Args:
        X (pandas.DataFrame): Transcription factor gene expressions where rows
            are experimental conditions and columns are transcription factors
        y (pandas.Series): Target gene expression vector where rows are
            experimental conditions

    Returns:
        numpy.array: co-regulation scores.

        The i-th element of the score array represents the score of the
        f_regression linear test between target gene expression and the i-th
        transcription factor gene expression.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
                               index =["c1","c2","c3","c4","c5"],
                               columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = f_regression_score(tfs,tg)
        >>> scores
        array([0.63235967, 0.00146922, 0.17867071])
    """
    # f_regression yields a (scores, p-values) pair; the p-values are discarded.
    f_statistics, _ = _sklearn_f_regression(X, y, center=True)
    return f_statistics
def CLR(X, y, **mi_parameters):
    """
    Score predictor function based on `scikit-learn`_ mutual_info_regression
    score.

    Args:
        X (pandas.DataFrame): Transcription factor gene expressions where rows
            are experimental conditions and columns are transcription factors
        y (pandas.Series): Target gene expression vector where rows are
            experimental conditions
        **mi_parameters: Named parameters for sklearn mutual_info_regression

    Returns:
        numpy.array: co-regulation scores.

        The i-th element of the score array represents the score of the
        sklearn mutual_info_regression computation between target gene
        expression and the i-th transcription factor gene expression.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
                               index =["c1","c2","c3","c4","c5"],
                               columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = CLR(tfs,tg)
        >>> scores
        array([6.66666667e-02, 1.16666667e-01, 2.22044605e-16])
    """
    # NOTE(review): only the raw mutual information is computed here; any
    # further normalisation suggested by the name "CLR" is presumably applied
    # by the caller -- confirm.
    return _sklearn_mutual_info_regression(X, y, **mi_parameters)
def wilcoxon_score(X, y, **wilcoxon_parameters):
    """
    Score predictor function based on the `scipy-stats`_ Wilcoxon signed-rank
    test.

    Args:
        X (pandas.DataFrame or numpy.ndarray): Transcription factor gene
            expressions where rows are experimental conditions and columns
            are transcription factors
        y (pandas.Series or numpy.ndarray): Target gene expression vector
            where rows are experimental conditions
        **wilcoxon_parameters: Named parameters for the scipy-stats Wilcoxon
            signed-rank test

    Returns:
        numpy.array: co-regulation scores.

        The i-th element of the score array is ``-log10(p + 1e-300)``, where
        ``p`` is the p-value of the Wilcoxon signed-rank test between target
        gene expression and the i-th transcription factor gene expression.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
                               index =["c1","c2","c3","c4","c5"],
                               columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = wilcoxon_score(tfs,tg)
        >>> scores
        array([1.36537718, 0.64797987, 0.30086998])
    """
    # Keeps log10 finite when the test returns a p-value of exactly 0.
    epsilon = 1e-300
    # Positional column slicing (X[:, i]) only works on arrays, so convert
    # first; this lets both DataFrames and plain arrays be passed in.
    X = np.asarray(X)
    y = np.asarray(y)
    n_tfs = X.shape[1]
    scores = np.zeros(n_tfs)
    for i in range(n_tfs):
        # wilcoxon returns (statistic, p-value); the score is built from the
        # p-value
        p_value = _scipy_wilcoxon(y, X[:, i], **wilcoxon_parameters)[1]
        scores[i] = -np.log10(p_value + epsilon)
    return scores
def mannwhitneyu_score(X, y, **mannwhitneyu_parameters):
    """
    Score predictor function based on the `scipy-stats`_ Mann-Whitney rank
    test.

    Args:
        X (pandas.DataFrame or numpy.ndarray): Transcription factor gene
            expressions where rows are experimental conditions and columns
            are transcription factors
        y (pandas.Series or numpy.ndarray): Target gene expression vector
            where rows are experimental conditions
        **mannwhitneyu_parameters: Named parameters for the scipy-stats
            Mann-Whitney rank test

    Returns:
        numpy.array: co-regulation scores.

        The i-th element of the score array is ``-log10(p + 1e-300)``, where
        ``p`` is the p-value of the Mann-Whitney rank test between target
        gene expression and the i-th transcription factor gene expression.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
                               index =["c1","c2","c3","c4","c5"],
                               columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = mannwhitneyu_score(tfs,tg)
        >>> scores
        array([1.52213525, 0.47101693, 0.3795872 ])
    """
    # Keeps log10 finite when the test returns a p-value of exactly 0.
    epsilon = 1e-300
    # Positional column slicing (X[:, i]) only works on arrays, so convert
    # first; this lets both DataFrames and plain arrays be passed in.
    X = np.asarray(X)
    y = np.asarray(y)
    n_tfs = X.shape[1]
    scores = np.zeros(n_tfs)
    for i in range(n_tfs):
        # mannwhitneyu returns (statistic, p-value); the score is built from
        # the p-value
        p_value = _scipy_mannwhitneyu(y, X[:, i], **mannwhitneyu_parameters)[1]
        scores[i] = -np.log10(p_value + epsilon)
    return scores
def theilslopes_score(X, y, **theilslopes_parameters):
    """
    Score predictor function based on the `scipy-stats`_ Theil-Sen robust
    slope estimator.

    Args:
        X (pandas.DataFrame or numpy.ndarray): Transcription factor gene
            expressions where rows are experimental conditions and columns
            are transcription factors
        y (pandas.Series or numpy.ndarray): Target gene expression vector
            where rows are experimental conditions
        **theilslopes_parameters: Named parameters for the scipy-stats
            Theil-Sen robust slope estimator

    Returns:
        numpy.array: co-regulation scores.

        The i-th element of the score array is the absolute value of the
        Theil-Sen slope estimated with the target gene expression as the
        dependent variable and the i-th transcription factor gene expression
        as the independent variable.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
                               index =["c1","c2","c3","c4","c5"],
                               columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = theilslopes_score(tfs,tg)
        >>> scores
        array([0.92309299, 0.90933202, 0.26451817])
    """
    # Positional column slicing (X[:, i]) only works on arrays, so convert
    # first; this lets both DataFrames and plain arrays be passed in.
    X = np.asarray(X)
    y = np.asarray(y)
    n_tfs = X.shape[1]
    scores = np.zeros(n_tfs)
    for i in range(n_tfs):
        # The first element of the theilslopes result is the slope estimate.
        slope = _scipy_theilslopes(y, X[:, i], **theilslopes_parameters)[0]
        scores[i] = np.abs(slope)
    return scores
def energy_distance_score(X, y, **energy_distance_parameters):
    """
    Score predictor function based on the `scipy-stats`_ energy distance
    between 1D distributions.

    Args:
        X (pandas.DataFrame or numpy.ndarray): Transcription factor gene
            expressions where rows are experimental conditions and columns
            are transcription factors
        y (pandas.Series or numpy.ndarray): Target gene expression vector
            where rows are experimental conditions
        **energy_distance_parameters: Named parameters for the scipy-stats
            energy distance

    Returns:
        numpy.array: co-regulation scores.

        The i-th element of the score array is ``exp(-d)``, where ``d`` is
        the energy distance between target gene expression and the i-th
        transcription factor gene expression.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
                               index =["c1","c2","c3","c4","c5"],
                               columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = energy_distance_score(tfs,tg)
        >>> scores
        array([0.40613705, 0.6881455 , 0.72786711])
    """
    # Positional column slicing (X[:, i]) only works on arrays, so convert
    # first; this lets both DataFrames and plain arrays be passed in.
    X = np.asarray(X)
    y = np.asarray(y)
    n_tfs = X.shape[1]
    scores = np.zeros(n_tfs)
    for i in range(n_tfs):
        distance = _scipy_energy_distance(y, X[:, i],
                                          **energy_distance_parameters)
        # exp(-d) turns a distance into a score: 1 at distance 0, decreasing
        # towards 0 as the distance grows.
        scores[i] = np.exp(-distance)
    return scores
def wasserstein_distance_score(X, y, **wasserstein_distance_parameters):
    """
    Score predictor function based on the `scipy-stats`_ Wasserstein distance
    between 1D distributions.

    Args:
        X (pandas.DataFrame or numpy.ndarray): Transcription factor gene
            expressions where rows are experimental conditions and columns
            are transcription factors
        y (pandas.Series or numpy.ndarray): Target gene expression vector
            where rows are experimental conditions
        **wasserstein_distance_parameters: Named parameters for the
            scipy-stats Wasserstein distance

    Returns:
        numpy.array: co-regulation scores.

        The i-th element of the score array is ``exp(-d)``, where ``d`` is
        the Wasserstein distance between target gene expression and the i-th
        transcription factor gene expression.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
                               index =["c1","c2","c3","c4","c5"],
                               columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = wasserstein_distance_score(tfs,tg)
        >>> scores
        array([0.36457586, 0.72057084, 0.81207932])
    """
    # Positional column slicing (X[:, i]) only works on arrays, so convert
    # first; this lets both DataFrames and plain arrays be passed in.
    X = np.asarray(X)
    y = np.asarray(y)
    n_tfs = X.shape[1]
    scores = np.zeros(n_tfs)
    for i in range(n_tfs):
        distance = _scipy_wasserstein_distance(
            y, X[:, i], **wasserstein_distance_parameters)
        # exp(-d) turns a distance into a score: 1 at distance 0, decreasing
        # towards 0 as the distance grows.
        scores[i] = np.exp(-distance)
    return scores