"""
This module infers co-expression Gene Regulatory Networks from gene
expression data (RNA-seq or microarray). It implements several inference
algorithms based on statistical predictors, using `scipy-stats`_ and
`scikit-learn`_.

.. _scipy-stats:
    https://docs.scipy.org/doc/scipy/reference/stats.html
.. _scikit-learn:
    https://scikit-learn.org
"""
from sklearn.feature_selection import f_regression as _sklearn_f_regression
from sklearn.feature_selection import mutual_info_regression as _sklearn_mutual_info_regression
from scipy.stats import spearmanr as _scipy_spearmanr
from scipy.stats import pearsonr as _scipy_pearsonr
from scipy.stats import wilcoxon as _scipy_wilcoxon
from scipy.stats import mannwhitneyu as _scipy_mannwhitneyu
from scipy.stats import theilslopes as _scipy_theilslopes
from scipy.stats import kendalltau as _scipy_kendalltau
from scipy.stats import rankdata as _scipy_rankdata
from scipy.stats import energy_distance as _scipy_energy_distance
from scipy.stats import wasserstein_distance as _scipy_wasserstein_distance
import numpy as np
# Module-level metadata: authorship, license, version and release status.
__author__ = "Sergio Peignier"
__copyright__ = "Copyright 2019, The GReNaDIne Project"
__license__ = "GPL"
__version__ = "0.0.1"
__maintainer__ = "Sergio Peignier"
__email__ = "sergio.peignier@insa-lyon.fr"
__status__ = "pre-alpha"
def abs_pearsonr_coef(X, y):
    """
    Score predictor function based on the `scipy-stats`_
    absolute Pearson correlation.

    Args:
        X (pandas.DataFrame or numpy.ndarray): Transcription factor gene
            expressions where rows are experimental conditions and columns
            are transcription factors
        y (pandas.Series or numpy.ndarray): Target gene expression vector
            where rows are experimental conditions

    Returns:
        numpy.array: co-regulation scores.

        The i-th element of the score array represents the absolute value of
        the correlation between target gene expression and the i-th
        transcription factor gene expression.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index =["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = abs_pearsonr_coef(tfs,tg)
        >>> scores
        array([0.41724166, 0.02212467, 0.23708491])
    """
    # ``X[:, i]`` is positional (NumPy) indexing and is rejected by a
    # DataFrame, so coerce both inputs to plain arrays first.
    X = np.asarray(X)
    y = np.asarray(y)
    correlations = np.array(
        [_scipy_pearsonr(y, X[:, i])[0] for i in range(X.shape[1])],
        dtype=float,
    )
    return np.abs(correlations)
def abs_spearmanr_coef(X, y):
    """
    Score predictor function based on the `scipy-stats`_
    absolute Spearman correlation.

    Args:
        X (pandas.DataFrame or numpy.ndarray): Transcription factor gene
            expressions where rows are experimental conditions and columns
            are transcription factors
        y (pandas.Series or numpy.ndarray): Target gene expression vector
            where rows are experimental conditions

    Returns:
        numpy.array: co-regulation scores.

        The i-th element of the score array represents the absolute value of
        the correlation between target gene expression and the i-th
        transcription factor gene expression.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index =["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = abs_spearmanr_coef(tfs,tg)
        >>> scores
        array([0.5, 0.3, 0.3])
    """
    # ``X[:, i]`` is positional (NumPy) indexing and is rejected by a
    # DataFrame, so coerce both inputs to plain arrays first.
    X = np.asarray(X)
    y = np.asarray(y)
    correlations = np.array(
        [_scipy_spearmanr(y, X[:, i])[0] for i in range(X.shape[1])],
        dtype=float,
    )
    return np.abs(correlations)
def kendalltau_score(X, y, **kendalltau_parameters):
    """
    Score predictor function based on the `scipy-stats`_
    Kendall's tau correlation measure.

    The score is ``-log10(p_value + epsilon)`` where ``p_value`` is the
    p-value returned by the Kendall's tau test computed on the ranks of the
    target gene and of each transcription factor.

    Args:
        X (pandas.DataFrame or numpy.ndarray): Transcription factor gene
            expressions where rows are experimental conditions and columns
            are transcription factors
        y (pandas.Series or numpy.ndarray): Target gene expression vector
            where rows are experimental conditions
        **kendalltau_parameters: Named parameters for the scipy-stats
            Kendall's tau correlation measure

    Returns:
        numpy.array: co-regulation scores.

        The i-th element of the score array represents the score between
        target gene expression and the i-th transcription factor gene
        expression.

    Examples:
        Output below is the one reported in the original documentation;
        p-value based scores may differ across SciPy versions.

        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index =["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = kendalltau_score(tfs,tg)
        >>> scores
        array([0.8487997 , 1.30065214, 0.20467198])
    """
    # Tiny offset so that a p-value of exactly 0 does not yield log10(0).
    epsilon = 1e-300
    # ``X[:, i]`` is positional (NumPy) indexing and is rejected by a
    # DataFrame, so coerce the matrix to a plain array first.
    X = np.asarray(X)
    y_ranks = _scipy_rankdata(y)
    p_values = np.array(
        [
            _scipy_kendalltau(
                y_ranks, _scipy_rankdata(X[:, i]), **kendalltau_parameters
            )[1]
            for i in range(X.shape[1])
        ],
        dtype=float,
    )
    return -np.log10(p_values + epsilon)
def f_regression_score(X, y):
    """
    Score predictor function based on the `scikit-learn`_ f_regression score.

    Args:
        X (pandas.DataFrame): Transcription factor gene expressions where rows
            are experimental conditions and columns are transcription factors
        y (pandas.Series): Target gene expression vector where rows are
            experimental conditions

    Returns:
        numpy.array: co-regulation scores.

        The i-th element of the score array represents the score of the
        f_regression linear test between target gene expression and the
        i-th transcription factor gene expression.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index =["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = f_regression_score(tfs,tg)
        >>> scores
        array([0.63235967, 0.00146922, 0.17867071])
    """
    # f_regression returns (F statistics, p-values); only the statistics are
    # used as scores.
    f_statistics, _ = _sklearn_f_regression(X, y, center=True)
    return f_statistics
def CLR(X, y, **mi_parameters):
    """
    Score predictor function based on `scikit-learn`_ mutual_info_regression
    score.

    Note:
        This function only forwards to ``mutual_info_regression`` and returns
        its raw output; no further normalisation of the mutual information
        values is applied here.

    Args:
        X (pandas.DataFrame): Transcription factor gene expressions where rows
            are experimental conditions and columns are transcription factors
        y (pandas.Series): Target gene expression vector where rows are
            experimental conditions
        **mi_parameters: Named parameters for sklearn mutual_info_regression

    Returns:
        numpy.array: co-regulation scores.

        The i-th element of the score array represents the score of the
        sklearn mutual_info_regression computation between target gene
        expression and the i-th transcription factor gene expression.

    Examples:
        Output below is the one reported in the original documentation; it
        may differ between runs and library versions.

        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index =["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = CLR(tfs,tg)
        >>> scores
        array([6.66666667e-02, 1.16666667e-01, 2.22044605e-16])
    """
    return _sklearn_mutual_info_regression(X, y, **mi_parameters)
def wilcoxon_score(X, y, **wilcoxon_parameters):
    """
    Score predictor function based on the `scipy-stats`_
    Wilcoxon signed-rank test.

    The score is ``-log10(p_value + epsilon)`` where ``p_value`` is the
    p-value of the test applied to the target gene and each transcription
    factor.

    Args:
        X (pandas.DataFrame or numpy.ndarray): Transcription factor gene
            expressions where rows are experimental conditions and columns
            are transcription factors
        y (pandas.Series or numpy.ndarray): Target gene expression vector
            where rows are experimental conditions
        **wilcoxon_parameters: Named parameters for the scipy-stats Wilcoxon
            signed-rank test

    Returns:
        numpy.array: co-regulation scores.

        The i-th element of the score array represents the score
        between target gene expression and the i-th transcription
        factor gene expression.

    Examples:
        Output below is the one reported in the original documentation;
        p-value based scores may differ across SciPy versions.

        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index =["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = wilcoxon_score(tfs,tg)
        >>> scores
        array([1.36537718, 0.64797987, 0.30086998])
    """
    # Tiny offset so that a p-value of exactly 0 does not yield log10(0).
    epsilon = 1e-300
    # ``X[:, i]`` is positional (NumPy) indexing and is rejected by a
    # DataFrame, so coerce both inputs to plain arrays first.
    X = np.asarray(X)
    y = np.asarray(y)
    p_values = np.array(
        [
            _scipy_wilcoxon(y, X[:, i], **wilcoxon_parameters)[1]
            for i in range(X.shape[1])
        ],
        dtype=float,
    )
    return -np.log10(p_values + epsilon)
def mannwhitneyu_score(X, y, **mannwhitneyu_parameters):
    """
    Score predictor function based on the `scipy-stats`_ Mann-Whitney rank test.

    The score is ``-log10(p_value + epsilon)`` where ``p_value`` is the
    p-value of the test applied to the target gene and each transcription
    factor.

    Args:
        X (pandas.DataFrame or numpy.ndarray): Transcription factor gene
            expressions where rows are experimental conditions and columns
            are transcription factors
        y (pandas.Series or numpy.ndarray): Target gene expression vector
            where rows are experimental conditions
        **mannwhitneyu_parameters: Named parameters for the scipy-stats
            Mann-Whitney rank test

    Returns:
        numpy.array: co-regulation scores.

        The i-th element of the score array represents the score
        between target gene expression and the i-th transcription
        factor gene expression.

    Examples:
        Output below is the one reported in the original documentation;
        p-value based scores may differ across SciPy versions.

        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index =["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = mannwhitneyu_score(tfs,tg)
        >>> scores
        array([1.52213525, 0.47101693, 0.3795872 ])
    """
    # Tiny offset so that a p-value of exactly 0 does not yield log10(0).
    epsilon = 1e-300
    # ``X[:, i]`` is positional (NumPy) indexing and is rejected by a
    # DataFrame, so coerce both inputs to plain arrays first.
    X = np.asarray(X)
    y = np.asarray(y)
    p_values = np.array(
        [
            _scipy_mannwhitneyu(y, X[:, i], **mannwhitneyu_parameters)[1]
            for i in range(X.shape[1])
        ],
        dtype=float,
    )
    return -np.log10(p_values + epsilon)
def theilslopes_score(X, y, **theilslopes_parameters):
    """
    Score predictor function based on the `scipy-stats`_
    Theil-Sen robust slope estimator.

    The score is the absolute value of the estimated slope of the target
    gene expression regressed on each transcription factor expression.

    Args:
        X (pandas.DataFrame or numpy.ndarray): Transcription factor gene
            expressions where rows are experimental conditions and columns
            are transcription factors
        y (pandas.Series or numpy.ndarray): Target gene expression vector
            where rows are experimental conditions
        **theilslopes_parameters: Named parameters for the scipy-stats
            Theil-Sen robust slope estimator

    Returns:
        numpy.array: co-regulation scores.

        The i-th element of the score array represents the score
        between target gene expression and the i-th transcription
        factor gene expression.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index =["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = theilslopes_score(tfs,tg)
        >>> scores
        array([0.92309299, 0.90933202, 0.26451817])
    """
    # ``X[:, i]`` is positional (NumPy) indexing and is rejected by a
    # DataFrame, so coerce both inputs to plain arrays first.
    X = np.asarray(X)
    y = np.asarray(y)
    slopes = np.array(
        [
            _scipy_theilslopes(y, X[:, i], **theilslopes_parameters)[0]
            for i in range(X.shape[1])
        ],
        dtype=float,
    )
    return np.abs(slopes)
def energy_distance_score(X, y, **energy_distance_parameters):
    """
    Score predictor function based on the `scipy-stats`_ energy distance between
    1D distributions.

    The score is ``exp(-d)`` where ``d`` is the energy distance between the
    target gene expression values and each transcription factor's values, so
    identical empirical distributions score 1 and the score decreases as the
    distance grows.

    Args:
        X (pandas.DataFrame or numpy.ndarray): Transcription factor gene
            expressions where rows are experimental conditions and columns
            are transcription factors
        y (pandas.Series or numpy.ndarray): Target gene expression vector
            where rows are experimental conditions
        **energy_distance_parameters: Named parameters for the scipy-stats
            energy distance

    Returns:
        numpy.array: co-regulation scores.

        The i-th element of the score array represents the score
        between target gene expression and the i-th transcription
        factor gene expression.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index =["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = energy_distance_score(tfs,tg)
        >>> scores
        array([0.40613705, 0.6881455 , 0.72786711])
    """
    # ``X[:, i]`` is positional (NumPy) indexing and is rejected by a
    # DataFrame, so coerce both inputs to plain arrays first.
    X = np.asarray(X)
    y = np.asarray(y)
    distances = np.array(
        [
            _scipy_energy_distance(y, X[:, i], **energy_distance_parameters)
            for i in range(X.shape[1])
        ],
        dtype=float,
    )
    return np.exp(-distances)
def wasserstein_distance_score(X, y, **wasserstein_distance_parameters):
    """
    Score predictor function based on the `scipy-stats`_ Wasserstein distance
    between 1D distributions.

    The score is ``exp(-d)`` where ``d`` is the Wasserstein distance between
    the target gene expression values and each transcription factor's values,
    so identical empirical distributions score 1 and the score decreases as
    the distance grows.

    Args:
        X (pandas.DataFrame or numpy.ndarray): Transcription factor gene
            expressions where rows are experimental conditions and columns
            are transcription factors
        y (pandas.Series or numpy.ndarray): Target gene expression vector
            where rows are experimental conditions
        **wasserstein_distance_parameters: Named parameters for the scipy-stats
            Wasserstein distance

    Returns:
        numpy.array: co-regulation scores.

        The i-th element of the score array represents the score
        between target gene expression and the i-th transcription
        factor gene expression.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index =["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = wasserstein_distance_score(tfs,tg)
        >>> scores
        array([0.36457586, 0.72057084, 0.81207932])
    """
    # ``X[:, i]`` is positional (NumPy) indexing and is rejected by a
    # DataFrame, so coerce both inputs to plain arrays first.
    X = np.asarray(X)
    y = np.asarray(y)
    distances = np.array(
        [
            _scipy_wasserstein_distance(
                y, X[:, i], **wasserstein_distance_parameters
            )
            for i in range(X.shape[1])
        ],
        dtype=float,
    )
    return np.exp(-distances)