Source code for grenadine.Inference.regression_predictors

"""
This module infers co-expression Gene Regulatory Networks from
gene expression data (RNA-seq or microarray). It implements several
inference algorithms based on regression, using `scikit-learn`_.

.. _scikit-learn:
    https://scikit-learn.org
"""
from sklearn.linear_model import BayesianRidge as _sklearn_BayesianRidge
from sklearn.svm import SVR as _sklearn_SVR
from sklearn.linear_model import Lasso as _sklearn_Lasso
from sklearn.linear_model import LassoLars as _sklearn_LassoLars
from sklearn.model_selection import train_test_split as _sklearn_train_test_split
from sklearn.linear_model import Lars as _sklearn_Lars
from sklearn.ensemble import RandomForestRegressor as _sklearn_RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor as _sklearn_ExtraTreesRegressor
from sklearn.linear_model import ElasticNetCV as _sklearn_ElasticNetCV
from sklearn.ensemble import GradientBoostingRegressor as _sklearn_GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor as _sklearn_AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor as _sklearn_BaggingRegressor
from pandas import DataFrame
import numpy as np
import pandas as pd
# RandomizedLasso is optional: when the installed scikit-learn does not
# provide it, the module still loads and only `stability_randomizedlasso`
# becomes unusable (the name `_sklearn_RandomizedLasso` stays undefined).
try:
    from sklearn.linear_model import RandomizedLasso as _sklearn_RandomizedLasso
except ImportError:
    # Only a failed import is expected here; anything else (e.g.
    # KeyboardInterrupt) must not be swallowed by a bare ``except``.
    import sklearn
    print("sklearn.linear_model.RandomizedLasso could not be loaded")
    print('RandomizedLasso is not available in sklearn '+sklearn.__version__)

# Package metadata: authorship, licensing, version and release status.
__author__ = "Sergio Peignier, Pauline Schmitt"
__copyright__ = "Copyright 2019, The GReNaDIne Project"
__license__ = "GPL"
__version__ = "0.0.1"
__maintainer__ = "Sergio Peignier"
__email__ = "sergio.peignier@insa-lyon.fr"
__status__ = "pre-alpha"

def GENIE3(X, y, **rf_parameters):
    """
    GENIE3 score predictor built on the `scikit-learn`_ RandomForestRegressor.

    A random forest is fitted to predict the target gene from the
    transcription factors, and its feature importances are returned as
    regulation scores.

    Args:
        X (pandas.DataFrame): Transcription factor gene expressions where rows
            are experimental conditions and columns are transcription factors
        y (pandas.Series): Target gene expression vector where rows are
            experimental conditions
        **rf_parameters: Named parameters forwarded to the sklearn
            RandomForestRegressor constructor

    Returns:
        numpy.array: co-regulation scores.

        The i-th element of the score array is the feature importance that the
        RandomForestRegressor assigns to transcription factor i for this
        target gene.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index=["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = GENIE3(tfs,tg)
        >>> scores.shape
        (3,)
    """
    forest = _sklearn_RandomForestRegressor(**rf_parameters)
    forest.fit(X, y)
    return forest.feature_importances_
def XGENIE3(X, y, **rf_parameters):
    """
    XGENIE3 score predictor built on the `scikit-learn`_ ExtraTreesRegressor.

    An extremely-randomized-trees ensemble is fitted to predict the target
    gene from the transcription factors, and its feature importances are
    returned as regulation scores.

    Args:
        X (pandas.DataFrame): Transcription factor gene expressions where rows
            are experimental conditions and columns are transcription factors
        y (pandas.Series): Target gene expression vector where rows are
            experimental conditions
        **rf_parameters: Named parameters forwarded to the sklearn
            ExtraTreesRegressor constructor

    Returns:
        numpy.array: co-regulation scores.

        The i-th element of the score array is the feature importance that the
        ExtraTreesRegressor assigns to transcription factor i for this target
        gene.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index=["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = XGENIE3(tfs,tg)
        >>> scores.shape
        (3,)
    """
    extra_trees = _sklearn_ExtraTreesRegressor(**rf_parameters)
    extra_trees.fit(X, y)
    return extra_trees.feature_importances_
def GRNBoost2(X, y, **boost_parameters):
    """
    GRNBoost2 score predictor built on the `scikit-learn`_
    GradientBoostingRegressor.

    Args:
        X (pandas.DataFrame): Transcription factor gene expressions where rows
            are experimental conditions and columns are transcription factors
        y (pandas.Series): Target gene expression vector where rows are
            experimental conditions
        **boost_parameters: Named parameters forwarded to the sklearn
            GradientBoostingRegressor constructor

    Returns:
        numpy.array: co-regulation scores.

        The i-th element of the score array is the feature importance that the
        GradientBoostingRegressor assigns to transcription factor i for this
        target gene.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index=["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = GRNBoost2(tfs,tg)
        >>> scores.shape
        (3,)
    """
    booster = _sklearn_GradientBoostingRegressor(**boost_parameters)
    booster.fit(X, y)
    return booster.feature_importances_
def BayesianRidgeScore(X, y, **brr_parameters):
    """
    Score predictor built on `scikit-learn`_ BayesianRidge regression.

    The model is fitted to predict the target gene from the transcription
    factors; the absolute value of each regression coefficient is the score.

    Args:
        X (pandas.DataFrame): Transcription factor gene expressions where rows
            are experimental conditions and columns are transcription factors
        y (pandas.Series): Target gene expression vector where rows are
            experimental conditions
        **brr_parameters: Named parameters forwarded to the sklearn
            BayesianRidge constructor

    Returns:
        numpy.array: co-regulation scores.

        The i-th element of the score array is the absolute BayesianRidge
        coefficient of transcription factor i for this target gene.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index=["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = BayesianRidgeScore(tfs,tg)
        >>> scores.shape
        (3,)
    """
    ridge = _sklearn_BayesianRidge(**brr_parameters)
    ridge.fit(X, y)
    coefficients = ridge.coef_
    return np.abs(coefficients)
def SVR_score(X, y, **svr_parameters):
    """
    Score predictor built on `scikit-learn`_ SVR (Support Vector Regression).

    The kernel is always forced to ``'linear'`` (any ``kernel`` value passed
    by the caller is overridden), because only a linear SVR exposes the
    ``coef_`` weights used as scores.

    Args:
        X (pandas.DataFrame): Transcription factor gene expressions where rows
            are experimental conditions and columns are transcription factors
        y (pandas.Series): Target gene expression vector where rows are
            experimental conditions
        **svr_parameters: Named parameters forwarded to the sklearn SVR
            constructor (``kernel`` is overwritten with ``'linear'``)

    Returns:
        numpy.array: co-regulation scores.

        The i-th element of the score array is the absolute weight that the
        linear SVR assigns to transcription factor i for this target gene
        (first row of the absolute ``coef_`` matrix).

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index=["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = SVR_score(tfs,tg)
        >>> scores.shape
        (3,)
    """
    # Build the effective parameter set with the kernel pinned to linear.
    linear_parameters = dict(svr_parameters, kernel='linear')
    svr = _sklearn_SVR(**linear_parameters)
    svr.fit(X, y)
    absolute_weights = np.abs(svr.coef_)
    # coef_ is a one-row matrix; return that row as a flat vector.
    return absolute_weights[0]
def Lasso_score(X, y, **l1_parameters):
    """
    Score predictor built on `scikit-learn`_ Lasso regression.

    The model is fitted to predict the target gene from the transcription
    factors; the absolute value of each regression coefficient is the score.

    Args:
        X (pandas.DataFrame): Transcription factor gene expressions where rows
            are experimental conditions and columns are transcription factors
        y (pandas.Series): Target gene expression vector where rows are
            experimental conditions
        **l1_parameters: Named parameters forwarded to the sklearn Lasso
            constructor

    Returns:
        numpy.array: co-regulation scores.

        The i-th element of the score array is the absolute Lasso coefficient
        of transcription factor i for this target gene.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index=["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = Lasso_score(tfs,tg, alpha=0.01)
        >>> scores.shape
        (3,)
    """
    lasso = _sklearn_Lasso(**l1_parameters)
    lasso.fit(X, y)
    coefficients = lasso.coef_
    return np.abs(coefficients)
def LassoLars_score(X, y, **l1_parameters):
    """
    Score predictor built on `scikit-learn`_ LassoLars regression.

    The model is fitted to predict the target gene from the transcription
    factors; the absolute value of each regression coefficient is the score.

    Args:
        X (pandas.DataFrame): Transcription factor gene expressions where rows
            are experimental conditions and columns are transcription factors
        y (pandas.Series): Target gene expression vector where rows are
            experimental conditions
        **l1_parameters: Named parameters forwarded to the sklearn LassoLars
            constructor

    Returns:
        numpy.array: co-regulation scores.

        The i-th element of the score array is the absolute LassoLars
        coefficient of transcription factor i for this target gene.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index=["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = LassoLars_score(tfs,tg, alpha=0.01)
        >>> scores.shape
        (3,)
    """
    lasso_lars = _sklearn_LassoLars(**l1_parameters)
    lasso_lars.fit(X, y)
    coefficients = lasso_lars.coef_
    return np.abs(coefficients)
def stability_randomizedlasso(X, y, **rl_parameters):
    """
    Score predictor built on `scikit-learn`_ RandomizedLasso stability
    selection.

    Args:
        X (pandas.DataFrame): Transcription factor gene expressions where rows
            are experimental conditions and columns are transcription factors
        y (pandas.Series): Target gene expression vector where rows are
            experimental conditions
        **rl_parameters: Named parameters forwarded to the sklearn
            RandomizedLasso constructor

    Returns:
        numpy.array: co-regulation scores.

        The i-th element of the score array is the absolute stability
        selection score (``scores_``) that RandomizedLasso assigns to
        transcription factor i for this target gene.

    Raises:
        NameError: if RandomizedLasso could not be imported when this module
            was loaded (the guarded import then leaves the class undefined).

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index=["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = stability_randomizedlasso(tfs,tg)
        >>> scores.shape
        (3,)
    """
    selector = _sklearn_RandomizedLasso(**rl_parameters)
    selector.fit(X, y)
    stability = selector.scores_
    return np.abs(stability)
def _tigress_lars(n_nonzero_coefs):
    """Build a Lars model without feature normalization, on any sklearn version."""
    try:
        return _sklearn_Lars(normalize=False, n_nonzero_coefs=n_nonzero_coefs)
    except TypeError:
        # Recent scikit-learn releases no longer accept ``normalize``; there
        # Lars never normalizes, so dropping the argument is equivalent.
        return _sklearn_Lars(n_nonzero_coefs=n_nonzero_coefs)


def TIGRESS(X, y, nsplit=100, nstepsLARS=5, alpha=0.4, scoring="area"):
    """
    TIGRESS score predictor based on stability selection.

    For each of ``nsplit`` rounds, every transcription factor column is
    rescaled by a random weight, the samples are split into two subsamples,
    and LARS is run on each subsample. The score of a transcription factor
    summarizes how often it is selected along the LARS path.

    ``X`` is not modified: the random reweighting is applied to a copy.

    Args:
        X (pandas.DataFrame): Transcription factor gene expressions where rows
            are experimental conditions and columns are transcription factors
        y (pandas.Series): Target gene expression vector where rows are
            experimental conditions
        nsplit (int): number of random splits (randomization rounds); the
            higher the better
        nstepsLARS (int): number of LARS steps, i.e., number of non-zero
            coefficients to keep (Lars ``n_nonzero_coefs``). When it exceeds
            the number of transcription factors p, p-1 is used instead
        alpha (float): noise multiplier lower bound; each transcription factor
            expression is multiplied by a random weight drawn uniformly from
            [alpha, 1]
        scoring (str): how selection frequencies are turned into a score;
            only "area" (running mean of the frequencies over the LARS steps)
            and "max" (running maximum) are available

    Returns:
        numpy.array: co-regulation scores.

        The i-th element of the score array is the stability selection score
        of the regulatory relationship between the target gene and
        transcription factor i, evaluated at the last LARS step. Selection
        frequencies are normalized by the number of LARS fits actually
        performed, so scores lie in [0, 1].

    Raises:
        ValueError: if ``scoring`` is neither "area" nor "max".

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index=["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = TIGRESS(tfs,tg)
        >>> scores.shape
        (3,)
    """
    if scoring not in ("area", "max"):
        raise ValueError('scoring must be "area" or "max", got %r' % (scoring,))
    n, p = X.shape
    halfsize = int(n / 2)
    if nstepsLARS > p:
        nstepsLARS = p - 1
    # freq[j, k]: how many fits had TF j selected after LARS step k+1.
    freq = np.zeros((p, nstepsLARS))
    n_fits = 0
    split = 0
    while split < nsplit:
        # Randomly reweight each variable (TF expression). A new object is
        # built so the caller's X is never altered.
        random_perturbation = np.random.uniform(low=alpha, high=1.0, size=p)
        X_perturbed = X * random_perturbation
        # Randomly split the samples in two sets
        X_1, X_2, y_1, y_2 = _sklearn_train_test_split(
            X_perturbed, y, test_size=halfsize, shuffle=True)
        for X_i, y_i in ((X_1, y_1), (X_2, y_2)):
            # A constant (or single-sample) target carries no signal: skip.
            if not y_i.std() > 0:
                continue
            # Run LARS on the subsample and record which variables are
            # selected at each step of the path.
            lars = _tigress_lars(nstepsLARS)
            lars.fit(X_i, y_i)
            path = lars.coef_path_
            if path.shape[1] < nstepsLARS + 1:
                # LARS stopped early: repeat the final coefficients so the
                # path covers every requested step.
                path_add = np.tile(
                    path[:, -1], (nstepsLARS + 1 - path.shape[1], 1)).T
                path = np.hstack((path, path_add))
            # Column 0 is the all-zero starting point of the path.
            freq += np.abs(np.sign(path[:, 1:nstepsLARS + 1]))
            n_fits += 1
        split += 1
    # Normalize counts by the number of fits to get frequencies in [0,1]
    # (stability curves). With no fit at all, freq stays all zeros.
    if n_fits:
        freq /= n_fits
    if scoring == "area":
        score = np.cumsum(freq, axis=1) / np.arange(1, nstepsLARS + 1, 1)
    else:
        score = np.maximum.accumulate(freq, axis=1)
    return score[:, nstepsLARS - 1]
def AdaBoost_regressor(X, y, **adab_parameters):
    """
    AdaBoost score predictor built on the `scikit-learn`_ AdaBoostRegressor.

    Args:
        X (pandas.DataFrame): Transcription factor gene expressions where rows
            are experimental conditions and columns are transcription factors
        y (pandas.Series): Target gene expression vector where rows are
            experimental conditions
        **adab_parameters: Named parameters forwarded to the sklearn
            AdaBoostRegressor constructor

    Returns:
        numpy.array: co-regulation scores.

        The i-th element of the score array is the feature importance that the
        AdaBoostRegressor assigns to transcription factor i for this target
        gene.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index=["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = AdaBoost_regressor(tfs,tg)
        >>> scores.shape
        (3,)
    """
    booster = _sklearn_AdaBoostRegressor(**adab_parameters)
    booster.fit(X, y)
    return booster.feature_importances_
def Elastica(X, y, **elastica_parameters):
    """
    Elastic-net score predictor built on the `scikit-learn`_ ElasticNetCV.

    The model is fitted to predict the target gene from the transcription
    factors; the absolute value of each regression coefficient is the score.

    Args:
        X (pandas.DataFrame): Transcription factor gene expressions where rows
            are experimental conditions and columns are transcription factors
        y (pandas.Series): Target gene expression vector where rows are
            experimental conditions
        **elastica_parameters: Named parameters forwarded to the sklearn
            ElasticNetCV constructor

    Returns:
        numpy.array: co-regulation scores.

        The i-th element of the score array is the absolute ElasticNetCV
        coefficient of transcription factor i for this target gene.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index=["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = Elastica(tfs,tg)
        >>> scores.shape
        (3,)
    """
    elastic_net = _sklearn_ElasticNetCV(**elastica_parameters)
    elastic_net.fit(X, y)
    coefficients = elastic_net.coef_
    return np.abs(coefficients)
def bagging_regressor(X, y, **bagging_parameters):
    """
    Apply the bagging technique to a regression algorithm, based on
    `scikit-learn`_ BaggingRegressor.

    Each fitted base estimator contributes a per-feature score: its
    ``feature_importances_`` when available, otherwise the absolute value of
    its ``coef_``. Scores are averaged over the estimators, and a tiny random
    noise (standard normal scaled by 1e-5) is added to every feature.

    Args:
        X (pandas.DataFrame): Transcription factor gene expressions where rows
            are experimental conditions and columns are transcription factors
        y (pandas.Series): Target gene expression vector where rows are
            experimental conditions
        **bagging_parameters: Named parameters forwarded to the sklearn
            BaggingRegressor constructor (including the base regressor)

    Returns:
        pandas.Series: co-regulation scores indexed by feature position and
        sorted by index.

        The i-th element is the average score assigned by the base regressors
        to the regulatory relationship between the target gene and
        transcription factor i. A feature that no base estimator was trained
        on gets NaN. If the base regressor exposes neither
        ``feature_importances_`` nor ``coef_``, a message is printed and the
        integer 0 is returned instead.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> from sklearn.svm import SVR
        >>> np.random.seed(0)
        >>> svr = SVR(kernel="linear")
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index=["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> bagging_parameters = {"base_estimator":svr,
        ...                       "n_estimators":100,
        ...                       "max_samples":0.7}
        >>> scores = bagging_regressor(tfs,tg,**bagging_parameters)
        >>> len(scores)
        3
    """
    def get_score(estimator):
        # Per-feature scores of one fitted base estimator, or None when it
        # exposes neither importances nor linear coefficients.
        if hasattr(estimator, 'feature_importances_'):
            return np.asarray(estimator.feature_importances_)
        if hasattr(estimator, "coef_"):
            return np.abs(estimator.coef_).flatten()
        return None

    bc = _sklearn_BaggingRegressor(**bagging_parameters)
    bc.fit(X, y)
    # Label each estimator's scores with the feature indices it was trained
    # on, so averaging aligns features across estimators.
    coefs = []
    for estimator, feature_ids in zip(bc.estimators_, bc.estimators_features_):
        estimator_scores = get_score(estimator)
        if estimator_scores is not None:
            coefs.append(pd.Series(estimator_scores, index=feature_ids))
    if not coefs:
        print("Base regressor has not a feature_importance of coef_ attribute")
        return(0)
    # Newer scikit-learn exposes n_features_in_; older releases only n_features_.
    n_features = getattr(bc, "n_features_in_", None)
    if n_features is None:
        n_features = bc.n_features_
    scores = pd.Series(np.random.randn(n_features)*1e-5)
    scores = scores + pd.DataFrame(coefs).mean(axis=0)
    return scores.sort_index()