# Source code for grenadine.Inference.regression_predictors

"""
This module infers co-expression Gene Regulatory Networks from
gene expression data (RNA-seq or microarray). It implements several
inference algorithms based on regression, using `scikit-learn`_.

.. _scikit-learn:
    https://scikit-learn.org
"""
from sklearn.linear_model import BayesianRidge as _sklearn_BayesianRidge
from sklearn.svm import SVR as _sklearn_SVR
from sklearn.linear_model import Lasso as _sklearn_Lasso
from sklearn.linear_model import LassoLars as _sklearn_LassoLars
from sklearn.model_selection import train_test_split as _sklearn_train_test_split
from sklearn.linear_model import Lars as _sklearn_Lars
from sklearn.ensemble import RandomForestRegressor as _sklearn_RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor as _sklearn_ExtraTreesRegressor
from sklearn.linear_model import ElasticNetCV as _sklearn_ElasticNetCV
from sklearn.ensemble import GradientBoostingRegressor as _sklearn_GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor as _sklearn_AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor as _sklearn_BaggingRegressor
from pandas import DataFrame
import numpy as np
import pandas as pd
# RandomizedLasso may be missing from the installed scikit-learn. Keep the
# module importable without it and report why; only a failed import is
# tolerated here, any other exception propagates.
try:
    from sklearn.linear_model import RandomizedLasso as _sklearn_RandomizedLasso
except ImportError:
    import sklearn
    print("sklearn.linear_model.RandomizedLasso could not be loaded")
    print('RandomizedLasso is not available in sklearn '+sklearn.__version__)

# Module metadata: authorship, licensing, version and release status.
__author__ = "Sergio Peignier, Pauline Schmitt"
__copyright__ = "Copyright 2019, The GReNaDIne Project"
__license__ = "GPL"
__version__ = "0.0.1"
__maintainer__ = "Sergio Peignier"
__email__ = "sergio.peignier@insa-lyon.fr"
__status__ = "pre-alpha"

def GENIE3(X,y,**rf_parameters):
    """
    GENIE3 score predictor: rank regulators with a `scikit-learn`_
    RandomForestRegressor.

    The forest is fitted to predict ``y`` from ``X`` and its feature
    importances are returned as regulation scores.

    Args:
        X (pandas.DataFrame): Transcription factor gene expressions where rows
            are experimental conditions and columns are transcription factors
        y (pandas.Series): Target gene expression vector where rows are
            experimental conditions
        **rf_parameters: Named parameters forwarded to the sklearn
            RandomForestRegressor constructor

    Returns:
        numpy.array: co-regulation scores.

        The i-th element is the ``feature_importances_`` value of the fitted
        RandomForestRegressor for transcription factor i (i-th column of X).

    Examples:
        The printed values are those recorded in the original documentation;
        they may differ with the installed scikit-learn version.

        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index =["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = GENIE3(tfs,tg)
        >>> scores
        array([0.11983888, 0.28071399, 0.59944713])
    """
    forest = _sklearn_RandomForestRegressor(**rf_parameters).fit(X, y)
    return forest.feature_importances_

def XGENIE3(X,y,**rf_parameters):
    """
    XGENIE3 score predictor: rank regulators with a `scikit-learn`_
    ExtraTreesRegressor.

    The extra-trees ensemble is fitted to predict ``y`` from ``X`` and its
    feature importances are returned as regulation scores.

    Args:
        X (pandas.DataFrame): Transcription factor gene expressions where rows
            are experimental conditions and columns are transcription factors
        y (pandas.Series): Target gene expression vector where rows are
            experimental conditions
        **rf_parameters: Named parameters forwarded to the sklearn
            ExtraTreesRegressor constructor

    Returns:
        numpy.array: co-regulation scores.

        The i-th element is the ``feature_importances_`` value of the fitted
        ExtraTreesRegressor for transcription factor i (i-th column of X).

    Examples:
        The printed values are those recorded in the original documentation;
        they may differ with the installed scikit-learn version.

        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index =["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = XGENIE3(tfs,tg)
        >>> scores
        array([0.24905241, 0.43503283, 0.31591477])
    """
    extra_trees = _sklearn_ExtraTreesRegressor(**rf_parameters).fit(X, y)
    return extra_trees.feature_importances_

def GRNBoost2(X,y,**boost_parameters):
    """
    GRNBoost2 score predictor: rank regulators with a `scikit-learn`_
    GradientBoostingRegressor.

    The boosted ensemble is fitted to predict ``y`` from ``X`` and its
    feature importances are returned as regulation scores.

    Args:
        X (pandas.DataFrame): Transcription factor gene expressions where rows
            are experimental conditions and columns are transcription factors
        y (pandas.Series): Target gene expression vector where rows are
            experimental conditions
        **boost_parameters: Named parameters forwarded to the sklearn
            GradientBoostingRegressor constructor

    Returns:
        numpy.array: co-regulation scores.

        The i-th element is the ``feature_importances_`` value of the fitted
        GradientBoostingRegressor for transcription factor i (i-th column
        of X).

    Examples:
        The printed values are those recorded in the original documentation;
        they may differ with the installed scikit-learn version.

        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index =["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = GRNBoost2(tfs,tg)
        >>> scores
        array([0.83904506, 0.01783977, 0.14311517])
    """
    booster = _sklearn_GradientBoostingRegressor(**boost_parameters).fit(X, y)
    return booster.feature_importances_

def BayesianRidgeScore(X,y,**brr_parameters):
    """
    Score predictor using the coefficients of a `scikit-learn`_ BayesianRidge
    regression.

    The model is fitted to predict ``y`` from ``X``; the absolute value of
    each fitted coefficient is returned as the regulation score.

    Args:
        X (pandas.DataFrame): Transcription factor gene expressions where rows
            are experimental conditions and columns are transcription factors
        y (pandas.Series): Target gene expression vector where rows are
            experimental conditions
        **brr_parameters: Named parameters forwarded to the sklearn
            BayesianRidge constructor

    Returns:
        numpy.array: co-regulation scores.

        The i-th element is the absolute value of the BayesianRidge
        coefficient fitted for transcription factor i (i-th column of X).

    Examples:
        The printed values are those recorded in the original documentation;
        they may differ with the installed scikit-learn version.

        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index =["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = BayesianRidgeScore(tfs,tg)
        >>> scores
        array([1.32082000e-03, 6.24177371e-05, 3.32319918e-04])
    """
    ridge = _sklearn_BayesianRidge(**brr_parameters).fit(X, y)
    return np.abs(ridge.coef_)

def SVR_score(X,y,**svr_parameters):
    """
    Score predictor using the coefficients of a linear `scikit-learn`_ SVR
    (Support Vector Regression).

    The kernel is always forced to ``'linear'`` (any ``kernel`` passed by the
    caller is overridden) so that the fitted model exposes ``coef_``.

    Args:
        X (pandas.DataFrame): Transcription factor gene expressions where rows
            are experimental conditions and columns are transcription factors
        y (pandas.Series): Target gene expression vector where rows are
            experimental conditions
        **svr_parameters: Named parameters forwarded to the sklearn SVR
            constructor (``kernel`` is replaced by ``'linear'``)

    Returns:
        numpy.array: co-regulation scores.

        The i-th element is the absolute value of the SVR coefficient fitted
        for transcription factor i; the first row of the absolute ``coef_``
        matrix is returned.

    Examples:
        The printed values are the magnitudes recorded in the original
        documentation; they may differ with the installed scikit-learn
        version.

        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index =["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = SVR_score(tfs,tg)
        >>> scores
        array([0.38156814, 0.28128811, 1.0230867 ])
    """
    # coef_ is only used with a linear kernel, so the caller's choice is ignored
    svr_parameters.update(kernel='linear')
    linear_svr = _sklearn_SVR(**svr_parameters).fit(X, y)
    return np.abs(linear_svr.coef_)[0]

def Lasso_score(X,y,**l1_parameters):
    """
    Score predictor using the coefficients of a `scikit-learn`_ Lasso
    regression.

    The model is fitted to predict ``y`` from ``X``; the absolute value of
    each fitted coefficient is returned as the regulation score.

    Args:
        X (pandas.DataFrame): Transcription factor gene expressions where rows
            are experimental conditions and columns are transcription factors
        y (pandas.Series): Target gene expression vector where rows are
            experimental conditions
        **l1_parameters: Named parameters forwarded to the sklearn Lasso
            constructor

    Returns:
        numpy.array: co-regulation scores.

        The i-th element is the absolute value of the Lasso coefficient
        fitted for transcription factor i (i-th column of X).

    Examples:
        The printed values are those recorded in the original documentation;
        they may differ with the installed scikit-learn version.

        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index =["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = Lasso_score(tfs,tg, alpha=0.01)
        >>> scores
        array([0.13825495, 0.94939204, 0.19118214])
    """
    lasso = _sklearn_Lasso(**l1_parameters).fit(X, y)
    return np.abs(lasso.coef_)

def LassoLars_score(X,y,**l1_parameters):
    """
    Score predictor using the coefficients of a `scikit-learn`_ LassoLars
    regression.

    The model is fitted to predict ``y`` from ``X``; the absolute value of
    each fitted coefficient is returned as the regulation score.

    Args:
        X (pandas.DataFrame): Transcription factor gene expressions where rows
            are experimental conditions and columns are transcription factors
        y (pandas.Series): Target gene expression vector where rows are
            experimental conditions
        **l1_parameters: Named parameters forwarded to the sklearn LassoLars
            constructor

    Returns:
        numpy.array: co-regulation scores.

        The i-th element is the absolute value of the LassoLars coefficient
        fitted for transcription factor i (i-th column of X).

    Examples:
        The printed values are those recorded in the original documentation;
        they may differ with the installed scikit-learn version.

        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index =["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = LassoLars_score(tfs,tg, alpha=0.01)
        >>> scores
        array([0.12179406, 0.92205553, 0.15503451])
    """
    lasso_lars = _sklearn_LassoLars(**l1_parameters).fit(X, y)
    return np.abs(lasso_lars.coef_)

def stability_randomizedlasso(X,y,**rl_parameters):
    """
    Score predictor based on `scikit-learn`_ RandomizedLasso stability
    selection.

    RandomizedLasso is imported optionally by this module; when that import
    failed this function cannot run and reports it explicitly.

    Args:
        X (pandas.DataFrame): Transcription factor gene expressions where rows
            are experimental conditions and columns are transcription factors
        y (pandas.Series): Target gene expression vector where rows are
            experimental conditions
        **rl_parameters: Named parameters forwarded to the sklearn
            RandomizedLasso constructor

    Returns:
        numpy.array: co-regulation scores.

        The i-th element is the absolute value of the RandomizedLasso
        ``scores_`` entry for transcription factor i (i-th column of X).

    Raises:
        ImportError: if RandomizedLasso could not be imported from the
            installed scikit-learn.

    Examples:
        The printed values are those recorded in the original documentation;
        they may differ with the installed scikit-learn version.

        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index =["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = stability_randomizedlasso(tfs,tg)
        >>> scores
        array([0.11 , 0.17 , 0.085])
    """
    # The alias only exists when the optional module-level import succeeded;
    # turn the otherwise obscure NameError into an explicit, actionable error.
    try:
        randomized_lasso_cls = _sklearn_RandomizedLasso
    except NameError as err:
        raise ImportError(
            "RandomizedLasso is not available in the installed scikit-learn; "
            "stability_randomizedlasso cannot be used"
        ) from err
    regressor = randomized_lasso_cls(**rl_parameters)
    regressor.fit(X,y)
    scores = np.abs(regressor.scores_)
    return(scores)

def TIGRESS(X,
            y,
            nsplit=100,
            nstepsLARS=5,
            alpha=0.4,
            scoring="area"):
    """
    TIGRESS score predictor based on stability selection with LARS.

    The transcription factor expressions are repeatedly rescaled by random
    factors, the conditions are randomly split in two sub-samples, LARS is run
    on each sub-sample whose target is not constant, and the function records
    how often each transcription factor has a non-zero coefficient at each
    LARS step. ``X`` is not modified.

    Args:
        X (pandas.DataFrame): Transcription factor gene expressions where rows
            are experimental conditions and columns are transcription factors
        y (pandas.Series): Target gene expression vector where rows are
            experimental conditions
        nsplit (int): number of LARS fits to accumulate (each random split
            contributes up to two fits), the higher the better
        nstepsLARS (int): number of steps of LARS algorithm,
            i.e., number of non zero coefficients to keep (Lars parameter).
            When larger than the number of transcription factors p it is
            reduced to p-1
        alpha: Noise multiplier coefficient,
            Each transcription factor expression is multiplied by a
            random variable drawn uniformly from [alpha, 1]
        scoring (str): option used to score each possible link
            only "area" and "max" options are available

    Returns:
        numpy.array: co-regulation scores

        The i-th element is the stability score of transcription factor i:
        its selection frequency over the LARS fits (a value in [0, 1]),
        averaged over the LARS steps for "area" or taken as the maximum over
        the LARS steps for "max".

    Raises:
        ValueError: if ``scoring`` is not "area" or "max", if fewer than 3
            conditions are given or ``y`` is constant (no sub-sample could
            ever be fitted), or if the effective ``nstepsLARS`` is below 1.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index =["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = TIGRESS(tfs,tg)
        >>> scores.shape
        (3,)
    """
    # Validate cheap arguments first so errors show up before the long loop
    if scoring not in ("area", "max"):
        raise ValueError('scoring must be "area" or "max", got %r' % (scoring,))
    n,p = X.shape
    # Sub-samples with a constant (or single-valued) target are skipped in the
    # loop below; if none can ever qualify the loop would never terminate.
    if n < 3 or not (y.std() > 0):
        raise ValueError("TIGRESS needs at least 3 conditions and a "
                         "non-constant target expression vector")
    halfsize = int(n/2)
    if nstepsLARS > p:
        # NOTE(review): clamps to p-1 rather than p; kept as in the original
        nstepsLARS = p-1
    if nstepsLARS < 1:
        raise ValueError("nstepsLARS must be at least 1 (after being limited "
                         "by the number of transcription factors)")
    freq = np.zeros((p, nstepsLARS))
    n_fits = 0
    while n_fits < nsplit:
        # Randomly reweight each variable (TF expression) on a fresh copy, so
        # the caller's data is untouched and no rounding drift accumulates
        random_perturbation = np.random.uniform(low=alpha, high=1.0, size=p)
        X_perturbed = X * random_perturbation
        # Randomly split the sample in two sets
        X_1,X_2,y_1,y_2 = _sklearn_train_test_split(X_perturbed,
                                                    y,
                                                    test_size=halfsize,
                                                    shuffle=True)
        for X_i,y_i in ((X_1, y_1),(X_2, y_2)):
            if not (y_i.std() > 0):
                continue
            # run LARS on the sub-sample and collect the selected variables
            lars = _tigress_new_lars(nstepsLARS)
            lars.fit(X_i,y_i)
            # presence of the coefficients along the path
            path = lars.coef_path_
            if path.shape[1] < nstepsLARS+1:
                # LARS stopped early: repeat the last column to pad the path
                path_add = np.tile(path[:,-1],(nstepsLARS+1 - path.shape[1],1)).T
                path = np.hstack((path,path_add))
            freq += np.abs(np.sign(path[:,1:]))
            n_fits += 1
    # normalize by the number of LARS fits so frequencies lie in [0,1]
    if n_fits:
        freq /= n_fits
    if scoring == "area":
        score = np.cumsum(freq,axis=1)/np.arange(1,nstepsLARS+1,1)
    else:
        score = np.maximum.accumulate(freq,axis=1)
    return score[:,nstepsLARS - 1]


def _tigress_new_lars(n_nonzero_coefs):
    """Build a Lars model without normalization across scikit-learn versions."""
    try:
        return _sklearn_Lars(normalize=False, n_nonzero_coefs=n_nonzero_coefs)
    except TypeError:
        # scikit-learn releases that dropped ``normalize`` reject the keyword
        return _sklearn_Lars(n_nonzero_coefs=n_nonzero_coefs)

def AdaBoost_regressor(X, y, **adab_parameters):
    """
    AdaBoost score predictor: rank regulators with a `scikit-learn`_
    AdaBoostRegressor.

    The boosted ensemble is fitted to predict ``y`` from ``X`` and its
    feature importances are returned as regulation scores.

    Args:
        X (pandas.DataFrame): Transcription factor gene expressions where rows
            are experimental conditions and columns are transcription factors
        y (pandas.Series): Target gene expression vector where rows are
            experimental conditions
        **adab_parameters: Named parameters forwarded to the sklearn
            AdaBoostRegressor constructor

    Returns:
        numpy.array: co-regulation scores.

        The i-th element is the ``feature_importances_`` value of the fitted
        AdaBoostRegressor for transcription factor i (i-th column of X).

    Examples:
        The printed values are those recorded in the original documentation;
        they may differ with the installed scikit-learn version.

        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index =["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = AdaBoost_regressor(tfs,tg)
        >>> scores
        array([0.32978247, 0.3617295 , 0.28896647])
    """
    ada = _sklearn_AdaBoostRegressor(**adab_parameters).fit(X, y)
    return ada.feature_importances_

def Elastica(X, y, **elastica_parameters):
    """
    Elastic-net score predictor using the coefficients of a `scikit-learn`_
    ElasticNetCV regression.

    The model is fitted to predict ``y`` from ``X``; the absolute value of
    each fitted coefficient is returned as the regulation score.

    Args:
        X (pandas.DataFrame): Transcription factor gene expressions where rows
            are experimental conditions and columns are transcription factors
        y (pandas.Series): Target gene expression vector where rows are
            experimental conditions
        **elastica_parameters: Named parameters forwarded to the sklearn
            ElasticNetCV constructor

    Returns:
        numpy.array: co-regulation scores.

        The i-th element is the absolute value of the ElasticNetCV
        coefficient fitted for transcription factor i (i-th column of X).

    Examples:
        The printed values are those recorded in the original documentation;
        they may differ with the installed scikit-learn version.

        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index =["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = Elastica(tfs,tg)
        >>> scores
        array([0.05512459, 0.34453337, 0.        ])
    """
    elastic_net = _sklearn_ElasticNetCV(**elastica_parameters).fit(X, y)
    return np.abs(elastic_net.coef_)


def bagging_regressor(X, y, **bagging_parameters):
    """
    Apply the bagging technique to a regression algorithm, based on
    `scikit-learn`_ BaggingRegressor.

    Each fitted base estimator is scored through its ``feature_importances_``
    (when present) or the absolute value of its ``coef_``; the scores are
    then averaged per transcription factor over the estimators.

    Args:
        X (pandas.DataFrame): Transcription factor gene expressions where rows
            are experimental conditions and columns are transcription factors
        y (pandas.Series): Target gene expression vector where rows are
            experimental conditions
        **bagging_parameters: Named parameters forwarded to the sklearn
            BaggingRegressor constructor

    Returns:
        pandas.Series: co-regulation scores indexed by feature position.

        The i-th element is the average score assigned by the base
        regressors to transcription factor i, plus a random term of order
        1e-5 (NOTE(review): presumably there to break ties). A transcription
        factor that no estimator was trained on gets NaN.

        If no base estimator exposes ``feature_importances_`` or ``coef_``,
        a message is printed and the integer 0 is returned instead.

    Examples:
        NOTE(review): newer scikit-learn releases name the ``base_estimator``
        parameter ``estimator``; adapt the key to the installed version.

        >>> import pandas as pd
        >>> import numpy as np
        >>> from sklearn.svm import SVR
        >>> np.random.seed(0)
        >>> svr = SVR(kernel="linear")
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index =["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> bagging_parameters = {"base_estimator":svr,
        ...                       "n_estimators":100,
        ...                       "max_samples":0.7}
        >>> scores = bagging_regressor(tfs,tg,**bagging_parameters)
        >>> scores.shape
        (3,)
    """
    def get_score(estimator):
        # Importances take precedence; otherwise fall back to coefficients
        if hasattr(estimator, 'feature_importances_'):
            return np.asarray(estimator.feature_importances_).flatten()
        if hasattr(estimator, "coef_"):
            return np.abs(estimator.coef_).flatten()
        return None

    bc = _sklearn_BaggingRegressor(**bagging_parameters)
    bc.fit(X,y)
    # Label every estimator's scores with the feature indices it was trained
    # on, so that averaging aligns scores of the same transcription factor
    coefs = [pd.Series(estimator_scores, index=features)
             for estimator_scores, features
             in zip(map(get_score, bc.estimators_), bc.estimators_features_)
             if estimator_scores is not None]
    if not coefs:
        print("Base regressor has not a feature_importance of coef_ attribute")
        return(0)
    # n_features_in_ replaces n_features_ in newer scikit-learn releases
    n_features = getattr(bc, "n_features_in_", None)
    if n_features is None:
        n_features = bc.n_features_
    scores = pd.Series(np.random.randn(n_features)*1e-5)
    scores = scores + pd.DataFrame(coefs).mean(axis=0)
    scores = scores.sort_index()
    return(scores)