Source code for grenadine.Inference.classification_predictors

"""
This module infers Gene Regulatory Networks from
gene expression data (RNA-seq or microarray). It implements several
inference algorithms based on classification, using `scikit-learn`_.

.. _scikit-learn:
    https://scikit-learn.org
"""
__author__ = "Pauline Schmitt, Sergio Peignier"
__copyright__ = "Copyright 2019, The GReNaDIne Project"
__license__ = "GPL"
__version__ = "0.0.1"
__maintainer__ = "Sergio Peignier"
__email__ = "sergio.peignier@insa-lyon.fr"
__status__ = "pre-alpha"

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier as _sklearn_RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier as _sklearn_ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier as _sklearn_AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier as _sklearn_GradientBoostingClassifier
from sklearn.svm import LinearSVC as _sklearn_SVC
from sklearn.naive_bayes import MultinomialNB as _sklearn_MultinomialNB
from sklearn.naive_bayes import ComplementNB as _sklearn_ComplementNB
from sklearn.ensemble import BaggingClassifier as _sklearn_BaggingClassifier
from collections import Counter

def RF_classifier_score(X, y, **rf_parameters):
    """
    Score the candidate regulators of one target gene with a random forest.

    A `scikit-learn`_ RandomForestClassifier is fitted to predict ``y`` from
    ``X`` and its feature importances are returned as the scores.

    Args:
        X (pandas.DataFrame): Transcription factor gene expressions
            (discretized or not) where rows are experimental conditions and
            columns are transcription factors
        y (pandas.Series): Target gene expression vector (discretized) where
            rows are experimental conditions
        **rf_parameters: Named parameters forwarded to the sklearn
            RandomForestClassifier constructor

    Returns:
        numpy.array: co-regulation scores. The i-th element of the score array
        represents the score assigned by the RandomForestClassifier to the
        regulatory relationship between the target gene and transcription
        factor i.

    Note:
        The output shown below is the value recorded with the original
        documentation; it may differ between scikit-learn versions.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index=["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randint(0,3,size=5),
        ...                index=["c1","c2","c3","c4","c5"])
        >>> scores = RF_classifier_score(tfs,tg)
        >>> scores
        array([0.21071429, 0.4       , 0.28928571])
    """
    forest = _sklearn_RandomForestClassifier(**rf_parameters)
    # fit() returns the fitted estimator, so the importances can be read
    # straight off the call.
    return forest.fit(X, y).feature_importances_
def XRF_classifier_score(X, y, **xrf_parameters):
    """
    Score the candidate regulators of one target gene with extremely
    randomized trees.

    A `scikit-learn`_ ExtraTreesClassifier is fitted to predict ``y`` from
    ``X`` and its feature importances are returned as the scores.

    Args:
        X (pandas.DataFrame): Transcription factor gene expressions
            (discretized or not) where rows are experimental conditions and
            columns are transcription factors
        y (pandas.Series): Target gene expression vector (discretized) where
            rows are experimental conditions
        **xrf_parameters: Named parameters forwarded to the sklearn
            ExtraTreesClassifier constructor

    Returns:
        numpy.array: co-regulation scores. The i-th element of the score array
        represents the score assigned by the ExtraTreesClassifier to the
        regulatory relationship between the target gene and transcription
        factor i.

    Note:
        The output shown below is the value recorded with the original
        documentation; it may differ between scikit-learn versions.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index=["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randint(0,3,size=5),
        ...                index=["c1","c2","c3","c4","c5"])
        >>> scores = XRF_classifier_score(tfs,tg)
        >>> scores
        array([0.31354167, 0.35520833, 0.33125   ])
    """
    extra_trees = _sklearn_ExtraTreesClassifier(**xrf_parameters)
    extra_trees.fit(X, y)
    return extra_trees.feature_importances_
def AdaBoost_classifier_score(X, y, **adab_parameters):
    """
    Score the candidate regulators of one target gene with AdaBoost.

    A `scikit-learn`_ AdaBoostClassifier is fitted to predict ``y`` from
    ``X`` and its feature importances are returned as the scores.

    Args:
        X (pandas.DataFrame): Transcription factor gene expressions
            (discretized or not) where rows are experimental conditions and
            columns are transcription factors
        y (pandas.Series): Target gene expression vector (discretized) where
            rows are experimental conditions
        **adab_parameters: Named parameters forwarded to the sklearn
            AdaBoostClassifier constructor

    Returns:
        numpy.array: co-regulation scores. The i-th element of the score array
        represents the score assigned by the AdaBoostClassifier to the
        regulatory relationship between the target gene and transcription
        factor i.

    Note:
        The output shown below is the value recorded with the original
        documentation; it may differ between scikit-learn versions.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index=["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randint(0,3,size=5),
        ...                index=["c1","c2","c3","c4","c5"])
        >>> scores = AdaBoost_classifier_score(tfs,tg)
        >>> scores
        array([0.24, 0.44, 0.32])
    """
    booster = _sklearn_AdaBoostClassifier(**adab_parameters)
    fitted = booster.fit(X, y)
    return fitted.feature_importances_
def GB_classifier_score(X, y, **gb_parameters):
    """
    Score the candidate regulators of one target gene with gradient boosting.

    A `scikit-learn`_ GradientBoostingClassifier is fitted to predict ``y``
    from ``X`` and its feature importances are returned as the scores.

    Args:
        X (pandas.DataFrame): Transcription factor gene expressions
            (discretized or not) where rows are experimental conditions and
            columns are transcription factors
        y (pandas.Series): Target gene expression vector (discretized) where
            rows are experimental conditions
        **gb_parameters: Named parameters forwarded to the sklearn
            GradientBoostingClassifier constructor

    Returns:
        numpy.array: co-regulation scores. The i-th element of the score array
        represents the score assigned by the GradientBoostingClassifier to the
        regulatory relationship between the target gene and transcription
        factor i.

    Note:
        The output shown below is the value recorded with the original
        documentation; it may differ between scikit-learn versions.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index=["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randint(0,3,size=5),
        ...                index=["c1","c2","c3","c4","c5"])
        >>> scores = GB_classifier_score(tfs,tg)
        >>> scores
        array([0.33959125, 0.21147015, 0.4489386 ])
    """
    gradient_booster = _sklearn_GradientBoostingClassifier(**gb_parameters)
    gradient_booster.fit(X, y)
    importances = gradient_booster.feature_importances_
    return importances
def SVM_classifier_score(X, y, **svm_parameters):
    """
    Score the candidate regulators of one target gene with a linear SVM.

    A `scikit-learn`_ LinearSVC (imported in this module under the alias
    ``_sklearn_SVC``) is fitted to predict ``y`` from ``X``. The score of a
    transcription factor is the absolute value of its weight in ``coef_``,
    averaged over the rows of ``coef_`` when there is more than one.

    Args:
        X (pandas.DataFrame): Transcription factor gene expressions
            (discretized or not) where rows are experimental conditions and
            columns are transcription factors
        y (pandas.Series): Target gene expression vector (discretized) where
            rows are experimental conditions
        **svm_parameters: Named parameters forwarded to the sklearn LinearSVC
            constructor. ``multi_class`` and ``dual`` are always overwritten
            with ``'ovr'`` and ``False`` respectively.

    Returns:
        numpy.array: co-regulation scores. The i-th element of the score array
        represents the score assigned by the SVM to the regulatory
        relationship between the target gene and transcription factor i.

    Note:
        The output shown below is the value recorded with the original
        documentation; it may differ between scikit-learn versions.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index=["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randint(0,3,size=5),
        ...                index=["c1","c2","c3","c4","c5"])
        >>> scores = SVM_classifier_score(tfs,tg)
        >>> scores
        array([0.58413783, 0.5448345 , 0.31764191])
    """
    # These two settings are imposed regardless of what the caller passed.
    svm_parameters.update({"multi_class": 'ovr', "dual": False})
    svm = _sklearn_SVC(**svm_parameters)
    svm.fit(X, y)
    # coef_ is 2-D: one row of TF weights per fitted decision function.
    magnitudes = np.abs(svm.coef_)
    if magnitudes.shape[0] > 1:
        # Several rows: average the weight magnitudes column-wise.
        return magnitudes.mean(axis=0)
    # Single row: its magnitudes are the scores.
    return magnitudes[0, :]
def MultinomialNB_classifier_score(X, y, **nb_parameters):
    """
    Score the candidate regulators of one target gene with a multinomial
    naive Bayes classifier.

    A `scikit-learn`_ MultinomialNB is fitted to predict ``y`` from ``X``
    (shifted so that its smallest value is 0 when ``X`` contains negative
    values). Scores are derived from the absolute values of the fitted
    ``feature_log_prob_`` matrix:

    * more than two classes: the column-wise mean over all classes;
    * one or two classes: the row of the first class only (behaviour kept
      from the original implementation).

    Args:
        X (pandas.DataFrame): Transcription factor gene expressions
            (discretized or not) where rows are experimental conditions and
            columns are transcription factors
        y (pandas.Series): Target gene expression vector (discretized) where
            rows are experimental conditions
        **nb_parameters: Named parameters forwarded to the sklearn
            MultinomialNB constructor

    Returns:
        numpy.array: co-regulation scores. The i-th element of the score array
        represents the score assigned by the MultinomialNB to the regulatory
        relationship between the target gene and transcription factor i.
        NaN entries are replaced by 0.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index=["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randint(0,3,size=5),
        ...                index=["c1","c2","c3","c4","c5"])
        >>> scores = MultinomialNB_classifier_score(tfs,tg)
        >>> scores.shape
        (3,)
    """
    # MultinomialNB does not accept negative feature values, so translate the
    # whole matrix by the magnitude of its global minimum when needed.
    x_min = X.min().min()
    x_shift = -x_min if x_min < 0 else 0
    classifier = _sklearn_MultinomialNB(**nb_parameters)
    classifier.fit(X + x_shift, y)
    log_prob = classifier.feature_log_prob_  # shape (nb_classes, nb_TFs)
    # The original test was `classifier.coef_.shape[0] > 1`, but `coef_` is no
    # longer available on naive Bayes estimators in current scikit-learn.
    # NOTE(review): legacy `coef_` had a single row for one- or two-class
    # targets, so testing for more than two classes reproduces the old branch
    # selection -- confirm against the scikit-learn version in use.
    if len(classifier.classes_) > 2:
        coef = pd.DataFrame(log_prob, index=classifier.classes_)
        # Chain abs -> fillna -> mean; the original applied fillna to `coef`
        # and thereby threw the absolute values away.
        scores = np.abs(coef).fillna(0).mean(axis=0)
    else:
        scores = np.nan_to_num(np.abs(log_prob[0, :]))
    return np.array(scores)
def ComplementNB_classifier_score(X, y, **nb_parameters):
    """
    Score the candidate regulators of one target gene with a complement
    naive Bayes classifier.

    A `scikit-learn`_ ComplementNB is fitted to predict ``y`` from ``X``
    (shifted so that its smallest value is 0 when ``X`` contains negative
    values). Scores are derived from the absolute values of the fitted
    ``feature_log_prob_`` matrix:

    * more than two classes: the column-wise mean over all classes;
    * one or two classes: the row of the first class only (behaviour kept
      from the original implementation).

    Args:
        X (pandas.DataFrame): Transcription factor gene expressions
            (discretized or not) where rows are experimental conditions and
            columns are transcription factors
        y (pandas.Series): Target gene expression vector (discretized) where
            rows are experimental conditions
        **nb_parameters: Named parameters forwarded to the sklearn
            ComplementNB constructor

    Returns:
        numpy.array: co-regulation scores. The i-th element of the score array
        represents the score assigned by the ComplementNB to the regulatory
        relationship between the target gene and transcription factor i.
        NaN entries are replaced by 0.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
        ...                    index=["c1","c2","c3","c4","c5"],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randint(0,3,size=5),
        ...                index=["c1","c2","c3","c4","c5"])
        >>> scores = ComplementNB_classifier_score(tfs,tg)
        >>> scores.shape
        (3,)
    """
    # ComplementNB does not accept negative feature values, so translate the
    # whole matrix by the magnitude of its global minimum when needed.
    x_min = X.min().min()
    x_shift = -x_min if x_min < 0 else 0
    classifier = _sklearn_ComplementNB(**nb_parameters)
    classifier.fit(X + x_shift, y)
    weights = classifier.feature_log_prob_  # shape (nb_classes, nb_TFs)
    # The original test was `classifier.coef_.shape[0] > 1`, but `coef_` is no
    # longer available on naive Bayes estimators in current scikit-learn.
    # NOTE(review): legacy `coef_` had a single row for one- or two-class
    # targets, so testing for more than two classes reproduces the old branch
    # selection -- confirm against the scikit-learn version in use.
    if len(classifier.classes_) > 2:
        coef = pd.DataFrame(weights, index=classifier.classes_)
        # Chain abs -> fillna -> mean; the original applied fillna to `coef`
        # and thereby threw the absolute values away.
        scores = np.abs(coef).fillna(0).mean(axis=0)
    else:
        scores = np.nan_to_num(np.abs(weights[0, :]))
    return np.array(scores)
def bagging_classifier_score(X, y, **bagging_parameters):
    """
    Apply the bagging technique to a classification algorithm, based on
    `scikit-learn`_ BaggingClassifier.

    A BaggingClassifier is fitted on ``(X, y)``; a per-feature score is then
    extracted from every fitted base estimator and the scores are averaged
    over the estimators. For one base estimator the score is:

    * its ``feature_importances_`` when it has that attribute;
    * otherwise the absolute values of ``coef_`` (averaged over rows when
      ``coef_`` has more than one row) when it has that attribute;
    * otherwise the estimator is ignored.

    Args:
        X (pandas.DataFrame): Transcription factor gene expressions where rows
            are experimental conditions and columns are transcription factors
        y (pandas.Series): Target gene expression vector where rows are
            experimental conditions
        **bagging_parameters: Named parameters forwarded to the sklearn
            BaggingClassifier constructor

    Returns:
        pandas.Series: co-regulation scores. The i-th element represents the
        average score assigned by the base classifiers to the regulatory
        relationship between the target gene and transcription factor i.
        If no base estimator exposes ``feature_importances_`` or ``coef_``,
        a message is printed and the integer ``0`` is returned instead.

    Examples:
        The base estimator keyword is ``base_estimator`` here, as in the
        original example; recent scikit-learn releases name it ``estimator``.

        >>> import pandas as pd
        >>> import numpy as np
        >>> from sklearn.svm import SVC
        >>> np.random.seed(0)
        >>> svc = SVC(kernel="linear",decision_function_shape='ovr')
        >>> nb_conditions = 10
        >>> tfs = pd.DataFrame(np.random.randn(nb_conditions,3),
        ...                    index=["c"+str(i) for i in range(nb_conditions)],
        ...                    columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randint(0,2,size=nb_conditions),
        ...                index=["c"+str(i) for i in range(nb_conditions)])
        >>> bagging_parameters = {"base_estimator":svc,
        ...                       "n_estimators":5,
        ...                       "max_samples":0.9}
        >>> scores = bagging_classifier_score(tfs,tg,**bagging_parameters)
        >>> len(scores)
        3
    """
    def get_score(classifier):
        """Return the per-feature scores of one fitted estimator, or None."""
        if hasattr(classifier, 'feature_importances_'):
            # The original referenced an undefined name `feature_importances`
            # here, which raised NameError for every tree-based estimator.
            return np.asarray(classifier.feature_importances_)
        if hasattr(classifier, "coef_"):
            magnitudes = np.abs(classifier.coef_)
            if magnitudes.shape[0] > 1:
                return magnitudes.mean(axis=0)
            return magnitudes[0, :]
        return None

    bc = _sklearn_BaggingClassifier(**bagging_parameters)
    bc.fit(X, y)
    per_estimator = [get_score(e) for e in bc.estimators_]
    per_estimator = [s for s in per_estimator if s is not None]
    if not per_estimator:
        print("Base classifier has neither a feature_importances_ "
              "nor a coef_ attribute")
        return 0
    # One row per base estimator, one column per feature.
    return pd.DataFrame(per_estimator).mean(axis=0)