Source code for grenadine.Inference.inference

# -*- coding: utf-8 -*-
"""
This module allows to infer co-expression  Gene Regulatory Networks using
gene expression data (RNAseq or Microarray).
"""
from tqdm.autonotebook import tqdm
from grenadine.Preprocessing.standard_preprocessing import z_score
import pandas as pd
import numpy as np

__author__ = "Sergio Peignier"
__copyright__ = "Copyright 2019, The GReNaDIne Project"
__credits__ = ["Pauline Schmitt"]
__license__ = "GPL"
__version__ = "0.0.1"
__maintainer__ = "Sergio Peignier"
__email__ = "sergio.peignier@insa-lyon.fr"
__status__ = "pre-alpha"


[docs]def score_links(gene_expression_matrix,
                score_predictor,
                tf_list = None,
                tg_list = None,
                normalize = False,
                discr_method=None,
                progress_bar=False,
                **predictor_parameters):
    """
    Scores transcription factors-target gene co-expressions using a predictor.

    Args:
        gene_expression_matrix (pandas.DataFrame):  gene expression matrix where
            rows are genes and  columns ares samples (conditions).
            The value at row i and column j represents the expression of gene i
            in condition j.
        score_predictor (function): function that receives a pandas.DataFrame X
            containing the transcriptor factor expressions and a pandas.Series
            y containing the expression of a target gene, and scores the
            co-expression level between each transcription factor and the target
            gene.
        tf_list (list or numpy.array): list of transcription factors ids.
        tg_list (list or numpy.array): list of target genes ids.
        normalize (boolean): If True the gene expression of genes is z-scored
        discr_method : discretization method to use, if discretization of target
            gene expression is desired
        progress_bar: bool, if true include progress bar
        **predictor_parameters: Named parameters for the score predictor

    Returns:
        pandas.DataFrame: co-regulation score matrix.

        Rows are target genes and columns are transcription factors.
        The value at row i and column j represents the score assigned by the
        score_predictor to the regulatory relationship between target gene i
        and transcription factor j.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> data = pd.DataFrame(np.random.randn(5, 5),
                            index=["gene1", "gene2", "gene3", "gene4", "gene5"],
                            columns=["c1", "c2", "c3", "c4", "c5"])
        >>> tf_list = ["gene1", "gene2", "gene5"]

        >>> # Example with a regression method
        >>> from grenadine.Inference.regression_predictors import GENIE3
        >>> scores1 = score_links(gene_expression_matrix=data,
                                  score_predictor=GENIE3,
                                  tf_list=tf_list)
        >>> scores1
                  gene2     gene5     gene1
        gene1  0.484081  0.515919       NaN
        gene2       NaN  0.653471  0.346529
        gene3  0.245136  0.301229  0.453634
        gene4  0.309982  0.306964  0.383054
        gene5  0.529839       NaN  0.470161

        >>> # Example with a classification method
        >>> from grenadine.Inference.classification_predictors import RF_classifier_score
        >>> from grenadine.Preprocessing.discretization import discretize_genexp
        >>> discr_method = lambda X: discretize_genexp (X, "efd", 5, axis=1)
        >>> scores2 = score_links(gene_expression_matrix=data,
                                        score_predictor=RF_classifier_score,
                                        tf_list=tf_list,
                                        discr_method=discr_method)
        >>> scores2
                  gene2     gene5     gene1
        gene1  0.512659  0.487341       NaN
        gene2       NaN  0.463122  0.536878
        gene3  0.368175  0.317341  0.314484
        gene4  0.302738  0.346799  0.350463
        gene5  0.524815       NaN  0.475185

    """
    gene_expression_matrix = gene_expression_matrix.T
    # Set the list of TFs and TGs if necessary
    if tg_list is None:
        tg_list = gene_expression_matrix.columns
    if tf_list is None:
        tf_list = gene_expression_matrix.columns
    tf_list_present = set(gene_expression_matrix.columns).intersection(tf_list)
    tg_list_present = list(set(gene_expression_matrix.columns).intersection(tg_list))
    if not len(tf_list_present):
        Exception('None of the tfs in '+str(tf_list)+\
        " is present in the gene_expression_matrix genes list"+\
        str(gene_expression_matrix.columns))
    if not len(tg_list_present):
        Exception('None of the tgs in '+str(tg_list)+\
        " is present in the gene_expression_matrix genes list"+\
        str(gene_expression_matrix.columns))
    tg_list_present.sort()
    #  Normalize expression data for each gene
    if normalize:
        gene_expression_matrix = z_score(gene_expression_matrix,axis=0)
    # compute tf scores for each gene
    scores_tf_per_gene = []
    for gene in tqdm(tg_list_present,disable=not progress_bar):
        # Exclude the current gene from the tfs list
        tfs2test = list(tf_list_present.difference(set([gene])))
        tfs2test.sort()
        X = gene_expression_matrix[tfs2test].values
        y = gene_expression_matrix[gene].values
        if discr_method is not None :
            y = discr_method(y)
        if len(np.unique(y)) <= 1:
            # handle the case when only one class was detected in y
            score = np.zeros(len(tfs2test))
        else:
            score = score_predictor(X, y, **predictor_parameters)
        # Get the features importance (score for each TF -> Gene)
        scores = pd.Series(score)
        scores.index = tfs2test
        scores_tf_per_gene.append(scores)
    df_results = pd.DataFrame(scores_tf_per_gene, index=tg_list_present)
    return(df_results)

[docs]def ensemble_score_links(score_links_matrices, score_links_weights=None):
    """
    Makes an ensemble co-regulation score matrix from a list of co-regulation
    score matrices obtained using different methods, and possibly a list of
    weights for each method

    Args:
        score_links_matrices (list): list of co-regulation score matrices (pandas
            DataFrames)
        score_links_weights (list): list of weights for each method (the higher
            the more confidence on the method). If no value is provided each
            method as a unitary weight

    Returns:
        pandas.DataFrame: co-regulation score matrix.

        Rows are target genes and columns are transcription factors.
        The value at row i and column j represents the score assigned by the
        score_predictor to the regulatory relationship between target gene i
        and transcription factor j.
    """
    if score_links_weights is None:
        score_links_weights = np.asarray([1 for s in score_links_matrices])

    for i,score in enumerate(score_links_matrices):
        score_links_matrices[i] -= score_links_matrices[i].values.flatten().mean()
        score_links_matrices[i] /= score_links_matrices[i].values.flatten().std()
        score_links_matrices[i] *= score_links_weights[i]
        if i == 0:
            score_links = score_links_matrices[i]
        else:
            score_links += score_links_matrices[i]
    return score_links/score_links_weights.sum()


[docs]def clean_nan_inf_scores(scores):
    """
    Replaces nan and -inf scores by the (minimum_score - 1), and inf scores by
    (maximum_score + 1)

    Args:
        scores (pandas.DataFrame): co-regulation score matrix.
        Rows are target genes and columns are transcription factors.
        The value at row i and column j represents the score assigned by the
        score_predictor to the regulatory relationship between target gene i
        and transcription factor j.

    Returns:
        pandas.DataFrame: co-regulation score matrix.

        Rows are target genes and columns are transcription factors.
        The value at row i and column j represents the score assigned by the
        score_predictor to the regulatory relationship between target gene i
        and transcription factor j.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> data = pd.DataFrame(np.random.randn(5, 5),
                            index=["gene1", "gene2", "gene3", "gene4", "gene5"],
                            columns=["c1", "c2", "c3", "c4", "c5"])
        >>> tf_list = ["gene1", "gene2", "gene5"]

        >>> # Example with a regression method
        >>> from grenadine.Inference.regression_predictors import GENIE3
        >>> scores1 = score_links(gene_expression_matrix=data,
                                  score_predictor=GENIE3,
                                  tf_list=tf_list)
        >>> scores1
                  gene2     gene5     gene1
        gene1  0.484081  0.515919       NaN
        gene2       NaN  0.653471  0.346529
        gene3  0.245136  0.301229  0.453634
        gene4  0.309982  0.306964  0.383054
        gene5  0.529839       NaN  0.470161
        >>> clean_nan_inf_scores(scores1)
                  gene2     gene5     gene1
        gene1  0.484081  0.515919  0.245126
        gene2  0.245126  0.653471  0.346529
        gene3  0.245136  0.301229  0.453634
        gene4  0.309982  0.306964  0.383054
        gene5  0.529839  0.245126  0.470161

    """
    # replacing NaN scores with minimum score - epsilon
    epsilon = 1e-10
    nan_scores = np.isnan(scores)
    min_score =  scores[np.logical_not(nan_scores)].min().min()
    scores[nan_scores] = min_score-epsilon
    # replacing infinite scores with maximum score + epsilon
    finite_scores = np.isfinite(scores)
    max_score =  scores[finite_scores].max().max()
    positive_inf = np.logical_and(np.logical_not(finite_scores),finite_scores>0)
    negative_inf = np.logical_and(np.logical_not(finite_scores),finite_scores<0)
    scores[positive_inf] = max_score+epsilon
    scores[positive_inf] = min_score-epsilon
    return scores


[docs]def rank_GRN(coexpression_scores_matrix, take_abs_score=False, clean_scores=True, pyscenic_format=False):
    """
    Ranks the co-regulation scores between transcription factors and target genes.

    Args:
        coexpression_scores_matrix (pandas.DataFrame):co-expression score matrix
            where rows are target genes and columns are transcription factors.
            The value at row i and column j represents the score assigned by a
            score_predictor to the regulatory relationship between target gene i
            and transcription factor j.
        take_abs_score (bool): take the absolute value of the score instead of
            taking scores themselves
    Returns:
        pandas.DataFrame: ranking matrix.

        A ranking matrix contains a row for each possible regulatory link, it
        also contains 4 columns, namely the rank, the score, the transcription
        factor id, and the target gene id.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> data = pd.DataFrame(np.random.randn(3, 2),
                            index=["gene1", "gene2", "gene3"],
                            columns=["gene1", "gene3"])
        >>> # scores associated to self loops are set to nan
        >>> data.iloc[0,0]=np.nan
        >>> data.iloc[2,1]=np.nan
        >>> ranking_matrix = rank_GRN(data)
        >>> ranking_matrix
                     rank     score     TF     TG
        gene3_gene2   1.0  2.240893  gene3  gene2
        gene1_gene3   2.0  1.867558  gene1  gene3
        gene1_gene2   3.0  0.978738  gene1  gene2
        gene3_gene1   4.0  0.400157  gene3  gene1

    """
    epsilon = 1e-10
    rand_values = np.random.rand(*coexpression_scores_matrix.shape)*epsilon
    coexpression_scores_matrix += rand_values
    if clean_scores:
        coexpression_scores_matrix = clean_nan_inf_scores(coexpression_scores_matrix)
    if take_abs_score:
        coexpression_scores_matrix = np.abs(coexpression_scores_matrix)
    coexpression_unstack = coexpression_scores_matrix.unstack(level=0)
    ranking = pd.DataFrame()
    ranking["rank"] = coexpression_unstack.rank(method="dense",
                                                ascending=False,
                                                na_option="bottom")
    ranking["score"] = coexpression_unstack
    ranking["TF"] = list(ranking.index.get_level_values(level=0))
    ranking["TG"] = list(ranking.index.get_level_values(level=1))
    ranking = ranking.sort_values("score",ascending=False)
    ranking.index = ranking["TF"].astype(str)+"_"+ranking["TG"].astype(str)
    ranking = ranking.dropna()
    if pyscenic_format:
        ranking.rename(columns={'score':'importance',
                'TG':'target'},
                inplace=True)
        ranking = ranking.reset_index()
        ranking = ranking.drop(["rank","index"],axis=1)
        cols = ranking.columns.tolist()
        cols = ["TF","target","importance"]
        ranking = ranking[cols]
    return(ranking)

[docs]def join_rankings_scores_df(**rank_scores):
    """
    Join rankings and scores data frames generated by different methods.

    Args:
        **rank_scores: Named parameters, where arguments names should be the
            methods names and arguments values correspond to pandas.DataFrame
            output of rank_GRN

    Returns:
        (pandas.DataFrame, pandas.DataFrame): joined ranks and joined scores
            where rows represent possible regulatory links and columns represent
            each method.
            Values at row i and column j represent resp. the rank or the score
            of edge i computed by method j.

    Examples:
        >>> import pandas as pd
        >>> method1_rank = pd.DataFrame([[1,1.3, "gene1", "gene2"],
                                         [2,1.1, "gene1", "gene3"],
                                         [3,0.9, "gene3", "gene2"]],
                                         columns=['rank', 'score', 'TF', 'TG'])
        >>> method1_rank.index = method1_rank['TF']+'_'+method1_rank['TG']
        >>> method2_rank = pd.DataFrame([[1,1.4, "gene1", "gene3"],
                                         [2,1.0, "gene1", "gene2"],
                                         [3,0.9, "gene3", "gene2"]],
                                         columns=['rank', 'score', 'TF', 'TG'])
        >>> method2_rank.index = method2_rank['TF']+'_'+method2_rank['TG']
        >>> ranks, scores = join_rankings_scores_df(method1=method1_rank, method2=method2_rank)
        >>> ranks
                     method1  method2
        gene1_gene2        1        2
        gene1_gene3        2        1
        gene3_gene2        3        3
        >>> scores
                     method1  method2
        gene1_gene2      1.3      1.0
        gene1_gene3      1.1      1.4
        gene3_gene2      0.9      0.9

    """
    ranks_df = {method:rank_scores[method]["rank"] for method in rank_scores}
    ranks_df = pd.DataFrame(ranks_df)
    scores_df = {method:rank_scores[method]["score"] for method in rank_scores}
    scores_df = pd.DataFrame(scores_df)
    return(ranks_df,scores_df)