Source code for grenadine.Preprocessing.standard_preprocessing

# -*- coding: utf-8 -*-
"""
This module allows to pre-process gene expression data.
"""
import numpy as np
import pandas as pd
try:
    import ot

except:
    print("Warning: could not import POT. columns_matrix_OT_norm() function cannot be called.")



__author__ = "Sergio Peignier"
__copyright__ = "Copyright 2019, The GReNaDIne Project"
__license__ = "GPL"
__version__ = "0.0.1"
__maintainer__ = "Sergio Peignier"
__email__ = "sergio.peignier@insa-lyon.fr"
__status__ = "pre-alpha"

[docs]def z_score(A, axis=0):
    """
    Compute the z-score along the specified axis.

    Args:
        A (pandas.DataFrame or numpy.array): matrix
        axis (int): 0 for columns and 1 for rows

    Returns:
        pandas.DataFrame or numpy.array: Normalized matrix

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> data = pd.DataFrame(np.random.randn(5, 5),
                            index=["c1", "c2", "c3", "c4", "c5"],
                            columns=["gene1", "gene2", "gene3", "gene4", "gene5"])
        >>> norm_data = z_score(data)
        >>> norm_data
               gene1     gene2     gene3     gene4     gene5
        c1  1.254757 -1.222682  0.914682  1.672581  0.828015
        c2 -0.446591 -0.083589 -1.038607 -0.418644 -0.331945
        c3  0.249333  0.960749  0.538403 -0.218012 -0.305461
        c4  0.367024  1.043200 -1.131598 -0.047267 -1.338834
        c5 -1.424523 -0.697678  0.717120 -0.988659  1.148225

    """
    epsilon = 1e-5
    if axis==0:
        A = (A - A.mean(axis=0)) / (A.std(axis=0) + epsilon)
    if axis==1:
        A = A.T
        A = (A - A.mean(axis=0)) / (A.std(axis=0) + epsilon)
        A = A.T
    return(A)

[docs]def mean_std_polishing(A, nb_iterations=5):
    """
    Iterative z-score on rows and columns.

    Args:
        A (pandas.DataFrame or numpy.array): matrix
        nb_iterations (int): number of polishing iterations

    Returns:
        pandas.DataFrame or numpy.array: Polished matrix

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> data = pd.DataFrame(np.random.randn(5, 5),
                            index=["c1", "c2", "c3", "c4", "c5"],
                            columns=["gene1", "gene2", "gene3", "gene4", "gene5"])
        >>> norm_data = mean_std_polishing(data)
        >>> norm_data
               gene1     gene2     gene3     gene4     gene5
        c1  0.336095 -1.618781  0.187436  1.109617 -0.014367
        c2 -0.321684  0.586608 -1.606905  0.484159  0.857821
        c3  0.139260  0.860934  0.976541 -1.395814 -0.580921
        c4  1.243263  0.421752 -0.585940  0.282319 -1.361394
        c5 -1.363323 -0.161066  0.826375 -0.421998  1.120013

    """
    epsilon = 1e-10
    for i in range(nb_iterations):
        A = A-A.mean(axis=0)
        # std polish the column
        A = A/(A.std(axis=0)+epsilon)
        # mean polish the row
        A = (A.T-A.T.mean(axis=0)).T
        # std polish the row
        A = (A.T/(A.T.std(axis=0) + epsilon)).T
    return(A)

[docs]def cat_gene_expression_dfs(gene_expression_dfs):
    """
    Concatenate different gene expression datasets, based on gene id (rows).

    Args:
        gene_expression_dfs (list of pandas.DataFrame): Expression datasets list

    Returns:
        pandas.DataFrame: concatenated gene expression datasets

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> data1 = pd.DataFrame(np.random.randn(3, 3),
                            index=["gene1", "gene2", "gene3"],
                            columns=["c1", "c2", "c3"])
        >>> data1
                     c1        c2        c3
        gene1  1.764052  0.400157  0.978738
        gene2  2.240893  1.867558 -0.977278
        gene3  0.950088 -0.151357 -0.103219
        >>> data2 = pd.DataFrame(np.random.randn(3, 3),
                            index=["gene2", "gene3", "gene4"],
                            columns=["c4", "c5", "c6"])
        >>> data2
                     c4        c5        c6
        gene2  0.410599  0.144044  1.454274
        gene3  0.761038  0.121675  0.443863
        gene4  0.333674  1.494079 -0.205158
        >>> data=cat_gene_expression_dfs([data1, data2])
        >>> data
                     c1        c2        c3        c4        c5        c6
        gene1  1.764052  0.400157  0.978738       NaN       NaN       NaN
        gene2  2.240893  1.867558 -0.977278  0.410599  0.144044  1.454274
        gene3  0.950088 -0.151357 -0.103219  0.761038  0.121675  0.443863
        gene4       NaN       NaN       NaN  0.333674  1.494079 -0.205158

    """
    cat = pd.concat(gene_expression_dfs,axis=1,sort=True)
    return(cat)

[docs]def median_outliers_filter(X, threshold=3):
    """
    Ensures that all the values of data_set are within:
    :math:`median(X) \pm \\tau \\times MAD(X))`

    Args:
        X (pandas.DataFrame or numpy.array): gene expression matrix (for instance)
        threshold (float): :math:`\\tau` threshold

    Returns:
        pandas.DataFrame or numpy.array: X without outliers (outliers set to
        the extreme values allowed)

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> data = pd.DataFrame(np.random.randn(5, 5),
                            index=["c1", "c2", "c3", "c4", "c5"],
                            columns=["gene1", "gene2", "gene3", "gene4", "gene5"])
        >>> median_outliers_filter(data)
               gene1     gene2     gene3     gene4     gene5
        c1  1.764052  0.400157  0.978738  0.674682  1.867558
        c2 -0.977278  0.950088 -0.653101 -0.103219  0.410599
        c3  0.144044  1.454274  0.761038  0.121675  0.443863
        c4  0.333674  1.494079 -0.653101  0.313068 -0.854096
        c5 -2.552990  0.653619  0.864436 -0.674682  2.269755

    """
    difference = np.abs(X-np.median(X,axis=0))
    median_difference = np.median(difference,axis=0)
    s = difference / median_difference
    mask = s > threshold
    mask_negative = X < np.median(X,axis=0)
    mask_positive = X > np.median(X,axis=0)
    X[np.logical_and(mask,mask_negative)] = -threshold*np.ones(X.shape)*median_difference
    X[np.logical_and(mask,mask_positive)] = threshold*np.ones(X.shape)*median_difference
    return(X)

[docs]def columns_matrix_OT_norm(X,reference=None,bins=None,**SinkhornTransport_para):
    """
    Use optimal transport in order to make all conditions disributions alike.

    Args:
        X (pandas.DataFrame): gene expression matrix
        r_percentile (numpy.array): reference distribution
        bins (numpy.array): bins for percentiles computation
        SinkhornTransport_para: ot.da.SinkhornTransport parameters

    Returns:
        pandas.DataFrame: Normalized matrix

    Examples:
        >>> import numpy as np
        >>> import pandas as pd
        >>> a = pd.DataFrame(np.random.randn(10000,10))
        >>> b = pd.DataFrame(np.random.randn(10000,10)*3+4)
        >>> bins = list(range(1,100))
        >>> b_ = columns_matrix_OT_norm(b,a.iloc[:,0],bins,reg_e=5e-1)
    """
    if bins is None:
        bins = [0.01]+list(np.arange(0.1,100,0.1))+[99.99]
    if reference is None:
        reference = X.values[:,0]
    percentile = np.percentile(reference,bins)
    X_c = pd.DataFrame()
    if "reg_e" not in SinkhornTransport_para:
        SinkhornTransport_para["reg_e"] = 1e-3
    for c in X.columns:
        percentile_c = np.percentile(X[c].values.flatten(),bins)
        ot_sinkhorn = ot.da.SinkhornTransport(**SinkhornTransport_para)
        ot_sinkhorn.fit(Xs=percentile_c.reshape(-1,1),
                        Xt=percentile.reshape(-1,1))
        X_c_transf = ot_sinkhorn.transform(Xs=X[c].values.reshape(-1,1))
        X_c[c] = X_c_transf.flatten()
    X_c.index = X.index
    return(X_c)