# -*- coding: utf-8 -*-
"""
This module allows to pre-process gene expression data.
"""
import numpy as np
import pandas as pd
try:
import ot
except:
print("Warning: could not import POT. columns_matrix_OT_norm() function cannot be called.")
__author__ = "Sergio Peignier"
__copyright__ = "Copyright 2019, The GReNaDIne Project"
__license__ = "GPL"
__version__ = "0.0.1"
__maintainer__ = "Sergio Peignier"
__email__ = "sergio.peignier@insa-lyon.fr"
__status__ = "pre-alpha"
[docs]def z_score(A, axis=0):
"""
Compute the z-score along the specified axis.
Args:
A (pandas.DataFrame or numpy.array): matrix
axis (int): 0 for columns and 1 for rows
Returns:
pandas.DataFrame or numpy.array: Normalized matrix
Examples:
>>> import pandas as pd
>>> import numpy as np
>>> np.random.seed(0)
>>> data = pd.DataFrame(np.random.randn(5, 5),
index=["c1", "c2", "c3", "c4", "c5"],
columns=["gene1", "gene2", "gene3", "gene4", "gene5"])
>>> norm_data = z_score(data)
>>> norm_data
gene1 gene2 gene3 gene4 gene5
c1 1.254757 -1.222682 0.914682 1.672581 0.828015
c2 -0.446591 -0.083589 -1.038607 -0.418644 -0.331945
c3 0.249333 0.960749 0.538403 -0.218012 -0.305461
c4 0.367024 1.043200 -1.131598 -0.047267 -1.338834
c5 -1.424523 -0.697678 0.717120 -0.988659 1.148225
"""
epsilon = 1e-5
if axis==0:
A = (A - A.mean(axis=0)) / (A.std(axis=0) + epsilon)
if axis==1:
A = A.T
A = (A - A.mean(axis=0)) / (A.std(axis=0) + epsilon)
A = A.T
return(A)
[docs]def mean_std_polishing(A, nb_iterations=5):
"""
Iterative z-score on rows and columns.
Args:
A (pandas.DataFrame or numpy.array): matrix
nb_iterations (int): number of polishing iterations
Returns:
pandas.DataFrame or numpy.array: Polished matrix
Examples:
>>> import pandas as pd
>>> import numpy as np
>>> np.random.seed(0)
>>> data = pd.DataFrame(np.random.randn(5, 5),
index=["c1", "c2", "c3", "c4", "c5"],
columns=["gene1", "gene2", "gene3", "gene4", "gene5"])
>>> norm_data = mean_std_polishing(data)
>>> norm_data
gene1 gene2 gene3 gene4 gene5
c1 0.336095 -1.618781 0.187436 1.109617 -0.014367
c2 -0.321684 0.586608 -1.606905 0.484159 0.857821
c3 0.139260 0.860934 0.976541 -1.395814 -0.580921
c4 1.243263 0.421752 -0.585940 0.282319 -1.361394
c5 -1.363323 -0.161066 0.826375 -0.421998 1.120013
"""
epsilon = 1e-10
for i in range(nb_iterations):
A = A-A.mean(axis=0)
# std polish the column
A = A/(A.std(axis=0)+epsilon)
# mean polish the row
A = (A.T-A.T.mean(axis=0)).T
# std polish the row
A = (A.T/(A.T.std(axis=0) + epsilon)).T
return(A)
[docs]def cat_gene_expression_dfs(gene_expression_dfs):
"""
Concatenate different gene expression datasets, based on gene id (rows).
Args:
gene_expression_dfs (list of pandas.DataFrame): Expression datasets list
Returns:
pandas.DataFrame: concatenated gene expression datasets
Examples:
>>> import pandas as pd
>>> import numpy as np
>>> np.random.seed(0)
>>> data1 = pd.DataFrame(np.random.randn(3, 3),
index=["gene1", "gene2", "gene3"],
columns=["c1", "c2", "c3"])
>>> data1
c1 c2 c3
gene1 1.764052 0.400157 0.978738
gene2 2.240893 1.867558 -0.977278
gene3 0.950088 -0.151357 -0.103219
>>> data2 = pd.DataFrame(np.random.randn(3, 3),
index=["gene2", "gene3", "gene4"],
columns=["c4", "c5", "c6"])
>>> data2
c4 c5 c6
gene2 0.410599 0.144044 1.454274
gene3 0.761038 0.121675 0.443863
gene4 0.333674 1.494079 -0.205158
>>> data=cat_gene_expression_dfs([data1, data2])
>>> data
c1 c2 c3 c4 c5 c6
gene1 1.764052 0.400157 0.978738 NaN NaN NaN
gene2 2.240893 1.867558 -0.977278 0.410599 0.144044 1.454274
gene3 0.950088 -0.151357 -0.103219 0.761038 0.121675 0.443863
gene4 NaN NaN NaN 0.333674 1.494079 -0.205158
"""
cat = pd.concat(gene_expression_dfs,axis=1,sort=True)
return(cat)
[docs]def columns_matrix_OT_norm(X,reference=None,bins=None,**SinkhornTransport_para):
"""
Use optimal transport in order to make all conditions disributions alike.
Args:
X (pandas.DataFrame): gene expression matrix
r_percentile (numpy.array): reference distribution
bins (numpy.array): bins for percentiles computation
SinkhornTransport_para: ot.da.SinkhornTransport parameters
Returns:
pandas.DataFrame: Normalized matrix
Examples:
>>> import numpy as np
>>> import pandas as pd
>>> a = pd.DataFrame(np.random.randn(10000,10))
>>> b = pd.DataFrame(np.random.randn(10000,10)*3+4)
>>> bins = list(range(1,100))
>>> b_ = columns_matrix_OT_norm(b,a.iloc[:,0],bins,reg_e=5e-1)
"""
if bins is None:
bins = [0.01]+list(np.arange(0.1,100,0.1))+[99.99]
if reference is None:
reference = X.values[:,0]
percentile = np.percentile(reference,bins)
X_c = pd.DataFrame()
if "reg_e" not in SinkhornTransport_para:
SinkhornTransport_para["reg_e"] = 1e-3
for c in X.columns:
percentile_c = np.percentile(X[c].values.flatten(),bins)
ot_sinkhorn = ot.da.SinkhornTransport(**SinkhornTransport_para)
ot_sinkhorn.fit(Xs=percentile_c.reshape(-1,1),
Xt=percentile.reshape(-1,1))
X_c_transf = ot_sinkhorn.transform(Xs=X[c].values.reshape(-1,1))
X_c[c] = X_c_transf.flatten()
X_c.index = X.index
return(X_c)