Source code for grenadine.Preprocessing.rnaseq_normalization

# -*- coding: utf-8 -*-
"""
This module allows to normalize RNAseq gene expression data.
"""
import pandas as pd
import numpy as np
try:
    from rpy2.robjects.packages import importr
    import rpy2.robjects as ro
    from rpy2.robjects import Formula
    from rpy2.robjects import pandas2ri
    deseq2 = importr('DESeq2')
    base = importr('base')
    biogeneric = importr('BiocGenerics')
    sum_exp = importr('SummarizedExperiment')
    pandas2ri.activate()

except:
    print("Warning: could not import R components. DEseq2() function cannot be called.")

__author__ = "Sergio Peignier"
__copyright__ = "Copyright 2019, The GReNaDIne Project"
__credits__ = ["Nicolas Parisot"]
__license__ = "GPL"
__version__ = "0.0.1"
__maintainer__ = "Sergio Peignier"
__email__ = "sergio.peignier@insa-lyon.fr"
__status__ = "pre-alpha"


[docs]def RPM(raw_counts):
    """
    Reads Per Million.

    Args:
        raw_counts (pandas.DataFrame): raw RNAseq counts where rows are genes
            and columns are conditions

    Returns:
        pandas.DataFrame: Normalized counts

    Examples:
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> import pandas as pd
        >>> nb_genes = 1000
        >>> nb_conditions = 5
        >>> raw_counts = np.random.randint(0,1e6,(nb_genes,nb_conditions))
        >>> raw_counts = pd.DataFrame(raw_counts)
        >>> rpm = RPM(raw_counts)
        >>> rpm.head()
                    0            1            2            3            4
        0  1994.031850   617.999738   895.038590   234.467249  1929.409246
        1   308.104674  1783.727269   738.867008   604.569366   245.491264
        2  1235.090833   906.128463   769.221953  1462.698984  1474.653899
        3   628.576824   344.838319  1723.115991  1201.585019  1084.225878
        4  1921.133736  1363.195297   761.440687  1481.020714  1777.679267

    """
    # per million factor: nb of reads per condition divided by 10^6
    pmf = raw_counts.sum(axis=0)/1e6
    rpm = raw_counts / pmf
    return(rpm)

[docs]def RPK(raw_counts, seq_lengths, seq_in_kb=False):
    """
    Reads Per Kilobase normalization.

    Args:
        raw_counts (pandas.DataFrame): raw RNAseq counts where rows are genes
            and columns are conditions
        seq_lengths (pandas.Series): sequences DNA lengths
        seq_in_kb (bool): True if lengths in kb, False otherwise

    Returns:
        pandas.DataFrame: Normalized counts

    Examples:
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> import pandas as pd
        >>> nb_genes = 1000
        >>> nb_conditions = 5
        >>> raw_counts = np.random.randint(0,1e6,(nb_genes,nb_conditions))
        >>> raw_counts = pd.DataFrame(raw_counts)
        >>> seq_lengths = np.random.randint(100,20000,nb_genes)
        >>> seq_lengths = pd.Series(seq_lengths)
        >>> rpk = RPK(raw_counts, seq_lengths)
        >>> rpk.head()
                       0              1              2              3              4
        0  321202.997719   99612.577387  142010.101010   38433.365917  313911.697621
        1   26853.843441  155566.114245   63431.417489   53620.768688   21611.248237
        2   97195.319962   71353.390640   59624.960204  117133.237822  117212.034384
        3  132006.796941   72465.590484  356436.703483  256785.896347  229981.733220
        4   48384.227419   34354.424576   18889.143614   37956.492944   45220.490091

    """
    seq_lengths = seq_lengths[raw_counts.index]
    seq_lengths = seq_lengths.fillna(seq_lengths.mean())
    if not seq_in_kb:
        seq_lengths_in_kb = seq_lengths / 1e3
    else:
        seq_lengths_in_kb = seq_lengths
    rpk = (raw_counts.T / seq_lengths_in_kb).T
    return(rpk)

[docs]def RPKM(raw_counts, seq_lengths, seq_in_kb=False):
    """
    Reads Per Kilobase Million (also known as FPM: Fragments per kilobase).

    Args:
        raw_counts (pandas.DataFrame): raw RNAseq counts where rows are genes
            and columns are conditions
        seq_lengths (pandas.Series): sequences DNA lengths
        seq_in_kb (bool): True if lengths in kb, False otherwise

    Returns:
        pandas.DataFrame: Normalized counts

    Examples:
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> import pandas as pd
        >>> nb_genes = 1000
        >>> nb_conditions = 5
        >>> raw_counts = np.random.randint(0,1e6,(nb_genes,nb_conditions))
        >>> raw_counts = pd.DataFrame(raw_counts)
        >>> seq_lengths = np.random.randint(100,20000,nb_genes)
        >>> seq_lengths = pd.Series(seq_lengths)
        >>> rpkm = RPKM(raw_counts, seq_lengths)
        >>> rpkm.head()
                    0           1           2           3           4
        0  649.733415  201.368439  291.638511   76.398582  628.676848
        1   54.320288  314.479420  130.265692  106.588393   43.281252
        2  196.607901  144.242035  122.448576  232.839698  234.742741
        3  267.024989  146.490365  731.994898  510.443933  460.588733
        4   97.872216   69.448026   38.791619   75.450645   90.563924

    """
    # reads per million
    rpm = RPM(raw_counts=raw_counts)
    rpkm = RPK(rpm, seq_lengths, seq_in_kb=False)
    return(rpkm)

[docs]def TPM(raw_counts, seq_lengths, seq_in_kb=False):
    """
    Transcript Per Million normalization.

    Args:
        raw_counts (pandas.DataFrame): raw RNAseq counts where rows are genes
            and columns are conditions
        seq_lengths (pandas.Series): sequences DNA lengths
        seq_in_kb (bool): True if lengths in kb, False otherwise

    Returns:
        pandas.DataFrame: Normalized counts

    Examples:
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> import pandas as pd
        >>> nb_genes = 1000
        >>> nb_conditions = 5
        >>> raw_counts = np.random.randint(0,1e6,(nb_genes,nb_conditions))
        >>> raw_counts = pd.DataFrame(raw_counts)
        >>> seq_lengths = np.random.randint(100,20000,nb_genes)
        >>> seq_lengths = pd.Series(seq_lengths)
        >>> tpm = TPM(raw_counts, seq_lengths)
        >>> tpm.head()
                     0            1            2            3            4
        0  2455.468465   739.530213  1103.147117   265.510632  2397.256398
        1   205.286894  1154.932887   492.740902   370.430324   165.039097
        2   743.019352   529.732184   463.172003   809.195846   895.115733
        3  1009.139172   537.989227  2768.832068  1773.963432  1756.306584
        4   369.878069   255.049468   146.732550   262.216233   345.336316

    """
    rpk = RPK(raw_counts, seq_lengths, seq_in_kb)
    tpm = RPM(rpk)
    return(tpm)


[docs]def DEseq2(raw_counts,col_data,rlog=True):
    """
    Apply R DEseq2 normalization.

    Args:
        raw_counts (pandas.DataFrame): raw RNAseq counts where rows are genes
            and columns are conditions
        col_data (pandas.DataFrame): Two columns, one corresponding to ids of
            each condition (individuals), and one with the experiment id
            (if many repetitions)

    Returns:
        pandas.DataFrame: Normalized counts

    Example:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> raw_counts = pd.DataFrame(np.random.randint(0,1000,(20,10)),
                                      columns = ["Z"+str(i) for i in range(10)])
        >>> col_data = pd.DataFrame([["Z0","1"],
                                     ["Z1","2"],
                                     ["Z2","3"],
                                     ["Z3","4"],
                                     ["Z4","5"],
                                     ["Z5","6"],
                                     ["Z6","7"],
                                     ["Z7","8"],
                                     ["Z8","9"],
                                     ["Z9","10"]
                                     ],columns=["individuals","conditions"])
        >>> raw_counts.columns = col_data["individuals"]
        >>> col_data.index = col_data['individuals']
        >>> DEseq2(raw_counts,col_data,rlog=False)
        individuals          X0          X1     ...              X8          X9
        0            408.025477  382.991634     ...        7.745300  611.474516
        1            165.238388  516.593367     ...      270.224902  596.251084
        2            289.912839  377.510537     ...      727.197585   60.893728
        3            463.502625  627.585575     ...      385.543809  718.884285
        4             59.056319  674.174898     ...      364.029087  243.574911
        5            573.263865  181.561329     ...      129.948918  570.878697
        6            304.229522  314.477925     ...      802.068816   44.824550
        7            537.472156  376.825400     ...       36.144732  373.819828
        8            323.914962  608.401737     ...      748.712307  100.643800
        9            464.695682  294.608949     ...      781.414683  535.357356
        10           559.543710   57.551516     ...      112.737140  822.065324
        11           517.786716  123.324676     ...      618.763389  768.783312
        12           222.505121  584.421939     ...       81.755942  166.612005
        13           361.496256  175.395095     ...      333.047888  515.905193
        14           330.476775  666.638390     ...      779.693506  312.926101
        15           331.073304  653.620785     ...      493.978005  787.389729
        16           437.851901   84.271862     ...      483.650938  347.601696
        17           466.485268   28.090621     ...      750.433484    9.303208
        18           459.326926  210.337087     ...      149.742461  468.543405
        19           221.312064  126.065225     ...      662.653421  435.559302

    """
    if len(col_data.index) != len(raw_counts.columns):
        print('each raw of col_data should correspond to each column of raw_counts ... dataframes shapes mismatches')
        return(1)
    if (col_data.index!=raw_counts.columns).any():
        col_data.index = raw_counts.columns
    dds = deseq2.DESeqDataSetFromMatrix(raw_counts,col_data,Formula('~ conditions'))
    dds = deseq2.DESeq(dds)
    res = deseq2.results(dds)
    if rlog:
        rlog_counts = deseq2.rlog(dds)
        normed_counts = sum_exp.assay(rlog_counts)
    else:
        normed_counts = biogeneric.counts(dds,True)
    normed_counts = pd.DataFrame(normed_counts)
    normed_counts.index = raw_counts.index
    normed_counts.columns = raw_counts.columns
    return(normed_counts)

[docs]def log(X,base=10, pseudocount=1):
    """
    Add a pseudocount and apply the log transformation with a given base.

    Args:
        X (pandas.DataFrame or numpy.array): gene expression matrix
        base (float): logarithm base
        pseudocount(float): pseudocount value

    Returns:
        pandas.DataFrame or numpy.array: log transformed gene expression matrix

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> data = pd.DataFrame(np.random.randn(5, 5),
                            index=["c1", "c2", "c3", "c4", "c5"],
                            columns=["gene1", "gene2", "gene3", "gene4", "gene5"])
        >>> pseudocount = -np.min(data.values)+1
        >>> log_data = log(data, pseudocount=pseudocount)
        >>> log_data
               gene1     gene2     gene3     gene4     gene5
        c1  0.725670  0.596943  0.656264  0.762970  0.734043
        c2  0.410897  0.653509  0.531687  0.537790  0.598089
        c3  0.567853  0.699600  0.634883  0.565218  0.601718
        c4  0.589577  0.703039  0.524764  0.587268  0.431186
        c5  0.000000  0.623932  0.645169  0.448834  0.765128

    """
    X_log = np.log(X+pseudocount)
    return(X_log/np.log(base))