Source code for grenadine.Preprocessing.rnaseq_normalization

# -*- coding: utf-8 -*-
"""
This module allows to normalize RNAseq gene expression data.
"""
import pandas as pd
import numpy as np
try:
    from rpy2.robjects.packages import importr
    import rpy2.robjects as ro
    from rpy2.robjects import Formula
    from rpy2.robjects import pandas2ri
    deseq2 = importr('DESeq2')
    base = importr('base')
    biogeneric = importr('BiocGenerics')
    sum_exp = importr('SummarizedExperiment')
    pandas2ri.activate()

except:
    print("Warning: could not import R components. DEseq2() function cannot be called.")

__author__ = "Sergio Peignier"
__copyright__ = "Copyright 2019, The GReNaDIne Project"
__credits__ = ["Nicolas Parisot"]
__license__ = "GPL"
__version__ = "0.0.1"
__maintainer__ = "Sergio Peignier"
__email__ = "sergio.peignier@insa-lyon.fr"
__status__ = "pre-alpha"


[docs]def RPM(raw_counts): """ Reads Per Million. Args: raw_counts (pandas.DataFrame): raw RNAseq counts where rows are genes and columns are conditions Returns: pandas.DataFrame: Normalized counts Examples: >>> import numpy as np >>> np.random.seed(0) >>> import pandas as pd >>> nb_genes = 1000 >>> nb_conditions = 5 >>> raw_counts = np.random.randint(0,1e6,(nb_genes,nb_conditions)) >>> raw_counts = pd.DataFrame(raw_counts) >>> rpm = RPM(raw_counts) >>> rpm.head() 0 1 2 3 4 0 1994.031850 617.999738 895.038590 234.467249 1929.409246 1 308.104674 1783.727269 738.867008 604.569366 245.491264 2 1235.090833 906.128463 769.221953 1462.698984 1474.653899 3 628.576824 344.838319 1723.115991 1201.585019 1084.225878 4 1921.133736 1363.195297 761.440687 1481.020714 1777.679267 """ # per million factor: nb of reads per condition divided by 10^6 pmf = raw_counts.sum(axis=0)/1e6 rpm = raw_counts / pmf return(rpm)
[docs]def RPK(raw_counts, seq_lengths, seq_in_kb=False): """ Reads Per Kilobase normalization. Args: raw_counts (pandas.DataFrame): raw RNAseq counts where rows are genes and columns are conditions seq_lengths (pandas.Series): sequences DNA lengths seq_in_kb (bool): True if lengths in kb, False otherwise Returns: pandas.DataFrame: Normalized counts Examples: >>> import numpy as np >>> np.random.seed(0) >>> import pandas as pd >>> nb_genes = 1000 >>> nb_conditions = 5 >>> raw_counts = np.random.randint(0,1e6,(nb_genes,nb_conditions)) >>> raw_counts = pd.DataFrame(raw_counts) >>> seq_lengths = np.random.randint(100,20000,nb_genes) >>> seq_lengths = pd.Series(seq_lengths) >>> rpk = RPK(raw_counts, seq_lengths) >>> rpk.head() 0 1 2 3 4 0 321202.997719 99612.577387 142010.101010 38433.365917 313911.697621 1 26853.843441 155566.114245 63431.417489 53620.768688 21611.248237 2 97195.319962 71353.390640 59624.960204 117133.237822 117212.034384 3 132006.796941 72465.590484 356436.703483 256785.896347 229981.733220 4 48384.227419 34354.424576 18889.143614 37956.492944 45220.490091 """ seq_lengths = seq_lengths[raw_counts.index] seq_lengths = seq_lengths.fillna(seq_lengths.mean()) if not seq_in_kb: seq_lengths_in_kb = seq_lengths / 1e3 else: seq_lengths_in_kb = seq_lengths rpk = (raw_counts.T / seq_lengths_in_kb).T return(rpk)
[docs]def RPKM(raw_counts, seq_lengths, seq_in_kb=False): """ Reads Per Kilobase Million (also known as FPM: Fragments per kilobase). Args: raw_counts (pandas.DataFrame): raw RNAseq counts where rows are genes and columns are conditions seq_lengths (pandas.Series): sequences DNA lengths seq_in_kb (bool): True if lengths in kb, False otherwise Returns: pandas.DataFrame: Normalized counts Examples: >>> import numpy as np >>> np.random.seed(0) >>> import pandas as pd >>> nb_genes = 1000 >>> nb_conditions = 5 >>> raw_counts = np.random.randint(0,1e6,(nb_genes,nb_conditions)) >>> raw_counts = pd.DataFrame(raw_counts) >>> seq_lengths = np.random.randint(100,20000,nb_genes) >>> seq_lengths = pd.Series(seq_lengths) >>> rpkm = RPKM(raw_counts, seq_lengths) >>> rpkm.head() 0 1 2 3 4 0 649.733415 201.368439 291.638511 76.398582 628.676848 1 54.320288 314.479420 130.265692 106.588393 43.281252 2 196.607901 144.242035 122.448576 232.839698 234.742741 3 267.024989 146.490365 731.994898 510.443933 460.588733 4 97.872216 69.448026 38.791619 75.450645 90.563924 """ # reads per million rpm = RPM(raw_counts=raw_counts) rpkm = RPK(rpm, seq_lengths, seq_in_kb=False) return(rpkm)
[docs]def TPM(raw_counts, seq_lengths, seq_in_kb=False): """ Transcript Per Million normalization. Args: raw_counts (pandas.DataFrame): raw RNAseq counts where rows are genes and columns are conditions seq_lengths (pandas.Series): sequences DNA lengths seq_in_kb (bool): True if lengths in kb, False otherwise Returns: pandas.DataFrame: Normalized counts Examples: >>> import numpy as np >>> np.random.seed(0) >>> import pandas as pd >>> nb_genes = 1000 >>> nb_conditions = 5 >>> raw_counts = np.random.randint(0,1e6,(nb_genes,nb_conditions)) >>> raw_counts = pd.DataFrame(raw_counts) >>> seq_lengths = np.random.randint(100,20000,nb_genes) >>> seq_lengths = pd.Series(seq_lengths) >>> tpm = TPM(raw_counts, seq_lengths) >>> tpm.head() 0 1 2 3 4 0 2455.468465 739.530213 1103.147117 265.510632 2397.256398 1 205.286894 1154.932887 492.740902 370.430324 165.039097 2 743.019352 529.732184 463.172003 809.195846 895.115733 3 1009.139172 537.989227 2768.832068 1773.963432 1756.306584 4 369.878069 255.049468 146.732550 262.216233 345.336316 """ rpk = RPK(raw_counts, seq_lengths, seq_in_kb) tpm = RPM(rpk) return(tpm)
[docs]def DEseq2(raw_counts,col_data,rlog=True): """ Apply R DEseq2 normalization. Args: raw_counts (pandas.DataFrame): raw RNAseq counts where rows are genes and columns are conditions col_data (pandas.DataFrame): Two columns, one corresponding to ids of each condition (individuals), and one with the experiment id (if many repetitions) Returns: pandas.DataFrame: Normalized counts Example: >>> import pandas as pd >>> import numpy as np >>> np.random.seed(0) >>> raw_counts = pd.DataFrame(np.random.randint(0,1000,(20,10)), columns = ["Z"+str(i) for i in range(10)]) >>> col_data = pd.DataFrame([["Z0","1"], ["Z1","2"], ["Z2","3"], ["Z3","4"], ["Z4","5"], ["Z5","6"], ["Z6","7"], ["Z7","8"], ["Z8","9"], ["Z9","10"] ],columns=["individuals","conditions"]) >>> raw_counts.columns = col_data["individuals"] >>> col_data.index = col_data['individuals'] >>> DEseq2(raw_counts,col_data,rlog=False) individuals X0 X1 ... X8 X9 0 408.025477 382.991634 ... 7.745300 611.474516 1 165.238388 516.593367 ... 270.224902 596.251084 2 289.912839 377.510537 ... 727.197585 60.893728 3 463.502625 627.585575 ... 385.543809 718.884285 4 59.056319 674.174898 ... 364.029087 243.574911 5 573.263865 181.561329 ... 129.948918 570.878697 6 304.229522 314.477925 ... 802.068816 44.824550 7 537.472156 376.825400 ... 36.144732 373.819828 8 323.914962 608.401737 ... 748.712307 100.643800 9 464.695682 294.608949 ... 781.414683 535.357356 10 559.543710 57.551516 ... 112.737140 822.065324 11 517.786716 123.324676 ... 618.763389 768.783312 12 222.505121 584.421939 ... 81.755942 166.612005 13 361.496256 175.395095 ... 333.047888 515.905193 14 330.476775 666.638390 ... 779.693506 312.926101 15 331.073304 653.620785 ... 493.978005 787.389729 16 437.851901 84.271862 ... 483.650938 347.601696 17 466.485268 28.090621 ... 750.433484 9.303208 18 459.326926 210.337087 ... 149.742461 468.543405 19 221.312064 126.065225 ... 662.653421 435.559302 """ if len(col_data.index) != len(raw_counts.columns): print('each raw of col_data should correspond to each column of raw_counts ... dataframes shapes mismatches') return(1) if (col_data.index!=raw_counts.columns).any(): col_data.index = raw_counts.columns dds = deseq2.DESeqDataSetFromMatrix(raw_counts,col_data,Formula('~ conditions')) dds = deseq2.DESeq(dds) res = deseq2.results(dds) if rlog: rlog_counts = deseq2.rlog(dds) normed_counts = sum_exp.assay(rlog_counts) else: normed_counts = biogeneric.counts(dds,True) normed_counts = pd.DataFrame(normed_counts) normed_counts.index = raw_counts.index normed_counts.columns = raw_counts.columns return(normed_counts)
[docs]def log(X,base=10, pseudocount=1): """ Add a pseudocount and apply the log transformation with a given base. Args: X (pandas.DataFrame or numpy.array): gene expression matrix base (float): logarithm base pseudocount(float): pseudocount value Returns: pandas.DataFrame or numpy.array: log transformed gene expression matrix Examples: >>> import pandas as pd >>> import numpy as np >>> np.random.seed(0) >>> data = pd.DataFrame(np.random.randn(5, 5), index=["c1", "c2", "c3", "c4", "c5"], columns=["gene1", "gene2", "gene3", "gene4", "gene5"]) >>> pseudocount = -np.min(data.values)+1 >>> log_data = log(data, pseudocount=pseudocount) >>> log_data gene1 gene2 gene3 gene4 gene5 c1 0.725670 0.596943 0.656264 0.762970 0.734043 c2 0.410897 0.653509 0.531687 0.537790 0.598089 c3 0.567853 0.699600 0.634883 0.565218 0.601718 c4 0.589577 0.703039 0.524764 0.587268 0.431186 c5 0.000000 0.623932 0.645169 0.448834 0.765128 """ X_log = np.log(X+pseudocount) return(X_log/np.log(base))