Source code for netZooPy.lioness.lioness

from __future__ import print_function
import os, os.path, sys
#from netZooPy.command_line import lioness
import numpy as np
import pandas as pd
from .timer import Timer
from joblib.externals.loky import set_loky_pickler
from joblib import parallel_backend
from joblib import Parallel, delayed
from joblib import wrap_non_picklable_objects


sys.path.insert(1, "../panda")
from netZooPy.panda.panda import Panda
from netZooPy.panda.calculations import compute_panda
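# Both LIONESS loops below reconstruct the network of a single sample q from two
# aggregate PANDA networks: one over all N samples and one over the N-1 samples
# excluding q. Per edge, they compute
#     lioness_q = N * panda_all - (N - 1) * panda_all_minus_q
# The snippet below is only an illustration of that identity on made-up arrays
# (shapes and values are hypothetical, not netZooPy data):
#
#     N = 20
#     panda_all = np.random.rand(5, 10)          # hypothetical TF x gene network
#     panda_all_minus_q = np.random.rand(5, 10)  # same network without sample q
#     lioness_q = N * panda_all - (N - 1) * panda_all_minus_q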

class Lioness(Panda):
    """
    Using LIONESS to infer single-sample gene regulatory networks.

        1. Reading in PANDA network and preprocessed middle data
        2. Computing coexpression network
        3. Normalizing coexpression network
        4. Running PANDA algorithm
        5. Writing out LIONESS networks

    Parameters
    ----------
    obj : object
        PANDA object, generated with keep_expression_matrix=True.
    computing : str
        'cpu' uses the Central Processing Unit (CPU) to run PANDA.
        'gpu' uses the Graphical Processing Unit (GPU) to run PANDA.
    precision : str
        'double' computes the regulatory network in double precision (15 decimal digits).
        'single' computes the regulatory network in single precision (7 decimal digits),
        which is faster and requires half the memory but is less accurate.
    subset_numbers : list
        List of sample indices on which lioness should be run, e.g. [1, 10, 20].
    subset_names : list
        List of sample names on which lioness should be run, e.g. ['s1', 's2', 's3'].
    start : int
        Index of the first sample for which to compute a network.
        Ignored if subset_numbers or subset_names is passed.
    end : int
        Index of the last sample for which to compute a network.
        Ignored if subset_numbers or subset_names is passed.
    all_background : bool
        Pass this flag to keep all samples as background.
    save_dir : str
        Directory to save the networks.
    save_fmt : str
        Save format.
        - '.npy': (default) NumPy file of the network.
        - '.txt': text file; only values are saved, no tf or gene names. Will be deprecated.
        - '.csv': text file with index (tf) and column (gene) names.
        - '.h5': HDF file, the fastest way to save the lioness dataframe with index/column names.
        - '.mat': MATLAB file.
    output : str
        - 'network' returns all networks in a single edge-by-sample matrix
          (lioness_obj.total_lioness_network is the unlabeled variable and
          lioness_obj.export_lioness_results is the row-labeled variable).
          For large sample sizes, this variable requires a large amount of RAM.
        - 'gene_targeting' returns gene targeting scores for all networks in a single
          gene-by-sample matrix (lioness_obj.total_lioness_network).
        - 'tf_targeting' returns tf targeting scores for all networks in a single
          tf-by-sample matrix (lioness_obj.total_lioness_network).
    alpha : float
        Learning rate, 0.1 by default; it has to be changed manually to match the
        learning rate of the PANDA object.
    save_single : bool
        When set to True, each lioness network is saved with its sample name inside
        the lioness output folder.
    export_filename : str
        If passed, the final lioness table is saved with all tf-gene edges as the
        dataframe index and samples as column names.
    ignore_final : bool
        If True, no lioness network is kept in memory. This requires saving the
        single networks at each step.

    Returns
    -------
    export_lioness_results : _
        Depending on the output argument, this can be either all the lioness networks
        or their gene/tf targeting scores.
    Notes
    -----
    Example on how to use Lioness and plot the network:

    >>> from netZooPy.lioness.lioness import Lioness
    >>> # To run the Lioness algorithm for single-sample networks, first run PANDA
    >>> # using the keep_expression_matrix flag, then use Lioness as follows:
    >>> panda_obj = Panda('../../tests/ToyData/ToyExpressionData.txt', '../../tests/ToyData/ToyMotifData.txt', '../../tests/ToyData/ToyPPIData.txt', remove_missing=False, keep_expression_matrix=True)
    >>> lioness_obj = Lioness(panda_obj)
    >>> # Save Lioness results:
    >>> lioness_obj.save_lioness_results('Toy_Lioness.txt')
    >>> # Return a network plot for one of the Lioness single-sample networks:
    >>> plot = AnalyzeLioness(lioness_obj)
    >>> plot.top_network_plot(column=0, top=100, file='top_100_genes.png')

    Example lioness output. TF, gene, and motif order is identical to the panda output file:

        Sample1 Sample2 Sample3 Sample4
        -------------------------------
        -0.667452814003 -1.70433776179 -0.158129613892 -0.655795512803
        -0.843366539284 -0.733709815256 -0.84849895139 -0.915217389738
        3.23445386464 2.68888472802 3.35809757371 3.05297381396
        2.39500370135 1.84608635425 2.80179804094 2.67540878165
        -0.117475863987 0.494923925853 0.0518448588965 -0.0584810456421

    References
    ----------
    .. [1] Kuijjer, Marieke Lydia, et al. "Estimating sample-specific regulatory
       networks." iScience 14 (2019): 226-240.

    Authors: Cho-Yi Chen, David Vi, Daniel Morgan
    """

    def __init__(
        self,
        obj,
        computing="cpu",
        precision="double",
        ncores=1,
        start=1,
        end=None,
        subset_numbers=None,
        subset_names=None,
        save_dir="lioness_output",
        save_fmt="npy",
        output="network",
        alpha=0.1,
        save_single=False,
        export_filename=None,
        ignore_final=False,
    ):
        """Initialize an instance of the Lioness class and load data."""
        # Load data
        with Timer("Loading input data ..."):
            self.export_panda_results = obj.export_panda_results
            self.expression_samples = obj.expression_samples
            if precision == "single":
                self.expression_matrix = np.float32(obj.expression_matrix)
                self.correlation_matrix = np.float32(obj.correlation_matrix)
                self.motif_matrix = np.float32(obj.motif_matrix)
                self.ppi_matrix = np.float32(obj.ppi_matrix)
                self.alpha = np.float32(alpha)
            else:
                self.expression_matrix = obj.expression_matrix
                self.motif_matrix = obj.motif_matrix
                self.ppi_matrix = obj.ppi_matrix
                self.correlation_matrix = obj.correlation_matrix
                self.alpha = alpha
            self.computing = computing
            self.n_cores = int(ncores)
            self.save_single = save_single
            self.precision = precision

            if precision == "single":
                self.np_dtype = np.float32
            else:
                self.np_dtype = np.float64

            if hasattr(obj, "panda_network"):
                self.network = obj.panda_network.to_numpy()
            elif hasattr(obj, "puma_network"):
                self.network = obj.puma_network
            else:
                print("Cannot find panda or puma network in object")
                raise AttributeError("Cannot find panda or puma network in object")

            gene_names = obj.gene_names
            tf_names = obj.unique_tfs
            origmotif = obj.motif_data  # save state of original motif matrix
            self.gene_names = gene_names
            self.tf_names = tf_names
            del obj

        # Get sample range to iterate.
        # The number of conditions is the N parameter used for the number of samples
        # in the whole background.
        self.n_conditions = self.expression_matrix.shape[1]
        self.n_lio_samples = self.n_conditions

        if subset_numbers is not None or subset_names is not None:
            if subset_numbers is not None and subset_names is not None:
                sys.exit("Pass only one between subset_numbers and subset_names")
            elif subset_numbers is not None and subset_names is None:
                # select using indexes
                assert isinstance(subset_numbers, list)
                assert isinstance(subset_numbers[0], int)
                self.indexes = [int(i) for i in subset_numbers]
            else:
                # select using sample names
                assert isinstance(subset_names, list)
                assert isinstance(subset_names[0], str)
                self.indexes = [list(self.expression_samples).index(i) for i in subset_names]
            # self.expression_samples = self.expression_samples[self.indexes]
            # number of lioness networks to be computed
            self.n_lio_samples = len(self.indexes)
        else:
            # If no subset is selected, we just use the start and end numbers to decide
            # which samples need to be analyzed. The background is always what is used
            # for PANDA and stays the same.
            self.indexes = range(self.n_conditions)[start - 1 : end]  # sample indexes to include
            # self.expression_samples = self.expression_samples[start-1:end]
            self.n_lio_samples = len(self.indexes)

        print("Number of total samples:", self.n_conditions)
        print("Number of computed samples:", len(self.indexes))
        print("Number of parallel cores:", self.n_cores)

        # Create the output folder if it does not exist
        self.save_dir = save_dir
        self.save_fmt = save_fmt
        # if True, no final large matrix is saved
        self.ignore_final = ignore_final

        # Here we make sure that at least one between the complete lioness
        # (edges x samples) dataframe and the single networks is saved.
        if self.save_single is False and self.ignore_final is True:
            sys.exit(
                "ERROR: you are passing save_single=False and ignore_final=True. "
                "This way no output is saved; change one of the two to keep either "
                "the single networks or the final large table."
            )

        # We create the folder
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        #############
        # Run LIONESS
        #############
        if int(self.n_conditions) >= int(self.n_cores) and self.computing == "cpu":
            # total_lioness_network here is a list of 1d arrays
            # (network: (tf x gene, 1), gene_targeting: (gene, 1), tf_targeting: (tf, 1))
            self.total_lioness_network = Parallel(n_jobs=self.n_cores)(
                self.__par_lioness_loop(i, output) for i in self.indexes
            )
        elif self.computing == "gpu":
            for i in self.indexes:
                self.total_lioness_network = self.__lioness_loop(i)
            self.total_lioness_network = self.total_lioness_network.T

        # create result data frame
        if self.ignore_final:
            print(
                "WARNING: we do not keep all lioness networks in memory. "
                "They have been saved individually."
            )
        else:
            if output == "network":
                if isinstance(origmotif, pd.DataFrame):
                    # get row of all TFs
                    total_tfs = tf_names * len(gene_names)
                    # get row of all genes
                    total_genes = [i for i in gene_names for _ in range(len(tf_names))]
                    # first dataframe is made of tf and gene names
                    indDF = pd.DataFrame([total_tfs, total_genes], index=["tf", "gene"])
                    # concatenate with dataframe of data; rows are samples, columns the edges
                    indDF = pd.concat(
                        [
                            indDF,
                            pd.DataFrame(
                                self.total_lioness_network,
                                index=self.expression_samples[self.indexes],
                            ),
                        ],
                        axis=0,
                    ).T
                    # TODO: remove this with next release
                    # indDF = indDF.append(
                    #     pd.DataFrame(self.total_lioness_network, index=self.expression_samples[self.indexes])
                    # ).transpose()
                else:
                    # if motif data is None, to be specific
                    total_genes1 = gene_names * len(gene_names)
                    total_genes2 = [i for i in gene_names for _ in range(len(gene_names))]
                    indDF = pd.DataFrame(
                        [total_genes1, total_genes2], index=["gene1", "gene2"]
                    )
                    indDF = pd.concat(
                        [
                            indDF,
                            pd.DataFrame(
                                self.total_lioness_network,
                                index=self.expression_samples[self.indexes],
                            ),
                        ],
                        axis=0,
                    ).T
                    # TODO: remove this with next release
                    # indDF = indDF.append(
                    #     pd.DataFrame(self.total_lioness_network, index=self.expression_samples[self.indexes])
                    # ).transpose()
                # keep the df as the export results
                self.export_lioness_results = indDF
                del indDF
            elif output == "gene_targeting":
                self.export_lioness_results = pd.DataFrame(
                    self.total_lioness_network,
                    columns=gene_names,
                    index=self.expression_samples[self.indexes],
                ).transpose()
            elif output == "tf_targeting":
                self.export_lioness_results = pd.DataFrame(
                    self.total_lioness_network,
                    columns=tf_names,
                    index=self.expression_samples[self.indexes],
                ).transpose()

            # if export_filename is passed, the full lioness table is saved
            if export_filename:
                self.export_lioness_table(output_filename=export_filename)
            else:
                self.save_lioness_results()

    def __lioness_loop(self, i):
        # TODO: this is now for GPU only in practice
        """
        Run LIONESS for a single sample and update the stacked result.

        Returns
        -------
        self.total_lioness_network : array
            An edge-by-sample matrix containing sample-specific networks.
        """
""" # for i in self.indexes: print("Running LIONESS for sample %d/%d:" %((i),(self.n_conditions))) idx = [x for x in range(self.n_conditions) if x != i] # all samples except i with Timer("Computing coexpression network:"): if self.computing == "gpu": import cupy as cp correlation_matrix_cp = cp.corrcoef(self.expression_matrix[:, idx].astype(self.np_dtype)).astype(self.np_dtype) if cp.isnan(correlation_matrix_cp).any(): cp.fill_diagonal(correlation_matrix_cp, 1) correlation_matrix_cp = cp.nan_to_num(correlation_matrix_cp) correlation_matrix = cp.asnumpy(correlation_matrix_cp) del correlation_matrix_cp cp._default_memory_pool.free_all_blocks() else: correlation_matrix = np.corrcoef(self.expression_matrix[:, idx]) if np.isnan(correlation_matrix).any(): np.fill_diagonal(correlation_matrix, 1) correlation_matrix = np.nan_to_num(correlation_matrix) with Timer("Normalizing networks:"): correlation_matrix_orig = ( correlation_matrix # save matrix before normalization ) correlation_matrix = self._normalize_network(correlation_matrix) with Timer("Inferring LIONESS network:"): if self.motif_matrix is not None: del correlation_matrix_orig subset_panda_network = compute_panda( correlation_matrix, np.copy(self.ppi_matrix), np.copy(self.motif_matrix), computing = self.computing, alpha = self.alpha, ) else: del correlation_matrix subset_panda_network = correlation_matrix_orig # For consistency with R, we are using the N panda_all - (N-1) panda_all_but_q lioness_network = (self.n_conditions * self.network) - ( (self.n_conditions - 1) * subset_panda_network ) # old #lioness_network = self.n_conditions * (self.network - subset_panda_network) + subset_panda_network if self.save_single: with Timer( "Saving LIONESS network %d (%s) to %s using %s format:" % (i, self.expression_samples[i], self.save_dir, self.save_fmt) ): path = os.path.join(self.save_dir, "lioness.%s.%s.%s" % (self.expression_samples[i],str(i),self.save_fmt)) self.__lioness_to_disk(lioness_network, path) if self.ignore_final: return(np.array([0])) else: if self.computing == "gpu" and i == 0: self.total_lioness_network = np.fromstring( np.transpose(lioness_network).tostring(), dtype=lioness_network.dtype )[:,np.newaxis] elif self.computing == "gpu" and i != 0: self.total_lioness_network = np.column_stack( ( self.total_lioness_network, np.fromstring( np.transpose(lioness_network).tostring(), dtype=lioness_network.dtype, ), ) ) return self.total_lioness_network @delayed @wrap_non_picklable_objects def __par_lioness_loop(self, i, output): """ Initialize instance of Lioness class and load data. Returns --------- self.total_lioness_network: array An edge-by-sample matrix containing sample-specific networks. 
""" # for i in self.indexes: print("Running LIONESS for sample %d:" % (i + 1)) idx = [x for x in range(self.n_conditions) if x != i] # all samples except i with Timer("Computing coexpression network:"): if self.computing == "gpu": import cupy as cp correlation_matrix = cp.corrcoef(self.expression_matrix[:, idx]) if cp.isnan(correlation_matrix).any(): cp.fill_diagonal(correlation_matrix, 1) correlation_matrix = cp.nan_to_num(correlation_matrix) correlation_matrix = cp.asnumpy(correlation_matrix) else: # run on CPU with numpy correlation_matrix = np.corrcoef(self.expression_matrix[:, idx]) if np.isnan(correlation_matrix).any(): np.fill_diagonal(correlation_matrix, 1) correlation_matrix = np.nan_to_num(correlation_matrix) with Timer("Normalizing networks:"): correlation_matrix_orig = ( correlation_matrix # save matrix before normalization ) correlation_matrix = self._normalize_network(correlation_matrix) with Timer("Inferring LIONESS network:"): # TODO: fix this correlation matrix+delete if self.motif_matrix is not None: del correlation_matrix_orig subset_panda_network = compute_panda( correlation_matrix, np.copy(self.ppi_matrix), np.copy(self.motif_matrix), computing = self.computing, alpha = self.alpha, ) else: del correlation_matrix subset_panda_network = correlation_matrix_orig # For consistency with R, we are using the N panda_all - (N-1) panda_all_but_q #lioness_network = self.n_conditions * (self.network - subset_panda_network) + subset_panda_network lioness_network = (self.n_conditions * self.network) - ( (self.n_conditions - 1) * subset_panda_network ) # the lioness network here is a TFxGENE np array # if save_single flag is passed, save each single lioness sample if self.save_single: # TODO: here we need to decide whether to add the tf and gene name with Timer( "Saving LIONESS network %d (%s) to %s using %s format:" % (i, self.expression_samples[i], self.save_dir, self.save_fmt) ): path = os.path.join(self.save_dir, "lioness.%s.%s.%s" % (self.expression_samples[i],str(i),self.save_fmt)) print(path) self.__lioness_to_disk(lioness_network, path) # TODO: why this? Should we remove it? # if i == 0: # self.total_lioness_network = np.fromstring(np.transpose(lioness_network).tostring(),dtype=lioness_network.dtype) # else: # self.total_lioness_network=np.column_stack((self.total_lioness_network ,np.fromstring(np.transpose(lioness_network).tostring(),dtype=lioness_network.dtype))) if self.ignore_final: return(np.array([0])) else: if output == "network": self.total_lioness_network = np.transpose(lioness_network).flatten() elif output == "gene_targeting": self.total_lioness_network = np.sum(lioness_network, axis=0) elif output == "tf_targeting": self.total_lioness_network = np.sum(lioness_network, axis=1) return self.total_lioness_network
    def save_lioness_results(self, lioness_output_filename=None):
        """
        Saves the LIONESS networks.

        Writes self.total_lioness_network to disk, using self.save_dir and
        self.save_fmt unless lioness_output_filename is passed.
        """
        # self.lioness_network.to_csv(file, index=False, header=False, sep="\t")
        if lioness_output_filename:
            fullpath = lioness_output_filename
        else:
            fullpath = os.path.join(self.save_dir, "lioness.%s" % (self.save_fmt))

        if fullpath.endswith("txt"):
            np.savetxt(
                fullpath,
                np.transpose(self.total_lioness_network),
                delimiter="\t",
            )
        elif fullpath.endswith("csv"):
            np.savetxt(
                fullpath,
                np.transpose(self.total_lioness_network),
                delimiter=",",
            )
        elif fullpath.endswith("npy"):
            np.save(fullpath, np.transpose(self.total_lioness_network))
        elif fullpath.endswith("h5"):
            print("Warning: saving as txt for now. Need to rewrite this")
            np.savetxt(
                fullpath,
                np.transpose(self.total_lioness_network),
                delimiter="\t",
            )
        elif fullpath.endswith("mat"):
            from scipy.io import savemat

            mdic = {
                "results": np.transpose(self.total_lioness_network),
                "label": "lioness",
            }
            savemat(fullpath, mdic)
        else:
            print("Trying to save lioness output. Format %s not recognised" % str(fullpath))
        return None
    def __lioness_to_disk(self, net, path):
        if self.save_fmt == "txt":
            np.savetxt(path, net)
        elif self.save_fmt == "h5":
            pd.DataFrame(data=net, columns=self.gene_names, index=self.tf_names).to_hdf(
                path, key="lioness", mode="w"
            )
        elif self.save_fmt == "csv":
            pd.DataFrame(data=net, columns=self.gene_names, index=self.tf_names).to_csv(path)
        elif self.save_fmt == "npy":
            np.save(path, net)
        elif self.save_fmt == "mat":
            from scipy.io import savemat

            savemat(path, {"PredNet": net})
        else:
            print("Unknown format %s! Saving in npy format instead." % self.save_fmt)
            np.save(path, net)
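    # Single networks written by __lioness_to_disk are placed in self.save_dir and
    # named "lioness.<sample_name>.<sample_index>.<save_fmt>" by the callers above.
    # A sketch of reading one back, assuming save_fmt="h5" and a hypothetical sample
    # called "s1" that was saved as the first network:
    #
    #     import pandas as pd
    #     net = pd.read_hdf("lioness_output/lioness.s1.0.h5", key="lioness")
    #     # net is a tf-by-gene DataFrame (index: tf_names, columns: gene_names)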
    def export_lioness_table(self, output_filename="lioness_table.txt", header=False, output="network"):
        """
        Saves the LIONESS network with edge names.

        This saves a dataframe with the corresponding header and indexes.

        Parameters
        ----------
        output_filename : str
            Path to save the network. Specify the relative path and format.
            Choose between .csv, .tsv and .txt. (Defaults to lioness_table.txt)
        """
        # TODO: add case where there is tf_targeting or gene_targeting
        if output == "network":
            # we first get the names of the first two columns: (tf, gene) or (gene1, gene2)
            sort_cols = self.export_lioness_results.columns.tolist()[:2]
            if output_filename.endswith("txt"):
                self.export_lioness_results.sort_values(by=sort_cols).to_csv(
                    output_filename, sep=" ", index=False
                )
            elif output_filename.endswith("csv"):
                self.export_lioness_results.sort_values(by=sort_cols).to_csv(
                    output_filename, sep=",", index=False
                )
            elif output_filename.endswith("tsv"):
                self.export_lioness_results.sort_values(by=sort_cols).to_csv(
                    output_filename, sep="\t", index=False
                )
            else:
                sys.exit("Export output unknown: use txt/csv/tsv")
        else:
            # otherwise we only need one column and we sort by index
            if output_filename.endswith("txt"):
                self.export_lioness_results.sort_index().to_csv(output_filename, sep=" ")
            elif output_filename.endswith("csv"):
                self.export_lioness_results.sort_index().to_csv(output_filename, sep=",")
            elif output_filename.endswith("tsv"):
                self.export_lioness_results.sort_index().to_csv(output_filename, sep="\t")
            else:
                sys.exit("Export output unknown: use txt/csv/tsv")
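# A minimal end-to-end sketch of the options documented in the class docstring
# (save_single, save_fmt, export_filename). The file paths are placeholders taken
# from the toy example above; the block only runs when this module is executed
# directly, not on import.
if __name__ == "__main__":
    panda_obj = Panda(
        "../../tests/ToyData/ToyExpressionData.txt",
        "../../tests/ToyData/ToyMotifData.txt",
        "../../tests/ToyData/ToyPPIData.txt",
        remove_missing=False,
        keep_expression_matrix=True,
    )
    lioness_obj = Lioness(
        panda_obj,
        save_dir="lioness_output",
        save_fmt="h5",
        save_single=True,  # write one network per sample
        export_filename="lioness_table.txt",  # table of tf-gene edges by sample
    )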