from __future__ import print_function
import os, os.path, sys
#from netZooPy.command_line import lioness
import numpy as np
import pandas as pd
from .timer import Timer
from joblib.externals.loky import set_loky_pickler
from joblib import parallel_backend
from joblib import Parallel, delayed
from joblib import wrap_non_picklable_objects
sys.path.insert(1, "../panda")
from netZooPy.panda.panda import Panda
from netZooPy.panda.calculations import compute_panda
[docs]
class Lioness(Panda):
""" Using LIONESS to infer single-sample gene regulatory networks.
1. Reading in PANDA network and preprocessed middle data
2. Computing coexpression network
3. Normalizing coexpression network
4. Running PANDA algorithm
5. Writing out LIONESS networks
Parameters
----------
obj : object
PANDA object, generated with keep_expression_matrix=True.
computing : str
'cpu' uses Central Processing Unit (CPU) to run PANDA
'gpu' use the Graphical Processing Unit (GPU) to run PANDA
precision : str
'double' computes the regulatory network in double precision (15 decimal digits).
'single' computes the regulatory network in single precision (7 decimal digits) which is fastaer, requires half the memory but less accurate.
subset_numbers : list
List of sample index onto which lioness should be run. ([1,10,20])
subset_names : list
List of sample names onto which lioness should be run. (['s1','s2','s3'])
start : int
Index of first sample to compute the network. If subset_numbers or subset_names is passed,
this is ignored
end : int
Index of last sample to compute the network.If subset_numbers or subset_names is passed,
this is ignored
all_background : bool
Pass the flag if you want to keep the whole samples as background
save_dir : str
Directory to save the networks.
save_fmt : str
Save format.
- '.npy': (Default) Numpy file of the network.
- '.txt': Text file, only values are saved, no tf or gene names. Will be deprecated.
- '.csv': text file with index (tf) and column (gene) names
- '.h5': hdf file, fastest way to save the lioness dataframe with index/column names
- '.mat': MATLAB file.
output : str
- 'network' returns all networks in a single edge-by-sample matrix (lioness_obj.total_lioness_network is the unlabeled variable and lioness_obj.export_lioness_results is the row-labeled variable). For large sample sizes, this variable requires large RAM memory.
- 'gene_targeting' returns gene targeting scores for all networks in a single gene-by-sample matrix (lioness_obj.total_lioness_network).
- 'tf_targeting' returns tf targeting scores for all networks in a single gene-by-sample matrix (lioness_obj.total_lioness_network).
alpha : float
learning rate, set to 0.1 by default but has to be changed manually to match the learning rate of the PANDA object.
save_single: bool
when set to True it will save each lioness network with its sample name inside the lioness output folder
export_filename: str
if passed, the final lioness table will be saved with all tf-gene edges as dataframe index and
samples as column name
ignore_final: bool
if True, no lioness network is kept in memory. This requires saving single networks at each step
Returns
--------
export_lioness_results : _
Depeding on the output argument, this can be either all the lioness
networks or their gene/tf targeting scores.
Notes
-------
Example on how to use Lioness and plot the network
>>> from netZooPy.lioness.lioness import Lioness
>>> #To run the Lioness algorithm for single sample networks, first run PANDA using the keep_expression_matrix flag, then use Lioness as follows:
>>> panda_obj = Panda('../../tests/ToyData/ToyExpressionData.txt', '../../tests/ToyData/ToyMotifData.txt', '../../tests/ToyData/ToyPPIData.txt', remove_missing=False, keep_expression_matrix=True)
>>> lioness_obj = Lioness(panda_obj)
>>> #Save Lioness results:
>>> lioness_obj.save_lioness_results('Toy_Lioness.txt')
>>> #Return a network plot for one of the Lioness single sample networks:
>>> plot = AnalyzeLioness(lioness_obj)
>>> plot.top_network_plot(column= 0, top=100, file='top_100_genes.png')
Example lioness output:
TF, Gene and Motif order is identical to the panda output file.
- Sample1 Sample2 Sample3 Sample4\n
- -------------------------------\n
- -0.667452814003 -1.70433776179 -0.158129613892 -0.655795512803\n
- -0.843366539284 -0.733709815256 -0.84849895139 -0.915217389738\n
- 3.23445386464 2.68888472802 3.35809757371 3.05297381396\n
- 2.39500370135 1.84608635425 2.80179804094 2.67540878165\n
- -0.117475863987 0.494923925853 0.0518448588965 -0.0584810456421\n
References
-----------
.. [1]__ Kuijjer, Marieke Lydia, et al. "Estimating sample-specific regulatory networks."
Iscience 14 (2019): 226-240.
Authors: Cho-Yi Chen, David Vi, Daniel Morgan
"""
def __init__(
self,
obj,
computing="cpu",
precision="double",
ncores=1,
start=1,
end=None,
subset_numbers=None,
subset_names=None,
save_dir="lioness_output",
save_fmt="npy",
output="network",
alpha=0.1,
save_single = False,
export_filename = None,
ignore_final=False,
):
""" Initialize instance of Lioness class and load data.
"""
# Load data
with Timer("Loading input data ..."):
self.export_panda_results = obj.export_panda_results
self.expression_samples = obj.expression_samples
if precision == "single":
self.expression_matrix = np.float32(obj.expression_matrix)
self.correlation_matrix = np.float32(obj.correlation_matrix)
self.motif_matrix = np.float32(obj.motif_matrix)
self.ppi_matrix = np.float32(obj.ppi_matrix)
self.alpha = np.float32(alpha)
else:
self.expression_matrix = obj.expression_matrix
self.motif_matrix = obj.motif_matrix
self.ppi_matrix = obj.ppi_matrix
self.correlation_matrix = obj.correlation_matrix
self.alpha = alpha
self.computing = computing
self.n_cores = int(ncores)
self.save_single = save_single
self.precision = precision
if precision == "single":
self.np_dtype = np.float32
else:
self.np_dtype = np.float64
if hasattr(obj, "panda_network"):
self.network = obj.panda_network.to_numpy()
elif hasattr(obj, "puma_network"):
self.network = obj.puma_network
else:
print("Cannot find panda or puma network in object")
raise AttributeError("Cannot find panda or puma network in object")
gene_names = obj.gene_names
tf_names = obj.unique_tfs
origmotif = obj.motif_data # save state of original motif matrix
self.gene_names = gene_names
self.tf_names = tf_names
del obj
# Get sample range to iterate
# the number of conditions is the N parameter used for the number of samples in the whole background
self.n_conditions = self.expression_matrix.shape[1]
self.n_lio_samples = self.n_conditions
if (subset_numbers!=None or subset_names!=None):
if (subset_numbers!=None and subset_names!=None):
sys.exit('Pass only one between subset_numbers and subset_names')
elif (subset_numbers!=None and subset_names==None):
# select using indexes
assert isinstance(subset_numbers, list)
assert isinstance(subset_numbers[0], int)
self.indexes = [int(i) for i in subset_numbers]
else:
#select using sample names
assert isinstance(subset_names, list)
assert isinstance(subset_names[0], int)
self.indexes = [self.expression_samples.index(int(i)) for i in subset_names]
#self.expression_samples = self.expression_samples[self.indexes]
# number of lioness networks to be computed
self.n_lio_samples = len(self.indexes)
else:
# if no subset is selected, we just use the start and end numbers to decide
# which samples need to be analyses. The background is always what is used for PANDA
# and stays the same
self.indexes = range(self.n_conditions)[
start - 1 : end
] # sample indexes to include
#self.expression_samples = self.expression_samples[start-1:end]
self.n_lio_samples = len(self.indexes)
print("Number of total samples:", self.n_conditions)
print("Number of computed samples:", len(self.indexes))
print("Number of parallel cores:", self.n_cores)
# Create the output folder if not exists
self.save_dir = save_dir
self.save_fmt = save_fmt
# if true, no final large matrix is saved
self.ignore_final=ignore_final
# Here we make sure that at least one between the complete lioness (edgex x samples)
# dataframe or each single network is saved
if ((self.save_single==False) and (self.ignore_final==True)):
sys.exit("ERROR: you are passing save_single=False, and ignore_final=True. This way no output is saved, change one of the two to have either the single networks or the final large table")
# We create the folder
if not os.path.exists(save_dir):
os.makedirs(save_dir)
#############
# Run LIONESS
#############
if int(self.n_conditions) >= int(self.n_cores) and self.computing == "cpu":
# the total_lioness_network here is a list of 1d
# arrays (network:(tfXgene,1),gene_targeting:(gene,1),tf_targeting:(tf,1))
self.total_lioness_network = Parallel(n_jobs=self.n_cores)(
self.__par_lioness_loop(i, output) for i in (self.indexes)
)
elif self.computing == "gpu":
for i in self.indexes:
self.total_lioness_network = self.__lioness_loop(i)
self.total_lioness_network = self.total_lioness_network.T
# create result data frame
if self.ignore_final:
print('WARNING: we do not keep all lionesses in memory. They have been saved singularly.')
else:
if output == "network":
if isinstance(origmotif, pd.DataFrame):
# get row of all TFs
total_tfs = tf_names * len(gene_names)
# get row of all genes
total_genes = [i for i in gene_names for _ in range(len(tf_names))]
# first dataframe is made of tf and gene names
indDF = pd.DataFrame([total_tfs, total_genes], index=["tf", "gene"])
# concatenate with dataframe of data, rows are samples, columns the edges
indDF = pd.concat([indDF, pd.DataFrame(self.total_lioness_network, index = self.expression_samples[self.indexes])], axis = 0).T
# TODO: remove this with next release
#indDF = indDF.append(
# pd.DataFrame(self.total_lioness_network, index = self.expression_samples[self.indexes])
#).transpose()
else: # if equal to None to be specific
total_genes1 = gene_names * len(gene_names)
total_genes2 = [i for i in gene_names for _ in range(len(gene_names))]
indDF = pd.DataFrame(
[total_genes1, total_genes2], index=["gene1", "gene2"]
)
indDF = pd.concat([indDF, pd.DataFrame(self.total_lioness_network, index = self.expression_samples[self.indexes])], axis = 0).T
# TODO: remove this with next release
#indDF = indDF.append(
# pd.DataFrame(self.total_lioness_network, index = self.expression_samples[self.indexes])
#).transpose()
# keep the df as the export results
self.export_lioness_results = indDF
del indDF
elif output == "gene_targeting":
self.export_lioness_results = pd.DataFrame(
self.total_lioness_network, columns=gene_names, index =self.expression_samples[self.indexes]
).transpose()
elif output == "tf_targeting":
self.export_lioness_results = pd.DataFrame(
self.total_lioness_network, columns=tf_names, index = self.expression_samples[self.indexes]
).transpose()
# if export filename is passed, the full lioness table is saved
if export_filename:
self.export_lioness_table(output_filename = export_filename)
else:
self.save_lioness_results()
def __lioness_loop(self, i):
#TODO: this is now for GPU only in practice
""" Initialize instance of Lioness class and load data.
Returns
--------
self.total_lioness_network: array
An edge-by-sample matrix containing sample-specific networks.
"""
# for i in self.indexes:
print("Running LIONESS for sample %d/%d:" %((i),(self.n_conditions)))
idx = [x for x in range(self.n_conditions) if x != i] # all samples except i
with Timer("Computing coexpression network:"):
if self.computing == "gpu":
import cupy as cp
correlation_matrix_cp = cp.corrcoef(self.expression_matrix[:, idx].astype(self.np_dtype)).astype(self.np_dtype)
if cp.isnan(correlation_matrix_cp).any():
cp.fill_diagonal(correlation_matrix_cp, 1)
correlation_matrix_cp = cp.nan_to_num(correlation_matrix_cp)
correlation_matrix = cp.asnumpy(correlation_matrix_cp)
del correlation_matrix_cp
cp._default_memory_pool.free_all_blocks()
else:
correlation_matrix = np.corrcoef(self.expression_matrix[:, idx])
if np.isnan(correlation_matrix).any():
np.fill_diagonal(correlation_matrix, 1)
correlation_matrix = np.nan_to_num(correlation_matrix)
with Timer("Normalizing networks:"):
correlation_matrix_orig = (
correlation_matrix # save matrix before normalization
)
correlation_matrix = self._normalize_network(correlation_matrix)
with Timer("Inferring LIONESS network:"):
if self.motif_matrix is not None:
del correlation_matrix_orig
subset_panda_network = compute_panda(
correlation_matrix,
np.copy(self.ppi_matrix),
np.copy(self.motif_matrix),
computing = self.computing,
alpha = self.alpha,
)
else:
del correlation_matrix
subset_panda_network = correlation_matrix_orig
# For consistency with R, we are using the N panda_all - (N-1) panda_all_but_q
lioness_network = (self.n_conditions * self.network) - (
(self.n_conditions - 1) * subset_panda_network
)
# old
#lioness_network = self.n_conditions * (self.network - subset_panda_network) + subset_panda_network
if self.save_single:
with Timer(
"Saving LIONESS network %d (%s) to %s using %s format:"
% (i, self.expression_samples[i], self.save_dir, self.save_fmt)
):
path = os.path.join(self.save_dir, "lioness.%s.%s.%s" % (self.expression_samples[i],str(i),self.save_fmt))
self.__lioness_to_disk(lioness_network, path)
if self.ignore_final:
return(np.array([0]))
else:
if self.computing == "gpu" and i == 0:
self.total_lioness_network = np.fromstring(
np.transpose(lioness_network).tostring(), dtype=lioness_network.dtype
)[:,np.newaxis]
elif self.computing == "gpu" and i != 0:
self.total_lioness_network = np.column_stack(
(
self.total_lioness_network,
np.fromstring(
np.transpose(lioness_network).tostring(),
dtype=lioness_network.dtype,
),
)
)
return self.total_lioness_network
@delayed
@wrap_non_picklable_objects
def __par_lioness_loop(self, i, output):
""" Initialize instance of Lioness class and load data.
Returns
---------
self.total_lioness_network: array
An edge-by-sample matrix containing sample-specific networks.
"""
# for i in self.indexes:
print("Running LIONESS for sample %d:" % (i + 1))
idx = [x for x in range(self.n_conditions) if x != i] # all samples except i
with Timer("Computing coexpression network:"):
if self.computing == "gpu":
import cupy as cp
correlation_matrix = cp.corrcoef(self.expression_matrix[:, idx])
if cp.isnan(correlation_matrix).any():
cp.fill_diagonal(correlation_matrix, 1)
correlation_matrix = cp.nan_to_num(correlation_matrix)
correlation_matrix = cp.asnumpy(correlation_matrix)
else:
# run on CPU with numpy
correlation_matrix = np.corrcoef(self.expression_matrix[:, idx])
if np.isnan(correlation_matrix).any():
np.fill_diagonal(correlation_matrix, 1)
correlation_matrix = np.nan_to_num(correlation_matrix)
with Timer("Normalizing networks:"):
correlation_matrix_orig = (
correlation_matrix # save matrix before normalization
)
correlation_matrix = self._normalize_network(correlation_matrix)
with Timer("Inferring LIONESS network:"):
# TODO: fix this correlation matrix+delete
if self.motif_matrix is not None:
del correlation_matrix_orig
subset_panda_network = compute_panda(
correlation_matrix,
np.copy(self.ppi_matrix),
np.copy(self.motif_matrix),
computing = self.computing,
alpha = self.alpha,
)
else:
del correlation_matrix
subset_panda_network = correlation_matrix_orig
# For consistency with R, we are using the N panda_all - (N-1) panda_all_but_q
#lioness_network = self.n_conditions * (self.network - subset_panda_network) + subset_panda_network
lioness_network = (self.n_conditions * self.network) - (
(self.n_conditions - 1) * subset_panda_network
)
# the lioness network here is a TFxGENE np array
# if save_single flag is passed, save each single lioness sample
if self.save_single:
# TODO: here we need to decide whether to add the tf and gene name
with Timer(
"Saving LIONESS network %d (%s) to %s using %s format:"
% (i, self.expression_samples[i], self.save_dir, self.save_fmt)
):
path = os.path.join(self.save_dir, "lioness.%s.%s.%s" % (self.expression_samples[i],str(i),self.save_fmt))
print(path)
self.__lioness_to_disk(lioness_network, path)
# TODO: why this? Should we remove it?
# if i == 0:
# self.total_lioness_network = np.fromstring(np.transpose(lioness_network).tostring(),dtype=lioness_network.dtype)
# else:
# self.total_lioness_network=np.column_stack((self.total_lioness_network ,np.fromstring(np.transpose(lioness_network).tostring(),dtype=lioness_network.dtype)))
if self.ignore_final:
return(np.array([0]))
else:
if output == "network":
self.total_lioness_network = np.transpose(lioness_network).flatten()
elif output == "gene_targeting":
self.total_lioness_network = np.sum(lioness_network, axis=0)
elif output == "tf_targeting":
self.total_lioness_network = np.sum(lioness_network, axis=1)
return self.total_lioness_network
[docs]
def save_lioness_results(self, lioness_output_filename = None):
""" Saves LIONESS network.
Uses self.save_fmt, self.save_dir to save the data
into self.total_lioness_network
"""
# self.lioness_network.to_csv(file, index=False, header=False, sep="\t")
if lioness_output_filename:
fullpath = lioness_output_filename
else:
fullpath = os.path.join(self.save_dir, "lioness.%s" % (self.save_fmt))
if fullpath.endswith("txt"):
np.savetxt(
fullpath,
np.transpose(self.total_lioness_network),
delimiter="\t",
)
elif fullpath.endswith("csv"):
np.savetxt(
fullpath,
np.transpose(self.total_lioness_network),
delimiter=",",
)
elif fullpath.endswith("npy"):
np.save(fullpath, np.transpose(self.total_lioness_network))
elif fullpath.endswith("h5"):
print('Warning: saving as txt for now. Need to rewrite this')
np.savetxt(fullpath,
np.transpose(self.total_lioness_network),
delimiter="\t",
)
elif fullpath.endswith("mat"):
from scipy.io import savemat
mdic = {"results": np.transpose(self.total_lioness_network), "label": "lioness"}
savemat(fullpath, mdic)
else:
print('Trying to save lioness output. Format %s not recognised' %str(fullpath))
return None
def __lioness_to_disk(self, net, path):
if self.save_fmt == "txt":
np.savetxt(path, net)
elif self.save_fmt == "h5":
pd.DataFrame(data = net, columns=self.gene_names, index = self.tf_names).to_hdf(path, key = 'lioness' ,mode='w')
elif self.save_fmt == 'csv':
pd.DataFrame(data = net, columns=self.gene_names, index = self.tf_names).to_csv(path)
elif self.save_fmt == "npy":
np.save(path, net)
elif self.save_fmt == "mat":
from scipy.io import savemat
savemat(path, {"PredNet": net})
else:
print("Unknown format %s! Use npy format instead." % self.save_fmt)
np.save(path, net)
[docs]
def export_lioness_table(self, output_filename="lioness_table.txt", header=False, output = 'network'):
"""
Saves LIONESS network with edge names. This saves a dataframe with the corresponding
header and indexes.
Parameters
------------
output_filename: str
Path to save the network. Specify relative path
and format. Choose between .csv, .tsv and .txt.
(Defaults to .lioness_table.txt))
"""
# TODO: add case where there is tf_targeting or gene_targeting
if (output=='network'):
# we first get the names of first two columns (tf,gene) or (gene1,gene2)
sort_cols = self.export_lioness_results.columns.tolist()[:2]
if output_filename.endswith("txt"):
self.export_lioness_results.sort_values(by=sort_cols).to_csv(output_filename, sep=" ", index = False)
elif output_filename.endswith("csv"):
self.export_lioness_results.sort_values(by=sort_cols).to_csv(output_filename, sep=",", index = False)
elif output_filename.endswith("tsv"):
self.export_lioness_results.sort_values(by=sort_cols).to_csv(output_filename, sep="\t", index = False)
else:
sys.exit('Export output unknown: use txt/csv/tsv')
else:
# otherwise we only need one column and we sort by index
if output_filename.endswith("txt"):
self.export_lioness_results.sort_index().to_csv(output_filename, sep=" ")
elif output_filename.endswith("csv"):
self.export_lioness_results.sort_index().to_csv(output_filename, sep=",")
elif output_filename.endswith("tsv"):
self.export_lioness_results.sort_index().to_csv(output_filename, sep="\t")
else:
sys.exit('Export output unknown: use txt/csv/tsv')