from __future__ import print_function
import math
import time
import pandas as pd
from .timer import Timer
import numpy as np
from netZooPy.cobra import cobra
from netZooPy.panda import calculations as calc
import os
[docs]
class Panda(object):
"""
Using PANDA to infer gene regulatory network.
1. Reading in input data (expression data, motif prior, TF PPI data)
2. Computing coexpression network
3. Normalizing networks
4. Running PANDA algorithm
5. Writing out PANDA network
Parameters
----------
expression_file : str
Either i) a string of a path to file containing the gene expression data or ii) a pandas dataframe. By default, the expression file does not have a header, and the cells ares separated by a tab.
motif_file : str
Either i) a string of a path to file containing the transcription factor DNA binding motif data in the form of
TF-gene-weight(0/1) as a tab-separated file without a header or ii) a pandas dataframe.
If set to none, the gene coexpression matrix is returned as a result network.
ppi_file : str
Either i) a path to file containing the PPI data or a ii) pandas dataframe.
The PPI has to reflect an undirected network (A - B), if not, it will be transformed into an undirected network by building a symmetrical adjacency matrix (A -> B, B -> A).
computing : str
'cpu' uses Central Processing Unit (CPU) to run PANDA.
'gpu' use the Graphical Processing Unit (GPU) to run PANDA.
precision : str
- 'double' computes the regulatory network in double precision (15 decimal digits).
- 'single' computes the regulatory network in single precision (7 decimal digits) which is fastaer, requires half the memory but less accurate.
save_memory : bool
- True : removes temporary results from memory. The result network is weighted adjacency matrix of size
(nTFs, nGenes).
- False: keeps the temporary files in memory. The result network has 4 columns in the form gene - TF - weight in motif prior - PANDA edge.
save_tmp : bool
Save temporary variables.
remove_missing : bool
Removes the gens and TFs that are not present in one of the priors. Works only if modeProcess='legacy'.
keep_expression_matrix : bool
Keeps the input expression matrix in the result Panda object.
modeProcess : str
The input data processing mode.
- 'legacy': refers to the processing mode in netZooPy<=0.5
- (Default)'union': takes the union of all TFs and genes across priors and fills the missing genes in the priors with zeros.
- 'intersection': intersects the input genes and TFs across priors and removes the missing TFs/genes.
alpha : str
Learning rate (default: 0.1)
start : int
First sample of the expression dataset. This replicates the behavior of Lioness (default : 1)
end : int
Last sample of the expression dataset. This replicates the behavior of Lioness (default : None )
cobra_design_matrix : np.ndarray, pd.DataFrame
COBRA design matrix of size (n, q), n = number of samples, q = number of covariates
cobra_covariate_to_keep : int
Zero-indedex base of COBRA co-expression component to use
Examples
--------
Note these examples use a small toy data that may not reflect an actual use case. To use actual gene expression, motif, and PPI data, please refer to [GRAND](https://grand.networkmedicine.org/) database.
>>> #Import the classes in the pypanda library:
>>> from netZooPy.panda.panda import Panda
>>> #Run the Panda algorithm, leave out motif and PPI data to use Pearson correlation network:
>>> panda_obj = Panda('../../tests/ToyData/ToyExpressionData.txt', '../../tests/ToyData/ToyMotifData.txt', '../../tests/ToyData/ToyPPIData.txt', remove_missing=False)
>>> #Save the results:
>>> panda_obj.save_panda_results('Toy_Panda.pairs.txt')
>>> #Return a network plot:
>>> panda_obj.top_network_plot(top=70, file='top_genes.png')
>>> #Calculate in- and outdegrees for further analysis:
>>> indegree = panda_obj.return_panda_indegree()
>>> outdegree = panda_obj.return_panda_outdegree()
Notes
------
Toy data: The example gene expression data that we have available here contains gene expression profiles
for different samples in the columns. Of note, this is just a small subset of a larger gene
expression dataset. We provided these "toy" data so that the user can test the method.
Gene naming nomeclature: Gene names have to be consistent between gene expresssion and motif columns; and TF PPI matrix and motif rows.
For example, gene expression and motif columns can be in Ensembl gene IDs (ENSG), and TF PPI and motif rows can be in HUGO gene symbols.
Sample PANDA results:\b
- TF Gene Motif Force\n
- CEBPA AACSL 0.0 -0.951416589143\n
- CREB1 AACSL 0.0 -0.904241609324\n
- DDIT3 AACSL 0.0 -0.956471642313\n
- E2F1 AACSL 1.0 3.685316051\n
- EGR1 AACSL 0.0 -0.695698519643
References
----------
.. [1]__ Glass, Kimberly, et al. "Passing messages between biological networks to refine predicted interactions."
PloS one 8.5 (2013): e64832.
Authors: Cho-Yi Chen, David Vi, Alessandro Marin, Marouen Ben Guebila, Daniel Morgan
"""
def __init__(
self,
expression_file,
motif_file,
ppi_file,
computing="cpu",
precision="double",
save_memory=True,
save_tmp=False,
remove_missing=False,
keep_expression_matrix=False,
modeProcess="union",
alpha=0.1,
start=1,
end=None,
with_header=False,
cobra_design_matrix = None,
cobra_covariate_to_keep = 0
):
""" Intialize instance of Panda class and load data.
"""
# Read data
self.processData(
modeProcess,
motif_file,
expression_file,
ppi_file,
remove_missing,
keep_expression_matrix,
start=start,
end=end,
with_header=with_header,
cobra_design_matrix=cobra_design_matrix,
cobra_covariate_to_keep=cobra_covariate_to_keep
)
print(modeProcess,motif_file,expression_file,ppi_file,save_memory,remove_missing,keep_expression_matrix)
if hasattr(self, "export_panda_results"):
return
# =====================================================================
# Network normalization
# =====================================================================
self.precision = precision
with Timer("Normalizing networks ..."):
self.correlation_matrix = self._normalize_network(self.correlation_matrix)
with np.errstate(invalid="ignore"): # silly warning bothering people
self.motif_matrix = self._normalize_network(
self.motif_matrix_unnormalized
)
self.ppi_matrix = self._normalize_network(self.ppi_matrix)
if self.precision == "single":
self.correlation_matrix = np.float32(self.correlation_matrix)
self.motif_matrix = np.float32(self.motif_matrix)
self.ppi_matrix = np.float32(self.ppi_matrix)
# =====================================================================
# Clean up useless variables to release memory
# =====================================================================
self.tfs, self.genes = self.unique_tfs, self.gene_names
if (save_memory==True):
print("Clearing motif and ppi data, unique tfs, and gene names for speed")
del self.unique_tfs, self.gene_names, self.motif_matrix_unnormalized
print("WARNING: save_memory will be uncoupled from the output behavior.\n Pass as_adjacency to save the output panda as adjacency matrix")
# =====================================================================
# Saving middle data to tmp
# =====================================================================
if save_tmp:
with Timer("Saving expression matrix and normalized networks ..."):
os.makedirs('./tmp',exist_ok=True)
if self.expression_data is not None:
np.save("/tmp/expression.npy", self.expression_data.values)
np.save("/tmp/motif.normalized.npy", self.motif_matrix)
np.save("/tmp/ppi.normalized.npy", self.ppi_matrix)
# delete expression data
del self.expression_data
# =====================================================================
# Running PANDA algorithm
# =====================================================================
if self.motif_data is not None:
print("Running PANDA algorithm ...")
self.panda_network = self.panda_loop(
self.correlation_matrix,
self.motif_matrix,
self.ppi_matrix,
computing,
alpha,
)
# label dataframe
self.panda_network = pd.DataFrame(
self.panda_network, index=self.tfs, columns=self.genes
)
else:
self.panda_network = self.correlation_matrix
self.__pearson_results_data_frame()
# label dataframe
self.panda_network = pd.DataFrame(
self.panda_network, index=self.genes, columns=self.genes
)
def __remove_missing(self):
""" Removes the genes and TFs that are not present in one of the priors. Works only if modeProcess='legacy'.
"""
if self.expression_data is not None:
print("Remove expression not in motif:")
motif_unique_genes = set(self.motif_data[1])
len_tot = len(self.expression_data)
self.expression_data = self.expression_data[
self.expression_data.index.isin(motif_unique_genes)
]
self.gene_names = self.expression_data.index.tolist()
self.num_genes = len(self.gene_names)
print(
" {} rows removed from the initial {}".format(
len_tot - self.num_genes, len_tot
)
)
# if self.motif_data is not None:
print("Remove motif not in expression data:")
len_tot = len(self.motif_data)
self.motif_data = self.motif_data[
self.motif_data.iloc[:, 1].isin(self.gene_names)
]
self.unique_tfs = sorted(set(self.motif_data[0]))
self.num_tfs = len(self.unique_tfs)
print(
" {} rows removed from the initial {}".format(
len_tot - len(self.motif_data), len_tot
)
)
if self.ppi_data is not None:
print("Remove ppi not in motif:")
motif_unique_tfs = np.unique(self.motif_data.iloc[:, 0])
len_tot = len(self.ppi_data)
self.ppi_data = self.ppi_data[
self.ppi_data.iloc[:, 0].isin(motif_unique_tfs)
]
self.ppi_data = self.ppi_data[
self.ppi_data.iloc[:, 1].isin(motif_unique_tfs)
]
print(
" {} rows removed from the initial {}".format(
len_tot - len(self.ppi_data), len_tot
)
)
return None
[docs]
def _normalize_network(self, x):
"""Standardizes the input data matrices.
Parameters
----------
x : array
Input adjacency matrix.
Returns
-------
normalized_matrix: array
Standardized adjacency matrix.
"""
normalized_matrix = calc.normalize_network(x)
return normalized_matrix
[docs]
def processData(
self,
modeProcess,
motif_file,
expression_file,
ppi_file,
remove_missing,
keep_expression_matrix,
start=1,
end=None,
with_header = False,
cobra_design_matrix=None,
cobra_covariate_to_keep=0
):
""" Processes data files into data matrices.
Parameters
----------
expression_file : str
Path to file containing the gene expression data or pandas dataframe.
By default, the expression file does not have a header, and the cells ares separated by a tab.
Pass `with_header=True` if the expression data includes the sample names
motif_file : str
Path to file containing the transcription factor DNA binding motif data in the form of
TF-gene-weight(0/1) or pandas dataframe.
If set to none, the gene coexpression matrix is returned as a result network.
ppi_file : str
Path to file containing the PPI data. or pandas dataframe.
The PPI can be symmetrical, if not, it will be transformed into a symmetrical adjacency matrix.
remove_missing : bool
Removes the gens and TFs that are not present in one of the priors. Works only if modeProcess='legacy'.
keep_expression_matrix : bool
Keeps the input expression matrix in the result Panda object.
modeProcess : str
The input data processing mode.
- 'legacy': refers to the processing mode in netZooPy<=0.5
- (Default)'union': takes the union of all TFs and genes across priors and fills the missing genes in the priors with zeros.
- 'intersection': intersects the input genes and TFs across priors and removes the missing TFs/genes.
with_header: bool
pass True when the expression file has a header with the sample names
"""
# if modeProcess=="legacy":
# =====================================================================
# Data loading
# =====================================================================
### Loading Motif
if type(motif_file) is str:
# If motif_file is a filename
with Timer("Loading motif data ..."):
self.motif_data = pd.read_csv(motif_file, sep="\t", header=None)
self.motif_tfs = sorted(set(self.motif_data[0]))
self.motif_genes = sorted(set(self.motif_data[1]))
# self.num_tfs = len(self.unique_tfs)
# print('Unique TFs:', self.num_tfs)
elif type(motif_file) is not str:
# If motif_file is an object
if motif_file is None:
# Computation without motif
self.motif_data = None
self.motif_genes = []
self.motif_tfs = []
else:
# If motif_file is an object, it needs to be a dataframe
if not isinstance(motif_file, pd.DataFrame):
raise Exception(
"Please provide a pandas dataframe for motif data with column names as 'source', 'target', and 'weight'."
)
if ("source" not in motif_file.columns) or (
"target" not in motif_file.columns
):
print('renaming motif columns to "source", "target" and "weight" ')
motif_file.columns = ["source", "target", "weight"]
self.motif_data = pd.DataFrame(motif_file.values)
self.motif_tfs = sorted(set(motif_file["source"]))
self.motif_genes = sorted(set(motif_file["target"]))
# self.num_tfs = len(self.unique_tfs)
# print('Unique TFs:', self.num_tfs)
### Loading expression
if type(expression_file) is str:
# If we pass an expression file, check if we have a 'with header' flag and read it
with Timer("Loading expression data ..."):
if with_header:
# Read data with header
self.expression_data = pd.read_csv(
expression_file, sep="\t", index_col=0
)
else:
self.expression_data = pd.read_csv(
expression_file, sep="\t", header=None, index_col=0
)
# assign expression data and samples/gene names
self.expression_data = self.expression_data.iloc[:, (start-1):end]
self.expression_genes = self.expression_data.index.tolist()
self.expression_samples = self.expression_data.columns.astype(str)
# self.num_genes = len(self.gene_names)
# print('Expression matrix:', self.expression_data.shape)
elif type(expression_file) is not str:
# Pass expression as a dataframe
if expression_file is not None:
if not isinstance(expression_file, pd.DataFrame):
raise Exception(
"Please provide a pandas dataframe for expression data."
)
self.expression_data = expression_file.iloc[:, (start-1):end] # pd.read_csv(expression_file, sep='\t', header=None, index_col=0)
self.expression_genes = self.expression_data.index.tolist()
self.expression_samples = self.expression_data.columns.astype(str)
# self.num_genes = len(self.gene_names)
# print('Expression matrix:', self.expression_data.shape)
else:
# If no expression is passed
self.gene_names = self.motif_genes
self.expression_genes = self.motif_genes
self.num_genes = len(self.gene_names)
self.expression_data = (
None # pd.DataFrame(np.identity(self.num_genes, dtype=int))
)
print(
# TODO: Marouen check here. Here we do not pass the identity matrix
"No Expression data given: correlation matrix will be an identity matrix of size",
len(self.motif_genes),
)
if len(self.expression_genes) != len(np.unique(self.expression_genes)):
print(
"Duplicate gene symbols detected. Consider averaging before running PANDA"
)
### Loading the PPI
if type(ppi_file) is str:
with Timer("Loading PPI data ..."):
self.ppi_data = pd.read_csv(ppi_file, sep="\t", header=None)
self.ppi_tfs = sorted(
set(pd.concat([self.ppi_data[0], self.ppi_data[1]]))
)
print("Number of PPIs:", self.ppi_data.shape[0])
elif type(ppi_file) is not str:
if ppi_file is not None:
if not isinstance(ppi_file, pd.DataFrame):
raise Exception("Please provide a pandas dataframe for PPI data.")
self.ppi_data = ppi_file # pd.read_csv(ppi_file, sep='\t', header=None)
self.ppi_tfs = sorted(
set(pd.concat([self.ppi_data[0], self.ppi_data[1]]))
)
print("Number of PPIs:", self.ppi_data.shape[0])
else:
# TODO: marouen, here we do not have an identiy matrix
print(
"No PPI data given: ppi matrix will be an identity matrix of size",
len(self.motif_tfs),
)
self.ppi_data = None
self.ppi_tfs = self.motif_tfs
### Data combination
if modeProcess == "legacy" and remove_missing and motif_file is not None:
self.__remove_missing()
print('new case')
if modeProcess == "legacy" and remove_missing==False:
if expression_file is not None:
self.gene_names = (
self.expression_genes
) # sorted( np.unique(self.motif_genes + self.expression_genes ))
if motif_file is None:
self.unique_tfs = self.ppi_tfs
else:
self.unique_tfs = (
self.motif_tfs
) # sorted( np.unique(self.ppi_tfs + self.motif_tfs ))
elif modeProcess == "union":
self.gene_names = sorted(
np.unique(self.motif_genes + self.expression_genes)
)
self.unique_tfs = sorted(np.unique(self.ppi_tfs + self.motif_tfs))
elif modeProcess == "intersection":
if motif_file is None:
self.gene_names = sorted(np.unique(self.expression_genes))
self.unique_tfs = sorted(np.unique(self.ppi_tfs))
else:
self.gene_names = sorted(
np.unique(
list(
set(self.motif_genes).intersection(
set(self.expression_genes)
)
)
)
)
self.unique_tfs = sorted(
np.unique(list(set(self.ppi_tfs).intersection(set(self.motif_tfs))))
)
self.num_genes = len(self.gene_names)
self.num_tfs = len(self.unique_tfs)
# Auxiliary dicts
gene2idx = {x: i for i, x in enumerate(self.gene_names)}
tf2idx = {x: i for i, x in enumerate(self.unique_tfs)}
if (
(modeProcess == "union" or modeProcess == "intersection")
and (self.expression_data is not None)
and (self.num_genes != 0)
):
# Initialize data & Populate gene expression
self.expression = np.zeros((self.num_genes, self.expression_data.shape[1]))
idx_geneEx = [gene2idx.get(x, np.nan) for x in self.expression_genes]
filtered_genes = [i for (i, v) in zip(self.expression_genes, idx_geneEx) if ~np.isnan(v)]
idx_geneEx = [x for x in idx_geneEx if str(x) != 'nan']
self.expression[idx_geneEx, :] = self.expression_data.loc[filtered_genes].values
self.expression_data = pd.DataFrame(data=self.expression)
# =====================================================================
# Network construction
# =====================================================================
with Timer("Calculating coexpression network ..."):
if self.expression_data is None:
self.correlation_matrix = np.identity(self.num_genes, dtype=int)
else:
if cobra_design_matrix is not None:
psi, Q, d, g = cobra(cobra_design_matrix, self.expression_data)
if cobra_covariate_to_keep < 0 or cobra_covariate_to_keep >= psi.shape[0]:
raise AttributeError(
"Invalid COBRA component! Valid COBRA components are in range " + str(0) + " - " + str(psi.shape[0] - 1)
)
self.correlation_matrix = Q.dot(np.diag(psi[cobra_covariate_to_keep,])).dot(Q.T)
else:
self.correlation_matrix = np.corrcoef(self.expression_data)
if np.isnan(self.correlation_matrix).any():
np.fill_diagonal(self.correlation_matrix, 1)
self.correlation_matrix = np.nan_to_num(self.correlation_matrix)
# Clean up useless variables to release memory
if keep_expression_matrix:
if self.expression_data is not None:
self.expression_matrix = self.expression_data.values
else:
self.expression_matrix = None
if self.motif_data is None:
print(
"Returning the correlation matrix of expression data in <Panda_obj>.correlation_matrix"
)
self.panda_network = self.correlation_matrix
self.export_panda_results = self.correlation_matrix
self.motif_matrix = self.motif_data
self.ppi_matrix = self.ppi_data
self.__pearson_results_data_frame()
self.panda_network = pd.DataFrame(
self.panda_network,
index=self.expression_genes,
columns=self.expression_genes,
)
return
with Timer("Creating motif network ..."):
self.motif_matrix_unnormalized = np.zeros((self.num_tfs, self.num_genes))
idx_tfs = [tf2idx.get(x, np.nan) for x in self.motif_data[0]]
idx_genes = [gene2idx.get(x, np.nan) for x in self.motif_data[1]]
commind1 = ~np.isnan(idx_tfs) & ~np.isnan(idx_genes)
idx_tfs = [i for (i, v) in zip(idx_tfs, commind1) if v]
idx_genes = [i for (i, v) in zip(idx_genes, commind1) if v]
if (len(idx_genes) == 0) or (len(idx_tfs) == 0):
raise Exception('Error when creating the motif network!'
' Typically this exception is raised if your'
' expression matrix genes and motif priors'
' not have any intersection.')
idx = np.ravel_multi_index(
(idx_tfs, idx_genes), self.motif_matrix_unnormalized.shape
)
self.motif_matrix_unnormalized.ravel()[idx] = self.motif_data[2][commind1]
if self.ppi_data is None:
self.ppi_matrix = np.identity(self.num_tfs, dtype=int)
else:
with Timer("Creating PPI network ..."):
self.ppi_matrix = np.identity(self.num_tfs)
idx_tf1 = [tf2idx.get(x, np.nan) for x in self.ppi_data[0]]
idx_tf2 = [tf2idx.get(x, np.nan) for x in self.ppi_data[1]]
commind2 = ~np.isnan(idx_tf1) & ~np.isnan(idx_tf2)
idx_tf1 = [i for (i, v) in zip(idx_tf1, commind2) if v]
idx_tf2 = [i for (i, v) in zip(idx_tf2, commind2) if v]
idx = np.ravel_multi_index((idx_tf1, idx_tf2), self.ppi_matrix.shape)
self.ppi_matrix.ravel()[idx] = self.ppi_data[2][commind2]
idx = np.ravel_multi_index((idx_tf2, idx_tf1), self.ppi_matrix.shape)
self.ppi_matrix.ravel()[idx] = self.ppi_data[2][commind2]
return
[docs]
def panda_loop(
self, correlation_matrix, motif_matrix, ppi_matrix, computing="cpu", alpha=0.1
):
""" The PANDA algorithm.
Parameters
----------
correlation_matrix: array
Input coexpression matrix.
motif_matrix : array
Input motif regulation prior network.
ppi_matrix : array
Input PPI matrix.
computing : str
'cpu' uses Central Processing Unit (CPU) to run PANDA.
'gpu' use the Graphical Processing Unit (GPU) to run PANDA.
"""
panda_loop_time = time.time()
# TODO:This should be using self.correlation. Keeping for retrocompatibility
motif_matrix = calc.compute_panda(
correlation_matrix,
ppi_matrix,
motif_matrix,
computing=computing,
alpha=alpha,
)
print("Running panda took: %.2f seconds!" % (time.time() - panda_loop_time))
# Ale: reintroducing the export_panda_results array if Panda called with save_memory=False
if hasattr(self, "unique_tfs"):
tfs = np.tile(self.unique_tfs, (len(self.gene_names), 1)).flatten()
genes = np.repeat(self.gene_names, self.num_tfs)
motif = self.motif_matrix_unnormalized.flatten(order="F")
force = motif_matrix.flatten(order="F")
self.export_panda_results = pd.DataFrame(
{"tf": tfs, "gene": genes, "motif": motif, "force": force}
)
# self.export_panda_results = np.column_stack((tfs,genes,motif,force))
return motif_matrix
def __pearson_results_data_frame(self):
""" Saves PANDA network in edges format.
"""
genes_1 = np.tile(self.gene_names, (len(self.gene_names), 1)).flatten()
genes_2 = (
np.tile(self.gene_names, (len(self.gene_names), 1)).transpose().flatten()
)
self.flat_panda_network = self.panda_network.transpose().flatten()
self.export_panda_results = pd.DataFrame(
{"tf": genes_1, "gene": genes_2, "force": self.flat_panda_network}
)
self.export_panda_results = self.export_panda_results[["tf", "gene", "force"]]
return None
[docs]
def save_panda_results(self, path="panda.npy", save_adjacency=False, old_compatible=True ):
""" Saves PANDA network.
Parameters
----------
path: str
Path to save the network.
save_adjacency: bool
if True the output is an adjacency matrix and not the edge list
old_compatible: bool
if True saves the data as it was saved until netzoopy 0.9.11
"""
with Timer("Saving PANDA network to %s ..." % path):
# Because there are two modes of operation (save_memory), save to file will be different
# Export to file
if old_compatible:
if not hasattr(self, "unique_tfs"):
toexport = self.panda_network
else:
# save the network with names
toexport = self.export_panda_results
print('WARNING: saving without header will soon become obsolete. \n\
Use old_compatible=False to save the panda results with correct column naming')
if path.endswith(".txt"):
np.savetxt(path, toexport, fmt="%s", delimiter=" ")
elif path.endswith(".csv"):
np.savetxt(path, toexport, fmt="%s", delimiter=",")
elif path.endswith(".tsv"):
np.savetxt(path, toexport, fmt="%s", delimiter="\t")
else:
np.save(path, toexport)
else:
print('WARNING: panda is now saved with the column names. \nUse old_compatible=True to save the panda results as previous versions without header')
if not hasattr(self, "unique_tfs"):
toexport = self.panda_network
toexport = toexport.reset_index()
else:
# save the network with names
toexport = self.export_panda_results
#saving as adjacency matrix
if (save_adjacency):
toexport = pd.pivot_table(toexport, values = 'force', index = 'tf', columns='gene', dropna=False)
toexport = toexport.reset_index()
if path.endswith(".txt"):
toexport.to_csv(path, sep=" ", index=False)
elif path.endswith(".csv"):
toexport.to_csv(path, sep=",", index=False)
elif path.endswith(".tsv"):
toexport.to_csv(path, sep="\t", index=False)
else:
np.save(path, toexport)
[docs]
def top_network_plot(self, top=100, file="panda_top_100.png", plot_bipart=False):
""" Selects top genes.
Parameters
----------
top : int
Top number of genes to plot.
file : str
File to save the network plot.
plot_bipart: bool
Plot the network as a bipartite layout.
"""
if not hasattr(self, "export_panda_results"):
raise AttributeError(
"Panda object does not contain the export_panda_results attribute.\n"
+ "Run Panda with the flag save_memory=False"
)
# Ale TODO: work in numpy instead of pandas?
self.panda_results = pd.DataFrame(
self.export_panda_results, columns=["tf", "gene", "motif", "force"]
)
subset_panda_results = self.panda_results.sort_values(
by=["force"], ascending=False
)
subset_panda_results = subset_panda_results[
subset_panda_results.tf != subset_panda_results.gene
]
subset_panda_results = subset_panda_results[0:top]
self.__shape_plot_network(
subset_panda_results=subset_panda_results,
file=file,
plot_bipart=plot_bipart,
)
return None
def __shape_plot_network(
self, subset_panda_results, file="panda.png", plot_bipart=False
):
""" Creates plot.
Parameters
-----------
subset_panda_results : array
Reduced PANDA network to the top genes.
file : str
File to save the network plot.
plot_bipart: bool
Plot the network as a bipartite layout.
"""
# reshape data for networkx
unique_genes = list(
set(list(subset_panda_results["tf"]) + list(subset_panda_results["gene"]))
)
unique_genes = pd.DataFrame(unique_genes)
unique_genes.columns = ["name"]
unique_genes["index"] = unique_genes.index
subset_panda_results = subset_panda_results.merge(
unique_genes, how="inner", left_on="tf", right_on="name"
)
subset_panda_results = subset_panda_results.rename(
columns={"index": "tf_index"}
)
subset_panda_results = subset_panda_results.drop(["name"], 1)
subset_panda_results = subset_panda_results.merge(
unique_genes, how="inner", left_on="gene", right_on="name"
)
subset_panda_results = subset_panda_results.rename(
columns={"index": "gene_index"}
)
subset_panda_results = subset_panda_results.drop(["name"], 1)
links = subset_panda_results[["tf_index", "gene_index", "force"]]
self.__create_plot(
unique_genes=unique_genes, links=links, file=file, plot_bipart=plot_bipart
)
return None
def __create_plot(self, unique_genes, links, file="panda.png", plot_bipart=False):
""" Runs the plot.
Parameters
----------
unique_genes : list
Unique list of PANDA genes.
links : list
Edges of the subset PANDA network to the top genes.
file : str
File to save the network plot.
plot_bipart : bool
Plot the network as a bipartite layout.
Notes
-----
split_label: Splits the plot label over several lines for plotting purposes.
"""
import networkx as nx
import matplotlib.pyplot as plt
g = nx.Graph()
g.clear()
plt.clf()
# img = plt.imread("../img/panda.jpg")
# fig, ax = plt.subplots()
# ax.imshow(img, extent=[0, 400, 0, 300])
##ax.plot(x, x, '--', linewidth=5, color='firebrick')
g.add_nodes_from(unique_genes["index"])
edges = []
for i in range(0, len(links)):
edges = edges + [
(
links.iloc[i]["tf_index"],
links.iloc[i]["gene_index"],
float(links.iloc[i]["force"]) / 200,
)
]
g.add_weighted_edges_from(edges)
labels = {}
def split_label(label):
""" Splits the plot label over several lines for plotting purposes.
Parameters
----------
label: Input label text.
Returns:
label: _
Output label text divided over several lines.
"""
ll = len(label)
if ll > 6:
return (
label[0 : int(np.ceil(ll / 2))]
+ "\n"
+ label[int(np.ceil(ll / 2)) :]
)
return label
for i, l in enumerate(unique_genes.iloc[:, 0]):
labels[i] = split_label(l)
if not plot_bipart:
pos = nx.spring_layout(g)
else:
pos = nx.drawing.layout.bipartite_layout(g, set(links["tf_index"]))
# nx.draw_networkx(g, pos, labels=labels, node_size=40, font_size=3, alpha=0.3, linewidth = 0.5, width =0.5)
print(plot_bipart)
if not plot_bipart:
colors = range(len(edges))
else:
colors = list(zip(*edges))[-1]
options = {
"alpha": 0.7,
"edge_color": colors,
"edge_cmap": plt.cm.Blues,
"node_size": 110,
"vmin": -100,
"width": 2,
"labels": labels,
"font_weight": "regular",
"font_size": 3,
"linewidths": 20,
}
nx.draw_networkx(g, pos=pos, **options)
plt.axis("off")
plt.savefig(file, dpi=300)
return None
[docs]
def return_panda_indegree(self):
""" Computes indegree of PANDA network, only if save_memory = False.
"""
export_panda_results_pd = pd.DataFrame(
self.export_panda_results, columns=["tf", "gene", "motif", "force"]
)
subset_indegree = export_panda_results_pd.loc[:, ["gene", "force"]]
self.panda_indegree = subset_indegree.groupby("gene").sum()
return self.panda_indegree
[docs]
def return_panda_outdegree(self):
""" computes outdegree of PANDA network, only if save_memory = False.
"""
export_panda_results_pd = pd.DataFrame(
self.export_panda_results, columns=["tf", "gene", "motif", "force"]
)
subset_outdegree = export_panda_results_pd.loc[:, ["tf", "force"]]
self.panda_outdegree = subset_outdegree.groupby("tf").sum()
return self.panda_outdegree