Source code for netZooPy.puma.puma

from __future__ import print_function

import math
import time
import pandas as pd
import numpy as np
from .timer import Timer
from netZooPy.panda.panda import Panda
import netZooPy.panda.calculations as calc
from netZooPy.puma.calculations import compute_puma
import os

[docs] class Puma(object): """ Using PUMA to infer gene regulatory network. 1. Reading in input data (expression data, motif prior, TF PPI data, miR) 2. Computing coexpression network 3. Normalizing networks 4. Running PUMA algorithm 5. Writing out PUMA network Parameters ------------ expression_file : str Path to file containing the gene expression data or pandas dataframe. By default, the expression file does not have a header, and the cells ares separated by a tab. motif_file : str Path to file containing the regulation prior as a tab-separated file without a header. This can be a miRNA-Gene predicted network from TargetScan/miRanda. However, this can be combined with transcription factor DNA binding motif data in the form of TF-gene-weight(0/1) to estimate gene regulation by TF and miRNA. Alternatively, can be a dataframe with the motif network If set to none, the gene coexpression matrix is returned as a result network. ppi_file : str Path to file containing the TF PPI data, or pandas dataframe. This can be provided as 'None' if no TF data is given and PUMA will estimate a miRNA-Gene networks. mir_file : str Path to file containing miRNA list or a list. A standard mir_file can be read as: >>> with open(mir_file, "r") as f: >>> miR = f.read().splitlines() computing : str - 'cpu' uses Central Processing Unit (CPU) to run PANDA. - 'gpu' use the Graphical Processing Unit (GPU) to run PANDA. precision : str 'double' computes the regulatory network in double precision (15 decimal digits). 'single' computes the regulatory network in single precision (7 decimal digits) which is fastaer, requires half the memory but less accurate. save_memory : bool True : removes temporary results from memory. The result network is weighted adjacency matrix of size (nTFs, nGenes). False: keeps the temporary files in memory. The result network has 4 columns in the form gene - TF - weight in motif prior - PUMA edge. save_tmp : bool Save temporary variables. remove_missing : bool Removes the gens and TFs that are not present in one of the priors. Works only if modeProcess='legacy'. keep_expression_matrix: bool Keeps the input expression matrix in the result Puma object. modeProcess : str The input data processing mode. - 'legacy': refers to the processing mode in netZooPy<=0.5 - (Default)'union': takes the union of all TFs and genes across priors and fills the missing genes in the priors with zeros. - 'intersection': intersects the input genes and TFs across priors and removes the missing TFs/genes. alpha : float Learning rate (default: 0.1) start : int first sample of expression (default 1) end : int last sample of expression (default None) Examples -------- Run the PUMA algorithm, leave out motif and PPI data to use Pearson correlation network: from netZooPy.puma.puma import Puma puma_obj = Puma('../../tests/ToyData/ToyExpressionData.txt', '../../tests/ToyData/ToyMotifData.txt', '../../tests/ToyData/ToyPPIData.txt','../../tests/ToyData/ToyMiRList.txt') References ---------- ..[1]__ Kuijjer, Marieke L., et al. "PUMA: PANDA Using MicroRNA Associations." OUP Bioinformatics (2019). Authors:cychen, davidvi, alessandromarin """ def __init__( self, expression_file, motif_file, ppi_file, mir_file, modeProcess="union", computing="cpu", precision="double", save_memory=False, save_tmp=False, remove_missing=False, keep_expression_matrix=False, alpha=0.1, start=1, end=None, df_correlation_matrix = None ): """ Intialize instance of Puma class and load data. """ # ===================================================================== # Data loading # ===================================================================== Panda.processData( self, modeProcess, motif_file, expression_file, ppi_file, remove_missing, keep_expression_matrix, start=start, end=end, ) with Timer("Loading miR data ..."): # If the mir_file is a string the mir list is read from file # otherwise the input list is used directly if type(mir_file) is str: with open(mir_file, "r") as f: miR = f.read().splitlines() elif isinstance(mir_file,list): miR = mir_file else: raise Exception( "For mir_file please provide either a file name or a list" ) TFNames = self.unique_tfs sort_idx = np.argsort(TFNames) self.s1 = sort_idx[np.searchsorted(TFNames, miR, sorter=sort_idx)] if remove_missing and motif_file is not None: self.__remove_missing() # ===================================================================== # Network construction # ===================================================================== with Timer("Calculating coexpression network ..."): if self.expression_data is None: if df_correlation_matrix is None: self.correlation_matrix = np.identity(self.num_genes, dtype=int) else: #if a gene is missing from the correlation matrix will have NaN in the matrix self.correlation_matrix = df_correlation_matrix.reindex(index=self.gene_names,columns = self.gene_names).values else: self.correlation_matrix = np.corrcoef(self.expression_data) if np.isnan(self.correlation_matrix).any(): np.fill_diagonal(self.correlation_matrix, 1) self.correlation_matrix = np.nan_to_num(self.correlation_matrix) if self.motif_data is None: print( "Returning the correlation matrix of expression data in <Puma_obj>.correlation_matrix" ) # self.puma_network = self.correlation_matrix self.__pearson_results_data_frame() return # Auxiliary dicts gene2idx = {x: i for i, x in enumerate(self.gene_names)} tf2idx = {x: i for i, x in enumerate(self.unique_tfs)} with Timer("Creating motif network ..."): self.motif_matrix_unnormalized = np.zeros((self.num_tfs, self.num_genes)) idx_tfs = [tf2idx[x] for x in self.motif_data[0]] idx_genes = [gene2idx[x] for x in self.motif_data[1]] idx = np.ravel_multi_index( (idx_tfs, idx_genes), self.motif_matrix_unnormalized.shape ) self.motif_matrix_unnormalized.ravel()[idx] = self.motif_data[2] if self.ppi_data is None: self.ppi_matrix = np.identity(self.num_tfs, dtype=int) else: with Timer("Creating PPI network ..."): self.ppi_matrix = np.identity(self.num_tfs) idx_tf1 = [tf2idx[x] for x in self.ppi_data[0]] idx_tf2 = [tf2idx[x] for x in self.ppi_data[1]] idx = np.ravel_multi_index((idx_tf1, idx_tf2), self.ppi_matrix.shape) self.ppi_matrix.ravel()[idx] = self.ppi_data[2] idx = np.ravel_multi_index((idx_tf2, idx_tf1), self.ppi_matrix.shape) self.ppi_matrix.ravel()[idx] = self.ppi_data[2] # ===================================================================== # Network normalization # ===================================================================== with Timer("Normalizing networks ..."): self.correlation_matrix = self._normalize_network(self.correlation_matrix) with np.errstate(invalid="ignore"): # silly warning bothering people self.motif_matrix = self._normalize_network( self.motif_matrix_unnormalized ) self.ppi_matrix = self._normalize_network(self.ppi_matrix) if precision == "single": self.correlation_matrix = np.float32(self.correlation_matrix) self.motif_matrix = np.float32(self.motif_matrix) self.ppi_matrix = np.float32(self.ppi_matrix) # ===================================================================== # Clean up useless variables to release memory # ===================================================================== if save_memory: print("Clearing motif and ppi data, unique tfs, and gene names for speed") del ( self.motif_data, self.ppi_data, self.unique_tfs, self.gene_names, self.motif_matrix_unnormalized, ) # ===================================================================== # Saving middle data to tmp # ===================================================================== if save_tmp: with Timer("Saving expression matrix and normalized networks ..."): os.makedirs('./tmp',exist_ok=True) if self.expression_data is not None: np.save("./tmp/expression.npy", self.expression_data.values) np.save("./tmp/motif.normalized.npy", self.motif_matrix) np.save("./tmp/ppi.normalized.npy", self.ppi_matrix) np.save("./tmp/correlation_matrix.npy", self.correlation_matrix) # Clean up useless variables to release memory if keep_expression_matrix: self.expression_matrix = self.expression_data.values del self.expression_data # ===================================================================== # Running PUMA algorithm # ===================================================================== print("Running PUMA algorithm ...") self.puma_network = self.puma_loop( self.correlation_matrix, self.motif_matrix, self.ppi_matrix, computing, alpha, ) def __remove_missing(self): """ Removes the gens and TFs that are not present in one of the priors. Works only if modeProcess='legacy'. """ if self.expression_data is not None: print("Remove expression not in motif:") motif_unique_genes = set(self.motif_data[1]) len_tot = len(self.expression_data) self.expression_data = self.expression_data[ self.expression_data.index.isin(motif_unique_genes) ] self.gene_names = self.expression_data.index.tolist() self.num_genes = len(self.gene_names) print( " {} rows removed from the initial {}".format( len_tot - self.num_genes, len_tot ) ) # if self.motif_data is not None: print("Remove motif not in expression data:") len_tot = len(self.motif_data) self.motif_data = self.motif_data[ self.motif_data.iloc[:, 1].isin(self.gene_names) ] self.unique_tfs = sorted(set(self.motif_data[0])) self.num_tfs = len(self.unique_tfs) print( " {} rows removed from the initial {}".format( len_tot - len(self.motif_data), len_tot ) ) if self.ppi_data is not None: print("Remove ppi not in motif:") motif_unique_tfs = np.unique(self.motif_data.iloc[:, 0]) len_tot = len(self.ppi_data) self.ppi_data = self.ppi_data[ self.ppi_data.iloc[:, 0].isin(motif_unique_tfs) ] self.ppi_data = self.ppi_data[ self.ppi_data.iloc[:, 1].isin(motif_unique_tfs) ] print( " {} rows removed from the initial {}".format( len_tot - len(self.ppi_data), len_tot ) ) return None
[docs] def _normalize_network(self, x): """ Standardizes the input data matrices. Parameters ---------- x : array Input adjacency matrix. Returns -------- normalized_matrix: array Standardized adjacency matrix. """ return calc.normalize_network(x)
[docs] def puma_loop( self, correlation_matrix, motif_matrix, ppi_matrix, computing="cpu", alpha=0.1 ): """ The PUMA algorithm. Parameters ------------ correlation_matrix: array Input coexpression matrix. motif_matrix : array Input motif regulation prior network. ppi_matrix : array Input PPI matrix. computing : str - 'cpu' uses Central Processing Unit (CPU) to run PANDA. - 'gpu' use the Graphical Processing Unit (GPU) to run PANDA. """ puma_loop_time = time.time() # TODO:This should be using self.correlation. Keeping for retrocompatibility motif_matrix = compute_puma( correlation_matrix, motif_matrix, ppi_matrix, self.s1, computing=computing, alpha=alpha, ) print("Running puma took: %.2f seconds!" % (time.time() - puma_loop_time)) self.puma_network = motif_matrix # Ale: reintroducing the export_puma_results array if Puma called with save_memory=False if hasattr(self, "unique_tfs"): tfs = np.tile(self.unique_tfs, (len(self.gene_names), 1)).flatten() genes = np.repeat(self.gene_names, self.num_tfs) motif = self.motif_matrix_unnormalized.flatten(order="F") force = motif_matrix.flatten(order="F") # TODO: should we keep formats consistent between Panda and Puma? # self.export_puma_results = pd.DataFrame({'tf':tfs, 'gene': genes,'motif': motif, 'force': force}) self.export_puma_results = np.column_stack((tfs, genes, motif, force)) return motif_matrix
def __pearson_results_data_frame(self): """ Saves PUMA network in edges format. """ genes_1 = np.tile(self.gene_names, (len(self.gene_names), 1)).flatten() genes_2 = ( np.tile(self.gene_names, (len(self.gene_names), 1)).transpose().flatten() ) self.flat_puma_network = self.puma_network.transpose().flatten() self.export_puma_results = pd.DataFrame( {"tf": genes_1, "gene": genes_2, "force": self.flat_puma_network} ) self.export_puma_results = self.export_puma_results[["tf", "gene", "force"]] return None
[docs] def save_puma_results(self, path="puma.npy"): """ Saves PUMA network. Parameters ------------- path: str Path to save the network. """ with Timer("Saving PUMA network to %s ..." % path): # Because there are two modes of operation (save_memory), save to file will be different if hasattr(self, "export_puma_results"): toexport = self.export_puma_results else: toexport = self.puma_network # Export to file if path.endswith(".txt"): np.savetxt(path, toexport, fmt="%s", delimiter=" ") elif path.endswith(".csv"): np.savetxt(path, toexport, fmt="%s", delimiter=",") elif path.endswith(".tsv"): np.savetxt(path, toexport, fmt="%s", delimiter="\t") else: np.save(path, toexport)
[docs] def top_network_plot(self, top=100, file="puma_top_100.png"): """ Selects top genes and plot network Parameters ---------- top : int Top number of genes to plot. file : str File to save the network plot. """ if not hasattr(self, "export_puma_results"): raise AttributeError( "Puma object does not contain the export_puma_results attribute.\n" + "Run Puma with the flag save_memory=False" ) # Ale TODO: work in numpy instead of pandas? self.puma_results = pd.DataFrame( self.export_puma_results, columns=["tf", "gene", "motif", "force"] ) subset_puma_results = self.puma_results.sort_values( by=["force"], ascending=False ) subset_puma_results = subset_puma_results[ subset_puma_results.tf != subset_puma_results.gene ] subset_puma_results = subset_puma_results[0:top] self.__shape_plot_network(subset_puma_results=subset_puma_results, file=file) return None
def __shape_plot_network(self, subset_puma_results, file="puma.png"): """ Creates plot. Paramters: subset_puma_results : array Reduced PUMA network to the top genes. file : str File to save the network plot. """ # reshape data for networkx unique_genes = list( set(list(subset_puma_results["tf"]) + list(subset_puma_results["gene"])) ) unique_genes = pd.DataFrame(unique_genes) unique_genes.columns = ["name"] unique_genes["index"] = unique_genes.index subset_puma_results = subset_puma_results.merge( unique_genes, how="inner", left_on="tf", right_on="name" ) subset_puma_results = subset_puma_results.rename(columns={"index": "tf_index"}) subset_puma_results = subset_puma_results.drop(["name"], 1) subset_puma_results = subset_puma_results.merge( unique_genes, how="inner", left_on="gene", right_on="name" ) subset_puma_results = subset_puma_results.rename( columns={"index": "gene_index"} ) subset_puma_results = subset_puma_results.drop(["name"], 1) links = subset_puma_results[["tf_index", "gene_index", "force"]] self.__create_plot(unique_genes=unique_genes, links=links, file=file) return None def __create_plot(self, unique_genes, links, file="puma.png"): """ Runs the plot. Parameters ----------- unique_genes : list Unique list of PUMA genes. links : list Edges of the subset PUMA network to the top genes. file : str File to save the network plot. """ import networkx as nx import matplotlib.pyplot as plt g = nx.Graph() g.clear() plt.clf() g.add_nodes_from(unique_genes["index"]) edges = [] for i in range(0, len(links)): edges = edges + [ ( links.iloc[i]["tf_index"], links.iloc[i]["gene_index"], float(links.iloc[i]["force"]) / 200, ) ] g.add_weighted_edges_from(edges) labels = {} def split_label(label): """ Splits the plot label over several lines for plotting purposes. Parameters ---------- label: str Input label text. Returns ------- label: str Output label text divided over several lines. """ ll = len(label) if ll > 6: return label[0 : math.ceil(ll / 2)] + "\n" + label[math.ceil(ll / 2) :] return label for i, l in enumerate(unique_genes.iloc[:, 0]): labels[i] = split_label(l) pos = nx.spring_layout(g) # nx.draw_networkx(g, pos, labels=labels, node_size=40, font_size=3, alpha=0.3, linewidths = 0.5, width =0.5) colors = range(len(edges)) options = { "alpha": 0.7, "edge_color": colors, "edge_cmap": plt.cm.Blues, "node_size": 110, "vmin": -100, "width": 2, "labels": labels, "font_weight": "regular", "font_size": 3, "linewidths": 20, } nx.draw_spring(g, k=0.25, iterations=50, **options) plt.axis("off") plt.savefig(file, dpi=300) return None
[docs] def return_puma_indegree(self): """ Computes indegree of PUMA network, only if save_memory = False. """ # subset_indegree = self.export_puma_results.loc[:,['gene','force']] subset_indegree = self.puma_results.loc[:, ["gene", "force"]] self.puma_indegree = subset_indegree.groupby("gene").sum() return self.puma_indegree
[docs] def return_puma_outdegree(self): """ Computes outdegree of PUMA network, only if save_memory = False. """ export_puma_results_pd = pd.DataFrame( self.export_puma_results, columns=["tf", "gene", "motif", "force"] ) subset_outdegree = export_puma_results_pd.loc[:, ["tf", "force"]] self.puma_outdegree = subset_outdegree.groupby("tf").sum() return self.puma_outdegree