Source code for netZooPy.panda.panda

from __future__ import print_function
import math
import time
import pandas as pd
from .timer import Timer
import numpy as np
from netZooPy.cobra import cobra
from netZooPy.panda import calculations as calc
import os


[docs]
class Panda(object):
    """ 
    Using PANDA to infer gene regulatory network.
        1. Reading in input data (expression data, motif prior, TF PPI data)
        2. Computing coexpression network
        3. Normalizing networks
        4. Running PANDA algorithm
        5. Writing out PANDA network


    Parameters
    ----------

            expression_file : str
                Either i) a string of a path to file containing the gene expression data or ii) a pandas dataframe. By default, the expression file does not have a header, and the cells ares separated by a tab.
            motif_file : str 
                Either i) a string of a path to file containing the transcription factor DNA binding motif data in the form of
                TF-gene-weight(0/1) as a tab-separated file without a header or ii) a pandas dataframe.
                If set to none, the gene coexpression matrix is returned as a result network.
            ppi_file : str
                Either i) a path to file containing the PPI data or a ii) pandas dataframe. 
                The PPI has to  reflect an undirected network (A - B), if not, it will be transformed into an undirected network by building a  symmetrical adjacency matrix (A -> B, B -> A).
            computing : str
                'cpu' uses Central Processing Unit (CPU) to run PANDA.
                'gpu' use the Graphical Processing Unit (GPU) to run PANDA.
            precision : str
                - 'double' computes the regulatory network in double precision (15 decimal digits).
                - 'single' computes the regulatory network in single precision (7 decimal digits) which is fastaer, requires half the memory but less accurate.
            save_memory : bool
                - True : removes temporary results from memory. The result network is weighted adjacency matrix of size
                  (nTFs, nGenes).
                - False: keeps the temporary files in memory. The result network has 4 columns in the form gene - TF - weight in motif prior - PANDA edge.
            save_tmp : bool
                Save temporary variables.
            remove_missing : bool
                Removes the gens and TFs that are not present in one of the priors. Works only if modeProcess='legacy'.
            keep_expression_matrix : bool
                Keeps the input expression matrix in the result Panda object.
            modeProcess : str
                The input data processing mode.
                - 'legacy': refers to the processing mode in netZooPy<=0.5
                - (Default)'union': takes the union of all TFs and genes across priors and fills the missing genes in the priors with zeros.
                - 'intersection': intersects the input genes and TFs across priors and removes the missing TFs/genes.
            alpha : str
                Learning rate (default: 0.1)
            start : int
                First sample of the expression dataset. This replicates the behavior of Lioness (default : 1)
            end : int
                Last sample of the expression dataset. This replicates the behavior of Lioness (default : None )
            cobra_design_matrix  : np.ndarray, pd.DataFrame
                COBRA design matrix of size (n, q), n = number of samples, q = number of covariates
            cobra_covariate_to_keep : int
                Zero-indedex base of COBRA co-expression component to use

    Examples
    --------
	Note these examples use a small toy data that may not reflect an actual use case. To use actual gene expression, motif, and PPI data, please refer to [GRAND](https://grand.networkmedicine.org/) database.
        >>> #Import the classes in the pypanda library:  
        >>> from netZooPy.panda.panda import Panda
        >>> #Run the Panda algorithm, leave out motif and PPI data to use Pearson correlation network:
        >>> panda_obj = Panda('../../tests/ToyData/ToyExpressionData.txt', '../../tests/ToyData/ToyMotifData.txt', '../../tests/ToyData/ToyPPIData.txt', remove_missing=False)
        >>> #Save the results:
        >>> panda_obj.save_panda_results('Toy_Panda.pairs.txt')
        >>> #Return a network plot:
        >>> panda_obj.top_network_plot(top=70, file='top_genes.png')
        >>> #Calculate in- and outdegrees for further analysis:
        >>> indegree = panda_obj.return_panda_indegree()
        >>> outdegree = panda_obj.return_panda_outdegree()


    Notes
    ------

    Toy data: The example gene expression data that we have available here contains gene expression profiles 
    for different samples in the columns. Of note, this is just a small subset of a larger gene 
    expression dataset. We provided these "toy" data so that the user can test the method.

    Gene naming nomeclature: Gene names have to be consistent between gene expresssion and motif columns; and TF PPI matrix and motif rows.
    For example, gene expression and motif columns can be in Ensembl gene IDs (ENSG), and TF PPI and motif rows can be in HUGO gene symbols.


    Sample PANDA results:\b
        - TF    Gene    Motif   Force\n
        - CEBPA AACSL	0.0	-0.951416589143\n
        - CREB1 AACSL	0.0	-0.904241609324\n
        - DDIT3 AACSL	0.0	-0.956471642313\n
        - E2F1  AACSL	1.0	3.685316051\n
        - EGR1  AACSL	0.0	-0.695698519643

    References
    ----------
    .. [1]__ Glass, Kimberly, et al. "Passing messages between biological networks to refine predicted interactions." 
        PloS one 8.5 (2013): e64832.

    Authors: Cho-Yi Chen, David Vi, Alessandro Marin, Marouen Ben Guebila, Daniel Morgan
       
    """

    def __init__(
        self,
        expression_file,
        motif_file,
        ppi_file,
        computing="cpu",
        precision="double",
        save_memory=True,
        save_tmp=False,
        remove_missing=False,
        keep_expression_matrix=False,
        modeProcess="union",
        alpha=0.1,
        start=1,
        end=None,
        with_header=False, 
        cobra_design_matrix = None, 
        cobra_covariate_to_keep = 0
    ):
        """ Intialize instance of Panda class and load data.
        """
        # Read data
        
        self.processData(
            modeProcess,
            motif_file,
            expression_file,
            ppi_file,
            remove_missing,
            keep_expression_matrix,
            start=start,
            end=end,
            with_header=with_header, 
            cobra_design_matrix=cobra_design_matrix,
            cobra_covariate_to_keep=cobra_covariate_to_keep
        )
        print(modeProcess,motif_file,expression_file,ppi_file,save_memory,remove_missing,keep_expression_matrix)
        if hasattr(self, "export_panda_results"):
            return

        # =====================================================================
        # Network normalization
        # =====================================================================

        self.precision = precision
        
        with Timer("Normalizing networks ..."):
            self.correlation_matrix = self._normalize_network(self.correlation_matrix)
            with np.errstate(invalid="ignore"):  # silly warning bothering people
                self.motif_matrix = self._normalize_network(
                    self.motif_matrix_unnormalized
                )
            self.ppi_matrix = self._normalize_network(self.ppi_matrix)
            if self.precision == "single":
                self.correlation_matrix = np.float32(self.correlation_matrix)
                self.motif_matrix = np.float32(self.motif_matrix)
                self.ppi_matrix = np.float32(self.ppi_matrix)
        # =====================================================================
        # Clean up useless variables to release memory
        # =====================================================================
        self.tfs, self.genes = self.unique_tfs, self.gene_names

        if (save_memory==True):
            print("Clearing motif and ppi data, unique tfs, and gene names for speed")
            del self.unique_tfs, self.gene_names, self.motif_matrix_unnormalized
            print("WARNING: save_memory will be uncoupled from the output behavior.\n Pass as_adjacency to save the output panda as adjacency matrix")
        # =====================================================================
        # Saving middle data to tmp
        # =====================================================================
        if save_tmp:
            with Timer("Saving expression matrix and normalized networks ..."):
                os.makedirs('./tmp',exist_ok=True) 
                if self.expression_data is not None:
                    np.save("/tmp/expression.npy", self.expression_data.values)
                np.save("/tmp/motif.normalized.npy", self.motif_matrix)
                np.save("/tmp/ppi.normalized.npy", self.ppi_matrix)

        # delete expression data
        del self.expression_data

        # =====================================================================
        # Running PANDA algorithm
        # =====================================================================
        if self.motif_data is not None:
            print("Running PANDA algorithm ...")
            self.panda_network = self.panda_loop(
                self.correlation_matrix,
                self.motif_matrix,
                self.ppi_matrix,
                computing,
                alpha,
            )
            # label dataframe
            self.panda_network = pd.DataFrame(
                self.panda_network, index=self.tfs, columns=self.genes
            )
        else:
            self.panda_network = self.correlation_matrix
            self.__pearson_results_data_frame()
            # label dataframe
            self.panda_network = pd.DataFrame(
                self.panda_network, index=self.genes, columns=self.genes
            )

    def __remove_missing(self):
        """ Removes the genes and TFs that are not present in one of the priors. Works only if modeProcess='legacy'.
        """
        if self.expression_data is not None:
            print("Remove expression not in motif:")
            motif_unique_genes = set(self.motif_data[1])
            len_tot = len(self.expression_data)
            self.expression_data = self.expression_data[
                self.expression_data.index.isin(motif_unique_genes)
            ]
            self.gene_names = self.expression_data.index.tolist()
            self.num_genes = len(self.gene_names)
            print(
                "   {} rows removed from the initial {}".format(
                    len_tot - self.num_genes, len_tot
                )
            )
        # if self.motif_data is not None:
        print("Remove motif not in expression data:")
        len_tot = len(self.motif_data)
        self.motif_data = self.motif_data[
            self.motif_data.iloc[:, 1].isin(self.gene_names)
        ]
        self.unique_tfs = sorted(set(self.motif_data[0]))
        self.num_tfs = len(self.unique_tfs)
        print(
            "   {} rows removed from the initial {}".format(
                len_tot - len(self.motif_data), len_tot
            )
        )
        if self.ppi_data is not None:
            print("Remove ppi not in motif:")
            motif_unique_tfs = np.unique(self.motif_data.iloc[:, 0])
            len_tot = len(self.ppi_data)
            self.ppi_data = self.ppi_data[
                self.ppi_data.iloc[:, 0].isin(motif_unique_tfs)
            ]
            self.ppi_data = self.ppi_data[
                self.ppi_data.iloc[:, 1].isin(motif_unique_tfs)
            ]
            print(
                "   {} rows removed from the initial {}".format(
                    len_tot - len(self.ppi_data), len_tot
                )
            )
        return None


[docs]
    def _normalize_network(self, x):
        """Standardizes the input data matrices.

        Parameters
        ----------
            x     : array
                Input adjacency matrix.

        Returns
        -------
            normalized_matrix: array
                Standardized adjacency matrix.
        """
        normalized_matrix = calc.normalize_network(x)
        return normalized_matrix



[docs]
    def processData(
        self,
        modeProcess,
        motif_file,
        expression_file,
        ppi_file,
        remove_missing,
        keep_expression_matrix,
        start=1,
        end=None,
        with_header = False,
        cobra_design_matrix=None,
        cobra_covariate_to_keep=0
    ):
        """ Processes data files into data matrices.

        Parameters
        ----------

            expression_file : str
                Path to file containing the gene expression data or pandas dataframe. 
                By default, the expression file does not have a header, and the cells ares separated by a tab.
                Pass `with_header=True` if the expression data includes the sample names 
            motif_file : str 
                Path to file containing the transcription factor DNA binding motif data in the form of
                TF-gene-weight(0/1) or pandas dataframe.
                If set to none, the gene coexpression matrix is returned as a result network.
            ppi_file : str
                Path to file containing the PPI data. or pandas dataframe. 
                The PPI can be symmetrical, if not, it will be transformed into a symmetrical adjacency matrix. 
            remove_missing : bool
                Removes the gens and TFs that are not present in one of the priors. Works only if modeProcess='legacy'.
            keep_expression_matrix : bool
                Keeps the input expression matrix in the result Panda object.
            modeProcess : str
                The input data processing mode.
                - 'legacy': refers to the processing mode in netZooPy<=0.5
                - (Default)'union': takes the union of all TFs and genes across priors and fills the missing genes in the priors with zeros.
                - 'intersection': intersects the input genes and TFs across priors and removes the missing TFs/genes.
            with_header: bool
                pass True when the expression file has a header with the sample names
        """

        # if modeProcess=="legacy":
        # =====================================================================
        # Data loading
        # =====================================================================
        ### Loading Motif
        if type(motif_file) is str:
            # If motif_file is a filename
            with Timer("Loading motif data ..."):
                self.motif_data = pd.read_csv(motif_file, sep="\t", header=None)
                self.motif_tfs = sorted(set(self.motif_data[0]))
                self.motif_genes = sorted(set(self.motif_data[1]))
                # self.num_tfs = len(self.unique_tfs)
                # print('Unique TFs:', self.num_tfs)
        elif type(motif_file) is not str:
            # If motif_file is an object
            if motif_file is None:
                # Computation without motif
                self.motif_data = None
                self.motif_genes = []
                self.motif_tfs = []
            else:
                # If motif_file is an object, it needs to be a dataframe
                if not isinstance(motif_file, pd.DataFrame):
                    raise Exception(
                        "Please provide a pandas dataframe for motif data with column names as 'source', 'target', and 'weight'."
                    )
                if ("source" not in motif_file.columns) or (
                    "target" not in motif_file.columns
                ):
                    print('renaming motif columns to "source", "target" and "weight" ')
                    motif_file.columns = ["source", "target", "weight"]
                self.motif_data = pd.DataFrame(motif_file.values)
                self.motif_tfs = sorted(set(motif_file["source"]))
                self.motif_genes = sorted(set(motif_file["target"]))
            # self.num_tfs = len(self.unique_tfs)
            # print('Unique TFs:', self.num_tfs)

        ### Loading expression
        if type(expression_file) is str:
            # If we pass an expression file, check if we have a 'with header' flag and read it
            with Timer("Loading expression data ..."):
                if with_header:
                    # Read data with header
                    self.expression_data = pd.read_csv(
                        expression_file, sep="\t", index_col=0
                    )
                else:
                    self.expression_data = pd.read_csv(
                        expression_file, sep="\t", header=None, index_col=0
                    )
                # assign expression data and samples/gene names
                self.expression_data = self.expression_data.iloc[:, (start-1):end]
                self.expression_genes = self.expression_data.index.tolist()
                self.expression_samples = self.expression_data.columns.astype(str)
                # self.num_genes = len(self.gene_names)
                # print('Expression matrix:', self.expression_data.shape)
        elif type(expression_file) is not str:
            # Pass expression as a dataframe 
            if expression_file is not None:
                if not isinstance(expression_file, pd.DataFrame):
                    raise Exception(
                        "Please provide a pandas dataframe for expression data."
                    )
                self.expression_data = expression_file.iloc[:, (start-1):end]  # pd.read_csv(expression_file, sep='\t', header=None, index_col=0)
                self.expression_genes = self.expression_data.index.tolist()
                self.expression_samples = self.expression_data.columns.astype(str)
                # self.num_genes = len(self.gene_names)
                # print('Expression matrix:', self.expression_data.shape)
            else:
                # If no expression is passed
                self.gene_names = self.motif_genes
                self.expression_genes = self.motif_genes
                self.num_genes = len(self.gene_names)
                self.expression_data = (
                    None  # pd.DataFrame(np.identity(self.num_genes, dtype=int))
                )
                print(
                    # TODO: Marouen check here. Here we do not pass the identity matrix
                    "No Expression data given: correlation matrix will be an identity matrix of size",
                    len(self.motif_genes),
                )

        if len(self.expression_genes) != len(np.unique(self.expression_genes)):
            print(
                "Duplicate gene symbols detected. Consider averaging before running PANDA"
            )

        ### Loading the PPI
        if type(ppi_file) is str:
            with Timer("Loading PPI data ..."):
                self.ppi_data = pd.read_csv(ppi_file, sep="\t", header=None)
                self.ppi_tfs = sorted(
                    set(pd.concat([self.ppi_data[0], self.ppi_data[1]]))
                )
                print("Number of PPIs:", self.ppi_data.shape[0])
        elif type(ppi_file) is not str:
            if ppi_file is not None:
                if not isinstance(ppi_file, pd.DataFrame):
                    raise Exception("Please provide a pandas dataframe for PPI data.")
                self.ppi_data = ppi_file  # pd.read_csv(ppi_file, sep='\t', header=None)
                self.ppi_tfs = sorted(
                    set(pd.concat([self.ppi_data[0], self.ppi_data[1]]))
                )
                print("Number of PPIs:", self.ppi_data.shape[0])
            else:
                # TODO: marouen, here we do not have an identiy matrix
                print(
                    "No PPI data given: ppi matrix will be an identity matrix of size",
                    len(self.motif_tfs),
                )
                self.ppi_data = None
                self.ppi_tfs = self.motif_tfs


        ### Data combination
        if modeProcess == "legacy" and remove_missing and motif_file is not None:
            self.__remove_missing()
            print('new case')
        if modeProcess == "legacy" and remove_missing==False:
            if expression_file is not None:
                self.gene_names = (
                    self.expression_genes
                )  # sorted( np.unique(self.motif_genes +  self.expression_genes ))
            if motif_file is None:
                self.unique_tfs = self.ppi_tfs
            else:
                self.unique_tfs = (
                    self.motif_tfs
                )  # sorted( np.unique(self.ppi_tfs     +  self.motif_tfs ))

        elif modeProcess == "union":
            self.gene_names = sorted(
                np.unique(self.motif_genes + self.expression_genes)
            )
            self.unique_tfs = sorted(np.unique(self.ppi_tfs + self.motif_tfs))

        elif modeProcess == "intersection":
            if motif_file is None:
                self.gene_names = sorted(np.unique(self.expression_genes))
                self.unique_tfs = sorted(np.unique(self.ppi_tfs))
            else:
                self.gene_names = sorted(
                    np.unique(
                        list(
                            set(self.motif_genes).intersection(
                                set(self.expression_genes)
                            )
                        )
                    )
                )
                self.unique_tfs = sorted(
                    np.unique(list(set(self.ppi_tfs).intersection(set(self.motif_tfs))))
                )

        self.num_genes = len(self.gene_names)
        self.num_tfs = len(self.unique_tfs)

        # Auxiliary dicts
        gene2idx = {x: i for i, x in enumerate(self.gene_names)}
        tf2idx = {x: i for i, x in enumerate(self.unique_tfs)}
        if (
            (modeProcess == "union" or modeProcess == "intersection")
            and (self.expression_data is not None)
            and (self.num_genes != 0)
        ):
            # Initialize data & Populate gene expression
            self.expression = np.zeros((self.num_genes, self.expression_data.shape[1]))
            idx_geneEx = [gene2idx.get(x, np.nan) for x in self.expression_genes]
            filtered_genes = [i for (i, v) in zip(self.expression_genes, idx_geneEx) if ~np.isnan(v)]
            idx_geneEx = [x for x in idx_geneEx if str(x) != 'nan']
            self.expression[idx_geneEx, :] = self.expression_data.loc[filtered_genes].values
            self.expression_data = pd.DataFrame(data=self.expression)

        # =====================================================================
        # Network construction
        # =====================================================================
        with Timer("Calculating coexpression network ..."):
            if self.expression_data is None:
                self.correlation_matrix = np.identity(self.num_genes, dtype=int)
            else:
                if cobra_design_matrix is not None:
                    psi, Q, d, g = cobra(cobra_design_matrix, self.expression_data)
                    if cobra_covariate_to_keep < 0 or cobra_covariate_to_keep >= psi.shape[0]:
                        raise AttributeError(
                            "Invalid COBRA component! Valid COBRA components are in range " + str(0) + " - " + str(psi.shape[0] - 1)
                        )
                    self.correlation_matrix = Q.dot(np.diag(psi[cobra_covariate_to_keep,])).dot(Q.T)
                else:
                    self.correlation_matrix = np.corrcoef(self.expression_data)
            if np.isnan(self.correlation_matrix).any():
                np.fill_diagonal(self.correlation_matrix, 1)
                self.correlation_matrix = np.nan_to_num(self.correlation_matrix)

        # Clean up useless variables to release memory
        if keep_expression_matrix:
            if self.expression_data is not None:
                self.expression_matrix = self.expression_data.values
            else:
                self.expression_matrix = None

        if self.motif_data is None:
            print(
                "Returning the correlation matrix of expression data in <Panda_obj>.correlation_matrix"
            )
            self.panda_network = self.correlation_matrix
            self.export_panda_results = self.correlation_matrix
            self.motif_matrix = self.motif_data
            self.ppi_matrix = self.ppi_data
            self.__pearson_results_data_frame()
            self.panda_network = pd.DataFrame(
                self.panda_network,
                index=self.expression_genes,
                columns=self.expression_genes,
            )
            return

        with Timer("Creating motif network ..."):
            self.motif_matrix_unnormalized = np.zeros((self.num_tfs, self.num_genes))
            idx_tfs = [tf2idx.get(x, np.nan) for x in self.motif_data[0]]
            idx_genes = [gene2idx.get(x, np.nan) for x in self.motif_data[1]]
            commind1 = ~np.isnan(idx_tfs) & ~np.isnan(idx_genes)
            idx_tfs = [i for (i, v) in zip(idx_tfs, commind1) if v]
            idx_genes = [i for (i, v) in zip(idx_genes, commind1) if v]
            if (len(idx_genes) == 0) or (len(idx_tfs) == 0):
                raise Exception('Error when creating the motif network!'
                             ' Typically this exception is raised if your'
                             ' expression matrix genes and motif priors'
                             ' not have any intersection.')
            idx = np.ravel_multi_index(
                (idx_tfs, idx_genes), self.motif_matrix_unnormalized.shape
            )
            self.motif_matrix_unnormalized.ravel()[idx] = self.motif_data[2][commind1]

        if self.ppi_data is None:
            self.ppi_matrix = np.identity(self.num_tfs, dtype=int)
        else:
            with Timer("Creating PPI network ..."):
                self.ppi_matrix = np.identity(self.num_tfs)
                idx_tf1 = [tf2idx.get(x, np.nan) for x in self.ppi_data[0]]
                idx_tf2 = [tf2idx.get(x, np.nan) for x in self.ppi_data[1]]
                commind2 = ~np.isnan(idx_tf1) & ~np.isnan(idx_tf2)
                idx_tf1 = [i for (i, v) in zip(idx_tf1, commind2) if v]
                idx_tf2 = [i for (i, v) in zip(idx_tf2, commind2) if v]
                idx = np.ravel_multi_index((idx_tf1, idx_tf2), self.ppi_matrix.shape)
                self.ppi_matrix.ravel()[idx] = self.ppi_data[2][commind2]
                idx = np.ravel_multi_index((idx_tf2, idx_tf1), self.ppi_matrix.shape)
                self.ppi_matrix.ravel()[idx] = self.ppi_data[2][commind2]

        return



[docs]
    def panda_loop(
        self, correlation_matrix, motif_matrix, ppi_matrix, computing="cpu", alpha=0.1
    ):
        """ The PANDA algorithm.

        Parameters
        ----------
            correlation_matrix: array
                Input coexpression matrix.
            motif_matrix      : array
                Input motif regulation prior network.
            ppi_matrix        : array
                Input PPI matrix.
            computing         : str
                'cpu' uses Central Processing Unit (CPU) to run PANDA.
                'gpu' use the Graphical Processing Unit (GPU) to run PANDA.
        """


        panda_loop_time = time.time()
        # TODO:This should be using self.correlation. Keeping for retrocompatibility
        motif_matrix = calc.compute_panda(
            correlation_matrix,
            ppi_matrix,
            motif_matrix,
            computing=computing,
            alpha=alpha,
        )
        print("Running panda took: %.2f seconds!" % (time.time() - panda_loop_time))
        # Ale: reintroducing the export_panda_results array if Panda called with save_memory=False

        if hasattr(self, "unique_tfs"):
            tfs = np.tile(self.unique_tfs, (len(self.gene_names), 1)).flatten()
            genes = np.repeat(self.gene_names, self.num_tfs)
            motif = self.motif_matrix_unnormalized.flatten(order="F")
            force = motif_matrix.flatten(order="F")
            self.export_panda_results = pd.DataFrame(
                {"tf": tfs, "gene": genes, "motif": motif, "force": force}
            )
            # self.export_panda_results = np.column_stack((tfs,genes,motif,force))
        return motif_matrix


    def __pearson_results_data_frame(self):
        """ Saves PANDA network in edges format.
        """
        genes_1 = np.tile(self.gene_names, (len(self.gene_names), 1)).flatten()
        genes_2 = (
            np.tile(self.gene_names, (len(self.gene_names), 1)).transpose().flatten()
        )
        self.flat_panda_network = self.panda_network.transpose().flatten()
        self.export_panda_results = pd.DataFrame(
            {"tf": genes_1, "gene": genes_2, "force": self.flat_panda_network}
        )
        self.export_panda_results = self.export_panda_results[["tf", "gene", "force"]]
        return None


[docs]
    def save_panda_results(self, path="panda.npy", save_adjacency=False, old_compatible=True ):
        """ Saves PANDA network.

        Parameters
        ----------
            path: str
                Path to save the network.
            save_adjacency: bool
                if True the output is an adjacency matrix and not the edge list
            old_compatible: bool
                if True saves the data as it was saved until netzoopy 0.9.11
        """
        with Timer("Saving PANDA network to %s ..." % path):
            # Because there are two modes of operation (save_memory), save to file will be different

            # Export to file
            if old_compatible:
                if not hasattr(self, "unique_tfs"):
                    toexport = self.panda_network
                else:
                    # save the network with names
                    toexport = self.export_panda_results
                print('WARNING: saving without header will soon become obsolete. \n\
                    Use old_compatible=False to save the panda results with correct column naming')
                if path.endswith(".txt"):
                    np.savetxt(path, toexport, fmt="%s", delimiter=" ")
                elif path.endswith(".csv"):
                    np.savetxt(path, toexport, fmt="%s", delimiter=",")
                elif path.endswith(".tsv"):
                    np.savetxt(path, toexport, fmt="%s", delimiter="\t")
                else:
                    np.save(path, toexport)
            else:
                print('WARNING: panda is now saved with the column names. \nUse old_compatible=True to save the panda results as previous versions without header')
                if not hasattr(self, "unique_tfs"):
                    toexport = self.panda_network
                    toexport = toexport.reset_index()
                else:
                    # save the network with names
                    toexport = self.export_panda_results
                    #saving as adjacency matrix
                    if (save_adjacency):
                        toexport = pd.pivot_table(toexport, values = 'force', index = 'tf', columns='gene', dropna=False)
                        toexport = toexport.reset_index()
                if path.endswith(".txt"):
                    toexport.to_csv(path, sep=" ", index=False)
                elif path.endswith(".csv"):
                    toexport.to_csv(path, sep=",", index=False)
                elif path.endswith(".tsv"):
                    toexport.to_csv(path, sep="\t", index=False)
                else:
                    np.save(path, toexport)



[docs]
    def top_network_plot(self, top=100, file="panda_top_100.png", plot_bipart=False):
        """ Selects top genes.

        Parameters
        ----------
            top        : int
                Top number of genes to plot.
            file       : str
                File to save the network plot.
            plot_bipart: bool
                Plot the network as a bipartite layout.
        """
        if not hasattr(self, "export_panda_results"):
            raise AttributeError(
                "Panda object does not contain the export_panda_results attribute.\n"
                + "Run Panda with the flag save_memory=False"
            )
        # Ale TODO: work in numpy instead of pandas?
        self.panda_results = pd.DataFrame(
            self.export_panda_results, columns=["tf", "gene", "motif", "force"]
        )
        subset_panda_results = self.panda_results.sort_values(
            by=["force"], ascending=False
        )
        subset_panda_results = subset_panda_results[
            subset_panda_results.tf != subset_panda_results.gene
        ]
        subset_panda_results = subset_panda_results[0:top]
        self.__shape_plot_network(
            subset_panda_results=subset_panda_results,
            file=file,
            plot_bipart=plot_bipart,
        )
        return None


    def __shape_plot_network(
        self, subset_panda_results, file="panda.png", plot_bipart=False
    ):
        """ Creates plot.

        Parameters
        -----------
            subset_panda_results : array
                Reduced PANDA network to the top genes.
            file                 : str
                File to save the network plot.
            plot_bipart: bool
                Plot the network as a bipartite layout.
        """
        # reshape data for networkx
        unique_genes = list(
            set(list(subset_panda_results["tf"]) + list(subset_panda_results["gene"]))
        )
        unique_genes = pd.DataFrame(unique_genes)
        unique_genes.columns = ["name"]
        unique_genes["index"] = unique_genes.index
        subset_panda_results = subset_panda_results.merge(
            unique_genes, how="inner", left_on="tf", right_on="name"
        )
        subset_panda_results = subset_panda_results.rename(
            columns={"index": "tf_index"}
        )
        subset_panda_results = subset_panda_results.drop(["name"], 1)
        subset_panda_results = subset_panda_results.merge(
            unique_genes, how="inner", left_on="gene", right_on="name"
        )
        subset_panda_results = subset_panda_results.rename(
            columns={"index": "gene_index"}
        )
        subset_panda_results = subset_panda_results.drop(["name"], 1)
        links = subset_panda_results[["tf_index", "gene_index", "force"]]
        self.__create_plot(
            unique_genes=unique_genes, links=links, file=file, plot_bipart=plot_bipart
        )
        return None

    def __create_plot(self, unique_genes, links, file="panda.png", plot_bipart=False):
        """ Runs the plot.

        Parameters
        ----------
            unique_genes : list
                Unique list of PANDA genes.
            links        : list
                Edges of the subset PANDA network to the top genes.
            file         : str
                File to save the network plot.
            plot_bipart  : bool
                Plot the network as a bipartite layout.

        Notes
        -----
            split_label: Splits the plot label over several lines for plotting purposes.
        """
        import networkx as nx
        import matplotlib.pyplot as plt

        g = nx.Graph()
        g.clear()
        plt.clf()
        # img = plt.imread("../img/panda.jpg")
        # fig, ax = plt.subplots()
        # ax.imshow(img, extent=[0, 400, 0, 300])
        ##ax.plot(x, x, '--', linewidth=5, color='firebrick')
        g.add_nodes_from(unique_genes["index"])
        edges = []
        for i in range(0, len(links)):
            edges = edges + [
                (
                    links.iloc[i]["tf_index"],
                    links.iloc[i]["gene_index"],
                    float(links.iloc[i]["force"]) / 200,
                )
            ]
        g.add_weighted_edges_from(edges)
        labels = {}

        def split_label(label):
            """ Splits the plot label over several lines for plotting purposes.

            Parameters
            ----------
                label: Input label text.

            Returns:
                label: _
                    Output label text divided over several lines.
            """
            ll = len(label)
            if ll > 6:
                return (
                    label[0 : int(np.ceil(ll / 2))]
                    + "\n"
                    + label[int(np.ceil(ll / 2)) :]
                )
            return label

        for i, l in enumerate(unique_genes.iloc[:, 0]):
            labels[i] = split_label(l)
        if not plot_bipart:
            pos = nx.spring_layout(g)
        else:
            pos = nx.drawing.layout.bipartite_layout(g, set(links["tf_index"]))
        # nx.draw_networkx(g, pos, labels=labels, node_size=40, font_size=3, alpha=0.3, linewidth = 0.5, width =0.5)
        print(plot_bipart)
        if not plot_bipart:
            colors = range(len(edges))
        else:
            colors = list(zip(*edges))[-1]

        options = {
            "alpha": 0.7,
            "edge_color": colors,
            "edge_cmap": plt.cm.Blues,
            "node_size": 110,
            "vmin": -100,
            "width": 2,
            "labels": labels,
            "font_weight": "regular",
            "font_size": 3,
            "linewidths": 20,
        }

        nx.draw_networkx(g, pos=pos, **options)
        plt.axis("off")
        plt.savefig(file, dpi=300)
        return None


[docs]
    def return_panda_indegree(self):
        """ Computes indegree of PANDA network, only if save_memory = False.
        """
        export_panda_results_pd = pd.DataFrame(
            self.export_panda_results, columns=["tf", "gene", "motif", "force"]
        )
        subset_indegree = export_panda_results_pd.loc[:, ["gene", "force"]]
        self.panda_indegree = subset_indegree.groupby("gene").sum()
        return self.panda_indegree



[docs]
    def return_panda_outdegree(self):
        """ computes outdegree of PANDA network, only if save_memory = False.
        """
        export_panda_results_pd = pd.DataFrame(
            self.export_panda_results, columns=["tf", "gene", "motif", "force"]
        )
        subset_outdegree = export_panda_results_pd.loc[:, ["tf", "force"]]
        self.panda_outdegree = subset_outdegree.groupby("tf").sum()
        return self.panda_outdegree