Source code for netZooPy.ligress.bonobo

from __future__ import print_function
import math
import time
import pandas as pd
from .timer import Timer
import numpy as np
from netZooPy.panda.panda import Panda
from netZooPy.panda import calculations as calc
from netZooPy.ligress import io
import sys
import os
import pandas as pd
import scipy.stats as stats



[docs]
def compute_bonobo(
    expression_matrix,
    expression_mean,
    sample_idx,
    online_coexpression=False,
    computing="cpu",
    cores=1,
    delta=None,
    compute_sparse=False,
    confidence=0.05,
    save_pvals=False,
):

    """Compute one bonobo matrix. Takes as input an expression matrix, the row-wise mean of the expression, and the
    index of the sample for which the bonobo is computed (index of the columns).

    Args:
        expression_matrix (numpy array): genes as rows, samples as columns  
        expression_mean (numpy array): rowise mean of expression.  
        sample_idx (int): index of the sample of interest
        delta (float, optional): delta value for the computation. Defaults to None, which means that delta is tuned  
        compute_sparse (bool, optional): if True, the bonobo gets sparsified and relative pvalues are returned  
        confidence (float, optional): if sparsify is True, this is the CI for the approximate zscore.  
        save_pvals (bool, optional): if True, the pvalues are saved and returned  
        online_coexpression (bool, optional): FOR FUTURE use. if True, the coexpression is computed with a closed form  
        computing (str, optional): FOR FUTURE use. Defaults to 'cpu'.  
        cores (int, optional): FOR FUTURE use. Defaults to 1.    
          
    Return:
        bonobo_matrix: bonobo coexpression matrix  
        delta: delta value, useful when tuned  
        pval: pvalues, if save_pvals is True  
    """
    
    pval = None

    mask_include = [True] * expression_matrix.shape[1]
    mask_include[sample_idx] = False

    # Compute covariance matrix from the rest of the data, leaving out sample
    covariance_matrix = np.cov(expression_matrix[:, mask_include])

    # Compute posterior weight delta from data
    if delta == None:
        delta = 1 / (
            3
            + 2
            * np.sqrt(covariance_matrix.diagonal()).mean()
            / covariance_matrix.diagonal().var()
        )
    else:
        assert type(delta) == float

    # Compute sample-specific covariance matrix
    sscov = (
        delta
        * np.outer(
            (expression_matrix - expression_mean)[:, sample_idx],
            (expression_matrix - expression_mean)[:, sample_idx],
        )
        + (1 - delta) * covariance_matrix
    )

    # Compute sample-specific coexpression matrix from the sample-specific covariance matrix

    sscov = np.array(sscov)
    diag = np.sqrt(np.diag(np.diag(sscov)))

    # Replace 0 diagonals by 1, so that the diagonal matrix can be inverted
    diag = np.array(diag)
    indices = np.where(np.diag(diag) == 0)[0]
    for i in indices:
        diag[i, i] = 1

    sds = np.linalg.inv(diag)
    bonobo_matrix = sds @ sscov @ sds

    if compute_sparse:
        threshold = stats.norm.ppf(1 - (confidence / 2))
        g = sscov.shape[1]
        d = g + 1 / delta
        a1 = (d - g + 1) / ((d - g) * (d - g - 3))
        a2 = (d - g - 1) / ((d - g) * (d - g - 3))

        # m1 = a1*(np.multiply(sscov, sscov))
        v = np.sqrt(np.diag(sscov))
        # m2 = a2 * (np.outer(v, v))

        # Pointwise variance: v = m1 + m2
        v = (a1 * (np.multiply(sscov, sscov))) + (a2 * (np.outer(v, v)))
        # Pointwise standard deviation
        v = np.sqrt(v)

        # zscore
        v = np.divide(sscov, v)

        pval = 2 * (1 - stats.norm.cdf(np.abs(v)))

        # bonobo gets sparsified: diagonal bonobo + sparsified off diagonal
        # bonobo_matrix = (np.eye(g)@bonobo_matrix) + np.multiply( 1-np.eye(g), (v*(v<threshold) ) )
        if save_pvals:
            print("keep Bonobo whole, and use pvals to threshold and sparsify")
        else:
            bonobo_matrix = np.eye(g) + np.multiply(
                1 - np.eye(g), (bonobo_matrix * (np.abs(v) > threshold))
            )

    return (bonobo_matrix, delta, pval)




[docs]
class Bonobo:
    """
    BONOBO


    Parameters
    ----------

            expression_file : str
                Path to file containing the gene expression data or pandas dataframe. By default, the expression file does not have a header, and the cells ares separated by a tab.


    Notes
    ------

        BONOBO can be run as follows:    
            bonobo_obj_sparse = Bonobo(expression_file)
            bonobo_obj_sparse.run_bonobo(keep_in_memory=True, output_fmt='.hdf', sparsify=True, output_folder='../data/processed/bonobo_sparse_pvals/', save_pvals=False)



    References
    ----------
    .. [1]__ Bayesian Optimized sample-specific Networks Obtained By Omics data (BONOBO), Saha and Fanfani et al. 2021. doi: https://doi.org/10.1101/2023.11.16.567119

    Authors: Viola Fanfani, Enakshi Saha
    """

    def __init__(
        self,
        expression_file,
    ):
        """Intialize instance of Panda class and load data."""

        self.expression_file = expression_file

        # data read
        self.samples = None
        self.n_samples = None
        self.expression_data = None
        self.expression_genes = None
        self.expression_gene2idx = None
        self.expression_samples = None
        # prepare all the data
        print("BONOBO: preparing expression")
        self._prepare_expression()
        self.delta = {}
        self.bonobos = {}
        self.pvals = {}
        self.save_pvals = False

    ########################
    ### METHODS ############
    ########################


[docs]
    def _prepare_expression(self):
        
        """Prepare expression data. Uses the io module to read the expression data."""

        with Timer("Reading expression data..."):
            # Read expression
            self.expression_data, self.expression_genes = io.prepare_expression(
                self.expression_file, samples=self.samples
            )

            self.expression_samples = self.expression_data.columns.tolist()

        # Auxiliary dicts
        self.expression_gene2idx = {x: i for i, x in enumerate(self.expression_genes)}



[docs]
    def run_bonobo(
        self,
        output_folder="bonobo/",
        output_fmt=".h5",
        keep_in_memory=False,
        save_full=False,
        online_coexpression=False,
        delta=None,
        computing="cpu",
        cores=1,
        precision="single",
        sample_names=[],
        sparsify=False,
        confidence=0.05,
        save_pvals=False,
    ):

        """BONOBO algorithm

        Args:
            output_folder (str, optional): output folder. If an empty string is passed the matrix is automatically kept
            in memory, overwriting the value of keep_in_memory
            output_fmt (str, optional): format of output bonobo matrix. By default it is an hd5 file, can be a txt or csv.
            keep_in_memory (bool, optional): if True, the bonobo coexpression matrix is kept in memory, otherwise it is
            discarded after saving.
            save_full (bool, optional): whether to save the coexpression with the gene names. We recommend using True
            only when the number of genes is not very big to avoid saving huge matrices.
            online_coexpression (bool, optional): if true coexpression is computed with a closed form
            cores (int, optional): cores. Defaults to 1.
            delta (float, optional): delta parameter. If default (None) delta is trained, otherwise pass a value.
            Recommended is 0.3.
            precision (str, optional): matrix precision, defaults to single precision.
            sparsify (bool, optiona): if True, bonobo gets sparsified and relative pvalues are returned
            confidence (float, optional): if sparsify is True, this is the CI for the approximate zscore.
            save_pvals (bool, optional): if True, the pvalues are saved and returned
        """

        ligress_start = time.time()

        # first let's reorder the expression data

        if precision == "single":
            atype = "float32"
        elif precision == "double":
            atype = "float64"
        else:
            sys.exit("ERROR: Precision %s unknonw" % str(precision))

        # let's sort the expression and ppi data
        self.expression_data = self.expression_data.astype(atype)

        self.sparsify = sparsify
        self.confidence = confidence
        self.save_pvals = save_pvals
        # If output folder is an empty string, keep the matrix in memory and don't save it to disk
        # Otherwise the output folder can be created and the matrix saved
        if output_folder == "":
            keep_in_memory = True
            save_matrix = False
        else:
            if not os.path.exists(output_folder):
                os.makedirs(output_folder)
            save_matrix = True

        # correlation_complete = self.expression_data.T.corr()
        # we automatically multiply the correlation with the number of samples

        # Center expression data to make mean = 0
        # let's remove this from here, and keep it only inside the ligress computation
        # self.expression_data_centered = (self.expression_data - np.mean(self.expression_data.values,axis = 1, keepdims=True))
        self.expression_mean = np.mean(
            self.expression_data.values, axis=1, keepdims=True
        )

        print("BONOBO: We are starting to compute the bonobos...")
        # Now for each sample we compute the lioness network from correlations and
        # the panda using the motif and ppi tables

        if sample_names == []:
            sample_names = self.expression_samples
        else:
            different = set(sample_names).difference(set(self.expression_samples))
            sample_names = set(sample_names).intersection(set(self.expression_samples))
            if len(different) > 0:
                print(
                    "WARNING: some of the sample names are not in the expression data"
                )
                print("\tMissing:")
                print("\t" + str(different))
                print("\tUsing:")
                print("\t" + str(sample_names))

        for s, sample in enumerate(sample_names):
            sample_start = time.time()
            # first run bonobo
            print("BONOBO: bonobo for sample %s" % str(sample))
            self._bonobo_loop(
                sample,
                output_fmt=output_fmt,
                keep_in_memory=keep_in_memory,
                save_matrix=save_matrix,
                computing=computing,
                output_folder=output_folder,
                delta=delta,
            )



[docs]
    def _bonobo_loop(
        self,
        sample,
        output_fmt=".h5",
        keep_in_memory=False,
        save_matrix=True,
        online_coexpression=False,
        computing="cpu",
        output_folder="./bonobo/",
        delta=None,
    ):
        """Runs BONOBO on one sample. For now all samples are saved separately.

        Args:
            sample (str): sample of interest
            output_fmt (str, optional): format of output bonobo matrix. By default it is an hd5 file, can be a txt or csv. Defaults to '.h5'.
            keep_in_memory (bool, optional): If true, all BONOBOs are kept in memory and accessed from the bonobo object. Defaults to False.
            save_matrix (bool, optional): If true, the BONOBO is saved to disk. Defaults to True.
            output_folder (str, optional): results folder. Defaults to './bonobo/'.
            delta (float, optional): Delta value, if None the value is computed with the optimization strategy. Defaults
            to None.
            computing (str, optional): For the future. Defaults to 'cpu'.
            online_coexpression (bool, optional): For the future. Defaults to False.
            
        """

        touse = list(set(self.expression_samples).difference(set([sample])))
        sample_idx = list(self.expression_samples).index(sample)

        print("BONOBO: computing bonobo for sample %s" % str(sample))
        sample_bonobo, sample_delta, pval = compute_bonobo(
            self.expression_data.values,
            self.expression_mean,
            sample_idx,
            delta=delta,
            online_coexpression=online_coexpression,
            computing=computing,
            compute_sparse=self.sparsify,
            confidence=self.confidence,
            save_pvals=self.save_pvals,
        )

        self.delta[sample] = sample_delta

        df_bonobo = pd.DataFrame(
            data=sample_bonobo, columns=self.expression_data.index.tolist()
        )

        if save_matrix:
            print("Saving BONOBO for sample %s" % (str(sample)))
            output_fn = output_folder + "bonobo_" + str(sample) + output_fmt
            if output_fmt == ".h5":
                df_bonobo.to_hdf(output_fn, key="bonobo", index=False)
            elif output_fmt == ".csv":
                df_bonobo.to_csv(output_fn, index=False)
            elif output_fmt == ".txt":
                df_bonobo.to_csv(output_fn, index=False, sep="\t")
            else:
                print(
                    "WARNING: output format (%s) not recognised. We are saving in hdf"
                    % str(output_fmt)
                )
                output_fn = output_folder + "bonobo_" + str(sample) + ".h5"
                df_bonobo.to_hdf(output_fn, key="bonobo", index=False)

        if self.sparsify and self.save_pvals:
            df_pvals = pd.DataFrame(
                data=pval, columns=self.expression_data.index.tolist()
            )
            print("Saving pvalues for sample %s" % (str(sample)))
            output_fn = output_folder + "pvals_" + str(sample) + output_fmt
            if output_fmt == ".h5":
                df_pvals.to_hdf(output_fn, key="pvals", index=False)
            elif output_fmt == ".csv":
                df_pvals.to_csv(output_fn, index=False)
            elif output_fmt == ".txt":
                df_pvals.to_csv(output_fn, index=False, sep="\t")
            else:
                print(
                    "WARNING: output format (%s) not recognised. We are saving in hdf"
                    % str(output_fmt)
                )
                output_fn = output_folder + "pvals_" + str(sample) + ".h5"
                df_pvals.to_hdf(output_fn, key="pvals", index=False)

        if keep_in_memory:
            self.bonobos[sample] = df_bonobo
            self.pvals[sample] = pval