Source code for netZooPy.condor.condor

import numpy as np
import pandas as pd
from igraph import *
from .timer import Timer


[docs]
class condor_object:
    """
    Initialization of the condor object. The function gets a network
    in edgelist format as a path to a file or encoded in a pandas dataframe.
    Builds a condor_object with an edgelist,an igraph network, names of the
    targets and regulators.

    Note: The edgelist is assumed to contain a bipartite network.
    The program will relabel the nodes so that the edgelist represents a bipartite network anyway.
    It is on the user to know that the network they are using is suitable for the method.

    Parameters
    ------------

        network_file: str
            Path to file encoding an edgelist.
        sep: str
            Separator used in the file.
        index_col: int
            Column that stores the index of the edgelist. E.g. None, 0...
        header: int
            Row that stores the header of the edgelist. E.g. None, 0...
        dataframe: dataFrame
            Pandas DataFrame containing the edgelist. Use as alternative
            to filename
        silent: Silent mode, will not print anything on console.

    Returns
    ---------

        condor_object : object
            The object has the following attributes:
            - net: Contains the network edgelist as a pandas DataFrame.
            - graph: iGraph object containing the bipartite network.
            - reg_names: list of the nodes in the first column.
            - tar_names: list of the nodes in the second column.
            - index_dict: dictionary keeping track of the indices in the "graph" variable and the actual names of the nodes.
    
        warning:
            Condor uses iGraph for the node assignment. For the initialization of the
            assignment, there is some stochasticity involved, that can be somehow controlled 
            by setting the python random seed.
            
            For instance, running condor twice
            on the same dataset, in the same python process, might result in slightly different assignments. 
            To avoid this behavior, you can set the seed ( random.seed(0) ) before calling 
            condor.

                >>> import random
                >>> random.seed(1)
                >>> c1 = condor(...)
                >>> random.seed(1)
                >>> c2 = condor(...)

            In this case c1 and c2 are exactly the same.

            On the contrary, if condor is called twice during two different python calls, you will 
            have the exact same results, as the random seed will have resetted. 
            Instead, the stochasticity of the initial assignment can be kept by setting 
            a random seed at the beginning

                >>> import random
                >>> random.seed(random.randint(1,10000000)) 
                >>> condor(...)
    
    """

    def __init__(
        self, network_file=None, sep=",", index_col=0, header=0, dataframe=None,silent=False
    ):

        # Checks that either a dataframe or a path to an edgelist are provided, and not to both at the same time.
        assert (
            dataframe is not None or network_file is not None
        ), "Could not create condor object. Edgelist must be passed either as a pandas DataFrame or path to file."
        assert (
            dataframe is None or network_file is None
        ), "Could not create condor object. Either pass a DataFrame or a file path. Not both."

        if network_file is not None:
            # Reads the edgelist and creates a DataFrame
            self.net = pd.read_csv(
                network_file, sep=sep, index_col=index_col, header=header
            )

        if dataframe is not None:
            # Checks that the dataframe has the proper format.
            assert (
                dataframe.shape[1] < 4
            ), "DataFrame must have at most three columns. Two containing vertices and an additional one containing edgeweights."
            if dataframe.shape[1] == 3:
                assert pd.api.types.is_numeric_dtype(
                    dataframe.iloc[:, 2]
                ), "Weight column must contain numeric values."
            self.net = dataframe

        # Forces the edgelist to induce a bipartite network by renaming the columns.
        cnames = self.net.columns
        self.net = self.net.astype({cnames[0]: str, cnames[1]: str})
        self.net.iloc[:, 0] = "reg_" + self.net.iloc[:, 0]
        self.net.iloc[:, 1] = "tar_" + self.net.iloc[:, 1]
        # self.net.columns = ["V1","V2","weight"]
        self.silent = silent

        with Timer("Object creation:",self.silent):
            # Checks that the DataFrame gives rise to a well-defined bipartite network.
            assert not self.net.isnull().any().any(), "NaN values detected."
            assert not (
                "" in list(self.net.iloc[:, 0]) or "" in list(self.net.iloc[:, 1])
            ), "Empty strings detected."

            # Checks if weights are provided. If not, a column with 1s is added to the DataFrame.
            if self.net.shape[1] != 3:
                print("Unweighted network. Weights initialized as 1.")
                self.net["weight"] = 1

            self.net.columns = ["V1", "V2", "weight"]
            # Creates iGraph object from the DataFrame.
            print('hello')
            print(self.net)
            self.graph = Graph.DataFrame(self.net, directed=False, use_vids=False)
            # from igraph 0.10 add parameter: use_vids=False

            self.reg_names = sorted(set(self.net.iloc[:, 0]))
            self.tar_names = sorted(set(self.net.iloc[:, 1]))

            # By construction of the graph object, and the fact that reg_ is sorted in front of tar_
            # we have that the graph nodes are sorted by first the reg nodes and then the tar nodes.
            #types = [0] * len(self.reg_names)
            #types.extend([1] * len(self.tar_names))
            types = [0 if self.graph.vs[i]['name'].startswith('reg') else 1 for i in range(len(self.graph.vs))]
            
            self.graph.vs["type"] = types

            # Dictionary to keep track of node indices and node names should they be rearranged.
            # TO-DO: Check if this is really necessary.
            self.index_dict = {node.index: node["name"] for node in self.graph.vs}

            self.modularity = None
            self.reg_memb = None
            self.tar_memb = None
            self.Qcoms = None


[docs]
    def initial_community(self, method="LDN", project=False,resolution=1):
        """
            Computation of the initial community structure based on unipartite methods.

        Parameters
        -----------

            method : str
                Method to determine intial community assignment.
                    - "LCS": Multilevel method.
                    - "LDN": Leiden method
            project: bool
                Whether to apply the initial community structure on the bipartite network
                disregarding the bipartite structure or apply it to the unipartite network resulting from the projection onto
                one of these nodes.
        Outputs:
            self     : updates condor object with
                     tar_memb: DataFrame of initial target node membership.
                     reg_memb: DataFrame of initial reg node membership.
        """

        weights_id = self.net.columns[2]
        if project:
            with Timer("Initial community structure with projection:",self.silent):

                # Obtains bipartite projection onto target subset.
                projected_graph = self.graph.bipartite_projection(which=1)

                if method == "LCS":
                    vc = Graph.community_multilevel(projected_graph, weights=weights_id)
                if method == "LDN":
                    vc = Graph.community_leiden(
                        projected_graph, objective_function='modularity',resolution_parameter=resolution, weights=weights_id
                    )

                self.modularity = vc.modularity
                if not self.silent: print("Initial modularity: ", self.modularity)

                tar_index = [i.index for i in self.graph.vs.select(type_in=[1])]
                # By the ordering on the indices, the target nodes indices begin at len(co.reg_names)
                # in order not to mess with the indices in vc we substract that starting value.
                tar_memb = [vc.membership[i - len(self.reg_names)] for i in tar_index]
                T0 = pd.DataFrame(zip(tar_index, tar_memb))
                T0.columns = ["index", "community"]
                self.tar_memb = T0

                # Here we have only computed the community structure for the target nodes. We initialize a properly sized dataframe for the reg nodes.
                reg_index = [i.index for i in self.graph.vs.select(type_in=[0])]
                # By the ordering on the indices, the target nodes indices begin at len(co.reg_names)
                # in order not to mess with the indices in vc we substract that starting value.
                reg_memb = [0 for i in reg_index]
                R0 = pd.DataFrame(zip(reg_index, reg_memb))
                R0.columns = ["index", "community"]
                self.reg_memb = R0

                # Sort of the same as above but without projecting. The bipartite network is treated as a unipartite network.
        else:
            with Timer("Initial community structure without projection:",self.silent):
                if method == "LCS":
                    vc = Graph.community_multilevel(self.graph, weights=weights_id)
                if method == "LDN":
                    vc = Graph.community_leiden(
                        self.graph,objective_function='modularity',resolution_parameter=resolution, weights=weights_id
                    )

                self.modularity = vc.modularity
                if not self.silent: print("Initial modularity: ", self.modularity)

                tar_index = [i.index for i in self.graph.vs.select(type_in=[1])]
                tar_memb = [vc.membership[i] for i in tar_index]
                tar_index = [self.index_dict[i] for i in tar_index]
                T0 = pd.DataFrame(zip(tar_index, tar_memb))
                T0.columns = ["index", "community"]
                self.tar_memb = T0

                reg_index = [i.index for i in self.graph.vs.select(type_in=[0])]
                reg_memb = [vc.membership[i] for i in reg_index]
                reg_index = [self.index_dict[i] for i in reg_index]
                R0 = pd.DataFrame(zip(reg_index, reg_memb))
                R0.columns = ["index", "community"]
                self.reg_memb = R0




[docs]
    def bipartite_modularity(self, B, m, R, T):
        """ Computation of the bipartite modularity as described in 
        Michael J. Barber. Modularity and community detection in bipartite networks.

        Parameters
        ------------
            B        : array
                modularity matrix.
            m        : array
                sum of the weights (or number of edges in the unweighted case).
            R        : array
                community assignement matrix for reg nodes.
            T        : array
                community assignement matrix for tar nodes.
        
        Returns
        --------
            Q: _
                Modularity score.
        Notes
        --------
            self.Qcoms: _
                Modularity contribution by each community.
            self.modularity: _
                Modularity score.

        References
        -----------
        .. [1]__ Michael J. Barber. Modularity and community detection in bipartite networks.

        """

        RtBT = T.transpose().dot(B.dot(R))
        Qcoms = (1 / m) * (np.diagonal(RtBT))
        Q = sum(Qcoms)
        self.Qcoms = Qcoms[Qcoms > 0]
        self.modularity = Q
        return Q



[docs]
    def matrices(self, c,resolution):
        """ Computation of modularity matrix and initial community matrix.

        Parameters
        ------------
            c        : int
                max number of communities.
        Returns
        ----------
            B        : array
                Modularity matrix.
            m        : int
                Sum of the weights (or number of edges in the unweighted case.
            T0       : array
                Initial community structure matrix for target nodes.
            R0       : array
                Initial community structure matrix for reg nodes.
            gn       : dict
                Index dictionary for tar node names.
            rg       : dict
                Index dictionary for reg node names.

        """

        with Timer("Matrix computation:",self.silent):

            # Dimensions of the matrix
            p = len(self.tar_names)
            q = len(self.reg_names)

            # Index dictionaries for the matrix. Note that this set of indices is different of that in the condor object (that one is for the igraph network.)
            rg = {self.reg_names[i]: i for i in range(0, q)}
            gn = {self.tar_names[i]: i for i in range(0, p)}

            # Computes weighted biadjacency matrix.
            A = np.matrix(np.zeros((p, q)))
            for edge in self.net.iterrows():
                A[gn[edge[1][1]], rg[edge[1][0]]] = edge[1][2]

            # Computes node degrees for the nodesets.
            ki = A.sum(1)
            dj = A.sum(0)
            # Computes sum of edges and bimodularity matrix.
            m = float(sum(ki))
            B = A - resolution*((ki @ dj) / m)

            # d = self.index_dict

            # Computation of initial modularity matrix for tar and reg nodes from the membership dataframe.
            T_ed = zip(
                [gn[j] for j in [i for i in self.tar_memb.iloc[:, 0]]],
                self.tar_memb.iloc[:, 1],
            )
            T0 = np.zeros((p, c))
            for edge in T_ed:
                T0[edge] = 1

            R_ed = zip(
                [rg[j] for j in [i for i in self.reg_memb.iloc[:, 0]]],
                self.reg_memb.iloc[:, 1],
            )
            R0 = np.zeros((q, c))
            for edge in R_ed:
                R0[edge] = 1

        return B, m, T0, R0, gn, rg



[docs]
    def brim(self, deltaQmin="def", c="def", resolution=1):

        """ Implementation of the BRIM algorithm to iteratively maximize 
        bipartite modularity. Note that c is the maximum number of communities. 
        Dynamic choice of c is not yet implemented.

        Parameters
        ------------
            deltaQmin: str
                Difference modularity threshold for stopping the iterative process
            c        : int
                max number of communities.
            resolution: float
                
        
        Notes
        ---------
            Updates condor object with the following:
            - self.modularity: Modularity score for the final assignement.
            - self.tar_memb: Final community assignement for tar nodes.
            - self.reg_memb: Final community assignement for reg nodes.

        Note:
            c        : Has to be bigger than the number of communities given by the initial community assignement. Otherwise the program will crash. The default option gives room for 20% more communities which rarely fails.

        """

        if c == "def":
            c = int(len(self.tar_memb["community"].unique()) * 1.2)

        B, m, T0, R0, gn, rg = self.matrices(c,resolution)


        # Default deltaQmin.
        if deltaQmin == "def":
            deltaQmin = min(1 / m, 1e-5)

        with Timer("BRIM: ",self.silent):
            Qnow = 0
            deltaQ = 1
            p, q = B.shape
            while deltaQ > deltaQmin:
                # Right sweep
                Tp = T0.transpose().dot(B)
                R = np.zeros((q, c))
                am = np.array(np.argmax(Tp.transpose(), axis=1))
                for i in range(0, len(am)):
                    R[i, am[i][0]] = 1

                # Left sweep
                Rp = B.dot(R)
                T = np.zeros((p, c))
                am = np.array(np.argmax(Rp, axis=1))

                for i in range(0, len(am)):
                    T[i, am[i][0]] = 1
                T0 = T

                Qthen = Qnow
                Qnow = self.bipartite_modularity(B, m, R, T)
                deltaQ = Qnow - Qthen
                if not self.silent: print(Qnow)

            self.modularity = Qnow

        self.tar_memb = pd.DataFrame(
            list(zip(list(gn), [T0[i, :].argmax() for i in range(0, len(gn))]))
        )
        self.reg_memb = pd.DataFrame(
            list(zip(list(rg), [R0[i, :].argmax() for i in range(0, len(rg))]))
        )
        self.tar_memb.columns = ["tar", "community"]
        self.reg_memb.columns = ["reg", "community"]



[docs]
    def qscores(self):
        """
            Computes the qscores (contribution of a vertex to its community modularity)
            for each vertex in the network.
        """

        c = 1 + max(self.reg_memb["com"])
        B, m, T, R, gn, rg = self.matrices(c)
        self.Qscores = {"reg_qscores": None, "tar_qscores": None}

        # Qscores for the targets:
        Rq = B.dot(R) / (2 * m)
        Qj = list()
        for j, r in self.tar_memb.iterrows():
            Qjh = Rq[j, r["com"]] / self.Qcoms[r["com"]]
            Qj.append(Qjh)
        self.Qscores["tar_qscores"] = self.tar_memb.copy()
        self.Qscores["tar_qscores"]["qscore"] = Qj

        # Qscores for the regulators:
        Tq = T.transpose().dot(B) / (2 * m)
        Qi = list()
        for i, r in self.reg_memb.iterrows():
            Qih = Tq[r["com"], i] / self.Qcoms[r["com"]]
            Qi.append(Qih)
        self.Qscores["reg_qscores"] = self.reg_memb.copy()
        self.Qscores["reg_qscores"]["qscore"] = Qi





[docs]
def run_condor(
    network_file,
    sep=",",
    index_col=0,
    header=0,
    initial_method="LDN",
    initial_project=False,
    com_num="def",
    deltaQmin="def",
    resolution=1,
    return_output=False,
    tar_output="tar_memb.txt",
    reg_output="reg_memb.txt",
    silent = False
):
    """
        Computation of the whole condor process. It creates a condor object and runs all the steps of BRIM on it. The function outputs

        Note: The edgelist is assumed to contain a bipartite network. The program will relabel the nodes so that the edgelist represents a bipartite network anyway.
        It is on the user to know that the network they are using is suitable for the method.
    
    Parameters
    -----------
        network_file: str
            Path to file encoding an edgelist.
        sep: str
            Separator used in the file.
        index_col: int
            Column that stores the index of the edgelist. E.g. None, 0...
        header: int
            Row that stores the header of the edgelist. E.g. None, 0...
        initial_method: str
            Method to determine intial community assignment. (By default Leiden method).
        initial_project: bool
            Whether to project the network onto one of the bipartite sets for the initial community detection.
        com_num: int
            Max number of communities. It is recomended to leave this to default, otherwise if the initial community assignement is bigger the program will crash.
        deltaQmin: float
            Difference modularity threshold for stopping the iterative process.
        resolution: int
            Not yet implemented.
        return_output:  bool
            Whether the function returns the created condor object.
        tar_output: str
            Filename for saving the tar node final membership.
        reg_output: str
            Filename for saving the reg node final membership.
        silent: Run in silent mode

    Returns
    --------
        Files "tar_memb.txt" and "reg_memb.txt" encoding the final tar and reg node membership.
    """

    co = condor_object(network_file, sep, index_col, header,dataframe=None,silent=silent)

    co.initial_community(method=initial_method, project=initial_project,resolution=resolution)


    co.brim(deltaQmin, c=com_num, resolution=resolution)
    co.tar_memb.to_csv(tar_output)
    co.reg_memb.to_csv(reg_output)
    

    if return_output == True:
        return co