from __future__ import print_function
import math
from random import sample
import time
import pandas as pd
from scipy.stats import zscore
from .timer import Timer
import numpy as np
from netZooPy.panda import calculations as calc
import sys
def check_expression_integrity(df):
"""Check data integrity
- Number of NA
Args:
df (dataframe): gene expression dataframe
"""
# check that for each
if (df.isna().sum(axis = 1)>(len(df.columns)-3)).any():
sys.exit('Too many nan in gene expression (need more than 1 sample to compute coexpression)')
def read_ppi(ppi_fn, tf_list = None):
"""Read PPI network
Args:
ppi_fn (str): ppi network filename
"""
with open(ppi_fn, 'r') as f:
ppi_data = pd.read_csv(f, sep="\t", header=None)
ppi_data.columns = ['tf1','tf2','exists']
# get all tfs from first and second column
if tf_list:
ppi_tfs = tf_list
ppi_data = ppi_data[(ppi_data.tf1.isin(ppi_tfs)) & (ppi_data.tf2.isin(ppi_tfs))]
else:
ppi_tfs = sorted(set(ppi_data.iloc[:,0].values.tolist()).union(set(ppi_data.iloc[:,1].values.tolist())))
# create adjacency matrix
df = pd.DataFrame(np.eye(len(ppi_tfs)), index=ppi_tfs, columns=ppi_tfs)
z = ppi_data.pivot_table(columns='tf2',index = 'tf1',values = 'exists', fill_value=0)
df = df.add(z, fill_value=0).add(z.T, fill_value=0)
df = 1*(df>0)
# return adjacency matrix and tfs list
return(df, ppi_tfs)
def read_priors_table(table_filename, sample_col = 'sample', prior_col = 'prior'):
"""Read the table detailing the sample-prior dyads
Parameters
----------
table_filename :str
A csv that couples each sample with a prior
filename. The csv file can have any number of columns, but only
the sample_col and prior_col are read.
sample_col: str
name of the column storing the sample name (default: 'sample')
prior_col: str
name of the column storing the prior filename (default: 'prior')
Returns
-------
samples: list
list of samples that are defined
prior_dict: dict
dictionary sample:prior_filename
"""
with open(table_filename, 'r') as f:
df = pd.read_csv(table_filename, usecols=[sample_col, prior_col])
# check that sample-prior are unique
if len(df.drop_duplicates(subset = [sample_col, prior_col]))!=len(df):
print('Some samples are reported twice')
print(df[df.duplicated(subset = [sample_col, prior_col])])
df = df.drop_duplicates(subset = [sample_col, prior_col])
# check that there are no more samples duplicated
if len(df.drop_duplicates(subset = [sample_col]))!=len(df):
sys.exit('No unique sample-prior assignment, there are duplicated sample columns')
samples = df[sample_col].astype(str).values.tolist()
sample2prior_dict = {i[1]:i[2] for i in df.itertuples()}
prior2sample_dict = {}
for p,tab in df.groupby(prior_col):
prior2sample_dict[p] = tab.loc[:,sample_col].values.tolist()
return(samples, sample2prior_dict, prior2sample_dict)
def read_motif(motif_fn, tf_names = None, gene_names = None, pivot = True):
""" Read a motif edgelist, generates
Args:
motif_fn (_type_): filename of the motif edgelist
tf_names (_type_, optional): list of tf_names to be used. Defaults to None.
gene_names (_type_, optional): list of gene_names to be used. Defaults to None.
pivot (bool): if true returns a pivot tfs X genes table. Otherwise keeps the edgelist
Returns:
piv/df: motif as edgelist or pivot table
tftoadd: list of tf missing compared to the universe
genetoadd: list of genes missing compared to universe
"""
with open(motif_fn, 'r') as f:
df = pd.read_csv(f, sep= '\t', header = None)
tftoadd = {}
genetoadd = {}
presenttf = df.iloc[:,0].unique()
presentgene = df.iloc[:,1].unique()
ntfs = len(presenttf)
ngenes = len(presentgene)
if isinstance(tf_names, list):
# Check how many are removed
dtf = len(set(presenttf).difference(set(tf_names)))
if (dtf!=0):
print('Note: %d/%d tfs in the prior are not in the tf universe ' %(int(dtf),int(ntfs)))
# tfs to add
tftoadd = set(tf_names).difference(set(presenttf))
if isinstance(gene_names, list):
# Check how many are removed
dgene = len(set(presentgene).difference(set(gene_names)))
if (dgene!=0):
print('Note: %d/%d genes in the prior are not in the gene universe ' %(int(dgene), int(ngenes)))
# genes to add
genetoadd = set(gene_names).difference(set(presentgene))
# now if one or both tftoadd/genetoadd are not None, we add rows with zeros at the end
if (tftoadd or genetoadd):
usetfs = list(tftoadd)
usegenes = (genetoadd)
# if one of the two is None, add all the existing ones
if len(tftoadd)==0:
usetfs = [presenttf[0]]
if len(genetoadd)==0:
usegenes = [presentgene[0]]
# Here we add the nodes to the dataframe, by adding edges with zero values
# If there are no tfs/genes to add, only the first one is used. This allows
# to add nodes to the edgelist without adding too many zero edges.
toadd = pd.DataFrame(usetfs, columns = [0]).merge(pd.DataFrame(usegenes, columns = [1]), how='cross')
toadd[2] = 0
df = pd.concat([df,toadd])
if pivot:
piv = df.pivot_table(values=2, index=0, columns=1, fill_value=0)
if isinstance(tf_names, list):
piv = piv.loc[tf_names,:]
if isinstance(gene_names, list):
piv = piv.loc[:,gene_names]
return(piv, list(tftoadd), list(genetoadd))
else:
if isinstance(tf_names, list):
df = df[df[0].isin(tf_names)]
if isinstance(gene_names, list):
df = df[df[1].isin(gene_names)]
return(df, list(tftoadd), list(genetoadd))
def read_expression(expression_fn, header = 0, usecols = None, nrows = None):
"""Read expression data.
Parameters
-----------
expression_fn: str
filename of the expression file
header: str or int
header row
usecols:list
pass a list of the columns that need to be read
"""
with open(expression_fn, 'r') as f:
if expression_fn.endswith('.txt'):
df = pd.read_csv(f, sep = '\t', usecols = usecols, index_col=0, nrows=nrows)
elif expression_fn.endswith('.csv'):
df = pd.read_csv(f, sep = ' ', usecols = usecols, index_col=0, nrows=nrows)
elif expression_fn.endswith('.tsv'):
df = pd.read_csv(f, sep = '\t', usecols = usecols, index_col=0, nrows=nrows)
else:
sys.exit("Format of expression filename not recognised %s" %str(expression_fn))
return(df)
[docs]
def prepare_expression(expression_filename, samples = None):
""" Prepare main coexpression network by reading the expression file.
Parameters
----------
expression_filename :str
A table (tsv, csv, or txt) where each column is a sample
and each row is a gene. Values are expression.
samples: list
list of sample names. If None all samples are read (default: None)
Returns
---------
expression_data: pd.DataFrame
expression_genes:set
"""
# expression file is properly annotated with the sample name and
# a list of sample of interest is passed
print(samples)
if type(expression_filename) is str:
if (isinstance(samples, list)):
expression_data = read_expression(expression_filename, usecols = samples)
else:
expression_data = read_expression(expression_filename)
elif isinstance(expression_filename, pd.DataFrame):
if (isinstance(samples, list)):
expression_data = expression_filename.loc[:,samples]
else:
expression_data = expression_filename
else:
sys.exit('Expression filename needs to be either a table string or a panda dataframe')
# keep names of expression genes
expression_genes = set(expression_data.index.tolist())
if len(expression_data) != len(expression_genes):
print(
"Duplicate gene symbols detected. Consider averaging before running PANDA"
)
check_expression_integrity(expression_data)
return(expression_data, expression_genes)
def read_motif_universe(priors_dict, mode = 'union', tf_col = 0, gene_col = 1):
""" Read tf and gene names for all the priors and generate
a universe. By default the union of all gene names and tfs
is considered. Tables needs to be tab separated.
Parameters
-----------
priors_dict: dict
dictionary sample:prior_filename
mode: str
how to generate the universe. Default: union
tf_col:str or int
Column used to store the TF names
gene_col: str or int
Column used to store the gene names
Returns
--------
tfs:set
names of tfs
genes:set
names of genes
universe:set
name of both genes and tfs
"""
files = set(priors_dict.values())
tfs = set()
genes = set()
for fff in files:
with open(fff, 'r') as f:
df = pd.read_csv(f, sep= '\t', header = None)
tf = df.iloc[:,0].unique().tolist()
gene = df.iloc[:,1].unique().tolist()
if mode=='union':
tfs = tfs.union(set(tf))
genes = genes.union(set(gene))
universe = genes.union(tfs)
elif mode=='intersection':
tfs = tfs.intersection(set(tf))
genes = genes.intersection(set(gene))
universe = genes.intersection(tfs)
else:
sys.exit('Name %s is not a valid mode for prior tf/gene universe. Use union or intersection' %str(mode))
return(tfs, genes)