Source code for pycistarget.motif_enrichment_dem

from ctxcore.rnkdb import FeatherRankingDatabase
from ctxcore.genesig import GeneSignature
import io
import logging
import numpy as np
import os
import pandas as pd
import pyranges as pr
import random
import ray
import re
from scipy.stats import ranksums
import scipy.sparse as sparse
from sklearn.metrics import roc_curve, auc
import ssl
import subprocess
import sys
from typing import Union, Dict, Sequence, Optional

from IPython.display import HTML
ssl._create_default_https_context = ssl._create_unverified_context
pd.set_option('display.max_colwidth', None)

# Set stderr to null when using ray.init to avoid ray printing Broken pipe million times
_stderr = sys.stderr                                                         
null = open(os.devnull,'wb') 

from .cluster_buster import *
from .utils import *

# DEM database
[docs]class DEMDatabase: 
    """
    DEM Database class.
    :class:`DEMDatabase` contains a dataframe with motifs as rows, regions as columns and CRM scores as
    values. In addition, is contains a slot to map query regions to regions in the database. For more
    information on how to generate databases, please visit: https://github.com/aertslab/create_cisTarget_databases
    
    Attributes
    ---------
    regions_to_db: pd.DataFrame
        A dataframe containing the mapping between query regions and regions in the database.
    db_scores: pd.DataFrame
        A dataframe with motifs as rows, regions as columns and CRM scores as values.
    total_regions: int
        Total number of regions in the database
    """
    def __init__(self, 
                fname: str,
                region_sets: Dict[str, pr.PyRanges] = None,
                name: str = None,
                fraction_overlap: float = 0.4):
        """
        Initialize DEMDatabase
        
        Parameters
        ---------
        fname: str
            Path to feather file containing the DEM database (regions_vs_motifs)
        region_sets: Dict or pr.PyRanges, optional
            Dictionary or pr.PyRanges that are going to be analyzed with DEM. Default: None.
        name: str, optional
            Name for the DEM database. Default: None
        fraction_overlap: float, optional
            Minimal overlap between query and regions in the database for the mapping.     
        """
        self.regions_to_db, self.db_scores = self.load_db(fname,
                                                          region_sets,
                                                          name,
                                                          fraction_overlap)
    
[docs]    def load_db(self,
                fname: str,
                region_sets: Dict[str, pr.PyRanges] = None,
                name: str = None,
                fraction_overlap: float = 0.4):
        """
        Load DEMDatabase
        
        Parameters
        ---------
        fname: str
            Path to feather file containing the DEM database (regions_vs_motifs)
        region_sets: Dict or pr.PyRanges, optional
            Dictionary or pr.PyRanges that are going to be analyzed with DEM. Default: None.
        name: str, optional
            Name for the DEM database. Default: None
        fraction_overlap: float, optional
            Minimal overlap between query and regions in the database for the mapping.     
            
        Return
        ---------
        target_to_db_dict: pd.DataFrame
            A dataframe containing the mapping between query regions and regions in the database.
        db_rankings: pd.DataFrame
            A dataframe with motifs as rows, regions as columns and CRM scores as values.
        total_regions: int
            Total number of regions in the database
        """
        #Create logger
        level    = logging.INFO
        format   = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
        handlers = [logging.StreamHandler(stream=sys.stdout)]
        logging.basicConfig(level = level, format = format, handlers = handlers)
        log = logging.getLogger('DEM')
        
        log.info('Reading DEM database')
        
        if name is None:
            name = os.path.basename(fname)
        db = FeatherRankingDatabase(fname, name=name)
        db_regions = db.genes
        if region_sets is not None:
            target_to_db_dict = {x: target_to_query(region_sets[x], list(db_regions)) for x in region_sets.keys()}
            target_regions_in_db = list(set(sum([target_to_db_dict[x]['Query'].tolist() for x in target_to_db_dict.keys()],[])))
            target_regions_in_db = GeneSignature(name=name, gene2weight=target_regions_in_db)
            db_scores = db.load(target_regions_in_db)
        else:
            target_to_db_dict = None
            db_scores = db.load_full()
        return target_to_db_dict, db_scores

[docs]class DEM():
    """
    DEM class.
    :class:`DEM` contains DEM method for motif enrichment analysis on sets of regions. 
    
    Attributes
    ---------
    regions_to_db: pd.DataFrame
        A dataframe containing the mapping between query regions and regions in the database.
    region_sets: Dict
        A dictionary of PyRanges containing region coordinates for the regions to be analyzed.
    specie: str
        Specie from which genomic coordinates come from.
    subset_motifs: List, optional
        List of motifs to disregard in the analysis. Default: None
    contrasts: List, optional
        List of contrasts to perform. Default: None (Each group versus all the rest)
    name: str
        Analysis name
    max_bg_regions: int, optional
        Maximum number of regions to use as background. Default: None (All)
    adjpval_thr: float, optional
        Adjusted p-value threshold to consider a motif enriched. Default: 0.05
    log2fc_thr: float, optional
        Log2 Fold-change threshold to consider a motif enriched. Default: 1
    mean_fg_thr: float, optional
        Minimul mean signal in the foreground to consider a motif enriched. Default: 0
    motif_hit_thr: float, optional
        Minimal CRM score to consider a region enriched for a motif. Default: None (It will be automatically
        calculated based on precision-recall).
    n_cpu: int, optional
        Number of cores to use. Default: 1
    fraction_overlap: float, optional
        Minimal overlap between query and regions in the database for the mapping.
    cluster_buster_path: str, optional
        Path to cluster buster bin. Only required if using a shuffled background. Default: None
    path_to_genome_fasta: str, optional.
         Path to genome fasta file. Only required if using a shuffled background. Default: None
    path_to_motifs: str, optional.
         Path to motif collection folder (in .cb format). Only required if using a shuffled background. 
         Default: None
    genome_annotation: pr.PyRanges, optional.
         Pyranges containing genome annotation (e.g. biomart). Only required if using promoter balance. 
         Default: None
    promoter_space: int, optional
        Space around TSS to consider a region promoter. Only used if using promoter balance.
        Default: 1000
    path_to_motif_annotations: str, optional
        Path to motif annotations. If not provided, they will be downloaded from 
        https://resources.aertslab.org based on the specie name provided (only possible for mus_musculus,
        homo_sapiens and drosophila_melanogaster). Default: None
    motif_similarity_fdr: float, optional
        Minimal motif similarity value to consider two motifs similar. Default: 0.001
    orthologous_identity_threshold: float, optional
        Minimal orthology value for considering two TFs orthologous. Default: 0.0
    motifs_to_use: List, optional
        A subset of motifs to use for the analysis. Default: None (All)
    tmp_dir: str, optional
        Temp directory to use if running cluster_buster. Default: None (\tmp)
    motif_enrichment: pd.DataFrame
        A dataframe containing motif enrichment results
    motif_hits: Dict
        A dictionary containing regions that are considered enriched for each motif.
    cistromes: Dict
        A dictionary containing TF cistromes. Cistromes with no extension contain regions linked to directly
        annotated motifs, while '_extended' cistromes can contain regions linked to motifs annotated by 
        similarity or orthology.
    """
    def __init__(self,
                 dem_db,
                 region_sets: Dict[str, pr.PyRanges],
                 specie: str,
                 subset_motifs: Optional[List[str]] = None,
                 contrasts: Optional[Union[str,List]] = 'Other',
                 name: Optional[str] = 'DEM',
                 max_bg_regions: int = None,
                 adjpval_thr: Optional[float] = 0.05,
                 log2fc_thr: Optional[float] = 1,
                 mean_fg_thr: Optional[float] = 0,
                 motif_hit_thr : Optional[float] = None,
                 n_cpu: Optional[int] = 1,
                 fraction_overlap: float = 0.4,
                 cluster_buster_path: str = None,
                 path_to_genome_fasta: str = None,
                 path_to_motifs: str = None,
                 genome_annotation: pr.PyRanges = None,
                 promoter_space: int = 1000,
                 path_to_motif_annotations: str = None,
                 annotation_version: str = 'v9',
                 motif_annotation: list = ['Direct_annot', 'Motif_similarity_annot', 'Orthology_annot', 'Motif_similarity_and_Orthology_annot'],
                 motif_similarity_fdr: float = 0.001,
                 orthologous_identity_threshold: float = 0.0,
                 tmp_dir: int = None,
                 **kwargs):
        """
        Initialize DEM 
    
        Parameters
        ---------
        dem_db: :class:`DEMDatabase`
            A DEM database object.
        region_sets: Dict
            A dictionary of PyRanges containing region coordinates for the regions to be analyzed.
        specie: str
            Specie from which genomic coordinates come from.
        subset_motifs: List, optional
            List of motifs to disregard in the analysis. Default: None
        contrasts: List, optional
            List of contrasts to perform. Default: None (Each group versus all the rest)
        name: str
            Analysis name
        max_bg_regions: int, optional
            Maximum number of regions to use as background. Default: None (All)
        adjpval_thr: float, optional
            Adjusted p-value threshold to consider a motif enriched. Default: 0.05
        log2fc_thr: float, optional
            Log2 Fold-change threshold to consider a motif enriched. Default: 1
        mean_fg_thr: float, optional
            Minimal mean signal in the foreground to consider a motif enriched. Default: 0
        motif_hit_thr: float, optional
            Minimal CRM score to consider a region enriched for a motif. Default: None (It will be automatically
            calculated based on precision-recall).
        n_cpu: int, optional
            Number of cores to use. Default: 1
        fraction_overlap: float, optional
            Minimal overlap between query and regions in the database for the mapping.
        cluster_buster_path: str, optional
            Path to cluster buster bin. Only required if using a shuffled background. Default: None
        path_to_genome_fasta: str, optional.
            Path to genome fasta file. Only required if using a shuffled background. Default: None
        path_to_motifs: str, optional.
            Path to motif collection folder (in .cb format). Only required if using a shuffled background. 
            Default: None
        genome_annotation: pr.PyRanges, optional.
            Pyranges containing genome annotation (e.g. biomart). Only required if using promoter balance. 
            Default: None
        promoter_space: int, optional
            Space around TSS to consider a region promoter. Only used if using promoter balance.
            Default: 1000
        path_to_motif_annotations: str, optional
            Path to motif annotations. If not provided, they will be downloaded from 
            https://resources.aertslab.org based on the specie name provided (only possible for mus_musculus,
            homo_sapiens and drosophila_melanogaster). Default: None
        motif_similarity_fdr: float, optional
            Minimal motif similarity value to consider two motifs similar. Default: 0.001
        orthologous_identity_threshold: float, optional
            Minimal orthology value for considering two TFs orthologous. Default: 0.0
        motifs_to_use: List, optional
            A subset of motifs to use for the analysis. Default: None (All)
        tmp_dir: str, optional
            Temp directory to use if running cluster_buster. Default: None (\tmp)
        **kwargs:
            Additional parameters to pass to `ray.init()`
        """
        # DEM db
        if dem_db is not None:
            if isinstance(dem_db, str):
                dem_db = DEMDatabase(dem_db,
                                     region_sets,
                                     name = name,
                                     fraction_overlap = fraction_overlap)
            self.regions_to_db = dem_db.regions_to_db
            if subset_motifs is not None:
                dem_db.db_scores = dem_db.db_scores.loc[subset_motifs,:]
        # Other params
        self.region_sets = region_sets
        self.specie = specie
        self.subset_motifs= subset_motifs
        self.contrasts = contrasts
        self.name = name
        self.max_bg_regions = max_bg_regions
        self.adjpval_thr = adjpval_thr
        self.log2fc_thr = log2fc_thr
        self.mean_fg_thr = mean_fg_thr
        self.motif_hit_thr = motif_hit_thr
        self.n_cpu = n_cpu
        # For Shuffle background options
        self.cluster_buster_path = cluster_buster_path
        self.path_to_genome_fasta =  path_to_genome_fasta
        self.path_to_motifs = path_to_motifs
        # For promoter awareness
        self.genome_annotation = genome_annotation
        self.promoter_space = promoter_space
        # For annotation
        self.annotation_version = annotation_version
        self.motif_annotation = motif_annotation
        self.path_to_motif_annotations = path_to_motif_annotations
        self.motif_similarity_fdr = motif_similarity_fdr
        self.orthologous_identity_threshold = orthologous_identity_threshold
        # Tmp
        self.tmp_dir = tmp_dir
        # Info
        self.motif_enrichment = None
        self.motif_hits = None
        self.cistromes = None
        if dem_db is not None:
            self.run(dem_db.db_scores, **kwargs)
        
[docs]    def run(self, dem_db_scores, **kwargs):
        """
        Run DEM
    
        Parameters
        ---------
        dem_db_scores: pd.DataFrame
            A dataframe containing maximum CRM score for each motif in each regions.
        **kwargs
            Additional parameters to pass to `ray.init()`
        """
        # Create logger
        level    = logging.INFO
        format   = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
        handlers = [logging.StreamHandler(stream=sys.stdout)]
        logging.basicConfig(level = level, format = format, handlers = handlers)
        log = logging.getLogger('DEM')
        
        contrast_keys=[x for x in self.region_sets.keys()]
        
        region_sets_names = {key: self.regions_to_db[key]['Query'].tolist() for key in self.regions_to_db.keys()}

        if self.contrasts == 'Other':
            if len(self.region_sets) > 1:
                levels=list(self.region_sets.keys())
                contrasts=[[[x], levels[:levels.index(x)] + levels[levels.index(x)+1:]] for x in levels]
                contrasts_names=levels
            else:
                contrasts = [[self.region_sets.keys()[0]],['Shuffle']]
                contrasts_names = [self.region_sets.keys()[0] + '_VS_Shuffle']  
                
        elif isinstance(self.contrasts, list):
            contrasts = self.contrasts
            contrasts_names=['_'.join(contrasts[i][0]) + '_VS_' +'_'.join(contrasts[i][1]) for i in range(len(contrasts))]
            for i in range(len(contrasts)):
                self.regions_to_db[contrasts_names[i]] = pd.concat([self.regions_to_db[x] for x in contrasts[i][0]])
                    
        elif self.contrasts == 'Shuffle':
            levels=list(self.region_sets.keys())
            contrasts=[[[x], 'Shuffle'] for x in levels]
            contrasts_names=levels
        
        # Get region groups
        log.info('Creating contrast groups')
        region_groups = [create_groups(contrast = contrasts[x],
                                   region_sets_names = region_sets_names,
                                   max_bg_regions = self.max_bg_regions,
                                   path_to_genome_fasta = self.path_to_genome_fasta,
                                   path_to_regions_fasta = os.path.join(self.tmp_dir, contrasts_names[x] +'.fa'),  
                                   cbust_path = self.cluster_buster_path,
                                   path_to_motifs = self.path_to_motifs,
                                   annotation = self.genome_annotation,
                                   promoter_space = self.promoter_space,
                                   motifs = dem_db_scores.index.tolist(),
                                   n_cpu = self.n_cpu,
                                   **kwargs) for x in range(len(contrasts))]

        # Compute p-val and log2FC
        if self.n_cpu > len(region_groups):
            self.n_cpu = len(region_groups)
        
        if self.n_cpu > 1:
            ray.init(num_cpus=self.n_cpu, **kwargs)
            sys.stderr = null
            DEM_list = ray.get([DEM_internal_ray.remote(dem_db_scores,
                                             region_groups[i],
                                             contrasts_names[i],
                                             adjpval_thr = self.adjpval_thr,
                                             log2fc_thr = self.log2fc_thr,
                                             mean_fg_thr = self.mean_fg_thr,
                                             motif_hit_thr = self.motif_hit_thr) for i in range(len(contrasts))])
            ray.shutdown()
            sys.stderr = sys.__stderr__ 
        else:
            DEM_list = [DEM_internal(dem_db_scores,
                                region_groups[i],
                                contrasts_names[i],
                                adjpval_thr = self.adjpval_thr,
                                log2fc_thr = self.log2fc_thr,
                                mean_fg_thr = self.mean_fg_thr,
                                motif_hit_thr = self.motif_hit_thr) for i in range(len(contrasts))]
        self.motif_enrichment = {contrasts_names[i]: DEM_list[i][0] for i in range(len(DEM_list))} 
        db_motif_hits =  {contrasts_names[i]: DEM_list[i][1] for i in range(len(DEM_list))} 
        # Add annotation and logo
        self.add_motif_annotation_dem()
        # Format motif hits
        rs_motif_hits = {name_1: {name_2: list(set(self.regions_to_db[name_1].loc[self.regions_to_db[name_1]['Query'].isin(db_motif_hits[name_1][name_2]), 'Target'].tolist())) for name_2 in db_motif_hits[name_1].keys()} for name_1 in contrasts_names}
        self.motif_hits = {'Database': db_motif_hits, 'Region_set': rs_motif_hits}
        # TF cistromes
        log.info('Forming cistromes')
        cistromes_db = {key: get_cistromes_per_region_set(self.motif_enrichment[key], self.motif_hits['Database'][key], self.motif_annotation) for key in self.motif_enrichment.keys()}
        cistromed_rs = {key: get_cistromes_per_region_set(self.motif_enrichment[key], self.motif_hits['Region_set'][key], self.motif_annotation) for key in self.motif_enrichment.keys()}
        self.cistromes = {'Database': cistromes_db, 'Region_set': cistromed_rs}
        log.info('Done!')
        
[docs]    def add_motif_annotation_dem(self,
                       add_logo: Optional[bool] = True):
        """
        Add motif annotation

        Parameters
        ---------
        add_logo: boolean, optional
            Whether to add the motif logo to the motif enrichment dataframe
        """
        # Create DEM logger
        level = logging.INFO
        format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
        handlers = [logging.StreamHandler(stream=sys.stdout)]
        logging.basicConfig(level=level, format=format, handlers=handlers)
        log = logging.getLogger('DEM')

        # Read motif annotation. 
        try:
            annot_df = load_motif_annotations(self.specie,
                                          self.annotation_version,
                                          fname=self.path_to_motif_annotations,
                                          motif_similarity_fdr = self.motif_similarity_fdr,
                                          orthologous_identity_threshold = self.orthologous_identity_threshold)
            motif_enrichment_dict_w_annot = {key: pd.concat([self.motif_enrichment[key], annot_df], axis=1, sort=False).loc[self.motif_enrichment[key].index.tolist(),:] for key in self.motif_enrichment.keys()}
        except:
            log.info('Unable to load annotation for ' + self.specie)
            annot_df = None
            motif_enrichment_dict_w_annot = self.motif_enrichment
        # Add info to elements in dict
        if add_logo == True:
            for key in self.motif_enrichment.keys():
                motif_enrichment_dict_w_annot[key]['Logo']=['<img src="' +'https://motifcollections.aertslab.org/' + self.annotation_version + '/logos/'+ motif_enrichment_dict_w_annot[key].index.tolist()[i] + '.png' + '" width="200" >' for i in range(motif_enrichment_dict_w_annot[key].shape[0])]
            if annot_df is not None:
                motif_enrichment_dict_w_annot = {key: motif_enrichment_dict_w_annot[key][sum([['Logo', 'Contrast'], self.motif_annotation, ['Log2FC', 'Adjusted_pval', 'Mean_fg', 'Mean_bg', 'Motif_hit_thr', 'Motif_hits']], [])] for key in motif_enrichment_dict_w_annot.keys()}
            else:
                motif_enrichment_dict_w_annot = {key: motif_enrichment_dict_w_annot[key][['Logo', 'Contrast', 'Log2FC', 'Adjusted_pval', 'Mean_fg', 'Mean_bg', 'Motif_hit_thr', 'Motif_hits']] for key in motif_enrichment_dict_w_annot.keys()}
        else:
            if annot_df is not None:
                motif_enrichment_dict_w_annot = {key: motif_enrichment_dict_w_annot[key][sum([['Contrast'], self.motif_annotation, ['Log2FC', 'Adjusted_pval', 'Mean_fg', 'Mean_bg', 'Motif_hit_thr', 'Motif_hits']],[])] for key in motif_enrichment_dict_w_annot.keys()}
            else:
                motif_enrichment_dict_w_annot = {key: motif_enrichment_dict_w_annot[key][['Contrast', 'Log2FC', 'Adjusted_pval', 'Mean_fg', 'Mean_bg', 'Motif_hit_thr', 'Motif_hits']] for key in motif_enrichment_dict_w_annot.keys()}
        
        self.motif_enrichment = motif_enrichment_dict_w_annot 
    
        
[docs]    def DEM_results(self,
                    name: Optional[str] = None):
        """
        Print motif enrichment table as HTML
        """
        motif_enrichment_dict = self.motif_enrichment
        if name is None:
            motif_enrichment_table=pd.concat([motif_enrichment_dict[key] for key in motif_enrichment_dict.keys()], axis=0, sort=False)
        else:
            motif_enrichment_table=motif_enrichment_dict[name]
        return HTML(motif_enrichment_table.to_html(escape=False, col_space=80))
        

# Utils
## Shuffle sequences for shuffle background
[docs]def shuffle_sequence(sequence: str):
    """
    Shuffle given sequence
    """
    shuffled_sequence = np.frombuffer(sequence.encode('utf-8'), dtype='uint8')
    np.random.shuffle(shuffled_sequence)
    return shuffled_sequence.tobytes().decode('utf-8')
    
## Create groups to compare
[docs]def create_groups(contrast: list,
                  region_sets_names: list,
                  max_bg_regions: int,
                  path_to_genome_fasta: str,
                  path_to_regions_fasta: str,
                  cbust_path: str,
                  path_to_motifs: str,
                  annotation: pr.PyRanges = None,
                  promoter_space: int = 1000,
                  motifs: list = None,
                  n_cpu: int = 1,
                  **kwargs):
    """"
    Format contrast groups
    """
    # Create DEM logger
    level = logging.INFO
    format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=format, handlers=handlers)
    log = logging.getLogger('DEM')
    foreground = list(set(sum([region_sets_names[key] for key in contrast[0]],[])))
    if contrast[1] != 'Shuffle':
        background = list(set(sum([region_sets_names[key] for key in contrast[1]],[])))
        background = list(np.setdiff1d(background,foreground))
        if max_bg_regions is not None:
            if annotation is None:
                random.Random(555).shuffle(background)
                background = background[0:max_bg_regions]
            else:
                # Annotation of promoters
                annotation['End'] = annotation['Start']+promoter_space
                annotation['Start'] = annotation['Start']-promoter_space
                annotation = pr.PyRanges(annotation[['Chromosome', 'Start', 'End']])
                # Nr of promoters in the foreground
                fg_pr_overlap = pr.PyRanges(region_names_to_coordinates(foreground)).count_overlaps(annotation)
                fg_pr = coord_to_region_names(fg_pr_overlap[fg_pr_overlap.NumberOverlaps != 0])
                if len(fg_pr) == len(foreground):
                    nr_pr = max_bg_regions
                elif len(fg_pr) == 0:
                    nr_pr = 0
                else:
                    fg_pr = coord_to_region_names(fg_pr_overlap[fg_pr_overlap.NumberOverlaps != 0][['Chromosome', 'Start', 'End']])
                    fg_no_pr =  coord_to_region_names(fg_pr_overlap[fg_pr_overlap.NumberOverlaps == 0][['Chromosome', 'Start', 'End']])
                    nr_pr = int(max_bg_regions*(len(fg_pr)/(len(fg_pr) + len(fg_no_pr))))
                
                nr_no_pr = max_bg_regions-nr_pr
                # Nr of promoters in the background
                bg_pr_overlap = pr.PyRanges(region_names_to_coordinates(background)).count_overlaps(annotation)
                bg_pr = coord_to_region_names(bg_pr_overlap[bg_pr_overlap.NumberOverlaps != 0][['Chromosome', 'Start', 'End']])
                bg_no_pr = coord_to_region_names(bg_pr_overlap[bg_pr_overlap.NumberOverlaps == 0][['Chromosome', 'Start', 'End']])
                if len(bg_pr) < nr_pr:
                    nr_pr = len(bg_pr)
                nr_no_pr = max_bg_regions-nr_pr
                if len(bg_no_pr) < nr_no_pr:
                    nr_no_pr = len(bg_no_pr)
                # For promoters   
                bg_pr_subset = bg_pr.copy()
                random.Random(555).shuffle(bg_pr_subset)
                bg_pr = bg_pr_subset[0:nr_pr]
                # For others        
                bg_no_pr_subset = bg_no_pr.copy()
                random.Random(555).shuffle(bg_no_pr_subset)
                bg_no_pr = bg_no_pr_subset[0:nr_no_pr]         
                background = bg_no_pr+bg_pr 
        
    else:
        log.info('Generating and scoring shuffled background')
        if max_bg_regions is None:
            background_pr = pr.PyRanges(region_names_to_coordinates(foreground))
            background_sequences = pd.DataFrame([foreground, pr.get_fasta(background_pr, path_to_genome_fasta).tolist()], index=['Name', 'Sequence']).T
        else:
            if annotation is not None:
                annotation['End'] = annotation['Start']+promoter_space
                annotation['Start'] = annotation['Start']-promoter_space
                annotation = pr.PyRanges(annotation[['Chromosome', 'Start', 'End']])
                fg_pr_overlap = pr.PyRanges(region_names_to_coordinates(foreground)).count_overlaps(annotation)
                if len(fg_pr_overlap) == len(foreground):
                    nr_pr = len(foreground)
                elif len(fg_pr_overlap) == 0:
                    nr_pr = 0
                else:
                    fg_pr = coord_to_region_names(fg_pr_overlap[fg_pr_overlap.NumberOverlaps != 0][['Chromosome', 'Start', 'End']])
                    fg_no_pr =  coord_to_region_names(fg_pr_overlap[fg_pr_overlap.NumberOverlaps == 0][['Chromosome', 'Start', 'End']])
                    nr_pr = int(max_bg_regions*(len(fg_pr)/(len(fg_pr) + len(fg_no_pr))))
                nr_no_pr = max_bg_regions-nr_pr
                # For promoters           
                fg_pr_subset = fg_pr.copy()
                random.Random(555).shuffle(fg_pr_subset)
                bg_pr = fg_pr_subset[0:nr_pr]
                # For others        
                fg_no_pr_subset = fg_no_pr.copy()
                random.Random(555).shuffle(fg_no_pr_subset)
                bg_no_pr = fg_no_pr_subset[0:nr_no_pr] 
                region_names = bg_no_pr+bg_pr
                background_pr = pr.PyRanges(region_names_to_coordinates(region_names))  
            else:
                foreground_subset = foreground.copy()
                random.Random(555).shuffle(foreground_subset)
                region_names = foreground_subset[0:max_bg_regions]
                background_pr = pr.PyRanges(region_names_to_coordinates(region_names))   
                                          
            background_sequences = pd.DataFrame([region_names, pr.get_fasta(background_pr, path_to_genome_fasta).tolist()], index=['Name', 'Sequence']).T
            background_sequences['Sequence'] = [shuffle_sequence(x) for x in background_sequences['Sequence']] 
            background_sequences['Name'] = '>' + background_sequences['Name'] 
            background_sequences.to_csv(path_to_regions_fasta, header=False, index=False, sep='\n')
        # Motifs should include .cb
        motifs = [motif + '.cb' for motif in motifs]
        background  = cluster_buster(cbust_path, path_to_motifs, path_to_regions_fasta=path_to_regions_fasta, n_cpu=n_cpu, motifs=motifs, **kwargs)
    return [foreground, background]
 
## Ray function for getting LogFC, pAdj and motif hits between groups   
@ray.remote
def DEM_internal_ray(dem_db_scores: pd.DataFrame,
            region_group: List[List[str]],
            contrast_name: str,
            adjpval_thr: Optional[float] = 0.05,
            log2fc_thr: Optional[float] = 1,
            mean_fg_thr: Optional[float] = 0,
            motif_hit_thr: Optional[float] = None):
    """"
    Internal DEM function to use with ray.
    """
            
    return DEM_internal(dem_db_scores, region_group, contrast_name, adjpval_thr, log2fc_thr, mean_fg_thr, motif_hit_thr)



[docs]def DEM_internal(dem_db_scores: pd.DataFrame,
            region_group: List[List[str]],
            contrast_name: str,
            adjpval_thr: Optional[float] = 0.05,
            log2fc_thr: Optional[float] = 1,
            mean_fg_thr: Optional[float] = 0,
            motif_hit_thr: Optional[float] = None):
    """
    Internal operations for DEM.
    """
    # Create DEM logger
    level = logging.INFO
    format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=format, handlers=handlers)
    log = logging.getLogger('DEM')

    motifs = dem_db_scores.index.tolist()
    fg_mat = sparse.csc_matrix(dem_db_scores.loc[:,region_group[0]].values).toarray()
    
    if isinstance(region_group[1], list):
        bg_mat = sparse.csc_matrix(dem_db_scores.loc[:,region_group[1]].values).toarray()
    else:
        bg_mat = sparse.csc_matrix(region_group[1].values).toarray()
    
    # Delete db
    del dem_db_scores

    # Wilcox
    log.info('Computing DEM for ' + contrast_name)
    wilcox_test = [ranksums(fg_mat[x], y=bg_mat[x]) for x in range(fg_mat.shape[0])]
    # Log2FC
    mean_fg = fg_mat.mean(axis=1)
    mean_bg = bg_mat.mean(axis=1)
    logFC = np.log2((mean_fg + 10 ** -12) / (mean_bg + 10 ** -12)).tolist()

    # P-value correction
    pvalue = [wilcox_test[x].pvalue for x in range(len(wilcox_test))]
    adj_pvalue = p_adjust_bh(pvalue)
    name = [contrast_name] * len(adj_pvalue)
    # Motif df
    motif_df = pd.DataFrame([logFC, adj_pvalue, mean_fg, mean_bg, name], index=[
                                     'Log2FC', 'Adjusted_pval', 'Mean_fg', 'Mean_bg', 'Contrast'], columns=motifs).transpose()
    motif_df = motif_df.loc[motif_df['Adjusted_pval']
                                              <= adjpval_thr, :]
    motif_df = motif_df.loc[motif_df['Log2FC']
                                              >= log2fc_thr, :]
    motif_df = motif_df.loc[motif_df['Mean_fg']
                                              >= mean_fg_thr, :]
    motif_df = motif_df.sort_values(
        ['Log2FC', 'Adjusted_pval'], ascending=[False, True])
    
    # Motif hits versus background
    keep_motifs = motif_df.index.tolist()
    keep_motifs_index = get_position_index(keep_motifs, motifs)
    scores_mat = sparse.vstack([fg_mat[keep_motifs_index,].T, bg_mat[keep_motifs_index,].T], format='csr').T
    regions = region_group[0] + ['Bg']*bg_mat.shape[1]
    labels = [1]*fg_mat.shape[1] + [0]*bg_mat.shape[1]
    motif_hits_list = [get_motif_hits(scores_mat[i], regions, labels, motif_hit_thr) for i in range(len(keep_motifs))]
    motif_hits = {keep_motifs[i]: motif_hits_list[i][0] for i in range(len(keep_motifs))}
    motif_df['Motif_hit_thr'] = [motif_hits_list[i][1] for i in range(len(keep_motifs))]
    motif_df['Motif_hits'] = [len(motif_hits_list[i][0]) for i in range(len(keep_motifs))]
    motif_df['Motif_hits'] = motif_df['Motif_hits'].astype(int)
    return motif_df, motif_hits

# Helper function to adjust p-value
[docs]def p_adjust_bh(p: float):
    """
    Benjamini-Hochberg p-value correction for multiple hypothesis testing.
    """
    p = np.asfarray(p)
    by_descend = p.argsort()[::-1]
    by_orig = by_descend.argsort()
    steps = float(len(p)) / np.arange(len(p), 0, -1)
    q = np.minimum(1, np.minimum.accumulate(steps * p[by_descend]))
    return q[by_orig]

# Helper function to determine optimal threshold for motif hits
[docs]def get_motif_hits(scores, regions, labels, optimal_threshold=None):
    """
    Determine optimal score threshold based on precision-recall.
    """
    df = pd.DataFrame([labels, scores.toarray()[0]], columns=regions, index=['Label', 'Score']).sample(frac=1).T
    if optimal_threshold is None:
        df = df[df['Score'] > 0]
        fpr, tpr, thresholds = roc_curve(df['Label'], df['Score'])
        optimal_idx = np.argmax(tpr - fpr)
        optimal_threshold = thresholds[optimal_idx]
    motif_hits = df[(df['Score'] > optimal_threshold) & (df['Label'] == 1)].index.to_list()
    return motif_hits, optimal_threshold