Source code for pycistarget.cluster_buster

import io
import logging
import numpy as np
import os
import pandas as pd
import pyranges as pr
import subprocess
import sys
import ray

from typing import Optional, Union
from typing import List, Iterable, Tuple, Dict

def cluster_buster(cbust_path: str,
                   path_to_motifs: str,
                   region_sets: Union[Dict[str, pr.PyRanges], Dict[str, List]] = None,
                   path_to_genome_fasta: str = None,
                   path_to_regions_fasta: str = None,
                   n_cpu: Optional[int] = 1,
                   motifs: Optional[List[str]] = None,
                   verbose: Optional[bool] = False,
                   **kwargs):
    """
    Add motif annotation

    Scores every sequence of a regions FASTA file with Cluster-Buster, one
    Ray task per motif file, and collects the scores into a single table.

    Parameters
    ---------
    cbust_path: str
        Path to cluster buster bin.
    path_to_motifs: str
        Path to motif collection folder (in .cb format).
    region_sets: Dict
        A dictionary of PyRanges containing region coordinates for the regions to be analyzed.
        Only required if the regions FASTA file has to be generated, i.e. when
        `path_to_regions_fasta` is not provided or does not exist yet.
    path_to_genome_fasta: str, optional.
        Path to genome fasta file. Only required if the regions FASTA file has to be generated.
        Default: None
    path_to_regions_fasta: str, optional.
        Path to regions fasta file. If the file does not exist it is created at this location
        from `region_sets` and `path_to_genome_fasta`. If None, the sequences are written to a
        temporary file that is removed once scoring has finished. Default: None
    n_cpu: int, optional
        Number of cores to use
    motifs: List, optional
        Names of the motif files to use (from `path_to_motifs`). Default: None (all files in
        `path_to_motifs` whose name contains '.cb')
    verbose: bool, optional
        Whether to print progress to screen
    **kwargs:
        Additional parameters to pass to `ray.init()`

    Returns
    ---------
    pd.DataFrame
        One row per motif (row labels are the motif file names with '.cb' removed) and one
        column per entry scored by Cluster-Buster; missing scores are filled with 0.

    Raises
    ---------
    ValueError
        If the regions FASTA file has to be generated but `region_sets` or
        `path_to_genome_fasta` is missing, or if there are no motifs to score.

    References
    ---------
    Frith, Martin C., Michael C. Li, and Zhiping Weng. "Cluster-Buster: Finding dense clusters of motifs in DNA sequences."
    Nucleic acids research 31, no. 13 (2003): 3666-3668.
    """
    # Local imports: only needed for the temporary FASTA fallback below.
    import shutil
    import tempfile

    # Create logger
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=logging.INFO, format=log_format, handlers=handlers)
    log = logging.getLogger('Cluster-Buster')

    # Fail fast on unusable input instead of crashing halfway through.
    generate_fasta = path_to_regions_fasta is None or not os.path.exists(path_to_regions_fasta)
    if generate_fasta and (region_sets is None or path_to_genome_fasta is None):
        raise ValueError(
            'Either an existing `path_to_regions_fasta`, or both `region_sets` and '
            '`path_to_genome_fasta`, must be provided.'
        )

    # Get motifs
    if motifs is None:
        motifs = grep(os.listdir(path_to_motifs), '.cb')
    if not motifs:
        raise ValueError('No motifs to score (no .cb files found in "' + str(path_to_motifs) + '").')

    tmp_dir = None
    try:
        # Generate fasta file
        if path_to_regions_fasta is None:
            # No output location was given: use a private temporary directory
            # so that a stale 'regions.fa' is never picked up by accident.
            tmp_dir = tempfile.mkdtemp(prefix='cluster_buster_')
            path_to_regions_fasta = os.path.join(tmp_dir, 'regions.fa')
        if generate_fasta:
            log.info('Getting sequences')
            # FASTA header lines ('>chrom:start-end') for every region set.
            names_by_set = {key: pyranges2names(regions) for key, regions in region_sets.items()}
            # One two-row frame (header / sequence) per region set, one column per region.
            pr_sequence_list = [
                pd.DataFrame(
                    [names_by_set[key],
                     pr.get_fasta(region_sets[key], path_to_genome_fasta).tolist()],
                    index=['Name', 'Sequence'],
                    columns=names_by_set[key],
                )
                for key in region_sets
            ]
            seq_df = pd.concat(pr_sequence_list, axis=1)
            # A region shared by several sets must be written only once.
            seq_df = seq_df.loc[:, ~seq_df.columns.duplicated()]
            # Transposed and newline-separated: each row becomes "header\nsequence".
            seq_df.T.to_csv(path_to_regions_fasta, header=False, index=False, sep='\n')
        else:
            # Called for its validation side effect: it terminates the program
            # when the FASTA file contains duplicated sequence names.
            get_sequence_names_from_fasta(path_to_regions_fasta)

        log.info('Scoring sequences')
        nr_motifs = len(motifs)
        ray.init(num_cpus=n_cpu, **kwargs)
        try:
            crm_scores = ray.get([
                run_cluster_buster_for_motif.remote(
                    cbust_path,
                    path_to_regions_fasta,
                    # os.path.join works with or without a trailing separator.
                    os.path.join(path_to_motifs, motif),
                    motif,
                    i,
                    nr_motifs,
                    verbose,
                )
                for i, motif in enumerate(motifs)
            ])
        finally:
            # Always release the Ray runtime, also when a task failed.
            ray.shutdown()
    finally:
        if tmp_dir is not None:
            shutil.rmtree(tmp_dir, ignore_errors=True)

    crm_df = pd.concat(crm_scores, axis=1, sort=False).fillna(0).T
    # Remove .cb from motifs names
    crm_df.index = [x.replace('.cb', '') for x in crm_df.index.tolist()]
    log.info('Done!')
    return crm_df
# Utils
@ray.remote
def run_cluster_buster_for_motif(cluster_buster_path: str,
                                 fasta_filename: str,
                                 motif_filename: str,
                                 motif_name: str,
                                 i: int,
                                 nr_motifs: int,
                                 verbose: Optional[bool] = False):
    """
    Ray method to run cluster buster for one motif

    Parameters
    ---------
    cluster_buster_path: str
        Path to cluster buster bin.
    fasta_filename: str
        Path to the FASTA file with the sequences to score.
    motif_filename: str
        Path to the motif file.
    motif_name: str
        Name given to the score column of the returned data frame.
    i: int
        Position of this motif among all motifs (only used in the progress message).
    nr_motifs: int
        Total number of motifs (only used in the progress message).
    verbose: bool, optional
        Whether to log a progress message for this motif.

    Returns
    ---------
    pd.DataFrame
        Cluster-Buster output parsed as a tab-separated table: indexed by its first
        column, with the second column (float32) renamed to `motif_name`. The first
        line of the output is treated as a header row and discarded.

    Notes
    ---------
    Terminates the process with exit status 1 when Cluster-Buster cannot be started
    or returns a non-zero exit status.
    """
    # Create logger
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=logging.INFO, format=log_format, handlers=handlers)
    log = logging.getLogger('Cluster-Buster')

    if verbose:
        log.info('Scoring motif ' + str(i) + ' out of ' + str(nr_motifs) + ' motifs')

    # Score each region in FASTA file with Cluster-Buster
    # for motif and get top CRM score for each region.
    clusterbuster_command = [cluster_buster_path,
                             '-f', '4',
                             '-c', '0.0',
                             '-r', '10000',
                             '-t', '1',
                             '-l',  # Mask repeats
                             motif_filename,
                             fasta_filename]
    command_text = ' '.join(clusterbuster_command)

    try:
        # Default buffering: line buffering (bufsize=1) is not supported for
        # binary pipes and only triggers a RuntimeWarning.
        pid = subprocess.Popen(args=clusterbuster_command,
                               stdin=None,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               close_fds=False,
                               shell=False)
        stdout_data, stderr_data = pid.communicate()
    except OSError as msg:
        print("\nExecution error for: '" + command_text + "': " + str(msg),
              file=sys.stderr)
        sys.exit(1)

    if pid.returncode != 0:
        print("\nError: Non-zero exit status for: '" + command_text + "'",
              file=sys.stderr)
        # Surface what Cluster-Buster itself reported instead of dropping it.
        if stderr_data:
            print(stderr_data.decode(errors='replace'), file=sys.stderr)
        sys.exit(1)

    crm_scores_df = pd.read_csv(
        filepath_or_buffer=io.BytesIO(stdout_data),
        sep='\t',
        header=0,
        names=['motifs', 'crm_score'],
        index_col='motifs',
        usecols=['motifs', 'crm_score'],
        dtype={'crm_score': np.float32},
        engine='c'
    )

    crm_scores_df.columns = [motif_name]
    return crm_scores_df

# Utils functions for Cluster-buster
def get_sequence_names_from_fasta(fasta_filename: str):
    """
    Retrieve sequence names from fasta

    The name of a sequence is everything after '>' up to the first whitespace
    of its header line.

    Parameters
    ---------
    fasta_filename: str
        Path to the FASTA file.

    Returns
    ---------
    List
        Sequence names in order of appearance.

    Notes
    ---------
    Every duplicated or empty sequence name is reported on stderr; if any was
    found, the program is terminated with exit status 1 after the whole file
    has been read.
    """
    sequence_names_list = []
    sequence_names_set = set()
    invalid_fasta = False

    with open(fasta_filename, 'r') as fh:
        for line_number, line in enumerate(fh, start=1):
            if not line.startswith('>'):
                continue

            # Get sequence name by getting everything after '>' up till the first whitespace.
            header_fields = line[1:].split(maxsplit=1)
            if not header_fields:
                # A bare '>' has no name: report it instead of failing with IndexError.
                print(
                    'Error: Empty sequence name on line {0:d} of FASTA file "{1:s}".'.format(
                        line_number,
                        fasta_filename
                    ),
                    file=sys.stderr
                )
                invalid_fasta = True
                continue
            sequence_name = header_fields[0]

            # Check if all sequence names only appear once.
            if sequence_name in sequence_names_set:
                print(
                    'Error: Sequence name "{0:s}" is not unique in FASTA file "{1:s}".'.format(
                        sequence_name,
                        fasta_filename
                    ),
                    file=sys.stderr
                )
                invalid_fasta = True

            sequence_names_list.append(sequence_name)
            sequence_names_set.add(sequence_name)

    if invalid_fasta:
        sys.exit(1)

    return sequence_names_list
def pyranges2names(regions: pr.PyRanges):
    """
    Convert pyranges to sequence name (fasta format)

    Builds one FASTA header line of the form '>Chromosome:Start-End' for every
    region, pairing up the `Chromosome`, `Start` and `End` attributes of
    `regions` position by position.
    """
    coordinates = zip(regions.Chromosome, regions.Start, regions.End)
    names = []
    for chromosome, region_start, region_end in coordinates:
        names.append(f'>{chromosome!s}:{region_start!s}-{region_end!s}')
    return names
def grep(l: List, s: str):
    """
    Helper for grep

    Returns, in their original order, the elements of `l` that contain `s`.
    """
    return list(filter(lambda candidate: s in candidate, l))