Source code for regdiffusion.data.beeline

import numpy as np
import scanpy as sc
import pandas as pd
from tqdm import tqdm
import zipfile
import os
from .utils import download_file

def load_beeline_ground_truth(data_dir):
    """Read the ground truth table stored as ``label.csv`` in ``data_dir``.

    Args:
        data_dir (str): Directory that contains ``label.csv``.

    Returns:
        numpy.ndarray: The contents of the CSV (header row excluded), as
        returned by ``DataFrame.values``.
    """
    label_path = f'{data_dir}/label.csv'
    return pd.read_csv(label_path).values

# For each BEELINE dataset name, the positions of the elements (obtained by
# splitting a cell name on the separator) that make up the cell type label.
# load_beeline passes the entry for the requested dataset to
# cell_type_separator as ``cell_type_element_indices``.
beeline_cell_type_dict = {
    'hESC': [1],
    'hHep': [1],
    'mDC': [1],
    'mESC': [2],
    'mHSC-E': [0],
    'mHSC-GM': [0],
    'mHSC-L': [0]
}

def cell_type_separator(sc_data, cell_type_element_indices=(0,), sep='_'):
    """Derive a cell type label and an integer code for every cell name.

    Each entry of ``sc_data.obs_names`` is split on ``sep``; the elements at
    the positions given by ``cell_type_element_indices`` are joined back
    together with ``sep`` to form that cell's type label.

    Args:
        sc_data: Object exposing ``obs_names``, an iterable of cell name
            strings.
        cell_type_element_indices (sequence of int): Positions, within the
            split cell name, of the elements that form the cell type label.
        sep (str): Separator used both to split the cell names and to join
            the selected elements.

    Returns:
        tuple: ``(cell_types, cell_type_indices)``. ``cell_types`` is a list
        with one label per cell, in the order of ``obs_names``.
        ``cell_type_indices`` is a list of the same length holding the
        integer code of each label; codes are assigned in sorted label order.

    Raises:
        IndexError: If a cell name has fewer elements than a requested
            position.
    """
    cell_types = []
    for obs_name in sc_data.obs_names:
        elements = obs_name.split(sep)
        cell_types.append(
            sep.join(elements[i] for i in cell_type_element_indices))
    # Enumerate the *sorted* unique labels: iterating a bare set of strings
    # yields an order that changes between interpreter runs (hash
    # randomization), which made the codes non-reproducible.
    code_by_type = {
        label: code for code, label in enumerate(sorted(set(cell_types)))
    }
    cell_type_indices = [code_by_type[label] for label in cell_types]
    return cell_types, cell_type_indices

def load_beeline(data_dir='data', benchmark_data='hESC',
                 benchmark_setting='500_STRING'):
    """
    Load BEELINE data and its ground truth (download if necessary).

    Paper: Benchmarking algorithms for gene regulatory network inference from
    single-cell transcriptomic data

    Paper Link: https://www.nature.com/articles/s41592-019-0690-6

    BEELINE consists of 7 single-cell datasets (``hESC``, ``hHep``, ``mDC``,
    ``mESC``, ``mHSC-E``, ``mHSC-GM``, and ``mHSC-L``) and 3 sets of ground
    truth networks (``STRING``, ``Non-ChIP``, ``ChIP-seq``).

    Args:
        data_dir (str): Parent directory to save and load the data. If the
            path does not exist, it will be created (including any missing
            intermediate directories). Data will be saved in a subdirectory
            under the provided path.
        benchmark_data (str): Benchmark datasets. Choose among "hESC",
            "hHep", "mDC", "mESC", "mHSC-E", "mHSC-GM", and "mHSC-L".
        benchmark_setting (str): Benchmark settings. Choose among
            "500_STRING", "1000_STRING", "500_Non-ChIP", "1000_Non-ChIP",
            "500_ChIP-seq", "1000_ChIP-seq", "500_lofgof", and "1000_lofgof".
            If either of the "lofgof" settings is chosen, only "mESC" data is
            available.

    Returns:
        tuple: A tuple containing two objects for a single BEELINE benchmark.
        The first element is a scanpy AnnData with cells on rows and genes on
        columns. Second element is a numpy array for the adjacency list of
        the ground truth network.
    """
    if not os.path.exists(data_dir):
        # makedirs (not mkdir) so that a nested data_dir whose parents are
        # also missing is created as documented; exist_ok tolerates another
        # process creating it in the meantime.
        os.makedirs(data_dir, exist_ok=True)
    if not os.path.exists(f'{data_dir}/BEELINE/'):
        download_beeline(data_dir)
    data_dir = f'{data_dir}/BEELINE/{benchmark_setting}_{benchmark_data}'
    data = sc.read(f'{data_dir}/data.csv')
    # We do need to transpose the data to have cells on rows and genes on
    # columns
    data = data.transpose()
    cell_types, cell_type_indices = cell_type_separator(
        data, beeline_cell_type_dict[benchmark_data])
    data.obs['cell_type'] = cell_types
    data.obs['cell_type_index'] = cell_type_indices
    ground_truth = load_beeline_ground_truth(data_dir)
    return data, ground_truth
def download_beeline(save_dir, remove_zip=True):
    """Download the BEELINE archive into ``save_dir`` and extract it there.

    The archive is saved as ``BEELINE.zip`` inside ``save_dir`` and every
    member is extracted under ``save_dir`` with a progress bar.

    Args:
        save_dir (str): Existing directory that receives the archive and the
            extracted files.
        remove_zip (bool): If true, delete ``BEELINE.zip`` once extraction
            has finished.

    Raises:
        FileNotFoundError: If ``save_dir`` does not exist.
    """
    if not os.path.exists(save_dir):
        # FileNotFoundError is still caught by callers using
        # ``except Exception`` but tells them what actually went wrong.
        raise FileNotFoundError("save_dir does not exist")
    zip_path = os.path.join(save_dir, 'BEELINE.zip')
    download_file('https://bcb.cs.tufts.edu/DAZZLE/BEELINE.zip', zip_path)
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        # Read the member list once; it drives both the loop and the
        # progress bar total.
        members = zip_ref.namelist()
        for member in tqdm(desc='Extracting', iterable=members,
                           total=len(members)):
            zip_ref.extract(member=member, path=save_dir)
    if remove_zip:
        os.remove(zip_path)