Source code for regdiffusion.data.microglia

import numpy as np
import scanpy as sc
import pandas as pd
from tqdm import tqdm
import zipfile
import os
from .utils import download_file


def load_atlas_microglia(data_dir='data') -> sc.AnnData:
    """Load the microglia single-cell data from Broad Institute SCP795.

    Data Source:
        https://singlecell.broadinstitute.org/single_cell/study/SCP795/a-transcriptomic-atlas-of-the-mouse-cerebellum#study-summary

    Paper:
        A transcriptomic atlas of mouse cerebellar cortex comprehensively
        defines cell types

    Paper Link:
        https://www.nature.com/articles/s41586-021-03220-z

    Raw data is count data. We select all genes that have non-zero
    expression. We also removed all gene models, mitochondrial genes, and
    ribosome genes. We used log-plus-one to transform the count data.

    Args:
        data_dir (str): Parent directory to save and load the data. If the
            path does not exist, it will be created (including any missing
            parent directories). Data will be saved in a subdirectory
            under the provided path.

    Returns:
        sc.AnnData: AnnData object where rows are cells and columns are
        genes, holding log1p-transformed counts.
    """
    file_dir = os.path.join(data_dir, 'scp795_microglia')
    # makedirs (unlike mkdir) also creates missing parent directories and
    # is a no-op when the directory is already there.
    os.makedirs(file_dir, exist_ok=True)

    h5ad_path = os.path.join(file_dir, 'scp795_microglia.h5ad')
    # Gate the download on the data file itself rather than on the
    # directory: an interrupted earlier download leaves the directory
    # behind without the file, and must not block a retry.
    if not os.path.exists(h5ad_path):
        download_regdiffusion_data(file_dir, 'atlas_microglia.zip')

    ann_dt = sc.read_h5ad(h5ad_path)
    ann_dt.X = ann_dt.X.toarray()
    # Transpose so that rows are cells and columns are genes.
    ann_dt = ann_dt.transpose()

    # Keep only genes with at least one count.
    sc.pp.filter_genes(ann_dt, min_counts=1)

    # Drop gene models ('Gm'), mitochondrial genes ('mt') and ribosomal
    # genes ('Rpl', 'Rps') by gene-name prefix.
    for prefix in ('Gm', 'mt', 'Rpl', 'Rps'):
        ann_dt = ann_dt[:, ~ann_dt.var_names.str.startswith(prefix)]

    ann_dt = sc.pp.log1p(ann_dt, copy=True)
    return ann_dt
def load_hammond_microglia(data_dir='data'):
    """Load the microglia single-cell data from the Hammond dataset.

    Data Source:
        https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE121654

    Paper:
        Single-Cell RNA Sequencing of Microglia throughout the Mouse
        Lifespan and in the Injured Brain Reveals Complex Cell-State Changes

    Paper Link:
        https://www.cell.com/immunity/fulltext/S1074-7613(18)30485-0

    IMPORTANT! This is not the complete data from the study. We only
    selected data from the 4 adult male mice at P100. Here are their
    accession IDs:

        GSM3442026  P100 male no 1
        GSM3442027  P100 male no 2
        GSM3442030  P100 male no 3
        GSM3442031  P100 male no 4

    Raw data has already been log transformed. We select all genes that
    have non-zero expression. We also removed all gene models,
    mitochondrial genes, and ribosome genes.

    Args:
        data_dir (str): Parent directory to save and load the data. If the
            path does not exist, it will be created (including any missing
            parent directories). Data will be saved in a subdirectory
            under the provided path.

    Returns:
        AnnData object where rows are cells and columns are genes.
    """
    file_dir = os.path.join(data_dir, 'hammond_microglia')
    # makedirs (unlike mkdir) also creates missing parent directories and
    # is a no-op when the directory is already there.
    os.makedirs(file_dir, exist_ok=True)

    csv_path = os.path.join(file_dir, 'hammond_male_p100_microglia.csv')
    # Gate the download on the data file itself rather than on the
    # directory: an interrupted earlier download leaves the directory
    # behind without the file, and must not block a retry.
    if not os.path.exists(csv_path):
        download_regdiffusion_data(file_dir, 'hammond_microglia.zip')

    ann_dt = sc.read_csv(csv_path)
    # Transpose so that rows are cells and columns are genes.
    ann_dt = ann_dt.transpose()

    # Values are already log transformed, so a tiny positive threshold is
    # used to keep only genes with non-zero total expression.
    sc.pp.filter_genes(ann_dt, min_counts=0.0001)

    # Drop gene models ('Gm'), mitochondrial genes ('mt') and ribosomal
    # genes ('Rpl', 'Rps') by gene-name prefix.
    for prefix in ('Gm', 'mt', 'Rpl', 'Rps'):
        ann_dt = ann_dt[:, ~ann_dt.var_names.str.startswith(prefix)]

    return ann_dt
def download_regdiffusion_data(save_dir, file_name, remove_zip=True):
    """Download a zip archive from the regdiffusion site and unpack it.

    The archive is fetched from
    ``https://bcb.cs.tufts.edu/regdiffusion/<file_name>``, written to
    ``save_dir`` and every member is extracted into ``save_dir`` with a
    progress bar.

    Args:
        save_dir (str): Existing directory that receives the archive and
            its extracted contents.
        file_name (str): Name of the zip file on the server; also used as
            the local archive name.
        remove_zip (bool): If True, delete the downloaded archive once
            extraction has finished.

    Raises:
        Exception: If ``save_dir`` does not exist.
    """
    if not os.path.exists(save_dir):
        raise Exception("save_dir does not exist")

    archive_path = os.path.join(save_dir, file_name)
    source_url = f'https://bcb.cs.tufts.edu/regdiffusion/{file_name}'
    download_file(source_url, archive_path)

    with zipfile.ZipFile(archive_path, "r") as archive:
        members = archive.namelist()
        # Extract member by member so tqdm can report progress.
        for member in tqdm(members, desc='Extracting', total=len(members)):
            archive.extract(member, path=save_dir)

    if remove_zip:
        os.remove(archive_path)