# Source code for regdiffusion.data.microglia
import numpy as np
import scanpy as sc
import pandas as pd
from tqdm import tqdm
import zipfile
import os
from .utils import download_file
# [docs]
def load_atlas_microglia(data_dir='data') -> sc.AnnData:
    """
    Load single cell data for microglia from Broad Institute SCP795.

    Data Source: https://singlecell.broadinstitute.org/single_cell/study/SCP795/a-transcriptomic-atlas-of-the-mouse-cerebellum#study-summary

    Paper: A transcriptomic atlas of mouse cerebellar cortex comprehensively
    defines cell types

    Paper Link: https://www.nature.com/articles/s41586-021-03220-z

    Raw data is count data. We select all genes that have non-zero expression.
    We also remove all gene models, mitochondrial genes, and ribosome genes
    (selected by the gene-name prefixes ``Gm``, ``mt``, ``Rpl`` and ``Rps``).
    We use log-plus-one to transform the count data.

    The archive is downloaded only when the extracted ``.h5ad`` file is not
    already present in the data subdirectory.

    Args:
        data_dir (str): Parent directory to save and load the data. If the
            path does not exist, it will be created (including any missing
            parent directories). Data will be saved in a subdirectory under
            the provided path.

    Returns:
        sc.AnnData: Log-transformed expression data where rows are cells and
        columns are genes.
    """
    file_dir = f'{data_dir}/scp795_microglia/'
    # makedirs also creates a missing data_dir (even a nested one) and does
    # not fail when the directories already exist.
    os.makedirs(file_dir, exist_ok=True)

    h5ad_path = f'{file_dir}scp795_microglia.h5ad'
    # Fetch the archive only when the extracted file is missing, so repeated
    # calls reuse the local copy and an earlier failed download is retried.
    if not os.path.exists(h5ad_path):
        download_regdiffusion_data(file_dir, 'atlas_microglia.zip')

    ann_dt = sc.read_h5ad(h5ad_path)
    ann_dt.X = ann_dt.X.toarray()
    # The stored matrix is transposed so that rows become cells.
    ann_dt = ann_dt.transpose()
    sc.pp.filter_genes(ann_dt, min_counts=1)

    # Build one combined mask instead of chaining four separate subsets.
    # NOTE(review): these are plain prefix matches, so any gene symbol that
    # merely starts with one of the prefixes is dropped as well.
    excluded = np.zeros(ann_dt.n_vars, dtype=bool)
    for prefix in ('Gm', 'mt', 'Rpl', 'Rps'):
        excluded |= np.asarray(
            ann_dt.var_names.str.startswith(prefix), dtype=bool)
    ann_dt = ann_dt[:, ~excluded]

    # copy=True returns a new transformed object rather than modifying the
    # subset view in place.
    return sc.pp.log1p(ann_dt, copy=True)
# [docs]
def load_hammond_microglia(data_dir='data'):
    """
    Load single cell data for microglia from the Hammond Microglia dataset.

    Data Source: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE121654

    Paper: Single-Cell RNA Sequencing of Microglia throughout the Mouse
    Lifespan and in the Injured Brain Reveals Complex Cell-State Changes

    Paper Link: https://www.cell.com/immunity/fulltext/S1074-7613(18)30485-0

    IMPORTANT! This is not the complete data from the study. We only selected
    data from the 4 adult male mice at P100. Here are their accession IDs.

        GSM3442026 P100 male no 1
        GSM3442027 P100 male no 2
        GSM3442030 P100 male no 3
        GSM3442031 P100 male no 4

    Raw data has already been log transformed. We select all genes that have
    non-zero expression. We also remove all gene models, mitochondrial genes,
    and ribosome genes (selected by the gene-name prefixes ``Gm``, ``mt``,
    ``Rpl`` and ``Rps``).

    The archive is downloaded only when the extracted CSV file is not already
    present in the data subdirectory.

    Args:
        data_dir (str): Parent directory to save and load the data. If the
            path does not exist, it will be created (including any missing
            parent directories). Data will be saved in a subdirectory under
            the provided path.

    Returns:
        An AnnData object where rows are cells and columns are genes.
    """
    file_dir = f'{data_dir}/hammond_microglia/'
    # makedirs also creates a missing data_dir (even a nested one) and does
    # not fail when the directories already exist.
    os.makedirs(file_dir, exist_ok=True)

    # file_dir already ends with a slash, so no extra separator is added.
    csv_path = f'{file_dir}hammond_male_p100_microglia.csv'
    # Fetch the archive only when the extracted file is missing, so repeated
    # calls reuse the local copy and an earlier failed download is retried.
    if not os.path.exists(csv_path):
        download_regdiffusion_data(file_dir, 'hammond_microglia.zip')

    ann_dt = sc.read_csv(csv_path)
    # The stored table is transposed so that rows become cells.
    ann_dt = ann_dt.transpose()
    # Values are already log transformed, hence the small positive threshold
    # instead of an integer count.
    sc.pp.filter_genes(ann_dt, min_counts=0.0001)

    # Build one combined mask instead of chaining four separate subsets.
    # NOTE(review): these are plain prefix matches, so any gene symbol that
    # merely starts with one of the prefixes is dropped as well.
    excluded = np.zeros(ann_dt.n_vars, dtype=bool)
    for prefix in ('Gm', 'mt', 'Rpl', 'Rps'):
        excluded |= np.asarray(
            ann_dt.var_names.str.startswith(prefix), dtype=bool)
    return ann_dt[:, ~excluded]
def download_regdiffusion_data(save_dir, file_name, remove_zip=True):
    """
    Download a regdiffusion zip archive and extract it into ``save_dir``.

    The archive is requested from
    ``https://bcb.cs.tufts.edu/regdiffusion/{file_name}``, stored as
    ``save_dir/file_name`` and every member is extracted into ``save_dir``
    with a progress bar.

    Args:
        save_dir (str): Existing directory that receives the archive and the
            extracted files.
        file_name (str): Name of the zip archive on the server; also used as
            the local file name.
        remove_zip (bool): If True, delete the downloaded archive after it
            has been extracted.

    Raises:
        FileNotFoundError: If ``save_dir`` does not exist.
    """
    if not os.path.exists(save_dir):
        # A specific subclass of Exception, so existing `except Exception`
        # handlers keep working.
        raise FileNotFoundError("save_dir does not exist")

    zip_path = os.path.join(save_dir, file_name)
    # download_file comes from .utils; presumably it stores the URL content
    # at zip_path -- confirm against that helper.
    download_file(
        f'https://bcb.cs.tufts.edu/regdiffusion/{file_name}',
        zip_path)

    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        # Read the member list once and reuse it for iteration and the total.
        members = zip_ref.namelist()
        for member in tqdm(members, desc='Extracting', total=len(members)):
            zip_ref.extract(member=member, path=save_dir)

    # Remove the archive only after the handle above has been closed.
    if remove_zip:
        os.remove(zip_path)