Skip to content

Download

download(census_version, dataset_id, base_dir)

Download cell type dataset from cellxgene census.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `census_version` | `str` | Version of the cellxgene census to use | *required* |
| `dataset_id` | `str` | ID of the dataset to download | *required* |
| `base_dir` | `Path` | Base directory for data downloads | *required* |
Source code in bionemo/geneformer/scripts/celltype_classification_bench/download.py
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
def download(census_version: str, dataset_id: str, base_dir: Path) -> None:
    """Download cell type dataset from cellxgene census and save it as h5ad.

    Queries the "Homo sapiens" data of the census for all cells whose
    ``dataset_id`` matches, then writes the result to
    ``base_dir / "hs-celltype-bench.h5ad"``. Progress is reported via ``print``.

    Args:
        census_version: Version of the cellxgene census to use
        dataset_id: ID of the dataset to download. The value is interpolated,
            unescaped, between double quotes in the census value filter.
        base_dir: Base directory for data downloads. Created (including
            parents) if it does not exist yet.
    """
    # Ensure base_dir exists
    base_dir.mkdir(parents=True, exist_ok=True)

    # Setup paths
    h5ad_outfile = base_dir / "hs-celltype-bench.h5ad"

    # Download data from census
    print(f"Downloading data from census version {census_version}")
    with cellxgene_census.open_soma(census_version=census_version) as census:
        adata = cellxgene_census.get_anndata(
            census,
            "Homo sapiens",
            obs_value_filter=f'dataset_id=="{dataset_id}"',
        )

    # Report the number of distinct cell types; only the count is needed,
    # so the labels are not sorted or materialised as a list.
    n_cell_types = len(adata.obs["cell_type"].unique())
    print(f"Found {n_cell_types} unique cell types")

    # No subsampling is applied: every cell is selected, in order.
    # np.arange always yields an integer index array, even when the query
    # matched zero cells (np.array([]) would be float64), and it avoids
    # building an intermediate Python list with one int per cell.
    selection = np.arange(len(adata))
    print(f"Selected {len(selection)} cells")

    # Subset and save data; the explicit subset + copy is kept so the object
    # written to disk goes through the same path as before.
    adata = adata[selection].copy()
    adata.write_h5ad(h5ad_outfile)
    print(f"Saved data to {h5ad_outfile}")

random_seed(seed)

Context manager to set the random seed for reproducibility.

Source code in bionemo/geneformer/scripts/celltype_classification_bench/download.py
27
28
29
30
31
32
33
34
35
@contextmanager
def random_seed(seed: int):
    """Temporarily seed Python's ``random`` module.

    On entry the current state of the global ``random`` generator is saved and
    the generator is re-seeded with ``seed``. On exit, whether the body
    finished normally or raised, the saved state is restored.

    Args:
        seed: Value passed to ``random.seed``.
    """
    previous_state = random.getstate()
    try:
        # Seeding inside the ``try`` is equivalent: if it fails, restoring the
        # snapshot taken above leaves the generator as it was.
        random.seed(seed)
        yield
    finally:
        random.setstate(previous_state)