Vignette#
Exemplary analysis of the PBMC3K single-cell RNA dataset.
[1]:
import os
import math
import warnings
import matplotlib.pyplot as plt
import pandas as pd
import scanpy as sc
from keggtools import (
Pathway,
Enrichment,
EnrichmentResult,
Resolver,
Renderer,
IMMUNE_SYSTEM_PATHWAYS,
plot_enrichment_result,
)
from IPython.display import Image, display
# Used folders
figure_dir = os.path.join(os.getcwd(), "figures")
# Global settings
sc.settings.verbosity = 0
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor="white")
# Ignore all warnings
warnings.filterwarnings("ignore", category=UserWarning)
scanpy==1.10.4 anndata==0.10.9 umap==0.5.1 numpy==2.0.2 scipy==1.14.1 pandas==2.2.3 scikit-learn==1.5.2 statsmodels==0.14.4 igraph==0.11.8 pynndescent==0.5.13
Load preprocessed PMBC3K dataset#
[2]:
adata = sc.datasets.pbmc3k_processed()
Plot UMAP with cluster annotations#
[3]:
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
sc.pl.umap(adata, color="louvain", legend_loc="right margin", title="", frameon=True, ax=ax, show=False)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
fig.savefig(os.path.join(figure_dir, "figure2.png"))
data:image/s3,"s3://crabby-images/62770/6277054368dcd83d08d85fe6e39975e4d5db2f9f" alt="_images/vignette_5_0.png"
Differential analysis results to pandas dataframe#
[4]:
sc.tl.rank_genes_groups(adata, "louvain", method="wilcoxon", n_genes=100)
[5]:
# Export diff exp data
diffexp_df = sc.get.rank_genes_groups_df(adata, group="NK cells", pval_cutoff=0.05, log2fc_min=1)
diffexp_df.head()
[5]:
names | scores | logfoldchanges | pvals | pvals_adj | |
---|---|---|---|---|---|
0 | NKG7 | 20.181747 | 5.623642 | 1.416465e-90 | 1.942540e-86 |
1 | GZMB | 19.738880 | 6.650631 | 9.996947e-87 | 6.854906e-83 |
2 | PRF1 | 19.434803 | 5.433477 | 3.919145e-84 | 1.791572e-80 |
3 | GNLY | 19.357994 | 6.657686 | 1.745530e-83 | 5.984549e-80 |
4 | CTSW | 18.578722 | 3.808060 | 4.777493e-77 | 1.310371e-73 |
Perform KEGGTOOLS enrichment analysis#
[6]:
organism_id: str = "hsa"
[7]:
from keggtools.utils import merge_entrez_geneid
[8]:
diffexp_df = merge_entrez_geneid(diffexp_df)
diffexp_df.head()
[8]:
names | scores | logfoldchanges | pvals | pvals_adj | symbol | entrez | |
---|---|---|---|---|---|---|---|
0 | NKG7 | 20.181747 | 5.623642 | 1.416465e-90 | 1.942540e-86 | NKG7 | 4818 |
1 | GZMB | 19.738880 | 6.650631 | 9.996947e-87 | 6.854906e-83 | GZMB | 3002 |
2 | PRF1 | 19.434803 | 5.433477 | 3.919145e-84 | 1.791572e-80 | PRF1 | 5551 |
3 | GNLY | 19.357994 | 6.657686 | 1.745530e-83 | 5.984549e-80 | GNLY | 10578 |
4 | CTSW | 18.578722 | 3.808060 | 4.777493e-77 | 1.310371e-73 | CTSW | 1521 |
[9]:
# Init resolver instance
resolver: Resolver = Resolver()
pathway_list: list[Pathway] = []
# Download all immune system pathways
for number, _ in IMMUNE_SYSTEM_PATHWAYS.items():
pathway_list.append(resolver.get_pathway(organism=organism_id, code=number))
[10]:
# Init enrichment from list of immune system pathways
analysis: Enrichment = Enrichment(pathways=pathway_list)
analysis_result: list[EnrichmentResult] = analysis.run_analysis(gene_list=list(diffexp_df["entrez"]))
result_df: pd.DataFrame = analysis.to_dataframe()
# Filter out pathways with no genes found
result_df = result_df[result_df["study_count"] > 0]
# Print out result
result_df.head()
[10]:
pathway_name | pathway_title | pathway_id | study_count | pathway_genes | pvalue | found_genes | |
---|---|---|---|---|---|---|---|
0 | path:hsa04640 | Hematopoietic cell lineage | 04640 | 1 | 100 | 0.183444 | 924 |
1 | path:hsa04610 | Complement and coagulation cascades | 04610 | 1 | 88 | 0.259797 | 3689 |
2 | path:hsa04611 | Platelet activation | 04611 | 2 | 126 | 0.235741 | 5908,2207 |
3 | path:hsa04620 | Toll-like receptor signaling pathway | 04620 | 3 | 109 | 0.799511 | 6352,6348,6351 |
4 | path:hsa04621 | NOD-like receptor signaling pathway | 04621 | 1 | 189 | 0.005539 | 6352 |
Plot results of enrichment analysis#
[11]:
fig, ax = plt.subplots(figsize=(7, 7))
plot_enrichment_result(analysis, ax=ax)
fig.savefig(os.path.join(figure_dir, "figure4.png"), bbox_inches="tight")
data:image/s3,"s3://crabby-images/c0eb7/c0eb7e6a8675e5d566c41120b18fff8dff9cead8" alt="_images/vignette_16_0.png"
Plot pathway#
“Antigen processing and presentation” (hsa:04612) show a significant p value
[12]:
pathway: Pathway = resolver.get_pathway(organism=organism_id, code="04650")
[13]:
# diffexp_df[["entrez"]] = diffexp_df[["entrez"]].astype(int)
overlay: dict = dict(zip(list(diffexp_df["entrez"]), list(diffexp_df["logfoldchanges"]), strict=True))
[14]:
renderer: Renderer = Renderer(kegg_pathway=pathway, gene_dict=overlay, cache_or_resolver=resolver.storage)
renderer.render(display_unlabeled_genes=True)
[15]:
# Save dot string to file
with open(os.path.join(figure_dir, "figure5.dot"), "w") as file_obj:
file_obj.write(renderer.to_string())
# Save binary data to file
renderer.to_file(filename=os.path.join(figure_dir, "figure5.png"), extension="png")
[16]:
# Display image
img: Image = Image(os.path.join(figure_dir, "figure5.png"))
display(img)
data:image/s3,"s3://crabby-images/3b069/3b069e03db38dce5595fd0f0cd935be3160c7e89" alt="_images/vignette_22_0.png"