BioLLM: CellType Annotation
1. Prediction
scGPT:
# Run cell-type annotation with scGPT, driven entirely by the TOML config below.
from biollm.tasks.cell_annotation import CellAnnotation
# Path is relative to the working directory; edit the input/output paths inside the file for your own data.
config_file = './config/anno/scgpt.toml'
obj = CellAnnotation(config_file)
# NOTE(review): presumably writes predictions (e.g. predict_list.pk) to the output directory set in the config — confirm.
obj.run()
Geneformer:
# Run cell-type annotation with Geneformer, driven entirely by the TOML config below.
from biollm.tasks.cell_annotation import CellAnnotation
# Path is relative to the working directory; edit the input/output paths inside the file for your own data.
config_file = './config/anno/geneformer.toml'
obj = CellAnnotation(config_file)
# NOTE(review): presumably writes predictions (e.g. predict_list.pk) to the output directory set in the config — confirm.
obj.run()
scFoundation:
# Run cell-type annotation with scFoundation, driven entirely by the TOML config below.
from biollm.tasks.cell_annotation import CellAnnotation
# Path is relative to the working directory; edit the input/output paths inside the file for your own data.
config_file = './config/anno/scfoundation.toml'
obj = CellAnnotation(config_file)
# NOTE(review): presumably writes predictions (e.g. predict_list.pk) to the output directory set in the config — confirm.
obj.run()
scBERT:
# Run cell-type annotation with scBERT, driven entirely by the TOML config below.
from biollm.tasks.cell_annotation import CellAnnotation
# Path is relative to the working directory; edit the input/output paths inside the file for your own data.
config_file = './config/anno/scbert.toml'
obj = CellAnnotation(config_file)
# NOTE(review): presumably writes predictions (e.g. predict_list.pk) to the output directory set in the config — confirm.
obj.run()
CellPLM:
# Run cell-type annotation with CellPLM, driven entirely by the TOML config below.
from biollm.tasks.cell_annotation import CellAnnotation
# Path is relative to the working directory; edit the input/output paths inside the file for your own data.
config_file = './config/anno/cellplm_ms.toml'
obj = CellAnnotation(config_file)
# NOTE(review): presumably writes predictions to the output directory set in the config — confirm.
obj.run()
Note: The configuration files can be found in the biollm/config/anno directory. Users should modify the corresponding parameters in these files to match their own input and output paths.
2. Evaluation
scGPT:
# Evaluate the scGPT predictions against the ground-truth labels of the dataset.
import pickle

import scanpy as sc
from sklearn.metrics import accuracy_score, f1_score

path = './output/scgpt/'  # the outputdir in the config file.

# Use a context manager so the file handle is closed deterministically.
# NOTE: unpickling can execute arbitrary code; only load prediction files you produced yourself.
with open(path + 'predict_list.pk', 'rb') as handle:
    predict_label = pickle.load(handle)

adata = sc.read_h5ad('./zheng68k.h5ad')
labels = adata.obs['celltype'].values
# NOTE(review): assumes predict_label follows the same cell order and label
# encoding as adata.obs['celltype'] — confirm against the annotation output.

acc = accuracy_score(labels, predict_label)
macro_f1 = f1_score(labels, predict_label, average='macro')
res = {'acc': acc, 'macro_f1': macro_f1}
print(acc, macro_f1)
Geneformer:
# Evaluate the Geneformer predictions against the ground-truth labels of the dataset.
import pickle

import scanpy as sc
from sklearn.metrics import accuracy_score, f1_score

path = './output/geneformer/'  # the outputdir in the config file.

# Use a context manager so the file handle is closed deterministically.
# NOTE: unpickling can execute arbitrary code; only load prediction files you produced yourself.
with open(path + 'predict_list.pk', 'rb') as handle:
    predict_label = pickle.load(handle)

adata = sc.read_h5ad('./zheng68k.h5ad')
labels = adata.obs['celltype'].values
# NOTE(review): assumes predict_label follows the same cell order and label
# encoding as adata.obs['celltype'] — confirm against the annotation output.

acc = accuracy_score(labels, predict_label)
macro_f1 = f1_score(labels, predict_label, average='macro')
res = {'acc': acc, 'macro_f1': macro_f1}
print(acc, macro_f1)
scFoundation:
# Evaluate the scFoundation predictions against the ground-truth labels of the dataset.
import pickle

import scanpy as sc
from sklearn.metrics import accuracy_score, f1_score

path = './output/scfoundation/'  # the outputdir in the config file.

# Use a context manager so the file handle is closed deterministically.
# NOTE: unpickling can execute arbitrary code; only load prediction files you produced yourself.
with open(path + 'predict_list.pk', 'rb') as handle:
    predict_label = pickle.load(handle)

adata = sc.read_h5ad('./zheng68k.h5ad')
labels = adata.obs['celltype'].values
# NOTE(review): assumes predict_label follows the same cell order and label
# encoding as adata.obs['celltype'] — confirm against the annotation output.

acc = accuracy_score(labels, predict_label)
macro_f1 = f1_score(labels, predict_label, average='macro')
res = {'acc': acc, 'macro_f1': macro_f1}
print(acc, macro_f1)
scBERT:
# Evaluate the scBERT predictions against the ground-truth labels of the dataset.
import pickle

import scanpy as sc
from sklearn.metrics import accuracy_score, f1_score

path = './output/scbert/'  # the outputdir in the config file.

# Use a context manager so the file handle is closed deterministically.
# NOTE: unpickling can execute arbitrary code; only load prediction files you produced yourself.
with open(path + 'predict_list.pk', 'rb') as handle:
    predict_label = pickle.load(handle)

adata = sc.read_h5ad('./zheng68k.h5ad')
labels = adata.obs['celltype'].values
# NOTE(review): assumes predict_label follows the same cell order and label
# encoding as adata.obs['celltype'] — confirm against the annotation output.

acc = accuracy_score(labels, predict_label)
macro_f1 = f1_score(labels, predict_label, average='macro')
res = {'acc': acc, 'macro_f1': macro_f1}
print(acc, macro_f1)
3. Visualization
import pandas as pd
from typing import Optional
from plottable import ColumnDefinition, Table
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import LinearSegmentedColormap
import matplotlib
from plottable.plots import bar
# Index label of the row that tags each column as "Accuracy" or "Macro F1".
_METRIC_TYPE = "Metric Type"


def _reversed_cmap(base_name: str, start: float, stop: float = 1.0, n_colors: int = 256) -> LinearSegmentedColormap:
    """Return the ``[start, stop]`` slice of the named colormap, in reversed order."""
    sampled = plt.get_cmap(base_name)(np.linspace(start, stop, n_colors))
    return LinearSegmentedColormap.from_list(f"{base_name}_reversed", sampled[::-1], N=n_colors)


def _bar_column_definitions(df, columns, cmap) -> list:
    """Build one bar-chart ``ColumnDefinition`` per column, grouped by its metric type."""
    return [
        ColumnDefinition(
            col,
            width=1,
            # NOTE(review): presumably strips the ".1"-style suffix pandas adds
            # to duplicated CSV headers — confirm against the input file.
            title=col.split('.')[0],
            plot_fn=bar,
            plot_kw={
                "cmap": cmap,
                "plot_bg_bar": False,
                "annotate": True,
                "height": 0.9,
                "formatter": "{:.2f}",
            },
            group=df.loc[_METRIC_TYPE, col],
        )
        for col in columns
    ]


def plot_results_table(df, show: bool = True, save_path: Optional[str] = None) -> Table:
    """Plot the benchmarking results as bar charts for Accuracy and Macro F1 using different colormaps.

    Parameters
    ----------
    df
        Results table indexed by dataset. It must contain a row labelled
        ``"Metric Type"`` whose values are ``"Accuracy"`` or ``"Macro F1"``
        for the columns to be plotted. ``df`` itself is not modified.
    show
        Whether to show the plot.
    save_path
        The path to save the plot to. If `None`, the plot is not saved.

    Returns
    -------
    The ``plottable`` table that was drawn.

    Notes
    -----
    Sets ``pdf.fonttype`` to 42 in the global matplotlib rcParams.
    """
    # Delete the 'Metric Type' row as it does not need to be displayed in the final table.
    # ``drop`` returns a new frame, so the caller's ``df`` stays untouched.
    plot_df = df.drop(_METRIC_TYPE, axis=0)
    num_embeds = plot_df.shape[0]

    # Extract columns for "Accuracy" and "Macro F1"
    metric_types = df.loc[_METRIC_TYPE]
    accuracy_cols = df.columns[metric_types == "Accuracy"]
    macro_f1_cols = df.columns[metric_types == "Macro F1"]

    # The string-valued 'Metric Type' row forces object dtype on every column
    # (values are plain strings when the table comes from a CSV); the bar plots
    # and the "{:.2f}" formatter need real numbers.
    for col in list(accuracy_cols) + list(macro_f1_cols):
        plot_df[col] = pd.to_numeric(plot_df[col])

    # Add "Dataset" as a column so it can serve as the table's index column
    plot_df["Dataset"] = plot_df.index

    accuracy_cmap = _reversed_cmap('PRGn', 0.25)
    macro_f1_cmap = _reversed_cmap('YlGnBu', 0.0)

    column_definitions = [
        ColumnDefinition("Dataset", width=1.5, textprops={"ha": "left", "weight": "bold"}),
    ]
    column_definitions += _bar_column_definitions(df, accuracy_cols, accuracy_cmap)
    column_definitions += _bar_column_definitions(df, macro_f1_cols, macro_f1_cmap)

    plt.rcParams['pdf.fonttype'] = 42  # Set PDF font type
    # Keep text as text in SVG output so it can be edited afterwards.
    with matplotlib.rc_context({"svg.fonttype": "none"}):
        fig, ax = plt.subplots(figsize=(len(df.columns) * 1.3, 3 + 0.35 * num_embeds))
        ax.patch.set_facecolor("white")
        tab = Table(
            plot_df,
            cell_kw={
                "linewidth": 0,
                "edgecolor": "k",
            },
            column_definitions=column_definitions,
            ax=ax,
            row_dividers=True,
            footer_divider=True,
            textprops={"fontsize": 10, "ha": "center"},
            row_divider_kw={"linewidth": 1, "linestyle": (0, (1, 5))},
            col_label_divider_kw={"linewidth": 1, "linestyle": "-"},
            column_border_kw={"linewidth": 1, "linestyle": "-"},
            index_col="Dataset",
        ).autoset_fontcolors(colnames=plot_df.columns)
        # Save inside the rc_context (font settings are read at render time)
        # and before show(): on interactive backends the figure may be torn
        # down once the window is closed, which would save an empty image.
        if save_path is not None:
            fig.savefig(save_path, facecolor=ax.get_facecolor(), dpi=300)
    if show:
        plt.show()
    return tab
# annotation_performance.csv holds the model performance (Accuracy and Macro F1)
# of four models and three other annotation tools (scANVI, celltypist and singleR)
# on different datasets; the "dataset" column becomes the table index.
df = pd.read_csv('./annotation_performance.csv').set_index("dataset")
plot_results_table(df)
Sample Data

Figure
