scRNA-seq Analysis of Human Skin Samples

Original paper: Solé-Boldo, L., Raddatz, G., Schütz, S. et al. Single-cell transcriptomes of the human skin reveal age-related loss of fibroblast priming. Commun Biol 3, 188 (2020). https://doi.org/10.1038/s42003-020-0922-4

The paper analyzes five human skin samples from young and old donors and compares them to understand how skin cells change with age. The skin was taken from a sun-protected region to avoid the effects of UV radiation, and the combined dataset includes around 16 thousand cells before QC.

Here, we'll focus primarily on one young sample and try to replicate the paper's findings about fibroblast subpopulations. We won't go as deep as the study's authors do, but we will additionally explore sample integration and functional annotation.

Table of contents

  1. Technical preparation
  2. Cleaning and quality control
  3. Dimensionality reduction and clustering
  4. Marker genes
  5. Functional analysis
  6. Sample integration with Harmony
  7. Closing thoughts

Technical preparation

This notebook is intended to run with a kernel located at /mnt/storage/r0978323/system/.local/share/jupyter/kernels/project_1/kernel.json. It should be provided with the submission, but either way, its purpose is to set up a Python 3.11.11 environment with all the necessary packages. A requirements.txt file with the following contents should replicate the environment:

scanpy[leiden]
anndata==0.10.9
scrublet
numpy==1.26
gprofiler-official
harmonypy

A copy of the output of running the notebook should be present in /mnt/storage/r0978323/project_data/skin_aging/output, and it can be consulted if there are problems with running the notebook independently.

The relevant modules are imported below and a random state is picked for reproducibility purposes:
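
As a rough sketch (the module list and the value of RANDOM_STATE are assumptions, not the original cell), the import cell might look like this:

```python
# Core analysis stack used throughout the notebook (sketch; the actual cell may differ)
import numpy as np
import pandas as pd
import anndata as ad
import scanpy as sc

# One fixed random state, reused by the PCA, UMAP, Leiden and Scrublet calls below
RANDOM_STATE = 0  # assumed value; any fixed integer gives reproducible runs
```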

Scanpy can automatically save generated plots, though it doesn't provide much fine-grained control, only a global "settings" object. We'll increase the figure size a bit to improve image resolution:
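
A sketch of the corresponding settings cell; the concrete figsize and dpi_save values are assumptions:

```python
# Autosave plots into sc.settings.figdir and bump the figure size/DPI a bit
sc.settings.autosave = True
sc.settings.set_figure_params(figsize=(6, 6), dpi_save=150)  # assumed values
```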

The original files from the study have been downloaded from GEO: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE130973

Three files have been downloaded to /mnt/storage/r0978323/project_data/skin_aging, unzipped and renamed:

Note that the labels of the data files say "filtered", but that is only with respect to CellRanger. The authors explain their methods like this:

16,062 cells passed the quality control steps performed by Cell Ranger. To remove possible cell doublets, we filtered out cells with more than 7500 expressed genes, and to remove potential apoptotic cells we discarded cells with more than 5% mitochondrial reads. The application of these filters resulted in a final dataset of 15,457 single-cell transcriptomes.

As we'll see below, the "filtered" dataset contains exactly 16,062 cells, so we can be sure that these files will need some additional cleanup. We'll start by creating a local directory that symlinks to the downloaded data:
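
One way the symlinking might look; the local directory name and the renamed file names below are hypothetical (the standard CellRanger v2 names are assumed):

```python
import os

# Hypothetical local layout pointing at the downloaded (unzipped, renamed) files
src_dir = "/mnt/storage/r0978323/project_data/skin_aging"
dst_dir = "data"
os.makedirs(dst_dir, exist_ok=True)
for name in ("matrix.mtx", "genes.tsv", "barcodes.tsv"):  # assumed file names
    link = os.path.join(dst_dir, name)
    if not os.path.exists(link):
        os.symlink(os.path.join(src_dir, name), link)
```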

Scanpy has some trouble reading the matrix and genes with its read_10x_mtx function, so we can't use it as-is. The function expects version 3 of the CellRanger format, but the study used version 2. The problem seems to be related to gzipping in particular, though unzipping doesn't quite fix it. There are more details in this GitHub issue: https://github.com/scverse/scanpy/issues/1916.

There are a number of proposed fixes, but we've chosen here to construct an AnnData object manually:
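
A sketch of the manual construction, assuming the standard CellRanger v2 layout (matrix.mtx with genes in rows, a two-column genes.tsv and a one-column barcodes.tsv) under the hypothetical file names from above:

```python
import anndata as ad
import pandas as pd
from scipy.io import mmread
from scipy.sparse import csr_matrix

# The mtx file stores genes x cells, so transpose to the cells x genes AnnData convention
matrix = csr_matrix(mmread("data/matrix.mtx").T)

genes = pd.read_csv("data/genes.tsv", sep="\t", header=None,
                    names=["gene_id", "gene_symbol"])
barcodes = pd.read_csv("data/barcodes.tsv", sep="\t", header=None, names=["barcode"])

adata = ad.AnnData(
    X=matrix,
    obs=pd.DataFrame(index=barcodes["barcode"].values),
    var=pd.DataFrame({"gene_id": genes["gene_id"].values},
                     index=genes["gene_symbol"].values),
)
adata.var_names_make_unique()  # gene symbols are not guaranteed to be unique
```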

Sample barcoding is described in the individual sample pages on GEO, for instance this one for sample y1: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM3758115

the cell barcode of this samples for the processed data files is -1

The sample barcodes are ordered from 1 to 5 and they correspond to the following sample names used in the paper:

We've saved the y1 and y2 samples to files, but going forward, we'll mostly work with the 3130 cells of y1.
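
A sketch of the subsetting step; only the y1 / "-1" correspondence is confirmed by the GEO page quoted above, and the output path is hypothetical:

```python
# The barcode suffix ("-1" .. "-5") identifies the sample; "-1" is y1 per its GEO page
adata.obs["barcode_suffix"] = [bc.rsplit("-", 1)[1] for bc in adata.obs_names]

y1 = adata[adata.obs["barcode_suffix"] == "1"].copy()
y1.write("data/y1_raw.h5ad")  # hypothetical path
```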

Cleaning and quality control

Before analyzing cell clusters, we'll perform some cleanup and doublet detection. We'll start by measuring the highest-expressed genes in the downloaded dataset:
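
This can be done with Scanpy's built-in helper; the number of genes shown here is an assumption:

```python
# Plot the genes that account for the largest fraction of counts per cell
sc.pl.highest_expr_genes(adata, n_top=20)
```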

The highest-expressed gene seems to be MALAT1, a long non-coding RNA found in the nucleus, which is what we'd expect from a single-cell experiment. (In fact, a recent bioRxiv preprint argues that it can be used as a quality measure: https://www.biorxiv.org/content/10.1101/2024.07.14.603469v2).

The other genes mostly encode ribosomal proteins, and there are also two mitochondrial genes. We'll filter out most of the cells with high MT- expression, but as we'll see in a bit, ribosomal genes are a bit trickier to deal with.

For now, let's ask Scrublet to predict doublets in the collection:

The automatic threshold picked by Scrublet doesn't seem appropriate. We can clearly see a bimodal distribution in the right-hand plot, but the line is placed to the right of the second peak rather than in the valley between the two. A better threshold might be 0.20, so we'll pick that one, save the resulting predictions and re-plot the histogram:
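
A sketch using the standalone scrublet API; the expected doublet rate is left at its default, and the 0.20 threshold comes from inspecting the histogram:

```python
import scrublet as scr

# Run Scrublet on the raw counts and look at its automatically chosen threshold
scrub = scr.Scrublet(adata.X, random_state=RANDOM_STATE)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
scrub.plot_histogram()

# The automatic cutoff lands right of the second peak, so re-call with a manual one
predicted_doublets = scrub.call_doublets(threshold=0.20)
scrub.plot_histogram()

adata.obs["doublet_score"] = doublet_scores
adata.obs["predicted_doublet"] = predicted_doublets
```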

We'll try to visualize doublet locations relative to cell type clusters via UMAP:

Some of the doublets are a bit spread out, but there are also concentrated clusters of them. We would expect doublets to fall between clusters or at their edges. We'll remove them later by clustering with a high resolution parameter. For now, we'll proceed with the rest of the cleanup: dropping genes expressed in very few cells and removing cells that express very few genes:
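
A sketch of the basic filtering; the 3-cell and 200-gene cutoffs are assumptions (common defaults), not necessarily the ones used here:

```python
# Drop genes detected in very few cells and cells expressing very few genes
sc.pp.filter_genes(adata, min_cells=3)    # assumed cutoff
sc.pp.filter_cells(adata, min_genes=200)  # assumed cutoff
```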

We'll try to locate mitochondrial genes by looking for names starting with MT-. A high percentage of mitochondrial counts might indicate dead or apoptotic cells that we should remove. Hemoglobin-heavy cells should also be removed, though as we'll see, this does not seem to be a problem in this particular sample.

We'll note that the HBEGF gene does not seem to be hemoglobin-related, according to both UniProt and the Human Protein Atlas. The "HB" here stands for "heparin-binding", so we'll make sure not to include it in the "hb" set of genes.

We'll also detect ribosomal genes and observe how they distribute.
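
A sketch of how these gene sets could be flagged and turned into per-cell QC metrics; the hemoglobin pattern is one possible way to exclude HBEGF and may need adjusting for other non-hemoglobin HB* names:

```python
# Flag mitochondrial, ribosomal and hemoglobin genes by their symbols
adata.var["mt"] = adata.var_names.str.startswith("MT-")
adata.var["ribo"] = adata.var_names.str.startswith(("RPS", "RPL"))
# Negative lookahead keeps HB* genes but skips heparin-binding HBEGF
adata.var["hb"] = adata.var_names.str.contains(r"^HB(?!EGF)", regex=True)

# Adds n_genes_by_counts, total_counts and pct_counts_{mt,ribo,hb} to adata.obs
sc.pp.calculate_qc_metrics(adata, qc_vars=["mt", "ribo", "hb"],
                           percent_top=None, log1p=False, inplace=True)
```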

Plotting the cells by the number of genes they express, we can see that the bulk is located below 3500 with a spread of outliers above. We'll use 3500 as a threshold later.

Mitochondrial gene percentages are not too high, but we'll use the 5% threshold to exclude potential apoptotic cells. There are essentially no blood cells that we can detect: hemoglobin percentages reach at most around 0.4%. Other samples do have a few, but here we don't need to do anything. Still, we'll add a filter for hb < 5 just to show that it does not change the cell count.

Ribosomal proteins are a different topic. There's a wide range of their expression, up to around 60% in some cases. The 10x genomics documentation suggests that this is not a signal of poor quality (https://kb.10xgenomics.com/hc/en-us/articles/218169723-What-fraction-of-reads-map-to-ribosomal-proteins):

Single Cell 3’ v2 libraries do contain reads mapping to ribosomal protein transcripts (Rps, Rpl). The fraction of reads varies based on cell type and overall cell health.

They note that ribosomal protein transcripts are not the same as rRNA; these transcripts are poly-adenylated (see e.g. https://pmc.ncbi.nlm.nih.gov/articles/PMC1474067/) and do not necessarily indicate quality issues:

Although both ribosomal proteins and ribosomal RNA (rRNA) make up the ribosome complex, ribosomal protein transcripts are not equivalent to ribosomal RNA (rRNA). Ribosomal protein transcript detection will not necessarily correlate with either rRNA or mitochondrial transcripts.

As we'll see later, these are distributed fairly evenly across cell types, so they should hopefully not interfere much with our analysis. We'll use highly variable genes, which is what the 10x Genomics website recommends as well:

Only include in the PC analysis genes that are "highly variable". We do this by selecting the genes with the highest dispersion across the dataset and performing PCA on those genes only. The Seurat package uses a similar approach.

To exclude outliers, however, we'll cut off cells with more than 50% ribosomal protein reads, based on the violin plot above.
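
Putting these thresholds together, the filtering step might look like the sketch below (cutoffs as discussed above):

```python
# Apply the QC cutoffs discussed above; the hb filter is effectively a no-op here
keep = (
    (adata.obs["n_genes_by_counts"] < 3500)
    & (adata.obs["pct_counts_mt"] < 5)
    & (adata.obs["pct_counts_hb"] < 5)
    & (adata.obs["pct_counts_ribo"] < 50)
)
adata = adata[keep].copy()
```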

We still have the MT-CO1 gene expressed, but at a fairly low percentage. Many of the ribosomal protein genes are still in this list, but the scanpy documentation suggests that this is normal:

We expect to see the “usual suspects”, i.e., mitochondrial genes, actin, ribosomal protein, MALAT1

Let's write the cleaned dataset of 2728 observations to disk, so we can rerun the next sections independently:
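
A one-line sketch; the output path is hypothetical:

```python
# Persist the QC-filtered dataset so later sections can start from it
adata.write("data/y1_clean.h5ad")  # hypothetical path
```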

Dimensionality reduction and clustering

We'll start by setting up a place for visual inspection of the clustering: a figdir for scanpy that will store the figures for later review. Particularly when generating a large number of figures, we'll opt to store them on disk and attach selected results to markdown cells.
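
For instance (the directory name is an assumption):

```python
# Direct saved figures for this section into a dedicated directory
sc.settings.figdir = "figures/clustering"  # hypothetical path
```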

Now, we'll read the data from the previous section, normalize counts to a sum of 1,000 and store a record of highly variable genes to the AnnData object:
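
A sketch of this step, assuming the hypothetical path from the QC section; it also log-transforms the counts, which Scanpy's default highly-variable-gene flavor expects:

```python
adata = sc.read_h5ad("data/y1_clean.h5ad")  # hypothetical path from the QC section

# Normalize each cell to 1,000 total counts, log-transform, and flag highly variable genes
sc.pp.normalize_total(adata, target_sum=1e3)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata)
```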

We'll use PCA to find linear combinations of genes that best express the variability in the data. This will not necessarily correspond to cell types, but it can give us guidance towards particular categories of genes.
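
A sketch of the PCA step; with highly variable genes annotated, Scanpy restricts the decomposition to them by default, and the number of components computed here (50) is an assumption:

```python
# Compute the first 50 PCs and inspect the top gene loadings of the leading components
sc.pp.pca(adata, n_comps=50, random_state=RANDOM_STATE)
sc.pl.pca_loadings(adata, components=[1, 2, 3])
```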

We can see some potential marker genes that we should try visualizing. COL6A2, together with other collagen-related genes, seems to be the biggest contributor to the first principal component. CST3, FTL, and a number of HLA (Human Leukocyte Antigen) genes in PC2 suggest a grouping of immune cells. PC3 includes PTPRCAP, CD3D, CD3E, and IL32, which also seem immunity-related. According to the scanpy tutorial, CD3D is a well-known marker for T cells.

Some top marker genes suggested by the paper for different cell types include IL8, LUM, and KRT1, so let's keep those in a list for visualization as well.

For now, let's try to pick a limit of principal components to use for clustering by investigating their explained variability.
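
One way to look at this is Scanpy's variance-ratio plot (a sketch; the number of components shown is an assumption):

```python
# Variance explained per principal component, on a log scale to make the tail visible
sc.pl.pca_variance_ratio(adata, n_pcs=50, log=True)
```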

The paper picks 28 principal components, which seems somewhat reasonable, but it's hard to see a sharp reduction in explained variability until a while later. We'll pick 45 principal components as the point where variability really seems to taper off.

This is a very subjective choice and it's hard to know what's "correct". Increasing the number of components might amplify minor distinctions, while setting it too low may miss important signals of differentiation. Tweaking the number also shifts UMAP cluster positions quite a lot, making it hard to visually determine correspondences. We'll leave this constant in a separate cell, so it's easy to experiment with:
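
The constant cell might simply look like this:

```python
# Number of principal components used for the neighbor graph; easy to tweak and re-run
N_PCS = 45
```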

With that, let's try calculating a UMAP embedding and plotting it with some of the genes we looked at above.
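
A sketch of the embedding and plotting cell; the gene list is an illustrative subset of the markers discussed above and assumes they are present in var_names:

```python
# Build the neighbor graph on the chosen PCs, compute UMAP, and overlay genes of interest
sc.pp.neighbors(adata, n_pcs=N_PCS, random_state=RANDOM_STATE)
sc.tl.umap(adata, random_state=RANDOM_STATE)
sc.pl.umap(adata, color=["COL6A2", "CST3", "CD3D", "LUM", "KRT1"])
```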

The separation looks promising: our principal components have been able to pick out genes specific to the major clusters. The marker genes suggested by the paper also indicate that our results are compatible with theirs so far.

Let's plot the QC-related genes and investigate their distributions across clusters:
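
A sketch of the overlay, using the QC columns computed during cleanup:

```python
# Overlay the per-cell QC metrics from earlier on the UMAP embedding
sc.pl.umap(adata, color=["pct_counts_mt", "pct_counts_ribo", "n_genes_by_counts"])
```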

Mitochondrial percentages are evenly spread out and limited to below 5%, since that was the threshold we picked. There are some outliers with higher gene counts, though some of these may correspond to doublets and will likely go away when we remove them.

Ribosomal genes have a much higher percentage of expression, as we noted before, but they are mostly spread out. There is one broad cluster in the center with fairly high expression and one smaller cluster on the upper right that corresponded to KRT1 in a previous chart. As we'll see, we can find non-ribosomal marker genes for both of those.

At this stage, we might also take a look at doublets in the filtered and normalized datasets to see how they're distributed.
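
For example, coloring the embedding by the Scrublet score saved during QC (a sketch):

```python
# Color the UMAP by the doublet score stored in adata.obs during cleanup
sc.pl.umap(adata, color="doublet_score")
```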

We can see some predicted doublets sprinkled between the center and upper-left clusters, which might indicate doublets formed from these two cell types. A few are present inside islands, where we might expect the clusters to separate at a higher Leiden resolution.

Speaking of which, let's run the Leiden clustering algorithm with different resolutions and decide which ones to focus on. We'll prepare a simple function that runs it at various resolutions and stores the images for later inspection:
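
A sketch of such a helper; the plot file naming relies on the autosave/figdir settings shown earlier:

```python
def leiden_at_resolutions(adata, resolutions, key_prefix="leiden"):
    """Run Leiden clustering at several resolutions and save one UMAP plot per run."""
    for res in resolutions:
        key = f"{key_prefix}_{res}"
        sc.tl.leiden(adata, resolution=res, key_added=key, random_state=RANDOM_STATE)
        # With autosave enabled this writes umap_<key>.png into sc.settings.figdir
        sc.pl.umap(adata, color=key, save=f"_{key}.png")
```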

Let's pick a range of values with varying granularity. We'll pay particular attention to the 0.5 to 0.9 range:
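
The specific resolution values below are an assumed sweep, denser in the 0.5 to 0.9 range of interest:

```python
leiden_at_resolutions(adata, resolutions=[0.1, 0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1.2])
```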