forked from joan-smith/covid19
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
fc55b91
commit 59adeff
Showing
13 changed files
with
944 additions
and
116 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,209 @@ | ||
#!/usr/bin/env python3 | ||
# -*- coding: utf-8 -*- | ||
""" | ||
Created on Tue Apr 14 13:38:41 2020 | ||
@author: Joan Smith | ||
https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE134174 | ||
""" | ||
#%% | ||
|
||
import pandas as pd | ||
import scanpy as sc | ||
from matplotlib import pyplot as plt | ||
import matplotlib as mpl | ||
import os | ||
import numpy as np | ||
#%% | ||
sc.settings.figdir = 'data/cluster-plots/airway-smoking-2020-gse134174/' | ||
sc.settings.verbosity = 4 | ||
|
||
mpl.rcParams.update(mpl.rcParamsDefault) | ||
mpl.rcParams.update({ | ||
'font.sans-serif': 'Arial', | ||
'font.family': 'sans-serif', | ||
'axes.titlesize': 18, | ||
'axes.labelsize': 14, | ||
}) | ||
|
||
#%% Load prenormed data and metadata | ||
input_file = 'data/raw-data/airway-smoking-GSE134174/GSE134174_Processed_invivo_norm.txt' | ||
metadata = pd.read_csv('data/raw-data/airway-smoking-GSE134174/GSE134174_Processed_invivo_metadata.txt', sep='\t', index_col=0) | ||
adata = sc.read_text(input_file).T | ||
adata.obs = metadata | ||
all_adata = adata.copy() | ||
VARIANT = 'all' | ||
|
||
#%% | ||
def exploratory_plots(adata): | ||
|
||
num_non_int = (adata.to_df().applymap(float.is_integer) == False).sum().sum() | ||
print('Num non-int: ', num_non_int) | ||
plt.figure() | ||
sc.pp.filter_cells(adata, min_genes=0) | ||
plt.hist(adata.obs.n_genes, bins=500) | ||
plt.title('Genes per cell') | ||
minimum = adata.obs['n_genes'].min() | ||
plt.xlabel('# Genes. Min:' + str(minimum)) | ||
plt.ylabel('# Cells') | ||
|
||
plt.figure() | ||
sc.pl.highest_expr_genes(adata, n_top=20, ) | ||
sc.pl.pca_variance_ratio(adata, log=True) | ||
plt.show() | ||
exploratory_plots(adata) | ||
|
||
#%% HEAVY AND NEVER | ||
adata = all_adata[all_adata.obs.Smoke_status.isin(['heavy', 'never'])].copy() | ||
VARIANT='heavy_and_never' | ||
sc.settings.figdir = 'data/cluster-plots/airway-smoking-2020-gse134174-heavy-and-never' | ||
|
||
#%% | ||
sc.pp.highly_variable_genes(adata) | ||
sc.tl.pca(adata) | ||
sc.pp.neighbors(adata) | ||
|
||
#%% | ||
LEARNING_RATE = 1000 | ||
EARLY_EXAGGERATION = 12 | ||
RESOLUTION = 2.5 #2 | ||
PERPLEXITY=30 | ||
N_PCS = 15 | ||
|
||
sc.tl.tsne(adata, learning_rate=LEARNING_RATE, n_jobs=8, early_exaggeration=EARLY_EXAGGERATION, perplexity=PERPLEXITY, n_pcs=N_PCS) | ||
sc.tl.leiden(adata, resolution=RESOLUTION) | ||
|
||
params = {'learning_rate': LEARNING_RATE, | ||
'early_exaggeration':EARLY_EXAGGERATION, | ||
'resolution': RESOLUTION, | ||
'perplexity': PERPLEXITY, | ||
'n_pcs': N_PCS, | ||
'genes': 'all', | ||
'variant': VARIANT, | ||
'files': input_file} | ||
sc.pl.tsne(adata, color=['leiden'], size=25, save='_leiden.pdf', color_map='plasma') | ||
pd.Series(params).to_csv(os.path.join(sc.settings.figdir, 'params.txt')) | ||
adata.uns['joan_cluster_params'] = params | ||
adata.write(os.path.join(sc.settings.figdir, 'adata.h5ad')) | ||
#%% | ||
#%% | ||
#%% | ||
#%% | ||
markers = pd.read_csv(os.path.join(sc.settings.figdir, 'markers.csv'), header=None, names=['gene', 'cluster']) | ||
markers['gene'] = markers['gene'].str.upper() | ||
markers = markers[markers['gene'].isin(adata.var.index.values)] | ||
|
||
markers['title'] = markers['gene'] + '+: ' + markers['cluster'] | ||
markers = markers.set_index('gene') | ||
markers.loc['PTPRC']['title'] = 'PTPRC (CD45)+: Immune Cells' | ||
markers.loc['leiden'] = ['Leiden', 'Clusters'] | ||
#%% | ||
adata = sc.read_h5ad(os.path.join(sc.settings.figdir, 'adata.h5ad')) | ||
params = pd.read_csv(os.path.join(sc.settings.figdir, 'params.txt'), index_col=0).to_dict('dict')['0'] | ||
VARIANT = params['variant'] | ||
#%% | ||
for i, g in markers.iterrows(): | ||
sc.pl.tsne(adata, color=i, | ||
title=g['title'], | ||
color_map='plasma', | ||
size=25, | ||
save='_' + i + '_airway_smoking_2020.pdf', | ||
show=False) | ||
|
||
#%% | ||
sc.tl.rank_genes_groups(adata, 'leiden', method='t-test', n_genes=20) | ||
pd.DataFrame(adata.uns['rank_genes_groups']['names']).to_csv(os.path.join(sc.settings.figdir, 'rank_groups.csv')) | ||
adata.write(os.path.join(sc.settings.figdir, 'adata.h5ad')) | ||
|
||
|
||
#%% | ||
genes = ['ACE2', 'KRT5', 'TP63', 'BCAM', 'NGFR'] | ||
df = adata.to_df() | ||
for g in genes: | ||
print(g) | ||
g_df = df.join(adata.obs['Smoke_status'])[[g, 'Smoke_status']] | ||
g_df = g_df[g_df[g] > 0].pivot(columns='Smoke_status') | ||
g_df.to_csv(os.path.join(sc.settings.figdir, g +'_smoking_status.csv')) | ||
|
||
#%% | ||
|
||
#%% Correlations | ||
|
||
df = adata.to_df() | ||
corr = df.corrwith(df['ACE2']).sort_values(ascending=False) | ||
print('correlations with ACE2') | ||
print(corr.head(20)) | ||
corr.to_csv(os.path.join(sc.settings.figdir, 'ACE2_correlates.csv')) | ||
|
||
|
||
#%% | ||
markers_for_pos_counts = markers.drop('leiden').copy() | ||
markers_for_pos_counts.loc['MKI67'] = ['MKI67', 'MKI67'] | ||
markers_for_pos_counts.loc['PCNA'] = ['PCNA', 'PCNA'] | ||
markers_for_pos_counts.loc['TOP2A'] = ['TOP2A', 'TOP2A'] | ||
|
||
## pos for every marker, # cells | ||
df = adata.to_df() | ||
pos_counts = (df[markers_for_pos_counts.index] > 0).sum() | ||
pos_counts['cell_counts'] = len(df.index) | ||
pos_counts.to_csv(os.path.join(sc.settings.figdir, 'single_positive_counts.csv'), header=['single pos counts']) | ||
|
||
# # double pos w ace2, # | ||
double_pos = (df[markers_for_pos_counts.index] > 0).apply(lambda x: x & (df['ACE2'] > 0)).sum() | ||
double_pos['cell_counts'] = len(df.index) | ||
double_pos.to_csv(os.path.join(sc.settings.figdir, 'double_positive_counts.csv'), header=['double pos counts']) | ||
|
||
#%% | ||
df = adata.to_df() | ||
ace2_muc5ac_double_pos = (df['MUC5AC'] > 0) & (df['ACE2'] > 0) | ||
ace2_muc5ac = df.loc[ace2_muc5ac_double_pos][['ACE2']].join(adata.obs.Smoke_status) | ||
ace2_muc5ac.pivot(columns='Smoke_status').to_csv(os.path.join(sc.settings.figdir, 'ACE2_MUC5AC_double_pos.csv')) | ||
|
||
#%% | ||
cluster_labels = pd.read_csv(os.path.join(sc.settings.figdir, 'key.csv'), header=None, names=['cluster', 'label'], dtype='str') | ||
cluster_labels.loc[cluster_labels['label'].isna(),'label'] = cluster_labels.loc[cluster_labels['label'].isna(), 'cluster'] | ||
cluster_labels_dict = cluster_labels.set_index('cluster')['label'].to_dict() | ||
adata.obs['Cell Types'] = adata.obs.leiden.map(cluster_labels_dict).astype('category') | ||
|
||
sc.pl.tsne(adata, color=['Cell Types'], size=25, save='_manually_labeled.pdf', color_map='plasma', title='Human Trachea') | ||
adata.write(os.path.join(sc.settings.figdir, 'adata.h5ad')) | ||
|
||
sc.tl.rank_genes_groups(adata, 'Cell Types', method='t-test', n_genes=100, key_added='cell_types_gene_groups') | ||
pd.DataFrame(adata.uns['cell_types_gene_groups']['names']).to_csv(os.path.join(sc.settings.figdir, 'manually_labeled_rank_groups_ttest.csv')) | ||
|
||
#%% | ||
adata.obs.groupby('Cell Types')['Smoke_status'].value_counts().to_csv(os.path.join(sc.settings.figdir, 'cell_type_smoker_counts.csv')) | ||
ace2_pos_cell_types = adata.to_df()[(adata.to_df()['ACE2'] > 0)][['ACE2']].join(adata.obs[['Cell Types', 'Smoke_status']]) | ||
ace2_pos_cell_types.groupby(['Cell Types', 'Smoke_status']).count().to_csv(os.path.join(sc.settings.figdir, 'cell_type_ace2_pos_counts.csv')) | ||
|
||
#%% | ||
adata.obs.loc[(adata.to_df()['ACE2'] > 0), 'ace2_pos'] = 'ace2_pos' | ||
adata.obs.loc[(adata.to_df()['ACE2'] <= 0), 'ace2_pos'] = 'ace2_neg' | ||
sc.tl.rank_genes_groups(adata, groupby='ace2_pos', key_added='ACE2_pos') | ||
#%% | ||
|
||
adata.obs['ACE2_goblet'] = np.nan | ||
adata.obs.loc[(adata.obs['Cell Types'] == 'Goblet/Club cells'), 'ACE2_goblet'] = 'Goblet/Club' | ||
adata.obs.loc[(adata.obs['Cell Types'] == 'Ciliated cells'), 'ACE2_goblet'] = 'Ciliated' | ||
adata.obs.loc[(adata.obs['Cell Types'] == 'Basal cells'), 'ACE2_goblet'] = 'Basal' | ||
adata.obs.loc[(adata.to_df()['ACE2'] > 0), 'ACE2_goblet'] = 'ACE2+' | ||
|
||
|
||
adata_ace2_goblet = adata[adata.obs.dropna(subset=['ACE2_goblet']).index].copy() | ||
adata_ace2_goblet.obs['ACE2_goblet'] = adata_ace2_goblet.obs['ACE2_goblet'].astype(pd.CategoricalDtype(ordered=True)) | ||
adata_ace2_goblet.obs['ACE2_goblet'] = adata_ace2_goblet.obs['ACE2_goblet'].cat.reorder_categories(['ACE2+', 'Goblet/Club', 'Ciliated', 'Basal'], ordered=True) | ||
|
||
|
||
heatmap_genes = {'ACE2+ markers': adata.uns['ACE2_pos']['names']['ace2_pos'][1:11], | ||
'Goblet/Club markers': adata.uns['cell_types_gene_groups']['names']['Goblet/Club cells'][0:10], | ||
'Ciliated markers': adata.uns['cell_types_gene_groups']['names']['Ciliated cells'][0:10], | ||
'Basal markers': adata.uns['cell_types_gene_groups']['names']['Basal cells'][0:10], | ||
} | ||
|
||
ax = sc.pl.dotplot(adata_ace2_goblet, groupby='ACE2_goblet', var_names=heatmap_genes, #save='_ACE2_marker_goblet_ciliated.png', | ||
mean_only_expressed=True, var_group_rotation=0.0, show=False, figsize=(12, 2.5)) | ||
all_axes = plt.gcf().get_axes() | ||
all_axes[0].set_ylabel('') | ||
|
||
plt.savefig(os.path.join(sc.settings.figdir, 'dotplot_ace2_marker_goblet_ciliated_basal.pdf'), bbox_inches='tight') | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
0, | ||
1, | ||
2,Basal cells | ||
3,Proliferating cells | ||
4,Goblet/Club cells | ||
5,Proliferating cells | ||
6,SMG-basal | ||
7,SMG-basal | ||
8,KRT8+ differentiating cells | ||
9, | ||
10, | ||
11,Ciliated cells | ||
12,Goblet/Club cells | ||
13,Basal cells | ||
14, | ||
15,Ciliated cells | ||
16,KRT8+ differentiating cells | ||
17,Basal cells | ||
18,Goblet/Club cells | ||
19,KRT8+ differentiating cells | ||
20,Goblet/Club cells | ||
21,Goblet/Club cells | ||
22,SMG-secretory | ||
23,Basal cells | ||
24, | ||
25,Ionocytes/Neuroendocrine cells | ||
26,Ionocytes/Neuroendocrine cells |
80 changes: 80 additions & 0 deletions
80
data/cluster-plots/airway-smoking-20200-gse134174/markers.csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
ACE2,Coronavirus Receptor | ||
EPCAM,Epithelium | ||
PTPRC,Immune Cells | ||
PDGFRA,Mesenchymal Cells | ||
TMEM100,Endothelium | ||
RTKN2,Alveolar Type 1 | ||
FOXJ1,Ciliated Cells | ||
LAMP3,Alveolar Type 2 | ||
MUC5B,Secretory Cells | ||
MUC5AC,Goblet Cells | ||
SCGB3C1,Secretory Cells | ||
SCGB1A1,Club Cells | ||
GABRP,Secretory Cells | ||
LYPD2,Goblet Cells | ||
CLDN10,Secretory Cells | ||
KRT5,Basal Cells | ||
TRP63,Basal Cells | ||
MKI67,Proliferating Cells | ||
TOP2A,Proliferating Cells | ||
PCNA,Proliferating Cells | ||
DEUP1,Deuterosomal Cells | ||
FOXI1,Ionocytes | ||
CFTR,Ionocytes | ||
CGRP,Neuroendocrine Cells | ||
CHGA,Neuroendocrine Cells | ||
ASCL1,Brush/Tuft Cells | ||
LRMP,Brush/Tuft Cells | ||
COL1A1,Fibroblasts | ||
FBLN1,Fibroblasts | ||
ACTA2,Smooth Muscle Cells | ||
MYH11,Smooth Muscle Cells | ||
DNAH11,Ciliated Cells | ||
TP63,Basal Cells | ||
KRT8,Basal Cells | ||
CAV1,Basal Cells | ||
AGR2,Goblet Cells | ||
BPIFB1,Goblet/Club Cells | ||
SPDEF,Secretory Cells | ||
MUC1,Secretory Cells | ||
MUCL1,Secretory Cells? | ||
KIAA0101,01_p.basal | ||
TYMS,01_p.basal | ||
TK1,01_p.basal | ||
ZWINT,01_p.basal | ||
RRM2,01_p.basal | ||
KRT17,02_d.basal | ||
IL33,02_d.basal | ||
MMP10,02_d.basal | ||
BCAM,02_d.basal | ||
ETS2,02_d.basal | ||
C19orf33,03_KRT8.high | ||
GPRC5A,03_KRT8.high | ||
BHLHE40,03_KRT8.high | ||
TXN,03_KRT8.high | ||
EPAS1,03_KRT8.high | ||
BPIFB1,04_muc.secretory | ||
VMO1,04_muc.secretory | ||
MSMB,04_muc.secretory | ||
S100P,04_muc.secretory | ||
RDH10,04_muc.secretory | ||
FAM216B,05_ciliated | ||
STOML3,05_ciliated | ||
MRLN,05_ciliated | ||
CATSPERD,05_ciliated | ||
ANKRD66,05_ciliated | ||
HEPACAM2,06_rare | ||
PROX1,06_rare | ||
TUBB2B,06_rare | ||
NEURL1,06_rare | ||
PHGR1,06_rare | ||
KRT14,07_SMG.basal | ||
CAV1,07_SMG.basal | ||
FOS,07_SMG.basal | ||
G0S2,07_SMG.basal | ||
NAP1L1,07_SMG.basal | ||
SLPI,08_SMG.secretory | ||
RARRES1,08_SMG.secretory | ||
SCGB3A1,08_SMG.secretory | ||
TCN1,08_SMG.secretory | ||
MMP7,08_SMG.secretory |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
0,T cells | ||
1,Alveolar type 2 | ||
2,Macrophages | ||
3,Macrophages | ||
4,Plasma cells | ||
5,Macrophages | ||
6,Alveolar type 2 | ||
7,Macrophages | ||
8,Macrophages | ||
9,Macrophages | ||
10,Macrophages | ||
11,Ciliated cells | ||
12,Dendritic cells | ||
13,Macrophages | ||
14,Goblet/Club cells | ||
15,Endothelial cells | ||
16,B cells | ||
17,Proliferating cells | ||
18,B cells | ||
19,Alveolar type 2 | ||
20,Macrophages | ||
21,Basal cells | ||
22,B cells | ||
23,Goblet/Club cells | ||
24,Macrophages | ||
25,Mast cells | ||
26,Macrophages | ||
27,Macrophages | ||
28,Macrophages | ||
29,Macrophages | ||
30,Macrophages | ||
31,Mesenchymal cells | ||
32,Alveolar type 2 | ||
33,Smooth muscle cells | ||
34,T cells | ||
35,Endothelial cells | ||
36,Macrophages | ||
37,Macrophages | ||
38,Plasma cells | ||
39,Endothelial cells | ||
40,Macrophages | ||
41,Macrophages | ||
42,Mesenchymal cells |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
0,B cells | ||
1,Alveolar type 2 | ||
2,T cells | ||
3,Macrophages | ||
4,NK cells | ||
5,Endothelial cells | ||
6,T cells | ||
7,Monocytes | ||
8,Monocytes | ||
9,Dendritic cells | ||
10,Endothelial cells | ||
11,Fibroblasts | ||
12,Macrophages | ||
13,Alveolar type 1 | ||
14,Goblet/Club cells | ||
15,Endothelial cells | ||
16,B cells | ||
17,Alveolar type 2 | ||
18,Neutrophils | ||
19,Ciliated cells | ||
20,Macrophages | ||
21,Endothelial cells | ||
22,Alveolar type 2 | ||
23,Dendritic cells | ||
24,Alveolar type 2 | ||
25,Monocytes | ||
26,Monocytes | ||
27,Alveolar type 2 | ||
28,Neutrophils | ||
29,Alveolar type 2 |
Oops, something went wrong.