Skip to content

Commit

Permalink
Merge pull request #2 from joan-smith/revisions
Browse files Browse the repository at this point in the history
April 26 revisions
  • Loading branch information
joan-smith authored Apr 26, 2020
2 parents fc55b91 + 59adeff commit b9271f1
Show file tree
Hide file tree
Showing 13 changed files with 944 additions and 116 deletions.
209 changes: 209 additions & 0 deletions airway_smoking_2020_gse134174.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 14 13:38:41 2020
@author: Joan Smith
https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE134174
"""
#%%

import pandas as pd
import scanpy as sc
from matplotlib import pyplot as plt
import matplotlib as mpl
import os
import numpy as np
#%%
sc.settings.figdir = 'data/cluster-plots/airway-smoking-2020-gse134174/'
sc.settings.verbosity = 4

mpl.rcParams.update(mpl.rcParamsDefault)
mpl.rcParams.update({
'font.sans-serif': 'Arial',
'font.family': 'sans-serif',
'axes.titlesize': 18,
'axes.labelsize': 14,
})

#%% Load prenormed data and metadata
input_file = 'data/raw-data/airway-smoking-GSE134174/GSE134174_Processed_invivo_norm.txt'
metadata = pd.read_csv('data/raw-data/airway-smoking-GSE134174/GSE134174_Processed_invivo_metadata.txt', sep='\t', index_col=0)
adata = sc.read_text(input_file).T
adata.obs = metadata
all_adata = adata.copy()
VARIANT = 'all'

#%%
def exploratory_plots(adata):

num_non_int = (adata.to_df().applymap(float.is_integer) == False).sum().sum()
print('Num non-int: ', num_non_int)
plt.figure()
sc.pp.filter_cells(adata, min_genes=0)
plt.hist(adata.obs.n_genes, bins=500)
plt.title('Genes per cell')
minimum = adata.obs['n_genes'].min()
plt.xlabel('# Genes. Min:' + str(minimum))
plt.ylabel('# Cells')

plt.figure()
sc.pl.highest_expr_genes(adata, n_top=20, )
sc.pl.pca_variance_ratio(adata, log=True)
plt.show()
exploratory_plots(adata)

#%% HEAVY AND NEVER
adata = all_adata[all_adata.obs.Smoke_status.isin(['heavy', 'never'])].copy()
VARIANT='heavy_and_never'
sc.settings.figdir = 'data/cluster-plots/airway-smoking-2020-gse134174-heavy-and-never'

#%%
sc.pp.highly_variable_genes(adata)
sc.tl.pca(adata)
sc.pp.neighbors(adata)

#%%
LEARNING_RATE = 1000
EARLY_EXAGGERATION = 12
RESOLUTION = 2.5 #2
PERPLEXITY=30
N_PCS = 15

sc.tl.tsne(adata, learning_rate=LEARNING_RATE, n_jobs=8, early_exaggeration=EARLY_EXAGGERATION, perplexity=PERPLEXITY, n_pcs=N_PCS)
sc.tl.leiden(adata, resolution=RESOLUTION)

params = {'learning_rate': LEARNING_RATE,
'early_exaggeration':EARLY_EXAGGERATION,
'resolution': RESOLUTION,
'perplexity': PERPLEXITY,
'n_pcs': N_PCS,
'genes': 'all',
'variant': VARIANT,
'files': input_file}
sc.pl.tsne(adata, color=['leiden'], size=25, save='_leiden.pdf', color_map='plasma')
pd.Series(params).to_csv(os.path.join(sc.settings.figdir, 'params.txt'))
adata.uns['joan_cluster_params'] = params
adata.write(os.path.join(sc.settings.figdir, 'adata.h5ad'))
#%%
#%%
#%%
#%%
markers = pd.read_csv(os.path.join(sc.settings.figdir, 'markers.csv'), header=None, names=['gene', 'cluster'])
markers['gene'] = markers['gene'].str.upper()
markers = markers[markers['gene'].isin(adata.var.index.values)]

markers['title'] = markers['gene'] + '+: ' + markers['cluster']
markers = markers.set_index('gene')
markers.loc['PTPRC']['title'] = 'PTPRC (CD45)+: Immune Cells'
markers.loc['leiden'] = ['Leiden', 'Clusters']
#%%
adata = sc.read_h5ad(os.path.join(sc.settings.figdir, 'adata.h5ad'))
params = pd.read_csv(os.path.join(sc.settings.figdir, 'params.txt'), index_col=0).to_dict('dict')['0']
VARIANT = params['variant']
#%%
for i, g in markers.iterrows():
sc.pl.tsne(adata, color=i,
title=g['title'],
color_map='plasma',
size=25,
save='_' + i + '_airway_smoking_2020.pdf',
show=False)

#%%
sc.tl.rank_genes_groups(adata, 'leiden', method='t-test', n_genes=20)
pd.DataFrame(adata.uns['rank_genes_groups']['names']).to_csv(os.path.join(sc.settings.figdir, 'rank_groups.csv'))
adata.write(os.path.join(sc.settings.figdir, 'adata.h5ad'))


#%%
genes = ['ACE2', 'KRT5', 'TP63', 'BCAM', 'NGFR']
df = adata.to_df()
for g in genes:
print(g)
g_df = df.join(adata.obs['Smoke_status'])[[g, 'Smoke_status']]
g_df = g_df[g_df[g] > 0].pivot(columns='Smoke_status')
g_df.to_csv(os.path.join(sc.settings.figdir, g +'_smoking_status.csv'))

#%%

#%% Correlations

df = adata.to_df()
corr = df.corrwith(df['ACE2']).sort_values(ascending=False)
print('correlations with ACE2')
print(corr.head(20))
corr.to_csv(os.path.join(sc.settings.figdir, 'ACE2_correlates.csv'))


#%%
markers_for_pos_counts = markers.drop('leiden').copy()
markers_for_pos_counts.loc['MKI67'] = ['MKI67', 'MKI67']
markers_for_pos_counts.loc['PCNA'] = ['PCNA', 'PCNA']
markers_for_pos_counts.loc['TOP2A'] = ['TOP2A', 'TOP2A']

## pos for every marker, # cells
df = adata.to_df()
pos_counts = (df[markers_for_pos_counts.index] > 0).sum()
pos_counts['cell_counts'] = len(df.index)
pos_counts.to_csv(os.path.join(sc.settings.figdir, 'single_positive_counts.csv'), header=['single pos counts'])

# # double pos w ace2, #
double_pos = (df[markers_for_pos_counts.index] > 0).apply(lambda x: x & (df['ACE2'] > 0)).sum()
double_pos['cell_counts'] = len(df.index)
double_pos.to_csv(os.path.join(sc.settings.figdir, 'double_positive_counts.csv'), header=['double pos counts'])

#%%
df = adata.to_df()
ace2_muc5ac_double_pos = (df['MUC5AC'] > 0) & (df['ACE2'] > 0)
ace2_muc5ac = df.loc[ace2_muc5ac_double_pos][['ACE2']].join(adata.obs.Smoke_status)
ace2_muc5ac.pivot(columns='Smoke_status').to_csv(os.path.join(sc.settings.figdir, 'ACE2_MUC5AC_double_pos.csv'))

#%%
cluster_labels = pd.read_csv(os.path.join(sc.settings.figdir, 'key.csv'), header=None, names=['cluster', 'label'], dtype='str')
cluster_labels.loc[cluster_labels['label'].isna(),'label'] = cluster_labels.loc[cluster_labels['label'].isna(), 'cluster']
cluster_labels_dict = cluster_labels.set_index('cluster')['label'].to_dict()
adata.obs['Cell Types'] = adata.obs.leiden.map(cluster_labels_dict).astype('category')

sc.pl.tsne(adata, color=['Cell Types'], size=25, save='_manually_labeled.pdf', color_map='plasma', title='Human Trachea')
adata.write(os.path.join(sc.settings.figdir, 'adata.h5ad'))

sc.tl.rank_genes_groups(adata, 'Cell Types', method='t-test', n_genes=100, key_added='cell_types_gene_groups')
pd.DataFrame(adata.uns['cell_types_gene_groups']['names']).to_csv(os.path.join(sc.settings.figdir, 'manually_labeled_rank_groups_ttest.csv'))

#%%
adata.obs.groupby('Cell Types')['Smoke_status'].value_counts().to_csv(os.path.join(sc.settings.figdir, 'cell_type_smoker_counts.csv'))
ace2_pos_cell_types = adata.to_df()[(adata.to_df()['ACE2'] > 0)][['ACE2']].join(adata.obs[['Cell Types', 'Smoke_status']])
ace2_pos_cell_types.groupby(['Cell Types', 'Smoke_status']).count().to_csv(os.path.join(sc.settings.figdir, 'cell_type_ace2_pos_counts.csv'))

#%%
adata.obs.loc[(adata.to_df()['ACE2'] > 0), 'ace2_pos'] = 'ace2_pos'
adata.obs.loc[(adata.to_df()['ACE2'] <= 0), 'ace2_pos'] = 'ace2_neg'
sc.tl.rank_genes_groups(adata, groupby='ace2_pos', key_added='ACE2_pos')
#%%

adata.obs['ACE2_goblet'] = np.nan
adata.obs.loc[(adata.obs['Cell Types'] == 'Goblet/Club cells'), 'ACE2_goblet'] = 'Goblet/Club'
adata.obs.loc[(adata.obs['Cell Types'] == 'Ciliated cells'), 'ACE2_goblet'] = 'Ciliated'
adata.obs.loc[(adata.obs['Cell Types'] == 'Basal cells'), 'ACE2_goblet'] = 'Basal'
adata.obs.loc[(adata.to_df()['ACE2'] > 0), 'ACE2_goblet'] = 'ACE2+'


adata_ace2_goblet = adata[adata.obs.dropna(subset=['ACE2_goblet']).index].copy()
adata_ace2_goblet.obs['ACE2_goblet'] = adata_ace2_goblet.obs['ACE2_goblet'].astype(pd.CategoricalDtype(ordered=True))
adata_ace2_goblet.obs['ACE2_goblet'] = adata_ace2_goblet.obs['ACE2_goblet'].cat.reorder_categories(['ACE2+', 'Goblet/Club', 'Ciliated', 'Basal'], ordered=True)


heatmap_genes = {'ACE2+ markers': adata.uns['ACE2_pos']['names']['ace2_pos'][1:11],
'Goblet/Club markers': adata.uns['cell_types_gene_groups']['names']['Goblet/Club cells'][0:10],
'Ciliated markers': adata.uns['cell_types_gene_groups']['names']['Ciliated cells'][0:10],
'Basal markers': adata.uns['cell_types_gene_groups']['names']['Basal cells'][0:10],
}

ax = sc.pl.dotplot(adata_ace2_goblet, groupby='ACE2_goblet', var_names=heatmap_genes, #save='_ACE2_marker_goblet_ciliated.png',
mean_only_expressed=True, var_group_rotation=0.0, show=False, figsize=(12, 2.5))
all_axes = plt.gcf().get_axes()
all_axes[0].set_ylabel('')

plt.savefig(os.path.join(sc.settings.figdir, 'dotplot_ace2_marker_goblet_ciliated_basal.pdf'), bbox_inches='tight')

27 changes: 27 additions & 0 deletions data/cluster-plots/airway-smoking-20200-gse134174/key.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
0,
1,
2,Basal cells
3,Proliferating cells
4,Goblet/Club cells
5,Proliferating cells
6,SMG-basal
7,SMG-basal
8,KRT8+ differentiating cells
9,
10,
11,Ciliated cells
12,Goblet/Club cells
13,Basal cells
14,
15,Ciliated cells
16,KRT8+ differentiating cells
17,Basal cells
18,Goblet/Club cells
19,KRT8+ differentiating cells
20,Goblet/Club cells
21,Goblet/Club cells
22,SMG-secretory
23,Basal cells
24,
25,Ionocytes/Neuroendocrine cells
26,Ionocytes/Neuroendocrine cells
80 changes: 80 additions & 0 deletions data/cluster-plots/airway-smoking-20200-gse134174/markers.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
ACE2,Coronavirus Receptor
EPCAM,Epithelium
PTPRC,Immune Cells
PDGFRA,Mesenchymal Cells
TMEM100,Endothelium
RTKN2,Alveolar Type 1
FOXJ1,Ciliated Cells
LAMP3,Alveolar Type 2
MUC5B,Secretory Cells
MUC5AC,Goblet Cells
SCGB3C1,Secretory Cells
SCGB1A1,Club Cells
GABRP,Secretory Cells
LYPD2,Goblet Cells
CLDN10,Secretory Cells
KRT5,Basal Cells
TRP63,Basal Cells
MKI67,Proliferating Cells
TOP2A,Proliferating Cells
PCNA,Proliferating Cells
DEUP1,Deuterosomal Cells
FOXI1,Ionocytes
CFTR,Ionocytes
CGRP,Neuroendocrine Cells
CHGA,Neuroendocrine Cells
ASCL1,Brush/Tuft Cells
LRMP,Brush/Tuft Cells
COL1A1,Fibroblasts
FBLN1,Fibroblasts
ACTA2,Smooth Muscle Cells
MYH11,Smooth Muscle Cells
DNAH11,Ciliated Cells
TP63,Basal Cells
KRT8,Basal Cells
CAV1,Basal Cells
AGR2,Goblet Cells
BPIFB1,Goblet/Club Cells
SPDEF,Secretory Cells
MUC1,Secretory Cells
MUCL1,Secretory Cells?
KIAA0101,01_p.basal
TYMS,01_p.basal
TK1,01_p.basal
ZWINT,01_p.basal
RRM2,01_p.basal
KRT17,02_d.basal
IL33,02_d.basal
MMP10,02_d.basal
BCAM,02_d.basal
ETS2,02_d.basal
C19orf33,03_KRT8.high
GPRC5A,03_KRT8.high
BHLHE40,03_KRT8.high
TXN,03_KRT8.high
EPAS1,03_KRT8.high
BPIFB1,04_muc.secretory
VMO1,04_muc.secretory
MSMB,04_muc.secretory
S100P,04_muc.secretory
RDH10,04_muc.secretory
FAM216B,05_ciliated
STOML3,05_ciliated
MRLN,05_ciliated
CATSPERD,05_ciliated
ANKRD66,05_ciliated
HEPACAM2,06_rare
PROX1,06_rare
TUBB2B,06_rare
NEURL1,06_rare
PHGR1,06_rare
KRT14,07_SMG.basal
CAV1,07_SMG.basal
FOS,07_SMG.basal
G0S2,07_SMG.basal
NAP1L1,07_SMG.basal
SLPI,08_SMG.secretory
RARRES1,08_SMG.secretory
SCGB3A1,08_SMG.secretory
TCN1,08_SMG.secretory
MMP7,08_SMG.secretory
43 changes: 43 additions & 0 deletions data/cluster-plots/gse1229560_ipf/key.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
0,T cells
1,Alveolar type 2
2,Macrophages
3,Macrophages
4,Plasma cells
5,Macrophages
6,Alveolar type 2
7,Macrophages
8,Macrophages
9,Macrophages
10,Macrophages
11,Ciliated cells
12,Dendritic cells
13,Macrophages
14,Goblet/Club cells
15,Endothelial cells
16,B cells
17,Proliferating cells
18,B cells
19,Alveolar type 2
20,Macrophages
21,Basal cells
22,B cells
23,Goblet/Club cells
24,Macrophages
25,Mast cells
26,Macrophages
27,Macrophages
28,Macrophages
29,Macrophages
30,Macrophages
31,Mesenchymal cells
32,Alveolar type 2
33,Smooth muscle cells
34,T cells
35,Endothelial cells
36,Macrophages
37,Macrophages
38,Plasma cells
39,Endothelial cells
40,Macrophages
41,Macrophages
42,Mesenchymal cells
30 changes: 30 additions & 0 deletions data/cluster-plots/healthy-mouse/key.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
0,B cells
1,Alveolar type 2
2,T cells
3,Macrophages
4,NK cells
5,Endothelial cells
6,T cells
7,Monocytes
8,Monocytes
9,Dendritic cells
10,Endothelial cells
11,Fibroblasts
12,Macrophages
13,Alveolar type 1
14,Goblet/Club cells
15,Endothelial cells
16,B cells
17,Alveolar type 2
18,Neutrophils
19,Ciliated cells
20,Macrophages
21,Endothelial cells
22,Alveolar type 2
23,Dendritic cells
24,Alveolar type 2
25,Monocytes
26,Monocytes
27,Alveolar type 2
28,Neutrophils
29,Alveolar type 2
Loading

0 comments on commit b9271f1

Please sign in to comment.