Skip to content

Commit

Permalink
Initial code commit, for preprint submission
Browse files Browse the repository at this point in the history
  • Loading branch information
joan-smith committed Mar 28, 2020
1 parent 66c76c8 commit 93030c0
Show file tree
Hide file tree
Showing 12 changed files with 778 additions and 0 deletions.
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,12 @@
# covid19
ACE2 expression and cigarette exposure

These analyses require:
- pandas
- sklearn
- scipy
- statsmodels
- scanpy

Optional:
- Multicore-TSNE
149 changes: 149 additions & 0 deletions aging-mouse-lung-ecurd9.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 24 09:40:32 2020
@author: Joan Smith
https://www.ebi.ac.uk/gxa/sc/experiments/E-CURD-9/results/tsne
"""


#%%

import pandas as pd
import scanpy as sc
from matplotlib import pyplot as plt
import matplotlib as mpl

#%% Set scanpy and matplotlib settings
sc.settings.figdir = 'data/cluster-plots/aging-mouse/'
sc.settings.verbosity = 4

mpl.rcParams.update(mpl.rcParamsDefault)
mpl.rcParams.update({
'font.sans-serif': 'Arial',
'font.family': 'sans-serif',
'axes.titlesize': 18,
'axes.labelsize': 14,
})
#%% Collect gene names and annotation file

gene_names = pd.read_csv('data/raw-data/Mus_musculus.GRCm38.99.chr.gtf', sep='\t', comment='#', header=None)

gene_names.columns = ['chr', 'annotation_src', 'feature_type', 'start', 'end', 'score', 'genomic_strand', 'genomic_phase', 'extra']
gene_names['gene_name'] = gene_names['extra'].str.extract(pat='gene_name "(.*?)";')
gene_names['gene_id'] = gene_names['extra'].str.extract(pat='gene_id "(.*?)";')
gene_names['transcript_type'] = gene_names['extra'].str.extract(pat='transcript_type "(.*?)";')
gene_names = gene_names[gene_names['feature_type'] == 'gene']

annotation = gene_names[['gene_id', 'gene_name', 'chr']].groupby('gene_name').head(1)
annotation['gene_name'] = annotation['gene_name'].str.upper()
annotation = annotation.set_index('gene_id')

#%% Read and log normalize single cell data

adata = sc.read_mtx('data/raw-data/E-CURD-9/E-CURD-9.aggregated_filtered_normalised_counts.mtx')
data = adata.X.toarray().T
adata = sc.AnnData(data)

cols = pd.read_csv('data/raw-data/E-CURD-9/E-CURD-9.aggregated_filtered_normalised_counts.mtx_cols', header=None)
rows = pd.read_csv('data/raw-data/E-CURD-9/E-CURD-9.aggregated_filtered_normalised_counts.mtx_rows', sep='\t', header=None)
adata.var = rows.set_index(0).join(annotation)[['gene_name']].reset_index().set_index('gene_name')
adata.var.index = adata.var.index.astype('str')
adata.var_names_make_unique()
adata.obs = cols
sc.pp.log1p(adata, base=2)

#%% Exploratory Plots

# Unused in the main analysis, but used to parameter search for filtering cutoff.
def exploratory_plots(adata):
plt.figure()
plt.hist(adata.obs.n_genes, bins=500)
plt.title('Aging Mouse per cell')
print(adata.obs['n_genes'].min())

minimum = adata.obs['n_genes'].min()
plt.xlabel('# Genes. Min:' + str(minimum))
plt.ylabel('# Cells')
plt.show()
plt.savefig('data/cluster-plots/aging-mouse/aging_mouse_genes_cell.pdf')

#%% Perform Clustering

sc.pp.filter_cells(adata, min_genes=500)
sc.pp.highly_variable_genes(adata, min_mean=0.0, max_mean=13, min_disp=0.5)

sc.tl.pca(adata)
sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30)
sc.tl.tsne(adata, perplexity=25)
sc.tl.leiden(adata)

#%% Plot Clusters with highlighted genes

markers = pd.read_csv('data/highlighted_genes.csv', header=None, names=['gene', 'cluster'])
markers['gene'] = markers['gene'].str.upper()
markers = markers[markers['gene'].isin(list(annotation['gene_name'].values))]

markers['title'] = markers['gene'] + '+: ' + markers['cluster']
markers = markers.set_index('gene')
markers.loc['PTPRC']['title'] = 'PTPRC (CD45)+: Immune Cells'


ax = sc.pl.tsne(adata, color=['leiden'],
title=['Mouse Lung'],
color_map='plasma',
size=25,
save='_labeled_leiden_aging_mouse_ecurd9_markers_500gene.pdf',
show=False)

ax = sc.pl.tsne(adata, color=['leiden'],
title=['Mouse Lung'],
color_map='plasma',
size=25,
show=False)
ax.legend().remove()
plt.savefig('data/cluster-plots/aging-mouse/leiden_aging_mouse_ecurd9_potential_markers_500gene.pdf')

for i, g in markers.iterrows():
sc.pl.tsne(adata, color=i,
title=g['title'],
color_map='plasma',
size=25,
save='_' + i + '_aging_mouse_ecurd9_potential_markers_500gene.pdf',
show=False)


#%% Collect top ranked genes for each group

sc.tl.rank_genes_groups(adata, groupby='leiden', n_genes=10)
pd.DataFrame(adata.uns['rank_genes_groups']['names']).to_csv('data/cluster-plots/aging-mouse/aging_mouse_groups.csv')

#%% Calculate Correlations with ACE2

df = adata.to_df()
corr = df.corrwith(df['ACE2']).sort_values(ascending=False)
print('normal ace2')
print(corr)
corr.to_csv('data/cluster-plots/aging-mouse/ACE2_correlates_aging_mouse_ecurd9.csv')

#%%

violin_markers = pd.read_csv('data/violin_markers_mouse.csv', header=None)
violin_markers[0] = violin_markers[0].str.upper()
violin_markers = violin_markers[violin_markers[0].isin(adata.var.index.values)]

cluster_key = pd.read_csv('data/aging_mouse_cluster_key.csv', header=None, names=['cluster', 'cell_type'])
cluster_key['cluster'] = cluster_key['cluster'].astype(str)
cluster_key_map = cluster_key.set_index('cluster')['cell_type'].to_dict()

adata.obs['Cell Types'] = adata.obs['leiden'].map(cluster_key_map, na_action='ignore').astype('category')


axes_list = sc.pl.stacked_violin(adata, var_names=violin_markers[0].values, groupby='Cell Types', swap_axes=True, show=False, )
[i.yaxis.set_ticks([]) for i in axes_list]
ax = plt.gca()
ax.get_xaxis().set_label_text('')
ax.figure.set_size_inches(5, 6)
plt.savefig('data/cluster-plots/aging-mouse/cell_type_stacked_violin2.pdf', bbox_inches='tight')
19 changes: 19 additions & 0 deletions data/aging_mouse_cluster_key.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
0,Alveolar type 2
2,Alveolar type 2
15,Alveolar type 2
5,B cells
14,B cells
3,Ciliated cells
6,Dendritic cells
7,Endothelial cells
16,Endothelial cells
13,Fibroblasts
8,Goblet/Club cells
17,Ki67+ cells
1,Macrophages
9,Macrophages
12,Macrophages
11,Mesothelial cells
10,Monocytes
18,Neutrophils
4,T cells
11 changes: 11 additions & 0 deletions data/combined_human_cluster_key.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
1,Basal cells
8,Basal cells
3,Ciliated cells
0,Club cells
4,Club cells
5,Club cells
2,Goblet cells
6,Goblet cells
10,Goblet cells
12,Immune cells
14,Ionocytes
27 changes: 27 additions & 0 deletions data/healthy_mouse_cluster_key.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
17,Alveolar type 1
1,Alveolar type 2
12,Alveolar type 2
19,Alveolar type 2
0,B cells
10,B cells
20,Ciliated cells
8,Dendritic cells
21,Dendritic cells
5,Endothelial cells
7,Endothelial cells
23,Endothelial cells
18,Goblet/Club cells
13,Fibroblasts
3,Macrophages
22,Macrophages
26,Macrophages
15,Macrophages
6,Monocytes
9,Monocytes
16,Neutrophils
28,Neutrophils
4,NK cells
11,NK cells
25,NK cells
2,T cells
14,T cells
17 changes: 17 additions & 0 deletions data/highlighted_genes.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
ACE2,Coronavirus Receptor
EPCAM,Epithelium
PTPRC,Immune Cells
PDGFRA,Mesenchymal Cells
TMEM100,Endothelium
RTKN2,Alveolar Type 1
FOXJ1,Ciliated Cells
LAMP3,Alveolar Type 2
MUC5B,Secretory Cells
MUC5AC,Goblet Cells
SCGB3C1,Secretory Cells
SCGB1A1,Club Cells
GABRP,Secretory Cells
LYPD2,Goblet Cells
CLDN10,Secretory Cells
KRT5,Basal Cells
TRP63,Basal Cells
18 changes: 18 additions & 0 deletions data/violin_markers_human.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
ACE2,Coronavirus Receptor
AGR2,Goblet cells
BPIFB1, goblet/club
CFTR,Ionocytes
CLDN10,ACE2 correlation
EPCAM,Epithelium
FOXJ1, ciliated cells
GABRP,Secretory Cells
GZMA, NK cells
KRT5,Basal
LAMP3,AT-2
MUC1,AT-2
MUC5AC, goblet cells
MUC5B,Secretory Cells
PTPRC,Immune Cells
SCGB1A1, club cells (secretory cells)
Scgb3a1, goblet cells
SCGB3C1,Secretory Cells
30 changes: 30 additions & 0 deletions data/violin_markers_mouse.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
ACE2,Coronavirus Receptor
AGR2,Goblet cells
AKAP5,AT-1
BPIFB1, goblet/club
CCL5,Neutrophils
CD3G,T cells
CD79A,B cells
CHIL3,Alveolar Macrophages
CLDN10,ACE2 correlation
CXCR2, neutrophils/eosinophils
EPCAM,Epithelium
FOXJ1, ciliated cells
GABRP,Secretory Cells
GZMA, NK cells
LAMP3,AT-2
MGP,Interstitial fibroblasts
MUC1,AT-2
MUC5AC, goblet cells
MUC5B,Secretory Cells
PDGFRA,Mesenchymal Cells
PLA2G1B,AT-2
PTPRC,Immune Cells
Reg3g, goblet cells
RTKN2,AT-1
SCGB1A1, club cells (secretory cells)
Scgb3a1, goblet cells
SCGB3C1,Secretory Cells
SFTPC,AT-2
TFF2, goblet cells
TMEM100,Endothelium
92 changes: 92 additions & 0 deletions gtex_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 21 19:26:43 2020
@author: Joan Smith
"""

#%%
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np

gtex = pd.read_csv("data/raw-data/GTEX/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct", sep="\t", header=2, index_col=1)
gtex = gtex.drop('Name', axis=1).astype(float)
attributes = pd.read_csv("data/raw-data/GTEX/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt", sep='\t', index_col=0, dtype=None)
phenotypes = pd.read_csv("data/raw-data/GTEX/GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt", sep='\t', index_col=0)
phenotypes[['SEX']] = phenotypes[['SEX']].replace(2, 'F')
phenotypes[['SEX']] = phenotypes[['SEX']].replace(1, 'M')

#%%
def set_labels(ax, labels):
ax.set_xticks(np.arange(1, len(labels) + 1))
ax.set_xticklabels(labels)

#%%
def plot_by_pheno(ace2_tissue_w_pheno, col, tissue):
fig, ax = plt.subplots()
plt.title(tissue + '\nACE2 by ' + col.lower())
pheno = [(i[0], i[1].ACE2.dropna()) for i in ace2_tissue_w_pheno.groupby(col)]
labels, split = zip(*pheno)
ax.violinplot(split, showmeans=True)
set_labels(ax, labels)

ace2_tissue_w_pheno.reset_index().pivot_table(columns='SEX', values='ACE2', index='index').to_csv('gtex_' + tissue.lower() + '_sex.csv')
ace2_tissue_w_pheno.reset_index().pivot_table(columns='AGE', values='ACE2', index='index').to_csv('gtex_' + tissue.lower() + '_age.csv')


#%% LUNG
ace2 = gtex.loc['ACE2']
samples_w_lung = attributes[attributes['SMTS'] == 'Lung'].index
ace2_lung = gtex.loc['ACE2'][samples_w_lung].astype(float)
ace2_lung.index = ace2_lung.index.str[0:10]

ace2_lung_w_pheno = phenotypes.join(ace2_lung, how='inner')
plot_by_pheno(ace2_lung_w_pheno, 'AGE', 'Lung')
plot_by_pheno(ace2_lung_w_pheno, 'SEX', 'Lung')

#%% Esophogeal Mucosa

samples_w_esoph_muc = attributes[attributes['SMTSD'] == 'Esophagus - Mucosa'].index
ace2_esoph = gtex.loc['ACE2'][samples_w_esoph_muc].astype(float)
ace2_esoph.index = ace2_esoph.index.str[0:10]

ace2_esoph_w_pheno = phenotypes.join(ace2_esoph, how='inner')
plot_by_pheno(ace2_esoph_w_pheno, 'AGE', 'Esophagus - Mucosa')
plot_by_pheno(ace2_esoph_w_pheno, 'SEX', 'Esophagus - Mucosa')

#%% Salivary

samples_w_salivary = attributes[attributes['SMTS'] == 'Salivary Gland'].index
ace2_sal = gtex.loc['ACE2'][samples_w_salivary].astype(float)
ace2_sal.index = ace2_sal.index.str[0:10]

ace2_sal_w_pheno = phenotypes.join(ace2_sal, how='inner')
plot_by_pheno(ace2_sal_w_pheno, 'AGE', 'Salivary Gland')
plot_by_pheno(ace2_sal_w_pheno, 'SEX', 'Salivary Gland')

#%% All Tissue

#%% Plot All Tissue

ace2_tissue = gtex.loc[['ACE2']].T.join(attributes)
ace2_tissue['ACE2'] = np.log2(ace2_tissue['ACE2'] + 1)

fig, ax = plt.subplots()
plt.title('ACE2 by tissue')

order = ace2_tissue.groupby('SMTS')['ACE2'].apply(np.mean).sort_values()
print(order)
g = {i[0]: i[1].ACE2.dropna() for i in ace2_tissue.groupby('SMTS')}
ordered_g= [(k, g[k]) for k in order.index]
labels, split = zip(*ordered_g)
ax.violinplot(split, showmeans=True)
set_labels(ax, labels)
plt.xticks(rotation=45)
plt.show()

#%% Export Data for All Tissue

ace2_tissue.reset_index().pivot_table(columns='SMTS', values='ACE2', index='index').to_csv('ace2_by_tissue.csv')

Loading

0 comments on commit 93030c0

Please sign in to comment.