forked from joan-smith/covid19
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Initial code commit, for preprint submission
- Loading branch information
1 parent
66c76c8
commit 93030c0
Showing
12 changed files
with
778 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,12 @@ | ||
# covid19 | ||
ACE2 expression and cigarette exposure | ||
|
||
These analyses require: | ||
- pandas | ||
- sklearn | ||
- scipy | ||
- statsmodels | ||
- scanpy | ||
|
||
Optional: | ||
- Multicore-TSNE |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,149 @@ | ||
#!/usr/bin/env python3 | ||
# -*- coding: utf-8 -*- | ||
""" | ||
Created on Tue Mar 24 09:40:32 2020 | ||
@author: Joan Smith | ||
https://www.ebi.ac.uk/gxa/sc/experiments/E-CURD-9/results/tsne | ||
""" | ||
|
||
|
||
#%% | ||
|
||
import pandas as pd | ||
import scanpy as sc | ||
from matplotlib import pyplot as plt | ||
import matplotlib as mpl | ||
|
||
#%% Set scanpy and matplotlib settings | ||
sc.settings.figdir = 'data/cluster-plots/aging-mouse/' | ||
sc.settings.verbosity = 4 | ||
|
||
mpl.rcParams.update(mpl.rcParamsDefault) | ||
mpl.rcParams.update({ | ||
'font.sans-serif': 'Arial', | ||
'font.family': 'sans-serif', | ||
'axes.titlesize': 18, | ||
'axes.labelsize': 14, | ||
}) | ||
#%% Collect gene names and annotation file | ||
|
||
gene_names = pd.read_csv('data/raw-data/Mus_musculus.GRCm38.99.chr.gtf', sep='\t', comment='#', header=None) | ||
|
||
gene_names.columns = ['chr', 'annotation_src', 'feature_type', 'start', 'end', 'score', 'genomic_strand', 'genomic_phase', 'extra'] | ||
gene_names['gene_name'] = gene_names['extra'].str.extract(pat='gene_name "(.*?)";') | ||
gene_names['gene_id'] = gene_names['extra'].str.extract(pat='gene_id "(.*?)";') | ||
gene_names['transcript_type'] = gene_names['extra'].str.extract(pat='transcript_type "(.*?)";') | ||
gene_names = gene_names[gene_names['feature_type'] == 'gene'] | ||
|
||
annotation = gene_names[['gene_id', 'gene_name', 'chr']].groupby('gene_name').head(1) | ||
annotation['gene_name'] = annotation['gene_name'].str.upper() | ||
annotation = annotation.set_index('gene_id') | ||
|
||
#%% Read and log normalize single cell data | ||
|
||
adata = sc.read_mtx('data/raw-data/E-CURD-9/E-CURD-9.aggregated_filtered_normalised_counts.mtx') | ||
data = adata.X.toarray().T | ||
adata = sc.AnnData(data) | ||
|
||
cols = pd.read_csv('data/raw-data/E-CURD-9/E-CURD-9.aggregated_filtered_normalised_counts.mtx_cols', header=None) | ||
rows = pd.read_csv('data/raw-data/E-CURD-9/E-CURD-9.aggregated_filtered_normalised_counts.mtx_rows', sep='\t', header=None) | ||
adata.var = rows.set_index(0).join(annotation)[['gene_name']].reset_index().set_index('gene_name') | ||
adata.var.index = adata.var.index.astype('str') | ||
adata.var_names_make_unique() | ||
adata.obs = cols | ||
sc.pp.log1p(adata, base=2) | ||
|
||
#%% Exploratory Plots | ||
|
||
# Unused in the main analysis, but used to parameter search for filtering cutoff. | ||
def exploratory_plots(adata): | ||
plt.figure() | ||
plt.hist(adata.obs.n_genes, bins=500) | ||
plt.title('Aging Mouse per cell') | ||
print(adata.obs['n_genes'].min()) | ||
|
||
minimum = adata.obs['n_genes'].min() | ||
plt.xlabel('# Genes. Min:' + str(minimum)) | ||
plt.ylabel('# Cells') | ||
plt.show() | ||
plt.savefig('data/cluster-plots/aging-mouse/aging_mouse_genes_cell.pdf') | ||
|
||
#%% Perform Clustering | ||
|
||
sc.pp.filter_cells(adata, min_genes=500) | ||
sc.pp.highly_variable_genes(adata, min_mean=0.0, max_mean=13, min_disp=0.5) | ||
|
||
sc.tl.pca(adata) | ||
sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30) | ||
sc.tl.tsne(adata, perplexity=25) | ||
sc.tl.leiden(adata) | ||
|
||
#%% Plot Clusters with highlighted genes | ||
|
||
markers = pd.read_csv('data/highlighted_genes.csv', header=None, names=['gene', 'cluster']) | ||
markers['gene'] = markers['gene'].str.upper() | ||
markers = markers[markers['gene'].isin(list(annotation['gene_name'].values))] | ||
|
||
markers['title'] = markers['gene'] + '+: ' + markers['cluster'] | ||
markers = markers.set_index('gene') | ||
markers.loc['PTPRC']['title'] = 'PTPRC (CD45)+: Immune Cells' | ||
|
||
|
||
ax = sc.pl.tsne(adata, color=['leiden'], | ||
title=['Mouse Lung'], | ||
color_map='plasma', | ||
size=25, | ||
save='_labeled_leiden_aging_mouse_ecurd9_markers_500gene.pdf', | ||
show=False) | ||
|
||
ax = sc.pl.tsne(adata, color=['leiden'], | ||
title=['Mouse Lung'], | ||
color_map='plasma', | ||
size=25, | ||
show=False) | ||
ax.legend().remove() | ||
plt.savefig('data/cluster-plots/aging-mouse/leiden_aging_mouse_ecurd9_potential_markers_500gene.pdf') | ||
|
||
for i, g in markers.iterrows(): | ||
sc.pl.tsne(adata, color=i, | ||
title=g['title'], | ||
color_map='plasma', | ||
size=25, | ||
save='_' + i + '_aging_mouse_ecurd9_potential_markers_500gene.pdf', | ||
show=False) | ||
|
||
|
||
#%% Collect top ranked genes for each group | ||
|
||
sc.tl.rank_genes_groups(adata, groupby='leiden', n_genes=10) | ||
pd.DataFrame(adata.uns['rank_genes_groups']['names']).to_csv('data/cluster-plots/aging-mouse/aging_mouse_groups.csv') | ||
|
||
#%% Calculate Correlations with ACE2 | ||
|
||
df = adata.to_df() | ||
corr = df.corrwith(df['ACE2']).sort_values(ascending=False) | ||
print('normal ace2') | ||
print(corr) | ||
corr.to_csv('data/cluster-plots/aging-mouse/ACE2_correlates_aging_mouse_ecurd9.csv') | ||
|
||
#%% | ||
|
||
violin_markers = pd.read_csv('data/violin_markers_mouse.csv', header=None) | ||
violin_markers[0] = violin_markers[0].str.upper() | ||
violin_markers = violin_markers[violin_markers[0].isin(adata.var.index.values)] | ||
|
||
cluster_key = pd.read_csv('data/aging_mouse_cluster_key.csv', header=None, names=['cluster', 'cell_type']) | ||
cluster_key['cluster'] = cluster_key['cluster'].astype(str) | ||
cluster_key_map = cluster_key.set_index('cluster')['cell_type'].to_dict() | ||
|
||
adata.obs['Cell Types'] = adata.obs['leiden'].map(cluster_key_map, na_action='ignore').astype('category') | ||
|
||
|
||
axes_list = sc.pl.stacked_violin(adata, var_names=violin_markers[0].values, groupby='Cell Types', swap_axes=True, show=False, ) | ||
[i.yaxis.set_ticks([]) for i in axes_list] | ||
ax = plt.gca() | ||
ax.get_xaxis().set_label_text('') | ||
ax.figure.set_size_inches(5, 6) | ||
plt.savefig('data/cluster-plots/aging-mouse/cell_type_stacked_violin2.pdf', bbox_inches='tight') |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
0,Alveolar type 2 | ||
2,Alveolar type 2 | ||
15,Alveolar type 2 | ||
5,B cells | ||
14,B cells | ||
3,Ciliated cells | ||
6,Dendritic cells | ||
7,Endothelial cells | ||
16,Endothelial cells | ||
13,Fibroblasts | ||
8,Goblet/Club cells | ||
17,Ki67+ cells | ||
1,Macrophages | ||
9,Macrophages | ||
12,Macrophages | ||
11,Mesothelial cells | ||
10,Monocytes | ||
18,Neutrophils | ||
4,T cells |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
1,Basal cells | ||
8,Basal cells | ||
3,Ciliated cells | ||
0,Club cells | ||
4,Club cells | ||
5,Club cells | ||
2,Goblet cells | ||
6,Goblet cells | ||
10,Goblet cells | ||
12,Immune cells | ||
14,Ionocytes |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
17,Alveolar type 1 | ||
1,Alveolar type 2 | ||
12,Alveolar type 2 | ||
19,Alveolar type 2 | ||
0,B cells | ||
10,B cells | ||
20,Ciliated cells | ||
8,Dendritic cells | ||
21,Dendritic cells | ||
5,Endothelial cells | ||
7,Endothelial cells | ||
23,Endothelial cells | ||
18,Goblet/Club cells | ||
13,Fibroblasts | ||
3,Macrophages | ||
22,Macrophages | ||
26,Macrophages | ||
15,Macrophages | ||
6,Monocytes | ||
9,Monocytes | ||
16,Neutrophils | ||
28,Neutrophils | ||
4,NK cells | ||
11,NK cells | ||
25,NK cells | ||
2,T cells | ||
14,T cells |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
ACE2,Coronavirus Receptor | ||
EPCAM,Epithelium | ||
PTPRC,Immune Cells | ||
PDGFRA,Mesenchymal Cells | ||
TMEM100,Endothelium | ||
RTKN2,Alveolar Type 1 | ||
FOXJ1,Ciliated Cells | ||
LAMP3,Alveolar Type 2 | ||
MUC5B,Secretory Cells | ||
MUC5AC,Goblet Cells | ||
SCGB3C1,Secretory Cells | ||
SCGB1A1,Club Cells | ||
GABRP,Secretory Cells | ||
LYPD2,Goblet Cells | ||
CLDN10,Secretory Cells | ||
KRT5,Basal Cells | ||
TRP63,Basal Cells |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
ACE2,Coronavirus Receptor | ||
AGR2,Goblet cells | ||
BPIFB1, goblet/club | ||
CFTR,Ionocytes | ||
CLDN10,ACE2 correlation | ||
EPCAM,Epithelium | ||
FOXJ1, ciliated cells | ||
GABRP,Secretory Cells | ||
GZMA, NK cells | ||
KRT5,Basal | ||
LAMP3,AT-2 | ||
MUC1,AT-2 | ||
MUC5AC, goblet cells | ||
MUC5B,Secretory Cells | ||
PTPRC,Immune Cells | ||
SCGB1A1, club cells (secretory cells) | ||
Scgb3a1, goblet cells | ||
SCGB3C1,Secretory Cells |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
ACE2,Coronavirus Receptor | ||
AGR2,Goblet cells | ||
AKAP5,AT-1 | ||
BPIFB1, goblet/club | ||
CCL5,Neutrophils | ||
CD3G,T cells | ||
CD79A,B cells | ||
CHIL3,Alveolar Macrophages | ||
CLDN10,ACE2 correlation | ||
CXCR2, neutrophils/eosinophils | ||
EPCAM,Epithelium | ||
FOXJ1, ciliated cells | ||
GABRP,Secretory Cells | ||
GZMA, NK cells | ||
LAMP3,AT-2 | ||
MGP,Interstitial fibroblasts | ||
MUC1,AT-2 | ||
MUC5AC, goblet cells | ||
MUC5B,Secretory Cells | ||
PDGFRA,Mesenchymal Cells | ||
PLA2G1B,AT-2 | ||
PTPRC,Immune Cells | ||
Reg3g, goblet cells | ||
RTKN2,AT-1 | ||
SCGB1A1, club cells (secretory cells) | ||
Scgb3a1, goblet cells | ||
SCGB3C1,Secretory Cells | ||
SFTPC,AT-2 | ||
TFF2, goblet cells | ||
TMEM100,Endothelium |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
#!/usr/bin/env python3 | ||
# -*- coding: utf-8 -*- | ||
""" | ||
Created on Sat Mar 21 19:26:43 2020 | ||
@author: Joan Smith | ||
""" | ||
|
||
#%% | ||
import pandas as pd | ||
from matplotlib import pyplot as plt | ||
import numpy as np | ||
|
||
gtex = pd.read_csv("data/raw-data/GTEX/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct", sep="\t", header=2, index_col=1) | ||
gtex = gtex.drop('Name', axis=1).astype(float) | ||
attributes = pd.read_csv("data/raw-data/GTEX/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt", sep='\t', index_col=0, dtype=None) | ||
phenotypes = pd.read_csv("data/raw-data/GTEX/GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt", sep='\t', index_col=0) | ||
phenotypes[['SEX']] = phenotypes[['SEX']].replace(2, 'F') | ||
phenotypes[['SEX']] = phenotypes[['SEX']].replace(1, 'M') | ||
|
||
#%% | ||
def set_labels(ax, labels): | ||
ax.set_xticks(np.arange(1, len(labels) + 1)) | ||
ax.set_xticklabels(labels) | ||
|
||
#%% | ||
def plot_by_pheno(ace2_tissue_w_pheno, col, tissue): | ||
fig, ax = plt.subplots() | ||
plt.title(tissue + '\nACE2 by ' + col.lower()) | ||
pheno = [(i[0], i[1].ACE2.dropna()) for i in ace2_tissue_w_pheno.groupby(col)] | ||
labels, split = zip(*pheno) | ||
ax.violinplot(split, showmeans=True) | ||
set_labels(ax, labels) | ||
|
||
ace2_tissue_w_pheno.reset_index().pivot_table(columns='SEX', values='ACE2', index='index').to_csv('gtex_' + tissue.lower() + '_sex.csv') | ||
ace2_tissue_w_pheno.reset_index().pivot_table(columns='AGE', values='ACE2', index='index').to_csv('gtex_' + tissue.lower() + '_age.csv') | ||
|
||
|
||
#%% LUNG | ||
ace2 = gtex.loc['ACE2'] | ||
samples_w_lung = attributes[attributes['SMTS'] == 'Lung'].index | ||
ace2_lung = gtex.loc['ACE2'][samples_w_lung].astype(float) | ||
ace2_lung.index = ace2_lung.index.str[0:10] | ||
|
||
ace2_lung_w_pheno = phenotypes.join(ace2_lung, how='inner') | ||
plot_by_pheno(ace2_lung_w_pheno, 'AGE', 'Lung') | ||
plot_by_pheno(ace2_lung_w_pheno, 'SEX', 'Lung') | ||
|
||
#%% Esophogeal Mucosa | ||
|
||
samples_w_esoph_muc = attributes[attributes['SMTSD'] == 'Esophagus - Mucosa'].index | ||
ace2_esoph = gtex.loc['ACE2'][samples_w_esoph_muc].astype(float) | ||
ace2_esoph.index = ace2_esoph.index.str[0:10] | ||
|
||
ace2_esoph_w_pheno = phenotypes.join(ace2_esoph, how='inner') | ||
plot_by_pheno(ace2_esoph_w_pheno, 'AGE', 'Esophagus - Mucosa') | ||
plot_by_pheno(ace2_esoph_w_pheno, 'SEX', 'Esophagus - Mucosa') | ||
|
||
#%% Salivary | ||
|
||
samples_w_salivary = attributes[attributes['SMTS'] == 'Salivary Gland'].index | ||
ace2_sal = gtex.loc['ACE2'][samples_w_salivary].astype(float) | ||
ace2_sal.index = ace2_sal.index.str[0:10] | ||
|
||
ace2_sal_w_pheno = phenotypes.join(ace2_sal, how='inner') | ||
plot_by_pheno(ace2_sal_w_pheno, 'AGE', 'Salivary Gland') | ||
plot_by_pheno(ace2_sal_w_pheno, 'SEX', 'Salivary Gland') | ||
|
||
#%% All Tissue | ||
|
||
#%% Plot All Tissue | ||
|
||
ace2_tissue = gtex.loc[['ACE2']].T.join(attributes) | ||
ace2_tissue['ACE2'] = np.log2(ace2_tissue['ACE2'] + 1) | ||
|
||
fig, ax = plt.subplots() | ||
plt.title('ACE2 by tissue') | ||
|
||
order = ace2_tissue.groupby('SMTS')['ACE2'].apply(np.mean).sort_values() | ||
print(order) | ||
g = {i[0]: i[1].ACE2.dropna() for i in ace2_tissue.groupby('SMTS')} | ||
ordered_g= [(k, g[k]) for k in order.index] | ||
labels, split = zip(*ordered_g) | ||
ax.violinplot(split, showmeans=True) | ||
set_labels(ax, labels) | ||
plt.xticks(rotation=45) | ||
plt.show() | ||
|
||
#%% Export Data for All Tissue | ||
|
||
ace2_tissue.reset_index().pivot_table(columns='SMTS', values='ACE2', index='index').to_csv('ace2_by_tissue.csv') | ||
|
Oops, something went wrong.