Initial code commit, for preprint submission

JackNg88 · Mar 28, 2020 · 93030c0 · 93030c0
1 parent 66c76c8
commit 93030c0
Show file tree

Hide file tree

Showing 12 changed files with 778 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -1,2 +1,12 @@
 # covid19
 ACE2 expression and cigarette exposure
+
+These analyses require: 
+- pandas
+- sklearn
+- scipy
+- statsmodels
+- scanpy
+
+Optional:
+- Multicore-TSNE
diff --git a/aging-mouse-lung-ecurd9.py b/aging-mouse-lung-ecurd9.py
@@ -0,0 +1,149 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Mar 24 09:40:32 2020
+
+@author: Joan Smith
+
+https://www.ebi.ac.uk/gxa/sc/experiments/E-CURD-9/results/tsne
+"""
+
+
+#%%
+
+import pandas as pd
+import scanpy as sc
+from matplotlib import pyplot as plt
+import matplotlib as mpl
+
+#%% Set scanpy and matplotlib settings
+sc.settings.figdir = 'data/cluster-plots/aging-mouse/'
+sc.settings.verbosity = 4
+
+mpl.rcParams.update(mpl.rcParamsDefault)
+mpl.rcParams.update({
+    'font.sans-serif': 'Arial',
+    'font.family': 'sans-serif',
+    'axes.titlesize': 18,
+    'axes.labelsize': 14,
+    })
+#%% Collect gene names and annotation file
+
+gene_names = pd.read_csv('data/raw-data/Mus_musculus.GRCm38.99.chr.gtf', sep='\t', comment='#', header=None)
+
+gene_names.columns = ['chr', 'annotation_src', 'feature_type', 'start', 'end', 'score', 'genomic_strand', 'genomic_phase', 'extra']
+gene_names['gene_name'] = gene_names['extra'].str.extract(pat='gene_name "(.*?)";')
+gene_names['gene_id'] = gene_names['extra'].str.extract(pat='gene_id "(.*?)";')
+gene_names['transcript_type'] = gene_names['extra'].str.extract(pat='transcript_type "(.*?)";')
+gene_names = gene_names[gene_names['feature_type'] == 'gene']
+
+annotation = gene_names[['gene_id', 'gene_name', 'chr']].groupby('gene_name').head(1)
+annotation['gene_name'] = annotation['gene_name'].str.upper()
+annotation = annotation.set_index('gene_id')
+
+#%% Read and log normalize single cell data
+
+adata = sc.read_mtx('data/raw-data/E-CURD-9/E-CURD-9.aggregated_filtered_normalised_counts.mtx')
+data = adata.X.toarray().T
+adata = sc.AnnData(data)
+
+cols = pd.read_csv('data/raw-data/E-CURD-9/E-CURD-9.aggregated_filtered_normalised_counts.mtx_cols', header=None)
+rows = pd.read_csv('data/raw-data/E-CURD-9/E-CURD-9.aggregated_filtered_normalised_counts.mtx_rows', sep='\t', header=None)
+adata.var = rows.set_index(0).join(annotation)[['gene_name']].reset_index().set_index('gene_name')
+adata.var.index = adata.var.index.astype('str')
+adata.var_names_make_unique()
+adata.obs = cols
+sc.pp.log1p(adata, base=2)
+
+#%% Exploratory Plots
+
+# Unused in the main analysis, but used to parameter search for filtering cutoff.
+def exploratory_plots(adata):
+    plt.figure()
+    plt.hist(adata.obs.n_genes, bins=500)
+    plt.title('Aging Mouse per cell')
+    print(adata.obs['n_genes'].min())
+
+    minimum = adata.obs['n_genes'].min()
+    plt.xlabel('# Genes. Min:' + str(minimum))
+    plt.ylabel('# Cells')
+    plt.show()
+    plt.savefig('data/cluster-plots/aging-mouse/aging_mouse_genes_cell.pdf')
+
+#%% Perform Clustering
+
+sc.pp.filter_cells(adata, min_genes=500)
+sc.pp.highly_variable_genes(adata, min_mean=0.0, max_mean=13, min_disp=0.5)
+
+sc.tl.pca(adata)
+sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30)
+sc.tl.tsne(adata, perplexity=25)
+sc.tl.leiden(adata)
+
+#%% Plot Clusters with highlighted genes
+
+markers = pd.read_csv('data/highlighted_genes.csv', header=None, names=['gene', 'cluster'])
+markers['gene'] = markers['gene'].str.upper()
+markers = markers[markers['gene'].isin(list(annotation['gene_name'].values))]
+
+markers['title'] = markers['gene'] + '+: ' + markers['cluster']
+markers = markers.set_index('gene')
+markers.loc['PTPRC']['title'] = 'PTPRC (CD45)+: Immune Cells'
+
+
+ax = sc.pl.tsne(adata, color=['leiden'],
+           title=['Mouse Lung'], 
+           color_map='plasma',
+           size=25,
+           save='_labeled_leiden_aging_mouse_ecurd9_markers_500gene.pdf',
+           show=False) 
+
+ax = sc.pl.tsne(adata, color=['leiden'],
+           title=['Mouse Lung'], 
+           color_map='plasma',
+           size=25,
+           show=False) 
+ax.legend().remove()
+plt.savefig('data/cluster-plots/aging-mouse/leiden_aging_mouse_ecurd9_potential_markers_500gene.pdf')
+
+for i, g in markers.iterrows():
+    sc.pl.tsne(adata, color=i,
+           title=g['title'],
+           color_map='plasma',
+           size=25,
+           save='_' + i + '_aging_mouse_ecurd9_potential_markers_500gene.pdf',
+           show=False) 
+
+
+#%% Collect top ranked genes for each group
+
+sc.tl.rank_genes_groups(adata, groupby='leiden', n_genes=10)
+pd.DataFrame(adata.uns['rank_genes_groups']['names']).to_csv('data/cluster-plots/aging-mouse/aging_mouse_groups.csv')
+
+#%% Calculate Correlations with ACE2
+
+df = adata.to_df()
+corr = df.corrwith(df['ACE2']).sort_values(ascending=False)
+print('normal ace2')
+print(corr)
+corr.to_csv('data/cluster-plots/aging-mouse/ACE2_correlates_aging_mouse_ecurd9.csv')
+
+#%%
+
+violin_markers = pd.read_csv('data/violin_markers_mouse.csv', header=None)
+violin_markers[0] = violin_markers[0].str.upper()
+violin_markers = violin_markers[violin_markers[0].isin(adata.var.index.values)]
+
+cluster_key = pd.read_csv('data/aging_mouse_cluster_key.csv', header=None, names=['cluster', 'cell_type'])
+cluster_key['cluster'] = cluster_key['cluster'].astype(str)
+cluster_key_map = cluster_key.set_index('cluster')['cell_type'].to_dict()
+
+adata.obs['Cell Types'] = adata.obs['leiden'].map(cluster_key_map, na_action='ignore').astype('category')
+
+
+axes_list = sc.pl.stacked_violin(adata, var_names=violin_markers[0].values, groupby='Cell Types', swap_axes=True, show=False, )
+[i.yaxis.set_ticks([]) for i in axes_list]
+ax = plt.gca()
+ax.get_xaxis().set_label_text('')
+ax.figure.set_size_inches(5, 6)
+plt.savefig('data/cluster-plots/aging-mouse/cell_type_stacked_violin2.pdf', bbox_inches='tight')
diff --git a/data/aging_mouse_cluster_key.csv b/data/aging_mouse_cluster_key.csv
@@ -0,0 +1,19 @@
+0,Alveolar type 2
+2,Alveolar type 2
+15,Alveolar type 2
+5,B cells
+14,B cells
+3,Ciliated cells
+6,Dendritic cells
+7,Endothelial cells
+16,Endothelial cells
+13,Fibroblasts
+8,Goblet/Club cells
+17,Ki67+ cells
+1,Macrophages
+9,Macrophages
+12,Macrophages
+11,Mesothelial cells
+10,Monocytes
+18,Neutrophils
+4,T cells
diff --git a/data/combined_human_cluster_key.csv b/data/combined_human_cluster_key.csv
@@ -0,0 +1,11 @@
+1,Basal cells
+8,Basal cells
+3,Ciliated cells
+0,Club cells
+4,Club cells
+5,Club cells
+2,Goblet cells
+6,Goblet cells
+10,Goblet cells
+12,Immune cells
+14,Ionocytes
diff --git a/data/healthy_mouse_cluster_key.csv b/data/healthy_mouse_cluster_key.csv
@@ -0,0 +1,27 @@
+17,Alveolar type 1
+1,Alveolar type 2
+12,Alveolar type 2
+19,Alveolar type 2
+0,B cells
+10,B cells
+20,Ciliated cells
+8,Dendritic cells
+21,Dendritic cells
+5,Endothelial cells
+7,Endothelial cells
+23,Endothelial cells
+18,Goblet/Club cells
+13,Fibroblasts
+3,Macrophages
+22,Macrophages
+26,Macrophages
+15,Macrophages
+6,Monocytes
+9,Monocytes
+16,Neutrophils
+28,Neutrophils
+4,NK cells
+11,NK cells
+25,NK cells
+2,T cells
+14,T cells
diff --git a/data/highlighted_genes.csv b/data/highlighted_genes.csv
@@ -0,0 +1,17 @@
+ACE2,Coronavirus Receptor
+EPCAM,Epithelium
+PTPRC,Immune Cells
+PDGFRA,Mesenchymal Cells
+TMEM100,Endothelium
+RTKN2,Alveolar Type 1
+FOXJ1,Ciliated Cells
+LAMP3,Alveolar Type 2
+MUC5B,Secretory Cells
+MUC5AC,Goblet Cells
+SCGB3C1,Secretory Cells
+SCGB1A1,Club Cells
+GABRP,Secretory Cells
+LYPD2,Goblet Cells
+CLDN10,Secretory Cells
+KRT5,Basal Cells
+TRP63,Basal Cells
diff --git a/data/violin_markers_human.csv b/data/violin_markers_human.csv
@@ -0,0 +1,18 @@
+ACE2,Coronavirus Receptor
+AGR2,Goblet cells
+BPIFB1, goblet/club 
+CFTR,Ionocytes
+CLDN10,ACE2 correlation
+EPCAM,Epithelium
+FOXJ1, ciliated cells
+GABRP,Secretory Cells
+GZMA, NK cells
+KRT5,Basal
+LAMP3,AT-2
+MUC1,AT-2
+MUC5AC, goblet cells
+MUC5B,Secretory Cells
+PTPRC,Immune Cells
+SCGB1A1, club cells (secretory cells) 
+Scgb3a1, goblet cells 
+SCGB3C1,Secretory Cells
diff --git a/data/violin_markers_mouse.csv b/data/violin_markers_mouse.csv
@@ -0,0 +1,30 @@
+ACE2,Coronavirus Receptor
+AGR2,Goblet cells
+AKAP5,AT-1
+BPIFB1, goblet/club 
+CCL5,Neutrophils
+CD3G,T cells
+CD79A,B cells
+CHIL3,Alveolar Macrophages
+CLDN10,ACE2 correlation
+CXCR2, neutrophils/eosinophils
+EPCAM,Epithelium
+FOXJ1, ciliated cells
+GABRP,Secretory Cells
+GZMA, NK cells
+LAMP3,AT-2
+MGP,Interstitial fibroblasts
+MUC1,AT-2
+MUC5AC, goblet cells
+MUC5B,Secretory Cells
+PDGFRA,Mesenchymal Cells
+PLA2G1B,AT-2
+PTPRC,Immune Cells
+Reg3g, goblet cells
+RTKN2,AT-1
+SCGB1A1, club cells (secretory cells) 
+Scgb3a1, goblet cells 
+SCGB3C1,Secretory Cells
+SFTPC,AT-2
+TFF2, goblet cells
+TMEM100,Endothelium
diff --git a/gtex_analysis.py b/gtex_analysis.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sat Mar 21 19:26:43 2020
+
+@author: Joan Smith
+"""
+
+#%%
+import pandas as pd
+from matplotlib import pyplot as plt
+import numpy as np
+
+gtex = pd.read_csv("data/raw-data/GTEX/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct", sep="\t", header=2, index_col=1)
+gtex = gtex.drop('Name', axis=1).astype(float)
+attributes = pd.read_csv("data/raw-data/GTEX/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt", sep='\t', index_col=0, dtype=None)
+phenotypes = pd.read_csv("data/raw-data/GTEX/GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt", sep='\t', index_col=0)
+phenotypes[['SEX']] = phenotypes[['SEX']].replace(2, 'F')
+phenotypes[['SEX']] = phenotypes[['SEX']].replace(1, 'M')
+
+#%%
+def set_labels(ax, labels):
+    ax.set_xticks(np.arange(1, len(labels) + 1))
+    ax.set_xticklabels(labels)
+
+#%%
+def plot_by_pheno(ace2_tissue_w_pheno, col, tissue):
+    fig, ax = plt.subplots()
+    plt.title(tissue + '\nACE2 by ' + col.lower())
+    pheno = [(i[0], i[1].ACE2.dropna()) for i in ace2_tissue_w_pheno.groupby(col)]
+    labels, split = zip(*pheno)
+    ax.violinplot(split, showmeans=True)
+    set_labels(ax, labels)
+
+    ace2_tissue_w_pheno.reset_index().pivot_table(columns='SEX', values='ACE2', index='index').to_csv('gtex_' + tissue.lower() + '_sex.csv')
+    ace2_tissue_w_pheno.reset_index().pivot_table(columns='AGE', values='ACE2', index='index').to_csv('gtex_' + tissue.lower() + '_age.csv')
+
+
+#%% LUNG
+ace2 = gtex.loc['ACE2']
+samples_w_lung = attributes[attributes['SMTS'] == 'Lung'].index
+ace2_lung = gtex.loc['ACE2'][samples_w_lung].astype(float)
+ace2_lung.index = ace2_lung.index.str[0:10]
+
+ace2_lung_w_pheno = phenotypes.join(ace2_lung, how='inner')
+plot_by_pheno(ace2_lung_w_pheno, 'AGE', 'Lung')
+plot_by_pheno(ace2_lung_w_pheno, 'SEX', 'Lung')
+
+#%% Esophogeal Mucosa
+
+samples_w_esoph_muc = attributes[attributes['SMTSD'] == 'Esophagus - Mucosa'].index
+ace2_esoph = gtex.loc['ACE2'][samples_w_esoph_muc].astype(float)
+ace2_esoph.index = ace2_esoph.index.str[0:10]
+
+ace2_esoph_w_pheno = phenotypes.join(ace2_esoph, how='inner')
+plot_by_pheno(ace2_esoph_w_pheno, 'AGE', 'Esophagus - Mucosa')
+plot_by_pheno(ace2_esoph_w_pheno, 'SEX', 'Esophagus - Mucosa')
+
+#%% Salivary
+
+samples_w_salivary = attributes[attributes['SMTS'] == 'Salivary Gland'].index
+ace2_sal = gtex.loc['ACE2'][samples_w_salivary].astype(float)
+ace2_sal.index = ace2_sal.index.str[0:10]
+
+ace2_sal_w_pheno = phenotypes.join(ace2_sal, how='inner')
+plot_by_pheno(ace2_sal_w_pheno, 'AGE', 'Salivary Gland')
+plot_by_pheno(ace2_sal_w_pheno, 'SEX', 'Salivary Gland')
+
+#%% All Tissue
+
+#%% Plot All Tissue
+
+ace2_tissue = gtex.loc[['ACE2']].T.join(attributes)
+ace2_tissue['ACE2'] = np.log2(ace2_tissue['ACE2'] + 1)
+
+fig, ax = plt.subplots()
+plt.title('ACE2 by tissue')
+
+order = ace2_tissue.groupby('SMTS')['ACE2'].apply(np.mean).sort_values()
+print(order)
+g = {i[0]: i[1].ACE2.dropna() for i in ace2_tissue.groupby('SMTS')}
+ordered_g= [(k, g[k]) for k in order.index]
+labels, split = zip(*ordered_g)
+ax.violinplot(split, showmeans=True)
+set_labels(ax, labels)
+plt.xticks(rotation=45)
+plt.show()
+
+#%% Export Data for All Tissue
+
+ace2_tissue.reset_index().pivot_table(columns='SMTS', values='ACE2', index='index').to_csv('ace2_by_tissue.csv')
+