diff --git a/airway_smoking_2020_gse134174.py b/airway_smoking_2020_gse134174.py new file mode 100755 index 0000000..c5311d8 --- /dev/null +++ b/airway_smoking_2020_gse134174.py @@ -0,0 +1,209 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Apr 14 13:38:41 2020 + +@author: Joan Smith +https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE134174 +""" +#%% + +import pandas as pd +import scanpy as sc +from matplotlib import pyplot as plt +import matplotlib as mpl +import os +import numpy as np +#%% +sc.settings.figdir = 'data/cluster-plots/airway-smoking-2020-gse134174/' +sc.settings.verbosity = 4 + +mpl.rcParams.update(mpl.rcParamsDefault) +mpl.rcParams.update({ + 'font.sans-serif': 'Arial', + 'font.family': 'sans-serif', + 'axes.titlesize': 18, + 'axes.labelsize': 14, + }) + +#%% Load prenormed data and metadata +input_file = 'data/raw-data/airway-smoking-GSE134174/GSE134174_Processed_invivo_norm.txt' +metadata = pd.read_csv('data/raw-data/airway-smoking-GSE134174/GSE134174_Processed_invivo_metadata.txt', sep='\t', index_col=0) +adata = sc.read_text(input_file).T +adata.obs = metadata +all_adata = adata.copy() +VARIANT = 'all' + +#%% +def exploratory_plots(adata): + + num_non_int = (adata.to_df().applymap(float.is_integer) == False).sum().sum() + print('Num non-int: ', num_non_int) + plt.figure() + sc.pp.filter_cells(adata, min_genes=0) + plt.hist(adata.obs.n_genes, bins=500) + plt.title('Genes per cell') + minimum = adata.obs['n_genes'].min() + plt.xlabel('# Genes. Min:' + str(minimum)) + plt.ylabel('# Cells') + + plt.figure() + sc.pl.highest_expr_genes(adata, n_top=20, ) + sc.pl.pca_variance_ratio(adata, log=True) + plt.show() +exploratory_plots(adata) + +#%% HEAVY AND NEVER +adata = all_adata[all_adata.obs.Smoke_status.isin(['heavy', 'never'])].copy() +VARIANT='heavy_and_never' +sc.settings.figdir = 'data/cluster-plots/airway-smoking-2020-gse134174-heavy-and-never' + +#%% +sc.pp.highly_variable_genes(adata) +sc.tl.pca(adata) +sc.pp.neighbors(adata) + +#%% +LEARNING_RATE = 1000 +EARLY_EXAGGERATION = 12 +RESOLUTION = 2.5 #2 +PERPLEXITY=30 +N_PCS = 15 + +sc.tl.tsne(adata, learning_rate=LEARNING_RATE, n_jobs=8, early_exaggeration=EARLY_EXAGGERATION, perplexity=PERPLEXITY, n_pcs=N_PCS) +sc.tl.leiden(adata, resolution=RESOLUTION) + +params = {'learning_rate': LEARNING_RATE, + 'early_exaggeration':EARLY_EXAGGERATION, + 'resolution': RESOLUTION, + 'perplexity': PERPLEXITY, + 'n_pcs': N_PCS, + 'genes': 'all', + 'variant': VARIANT, + 'files': input_file} +sc.pl.tsne(adata, color=['leiden'], size=25, save='_leiden.pdf', color_map='plasma') +pd.Series(params).to_csv(os.path.join(sc.settings.figdir, 'params.txt')) +adata.uns['joan_cluster_params'] = params +adata.write(os.path.join(sc.settings.figdir, 'adata.h5ad')) +#%% +#%% +#%% +#%% +markers = pd.read_csv(os.path.join(sc.settings.figdir, 'markers.csv'), header=None, names=['gene', 'cluster']) +markers['gene'] = markers['gene'].str.upper() +markers = markers[markers['gene'].isin(adata.var.index.values)] + +markers['title'] = markers['gene'] + '+: ' + markers['cluster'] +markers = markers.set_index('gene') +markers.loc['PTPRC']['title'] = 'PTPRC (CD45)+: Immune Cells' +markers.loc['leiden'] = ['Leiden', 'Clusters'] +#%% +adata = sc.read_h5ad(os.path.join(sc.settings.figdir, 'adata.h5ad')) +params = pd.read_csv(os.path.join(sc.settings.figdir, 'params.txt'), index_col=0).to_dict('dict')['0'] +VARIANT = params['variant'] +#%% +for i, g in markers.iterrows(): + sc.pl.tsne(adata, color=i, + title=g['title'], + color_map='plasma', + size=25, + save='_' + i + '_airway_smoking_2020.pdf', + show=False) + +#%% +sc.tl.rank_genes_groups(adata, 'leiden', method='t-test', n_genes=20) +pd.DataFrame(adata.uns['rank_genes_groups']['names']).to_csv(os.path.join(sc.settings.figdir, 'rank_groups.csv')) +adata.write(os.path.join(sc.settings.figdir, 'adata.h5ad')) + + +#%% +genes = ['ACE2', 'KRT5', 'TP63', 'BCAM', 'NGFR'] +df = adata.to_df() +for g in genes: + print(g) + g_df = df.join(adata.obs['Smoke_status'])[[g, 'Smoke_status']] + g_df = g_df[g_df[g] > 0].pivot(columns='Smoke_status') + g_df.to_csv(os.path.join(sc.settings.figdir, g +'_smoking_status.csv')) + +#%% + +#%% Correlations + +df = adata.to_df() +corr = df.corrwith(df['ACE2']).sort_values(ascending=False) +print('correlations with ACE2') +print(corr.head(20)) +corr.to_csv(os.path.join(sc.settings.figdir, 'ACE2_correlates.csv')) + + +#%% +markers_for_pos_counts = markers.drop('leiden').copy() +markers_for_pos_counts.loc['MKI67'] = ['MKI67', 'MKI67'] +markers_for_pos_counts.loc['PCNA'] = ['PCNA', 'PCNA'] +markers_for_pos_counts.loc['TOP2A'] = ['TOP2A', 'TOP2A'] + +## pos for every marker, # cells +df = adata.to_df() +pos_counts = (df[markers_for_pos_counts.index] > 0).sum() +pos_counts['cell_counts'] = len(df.index) +pos_counts.to_csv(os.path.join(sc.settings.figdir, 'single_positive_counts.csv'), header=['single pos counts']) + +# # double pos w ace2, # +double_pos = (df[markers_for_pos_counts.index] > 0).apply(lambda x: x & (df['ACE2'] > 0)).sum() +double_pos['cell_counts'] = len(df.index) +double_pos.to_csv(os.path.join(sc.settings.figdir, 'double_positive_counts.csv'), header=['double pos counts']) + +#%% +df = adata.to_df() +ace2_muc5ac_double_pos = (df['MUC5AC'] > 0) & (df['ACE2'] > 0) +ace2_muc5ac = df.loc[ace2_muc5ac_double_pos][['ACE2']].join(adata.obs.Smoke_status) +ace2_muc5ac.pivot(columns='Smoke_status').to_csv(os.path.join(sc.settings.figdir, 'ACE2_MUC5AC_double_pos.csv')) + +#%% +cluster_labels = pd.read_csv(os.path.join(sc.settings.figdir, 'key.csv'), header=None, names=['cluster', 'label'], dtype='str') +cluster_labels.loc[cluster_labels['label'].isna(),'label'] = cluster_labels.loc[cluster_labels['label'].isna(), 'cluster'] +cluster_labels_dict = cluster_labels.set_index('cluster')['label'].to_dict() +adata.obs['Cell Types'] = adata.obs.leiden.map(cluster_labels_dict).astype('category') + +sc.pl.tsne(adata, color=['Cell Types'], size=25, save='_manually_labeled.pdf', color_map='plasma', title='Human Trachea') +adata.write(os.path.join(sc.settings.figdir, 'adata.h5ad')) + +sc.tl.rank_genes_groups(adata, 'Cell Types', method='t-test', n_genes=100, key_added='cell_types_gene_groups') +pd.DataFrame(adata.uns['cell_types_gene_groups']['names']).to_csv(os.path.join(sc.settings.figdir, 'manually_labeled_rank_groups_ttest.csv')) + +#%% +adata.obs.groupby('Cell Types')['Smoke_status'].value_counts().to_csv(os.path.join(sc.settings.figdir, 'cell_type_smoker_counts.csv')) +ace2_pos_cell_types = adata.to_df()[(adata.to_df()['ACE2'] > 0)][['ACE2']].join(adata.obs[['Cell Types', 'Smoke_status']]) +ace2_pos_cell_types.groupby(['Cell Types', 'Smoke_status']).count().to_csv(os.path.join(sc.settings.figdir, 'cell_type_ace2_pos_counts.csv')) + +#%% +adata.obs.loc[(adata.to_df()['ACE2'] > 0), 'ace2_pos'] = 'ace2_pos' +adata.obs.loc[(adata.to_df()['ACE2'] <= 0), 'ace2_pos'] = 'ace2_neg' +sc.tl.rank_genes_groups(adata, groupby='ace2_pos', key_added='ACE2_pos') +#%% + +adata.obs['ACE2_goblet'] = np.nan +adata.obs.loc[(adata.obs['Cell Types'] == 'Goblet/Club cells'), 'ACE2_goblet'] = 'Goblet/Club' +adata.obs.loc[(adata.obs['Cell Types'] == 'Ciliated cells'), 'ACE2_goblet'] = 'Ciliated' +adata.obs.loc[(adata.obs['Cell Types'] == 'Basal cells'), 'ACE2_goblet'] = 'Basal' +adata.obs.loc[(adata.to_df()['ACE2'] > 0), 'ACE2_goblet'] = 'ACE2+' + + +adata_ace2_goblet = adata[adata.obs.dropna(subset=['ACE2_goblet']).index].copy() +adata_ace2_goblet.obs['ACE2_goblet'] = adata_ace2_goblet.obs['ACE2_goblet'].astype(pd.CategoricalDtype(ordered=True)) +adata_ace2_goblet.obs['ACE2_goblet'] = adata_ace2_goblet.obs['ACE2_goblet'].cat.reorder_categories(['ACE2+', 'Goblet/Club', 'Ciliated', 'Basal'], ordered=True) + + +heatmap_genes = {'ACE2+ markers': adata.uns['ACE2_pos']['names']['ace2_pos'][1:11], + 'Goblet/Club markers': adata.uns['cell_types_gene_groups']['names']['Goblet/Club cells'][0:10], + 'Ciliated markers': adata.uns['cell_types_gene_groups']['names']['Ciliated cells'][0:10], + 'Basal markers': adata.uns['cell_types_gene_groups']['names']['Basal cells'][0:10], + } + +ax = sc.pl.dotplot(adata_ace2_goblet, groupby='ACE2_goblet', var_names=heatmap_genes, #save='_ACE2_marker_goblet_ciliated.png', + mean_only_expressed=True, var_group_rotation=0.0, show=False, figsize=(12, 2.5)) +all_axes = plt.gcf().get_axes() +all_axes[0].set_ylabel('') + +plt.savefig(os.path.join(sc.settings.figdir, 'dotplot_ace2_marker_goblet_ciliated_basal.pdf'), bbox_inches='tight') + diff --git a/data/cluster-plots/airway-smoking-20200-gse134174/key.csv b/data/cluster-plots/airway-smoking-20200-gse134174/key.csv new file mode 100644 index 0000000..38a2b6e --- /dev/null +++ b/data/cluster-plots/airway-smoking-20200-gse134174/key.csv @@ -0,0 +1,27 @@ +0, +1, +2,Basal cells +3,Proliferating cells +4,Goblet/Club cells +5,Proliferating cells +6,SMG-basal +7,SMG-basal +8,KRT8+ differentiating cells +9, +10, +11,Ciliated cells +12,Goblet/Club cells +13,Basal cells +14, +15,Ciliated cells +16,KRT8+ differentiating cells +17,Basal cells +18,Goblet/Club cells +19,KRT8+ differentiating cells +20,Goblet/Club cells +21,Goblet/Club cells +22,SMG-secretory +23,Basal cells +24, +25,Ionocytes/Neuroendocrine cells +26,Ionocytes/Neuroendocrine cells \ No newline at end of file diff --git a/data/cluster-plots/airway-smoking-20200-gse134174/markers.csv b/data/cluster-plots/airway-smoking-20200-gse134174/markers.csv new file mode 100644 index 0000000..35e0cc1 --- /dev/null +++ b/data/cluster-plots/airway-smoking-20200-gse134174/markers.csv @@ -0,0 +1,80 @@ +ACE2,Coronavirus Receptor +EPCAM,Epithelium +PTPRC,Immune Cells +PDGFRA,Mesenchymal Cells +TMEM100,Endothelium +RTKN2,Alveolar Type 1 +FOXJ1,Ciliated Cells +LAMP3,Alveolar Type 2 +MUC5B,Secretory Cells +MUC5AC,Goblet Cells +SCGB3C1,Secretory Cells +SCGB1A1,Club Cells +GABRP,Secretory Cells +LYPD2,Goblet Cells +CLDN10,Secretory Cells +KRT5,Basal Cells +TRP63,Basal Cells +MKI67,Proliferating Cells +TOP2A,Proliferating Cells +PCNA,Proliferating Cells +DEUP1,Deuterosomal Cells +FOXI1,Ionocytes +CFTR,Ionocytes +CGRP,Neuroendocrine Cells +CHGA,Neuroendocrine Cells +ASCL1,Brush/Tuft Cells +LRMP,Brush/Tuft Cells +COL1A1,Fibroblasts +FBLN1,Fibroblasts +ACTA2,Smooth Muscle Cells +MYH11,Smooth Muscle Cells +DNAH11,Ciliated Cells +TP63,Basal Cells +KRT8,Basal Cells +CAV1,Basal Cells +AGR2,Goblet Cells +BPIFB1,Goblet/Club Cells +SPDEF,Secretory Cells +MUC1,Secretory Cells +MUCL1,Secretory Cells? +KIAA0101,01_p.basal +TYMS,01_p.basal +TK1,01_p.basal +ZWINT,01_p.basal +RRM2,01_p.basal +KRT17,02_d.basal +IL33,02_d.basal +MMP10,02_d.basal +BCAM,02_d.basal +ETS2,02_d.basal +C19orf33,03_KRT8.high +GPRC5A,03_KRT8.high +BHLHE40,03_KRT8.high +TXN,03_KRT8.high +EPAS1,03_KRT8.high +BPIFB1,04_muc.secretory +VMO1,04_muc.secretory +MSMB,04_muc.secretory +S100P,04_muc.secretory +RDH10,04_muc.secretory +FAM216B,05_ciliated +STOML3,05_ciliated +MRLN,05_ciliated +CATSPERD,05_ciliated +ANKRD66,05_ciliated +HEPACAM2,06_rare +PROX1,06_rare +TUBB2B,06_rare +NEURL1,06_rare +PHGR1,06_rare +KRT14,07_SMG.basal +CAV1,07_SMG.basal +FOS,07_SMG.basal +G0S2,07_SMG.basal +NAP1L1,07_SMG.basal +SLPI,08_SMG.secretory +RARRES1,08_SMG.secretory +SCGB3A1,08_SMG.secretory +TCN1,08_SMG.secretory +MMP7,08_SMG.secretory \ No newline at end of file diff --git a/data/cluster-plots/gse1229560_ipf/key.csv b/data/cluster-plots/gse1229560_ipf/key.csv new file mode 100644 index 0000000..7fa3677 --- /dev/null +++ b/data/cluster-plots/gse1229560_ipf/key.csv @@ -0,0 +1,43 @@ +0,T cells +1,Alveolar type 2 +2,Macrophages +3,Macrophages +4,Plasma cells +5,Macrophages +6,Alveolar type 2 +7,Macrophages +8,Macrophages +9,Macrophages +10,Macrophages +11,Ciliated cells +12,Dendritic cells +13,Macrophages +14,Goblet/Club cells +15,Endothelial cells +16,B cells +17,Proliferating cells +18,B cells +19,Alveolar type 2 +20,Macrophages +21,Basal cells +22,B cells +23,Goblet/Club cells +24,Macrophages +25,Mast cells +26,Macrophages +27,Macrophages +28,Macrophages +29,Macrophages +30,Macrophages +31,Mesenchymal cells +32,Alveolar type 2 +33,Smooth muscle cells +34,T cells +35,Endothelial cells +36,Macrophages +37,Macrophages +38,Plasma cells +39,Endothelial cells +40,Macrophages +41,Macrophages +42,Mesenchymal cells \ No newline at end of file diff --git a/data/cluster-plots/healthy-mouse/key.csv b/data/cluster-plots/healthy-mouse/key.csv new file mode 100644 index 0000000..b593a97 --- /dev/null +++ b/data/cluster-plots/healthy-mouse/key.csv @@ -0,0 +1,30 @@ +0,B cells +1,Alveolar type 2 +2,T cells +3,Macrophages +4,NK cells +5,Endothelial cells +6,T cells +7,Monocytes +8,Monocytes +9,Dendritic cells +10,Endothelial cells +11,Fibroblasts +12,Macrophages +13,Alveolar type 1 +14,Goblet/Club cells +15,Endothelial cells +16,B cells +17,Alveolar type 2 +18,Neutrophils +19,Ciliated cells +20,Macrophages +21,Endothelial cells +22,Alveolar type 2 +23,Dendritic cells +24,Alveolar type 2 +25,Monocytes +26,Monocytes +27,Alveolar type 2 +28,Neutrophils +29,Alveolar type 2 \ No newline at end of file diff --git a/data/cluster-plots/healthy-mouse/trackplot_mouse.csv b/data/cluster-plots/healthy-mouse/trackplot_mouse.csv new file mode 100644 index 0000000..617abcd --- /dev/null +++ b/data/cluster-plots/healthy-mouse/trackplot_mouse.csv @@ -0,0 +1,14 @@ +ACE2 +CD3G +CHIL3 +CXCR2 +EPCAM +FOXJ1 +GABRP +GZMA +LAMP3 +MUC5AC +PDGFRA +PTPRC +RTKN2 +TMEM100 \ No newline at end of file diff --git a/healthy-mouse-gse344007.py b/healthy-mouse-gse344007.py index 1c73d73..cfac43a 100755 --- a/healthy-mouse-gse344007.py +++ b/healthy-mouse-gse344007.py @@ -19,6 +19,7 @@ from matplotlib import pyplot as plt import matplotlib as mpl import scipy.sparse +import os #%% Set scanpy and matplotlib settings sc.settings.figdir = 'data/cluster-plots/healthy-mouse/' @@ -48,6 +49,8 @@ adata.var = genes.set_index('name') adata.var_names_make_unique() print('Healthy Mouse: ', adata.X.max()) +#%% +sc.pp.normalize_total(adata, target_sum=1e6) sc.pp.log1p(adata, base=2) #%% Exploratory plots @@ -65,15 +68,32 @@ def exploratory_plots(adata): plt.savefig('data/cluster-plots/healthy-mouse/healthy_mouse_genes_cell_exploratory.pdf') #%% Perform Clustering +LEARNING_RATE = 1000 +EARLY_EXAGGERATION = 12 +RESOLUTION = 1 +PERPLEXITY=30 +N_PCS = 50 sc.pp.filter_cells(adata, min_genes=500) -sc.pp.highly_variable_genes(adata, min_mean=0.0, max_mean=13, min_disp=0.5) - -sc.tl.pca(adata, zero_center=False) -sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30) -sc.tl.tsne(adata, perplexity=50) -sc.tl.leiden(adata) - +sc.pp.highly_variable_genes(adata) + +sc.tl.pca(adata) +sc.pp.neighbors(adata) +sc.tl.tsne(adata, learning_rate=LEARNING_RATE, n_jobs=8, early_exaggeration=EARLY_EXAGGERATION, perplexity=PERPLEXITY, n_pcs=N_PCS) +sc.tl.leiden(adata, resolution=RESOLUTION) +sc.pl.tsne(adata, color='leiden') + +params = {'learning_rate': LEARNING_RATE, + 'early_exaggeration':EARLY_EXAGGERATION, + 'resolution': RESOLUTION, + 'perplexity': PERPLEXITY, + 'n_pcs': N_PCS, + 'genes': 'all' + } +sc.pl.tsne(adata, color=['leiden'], size=25, save='_leiden.pdf', color_map='plasma') +pd.Series(params).to_csv(os.path.join(sc.settings.figdir, 'params.txt')) +adata.uns['joan_cluster_params'] = params +adata.write(os.path.join(sc.settings.figdir, 'adata.h5ad')) #%% Plot clusters and clusters with highlighted genes markers = pd.read_csv('data/highlighted_genes.csv', header=None, names=['gene', 'cluster']) @@ -97,42 +117,50 @@ def exploratory_plots(adata): size=25, show=False) ax.legend().remove() -plt.savefig('data/cluster-plots/healthy-mouse/leiden_healthy_mouse_gse3440071_markers_500gene.pdf') +plt.savefig(os.path.join(sc.settings.figdir, 'leiden_healthy_mouse_gse3440071.pdf')) for i, g in markers.iterrows(): sc.pl.tsne(adata, color=i, title=g['title'], color_map='plasma', - size=25, + + size=25, save='_' + i + '_healthy_mouse_gse3440071_markers_500gene.pdf', show=False) + +#%% +adata = sc.read_h5ad(os.path.join(sc.settings.figdir, 'adata.h5ad')) #%% Collect top-ranked genes for each group -sc.tl.rank_genes_groups(adata, groupby='leiden', n_genes=10) -pd.DataFrame(adata.uns['rank_genes_groups']['names']).to_csv('data/cluster-plots/healthy-mouse/healthy_mouse_groups.csv') +sc.tl.rank_genes_groups(adata, groupby='leiden', n_genes=20) +pd.DataFrame(adata.uns['rank_genes_groups']['names']).to_csv(os.path.join(sc.settings.figdir, 'healthy_mouse_groups.csv')) +adata.write(os.path.join(sc.settings.figdir, 'adata.h5ad')) #%% Calculate correlations with ACE2 df = adata.to_df() corr = df.corrwith(df['ACE2']).sort_values(ascending=False) -corr.to_csv('data/cluster-plots/healthy-mouse/ACE2_correlates_healthy_mouse_GSM3440071.csv') +corr.to_csv(os.path.join(sc.settings.figdir, 'ACE2_correlates_healthy_mouse_GSM3440071.csv')) + -#%% Stacked Violin Plot of labeled clusters and markers +#%% Cluster 14 and ACE2 pos -violin_markers = pd.read_csv('data/violin_markers_mouse.csv', header=None) -violin_markers[0] = violin_markers[0].str.upper() -violin_markers = violin_markers[violin_markers[0].isin(genes['name'].values)] +adata.obs.loc[adata.obs['leiden'] == '14','ace2_goblet'] = 'goblet' +adata.obs.loc[adata.to_df()['ACE2'] > 0, 'ace2_goblet'] = 'ace2_pos' +sc.tl.rank_genes_groups(adata, groupby='ace2_goblet', key_added='ACE2_goblet', groups=['ace2_pos', 'goblet']) +pd.DataFrame(adata.uns['ACE2_goblet']['names']).to_csv(os.path.join(sc.settings.figdir, 'ACE2_pos_v_goblet_rankings.csv')) -cluster_key = pd.read_csv('data/healthy_mouse_cluster_key.csv', header=None, names=['cluster', 'cell_type']) +#%% Trackplot Plot of labeled clusters and markers +cluster_key = pd.read_csv(os.path.join(sc.settings.figdir, 'key.csv'), header=None, names=['cluster', 'cell_type']) cluster_key['cluster'] = cluster_key['cluster'].astype(str) cluster_key_map = cluster_key.set_index('cluster')['cell_type'].to_dict() adata.obs['Cell Types'] = adata.obs['leiden'].map(cluster_key_map, na_action='ignore').astype('category') +sc.pl.tsne(adata, color=['Cell Types'], size=25, save='_leiden.pdf', color_map='plasma', title='Mouse Lung') - -axes_list = sc.pl.stacked_violin(adata, var_names=violin_markers[0].values, groupby='Cell Types', show=False, swap_axes=True) +trackplot_markers = pd.read_csv(os.path.join(sc.settings.figdir, 'trackplot_mouse.csv'), header=None) +axes_list = sc.pl.tracksplot(adata, var_names=trackplot_markers[0].values, groupby='Cell Types', figsize=(18,3)) [i.yaxis.set_ticks([]) for i in axes_list] ax = plt.gca() -ax.get_xaxis().set_label_text('') -ax.figure.set_size_inches(5, 6) -plt.savefig('data/cluster-plots/healthy-mouse/cell_type_stacked_violin2.pdf', bbox_inches='tight') +ax.set_xlabel('') +plt.savefig(os.path.join(sc.settings.figdir, 'tracksplot_cell_types.pdf'), bbox_inches='tight') diff --git a/human-ipf-gse122960.py b/human-ipf-gse122960.py new file mode 100755 index 0000000..802c974 --- /dev/null +++ b/human-ipf-gse122960.py @@ -0,0 +1,233 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed Apr 25 2020 + +@author: Joan Smith + +https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE122960 +""" +#%% +import scipy.sparse as sp_sparse +import tables +import pandas as pd +import os +import glob +import scanpy as sc +from matplotlib import pyplot as plt +import matplotlib as mpl +import numpy as np + +#%% + +sc.settings.figdir = 'data/cluster-plots/gse1229560_ipf/' +sc.settings.verbosity = 4 + +mpl.rcParams.update(mpl.rcParamsDefault) +mpl.rcParams.update({ + 'font.sans-serif': 'Arial', + 'font.family': 'sans-serif', + 'axes.titlesize': 18, + 'axes.labelsize': 14, + }) + + +#%% +def get_series_from_h5(filename): + with tables.open_file(filename, 'r') as f: + mat_group = f.get_node(f.root, 'GRCh38') + barcodes = f.get_node(mat_group, 'barcodes').read().astype(str) + gene_names = f.get_node(mat_group, 'gene_names').read().astype(str) + data = getattr(mat_group, 'data').read() + indices = getattr(mat_group, 'indices').read() + indptr = getattr(mat_group, 'indptr').read() + shape = getattr(mat_group, 'shape').read() + matrix = sp_sparse.csc_matrix((data, indices, indptr), shape=shape) + return matrix, barcodes, gene_names.tolist() + +#%% +p = 'data/raw-data/GSE122960_RAW/*filtered_*.h5' +dfs = [] +genes = [] +cells = [] +donors = [] +globs = [i for i in glob.glob(p) if 'IPF' in i or 'Cryobiopsy' in i] +for f in globs: + print(f) + matrix, barcodes, gene_names = get_series_from_h5(f) + donor_id = '_'.join(f.split('/')[-1].split('_')[0:3]) + dfs.append(matrix) + cells.append(barcodes) + donors.append([donor_id]*barcodes.shape[0]) + genes.append(gene_names) +#%% +#Verifies all gene names are present, in the same order, for each matrix + +def verify_genes(gene_names): + gene_names_df = pd.DataFrame(genes) + assert (np.array([gene_names_df[i].value_counts().iloc[0] for i in gene_names_df]) < len(gene_names)).sum() == 0 +verify_genes(genes) + + +#%% Smoker v. Non + +#Never = 0, Former = 1, Active = 2 +smoker_v_non_donor_table = { + 'Donor_01': 0, + 'Donor_02': 1, + 'Donor_03': 0, + 'Donor_04': 0, + 'Donor_05': 2, + 'Donor_06': 0, + 'Donor_07': 2, + 'Donor_08': 0 } +smoker_v_non_disease_table = { + 'IPF_01': 1, + 'IPF_02': 0, + 'IPF_03': 1, + 'IPF_04': 0, + 'HP_01': 0, + 'SSc-ILD_01': 0, + 'SSc-ILD_02': 0, + 'Myositis-ILD_01': 0, + 'Cryobiopsy_01': 1 + } + +#%% Collect into AnnData object + +adata = sc.AnnData(sp_sparse.hstack(dfs).T) +adata.var = pd.DataFrame(genes[0], index=genes[0], columns=['name']) +adata.var_names_make_unique() +obs = pd.DataFrame({'barcodes': np.hstack(cells), 'donor_id': np.hstack(donors)}) +obs['donor-barcode'] = obs['barcodes'] + '_' + obs['donor_id'] +obs = obs.set_index('donor-barcode') + +adata.obs = obs +adata.obs['donor'] = adata.obs.donor_id.str.split('_', n=1, expand=True)[1] +#%% +adata.obs['smoker'] = adata.obs.donor.map(smoker_v_non_donor_table) +active_smokers = adata[adata.obs['smoker'] == 2].copy() +never_smokers = adata[adata.obs['smoker'] == 0].copy() + +#%% Normalize data +sc.pp.normalize_total(adata, target_sum=1e6) +sc.pp.log1p(adata, base=2) + +#%% +def exploratory_plots(adata): + + # Check normalization + num_non_int = (adata.to_df().applymap(float.is_integer) == False).sum().sum() + print('Num non-int: ', num_non_int) + + plt.figure() + sc.pp.filter_cells(adata, min_genes=0) + sc.pp.filter_genes(adata, min_cells=0) + + plt.hist(adata.obs.n_genes, bins=500) + plt.title('IPF per cell') + print('Min:', adata.obs['n_genes'].min()) + + minimum = adata.obs['n_genes'].min() + maximum = adata.obs['n_genes'].max() + print('Max:', maximum) + plt.xlabel('# Genes. Min:' + str(minimum)) + plt.ylabel('# Cells') + + plt.figure() + plt.hist(adata.var.n_cells, bins=500) + plt.title('IPF per gene') + print('Min:', adata.var['n_cells'].min()) + + sc.pl.pca_variance_ratio(adata, log=True) + +exploratory_plots(adata) + +#%% +sc.pp.filter_cells(adata, min_genes=500) +sc.pp.highly_variable_genes(adata) + +sc.tl.pca(adata) +sc.pp.neighbors(adata) +#%% +LEARNING_RATE = 1000 +EARLY_EXAGGERATION = 12 +RESOLUTION = 1.25 +PERPLEXITY=130 + +sc.tl.tsne(adata, learning_rate=LEARNING_RATE, n_jobs=8, early_exaggeration=EARLY_EXAGGERATION, perplexity=PERPLEXITY) +sc.tl.leiden(adata, resolution=RESOLUTION) + +params = {'learning_rate': LEARNING_RATE, + 'early_exaggeration':EARLY_EXAGGERATION, + 'resolution': RESOLUTION, + 'perplexity': PERPLEXITY, + 'genes': 'all', + 'files': globs} +pd.Series(params).to_csv(os.path.join(sc.settings.figdir, 'params.txt')) +adata.write(os.path.join(sc.settings.figdir, 'adata.h5ad')) + +#%% +adata = sc.read_h5ad(os.path.join(sc.settings.figdir, 'adata.h5ad')) +params = pd.read_csv(os.path.join(sc.settings.figdir, 'params.txt'), index_col=0).to_dict('dict')['0'] + +#%% +markers = pd.read_csv('data/highlighted_genes.csv', header=None, names=['gene', 'cluster']) +markers['gene'] = markers['gene'].str.upper() +markers = markers[markers['gene'].isin(gene_names)] + +markers['title'] = markers['gene'] + '+: ' + markers['cluster'] +markers = markers.set_index('gene') +markers.loc['PTPRC']['title'] = 'PTPRC (CD45)+: Immune Cells' +markers.loc['leiden'] = ['Leiden', 'Clusters'] + +addl_genes = pd.read_csv('data/additional_ipf_genes.csv', header=None) +addl_genes['title'] = addl_genes[0] +addl_genes = addl_genes.set_index(0) +markers = markers.append(addl_genes) + +#%% +for i, g in markers.iterrows(): + sc.pl.tsne(adata, color=i, + title=g['title'], + color_map='plasma', + size=25, + save='_' + i + '_all.pdf', + show=False) + +#%% +sc.tl.rank_genes_groups(adata, 'leiden', method='t-test', n_genes=20) +pd.DataFrame(adata.uns['rank_genes_groups']['names']).to_csv(os.path.join(sc.settings.figdir, 'rank_groups.csv')) +adata.write(os.path.join(sc.settings.figdir, 'adata.h5ad')) + +#%% Correlations + +df = adata.to_df() +corr = df.corrwith(df['ACE2']).sort_values(ascending=False) +print('correlations with ACE2') +print(corr.head(20)) +corr.to_csv(os.path.join(sc.settings.figdir, 'ACE2_correlates.csv')) + +#%% +cluster_labels = pd.read_csv(os.path.join(sc.settings.figdir, 'key.csv'), header=None, names=['cluster', 'label'], dtype='str') +cluster_labels.loc[cluster_labels['label'].isna(),'label'] = cluster_labels.loc[cluster_labels['label'].isna(), 'cluster'] +cluster_labels_dict = cluster_labels.set_index('cluster')['label'].to_dict() +adata.obs['Cell Types'] = adata.obs.leiden.map(cluster_labels_dict).astype('category') + +ax = sc.pl.tsne(adata, color=['Cell Types'], + size=25, + title='Human Lung', + save='_' + 'labeled_clusters.pdf') +adata.write(os.path.join(sc.settings.figdir, 'adata.h5ad')) +#%% +sc.tl.rank_genes_groups(adata, 'Cell Types', method='t-test', n_genes=20) +pd.DataFrame(adata.uns['rank_genes_groups']['names']).to_csv(os.path.join(sc.settings.figdir, 'labeled_clusters_rank_groups.csv')) +adata.write(os.path.join(sc.settings.figdir, 'adata.h5ad')) + +#%% +trackplot_markers = pd.read_csv(os.path.join(sc.settings.figdir, 'trackplot_human.csv'), header=None) +axes_list = sc.pl.tracksplot(adata, var_names=trackplot_markers[0].values, groupby='Cell Types', figsize=(18,3)) +[i.yaxis.set_ticks([]) for i in axes_list] +ax = plt.gca() +ax.set_xlabel('') +plt.savefig(os.path.join(sc.settings.figdir, 'tracksplot_cell_types.pdf'), bbox_inches='tight') diff --git a/human-smoker-v-non-gse131391.py b/human-smoker-v-non-gse131391.py index f760944..5811bb5 100755 --- a/human-smoker-v-non-gse131391.py +++ b/human-smoker-v-non-gse131391.py @@ -13,6 +13,8 @@ import scanpy as sc import matplotlib as mpl +import os + #%% Set scanpy and matplotlib settings sc.settings.figdir = 'data/cluster-plots/human-smoker-non/' sc.settings.verbosity = 4 @@ -43,7 +45,11 @@ cell_counts = annotation.join(cell_counts, how='inner').set_index('gene_name') cell_counts = cell_counts.drop('chr', axis=1) + +#%% adata = sc.AnnData(cell_counts.T) +sc.pp.filter_cells(adata, min_genes=500) +sc.pp.normalize_total(adata, target_sum=1e6) sc.pp.log1p(adata, base=2) #%% Exploratory plots @@ -57,37 +63,7 @@ def exploratory_plots(adata): minimum = adata.obs['n_genes'].min() plt.xlabel('# Genes. Min:' + str(minimum)) plt.ylabel('# Cells') - plt.savefig('data/cluster-plots/human-smoker-non/human_smoker_v_non_genes_cell.pdf') - - - -#%%% non-smokers clustering - -non_smokers_df = cell_counts.filter(regex="Never.*", axis=1) -non_smokers = sc.AnnData(non_smokers_df.T) -sc.pp.log1p(non_smokers, base=2) - -sc.pp.filter_cells(non_smokers, min_genes=500) -sc.pp.highly_variable_genes(non_smokers, min_mean=0.0125, max_mean=6, min_disp=0.5) - -sc.tl.pca(non_smokers) -sc.pp.neighbors(non_smokers, n_neighbors=10, n_pcs=30) -sc.tl.tsne(non_smokers) -sc.tl.leiden(non_smokers) - -#%% smokers clustering - -smokers_df = cell_counts.filter(regex="Current.*", axis=1) -smokers = sc.AnnData(smokers_df.T) -sc.pp.log1p(smokers, base=2) - -sc.pp.filter_cells(non_smokers, min_genes=500) -sc.pp.highly_variable_genes(smokers, min_mean=0.0125, max_mean=6, min_disp=0.5) - -sc.tl.pca(smokers) -sc.pp.neighbors(smokers, n_neighbors=10, n_pcs=30) -sc.tl.tsne(smokers) -sc.tl.leiden(smokers) + plt.savefig(os.path.join(sc.settings.figdir, 'human_smoker_v_non_genes_cell.pdf') #%% Clustering all data together @@ -109,51 +85,6 @@ def exploratory_plots(adata): markers = markers.set_index('gene') markers.loc['PTPRC']['title'] = 'PTPRC (CD45)+: Immune Cells' -#%% Plot nonsmoker clusters with higlighted genes - -ax = sc.pl.tsne(non_smokers, color=['leiden'], - title=['Human Never-Smokers'], - color_map='plasma', - save='_labeled_leiden_non_smoker_human_smoker_v_non_gse131391_potential_markers_500gene.pdf', - show=False) - -ax = sc.pl.tsne(non_smokers, color=['leiden'], - title=['Human Never-Smokers'], - color_map='plasma', - show=False) -ax.legend().remove() -plt.savefig('data/cluster-plots/human-smoker-non/leiden_tsne_non_smoker_human_smoker_v_non_gse131391_potential_markers_500gene.pdf') - -for i, g in markers.iterrows(): - axes_list = sc.pl.tsne(non_smokers, color=i, - title=g['title'], - color_map='plasma', - save='_' + i + '_non_smoker_human_smoker_v_non_gse131391_potential_markers_500gene.pdf', - show=False) - - -#%% Plot smoker cluster with highlighted genes - -ax = sc.pl.tsne(smokers, color=['leiden'], - title=['Human Smokers'], - color_map='plasma', - save='_labeled_leiden_smoker_human_smoker_v_non_gse131391_potential_markers_500gene.pdf', - show=False) - -ax = sc.pl.tsne(smokers, color=['leiden'], - title=['Human Smokers'], - color_map='plasma', - show=False) -ax.legend().remove() -plt.savefig('data/cluster-plots/human-smoker-non/leiden_tsne_smoker_human_smoker_v_non_gse131391_potential_markers_500gene.pdf') - -for i, g in markers.iterrows(): - axes_list = sc.pl.tsne(smokers, color=i, - title=g['title'], - color_map='plasma', - save='_' + i + '_smoker_human_smoker_v_non_gse131391_potential_markers_500gene.pdf', - show=False) - #%% Plot combined smokers+non-smokers with highlighted genes ax = sc.pl.tsne(adata, color=['leiden'], @@ -167,7 +98,8 @@ def exploratory_plots(adata): color_map='plasma', show=False) ax.legend().remove() -plt.savefig('data/cluster-plots/human-smoker-non/leiden_tsne_both_human_smoker_v_non_gse131391_potential_markers_500gene.pdf') + +plt.savefig(os.path.join(sc.settings.figdir, 'leiden_tsne_both_human_smoker_v_non_gse131391_potential_markers_500gene.pdf')) for i, g in markers.iterrows(): axes_list = sc.pl.tsne(adata, color=i, @@ -178,22 +110,6 @@ def exploratory_plots(adata): #%% Collect top ranked genes for each combined group sc.tl.rank_genes_groups(adata, groupby='leiden', n_genes=10) -pd.DataFrame(adata.uns['rank_genes_groups']['names']).to_csv('data/cluster-plots/human-smoker-non/human_smoker_v_non_both_ranked_gene_groups.csv') - -#%% - -violin_markers = pd.read_csv('data/violin_markers_human.csv', header=None) -violin_markers[0] = violin_markers[0].str.upper() -violin_markers = violin_markers[violin_markers[0].isin(adata.var.index.values)] - -cluster_key = pd.read_csv('data/combined_human_cluster_key.csv', header=None, names=['cluster', 'cell_type']) -cluster_key['cluster'] = cluster_key['cluster'].astype(str) -cluster_key_map = cluster_key.set_index('cluster')['cell_type'].to_dict() +pd.DataFrame(adata.uns['rank_genes_groups']['names']).to_csv(os.path.join(sc.settings.figdir, 'human_smoker_v_non_both_ranked_gene_groups.csv')) -adata.obs['Cell Types'] = adata.obs['leiden'].map(cluster_key_map, na_action='ignore').astype('category') -axes_list = sc.pl.stacked_violin(adata, var_names=violin_markers[0].values, groupby='Cell Types', swap_axes=True, show=False, ) -[i.yaxis.set_ticks([]) for i in axes_list] -ax = plt.gca() -ax.get_xaxis().set_label_text('') -plt.savefig('data/cluster-plots/human-smoker-non/cell_type_stacked_violin_human_markers_cluster.pdf', bbox_inches='tight', quality=95) diff --git a/probes-to-genes-human-smoking.py b/probes-to-genes-human-smoking.py new file mode 100755 index 0000000..62116bf --- /dev/null +++ b/probes-to-genes-human-smoking.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Thu Apr 16 18:57:27 2020 + +@author: joan +""" + +import collections +import scipy.sparse as sp_sparse +import tables +import pandas as pd +import os +import glob +import scanpy as sc +from matplotlib import pyplot as plt +import matplotlib as mpl +import numpy as np +import sys + +#%% + +sc.settings.figdir = 'data/Data/Human smoking/Volcano plots' +sc.settings.verbosity = 4 + +mpl.rcParams.update(mpl.rcParamsDefault) +mpl.rcParams.update({ + 'font.sans-serif': 'Arial', + 'font.family': 'sans-serif', + 'axes.titlesize': 18, + 'axes.labelsize': 14, + }) + +#%% +gpl570_files = ['GSE13933_trachea.csv', 'gse64614_SAE.csv', 'gse22047_large.csv'] + +gpl570 = pd.read_csv(os.path.join(sc.settings.figdir, 'GPL570.txt'),index_col=0, sep='\t', comment='#', dtype='str') +gpl570 = gpl570[['Gene Symbol']] +gpl570 = gpl570.dropna(how='any') +gpl570['Gene Symbol'] = '\'' + gpl570['Gene Symbol'] + +for i in gpl570_files: + f = pd.read_csv(os.path.join(sc.settings.figdir, i), index_col=0, comment='!') + mean_centered = f.apply(lambda x: x-x.mean(), axis=1) + probe_and_symbol = mean_centered.join(gpl570) + mapped = probe_and_symbol.groupby('Gene Symbol').agg(np.mean) + mapped.to_csv(os.path.join(sc.settings.figdir, 'genes', i)) +#%% + +gene_names = pd.read_csv('data/raw-data/Homo_sapiens.GRCh38.99.gtf', sep='\t', comment='#', header=None, dtype='str') + +gene_names.columns = ['chr', 'annotation_src', 'feature_type', 'start', 'end', 'score', 'genomic_strand', 'genomic_phase', 'extra'] +gene_names['gene_name'] = gene_names['extra'].str.extract(pat='gene_name "(.*?)";') +gene_names['gene_id'] = gene_names['extra'].str.extract(pat='gene_id "(.*?)";') +gene_names['transcript_type'] = gene_names['extra'].str.extract(pat='transcript_type "(.*?)";') +gene_names['gene_biotype'] = gene_names['extra'].str.extract(pat='gene_biotype "(.*?)";') +gene_names['transcript_biotype'] = gene_names['extra'].str.extract(pat='transcript_biotype "(.*?)";') +gene_names['gene_name'] = '\'' + gene_names['gene_name'] +ensembl_annotation = gene_names[['gene_id', 'gene_name']].set_index('gene_id') + +f = pd.read_csv(os.path.join(sc.settings.figdir, 'gse79209.csv'), index_col=0, comment='!') +mean_centered = f.apply(lambda x: x-x.mean(), axis=1) +probe_and_symbol = mean_centered.join(ensembl_annotation) +mapped = probe_and_symbol.groupby('gene_name').agg(np.mean) +mapped.to_csv(os.path.join(sc.settings.figdir, 'genes', 'GSE79209.csv')) + +#%% +f = pd.read_csv(os.path.join(sc.settings.figdir, 'gse135188.csv'), index_col=0, comment='!') +mean_centered = f.apply(lambda x: x-x.mean(), axis=1) +probe_and_symbol = mean_centered.join(ensembl_annotation) +mapped = probe_and_symbol.groupby('gene_name').agg(np.mean) +mapped.to_csv(os.path.join(sc.settings.figdir, 'genes', 'GSE135188.csv')) diff --git a/protein-rna-correlations.py b/protein-rna-correlations.py new file mode 100755 index 0000000..f1afc37 --- /dev/null +++ b/protein-rna-correlations.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Fri Apr 24 20:27:33 2020 + +@author: Joan Smith +""" + +#%% + +import pandas as pd +import numpy as np +import os +from matplotlib import pyplot as plt + +# Column A: Gene +# Column B: Pearson Corr Coeff +# Column C: # of cell lines + +DIRECTORY = 'data/Data/RNA-protein correlation' +#%% +protein = pd.read_csv(os.path.join(DIRECTORY, 'protein_quant_current_normalized.csv')) +rna = pd.read_csv(os.path.join(DIRECTORY, 'CCLE_expression.csv'), index_col=0) +cell_line_key = pd.read_csv(os.path.join(DIRECTORY, 'sample_info (3).csv'), index_col=0) +#%% +rna = rna.join(cell_line_key[['CCLE_Name']], how='inner').set_index('CCLE_Name') +rna.columns = [i.split(' ')[0] for i in rna.columns] +#%% +protein = protein.set_index('Gene_Symbol') +protein = protein.drop(['SW948_LARGE_INTESTINE_TenPx11', 'CAL120_BREAST_TenPx02', 'HCT15_LARGE_INTESTINE_TenPx30'], axis=1) +protein.columns = pd.Series(protein.columns).str.rsplit('_', n=1, expand=True)[0] +#%% +overlapping_cell_lines = set(protein.columns) & set(rna.index.values) + +rna = rna.loc[overlapping_cell_lines] +protein = protein[overlapping_cell_lines].T +#%% +def corr(protein_series): + df = pd.DataFrame([protein_series, rna[protein_series.name]]).T + df = df.dropna(how='any') + c = df.corr().iloc[0,1] + count = df.shape[0] + return pd.Series({'corr': c, 'count': count}) + +gene_overlap = set(rna.columns) & set(protein.columns) +protein = protein[gene_overlap] +rna = rna[gene_overlap] + +corrs = protein.apply(corr) +#%% +corrs.T.to_csv(os.path.join(DIRECTORY, 'correlation_output.csv')) + diff --git a/smoking-v-non-regressions.py b/smoking-v-non-regressions.py new file mode 100644 index 0000000..0077d87 --- /dev/null +++ b/smoking-v-non-regressions.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Sun Mar 22 15:43:34 2020 + +@author: Joan Smith + +Smoking v. non regressions +""" + +#%% + +import pandas as pd +from matplotlib import pyplot as plt +import statsmodels.formula.api as smf + +#%% +def regress(data, cols, plot=False): + data = data.dropna(how='any', subset=cols) + + formula = 'ACE2 ~ ' + '+'.join(cols) + model = smf.ols(formula, data=data) + results = model.fit() + + if len(cols) == 1 and plot: + plt.figure() + plt.title(formula) + plt.plot(data[cols[0]], data['ACE2'], 'o') + plt.plot(data[cols[0]], model.predict(results.params), color='red') + plt.xlabel(cols[0]) + plt.ylabel('ACE2 expression') + + + df = pd.DataFrame({'p': results.pvalues, 'se': results.bse, 'beta': results.params}) + df.index = formula + ': ' + df.index + print(df) + return df + +#%% + +gse76925_ace2 = pd.read_csv("data/Data/Human smoking - packyears/pack-years.csv") +gse76925_ace2.loc[gse76925_ace2['Race'] == 'C', 'Race'] = 0 +gse76925_ace2.loc[gse76925_ace2['Race'] != 0, 'Race'] = 1 +gse76925_ace2['Sex'] = gse76925_ace2['Sex'].replace({'M': 1, 'F': 0}) + +sex_regress = regress(gse76925_ace2, ['Sex']) +race_regress = regress(gse76925_ace2, ['Race']) +age_regress = regress(gse76925_ace2, ['Age']) +bmi_regress = regress(gse76925_ace2, ['BMI']) +copd_regress = regress(gse76925_ace2, ['COPD']) +packyears_regress = regress(gse76925_ace2, ['Packyears']) + +pack_age_sex_regress = regress(gse76925_ace2, ['Packyears', 'Age', 'Sex']) +pack_age_sex_bmi_regress = regress(gse76925_ace2, ['Packyears', 'Age', 'Sex', 'BMI']) +pack_age_sex_bmi_copd_regress = regress(gse76925_ace2, ['Packyears', 'Age', 'Sex', 'BMI', 'COPD']) + +pack_age_sex_bmi_race_regress = regress(gse76925_ace2, ['Packyears', 'Age', 'Sex', 'BMI', 'Race']) +pack_age_bmi_race_regress = regress(gse76925_ace2, ['Packyears', 'Age', 'BMI', 'Race']) + + +output = pd.concat((sex_regress, race_regress, age_regress, bmi_regress, copd_regress, packyears_regress, + pack_age_sex_regress, pack_age_sex_bmi_regress, pack_age_sex_bmi_copd_regress, + pack_age_sex_bmi_race_regress, pack_age_bmi_race_regress)) +output.to_csv('data/gse76925_regressions.csv') + +#%% + +trachea = pd.read_csv('data/Data/Regressions/gse13933_trachea.csv') +trachea['Sex'] = trachea['Sex'].replace({'M': 1, 'F': 0}) +trachea.loc[trachea['Race'] == 'white', 'Race'] = 0 +trachea.loc[trachea['Race'] != 0, 'Race'] = 1 +trachea['Race'] = trachea['Race'].astype('int') + +regressions = [] +for i in trachea: + r = regress(trachea, [i]) + regressions.append(r) + +regressions.append(regress(trachea, ['Age', 'Sex', 'Race', 'Smoker'])) +df = pd.concat(regressions) +df.to_csv('data/Data/Regressions/gse139333_trachea_regressions.csv') \ No newline at end of file diff --git a/tpm-mouse-normalization.py b/tpm-mouse-normalization.py new file mode 100755 index 0000000..8d89fe6 --- /dev/null +++ b/tpm-mouse-normalization.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Fri Apr 10 15:24:32 2020 + +@author: Joan Smith +""" + +import scanpy as sc +import pandas as pd + +#%% + +gene_names = pd.read_csv('data/raw-data/Mus_musculus.GRCm38.99.chr.gtf', sep='\t', comment='#', header=None) +gene_names.columns = ['chr', 'annotation_src', 'feature_type', 'start', 'end', 'score', 'genomic_strand', 'genomic_phase', 'extra'] +gene_names['gene_name'] = gene_names['extra'].str.extract(pat='gene_name "(.*?)";') +gene_names['gene_id'] = gene_names['extra'].str.extract(pat='gene_id "(.*?)";') +gene_names['transcript_id'] = gene_names['extra'].str.extract(pat='transcript_id "(.*?)";') + +gene_names = gene_names[gene_names['feature_type'] == 'gene'] +annotation = gene_names[['gene_id', 'gene_name', 'chr']].groupby('gene_name').head(1) +annotation = annotation.join(gene_transcript_lens, on='gene_id') +annotation = annotation.set_index('gene_id') + +#%%% GSE 75715 +counts = pd.read_excel('data/raw-data/GSE75715_DE_results_RNA-seq.xlsx', header=[1, 2, 3], index_col=0) +counts = counts.set_index(counts.columns[0], append=True) +counts.index = counts.index.rename(['gene_id', 'gene_name']) +counts.columns = counts.columns.rename(['Day', 'Genetic Background', 'Replicate']) + +adata = sc.AnnData(counts.T) +sc.pp.normalize_total(adata, target_sum=1e6) +adata.to_df().T.to_csv('data/raw-data/GSE74715_cpm.csv') + +#%% GSE 1302040 + +annotation = annotation.reset_index().set_index('gene_name') +gse1302040_counts = pd.read_csv('data/raw-data/GSE132040/GSE132040_190214_A00111_0269_AHH3J3DSXX_190214_A00111_0270_BHHMFWDSXX (4).csv', index_col=0) + +adata = sc.AnnData(gse1302040_counts.T) +sc.pp.normalize_total(adata, target_sum=1e6) + +adata.to_df().T.to_csv('data/raw-data/GSE132040/GSE132040_cpm.csv')