-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathexpression.csv.py
99 lines (84 loc) · 2.71 KB
/
expression.csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import string, sys, os
import csv
from itertools import izip
import uuid
def parse_gene(genes_file):
probeID_gene_mapping = {}
fin = open(genes_file, 'r')
fin.readline()
for line in fin.readlines():
data = line[:-1].split(',') # HCA genes csv
probe = data[0]
gene = data[1]
probeID_gene_mapping[probe] = gene
fin.close()
return probeID_gene_mapping
def parse(mappingfile):
mapping = {}
fin = open(mappingfile, 'r')
while 1:
line = fin.readline()
if line == '':
break
cell_int, cellkey = line.split()
mapping[cellkey] = cell_int
fin.close()
return mapping
def process (infile, outfile, input_delimiter, k, total_cell, probeID_gene_mapping):
fout=open(outfile,'w')
fout.close()
fout=open(outfile,'a')
# read header count gene number
fin=open(infile,'rU')
line = fin.readline()
totalN = len(string.split(line, input_delimiter))
fin.close()
count = 0
tmpdir = str(uuid.uuid4())
os.system("mkdir " + tmpdir)
#first line in output file because mapping is generated using the exp file, it is just sequential nubmers
fout.write('xena_cell_id')
for i in range(1, total_cell + 1):
fout.write('\t' + str(i))
fout.write('\n')
# rest columns
for i in range (1, totalN, k):
count = count + 1
print (count)
tmpinfile = tmpdir + "/" + str(count)
start = i + 1 #linux cut
command = "cut -d , -f " + str(start) + "-" + str(start+k-1) +" " + infile + " > " + tmpinfile
os.system(command)
fin=open(tmpinfile,'rU')
a = izip(*csv.reader(fin, delimiter = input_delimiter))
fin.close()
#switch gene ids in a
for row in a:
probeID = row[0]
try:
gene = probeID_gene_mapping[probeID]
except:
print (probeID)
gene = probeID
fout.write(gene+'\t')
csv.writer(fout, delimiter="\t", lineterminator="\n").writerow(row[1:])
os.system("rm -rf " + tmpinfile)
fout.close()
os.system("rm -rf " + tmpdir)
# transpose
# switch ensemble to gene
if len(sys.argv[:])!= 3:
print ("python expression.csv.py dataDir chunk_size(e.g.200 larger the file, smaller the size)")
sys.exit()
dir = sys.argv[1]
infile = dir + '/expression.csv'
genes_file = dir + '/genes.csv'
outfile= dir + '/expression.tsv'
mappingfile= dir + '/mapping'
mapping = parse(mappingfile)
total_cell = len(mapping)
k = int(sys.argv[2])
print (total_cell)
probeID_gene_mapping = parse_gene(genes_file)
input_delimiter = ',' # HCA matrix csv
process(infile, outfile, input_delimiter, k, total_cell, probeID_gene_mapping)