Skip to content

Commit

Permalink
verified Counter was better for IUPAC
Browse files Browse the repository at this point in the history
  • Loading branch information
Bardia-Masudy committed Jan 22, 2023
1 parent 5ca6c78 commit 0ceef4b
Showing 1 changed file with 49 additions and 35 deletions.
84 changes: 49 additions & 35 deletions testenv.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,9 @@
"cells": [
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.3333333333333333\n",
"0.3611111111111111\n"
]
}
],
"outputs": [],
"source": [
"import re\n",
"\n",
Expand All @@ -29,6 +20,7 @@
"# c = seq.count('C')\n",
"# c += seq.count('c')\n",
"# return (g + c) / len(seq)\n",
"\n",
"seq = \"This is a test of this program's capability of filtering and counting IUPAC bases. NRSCYMTVAYKYKYKDYNGHVNGMRYARAVYTGADNDTDSAVVDCNCYWRNDVRNAVRWNKGCDWKYYACMDCVTTASYKGKRRNSBRBBVWTCVNHKYYRBNCMRGRCRSKSASGKWYBBGMHTCCVRNBVYHTYMSNAKGGVGCAVRWTTKCDDWHYTGKWKKCCBARYKGHGYTWSANGDYABTGAWANNRVSMACDVNYNKKRDAGNBNTTYRRNVABVARRBWCCWRTBMGMHHVTSKNWYWMHYCTHMNRBDCYWATYKYYDHBBDGYDVVSHCVAVVBMHAMBMHVAKKGVWRGCASBTGNASARMNGAHSTRDRGMVMCGWANSYHNBWSHMRKGCWVHDWCWWKBBSSYWWVSSVHMWHVNCVDRAYBATHHHTWDKKTBKHWDBGHYAHAHKDYMBNVNYNDBVAWGBCYABDSBCYDCDTABSHRHAHSYDYNNWKCNCGNNABWVCSYCASSRASADHVKMBVMHGSCHBMWMVTNVAAWBTBNMHVCHVBTGWSGMKVSGVWBNHWARTVSWNYAVACHYVGBSGTKCKDKYBRRVWSRWNMCGCKVSTSWMVBVAASAYRCGMSRKTKVCCSHBDRASSAANKTCDMWWKRBMBGACHHBRDNBGGTGKMABTBKCSCWAHASSWARBKHVAMMNKAKYAMMAAYWKYSMHYKGSCMTSSNWWBGDCGBKDYNVNHNGCVCRYSHWWMGWGSMBRYGGNGBKSCHHVKMSVBMYSGNVWYSTNAYGGNCYMDWYVWCSRVMAAMBVBKVCNAYKBAMMASTCVNDSNGCDMANWSCDRRWDNNTYCKMWVYMWBASDAMRKDHVBTMSSRTBTANKGHHKVVTMNKMAVCSYSGSYGTSBWAMKWTAGCYYWYYSSNTSCBHVAWMRVMDKBMADVKAYBBVBWMVNTYSWANRANRSDGKRAVSSVWNVVGRDRYSHNTKSCHSNDRBTSCNAKCHWAWACCGKGSTKMYBBMDMNBMSBRSCHYDYCDAKSMMNRTYYBBGHTW\"\n",
"\n",
"def gc_strict(seq):\n",
Expand Down Expand Up @@ -64,46 +56,68 @@
" # return gc content\n",
" return gc / len(trimSeq)\n",
"\n",
"def gc_iupac2(seq):\n",
" from collections import Counter\n",
" \"\"\" Return the GC content of seq as a float, accounting for IUPAC ambiguity \n",
" >>> x = Sequence(name='chr1', seq='NMRATCGTA')\n",
" >>> y = round(x.gc, 2)\n",
" >>> y == 0.36\n",
" True\n",
" \"\"\"\n",
" trimSeq = re.sub(r'[^ACGTMRWSYKVHDBNacgtmrwsykvhdbn]', '', seq)\n",
" # count all, including fractions of GC content\n",
" seqCount = Counter(trimSeq)\n",
" gc = seq.count('S') + seq.count('C') + seq.count('G')\n",
" gc += 0.67 * (seq.count('B') + seq.count('V'))\n",
" gc += 0.5 * (seq.count('M') + seq.count('R') + seq.count('Y') + seq.count('K'))\n",
" gc += 0.33 * (seq.count('H') + seq.count('D'))\n",
" gc += 0.25 * (seq.count('N'))\n",
" # return gc content\n",
" return gc / len(trimSeq)\n",
"\n",
"\n",
"print(gc_strict(seq))\n",
"\n",
"print(gc_iupac(seq))\n"
"#print(gc_strict(seq))\n",
"\n",
"#print(gc_iupac(seq))\n",
"#print(gc_iupac2(seq))\n",
"\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"acgtmrwsykvhdbn\n"
"48.3 µs ± 996 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
]
}
],
"source": [
"'''\n",
"S\t1\n",
"C\t1\n",
"G\t1\n",
"\n",
"B\t0.67\n",
"V\t0.67\n",
"\n",
"M\t0.5\n",
"R\t0.5\n",
"Y\t0.5\n",
"K\t0.5\n",
"\n",
"H\t0.33\n",
"D\t0.33\n",
"\n",
"N\t0.25\n",
"'''\n",
"\n",
"print(\"ACGTMRWSYKVHDBN\".lower())"
"%timeit gc_iupac(seq)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"60.2 µs ± 675 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
]
}
],
"source": [
"%timeit gc_iupac2(seq)"
]
}
],
Expand Down

0 comments on commit 0ceef4b

Please sign in to comment.