default memory is now 95% of available RAM

felipelouza · Dec 18, 2020 · 06a7188 · 06a7188
1 parent f67cba4
commit 06a7188
Show file tree

Hide file tree

Showing 2 changed files with 35 additions and 36 deletions.
diff --git a/README.md b/README.md
@@ -59,8 +59,8 @@ All input and output files are uncompressed. The value 0 is used as the eof symb
 
 ## Main command line options
 
-*-m, --mem*     
-  specify memory assigned to the algorithm in MB. This is a *mandatory* option. **Note:** do not assign all the available RAM to the algorithm: leave at least 5% to the operating system.
+*-m, --mem*
+  specify memory assigned to the algorithm in MB. Default is 95% of the available RAM.
 
 *-o, --out*        
   specify basename for output and temporary files
@@ -77,10 +77,10 @@ All input and output files are uncompressed. The value 0 is used as the eof symb
 *--lbytes*      
   number of bytes for each LCP entry (def. 2)
 
-*-v*       
+*-v*
   verbose output in the log file
 
-*-h, --help*      
+*-h, --help*
   show usage
 
 
@@ -100,22 +100,21 @@ Use the options:
 *--sbytes*      
   number of bytes for each SA entry (def. 4)
 
-### Merging BWT files
+### Document array requirements
 
-In the case you want to **merge BWT files** and later compute the **Document Array**, you must provide DA and `file.docs` with the following option:
-
-*--da --docs*      
-  compute Document Array and output the number of documents into `file.docs` (required to use --bwt --da)
+If you want to simultaneously **merge BWT files** and compute the **Document Array**, then for each input BWT you must provide, in addition to the DA, a `.docs` file containing the number of documents in the file in 64-bit little-endian format. The `.docs` file is automatically computed when the option *-d, --da* is used.
 
 **Example**
 
 ```sh
-./eGap -m 4096 dataset/file1.fastq -o file1 --da --docs
-./eGap -m 4096 dataset/file2.fastq -o file2 --da --docs
+./eGap -m 4096 dataset/file1.fastq -o file1 --da
+./eGap -m 4096 dataset/file2.fastq -o file2 --da
 
 ./eGap -m 4096 --bwt -o merge file1.bwt file2.bwt --da
 ```
 
+The first two commands compute `file1.bwt`, `file1.da`, `file1.docs` and `file2.bwt`, `file2.da`, `file2.docs`, which are used by the third command to compute `merge.bwt`, `merge.da`, and `merge.docs`.
+
 
 ## Truncated LCP values and de Bruijn graph info 
 
@@ -125,8 +124,8 @@ threshold *k*. Using the option *--trlcp k*, as an altenative to *--lcp*,
 the algorithm computes an LCP array in which all values greater than *k* are
 replaced by the value *k*.
 
-*--trlcp*
-  compute LCP values only up to TRLCP (truncated LCP)
+*--trlcp k*
+  truncate LCP values to the value *k*
 
 
 Another option offered by eGap, alternative to (truncated) LCP, 
@@ -138,8 +137,11 @@ the BOSS representation of the de Bruijn graph as described in the
 [Application](https://almob.biomedcentral.com/articles/10.1186/s13015-019-0140-0#Sec14)
 section of the [AMB paper](https://doi.org/10.1186/s13015-019-0140-0). 
 
-*--deB*      
-  compute info for order-DEB deBruijn graph
+*--deB K*
+  compute info for building the order-K deBruijn graph
+
+**Notice:** if the options *--trlcp* or *--deB* are used, suffixes are sorted only up to the first *k* symbols, so the resulting BWT *will not* be the standard one.
+
 
 
 ## Datasets

diff --git a/eGap b/eGap
@@ -3,7 +3,7 @@
 import sys, time, argparse, subprocess, os.path, shutil, struct
 from psutil import virtual_memory
 
-Version = "v2.0"
+Version = "v2.1"
 
 Description = """Tool to build the BWT and optionally the LCP and DA array for a collection 
 of sequences in external memory. There are two different usages depending 
@@ -50,13 +50,10 @@ POS_SIZE = 5    # must be equal to POS_SIZE in config.h
 
 
 def main():
-  # compute available memory in MB 
-  mem = virtual_memory().total  # total memory in bytes 
-  mem = int(0.95*mem/2**20)
   # init commad line parser 
   parser = argparse.ArgumentParser(description=Description, formatter_class=argparse.RawTextHelpFormatter)
   parser.add_argument('input', help='input file name(s)', type=str, nargs='+')
-  #parser.add_argument('-m', '--mem', help='available memory in MB', type=int, required=True)
+  parser.add_argument('-m', '--mem', help='use at most M MBs (def. 95%% of available RAM)',default=-1, type=int)
   parser.add_argument('-o', '--out', help='output base name (def. input base name)', default="", type=str)  
   parser.add_argument('-b', '--bwt', help='inputs are bwt files',action='store_true')
   parser.add_argument('-l', '--lcp', help='compute LCP Array',action='store_true')
@@ -70,25 +67,31 @@ def main():
   parser.add_argument('--deB', help='compute info for building a deBruijn graph of order DEB', default=0, type=int)
   parser.add_argument('--sum', help='compute output files shasum',action='store_true')
   parser.add_argument('--delete', help='delete output files (only with --sum)',action='store_true')
-  parser.add_argument('--docs', help='stores the number of documents (ext: .docs), required for --bwt --da',action='store_true')
   parser.add_argument('-1', '--phase1', help='stop after phase 1 (debug only)',action='store_true')  
   parser.add_argument('-2', '--phase2', help='stop after phase 2 (debug only)',action='store_true')  
   parser.add_argument('-v',  help='verbose: extra info in the log file',action='store_true')
-  requiredNamed = parser.add_argument_group('required named arguments')
-  requiredNamed.add_argument('-m', '--mem', help='RAM assigned to the computation in MB (%d suggested)' % mem, type=int, required=True)
+  #requiredNamed = parser.add_argument_group('required named arguments')
+  #requiredNamed.add_argument('-m', '--mem', help='RAM assigned to the computation in MB (%d suggested)' % mem, type=int, required=True)
   args = parser.parse_args()
+  # if no max RAM provided on command line uses 95% of total 
+  if(args.mem<0):
+    mem = virtual_memory().total
+    args.mem = max(16,int(0.95*mem/2**20)) # avoid accidental 0 
+    print("Using {0} MBs of RAM".format(args.mem))
+
   # ---- check number of input files and define basename
   check_input(args)
   # ---- create and open log file
   logfile_name = args.basename + ".eGap.log"
   # get main eGap directory 
   args.egap_dir = os.path.split(sys.argv[0])[0]
   print("Sending logging messages to file:", logfile_name)
-  with open(logfile_name,"w") as logfile:  
+  with open(logfile_name,"a") as logfile:  
 
     print(">>> Begin computation",file=logfile)
     print(">>> eGap version " + Version,file=logfile)
     show_command_line(logfile)
+    print("Using {0} MBs of RAM".format(args.mem), file=logfile)
     logfile.flush()
 
     # ---- phase1: concatenate/compute BWTs
@@ -158,13 +161,7 @@ def main():
           os.remove("{f}.{n}.sa".format(f=args.basename,n=args.sbytes))
       except OSError as  e:                 
         # if failed, report it back to the user and stop
-        print ("Error: %s - %s." % (e.filename,e.strerror))      
-    if (args.docs == 0 and args.da):
-      try:
-        os.remove(args.basename+".docs")
-      except OSError as  e:                 
-        # if failed, report it back to the user and stop
-        print ("Error: %s - %s." % (e.filename,e.strerror))      
+        print ("Error: %s - %s." % (e.filename,e.strerror))
     print(">>> End test", file=logfile);
   return
 
@@ -246,25 +243,25 @@ def phase1(args,logfile, logfile_name):
       for name in args.input:
         with open(name,'rb') as fd:
           shutil.copyfileobj(fd, bwtfile, 1024*1024*10) # 10 MBs buffer
+    # if da requested we must have partial docs files: we concatenate them in a new .docs file
     if(args.da): 
       print("==== creating .docs file")    
       with open(args.basename+ ".docs","wb") as docsfile:
         for name in args.input:
-          filename = os.path.splitext(name)[0]+".docs".format(byts=args.dbytes)
-          print(filename)
+          filename = os.path.splitext(name)[0]+".docs"
           with open(filename,'rb') as fd:
             shutil.copyfileobj(fd, docsfile, 1024*1024*10) # 10 MBs buffer
-        print("==== creating .da_bl file")
+      # concatenate .da files in a single .da_bl file 
+      print("==== creating .da_bl file")
       with open(args.basename+".{byts}.da_bl".format(byts=args.dbytes),"wb") as dablfile:
         for name in args.input:
           filename = os.path.splitext(name)[0]+".{byts}.da".format(byts=args.dbytes)
-          print(filename)
           with open(filename,'rb') as fd:
             shutil.copyfileobj(fd, dablfile, 1024*1024*10) # 10 MBs buffer
     return True # everything fine
   else:        
     # ---- gSACAK    
-    # shall we use gsaka or gsaka64?
+    # We must compute BWTs. Shall we use gsaka or gsaka64?
     if args.mem//5 < 2020:    # less than 10GB: OK 32 bit 
       exe = os.path.join(args.egap_dir,gsacak_exe)
     elif args.mem//9 < 2020:  # less than 18GB: use 32bit with RAM = 10GB