nuc_sampler.cpp

// nuc_sampler - Estimates the composition of (poly-)nucleotides in a sequence sample
// @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
// @license artistic 2.0
// @version 2.0

#define _MULTI_THREADED
#include <iostream>
#include <fstream>
#include <unistd.h>
#include <stdio.h>
#include <time.h>
#include <math.h>
#include <vector>
#include "enveomics/universal.h"
#include "enveomics/sequence.h"

//#define DEBUG(a) (cerr << "(LINE " << a << ")" << endl)
#define DEBUG(a) (a)
#define LARGEST_LINE	2048
#define LARGEST_PATH	2048
#define OMAG_STEP	1024

using namespace std;

typedef struct {
   char		*kmer;	// <- The polynucleotide
   unsigned int	label;	// <- A numeric representation of the kmer
   // Total count is calculated as: base*omag + hang
   unsigned int	base;	// <- The base in the above expression
   unsigned int	omag;	// <- The omag in the above expression
   unsigned int	hang;	// <- The hang in the above expression
} count_t;

int	V=0;
char	nuc_char[] = {'N','A','C','G','T','R','Y','S','W','K','M','B','D','H','V','.'};
char	hex_char[] = {'0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'};

void help(const char *msg){
   if(msg!=NULL && strlen(msg) != 0) cerr << endl << msg << endl << endl;
   cerr << "Goal:"<< endl
   	<< "   Calculate (or approximate) the composition of (poly-)nucleotides in large datasets." << endl
   	<< "Usage:" << endl
	<< "   nuc_sampler -s sequences.fa [options]" << endl
	<< "   nuc_sampler -h" << endl
	<< "   nuc_sampler -V" << endl
   	<< "Mandatory arguments:" << endl
	<< "   -s <str> : Path to the (input) file containing the sequences.  This is lowercase s." << endl
	<< "Additional options:" << endl
	<< "   -o <str> : Path to the (output) file where results will be saved.  By default the results are sent to stdout.  This" << endl
	<< "              is the same behavior as using a dash (-).  If an empty string is provided, does not produce the output." << endl
	<< "   -e       : Produce extended output." << endl
	<< "   -f <str> : The format of the sequences.  Can be 'fasta' or 'fastq'.  By default: 'fasta'" << endl
	<< "   -k <int> : Size of the polynucleotides (words) to count.  By default: 1." << endl
	<< "   -r <int> : Random generator seed.  By default current time." << endl
	<< "   -v <int> : Verbose, for debugging purposes.  By default 0.  This is lowercase v." << endl
	<< "   -x <num> : Probability of taking a sequence into account, regardless of the sequence's length. Higher values reduce" << endl
	<< "              accuracy but increase speed.   Any value lower than 1 produces and approximation of the composition.  By" << endl
	<< "              default 1." << endl
	<< "   -h       : Display this message and exit." << endl
	<< "   -V       : Show version information and exit.  This is uppercase V." << endl
	<< "Input:" << endl
	<< "   Sequences must be in FastA or FastQ format." << endl
	<< "Output:" << endl
	<< "   - The output is a tab-separated table containing the columns:  id, kmer, count.  Id is an internal identifier; it's" << endl
	<< "     provided for debug only.   The last column can be an approximation in scientific notation for large numbers.  See" << endl
	<< "     -o." << endl
	<< "   - The extended output is intended to provide higher precision for large counts, and contains the columns: id, kmer," << endl
	<< "     count, base, order of magnitude, extra counts. The third column (count) is the approximation in the above format." << endl
	<< "     The actual count can be calculated more accurately as the product of base (fourth column)  and order of magnitude" << endl
	<< "     (fourth column), plus the extra counts (fifth column).  In this compilation, the value of order of magnitude is a" << endl
	<< "     power of " << OMAG_STEP << ".  See -o and -e." << endl
	;
   exit(1);
}

void kmer2label(unsigned int &label, char *kmer, unsigned int length){
   // Vars
   char		hex[length+1]; // char representation of some hexadecimal integer

   // label
   for(unsigned int i=0; i<length; i++){
      hex[i] = 'f'; // <- Any other weird thing
      for(int j=0; j<16; j++)
	 if(kmer[i]==nuc_char[j])
	    hex[i]=hex_char[j];
   }
   hex[length] = (char)NULL;
   sscanf(hex, "%x", &label); // hex ---> label
}

void label2kmer(char *&kmer, unsigned int label, unsigned int length){
   // Vars
   char		hex[length+1]; // char representation of some hexadecimal integer

   // label
   sprintf(hex, "%x", label); // hex <--- label
   int		lead_zeroes = length-strlen(hex);
   for(unsigned int i=0; i<lead_zeroes; i++)
      kmer[i] = nuc_char[0];
   for(unsigned int i=lead_zeroes; i<length; i++){
      for(int j=0; j<16; j++)
	 if(hex[i-lead_zeroes]==hex_char[j])
	    kmer[i]=nuc_char[j];
   }
   kmer[length] = (char)NULL;
}

void count_plus(count_t &count, unsigned int howmany){
   // ++hang
   count.hang+=howmany;
   // ++base
   if(count.hang >= count.omag){
      count.base += (unsigned int) count.hang / count.omag;
      count.hang  = count.hang % count.omag;
   }
   // ++omag
   if(count.base >= UINT_MAX/OMAG_STEP){
      count.omag *= OMAG_STEP;
      count.hang += count.base % count.omag;
      count.base /= OMAG_STEP;
   }
}

unsigned int count_polynucleotides(count_t *&counts, char *seqFile, unsigned int k, float heur){
   // Vars
   unsigned int	labels_no, linelen, label;
   ifstream	fileh;
   char		*line, *kmer;

   // Kmers and counts initialization
   labels_no = (unsigned int)pow(16, k);DEBUG(265);
   if(V>=4) fprintf(stderr, "Creating unique integer ids for %u kmers\n", labels_no); 
   counts = (count_t *)malloc(labels_no * (sizeof *counts));
   if(!counts) error("Impossible to allocate memory for that many different kmers", labels_no);
   for(unsigned int a=0; a<labels_no; a++){
      counts[a].kmer = (char *)malloc(k+1);
      if(!counts[a].kmer) error("Insufficient memory to represent the next kmer", a);
      label2kmer(counts[a].kmer, a, k);
      counts[a].label = a;
      counts[a].base = 0;
      counts[a].hang = 0;
      counts[a].omag = 1;
      if(V>=5) fprintf(stderr, "Kmer %s -> %u\n", counts[a].kmer, counts[a].label);
   }
   
   // Allocate char* memory
   if(V>=5) fprintf(stderr, "Allocating %u bits\n", CHAR_BIT*(LARGEST_LINE+1));
   line = (char *)malloc(LARGEST_LINE+1);DEBUG(279);
   if(!line) error("Impossible to allocate one temporal line in memory", LARGEST_LINE);
   if(V>=5) fprintf(stderr, "Allocating %u bits\n", CHAR_BIT*(k+1));
   kmer = (char *)malloc(k+1);
   if(!kmer) error("Impossible to allocate one temporal kmer in memory", k);
   kmer[k] = (char)NULL;

   // Read the file
   if(V>=5) cerr << "Opening file: " << seqFile << endl;
   fileh.open(seqFile, ios::in);DEBUG(286);
   if(!fileh.is_open()) error("Error reading file", seqFile);
   while(!fileh.eof()){
      fileh.getline(line, LARGEST_LINE);
      if(line[0]!='>'){DEBUG(290);
	 linelen = strlen(line);
	 if(linelen < k+1) goto next_line;
	 for(int i=0; i<linelen-k+1; i++){DEBUG(292);
	    memmove(kmer, line+i, k);DEBUG(293);
	    kmer2label(label, kmer, k);DEBUG(294);
	    count_plus(counts[label], 1);DEBUG(295);
	 }
      }
      next_line:;
   }
   fileh.close();

   return labels_no;
}

void report(count_t *&counts, unsigned int length, char *outfile, bool extended){
   ofstream	outfs;
   char		*sep = (char *)"\t", text1[100], text2[100];
   bool		std_out=false;

   if(strlen(outfile)<=0) return;
   else if(strcmp(outfile, "-")==0) std_out = true;
   else{
      outfs.open(outfile, ios::out);
      if(!outfs.is_open()) error("I can not write in the output file", outfile);
   }

   for(unsigned i=0; i<length; i++){
      sprintf(text1, "%u%s%s%s%g",
         counts[i].label, sep,
	 counts[i].kmer, sep,
	 (double)counts[i].base*counts[i].omag+counts[i].hang);
      if(std_out) cout << text1; else outfs << text1;
      if(extended){
         sprintf(text2, "%s%u%s%u%s%u", sep,
	    counts[i].base, sep,
	    counts[i].omag, sep,
	    counts[i].hang);
	 if(std_out) cout << text2; else outfs << text2;
      }
      if(std_out) cout << endl; else outfs << endl;
   }
   if(!std_out) outfs.close();
}

int main(int argc, char *argv[]) {
   cout << "nuc_sampler v1.0" << endl;
   if(argc<=1) help("");

   // Vars
   char		*file, *format=(char *)"fasta", *outfile=(char *)"-", *namFile, *seqFile;
   double	heur=1.0;
   int		rseed=time(NULL), largest_seq;
   unsigned int	k=1, labels_no, N;
   count_t	*counts;
   bool		extended=false;
   
   // GetOpt
   int		optchr;
   while ((optchr = getopt (argc, argv, "ef:ho:r:s:v:Vk:")) != EOF)
      switch(optchr) {
	 case 'e': extended = true;	break;
	 case 'f': format = optarg;	break;
	 case 'h': help("");		break;
	 case 'k': k = atoi(optarg);	break;
	 case 'o': outfile = optarg;	break;
	 case 'r': rseed=atoi(optarg);	break;
         case 's': file = optarg;	break;
	 case 'v': V = atoi(optarg);	break;
	 case 'V': return 0;
	 case 'x': heur = atof(optarg);	break;
      }
   
   // Initialize
   if(strlen(file)==0) help("");
   if(strcmp(format, "fasta")!=0 & strcmp(format, "fastq")!=0)
      help("Unsupported value for -f option");
   if(k<=0) help("Bad argument for -k option: it must be a non-zero positive integer");
   if(heur<=0.0 | heur>1.0) help("Bad argument for -x option: it must be in the range (0, 1]");
   if(outfile && (strlen(outfile)>0) & (strcmp(outfile, "-")!=0)) remove(outfile);
   srand(rseed);
   
   // Parse file
   if(V) cerr << "Counting sequences" << endl;
   N = build_index(file, format, namFile, seqFile, largest_seq);
   if(largest_seq<1) error("Your sequences are empty or an internal error occurred.  Largest sequence is ", largest_seq);
   if(V>=2) cerr << "The file " << seqFile << " was just created with sequence-only data" << endl;
   if(V>=4) cerr << "Longest sequence is: " << largest_seq << endl;
   if(N==0) error("The file you provided do not contain sequences.  Before re-run please delete the file", seqFile);
   if(V) cerr << "Reading file with " << N << " sequences" << endl;

   // Run counts
   labels_no = count_polynucleotides(counts, seqFile, k, heur);
   report(counts, labels_no, outfile, extended);

   return 0;
}