-
Notifications
You must be signed in to change notification settings - Fork 11
/
nuc_sampler.cpp
259 lines (232 loc) · 9.9 KB
/
nuc_sampler.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
// nuc_sampler - Estimates the composition of (poly-)nucleotides in a sequence sample
// @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
// @license artistic 2.0
// @version 2.0
#define _MULTI_THREADED
#include <iostream>
#include <fstream>
#include <unistd.h>
#include <stdio.h>
#include <time.h>
#include <math.h>
#include <vector>
#include "enveomics/universal.h"
#include "enveomics/sequence.h"
//#define DEBUG(a) (cerr << "(LINE " << a << ")" << endl)
#define DEBUG(a) (a)
#define LARGEST_LINE 2048
#define LARGEST_PATH 2048
#define OMAG_STEP 1024
using namespace std;
typedef struct {
char *kmer; // <- The polynucleotide
unsigned int label; // <- A numeric representation of the kmer
// Total count is calculated as: base*omag + hang
unsigned int base; // <- The base in the above expression
unsigned int omag; // <- The omag in the above expression
unsigned int hang; // <- The hang in the above expression
} count_t;
int V=0;
char nuc_char[] = {'N','A','C','G','T','R','Y','S','W','K','M','B','D','H','V','.'};
char hex_char[] = {'0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'};
void help(const char *msg){
if(msg!=NULL && strlen(msg) != 0) cerr << endl << msg << endl << endl;
cerr << "Goal:"<< endl
<< " Calculate (or approximate) the composition of (poly-)nucleotides in large datasets." << endl
<< "Usage:" << endl
<< " nuc_sampler -s sequences.fa [options]" << endl
<< " nuc_sampler -h" << endl
<< " nuc_sampler -V" << endl
<< "Mandatory arguments:" << endl
<< " -s <str> : Path to the (input) file containing the sequences. This is lowercase s." << endl
<< "Additional options:" << endl
<< " -o <str> : Path to the (output) file where results will be saved. By default the results are sent to stdout. This" << endl
<< " is the same behavior as using a dash (-). If an empty string is provided, does not produce the output." << endl
<< " -e : Produce extended output." << endl
<< " -f <str> : The format of the sequences. Can be 'fasta' or 'fastq'. By default: 'fasta'" << endl
<< " -k <int> : Size of the polynucleotides (words) to count. By default: 1." << endl
<< " -r <int> : Random generator seed. By default current time." << endl
<< " -v <int> : Verbose, for debugging purposes. By default 0. This is lowercase v." << endl
<< " -x <num> : Probability of taking a sequence into account, regardless of the sequence's length. Higher values reduce" << endl
<< " accuracy but increase speed. Any value lower than 1 produces and approximation of the composition. By" << endl
<< " default 1." << endl
<< " -h : Display this message and exit." << endl
<< " -V : Show version information and exit. This is uppercase V." << endl
<< "Input:" << endl
<< " Sequences must be in FastA or FastQ format." << endl
<< "Output:" << endl
<< " - The output is a tab-separated table containing the columns: id, kmer, count. Id is an internal identifier; it's" << endl
<< " provided for debug only. The last column can be an approximation in scientific notation for large numbers. See" << endl
<< " -o." << endl
<< " - The extended output is intended to provide higher precision for large counts, and contains the columns: id, kmer," << endl
<< " count, base, order of magnitude, extra counts. The third column (count) is the approximation in the above format." << endl
<< " The actual count can be calculated more accurately as the product of base (fourth column) and order of magnitude" << endl
<< " (fourth column), plus the extra counts (fifth column). In this compilation, the value of order of magnitude is a" << endl
<< " power of " << OMAG_STEP << ". See -o and -e." << endl
;
exit(1);
}
void kmer2label(unsigned int &label, char *kmer, unsigned int length){
// Vars
char hex[length+1]; // char representation of some hexadecimal integer
// label
for(unsigned int i=0; i<length; i++){
hex[i] = 'f'; // <- Any other weird thing
for(int j=0; j<16; j++)
if(kmer[i]==nuc_char[j])
hex[i]=hex_char[j];
}
hex[length] = (char)NULL;
sscanf(hex, "%x", &label); // hex ---> label
}
void label2kmer(char *&kmer, unsigned int label, unsigned int length){
// Vars
char hex[length+1]; // char representation of some hexadecimal integer
// label
sprintf(hex, "%x", label); // hex <--- label
int lead_zeroes = length-strlen(hex);
for(unsigned int i=0; i<lead_zeroes; i++)
kmer[i] = nuc_char[0];
for(unsigned int i=lead_zeroes; i<length; i++){
for(int j=0; j<16; j++)
if(hex[i-lead_zeroes]==hex_char[j])
kmer[i]=nuc_char[j];
}
kmer[length] = (char)NULL;
}
void count_plus(count_t &count, unsigned int howmany){
// ++hang
count.hang+=howmany;
// ++base
if(count.hang >= count.omag){
count.base += (unsigned int) count.hang / count.omag;
count.hang = count.hang % count.omag;
}
// ++omag
if(count.base >= UINT_MAX/OMAG_STEP){
count.omag *= OMAG_STEP;
count.hang += count.base % count.omag;
count.base /= OMAG_STEP;
}
}
unsigned int count_polynucleotides(count_t *&counts, char *seqFile, unsigned int k, float heur){
// Vars
unsigned int labels_no, linelen, label;
ifstream fileh;
char *line, *kmer;
// Kmers and counts initialization
labels_no = (unsigned int)pow(16, k);DEBUG(265);
if(V>=4) fprintf(stderr, "Creating unique integer ids for %u kmers\n", labels_no);
counts = (count_t *)malloc(labels_no * (sizeof *counts));
if(!counts) error("Impossible to allocate memory for that many different kmers", labels_no);
for(unsigned int a=0; a<labels_no; a++){
counts[a].kmer = (char *)malloc(k+1);
if(!counts[a].kmer) error("Insufficient memory to represent the next kmer", a);
label2kmer(counts[a].kmer, a, k);
counts[a].label = a;
counts[a].base = 0;
counts[a].hang = 0;
counts[a].omag = 1;
if(V>=5) fprintf(stderr, "Kmer %s -> %u\n", counts[a].kmer, counts[a].label);
}
// Allocate char* memory
if(V>=5) fprintf(stderr, "Allocating %u bits\n", CHAR_BIT*(LARGEST_LINE+1));
line = (char *)malloc(LARGEST_LINE+1);DEBUG(279);
if(!line) error("Impossible to allocate one temporal line in memory", LARGEST_LINE);
if(V>=5) fprintf(stderr, "Allocating %u bits\n", CHAR_BIT*(k+1));
kmer = (char *)malloc(k+1);
if(!kmer) error("Impossible to allocate one temporal kmer in memory", k);
kmer[k] = (char)NULL;
// Read the file
if(V>=5) cerr << "Opening file: " << seqFile << endl;
fileh.open(seqFile, ios::in);DEBUG(286);
if(!fileh.is_open()) error("Error reading file", seqFile);
while(!fileh.eof()){
fileh.getline(line, LARGEST_LINE);
if(line[0]!='>'){DEBUG(290);
linelen = strlen(line);
if(linelen < k+1) goto next_line;
for(int i=0; i<linelen-k+1; i++){DEBUG(292);
memmove(kmer, line+i, k);DEBUG(293);
kmer2label(label, kmer, k);DEBUG(294);
count_plus(counts[label], 1);DEBUG(295);
}
}
next_line:;
}
fileh.close();
return labels_no;
}
void report(count_t *&counts, unsigned int length, char *outfile, bool extended){
ofstream outfs;
char *sep = (char *)"\t", text1[100], text2[100];
bool std_out=false;
if(strlen(outfile)<=0) return;
else if(strcmp(outfile, "-")==0) std_out = true;
else{
outfs.open(outfile, ios::out);
if(!outfs.is_open()) error("I can not write in the output file", outfile);
}
for(unsigned i=0; i<length; i++){
sprintf(text1, "%u%s%s%s%g",
counts[i].label, sep,
counts[i].kmer, sep,
(double)counts[i].base*counts[i].omag+counts[i].hang);
if(std_out) cout << text1; else outfs << text1;
if(extended){
sprintf(text2, "%s%u%s%u%s%u", sep,
counts[i].base, sep,
counts[i].omag, sep,
counts[i].hang);
if(std_out) cout << text2; else outfs << text2;
}
if(std_out) cout << endl; else outfs << endl;
}
if(!std_out) outfs.close();
}
int main(int argc, char *argv[]) {
cout << "nuc_sampler v1.0" << endl;
if(argc<=1) help("");
// Vars
char *file, *format=(char *)"fasta", *outfile=(char *)"-", *namFile, *seqFile;
double heur=1.0;
int rseed=time(NULL), largest_seq;
unsigned int k=1, labels_no, N;
count_t *counts;
bool extended=false;
// GetOpt
int optchr;
while ((optchr = getopt (argc, argv, "ef:ho:r:s:v:Vk:")) != EOF)
switch(optchr) {
case 'e': extended = true; break;
case 'f': format = optarg; break;
case 'h': help(""); break;
case 'k': k = atoi(optarg); break;
case 'o': outfile = optarg; break;
case 'r': rseed=atoi(optarg); break;
case 's': file = optarg; break;
case 'v': V = atoi(optarg); break;
case 'V': return 0;
case 'x': heur = atof(optarg); break;
}
// Initialize
if(strlen(file)==0) help("");
if(strcmp(format, "fasta")!=0 & strcmp(format, "fastq")!=0)
help("Unsupported value for -f option");
if(k<=0) help("Bad argument for -k option: it must be a non-zero positive integer");
if(heur<=0.0 | heur>1.0) help("Bad argument for -x option: it must be in the range (0, 1]");
if(outfile && (strlen(outfile)>0) & (strcmp(outfile, "-")!=0)) remove(outfile);
srand(rseed);
// Parse file
if(V) cerr << "Counting sequences" << endl;
N = build_index(file, format, namFile, seqFile, largest_seq);
if(largest_seq<1) error("Your sequences are empty or an internal error occurred. Largest sequence is ", largest_seq);
if(V>=2) cerr << "The file " << seqFile << " was just created with sequence-only data" << endl;
if(V>=4) cerr << "Longest sequence is: " << largest_seq << endl;
if(N==0) error("The file you provided do not contain sequences. Before re-run please delete the file", seqFile);
if(V) cerr << "Reading file with " << N << " sequences" << endl;
// Run counts
labels_no = count_polynucleotides(counts, seqFile, k, heur);
report(counts, labels_no, outfile, extended);
return 0;
}