-
Notifications
You must be signed in to change notification settings - Fork 5
/
ribodbmaker
executable file
·4247 lines (3895 loc) · 252 KB
/
ribodbmaker
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env perl
use strict;
use warnings;
use Getopt::Long;
use Time::HiRes qw(gettimeofday);
require "ribo.pm";
# make sure the RIBOSCRIPTSDIR and RIBOEASEL variables are set, others we will wait to see
# if they are required first
my $env_riboscripts_dir = utl_DirEnvVarValid("RIBOSCRIPTSDIR");
my $env_riboeasel_dir = utl_DirEnvVarValid("RIBOEASELDIR");
my $env_vecplus_dir = undef;
my $env_riboblast_dir = undef;
my $df_model_dir = $env_riboscripts_dir . "/models/";
my $df_tax_dir = $env_riboscripts_dir . "/taxonomy/";
#########################################################
# Command line and option processing using sqp_opts.pm
#
# opt_HH: 2D hash:
# 1D key: option name (e.g. "-h")
# 2D key: string denoting type of information
# (one of "type", "default", "group", "requires", "incompatible", "preamble", "help")
# value: string explaining 2D key:
# "type": "boolean", "string", "integer" or "real"
# "default": default value for option
# "group": integer denoting group number this option belongs to
# "requires": string of 0 or more other options this option requires to work, each separated by a ','
# "incompatible": string of 0 or more other options this option is incompatible with, each separated by a ','
# "preamble": string describing option for preamble section (beginning of output from script)
# "help": string describing option for help section (printed if -h used)
# "setby": '1' if option set by the user, else 'undef'
# "value": value for option, can be undef if default is undef
#
# opt_order_A: array of options in the order they should be processed
#
# opt_group_desc_H: key: group number (integer), value: description of group for help output
my %opt_HH = ();
my @opt_order_A = ();
my %opt_group_desc_H = ();
my $g = 0;
# Add all options to %opt_HH and @opt_order_A.
# This section needs to be kept in sync (manually) with the &GetOptions call below
$opt_group_desc_H{++$g} = "basic options";
# option type default group requires incompat preamble-output help-output
opt_Add("-h", "boolean", 0, 0, undef, undef, undef, "display this help", \%opt_HH, \@opt_order_A);
opt_Add("-f", "boolean", 0, $g, undef, undef, "forcing directory overwrite", "force; if <output directory> exists, overwrite it", \%opt_HH, \@opt_order_A);
opt_Add("-v", "boolean", 0, $g, undef, undef, "be verbose", "be verbose; output commands to stdout as they're run", \%opt_HH, \@opt_order_A);
opt_Add("-n", "integer", 1, $g, undef, "-p", "use <n> CPUs", "use <n> CPUs", \%opt_HH, \@opt_order_A);
opt_Add("--keep", "boolean", 0, $g, undef, undef, "keep all intermediate files", "keep all intermediate files that are removed by default", \%opt_HH, \@opt_order_A);
opt_Add("--special", "string", undef, $g, undef, undef, "read list of special species taxids from <s>", "read list of special species taxids from <s>", \%opt_HH, \@opt_order_A);
opt_Add("--taxin", "string", undef, $g, undef, undef, "use taxonomy tree file <s> instead of default", "use taxonomy tree file <s> instead of default", \%opt_HH, \@opt_order_A);
$opt_group_desc_H{++$g} = "options for skipping stages";
# option type default group requires incompat preamble-output help-output
opt_Add("--skipfambig", "boolean", 0, $g, undef, undef, "skip stage that filters based on ambiguous nucleotides", "skip stage that filters based on ambiguous nucleotides", \%opt_HH, \@opt_order_A);
opt_Add("--skipftaxid", "boolean", 0, $g,"--skipmstbl", undef, "skip stage that filters by taxid", "skip stage that filters by taxid", \%opt_HH, \@opt_order_A);
opt_Add("--skipfvecsc", "boolean", 0, $g, undef, undef, "skip stage that filters based on VecScreen", "skip stage that filters based on VecScreen", \%opt_HH, \@opt_order_A);
opt_Add("--skipfblast", "boolean", 0, $g, undef, undef, "skip stage that filters based on BLAST hits to self", "skip stage that filters based on BLAST hits to self", \%opt_HH, \@opt_order_A);
opt_Add("--skipfribo1", "boolean", 0, $g, undef, undef, "skip 1st stage that filters based on ribotyper", "skip 1st stage that filters based on ribotyper", \%opt_HH, \@opt_order_A);
opt_Add("--skipfribo2", "boolean", 0, $g,"--skipfmspan,--skipingrup,--skipclustr", undef, "skip 2nd stage that filters based on ribotyper/riboaligner", "skip 2nd stage that filters based on ribotyper/riboaligner", \%opt_HH, \@opt_order_A);
opt_Add("--skipfmspan", "boolean", 0, $g, undef, undef, "skip stage that filters based on model span of hits", "skip stage that filters based on model span of hits", \%opt_HH, \@opt_order_A);
opt_Add("--skipingrup", "boolean", 0, $g, undef, undef, "skip stage that filters based on ingroup analysis", "skip stage that performs ingroup analysis", \%opt_HH, \@opt_order_A);
opt_Add("--skipclustr", "boolean", 0, $g, undef, undef, "skip stage that clusters surviving sequences", "skip stage that clusters sequences surviving all filters", \%opt_HH, \@opt_order_A);
opt_Add("--skiplistms", "boolean", 0, $g, undef, undef, "skip stage that lists missing taxids", "skip stage that lists missing taxids", \%opt_HH, \@opt_order_A);
opt_Add("--skipmstbl", "boolean", 0, $g, undef, undef, "skip stage that outputs model span tables", "skip stage that outputs model span tables", \%opt_HH, \@opt_order_A);
$opt_group_desc_H{++$g} = "options for excluding seqs based on taxid pre-clustering, but after filter and ingroup stages";
# option type default group requires incompat preamble-output help-output
opt_Add("--exclist", "string", undef, $g, undef, undef, "exclude any seq w/a seq taxid listed in file <s>, post-filters/ingroup", "exclude any seq w/a seq taxid listed in file <s>, post-filters/ingroup", \%opt_HH, \@opt_order_A);
$opt_group_desc_H{++$g} = "options for controlling the stage that filters based on ambiguous nucleotides";
# option type default group requires incompat preamble-output help-output
opt_Add("--famaxn", "integer", 5, $g, undef,"--skipfambig", "set maximum number of allowed ambiguous nts to <n>", "set maximum number of allowed ambiguous nts to <n>", \%opt_HH, \@opt_order_A);
opt_Add("--famaxf", "real", 0.005, $g, undef,"--skipfambig", "set maximum fraction of of allowed ambiguous nts to <x>", "set maximum fraction of allowed ambiguous nts to <x>", \%opt_HH, \@opt_order_A);
opt_Add("--faonlyn", "boolean", 0, $g, undef,"--skipfambig,--famaxf,--faonlyf", "enforce only max number of ambiguous nts", "enforce only max number of ambiguous nts", \%opt_HH, \@opt_order_A);
opt_Add("--faonlyf", "boolean", 0, $g, undef,"--skipfambig,--famaxn,--faonlyn", "enforce only max fraction of ambiguous nts", "enforce only max fraction of ambiguous nts", \%opt_HH, \@opt_order_A);
$opt_group_desc_H{++$g} = "options for controlling the stage that filters by taxid";
# option type default group requires incompat preamble-output help-output
opt_Add("--ftstrict", "boolean", 0, $g, undef,"--skipftaxid", "require all taxids for sequences exist in input NCBI taxonomy tree", "require all taxids for sequences exist in input NCBI taxonomy tree", \%opt_HH, \@opt_order_A);
$opt_group_desc_H{++$g} = "options for controlling the stage that filters based on self-BLAST hits";
# option type default group requires incompat preamble-output help-output
opt_Add("--fbcsize", "integer", 20, $g, undef, "--skipfblast", "set num seqs for each BLAST run to <n>", "set num seqs for each BLAST run to <n>", \%opt_HH, \@opt_order_A);
opt_Add("--fbcall", "boolean", 0, $g, undef, "--skipfblast", "do single BLAST run with all N seqs", "do single BLAST run with all N seqs (CAUTION: slow for large N)", \%opt_HH, \@opt_order_A);
opt_Add("--fbword", "integer", 20, $g, undef, "--skipfblast", "set word_size for BLAST to <n>", "set word_size for BLAST to <n>", \%opt_HH, \@opt_order_A);
opt_Add("--fbevalue", "real", 1, $g, undef, "--skipfblast", "set BLAST E-value cutoff to <x>", "set BLAST E-value cutoff to <x>", \%opt_HH, \@opt_order_A);
opt_Add("--fbdbsize", "integer", 200000000, $g, undef, "--skipfblast", "set BLAST dbsize value to <n>", "set BLAST dbsize value to <n>", \%opt_HH, \@opt_order_A);
opt_Add("--fbnominus", "boolean", 0, $g, undef, "--skipfblast", "do not consider BLAST self hits to minus strand", "do not consider BLAST self hits to minus strand", \%opt_HH, \@opt_order_A);
opt_Add("--fbmdiagok", "boolean", 0, $g, undef, "--skipfblast,--fbnominus", "consider on-diagonal BLAST self hits to minus strand", "consider on-diagonal BLAST self hits to minus strand", \%opt_HH, \@opt_order_A);
opt_Add("--fbminuslen", "integer", 50, $g, undef, "--skipfblast,--fbnominus", "minimum length of BLAST self hit to minus strand is <n>", "minimum length of BLAST self hit to minus strand is <n>", \%opt_HH, \@opt_order_A);
opt_Add("--fbminuspid", "real", 95.0, $g, undef, "--skipfblast,--fbnominus", "minimum percent id of BLAST self hit to minus strand is <x>", "minimum percent id of BLAST self hit to minus strand is <x>", \%opt_HH, \@opt_order_A);
$opt_group_desc_H{++$g} = "options for controlling both ribotyper/riboaligner stages";
# option type default group requires incompat preamble-output help-output
opt_Add("--model", "string", undef, $g, undef, undef, "model to use is <s>", "model to use is <s> (e.g. SSU.Eukarya)", \%opt_HH, \@opt_order_A);
opt_Add("--noscfail", "boolean", 0, $g, undef, undef, "do not fail sequences in ribotyper with low scores", "do not fail sequences in ribotyper with low scores", \%opt_HH, \@opt_order_A);
opt_Add("--lowppossc", "real", 0.50, $g, undef, undef, "set --lowppossc <x> option for ribotyper to <x>", "set --lowppossc <x> option for ribotyper to <x>", \%opt_HH, \@opt_order_A);
$opt_group_desc_H{++$g} = "options for controlling the first stage that filters based on ribotyper";
# option type default group requires incompat preamble-output help-output
opt_Add("--riboopts1", "string", undef, $g, undef, "--skipfribo1,--ribodir1", "use ribotyper options listed in <s> for round 1", "use ribotyper options listed in <s>", \%opt_HH, \@opt_order_A);
opt_Add("--ribodir1", "string", undef, $g, undef, "--skipfribo1", "use pre-computed ribotyper dir <s>", "use pre-computed ribotyper dir <s>", \%opt_HH, \@opt_order_A);
$opt_group_desc_H{++$g} = "options for controlling the second stage that filters based on ribotyper/riboaligner";
# option type default group requires incompat preamble-output help-output
opt_Add("--rainfo", "string", undef, $g, undef, "--skipfribo2,--ribodir2", "use ra model info file <s> instead of default", "use riboaligner model info file <s> instead of default", \%opt_HH, \@opt_order_A);
opt_Add("--nomultfail", "boolean", 0, $g, undef, "--skipfribo2,--ribodir2", "do not fail sequences in ribotyper stage 2 with multiple hits", "do not fail sequences in ribotyper with multiple hits", \%opt_HH, \@opt_order_A);
opt_Add("--nocovfail", "boolean", 0, $g, undef, "--skipfribo2,--ribodir2", "do not fail sequences in ribotyper stage 2 with low coverage", "do not fail sequences in ribotyper with low coverage", \%opt_HH, \@opt_order_A);
opt_Add("--nodifffail", "boolean", 0, $g, undef, "--skipfribo2,--ribodir2", "do not fail sequences in ribotyper stage 2 with low score difference", "do not fail sequences in ribotyper with low score difference", \%opt_HH, \@opt_order_A);
opt_Add("--tcov", "real", 0.99, $g, undef, "--skipfribo2,--ribodir2", "set --tcov <x> option for ribotyper stage 2 to <x>", "set --tcov <x> option for ribotyper to <x>", \%opt_HH, \@opt_order_A);
opt_Add("--ribo2hmm", "boolean", 0, $g,"--skipfribo1","--skipfribo2,--ribodir2", "run ribotyper stage 2 in HMM-only mode (do not use --2slow)", "run ribotyper stage 2 in HMM-only mode (do not use --2slow)", \%opt_HH, \@opt_order_A);
opt_Add("--riboopts2", "string", undef, $g, undef, "--skipfribo2,--ribodir2", "use ribotyper stage 2 options listed in <s>", "use ribotyper options listed in <s>", \%opt_HH, \@opt_order_A);
opt_Add("--ribodir2", "string", undef, $g, undef, "--skipfribo2", "use pre-computed riboaligner dir <s>", "use pre-computed riboaligner dir <s>", \%opt_HH, \@opt_order_A);
opt_Add("--max5pins", "integer", undef, $g, undef, "--skipfribo2", "FAIL seqs with > <n> inserts before first model position", "FAIL seqs with > <n> inserts before first model position", \%opt_HH, \@opt_order_A);
opt_Add("--max3pins", "integer", undef, $g, undef, "--skipfribo2", "FAIL seqs with > <n> inserts after final model position", "FAIL seqs with > <n> inserts after final model position", \%opt_HH, \@opt_order_A);
opt_Add("--passlenclass","string", undef, $g, undef, "--skipfribo2", "PASS seqs in riboaligner length classes in comma separated string <s>", "PASS seqs in riboaligner length classes in comma separated string <s>", \%opt_HH, \@opt_order_A);
opt_Add("--faillenclass","string", undef, $g, undef, "--skipfribo2", "FAIL seqs in riboaligner length classes in comma separated string <s>", "FAIL seqs in riboaligner length classes in comma separated string <s>", \%opt_HH, \@opt_order_A);
$opt_group_desc_H{++$g} = "options for controlling the stage that filters based on model span of hits";
# option type default group requires incompat preamble-output help-output
opt_Add("--fmpos", "integer", 60, $g, undef, "--skipfmspan", "aligned sequences must span from <n> to L - <n> + 1", "aligned sequences must span from <n> to L - <n> + 1 for model of length L", \%opt_HH, \@opt_order_A);
opt_Add("--fmlpos", "integer", undef, $g, "--fmrpos","--skipfmspan,--fmpos", "aligned sequences must begin at or 5' of position <n>", "aligned sequences must begin at or 5' of position <n>", \%opt_HH, \@opt_order_A);
opt_Add("--fmrpos", "integer", undef, $g, "--fmlpos","--skipfmspan,--fmpos", "aligned sequences must end at or 3' of position <n>", "aligned sequences must end at or 3' of position <n>", \%opt_HH, \@opt_order_A);
opt_Add("--fmnogap", "boolean", 0, $g, undef, "--skipfmspan", "require sequences do not have a gap at lpos and rpos", "require sequences do not have a gap at lpos and rpos", \%opt_HH, \@opt_order_A);
$opt_group_desc_H{++$g} = "options for controlling clustering stage";
# option type default group requires incompat preamble-output help-output
opt_Add("--cfid", "real", 0.995, $g, undef, "--skipclustr", "set esl-cluster fractional identity to cluster at to <x>", "set esl-cluster fractional identity to cluster at to <x>", \%opt_HH, \@opt_order_A);
opt_Add("--cdthresh", "real", 0.0025, $g, undef, "--skipclustr,--ccentroid,--cmaxlen", "representative is longest seq within <x> distance of min distance seq", "representative is longest seq within <x> distance of min distance seq", \%opt_HH, \@opt_order_A);
opt_Add("--cmaxlen", "boolean", 0, $g, undef, "--skipclustr,--cdthresh,--ccentroid", "representative is longest seq in cluster", "representative is longest seq within cluster", \%opt_HH, \@opt_order_A);
opt_Add("--ccentroid", "boolean", 0, $g, undef, "--skipclustr,--cdthresh,--cmaxlen", "representative is centroid (min distance seq)", "representative is centroid (min distance seq)", \%opt_HH, \@opt_order_A);
$opt_group_desc_H{++$g} = "options that affect the alignment from which percent identities are calculated";
# option type default group requires incompat preamble-output help-output
opt_Add("--fullaln", "boolean", 0, $g, undef, undef, "do not trim alnment to min reqd span before pid calcs", "do not trim alignment to minimum required span before pid calculations", \%opt_HH, \@opt_order_A);
opt_Add("--noprob", "boolean", 0, $g, undef, undef, "do not trim alnment based on post probs before pid calcs", "do not trim alignment based on post probs before pid calculations", \%opt_HH, \@opt_order_A);
opt_Add("--pthresh", "real", 0.95, $g, undef,"--noprob", "posterior probability threshold for alnment trimming is <x>", "posterior probability threshold for alnment trimming is <x>", \%opt_HH, \@opt_order_A);
opt_Add("--pfract", "real", 0.95, $g, undef,"--noprob", "seq fraction threshold for post prob alnment trimming is <x>", "seq fraction threshold for post prob alnment trimming is <x>", \%opt_HH, \@opt_order_A);
$opt_group_desc_H{++$g} = "options for reducing the number of passing sequences per taxid";
# option type default group requires incompat preamble-output help-output
opt_Add("--fione", "boolean", 0, $g, undef, "--skipingrup", "only allow 1 sequence per (species) taxid to survive ingroup filter", "only allow 1 sequence per (species) taxid to survive ingroup filter", \%opt_HH, \@opt_order_A);
opt_Add("--fimin", "integer", 1, $g,"--fione", "--skipingrup", "w/--fione, remove all sequences from species with < <n> sequences", "w/--fione, remove all sequences from species with < <n> sequences", \%opt_HH, \@opt_order_A);
opt_Add("--figroup", "boolean", 0, $g,"--fione", "--skipingrup", "w/--fione, keep winner (len/avg pid) in group (order,class,phyla), not in taxid", "w/--fione, keep winner (len/avg pid) in group (order,class,phyla), not in taxid", \%opt_HH, \@opt_order_A);
opt_Add("--fithresh", "real", 0.2, $g,"--fione", "--skipingrup", "w/--fione, winning seq is longest seq within <x> percent id of max percent id", "w/--fione, winning seq is longest seq within <x> percent id of max percent id", \%opt_HH, \@opt_order_A);
$opt_group_desc_H{++$g} = "options for modifying the ingroup stage";
# option type default group requires incompat preamble-output help-output
opt_Add("--indiffseqtax","boolean", 0, $g, undef, "--skipingrup", "only consider sequences from different seq taxids when computing averages and maxes", "only consider sequences from different seq taxids when computing averages and maxes", \%opt_HH, \@opt_order_A);
opt_Add("--inminavgid", "real", 99.8, $g, undef, "--skipingrup", "fail any sequence with average percent identity within species taxid below <x>", "fail any sequence with average percent identity within species taxid below <x>", \%opt_HH, \@opt_order_A);
opt_Add("--innominavgid","boolean", 0, $g, undef, "--skipingrup,--inminavgid", "do not fail sequences with avg percent identity within species below a minimum", "do not fail sequences with avg percent identity within species taxid below a minimum", \%opt_HH, \@opt_order_A);
$opt_group_desc_H{++$g} = "options for controlling model span survival table output file";
# option type default group requires incompat preamble-output help-output
opt_Add("--msstep", "integer", 10, $g, undef, "--skipfribo2,--skipmstbl", "for model span output table, set step size to <n>", "for model span output table, set step size to <n>", \%opt_HH, \@opt_order_A);
opt_Add("--msminlen", "integer", 200, $g, undef, "--skipfribo2,--skipmstbl", "for model span output table, set min length span to <n>", "for model span output table, set min length span to <n>", \%opt_HH, \@opt_order_A);
opt_Add("--msminstart", "integer", undef, $g, undef, "--skipfribo2,--skipmstbl", "for model span output table, set min start position to <n>", "for model span output table, set min start position to <n>", \%opt_HH, \@opt_order_A);
opt_Add("--msmaxstart", "integer", undef, $g, undef, "--skipfribo2,--skipmstbl", "for model span output table, set max start position to <n>", "for model span output table, set max start position to <n>", \%opt_HH, \@opt_order_A);
opt_Add("--msminstop", "integer", undef, $g, undef, "--skipfribo2,--skipmstbl", "for model span output table, set min stop position to <n>", "for model span output table, set min stop position to <n>", \%opt_HH, \@opt_order_A);
opt_Add("--msmaxstop", "integer", undef, $g, undef, "--skipfribo2,--skipmstbl", "for model span output table, set max stop position to <n>", "for model span output table, set max stop position to <n>", \%opt_HH, \@opt_order_A);
opt_Add("--mslist", "string", undef, $g, undef, "--skipfribo2,--skipmstbl", "re-sort model span table to prioritize taxids in file <s>", "re-sort model span table to prioritize taxids (orders) in file <s>", \%opt_HH, \@opt_order_A);
opt_Add("--msclass", "boolean", 0, $g, "--mslist", "--skipfribo2,--skipmstbl", "w/--mslist, taxids in --mslist file are classes not orders", "w/--mslist, taxids in --mslist file are classes not orders", \%opt_HH, \@opt_order_A);
opt_Add("--msphylum", "boolean", 0, $g, "--mslist", "--skipfribo2,--skipmstbl,--msclass", "w/--mslist, taxids in --mslist file are phyla not orders", "w/--mslist, taxids in --mslist file are phyla not orders", \%opt_HH, \@opt_order_A);
$opt_group_desc_H{++$g} = "options for changing sequence descriptions (deflines)";
# option type default group requires incompat preamble-output help-output
opt_Add("--def", "boolean", 0, $g, undef, "--skipftaxid,--skipfribo2,--prvcmd", "standardize sequence descriptions/deflines", "standardize sequence descriptions/deflines", \%opt_HH, \@opt_order_A);
$opt_group_desc_H{++$g} = "options for controlling maximum number of sequences to calculate percent identities for";
# option type default group requires incompat preamble-output help-output
opt_Add("--pidmax", "integer", 40000, $g, undef, "--prvcmd,--pidforce", "set max number of seqs to compute percent identities for to <n>", "set maximum number of seqs to compute percent identities for to <n>", \%opt_HH, \@opt_order_A);
opt_Add("--pidforce", "boolean", 0, $g, undef, "--prvcmd", "force calculation of percent identities for any number of sequences", "force calculation of percent identities for any number of sequences", \%opt_HH, \@opt_order_A);
$opt_group_desc_H{++$g} = "options for parallelizing ribotyper/riboaligner's calls to cmsearch and cmalign on a compute farm";
# option type default group requires incompat preamble-output help-output
opt_Add("-p", "boolean", 0, $g, undef, undef, "parallelize ribotyper and riboaligner on a compute farm", "parallelize ribotyper and riboaligner on a compute farm", \%opt_HH, \@opt_order_A);
opt_Add("-q", "string", undef, $g, "-p", undef, "use qsub info file <s> instead of default", "use qsub info file <s> instead of default", \%opt_HH, \@opt_order_A);
opt_Add("-s", "integer", 181, $g, "-p", undef, "seed for random number generator is <n>", "seed for random number generator is <n>", \%opt_HH, \@opt_order_A);
opt_Add("--nkb", "integer", 100, $g, "-p", undef, "number of KB of seq for each farm job is <n>", "number of KB of sequence for each farm job is <n>", \%opt_HH, \@opt_order_A);
opt_Add("--wait", "integer", 1440, $g, "-p", undef, "allow <n> minutes for jobs on farm", "allow <n> wall-clock minutes for jobs on farm to finish, including queueing time", \%opt_HH, \@opt_order_A);
opt_Add("--errcheck", "boolean", 0, $g, "-p", undef, "consider any farm stderr output as indicating a job failure", "consider any farm stderr output as indicating a job failure", \%opt_HH, \@opt_order_A);
$opt_group_desc_H{++$g} = "advanced options for debugging and testing";
# option type default group requires incompat preamble-output help-output
opt_Add("--prvcmd", "boolean", 0, $g, undef, "-f,-p", "do not execute commands; use output from previous run", "do not execute commands; use output from previous run", \%opt_HH, \@opt_order_A);
opt_Add("--pcreclustr", "boolean", 0, $g,"--prvcmd", "-f,-p,--skipclustr", "w/--prvcmd, recluster seqs and/or rechoose representatives", "w/--prvcmd, recluster seqs (--cfid) and/or rechoose representatives (--cdthresh or --cmaxlen)", \%opt_HH, \@opt_order_A);
# This section needs to be kept in sync (manually) with the opt_Add() section above
my %GetOptions_H = ();
my $usage = "Usage: ribodbmaker [-options] <input fasta sequence file> <output directory>\n";
my $synopsis = "ribodbmaker :: create representative database of ribosomal RNA sequences";
my $options_okay =
&GetOptions('h' => \$GetOptions_H{"-h"},
'f' => \$GetOptions_H{"-f"},
'n=s' => \$GetOptions_H{"-n"},
'v' => \$GetOptions_H{"-v"},
'fasta=s' => \$GetOptions_H{"--fasta"},
'keep' => \$GetOptions_H{"--keep"},
'special=s' => \$GetOptions_H{"--special"},
'taxin=s' => \$GetOptions_H{"--taxin"},
'skipftaxid' => \$GetOptions_H{"--skipftaxid"},
'skipfambig' => \$GetOptions_H{"--skipfambig"},
'skipfvecsc' => \$GetOptions_H{"--skipfvecsc"},
'skipfblast' => \$GetOptions_H{"--skipfblast"},
'skipfribo1' => \$GetOptions_H{"--skipfribo1"},
'skipfribo2' => \$GetOptions_H{"--skipfribo2"},
'skipfmspan' => \$GetOptions_H{"--skipfmspan"},
'skipingrup' => \$GetOptions_H{"--skipingrup"},
'skipclustr' => \$GetOptions_H{"--skipclustr"},
'skiplistms' => \$GetOptions_H{"--skiplistms"},
'skipmstbl' => \$GetOptions_H{"--skipmstbl"},
'exclist=s' => \$GetOptions_H{"--exclist"},
'famaxn=s' => \$GetOptions_H{"--famaxn"},
'famaxf=s' => \$GetOptions_H{"--famaxf"},
'faonlyn' => \$GetOptions_H{"--faonlyn"},
'faonlyf' => \$GetOptions_H{"--faonlyf"},
'ftstrict' => \$GetOptions_H{"--ftstrict"},
'fbcsize=s' => \$GetOptions_H{"--fbcsize"},
'fbcall' => \$GetOptions_H{"--fbcall"},
'fbword=s' => \$GetOptions_H{"--fbword"},
'fbevalue=s' => \$GetOptions_H{"--fbevalue"},
'fbdbsize=s' => \$GetOptions_H{"--fbdbsize"},
'fbnominus' => \$GetOptions_H{"--fbnominus"},
'fbmdiagok' => \$GetOptions_H{"--fbmdiagok"},
'fbminuslen=s' => \$GetOptions_H{"--fbminuslen"},
'fbminuspid=s' => \$GetOptions_H{"--fbminuspid"},
'model=s' => \$GetOptions_H{"--model"},
'nomultfail' => \$GetOptions_H{"--nomultfail"},
'noscfail' => \$GetOptions_H{"--noscfail"},
'nocovfail' => \$GetOptions_H{"--nocovfail"},
'nodifffail' => \$GetOptions_H{"--nodifffail"},
'lowppossc=s' => \$GetOptions_H{"--lowppossc"},
'tcov=s' => \$GetOptions_H{"--tcov"},
'riboopts1=s' => \$GetOptions_H{"--riboopts1"},
'ribodir1=s' => \$GetOptions_H{"--ribodir1"},
'rainfo=s' => \$GetOptions_H{"--rainfo"},
'ribo2hmm' => \$GetOptions_H{"--ribo2hmm"},
'riboopts2=s' => \$GetOptions_H{"--riboopts2"},
'ribodir2=s' => \$GetOptions_H{"--ribodir2"},
'max5pins=s' => \$GetOptions_H{"--max5pins"},
'max3pins=s' => \$GetOptions_H{"--max3pins"},
'passlenclass=s' =>\$GetOptions_H{"--passlenclass"},
'faillenclass=s' =>\$GetOptions_H{"--faillenclass"},
'cfid=s' => \$GetOptions_H{"--cfid"},
'cdthresh=s' => \$GetOptions_H{"--cdthresh"},
'cmaxlen' => \$GetOptions_H{"--cmaxlen"},
'ccentroid' => \$GetOptions_H{"--ccentroid"},
"fullaln" => \$GetOptions_H{"--fullaln"},
"noprob" => \$GetOptions_H{"--noprob"},
"pthresh=s" => \$GetOptions_H{"--pthresh"},
"pfract=s" => \$GetOptions_H{"--pfract"},
'fmpos=s' => \$GetOptions_H{"--fmpos"},
'fmlpos=s' => \$GetOptions_H{"--fmlpos"},
'fmrpos=s' => \$GetOptions_H{"--fmrpos"},
'fmnogap' => \$GetOptions_H{"--fmnogap"},
'fione' => \$GetOptions_H{"--fione"},
'fimin' => \$GetOptions_H{"--fimin"},
'figroup' => \$GetOptions_H{"--figroup"},
'fithresh=s' => \$GetOptions_H{"--fithresh"},
'indiffseqtax' => \$GetOptions_H{"--indiffseqtax"},
'inminavgid' => \$GetOptions_H{"--inminavgid"},
'innominavgid' => \$GetOptions_H{"--innominavgid"},
'msstep=s' => \$GetOptions_H{"--msstep"},
'msminlen=s' => \$GetOptions_H{"--msminlen"},
'msminstart=s' => \$GetOptions_H{"--msminstart"},
'msmaxstart=s' => \$GetOptions_H{"--msmaxstart"},
'msminstop=s' => \$GetOptions_H{"--msminstop"},
'msmaxstop=s' => \$GetOptions_H{"--msmaxstop"},
'mslist=s' => \$GetOptions_H{"--mslist"},
'msclass' => \$GetOptions_H{"--msclass"},
'msphylum' => \$GetOptions_H{"--msphylum"},
'def' => \$GetOptions_H{"--def"},
'pidmax=s' => \$GetOptions_H{"--pidmax"},
'pidforce' => \$GetOptions_H{"--pidforce"},
# options for parallelization
'p' => \$GetOptions_H{"-p"},
'q=s' => \$GetOptions_H{"-q"},
's=s' => \$GetOptions_H{"-s"},
'nkb=s' => \$GetOptions_H{"--nkb"},
'wait=s' => \$GetOptions_H{"--wait"},
'errcheck' => \$GetOptions_H{"--errcheck"},
'prvcmd' => \$GetOptions_H{"--prvcmd"},
'pcreclustr' => \$GetOptions_H{"--pcreclustr"});
my $total_seconds = -1 * ofile_SecondsSinceEpoch(); # by multiplying by -1, we can just add another ofile_SecondsSinceEpoch call at end to get total time
my $executable = $0;
my $date = scalar localtime();
my $version = "1.0.5";
my $releasedate = "Sep 2023";
my $package_name = "Ribovore";
# make *STDOUT file handle 'hot' so it automatically flushes whenever we print to it
select *STDOUT;
$| = 1;
# print help and exit if necessary
if((! $options_okay) || ($GetOptions_H{"-h"})) {
ofile_OutputBanner(*STDOUT, $package_name, $version, $releasedate, $synopsis, $date, undef);
opt_OutputHelp(*STDOUT, $usage, \%opt_HH, \@opt_order_A, \%opt_group_desc_H);
if(! $options_okay) { die "ERROR, unrecognized option;"; }
else { exit 0; } # -h, exit with 0 status
}
# check that number of command line args is correct
if(scalar(@ARGV) != 2) {
print "Incorrect number of command line arguments.\n";
print $usage;
print "\nTo see more help on available options, enter 'ribodbmaker -h'\n\n";
exit(1);
}
my ($in_fasta_file, $dir) = (@ARGV);
# set options in opt_HH
opt_SetFromUserHash(\%GetOptions_H, \%opt_HH);
# validate options (check for conflicts)
opt_ValidateSet(\%opt_HH, \@opt_order_A);
# define taxonomic level we will work at for output file that lists levels lost, including for the ingroup analysis, if we do that step, default is order
my @level_A = ("phylum", "class", "order");
my $level;
my %full_level_ct_HH = (); # key 1 is $level, key 2 is $level taxid, value is number of sequences in full set (input) for that taxid
my %surv_filters_level_ct_HH = (); # key 1 is $level, key 2 is $level taxid, value is number of sequences that survive all filters for that taxid
my %surv_ingrup_level_ct_HH = (); # key 1 is $level, key 2 is $level taxid, value is number of sequences that survive ingrup analysis for that taxid
my %surv_clustr_level_ct_HH = (); # key 1 is $level, key 2 is $level taxid, value is number of sequences that survive clustering for that taxid
my %all_gtaxid_HA = (); # 1D key is taxonomic level, array is all tax ids in that level
foreach $level (@level_A) {
%{$full_level_ct_HH{$level}} = ();
%{$surv_filters_level_ct_HH{$level}} = ();
%{$surv_ingrup_level_ct_HH{$level}} = ();
%{$surv_clustr_level_ct_HH{$level}} = ();
@{$all_gtaxid_HA{$level}} = ();
}
# determine what stages we are going to do:
my $do_ftaxid = opt_Get("--skipftaxid", \%opt_HH) ? 0 : 1;
my $do_fambig = opt_Get("--skipfambig", \%opt_HH) ? 0 : 1;
my $do_fvecsc = opt_Get("--skipfvecsc", \%opt_HH) ? 0 : 1;
my $do_fblast = opt_Get("--skipfblast", \%opt_HH) ? 0 : 1;
my $do_fribo1 = opt_Get("--skipfribo1", \%opt_HH) ? 0 : 1;
my $do_fribo2 = opt_Get("--skipfribo2", \%opt_HH) ? 0 : 1;
my $do_fmspan = opt_Get("--skipfmspan", \%opt_HH) ? 0 : 1;
my $do_ingrup = opt_Get("--skipingrup", \%opt_HH) ? 0 : 1;
my $do_clustr = opt_Get("--skipclustr", \%opt_HH) ? 0 : 1;
my $do_listms = opt_Get("--skiplistms", \%opt_HH) ? 0 : 1;
my $do_mstbl = opt_Get("--skipmstbl", \%opt_HH) ? 0 : 1;
my $do_prvcmd = opt_Get("--prvcmd", \%opt_HH) ? 1 : 0;
my $do_pcreclustr = opt_Get("--prvcmd", \%opt_HH) ? 1 : 0;
my $do_keep = opt_Get("--keep", \%opt_HH) ? 1 : 0;
my $do_special = opt_IsUsed("--special", \%opt_HH) ? 1 : 0;
my $do_def = opt_Get("--def", \%opt_HH) ? 1 : 0;
my $do_exclist = opt_IsUsed("--exclist", \%opt_HH) ? 1 : 0;
# and related options
my $do_fmspan_nogap = opt_Get("--fmnogap", \%opt_HH) ? 1 : 0;
my $did_ingrup = 0; # set to true if we did ingrup analysis stage, and filled %surv_ingrup_level_ct_HH
my $did_exc = 0; # set to true if we excluded seqs listed in --exclist file
my $did_clustr = 0; # set to true if we did cluster stage, and filled %surv_clustr_level_ct_HH
# do checks that are too sophisticated for sqp_opts.pm
# if we are skipping both ribotyper stages, make sure none of the ribotyper options related to both were used
if((! $do_fribo1) && (! $do_fribo2)) {
if(opt_IsUsed("--model", \%opt_HH)) { die "ERROR, --model does not make sense in combination with --skipfribo1 and --skipfribo2"; }
if(opt_IsUsed("--noscfail", \%opt_HH)) { die "ERROR, --noscfail does not make sense in combination with --skipfribo1 and --skipfribo2"; }
if(opt_IsUsed("--lowppossc", \%opt_HH)) { die "ERROR, --lowppossc does not make sense in combination with --skipfribo1 and --skipfribo2"; }
if(opt_IsUsed("-p", \%opt_HH)) { die "ERROR, -p does not make sense in combination with --skipfribo1 and --skipfribo2"; }
if(opt_IsUsed("-q", \%opt_HH)) { die "ERROR, -q does not make sense in combination with --skipfribo1 and --skipfribo2"; }
if(opt_IsUsed("-s", \%opt_HH)) { die "ERROR, -s does not make sense in combination with --skipfribo1 and --skipfribo2"; }
if(opt_IsUsed("--nkb", \%opt_HH)) { die "ERROR, --nkb does not make sense in combination with --skipfribo1 and --skipfribo2"; }
if(opt_IsUsed("--wait", \%opt_HH)) { die "ERROR, --wait does not make sense in combination with --skipfribo1 and --skipfribo2"; }
if(opt_IsUsed("--errcheck", \%opt_HH)) { die "ERROR, --errcheck does not make sense in combination with --skipfribo1 and --skipfribo2"; }
}
if(opt_IsUsed("--cfid", \%opt_HH) &&
((opt_Get("--cfid", \%opt_HH) < 0.) || (opt_Get("--cfid", \%opt_HH) > 1.))) {
die "ERROR, with --cfid <x>, <x> must be >= 0. and <= 1";
}
if(opt_IsUsed("--cdthresh", \%opt_HH)) {
if((1. - opt_Get("--cfid", \%opt_HH)) < (opt_Get("--cdthresh", \%opt_HH))) {
die sprintf("ERROR, with --cdthresh <x1>, <x1> must be < %f (which is 1.0 - clustering fractional identity (from --cfid))", 1.0 - opt_Get("--cfid", \%opt_HH));
}
}
if(opt_IsUsed("--pcreclustr", \%opt_HH)) {
# at least one of --cfid, --cdthresh, or --cmaxlen must also be used
if((! opt_IsUsed("--cfid", \%opt_HH)) &&
(! opt_IsUsed("--cdthresh", \%opt_HH)) &&
(! opt_IsUsed("--cmaxlen", \%opt_HH))) {
die "ERROR --prcreclustr only works in combination with at least one of: --cfid, --cdthresh, --cmaxlen";
}
}
# we don't allow user to skip ALL filter stages, they need to do at least one.
# You might think it should be okay to skip all filter stages if they want to
# do ingroup analysis or just clustering but both of those require the riboaligner
# step because they require an alignment
if((! $do_ftaxid) && (! $do_fambig) && (! $do_fvecsc) && (! $do_fblast) && (! $do_fribo1) && (! $do_fribo2) && (! $do_fmspan)) {
die "ERROR, at least one of the following filter stages *must* not be skipped: ftaxid, fambig, fvecsc, fblast, fribo2";
}
my $in_special_file = opt_Get("--special", \%opt_HH); # this will be undefined unless --special used on the command line
# verify required files exist
if(defined $in_fasta_file) {
utl_FileValidateExistsAndNonEmpty($in_fasta_file, "<input fasta sequence file> command line argument", undef, 1, undef);
}
if(defined $in_special_file) {
utl_FileValidateExistsAndNonEmpty($in_special_file, "--special argument", undef, 1, undef);
}
# a more sophisticated check for --exclist arg, go ahead and parse it now
my %exc_taxid_H = ();
if(opt_IsUsed("--exclist", \%opt_HH)) {
# this subroutine will fail if anything is wrong with the --exclist file
parse_taxid_list_file("--exclist", \%exc_taxid_H, \%opt_HH);
}
# a more sophisticated check for --mslist arg, make sure we can parse it
if(opt_IsUsed("--mslist", \%opt_HH)) {
my %tmp_H;
# this subroutine will fail if anything is wrong with the --mslist file
# we don't need %tmp_H
parse_taxid_list_file("--mslist", \%tmp_H, \%opt_HH);
}
# options that affect alignment prior to pid calcs don't work in combination with *both* --skipingrup and --skipclustr
if((opt_IsUsed("--skipingrup", \%opt_HH)) && (opt_IsUsed("--skipclustr", \%opt_HH))) {
if(opt_IsUsed("--fullaln", \%opt_HH)) {
die "ERROR, --fullaln doesn't make sense in combination with both --skipingrup and --skipclustr";
}
if(opt_IsUsed("--noprob", \%opt_HH)) {
die "ERROR, --noprob doesn't make sense in combination with both --skipingrup and --skipclustr";
}
if(opt_IsUsed("--pthresh", \%opt_HH)) {
die "ERROR, --pthresh doesn't make sense in combination with both --skipingrup and --skipclustr";
}
if(opt_IsUsed("--pfract", \%opt_HH)) {
die "ERROR, --pfract doesn't make sense in combination with both --skipingrup and --skipclustr";
}
}
# --skipfmspan requires --fullaln UNLESS both --skipingrup and --skipclustr also used
if(opt_IsUsed("--skipfmspan", \%opt_HH)) {
if((! opt_IsUsed("--fullaln", \%opt_HH)) &&
((! opt_IsUsed("--skipingrup", \%opt_HH)) ||
(! opt_IsUsed("--skipclustr", \%opt_HH)))) {
die "ERROR, --fullaln is required if --skipfmspan is used unless --skipingrup and --skipclustr are also used";
}
}
# enforce <n> >= 1 for all of --msminstart, --msmaxstart, --msminstop, --msmaxstop
if((opt_IsUsed("--msminstart", \%opt_HH)) && (opt_Get("--msminstart", \%opt_HH) < 1)) {
die "ERROR, with --msminstart <n1>, <n1> must be >= 1";
}
if((opt_IsUsed("--msmaxstart", \%opt_HH)) && (opt_Get("--msmaxstart", \%opt_HH) < 1)) {
die "ERROR, with --msmaxstart <n1>, <n1> must be >= 1";
}
if((opt_IsUsed("--msminstop", \%opt_HH)) && (opt_Get("--msminstop", \%opt_HH) < 1)) {
die "ERROR, with --msminstop <n1>, <n1> must be >= 1";
}
if((opt_IsUsed("--msmaxstop", \%opt_HH)) && (opt_Get("--msmaxstop", \%opt_HH) < 1)) {
die "ERROR, with --msmaxstop <n1>, <n1> must be >= 1";
}
# with --msminstart <n1> and --msmaxstart <n2>, enforce <n1> <= <n2>
if((opt_IsUsed("--msminstart", \%opt_HH)) && (opt_IsUsed("--msmaxstart", \%opt_HH))) {
if((opt_Get("--msminstart", \%opt_HH)) > (opt_Get("--msmaxstart", \%opt_HH))) {
die "ERROR, with --msminstart <n1> and --msmaxstart <n2>, <n2> must be >= <n1>";
}
}
# with --msminstop <n1> and --msmaxstop <n2>, enforce <n1> <= <n2>
if((opt_IsUsed("--msminstop", \%opt_HH)) && (opt_IsUsed("--msmaxstop", \%opt_HH))) {
if((opt_Get("--msminstop", \%opt_HH)) > (opt_Get("--msmaxstop", \%opt_HH))) {
die "ERROR, with --msminstop <n1> and --msmaxstop <n2>, <n2> must be >= <n1>";
}
}
# with --msminstart <n1> and --msmaxstop <n2>, enforce <n1> <= <n2>
if((opt_IsUsed("--msminstart", \%opt_HH)) && (opt_IsUsed("--msmaxstop", \%opt_HH))) {
if((opt_Get("--msminstart", \%opt_HH)) > (opt_Get("--msmaxstop", \%opt_HH))) {
die "ERROR, with --msminstart <n1> and --msmaxstop <n2>, <n2> must be >= <n1>";
}
}
# we do more sophisticated tests for these --ms{min,max}{start,stop} options later
# after we know $family_modellen
# now that we know what steps we are doing, make sure that:
# - required ENV variables are set and point to valid dirs
# - required executables exist and are executable
# - required files exist
# we do this for each stage individually
my $in_riboopts1_file = undef;
my $in_riboopts2_file = undef;
my $df_ra_modelinfo_file = $df_model_dir . "riboaligner.rfam.modelinfo";
my $ra_modelinfo_file = undef;
my %execs_H = (); # key is name of program, value is path to the executable
my $taxonomy_tree_six_column_file = undef;
# we always require easel miniapps
$execs_H{"esl-sfetch"} = $env_riboeasel_dir . "/esl-sfetch";
$execs_H{"esl-seqstat"} = $env_riboeasel_dir . "/esl-seqstat";
$execs_H{"esl-alimanip"} = $env_riboeasel_dir . "/esl-alimanip";
$execs_H{"esl-alimerge"} = $env_riboeasel_dir . "/esl-alimerge";
$execs_H{"esl-alimask"} = $env_riboeasel_dir . "/esl-alimask";
$execs_H{"esl-alipid"} = $env_riboeasel_dir . "/esl-alipid";
$execs_H{"esl-alistat"} = $env_riboeasel_dir . "/esl-alistat";
$execs_H{"esl-cluster"} = $env_riboeasel_dir . "/esl-cluster";
$execs_H{"ali-apos-to-uapos.pl"} = $env_riboscripts_dir . "/miniscripts/ali-apos-to-uapos.pl";
if($do_ftaxid || $do_ingrup || $do_fvecsc || $do_special || $do_def) {
$env_vecplus_dir = utl_DirEnvVarValid("VECPLUSDIR");
if($do_fvecsc) {
$execs_H{"vecscreen"} = $env_vecplus_dir . "/scripts/vecscreen";
$execs_H{"parse_vecscreen.pl"} = $env_vecplus_dir . "/scripts/parse_vecscreen.pl";
$execs_H{"combine_summaries.pl"} = $env_vecplus_dir . "/scripts/combine_summaries.pl";
}
}
if($do_ftaxid || $do_ingrup || $do_special || $do_def) {
$execs_H{"srcchk"} = $env_vecplus_dir . "/scripts/srcchk";
# make sure the tax file exists
my $df_tax_file = $df_tax_dir . "ncbi-taxonomy-tree.ribodbmaker.txt";
if(! opt_IsUsed("--taxin", \%opt_HH)) { $taxonomy_tree_six_column_file = $df_tax_file; }
else { $taxonomy_tree_six_column_file = opt_Get("--taxin", \%opt_HH); }
utl_FileValidateExistsAndNonEmpty($taxonomy_tree_six_column_file, "taxonomy tree file with taxonomic levels and specified species", undef, 1, undef); # 1 says: die if it doesn't exist or is empty
$execs_H{"find_taxonomy_ancestors.pl"} = $env_vecplus_dir . "/scripts/find_taxonomy_ancestors.pl";
$execs_H{"alipid-taxinfo-analyze.pl"} = $env_riboscripts_dir . "/miniscripts/alipid-taxinfo-analyze.pl";
}
if($do_fblast) {
$env_riboblast_dir = utl_DirEnvVarValid("RIBOBLASTDIR");
$execs_H{"blastn"} = $env_riboblast_dir . "/blastn";
}
if($do_fribo1) {
# make sure model exists
if(! opt_IsUsed("--model", \%opt_HH)) {
die "ERROR, --model is a required option, unless --skipfribo1 and --skipfribo2 are both used";
}
# make sure the riboopts1 file exists if --riboopts1 used
if(opt_IsUsed("--riboopts1", \%opt_HH)) {
$in_riboopts1_file = opt_Get("--riboopts1", \%opt_HH);
utl_FileValidateExistsAndNonEmpty($in_riboopts1_file, "riboopts file specified with --riboopts1", undef, 1, undef); # last argument as 1 says: die if it doesn't exist or is empty
}
}
if($do_fribo1 || $do_fribo2) {
# make sure model exists
if(! opt_IsUsed("--model", \%opt_HH)) {
die "ERROR, --model is a required option, unless --skipfribo1 and --skipfribo2 are both used";
}
# make sure the riboopts2 file exists if --riboopts2 used
if(opt_IsUsed("--riboopts2", \%opt_HH)) {
$in_riboopts2_file = opt_Get("--riboopts2", \%opt_HH);
utl_FileValidateExistsAndNonEmpty($in_riboopts2_file, "riboopts file specified with --riboopts2", undef, 1, undef); # last argument as 1 says: die if it doesn't exist or is empty
}
# make sure the riboaligner modelinfo files exists
if(! opt_IsUsed("--rainfo", \%opt_HH)) {
$ra_modelinfo_file = $df_ra_modelinfo_file;
utl_FileValidateExistsAndNonEmpty($ra_modelinfo_file, "default riboaligner model info file", undef, 1, undef); # 1 says: die if it doesn't exist or is empty
}
else { # --rainfo used
$ra_modelinfo_file = opt_Get("--rainfo", \%opt_HH); }
if(! opt_IsUsed("--rainfo", \%opt_HH)) {
utl_FileValidateExistsAndNonEmpty($ra_modelinfo_file, "riboaligner model info file specified with --rainfo", undef, 1, undef); # 1 says: die if it doesn't exist or is empty
}
$execs_H{"ribotyper"} = $env_riboscripts_dir . "/ribotyper";
$execs_H{"riboaligner"} = $env_riboscripts_dir . "/riboaligner";
}
if(opt_IsUsed("--mslist", \%opt_HH)) {
$execs_H{"mdlspan-survtbl-sort.pl"} = $env_riboscripts_dir . "/miniscripts/mdlspan-survtbl-sort.pl";
}
# deal with 'time' differently because we can deal if it doesn't exist and
# we only use it if it does exist and -p is used
my $env_ribotime_dir = ribo_CheckForTimeExecutable(); # this will return undef if time doesn't exist or is undef
if(defined $env_ribotime_dir) {
$execs_H{"time"} = $env_ribotime_dir . "/time";
}
utl_ExecHValidate(\%execs_H, undef);
#############################
# create the output directory
#############################
my $cmd; # a command to run with runCommand()
my @early_cmd_A = (); # array of commands we run before our log file is opened
if($dir !~ m/\/$/) { $dir =~ s/\/$//; } # remove final '/' if it exists
if(-d $dir) {
$cmd = "rm -rf $dir";
if(opt_Get("-f", \%opt_HH)) {
utl_RunCommand($cmd, opt_Get("-v", \%opt_HH), 0, undef); push(@early_cmd_A, $cmd);
}
else { # $dir directory exists but -f not used
if(! $do_prvcmd) {
die "ERROR directory named $dir already exists. Remove it, or use -f to overwrite it.";
}
}
}
elsif(-e $dir) {
$cmd = "rm $dir";
if(opt_Get("-f", \%opt_HH)) {
utl_RunCommand($cmd, opt_Get("-v", \%opt_HH), 0, undef); push(@early_cmd_A, $cmd);
}
else { # $dir file exists but -f not used
die "ERROR a file named $dir already exists. Remove it, or use -f to overwrite it.";
}
}
# create the dir
$cmd = "mkdir $dir";
if(! $do_prvcmd) {
utl_RunCommand($cmd, opt_Get("-v", \%opt_HH), 0, undef);
push(@early_cmd_A, $cmd);
}
my $dir_tail = $dir;
$dir_tail =~ s/^.+\///; # remove all but last dir
my $out_root = $dir . "/" . $dir_tail . ".ribodbmaker";
# if -p used and "time" exists (we checked above), then leave execs_H{"time"} alone, else undefine it so we won't use it
my $use_time_program = ((opt_Get("-p", \%opt_HH)) && (defined $execs_H{"time"})) ? 1 : 0;
if(! $use_time_program) { $execs_H{"time"} = undef; }
# checkpoint related variables:
# 'filters' (filters) checkpoint, after fribo2 stage
my $npass_filters = 0; # number of seqs that pass all filter stages
my $nfail_filters = 0; # number of seqs that pass all filter stages
# 'ingrup' (ingroup) checkpoint, after ingrup stage
my $npass_ingrup = 0; # number of seqs that pass ingroup stage
my $nfail_ingrup = 0; # number of seqs that pass ingroup stage
# 'exc' (exclusion) checkpoint, after exclusion stage (if --exclist)
my $npass_exc = 0; # number of seqs that pass exclusion stage
my $nfail_exc = 0; # number of seqs that pass exclusion stage
# 'clustr' (cluster) checkpoint, after clustering stage
my $npass_clustr = 0; # number of seqs that pass clustering
my $nfail_clustr = 0; # number of seqs that pass clustering
#############################################
# output program banner and open output files
#############################################
# output preamble
my @arg_desc_A = ("input sequence file", "output directory name");
my @arg_A = ($in_fasta_file, $dir);
my %extra_H = ();
$extra_H{"\$RIBOSCRIPTSDIR"} = $env_riboscripts_dir;
$extra_H{"\$RIBOEASELDIR"} = $env_riboeasel_dir;
if(defined $env_vecplus_dir) { $extra_H{"\$VECPLUSDIR"} = $env_vecplus_dir; }
if(defined $env_riboblast_dir) { $extra_H{"\$RIBOBLASTDIR"} = $env_riboblast_dir; }
if($use_time_program) { $extra_H{"\$RIBOTIMEDIR"} = $env_ribotime_dir; }
ofile_OutputBanner(*STDOUT, $package_name, $version, $releasedate, $synopsis, $date, \%extra_H);
opt_OutputPreamble(*STDOUT, \@arg_desc_A, \@arg_A, \%opt_HH, \@opt_order_A);
# open the list, log and command files:
# set output file names and file handles, and open those file handles
my %ofile_info_HH = (); # hash of information on output files we created,
# 1D keys:
# "fullpath": full path to the file
# "nodirpath": file name, full path minus all directories
# "desc": short description of the file
# "FH": file handle to output to for this file, maybe undef
# 2D keys:
# "log": log file of what's output to stdout
# "cmd": command file with list of all commands executed
# open the log and command files
ofile_OpenAndAddFileToOutputInfo(\%ofile_info_HH, "log", $out_root . ".log", 1, 1, "Output printed to screen");
ofile_OpenAndAddFileToOutputInfo(\%ofile_info_HH, "cmd", $out_root . ".cmd", 1, 1, "List of executed commands");
ofile_OpenAndAddFileToOutputInfo(\%ofile_info_HH, "list", $out_root . ".list", 1, 1, "List and description of all output files");
my $log_FH = $ofile_info_HH{"FH"}{"log"};
my $cmd_FH = $ofile_info_HH{"FH"}{"cmd"};
# output files are all open, if we exit after this point, we'll need
# to close these first.
# now we have the log file open, output the banner there too
ofile_OutputBanner($log_FH, $package_name, $version, $releasedate, $synopsis, $date, \%extra_H);
opt_OutputPreamble($log_FH, \@arg_desc_A, \@arg_A, \%opt_HH, \@opt_order_A);
# output any commands we already executed to $log_FH
foreach $cmd (@early_cmd_A) {
print $cmd_FH $cmd . "\n";
}
###########################################################################
# Preliminary stage: Parse/validate input files
# We do this first, so we can die quickly if anything goes wrong
# as opposed to waiting until we get to the relevant stage.
###########################################################################
my $progress_w = 81; # the width of the left hand column in our progress output, hard-coded
my $start_secs;
$start_secs = ofile_OutputProgressPrior("[Stage: prelim] Validating input files", $progress_w, $log_FH, *STDOUT);
# parse the modelinfo file, this tells us where the CM files are
my @tmp_family_order_A = (); # family name, in order, temporary because we enforce that there is only 1 before continuing
my %tmp_family_modelfile_H = (); # key is family name (e.g. "SSU.Archaea") from @tmp_family_order_A, value is CM file for that family
my %tmp_family_modellen_H = (); # key is family name (e.g. "SSU.Archaea") from @tmp_family_order_A, value is consensus length for that family
my %tmp_family_rtname_HA = (); # key is family name (e.g. "SSU.Archaea") from @family_order_A, value is array of ribotyper models to align with this family
my $family = undef;
my $family_modelfile = undef;
my $family_modellen = undef;
my $family_fail_str = "";
my $ribotyper_options; # string of options for ribotyper 1 stage
my $local_ra_riboopts_file = $out_root . ".ra.riboopts"; # riboopts file for fribo2 step
my $local_ra_modelinfo_file = $out_root . ".ra.modelinfo"; # model info file for fribo2 step
if($do_fribo1 || $do_fribo2) {
# make sure that the model specified with --model exists
ribo_ParseRAModelinfoFile($ra_modelinfo_file, $df_model_dir, \@tmp_family_order_A, \%tmp_family_modelfile_H, \%tmp_family_modellen_H, \%tmp_family_rtname_HA, $ofile_info_HH{"FH"});
$family = opt_Get("--model", \%opt_HH);
if(! exists $tmp_family_modelfile_H{$family}) {
foreach my $tmp_family (@tmp_family_order_A) { $family_fail_str .= $tmp_family. "\n"; }
ofile_FAIL("ERROR, model $family specified with --model not listed in $ra_modelinfo_file.\nValid options are:\n$family_fail_str", $!, $ofile_info_HH{"FH"});
}
$family_modelfile = $tmp_family_modelfile_H{$family};
$family_modellen = $tmp_family_modellen_H{$family};
if(! -s $family_modelfile) {
ofile_FAIL("ERROR, model file $family_modelfile specified in $ra_modelinfo_file does not exist or is empty", $!, $ofile_info_HH{"FH"});
}
# create the riboaligner info file for $family
open(RAINFO, ">", $local_ra_modelinfo_file) || ofile_FileOpenFailure($local_ra_modelinfo_file, "ribodbmaker::main()", $!, "writing", $ofile_info_HH{"FH"});
my $local_family_modelfile = ofile_RemoveDirPath($family_modelfile);
printf RAINFO ("%s %s %s", $family, $local_family_modelfile, $family_modellen);
foreach my $rtname (@{$tmp_family_rtname_HA{$family}}) {
printf RAINFO (" %s", $rtname);
}
print RAINFO ("\n");
close(RAINFO);
# create the riboopts1 string for ribotyper stage 1, unless --riboopts1 <s> provided in which case we read that
if(opt_IsUsed("--riboopts1", \%opt_HH)) {
open(OPTS1, $in_riboopts1_file) || ofile_FileOpenFailure($in_riboopts1_file, "ribodbmaker::main()", $!, "writing", $ofile_info_HH{"FH"});
$ribotyper_options = <OPTS1>;
chomp $ribotyper_options;
close(OPTS1);
}
else {
$ribotyper_options = sprintf("--minusfail --lowppossc %s ", opt_Get("--lowppossc", \%opt_HH));
if(! opt_IsUsed("--noscfail", \%opt_HH)) { $ribotyper_options .= "--scfail "; }
# --2slow doesn't apply here
}
# create the riboopts2 file to supply to riboaligner, unless --riboopts2 <s> provided in which case use <s>
if(opt_IsUsed("--riboopts2", \%opt_HH)) {
my $cp_command = sprintf("cp %s $local_ra_riboopts_file", opt_Get("--riboopts2", \%opt_HH));
utl_RunCommand($cp_command, opt_Get("-v", \%opt_HH), 0, $ofile_info_HH{"FH"});
}
else {
open(RIBOOPTS2, ">", $local_ra_riboopts_file) || ofile_FileOpenFailure($local_ra_riboopts_file, "ribodbmaker::main()", $!, "writing", $ofile_info_HH{"FH"});
my $riboopts_str = sprintf("--lowppossc %s --tcov %s", opt_Get("--lowppossc", \%opt_HH), opt_Get("--tcov", \%opt_HH));
if(! opt_IsUsed("--nodifffail", \%opt_HH)) { $riboopts_str .= " --difffail"; }
if(! opt_IsUsed("--nomultfail", \%opt_HH)) { $riboopts_str .= " --multfail"; }
if(! opt_IsUsed("--ribo2hmm", \%opt_HH)) { $riboopts_str .= " --2slow"; }
printf RIBOOPTS2 ($riboopts_str . "\n");
close(RIBOOPTS2);
}
# now that we know $family_modellen, check that the --ms{min,max}{start,stop} options still make make sense
# note that we already checked that --msminstart <= --msmaxstart, --msminstop <= --msmaxstop, and --msminstart <= --msmaxstop above
# first, make sure that all values are now less than or equal to modellen
if((opt_IsUsed("--msminstart", \%opt_HH)) && (opt_Get("--msminstart", \%opt_HH) < 1)) {
ofile_FAIL("ERROR, with --msminstart <n1>, <n1> must be >= 1", $!, $ofile_info_HH{"FH"});
}
if((opt_IsUsed("--msmaxstart", \%opt_HH)) && (opt_Get("--msmaxstart", \%opt_HH) < 1)) {
ofile_FAIL("ERROR, with --msmaxstart <n1>, <n1> must be >= 1", $!, $ofile_info_HH{"FH"});
}
if((opt_IsUsed("--msminstop", \%opt_HH)) && (opt_Get("--msminstop", \%opt_HH) < 1)) {
ofile_FAIL("ERROR, with --msminstop <n1>, <n1> must be >= 1", $!, $ofile_info_HH{"FH"});
}
if((opt_IsUsed("--msmaxstop", \%opt_HH)) && (opt_Get("--msmaxstop", \%opt_HH) < 1)) {
ofile_FAIL("ERROR, with --msmaxstop <n1>, <n1> must be >= 1", $!, $ofile_info_HH{"FH"});
}
# second, make sure there's at least one bin we're going to have
# this only relies on --msminstart and --msmaxstop
if(opt_IsUsed("--msminstart", \%opt_HH) || opt_IsUsed("--msmaxstop", \%opt_HH)) {
my $tmp_minstart = opt_IsUsed("--msminstart", \%opt_HH) ? opt_Get("--msminstart", \%opt_HH) : 1;
my $tmp_maxstop = opt_IsUsed("--msmaxstop", \%opt_HH) ? opt_Get("--msmaxstop", \%opt_HH) : $family_modellen;
my $tmp_step = opt_Get("--msstep", \%opt_HH);
my $tmp_msminlen = opt_Get("--msminlen", \%opt_HH);
my $tmp_minlen = ($tmp_step > $tmp_msminlen) ? $tmp_step : $tmp_msminlen;
# don't think this check is actually necessary, but I'm leaving it
if($tmp_minstart > $tmp_maxstop) {
ofile_FAIL("ERROR, with --msminstart <n1> and --msmaxstop <n2>, <n2> must be >= <n1> (<n1> is $tmp_minstart, <n2> is $tmp_maxstop)", $!, $ofile_info_HH{"FH"});
}
# enforce <n1> <= <n2> and (<n2> - <n1> + 1) >= MAX($tmp_minlen, $tmp_step)
if(($tmp_maxstop - $tmp_minstart + 1) < $tmp_minlen) {
ofile_FAIL("ERROR, with --msminstart <n1> and --msmaxstop <n2>, <n2> - <n1> + 1 must be must be >= $tmp_minlen (<n1> is $tmp_minstart, <n2> is $tmp_maxstop)", $!, $ofile_info_HH{"FH"});
}
}
}
my %taxid_is_special_H = (); # key is taxid (species level), value is '1' if listed in $in_special_file, does not exist otherwise
my $line;
if($do_special) {
open(IN, $in_special_file) || ofile_FileOpenFailure($in_special_file, "ribodbmaker::main()", $!, "reading", $ofile_info_HH{"FH"});
while($line = <IN>) {
chomp $line;
$line =~ s/^\s+//;
$line =~ s/\s+$//;
if($line !~ m/^\d+$/) {
ofile_FAIL("ERROR reading $in_special_file, expected only single integers on each line, got $line", $!, $ofile_info_HH{"FH"});
}
$taxid_is_special_H{$line} = 1;
}
close(IN);
}
ofile_OutputProgressComplete($start_secs, undef, $log_FH, *STDOUT);
###########################################################################################
# Preliminary stage: Copy the fasta file (if --fasta)
###########################################################################################
my $raw_fasta_file = $out_root . ".raw.fa";
my $full_fasta_file = $out_root . ".full.fa";
$start_secs = ofile_OutputProgressPrior("[Stage: prelim] Copying input fasta file ", $progress_w, $log_FH, *STDOUT);
my $cp_command .= "cp $in_fasta_file $raw_fasta_file";
if(! $do_prvcmd) { utl_RunCommand($cp_command, opt_Get("-v", \%opt_HH), 0, $ofile_info_HH{"FH"}); }
ofile_OutputProgressComplete($start_secs, undef, $log_FH, *STDOUT);
# reformat the names of the sequences, if necessary
# gi|675602128|gb|KJ925573.1| becomes KJ925573.1
$start_secs = ofile_OutputProgressPrior("[Stage: prelim] Reformatting names of sequences ", $progress_w, $log_FH, *STDOUT);
if(! $do_prvcmd) { reformat_sequence_names_in_fasta_file($raw_fasta_file, $full_fasta_file, $ofile_info_HH{"FH"}); }
ofile_AddClosedFileToOutputInfo(\%ofile_info_HH, "fullfa", "$full_fasta_file", 0, 1, "fasta file with names possibly updated to accession.version");
ofile_OutputProgressComplete($start_secs, undef, $log_FH, *STDOUT);
# get lengths of all seqs and create a list of all sequences
my $seqstat_file = $out_root . ".full.seqstat";
my $comptbl_file = $out_root . ".full.comptbl";
my %seqidx_H = (); # key: sequence name, value: index of sequence in original input sequence file (1..$nseq)
my %seqlen_H = (); # key: sequence name, value: length of sequence,
# value multiplied by -1 after we output info for this sequence
# in round 1. Multiplied by -1 again after we output info
# for this sequence in round 2. We do this so that we know
# that 'we output this sequence already', so if we
# see it again before the next round, then we know the
# tbl file was not sorted properly. That shouldn't happen,
# but if somehow it does then we want to know about it.
my %seqnambig_H = (); # number of ambiguous nucleotides per sequence
my %seqtaxid_H = (); # taxid, from srcchk of each sequence, remains empty if srcchk does not need to be run
my %seqorgn_H = (); # organism field, from srcchk of each sequence, filled only if defline redefinition option (--def) is used
my %seqstrain_H = (); # strain field, from srcchk of each sequence, filled only if defline redefinition option (--def) is used
my %seqgtaxid_HH = (); # %seqtaxid_HH{$level}: group taxid at taxonomic level $level for each sequence, remains empty if srcchk does not need to be run
my %seqlpos_H = (); # key: sequence name, value is unaligned position that aligns to left model position we care about
my %seqrpos_H = (); # key: sequence name, value is unaligned position that aligns to right model position we care about
my %seqlenclass_H= (); # key: sequence name, value is length class from riboaligner
my @seqorder_A = (); # array of sequence names in order they appeared in the file
my %seqmdllen_H = (); # length of the model that aligns to the sequence
my %is_representative_H = (); # key is sequence name, value is 1 if sequence is a representative, 0 if it is not, key does not exist if sequence did not survive to clustering
my %not_representative_H = (); # key is sequence name, value is 1 if sequence is NOT a representative, "" if it is, key does not exist if sequence did not survive to clustering
my %in_cluster_H = (); # key is sequence name, value is cluster index this sequence belongs to
my %cluster_size_H = (); # key is a cluster index (value from %in_cluster_H), value is number of sequences in that cluster
my %width_H = (); # hash with max widths of "target", "length", "index"
my $nseq = 0;
my $full_list_file = $out_root . ".full.seqlist";
my $have_taxids = 0;
$start_secs = ofile_OutputProgressPrior("[Stage: prelim] Determining target sequence lengths", $progress_w, $log_FH, *STDOUT);
ribo_ProcessSequenceFile($execs_H{"esl-seqstat"}, $full_fasta_file, $seqstat_file, \@seqorder_A, \%seqidx_H, \%seqlen_H, \%width_H, \%opt_HH, \%ofile_info_HH);
$nseq = scalar(keys %seqidx_H);
ribo_CountAmbiguousNucleotidesInSequenceFile($execs_H{"esl-seqstat"}, $full_fasta_file, $comptbl_file, \%seqnambig_H, \%opt_HH, $ofile_info_HH{"FH"});
if(! $do_prvcmd) { utl_RunCommand("grep ^\= $seqstat_file | awk '{ print \$2 }' > $full_list_file", opt_Get("-v", \%opt_HH), 0, $ofile_info_HH{"FH"}); }
ofile_AddClosedFileToOutputInfo(\%ofile_info_HH, "fulllist", "$full_list_file", 0, 1, "file with list of all $nseq input sequences");
ofile_OutputProgressComplete($start_secs, undef, $log_FH, *STDOUT);
# index the sequence file
if(! $do_prvcmd) { utl_RunCommand($execs_H{"esl-sfetch"} . " --index $full_fasta_file > /dev/null", opt_Get("-v", \%opt_HH), 0, $ofile_info_HH{"FH"}); }
ofile_AddClosedFileToOutputInfo(\%ofile_info_HH, "fullssi", $full_fasta_file . ".ssi", 0, 1, ".ssi index file for full fasta file");
######################################################################################################
# Preliminary stage: Run srcchk, if necessary (if $do_ftaxid || $do_ingrup || $do_special || $do_def)
######################################################################################################
my $full_srcchk_file = undef;
my %taxinfo_wlevel_file_H = ();
foreach $level (@level_A) {
$taxinfo_wlevel_file_H{$level} = $out_root . ".taxinfo_w" . $level. ".txt";
%{$seqgtaxid_HH{$level}} = ();
}
if($do_ftaxid || $do_ingrup || $do_special || $do_def) {
$start_secs = ofile_OutputProgressPrior("[Stage: prelim] Running srcchk for all sequences ", $progress_w, $log_FH, *STDOUT);
$full_srcchk_file = $out_root . ".full.srcchk";
if(! $do_prvcmd) { utl_RunCommand($execs_H{"srcchk"} . " -i $full_list_file -f \'taxid,organism,strain\' > $full_srcchk_file", opt_Get("-v", \%opt_HH), 0, $ofile_info_HH{"FH"}); }
ofile_AddClosedFileToOutputInfo(\%ofile_info_HH, "fullsrcchk", "$full_srcchk_file", 0, 1, "srcchk output for all $nseq input sequences");
# parse srcchk output to fill %seqtaxid_H, and possibly %seq_orgn_H
parse_srcchk_file($full_srcchk_file, $taxonomy_tree_six_column_file, \%seqtaxid_H,
($do_def ? \%seqorgn_H : undef),
($do_def ? \%seqstrain_H : undef),
\@seqorder_A, \%ofile_info_HH);
$have_taxids = 1;
# get taxonomy file with taxonomic levels
foreach $level (@level_A) {
my $find_tax_cmd = $execs_H{"find_taxonomy_ancestors.pl"} . " --input_summary $full_srcchk_file --input_tax $taxonomy_tree_six_column_file --input_level $level --outfile " . $taxinfo_wlevel_file_H{$level};
if(! $do_prvcmd) { utl_RunCommand($find_tax_cmd, opt_Get("-v", \%opt_HH), 0, $ofile_info_HH{"FH"}); }
ofile_AddClosedFileToOutputInfo(\%ofile_info_HH, "taxinfo.$level", $taxinfo_wlevel_file_H{$level}, 0, 1, "taxinfo file with $level");
# parse tax_level file to fill %full_level_ct_HH
parse_tax_level_file($taxinfo_wlevel_file_H{$level}, undef, $seqgtaxid_HH{$level}, $full_level_ct_HH{$level}, $ofile_info_HH{"FH"});
# now fill @all_gtaxid_HA
@{$all_gtaxid_HA{$level}} = (sort {$a <=> $b} keys %{$full_level_ct_HH{$level}});
}
ofile_OutputProgressComplete($start_secs, undef, $log_FH, *STDOUT);
}
####################
# FILTERING STAGES #
####################
my %seqfailstr_H = (); # hash that keeps track of failure strings for each sequence, will be "" for a passing sequence
my %curfailstr_H = (); # hash that keeps track of failure string for current stage only, will be "" for a passing sequence
my $seqname;
my $npass = 0;
ribo_InitializeHashToEmptyString(\%curfailstr_H, \@seqorder_A);
ribo_InitializeHashToEmptyString(\%seqfailstr_H, \@seqorder_A);
my $stage_key = undef;
########################################################
# 'fambig' stage: filter based on ambiguous nucleotides
########################################################
if($do_fambig) {
$stage_key = "fambig";
$start_secs = ofile_OutputProgressPrior("[Stage: $stage_key] Filtering based on ambiguous nucleotides ", $progress_w, $log_FH, *STDOUT);
my $do_num_ambig = opt_Get("--faonlyf", \%opt_HH) ? 0 : 1;
my $do_fract_ambig = opt_Get("--faonlyn", \%opt_HH) ? 0 : 1;
my $maxnambig = opt_Get("--famaxn", \%opt_HH);
my $maxfambig = opt_Get("--famaxf", \%opt_HH);
my $cur_maxnambig = 0; # maximum number of ambiguous nucleotides for current sequence
foreach $seqname (keys %seqnambig_H) {
# determine maximum number of ambiguous nucleotides for this sequence
$cur_maxnambig = undef;
if($do_fract_ambig) {
$cur_maxnambig = $maxfambig * $seqlen_H{$seqname};
}
if($do_num_ambig) {
if((! defined $cur_maxnambig) || ($maxnambig < $cur_maxnambig)) {
$cur_maxnambig = $maxnambig;
}
}
if($seqnambig_H{$seqname} > $cur_maxnambig) {
$curfailstr_H{$seqname} = "ambig[" . $seqnambig_H{$seqname} . ">" . $cur_maxnambig . "];;";
}
else {
$curfailstr_H{$seqname} = "";
}
}
$npass = update_and_output_pass_fails(\%curfailstr_H, \%seqfailstr_H, \@seqorder_A, 0, $out_root, $stage_key, \%ofile_info_HH); # 0: do not output description of pass/fail lists to log file
ofile_OutputProgressComplete($start_secs, sprintf("%6d pass; %6d fail;", $npass, $nseq-$npass), $log_FH, *STDOUT);
}
###############################################
# 'ftaxid' stage: filter for specified species
###############################################
if($do_ftaxid) {
$stage_key = "ftaxid";
$start_secs = ofile_OutputProgressPrior("[Stage: $stage_key] Filtering for specified species ", $progress_w, $log_FH, *STDOUT);
$npass = parse_srcchk_and_tax_files_for_specified_species($full_srcchk_file, $taxonomy_tree_six_column_file, \%seqtaxid_H, \%seqfailstr_H, \@seqorder_A, $out_root, \%opt_HH, \%ofile_info_HH);
ofile_OutputProgressComplete($start_secs, sprintf("%6d pass; %6d fail;", $npass, $nseq-$npass), $log_FH, *STDOUT);
}
#########################################################
# 'fvecscr' stage: filter for non-weak VecScreen matches
#########################################################
$stage_key = "fvecsc";
my $parse_vecscreen_terminal_file = $out_root . "." . $stage_key . ".terminal.parse_vecscreen";
my $parse_vecscreen_internal_file = $out_root . "." . $stage_key . ".internal.parse_vecscreen";
my $parse_vecscreen_combined_file = $out_root . "." . $stage_key . ".combined.parse_vecscreen";
if($do_fvecsc) {
$start_secs = ofile_OutputProgressPrior("[Stage: $stage_key] Identifying vector sequences with VecScreen", $progress_w, $log_FH, *STDOUT);
my $vecscreen_output_file = $out_root . ".vecscreen";
my $vecscreen_cmd = $execs_H{"vecscreen"} . " -text_output -query $full_fasta_file > $vecscreen_output_file";
if(! $do_prvcmd) { utl_RunCommand($vecscreen_cmd, opt_Get("-v", \%opt_HH), 0, $ofile_info_HH{"FH"}); }
ofile_AddClosedFileToOutputInfo(\%ofile_info_HH, "vecout", "$vecscreen_output_file", 0, 1, "vecscreen output file");
# parse vecscreen
my $parse_vecscreen_cmd = $execs_H{"parse_vecscreen.pl"} . " --verbose --input $vecscreen_output_file --outfile_terminal $parse_vecscreen_terminal_file --outfile_internal $parse_vecscreen_internal_file";
my $combine_summaries_cmd = $execs_H{"combine_summaries.pl"} . " --input_internal $parse_vecscreen_internal_file --input_terminal $parse_vecscreen_terminal_file --outfile $parse_vecscreen_combined_file";
if(! $do_prvcmd) { utl_RunCommand($parse_vecscreen_cmd, opt_Get("-v", \%opt_HH), 0, $ofile_info_HH{"FH"}); }
ofile_AddClosedFileToOutputInfo(\%ofile_info_HH, "parsevecterm", "$parse_vecscreen_terminal_file", 0, 1, "parse_vecscreen.pl terminal output file");
ofile_AddClosedFileToOutputInfo(\%ofile_info_HH, "parsevecint", "$parse_vecscreen_internal_file", 0, 1, "parse_vecscreen.pl internal output file");
if(! $do_prvcmd) { utl_RunCommand($combine_summaries_cmd, opt_Get("-v", \%opt_HH), 0, $ofile_info_HH{"FH"});}
ofile_AddClosedFileToOutputInfo(\%ofile_info_HH, "parseveccombined", "$parse_vecscreen_combined_file", 0, 1, "combined parse_vecscreen.pl output file");
# get list of accessions in combined parse_vecscreen output that have non-Weak matches
$npass = parse_parse_vecscreen_combined_file($parse_vecscreen_combined_file, \%seqfailstr_H, \@seqorder_A, $out_root, \%opt_HH, \%ofile_info_HH);
ofile_OutputProgressComplete($start_secs, sprintf("%6d pass; %6d fail;", $npass, $nseq-$npass), $log_FH, *STDOUT);
}
###################################################################
# 'fblast' stage: self-blast all sequences to find tandem repeats
###################################################################
if($do_fblast) {
$stage_key = "fblast";
$start_secs = ofile_OutputProgressPrior("[Stage: $stage_key] Identifying repeats by BLASTing against self", $progress_w, $log_FH, *STDOUT);
$npass = fblast_stage(\%execs_H, \%seqfailstr_H, \@seqorder_A, $out_root, \%opt_HH, \%ofile_info_HH);