Skip to content

Commit

Permalink
Merge pull request #739 from tadast/tt/unknown-bases
Browse files Browse the repository at this point in the history
Ignore unknown bases ('N's) when detecting DB type
  • Loading branch information
yannickwurm committed Apr 16, 2024
2 parents 0297d1a + 8bdd2fd commit 00db003
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 2 deletions.
18 changes: 16 additions & 2 deletions lib/sequenceserver/makeblastdb.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ module SequenceServer
#
class MAKEBLASTDB
extend Forwardable
GUESS_SAMPLE_SIZE = 1_048_576

def_delegators SequenceServer, :config, :sys

Expand Down Expand Up @@ -333,8 +334,21 @@ def guess_sequence_type_in_fasta(file)
# If the given file is FASTA, returns an Array of the sequences found in
# the portion of the file read. Otherwise, returns the portion of the
# file read wrapped in an Array.
def sample_sequences(file)
File.read(file, 1_048_576).split(/^>.+$/).delete_if(&:empty?)
# Reads the file in GUESS_SAMPLE_SIZE-byte chunks, starting at +offset+, and
# returns the sequences of the first chunk that contains anything other than
# unknown residues ('N') and whitespace.
#
# Each chunk is split on FASTA header lines (a chunk without header lines
# yields a single element). 'N' characters are deleted from every resulting
# sequence, and sequences left with nothing but whitespace are discarded.
#
# file   - path of the file to sample.
# offset - byte position at which to start reading (default: 0).
#
# Returns an Array of Strings, or an empty Array if the end of the file is
# reached without finding a meaningful sequence.
def sample_sequences(file, offset = 0)
  # Iterate instead of recursing: a long run of unknown residues spans many
  # chunks, and one stack frame per chunk can end in SystemStackError.
  loop do
    sample = File.read(file, GUESS_SAMPLE_SIZE, offset)

    # File.read returns nil once offset is at or past the end of the file.
    return [] if sample.nil?

    # Split on header lines *before* deleting 'N's, so that a header such as
    # ">N" is still recognised as a header instead of collapsing to a bare
    # ">" that would leak into the sequence data.
    meaningful_samples = sample.split(/^>.+$/).map do |sequence|
      sequence.delete('N').gsub(/^\n+$/, '')
    end

    # strip.empty? (rather than empty?) also discards sequences reduced to
    # bare "\r\n" line endings, which the newline-only gsub above leaves
    # behind.
    meaningful_samples = meaningful_samples.reject { |sequence| sequence.strip.empty? }

    return meaningful_samples unless meaningful_samples.empty?

    offset += GUESS_SAMPLE_SIZE
  end
end
end
end
21 changes: 21 additions & 0 deletions spec/makeblastdb_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,23 @@ module SequenceServer
'Sinvicta2-2-3.prot.subset.fasta')
end

let 'fasta_file_prot_with_unknown_bases_seq' do
  # Fixture: a protein FASTA file whose first record opens with a run of
  # unknown residues ('N', 70 per line, about GUESS_SAMPLE_SIZE of them in
  # total) before any informative residues appear.
  fixture_dir = File.join(root_database_dir, 'with_unknown_bases')
  FileUtils.mkdir_p(fixture_dir)

  protein_line = "VSDTAKVLVTEVLEKVSVNRVATFTIEADASLGTPVVEVLSPTRESLSVHVKQNSQGTYTV"
  unknown_line = "#{'N' * 70}\n"
  unknown_line_count = SequenceServer::MAKEBLASTDB::GUESS_SAMPLE_SIZE / 70

  # tap hands back the path once the file has been written.
  File.join(fixture_dir, 'with_unknown_bases.fasta').tap do |fasta_path|
    File.open(fasta_path, 'w') do |fasta|
      fasta.puts ">seq1"
      fasta.puts(unknown_line * unknown_line_count)
      fasta.puts protein_line
      fasta.puts ">seq2"
      2.times { fasta.puts protein_line }
    end
  end
end

let 'fasta_file_nucl_seqs' do
File.join(database_dir_v5, 'transcripts', 'Solenopsis_invicta',
'Sinvicta2-2-3.cdna.subset.fasta')
Expand Down Expand Up @@ -72,6 +89,10 @@ module SequenceServer
expect(makeblastdb.send(:guess_sequence_type_in_fasta, fasta_file_nucl_seqs)).to eq :nucleotide
end

it 'can ignore unknown bases when detecting the sequence type' do
  # The fixture opens with a long run of 'N's ahead of its protein residues.
  guessed_type = makeblastdb.send(:guess_sequence_type_in_fasta,
                                  fasta_file_prot_with_unknown_bases_seq)
  expect(guessed_type).to eq :protein
end

it 'can tell FASTA files that are yet to be made into a BLAST+ database' do
makeblastdb = SequenceServer::MAKEBLASTDB.new(database_dir_unformatted)
expect(makeblastdb.any_to_format_or_reformat?).to be_truthy
Expand Down

0 comments on commit 00db003

Please sign in to comment.