Skip to content

Commit

Permalink
Merge pull request #720 from dengzq1234/ete4_gtdb
Browse files Browse the repository at this point in the history
Ete4 gtdb
  • Loading branch information
jordibc committed Nov 3, 2023
2 parents 8e5290b + edf73cd commit 79a34c4
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 16 deletions.
42 changes: 31 additions & 11 deletions ete4/gtdb_taxonomy/gtdbquery.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import math
import tarfile
import warnings
import requests

from ete4 import ETE_DATA_HOME, update_ete_data

Expand All @@ -20,8 +21,7 @@

DB_VERSION = 2
DEFAULT_GTDBTAXADB = ETE_DATA_HOME + '/gtdbtaxa.sqlite'
DEFAULT_GTDBTAXADUMP = ETE_DATA_HOME + '/gtdb202dump.tar.gz'

DEFAULT_GTDBTAXADUMP = ETE_DATA_HOME + '/gtdbdump.tar.gz'

def is_taxadb_up_to_date(dbfile=DEFAULT_GTDBTAXADB):
"""Check if a valid and up-to-date gtdbtaxa.sqlite database exists
Expand Down Expand Up @@ -59,9 +59,9 @@ def __init__(self, dbfile=None, taxdump_file=None, memory=False):
if dbfile != DEFAULT_GTDBTAXADB and not os.path.exists(self.dbfile):
print('GTDB database not present yet (first time used?)', file=sys.stderr)
urlbase = ('https://github.com/etetoolkit/ete-data/raw/main'
'/gtdb_taxonomy/gtdb202')
update_ete_data(f'{DEFAULT_GTDBTAXADB}.traverse.pkl', f'{urlbase}/gtdbtaxa.sqlite.traverse.pkl')
update_ete_data(f'{DEFAULT_GTDBTAXADUMP}', f'{urlbase}/gtdb202dump.tar.gz')
'/gtdb_taxonomy/gtdblatest')

update_ete_data(f'{DEFAULT_GTDBTAXADUMP}', f'{urlbase}/gtdb_latest_dump.tar.gz')

self.update_taxonomy_database(taxdump_file=DEFAULT_GTDBTAXADUMP)

Expand Down Expand Up @@ -742,12 +742,13 @@ def update_db(dbfile, targz_file=None):
basepath = os.path.split(dbfile)[0]
if basepath and not os.path.exists(basepath):
os.mkdir(basepath)

try:
tar = tarfile.open(targz_file, 'r')
except:
raise ValueError("Please provide taxa dump tar.gz file")


# If the user doesn't provide targz_file, fetch the latest version from ete-data
if not targz_file:
update_local_taxdump(DEFAULT_GTDBTAXADUMP)
targz_file = DEFAULT_GTDBTAXADUMP

tar = tarfile.open(targz_file, 'r')
t, synonyms = load_gtdb_tree_from_dump(tar)

prepostorder = [int(node.name) for post, node in t.iter_prepostorder()]
Expand All @@ -762,6 +763,25 @@ def update_db(dbfile, targz_file=None):

os.system("rm taxa.tab")

def update_local_taxdump(fname=DEFAULT_GTDBTAXADUMP):
    """Make sure fname holds the latest GTDB taxonomy dump from ete-data.

    If fname does not exist, the dump is downloaded. Otherwise the MD5 of
    the local file is compared with the first whitespace-separated token
    of the text served at ``url + '.md5'``, and the dump is downloaded
    again only when the two differ.

    Args:
        fname: Path of the local tar.gz file to create or refresh.

    Raises:
        requests.HTTPError: If the server answers any request with an
            error status. The local file is left untouched in that case.
    """
    # Latest version of the gtdb taxonomy dump.
    url = "https://github.com/etetoolkit/ete-data/raw/main/gtdb_taxonomy/gtdblatest/gtdb_latest_dump.tar.gz"

    def fetch(address):
        # Fail loudly on HTTP errors, so an error page is never mistaken
        # for the tarball or for its checksum.
        response = requests.get(address)
        response.raise_for_status()
        return response

    def download():
        # Fetch first and open the file afterwards: a failed request must
        # not truncate a dump that is already on disk.
        content = fetch(url).content
        with open(fname, 'wb') as f:
            f.write(content)

    if not os.path.exists(fname):
        print(f'Downloading {fname} from {url} ...')
        download()
        return

    with open(fname, 'rb') as f:
        md5_local = md5(f.read()).hexdigest()
    md5_remote = fetch(url + '.md5').text.split()[0]

    if md5_local != md5_remote:
        print(f'Updating {fname} from {url} ...')
        download()
    else:
        print(f'File {fname} is already up-to-date with {url} .')

def upload_data(dbfile):
print()
print('Uploading to', dbfile)
Expand Down
14 changes: 9 additions & 5 deletions tests/test_gtdbquery.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,25 @@

from ete4 import PhyloTree, GTDBTaxa, ETE_DATA_HOME, update_ete_data
from ete4.gtdb_taxonomy import gtdbquery
import requests

DATABASE_PATH = ETE_DATA_HOME + '/gtdbtaxa.sqlite'
DEFAULT_GTDBTAXADUMP = ETE_DATA_HOME + '/gtdb202dump.tar.gz'
DEFAULT_GTDBTAXADUMP = ETE_DATA_HOME + '/gtdbdump.tar.gz'


class Test_gtdbquery(unittest.TestCase):

def test_00_update_database(self):
gtdb = GTDBTaxa()

if not os.path.exists(DEFAULT_GTDBTAXADUMP):
url = ('https://github.com/etetoolkit/ete-data/raw/main'
url = ('https://github.com/etetoolkit/ete-data/raw/main'
'/gtdb_taxonomy/gtdb202/gtdb202dump.tar.gz')
update_ete_data(DEFAULT_GTDBTAXADUMP, url)

print(f'Downloading GTDB database release 202 to {DEFAULT_GTDBTAXADUMP} from {url}')

with open(DEFAULT_GTDBTAXADUMP, 'wb') as f:
f.write(requests.get(url).content)

gtdb.update_taxonomy_database(DEFAULT_GTDBTAXADUMP)

if not os.path.exists(DATABASE_PATH):
Expand Down Expand Up @@ -81,6 +86,5 @@ def test_name_lineages(self):
self.assertEqual(out[0]['o__Peptococcales'],
['root', 'd__Bacteria', 'p__Firmicutes_B', 'c__Peptococcia', 'o__Peptococcales'])


# Run the unittest runner when this file is executed as a script.
if __name__ == '__main__':
unittest.main()

0 comments on commit 79a34c4

Please sign in to comment.