Skip to content

Commit

Permalink
feat(db): new parameters to exclude pap_type and ids
Browse files Browse the repository at this point in the history
  • Loading branch information
valearna committed Dec 14, 2020
1 parent e2bbdba commit baf201b
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 5 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setuptools.setup(
name="wbtools",
version="1.0.1",
version="1.0.3",
author="Valerio Arnaboldi",
author_email="valearna@caltech.edu",
description="Interface to WormBase (www.wormbase.org) curation data, including literature management and NLP "
Expand Down
16 changes: 15 additions & 1 deletion wbtools/db/generic.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import logging
from typing import List

import psycopg2

from wbtools.db.abstract_manager import AbstractWBDBManager
Expand All @@ -11,12 +13,24 @@ class WBGenericDBManager(AbstractWBDBManager):
def __init__(self, dbname, user, password, host):
super().__init__(dbname, user, password, host)

def get_all_paper_ids(self, added_or_modified_after: str = '1970-010-1'):
def get_all_paper_ids(self, added_or_modified_after: str = '1970-01-01', exclude_ids: List[str] = None):
    """Return the ids of the papers listed in pap_electronic_path that are newer than a given date.

    Args:
        added_or_modified_after (str): only rows whose pap_timestamp is strictly greater than this value are
            considered. A falsy value (None or empty string) falls back to '1970-01-01'.
        exclude_ids (List[str]): paper ids to leave out of the result. An id matches either as the bare
            joinkey or as the joinkey prefixed with "WBPaper".

    Returns:
        List[str]: joinkeys ordered by pap_timestamp, most recent first, each one appearing only once.
    """
    if not added_or_modified_after:
        added_or_modified_after = '1970-01-01'
    with psycopg2.connect(self.connection_str) as conn, conn.cursor() as curs:
        curs.execute("SELECT DISTINCT joinkey, pap_timestamp from pap_electronic_path WHERE pap_timestamp > %s "
                     "ORDER BY pap_timestamp DESC",
                     (added_or_modified_after, ))
        res = curs.fetchall()
    excluded = set(exclude_ids) if exclude_ids else set()
    # DISTINCT applies to the (joinkey, pap_timestamp) pair, so a joinkey that has several rows with different
    # timestamps comes back more than once. dict.fromkeys keeps the first (most recent) occurrence of each
    # joinkey and preserves the query order.
    unique_ids = dict.fromkeys(row[0] for row in res)
    return [paper_id for paper_id in unique_ids
            if paper_id not in excluded and "WBPaper" + paper_id not in excluded]

def get_paper_ids_with_pap_types(self, pap_types: List[str]):
    """Return the ids of the papers whose type is one of the given pap_types.

    Args:
        pap_types (List[str]): values matched against pap_type_index.pap_type_index.

    Returns:
        List[str]: distinct pap_type.joinkey values of the matching rows; an empty list when pap_types is
            empty or None, or when nothing matches.
    """
    # An empty sequence would be adapted by psycopg2 to "IN ()", which is not valid SQL. Nothing can match
    # an empty set of types anyway, so skip the database round trip.
    if not pap_types:
        return []
    with psycopg2.connect(self.connection_str) as conn, conn.cursor() as curs:
        # NOTE(review): the join matches pap_type.joinkey against pap_type_index.joinkey - confirm against the
        # schema that both columns hold the same kind of key.
        curs.execute("SELECT DISTINCT pap_type.joinkey from pap_type join pap_type_index "
                     "ON pap_type.joinkey = pap_type_index.joinkey WHERE pap_type_index.pap_type_index IN %s ",
                     (tuple(pap_types),))
        return [row[0] for row in curs.fetchall()]

def get_curated_variations(self, exclude_id_used_as_name: bool = False):
Expand Down
12 changes: 9 additions & 3 deletions wbtools/literature/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ def load_from_wb_database(self, db_name: str, db_user: str, db_password: str, db
tazendra_ssh_user: str = None, tazendra_ssh_passwd: str = None, paper_ids: list = None,
from_date: str = None, load_pdf_files: bool = True, load_bib_info: bool = True,
load_curation_info: bool = True, max_num_papers: int = None,
must_have_svm: bool = False) -> None:
exclude_ids: List[str] = None, must_have_svm: bool = False,
exclude_pap_types: List[str] = None) -> None:
"""load papers from WormBase database
Args:
Expand All @@ -75,11 +76,16 @@ def load_from_wb_database(self, db_name: str, db_user: str, db_password: str, db
load_bib_info (bool): load bibliographic info of the papers
load_curation_info (bool): load curation info of the papers
max_num_papers (int): limit number of papers to be loaded
exclude_ids (List[str]): list of paper ids to exclude
must_have_svm (bool): whether to load only papers that have been flagged by WB SVMs
exclude_pap_types (List[str]): list of pap_types (string value, not numeric) to exclude
"""
db_manager = WBDBManager(db_name, db_user, db_password, db_host)
if not paper_ids:
db_manager = WBDBManager(db_name, db_user, db_password, db_host)
paper_ids = db_manager.generic.get_all_paper_ids(added_or_modified_after=from_date)
paper_ids = db_manager.generic.get_all_paper_ids(added_or_modified_after=from_date, exclude_ids=exclude_ids)
if exclude_pap_types:
ids_to_exclude = db_manager.generic.get_paper_ids_with_pap_types(exclude_pap_types)
paper_ids = list(set(paper_ids) - set(ids_to_exclude))
for paper_id in paper_ids:
paper = WBPaper(paper_id=paper_id, tazendra_ssh_user=tazendra_ssh_user,
tazendra_ssh_passwd=tazendra_ssh_passwd)
Expand Down

0 comments on commit baf201b

Please sign in to comment.