Skip to content

Commit

Permalink
Classifier Script Version 0.1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
Thomas-S-Allen committed Feb 16, 2024
1 parent d994bf3 commit ca84c3e
Show file tree
Hide file tree
Showing 8 changed files with 92 additions and 115 deletions.
8 changes: 3 additions & 5 deletions ClassifierPipeline/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from builtins import str
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, Text, TIMESTAMP, ARRAY
from sqlalchemy import Column, Integer, String, Text, TIMESTAMP, ARRAY, ForeignKey
from sqlalchemy.types import Enum
import json
import sys
Expand All @@ -21,16 +21,14 @@ class ScoreTable(Base):
class OverrideTable(Base):
__tablename__ = 'overrides'
id = Column(Integer, primary_key=True)
score_id = Column(Integer, foreign_key='scores.id')
# score_id = Column(Integer, foreign_key='ScoreTable.id')
score_id = Column(Integer, ForeignKey('scores.id'))
override = Column(ARRAY(String))
created = Column(UTCDateTime, default=get_date)

class FinalCollectionTable(Base):
__tablename__ = 'final_collection'
id = Column(Integer, primary_key=True)
score_id = Column(Integer, foreign_key='scores.id')
# score_id = Column(Integer, foreign_key='ScoreTable.id')
score_id = Column(Integer, ForeignKey('scores.id'))
collection = Column(ARRAY(String))
created = Column(UTCDateTime, default=get_date)

4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1 +1,5 @@
Building a Pipeline around the classifier.

The `quick_classifer.py` script is a stand-alone script to apply the SciX classifier to a set of records denoted by bibcode. (NOTE: "classifer" may be a typo for "classifier" — confirm against the actual script filename in the repository.)


2 changes: 1 addition & 1 deletion alembic.ini
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ version_path_separator = os # Use os.pathsep. Default configuration used for ne
use_flask_db = true
# sqlalchemy.url = driver://user:pass@localhost/dbname
# sqlalchemy.url = 'intentionally wrong'
sqlalchemy.url = postgresql://postgres:postgres@localhost:5432/ClassifierTestDB
sqlalchemy.url = postgresql://postgres:postgres@localhost:5432/classifierdb
; sqlalchemy.url = postgresql-16://postgres:postgres@localhost:5432/ClassifierTestDB
; sqlalchemy.url = sqlite:///

Expand Down
62 changes: 4 additions & 58 deletions config.py
Original file line number Diff line number Diff line change
@@ -1,65 +1,11 @@
SQLALCHEMY_URL = 'sqlite:///'
SQLALCHEMY_ECHO = False
WORKING_DIR = '~/Code/ClassifierPipeline'

API_URL = "https://api.adsabs.harvard.edu/v1" # ADS API URL
API_TOKEN = ''
# Config file save directory
CONFIG_DIR = '/Users/thomasallen/Code/ClassifierPipeline/config_files/'
# Input Data
DATA_FULL_SAMPLE = '/Users/thomasallen/Code/ClassifierPipeline/data/full_sample.csv'
DATA_GROUND_TRUTH = '/Users/thomasallen/Code/ClassifierPipeline/data/ground_truth.csv'
DATA_GROUND_TRUTH_ALL = '/Users/thomasallen/Code/ClassifierPipeline/data/ground_truth_all_curated.csv'
# DATA_GROUND_TRUTH_ALL = '/Users/thomasallen/Code/ClassifierPipeline/data/ground_truth_all.csv'
DATA_GROUND_TRUTH_ALL_PICKLE = '/Users/thomasallen/Code/ClassifierPipeline/data/ground_truth_all.pkl'
DATA_GROUND_TRUTH_ALL_JSON = '/Users/thomasallen/Code/ClassifierPipeline/data/ground_truth_all.json'
TEST_DATA_SOURCE = "Classified_CSV" # "Article', 'Classified_CSV'
DATA_SAMPLE_CLASSIFIED = '/Users/thomasallen/Code/ClassifierPipeline/data/ground_truth_sample_classified.csv' # Initial classified sample
DATA_EXTRA_HELIO = '/Users/thomasallen/Code/ClassifierPipeline/data/helio_nature_science_bibcode_list.csv'
DATA_EXTRA_PLANETARY = '/Users/thomasallen/Code/ClassifierPipeline/data/ps_nature_science_bibcode_list.csv'
# DATA_SAMPLE_CLASSIFIED_NEW = '/Users/thomasallen/Code/ClassifierPipeline/data/ground_truth_sample_classified_new.csv' # Latest classified sample
DATA_SAMPLE_CLASSIFIED_NEW = "/Users/thomasallen/Code/ClassifierPipeline/data/ground_truth_sample_classified_title_abstract_no_labels_chkp32100.csv"
# Classification Parameters
RUN_SAMPLE_CLASSIFICATION = "no"
CLASSIFICATION_INPUT_TEXT = "title abstract"
# CLASSIFICATION_INPUT_TEXT = 'Abstract' # 'title', 'abstract' 'title abstract'
# Classification Model
PUBLISHED_MODEL = False
# CLASSIFICATION_PRETRAINED_MODEL = "adsabs/ASTROBERT"
# CLASSIFICATION_PRETRAINED_MODEL_UNPUBLISHED = "/Users/thomasallen/Code/ClassifierPipeline/models/checkpoint-32100"
# CLASSIFICATION_PRETRAINED_MODEL_UNPUBLISHED = "/Users/thomasallen/Code/ClassifierPipeline/models/checkpoint-32100"


CLASSIFICATION_PRETRAINED_MODEL = "ClassifierPipeline/tests/models/checkpoint-32100/"
CLASSIFICATION_PRETRAINED_MODEL_REVISION = "SciX-Categorizer"
CLASSIFICATION_PRETRAINED_MODEL_TOKENIZER = "adsabs/ASTROBERT"
CLASSIFICATION_INPUT_TEXT = "title abstract"
# Plots
SHOW_BARCHART_COUNTS_ALL = False
MAKE_CATEGORY_BOXPLOTS = True
SHOW_CATEGORY_BOXPLOTS = False
BOXPLOT_SAVE_DIR = '/Users/thomasallen/Code/ClassifierPipeline/figures/Score_Boxplots/'
EXAMINE_CATAGORIES = False

TEST_THRESHOLDS = True
TEST_THRESHOLDS_METHOD = "max"
TEST_LABELS = False

EXPLORE_EARTH_SCIENCE = True
ADD_EARTH_SCIENCE_TWEAK = False
EARTH_SCIENCE_TWEAK_THRESHOLD = 0.015
CLASSIFICATION_THRESHOLDS = [0.06, 0.03, 0.04, 0.02, 0.99, 0.02, 0.02, 0.99]
ADDITIONAL_EARTH_SCIENCE_PROCESSING = True
ADDITIONAL_EARTH_SCIENCE_PROCESSING_THRESHOLD = 0.015

EXPLORE_MULTI_CLASS = True
GENERAL_THRESHOLD = 0.0
# Thresholds for model checkpoint 32100
# [Astrophysics, Heliophysics, Planetary Science, Earth Science, Biophysics, Other Physics, Other, Garbage]
# [0.06, 0.03, 0.04, 0.02, 0.0, 0.02, 0.02, 0.0]
CLASSIFICATION_THRESHOLDS = [0.06, 0.03, 0.04, 0.02, 0.99, 0.02, 0.02, 0.99]
ASTRONOMY_THRESHOLD_DELTA = 0.06
HELIOPHYSICS_THRESHOLD_DELTA = 0.03
PLANETARY_SCIENCE_THRESHOLD_DELTA = 0.04
EARTH_SCIENCE_THRESHOLD_DELTA = 0.02
BIOPHYSICS_THRESHOLD_DELTA = 0.0
OTHER_PHYSICS_THRESHOLD_DELTA = 0.02
OTHER_THRESHOLD_DELTA = 0.02
GARBAGE_THRESHOLD_DELTA = 0.0

65 changes: 65 additions & 0 deletions config_pipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
SQLALCHEMY_URL = 'sqlite:///'
SQLALCHEMY_ECHO = False
WORKING_DIR = '~/Code/ClassifierPipeline'
API_URL = "https://api.adsabs.harvard.edu/v1" # ADS API URL
API_TOKEN = ''
# Config file save directory
CONFIG_DIR = '/Users/thomasallen/Code/ClassifierPipeline/config_files/'
# Input Data
DATA_FULL_SAMPLE = '/Users/thomasallen/Code/ClassifierPipeline/data/full_sample.csv'
DATA_GROUND_TRUTH = '/Users/thomasallen/Code/ClassifierPipeline/data/ground_truth.csv'
DATA_GROUND_TRUTH_ALL = '/Users/thomasallen/Code/ClassifierPipeline/data/ground_truth_all_curated.csv'
# DATA_GROUND_TRUTH_ALL = '/Users/thomasallen/Code/ClassifierPipeline/data/ground_truth_all.csv'
DATA_GROUND_TRUTH_ALL_PICKLE = '/Users/thomasallen/Code/ClassifierPipeline/data/ground_truth_all.pkl'
DATA_GROUND_TRUTH_ALL_JSON = '/Users/thomasallen/Code/ClassifierPipeline/data/ground_truth_all.json'
TEST_DATA_SOURCE = "Classified_CSV" # "Article', 'Classified_CSV'
DATA_SAMPLE_CLASSIFIED = '/Users/thomasallen/Code/ClassifierPipeline/data/ground_truth_sample_classified.csv' # Initial classified sample
DATA_EXTRA_HELIO = '/Users/thomasallen/Code/ClassifierPipeline/data/helio_nature_science_bibcode_list.csv'
DATA_EXTRA_PLANETARY = '/Users/thomasallen/Code/ClassifierPipeline/data/ps_nature_science_bibcode_list.csv'
# DATA_SAMPLE_CLASSIFIED_NEW = '/Users/thomasallen/Code/ClassifierPipeline/data/ground_truth_sample_classified_new.csv' # Latest classified sample
DATA_SAMPLE_CLASSIFIED_NEW = "/Users/thomasallen/Code/ClassifierPipeline/data/ground_truth_sample_classified_title_abstract_no_labels_chkp32100.csv"
# Classification Parameters
RUN_SAMPLE_CLASSIFICATION = "no"
CLASSIFICATION_INPUT_TEXT = "title abstract"
# CLASSIFICATION_INPUT_TEXT = 'Abstract' # 'title', 'abstract' 'title abstract'
# Classification Model
PUBLISHED_MODEL = False
# CLASSIFICATION_PRETRAINED_MODEL = "adsabs/ASTROBERT"
# CLASSIFICATION_PRETRAINED_MODEL_UNPUBLISHED = "/Users/thomasallen/Code/ClassifierPipeline/models/checkpoint-32100"
# CLASSIFICATION_PRETRAINED_MODEL_UNPUBLISHED = "/Users/thomasallen/Code/ClassifierPipeline/models/checkpoint-32100"
CLASSIFICATION_PRETRAINED_MODEL = "ClassifierPipeline/tests/models/checkpoint-32100/"
CLASSIFICATION_PRETRAINED_MODEL_REVISION = "SciX-Categorizer"
CLASSIFICATION_PRETRAINED_MODEL_TOKENIZER = "adsabs/ASTROBERT"
CLASSIFICATION_INPUT_TEXT = "title abstract"
# Plots
SHOW_BARCHART_COUNTS_ALL = False
MAKE_CATEGORY_BOXPLOTS = True
SHOW_CATEGORY_BOXPLOTS = False
BOXPLOT_SAVE_DIR = '/Users/thomasallen/Code/ClassifierPipeline/figures/Score_Boxplots/'
EXAMINE_CATAGORIES = False

TEST_THRESHOLDS = True
TEST_THRESHOLDS_METHOD = "max"
TEST_LABELS = False

EXPLORE_EARTH_SCIENCE = True
ADD_EARTH_SCIENCE_TWEAK = False
EARTH_SCIENCE_TWEAK_THRESHOLD = 0.015
ADDITIONAL_EARTH_SCIENCE_PROCESSING = True
ADDITIONAL_EARTH_SCIENCE_PROCESSING_THRESHOLD = 0.015

EXPLORE_MULTI_CLASS = True
GENERAL_THRESHOLD = 0.0
# Thresholds for model checkpoint 32100
# [Astrophysics, Heliophysics, Planetary Science, Earth Science, Biophysics, Other Physics, Other, Garbage]
# [0.06, 0.03, 0.04, 0.02, 0.0, 0.02, 0.02, 0.0]
CLASSIFICATION_THRESHOLDS = [0.06, 0.03, 0.04, 0.02, 0.99, 0.02, 0.02, 0.99]
ASTRONOMY_THRESHOLD_DELTA = 0.06
HELIOPHYSICS_THRESHOLD_DELTA = 0.03
PLANETARY_SCIENCE_THRESHOLD_DELTA = 0.04
EARTH_SCIENCE_THRESHOLD_DELTA = 0.02
BIOPHYSICS_THRESHOLD_DELTA = 0.0
OTHER_PHYSICS_THRESHOLD_DELTA = 0.02
OTHER_THRESHOLD_DELTA = 0.02
GARBAGE_THRESHOLD_DELTA = 0.0

11 changes: 11 additions & 0 deletions config_script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@

API_URL = "https://api.adsabs.harvard.edu/v1" # ADS API URL


CLASSIFICATION_PRETRAINED_MODEL = "ClassifierPipeline/tests/models/checkpoint-32100/"
CLASSIFICATION_PRETRAINED_MODEL_REVISION = "SciX-Categorizer"
CLASSIFICATION_PRETRAINED_MODEL_TOKENIZER = "adsabs/ASTROBERT"

CLASSIFICATION_THRESHOLDS = [0.06, 0.03, 0.04, 0.02, 0.99, 0.02, 0.02, 0.99]
ADDITIONAL_EARTH_SCIENCE_PROCESSING = True
ADDITIONAL_EARTH_SCIENCE_PROCESSING_THRESHOLD = 0.015
5 changes: 4 additions & 1 deletion harvest_solr.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,10 @@ def transform_r_json(r_json):
abstracts = [doc['abstract'] for doc in r_json['response']['docs']]

# list of dictionaries with the bibcode, title, and abstract for each record
record_list = [{'bibcode': bibcodes[i], 'text': f'{titles[i]} {abstracts[i]}'} for i in range(len(bibcodes))]
record_list = [{'bibcode': bibcodes[i],
'title' : titles[i],
'abstract' : abstracts[i],
'text': f'{titles[i]} {abstracts[i]}'} for i in range(len(bibcodes))]

# return bibcodes, titles, abstracts
return record_list
50 changes: 0 additions & 50 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,54 +1,4 @@
adsmsg==1.3.6
adsputils @ git+https://github.com/adsabs/ADSPipelineUtils.git@f94385421c062bdf2457ce50757b27e10a68f77a
alembic==1.13.1
amqp==2.6.1
backports.zoneinfo==0.2.1
billiard==3.6.4.0
celery==4.4.2
certifi==2024.2.2
charset-normalizer==2.0.12
concurrent-log-handler==0.9.20
DateTime==4.1.1
filelock==3.13.1
fsspec==2024.2.0
future==0.18.2
greenlet==3.0.3
huggingface-hub==0.20.3
idna==3.6
importlib-metadata==7.0.1
importlib-resources==6.1.1
Jinja2==3.1.3
kombu==4.6.11
Mako==1.3.2
MarkupSafe==2.1.5
mpmath==1.3.0
networkx==3.1
numpy==1.24.4
packaging==23.2
pandas==1.5.3
portalocker==2.8.2
protobuf==3.17.3
psycopg==3.1.18
psycopg2==2.9.9
python-dateutil==2.8.1
python-json-logger==0.1.8
pytz==2024.1
PyYAML==6.0.1
regex==2023.12.25
requests==2.27.1
safetensors==0.4.2
simplejson==3.17.6
six==1.16.0
SQLAlchemy==1.4.50
sympy==1.12
tokenizers==0.15.1
torch==2.2.0
tqdm==4.66.1
transformers==4.37.2
typing_extensions==4.9.0
tzdata==2023.4
Unidecode==0.4.21
urllib3==1.26.18
vine==1.3.0
zipp==3.17.0
zope.interface==6.1

0 comments on commit ca84c3e

Please sign in to comment.