-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #184 from ehenneken/AASNova
AAS Nova harvesting
- Loading branch information
Showing
5 changed files
with
402 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
#!/usr/bin/env python | ||
from __future__ import absolute_import | ||
from .default import BaseRSSFeedParser | ||
import sys | ||
import re | ||
import calendar | ||
|
||
## AAS Nova tags (to remove from keywords) | ||
AASNova_tags = ['Highlights','Astrobites','Features','Images','AAS News'] | ||
## Pattern for categories allowed | ||
re_cats_allowed = re.compile(r'(highlights|astrobites)', re.IGNORECASE) | ||
## Pattern used to locate references of a doi inside a citation | ||
doi_pattern = r'doi:\s*(?P<doi>10\.\d{4,9}/\S+\w)' | ||
re_doi = re.compile(doi_pattern, re.IGNORECASE) | ||
## Get CDATA contents | ||
# <![CDATA[habitability]]> | ||
re_CDATA = re.compile(r'<!\[CDATA\[(?P<tag>.*?)\]\]>', re.IGNORECASE) | ||
## Convert month name to number | ||
mnth2num = {month.lower(): index for index, month in enumerate(calendar.month_abbr) if month} | ||
## Exceptions | ||
class NoPublicationDateFound(Exception): | ||
pass | ||
|
||
class AASNovaParser(BaseRSSFeedParser): | ||
|
||
def extract_data(self, entry): | ||
links = {} | ||
# Get the categories | ||
try: | ||
categories = [c.text for c in entry.find_all('category')] | ||
except: | ||
categories = [] | ||
check = [c for c in categories if re_cats_allowed.search(c)] | ||
# Skip if the entry is not the right category | ||
if len(check) == 0: | ||
return {} | ||
keywords = [] | ||
for c in categories: | ||
kmat = re_CDATA.search(c) | ||
if kmat: | ||
keywords.append(kmat.group('tag')) | ||
# Remove AAS Nova tags | ||
keywords = [k for k in keywords if k not in AASNova_tags] | ||
# Start gathering the necessary fields | ||
title = entry.find('title').text | ||
# Get author data | ||
au = entry.find('dc:creator').text | ||
amat = re_CDATA.search(au) | ||
if amat: | ||
au = amat.group('tag') | ||
if au.lower() != 'astrobites': | ||
audata = au.split() | ||
fname = audata[0] | ||
lname = " ".join(audata[1:]) | ||
author = "%s, %s" % (lname, fname) | ||
else: | ||
author = "Astrobites" | ||
lname = "Astrobites" | ||
# Nova record number | ||
nova_entry = entry.find('guid').text | ||
links['ELECTR'] = nova_entry | ||
# The record number follows from the URL: | ||
# e.g. https://aasnova.org/?p=10858 | ||
# Of course, we can parse the URL using the urllib.parse library | ||
# but that seems overkill here | ||
recno = nova_entry.split('p=')[-1] | ||
# Attempt to find the citation in the record (for the paper | ||
# discussed in this AAS Nova entry) | ||
try: | ||
citation = entry.find('citation').text | ||
except: | ||
citation = None | ||
# If we found a citation, see if we can capture a DOI | ||
doi = None | ||
if citation: | ||
dmat = re_doi.search(citation) | ||
if dmat: | ||
doi = dmat.group('doi').strip() | ||
# Get the publication date | ||
try: | ||
pdate = entry.find('pubdate').text.split() | ||
pubdate = " ".join(pdate[1:4]) | ||
mnth = mnth2num.get(pdate[2].lower(), '00') | ||
year = pdate[3] | ||
except: | ||
raise NoPublicationDateFound("AAS Nova entry does not contain publication date") | ||
# Get the journal entry | ||
journal = "AAS Nova Highlight %s, id. %s" % (pubdate, recno) | ||
# Get the abstract data | ||
try: | ||
abstract = entry.find('description').text.replace('<![CDATA[','').replace(']]>','').strip() | ||
abstract = re.sub(r'The post.*?appeared first on AAS Nova\.','', abstract).strip() | ||
except: | ||
abstract = '' | ||
# Sometimes the associated DOI(s) are in the 'contents' data, | ||
# in case we did not find one earlier | ||
if not doi: | ||
contents = entry.find('content:encoded').text | ||
# We take the first DOI we find | ||
try: | ||
doi = re.findall(doi_pattern, contents, re.IGNORECASE)[0] | ||
except: | ||
doi = None | ||
if doi: | ||
# If we have a DOI, assign it to the ASSOCIATED link | ||
links['ASSOCIATED'] = "https://dx.doi/%s"%doi | ||
# Form the bibcode | ||
bibcode = "%snova.pres%s%s" % (year, recno.rjust(4, '.'), lname[0]) | ||
# Now we have all information to send back | ||
rec = { | ||
'bibcode': bibcode, | ||
'title': title, | ||
'authors': author, | ||
'properties': links, | ||
'keywords': ", ".join(keywords), | ||
'pubdate': "%s/%s" % (mnth, year), | ||
'publication': journal, | ||
'abstract': abstract.strip() | ||
} | ||
return rec | ||
|
||
def parse(self, url, **kwargs): | ||
nova_recs = [{}] | ||
USER_AGENT = "The NASA Astrophysics Data System harvester (http://ads.harvard.edu; ads@cfa.harvard.edu)" | ||
headers = { | ||
'Content-type': 'text/xml', | ||
'Accept': 'text/html,application/xhtml+xml,application/xml', | ||
'User-agent': USER_AGENT} | ||
data = self.get_records(url, headers=headers, **kwargs) | ||
for d in data: | ||
try: | ||
nova_recs.append(self.extract_data(d)) | ||
except Exception as err: | ||
sys.stderr.write('Failed to process record %s (%s). Skipping...\n' % (d.find('guid').text, err)) | ||
continue | ||
|
||
return nova_recs |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.