Skip to content

Commit

Permalink
Merge pull request #149 from seasidesparrow/master
Browse files Browse the repository at this point in the history
Fixes for author-affiliation handling of JATS-based parsers
  • Loading branch information
seasidesparrow committed Jun 24, 2021
2 parents 4848824 + 3a9f6f5 commit 57de1a2
Show file tree
Hide file tree
Showing 9 changed files with 468 additions and 228 deletions.
10 changes: 5 additions & 5 deletions examples/ex_aip.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,19 @@
# infile = '/proj/ads/abstracts/data/AIP/AIP.test/RSI/v91/i5/054901_1/Markup/VOR_10.1063_5.0005676.xml'
# infile = '/proj/ads/abstracts/data/AIP/JATS.0127/JCP/v154/i2/024904_1/Markup/VOR_10.1063_5.0033645.xml'
# infile = '/proj/ads/abstracts/data/AIP/JATS.0127/AJP/v89/i2/210_1/Markup/VOR_10.1119_10.0002365.xml'
infile = '/Users/mtempleton/VOR_10.1063_5.0052678.xml'
infile = '/proj/ads/abstracts/data/AIP/JATS.0609.new/RSI/v92/i6/064704_1/Markup/VOR_10.1063_5.0044438.xml'


with open(infile,'r') as fp:
lol = AIPJATSParser()
doc = lol.parse(fp)
# print("Hi, ... %s" % doc)
wut = Tagged()
with open('lollol.lol','a') as fo:
with open('test.tag','a') as fo:
wut.write(doc,fo)
foo = ReferenceWriter()
foo.topdir = './'
# foo = ReferenceWriter()
# foo.topdir = './'
# try:
foo.writeref(doc,'aip')
# foo.writeref(doc,'aip')
# except Exception as err:
# print("Error with writeref:", err)
19 changes: 16 additions & 3 deletions pyingest/parsers/affils.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,22 @@ def parse(self):
new_string = re.sub(email, '', new_string)
new_string = new_string + '; ' + str_type(email)
new_string = re.sub(' ;', '', new_string)
new_string = new_string.strip()
new_string = new_string.strip(';')
new_string = new_string.strip()
aff_array = [x.strip() for x in new_string.split(';') if x.strip()]
aff_array = list(dict.fromkeys(aff_array))
# if one string is an untagged version of another
# tagged string (e.g. untagged email), omit it
new_aff_array = list()
for x in aff_array:
ldup = False
for y in aff_array:
# tagged string will contain an untagged
# string but not vice versa
if x in y and y not in x:
ldup = True
if not ldup:
new_aff_array.append(x)
new_string = '; '.join(new_aff_array)

return new_string
except Exception as err:
print("AffiliationParser: PARSING FAILED:", err)
Expand Down
215 changes: 7 additions & 208 deletions pyingest/parsers/jats.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from bs4 import BeautifulSoup, CData, Tag
from collections import OrderedDict
from .default import BaseBeautifulSoupParser
from .jats_contrib import JATSContribs
from pyingest.config.config import *
from .affils import AffiliationParser
from .entity_convert import EntityConverter
Expand Down Expand Up @@ -142,218 +143,16 @@ def parse(self, input_data, **kwargs):
if title_fn_list:
base_metadata['abstract'] += ' ' + ' '.join(title_fn_list)

# Authors and Affiliations:
# Set up affils storage
affils = OrderedDict()

# Author notes/note ids
# Authors and their Affiliations:
try:
notes = article_meta.find('author-notes').find_all('fn')
except Exception as err:
pass
else:
for n in notes:
try:
n.label.decompose()
except Exception as err:
pass
else:
try:
n['id']
except Exception as err:
pass
# print "I'm failing on author notes!",err
else:
key = n['id']
note_text = self._detag(n, JATS_TAGSET['affiliations'])
affils[key] = note_text.strip()

# Affils/affil ids
l_need_affils = False
try:
affil = article_meta.find('contrib-group').find_all('aff')
if len(affil) == 0:
try:
affil = article_meta.find_all('aff')
except Exception as err:
pass
auth_affil = JATSContribs(soup=article_meta)
auth_affil.parse()
aa_output = auth_affil.output
except Exception as err:
pass
else:
for a in affil:
try:
a.label.decompose()
except Exception as err:
pass
try:
a['id']
except Exception as err:
l_need_affils = True
pass
else:
key = a['id']
ekey = ''
try:
email_array = []
email_a = a.find_all('ext-link')
for em in email_a:
if em['ext-link-type'] == 'email':
address = self._detag(em, (
JATS_TAGSET['affiliations']))
address_new = "<EMAIL>" + address + "</EMAIL>"
ekey = em['id']
if ekey is not '':
affils[ekey] = address_new
while a.find('ext-link') is not None:
a.find('ext-link').extract()
except Exception as err:
pass

aff_text = self._detag(a, JATS_TAGSET['affiliations'])
affils[key] = aff_text.strip()


# <contrib-group>: Author name and affil/note lists:
try:
authors = article_meta.find('contrib-group').find_all('contrib')
except Exception as err:
pass
else:
# you have data for each author in <contrib> tags,
# so create the storage lists and loop over all contrib
base_metadata['authors'] = []
base_metadata['affiliations'] = []

for a in authors:

# Author names
if a.find('collab') is not None:
base_metadata['authors'].append(self._detag(a.collab, []))
else:
if a.find('surname') is not None:
surname = self._detag(a.surname, [])
else:
surname = ''
if a.find('prefix') is not None:
prefix = self._detag(a.prefix, []) + ' '
else:
prefix = ''
if a.find('suffix') is not None:
suffix = ' ' + self._detag(a.suffix, [])
else:
suffix = ''
if a.find('given-names') is not None:
given = self._detag(a.find('given-names'), [])
else:
given = ''
forename = prefix + given + suffix
if forename == '':
if surname != '':
base_metadata['authors'].append(surname)
else:
if surname != '':
base_metadata['authors'].append(surname + ', ' + forename)
else:
base_metadata['authors'].append(forename)

# EMAIL in contrib-group (e.g. OUP, AIP)
email = None
if a.find('email') is not None:
email = self._detag(a.email, [])
# AIP makes the email an 'xlink:href' attribute...
# You could also use an author note instead.
if email == '':
email = a.email['xlink:href']
email = email.replace('mailto:','')
email = '<EMAIL>' + email + '</EMAIL>'

# ORCIDs
orcid_out = None
try:
# orcids = a.find_all('ext-link')
orcids = a.find('ext-link')
try:
if orcids['ext-link-type'] == 'orcid':
o = self._detag(orcids, [])
orcid_out = "<ID system=\"ORCID\">" + o + "</ID>"
except Exception as err:
pass
except Exception as err:
pass
if orcid_out is None:
try:
if a.find('contrib-id') is not None:
auth_id = a.find('contrib-id')
if auth_id['contrib-id-type'] == 'orcid':
o = self._detag(auth_id, [])
o = o.split('/')[-1]
orcid_out = "<ID system=\"ORCID\">" + o + "</ID>"
except Exception as err:
pass

# If you didn't get affiliations above, l_need_affils == True, so do this...
if l_need_affils:
try:
if a.find_all('aff') is not None:
aff_text_arr = list()
for ax in a.find_all('aff'):
aff_text_arr.append(self._detag(ax, JATS_TAGSET['affiliations']).strip())
aff_text = "; ".join(aff_text_arr)
except Exception as err:
pass

# Author affil/note ids
try:
aid = a.find_all('xref')
except Exception as err:
pass
else:
aid_arr = []
if len(aid) > 0:
try:
aid_str = ' '.join([x['rid'] for x in aid])
except Exception as err:
print("jats.py: Failure in affil parsing: %s" % err)
else:
aid_arr = aid_str.split()

try:
new_aid_arr = []
for af in affils.keys():
if af in aid_arr or af == 'ALLAUTHS':
new_aid_arr.append(af)
aid_arr = new_aid_arr

# check whether or not you got affil data in one way or the other...
if not l_need_affils:
aff_text = '; '.join(affils[x] for x in aid_arr)

aff_text = aff_text.replace(';;', ';').rstrip(';')
aff_text = aff_text.replace('; ,', '').rstrip()
if aff_text == '':
if 'ALLAUTH' in affils:
aff_text = affils['ALLAUTH'].strip()

# Got ORCID?
if orcid_out is not None:
aff_text = aff_text + '; ' + orcid_out
if email is not None:
aff_text = aff_text + ' ' + email
base_metadata['affiliations'].append(aff_text)
except Exception as errrror:
if orcid_out is not None:
base_metadata['affiliations'].append(orcid_out)
else:
base_metadata['affiliations'].append('')
affnew = []
for affx in base_metadata['affiliations']:
affnew.append(AffiliationParser(affx).parse())
base_metadata['affiliations'] = affnew

if len(base_metadata['authors']) > 0:
base_metadata['authors'] = "; ".join(base_metadata['authors'])
else:
del base_metadata['authors']
base_metadata['authors'] = '; '.join(aa_output['authors'])
base_metadata['affiliations'] = aa_output['affiliations']

# Copyright:
try:
Expand Down

0 comments on commit 57de1a2

Please sign in to comment.