Skip to content

Commit

Permalink
Merge pull request #186 from seasidesparrow/bump_adsputils.20240311
Browse files: browse the repository at this point in the history
Bump adsputils.20240311
  • Loading branch information
seasidesparrow committed Mar 12, 2024
2 parents 8f4cd89 + 2f982e8 commit cef3fea
Show file tree
Hide file tree
Showing 4 changed files with 154 additions and 149 deletions.
10 changes: 7 additions & 3 deletions examples/ex_proquest.py
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
from __future__ import print_function
import os
from glob import glob
from pyingest.parsers.proquest import ProQuestParser
from pyingest.serializers.classic import Tagged
Expand All @@ -7,15 +8,18 @@

def main():

marc_filename = 'SAO_NASA_Jul_2020.UNX'
marc_filename = './SAO_NASA_Dec_2023.UNX'
oa_filename = marc_filename.replace('.UNX', '_OpenAccessTitles.csv')
marcdata = open(marc_filename).read()
oadata = open(oa_filename).read()
if os.path.exists(oa_filename):
oadata = open(oa_filename).read()
else:
oadata = ''
parser = ProQuestParser(marcdata, oadata)
lol = parser.parse()
print("%s records processed" % len(parser.results))
tag = Tagged()
outfile = 'lolproque.tag'
outfile = marc_filename+'.NEW'
with open(outfile,'w') as fo:
for rec in parser.results:
tag.write(rec,fo)
Expand Down
1 change: 0 additions & 1 deletion pyingest/config/config.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -259,7 +259,6 @@ def get_uat(data,data_dict):

# ProQuest harvester
PROQUEST_BASE_PATH = "/proj/ads_abstracts/sources/ProQuest/fromProQuest/"
PROQUEST_OA_BASE = "http://pqdtopen.proquest.com/pubnum/%s.html"
PROQUEST_URL_BASE = "http://gateway.proquest.com/openurl?url_ver=Z39.88-2004&res_dat=xri:pqdiss&rft_val_fmt=info:ofi/fmt:kev:mtx:dissertation&rft_dat=xri:pqdiss:%s"
PROQUEST_DATASOURCE = "UMI"

Expand Down
290 changes: 146 additions & 144 deletions pyingest/parsers/proquest.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -66,7 +66,6 @@ def get_db(self, rec):

def parse(self):

oa_base = config.PROQUEST_OA_BASE
url_base = config.PROQUEST_URL_BASE

auth_parse = AuthorNames()
Expand All @@ -82,176 +81,179 @@ def parse(self):

# read each record into a pymarc object
if sys.version_info > (3,):
reader = pymarc.MARCReader(r.encode('utf-8'), to_unicode=True)
reader = pymarc.MARCReader(r.encode('utf-8'), to_unicode=True, utf8_handling='replace')
else:
reader = pymarc.MARCReader(r, to_unicode=True)
record = next(reader)

# ProQuest ID (001)
try:
proqid = record['001'].value()
record = next(reader)
except Exception as err:
print('unable to get proquest id! %s ' % err)
print('Next(reader) failed! %s' % err)
else:
print('I am processing ProQuest ID# %s' % proqid)
pubnr = proqid.replace('AAI', '')

# MARC 2.1 fixed length data elements (005)
flde = record['005'].value()
# Publication Year
pubyear = flde[0:4]
# Defense Year
def_year = flde[7:11]
# Language
language = flde[35:38]
# ProQuest ID (001)
try:
proqid = record['001'].value()
except Exception as err:
print('unable to get proquest id! %s ' % err)
else:
print('I am processing ProQuest ID# %s' % proqid)
pubnr = proqid.replace(u'AAI', u'')

# ISBN (020)
try:
isbn = record['020']['a']
except Exception as err:
isbn = ''
# MARC 2.1 fixed length data elements (005)
flde = record['005'].value()
# Publication Year
pubyear = flde[0:4]
# Defense Year
def_year = flde[7:11]
# Language
language = flde[35:38]

# Author (100)
try:
author = re.sub('\\.$', '', record['100']['a'].strip())
# author = auth_parse.parse(author)
except Exception as err:
author = ''
# ISBN (020)
try:
isbn = record['020']['a']
except Exception as err:
isbn = ''

# Author (100)
try:
author = re.sub(u'\\.$', u'', record['100']['a'].strip())
# author = auth_parse.parse(author)
except Exception as err:
author = ''

# Title
try:
title = re.sub('\\.$', '', record['245']['a'].strip())
except Exception as err:
title = ''
# Title
try:
title = re.sub(u'\\.$', u'', record['245']['a'].strip())
except Exception as err:
title = ''

# Page length
try:
npage = record['300']['a']
except Exception as err:
npage = ''
# Page length
try:
npage = record['300']['a']
except Exception as err:
npage = ''

# Source
try:
school = record['502']['a']
except Exception as err:
pass
else:
jfield.append(school)
jfield.append('Publication Number: %s' % re.sub('AAI', 'AAT ', proqid))
if isbn:
jfield.append('ISBN: %s' % isbn)
# Source
try:
school = record['502']['a']
except Exception as err:
pass
else:
jfield.append(school)
jfield.append('Publication Number: %s' % re.sub('AAI', 'AAT ', proqid))
if isbn:
jfield.append('ISBN: %s' % isbn)

try:
publsh = record['500']['a']
except Exception as err:
pass
else:
jfield.append(publsh)

if npage:
jfield.append(npage)

# Abstract (multiline field: 520)
abstract = ''
for l in record.get_fields('520'):
try:
abstract += ' ' + l.value().strip()
publsh = record['500']['a']
except Exception as err:
pass
abstract = abstract.strip()
else:
jfield.append(publsh)

# ADS Collection/Database
(databases, subjects) = self.get_db(record)
if npage:
jfield.append(npage)

# Affil
affil = ''
try:
affil = record['710']['a'].rstrip('.')
except Exception as err:
pass
else:
# Abstract (multiline field: 520)
abstract = ''
for l in record.get_fields('520'):
try:
abstract += ' ' + l.value().strip()
except Exception as err:
pass
abstract = abstract.strip()

# ADS Collection/Database
(databases, subjects) = self.get_db(record)

# Affil
affil = ''
try:
a2 = record['710']['b'].rstrip('.')
affil = record['710']['a'].rstrip('.')
except Exception as err:
pass
else:
affil = a2 + ', ' + affil
try:
a2 = record['710']['b'].rstrip('.')
except Exception as err:
pass
else:
affil = a2 + ', ' + affil

# Advisor
advisor = []
comments = []
try:
for e in record.get_fields('790'):
if e['e']:
advisor.append(e['a'])
if advisor:
comments.append('Advisor: %s' % advisor[0])
except Exception as err:
pass
# Advisor
advisor = []
comments = []
try:
for e in record.get_fields('790'):
if e['e']:
advisor.append(e['a'])
if advisor:
comments.append('Advisor: %s' % advisor[0])
except Exception as err:
pass

# Pubdate
try:
pubdate = record['792']['a']
except Exception as err:
pubdate = ''
# Pubdate
try:
pubdate = record['792']['a']
except Exception as err:
pubdate = ''

# Language
lang = []
try:
for l in record.get_fields('793'):
ln = l.value().strip()
lang.append(ln)
except Exception as err:
pass
# Language
lang = []
try:
for l in record.get_fields('793'):
ln = l.value().strip()
lang.append(ln)
except Exception as err:
pass

# properties
properties = dict()
if pubnr in self.oa_pubnum:
properties['OPEN'] = 1

# properties
properties = dict()
if pubnr in self.oa_pubnum:
properties['OPEN'] = 1
# new_proqid = proqid.replace('AAI','AAT ')
url = oa_base % pubnr
else:
url = url_base % pubnr
properties['ELECTR'] = url

try:
output_metadata['source'] = datasource
except:
print('datasource missing')
try:
output_metadata['authors'] = author
except:
print('author missing')
try:
output_metadata['affiliations'] = [affil]
except:
print('affil missing')
try:
output_metadata['title'] = title
except:
print('title missing')
try:
output_metadata['abstract'] = abstract
except:
print('abstract missing')
try:
output_metadata['publication'] = '; '.join(jfield)
except:
print('jfield missing')
if pubdate:
output_metadata['pubdate'] = "%s" % pubdate
if databases:
output_metadata['database'] = databases
if comments:
output_metadata['comments'] = comments
# if keywords:
# output_metadata['keywords'] = keywords
if lang:
output_metadata['language'] = lang
if subjects:
output_metadata['subjectcategory'] = subjects
if properties:
output_metadata['properties'] = properties
properties['ELECTR'] = url

try:
output_metadata['source'] = datasource
except:
print('datasource missing')
try:
output_metadata['authors'] = author
except:
print('author missing')
try:
output_metadata['affiliations'] = [affil]
except:
print('affil missing')
try:
output_metadata['title'] = title
except:
print('title missing')
try:
output_metadata['abstract'] = abstract
except:
print('abstract missing')
try:
output_metadata['publication'] = '; '.join(jfield)
except:
print('jfield missing')
if pubdate:
output_metadata['pubdate'] = "%s" % pubdate
if databases:
output_metadata['database'] = databases
if comments:
output_metadata['comments'] = comments
# if keywords:
# output_metadata['keywords'] = keywords
if lang:
output_metadata['language'] = lang
if subjects:
output_metadata['subjectcategory'] = subjects
if properties:
output_metadata['properties'] = properties

except Exception as err:
print("Record skipped, MARC parsing failed: %s" % err)
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
adsputils==1.4.3
adsputils>=1.4.3
bs4==0.0.1
feedparser==6.0.8
future==0.18.2
Expand Down

0 comments on commit cef3fea

Please sign in to comment.