Merge pull request #186 from seasidesparrow/bump_adsputils.20240311

Bump adsputils.20240311
adsabs · Mar 12, 2024 · cef3fea
2 parents 8f4cd89 + 2f982e8
commit cef3fea
Show file tree

Hide file tree

Showing 4 changed files with 154 additions and 149 deletions.
diff --git a/examples/ex_proquest.py b/examples/ex_proquest.py
@@ -1,4 +1,5 @@
 from __future__ import print_function
+import os
 from glob import glob
 from pyingest.parsers.proquest import ProQuestParser
 from pyingest.serializers.classic import Tagged
@@ -7,15 +8,18 @@
 
 def main():
 
-    marc_filename = 'SAO_NASA_Jul_2020.UNX'
+    marc_filename = './SAO_NASA_Dec_2023.UNX'
     oa_filename = marc_filename.replace('.UNX', '_OpenAccessTitles.csv')
     marcdata = open(marc_filename).read()
-    oadata = open(oa_filename).read()
+    if os.path.exists(oa_filename):
+        oadata = open(oa_filename).read()
+    else:
+        oadata = ''
     parser = ProQuestParser(marcdata, oadata)
     lol = parser.parse()
     print("%s records processed" % len(parser.results))
     tag = Tagged()
-    outfile = 'lolproque.tag'
+    outfile = marc_filename+'.NEW'
     with open(outfile,'w') as fo:
         for rec in parser.results:
             tag.write(rec,fo)

diff --git a/pyingest/config/config.py b/pyingest/config/config.py
@@ -259,7 +259,6 @@ def get_uat(data,data_dict):
 
 # ProQuest harvester
 PROQUEST_BASE_PATH = "/proj/ads_abstracts/sources/ProQuest/fromProQuest/"
-PROQUEST_OA_BASE = "http://pqdtopen.proquest.com/pubnum/%s.html"
 PROQUEST_URL_BASE = "http://gateway.proquest.com/openurl?url_ver=Z39.88-2004&res_dat=xri:pqdiss&rft_val_fmt=info:ofi/fmt:kev:mtx:dissertation&rft_dat=xri:pqdiss:%s"
 PROQUEST_DATASOURCE = "UMI"
 

diff --git a/pyingest/parsers/proquest.py b/pyingest/parsers/proquest.py
@@ -66,7 +66,6 @@ def get_db(self, rec):
 
     def parse(self):
 
-        oa_base = config.PROQUEST_OA_BASE
         url_base = config.PROQUEST_URL_BASE
 
         auth_parse = AuthorNames()
@@ -82,176 +81,179 @@ def parse(self):
 
                 # read each record into a pymarc object
                 if sys.version_info > (3,):
-                    reader = pymarc.MARCReader(r.encode('utf-8'), to_unicode=True)
+                    reader = pymarc.MARCReader(r.encode('utf-8'), to_unicode=True, utf8_handling='replace')
                 else:
                     reader = pymarc.MARCReader(r, to_unicode=True)
-                record = next(reader)
-
-                # ProQuest ID (001)
                 try:
-                    proqid = record['001'].value()
+                    record = next(reader)
                 except Exception as err:
-                    print('unable to get proquest id! %s ' % err)
+                    print('Next(reader) failed! %s' % err)
                 else:
-                    print('I am processing ProQuest ID# %s' % proqid)
-                    pubnr = proqid.replace('AAI', '')
 
-                # MARC 2.1 fixed length data elements (005)
-                flde = record['005'].value()
-                # Publication Year
-                pubyear = flde[0:4]
-                # Defense Year
-                def_year = flde[7:11]
-                # Language
-                language = flde[35:38]
+                    # ProQuest ID (001)
+                    try:
+                        proqid = record['001'].value()
+                    except Exception as err:
+                        print('unable to get proquest id! %s ' % err)
+                    else:
+                        print('I am processing ProQuest ID# %s' % proqid)
+                        pubnr = proqid.replace(u'AAI', u'')
 
-                # ISBN (020)
-                try:
-                    isbn = record['020']['a']
-                except Exception as err:
-                    isbn = ''
+                    # MARC 2.1 fixed length data elements (005)
+                    flde = record['005'].value()
+                    # Publication Year
+                    pubyear = flde[0:4]
+                    # Defense Year
+                    def_year = flde[7:11]
+                    # Language
+                    language = flde[35:38]
 
-                # Author (100)
-                try:
-                    author = re.sub('\\.$', '', record['100']['a'].strip())
-                    # author = auth_parse.parse(author)
-                except Exception as err:
-                    author = ''
+                    # ISBN (020)
+                    try:
+                        isbn = record['020']['a']
+                    except Exception as err:
+                        isbn = ''
+
+                    # Author (100)
+                    try:
+                        author = re.sub(u'\\.$', u'', record['100']['a'].strip())
+                        # author = auth_parse.parse(author)
+                    except Exception as err:
+                        author = ''
 
-                # Title
-                try:
-                    title = re.sub('\\.$', '', record['245']['a'].strip())
-                except Exception as err:
-                    title = ''
+                    # Title
+                    try:
+                        title = re.sub(u'\\.$', u'', record['245']['a'].strip())
+                    except Exception as err:
+                        title = ''
 
-                # Page length
-                try:
-                    npage = record['300']['a']
-                except Exception as err:
-                    npage = ''
+                    # Page length
+                    try:
+                        npage = record['300']['a']
+                    except Exception as err:
+                        npage = ''
 
-                # Source
-                try:
-                    school = record['502']['a']
-                except Exception as err:
-                    pass
-                else:
-                    jfield.append(school)
-                jfield.append('Publication Number: %s' % re.sub('AAI', 'AAT ', proqid))
-                if isbn:
-                    jfield.append('ISBN: %s' % isbn)
+                    # Source
+                    try:
+                        school = record['502']['a']
+                    except Exception as err:
+                        pass
+                    else:
+                        jfield.append(school)
+                    jfield.append('Publication Number: %s' % re.sub('AAI', 'AAT ', proqid))
+                    if isbn:
+                        jfield.append('ISBN: %s' % isbn)
 
-                try:
-                    publsh = record['500']['a']
-                except Exception as err:
-                    pass
-                else:
-                    jfield.append(publsh)
-
-                if npage:
-                    jfield.append(npage)
-
-                # Abstract (multiline field: 520)
-                abstract = ''
-                for l in record.get_fields('520'):
                     try:
-                        abstract += ' ' + l.value().strip()
+                        publsh = record['500']['a']
                     except Exception as err:
                         pass
-                abstract = abstract.strip()
+                    else:
+                        jfield.append(publsh)
 
-                # ADS Collection/Database
-                (databases, subjects) = self.get_db(record)
+                    if npage:
+                        jfield.append(npage)
 
-                # Affil
-                affil = ''
-                try:
-                    affil = record['710']['a'].rstrip('.')
-                except Exception as err:
-                    pass
-                else:
+                    # Abstract (multiline field: 520)
+                    abstract = ''
+                    for l in record.get_fields('520'):
+                        try:
+                            abstract += ' ' + l.value().strip()
+                        except Exception as err:
+                            pass
+                    abstract = abstract.strip()
+
+                    # ADS Collection/Database
+                    (databases, subjects) = self.get_db(record)
+
+                    # Affil
+                    affil = ''
                     try:
-                        a2 = record['710']['b'].rstrip('.')
+                        affil = record['710']['a'].rstrip('.')
                     except Exception as err:
                         pass
                     else:
-                        affil = a2 + ', ' + affil
+                        try:
+                            a2 = record['710']['b'].rstrip('.')
+                        except Exception as err:
+                            pass
+                        else:
+                            affil = a2 + ', ' + affil
 
-                # Advisor
-                advisor = []
-                comments = []
-                try:
-                    for e in record.get_fields('790'):
-                        if e['e']:
-                            advisor.append(e['a'])
-                    if advisor:
-                        comments.append('Advisor: %s' % advisor[0])
-                except Exception as err:
-                    pass
+                    # Advisor
+                    advisor = []
+                    comments = []
+                    try:
+                        for e in record.get_fields('790'):
+                            if e['e']:
+                                advisor.append(e['a'])
+                        if advisor:
+                            comments.append('Advisor: %s' % advisor[0])
+                    except Exception as err:
+                        pass
 
-                # Pubdate
-                try:
-                    pubdate = record['792']['a']
-                except Exception as err:
-                    pubdate = ''
+                    # Pubdate
+                    try:
+                        pubdate = record['792']['a']
+                    except Exception as err:
+                        pubdate = ''
 
-                # Language
-                lang = []
-                try:
-                    for l in record.get_fields('793'):
-                        ln = l.value().strip() 
-                        lang.append(ln)
-                except Exception as err:
-                    pass
+                    # Language
+                    lang = []
+                    try:
+                        for l in record.get_fields('793'):
+                            ln = l.value().strip() 
+                            lang.append(ln)
+                    except Exception as err:
+                        pass
+
+                    # properties
+                    properties = dict()
+                    if pubnr in self.oa_pubnum:
+                        properties['OPEN'] = 1
 
-                # properties
-                properties = dict()
-                if pubnr in self.oa_pubnum:
-                    properties['OPEN'] = 1
-                    # new_proqid = proqid.replace('AAI','AAT ')
-                    url = oa_base % pubnr
-                else:
                     url = url_base % pubnr
-                properties['ELECTR'] = url
 
-                try:
-                    output_metadata['source'] = datasource
-                except:
-                    print('datasource missing')
-                try:
-                    output_metadata['authors'] = author
-                except:
-                    print('author missing')
-                try:
-                    output_metadata['affiliations'] = [affil]
-                except:
-                    print('affil missing')
-                try:
-                    output_metadata['title'] = title
-                except:
-                    print('title missing')
-                try:
-                    output_metadata['abstract'] = abstract
-                except:
-                    print('abstract missing')
-                try:
-                    output_metadata['publication'] = '; '.join(jfield)
-                except:
-                    print('jfield missing')
-                if pubdate:
-                    output_metadata['pubdate'] = "%s" % pubdate
-                if databases:
-                    output_metadata['database'] = databases
-                if comments:
-                    output_metadata['comments'] = comments
-                # if keywords:
-                    # output_metadata['keywords'] = keywords
-                if lang:
-                    output_metadata['language'] = lang
-                if subjects:
-                    output_metadata['subjectcategory'] = subjects
-                if properties:
-                    output_metadata['properties'] = properties
+                    properties['ELECTR'] = url
+
+                    try:
+                        output_metadata['source'] = datasource
+                    except:
+                        print('datasource missing')
+                    try:
+                        output_metadata['authors'] = author
+                    except:
+                        print('author missing')
+                    try:
+                        output_metadata['affiliations'] = [affil]
+                    except:
+                        print('affil missing')
+                    try:
+                        output_metadata['title'] = title
+                    except:
+                        print('title missing')
+                    try:
+                        output_metadata['abstract'] = abstract
+                    except:
+                        print('abstract missing')
+                    try:
+                        output_metadata['publication'] = '; '.join(jfield)
+                    except:
+                        print('jfield missing')
+                    if pubdate:
+                        output_metadata['pubdate'] = "%s" % pubdate
+                    if databases:
+                        output_metadata['database'] = databases
+                    if comments:
+                        output_metadata['comments'] = comments
+                    # if keywords:
+                        # output_metadata['keywords'] = keywords
+                    if lang:
+                        output_metadata['language'] = lang
+                    if subjects:
+                        output_metadata['subjectcategory'] = subjects
+                    if properties:
+                        output_metadata['properties'] = properties
 
             except Exception as err:
                 print("Record skipped, MARC parsing failed: %s" % err)

diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,4 @@
-adsputils==1.4.3
+adsputils>=1.4.3
 bs4==0.0.1
 feedparser==6.0.8
 future==0.18.2