Skip to content

Commit

Permalink
Merge pull request #55 from seasidesparrow/xref_preprint.20230720
Browse files Browse the repository at this point in the history
fix: adds both funding capture and posted_content doctype to crossref parser
  • Loading branch information
seasidesparrow committed Jul 28, 2023
2 parents ad58eb1 + 273ea4f commit 33ae877
Show file tree
Hide file tree
Showing 16 changed files with 1,931 additions and 6 deletions.
2 changes: 1 addition & 1 deletion adsingestp/parsers/base.py
Expand Up @@ -452,7 +452,7 @@ def format(self, input_dict, format):

# output["pubnote"] = "XXX" # TODO need an example
#
# output["funding"] = "XXX" # TODO need an example
output["funding"] = input_dict.get("funding", [])
#
# output["version"] = "XXX" # TODO need an example

Expand Down
76 changes: 74 additions & 2 deletions adsingestp/parsers/crossref.py
Expand Up @@ -68,6 +68,38 @@ def _get_isbn(self, isbns):

return isbns_out

def _get_funding(self, fundgroups):
funding_arr = []
for fg in fundgroups:
funder = {}
funder_name = fg.find("assertion", {"name": "funder_name"})
funder_award = fg.find("assertion", {"name": "award_number"})
if funder_name:
funder_id = funder_name.find("assertion", {"name": "funder_identifier"})
if funder_id:
funder_id = funder_id.extract()
funder_name = funder_name.extract()
else:
funder_id = None

if funder_name:
funder.setdefault("agencyname", funder_name.get_text().strip())
if funder_id:
funder.setdefault("agencyid", {"idvalue": funder_id.get_text().strip()})
if funder_award:
funder.setdefault("awardnumber", funder_award.extract().get_text().strip())

if funder:
funding_arr.append(funder)

return funding_arr

def _parse_funding(self):
fundgroups = self.record_meta.find_all("assertion", {"name": "fundgroup"})
if fundgroups:
funding = self._get_funding(fundgroups)
self.base_metadata["funding"] = funding

def _parse_pub(self):
# journal articles only
if self.input_metadata.find("journal") and self.input_metadata.find("journal").find(
Expand Down Expand Up @@ -159,6 +191,32 @@ def _parse_book_series(self):
self.base_metadata["series_id"] = series_meta.find("issn").get_text()
self.base_metadata["series_id_description"] = "issn"

def _parse_posted_content(self):
if self.record_meta.find("institution"):
inst_name = None
if self.record_meta.find("institution").find("institution_name"):
inst_name = (
self.record_meta.find("institution").find("institution_name").get_text()
)
if self.record_meta.find("institution").find("institution_acronym"):
if inst_name:
inst_name = (
inst_name
+ " (%s)"
% self.record_meta.find("institution")
.find("institution_acronym")
.get_text()
)
else:
inst_name = (
self.record_meta.find("institution").find("institution_acronym").get_text()
)
if inst_name:
self.base_metadata["publisher"] = inst_name
if self.record_meta.find("posted_date"):
pubdate = self._get_date(self.record_meta.find("posted_date"))
self.base_metadata["pubdate_electronic"] = pubdate

def _parse_title_abstract(self):
if self.record_meta.find("titles") and self.record_meta.find("titles").find("title"):
self.base_metadata["title"] = self.record_meta.find("titles").find("title").get_text()
Expand Down Expand Up @@ -195,7 +253,7 @@ def _parse_contrib(self):

if c.find("ORCID"):
orcid = c.find("ORCID").get_text()
orcid = orcid.replace("http://orcid.org/", "")
orcid = orcid.replace("http://orcid.org/", "").replace("https://orcid.org/", "")
contrib_tmp["orcid"] = orcid

if c.find("affiliation"):
Expand Down Expand Up @@ -350,10 +408,20 @@ def parse(self, text):
self.record_meta = self.input_metadata.find("book_series_metadata").extract()
else:
self.record_meta = None
if self.input_metadata.find("posted_content"):
if type_found:
raise WrongSchemaException("Too many document types found in CrossRef record")
else:
type_found = True
self.record_type = "posted_content"
if self.input_metadata.find("posted_content"):
self.record_meta = self.input_metadata.find("posted_content").extract()
else:
self.record_meta = None

if not type_found:
raise WrongSchemaException(
"Didn't find allowed document type (article, conference, book) in CrossRef record"
"Didn't find allowed document type (article, conference, book, posted_content) in CrossRef record"
)
elif not self.record_meta:
raise WrongSchemaException(
Expand All @@ -380,6 +448,10 @@ def parse(self, text):
if self.record_meta.find("series_metadata"):
self._parse_book_series()

if self.record_type == "posted_content":
self._parse_posted_content()

self._parse_funding()
self._parse_issue()
self._parse_title_abstract()
self._parse_contrib()
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Expand Up @@ -25,7 +25,7 @@ dependencies = [
'nameparser==1.1.1',
'ordered-set==4.1.0',
'python-dateutil==2.8.1',
'adsingestschema @ git+https://github.com/adsabs/ingest_data_model@v1.0.8#egg=adsingestschema',
'adsingestschema @ git+https://github.com/adsabs/ingest_data_model@v1.0.9#egg=adsingestschema',
]


Expand Down
146 changes: 146 additions & 0 deletions tests/stubdata/input/crossref_preprint_10.1002-essoar.10508651.1.xml
@@ -0,0 +1,146 @@
<?xml version="1.0" encoding="UTF-8"?>
<doi_records>
<doi_record owner="10.1002" timestamp="2022-12-05 16:22:55">
<crossref>
<posted_content language="en" type="preprint">
<group_title>Solar System Physics</group_title>
<contributors>
<person_name contributor_role="author" sequence="first">
<given_name>Roberto</given_name>
<surname>Livi</surname>
<affiliation>University of California</affiliation>
<ORCID authenticated="false">https://orcid.org/0000-0002-0396-0547</ORCID>
</person_name>
<person_name contributor_role="author" sequence="additional">
<given_name>Davin E</given_name>
<surname>Larson</surname>
<affiliation>University of California</affiliation>
</person_name>
<person_name contributor_role="author" sequence="additional">
<given_name>Justin C</given_name>
<surname>Kasper</surname>
<affiliation>University of Michigan</affiliation>
<affiliation>Smithsonian Astrophysical Observatory</affiliation>
</person_name>
<person_name contributor_role="author" sequence="additional">
<given_name>Robert</given_name>
<surname>Abiad</surname>
<affiliation>University of California</affiliation>
</person_name>
<person_name contributor_role="author" sequence="additional">
<given_name>Anthony W</given_name>
<surname>Case</surname>
<affiliation>Smithsonian Astrophysical Observatory</affiliation>
</person_name>
<person_name contributor_role="author" sequence="additional">
<given_name>Kristopher G</given_name>
<surname>Klein</surname>
<affiliation>University of Michigan</affiliation>
<affiliation>University of Arizona</affiliation>
</person_name>
<person_name contributor_role="author" sequence="additional">
<given_name>David W</given_name>
<surname>Curtis</surname>
<affiliation>University of California</affiliation>
</person_name>
<person_name contributor_role="author" sequence="additional">
<given_name>Gregory</given_name>
<surname>Dalton</surname>
<affiliation>University of California</affiliation>
</person_name>
<person_name contributor_role="author" sequence="additional">
<given_name>Michael</given_name>
<surname>Stevens</surname>
<affiliation>Smithsonian Astrophysical Observatory</affiliation>
</person_name>
<person_name contributor_role="author" sequence="additional">
<given_name>Kelly E</given_name>
<surname>Korreck</surname>
<affiliation>Smithsonian Astrophysical Observatory</affiliation>
</person_name>
<person_name contributor_role="author" sequence="additional">
<given_name>George</given_name>
<surname>Ho</surname>
<affiliation>Applied Physics Laboratory, Johns Hopkins University</affiliation>
</person_name>
<person_name contributor_role="author" sequence="additional">
<given_name>Miles</given_name>
<surname>Robinson</surname>
<affiliation>University of California</affiliation>
</person_name>
<person_name contributor_role="author" sequence="additional">
<given_name>Chris</given_name>
<surname>Tiu</surname>
<affiliation>NASA</affiliation>
</person_name>
<person_name contributor_role="author" sequence="additional">
<given_name>Phyllis L</given_name>
<surname>Whittlesey</surname>
<affiliation>University of California</affiliation>
</person_name>
<person_name contributor_role="author" sequence="additional">
<given_name>J L</given_name>
<surname>Verniero</surname>
<affiliation>University of California</affiliation>
</person_name>
<person_name contributor_role="author" sequence="additional">
<given_name>Jasper</given_name>
<surname>Halekas</surname>
<affiliation>University of Iowa</affiliation>
</person_name>
<person_name contributor_role="author" sequence="additional">
<given_name>James</given_name>
<surname>Mcfadden</surname>
<affiliation>University of California</affiliation>
</person_name>
<person_name contributor_role="author" sequence="additional">
<given_name>Mario</given_name>
<surname>Marckwordt</surname>
<affiliation>University of California</affiliation>
</person_name>
<person_name contributor_role="author" sequence="additional">
<given_name>Amanda</given_name>
<surname>Slagle</surname>
<affiliation>University of California</affiliation>
</person_name>
<person_name contributor_role="author" sequence="additional">
<given_name>Mamuda</given_name>
<surname>Abatcha</surname>
<affiliation>University of California</affiliation>
</person_name>
<person_name contributor_role="author" sequence="additional">
<given_name>Ali</given_name>
<surname>Rahmati</surname>
<affiliation>University of California</affiliation>
</person_name>
</contributors>
<titles>
<title>The Solar Probe ANalyzer -Ions on Parker Solar Probe</title>
</titles>
<posted_date media_type="online">
<month>11</month>
<day>08</day>
<year>2021</year>
</posted_date>
<institution>
<institution_name>Earth and Space Science Open Archive</institution_name>
<institution_acronym>ESSOAr</institution_acronym>
</institution>
<program name="fundref">
<assertion name="fundgroup">
<assertion name="funder_name">
National Aeronautics and Space Administration
<assertion name="funder_identifier">100000104</assertion>
</assertion>
<assertion name="award_number">NNN06AA01C</assertion>
</assertion>
</program>
<doi_data>
<doi>10.1002/essoar.10508651.1</doi>
<resource>https://essopenarchive.org/doi/full/10.1002/essoar.10508651.1</resource>
</doi_data>
<citation_list />
</posted_content>
</crossref>
</doi_record>
</doi_records>
@@ -0,0 +1,93 @@
<?xml version="1.0" encoding="UTF-8"?>
<doi_records>
<doi_record owner="10.1002" timestamp="2023-01-24 18:54:13">
<crossref>
<posted_content language="en" type="preprint">
<group_title>Planetology</group_title>
<contributors>
<person_name contributor_role="author" sequence="first">
<given_name>Mélanie</given_name>
<surname>Drilleau</surname>
<affiliation>31400 Toulouse</affiliation>
<affiliation>Institut Supérieur de l’Aéronautique et de l’Espace ISAE-SUPAERO</affiliation>
<affiliation>10 Avenue Edouard Belin</affiliation>
<affiliation>France</affiliation>
<ORCID authenticated="false">https://orcid.org/0000-0001-5625-9706</ORCID>
</person_name>
<person_name contributor_role="author" sequence="additional">
<given_name>Henri</given_name>
<surname>Samuel</surname>
<affiliation>Institut de Physique du Globe de Paris, CNRS, Université de Paris, 1 rue Jussieu, 75005 Paris - France</affiliation>
</person_name>
<person_name contributor_role="author" sequence="additional">
<given_name>Raphaël F.</given_name>
<surname>Garcia</surname>
<affiliation>Institut Supérieur de l’Aéronautique et de l’Espace ISAE-SUPAERO, 10 Avenue Edouard Belin, 31400 Toulouse, France</affiliation>
</person_name>
<person_name contributor_role="author" sequence="additional">
<given_name>Attilio</given_name>
<surname>Rivoldini</surname>
<affiliation>Royal Observatory of Belgium, Brussels, Belgium</affiliation>
</person_name>
<person_name contributor_role="author" sequence="additional">
<given_name>Clément</given_name>
<surname>Perrin</surname>
<affiliation>Nantes Université, Université d’Angers, Le Mans Université, CNRS UMR 6112, Laboratoire de Planétologie et Géosciences, UAR 3281, Observatoire des Sciences de l’Univers de Nantes Atlantique, F-44000 Nantes, France</affiliation>
</person_name>
<person_name contributor_role="author" sequence="additional">
<given_name>Chloé</given_name>
<surname>Michaut</surname>
<affiliation>Université de Lyon, Ecole Normale Supérieure de Lyon, Université Claude Bernard Lyon 1, CNRS, Laboratoire de Géologie de Lyon : Terre, Planètes, Environnement, 69622 Villeurbanne, France</affiliation>
</person_name>
<person_name contributor_role="author" sequence="additional">
<given_name>Mark</given_name>
<surname>Wieczorek</surname>
<affiliation>Université Côte d’Azur, Observatoire de la Côte d’Azur, CNRS, Laboratoire Lagrange, France.</affiliation>
</person_name>
<person_name contributor_role="author" sequence="additional">
<given_name>Benoît</given_name>
<surname>Tauzin</surname>
<affiliation>Université de Lyon, Ecole Normale Supérieure de Lyon, Université Claude Bernard Lyon 1, CNRS, Laboratoire de Géologie de Lyon : Terre, Planètes, Environnement, 69622 Villeurbanne, France</affiliation>
</person_name>
<person_name contributor_role="author" sequence="additional">
<given_name>James A. D.</given_name>
<surname>Connolly</surname>
<affiliation>Institute of Geophysics, ETH Zurich, Sonneggstrasse 5, Zurich, Switzerland</affiliation>
</person_name>
<person_name contributor_role="author" sequence="additional">
<given_name>Pauline</given_name>
<surname>Meyer</surname>
<affiliation>Ecole et Observatoire des Sciences de la Terre, Université de Strasbourg, 5 rue René Descartes, 67084 Strasbourg, France</affiliation>
</person_name>
<person_name contributor_role="author" sequence="additional">
<given_name>Philippe</given_name>
<surname>Lognonné</surname>
<affiliation>Institut de Physique du Globe de Paris, CNRS, Université de Paris, 1 rue Jussieu, 75005 Paris - France</affiliation>
</person_name>
<person_name contributor_role="author" sequence="additional">
<given_name>William B.</given_name>
<surname>Banerdt</surname>
<affiliation>Jet Propulsion Laboratory, California Institute of Technology, 4800 Oak Grove Drive, Pasadena, CA 91109, USA</affiliation>
</person_name>
</contributors>
<titles>
<title>Marsquake locations and 1-D seismic models for Mars from InSight data</title>
</titles>
<posted_date media_type="online">
<month>04</month>
<day>11</day>
<year>2022</year>
</posted_date>
<institution>
<institution_name>Earth and Space Science Open Archive</institution_name>
<institution_acronym>ESSOAr</institution_acronym>
</institution>
<doi_data>
<doi>10.1002/essoar.10511074.2</doi>
<resource>https://essopenarchive.org/doi/full/10.1002/essoar.10511074.2</resource>
</doi_data>
<citation_list />
</posted_content>
</crossref>
</doi_record>
</doi_records>

0 comments on commit 33ae877

Please sign in to comment.