Skip to content

Commit

Permalink
Added handling for simple-article (#93)
Browse files Browse the repository at this point in the history
* Added handling for simple-article

* raise exception if no schema found

* lint fix

* better exception handling

* added other article types

* added more test cases

* changed doctype for book review

---------

Co-authored-by: Mugdha Polimera <mugdhapolimera@saos-MacBook-Pro.local>
Co-authored-by: Mugdha Polimera <mugdhapolimera@saos-mbp.lan>
Co-authored-by: Mugdha Polimera <mugdhapolimera@dhcp-10-250-214-225.harvard.edu>
  • Loading branch information
4 people committed Mar 6, 2024
1 parent 6473c3f commit 1188a5e
Show file tree
Hide file tree
Showing 22 changed files with 5,297 additions and 119 deletions.
47 changes: 41 additions & 6 deletions adsingestp/parsers/elsevier.py
Expand Up @@ -4,7 +4,7 @@
import validators

from adsingestp import utils
from adsingestp.ingest_exceptions import XmlLoadException
from adsingestp.ingest_exceptions import NoSchemaException, XmlLoadException
from adsingestp.parsers.base import BaseBeautifulSoupParser

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -135,14 +135,22 @@ def _parse_edhistory(self):

def _parse_title_abstract(self):
if self.record_meta.find("ce:title"):
self.base_metadata["title"] = self.record_meta.find("ce:title").get_text()
self.base_metadata["title"] = self._clean_output(
self.record_meta.find("ce:title").get_text()
)
elif self.record_header.find("dct:title"):
self.base_metadata["title"] = self.record_header.find("dct:title").get_text()
self.base_metadata["title"] = self._clean_output(
self.record_header.find("dct:title").get_text()
)
elif self.record_meta.find("cd:textfn"):
self.base_metadata["title"] = self.record_meta.find("cd:textfn").get_text()
self.base_metadata["title"] = self._clean_output(
self.record_meta.find("cd:textfn").get_text()
)

if self.record_meta.find("ce:subtitle"):
self.base_metadata["subtitle"] = self.record_meta.find("ce:subtitle").get_text()
self.base_metadata["subtitle"] = self._clean_output(
self.record_meta.find("ce:subtitle").get_text()
)

if self.record_meta.find("ce:abstract"):
abstract = ""
Expand Down Expand Up @@ -343,6 +351,27 @@ def _parse_esources(self):

self.base_metadata["esources"] = links

def _find_article_type(self, d):
    """
    Identify which known Elsevier document element is present in the parsed XML

    :param d: parsed XML document; only its ``find(name)`` method is used here
    :return: tuple of (tag name that was found, doctype string mapped to that tag)
    :raises NoSchemaException: if none of the known document elements is found
    """
    # Tags are probed in the order listed; the first one present in the document wins
    article_types = {
        "cja:converted-article": "article",
        "ja:article": "article",
        "ja:simple-article": "article",
        "ja:book-review": "article",
        "ja:exam": "nonarticle",
        "bk:book": "book",
        "bk:chapter": "inbook",
        "bk:simple-chapter": "inbook",
        "bk:examination": "nonarticle",
        "bk:fb-non-chapter": "inbook",
        "bk:glossary": "inbook",
        "bk:index": "inbook",
        "bk:introduction": "inbook",
        "bk:bibliography": "inbook",
    }
    for art_type, doctype in article_types.items():
        if d.find(art_type) is not None:
            return art_type, doctype

    # Falling off the end would implicitly return None, and the caller's
    # two-value unpacking would then fail with an opaque TypeError instead of
    # reporting that the document matches no supported schema.
    raise NoSchemaException("No Schema Found")

def parse(self, text):
"""
Parse Elsevier XML into standard JSON format
Expand All @@ -355,7 +384,13 @@ def parse(self, text):
raise XmlLoadException(err)

self.record_header = d.find("rdf:Description")
self.record_meta = d.find("ja:article")

article_type, document_enum = self._find_article_type(d)
self.base_metadata["doctype"] = document_enum
self.record_meta = d.find(article_type)

if self.record_meta is None:
raise NoSchemaException("No Schema Found")

self._parse_pub()
self._parse_issue()
Expand Down
1 change: 1 addition & 0 deletions tests/stubdata/input/els_book_chapter.xml

Large diffs are not rendered by default.

62 changes: 62 additions & 0 deletions tests/stubdata/input/els_book_review.xml
@@ -0,0 +1,62 @@
<doc:document xmlns:doc="http://www.elsevier.com/xml/document/schema" xmlns:dp="http://www.elsevier.com/xml/common/doc-properties/schema" xmlns:cps="http://www.elsevier.com/xml/common/consyn-properties/schema" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:dct="http://purl.org/dc/terms/" xmlns:prism="http://prismstandard.org/namespaces/basic/2.0/" xmlns:oa="http://vtw.elsevier.com/data/ns/properties/OpenAccess-1/" xmlns:bam="http://vtw.elsevier.com/data/voc/ns/bam-vtw-1/" xmlns:cp="http://vtw.elsevier.com/data/ns/properties/Copyright-1/" xmlns:cja="http://www.elsevier.com/xml/cja/schema" xmlns:ja="http://www.elsevier.com/xml/ja/schema" xmlns:bk="http://www.elsevier.com/xml/bk/schema" xmlns:ce="http://www.elsevier.com/xml/common/schema" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:cals="http://www.elsevier.com/xml/common/cals/schema" xmlns:tb="http://www.elsevier.com/xml/common/table/schema" xmlns:sa="http://www.elsevier.com/xml/common/struct-aff/schema" xmlns:sb="http://www.elsevier.com/xml/common/struct-bib/schema" xmlns:xlink="http://www.w3.org/1999/xlink"><rdf:RDF><rdf:Description rdf:about="http://dx.doi.org/10.1016/j.enbuild.2006.02.002"><dct:format>application/xml</dct:format><dct:title>D. Clements-Croome Intelligent Buildings: Design, Management and Operation 2004 Thomas Telford Publishing (424pp., &#163;49.50, ISBN 0-7277-3266-8)</dct:title><dct:creator>Branislav Todorovic</dct:creator><dct:description>Energy &amp; Buildings 38 (2006) 712-713. doi:10.1016/j.enbuild.2006.02.002</dct:description><prism:aggregationType>journal</prism:aggregationType><prism:publicationName>Energy &amp; Buildings</prism:publicationName><prism:copyright>Copyright @ 2006 Elsevier B.V. 
All rights reserved.</prism:copyright><dct:publisher>Elsevier B.V.</dct:publisher><prism:issn>0378-7788</prism:issn><prism:volume>38</prism:volume><prism:number>6</prism:number><prism:coverDisplayDate/><prism:coverDate>2006</prism:coverDate><prism:pageRange>712-713</prism:pageRange><prism:startingPage>712</prism:startingPage><prism:endingPage>713</prism:endingPage><prism:doi>10.1016/j.enbuild.2006.02.002</prism:doi><prism:url>http://dx.doi.org/10.1016/j.enbuild.2006.02.002</prism:url><dct:identifier>doi:10.1016/j.enbuild.2006.02.002</dct:identifier><dp:availableOnlineInformation><bam:availableOnline xmlns:cp="http://www.elsevier.com/xml/common/consyn-properties/schema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">2006-03-13T00:00:00.000Z</bam:availableOnline></dp:availableOnlineInformation></rdf:Description></rdf:RDF><dp:document-properties><dp:aggregation-type>Journals</dp:aggregation-type><dp:version-number>S300.1</dp:version-number></dp:document-properties><ja:book-review version="5.0" xml:lang="en" docsubtype="brv">
<ja:item-info>
<ja:jid>ENB</ja:jid>
<ja:aid>2040</ja:aid>
<ce:pii>S0378-7788(06)00035-1</ce:pii>
<ce:doi>10.1016/j.enbuild.2006.02.002</ce:doi>
<ce:copyright type="full-transfer" year="2006">Elsevier B.V.</ce:copyright>
</ja:item-info>
<ja:book-review-head>
<ce:dochead>
<ce:textfn>Book review</ce:textfn>
</ce:dochead>
<sb:reference>
<sb:host>
<sb:edited-book>
<sb:editors>
<sb:editor>
<ce:given-name>D.</ce:given-name>
<ce:surname>Clements-Croome</ce:surname>
</sb:editor>
</sb:editors>
<sb:title>
<sb:maintitle>Intelligent Buildings: Design, Management and Operation</sb:maintitle>
</sb:title>
<sb:date>2004</sb:date>
<sb:publisher>
<sb:name>Thomas Telford Publishing</sb:name>
</sb:publisher>
</sb:edited-book>
</sb:host>
<sb:comment>(424pp., &#163;49.50, ISBN 0-7277-3266-8)</sb:comment>
</sb:reference>
<ce:author-group>
<ce:author>
<ce:given-name>Branislav</ce:given-name>
<ce:surname>Todorovic</ce:surname>
<ce:roles>Editor</ce:roles>
<ce:cross-ref refid="cor1">
<ce:sup loc="post">&#x204e;</ce:sup>
</ce:cross-ref>
<ce:e-address type="email">todorob@eunet.yu</ce:e-address>
</ce:author>
<ce:affiliation>
<ce:textfn>Mechanical Engineering, University of Belgrade, 27 Marto 80, 11000 Belgrade, Serbia and Montenegro</ce:textfn>
</ce:affiliation>
<ce:correspondence id="cor1">
<ce:label>&#x204e;</ce:label>
<ce:text>Tel.: +381 11 3370363; fax: +381 3370364.</ce:text>
</ce:correspondence>
</ce:author-group>
<ce:date-received day="17" month="6" year="2005"/>
</ja:book-review-head>
<ja:body view="all">
<ce:sections>
<ce:para view="all">The editor, Professor at the School of Construction Management and Engineering-University of Reading and director of the MSc in Intelligent Buildings, writing the first chapter, gives a perfect global view of intelligence. He states &#x201c;the nature of intelligence is one of the big science questions&#x201d;. There are many definitions given of natural and artificial intelligence related to several descriptions of intelligent buildings. Of course building intelligence will change with further developments in information and communication technologies, robotics, smart materials, sustainable issues technology and social change. The editor writes another chapter on &#x201c;Building Environment, Architecture and People&#x201d;, with an overview of 250 appropriate references.</ce:para>
<ce:para view="all">Separate chapters discuss the intelligence of intelligent buildings; information technology, communication and artificial intelligence; as well as engineering intelligence through nature.</ce:para>
<ce:para view="all">The second part of the book concerns &#x201c;management and operation&#x201d;, with financial analysis and investment appraisal including organizational strategy; the management strategy of design and of construction projects; as well as facilities management. Two case studies are presented in this part, and they are very interesting and useful.</ce:para>
<ce:para view="all">In Part III sustainable futures are covered with the relevant trends of culture of living and working which have changed enormously. The final chapter, again written by Professor Clements-Croome, presents &#x201c;sustainable architecture&#x201d;, giving a message which relates to all factors influencing a building: &#x201c;architecture, like music, needs technical resources, but at a higher level it requires interpretation to ignite the spirit of the users&#x201d;.</ce:para>
<ce:para view="all">The book with its philosophical view should be read by all specialists who are engaged in building, from the initial planning of a building to the end of its life time.</ce:para>
</ce:sections>
</ja:body>
</ja:book-review></doc:document>

0 comments on commit 1188a5e

Please sign in to comment.