Skip to content

Commit

Permalink
fixed subscript, superscript, italics, bold tags for elsevier (#101)
Browse files Browse the repository at this point in the history
* fixed subscript, superscript, italics, bold tags for elsevier

* updating jats output files with allowed html tags

* lint edits

* fixed abstract parsing

* deleted unwanted prints

---------

Co-authored-by: Mugdha Polimera <mugdhapolimera@dhcp-10-250-119-19.harvard.edu>
Co-authored-by: Mugdha Polimera <mugdhapolimera@saos-mbp-6.lan>
Co-authored-by: Mugdha Polimera <mugdhapolimera@dhcp-10-250-19-22.harvard.edu>
  • Loading branch information
4 people committed Apr 17, 2024
1 parent ca0a121 commit b8888cc
Show file tree
Hide file tree
Showing 15 changed files with 9,125 additions and 7,764 deletions.
11 changes: 6 additions & 5 deletions adsingestp/parsers/base.py
Expand Up @@ -488,16 +488,17 @@ class BaseBeautifulSoupParser(IngestBase):
"mml:mover",
"mml:mn",
"mml:annotation",
"mml:msubsup",
]

HTML_TAGS_HTML = ["sub", "sup", "a", "astrobj"]
HTML_TAGS_HTML = ["sub", "sup", "a", "astrobj", "i", "b"]

HTML_TAGSET = {
"title": HTML_TAGS_MATH + HTML_TAGS_HTML,
"abstract": HTML_TAGS_MATH + HTML_TAGS_HTML + ["pre", "br"],
"comments": HTML_TAGS_MATH + HTML_TAGS_HTML + ["pre", "br"],
"title": HTML_TAGS_MATH + HTML_TAGS_HTML + ["a"],
"abstract": HTML_TAGS_MATH + HTML_TAGS_HTML + ["a", "pre", "br"],
"comments": HTML_TAGS_MATH + HTML_TAGS_HTML + ["a", "pre", "br", "p"],
"affiliations": ["email", "orcid"],
"keywords": ["astrobj"],
"keywords": HTML_TAGS_HTML,
}

HTML_TAGS_DANGER = ["php", "script", "css"]
Expand Down
86 changes: 56 additions & 30 deletions adsingestp/parsers/elsevier.py
Expand Up @@ -2,6 +2,7 @@
import re

import validators
from lxml import etree

from adsingestp import utils
from adsingestp.ingest_exceptions import NoSchemaException, XmlLoadException
Expand Down Expand Up @@ -129,55 +130,50 @@ def _parse_edhistory(self):
def _parse_title_abstract(self):
if self.record_meta.find("ce:title"):
self.base_metadata["title"] = self._clean_output(
self._detag(
self.record_meta.find("ce:title"), self.HTML_TAGSET["abstract"]
).strip()
self._detag(self.record_meta.find("ce:title"), self.HTML_TAGSET["title"]).strip()
)
elif self.record_header.find("dct:title"):
self.base_metadata["title"] = self._clean_output(
self._detag(
self.record_header.find("dct:title"), self.HTML_TAGSET["abstract"]
self.record_header.find("dct:title"), self.HTML_TAGSET["title"]
).strip()
)
elif self.record_meta.find("cd:textfn"):
self.base_metadata["title"] = self._clean_output(
self._detag(
self.record_meta.find("ce:textfn"), self.HTML_TAGSET["abstract"]
).strip()
self._detag(self.record_meta.find("ce:textfn"), self.HTML_TAGSET["title"]).strip()
)

if self.record_meta.find("ce:subtitle"):
self.base_metadata["subtitle"] = self._clean_output(
self._detag(
self.record_meta.find("ce:subtitle"), self.HTML_TAGSET["abstract"]
self.record_meta.find("ce:subtitle"), self.HTML_TAGSET["title"]
).strip()
)

if self.record_meta.find("ce:abstract"):
abstract = ""
abs_all = self.record_meta.find_all("ce:abstract")
abs_text_all = ""
for abs in abs_all:
if abs.find("ce:section-title"):
if abs.get("class", None) == "author":
abs_text_all = abs.find_all("ce:simple-para")
elif abs.find("ce:section-title"):
if abs.find("ce:section-title").get_text().lower() == "abstract":
abs_text_all = abs.find_all("ce:simple-para")
abstract = "" # we've found the real abstract, so reset
for abs_text in abs_text_all:
abstract = (
abstract
+ " "
+ self._detag(abs_text, self.HTML_TAGSET["abstract"]).strip()
)
if abstract:
self.base_metadata["abstract"] = abstract
break
elif abs.find("ce:section-title").get_text().lower() == "highlights":
abs_text_all = abs.find_all("ce:para")
for abs_text in abs_text_all:
abstract = (
abstract
+ " "
+ self._detag(abs_text, self.HTML_TAGSET["abstract"]).strip()
)
elif abs.find("ce:section-title").get_text().lower() == "highlights":
abs_text_all = abs.find_all("p")

abstract = ""
for abs_text in abs_text_all:
abstract = (
abstract
+ " "
+ self._detag(abs_text, self.HTML_TAGSET["abstract"]).strip()
)

if abstract:
self.base_metadata["abstract"] = abstract
break

if abstract:
self.base_metadata["abstract"] = self._clean_output(abstract)
Expand Down Expand Up @@ -283,9 +279,9 @@ def _parse_authors(self):
and author.find("ce:e-address").get("type", "") == "email"
):
author_tmp["email"] = author.find("ce:e-address").get_text()
if author.find("ce:cross-ref") and author.find("ce:cross-ref").find("ce:sup"):
if author.find("ce:cross-ref") and author.find("ce:cross-ref").find("sup"):
affs = []
for i in author.find("ce:cross-ref").find_all("ce:sup"):
for i in author.find("ce:cross-ref").find_all("sup"):
aff_label = i.get_text()
# don't append an empty aff
if affs_xref.get(aff_label):
Expand Down Expand Up @@ -371,14 +367,44 @@ def _find_article_type(self, d):
if d.find(art_type, None):
return art_type, article_types[art_type]

def _remove_namespaces(self, text):
    """
    Rename selected formatting/structure elements to plain HTML-style tags

    Despite the method name, namespaces are not stripped wholesale: only
    elements whose local name (the tag name without its namespace URI)
    appears in the conversion table below are renamed to the mapped,
    un-namespaced tag. All other elements keep their original qualified
    names. Namespace declarations that end up unused are then dropped.

    :param text: contents of the XML file, passed directly to etree.fromstring
    :return: the re-serialized XML document, as returned by etree.tostring
    """
    # local name found in the source XML -> replacement tag name
    convert = {
        "italics": "i",
        "italic": "i",
        "bold": "b",
        "sup": "sup",
        "inf": "sub",
        "list": "ul",
        "list-item": "li",
        "para": "p",
    }

    root = etree.fromstring(text)

    # iter() replaces the deprecated getiterator(). Restricting it to
    # etree.Element yields only real elements, so comments and processing
    # instructions (which have no tag name for QName to parse) are never
    # visited.
    for elem in root.iter(tag=etree.Element):
        new_tag = convert.get(etree.QName(elem).localname)
        if new_tag is not None:
            # Assigning a bare name drops the namespace URI from this element
            elem.tag = new_tag

    # Remove namespace declarations that are no longer used
    etree.cleanup_namespaces(root)
    return etree.tostring(root)

def parse(self, text):
"""
Parse Elsevier XML into standard JSON format
:param text: string, contents of XML file
:return: parsed file contents in JSON format
"""
try:
d = self.bsstrtodict(text, parser="lxml-xml")
detagged_text = self._remove_namespaces(text)
d = self.bsstrtodict(detagged_text, parser="lxml-xml")
except Exception as err:
raise XmlLoadException(err)

Expand Down

0 comments on commit b8888cc

Please sign in to comment.