Skip to content

Commit

Permalink
Dublincore parser to replace arxiv parser (#84)
Browse files Browse the repository at this point in the history
* Creating generic dublincore parser

* Replaced arxiv parser with dubcore parser

* removed identifier from header and added publisher field

---------

Co-authored-by: Mugdha Polimera <mugdhapolimera@saos-MacBook-Pro.local>
Co-authored-by: Mugdha Polimera <mugdhapolimera@saos-mbp.lan>
  • Loading branch information
3 people committed Jan 5, 2024
1 parent 1ebe706 commit 0938a1c
Show file tree
Hide file tree
Showing 9 changed files with 409 additions and 89 deletions.
2 changes: 1 addition & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"python.linting.enabled": true,
"[python]": {
"editor.codeActionsOnSave": {
"source.organizeImports": true
"source.organizeImports": "explicit"
}
},
"terminal.integrated.env.linux": {
Expand Down
49 changes: 22 additions & 27 deletions adsingestp/parsers/arxiv.py → adsingestp/parsers/dubcore.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,18 @@
logger = logging.getLogger(__name__)


class MultiArxivParser(IngestBase):
class MultiDublinCoreParser(IngestBase):
start_re = r"<record(?!-)[^>]*>"
end_re = r"</record(?!-)[^>]*>"

def parse(self, text, header=False):
"""
Separate multi-record arXiv XML document into individual XML documents
Separate multi-record DublinCore XML document into individual XML documents
:param text: string, input XML text from a multi-record XML document
:param header: boolean (default: False), set to True to preserve overall
document header/footer for each separate record's document
:return: list, each item is the XML of a separate arXiv document
:return: list, each item is the XML of a separate DublinCore document
"""
output_chunks = []
for chunk in self.get_chunks(text, self.start_re, self.end_re, head_foot=header):
Expand All @@ -33,15 +33,14 @@ def parse(self, text, header=False):
return output_chunks


class ArxivParser(BaseBeautifulSoupParser):
# Dublin Core parser for arXiv
class DublinCoreParser(BaseBeautifulSoupParser):
# Generic Dublin Core parser

DUBCORE_SCHEMA = ["http://www.openarchives.org/OAI/2.0/oai_dc/"]

author_collaborations_params = {
"keywords": ["group", "team", "collaboration"],
"remove_the": False,
"fix_arXiv_mixed_collaboration_string": True,
}

def __init__(self):
Expand All @@ -50,24 +49,14 @@ def __init__(self):
self.input_metadata = None

def _parse_ids(self):
if self.input_header.find("identifier"):
ids = self.input_header.find("identifier").get_text()
id_array = ids.split(":")
arxiv_id = id_array[-1]
self.base_metadata["ids"] = {}
self.base_metadata["ids"]["pub-id"] = []

# TODO what should the key on this actually be?
self.base_metadata["publication"] = "eprint arXiv:" + arxiv_id

self.base_metadata["ids"] = {"preprint": {}}

self.base_metadata["ids"]["preprint"]["source"] = "arXiv"
self.base_metadata["ids"]["preprint"]["id"] = arxiv_id

dc_ids = self.input_metadata.find_all("dc:identifier")
for d in dc_ids:
d_text = d.get_text()
if "doi:" in d_text:
self.base_metadata["ids"]["doi"] = d_text.replace("doi:", "")
if self.input_metadata.find("dc:identifier"):
for dc_id in self.input_metadata.find_all("dc:identifier"):
self.base_metadata["ids"]["pub-id"].append(
{"attribute": "publisher-id", "Identifier": dc_id.get_text()}
)

def _parse_title(self):
title_array = self.input_metadata.find_all("dc:title")
Expand Down Expand Up @@ -104,16 +93,21 @@ def _parse_pubdate(self):
"dc:date"
).get_text()

def _parse_publisher(self):
if self.input_metadata.find("dc:publisher"):
self.base_metadata["publisher"] = self.input_metadata.find("dc:publisher").get_text()

def _parse_abstract(self):
desc_array = self.input_metadata.find_all("dc:description")
# for arXiv.org, only 'dc:description'[0] is the abstract, the rest are comments
# in general, only 'dc:description'[0] is the abstract, the rest are comments
if desc_array:
self.base_metadata["abstract"] = self._clean_output(desc_array.pop(0).get_text())

if desc_array:
comments_out = []
for d in desc_array:
comments_out.append({"origin": "arxiv", "text": self._clean_output(d.get_text())})
# TODO: FIX
comments_out.append({"text": self._clean_output(d.get_text())})

self.base_metadata["comments"] = comments_out

Expand All @@ -123,12 +117,12 @@ def _parse_keywords(self):
if keywords_array:
keywords_out = []
for k in keywords_array:
keywords_out.append({"system": "arxiv", "string": k.get_text()})
keywords_out.append({"string": k.get_text()})
self.base_metadata["keywords"] = keywords_out

def parse(self, text):
"""
Parse arXiv XML into standard JSON format
Parse DublinCore XML into standard JSON format
:param text: string, contents of XML file
:return: parsed file contents in JSON format
"""
Expand All @@ -154,6 +148,7 @@ def parse(self, text):
self._parse_pubdate()
self._parse_abstract()
self._parse_keywords()
self._parse_publisher()

self.base_metadata = self._entity_convert(self.base_metadata)

Expand Down
33 changes: 33 additions & 0 deletions tests/stubdata/input/dubcore_pos_ecrs_002.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
<record>
<header>
<identifier>oai:pos.sissa.it:ECRS/002</identifier>
<datestamp>2023-02-15</datestamp>
<setSpec>conference:ECRS</setSpec>
<setSpec>group:14</setSpec>
</header>
<metadata>
<oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd">
<dc:title>The Memories of the First European Cosmic Ray Symposium: Łódź 1968</dc:title>
<dc:creator>Alan Watson</dc:creator>
<dc:subject>Astroparticle Physics</dc:subject>
<dc:description>The origins of the series of European Cosmic-Ray Symposia are briefly described. The first
meeting in the series, on ‘Hadronic Interactions and Extensive Air Showers’, held in Łódź, Poland in 1968, was attended by
the author: some memories are recounted.</dc:description>
<dc:publisher>Sissa Medialab</dc:publisher>
<dc:date>2023-02-15</dc:date>
<dc:type>Text</dc:type>
<dc:format>application/pdf</dc:format>
<dc:identifier>PoS(ECRS)002</dc:identifier>
<dc:identifier>10.22323/1.423.0002</dc:identifier>
<dc:identifier>https://pos.sissa.it/423/002/</dc:identifier>
<dc:language>en</dc:language>
<dc:relation>ECRS (27th European Cosmic Ray Symposium) Opening; isPartOf</dc:relation>
<dc:rights>Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International License (CC BY-NC-ND
4.0)</dc:rights>
</oai_dc:dc>
</metadata>
</record>
121 changes: 104 additions & 17 deletions tests/stubdata/output/arxiv_0901_2443.json
Original file line number Diff line number Diff line change
@@ -1,17 +1,104 @@
{"abstract": {"textEnglish": "The $^{112,120}$Sn$(\\gamma,\\gamma')$ reactions have been studied at the S-DALINAC. Electric dipole (E1) strength distributions have been determined including contributions from unresolved strength extracted by a fluctuation analysis. Together with available data on $^{116,124}$Sn, an experimental systematics of the pygmy dipole resonance (PDR) in stable even-mass tin isotopes is established. The PDR centroid excitation energies and summed strengths are in reasonable agreement with quasiparticle-phonon model calculations based on a nonrelativistic description of the mean field but disagree with relativistic quasiparticle random-phase approximation predictions."},
"authors": [{"name": {"given_name": "B.", "pubraw": "Özel, B.", "surname": "Özel"}},
{"name": {"given_name": "J.", "pubraw": "Enders, J.", "surname": "Enders"}},
{"name": {"given_name": "H.", "pubraw": "Lenske, H.", "surname": "Lenske"}},
{"name": {"given_name": "P.", "pubraw": "von Neumann-Cosel, P.", "surname": "von Neumann-Cosel"}},
{"name": {"given_name": "I.", "pubraw": "Poltoratska, I.", "surname": "Poltoratska"}},
{"name": {"given_name": "V.", "middle_name": "Yu.", "pubraw": "Ponomarev, V. Yu.", "surname": "Ponomarev"}},
{"name": {"given_name": "A.", "pubraw": "Richter, A.", "surname": "Richter"}},
{"name": {"given_name": "D.", "pubraw": "Savran, D.", "surname": "Savran"}},
{"name": {"given_name": "N.", "pubraw": "Tsoneva, N.", "surname": "Tsoneva"}}],
"comments": [{"commentOrigin": "arxiv", "commentText": "Comment: submitted to Phys. Lett. B"}],
"keywords": [{"keyString": "Nuclear Experiment", "keySystem": "arxiv"}],
"persistentIDs": [{"preprint": {"identifier": "0901.2443", "source": "arXiv"}}],
"pubDate": {"electrDate": "2009-01-16"},
"publication": {"pubName": "eprint arXiv:0901.2443", "pubYear": "2009"},
"recordData": {"createdTime": "", "loadFormat": "OtherXML", "loadLocation": "", "loadType": "fromFile", "parsedTime": "", "recordOrigin": ""},
"title": {"textEnglish": "Excitation energy and strength of the pygmy dipole resonance in stable tin isotopes"}}
{
"abstract": {
"textEnglish": "The $^{112,120}$Sn$(\\gamma,\\gamma')$ reactions have been studied at the S-DALINAC. Electric dipole (E1) strength distributions have been determined including contributions from unresolved strength extracted by a fluctuation analysis. Together with available data on $^{116,124}$Sn, an experimental systematics of the pygmy dipole resonance (PDR) in stable even-mass tin isotopes is established. The PDR centroid excitation energies and summed strengths are in reasonable agreement with quasiparticle-phonon model calculations based on a nonrelativistic description of the mean field but disagree with relativistic quasiparticle random-phase approximation predictions."
},
"authors": [
{
"name": {
"given_name": "B.",
"pubraw": "\u00d6zel, B.",
"surname": "\u00d6zel"
}
},
{
"name": {
"given_name": "J.",
"pubraw": "Enders, J.",
"surname": "Enders"
}
},
{
"name": {
"given_name": "H.",
"pubraw": "Lenske, H.",
"surname": "Lenske"
}
},
{
"name": {
"given_name": "P.",
"pubraw": "von Neumann-Cosel, P.",
"surname": "von Neumann-Cosel"
}
},
{
"name": {
"given_name": "I.",
"pubraw": "Poltoratska, I.",
"surname": "Poltoratska"
}
},
{
"name": {
"given_name": "V.",
"middle_name": "Yu.",
"pubraw": "Ponomarev, V. Yu.",
"surname": "Ponomarev"
}
},
{
"name": {
"given_name": "A.",
"pubraw": "Richter, A.",
"surname": "Richter"
}
},
{
"name": {
"given_name": "D.",
"pubraw": "Savran, D.",
"surname": "Savran"
}
},
{
"name": {
"given_name": "N.",
"pubraw": "Tsoneva, N.",
"surname": "Tsoneva"
}
}
],
"comments": [
{
"commentText": "Comment: submitted to Phys. Lett. B"
}
],
"keywords": [
{
"keyString": "Nuclear Experiment"
}
],
"pubDate": {
"electrDate": "2009-01-16"
},
"publication": {
"pubYear": "2009"
},
"publisherIDs": [
{
"Identifier": "http://arxiv.org/abs/0901.2443",
"attribute": "publisher-id"
}
],
"recordData": {
"createdTime": "",
"loadFormat": "OtherXML",
"loadLocation": "",
"loadType": "fromFile",
"parsedTime": "",
"recordOrigin": ""
},
"title": {
"textEnglish": "Excitation energy and strength of the pygmy dipole resonance in stable tin isotopes"
}
}
89 changes: 76 additions & 13 deletions tests/stubdata/output/arxiv_1711_04702.json
Original file line number Diff line number Diff line change
@@ -1,13 +1,76 @@
{"abstract": {"textEnglish": "Background: Gene co-expression network analyses have become a central approach for the systems-level analysis of biological data. Several software packages exist for generating and analyzing such networks, either from correlation scores or the absolute value of a transformed score called weighted topological overlap (wTO). However, since some genes are able to up- or down-regulate other genes, it is important to explicitly consider both positive and negative correlations when constructing a gene co-expression network. Additionally, there has been a growing interest in the systematic comparison of multiple networks to identify deferentially changed links. Typically, such analyses are focused on the comparison of networks or data from two different conditions. Results: Here, we present an R package for calculating the weighted topological overlap (wTO), that explicitly addresses the sign of wTO values. The package includes the calculation of p-values (raw and adjusted) for each pairwise gene score. Our package also allows the calculation of networks from time series, without replicates. Additionally, our R package incorporates a novel method for calculating a consensus network (CN) from two or more networks. To visualize the resulting networks, the R package contains a visualization tool which allows for the direct network manipulation and access of node and link information. When testing the package on a standard laptop computer, we can conduct all calculations for systems of 20,000 genes in under two hours. Conclusion: In this work, we developed an R package that allows the computation of wTO networks, CNs and a visualization tool in the R statistical environment. It is publicly available on CRAN repositories under the GPL-2 Open Source License (https://cran.r-project.org/web/packages/wTO/)."},
"authors": [{"name": {"given_name": "Deisy", "pubraw": "Gysi, Deisy Morselli", "surname": "Morselli Gysi"}},
{"name": {"given_name": "Andre", "pubraw": "Voigt, Andre", "surname": "Voigt"}},
{"name": {"given_name": "Tiago", "pubraw": "Fragoso, Tiago de Miranda", "middle_name": "de Miranda", "surname": "Fragoso"}},
{"name": {"given_name": "Eivind", "pubraw": "Almaas, Eivind", "surname": "Almaas"}},
{"name": {"given_name": "Katja", "pubraw": "Nowick, Katja", "surname": "Nowick"}}],
"comments": [{"commentOrigin": "arxiv", "commentText": "Comment: 13 pages, 3 Figures"}],
"keywords": [{"keyString": "Quantitative Biology - Molecular Networks", "keySystem": "arxiv"}],
"persistentIDs": [{"preprint": {"identifier": "1711.04702", "source": "arXiv"}}],
"pubDate": {"electrDate": "2017-11-13"},
"publication": {"pubName": "eprint arXiv:1711.04702", "pubYear": "2017"},
"recordData": {"createdTime": "", "loadFormat": "OtherXML", "loadLocation": "", "loadType": "fromFile", "parsedTime": "", "recordOrigin": ""},
"title": {"textEnglish": "wTO: an R package for computing weighted topological overlap and consensus networks with an integrated visualization tool"}}
{
"abstract": {
"textEnglish": "Background: Gene co-expression network analyses have become a central approach for the systems-level analysis of biological data. Several software packages exist for generating and analyzing such networks, either from correlation scores or the absolute value of a transformed score called weighted topological overlap (wTO). However, since some genes are able to up- or down-regulate other genes, it is important to explicitly consider both positive and negative correlations when constructing a gene co-expression network. Additionally, there has been a growing interest in the systematic comparison of multiple networks to identify deferentially changed links. Typically, such analyses are focused on the comparison of networks or data from two different conditions. Results: Here, we present an R package for calculating the weighted topological overlap (wTO), that explicitly addresses the sign of wTO values. The package includes the calculation of p-values (raw and adjusted) for each pairwise gene score. Our package also allows the calculation of networks from time series, without replicates. Additionally, our R package incorporates a novel method for calculating a consensus network (CN) from two or more networks. To visualize the resulting networks, the R package contains a visualization tool which allows for the direct network manipulation and access of node and link information. When testing the package on a standard laptop computer, we can conduct all calculations for systems of 20,000 genes in under two hours. Conclusion: In this work, we developed an R package that allows the computation of wTO networks, CNs and a visualization tool in the R statistical environment. It is publicly available on CRAN repositories under the GPL-2 Open Source License (https://cran.r-project.org/web/packages/wTO/)."
},
"authors": [
{
"name": {
"given_name": "Deisy",
"pubraw": "Gysi, Deisy Morselli",
"surname": "Morselli Gysi"
}
},
{
"name": {
"given_name": "Andre",
"pubraw": "Voigt, Andre",
"surname": "Voigt"
}
},
{
"name": {
"given_name": "Tiago",
"middle_name": "de Miranda",
"pubraw": "Fragoso, Tiago de Miranda",
"surname": "Fragoso"
}
},
{
"name": {
"given_name": "Eivind",
"pubraw": "Almaas, Eivind",
"surname": "Almaas"
}
},
{
"name": {
"given_name": "Katja",
"pubraw": "Nowick, Katja",
"surname": "Nowick"
}
}
],
"comments": [
{
"commentText": "Comment: 13 pages, 3 Figures"
}
],
"keywords": [
{
"keyString": "Quantitative Biology - Molecular Networks"
}
],
"pubDate": {
"electrDate": "2017-11-13"
},
"publication": {
"pubYear": "2017"
},
"publisherIDs": [
{
"Identifier": "http://arxiv.org/abs/1711.04702",
"attribute": "publisher-id"
}
],
"recordData": {
"createdTime": "",
"loadFormat": "OtherXML",
"loadLocation": "",
"loadType": "fromFile",
"parsedTime": "",
"recordOrigin": ""
},
"title": {
"textEnglish": "wTO: an R package for computing weighted topological overlap and consensus networks with an integrated visualization tool"
}
}

0 comments on commit 0938a1c

Please sign in to comment.