feat: implemented chunking of docs #10

Open · wants to merge 6 commits into base: main

Changes from all commits

4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
@@ -62,6 +62,6 @@ jobs:
with:
python-version: "3.9"
- run: pip install .
- run: pip install '.[dev]'
- run: pip install '.[docs]'
- name: Build docs
run: python rtool.py docs
run: sphinx-build docs .docs
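
The docs job now installs the '.[docs]' extra and calls sphinx-build directly instead of the rtool.py wrapper. For reference, a minimal sketch of the same build driven from Python (an illustration only; it assumes the docs extra provides Sphinx):

# Run the same build as `sphinx-build docs .docs` from Python.
from sphinx.cmd.build import build_main

# build_main takes sphinx-build's argv (program name excluded); with no -b
# flag it uses the default HTML builder, matching the CI step above.
exit_code = build_main(["docs", ".docs"])
raise SystemExit(exit_code)
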
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -20,7 +20,7 @@ repos:
hooks:
- id: isort
- repo: https://github.com/psf/black
rev: 21.11b1
rev: 22.3.0
hooks:
- id: black
- repo: https://github.com/PyCQA/flake8
7 changes: 6 additions & 1 deletion .vscode/settings.json
@@ -12,5 +12,10 @@
"PYTHONPATH": ".:tests"
},
"python.analysis.extraPaths": [],
"python.defaultInterpreterPath": ".venv/bin/python"
"python.defaultInterpreterPath": ".venv/bin/python",
"python.testing.pytestArgs": [
"tests"
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true
}
4 changes: 2 additions & 2 deletions CONTRIBUTING.md
@@ -8,7 +8,7 @@ You can contribute in many ways:

### Report Bugs

Report bugs at <https://github.com/kelockhart/adsingestp/issues>.
Report bugs at <https://github.com/adsabs/adsingestp/issues>.

If you are reporting a bug, please include:

@@ -30,7 +30,7 @@ ADSIngestParser could always use more documentation, whether as part of the offi

### Submit Feedback

The best way to send feedback is to file an issue at <https://github.com/kelockhart/adsingestp/issues>.
The best way to send feedback is to file an issue at <https://github.com/adsabs/adsingestp/issues>.

If you are proposing a feature:

6 changes: 3 additions & 3 deletions README.md
@@ -2,11 +2,11 @@

<p align="center">

![CI Status](https://github.com/kelockhart/ADSIngestParser/actions/workflows/ci.yml/badge.svg)
![CI Status](https://github.com/adsabs/ADSIngestParser/actions/workflows/ci.yml/badge.svg)

<!--
<a href="https://codecov.io/gh/kelockhart/adsingestp">
<img src="https://img.shields.io/codecov/c/github/kelockhart/adsingestp.svg?logo=codecov&logoColor=fff&style=flat-square" alt="Test coverage percentage">
<a href="https://codecov.io/gh/adsabs/adsingestp">
<img src="https://img.shields.io/codecov/c/github/adsabs/adsingestp.svg?logo=codecov&logoColor=fff&style=flat-square" alt="Test coverage percentage">
</a>
//-->
</p>
20 changes: 10 additions & 10 deletions adsingestp/cli.py
@@ -1,15 +1,15 @@
import click

# try:
# import rutils
#
# config = rutils.load_config()
# logger = rutils.setup_logging("adsingestp.cli")
# except ImportError:
# import logging
#
# config = {}
# logger = logging.getLogger("adsingestp.cli")
try:
import lvtn1_utils as utils

config = utils.load_config()
logger = utils.setup_logging("adsingestp.cli")
except ImportError:
import logging

config = {}
logger = logging.getLogger("adsingestp.cli")


@click.group()
2 changes: 2 additions & 0 deletions adsingestp/ingest_exceptions.py
@@ -1,5 +1,6 @@
# TODO add exception handling


class IngestParserException(Exception):
pass

@@ -40,6 +41,7 @@ class UnicodeHandlerError(IngestParserException):
"""
Error in the UnicodeHandler.
"""

pass


47 changes: 24 additions & 23 deletions adsingestp/parsers/arxiv.py
@@ -127,26 +127,27 @@ def _parse_keywords(self):
self.base_metadata["keywords"] = keywords_out

def parse(self, text):
d = self.xmltodict(text)

self.input_header = d.get("record", {}).get("header", {})
self.input_metadata = d.get("record", {}).get("metadata", {}).get("oai_dc:dc", {})

schema_spec = []
for s in self._array(self.input_metadata["@xmlns:oai_dc"]):
schema_spec.append(self._text(s))
if len(schema_spec) == 0:
raise NoSchemaException("Unknown record schema.")
elif schema_spec[0] not in self.DUBCORE_SCHEMA:
raise WrongSchemaException("Wrong schema.")

self._parse_ids()
self._parse_title()
self._parse_author()
self._parse_pubdate()
self._parse_abstract()
self._parse_keywords()

output = serializer.serialize(self.base_metadata, format="OtherXML")

return output
for chunk in self.get_chunks(text, r"<record(?!-)[^>]*>", r"</record(?!-)[^>]*>"):
d = self.xmltodict(chunk)

self.input_header = d.get("record", {}).get("header", {})
self.input_metadata = d.get("record", {}).get("metadata", {}).get("oai_dc:dc", {})

schema_spec = []
for s in self._array(self.input_metadata["@xmlns:oai_dc"]):
schema_spec.append(self._text(s))
if len(schema_spec) == 0:
raise NoSchemaException("Unknown record schema.")
elif schema_spec[0] not in self.DUBCORE_SCHEMA:
raise WrongSchemaException("Wrong schema.")

self._parse_ids()
self._parse_title()
self._parse_author()
self._parse_pubdate()
self._parse_abstract()
self._parse_keywords()

output = serializer.serialize(self.base_metadata, format="OtherXML")

yield output
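
With chunking in place, parse is now a generator that yields one serialized record per <record> chunk instead of returning a single document. A minimal usage sketch (illustrative only; the ArxivParser class name and the input file are assumptions):

from adsingestp.parsers.arxiv import ArxivParser  # assumed class name

# oai_batch.xml is a hypothetical OAI-PMH export containing many <record>s
with open("oai_batch.xml") as fp:
    text = fp.read()

parser = ArxivParser()
for record_xml in parser.parse(text):
    # each iteration produces one fully serialized record
    print(record_xml[:100])
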
27 changes: 27 additions & 0 deletions adsingestp/parsers/base.py
@@ -1,3 +1,5 @@
import re

import bs4
import xmltodict as xmltodict_parser

@@ -52,6 +54,31 @@ def _attr(self, e, a, d=""):
else:
return d

    def get_chunks(self, input, startpattern, endpattern):
        """Super simple (though not especially efficient) method to cut input
        into per-record chunks, preserving header/footer"""

        s = re.compile(startpattern, re.IGNORECASE)
        e = re.compile(endpattern, re.IGNORECASE)
        x = s.search(input)
        if x is None:
            # start pattern not found: yield the whole input
            # (a bare `return input` inside a generator discards the value)
            yield input
            return
        istart = x.start()
        iend = None
        for x in e.finditer(input, istart + 1):
            iend = x.end() + 1
        if iend is None:
            # end pattern not found: yield the whole input
            yield input
            return

        header = input[0:istart]
        footer = input[iend:]

        for snext in s.finditer(input, istart + 1):
            yield header + input[istart : snext.start()] + footer
            istart = snext.start()

        yield header + input[istart:iend] + footer


class BaseBeautifulSoupParser(object):
"""
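
To illustrate the method above: given records separated by newlines (which the iend = x.end() + 1 offset expects), every yielded chunk is a standalone document that re-wraps a single record in the shared header and footer. A toy sketch, assuming get_chunks lives on BaseXmlToDictParser (the CrossrefParser change below adds that base class, apparently to reach it):

from adsingestp.parsers.base import BaseXmlToDictParser  # assumed home of get_chunks

text = "<root>\n<record>A</record>\n<record>B</record>\n</root>"

parser = BaseXmlToDictParser()
for chunk in parser.get_chunks(text, r"<record[^>]*>", r"</record[^>]*>"):
    print(repr(chunk))

# Each record comes out re-wrapped in the shared header and footer:
#   '<root>\n<record>A</record>\n</root>'
#   '<root>\n<record>B</record>\n</root>'
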
137 changes: 70 additions & 67 deletions adsingestp/parsers/crossref.py
@@ -8,12 +8,12 @@
WrongSchemaException,
XmlLoadException,
)
from adsingestp.parsers.base import BaseBeautifulSoupParser
from adsingestp.parsers.base import BaseBeautifulSoupParser, BaseXmlToDictParser

logger = logging.getLogger(__name__)


class CrossrefParser(BaseBeautifulSoupParser):
class CrossrefParser(BaseBeautifulSoupParser, BaseXmlToDictParser):
def __init__(self):
self.base_metadata = {}
self.input_metadata = None
@@ -289,81 +289,84 @@ def _parse_references(self):
self.base_metadata["references"] = ref_list

def parse(self, text):
try:
d = self.bsstrtodict(text, parser="lxml-xml")
records_in_file = d.find_all("doi_record")
if len(records_in_file) > 1:
raise TooManyDocumentsException(
"This file has %s records, should have only one!" % len(records_in_file)
)
except Exception as err:
raise XmlLoadException(err)
for chunk in self.get_chunks(text, r"<record[^>]*>", r"</record[^>]*>"):
try:
d = self.bsstrtodict(chunk, parser="lxml-xml")
records_in_file = d.find_all("doi_record")
if len(records_in_file) > 1:
raise TooManyDocumentsException(
"This file has %s records, should have only one!" % len(records_in_file)
)
except Exception as err:
raise XmlLoadException(err)

try:
self.input_metadata = d.find("crossref").extract()
except Exception as err:
raise NotCrossrefXMLException(err)
try:
self.input_metadata = d.find("crossref").extract()
except Exception as err:
raise NotCrossrefXMLException(err)

type_found = False
self.record_type = None
if self.input_metadata.find("journal"):
type_found = True
self.record_type = "journal"
self.record_meta = self.input_metadata.find("journal_article").extract()
if self.input_metadata.find("conference"):
if type_found:
raise WrongSchemaException("Too many document types found in CrossRef record")
else:
type_found = True
self.record_type = "conference"
self.record_meta = self.input_metadata.find("conference_paper").extract()
if self.input_metadata.find("book"):
if type_found:
raise WrongSchemaException("Too many document types found in CrossRef record")
else:
type_found = False
self.record_type = None
if self.input_metadata.find("journal"):
type_found = True
self.record_type = "book"
if self.input_metadata.find("book_metadata"):
self.record_meta = self.input_metadata.find("book_metadata").extract()
elif self.input_metadata.find("book_series_metadata"):
self.record_meta = self.input_metadata.find("book_series_metadata").extract()

if not type_found:
raise WrongSchemaException(
"Didn't find allowed document type (article, conference, book) in CrossRef record"
)
self.record_type = "journal"
self.record_meta = self.input_metadata.find("journal_article").extract()
if self.input_metadata.find("conference"):
if type_found:
raise WrongSchemaException("Too many document types found in CrossRef record")
else:
type_found = True
self.record_type = "conference"
self.record_meta = self.input_metadata.find("conference_paper").extract()
if self.input_metadata.find("book"):
if type_found:
raise WrongSchemaException("Too many document types found in CrossRef record")
else:
type_found = True
self.record_type = "book"
if self.input_metadata.find("book_metadata"):
self.record_meta = self.input_metadata.find("book_metadata").extract()
elif self.input_metadata.find("book_series_metadata"):
self.record_meta = self.input_metadata.find(
"book_series_metadata"
).extract()

if not type_found:
raise WrongSchemaException(
"Didn't find allowed document type (article, conference, book) in CrossRef record"
)

if self.record_type == "journal":
self._parse_pub()
if self.record_type == "journal":
self._parse_pub()

if self.record_type == "conference":
self._parse_conf_event_proceedings()
if self.record_type == "conference":
self._parse_conf_event_proceedings()

if self.record_type == "book":
if self.record_meta.find("publisher") and self.record_meta.find("publisher").find(
"publisher_name"
):
self.base_metadata["publisher"] = self.record_meta.find(
if self.record_type == "book":
if self.record_meta.find("publisher") and self.record_meta.find("publisher").find(
"publisher_name"
).get_text()
):
self.base_metadata["publisher"] = self.record_meta.find(
"publisher_name"
).get_text()

if self.record_meta.find("isbn"):
self.base_metadata["isbn"] = self._get_isbn(self.record_meta.find_all("isbn"))
if self.record_meta.find("isbn"):
self.base_metadata["isbn"] = self._get_isbn(self.record_meta.find_all("isbn"))

if self.record_meta.find("series_metadata"):
self._parse_book_series()
if self.record_meta.find("series_metadata"):
self._parse_book_series()

self._parse_issue()
self._parse_title_abstract()
self._parse_contrib()
self._parse_pubdate()
self._parse_edhistory_copyright()
self._parse_page()
self._parse_ids()
self._parse_references()
self._parse_issue()
self._parse_title_abstract()
self._parse_contrib()
self._parse_pubdate()
self._parse_edhistory_copyright()
self._parse_page()
self._parse_ids()
self._parse_references()

self.entity_convert()
self.entity_convert()

output = serializer.serialize(self.base_metadata, format="OtherXML")
output = serializer.serialize(self.base_metadata, format="OtherXML")

return output
yield output
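
As with the arXiv parser, parse is now a generator, so callers iterate over or collect its output. A minimal sketch (illustrative only; the input file is an assumption):

from adsingestp.parsers.crossref import CrossrefParser

# crossref_batch.xml is a hypothetical deposit file with multiple <record>s
with open("crossref_batch.xml") as fp:
    records = list(CrossrefParser().parse(fp.read()))

print("parsed %s records" % len(records))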