From 95961d59db620dc71afb5be2b194df8cd6c86b70 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 19 Jul 2022 14:29:38 -0700 Subject: WIP: DOAJ OAI-PMH importer --- python/fatcat_tools/harvest/oaipmh.py | 52 ++++++++++++++++++--- python/tests/files/example_doaj_article_oai.xml | 61 +++++++++++++++++++++++++ python/tests/harvest_doaj.py | 2 + 3 files changed, 108 insertions(+), 7 deletions(-) create mode 100644 python/tests/files/example_doaj_article_oai.xml create mode 100644 python/tests/harvest_doaj.py diff --git a/python/fatcat_tools/harvest/oaipmh.py b/python/fatcat_tools/harvest/oaipmh.py index 19eb6897..c829f2a2 100644 --- a/python/fatcat_tools/harvest/oaipmh.py +++ b/python/fatcat_tools/harvest/oaipmh.py @@ -5,8 +5,9 @@ from typing import Any, Optional import sickle from confluent_kafka import KafkaException, Producer +from bs4 import BeautifulSoup -from .harvest_common import HarvestState +from .harvest_common import HarvestState, requests_retry_session class HarvestOaiPmhWorker: @@ -44,6 +45,9 @@ class HarvestOaiPmhWorker: self.loop_sleep = 60 * 60 # how long to wait, in seconds, between date checks + # optional; not all workers will need or use this HTTP session + self.http_session = requests_retry_session() + self.endpoint_url = None # needs override self.metadata_prefix = None # needs override self.name = "unnamed" @@ -94,14 +98,21 @@ class HarvestOaiPmhWorker: count += 1 if count % 50 == 0: print("... up to {}".format(count), file=sys.stderr) - producer.produce( - self.produce_topic, - item.raw.encode("utf-8"), - key=item.header.identifier.encode("utf-8"), - on_delivery=fail_fast, - ) + self.produce_record(item, producer) producer.flush() + def produce_record(self, item: sickle.models.Record, producer: Producer) -> None: + """ + The intent of this function is to allow overloading the record type + being passed along to the Kafka topic + """ + producer.produce( + self.produce_topic, + item.raw.encode("utf-8"), + key=item.header.identifier.encode("utf-8"), + on_delivery=fail_fast, + ) + def run(self, continuous: bool = False) -> None: while True: @@ -164,3 +175,30 @@ class HarvestDoajArticleWorker(HarvestOaiPmhWorker): self.endpoint_url = "https://www.doaj.org/oai.article" self.metadata_prefix = "oai_doaj" self.name = "doaj-article" + + def parse_doaj_article_id(self, raw_xml: bytes) -> str: + # XXX: don't parse XML; instead just handle item.oai_identifier + soup = BeautifulSoup(raw_xml, "xml") + elem = soup.find("record header identifier") + oai_id = elem.text.strip() + assert oai_id.startswith("oai:doaj.org/article:") + article_id = oai_id.replace("oai:doaj.org/article:", "") + assert len(article_id) == 32 and article_id == article_id.lower() + return article_id + + def produce_record(self, item: sickle.models.Record, producer: Producer) -> None: + """ + For each OAI-PMH record, do an API call to get the JSON format + response, and publish that to Kafka instead of the OAI-PMH XML + """ + + article_id = self.parse_doaj_article_id(item.raw) + resp = self.http_session.get(f"https://doaj.org/api/articles/{article_id}") + resp.raise_for_status() + assert resp.json()['id'] == article_id + producer.produce( + self.produce_topic, + resp.content.encode("utf-8"), + key=article_id.encode("utf-8"), + on_delivery=fail_fast, + ) diff --git a/python/tests/files/example_doaj_article_oai.xml b/python/tests/files/example_doaj_article_oai.xml new file mode 100644 index 00000000..de3add52 --- /dev/null +++ b/python/tests/files/example_doaj_article_oai.xml @@ -0,0 +1,61 @@ + +
+ oai:doaj.org/article:2a48ccce13c546ceab0c6bc5b74d433d + 2015-03-21T20:28:31Z + TENDOkRpc2Vhc2VzIG9mIHRoZSBtdXNjdWxvc2tlbGV0YWwgc3lzdGVt + TENDOlNwZWNpYWx0aWVzIG9mIGludGVybmFsIG1lZGljaW5l + TENDOkludGVybmFsIG1lZGljaW5l + TENDOk1lZGljaW5l + dGVzdDp0ZXJt +
+ + + ger + Verlag Krause und Pachernegg GmbH + Journal für Mineralstoffwechsel + 1023-7763 + 1680-9408 + 1998-01-01 + 5 + 1 + 25 + 29 + 648 + Leitfaden zur medikamentösen Standardtherapie in der Osteoporose + + + Stevo Popovic + 0 + https://orcid.org/0000-0001-1234-1234 + + + Dusko Bjelica + 1 + + + Gabriela Doina Tanase + 2 + https://orcid.org/0000-0001-1111-2222 + + + Rajko Milašinović + 3 + + + + University of Montenegro, Faculty for Sport and Physical Education, Nikšić, Montenegro + University of Montenegro, Faculty for Sport and Physical Education, Nikšić, Montenegro + University of Montenegro, Faculty for Sport and Physical Education, Nikšić, Montenegro + University of Novi Sad, ACIMSR, Novi Sad, Serbia + + http://www.kup.at/kup/pdf/648.pdf + + Empfehlung + Mineralstoffwechsel + Osteoporose + Richtlinie + Therapie + + + +
diff --git a/python/tests/harvest_doaj.py b/python/tests/harvest_doaj.py new file mode 100644 index 00000000..a8dbbfd5 --- /dev/null +++ b/python/tests/harvest_doaj.py @@ -0,0 +1,2 @@ + +# TODO: test_parse_doaj_article_id() -- cgit v1.2.3