aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2022-07-19 14:29:38 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2022-07-19 14:29:38 -0700
commit: 95961d59db620dc71afb5be2b194df8cd6c86b70 (patch)
tree: e423d5ea8235401c5c92c2f9db6d43e7a8b9942c
parent: 5e15ca96db59858231c83b0af210a6a04054612c (diff)
download: fatcat-bnewbold-doaj-article-harvest.tar.gz
fatcat-bnewbold-doaj-article-harvest.zip
WIP: DOAJ OAI-PMH importer (branch: bnewbold-doaj-article-harvest)
-rw-r--r-- python/fatcat_tools/harvest/oaipmh.py 52
-rw-r--r-- python/tests/files/example_doaj_article_oai.xml 61
-rw-r--r-- python/tests/harvest_doaj.py 2
3 files changed, 108 insertions, 7 deletions
diff --git a/python/fatcat_tools/harvest/oaipmh.py b/python/fatcat_tools/harvest/oaipmh.py
index 19eb6897..c829f2a2 100644
--- a/python/fatcat_tools/harvest/oaipmh.py
+++ b/python/fatcat_tools/harvest/oaipmh.py
@@ -5,8 +5,9 @@ from typing import Any, Optional
import sickle
from confluent_kafka import KafkaException, Producer
+from bs4 import BeautifulSoup
-from .harvest_common import HarvestState
+from .harvest_common import HarvestState, requests_retry_session
class HarvestOaiPmhWorker:
@@ -44,6 +45,9 @@ class HarvestOaiPmhWorker:
self.loop_sleep = 60 * 60 # how long to wait, in seconds, between date checks
+ # optional; not all workers will need or use this HTTP session
+ self.http_session = requests_retry_session()
+
self.endpoint_url = None # needs override
self.metadata_prefix = None # needs override
self.name = "unnamed"
@@ -94,14 +98,21 @@ class HarvestOaiPmhWorker:
count += 1
if count % 50 == 0:
print("... up to {}".format(count), file=sys.stderr)
- producer.produce(
- self.produce_topic,
- item.raw.encode("utf-8"),
- key=item.header.identifier.encode("utf-8"),
- on_delivery=fail_fast,
- )
+ self.produce_record(item, producer)
producer.flush()
+    def produce_record(self, item: sickle.models.Record, producer: Producer) -> None:
+        """
+        Publish a single harvested OAI-PMH record to the Kafka topic.
+
+        This is a separate method so that subclasses can override what gets
+        passed along to the topic for each record.
+        """
+        # message value is the record's raw form, UTF-8 encoded
+        record_body = item.raw.encode("utf-8")
+        # message key is the identifier from the record header
+        record_key = item.header.identifier.encode("utf-8")
+        producer.produce(
+            self.produce_topic,
+            record_body,
+            key=record_key,
+            on_delivery=fail_fast,
+        )
+
def run(self, continuous: bool = False) -> None:
while True:
@@ -164,3 +175,30 @@ class HarvestDoajArticleWorker(HarvestOaiPmhWorker):
self.endpoint_url = "https://www.doaj.org/oai.article"
self.metadata_prefix = "oai_doaj"
self.name = "doaj-article"
+
+    def parse_doaj_article_id(self, raw_xml: bytes) -> str:
+        """
+        Extract the DOAJ article id from a raw OAI-PMH record.
+
+        The record is expected to contain a `record > header > identifier`
+        element whose text looks like `oai:doaj.org/article:<article_id>`,
+        where `<article_id>` is a 32-character, all-lowercase string.
+
+        Returns only the `<article_id>` part.
+
+        Raises ValueError if the identifier element is missing, or if its
+        value does not have the expected prefix or shape.
+        """
+        # XXX: don't parse XML; instead just handle item.oai_identifier
+        soup = BeautifulSoup(raw_xml, "xml")
+        # find() matches on a single tag name only, so a descendant path
+        # like this one has to go through the CSS selector API instead
+        elem = soup.select_one("record header identifier")
+        if elem is None:
+            raise ValueError("OAI-PMH record has no header identifier element")
+        oai_id = elem.text.strip()
+        prefix = "oai:doaj.org/article:"
+        # explicit raises (not asserts): these checks guard a value that is
+        # later interpolated into a URL, and asserts are skipped under -O
+        if not oai_id.startswith(prefix):
+            raise ValueError(f"unexpected DOAJ OAI-PMH identifier: {oai_id!r}")
+        article_id = oai_id[len(prefix):]
+        if len(article_id) != 32 or article_id != article_id.lower():
+            raise ValueError(f"unexpected DOAJ article id: {article_id!r}")
+        return article_id
+
+    def produce_record(self, item: sickle.models.Record, producer: Producer) -> None:
+        """
+        For each OAI-PMH record, do an API call to get the JSON format
+        response, and publish that to Kafka instead of the OAI-PMH XML.
+
+        The Kafka message key is the bare DOAJ article id, not the full
+        OAI-PMH identifier.
+
+        Raises whatever `raise_for_status()` raises when the API request
+        fails, and ValueError if the returned JSON is for a different
+        article id than the one requested.
+        """
+
+        article_id = self.parse_doaj_article_id(item.raw)
+        resp = self.http_session.get(f"https://doaj.org/api/articles/{article_id}")
+        resp.raise_for_status()
+        # sanity check that the API returned the record that was asked for;
+        # an explicit raise so the check is not skipped under -O
+        returned_id = resp.json()['id']
+        if returned_id != article_id:
+            raise ValueError(
+                f"DOAJ API returned article {returned_id!r}, expected {article_id!r}"
+            )
+        producer.produce(
+            self.produce_topic,
+            # resp.content is already bytes (the response body as received),
+            # so it is passed through as-is; bytes has no .encode()
+            resp.content,
+            key=article_id.encode("utf-8"),
+            on_delivery=fail_fast,
+        )
diff --git a/python/tests/files/example_doaj_article_oai.xml b/python/tests/files/example_doaj_article_oai.xml
new file mode 100644
index 00000000..de3add52
--- /dev/null
+++ b/python/tests/files/example_doaj_article_oai.xml
@@ -0,0 +1,61 @@
+<record>
+ <header xmlns:oai_doaj="http://doaj.org/features/oai_doaj/1.0/">
+ <identifier>oai:doaj.org/article:2a48ccce13c546ceab0c6bc5b74d433d</identifier>
+ <datestamp>2015-03-21T20:28:31Z</datestamp>
+ <setSpec>TENDOkRpc2Vhc2VzIG9mIHRoZSBtdXNjdWxvc2tlbGV0YWwgc3lzdGVt</setSpec>
+ <setSpec>TENDOlNwZWNpYWx0aWVzIG9mIGludGVybmFsIG1lZGljaW5l</setSpec>
+ <setSpec>TENDOkludGVybmFsIG1lZGljaW5l</setSpec>
+ <setSpec>TENDOk1lZGljaW5l</setSpec>
+ <setSpec>dGVzdDp0ZXJt</setSpec>
+ </header>
+ <metadata xmlns:oai_doaj="http://doaj.org/features/oai_doaj/1.0/">
+ <oai_doaj:doajArticle xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd http://doaj.org/features/oai_doaj/1.0/ https://doaj.org/static/doaj/doajArticles.xsd">
+ <oai_doaj:language>ger</oai_doaj:language>
+ <oai_doaj:publisher>Verlag Krause und Pachernegg GmbH</oai_doaj:publisher>
+ <oai_doaj:journalTitle>Journal f&#xFC;r Mineralstoffwechsel</oai_doaj:journalTitle>
+ <oai_doaj:issn>1023-7763</oai_doaj:issn>
+ <oai_doaj:eissn>1680-9408</oai_doaj:eissn>
+ <oai_doaj:publicationDate>1998-01-01</oai_doaj:publicationDate>
+ <oai_doaj:volume>5</oai_doaj:volume>
+ <oai_doaj:issue>1</oai_doaj:issue>
+ <oai_doaj:startPage>25</oai_doaj:startPage>
+ <oai_doaj:endPage>29</oai_doaj:endPage>
+ <oai_doaj:publisherRecordId>648</oai_doaj:publisherRecordId>
+ <oai_doaj:title>Leitfaden zur medikament&#xF6;sen Standardtherapie in der Osteoporose</oai_doaj:title>
+ <authors>
+ <author>
+ <name>Stevo Popovic</name>
+ <affiliationId>0</affiliationId>
+ <orcid_id>https://orcid.org/0000-0001-1234-1234</orcid_id>
+ </author>
+ <author>
+ <name>Dusko Bjelica</name>
+ <affiliationId>1</affiliationId>
+ </author>
+ <author>
+ <name>Gabriela Doina Tanase</name>
+ <affiliationId>2</affiliationId>
+ <orcid_id>https://orcid.org/0000-0001-1111-2222</orcid_id>
+ </author>
+ <author>
+ <name>Rajko Mila&#x161;inovi&#x107;</name>
+ <affiliationId>3</affiliationId>
+ </author>
+ </authors>
+ <affiliationsList>
+ <affiliationName affiliationId="0">University of Montenegro, Faculty for Sport and Physical Education, Nik&#x161;i&#x107;, Montenegro</affiliationName>
+ <affiliationName affiliationId="1">University of Montenegro, Faculty for Sport and Physical Education, Nik&#x161;i&#x107;, Montenegro</affiliationName>
+ <affiliationName affiliationId="2">University of Montenegro, Faculty for Sport and Physical Education, Nik&#x161;i&#x107;, Montenegro</affiliationName>
+ <affiliationName affiliationId="3">University of Novi Sad, ACIMSR, Novi Sad, Serbia</affiliationName>
+ </affiliationsList>
+ <oai_doaj:fullTextUrl format="pdf">http://www.kup.at/kup/pdf/648.pdf</oai_doaj:fullTextUrl>
+ <oai_doaj:keywords>
+ <oai_doaj:keyword>Empfehlung</oai_doaj:keyword>
+ <oai_doaj:keyword>Mineralstoffwechsel</oai_doaj:keyword>
+ <oai_doaj:keyword>Osteoporose</oai_doaj:keyword>
+ <oai_doaj:keyword>Richtlinie</oai_doaj:keyword>
+ <oai_doaj:keyword>Therapie</oai_doaj:keyword>
+ </oai_doaj:keywords>
+ </oai_doaj:doajArticle>
+ </metadata>
+</record>
diff --git a/python/tests/harvest_doaj.py b/python/tests/harvest_doaj.py
new file mode 100644
index 00000000..a8dbbfd5
--- /dev/null
+++ b/python/tests/harvest_doaj.py
@@ -0,0 +1,2 @@
+
+# TODO: test_parse_doaj_article_id()