diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2022-07-19 14:29:38 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2022-07-19 14:29:38 -0700 |
commit | 95961d59db620dc71afb5be2b194df8cd6c86b70 (patch) | |
tree | e423d5ea8235401c5c92c2f9db6d43e7a8b9942c | |
parent | 5e15ca96db59858231c83b0af210a6a04054612c (diff) | |
download | fatcat-bnewbold-doaj-article-harvest.tar.gz fatcat-bnewbold-doaj-article-harvest.zip |
WIP: DOAJ OAI-PMH importer (branch: bnewbold-doaj-article-harvest)
-rw-r--r-- | python/fatcat_tools/harvest/oaipmh.py | 52 | ||||
-rw-r--r-- | python/tests/files/example_doaj_article_oai.xml | 61 | ||||
-rw-r--r-- | python/tests/harvest_doaj.py | 2 |
3 files changed, 108 insertions, 7 deletions
diff --git a/python/fatcat_tools/harvest/oaipmh.py b/python/fatcat_tools/harvest/oaipmh.py index 19eb6897..c829f2a2 100644 --- a/python/fatcat_tools/harvest/oaipmh.py +++ b/python/fatcat_tools/harvest/oaipmh.py @@ -5,8 +5,9 @@ from typing import Any, Optional import sickle from confluent_kafka import KafkaException, Producer +from bs4 import BeautifulSoup -from .harvest_common import HarvestState +from .harvest_common import HarvestState, requests_retry_session class HarvestOaiPmhWorker: @@ -44,6 +45,9 @@ class HarvestOaiPmhWorker: self.loop_sleep = 60 * 60 # how long to wait, in seconds, between date checks + # optional; not all workers will need or use this HTTP session + self.http_session = requests_retry_session() + self.endpoint_url = None # needs override self.metadata_prefix = None # needs override self.name = "unnamed" @@ -94,14 +98,21 @@ class HarvestOaiPmhWorker: count += 1 if count % 50 == 0: print("... up to {}".format(count), file=sys.stderr) - producer.produce( - self.produce_topic, - item.raw.encode("utf-8"), - key=item.header.identifier.encode("utf-8"), - on_delivery=fail_fast, - ) + self.produce_record(item, producer) producer.flush() + def produce_record(self, item: sickle.models.Record, producer: Producer) -> None: + """ + The intent of this function is to allow overloading the record type + being passed along to the Kafka topic + """ + producer.produce( + self.produce_topic, + item.raw.encode("utf-8"), + key=item.header.identifier.encode("utf-8"), + on_delivery=fail_fast, + ) + def run(self, continuous: bool = False) -> None: while True: @@ -164,3 +175,30 @@ class HarvestDoajArticleWorker(HarvestOaiPmhWorker): self.endpoint_url = "https://www.doaj.org/oai.article" self.metadata_prefix = "oai_doaj" self.name = "doaj-article" + + def parse_doaj_article_id(self, raw_xml: bytes) -> str: + # XXX: don't parse XML; instead just handle item.oai_identifier + soup = BeautifulSoup(raw_xml, "xml") + elem = soup.find("record header identifier") + oai_id = 
elem.text.strip() + assert oai_id.startswith("oai:doaj.org/article:") + article_id = oai_id.replace("oai:doaj.org/article:", "") + assert len(article_id) == 32 and article_id == article_id.lower() + return article_id + + def produce_record(self, item: sickle.models.Record, producer: Producer) -> None: + """ + For each OAI-PMH record, do an API call to get the JSON format + response, and publish that to Kafka instead of the OAI-PMH XML + """ + + article_id = self.parse_doaj_article_id(item.raw) + resp = self.http_session.get(f"https://doaj.org/api/articles/{article_id}") + resp.raise_for_status() + assert resp.json()['id'] == article_id + producer.produce( + self.produce_topic, + resp.content.encode("utf-8"), + key=article_id.encode("utf-8"), + on_delivery=fail_fast, + ) diff --git a/python/tests/files/example_doaj_article_oai.xml b/python/tests/files/example_doaj_article_oai.xml new file mode 100644 index 00000000..de3add52 --- /dev/null +++ b/python/tests/files/example_doaj_article_oai.xml @@ -0,0 +1,61 @@ +<record> + <header xmlns:oai_doaj="http://doaj.org/features/oai_doaj/1.0/"> + <identifier>oai:doaj.org/article:2a48ccce13c546ceab0c6bc5b74d433d</identifier> + <datestamp>2015-03-21T20:28:31Z</datestamp> + <setSpec>TENDOkRpc2Vhc2VzIG9mIHRoZSBtdXNjdWxvc2tlbGV0YWwgc3lzdGVt</setSpec> + <setSpec>TENDOlNwZWNpYWx0aWVzIG9mIGludGVybmFsIG1lZGljaW5l</setSpec> + <setSpec>TENDOkludGVybmFsIG1lZGljaW5l</setSpec> + <setSpec>TENDOk1lZGljaW5l</setSpec> + <setSpec>dGVzdDp0ZXJt</setSpec> + </header> + <metadata xmlns:oai_doaj="http://doaj.org/features/oai_doaj/1.0/"> + <oai_doaj:doajArticle xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd http://doaj.org/features/oai_doaj/1.0/ https://doaj.org/static/doaj/doajArticles.xsd"> + <oai_doaj:language>ger</oai_doaj:language> + <oai_doaj:publisher>Verlag Krause und Pachernegg GmbH</oai_doaj:publisher> + <oai_doaj:journalTitle>Journal für Mineralstoffwechsel</oai_doaj:journalTitle> + 
<oai_doaj:issn>1023-7763</oai_doaj:issn> + <oai_doaj:eissn>1680-9408</oai_doaj:eissn> + <oai_doaj:publicationDate>1998-01-01</oai_doaj:publicationDate> + <oai_doaj:volume>5</oai_doaj:volume> + <oai_doaj:issue>1</oai_doaj:issue> + <oai_doaj:startPage>25</oai_doaj:startPage> + <oai_doaj:endPage>29</oai_doaj:endPage> + <oai_doaj:publisherRecordId>648</oai_doaj:publisherRecordId> + <oai_doaj:title>Leitfaden zur medikamentösen Standardtherapie in der Osteoporose</oai_doaj:title> + <authors> + <author> + <name>Stevo Popovic</name> + <affiliationId>0</affiliationId> + <orcid_id>https://orcid.org/0000-0001-1234-1234</orcid_id> + </author> + <author> + <name>Dusko Bjelica</name> + <affiliationId>1</affiliationId> + </author> + <author> + <name>Gabriela Doina Tanase</name> + <affiliationId>2</affiliationId> + <orcid_id>https://orcid.org/0000-0001-1111-2222</orcid_id> + </author> + <author> + <name>Rajko Milašinović</name> + <affiliationId>3</affiliationId> + </author> + </authors> + <affiliationsList> + <affiliationName affiliationId="0">University of Montenegro, Faculty for Sport and Physical Education, Nikšić, Montenegro</affiliationName> + <affiliationName affiliationId="1">University of Montenegro, Faculty for Sport and Physical Education, Nikšić, Montenegro</affiliationName> + <affiliationName affiliationId="2">University of Montenegro, Faculty for Sport and Physical Education, Nikšić, Montenegro</affiliationName> + <affiliationName affiliationId="3">University of Novi Sad, ACIMSR, Novi Sad, Serbia</affiliationName> + </affiliationsList> + <oai_doaj:fullTextUrl format="pdf">http://www.kup.at/kup/pdf/648.pdf</oai_doaj:fullTextUrl> + <oai_doaj:keywords> + <oai_doaj:keyword>Empfehlung</oai_doaj:keyword> + <oai_doaj:keyword>Mineralstoffwechsel</oai_doaj:keyword> + <oai_doaj:keyword>Osteoporose</oai_doaj:keyword> + <oai_doaj:keyword>Richtlinie</oai_doaj:keyword> + <oai_doaj:keyword>Therapie</oai_doaj:keyword> + </oai_doaj:keywords> + </oai_doaj:doajArticle> + </metadata> 
+</record> diff --git a/python/tests/harvest_doaj.py b/python/tests/harvest_doaj.py new file mode 100644 index 00000000..a8dbbfd5 --- /dev/null +++ b/python/tests/harvest_doaj.py @@ -0,0 +1,2 @@ + +# TODO: test_parse_doaj_article_id() |