summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/harvest/oaipmh.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2018-11-21 22:19:19 -0800
committerBryan Newbold <bnewbold@robocracy.org>2018-11-21 22:22:32 -0800
commitfb80eab1bdae2d21a3dda2e82230b7477ed41ebc (patch)
tree22249594f630b5535eea5a95dea8a8f81e0abbd0 /python/fatcat_tools/harvest/oaipmh.py
parent972f00e0e980da308fe80552145622ee69105b3b (diff)
downloadfatcat-fb80eab1bdae2d21a3dda2e82230b7477ed41ebc.tar.gz
fatcat-fb80eab1bdae2d21a3dda2e82230b7477ed41ebc.zip
clean up harvester comments/docs
Diffstat (limited to 'python/fatcat_tools/harvest/oaipmh.py')
-rw-r--r--python/fatcat_tools/harvest/oaipmh.py73
1 files changed, 29 insertions, 44 deletions
diff --git a/python/fatcat_tools/harvest/oaipmh.py b/python/fatcat_tools/harvest/oaipmh.py
index 1f6c54b6..abd917e0 100644
--- a/python/fatcat_tools/harvest/oaipmh.py
+++ b/python/fatcat_tools/harvest/oaipmh.py
@@ -1,46 +1,4 @@
-"""
-OAI-PMH protocol:
- https://sickle.readthedocs.io/en/latest/
-
-Pubmed
- https://www.ncbi.nlm.nih.gov/pmc/tools/oai/
- https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord&identifier=oai:pubmedcentral.nih.gov:152494&metadataPrefix=pmc_fm
- https://github.com/titipata/pubmed_parser
-
-arxiv
- some APIs work on a per-version basis, others do not
-
- http://export.arxiv.org/oai2?verb=GetRecord&identifier=oai:arXiv.org:0804.2273&metadataPrefix=arXiv
- http://export.arxiv.org/oai2?verb=GetRecord&identifier=oai:arXiv.org:0804.2273&metadataPrefix=arXivRaw
-
-doaj
- https://github.com/miku/doajfetch
-
------
-
-actually, just going to re-use https://github.com/miku/metha for OAI-PMH stuff
- => shell script from cronjob
- => call metha-sync daily
- => metha-cat -since <whenever> | kafkacat output
- => echo "date" | kafkat state
- => some shell trick (comm?) to find missing dates; for each, do metha-cat into kafka
-
-or, just skip kafka for this stuff for now? hrm.
-
-crossref-like stuff is far enough along to keep
-
-## More Miku Magic!
-
-wowa, JSTOR KBART files!
- http://www.jstor.org/kbart/collections/all-archive-titles
-
-https://github.com/miku/ldjtab: faster than jq for just grabbing
-
-sort can be told how much memory to use; eg: `sort -S50%`, and threads to use
-
-"""
-
import re
import sys
import csv
@@ -58,9 +16,19 @@ from .harvest_common import HarvestState
class HarvestOaiPmhWorker:
"""
- Base class for OAI-PMH harvesters.
+ Base class for OAI-PMH harvesters. Uses the 'sickle' protocol library.
- Based on Crossref importer
+ Typically run as a single process; harvests records and publishes in raw
+ (XML) format to a Kafka topic, one-message-per-document.
+
+ Based on Crossref importer, with the HarvestState internal class managing
+ progress with day-level granularity. Note that this depends on the OAI-PMH
+ endpoint being correct! In that it must be possible to poll for only
+ records updated on a particular date (typically "yesterday").
+
+ Was very tempted to re-use <https://github.com/miku/metha> for this OAI-PMH
+ stuff to save on dev time, but i'd already built the Crossref harvester and
+ would want something similar operationally. Oh well!
"""
@@ -124,6 +92,14 @@ class HarvestOaiPmhWorker:
class HarvestArxivWorker(HarvestOaiPmhWorker):
+ """
+ Arxiv refs:
+ - http://export.arxiv.org/oai2?verb=GetRecord&identifier=oai:arXiv.org:0804.2273&metadataPrefix=arXiv
+ - http://export.arxiv.org/oai2?verb=GetRecord&identifier=oai:arXiv.org:0804.2273&metadataPrefix=arXivRaw
+
+ All records are work-level. Some metadata formats have internal info about
+ specific versions. The 'arXiv' format does, so i'm using that.
+ """
def __init__(self, **kwargs):
super().__init__(**kwargs)
@@ -133,6 +109,12 @@ class HarvestArxivWorker(HarvestOaiPmhWorker):
class HarvestPubmedWorker(HarvestOaiPmhWorker):
+ """
+ Pubmed refs:
+ - https://www.ncbi.nlm.nih.gov/pmc/tools/oai/
+ - https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord&identifier=oai:pubmedcentral.nih.gov:152494&metadataPrefix=pmc_fm
+ - https://github.com/titipata/pubmed_parser
+ """
def __init__(self, **kwargs):
super().__init__(**kwargs)
@@ -144,6 +126,9 @@ class HarvestPubmedWorker(HarvestOaiPmhWorker):
class HarvestDoajJournalWorker(HarvestOaiPmhWorker):
"""
WARNING: DOAJ OAI-PMH doesn't seem to respect 'from' and 'until' params
+
+ As an alternative, could use:
+ - https://github.com/miku/doajfetch
"""
def __init__(self, **kwargs):