From fb80eab1bdae2d21a3dda2e82230b7477ed41ebc Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 21 Nov 2018 22:19:19 -0800 Subject: clean up harvester comments/docs --- python/fatcat_tools/harvest/doi_registrars.py | 6 +-- python/fatcat_tools/harvest/harvest_common.py | 2 +- python/fatcat_tools/harvest/oaipmh.py | 73 +++++++++++---------------- 3 files changed, 31 insertions(+), 50 deletions(-) diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py index 0296a9d9..d27389ba 100644 --- a/python/fatcat_tools/harvest/doi_registrars.py +++ b/python/fatcat_tools/harvest/doi_registrars.py @@ -12,11 +12,6 @@ from pykafka import KafkaClient from fatcat_tools.workers import most_recent_message from .harvest_common import HarvestState -# Skip pylint due to: -# AttributeError: 'NoneType' object has no attribute 'scope' -# in 'astroid/node_classes.py' -# pylint: skip-file - class HarvestCrossrefWorker: """ @@ -187,3 +182,4 @@ class HarvestDataciteWorker(HarvestCrossrefWorker): def update_params(self, params, resp): params['page[number]'] = resp['meta']['page'] + 1 return params + diff --git a/python/fatcat_tools/harvest/harvest_common.py b/python/fatcat_tools/harvest/harvest_common.py index f0ef51aa..6041a36f 100644 --- a/python/fatcat_tools/harvest/harvest_common.py +++ b/python/fatcat_tools/harvest/harvest_common.py @@ -4,7 +4,7 @@ import json import time import datetime - +# Used for parsing ISO date format (YYYY-MM-DD) DATE_FMT = "%Y-%m-%d" class HarvestState: diff --git a/python/fatcat_tools/harvest/oaipmh.py b/python/fatcat_tools/harvest/oaipmh.py index 1f6c54b6..abd917e0 100644 --- a/python/fatcat_tools/harvest/oaipmh.py +++ b/python/fatcat_tools/harvest/oaipmh.py @@ -1,46 +1,4 @@ -""" -OAI-PMH protocol: - https://sickle.readthedocs.io/en/latest/ - -Pubmed - https://www.ncbi.nlm.nih.gov/pmc/tools/oai/ - https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord&identifier=oai:pubmedcentral.nih.gov:152494&metadataPrefix=pmc_fm - https://github.com/titipata/pubmed_parser - -arxiv - some APIs work on a per-version basis, others do not - - http://export.arxiv.org/oai2?verb=GetRecord&identifier=oai:arXiv.org:0804.2273&metadataPrefix=arXiv - http://export.arxiv.org/oai2?verb=GetRecord&identifier=oai:arXiv.org:0804.2273&metadataPrefix=arXivRaw - -doaj - https://github.com/miku/doajfetch - ------ - -actually, just going to re-use https://github.com/miku/metha for OAI-PMH stuff - => shell script from cronjob - => call metha-sync daily - => metha-cat -since | kafkacat output - => echo "date" | kafkat state - => some shell trick (comm?) to find missing dates; for each, do metha-cat into kafka - -or, just skip kafka for this stuff for now? hrm. - -crossref-like stuff is far enough along to keep - -## More Miku Magic! - -wowa, JSTOR KBART files! - http://www.jstor.org/kbart/collections/all-archive-titles - -https://github.com/miku/ldjtab: faster than jq for just grabbing - -sort can be told how much memory to use; eg: `sort -S50%`, and threads to use - -""" - import re import sys import csv @@ -58,9 +16,19 @@ from .harvest_common import HarvestState class HarvestOaiPmhWorker: """ - Base class for OAI-PMH harvesters. + Base class for OAI-PMH harvesters. Uses the 'sickle' protocol library. - Based on Crossref importer + Typically run as a single process; harvests records and publishes in raw + (XML) format to a Kafka topic, one-message-per-document. 
+
+    Based on the Crossref importer, with the HarvestState internal class
+    managing progress with day-level granularity. Note that this depends on
+    the OAI-PMH endpoint behaving correctly: it must be possible to poll for
+    only those records updated on a particular date (typically "yesterday").
+
+    Was very tempted to re-use metha (https://github.com/miku/metha) for this
+    OAI-PMH stuff to save on dev time, but i'd already built the Crossref
+    harvester and would want something similar operationally. Oh well!
     """
 
@@ -124,6 +92,14 @@ class HarvestOaiPmhWorker:
 
 
 class HarvestArxivWorker(HarvestOaiPmhWorker):
+    """
+    Arxiv refs:
+    - http://export.arxiv.org/oai2?verb=GetRecord&identifier=oai:arXiv.org:0804.2273&metadataPrefix=arXiv
+    - http://export.arxiv.org/oai2?verb=GetRecord&identifier=oai:arXiv.org:0804.2273&metadataPrefix=arXivRaw
+
+    All records are work-level. Some metadata formats have internal info about
+    specific versions. The 'arXiv' format does, so i'm using that.
+    """
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
@@ -133,6 +109,12 @@ class HarvestArxivWorker(HarvestOaiPmhWorker):
 
 
 class HarvestPubmedWorker(HarvestOaiPmhWorker):
+    """
+    Pubmed refs:
+    - https://www.ncbi.nlm.nih.gov/pmc/tools/oai/
+    - https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord&identifier=oai:pubmedcentral.nih.gov:152494&metadataPrefix=pmc_fm
+    - https://github.com/titipata/pubmed_parser
+    """
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
@@ -144,6 +126,9 @@ class HarvestPubmedWorker(HarvestOaiPmhWorker):
 class HarvestDoajJournalWorker(HarvestOaiPmhWorker):
     """
     WARNING: DOAJ OAI-PMH doesn't seem to respect 'from' and 'until' params
+
+    As an alternative, could use:
+    - https://github.com/miku/doajfetch
     """
 
     def __init__(self, **kwargs):
-- 
cgit v1.2.3
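For context, the day-level polling these docstrings describe boils down to roughly the sketch below. It is only an illustration, not code from this patch: it assumes the sickle and pykafka libraries already used by the harvest modules, reuses the arXiv endpoint and 'arXiv' metadataPrefix from the HarvestArxivWorker refs plus DATE_FMT from harvest_common, and the harvest_day() helper and Kafka topic name are made up for the example.

    import datetime
    from sickle import Sickle
    from sickle.oaiexceptions import NoRecordsMatch
    from pykafka import KafkaClient

    DATE_FMT = "%Y-%m-%d"  # same day-granularity format as harvest_common

    def harvest_day(day, endpoint_url="http://export.arxiv.org/oai2",
                    metadata_prefix="arXiv", kafka_hosts="localhost:9092",
                    produce_topic=b"example.oaipmh.arxiv"):
        """Poll one day's worth of updated records; publish raw XML to Kafka."""
        date_str = day.strftime(DATE_FMT)
        api = Sickle(endpoint_url)
        topic = KafkaClient(hosts=kafka_hosts).topics[produce_topic]
        try:
            # 'from'/'until' bound the poll to a single date; this only works
            # if the endpoint respects day-granularity datestamps (see the
            # DOAJ warning above for a counter-example)
            records = api.ListRecords(metadataPrefix=metadata_prefix,
                                      ignore_deleted=True,
                                      **{'from': date_str, 'until': date_str})
        except NoRecordsMatch:
            print("no records updated on {}".format(date_str))
            return
        count = 0
        with topic.get_producer() as producer:
            for record in records:
                # one message per document: raw record XML, keyed by OAI id
                producer.produce(
                    record.raw.encode('utf-8'),
                    partition_key=record.header.identifier.encode('utf-8'))
                count += 1
        print("harvested {} records for {}".format(count, date_str))

    # a worker loop would typically run this for "yesterday"
    harvest_day(datetime.date.today() - datetime.timedelta(days=1))

The actual workers layer HarvestState on top of a loop like this, so that each day is harvested once and progress is tracked per-day rather than re-harvesting everything.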