diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2018-11-21 22:19:19 -0800 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2018-11-21 22:22:32 -0800 | 
| commit | fb80eab1bdae2d21a3dda2e82230b7477ed41ebc (patch) | |
| tree | 22249594f630b5535eea5a95dea8a8f81e0abbd0 | |
| parent | 972f00e0e980da308fe80552145622ee69105b3b (diff) | |
| download | fatcat-fb80eab1bdae2d21a3dda2e82230b7477ed41ebc.tar.gz fatcat-fb80eab1bdae2d21a3dda2e82230b7477ed41ebc.zip | |
clean up harvester comments/docs
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | python/fatcat_tools/harvest/doi_registrars.py | 6 |
| -rw-r--r-- | python/fatcat_tools/harvest/harvest_common.py | 2 |
| -rw-r--r-- | python/fatcat_tools/harvest/oaipmh.py | 73 |
3 files changed, 31 insertions, 50 deletions
| diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py index 0296a9d9..d27389ba 100644 --- a/python/fatcat_tools/harvest/doi_registrars.py +++ b/python/fatcat_tools/harvest/doi_registrars.py @@ -12,11 +12,6 @@ from pykafka import KafkaClient  from fatcat_tools.workers import most_recent_message  from .harvest_common import HarvestState -# Skip pylint due to: -#   AttributeError: 'NoneType' object has no attribute 'scope' -# in 'astroid/node_classes.py' -# pylint: skip-file -  class HarvestCrossrefWorker:      """ @@ -187,3 +182,4 @@ class HarvestDataciteWorker(HarvestCrossrefWorker):      def update_params(self, params, resp):          params['page[number]'] = resp['meta']['page'] + 1          return params + diff --git a/python/fatcat_tools/harvest/harvest_common.py b/python/fatcat_tools/harvest/harvest_common.py index f0ef51aa..6041a36f 100644 --- a/python/fatcat_tools/harvest/harvest_common.py +++ b/python/fatcat_tools/harvest/harvest_common.py @@ -4,7 +4,7 @@ import json  import time  import datetime - +# Used for parsing ISO date format (YYYY-MM-DD)  DATE_FMT = "%Y-%m-%d"  class HarvestState: diff --git a/python/fatcat_tools/harvest/oaipmh.py b/python/fatcat_tools/harvest/oaipmh.py index 1f6c54b6..abd917e0 100644 --- a/python/fatcat_tools/harvest/oaipmh.py +++ b/python/fatcat_tools/harvest/oaipmh.py @@ -1,46 +1,4 @@ -""" -OAI-PMH protocol: -    https://sickle.readthedocs.io/en/latest/ - -Pubmed -    https://www.ncbi.nlm.nih.gov/pmc/tools/oai/ -    https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord&identifier=oai:pubmedcentral.nih.gov:152494&metadataPrefix=pmc_fm -    https://github.com/titipata/pubmed_parser - -arxiv -    some APIs work on a per-version basis, others do not - -    http://export.arxiv.org/oai2?verb=GetRecord&identifier=oai:arXiv.org:0804.2273&metadataPrefix=arXiv -    http://export.arxiv.org/oai2?verb=GetRecord&identifier=oai:arXiv.org:0804.2273&metadataPrefix=arXivRaw - -doaj -    
https://github.com/miku/doajfetch - ------ - -actually, just going to re-use https://github.com/miku/metha for OAI-PMH stuff -    => shell script from cronjob -    => call metha-sync daily -    => metha-cat -since <whenever> | kafkacat output -    => echo "date" | kafkat state -    => some shell trick (comm?) to find missing dates; for each, do metha-cat into kafka - -or, just skip kafka for this stuff for now? hrm. - -crossref-like stuff is far enough along to keep - -## More Miku Magic! - -wowa, JSTOR KBART files! -    http://www.jstor.org/kbart/collections/all-archive-titles - -https://github.com/miku/ldjtab: faster than jq for just grabbing  - -sort can be told how much memory to use; eg: `sort -S50%`, and threads to use - -""" -  import re  import sys  import csv @@ -58,9 +16,19 @@ from .harvest_common import HarvestState  class HarvestOaiPmhWorker:      """ -    Base class for OAI-PMH harvesters. +    Base class for OAI-PMH harvesters. Uses the 'sickle' protocol library. -    Based on Crossref importer +    Typically run as a single process; harvests records and publishes in raw +    (XML) format to a Kafka topic, one-message-per-document. + +    Based on Crossref importer, with the HarvestState internal class managing +    progress with day-level granularity. Note that this depends on the OAI-PMH +    endpoint being correct! In that it must be possible to poll for only +    records updated on a particular date (typically "yesterday"). + +    Was very tempted to re-use <https://github.com/miku/metha> for this OAI-PMH +    stuff to save on dev time, but i'd already built the Crossref harvester and +    would want something similar operationally. Oh well!      
""" @@ -124,6 +92,14 @@ class HarvestOaiPmhWorker:  class HarvestArxivWorker(HarvestOaiPmhWorker): +    """ +    Arxiv refs: +    - http://export.arxiv.org/oai2?verb=GetRecord&identifier=oai:arXiv.org:0804.2273&metadataPrefix=arXiv +    - http://export.arxiv.org/oai2?verb=GetRecord&identifier=oai:arXiv.org:0804.2273&metadataPrefix=arXivRaw + +    All records are work-level. Some metadata formats have internal info about +    specific versions. The 'arXiv' format does, so i'm using that. +    """      def __init__(self, **kwargs):          super().__init__(**kwargs)  @@ -133,6 +109,12 @@ class HarvestArxivWorker(HarvestOaiPmhWorker):  class HarvestPubmedWorker(HarvestOaiPmhWorker): +    """ +    Pubmed refs: +    - https://www.ncbi.nlm.nih.gov/pmc/tools/oai/ +    - https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord&identifier=oai:pubmedcentral.nih.gov:152494&metadataPrefix=pmc_fm +    - https://github.com/titipata/pubmed_parser +    """      def __init__(self, **kwargs):          super().__init__(**kwargs)  @@ -144,6 +126,9 @@ class HarvestPubmedWorker(HarvestOaiPmhWorker):  class HarvestDoajJournalWorker(HarvestOaiPmhWorker):      """      WARNING: DOAJ OAI-PMH doesn't seem to respect 'from' and 'until' params + +    As an alternative, could use: +    - https://github.com/miku/doajfetch      """      def __init__(self, **kwargs): | 
