diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2020-02-14 14:32:57 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-02-19 01:07:46 +0100 |
commit | 519b90d7f539b667e919c220a53626e7a4ac48bf (patch) | |
tree | b7c3beed283d7dca732a8f2ab5b1dfe283bb69f3 /python/fatcat_tools/harvest/oaipmh.py | |
parent | 4cbc94cd708c1db80a232150ab2cf56dddf83e62 (diff) | |
download | fatcat-519b90d7f539b667e919c220a53626e7a4ac48bf.tar.gz fatcat-519b90d7f539b667e919c220a53626e7a4ac48bf.zip |
pubmed ftp harvest and KafkaBs4XmlPusher
* add PubmedFTPWorker
* utils are currently stored alongside pubmed (e.g. ftpretr, xmlstream)
but may live elsewhere, as they are more generic
* add KafkaBs4XmlPusher
Diffstat (limited to 'python/fatcat_tools/harvest/oaipmh.py')
-rw-r--r-- | python/fatcat_tools/harvest/oaipmh.py | 15 |
1 files changed, 15 insertions, 0 deletions
diff --git a/python/fatcat_tools/harvest/oaipmh.py b/python/fatcat_tools/harvest/oaipmh.py
index 11b5fa0a..8e9efea8 100644
--- a/python/fatcat_tools/harvest/oaipmh.py
+++ b/python/fatcat_tools/harvest/oaipmh.py
@@ -142,6 +142,21 @@ class HarvestPubmedWorker(HarvestOaiPmhWorker):
     - https://www.ncbi.nlm.nih.gov/pmc/tools/oai/
     - https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord&identifier=oai:pubmedcentral.nih.gov:152494&metadataPrefix=pmc_fm
     - https://github.com/titipata/pubmed_parser
+
+    TODO(martin): OAI does not seem to support the format we already have an
+    importer for. Maybe we can use "Daily Update Files" --
+
+    Daily Update Files
+    ------------------
+    Each day, NLM produces update files that include new, revised and deleted
+    citations. The first Update file to be loaded after loading the complete
+    set of 2019 MEDLINE/PubMed Baseline files is pubmed20n1016.xml.
+    ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles
+
+    NOTES:
+
+    * OAI: https://dtd.nlm.nih.gov/archiving/2.3/xsd/archivearticle.xsd
+    * FTP: https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_190101.dtd
     """
 
     def __init__(self, **kwargs):