From 519b90d7f539b667e919c220a53626e7a4ac48bf Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 14 Feb 2020 14:32:57 +0100 Subject: pubmed ftp harvest and KafkaBs4XmlPusher * add PubmedFTPWorker * utils are currently stored alongside pubmed (e.g. ftpretr, xmlstream) but may live elsewhere, as they are more generic * add KafkaBs4XmlPusher --- python/fatcat_tools/harvest/oaipmh.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'python/fatcat_tools/harvest/oaipmh.py') diff --git a/python/fatcat_tools/harvest/oaipmh.py b/python/fatcat_tools/harvest/oaipmh.py index 11b5fa0a..8e9efea8 100644 --- a/python/fatcat_tools/harvest/oaipmh.py +++ b/python/fatcat_tools/harvest/oaipmh.py @@ -142,6 +142,21 @@ class HarvestPubmedWorker(HarvestOaiPmhWorker): - https://www.ncbi.nlm.nih.gov/pmc/tools/oai/ - https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord&identifier=oai:pubmedcentral.nih.gov:152494&metadataPrefix=pmc_fm - https://github.com/titipata/pubmed_parser + + TODO(martin): OAI does not seem to support the format we already have an + importer for. Maybe we can use "Daily Update Files" -- + + Daily Update Files + ------------------ + Each day, NLM produces update files that include new, revised and deleted + citations. The first Update file to be loaded after loading the complete + set of 2019 MEDLINE/PubMed Baseline files is pubmed20n1016.xml. + ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles + + NOTES: + + * OAI: https://dtd.nlm.nih.gov/archiving/2.3/xsd/archivearticle.xsd + * FTP: https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_190101.dtd """ def __init__(self, **kwargs): -- cgit v1.2.3 From c6b29c17eeea4c067dcc391fe6d9bdaec3f657b3 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Mon, 9 Mar 2020 20:07:30 +0100 Subject: oaipmh: HarvestPubmedWorker obsoleted by PubmedFTPWorker --- python/fatcat_tools/harvest/oaipmh.py | 34 ---------------------------------- 1 file changed, 34 deletions(-) (limited to 'python/fatcat_tools/harvest/oaipmh.py') diff --git a/python/fatcat_tools/harvest/oaipmh.py b/python/fatcat_tools/harvest/oaipmh.py index 8e9efea8..c95f3445 100644 --- a/python/fatcat_tools/harvest/oaipmh.py +++ b/python/fatcat_tools/harvest/oaipmh.py @@ -132,40 +132,6 @@ class HarvestArxivWorker(HarvestOaiPmhWorker): self.name = "arxiv" -class HarvestPubmedWorker(HarvestOaiPmhWorker): - """ - Will likely be doing MEDLINE daily batch imports for primary metadata, but - might also want to run a PMC importer to update fulltext and assign OA - licenses (when appropriate). - - Pubmed refs: - - https://www.ncbi.nlm.nih.gov/pmc/tools/oai/ - - https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord&identifier=oai:pubmedcentral.nih.gov:152494&metadataPrefix=pmc_fm - - https://github.com/titipata/pubmed_parser - - TODO(martin): OAI does not seem to support the format we already have an - importer for. Maybe we can use "Daily Update Files" -- - - Daily Update Files - ------------------ - Each day, NLM produces update files that include new, revised and deleted - citations. The first Update file to be loaded after loading the complete - set of 2019 MEDLINE/PubMed Baseline files is pubmed20n1016.xml. - ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles - - NOTES: - - * OAI: https://dtd.nlm.nih.gov/archiving/2.3/xsd/archivearticle.xsd - * FTP: https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_190101.dtd - """ - - def __init__(self, **kwargs): - super().__init__(**kwargs) - self.endpoint_url = "https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi" - self.metadata_prefix = "pmc_fm" - self.name = "pubmed" - - class HarvestDoajJournalWorker(HarvestOaiPmhWorker): """ WARNING: DOAJ OAI-PMH doesn't seem to respect 'from' and 'until' params -- cgit v1.2.3