From acca83e1336f4341a7e1670d0d1954b69c9e2894 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Mon, 12 Dec 2022 17:33:08 +0100 Subject: pubmed: ignore empty map during baseline update > NLM produces a baseline set of PubMed citation records in XML format for download on an annual basis. The annual baseline is released in December of each year. -- https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/README.txt Last occurrence Dec 8, 2022. Since we do not know the exact date, but the Pubmed docs explicitly state "December", we ignore the empty map error in this month. --- python/fatcat_tools/harvest/pubmed.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/python/fatcat_tools/harvest/pubmed.py b/python/fatcat_tools/harvest/pubmed.py index 78b1755b..ffb179a0 100644 --- a/python/fatcat_tools/harvest/pubmed.py +++ b/python/fatcat_tools/harvest/pubmed.py @@ -176,9 +176,19 @@ class PubmedFTPWorker: while True: self.date_file_map = generate_date_file_map(host=self.host) if len(self.date_file_map) == 0: - raise ValueError( - "map from dates to files should not be empty, maybe the HTML changed?" - ) + # NOTE: This may happen once - typically in December - when + # Pubmed publishes a baseline dataset for the year and resets + # the daily update directory. Details: https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/README.txt + if datetime.date.today().month == 12: + print( + "ignoring empty map, as we assume baseline/updatefile reset " + "(see also: https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/README.txt)", + file=sys.stderr, + ) + else: + raise ValueError( + "map from dates to files should not be empty, maybe the HTML changed?" + ) current = self.state.next_span(continuous) if current: -- cgit v1.2.3