diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2022-12-12 17:33:08 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2022-12-12 17:33:08 +0100 |
commit | acca83e1336f4341a7e1670d0d1954b69c9e2894 (patch) | |
tree | d62f27ca0f9b33dad630e43ee224523c8cdc4757 /python | |
parent | 0b693ddffb416f170a6e19db46caf1f5857d2b9b (diff) | |
download | fatcat-acca83e1336f4341a7e1670d0d1954b69c9e2894.tar.gz fatcat-acca83e1336f4341a7e1670d0d1954b69c9e2894.zip |
pubmed: ignore empty map during baseline update
> NLM produces a baseline set of PubMed citation records in XML format
for download on an annual basis. The annual baseline is released in
December of each year. --
https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/README.txt
Last occurrence: Dec 8, 2022. Since we do not know the exact date, but the
PubMed docs explicitly state "December", we ignore the empty map error
during this month.
Diffstat (limited to 'python')
-rw-r--r-- | python/fatcat_tools/harvest/pubmed.py | 16 |
1 files changed, 13 insertions, 3 deletions
diff --git a/python/fatcat_tools/harvest/pubmed.py b/python/fatcat_tools/harvest/pubmed.py index 78b1755b..ffb179a0 100644 --- a/python/fatcat_tools/harvest/pubmed.py +++ b/python/fatcat_tools/harvest/pubmed.py @@ -176,9 +176,19 @@ class PubmedFTPWorker: while True: self.date_file_map = generate_date_file_map(host=self.host) if len(self.date_file_map) == 0: - raise ValueError( - "map from dates to files should not be empty, maybe the HTML changed?" - ) + # NOTE: This may happen once - typically in December - when + # Pubmed publishes a baseline dataset for the year and resets + # the daily update directory. Details: https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/README.txt + if datetime.date.today().month == 12: + print( + "ignoring empty map, as we assume baseline/updatefile reset " + "(see also: https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/README.txt)", + file=sys.stderr, + ) + else: + raise ValueError( + "map from dates to files should not be empty, maybe the HTML changed?" + ) current = self.state.next_span(continuous) if current: |