diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2022-12-12 17:33:08 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2022-12-12 17:33:08 +0100 |
commit | acca83e1336f4341a7e1670d0d1954b69c9e2894 (patch) | |
tree | d62f27ca0f9b33dad630e43ee224523c8cdc4757 | |
parent | 0b693ddffb416f170a6e19db46caf1f5857d2b9b (diff) | |
download | fatcat-acca83e1336f4341a7e1670d0d1954b69c9e2894.tar.gz fatcat-acca83e1336f4341a7e1670d0d1954b69c9e2894.zip |
pubmed: ignore empty map during baseline update
> NLM produces a baseline set of PubMed citation records in XML format
for download on an annual basis. The annual baseline is released in
December of each year. --
https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/README.txt
Last occurrence: Dec 8, 2022. Since we do not know the exact date, but the
PubMed docs explicitly state "December", we ignore the empty map error
during this month.
-rw-r--r-- | python/fatcat_tools/harvest/pubmed.py | 16 |
1 files changed, 13 insertions, 3 deletions
diff --git a/python/fatcat_tools/harvest/pubmed.py b/python/fatcat_tools/harvest/pubmed.py index 78b1755b..ffb179a0 100644 --- a/python/fatcat_tools/harvest/pubmed.py +++ b/python/fatcat_tools/harvest/pubmed.py @@ -176,9 +176,19 @@ class PubmedFTPWorker: while True: self.date_file_map = generate_date_file_map(host=self.host) if len(self.date_file_map) == 0: - raise ValueError( - "map from dates to files should not be empty, maybe the HTML changed?" - ) + # NOTE: This may happen once - typically in December - when + # Pubmed publishes a baseline dataset for the year and resets + # the daily update directory. Details: https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/README.txt + if datetime.date.today().month == 12: + print( + "ignoring empty map, as we assume baseline/updatefile reset " + "(see also: https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/README.txt)", + file=sys.stderr, + ) + else: + raise ValueError( + "map from dates to files should not be empty, maybe the HTML changed?" + ) current = self.state.next_span(continuous) if current: |