diff options
author | bnewbold <bnewbold@archive.org> | 2022-12-12 18:55:37 +0000 |
---|---|---|
committer | bnewbold <bnewbold@archive.org> | 2022-12-12 18:55:37 +0000 |
commit | 7be55b661edc04db0830f755997a05436ae7599b (patch) | |
tree | d62f27ca0f9b33dad630e43ee224523c8cdc4757 /python | |
parent | 0b693ddffb416f170a6e19db46caf1f5857d2b9b (diff) | |
parent | acca83e1336f4341a7e1670d0d1954b69c9e2894 (diff) | |
download | fatcat-7be55b661edc04db0830f755997a05436ae7599b.tar.gz fatcat-7be55b661edc04db0830f755997a05436ae7599b.zip |
Merge branch 'martin-pubmed-harvest-empty-map-december' into 'master'
pubmed: ignore empty map during baseline update
See merge request webgroup/fatcat!145
Diffstat (limited to 'python')
-rw-r--r-- | python/fatcat_tools/harvest/pubmed.py | 16 |
1 file changed, 13 insertions, 3 deletions
diff --git a/python/fatcat_tools/harvest/pubmed.py b/python/fatcat_tools/harvest/pubmed.py
index 78b1755b..ffb179a0 100644
--- a/python/fatcat_tools/harvest/pubmed.py
+++ b/python/fatcat_tools/harvest/pubmed.py
@@ -176,9 +176,19 @@ class PubmedFTPWorker:
         while True:
             self.date_file_map = generate_date_file_map(host=self.host)
             if len(self.date_file_map) == 0:
-                raise ValueError(
-                    "map from dates to files should not be empty, maybe the HTML changed?"
-                )
+                # NOTE: This may happen once - typically in December - when
+                # Pubmed publishes a baseline dataset for the year and resets
+                # the daily update directory. Details: https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/README.txt
+                if datetime.date.today().month == 12:
+                    print(
+                        "ignoring empty map, as we assume baseline/updatefile reset "
+                        "(see also: https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/README.txt)",
+                        file=sys.stderr,
+                    )
+                else:
+                    raise ValueError(
+                        "map from dates to files should not be empty, maybe the HTML changed?"
+                    )
 
             current = self.state.next_span(continuous)
             if current: