aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author bnewbold <bnewbold@archive.org> 2022-12-12 18:55:37 +0000
committer bnewbold <bnewbold@archive.org> 2022-12-12 18:55:37 +0000
commit 7be55b661edc04db0830f755997a05436ae7599b (patch)
tree d62f27ca0f9b33dad630e43ee224523c8cdc4757
parent 0b693ddffb416f170a6e19db46caf1f5857d2b9b (diff)
parent acca83e1336f4341a7e1670d0d1954b69c9e2894 (diff)
download fatcat-7be55b661edc04db0830f755997a05436ae7599b.tar.gz
fatcat-7be55b661edc04db0830f755997a05436ae7599b.zip
Merge branch 'martin-pubmed-harvest-empty-map-december' into 'master'
pubmed: ignore empty map during baseline update

See merge request webgroup/fatcat!145
-rw-r--r-- python/fatcat_tools/harvest/pubmed.py 16
1 files changed, 13 insertions, 3 deletions
diff --git a/python/fatcat_tools/harvest/pubmed.py b/python/fatcat_tools/harvest/pubmed.py
index 78b1755b..ffb179a0 100644
--- a/python/fatcat_tools/harvest/pubmed.py
+++ b/python/fatcat_tools/harvest/pubmed.py
@@ -176,9 +176,19 @@ class PubmedFTPWorker:
while True:
self.date_file_map = generate_date_file_map(host=self.host)
if len(self.date_file_map) == 0:
- raise ValueError(
- "map from dates to files should not be empty, maybe the HTML changed?"
- )
+ # NOTE: This may happen once - typically in December - when
+ # Pubmed publishes a baseline dataset for the year and resets
+ # the daily update directory. Details: https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/README.txt
+ if datetime.date.today().month == 12:
+ print(
+ "ignoring empty map, as we assume baseline/updatefile reset "
+ "(see also: https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/README.txt)",
+ file=sys.stderr,
+ )
+ else:
+ raise ValueError(
+ "map from dates to files should not be empty, maybe the HTML changed?"
+ )
current = self.state.next_span(continuous)
if current: