about | summary | refs | log | tree | commit | diff | stats
path: root/python/fatcat_tools
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- python/fatcat_tools/harvest/pubmed.py 16
1 files changed, 13 insertions, 3 deletions
diff --git a/python/fatcat_tools/harvest/pubmed.py b/python/fatcat_tools/harvest/pubmed.py
index 78b1755b..ffb179a0 100644
--- a/python/fatcat_tools/harvest/pubmed.py
+++ b/python/fatcat_tools/harvest/pubmed.py
@@ -176,9 +176,19 @@ class PubmedFTPWorker:
while True:
self.date_file_map = generate_date_file_map(host=self.host)
if len(self.date_file_map) == 0:
- raise ValueError(
- "map from dates to files should not be empty, maybe the HTML changed?"
- )
+ # NOTE: This may happen once - typically in December - when
+ # Pubmed publishes a baseline dataset for the year and resets
+ # the daily update directory. Details: https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/README.txt
+ if datetime.date.today().month == 12:
+ print(
+ "ignoring empty map, as we assume baseline/updatefile reset "
+ "(see also: https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/README.txt)",
+ file=sys.stderr,
+ )
+ else:
+ raise ValueError(
+ "map from dates to files should not be empty, maybe the HTML changed?"
+ )
current = self.state.next_span(continuous)
if current: