aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2022-12-12 17:33:08 +0100
committerMartin Czygan <martin.czygan@gmail.com>2022-12-12 17:33:08 +0100
commitacca83e1336f4341a7e1670d0d1954b69c9e2894 (patch)
treed62f27ca0f9b33dad630e43ee224523c8cdc4757
parent0b693ddffb416f170a6e19db46caf1f5857d2b9b (diff)
downloadfatcat-acca83e1336f4341a7e1670d0d1954b69c9e2894.tar.gz
fatcat-acca83e1336f4341a7e1670d0d1954b69c9e2894.zip
pubmed: ignore empty map during baseline update
> NLM produces a baseline set of PubMed citation records in XML format for download on an annual basis. The annual baseline is released in December of each year. -- https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/README.txt Last occurrence: Dec 8, 2022. Since we do not know the exact date, but the PubMed docs explicitly state "December", we ignore the empty map error during this month.
-rw-r--r--python/fatcat_tools/harvest/pubmed.py16
1 files changed, 13 insertions, 3 deletions
diff --git a/python/fatcat_tools/harvest/pubmed.py b/python/fatcat_tools/harvest/pubmed.py
index 78b1755b..ffb179a0 100644
--- a/python/fatcat_tools/harvest/pubmed.py
+++ b/python/fatcat_tools/harvest/pubmed.py
@@ -176,9 +176,19 @@ class PubmedFTPWorker:
while True:
self.date_file_map = generate_date_file_map(host=self.host)
if len(self.date_file_map) == 0:
- raise ValueError(
- "map from dates to files should not be empty, maybe the HTML changed?"
- )
+ # NOTE: This may happen once - typically in December - when
+ # Pubmed publishes a baseline dataset for the year and resets
+ # the daily update directory. Details: https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/README.txt
+ if datetime.date.today().month == 12:
+ print(
+ "ignoring empty map, as we assume baseline/updatefile reset "
+ "(see also: https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/README.txt)",
+ file=sys.stderr,
+ )
+ else:
+ raise ValueError(
+ "map from dates to files should not be empty, maybe the HTML changed?"
+ )
current = self.state.next_span(continuous)
if current: