aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2022-12-12 17:33:08 +0100
committer Martin Czygan <martin.czygan@gmail.com> 2022-12-12 17:33:08 +0100
commit acca83e1336f4341a7e1670d0d1954b69c9e2894 (patch)
tree d62f27ca0f9b33dad630e43ee224523c8cdc4757 /python
parent 0b693ddffb416f170a6e19db46caf1f5857d2b9b (diff)
download fatcat-acca83e1336f4341a7e1670d0d1954b69c9e2894.tar.gz
fatcat-acca83e1336f4341a7e1670d0d1954b69c9e2894.zip
pubmed: ignore empty map during baseline update
> NLM produces a baseline set of PubMed citation records in XML format for download on an annual basis. The annual baseline is released in December of each year. -- https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/README.txt

Last occurrence: Dec 8, 2022. Since we do not know the exact date, but the PubMed docs explicitly state "December", we ignore the empty map error during this month.
Diffstat (limited to 'python')
-rw-r--r-- python/fatcat_tools/harvest/pubmed.py 16
1 files changed, 13 insertions, 3 deletions
diff --git a/python/fatcat_tools/harvest/pubmed.py b/python/fatcat_tools/harvest/pubmed.py
index 78b1755b..ffb179a0 100644
--- a/python/fatcat_tools/harvest/pubmed.py
+++ b/python/fatcat_tools/harvest/pubmed.py
@@ -176,9 +176,19 @@ class PubmedFTPWorker:
while True:
self.date_file_map = generate_date_file_map(host=self.host)
if len(self.date_file_map) == 0:
- raise ValueError(
- "map from dates to files should not be empty, maybe the HTML changed?"
- )
+ # NOTE: This may happen once - typically in December - when
+ # Pubmed publishes a baseline dataset for the year and resets
+ # the daily update directory. Details: https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/README.txt
+ if datetime.date.today().month == 12:
+ print(
+ "ignoring empty map, as we assume baseline/updatefile reset "
+ "(see also: https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/README.txt)",
+ file=sys.stderr,
+ )
+ else:
+ raise ValueError(
+ "map from dates to files should not be empty, maybe the HTML changed?"
+ )
current = self.state.next_span(continuous)
if current: