diff options
author | bnewbold <bnewbold@archive.org> | 2020-03-23 16:37:08 +0000 |
---|---|---|
committer | Martin Czygan <martin@archive.org> | 2020-03-23 16:37:08 +0000 |
commit | 8af9df9fff925c90f2bfb52c4a2b2ea918b4eda2 (patch) | |
tree | 8d1e9a900279ab0e151e488f63273ac4f9b702fe /python/fatcat_tools | |
parent | 4bcef62ecd98f2719fc4d1cef35394b0bad5cb2b (diff) | |
download | fatcat-8af9df9fff925c90f2bfb52c4a2b2ea918b4eda2.tar.gz fatcat-8af9df9fff925c90f2bfb52c4a2b2ea918b4eda2.zip |
datacite: add year sanity restrictions
Example of entities with bogus years:
https://fatcat.wiki/release/search?q=doi_registrar%3Adatacite+year%3A%3E2100
We can do a clean-up task, but first we need to prevent the creation of new
bad metadata.
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- | python/fatcat_tools/importers/datacite.py | 7 |
1 files changed, 7 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 4e382348..db4709c2 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -222,6 +222,7 @@ class DataciteImporter(EntityImporter): self.read_issn_map_file(issn_map_file) self.debug = debug self.insert_log_file = insert_log_file + self.this_year = datetime.datetime.now().year print('datacite with debug={}'.format(self.debug), file=sys.stderr) @@ -311,6 +312,12 @@ class DataciteImporter(EntityImporter): release_date, release_month, release_year = parse_datacite_dates( attributes.get('dates', [])) + # block bogus far-future years/dates + if release_year is not None and (release_year > (self.this_year + 5) or release_year < 1000): + release_date = None + release_month = None + release_year = None + # Some records do not use the "dates" field (e.g. micropub), but: # "attributes.published" or "attributes.publicationYear" if not any((release_date, release_month, release_year)): |