diff options
author | bnewbold <bnewbold@archive.org> | 2022-02-14 22:03:19 +0000 |
---|---|---|
committer | bnewbold <bnewbold@archive.org> | 2022-02-14 22:03:19 +0000 |
commit | a94b3cc03f3ba59191d6fa5343759ff01d594b93 (patch) | |
tree | bfdc6524bb7cb564f33cbb537ec50393cdf51785 | |
parent | 575826cf460ac47e6af40173d5d40e26eb8cf45f (diff) | |
parent | 88fe8c60c169fea628bfb42d1f4af5297c914546 (diff) | |
download | fatcat-a94b3cc03f3ba59191d6fa5343759ff01d594b93.tar.gz fatcat-a94b3cc03f3ba59191d6fa5343759ff01d594b93.zip |
Merge branch 'bnewbold-datacite-skip-ir-containers' into 'master'
datacite importer: skip container_id for some repository sources
See merge request webgroup/fatcat!138
-rw-r--r-- | python/fatcat_tools/importers/datacite.py | 34 |
1 file changed, 34 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index b310f8bc..1d098aca 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -511,6 +511,8 @@ class DataciteImporter(EntityImporter): ): relations.append(rel) + # TODO: could use many of these relations to do release/work grouping + if relations: extra_datacite["relations"] = relations @@ -646,6 +648,38 @@ class DataciteImporter(EntityImporter): ): re.extra["container_name"] = "figshare.com" + # Columbia Institutional Repository includes full bibliographic + # metadata, which results in incorrect container_id matches. But this + # DOI prefix also publishes actual journals! + if ( + re.ext_ids.doi.startswith("10.7916/") + and "-" in re.ext_ids.doi + and re.publisher == "Columbia University" + and re.extra + and re.extra.get("datacite") + ): + for relation in re.extra["datacite"].get("relations", []): + if relation.get("relationType") == "IsVariantFormOf": + re.container_id = None + if re.release_stage in ("published", None): + re.release_stage = "submitted" + + # several institutional and other repositories (including "RWTH" and + # "DESY") also result in incorrect container_id matches. + # This probably doesn't filter out enough, but is a start. + IR_DOI_PREFIXES = [ + "10.15495/epub_ubt_", + "10.18154/rwth-20", + "10.3204/pubdb-", + "10.3204/phppubdb-", + "10.26204/kluedo/", + ] + for prefix in IR_DOI_PREFIXES and re.extra and re.extra.get("datacite"): + if re.ext_ids.doi.startswith(prefix): + for relation in re.extra["datacite"].get("relations", []): + if relation.get("relationType") == "IsVariantFormOf": + re.container_id = None + return re def try_update(self, re: ReleaseEntity) -> bool: |