aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorbnewbold <bnewbold@archive.org>2022-02-14 22:03:19 +0000
committerbnewbold <bnewbold@archive.org>2022-02-14 22:03:19 +0000
commita94b3cc03f3ba59191d6fa5343759ff01d594b93 (patch)
treebfdc6524bb7cb564f33cbb537ec50393cdf51785
parent575826cf460ac47e6af40173d5d40e26eb8cf45f (diff)
parent88fe8c60c169fea628bfb42d1f4af5297c914546 (diff)
downloadfatcat-a94b3cc03f3ba59191d6fa5343759ff01d594b93.tar.gz
fatcat-a94b3cc03f3ba59191d6fa5343759ff01d594b93.zip
Merge branch 'bnewbold-datacite-skip-ir-containers' into 'master'
datacite importer: skip container_id for some repository sources. See merge request webgroup/fatcat!138
-rw-r--r--python/fatcat_tools/importers/datacite.py34
1 files changed, 34 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index b310f8bc..1d098aca 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -511,6 +511,8 @@ class DataciteImporter(EntityImporter):
):
relations.append(rel)
+ # TODO: could use many of these relations to do release/work grouping
+
if relations:
extra_datacite["relations"] = relations
@@ -646,6 +648,38 @@ class DataciteImporter(EntityImporter):
):
re.extra["container_name"] = "figshare.com"
+ # Columbia Institutional Repository includes full bibliographic
+ # metadata, which results in incorrect container_id matches. But this
+ # DOI prefix also publishes actual journals!
+ if (
+ re.ext_ids.doi.startswith("10.7916/")
+ and "-" in re.ext_ids.doi
+ and re.publisher == "Columbia University"
+ and re.extra
+ and re.extra.get("datacite")
+ ):
+ for relation in re.extra["datacite"].get("relations", []):
+ if relation.get("relationType") == "IsVariantFormOf":
+ re.container_id = None
+ if re.release_stage in ("published", None):
+ re.release_stage = "submitted"
+
+ # several institutional and other repositories (including "RWTH" and
+ # "DESY") also result in incorrect container_id matches.
+ # This probably doesn't filter out enough, but is a start.
+ IR_DOI_PREFIXES = [
+ "10.15495/epub_ubt_",
+ "10.18154/rwth-20",
+ "10.3204/pubdb-",
+ "10.3204/phppubdb-",
+ "10.26204/kluedo/",
+ ]
+ for prefix in IR_DOI_PREFIXES and re.extra and re.extra.get("datacite"):
+ if re.ext_ids.doi.startswith(prefix):
+ for relation in re.extra["datacite"].get("relations", []):
+ if relation.get("relationType") == "IsVariantFormOf":
+ re.container_id = None
+
return re
def try_update(self, re: ReleaseEntity) -> bool: