about | summary | refs | log | tree | commit | diff | stats
path: root/python/fatcat_tools
diff options
context:
space:
mode:
author: bnewbold <bnewbold@archive.org> 2022-07-25 21:09:25 +0000
committer: bnewbold <bnewbold@archive.org> 2022-07-25 21:09:25 +0000
commit: 5ecf72cbb488a9a50eb869ea55b4c2bfc1440731 (patch)
tree: 88b2a3a2ad2919cbef4f6acfdd5b986bda0baa72 /python/fatcat_tools
parent: b3eddfc398129f2fdcf4737849d436327a67a74a (diff)
parent: b12d4f0bde96bfe39df1cc94a993da4b25e53304 (diff)
download: fatcat-5ecf72cbb488a9a50eb869ea55b4c2bfc1440731.tar.gz
download: fatcat-5ecf72cbb488a9a50eb869ea55b4c2bfc1440731.zip
Merge branch 'bnewbold-dblp-iteration' into 'master'
dblp import iteration

See merge request webgroup/fatcat!141
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- python/fatcat_tools/importers/dblp_release.py 7
-rw-r--r-- python/fatcat_tools/transforms/ingest.py 4
2 files changed, 9 insertions, 2 deletions
diff --git a/python/fatcat_tools/importers/dblp_release.py b/python/fatcat_tools/importers/dblp_release.py
index 5f78ca3a..b97c3976 100644
--- a/python/fatcat_tools/importers/dblp_release.py
+++ b/python/fatcat_tools/importers/dblp_release.py
@@ -542,18 +542,21 @@ class DblpReleaseImporter(EntityImporter):
Used only in JSON dump mode, with the intent of transforming into
sandcrawler ingest requests.
"""
- EXTID_PATTERNS = [
+ SKIP_PATTERNS = [
"://doi.acm.org/",
"://doi.ieeecomputersociety.org/",
"doi.org/10.",
"wikidata.org/entity/Q",
"://arxiv.org/abs/",
+ "://hdl.handle.net/",
+ "://d-nb.info/",
+ "://www.base-search.net/",
]
urls = []
for ee in xml_elem.find_all("ee"):
url = ee.text
skip = False
- for pattern in EXTID_PATTERNS:
+ for pattern in SKIP_PATTERNS:
if pattern in url:
skip = True
break
diff --git a/python/fatcat_tools/transforms/ingest.py b/python/fatcat_tools/transforms/ingest.py
index d60f9467..1a19a651 100644
--- a/python/fatcat_tools/transforms/ingest.py
+++ b/python/fatcat_tools/transforms/ingest.py
@@ -89,6 +89,10 @@ def release_ingest_request(
url = "https://doaj.org/article/{}".format(release.ext_ids.doaj.lower())
link_source = "doaj"
link_source_id = release.ext_ids.doaj.lower()
+ elif release.ext_ids.hdl:
+ url = "https://hdl.handle.net/{}".format(release.ext_ids.hdl.lower())
+ link_source = "hdl"
+ link_source_id = release.ext_ids.hdl.lower()
if not url:
return None