diff options
author | bnewbold <bnewbold@archive.org> | 2022-07-25 21:09:25 +0000 |
---|---|---|
committer | bnewbold <bnewbold@archive.org> | 2022-07-25 21:09:25 +0000 |
commit | 5ecf72cbb488a9a50eb869ea55b4c2bfc1440731 (patch) | |
tree | 88b2a3a2ad2919cbef4f6acfdd5b986bda0baa72 /python/fatcat_tools | |
parent | b3eddfc398129f2fdcf4737849d436327a67a74a (diff) | |
parent | b12d4f0bde96bfe39df1cc94a993da4b25e53304 (diff) | |
download | fatcat-5ecf72cbb488a9a50eb869ea55b4c2bfc1440731.tar.gz fatcat-5ecf72cbb488a9a50eb869ea55b4c2bfc1440731.zip |
Merge branch 'bnewbold-dblp-iteration' into 'master'
dblp import iteration
See merge request webgroup/fatcat!141
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- | python/fatcat_tools/importers/dblp_release.py | 7 | ||||
-rw-r--r-- | python/fatcat_tools/transforms/ingest.py | 4 |
2 files changed, 9 insertions, 2 deletions
diff --git a/python/fatcat_tools/importers/dblp_release.py b/python/fatcat_tools/importers/dblp_release.py
index 5f78ca3a..b97c3976 100644
--- a/python/fatcat_tools/importers/dblp_release.py
+++ b/python/fatcat_tools/importers/dblp_release.py
@@ -542,18 +542,21 @@ class DblpReleaseImporter(EntityImporter):
         Used only in JSON dump mode, with the intent of transforming into
         sandcrawler ingest requests.
         """
-        EXTID_PATTERNS = [
+        SKIP_PATTERNS = [
             "://doi.acm.org/",
             "://doi.ieeecomputersociety.org/",
             "doi.org/10.",
             "wikidata.org/entity/Q",
             "://arxiv.org/abs/",
+            "://hdl.handle.net/",
+            "://d-nb.info/",
+            "://www.base-search.net/",
         ]
         urls = []
         for ee in xml_elem.find_all("ee"):
             url = ee.text
             skip = False
-            for pattern in EXTID_PATTERNS:
+            for pattern in SKIP_PATTERNS:
                 if pattern in url:
                     skip = True
                     break
diff --git a/python/fatcat_tools/transforms/ingest.py b/python/fatcat_tools/transforms/ingest.py
index d60f9467..1a19a651 100644
--- a/python/fatcat_tools/transforms/ingest.py
+++ b/python/fatcat_tools/transforms/ingest.py
@@ -89,6 +89,10 @@ def release_ingest_request(
         url = "https://doaj.org/article/{}".format(release.ext_ids.doaj.lower())
         link_source = "doaj"
         link_source_id = release.ext_ids.doaj.lower()
+    elif release.ext_ids.hdl:
+        url = "https://hdl.handle.net/{}".format(release.ext_ids.hdl.lower())
+        link_source = "hdl"
+        link_source_id = release.ext_ids.hdl.lower()
 
     if not url:
         return None