diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2022-07-19 12:26:41 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2022-07-19 12:26:41 -0700 |
commit | 905ad4fc433d3a026649645311aa08492db1226e (patch) | |
tree | 4771ea18d7d8c35f06c385b5d42ed962fcc35d25 /python/fatcat_tools/importers | |
parent | e9525e5a9fce2927048e10716fe86548e91824c6 (diff) | |
download | fatcat-905ad4fc433d3a026649645311aa08492db1226e.tar.gz fatcat-905ad4fc433d3a026649645311aa08492db1226e.zip |
dblp: more skip patterns, and rename variable
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r-- | python/fatcat_tools/importers/dblp_release.py | 7 |
1 file changed, 5 insertions, 2 deletions
```diff
diff --git a/python/fatcat_tools/importers/dblp_release.py b/python/fatcat_tools/importers/dblp_release.py
index 5f78ca3a..b97c3976 100644
--- a/python/fatcat_tools/importers/dblp_release.py
+++ b/python/fatcat_tools/importers/dblp_release.py
@@ -542,18 +542,21 @@ class DblpReleaseImporter(EntityImporter):
         Used only in JSON dump mode, with the intent of transforming into
         sandcrawler ingest requests.
         """
-        EXTID_PATTERNS = [
+        SKIP_PATTERNS = [
             "://doi.acm.org/",
             "://doi.ieeecomputersociety.org/",
             "doi.org/10.",
             "wikidata.org/entity/Q",
             "://arxiv.org/abs/",
+            "://hdl.handle.net/",
+            "://d-nb.info/",
+            "://www.base-search.net/",
         ]
         urls = []
         for ee in xml_elem.find_all("ee"):
             url = ee.text
             skip = False
-            for pattern in EXTID_PATTERNS:
+            for pattern in SKIP_PATTERNS:
                 if pattern in url:
                     skip = True
                     break
```