aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2022-07-19 12:26:41 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2022-07-19 12:26:41 -0700
commit: 905ad4fc433d3a026649645311aa08492db1226e (patch)
tree: 4771ea18d7d8c35f06c385b5d42ed962fcc35d25 /python/fatcat_tools/importers
parent: e9525e5a9fce2927048e10716fe86548e91824c6 (diff)
download: fatcat-905ad4fc433d3a026649645311aa08492db1226e.tar.gz
fatcat-905ad4fc433d3a026649645311aa08492db1226e.zip
dblp: more skip patterns, and rename variable
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r-- python/fatcat_tools/importers/dblp_release.py 7
1 files changed, 5 insertions, 2 deletions
diff --git a/python/fatcat_tools/importers/dblp_release.py b/python/fatcat_tools/importers/dblp_release.py
index 5f78ca3a..b97c3976 100644
--- a/python/fatcat_tools/importers/dblp_release.py
+++ b/python/fatcat_tools/importers/dblp_release.py
@@ -542,18 +542,21 @@ class DblpReleaseImporter(EntityImporter):
Used only in JSON dump mode, with the intent of transforming into
sandcrawler ingest requests.
"""
- EXTID_PATTERNS = [
+ SKIP_PATTERNS = [
"://doi.acm.org/",
"://doi.ieeecomputersociety.org/",
"doi.org/10.",
"wikidata.org/entity/Q",
"://arxiv.org/abs/",
+ "://hdl.handle.net/",
+ "://d-nb.info/",
+ "://www.base-search.net/",
]
urls = []
for ee in xml_elem.find_all("ee"):
url = ee.text
skip = False
- for pattern in EXTID_PATTERNS:
+ for pattern in SKIP_PATTERNS:
if pattern in url:
skip = True
break