diff options
author | Martin Czygan <martin@archive.org> | 2020-11-19 22:36:55 +0000 |
---|---|---|
committer | Martin Czygan <martin@archive.org> | 2020-11-19 22:36:55 +0000 |
commit | 03eadfc7e2bee4213345f6464378e87b8f741d20 (patch) | |
tree | 3e5b13af8ba46b240f9ae53d5f522fb7ee02c219 /python/fatcat_tools/importers/datacite.py | |
parent | 5afde4690a4653db53fe4962af5da3eb9188d9a2 (diff) | |
parent | a73b73c2944b3df2a62886c4e6b69c93f5e74222 (diff) | |
download | fatcat-03eadfc7e2bee4213345f6464378e87b8f741d20.tar.gz fatcat-03eadfc7e2bee4213345f6464378e87b8f741d20.zip |
Merge branch 'bnewbold-xml-html-ingest' into 'master'
HTML webcapture ingest (and XML file ingest)
See merge request webgroup/fatcat!88
Diffstat (limited to 'python/fatcat_tools/importers/datacite.py')
-rw-r--r-- | python/fatcat_tools/importers/datacite.py | 8 |
1 files changed, 4 insertions, 4 deletions
```diff
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 86740e80..5cdc5577 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -151,7 +151,7 @@ UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union(set((
     'Unknown',
 )))
 
-# UNKNOWN_MARKERS_LOWER are lowercase version of UNKNOWN blacklist.
+# UNKNOWN_MARKERS_LOWER are lowercase version of UNKNOWN blocklist.
 UNKNOWN_MARKERS_LOWER = set((v.lower() for v in UNKNOWN_MARKERS))
 
 # Any "min" number of "tokens" will signal "spam", https://fatcat.wiki/release/rzcpjwukobd4pj36ipla22cnoi
@@ -346,7 +346,7 @@ class DataciteImporter(EntityImporter):
             print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr)
             return False
 
-        # check for blacklisted "spam", e.g. "FULL MOVIE"
+        # check for blocklisted "spam", e.g. "FULL MOVIE"
         for rule in DATACITE_TITLE_SPAM_WORDGROUPS:
             seen = set()
             for token in rule.get("tokens", []):
@@ -819,7 +819,7 @@ class DataciteImporter(EntityImporter):
         contribs = []
 
         # Names, that should be ignored right away.
-        name_blacklist = set(('Occdownload Gbif.Org',))
+        name_blocklist = set(('Occdownload Gbif.Org',))
 
         i = 0
         for c in creators:
@@ -861,7 +861,7 @@ class DataciteImporter(EntityImporter):
                     continue
                 if not name:
                     name = "{} {}".format(given_name or '', surname or '').strip()
-                if name in name_blacklist:
+                if name in name_blocklist:
                     continue
                 if name.lower() in UNKNOWN_MARKERS_LOWER:
                     continue
```