diff options
Diffstat (limited to 'python/fatcat_tools/importers')
| -rw-r--r-- | python/fatcat_tools/importers/datacite.py | 19 | 
1 files changed, 19 insertions, 0 deletions
| diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index d4ff6784..86740e80 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -154,6 +154,15 @@ UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union(set((  # UNKNOWN_MARKERS_LOWER are lowercase version of UNKNOWN blacklist.  UNKNOWN_MARKERS_LOWER = set((v.lower() for v in UNKNOWN_MARKERS)) +# Any "min" number of "tokens" will signal "spam", https://fatcat.wiki/release/rzcpjwukobd4pj36ipla22cnoi +DATACITE_TITLE_SPAM_WORDGROUPS = [ +    { +        "tokens": ('full', 'movies', 'movie', 'watch', 'streaming', 'online', +                   'free', 'hd', 'download', 'english', 'subtitle', 'bluray'), +        "min": 4, +    } +] +  # TODO(martin): merge this with other maps and lookup functions, eventually.  LICENSE_SLUG_MAP = {      "//archaeologydataservice.ac.uk/advice/termsofuseandaccess.xhtml/": "ADS-UK", @@ -337,6 +346,16 @@ class DataciteImporter(EntityImporter):              print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr)              return False +        # check for blacklisted "spam", e.g. "FULL MOVIE" +        for rule in DATACITE_TITLE_SPAM_WORDGROUPS: +            seen = set() +            for token in rule.get("tokens", []): +                if token in title.lower(): +                    seen.add(token) +            if len(seen) >= rule.get("min"): +                print("[{}] skipping spammy title: {}".format(doi, obj), file=sys.stderr) +                return False +          if not subtitle:              subtitle = None          else: | 
