diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2020-05-22 16:02:07 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-05-22 16:03:10 -0700 |
commit | 50023c3a6dbbac3da8cbf444ef5b5e47850394e0 (patch) | |
tree | f7d94ca6db9de142b75a19baff04188f776cfd2d /python/fatcat_tools | |
parent | 0e14ec400426162f4d2d6c4bcf624ab1515fb07f (diff) | |
download | fatcat-50023c3a6dbbac3da8cbf444ef5b5e47850394e0.tar.gz fatcat-50023c3a6dbbac3da8cbf444ef5b5e47850394e0.zip |
ingest importer: don't use glutton matches
Until reviewing I didn't realize we were even doing this currently.
Hopefully this has not impacted too many imports, as almost all ingests use
an external identifier, so only those with identifiers not in fatcat for
whatever reason would have been affected.
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- | python/fatcat_tools/importers/ingest.py | 6 |
1 files changed, 3 insertions, 3 deletions
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index 4772bfaa..7d5211fc 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -19,6 +19,7 @@ class IngestFileResultImporter(EntityImporter): editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) + self.use_glutton_match = False self.default_link_rel = kwargs.get("default_link_rel", "web") assert self.default_link_rel self.require_grobid = require_grobid @@ -109,7 +110,7 @@ class IngestFileResultImporter(EntityImporter): continue release_ident = release.ident break - if not release_ident and row.get('grobid'): + if self.use_glutton_match and not release_ident and row.get('grobid'): # try biblio-glutton extracted hit if row['grobid'].get('fatcat_release'): release_ident = row['grobid']['fatcat_release'].split('_')[-1] @@ -197,8 +198,7 @@ class IngestFileResultImporter(EntityImporter): if not existing: return True - # the following checks all assume there is an existing item - + # NOTE: the following checks all assume there is an existing item if (fe.release_ids[0] in existing.release_ids) and existing.urls: # TODO: could still, in theory update with the new URL? self.counts['exists'] += 1 |