| field | value | date |
|---|---|---|
| author | Bryan Newbold <bnewbold@robocracy.org> | 2019-06-20 20:21:30 -0700 |
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-06-20 20:21:30 -0700 |
| commit | e33445101cfa30fe0843acbd31448e6544ae54e1 (patch) | |
| tree | 0e0832d0c541fb0f5a9e769f4d39c73e26d0a31f /python | |
| parent | ed498a60fec55f1a2f5d10907b47971696224ec1 (diff) | |
| download | fatcat-e33445101cfa30fe0843acbd31448e6544ae54e1.tar.gz, fatcat-e33445101cfa30fe0843acbd31448e6544ae54e1.zip | |
support extids in matched importer
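The matched importer consumes JSON lines (fed in via `JsonLinePusher`) and, with this change, can match records to releases by the external-id fields named in the diff below, not only by DOI. A minimal sketch of such an input line: only `dois` and the extid keys are taken from the diff; the remaining field names and all values are illustrative placeholders.

```python
import json

# Hypothetical matched-importer input record. Only "dois" and the extid
# keys ('arxiv', 'pmid', 'pmcid', 'jstor', 'wikidata_qid', 'core',
# 'isbn13', 'ark') appear in the diff; the other fields and all values
# here are assumed for illustration.
record = {
    "dois": ["10.123/abc"],
    "arxiv": "1902.01234v1",
    "sha1": "0000000000000000000000000000000000000000",  # placeholder hash
    "size": 255629,
    "mimetype": "application/pdf",
}
print(json.dumps(record))  # one line of the file passed to the importer
```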
Diffstat (limited to 'python')
| mode | path | lines changed |
|---|---|---|
| -rwxr-xr-x | python/fatcat_import.py | 4 |
| -rw-r--r-- | python/fatcat_tools/importers/matched.py | 29 |

2 files changed, 30 insertions, 3 deletions
```diff
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index d76f706f..8595d16b 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -62,6 +62,7 @@ def run_matched(args):
     fmi = MatchedImporter(args.api,
         edit_batch_size=args.batch_size,
         editgroup_description=args.editgroup_description_override,
+        default_link_rel=args.default_link_rel,
         default_mimetype=args.default_mimetype)
     JsonLinePusher(fmi, args.json_file).run()
 
@@ -267,6 +268,9 @@ def main():
     sub_matched.add_argument('--bezerk-mode',
         action='store_true',
         help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)")
+    sub_matched.add_argument('--default-link-rel',
+        default="web",
+        help="default URL rel for matches (eg, 'publisher', 'web')")
 
     sub_arabesque_match = subparsers.add_parser('arabesque')
     sub_arabesque_match.set_defaults(
diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py
index 04ce4573..3ef617d3 100644
--- a/python/fatcat_tools/importers/matched.py
+++ b/python/fatcat_tools/importers/matched.py
@@ -4,6 +4,8 @@
 import json
 import sqlite3
 import itertools
 import fatcat_client
+
+from fatcat_tools.normal import *
 from .common import EntityImporter, clean, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS
 
@@ -53,6 +55,10 @@ class MatchedImporter(EntityImporter):
         # lookup dois
         re_list = set()
         for doi in dois:
+            doi = clean_doi(doi)
+            if not doi:
+                self.counts['skip-bad-doi'] += 1
+                return None
             try:
                 re = self.api.lookup_release(doi=doi)
             except fatcat_client.rest.ApiException as err:
@@ -64,12 +70,28 @@ class MatchedImporter(EntityImporter):
                 pass
             else:
                 re_list.add(re.ident)
+
+        # look up other external ids
+        for extid_type in ('arxiv', 'pmid', 'pmcid', 'jstor', 'wikidata_qid', 'core', 'isbn13', 'ark'):
+            extid = obj.get(extid_type)
+            if extid:
+                try:
+                    re = self.api.lookup_release(**{extid_type: extid})
+                except fatcat_client.rest.ApiException as err:
+                    if err.status != 404:
+                        raise err
+                    re = None
+                if re is None:
+                    pass
+                else:
+                    re_list.add(re.ident)
+
         release_ids = list(re_list)
         if len(release_ids) == 0:
-            self.counts['skip-no-doi'] += 1
+            self.counts['skip-no-releases'] += 1
             return None
         if len(release_ids) > SANE_MAX_RELEASES:
-            self.counts['skip-too-many-dois'] += 1
+            self.counts['skip-too-many-releases'] += 1
             return None
 
         # parse URLs and CDX
@@ -142,11 +164,12 @@ class MatchedImporter(EntityImporter):
             return None
         existing.release_ids = list(set(fe.release_ids + existing.release_ids))
         if len(existing.release_ids) > SANE_MAX_RELEASES:
-            self.counts['skip-update-too-many-url'] += 1
+            self.counts['skip-update-too-many-releases'] += 1
             return None
         existing.mimetype = existing.mimetype or fe.mimetype
         existing.size = existing.size or fe.size
         existing.md5 = existing.md5 or fe.md5
+        existing.sha1 = existing.sha1 or fe.sha1
         existing.sha256 = existing.sha256 or fe.sha256
         self.api.update_file(self.get_editgroup_id(), existing.ident, existing)
         self.counts['update'] += 1
```
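With the new argument wired through, an invocation might look like the line below. The `matched` subcommand name and the positional JSON-file argument are inferred from `sub_matched` and `args.json_file` in the diff, so treat this as a sketch rather than exact usage:

```sh
# --default-link-rel falls back to "web" when omitted (per the diff)
./fatcat_import.py matched --default-link-rel publisher matched_files.json
```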