commit bfc0d6597fa98e89615e6d82eed9488fd9a1e087
author:    Bryan Newbold <bnewbold@robocracy.org>  2019-04-18 16:07:14 -0700
committer: Bryan Newbold <bnewbold@robocracy.org>  2019-04-18 16:07:14 -0700
tree:      769a7bb28fb32296ec49c0aa30d3e0ce115b1162
parent:    95fdfd31d73a2729eae3f20b3b6488d04782e0ce
minor arabesque tweaks
-rwxr-xr-x  python/fatcat_import.py                    | 34 ++++++++++++++++++++++------------
-rw-r--r--  python/fatcat_tools/importers/arabesque.py |  2 ++
2 files changed, 24 insertions(+), 12 deletions(-)
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 4eae49ef..de5b4202 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -31,7 +31,10 @@ def run_matched(args):
         edit_batch_size=args.batch_size)
     JsonLinePusher(fmi, args.json_file).run()
 
-def run_arabesque_matched(args):
+def run_arabesque_match(args):
+    if (args.sqlite_file and args.json_file) or not (args.sqlite_file or
+            args.json_file):
+        print("Supply one of --sqlite-file or --json-file")
     ami = ArabesqueMatchImporter(args.api,
         do_updates=args.do_updates,
         require_grobid=(not args.no_require_grobid),
@@ -39,8 +42,11 @@ def run_arabesque_matched(args):
         crawl_id=args.crawl_id,
         default_link_rel=args.default_link_rel,
         edit_batch_size=args.batch_size)
-    SqlitePusher(ami, args.db_file, "crawl_result",
-        ARABESQUE_MATCH_WHERE_CLAUSE).run()
+    if args.sqlite_file:
+        SqlitePusher(ami, args.sqlite_file, "crawl_result",
+            ARABESQUE_MATCH_WHERE_CLAUSE).run()
+    elif args.json_file:
+        JsonLinePusher(ami, args.json_file).run()
 
 def run_grobid_metadata(args):
     fmi = GrobidMetadataImporter(args.api,
@@ -162,25 +168,29 @@ def main():
         action='store_true',
         help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)")
 
-    sub_arabesque_matched = subparsers.add_parser('arabesque_matched')
-    sub_arabesque_matched.set_defaults(
-        func=run_arabesque_matched,
+    sub_arabesque_match = subparsers.add_parser('arabesque')
+    sub_arabesque_match.set_defaults(
+        func=run_arabesque_match,
         auth_var="FATCAT_AUTH_WORKER_CRAWL",
     )
-    sub_arabesque_matched.add_argument('db_file',
+    sub_arabesque_match.add_argument('--sqlite-file',
         help="sqlite database file to import from")
-    sub_arabesque_matched.add_argument('--do-updates',
+    sub_arabesque_match.add_argument('--json-file',
+        help="JSON file to import from (or stdin)",
+        type=argparse.FileType('r'))
+    sub_arabesque_match.add_argument('--do-updates',
         action='store_true',
         help="update pre-existing file entities if new match (instead of skipping)")
-    sub_arabesque_matched.add_argument('--no-require-grobid',
+    sub_arabesque_match.add_argument('--no-require-grobid',
         action='store_true',
         help="whether postproc_status column must be '200'")
-    sub_arabesque_matched.add_argument('--extid-type',
+    sub_arabesque_match.add_argument('--extid-type',
         default="doi",
         help="identifer type in the database (eg, 'doi', 'pmcid'")
-    sub_arabesque_matched.add_argument('--crawl-id',
+    sub_arabesque_match.add_argument('--crawl-id',
         help="crawl ID (optionally included in editgroup metadata)")
-    sub_arabesque_matched.add_argument('--default-link-rel',
+    sub_arabesque_match.add_argument('--default-link-rel',
+        default="web",
         help="default URL rel for matches (eg, 'publisher', 'web')")
 
     sub_grobid_metadata = subparsers.add_parser('grobid-metadata')
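A note on the new guard in run_arabesque_match: as committed, it prints "Supply one of --sqlite-file or --json-file" but does not return or exit, so execution continues; with both flags set the sqlite branch silently wins, and with neither set no pusher runs at all. Below is a minimal sketch (not part of this commit) of pushing that either/or constraint into argparse itself, reusing the option names from the diff:

    import argparse

    # Sketch: a required mutually exclusive group makes argparse exit with a
    # usage error when zero or both of the two inputs are supplied.
    parser = argparse.ArgumentParser(prog='fatcat_import.py')
    subparsers = parser.add_subparsers()
    sub_arabesque_match = subparsers.add_parser('arabesque')
    group = sub_arabesque_match.add_mutually_exclusive_group(required=True)
    group.add_argument('--sqlite-file',
        help="sqlite database file to import from")
    group.add_argument('--json-file',
        help="JSON file to import from (or stdin)",
        type=argparse.FileType('r'))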
diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py
index e143ad99..c0311903 100644
--- a/python/fatcat_tools/importers/arabesque.py
+++ b/python/fatcat_tools/importers/arabesque.py
@@ -65,6 +65,7 @@ class ArabesqueMatchImporter(EntityImporter):
         assert extid_type in ('doi', 'pmcid', 'pmid')
         self.extid_type = extid_type
         self.default_link_rel = kwargs.get("default_link_rel", "web")
+        assert self.default_link_rel
         self.default_mime = kwargs.get("default_mime", None)
         self.do_updates = kwargs.get("do_updates", False)
         self.require_grobid = require_grobid
@@ -148,6 +149,7 @@ class ArabesqueMatchImporter(EntityImporter):
             self.counts['skip-update-disabled'] += 1
             return False
 
+        # TODO: this code path never gets hit because of the check above
         if set(fe.release_ids) == set(existing.release_ids):
             existing_urls = set([u.url for u in existing.urls])
             new_urls = set([u.url for u in fe.urls])
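The new assert pairs with the default="web" added to --default-link-rel above: previously the flag had no default, so args.default_link_rel arrived as None, and dict.get() only falls back for a missing key, not for a key explicitly set to None. A quick standalone illustration (values hypothetical):

    # dict.get() does not rescue an explicit None, so before this commit
    # self.default_link_rel could end up None; the assert now catches that.
    kwargs = {"default_link_rel": None}           # argparse flag with no default
    print(kwargs.get("default_link_rel", "web"))  # -> None
    del kwargs["default_link_rel"]
    print(kwargs.get("default_link_rel", "web"))  # -> web

With the renamed subcommand, an invocation might look like ./fatcat_import.py arabesque --json-file matches.json (file name hypothetical), with FATCAT_AUTH_WORKER_CRAWL set as the set_defaults above expects.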