about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2019-04-12 22:10:06 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2019-04-12 22:10:06 -0700
commit: 759867197348f93d04e695782e7e337c7a69ea85 (patch)
tree: 50b0ba54b7d63fbf1a566868c21e56b2a83db2eb
parent: c63b663e2fcfb0c4544653f7f30f7a548103ef2b (diff)
download: fatcat-759867197348f93d04e695782e7e337c7a69ea85.tar.gz
fatcat-759867197348f93d04e695782e7e337c7a69ea85.zip
early version of arabesque importer
-rwxr-xr-x python/fatcat_import.py 28
-rw-r--r-- python/fatcat_tools/importers/__init__.py 1
2 files changed, 29 insertions, 0 deletions
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index aea8c757..f04a63ef 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -31,6 +31,16 @@ def run_matched(args):
edit_batch_size=args.batch_size)
JsonLinePusher(fmi, args.json_file).run()
+def run_arabesque_matched(args):
+ ami = ArabesqueMatchImporter(args.api,
+ do_updates=args.do_updates,
+ extid_type=args.extid_type,
+ crawl_id=args.crawl_id,
+ default_link_rel=args.default_link_rel,
+ edit_batch_size=args.batch_size)
+ SqlitePusher(ami, args.db_file, "crawl_result",
+ ARABESQUE_MATCH_WHERE_CLAUSE).run()
+
def run_grobid_metadata(args):
fmi = GrobidMetadataImporter(args.api,
edit_batch_size=args.batch_size,
@@ -151,6 +161,24 @@ def main():
action='store_true',
help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)")
+ sub_arabesque_matched = subparsers.add_parser('arabesque_matched')
+ sub_arabesque_matched.set_defaults(
+ func=run_arabesque_matched,
+ auth_var="FATCAT_API_AUTH_TOKEN",
+ )
+ sub_arabesque_matched.add_argument('db_file',
+ help="sqlite database file to import from")
+ sub_arabesque_matched.add_argument('--do-updates',
+ action='store_true',
+ help="update pre-existing file entities if new match (instead of skipping)")
+ sub_arabesque_matched.add_argument('--extid-type',
+ default="doi",
+ help="identifer type in the database (eg, 'doi', 'pmcid'")
+ sub_arabesque_matched.add_argument('--crawl-id',
+ help="crawl ID (optionally included in editgroup metadata)")
+ sub_arabesque_matched.add_argument('--default-link-rel',
+ help="default URL rel for matches (eg, 'publisher', 'web')")
+
sub_grobid_metadata = subparsers.add_parser('grobid-metadata')
sub_grobid_metadata.set_defaults(
func=run_grobid_metadata,
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 94802915..f5ff43e5 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -18,6 +18,7 @@ from .grobid_metadata import GrobidMetadataImporter
from .journal_metadata import JournalMetadataImporter
from .matched import MatchedImporter
from .orcid import OrcidImporter
+from .arabesque import ArabesqueMatchImporter, ARABESQUE_MATCH_WHERE_CLAUSE
from .wayback_static import auto_wayback_static
from .cdl_dash_dat import auto_cdl_dash_dat
#from .kafka_source import KafkaSource