diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-04-30 17:24:42 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-04-30 17:24:42 -0700 |
commit | a0f77aa07815e2c2b8ee5dcb111e16d94c0b30b2 (patch) | |
tree | 80fa68ba1207452e25f1032ee9929b6b1fd3afb1 | |
parent | fb9d55bddc85c865b4e7eb4fb1259891f6f4a9be (diff) | |
download | fatcat-a0f77aa07815e2c2b8ee5dcb111e16d94c0b30b2.tar.gz fatcat-a0f77aa07815e2c2b8ee5dcb111e16d94c0b30b2.zip |
more import examples
-rw-r--r-- | python/README_import.md | 18 |
1 files changed, 18 insertions, 0 deletions
```diff
diff --git a/python/README_import.md b/python/README_import.md
index 2465940b..d4abe400 100644
--- a/python/README_import.md
+++ b/python/README_import.md
@@ -67,3 +67,21 @@ These each take 2-4 hours:
     # GROBID extracted (release+file)
     time zcat /srv/fatcat/datasets/2018-09-23-0405.30-dumpgrobidmetainsertable.longtail_join.filtered.tsv.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py grobid-metadata -
 
+## Arabesque Matches
+
+Prep JSON files from sqlite (for parallel import):
+
+    ~/arabesque/arabesque.py dump_json s2_doi.sqlite --only-identifier-hits | pv -l | gzip > s2_doi.json.gz
+
+Run import in parallel:
+
+    export FATCAT_AUTH_WORKER_CRAWL=...
+    zcat /srv/fatcat/datasets/s2_doi.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py arabesque --json-file - --extid-type doi --crawl-id DIRECT-OA-CRAWL-2019 --no-require-grobid
+
+## Other Matched
+
+    export FATCAT_EDITGROUP_DESCRIPTION="File/DOI matching to user-uploaded pre-1923 and pre-1909 paper corpus on archive.org"
+    export FATCAT_API_AUTH_TOKEN=... (FATCAT_AUTH_WORKER_ARCHIVE_ORG)
+
+    zcat /srv/fatcat/datasets/crossref-pre-1923-scholarly-works.matched.json.gz | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched -
+
```