aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- python/README_import.md 18
1 files changed, 18 insertions, 0 deletions
diff --git a/python/README_import.md b/python/README_import.md
index 2465940b..d4abe400 100644
--- a/python/README_import.md
+++ b/python/README_import.md
@@ -67,3 +67,21 @@ These each take 2-4 hours:
# GROBID extracted (release+file)
time zcat /srv/fatcat/datasets/2018-09-23-0405.30-dumpgrobidmetainsertable.longtail_join.filtered.tsv.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py grobid-metadata -
+## Arabesque Matches
+
+Prepare JSON files from the SQLite database (for parallel import):
+
+ ~/arabesque/arabesque.py dump_json s2_doi.sqlite --only-identifier-hits | pv -l | gzip > s2_doi.json.gz
+
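+Run the import in parallel (the command below reads the dump from `/srv/fatcat/datasets/`, so copy `s2_doi.json.gz` there first):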
+
+ export FATCAT_AUTH_WORKER_CRAWL=...
+ zcat /srv/fatcat/datasets/s2_doi.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py arabesque --json-file - --extid-type doi --crawl-id DIRECT-OA-CRAWL-2019 --no-require-grobid
+
+## Other Matched
+
+ export FATCAT_EDITGROUP_DESCRIPTION="File/DOI matching to user-uploaded pre-1923 and pre-1909 paper corpus on archive.org"
+ export FATCAT_API_AUTH_TOKEN=...  # use the FATCAT_AUTH_WORKER_ARCHIVE_ORG token
+
+ zcat /srv/fatcat/datasets/crossref-pre-1923-scholarly-works.matched.json.gz | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched -
+