From eaf5039116fbde796d9905c4986fc6a308a36ff4 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 29 May 2019 18:34:11 -0700 Subject: fix 'fd' XML pattern for bulk imports --- python/README_import.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/README_import.md b/python/README_import.md index 027ce7ad..43100db1 100644 --- a/python/README_import.md +++ b/python/README_import.md @@ -79,7 +79,7 @@ Single file: Bulk (one file per process): - fd .xml /srv/fatcat/datasets/arxiv_raw_oai_snapshot_2019-05-22/ | parallel -j15 ./fatcat_import.py arxiv {} + fd '\.xml$' /srv/fatcat/datasets/arxiv_raw_oai_snapshot_2019-05-22/ | parallel -j15 ./fatcat_import.py arxiv {} ## PubMed @@ -93,8 +93,7 @@ Run single: Bulk: - # very memory intensive to parse these big XML files, so need to limit parallelism - fd .xml /srv/fatcat/datasets/pubmed_medline_baseline_2019 | time parallel -j3 ./fatcat_import.py pubmed {} /srv/fatcat/datasets/ISSN-to-ISSN-L.txt + fd '\.xml$' /srv/fatcat/datasets/pubmed_medline_baseline_2019 | time parallel -j16 ./fatcat_import.py pubmed {} /srv/fatcat/datasets/ISSN-to-ISSN-L.txt ## Matched -- cgit v1.2.3