From dee48a8f1ad3599cefa044c476966929cd869cfa Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 3 Jun 2019 17:30:53 -0700 Subject: recent bootstrap/import notes --- notes/bootstrap/import_timing_20190521_qa.txt | 284 ++++++++++++++++++++++++++ notes/bootstrap/import_timing_20190523.txt | 62 ++++++ notes/bootstrap/import_timing_20190530.txt | 149 ++++++++++++++ 3 files changed, 495 insertions(+) create mode 100644 notes/bootstrap/import_timing_20190521_qa.txt create mode 100644 notes/bootstrap/import_timing_20190523.txt create mode 100644 notes/bootstrap/import_timing_20190530.txt diff --git a/notes/bootstrap/import_timing_20190521_qa.txt b/notes/bootstrap/import_timing_20190521_qa.txt new file mode 100644 index 00000000..43c2a613 --- /dev/null +++ b/notes/bootstrap/import_timing_20190521_qa.txt @@ -0,0 +1,284 @@ + +## JALC importer + + time ./fatcat_import.py jalc /srv/fatcat/datasets/JALC-LOD-20180907.sample10k.rdf /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3 + Using external ID map: file:/srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3?mode=ro + Loading ISSN map file... + Got 2061670 ISSN-L mappings. + Counter({'total': 9976, 'insert': 7153, 'exists': 2820, 'inserted.container': 149, 'skip': 3, 'update': 0}) + + real 2m21.301s + user 1m14.664s + sys 0m2.144s + +In parallel: + + time zcat /srv/fatcat/datasets/JALC-LOD-20180907.gz | time parallel -j20 --round-robin --pipe ./fatcat_import.py jalc - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3 + + [...] 
+ Traceback (most recent call last): + File "./fatcat_import.py", line 294, in + main() + File "./fatcat_import.py", line 291, in main + args.func(args) + File "./fatcat_import.py", line 23, in run_jalc + Bs4XmlLinesPusher(ji, args.xml_file, " + main() + File "./fatcat_import.py", line 291, in main + args.func(args) + File "./fatcat_import.py", line 23, in run_jalc + Bs4XmlLinesPusher(ji, args.xml_file, " + main() + File "./fatcat_import.py", line 291, in main + args.func(args) + File "./fatcat_import.py", line 23, in run_jalc + Bs4XmlLinesPusher(ji, args.xml_file, " + main() + File "./fatcat_import.py", line 291, in main + args.func(args) + File "./fatcat_import.py", line 23, in run_jalc + Bs4XmlLinesPusher(ji, args.xml_file, " + main() + File "./fatcat_import.py", line 291, in main + args.func(args) + File "./fatcat_import.py", line 23, in run_jalc + Bs4XmlLinesPusher(ji, args.xml_file, " + main() + File "./fatcat_import.py", line 355, in main + args.func(args) + File "./fatcat_import.py", line 32, in run_arxiv + Bs4XmlFilePusher(ari, args.xml_file, "record").run() + File "/srv/fatcat/src/python/fatcat_tools/importers/common.py", line 605, in run + self.importer.push_record(record) + File "/srv/fatcat/src/python/fatcat_tools/importers/common.py", line 285, in push_record + entity = self.parse_record(raw_record) + File "/srv/fatcat/src/python/fatcat_tools/importers/arxiv.py", line 120, in parse_record + authors = parse_arxiv_authors(metadata.authors.string) + File "/srv/fatcat/src/python/fatcat_tools/importers/arxiv.py", line 36, in parse_arxiv_authors + authors = [latex_to_text(a).strip() for a in authors] + File "/srv/fatcat/src/python/fatcat_tools/importers/arxiv.py", line 36, in + authors = [latex_to_text(a).strip() for a in authors] + File "/srv/fatcat/src/python/fatcat_tools/importers/arxiv.py", line 18, in latex_to_text + return latex2text.latex_to_text(raw).strip() + File "/srv/fatcat/src/python/.venv/lib/python3.5/site-packages/pylatexenc/latex2text.py", 
line 762, in latex_to_text + return self.nodelist_to_text(latexwalker.LatexWalker(latex, **parse_flags).get_latex_nodes()[0]) + File "/srv/fatcat/src/python/.venv/lib/python3.5/site-packages/pylatexenc/latexwalker.py", line 1197, in get_latex_nodes + r_endnow = do_read(nodelist, p) + File "/srv/fatcat/src/python/.venv/lib/python3.5/site-packages/pylatexenc/latexwalker.py", line 1045, in do_read + tok = self.get_token(p.pos, brackets_are_chars=brackets_are_chars) + File "/srv/fatcat/src/python/.venv/lib/python3.5/site-packages/pylatexenc/latexwalker.py", line 744, in get_token + macro = s[pos+1] # next char is necessarily part of macro + IndexError: string index out of range + + HTTP response body: {"success":false,"error":"MalformedExternalId","message":"external identifier doesn't match required pattern for a DOI (expected, eg, '10.1234/aksjdfh'): 10.1063/"} + + +## JSTOR + +To unzip, use: + + unzip ejc-metadata-and-ocr-and-all-ngrams-part-1.zip 'metadata/*.xml' + +May need to do these a handful at a time to prevent inode exhaustion? Looks +like some 57 million free so probably fine; for JSTOR EJC only a couple +million. + +Setup creds: + + export FATCAT_AUTH_WORKER_JSTOR=blah + +Run single: + + echo /srv/fatcat/datasets/jstor-ejc-bulk-metadata/metadata/journal-article-10.2307_42810429.xml | ./fatcat_import.py jstor - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt + +In Bulk: + + fd .xml /srv/fatcat/datasets/jstor-ejc-bulk-metadata/metadata/ | time parallel -j15 --round-robin --pipe ./fatcat_import.py --batch-size 100 jstor - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt + +This was the smoothest! 
Fast too: + + 1354.71user 40.82system 5:50.72elapsed 397%CPU (0avgtext+0avgdata 420180maxresident)k + 1131400inputs+860528outputs (2major+1542545minor)pagefaults 0swaps + +TODO: +MISSING MARC LANG: jav +MISSING MARC LANG: map + + +## PubMed + +Setup creds: + + export FATCAT_AUTH_WORKER_PUBMED=blah + +Run single: + + time ./fatcat_import.py pubmed /srv/fatcat/datasets/pubmed_medline_baseline_2019/pubmed19n0400.xml /srv/fatcat/datasets/ISSN-to-ISSN-L.txt + + real 13m21.756s + user 9m10.720s + sys 0m14.100s + +Bulk: + + # very memory intensive to parse these big XML files, so need to limit parallelism + fd .xml /srv/fatcat/datasets/pubmed_medline_baseline_2019 | time parallel -j3 ./fatcat_import.py pubmed {} /srv/fatcat/datasets/ISSN-to-ISSN-L.txt + +TODO: rip out external id map stuff for pubmed, and maybe JALC as well. will have separate update bots. + +ISSUES: + + HTTP response body: {"success":false,"error":"MalformedExternalId","message":"external identifier doesn't match required pattern for a DOI (expected, eg, '10.1234/aksjdfh'): doi:10.1017/s1461145702002821"} + + [...] 
+ /srv/fatcat/src/python/fatcat_tools/importers/pubmed.py:719: UserWarning: PMID/DOI mismatch: release asywbmeegnfthi4t4pzrqaffj4, pmid 12132109 != 12124418 + existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid)) + /srv/fatcat/src/python/fatcat_tools/importers/pubmed.py:719: UserWarning: PMID/DOI mismatch: release a5gylyn7pnexblohgex34brum4, pmid 12124588 != 12124587 + existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid)) + /srv/fatcat/src/python/fatcat_tools/importers/pubmed.py:719: UserWarning: PMID/DOI mismatch: release jb4q7sqm7nbgxkw37bqyss3sai, pmid 12124590 != 12124589 + existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid)) + /srv/fatcat/src/python/fatcat_tools/importers/pubmed.py:719: UserWarning: PMID/DOI mismatch: release 4vsm2bkb2zg5rjo354pnd3sgji, pmid 19810921 != 12124933 + + HTTP response body: {"success":false,"error":"ConstraintViolation","message":"unexpected database error: duplicate key value violates unique constraint \"release_edit_editgroup_id_ident_id_key\""} + +Performance: + + Counter({'total': 29998, 'exists': 15285, 'insert': 13960, 'update': 753, 'warn-pmid-doi-mismatch': 17, 'skip-update-conflict': 2, 'inserted.container': 1, 'skip': 0}) + real 17m49.921s + user 8m42.648s + sys 0m8.492s + + Counter({'total': 30000, 'insert': 16326, 'exists': 12500, 'update': 1174, 'inserted.container': 1, 'skip': 0}) + real 17m14.827s + user 9m33.444s + sys 0m8.420s + + HTTP response body: {"success":false,"error":"MalformedExternalId","message":"external identifier doesn't match required pattern for a DOI (expected, eg, '10.1234/aksjdfh'): 10.1109/tcbb.2004.44 "} + HTTP response body: {"success":false,"error":"MalformedExternalId","message":"external identifier doesn't match required pattern for a DOI (expected, eg, '10.1234/aksjdfh'): 10.1080/14756360400004532\t"} + +TODO: + + HTTP response body: {"success":false,"error":"MalformedExternalId","message":"external identifier doesn't match required pattern for a DOI (expected, eg, 
'10.1234/aksjdfh'): 10.1126/science. 1134405"} + + + File "/srv/fatcat/src/python/fatcat_tools/importers/pubmed.py", line 582, in parse_record + if not raw_name and author.CollectiveName.string: + AttributeError: 'NoneType' object has no attribute 'string' + + File "/srv/fatcat/src/python/fatcat_tools/importers/pubmed.py", line 405, in parse_record + extra_pubmed['retraction_of_pmid'] = retraction_of.PMID.string + AttributeError: 'NoneType' object has no attribute 'string' + +Trying pubmed importer again after iterparse() refactor: + + fd '.xml$' /srv/fatcat/datasets/pubmed_medline_baseline_2019 | shuf | time parallel -j16 ./fatcat_import.py pubmed {} /srv/fatcat/datasets/ISSN-to-ISSN-L.txt diff --git a/notes/bootstrap/import_timing_20190523.txt b/notes/bootstrap/import_timing_20190523.txt new file mode 100644 index 00000000..c391c786 --- /dev/null +++ b/notes/bootstrap/import_timing_20190523.txt @@ -0,0 +1,62 @@ + +## JSTOR + +Unzipped: + + ls ejc-metadata-and-ocr-and-all-ngrams-part*.zip | parallel unzip {} 'metadata/*.xml' + +Setup creds: + + export FATCAT_AUTH_WORKER_JSTOR=blah + +Sample (to create most containers): + + fd .xml /srv/fatcat/datasets/jstor-ejc-bulk-metadata/metadata/ | shuf -n10000 | ./fatcat_import.py jstor --batch-size 100 - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt + +All in bulk: + + fd .xml /srv/fatcat/datasets/jstor-ejc-bulk-metadata/metadata/ | time parallel -j15 --round-robin --pipe ./fatcat_import.py --batch-size 100 jstor - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt + + [...] + Got 2153874 ISSN-L mappings. 
+ Counter({'total': 34829, 'insert': 25226, 'update': 8888, 'exists': 679, 'skip': 36}) + /srv/fatcat/src/python/fatcat_tools/importers/jstor.py:207: UserWarning: MISSING MARC LANG: grc + warnings.warn("MISSING MARC LANG: {}".format(cm.find("meta-value").string)) + /srv/fatcat/src/python/fatcat_tools/importers/jstor.py:207: UserWarning: MISSING MARC LANG: map + warnings.warn("MISSING MARC LANG: {}".format(cm.find("meta-value").string)) + Loading ISSN map file... + Got 2153874 ISSN-L mappings. + Counter({'total': 41339, 'insert': 21549, 'exists': 12118, 'update': 7625, 'skip': 47}) + /srv/fatcat/src/python/fatcat_tools/importers/jstor.py:207: UserWarning: MISSING MARC LANG: grc + warnings.warn("MISSING MARC LANG: {}".format(cm.find("meta-value").string)) + Loading ISSN map file... + Got 2153874 ISSN-L mappings. + Counter({'total': 46438, 'insert': 25270, 'exists': 12204, 'update': 8899, 'skip': 65}) + /srv/fatcat/src/python/fatcat_tools/importers/jstor.py:207: UserWarning: MISSING MARC LANG: syr + warnings.warn("MISSING MARC LANG: {}".format(cm.find("meta-value").string)) + /srv/fatcat/src/python/fatcat_tools/importers/jstor.py:207: UserWarning: MISSING MARC LANG: oci + warnings.warn("MISSING MARC LANG: {}".format(cm.find("meta-value").string)) + /srv/fatcat/src/python/fatcat_tools/importers/jstor.py:207: UserWarning: MISSING MARC LANG: grc + warnings.warn("MISSING MARC LANG: {}".format(cm.find("meta-value").string)) + Loading ISSN map file... + Got 2153874 ISSN-L mappings. 
+ Counter({'total': 46438, 'insert': 25434, 'exists': 12197, 'update': 8757, 'skip': 50}) + /srv/fatcat/src/python/fatcat_tools/importers/jstor.py:207: UserWarning: MISSING MARC LANG: syr + warnings.warn("MISSING MARC LANG: {}".format(cm.find("meta-value").string)) + /srv/fatcat/src/python/fatcat_tools/importers/jstor.py:207: UserWarning: MISSING MARC LANG: welsh + warnings.warn("MISSING MARC LANG: {}".format(cm.find("meta-value").string)) + 6184.17user 163.41system 21:12.96elapsed 498%CPU (0avgtext+0avgdata 434764maxresident)k + 5320528inputs+3466408outputs (38major+2224857minor)pagefaults 0swaps + +TODO: + MISSING MARC LANG: syr (and gem, grc, non, emg, neg, map, welsh, oci) + +## arXiv + +Single file: + + ./fatcat_import.py --batch-size 100 arxiv /srv/fatcat/datasets/arxiv_raw_oai_snapshot_2019-05-22/2007-12-31-00000001.xml + +Bulk (one file per process): + + fd .xml /srv/fatcat/datasets/arxiv_raw_oai_snapshot_2019-05-22/ | parallel -j15 ./fatcat_import.py --batch-size 100 arxiv {} diff --git a/notes/bootstrap/import_timing_20190530.txt b/notes/bootstrap/import_timing_20190530.txt new file mode 100644 index 00000000..f0afe7bc --- /dev/null +++ b/notes/bootstrap/import_timing_20190530.txt @@ -0,0 +1,149 @@ + +## JALC + +Update to eee39965eee92b5005df0d967be779c2f2bb15f8 + + export FATCAT_AUTH_WORKER_JALC=blah + +Extracted file instead of piping it through zcat. 
+ +Start small; do a random bunch (10k) single-threaded to pre-create containers: + + head -n100 /srv/fatcat/datasets/JALC-LOD-20180907.rdf | ./fatcat_import.py --batch-size 100 jalc - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3 + shuf -n100 /srv/fatcat/datasets/JALC-LOD-20180907.rdf | ./fatcat_import.py --batch-size 100 jalc - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3 + shuf -n10000 /srv/fatcat/datasets/JALC-LOD-20180907.rdf | ./fatcat_import.py --batch-size 100 jalc - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3 + Counter({'total': 9971, 'insert': 7138, 'exists': 2826, 'inserted.container': 144, 'skip': 7, 'update': 0}) + +Then the command: + + cat /srv/fatcat/datasets/JALC-LOD-20180907.rdf | pv -l | time parallel -j20 --round-robin --pipe ./fatcat_import.py --batch-size 100 jalc - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3 + +Bulk import: + + cat /srv/fatcat/datasets/JALC-LOD-20180907.rdf | pv -l | time parallel -j20 --round-robin --pipe ./fatcat_import.py --batch-size 100 jalc - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3 + +Hit an error: + + Traceback (most recent call last): + File "./fatcat_import.py", line 365, in + main() + File "./fatcat_import.py", line 362, in main + args.func(args) + File "./fatcat_import.py", line 23, in run_jalc + Bs4XmlLinesPusher(ji, args.xml_file, " + main() + File "./fatcat_import.py", line 362, in main + args.func(args) + File "./fatcat_import.py", line 43, in run_pubmed + Bs4XmlLargeFilePusher(pi, args.xml_file, "PubmedArticle", record_list_tag="PubmedArticleSet").run() + File "/srv/fatcat/src/python/fatcat_tools/importers/common.py", line 666, in run + 
self.importer.push_record(record) + File "/srv/fatcat/src/python/fatcat_tools/importers/common.py", line 302, in push_record + entity = self.parse_record(raw_record) + File "/srv/fatcat/src/python/fatcat_tools/importers/pubmed.py", line 494, in parse_record + int(pub_date.Day.string)) + ValueError: day is out of range for month + +Lesson here is to really get the whole thing to work end-to-end with no +`parallel` error in QA before trying in prod. Was impatient! + +TODO: re-run these with a patch. going to do after dump/snapshot/etc though. + -- cgit v1.2.3