aboutsummaryrefslogtreecommitdiffstats
path: root/notes
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2019-06-03 17:30:53 -0700
committerBryan Newbold <bnewbold@robocracy.org>2019-06-03 17:30:53 -0700
commitdee48a8f1ad3599cefa044c476966929cd869cfa (patch)
tree85af819832be51df9dc59a1b0428f569624bcbe7 /notes
parent5c028c7098b39a031f51a662f9fb064a84c52f62 (diff)
downloadfatcat-dee48a8f1ad3599cefa044c476966929cd869cfa.tar.gz
fatcat-dee48a8f1ad3599cefa044c476966929cd869cfa.zip
recent bootstrap/import notes
Diffstat (limited to 'notes')
-rw-r--r--notes/bootstrap/import_timing_20190521_qa.txt284
-rw-r--r--notes/bootstrap/import_timing_20190523.txt62
-rw-r--r--notes/bootstrap/import_timing_20190530.txt149
3 files changed, 495 insertions, 0 deletions
diff --git a/notes/bootstrap/import_timing_20190521_qa.txt b/notes/bootstrap/import_timing_20190521_qa.txt
new file mode 100644
index 00000000..43c2a613
--- /dev/null
+++ b/notes/bootstrap/import_timing_20190521_qa.txt
@@ -0,0 +1,284 @@
+
+## JALC importer
+
+ time ./fatcat_import.py jalc /srv/fatcat/datasets/JALC-LOD-20180907.sample10k.rdf /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3
+ Using external ID map: file:/srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3?mode=ro
+ Loading ISSN map file...
+ Got 2061670 ISSN-L mappings.
+ Counter({'total': 9976, 'insert': 7153, 'exists': 2820, 'inserted.container': 149, 'skip': 3, 'update': 0})
+
+ real 2m21.301s
+ user 1m14.664s
+ sys 0m2.144s
+
+In parallel:
+
+ time zcat /srv/fatcat/datasets/JALC-LOD-20180907.gz | time parallel -j20 --round-robin --pipe ./fatcat_import.py jalc - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3
+
+ [...]
+ Traceback (most recent call last):
+ File "./fatcat_import.py", line 294, in <module>
+ main()
+ File "./fatcat_import.py", line 291, in main
+ args.func(args)
+ File "./fatcat_import.py", line 23, in run_jalc
+ Bs4XmlLinesPusher(ji, args.xml_file, "<rdf:Description").run()
+ File "/srv/fatcat/src/python/fatcat_tools/importers/common.py", line 585, in run
+ self.importer.push_record(soup)
+ File "/srv/fatcat/src/python/fatcat_tools/importers/common.py", line 282, in push_record
+ entity = self.parse_record(raw_record)
+ File "/srv/fatcat/src/python/fatcat_tools/importers/jalc.py", line 139, in parse_record
+ given_name=clean(eng.find('givenName').string),
+ AttributeError: 'NoneType' object has no attribute 'string'
+ Using external ID map: file:/srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3?mode=ro
+ Loading ISSN map file...
+ Got 2061670 ISSN-L mappings.
+ Counter({'total': 7483, 'exists': 4476, 'insert': 3006, 'inserted.container': 1, 'skip': 1, 'update': 0})
+ Using external ID map: file:/srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3?mode=ro
+ Loading ISSN map file...
+ Got 2061670 ISSN-L mappings.
+ Counter({'total': 7661, 'insert': 4685, 'exists': 2976, 'inserted.container': 4, 'skip': 0, 'update': 0})
+ Using external ID map: file:/srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3?mode=ro
+ [...]
+
+Update to 1df0cd9cfe96609ff276362d10a5e50b723bbb7b.
+
+Realized I also wasn't using correct creds, so:
+
+ export FATCAT_AUTH_WORKER_JALC=blah
+
+Hit:
+
+ Traceback (most recent call last):
+ File "./fatcat_import.py", line 294, in <module>
+ main()
+ File "./fatcat_import.py", line 291, in main
+ args.func(args)
+ File "./fatcat_import.py", line 23, in run_jalc
+ Bs4XmlLinesPusher(ji, args.xml_file, "<rdf:Description").run()
+ File "/srv/fatcat/src/python/fatcat_tools/importers/common.py", line 585, in run
+ self.importer.push_record(soup)
+ File "/srv/fatcat/src/python/fatcat_tools/importers/common.py", line 282, in push_record
+ entity = self.parse_record(raw_record)
+ File "/srv/fatcat/src/python/fatcat_tools/importers/jalc.py", line 93, in parse_record
+ assert doi.startswith('10.')
+ AssertionError
+ Using external ID map: file:/srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3?mode=ro
+ Loading ISSN map file...
+ Got 2061670 ISSN-L mappings.
+ Traceback (most recent call last):
+ File "./fatcat_import.py", line 294, in <module>
+ main()
+ File "./fatcat_import.py", line 291, in main
+ args.func(args)
+ File "./fatcat_import.py", line 23, in run_jalc
+ Bs4XmlLinesPusher(ji, args.xml_file, "<rdf:Description").run()
+ File "/srv/fatcat/src/python/fatcat_tools/importers/common.py", line 585, in run
+ self.importer.push_record(soup)
+ File "/srv/fatcat/src/python/fatcat_tools/importers/common.py", line 282, in push_record
+ entity = self.parse_record(raw_record)
+ File "/srv/fatcat/src/python/fatcat_tools/importers/jalc.py", line 93, in parse_record
+ assert doi.startswith('10.')
+ AssertionError
+ Using external ID map: file:/srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3?mode=ro
+ Loading ISSN map file...
+ Got 2061670 ISSN-L mappings.
+ Counter({'total': 7326, 'insert': 3707, 'exists': 3618, 'inserted.container': 4, 'skip': 1, 'update': 0})
+
+Update to a67c8e65f4892899df3368ac7ea3abaee176fb3a. Think that maybe the tar/gzip thing isn't a good idea, so:
+
+ time cat /srv/fatcat/datasets/JALC-LOD-20180907.rdf | time parallel -j20 --round-robin --pipe ./fatcat_import.py jalc - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3
+
+ [...]
+ bogus JALC DOI: http://dx.doi.org/10.5293/ijfms.2014.7.3.086
+ Traceback (most recent call last):
+ File "./fatcat_import.py", line 294, in <module>
+ main()
+ File "./fatcat_import.py", line 291, in main
+ args.func(args)
+ File "./fatcat_import.py", line 23, in run_jalc
+ Bs4XmlLinesPusher(ji, args.xml_file, "<rdf:Description").run()
+ File "/srv/fatcat/src/python/fatcat_tools/importers/common.py", line 585, in run
+ self.importer.push_record(soup)
+ File "/srv/fatcat/src/python/fatcat_tools/importers/common.py", line 282, in push_record
+ entity = self.parse_record(raw_record)
+ File "/srv/fatcat/src/python/fatcat_tools/importers/jalc.py", line 249, in parse_record
+ extids = self.lookup_ext_ids(doi=doi)
+ File "/srv/fatcat/src/python/fatcat_tools/importers/jalc.py", line 49, in lookup_ext_ids
+ [doi.lower()]).fetchone()
+ [...]
+
+and...
+
+ Traceback (most recent call last):
+ File "./fatcat_import.py", line 294, in <module>
+ main()
+ File "./fatcat_import.py", line 291, in main
+ args.func(args)
+ File "./fatcat_import.py", line 23, in run_jalc
+ Bs4XmlLinesPusher(ji, args.xml_file, "<rdf:Description").run()
+ File "/srv/fatcat/src/python/fatcat_tools/importers/common.py", line 585, in run
+ self.importer.push_record(soup)
+ File "/srv/fatcat/src/python/fatcat_tools/importers/common.py", line 282, in push_record
+ entity = self.parse_record(raw_record)
+ File "/srv/fatcat/src/python/fatcat_tools/importers/jalc.py", line 178, in parse_record
+ release_year = int(date)
+ ValueError: invalid literal for int() with base 10: 'null'
+
+Got:
+
+ HTTP response body: {"success":false,"error":"MalformedExternalId","message":"external identifier doesn't match required pattern for a DOI (expected, eg, '10.1234/aksjdfh'): 10.5183"}
+
+TODO: re-write author translation match code, to at least catch the common case of 50/50 matches
+
+## arXiv Importer
+
+Setup creds:
+
+ export FATCAT_AUTH_WORKER_ARXIV=blah
+
+Single file:
+
+ ./fatcat_import.py arxiv /srv/fatcat/datasets/arxiv_raw_oai_snapshot_2019-05-22/2007-12-31-00000001.xml
+
+Bulk (one file per process):
+
+ fd .xml /srv/fatcat/datasets/arxiv_raw_oai_snapshot_2019-05-22/ | parallel -j15 ./fatcat_import.py arxiv {}
+
+Issues:
+
+ HTTP response body: {"success":false,"error":"MalformedExternalId","message":"external identifier doesn't match required pattern for a DOI (expected, eg, '10.1234/aksjdfh'): 10.5120/13331-0888 10.5120/13331-0888"}
+
+ HTTP response body: {"success":false,"error":"ConstraintViolation","message":"unexpected database error: new row for relation \"release_contrib\" violates check constraint \"release_contrib_raw_name_check\""}
+
+ Traceback (most recent call last):
+ File "./fatcat_import.py", line 358, in <module>
+ main()
+ File "./fatcat_import.py", line 355, in main
+ args.func(args)
+ File "./fatcat_import.py", line 32, in run_arxiv
+ Bs4XmlFilePusher(ari, args.xml_file, "record").run()
+ File "/srv/fatcat/src/python/fatcat_tools/importers/common.py", line 605, in run
+ self.importer.push_record(record)
+ File "/srv/fatcat/src/python/fatcat_tools/importers/common.py", line 285, in push_record
+ entity = self.parse_record(raw_record)
+ File "/srv/fatcat/src/python/fatcat_tools/importers/arxiv.py", line 120, in parse_record
+ authors = parse_arxiv_authors(metadata.authors.string)
+ File "/srv/fatcat/src/python/fatcat_tools/importers/arxiv.py", line 36, in parse_arxiv_authors
+ authors = [latex_to_text(a).strip() for a in authors]
+ File "/srv/fatcat/src/python/fatcat_tools/importers/arxiv.py", line 36, in <listcomp>
+ authors = [latex_to_text(a).strip() for a in authors]
+ File "/srv/fatcat/src/python/fatcat_tools/importers/arxiv.py", line 18, in latex_to_text
+ return latex2text.latex_to_text(raw).strip()
+ File "/srv/fatcat/src/python/.venv/lib/python3.5/site-packages/pylatexenc/latex2text.py", line 762, in latex_to_text
+ return self.nodelist_to_text(latexwalker.LatexWalker(latex, **parse_flags).get_latex_nodes()[0])
+ File "/srv/fatcat/src/python/.venv/lib/python3.5/site-packages/pylatexenc/latexwalker.py", line 1197, in get_latex_nodes
+ r_endnow = do_read(nodelist, p)
+ File "/srv/fatcat/src/python/.venv/lib/python3.5/site-packages/pylatexenc/latexwalker.py", line 1045, in do_read
+ tok = self.get_token(p.pos, brackets_are_chars=brackets_are_chars)
+ File "/srv/fatcat/src/python/.venv/lib/python3.5/site-packages/pylatexenc/latexwalker.py", line 744, in get_token
+ macro = s[pos+1] # next char is necessarily part of macro
+ IndexError: string index out of range
+
+ HTTP response body: {"success":false,"error":"MalformedExternalId","message":"external identifier doesn't match required pattern for a DOI (expected, eg, '10.1234/aksjdfh'): 10.1063/"}
+
+
+## JSTOR
+
+To unzip, use:
+
+ unzip ejc-metadata-and-ocr-and-all-ngrams-part-1.zip 'metadata/*.xml'
+
+May need to do these a handful at a time to prevent inode exhaustion? Looks
+like some 57 million free so probably fine; for JSTOR EJC only a couple
+million.
+
+Setup creds:
+
+ export FATCAT_AUTH_WORKER_JSTOR=blah
+
+Run single:
+
+ echo /srv/fatcat/datasets/jstor-ejc-bulk-metadata/metadata/journal-article-10.2307_42810429.xml | ./fatcat_import.py jstor - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt
+
+In Bulk:
+
+ fd .xml /srv/fatcat/datasets/jstor-ejc-bulk-metadata/metadata/ | time parallel -j15 --round-robin --pipe ./fatcat_import.py --batch-size 100 jstor - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt
+
+This was the smoothest! Fast too:
+
+ 1354.71user 40.82system 5:50.72elapsed 397%CPU (0avgtext+0avgdata 420180maxresident)k
+ 1131400inputs+860528outputs (2major+1542545minor)pagefaults 0swaps
+
+TODO:
+MISSING MARC LANG: jav
+MISSING MARC LANG: map
+
+
+## PubMed
+
+Setup creds:
+
+ export FATCAT_AUTH_WORKER_PUBMED=blah
+
+Run single:
+
+ time ./fatcat_import.py pubmed /srv/fatcat/datasets/pubmed_medline_baseline_2019/pubmed19n0400.xml /srv/fatcat/datasets/ISSN-to-ISSN-L.txt
+
+ real 13m21.756s
+ user 9m10.720s
+ sys 0m14.100s
+
+Bulk:
+
+ # very memory intensive to parse these big XML files, so need to limit parallelism
+ fd .xml /srv/fatcat/datasets/pubmed_medline_baseline_2019 | time parallel -j3 ./fatcat_import.py pubmed {} /srv/fatcat/datasets/ISSN-to-ISSN-L.txt
+
+TODO: rip out external id map stuff for pubmed, and maybe JALC as well. Will have separate update bots.
+
+ISSUES:
+
+ HTTP response body: {"success":false,"error":"MalformedExternalId","message":"external identifier doesn't match required pattern for a DOI (expected, eg, '10.1234/aksjdfh'): doi:10.1017/s1461145702002821"}
+
+ [...]
+ /srv/fatcat/src/python/fatcat_tools/importers/pubmed.py:719: UserWarning: PMID/DOI mismatch: release asywbmeegnfthi4t4pzrqaffj4, pmid 12132109 != 12124418
+ existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid))
+ /srv/fatcat/src/python/fatcat_tools/importers/pubmed.py:719: UserWarning: PMID/DOI mismatch: release a5gylyn7pnexblohgex34brum4, pmid 12124588 != 12124587
+ existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid))
+ /srv/fatcat/src/python/fatcat_tools/importers/pubmed.py:719: UserWarning: PMID/DOI mismatch: release jb4q7sqm7nbgxkw37bqyss3sai, pmid 12124590 != 12124589
+ existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid))
+ /srv/fatcat/src/python/fatcat_tools/importers/pubmed.py:719: UserWarning: PMID/DOI mismatch: release 4vsm2bkb2zg5rjo354pnd3sgji, pmid 19810921 != 12124933
+
+ HTTP response body: {"success":false,"error":"ConstraintViolation","message":"unexpected database error: duplicate key value violates unique constraint \"release_edit_editgroup_id_ident_id_key\""}
+
+Performance:
+
+ Counter({'total': 29998, 'exists': 15285, 'insert': 13960, 'update': 753, 'warn-pmid-doi-mismatch': 17, 'skip-update-conflict': 2, 'inserted.container': 1, 'skip': 0})
+ real 17m49.921s
+ user 8m42.648s
+ sys 0m8.492s
+
+ Counter({'total': 30000, 'insert': 16326, 'exists': 12500, 'update': 1174, 'inserted.container': 1, 'skip': 0})
+ real 17m14.827s
+ user 9m33.444s
+ sys 0m8.420s
+
+ HTTP response body: {"success":false,"error":"MalformedExternalId","message":"external identifier doesn't match required pattern for a DOI (expected, eg, '10.1234/aksjdfh'): 10.1109/tcbb.2004.44 "}
+ HTTP response body: {"success":false,"error":"MalformedExternalId","message":"external identifier doesn't match required pattern for a DOI (expected, eg, '10.1234/aksjdfh'): 10.1080/14756360400004532\t"}
+
+TODO:
+
+ HTTP response body: {"success":false,"error":"MalformedExternalId","message":"external identifier doesn't match required pattern for a DOI (expected, eg, '10.1234/aksjdfh'): 10.1126/science. 1134405"}
+
+
+ File "/srv/fatcat/src/python/fatcat_tools/importers/pubmed.py", line 582, in parse_record
+ if not raw_name and author.CollectiveName.string:
+ AttributeError: 'NoneType' object has no attribute 'string'
+
+ File "/srv/fatcat/src/python/fatcat_tools/importers/pubmed.py", line 405, in parse_record
+ extra_pubmed['retraction_of_pmid'] = retraction_of.PMID.string
+ AttributeError: 'NoneType' object has no attribute 'string'
+
+Trying pubmed importer again after iterparse() refactor:
+
+ fd '.xml$' /srv/fatcat/datasets/pubmed_medline_baseline_2019 | shuf | time parallel -j16 ./fatcat_import.py pubmed {} /srv/fatcat/datasets/ISSN-to-ISSN-L.txt
diff --git a/notes/bootstrap/import_timing_20190523.txt b/notes/bootstrap/import_timing_20190523.txt
new file mode 100644
index 00000000..c391c786
--- /dev/null
+++ b/notes/bootstrap/import_timing_20190523.txt
@@ -0,0 +1,62 @@
+
+## JSTOR
+
+Unzipped:
+
+ ls ejc-metadata-and-ocr-and-all-ngrams-part*.zip | parallel unzip {} 'metadata/*.xml'
+
+Setup creds:
+
+ export FATCAT_AUTH_WORKER_JSTOR=blah
+
+Sample (to create most containers):
+
+ fd .xml /srv/fatcat/datasets/jstor-ejc-bulk-metadata/metadata/ | shuf -n10000 | ./fatcat_import.py jstor --batch-size 100 - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt
+
+All in bulk:
+
+ fd .xml /srv/fatcat/datasets/jstor-ejc-bulk-metadata/metadata/ | time parallel -j15 --round-robin --pipe ./fatcat_import.py --batch-size 100 jstor - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt
+
+ [...]
+ Got 2153874 ISSN-L mappings.
+ Counter({'total': 34829, 'insert': 25226, 'update': 8888, 'exists': 679, 'skip': 36})
+ /srv/fatcat/src/python/fatcat_tools/importers/jstor.py:207: UserWarning: MISSING MARC LANG: grc
+ warnings.warn("MISSING MARC LANG: {}".format(cm.find("meta-value").string))
+ /srv/fatcat/src/python/fatcat_tools/importers/jstor.py:207: UserWarning: MISSING MARC LANG: map
+ warnings.warn("MISSING MARC LANG: {}".format(cm.find("meta-value").string))
+ Loading ISSN map file...
+ Got 2153874 ISSN-L mappings.
+ Counter({'total': 41339, 'insert': 21549, 'exists': 12118, 'update': 7625, 'skip': 47})
+ /srv/fatcat/src/python/fatcat_tools/importers/jstor.py:207: UserWarning: MISSING MARC LANG: grc
+ warnings.warn("MISSING MARC LANG: {}".format(cm.find("meta-value").string))
+ Loading ISSN map file...
+ Got 2153874 ISSN-L mappings.
+ Counter({'total': 46438, 'insert': 25270, 'exists': 12204, 'update': 8899, 'skip': 65})
+ /srv/fatcat/src/python/fatcat_tools/importers/jstor.py:207: UserWarning: MISSING MARC LANG: syr
+ warnings.warn("MISSING MARC LANG: {}".format(cm.find("meta-value").string))
+ /srv/fatcat/src/python/fatcat_tools/importers/jstor.py:207: UserWarning: MISSING MARC LANG: oci
+ warnings.warn("MISSING MARC LANG: {}".format(cm.find("meta-value").string))
+ /srv/fatcat/src/python/fatcat_tools/importers/jstor.py:207: UserWarning: MISSING MARC LANG: grc
+ warnings.warn("MISSING MARC LANG: {}".format(cm.find("meta-value").string))
+ Loading ISSN map file...
+ Got 2153874 ISSN-L mappings.
+ Counter({'total': 46438, 'insert': 25434, 'exists': 12197, 'update': 8757, 'skip': 50})
+ /srv/fatcat/src/python/fatcat_tools/importers/jstor.py:207: UserWarning: MISSING MARC LANG: syr
+ warnings.warn("MISSING MARC LANG: {}".format(cm.find("meta-value").string))
+ /srv/fatcat/src/python/fatcat_tools/importers/jstor.py:207: UserWarning: MISSING MARC LANG: welsh
+ warnings.warn("MISSING MARC LANG: {}".format(cm.find("meta-value").string))
+ 6184.17user 163.41system 21:12.96elapsed 498%CPU (0avgtext+0avgdata 434764maxresident)k
+ 5320528inputs+3466408outputs (38major+2224857minor)pagefaults 0swaps
+
+TODO:
+ MISSING MARC LANG: syr (and gem, grc, non, emg, neg, map, welsh, oci)
+
+## arXiv
+
+Single file:
+
+ ./fatcat_import.py --batch-size 100 arxiv /srv/fatcat/datasets/arxiv_raw_oai_snapshot_2019-05-22/2007-12-31-00000001.xml
+
+Bulk (one file per process):
+
+ fd .xml /srv/fatcat/datasets/arxiv_raw_oai_snapshot_2019-05-22/ | parallel -j15 ./fatcat_import.py --batch-size 100 arxiv {}
diff --git a/notes/bootstrap/import_timing_20190530.txt b/notes/bootstrap/import_timing_20190530.txt
new file mode 100644
index 00000000..f0afe7bc
--- /dev/null
+++ b/notes/bootstrap/import_timing_20190530.txt
@@ -0,0 +1,149 @@
+
+## JALC
+
+Update to eee39965eee92b5005df0d967be779c2f2bb15f8
+
+ export FATCAT_AUTH_WORKER_JALC=blah
+
+Extracted file instead of piping it through zcat.
+
+Start small; do a random bunch (10k) single-threaded to pre-create containers:
+
+ head -n100 /srv/fatcat/datasets/JALC-LOD-20180907.rdf | ./fatcat_import.py --batch-size 100 jalc - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3
+ shuf -n100 /srv/fatcat/datasets/JALC-LOD-20180907.rdf | ./fatcat_import.py --batch-size 100 jalc - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3
+ shuf -n10000 /srv/fatcat/datasets/JALC-LOD-20180907.rdf | ./fatcat_import.py --batch-size 100 jalc - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3
+ Counter({'total': 9971, 'insert': 7138, 'exists': 2826, 'inserted.container': 144, 'skip': 7, 'update': 0})
+
+Then the command:
+
+ cat /srv/fatcat/datasets/JALC-LOD-20180907.rdf | pv -l | time parallel -j20 --round-robin --pipe ./fatcat_import.py --batch-size 100 jalc - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3
+
+Bulk import:
+
+ cat /srv/fatcat/datasets/JALC-LOD-20180907.rdf | pv -l | time parallel -j20 --round-robin --pipe ./fatcat_import.py --batch-size 100 jalc - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3
+
+Hit an error:
+
+ Traceback (most recent call last):
+ File "./fatcat_import.py", line 365, in <module>
+ main()
+ File "./fatcat_import.py", line 362, in main
+ args.func(args)
+ File "./fatcat_import.py", line 23, in run_jalc
+ Bs4XmlLinesPusher(ji, args.xml_file, "<rdf:Description").run()
+ File "/srv/fatcat/src/python/fatcat_tools/importers/common.py", line 605, in run
+ self.importer.push_record(soup)
+ File "/srv/fatcat/src/python/fatcat_tools/importers/common.py", line 302, in push_record
+ entity = self.parse_record(raw_record)
+ File "/srv/fatcat/src/python/fatcat_tools/importers/jalc.py", line 261, in parse_record
+ publisher = clean(pubs[0])
+ IndexError: list index out of range
+ [...]
+ Loading ISSN map file...
+ Got 2153874 ISSN-L mappings.
+ Counter({'total': 320733, 'insert': 227567, 'exists': 92651, 'skip': 515, 'inserted.container': 53, 'update': 0})
+ Using external ID map: file:/srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3?mode=ro
+ Loading ISSN map file...
+ Got 2153874 ISSN-L mappings.
+ Counter({'total': 317741, 'insert': 226336, 'exists': 91232, 'skip': 173, 'inserted.container': 64, 'update': 0})
+ Using external ID map: file:/srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3?mode=ro
+ Loading ISSN map file...
+ Got 2153874 ISSN-L mappings.
+ Counter({'total': 318022, 'insert': 230063, 'exists': 87852, 'skip': 107, 'inserted.container': 51, 'update': 0})
+ Using external ID map: file:/srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3?mode=ro
+ Loading ISSN map file...
+ Got 2153874 ISSN-L mappings.
+ Counter({'total': 317404, 'insert': 225893, 'exists': 91363, 'skip': 148, 'inserted.container': 45, 'update': 0})
+ Command exited with non-zero status 1
+ 70293.61user 1088.65system 4:06:04elapsed 483%CPU (0avgtext+0avgdata 449340maxresident)k
+ 1548632inputs+13813200outputs (248major+3685889minor)pagefaults 0swaps
+
+Re-ran with same command after patching, and success:
+
+ Loading ISSN map file...
+ Got 2153874 ISSN-L mappings.
+ Counter({'total': 321098, 'exists': 319095, 'insert': 1726, 'skip': 277, 'update': 0})
+ Using external ID map: file:/srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3?mode=ro
+ Loading ISSN map file...
+ Got 2153874 ISSN-L mappings.
+ Counter({'total': 317416, 'exists': 315055, 'insert': 1871, 'skip': 490, 'update': 0})
+ Using external ID map: file:/srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3?mode=ro
+ Loading ISSN map file...
+ Got 2153874 ISSN-L mappings.
+ Counter({'total': 315676, 'exists': 313906, 'insert': 1653, 'skip': 117, 'update': 0})
+ Using external ID map: file:/srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3?mode=ro
+ Loading ISSN map file...
+ Got 2153874 ISSN-L mappings.
+ Counter({'total': 308695, 'exists': 306407, 'insert': 1856, 'skip': 432, 'update': 0})
+ Using external ID map: file:/srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3?mode=ro
+ Loading ISSN map file...
+ Got 2153874 ISSN-L mappings.
+ Counter({'total': 310210, 'exists': 308280, 'insert': 1782, 'skip': 148, 'update': 0})
+ 71531.84user 1225.33system 1:17:04elapsed 1573%CPU (0avgtext+0avgdata 425368maxresident)k
+ 1195624inputs+14971088outputs (238major+2895079minor)pagefaults 0swaps
+
+## Journal Metadata Update
+
+Updating with fixed KBART year_spans, for better coverage detection.
+
+ export FATCAT_AUTH_WORKER_JOURNAL_METADATA=...
+
+ ./fatcat_import.py journal-metadata /srv/fatcat/datasets/journal_metadata.2019-02-20.fixed.json
+ Counter({'total': 107793, 'exists': 95921, 'update': 11549, 'insert': 270, 'skip': 53})
+
+## PubMed
+
+ export FATCAT_AUTH_WORKER_PUBMED=...
+
+Start small (and cut off) to ensure getting basics correct:
+
+ ./fatcat_import.py pubmed /srv/fatcat/datasets/pubmed_medline_baseline_2019/pubmed19n0400.xml /srv/fatcat/datasets/ISSN-to-ISSN-L.txt
+
+Kick off the big one:
+
+ fd '.xml$' /srv/fatcat/datasets/pubmed_medline_baseline_2019 | time parallel -j16 ./fatcat_import.py pubmed {} /srv/fatcat/datasets/ISSN-to-ISSN-L.txt
+
+Seemed to hang or something...
+
+ fatcat 1649 0.1 0.1 2335588 56076 pts/2 S Jun01 5:05 python3 ./fatcat_import.py pubmed /srv/fatcat/datasets/pubmed_medline_baseline_2019/pubmed19n0966.xml /srv/fatcat/datasets/ISSN-to-ISSN-L.txt
+ fatcat 9460 0.2 0.1 2333520 54004 pts/2 S May31 12:21 python3 ./fatcat_import.py pubmed /srv/fatcat/datasets/pubmed_medline_baseline_2019/pubmed19n0383.xml /srv/fatcat/datasets/ISSN-to-ISSN-L.txt
+
+
+ fatcat_client.rest.ApiException: (400)
+ Reason: Bad Request
+ HTTP response headers: HTTPHeaderDict({'Content-Length': '183', 'X-Clacks-Overhead': 'GNU aaronsw, jpb', 'X-Span-ID': '563f6833-be1e-452e-bcd6-e7c721edf9eb', 'Content-Type': 'application/json', 'Date': 'Sat, 01 Jun 2019 12:31:11 GMT'})
+ HTTP response body: {"success":false,"error":"MalformedExternalId","message":"external identifier doesn't match required pattern for a PubMed Central ID (PMCID) (expected, eg, 'PMC12345'): wst_2018_414"}
+
+And another:
+
+ fatcat_client.rest.ApiException: (400)
+ Reason: Bad Request
+ HTTP response headers: HTTPHeaderDict({'Date': 'Sat, 01 Jun 2019 12:37:01 GMT', 'Content-Type': 'application/json', 'Content-Length': '182', 'X-Span-ID': 'c8cbcffb-d3c5-4ceb-b157-d628dbac613f', 'X-Clacks-Overhead': 'GNU aaronsw, jpb'})
+ HTTP response body: {"success":false,"error":"MalformedExternalId","message":"external identifier doesn't match required pattern for a PubMed Central ID (PMCID) (expected, eg, 'PMC12345'): wh_2018_033"}
+
+And another (jeez!):
+
+ HTTP response body: {"success":false,"error":"MalformedExternalId","message":"external identifier doesn't match required pattern for a PubMed Central ID (PMCID) (expected, eg, 'PMC12345'): wst_2018_399"}
+
+And another derp:
+
+ Traceback (most recent call last):
+ File "./fatcat_import.py", line 365, in <module>
+ main()
+ File "./fatcat_import.py", line 362, in main
+ args.func(args)
+ File "./fatcat_import.py", line 43, in run_pubmed
+ Bs4XmlLargeFilePusher(pi, args.xml_file, "PubmedArticle", record_list_tag="PubmedArticleSet").run()
+ File "/srv/fatcat/src/python/fatcat_tools/importers/common.py", line 666, in run
+ self.importer.push_record(record)
+ File "/srv/fatcat/src/python/fatcat_tools/importers/common.py", line 302, in push_record
+ entity = self.parse_record(raw_record)
+ File "/srv/fatcat/src/python/fatcat_tools/importers/pubmed.py", line 494, in parse_record
+ int(pub_date.Day.string))
+ ValueError: day is out of range for month
+
+Lesson here is to really get the whole thing to work end-to-end with no
+`parallel` error in QA before trying in prod. Was impatient!
+
+TODO: re-run these with a patch. going to do after dump/snapshot/etc though.
+