aboutsummaryrefslogtreecommitdiffstats
path: root/extra
diff options
context:
space:
mode:
Diffstat (limited to 'extra')
-rw-r--r--extra/bulk_edits/2019-06-24_unpaywall_archiveorg.md262
-rw-r--r--extra/bulk_edits/2019-09-03_chocula.md23
-rw-r--r--extra/bulk_edits/2019-10-08_file_cleanups.md59
-rw-r--r--extra/bulk_edits/2019-11-05_crossref_patch.md58
-rw-r--r--extra/bulk_edits/2019-12-20_orcid.md43
-rw-r--r--extra/bulk_edits/2019-12-20_updates.md137
-rw-r--r--extra/bulk_edits/2020-03-19_arxiv_pubmed.md57
-rw-r--r--extra/bulk_edits/2020-03-23_jalc.md23
-rw-r--r--extra/bulk_edits/2020-08-05_chocula.md17
-rw-r--r--extra/bulk_edits/2020-09-02_file_meta.md75
-rw-r--r--extra/bulk_edits/2020-10-08_chocula.md44
-rw-r--r--extra/bulk_edits/2020-12-01_orcid.md55
-rw-r--r--extra/bulk_edits/2020-12-14_doaj.md139
-rw-r--r--extra/bulk_edits/2020-12-23_dblp.md55
-rw-r--r--extra/bulk_edits/2020_datacite.md152
-rw-r--r--extra/bulk_edits/2021-05-28_dblp.md44
-rw-r--r--extra/bulk_edits/2021-05-28_doaj.md80
-rw-r--r--extra/bulk_edits/2021-11-10_case_sensitive_dois.md53
-rw-r--r--extra/bulk_edits/2021-11-10_file_release_ingest_bugfix.md108
-rw-r--r--extra/bulk_edits/2021-11-11_wayback_short_ts.md52
-rw-r--r--extra/bulk_edits/2021-11-24_file_meta.md41
-rw-r--r--extra/bulk_edits/2021-11-24_file_sha1_dedupe.md35
-rw-r--r--extra/bulk_edits/CHANGELOG.md131
23 files changed, 1743 insertions, 0 deletions
diff --git a/extra/bulk_edits/2019-06-24_unpaywall_archiveorg.md b/extra/bulk_edits/2019-06-24_unpaywall_archiveorg.md
new file mode 100644
index 00000000..2f3bbb98
--- /dev/null
+++ b/extra/bulk_edits/2019-06-24_unpaywall_archiveorg.md
@@ -0,0 +1,262 @@
+
+Goal is to import:
+
+- UNPAYWALL-PDF-CRAWL-2019-04.published dataset; about 6 million lines, expect
+ about half (3 million) new release fulltext matches
+- archive.org fulltext, about 1.8 million files
+
+## QA UNPAYWALL-PDF-CRAWL-2019-04
+
+ export FATCAT_AUTH_WORKER_CRAWL=...
+
+ # this wasn't a random sample
+ zcat /srv/fatcat/datasets/UNPAYWALL-PDF-CRAWL-2019-04.published.json.gz | head -n200 | ./fatcat_import.py arabesque --json-file - --extid-type doi --crawl-id UNPAYWALL-PDF-CRAWL-2019-04
+
+ # this was!
+ zcat /srv/fatcat/datasets/UNPAYWALL-PDF-CRAWL-2019-04.published.json.gz | shuf -n200 | ./fatcat_import.py arabesque --json-file - --extid-type doi --crawl-id UNPAYWALL-PDF-CRAWL-2019-04
+ [...]
+ Counter({'total': 199, 'insert': 106, 'exists': 62, 'skip': 31, 'skip-extid-not-found': 20, 'skip-update-disabled': 1, 'update': 0})
+
+ # ok, big import
+ zcat /srv/fatcat/datasets/UNPAYWALL-PDF-CRAWL-2019-04.published.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py arabesque --json-file - --extid-type doi --crawl-id UNPAYWALL-PDF-CRAWL-2019-04
+ # ran a few hundred thousand and looked good
+
+## prod UNPAYWALL-PDF-CRAWL-2019-04
+
+ export FATCAT_AUTH_WORKER_CRAWL=...
+
+ zcat /srv/fatcat/datasets/UNPAYWALL-PDF-CRAWL-2019-04.published.json.gz | shuf -n200 | ./fatcat_import.py arabesque --json-file - --extid-type doi --crawl-id UNPAYWALL-PDF-CRAWL-2019-04
+ Counter({'total': 198, 'insert': 115, 'exists': 56, 'skip': 27, 'skip-extid-not-found': 13, 'skip-update-disabled': 2, 'update': 0})
+
+ zcat /srv/fatcat/datasets/UNPAYWALL-PDF-CRAWL-2019-04.published.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py arabesque --json-file - --extid-type doi --crawl-id UNPAYWALL-PDF-CRAWL-2019-04
+ [...]
+ Requiring GROBID status == 200
+ Counter({'total': 520139, 'insert': 282524, 'exists': 155371, 'skip': 82244, 'skip-extid-not-found': 40000, 'skip-update-disabled': 6319, 'update': 0})
+ 19008.63user 729.06system 3:01:57elapsed 180%CPU (0avgtext+0avgdata 51440maxresident)k
+ 8552inputs+4335032outputs (54major+370893minor)pagefaults 0swaps
+
+ (loosely repeated 12 times, of course)
+
+ oh no, lots of duplicate inserts... ugh. needed a uniq in there, but really
+ only "one hit per file" export. or a shuf? blech.
+
+ (python)fatcat@wbgrp-svc502:/srv/fatcat/datasets$ zcat UNPAYWALL-PDF-CRAWL-2019-04.published.json.gz | jq .final_sha1 -r | sort -S 4G -u | wc -l
+ 5621882
+ (python)fatcat@wbgrp-svc502:/srv/fatcat/datasets$ zcat UNPAYWALL-PDF-CRAWL-2019-04.published.json.gz | wc -l
+ 6181191
+
+ ugh. how did this get missed in QA? sloppy.
+
+fixup is going to be
+- filter input list for duplicated sha1hex
+- for each duplicated sha1hex:
+ - fetch file entity. if not single release_id, bail
+ - fetch release expanded with files
+ - find all files with same sha1 that *aren't* the fetched file
+ - print file entity id
+- iterate over file entity ids, batches of 100x
+ - create editgroup
+ - delete files
+ - accept editgroup
+
+fetch_dupes.py:
+
+
+ #!/usr/bin/env python3
+
+ import sys
+ import fatcat_client
+ from fatcat_tools import public_api
+
+ def do_sha1(api, sha1hex):
+ try:
+ fe = api.lookup_file(sha1=sha1hex)
+ except:
+ return
+ if len(fe.release_ids) != 1:
+ return
+ try:
+ re = api.get_release(fe.release_ids[0], expand='files', hide='refs,contribs,abstracts')
+ except:
+ return
+ for f in re.files:
+ if f.sha1 == fe.sha1 and f.ident != fe.ident and f.release_ids == [re.ident]:
+ print(f.ident)
+
+ def run():
+ api = public_api('https://api.qa.fatcat.wiki/v0')
+ for l in sys.stdin:
+ if l:
+ do_sha1(api, l.strip())
+
+ if __name__ == '__main__':
+ run()
+
+delete_dupes.py:
+
+
+ #!/usr/bin/env python3
+
+ import sys
+ import fatcat_client
+ from fatcat_tools import authenticated_api
+
+ #API_ENDPOINT = 'https://api.qa.fatcat.wiki/v0'
+ API_ENDPOINT = 'https://api.fatcat.wiki/v0'
+
+ def do_batch(api, batch):
+ eg = api.create_editgroup(
+ fatcat_client.Editgroup(description="Cleaning up duplicated file insertions from UNPAYWALL-CRAWL-2019-04 insert"))
+ for ident in batch:
+ api.delete_file(eg.editgroup_id, ident)
+ api.accept_editgroup(eg.editgroup_id)
+ print("deleted {} - {}...".format(eg.editgroup_id, len(batch)))
+
+ def run():
+ api = authenticated_api(API_ENDPOINT)
+ batch = []
+ for l in sys.stdin:
+ l = l.strip()
+ if not l:
+ continue
+ try:
+ fe = api.get_file(l)
+ except:
+ continue
+ if fe.state == 'active' and fe.release_ids:
+ batch.append(l)
+ if len(batch) >= 100:
+ do_batch(api, batch)
+ batch = []
+ if batch:
+ do_batch(api, batch)
+
+ if __name__ == '__main__':
+ run()
+
+commands:
+
+ zcat UNPAYWALL-PDF-CRAWL-2019-04.published.json.gz | jq .final_sha1 -r | b32_hex.py | sort -S 4G | uniq -d > repeated_sha1.tsv
+
+ cat repeated_sha1.tsv | pv -l | ./fetch_dupes.py > repeated_file_idents.tsv
+
+ export FATCAT_API_AUTH_TOKEN=... (crawl bot)
+ cat repeated_file_idents.tsv | ./delete_dupes.py
+
+## QA archive.org files
+
+Start with arxiv:
+
+ # FATCAT_AUTH_WORKER_ARCHIVE_ORG
+ export FATCAT_API_AUTH_TOKEN=...
+
+ # had a 500 "unexpected internal error: invalid length at 196", which was
+ due to syntax error in API token. should have a better error response
+
+ # try sample of arxiv_id
+ zcat /srv/fatcat/datasets/arxiv.match.json.gz | head -n100 | ./fatcat_import.py --editgroup-description-override "Import fulltext from archive.org journals collection" matched --default-mimetype application/pdf --default-link-rel archive -
+ Counter({'skip': 100, 'total': 100, 'skip-no-releases': 72, 'skip-no-urls': 28, 'update': 0, 'insert': 0, 'exists': 0})
+
+ # TODO: shouldn't re-insert if URL already in there under a different reltyp
+
+ # Ok, made a bunch of code changes to "clean up" at least arxiv URLs. All
+ # arxiv.org files should be 1-to-1 with releases that have full arxiv_ids
+
+Ok, try JSTOR:
+
+ zcat /srv/fatcat/datasets/jstor.match.json.gz | shuf -n1000 | ./fatcat_import.py --editgroup-description-override "Import fulltext from archive.org journals collection" matched --default-mimetype application/pdf --default-link-rel archive -
+ [...]
+ Counter({'total': 1000, 'skip': 763, 'skip-no-releases': 763, 'insert': 162, 'exists': 74, 'update': 1})
+
+larger import got:
+
+ HTTP response body: {"success":false,"error":"ConstraintViolation","message":"unexpected database error: duplicate key value violates unique constraint \"file_edit_editgroup_id_ident_id_key\""}
+
+could try getting around this with shuf?
+
+ zcat /srv/fatcat/datasets/jstor.match.json.gz | shuf | pv -l | ./fatcat_import.py --editgroup-description-override "Import fulltext from archive.org journals collection" matched --default-mimetype application/pdf --default-link-rel archive -
+
+got the same errors so added "inflight" edit protection and rolled back to earlier command:
+
+ zcat /srv/fatcat/datasets/jstor.match.json.gz | shuf -n1000 | ./fatcat_import.py --editgroup-description-override "Import fulltext from archive.org journals collection" matched --default-mimetype application/pdf --default-link-rel archive -
+ [...]
+ 451k 0:25:32 [ 294 /s]
+ Counter({'total': 451178, 'skip-no-releases': 351287, 'skip': 351287, 'insert': 59644, 'exists': 39198, 'update': 1049, 'skip-update-inflight': 26})
+
+many/most of these files were already in fatcat due to earlier "paper-manifest"
+work... keep forgetting that!
+
+ok, next pmc:
+
+ zcat /srv/fatcat/datasets/pmc.match.json.gz | shuf -n1000 | ./fatcat_import.py --editgroup-description-override "Import fulltext from archive.org journals collection" matched --default-mimetype application/pdf --default-link-rel archive -
+ [...]
+ Counter({'total': 1000, 'exists': 895, 'insert': 77, 'skip-no-releases': 22, 'skip': 22, 'update': 6})
+
+ that's a surprisingly large fraction (2.2%) with `skip-no-releases`. some
+ because pubmed import failed, some because multiple PMCID identifiers? hrm.
+
+ok, and finally paper-doi:
+
+ zcat /srv/fatcat/datasets/paper-doi.match.json.gz | shuf -n1000 | ./fatcat_import.py --editgroup-description-override "Import fulltext from archive.org journals collection" matched --default-mimetype application/pdf --default-link-rel archive -
+ [...]
+ Counter({'total': 1000, 'exists': 720, 'insert': 280, 'update': 0, 'skip': 0})
+
+ lots exist! probably from the pre-1923 stuff? yup.
+
+## prod archive.org files
+
+ # try sample of arxiv_id
+ zcat /srv/fatcat/datasets/arxiv.match.json.gz | shuf -n100 | ./fatcat_import.py --editgroup-description-override "Import fulltext from archive.org journals collection" matched --default-mimetype application/pdf --default-link-rel archive -
+ Counter({'total': 100, 'insert': 80, 'update': 20, 'exists': 0, 'skip': 0})
+
+ # all arxiv_id
+ zcat /srv/fatcat/datasets/arxiv.match.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py --editgroup-description-override '"Import fulltext from archive.org journals collection"' matched --default-mimetype application/pdf --default-link-rel archive -
+ [...]
+ Counter({'total': 62296, 'insert': 49503, 'update': 12413, 'exists': 269, 'skip': 111, 'skip-no-releases': 111, 'skip-update-inflight': 10})
+ 2497.78user 98.87system 27:31.22elapsed 157%CPU (0avgtext+0avgdata 47604maxresident)k
+ 360inputs+266104outputs (3major+265297minor)pagefaults 0swaps
+
+ # derp, some of those were crawl-bot but should have been archive-org-bot. ctrl-c and re-ran
+
+ # sample jstor
+ zcat /srv/fatcat/datasets/jstor.match.json.gz | shuf -n100 | ./fatcat_import.py --editgroup-description-override "Import fulltext from archive.org journals collection" matched --default-mimetype application/pdf --default-link-rel archive -
+ Counter({'total': 100, 'insert': 69, 'exists': 29, 'update': 1, 'skip': 1, 'skip-no-releases': 1})
+
+ # all jstor
+ zcat /srv/fatcat/datasets/jstor.match.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py --editgroup-description-override '"Import fulltext from archive.org journals collection"' matched --default-mimetype application/pdf --default-link-rel archive -
+ [...]
+ Counter({'total': 41072, 'insert': 27783, 'exists': 11926, 'update': 1307, 'skip-update-inflight': 117, 'skip': 56, 'skip-no-releases': 56})
+ 1257.93user 54.42system 12:45.96elapsed 171%CPU (0avgtext+0avgdata 45248maxresident)k
+ 5384inputs+157016outputs (38major+259749minor)pagefaults 0swaps
+
+ good, pretty low `skip-no-releases` for JSTOR imports
+
+ # sample pmc
+ zcat /srv/fatcat/datasets/pmc.match.json.gz | shuf -n100 | ./fatcat_import.py --editgroup-description-override "Import fulltext from archive.org journals collection" matched --default-mimetype application/pdf --default-link-rel archive -
+ Counter({'total': 100, 'exists': 92, 'insert': 5, 'skip-no-releases': 2, 'skip': 2, 'update': 1})
+
+ interesting, at least one longtail file which is actually known: https://fatcat.wiki/file/xnc3sarc3jfsnceeagn34zi5la
+ almost all known!
+
+ # all pmc
+ zcat /srv/fatcat/datasets/pmc.match.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py --editgroup-description-override '"Import fulltext from archive.org journals collection"' matched --default-mimetype application/pdf --default-link-rel archive -
+ [...]
+ Counter({'total': 18720, 'exists': 16701, 'insert': 1461, 'skip': 357, 'skip-no-releases': 357, 'update': 201, 'skip-update-inflight': 1})
+
+ # sample paper-doi
+ zcat /srv/fatcat/datasets/paper-doi.match.json.gz | shuf -n100 | ./fatcat_import.py --editgroup-description-override "Import fulltext from archive.org journals collection" matched --default-mimetype application/pdf --default-link-rel archive -
+ Counter({'total': 100, 'exists': 73, 'insert': 27, 'skip': 0, 'update': 0})
+
+ # all paper-doi
+ zcat /srv/fatcat/datasets/paper-doi.match.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py --editgroup-description-override '"Import fulltext from archive.org journals collection"' matched --default-mimetype application/pdf --default-link-rel archive -
+ Counter({'total': 3014, 'exists': 2280, 'insert': 734, 'update': 0, 'skip': 0})
+ Counter({'total': 3464, 'exists': 2483, 'insert': 981, 'update': 0, 'skip': 0})
+ Counter({'total': 3437, 'exists': 2303, 'insert': 1134, 'update': 0, 'skip': 0})
+ Counter({'total': 3450, 'exists': 2379, 'insert': 1071, 'update': 0, 'skip': 0})
+ Counter({'total': 3467, 'exists': 2486, 'insert': 981, 'skip': 0, 'update': 0})
+ Counter({'total': 3481, 'exists': 2583, 'insert': 898, 'skip': 0, 'update': 0})
+ Counter({'total': 3423, 'exists': 2178, 'insert': 1245, 'update': 0, 'skip': 0})
+ 62.41user 3.17system 0:31.18elapsed 210%CPU (0avgtext+0avgdata 49852maxresident)k
+ 96inputs+13184outputs (7major+159215minor)pagefaults 0swaps
+
+All done!
diff --git a/extra/bulk_edits/2019-09-03_chocula.md b/extra/bulk_edits/2019-09-03_chocula.md
new file mode 100644
index 00000000..03311c8c
--- /dev/null
+++ b/extra/bulk_edits/2019-09-03_chocula.md
@@ -0,0 +1,23 @@
+
+Ran on my laptop, from master branch (`00e4ec6865782e4453d4dcec24ee205bf7c13ecf`).
+
+In QA:
+
+ time ./fatcat_import.py --host-url https://api.qa.fatcat.wiki/v0 chocula ~/code/chocula/export_fatcat.json
+ [...]
+ Counter({'total': 139032, 'exists': 76952, 'exists-skip-update': 65528, 'update': 46157, 'insert': 13605, 'exists-by-issnl': 8879, 'exists-not-found': 2538, 'skip': 2318, 'skip-unknown-new-issnl': 2318, 'exists-inactive': 7})
+ real 32m30.009s
+ user 4m44.176s
+ sys 0m12.544s
+
+In prod:
+
+ time ./fatcat_import.py --host-url https://api.fatcat.wiki/v0 chocula ~/code/chocula/export_fatcat.2019-09-03.json
+
+ Counter({'total': 139032, 'update': 66802, 'exists': 47586, 'exists-skip-update': 47428, 'insert': 22326, 'skip-unknown-new-issnl': 2318, 'skip': 2318, 'exists-by-issnl': 158})
+
+ real 34m16.930s
+ user 5m5.116s
+ sys 0m13.560s
+
+The container auto-updater is currently broken, so needed to manually dump/load into elastic.
diff --git a/extra/bulk_edits/2019-10-08_file_cleanups.md b/extra/bulk_edits/2019-10-08_file_cleanups.md
new file mode 100644
index 00000000..2eebb363
--- /dev/null
+++ b/extra/bulk_edits/2019-10-08_file_cleanups.md
@@ -0,0 +1,59 @@
+
+These cleanups are primarily intended to fix bogus 'None' datetime links to
+wayback for files that are actually in petabox (archive.org not
+web.archive.org). These URLs were created accidentally during fatcat
+bootstrapping; there are about 300k such file entities to fix.
+
+Will also update archive.org link reltype to 'archive' (instead of
+'repository'), which is the new preferred style.
+
+Generated the set of files to update like:
+
+ zcat file_export.2019-07-07.json.gz | rg 'web.archive.org/web/None' | gzip > file_export.2019-07-07.None.json.gz
+
+ zcat /srv/fatcat/datasets/file_export.2019-07-07.None.json.gz | wc -l
+ 304308
+
+## QA
+
+Running at git rev:
+
+ 984a1b157990f42f8c57815f4b3c00f6455a114f
+
+Created a new 'cleanup-bot' account and credentials. Put token in local env.
+
+Ran with a couple hundred entities first; edits look good.
+
+ zcat /srv/fatcat/datasets/file_export.2019-07-07.None.json.gz | head -n200 | ./fatcat_cleanup.py files -
+
+Then the full command, with batchsize=100:
+
+ time zcat /srv/fatcat/datasets/file_export.2019-07-07.None.json.gz | pv -l | ./fatcat_cleanup.py --batch-size 100 files -
+
+Should finish in a couple hours.
+
+ 304k 1:05:19 [77.6 /s]
+
+ Counter({'cleaned': 304308, 'lines': 304308, 'updated': 297308, 'skip-revision': 7000})
+
+ real 65m20.613s
+ user 20m40.828s
+ sys 0m34.492s
+
+## Production
+
+Again ran with a couple hundred entities first; edits look good.
+
+ zcat /srv/fatcat/datasets/file_export.2019-07-07.None.json.gz | head -n200 | ./fatcat_cleanup.py files -
+
+Then the full command, with batchsize=100:
+
+ time zcat /srv/fatcat/datasets/file_export.2019-07-07.None.json.gz | pv -l | ./fatcat_cleanup.py --batch-size 100 files -
+ [...]
+ 304k 1:03:10 [80.3 /s]
+ Counter({'cleaned': 304308, 'lines': 304308, 'updated': 304107, 'skip-revision': 201})
+
+ real 63m11.631s
+ user 21m8.504s
+ sys 0m31.888s
+
diff --git a/extra/bulk_edits/2019-11-05_crossref_patch.md b/extra/bulk_edits/2019-11-05_crossref_patch.md
new file mode 100644
index 00000000..1765fc36
--- /dev/null
+++ b/extra/bulk_edits/2019-11-05_crossref_patch.md
@@ -0,0 +1,58 @@
+
+Goal is to make sure we have imported all in-scope crossref DOI objects. There
+were a few months gap between the snapshot used as initial bootstrap and the
+start of continuous ingest; any DOIs registered during that gap and not updated
+since are not in fatcat. Expectation is that this will be a relatively small
+import.
+
+## QA Run
+
+Started Thu 31 Oct 2019 08:07:20 PM PDT
+
+ export FATCAT_AUTH_WORKER_CROSSREF="..."
+ time xzcat /srv/fatcat/datasets/crossref-works.2019-09-09.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py crossref - /srv/fatcat/datasets/20181203.ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3
+
+ # postgresql DB at start: fresh 2019-10 dump imported, 357 GB
+ # over 15k TPS against postgres
+
+ 20x threads of:
+ Counter({'total': 5397349, 'exists': 4961058, 'skip': 360156, 'insert': 76135, 'inserted.container': 113, 'update': 0})
+
+ real 1173m52.497s => 20hr
+ user 13058m24.460s
+ sys 319m27.716s
+
+ 1.5 million new releases
+ 7.2 million skips (total)
+
+Ran again with null subtitle fix and granular stats:
+
+ 20x threads of:
+ Counter({'total': 5368366, 'exists': 5122104, 'skip': 244072, 'skip-blank-title': 38399, 'skip-release-type': 5296, 'insert': 2190, 'skip-huge-contribs': 70, 'skip-huge-refs': 7, 'update': 0})
+
+ 43k additional inserts (still about 1.5m total)
+ of 4.8 million skipped (why not closer to 7.2 million?), most seem to be blank title
+
+## Production Run
+
+Git: 44c23290c72ec67db38f1e1d40b76ba795b40d9d
+
+started around Tue 05 Nov 2019 02:51:19 PM PST
+
+ export FATCAT_AUTH_WORKER_CROSSREF="..."
+ time xzcat /srv/fatcat/datasets/crossref-works.2019-09-09.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py crossref - /srv/fatcat/datasets/20190730.ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3
+
+ # postgresql DB at start: 399.03G
+
+ # 20x of:
+ Counter({'total': 5347938, 'exists': 5023305, 'skip': 251747, 'skip-blank-title': 247969, 'insert': 72886, 'skip-release-type': 3686, 'inserted.container': 103, 'skip-huge-contribs': 88, 'skip-huge-refs': 4, 'update': 0})
+ # 1.45m new releases
+ # 2k more new containers
+ # 4.96m blank titles
+
+ real 1139m42.231s
+ user 13307m10.124s
+ sys 355m18.904s
+
+ # postgresql DB: 402.76G
+
diff --git a/extra/bulk_edits/2019-12-20_orcid.md b/extra/bulk_edits/2019-12-20_orcid.md
new file mode 100644
index 00000000..33dde32f
--- /dev/null
+++ b/extra/bulk_edits/2019-12-20_orcid.md
@@ -0,0 +1,43 @@
+
+Newer ORCID dumps are XML, not JSON. But there is a conversion tool!
+
+ https://github.com/ORCID/orcid-conversion-lib
+
+Commands:
+
+ wget https://github.com/ORCID/orcid-conversion-lib/raw/master/target/orcid-conversion-lib-0.0.2-full.jar
+ java -jar orcid-conversion-lib-0.0.2-full.jar OPTIONS
+
+ java -jar orcid-conversion-lib-0.0.2-full.jar --tarball -i ORCID_2019_summaries.tar.gz -v v3_0rc1 -o ORCID_2019_summaries_json.tar.gz
+
+ # [...]
+ # Sat Dec 21 04:43:50 UTC 2019 done 7300000
+ # Sat Dec 21 04:44:08 UTC 2019 done 7310000
+ # Sat Dec 21 04:44:17 UTC 2019 finished errors 0
+
+Importing in QA, ran in to some lines like:
+
+ {"response-code":409,"developer-message":"409 Conflict: The ORCID record is locked and cannot be edited. ORCID https://orcid.org/0000-0003-0014-6598","user-message":"The ORCID record is locked.","error-code":9018,"more-info":"https://members.orcid.org/api/resources/troubleshooting"}
+ {"response-code":409,"developer-message":"409 Conflict: The ORCID record is locked and cannot be edited. ORCID https://orcid.org/0000-0003-3750-5654","user-message":"The ORCID record is locked.","error-code":9018,"more-info":"https://members.orcid.org/api/resources/troubleshooting"}
+ {"response-code":409,"developer-message":"409 Conflict: The ORCID record is locked and cannot be edited. ORCID https://orcid.org/0000-0003-1424-4826","user-message":"The ORCID record is locked.","error-code":9018,"more-info":"https://members.orcid.org/api/resources/troubleshooting"}
+ {"response-code":409,"developer-message":"409 Conflict: The ORCID record is locked and cannot be edited. ORCID https://orcid.org/0000-0002-5340-9665","user-message":"The ORCID record is locked.","error-code":9018,"more-info":"https://members.orcid.org/api/resources/troubleshooting"}
+
+Needed to patch to filter those out. Then ran ok like:
+
+ zcat /srv/fatcat/datasets/ORCID_2019_summaries.sample_10k.json.gz | ./fatcat_import.py orcid -
+ Counter({'total': 10000, 'exists': 5323, 'insert': 4493, 'skip': 184, 'skip-no-person': 160, 'update': 0})
+
+New dump is about 7.3 million rows, so expecting about 3.2 million new
+entities, 250k skips.
+
+Doing bulk run like:
+
+ time zcat /srv/fatcat/datasets/ORCID_2019_summaries.json.gz | parallel -j8 --round-robin --pipe ./fatcat_import.py orcid -
+
+Prod timing:
+
+ Counter({'total': 910643, 'exists': 476812, 'insert': 416583, 'skip': 17248, 'update': 0})
+
+ real 47m27.658s
+ user 245m44.272s
+ sys 14m50.836s
diff --git a/extra/bulk_edits/2019-12-20_updates.md b/extra/bulk_edits/2019-12-20_updates.md
new file mode 100644
index 00000000..bd069a7a
--- /dev/null
+++ b/extra/bulk_edits/2019-12-20_updates.md
@@ -0,0 +1,137 @@
+
+## Arxiv
+
+Used metha-sync tool to update. Then went in raw storage directory (as opposed
+to using `metha-cat`) and plucked out weekly files updated since last import.
+Created a tarball and uploaded to:
+
+ https://archive.org/download/arxiv_raw_oai_snapshot_2019-05-22/arxiv_20190522_20191220.tar.gz
+
+Downloaded, extracted, then unzipped:
+
+ gunzip *.gz
+
+Run importer:
+
+ export FATCAT_AUTH_WORKER_ARXIV=...
+
+ ./fatcat_import.py --batch-size 100 arxiv /srv/fatcat/datasets/arxiv_20190522_20191220/2019-05-31-00000000.xml
+ # Counter({'exists': 1785, 'total': 1001, 'insert': 549, 'skip': 1, 'update': 0})
+
+ fd .xml /srv/fatcat/datasets/arxiv_20190522_20191220/ | parallel -j15 ./fatcat_import.py --batch-size 100 arxiv {}
+
+Things seem to run smoothly in QA. New releases get grouped with old works
+correctly, no duplication obvious.
+
+In prod, loaded just the first file as a start, waiting to see if auto-ingest
+happens. Looks like yes! Great that everything is so smooth. All seem to be new
+captures.
+
+In prod elasticsearch, 2,377,645 arxiv releases before this
+updated import, 741,033 with files attached. Guessing about 150k new releases,
+but will check.
+
+Up to 2,531,542 arxiv releases, so only 154k or so new releases created.
+781,122 with fulltext.
+
+## Pubmed QA
+
+Grabbed fresh 2020 baseline, released in December 2019: <https://archive.org/details/pubmed_medline_baseline_2020>
+
+ gunzip *.xml.gz
+
+Run importer:
+
+ export FATCAT_AUTH_WORKER_PUBMED=...
+
+ ./fatcat_import.py pubmed /srv/fatcat/datasets/pubmed_medline_baseline_2020/pubmed20n1000.xml /srv/fatcat/datasets/ISSN-to-ISSN-L.txt
+
+ # Counter({'total': 29975, 'update': 26650, 'skip': 2081, 'insert': 1193, 'warn-pmid-doi-mismatch': 36, 'exists': 36, 'skip-update-conflict': 15, 'inserted.container': 3})
+
+Noticed that `release_year` was not getting set for many releases. Made a small
+code tweak (`1bb0a2181d5a30241d80279c5930eb753733f30b`) and trying another:
+
+ time ./fatcat_import.py pubmed /srv/fatcat/datasets/pubmed_medline_baseline_2020/pubmed20n1001.xml /srv/fatcat/datasets/ISSN-to-ISSN-L.txt
+
+ # Counter({'total': 30000, 'update': 25912, 'skip': 2119, 'insert': 1935, 'exists': 29, 'warn-pmid-doi-mismatch': 27, 'skip-update-conflict': 5, 'inserted.container': 1})
+
+ real 30m45.044s
+ user 16m43.672s
+ sys 0m10.792s
+
+ time fd '.xml$' /srv/fatcat/datasets/pubmed_medline_baseline_2020 | time parallel -j16 ./fatcat_import.py pubmed {} /srv/fatcat/datasets/ISSN-to-ISSN-L.txt
+
+More errors:
+
+ HTTP response body: {"success":false,"error":"MalformedExternalId","message":"external identifier doesn't match required pattern for a DOI (expected, eg, '10.1234/aksjdfh'): 10.3760/cma. j. issn.2095-4352. 2014. 07.014"}
+ HTTP response body: {"success":false,"error":"MalformedExternalId","message":"external identifier doesn't match required pattern for a DOI (expected, eg, '10.1234/aksjdfh'): 10.13201/j.issn.10011781.2016.06.002"}
+ HTTP response body: {"success":false,"error":"MalformedExternalId","message":"external identifier doesn't match required pattern for a DOI (expected, eg, '10.1234/aksjdfh'): 10.23750/abm.v88i2 -s.6506"}
+
+
+ 10.1037//0002-9432.72.1.50
+ BOGUS DOI: 10.1037//0021-843x.106.2.266
+ BOGUS DOI: 10.1037//0021-843x.106.2.280
+ => actual ok? at least redirect ok
+
+ unparsable medline date, skipping: Summer 2018
+
+TODO:
+x fix bad DOI error (real error, skip these)
+x remove newline after "unparsable medline date" error
+x remove extra line like "existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid))" in warning
+
+NOTE: Remember having run through the entire baseline in QA, but didn't save the command or output.
+
+## Pubmed Prod (2020-01-17)
+
+This is after adding a flag to enforce no updates at all, only new releases.
+Will likely revisit and run through with updates that add important metadata
+like exact references matches for older releases, after doing release
+merge/group cleanups.
+
+
+ # git commit: d55d45ad667ccf34332b2ce55e8befbd212922ec
+ # had a trivial typo in fatcat_import.py, will push a fix
+ export FATCAT_AUTH_WORKER_PUBMED=...
+ time ./fatcat_import.py pubmed /srv/fatcat/datasets/pubmed_medline_baseline_2020/pubmed20n1001.xml /srv/fatcat/datasets/ISSN-to-ISSN-L.txt
+
+Full run:
+
+ fd '.xml$' /srv/fatcat/datasets/pubmed_medline_baseline_2020 | time parallel -j16 ./fatcat_import.py pubmed {} /srv/fatcat/datasets/ISSN-to-ISSN-L.txt
+
+ [...]
+ Command exited with non-zero status 2
+ 1271708.20user 23689.44system 31:42:15elapsed 1134%CPU (0avgtext+0avgdata 584588maxresident)k
+ 486129672inputs+2998072outputs (3672major+139751796minor)pagefaults 0swaps
+
+ => so apparently 2x tasks failed
+ => 1271708 = 353 hours... but what walltime? about 31-32 hours if divide by CPU
+
+Only received a single exception at:
+
+ Jan 18, 2020 8:33:09 AM UTC
+ /srv/fatcat/datasets/pubmed_medline_baseline_2020/pubmed20n0936.xml
+ MalformedExternalId: 10.4149/gpb¬_2017042
+
+Not sure what the other failure was... maybe an invalid filename or argument,
+before processing actually started? Or some failure (OOM) that prevented sentry
+reporting?
+
+Patch normal.py and re-run that single file:
+
+ ./fatcat_import.py pubmed /srv/fatcat/datasets/pubmed_medline_baseline_2020/pubmed20n0936.xml /srv/fatcat/datasets/ISSN-to-ISSN-L.txt
+ [...]
+ Counter({'total': 30000, 'exists': 27243, 'skip': 1605, 'insert': 1152, 'warn-pmid-doi-mismatch': 26, 'update': 0})
+
+Done!
+
+## Chocula
+
+Command:
+
+ export FATCAT_AUTH_WORKER_JOURNAL_METADATA=[...]
+ ./fatcat_import.py chocula /srv/fatcat/datasets/export_fatcat.2019-12-26.json
+
+Result:
+
+ Counter({'total': 144455, 'exists': 139807, 'insert': 2384, 'skip': 2264, 'skip-unknown-new-issnl': 2264, 'exists-by-issnl': 306, 'update': 0})
diff --git a/extra/bulk_edits/2020-03-19_arxiv_pubmed.md b/extra/bulk_edits/2020-03-19_arxiv_pubmed.md
new file mode 100644
index 00000000..56e88880
--- /dev/null
+++ b/extra/bulk_edits/2020-03-19_arxiv_pubmed.md
@@ -0,0 +1,57 @@
+
+On 2020-03-20, automated daily harvesting and importing of arxiv and pubmed
+metadata started. In the case of pubmed, updates are enabled, so that recently
+created DOI releases get updated with PMID and extra metadata.
+
+We also want to do last backfills of metadata since the last import up through
+the first day updated by the continuous harvester.
+
+
+## arxiv
+
+The previous date span was 2019-05-22 through 2019-12-20. This time we should
+do 2019-12-20 through today.
+
+First do metha update from last harvest through today, and grab the new daily files:
+
+ metha-sync -format arXivRaw http://export.arxiv.org/oai2
+
+ mkdir arxiv_20191220_20200319
+ cp 2019-12-2* 2019-12-3* 2020-* arxiv_20191220_20200319/
+ tar cf arxiv_20191220_20200319.tar arxiv_20191220_20200319/
+ gzip arxiv_20191220_20200319.tar
+
+Then copy to fatcat server and run import:
+
+ export FATCAT_AUTH_WORKER_ARXIV=...
+
+ ./fatcat_import.py --batch-size 100 arxiv /srv/fatcat/datasets/arxiv_20191220_20200319/2019-12-31-00000000.xml
+ => Counter({'exists': 1824, 'total': 1001, 'insert': 579, 'skip': 1, 'update': 0})
+
+ fd .xml /srv/fatcat/datasets/arxiv_20191220_20200319/ | parallel -j15 ./fatcat_import.py --batch-size 100 arxiv {}
+
+Ran fairly quickly; only some ~80-90k entities to process.
+
+## PubMed
+
+First, mirror update files from FTP, e.g. via lftp:
+
+ mkdir -p /srv/fatcat/datasets/pubmed_updates
+ lftp -e 'mirror -c /pubmed/updatefiles /srv/fatcat/datasets/pubmed_updates; bye' ftp://ftp.ncbi.nlm.nih.gov
+
+Inspect completed dates from kafka:
+
+ kafkacat -b $KAFKA_BROKER -t fatcat-prod.ftp-pubmed-state -C
+
+Show dates and corresponding files:
+
+ find /srv/fatcat/datasets/pubmed_updates -name "*html" | xargs cat | grep "Created" | sort
+
+For this bulk import, we used files pubmed20n1016.xml.gz (2019-12-16) up to pubmed20n1110.xml.gz (2020-03-06).
+
+To import the corresponding files, run:
+
+ printf "%s\n" /srv/fatcat/datasets/pubmed_updates/pubmed20n{1016..1110}.xml.gz | shuf | \
+ parallel -j16 'gunzip -c {} | ./fatcat_import.py pubmed --do-updates - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt'
+
+Import took 254 min, there were 1715427 PubmedArticle docs in these update files.
diff --git a/extra/bulk_edits/2020-03-23_jalc.md b/extra/bulk_edits/2020-03-23_jalc.md
new file mode 100644
index 00000000..d63c3759
--- /dev/null
+++ b/extra/bulk_edits/2020-03-23_jalc.md
@@ -0,0 +1,23 @@
+
+2019-10-01 JaLC metadata snapshot: <https://archive.org/download/jalc-bulk-metadata-2019>
+
+Extracted .rdf file instead of piping it through zcat.
+
+Use correct bot:
+
+ export FATCAT_AUTH_WORKER_JALC=blah
+
+Start small; do a random bunch (10k) single-threaded to pre-create containers:
+
+ head -n100 /srv/fatcat/datasets/JALC-LOD-20191001.rdf | ./fatcat_import.py --batch-size 100 jalc - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt
+ shuf -n100 /srv/fatcat/datasets/JALC-LOD-20191001.rdf | ./fatcat_import.py --batch-size 100 jalc - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt
+ shuf -n10000 /srv/fatcat/datasets/JALC-LOD-20191001.rdf | ./fatcat_import.py --batch-size 100 jalc - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt
+
+Seemed like lots of individual containers getting added after repeating, so
+just going to import single-threaded to avoid duplicate container creation:
+
+ cat /srv/fatcat/datasets/JALC-LOD-20191001.rdf | ./fatcat_import.py --batch-size 100 jalc - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt
+ => Counter({'total': 8419745, 'exists': 6480683, 'insert': 1934082, 'skip': 4980, 'inserted.container': 134, 'update': 0})
+
+Had a bit fewer than 4,568,120 "doi_registrar:jalc" releases before this
+import, 6,502,202 after (based on `doi_registrar:jalc` query).
diff --git a/extra/bulk_edits/2020-08-05_chocula.md b/extra/bulk_edits/2020-08-05_chocula.md
new file mode 100644
index 00000000..78d2c9c3
--- /dev/null
+++ b/extra/bulk_edits/2020-08-05_chocula.md
@@ -0,0 +1,17 @@
+
+Using `journal-metadata-bot` and `chocula_fatcat_export.2020-07-31.json` export.
+
+Start small:
+
+ head -n100 /srv/fatcat/datasets/chocula_fatcat_export.2020-07-31.json | ./fatcat_import.py chocula --do-updates -
+ => Counter({'total': 100, 'update': 67, 'exists': 25, 'exists-skip-update': 24, 'insert': 8, 'exists-by-issnl': 1, 'skip': 0})
+
+Full batch:
+
+ time cat /srv/fatcat/datasets/chocula_fatcat_export.2020-07-31.json | ./fatcat_import.py chocula --do-updates -
+
+ Counter({'total': 164950, 'update': 112074, 'exists': 37243, 'exists-skip-update': 35862, 'insert': 15633, 'exists-by-issnl': 1381, 'skip': 0})
+
+ real 16m9.779s
+ user 6m26.324s
+ sys 0m16.088s
diff --git a/extra/bulk_edits/2020-09-02_file_meta.md b/extra/bulk_edits/2020-09-02_file_meta.md
new file mode 100644
index 00000000..b0606f2d
--- /dev/null
+++ b/extra/bulk_edits/2020-09-02_file_meta.md
@@ -0,0 +1,75 @@
+
+Approximately 18 million file entities have only partial file metadata. All
+have a sha1 (hex), but many are missing file size, md5, mimetype, etc.
+
+At least a few thousand of these are additionally *not* `application/pdf`
+mimetype based on actually retrieving the file and sniffing the file type.
+These were added earlier to the catalog, likely based on CDX mimetype, which is
+server-reported and can be incorrect.
+
+## QA Testing
+
+ ./fatcat_import.py --editgroup-description-override "backfill of full file-level metadata for early-imported papers" file-meta -
+ => Counter({'total': 1000, 'update': 1000, 'skip': 0, 'insert': 0, 'exists': 0})
+
+ # Identical command, verifying that we don't double-insert:
+ => Counter({'total': 1000, 'skip-existing-complete': 1000, 'skip': 0, 'insert': 0, 'update': 0, 'exists': 0})
+
+Two additional file-level cleanups should probably be done at the same time:
+
+Partial wayback URL timestamps, for cases where we have the full timestamped URL. Eg:
+
+ https://web.archive.org/web/2017/http://www.geoamazonia.net/index.php/revista/article/download/51/pdf_38
+ https://web.archive.org/web/20170922010835/http://www.geoamazonia.net/index.php/revista/article/download/51/pdf_38
+ https://qa.fatcat.wiki/file/4udmm4zd4bgfhnaaycqoztgfgm
+ https://qa.fatcat.wiki/file/k73il3k5hzemtnkqa5qyorg6ci
+ https://qa.fatcat.wiki/file/7hstlrabfjb6vgyph7ntqtpkne
+
+Live-web URLs identical except for http/https flip or other trivial things (much less frequent case):
+
+ http://eo1.gsfc.nasa.gov/new/validationReport/Technology/JoeCD/asner_etal_PNAS_20041.pdf
+ https://eo1.gsfc.nasa.gov/new/validationReport/Technology/JoeCD/asner_etal_PNAS_20041.pdf
+
+ http://homepages.math.uic.edu/~rosendal/PapersWebsite/BanachMinimalExamples.pdf
+ http://homepages.math.uic.edu:80/~rosendal/PapersWebsite/BanachMinimalExamples.pdf
+ https://qa.fatcat.wiki/file/h2wx6re5fjhx7c6duifzskeo6u
+ https://qa.fatcat.wiki/file/vw7divmjwveftn4djj2cp32n4i
+
+Which bot to use? Let's do `sandcrawler-bot`.
+
+Trying a larger batch to see what database size increase is going to look like,
+and whether single-threaded is going to be too slow:
+
+ # before: Size: 517.87G
+
+ time zcat /srv/fatcat/datasets/fatcat_file_partial.file_meta.json.gz | head -n500000 | pv -l | ./fatcat_import.py --editgroup-description-override "backfill of full file-level metadata for early-imported papers" file-meta -
+ => 145m18.615s
+
+ # after: 518.47G
+ # delta: 600 MB
+
+A million records would take about 5 hours, so 100 hours total, or 4 days. Let's do parallelism.
+
+Total size increase estimated as 24 GBytes. It all adds up!
+
+ time zcat /srv/fatcat/datasets/fatcat_file_partial.file_meta.json.gz | tail -n500000 | pv -l | parallel -j8 --round-robin --pipe -q ./fatcat_import.py --editgroup-description-override 'backfill of full file-level metadata for early-imported papers' file-meta -
+ => real 32m53.935s
+
+## Production Import
+
+Before Size: 624.63G
+
+ export FATCAT_API_AUTH_TOKEN=... # sandcrawler-bot
+
+ # start small
+ time zcat /srv/fatcat/datasets/fatcat_file_partial.file_meta.json.gz | pv -l | head -n1000 | parallel -j8 --round-robin --pipe -q ./fatcat_import.py --editgroup-description-override 'backfill of full file-level metadata for early-imported papers' file-meta -
+
+ # full batch
+ time zcat /srv/fatcat/datasets/fatcat_file_partial.file_meta.json.gz | pv -l | parallel -j8 --round-robin --pipe -q ./fatcat_import.py --editgroup-description-override 'backfill of full file-level metadata for early-imported papers' file-meta -
+
+ => 18.1M 20:53:32 [ 241 /s]
+
+ Counter({'total': 2234159, 'update': 2234111, 'skip-existing-complete': 48, 'skip': 0, 'insert': 0, 'exists': 0})
+ (etc, 8x)
+
+After Size: 653.69G (+29GB or so)
diff --git a/extra/bulk_edits/2020-10-08_chocula.md b/extra/bulk_edits/2020-10-08_chocula.md
new file mode 100644
index 00000000..d60b6842
--- /dev/null
+++ b/extra/bulk_edits/2020-10-08_chocula.md
@@ -0,0 +1,44 @@
+
+Another update of journal metadata. In this case due to expanding "Keepers"
+coverage to PKP PLN, HathiTrust, Scholars Portal, and Cariniana.
+
+Using `journal-metadata-bot` and `chocula.2020-10-08.json` export.
+
+## QA Testing
+
+ shuf -n1000 /srv/fatcat/datasets/chocula.2020-10-08.json | ./fatcat_import.py chocula --do-updates -
+ Counter({'total': 1000, 'exists': 640, 'exists-skip-update': 532, 'update': 348, 'exists-not-found': 108, 'insert': 12, 'skip': 0})
+
+Expecting roughly a 1/3 update rate. Most of these seem to be true updates (eg,
+adding kbart metadata). A smaller fraction are just updating DOAJ timestamp or
+not updating any metadata at all.
+
+ head -n500 /srv/fatcat/datasets/chocula.2020-10-08.json | ./fatcat_import.py chocula --do-updates -
+ Counter({'total': 500, 'exists': 372, 'exists-skip-update': 328, 'update': 121, 'exists-not-found': 44, 'insert': 7, 'skip': 0})
+
+ head -n500 /srv/fatcat/datasets/chocula.2020-10-08.json | ./fatcat_import.py chocula --do-updates -
+ Counter({'total': 500, 'exists': 481, 'exists-skip-update': 430, 'exists-not-found': 44, 'update': 19, 'exists-by-issnl': 7, 'skip': 0, 'insert': 0})
+
+Made some changes in `27fe31d5ffcac700c30b2b10d56685ef0fa4f3a8` which seem to
+have removed the spurious null updates, while retaining DOAJ date-only updates.
+
+Also, as a small nit, notice that occasionally `kbart` metadata gets added with
+no year spans. This seems to be common with cariniana. Presumably this happens
+when there is no year span info available, only volumes. Seems like a valuable
+thing to include as a flag anyways.
+
+## Prod Import
+
+Start small:
+
+ head -n100 /srv/fatcat/datasets/chocula.2020-10-08.json | ./fatcat_import.py chocula --do-updates -
+ => Counter({'total': 100, 'exists': 69, 'exists-skip-update': 68, 'update': 30, 'insert': 1, 'exists-by-issnl': 1, 'skip': 0})
+
+Full batch:
+
+ time cat /srv/fatcat/datasets/chocula.2020-10-08.json | ./fatcat_import.py chocula --do-updates -
+ => Counter({'total': 167092, 'exists': 110594, 'exists-skip-update': 109852, 'update': 55274, 'insert': 1224, 'exists-by-issnl': 742, 'skip': 0})
+
+ real 10m45.714s
+ user 4m51.680s
+ sys 0m12.236s
diff --git a/extra/bulk_edits/2020-12-01_orcid.md b/extra/bulk_edits/2020-12-01_orcid.md
new file mode 100644
index 00000000..b6883b17
--- /dev/null
+++ b/extra/bulk_edits/2020-12-01_orcid.md
@@ -0,0 +1,55 @@
+
+Another annual ORCID dump, basically the same as last year (2019). Expecting
+around 10 million total ORCIDs, compared to 7.3 million last year, so maybe 2.5
+million new creator entities.
+
+In particular motivated to run this import before a potential dblp import
+and/or creator creation run.
+
+Files downloaded from:
+
+- <https://orcid.figshare.com/articles/dataset/ORCID_Public_Data_File_2020/13066970>
+- <https://archive.org/details/orcid-dump-2020>
+
+## Prep
+
+ wget https://github.com/ORCID/orcid-conversion-lib/raw/master/target/orcid-conversion-lib-0.0.2-full.jar
+
+ java -jar orcid-conversion-lib-0.0.2-full.jar --tarball -i ORCID_2020_10_summaries.tar.gz -v v3_0rc1 -o ORCID_2020_10_summaries_json.tar.gz
+
+ tar xvf ORCID_2020_10_summaries_json.tar.gz
+
+ fd .json ORCID_2020_10_summaries/ | parallel cat {} | jq . -c | pv -l | gzip > ORCID_2020_10_summaries.json.gz
+
+ zcat ORCID_2020_10_summaries.json.gz | shuf -n10000 | gzip > ORCID_2020_10_summaries.sample_10k.json.gz
+
+ ia upload orcid-dump-2020 ORCID_2020_10_summaries_json.tar.gz ORCID_2020_10_summaries.sample_10k.json.gz
+
+## Import
+
+Fetch to prod machine:
+
+ wget https://archive.org/download/orcid-dump-2020/ORCID_2020_10_summaries.json.gz
+ wget https://archive.org/download/orcid-dump-2020/ORCID_2020_10_summaries.sample_10k.json.gz
+
+Sample:
+
+ export FATCAT_AUTH_WORKER_ORCID=[...]
+ zcat /srv/fatcat/datasets/ORCID_2020_10_summaries.sample_10k.json.gz | ./fatcat_import.py orcid -
+ => Counter({'total': 10000, 'exists': 7356, 'insert': 2465, 'skip': 179, 'update': 0})
+
+Bulk import:
+
+ export FATCAT_AUTH_WORKER_ORCID=[...]
+ time zcat /srv/fatcat/datasets/ORCID_2020_10_summaries.json.gz | pv -l | parallel -j8 --round-robin --pipe ./fatcat_import.py orcid -
+ => Counter({'total': 1208991, 'exists': 888696, 'insert': 299008, 'skip': 21287, 'update': 0})
+ => (8x of the above, roughly)
+
+ real 88m40.960s
+ user 389m35.344s
+ sys 23m18.396s
+
+
+ Before: Size: 673.36G
+ After: Size: 675.55G
+
diff --git a/extra/bulk_edits/2020-12-14_doaj.md b/extra/bulk_edits/2020-12-14_doaj.md
new file mode 100644
index 00000000..5e897183
--- /dev/null
+++ b/extra/bulk_edits/2020-12-14_doaj.md
@@ -0,0 +1,139 @@
+
+## Earlier QA Testing (November 2020)
+
+ export FATCAT_API_AUTH_TOKEN=... (FATCAT_AUTH_WORKER_DOAJ)
+
+ # small test:
+ zcat /srv/fatcat/datasets/doaj_article_data_2020-11-13_all.json.gz | head | ./fatcat_import.py doaj-article --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt -
+
+ # full run
+ zcat /srv/fatcat/datasets/doaj_article_data_2020-11-13_all.json.gz | pv -l | parallel -j12 --round-robin --pipe ./fatcat_import.py doaj-article --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt -
+
+ before: 519.17G
+ after: 542.08G
+
+
+ 5.45M 6:29:17 [ 233 /s]
+
+ 12x of:
+ Counter({'total': 455504, 'insert': 394437, 'exists': 60615, 'skip': 452, 'skip-title': 452, 'update': 0})
+
+ total: ~5,466,048
+ insert: ~4,733,244
+ exists: ~727,380
+
+Initial imports (before crash) were like:
+
+ Counter({'total': 9339, 'insert': 9330, 'skip': 9, 'skip-title': 9, 'update': 0, 'exists': 0})
+
+Seems like there is a bug, not finding existing by DOI?
+
+## Prod Container Metadata Update (chocula)
+
+Generic update of container metadata using chocula pipeline. Need to run this
+before DOAJ import to ensure we have all the containers already updated.
+
+Also updating ISSN-L index at the same time. Using a 2020-11-19 metadata
+snapshot, which was generated on 2020-12-07; more recent snapshots had small
+upstream changes in some formats so it wasn't trivial to run with a newer
+snapshot.
+
+ # git rev: 9f67c82ce8952bbe9a7a07b732830363c7865485
+
+ # from laptop, then unzip on prod machine
+ scp chocula_fatcat_export.2020-11-19.json.gz fatcat-prod1-vm:/srv/fatcat/datasets/
+
+ # check ISSN-L symlink
+ # ISSN-to-ISSN-L.txt -> 20201119.ISSN-to-ISSN-L.txt
+
+ export FATCAT_AUTH_WORKER_JOURNAL_METADATA=...
+ head -n200 /srv/fatcat/datasets/chocula_fatcat_export.2020-11-19.json | ./fatcat_import.py chocula -
+ Counter({'total': 200, 'exists': 200, 'exists-by-issnl': 6, 'skip': 0, 'insert': 0, 'update': 0})
+
+ head -n200 /srv/fatcat/datasets/chocula_fatcat_export.2020-11-19.json | ./fatcat_import.py chocula - --do-updates
+ Counter({'total': 200, 'exists': 157, 'exists-skip-update': 151, 'update': 43, 'exists-by-issnl': 6, 'skip': 0, 'insert': 0})
+
+Some of these are very minor updates, so going to do just creation (no
+`--do-updates`) to start.
+
+ time ./fatcat_import.py chocula /srv/fatcat/datasets/chocula_fatcat_export.2020-11-19.json
+ Counter({'total': 168165, 'exists': 167497, 'exists-by-issnl': 2371, 'insert': 668, 'skip': 0, 'update': 0})
+
+ real 5m37.081s
+ user 3m1.648s
+ sys 0m9.488s
+
+TODO: tweak chocula import script to not update on `extra.state` metadata.
+
+
+## Release Metadata Bulk Import
+
+This is the first production bulk import of DOAJ metadata!
+
+ # git rev: 9f67c82ce8952bbe9a7a07b732830363c7865485
+ # DB before: Size: 678.15G
+
+ # ensure fatcatd is updated to have support for DOAJ identifier
+
+ # create new bot user
+ ./target/release/fatcat-auth create-editor --admin --bot doaj-bot
+ => mir5imb3v5ctxcaqnbstvmri2a
+
+ ./target/release/fatcat-auth create-token mir5imb3v5ctxcaqnbstvmri2a
+ => ...
+
+ # download dataset
+ wget https://archive.org/download/doaj_data_2020-11-13/doaj_article_data_2020-11-13.sample_10k.json.gz
+ wget https://archive.org/download/doaj_data_2020-11-13/doaj_article_data_2020-11-13_all.json.gz
+
+ export FATCAT_AUTH_WORKER_DOAJ=...
+
+ # start small
+ zcat /srv/fatcat/datasets/doaj_article_data_2020-11-13.sample_10k.json.gz | head -n100 | ./fatcat_import.py doaj-article --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt -
+ => Counter({'total': 100, 'exists': 70, 'insert': 30, 'skip': 0, 'update': 0})
+
+That is about expected, in terms of fraction without DOI. However, 6 out of 10
+(randomly checked) of the inserted releases seem to be dupes, which feels too
+high. So going to pause this import until basic fuzzy matching is ready from
+Martin's fuzzycat work, and will check against elasticsearch before import.
+Will shuffle the entire file, import in a single thread, and just skip
+importing if there is any fuzzy match (not try to merge/update). Expecting
+about 500k new releases after such filtering.
+
+Ok, on 2020-12-17, back with patches to use fuzzycat in filtering. Trying
+another batch:
+
+ # git rev: 60e022609cd3fbbf9634577149018592e680858d
+ # DB before: Size: 678.47G
+
+ export FATCAT_AUTH_WORKER_DOAJ=...
+
+ zcat /srv/fatcat/datasets/doaj_article_data_2020-11-13.sample_10k.json.gz | head -n1000 | tail -n100 | ./fatcat_import.py doaj-article --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt -
+ => Counter({'total': 100, 'exists': 71, 'insert': 19, 'exists-fuzzy': 10, 'skip': 0, 'update': 0})
+
+ # https://fatcat.wiki/changelog/5033496
+
+Sampled 10x of these and they look much better: no obvious duplication. Going
+ahead with the full import; note that other ingest is happening in parallel
+(many crossref, datacite, and pubmed imports which backed up).
+
+ # full run
+ # note the shuf command added, in an attempt to reduce duplicates within this corpus
+ zcat /srv/fatcat/datasets/doaj_article_data_2020-11-13_all.json.gz | shuf | pv -l | parallel -j12 --round-robin --pipe ./fatcat_import.py doaj-article --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt -
+
+ # started 2020-12-17 22:01 (Pacific)
+
+ => 5.45M 52:38:45 [28.8 /s]
+ => Counter({'total': 1366458, 'exists': 1020295, 'insert': 200249, 'exists-fuzzy': 144334, 'skip': 1563, 'skip-title': 1563, 'skip-doaj-id-mismatch': 17, 'update': 0})
+
+As total estimates:
+
+- total: 5,465,832
+- exists: 4,081,180
+- exists-fuzzy: 577,336
+- insert: 800,996
+
+Ending database size: Size: 684.08G
+
+(note that regular imports were running during same period)
+
diff --git a/extra/bulk_edits/2020-12-23_dblp.md b/extra/bulk_edits/2020-12-23_dblp.md
new file mode 100644
index 00000000..a33411cb
--- /dev/null
+++ b/extra/bulk_edits/2020-12-23_dblp.md
@@ -0,0 +1,55 @@
+
+## Prod Container Import
+
+Using 2020-11-30 XML dump, then scrape and transform tooling from
+`extra/dblp/`.
+
+ wget https://archive.org/download/dblp-xml-2020-11-30/dblp_container_meta.json
+
+ # updated ISSN-to-ISSN-L.txt symlink to 20201207.ISSN-to-ISSN-L.txt
+
+ touch /srv/fatcat/datasets/blank_dblp_containers.tsv
+
+Create new `dblp-bot` user:
+
+ ./target/release/fatcat-auth create-editor --admin --bot dblp-bot
+ => gwbheb5jfngrxkcad5qgth5cra
+
+ ./target/release/fatcat-auth create-token gwbheb5jfngrxkcad5qgth5cra
+
+Run import:
+
+ # git commit: ec6b366af8df1956e1287cba2e0818b80ce1c518
+
+ export FATCAT_AUTH_WORKER_DBLP=...
+
+ ./fatcat_import.py dblp-container --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --dblp-container-map-file /srv/fatcat/datasets/blank_dblp_containers.tsv --dblp-container-map-output /srv/fatcat/datasets/all_dblp_containers.tsv /srv/fatcat/datasets/dblp_container_meta.json
+ => Got 0 existing dblp container mappings.
+ => Counter({'total': 6954, 'insert': 5202, 'exists': 1752, 'skip': 0, 'update': 0})
+
+ wc -l /srv/fatcat/datasets/all_dblp_containers.tsv
+ 6955 /srv/fatcat/datasets/all_dblp_containers.tsv
+
+## Prod Release Import
+
+Using same 2020-11-30 XML dump. Download to /srv/fatcat/datasets:
+
+ wget https://archive.org/download/dblp-xml-2020-11-30/dblp.dtd
+ wget https://archive.org/download/dblp-xml-2020-11-30/dblp.xml
+
+Run import:
+
+ export FATCAT_AUTH_WORKER_DBLP=...
+
+ ./fatcat_import.py dblp-release --dblp-container-map-file /srv/fatcat/datasets/all_dblp_containers.tsv /srv/fatcat/datasets/dblp.xml --do-updates
+
+ # started 2020-12-23 11:51 (Pacific)
+
+ # restarted/tweaked at least twice
+
+ # finally ended around 2020-12-27 after about... 48 hours?
+
+ => Counter({'total': 7953365, 'has-doi': 4277307, 'skip': 3097418, 'skip-key-type': 2640968, 'skip-update': 2480449, 'exists': 943800, 'update': 889700, 'insert': 338842, 'skip-arxiv-corr': 312872, 'exists-fuzzy': 203103, 'skip-dblp-container-missing': 143578, 'skip-arxiv': 53, 'skip-title': 1})
+
+Starting database size (roughly): Size: 684.08G
+Ending database size: Size: 690.22G
diff --git a/extra/bulk_edits/2020_datacite.md b/extra/bulk_edits/2020_datacite.md
new file mode 100644
index 00000000..05d09517
--- /dev/null
+++ b/extra/bulk_edits/2020_datacite.md
@@ -0,0 +1,152 @@
+
+
+## QA Runs
+
+Trying on 2019-12-22, using Martin commit 18d411087007a30fbf027b87e30de42344119f0c from 2019-12-20.
+
+Quick test:
+
+ # this branch adds some new deps, so make sure to install them
+ pipenv install --deploy --dev
+ pipenv shell
+ export FATCAT_AUTH_WORKER_DATACITE="..."
+ xzcat /srv/fatcat/datasets/datacite.ndjson.xz | head -n100 | ./fatcat_import.py datacite - /srv/fatcat/datasets/20181203.ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3
+
+ISSUE: `--extid-map-file` not passed through, so drop the:
+
+ --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3
+
+ISSUE: auth_var should be FATCAT_AUTH_WORKER_DATACITE
+
+Test full parallel command:
+
+ export FATCAT_AUTH_WORKER_DATACITE="..."
+ time xzcat /srv/fatcat/datasets/datacite.ndjson.xz | head -n10000 | parallel -j20 --round-robin --pipe ./fatcat_import.py datacite - /srv/fatcat/datasets/20181203.ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3
+
+ real 0m30.017s
+ user 3m5.576s
+ sys 0m19.640s
+
+Whole lot of:
+
+ invalid literal for int() with base 10: '10,495'
+ invalid literal for int() with base 10: '11,129'
+
+ invalid literal for int() with base 10: 'n/a'
+ invalid literal for int() with base 10: 'n/a'
+
+ invalid literal for int() with base 10: 'OP98'
+ invalid literal for int() with base 10: 'OP208'
+
+ no mapped type: None
+ no mapped type: None
+ no mapped type: None
+
+Re-ran above:
+
+ real 0m27.764s
+ user 3m2.448s
+ sys 0m12.908s
+
+Compare with `--lang-detect`:
+
+ real 0m27.395s
+ user 3m5.620s
+ sys 0m13.344s
+
+Not noticeable?
+
+Whole run:
+
+ export FATCAT_AUTH_WORKER_DATACITE="..."
+ time xzcat /srv/fatcat/datasets/datacite.ndjson.xz | parallel -j20 --round-robin --pipe ./fatcat_import.py datacite - /srv/fatcat/datasets/20181203.ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3
+
+ real 35m21.051s
+ user 98m57.448s
+ sys 7m9.416s
+
+Huh. Kind of suspiciously fast.
+
+ select count(*) from editgroup where editor_id='07445cd2-cab2-4da5-9f84-34588b7296aa';
+ => 9952 editgroups
+
+ select count(*) from release_edit inner join editgroup on release_edit.editgroup_id = editgroup.id where editgroup.editor_id='07445cd2-cab2-4da5-9f84-34588b7296aa';
+ => 496,342 edits
+
+While running:
+
+ starting around 5k TPS in pg_activity
+ starting size: 367.58G
+ (this is after arxiv and some other changes on top of 2019-12-13 dump)
+ host doing a load average of about 5.5; fatcatd at 115% CPU
+
+ ending size: 371.43G
+
+Actually seems like extremely few DOIs getting inserted? Hrm.
+
+ xzcat /srv/fatcat/datasets/datacite.ndjson.xz | wc -l
+ => 18,210,075
+
+Last DOIs inserted were around: 10.7916/d81v6rqr
+
+Suspect a bunch of errors or something and output getting mangled by all the
+logging? Squelched logging and running again (using same DB/config), except
+with `pv -l` inserted after `xzcat`.
+
+Seems to run at a couple hundred records a second (very volatile).
+
+ Counter({'total': 42919, 'insert': 21579, 'exists': 21334, 'skip': 6, 'skip-blank-title': 6, 'inserted.container': 1, 'update': 0})
+ Counter({'total': 43396, 'insert': 23274, 'exists': 20120, 'skip-blank-title': 2, 'skip': 2, 'update': 0})
+
+Ok! The actual errors:
+
+
+ Traceback (most recent call last):
+ File "./fatcat_import.py", line 507, in <module>
+ main()
+ File "./fatcat_import.py", line 504, in main
+ args.func(args)
+ File "./fatcat_import.py", line 182, in run_datacite
+ JsonLinePusher(dci, args.json_file).run()
+ File "/srv/fatcat/src/python/fatcat_tools/importers/common.py", line 559, in run
+ self.importer.push_record(record)
+ File "/srv/fatcat/src/python/fatcat_tools/importers/common.py", line 318, in push_record
+ entity = self.parse_record(raw_record)
+ File "/srv/fatcat/src/python/fatcat_tools/importers/datacite.py", line 447, in parse_record
+ sha1 = hashlib.sha1(text.encode('utf-8')).hexdigest()
+ AttributeError: 'list' object has no attribute 'encode'
+
+ fatcat_openapi_client.exceptions.ApiException: (400)
+ Reason: Bad Request
+ HTTP response headers: HTTPHeaderDict({'Content-Length': '186', 'Content-Type': 'application/json', 'Date': 'Mon, 23 Dec 2019 08:12:16 GMT', 'X-Clacks-Overhead': 'GNU aaronsw, jpb', 'X-Span-ID': '73b0b698-bf88-4721-b869-b322dbe90cbe'})
+ HTTP response body: {"success":false,"error":"MalformedExternalId","message":"external identifier doesn't match required pattern for a DOI (expected, eg, '10.1234/aksjdfh'): 10.17167/mksz.2017.2.129–155"}
+
+
+ Traceback (most recent call last):
+ File "./fatcat_import.py", line 507, in <module>
+ main()
+ File "./fatcat_import.py", line 504, in main
+ args.func(args)
+ File "./fatcat_import.py", line 182, in run_datacite
+ JsonLinePusher(dci, args.json_file).run()
+ File "/srv/fatcat/src/python/fatcat_tools/importers/common.py", line 559, in run
+ self.importer.push_record(record)
+ File "/srv/fatcat/src/python/fatcat_tools/importers/common.py", line 318, in push_record
+ entity = self.parse_record(raw_record)
+ File "/srv/fatcat/src/python/fatcat_tools/importers/datacite.py", line 447, in parse_record
+ sha1 = hashlib.sha1(text.encode('utf-8')).hexdigest()
+ AttributeError: 'list' object has no attribute 'encode'
+
+
+ fatcat_openapi_client.exceptions.ApiException: (400)
+ Reason: Bad Request
+ HTTP response headers: HTTPHeaderDict({'Content-Type': 'application/json', 'X-Span-ID': 'ca141ff4-83f7-4ee5-9256-91b23ec09e94', 'Content-Length': '188', 'X-Clacks-Overhead': 'GNU aaronsw, jpb', 'Date': 'Mon, 23 Dec 2019 08:11:25 GMT'})
+ HTTP response body: {"success":false,"error":"ConstraintViolation","message":"unexpected database error: new row for relation \"release_contrib\" violates check constraint \"release_contrib_raw_name_check\""}
+
+## Prod Import
+
+Around first/second week of January. Needed to restart at least once due to
+database deadlock on abstract inserts, which seems to be due to parallelism and
+duplicated records in the bulk datacite dump.
+
+TODO: specific command used by martin
diff --git a/extra/bulk_edits/2021-05-28_dblp.md b/extra/bulk_edits/2021-05-28_dblp.md
new file mode 100644
index 00000000..061f4f45
--- /dev/null
+++ b/extra/bulk_edits/2021-05-28_dblp.md
@@ -0,0 +1,44 @@
+
+## Container Import
+
+Following dblp README directions:
+
+ export DBLP_DIR=/srv/fatcat/tasks/202105_dblp
+
+ ./fatcat_import.py dblp-release $DBLP_DIR/dblp.xml --dump-json-mode | pv -l > $DBLP_DIR/dblp_releases.json
+ => Counter({'total': 8328073, 'skip': 8328073, 'has-doi': 4478439, 'skip-key-type': 2764750, 'skip-arxiv-corr': 348766, 'skip-title': 1, 'insert': 0, 'update': 0, 'exists': 0})
+ => 5.21M 3:38:35 [ 397 /s]
+
+ cat $DBLP_DIR/dblp_releases.json | jq ._dblp_prefix -r | grep -v ^null | sort -u > $DBLP_DIR/prefix_list.txt
+
+ wc -l $DBLP_DIR/prefix_list.txt
+ => 7603 /srv/fatcat/tasks/202105_dblp/prefix_list.txt
+
+ mkdir -p journals
+ mkdir -p conf
+ mkdir -p series
+
+ shuf $DBLP_DIR/prefix_list.txt | pv -l | parallel -j1 wget -nc -q "https://dblp.org/db/{}/index.html" -O {}.html
+
+ # switch to temporary selectolax pipenv
+ fd html conf/ journals/ series/ | /srv/fatcat/src/extra/dblp/dblp_html_extract.py | pv -l > dblp_container_meta.json
+ => 7.08k 0:00:15 [ 449 /s]
+
+ fatcat-cli search containers dblp_prefix:* -n 0 --index-json | jq "[.dblp_prefix, .ident] | @tsv" -r | pv -l > existing_dblp_containers.tsv
+ => Got 5202 hits in 47ms
+ => 5.20k 0:00:13 [ 375 /s]
+
+ ./fatcat_import.py dblp-container --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --dblp-container-map-file $DBLP_DIR/existing_dblp_containers.tsv --dblp-container-map-output $DBLP_DIR/all_dblp_containers.tsv $DBLP_DIR/dblp_container_meta.json
+ => Counter({'total': 7083, 'exists': 7025, 'insert': 58, 'skip': 0, 'update': 0})
+ => actually 108 inserted
+
+Actually imported 50 more before this, then Ctrl-C to check. Then re-did
+fatcat-cli query, upload, and re-ran all. So 108 new containers inserted.
+
+## Release Import
+
+With same exports as above:
+
+ ./fatcat_import.py dblp-release --dblp-container-map-file $DBLP_DIR/all_dblp_containers.tsv $DBLP_DIR/dblp.xml
+ => Counter({'total': 8328073, 'exists': 4847353, 'has-doi': 4478439, 'skip': 3259925, 'skip-key-type': 2764750, 'skip-arxiv-corr': 348766, 'exists-fuzzy': 202880, 'skip-dblp-container-missing': 146408, 'insert': 17862, 'skip-arxiv': 53, 'skip-title': 1, 'update': 0})
+
diff --git a/extra/bulk_edits/2021-05-28_doaj.md b/extra/bulk_edits/2021-05-28_doaj.md
new file mode 100644
index 00000000..e5925eeb
--- /dev/null
+++ b/extra/bulk_edits/2021-05-28_doaj.md
@@ -0,0 +1,80 @@
+
+Note: running 2021-05-28 pacific time, but 2021-05-29 UTC.
+
+First downloaded bulk metadata from doaj.org and uploaded into archive.org item
+as a snapshot.
+
+## Journal Metadata Import
+
+Before doing article import, want to ensure journals all exist.
+
+Use chocula pipeline, and to be simple/conservative don't update any
+containers, just create if they don't already exist.
+
+Run the usual chocula source update, copy to data dir, update sources.toml,
+etc. Didn't bother with updating container counts or homepage status. Then
+export container schema:
+
+ python -m chocula export_fatcat | gzip > chocula_fatcat_export.2021-05-28.json.gz
+
+Upload to fatcat prod machine, unzip, and then import to prod:
+
+ export FATCAT_AUTH_WORKER_JOURNAL_METADATA=[...]
+ ./fatcat_import.py chocula /srv/fatcat/datasets/chocula_fatcat_export.2021-05-28.json
+ => Counter({'total': 175837, 'exists': 170358, 'insert': 5479, 'exists-by-issnl': 5232, 'skip': 0, 'update': 0})
+
+That is a healthy batch of new records!
+
+## Article Import
+
+Transform all the articles into a single JSON file:
+
+ cat doaj_article_data_*/article_batch*.json | jq .[] -c | pv -l | gzip > doaj_article_data_2021-05-25_all.json.gz
+ => 6.1M 0:18:45 [5.42k/s]
+
+ zcat doaj_article_data_2021-05-25_all.json.gz | shuf -n10000 > doaj_article_data_2021-05-25_sample_10k.json
+
+Also upload this `_all` file to archive.org item.
+
+Ready to import! Start with sample:
+
+ export FATCAT_AUTH_WORKER_DOAJ=...
+ zcat /srv/fatcat/tasks/202105_doaj/doaj_article_data_2021-05-25_sample_10k.json.gz | ./fatcat_import.py doaj-article --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt -
+ => Counter({'total': 10000, 'exists': 8743, 'exists-fuzzy': 1044, 'insert': 197, 'skip': 14, 'skip-title': 14, 'skip-doaj-id-mismatch': 2, 'update': 0})
+
+Then the full import, in parallel, shuffled (because we shuffled last time):
+
+ zcat /srv/fatcat/tasks/202105_doaj/doaj_article_data_2021-05-25_all.json.gz | shuf | pv -l | parallel -j12 --round-robin --pipe ./fatcat_import.py doaj-article --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt -
+ => Counter({'total': 512351, 'exists': 449996, 'exists-fuzzy': 50858, 'insert': 10826, 'skip': 551, 'skip-title': 551, 'skip-doaj-id-mismatch': 120, 'update': 0})
+ => extrapolating, about 129,912 new release entities. 2.1% insert rate
+
+NOTE: large number of warnings like:
+
+ UserWarning: unexpected DOAJ ext_id match after lookup failed doaj=5d2ebb760ad24ce68ec8079bc82c8d78 ident=dvtl2xpn4nespfrj6gad6mrk44
+
+## Extra Article Imports
+
+Manually disabled fuzzy matching with patch:
+
+ diff --git a/python/fatcat_import.py b/python/fatcat_import.py
+ index 1dcfec2..cb787cb 100755
+ --- a/python/fatcat_import.py
+ +++ b/python/fatcat_import.py
+ @@ -260,6 +260,7 @@ def run_doaj_article(args):
+ args.issn_map_file,
+ edit_batch_size=args.batch_size,
+ do_updates=args.do_updates,
+ + do_fuzzy_match=False,
+ )
+ if args.kafka_mode:
+ KafkaJsonPusher(
+
+Filtered out some specific articles:
+
+ zcat doaj_article_data_2021-05-25_all.json.gz | rg 1665-1596 | pv -l > doaj_article_data_2021-05-25_voces.json
+ => 154 0:02:05 [1.22 /s]
+
+And used this for some imports:
+
+ cat /srv/fatcat/tasks/202105_doaj/doaj_article_data_2021-05-25_voces.json | ./fatcat_import.py doaj-article --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt -
+
diff --git a/extra/bulk_edits/2021-11-10_case_sensitive_dois.md b/extra/bulk_edits/2021-11-10_case_sensitive_dois.md
new file mode 100644
index 00000000..20772f56
--- /dev/null
+++ b/extra/bulk_edits/2021-11-10_case_sensitive_dois.md
@@ -0,0 +1,53 @@
+
+## Production Run
+
+Start small:
+
+ export FATCAT_AUTH_WORKER_CLEANUP=[...]
+
+ wc -l /srv/fatcat/datasets/nonlowercase_doi_releases.tsv
+ # 140530
+
+ head -n100 /srv/fatcat/datasets/nonlowercase_doi_releases.tsv \
+ | python -m fatcat_tools.cleanups.release_lowercase_doi -
+ # Counter({'total': 100, 'update': 100, 'skip': 0, 'insert': 0, 'exists': 0})
+
+ # same command again to test not duping updates
+ Counter({'total': 100, 'skip-existing-doi-fine': 100, 'skip': 0, 'insert': 0, 'update': 0, 'exists': 0})
+
+ # example editgroup_cld5qe34bzg7xg7g4cz5skgaw4
+
+Database size just before, while some other edits were happening, PostgreSQL 11.6: 762.66G
+
+Ok, run a bunch in parallel:
+
+ cat /srv/fatcat/datasets/nonlowercase_doi_releases.tsv \
+ | parallel -j8 --linebuffer --round-robin --pipe python -m fatcat_tools.cleanups.release_lowercase_doi -
+ # Counter({'total': 24022, 'update': 24022, 'skip': 0, 'insert': 0, 'exists': 0})
+ # Counter({'total': 38836, 'update': 38836, 'skip': 0, 'insert': 0, 'exists': 0})
+ # Counter({'total': 38836, 'update': 38836, 'skip': 0, 'insert': 0, 'exists': 0})
+ # Counter({'total': 38836, 'update': 38736, 'skip-existing-doi-fine': 100, 'skip': 0, 'insert': 0, 'exists': 0})
+
+Over 3k TPS in `pg_activity`.
+
+Should have included `pv -l` in the pipeline.
+
+Final database size 763.14G, so only a couple hundred MByte of growth, totally
+fine.
+
+
+## Verification
+
+Re-dump release extids, in production:
+
+ sudo -u postgres psql fatcat_prod < dump_release_extid.sql | egrep -v ^BEGIN$ | egrep -v ^ROLLBACK$ | pv -l | pigz > /srv/fatcat/snapshots/release_extid.tsv.gz
+
+Filter to non-lowercase DOIs:
+
+ zcat release_extid.tsv.gz \
+ | cut -f1,3 \
+ | rg '[A-Z]' \
+ | pv -l \
+ > nonlowercase_doi.tsv
+
+Zero returned, hurray!
diff --git a/extra/bulk_edits/2021-11-10_file_release_ingest_bugfix.md b/extra/bulk_edits/2021-11-10_file_release_ingest_bugfix.md
new file mode 100644
index 00000000..6b5deb63
--- /dev/null
+++ b/extra/bulk_edits/2021-11-10_file_release_ingest_bugfix.md
@@ -0,0 +1,108 @@
+
+## Production Run
+
+Start small:
+
+ export FATCAT_AUTH_WORKER_CLEANUP=[...]
+
+ wc -l /srv/fatcat/datasets/file_release_bugfix_20211105.json
+ 228826
+
+ head -n100 /srv/fatcat/datasets/file_release_bugfix_20211105.json \
+ | python -m fatcat_tools.cleanups.file_release_bugfix -
+ # Counter({'total': 100, 'update': 100, 'skip': 0, 'insert': 0, 'exists': 0})
+
+ # example editgroup_keae3rfekffuriiy77f26rf6uq
+
+These are all now stubs (no release associated), which isn't the ratio seen in QA. Going to do a random sample:
+
+ shuf -n100 /srv/fatcat/datasets/file_release_bugfix_20211105.json \
+ | python -m fatcat_tools.cleanups.file_release_bugfix -
+ # Counter({'total': 100, 'update': 100, 'skip': 0, 'insert': 0, 'exists': 0})
+
+ # example editgroup_34mk525kxvdu3hak7g7fr7awru
+
+Looked at a few and all looked more like what would be expected, correct matches.
+
+Comparing before and after counts is going to be tricky, and will require a
+full re-index for an accurate count. But did do a snapshot just before this run
+(2021-11-10-prod-stats.json), and got 31,110,184 `in_web`.
+
+Full edit, in parallel:
+
+ cat /srv/fatcat/datasets/file_release_bugfix_20211105.json \
+ | pv -l \
+ | parallel -j8 --linebuffer --round-robin --pipe python -m fatcat_tools.cleanups.file_release_bugfix -
+ # 228k 0:26:34 [ 143 /s]
+ # Counter({'total': 26090, 'update': 26071, 'skip-existing-fixed': 15, 'skip': 4, 'skip-wrong-release-is-ok': 4, 'insert': 0, 'exists': 0})
+ # Counter({'total': 26080, 'update': 26061, 'skip-existing-fixed': 12, 'skip': 7, 'skip-wrong-release-is-ok': 7, 'insert': 0, 'exists': 0})
+ # Counter({'total': 27517, 'update': 27497, 'skip-existing-fixed': 15, 'skip': 5, 'skip-wrong-release-is-ok': 5, 'insert': 0, 'exists': 0})
+ # Counter({'total': 29534, 'update': 29420, 'skip-existing-fixed': 110, 'skip': 4, 'skip-wrong-release-is-ok': 4, 'insert': 0, 'exists': 0})
+ # Counter({'total': 29544, 'update': 29517, 'skip-existing-fixed': 16, 'skip': 11, 'skip-wrong-release-is-ok': 11, 'insert': 0, 'exists': 0})
+ # Counter({'total': 29535, 'update': 29518, 'skip-existing-fixed': 10, 'skip': 7, 'skip-wrong-release-is-ok': 7, 'insert': 0, 'exists': 0})
+ # Counter({'total': 30082, 'update': 30065, 'skip-existing-fixed': 13, 'skip': 4, 'skip-wrong-release-is-ok': 4, 'insert': 0, 'exists': 0})
+ # Counter({'total': 30444, 'update': 30420, 'skip-existing-fixed': 21, 'skip': 3, 'skip-wrong-release-is-ok': 3, 'insert': 0, 'exists': 0})
+
+## Verification
+
+Counts:
+
+ SELECT file_edit.extra_json->>'ingest_request_source' as source, COUNT(*) as broken_files
+ FROM file_edit
+ LEFT JOIN file_ident ON file_edit.ident_id = file_ident.id
+ LEFT JOIN file_rev_release ON file_edit.rev_id = file_rev_release.file_rev
+ LEFT JOIN release_ident ON file_rev_release.target_release_ident_id = release_ident.id
+ LEFT JOIN release_rev ON release_rev.id = release_ident.rev_id
+ WHERE
+ file_edit.extra_json->>'link_source_id' IS NOT NULL
+ AND file_edit.extra_json->>'link_source_id' LIKE '10.%'
+ AND lower(release_rev.doi) != lower(file_edit.extra_json->>'link_source_id')
+ AND file_ident.rev_id = file_edit.rev_id
+ GROUP BY file_edit.extra_json->>'ingest_request_source';
+
+ source | broken_files
+ -----------+--------------
+ unpaywall | 233
+ (1 row)
+
+Examples:
+
+ SELECT file_edit.ident_id as file_ident, release_ident.id as release_ident, file_edit.extra_json->>'link_source_id' as file_edit_doi, release_rev.doi as release_doi
+ FROM file_edit
+ LEFT JOIN file_ident ON file_edit.ident_id = file_ident.id
+ LEFT JOIN file_rev_release ON file_edit.rev_id = file_rev_release.file_rev
+ LEFT JOIN release_ident ON file_rev_release.target_release_ident_id = release_ident.id
+ LEFT JOIN release_rev ON release_rev.id = release_ident.rev_id
+ WHERE
+ file_edit.extra_json->>'link_source_id' IS NOT NULL
+ AND file_edit.extra_json->>'link_source_id' LIKE '10.%'
+ AND lower(release_rev.doi) != lower(file_edit.extra_json->>'link_source_id')
+ AND file_ident.rev_id = file_edit.rev_id
+ LIMIT 20;
+
+
+Looks like many of the remaining mismatches are from "double-slash" normalization, with doi prefix 10.1037:
+
+ file_ident | release_ident | file_edit_doi | release_doi
+ --------------------------------------+--------------------------------------+------------------------------+------------------------------
+ ae2f7864-66a6-4a82-a0e6-153cb4d0b03a | 0f436ae6-d7b4-4a45-a434-d158bc4a3437 | 10.1037/0096-1523.25.6.1568 | 10.1037//0096-1523.25.6.1568
+ d02ff5ab-a882-4a86-8a94-ce6222708323 | 2d5ebbca-e4ba-4bb7-bb19-f1e081479eab | 10.1037//0021-9010.63.4.467 | 10.1037/0021-9010.63.4.467
+ 2c107387-f57f-4855-bc1a-e40704f1e9b4 | 7654b956-4776-4f6f-bc35-ccf7e6bfe99c | 10.1037/0022-0663.75.4.500 | 10.1037//0022-0663.75.4.500
+ 15e3636a-4bcf-4595-8a2a-b6b06a299a2f | c09e3531-1ac4-4bfa-9fcf-8acb9f0d845e | 10.1037//1064-1297.8.2.225 | 10.1037/1064-1297.8.2.225
+ dc8b86c8-9b8e-4333-abbb-8811010d9c71 | bd91e7be-c360-47af-a634-f048e2c85b73 | 10.1037//0021-843x.105.4.637 | 10.1037/0021-843x.105.4.637
+ 35a06e0a-6f72-4624-87ca-fbb74bc9d77d | 96befa26-6eb0-47c0-a0ec-e00282e33bff | 10.1037//0735-7044.99.5.964 | 10.1037/0735-7044.99.5.964
+ 707bfaa1-65de-4dbb-9786-51b99d03d91d | 2d58524b-4216-4092-8ddf-336ac42d5955 | 10.1037/0096-1523.28.3.515 | 10.1037//0096-1523.28.3.515
+ de9ea98f-672e-44ec-9d12-e11acd8990d0 | 20f1a857-ad51-4b80-9ce5-bc3a44df96b1 | 10.1037//0002-9432.71.1.72 | 10.1037/0002-9432.71.1.72
+ 4275306c-11ef-4fce-bc03-3f1efe99f9a6 | c69bc740-4da1-4f96-acc9-151a0cef5c3f | 10.1037//1064-1297.6.1.107 | 10.1037/1064-1297.6.1.107
+ 6a63d2ae-b953-48ba-a68a-061543d82ad4 | e3c8b8c1-defc-44ac-8c73-e21e8cf93f5c | 10.1037//0022-0167.23.6.557 | 10.1037/0022-0167.23.6.557
+ 2fcbb54e-8fa8-4bbc-a2ae-4b6b6eaff412 | 8500b4a5-a693-4415-b4a4-4dcfb3403d82 | 10.1037/0021-9010.73.1.68 | 10.1037//0021-9010.73.1.68
+ b9aa4601-4a1b-4146-aa6b-a410d0fc3dce | 954f2072-8c53-41c7-82b0-8c6fe9ef4d0c | 10.1037//0278-6133.13.4.319 | 10.1037/0278-6133.13.4.319
+ b528b924-0680-43f3-81ad-d822e51b3373 | 69387969-40bc-451d-b567-8713296f60b0 | 10.1037//0002-9432.71.1.38 | 10.1037/0002-9432.71.1.38
+ f64f1ee2-b787-4a06-87ab-46b94f9d5454 | c082d47f-175d-456a-a741-650b5eaa5173 | 10.1037//0021-843x.98.4.487 | 10.1037/0021-843x.98.4.487
+ 86e8b655-963a-4c11-ae70-a8d528400682 | 6381254c-e339-4354-b0d8-711b5b5e4fcc | 10.1037/0022-0663.89.1.183 | 10.1037//0022-0663.89.1.183
+ 716e1761-8120-480b-b096-e7698c65456a | 7a4d2c7d-32b7-4292-adbf-b791387a3ac5 | 10.1037//0278-7393.21.5.1209 | 10.1037/0278-7393.21.5.1209
+ bb7aa131-d5e5-497e-8040-b1729850b94c | 1ce29f33-d020-4d5f-a8b3-2b8bef53ccb8 | 10.1037//1040-3590.7.4.533 | 10.1037/1040-3590.7.4.533
+ 510ad392-43aa-42dc-9644-9697f425efd5 | 796c92ca-767a-495b-ad0c-f458381c071c | 10.1037//0278-7393.20.4.824 | 10.1037/0278-7393.20.4.824
+ 32c57e68-0793-4ded-bca2-d05f3532ff3e | 1567a003-0b5a-48bb-bb7c-7dff2c44b90b | 10.1037//1040-3590.13.1.59 | 10.1037/1040-3590.13.1.59
+ e0d1fd38-17d8-42ac-b9df-60312829ddd4 | 0cefaf5d-fcdf-4049-a9d7-0c569096478e | 10.1037//0022-006x.56.4.621 | 10.1037/0022-006x.56.4.621
+ (20 rows)
diff --git a/extra/bulk_edits/2021-11-11_wayback_short_ts.md b/extra/bulk_edits/2021-11-11_wayback_short_ts.md
new file mode 100644
index 00000000..20349f0c
--- /dev/null
+++ b/extra/bulk_edits/2021-11-11_wayback_short_ts.md
@@ -0,0 +1,52 @@
+
+## Production Run
+
+At git commit `6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4`.
+
+Start small:
+
+ export FATCAT_AUTH_WORKER_CLEANUP=[...]
+
+ zcat /srv/fatcat/datasets/files_20211105_moreshortts.fetched.json.gz \
+ | head -n100 \
+ | python -m fatcat_tools.cleanups.file_short_wayback_ts -
+ # Counter({'total': 100, 'update': 99, 'skip-bad-wayback-timestamp': 1, 'skip': 0, 'insert': 0, 'exists': 0})
+
+Looks good! Run the full batch.
+
+ zcat /srv/fatcat/datasets/files_20211105_moreshortts.fetched.json.gz \
+ | pv -l \
+ | parallel -j8 --linebuffer --round-robin --pipe python -m fatcat_tools.cleanups.file_short_wayback_ts -
+
+ [...]
+ bad replacement URL: partial_ts=2017 original=https://www.hydrol-earth-syst-sci.net/21/4959/2017/hess-21-4959-2017.pdf fix_url=https://web.archive.org/web/20180721004954/https://www.hydrol-earth-syst-sci.net/21/4959/2017/hess-21-4959-2017.pdf
+ bad replacement URL: partial_ts=2017 original=https://www.the-cryosphere.net/11/1537/2017/tc-11-1537-2017.pdf fix_url=https://web.archive.org/web/20180719235703/https://www.the-cryosphere.net/11/1537/2017/tc-11-1537-2017.pdf
+ bad replacement URL: partial_ts=2017 original=http://www.growingscience.com/msl/Vol7/msl_2017_26.pdf fix_url=https://web.archive.org/web/20180601235059/http://www.growingscience.com/msl/Vol7/msl_2017_26.pdf
+ bad replacement URL: partial_ts=2017 original=https://www.hydrol-earth-syst-sci.net/21/4115/2017/hess-21-4115-2017.pdf fix_url=https://web.archive.org/web/20180719162956/https://www.hydrol-earth-syst-sci.net/21/4115/2017/hess-21-4115-2017.pdf
+ bad replacement URL: partial_ts=2017 original=https://www.biogeosciences.net/14/4279/2017/bg-14-4279-2017.pdf fix_url=https://web.archive.org/web/20180720220056/https://www.biogeosciences.net/14/4279/2017/bg-14-4279-2017.pdf
+ bad replacement URL: partial_ts=2017 original=https://www.biogeosciences.net/14/3669/2017/bg-14-3669-2017.pdf fix_url=https://web.archive.org/web/20180720222828/https://www.biogeosciences.net/14/3669/2017/bg-14-3669-2017.pdf
+ [...]
+ bad replacement URL: partial_ts=2017 original=http://www.growingscience.com/msl/Vol7/msl_2017_28.pdf fix_url=https://web.archive.org/web/20180602071632/http://www.growingscience.com/msl/Vol7/msl_2017_28.pdf
+ bad replacement URL: partial_ts=2017 original=https://www.biogeosciences.net/14/4161/2017/bg-14-4161-2017.pdf fix_url=https://web.archive.org/web/20180720004438/https://www.biogeosciences.net/14/4161/2017/bg-14-4161-2017.pdf
+ bad replacement URL: partial_ts=2017 original=https://core.ac.uk/download/pdf/10915563.pdf fix_url=https://web.archive.org/web/20190220174144/https://core.ac.uk/download/pdf/10915563.pdf
+ bad replacement URL: partial_ts=2017 original=http://www.growingscience.com/ijiec/Vol9/IJIEC_2017_24.pdf fix_url=https://web.archive.org/web/20180602094300/http://www.growingscience.com/ijiec/Vol9/IJIEC_2017_24.pdf
+ bad replacement URL: partial_ts=2017 original=https://core.ac.uk/download/pdf/36046645.pdf fix_url=https://web.archive.org/web/20190220175351/https://core.ac.uk/download/pdf/36046645.pdf
+ bad replacement URL: partial_ts=2017 original=https://core.ac.uk/download/pdf/35085886.pdf fix_url=https://web.archive.org/web/20190220175410/https://core.ac.uk/download/pdf/35085886.pdf
+ bad replacement URL: partial_ts=2017 original=https://www.atmos-chem-phys.net/17/10349/2017/acp-17-10349-2017.pdf fix_url=https://web.archive.org/web/20181102190649/https://www.atmos-chem-phys.net/17/10349/2017/acp-17-10349-2017.pdf
+ bad replacement URL: partial_ts=2017 original=https://www.atmos-chem-phys.net/17/7775/2017/acp-17-7775-2017.pdf fix_url=https://web.archive.org/web/20181101041355/https://www.atmos-chem-phys.net/17/7775/2017/acp-17-7775-2017.pdf
+ bad replacement URL: partial_ts=2017 original=http://www.veterinaryworld.org/Vol.10/March-2017/5.pdf fix_url=https://web.archive.org/web/20180721074940/http://www.veterinaryworld.org/Vol.10/March-2017/5.pdf
+ bad replacement URL: partial_ts=2017 original=https://www.ann-geophys.net/35/189/2017/angeo-35-189-2017.pdf fix_url=https://web.archive.org/web/20180625214916/https://www.ann-geophys.net/35/189/2017/angeo-35-189-2017.pdf
+ [...]
+
+ # 9.96M 12:57:06 [ 213 /s]
+
+ Counter({'total': 1272301, 'update': 1268466, 'skip-bad-wayback-timestamp': 2808, 'skip': 1026, 'skip-status': 981, 'skip-bad-replacement': 45, 'skip-bad-wayback': 1, 'insert': 0, 'exists': 0})
+ Counter({'total': 1242814, 'update': 1239042, 'skip-bad-wayback-timestamp': 2734, 'skip': 1036, 'skip-status': 974, 'skip-bad-replacement': 62, 'skip-bad-wayback': 2, 'insert': 0, 'exists': 0})
+ Counter({'total': 1264351, 'update': 1260695, 'skip-bad-wayback-timestamp': 2626, 'skip': 1030, 'skip-status': 977, 'skip-bad-replacement': 53, 'insert': 0, 'exists': 0})
+ Counter({'total': 1244480, 'update': 1240779, 'skip-bad-wayback-timestamp': 2680, 'skip': 1020, 'skip-status': 962, 'skip-bad-replacement': 58, 'skip-bad-wayback': 1, 'insert': 0, 'exists': 0})
+ Counter({'total': 1222678, 'update': 1219022, 'skip-bad-wayback-timestamp': 2698, 'skip': 956, 'skip-status': 892, 'skip-bad-replacement': 64, 'skip-bad-wayback': 2, 'insert': 0, 'exists': 0})
+ Counter({'total': 1225078, 'update': 1221459, 'skip-bad-wayback-timestamp': 2597, 'skip': 1020, 'skip-status': 964, 'skip-bad-replacement': 56, 'skip-bad-wayback': 2, 'insert': 0, 'exists': 0})
+ Counter({'total': 1283843, 'update': 1280014, 'skip-bad-wayback-timestamp': 2670, 'skip': 1059, 'skip-status': 997, 'skip-revision-changed': 99, 'skip-bad-replacement': 62, 'skip-bad-wayback': 1, 'insert': 0, 'exists': 0})
+ Counter({'total': 1203309, 'update': 1199782, 'skip-bad-wayback-timestamp': 2556, 'skip': 971, 'skip-status': 923, 'skip-bad-replacement': 48, 'insert': 0, 'exists': 0})
+
+On the order of 99.7% were updated/fixed, over 9.5 million file entities, taking almost 13 hours.
diff --git a/extra/bulk_edits/2021-11-24_file_meta.md b/extra/bulk_edits/2021-11-24_file_meta.md
new file mode 100644
index 00000000..1ec1698b
--- /dev/null
+++ b/extra/bulk_edits/2021-11-24_file_meta.md
@@ -0,0 +1,41 @@
+
+Another partial batch of pure `file_meta` updates to file entities. These came
+from re-attempting ingest by URL of existing file entities.
+
+Not all ran as expected, partially because of GROBID issues, and partially
+because we had alternate captures for the same URLs.
+
+Still, about half the attempts worked, so we are going to update a fraction of
+the ~520k outstanding file entities with partial metadata (eg, missing sha256).
+
+See cleanups `file_meta` document for prep and QA testing notes.
+
+
+## Production Commands
+
+ git log | head -n1
+ commit 75bde4ad3970e8e63b04009cfd16ed4b9a924ce7
+
+ export FATCAT_AUTH_API_TOKEN=[...] # sandcrawler-bot
+
+Start with a small sample:
+
+ cat /srv/fatcat/datasets/files_missing_sha256.file_meta.uniq.sample.json \
+ | ./fatcat_import.py --editgroup-description-override 'backfill of full file-level metadata for early-imported papers' file-meta -
+ # Counter({'total': 100, 'skip-existing-complete': 45, 'update': 43, 'skip-no-match': 12, 'skip': 0, 'insert': 0, 'exists': 0})
+
+Then run in parallel with full batch:
+
+ cat /srv/fatcat/datasets/files_missing_sha256.file_meta.uniq.json \
+ | parallel -j8 --round-robin --pipe -q ./fatcat_import.py --editgroup-description-override 'backfill of full file-level metadata for early-imported papers' file-meta -
+ # Counter({'total': 41846, 'update': 19737, 'skip-existing-complete': 18788, 'skip-no-match': 3321, 'skip': 0, 'insert': 0, 'exists': 0})
+ # Counter({'total': 41522, 'update': 19678, 'skip-existing-complete': 18607, 'skip-no-match': 3237, 'skip': 0, 'insert': 0, 'exists': 0})
+ # Counter({'total': 41537, 'update': 20517, 'skip-existing-complete': 17895, 'skip-no-match': 3125, 'skip': 0, 'insert': 0, 'exists': 0})
+ # Counter({'total': 41529, 'update': 19684, 'skip-existing-complete': 18501, 'skip-no-match': 3344, 'skip': 0, 'insert': 0, 'exists': 0})
+ # Counter({'total': 41530, 'update': 19595, 'skip-existing-complete': 18637, 'skip-no-match': 3298, 'skip': 0, 'insert': 0, 'exists': 0})
+ # Counter({'total': 41542, 'update': 21359, 'skip-existing-complete': 17033, 'skip-no-match': 3150, 'skip': 0, 'insert': 0, 'exists': 0})
+ # Counter({'total': 41534, 'update': 19758, 'skip-existing-complete': 18516, 'skip-no-match': 3260, 'skip': 0, 'insert': 0, 'exists': 0})
+ # Counter({'total': 41537, 'update': 20507, 'skip-existing-complete': 15543, 'skip-no-match': 5487, 'skip': 0, 'insert': 0, 'exists': 0})
+
+Import ran pretty fast! Updated about 160k file entities. More like 1/3 than
+1/2 of the 520k that were missing SHA-256.
diff --git a/extra/bulk_edits/2021-11-24_file_sha1_dedupe.md b/extra/bulk_edits/2021-11-24_file_sha1_dedupe.md
new file mode 100644
index 00000000..012bcf62
--- /dev/null
+++ b/extra/bulk_edits/2021-11-24_file_sha1_dedupe.md
@@ -0,0 +1,35 @@
+
+See notes and scripts about `file_sha1_dedupe` cleanup for prep details.
+
+## Prod Run
+
+Run as `cleanup-bot`:
+
+ export FATCAT_AUTH_API_TOKEN=[...]
+
+ git log | head -n1
+ # commit 5bc5eeed5e3ba54c2129c4233b881291c5fa7449
+
+First do a sample in dry-run mode:
+
+ head -n25 /srv/fatcat/datasets/file_sha1_dupes.json \
+ | python -m fatcat_tools.mergers.files --editgroup-description-override "Automated merging of file entities with duplicate SHA-1 hashes" --dry-run merge-files -
+ # Counter({'updated-entities': 59, 'lines': 25, 'merged': 25, 'skip': 0, 'updated-total': 0})
+
+Gah, the dry-run mode still creates (empty) editgroups:
+
+ https://fatcat.wiki/editgroup/iqzjg3vxu5elvotknmmjln3gv4
+ https://fatcat.wiki/editgroup/2mxsl7lxo5dezem42whnr7zxxe
+
+Actually run (merge) the sample:
+
+ head -n25 /srv/fatcat/datasets/file_sha1_dupes.json \
+ | python -m fatcat_tools.mergers.files --editgroup-description-override "Automated merging of file entities with duplicate SHA-1 hashes" merge-files -
+ # Counter({'updated-entities': 59, 'lines': 25, 'merged': 25, 'skip': 0, 'updated-total': 0})
+
+
+Run the full batch:
+
+ cat /srv/fatcat/datasets/file_sha1_dupes.json \
+ | python -m fatcat_tools.mergers.files --editgroup-description-override "Automated merging of file entities with duplicate SHA-1 hashes" merge-files -
+ # Counter({'updated-entities': 6197, 'lines': 2039, 'merged': 2014, 'skip': 25, 'skip-not-active-entity': 25, 'updated-total': 0})
diff --git a/extra/bulk_edits/CHANGELOG.md b/extra/bulk_edits/CHANGELOG.md
new file mode 100644
index 00000000..6156721c
--- /dev/null
+++ b/extra/bulk_edits/CHANGELOG.md
@@ -0,0 +1,131 @@
+
+# Fatcat Production Import CHANGELOG
+
+This file tracks major content (metadata) imports to the Fatcat production
+database (at https://fatcat.wiki). It complements the code CHANGELOG file.
+
+In general, changes that impact more than 50k entities will get logged here;
+this file should probably get merged into the guide at some point.
+
+This file should not turn into a TODO list!
+
+
+## 2021-11
+
+Ran a series of cleanups. See background and prep notes in `notes/cleanups/`
+and specific final commands in this directory. Quick summary:
+
+- more than 9.5 million file entities had truncated timestamps in wayback URLs,
+ and were fixed with the full timestamps. there are still a small fraction
+ (0.5%) which were identified but not corrected in this first pass
+- over 140k release entities with non-lowercase DOIs were updated with
+ lowercase DOI. all DOIs in current release entities now lowercase (at least,
+ no ASCII uppercase characters found)
+- over 220k file entities with incorrect release relation, due to an
+ import-time code bug, were fixed. a couple hundred questionable cases remain,
+ but are all mismatched due to DOI slash/double-slash issues and will not be
+ fixed in an automated way.
+- de-duplicated a few thousand file entities, on the basis of SHA-1 hash
+- updated file metadata for around 160k file entities (a couple hundred
+ thousand remain with partial metadata)
+
+
+## 2021-06
+
+Created new containers via chocula pipeline. Did not update any existing
+chocula entities.
+
+Ran DOAJ import manually, yielding almost 130k new release entities.
+
+Ran dblp import manually, resulting in about 17k new release entities, as well
+as 108 new containers. Note that 146k releases were not inserted due to
+`skip-dblp-container-missing` and 203k due to `exists-fuzzy`.
+
+## 2020-12
+
+Updated ORCIDs from 2020 dump. About 2.4 million new `creator` entities.
+
+Imported DOAJ article metadata from a 2020-11 dump. Crawled and imported
+several hundred thousand file entities matched by DOAJ identifier. Updated
+journal metadata using chocula tool (before the release ingest). Filtered out
+fuzzy-matching papers before importing.
+
+Imported dblp from a 2020 snapshot, both containers (primarily for conferences
+lacking an ISSN) and release entities (primarily conference papers). Filtered
+out fuzzy-matching papers before importing.
+
+## 2020-03
+
+Started harvesting both Arxiv and Pubmed metadata daily and importing to
+fatcat. Did backfill imports for both sources.
+
+JALC DOI registry update from 2019 dump.
+
+## 2020-01
+
+Imported around 2,500 new containers (journals, by ISSN-L) from chocula
+analysis script.
+
+Imported DOIs from Datacite (around 16 million, plus or minus a couple
+million).
+
+Imported new release entities from 2020 Pubmed/MEDLINE baseline. This import
+included only new Pubmed works cataloged in 2019 (up until December or so).
+Only a few hundred thousand new release entities.
+
+Daily "ingest" (crawling) pipeline running.
+
+## 2019-12
+
+Started continuous harvesting of Datacite DOI metadata; first date harvested was
+`2019-12-13`. No importer running yet.
+
+Imported about 3.3m new ORCID identifiers from 2019 bulk dump (after converting
+from XML to JSON): <https://archive.org/details/orcid-dump-2019>
+
+Inserted about 154k new arxiv release entities. Still no automatic daily
+harvesting.
+
+"Save Paper Now" importer running. This bot only *submits* editgroups for
+review, doesn't auto-accept them.
+
+## 2019-11
+
+Daily ingest of fulltext for OA releases now enabled. New file entities created
+and merged automatically.
+
+## 2019-10
+
+Inserted 1.45m new release entities from Crossref which had been missed during
+a previous gap in continuous metadata harvesting.
+
+Updated 304,308 file entities to remove broken
+"https://web.archive.org/web/None/*" URLs.
+
+## 2019-09
+
+Created and updated metadata for tens of thousands of containers, using
+"chocula" pipeline.
+
+## 2019-08
+
+Merged/fixed roughly 100 container entities with invalid ISSN-L numbers (eg,
+invalid ISSN checksum).
+
+## 2019-04
+
+Imported files (matched to releases by DOI) from Semantic Scholar
+(`DIRECT-OA-CRAWL-2019` crawl).
+
+Imported files (matched to releases by DOI) from pre-1923/pre-1909 items uploaded
+by a user to archive.org.
+
+Imported files (matched to releases by DOI) from CORE.ac.uk
+(`DIRECT-OA-CRAWL-2019` crawl).
+
+Imported files (matched to releases by DOI) from the public web (including many
+repositories) from the `UNPAYWALL` 2018 crawl.
+
+## 2019-02
+
+Bootstrapped!