-rw-r--r--   TODO.md                                        |  1
-rw-r--r--   guide/src/entity_release.md                    |  2
-rw-r--r--   notes/bulk_edits/2019-11-05_crossref_patch.md  | 58
-rw-r--r--   notes/bulk_edits/CHANGELOG.md                  |  5
-rw-r--r--   proposals/20190911_v04_schema_tweaks.md        | 38
-rw-r--r--   proposals/20191018_bigger_db.md                | 81
-rw-r--r--   python/README.md                               |  5
-rw-r--r--   python/fatcat_tools/importers/crossref.py      | 15
-rw-r--r--   python/fatcat_web/auth.py                      |  4
-rw-r--r--   python/tests/import_crossref.py                |  2
-rw-r--r--   rust/src/identifiers.rs                        |  1
11 files changed, 208 insertions, 4 deletions
diff --git a/TODO.md b/TODO.md
--- a/TODO.md
+++ b/TODO.md
@@ -165,6 +165,7 @@ new importers:
 
 ## Schema / Entity Fields
 
+- file+fileset "first seen" datetime
 - file type/scope/coverage: "fulltext", "abstract", etc
 - elastic transform should only include authors, not editors (?)
 - `translation_of` field on releases (or similar/general). `retraction_of` to a
diff --git a/guide/src/entity_release.md b/guide/src/entity_release.md
index 27ce0f2c..1fd0a2f1 100644
--- a/guide/src/entity_release.md
+++ b/guide/src/entity_release.md
@@ -216,6 +216,8 @@ with a small number of (proposed) extensions:
 - `stub` (fatcat extension) for releases which have notable external
   identifiers, and thus are included "for completeness", but don't seem to
   represent a "full work".
+- `component` (fatcat extension) for sub-components of a full paper (or other
+  work). Eg, figures or tables.
 
 An example of a `stub` might be a paper that gets an extra DOI by accident; the
 primary DOI should be a full release, and the accidental DOI can be a `stub`
diff --git a/notes/bulk_edits/2019-11-05_crossref_patch.md b/notes/bulk_edits/2019-11-05_crossref_patch.md
new file mode 100644
index 00000000..1765fc36
--- /dev/null
+++ b/notes/bulk_edits/2019-11-05_crossref_patch.md
@@ -0,0 +1,58 @@
+
+Goal is to make sure we have imported all in-scope crossref DOI objects. There
+were a few months gap between the snapshot used as initial bootstrap and the
+start of continuous ingest; any DOIs registered during that gap and not updated
+since are not in fatcat. Expectation is that this will be a relatively small
+import.
+
+## QA Run
+
+Started Thu 31 Oct 2019 08:07:20 PM PDT
+
+    export FATCAT_AUTH_WORKER_CROSSREF="..."
+    time xzcat /srv/fatcat/datasets/crossref-works.2019-09-09.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py crossref - /srv/fatcat/datasets/20181203.ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3
+
+    # postgresql DB at start: fresh 2019-10 dump imported, 357 GB
+    # over 15k TPS against postgres
+
+    20x threads of:
+    Counter({'total': 5397349, 'exists': 4961058, 'skip': 360156, 'insert': 76135, 'inserted.container': 113, 'update': 0})
+
+    real    1173m52.497s => 20hr
+    user    13058m24.460s
+    sys     319m27.716s
+
+    1.5 million new releases
+    7.2 million skips (total)
+
+Ran again with null subtitle fix and granular stats:
+
+    20x threads of:
+    Counter({'total': 5368366, 'exists': 5122104, 'skip': 244072, 'skip-blank-title': 38399, 'skip-release-type': 5296, 'insert': 2190, 'skip-huge-contribs': 70, 'skip-huge-refs': 7, 'update': 0})
+
+    43k additional inserts (still about 1.5m total)
+    of 4.8 million skipped (why not closer to 7.2 million?), most seem to be blank title
+
+## Production Run
+
+Git: 44c23290c72ec67db38f1e1d40b76ba795b40d9d
+
+started around Tue 05 Nov 2019 02:51:19 PM PST
+
+    export FATCAT_AUTH_WORKER_CROSSREF="..."
+    time xzcat /srv/fatcat/datasets/crossref-works.2019-09-09.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py crossref - /srv/fatcat/datasets/20190730.ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3
+
+    # postgresql DB at start: 399.03G
+
+    # 20x of:
+    Counter({'total': 5347938, 'exists': 5023305, 'skip': 251747, 'skip-blank-title': 247969, 'insert': 72886, 'skip-release-type': 3686, 'inserted.container': 103, 'skip-huge-contribs': 88, 'skip-huge-refs': 4, 'update': 0})
+    # 1.45m new releases
+    # 2k more new containers
+    # 4.96m blank titles
+
+    real    1139m42.231s
+    user    13307m10.124s
+    sys     355m18.904s
+
+    # postgresql DB: 402.76G
+
diff --git a/notes/bulk_edits/CHANGELOG.md b/notes/bulk_edits/CHANGELOG.md
index e1d11817..3aa89b87 100644
--- a/notes/bulk_edits/CHANGELOG.md
+++ b/notes/bulk_edits/CHANGELOG.md
@@ -11,6 +11,11 @@ This file should not turn in to a TODO list!
 
 ## 2019-10
 
+Inserted 1.45m new release entities from Crossref which had been missed during
+a previous gap in continuous metadata harvesting.
+
+## 2019-10
+
 Updated 304,308 file entities to remove broken
 "https://web.archive.org/web/None/*" URLs.
diff --git a/proposals/20190911_v04_schema_tweaks.md b/proposals/20190911_v04_schema_tweaks.md
new file mode 100644
index 00000000..3d1e04c1
--- /dev/null
+++ b/proposals/20190911_v04_schema_tweaks.md
@@ -0,0 +1,38 @@
+
+status: work-in-progress
+
+Proposed schema changes for next fatcat iteration (v0.4? v0.5?).
+
+SQL (and API, and elasticsearch):
+
+- container: `container_status` as a string enum: eg, "stub",
+  "out-of-print"/"ended" (?), "active", "new"/"small" (?). Particularly to
+  deal with disambiguation of multiple containers by the same title but
+  separate ISSN-L. For example, "The Lancet".
+- release: `release_month` (to complement `release_date` and `release_year`)
+- file: `file_scope` as a string enum indicating how much content this file
+  includes. Eg, `book`, `chapter`, `article`/`work`, `issue`, `volume`,
+  `abstract`, `component`. Unclear how to initialize this field; default to
+  `article`/`work`?
+- TODO: release: switch how pages work? first/last?
+
+API tweaks:
+
+- add regex restrictions on more `ext_ids`, especially `wikidata_qid`
+- add explicit enums for more keyword fields
+
+API endpoints:
+
+- `GET /auth/token/<editor_id>` endpoint to generate new API token for given
+  editor. Used by web interface, or bot wranglers.
+- create editor endpoint, to allow bot account creation
+- `GET /editor/<ident>/bots` (?) endpoint to enumerate bots wrangled by a
+  specific editor
+
+Elasticsearch schema:
+
+- releases *may* need an "_all" field (or `biblio`?) containing most fields to
+  make some search experiences work
+- releases should include volume, issue, pages
+- releases *could* include reference and creator lists, as a faster/cheaper
+  mechanism for doing reverse lookups
diff --git a/proposals/20191018_bigger_db.md b/proposals/20191018_bigger_db.md
new file mode 100644
index 00000000..cd5f6e7b
--- /dev/null
+++ b/proposals/20191018_bigger_db.md
@@ -0,0 +1,81 @@
+
+How can we scale the fatcat backend to support:
+
+- one billion release entities
+- 5 files, 1 webcapture, 1 fileset per release (average)
+- 2 abstracts per release (average)
+- 100 revisions per release
+- average of 10 creators and 50 linked references per release revision
+
+Motivated by:
+- 200 million paper works; 300 million releases
+- 200 million books; 300 million editions
+- 100 million greylit
+- 100 million blog posts
+- 100 million other web/platform things
+=> 900 million releases, round to 1 billion
+
+Want "abundance" for release edits, not concern about over-editing, thus the
+100 revision number. Break that down as:
+
+- 5 publisher metadata updates
+- 3 updates of container/publisher
+- 3 updates to merge under works
+- 5 updates to fix release type, stage, license
+- 10 other general metadata fixes (title, abstract, language, etc)
+- 10 updates to add/fix external identifiers
+- 20-50 = update per reference (linking)
+- 10-20 = updates per contrib (linking)
+=> 66-106 edits; round to 100
+=> almost no updates touch both reference and contribs
+=> 1/3 to 1/2 of edits don't update either
+
+this would mean:
+
+- 1 billion release idents (10x current)
+- 100 billion release revisions and edits (1000x current)
+- 2 billion changelog entries (1000x current)
+- 1 trillion creator rows (vastly larger)
+- 5 trillion reference rows (vastly larger)
+
+based on current row sizes:
+- release_ident: 77 GByte data, 140+ GByte index => 220+ GByte
+- release_rev: 44 GByte => 44 TByte
+- contribs: 32 GByte => 32 TByte
+- release_edit: 11 GByte => 11 TByte
+- refs_blob: 77 GByte => 77 TByte (and maybe larger?)
+
+No table/index over 1 TByte?
+
+That's crazy for reference and contribs, unsustainable. Need to assume those
+only get updated when actually updated, thus more like 10x per release: 3.2 and
+7.7 TByte.
+
+Another way to estimate is from crossref dump size, which I think is now like
+300 GBytes JSON uncompressed for ~100 million works with many references and
+other metadata included. 1 billion would be about 3 TBytes. 100 edits would
+mean 300 TBytes; 10 edits would mean 30 TBytes.
+
+What wants to be on an SSD? Just the most recent version. That would mean
+closer to the 3 TByte size. Let's double that for other entities and hot
+tables, then double again for indexes: 12 TBytes. Pretty big but doable.
+
+Roughly, 12 TBytes SSD, 30-100 TBytes nearline (spinning disk). Both need
+replication.
+
+Curious to look at FoundationDB as overall solution; can different
+tables/namespaces be on different storage backends?
+
+Cassandra probably an option for revision storage. And indexing?
+
+Merging edits and revisions into a single table/index could greatly reduce
+index size (needed for, eg, history lookups).
+
+One plan would be:
+- only index most recent versions of entities (contrib, refs, extids, etc), not all revs
+- turn either (refs, contribs, abstracts) or entire release entities into
+
+TODO short term:
+- try mass updates in QA: one pass to add release `ext_id` for all releases,
+  one pass to add release ref links to all releases. see what DB size looks
+  like. can be dummy data.
diff --git a/python/README.md b/python/README.md
index 1b2057f1..7c97bbc0 100644
--- a/python/README.md
+++ b/python/README.md
@@ -43,6 +43,11 @@ Almost all configuration is done via environment variables; see
 `example.env` for a list of settings. If you copy this file to `.env` it will
 be sourced by `pipenv` automatically; you can also load it in your shell like
 `source .env`.
 
+If elasticsearch is not set up, you might want to create two empty indices:
+
+    curl -XPUT localhost:9200/fatcat_release
+    curl -XPUT localhost:9200/fatcat_container
+
 ## Running Tests
 
 Many (though not all) python tests depend on access to a local running API
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index faee6aac..d8abf3eb 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -11,12 +11,14 @@ from .common import EntityImporter, clean
 
 # The docs/guide should be the cannonical home for these mappings; update there
 # first
+# Can get a list of Crossref types (with counts) via API:
+# https://api.crossref.org/works?rows=0&facet=type-name:*
 CROSSREF_TYPE_MAP = {
     'book': 'book',
     'book-chapter': 'chapter',
     'book-part': 'chapter',
     'book-section': 'chapter',
-    'component': None,
+    'component': 'component',
     'dataset': 'dataset',
     'dissertation': 'thesis',
     'edited-book': 'book',
@@ -158,6 +160,7 @@ class CrossrefImporter(EntityImporter):
 
     def want(self, obj):
         if not obj.get('title'):
+            self.counts['skip-blank-title'] += 1
             return False
 
         # do most of these checks in-line below
@@ -174,10 +177,12 @@ class CrossrefImporter(EntityImporter):
         if obj.get('type') in (None, 'journal', 'proceedings',
                 'standard-series', 'report-series', 'book-series', 'book-set',
                 'book-track', 'proceedings-series'):
+            self.counts['skip-release-type'] += 1
             return None
 
         # Do require the 'title' keys to exsit, as release entities do
         if (not 'title' in obj) or (not obj['title']):
+            self.counts['skip-blank-title'] += 1
             return None
 
         release_type = self.map_release_type(obj['type'])
@@ -376,10 +381,13 @@ class CrossrefImporter(EntityImporter):
 
         # filter out unreasonably huge releases
         if len(abstracts) > 100:
+            self.counts['skip-huge-abstracts'] += 1
             return None
-        if len(refs) > 2000:
+        if len(contribs) > 2000:
+            self.counts['skip-huge-contribs'] += 1
             return None
         if len(refs) > 5000:
+            self.counts['skip-huge-refs'] += 1
             return None
 
         # release date parsing is amazingly complex
@@ -406,6 +414,7 @@ class CrossrefImporter(EntityImporter):
         title = clean(obj.get('title')[0], force_xml=True)
         if not title or len(title) <= 1:
             # title can't be just a single character
+            self.counts['skip-blank-title'] += 1
             return None
 
         subtitle = None
@@ -413,7 +422,7 @@ class CrossrefImporter(EntityImporter):
             subtitle = clean(obj.get('subtitle')[0], force_xml=True)
             if not subtitle or len(subtitle) <= 1:
                 # subtitle can't be just a single character
-                return None
+                subtitle = None
 
         if extra_crossref:
             extra['crossref'] = extra_crossref
diff --git a/python/fatcat_web/auth.py b/python/fatcat_web/auth.py
index 5c8507c1..8e26b7fe 100644
--- a/python/fatcat_web/auth.py
+++ b/python/fatcat_web/auth.py
@@ -22,6 +22,10 @@ def handle_token_login(token):
         # TODO: what kind of Exceptions?
         app.log.warning("auth fail: MacaroonDeserializationException")
         return abort(400)
+    except pymacaroons.exceptions.MacaroonInitException:
+        # TODO: what kind of Exceptions?
+        app.log.warning("auth fail: must supply a valid token")
+        return abort(400)
     # extract editor_id
     editor_id = None
     for caveat in m.first_party_caveats():
diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py
index 3954abe2..afa2410f 100644
--- a/python/tests/import_crossref.py
+++ b/python/tests/import_crossref.py
@@ -58,7 +58,7 @@ def test_crossref_importer(crossref_importer):
 def test_crossref_mappings(crossref_importer):
     assert crossref_importer.map_release_type('journal-article') == "article-journal"
     assert crossref_importer.map_release_type('asdf') is None
-    assert crossref_importer.map_release_type('component') is None
+    assert crossref_importer.map_release_type('book-series') is None
     assert crossref_importer.map_release_type('standard') == 'standard'
 
 def test_crossref_importer_create(crossref_importer):
diff --git a/rust/src/identifiers.rs b/rust/src/identifiers.rs
index 597af338..180dc43b 100644
--- a/rust/src/identifiers.rs
+++ b/rust/src/identifiers.rs
@@ -540,6 +540,7 @@ pub fn check_release_type(raw: &str) -> Result<()> {
         "letter",
         "stub",
         "retraction",
+        "component",
     ];
     for good in valid_types {
         if raw == good {
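
For reference, the scaling arithmetic in the new `proposals/20191018_bigger_db.md` can be sanity-checked mechanically. A minimal sketch that recomputes the projected table sizes from the current sizes quoted in the proposal; the per-table multipliers are assumptions derived from its 10x ident / 100 revision targets:

```python
# Current table sizes (GByte) and assumed growth multipliers, taken from the
# numbers quoted in proposals/20191018_bigger_db.md: idents grow ~10x,
# revision-scoped tables ~1000x (10x idents * 100 revisions each).
current_sizes_gb = {
    "release_ident": (220, 10),
    "release_rev": (44, 1000),
    "contribs": (32, 1000),
    "release_edit": (11, 1000),
    "refs_blob": (77, 1000),
}

for table, (gb, multiplier) in current_sizes_gb.items():
    projected_tb = gb * multiplier / 1000
    print(f"{table:<15} {gb:>4} GB x {multiplier:>4} -> ~{projected_tb:,.1f} TB")

# If refs/contribs rows are only rewritten ~10 times per release (not 100),
# the multiplier drops to 100 and the projections match the proposal's
# 3.2 / 7.7 TByte figures:
print("contribs @ ~10 edits: ", 32 * 100 / 1000, "TB")
print("refs_blob @ ~10 edits:", 77 * 100 / 1000, "TB")
```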
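
Likewise, the new comment in `python/fatcat_tools/importers/crossref.py` points at the Crossref facet API for enumerating work types. A rough sketch of pulling those counts when deciding which `CROSSREF_TYPE_MAP` entries are worth mapping, assuming the usual Crossref REST API response shape (`message.facets.type-name.values`):

```python
import requests

# Query the Crossref works facet endpoint referenced in the importer comment.
# The response structure below is an assumption based on the public Crossref
# REST API, not something defined by this repository.
resp = requests.get(
    "https://api.crossref.org/works",
    params={"rows": 0, "facet": "type-name:*"},
    timeout=30,
)
resp.raise_for_status()
type_counts = resp.json()["message"]["facets"]["type-name"]["values"]

# Print types by descending count, eg to see how common 'component' works are.
for type_name, count in sorted(type_counts.items(), key=lambda kv: -kv[1]):
    print(f"{count:>12,}  {type_name}")
```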