summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--TODO.md1
-rw-r--r--guide/src/entity_release.md2
-rw-r--r--notes/bulk_edits/2019-11-05_crossref_patch.md58
-rw-r--r--notes/bulk_edits/CHANGELOG.md5
-rw-r--r--proposals/20190911_v04_schema_tweaks.md38
-rw-r--r--proposals/20191018_bigger_db.md81
-rw-r--r--python/README.md5
-rw-r--r--python/fatcat_tools/importers/crossref.py15
-rw-r--r--python/fatcat_web/auth.py4
-rw-r--r--python/tests/import_crossref.py2
-rw-r--r--rust/src/identifiers.rs1
11 files changed, 208 insertions, 4 deletions
diff --git a/TODO.md b/TODO.md
index 2fec5121..0c766204 100644
--- a/TODO.md
+++ b/TODO.md
@@ -165,6 +165,7 @@ new importers:
## Schema / Entity Fields
+- file+fileset "first seen" datetime
- file type/scope/coverage: "fulltext", "abstract", etc
- elastic transform should only include authors, not editors (?)
- `translation_of` field on releases (or similar/general). `retraction_of` to a
diff --git a/guide/src/entity_release.md b/guide/src/entity_release.md
index 27ce0f2c..1fd0a2f1 100644
--- a/guide/src/entity_release.md
+++ b/guide/src/entity_release.md
@@ -216,6 +216,8 @@ with a small number of (proposed) extensions:
- `stub` (fatcat extension) for releases which have notable external
identifiers, and thus are included "for completeness", but don't seem to
represent a "full work".
+- `component` (fatcat extension) for sub-components of a full paper (or other
+ work). Eg, figures or tables.
An example of a `stub` might be a paper that gets an extra DOI by accident; the
primary DOI should be a full release, and the accidental DOI can be a `stub`
diff --git a/notes/bulk_edits/2019-11-05_crossref_patch.md b/notes/bulk_edits/2019-11-05_crossref_patch.md
new file mode 100644
index 00000000..1765fc36
--- /dev/null
+++ b/notes/bulk_edits/2019-11-05_crossref_patch.md
@@ -0,0 +1,58 @@
+
+Goal is to make sure we have imported all in-scope crossref DOI objects. There
+were a few months gap between the snapshot used as initial bootstrap and the
+start of continuous ingest; any DOIs registered during that gap and not updated
+since are not in fatcat. Expectation is that this will be a relatively small
+import.
+
+## QA Run
+
+Started Thu 31 Oct 2019 08:07:20 PM PDT
+
+ export FATCAT_AUTH_WORKER_CROSSREF="..."
+ time xzcat /srv/fatcat/datasets/crossref-works.2019-09-09.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py crossref - /srv/fatcat/datasets/20181203.ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3
+
+ # postgresql DB at start: fresh 2019-10 dump imported, 357 GB
+ # over 15k TPS against postgres
+
+ 20x threads of:
+ Counter({'total': 5397349, 'exists': 4961058, 'skip': 360156, 'insert': 76135, 'inserted.container': 113, 'update': 0})
+
+ real 1173m52.497s => 20hr
+ user 13058m24.460s
+ sys 319m27.716s
+
+ 1.5 million new releases
+ 7.2 million skips (total)
+
+Ran again with null subtitle fix and granular stats:
+
+ 20x threads of:
+ Counter({'total': 5368366, 'exists': 5122104, 'skip': 244072, 'skip-blank-title': 38399, 'skip-release-type': 5296, 'insert': 2190, 'skip-huge-contribs': 70, 'skip-huge-refs': 7, 'update': 0})
+
+ 43k additional inserts (still about 1.5m total)
+ of 4.8 million skipped (why not closer to 7.2 million?), most seem to be blank title
+
+## Production Run
+
+Git: 44c23290c72ec67db38f1e1d40b76ba795b40d9d
+
+started around Tue 05 Nov 2019 02:51:19 PM PST
+
+ export FATCAT_AUTH_WORKER_CROSSREF="..."
+ time xzcat /srv/fatcat/datasets/crossref-works.2019-09-09.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py crossref - /srv/fatcat/datasets/20190730.ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3
+
+ # postgresql DB at start: 399.03G
+
+ # 20x of:
+ Counter({'total': 5347938, 'exists': 5023305, 'skip': 251747, 'skip-blank-title': 247969, 'insert': 72886, 'skip-release-type': 3686, 'inserted.container': 103, 'skip-huge-contribs': 88, 'skip-huge-refs': 4, 'update': 0})
+ # 1.45m new releases
+ # 2k more new containers
+ # 4.96m blank titles
+
+ real 1139m42.231s
+ user 13307m10.124s
+ sys 355m18.904s
+
+ # postgresql DB: 402.76G
+
diff --git a/notes/bulk_edits/CHANGELOG.md b/notes/bulk_edits/CHANGELOG.md
index e1d11817..3aa89b87 100644
--- a/notes/bulk_edits/CHANGELOG.md
+++ b/notes/bulk_edits/CHANGELOG.md
@@ -11,6 +11,11 @@ This file should not turn in to a TODO list!
## 2019-10
+Inserted 1.45m new release entities from Crossref which had been missed during
+a previous gap in continuous metadata harvesting.
+
+## 2019-10
+
Updated 304,308 file entities to remove broken
"https://web.archive.org/web/None/*" URLs.
diff --git a/proposals/20190911_v04_schema_tweaks.md b/proposals/20190911_v04_schema_tweaks.md
new file mode 100644
index 00000000..3d1e04c1
--- /dev/null
+++ b/proposals/20190911_v04_schema_tweaks.md
@@ -0,0 +1,38 @@
+
+status: work-in-progress
+
+Proposed schema changes for next fatcat iteration (v0.4? v0.5?).
+
+SQL (and API, and elasticsearch):
+
+- container:`container_status` as a string enum: eg, "stub",
+ "out-of-print"/"ended" (?), "active", "new"/"small" (?). Particularly to
+ deal with disambiguation of multiple containers by the same title but
+ separate ISSN-L. For example, "The Lancet".
+- release: `release_month` (to complement `release_date` and `release_year`)
+- file: `file_scope` as a string enum indicating how much content this file
+ includes. Eg, `book`, `chapter`, `article`/`work`, `issue`, `volume`,
+ `abstract`, `component`. Unclear how to initialize this field; default to
+ `article`/`work`?
+- TODO: release: switch how pages work? first/last?
+
+API tweaks:
+
+- add regex restrictions on more `ext_ids`, especially `wikidata_qid`
+- add explicit enums for more keyword fields
+
+API endpoints:
+
+- `GET /auth/token/<editor_id>` endpoint to generate new API token for given
+ editor. Used by web interface, or bot wranglers.
+- create editor endpoint, to allow bot account creation
+- `GET /editor/<ident>/bots` (?) endpoint to enumerate bots wrangled by a
+ specific editor
+
+Elasticsearch schema:
+
+- releases *may* need an "_all" field (or `biblio`?) containing most fields to
+ make some search experiences work
+- releases should include volume, issue, pages
+- releases *could* include reference and creator lists, as a faster/cheaper
+ mechanism for doing reverse lookups
diff --git a/proposals/20191018_bigger_db.md b/proposals/20191018_bigger_db.md
new file mode 100644
index 00000000..cd5f6e7b
--- /dev/null
+++ b/proposals/20191018_bigger_db.md
@@ -0,0 +1,81 @@
+
+How can we scale the fatcat backend to support:
+
+- one billion release entities
+- 5 files, 1 webcapture, 1 fileset per release (average)
+- 2 abstracts per release (average)
+- 100 revisions per release
+- average of 10 creators and 50 linked references per release revision
+
+Motivated by:
+- 200 million paper works; 300 million releases
+- 200 million books; 300 million editions
+- 100 million greylit
+- 100 million blog posts
+- 100 million other web/platform things
+=> 900 million releases, round to 1 billion
+
+Want "abundance" for release edits, not concern about over-editing, thus the
+100 revision number. Break that down as:
+
+- 5 publisher metadata updates
+- 3 updates of container/publisher
+- 3 updates to merge under works
+- 5 updates to fix release type, stage, license
+- 10 other general metadata fixes (title, abstract, language, etc)
+- 10 updates to add/fix external identifiers
+- 20-50 = update per reference (linking)
+- 10-20 = updates per contrib (linking)
+=> 66-106 edits; round to 100
+=> almost no updates touch both reference and contribs
+=> 1/3 to 1/2 of edits don't update either
+
+this would mean:
+
+- 1 billion release idents (10x current)
+- 100 billion release revisions and edits (1000x current)
+- 2 billion changelog entries (1000x current)
+- 1 trillion creator rows (vastly larger)
+- 5 trillion reference rows (vastly larger)
+
+based on current row sizes:
+- release_ident: 77 GByte data, 140+ GByte index => 220+ GByte
+- release_rev: 44 GByte => 44 TByte
+- contribs: 32 G => 32 TByte
+- release_edit: 11 Gbyte => 11 TByte
+- refs_blob: 77 G => 77 TByte (and maybe larger?)
+
+No table/index over 1 TByte?
+
+That's crazy for reference and contribs, unsustainable. Need to assume those
+only get updated when actually updated, thus more like 10x per release: 3.2 and
+7.7 TByte.
+
+Another way to estimate is from crossref dump size, which I think is now like
+300 GBytes JSON uncompressed for ~100 million works with many references and
+other metadata included. 1 billion would be about 3 TBytes. 100 edits would
+mean 300 TBytes; 10 edits would mean 30 TBytes.
+
+What wants to be on an SSD? Just the most recent version. That would mean
+closer to the 3 TByte size. Let's double that for other entities and hot
+tables, then double again for indexes: 12 TBytes. Pretty big but doable.
+
+Roughly, 12 TBytes SSD, 30-100 TBytes nearline (spinning disk). Both need
+replication.
+
+Curious to look at FoundationDB as overall solution; can different
+tables/namespaces be on different storage backends?
+
+Cassandra probably an option for revision storage. And indexing?
+
+Merging edits and revisions into a single table/index could greatly reduce
+index size (needed for, eg, history lookups).
+
+One plan would be:
+- only index most recent versions of entities (contrib, refs, extids, etc), not all revs
+- turn either (refs, contribs, abstracts) or entire release entities into
+
+TODO short term:
+- try mass updates in QA: one pass to add release `ext_id` for all releases,
+ one pass to add release ref links to all releases. see what DB size looks
+ like. can be dummy data.
diff --git a/python/README.md b/python/README.md
index 1b2057f1..7c97bbc0 100644
--- a/python/README.md
+++ b/python/README.md
@@ -43,6 +43,11 @@ Almost all configuration is done via environment variables; see `example.env`
for a list of settings. If you copy this file to `.env` it will be sourced by
`pipenv` automatically; you can also load it in your shell like `source .env`.
+If elasticsearch is not set up, you might want to create two empty indices:
+
+ curl -XPUT localhost:9200/fatcat_release
+ curl -XPUT localhost:9200/fatcat_container
+
## Running Tests
Many (though not all) python tests depend on access to a local running API
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index faee6aac..d8abf3eb 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -11,12 +11,14 @@ from .common import EntityImporter, clean
# The docs/guide should be the canonical home for these mappings; update there
# first
+# Can get a list of Crossref types (with counts) via API:
+# https://api.crossref.org/works?rows=0&facet=type-name:*
CROSSREF_TYPE_MAP = {
'book': 'book',
'book-chapter': 'chapter',
'book-part': 'chapter',
'book-section': 'chapter',
- 'component': None,
+ 'component': 'component',
'dataset': 'dataset',
'dissertation': 'thesis',
'edited-book': 'book',
@@ -158,6 +160,7 @@ class CrossrefImporter(EntityImporter):
def want(self, obj):
if not obj.get('title'):
+ self.counts['skip-blank-title'] += 1
return False
# do most of these checks in-line below
@@ -174,10 +177,12 @@ class CrossrefImporter(EntityImporter):
if obj.get('type') in (None, 'journal', 'proceedings',
'standard-series', 'report-series', 'book-series', 'book-set',
'book-track', 'proceedings-series'):
+ self.counts['skip-release-type'] += 1
return None
# Do require the 'title' keys to exist, as release entities do
if (not 'title' in obj) or (not obj['title']):
+ self.counts['skip-blank-title'] += 1
return None
release_type = self.map_release_type(obj['type'])
@@ -376,10 +381,13 @@ class CrossrefImporter(EntityImporter):
# filter out unreasonably huge releases
if len(abstracts) > 100:
+ self.counts['skip-huge-abstracts'] += 1
return None
- if len(refs) > 2000:
+ if len(contribs) > 2000:
+ self.counts['skip-huge-contribs'] += 1
return None
if len(refs) > 5000:
+ self.counts['skip-huge-refs'] += 1
return None
# release date parsing is amazingly complex
@@ -406,6 +414,7 @@ class CrossrefImporter(EntityImporter):
title = clean(obj.get('title')[0], force_xml=True)
if not title or len(title) <= 1:
# title can't be just a single character
+ self.counts['skip-blank-title'] += 1
return None
subtitle = None
@@ -413,7 +422,7 @@ class CrossrefImporter(EntityImporter):
subtitle = clean(obj.get('subtitle')[0], force_xml=True)
if not subtitle or len(subtitle) <= 1:
# subtitle can't be just a single character
- return None
+ subtitle = None
if extra_crossref:
extra['crossref'] = extra_crossref
diff --git a/python/fatcat_web/auth.py b/python/fatcat_web/auth.py
index 5c8507c1..8e26b7fe 100644
--- a/python/fatcat_web/auth.py
+++ b/python/fatcat_web/auth.py
@@ -22,6 +22,10 @@ def handle_token_login(token):
# TODO: what kind of Exceptions?
app.log.warning("auth fail: MacaroonDeserializationException")
return abort(400)
+ except pymacaroons.exceptions.MacaroonInitException:
+ # TODO: what kind of Exceptions?
+ app.log.warning("auth fail: must supply a valid token")
+ return abort(400)
# extract editor_id
editor_id = None
for caveat in m.first_party_caveats():
diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py
index 3954abe2..afa2410f 100644
--- a/python/tests/import_crossref.py
+++ b/python/tests/import_crossref.py
@@ -58,7 +58,7 @@ def test_crossref_importer(crossref_importer):
def test_crossref_mappings(crossref_importer):
assert crossref_importer.map_release_type('journal-article') == "article-journal"
assert crossref_importer.map_release_type('asdf') is None
- assert crossref_importer.map_release_type('component') is None
+ assert crossref_importer.map_release_type('book-series') is None
assert crossref_importer.map_release_type('standard') == 'standard'
def test_crossref_importer_create(crossref_importer):
diff --git a/rust/src/identifiers.rs b/rust/src/identifiers.rs
index 597af338..180dc43b 100644
--- a/rust/src/identifiers.rs
+++ b/rust/src/identifiers.rs
@@ -540,6 +540,7 @@ pub fn check_release_type(raw: &str) -> Result<()> {
"letter",
"stub",
"retraction",
+ "component",
];
for good in valid_types {
if raw == good {