From 276ac2aa24166660bc6ffe7601cee44b5d848dae Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 4 Jan 2023 19:55:30 -0800 Subject: proposals: update status; add some old ones; consistent file names --- proposals/2019-05-09_v03_schema_tweaks.md | 144 +++++++++++++++++++ proposals/2019-05-10_editgroup_endpoint_prefix.md | 81 +++++++++++ proposals/2019-05-10_release_ext_ids.md | 89 ++++++++++++ proposals/2019-05-14_fatcat_identifiers.md | 27 ++++ proposals/2019-09-11_NEXT_schema_tweaks.md | 42 ++++++ proposals/2019-09-11_search_query_parsing.md | 28 ++++ proposals/2019-10-18_bigger_db.md | 85 ++++++++++++ proposals/20190509_v03_schema_tweaks.md | 144 ------------------- proposals/20190510_editgroup_endpoint_prefix.md | 81 ----------- proposals/20190510_release_ext_ids.md | 89 ------------ proposals/20190514_fatcat_identifiers.md | 27 ---- proposals/20190911_search_query_parsing.md | 28 ---- proposals/20190911_v05_schema_tweaks.md | 42 ------ proposals/20191018_bigger_db.md | 85 ------------ proposals/2020-01-03_py37_refactors.md | 101 ++++++++++++++ proposals/2020-07-02_coverage_ui.md | 80 +++++++++++ proposals/2020-07-29_toml_editing.md | 43 ++++++ proposals/2020-08-04_grouped_release_exports.md | 62 +++++++++ proposals/2020-08-07_dblp.md | 159 +++++++++++++++++++++ proposals/2020-08_bulk_citation_graph.md | 162 ++++++++++++++++++++++ proposals/20200103_py37_refactors.md | 101 -------------- proposals/20200702_coverage_ui.md | 80 ----------- proposals/20200729_toml_editing.md | 43 ------ proposals/20200804_grouped_release_exports.md | 60 -------- proposals/20200807_dblp.md | 159 --------------------- proposals/202008_bulk_citation_graph.md | 160 --------------------- proposals/2020_client_cli.md | 3 +- proposals/2020_fuzzy_matching.md | 2 +- proposals/2020_ir_importer.spn | 25 ++++ proposals/2020_metadata_cleanups.md | 2 +- proposals/2020_spn.md | 27 ++++ proposals/2021-01-29_citation_api.md | 2 + proposals/2021-03-26_journal_metadata.md | 40 ++++++ proposals/2021-10-12_v04_schema_tweaks.md | 30 ++++ proposals/2021-11-17_content_scope.md | 2 +- proposals/20211012_v04_schema_tweaks.md | 30 ---- proposals/2022-01-21_read-only-db.md | 2 +- 37 files changed, 1232 insertions(+), 1135 deletions(-) create mode 100644 proposals/2019-05-09_v03_schema_tweaks.md create mode 100644 proposals/2019-05-10_editgroup_endpoint_prefix.md create mode 100644 proposals/2019-05-10_release_ext_ids.md create mode 100644 proposals/2019-05-14_fatcat_identifiers.md create mode 100644 proposals/2019-09-11_NEXT_schema_tweaks.md create mode 100644 proposals/2019-09-11_search_query_parsing.md create mode 100644 proposals/2019-10-18_bigger_db.md delete mode 100644 proposals/20190509_v03_schema_tweaks.md delete mode 100644 proposals/20190510_editgroup_endpoint_prefix.md delete mode 100644 proposals/20190510_release_ext_ids.md delete mode 100644 proposals/20190514_fatcat_identifiers.md delete mode 100644 proposals/20190911_search_query_parsing.md delete mode 100644 proposals/20190911_v05_schema_tweaks.md delete mode 100644 proposals/20191018_bigger_db.md create mode 100644 proposals/2020-01-03_py37_refactors.md create mode 100644 proposals/2020-07-02_coverage_ui.md create mode 100644 proposals/2020-07-29_toml_editing.md create mode 100644 proposals/2020-08-04_grouped_release_exports.md create mode 100644 proposals/2020-08-07_dblp.md create mode 100644 proposals/2020-08_bulk_citation_graph.md delete mode 100644 proposals/20200103_py37_refactors.md delete mode 100644 proposals/20200702_coverage_ui.md delete mode 100644 
proposals/20200729_toml_editing.md delete mode 100644 proposals/20200804_grouped_release_exports.md delete mode 100644 proposals/20200807_dblp.md delete mode 100644 proposals/202008_bulk_citation_graph.md create mode 100644 proposals/2020_ir_importer.spn create mode 100644 proposals/2020_spn.md create mode 100644 proposals/2021-03-26_journal_metadata.md create mode 100644 proposals/2021-10-12_v04_schema_tweaks.md delete mode 100644 proposals/20211012_v04_schema_tweaks.md diff --git a/proposals/2019-05-09_v03_schema_tweaks.md b/proposals/2019-05-09_v03_schema_tweaks.md new file mode 100644 index 00000000..150ce525 --- /dev/null +++ b/proposals/2019-05-09_v03_schema_tweaks.md @@ -0,0 +1,144 @@ + +Status: implemented + +# SQL (and API) schema changes + +Intend to make these changes at the same time as bumping OpenAPI schema from +0.2 to 0.3, along with `20190510_editgroup_endpoint_prefix` and +`20190510_release_ext_ids`. + +Also adding some indices to speed up entity edit history views, but those are +just a performance change, not visible in API schema. + +### Structured Contrib Names + +`creator` entities already have "structured" names: in addition to +`display_name`, there are `given_name` and `surname` fields. This change is to +add these two fields to release contribs as well (to join `raw_name`). + +The two main motivations are: + +1. make various representations (eg, citation formats) of release entities + easier. CSL and many display formats require given/surname distinctions +2. improve algorithmic matching between release entities, raw metadata (eg, + from GROBID), and citation strings. Eg, biblio-glutton wants "first author + surname"; we can't provide this from existing `raw_name` field + +The status quo is that many large metadata sources often include structured +names, and we munge them into a single name. + +Some arguments against this change are: + +1. should be "normalizing" this structure into creator entities. However, + display/representation of a contributor might change between publications +2. structure isn't always deterministic from what is visible in published + documents. AKA, raw name is unambiguous (it's what is "printed" on the + document), but given/sur decomposition can be ambiguous (for individauls, or + entire locales/cultures) +3. could just stash in contrib `extra_json`. However, seems common enough to + include as full fields + +Questions/Decisions: + +- should contrib `raw_name` be changed to `display_name` for consistency with + `creator`? `raw_name` should probably always be what is in/on the document + itself, thus no. +- should we still munge a `raw_name` at insert time (we we only have structured + names), or push this on to client code to always create something for + display? + +### Rename `release_status` to `release_stage` + +Describes the field better. I think this is uncontroversial and not too +disruptive at this point. + +### New release fields: subtitle, number, version + +`subtitle`: mostly for books. could have a flat-out style guide policy against +use for articles? Already frequently add subtitle metadata as an `extra_json` +field. + +`number`: intended to represent, eg, a report number ("RFC ..."). Not to be +confused with `container-number`, `chapter`, `edition` + +`version`: intended to be a short string ("v3", "2", "third", "3.9") to +disambiguate which among multiple versions. CSL has a separate `edition` field. + +These are somewhat hard to justify as dedicated fields vs. `extra_json`. 
+ +`subtitle` is a pretty core field for book metadata, but raises ambiguity for +other release types. + +Excited to include many reports and memos (as grey lit), for which the number +is a pretty major field, and we probably want to include in elasticsearch but +not as part of the title field, and someday perhaps an index on `number`, so +that's easier to justify. + +TODO: + +- `version` maybe should be dropped. arXiv is one possible justification, as is + sorting by this field in display. + +### Withdrawn fields + +As part of a plan to represent retractions and other "unpublishing", decided to +track when and whether a release has been "withdrawn", distinct from the +`release_stage`. + +To motivate this, consider a work that has been retracted. There are multiple +releases of different stages; should not set the `release_stage` for all to +`withdrawn` or `retracted`, because then hard to disambiguate between the +release entities. Also maybe the pre-print hasn't been formally withdrawn and +is still in the pre-print server, or maybe only the pre-print was withdrawn +(for being partial/incorrect?) while the final version is still "active". + +As with `release_date`, just `withdrawn_date` is insufficient, so we get +`withdrawn_year` also... and `withdrawn_month` in the future? Also +`withdrawn_state` for cases where we don't know even the year. This could +probably be a bool (`is_withdrawn` or `withdrawn`), but the flexibility of a +TEXT/ENUM has been nice. + +TODO: + +- boolean (`is_withdrawn`, default False) or text (`withdrawn_status`). Let's + keep text to allow evolution in the future; if the field is defined at all + it's "withdrawn" (true), if not it isn't + +### New release extids: `mag_id`, `ark_id` + +See also: `20190510_release_ext_ids`. + +- `mag_id`: Microsoft Academic Graph identifier. +- `ark_id`: ARK identifier. + +These will likely be the last identifiers added as fields on `release`; a +future two-stage refactor will be to move these out to a child table (something +like `extid_type`, `extid_value`, with a UNIQ index for lookups). + +Perhaps the `extid` table should be implemented now, starting with these +identifiers? + +### Web Capture CDX `size_bytes` + +Pretty straight-forward. + +Considered adding `extra_json` as well, to be consistent with other tables, but +feels too heavy for the CDX case. Can add later if there is an actual need; +adding fields easier than removing (for backwards compat). + +### Object/Class Name Changes + +TODO + +### Rust/Python Library Name Changes + +Do these as separate commits, after merging back in to master, for v0.3: + +- rust `fatcat-api-spec` => `fatcat-openapi` +- python `fatcat_client` => `fatcat_openapi_client` + +### More? + +`release_month`: apprently pretty common to know the year and month but not +date. I have avoided so far, seems like unnecessary complexity. Could start +as an `extra_json` field? NOT IMPLEMENTED diff --git a/proposals/2019-05-10_editgroup_endpoint_prefix.md b/proposals/2019-05-10_editgroup_endpoint_prefix.md new file mode 100644 index 00000000..6794266e --- /dev/null +++ b/proposals/2019-05-10_editgroup_endpoint_prefix.md @@ -0,0 +1,81 @@ + +Status: implemented + +# Editgroup API Endpoint Prefixes + +In summary, change the API URL design such that entity mutations (create, +update, delete) happen under the URL path of an editgroup, with the +`editgroup_id` as a path component, instead of being REST actions on the +canonical URL with `editgroup_id` as a query parameter. 
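+
+For illustration, an entity update would change roughly like this (the `/v0/`
+prefix and the specific identifiers here are placeholders, not part of this
+proposal):
+
+    # current: REST action on the canonical entity URL
+    PUT /v0/release/hsmo6p4smrganpb3fndaj2lon4?editgroup_id=qinmjr2lbvgd3mbt7mifir23fy
+
+    # proposed: mutation scoped under the editgroup
+    PUT /v0/editgroup/qinmjr2lbvgd3mbt7mifir23fy/release/hsmo6p4smrganpb3fndaj2lon4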
+ +This is a fairly large/systemic breaking change, though it should not change +code *structure* much (just argument order/passing), and requires no SQL +changes. It may remove corner-case features (like non-auto match operations?). + + +## Renamed API URLs + +For all entity types: + + /editgroup//release/ + PUT: update_release(editgroup_id, ident, entity) + DELETE: delete_release(editgroup_id, ident) + + /editgroup//release + POST: create_release(editgroup_id, entity) + + /editgroup/auto/release/batch + POST: create_release_auto_batch(editgroup, [entity]) + => actually new ReleaseAutoBatch(editgroup, entity_list) body + + /editgroup//release/edit/ + DELETE: delete_release_edit(editgroup_id, edit_uuid) + + +## New Webface URLs + + /editgroup//release/ + GET: shows the (potentially WIP) entity as of this editgroup + + Some way to delete an edit from an editgroup + + +## Future: New API URLs + +Will not actually implement these for now. + +For all entity types: + + /editgroup//release/ + GET: get_editgroup_release(editgroup_id, ident) => entity revision + + /editgroup//release/batch + POST: create_release_batch([entity]) + + +## SCRATCH + +container +creator +file +fileset +webcapture +release +work + +per entity: +x auto_batch type +x create_* path and editgroup_id parameter +x delete old parameter +x batch: +x url +x operationId +x new single parameter +x return type +x put/delete +x new url section +x remove old editgroup_id parameters (2x) +x delete edit +x new url section +x remove old edit_id + diff --git a/proposals/2019-05-10_release_ext_ids.md b/proposals/2019-05-10_release_ext_ids.md new file mode 100644 index 00000000..b0a484ad --- /dev/null +++ b/proposals/2019-05-10_release_ext_ids.md @@ -0,0 +1,89 @@ + +Status: implemented + +# Release External ID Refactor + +Goal is to make the external identifier "namespace" (number of external +identifiers supported) less scarce, while still allowing fast lookups. Adding +new identifiers would still require API schema and rust code changes. + +This change would also bring the number of columns on `release_rev` back under +32, which makes diesel happier. + +Unclear whether existing extids (like DOI) should get merged and the old +columns dropped. This could be done consistently by, eg, a rust worker process +that re-writes tables, probably in multiple stages (eg, first copy, then update +code to read from new location, then drop old columns and indices). Perhaps for +very hot/popular columns (like DOI, and maybe ISBN13?) it is better to have +separate columns/indices. + +It would be possible to roll this out as a SQL change first, with identifiers +still at the top level of the API schema, then later switch the API schema. Not +sure this is worth it though. + +## New API + +All identifiers as text + + release_entity + ext_ids (required) + doi + pmid + pmcid + wikidata_qid + core + isbn13 + arxiv + jstor + [...] 
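+
+For example, a release fetched from the new API might serialize along these
+lines (identifier values are made up for illustration):
+
+    {
+        "ident": "hsmo6p4smrganpb3fndaj2lon4",
+        "ext_ids": {
+            "doi": "10.1234/example.5678",
+            "pmid": "12345678",
+            "arxiv": "1905.03769v1",
+            "isbn13": "978-3-16-148410-0"
+        }
+    }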
+ +## New SQL Schema + +Something like: + + release_rev_extid ( + release_rev_id UUID foreign key to release_rev.id + extid_type TEXT + extid_value TEXT + ) + PRIMARY KEY (release_rev_id, extid_type) + INDEX (extid_type, extid_value) + +### Existing prod Column Use + +Queries like: + + fatcat_prod=# SELECT COUNT(*) FROM release_rev WHERE wikidata_qid IS NOT NULL; + 13460723 + +Results: + + wikidata_qid: 13460723 + isbn13: 1 + core_id: 7160477 + arxiv_id: 3 + jstor_id: 0 + pmcid: 3688945 + +Keep in SQL: + +- `doi` +- `pmid` +- `pmcid` +- `wikidata_qid` +- `core_id` + +Drop columns: + +- `isbn13` +- `arxiv_id` +- `jstor_id` + +In new table: + +- isbn13 +- arxiv +- jstor +- mag +- ark +- dblp diff --git a/proposals/2019-05-14_fatcat_identifiers.md b/proposals/2019-05-14_fatcat_identifiers.md new file mode 100644 index 00000000..77aeba93 --- /dev/null +++ b/proposals/2019-05-14_fatcat_identifiers.md @@ -0,0 +1,27 @@ + +Status: implemented + +Fatcat Identifiers +======================= + +AKA, `fcid` + +## Public Use / Reference + +When referencing identifiers in external databases, should prefix with the +entity type. Eg: + + release_hsmo6p4smrganpb3fndaj2lon4 + editgroup_qinmjr2lbvgd3mbt7mifir23fy + +Or with a prefix: + + fatcat:release_hsmo6p4smrganpb3fndaj2lon4 + +As a usability affordance, the public web interface (though not API) should do +permanent redirects HTTP (301 or 308) to the canonical page like: + + https://fatcat.wiki/release_hsmo6p4smrganpb3fndaj2lon4 + HTTP 301 => https://fatcat.wiki/release/hsmo6p4smrganpb3fndaj2lon4 + +However, no intention to use identifiers in this schema in the API itself? diff --git a/proposals/2019-09-11_NEXT_schema_tweaks.md b/proposals/2019-09-11_NEXT_schema_tweaks.md new file mode 100644 index 00000000..dcbc2f5f --- /dev/null +++ b/proposals/2019-09-11_NEXT_schema_tweaks.md @@ -0,0 +1,42 @@ + +Status: planned + +## Schema Changes for Next Release + +Proposed schema changes for next fatcat iteration with SQL changes (v0.6? v1.0?). + +SQL (and API, and elasticsearch): + +- `db_get_range_for_editor` is slow when there are many editgroups for editor; add sorted index? meh. +- release: `release_month` (to complement `release_date` and `release_year`) +- file: `file_scope` as a string enum indicating how much content this file + includes. Eg, `book`, `chapter`, `article`/`work`, `issue`, `volume`, + `abstract`, `component`. Unclear how to initialize this field; default to + `article`/`work`? +- file: some way of marking bad/bogus files... by scope? type? status? +- TODO: webcapture: lookup by primary URL sha1? +- TODO: release: switch how pages work? first/last? +- TODO: indication of peer-review process? at release or container level? +- TODO: container: separate canonical and disambiguating titles (?) +- TODO: container: "imprint" field? +- TODO: container: "series" field? eg for conferences +- TODO: release inter-references using SCHOLIX/Datacite schema + https://zenodo.org/record/1120265 + https://support.datacite.org/docs/connecting-research-outputs#section-related-identifiers +- TODO: fileset: some sort of lookup; hashes of hashes? +- TODO: fileset: some indication/handling of git repositories + +API tweaks: + +- add regex restrictions on more `ext_ids`, especially `wikidata_qid` +- add explicit enums for more keyword fields + +API endpoints: + +- `GET /auth/token/` endpoint to generate new API token for given + editor. Used by web interface, or bot wranglers. +- create editor endpoint, to allow bot account creation +- `GET /editor//bots` (?) 
endpoint to enumerate bots wrangled by a + specific editor + +See `2020_search_improvements` for elasticsearch-only schema updates. diff --git a/proposals/2019-09-11_search_query_parsing.md b/proposals/2019-09-11_search_query_parsing.md new file mode 100644 index 00000000..f1fb0128 --- /dev/null +++ b/proposals/2019-09-11_search_query_parsing.md @@ -0,0 +1,28 @@ + +Status: brainstorm + +## Search Query Parsing + +The default "release" search on fatcat.wiki currently uses the elasticsearch +built-in `query_string` parser, which is explicitly not recommended for +public/production use. + +The best way forward is likely a custom query parser (eg, PEG-generated parser) +that generates a complete elasticsearch query JSON structure. + +A couple search issues this would help with: + +- better parsing of keywords (year, year-range, DOI, ISSN, etc) in complex + queries and turning these in to keyword term sub-queries +- queries including terms from multiple fields which aren't explicitly tagged + (eg, "lovelace computer" vs. "author:lovelace title:computer") +- avoiding unsustainably expensive queries (eg, prefix wildcard, regex) +- handling single-character mispellings and synonyms +- collapsing multiple releases under the same work in search results + +In the near future, we may also create a fulltext search index, which will have +it's own issues. + +## Tech Changes + +If we haven't already, should also switch to using elasticsearch client library. diff --git a/proposals/2019-10-18_bigger_db.md b/proposals/2019-10-18_bigger_db.md new file mode 100644 index 00000000..b0e6617a --- /dev/null +++ b/proposals/2019-10-18_bigger_db.md @@ -0,0 +1,85 @@ + +Status: notes + +## Catalog Database Scaling + +How can we scale the fatcat backend to support: + +- one billion release entities +- 5 files, 1 webcapture, 1 fileset per release (average) +- 2 abstracts per release (average) +- 100 revisions per release +- average of 10 creators and 50 linked references per release revision + +Motivated by: +- 200 million paper works; 300 million releases +- 200 million books; 300 million editions +- 100 million greylit +- 100 million blog posts +- 100 million other web/platform things +=> 900 million releases, round to 100 million + +Want "abundance" for release edits, not concern about over-editing, thus the +100 reversion number. Break that down as: + +- 5 publisher metadata updates +- 3 updates of container/publisher +- 3 updates to merge under works +- 5 updates to fix release type, stage, license +- 10 other general metadata fixes (title, abstract, language, etc) +- 10 updates to add/fix external identifiers +- 20-50 = update per reference (linking) +- 10-20 = updates per contrib (linking) +=> 66-106 edits; round to 100 +=> almost no updates touch both reference and contribs +=> 1/3 to 1/2 of edits don't update either + +this would mean: + +- 1 billion release idents (10x current) +- 100 billion release revisions and edits (1000x current) +- 2 billion changelog entries (1000x current) +- 1 trillion creator rows (vastly larger) +- 5 trillion reference rows (vastly larger) + +based on current row sizes: +- release_ident: 77 GByte data, 140+ GByte index => 220+ GByte +- release_rev: 44 => 44 TByte +- contribs: 32 G => 32 TByte +- release_edit: 11 Gbyte => 11 TByte +- refs_blob: 77 G => 77 TByte (and maybe larger?) + +No table/index over 1 TByte? + +That's crazy for reference and contribs, unsustainable. Need to assume those +only get updated when actually updated, thus more like 10x per release: 3.2 and +7.7 TByte. 
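+
+A minimal arithmetic sketch of that estimate (inputs are just the current table
+sizes and multipliers quoted above, nothing measured):
+
+    # scale current table sizes (GBytes) by assumed growth multipliers
+    current_gb = {"contribs": 32, "refs_blob": 77}
+    ident_scale = 10        # ~1 billion releases vs ~100 million now
+    revs_per_release = 100  # "abundance" editing assumption
+    touched_revs = 10       # revisions that actually rewrite contribs/refs
+
+    for table, gb in current_gb.items():
+        worst = gb * ident_scale * revs_per_release / 1000
+        likely = gb * ident_scale * touched_revs / 1000
+        print(f"{table}: {worst:.0f} TByte if every rev copies rows; {likely:.1f} TByte otherwise")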
+ +Another way to estimate is from crossref dump size, which I think is now like +300 GBytes JSON uncompressed for ~100 million works with many references and +other metadata included. 1 billion would be about 3 TBytes. 100 edits would +mean 300 TBytes; 10 edits would mean 30 TBytes. + +What wants to be on an SSD? Just the most recent version. That would mean +closer to the 3 TByte size. Let's double that for other entities and hot +tables, then double again for indexes: 12 TBytes. Pretty big but doable. + +Roughly, 12 TBytes SSD, 30-100 TBytes nearline (spinning disk). Both need +replication. + +Curious to look at FoundationDB as overall solution; can different +tables/namespaces be on different storage backends? + +Cassandra probably an option for revision storage. And indexing? + +Merging edits and revisions into a single table/index could greatly reduce +index size (needed for, eg, history lookups). + +One plan would be: +- only index most recent versions of entities (contrib, refs, extids, etc), not all revs +- turn either (refs, contribs, abstracts) or entire release entities into + +TODO short term: +- try mass updates in QA: one pass to add release `ext_id` for all releases, + one pass to add release ref links to all releases. see what DB size looks + like. can be dummy data. diff --git a/proposals/20190509_v03_schema_tweaks.md b/proposals/20190509_v03_schema_tweaks.md deleted file mode 100644 index 150ce525..00000000 --- a/proposals/20190509_v03_schema_tweaks.md +++ /dev/null @@ -1,144 +0,0 @@ - -Status: implemented - -# SQL (and API) schema changes - -Intend to make these changes at the same time as bumping OpenAPI schema from -0.2 to 0.3, along with `20190510_editgroup_endpoint_prefix` and -`20190510_release_ext_ids`. - -Also adding some indices to speed up entity edit history views, but those are -just a performance change, not visible in API schema. - -### Structured Contrib Names - -`creator` entities already have "structured" names: in addition to -`display_name`, there are `given_name` and `surname` fields. This change is to -add these two fields to release contribs as well (to join `raw_name`). - -The two main motivations are: - -1. make various representations (eg, citation formats) of release entities - easier. CSL and many display formats require given/surname distinctions -2. improve algorithmic matching between release entities, raw metadata (eg, - from GROBID), and citation strings. Eg, biblio-glutton wants "first author - surname"; we can't provide this from existing `raw_name` field - -The status quo is that many large metadata sources often include structured -names, and we munge them into a single name. - -Some arguments against this change are: - -1. should be "normalizing" this structure into creator entities. However, - display/representation of a contributor might change between publications -2. structure isn't always deterministic from what is visible in published - documents. AKA, raw name is unambiguous (it's what is "printed" on the - document), but given/sur decomposition can be ambiguous (for individauls, or - entire locales/cultures) -3. could just stash in contrib `extra_json`. However, seems common enough to - include as full fields - -Questions/Decisions: - -- should contrib `raw_name` be changed to `display_name` for consistency with - `creator`? `raw_name` should probably always be what is in/on the document - itself, thus no. 
-- should we still munge a `raw_name` at insert time (we we only have structured - names), or push this on to client code to always create something for - display? - -### Rename `release_status` to `release_stage` - -Describes the field better. I think this is uncontroversial and not too -disruptive at this point. - -### New release fields: subtitle, number, version - -`subtitle`: mostly for books. could have a flat-out style guide policy against -use for articles? Already frequently add subtitle metadata as an `extra_json` -field. - -`number`: intended to represent, eg, a report number ("RFC ..."). Not to be -confused with `container-number`, `chapter`, `edition` - -`version`: intended to be a short string ("v3", "2", "third", "3.9") to -disambiguate which among multiple versions. CSL has a separate `edition` field. - -These are somewhat hard to justify as dedicated fields vs. `extra_json`. - -`subtitle` is a pretty core field for book metadata, but raises ambiguity for -other release types. - -Excited to include many reports and memos (as grey lit), for which the number -is a pretty major field, and we probably want to include in elasticsearch but -not as part of the title field, and someday perhaps an index on `number`, so -that's easier to justify. - -TODO: - -- `version` maybe should be dropped. arXiv is one possible justification, as is - sorting by this field in display. - -### Withdrawn fields - -As part of a plan to represent retractions and other "unpublishing", decided to -track when and whether a release has been "withdrawn", distinct from the -`release_stage`. - -To motivate this, consider a work that has been retracted. There are multiple -releases of different stages; should not set the `release_stage` for all to -`withdrawn` or `retracted`, because then hard to disambiguate between the -release entities. Also maybe the pre-print hasn't been formally withdrawn and -is still in the pre-print server, or maybe only the pre-print was withdrawn -(for being partial/incorrect?) while the final version is still "active". - -As with `release_date`, just `withdrawn_date` is insufficient, so we get -`withdrawn_year` also... and `withdrawn_month` in the future? Also -`withdrawn_state` for cases where we don't know even the year. This could -probably be a bool (`is_withdrawn` or `withdrawn`), but the flexibility of a -TEXT/ENUM has been nice. - -TODO: - -- boolean (`is_withdrawn`, default False) or text (`withdrawn_status`). Let's - keep text to allow evolution in the future; if the field is defined at all - it's "withdrawn" (true), if not it isn't - -### New release extids: `mag_id`, `ark_id` - -See also: `20190510_release_ext_ids`. - -- `mag_id`: Microsoft Academic Graph identifier. -- `ark_id`: ARK identifier. - -These will likely be the last identifiers added as fields on `release`; a -future two-stage refactor will be to move these out to a child table (something -like `extid_type`, `extid_value`, with a UNIQ index for lookups). - -Perhaps the `extid` table should be implemented now, starting with these -identifiers? - -### Web Capture CDX `size_bytes` - -Pretty straight-forward. - -Considered adding `extra_json` as well, to be consistent with other tables, but -feels too heavy for the CDX case. Can add later if there is an actual need; -adding fields easier than removing (for backwards compat). 
- -### Object/Class Name Changes - -TODO - -### Rust/Python Library Name Changes - -Do these as separate commits, after merging back in to master, for v0.3: - -- rust `fatcat-api-spec` => `fatcat-openapi` -- python `fatcat_client` => `fatcat_openapi_client` - -### More? - -`release_month`: apprently pretty common to know the year and month but not -date. I have avoided so far, seems like unnecessary complexity. Could start -as an `extra_json` field? NOT IMPLEMENTED diff --git a/proposals/20190510_editgroup_endpoint_prefix.md b/proposals/20190510_editgroup_endpoint_prefix.md deleted file mode 100644 index 6794266e..00000000 --- a/proposals/20190510_editgroup_endpoint_prefix.md +++ /dev/null @@ -1,81 +0,0 @@ - -Status: implemented - -# Editgroup API Endpoint Prefixes - -In summary, change the API URL design such that entity mutations (create, -update, delete) happen under the URL path of an editgroup, with the -`editgroup_id` as a path component, instead of being REST actions on the -canonical URL with `editgroup_id` as a query parameter. - -This is a fairly large/systemic breaking change, though it should not change -code *structure* much (just argument order/passing), and requires no SQL -changes. It may remove corner-case features (like non-auto match operations?). - - -## Renamed API URLs - -For all entity types: - - /editgroup//release/ - PUT: update_release(editgroup_id, ident, entity) - DELETE: delete_release(editgroup_id, ident) - - /editgroup//release - POST: create_release(editgroup_id, entity) - - /editgroup/auto/release/batch - POST: create_release_auto_batch(editgroup, [entity]) - => actually new ReleaseAutoBatch(editgroup, entity_list) body - - /editgroup//release/edit/ - DELETE: delete_release_edit(editgroup_id, edit_uuid) - - -## New Webface URLs - - /editgroup//release/ - GET: shows the (potentially WIP) entity as of this editgroup - - Some way to delete an edit from an editgroup - - -## Future: New API URLs - -Will not actually implement these for now. - -For all entity types: - - /editgroup//release/ - GET: get_editgroup_release(editgroup_id, ident) => entity revision - - /editgroup//release/batch - POST: create_release_batch([entity]) - - -## SCRATCH - -container -creator -file -fileset -webcapture -release -work - -per entity: -x auto_batch type -x create_* path and editgroup_id parameter -x delete old parameter -x batch: -x url -x operationId -x new single parameter -x return type -x put/delete -x new url section -x remove old editgroup_id parameters (2x) -x delete edit -x new url section -x remove old edit_id - diff --git a/proposals/20190510_release_ext_ids.md b/proposals/20190510_release_ext_ids.md deleted file mode 100644 index b0a484ad..00000000 --- a/proposals/20190510_release_ext_ids.md +++ /dev/null @@ -1,89 +0,0 @@ - -Status: implemented - -# Release External ID Refactor - -Goal is to make the external identifier "namespace" (number of external -identifiers supported) less scarce, while still allowing fast lookups. Adding -new identifiers would still require API schema and rust code changes. - -This change would also bring the number of columns on `release_rev` back under -32, which makes diesel happier. - -Unclear whether existing extids (like DOI) should get merged and the old -columns dropped. This could be done consistently by, eg, a rust worker process -that re-writes tables, probably in multiple stages (eg, first copy, then update -code to read from new location, then drop old columns and indices). 
Perhaps for -very hot/popular columns (like DOI, and maybe ISBN13?) it is better to have -separate columns/indices. - -It would be possible to roll this out as a SQL change first, with identifiers -still at the top level of the API schema, then later switch the API schema. Not -sure this is worth it though. - -## New API - -All identifiers as text - - release_entity - ext_ids (required) - doi - pmid - pmcid - wikidata_qid - core - isbn13 - arxiv - jstor - [...] - -## New SQL Schema - -Something like: - - release_rev_extid ( - release_rev_id UUID foreign key to release_rev.id - extid_type TEXT - extid_value TEXT - ) - PRIMARY KEY (release_rev_id, extid_type) - INDEX (extid_type, extid_value) - -### Existing prod Column Use - -Queries like: - - fatcat_prod=# SELECT COUNT(*) FROM release_rev WHERE wikidata_qid IS NOT NULL; - 13460723 - -Results: - - wikidata_qid: 13460723 - isbn13: 1 - core_id: 7160477 - arxiv_id: 3 - jstor_id: 0 - pmcid: 3688945 - -Keep in SQL: - -- `doi` -- `pmid` -- `pmcid` -- `wikidata_qid` -- `core_id` - -Drop columns: - -- `isbn13` -- `arxiv_id` -- `jstor_id` - -In new table: - -- isbn13 -- arxiv -- jstor -- mag -- ark -- dblp diff --git a/proposals/20190514_fatcat_identifiers.md b/proposals/20190514_fatcat_identifiers.md deleted file mode 100644 index 325e48f5..00000000 --- a/proposals/20190514_fatcat_identifiers.md +++ /dev/null @@ -1,27 +0,0 @@ - -Status: brainstorm - -Fatcat Identifiers -======================= - -AKA, `fcid` - -## Public Use / Reference - -When referencing identifiers in external databases, should prefix with the -entity type. Eg: - - release_hsmo6p4smrganpb3fndaj2lon4 - editgroup_qinmjr2lbvgd3mbt7mifir23fy - -Or with a prefix: - - fatcat:release_hsmo6p4smrganpb3fndaj2lon4 - -As a usability affordance, the public web interface (though not API) should do -permanent redirects HTTP (301 or 308) to the canonical page like: - - https://fatcat.wiki/release_hsmo6p4smrganpb3fndaj2lon4 - HTTP 301 => https://fatcat.wiki/release/hsmo6p4smrganpb3fndaj2lon4 - -However, no intention to use identifiers in this schema in the API itself? diff --git a/proposals/20190911_search_query_parsing.md b/proposals/20190911_search_query_parsing.md deleted file mode 100644 index f1fb0128..00000000 --- a/proposals/20190911_search_query_parsing.md +++ /dev/null @@ -1,28 +0,0 @@ - -Status: brainstorm - -## Search Query Parsing - -The default "release" search on fatcat.wiki currently uses the elasticsearch -built-in `query_string` parser, which is explicitly not recommended for -public/production use. - -The best way forward is likely a custom query parser (eg, PEG-generated parser) -that generates a complete elasticsearch query JSON structure. - -A couple search issues this would help with: - -- better parsing of keywords (year, year-range, DOI, ISSN, etc) in complex - queries and turning these in to keyword term sub-queries -- queries including terms from multiple fields which aren't explicitly tagged - (eg, "lovelace computer" vs. "author:lovelace title:computer") -- avoiding unsustainably expensive queries (eg, prefix wildcard, regex) -- handling single-character mispellings and synonyms -- collapsing multiple releases under the same work in search results - -In the near future, we may also create a fulltext search index, which will have -it's own issues. - -## Tech Changes - -If we haven't already, should also switch to using elasticsearch client library. 
diff --git a/proposals/20190911_v05_schema_tweaks.md b/proposals/20190911_v05_schema_tweaks.md deleted file mode 100644 index 46d7c489..00000000 --- a/proposals/20190911_v05_schema_tweaks.md +++ /dev/null @@ -1,42 +0,0 @@ - -Status: planned - -## Schema Changes for v0.4 Release - -Proposed schema changes for next fatcat iteration (v0.4? v0.5?). - -SQL (and API, and elasticsearch): - -- `db_get_range_for_editor` is slow when there are many editgroups for editor; add sorted index? meh. -- release: `release_month` (to complement `release_date` and `release_year`) -- file: `file_scope` as a string enum indicating how much content this file - includes. Eg, `book`, `chapter`, `article`/`work`, `issue`, `volume`, - `abstract`, `component`. Unclear how to initialize this field; default to - `article`/`work`? -- file: some way of marking bad/bogus files... by scope? type? status? -- TODO: webcapture: lookup by primary URL sha1? -- TODO: release: switch how pages work? first/last? -- TODO: indication of peer-review process? at release or container level? -- TODO: container: separate canonical and disambiguating titles (?) -- TODO: container: "imprint" field? -- TODO: container: "series" field? eg for conferences -- TODO: release inter-references using SCHOLIX/Datacite schema - https://zenodo.org/record/1120265 - https://support.datacite.org/docs/connecting-research-outputs#section-related-identifiers -- TODO: fileset: some sort of lookup; hashes of hashes? -- TODO: fileset: some indication/handling of git repositories - -API tweaks: - -- add regex restrictions on more `ext_ids`, especially `wikidata_qid` -- add explicit enums for more keyword fields - -API endpoints: - -- `GET /auth/token/` endpoint to generate new API token for given - editor. Used by web interface, or bot wranglers. -- create editor endpoint, to allow bot account creation -- `GET /editor//bots` (?) endpoint to enumerate bots wrangled by a - specific editor - -See `2020_search_improvements` for elasticsearch-only schema updates. diff --git a/proposals/20191018_bigger_db.md b/proposals/20191018_bigger_db.md deleted file mode 100644 index 7a5216d0..00000000 --- a/proposals/20191018_bigger_db.md +++ /dev/null @@ -1,85 +0,0 @@ - -Status: brainstorm - -## Catalog Database Scaling - -How can we scale the fatcat backend to support: - -- one billion release entities -- 5 files, 1 webcapture, 1 fileset per release (average) -- 2 abstracts per release (average) -- 100 revisions per release -- average of 10 creators and 50 linked references per release revision - -Motivated by: -- 200 million paper works; 300 million releases -- 200 million books; 300 million editions -- 100 million greylit -- 100 million blog posts -- 100 million other web/platform things -=> 900 million releases, round to 100 million - -Want "abundance" for release edits, not concern about over-editing, thus the -100 reversion number. 
Break that down as: - -- 5 publisher metadata updates -- 3 updates of container/publisher -- 3 updates to merge under works -- 5 updates to fix release type, stage, license -- 10 other general metadata fixes (title, abstract, language, etc) -- 10 updates to add/fix external identifiers -- 20-50 = update per reference (linking) -- 10-20 = updates per contrib (linking) -=> 66-106 edits; round to 100 -=> almost no updates touch both reference and contribs -=> 1/3 to 1/2 of edits don't update either - -this would mean: - -- 1 billion release idents (10x current) -- 100 billion release revisions and edits (1000x current) -- 2 billion changelog entries (1000x current) -- 1 trillion creator rows (vastly larger) -- 5 trillion reference rows (vastly larger) - -based on current row sizes: -- release_ident: 77 GByte data, 140+ GByte index => 220+ GByte -- release_rev: 44 => 44 TByte -- contribs: 32 G => 32 TByte -- release_edit: 11 Gbyte => 11 TByte -- refs_blob: 77 G => 77 TByte (and maybe larger?) - -No table/index over 1 TByte? - -That's crazy for reference and contribs, unsustainable. Need to assume those -only get updated when actually updated, thus more like 10x per release: 3.2 and -7.7 TByte. - -Another way to estimate is from crossref dump size, which I think is now like -300 GBytes JSON uncompressed for ~100 million works with many references and -other metadata included. 1 billion would be about 3 TBytes. 100 edits would -mean 300 TBytes; 10 edits would mean 30 TBytes. - -What wants to be on an SSD? Just the most recent version. That would mean -closer to the 3 TByte size. Let's double that for other entities and hot -tables, then double again for indexes: 12 TBytes. Pretty big but doable. - -Roughly, 12 TBytes SSD, 30-100 TBytes nearline (spinning disk). Both need -replication. - -Curious to look at FoundationDB as overall solution; can different -tables/namespaces be on different storage backends? - -Cassandra probably an option for revision storage. And indexing? - -Merging edits and revisions into a single table/index could greatly reduce -index size (needed for, eg, history lookups). - -One plan would be: -- only index most recent versions of entities (contrib, refs, extids, etc), not all revs -- turn either (refs, contribs, abstracts) or entire release entities into - -TODO short term: -- try mass updates in QA: one pass to add release `ext_id` for all releases, - one pass to add release ref links to all releases. see what DB size looks - like. can be dummy data. diff --git a/proposals/2020-01-03_py37_refactors.md b/proposals/2020-01-03_py37_refactors.md new file mode 100644 index 00000000..04c926a3 --- /dev/null +++ b/proposals/2020-01-03_py37_refactors.md @@ -0,0 +1,101 @@ + +status: notes + +If we update fatcat python code to python3.7, what code refactoring changes can +we make? We currently use/require python3.5. + +Nice features in python3 I know of are: + +- dataclasses (python3.7) +- async/await (mature in python3.7?) +- type annotations (python3.5) +- format strings (python3.6) +- walrus assignment (python3.8) + +Not sure if the walrus operator is worth jumping all the way to python3.8. + +While we might be at it, what other superficial factorings might we want to do? 
+ +- strict lint style (eg, maximum column width) with `black` (python3.6) +- logging/debugging/verbose +- type annotations and checking +- use named dicts or structs in place of dicts + +## Linux Distro Support + +The default python version shipped by current and planned linux releases are: + +- ubuntu xenial 16.04 LTS: python3.5 +- ubuntu bionic 18.04 LTS: python3.6 +- ubuntu focal 20.04 LTS: python3.8 (planned) +- debian buster 10 2019: python3.7 + +Python 3.7 is the default in debian buster (10). + +There are apt PPA package repositories that allow backporting newer pythons to +older releases. As far as I know this is safe and doesn't override any system +usage if we are careful not to set the defaults (aka, `python3` command should +be the older version unless inside a virtualenv). + +It would also be possible to use `pyenv` to have `virtualenv`s with custom +python versions. We should probably do that for OS X and/or windows support if +we wanted those. But having a system package is probably a lot faster to +install. + +## Dataclasses + +`dataclasses` are a user-friendly way to create struct-like objects. They are +pretty similar to the existing `namedtuple`, but can be mutable and have +methods attached to them (they are just classes), plus several other usability +improvements. + +Most places we are throwing around dicts with structure we could be using +dataclasses instead. There are some instances of this in fatcat, but many more +in sandcrawler. + +## Async/Await + +Where might we actually use async/await? I think more in sandcrawler than in +the python tools or web apps. The GROBID, ingest, and ML workers in particular +should be async over batches, as should all fetches from CDX/wayback. + +Some of the kafka workers *could* be aync, but i'm not sure how much speedup +there would actually be. For example, the entity updates worker could fetch +entities for an editgroup concurrently. + +Inserts (importers) should probably mostly happen serially, at least the kafka +importers, one editgroup at a time, so progress is correctly recorded in kafka. +Parallelization should probably happen at the partition level; would need to +think through whether async would actually help with code simplicity vs. thread +or process parallelization. + +## Type Annotations + +The meta-goals of (gradual) type annotations would be catching more bugs at +development time, and having code be more self-documenting and easier to +understand. + +The two big wins I see with type annotation would be having annotations +auto-generated for the openapi classes and API calls, and to make string +munging in importer code less buggy. + +## Format Strings + +Eg, replace code like: + + "There are {} out of {} objects".format(found, total) + +With: + + f"There are {found} out of {total} objects" + +## Walrus Operator + +New operator allows checking and assignment together: + + if (n := len(a)) > 10: + print(f"List is too long ({n} elements, expected <= 10)") + +I feel like we would actually use this pattern *a ton* in importer code, where +we do a lot of lookups or cleaning then check if we got a `None`. + diff --git a/proposals/2020-07-02_coverage_ui.md b/proposals/2020-07-02_coverage_ui.md new file mode 100644 index 00000000..2803fa22 --- /dev/null +++ b/proposals/2020-07-02_coverage_ui.md @@ -0,0 +1,80 @@ + +status: implemented + +Coverage UI/UX Enhancements +=========================== + +Want to generally enhance the use case of fatcat as a tool for exploring +preservation coverage of groups of papers. 
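+
+Most of the charts below reduce to a terms aggregation over the release search
+index. A rough sketch of the preservation-by-year query (index and field names
+-- `fatcat_release`, `container_id`, `release_year`, `preservation` -- are
+assumptions for illustration, not the final implementation):
+
+    import elasticsearch
+
+    es = elasticsearch.Elasticsearch("http://localhost:9200")
+    resp = es.search(
+        index="fatcat_release",
+        body={
+            "size": 0,
+            "query": {"term": {"container_id": "<container_ident>"}},
+            "aggs": {
+                "by_year": {
+                    "terms": {"field": "release_year", "size": 300},
+                    "aggs": {"preservation": {"terms": {"field": "preservation"}}},
+                }
+            },
+        },
+    )
+    for year in resp["aggregations"]["by_year"]["buckets"]:
+        counts = {b["key"]: b["doc_count"] for b in year["preservation"]["buckets"]}
+        print(year["key"], counts)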
+ +Specific changes: + +- make coverage bar graphs and coverage-by-year charts use preservation codes + instead of the current categories +- container coverage page should have bar coverage by release type (horizontal bars) +- container coverage page: "volume" chart (like years) +- coverage search page: enter release query, show coverage stats/graphs + => link to "missing releases" search query (TODO) + => same basic view as container summary page + => parameter: by year or past 60 days +- show preservation status in release search results (per hit) (TODO) +- high-level coverage summary pages (TODO) + => published papers since 1900 + => "recent" (last 60 days, by day not year) + +Stretch changes: + +- update front page with a static (SVG) coverage diagram +- incorporate summaries in container index (new "enhanced container" index?), + for display in search results (along with total count (TODO) + => also periodically run a script to update them (daily? weekly?) + => calculate these at index update time + => rough stats by type: paper, dataset, doc, etc + +Not coverage-specific, but update at the same time: + +- show summary of release types on container page (as bar, top 4-5 release types) +- list keepers on container and coverage page (TODO) + => with some link? KBART, issn.org keepers + +## New Views/URLs + +### Container Views + +`GET /container//stats.json` + +Existing endpoint updated with new stats: + +- preservation aggregation +- release type aggregation + +`GET /container//preservation_by_year.json` +`GET /container//preservation_by_year.svg` + +`GET /container//preservation_by_volume.json` +`GET /container//preservation_by_volume.svg` + +`GET /container//preservation_by_type.json` + +### Coverage + +`GET /coverage`: high-level summary (TODO) +`GET /coverage/search`: like `/release/search`, but shows aggregates not hits + +## Coverage Logic and Fulltext Display + +Current preservation codes (no change): + +- `bright` (green): in IA or an open archive like arxiv.org, Pubmed Central +- `dark` (dark green): preserved by a known Keeper, but not in a "bright" archive +- `shadows_only` (grey/red): not in "bright" or "dark" archive, but in a shadow library +- `none` (red): no known preservation + +Going to update preservation code logic of releases that have no file in IA +(yet), but do have an arxiv or pubmed central identifier: + +- has arxiv id: label as "bright" + => and show fulltext link to arxiv +- pmcid and more than 12 months old: "bright" + => and show fulltext link to pmc +- pmcid and less than 12 months old: "dark" diff --git a/proposals/2020-07-29_toml_editing.md b/proposals/2020-07-29_toml_editing.md new file mode 100644 index 00000000..bdb8c12f --- /dev/null +++ b/proposals/2020-07-29_toml_editing.md @@ -0,0 +1,43 @@ + +status: implemented + +TOML Editing of Entity Metadata +=============================== + +Goal is to enable full-power editing through the web interface, of the raw +entity schema and "extra" metadata, for all entity types. + +A side-effect of this should be enabling redirect editing between entities, as +well as "undeleting" or other state transitions (other than deletion). + +Plan: + +- find and add a toml transform library to pipenv deps. preferably with good + human-readable parsing errors +- TOML/JSON transform helpers, with tests +- implement generic TOML entity editing view (HTML) +- implement generic TOML entity editing endpoints + +Some metadata fields are removed before displaying TOML to edit. 
For example, +the ident, revision, and redirect fields for 'active' entities. It should still +be possible to do redirects by entering only the redirect field in the TOML +form. + +## UI Integration + +For existing edit forms, add a link to the "advanced" editing option. + +For endpoints without a form-based option (yet), do an HTTP redirect to the +TOML editing option. + +## New Webface Views + +`//create/toml` + GET: display template to be filled in + POST: form submit for creation +`///edit/toml` + GET: transform entity for editing + POST: form submit for edit +`/editgroup////edit/toml` + GET: transform entity for editing + POST: form submit for edit diff --git a/proposals/2020-08-04_grouped_release_exports.md b/proposals/2020-08-04_grouped_release_exports.md new file mode 100644 index 00000000..c8eacfb9 --- /dev/null +++ b/proposals/2020-08-04_grouped_release_exports.md @@ -0,0 +1,62 @@ + +status: implemented + +Grouped Release Exports +======================= + +Want to have rich "work" entity dumps, without duplicating with enriched +release dumps. The motivation for this is to do work-level processing of +releases. For example, transform into scholar.archive.org (indexed at work +level), bulk downloading fulltext (only need "best copy for entire work"), +citation analysis, etc. + +One elegant way to do this would be having release exports be sorted by +`work_id`, with all release entities with the same `work_id` contiguous (but +not all on the same line). + +Plan is to update fatcat-export tool (rust) to read in an ident list with +`work_id` included (and sorted by the `work_id`). + +## Rust Implementation + +Remembering that `fatcat-export` operates on lines, and uses channels and +worker threads for processing and serialization, we can't just assume that the +output lines will be in the same order as the input lines (eg, worker threads +can shift order). To preserve order, we will add a new dump mode which operates +on batches (`Vec`) of rows instead, and write the batches contiguously. The +batches themselves may end up out of order, but that is fine. + +## SQL Ident Dump + + +Database timing (running 2020-08-04 on fatcat-qa): + + COPY (SELECT id, rev_id, redirect_id FROM release_ident WHERE is_live=true) TO '/tmp/fatcat_ident_releases.tsv' WITH NULL ''; + => COPY 142635153 + => Time: 143264.483 ms (02:23.264) + + COPY ( + SELECT + release_ident.id, + release_ident.rev_id, + release_ident.redirect_id, + release_rev.work_ident_id + FROM + release_ident + LEFT JOIN release_rev ON release_ident.rev_id = release_rev.id + WHERE + release_ident.is_live=true + AND release_ident.redirect_id IS NULL + AND release_ident.rev_id IS NOT NULL + ORDER BY + release_rev.work_ident_id ASC NULLS LAST + ) TO '/tmp/fatcat_ident_releases_by_work.tsv' WITH NULL ''; + => COPY 142635147 + => Time: 610540.112 ms (10:10.540) + +Much slower, but not a blocking issue. Apparently postgresql will do a full +rowscan instead of using the existing `work_id` sorted index on `release_rev` +when going of a large fraction of the table (here almost the entire table) +because it thinks the read will be so much faster. I don't think this is true +with SSDs? Haven't actually confirmed this is the behavior, just assuming based +on postgresql docs. 
diff --git a/proposals/2020-08-07_dblp.md b/proposals/2020-08-07_dblp.md new file mode 100644 index 00000000..b6c734a4 --- /dev/null +++ b/proposals/2020-08-07_dblp.md @@ -0,0 +1,159 @@ + +status: implemented + +DBLP Metadata Import +==================== + +~5.3 million publications, ~2.6 million authors, ~5k conferences, ~1.7k journals. + +All metadata is explicitly CC-0 + +Container metadata: + +- journals: match via ISSN, if there is one +- create containers for all conferences (at least series), and make a series/container/dblp/name/publisher mapping +- make some decision about conference series vs. conference instance vs. published proceedings + +Release metadata: + +x add `dblp` as a release identifier type to fatcat schema +- look at CSL fields: conference series? book series? etc +- if arxiv.org, skip import for now + => though note could disambiguate authors +- if has a DOI: fetch fatcat record. if no stage/type/`container_id`, update record +- always fuzzy match? experiment first + +Author metadata won't be imported in this iteration. + +Fulltext ingest: + +- XML to ingest requests +- article key, DOI, arxiv, other repo identifiers + +## Plan + +x get martin review of this plan +x read full XML DTD +x scrape container metadata (for ~6k containers): ISSN, Wikidata QID, name + => selectolax? + => title, issn, wikidata +x implement basic release import, with tests (no container/creator linking) + => surface any unexpected issues +x estimate number of entities with/without external identifier (DOI) + Counter({'total': 7953365, 'has-doi': 4277307, 'skip': 2953841, 'skip-key-type': 2640968, 'skip-arxiv-corr': 312872, 'skip-title': 1, 'insert': 0, 'update': 0, 'exists': 0}) +/ update container and creator schemas to have lookup-able dblp identifiers (creator:`dblp_pid`, container:`dblp_prefix`) +. run orcid import/update of creators +- container creator/update for `dblp_prefix` + => chocula import first? +- investigate journal+conference ISSN mapping + + +## Creator Metadata + +There is a "person ID" system. These can be just numbers (new records), just +names, or alphanumeric disambiguated names. + + +## Container Metadata + +Types: + +- journal +- book-series +- proceedings +- conference-series (?) + +TBD: + +- conference series or individual instances? if series, can use volume/year to + distinguish, seems best +- workshops as separate containers? probably yes +- proceedings vs. papers vs. abstracts? + +Going to have many containers with no ISSN. Do we need dblp-specific lookup? Or +do a special-case mapping file for expediency? + +Journals do not have explicit entities in the database. They do have names, in +the form of URL prefix to article keys. Additionally, there are (often?) HTML +pages with things like ISSN ("BHT" files). There may be a dump of these? + + +## Release Metadata + +Schema is basically BibTeX. + +Types: + +- article -> journal-article (if 'journal'), article, others +- inproceedings -> conference-paper +- proceedings -> (container) +- book -> book +- incollection -> chapter (or part?) +- phdthesis -> thesis +- mastersthesis -> thesis +- www + => often a person, if key starts with "homepages" +- data (?) +- publtype sub-type: + encyclopedia/"encyclopedia entry" -> entry-encyclopedia (?) 
+ informal/"informal publication" (greylit)
+ edited (editorial or news)
+ survey (survey/review article)
+ data (dataset)
+ software
+ withdrawn
+
+Future: person
+
+Fields:
+
+- element type (one of the above)
+- key (eg, "journals/cacm/Szalay08")
+- title
+ => may contain , , ,
+- author (multiple; each a single string)
+ => may have HTML entities
+ => may have a number at the end, to aid with identifier creation
+ => orcid
+- editor (same as author)
+ => orcid
+- journal (abbrev?)
+- volume, pages, number (number -> issue)
+- publisher
+- year
+ => for conferences, year of conference not of publication
+- month
+- crossref (from inproceedings to specific proceedings volume)
+- booktitle
+ => for inproceedings, this is the name of conference or workshop. acronym.
+- isbn
+- ee (electronic edition; often DOI?)
+ => in some cases a "local" URL
+ => publisher URL; often DOI
+ => type attr
+- url
+ => dblp internal link to table-of-contents
+- publnr
+ => alternative identifier
+- note
+ => for persons (www), may be name in non-Latin character set
+
+- series: ?
+ => has href attr
+- cite: ?
+- school: ?
+- chapter: ?
+
+Notable CSL "extra" fields:
+ => 'event': name of conference/workshop
+ => 'event-place': location of conference/workshop
+ => 'collection-title' (eg, book series)
+ => 'container-title' (eg, book for a chapter)
+
+
+## Resources
+
+"DBLP — Some Lessons Learned"
+https://dblp.org/xml/docu/dblpxml.pdf
+
+https://blog.dblp.org/2020/08/18/new-dblp-url-scheme-and-api-changes/
diff --git a/proposals/2020-08_bulk_citation_graph.md b/proposals/2020-08_bulk_citation_graph.md
new file mode 100644
index 00000000..a6cce256
--- /dev/null
+++ b/proposals/2020-08_bulk_citation_graph.md
@@ -0,0 +1,162 @@
+
+status: mostly implemented (refcat, mostly)
+
+Bulk Citation Graph
+===================
+
+This is one design proposal for how to scale up citation graph potential-match
+generation, as well as for doing fuzzy matching of other types at scale (eg,
+self-matches to group works within fatcat). Not proposing that we have to do
+things this way, this is just one possible option.
+
+This current proposal has the following assumptions:
+
+- 100-200 million "target" works
+- 1-3 billion structured references to match
+- references mostly coming from GROBID, Crossref, and PUBMED
+- paper-like works (could include books; references to web pages etc to be
+ handled separately)
+- static snapshots are sufficient
+- python fuzzy match code works and is performant enough to verify matches
+ within small buckets/pools
+
+Additional major "source" and "target" works to think about:
+
+- wikipedia articles as special "source" works. should work fine with this
+ system. also most wikipedia paper references already have persistent
+ identifiers
+- webpages as special "target" works, where we would want to do a CDX lookup or
+ something. normalize URL (SURT?) to generate reverse index ("all works citing
+ a given URL")
+- openlibrary books as "target" works. also should work fine with this system
+
+The high-level proposal is:
+
+- transform and normalize basic metadata for both citations and references (eg,
+ sufficient fields for fuzzy verification), and store only this minimal subset
+- as a first pass, if external identifiers exist in the "source" reference set,
+ do lookups against fatcat API and verify match on any resulting hits. remove
+ these "source" matches from the next stages.
+- generate one or more fixed-size hash identifiers (~64 bit) for each citation
+ and target, and use these as a key to bucket works. this would not be hashes
+ over the entire record metadata, only small subsets
+- sort the "target" works into an index for self-grouping, lookups, and
+ iteration. each record may appear multiple times if there are multiple hash
+ types
+- sort the "source" references into an index and run a merge-sort on bucket
+ keys against the "target" index to generate candidate match buckets
+- run python fuzzy match code against the candidate buckets, outputting a status
+ for each reference input and a list of all strong matches
+- re-sort successful matches and index by both source and target identifiers as
+ output citation graph
+
+## Record Schema
+
+Imagining a subset of fatcat release entity fields, perhaps stored in a binary
+format like protobuf for size efficiency. Or a SQL table or columnar
+datastore. If we used JSON we would want to use short key names to reduce key
+storage overhead. Total data set size will impact performance because of disk
+I/O, caching, etc. I think this may hold even with on-disk compression?
+
+Would do a pass of normalization ahead of time, like aggressive string
+cleaning, so we don't need to do this per-fuzzy-verify attempt.
+
+Metadata subset might include:
+
+- `title`
+- `subtitle`
+- `authors` (surnames? structured/full names? original/alternate/aliases?)
+- `year`
+- `container_name` (and abbreviation?)
+- `volume`, `issue`, `pages`
+- `doi`, `pmid`, `arxiv_id` (only ext ids used in citations?)
+- `release_ident` (source or target)
+- `work_ident` (source or target)
+- `release_stage` (target only?)
+- `release_type` (target only?)
+
+Plus any other fields helpful in fuzzy verification.
+
+These records can be transformed into python release entities with partial
+metadata, then passed to the existing fuzzy verification code path.
+
+## Hashing Schemes
+
+Hashing schemes could be flexible. Multiple could be used at the same time, and
+we can change schemes over time. Each record could be transformed to one or
+more hashes. Ideally we could use the top couple bits of the hash to indicate
+the hash type.
+
+An initial proposal would be to use first and last N tokens of just the title.
+In this scheme we would normalize and tokenize the title, remove a few stopwords
+(eg, tokens sometimes omitted in citation or indexing). If the title is shorter
+than 3 tokens pad with blank tokens. Perhaps do a filter here against
+inordinately popular titles or other bad data. Then use some fast
+non-cryptographic hash with fixed size output (64-bits). Do this for both the
+first and last three tokens; set the top bit to "0" for hash of the first three
+tokens, or "1" for the hash of the last three tokens. Emit two key/value rows
+(eg, TSV?), with the same values but different hashes.
+
+Alternatively, in SQL, index a single row on the two different hash types.
+
+Possible alternative hash variations we could experiment with:
+
+- take the first 10 normalized characters, removing whitespace, and hash that
+- include first 3 title tokens, then 1 token of the first author's surname
+- normalize and hash entire title
+- concatenate subtitle to title or not
+
+Two advantages of hashing are:
+
+- we can shard/partition based on the key.
this would not be the case if the + keys were raw natural language tokens +- advantages from fixed-size datatypes (eg, uint64) + +## Bulk Joining + +"Target" index could include all hash types in a single index. "Source" index +in bulk mode could be either all hash types concatenated together and run +together, then re-sort and uniq the output (eg, by release-to-release pairings) +to remove dupes. In many cases this would have the overhead of computing the +fuzzy verification multiple times redundantly (but only a small fixed maximum +number of duplicates). Alternatively, with greater pipeline complexity, could +do an initial match on one hash type, then attempt matching (eg, recompute and +sort and join) for the other hash types only for those which did not match. + +## Citation Graph Index Output + +Imagining successful match rows to look like: + +- `match_status` (eg, strong/weak) +- `source_release_ident` +- `source_release_stage` +- `ref_key` (optional? or `ref_index`?) +- `source_work_ident` +- `target_release_ident` +- `target_release_stage` +- `target_work_ident` + +Would run a sort/uniq on `(source_release_ident,target_release_ident)`. + +Could filter by stages, then sort/uniq work-to-work counts to generate simple +inbound citation counts for each target work. + +Could sort `target_work_ident` and generate groups of inbound works ("best +release per work") citing that work. Then do fast key lookups to show +"works/releases citing this work/release". + +## To Be Decided + +- bulk datastore/schema: just TSV files sorted by key column? if protobuf, how + to encode? what about SQL? parquet/arrow? +- what datastores would allow fast merge sorts? do SQL engines (parquet) + actually do this? +- would we need to make changes to be more compatible with something like + opencitations? Eg, I think they want DOI-to-DOI citations; having to look + those up again from fatcat API would be slow +- should we do this in a large distributed system like spark (maybe pyspark for + fuzzy verification) or stick to simple UNIX/CLI tools? +- wikipedia articles as sources? +- openlibrary identifiers? +- archive.org as additional identifiers? + diff --git a/proposals/20200103_py37_refactors.md b/proposals/20200103_py37_refactors.md deleted file mode 100644 index f0321b33..00000000 --- a/proposals/20200103_py37_refactors.md +++ /dev/null @@ -1,101 +0,0 @@ - -status: planning - -If we update fatcat python code to python3.7, what code refactoring changes can -we make? We currently use/require python3.5. - -Nice features in python3 I know of are: - -- dataclasses (python3.7) -- async/await (mature in python3.7?) -- type annotations (python3.5) -- format strings (python3.6) -- walrus assignment (python3.8) - -Not sure if the walrus operator is worth jumping all the way to python3.8. - -While we might be at it, what other superficial factorings might we want to do? - -- strict lint style (eg, maximum column width) with `black` (python3.6) -- logging/debugging/verbose -- type annotations and checking -- use named dicts or structs in place of dicts - -## Linux Distro Support - -The default python version shipped by current and planned linux releases are: - -- ubuntu xenial 16.04 LTS: python3.5 -- ubuntu bionic 18.04 LTS: python3.6 -- ubuntu focal 20.04 LTS: python3.8 (planned) -- debian buster 10 2019: python3.7 - -Python 3.7 is the default in debian buster (10). - -There are apt PPA package repositories that allow backporting newer pythons to -older releases. 
As far as I know this is safe and doesn't override any system -usage if we are careful not to set the defaults (aka, `python3` command should -be the older version unless inside a virtualenv). - -It would also be possible to use `pyenv` to have `virtualenv`s with custom -python versions. We should probably do that for OS X and/or windows support if -we wanted those. But having a system package is probably a lot faster to -install. - -## Dataclasses - -`dataclasses` are a user-friendly way to create struct-like objects. They are -pretty similar to the existing `namedtuple`, but can be mutable and have -methods attached to them (they are just classes), plus several other usability -improvements. - -Most places we are throwing around dicts with structure we could be using -dataclasses instead. There are some instances of this in fatcat, but many more -in sandcrawler. - -## Async/Await - -Where might we actually use async/await? I think more in sandcrawler than in -the python tools or web apps. The GROBID, ingest, and ML workers in particular -should be async over batches, as should all fetches from CDX/wayback. - -Some of the kafka workers *could* be aync, but i'm not sure how much speedup -there would actually be. For example, the entity updates worker could fetch -entities for an editgroup concurrently. - -Inserts (importers) should probably mostly happen serially, at least the kafka -importers, one editgroup at a time, so progress is correctly recorded in kafka. -Parallelization should probably happen at the partition level; would need to -think through whether async would actually help with code simplicity vs. thread -or process parallelization. - -## Type Annotations - -The meta-goals of (gradual) type annotations would be catching more bugs at -development time, and having code be more self-documenting and easier to -understand. - -The two big wins I see with type annotation would be having annotations -auto-generated for the openapi classes and API calls, and to make string -munging in importer code less buggy. - -## Format Strings - -Eg, replace code like: - - "There are {} out of {} objects".format(found, total) - -With: - - f"There are {found} out of {total} objects" - -## Walrus Operator - -New operator allows checking and assignment together: - - if (n := len(a)) > 10: - print(f"List is too long ({n} elements, expected <= 10)") - -I feel like we would actually use this pattern *a ton* in importer code, where -we do a lot of lookups or cleaning then check if we got a `None`. - diff --git a/proposals/20200702_coverage_ui.md b/proposals/20200702_coverage_ui.md deleted file mode 100644 index b2dfc2f6..00000000 --- a/proposals/20200702_coverage_ui.md +++ /dev/null @@ -1,80 +0,0 @@ - -status: in progress - -Coverage UI/UX Enhancements -=========================== - -Want to generally enhance the use case of fatcat as a tool for exploring -preservation coverage of groups of papers. 
- -Specific changes: - -- make coverage bar graphs and coverage-by-year charts use preservation codes - instead of the current categories -- container coverage page should have bar coverage by release type (horizontal bars) -- container coverage page: "volume" chart (like years) -- coverage search page: enter release query, show coverage stats/graphs - => link to "missing releases" search query (TODO) - => same basic view as container summary page - => parameter: by year or past 60 days -- show preservation status in release search results (per hit) (TODO) -- high-level coverage summary pages (TODO) - => published papers since 1900 - => "recent" (last 60 days, by day not year) - -Stretch changes: - -- update front page with a static (SVG) coverage diagram -- incorporate summaries in container index (new "enhanced container" index?), - for display in search results (along with total count (TODO) - => also periodically run a script to update them (daily? weekly?) - => calculate these at index update time - => rough stats by type: paper, dataset, doc, etc - -Not coverage-specific, but update at the same time: - -- show summary of release types on container page (as bar, top 4-5 release types) -- list keepers on container and coverage page (TODO) - => with some link? KBART, issn.org keepers - -## New Views/URLs - -### Container Views - -`GET /container//stats.json` - -Existing endpoint updated with new stats: - -- preservation aggregation -- release type aggregation - -`GET /container//preservation_by_year.json` -`GET /container//preservation_by_year.svg` - -`GET /container//preservation_by_volume.json` -`GET /container//preservation_by_volume.svg` - -`GET /container//preservation_by_type.json` - -### Coverage - -`GET /coverage`: high-level summary (TODO) -`GET /coverage/search`: like `/release/search`, but shows aggregates not hits - -## Coverage Logic and Fulltext Display - -Current preservation codes (no change): - -- `bright` (green): in IA or an open archive like arxiv.org, Pubmed Central -- `dark` (dark green): preserved by a known Keeper, but not in a "bright" archive -- `shadows_only` (grey/red): not in "bright" or "dark" archive, but in a shadow library -- `none` (red): no known preservation - -Going to update preservation code logic of releases that have no file in IA -(yet), but do have an arxiv or pubmed central identifier: - -- has arxiv id: label as "bright" - => and show fulltext link to arxiv -- pmcid and more than 12 months old: "bright" - => and show fulltext link to pmc -- pmcid and less than 12 months old: "dark" diff --git a/proposals/20200729_toml_editing.md b/proposals/20200729_toml_editing.md deleted file mode 100644 index bdb8c12f..00000000 --- a/proposals/20200729_toml_editing.md +++ /dev/null @@ -1,43 +0,0 @@ - -status: implemented - -TOML Editing of Entity Metadata -=============================== - -Goal is to enable full-power editing through the web interface, of the raw -entity schema and "extra" metadata, for all entity types. - -A side-effect of this should be enabling redirect editing between entities, as -well as "undeleting" or other state transitions (other than deletion). - -Plan: - -- find and add a toml transform library to pipenv deps. preferably with good - human-readable parsing errors -- TOML/JSON transform helpers, with tests -- implement generic TOML entity editing view (HTML) -- implement generic TOML entity editing endpoints - -Some metadata fields are removed before displaying TOML to edit. 
For example, -the ident, revision, and redirect fields for 'active' entities. It should still -be possible to do redirects by entering only the redirect field in the TOML -form. - -## UI Integration - -For existing edit forms, add a link to the "advanced" editing option. - -For endpoints without a form-based option (yet), do an HTTP redirect to the -TOML editing option. - -## New Webface Views - -`//create/toml` - GET: display template to be filled in - POST: form submit for creation -`///edit/toml` - GET: transform entity for editing - POST: form submit for edit -`/editgroup////edit/toml` - GET: transform entity for editing - POST: form submit for edit diff --git a/proposals/20200804_grouped_release_exports.md b/proposals/20200804_grouped_release_exports.md deleted file mode 100644 index d75ba687..00000000 --- a/proposals/20200804_grouped_release_exports.md +++ /dev/null @@ -1,60 +0,0 @@ - -Grouped Release Exports -======================= - -Want to have rich "work" entity dumps, without duplicating with enriched -release dumps. The motivation for this is to do work-level processing of -releases. For example, transform into scholar.archive.org (indexed at work -level), bulk downloading fulltext (only need "best copy for entire work"), -citation analysis, etc. - -One elegant way to do this would be having release exports be sorted by -`work_id`, with all release entities with the same `work_id` contiguous (but -not all on the same line). - -Plan is to update fatcat-export tool (rust) to read in an ident list with -`work_id` included (and sorted by the `work_id`). - -## Rust Implementation - -Remembering that `fatcat-export` operates on lines, and uses channels and -worker threads for processing and serialization, we can't just assume that the -output lines will be in the same order as the input lines (eg, worker threads -can shift order). To preserve order, we will add a new dump mode which operates -on batches (`Vec`) of rows instead, and write the batches contiguously. The -batches themselves may end up out of order, but that is fine. - -## SQL Ident Dump - - -Database timing (running 2020-08-04 on fatcat-qa): - - COPY (SELECT id, rev_id, redirect_id FROM release_ident WHERE is_live=true) TO '/tmp/fatcat_ident_releases.tsv' WITH NULL ''; - => COPY 142635153 - => Time: 143264.483 ms (02:23.264) - - COPY ( - SELECT - release_ident.id, - release_ident.rev_id, - release_ident.redirect_id, - release_rev.work_ident_id - FROM - release_ident - LEFT JOIN release_rev ON release_ident.rev_id = release_rev.id - WHERE - release_ident.is_live=true - AND release_ident.redirect_id IS NULL - AND release_ident.rev_id IS NOT NULL - ORDER BY - release_rev.work_ident_id ASC NULLS LAST - ) TO '/tmp/fatcat_ident_releases_by_work.tsv' WITH NULL ''; - => COPY 142635147 - => Time: 610540.112 ms (10:10.540) - -Much slower, but not a blocking issue. Apparently postgresql will do a full -rowscan instead of using the existing `work_id` sorted index on `release_rev` -when going of a large fraction of the table (here almost the entire table) -because it thinks the read will be so much faster. I don't think this is true -with SSDs? Haven't actually confirmed this is the behavior, just assuming based -on postgresql docs. 
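The practical consumer-side benefit of the work-sorted dump is that release batches can be built in a single streaming pass. Below is a minimal Python sketch of that grouping step; the file name and the column order (release ident, rev id, redirect id, work ident) are inferred from the `COPY` statement above, so treat them as assumptions rather than a fixed format:

    import csv
    import itertools

    def iter_work_batches(path="fatcat_ident_releases_by_work.tsv"):
        """Yield (work_ident, [release_idents]) batches from the work-sorted ident dump."""
        with open(path) as f:
            reader = csv.reader(f, delimiter="\t")
            # groupby only emits contiguous runs, which is exactly what the
            # ORDER BY work_ident in the COPY above guarantees
            for work_ident, rows in itertools.groupby(reader, key=lambda row: row[3]):
                yield work_ident, [row[0] for row in rows]

    if __name__ == "__main__":
        for work_ident, release_idents in iter_work_batches():
            print(work_ident, len(release_idents))

This mirrors what the batch-oriented rust dump mode preserves: batches may land out of order in the output, but each work's releases stay contiguous.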
diff --git a/proposals/20200807_dblp.md b/proposals/20200807_dblp.md deleted file mode 100644 index ba5e76dc..00000000 --- a/proposals/20200807_dblp.md +++ /dev/null @@ -1,159 +0,0 @@ - -status: in progress - -DBLP Metadata Import -==================== - -~5.3 million publications, ~2.6 million authors, ~5k conferences, ~1.7k journals. - -All metadata is explicitly CC-0 - -Container metadata: - -- journals: match via ISSN, if there is one -- create containers for all conferences (at least series), and make a series/container/dblp/name/publisher mapping -- make some decision about conference series vs. conference instance vs. published proceedings - -Release metadata: - -x add `dblp` as a release identifier type to fatcat schema -- look at CSL fields: conference series? book series? etc -- if arxiv.org, skip import for now - => though note could disambiguate authors -- if has a DOI: fetch fatcat record. if no stage/type/`container_id`, update record -- always fuzzy match? experiment first - -Author metadata won't be imported in this iteration. - -Fulltext ingest: - -- XML to ingest requests -- article key, DOI, arxiv, other repo identifiers - -## Plan - -x get martin review of this plan -x read full XML DTD -x scrape container metadata (for ~6k containers): ISSN, Wikidata QID, name - => selectolax? - => title, issn, wikidata -x implement basic release import, with tests (no container/creator linking) - => surface any unexpected issues -x estimate number of entities with/without external identifier (DOI) - Counter({'total': 7953365, 'has-doi': 4277307, 'skip': 2953841, 'skip-key-type': 2640968, 'skip-arxiv-corr': 312872, 'skip-title': 1, 'insert': 0, 'update': 0, 'exists': 0}) -/ update container and creator schemas to have lookup-able dblp identifiers (creator:`dblp_pid`, container:`dblp_prefix`) -. run orcid import/update of creators -- container creator/update for `dblp_prefix` - => chocula import first? -- investigate journal+conference ISSN mapping - - -## Creator Metadata - -There is a "person ID" system. These can be just numbers (new records), just -names, or alphanumeric disambiguated names. - - -## Container Metadata - -Types: - -- journal -- book-series -- proceedings -- conference-series (?) - -TBD: - -- conference series or individual instances? if series, can use volume/year to - distinguish, seems best -- workshops as separate containers? probably yes -- proceedings vs. papers vs. abstracts? - -Going to have many containers with no ISSN. Do we need dblp-specific lookup? Or -do a special-case mapping file for expediency? - -Journals do not have explicit entities in the database. They do have names, in -the form of URL prefix to article keys. Additionally, there are (often?) HTML -pages with things like ISSN ("BHT" files). There may be a dump of these? - - -## Release Metadata - -Schema is basically BibTeX. - -Types: - -- article -> journal-article (if 'journal'), article, others -- inproceedings -> conference-paper -- proceedings -> (container) -- book -> book -- incollection -> chapter (or part?) -- phdthesis -> thesis -- mastersthesis -> thesis -- www - => often a person, if key starts with "homepages" -- data (?) -- publtype sub-type: - encyclopedia/"encyclopedia entry" -> entry-encyclopedia (?) 
- informal/"informal publication" (greylit) - edited (editorial or news) - survey (survey/review article) - data (dataset) - software - withdrawn - -Future: person - -Fields: - -- element type (one of the above) -- key (eg, "journals/cacm/Szalay08") -- title - => may contain , , , -- author (multiple; each a single string) - => may have HTML entities - => may have a number at the end, to aid with identifier creation - => orcid -- editor (same as author) - => orcid -- journal (abbrev?) -- volume, pages, number (number -> issue) -- publisher -- year - => for conferences, year of conference not of publication -- month -- crossref (from inproceedings to specific proceedings volume) -- booktitle - => for inproceedings, this is the name of conference or workshop. acronym. -- isbn -- ee (electronic edition; often DOI?) - => in some cases a "local" URL - => publisher URL; often DOI - => type attr -- url - => dblp internal link to table-of-contents -- publnr - => alternative identifier -- note - => for persons (www), may be name in non-Latin character set - -- series: ? - => has href attr -- cite: ? -- school: ? -- chapter: ? - -Notable CSL "extra" fields: - => 'event': name of conference/workshop - => 'event-place': location of conference/workshop - => 'collection-title' (eg, book series) - => 'container-title' (eg, book for a chapter) - - -## Resources - -"DBLP — Some Lessons Learned" -https://dblp.org/xml/docu/dblpxml.pdf - -https://blog.dblp.org/2020/08/18/new-dblp-url-scheme-and-api-changes/ diff --git a/proposals/202008_bulk_citation_graph.md b/proposals/202008_bulk_citation_graph.md deleted file mode 100644 index 65db0d94..00000000 --- a/proposals/202008_bulk_citation_graph.md +++ /dev/null @@ -1,160 +0,0 @@ - -status: brainstorm - - -This is one design proposal for how to scale up citation graph potential-match -generation, as well as for doing fuzzy matching of other types at scale (eg, -self-matches to group works within fatcat). Not proposiing that we have to do -things this way, this is just one possible option. - -This current proposal has the following assumptions: - -- 100-200 million "target" works -- 1-3 billion structured references to match -- references mostly coming from GROBID, Crossref, and PUBMED -- paper-like works (could include books; references to web pages etc to be - handled separately) -- static snapshots are sufficient -- python fuzzy match code works and is performant enough to verify matches - within small buckets/pools - -Additional major "source" and "target" works to think about: - -- wikipedia articles as special "source" works. should work fine with this - system. also most wikipedia paper references already have persistent - identifiers -- webpages as special "target" works, where we would want to do a CDX lookup or - something. normalize URL (SURT?) to generate reverse index ("all works citing - a given URL") -- openlibrary books as "target" works. also should work fine with this system - -The high-level prosposal is: - -- transform and normalize basic metadata for both citations and reference (eg, - sufficient fields for fuzzy verification), and store only this minimal subset -- as a first pass, if external identifiers exist in the "source" reference set, - do lookups against fatcat API and verify match on any resulting hits. remove - these "source" matches from the next stages. -- generate one or more fixed-size hash identifiers (~64 bit) for each citation - and target, and use these as a key to bucket works. 
this would not be hashes - over the entire record metadata, only small subsets -- sort the "target" works into an index for self-grouping, lookups, and - iteration. each record may appear multiple times if there are multiple hash - types -- sort the "source" references into an index and run a merge-sort on bucket - keys against the "target" index to generate candidate match buckets -- run python fuzzy match code against the candidate buckets, outputting a status - for each reference input and a list of all strong matches -- resort successful matches and index by both source and target identifiers as - output citation graph - -## Record Schema - -Imaginging a subset of fatcat release entity fields, perhaps stored in a binary -format like protobuf for size efficiency. Or a SQL table or columnar -datastore. If we used JSON we would want to use short key names to reduce key -storage overhead. Total data set size will impact performance because of disk -I/O, caching, etc. I think this may hold even with on-disk compression? - -Would do a pass of normalization ahead of time, like aggressive string -cleaning, so we don't need to do this per-fuzzy-verify attempt. - -Metadata subset might include: - -- `title` -- `subtitle` -- `authors` (surnames? structured/full names? original/alternate/aliases?) -- `year` -- `container_name` (and abbreviation?) -- `volume`, `issue`, `pages` -- `doi`, `pmid`, `arxiv_id` (only ext ids used in citations?) -- `release_ident` (source or target) -- `work_ident` (source or target) -- `release_stage` (target only?) -- `release_type` (target only?) - -Plus any other fields helpful in fuzzy verification. - -These records can be transformed into python release entities with partial -metadata, then passed to the existing fuzzy verification code path. - -## Hashing Schemes - -Hashing schemes could be flexible. Multiple could be used at the same time, and -we can change schemes over time. Each record could be transformed to one or -more hashes. Ideally we could use the top couple bits of the hash to indicate -the hash type. - -An initial proposal would be to use first and last N tokens of just the title. -In this scheme would normalize and tokenize the title, remove a few stopwords -(eg, tokens sometimes omitted in citation or indexing). If the title is shorter -than 3 tokens pad with blank tokens. Perhaps do a filter here against -inordinately popular titles or other bad data. Then use some fast hash -non-cryptographic hash with fixed size output (64-bits). Do this for both the -first and last three tokens; set the top bit to "0" for hash of the first three -tokens, or "1" for the hash of the last three tokens. Emit two key/value rows -(eg, TSV?), with the same values but different hashes. - -Alternatively, in SQL, index a single row on the two different hash types. - -Possible alternative hash variations we could experiment with: - -- take the first 10 normalized characters, removing whitespace, and hash that -- include first 3 title tokens, then 1 token of the first author's surname -- normalize and hash entire title -- concatenate subtitle to title or not - -Two advantages of hashing are: - -- we can shard/partition based on the key. this would not be the case if the - keys were raw natural language tokens -- advantages from fixed-size datatypes (eg, uint64) - -## Bulk Joining - -"Target" index could include all hash types in a single index. 
"Source" index -in bulk mode could be either all hash types concatenated together and run -together, then re-sort and uniq the output (eg, by release-to-release pairings) -to remove dupes. In many cases this would have the overhead of computing the -fuzzy verification multiple times redundantly (but only a small fixed maximum -number of duplicates). Alternatively, with greater pipeline complexity, could -do an initial match on one hash type, then attempt matching (eg, recompute and -sort and join) for the other hash types only for those which did not match. - -## Citation Graph Index Output - -Imagining successful match rows to look like: - -- `match_status` (eg, strong/weak) -- `source_release_ident` -- `source_release_stage` -- `ref_key` (optional? or `ref_index`?) -- `source_work_ident` -- `target_release_ident` -- `target_release_stage` -- `target_work_ident` - -Would run a sort/uniq on `(source_release_ident,target_release_ident)`. - -Could filter by stages, then sort/uniq work-to-work counts to generate simple -inbound citation counts for each target work. - -Could sort `target_work_ident` and generate groups of inbound works ("best -release per work") citing that work. Then do fast key lookups to show -"works/releases citing this work/release". - -## To Be Decided - -- bulk datastore/schema: just TSV files sorted by key column? if protobuf, how - to encode? what about SQL? parquet/arrow? -- what datastores would allow fast merge sorts? do SQL engines (parquet) - actually do this? -- would we need to make changes to be more compatible with something like - opencitations? Eg, I think they want DOI-to-DOI citations; having to look - those up again from fatcat API would be slow -- should we do this in a large distributed system like spark (maybe pyspark for - fuzzy verification) or stick to simple UNIX/CLI tools? -- wikipedia articles as sources? -- openlibrary identifiers? -- archive.org as additional identifiers? - diff --git a/proposals/2020_client_cli.md b/proposals/2020_client_cli.md index 01d190a8..82169eb4 100644 --- a/proposals/2020_client_cli.md +++ b/proposals/2020_client_cli.md @@ -1,6 +1,5 @@ -status: prototyping, side-project - +status: implemented (fatcat-cli) Fatcat CLI Client =================== diff --git a/proposals/2020_fuzzy_matching.md b/proposals/2020_fuzzy_matching.md index e84c2bd2..60a4b6ac 100644 --- a/proposals/2020_fuzzy_matching.md +++ b/proposals/2020_fuzzy_matching.md @@ -1,5 +1,5 @@ -Status: planned +Status: mostly implemented (fuzzycat) Bibliographic Entity Fuzzy Match and Verification ==================================================== diff --git a/proposals/2020_ir_importer.spn b/proposals/2020_ir_importer.spn new file mode 100644 index 00000000..ad561d7b --- /dev/null +++ b/proposals/2020_ir_importer.spn @@ -0,0 +1,25 @@ + +status: brainstorm + +Institutional Repository Importer +================================= + +Want to import content from IRs. Same general workflow for CORE, SHARE, BASE, +other aggregators. + +Filter input to only works with known/ingested fulltext. + +Lookup file by hash. If found, skip for now. In future might do +mapping/matching. + +Lookup by primary id (eg, CORE ident). If existing, can skip if it has file, or +add file/location directly. + +Two indirect lookups: by external ident (DOI, PMID), or fuzzy search match. If +we get either of these, want to do release/work grouping correctly. + +1. if we are certain of IR copy stage, then compare with existing release, + and/or lookup entire work for releases with same stage. 
update release or + add new release under same work. +2. not sure of IR copy stage. guess stage from sherpa/romeo color and proceed + to insert/update. diff --git a/proposals/2020_metadata_cleanups.md b/proposals/2020_metadata_cleanups.md index b95f6579..bd8dd85e 100644 --- a/proposals/2020_metadata_cleanups.md +++ b/proposals/2020_metadata_cleanups.md @@ -1,5 +1,5 @@ -status: planning +status: work-in-progress This proposal tracks a batch of catalog metadata cleanups planned for 2020. diff --git a/proposals/2020_spn.md b/proposals/2020_spn.md new file mode 100644 index 00000000..6e4b736b --- /dev/null +++ b/proposals/2020_spn.md @@ -0,0 +1,27 @@ + +Status: implemented + +## Save Paper Now + +Don't require login; if not logged in, request will be logged and added eventually. + +If logged in, can add edit immediately. + +Variations: +- know URL, or guess from DOI + +Results: +- does file exist? +- did GROBID match? +- webcapture (HTML) or file? + +Form fields: +- `release_id` (implied) + + +`/editgroup// +`/release//ingest`: GET form, POST request + => editgroup + => url + => type + diff --git a/proposals/2021-01-29_citation_api.md b/proposals/2021-01-29_citation_api.md index 6379da09..4cb28079 100644 --- a/proposals/2021-01-29_citation_api.md +++ b/proposals/2021-01-29_citation_api.md @@ -1,4 +1,6 @@ +status: implemented + Describes schemas, APIs, use-cases, and data store for citation graph. ## Use Cases diff --git a/proposals/2021-03-26_journal_metadata.md b/proposals/2021-03-26_journal_metadata.md new file mode 100644 index 00000000..52827d60 --- /dev/null +++ b/proposals/2021-03-26_journal_metadata.md @@ -0,0 +1,40 @@ + +status: brainstorm + +What improved journal-level metadata could we store? + + +## Names, Aliases + +Translated names, as a dict of { lang: name } + + +## API Endpoints + +OAI-PMH endpoint and type (for ingest) + + +## Homepage URLs + + +## Fulltext URL Info + +Fulltext SURT prefix/pattern, by type and year range + + surt_prefix + url_regex + url_pattern + including ext_id pattern substitutions; can generate URL from release entity + mimetype + year_span + optional + confidence + "if not this pattern, then isn't published version" + "if matches, definitely fulltext" + "might be fulltext, might not" + etc. as a slug/code + + +## Other + +for releases, could store DOAJ access URL in release extra metadata diff --git a/proposals/2021-10-12_v04_schema_tweaks.md b/proposals/2021-10-12_v04_schema_tweaks.md new file mode 100644 index 00000000..15ca489e --- /dev/null +++ b/proposals/2021-10-12_v04_schema_tweaks.md @@ -0,0 +1,30 @@ + +Status: implemented + +## Schema Changes for v0.4 + +Small SQL and API changes. Calling these a minor-level API version increment. + +API Schema Changes: + +- release `ext_ids`: `hdl` (handle) identifier +- fileset: `mimetype` for manifest files as a field. This is a SQL schema change as well. +- container: `issne` and `issnp` as top-level fields, indexed for lookup. SQL + schema change. +- container: `publication_status` as a top-level field, to indicate "active", + "discontinued", etc. SQL schema change. 
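
To make the new lookup-able fields concrete, here is a hedged client-side sketch; the base URL, query parameter names, and identifier values are illustrative assumptions, not confirmed API behavior:

    import requests

    API = "https://api.fatcat.wiki/v0"  # assumed public API base URL

    # hypothetical release lookup by the new `hdl` (handle) external identifier
    resp = requests.get(f"{API}/release/lookup",
                        params={"hdl": "20.500.12345/example"}, timeout=10)
    print(resp.status_code, resp.json().get("ident") if resp.ok else resp.text)

    # hypothetical container lookup by the new top-level `issne` field
    resp = requests.get(f"{API}/container/lookup",
                        params={"issne": "1234-5678"}, timeout=10)
    print(resp.status_code, resp.json().get("ident") if resp.ok else resp.text)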
+ +API Endpoints: + +- `GET /editor/lookup`: editor lookup by username + +Elasticsearch Schemas: + +- release: 'hdl' identifier +- release: `container_publication_status` and `container_issns` +- release: add missing `version` field (not related to any API change) +- release: add `tags` for future extensibility +- release: `is_work_alias` boolean flag for unversioned releases which point + to the overall work, or the latest published version of the work. Included + from field with the same name in release `extra`. +- container: `publication_status` diff --git a/proposals/2021-11-17_content_scope.md b/proposals/2021-11-17_content_scope.md index 8d04808e..4c71ca97 100644 --- a/proposals/2021-11-17_content_scope.md +++ b/proposals/2021-11-17_content_scope.md @@ -1,5 +1,5 @@ -status: planned +status: work-in-progress Content Scope Fields ====================== diff --git a/proposals/20211012_v04_schema_tweaks.md b/proposals/20211012_v04_schema_tweaks.md deleted file mode 100644 index 15ca489e..00000000 --- a/proposals/20211012_v04_schema_tweaks.md +++ /dev/null @@ -1,30 +0,0 @@ - -Status: implemented - -## Schema Changes for v0.4 - -Small SQL and API changes. Calling these a minor-level API version increment. - -API Schema Changes: - -- release `ext_ids`: `hdl` (handle) identifier -- fileset: `mimetype` for manifest files as a field. This is a SQL schema change as well. -- container: `issne` and `issnp` as top-level fields, indexed for lookup. SQL - schema change. -- container: `publication_status` as a top-level field, to indicate "active", - "discontinued", etc. SQL schema change. - -API Endpoints: - -- `GET /editor/lookup`: editor lookup by username - -Elasticsearch Schemas: - -- release: 'hdl' identifier -- release: `container_publication_status` and `container_issns` -- release: add missing `version` field (not related to any API change) -- release: add `tags` for future extensibility -- release: `is_work_alias` boolean flag for unversioned releases which point - to the overall work, or the latest published version of the work. Included - from field with the same name in release `extra`. -- container: `publication_status` diff --git a/proposals/2022-01-21_read-only-db.md b/proposals/2022-01-21_read-only-db.md index 16d3d166..cd1b7ae7 100644 --- a/proposals/2022-01-21_read-only-db.md +++ b/proposals/2022-01-21_read-only-db.md @@ -1,5 +1,5 @@ -status: in-progress +status: planned Database Read-Only Mode ======================= -- cgit v1.2.3