From 8a1199d2bd1fa79391e121e537181b63025efc76 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 11 Jan 2019 19:50:19 -0800
Subject: allow null release_refs (as opposed to empty list)
---
 python/fatcat_web/templates/release_view.html | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'python')
diff --git a/python/fatcat_web/templates/release_view.html b/python/fatcat_web/templates/release_view.html
index fd86b7c9..4e24b281 100644
--- a/python/fatcat_web/templates/release_view.html
+++ b/python/fatcat_web/templates/release_view.html
@@ -143,7 +143,7 @@ Raw Object:
 {% endif %}
-{% if release.refs.size != 0 %}
+{% if release.refs != None and release.refs.size != 0 %}

References

This release citing other releases.
    -- cgit v1.2.3 From 1f4fd63ee1fb31ce657480b5170037f54b8e252f Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 14 Jan 2019 17:25:40 -0800 Subject: TODO updates --- TODO | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++++++------ python/TODO | 5 ---- 2 files changed, 73 insertions(+), 12 deletions(-) (limited to 'python') diff --git a/TODO b/TODO index fa6397eb..6417668d 100644 --- a/TODO +++ b/TODO @@ -1,13 +1,21 @@ ## In Progress +- basic python tests for editgroup, annotation, submission changes +- python tests for new autoaccept behavior +- python tests for citation table storage efficiency changes + => should there be a distinction between empty list and no references? + yes, eg if expanded or not hidden + => postgres manual checks that this is working + => also benchmark (both speed and efficiency) + ## Next Up +- "don't clobber" mode/flag for crossref import (and others?) +- update_file requires 'id'. should it be 'ident'? + => something different about file vs. release - guide updates for auth -- remove the concept of "active editgroup", and simplify autoaccept batch path - refactor webface views to use shared entity_view.html template -- fix returned error messages; should return type (shortname), and then actual - message/description - handle 'wip' status entities in web UI - elastic inserter should handle deletions and redirects; if state isn't active, delete the document @@ -15,7 +23,30 @@ they don't show up in results => refactor inserter to be a class (eg, for command line use) => end-to-end test of this behavior? -- un-accepted editgroup access: by created/updated, accepted/not +- date handling is really pretty bad for releases; mangling those Jan1/Dec31 + => elastic schema should have a year field (integer) +- document: elastic query date syntax is like: date:[2018-10-01 TO 2018-12-31] +- elastic transform should only include authors, not editors (?) 
+- webcapture timestamp schema cleanup (both CDX and base) + => dt.to_rfc3339_opts(SecondsFormat::Secs, true) + => but this is mostly buried in serialization code? +- fake DOI (use in examples): 10.5555/12345678 +- URL location duplication (especially IA/wayback) + => eg, https://fatcat.wiki/file/2g4sz57j3bgcfpwkgz5bome3re + => UNIQ index on {release_rev, url}? +- shadow library manifest importer +- import from arabesque output (eg, specific crawls) +- elastic iteration + => any_abstract broken? + => blank author names? maybe in crossref import; fatcat-api and schema + should both prevent +- handle very large author/reference lists (instead of dropping) + => https://api.crossref.org/v1/works/http://dx.doi.org/10.1007/978-3-319-46095-6_7 + => 7000+ authors (!) + +## Bugs (or at least need tests) + +- autoaccept seems to have silently not actually merged editgroup ## Ideas @@ -36,18 +67,42 @@ => /{entity}/edit/{edit_id} => /{entity}/{ident}/redirects => /{entity}/{ident}/history +- investigate data quality by looking at, eg, most popular author strings, most + popular titles, duplicated containers, etc ## Production blockers - privacy policy, and link from: create account, create edit +- update /about page - refactors and correctness in rust/TODO -- metrics -- sentry - importers: don't insert wayback links with short timestamps +## Production Sanity + +- fatcat-web is not Type=simple (systemd) +- postgresql replication +- pg_dump/load test +- haproxy somewhere/how +- logging iteration: larger journald buffers? point somewhere? + ## Metadata Import +- web.archive.org response not SHA1 match? => need /
    id_/ thing +- XML etc in metadata + => (python) tests for these! + https://qa.fatcat.wiki/release/b3a2jvhvbvc6rlbdkpw4ukuzyi + https://qa.fatcat.wiki/release/search?q=xmlns + https://qa.fatcat.wiki/release/search?q=%26amp%3B + https://qa.fatcat.wiki/release/search?q=%26gt%3B +- better/complete reltypes probably good (eg, list of IRs, academic domain) +- 'expand' in lookups (derp! for single hit lookups) +- include crossref-capitalized DOI in extra +- some "Elsevier " stuff as publisher + => also title https://fatcat.wiki/release/uyjzaq3xjnd6tcrqy3vcucczsi +- crossref import: don't store citation unstructured if len() == 0: + {"crossref": {"unstructured": ""}} - cleaning/matching: https://ftfy.readthedocs.io/en/latest/ + => and try out beautifulsoup (https://stackoverflow.com/a/34532382/4682349) - manifest: multiple URLs per SHA1 - crossref: relations ("is-preprint-of") - crossref: two phase: no citations, then matched citations (via DOI table) @@ -58,6 +113,7 @@ => at least one author (?) => make this a method on Release object => or just set release_type as "stub"? +- special "alias" DOIs... in crossref metadata? new importers: - pubmed (medline) (filtered) @@ -89,6 +145,10 @@ new importers: => or maybe rust? - bibtext (etc) export +## Metadata Harvesting + +- datacite ingest seems to have failed... got a non-HTTP-200 status code, but also "got 50 (161950 of 21084)" + ## Schema / Entity Fields - arxiv_id field (keep flip-flopping) @@ -98,10 +158,16 @@ new importers: - `retracted`, `translation`, and perhaps `corrected` as flags on releases, instead of release_status? - 'part-of' relation for releases (release to release) and possibly containers -- `container-type` field for containers (journal, conference, book series, etc) +- `container_type` field for containers (journal, conference, book series, etc) ## Other / Backburner +- fileset/webcapture webface anything +- display abstracts better. 
no hashes or metadata; prefer plain or HTML, + convert JATS if necessary +- switch from slog to simple pretty_env_log +- format returned datetimes with only second precision, not millisecond (RFC mode) + => burried in model serialization internals - refactor openapi schema to use shared response types - consider using "HTTP 202: Accepted" for entity-mutating calls - basic python hbase/elastic matcher diff --git a/python/TODO b/python/TODO index 8d9cffd3..e169267b 100644 --- a/python/TODO +++ b/python/TODO @@ -1,13 +1,8 @@ -Idea for further module simplification: move codegen'd library into it's own -directory (with it's own README, tests, etc), and reference it here via -symlink. - - schema.org metadata for releases additional tests - full object fields actually getting passed e2e (for rich_app) -- implicit editor.active_edit_group behavior - modify existing release via edit mechanism (and commit) - redirect a release to another (merge) - update (via edit) a redirect release -- cgit v1.2.3 From 7638bcfe7a31d3aef06f9112578a5ee8d55ee076 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 14 Jan 2019 17:30:51 -0800 Subject: hack pylint for unimplemented routes --- python/fatcat_web/routes.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'python') diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py index 34012c9f..a5927d9b 100644 --- a/python/fatcat_web/routes.py +++ b/python/fatcat_web/routes.py @@ -58,8 +58,8 @@ def container_create(): if k.startswith('container_'): params[k[10:]] = request.form[k] container = None - edit = api.create_container(container, params=params) - return redirect("/container/{}".format(edit.ident)) + #edit = api.create_container(container, params=params) + #return redirect("/container/{}".format(edit.ident)) @app.route('/container/lookup', methods=['GET']) def container_lookup(): @@ -209,8 +209,8 @@ def release_create(): if k.startswith('release_'): params[k[10:]] = request.form[k] release = 
None - edit = api.create_release(release, params=params) - return redirect("/release/{}".format(edit.ident)) + #edit = api.create_release(release, params=params) + #return redirect("/release/{}".format(edit.ident)) @app.route('/release//history', methods=['GET']) def release_history(ident): -- cgit v1.2.3 From 4d0f994fd3c149c7de3640c21db503cbfb0a7039 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 14 Jan 2019 18:44:16 -0800 Subject: python tests for citation efficiency --- python/tests/citation_efficiency.py | 113 ++++++++++++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) create mode 100644 python/tests/citation_efficiency.py (limited to 'python') diff --git a/python/tests/citation_efficiency.py b/python/tests/citation_efficiency.py new file mode 100644 index 00000000..fe5006cc --- /dev/null +++ b/python/tests/citation_efficiency.py @@ -0,0 +1,113 @@ + +import json +import pytest +from copy import copy + +from fatcat_client import * +from fatcat_client.rest import ApiException +from fixtures import * + + +def test_citation_indexing(api): + # indexing is consistent and reacts to change + + eg = quick_eg(api) + r1 = ReleaseEntity(title="the target") + r1.refs = [ + ReleaseRef(key="first", title="the first title"), + ReleaseRef(key="second", title="the second title"), + ReleaseRef(key="third", title="a third title"), + ] + r1 = api.get_release(api.create_release(r1, editgroup_id=eg.editgroup_id).ident) + api.accept_editgroup(eg.editgroup_id) + + assert r1.refs[0].index == 0 + assert r1.refs[0].key == "first" + assert r1.refs[1].index == 1 + assert r1.refs[1].key == "second" + assert r1.refs[2].index == 2 + assert r1.refs[2].key == "third" + + r1.refs.pop(1) + eg = quick_eg(api) + api.update_release(r1.ident, r1, editgroup_id=eg.editgroup_id) + api.accept_editgroup(eg.editgroup_id) + r1 = api.get_release(r1.ident) + + assert r1.refs[0].index == 0 + assert r1.refs[0].key == "first" + assert r1.refs[1].index == 1 + assert r1.refs[1].key == "third" + 
+def test_citation_targets(api): + # invariant to linking citations + # also, updates work + + eg = quick_eg(api) + r1 = ReleaseEntity(title="the target") + r1 = api.get_release(api.create_release(r1, editgroup_id=eg.editgroup_id).ident) + r2 = ReleaseEntity(title="the citer") + r2.refs = [ + ReleaseRef(key="first", title="something else"), + ReleaseRef(key="second", title="the target title"), + ] + r2 = api.get_release(api.create_release(r2, editgroup_id=eg.editgroup_id).ident) + api.accept_editgroup(eg.editgroup_id) + + eg = quick_eg(api) + r2.refs[1].target_release_id = r1.ident + api.update_release(r2.ident, r2, editgroup_id=eg.editgroup_id) + api.accept_editgroup(eg.editgroup_id) + r2 = api.get_release(r2.ident) + assert r2.refs[0].key == "first" + assert r2.refs[1].key == "second" + assert r2.refs[0].index == 0 # TODO: one-indexing? + assert r2.refs[1].index == 1 + assert r2.refs[0].target_release_id == None + assert r2.refs[1].target_release_id == r1.ident + assert len(r2.refs) == 2 + +def test_citation_empty_array(api): + # distinction between empty array (no citations) and no array (hidden) + + r1 = ReleaseEntity(title="citation null") + r2 = ReleaseEntity(title="citation empty array") + r1.refs = None + r2.refs = [] + + eg = quick_eg(api) + r1 = api.get_release(api.create_release(r1, editgroup_id=eg.editgroup_id).ident) + r2 = api.get_release(api.create_release(r2, editgroup_id=eg.editgroup_id).ident) + api.accept_editgroup(eg.editgroup_id) + + print(r1.refs) + print(r2.refs) + assert r1.refs == [] + assert r1.refs == r2.refs + + r1b = api.get_release(r1.ident, hide="refs") + assert r1b.refs == None + +def test_citation_encoding(api): + # escape-only changes (eg, \u1234 whatever for ASCII) + + r1 = ReleaseEntity(title="citation encoding") + title = "title-unicode \\u0050 \\\" " + container = "container-unicode ☃︎ ä ö ü スティー" + extra = extra={'a': 1, 'b': 2, 'ö': 3} + locator = "p123" + r1.refs = [ + ReleaseRef(key="1", year=1923, title=title, 
container_name=container, + extra=extra, locator=locator), + ReleaseRef(key="2"), + ] + + eg = quick_eg(api) + r1 = api.get_release(api.create_release(r1, editgroup_id=eg.editgroup_id).ident) + api.accept_editgroup(eg.editgroup_id) + + assert title == r1.refs[0].title + assert container == r1.refs[0].container_name + assert extra == r1.refs[0].extra + assert locator == r1.refs[0].locator + -- cgit v1.2.3 From 0a8c9a5e07213276617f06b0379a166e7fd1c100 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 16 Jan 2019 15:54:53 -0800 Subject: more robust IA Xauth logging --- python/fatcat_web/auth.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'python') diff --git a/python/fatcat_web/auth.py b/python/fatcat_web/auth.py index 8035cbe5..03964c92 100644 --- a/python/fatcat_web/auth.py +++ b/python/fatcat_web/auth.py @@ -90,7 +90,10 @@ def handle_ia_xauth(email, password): 'secret': Config.IA_XAUTH_CLIENT_SECRET, }) if resp.status_code == 401 or (not resp.json().get('success')): - flash("Internet Archive email/password didn't match: {}".format(resp.json()['values']['reason'])) + try: + flash("Internet Archive email/password didn't match: {}".format(resp.json()['values']['reason'])) + except: + print("IA XAuth fail: {}".format(resp.content)) return render_template('auth_ia_login.html', email=email), resp.status_code elif resp.status_code != 200: flash("Internet Archive login failed (internal error?)") -- cgit v1.2.3