refs: refactor web paths; enrich refs as generic; remove old refs link

author: Bryan Newbold <bnewbold@robocracy.org> 2021-07-23 11:56:42 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2021-07-23 11:56:42 -0700
commit: 7489ef7a979574effa74f1f17cebb81eefb1b71a (patch)
tree: 252bc76358fb769aa52305d45e449c547a740f33
parent: 0d17bad63b2d92220b8ddaeb9b5733b2b09f57a0 (diff)
download: fatcat-7489ef7a979574effa74f1f17cebb81eefb1b71a.tar.gz
fatcat-7489ef7a979574effa74f1f17cebb81eefb1b71a.zip
5 files changed, 91 insertions, 129 deletions
diff --git a/proposals/2021-01-29_citation_api.md b/proposals/2021-01-29_citation_api.md
index 1e329d61..f8d9e676 100644
--- a/proposals/2021-01-29_citation_api.md
+++ b/proposals/2021-01-29_citation_api.md
@@ -41,13 +41,13 @@ into a columnar file format like Parquet to get storage efficiency advances,
 type/schema enforcement, and easier ingest and use for large-scale data
 analysis.
 
-TODO: more?
-
 
 ## Schemas
 
 First, a combined JSON/pydantic/elasticsearch object that represents a
-reference between two things:
+reference from one thing to another, where the "source" must be known, but the
+"target" may either be known ("matched") or ambiguous (eg, just a reference
+string):
 
     BiblioRef ("bibliographic reference")
         _key: Optional[str] elasticsearch doc key
@@ -60,8 +60,6 @@ reference between two things:
         source_work_ident: Optional[str]
         source_wikipedia_article: Optional[str]
             with lang prefix like "en:Superglue"
-        # skipped: source_openlibrary_work
-        # skipped: source_url_surt
         source_release_stage: Optional[str]
         source_year: Optional[int]
 
@@ -71,7 +69,9 @@ reference between two things:
         ref_key: Optional[str]
             eg, "Lee86", "BIB23"
         ref_locator: Optional[str]
-            eg, page number
+            eg, specific page number in the book being referenced, if
+            applicable. Not used for, eg, first page of paper in a
+            volume/issue.
 
         # target of reference (identifiers)
         target_release_ident: Optional[str]
@@ -82,15 +82,15 @@ reference between two things:
             would not be stored in elasticsearch, but would be auto-generated
             by all "get" methods from the SURT, so calling code does not need
             to do SURT transform
-        # skipped: target_wikipedia_article
 
         match_provenance: str
             crossref, pubmed, grobid, etc
+            TODO: "ref_provenance"
         match_status: Optional[str]
             strong, weak, etc
-            TODO: "match_strength"?
+            TODO: "match_strength"? "match_confidence"?
         match_reason: Optional[str]
-            "doi", "isbn", "fuzzy title, author", etc
+            "doi", "isbn", "title-fuzzy, author", etc
             maybe "fuzzy-title-author"?
 
         target_unstructured: string (only if no release_ident link/match)
@@ -116,33 +116,22 @@ jinja templated to display lists of references in the user interface.
         size_bytes: Optional[int]
         thumbnail_url: Optional[str]
 
-    CslBiblioRef
-        # an "enriched" version of BiblioRef with metadata about the source or
-        # target entity. would be "hydrated" via a lookup to, eg, the
-        # `fatcat_release` elasticsearch index (fast mget fetch with a single
-        # request), as opposed to fatcat API fetches
-        biblio_ref: BiblioRef
-        source_csl/target_csl: free-form CSL-JSON
-        source_access/target_access: List[AccessOption]
-
-    FatcatBiblioRef
+    EnrichedBiblioRef
         # enriched version of BiblioRef with complete ReleaseEntity object as
-        # fetched from the fatcat API. CSL-JSON metadata would be derived from
-        # the full release entity.
+        # fetched from entity catalogs (for example, the fatcat API), if available.
         biblio_ref: BiblioRef
         source_release/target_release: Optional[ReleaseEntity]
             complete ReleaseEntity from API, with optional expand/hide fields
-        source_csl/target_csl: free-form CSL-JSON
-            CSL-JSON version of ReleaseEntity metadata
         source_access/target_access: List[AccessOption]
+        # TODO: target_openlibrary? source_wikipedia?
 
 
 ## Datastore
 
 Would store in Elasticsearch as a live database, at least to start.
 
-TODO: try generating ~1 million of these objects to estimate index size (at
-billions of docs).
+Example Elasticsearch index `fatcat_ref_v02_20210716` has 1.8 billion docs
+(references), and consumes 435 GBytes of disk.
 
 Might be reasonable to use PostgreSQL in the future, with more explicit control
 over indexes and tuning for latency. But Elasticsearch is pretty easy to
@@ -172,59 +161,46 @@ operate (eg, replicas).
     count_inbound_refs(...) -> int
         same parameters as get_inbound_refs(), but returns just a count
 
-    get_all_outbound_refs(...) -> List[BiblioRef]
-    get_all_inbound_refs(...) -> List[BiblioRef]
-        same as get_outbound_refs()/get_inbound_refs(), but does a scroll (return list or iterator?)
-        (optional; maybe not public)
+    # UNIMPLEMENTED
+    #get_all_outbound_refs(...) -> List[BiblioRef]
+    #get_all_inbound_refs(...) -> List[BiblioRef]
+    #    same as get_outbound_refs()/get_inbound_refs(), but does a scroll (return list or iterator?)
+    #    (optional; maybe not public)
 
-    # run elasticsearch mget query for all ref idents and include "enriched" refs when possible
-    # for outbound URL refs, would do wayback CDX fetches to find a direct wayback URL
-    # TODO: for openlibrary, would this query openlibrary.org API? or some fatcat-specific index?
-    enrich_inbound_refs(refs: List[BiblioRef]) -> List[CslBiblioRef]
-    enrich_outbound_refs(refs: List[BiblioRef]) -> List[CslBiblioRef]
-
-    # run fatcat API fetches for each ref and return "enriched" refs
-    enrich_inbound_refs_fatcat(refs: List[BiblioRef], hide, expand) -> List[FatcatBiblioRef]
-    enrich_outbound_refs_fatcat(refs: List[BiblioRef], hide, expand) -> List[FatcatBiblioRef]
+    # run catalog API fetches for each ref and return "enriched" refs
+    enrich_inbound_refs(refs: List[BiblioRef], hide, expand) -> List[EnrichedBiblioRef]
+    enrich_outbound_refs(refs: List[BiblioRef], hide, expand) -> List[EnrichedBiblioRef]
 
 ## HTTP API Endpoints
 
-Possible HTTP API endpoints... not even sure we would use these or expose them
-publicly?
-
-    citations-api.fatcat.wiki
-        /refs/inbound
-            &release_ident=
-            &work_ident=
-            &openlibrary_work=
-            &url=
-        /refs/outbound
-            &release_ident=
-            &work_ident=
-        /refs/csl/outbound
-        /refs/fatcat/outbound
-
-    api.fatcat.wiki/citations/v0
-        /inbound
-
-    fatcat.wiki/release/{release_ident}/refs/outbound.json
-    fatcat.wiki/work/{work_ident}/refs/outbound.json
-        &filter_type
-        &filter_stage
+Initial web endpoints, including unstable pseudo-APIs:
+
+    fatcat.wiki/release/{release_ident}/refs/in (and .json)
+    fatcat.wiki/release/{release_ident}/refs/out (and .json)
         &limit
         &offset
+        &sort (for inbound)
+        &filter_stage (for inbound)
 
-    fatcat.wiki/refs/openlibrary/{openlibrary_ident}/inbound.json
+    fatcat.wiki/openlibrary/{openlibrary_ident}/refs/in (and .json)
+        &limit
+        &offset
+        &sort
+        &filter_stage
 
-    fatcat.wiki/refs/url/inbound.json
+    fatcat.wiki/web/refs/in (and .json)
         &url=
+        &limit
+        &offset
+        &sort (newest, oldest)
+        &filter_stage
 
 ## Design Notes
 
 This proposed schema is relatively close to what the "normalize" SQL table
 would look like (many-to-many relationship).
 
-Especiall for "redistributing as bulk corpus", we might want to consider an
+Especially for "redistributing as bulk corpus", we might want to consider an
 alternative data model which is a single source entity containing a list of
 outbound references. Could even be a single source *work* for fatcat content,
 with many release under the entity. One advantage of this is that source
diff --git a/python/fatcat_tools/references.py b/python/fatcat_tools/references.py
index 1d8a0d0d..a0079efd 100644
--- a/python/fatcat_tools/references.py
+++ b/python/fatcat_tools/references.py
@@ -3,26 +3,13 @@ Helper routines for working with the fatcat citation graph, which is a separate
 index of reference links between works in the main catalog.
 
 See bulk citation and citation API proposals for design documentation.
-
-TODO:
-
-    surt_ify() helper (URL to SURT for queries)
-    CSL enrichment method (using only elasticsearch mget)
-    CSL enrichment for fatcat enrichment
-    access transform
-    microfilm access in access transform
-
-    all_outbound_refs(...) -> List[BiblioRef]
-    all_inbound_refs(...) -> List[BiblioRef]
-        same as get_outbound_refs()/get_inbound_refs(), but does a scroll (return list or iterator?)
-        (optional; maybe not public)
 """
 
 import sys
 import json
 import datetime
 import argparse
-from typing import Optional, List, Any, Dict
+from typing import Optional, List, Any, Dict, Union
 
 from pydantic import BaseModel
 import elasticsearch
@@ -45,8 +32,6 @@ class BiblioRef(BaseModel):
     source_work_ident: Optional[str]
     # with lang prefix like "en:Superglue"
     source_wikipedia_article: Optional[str]
-    # skipped: source_openlibrary_work
-    # skipped: source_url_surt
     source_release_stage: Optional[str]
     source_year: Optional[int]
 
@@ -65,7 +50,6 @@ class BiblioRef(BaseModel):
     target_url_surt: Optional[str]
     # would not be stored in elasticsearch, but would be auto-generated by all "get" methods from the SURT, so calling code does not need to do SURT transform
     target_url: Optional[str]
-    # skipped: target_wikipedia_article
 
     # crossref, pubmed, grobid, etc
     match_provenance: Optional[str]
@@ -92,31 +76,20 @@ class BiblioRef(BaseModel):
         # TODO: if target_openlibrary_work, add an access option?
         return self
 
-class CslBiblioRef(BaseModel):
-    # an "enriched" version of BiblioRef with metadata about the source or
-    # target entity. would be "hydrated" via a lookup to, eg, the
-    # `fatcat_release` elasticsearch index (fast mget fetch with a single
-    # request), as opposed to fatcat API fetches
-    ref: BiblioRef
-    csl: Optional[Dict[str, Any]]
-    access: List[AccessOption]
-
-    class Config:
-        arbitrary_types_allowed = True
 
-class FatcatBiblioRef(BaseModel):
+class EnrichedBiblioRef(BaseModel):
     # enriched version of BiblioRef with complete ReleaseEntity object as
     # fetched from the fatcat API. CSL-JSON metadata would be derived from
     # the full release entity.
     ref: BiblioRef
     release: Optional[ReleaseEntity]
     # TODO: openlibrary work?
-    #csl: Optional[Dict[str, Any]]
     access: List[AccessOption]
 
     class Config:
         arbitrary_types_allowed = True
 
+
 class RefHits(BaseModel):
     count_returned: int
     count_total: int
@@ -124,9 +97,13 @@ class RefHits(BaseModel):
     limit: int
     query_time_ms: int
     query_wall_time_ms: int
-    result_refs: List[BiblioRef]
+    result_refs: List[Union[BiblioRef,EnrichedBiblioRef]]
 
-def _execute_ref_query(search: Any, limit: int, offset: Optional[int] = None) -> List[BiblioRef]:
+
+def _execute_ref_query(search: Any, limit: int, offset: Optional[int] = None) -> RefHits:
+    """
+    Internal helper for querying elasticsearch refs index and transforming hits
+    """
 
     limit = min((int(limit or 15), 200))
     if not offset or offset < 0:
@@ -179,7 +156,7 @@ def get_outbound_refs(
     limit: int = 100,
     offset: Optional[int] = None,
     es_index: str = "fatcat_ref",
-) -> List[BiblioRef]:
+) -> RefHits:
 
     search = Search(using=es_client, index=es_index)
 
@@ -199,6 +176,7 @@ def get_outbound_refs(
     hits.result_refs = sorted(hits.result_refs, key=lambda r: r.ref_index or 0)
     return hits
 
+
 def get_inbound_refs(
     es_client: Any,
     release_ident: Optional[str] = None,
@@ -208,12 +186,11 @@ def get_inbound_refs(
     url: Optional[str] = None,
     consolidate_works: bool = True,
     filter_stage: List[str] = [],
-    filter_type: List[str] = [],
+    sort: Optional[str] = None,
     limit: int = 25,
     offset: Optional[int] = None,
     es_index: str = "fatcat_ref",
 ) -> List[BiblioRef]:
-    # TODO: filter_stage, filter_type
 
     if url and not url_surt:
         url = surt_ify(url)
@@ -239,10 +216,19 @@ def get_inbound_refs(
     else:
         raise ValueError("require a lookup key")
 
-    search = search.sort("-source_year")
+    if filter_stage:
+        search = search.filter("term", source_stage=filter_stage)
+
+    if sort == "newest":
+        search = search.sort("-source_year")
+    elif sort == "oldest":
+        search = search.sort("source_year")
+    else:
+        search = search.sort("-source_year")
 
     return _execute_ref_query(search, limit=limit, offset=offset)
 
+
 def count_inbound_refs(
     es_client: Any,
     release_ident: Optional[str] = None,
@@ -251,7 +237,6 @@ def count_inbound_refs(
     url_surt: Optional[str] = None,
     url: Optional[str] = None,
     filter_stage: List[str] = [],
-    filter_type: List[str] = [],
     es_index: str = "fatcat_ref",
 ) -> int:
     """
@@ -274,28 +259,26 @@ def count_inbound_refs(
     else:
         raise ValueError("require a lookup key")
 
+    if filter_stage:
+        search = search.filter("term", source_stage=filter_stage)
+
     return search.count()
 
-# run elasticsearch mget query for all ref idents and include "enriched" refs when possible
-# for outbound URL refs, would do wayback CDX fetches to find a direct wayback URL
-# TODO: for openlibrary, would this query openlibrary.org API? or some fatcat-specific index?
-#enrich_inbound_refs(refs: List[BiblioRef]) -> List[CslBiblioRef]
-#enrich_outbound_refs(refs: List[BiblioRef]) -> List[CslBiblioRef]
 
 # run fatcat API fetches for each ref and return "enriched" refs
-def enrich_inbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[FatcatBiblioRef]:
+def enrich_inbound_refs(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[EnrichedBiblioRef]:
     enriched = []
     for ref in refs:
         if ref.source_release_ident:
             release = fatcat_api_client.get_release(ref.source_release_ident, hide=hide, expand=expand)
-            enriched.append(FatcatBiblioRef(
+            enriched.append(EnrichedBiblioRef(
                 ref=ref,
                 #csl=None,
                 access=release_access_options(release),
                 release=release,
             ))
         else:
-            enriched.append(FatcatBiblioRef(
+            enriched.append(EnrichedBiblioRef(
                 ref=ref,
                 #csl=None,
                 access=[],
@@ -303,21 +286,20 @@ def enrich_inbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, hi
             ))
     return enriched
 
-def enrich_outbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[FatcatBiblioRef]:
+
+def enrich_outbound_refs(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[EnrichedBiblioRef]:
     enriched = []
     for ref in refs:
         if ref.target_release_ident:
             release = fatcat_api_client.get_release(ref.target_release_ident, hide=hide, expand=expand)
-            enriched.append(FatcatBiblioRef(
+            enriched.append(EnrichedBiblioRef(
                 ref=ref,
-                #csl=None,
                 access=release_access_options(release),
                 release=release,
             ))
         else:
-            enriched.append(FatcatBiblioRef(
+            enriched.append(EnrichedBiblioRef(
                 ref=ref,
-                #csl=None,
                 access=[],
                 release=None,
             ))
@@ -325,6 +307,9 @@ def enrich_outbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, h
 
 
 def run_ref_query(args) -> None:
+    """
+    CLI helper/debug tool (prints to stdout)
+    """
     release_ident = None
     work_ident = None
     if args.ident.startswith("release_"):
diff --git a/python/fatcat_web/ref_routes.py b/python/fatcat_web/ref_routes.py
index e08aaf15..e24b4ac6 100644
--- a/python/fatcat_web/ref_routes.py
+++ b/python/fatcat_web/ref_routes.py
@@ -11,14 +11,14 @@ from fatcat_openapi_client.rest import ApiException
 from fuzzycat.grobid_unstructured import grobid_api_process_citation, transform_grobid_ref_xml, grobid_ref_to_release
 from fuzzycat.simple import close_fuzzy_biblio_matches, close_fuzzy_release_matches
 
-from fatcat_tools.references import enrich_inbound_refs_fatcat, enrich_outbound_refs_fatcat, get_inbound_refs, get_outbound_refs
+from fatcat_tools.references import enrich_inbound_refs, enrich_outbound_refs, get_inbound_refs, get_outbound_refs
 from fatcat_tools.transforms.access import release_access_options
 from fatcat_web import app, api, auth_api
 from fatcat_web.forms import *
 from fatcat_web.entity_helpers import *
 
 
-@app.route('/release/<string(length=26):ident>/inbound-refs', methods=['GET'])
+@app.route('/release/<string(length=26):ident>/refs/in', methods=['GET'])
 def release_view_refs_inbound(ident):
 
     release = generic_get_entity("release", ident)
@@ -27,11 +27,12 @@ def release_view_refs_inbound(ident):
     offset = max(0, int(offset)) if offset.isnumeric() else 0
 
     hits = get_inbound_refs(release_ident=ident, es_client=app.es_client, offset=offset, limit=30)
-    enriched_refs = enrich_inbound_refs_fatcat(hits.result_refs, fatcat_api_client=api, expand="container,files,webcaptures")
+    enriched_refs = enrich_inbound_refs(hits.result_refs, fatcat_api_client=api, expand="container,files,webcaptures")
 
-    return render_template('release_view_fuzzy_refs.html', direction="inbound", entity=release, hits=hits, enriched_refs=enriched_refs), 200
+    return render_template('release_view_fuzzy_refs.html', direction="in", entity=release, hits=hits, enriched_refs=enriched_refs), 200
 
-@app.route('/release/<string(length=26):ident>/outbound-refs', methods=['GET'])
+
+@app.route('/release/<string(length=26):ident>/refs/out', methods=['GET'])
 def release_view_refs_outbound(ident):
 
     release = generic_get_entity("release", ident)
@@ -40,9 +41,10 @@ def release_view_refs_outbound(ident):
     offset = max(0, int(offset)) if offset.isnumeric() else 0
 
     hits = get_outbound_refs(release_ident=ident, es_client=app.es_client, offset=offset, limit=30)
-    enriched_refs = enrich_outbound_refs_fatcat(hits.result_refs, fatcat_api_client=api, expand="container,files,webcaptures")
+    enriched_refs = enrich_outbound_refs(hits.result_refs, fatcat_api_client=api, expand="container,files,webcaptures")
+
+    return render_template('release_view_fuzzy_refs.html', direction="out", entity=release, hits=hits, enriched_refs=enriched_refs), 200
 
-    return render_template('release_view_fuzzy_refs.html', direction="outbound", entity=release, hits=hits, enriched_refs=enriched_refs), 200
 
 @app.route('/reference/match', methods=['GET', 'POST'])
 def reference_match():
diff --git a/python/fatcat_web/templates/entity_base.html b/python/fatcat_web/templates/entity_base.html
index c23dbef2..78a151a0 100644
--- a/python/fatcat_web/templates/entity_base.html
+++ b/python/fatcat_web/templates/entity_base.html
@@ -85,10 +85,9 @@
               {{ entity_tab("coverage", "Coverage", "/coverage") }}
             {% elif entity_type == "release" and entity.state != 'deleted' %}
               {{ entity_tab("contribs", "Authors", "/contribs", entity._authors|count ) }}
-              {{ entity_tab("references", "References", "/references", entity.refs|count) }}
               {% if  entity.state == 'active' %}
-                {{ entity_tab("inbound-refs", "Inbound", "/inbound-refs") }}
-                {{ entity_tab("outbound-refs", "Outbound", "/outbound-refs") }}
+                {{ entity_tab("refs-out", "References", "/refs/out") }}
+                {{ entity_tab("refs-in", "Cited By", "/refs/in") }}
               {% endif %}
             {% endif %}
             {{ entity_tab("metadata", "Metadata", "/metadata") }}
diff --git a/python/fatcat_web/templates/release_view_fuzzy_refs.html b/python/fatcat_web/templates/release_view_fuzzy_refs.html
index 7b286fd3..43860a31 100644
--- a/python/fatcat_web/templates/release_view_fuzzy_refs.html
+++ b/python/fatcat_web/templates/release_view_fuzzy_refs.html
@@ -1,5 +1,5 @@
 {% set release = entity %}
-{% set entity_view = "{{ direction }}-refs" %}
+{% set entity_view = "refs-" + direction %}
 {% set entity_type = "release" %}
 {% import "entity_macros.html" as entity_macros %}
 {% extends "entity_base.html" %}
@@ -17,10 +17,10 @@
 
 {% block entity_main %}
 
-{% if direction == "inbound" %}
-  <h3>Referenced By</h3>
-  <i>Citations to this release by other works.</i>
-{% elif direction == "outbound" %}
+{% if direction == "in" %}
+  <h3>Cited By</h3>
+  <i>References to this release by other works.</i>
+{% elif direction == "out" %}
   <h3>References</h3>
   <i>NOTE: currently batch computed and may include additional references sources, or be missing recent changes, compared to entity reference list.</i>
 {% endif %}
@@ -36,7 +36,7 @@
   {% set release = row.release %}
   <tr><td class="collapsing left aligned top aligned">
         {# TODO: ref_locator? #}
-        {% if direction == "outbound" %}
+        {% if direction == "out" %}
           {% if row.ref.ref_key %}
             <code>[{{ row.ref.ref_key }}]</code><br>
           {% endif %}
author	Bryan Newbold <bnewbold@robocracy.org>	2021-07-23 11:56:42 -0700
committer	Bryan Newbold <bnewbold@robocracy.org>	2021-07-23 11:56:42 -0700
commit	7489ef7a979574effa74f1f17cebb81eefb1b71a (patch)
tree	252bc76358fb769aa52305d45e449c547a740f33
parent	0d17bad63b2d92220b8ddaeb9b5733b2b09f57a0 (diff)
download	fatcat-7489ef7a979574effa74f1f17cebb81eefb1b71a.tar.gz fatcat-7489ef7a979574effa74f1f17cebb81eefb1b71a.zip