From 7489ef7a979574effa74f1f17cebb81eefb1b71a Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 23 Jul 2021 11:56:42 -0700
Subject: refs: refactor web paths; enrich refs as generic; remove old refs link

---
 python/fatcat_tools/references.py | 85 ++++++++++++++++-----------------------
 1 file changed, 35 insertions(+), 50 deletions(-)

(limited to 'python/fatcat_tools')

diff --git a/python/fatcat_tools/references.py b/python/fatcat_tools/references.py
index 1d8a0d0d..a0079efd 100644
--- a/python/fatcat_tools/references.py
+++ b/python/fatcat_tools/references.py
@@ -3,26 +3,13 @@ Helper routines for working with the fatcat citation graph, which is a
 separate index of reference links between works in the main catalog. See bulk
 citation and citation API proposals for design documentation.
-
-TODO:
-
-    surt_ify() helper (URL to SURT for queries)
-    CSL enrichment method (using only elasticsearch mget)
-    CSL enrichment for fatcat enrichment
-    access transform
-    microfilm access in access transform
-
-    all_outbound_refs(...) -> List[BiblioRef]
-    all_inbound_refs(...) -> List[BiblioRef]
-        same as get_outbound_refs()/get_inbound_refs(), but does a scroll (return list or iterator?)
-        (optional; maybe not public)
 """
 
 import sys
 import json
 import datetime
 import argparse
-from typing import Optional, List, Any, Dict
+from typing import Optional, List, Any, Dict, Union
 
 from pydantic import BaseModel
 import elasticsearch
@@ -45,8 +32,6 @@ class BiblioRef(BaseModel):
     source_work_ident: Optional[str]
     # with lang prefix like "en:Superglue"
     source_wikipedia_article: Optional[str]
-    # skipped: source_openlibrary_work
-    # skipped: source_url_surt
 
     source_release_stage: Optional[str]
     source_year: Optional[int]
@@ -65,7 +50,6 @@ class BiblioRef(BaseModel):
     target_url_surt: Optional[str]
     # would not be stored in elasticsearch, but would be auto-generated by all "get" methods from the SURT, so calling code does not need to do SURT transform
     target_url: Optional[str]
-    # skipped: target_wikipedia_article
 
     # crossref, pubmed, grobid, etc
     match_provenance: Optional[str]
@@ -92,31 +76,20 @@ class BiblioRef(BaseModel):
         # TODO: if target_openlibrary_work, add an access option?
         return self
 
-class CslBiblioRef(BaseModel):
-    # an "enriched" version of BiblioRef with metadata about the source or
-    # target entity. would be "hydrated" via a lookup to, eg, the
-    # `fatcat_release` elasticsearch index (fast mget fetch with a single
-    # request), as opposed to fatcat API fetches
-    ref: BiblioRef
-    csl: Optional[Dict[str, Any]]
-    access: List[AccessOption]
-
-    class Config:
-        arbitrary_types_allowed = True
 
-class FatcatBiblioRef(BaseModel):
+class EnrichedBiblioRef(BaseModel):
     # enriched version of BiblioRef with complete ReleaseEntity object as
     # fetched from the fatcat API. CSL-JSON metadata would be derived from
     # the full release entity.
     ref: BiblioRef
     release: Optional[ReleaseEntity]
     # TODO: openlibrary work?
-    #csl: Optional[Dict[str, Any]]
     access: List[AccessOption]
 
     class Config:
         arbitrary_types_allowed = True
 
+
 class RefHits(BaseModel):
     count_returned: int
     count_total: int
@@ -124,9 +97,13 @@ class RefHits(BaseModel):
     limit: int
     query_time_ms: int
     query_wall_time_ms: int
-    result_refs: List[BiblioRef]
+    result_refs: List[Union[BiblioRef,EnrichedBiblioRef]]
+
 
-def _execute_ref_query(search: Any, limit: int, offset: Optional[int] = None) -> List[BiblioRef]:
+def _execute_ref_query(search: Any, limit: int, offset: Optional[int] = None) -> RefHits:
+    """
+    Internal helper for querying elasticsearch refs index and transforming hits
+    """
 
     limit = min((int(limit or 15), 200))
     if not offset or offset < 0:
@@ -179,7 +156,7 @@ def get_outbound_refs(
     limit: int = 100,
     offset: Optional[int] = None,
     es_index: str = "fatcat_ref",
-) -> List[BiblioRef]:
+) -> RefHits:
 
     search = Search(using=es_client, index=es_index)
 
@@ -199,6 +176,7 @@ def get_outbound_refs(
     hits.result_refs = sorted(hits.result_refs, key=lambda r: r.ref_index or 0)
     return hits
 
+
 def get_inbound_refs(
     es_client: Any,
     release_ident: Optional[str] = None,
@@ -208,12 +186,11 @@ def get_inbound_refs(
     url: Optional[str] = None,
     consolidate_works: bool = True,
     filter_stage: List[str] = [],
-    filter_type: List[str] = [],
+    sort: Optional[str] = None,
     limit: int = 25,
     offset: Optional[int] = None,
     es_index: str = "fatcat_ref",
 ) -> List[BiblioRef]:
-    # TODO: filter_stage, filter_type
 
     if url and not url_surt:
         url = surt_ify(url)
@@ -239,10 +216,19 @@ def get_inbound_refs(
     else:
         raise ValueError("require a lookup key")
 
-    search = search.sort("-source_year")
+    if filter_stage:
+        search = search.filter("term", source_stage=filter_stage)
+
+    if sort == "newest":
+        search = search.sort("-source_year")
+    elif sort == "oldest":
+        search = search.sort("source_year")
+    else:
+        search = search.sort("-source_year")
 
     return _execute_ref_query(search, limit=limit, offset=offset)
 
+
 def count_inbound_refs(
     es_client: Any,
     release_ident: Optional[str] = None,
@@ -251,7 +237,6 @@ def count_inbound_refs(
     url_surt: Optional[str] = None,
     url: Optional[str] = None,
     filter_stage: List[str] = [],
-    filter_type: List[str] = [],
     es_index: str = "fatcat_ref",
 ) -> int:
     """
@@ -274,28 +259,26 @@ def count_inbound_refs(
     else:
         raise ValueError("require a lookup key")
 
+    if filter_stage:
+        search = search.filter("term", source_stage=filter_stage)
+
     return search.count()
 
-# run elasticsearch mget query for all ref idents and include "enriched" refs when possible
-# for outbound URL refs, would do wayback CDX fetches to find a direct wayback URL
-# TODO: for openlibrary, would this query openlibrary.org API? or some fatcat-specific index?
-#enrich_inbound_refs(refs: List[BiblioRef]) -> List[CslBiblioRef]
-#enrich_outbound_refs(refs: List[BiblioRef]) -> List[CslBiblioRef]
 
 # run fatcat API fetches for each ref and return "enriched" refs
-def enrich_inbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[FatcatBiblioRef]:
+def enrich_inbound_refs(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[EnrichedBiblioRef]:
     enriched = []
     for ref in refs:
         if ref.source_release_ident:
             release = fatcat_api_client.get_release(ref.source_release_ident, hide=hide, expand=expand)
-            enriched.append(FatcatBiblioRef(
+            enriched.append(EnrichedBiblioRef(
                 ref=ref,
                 #csl=None,
                 access=release_access_options(release),
                 release=release,
             ))
         else:
-            enriched.append(FatcatBiblioRef(
+            enriched.append(EnrichedBiblioRef(
                 ref=ref,
                 #csl=None,
                 access=[],
@@ -303,21 +286,20 @@ def enrich_inbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, hi
             ))
     return enriched
 
-def enrich_outbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[FatcatBiblioRef]:
+
+def enrich_outbound_refs(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[EnrichedBiblioRef]:
     enriched = []
     for ref in refs:
         if ref.target_release_ident:
             release = fatcat_api_client.get_release(ref.target_release_ident, hide=hide, expand=expand)
-            enriched.append(FatcatBiblioRef(
+            enriched.append(EnrichedBiblioRef(
                 ref=ref,
-                #csl=None,
                 access=release_access_options(release),
                 release=release,
             ))
         else:
-            enriched.append(FatcatBiblioRef(
+            enriched.append(EnrichedBiblioRef(
                 ref=ref,
-                #csl=None,
                 access=[],
                 release=None,
             ))
@@ -325,6 +307,9 @@ def enrich_outbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, h
 
 
 def run_ref_query(args) -> None:
+    """
+    CLI helper/debug tool (prints to stdout)
+    """
     release_ident = None
     work_ident = None
     if args.ident.startswith("release_"):
--
cgit v1.2.3
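
For context on how the renamed helpers and new parameters in this patch fit together, here is a minimal usage sketch (not part of the commit). The elasticsearch host, the fatcat API client setup (the usual OpenAPI-generated client pattern), and the example release ident are placeholders; only get_inbound_refs(), count_inbound_refs(), enrich_inbound_refs(), and the RefHits/EnrichedBiblioRef models come from the module being patched.

    # Illustrative sketch only; hostnames and the release ident are assumptions,
    # not taken from the patch.
    import elasticsearch
    import fatcat_openapi_client

    from fatcat_tools.references import (
        count_inbound_refs,
        enrich_inbound_refs,
        get_inbound_refs,
    )

    # elasticsearch client pointed at the refs index (assumed host)
    es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki")

    # fatcat API client (standard generated-client setup; host is an assumption)
    conf = fatcat_openapi_client.Configuration()
    conf.host = "https://api.fatcat.wiki/v0"
    api = fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(conf))

    RELEASE_IDENT = "aaaaaaaaaaaaaaaaaaaaaaaaaa"  # hypothetical release ident

    # inbound refs, oldest first, restricted to published sources; this exercises
    # the new `sort` and `filter_stage` handling added in this patch. Note that
    # get_inbound_refs() actually returns a RefHits object (via _execute_ref_query),
    # even though its annotation in the patch still reads List[BiblioRef].
    hits = get_inbound_refs(
        es_client,
        release_ident=RELEASE_IDENT,
        sort="oldest",
        filter_stage=["published"],
        limit=25,
    )
    print(f"showing {hits.count_returned} of {hits.count_total} inbound refs")

    # cheap total count without fetching hits
    total = count_inbound_refs(es_client, release_ident=RELEASE_IDENT)
    print(f"total inbound refs: {total}")

    # hydrate BiblioRef hits into EnrichedBiblioRef objects via the fatcat API
    enriched = enrich_inbound_refs(hits.result_refs, fatcat_api_client=api)
    for eref in enriched:
        title = eref.release.title if eref.release else "(no fatcat release)"
        print(title, len(eref.access), "access options")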