about summary refs log tree commit diff stats
path: root/python/fatcat_tools
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2021-07-23 11:56:42 -0700
committer Bryan Newbold <bnewbold@robocracy.org> 2021-07-23 11:56:42 -0700
commit 7489ef7a979574effa74f1f17cebb81eefb1b71a (patch)
tree 252bc76358fb769aa52305d45e449c547a740f33 /python/fatcat_tools
parent 0d17bad63b2d92220b8ddaeb9b5733b2b09f57a0 (diff)
download fatcat-7489ef7a979574effa74f1f17cebb81eefb1b71a.tar.gz
fatcat-7489ef7a979574effa74f1f17cebb81eefb1b71a.zip
refs: refactor web paths; enrich refs as generic; remove old refs link
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- python/fatcat_tools/references.py 85
1 files changed, 35 insertions, 50 deletions
diff --git a/python/fatcat_tools/references.py b/python/fatcat_tools/references.py
index 1d8a0d0d..a0079efd 100644
--- a/python/fatcat_tools/references.py
+++ b/python/fatcat_tools/references.py
@@ -3,26 +3,13 @@ Helper routines for working with the fatcat citation graph, which is a separate
index of reference links between works in the main catalog.
See bulk citation and citation API proposals for design documentation.
-
-TODO:
-
- surt_ify() helper (URL to SURT for queries)
- CSL enrichment method (using only elasticsearch mget)
- CSL enrichment for fatcat enrichment
- access transform
- microfilm access in access transform
-
- all_outbound_refs(...) -> List[BiblioRef]
- all_inbound_refs(...) -> List[BiblioRef]
- same as get_outbound_refs()/get_inbound_refs(), but does a scroll (return list or iterator?)
- (optional; maybe not public)
"""
import sys
import json
import datetime
import argparse
-from typing import Optional, List, Any, Dict
+from typing import Optional, List, Any, Dict, Union
from pydantic import BaseModel
import elasticsearch
@@ -45,8 +32,6 @@ class BiblioRef(BaseModel):
source_work_ident: Optional[str]
# with lang prefix like "en:Superglue"
source_wikipedia_article: Optional[str]
- # skipped: source_openlibrary_work
- # skipped: source_url_surt
source_release_stage: Optional[str]
source_year: Optional[int]
@@ -65,7 +50,6 @@ class BiblioRef(BaseModel):
target_url_surt: Optional[str]
# would not be stored in elasticsearch, but would be auto-generated by all "get" methods from the SURT, so calling code does not need to do SURT transform
target_url: Optional[str]
- # skipped: target_wikipedia_article
# crossref, pubmed, grobid, etc
match_provenance: Optional[str]
@@ -92,31 +76,20 @@ class BiblioRef(BaseModel):
# TODO: if target_openlibrary_work, add an access option?
return self
-class CslBiblioRef(BaseModel):
- # an "enriched" version of BiblioRef with metadata about the source or
- # target entity. would be "hydrated" via a lookup to, eg, the
- # `fatcat_release` elasticsearch index (fast mget fetch with a single
- # request), as opposed to fatcat API fetches
- ref: BiblioRef
- csl: Optional[Dict[str, Any]]
- access: List[AccessOption]
-
- class Config:
- arbitrary_types_allowed = True
-class FatcatBiblioRef(BaseModel):
+class EnrichedBiblioRef(BaseModel):
# enriched version of BiblioRef with complete ReleaseEntity object as
# fetched from the fatcat API. CSL-JSON metadata would be derived from
# the full release entity.
ref: BiblioRef
release: Optional[ReleaseEntity]
# TODO: openlibrary work?
- #csl: Optional[Dict[str, Any]]
access: List[AccessOption]
class Config:
arbitrary_types_allowed = True
+
class RefHits(BaseModel):
count_returned: int
count_total: int
@@ -124,9 +97,13 @@ class RefHits(BaseModel):
limit: int
query_time_ms: int
query_wall_time_ms: int
- result_refs: List[BiblioRef]
+ result_refs: List[Union[BiblioRef,EnrichedBiblioRef]]
-def _execute_ref_query(search: Any, limit: int, offset: Optional[int] = None) -> List[BiblioRef]:
+
+def _execute_ref_query(search: Any, limit: int, offset: Optional[int] = None) -> RefHits:
+ """
+ Internal helper for querying elasticsearch refs index and transforming hits
+ """
limit = min((int(limit or 15), 200))
if not offset or offset < 0:
@@ -179,7 +156,7 @@ def get_outbound_refs(
limit: int = 100,
offset: Optional[int] = None,
es_index: str = "fatcat_ref",
-) -> List[BiblioRef]:
+) -> RefHits:
search = Search(using=es_client, index=es_index)
@@ -199,6 +176,7 @@ def get_outbound_refs(
hits.result_refs = sorted(hits.result_refs, key=lambda r: r.ref_index or 0)
return hits
+
def get_inbound_refs(
es_client: Any,
release_ident: Optional[str] = None,
@@ -208,12 +186,11 @@ def get_inbound_refs(
url: Optional[str] = None,
consolidate_works: bool = True,
filter_stage: List[str] = [],
- filter_type: List[str] = [],
+ sort: Optional[str] = None,
limit: int = 25,
offset: Optional[int] = None,
es_index: str = "fatcat_ref",
) -> List[BiblioRef]:
- # TODO: filter_stage, filter_type
if url and not url_surt:
url = surt_ify(url)
@@ -239,10 +216,19 @@ def get_inbound_refs(
else:
raise ValueError("require a lookup key")
- search = search.sort("-source_year")
+ if filter_stage:
+ search = search.filter("term", source_stage=filter_stage)
+
+ if sort == "newest":
+ search = search.sort("-source_year")
+ elif sort == "oldest":
+ search = search.sort("source_year")
+ else:
+ search = search.sort("-source_year")
return _execute_ref_query(search, limit=limit, offset=offset)
+
def count_inbound_refs(
es_client: Any,
release_ident: Optional[str] = None,
@@ -251,7 +237,6 @@ def count_inbound_refs(
url_surt: Optional[str] = None,
url: Optional[str] = None,
filter_stage: List[str] = [],
- filter_type: List[str] = [],
es_index: str = "fatcat_ref",
) -> int:
"""
@@ -274,28 +259,26 @@ def count_inbound_refs(
else:
raise ValueError("require a lookup key")
+ if filter_stage:
+ search = search.filter("term", source_stage=filter_stage)
+
return search.count()
-# run elasticsearch mget query for all ref idents and include "enriched" refs when possible
-# for outbound URL refs, would do wayback CDX fetches to find a direct wayback URL
-# TODO: for openlibrary, would this query openlibrary.org API? or some fatcat-specific index?
-#enrich_inbound_refs(refs: List[BiblioRef]) -> List[CslBiblioRef]
-#enrich_outbound_refs(refs: List[BiblioRef]) -> List[CslBiblioRef]
# run fatcat API fetches for each ref and return "enriched" refs
-def enrich_inbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[FatcatBiblioRef]:
+def enrich_inbound_refs(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[EnrichedBiblioRef]:
enriched = []
for ref in refs:
if ref.source_release_ident:
release = fatcat_api_client.get_release(ref.source_release_ident, hide=hide, expand=expand)
- enriched.append(FatcatBiblioRef(
+ enriched.append(EnrichedBiblioRef(
ref=ref,
#csl=None,
access=release_access_options(release),
release=release,
))
else:
- enriched.append(FatcatBiblioRef(
+ enriched.append(EnrichedBiblioRef(
ref=ref,
#csl=None,
access=[],
@@ -303,21 +286,20 @@ def enrich_inbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, hi
))
return enriched
-def enrich_outbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[FatcatBiblioRef]:
+
+def enrich_outbound_refs(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[EnrichedBiblioRef]:
enriched = []
for ref in refs:
if ref.target_release_ident:
release = fatcat_api_client.get_release(ref.target_release_ident, hide=hide, expand=expand)
- enriched.append(FatcatBiblioRef(
+ enriched.append(EnrichedBiblioRef(
ref=ref,
- #csl=None,
access=release_access_options(release),
release=release,
))
else:
- enriched.append(FatcatBiblioRef(
+ enriched.append(EnrichedBiblioRef(
ref=ref,
- #csl=None,
access=[],
release=None,
))
@@ -325,6 +307,9 @@ def enrich_outbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, h
def run_ref_query(args) -> None:
+ """
+ CLI helper/debug tool (prints to stdout)
+ """
release_ident = None
work_ident = None
if args.ident.startswith("release_"):