summary | refs | log | tree | commit | diff | stats
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rw-r--r-- python/fatcat_tools/references.py 85
-rw-r--r-- python/fatcat_web/ref_routes.py 16
-rw-r--r-- python/fatcat_web/templates/entity_base.html 5
-rw-r--r-- python/fatcat_web/templates/release_view_fuzzy_refs.html 12
4 files changed, 52 insertions, 66 deletions
diff --git a/python/fatcat_tools/references.py b/python/fatcat_tools/references.py
index 1d8a0d0d..a0079efd 100644
--- a/python/fatcat_tools/references.py
+++ b/python/fatcat_tools/references.py
@@ -3,26 +3,13 @@ Helper routines for working with the fatcat citation graph, which is a separate
index of reference links between works in the main catalog.
See bulk citation and citation API proposals for design documentation.
-
-TODO:
-
- surt_ify() helper (URL to SURT for queries)
- CSL enrichment method (using only elasticsearch mget)
- CSL enrichment for fatcat enrichment
- access transform
- microfilm access in access transform
-
- all_outbound_refs(...) -> List[BiblioRef]
- all_inbound_refs(...) -> List[BiblioRef]
- same as get_outbound_refs()/get_inbound_refs(), but does a scroll (return list or iterator?)
- (optional; maybe not public)
"""
import sys
import json
import datetime
import argparse
-from typing import Optional, List, Any, Dict
+from typing import Optional, List, Any, Dict, Union
from pydantic import BaseModel
import elasticsearch
@@ -45,8 +32,6 @@ class BiblioRef(BaseModel):
source_work_ident: Optional[str]
# with lang prefix like "en:Superglue"
source_wikipedia_article: Optional[str]
- # skipped: source_openlibrary_work
- # skipped: source_url_surt
source_release_stage: Optional[str]
source_year: Optional[int]
@@ -65,7 +50,6 @@ class BiblioRef(BaseModel):
target_url_surt: Optional[str]
# would not be stored in elasticsearch, but would be auto-generated by all "get" methods from the SURT, so calling code does not need to do SURT transform
target_url: Optional[str]
- # skipped: target_wikipedia_article
# crossref, pubmed, grobid, etc
match_provenance: Optional[str]
@@ -92,31 +76,20 @@ class BiblioRef(BaseModel):
# TODO: if target_openlibrary_work, add an access option?
return self
-class CslBiblioRef(BaseModel):
- # an "enriched" version of BiblioRef with metadata about the source or
- # target entity. would be "hydrated" via a lookup to, eg, the
- # `fatcat_release` elasticsearch index (fast mget fetch with a single
- # request), as opposed to fatcat API fetches
- ref: BiblioRef
- csl: Optional[Dict[str, Any]]
- access: List[AccessOption]
-
- class Config:
- arbitrary_types_allowed = True
-class FatcatBiblioRef(BaseModel):
+class EnrichedBiblioRef(BaseModel):
# enriched version of BiblioRef with complete ReleaseEntity object as
# fetched from the fatcat API. CSL-JSON metadata would be derived from
# the full release entity.
ref: BiblioRef
release: Optional[ReleaseEntity]
# TODO: openlibrary work?
- #csl: Optional[Dict[str, Any]]
access: List[AccessOption]
class Config:
arbitrary_types_allowed = True
+
class RefHits(BaseModel):
count_returned: int
count_total: int
@@ -124,9 +97,13 @@ class RefHits(BaseModel):
limit: int
query_time_ms: int
query_wall_time_ms: int
- result_refs: List[BiblioRef]
+ result_refs: List[Union[BiblioRef,EnrichedBiblioRef]]
-def _execute_ref_query(search: Any, limit: int, offset: Optional[int] = None) -> List[BiblioRef]:
+
+def _execute_ref_query(search: Any, limit: int, offset: Optional[int] = None) -> RefHits:
+ """
+ Internal helper for querying elasticsearch refs index and transforming hits
+ """
limit = min((int(limit or 15), 200))
if not offset or offset < 0:
@@ -179,7 +156,7 @@ def get_outbound_refs(
limit: int = 100,
offset: Optional[int] = None,
es_index: str = "fatcat_ref",
-) -> List[BiblioRef]:
+) -> RefHits:
search = Search(using=es_client, index=es_index)
@@ -199,6 +176,7 @@ def get_outbound_refs(
hits.result_refs = sorted(hits.result_refs, key=lambda r: r.ref_index or 0)
return hits
+
def get_inbound_refs(
es_client: Any,
release_ident: Optional[str] = None,
@@ -208,12 +186,11 @@ def get_inbound_refs(
url: Optional[str] = None,
consolidate_works: bool = True,
filter_stage: List[str] = [],
- filter_type: List[str] = [],
+ sort: Optional[str] = None,
limit: int = 25,
offset: Optional[int] = None,
es_index: str = "fatcat_ref",
) -> List[BiblioRef]:
- # TODO: filter_stage, filter_type
if url and not url_surt:
url = surt_ify(url)
@@ -239,10 +216,19 @@ def get_inbound_refs(
else:
raise ValueError("require a lookup key")
- search = search.sort("-source_year")
+ if filter_stage:
+ search = search.filter("term", source_stage=filter_stage)
+
+ if sort == "newest":
+ search = search.sort("-source_year")
+ elif sort == "oldest":
+ search = search.sort("source_year")
+ else:
+ search = search.sort("-source_year")
return _execute_ref_query(search, limit=limit, offset=offset)
+
def count_inbound_refs(
es_client: Any,
release_ident: Optional[str] = None,
@@ -251,7 +237,6 @@ def count_inbound_refs(
url_surt: Optional[str] = None,
url: Optional[str] = None,
filter_stage: List[str] = [],
- filter_type: List[str] = [],
es_index: str = "fatcat_ref",
) -> int:
"""
@@ -274,28 +259,26 @@ def count_inbound_refs(
else:
raise ValueError("require a lookup key")
+ if filter_stage:
+ search = search.filter("term", source_stage=filter_stage)
+
return search.count()
-# run elasticsearch mget query for all ref idents and include "enriched" refs when possible
-# for outbound URL refs, would do wayback CDX fetches to find a direct wayback URL
-# TODO: for openlibrary, would this query openlibrary.org API? or some fatcat-specific index?
-#enrich_inbound_refs(refs: List[BiblioRef]) -> List[CslBiblioRef]
-#enrich_outbound_refs(refs: List[BiblioRef]) -> List[CslBiblioRef]
# run fatcat API fetches for each ref and return "enriched" refs
-def enrich_inbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[FatcatBiblioRef]:
+def enrich_inbound_refs(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[EnrichedBiblioRef]:
enriched = []
for ref in refs:
if ref.source_release_ident:
release = fatcat_api_client.get_release(ref.source_release_ident, hide=hide, expand=expand)
- enriched.append(FatcatBiblioRef(
+ enriched.append(EnrichedBiblioRef(
ref=ref,
#csl=None,
access=release_access_options(release),
release=release,
))
else:
- enriched.append(FatcatBiblioRef(
+ enriched.append(EnrichedBiblioRef(
ref=ref,
#csl=None,
access=[],
@@ -303,21 +286,20 @@ def enrich_inbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, hi
))
return enriched
-def enrich_outbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[FatcatBiblioRef]:
+
+def enrich_outbound_refs(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[EnrichedBiblioRef]:
enriched = []
for ref in refs:
if ref.target_release_ident:
release = fatcat_api_client.get_release(ref.target_release_ident, hide=hide, expand=expand)
- enriched.append(FatcatBiblioRef(
+ enriched.append(EnrichedBiblioRef(
ref=ref,
- #csl=None,
access=release_access_options(release),
release=release,
))
else:
- enriched.append(FatcatBiblioRef(
+ enriched.append(EnrichedBiblioRef(
ref=ref,
- #csl=None,
access=[],
release=None,
))
@@ -325,6 +307,9 @@ def enrich_outbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, h
def run_ref_query(args) -> None:
+ """
+ CLI helper/debug tool (prints to stdout)
+ """
release_ident = None
work_ident = None
if args.ident.startswith("release_"):
diff --git a/python/fatcat_web/ref_routes.py b/python/fatcat_web/ref_routes.py
index e08aaf15..e24b4ac6 100644
--- a/python/fatcat_web/ref_routes.py
+++ b/python/fatcat_web/ref_routes.py
@@ -11,14 +11,14 @@ from fatcat_openapi_client.rest import ApiException
from fuzzycat.grobid_unstructured import grobid_api_process_citation, transform_grobid_ref_xml, grobid_ref_to_release
from fuzzycat.simple import close_fuzzy_biblio_matches, close_fuzzy_release_matches
-from fatcat_tools.references import enrich_inbound_refs_fatcat, enrich_outbound_refs_fatcat, get_inbound_refs, get_outbound_refs
+from fatcat_tools.references import enrich_inbound_refs, enrich_outbound_refs, get_inbound_refs, get_outbound_refs
from fatcat_tools.transforms.access import release_access_options
from fatcat_web import app, api, auth_api
from fatcat_web.forms import *
from fatcat_web.entity_helpers import *
-@app.route('/release/<string(length=26):ident>/inbound-refs', methods=['GET'])
+@app.route('/release/<string(length=26):ident>/refs/in', methods=['GET'])
def release_view_refs_inbound(ident):
release = generic_get_entity("release", ident)
@@ -27,11 +27,12 @@ def release_view_refs_inbound(ident):
offset = max(0, int(offset)) if offset.isnumeric() else 0
hits = get_inbound_refs(release_ident=ident, es_client=app.es_client, offset=offset, limit=30)
- enriched_refs = enrich_inbound_refs_fatcat(hits.result_refs, fatcat_api_client=api, expand="container,files,webcaptures")
+ enriched_refs = enrich_inbound_refs(hits.result_refs, fatcat_api_client=api, expand="container,files,webcaptures")
- return render_template('release_view_fuzzy_refs.html', direction="inbound", entity=release, hits=hits, enriched_refs=enriched_refs), 200
+ return render_template('release_view_fuzzy_refs.html', direction="in", entity=release, hits=hits, enriched_refs=enriched_refs), 200
-@app.route('/release/<string(length=26):ident>/outbound-refs', methods=['GET'])
+
+@app.route('/release/<string(length=26):ident>/refs/out', methods=['GET'])
def release_view_refs_outbound(ident):
release = generic_get_entity("release", ident)
@@ -40,9 +41,10 @@ def release_view_refs_outbound(ident):
offset = max(0, int(offset)) if offset.isnumeric() else 0
hits = get_outbound_refs(release_ident=ident, es_client=app.es_client, offset=offset, limit=30)
- enriched_refs = enrich_outbound_refs_fatcat(hits.result_refs, fatcat_api_client=api, expand="container,files,webcaptures")
+ enriched_refs = enrich_outbound_refs(hits.result_refs, fatcat_api_client=api, expand="container,files,webcaptures")
+
+ return render_template('release_view_fuzzy_refs.html', direction="out", entity=release, hits=hits, enriched_refs=enriched_refs), 200
- return render_template('release_view_fuzzy_refs.html', direction="outbound", entity=release, hits=hits, enriched_refs=enriched_refs), 200
@app.route('/reference/match', methods=['GET', 'POST'])
def reference_match():
diff --git a/python/fatcat_web/templates/entity_base.html b/python/fatcat_web/templates/entity_base.html
index c23dbef2..78a151a0 100644
--- a/python/fatcat_web/templates/entity_base.html
+++ b/python/fatcat_web/templates/entity_base.html
@@ -85,10 +85,9 @@
{{ entity_tab("coverage", "Coverage", "/coverage") }}
{% elif entity_type == "release" and entity.state != 'deleted' %}
{{ entity_tab("contribs", "Authors", "/contribs", entity._authors|count ) }}
- {{ entity_tab("references", "References", "/references", entity.refs|count) }}
{% if entity.state == 'active' %}
- {{ entity_tab("inbound-refs", "Inbound", "/inbound-refs") }}
- {{ entity_tab("outbound-refs", "Outbound", "/outbound-refs") }}
+ {{ entity_tab("refs-out", "References", "/refs/out") }}
+ {{ entity_tab("refs-in", "Cited By", "/refs/in") }}
{% endif %}
{% endif %}
{{ entity_tab("metadata", "Metadata", "/metadata") }}
diff --git a/python/fatcat_web/templates/release_view_fuzzy_refs.html b/python/fatcat_web/templates/release_view_fuzzy_refs.html
index 7b286fd3..43860a31 100644
--- a/python/fatcat_web/templates/release_view_fuzzy_refs.html
+++ b/python/fatcat_web/templates/release_view_fuzzy_refs.html
@@ -1,5 +1,5 @@
{% set release = entity %}
-{% set entity_view = "{{ direction }}-refs" %}
+{% set entity_view = "refs-" + direction %}
{% set entity_type = "release" %}
{% import "entity_macros.html" as entity_macros %}
{% extends "entity_base.html" %}
@@ -17,10 +17,10 @@
{% block entity_main %}
-{% if direction == "inbound" %}
- <h3>Referenced By</h3>
- <i>Citations to this release by other works.</i>
-{% elif direction == "outbound" %}
+{% if direction == "in" %}
+ <h3>Cited By</h3>
+ <i>References to this release by other works.</i>
+{% elif direction == "out" %}
<h3>References</h3>
<i>NOTE: currently batch computed and may include additional references sources, or be missing recent changes, compared to entity reference list.</i>
{% endif %}
@@ -36,7 +36,7 @@
{% set release = row.release %}
<tr><td class="collapsing left aligned top aligned">
{# TODO: ref_locator? #}
- {% if direction == "outbound" %}
+ {% if direction == "out" %}
{% if row.ref.ref_key %}
<code>[{{ row.ref.ref_key }}]</code><br>
{% endif %}