From a5a8811a605080f2cd9eb575c33a17f045c43674 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 15 Apr 2021 17:01:21 -0700 Subject: initial inbound/outbound reference query helpers --- python/fatcat_tools/references.py | 450 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 450 insertions(+) create mode 100644 python/fatcat_tools/references.py (limited to 'python/fatcat_tools/references.py') diff --git a/python/fatcat_tools/references.py b/python/fatcat_tools/references.py new file mode 100644 index 00000000..c9730174 --- /dev/null +++ b/python/fatcat_tools/references.py @@ -0,0 +1,450 @@ +""" +Helper routines for working with the fatcat citation graph, which is a separate +index of reference links between works in the main catalog. + +See bulk citation and citation API proposals for design documentation. + +TODO: + + surt_ify() helper (URL to SURT for queries) + CSL enrichment method (using only elasticsearch mget) + CSL enrichment for fatcat enrichment + access transform + microfilm access in access transform + + all_outbound_refs(...) -> List[BiblioRef] + all_inbound_refs(...) -> List[BiblioRef] + same as get_outbound_refs()/get_inbound_refs(), but does a scroll (return list or iterator?) 
+ (optional; maybe not public) +""" + +import sys +import json +import datetime +import argparse +from enum import Enum +from typing import Optional, List, Any, Dict + +from pydantic import BaseModel +import elasticsearch +from elasticsearch_dsl import Search, Q +from fatcat_openapi_client import ReleaseEntity + +from fatcat_tools import public_api + + + +class BiblioRef(BaseModel): + """bibliographic reference""" + # ("release", source_release_ident, ref_index) + # ("wikipedia", source_wikipedia_article, ref_index) + _key: Optional[str] + update_ts: Optional[datetime.datetime] + + # metadata about source of reference + source_release_ident: Optional[str] + source_work_ident: Optional[str] + # with lang prefix like "en:Superglue" + source_wikipedia_article: Optional[str] + # skipped: source_openlibrary_work + # skipped: source_url_surt + source_release_stage: Optional[str] + source_year: Optional[int] + + # context of the reference itself + # 1-indexed, not 0-indexed + ref_index: Optional[int] # TODO: actually optional? + # eg, "Lee86", "BIB23" + ref_key: Optional[str] + # eg, page number + ref_locator: Optional[str] + + # target of reference (identifiers) + target_release_ident: Optional[str] + target_work_ident: Optional[str] + target_openlibrary_work: Optional[str] + target_url_surt: Optional[str] + # would not be stored in elasticsearch, but would be auto-generated by all "get" methods from the SURT, so calling code does not need to do SURT transform + target_url: Optional[str] + # skipped: target_wikipedia_article + + # crossref, pubmed, grobid, etc + match_provenance: str + # strong, weak, etc + match_status: Optional[str] + # TODO: "match_strength"? + # "doi", "isbn", "fuzzy title, author", etc + # maybe "fuzzy-title-author"? 
+ match_reason: Optional[str] + + # only if no release_ident link/match + target_unstructured: Optional[str] + target_csl: Optional[Dict[str, Any]] + +class AccessType(str, Enum): + """describes type of access URL""" + + wayback = "wayback" + ia_file = "ia_file" + ia_microfilm = "ia_microfilm" + repository = "repository" + +class AccessOption(BaseModel): + + access_type: AccessType + + # note: for `target_url` refs, would do a CDX lookup and this URL would be + # a valid/HTTP-200 web.archive.org capture URL + access_url: str + + # application/pdf, text/html, etc + # blank for landing pages + mimetype: Optional[str] + + size_bytes: Optional[int] + thumbnail_url: Optional[str] + +class CslBiblioRef(BaseModel): + # an "enriched" version of BiblioRef with metadata about the source or + # target entity. would be "hydrated" via a lookup to, eg, the + # `fatcat_release` elasticsearch index (fast mget fetch with a single + # request), as opposed to fatcat API fetches + ref: BiblioRef + csl: Optional[Dict[str, Any]] + access: List[AccessOption] + + class Config: + arbitrary_types_allowed = True + +class FatcatBiblioRef(BaseModel): + # enriched version of BiblioRef with complete ReleaseEntity object as + # fetched from the fatcat API. CSL-JSON metadata would be derived from + # the full release entity. 
+ ref: BiblioRef + release: Optional[ReleaseEntity] + csl: Optional[Dict[str, Any]] + access: List[AccessOption] + + class Config: + arbitrary_types_allowed = True + +class RefHits(BaseModel): + count_returned: int + count_total: int + offset: int + limit: int + query_time_ms: int + query_wall_time_ms: int + result_refs: List[BiblioRef] + +def _execute_ref_query(search: Any, limit: int, offset: Optional[int] = None) -> List[BiblioRef]: + + limit = min((int(limit or 15), 200)) + if not offset or offset < 0: + offset = 0 + + search = search.params(track_total_hits=True) + search = search[offset : (offset + limit)] + + query_start = datetime.datetime.now() + try: + resp = search.execute() + except elasticsearch.exceptions.RequestError as e_raw: + # this is a "user" error + e: Any = e_raw + #logging.warn("elasticsearch 400: " + str(e.info)) + if e.info.get("error", {}).get("root_cause", {}): + raise ValueError(str(e.info["error"]["root_cause"][0].get("reason"))) from e + else: + raise ValueError(str(e.info)) from e + except elasticsearch.exceptions.TransportError as e: + # all other errors + #logging.warn(f"elasticsearch non-200 status code: {e.info}") + raise IOError(str(e.info)) from e + query_delta = datetime.datetime.now() - query_start + + result_refs = [] + for h in resp.hits: + # might be a list because of consolidation + if isinstance(h._d_.get('source_work_ident'), list): + h._d_['source_work_ident'] = h._d_['source_work_ident'][0] + result_refs.append(BiblioRef.parse_obj(h._d_)) + + return RefHits( + count_returned=len(result_refs), + # ES 7.x style "total" + count_total=resp.hits.total.value, + offset=offset, + limit=limit, + query_time_ms=int(resp.took), + query_wall_time_ms=int(query_delta.total_seconds() * 1000), + result_refs=result_refs, + ) + + +def get_outbound_refs( + es_client: Any, + release_ident: Optional[str] = None, + work_ident: Optional[str] = None, + wikipedia_article: Optional[str] = None, + limit: int = 100, + offset: Optional[int] = None, 
+ es_index: str = "fatcat_ref", +) -> List[BiblioRef]: + + search = Search(using=es_client, index=es_index) + + if release_ident: + search = search.filter("term", source_release_ident=release_ident) + elif work_ident: + search = search.filter("term", source_work_ident=work_ident) + elif wikipedia_article: + search = search.filter("term", source_wikipedia_article=wikipedia_article) + else: + raise ValueError("require a lookup key") + + # TODO: schema doesn't support either of these currently + #search = search.sort("ref_index") + #search = search.sort("ref_key") + + # re-sort by index + hits = _execute_ref_query(search, limit=limit, offset=offset) + hits.result_refs = sorted(hits.result_refs, key=lambda r: r.ref_index or 0) + return hits + +def get_inbound_refs( + es_client: Any, + release_ident: Optional[str] = None, + work_ident: Optional[str] = None, + openlibrary_work: Optional[str] = None, + url_surt: Optional[str] = None, + url: Optional[str] = None, + consolidate_works: bool = True, + filter_stage: List[str] = [], + filter_type: List[str] = [], + limit: int = 25, + offset: Optional[int] = None, + es_index: str = "fatcat_ref", +) -> List[BiblioRef]: + # TODO: filter_stage, filter_type + + if url and not url_surt: + url = surt_ify(url) + + search = Search(using=es_client, index=es_index) + + if consolidate_works: + search = search.extra( + collapse={ + "field": "source_work_ident", + "inner_hits": {"name": "source_more", "size": 0,}, + } + ) + + if release_ident: + search = search.filter("term", target_release_ident=release_ident) + elif work_ident: + search = search.filter("term", target_work_ident=work_ident) + elif openlibrary_work: + search = search.filter("term", target_openlibrary_work=openlibrary_work) + elif url_surt: + search = search.filter("term", target_url_surt=url_surt) + else: + raise ValueError("require a lookup key") + + # TODO: wrong type, not int? and maybe need to index differently? 
+ #search = search.sort("source_year") + + return _execute_ref_query(search, limit=limit, offset=offset) + +def count_inbound_refs( + es_client: Any, + release_ident: Optional[str] = None, + work_ident: Optional[str] = None, + openlibrary_work: Optional[str] = None, + url_surt: Optional[str] = None, + url: Optional[str] = None, + filter_stage: List[str] = [], + filter_type: List[str] = [], + es_index: str = "fatcat_ref", +) -> int: + """ + Same parameters as get_inbound_refs(), but returns just a count + """ + + if url and not url_surt: + url = surt_ify(url) + + search = Search(using=es_client, index=es_index) + + if release_ident: + search = search.filter("term", target_release_ident=release_ident) + elif work_ident: + search = search.filter("term", target_work_ident=work_ident) + elif openlibrary_work: + search = search.filter("term", target_openlibrary_work=openlibrary_work) + elif url_surt: + search = search.filter("term", target_url_surt=url_surt) + else: + raise ValueError("require a lookup key") + + return search.count() + +def _release_access(release: ReleaseEntity) -> List[AccessOption]: + """ + Extracts access options from a release. + """ + options = [] + for f in (release.files or []): + for u in (f.urls or []): + if '://web.archive.org/' in u.url: + return [AccessOption( + access_type="wayback", + access_url=u.url, + mimetype=f.mimetype, + size_bytes=f.size, + thumbnail_url=None + )] + elif '://archive.org/' in u.url: + return [AccessOption( + access_type="ia_file", + access_url=u.url, + mimetype=f.mimetype, + size_bytes=f.size, + thumbnail_url=None + )] + return options + +# run elasticsearch mget query for all ref idents and include "enriched" refs when possible +# for outbound URL refs, would do wayback CDX fetches to find a direct wayback URL +# TODO: for openlibrary, would this query openlibrary.org API? or some fatcat-specific index? 
+#enrich_inbound_refs(refs: List[BiblioRef]) -> List[CslBiblioRef] +#enrich_outbound_refs(refs: List[BiblioRef]) -> List[CslBiblioRef] + +# run fatcat API fetches for each ref and return "enriched" refs +def enrich_inbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[FatcatBiblioRef]: + enriched = [] + for ref in refs: + if ref.source_release_ident: + release = fatcat_api_client.get_release(ref.source_release_ident, hide=hide, expand=expand) + enriched.append(FatcatBiblioRef( + ref=ref, + csl=None, + access=_release_access(release), + release=release, + )) + else: + enriched.append(FatcatBiblioRef( + ref=ref, + csl=None, + access=[], + release=None, + )) + return enriched + +def enrich_outbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[FatcatBiblioRef]: + enriched = [] + for ref in refs: + if ref.target_release_ident: + release = fatcat_api_client.get_release(ref.target_release_ident, hide=hide, expand=expand) + enriched.append(FatcatBiblioRef( + ref=ref, + csl=None, + access=_release_access(release), + release=release, + )) + else: + enriched.append(FatcatBiblioRef( + ref=ref, + csl=None, + access=[], + release=None, + )) + return enriched + + +def run_ref_query(args) -> None: + release_ident = None + work_ident = None + if args.ident.startswith("release_"): + release_ident = args.ident.split('_')[1] + elif args.ident.startswith("work_"): + work_ident = args.ident.split('_')[1] + else: + release_ident = args.ident + + print("## Outbound References") + hits = get_outbound_refs(release_ident=release_ident, work_ident=work_ident, es_client=args.es_client) + print(f"Total: {hits.count_total} Time: {hits.query_wall_time_ms}ms; {hits.query_time_ms}ms") + + if args.enrich == "fatcat": + enriched = 
enrich_outbound_refs_fatcat(hits.result_refs, hide='refs,abstracts', fatcat_api_client=args.fatcat_api_client) + for ref in enriched: + if ref.release: + print(f"{ref.ref.ref_index or '-'}\trelease_{ref.release.ident}\t{ref.ref.match_provenance}/{ref.ref.match_status}\t{ref.release.release_year or '-'}\t{ref.release.title}\t{ref.release.ext_ids.pmid or ref.release.ext_ids.doi or '-'}") + else: + print(f"{ref.ref.ref_index or '-'}\trelease_{ref.target_release_ident}") + else: + for ref in hits.result_refs: + print(f"{ref.ref.ref_index or '-'}\trelease_{ref.target_release_ident}") + + print() + print("## Inbound References") + hits = get_inbound_refs(release_ident=release_ident, work_ident=work_ident, es_client=args.es_client) + + print(f"Total: {hits.count_total} Time: {hits.query_wall_time_ms}ms; {hits.query_time_ms}ms") + + if args.enrich == "fatcat": + enriched = enrich_inbound_refs_fatcat(hits.result_refs, hide='refs,abstracts', fatcat_api_client=args.fatcat_api_client) + for ref in enriched: + if ref.release: + print(f"release_{ref.release.ident}\t{ref.ref.match_provenance}/{ref.ref.match_status}\t{ref.release.release_year or '-'}\t{ref.release.title}\t{ref.release.ext_ids.pmid or ref.release.ext_ids.doi or '-'}") + else: + print(f"release_{ref.target_release_ident}") + else: + for ref in hits.result_refs: + print(f"work_{ref.source_work_ident}\trelease_{ref.source_release_ident}") + +def main() -> None: + """ + Run this utility like: + + python -m fatcat_tools.references + + Examples: + + python -m fatcat_tools.references query release_pfrind3kh5hqhgqkueulk2tply + """ + + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + subparsers = parser.add_subparsers() + + parser.add_argument("--fatcat-api-base", default="https://api.fatcat.wiki/v0") + parser.add_argument("--elasticsearch-base", default="https://search.fatcat.wiki") + parser.add_argument("--elasticsearch-ref-index", default="fatcat_ref") + + sub = 
subparsers.add_parser( + "query", + help="takes a fatcat ident argument, prints both inbound and outbound references", + ) + sub.set_defaults(func="run_ref_query") + sub.add_argument("ident", type=str) + sub.add_argument("--enrich", type=str) + + args = parser.parse_args() + if not args.__dict__.get("func"): + parser.print_help(file=sys.stderr) + sys.exit(-1) + + args.es_client = elasticsearch.Elasticsearch(args.elasticsearch_base) + args.fatcat_api_client = public_api(args.fatcat_api_base) + + if args.func == "run_ref_query": + run_ref_query(args) + else: + raise NotImplementedError(args.func) + +if __name__ == "__main__": + main() -- cgit v1.2.3 From 15680e0caae7ff6e24ddca8584b0c590d2b30581 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 15 Apr 2021 23:30:11 -0700 Subject: references: refactor to point to access_options transform; comment out CSL fields --- python/fatcat_tools/references.py | 65 +++++---------------------------------- 1 file changed, 8 insertions(+), 57 deletions(-) (limited to 'python/fatcat_tools/references.py') diff --git a/python/fatcat_tools/references.py b/python/fatcat_tools/references.py index c9730174..7e1f4f71 100644 --- a/python/fatcat_tools/references.py +++ b/python/fatcat_tools/references.py @@ -22,7 +22,6 @@ import sys import json import datetime import argparse -from enum import Enum from typing import Optional, List, Any, Dict from pydantic import BaseModel @@ -31,7 +30,7 @@ from elasticsearch_dsl import Search, Q from fatcat_openapi_client import ReleaseEntity from fatcat_tools import public_api - +from fatcat_tools.transforms.access import release_access_options, AccessOption class BiblioRef(BaseModel): @@ -81,29 +80,6 @@ class BiblioRef(BaseModel): target_unstructured: Optional[str] target_csl: Optional[Dict[str, Any]] -class AccessType(str, Enum): - """describes type of access URL""" - - wayback = "wayback" - ia_file = "ia_file" - ia_microfilm = "ia_microfilm" - repository = "repository" - -class 
AccessOption(BaseModel): - - access_type: AccessType - - # note: for `target_url` refs, would do a CDX lookup and this URL would be - # a valid/HTTP-200 web.archive.org capture URL - access_url: str - - # application/pdf, text/html, etc - # blank for landing pages - mimetype: Optional[str] - - size_bytes: Optional[int] - thumbnail_url: Optional[str] - class CslBiblioRef(BaseModel): # an "enriched" version of BiblioRef with metadata about the source or # target entity. would be "hydrated" via a lookup to, eg, the @@ -122,7 +98,7 @@ class FatcatBiblioRef(BaseModel): # the full release entity. ref: BiblioRef release: Optional[ReleaseEntity] - csl: Optional[Dict[str, Any]] + #csl: Optional[Dict[str, Any]] access: List[AccessOption] class Config: @@ -290,31 +266,6 @@ def count_inbound_refs( return search.count() -def _release_access(release: ReleaseEntity) -> List[AccessOption]: - """ - Extracts access options from a release. - """ - options = [] - for f in (release.files or []): - for u in (f.urls or []): - if '://web.archive.org/' in u.url: - return [AccessOption( - access_type="wayback", - access_url=u.url, - mimetype=f.mimetype, - size_bytes=f.size, - thumbnail_url=None - )] - elif '://archive.org/' in u.url: - return [AccessOption( - access_type="ia_file", - access_url=u.url, - mimetype=f.mimetype, - size_bytes=f.size, - thumbnail_url=None - )] - return options - # run elasticsearch mget query for all ref idents and include "enriched" refs when possible # for outbound URL refs, would do wayback CDX fetches to find a direct wayback URL # TODO: for openlibrary, would this query openlibrary.org API? or some fatcat-specific index? 
@@ -329,14 +280,14 @@ def enrich_inbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, hi release = fatcat_api_client.get_release(ref.source_release_ident, hide=hide, expand=expand) enriched.append(FatcatBiblioRef( ref=ref, - csl=None, - access=_release_access(release), + #csl=None, + access=release_access_options(release), release=release, )) else: enriched.append(FatcatBiblioRef( ref=ref, - csl=None, + #csl=None, access=[], release=None, )) @@ -349,14 +300,14 @@ def enrich_outbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, h release = fatcat_api_client.get_release(ref.target_release_ident, hide=hide, expand=expand) enriched.append(FatcatBiblioRef( ref=ref, - csl=None, - access=_release_access(release), + #csl=None, + access=release_access_options(release), release=release, )) else: enriched.append(FatcatBiblioRef( ref=ref, - csl=None, + #csl=None, access=[], release=None, )) -- cgit v1.2.3 From c2395869ff7860bb2c7f080fd6c097e299ea58bf Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 23 Jun 2021 19:49:36 -0700 Subject: fixes for newer ref index --- python/fatcat_tools/references.py | 2 +- .../templates/release_view_fuzzy_refs.html | 59 ++++------------------ 2 files changed, 11 insertions(+), 50 deletions(-) (limited to 'python/fatcat_tools/references.py') diff --git a/python/fatcat_tools/references.py b/python/fatcat_tools/references.py index 7e1f4f71..976967d4 100644 --- a/python/fatcat_tools/references.py +++ b/python/fatcat_tools/references.py @@ -68,7 +68,7 @@ class BiblioRef(BaseModel): # skipped: target_wikipedia_article # crossref, pubmed, grobid, etc - match_provenance: str + match_provenance: Optional[str] # strong, weak, etc match_status: Optional[str] # TODO: "match_strength"? 
diff --git a/python/fatcat_web/templates/release_view_fuzzy_refs.html b/python/fatcat_web/templates/release_view_fuzzy_refs.html index 9ceb6060..ee39d15b 100644 --- a/python/fatcat_web/templates/release_view_fuzzy_refs.html +++ b/python/fatcat_web/templates/release_view_fuzzy_refs.html @@ -33,56 +33,17 @@ {% endif %} {% endif %}
{{ ref.ref.match_status }} -
{{ ref.ref.match_provenance }} + {% if ref.ref.match_provenance %} +
{{ ref.ref.match_provenance }} + {% endif %} - {{ release.title }} - {% if release.release_type not in ["article-journal", "conference-paper"] %} - [{{ release.release_type or "unknown-type" }}] - {% endif %} -
- {% for contrib in release.contribs[:5] %} - {% if contrib.creator %} - {{ contrib.creator.display_name }} - {% else %} - {{ contrib.raw_name }} - {% endif %} - {% if not loop.last %}, {% endif %} - {% endfor %} - {% if release.contribs | length > 5 %}(+ more) {%endif %} -
- {% if release.release_year %}{{ release.release_year }}  {% endif %} - {% if release.container %} - {{ release.container.name }} - {% elif release.extra and release.extra.container_name %} - {{ release.extra.container_name }} - {% endif %} - {% if release.release_stage != "published" %} -  {{ release.release_stage or "unpublished" }} - {% endif %} - -
- {% if release.version %} - version:{{ release.release_year }}  - {% endif %} - {% if release.number %} - number:{{ release.number }}  - {% endif %} - {% if release.ext_ids.doi %} - doi:{{ release.ext_ids.doi }}  - {% endif %} - {# TODO: links #} - {% if release.ext_ids.arxiv %} - arXiv:{{ release.ext_ids.arxiv }}  - {% endif %} - {% if release.ext_ids.pmcid %} - pmcid:{{ release.ext_ids.pmcid }}  - {% endif %} - {% if release.ext_ids.pmid %} - pmid:{{ release.ext_ids.pmid }}  - {% endif %} - {% if release.ext_ids.dblp %} - dblp:{{ release.ext_ids.dblp }}  - {% endif %} + {% if release %} + {% entity_macros.release_summary(release) %} + {% elif ref.ref.target_unstructured %} + {{ ref.ref.target_unstructured }} + {% else %} + blank + {% endif %} {% if ref.access %} {{ ref.access[0].access_type.name }} -- cgit v1.2.3 From 0d17bad63b2d92220b8ddaeb9b5733b2b09f57a0 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 24 Jun 2021 18:48:40 -0700 Subject: refs fetch: add some hacks; sort hits --- python/fatcat_tools/references.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'python/fatcat_tools/references.py') diff --git a/python/fatcat_tools/references.py b/python/fatcat_tools/references.py index 976967d4..1d8a0d0d 100644 --- a/python/fatcat_tools/references.py +++ b/python/fatcat_tools/references.py @@ -80,6 +80,18 @@ class BiblioRef(BaseModel): target_unstructured: Optional[str] target_csl: Optional[Dict[str, Any]] + def hacks(self): + """ + Temporary (?) hacks to work around schema/data issues + """ + if self.target_openlibrary_work and self.target_openlibrary_work.startswith("/works/"): + self.target_openlibrary_work = self.target_openlibrary_work[7:] + if self.target_url_surt and not self.target_url: + # TODO: convert SURT to regular URL + pass + # TODO: if target_openlibrary_work, add an access option? 
+ return self + class CslBiblioRef(BaseModel): # an "enriched" version of BiblioRef with metadata about the source or # target entity. would be "hydrated" via a lookup to, eg, the @@ -98,6 +110,7 @@ class FatcatBiblioRef(BaseModel): # the full release entity. ref: BiblioRef release: Optional[ReleaseEntity] + # TODO: openlibrary work? #csl: Optional[Dict[str, Any]] access: List[AccessOption] @@ -144,7 +157,7 @@ def _execute_ref_query(search: Any, limit: int, offset: Optional[int] = None) -> # might be a list because of consolidation if isinstance(h._d_.get('source_work_ident'), list): h._d_['source_work_ident'] = h._d_['source_work_ident'][0] - result_refs.append(BiblioRef.parse_obj(h._d_)) + result_refs.append(BiblioRef.parse_obj(h._d_).hacks()) return RefHits( count_returned=len(result_refs), @@ -179,9 +192,7 @@ def get_outbound_refs( else: raise ValueError("require a lookup key") - # TODO: schema doesn't support either of these currently - #search = search.sort("ref_index") - #search = search.sort("ref_key") + search = search.sort("ref_index") # re-sort by index hits = _execute_ref_query(search, limit=limit, offset=offset) @@ -228,8 +239,7 @@ def get_inbound_refs( else: raise ValueError("require a lookup key") - # TODO: wrong type, not int? and maybe need to index differently? 
- #search = search.sort("source_year") + search = search.sort("-source_year") return _execute_ref_query(search, limit=limit, offset=offset) -- cgit v1.2.3 From 7489ef7a979574effa74f1f17cebb81eefb1b71a Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 23 Jul 2021 11:56:42 -0700 Subject: refs: refactor web paths; enrich refs as generic; remove old refs link --- proposals/2021-01-29_citation_api.md | 102 ++++++++------------- python/fatcat_tools/references.py | 85 +++++++---------- python/fatcat_web/ref_routes.py | 16 ++-- python/fatcat_web/templates/entity_base.html | 5 +- .../templates/release_view_fuzzy_refs.html | 12 +-- 5 files changed, 91 insertions(+), 129 deletions(-) (limited to 'python/fatcat_tools/references.py') diff --git a/proposals/2021-01-29_citation_api.md b/proposals/2021-01-29_citation_api.md index 1e329d61..f8d9e676 100644 --- a/proposals/2021-01-29_citation_api.md +++ b/proposals/2021-01-29_citation_api.md @@ -41,13 +41,13 @@ into a columnar file format like Parquet to get storage efficiency advances, type/schema enforcement, and easier ingest and use for large-scale data analysis. -TODO: more? - ## Schemas First, a combined JSON/pydantic/elasticsearch object that represents a -reference between two things: +reference from one thing to another, where the "source" must be known, but the +"target" may either be known ("matched") or ambiguous (eg, just a reference +string): BiblioRef ("bibliographic reference") _key: Optional[str] elasticsearch doc key @@ -60,8 +60,6 @@ reference between two things: source_work_ident: Optional[str] source_wikipedia_article: Optional[str] with lang prefix like "en:Superglue" - # skipped: source_openlibrary_work - # skipped: source_url_surt source_release_stage: Optional[str] source_year: Optional[int] @@ -71,7 +69,9 @@ reference between two things: ref_key: Optional[str] eg, "Lee86", "BIB23" ref_locator: Optional[str] - eg, page number + eg, specific page number in the book being referenced, if + applicable. 
Not used for, eg, first page of paper in a + volume/issue. # target of reference (identifiers) target_release_ident: Optional[str] @@ -82,15 +82,15 @@ reference between two things: would not be stored in elasticsearch, but would be auto-generated by all "get" methods from the SURT, so calling code does not need to do SURT transform - # skipped: target_wikipedia_article match_provenance: str crossref, pubmed, grobid, etc + TODO: "ref_provenance" match_status: Optional[str] strong, weak, etc - TODO: "match_strength"? + TODO: "match_strength"? "match_confidence"? match_reason: Optional[str] - "doi", "isbn", "fuzzy title, author", etc + "doi", "isbn", "title-fuzzy, author", etc maybe "fuzzy-title-author"? target_unstructured: string (only if no release_ident link/match) @@ -116,33 +116,22 @@ jinja templated to display lists of references in the user interface. size_bytes: Optional[int] thumbnail_url: Optional[str] - CslBiblioRef - # an "enriched" version of BiblioRef with metadata about the source or - # target entity. would be "hydrated" via a lookup to, eg, the - # `fatcat_release` elasticsearch index (fast mget fetch with a single - # request), as opposed to fatcat API fetches - biblio_ref: BiblioRef - source_csl/target_csl: free-form CSL-JSON - source_access/target_access: List[AccessOption] - - FatcatBiblioRef + EnrichedBiblioRef # enriched version of BiblioRef with complete ReleaseEntity object as - # fetched from the fatcat API. CSL-JSON metadata would be derived from - # the full release entity. + # fetched from entity catalogs, if available. For example, fatcat API. biblio_ref: BiblioRef source_release/target_release: Optional[ReleaseEntity] complete ReleaseEntity from API, with optional expand/hide fields - source_csl/target_csl: free-form CSL-JSON - CSL-JSON version of ReleaseEntity metadata source_access/target_access: List[AccessOption] + # TODO: target_openlibrary? source_wikipedia? 
## Datastore Would store in Elasticsearch as a live database, at least to start. -TODO: try generating ~1 million of these objects to estimate index size (at -billions of docs). +Example Elasticsearch index `fatcat_ref_v02_20210716` has 1.8 billion docs +(references), and consumes 435 GBytes of disk. Might be reasonable to use PostgreSQL in the future, with more explicit control over indexes and tuning for latency. But Elasticsearch is pretty easy to @@ -172,59 +161,46 @@ operate (eg, replicas). count_inbound_refs(...) -> int same parameters as get_inbound_refs(), but returns just a count - get_all_outbound_refs(...) -> List[BiblioRef] - get_all_inbound_refs(...) -> List[BiblioRef] - same as get_outbound_refs()/get_inbound_refs(), but does a scroll (return list or iterator?) - (optional; maybe not public) + # UNIMPLEMENTED + #get_all_outbound_refs(...) -> List[BiblioRef] + #get_all_inbound_refs(...) -> List[BiblioRef] + # same as get_outbound_refs()/get_inbound_refs(), but does a scroll (return list or iterator?) + # (optional; maybe not public) - # run elasticsearch mget query for all ref idents and include "enriched" refs when possible - # for outbound URL refs, would do wayback CDX fetches to find a direct wayback URL - # TODO: for openlibrary, would this query openlibrary.org API? or some fatcat-specific index? 
- enrich_inbound_refs(refs: List[BiblioRef]) -> List[CslBiblioRef] - enrich_outbound_refs(refs: List[BiblioRef]) -> List[CslBiblioRef] - - # run fatcat API fetches for each ref and return "enriched" refs - enrich_inbound_refs_fatcat(refs: List[BiblioRef], hide, expand) -> List[FatcatBiblioRef] - enrich_outbound_refs_fatcat(refs: List[BiblioRef], hide, expand) -> List[FatcatBiblioRef] + # run catalog API fetches for each and return "enriched" refs + enrich_inbound_refs(refs: List[BiblioRef], hide, expand) -> List[EnrichedBiblioRef] + enrich_outbound_refs(refs: List[BiblioRef], hide, expand) -> List[EnrichedBiblioRef] ## HTTP API Endpoints -Possible HTTP API endpoints... not even sure we would use these or expose them -publicly? - - citations-api.fatcat.wiki - /refs/inbound - &release_ident= - &work_ident= - &openlibrary_work= - &url= - /refs/outbound - &release_ident= - &work_ident= - /refs/csl/outbound - /refs/fatcat/outbound - - api.fatcat.wiki/citations/v0 - /inbound - - fatcat.wiki/release/{release_ident}/refs/outbound.json - fatcat.wiki/work/{work_ident}/refs/outbound.json - &filter_type - &filter_stage +Initial web endpoints, including unstable pseudo-APIs: + + fatcat.wiki/release/{release_ident}/refs/in (and .json) + fatcat.wiki/release/{release_ident}/refs/out (and .json) &limit &offset + &sort (for inbound) + &filter_stage (for inbound) - fatcat.wiki/refs/openlibrary/{openlibrary_ident}/inbound.json + fatcat.wiki/openlibrary/{openlibrary_ident}/refs/in (and .json) + &limit + &offset + &sort + &filter_stage - fatcat.wiki/refs/url/inbound.json + fatcat.wiki/web/refs/in (and .json) &url= + &limit + &offset + &sort (newest, oldest) + &filter_stage ## Design Notes This proposed schema is relatively close to what the "normalize" SQL table would look like (many-to-many relationship). 
-Especiall for "redistributing as bulk corpus", we might want to consider an +Especially for "redistributing as bulk corpus", we might want to consider an alternative data model which is a single source entity containing a list of outbound references. Could even be a single source *work* for fatcat content, with many release under the entity. One advantage of this is that source diff --git a/python/fatcat_tools/references.py b/python/fatcat_tools/references.py index 1d8a0d0d..a0079efd 100644 --- a/python/fatcat_tools/references.py +++ b/python/fatcat_tools/references.py @@ -3,26 +3,13 @@ Helper routines for working with the fatcat citation graph, which is a separate index of reference links between works in the main catalog. See bulk citation and citation API proposals for design documentation. - -TODO: - - surt_ify() helper (URL to SURT for queries) - CSL enrichment method (using only elasticsearch mget) - CSL enrichment for fatcat enrichment - access transform - microfilm access in access transform - - all_outbound_refs(...) -> List[BiblioRef] - all_inbound_refs(...) -> List[BiblioRef] - same as get_outbound_refs()/get_inbound_refs(), but does a scroll (return list or iterator?) 
- (optional; maybe not public) """ import sys import json import datetime import argparse -from typing import Optional, List, Any, Dict +from typing import Optional, List, Any, Dict, Union from pydantic import BaseModel import elasticsearch @@ -45,8 +32,6 @@ class BiblioRef(BaseModel): source_work_ident: Optional[str] # with lang prefix like "en:Superglue" source_wikipedia_article: Optional[str] - # skipped: source_openlibrary_work - # skipped: source_url_surt source_release_stage: Optional[str] source_year: Optional[int] @@ -65,7 +50,6 @@ class BiblioRef(BaseModel): target_url_surt: Optional[str] # would not be stored in elasticsearch, but would be auto-generated by all "get" methods from the SURT, so calling code does not need to do SURT transform target_url: Optional[str] - # skipped: target_wikipedia_article # crossref, pubmed, grobid, etc match_provenance: Optional[str] @@ -92,31 +76,20 @@ class BiblioRef(BaseModel): # TODO: if target_openlibrary_work, add an access option? return self -class CslBiblioRef(BaseModel): - # an "enriched" version of BiblioRef with metadata about the source or - # target entity. would be "hydrated" via a lookup to, eg, the - # `fatcat_release` elasticsearch index (fast mget fetch with a single - # request), as opposed to fatcat API fetches - ref: BiblioRef - csl: Optional[Dict[str, Any]] - access: List[AccessOption] - - class Config: - arbitrary_types_allowed = True -class FatcatBiblioRef(BaseModel): +class EnrichedBiblioRef(BaseModel): # enriched version of BiblioRef with complete ReleaseEntity object as # fetched from the fatcat API. CSL-JSON metadata would be derived from # the full release entity. ref: BiblioRef release: Optional[ReleaseEntity] # TODO: openlibrary work? 
- #csl: Optional[Dict[str, Any]] access: List[AccessOption] class Config: arbitrary_types_allowed = True + class RefHits(BaseModel): count_returned: int count_total: int @@ -124,9 +97,13 @@ class RefHits(BaseModel): limit: int query_time_ms: int query_wall_time_ms: int - result_refs: List[BiblioRef] + result_refs: List[Union[BiblioRef,EnrichedBiblioRef]] -def _execute_ref_query(search: Any, limit: int, offset: Optional[int] = None) -> List[BiblioRef]: + +def _execute_ref_query(search: Any, limit: int, offset: Optional[int] = None) -> RefHits: + """ + Internal helper for querying elasticsearch refs index and transforming hits + """ limit = min((int(limit or 15), 200)) if not offset or offset < 0: @@ -179,7 +156,7 @@ def get_outbound_refs( limit: int = 100, offset: Optional[int] = None, es_index: str = "fatcat_ref", -) -> List[BiblioRef]: +) -> RefHits: search = Search(using=es_client, index=es_index) @@ -199,6 +176,7 @@ def get_outbound_refs( hits.result_refs = sorted(hits.result_refs, key=lambda r: r.ref_index or 0) return hits + def get_inbound_refs( es_client: Any, release_ident: Optional[str] = None, @@ -208,12 +186,11 @@ def get_inbound_refs( url: Optional[str] = None, consolidate_works: bool = True, filter_stage: List[str] = [], - filter_type: List[str] = [], + sort: Optional[str] = None, limit: int = 25, offset: Optional[int] = None, es_index: str = "fatcat_ref", ) -> List[BiblioRef]: - # TODO: filter_stage, filter_type if url and not url_surt: url = surt_ify(url) @@ -239,10 +216,19 @@ def get_inbound_refs( else: raise ValueError("require a lookup key") - search = search.sort("-source_year") + if filter_stage: + search = search.filter("term", source_stage=filter_stage) + + if sort == "newest": + search = search.sort("-source_year") + elif sort == "oldest": + search = search.sort("source_year") + else: + search = search.sort("-source_year") return _execute_ref_query(search, limit=limit, offset=offset) + def count_inbound_refs( es_client: Any, release_ident: 
Optional[str] = None, @@ -251,7 +237,6 @@ def count_inbound_refs( url_surt: Optional[str] = None, url: Optional[str] = None, filter_stage: List[str] = [], - filter_type: List[str] = [], es_index: str = "fatcat_ref", ) -> int: """ @@ -274,28 +259,26 @@ def count_inbound_refs( else: raise ValueError("require a lookup key") + if filter_stage: + search = search.filter("term", source_stage=filter_stage) + return search.count() -# run elasticsearch mget query for all ref idents and include "enriched" refs when possible -# for outbound URL refs, would do wayback CDX fetches to find a direct wayback URL -# TODO: for openlibrary, would this query openlibrary.org API? or some fatcat-specific index? -#enrich_inbound_refs(refs: List[BiblioRef]) -> List[CslBiblioRef] -#enrich_outbound_refs(refs: List[BiblioRef]) -> List[CslBiblioRef] # run fatcat API fetches for each ref and return "enriched" refs -def enrich_inbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[FatcatBiblioRef]: +def enrich_inbound_refs(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[EnrichedBiblioRef]: enriched = [] for ref in refs: if ref.source_release_ident: release = fatcat_api_client.get_release(ref.source_release_ident, hide=hide, expand=expand) - enriched.append(FatcatBiblioRef( + enriched.append(EnrichedBiblioRef( ref=ref, #csl=None, access=release_access_options(release), release=release, )) else: - enriched.append(FatcatBiblioRef( + enriched.append(EnrichedBiblioRef( ref=ref, #csl=None, access=[], @@ -303,21 +286,20 @@ def enrich_inbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, hi )) return enriched -def enrich_outbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") 
-> List[FatcatBiblioRef]: + +def enrich_outbound_refs(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[EnrichedBiblioRef]: enriched = [] for ref in refs: if ref.target_release_ident: release = fatcat_api_client.get_release(ref.target_release_ident, hide=hide, expand=expand) - enriched.append(FatcatBiblioRef( + enriched.append(EnrichedBiblioRef( ref=ref, - #csl=None, access=release_access_options(release), release=release, )) else: - enriched.append(FatcatBiblioRef( + enriched.append(EnrichedBiblioRef( ref=ref, - #csl=None, access=[], release=None, )) @@ -325,6 +307,9 @@ def enrich_outbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, h def run_ref_query(args) -> None: + """ + CLI helper/debug tool (prints to stdout) + """ release_ident = None work_ident = None if args.ident.startswith("release_"): diff --git a/python/fatcat_web/ref_routes.py b/python/fatcat_web/ref_routes.py index e08aaf15..e24b4ac6 100644 --- a/python/fatcat_web/ref_routes.py +++ b/python/fatcat_web/ref_routes.py @@ -11,14 +11,14 @@ from fatcat_openapi_client.rest import ApiException from fuzzycat.grobid_unstructured import grobid_api_process_citation, transform_grobid_ref_xml, grobid_ref_to_release from fuzzycat.simple import close_fuzzy_biblio_matches, close_fuzzy_release_matches -from fatcat_tools.references import enrich_inbound_refs_fatcat, enrich_outbound_refs_fatcat, get_inbound_refs, get_outbound_refs +from fatcat_tools.references import enrich_inbound_refs, enrich_outbound_refs, get_inbound_refs, get_outbound_refs from fatcat_tools.transforms.access import release_access_options from fatcat_web import app, api, auth_api from fatcat_web.forms import * from fatcat_web.entity_helpers import * -@app.route('/release//inbound-refs', methods=['GET']) +@app.route('/release//refs/in', methods=['GET']) def release_view_refs_inbound(ident): release = generic_get_entity("release", ident) 
@@ -27,11 +27,12 @@ def release_view_refs_inbound(ident): offset = max(0, int(offset)) if offset.isnumeric() else 0 hits = get_inbound_refs(release_ident=ident, es_client=app.es_client, offset=offset, limit=30) - enriched_refs = enrich_inbound_refs_fatcat(hits.result_refs, fatcat_api_client=api, expand="container,files,webcaptures") + enriched_refs = enrich_inbound_refs(hits.result_refs, fatcat_api_client=api, expand="container,files,webcaptures") - return render_template('release_view_fuzzy_refs.html', direction="inbound", entity=release, hits=hits, enriched_refs=enriched_refs), 200 + return render_template('release_view_fuzzy_refs.html', direction="in", entity=release, hits=hits, enriched_refs=enriched_refs), 200 -@app.route('/release//outbound-refs', methods=['GET']) + +@app.route('/release//refs/out', methods=['GET']) def release_view_refs_outbound(ident): release = generic_get_entity("release", ident) @@ -40,9 +41,10 @@ def release_view_refs_outbound(ident): offset = max(0, int(offset)) if offset.isnumeric() else 0 hits = get_outbound_refs(release_ident=ident, es_client=app.es_client, offset=offset, limit=30) - enriched_refs = enrich_outbound_refs_fatcat(hits.result_refs, fatcat_api_client=api, expand="container,files,webcaptures") + enriched_refs = enrich_outbound_refs(hits.result_refs, fatcat_api_client=api, expand="container,files,webcaptures") + + return render_template('release_view_fuzzy_refs.html', direction="out", entity=release, hits=hits, enriched_refs=enriched_refs), 200 - return render_template('release_view_fuzzy_refs.html', direction="outbound", entity=release, hits=hits, enriched_refs=enriched_refs), 200 @app.route('/reference/match', methods=['GET', 'POST']) def reference_match(): diff --git a/python/fatcat_web/templates/entity_base.html b/python/fatcat_web/templates/entity_base.html index c23dbef2..78a151a0 100644 --- a/python/fatcat_web/templates/entity_base.html +++ b/python/fatcat_web/templates/entity_base.html @@ -85,10 +85,9 @@ {{ 
entity_tab("coverage", "Coverage", "/coverage") }} {% elif entity_type == "release" and entity.state != 'deleted' %} {{ entity_tab("contribs", "Authors", "/contribs", entity._authors|count ) }} - {{ entity_tab("references", "References", "/references", entity.refs|count) }} {% if entity.state == 'active' %} - {{ entity_tab("inbound-refs", "Inbound", "/inbound-refs") }} - {{ entity_tab("outbound-refs", "Outbound", "/outbound-refs") }} + {{ entity_tab("refs-out", "References", "/refs/out") }} + {{ entity_tab("refs-in", "Cited By", "/refs/in") }} {% endif %} {% endif %} {{ entity_tab("metadata", "Metadata", "/metadata") }} diff --git a/python/fatcat_web/templates/release_view_fuzzy_refs.html b/python/fatcat_web/templates/release_view_fuzzy_refs.html index 7b286fd3..43860a31 100644 --- a/python/fatcat_web/templates/release_view_fuzzy_refs.html +++ b/python/fatcat_web/templates/release_view_fuzzy_refs.html @@ -1,5 +1,5 @@ {% set release = entity %} -{% set entity_view = "{{ direction }}-refs" %} +{% set entity_view = "refs-" + direction %} {% set entity_type = "release" %} {% import "entity_macros.html" as entity_macros %} {% extends "entity_base.html" %} @@ -17,10 +17,10 @@ {% block entity_main %} -{% if direction == "inbound" %} -

Referenced By

- Citations to this release by other works. -{% elif direction == "outbound" %} +{% if direction == "in" %} +

Cited By

+ References to this release by other works. +{% elif direction == "out" %}

References

NOTE: currently batch computed and may include additional references sources, or be missing recent changes, compared to entity reference list. {% endif %} @@ -36,7 +36,7 @@ {% set release = row.release %} {# TODO: ref_locator? #} - {% if direction == "outbound" %} + {% if direction == "out" %} {% if row.ref.ref_key %} [{{ row.ref.ref_key }}]
{% endif %} -- cgit v1.2.3 From 26a1763125a25e49903d667a048820213d90ed5b Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 23 Jul 2021 11:57:13 -0700 Subject: pylint: skip pydantic import check (dynamic/extensions) --- python/.pylintrc | 2 ++ python/fatcat_tools/references.py | 10 ++-------- 2 files changed, 4 insertions(+), 8 deletions(-) (limited to 'python/fatcat_tools/references.py') diff --git a/python/.pylintrc b/python/.pylintrc index d3003620..5fc310a8 100644 --- a/python/.pylintrc +++ b/python/.pylintrc @@ -14,3 +14,5 @@ notes=FIXME,XXX,DELETEME ignored-modules=responses # FileEntityForm' has no 'data' member (no-member) (etc) generated-members=data,errors +# No name 'BaseModel' in module 'pydantic' (no-name-in-module) (etc) +extension-pkg-allow-list=pydantic diff --git a/python/fatcat_tools/references.py b/python/fatcat_tools/references.py index a0079efd..da398239 100644 --- a/python/fatcat_tools/references.py +++ b/python/fatcat_tools/references.py @@ -192,9 +192,6 @@ def get_inbound_refs( es_index: str = "fatcat_ref", ) -> List[BiblioRef]: - if url and not url_surt: - url = surt_ify(url) - search = Search(using=es_client, index=es_index) if consolidate_works: @@ -243,9 +240,6 @@ def count_inbound_refs( Same parameters as get_inbound_refs(), but returns just a count """ - if url and not url_surt: - url = surt_ify(url) - search = Search(using=es_client, index=es_index) if release_ident: @@ -324,7 +318,7 @@ def run_ref_query(args) -> None: print(f"Total: {hits.count_total} Time: {hits.query_wall_time_ms}ms; {hits.query_time_ms}ms") if args.enrich == "fatcat": - enriched = enrich_outbound_refs_fatcat(hits.result_refs, hide='refs,abstracts', fatcat_api_client=args.fatcat_api_client) + enriched = enrich_outbound_refs(hits.result_refs, hide='refs,abstracts', fatcat_api_client=args.fatcat_api_client) for ref in enriched: if ref.release: print(f"{ref.ref.ref_index or 
'-'}\trelease_{ref.release.ident}\t{ref.ref.match_provenance}/{ref.ref.match_status}\t{ref.release.release_year or '-'}\t{ref.release.title}\t{ref.release.ext_ids.pmid or ref.release.ext_ids.doi or '-'}") @@ -341,7 +335,7 @@ def run_ref_query(args) -> None: print(f"Total: {hits.count_total} Time: {hits.query_wall_time_ms}ms; {hits.query_time_ms}ms") if args.enrich == "fatcat": - enriched = enrich_inbound_refs_fatcat(hits.result_refs, hide='refs,abstracts', fatcat_api_client=args.fatcat_api_client) + enriched = enrich_inbound_refs(hits.result_refs, hide='refs,abstracts', fatcat_api_client=args.fatcat_api_client) for ref in enriched: if ref.release: print(f"release_{ref.release.ident}\t{ref.ref.match_provenance}/{ref.ref.match_status}\t{ref.release.release_year or '-'}\t{ref.release.title}\t{ref.release.ext_ids.pmid or ref.release.ext_ids.doi or '-'}") -- cgit v1.2.3 From d01ab339a0aa568b6ccd6c56beb611a2e7ac9686 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 23 Jul 2021 11:58:41 -0700 Subject: remove unused imports (lint) --- python/fatcat_tools/references.py | 3 +-- python/fatcat_tools/transforms/access.py | 2 +- python/fatcat_web/ref_routes.py | 7 ++----- 3 files changed, 4 insertions(+), 8 deletions(-) (limited to 'python/fatcat_tools/references.py') diff --git a/python/fatcat_tools/references.py b/python/fatcat_tools/references.py index da398239..67a16602 100644 --- a/python/fatcat_tools/references.py +++ b/python/fatcat_tools/references.py @@ -6,14 +6,13 @@ See bulk citation and citation API proposals for design documentation. 
""" import sys -import json import datetime import argparse from typing import Optional, List, Any, Dict, Union from pydantic import BaseModel import elasticsearch -from elasticsearch_dsl import Search, Q +from elasticsearch_dsl import Search from fatcat_openapi_client import ReleaseEntity from fatcat_tools import public_api diff --git a/python/fatcat_tools/transforms/access.py b/python/fatcat_tools/transforms/access.py index 231cd2b3..add8ff3b 100644 --- a/python/fatcat_tools/transforms/access.py +++ b/python/fatcat_tools/transforms/access.py @@ -1,6 +1,6 @@ from enum import Enum -from typing import Optional, List, Any, Dict +from typing import Optional, List from pydantic import BaseModel from fatcat_openapi_client import ReleaseEntity diff --git a/python/fatcat_web/ref_routes.py b/python/fatcat_web/ref_routes.py index e24b4ac6..7c3ba5bd 100644 --- a/python/fatcat_web/ref_routes.py +++ b/python/fatcat_web/ref_routes.py @@ -3,17 +3,14 @@ Flask endpoints for reference (citation) endpoints. Eg, listing references "inbound" and "outbound" from a specific release or work. 
""" -from typing import Optional - -from flask import render_template, abort, redirect, request +from flask import render_template, request from fatcat_openapi_client import * -from fatcat_openapi_client.rest import ApiException from fuzzycat.grobid_unstructured import grobid_api_process_citation, transform_grobid_ref_xml, grobid_ref_to_release from fuzzycat.simple import close_fuzzy_biblio_matches, close_fuzzy_release_matches from fatcat_tools.references import enrich_inbound_refs, enrich_outbound_refs, get_inbound_refs, get_outbound_refs from fatcat_tools.transforms.access import release_access_options -from fatcat_web import app, api, auth_api +from fatcat_web import app, api from fatcat_web.forms import * from fatcat_web.entity_helpers import * -- cgit v1.2.3 From f58d4c2605bb028fd8844b25d345b524a5d47a87 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 23 Jul 2021 17:47:40 -0700 Subject: refs: small refactors/tweaks --- python/fatcat_tools/references.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'python/fatcat_tools/references.py') diff --git a/python/fatcat_tools/references.py b/python/fatcat_tools/references.py index 67a16602..73f57e18 100644 --- a/python/fatcat_tools/references.py +++ b/python/fatcat_tools/references.py @@ -46,7 +46,7 @@ class BiblioRef(BaseModel): target_release_ident: Optional[str] target_work_ident: Optional[str] target_openlibrary_work: Optional[str] - target_url_surt: Optional[str] + # TODO: target_url_surt: Optional[str] # would not be stored in elasticsearch, but would be auto-generated by all "get" methods from the SURT, so calling code does not need to do SURT transform target_url: Optional[str] @@ -69,10 +69,22 @@ class BiblioRef(BaseModel): """ if self.target_openlibrary_work and self.target_openlibrary_work.startswith("/works/"): self.target_openlibrary_work = self.target_openlibrary_work[7:] - if self.target_url_surt and not self.target_url: - # TODO: convert SURT to 
regular URL - pass - # TODO: if target_openlibrary_work, add an access option? + + # work-arounds for bad/weird ref_key + if self.ref_key: + self.ref_key = self.ref_key.strip() + if self.ref_key[0] in ['/', '_']: + self.ref_key = self.ref_key[1:] + if self.ref_key.startswith("10.") and 'SICI' in self.ref_key and '-' in self.ref_key: + self.ref_key = self.ref_key.split('-')[-1] + if self.ref_key.startswith("10.") and '_' in self.ref_key: + self.ref_key = self.ref_key.split('_')[-1] + if len(self.ref_key) > 10 and "#" in self.ref_key: + self.ref_key = self.ref_key.split('#')[-1] + if len(self.ref_key) > 10 and "_" in self.ref_key: + self.ref_key = self.ref_key.split('_')[-1] + if not self.ref_key and self.ref_index is not None: + self.ref_key = str(self.ref_index) return self @@ -181,7 +193,6 @@ def get_inbound_refs( release_ident: Optional[str] = None, work_ident: Optional[str] = None, openlibrary_work: Optional[str] = None, - url_surt: Optional[str] = None, url: Optional[str] = None, consolidate_works: bool = True, filter_stage: List[str] = [], @@ -207,8 +218,6 @@ def get_inbound_refs( search = search.filter("term", target_work_ident=work_ident) elif openlibrary_work: search = search.filter("term", target_openlibrary_work=openlibrary_work) - elif url_surt: - search = search.filter("term", target_url_surt=url_surt) else: raise ValueError("require a lookup key") @@ -230,7 +239,6 @@ def count_inbound_refs( release_ident: Optional[str] = None, work_ident: Optional[str] = None, openlibrary_work: Optional[str] = None, - url_surt: Optional[str] = None, url: Optional[str] = None, filter_stage: List[str] = [], es_index: str = "fatcat_ref", @@ -247,8 +255,6 @@ def count_inbound_refs( search = search.filter("term", target_work_ident=work_ident) elif openlibrary_work: search = search.filter("term", target_openlibrary_work=openlibrary_work) - elif url_surt: - search = search.filter("term", target_url_surt=url_surt) else: raise ValueError("require a lookup key") -- cgit v1.2.3 
From 05665c448e846e4834fa59dfe3cd0f11ac789ac9 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 23 Jul 2021 17:51:22 -0700 Subject: refs: generalize web endpoints; JSON content negotiation; openlibrary inbound view; etc --- python/fatcat_tools/references.py | 77 ++++++++++++----- python/fatcat_tools/transforms/access.py | 2 + python/fatcat_web/ref_routes.py | 99 +++++++++++++++++----- .../templates/openlibrary_view_fuzzy_refs.html | 29 +++++++ 4 files changed, 166 insertions(+), 41 deletions(-) create mode 100644 python/fatcat_web/templates/openlibrary_view_fuzzy_refs.html (limited to 'python/fatcat_tools/references.py') diff --git a/python/fatcat_tools/references.py b/python/fatcat_tools/references.py index 73f57e18..81b55f41 100644 --- a/python/fatcat_tools/references.py +++ b/python/fatcat_tools/references.py @@ -10,13 +10,14 @@ import datetime import argparse from typing import Optional, List, Any, Dict, Union -from pydantic import BaseModel +from pydantic import BaseModel, validator import elasticsearch from elasticsearch_dsl import Search from fatcat_openapi_client import ReleaseEntity from fatcat_tools import public_api from fatcat_tools.transforms.access import release_access_options, AccessOption +from fatcat_tools.transforms.entities import entity_to_dict class BiblioRef(BaseModel): @@ -97,8 +98,17 @@ class EnrichedBiblioRef(BaseModel): # TODO: openlibrary work? 
access: List[AccessOption] + @validator('release') + def check_release(cls, v): + if v is not None and not isinstance(v, ReleaseEntity): + raise ValueError("expected a ReleaseEntity") + return v + class Config: arbitrary_types_allowed = True + json_encoders = { + ReleaseEntity: entity_to_dict, + } class RefHits(BaseModel): @@ -110,6 +120,11 @@ class RefHits(BaseModel): query_wall_time_ms: int result_refs: List[Union[BiblioRef,EnrichedBiblioRef]] + class Config: + json_encoders = { + ReleaseEntity: entity_to_dict, + } + def _execute_ref_query(search: Any, limit: int, offset: Optional[int] = None) -> RefHits: """ @@ -268,40 +283,58 @@ def count_inbound_refs( def enrich_inbound_refs(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[EnrichedBiblioRef]: enriched = [] for ref in refs: + release = None + access = [] if ref.source_release_ident: release = fatcat_api_client.get_release(ref.source_release_ident, hide=hide, expand=expand) - enriched.append(EnrichedBiblioRef( - ref=ref, - #csl=None, - access=release_access_options(release), - release=release, - )) - else: - enriched.append(EnrichedBiblioRef( - ref=ref, - #csl=None, - access=[], - release=None, + access = release_access_options(release) + if ref.source_wikipedia_article: + wiki_lang = ref.source_wikipedia.split(':')[0] + wiki_article = ':'.join(ref.source_wikipedia.split(':')[1:]) + access.append(AccessOption( + access_type="wikipedia", + access_url=f"https://{wiki_lang}.wikipedia.org/wiki/{wiki_article}", + mimetype=None, + size_bytes=None, + thumbnail_url=None )) + enriched.append(EnrichedBiblioRef( + ref=ref, + access=access, + release=release, + )) return enriched def enrich_outbound_refs(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[EnrichedBiblioRef]: enriched = [] for ref in refs: + release = None + access 
= [] if ref.target_release_ident: release = fatcat_api_client.get_release(ref.target_release_ident, hide=hide, expand=expand) - enriched.append(EnrichedBiblioRef( - ref=ref, - access=release_access_options(release), - release=release, + access = release_access_options(release) + if ref.target_openlibrary_work: + access.append(AccessOption( + access_type="openlibrary", + access_url=f"https://openlibrary.org/works/{ref.target_openlibrary_work}", + mimetype=None, + size_bytes=None, + thumbnail_url=None )) - else: - enriched.append(EnrichedBiblioRef( - ref=ref, - access=[], - release=None, + if ref.target_url and '://web.archive.org/' in ref.target_url: + access.append(AccessOption( + access_type="wayback", + access_url=ref.target_url, + mimetype=None, + size_bytes=None, + thumbnail_url=None )) + enriched.append(EnrichedBiblioRef( + ref=ref, + access=access, + release=release, + )) return enriched diff --git a/python/fatcat_tools/transforms/access.py b/python/fatcat_tools/transforms/access.py index add8ff3b..5ed64c7c 100644 --- a/python/fatcat_tools/transforms/access.py +++ b/python/fatcat_tools/transforms/access.py @@ -13,6 +13,8 @@ class AccessType(str, Enum): ia_file = "ia_file" ia_microfilm = "ia_microfilm" repository = "repository" + openlibrary = "openlibrary" + wikipedia = "wikipedia" class AccessOption(BaseModel): diff --git a/python/fatcat_web/ref_routes.py b/python/fatcat_web/ref_routes.py index 7c3ba5bd..72f115cf 100644 --- a/python/fatcat_web/ref_routes.py +++ b/python/fatcat_web/ref_routes.py @@ -3,44 +3,82 @@ Flask endpoints for reference (citation) endpoints. Eg, listing references "inbound" and "outbound" from a specific release or work. 
""" -from flask import render_template, request +from flask import render_template, request, jsonify, Response from fatcat_openapi_client import * from fuzzycat.grobid_unstructured import grobid_api_process_citation, transform_grobid_ref_xml, grobid_ref_to_release from fuzzycat.simple import close_fuzzy_biblio_matches, close_fuzzy_release_matches -from fatcat_tools.references import enrich_inbound_refs, enrich_outbound_refs, get_inbound_refs, get_outbound_refs +from fatcat_tools.references import enrich_inbound_refs, enrich_outbound_refs, get_inbound_refs, get_outbound_refs, RefHits from fatcat_tools.transforms.access import release_access_options from fatcat_web import app, api +from fatcat_web.cors import crossdomain from fatcat_web.forms import * from fatcat_web.entity_helpers import * - -@app.route('/release//refs/in', methods=['GET']) -def release_view_refs_inbound(ident): - - release = generic_get_entity("release", ident) - +def _refs_web(direction, release_ident=None, work_ident=None, openlibrary_id=None) -> RefHits: offset = request.args.get('offset', '0') offset = max(0, int(offset)) if offset.isnumeric() else 0 + limit = request.args.get('offset', '30') + limit = min(max(0, int(limit)), 100) if limit.isnumeric() else 30 + if direction == "in": + hits = get_inbound_refs( + release_ident=release_ident, + work_ident=work_ident, + openlibrary_work=openlibrary_id, + es_client=app.es_client, + offset=offset, + limit=limit, + ) + hits.result_refs = enrich_inbound_refs( + hits.result_refs, + fatcat_api_client=api, + expand="container,files,webcaptures", + ) + elif direction == "out": + hits = get_outbound_refs( + release_ident=release_ident, + work_ident=work_ident, + es_client=app.es_client, + offset=offset, + limit=limit, + ) + hits.result_refs = enrich_outbound_refs( + hits.result_refs, + fatcat_api_client=api, + expand="container,files,webcaptures", + ) + else: + raise ValueError() + return hits + + +@app.route('/release//refs-in', methods=['GET']) +def 
release_view_refs_inbound(ident): + if request.accept_mimetypes.best == "application/json": + return release_view_refs_inbound_json(ident) - hits = get_inbound_refs(release_ident=ident, es_client=app.es_client, offset=offset, limit=30) - enriched_refs = enrich_inbound_refs(hits.result_refs, fatcat_api_client=api, expand="container,files,webcaptures") - - return render_template('release_view_fuzzy_refs.html', direction="in", entity=release, hits=hits, enriched_refs=enriched_refs), 200 + release = generic_get_entity("release", ident) + hits = _refs_web("in", release_ident=ident) + return render_template('release_view_fuzzy_refs.html', direction="in", entity=release, hits=hits), 200 -@app.route('/release//refs/out', methods=['GET']) +@app.route('/release//refs-out', methods=['GET']) def release_view_refs_outbound(ident): + if request.accept_mimetypes.best == "application/json": + return release_view_refs_outbound_json(ident) release = generic_get_entity("release", ident) + hits = _refs_web("out", release_ident=ident) + return render_template('release_view_fuzzy_refs.html', direction="out", entity=release, hits=hits), 200 - offset = request.args.get('offset', '0') - offset = max(0, int(offset)) if offset.isnumeric() else 0 - - hits = get_outbound_refs(release_ident=ident, es_client=app.es_client, offset=offset, limit=30) - enriched_refs = enrich_outbound_refs(hits.result_refs, fatcat_api_client=api, expand="container,files,webcaptures") +@app.route('/openlibrary/OLW/refs-in', methods=['GET']) +def openlibrary_view_refs_inbound(id_num): + if request.accept_mimetypes.best == "application/json": + return openlibrary_view_refs_inbound(id_num) - return render_template('release_view_fuzzy_refs.html', direction="out", entity=release, hits=hits, enriched_refs=enriched_refs), 200 + openlibrary_id = f"OL{id_num}W" + hits = _refs_web("in", openlibrary_id=openlibrary_id) + return render_template('openlibrary_view_fuzzy_refs.html', openlibrary_id=openlibrary_id, direction="in", 
hits=hits), 200 @app.route('/reference/match', methods=['GET', 'POST']) @@ -85,3 +123,26 @@ def reference_match(): return render_template('reference_match.html', form=form), 400 return render_template('reference_match.html', form=form), 200 + + +### Pseudo-APIs ############################################################# + +@app.route('/release//refs-out.json', methods=['GET', 'OPTIONS']) +@crossdomain(origin='*',headers=['access-control-allow-origin','Content-Type']) +def release_view_refs_outbound_json(ident): + hits = _refs_web("out", release_ident=ident) + return Response(hits.json(exclude_unset=True), mimetype="application/json") + + +@app.route('/release//refs-in.json', methods=['GET', 'OPTIONS']) +@crossdomain(origin='*',headers=['access-control-allow-origin','Content-Type']) +def release_view_refs_inbound_json(ident): + hits = _refs_web("in", release_ident=ident) + return Response(hits.json(exclude_unset=True), mimetype="application/json") + +@app.route('/openlibrary/OLW/refs-in', methods=['GET', 'OPTIONS']) +@crossdomain(origin='*',headers=['access-control-allow-origin','Content-Type']) +def openlibrary_view_refs_inbound_json(ident): + openlibrary_id = f"OL{id_num}W" + hits = _refs_web("in", openlibrary_id=openlibrary_id) + return Response(hits.json(exclude_unset=True), mimetype="application/json") diff --git a/python/fatcat_web/templates/openlibrary_view_fuzzy_refs.html b/python/fatcat_web/templates/openlibrary_view_fuzzy_refs.html new file mode 100644 index 00000000..161a7b50 --- /dev/null +++ b/python/fatcat_web/templates/openlibrary_view_fuzzy_refs.html @@ -0,0 +1,29 @@ +{% extends "base.html" %} +{% import "refs_macros.html" as refs_macros %} + +{% block title %}Open Library Refs{% endblock %} + +{% block fullbody %} +

+ {% if hits.result_refs and hits.result_refs[0].ref.target_unstructured %} + {{ hits.result_refs[0].ref.target_unstructured }} + {% endif %} + https://openlibrary.org/works/{{ openlibrary_id }} +

+ +{% if direction == "in" %} +

Cited By

+

This page lists references to this book from other works (eg, journal articles). +{% elif direction == "out" %} +

References

+ References from this book to other entities. +{% endif %} + +{% if hits.result_refs %} + {{ refs_macros.refs_table(hits, direction) }} +{% else %} +

None found +{% endif %} + +{% endblock %} + -- cgit v1.2.3 From cb3d10a06c0166c4eb8ec2d48852e4bfc236ef27 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 27 Jul 2021 17:01:29 -0700 Subject: refs: support for wikipedia outbound refs, and display in tables --- python/fatcat_tools/references.py | 4 ++-- python/fatcat_web/ref_routes.py | 26 ++++++++++++++++++--- python/fatcat_web/templates/refs_macros.html | 20 +++++++++++++--- .../templates/wikipedia_view_fuzzy_refs.html | 27 ++++++++++++++++++++++ 4 files changed, 69 insertions(+), 8 deletions(-) create mode 100644 python/fatcat_web/templates/wikipedia_view_fuzzy_refs.html (limited to 'python/fatcat_tools/references.py') diff --git a/python/fatcat_tools/references.py b/python/fatcat_tools/references.py index 81b55f41..508cf19d 100644 --- a/python/fatcat_tools/references.py +++ b/python/fatcat_tools/references.py @@ -289,8 +289,8 @@ def enrich_inbound_refs(refs: List[BiblioRef], fatcat_api_client: Any, hide: Opt release = fatcat_api_client.get_release(ref.source_release_ident, hide=hide, expand=expand) access = release_access_options(release) if ref.source_wikipedia_article: - wiki_lang = ref.source_wikipedia.split(':')[0] - wiki_article = ':'.join(ref.source_wikipedia.split(':')[1:]) + wiki_lang = ref.source_wikipedia_article.split(':')[0] + wiki_article = ':'.join(ref.source_wikipedia_article.split(':')[1:]).replace(' ', '_') access.append(AccessOption( access_type="wikipedia", access_url=f"https://{wiki_lang}.wikipedia.org/wiki/{wiki_article}", diff --git a/python/fatcat_web/ref_routes.py b/python/fatcat_web/ref_routes.py index 6f887c4d..88ac0744 100644 --- a/python/fatcat_web/ref_routes.py +++ b/python/fatcat_web/ref_routes.py @@ -15,7 +15,7 @@ from fatcat_web.cors import crossdomain from fatcat_web.forms import * from fatcat_web.entity_helpers import * -def _refs_web(direction, release_ident=None, work_ident=None, openlibrary_id=None) -> RefHits: +def _refs_web(direction, release_ident=None, 
work_ident=None, openlibrary_id=None, wikipedia_article=None) -> RefHits: offset = request.args.get('offset', '0') offset = max(0, int(offset)) if offset.isnumeric() else 0 limit = request.args.get('limit', '30') @@ -37,6 +37,7 @@ def _refs_web(direction, release_ident=None, work_ident=None, openlibrary_id=Non elif direction == "out": hits = get_outbound_refs( release_ident=release_ident, + wikipedia_article=wikipedia_article, work_ident=work_ident, es_client=app.es_client, offset=offset, @@ -74,12 +75,23 @@ def release_view_refs_outbound(ident): @app.route('/openlibrary/OLW/refs-in', methods=['GET']) def openlibrary_view_refs_inbound(id_num): if request.accept_mimetypes.best == "application/json": - return openlibrary_view_refs_inbound(id_num) + return openlibrary_view_refs_inbound_json(id_num) openlibrary_id = f"OL{id_num}W" hits = _refs_web("in", openlibrary_id=openlibrary_id) return render_template('openlibrary_view_fuzzy_refs.html', openlibrary_id=openlibrary_id, direction="in", hits=hits), 200 +@app.route('/wikipedia/:/refs-out', methods=['GET']) +def wikipedia_view_refs_outbound(wiki_lang: str, wiki_article: str): + if request.accept_mimetypes.best == "application/json": + return wikipedia_view_refs_outbound_json(wiki_lang, wiki_article) + + wiki_url = f"https://{wiki_lang}.wikipedia.org/wiki/{wiki_article}" + wiki_article = wiki_article.replace('_', ' ') + wikipedia_article = wiki_lang + ":" + wiki_article + hits = _refs_web("out", wikipedia_article=wikipedia_article) + return render_template('wikipedia_view_fuzzy_refs.html', wiki_article=wiki_article, wiki_lang=wiki_lang, wiki_url=wiki_url, direction="out", hits=hits), 200 + @app.route('/reference/match', methods=['GET', 'POST']) def reference_match(): @@ -140,9 +152,17 @@ def release_view_refs_inbound_json(ident): hits = _refs_web("in", release_ident=ident) return Response(hits.json(exclude_unset=True), mimetype="application/json") -@app.route('/openlibrary/OLW/refs-in', methods=['GET', 'OPTIONS']) 
+@app.route('/openlibrary/OLW/refs-in.json', methods=['GET', 'OPTIONS']) @crossdomain(origin='*',headers=['access-control-allow-origin','Content-Type']) def openlibrary_view_refs_inbound_json(ident): openlibrary_id = f"OL{id_num}W" hits = _refs_web("in", openlibrary_id=openlibrary_id) return Response(hits.json(exclude_unset=True), mimetype="application/json") + +@app.route('/wikipedia/:/refs-out.json', methods=['GET', 'OPTIONS']) +@crossdomain(origin='*',headers=['access-control-allow-origin','Content-Type']) +def wikipedia_view_refs_outbound_json(wiki_lang: str, wiki_article: str): + wiki_article = wiki_article.replace('_', ' ') + wikipedia_article = wiki_lang + ":" + wiki_article + hits = _refs_web("out", wikipedia_article=wikipedia_article) + return Response(hits.json(exclude_unset=True), mimetype="application/json") diff --git a/python/fatcat_web/templates/refs_macros.html b/python/fatcat_web/templates/refs_macros.html index 405aca73..ba4d18ad 100644 --- a/python/fatcat_web/templates/refs_macros.html +++ b/python/fatcat_web/templates/refs_macros.html @@ -35,12 +35,26 @@ {% if release %} {{ entity_macros.release_summary(release) }} - {% elif row.ref.target_unstructured %} + {% elif direction == "in" and row.ref.source_wikipedia_article %} + {% set wiki_lang = row.ref.source_wikipedia_article.split(':')[0] %} + {% set wiki_article = ':'.join(row.ref.source_wikipedia_article.split(':')[1:]) %} + + + {{ wiki_article }} + + [wikipedia] + +
+ lang:{{ wiki_lang }}  + [references]  + {% elif direction == "out" and row.ref.target_unstructured %} {{ row.ref.target_unstructured }} {% if row.ref.target_openlibrary_work %} -
openlibrary:{{ row.ref.target_openlibrary_work }}  +
+ openlibrary:{{ row.ref.target_openlibrary_work }}  + [cited-by]  {% endif %} - {% elif row.ref.target_csl %} + {% elif direction == "in" and row.ref.target_csl %} {{ entity_macros.csl_summary(row.ref.target_csl) }} {% else %} blank diff --git a/python/fatcat_web/templates/wikipedia_view_fuzzy_refs.html b/python/fatcat_web/templates/wikipedia_view_fuzzy_refs.html new file mode 100644 index 00000000..5b53d692 --- /dev/null +++ b/python/fatcat_web/templates/wikipedia_view_fuzzy_refs.html @@ -0,0 +1,27 @@ +{% extends "base.html" %} +{% import "refs_macros.html" as refs_macros %} + +{% block title %}Wikipedia Refs{% endblock %} + +{% block fullbody %} +

+ [{{ wiki_lang }}] {{ wiki_article }} + {{ wiki_url }} +

+ +{% if direction == "in" %} +

Cited By

+

This page lists references to a wikipedia article, from other works (eg, journal articles). +{% elif direction == "out" %} +

References

+ References from wikipedia article to other entities. +{% endif %} + +{% if hits.result_refs %} + {{ refs_macros.refs_table(hits, direction) }} +{% else %} +

None found +{% endif %} + +{% endblock %} + -- cgit v1.2.3 From f3481c02bd7a50d9073902dba07fe265eecb93db Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 27 Jul 2021 18:57:09 -0700 Subject: refs: lint fixes --- python/fatcat_tools/references.py | 1 + python/fatcat_web/ref_routes.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'python/fatcat_tools/references.py') diff --git a/python/fatcat_tools/references.py b/python/fatcat_tools/references.py index 508cf19d..496a46e1 100644 --- a/python/fatcat_tools/references.py +++ b/python/fatcat_tools/references.py @@ -99,6 +99,7 @@ class EnrichedBiblioRef(BaseModel): access: List[AccessOption] @validator('release') + @classmethod def check_release(cls, v): if v is not None and not isinstance(v, ReleaseEntity): raise ValueError("expected a ReleaseEntity") diff --git a/python/fatcat_web/ref_routes.py b/python/fatcat_web/ref_routes.py index 88ac0744..d4219012 100644 --- a/python/fatcat_web/ref_routes.py +++ b/python/fatcat_web/ref_routes.py @@ -3,7 +3,7 @@ Flask endpoints for reference (citation) endpoints. Eg, listing references "inbound" and "outbound" from a specific release or work. 
""" -from flask import render_template, request, jsonify, Response +from flask import render_template, request, Response from fatcat_openapi_client import * from fuzzycat.grobid_unstructured import grobid_api_process_citation, transform_grobid_ref_xml, grobid_ref_to_release from fuzzycat.simple import close_fuzzy_biblio_matches, close_fuzzy_release_matches @@ -154,7 +154,7 @@ def release_view_refs_inbound_json(ident): @app.route('/openlibrary/OLW/refs-in.json', methods=['GET', 'OPTIONS']) @crossdomain(origin='*',headers=['access-control-allow-origin','Content-Type']) -def openlibrary_view_refs_inbound_json(ident): +def openlibrary_view_refs_inbound_json(id_num): openlibrary_id = f"OL{id_num}W" hits = _refs_web("in", openlibrary_id=openlibrary_id) return Response(hits.json(exclude_unset=True), mimetype="application/json") -- cgit v1.2.3