From a5a8811a605080f2cd9eb575c33a17f045c43674 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 15 Apr 2021 17:01:21 -0700 Subject: initial inbound/outbound reference query helpers --- python/fatcat_tools/references.py | 450 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 450 insertions(+) create mode 100644 python/fatcat_tools/references.py diff --git a/python/fatcat_tools/references.py b/python/fatcat_tools/references.py new file mode 100644 index 00000000..c9730174 --- /dev/null +++ b/python/fatcat_tools/references.py @@ -0,0 +1,450 @@ +""" +Helper routines for working with the fatcat citation graph, which is a separate +index of reference links between works in the main catalog. + +See bulk citation and citation API proposals for design documentation. + +TODO: + + surt_ify() helper (URL to SURT for queries) + CSL enrichment method (using only elasticsearch mget) + CSL enrichment for fatcat enrichment + access transform + microfilm access in access transform + + all_outbound_refs(...) -> List[BiblioRef] + all_inbound_refs(...) -> List[BiblioRef] + same as get_outbound_refs()/get_inbound_refs(), but does a scroll (return list or iterator?) 
+ (optional; maybe not public) +""" + +import sys +import json +import datetime +import argparse +from enum import Enum +from typing import Optional, List, Any, Dict + +from pydantic import BaseModel +import elasticsearch +from elasticsearch_dsl import Search, Q +from fatcat_openapi_client import ReleaseEntity + +from fatcat_tools import public_api + + + +class BiblioRef(BaseModel): + """bibliographic reference""" + # ("release", source_release_ident, ref_index) + # ("wikipedia", source_wikipedia_article, ref_index) + _key: Optional[str] + update_ts: Optional[datetime.datetime] + + # metadata about source of reference + source_release_ident: Optional[str] + source_work_ident: Optional[str] + # with lang prefix like "en:Superglue" + source_wikipedia_article: Optional[str] + # skipped: source_openlibrary_work + # skipped: source_url_surt + source_release_stage: Optional[str] + source_year: Optional[int] + + # context of the reference itself + # 1-indexed, not 0-indexed + ref_index: Optional[int] # TODO: actually optional? + # eg, "Lee86", "BIB23" + ref_key: Optional[str] + # eg, page number + ref_locator: Optional[str] + + # target of reference (identifiers) + target_release_ident: Optional[str] + target_work_ident: Optional[str] + target_openlibrary_work: Optional[str] + target_url_surt: Optional[str] + # would not be stored in elasticsearch, but would be auto-generated by all "get" methods from the SURT, so calling code does not need to do SURT transform + target_url: Optional[str] + # skipped: target_wikipedia_article + + # crossref, pubmed, grobid, etc + match_provenance: str + # strong, weak, etc + match_status: Optional[str] + # TODO: "match_strength"? + # "doi", "isbn", "fuzzy title, author", etc + # maybe "fuzzy-title-author"? 
+ match_reason: Optional[str] + + # only if no release_ident link/match + target_unstructured: Optional[str] + target_csl: Optional[Dict[str, Any]] + +class AccessType(str, Enum): + """describes type of access URL""" + + wayback = "wayback" + ia_file = "ia_file" + ia_microfilm = "ia_microfilm" + repository = "repository" + +class AccessOption(BaseModel): + + access_type: AccessType + + # note: for `target_url` refs, would do a CDX lookup and this URL would be + # a valid/HTTP-200 web.archive.org capture URL + access_url: str + + # application/pdf, text/html, etc + # blank for landing pages + mimetype: Optional[str] + + size_bytes: Optional[int] + thumbnail_url: Optional[str] + +class CslBiblioRef(BaseModel): + # an "enriched" version of BiblioRef with metadata about the source or + # target entity. would be "hydrated" via a lookup to, eg, the + # `fatcat_release` elasticsearch index (fast mget fetch with a single + # request), as opposed to fatcat API fetches + ref: BiblioRef + csl: Optional[Dict[str, Any]] + access: List[AccessOption] + + class Config: + arbitrary_types_allowed = True + +class FatcatBiblioRef(BaseModel): + # enriched version of BiblioRef with complete ReleaseEntity object as + # fetched from the fatcat API. CSL-JSON metadata would be derived from + # the full release entity. 
+ ref: BiblioRef + release: Optional[ReleaseEntity] + csl: Optional[Dict[str, Any]] + access: List[AccessOption] + + class Config: + arbitrary_types_allowed = True + +class RefHits(BaseModel): + count_returned: int + count_total: int + offset: int + limit: int + query_time_ms: int + query_wall_time_ms: int + result_refs: List[BiblioRef] + +def _execute_ref_query(search: Any, limit: int, offset: Optional[int] = None) -> List[BiblioRef]: + + limit = min((int(limit or 15), 200)) + if not offset or offset < 0: + offset = 0 + + search = search.params(track_total_hits=True) + search = search[offset : (offset + limit)] + + query_start = datetime.datetime.now() + try: + resp = search.execute() + except elasticsearch.exceptions.RequestError as e_raw: + # this is a "user" error + e: Any = e_raw + #logging.warn("elasticsearch 400: " + str(e.info)) + if e.info.get("error", {}).get("root_cause", {}): + raise ValueError(str(e.info["error"]["root_cause"][0].get("reason"))) from e + else: + raise ValueError(str(e.info)) from e + except elasticsearch.exceptions.TransportError as e: + # all other errors + #logging.warn(f"elasticsearch non-200 status code: {e.info}") + raise IOError(str(e.info)) from e + query_delta = datetime.datetime.now() - query_start + + result_refs = [] + for h in resp.hits: + # might be a list because of consolidation + if isinstance(h._d_.get('source_work_ident'), list): + h._d_['source_work_ident'] = h._d_['source_work_ident'][0] + result_refs.append(BiblioRef.parse_obj(h._d_)) + + return RefHits( + count_returned=len(result_refs), + # ES 7.x style "total" + count_total=resp.hits.total.value, + offset=offset, + limit=limit, + query_time_ms=int(resp.took), + query_wall_time_ms=int(query_delta.total_seconds() * 1000), + result_refs=result_refs, + ) + + +def get_outbound_refs( + es_client: Any, + release_ident: Optional[str] = None, + work_ident: Optional[str] = None, + wikipedia_article: Optional[str] = None, + limit: int = 100, + offset: Optional[int] = None, 
+ es_index: str = "fatcat_ref", +) -> List[BiblioRef]: + + search = Search(using=es_client, index=es_index) + + if release_ident: + search = search.filter("term", source_release_ident=release_ident) + elif work_ident: + search = search.filter("term", source_work_ident=work_ident) + elif wikipedia_article: + search = search.filter("term", source_wikipedia_article=wikipedia_article) + else: + raise ValueError("require a lookup key") + + # TODO: schema doesn't support either of these currently + #search = search.sort("ref_index") + #search = search.sort("ref_key") + + # re-sort by index + hits = _execute_ref_query(search, limit=limit, offset=offset) + hits.result_refs = sorted(hits.result_refs, key=lambda r: r.ref_index or 0) + return hits + +def get_inbound_refs( + es_client: Any, + release_ident: Optional[str] = None, + work_ident: Optional[str] = None, + openlibrary_work: Optional[str] = None, + url_surt: Optional[str] = None, + url: Optional[str] = None, + consolidate_works: bool = True, + filter_stage: List[str] = [], + filter_type: List[str] = [], + limit: int = 25, + offset: Optional[int] = None, + es_index: str = "fatcat_ref", +) -> List[BiblioRef]: + # TODO: filter_stage, filter_type + + if url and not url_surt: + url = surt_ify(url) + + search = Search(using=es_client, index=es_index) + + if consolidate_works: + search = search.extra( + collapse={ + "field": "source_work_ident", + "inner_hits": {"name": "source_more", "size": 0,}, + } + ) + + if release_ident: + search = search.filter("term", target_release_ident=release_ident) + elif work_ident: + search = search.filter("term", target_work_ident=work_ident) + elif openlibrary_work: + search = search.filter("term", target_openlibrary_work=openlibrary_work) + elif url_surt: + search = search.filter("term", target_url_surt=url_surt) + else: + raise ValueError("require a lookup key") + + # TODO: wrong type, not int? and maybe need to index differently? 
+ #search = search.sort("source_year") + + return _execute_ref_query(search, limit=limit, offset=offset) + +def count_inbound_refs( + es_client: Any, + release_ident: Optional[str] = None, + work_ident: Optional[str] = None, + openlibrary_work: Optional[str] = None, + url_surt: Optional[str] = None, + url: Optional[str] = None, + filter_stage: List[str] = [], + filter_type: List[str] = [], + es_index: str = "fatcat_ref", +) -> int: + """ + Same parameters as get_inbound_refs(), but returns just a count + """ + + if url and not url_surt: + url = surt_ify(url) + + search = Search(using=es_client, index=es_index) + + if release_ident: + search = search.filter("term", target_release_ident=release_ident) + elif work_ident: + search = search.filter("term", target_work_ident=work_ident) + elif openlibrary_work: + search = search.filter("term", target_openlibrary_work=openlibrary_work) + elif url_surt: + search = search.filter("term", target_url_surt=url_surt) + else: + raise ValueError("require a lookup key") + + return search.count() + +def _release_access(release: ReleaseEntity) -> List[AccessOption]: + """ + Extracts access options from a release. + """ + options = [] + for f in (release.files or []): + for u in (f.urls or []): + if '://web.archive.org/' in u.url: + return [AccessOption( + access_type="wayback", + access_url=u.url, + mimetype=f.mimetype, + size_bytes=f.size, + thumbnail_url=None + )] + elif '://archive.org/' in u.url: + return [AccessOption( + access_type="ia_file", + access_url=u.url, + mimetype=f.mimetype, + size_bytes=f.size, + thumbnail_url=None + )] + return options + +# run elasticsearch mget query for all ref idents and include "enriched" refs when possible +# for outbound URL refs, would do wayback CDX fetches to find a direct wayback URL +# TODO: for openlibrary, would this query openlibrary.org API? or some fatcat-specific index? 
+#enrich_inbound_refs(refs: List[BiblioRef]) -> List[CslBiblioRef] +#enrich_outbound_refs(refs: List[BiblioRef]) -> List[CslBiblioRef] + +# run fatcat API fetches for each ref and return "enriched" refs +def enrich_inbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[FatcatBiblioRef]: + enriched = [] + for ref in refs: + if ref.source_release_ident: + release = fatcat_api_client.get_release(ref.source_release_ident, hide=hide, expand=expand) + enriched.append(FatcatBiblioRef( + ref=ref, + csl=None, + access=_release_access(release), + release=release, + )) + else: + enriched.append(FatcatBiblioRef( + ref=ref, + csl=None, + access=[], + release=None, + )) + return enriched + +def enrich_outbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[FatcatBiblioRef]: + enriched = [] + for ref in refs: + if ref.target_release_ident: + release = fatcat_api_client.get_release(ref.target_release_ident, hide=hide, expand=expand) + enriched.append(FatcatBiblioRef( + ref=ref, + csl=None, + access=_release_access(release), + release=release, + )) + else: + enriched.append(FatcatBiblioRef( + ref=ref, + csl=None, + access=[], + release=None, + )) + return enriched + + +def run_ref_query(args) -> None: + release_ident = None + work_ident = None + if args.ident.startswith("release_"): + release_ident = args.ident.split('_')[1] + elif args.ident.startswith("work_"): + work_ident = args.ident.split('_')[1] + else: + release_ident = args.ident + + print("## Outbound References") + hits = get_outbound_refs(release_ident=release_ident, work_ident=work_ident, es_client=args.es_client) + print(f"Total: {hits.count_total} Time: {hits.query_wall_time_ms}ms; {hits.query_time_ms}ms") + + if args.enrich == "fatcat": + enriched = 
enrich_outbound_refs_fatcat(hits.result_refs, hide='refs,abstracts', fatcat_api_client=args.fatcat_api_client) + for ref in enriched: + if ref.release: + print(f"{ref.ref.ref_index or '-'}\trelease_{ref.release.ident}\t{ref.ref.match_provenance}/{ref.ref.match_status}\t{ref.release.release_year or '-'}\t{ref.release.title}\t{ref.release.ext_ids.pmid or ref.release.ext_ids.doi or '-'}") + else: + print(f"{ref.ref.ref_index or '-'}\trelease_{ref.target_release_ident}") + else: + for ref in hits.result_refs: + print(f"{ref.ref.ref_index or '-'}\trelease_{ref.target_release_ident}") + + print() + print("## Inbound References") + hits = get_inbound_refs(release_ident=release_ident, work_ident=work_ident, es_client=args.es_client) + + print(f"Total: {hits.count_total} Time: {hits.query_wall_time_ms}ms; {hits.query_time_ms}ms") + + if args.enrich == "fatcat": + enriched = enrich_inbound_refs_fatcat(hits.result_refs, hide='refs,abstracts', fatcat_api_client=args.fatcat_api_client) + for ref in enriched: + if ref.release: + print(f"release_{ref.release.ident}\t{ref.ref.match_provenance}/{ref.ref.match_status}\t{ref.release.release_year or '-'}\t{ref.release.title}\t{ref.release.ext_ids.pmid or ref.release.ext_ids.doi or '-'}") + else: + print(f"release_{ref.target_release_ident}") + else: + for ref in hits.result_refs: + print(f"work_{ref.source_work_ident}\trelease_{ref.source_release_ident}") + +def main() -> None: + """ + Run this utility like: + + python -m fatcat_tools.references + + Examples: + + python -m fatcat_tools.references query release_pfrind3kh5hqhgqkueulk2tply + """ + + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + subparsers = parser.add_subparsers() + + parser.add_argument("--fatcat-api-base", default="https://api.fatcat.wiki/v0") + parser.add_argument("--elasticsearch-base", default="https://search.fatcat.wiki") + parser.add_argument("--elasticsearch-ref-index", default="fatcat_ref") + + sub = 
subparsers.add_parser( + "query", + help="takes a fatcat ident argument, prints both inbound and outbound references", + ) + sub.set_defaults(func="run_ref_query") + sub.add_argument("ident", type=str) + sub.add_argument("--enrich", type=str) + + args = parser.parse_args() + if not args.__dict__.get("func"): + parser.print_help(file=sys.stderr) + sys.exit(-1) + + args.es_client = elasticsearch.Elasticsearch(args.elasticsearch_base) + args.fatcat_api_client = public_api(args.fatcat_api_base) + + if args.func == "run_ref_query": + run_ref_query(args) + else: + raise NotImplementedError(args.func) + +if __name__ == "__main__": + main() -- cgit v1.2.3 From d5b24df069fc96d396afbb302633a077e5dbfb39 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 15 Apr 2021 18:34:11 -0700 Subject: first iteration of basic citation inbound/outbound views --- python/fatcat_web/__init__.py | 2 +- python/fatcat_web/ref_routes.py | 50 ++++++++++++ .../templates/release_view_fuzzy_refs.html | 95 ++++++++++++++++++++++ 3 files changed, 146 insertions(+), 1 deletion(-) create mode 100644 python/fatcat_web/ref_routes.py create mode 100644 python/fatcat_web/templates/release_view_fuzzy_refs.html diff --git a/python/fatcat_web/__init__.py b/python/fatcat_web/__init__.py index 07b4e083..3207bc75 100644 --- a/python/fatcat_web/__init__.py +++ b/python/fatcat_web/__init__.py @@ -76,7 +76,7 @@ app.register_blueprint(mwoauth.bp, url_prefix='/auth/wikipedia') app.es_client = elasticsearch.Elasticsearch(Config.ELASTICSEARCH_BACKEND) -from fatcat_web import routes, editing_routes, auth, cors, forms +from fatcat_web import routes, editing_routes, ref_routes, auth, cors, forms # TODO: blocking on ORCID support in loginpass if Config.ORCID_CLIENT_ID: diff --git a/python/fatcat_web/ref_routes.py b/python/fatcat_web/ref_routes.py new file mode 100644 index 00000000..a49813c4 --- /dev/null +++ b/python/fatcat_web/ref_routes.py @@ -0,0 +1,50 @@ +""" +Flask endpoints for reference (citation) endpoints. 
Eg, listing references +"inbound" and "outbound" from a specific release or work. +""" + +from typing import Optional + +from flask import render_template, abort, redirect, request +from fatcat_openapi_client import * +from fatcat_openapi_client.rest import ApiException + +from fatcat_tools.references import enrich_inbound_refs_fatcat, enrich_outbound_refs_fatcat, get_inbound_refs, get_outbound_refs +from fatcat_web import app, api, auth_api +from fatcat_web.forms import * +from fatcat_web.entity_helpers import * + + +@app.route('/release//refs/inbound', methods=['GET']) +def release_view_refs_inbound(ident): + + # lookup release ident, ensure it exists + try: + release = api.get_release(ident) + except ApiException as ae: + abort(ae.status) + + offset = request.args.get('offset', '0') + offset = max(0, int(offset)) if offset.isnumeric() else 0 + + hits = get_inbound_refs(release_ident=ident, es_client=app.es_client, offset=offset, limit=30) + enriched_refs = enrich_inbound_refs_fatcat(hits.result_refs, fatcat_api_client=api, expand="container,files,webcaptures") + + return render_template('release_view_fuzzy_refs.html', direction="inbound", entity=release, hits=hits, enriched_refs=enriched_refs), 200 + +@app.route('/release//refs/outbound', methods=['GET']) +def release_view_refs_outbound(ident): + + # lookup release ident, ensure it exists + try: + release = api.get_release(ident) + except ApiException as ae: + abort(ae.status) + + offset = request.args.get('offset', '0') + offset = max(0, int(offset)) if offset.isnumeric() else 0 + + hits = get_outbound_refs(release_ident=ident, es_client=app.es_client, offset=offset, limit=30) + enriched_refs = enrich_outbound_refs_fatcat(hits.result_refs, fatcat_api_client=api, expand="container,files,webcaptures") + + return render_template('release_view_fuzzy_refs.html', direction="outbound", entity=release, hits=hits, enriched_refs=enriched_refs), 200 diff --git a/python/fatcat_web/templates/release_view_fuzzy_refs.html 
b/python/fatcat_web/templates/release_view_fuzzy_refs.html new file mode 100644 index 00000000..bc1fa171 --- /dev/null +++ b/python/fatcat_web/templates/release_view_fuzzy_refs.html @@ -0,0 +1,95 @@ +{% set release = entity %} +{% set entity_view = "references" %} +{% set entity_type = "release" %} +{% import "entity_macros.html" as entity_macros %} +{% extends "entity_base.html" %} + +{% block entity_main %} + +{% if direction == "inbound" %} +

Inbound Matched References

+ Other releases citing this one +{% elif direction == "outbound" %} +

Outbound Matched References

+ This release citing other releases +{% endif %} + +

Found {{ hits.count_total }} references in {{ hits.query_wall_time_ms }}ms. +{% if hits.count_total != hits.count_returned %} + Showing {{ hits.offset + 1 }} - {{ hits.offset + hits.count_returned }} + {% if hits.offset + hits.limit < hits.count_total %} +  next... + {% endif %} +{% endif %} + + + +{% for ref in enriched_refs %} + {% set release = ref.release %} + +
+ {% if direction == "outbound" %} + {% if ref.ref.ref_key %} + [{{ ref.ref.ref_key }}] + {% endif %} + {% endif %} +
{{ ref.ref.match_status }} +
{{ ref.ref.match_provenance }} +
+ {{ release.title }} + {% if release.release_type not in ["article-journal", "conference-paper"] %} + [{{ release.release_type or "unknown-type" }}] + {% endif %} +
+ {% for contrib in release.contribs[:5] %} + {% if contrib.creator %} + {{ contrib.creator.display_name }} + {% else %} + {{ contrib.raw_name }} + {% endif %} + {% if not loop.last %}, {% endif %} + {% endfor %} + {% if release.contribs | length > 5 %}(+ more) {%endif %} +
+ {% if release.release_year %}{{ release.release_year }}  {% endif %} + {% if release.container %} + {{ release.container.name }} + {% elif release.extra and release.extra.container_name %} + {{ release.extra.container_name }} + {% endif %} + {% if release.release_stage != "published" %} +  {{ release.release_stage or "unpublished" }} + {% endif %} + +
+ {% if release.version %} + version:{{ release.release_year }}  + {% endif %} + {% if release.number %} + number:{{ release.number }}  + {% endif %} + {% if release.ext_ids.doi %} + doi:{{ release.ext_ids.doi }}  + {% endif %} + {# TODO: links #} + {% if release.ext_ids.arxiv %} + arXiv:{{ release.ext_ids.arxiv }}  + {% endif %} + {% if release.ext_ids.pmcid %} + pmcid:{{ release.ext_ids.pmcid }}  + {% endif %} + {% if release.ext_ids.pmid %} + pmid:{{ release.ext_ids.pmid }}  + {% endif %} + {% if release.ext_ids.dblp %} + dblp:{{ release.ext_ids.dblp }}  + {% endif %} +
+ {% if ref.access %} + {{ ref.access[0].access_type.name }} + {% endif %} +{% endfor %} +
+ +{% endblock %} + -- cgit v1.2.3 From 7186a379f335dd2731d5db79ab85abf3506cee88 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 15 Apr 2021 23:25:00 -0700 Subject: web: template macro to display release entry summary --- python/fatcat_web/templates/entity_macros.html | 52 ++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/python/fatcat_web/templates/entity_macros.html b/python/fatcat_web/templates/entity_macros.html index 50f45753..94770afb 100644 --- a/python/fatcat_web/templates/entity_macros.html +++ b/python/fatcat_web/templates/entity_macros.html @@ -387,3 +387,55 @@ yellow {%- endmacro %} + +{# this is useful for things like showing lists of releases in tables #} +{% macro release_summary(release) %} +{{ release.title }} + {% if release.release_type not in ["article-journal", "conference-paper"] %} + [{{ release.release_type or "unknown-type" }}] + {% endif %} +
+ {% for contrib in release.contribs[:5] %} + {% if contrib.creator %} + {{ contrib.creator.display_name }} + {% else %} + {{ contrib.raw_name }} + {% endif %} + {% if not loop.last %}, {% endif %} + {% endfor %} + {% if release.contribs | length > 5 %}(+ more) {%endif %} +
+ {% if release.release_year %}{{ release.release_year }}  {% endif %} + {% if release.container %} + {{ release.container.name }} + {% elif release.extra and release.extra.container_name %} + {{ release.extra.container_name }} + {% endif %} + {% if release.release_stage != "published" %} +  {{ release.release_stage or "unpublished" }} + {% endif %} + +
+ {% if release.version %} + version:{{ release.release_year }}  + {% endif %} + {% if release.number %} + number:{{ release.number }}  + {% endif %} + {% if release.ext_ids.doi %} + doi:{{ release.ext_ids.doi }}  + {% endif %} + {# TODO: links #} + {% if release.ext_ids.arxiv %} + arXiv:{{ release.ext_ids.arxiv }}  + {% endif %} + {% if release.ext_ids.pmcid %} + pmcid:{{ release.ext_ids.pmcid }}  + {% endif %} + {% if release.ext_ids.pmid %} + pmid:{{ release.ext_ids.pmid }}  + {% endif %} + {% if release.ext_ids.dblp %} + dblp:{{ release.ext_ids.dblp }}  + {% endif %} +{% endmacro %} -- cgit v1.2.3 From bb085c92760d6ccbd6c92e13fcae0af02b5a3d17 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 15 Apr 2021 23:29:34 -0700 Subject: partial access options transform for releases --- python/fatcat_tools/transforms/access.py | 58 ++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 python/fatcat_tools/transforms/access.py diff --git a/python/fatcat_tools/transforms/access.py b/python/fatcat_tools/transforms/access.py new file mode 100644 index 00000000..231cd2b3 --- /dev/null +++ b/python/fatcat_tools/transforms/access.py @@ -0,0 +1,58 @@ + +from enum import Enum +from typing import Optional, List, Any, Dict + +from pydantic import BaseModel +from fatcat_openapi_client import ReleaseEntity + + +class AccessType(str, Enum): + """describes type of access URL""" + + wayback = "wayback" + ia_file = "ia_file" + ia_microfilm = "ia_microfilm" + repository = "repository" + +class AccessOption(BaseModel): + + access_type: AccessType + + # note: for `target_url` refs, would do a CDX lookup and this URL would be + # a valid/HTTP-200 web.archive.org capture URL + access_url: str + + # application/pdf, text/html, etc + # blank for landing pages + mimetype: Optional[str] + + size_bytes: Optional[int] + thumbnail_url: Optional[str] + + +def release_access_options(release: ReleaseEntity) -> List[AccessOption]: + """ + Extracts access options from a 
release. + + TODO: proper implementation + """ + options = [] + for f in (release.files or []): + for u in (f.urls or []): + if '://web.archive.org/' in u.url: + return [AccessOption( + access_type="wayback", + access_url=u.url, + mimetype=f.mimetype, + size_bytes=f.size, + thumbnail_url=None + )] + elif '://archive.org/' in u.url: + return [AccessOption( + access_type="ia_file", + access_url=u.url, + mimetype=f.mimetype, + size_bytes=f.size, + thumbnail_url=None + )] + return options -- cgit v1.2.3 From 15680e0caae7ff6e24ddca8584b0c590d2b30581 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 15 Apr 2021 23:30:11 -0700 Subject: references: refactor to point to access_options transform; comment out CSL fields --- python/fatcat_tools/references.py | 65 +++++---------------------------------- 1 file changed, 8 insertions(+), 57 deletions(-) diff --git a/python/fatcat_tools/references.py b/python/fatcat_tools/references.py index c9730174..7e1f4f71 100644 --- a/python/fatcat_tools/references.py +++ b/python/fatcat_tools/references.py @@ -22,7 +22,6 @@ import sys import json import datetime import argparse -from enum import Enum from typing import Optional, List, Any, Dict from pydantic import BaseModel @@ -31,7 +30,7 @@ from elasticsearch_dsl import Search, Q from fatcat_openapi_client import ReleaseEntity from fatcat_tools import public_api - +from fatcat_tools.transforms.access import release_access_options, AccessOption class BiblioRef(BaseModel): @@ -81,29 +80,6 @@ class BiblioRef(BaseModel): target_unstructured: Optional[str] target_csl: Optional[Dict[str, Any]] -class AccessType(str, Enum): - """describes type of access URL""" - - wayback = "wayback" - ia_file = "ia_file" - ia_microfilm = "ia_microfilm" - repository = "repository" - -class AccessOption(BaseModel): - - access_type: AccessType - - # note: for `target_url` refs, would do a CDX lookup and this URL would be - # a valid/HTTP-200 web.archive.org capture URL - access_url: str - - # application/pdf, 
text/html, etc - # blank for landing pages - mimetype: Optional[str] - - size_bytes: Optional[int] - thumbnail_url: Optional[str] - class CslBiblioRef(BaseModel): # an "enriched" version of BiblioRef with metadata about the source or # target entity. would be "hydrated" via a lookup to, eg, the @@ -122,7 +98,7 @@ class FatcatBiblioRef(BaseModel): # the full release entity. ref: BiblioRef release: Optional[ReleaseEntity] - csl: Optional[Dict[str, Any]] + #csl: Optional[Dict[str, Any]] access: List[AccessOption] class Config: @@ -290,31 +266,6 @@ def count_inbound_refs( return search.count() -def _release_access(release: ReleaseEntity) -> List[AccessOption]: - """ - Extracts access options from a release. - """ - options = [] - for f in (release.files or []): - for u in (f.urls or []): - if '://web.archive.org/' in u.url: - return [AccessOption( - access_type="wayback", - access_url=u.url, - mimetype=f.mimetype, - size_bytes=f.size, - thumbnail_url=None - )] - elif '://archive.org/' in u.url: - return [AccessOption( - access_type="ia_file", - access_url=u.url, - mimetype=f.mimetype, - size_bytes=f.size, - thumbnail_url=None - )] - return options - # run elasticsearch mget query for all ref idents and include "enriched" refs when possible # for outbound URL refs, would do wayback CDX fetches to find a direct wayback URL # TODO: for openlibrary, would this query openlibrary.org API? or some fatcat-specific index? 
@@ -329,14 +280,14 @@ def enrich_inbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, hi release = fatcat_api_client.get_release(ref.source_release_ident, hide=hide, expand=expand) enriched.append(FatcatBiblioRef( ref=ref, - csl=None, - access=_release_access(release), + #csl=None, + access=release_access_options(release), release=release, )) else: enriched.append(FatcatBiblioRef( ref=ref, - csl=None, + #csl=None, access=[], release=None, )) @@ -349,14 +300,14 @@ def enrich_outbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, h release = fatcat_api_client.get_release(ref.target_release_ident, hide=hide, expand=expand) enriched.append(FatcatBiblioRef( ref=ref, - csl=None, - access=_release_access(release), + #csl=None, + access=release_access_options(release), release=release, )) else: enriched.append(FatcatBiblioRef( ref=ref, - csl=None, + #csl=None, access=[], release=None, )) -- cgit v1.2.3 From 314aba35d06eb80be0c5ffc068774bb9bca38e76 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 15 Apr 2021 23:31:07 -0700 Subject: web: initial implementation of fuzzy citation parsing and matching tool --- python/fatcat_web/forms.py | 41 +++++++++++ python/fatcat_web/ref_routes.py | 46 +++++++++++++ python/fatcat_web/templates/reference_match.html | 86 ++++++++++++++++++++++++ 3 files changed, 173 insertions(+) create mode 100644 python/fatcat_web/templates/reference_match.html diff --git a/python/fatcat_web/forms.py b/python/fatcat_web/forms.py index 1c9fb199..19176a59 100644 --- a/python/fatcat_web/forms.py +++ b/python/fatcat_web/forms.py @@ -482,3 +482,44 @@ class EntityTomlForm(EntityEditForm): etf.toml.data = entity_to_toml(entity, pop_fields=pop_fields) return etf + +class ReferenceMatchForm(FlaskForm): + + submit_type = SelectField('submit_type', + [validators.DataRequired()], + choices=['parse', 'match']) + + raw_citation = TextAreaField("Citation String", render_kw={'rows':'3'}) + + title = StringField("Title") + journal = 
StringField("Journal or Conference") + first_author = StringField("First Author") + #year = IntegerField('Year Released', + # [validators.Optional(True), valid_year]) + year = StringField("Year Released") + volume = StringField("Volume") + issue = StringField("Issue") + pages = StringField("Pages") + + @staticmethod + def from_grobid_parse(parse_dict, raw_citation): + """ + Initializes form from GROBID extraction + """ + rmf = ReferenceMatchForm() + rmf.raw_citation.data = raw_citation + + direct_fields = ['title', 'journal', 'volume', 'issue', 'pages'] + for k in direct_fields: + if parse_dict.get(k): + a = getattr(rmf, k) + a.data = parse_dict[k] + + date = parse_dict.get('date') + if date and len(date) >= 4 and date[0:4].isdigit(): + rmf.year.data = int(date[0:4]) + + if parse_dict.get('authors'): + rmf.first_author.data = parse_dict['authors'][0].get('name') + + return rmf diff --git a/python/fatcat_web/ref_routes.py b/python/fatcat_web/ref_routes.py index a49813c4..dc39299f 100644 --- a/python/fatcat_web/ref_routes.py +++ b/python/fatcat_web/ref_routes.py @@ -8,8 +8,11 @@ from typing import Optional from flask import render_template, abort, redirect, request from fatcat_openapi_client import * from fatcat_openapi_client.rest import ApiException +from fuzzycat.grobid_unstructured import grobid_api_process_citation, transform_grobid_ref_xml, grobid_ref_to_release +from fuzzycat.simple import close_fuzzy_biblio_matches, close_fuzzy_release_matches from fatcat_tools.references import enrich_inbound_refs_fatcat, enrich_outbound_refs_fatcat, get_inbound_refs, get_outbound_refs +from fatcat_tools.transforms.access import release_access_options from fatcat_web import app, api, auth_api from fatcat_web.forms import * from fatcat_web.entity_helpers import * @@ -48,3 +51,46 @@ def release_view_refs_outbound(ident): enriched_refs = enrich_outbound_refs_fatcat(hits.result_refs, fatcat_api_client=api, expand="container,files,webcaptures") return 
render_template('release_view_fuzzy_refs.html', direction="outbound", entity=release, hits=hits, enriched_refs=enriched_refs), 200 + +@app.route('/reference/match', methods=['GET', 'POST']) +def reference_match(): + + form = ReferenceMatchForm() + grobid_status = None + grobid_dict = None + + if form.is_submitted(): + if form.validate_on_submit(): + if form.submit_type.data == 'parse': + resp_xml = grobid_api_process_citation(form.raw_citation.data) + if not resp_xml: + grobid_status = "failed" + return render_template('reference_match.html', form=form, grobid_status=grobid_status), 400 + grobid_dict = transform_grobid_ref_xml(resp_xml) + if not grobid_dict: + grobid_status = "empty" + return render_template('reference_match.html', form=form, grobid_status=grobid_status), 200 + #print(grobid_dict) + release_stub = grobid_ref_to_release(grobid_dict) + # remove empty values from GROBID parsed dict + grobid_dict = {k: v for k, v in grobid_dict.items() if v is not None} + form = ReferenceMatchForm.from_grobid_parse(grobid_dict, form.raw_citation.data) + grobid_status = "success" + matches = close_fuzzy_release_matches(es_client=app.es_client, release=release_stub, match_limit=10) or [] + elif form.submit_type.data == 'match': + matches = close_fuzzy_biblio_matches(es_client=app.es_client, biblio=form.data, match_limit=10) or [] + else: + raise NotImplementedError() + + for m in matches: + # expand releases more completely + m.release = api.get_release(m.release.ident, expand="container,files,filesets,webcaptures", hide="abstract,refs") + # hack in access options + m.access_options = release_access_options(m.release) + + return render_template('reference_match.html', form=form, grobid_dict=grobid_dict, grobid_status=grobid_status, matches=matches), 200 + + elif form.errors: + return render_template('reference_match.html', form=form), 400 + + return render_template('reference_match.html', form=form), 200 diff --git a/python/fatcat_web/templates/reference_match.html 
b/python/fatcat_web/templates/reference_match.html new file mode 100644 index 00000000..042b0607 --- /dev/null +++ b/python/fatcat_web/templates/reference_match.html @@ -0,0 +1,86 @@ +{% extends "base.html" %} +{% import "entity_macros.html" as entity_macros %} +{% import "edit_macros.html" as edit_macros %} + +{% block body %} + +

Reference Fuzzy Match Tool

+ +
+ + +

Parse Citation

+ +

Enter a citation string here and we will try to parse it (using GROBID) + into a structured format, then match against the catalog. + + {{ edit_macros.form_field_basic(form.raw_citation) }} + + + +
+ {% if grobid_status == "success" and grobid_dict %} +

+
Parsed successfully! See match results below
+ {{ entity_macros.extra_metadata(grobid_dict) }} +
+ {% endif %} + +
+
+

Fuzzy Match Metadata

+ +

Enter whatever bibliographic metadata fields you know, and we will try to + match to catalog entries. + +

NOTE: if you already know a persistent identifier (like a DOI), you + should use the lookup tool instead. + + {{ edit_macros.form_field_inline(form.title) }} + {{ edit_macros.form_field_inline(form.first_author) }} + +
+

+ {{ edit_macros.form_field_basic(form.year) }} + {{ edit_macros.form_field_basic(form.journal) }} +
+
+ {{ edit_macros.form_field_basic(form.volume) }} + {{ edit_macros.form_field_basic(form.issue) }} + {{ edit_macros.form_field_basic(form.pages) }} +
+ + +
+ +
+ +{% if matches is defined %} +
+
+

Match Results

+ + + {% for match in matches %} + +
+
{{ match.status.name }} +
{{ match.reason.name }} +
+ {{ entity_macros.release_summary(match.release) }} + + {% if match.access_options %} + {{ match.access_options[0].access_type.name }} + {% endif %} + {% endfor %} +
+ {% if not matches %} +

None! + {% endif %} +{% endif %} + +{% endblock %} -- cgit v1.2.3 From 570074b514259bf6345c376faea8128f279bd0b4 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 15 Apr 2021 23:48:06 -0700 Subject: web: inbound/outbound refs as links (temporarily); change URL names --- python/fatcat_web/ref_routes.py | 4 ++-- python/fatcat_web/templates/entity_base.html | 4 ++++ python/fatcat_web/templates/release_view_fuzzy_refs.html | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/python/fatcat_web/ref_routes.py b/python/fatcat_web/ref_routes.py index dc39299f..bd8ae550 100644 --- a/python/fatcat_web/ref_routes.py +++ b/python/fatcat_web/ref_routes.py @@ -18,7 +18,7 @@ from fatcat_web.forms import * from fatcat_web.entity_helpers import * -@app.route('/release//refs/inbound', methods=['GET']) +@app.route('/release//inbound-refs', methods=['GET']) def release_view_refs_inbound(ident): # lookup release ident, ensure it exists @@ -35,7 +35,7 @@ def release_view_refs_inbound(ident): return render_template('release_view_fuzzy_refs.html', direction="inbound", entity=release, hits=hits, enriched_refs=enriched_refs), 200 -@app.route('/release//refs/outbound', methods=['GET']) +@app.route('/release//outbound-refs', methods=['GET']) def release_view_refs_outbound(ident): # lookup release ident, ensure it exists diff --git a/python/fatcat_web/templates/entity_base.html b/python/fatcat_web/templates/entity_base.html index 36280f5d..c23dbef2 100644 --- a/python/fatcat_web/templates/entity_base.html +++ b/python/fatcat_web/templates/entity_base.html @@ -86,6 +86,10 @@ {% elif entity_type == "release" and entity.state != 'deleted' %} {{ entity_tab("contribs", "Authors", "/contribs", entity._authors|count ) }} {{ entity_tab("references", "References", "/references", entity.refs|count) }} + {% if entity.state == 'active' %} + {{ entity_tab("inbound-refs", "Inbound", "/inbound-refs") }} + {{ entity_tab("outbound-refs", "Outbound", "/outbound-refs") }} + {% endif %} {% 
endif %} {{ entity_tab("metadata", "Metadata", "/metadata") }} diff --git a/python/fatcat_web/templates/release_view_fuzzy_refs.html b/python/fatcat_web/templates/release_view_fuzzy_refs.html index bc1fa171..9ceb6060 100644 --- a/python/fatcat_web/templates/release_view_fuzzy_refs.html +++ b/python/fatcat_web/templates/release_view_fuzzy_refs.html @@ -1,5 +1,5 @@ {% set release = entity %} -{% set entity_view = "references" %} +{% set entity_view = "{{ direction }}-refs" %} {% set entity_type = "release" %} {% import "entity_macros.html" as entity_macros %} {% extends "entity_base.html" %} -- cgit v1.2.3 From c2395869ff7860bb2c7f080fd6c097e299ea58bf Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 23 Jun 2021 19:49:36 -0700 Subject: fixes for newer ref index --- python/fatcat_tools/references.py | 2 +- .../templates/release_view_fuzzy_refs.html | 59 ++++------------------ 2 files changed, 11 insertions(+), 50 deletions(-) diff --git a/python/fatcat_tools/references.py b/python/fatcat_tools/references.py index 7e1f4f71..976967d4 100644 --- a/python/fatcat_tools/references.py +++ b/python/fatcat_tools/references.py @@ -68,7 +68,7 @@ class BiblioRef(BaseModel): # skipped: target_wikipedia_article # crossref, pubmed, grobid, etc - match_provenance: str + match_provenance: Optional[str] # strong, weak, etc match_status: Optional[str] # TODO: "match_strength"? diff --git a/python/fatcat_web/templates/release_view_fuzzy_refs.html b/python/fatcat_web/templates/release_view_fuzzy_refs.html index 9ceb6060..ee39d15b 100644 --- a/python/fatcat_web/templates/release_view_fuzzy_refs.html +++ b/python/fatcat_web/templates/release_view_fuzzy_refs.html @@ -33,56 +33,17 @@ {% endif %} {% endif %}
{{ ref.ref.match_status }} -
{{ ref.ref.match_provenance }} + {% if ref.ref.match_provenance %} +
{{ ref.ref.match_provenance }} + {% endif %} - {{ release.title }} - {% if release.release_type not in ["article-journal", "conference-paper"] %} - [{{ release.release_type or "unknown-type" }}] - {% endif %} -
- {% for contrib in release.contribs[:5] %} - {% if contrib.creator %} - {{ contrib.creator.display_name }} - {% else %} - {{ contrib.raw_name }} - {% endif %} - {% if not loop.last %}, {% endif %} - {% endfor %} - {% if release.contribs | length > 5 %}(+ more) {%endif %} -
- {% if release.release_year %}{{ release.release_year }}  {% endif %} - {% if release.container %} - {{ release.container.name }} - {% elif release.extra and release.extra.container_name %} - {{ release.extra.container_name }} - {% endif %} - {% if release.release_stage != "published" %} -  {{ release.release_stage or "unpublished" }} - {% endif %} - -
- {% if release.version %} - version:{{ release.release_year }}  - {% endif %} - {% if release.number %} - number:{{ release.number }}  - {% endif %} - {% if release.ext_ids.doi %} - doi:{{ release.ext_ids.doi }}  - {% endif %} - {# TODO: links #} - {% if release.ext_ids.arxiv %} - arXiv:{{ release.ext_ids.arxiv }}  - {% endif %} - {% if release.ext_ids.pmcid %} - pmcid:{{ release.ext_ids.pmcid }}  - {% endif %} - {% if release.ext_ids.pmid %} - pmid:{{ release.ext_ids.pmid }}  - {% endif %} - {% if release.ext_ids.dblp %} - dblp:{{ release.ext_ids.dblp }}  - {% endif %} + {% if release %} + {% entity_macros.release_summary(release) %} + {% elif ref.ref.target_unstructured %} + {{ ref.ref.target_unstructured }} + {% else %} + blank + {% endif %} {% if ref.access %} {{ ref.access[0].access_type.name }} -- cgit v1.2.3 From 61ed521cc40c1ee76692e9c4054e89fa63320600 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 24 Jun 2021 18:45:08 -0700 Subject: improvements to fuzzy refs view - fixes to release summary macro - show tab counts correctly by re-using generic entity get helper - table styling; 'prev' link - openlibrary access links - parse-and-match button for unmatched+unstructured refs --- python/fatcat_web/ref_routes.py | 12 +-- python/fatcat_web/templates/entity_macros.html | 17 ++-- .../templates/release_view_fuzzy_refs.html | 93 +++++++++++++++------- 3 files changed, 75 insertions(+), 47 deletions(-) diff --git a/python/fatcat_web/ref_routes.py b/python/fatcat_web/ref_routes.py index bd8ae550..e08aaf15 100644 --- a/python/fatcat_web/ref_routes.py +++ b/python/fatcat_web/ref_routes.py @@ -21,11 +21,7 @@ from fatcat_web.entity_helpers import * @app.route('/release//inbound-refs', methods=['GET']) def release_view_refs_inbound(ident): - # lookup release ident, ensure it exists - try: - release = api.get_release(ident) - except ApiException as ae: - abort(ae.status) + release = generic_get_entity("release", ident) offset = request.args.get('offset', '0') 
offset = max(0, int(offset)) if offset.isnumeric() else 0 @@ -38,11 +34,7 @@ def release_view_refs_inbound(ident): @app.route('/release//outbound-refs', methods=['GET']) def release_view_refs_outbound(ident): - # lookup release ident, ensure it exists - try: - release = api.get_release(ident) - except ApiException as ae: - abort(ae.status) + release = generic_get_entity("release", ident) offset = request.args.get('offset', '0') offset = max(0, int(offset)) if offset.isnumeric() else 0 diff --git a/python/fatcat_web/templates/entity_macros.html b/python/fatcat_web/templates/entity_macros.html index 94770afb..24d1b6d0 100644 --- a/python/fatcat_web/templates/entity_macros.html +++ b/python/fatcat_web/templates/entity_macros.html @@ -394,30 +394,29 @@ yellow {% if release.release_type not in ["article-journal", "conference-paper"] %} [{{ release.release_type or "unknown-type" }}] {% endif %} -
+ {% if release.contribs %}
{% endif %} {% for contrib in release.contribs[:5] %} {% if contrib.creator %} - {{ contrib.creator.display_name }} + {{ contrib.creator.display_name }} {% else %} {{ contrib.raw_name }} - {% endif %} - {% if not loop.last %}, {% endif %} + {%- endif %} + {%- if not loop.last %}, {% endif %} {% endfor %} - {% if release.contribs | length > 5 %}(+ more) {%endif %} -
+ {% if release.contribs | length > 5 %} (+ more) {%endif %} + {% if release.release_year or release.container or (release.extra and release.extra.container_name) %}
{% endif %} {% if release.release_year %}{{ release.release_year }}  {% endif %} {% if release.container %} - {{ release.container.name }} + {{ release.container.name }} {% elif release.extra and release.extra.container_name %} {{ release.extra.container_name }} {% endif %} {% if release.release_stage != "published" %}  {{ release.release_stage or "unpublished" }} {% endif %} -
{% if release.version %} - version:{{ release.release_year }}  + version:{{ release.version }}  {% endif %} {% if release.number %} number:{{ release.number }}  diff --git a/python/fatcat_web/templates/release_view_fuzzy_refs.html b/python/fatcat_web/templates/release_view_fuzzy_refs.html index ee39d15b..7b286fd3 100644 --- a/python/fatcat_web/templates/release_view_fuzzy_refs.html +++ b/python/fatcat_web/templates/release_view_fuzzy_refs.html @@ -4,53 +4,90 @@ {% import "entity_macros.html" as entity_macros %} {% extends "entity_base.html" %} +{% macro pagination_row(hits) %} + + {% if hits.offset %} + « prev   + {% endif %} + Showing {{ hits.offset + 1 }} - {{ hits.offset + hits.count_returned }} of {{ hits.count_total}} references (in {{ hits.query_wall_time_ms }}ms) + {% if hits.count_total != hits.count_returned and hits.offset + hits.limit < hits.count_total %} +  next » + {% endif %} +{% endmacro %} + {% block entity_main %} {% if direction == "inbound" %} -

Inbound Matched References

- Other releases citing this one +

Referenced By

+ Citations to this release by other works. {% elif direction == "outbound" %} -

Outbound Matched References

- This release citing other releases -{% endif %} - -

Found {{ hits.count_total }} references in {{ hits.query_wall_time_ms }}ms. -{% if hits.count_total != hits.count_returned %} - Showing {{ hits.offset + 1 }} - {{ hits.offset + hits.count_returned }} - {% if hits.offset + hits.limit < hits.count_total %} -  next... - {% endif %} +

References

+ NOTE: currently batch computed and may include additional references sources, or be missing recent changes, compared to entity reference list. {% endif %} - +{% if enriched_refs %} +
+ + -{% for ref in enriched_refs %} - {% set release = ref.release %} - +{% if hits.count_total != hits.count_returned %} + + +{% endif %}
+ {{ pagination_row(hits) }} +
+{% for row in enriched_refs %} + {% set release = row.release %} +
+ {# TODO: ref_locator? #} {% if direction == "outbound" %} - {% if ref.ref.ref_key %} - [{{ ref.ref.ref_key }}] + {% if row.ref.ref_key %} + [{{ row.ref.ref_key }}]
{% endif %} {% endif %} -
{{ ref.ref.match_status }} - {% if ref.ref.match_provenance %} -
{{ ref.ref.match_provenance }} + {{ row.ref.match_status }}
+ {% if row.ref.match_provenance %} + via {{ row.ref.match_provenance }} {% endif %}
{% if release %} - {% entity_macros.release_summary(release) %} - {% elif ref.ref.target_unstructured %} - {{ ref.ref.target_unstructured }} + {{ entity_macros.release_summary(release) }} + {% elif row.ref.target_unstructured %} + {{ row.ref.target_unstructured }} {% else %} blank {% endif %} - - {% if ref.access %} - {{ ref.access[0].access_type.name }} - {% endif %} + + {% if row.access %} + {% for access in row.access %} + {{ access.access_type.name }}
+ {% endfor %} + {% elif row.ref.target_unstructured %} +
+ + + +
+ {% endif %} + + {# TODO: include these as access options instead #} + {% if row.ref.target_openlibrary_work %} + openlibrary.org + {% endif %} + {% if row.ref.target_url %} + web +
wayback (?) + {% endif %} {% endfor %}
+ {{ pagination_row(hits) }} +
+{% else %} +

None found +{% endif %} {% endblock %} -- cgit v1.2.3 From 7498fb076d0b60a9021f7174f0a5c4ead38c151a Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 24 Jun 2021 18:46:45 -0700 Subject: match UI: improve form layout --- python/fatcat_web/templates/reference_match.html | 29 +++++++++++++----------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/python/fatcat_web/templates/reference_match.html b/python/fatcat_web/templates/reference_match.html index 042b0607..ae6a239c 100644 --- a/python/fatcat_web/templates/reference_match.html +++ b/python/fatcat_web/templates/reference_match.html @@ -38,19 +38,22 @@

NOTE: if you already know a persistent identifier (like a DOI), you should use the lookup tool instead. - {{ edit_macros.form_field_inline(form.title) }} - {{ edit_macros.form_field_inline(form.first_author) }} - -
-

- {{ edit_macros.form_field_basic(form.year) }} - {{ edit_macros.form_field_basic(form.journal) }} -
-
- {{ edit_macros.form_field_basic(form.volume) }} - {{ edit_macros.form_field_basic(form.issue) }} - {{ edit_macros.form_field_basic(form.pages) }} -
+
+
+ {{ edit_macros.form_field_basic(form.title) }} +
+
+ {{ edit_macros.form_field_basic(form.first_author) }} +
+
+ {{ edit_macros.form_field_basic(form.journal) }} +
+
+ {{ edit_macros.form_field_basic(form.year) }} + {{ edit_macros.form_field_basic(form.volume) }} + {{ edit_macros.form_field_basic(form.issue) }} + {{ edit_macros.form_field_basic(form.pages) }} +
+ +
+ -
{% if grobid_status == "success" and grobid_dict %}
-
Parsed successfully! See match results below
+
Parsed Citation String
{{ entity_macros.extra_metadata(grobid_dict) }} +

See below for fuzzy match results

{% endif %} -
-
-

Fuzzy Match Metadata

+
+

Fuzzy Match Metadata

-

Enter whatever bibliographic metadata fields you know, and we will try to - match to catalog entries. +

Enter whatever bibliographic metadata fields you know, and we will try to + match to catalog entries. -

NOTE: if you already know a persistent identifier (like a DOI), you - should use the lookup tool instead. +

NOTE: if you already know a persistent identifier (like a DOI), you + should use the lookup tool instead. -
-

- {{ edit_macros.form_field_basic(form.title) }} -
-
- {{ edit_macros.form_field_basic(form.first_author) }} -
-
- {{ edit_macros.form_field_basic(form.journal) }} -
-
- {{ edit_macros.form_field_basic(form.year) }} - {{ edit_macros.form_field_basic(form.volume) }} - {{ edit_macros.form_field_basic(form.issue) }} - {{ edit_macros.form_field_basic(form.pages) }} -
+
+
+ {{ edit_macros.form_field_basic(form.title) }} +
+
+ {{ edit_macros.form_field_basic(form.first_author) }} +
+
+ {{ edit_macros.form_field_basic(form.journal) }} +
+
+ {{ edit_macros.form_field_basic(form.year) }} + {{ edit_macros.form_field_basic(form.volume) }} + {{ edit_macros.form_field_basic(form.issue) }} + {{ edit_macros.form_field_basic(form.pages) }} +
- -
+ +
+
{% if matches is defined %} -
-
-

Match Results

+

Matched Releases

+ + {% if not matches %} +

No matches found + {% endif %} + {% for match in matches %} @@ -81,9 +87,7 @@ {% endfor %}
- {% if not matches %} -

None! - {% endif %} + {% endif %} {% endblock %} -- cgit v1.2.3 From fa2ba60834cf3cb3edea05af3c1830e6fc0d5bcc Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 27 Jul 2021 18:48:14 -0700 Subject: refs: several small improvements to web UI --- python/fatcat_web/templates/entity_macros.html | 11 +++- .../templates/openlibrary_view_fuzzy_refs.html | 6 +- python/fatcat_web/templates/refs_macros.html | 71 ++++++++++++++++------ .../templates/release_view_fuzzy_refs.html | 12 ++-- .../templates/wikipedia_view_fuzzy_refs.html | 6 +- 5 files changed, 71 insertions(+), 35 deletions(-) diff --git a/python/fatcat_web/templates/entity_macros.html b/python/fatcat_web/templates/entity_macros.html index 37bb2d90..562b99d3 100644 --- a/python/fatcat_web/templates/entity_macros.html +++ b/python/fatcat_web/templates/entity_macros.html @@ -391,7 +391,7 @@ yellow {# this is useful for things like showing lists of releases in tables #} {% macro release_summary(release) %} {{ release.title }} - {% if release.release_type not in ["article-journal", "conference-paper"] %} + {% if release.release_type not in ["article-journal", "paper-conference"] %} [{{ release.release_type or "unknown-type" }}] {% endif %} {% if release.contribs %}
{% endif %} @@ -405,7 +405,14 @@ yellow {% endfor %} {% if release.contribs | length > 8 %} (+ more) {%endif %} {% if release.release_year or release.container or (release.extra and release.extra.container_name) %}
{% endif %} - {% if release.release_year %}{{ release.release_year }}  {% endif %} + {% if release.release_year %} + {% if release.release_date %} + {{ release.release_year }} + {% else %} + {{ release.release_year }} + {% endif %} +   + {% endif %} {% if release.container %} {{ release.container.name }} {% elif release.extra and release.extra.container_name %} diff --git a/python/fatcat_web/templates/openlibrary_view_fuzzy_refs.html b/python/fatcat_web/templates/openlibrary_view_fuzzy_refs.html index 161a7b50..21bf76f2 100644 --- a/python/fatcat_web/templates/openlibrary_view_fuzzy_refs.html +++ b/python/fatcat_web/templates/openlibrary_view_fuzzy_refs.html @@ -19,11 +19,7 @@ Refernces from this book to other entities. {% endif %} -{% if hits.result_refs %} - {{ refs_macros.refs_table(hits, direction) }} -{% else %} -

None found -{% endif %} +{{ refs_macros.refs_table(hits, direction) }} {% endblock %} diff --git a/python/fatcat_web/templates/refs_macros.html b/python/fatcat_web/templates/refs_macros.html index ba4d18ad..3db47064 100644 --- a/python/fatcat_web/templates/refs_macros.html +++ b/python/fatcat_web/templates/refs_macros.html @@ -1,37 +1,67 @@ {% import "entity_macros.html" as entity_macros %} -{% macro pagination_row(hits, direction) %} - - {% if hits.offset %} +{% macro pagination_row(hits, with_links=False) %} + {% if with_links and hits.offset %} « prev   {% endif %} - Showing {{ hits.offset + 1 }} - {{ hits.offset + hits.count_returned }} of {{ hits.count_total}} references (in {{ hits.query_wall_time_ms }}ms) - {% if hits.count_total != hits.count_returned and hits.offset + hits.limit < hits.count_total %} + {% if hits.count_returned == 0 %} + Showing 0 references + {% else %} + Showing {{ hits.offset + 1 }} - {{ hits.offset + hits.count_returned }} of {{ hits.count_total}} references + {% endif %} + {% if with_links and hits.count_total != hits.count_returned and hits.offset + hits.limit < hits.count_total %}  next » {% endif %} {% endmacro %} {% macro refs_table(hits, direction) %} +

+
+ Fuzzy reference matching is a work in progress! +
+ Read more about quality, completeness, and caveats in the fatcat guide. +
+{% if hits.count_total == 0 %} + + -{% if hits.count_total != hits.count_returned %} - - -{% endif %} + +
- {{ pagination_row(hits) }} + {{ pagination_row(hits, with_links=False) }} + (in {{ hits.query_wall_time_ms }}ms)
+
+ + No References Found +
+{% endif %} {% for row in hits.result_refs %} {% set release = row.release %} -
+
{# TODO: ref_locator? #} {% if direction == "out" %} {% if row.ref.ref_key %} [{{ row.ref.ref_key }}]
{% endif %} {% endif %} - {{ row.ref.match_status }}
- {% if row.ref.match_provenance %} - via {{ row.ref.match_provenance }} + + {% if row.ref.match_status == "exact" %} + {% set match_icon = "linkify" %} + {% elif row.ref.match_status == "unmatched" %} + {% set match_icon = "question circle outline" %} + {% else %} + {% set match_icon = "magic" %} {% endif %} +
+ {% if row.ref.match_provenance and row.ref.match_provenance == "fatcat-pubmed" %} + {# this is a common case and making the column render wide #} + via pubmed + {% elif row.ref.match_provenance %} + via {{ row.ref.match_provenance }}
+ {% endif %} +
{% if release %} {{ entity_macros.release_summary(release) }} @@ -62,7 +92,7 @@ {% if row.access %} {% for access in row.access %} - + {%- if access.access_type.name == "wayback" %} web.archive.org {%- elif access.access_type.name == "ia_file" -%} @@ -83,18 +113,23 @@ {% endif %} {% endfor %}
- {{ pagination_row(hits) }} -
+
+ JSON +
+ {% if hits.count_returned != hits.count_total %} +
+ {{ pagination_row(hits, with_links=True) }} +
+ {% endif %} +
{% endmacro %} diff --git a/python/fatcat_web/templates/release_view_fuzzy_refs.html b/python/fatcat_web/templates/release_view_fuzzy_refs.html index ffca0bc9..8cba4f4e 100644 --- a/python/fatcat_web/templates/release_view_fuzzy_refs.html +++ b/python/fatcat_web/templates/release_view_fuzzy_refs.html @@ -13,13 +13,15 @@ {% elif direction == "out" %}

References

NOTE: currently batch computed and may include additional references sources, or be missing recent changes, compared to entity reference list. -{% endif %} -{% if hits.result_refs %} - {{ refs_macros.refs_table(hits, direction) }} -{% else %} -

None found + {% if hits.count_total == 0 and release.refs %} +

+

No fuzzy references found, but there are {{ release.refs|count }} legacy references +

+ {% endif %} {% endif %} +{{ refs_macros.refs_table(hits, direction) }} + {% endblock %} diff --git a/python/fatcat_web/templates/wikipedia_view_fuzzy_refs.html b/python/fatcat_web/templates/wikipedia_view_fuzzy_refs.html index 5b53d692..3e1453c1 100644 --- a/python/fatcat_web/templates/wikipedia_view_fuzzy_refs.html +++ b/python/fatcat_web/templates/wikipedia_view_fuzzy_refs.html @@ -17,11 +17,7 @@ Refernces from wikipedia article to other entities. {% endif %} -{% if hits.result_refs %} - {{ refs_macros.refs_table(hits, direction) }} -{% else %} -

None found -{% endif %} +{{ refs_macros.refs_table(hits, direction) }} {% endblock %} -- cgit v1.2.3 From f3481c02bd7a50d9073902dba07fe265eecb93db Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 27 Jul 2021 18:57:09 -0700 Subject: refs: lint fixes --- python/fatcat_tools/references.py | 1 + python/fatcat_web/ref_routes.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/python/fatcat_tools/references.py b/python/fatcat_tools/references.py index 508cf19d..496a46e1 100644 --- a/python/fatcat_tools/references.py +++ b/python/fatcat_tools/references.py @@ -99,6 +99,7 @@ class EnrichedBiblioRef(BaseModel): access: List[AccessOption] @validator('release') + @classmethod def check_release(cls, v): if v is not None and not isinstance(v, ReleaseEntity): raise ValueError("expected a ReleaseEntity") diff --git a/python/fatcat_web/ref_routes.py b/python/fatcat_web/ref_routes.py index 88ac0744..d4219012 100644 --- a/python/fatcat_web/ref_routes.py +++ b/python/fatcat_web/ref_routes.py @@ -3,7 +3,7 @@ Flask endpoints for reference (citation) endpoints. Eg, listing references "inbound" and "outbound" from a specific release or work. 
""" -from flask import render_template, request, jsonify, Response +from flask import render_template, request, Response from fatcat_openapi_client import * from fuzzycat.grobid_unstructured import grobid_api_process_citation, transform_grobid_ref_xml, grobid_ref_to_release from fuzzycat.simple import close_fuzzy_biblio_matches, close_fuzzy_release_matches @@ -154,7 +154,7 @@ def release_view_refs_inbound_json(ident): @app.route('/openlibrary/OLW/refs-in.json', methods=['GET', 'OPTIONS']) @crossdomain(origin='*',headers=['access-control-allow-origin','Content-Type']) -def openlibrary_view_refs_inbound_json(ident): +def openlibrary_view_refs_inbound_json(id_num): openlibrary_id = f"OL{id_num}W" hits = _refs_web("in", openlibrary_id=openlibrary_id) return Response(hits.json(exclude_unset=True), mimetype="application/json") -- cgit v1.2.3 From f29da9adb50a37cb6aad4e435fc09a5d682cbe6c Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 27 Jul 2021 18:57:25 -0700 Subject: refs: revert fatcat-pubmed -> pubmed truncation This was just going to be confusing --- python/fatcat_web/templates/refs_macros.html | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/python/fatcat_web/templates/refs_macros.html b/python/fatcat_web/templates/refs_macros.html index 3db47064..8b6a5dc3 100644 --- a/python/fatcat_web/templates/refs_macros.html +++ b/python/fatcat_web/templates/refs_macros.html @@ -55,10 +55,7 @@ {% set match_icon = "magic" %} {% endif %}
- {% if row.ref.match_provenance and row.ref.match_provenance == "fatcat-pubmed" %} - {# this is a common case and making the column render wide #} - via pubmed - {% elif row.ref.match_provenance %} + {% if row.ref.match_provenance %} via {{ row.ref.match_provenance }}
{% endif %} -- cgit v1.2.3 From 16ce0cd302b954260ccc255ca486d426b79c6b99 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 27 Jul 2021 19:21:30 -0700 Subject: refs: start the most basic/minimal web refs test coverage ('integration' level) --- python/tests/files/elastic_empty.json | 1 + python/tests/files/elastic_refs_in_release.json | 360 ++++++++++++ python/tests/files/elastic_refs_out_release.json | 679 +++++++++++++++++++++++ python/tests/web_refs.py | 54 ++ 4 files changed, 1094 insertions(+) create mode 100644 python/tests/files/elastic_empty.json create mode 100644 python/tests/files/elastic_refs_in_release.json create mode 100644 python/tests/files/elastic_refs_out_release.json create mode 100644 python/tests/web_refs.py diff --git a/python/tests/files/elastic_empty.json b/python/tests/files/elastic_empty.json new file mode 100644 index 00000000..9b30d03b --- /dev/null +++ b/python/tests/files/elastic_empty.json @@ -0,0 +1 @@ +{"took": 10, "timed_out": false, "_shards": {"total": 6, "successful": 6, "skipped": 0, "failed": 0}, "hits": {"total": {"value": 0, "relation": "eq"}, "max_score": null, "hits": []}} \ No newline at end of file diff --git a/python/tests/files/elastic_refs_in_release.json b/python/tests/files/elastic_refs_in_release.json new file mode 100644 index 00000000..5260ae3f --- /dev/null +++ b/python/tests/files/elastic_refs_in_release.json @@ -0,0 +1,360 @@ +{ + "took": 30, + "timed_out": false, + "_shards": { + "total": 6, + "successful": 6, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 69, + "relation": "eq" + }, + "max_score": null, + "hits": [ + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "rxy26aoognaytoeghum4ncmygq_30", + "_score": null, + "_source": { + "indexed_ts": "2021-07-10T11:47:48Z", + "match_provenance": "crossref", + "match_reason": "doi", + "match_status": "exact", + "ref_index": 30, + "ref_key": "BIB0030|jon779-cit-0030", + "source_release_ident": 
"aaaaaaaaaaaaarceaaaaaaaaam", + "source_work_ident": "3bggl3ls3fftvl3g6yg5qzy4yq", + "source_year": "2013", + "target_release_ident": "cx2xdrrgt5cydcinttixgj4nxu", + "target_work_ident": "s45xqgdp5bftrbymxtsl32hcna" + }, + "fields": { + "source_work_ident": [ + "3bggl3ls3fftvl3g6yg5qzy4yq" + ] + }, + "sort": [ + 2013 + ], + "inner_hits": { + "source_more": { + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": null, + "hits": [] + } + } + } + }, + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "xew5r3gnxbaznhj3kevspu75yq_46", + "_score": null, + "_source": { + "indexed_ts": "2021-07-10T11:47:48Z", + "match_provenance": "crossref", + "match_reason": "doi", + "match_status": "exact", + "ref_index": 46, + "ref_key": "_bib46", + "source_release_ident": "aaaaaaaaaaaaarceaaaaaaaaam", + "source_work_ident": "dqfgm7p2urh3dd2ja2s5cleqr4", + "source_year": "2013", + "target_release_ident": "cx2xdrrgt5cydcinttixgj4nxu", + "target_work_ident": "s45xqgdp5bftrbymxtsl32hcna" + }, + "fields": { + "source_work_ident": [ + "dqfgm7p2urh3dd2ja2s5cleqr4" + ] + }, + "sort": [ + 2013 + ], + "inner_hits": { + "source_more": { + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": null, + "hits": [] + } + } + } + }, + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "dyy6cr675zbivam4wldogvc7ue_23", + "_score": null, + "_source": { + "indexed_ts": "2021-07-10T11:47:48Z", + "match_provenance": "crossref", + "match_reason": "doi", + "match_status": "exact", + "ref_index": 23, + "ref_key": "BFmp2012104_CR23", + "source_release_ident": "aaaaaaaaaaaaarceaaaaaaaaam", + "source_work_ident": "7igrlb5tevgoxdeds2w2opwj7a", + "source_year": "2012", + "target_release_ident": "cx2xdrrgt5cydcinttixgj4nxu", + "target_work_ident": "s45xqgdp5bftrbymxtsl32hcna" + }, + "fields": { + "source_work_ident": [ + "7igrlb5tevgoxdeds2w2opwj7a" + ] + }, + "sort": [ + 2012 + ], + "inner_hits": { + "source_more": { + "hits": { + 
"total": { + "value": 1, + "relation": "eq" + }, + "max_score": null, + "hits": [] + } + } + } + }, + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "kbivvq4n5nhfpfl3dc7xq6bzbu_33", + "_score": null, + "_source": { + "indexed_ts": "2021-07-14T01:37:55Z", + "match_provenance": "fuzzy", + "match_reason": "jaccardauthors", + "match_status": "strong", + "ref_index": 33, + "ref_key": "b33", + "source_release_ident": "aaaaaaaaaaaaarceaaaaaaaaam", + "source_work_ident": "75yfavca2bbwbarcchkm7afhyy", + "source_year": "2012", + "target_release_ident": "cx2xdrrgt5cydcinttixgj4nxu", + "target_work_ident": "s45xqgdp5bftrbymxtsl32hcna" + }, + "fields": { + "source_work_ident": [ + "75yfavca2bbwbarcchkm7afhyy" + ] + }, + "sort": [ + 2012 + ], + "inner_hits": { + "source_more": { + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": null, + "hits": [] + } + } + } + }, + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "2kemk2jdynacznanpixvqiytla_48", + "_score": null, + "_source": { + "indexed_ts": "2021-07-10T11:47:48Z", + "match_provenance": "crossref", + "match_reason": "doi", + "match_status": "exact", + "ref_index": 48, + "ref_key": "BFmp201237_CR48", + "source_release_ident": "aaaaaaaaaaaaarceaaaaaaaaam", + "source_work_ident": "7fu5n2crurex7isvfv5tnf3y2i", + "source_year": "2012", + "target_release_ident": "cx2xdrrgt5cydcinttixgj4nxu", + "target_work_ident": "s45xqgdp5bftrbymxtsl32hcna" + }, + "fields": { + "source_work_ident": [ + "7fu5n2crurex7isvfv5tnf3y2i" + ] + }, + "sort": [ + 2012 + ], + "inner_hits": { + "source_more": { + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": null, + "hits": [] + } + } + } + }, + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "q3qy5z3htnd2likregg3dff23i_37", + "_score": null, + "_source": { + "indexed_ts": "2021-07-10T11:47:48Z", + "match_provenance": "crossref", + "match_reason": "doi", + "match_status": "exact", + 
"ref_index": 37, + "ref_key": "BFnpp2012215_CR37", + "source_release_ident": "aaaaaaaaaaaaarceaaaaaaaaam", + "source_work_ident": "hqk3tnlyvffoppkakajynn5x2u", + "source_year": "2012", + "target_release_ident": "cx2xdrrgt5cydcinttixgj4nxu", + "target_work_ident": "s45xqgdp5bftrbymxtsl32hcna" + }, + "fields": { + "source_work_ident": [ + "hqk3tnlyvffoppkakajynn5x2u" + ] + }, + "sort": [ + 2012 + ], + "inner_hits": { + "source_more": { + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": null, + "hits": [] + } + } + } + }, + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "fzgecyr6cnggbinanrdyc44b4a_126", + "_score": null, + "_source": { + "indexed_ts": "2021-07-10T11:47:48Z", + "match_provenance": "crossref", + "match_reason": "doi", + "match_status": "exact", + "ref_index": 126, + "ref_key": "BFtp201234_CR126", + "source_release_ident": "aaaaaaaaaaaaarceaaaaaaaaam", + "source_work_ident": "hwtljgl3dvhojeusfwon6iba6q", + "source_year": "2012", + "target_release_ident": "cx2xdrrgt5cydcinttixgj4nxu", + "target_work_ident": "s45xqgdp5bftrbymxtsl32hcna" + }, + "fields": { + "source_work_ident": [ + "hwtljgl3dvhojeusfwon6iba6q" + ] + }, + "sort": [ + 2012 + ], + "inner_hits": { + "source_more": { + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": null, + "hits": [] + } + } + } + }, + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "7lt5parryzcbhentd75sqgsuvu_15", + "_score": null, + "_source": { + "indexed_ts": "2021-07-10T11:47:48Z", + "match_provenance": "grobid", + "match_reason": "doi", + "match_status": "exact", + "ref_index": 15, + "ref_key": "b14", + "source_release_ident": "aaaaaaaaaaaaarceaaaaaaaaam", + "source_work_ident": "xenzkonwivbl3bzirbtqohsb4q", + "source_year": "0", + "target_release_ident": "cx2xdrrgt5cydcinttixgj4nxu", + "target_work_ident": "s45xqgdp5bftrbymxtsl32hcna" + }, + "fields": { + "source_work_ident": [ + "xenzkonwivbl3bzirbtqohsb4q" + ] + }, + 
"sort": [ + 0 + ], + "inner_hits": { + "source_more": { + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": null, + "hits": [] + } + } + } + }, + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "wikipedia_knxxa2djmeqem4tbnztw65i_cx2xdrrgt5cydcinttixgj4nxu", + "_score": null, + "_source": { + "match_provenance": "wikipedia", + "match_reason": "doi", + "match_status": "exact", + "source_wikipedia_article": "en:Sophia Frangou", + "target_release_ident": "cx2xdrrgt5cydcinttixgj4nxu", + "target_work_ident": "s45xqgdp5bftrbymxtsl32hcna" + }, + "sort": [ + -9223372036854776000 + ], + "inner_hits": { + "source_more": { + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": null, + "hits": [] + } + } + } + } + ] + } +} diff --git a/python/tests/files/elastic_refs_out_release.json b/python/tests/files/elastic_refs_out_release.json new file mode 100644 index 00000000..5e25d80f --- /dev/null +++ b/python/tests/files/elastic_refs_out_release.json @@ -0,0 +1,679 @@ +{ + "took": 15, + "timed_out": false, + "_shards": { + "total": 6, + "successful": 6, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 34, + "relation": "eq" + }, + "max_score": null, + "hits": [ + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "yfr23sa5rbfizoqjku7uwhq6ye_1", + "_score": null, + "_source": { + "indexed_ts": "2021-07-14T03:08:24Z", + "match_reason": "unknown", + "match_status": "unmatched", + "ref_index": 1, + "ref_key": "e_1_3_2_1_2_1", + "source_release_ident": "yfr23sa5rbfizoqjku7uwhq6ye", + "source_work_ident": "mkmq6ju4abaexcezpb44cpx5xa", + "source_year": "2020", + "target_unstructured": "Accessed: 2020-01-01. GSL- GNU Scientific Library. https://www.gnu.org/software/gsl/. Accessed: 2020-01-01. GSL- GNU Scientific Library. https://www.gnu.org/software/gsl/." 
+ }, + "sort": [ + 1 + ] + }, + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "yfr23sa5rbfizoqjku7uwhq6ye_2", + "_score": null, + "_source": { + "indexed_ts": "2021-07-14T03:08:24Z", + "match_reason": "unknown", + "match_status": "unmatched", + "ref_index": 2, + "ref_key": "e_1_3_2_1_3_1", + "source_release_ident": "yfr23sa5rbfizoqjku7uwhq6ye", + "source_work_ident": "mkmq6ju4abaexcezpb44cpx5xa", + "source_year": "2020", + "target_unstructured": "Accessed: 2020-01-01. KLEE with floating point support. https://github.com/srg-imperial/klee-float. Accessed: 2020-01-01. KLEE with floating point support. https://github.com/srg-imperial/klee-float." + }, + "sort": [ + 2 + ] + }, + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "yfr23sa5rbfizoqjku7uwhq6ye_3", + "_score": null, + "_source": { + "indexed_ts": "2021-07-14T03:08:24Z", + "match_reason": "unknown", + "match_status": "unmatched", + "ref_index": 3, + "ref_key": "e_1_3_2_1_4_1", + "source_release_ident": "yfr23sa5rbfizoqjku7uwhq6ye", + "source_work_ident": "mkmq6ju4abaexcezpb44cpx5xa", + "source_year": "2020", + "target_unstructured": "Accessed: 2020-01-01. LibTooling. https://clang.llvm.org/docs/LibTooling.html. Accessed: 2020-01-01. LibTooling. https://clang.llvm.org/docs/LibTooling.html." + }, + "sort": [ + 3 + ] + }, + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "yfr23sa5rbfizoqjku7uwhq6ye_4", + "_score": null, + "_source": { + "indexed_ts": "2021-07-14T03:08:24Z", + "match_reason": "unknown", + "match_status": "unmatched", + "ref_index": 4, + "ref_key": "e_1_3_2_1_5_1", + "source_release_ident": "yfr23sa5rbfizoqjku7uwhq6ye", + "source_work_ident": "mkmq6ju4abaexcezpb44cpx5xa", + "source_year": "2020", + "target_unstructured": "Accessed: 2020-01-01. Meschach Library. https://www.netlib.org/c/meschach/readme. Accessed: 2020-01-01. Meschach Library. https://www.netlib.org/c/meschach/readme." 
+ }, + "sort": [ + 4 + ] + }, + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "yfr23sa5rbfizoqjku7uwhq6ye_5", + "_score": null, + "_source": { + "indexed_ts": "2021-07-14T03:08:24Z", + "match_reason": "unknown", + "match_status": "unmatched", + "ref_index": 5, + "ref_key": "e_1_3_2_1_6_1", + "source_release_ident": "yfr23sa5rbfizoqjku7uwhq6ye", + "source_work_ident": "mkmq6ju4abaexcezpb44cpx5xa", + "source_year": "2020", + "target_unstructured": "Accessed: 2020-01-01. Toyota: Software to blame for Prius brake problems. http://www.cnn.com/2010/WORLD/asiapcf/02/04/japan.prius.complaints/index.html. Accessed: 2020-01-01. Toyota: Software to blame for Prius brake problems. http://www.cnn.com/2010/WORLD/asiapcf/02/04/japan.prius.complaints/index.html." + }, + "sort": [ + 5 + ] + }, + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "yfr23sa5rbfizoqjku7uwhq6ye_6", + "_score": null, + "_source": { + "indexed_ts": "2021-07-14T03:08:24Z", + "match_reason": "unknown", + "match_status": "unmatched", + "ref_index": 6, + "ref_key": "e_1_3_2_1_7_1", + "source_release_ident": "yfr23sa5rbfizoqjku7uwhq6ye", + "source_work_ident": "mkmq6ju4abaexcezpb44cpx5xa", + "source_year": "2020", + "target_unstructured": "Accessed: 2020-01-01. Z3. https://github.com/Z3Prover/z3. Accessed: 2020-01-01. Z3. https://github.com/Z3Prover/z3." + }, + "sort": [ + 6 + ] + }, + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "yfr23sa5rbfizoqjku7uwhq6ye_7", + "_score": null, + "_source": { + "indexed_ts": "2021-07-14T03:08:24Z", + "match_reason": "unknown", + "match_status": "unmatched", + "ref_index": 7, + "ref_key": "e_1_3_2_1_8_1", + "source_release_ident": "yfr23sa5rbfizoqjku7uwhq6ye", + "source_work_ident": "mkmq6ju4abaexcezpb44cpx5xa", + "source_year": "2020", + "target_csl": { + "accessed": {}, + "author": [ + { + "name": "Alatawi Eman" + } + ], + "container-title": "Symbolic Execution with Invariant Inlay: Evaluating the Potential. 
In 2018 25th Australasian Software Engineering Conference, ASWEC 2018.", + "issued": {} + } + }, + "sort": [ + 7 + ] + }, + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "yfr23sa5rbfizoqjku7uwhq6ye_8", + "_score": null, + "_source": { + "indexed_ts": "2021-07-14T03:08:24Z", + "match_reason": "unknown", + "match_status": "unmatched", + "ref_index": 8, + "ref_key": "e_1_3_2_1_9_1", + "source_release_ident": "yfr23sa5rbfizoqjku7uwhq6ye", + "source_work_ident": "mkmq6ju4abaexcezpb44cpx5xa", + "source_year": "2020", + "target_csl": { + "accessed": {}, + "author": [ + { + "name": "Bagnara Roberto" + } + ], + "container-title": "Symbolic Path-Oriented Test Data Generation for Floating-Point Programs. In Sixth IEEE International Conference on Software Testing, Verification and Validation, ICST", + "issued": {} + } + }, + "sort": [ + 8 + ] + }, + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "yfr23sa5rbfizoqjku7uwhq6ye_10", + "_score": null, + "_source": { + "indexed_ts": "2021-07-10T13:16:48Z", + "match_provenance": "crossref", + "match_reason": "doi", + "match_status": "exact", + "ref_index": 10, + "ref_key": "e_1_3_2_1_10_1", + "source_release_ident": "yfr23sa5rbfizoqjku7uwhq6ye", + "source_work_ident": "mkmq6ju4abaexcezpb44cpx5xa", + "source_year": "2020", + "target_release_ident": "aaaaaaaaaaaaarceaaaaaaaaam", + "target_work_ident": "kf6qgd3e6vc3nhkpf3m32qehj4" + }, + "sort": [ + 10 + ] + }, + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "yfr23sa5rbfizoqjku7uwhq6ye_11", + "_score": null, + "_source": { + "indexed_ts": "2021-07-10T13:16:48Z", + "match_provenance": "crossref", + "match_reason": "doi", + "match_status": "exact", + "ref_index": 11, + "ref_key": "e_1_3_2_1_11_1", + "source_release_ident": "yfr23sa5rbfizoqjku7uwhq6ye", + "source_work_ident": "mkmq6ju4abaexcezpb44cpx5xa", + "source_year": "2020", + "target_release_ident": "aaaaaaaaaaaaarceaaaaaaaaam", + "target_work_ident": 
"gynqpbv73jbdfcfpnzptsq4m64" + }, + "sort": [ + 11 + ] + }, + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "yfr23sa5rbfizoqjku7uwhq6ye_12", + "_score": null, + "_source": { + "indexed_ts": "2021-07-10T13:16:47Z", + "match_provenance": "crossref", + "match_reason": "doi", + "match_status": "exact", + "ref_index": 12, + "ref_key": "e_1_3_2_1_12_1", + "source_release_ident": "yfr23sa5rbfizoqjku7uwhq6ye", + "source_work_ident": "mkmq6ju4abaexcezpb44cpx5xa", + "source_year": "2020", + "target_release_ident": "aaaaaaaaaaaaarceaaaaaaaaam", + "target_work_ident": "osmyp7kaxzbd3duf6hohrtuzvm" + }, + "sort": [ + 12 + ] + }, + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "yfr23sa5rbfizoqjku7uwhq6ye_14", + "_score": null, + "_source": { + "indexed_ts": "2021-07-10T13:16:40Z", + "match_provenance": "crossref", + "match_reason": "doi", + "match_status": "exact", + "ref_index": 14, + "ref_key": "e_1_3_2_1_14_1", + "source_release_ident": "yfr23sa5rbfizoqjku7uwhq6ye", + "source_work_ident": "mkmq6ju4abaexcezpb44cpx5xa", + "source_year": "2020", + "target_release_ident": "aaaaaaaaaaaaarceaaaaaaaaam", + "target_work_ident": "sbxxyxrtxfe5fdukmcgjgjf6we" + }, + "sort": [ + 14 + ] + }, + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "yfr23sa5rbfizoqjku7uwhq6ye_15", + "_score": null, + "_source": { + "indexed_ts": "2021-07-10T13:16:46Z", + "match_provenance": "crossref", + "match_reason": "doi", + "match_status": "exact", + "ref_index": 15, + "ref_key": "e_1_3_2_1_15_1", + "source_release_ident": "yfr23sa5rbfizoqjku7uwhq6ye", + "source_work_ident": "mkmq6ju4abaexcezpb44cpx5xa", + "source_year": "2020", + "target_release_ident": "aaaaaaaaaaaaarceaaaaaaaaam", + "target_work_ident": "aw5o7bbhnjef7fy3cg3prpune4" + }, + "sort": [ + 15 + ] + }, + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "yfr23sa5rbfizoqjku7uwhq6ye_16", + "_score": null, + "_source": { + "indexed_ts": "2021-07-10T13:16:49Z", + 
"match_provenance": "crossref", + "match_reason": "doi", + "match_status": "exact", + "ref_index": 16, + "ref_key": "e_1_3_2_1_16_1", + "source_release_ident": "yfr23sa5rbfizoqjku7uwhq6ye", + "source_work_ident": "mkmq6ju4abaexcezpb44cpx5xa", + "source_year": "2020", + "target_release_ident": "aaaaaaaaaaaaarceaaaaaaaaam", + "target_work_ident": "sdmeosutrzgppilsibx5kbinba" + }, + "sort": [ + 16 + ] + }, + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "yfr23sa5rbfizoqjku7uwhq6ye_18", + "_score": null, + "_source": { + "indexed_ts": "2021-07-10T13:05:08Z", + "match_provenance": "crossref", + "match_reason": "doi", + "match_status": "exact", + "ref_index": 18, + "ref_key": "e_1_3_2_1_18_1", + "source_release_ident": "yfr23sa5rbfizoqjku7uwhq6ye", + "source_work_ident": "mkmq6ju4abaexcezpb44cpx5xa", + "source_year": "2020", + "target_release_ident": "aaaaaaaaaaaaarceaaaaaaaaam", + "target_work_ident": "ae26aeacuvdi3mlgut3g32f42i" + }, + "sort": [ + 18 + ] + }, + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "yfr23sa5rbfizoqjku7uwhq6ye_19", + "_score": null, + "_source": { + "indexed_ts": "2021-07-10T13:16:52Z", + "match_provenance": "crossref", + "match_reason": "doi", + "match_status": "exact", + "ref_index": 19, + "ref_key": "e_1_3_2_1_19_1", + "source_release_ident": "yfr23sa5rbfizoqjku7uwhq6ye", + "source_work_ident": "mkmq6ju4abaexcezpb44cpx5xa", + "source_year": "2020", + "target_release_ident": "aaaaaaaaaaaaarceaaaaaaaaam", + "target_work_ident": "bqsv52bbdnggxkls5cgdbubovm" + }, + "sort": [ + 19 + ] + }, + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "yfr23sa5rbfizoqjku7uwhq6ye_21", + "_score": null, + "_source": { + "indexed_ts": "2021-07-10T13:16:54Z", + "match_provenance": "crossref", + "match_reason": "doi", + "match_status": "exact", + "ref_index": 21, + "ref_key": "e_1_3_2_1_21_1", + "source_release_ident": "yfr23sa5rbfizoqjku7uwhq6ye", + "source_work_ident": 
"mkmq6ju4abaexcezpb44cpx5xa", + "source_year": "2020", + "target_release_ident": "aaaaaaaaaaaaarceaaaaaaaaam", + "target_work_ident": "edpnjczcr5ebrppg5g5adrg5ty" + }, + "sort": [ + 21 + ] + }, + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "yfr23sa5rbfizoqjku7uwhq6ye_22", + "_score": null, + "_source": { + "indexed_ts": "2021-07-10T13:15:52Z", + "match_provenance": "crossref", + "match_reason": "doi", + "match_status": "exact", + "ref_index": 22, + "ref_key": "e_1_3_2_1_22_1", + "source_release_ident": "yfr23sa5rbfizoqjku7uwhq6ye", + "source_work_ident": "mkmq6ju4abaexcezpb44cpx5xa", + "source_year": "2020", + "target_release_ident": "aaaaaaaaaaaaarceaaaaaaaaam", + "target_work_ident": "cropr6fte5dbtbnnlaau7fcp3a" + }, + "sort": [ + 22 + ] + }, + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "yfr23sa5rbfizoqjku7uwhq6ye_23", + "_score": null, + "_source": { + "indexed_ts": "2021-07-10T13:16:58Z", + "match_provenance": "crossref", + "match_reason": "doi", + "match_status": "exact", + "ref_index": 23, + "ref_key": "e_1_3_2_1_23_1", + "source_release_ident": "yfr23sa5rbfizoqjku7uwhq6ye", + "source_work_ident": "mkmq6ju4abaexcezpb44cpx5xa", + "source_year": "2020", + "target_release_ident": "aaaaaaaaaaaaarceaaaaaaaaam", + "target_work_ident": "e7kzr7fvmrg2repjxglg6ptzz4" + }, + "sort": [ + 23 + ] + }, + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "yfr23sa5rbfizoqjku7uwhq6ye_24", + "_score": null, + "_source": { + "indexed_ts": "2021-07-14T03:08:24Z", + "match_reason": "unknown", + "match_status": "unmatched", + "ref_index": 24, + "ref_key": "e_1_3_2_1_25_1", + "source_release_ident": "yfr23sa5rbfizoqjku7uwhq6ye", + "source_work_ident": "mkmq6ju4abaexcezpb44cpx5xa", + "source_year": "2020", + "target_unstructured": "Michael O. Lam Jeffrey K. Hollingsworth and G. W. Stewart. 2013. Dynamic floating-point cancellation detection. Parallel Comput. (2013). Michael O. Lam Jeffrey K. Hollingsworth and G. W. 
Stewart. 2013. Dynamic floating-point cancellation detection. Parallel Comput. (2013)." + }, + "sort": [ + 24 + ] + }, + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "yfr23sa5rbfizoqjku7uwhq6ye_26", + "_score": null, + "_source": { + "indexed_ts": "2021-07-10T13:05:10Z", + "match_provenance": "crossref", + "match_reason": "doi", + "match_status": "exact", + "ref_index": 26, + "ref_key": "e_1_3_2_1_26_1", + "source_release_ident": "yfr23sa5rbfizoqjku7uwhq6ye", + "source_work_ident": "mkmq6ju4abaexcezpb44cpx5xa", + "source_year": "2020", + "target_release_ident": "aaaaaaaaaaaaarceaaaaaaaaam", + "target_work_ident": "zwdzqlf4ivdlbg6f7hmccyig7u" + }, + "sort": [ + 26 + ] + }, + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "yfr23sa5rbfizoqjku7uwhq6ye_27", + "_score": null, + "_source": { + "indexed_ts": "2021-07-10T13:16:50Z", + "match_provenance": "crossref", + "match_reason": "doi", + "match_status": "exact", + "ref_index": 27, + "ref_key": "e_1_3_2_1_27_1", + "source_release_ident": "yfr23sa5rbfizoqjku7uwhq6ye", + "source_work_ident": "mkmq6ju4abaexcezpb44cpx5xa", + "source_year": "2020", + "target_release_ident": "aaaaaaaaaaaaarceaaaaaaaaam", + "target_work_ident": "parh4accgzgbtahyxmm2sdfnzy" + }, + "sort": [ + 27 + ] + }, + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "yfr23sa5rbfizoqjku7uwhq6ye_28", + "_score": null, + "_source": { + "indexed_ts": "2021-07-10T13:05:08Z", + "match_provenance": "crossref", + "match_reason": "doi", + "match_status": "exact", + "ref_index": 28, + "ref_key": "e_1_3_2_1_28_1", + "source_release_ident": "yfr23sa5rbfizoqjku7uwhq6ye", + "source_work_ident": "mkmq6ju4abaexcezpb44cpx5xa", + "source_year": "2020", + "target_release_ident": "aaaaaaaaaaaaarceaaaaaaaaam", + "target_work_ident": "br57toqbtnathfwbmkqyyva63i" + }, + "sort": [ + 28 + ] + }, + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "yfr23sa5rbfizoqjku7uwhq6ye_29", + "_score": null, 
+ "_source": { + "indexed_ts": "2021-07-10T13:16:49Z", + "match_provenance": "crossref", + "match_reason": "doi", + "match_status": "exact", + "ref_index": 29, + "ref_key": "e_1_3_2_1_29_1", + "source_release_ident": "yfr23sa5rbfizoqjku7uwhq6ye", + "source_work_ident": "mkmq6ju4abaexcezpb44cpx5xa", + "source_year": "2020", + "target_release_ident": "aaaaaaaaaaaaarceaaaaaaaaam", + "target_work_ident": "tlc4c3mwjnh25nnfjgpoic5r44" + }, + "sort": [ + 29 + ] + }, + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "yfr23sa5rbfizoqjku7uwhq6ye_30", + "_score": null, + "_source": { + "indexed_ts": "2021-07-10T13:16:52Z", + "match_provenance": "crossref", + "match_reason": "doi", + "match_status": "exact", + "ref_index": 30, + "ref_key": "e_1_3_2_1_30_1", + "source_release_ident": "yfr23sa5rbfizoqjku7uwhq6ye", + "source_work_ident": "mkmq6ju4abaexcezpb44cpx5xa", + "source_year": "2020", + "target_release_ident": "aaaaaaaaaaaaarceaaaaaaaaam", + "target_work_ident": "z7rtkf2gljct5pvmlt6vitakri" + }, + "sort": [ + 30 + ] + }, + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "yfr23sa5rbfizoqjku7uwhq6ye_31", + "_score": null, + "_source": { + "indexed_ts": "2021-07-10T13:16:49Z", + "match_provenance": "crossref", + "match_reason": "doi", + "match_status": "exact", + "ref_index": 31, + "ref_key": "e_1_3_2_1_31_1", + "source_release_ident": "yfr23sa5rbfizoqjku7uwhq6ye", + "source_work_ident": "mkmq6ju4abaexcezpb44cpx5xa", + "source_year": "2020", + "target_release_ident": "aaaaaaaaaaaaarceaaaaaaaaam", + "target_work_ident": "qhxmybpi2fgt3hqnxx4admuwwq" + }, + "sort": [ + 31 + ] + }, + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "yfr23sa5rbfizoqjku7uwhq6ye_32", + "_score": null, + "_source": { + "indexed_ts": "2021-07-10T13:16:49Z", + "match_provenance": "crossref", + "match_reason": "doi", + "match_status": "exact", + "ref_index": 32, + "ref_key": "e_1_3_2_1_32_1", + "source_release_ident": 
"yfr23sa5rbfizoqjku7uwhq6ye", + "source_work_ident": "mkmq6ju4abaexcezpb44cpx5xa", + "source_year": "2020", + "target_release_ident": "aaaaaaaaaaaaarceaaaaaaaaam", + "target_work_ident": "xdbsyi37jvfidpfp36okylalzi" + }, + "sort": [ + 32 + ] + }, + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "yfr23sa5rbfizoqjku7uwhq6ye_34", + "_score": null, + "_source": { + "indexed_ts": "2021-07-10T13:16:43Z", + "match_provenance": "crossref", + "match_reason": "doi", + "match_status": "exact", + "ref_index": 34, + "ref_key": "e_1_3_2_1_34_1", + "source_release_ident": "yfr23sa5rbfizoqjku7uwhq6ye", + "source_work_ident": "mkmq6ju4abaexcezpb44cpx5xa", + "source_year": "2020", + "target_release_ident": "aaaaaaaaaaaaarceaaaaaaaaam", + "target_work_ident": "pc5ynoopsnclvbiyzedr3swk34" + }, + "sort": [ + 34 + ] + }, + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "yfr23sa5rbfizoqjku7uwhq6ye_35", + "_score": null, + "_source": { + "indexed_ts": "2021-07-10T13:16:56Z", + "match_provenance": "crossref", + "match_reason": "doi", + "match_status": "exact", + "ref_index": 35, + "ref_key": "e_1_3_2_1_35_1", + "source_release_ident": "yfr23sa5rbfizoqjku7uwhq6ye", + "source_work_ident": "mkmq6ju4abaexcezpb44cpx5xa", + "source_year": "2020", + "target_release_ident": "aaaaaaaaaaaaarceaaaaaaaaam", + "target_work_ident": "hnmreinbn5aehmz2hqqydfpeve" + }, + "sort": [ + 35 + ] + }, + { + "_index": "fatcat_ref_v02_20210716", + "_type": "_doc", + "_id": "yfr23sa5rbfizoqjku7uwhq6ye_36", + "_score": null, + "_source": { + "indexed_ts": "2021-07-10T13:16:52Z", + "match_provenance": "crossref", + "match_reason": "doi", + "match_status": "exact", + "ref_index": 36, + "ref_key": "e_1_3_2_1_36_1", + "source_release_ident": "yfr23sa5rbfizoqjku7uwhq6ye", + "source_work_ident": "mkmq6ju4abaexcezpb44cpx5xa", + "source_year": "2020", + "target_release_ident": "aaaaaaaaaaaaarceaaaaaaaaam", + "target_work_ident": "abumc5tgazaahbjh4hb7t6k7qm" + }, + "sort": [ + 36 + ] 
+ } + ] + } +} diff --git a/python/tests/web_refs.py b/python/tests/web_refs.py new file mode 100644 index 00000000..bceb8557 --- /dev/null +++ b/python/tests/web_refs.py @@ -0,0 +1,54 @@ + +import json +import pytest + +from fatcat_web.search import get_elastic_container_random_releases +from fatcat_openapi_client.rest import ApiException +from fixtures import * + + +def test_basic_refs(app, mocker): + + with open('tests/files/elastic_refs_in_release.json') as f: + elastic_resp_in = json.loads(f.read()) + with open('tests/files/elastic_refs_out_release.json') as f: + elastic_resp_out = json.loads(f.read()) + with open('tests/files/elastic_empty.json') as f: + elastic_resp_empty = json.loads(f.read()) + + es_raw = mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') + es_raw.side_effect = [ + (200, {}, json.dumps(elastic_resp_in)), + (200, {}, json.dumps(elastic_resp_in)), + (200, {}, json.dumps(elastic_resp_empty)), + (200, {}, json.dumps(elastic_resp_out)), + (200, {}, json.dumps(elastic_resp_out)), + (200, {}, json.dumps(elastic_resp_empty)), + ] + + # render refs-in + rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaai/refs-in') + assert rv.status_code == 200 + assert b"Why Most Published Research Findings Are False" in rv.data + + rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaai/refs-in.json') + assert rv.status_code == 200 + + # empty (in) + rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaai/refs-in') + assert rv.status_code == 200 + assert b"No References Found" in rv.data + + # render refs-out + rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaai/refs-out') + assert rv.status_code == 200 + assert b"Why Most Published Research Findings Are False" in rv.data + + rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaai/refs-out.json') + assert rv.status_code == 200 + + # empty (out) + rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaai/refs-out') + assert rv.status_code == 200 + assert b"No References Found" in rv.data + -- cgit v1.2.3 From 
ed56037d929d50abab707ee5eb9f583789a8ac7a Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 27 Jul 2021 19:21:49 -0700 Subject: refs: fix typo preventing CSL from rendering in refs output --- python/fatcat_web/templates/refs_macros.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/fatcat_web/templates/refs_macros.html b/python/fatcat_web/templates/refs_macros.html index 8b6a5dc3..3095ce54 100644 --- a/python/fatcat_web/templates/refs_macros.html +++ b/python/fatcat_web/templates/refs_macros.html @@ -81,7 +81,7 @@ openlibrary:{{ row.ref.target_openlibrary_work }}  [cited-by]  {% endif %} - {% elif direction == "in" and row.ref.target_csl %} + {% elif direction == "out" and row.ref.target_csl %} {{ entity_macros.csl_summary(row.ref.target_csl) }} {% else %} blank -- cgit v1.2.3 From bfdbbdd50ab06d28a2099e408ff154b0ce1cbc4b Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 27 Jul 2021 19:54:11 -0700 Subject: start CHANGELOG for refs work --- CHANGELOG.md | 5 +++++ guide/src/SUMMARY.md | 2 ++ guide/src/reference_graph.md | 9 +++++++++ guide/src/search_api.md | 29 +++++++++++++++++++++++++++++ 4 files changed, 45 insertions(+) create mode 100644 guide/src/reference_graph.md create mode 100644 guide/src/search_api.md diff --git a/CHANGELOG.md b/CHANGELOG.md index ffa4a8b3..3b171fa5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,11 @@ See also: ## Unreleased +### Added + +- reference graph views, based on fuzzy reference dataset in `cgraph` and + `fatcat-scholar` projects, stored in elasticsearch index + ### Fixed - viewing deleted release entities no longer result in 500 error diff --git a/guide/src/SUMMARY.md b/guide/src/SUMMARY.md index ffc80ac2..c7d12cb0 100644 --- a/guide/src/SUMMARY.md +++ b/guide/src/SUMMARY.md @@ -8,6 +8,7 @@ - [Goals and Related Projects](./goals.md) - [Data Model](./data_model.md) - [Editing Workflow](./workflow.md) + - [Reference Graph](./reference_graph.md) - [Sources of Metadata](./sources.md) 
- [Implementation and Infrastructure](./implementation.md) - [Roadmap](./roadmap.md) @@ -21,6 +22,7 @@ - [Release](./entity_release.md) - [Work](./entity_work.md) - [Public API](./http_api.md) + - [Search API](./search_api.md) - [Bulk Exports](./bulk_exports.md) - [Cookbook](./cookbook.md) - [Contributing](./contributing.md) diff --git a/guide/src/reference_graph.md b/guide/src/reference_graph.md new file mode 100644 index 00000000..3b773150 --- /dev/null +++ b/guide/src/reference_graph.md @@ -0,0 +1,9 @@ + +# Reference Graph + +As a new feature, fuzzy-matched references are available on an "inbound" and +"outbound" basis in the web interface. + +The backend reference graph is available via the [Search API](./search_api.md) +under the `fatcat_ref` index. + diff --git a/guide/src/search_api.md b/guide/src/search_api.md new file mode 100644 index 00000000..91b7c8e9 --- /dev/null +++ b/guide/src/search_api.md @@ -0,0 +1,29 @@ + +# Search API + +The Elasticsearch indices used to power metadata search, statistics, and graphs +on the fatcat web interface are exposed publicly at +`https://search.fatcat.wiki`. Third parties can make queries using the +Elasticsearch API, which is well documented online and has client libraries in +many programming languages. + +A thin proxy (`es-public-proxy`) filters requests to avoid expensive queries +which could cause problems for search queries on the web interface, but most of +the Elasticsearch API is supported, including powerful aggregation queries. + +There is a short delay between updates to the fatcat catalog (via the main API) +and updates to the search index. + +Notable indices include: + +- `fatcat_release`: release entity metadata +- `fatcat_container`: container entity metadata +- `fatcat_ref`: reference graph + +Schemas for these indices can be fetched directly from the index (eg, +`https://search.fatcat.wiki/fatcat_release/_mapping`), and are versioned in the +fatcat git repository under `fatcat:extra/elasticsearch/`. 
They are a +simplification and transform of the regular entity schemas, and include some +synthesized fields (such as "preservation status" for releases). Note that the +search schemas are likely to change over time with less notice and stability +guarantees than the primary catalog API schema. -- cgit v1.2.3 From 37b6e99eec3cbc668d6b51ed9e57b93f9a114d2a Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 3 Aug 2021 18:15:10 -0700 Subject: refs: web UI tweaks for iterated CSL schema --- python/fatcat_web/templates/entity_macros.html | 22 +++++++++++++++++++--- python/tests/files/elastic_refs_out_release.json | 10 +++++++--- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/python/fatcat_web/templates/entity_macros.html b/python/fatcat_web/templates/entity_macros.html index 562b99d3..6b565f69 100644 --- a/python/fatcat_web/templates/entity_macros.html +++ b/python/fatcat_web/templates/entity_macros.html @@ -457,15 +457,31 @@ yellow {% if csl.title and csl.author %}
{% endif %} {% if csl.author %} {% for author in csl.author[:8] %} - {# TODO: other name variants? #} - {{ author.name }} + {% if author.literal %} + {{ author.literal }} + {% elif author.raw_name %} + {{ author.raw_name }} + {% elif author.family and author.given %} + {{ author.given }} {{ author.family }} + {% elif author.family %} + {{ author.family }} + {% elif author.name %} + {# DEPRECATED: was used by refs code path for a while. Delete in, eg, year 2022 #} + {{ author.name }} + {% endif %} {%- if not loop.last %}, {% endif %} {% endfor %} {% if csl.author | length > 8 %} (+ more) {%endif %} {% endif %} {% if csl.issued or csl["container-title"] %}
{% endif %} - {% if csl.issued and csl.issued.raw %}{{ csl.issued.raw }}  {% endif %} + {% if csl.issued and csl.issued is mapping %} + {% if csl.issued['date-parts'] %} + {{ csl.issued['date-parts'][0][0] }}   + {% elif csl.issued.raw %} + {{ csl.issued.raw }}   + {% endif %} + {% endif %} {% if csl["container-title"] %} {{ csl["container-title"] }} {% endif %} diff --git a/python/tests/files/elastic_refs_out_release.json b/python/tests/files/elastic_refs_out_release.json index 5e25d80f..5a45acee 100644 --- a/python/tests/files/elastic_refs_out_release.json +++ b/python/tests/files/elastic_refs_out_release.json @@ -152,11 +152,13 @@ "accessed": {}, "author": [ { - "name": "Alatawi Eman" + "raw_name": "Alatawi Eman" } ], "container-title": "Symbolic Execution with Invariant Inlay: Evaluating the Potential. In 2018 25th Australasian Software Engineering Conference, ASWEC 2018.", - "issued": {} + "issued": { + "date-parts": [[2019]] + } } }, "sort": [ @@ -185,7 +187,9 @@ } ], "container-title": "Symbolic Path-Oriented Test Data Generation for Floating-Point Programs. In Sixth IEEE International Conference on Software Testing, Verification and Validation, ICST", - "issued": {} + "issued": { + "raw": "2000" + } } }, "sort": [ -- cgit v1.2.3 From 4338ec346381647f079a464092e45c609894fe11 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 4 Aug 2021 12:11:20 -0700 Subject: refs web: correct URL to refs section of guide --- python/fatcat_web/templates/refs_macros.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/fatcat_web/templates/refs_macros.html b/python/fatcat_web/templates/refs_macros.html index 3095ce54..4ccca7a5 100644 --- a/python/fatcat_web/templates/refs_macros.html +++ b/python/fatcat_web/templates/refs_macros.html @@ -19,7 +19,7 @@

Fuzzy reference matching is a work in progress!
- Read more about quality, completeness, and caveats in the fatcat guide. + Read more about quality, completeness, and caveats in the fatcat guide. -- cgit v1.2.3 From 56e4ce2d8347cdfedd492d54fde080772f3d8725 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 6 Aug 2021 11:58:10 -0700 Subject: refs: format (commas) large refs hit counts --- python/fatcat_web/templates/refs_macros.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/fatcat_web/templates/refs_macros.html b/python/fatcat_web/templates/refs_macros.html index 4ccca7a5..47ea2dcf 100644 --- a/python/fatcat_web/templates/refs_macros.html +++ b/python/fatcat_web/templates/refs_macros.html @@ -7,7 +7,7 @@ {% if hits.count_returned == 0 %} Showing 0 references {% else %} - Showing {{ hits.offset + 1 }} - {{ hits.offset + hits.count_returned }} of {{ hits.count_total}} references + Showing {{ "{:,}".format(hits.offset + 1) }} - {{ "{:,}".format(hits.offset + hits.count_returned) }} of {{ "{:,}".format(hits.count_total) }} references {% endif %} {% if with_links and hits.count_total != hits.count_returned and hits.offset + hits.limit < hits.count_total %}  next » -- cgit v1.2.3