author | Bryan Newbold &lt;bnewbold@robocracy.org&gt; | 2021-08-06 11:58:16 -0700
---|---|---
committer | Bryan Newbold &lt;bnewbold@robocracy.org&gt; | 2021-08-06 11:58:16 -0700
commit | 99885b458ad505ebb63b3e7cf5b1bae3dd2a459e (patch) |
tree | de3fbb3e42b0bb7f6e447d2e13ac3f92a8bb90b2 | /python/fatcat_tools
parent | 950d3f08bd439aed92d01dbc3cca9747570aa82c (diff) |
parent | 56e4ce2d8347cdfedd492d54fde080772f3d8725 (diff) |
Merge branch 'bnewbold-refs-apis'
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- | python/fatcat_tools/references.py | 429
-rw-r--r-- | python/fatcat_tools/transforms/access.py | 60 |
2 files changed, 489 insertions, 0 deletions
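
Both files are new. The fatcat_tools.references module wraps the "fatcat_ref"
elasticsearch index of the citation graph with typed lookup and enrichment
helpers, plus a small CLI for debugging; transforms/access.py adds a pydantic
model describing how a reference target can be accessed. A minimal sketch of
the intended lookup pattern (the release identifier is the example used in the
module docstring below, and the endpoints are the defaults from main()):

    import elasticsearch
    from fatcat_tools.references import get_inbound_refs

    es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki")
    hits = get_inbound_refs(es_client, release_ident="pfrind3kh5hqhgqkueulk2tply", limit=10)
    for ref in hits.result_refs:
        print(f"work_{ref.source_work_ident}\trelease_{ref.source_release_ident}")
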
diff --git a/python/fatcat_tools/references.py b/python/fatcat_tools/references.py
new file mode 100644
index 00000000..496a46e1
--- /dev/null
+++ b/python/fatcat_tools/references.py
@@ -0,0 +1,429 @@
"""
Helper routines for working with the fatcat citation graph, which is a separate
index of reference links between works in the main catalog.

See the bulk citation and citation API proposals for design documentation.
"""

import sys
import datetime
import argparse
from typing import Optional, List, Any, Dict, Union

from pydantic import BaseModel, validator
import elasticsearch
from elasticsearch_dsl import Search
from fatcat_openapi_client import ReleaseEntity

from fatcat_tools import public_api
from fatcat_tools.transforms.access import release_access_options, AccessOption
from fatcat_tools.transforms.entities import entity_to_dict


class BiblioRef(BaseModel):
    """bibliographic reference"""

    # _key is either:
    #   ("release", source_release_ident, ref_index)
    #   ("wikipedia", source_wikipedia_article, ref_index)
    _key: Optional[str]
    update_ts: Optional[datetime.datetime]

    # metadata about source of reference
    source_release_ident: Optional[str]
    source_work_ident: Optional[str]
    # with lang prefix like "en:Superglue"
    source_wikipedia_article: Optional[str]
    source_release_stage: Optional[str]
    source_year: Optional[int]

    # context of the reference itself
    # 1-indexed, not 0-indexed
    ref_index: Optional[int]  # TODO: actually optional?
    # eg, "Lee86", "BIB23"
    ref_key: Optional[str]
    # eg, page number
    ref_locator: Optional[str]

    # target of reference (identifiers)
    target_release_ident: Optional[str]
    target_work_ident: Optional[str]
    target_openlibrary_work: Optional[str]
    # TODO: target_url_surt: Optional[str]
    # would not be stored in elasticsearch, but would be auto-generated by all
    # "get" methods from the SURT, so calling code does not need to do the
    # SURT transform itself
    target_url: Optional[str]

    # crossref, pubmed, grobid, etc
    match_provenance: Optional[str]
    # strong, weak, etc
    match_status: Optional[str]
    # TODO: "match_strength"?
    # "doi", "isbn", "fuzzy title, author", etc
    # maybe "fuzzy-title-author"?
    match_reason: Optional[str]

    # only if no release_ident link/match
    target_unstructured: Optional[str]
    target_csl: Optional[Dict[str, Any]]

    def hacks(self) -> "BiblioRef":
        """
        Temporary (?) hacks to work around schema/data issues
        """
        if self.target_openlibrary_work and self.target_openlibrary_work.startswith("/works/"):
            self.target_openlibrary_work = self.target_openlibrary_work[7:]

        # work-arounds for bad/weird ref_key
        if self.ref_key:
            self.ref_key = self.ref_key.strip()
            if self.ref_key and self.ref_key[0] in ['/', '_']:
                self.ref_key = self.ref_key[1:]
            if self.ref_key.startswith("10.") and 'SICI' in self.ref_key and '-' in self.ref_key:
                self.ref_key = self.ref_key.split('-')[-1]
            if self.ref_key.startswith("10.") and '_' in self.ref_key:
                self.ref_key = self.ref_key.split('_')[-1]
            if len(self.ref_key) > 10 and "#" in self.ref_key:
                self.ref_key = self.ref_key.split('#')[-1]
            if len(self.ref_key) > 10 and "_" in self.ref_key:
                self.ref_key = self.ref_key.split('_')[-1]
        if not self.ref_key and self.ref_index is not None:
            self.ref_key = str(self.ref_index)
        return self
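
# An illustrative (hypothetical) example of the clean-up above: a raw hit with
# ref_key="_B12" and target_openlibrary_work="/works/OL123W" would come out of
# .hacks() with ref_key="B12" and target_openlibrary_work="OL123W".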

class EnrichedBiblioRef(BaseModel):
    """
    Enriched version of BiblioRef, with the complete ReleaseEntity object as
    fetched from the fatcat API. CSL-JSON metadata would be derived from the
    full release entity.
    """

    ref: BiblioRef
    release: Optional[ReleaseEntity]
    # TODO: openlibrary work?
    access: List[AccessOption]

    @validator('release')
    @classmethod
    def check_release(cls, v):
        if v is not None and not isinstance(v, ReleaseEntity):
            raise ValueError("expected a ReleaseEntity")
        return v

    class Config:
        arbitrary_types_allowed = True
        json_encoders = {
            ReleaseEntity: entity_to_dict,
        }


class RefHits(BaseModel):
    count_returned: int
    count_total: int
    offset: int
    limit: int
    query_time_ms: int
    query_wall_time_ms: int
    result_refs: List[Union[BiblioRef, EnrichedBiblioRef]]

    class Config:
        json_encoders = {
            ReleaseEntity: entity_to_dict,
        }


def _execute_ref_query(search: Any, limit: int, offset: Optional[int] = None) -> RefHits:
    """
    Internal helper for querying an elasticsearch refs index and transforming hits
    """

    limit = min(int(limit or 15), 200)
    if not offset or offset < 0:
        offset = 0

    search = search.params(track_total_hits=True)
    search = search[offset : (offset + limit)]

    query_start = datetime.datetime.now()
    try:
        resp = search.execute()
    except elasticsearch.exceptions.RequestError as e_raw:
        # this is a "user" error
        e: Any = e_raw
        # logging.warn("elasticsearch 400: " + str(e.info))
        if e.info.get("error", {}).get("root_cause", {}):
            raise ValueError(str(e.info["error"]["root_cause"][0].get("reason"))) from e
        else:
            raise ValueError(str(e.info)) from e
    except elasticsearch.exceptions.TransportError as e:
        # all other errors
        # logging.warn(f"elasticsearch non-200 status code: {e.info}")
        raise IOError(str(e.info)) from e
    query_delta = datetime.datetime.now() - query_start

    result_refs = []
    for h in resp.hits:
        # source_work_ident might be a list because of consolidation (collapse)
        if isinstance(h._d_.get('source_work_ident'), list):
            h._d_['source_work_ident'] = h._d_['source_work_ident'][0]
        result_refs.append(BiblioRef.parse_obj(h._d_).hacks())

    return RefHits(
        count_returned=len(result_refs),
        # ES 7.x style "total"
        count_total=resp.hits.total.value,
        offset=offset,
        limit=limit,
        query_time_ms=int(resp.took),
        query_wall_time_ms=int(query_delta.total_seconds() * 1000),
        result_refs=result_refs,
    )


def get_outbound_refs(
    es_client: Any,
    release_ident: Optional[str] = None,
    work_ident: Optional[str] = None,
    wikipedia_article: Optional[str] = None,
    limit: int = 100,
    offset: Optional[int] = None,
    es_index: str = "fatcat_ref",
) -> RefHits:

    search = Search(using=es_client, index=es_index)

    if release_ident:
        search = search.filter("term", source_release_ident=release_ident)
    elif work_ident:
        search = search.filter("term", source_work_ident=work_ident)
    elif wikipedia_article:
        search = search.filter("term", source_wikipedia_article=wikipedia_article)
    else:
        raise ValueError("require a lookup key")

    search = search.sort("ref_index")

    hits = _execute_ref_query(search, limit=limit, offset=offset)
    # re-sort by ref_index (missing values sort first)
    hits.result_refs = sorted(hits.result_refs, key=lambda r: r.ref_index or 0)
    return hits


def get_inbound_refs(
    es_client: Any,
    release_ident: Optional[str] = None,
    work_ident: Optional[str] = None,
    openlibrary_work: Optional[str] = None,
    url: Optional[str] = None,
    consolidate_works: bool = True,
    filter_stage: List[str] = [],
    sort: Optional[str] = None,
    limit: int = 25,
    offset: Optional[int] = None,
    es_index: str = "fatcat_ref",
) -> RefHits:

    search = Search(using=es_client, index=es_index)

    if consolidate_works:
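        # elasticsearch "field collapsing": a single work can have multiple
        # releases (eg, preprint and published version) which all cite the
        # same target; collapsing on source_work_ident returns only the top
        # hit per citing work, rather than one hit per citing release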
        search = search.extra(
            collapse={
                "field": "source_work_ident",
                "inner_hits": {"name": "source_more", "size": 0},
            }
        )

    if release_ident:
        search = search.filter("term", target_release_ident=release_ident)
    elif work_ident:
        search = search.filter("term", target_work_ident=work_ident)
    elif openlibrary_work:
        search = search.filter("term", target_openlibrary_work=openlibrary_work)
    else:
        raise ValueError("require a lookup key")

    if filter_stage:
        search = search.filter("terms", source_release_stage=filter_stage)

    if sort == "oldest":
        search = search.sort("source_year")
    else:
        # "newest" is the default sort order
        search = search.sort("-source_year")

    return _execute_ref_query(search, limit=limit, offset=offset)


def count_inbound_refs(
    es_client: Any,
    release_ident: Optional[str] = None,
    work_ident: Optional[str] = None,
    openlibrary_work: Optional[str] = None,
    url: Optional[str] = None,
    filter_stage: List[str] = [],
    es_index: str = "fatcat_ref",
) -> int:
    """
    Same parameters as get_inbound_refs(), but returns just a count
    """

    search = Search(using=es_client, index=es_index)

    if release_ident:
        search = search.filter("term", target_release_ident=release_ident)
    elif work_ident:
        search = search.filter("term", target_work_ident=work_ident)
    elif openlibrary_work:
        search = search.filter("term", target_openlibrary_work=openlibrary_work)
    else:
        raise ValueError("require a lookup key")

    if filter_stage:
        search = search.filter("terms", source_release_stage=filter_stage)

    return search.count()


def enrich_inbound_refs(
    refs: List[BiblioRef],
    fatcat_api_client: Any,
    hide: Optional[str] = "refs",
    expand: Optional[str] = "container,files,webcaptures,filesets",
) -> List[EnrichedBiblioRef]:
    """
    Runs a fatcat API fetch for each ref and returns "enriched" refs
    """
    enriched = []
    for ref in refs:
        release = None
        access = []
        if ref.source_release_ident:
            release = fatcat_api_client.get_release(ref.source_release_ident, hide=hide, expand=expand)
            access = release_access_options(release)
        if ref.source_wikipedia_article:
            wiki_lang = ref.source_wikipedia_article.split(':')[0]
            wiki_article = ':'.join(ref.source_wikipedia_article.split(':')[1:]).replace(' ', '_')
            access.append(
                AccessOption(
                    access_type="wikipedia",
                    access_url=f"https://{wiki_lang}.wikipedia.org/wiki/{wiki_article}",
                    mimetype=None,
                    size_bytes=None,
                    thumbnail_url=None,
                )
            )
        enriched.append(
            EnrichedBiblioRef(
                ref=ref,
                access=access,
                release=release,
            )
        )
    return enriched


def enrich_outbound_refs(
    refs: List[BiblioRef],
    fatcat_api_client: Any,
    hide: Optional[str] = "refs",
    expand: Optional[str] = "container,files,webcaptures,filesets",
) -> List[EnrichedBiblioRef]:
    """
    Runs a fatcat API fetch for each ref and returns "enriched" refs
    """
    enriched = []
    for ref in refs:
        release = None
        access = []
        if ref.target_release_ident:
            release = fatcat_api_client.get_release(ref.target_release_ident, hide=hide, expand=expand)
            access = release_access_options(release)
        if ref.target_openlibrary_work:
            access.append(
                AccessOption(
                    access_type="openlibrary",
                    access_url=f"https://openlibrary.org/works/{ref.target_openlibrary_work}",
                    mimetype=None,
                    size_bytes=None,
                    thumbnail_url=None,
                )
            )
        if ref.target_url and '://web.archive.org/' in ref.target_url:
            access.append(
                AccessOption(
                    access_type="wayback",
                    access_url=ref.target_url,
                    mimetype=None,
                    size_bytes=None,
                    thumbnail_url=None,
                )
            )
        enriched.append(
            EnrichedBiblioRef(
                ref=ref,
                access=access,
                release=release,
            )
        )
    return enriched
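
# A sketch of how the lookup and enrichment helpers are intended to chain
# together (with clients configured as in main() below):
#
#     hits = get_inbound_refs(es_client, release_ident=ident, limit=25)
#     enriched = enrich_inbound_refs(hits.result_refs, fatcat_api_client=api_client)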

def run_ref_query(args) -> None:
    """
    CLI helper/debug tool (prints to stdout)
    """
    release_ident = None
    work_ident = None
    if args.ident.startswith("release_"):
        release_ident = args.ident.split('_')[1]
    elif args.ident.startswith("work_"):
        work_ident = args.ident.split('_')[1]
    else:
        release_ident = args.ident

    print("## Outbound References")
    hits = get_outbound_refs(release_ident=release_ident, work_ident=work_ident, es_client=args.es_client)
    print(f"Total: {hits.count_total} Time: {hits.query_wall_time_ms}ms; {hits.query_time_ms}ms")

    if args.enrich == "fatcat":
        enriched = enrich_outbound_refs(hits.result_refs, hide='refs,abstracts', fatcat_api_client=args.fatcat_api_client)
        for ref in enriched:
            if ref.release:
                print(f"{ref.ref.ref_index or '-'}\trelease_{ref.release.ident}\t{ref.ref.match_provenance}/{ref.ref.match_status}\t{ref.release.release_year or '-'}\t{ref.release.title}\t{ref.release.ext_ids.pmid or ref.release.ext_ids.doi or '-'}")
            else:
                print(f"{ref.ref.ref_index or '-'}\trelease_{ref.ref.target_release_ident}")
    else:
        for ref in hits.result_refs:
            print(f"{ref.ref_index or '-'}\trelease_{ref.target_release_ident}")

    print()
    print("## Inbound References")
    hits = get_inbound_refs(release_ident=release_ident, work_ident=work_ident, es_client=args.es_client)
    print(f"Total: {hits.count_total} Time: {hits.query_wall_time_ms}ms; {hits.query_time_ms}ms")

    if args.enrich == "fatcat":
        enriched = enrich_inbound_refs(hits.result_refs, hide='refs,abstracts', fatcat_api_client=args.fatcat_api_client)
        for ref in enriched:
            if ref.release:
                print(f"release_{ref.release.ident}\t{ref.ref.match_provenance}/{ref.ref.match_status}\t{ref.release.release_year or '-'}\t{ref.release.title}\t{ref.release.ext_ids.pmid or ref.release.ext_ids.doi or '-'}")
            else:
                print(f"release_{ref.ref.source_release_ident}")
    else:
        for ref in hits.result_refs:
            print(f"work_{ref.source_work_ident}\trelease_{ref.source_release_ident}")


def main() -> None:
    """
    Run this utility like:

        python -m fatcat_tools.references

    Examples:

        python -m fatcat_tools.references query release_pfrind3kh5hqhgqkueulk2tply
    """

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    subparsers = parser.add_subparsers()

    parser.add_argument("--fatcat-api-base", default="https://api.fatcat.wiki/v0")
    parser.add_argument("--elasticsearch-base", default="https://search.fatcat.wiki")
    parser.add_argument("--elasticsearch-ref-index", default="fatcat_ref")

    sub = subparsers.add_parser(
        "query",
        help="takes a fatcat ident argument, prints both inbound and outbound references",
    )
    sub.set_defaults(func="run_ref_query")
    sub.add_argument("ident", type=str)
    sub.add_argument("--enrich", type=str)

    args = parser.parse_args()
    if not args.__dict__.get("func"):
        parser.print_help(file=sys.stderr)
        sys.exit(-1)

    args.es_client = elasticsearch.Elasticsearch(args.elasticsearch_base)
    args.fatcat_api_client = public_api(args.fatcat_api_base)

    if args.func == "run_ref_query":
        run_ref_query(args)
    else:
        raise NotImplementedError(args.func)


if __name__ == "__main__":
    main()
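
The second new file, python/fatcat_tools/transforms/access.py, provides the
AccessOption model and the release_access_options() helper used by the
enrichment functions above. A sketch of calling it directly (the release
identifier is the same placeholder example used in the module docstring):

    from fatcat_tools import public_api
    from fatcat_tools.transforms.access import release_access_options

    api = public_api("https://api.fatcat.wiki/v0")
    release = api.get_release("pfrind3kh5hqhgqkueulk2tply", expand="files")
    for option in release_access_options(release):
        print(option.access_type, option.access_url)
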
diff --git a/python/fatcat_tools/transforms/access.py b/python/fatcat_tools/transforms/access.py
new file mode 100644
index 00000000..5ed64c7c
--- /dev/null
+++ b/python/fatcat_tools/transforms/access.py
@@ -0,0 +1,60 @@
from enum import Enum
from typing import Optional, List

from pydantic import BaseModel
from fatcat_openapi_client import ReleaseEntity


class AccessType(str, Enum):
    """describes type of access URL"""

    wayback = "wayback"
    ia_file = "ia_file"
    ia_microfilm = "ia_microfilm"
    repository = "repository"
    openlibrary = "openlibrary"
    wikipedia = "wikipedia"


class AccessOption(BaseModel):

    access_type: AccessType

    # note: for `target_url` refs, would do a CDX lookup and this URL would be
    # a valid/HTTP-200 web.archive.org capture URL
    access_url: str

    # application/pdf, text/html, etc
    # blank for landing pages
    mimetype: Optional[str]

    size_bytes: Optional[int]
    thumbnail_url: Optional[str]


def release_access_options(release: ReleaseEntity) -> List[AccessOption]:
    """
    Extracts access options from a release.

    TODO: proper implementation. For now, returns at most one option: the
    first web.archive.org or archive.org file URL encountered.
    """
    options = []
    for f in (release.files or []):
        for u in (f.urls or []):
            if '://web.archive.org/' in u.url:
                return [
                    AccessOption(
                        access_type="wayback",
                        access_url=u.url,
                        mimetype=f.mimetype,
                        size_bytes=f.size,
                        thumbnail_url=None,
                    )
                ]
            elif '://archive.org/' in u.url:
                return [
                    AccessOption(
                        access_type="ia_file",
                        access_url=u.url,
                        mimetype=f.mimetype,
                        size_bytes=f.size,
                        thumbnail_url=None,
                    )
                ]
    return options
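
Because AccessType subclasses both str and Enum, pydantic coerces plain
strings like "wayback" into enum members during validation, which is why the
code above can pass string literals for access_type. A quick sketch (the URL
is illustrative, not a value from this change):

    from fatcat_tools.transforms.access import AccessOption, AccessType

    option = AccessOption(
        access_type="wayback",
        access_url="https://web.archive.org/web/20210101000000/https://example.com/paper.pdf",
        mimetype="application/pdf",
        size_bytes=None,
        thumbnail_url=None,
    )
    assert option.access_type == AccessType.wayback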