fmt (black): fatcat_tools/

author: Bryan Newbold <bnewbold@robocracy.org> 2021-11-02 18:14:59 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2021-11-02 18:14:59 -0700
commit: 31d1a6a713d177990609767d508209ced19ca396 (patch)
tree: a628a57bdb373669394a6b520102b1b4b5ffe7da /python/fatcat_tools/references.py
parent: 9dc891b8098542bb089c8c47098b60a8beb76a53 (diff)
download: fatcat-31d1a6a713d177990609767d508209ced19ca396.tar.gz
fatcat-31d1a6a713d177990609767d508209ced19ca396.zip
1 files changed, 116 insertions, 66 deletions
diff --git a/python/fatcat_tools/references.py b/python/fatcat_tools/references.py
index 8361b260..6fd9ca49 100644
--- a/python/fatcat_tools/references.py
+++ b/python/fatcat_tools/references.py
@@ -22,6 +22,7 @@ from fatcat_tools.transforms.entities import entity_to_dict
 
 class BiblioRef(BaseModel):
     """bibliographic reference"""
+
     # ("release", source_release_ident, ref_index)
     # ("wikipedia", source_wikipedia_article, ref_index)
     _key: Optional[str]
@@ -37,7 +38,7 @@ class BiblioRef(BaseModel):
 
     # context of the reference itself
     # 1-indexed, not 0-indexed
-    ref_index: Optional[int] # TODO: actually optional?
+    ref_index: Optional[int]  # TODO: actually optional?
     # eg, "Lee86", "BIB23"
     ref_key: Optional[str]
     # eg, page number
@@ -74,16 +75,20 @@ class BiblioRef(BaseModel):
         # work-arounds for bad/weird ref_key
         if self.ref_key:
             self.ref_key = self.ref_key.strip()
-            if self.ref_key[0] in ['/', '_']:
+            if self.ref_key[0] in ["/", "_"]:
                 self.ref_key = self.ref_key[1:]
-            if self.ref_key.startswith("10.") and 'SICI' in self.ref_key and '-' in self.ref_key:
-                self.ref_key = self.ref_key.split('-')[-1]
-            if self.ref_key.startswith("10.") and '_' in self.ref_key:
-                self.ref_key = self.ref_key.split('_')[-1]
+            if (
+                self.ref_key.startswith("10.")
+                and "SICI" in self.ref_key
+                and "-" in self.ref_key
+            ):
+                self.ref_key = self.ref_key.split("-")[-1]
+            if self.ref_key.startswith("10.") and "_" in self.ref_key:
+                self.ref_key = self.ref_key.split("_")[-1]
             if len(self.ref_key) > 10 and "#" in self.ref_key:
-                self.ref_key = self.ref_key.split('#')[-1]
+                self.ref_key = self.ref_key.split("#")[-1]
             if len(self.ref_key) > 10 and "_" in self.ref_key:
-                self.ref_key = self.ref_key.split('_')[-1]
+                self.ref_key = self.ref_key.split("_")[-1]
         if not self.ref_key and self.ref_index is not None:
             self.ref_key = str(self.ref_index)
         return self
@@ -98,7 +103,7 @@ class EnrichedBiblioRef(BaseModel):
     # TODO: openlibrary work?
     access: List[AccessOption]
 
-    @validator('release')
+    @validator("release")
     @classmethod
     def check_release(cls, v):
         if v is not None and not isinstance(v, ReleaseEntity):
@@ -119,7 +124,7 @@ class RefHits(BaseModel):
     limit: int
     query_time_ms: int
     query_wall_time_ms: int
-    result_refs: List[Union[BiblioRef,EnrichedBiblioRef]]
+    result_refs: List[Union[BiblioRef, EnrichedBiblioRef]]
 
     class Config:
         json_encoders = {
@@ -145,22 +150,22 @@ def _execute_ref_query(search: Any, limit: int, offset: Optional[int] = None) ->
     except elasticsearch.exceptions.RequestError as e_raw:
         # this is a "user" error
         e: Any = e_raw
-        #logging.warn("elasticsearch 400: " + str(e.info))
+        # logging.warn("elasticsearch 400: " + str(e.info))
         if e.info.get("error", {}).get("root_cause", {}):
             raise ValueError(str(e.info["error"]["root_cause"][0].get("reason"))) from e
         else:
             raise ValueError(str(e.info)) from e
     except elasticsearch.exceptions.TransportError as e:
         # all other errors
-        #logging.warn(f"elasticsearch non-200 status code: {e.info}")
+        # logging.warn(f"elasticsearch non-200 status code: {e.info}")
         raise IOError(str(e.info)) from e
     query_delta = datetime.datetime.now() - query_start
 
     result_refs = []
     for h in resp.hits:
         # might be a list because of consolidation
-        if isinstance(h._d_.get('source_work_ident'), list):
-            h._d_['source_work_ident'] = h._d_['source_work_ident'][0]
+        if isinstance(h._d_.get("source_work_ident"), list):
+            h._d_["source_work_ident"] = h._d_["source_work_ident"][0]
         result_refs.append(BiblioRef.parse_obj(h._d_).hacks())
 
     return RefHits(
@@ -224,7 +229,10 @@ def get_inbound_refs(
         search = search.extra(
             collapse={
                 "field": "source_work_ident",
-                "inner_hits": {"name": "source_more", "size": 0,},
+                "inner_hits": {
+                    "name": "source_more",
+                    "size": 0,
+                },
             }
         )
 
@@ -281,61 +289,87 @@ def count_inbound_refs(
 
 
 # run fatcat API fetches for each ref and return "enriched" refs
-def enrich_inbound_refs(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[EnrichedBiblioRef]:
+def enrich_inbound_refs(
+    refs: List[BiblioRef],
+    fatcat_api_client: Any,
+    hide: Optional[str] = "refs",
+    expand: Optional[str] = "container,files,webcaptures,filesets",
+) -> List[EnrichedBiblioRef]:
     enriched = []
     for ref in refs:
         release = None
         access = []
         if ref.source_release_ident:
-            release = fatcat_api_client.get_release(ref.source_release_ident, hide=hide, expand=expand)
+            release = fatcat_api_client.get_release(
+                ref.source_release_ident, hide=hide, expand=expand
+            )
             access = release_access_options(release)
         if ref.source_wikipedia_article:
-            wiki_lang = ref.source_wikipedia_article.split(':')[0]
-            wiki_article = ':'.join(ref.source_wikipedia_article.split(':')[1:]).replace(' ', '_')
-            access.append(AccessOption(
-                access_type="wikipedia",
-                access_url=f"https://{wiki_lang}.wikipedia.org/wiki/{wiki_article}",
-                mimetype=None,
-                size_bytes=None,
-                thumbnail_url=None
-            ))
-        enriched.append(EnrichedBiblioRef(
-            ref=ref,
-            access=access,
-            release=release,
-        ))
+            wiki_lang = ref.source_wikipedia_article.split(":")[0]
+            wiki_article = ":".join(ref.source_wikipedia_article.split(":")[1:]).replace(
+                " ", "_"
+            )
+            access.append(
+                AccessOption(
+                    access_type="wikipedia",
+                    access_url=f"https://{wiki_lang}.wikipedia.org/wiki/{wiki_article}",
+                    mimetype=None,
+                    size_bytes=None,
+                    thumbnail_url=None,
+                )
+            )
+        enriched.append(
+            EnrichedBiblioRef(
+                ref=ref,
+                access=access,
+                release=release,
+            )
+        )
     return enriched
 
 
-def enrich_outbound_refs(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[EnrichedBiblioRef]:
+def enrich_outbound_refs(
+    refs: List[BiblioRef],
+    fatcat_api_client: Any,
+    hide: Optional[str] = "refs",
+    expand: Optional[str] = "container,files,webcaptures,filesets",
+) -> List[EnrichedBiblioRef]:
     enriched = []
     for ref in refs:
         release = None
         access = []
         if ref.target_release_ident:
-            release = fatcat_api_client.get_release(ref.target_release_ident, hide=hide, expand=expand)
+            release = fatcat_api_client.get_release(
+                ref.target_release_ident, hide=hide, expand=expand
+            )
             access = release_access_options(release)
         if ref.target_openlibrary_work:
-            access.append(AccessOption(
-                access_type="openlibrary",
-                access_url=f"https://openlibrary.org/works/{ref.target_openlibrary_work}",
-                mimetype=None,
-                size_bytes=None,
-                thumbnail_url=None
-            ))
-        if ref.target_url and '://web.archive.org/' in ref.target_url:
-            access.append(AccessOption(
-                access_type="wayback",
-                access_url=ref.target_url,
-                mimetype=None,
-                size_bytes=None,
-                thumbnail_url=None
-            ))
-        enriched.append(EnrichedBiblioRef(
-            ref=ref,
-            access=access,
-            release=release,
-        ))
+            access.append(
+                AccessOption(
+                    access_type="openlibrary",
+                    access_url=f"https://openlibrary.org/works/{ref.target_openlibrary_work}",
+                    mimetype=None,
+                    size_bytes=None,
+                    thumbnail_url=None,
+                )
+            )
+        if ref.target_url and "://web.archive.org/" in ref.target_url:
+            access.append(
+                AccessOption(
+                    access_type="wayback",
+                    access_url=ref.target_url,
+                    mimetype=None,
+                    size_bytes=None,
+                    thumbnail_url=None,
+                )
+            )
+        enriched.append(
+            EnrichedBiblioRef(
+                ref=ref,
+                access=access,
+                release=release,
+            )
+        )
     return enriched
 
 
@@ -346,21 +380,29 @@ def run_ref_query(args) -> None:
     release_ident = None
     work_ident = None
     if args.ident.startswith("release_"):
-        release_ident = args.ident.split('_')[1]
+        release_ident = args.ident.split("_")[1]
     elif args.ident.startswith("work_"):
-        work_ident = args.ident.split('_')[1]
+        work_ident = args.ident.split("_")[1]
     else:
         release_ident = args.ident
 
     print("## Outbound References")
-    hits = get_outbound_refs(release_ident=release_ident, work_ident=work_ident, es_client=args.es_client)
-    print(f"Total: {hits.count_total}  Time: {hits.query_wall_time_ms}ms; {hits.query_time_ms}ms")
+    hits = get_outbound_refs(
+        release_ident=release_ident, work_ident=work_ident, es_client=args.es_client
+    )
+    print(
+        f"Total: {hits.count_total}  Time: {hits.query_wall_time_ms}ms; {hits.query_time_ms}ms"
+    )
 
     if args.enrich == "fatcat":
-        enriched = enrich_outbound_refs(hits.result_refs, hide='refs,abstracts', fatcat_api_client=args.fatcat_api_client)
+        enriched = enrich_outbound_refs(
+            hits.result_refs, hide="refs,abstracts", fatcat_api_client=args.fatcat_api_client
+        )
         for ref in enriched:
             if ref.release:
-                print(f"{ref.ref.ref_index or '-'}\trelease_{ref.release.ident}\t{ref.ref.match_provenance}/{ref.ref.match_status}\t{ref.release.release_year or '-'}\t{ref.release.title}\t{ref.release.ext_ids.pmid or ref.release.ext_ids.doi or '-'}")
+                print(
+                    f"{ref.ref.ref_index or '-'}\trelease_{ref.release.ident}\t{ref.ref.match_provenance}/{ref.ref.match_status}\t{ref.release.release_year or '-'}\t{ref.release.title}\t{ref.release.ext_ids.pmid or ref.release.ext_ids.doi or '-'}"
+                )
             else:
                 print(f"{ref.ref.ref_index or '-'}\trelease_{ref.target_release_ident}")
     else:
@@ -369,21 +411,30 @@ def run_ref_query(args) -> None:
 
     print()
     print("## Inbound References")
-    hits = get_inbound_refs(release_ident=release_ident, work_ident=work_ident, es_client=args.es_client)
+    hits = get_inbound_refs(
+        release_ident=release_ident, work_ident=work_ident, es_client=args.es_client
+    )
 
-    print(f"Total: {hits.count_total}  Time: {hits.query_wall_time_ms}ms; {hits.query_time_ms}ms")
+    print(
+        f"Total: {hits.count_total}  Time: {hits.query_wall_time_ms}ms; {hits.query_time_ms}ms"
+    )
 
     if args.enrich == "fatcat":
-        enriched = enrich_inbound_refs(hits.result_refs, hide='refs,abstracts', fatcat_api_client=args.fatcat_api_client)
+        enriched = enrich_inbound_refs(
+            hits.result_refs, hide="refs,abstracts", fatcat_api_client=args.fatcat_api_client
+        )
         for ref in enriched:
             if ref.release:
-                print(f"release_{ref.release.ident}\t{ref.ref.match_provenance}/{ref.ref.match_status}\t{ref.release.release_year or '-'}\t{ref.release.title}\t{ref.release.ext_ids.pmid or ref.release.ext_ids.doi or '-'}")
+                print(
+                    f"release_{ref.release.ident}\t{ref.ref.match_provenance}/{ref.ref.match_status}\t{ref.release.release_year or '-'}\t{ref.release.title}\t{ref.release.ext_ids.pmid or ref.release.ext_ids.doi or '-'}"
+                )
             else:
                 print(f"release_{ref.target_release_ident}")
     else:
         for ref in hits.result_refs:
             print(f"work_{ref.source_work_ident}\trelease_{ref.source_release_ident}")
 
+
 def main() -> None:
     """
     Run this utility like:
@@ -395,9 +446,7 @@ def main() -> None:
         python -m fatcat_tools.references query release_pfrind3kh5hqhgqkueulk2tply
     """
 
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     subparsers = parser.add_subparsers()
 
     parser.add_argument("--fatcat-api-base", default="https://api.fatcat.wiki/v0")
@@ -425,5 +474,6 @@ def main() -> None:
     else:
         raise NotImplementedError(args.func)
 
+
 if __name__ == "__main__":
     main()
author	Bryan Newbold <bnewbold@robocracy.org>	2021-11-02 18:14:59 -0700
committer	Bryan Newbold <bnewbold@robocracy.org>	2021-11-02 18:14:59 -0700
commit	31d1a6a713d177990609767d508209ced19ca396 (patch)
tree	a628a57bdb373669394a6b520102b1b4b5ffe7da /python/fatcat_tools/references.py
parent	9dc891b8098542bb089c8c47098b60a8beb76a53 (diff)
download	fatcat-31d1a6a713d177990609767d508209ced19ca396.tar.gz fatcat-31d1a6a713d177990609767d508209ced19ca396.zip