add run_refs command: extract references from 'heavy' intermediate bundles

author: Bryan Newbold <bnewbold@archive.org> 2020-09-04 17:52:01 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-09-04 17:52:01 -0700
commit: 3bf706f07d68aaacebcbf08d21c7b4c4f91856ea (patch)
tree: b32bd11d7fb5091567c179877f9f5784c595faca
parent: 8ef87c02693a51b319249632e3219d8414ce8c13 (diff)
download: fatcat-scholar-3bf706f07d68aaacebcbf08d21c7b4c4f91856ea.tar.gz
fatcat-scholar-3bf706f07d68aaacebcbf08d21c7b4c4f91856ea.zip
3 files changed, 184 insertions, 2 deletions
diff --git a/fatcat_scholar/grobid2json.py b/fatcat_scholar/grobid2json.py
index 979a794..fc19036 100755
--- a/fatcat_scholar/grobid2json.py
+++ b/fatcat_scholar/grobid2json.py
@@ -97,6 +97,12 @@ def journal_info(elem: ET.Element) -> Dict[str, Any]:
 
 
 def biblio_info(elem: ET.Element) -> Dict[str, Any]:
+    """
+    TODO for references:
+    - pages
+    - locator
+    - doi, pmid, pmcid, arxiv_id, isbn
+    """
     ref: Dict[str, Any] = dict()
     ref["id"] = elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id")
     # Title stuff is messy in references...
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index 3d402b4..f171716 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -178,6 +178,42 @@ class ScholarDoc(BaseModel):
     access: List[ScholarAccess]
 
 
+class RefBiblio(BaseModel):  # bibliographic fields describing one cited work
+    title: Optional[str]
+    subtitle: Optional[str]  # in this diff only filled from ref.extra (fatcat refs)
+    contrib_raw_names: List[str]  # not Optional: both ref builders pass `... or []`
+    year: Optional[int]
+    container_name: Optional[str]  # GROBID "journal" key, or ref.container_name
+    volume: Optional[str]
+    issue: Optional[str]
+    pages: Optional[str]  # this and the ids below are not yet set for GROBID refs
+    doi: Optional[str]
+    pmid: Optional[str]
+    pmcid: Optional[str]
+    arxiv_id: Optional[str]
+    isbn13: Optional[str]
+    url: Optional[str]
+
+
+class RefStructured(BaseModel):  # one reference, as cited by a source release
+    biblio: RefBiblio
+    release_ident: Optional[str]  # ident of the citing release (release.ident)
+    work_ident: Optional[str]  # work_id of the citing release
+    index: Optional[int]
+    key: Optional[str]  # for GROBID refs this is the citation's "id" value
+    locator: Optional[str]  # always None for GROBID refs in this diff
+    target_release_id: Optional[str]  # copied from fatcat refs; unset for GROBID refs
+    ref_source: Optional[str]  # grobid, crossref, pubmed, wikipedia, etc
+
+
+class RefTarget(BaseModel):  # NOTE(review): not instantiated anywhere in this diff
+    biblio: RefBiblio
+    release_ident: Optional[str]
+    work_ident: Optional[str]
+    release_stage: Optional[str]  # presumably the cited release's stage; confirm
+    release_type: Optional[str]  # presumably the cited release's type; confirm
+
+
 def clean_small_int(raw: Optional[str]) -> Optional[int]:
     if not raw or not raw.isdigit():
         return None
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index ad9e66f..158ed6f 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -4,7 +4,7 @@ import datetime
 from typing import List, Dict, Optional, Any, Sequence
 
 from dynaconf import settings
-from fatcat_openapi_client import ReleaseEntity, FileEntity
+from fatcat_openapi_client import ReleaseEntity, FileEntity, ReleaseRef
 
 from fatcat_scholar.api_entities import *
 from fatcat_scholar.schema import *
@@ -447,6 +447,110 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
     )
 
 
+def refs_from_grobid(release: ReleaseEntity, tei_dict: dict) -> Sequence[RefStructured]:
+    """
+    Converts tei_dict["citations"] into RefStructured objects with
+    ref_source="grobid", using `release` as the citing release.
+    """
+    output = []
+    for ref in tei_dict.get("citations") or []:
+        ref_date = ref.get("date") or None
+        ref_year: Optional[int] = None
+        # ">= 4" (not "> 4") so a bare four-digit year like "1998" is kept
+        if ref_date and len(ref_date) >= 4 and ref_date[:4].isdigit():
+            ref_year = int(ref_date[:4])
+        output.append(
+            RefStructured(
+                biblio=RefBiblio(
+                    title=ref.get("title"),
+                    # NOTE(review): assumes "authors" is a list of str; confirm
+                    contrib_raw_names=ref.get("authors") or [],
+                    year=ref_year,
+                    container_name=ref.get("journal"),
+                    volume=ref.get("volume"),
+                    issue=ref.get("issue"),
+                    # TODO: subtitle, pages, doi, pmid, pmcid, arxiv_id, isbn13
+                    url=ref.get("url"),
+                ),
+                release_ident=release.ident,
+                work_ident=release.work_id,
+                index=ref.get("index"),
+                key=ref.get("id"),
+                locator=None,
+                # target_release_id is left unset for GROBID refs
+                ref_source="grobid",
+            )
+        )
+    return output
+
+
+def refs_from_release_refs(release: ReleaseEntity) -> Sequence[RefStructured]:
+    """Converts release.refs (fatcat metadata) into RefStructured objects."""
+    # ref_source depends only on the release, so work it out once, not per ref
+    release_extra = release.extra or dict()
+    ref_source = next(
+        (s for s in ("pubmed", "crossref", "datacite") if release_extra.get(s)),
+        "fatcat",
+    )
+    output = []
+    for ref in release.refs or []:  # tolerate refs being None
+        extra = ref.extra or dict()
+        output.append(
+            RefStructured(
+                biblio=RefBiblio(
+                    title=ref.title,
+                    subtitle=extra.get("subtitle"),
+                    contrib_raw_names=extra.get("authors") or [],
+                    year=ref.year,
+                    container_name=ref.container_name,
+                    volume=extra.get("volume"),
+                    issue=extra.get("issue"),
+                    pages=extra.get("pages"),
+                    doi=extra.get("doi"),
+                    pmid=extra.get("pmid"),
+                    pmcid=extra.get("pmcid"),
+                    arxiv_id=extra.get("arxiv_id"),
+                    isbn13=extra.get("isbn13"),
+                    url=extra.get("url"),
+                ),
+                release_ident=release.ident,
+                work_ident=release.work_id,
+                index=ref.index,
+                key=ref.key,
+                locator=ref.locator,
+                target_release_id=ref.target_release_id,
+                ref_source=ref_source,
+            )
+        )
+    return output
+
+
+def refs_from_heavy(heavy: IntermediateBundle) -> Sequence[RefStructured]:
+    """
+    Returns refs for a 'work' bundle: the biblio release's own refs when it
+    has any, otherwise refs parsed from GROBID fulltext, otherwise nothing.
+    """
+    if heavy.doc_type != DocType.work:
+        return []
+
+    assert heavy.biblio_release_ident
+    biblio_release = [
+        rel for rel in heavy.releases if rel.ident == heavy.biblio_release_ident
+    ][0]
+    # TODO: what about other releases?
+    if biblio_release.refs:
+        return refs_from_release_refs(biblio_release)
+
+    if not heavy.grobid_fulltext:
+        return []
+
+    # no refs in fatcat metadata: fall back to the GROBID-extracted citations
+    grobid_ident = heavy.grobid_fulltext["release_ident"]
+    grobid_release = [rel for rel in heavy.releases if rel.ident == grobid_ident][0]
+    tei_dict = teixml2json(heavy.grobid_fulltext["tei_xml"])
+    return refs_from_grobid(grobid_release, tei_dict)
+
+
 def run_transform(infile: Sequence) -> None:
     for line in infile:
         obj = json.loads(line)
@@ -469,6 +573,27 @@ def run_transform(infile: Sequence) -> None:
         print(es_doc.json(exclude_none=True, sort_keys=True))
 
 
+def run_refs(infile: Sequence) -> None:
+    """Prints each reference of every 'heavy' bundle in infile as a JSON line."""
+    for line in infile:
+        rec = json.loads(line)
+        doc_type = DocType(rec["doc_type"])
+        releases = [
+            entity_from_json(json.dumps(raw), ReleaseEntity) for raw in rec["releases"]
+        ]
+        heavy = IntermediateBundle(
+            doc_type=doc_type,
+            releases=releases,
+            biblio_release_ident=rec.get("biblio_release_ident"),
+            grobid_fulltext=rec.get("grobid_fulltext"),
+            pdftotext_fulltext=rec.get("pdftotext_fulltext"),
+            pdf_meta=rec.get("pdf_meta"),
+            sim_fulltext=rec.get("sim_fulltext"),
+        )
+        for ref in refs_from_heavy(heavy):
+            print(ref.json(exclude_none=True, sort_keys=True))
+
+
 def main() -> None:
     """
     Run this command like:
@@ -482,7 +607,8 @@ def main() -> None:
     subparsers = parser.add_subparsers()
 
     sub = subparsers.add_parser(
-        "run_transform", help="iterates through 'heavy' intermediate"
+        "run_transform",
+        help="takes 'heavy' intermediate, outputs scholar_fulltext ES documents",
     )
     sub.set_defaults(func="run_transform")
     sub.add_argument(
@@ -493,6 +619,18 @@ def main() -> None:
         type=argparse.FileType("r"),
     )
 
+    sub = subparsers.add_parser(
+        "run_refs", help="extracts references from 'heavy' intermediate"
+    )
+    sub.set_defaults(func="run_refs")
+    sub.add_argument(
+        "json_file",
+        help="intermediate globs as JSON-lines",
+        nargs="?",
+        default=sys.stdin,
+        type=argparse.FileType("r"),
+    )
+
     args = parser.parse_args()
     if not args.__dict__.get("func"):
         print("tell me what to do! (try --help)")
@@ -500,6 +638,8 @@ def main() -> None:
 
     if args.func == "run_transform":
         run_transform(infile=args.json_file)
+    elif args.func == "run_refs":
+        run_refs(infile=args.json_file)
     else:
         raise NotImplementedError(args.func)
author	Bryan Newbold <bnewbold@archive.org>	2020-09-04 17:52:01 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2020-09-04 17:52:01 -0700
commit	3bf706f07d68aaacebcbf08d21c7b4c4f91856ea (patch)
tree	b32bd11d7fb5091567c179877f9f5784c595faca
parent	8ef87c02693a51b319249632e3219d8414ce8c13 (diff)
download	fatcat-scholar-3bf706f07d68aaacebcbf08d21c7b4c4f91856ea.tar.gz fatcat-scholar-3bf706f07d68aaacebcbf08d21c7b4c4f91856ea.zip