diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-09-04 17:52:01 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-09-04 17:52:01 -0700 |
commit | 3bf706f07d68aaacebcbf08d21c7b4c4f91856ea (patch) | |
tree | b32bd11d7fb5091567c179877f9f5784c595faca | |
parent | 8ef87c02693a51b319249632e3219d8414ce8c13 (diff) | |
download | fatcat-scholar-3bf706f07d68aaacebcbf08d21c7b4c4f91856ea.tar.gz fatcat-scholar-3bf706f07d68aaacebcbf08d21c7b4c4f91856ea.zip |
heavy to refs command
-rwxr-xr-x | fatcat_scholar/grobid2json.py | 6 | ||||
-rw-r--r-- | fatcat_scholar/schema.py | 36 | ||||
-rw-r--r-- | fatcat_scholar/transform.py | 144 |
3 files changed, 184 insertions, 2 deletions
diff --git a/fatcat_scholar/grobid2json.py b/fatcat_scholar/grobid2json.py index 979a794..fc19036 100755 --- a/fatcat_scholar/grobid2json.py +++ b/fatcat_scholar/grobid2json.py @@ -97,6 +97,12 @@ def journal_info(elem: ET.Element) -> Dict[str, Any]: def biblio_info(elem: ET.Element) -> Dict[str, Any]: + """ + TODO for references: + - pages + - locator + - doi, pmid, pmcid, arxiv_id, isbn + """ ref: Dict[str, Any] = dict() ref["id"] = elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id") # Title stuff is messy in references... diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py index 3d402b4..f171716 100644 --- a/fatcat_scholar/schema.py +++ b/fatcat_scholar/schema.py @@ -178,6 +178,42 @@ class ScholarDoc(BaseModel): access: List[ScholarAccess] +class RefBiblio(BaseModel): + title: Optional[str] + subtitle: Optional[str] + contrib_raw_names: List[str] + year: Optional[int] + container_name: Optional[str] + volume: Optional[str] + issue: Optional[str] + pages: Optional[str] + doi: Optional[str] + pmid: Optional[str] + pmcid: Optional[str] + arxiv_id: Optional[str] + isbn13: Optional[str] + url: Optional[str] + + +class RefStructured(BaseModel): + biblio: RefBiblio + release_ident: Optional[str] + work_ident: Optional[str] + index: Optional[int] + key: Optional[str] + locator: Optional[str] + target_release_id: Optional[str] + ref_source: Optional[str] # grobid, crossref, pubmed, wikipedia, etc + + +class RefTarget(BaseModel): + biblio: RefBiblio + release_ident: Optional[str] + work_ident: Optional[str] + release_stage: Optional[str] + release_type: Optional[str] + + def clean_small_int(raw: Optional[str]) -> Optional[int]: if not raw or not raw.isdigit(): return None diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py index ad9e66f..158ed6f 100644 --- a/fatcat_scholar/transform.py +++ b/fatcat_scholar/transform.py @@ -4,7 +4,7 @@ import datetime from typing import List, Dict, Optional, Any, Sequence from dynaconf import settings -from fatcat_openapi_client import ReleaseEntity, FileEntity +from fatcat_openapi_client import ReleaseEntity, FileEntity, ReleaseRef from fatcat_scholar.api_entities import * from fatcat_scholar.schema import * @@ -447,6 +447,110 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]: ) +def refs_from_grobid(release: ReleaseEntity, tei_dict: dict) -> Sequence[RefStructured]: + output = [] + for ref in tei_dict.get("citations") or []: + ref_date = ref.get("date") or None + ref_year: Optional[int] = None + if ref_date and len(ref_date) > 4 and ref_date[:4].isdigit(): + ref_year = int(ref_date[:4]) + output.append( + RefStructured( + biblio=RefBiblio( + title=ref.get("title"), + # subtitle + contrib_raw_names=ref.get("authors") or [], + year=ref_year, + container_name=ref.get("journal"), + volume=ref.get("volume"), + issue=ref.get("issue"), + # pages: Optional[str] + # doi: Optional[str] + # pmid: Optional[str] + # pmcid: Optional[str] + # arxiv_id: Optional[str] + # isbn13: Optional[str] + url=ref.get("url"), + ), + release_ident=release.ident, + work_ident=release.work_id, + index=ref.get("index"), + key=ref.get("id"), + locator=None, + # target_release_id + ref_source="grobid", + ) + ) + return output + + +def refs_from_release_refs(release: ReleaseEntity) -> Sequence[RefStructured]: + output = [] + for ref in release.refs: + ref_source = "fatcat" + if release.extra and release.extra.get("pubmed"): + ref_source = "pubmed" + elif release.extra and release.extra.get("crossref"): + ref_source = "crossref" + elif release.extra and release.extra.get("datacite"): + ref_source = "datacite" + extra = ref.extra or dict() + output.append( + RefStructured( + biblio=RefBiblio( + title=ref.title, + subtitle=extra.get("subtitle"), + contrib_raw_names=extra.get("authors") or [], + year=ref.year, + container_name=ref.container_name, + volume=extra.get("volume"), + issue=extra.get("issue"), + pages=extra.get("pages"), + doi=extra.get("doi"), + pmid=extra.get("pmid"), + pmcid=extra.get("pmcid"), + arxiv_id=extra.get("arxiv_id"), + isbn13=extra.get("isbn13"), + url=extra.get("url"), + ), + release_ident=release.ident, + work_ident=release.work_id, + index=ref.index, + key=ref.key, + locator=ref.locator, + target_release_id=ref.target_release_id, + ref_source=ref_source, + ) + ) + return output + + +def refs_from_heavy(heavy: IntermediateBundle) -> Sequence[RefStructured]: + + if heavy.doc_type != DocType.work: + return [] + + # first, identify source of refs: fatcat release metadata or GROBID + assert heavy.biblio_release_ident + primary_release = [ + r for r in heavy.releases if r.ident == heavy.biblio_release_ident + ][0] + + if primary_release.refs: + # TODO: what about other releases? + return refs_from_release_refs(primary_release) + elif heavy.grobid_fulltext: + fulltext_release = [ + r + for r in heavy.releases + if r.ident == heavy.grobid_fulltext["release_ident"] + ][0] + tei_dict = teixml2json(heavy.grobid_fulltext["tei_xml"]) + return refs_from_grobid(fulltext_release, tei_dict) + else: + return [] + + def run_transform(infile: Sequence) -> None: for line in infile: obj = json.loads(line) @@ -469,6 +573,27 @@ def run_transform(infile: Sequence) -> None: print(es_doc.json(exclude_none=True, sort_keys=True)) +def run_refs(infile: Sequence) -> None: + for line in infile: + obj = json.loads(line) + + heavy = IntermediateBundle( + doc_type=DocType(obj["doc_type"]), + releases=[ + entity_from_json(json.dumps(re), ReleaseEntity) + for re in obj["releases"] + ], + biblio_release_ident=obj.get("biblio_release_ident"), + grobid_fulltext=obj.get("grobid_fulltext"), + pdftotext_fulltext=obj.get("pdftotext_fulltext"), + pdf_meta=obj.get("pdf_meta"), + sim_fulltext=obj.get("sim_fulltext"), + ) + refs = refs_from_heavy(heavy) + for ref in refs: + print(ref.json(exclude_none=True, sort_keys=True)) + + def main() -> None: """ Run this command like: @@ -482,7 +607,8 @@ def main() -> None: subparsers = parser.add_subparsers() sub = subparsers.add_parser( - "run_transform", help="iterates through 'heavy' intermediate" + "run_transform", + help="takes 'heavy' intermediate, outputs scholar_fulltext ES documents", ) sub.set_defaults(func="run_transform") sub.add_argument( @@ -493,6 +619,18 @@ def main() -> None: type=argparse.FileType("r"), ) + sub = subparsers.add_parser( + "run_refs", help="extracts references from 'heavy' intermediate" + ) + sub.set_defaults(func="run_refs") + sub.add_argument( + "json_file", + help="intermediate globs as JSON-lines", + nargs="?", + default=sys.stdin, + type=argparse.FileType("r"), + ) + args = parser.parse_args() if not args.__dict__.get("func"): print("tell me what to do! (try --help)") @@ -500,6 +638,8 @@ def main() -> None: if args.func == "run_transform": run_transform(infile=args.json_file) + elif args.func == "run_refs": + run_refs(infile=args.json_file) else: raise NotImplementedError(args.func) |