summaryrefslogtreecommitdiffstats
path: root/fatcat_scholar
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-09-04 17:52:01 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-09-04 17:52:01 -0700
commit: 3bf706f07d68aaacebcbf08d21c7b4c4f91856ea (patch)
tree: b32bd11d7fb5091567c179877f9f5784c595faca /fatcat_scholar
parent: 8ef87c02693a51b319249632e3219d8414ce8c13 (diff)
download: fatcat-scholar-3bf706f07d68aaacebcbf08d21c7b4c4f91856ea.tar.gz
fatcat-scholar-3bf706f07d68aaacebcbf08d21c7b4c4f91856ea.zip
heavy to refs command
Diffstat (limited to 'fatcat_scholar')
-rwxr-xr-x fatcat_scholar/grobid2json.py 6
-rw-r--r-- fatcat_scholar/schema.py 36
-rw-r--r-- fatcat_scholar/transform.py 144
3 files changed, 184 insertions, 2 deletions
diff --git a/fatcat_scholar/grobid2json.py b/fatcat_scholar/grobid2json.py
index 979a794..fc19036 100755
--- a/fatcat_scholar/grobid2json.py
+++ b/fatcat_scholar/grobid2json.py
@@ -97,6 +97,12 @@ def journal_info(elem: ET.Element) -> Dict[str, Any]:
def biblio_info(elem: ET.Element) -> Dict[str, Any]:
+ """
+ TODO for references:
+ - pages
+ - locator
+ - doi, pmid, pmcid, arxiv_id, isbn
+ """
ref: Dict[str, Any] = dict()
ref["id"] = elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id")
# Title stuff is messy in references...
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index 3d402b4..f171716 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -178,6 +178,42 @@ class ScholarDoc(BaseModel):
access: List[ScholarAccess]
class RefBiblio(BaseModel):
    """
    Bibliographic metadata describing a single cited work.

    Only `contrib_raw_names` is required (it may be an empty list). Every
    other field is optional and explicitly defaults to None, so callers may
    omit whatever metadata they do not have. The explicit defaults do not
    depend on how the pydantic version in use treats a bare `Optional[...]`
    annotation.
    """

    title: Optional[str] = None
    subtitle: Optional[str] = None
    # contributor names as free-form strings, in the order given by the source
    contrib_raw_names: List[str]
    year: Optional[int] = None
    container_name: Optional[str] = None
    volume: Optional[str] = None
    issue: Optional[str] = None
    pages: Optional[str] = None
    # external identifiers for the cited work
    doi: Optional[str] = None
    pmid: Optional[str] = None
    pmcid: Optional[str] = None
    arxiv_id: Optional[str] = None
    isbn13: Optional[str] = None
    url: Optional[str] = None
+
+
class RefStructured(BaseModel):
    """
    A single structured reference (citation) found in a release.

    `biblio` holds the bibliographic metadata of the cited work and is
    required. All other fields are optional and explicitly default to None,
    so they can be left out when unknown.
    """

    biblio: RefBiblio
    # identifiers of the release/work that the reference was extracted from
    release_ident: Optional[str] = None
    work_ident: Optional[str] = None
    # position and label of the reference, as reported by the source
    index: Optional[int] = None
    key: Optional[str] = None
    locator: Optional[str] = None
    # release identifier of the cited work, when the source provides one
    target_release_id: Optional[str] = None
    ref_source: Optional[str] = None  # grobid, crossref, pubmed, wikipedia, etc
+
+
class RefTarget(BaseModel):
    """
    Bibliographic metadata plus release/work identifiers and release
    stage/type for a single work.

    NOTE(review): presumably describes the work a reference points *to*
    (as opposed to RefStructured, which describes the citing side); no usage
    is visible here, so confirm against callers.

    `biblio` is required; all other fields are optional and explicitly
    default to None.
    """

    biblio: RefBiblio
    release_ident: Optional[str] = None
    work_ident: Optional[str] = None
    release_stage: Optional[str] = None
    release_type: Optional[str] = None
+
+
def clean_small_int(raw: Optional[str]) -> Optional[int]:
if not raw or not raw.isdigit():
return None
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index ad9e66f..158ed6f 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -4,7 +4,7 @@ import datetime
from typing import List, Dict, Optional, Any, Sequence
from dynaconf import settings
-from fatcat_openapi_client import ReleaseEntity, FileEntity
+from fatcat_openapi_client import ReleaseEntity, FileEntity, ReleaseRef
from fatcat_scholar.api_entities import *
from fatcat_scholar.schema import *
@@ -447,6 +447,110 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
)
def refs_from_grobid(release: ReleaseEntity, tei_dict: dict) -> Sequence[RefStructured]:
    """
    Builds structured references from GROBID-extracted citations.

    release: the release the fulltext belongs to; its `ident` and `work_id`
        are copied onto every reference.
    tei_dict: parsed GROBID output. Only the "citations" key is read; each
        citation is a dict from which "date", "title", "authors", "journal",
        "volume", "issue", "url", "index" and "id" are used.

    Returns a list of RefStructured with `ref_source` set to "grobid". The
    list is empty if "citations" is missing or empty.
    """
    output = []
    for ref in tei_dict.get("citations") or []:
        ref_date = ref.get("date") or None
        ref_year: Optional[int] = None
        # The year is the leading four digits of the date. Use ">= 4" so that
        # a bare year ("2010") is accepted as well as a longer date
        # ("2010-03-01"). isdecimal() (not isdigit()) is the correct guard
        # for int(): isdigit() also accepts characters such as superscript
        # digits, which int() rejects with ValueError.
        if ref_date and len(ref_date) >= 4 and ref_date[:4].isdecimal():
            ref_year = int(ref_date[:4])
        output.append(
            RefStructured(
                biblio=RefBiblio(
                    title=ref.get("title"),
                    # subtitle
                    # NOTE(review): assumes "authors" is a list of plain
                    # strings -- confirm against the grobid2json output
                    contrib_raw_names=ref.get("authors") or [],
                    year=ref_year,
                    container_name=ref.get("journal"),
                    volume=ref.get("volume"),
                    issue=ref.get("issue"),
                    # pages: Optional[str]
                    # doi: Optional[str]
                    # pmid: Optional[str]
                    # pmcid: Optional[str]
                    # arxiv_id: Optional[str]
                    # isbn13: Optional[str]
                    url=ref.get("url"),
                ),
                release_ident=release.ident,
                work_ident=release.work_id,
                index=ref.get("index"),
                key=ref.get("id"),
                locator=None,
                # target_release_id
                ref_source="grobid",
            )
        )
    return output
+
+
def refs_from_release_refs(release: ReleaseEntity) -> Sequence[RefStructured]:
    """
    Builds structured references from the `refs` list of a release entity.

    release: release whose `refs` are converted; its `ident` and `work_id`
        are copied onto every reference.

    Returns a list of RefStructured, one per entry in `release.refs`. The
    list is empty if `release.refs` is None or empty.

    `ref_source` is the same for all references of a release. It is "pubmed",
    "crossref" or "datacite" (checked in that order) when the release's
    `extra` has a truthy value under that key, and "fatcat" otherwise.
    """
    # The source depends only on the release, so determine it once up front
    # instead of once per reference.
    release_extra = release.extra or dict()
    ref_source = "fatcat"
    for candidate in ("pubmed", "crossref", "datacite"):
        if release_extra.get(candidate):
            ref_source = candidate
            break

    output = []
    # `refs` may be unset (None) on a release; treat that like an empty list
    for ref in release.refs or []:
        extra = ref.extra or dict()
        output.append(
            RefStructured(
                biblio=RefBiblio(
                    title=ref.title,
                    subtitle=extra.get("subtitle"),
                    contrib_raw_names=extra.get("authors") or [],
                    year=ref.year,
                    container_name=ref.container_name,
                    volume=extra.get("volume"),
                    issue=extra.get("issue"),
                    pages=extra.get("pages"),
                    doi=extra.get("doi"),
                    pmid=extra.get("pmid"),
                    pmcid=extra.get("pmcid"),
                    arxiv_id=extra.get("arxiv_id"),
                    isbn13=extra.get("isbn13"),
                    url=extra.get("url"),
                ),
                release_ident=release.ident,
                work_ident=release.work_id,
                index=ref.index,
                key=ref.key,
                locator=ref.locator,
                target_release_id=ref.target_release_id,
                ref_source=ref_source,
            )
        )
    return output
+
+
def refs_from_heavy(heavy: IntermediateBundle) -> Sequence[RefStructured]:
    """
    Extracts structured references from a 'heavy' intermediate bundle.

    Only bundles with doc_type "work" yield references. References come from
    the metadata of the primary (biblio) release when it has any; otherwise
    from the GROBID fulltext, if the bundle has one; otherwise the result is
    empty.

    Raises AssertionError if the bundle has no `biblio_release_ident`, and
    IndexError if the primary release or the GROBID fulltext's release is not
    among `heavy.releases`.
    """
    if heavy.doc_type != DocType.work:
        return []

    # first, identify source of refs: fatcat release metadata or GROBID
    assert heavy.biblio_release_ident
    primary_candidates = [
        rel for rel in heavy.releases if rel.ident == heavy.biblio_release_ident
    ]
    primary_release = primary_candidates[0]

    # TODO: what about other releases?
    if primary_release.refs:
        return refs_from_release_refs(primary_release)

    if not heavy.grobid_fulltext:
        return []

    # fall back to citations extracted from the fulltext by GROBID
    fulltext_ident = heavy.grobid_fulltext["release_ident"]
    fulltext_candidates = [rel for rel in heavy.releases if rel.ident == fulltext_ident]
    fulltext_release = fulltext_candidates[0]
    tei_dict = teixml2json(heavy.grobid_fulltext["tei_xml"])
    return refs_from_grobid(fulltext_release, tei_dict)
+
+
def run_transform(infile: Sequence) -> None:
for line in infile:
obj = json.loads(line)
@@ -469,6 +573,27 @@ def run_transform(infile: Sequence) -> None:
print(es_doc.json(exclude_none=True, sort_keys=True))
def run_refs(infile: Sequence) -> None:
    """
    Reads 'heavy' intermediate bundles as JSON lines and prints their
    references.

    infile: iterable of lines, each one a JSON-encoded bundle.

    Each line is parsed into an IntermediateBundle. Every reference extracted
    from it is printed to stdout as one JSON object per line, with None
    fields excluded and keys sorted.
    """
    for raw_line in infile:
        record = json.loads(raw_line)

        doc_type = DocType(record["doc_type"])
        # round-trip each release through JSON to build a ReleaseEntity
        release_entities = [
            entity_from_json(json.dumps(release_obj), ReleaseEntity)
            for release_obj in record["releases"]
        ]
        bundle = IntermediateBundle(
            doc_type=doc_type,
            releases=release_entities,
            biblio_release_ident=record.get("biblio_release_ident"),
            grobid_fulltext=record.get("grobid_fulltext"),
            pdftotext_fulltext=record.get("pdftotext_fulltext"),
            pdf_meta=record.get("pdf_meta"),
            sim_fulltext=record.get("sim_fulltext"),
        )

        for ref in refs_from_heavy(bundle):
            print(ref.json(exclude_none=True, sort_keys=True))
+
+
def main() -> None:
"""
Run this command like:
@@ -482,7 +607,8 @@ def main() -> None:
subparsers = parser.add_subparsers()
sub = subparsers.add_parser(
- "run_transform", help="iterates through 'heavy' intermediate"
+ "run_transform",
+ help="takes 'heavy' intermediate, outputs scholar_fulltext ES documents",
)
sub.set_defaults(func="run_transform")
sub.add_argument(
@@ -493,6 +619,18 @@ def main() -> None:
type=argparse.FileType("r"),
)
+ sub = subparsers.add_parser(
+ "run_refs", help="extracts references from 'heavy' intermediate"
+ )
+ sub.set_defaults(func="run_refs")
+ sub.add_argument(
+ "json_file",
+ help="intermediate globs as JSON-lines",
+ nargs="?",
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+
args = parser.parse_args()
if not args.__dict__.get("func"):
print("tell me what to do! (try --help)")
@@ -500,6 +638,8 @@ def main() -> None:
if args.func == "run_transform":
run_transform(infile=args.json_file)
+ elif args.func == "run_refs":
+ run_refs(infile=args.json_file)
else:
raise NotImplementedError(args.func)