about summary refs log tree commit diff stats
path: root/fatcat_scholar
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-10-27 18:24:19 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-10-27 18:25:58 -0700
commit 560d5f7cc1672f95e2a953ab5908f4205151a703 (patch)
tree 04b35084358786bbd2329491be07cde35a4d2289 /fatcat_scholar
parent 33211915773a0c77d064c55c1b02ceed6f455feb (diff)
download fatcat-scholar-560d5f7cc1672f95e2a953ab5908f4205151a703.tar.gz
fatcat-scholar-560d5f7cc1672f95e2a953ab5908f4205151a703.zip
refactor use of grobid_tei_xml
Diffstat (limited to 'fatcat_scholar')
-rw-r--r-- fatcat_scholar/query_citation.py 52
-rw-r--r-- fatcat_scholar/schema.py 9
-rw-r--r-- fatcat_scholar/transform.py 80
3 files changed, 69 insertions, 72 deletions
diff --git a/fatcat_scholar/query_citation.py b/fatcat_scholar/query_citation.py
index dea4f02..0233acc 100644
--- a/fatcat_scholar/query_citation.py
+++ b/fatcat_scholar/query_citation.py
@@ -11,14 +11,14 @@ parallel with "regular" query?
"""
import sys
-from typing import Any, Optional, Tuple
+from typing import Any, List, Optional, Tuple
import fuzzycat.common
import fuzzycat.verify
import requests
from fatcat_openapi_client import ReleaseContrib, ReleaseEntity, ReleaseExtIds
from fuzzycat.matching import match_release_fuzzy
-from grobid_tei_xml import parse_citations_xml
+from grobid_tei_xml import GrobidBiblio, parse_citations_xml
from fatcat_scholar.api_entities import entity_to_dict
@@ -44,44 +44,42 @@ def grobid_process_citation(
return grobid_response.text
-def transform_grobid(raw_xml: str) -> Optional[dict]:
- ref_list = parse_citations_xml(raw_xml)
- if not ref_list:
+def transform_grobid(raw_xml: str) -> Optional[GrobidBiblio]:
+ ref_list: List[GrobidBiblio] = parse_citations_xml(raw_xml)
+ # check for unmatched or empty references
+ if not ref_list or not ref_list[0].to_dict():
return None
- ref = ref_list[0]
- if not any(ref.values()):
- return None
- return ref
+ return ref_list[0]
-def ref_to_release(ref: dict) -> ReleaseEntity:
+def ref_to_release(ref: GrobidBiblio) -> ReleaseEntity:
contribs = []
- for author in ref.get("authors") or []:
+ for author in ref.authors or []:
contribs.append(
ReleaseContrib(
- raw_name=author.get("name"),
- given_name=author.get("given_name"),
- surname=author.get("surname"),
+ raw_name=author.full_name,
+ given_name=author.given_name,
+ surname=author.surname,
)
)
release = ReleaseEntity(
- title=ref.get("title"),
+ title=ref.title,
contribs=contribs,
- volume=ref.get("volume"),
- issue=ref.get("issue"),
- pages=ref.get("pages"),
+ volume=ref.volume,
+ issue=ref.issue,
+ pages=ref.pages,
ext_ids=ReleaseExtIds(
- doi=ref.get("doi"),
- pmid=ref.get("pmid"),
- pmcid=ref.get("pmcid"),
- arxiv=ref.get("arxiv_id"),
+ doi=ref.doi,
+ pmid=ref.pmid,
+ pmcid=ref.pmcid,
+ arxiv=ref.arxiv_id,
),
)
- if ref.get("journal"):
- release.extra = {"container_name": ref.get("journal")}
- if ref.get("date"):
- if len(ref["date"]) == 4 and ref["date"].isdigit():
- release.release_year = int(ref["date"])
+ if ref.journal:
+ release.extra = {"container_name": ref.journal}
+ if ref.date:
+ if len(ref.date) == 4 and ref.date.isdigit():
+ release.release_year = int(ref.date)
return release
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index 6c9307d..b170f12 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -13,6 +13,7 @@ from typing import Any, Dict, List, Optional
import ftfy
from bs4 import BeautifulSoup
from fatcat_openapi_client import ReleaseContrib, ReleaseEntity
+from grobid_tei_xml import GrobidDocument
# pytype: disable=import-error
from pydantic import BaseModel
@@ -494,12 +495,12 @@ def contrib_affiliation(contrib: ReleaseContrib) -> Optional[str]:
return None
-def es_abstracts_from_grobid(tei_dict: dict) -> List[ScholarAbstract]:
+def es_abstracts_from_grobid(tei_doc: GrobidDocument) -> List[ScholarAbstract]:
- if tei_dict.get("abstract"):
- body = scrub_text(tei_dict["abstract"])
+ if tei_doc.abstract:
+ body = scrub_text(tei_doc.abstract)
if body:
- return [ScholarAbstract(lang_code=tei_dict.get("lang"), body=body)]
+ return [ScholarAbstract(lang_code=tei_doc.language_code, body=body)]
return []
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index caeff21..d40e123 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -8,7 +8,7 @@ from typing import Any, Dict, List, Optional, Sequence
import sentry_sdk
from fatcat_openapi_client import FileEntity, ReleaseEntity, WebcaptureEntity
-from grobid_tei_xml import parse_document_xml
+from grobid_tei_xml import GrobidBiblio, GrobidDocument, parse_document_xml
from fatcat_scholar.config import GIT_REVISION, settings
from fatcat_scholar.identifiers import clean_doi, clean_pmcid
@@ -241,18 +241,18 @@ def _add_file_release_meta(
def es_fulltext_from_grobid(
- tei_dict: dict, pdf_meta: Optional[dict], re: ReleaseEntity, fe: FileEntity
+ tei_doc: GrobidDocument, pdf_meta: Optional[dict], re: ReleaseEntity, fe: FileEntity
) -> Optional[ScholarFulltext]:
- if not tei_dict.get("body"):
+ if not tei_doc.body:
return None
- body = tei_dict.get("body")
+ body = tei_doc.body
if body and len(body) > MAX_BODY_CHARS:
body = body[:MAX_BODY_CHARS]
ret = ScholarFulltext(
- lang_code=tei_dict.get("lang"),
+ lang_code=tei_doc.language_code,
body=body,
- acknowledgement=tei_dict.get("acknowledgement"),
- annex=tei_dict.get("annex"),
+ acknowledgement=tei_doc.acknowledgement,
+ annex=tei_doc.annex,
)
return _add_file_release_meta(ret, pdf_meta, re, fe)
@@ -521,15 +521,16 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
if f.ident == heavy.grobid_fulltext["file_ident"]
][0]
try:
- tei_doc = parse_document_xml(heavy.grobid_fulltext["tei_xml"])
- tei_dict = tei_doc.to_legacy_dict()
+ tei_doc: Optional[GrobidDocument] = parse_document_xml(
+ heavy.grobid_fulltext["tei_xml"]
+ )
except xml.etree.ElementTree.ParseError:
- tei_dict = None
- if tei_dict:
+ tei_doc = None
+ if tei_doc:
if not abstracts:
- abstracts = es_abstracts_from_grobid(tei_dict)
+ abstracts = es_abstracts_from_grobid(tei_doc)
grobid_fulltext = es_fulltext_from_grobid(
- tei_dict, heavy.pdf_meta, fulltext_release, fulltext_file
+ tei_doc, heavy.pdf_meta, fulltext_release, fulltext_file
)
if exclude_web_fulltext and grobid_fulltext:
if not fulltext:
@@ -681,52 +682,50 @@ def test_clean_ref_key() -> None:
assert clean_ref_key(raw, doi=doi) == expected
-def refs_from_grobid(release: ReleaseEntity, tei_dict: dict) -> List[RefStructured]:
+def refs_from_grobid(
+ release: ReleaseEntity, tei_doc: GrobidDocument
+) -> List[RefStructured]:
output = []
- for ref in tei_dict.get("citations") or []:
- ref_date = ref.get("date") or None
+ ref: GrobidBiblio
+ for ref in tei_doc.citations or []:
+ ref_date = ref.date or None
ref_year: Optional[int] = None
if ref_date and len(ref_date) >= 4 and ref_date[:4].isdigit():
ref_year = int(ref_date[:4])
- ref_authors = ref.get("authors") or []
authors: List[str] = []
- for a in ref_authors:
- if isinstance(a, str):
- authors.append(a)
- elif isinstance(a, dict):
- if a.get("name"):
- assert isinstance(a["name"], str)
- authors.append(a["name"])
- ref_index = ref.get("index")
+ for a in ref.authors or []:
+ if a.full_name:
+ assert isinstance(a.full_name, str)
+ authors.append(a.full_name)
+ ref_index = ref.index
if ref_index is not None:
# transform from 0-indexed to 1-indexed
ref_index = ref_index + 1
output.append(
RefStructured(
biblio=RefBiblio(
- unstructured=ref.get("unstructured"),
- title=ref.get("title"),
+ unstructured=ref.unstructured,
+ title=ref.title,
# subtitle
contrib_raw_names=authors or None,
year=ref_year,
- container_name=ref.get("journal"),
- publisher=ref.get("publisher"),
- volume=ref.get("volume"),
- issue=ref.get("issue"),
- pages=ref.get("pages"),
- doi=clean_doi(ref.get("doi")),
- pmid=ref.get("pmid"),
- pmcid=clean_pmcid(ref.get("pmcid")),
- arxiv_id=ref.get("arxiv_id"),
- isbn=ref.get("isbn"),
- url=clean_url_conservative(ref.get("url")),
+ container_name=ref.journal,
+ publisher=ref.publisher,
+ volume=ref.volume,
+ issue=ref.issue,
+ pages=ref.pages,
+ doi=clean_doi(ref.doi),
+ pmid=ref.pmid,
+ pmcid=clean_pmcid(ref.pmcid),
+ arxiv_id=ref.arxiv_id,
+ url=clean_url_conservative(ref.url),
),
release_ident=release.ident,
work_ident=release.work_id,
release_stage=release.release_stage,
release_year=release.release_year,
index=ref_index,
- key=clean_ref_key(ref.get("id")),
+ key=clean_ref_key(ref.id),
locator=None,
# target_release_id
ref_source="grobid",
@@ -902,8 +901,7 @@ def refs_from_heavy(heavy: IntermediateBundle) -> Sequence[RefStructured]:
if r.ident == heavy.grobid_fulltext["release_ident"]
][0]
tei_doc = parse_document_xml(heavy.grobid_fulltext["tei_xml"])
- tei_dict = tei_doc.to_legacy_dict()
- fulltext_refs = refs_from_grobid(fulltext_release, tei_dict)
+ fulltext_refs = refs_from_grobid(fulltext_release, tei_doc)
crossref_refs: List[RefStructured] = []
if heavy.crossref: