about | summary | refs | log | tree | commit | diff | stats
path: root/fuzzycat/grobid_unstructured.py
diff options
context:
space:
mode:
Diffstat (limited to 'fuzzycat/grobid_unstructured.py')
-rw-r--r-- fuzzycat/grobid_unstructured.py 61
1 files changed, 23 insertions, 38 deletions
diff --git a/fuzzycat/grobid_unstructured.py b/fuzzycat/grobid_unstructured.py
index 5462ae1..7470cd7 100644
--- a/fuzzycat/grobid_unstructured.py
+++ b/fuzzycat/grobid_unstructured.py
@@ -15,9 +15,9 @@ from typing import Optional
import requests
from fatcat_openapi_client import ReleaseContrib, ReleaseEntity, ReleaseExtIds
+from grobid_tei_xml import parse_citation_xml, GrobidBiblio
from fuzzycat.config import settings
-from fuzzycat.grobid2json import biblio_info
from fuzzycat.utils import clean_doi
GROBID_API_BASE = settings.get("GROBID_API_BASE", "https://grobid.qa.fatcat.wiki")
@@ -55,52 +55,37 @@ def grobid_api_process_citation(raw_citation: str,
return grobid_response.text or None
-def transform_grobid_ref_xml(raw_xml: str) -> Optional[dict]:
+def grobid_ref_to_release(ref: GrobidBiblio) -> ReleaseEntity:
"""
- Parses GROBID XML for the case of a single reference/citation string (eg,
- not a full/propper TEI-XML fulltext document), and returns a dict.
- """
- # first, remove any xmlns stuff, for consistent parsign
- raw_xml = raw_xml.replace('xmlns="http://www.tei-c.org/ns/1.0"', "")
- tree = ET.parse(io.StringIO(raw_xml))
- root = tree.getroot()
- ref = biblio_info(root, ns="")
- if not any(ref.values()):
- return None
- return ref
-
-
-def grobid_ref_to_release(ref: dict) -> ReleaseEntity:
- """
- Takes the dict returned by transform_grobid_ref_xml() and returns a partial
+ Takes GrobidBiblio (parsed from TEI-XML) and returns a partial
ReleaseEntity object (for use with fuzzycat)
"""
contribs = []
- for author in ref.get("authors") or []:
+ for author in ref.authors or []:
contribs.append(
ReleaseContrib(
- raw_name=author.get("name"),
- given_name=author.get("given_name"),
- surname=author.get("surname"),
+ raw_name=author.full_name,
+ given_name=author.given_name,
+ surname=author.surname,
))
release = ReleaseEntity(
- title=ref.get("title"),
+ title=ref.title,
contribs=contribs,
- volume=ref.get("volume"),
- issue=ref.get("issue"),
- pages=ref.get("pages"),
+ volume=ref.volume,
+ issue=ref.issue,
+ pages=ref.pages,
ext_ids=ReleaseExtIds(
- doi=clean_doi(ref.get("doi")),
- pmid=ref.get("pmid"),
- pmcid=ref.get("pmcid"),
- arxiv=ref.get("arxiv_id"),
+ doi=clean_doi(ref.doi),
+ pmid=ref.pmid,
+ pmcid=ref.pmcid,
+ arxiv=ref.arxiv_id,
),
)
- if ref.get("journal"):
- release.extra = {"container_name": ref.get("journal")}
- if ref.get("date"):
- if len(ref["date"]) >= 4 and ref["date"][0:4].isdigit():
- release.release_year = int(ref["date"][0:4])
+ if ref.journal:
+ release.extra = {"container_name": ref.journal}
+ if ref.date:
+ if len(ref.date) >= 4 and ref.date[0:4].isdigit():
+ release.release_year = int(ref.date[0:4])
# TODO: try to parse 'date' into an ISO date format, and assign to release_date?
return release
@@ -121,7 +106,7 @@ def grobid_parse_unstructured(raw_citation: str,
timeout=timeout)
if not ref_xml:
return None
- biblio_dict = transform_grobid_ref_xml(ref_xml)
- if not biblio_dict:
+ ref = parse_citation_xml(ref_xml)
+ if not ref:
return None
- return grobid_ref_to_release(biblio_dict)
+ return grobid_ref_to_release(ref)