diff options
author | Martin Czygan <martin@archive.org> | 2021-11-04 17:05:09 +0000 |
---|---|---|
committer | Martin Czygan <martin@archive.org> | 2021-11-04 17:05:09 +0000 |
commit | 282f315c6ba3643c8c614220ab2f7e1d55de3658 (patch) | |
tree | ccb9e818e18492708d90411cbe2ff7ba8ce0f5ca /fuzzycat/grobid_unstructured.py | |
parent | 615439df4955ca19bf3fdfa10b41b7d8950b3e63 (diff) | |
parent | 2f41335d268b0e2705a1ebff0ff104e965630837 (diff) | |
download | fuzzycat-282f315c6ba3643c8c614220ab2f7e1d55de3658.tar.gz fuzzycat-282f315c6ba3643c8c614220ab2f7e1d55de3658.zip |
Merge branch 'bnewbold-grobid-tei-xml' into 'master'
use grobid_tei_xml for grobid unstructured lookups
See merge request webgroup/fuzzycat!9
Diffstat (limited to 'fuzzycat/grobid_unstructured.py')
-rw-r--r-- | fuzzycat/grobid_unstructured.py | 61 |
1 files changed, 23 insertions, 38 deletions
diff --git a/fuzzycat/grobid_unstructured.py b/fuzzycat/grobid_unstructured.py
index 5462ae1..7470cd7 100644
--- a/fuzzycat/grobid_unstructured.py
+++ b/fuzzycat/grobid_unstructured.py
@@ -15,9 +15,9 @@ from typing import Optional
 
 import requests
 from fatcat_openapi_client import ReleaseContrib, ReleaseEntity, ReleaseExtIds
+from grobid_tei_xml import parse_citation_xml, GrobidBiblio
 
 from fuzzycat.config import settings
-from fuzzycat.grobid2json import biblio_info
 from fuzzycat.utils import clean_doi
 
 GROBID_API_BASE = settings.get("GROBID_API_BASE", "https://grobid.qa.fatcat.wiki")
@@ -55,52 +55,37 @@ def grobid_api_process_citation(raw_citation: str,
     return grobid_response.text or None
 
 
-def transform_grobid_ref_xml(raw_xml: str) -> Optional[dict]:
+def grobid_ref_to_release(ref: GrobidBiblio) -> ReleaseEntity:
     """
-    Parses GROBID XML for the case of a single reference/citation string (eg,
-    not a full/propper TEI-XML fulltext document), and returns a dict.
-    """
-    # first, remove any xmlns stuff, for consistent parsign
-    raw_xml = raw_xml.replace('xmlns="http://www.tei-c.org/ns/1.0"', "")
-    tree = ET.parse(io.StringIO(raw_xml))
-    root = tree.getroot()
-    ref = biblio_info(root, ns="")
-    if not any(ref.values()):
-        return None
-    return ref
-
-
-def grobid_ref_to_release(ref: dict) -> ReleaseEntity:
-    """
-    Takes the dict returned by transform_grobid_ref_xml() and returns a partial
+    Takes GrobidBiblio (parsed from TEI-XML) and returns a partial
     ReleaseEntity object (for use with fuzzycat)
     """
     contribs = []
-    for author in ref.get("authors") or []:
+    for author in ref.authors or []:
         contribs.append(
             ReleaseContrib(
-                raw_name=author.get("name"),
-                given_name=author.get("given_name"),
-                surname=author.get("surname"),
+                raw_name=author.full_name,
+                given_name=author.given_name,
+                surname=author.surname,
             ))
     release = ReleaseEntity(
-        title=ref.get("title"),
+        title=ref.title,
         contribs=contribs,
-        volume=ref.get("volume"),
-        issue=ref.get("issue"),
-        pages=ref.get("pages"),
+        volume=ref.volume,
+        issue=ref.issue,
+        pages=ref.pages,
         ext_ids=ReleaseExtIds(
-            doi=clean_doi(ref.get("doi")),
-            pmid=ref.get("pmid"),
-            pmcid=ref.get("pmcid"),
-            arxiv=ref.get("arxiv_id"),
+            doi=clean_doi(ref.doi),
+            pmid=ref.pmid,
+            pmcid=ref.pmcid,
+            arxiv=ref.arxiv_id,
         ),
     )
-    if ref.get("journal"):
-        release.extra = {"container_name": ref.get("journal")}
-    if ref.get("date"):
-        if len(ref["date"]) >= 4 and ref["date"][0:4].isdigit():
-            release.release_year = int(ref["date"][0:4])
+    if ref.journal:
+        release.extra = {"container_name": ref.journal}
+    if ref.date:
+        if len(ref.date) >= 4 and ref.date[0:4].isdigit():
+            release.release_year = int(ref.date[0:4])
     # TODO: try to parse 'date' into an ISO date format, and assign to release_date?
     return release
 
@@ -121,7 +106,7 @@ def grobid_parse_unstructured(raw_citation: str,
                                           timeout=timeout)
     if not ref_xml:
         return None
-    biblio_dict = transform_grobid_ref_xml(ref_xml)
-    if not biblio_dict:
+    ref = parse_citation_xml(ref_xml)
+    if not ref:
         return None
-    return grobid_ref_to_release(biblio_dict)
+    return grobid_ref_to_release(ref)