author    Bryan Newbold <bnewbold@archive.org>    2021-10-22 18:46:16 -0700
committer Bryan Newbold <bnewbold@archive.org>    2021-10-22 18:46:16 -0700
commit    3456336d3e4324a542c16b91734a8ebd8ef99ab9 (patch)
tree      d56febf594e946c0185533e5b0210152d116c0c6
parent    1c4d9e2595f4bdd1ebbd00f9d908772757fd0663 (diff)
download  grobid_tei_xml-3456336d3e4324a542c16b91734a8ebd8ef99ab9.tar.gz
          grobid_tei_xml-3456336d3e4324a542c16b91734a8ebd8ef99ab9.zip
more tweaking/refactoring progress, and some to_csl_dict() helpers
-rw-r--r--  grobid_tei_xml/grobid2json.py   26
-rwxr-xr-x  grobid_tei_xml/parse.py        277
-rw-r--r--  grobid_tei_xml/types.py        105
-rw-r--r--  tests/test_parse.py             12
4 files changed, 275 insertions, 145 deletions
diff --git a/grobid_tei_xml/grobid2json.py b/grobid_tei_xml/grobid2json.py
index 8946ab8..c10de7c 100644
--- a/grobid_tei_xml/grobid2json.py
+++ b/grobid_tei_xml/grobid2json.py
@@ -1,28 +1,6 @@
"""
-NOTE: this file is DEPRECATED and will be removed soon
-
-NB: adapted to work as a library for PDF extraction. Will probably be
-re-written eventually to be correct, complete, and robust; this is just a
-first iteration.
-
-This script tries to extract everything from a GROBID TEI XML fulltext dump:
-
-- header metadata
-- affiliations
-- references (with context)
-- abstract
-- fulltext
-- tables, figures, equations
-
-A flag can be specified to disable copyright encumbered bits (--no-emcumbered):
-
-- abstract
-- fulltext
-- tables, figures, equations
-
-Prints JSON to stdout, errors to stderr
-
-This file copied from the sandcrawler repository.
+NOTE: this file is DEPRECATED. It is only here for testing backwards
+compatibility, and will be removed soon.
"""
import io
diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py
index 284ceff..c65cbdf 100755
--- a/grobid_tei_xml/parse.py
+++ b/grobid_tei_xml/parse.py
@@ -10,6 +10,10 @@ ns = "http://www.tei-c.org/ns/1.0"
def _string_to_tree(content: AnyStr) -> ET.ElementTree:
+ """
+ Helper to consistently parse XML into an ElementTree, whether provided as
+ str, bytes, or a wrapper thereof.
+ """
if isinstance(content, str):
return ET.parse(io.StringIO(content))
elif isinstance(content, bytes):
@@ -23,122 +27,181 @@ def _string_to_tree(content: AnyStr) -> ET.ElementTree:
def _parse_authors(elem: Optional[ET.Element], ns: str = ns) -> List[GrobidAuthor]:
+ """
+ Internal helper to parse one or more TEI 'author' XML tags into
+ GrobidAuthor objects. 'author' could appear in document headers or
+ citations.
+ """
if not elem:
return []
- names = []
- for author in elem.findall(f".//{{{ns}}}author"):
- pn = author.find(f"./{{{ns}}}persName")
- if not pn:
+
+ authors = []
+ for author_tag in elem.findall(f".//{{{ns}}}author"):
+ persname_tag = author_tag.find(f"./{{{ns}}}persName")
+ if persname_tag is None:
+ # should we do something else here? is it possible to have an author
+ # without a persName?
continue
- given_name = pn.findtext(f"./{{{ns}}}forename") or None
- surname = pn.findtext(f"./{{{ns}}}surname") or None
- full_name = " ".join([t.strip() for t in pn.itertext() if t.strip()]).strip()
- obj: Dict[str, Any] = dict(name=full_name)
- if given_name:
- obj["given_name"] = given_name
- if surname:
- obj["surname"] = surname
- ae = author.find(f"./{{{ns}}}affiliation")
- if ae:
- affiliation: Dict[str, Any] = dict()
- for on in ae.findall(f"./{{{ns}}}orgName"):
- on_type = on.get("type")
- if on_type:
- affiliation[on_type] = on.text
- addr_e = ae.find(f"./{{{ns}}}address")
- if addr_e:
- address = dict()
- for t in list(addr_e):
- address[t.tag.split("}")[-1]] = t.text
- if address:
- address['post_code'] = address.pop('postCode', None)
- affiliation["address"] = GrobidAddress(**address)
- # previously:
- # affiliation['address'] = {
- # 'post_code': addr.findtext('./{%s}postCode' % ns) or None,
- # 'settlement': addr.findtext('./{%s}settlement' % ns) or None,
- # 'country': addr.findtext('./{%s}country' % ns) or None,
- # }
- obj["affiliation"] = GrobidAffiliation(**affiliation)
- names.append(GrobidAuthor(**obj))
- return names
+
+ # basic author name stuff
+ given_name = persname_tag.findtext(f"./{{{ns}}}forename") or None
+ surname = persname_tag.findtext(f"./{{{ns}}}surname") or None
+ # instead create full_name from all the sub-components of the tag
+ full_name = " ".join([t.strip() for t in persname_tag.itertext() if t.strip()]).strip()
+ ga = GrobidAuthor(
+ full_name=full_name or None,
+ given_name=given_name,
+ surname=surname,
+ )
+
+ # author affiliation
+ affiliation_tag = author_tag.find(f"./{{{ns}}}affiliation")
+ if affiliation_tag is not None:
+ affiliation_dict: Dict[str, Any] = dict()
+ for orgname_tag in affiliation_tag.findall(f"./{{{ns}}}orgName"):
+ orgname_type = orgname_tag.get("type")
+ if orgname_type:
+ affiliation_dict[orgname_type] = orgname_tag.text or None
+ if affiliation_dict:
+ ga.affiliation = GrobidAffiliation(
+ institution=affiliation_dict.get('institution'),
+ department=affiliation_dict.get('department'),
+ laboratory=affiliation_dict.get('laboratory'),
+ )
+ address_tag = affiliation_tag.find(f"./{{{ns}}}address")
+ if address_tag is not None:
+ address_dict = dict()
+ for t in list(address_tag):
+ address_dict[t.tag.split("}")[-1]] = t.text or None
+ if address_dict:
+ ga.affiliation.address = GrobidAddress(
+ addr_line=address_dict.get('addrLine'),
+ post_code=address_dict.get('postCode'),
+ settlement=address_dict.get('settlement'),
+ country=address_dict.get('country'),
+ )
+ authors.append(ga)
+
+ return authors
+
+
+def _clean_url(url: Optional[str]) -> Optional[str]:
+ if not url:
+ return None
+ url = url.strip()
+ if url.endswith(".Lastaccessed"):
+ url = url.replace(".Lastaccessed", "")
+ if url.startswith("<"):
+ url = url[1:]
+ if ">" in url:
+ url = url.split(">")[0]
+ return url or None
+
+
+def test_clean_url() -> None:
+ examples: List[dict] = [
+ dict(
+ dirty="https://archive.org/thing.pdf",
+ clean="https://archive.org/thing.pdf",
+ ),
+ dict(
+ dirty="https://archive.org/thing.pdf.Lastaccessed",
+ clean="https://archive.org/thing.pdf",
+ ),
+ dict(
+ dirty="<https://archive.org/thing.pdf>",
+ clean="https://archive.org/thing.pdf",
+ ),
+ dict(
+ dirty=" https://archive.org/thing.pdf>",
+ clean="https://archive.org/thing.pdf",
+ ),
+ dict(
+ dirty=" https://archive.org/thing.pdf>",
+ clean="https://archive.org/thing.pdf",
+ ),
+ dict(dirty="", clean=None),
+ dict(dirty=None, clean=None),
+ ]
+
+ for row in examples:
+ assert row['clean'] == _clean_url(row['dirty'])
def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation:
- ref: Dict[str, Any] = dict()
- ref["id"] = elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id")
- ref["unstructured"] = elem.findtext('.//{%s}note[@type="raw_reference"]' % ns)
- # Title stuff is messy in references...
- ref["title"] = elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title")
- other_title = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title")
- if other_title:
- if ref["title"]:
- ref["journal"] = other_title
- else:
- ref["journal"] = None
- ref["title"] = other_title
- ref["authors"] = _parse_authors(elem, ns=ns)
- ref["publisher"] = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher")
- if not ref["publisher"]:
- ref["publisher"] = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher")
- if ref["publisher"] == "":
- ref["publisher"] = None
- date = elem.find('.//{%s}date[@type="published"]' % ns)
- ref["date"] = (date is not None) and date.attrib.get("when")
- ref["volume"] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
- ref["issue"] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
- ref["doi"] = elem.findtext('.//{%s}idno[@type="DOI"]' % ns)
- ref["arxiv_id"] = elem.findtext('.//{%s}idno[@type="arXiv"]' % ns)
- if ref["arxiv_id"] and ref["arxiv_id"].startswith("arXiv:"):
- ref["arxiv_id"] = ref["arxiv_id"][6:]
- ref["pmcid"] = elem.findtext('.//{%s}idno[@type="PMCID"]' % ns)
- ref["pmid"] = elem.findtext('.//{%s}idno[@type="PMID"]' % ns)
- el = elem.find('.//{%s}biblScope[@unit="page"]' % ns)
+ """
+ Parses an entire TEI 'biblStruct' XML tag
+ """
+
+ citation = GrobidCitation(
+ id=elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id") or None,
+ title=elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") or None,
+ journal=elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") or None,
+ authors=_parse_authors(elem, ns=ns),
+ unstructured=elem.findtext(f'.//{{{ns}}}note[@type="raw_reference"]') or None,
+ volume=elem.findtext(f'.//{{{ns}}}biblScope[@unit="volume"]') or None,
+ issue=elem.findtext(f'.//{{{ns}}}biblScope[@unit="issue"]') or None,
+ arxiv_id=elem.findtext(f'.//{{{ns}}}idno[@type="arXiv"]') or None,
+ doi=elem.findtext(f'.//{{{ns}}}idno[@type="DOI"]') or None,
+ pmcid=elem.findtext(f'.//{{{ns}}}idno[@type="PMCID"]') or None,
+ pmid=elem.findtext(f'.//{{{ns}}}idno[@type="PMID"]') or None,
+ )
+
+ citation.publisher = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher")
+ if not citation.publisher:
+ citation.publisher = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher") or None
+
+ date_tag = elem.find(f'.//{{{ns}}}date[@type="published"]')
+ if date_tag is not None:
+ citation.date = date_tag.attrib.get("when") or None
+
+ # title stuff is messy in references...
+ if citation.journal and not citation.title:
+ citation.title = citation.journal
+ citation.journal = None
+
+ if citation.arxiv_id and citation.arxiv_id.startswith("arXiv:"):
+ citation.arxiv_id = citation.arxiv_id[6:]
+
+ el = elem.find(f'.//{{{ns}}}biblScope[@unit="page"]')
if el is not None:
if el.attrib.get("from") and el.attrib.get("to"):
- ref["pages"] = "{}-{}".format(el.attrib["from"], el.attrib["to"])
+ citation.pages = "{}-{}".format(el.attrib["from"], el.attrib["to"])
else:
- ref["pages"] = el.text
- el = elem.find(".//{%s}ptr[@target]" % ns)
+ citation.pages = el.text
+
+ el = elem.find(f".//{{{ns}}}ptr[@target]")
if el is not None:
- ref["url"] = el.attrib["target"]
- # Hand correction
- # TODO: move this elsewhere
- if ref["url"].endswith(".Lastaccessed"):
- ref["url"] = ref["url"].replace(".Lastaccessed", "")
- if ref["url"].startswith("<"):
- ref["url"] = ref["url"][1:]
- if ">" in ref["url"]:
- ref["url"] = ref["url"].split(">")[0]
- else:
- ref["url"] = None
- return GrobidCitation(**ref)
+ citation.url = _clean_url(el.attrib["target"])
+
+ return citation
def _parse_journal(elem: ET.Element, ns: str = ns) -> GrobidJournal:
- journal = dict()
- journal["name"] = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title")
- journal["publisher"] = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher")
- if journal["publisher"] == "":
- journal["publisher"] = None
- journal["issn"] = elem.findtext('.//{%s}idno[@type="ISSN"]' % ns)
- journal["eissn"] = elem.findtext('.//{%s}idno[@type="eISSN"]' % ns)
- journal["volume"] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
- journal["issue"] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
- journal["abbrev"] = None
- return GrobidJournal(**journal)
+ journal = GrobidJournal(
+ name=elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") or None,
+ publisher=elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") or None,
+ issn=elem.findtext(f'.//{{{ns}}}idno[@type="ISSN"]') or None,
+ eissn=elem.findtext(f'.//{{{ns}}}idno[@type="eISSN"]') or None,
+ volume=elem.findtext(f'.//{{{ns}}}biblScope[@unit="volume"]') or None,
+ issue=elem.findtext(f'.//{{{ns}}}biblScope[@unit="issue"]') or None,
+ # XXX: abbrev
+ abbrev=None,
+ )
+ return journal
def _parse_header(elem: ET.Element, ns: str = ns) -> GrobidHeader:
- header = elem
- info: Dict[str, Any] = dict()
- info["title"] = header.findtext(f".//{{{ns}}}analytic/{{{ns}}}title")
- info["authors"] = _parse_authors(header.find(f".//{{{ns}}}sourceDesc/{{{ns}}}biblStruct"))
- info["journal"] = _parse_journal(header)
- date = header.find(f'.//{{{ns}}}date[@type="published"]')
- info["date"] = (date is not None) and date.attrib.get("when")
- info["doi"] = header.findtext(f'.//{{{ns}}}idno[@type="DOI"]')
- return GrobidHeader(**info)
+ header = GrobidHeader(
+ title=elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") or None,
+ authors=_parse_authors(elem.find(f".//{{{ns}}}sourceDesc/{{{ns}}}biblStruct")),
+ journal=_parse_journal(elem) or None,
+ doi=elem.findtext(f'.//{{{ns}}}idno[@type="DOI"]') or None,
+ )
+ date_tag = elem.find(f'.//{{{ns}}}date[@type="published"]')
+ if date_tag is not None:
+ header.date = date_tag.attrib.get("when") or None
+ return header
def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
@@ -190,15 +253,19 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
def parse_citations_xml(xml_text: AnyStr) -> List[GrobidCitation]:
"""
- Use this function to parse TEI-XML of one or more references.
+ Use this function to parse TEI-XML of one or more references. It should
+ work with responses from either the GROBID /api/processCitation or
+ /api/processCitationList endpoints.
- Eg, the output of '/api/processReferences' or '/api/processCitation'.
+ Note that processed citations are usually returned as a bare XML tag, not a
+ full XML document, which means that the TEI xmlns is not set. This requires
+ a tweak to all downstream parsing code to handle documents with or without
+ the namespace.
"""
- # XXX: this replacement shouldn't be needed?
if isinstance(xml_text, bytes):
- xml_text = xml_text.replace(b'xmlns="http://www.tei-c.org/ns/1.0"', b"")
+ xml_text = xml_text.replace(b'xmlns="http://www.tei-c.org/ns/1.0"', b'')
elif isinstance(xml_text, str):
- xml_text = xml_text.replace('xmlns="http://www.tei-c.org/ns/1.0"', "")
+ xml_text = xml_text.replace('xmlns="http://www.tei-c.org/ns/1.0"', '')
tree = _string_to_tree(xml_text)
root = tree.getroot()
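
As a standalone illustration of the namespace handling described in the parse_citations_xml() docstring above: stripping the TEI xmlns declaration up front lets the same un-namespaced element paths match whether or not the input declared the namespace. The sketch below mirrors that preprocessing step using only the standard library; it is not the library's API, and the sample XML is illustrative.

    import xml.etree.ElementTree as ET

    TEI_XMLNS = 'xmlns="http://www.tei-c.org/ns/1.0"'

    bare = "<biblStruct><analytic><title>Everything is Wonderful</title></analytic></biblStruct>"
    namespaced = bare.replace("<biblStruct>", f"<biblStruct {TEI_XMLNS}>")

    for xml_text in (bare, namespaced):
        # same textual replacement as in parse_citations_xml() above
        cleaned = xml_text.replace(TEI_XMLNS, "")
        root = ET.fromstring(cleaned)
        # with the namespace gone, no "{...}" prefix is needed in paths
        print(root.findtext(".//analytic/title"))  # -> "Everything is Wonderful"
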
diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py
index 9894bf5..b78b236 100644
--- a/grobid_tei_xml/types.py
+++ b/grobid_tei_xml/types.py
@@ -8,52 +8,125 @@ class GrobidAddress:
post_code: Optional[str] = None
settlement: Optional[str] = None
country: Optional[str] = None
- country_code: Optional[str] = None
+ country_code: Optional[str] = None # XXX
@dataclass
class GrobidAffiliation:
- address: Optional[GrobidAddress] = None
institution: Optional[str] = None
department: Optional[str] = None
laboratory: Optional[str] = None
+ address: Optional[GrobidAddress] = None
@dataclass
class GrobidAuthor:
- name: Optional[str]
- # TODO: 'forename'?
+ full_name: Optional[str]
given_name: Optional[str] = None
+ middle: Optional[str] = None # XXX
surname: Optional[str] = None
+ suffix: Optional[str] = None # XXX
+ email: Optional[str] = None # XXX
affiliation: Optional[GrobidAffiliation] = None
+ def to_csl_dict(self) -> dict:
+ d = dict(
+ given=self.given_name,
+ family=self.surname,
+ suffix=self.suffix,
+ )
+ return _simplify_dict(d)
+
+
+def _csl_date(s: Optional[str]) -> Optional[list]:
+ if not s:
+ return None
+
+ # YYYY
+ if len(s) >= 4 and s[0:4].isdigit():
+ year = int(s[0:4])
+ else:
+ return None
+
+ # YYYY-MM
+ if len(s) >= 7 and s[4] == '-' and s[5:7].isdigit():
+ month = int(s[5:7])
+ else:
+ return [[year]]
+
+ # YYYY-MM-DD
+ if len(s) == 10 and s[7] == '-' and s[8:10].isdigit():
+ day = int(s[8:10])
+ return [[year, month, day]]
+ else:
+ return [[year, month]]
+
+
+def test_csl_date() -> None:
+ assert _csl_date("1998") == [[1998]]
+ assert _csl_date("1998-03") == [[1998, 3]]
+ assert _csl_date("1998-03-12") == [[1998, 3, 12]]
+ assert _csl_date("1998-blah") == [[1998]]
+ assert _csl_date("asdf") is None
+
@dataclass
class GrobidCitation:
authors: List[GrobidAuthor]
+
index: Optional[int] = None
id: Optional[str] = None
date: Optional[str] = None
issue: Optional[str] = None
- journal: Optional[str] = None
+ journal: Optional[str] = None # XXX: venue? other?
publisher: Optional[str] = None
title: Optional[str] = None
url: Optional[str] = None
volume: Optional[str] = None
pages: Optional[str] = None
- first_page: Optional[str] = None
- last_page: Optional[str] = None
+ first_page: Optional[str] = None # XXX
+ last_page: Optional[str] = None # XXX
unstructured: Optional[str] = None
- # TODO: 'arxiv' for consistency?
arxiv_id: Optional[str] = None
doi: Optional[str] = None
pmid: Optional[str] = None
pmcid: Optional[str] = None
oa_url: Optional[str] = None
+ note: Optional[str] = None
def to_dict(self) -> dict:
return _simplify_dict(asdict(self))
+ def to_csl_dict(self, default_type: str = "article-journal") -> dict:
+ """
+ Transforms into the Citation Style Language (CSL) JSON schema.
+ """
+ csl = dict(
+ type=default_type,
+ author=[a.to_csl_dict() for a in self.authors],
+ issued=_csl_date(self.date),
+ publisher=self.publisher,
+ title=self.title,
+ page=self.pages,
+ URL=self.url,
+ DOI=self.doi,
+ PMID=self.pmid,
+ PMCID=self.pmcid,
+ note=self.note,
+ # fields with '-' in the key name
+ **{
+ "container-title": self.journal,
+ "page-first": self.first_page,
+ })
+
+ # numeric fields
+ if self.issue and self.issue.isdigit():
+ csl['issue'] = int(self.issue)
+ if self.volume and self.volume.isdigit():
+ csl['volume'] = int(self.volume)
+
+ return _simplify_dict(csl)
+
@dataclass
class GrobidJournal:
@@ -69,10 +142,10 @@ class GrobidJournal:
@dataclass
class GrobidHeader:
authors: List[GrobidAuthor]
+
title: Optional[str] = None
date: Optional[str] = None
doi: Optional[str] = None
- note: Optional[str] = None
journal: Optional[GrobidJournal] = None
@@ -81,9 +154,10 @@ class GrobidDocument:
grobid_version: str
grobid_timestamp: str
header: GrobidHeader
+
pdf_md5: Optional[str] = None
- citations: Optional[List[GrobidCitation]] = None
language_code: Optional[str] = None
+ citations: Optional[List[GrobidCitation]] = None
abstract: Optional[str] = None
body: Optional[str] = None
acknowledgement: Optional[str] = None
@@ -108,12 +182,21 @@ class GrobidDocument:
# all header fields at top-level
d.update(d.pop('header', {}))
- d.pop('note', None)
+
+ # fields not in the old schema
d.pop('pdf_md5', None)
+ for c in d.get('citations') or []:
+ c.pop('note', None)
+
+ # author changes
for a in d['authors']:
+ a['name'] = a.pop('full_name')
addr = a.get('affiliation', {}).get('address')
if addr and addr.get('post_code'):
addr['postCode'] = addr.pop('post_code')
+ for c in d.get('citations') or []:
+ for a in c['authors']:
+ a['name'] = a.pop('full_name')
return d
def remove_encumbered(self) -> None:
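
For orientation, here is a sketch of what the new to_csl_dict() helper produces, built from the dataclasses in this file. The field values are illustrative (loosely modeled on the test fixture below), and the expected output assumes that _simplify_dict(), which is not shown in this diff, drops empty/None values.

    from grobid_tei_xml.types import GrobidAuthor, GrobidCitation

    cite = GrobidCitation(
        authors=[GrobidAuthor(full_name="A Seaperson", given_name="A", surname="Seaperson")],
        date="2001",
        journal="Letters in the Alphabet",
        title="Everything is Wonderful",
        volume="20",
        pages="133-150",
    )
    csl = cite.to_csl_dict()
    # expected, roughly:
    # {
    #     "type": "article-journal",
    #     "author": [{"given": "A", "family": "Seaperson"}],
    #     "issued": [[2001]],               # from _csl_date("2001")
    #     "title": "Everything is Wonderful",
    #     "page": "133-150",
    #     "container-title": "Letters in the Alphabet",
    #     "volume": 20,                     # numeric because "20".isdigit()
    # }
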
diff --git a/tests/test_parse.py b/tests/test_parse.py
index 825b561..7749201 100644
--- a/tests/test_parse.py
+++ b/tests/test_parse.py
@@ -24,7 +24,7 @@ def test_small_xml() -> None:
header=GrobidHeader(
title="Dummy Example File",
authors=[
- GrobidAuthor(name="Brewster Kahle",
+ GrobidAuthor(full_name="Brewster Kahle",
given_name="Brewster",
surname="Kahle",
affiliation=GrobidAffiliation(
@@ -38,7 +38,7 @@ def test_small_xml() -> None:
),
)),
GrobidAuthor(
- name="J Doe",
+ full_name="J Doe",
given_name="J",
surname="Doe",
),
@@ -53,7 +53,9 @@ def test_small_xml() -> None:
GrobidCitation(
index=0,
id="b0",
- authors=[GrobidAuthor(name="A Seaperson", given_name="A", surname="Seaperson")],
+ authors=[
+ GrobidAuthor(full_name="A Seaperson", given_name="A", surname="Seaperson")
+ ],
date="2001",
journal="Letters in the Alphabet",
title="Everything is Wonderful",
@@ -127,7 +129,7 @@ def test_example_grobid_tei_xml() -> None:
"""Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network"""
ref = [c for c in doc.citations or [] if c.id == "b12"][0]
- assert ref.authors[0].name == "K Tasa"
+ assert ref.authors[0].full_name == "K Tasa"
assert ref.authors[0].given_name == "K"
assert ref.authors[0].surname == "Tasa"
assert ref.journal == "Quality Management in Health Care"
@@ -193,7 +195,7 @@ def test_single_citations_xml() -> None:
assert d.title == """Mesh migration following abdominal hernia repair: a comprehensive review"""
assert d.authors[2].given_name == "L"
assert d.authors[2].surname == "Taveras"
- assert d.authors[2].name == "L R Taveras"
+ assert d.authors[2].full_name == "L R Taveras"
assert d.doi == "10.1007/s10029-019-01898-9"
assert d.pmid == "30701369"
assert d.date == "2019-01-30"