| author    | Bryan Newbold <bnewbold@archive.org>             | 2021-10-22 18:46:16 -0700 |
|-----------|--------------------------------------------------|---------------------------|
| committer | Bryan Newbold <bnewbold@archive.org>             | 2021-10-22 18:46:16 -0700 |
| commit    | 3456336d3e4324a542c16b91734a8ebd8ef99ab9 (patch) |                           |
| tree      | d56febf594e946c0185533e5b0210152d116c0c6         |                           |
| parent    | 1c4d9e2595f4bdd1ebbd00f9d908772757fd0663 (diff)  |                           |
| download  | grobid_tei_xml-3456336d3e4324a542c16b91734a8ebd8ef99ab9.tar.gz, grobid_tei_xml-3456336d3e4324a542c16b91734a8ebd8ef99ab9.zip | |
more tweaking/refactoring progress, and some to_csl_dict() helpers
| -rw-r--r-- | grobid_tei_xml/grobid2json.py |  26 |
| -rwxr-xr-x | grobid_tei_xml/parse.py       | 277 |
| -rw-r--r-- | grobid_tei_xml/types.py       | 105 |
| -rw-r--r-- | tests/test_parse.py           |  12 |
4 files changed, 275 insertions, 145 deletions
diff --git a/grobid_tei_xml/grobid2json.py b/grobid_tei_xml/grobid2json.py
index 8946ab8..c10de7c 100644
--- a/grobid_tei_xml/grobid2json.py
+++ b/grobid_tei_xml/grobid2json.py
@@ -1,28 +1,6 @@
 """
-NOTE: this file is DEPRECATED and will be removed soon
-
-NB: adapted to work as a library for PDF extraction. Will probably be
-re-written eventually to be correct, complete, and robust; this is just a
-first iteration.
-
-This script tries to extract everything from a GROBID TEI XML fulltext dump:
-
-- header metadata
-- affiliations
-- references (with context)
-- abstract
-- fulltext
-- tables, figures, equations
-
-A flag can be specified to disable copyright encumbered bits (--no-emcumbered):
-
-- abstract
-- fulltext
-- tables, figures, equations
-
-Prints JSON to stdout, errors to stderr
-
-This file copied from the sandcrawler repository.
+NOTE: this file is DEPRECATED. It is only here for testing backwards
+compatibility, and will be removed soon.
 """
 
 import io
diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py
index 284ceff..c65cbdf 100755
--- a/grobid_tei_xml/parse.py
+++ b/grobid_tei_xml/parse.py
@@ -10,6 +10,10 @@ ns = "http://www.tei-c.org/ns/1.0"
 
 
 def _string_to_tree(content: AnyStr) -> ET.ElementTree:
+    """
+    Helper to consistently parse XML into an ElementTree, whether provided as
+    str, bytes, wrapper thereof
+    """
     if isinstance(content, str):
         return ET.parse(io.StringIO(content))
     elif isinstance(content, bytes):
@@ -23,122 +27,181 @@
 def _parse_authors(elem: Optional[ET.Element], ns: str = ns) -> List[GrobidAuthor]:
+    """
+    Internal helper to parse one or more TEI 'author' XML tags into
+    GrobidAuthor objects. 'author' could appear in document headers or
+    citations.
+    """
     if not elem:
         return []
-    names = []
-    for author in elem.findall(f".//{{{ns}}}author"):
-        pn = author.find(f"./{{{ns}}}persName")
-        if not pn:
+
+    authors = []
+    for author_tag in elem.findall(f".//{{{ns}}}author"):
+        persname_tag = author_tag.find(f"./{{{ns}}}persName")
+        if persname_tag is None:
+            # should we do something else here? it is possible to have author
+            # without persName?
             continue
-        given_name = pn.findtext(f"./{{{ns}}}forename") or None
-        surname = pn.findtext(f"./{{{ns}}}surname") or None
-        full_name = " ".join([t.strip() for t in pn.itertext() if t.strip()]).strip()
-        obj: Dict[str, Any] = dict(name=full_name)
-        if given_name:
-            obj["given_name"] = given_name
-        if surname:
-            obj["surname"] = surname
-        ae = author.find(f"./{{{ns}}}affiliation")
-        if ae:
-            affiliation: Dict[str, Any] = dict()
-            for on in ae.findall(f"./{{{ns}}}orgName"):
-                on_type = on.get("type")
-                if on_type:
-                    affiliation[on_type] = on.text
-            addr_e = ae.find(f"./{{{ns}}}address")
-            if addr_e:
-                address = dict()
-                for t in list(addr_e):
-                    address[t.tag.split("}")[-1]] = t.text
-                if address:
-                    address['post_code'] = address.pop('postCode', None)
-                    affiliation["address"] = GrobidAddress(**address)
-                # previously:
-                # affiliation['address'] = {
-                #     'post_code': addr.findtext('./{%s}postCode' % ns) or None,
-                #     'settlement': addr.findtext('./{%s}settlement' % ns) or None,
-                #     'country': addr.findtext('./{%s}country' % ns) or None,
-                # }
-            obj["affiliation"] = GrobidAffiliation(**affiliation)
-        names.append(GrobidAuthor(**obj))
-    return names
+
+        # basic author name stuff
+        given_name = persname_tag.findtext(f"./{{{ns}}}forename") or None
+        surname = persname_tag.findtext(f"./{{{ns}}}surname") or None
+        # instead create full_name from all the sub-components of the tag
+        full_name = " ".join([t.strip() for t in persname_tag.itertext() if t.strip()]).strip()
+        ga = GrobidAuthor(
+            full_name=full_name or None,
+            given_name=given_name,
+            surname=surname,
+        )
+
+        # author affiliation
+        affiliation_tag = author_tag.find(f"./{{{ns}}}affiliation")
+        if affiliation_tag is not None:
+            affiliation_dict: Dict[str, Any] = dict()
+            for orgname_tag in affiliation_tag.findall(f"./{{{ns}}}orgName"):
+                orgname_type = orgname_tag.get("type")
+                if orgname_type:
+                    affiliation_dict[orgname_type] = orgname_tag.text or None
+            if affiliation_dict:
+                ga.affiliation = GrobidAffiliation(
+                    institution=affiliation_dict.get('institution'),
+                    department=affiliation_dict.get('department'),
+                    laboratory=affiliation_dict.get('laboratory'),
+                )
+                address_tag = affiliation_tag.find(f"./{{{ns}}}address")
+                if address_tag is not None:
+                    address_dict = dict()
+                    for t in list(address_tag):
+                        address_dict[t.tag.split("}")[-1]] = t.text or None
+                    if address_dict:
+                        ga.affiliation.address = GrobidAddress(
+                            addr_line=address_dict.get('addrLine'),
+                            post_code=address_dict.get('postCode'),
+                            settlement=address_dict.get('settlement'),
+                            country=address_dict.get('country'),
+                        )
+        authors.append(ga)
+
+    return authors
+
+
+def _clean_url(url: Optional[str]) -> Optional[str]:
+    if not url:
+        return None
+    url = url.strip()
+    if url.endswith(".Lastaccessed"):
+        url = url.replace(".Lastaccessed", "")
+    if url.startswith("<"):
+        url = url[1:]
+    if ">" in url:
+        url = url.split(">")[0]
+    return url or None
+
+
+def test_clean_url() -> None:
+    examples: List[dict] = [
+        dict(
+            dirty="https://archive.org/thing.pdf",
+            clean="https://archive.org/thing.pdf",
+        ),
+        dict(
+            dirty="https://archive.org/thing.pdf.Lastaccessed",
+            clean="https://archive.org/thing.pdf",
+        ),
+        dict(
+            dirty="<https://archive.org/thing.pdf>",
+            clean="https://archive.org/thing.pdf",
+        ),
+        dict(
+            dirty=" https://archive.org/thing.pdf>",
+            clean="https://archive.org/thing.pdf",
+        ),
+        dict(
+            dirty=" https://archive.org/thing.pdf>",
+            clean="https://archive.org/thing.pdf",
+        ),
+        dict(dirty="", clean=None),
+        dict(dirty=None, clean=None),
+    ]
+
+    for row in examples:
+        assert row['clean'] == _clean_url(row['dirty'])
 
 
 def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation:
-    ref: Dict[str, Any] = dict()
-    ref["id"] = elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id")
-    ref["unstructured"] = elem.findtext('.//{%s}note[@type="raw_reference"]' % ns)
-    # Title stuff is messy in references...
-    ref["title"] = elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title")
-    other_title = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title")
-    if other_title:
-        if ref["title"]:
-            ref["journal"] = other_title
-        else:
-            ref["journal"] = None
-            ref["title"] = other_title
-    ref["authors"] = _parse_authors(elem, ns=ns)
-    ref["publisher"] = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher")
-    if not ref["publisher"]:
-        ref["publisher"] = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher")
-    if ref["publisher"] == "":
-        ref["publisher"] = None
-    date = elem.find('.//{%s}date[@type="published"]' % ns)
-    ref["date"] = (date is not None) and date.attrib.get("when")
-    ref["volume"] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
-    ref["issue"] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
-    ref["doi"] = elem.findtext('.//{%s}idno[@type="DOI"]' % ns)
-    ref["arxiv_id"] = elem.findtext('.//{%s}idno[@type="arXiv"]' % ns)
-    if ref["arxiv_id"] and ref["arxiv_id"].startswith("arXiv:"):
-        ref["arxiv_id"] = ref["arxiv_id"][6:]
-    ref["pmcid"] = elem.findtext('.//{%s}idno[@type="PMCID"]' % ns)
-    ref["pmid"] = elem.findtext('.//{%s}idno[@type="PMID"]' % ns)
-    el = elem.find('.//{%s}biblScope[@unit="page"]' % ns)
+    """
+    Parses an entire TEI 'biblStruct' XML tag
+    """
+
+    citation = GrobidCitation(
+        id=elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id") or None,
+        title=elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") or None,
+        journal=elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") or None,
+        authors=_parse_authors(elem, ns=ns),
+        unstructured=elem.findtext(f'.//{{{ns}}}note[@type="raw_reference"]') or None,
+        volume=elem.findtext(f'.//{{{ns}}}biblScope[@unit="volume"]') or None,
+        issue=elem.findtext(f'.//{{{ns}}}biblScope[@unit="issue"]') or None,
+        arxiv_id=elem.findtext(f'.//{{{ns}}}idno[@type="arXiv"]') or None,
+        doi=elem.findtext(f'.//{{{ns}}}idno[@type="DOI"]') or None,
+        pmcid=elem.findtext(f'.//{{{ns}}}idno[@type="PMCID"]') or None,
+        pmid=elem.findtext(f'.//{{{ns}}}idno[@type="PMID"]') or None,
+    )
+
+    citation.publisher = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher")
+    if not citation.publisher:
+        citation.publisher = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher") or None
+
+    date_tag = elem.find(f'.//{{{ns}}}date[@type="published"]')
+    if date_tag is not None:
+        citation.date = date_tag.attrib.get("when") or None
+
+    # title stuff is messy in references...
+    if citation.journal and not citation.title:
+        citation.title = citation.journal
+        citation.journal = None
+
+    if citation.arxiv_id and citation.arxiv_id.startswith("arXiv:"):
+        citation.arxiv_id = citation.arxiv_id[6:]
+
+    el = elem.find(f'.//{{{ns}}}biblScope[@unit="page"]')
     if el is not None:
         if el.attrib.get("from") and el.attrib.get("to"):
-            ref["pages"] = "{}-{}".format(el.attrib["from"], el.attrib["to"])
+            citation.pages = "{}-{}".format(el.attrib["from"], el.attrib["to"])
         else:
-            ref["pages"] = el.text
-    el = elem.find(".//{%s}ptr[@target]" % ns)
+            citation.pages = el.text
+
+    el = elem.find(f".//{{{ns}}}ptr[@target]")
     if el is not None:
-        ref["url"] = el.attrib["target"]
-        # Hand correction
-        # TODO: move this elsewhere
-        if ref["url"].endswith(".Lastaccessed"):
-            ref["url"] = ref["url"].replace(".Lastaccessed", "")
-        if ref["url"].startswith("<"):
-            ref["url"] = ref["url"][1:]
-        if ">" in ref["url"]:
-            ref["url"] = ref["url"].split(">")[0]
-    else:
-        ref["url"] = None
-    return GrobidCitation(**ref)
+        citation.url = _clean_url(el.attrib["target"])
+
+    return citation
 
 
 def _parse_journal(elem: ET.Element, ns: str = ns) -> GrobidJournal:
-    journal = dict()
-    journal["name"] = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title")
-    journal["publisher"] = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher")
-    if journal["publisher"] == "":
-        journal["publisher"] = None
-    journal["issn"] = elem.findtext('.//{%s}idno[@type="ISSN"]' % ns)
-    journal["eissn"] = elem.findtext('.//{%s}idno[@type="eISSN"]' % ns)
-    journal["volume"] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
-    journal["issue"] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
-    journal["abbrev"] = None
-    return GrobidJournal(**journal)
+    journal = GrobidJournal(
+        name=elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") or None,
+        publisher=elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") or None,
+        issn=elem.findtext(f'.//{{{ns}}}idno[@type="ISSN"]') or None,
+        eissn=elem.findtext(f'.//{{{ns}}}idno[@type="eISSN"]') or None,
+        volume=elem.findtext(f'.//{{{ns}}}biblScope[@unit="volume"]') or None,
+        issue=elem.findtext(f'.//{{{ns}}}biblScope[@unit="issue"]') or None,
+        # XXX: abbrev
+        abbrev=None,
+    )
+    return journal
 
 
 def _parse_header(elem: ET.Element, ns: str = ns) -> GrobidHeader:
-    header = elem
-    info: Dict[str, Any] = dict()
-    info["title"] = header.findtext(f".//{{{ns}}}analytic/{{{ns}}}title")
-    info["authors"] = _parse_authors(header.find(f".//{{{ns}}}sourceDesc/{{{ns}}}biblStruct"))
-    info["journal"] = _parse_journal(header)
-    date = header.find(f'.//{{{ns}}}date[@type="published"]')
-    info["date"] = (date is not None) and date.attrib.get("when")
-    info["doi"] = header.findtext(f'.//{{{ns}}}idno[@type="DOI"]')
-    return GrobidHeader(**info)
+    header = GrobidHeader(
+        title=elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") or None,
+        authors=_parse_authors(elem.find(f".//{{{ns}}}sourceDesc/{{{ns}}}biblStruct")),
+        journal=_parse_journal(elem) or None,
+        doi=elem.findtext(f'.//{{{ns}}}idno[@type="DOI"]') or None,
+    )
+    date_tag = elem.find(f'.//{{{ns}}}date[@type="published"]')
+    if date_tag is not None:
+        header.date = date_tag.attrib.get("when") or None
+    return header
 
 
 def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
@@ -190,15 +253,19 @@ def parse_citations_xml(xml_text: AnyStr) -> List[GrobidCitation]:
     """
-    Use this function to parse TEI-XML of one or more references.
+    Use this function to parse TEI-XML of one or more references. This should
+    work with either /api/processCitation or /api/processCitationList API
+    responses from GROBID
 
-    Eg, the output of '/api/processReferences' or '/api/processCitation'.
+    Note that processed citations are usually returned as a bare XML tag, not a
+    full XML document, which means that the TEI xmlns is not set. This requires
+    a tweak to all downstream parsing code to handle documents with or without
+    the namespace.
     """
-    # XXX: this replacement shouldn't be needed?
     if isinstance(xml_text, bytes):
-        xml_text = xml_text.replace(b'xmlns="http://www.tei-c.org/ns/1.0"', b"")
+        xml_text = xml_text.replace(b'xmlns="http://www.tei-c.org/ns/1.0"', b'')
     elif isinstance(xml_text, str):
-        xml_text = xml_text.replace('xmlns="http://www.tei-c.org/ns/1.0"', "")
+        xml_text = xml_text.replace('xmlns="http://www.tei-c.org/ns/1.0"', '')
     tree = _string_to_tree(xml_text)
     root = tree.getroot()
diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py
index 9894bf5..b78b236 100644
--- a/grobid_tei_xml/types.py
+++ b/grobid_tei_xml/types.py
@@ -8,52 +8,125 @@ class GrobidAddress:
     post_code: Optional[str] = None
     settlement: Optional[str] = None
     country: Optional[str] = None
-    country_code: Optional[str] = None
+    country_code: Optional[str] = None  # XXX
 
 
 @dataclass
 class GrobidAffiliation:
-    address: Optional[GrobidAddress] = None
     institution: Optional[str] = None
     department: Optional[str] = None
     laboratory: Optional[str] = None
+    address: Optional[GrobidAddress] = None
 
 
 @dataclass
 class GrobidAuthor:
-    name: Optional[str]
-    # TODO: 'forename'?
+    full_name: Optional[str]
     given_name: Optional[str] = None
+    middle: Optional[str] = None  # XXX
     surname: Optional[str] = None
+    suffix: Optional[str] = None  # XXX
+    email: Optional[str] = None  # XXX
     affiliation: Optional[GrobidAffiliation] = None
 
+    def to_csl_dict(self) -> dict:
+        d = dict(
+            given=self.given_name,
+            family=self.surname,
+            suffix=self.suffix,
+        )
+        return _simplify_dict(d)
+
+
+def _csl_date(s: Optional[str]) -> Optional[list]:
+    if not s:
+        return None
+
+    # YYYY
+    if len(s) >= 4 and s[0:4].isdigit():
+        year = int(s[0:4])
+    else:
+        return None
+
+    # YYYY-MM
+    if len(s) >= 7 and s[4] == '-' and s[5:7].isdigit():
+        month = int(s[5:7])
+    else:
+        return [[year]]
+
+    # YYYY-MM-DD
+    if len(s) == 10 and s[7] == '-' and s[8:10].isdigit():
+        day = int(s[8:10])
+        return [[year, month, day]]
+    else:
+        return [[year, month]]
+
+
+def test_csl_date() -> None:
+    assert _csl_date("1998") == [[1998]]
+    assert _csl_date("1998-03") == [[1998, 3]]
+    assert _csl_date("1998-03-12") == [[1998, 3, 12]]
+    assert _csl_date("1998-blah") == [[1998]]
+    assert _csl_date("asdf") is None
+
 
 @dataclass
 class GrobidCitation:
     authors: List[GrobidAuthor]
+    index: Optional[int] = None
     id: Optional[str] = None
     date: Optional[str] = None
     issue: Optional[str] = None
-    journal: Optional[str] = None
+    journal: Optional[str] = None  # XXX: venue? other?
     publisher: Optional[str] = None
     title: Optional[str] = None
     url: Optional[str] = None
     volume: Optional[str] = None
     pages: Optional[str] = None
-    first_page: Optional[str] = None
-    last_page: Optional[str] = None
+    first_page: Optional[str] = None  # XXX
+    last_page: Optional[str] = None  # XXX
     unstructured: Optional[str] = None
-    # TODO: 'arxiv' for consistency?
     arxiv_id: Optional[str] = None
     doi: Optional[str] = None
     pmid: Optional[str] = None
     pmcid: Optional[str] = None
     oa_url: Optional[str] = None
+    note: Optional[str] = None
 
     def to_dict(self) -> dict:
         return _simplify_dict(asdict(self))
 
+    def to_csl_dict(self, default_type: str = "article-journal") -> dict:
+        """
+        Transforms in to Citation Style Language (CSL) JSON schema
+        """
+        csl = dict(
+            type=default_type,
+            author=[a.to_csl_dict() for a in self.authors],
+            issued=_csl_date(self.date),
+            publisher=self.publisher,
+            title=self.title,
+            page=self.pages,
+            URL=self.url,
+            DOI=self.doi,
+            PMID=self.pmid,
+            PMCID=self.pmcid,
+            note=self.note,
+            # fields with '-' in the key name
+            **{
+                "container-title": self.journal,
+                "page-first": self.first_page,
+            })
+
+        # numeric fields
+        if self.issue and self.issue.isdigit():
+            csl['issue'] = int(self.issue)
+        if self.volume and self.volume.isdigit():
+            csl['volume'] = int(self.volume)
+
+        return _simplify_dict(csl)
+
 
 @dataclass
 class GrobidJournal:
@@ -69,10 +142,10 @@ class GrobidJournal:
 
 
 @dataclass
 class GrobidHeader:
     authors: List[GrobidAuthor]
+    title: Optional[str] = None
     date: Optional[str] = None
     doi: Optional[str] = None
-    note: Optional[str] = None
     journal: Optional[GrobidJournal] = None
 
 
@@ -81,9 +154,10 @@ class GrobidDocument:
     grobid_version: str
     grobid_timestamp: str
     header: GrobidHeader
+    pdf_md5: Optional[str] = None
-    citations: Optional[List[GrobidCitation]] = None
     language_code: Optional[str] = None
+    citations: Optional[List[GrobidCitation]] = None
     abstract: Optional[str] = None
     body: Optional[str] = None
     acknowledgement: Optional[str] = None
@@ -108,12 +182,21 @@ class GrobidDocument:
         # all header fields at top-level
         d.update(d.pop('header', {}))
-        d.pop('note', None)
+
+        # files not in the old schema
         d.pop('pdf_md5', None)
+        for c in d.get('citations', []):
+            c.pop('note', None)
+
+        # author changes
         for a in d['authors']:
+            a['name'] = a.pop('full_name')
             addr = a.get('affiliation', {}).get('address')
             if addr and addr.get('post_code'):
                 addr['postCode'] = addr.pop('post_code')
+        for c in d['citations'] or []:
+            for a in c['authors']:
+                a['name'] = a.pop('full_name')
         return d
 
     def remove_encumbered(self) -> None:
diff --git a/tests/test_parse.py b/tests/test_parse.py
index 825b561..7749201 100644
--- a/tests/test_parse.py
+++ b/tests/test_parse.py
@@ -24,7 +24,7 @@ def test_small_xml() -> None:
         header=GrobidHeader(
             title="Dummy Example File",
             authors=[
-                GrobidAuthor(name="Brewster Kahle",
+                GrobidAuthor(full_name="Brewster Kahle",
                              given_name="Brewster",
                              surname="Kahle",
                              affiliation=GrobidAffiliation(
@@ -38,7 +38,7 @@ def test_small_xml() -> None:
                 ),
             )),
             GrobidAuthor(
-                name="J Doe",
+                full_name="J Doe",
                 given_name="J",
                 surname="Doe",
             ),
@@ -53,7 +53,9 @@ def test_small_xml() -> None:
         GrobidCitation(
             index=0,
             id="b0",
-            authors=[GrobidAuthor(name="A Seaperson", given_name="A", surname="Seaperson")],
+            authors=[
+                GrobidAuthor(full_name="A Seaperson", given_name="A", surname="Seaperson")
+            ],
             date="2001",
             journal="Letters in the Alphabet",
             title="Everything is Wonderful",
@@ -127,7 +129,7 @@ def test_example_grobid_tei_xml() -> None:
     """Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network"""
 
     ref = [c for c in doc.citations or [] if c.id == "b12"][0]
-    assert ref.authors[0].name == "K Tasa"
+    assert ref.authors[0].full_name == "K Tasa"
     assert ref.authors[0].given_name == "K"
     assert ref.authors[0].surname == "Tasa"
     assert ref.journal == "Quality Management in Health Care"
@@ -193,7 +195,7 @@ def test_single_citations_xml() -> None:
     assert d.title == """Mesh migration following abdominal hernia repair: a comprehensive review"""
     assert d.authors[2].given_name == "L"
    assert d.authors[2].surname == "Taveras"
-    assert d.authors[2].name == "L R Taveras"
+    assert d.authors[2].full_name == "L R Taveras"
     assert d.doi == "10.1007/s10029-019-01898-9"
     assert d.pmid == "30701369"
    assert d.date == "2019-01-30"
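
For orientation, here is a sketch of how the reworked citation entry point is meant to be called after this commit. The TEI snippet is invented for illustration; per the `parse_citations_xml()` docstring above, it is the bare, namespace-less `<biblStruct>` form that GROBID's `/api/processCitation` returns, and the import path is the one this diff shows:

```python
from grobid_tei_xml.parse import parse_citations_xml

# Hypothetical /api/processCitation response: a bare <biblStruct>
# without the TEI xmlns attribute. parse_citations_xml() strips the
# namespace up front, so a fully-namespaced document should work too.
raw_citation = """
<biblStruct xml:id="b0">
  <analytic>
    <title>Everything is Wonderful</title>
    <author>
      <persName><forename>A</forename><surname>Seaperson</surname></persName>
    </author>
  </analytic>
  <monogr>
    <title>Letters in the Alphabet</title>
    <imprint>
      <date type="published" when="2001"/>
      <biblScope unit="volume">20</biblScope>
    </imprint>
  </monogr>
</biblStruct>
"""

citations = parse_citations_xml(raw_citation)
first = citations[0]
# Expected, per _parse_citation() above: the <analytic> title becomes
# .title, the <monogr> title becomes .journal, and @when becomes .date
print(first.title, first.journal, first.date, first.volume)
```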
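
The `to_csl_dict()` helpers named in the commit message can then be layered on top. A hedged sketch, continuing from `first` above; the expected output is inferred from `to_csl_dict()` and `_csl_date()` as committed (and assumes `_simplify_dict()` drops None-valued keys, which is not shown in this diff):

```python
csl = first.to_csl_dict()

# With None-valued keys dropped, the sample citation should yield
# roughly:
#   {
#       "type": "article-journal",
#       "author": [{"given": "A", "family": "Seaperson"}],
#       "issued": [[2001]],
#       "title": "Everything is Wonderful",
#       "container-title": "Letters in the Alphabet",
#       "volume": 20,
#   }
print(csl)
```

Note that `_csl_date()` returns a bare date-parts list (`[[2001]]`) rather than the usual CSL-JSON `{"date-parts": [[2001]]}` wrapper, so strict CSL consumers may need to wrap `issued` themselves.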