diff options
-rwxr-xr-x | grobid_tei_xml/parse.py | 6 |
-rw-r--r-- | grobid_tei_xml/types.py | 4 |
-rw-r--r-- | tests/files/example_citation_list.xml | 80 |
-rw-r--r-- | tests/test_parse.py | 38 |
4 files changed, 124 insertions, 4 deletions
diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py index da7ed97..66e4e72 100755 --- a/grobid_tei_xml/parse.py +++ b/grobid_tei_xml/parse.py @@ -179,6 +179,7 @@ def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio: biblio.book_title = book_title_tag.text if biblio.book_title and not biblio.title: biblio.title = biblio.book_title + biblio.book_title = None note_tag = elem.find(f'.//{{{ns}}}note') if note_tag is not None and note_tag.attrib.get('type') is None: @@ -209,6 +210,11 @@ def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio: if el is not None: biblio.url = _clean_url(el.attrib["target"]) + # having DOI and a DOI URL is redundant + if biblio.doi and biblio.url: + if ('://doi.org/' in biblio.url) or ('://dx.doi.org/' in biblio.url): + biblio.url = None + return biblio diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py index 252e677..08be47a 100644 --- a/grobid_tei_xml/types.py +++ b/grobid_tei_xml/types.py @@ -24,8 +24,8 @@ class GrobidAuthor: given_name: Optional[str] = None middle_name: Optional[str] = None surname: Optional[str] = None - email: Optional[str] = None # XXX - orcid: Optional[str] = None # XXX + email: Optional[str] = None # TODO: test coverage + orcid: Optional[str] = None # TODO: test coverage affiliation: Optional[GrobidAffiliation] = None def to_csl_dict(self) -> dict: diff --git a/tests/files/example_citation_list.xml b/tests/files/example_citation_list.xml index d640393..218fa46 100644 --- a/tests/files/example_citation_list.xml +++ b/tests/files/example_citation_list.xml @@ -270,6 +270,86 @@ </monogr> </biblStruct> +<biblStruct xml:id="b10"> + <analytic> + <title level="a" type="main">Heart failure, chronic diuretic use, and increase in mortality and hospitalization: an observational study using propensity score methods</title> + <author> + <persName><forename type="first">A</forename><surname>Ahmed</surname></persName> + </author> + <author> + <persName><forename 
type="first">A</forename><surname>Husain</surname></persName> + </author> + <author> + <persName><forename type="first">T</forename><forename type="middle">E</forename><surname>Love</surname></persName> + </author> + <author> + <persName><forename type="first">G</forename><surname>Gambassi</surname></persName> + </author> + <author> + <persName><forename type="first">L</forename><forename type="middle">J</forename><surname>Dell’italia</surname></persName> + </author> + <author> + <persName><forename type="first">G</forename><forename type="middle">S</forename><surname>Francis</surname></persName> + </author> + <author> + <persName><forename type="first">M</forename><surname>Gheorghiade</surname></persName> + </author> + <author> + <persName><forename type="first">R</forename><forename type="middle">M</forename><surname>Allman</surname></persName> + </author> + <author> + <persName><forename type="first">S</forename><surname>Meleth</surname></persName> + </author> + <author> + <persName><forename type="first">R</forename><forename type="middle">C</forename><surname>Bourge</surname></persName> + </author> + <idno type="DOI">10.1093/eurheartj/ehi890</idno> + <ptr target="https://doi.org/10.1093/eurheartj/ehi890" /> + </analytic> + <monogr> + <title level="j">Eur Heart J</title> + <imprint> + <biblScope unit="volume">27</biblScope> + <biblScope unit="issue">12</biblScope> + <biblScope unit="page" from="1431" to="1439" /> + <date type="published" when="2006" /> + </imprint> + </monogr> +</biblStruct> + +<biblStruct xml:id="b11"> + <analytic> + <author> + <persName><forename type="first">Michael</forename><surname>Bass</surname></persName> + </author> + </analytic> + <monogr> + <title level="m">Devices, Measurements and Properties</title> + <title level="s">Handbook of Optics</title> + <imprint> + <publisher>McGRAW-HILL</publisher> + <date type="published" when="1995" /> + <biblScope unit="volume">2</biblScope> + </imprint> + </monogr> +</biblStruct> + +<biblStruct 
xml:id="b12"> + <analytic> + <title level="a" type="main">Implications of abandoned shoreline features above Glacial Lake Duluth levels along the north shore of the Superior Basin in the vicinity of the Brule River</title> + </analytic> + <monogr> + <title level="m">Paper presented at the 13th Biennial Meeting of the American Quaternary Association</title> + <meeting><address><addrLine>Minneapolis</addrLine></address></meeting> + <imprint> + <date type="published" when="1994-06" /> + </imprint> + <respStmt> + <orgName>University of Minnesota</orgName> + </respStmt> + </monogr> +</biblStruct> + </listBibl> </div> </back> diff --git a/tests/test_parse.py b/tests/test_parse.py index eb4b46e..976d1b1 100644 --- a/tests/test_parse.py +++ b/tests/test_parse.py @@ -215,14 +215,34 @@ def test_citation_list_xml() -> None: tei_xml = f.read() citations = parse_citations_xml(tei_xml) - assert len(citations) == 10 - assert citations[7].title == "Global Hunger Index: The Challenge of Hidden Hunger" + assert len(citations) == 13 assert citations[3].note == "The Research Handbook on International Environmental Law" assert citations[3].authors[0].surname == "Uhlířová" + assert citations[3].authors[1].surname == "Drumbl" + assert citations[3].editors + assert citations[3].editors[0].surname == "Fitzmaurice" + # TODO: multiple persName under a single <editor> (https://github.com/kermitt2/grobid/issues/845) + # assert citations[3].editors[1].surname == "Brus" + assert citations[4].authors[0].surname == "Sleytr" assert citations[4].authors[0].middle_name == "B" + assert citations[7].title == "Global Hunger Index: The Challenge of Hidden Hunger" + + assert citations[10].doi == "10.1093/eurheartj/ehi890" + assert citations[10].url is None + + assert citations[11].title == "Devices, Measurements and Properties" + assert citations[11].series_title == "Handbook of Optics" + assert citations[11].publisher == "McGRAW-HILL" + + assert citations[ + 12].title == "Implications of abandoned 
shoreline features above Glacial Lake Duluth levels along the north shore of the Superior Basin in the vicinity of the Brule River" + assert citations[ + 12].book_title == "Paper presented at the 13th Biennial Meeting of the American Quaternary Association" + assert citations[12].institution == "University of Minnesota" + def test_grobid_070_document() -> None: # more recent GROBID v0.7.0 output @@ -245,6 +265,20 @@ def test_grobid_070_document() -> None: assert cite_b6.volume == "574" assert cite_b6.issue == "1" + cite_b3 = doc.citations[3] + assert cite_b3.url == "http://unesdoc.unesco.org/ulis/" + assert cite_b3.title == "Requirements for Global Implementation of the Strategic Plan for Coastal GOOS" + assert cite_b3.authors + assert cite_b3.authors[0].surname == "Ioc-Unesco" + assert cite_b3.date == "2012" + + cite_b18 = doc.citations[18] + assert cite_b18.note == "TriOS GmbH [Internet" + assert cite_b18.date == "2017-01-05" + + cite_b29 = doc.citations[29] + assert cite_b29.note == "PhD dissertation" + # run these methods over some more examples for c in doc.citations: c.to_csl_dict() |