summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2021-10-25 17:02:14 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-10-25 17:02:14 -0700
commit: 8e68f3cd4cc3d2b6b166bf87811bb4db82cc7573 (patch)
tree: 3b71338fcae115c834097b16cbfe051fb88c7684
parent: fcdb271193ca2c6b90eeeb5f4af4bbc15083319a (diff)
download: grobid_tei_xml-8e68f3cd4cc3d2b6b166bf87811bb4db82cc7573.tar.gz
download: grobid_tei_xml-8e68f3cd4cc3d2b6b166bf87811bb4db82cc7573.zip
more test coverage and comments
-rwxr-xr-x  grobid_tei_xml/parse.py  6
-rw-r--r--  grobid_tei_xml/types.py  4
-rw-r--r--  tests/files/example_citation_list.xml  80
-rw-r--r--  tests/test_parse.py  38
4 files changed, 124 insertions, 4 deletions
diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py
index da7ed97..66e4e72 100755
--- a/grobid_tei_xml/parse.py
+++ b/grobid_tei_xml/parse.py
@@ -179,6 +179,7 @@ def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio:
biblio.book_title = book_title_tag.text
if biblio.book_title and not biblio.title:
biblio.title = biblio.book_title
+ biblio.book_title = None
note_tag = elem.find(f'.//{{{ns}}}note')
if note_tag is not None and note_tag.attrib.get('type') is None:
@@ -209,6 +210,11 @@ def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio:
if el is not None:
biblio.url = _clean_url(el.attrib["target"])
+ # having DOI and a DOI URL is redundant
+ if biblio.doi and biblio.url:
+ if ('://doi.org/' in biblio.url) or ('://dx.doi.org/' in biblio.url):
+ biblio.url = None
+
return biblio
diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py
index 252e677..08be47a 100644
--- a/grobid_tei_xml/types.py
+++ b/grobid_tei_xml/types.py
@@ -24,8 +24,8 @@ class GrobidAuthor:
given_name: Optional[str] = None
middle_name: Optional[str] = None
surname: Optional[str] = None
- email: Optional[str] = None # XXX
- orcid: Optional[str] = None # XXX
+ email: Optional[str] = None # TODO: test coverage
+ orcid: Optional[str] = None # TODO: test coverage
affiliation: Optional[GrobidAffiliation] = None
def to_csl_dict(self) -> dict:
diff --git a/tests/files/example_citation_list.xml b/tests/files/example_citation_list.xml
index d640393..218fa46 100644
--- a/tests/files/example_citation_list.xml
+++ b/tests/files/example_citation_list.xml
@@ -270,6 +270,86 @@
</monogr>
</biblStruct>
+<biblStruct xml:id="b10">
+ <analytic>
+ <title level="a" type="main">Heart failure, chronic diuretic use, and increase in mortality and hospitalization: an observational study using propensity score methods</title>
+ <author>
+ <persName><forename type="first">A</forename><surname>Ahmed</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">A</forename><surname>Husain</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">T</forename><forename type="middle">E</forename><surname>Love</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">G</forename><surname>Gambassi</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">L</forename><forename type="middle">J</forename><surname>Dell’italia</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">G</forename><forename type="middle">S</forename><surname>Francis</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">M</forename><surname>Gheorghiade</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">R</forename><forename type="middle">M</forename><surname>Allman</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">S</forename><surname>Meleth</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">R</forename><forename type="middle">C</forename><surname>Bourge</surname></persName>
+ </author>
+ <idno type="DOI">10.1093/eurheartj/ehi890</idno>
+ <ptr target="https://doi.org/10.1093/eurheartj/ehi890" />
+ </analytic>
+ <monogr>
+ <title level="j">Eur Heart J</title>
+ <imprint>
+ <biblScope unit="volume">27</biblScope>
+ <biblScope unit="issue">12</biblScope>
+ <biblScope unit="page" from="1431" to="1439" />
+ <date type="published" when="2006" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b11">
+ <analytic>
+ <author>
+ <persName><forename type="first">Michael</forename><surname>Bass</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">Devices, Measurements and Properties</title>
+ <title level="s">Handbook of Optics</title>
+ <imprint>
+ <publisher>McGRAW-HILL</publisher>
+ <date type="published" when="1995" />
+ <biblScope unit="volume">2</biblScope>
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b12">
+ <analytic>
+ <title level="a" type="main">Implications of abandoned shoreline features above Glacial Lake Duluth levels along the north shore of the Superior Basin in the vicinity of the Brule River</title>
+ </analytic>
+ <monogr>
+ <title level="m">Paper presented at the 13th Biennial Meeting of the American Quaternary Association</title>
+ <meeting><address><addrLine>Minneapolis</addrLine></address></meeting>
+ <imprint>
+ <date type="published" when="1994-06" />
+ </imprint>
+ <respStmt>
+ <orgName>University of Minnesota</orgName>
+ </respStmt>
+ </monogr>
+</biblStruct>
+
</listBibl>
</div>
</back>
diff --git a/tests/test_parse.py b/tests/test_parse.py
index eb4b46e..976d1b1 100644
--- a/tests/test_parse.py
+++ b/tests/test_parse.py
@@ -215,14 +215,34 @@ def test_citation_list_xml() -> None:
tei_xml = f.read()
citations = parse_citations_xml(tei_xml)
- assert len(citations) == 10
- assert citations[7].title == "Global Hunger Index: The Challenge of Hidden Hunger"
+ assert len(citations) == 13
assert citations[3].note == "The Research Handbook on International Environmental Law"
assert citations[3].authors[0].surname == "Uhlířová"
+ assert citations[3].authors[1].surname == "Drumbl"
+ assert citations[3].editors
+ assert citations[3].editors[0].surname == "Fitzmaurice"
+ # TODO: multiple persName under a single <editor> (https://github.com/kermitt2/grobid/issues/845)
+ # assert citations[3].editors[1].surname == "Brus"
+
assert citations[4].authors[0].surname == "Sleytr"
assert citations[4].authors[0].middle_name == "B"
+ assert citations[7].title == "Global Hunger Index: The Challenge of Hidden Hunger"
+
+ assert citations[10].doi == "10.1093/eurheartj/ehi890"
+ assert citations[10].url is None
+
+ assert citations[11].title == "Devices, Measurements and Properties"
+ assert citations[11].series_title == "Handbook of Optics"
+ assert citations[11].publisher == "McGRAW-HILL"
+
+ assert citations[
+ 12].title == "Implications of abandoned shoreline features above Glacial Lake Duluth levels along the north shore of the Superior Basin in the vicinity of the Brule River"
+ assert citations[
+ 12].book_title == "Paper presented at the 13th Biennial Meeting of the American Quaternary Association"
+ assert citations[12].institution == "University of Minnesota"
+
def test_grobid_070_document() -> None:
# more recent GROBID v0.7.0 output
@@ -245,6 +265,20 @@ def test_grobid_070_document() -> None:
assert cite_b6.volume == "574"
assert cite_b6.issue == "1"
+ cite_b3 = doc.citations[3]
+ assert cite_b3.url == "http://unesdoc.unesco.org/ulis/"
+ assert cite_b3.title == "Requirements for Global Implementation of the Strategic Plan for Coastal GOOS"
+ assert cite_b3.authors
+ assert cite_b3.authors[0].surname == "Ioc-Unesco"
+ assert cite_b3.date == "2012"
+
+ cite_b18 = doc.citations[18]
+ assert cite_b18.note == "TriOS GmbH [Internet"
+ assert cite_b18.date == "2017-01-05"
+
+ cite_b29 = doc.citations[29]
+ assert cite_b29.note == "PhD dissertation"
+
# run these methods over some more examples
for c in doc.citations:
c.to_csl_dict()