def test_author_email() -> None:
    """Check that the first author's email is extracted from a TEI header.

    Parses the ``author_email.tei.xml`` fixture with ``parse_document_xml``
    and asserts on:

    * the header: title, plus the first author's name, email and affiliation
      (institution, settlement, country);
    * document-level metadata: PDF MD5, GROBID version and timestamp,
      language code, and the start of the abstract;
    * the absence of citations, body, acknowledgement and annex.
    """
    # Decode explicitly instead of relying on the locale's default encoding,
    # so the fixture is read the same way on every platform.
    # NOTE(review): assumes the fixture is UTF-8 encoded — confirm.
    with open(
        "tests/files/document/author_email.tei.xml", "r", encoding="utf-8"
    ) as f:
        tei_xml = f.read()

    doc = parse_document_xml(tei_xml)
    biblio = doc.header
    assert biblio
    assert biblio.title == "Task-Based Intelligent Retrieval and Recommendation"
    assert biblio.authors

    # Only the first author is inspected by this test.
    author = biblio.authors[0]
    assert author.given_name == "Chirag"
    assert author.surname == "Shah"
    assert author.email == "redacted@example.com"
    assert author.affiliation
    assert author.affiliation.institution == "University of Washington"
    assert author.affiliation.address
    assert author.affiliation.address.settlement == "Seattle"
    assert author.affiliation.address.country == "USA"

    assert doc.pdf_md5 == "6C18173427FE3FAD756BB2F4F7665855"
    assert doc.grobid_version == "0.7.1-SNAPSHOT"
    assert doc.grobid_timestamp == "2021-11-02T09:03+0000"
    assert doc.language_code == "en"
    assert doc.abstract
    # Both sides are sliced, so only the first 50 characters are compared.
    assert doc.abstract[:50] == "While the act of looking for information happens within a"[:50]
    assert doc.citations == []
    assert doc.body is None
    assert doc.acknowledgement is None
    assert doc.annex is None