From 1f57527aa621525d46e9ddbbd4bab2682df8d67e Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 3 Nov 2021 20:14:17 -0700 Subject: add a test for author email extraction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The recent refactor fixed email extraction. Thanks to Seán Healy for reporting and providing a test case. --- tests/files/document/author_email.tei.xml | 70 +++++++++++++++++++++++++++++++ tests/test_parse.py | 30 +++++++++++++ 2 files changed, 100 insertions(+) create mode 100644 tests/files/document/author_email.tei.xml diff --git a/tests/files/document/author_email.tei.xml b/tests/files/document/author_email.tei.xml new file mode 100644 index 0000000..f342fac --- /dev/null +++ b/tests/files/document/author_email.tei.xml @@ -0,0 +1,70 @@ + + + + + + Task-Based Intelligent Retrieval and Recommendation + + + + + + + + + + ChiragShah + redacted@example.com + + University of Washington +
+ Seattle + USA +
+
+
+ Task-Based Intelligent Retrieval and Recommendation +
+ + + + + + 6C18173427FE3FAD756BB2F4F7665855 +
+
+
+ + + + GROBID - A machine learning software for extracting information from scholarly documents + + + + + + + + Task-based IR + Recommendation systems + Information Fostering + + + +

While the act of looking for information happens within a context of a task from the user side, most search and recommendation systems focus on user actions ('what'), ignoring the nature of the task that covers the process ('how') and user intent ('why'). For long, scholars have argued that IR systems should help users accomplish their tasks and not just fulfill a search request. But just as keywords have been good enough approximators for information need, satisfying a set of search requests has been deemed to be good enough to address the task. However, with changing user behaviors and search modalities, specifically found in conversational interfaces, the challenge and opportunity to focus on task have become critically important and central to IR. In this talk, I will discuss some of the key ideas and recent works—both theoretical and empirical—to study and support aspects of task. I will show how we could derive user's search path or strategy and intentions, and how they could be instrumental in not only creating more personalized search and recommendation solutions, but also solving problems not possible otherwise. Finally, I will extend this to the realm of intelligent assistants with our recent work in a new area called Information Fostering, where our knowledge of the user and the task can help us address another classical problem in IR—people don't know what they don't know.

+
+
+
+ + + +
+ + +
+
+
+
diff --git a/tests/test_parse.py b/tests/test_parse.py index 203c960..9d8f4ff 100644 --- a/tests/test_parse.py +++ b/tests/test_parse.py @@ -410,3 +410,33 @@ def test_citation_multiple_editors() -> None: assert ref.publisher == "Williams and Wilkins" assert ref.date == "1996" assert ref.note == "12th ed. Baltimore" + + +def test_author_email() -> None: + with open("tests/files/document/author_email.tei.xml", "r") as f: + tei_xml = f.read() + + doc = parse_document_xml(tei_xml) + biblio = doc.header + assert biblio + assert biblio.title == "Task-Based Intelligent Retrieval and Recommendation" + assert biblio.authors + assert biblio.authors[0].given_name == "Chirag" + assert biblio.authors[0].surname == "Shah" + assert biblio.authors[0].email == "redacted@example.com" + assert biblio.authors[0].affiliation + assert biblio.authors[0].affiliation.institution == "University of Washington" + assert biblio.authors[0].affiliation.address + assert biblio.authors[0].affiliation.address.settlement == "Seattle" + assert biblio.authors[0].affiliation.address.country == "USA" + + assert doc.pdf_md5 == "6C18173427FE3FAD756BB2F4F7665855" + assert doc.grobid_version == "0.7.1-SNAPSHOT" + assert doc.grobid_timestamp == "2021-11-02T09:03+0000" + assert doc.language_code == "en" + assert doc.abstract + assert doc.abstract[:50] == "While the act of looking for information happens within a"[:50] + assert doc.citations == [] + assert doc.body is None + assert doc.acknowledgement is None + assert doc.annex is None -- cgit v1.2.3