4 files changed, 275 insertions, 145 deletions
diff --git a/grobid_tei_xml/grobid2json.py b/grobid_tei_xml/grobid2json.py
index 8946ab8..c10de7c 100644
--- a/grobid_tei_xml/grobid2json.py
+++ b/grobid_tei_xml/grobid2json.py
@@ -1,28 +1,6 @@
 """
-NOTE: this file is DEPRECATED and will be removed soon
-
-NB: adapted to work as a library for PDF extraction. Will probably be
-re-written eventually to be correct, complete, and robust; this is just a
-first iteration.
-
-This script tries to extract everything from a GROBID TEI XML fulltext dump:
-
-- header metadata
-- affiliations
-- references (with context)
-- abstract
-- fulltext
-- tables, figures, equations
-
-A flag can be specified to disable copyright encumbered bits (--no-emcumbered):
-
-- abstract
-- fulltext
-- tables, figures, equations
-
-Prints JSON to stdout, errors to stderr
-
-This file copied from the sandcrawler repository.
+NOTE: this file is DEPRECATED. It is only here for testing backwards
+compatibility, and will be removed soon.
 """
 
 import io
diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py
index 284ceff..c65cbdf 100755
--- a/grobid_tei_xml/parse.py
+++ b/grobid_tei_xml/parse.py
@@ -10,6 +10,10 @@ ns = "http://www.tei-c.org/ns/1.0"
 
 
 def _string_to_tree(content: AnyStr) -> ET.ElementTree:
+    """
+    Helper to consistently parse XML into an ElementTree, whether provided as
+    str, bytes, or a wrapper thereof.
+    """
     if isinstance(content, str):
         return ET.parse(io.StringIO(content))
     elif isinstance(content, bytes):
@@ -23,122 +27,181 @@ def _string_to_tree(content: AnyStr) -> ET.ElementTree:
 
 
 def _parse_authors(elem: Optional[ET.Element], ns: str = ns) -> List[GrobidAuthor]:
+    """
+    Internal helper to parse one or more TEI 'author' XML tags into
+    GrobidAuthor objects. 'author' could appear in document headers or
+    citations.
+    """
     if not elem:
         return []
-    names = []
-    for author in elem.findall(f".//{{{ns}}}author"):
-        pn = author.find(f"./{{{ns}}}persName")
-        if not pn:
+
+    authors = []
+    for author_tag in elem.findall(f".//{{{ns}}}author"):
+        persname_tag = author_tag.find(f"./{{{ns}}}persName")
+        if persname_tag is None:
+            # should we do something else here? is it possible to have an
+            # author without persName?
             continue
-        given_name = pn.findtext(f"./{{{ns}}}forename") or None
-        surname = pn.findtext(f"./{{{ns}}}surname") or None
-        full_name = " ".join([t.strip() for t in pn.itertext() if t.strip()]).strip()
-        obj: Dict[str, Any] = dict(name=full_name)
-        if given_name:
-            obj["given_name"] = given_name
-        if surname:
-            obj["surname"] = surname
-        ae = author.find(f"./{{{ns}}}affiliation")
-        if ae:
-            affiliation: Dict[str, Any] = dict()
-            for on in ae.findall(f"./{{{ns}}}orgName"):
-                on_type = on.get("type")
-                if on_type:
-                    affiliation[on_type] = on.text
-            addr_e = ae.find(f"./{{{ns}}}address")
-            if addr_e:
-                address = dict()
-                for t in list(addr_e):
-                    address[t.tag.split("}")[-1]] = t.text
-                if address:
-                    address['post_code'] = address.pop('postCode', None)
-                    affiliation["address"] = GrobidAddress(**address)
-                # previously:
-                # affiliation['address'] = {
-                #    'post_code': addr.findtext('./{%s}postCode' % ns) or None,
-                #    'settlement': addr.findtext('./{%s}settlement' % ns) or None,
-                #    'country': addr.findtext('./{%s}country' % ns) or None,
-                # }
-            obj["affiliation"] = GrobidAffiliation(**affiliation)
-        names.append(GrobidAuthor(**obj))
-    return names
+
+        # basic author name stuff
+        given_name = persname_tag.findtext(f"./{{{ns}}}forename") or None
+        surname = persname_tag.findtext(f"./{{{ns}}}surname") or None
+        # build full_name from all the text sub-components of the persName tag
+        full_name = " ".join([t.strip() for t in persname_tag.itertext() if t.strip()]).strip()
+        ga = GrobidAuthor(
+            full_name=full_name or None,
+            given_name=given_name,
+            surname=surname,
+        )
+
+        # author affiliation
+        affiliation_tag = author_tag.find(f"./{{{ns}}}affiliation")
+        if affiliation_tag is not None:
+            affiliation_dict: Dict[str, Any] = dict()
+            for orgname_tag in affiliation_tag.findall(f"./{{{ns}}}orgName"):
+                orgname_type = orgname_tag.get("type")
+                if orgname_type:
+                    affiliation_dict[orgname_type] = orgname_tag.text or None
+            if affiliation_dict:
+                ga.affiliation = GrobidAffiliation(
+                    institution=affiliation_dict.get('institution'),
+                    department=affiliation_dict.get('department'),
+                    laboratory=affiliation_dict.get('laboratory'),
+                )
+                address_tag = affiliation_tag.find(f"./{{{ns}}}address")
+                if address_tag is not None:
+                    address_dict = dict()
+                    for t in list(address_tag):
+                        address_dict[t.tag.split("}")[-1]] = t.text or None
+                    if address_dict:
+                        ga.affiliation.address = GrobidAddress(
+                            addr_line=address_dict.get('addrLine'),
+                            post_code=address_dict.get('postCode'),
+                            settlement=address_dict.get('settlement'),
+                            country=address_dict.get('country'),
+                        )
+        authors.append(ga)
+
+    return authors
+
+
+def _clean_url(url: Optional[str]) -> Optional[str]:
+    if not url:
+        return None
+    url = url.strip()
+    if url.endswith(".Lastaccessed"):
+        url = url.replace(".Lastaccessed", "")
+    if url.startswith("<"):
+        url = url[1:]
+    if ">" in url:
+        url = url.split(">")[0]
+    return url or None
+
+
+def test_clean_url() -> None:
+    examples: List[dict] = [
+        dict(
+            dirty="https://archive.org/thing.pdf",
+            clean="https://archive.org/thing.pdf",
+        ),
+        dict(
+            dirty="https://archive.org/thing.pdf.Lastaccessed",
+            clean="https://archive.org/thing.pdf",
+        ),
+        dict(
+            dirty="<https://archive.org/thing.pdf>",
+            clean="https://archive.org/thing.pdf",
+        ),
+        dict(
+            dirty="   https://archive.org/thing.pdf>",
+            clean="https://archive.org/thing.pdf",
+        ),
+        dict(
+            dirty="   https://archive.org/thing.pdf>",
+            clean="https://archive.org/thing.pdf",
+        ),
+        dict(dirty="", clean=None),
+        dict(dirty=None, clean=None),
+    ]
+
+    for row in examples:
+        assert row['clean'] == _clean_url(row['dirty'])
 
 
 def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation:
-    ref: Dict[str, Any] = dict()
-    ref["id"] = elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id")
-    ref["unstructured"] = elem.findtext('.//{%s}note[@type="raw_reference"]' % ns)
-    # Title stuff is messy in references...
-    ref["title"] = elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title")
-    other_title = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title")
-    if other_title:
-        if ref["title"]:
-            ref["journal"] = other_title
-        else:
-            ref["journal"] = None
-            ref["title"] = other_title
-    ref["authors"] = _parse_authors(elem, ns=ns)
-    ref["publisher"] = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher")
-    if not ref["publisher"]:
-        ref["publisher"] = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher")
-    if ref["publisher"] == "":
-        ref["publisher"] = None
-    date = elem.find('.//{%s}date[@type="published"]' % ns)
-    ref["date"] = (date is not None) and date.attrib.get("when")
-    ref["volume"] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
-    ref["issue"] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
-    ref["doi"] = elem.findtext('.//{%s}idno[@type="DOI"]' % ns)
-    ref["arxiv_id"] = elem.findtext('.//{%s}idno[@type="arXiv"]' % ns)
-    if ref["arxiv_id"] and ref["arxiv_id"].startswith("arXiv:"):
-        ref["arxiv_id"] = ref["arxiv_id"][6:]
-    ref["pmcid"] = elem.findtext('.//{%s}idno[@type="PMCID"]' % ns)
-    ref["pmid"] = elem.findtext('.//{%s}idno[@type="PMID"]' % ns)
-    el = elem.find('.//{%s}biblScope[@unit="page"]' % ns)
+    """
+    Parses an entire TEI 'biblStruct' XML tag
+    """
+
+    citation = GrobidCitation(
+        id=elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id") or None,
+        title=elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") or None,
+        journal=elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") or None,
+        authors=_parse_authors(elem, ns=ns),
+        unstructured=elem.findtext(f'.//{{{ns}}}note[@type="raw_reference"]') or None,
+        volume=elem.findtext(f'.//{{{ns}}}biblScope[@unit="volume"]') or None,
+        issue=elem.findtext(f'.//{{{ns}}}biblScope[@unit="issue"]') or None,
+        arxiv_id=elem.findtext(f'.//{{{ns}}}idno[@type="arXiv"]') or None,
+        doi=elem.findtext(f'.//{{{ns}}}idno[@type="DOI"]') or None,
+        pmcid=elem.findtext(f'.//{{{ns}}}idno[@type="PMCID"]') or None,
+        pmid=elem.findtext(f'.//{{{ns}}}idno[@type="PMID"]') or None,
+    )
+
+    citation.publisher = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher")
+    if not citation.publisher:
+        citation.publisher = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher") or None
+
+    date_tag = elem.find(f'.//{{{ns}}}date[@type="published"]')
+    if date_tag is not None:
+        citation.date = date_tag.attrib.get("when") or None
+
+    # title stuff is messy in references...
+    if citation.journal and not citation.title:
+        citation.title = citation.journal
+        citation.journal = None
+
+    if citation.arxiv_id and citation.arxiv_id.startswith("arXiv:"):
+        citation.arxiv_id = citation.arxiv_id[6:]
+
+    el = elem.find(f'.//{{{ns}}}biblScope[@unit="page"]')
     if el is not None:
         if el.attrib.get("from") and el.attrib.get("to"):
-            ref["pages"] = "{}-{}".format(el.attrib["from"], el.attrib["to"])
+            citation.pages = "{}-{}".format(el.attrib["from"], el.attrib["to"])
         else:
-            ref["pages"] = el.text
-    el = elem.find(".//{%s}ptr[@target]" % ns)
+            citation.pages = el.text
+
+    el = elem.find(f".//{{{ns}}}ptr[@target]")
     if el is not None:
-        ref["url"] = el.attrib["target"]
-        # Hand correction
-        # TODO: move this elsewhere
-        if ref["url"].endswith(".Lastaccessed"):
-            ref["url"] = ref["url"].replace(".Lastaccessed", "")
-        if ref["url"].startswith("<"):
-            ref["url"] = ref["url"][1:]
-        if ">" in ref["url"]:
-            ref["url"] = ref["url"].split(">")[0]
-    else:
-        ref["url"] = None
-    return GrobidCitation(**ref)
+        citation.url = _clean_url(el.attrib["target"])
+
+    return citation
 
 
 def _parse_journal(elem: ET.Element, ns: str = ns) -> GrobidJournal:
-    journal = dict()
-    journal["name"] = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title")
-    journal["publisher"] = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher")
-    if journal["publisher"] == "":
-        journal["publisher"] = None
-    journal["issn"] = elem.findtext('.//{%s}idno[@type="ISSN"]' % ns)
-    journal["eissn"] = elem.findtext('.//{%s}idno[@type="eISSN"]' % ns)
-    journal["volume"] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
-    journal["issue"] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
-    journal["abbrev"] = None
-    return GrobidJournal(**journal)
+    journal = GrobidJournal(
+        name=elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") or None,
+        publisher=elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") or None,
+        issn=elem.findtext(f'.//{{{ns}}}idno[@type="ISSN"]') or None,
+        eissn=elem.findtext(f'.//{{{ns}}}idno[@type="eISSN"]') or None,
+        volume=elem.findtext(f'.//{{{ns}}}biblScope[@unit="volume"]') or None,
+        issue=elem.findtext(f'.//{{{ns}}}biblScope[@unit="issue"]') or None,
+        # XXX: abbrev
+        abbrev=None,
+    )
+    return journal
 
 
 def _parse_header(elem: ET.Element, ns: str = ns) -> GrobidHeader:
-    header = elem
-    info: Dict[str, Any] = dict()
-    info["title"] = header.findtext(f".//{{{ns}}}analytic/{{{ns}}}title")
-    info["authors"] = _parse_authors(header.find(f".//{{{ns}}}sourceDesc/{{{ns}}}biblStruct"))
-    info["journal"] = _parse_journal(header)
-    date = header.find(f'.//{{{ns}}}date[@type="published"]')
-    info["date"] = (date is not None) and date.attrib.get("when")
-    info["doi"] = header.findtext(f'.//{{{ns}}}idno[@type="DOI"]')
-    return GrobidHeader(**info)
+    header = GrobidHeader(
+        title=elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") or None,
+        authors=_parse_authors(elem.find(f".//{{{ns}}}sourceDesc/{{{ns}}}biblStruct")),
+        journal=_parse_journal(elem) or None,
+        doi=elem.findtext(f'.//{{{ns}}}idno[@type="DOI"]') or None,
+    )
+    date_tag = elem.find(f'.//{{{ns}}}date[@type="published"]')
+    if date_tag is not None:
+        header.date = date_tag.attrib.get("when") or None
+    return header
 
 
 def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
@@ -190,15 +253,19 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
 
 def parse_citations_xml(xml_text: AnyStr) -> List[GrobidCitation]:
     """
-    Use this function to parse TEI-XML of one or more references.
+    Use this function to parse TEI-XML of one or more references. This should
+    work with either /api/processCitation or /api/processCitationList API
+    responses from GROBID.
 
-    Eg, the output of '/api/processReferences' or '/api/processCitation'.
+    Note that processed citations are usually returned as a bare XML tag, not a
+    full XML document, which means that the TEI xmlns is not set. This requires
+    a tweak to all downstream parsing code to handle documents with or without
+    the namespace.
     """
-    # XXX: this replacement shouldn't be needed?
     if isinstance(xml_text, bytes):
-        xml_text = xml_text.replace(b'xmlns="http://www.tei-c.org/ns/1.0"', b"")
+        xml_text = xml_text.replace(b'xmlns="http://www.tei-c.org/ns/1.0"', b'')
     elif isinstance(xml_text, str):
-        xml_text = xml_text.replace('xmlns="http://www.tei-c.org/ns/1.0"', "")
+        xml_text = xml_text.replace('xmlns="http://www.tei-c.org/ns/1.0"', '')
     tree = _string_to_tree(xml_text)
     root = tree.getroot()
 
diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py
index 9894bf5..b78b236 100644
--- a/grobid_tei_xml/types.py
+++ b/grobid_tei_xml/types.py
@@ -8,52 +8,125 @@ class GrobidAddress:
     post_code: Optional[str] = None
     settlement: Optional[str] = None
     country: Optional[str] = None
-    country_code: Optional[str] = None
+    country_code: Optional[str] = None  # XXX
 
 
 @dataclass
 class GrobidAffiliation:
-    address: Optional[GrobidAddress] = None
     institution: Optional[str] = None
     department: Optional[str] = None
     laboratory: Optional[str] = None
+    address: Optional[GrobidAddress] = None
 
 
 @dataclass
 class GrobidAuthor:
-    name: Optional[str]
-    # TODO: 'forename'?
+    full_name: Optional[str]
     given_name: Optional[str] = None
+    middle: Optional[str] = None  # XXX
     surname: Optional[str] = None
+    suffix: Optional[str] = None  # XXX
+    email: Optional[str] = None  # XXX
     affiliation: Optional[GrobidAffiliation] = None
 
+    def to_csl_dict(self) -> dict:
+        d = dict(
+            given=self.given_name,
+            family=self.surname,
+            suffix=self.suffix,
+        )
+        return _simplify_dict(d)
+
+
+def _csl_date(s: Optional[str]) -> Optional[list]:
+    if not s:
+        return None
+
+    # YYYY
+    if len(s) >= 4 and s[0:4].isdigit():
+        year = int(s[0:4])
+    else:
+        return None
+
+    # YYYY-MM
+    if len(s) >= 7 and s[4] == '-' and s[5:7].isdigit():
+        month = int(s[5:7])
+    else:
+        return [[year]]
+
+    # YYYY-MM-DD
+    if len(s) == 10 and s[7] == '-' and s[8:10].isdigit():
+        day = int(s[8:10])
+        return [[year, month, day]]
+    else:
+        return [[year, month]]
+
+
+def test_csl_date() -> None:
+    assert _csl_date("1998") == [[1998]]
+    assert _csl_date("1998-03") == [[1998, 3]]
+    assert _csl_date("1998-03-12") == [[1998, 3, 12]]
+    assert _csl_date("1998-blah") == [[1998]]
+    assert _csl_date("asdf") is None
+
 
 @dataclass
 class GrobidCitation:
     authors: List[GrobidAuthor]
+
     index: Optional[int] = None
     id: Optional[str] = None
     date: Optional[str] = None
     issue: Optional[str] = None
-    journal: Optional[str] = None
+    journal: Optional[str] = None  # XXX: venue? other?
     publisher: Optional[str] = None
     title: Optional[str] = None
     url: Optional[str] = None
     volume: Optional[str] = None
     pages: Optional[str] = None
-    first_page: Optional[str] = None
-    last_page: Optional[str] = None
+    first_page: Optional[str] = None  # XXX
+    last_page: Optional[str] = None  # XXX
     unstructured: Optional[str] = None
-    # TODO: 'arxiv' for consistency?
     arxiv_id: Optional[str] = None
     doi: Optional[str] = None
     pmid: Optional[str] = None
     pmcid: Optional[str] = None
     oa_url: Optional[str] = None
+    note: Optional[str] = None
 
     def to_dict(self) -> dict:
         return _simplify_dict(asdict(self))
 
+    def to_csl_dict(self, default_type: str = "article-journal") -> dict:
+        """
+        Transforms into the Citation Style Language (CSL) JSON schema
+        """
+        csl = dict(
+            type=default_type,
+            author=[a.to_csl_dict() for a in self.authors],
+            issued=_csl_date(self.date),
+            publisher=self.publisher,
+            title=self.title,
+            page=self.pages,
+            URL=self.url,
+            DOI=self.doi,
+            PMID=self.pmid,
+            PMCID=self.pmcid,
+            note=self.note,
+            # fields with '-' in the key name
+            **{
+                "container-title": self.journal,
+                "page-first": self.first_page,
+            })
+
+        # numeric fields
+        if self.issue and self.issue.isdigit():
+            csl['issue'] = int(self.issue)
+        if self.volume and self.volume.isdigit():
+            csl['volume'] = int(self.volume)
+
+        return _simplify_dict(csl)
+
 
 @dataclass
 class GrobidJournal:
@@ -69,10 +142,10 @@ class GrobidJournal:
 @dataclass
 class GrobidHeader:
     authors: List[GrobidAuthor]
+
     title: Optional[str] = None
     date: Optional[str] = None
     doi: Optional[str] = None
-    note: Optional[str] = None
     journal: Optional[GrobidJournal] = None
 
 
@@ -81,9 +154,10 @@ class GrobidDocument:
     grobid_version: str
     grobid_timestamp: str
     header: GrobidHeader
+
     pdf_md5: Optional[str] = None
-    citations: Optional[List[GrobidCitation]] = None
     language_code: Optional[str] = None
+    citations: Optional[List[GrobidCitation]] = None
     abstract: Optional[str] = None
     body: Optional[str] = None
     acknowledgement: Optional[str] = None
@@ -108,12 +182,21 @@ class GrobidDocument:
 
         # all header fields at top-level
         d.update(d.pop('header', {}))
-        d.pop('note', None)
+
+        # fields not in the old schema
         d.pop('pdf_md5', None)
+        for c in d.get('citations', []):
+            c.pop('note', None)
+
+        # author changes
         for a in d['authors']:
+            a['name'] = a.pop('full_name')
             addr = a.get('affiliation', {}).get('address')
             if addr and addr.get('post_code'):
                 addr['postCode'] = addr.pop('post_code')
+        for c in d['citations'] or []:
+            for a in c['authors']:
+                a['name'] = a.pop('full_name')
         return d
 
     def remove_encumbered(self) -> None:
diff --git a/tests/test_parse.py b/tests/test_parse.py
index 825b561..7749201 100644
--- a/tests/test_parse.py
+++ b/tests/test_parse.py
@@ -24,7 +24,7 @@ def test_small_xml() -> None:
         header=GrobidHeader(
             title="Dummy Example File",
             authors=[
-                GrobidAuthor(name="Brewster Kahle",
+                GrobidAuthor(full_name="Brewster Kahle",
                              given_name="Brewster",
                              surname="Kahle",
                              affiliation=GrobidAffiliation(
@@ -38,7 +38,7 @@ def test_small_xml() -> None:
                                  ),
                              )),
                 GrobidAuthor(
-                    name="J Doe",
+                    full_name="J Doe",
                     given_name="J",
                     surname="Doe",
                 ),
@@ -53,7 +53,9 @@ def test_small_xml() -> None:
             GrobidCitation(
                 index=0,
                 id="b0",
-                authors=[GrobidAuthor(name="A Seaperson", given_name="A", surname="Seaperson")],
+                authors=[
+                    GrobidAuthor(full_name="A Seaperson", given_name="A", surname="Seaperson")
+                ],
                 date="2001",
                 journal="Letters in the Alphabet",
                 title="Everything is Wonderful",
@@ -127,7 +129,7 @@ def test_example_grobid_tei_xml() -> None:
         """Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network"""
 
     ref = [c for c in doc.citations or [] if c.id == "b12"][0]
-    assert ref.authors[0].name == "K Tasa"
+    assert ref.authors[0].full_name == "K Tasa"
     assert ref.authors[0].given_name == "K"
     assert ref.authors[0].surname == "Tasa"
     assert ref.journal == "Quality Management in Health Care"
@@ -193,7 +195,7 @@ def test_single_citations_xml() -> None:
     assert d.title == """Mesh migration following abdominal hernia repair: a comprehensive review"""
     assert d.authors[2].given_name == "L"
     assert d.authors[2].surname == "Taveras"
-    assert d.authors[2].name == "L R Taveras"
+    assert d.authors[2].full_name == "L R Taveras"
     assert d.doi == "10.1007/s10029-019-01898-9"
     assert d.pmid == "30701369"
     assert d.date == "2019-01-30"