diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-25 17:02:14 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-25 17:02:14 -0700 |
commit | 8e68f3cd4cc3d2b6b166bf87811bb4db82cc7573 (patch) | |
tree | 3b71338fcae115c834097b16cbfe051fb88c7684 /grobid_tei_xml | |
parent | fcdb271193ca2c6b90eeeb5f4af4bbc15083319a (diff) | |
download | grobid_tei_xml-8e68f3cd4cc3d2b6b166bf87811bb4db82cc7573.tar.gz grobid_tei_xml-8e68f3cd4cc3d2b6b166bf87811bb4db82cc7573.zip |
more test coverage and comments
Diffstat (limited to 'grobid_tei_xml')
-rwxr-xr-x | grobid_tei_xml/parse.py | 6 | ||||
-rw-r--r-- | grobid_tei_xml/types.py | 4 |
2 files changed, 8 insertions, 2 deletions
diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py
index da7ed97..66e4e72 100755
--- a/grobid_tei_xml/parse.py
+++ b/grobid_tei_xml/parse.py
@@ -179,6 +179,7 @@ def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio:
         biblio.book_title = book_title_tag.text
         if biblio.book_title and not biblio.title:
             biblio.title = biblio.book_title
+            biblio.book_title = None

     note_tag = elem.find(f'.//{{{ns}}}note')
     if note_tag is not None and note_tag.attrib.get('type') is None:
@@ -209,6 +210,11 @@ def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio:
     if el is not None:
         biblio.url = _clean_url(el.attrib["target"])

+    # having DOI and a DOI URL is redundant
+    if biblio.doi and biblio.url:
+        if ('://doi.org/' in biblio.url) or ('://dx.doi.org/' in biblio.url):
+            biblio.url = None
+
     return biblio
diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py
index 252e677..08be47a 100644
--- a/grobid_tei_xml/types.py
+++ b/grobid_tei_xml/types.py
@@ -24,8 +24,8 @@ class GrobidAuthor:
     given_name: Optional[str] = None
     middle_name: Optional[str] = None
     surname: Optional[str] = None
-    email: Optional[str] = None # XXX
-    orcid: Optional[str] = None # XXX
+    email: Optional[str] = None # TODO: test coverage
+    orcid: Optional[str] = None # TODO: test coverage
     affiliation: Optional[GrobidAffiliation] = None

     def to_csl_dict(self) -> dict: