diff options
Diffstat (limited to 'grobid_tei_xml')
-rwxr-xr-x | grobid_tei_xml/parse.py | 6 | ||||
-rw-r--r-- | grobid_tei_xml/types.py | 4 |
2 files changed, 8 insertions, 2 deletions
diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py
index da7ed97..66e4e72 100755
--- a/grobid_tei_xml/parse.py
+++ b/grobid_tei_xml/parse.py
@@ -179,6 +179,7 @@ def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio:
         biblio.book_title = book_title_tag.text
     if biblio.book_title and not biblio.title:
         biblio.title = biblio.book_title
+        biblio.book_title = None
 
     note_tag = elem.find(f'.//{{{ns}}}note')
     if note_tag is not None and note_tag.attrib.get('type') is None:
@@ -209,6 +210,11 @@ def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio:
         if el is not None:
             biblio.url = _clean_url(el.attrib["target"])
 
+    # having DOI and a DOI URL is redundant
+    if biblio.doi and biblio.url:
+        if ('://doi.org/' in biblio.url) or ('://dx.doi.org/' in biblio.url):
+            biblio.url = None
+
     return biblio
 
 
diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py
index 252e677..08be47a 100644
--- a/grobid_tei_xml/types.py
+++ b/grobid_tei_xml/types.py
@@ -24,8 +24,8 @@ class GrobidAuthor:
     given_name: Optional[str] = None
     middle_name: Optional[str] = None
     surname: Optional[str] = None
-    email: Optional[str] = None # XXX
-    orcid: Optional[str] = None # XXX
+    email: Optional[str] = None # TODO: test coverage
+    orcid: Optional[str] = None # TODO: test coverage
     affiliation: Optional[GrobidAffiliation] = None
 
     def to_csl_dict(self) -> dict: