aboutsummaryrefslogtreecommitdiffstats
path: root/grobid_tei_xml
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-10-25 17:02:14 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-10-25 17:02:14 -0700
commit 8e68f3cd4cc3d2b6b166bf87811bb4db82cc7573 (patch)
tree 3b71338fcae115c834097b16cbfe051fb88c7684 /grobid_tei_xml
parent fcdb271193ca2c6b90eeeb5f4af4bbc15083319a (diff)
download grobid_tei_xml-8e68f3cd4cc3d2b6b166bf87811bb4db82cc7573.tar.gz
grobid_tei_xml-8e68f3cd4cc3d2b6b166bf87811bb4db82cc7573.zip
more test coverage and comments
Diffstat (limited to 'grobid_tei_xml')
-rwxr-xr-x grobid_tei_xml/parse.py 6
-rw-r--r-- grobid_tei_xml/types.py 4
2 files changed, 8 insertions, 2 deletions
diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py
index da7ed97..66e4e72 100755
--- a/grobid_tei_xml/parse.py
+++ b/grobid_tei_xml/parse.py
@@ -179,6 +179,7 @@ def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio:
biblio.book_title = book_title_tag.text
if biblio.book_title and not biblio.title:
biblio.title = biblio.book_title
+ biblio.book_title = None
note_tag = elem.find(f'.//{{{ns}}}note')
if note_tag is not None and note_tag.attrib.get('type') is None:
@@ -209,6 +210,11 @@ def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio:
if el is not None:
biblio.url = _clean_url(el.attrib["target"])
+ # having DOI and a DOI URL is redundant
+ if biblio.doi and biblio.url:
+ if ('://doi.org/' in biblio.url) or ('://dx.doi.org/' in biblio.url):
+ biblio.url = None
+
return biblio
diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py
index 252e677..08be47a 100644
--- a/grobid_tei_xml/types.py
+++ b/grobid_tei_xml/types.py
@@ -24,8 +24,8 @@ class GrobidAuthor:
given_name: Optional[str] = None
middle_name: Optional[str] = None
surname: Optional[str] = None
- email: Optional[str] = None # XXX
- orcid: Optional[str] = None # XXX
+ email: Optional[str] = None # TODO: test coverage
+ orcid: Optional[str] = None # TODO: test coverage
affiliation: Optional[GrobidAffiliation] = None
def to_csl_dict(self) -> dict: