summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-10-22 13:45:47 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-10-22 13:45:47 -0700
commit ff673bc6be7098efb5a6297d990955761bffc7e6 (patch)
tree 91f36e66bc98c002eb18053b89c2f917523bf4e8
parent 8cd413e2ad07bae6bf3ae940d7c4b94b4be274fa (diff)
download grobid_tei_xml-ff673bc6be7098efb5a6297d990955761bffc7e6.tar.gz
grobid_tei_xml-ff673bc6be7098efb5a6297d990955761bffc7e6.zip
to_legacy_dict() helper, and start adding some new fields
-rw-r--r-- grobid_tei_xml/grobid2json.py 8
-rwxr-xr-x grobid_tei_xml/parse.py 2
-rw-r--r-- grobid_tei_xml/types.py 20
-rw-r--r-- tests/files/small.json 20
-rw-r--r-- tests/test_parse.py 15
5 files changed, 30 insertions, 35 deletions
diff --git a/grobid_tei_xml/grobid2json.py b/grobid_tei_xml/grobid2json.py
index 7f455af..8946ab8 100644
--- a/grobid_tei_xml/grobid2json.py
+++ b/grobid_tei_xml/grobid2json.py
@@ -83,9 +83,9 @@ def journal_info(elem: ET.Element) -> Dict[str, Any]:
journal["eissn"] = elem.findtext('.//{%s}idno[@type="eISSN"]' % ns)
journal["volume"] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
journal["issue"] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
- keys = list(journal.keys())
# remove empty/null keys
+ keys = list(journal.keys())
for k in keys:
if not journal[k]:
journal.pop(k)
@@ -140,6 +140,12 @@ def biblio_info(elem: ET.Element, ns: str = ns) -> Dict[str, Any]:
ref["url"] = ref["url"].split(">")[0]
else:
ref["url"] = None
+
+ # remove empty/null keys
+ keys = list(ref.keys())
+ for k in keys:
+ if ref[k] is None:
+ ref.pop(k)
return ref
diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py
index 029fa85..284ceff 100755
--- a/grobid_tei_xml/parse.py
+++ b/grobid_tei_xml/parse.py
@@ -161,7 +161,7 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
grobid_version=application_tag.attrib["version"].strip(),
grobid_timestamp=application_tag.attrib["when"].strip(),
header=_parse_header(header),
- # TODO: pdf_md5=header.findtext(f'.//{{{ns}}}idno[@type="MD5"]') or None,
+ pdf_md5=header.findtext(f'.//{{{ns}}}idno[@type="MD5"]') or None,
)
refs = []
diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py
index b86e1a4..9894bf5 100644
--- a/grobid_tei_xml/types.py
+++ b/grobid_tei_xml/types.py
@@ -72,7 +72,7 @@ class GrobidHeader:
title: Optional[str] = None
date: Optional[str] = None
doi: Optional[str] = None
- # TODO: note: Optional[str]
+ note: Optional[str] = None
journal: Optional[GrobidJournal] = None
@@ -80,8 +80,8 @@ class GrobidHeader:
class GrobidDocument:
grobid_version: str
grobid_timestamp: str
- # TODO: pdf_md5: Optional[str]
header: GrobidHeader
+ pdf_md5: Optional[str] = None
citations: Optional[List[GrobidCitation]] = None
language_code: Optional[str] = None
abstract: Optional[str] = None
@@ -100,6 +100,22 @@ class GrobidDocument:
"""
return _simplify_dict(asdict(self))
+ def to_legacy_dict(self) -> dict:
+ """
+ Returns a dict in the old "grobid2json" format.
+ """
+ d = self.to_dict()
+
+ # all header fields at top-level
+ d.update(d.pop('header', {}))
+ d.pop('note', None)
+ d.pop('pdf_md5', None)
+ for a in d['authors']:
+ addr = a.get('affiliation', {}).get('address')
+ if addr and addr.get('post_code'):
+ addr['postCode'] = addr.pop('post_code')
+ return d
+
def remove_encumbered(self) -> None:
"""
This helper function removes fields from this object which might raise
diff --git a/tests/files/small.json b/tests/files/small.json
index aa0da78..9dc8fba 100644
--- a/tests/files/small.json
+++ b/tests/files/small.json
@@ -27,34 +27,18 @@
"date": "2001",
"id": "b0",
"index": 0,
- "issue": null,
"journal": "Letters in the Alphabet",
- "publisher": null,
"title": "Everything is Wonderful",
- "url": null,
"volume": "20",
- "unstructured": null,
- "arxiv_id": null,
- "doi": null,
- "pages": "1-11",
- "pmcid": null,
- "pmid": null
+ "pages": "1-11"
},
{ "authors": [],
"date": "2011-03-28",
"id": "b1",
"index": 1,
- "issue": null,
"journal": "The Dictionary",
- "publisher": null,
"title": "All about Facts",
- "url": null,
- "volume": "14",
- "unstructured": null,
- "arxiv_id": null,
- "doi": null,
- "pmcid": null,
- "pmid": null
+ "volume": "14"
}
],
"abstract": "Everything you ever wanted to know about nothing",
diff --git a/tests/test_parse.py b/tests/test_parse.py
index 30b2926..825b561 100644
--- a/tests/test_parse.py
+++ b/tests/test_parse.py
@@ -75,25 +75,14 @@ def test_small_xml() -> None:
assert doc == expected
-def test_small_xml_json() -> None:
+def test_small_xml_legacy() -> None:
with open('tests/files/small.xml', 'r') as f:
tei_xml = f.read()
with open('tests/files/small.json', 'r') as f:
json_form = json.loads(f.read())
- d = parse_document_xml(tei_xml).to_dict()
-
- # munge back to the old JSON format
- d.update(d.pop('header'))
- addr = d['authors'][0]['affiliation']['address']
- addr['postCode'] = addr.pop('post_code')
-
- # remove nulls from old JSON
- for c in json_form['citations']:
- for k in list(c.keys()):
- if c[k] is None:
- c.pop(k)
+ d = parse_document_xml(tei_xml).to_legacy_dict()
assert d == json_form