diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-12-26 19:18:39 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-02 18:12:58 -0800 |
commit | 64d8a0e64b0b4f5d1c5927c7a45317f5bc65a421 (patch) | |
tree | 4239cd45fcc9490f0a2baf4261b04b2a622e930d | |
parent | 03b04aabc9d9b63ff54a80f52590b619aee06159 (diff) | |
download | sandcrawler-64d8a0e64b0b4f5d1c5927c7a45317f5bc65a421.tar.gz sandcrawler-64d8a0e64b0b4f5d1c5927c7a45317f5bc65a421.zip |
be more parsimonious with GROBID metadata
Because these are getting persisted in the database (as well as Kafka),
don't write out empty keys.
-rwxr-xr-x | python/grobid2json.py | 17 | ||||
-rw-r--r-- | python/sandcrawler/grobid.py | 6 |
2 files changed, 20 insertions, 3 deletions
diff --git a/python/grobid2json.py b/python/grobid2json.py index 0d85e5e..39ab222 100755 --- a/python/grobid2json.py +++ b/python/grobid2json.py @@ -40,7 +40,11 @@ def all_authors(elem): given_name = pn.findtext('./{%s}forename' % ns) or None surname = pn.findtext('./{%s}surname' % ns) or None full_name = ' '.join(pn.itertext()) - obj = dict(name=full_name, given_name=given_name, surname=surname) + obj = dict(name=full_name) + if given_name: + obj['given_name'] = given_name + if surname: + obj['surname'] = surname ae = author.find('./{%s}affiliation' % ns) if ae: affiliation = dict() @@ -73,6 +77,12 @@ def journal_info(elem): journal['eissn'] = elem.findtext('.//{%s}idno[@type="eISSN"]' % ns) journal['volume'] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns) journal['issue'] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns) + keys = list(journal.keys()) + + # remove empty/null keys + for k in keys: + if not journal[k]: + journal.pop(k) return journal @@ -159,6 +169,11 @@ def teixml2json(content, encumbered=True): el = tei.find('.//{%s}back/{%s}div[@type="annex"]' % (ns, ns)) info['annex'] = (el or None) and " ".join(el.itertext()).strip() + # remove empty/null keys + keys = list(info.keys()) + for k in keys: + if not info[k]: + info.pop(k) return info def main(): # pragma no cover diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py index 70f7b16..9fd5ad4 100644 --- a/python/sandcrawler/grobid.py +++ b/python/sandcrawler/grobid.py @@ -62,10 +62,12 @@ class GrobidClient(object): meta = dict() biblio = dict() for k in ('title', 'authors', 'journal', 'date', 'doi', ): - biblio[k] = tei_json.get(k) + if tei_json.get(k): + biblio[k] = tei_json[k] meta['biblio'] = biblio for k in ('grobid_version', 'grobid_timestamp', 'fatcat_release', 'language_code'): - meta[k] = tei_json.get(k) + if tei_json.get(k): + meta[k] = tei_json[k] return meta class GrobidWorker(SandcrawlerWorker): |