about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2019-12-26 19:18:39 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-01-02 18:12:58 -0800
commit: 64d8a0e64b0b4f5d1c5927c7a45317f5bc65a421 (patch)
tree: 4239cd45fcc9490f0a2baf4261b04b2a622e930d
parent: 03b04aabc9d9b63ff54a80f52590b619aee06159 (diff)
download: sandcrawler-64d8a0e64b0b4f5d1c5927c7a45317f5bc65a421.tar.gz
sandcrawler-64d8a0e64b0b4f5d1c5927c7a45317f5bc65a421.zip
be more parsimonious with GROBID metadata
Because these are getting persisted in the database (as well as Kafka), don't write out empty keys.
-rwxr-xr-x python/grobid2json.py 17
-rw-r--r-- python/sandcrawler/grobid.py 6
2 files changed, 20 insertions, 3 deletions
diff --git a/python/grobid2json.py b/python/grobid2json.py
index 0d85e5e..39ab222 100755
--- a/python/grobid2json.py
+++ b/python/grobid2json.py
@@ -40,7 +40,11 @@ def all_authors(elem):
given_name = pn.findtext('./{%s}forename' % ns) or None
surname = pn.findtext('./{%s}surname' % ns) or None
full_name = ' '.join(pn.itertext())
- obj = dict(name=full_name, given_name=given_name, surname=surname)
+ obj = dict(name=full_name)
+ if given_name:
+ obj['given_name'] = given_name
+ if surname:
+ obj['surname'] = surname
ae = author.find('./{%s}affiliation' % ns)
if ae:
affiliation = dict()
@@ -73,6 +77,12 @@ def journal_info(elem):
journal['eissn'] = elem.findtext('.//{%s}idno[@type="eISSN"]' % ns)
journal['volume'] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
journal['issue'] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
+ keys = list(journal.keys())
+
+ # remove empty/null keys
+ for k in keys:
+ if not journal[k]:
+ journal.pop(k)
return journal
@@ -159,6 +169,11 @@ def teixml2json(content, encumbered=True):
el = tei.find('.//{%s}back/{%s}div[@type="annex"]' % (ns, ns))
info['annex'] = (el or None) and " ".join(el.itertext()).strip()
+ # remove empty/null keys
+ keys = list(info.keys())
+ for k in keys:
+ if not info[k]:
+ info.pop(k)
return info
def main(): # pragma no cover
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 70f7b16..9fd5ad4 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -62,10 +62,12 @@ class GrobidClient(object):
meta = dict()
biblio = dict()
for k in ('title', 'authors', 'journal', 'date', 'doi', ):
- biblio[k] = tei_json.get(k)
+ if tei_json.get(k):
+ biblio[k] = tei_json[k]
meta['biblio'] = biblio
for k in ('grobid_version', 'grobid_timestamp', 'fatcat_release', 'language_code'):
- meta[k] = tei_json.get(k)
+ if tei_json.get(k):
+ meta[k] = tei_json[k]
return meta
class GrobidWorker(SandcrawlerWorker):