diff options
Diffstat (limited to 'python')
| -rwxr-xr-x | python/grobid2json.py | 17 | ||||
| -rw-r--r-- | python/sandcrawler/grobid.py | 6 | 
2 files changed, 20 insertions, 3 deletions
diff --git a/python/grobid2json.py b/python/grobid2json.py
index 0d85e5e..39ab222 100755
--- a/python/grobid2json.py
+++ b/python/grobid2json.py
@@ -40,7 +40,11 @@ def all_authors(elem):
         given_name = pn.findtext('./{%s}forename' % ns) or None
         surname = pn.findtext('./{%s}surname' % ns) or None
         full_name = ' '.join(pn.itertext())
-        obj = dict(name=full_name, given_name=given_name, surname=surname)
+        obj = dict(name=full_name)
+        if given_name:
+            obj['given_name'] = given_name
+        if surname:
+            obj['surname'] = surname
         ae = author.find('./{%s}affiliation' % ns)
         if ae:
             affiliation = dict()
@@ -73,6 +77,12 @@ def journal_info(elem):
     journal['eissn'] = elem.findtext('.//{%s}idno[@type="eISSN"]' % ns)
     journal['volume'] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
     journal['issue'] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
+    keys = list(journal.keys())
+
+    # remove empty/null keys
+    for k in keys:
+        if not journal[k]:
+            journal.pop(k)
     return journal
@@ -159,6 +169,11 @@ def teixml2json(content, encumbered=True):
         el = tei.find('.//{%s}back/{%s}div[@type="annex"]' % (ns, ns))
         info['annex'] = (el or None) and " ".join(el.itertext()).strip()
+    # remove empty/null keys
+    keys = list(info.keys())
+    for k in keys:
+        if not info[k]:
+            info.pop(k)
     return info
 def main():   # pragma no cover
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 70f7b16..9fd5ad4 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -62,10 +62,12 @@ class GrobidClient(object):
         meta = dict()
         biblio = dict()
         for k in ('title', 'authors', 'journal', 'date', 'doi', ):
-            biblio[k] = tei_json.get(k)
+            if tei_json.get(k):
+                biblio[k] = tei_json[k]
         meta['biblio'] = biblio
         for k in ('grobid_version', 'grobid_timestamp', 'fatcat_release', 'language_code'):
-            meta[k] = tei_json.get(k)
+            if tei_json.get(k):
+                meta[k] = tei_json[k]
         return meta
 class GrobidWorker(SandcrawlerWorker):
|
