diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-09-20 20:07:46 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-09-20 20:07:46 -0700 |
commit | 3e67adab7d41db41d12930d66daf76cb2f72f61b (patch) | |
tree | c21f1d30aa6abcfbf8eb695ed1286f8e386a9e2e /python | |
parent | ade98f67317b2bbe77d21bb9ebfcce9e32f775aa (diff) | |
download | sandcrawler-3e67adab7d41db41d12930d66daf76cb2f72f61b.tar.gz sandcrawler-3e67adab7d41db41d12930d66daf76cb2f72f61b.zip |
grobid2json: extract fatcat identifier
Diffstat (limited to 'python')
-rwxr-xr-x | python/grobid2json.py | 6 |
1 files changed, 5 insertions, 1 deletions
diff --git a/python/grobid2json.py b/python/grobid2json.py
index d438d48..f3577b0 100755
--- a/python/grobid2json.py
+++ b/python/grobid2json.py
@@ -101,11 +101,15 @@ def teixml2json(content, encumbered=True):
     header = tei.find('.//{%s}teiHeader' % ns)
     if header is None:
         raise ValueError("XML does not look like TEI format")
+    application_tag = header.findall('.//{%s}appInfo/{%s}application' % (ns, ns))[0]
+    info['grobid_version'] = application_tag.attrib['version']
+    info['grobid_timestamp'] = application_tag.attrib['when']
     info['title'] = header.findtext('.//{%s}analytic/{%s}title' % (ns, ns))
     info['authors'] = all_authors(header.find('.//{%s}sourceDesc/{%s}biblStruct' % (ns, ns)))
     info['journal'] = journal_info(header)
     date = header.find('.//{%s}date[@type="published"]' % ns)
     info['date'] = (date != None) and date.attrib.get('when')
+    info['fatcat_release'] = header.findtext('.//{%s}idno[@type="fatcat"]' % ns)
     info['doi'] = header.findtext('.//{%s}idno[@type="DOI"]' % ns)
     if info['doi']:
         info['doi'] = info['doi'].lower()
@@ -135,7 +139,7 @@ def main(): # pragma no cover
         usage="%(prog)s [options] <teifile>...")
     parser.add_argument("--no-encumbered",
         action="store_true",
-        help="ignore errors loading individual WARC files")
+        help="don't include ambiguously copyright encumbered fields (eg, abstract, body)")
     parser.add_argument("teifiles", nargs='+')
     args = parser.parse_args()