grobid2json: extract fatcat identifier

author: Bryan Newbold <bnewbold@archive.org> 2019-09-20 20:07:46 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2019-09-20 20:07:46 -0700
commit: 3e67adab7d41db41d12930d66daf76cb2f72f61b (patch)
tree: c21f1d30aa6abcfbf8eb695ed1286f8e386a9e2e /python
parent: ade98f67317b2bbe77d21bb9ebfcce9e32f775aa (diff)
download: sandcrawler-3e67adab7d41db41d12930d66daf76cb2f72f61b.tar.gz
sandcrawler-3e67adab7d41db41d12930d66daf76cb2f72f61b.zip
1 file changed, 5 insertions, 1 deletion
diff --git a/python/grobid2json.py b/python/grobid2json.py
index d438d48..f3577b0 100755
--- a/python/grobid2json.py
+++ b/python/grobid2json.py
@@ -101,11 +101,15 @@ def teixml2json(content, encumbered=True):
     header = tei.find('.//{%s}teiHeader' % ns)
     if header is None:
         raise ValueError("XML does not look like TEI format")
+    application_tag = header.findall('.//{%s}appInfo/{%s}application' % (ns, ns))[0]
+    info['grobid_version'] = application_tag.attrib['version']
+    info['grobid_timestamp'] = application_tag.attrib['when']
     info['title'] = header.findtext('.//{%s}analytic/{%s}title' % (ns, ns))
     info['authors'] = all_authors(header.find('.//{%s}sourceDesc/{%s}biblStruct' % (ns, ns)))
     info['journal'] = journal_info(header)
     date = header.find('.//{%s}date[@type="published"]' % ns)
     info['date'] = (date != None) and date.attrib.get('when')
+    info['fatcat_release'] = header.findtext('.//{%s}idno[@type="fatcat"]' % ns)
     info['doi'] = header.findtext('.//{%s}idno[@type="DOI"]' % ns)
     if info['doi']:
         info['doi'] = info['doi'].lower()
@@ -135,7 +139,7 @@ def main():   # pragma no cover
         usage="%(prog)s [options] <teifile>...")
     parser.add_argument("--no-encumbered",
         action="store_true",
-        help="ignore errors loading individual WARC files")
+        help="don't include ambiguously copyright encumbered fields (eg, abstract, body)")
     parser.add_argument("teifiles", nargs='+')
 
     args = parser.parse_args()
author	Bryan Newbold <bnewbold@archive.org>	2019-09-20 20:07:46 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2019-09-20 20:07:46 -0700
commit	3e67adab7d41db41d12930d66daf76cb2f72f61b (patch)
tree	c21f1d30aa6abcfbf8eb695ed1286f8e386a9e2e /python
parent	ade98f67317b2bbe77d21bb9ebfcce9e32f775aa (diff)
download	sandcrawler-3e67adab7d41db41d12930d66daf76cb2f72f61b.tar.gz sandcrawler-3e67adab7d41db41d12930d66daf76cb2f72f61b.zip