about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2019-09-20 20:07:46 -0700
committer Bryan Newbold <bnewbold@archive.org> 2019-09-20 20:07:46 -0700
commit 3e67adab7d41db41d12930d66daf76cb2f72f61b (patch)
tree c21f1d30aa6abcfbf8eb695ed1286f8e386a9e2e
parent ade98f67317b2bbe77d21bb9ebfcce9e32f775aa (diff)
download sandcrawler-3e67adab7d41db41d12930d66daf76cb2f72f61b.tar.gz
sandcrawler-3e67adab7d41db41d12930d66daf76cb2f72f61b.zip
grobid2json: extract fatcat identifier
-rwxr-xr-x python/grobid2json.py 6
1 files changed, 5 insertions, 1 deletions
diff --git a/python/grobid2json.py b/python/grobid2json.py
index d438d48..f3577b0 100755
--- a/python/grobid2json.py
+++ b/python/grobid2json.py
@@ -101,11 +101,15 @@ def teixml2json(content, encumbered=True):
header = tei.find('.//{%s}teiHeader' % ns)
if header is None:
raise ValueError("XML does not look like TEI format")
+ application_tag = header.findall('.//{%s}appInfo/{%s}application' % (ns, ns))[0]
+ info['grobid_version'] = application_tag.attrib['version']
+ info['grobid_timestamp'] = application_tag.attrib['when']
info['title'] = header.findtext('.//{%s}analytic/{%s}title' % (ns, ns))
info['authors'] = all_authors(header.find('.//{%s}sourceDesc/{%s}biblStruct' % (ns, ns)))
info['journal'] = journal_info(header)
date = header.find('.//{%s}date[@type="published"]' % ns)
info['date'] = (date != None) and date.attrib.get('when')
+ info['fatcat_release'] = header.findtext('.//{%s}idno[@type="fatcat"]' % ns)
info['doi'] = header.findtext('.//{%s}idno[@type="DOI"]' % ns)
if info['doi']:
info['doi'] = info['doi'].lower()
@@ -135,7 +139,7 @@ def main(): # pragma no cover
usage="%(prog)s [options] <teifile>...")
parser.add_argument("--no-encumbered",
action="store_true",
- help="ignore errors loading individual WARC files")
+ help="don't include ambiguously copyright encumbered fields (eg, abstract, body)")
parser.add_argument("teifiles", nargs='+')
args = parser.parse_args()