about | summary | refs | log | tree | commit | diff | stats
path: root/python/sandcrawler/grobid.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2019-10-22 21:34:40 -0700
committer Bryan Newbold <bnewbold@archive.org> 2019-10-22 21:35:00 -0700
commit 2e3611f0e66615ae007d4e46bb5905e2220fb690 (patch)
tree 9d2fa6f8d62145a5ab31f37f26b6c293a2163acd /python/sandcrawler/grobid.py
parent b11fe8c8f444756ae246250cbbfe44e7dc62eac3 (diff)
download sandcrawler-2e3611f0e66615ae007d4e46bb5905e2220fb690.tar.gz
sandcrawler-2e3611f0e66615ae007d4e46bb5905e2220fb690.zip
much progress on file ingest path
Diffstat (limited to 'python/sandcrawler/grobid.py')
-rw-r--r-- python/sandcrawler/grobid.py 14
1 files changed, 14 insertions, 0 deletions
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index f157241..d83fedc 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -2,6 +2,7 @@
import requests
from collections import Counter
+from grobid2json import teixml2json
from .workers import SandcrawlerWorker
from .misc import gen_file_metadata
from .ia import WaybackClient, WaybackError
@@ -49,6 +50,19 @@ class GrobidClient(object):
info['error_msg'] = grobid_response.text[:10000]
return info
+    def metadata(self, result):
+        """Extract a metadata dict from a GROBID result; None unless status is 'success'."""
+        if result['status'] != 'success':
+            return None
+        tei = teixml2json(result['tei_xml'], encumbered=False)
+        biblio_keys = ('title', 'authors', 'journal', 'date', 'doi')
+        top_keys = ('grobid_version', 'grobid_timestamp', 'fatcat_release', 'language_code')
+        # bibliographic fields are nested under 'biblio'; each value is whatever tei.get() yields
+        summary = {'biblio': {key: tei.get(key) for key in biblio_keys}}
+        # the remaining fields are copied to the top level of the summary
+        summary.update({key: tei.get(key) for key in top_keys})
+        return summary
+
class GrobidWorker(SandcrawlerWorker):
def __init__(self, grobid_client, wayback_client=None, sink=None, **kwargs):