about | summary | refs | log | tree | commit | diff | stats
path: root/python/sandcrawler/grobid.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/sandcrawler/grobid.py')
-rw-r--r-- python/sandcrawler/grobid.py 14
1 files changed, 14 insertions, 0 deletions
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index f157241..d83fedc 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -2,6 +2,7 @@
import requests
from collections import Counter
+from grobid2json import teixml2json
from .workers import SandcrawlerWorker
from .misc import gen_file_metadata
from .ia import WaybackClient, WaybackError
@@ -49,6 +50,19 @@ class GrobidClient(object):
info['error_msg'] = grobid_response.text[:10000]
return info
+ def metadata(self, result):
+ if result['status'] != 'success':
+ return None
+ tei_json = teixml2json(result['tei_xml'], encumbered=False)
+ meta = dict()
+ biblio = dict()
+ for k in ('title', 'authors', 'journal', 'date', 'doi', ):
+ biblio[k] = tei_json.get(k)
+ meta['biblio'] = biblio
+ for k in ('grobid_version', 'grobid_timestamp', 'fatcat_release', 'language_code'):
+ meta[k] = tei_json.get(k)
+ return meta
+
class GrobidWorker(SandcrawlerWorker):
def __init__(self, grobid_client, wayback_client=None, sink=None, **kwargs):