Diffstat (limited to 'python/sandcrawler/grobid.py')
-rw-r--r--  python/sandcrawler/grobid.py | 125
1 file changed, 121 insertions(+), 4 deletions(-)
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 37c4ea1..cdd2093 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -1,7 +1,8 @@
-from typing import Any, Dict, Optional
+from typing import Any, Dict, List, Optional
+import sys
 
 import requests
-from grobid_tei_xml import parse_document_xml
+from grobid_tei_xml import GrobidBiblio, parse_citation_list_xml, parse_document_xml
 
 from .ia import WaybackClient
 from .misc import gen_file_metadata
@@ -9,7 +10,7 @@ from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
 
 
 class GrobidClient(object):
-    def __init__(self, host_url: str = "http://grobid.qa.fatcat.wiki", **kwargs):
+    def __init__(self, host_url: str = "https://grobid.qa.fatcat.wiki", **kwargs):
         self.host_url = host_url
         self.consolidate_mode = int(kwargs.get("consolidate_mode", 0))
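
The constructor keeps host_url as a plain keyword argument, so deployments can point at any GROBID instance. A minimal construction sketch (the localhost URL is a hypothetical self-hosted instance; consolidate_mode is the kwarg read above):

    from sandcrawler.grobid import GrobidClient

    # hypothetical self-hosted GROBID instance; any reachable host_url works
    client = GrobidClient(host_url="http://localhost:8070", consolidate_mode=0)
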
@@ -67,6 +68,29 @@ class GrobidClient(object):
             info["error_msg"] = grobid_response.text[:10000]
         return info
 
+    def process_citation_list(self, unstructured_list: List[str]) -> List[GrobidBiblio]:
+        if not unstructured_list:
+            return []
+        if len(unstructured_list) > 5000:
+            raise ValueError("more than 5,000 references in a batch is just too much")
+
+        try:
+            grobid_response = requests.post(
+                self.host_url + "/api/processCitationList",
+                data={
+                    "citations": unstructured_list,
+                    "consolidateCitations": 0,
+                    "includeRawCitations": 1,
+                },
+                timeout=30.0,
+            )
+        except requests.Timeout as te:
+            # TODO: handle somehow?
+            raise te
+
+        grobid_response.raise_for_status()
+        return parse_citation_list_xml(grobid_response.text)
+
     def metadata(self, result: Dict[str, Any]) -> Optional[Dict[str, Any]]:
         if result["status"] != "success":
             return None
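
Usage of the new process_citation_list() method might look like the following sketch, assuming a reachable GROBID service; the GrobidBiblio attributes shown (unstructured, to_dict()) are the same ones this diff relies on further down:

    from sandcrawler.grobid import GrobidClient

    client = GrobidClient()  # assumes the default GROBID host is reachable
    refs = client.process_citation_list(
        [
            "Doe J. Some cited work. Journal of Examples 12(3), 2001.",
            "Roe R. Another reference, unstructured. 1999.",
        ]
    )
    for biblio in refs:
        # each GrobidBiblio pairs the raw citation string with parsed fields
        print(biblio.unstructured)
        print(biblio.to_dict())
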
@@ -90,6 +114,87 @@ class GrobidClient(object):
             meta[k] = tei_json[k]
         return meta
 
+    def should_parse_crossref_ref(self, ref: Dict[str, Any]) -> bool:
+        """
+        Helper function to decide whether to run GROBID parsing on a Crossref
+        reference.
+
+        For example, parsing can be skipped if there is already a DOI in the
+        ref metadata, if there is sufficient structured metadata, or depending
+        on the source of the DOI linkage.
+        """
+        if ref.get("DOI"):
+            return False
+        if len(ref.get("unstructured", "")) <= 6:
+            return False
+
+        # TODO: what other combinations are enough to skip parsing?
+        if (
+            ref.get("year")
+            and ref.get("author")
+            and (ref.get("article-title") or ref.get("series-title") or ref.get("volume-title"))
+        ):
+            return False
+        elif ref.get("year") and ref.get("author") and ref.get("journal-title"):
+            return False
+        elif ref.get("journal-title") and ref.get("volume") and ref.get("first-page"):
+            return False
+
+        return True
+
+    def crossref_refs(self, record: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Given a complete Crossref metadata record, inspects the reference list
+        and runs any unstructured references through GROBID citation parsing.
+
+        The returned dict is in the schema of the `grobid_refs` database table,
+        in dict form:
+
+            source: 'crossref'
+            source_id: doi, as lower-case string
+            source_ts: Crossref indexed timestamp, if available
+            ('updated' is not set)
+            refs_json: list of dicts
+        """
+
+        # remove API wrapper around record, if necessary
+        if "message" in record and "DOI" not in record:
+            record = record["message"]
+
+        ret = dict(
+            source="crossref",
+            source_id=record["DOI"].lower(),
+            source_ts=record["indexed"]["date-time"],
+            refs_json=[],
+        )
+        all_refs = record.get("reference", [])
+        unstructured_refs = []
+        for r in all_refs:
+            if not r.get("unstructured"):
+                continue
+            if not self.should_parse_crossref_ref(r):
+                continue
+            unstructured_refs.append(r)
+        if not unstructured_refs:
+            return ret
+
+        # some reasonable cap on length of refs per work
+        if len(unstructured_refs) > 2000:
+            print(
+                f"truncating very large reference list for doi:{record['DOI']} len:{len(unstructured_refs)}",
+                file=sys.stderr,
+            )
+            unstructured_refs = unstructured_refs[:2000]
+
+        refs = self.process_citation_list([r["unstructured"] for r in unstructured_refs])
+        assert len(refs) == len(unstructured_refs)
+        refs_json = []
+        for i in range(len(refs)):
+            refs[i].id = unstructured_refs[i].get("key")
+            assert refs[i].unstructured == unstructured_refs[i]["unstructured"]
+            refs_json.append(refs[i].to_dict())
+        ret["refs_json"] = refs_json
+        return ret
+
 
 class GrobidWorker(SandcrawlerFetchWorker):
     def __init__(
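
To see the two new methods working together, here is a sketch with a hypothetical, minimal Crossref record containing only the fields the code above reads. The first reference already has a DOI, so should_parse_crossref_ref() filters it out; the second is unstructured-only and gets sent to GROBID:

    from sandcrawler.grobid import GrobidClient

    client = GrobidClient()  # assumes a reachable GROBID service

    # hypothetical minimal record; real Crossref records carry many more fields
    record = {
        "DOI": "10.1234/example",
        "indexed": {"date-time": "2021-11-09T00:00:00Z"},
        "reference": [
            {"key": "ref0", "DOI": "10.5678/other", "unstructured": "Other A. Prior art. 2005."},
            {"key": "ref1", "unstructured": "Doe J. Some cited work. 1999."},
        ],
    }

    row = client.crossref_refs(record)
    # row follows the `grobid_refs` table schema described in the docstring
    assert row["source"] == "crossref"
    assert row["source_id"] == "10.1234/example"
    print(row["refs_json"])  # one parsed ref, its id set from the "key" field
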
@@ -97,7 +202,7 @@ class GrobidWorker(SandcrawlerFetchWorker):
         grobid_client: GrobidClient,
         wayback_client: Optional[WaybackClient] = None,
         sink: Optional[SandcrawlerWorker] = None,
-        **kwargs
+        **kwargs,
     ):
         super().__init__(wayback_client=wayback_client)
         self.grobid_client = grobid_client
@@ -129,6 +234,18 @@ class GrobidWorker(SandcrawlerFetchWorker):
         return result
 
 
+class CrossrefRefsWorker(SandcrawlerWorker):
+    def __init__(
+        self, grobid_client: GrobidClient, sink: Optional[SandcrawlerWorker] = None, **kwargs
+    ):
+        super().__init__(**kwargs)
+        self.grobid_client = grobid_client
+        self.sink = sink
+
+    def process(self, record: Any, key: Optional[str] = None) -> Any:
+        return self.grobid_client.crossref_refs(record)
+
+
 class GrobidBlobWorker(SandcrawlerWorker):
     """
     This is sort of like GrobidWorker, except it receives blobs directly,
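
Finally, a sketch of wiring up the new CrossrefRefsWorker, whose process() simply delegates to GrobidClient.crossref_refs(); the record shape is the same hypothetical one as above:

    from sandcrawler.grobid import CrossrefRefsWorker, GrobidClient

    client = GrobidClient()
    worker = CrossrefRefsWorker(client, sink=None)

    # returns a grobid_refs-style row; with no "reference" field, refs_json is empty
    record = {"DOI": "10.1234/example", "indexed": {"date-time": "2021-11-09T00:00:00Z"}}
    row = worker.process(record)
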