Diffstat (limited to 'python/sandcrawler/grobid.py')
-rw-r--r-- | python/sandcrawler/grobid.py | 125 |
1 file changed, 121 insertions, 4 deletions
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 37c4ea1..cdd2093 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -1,7 +1,8 @@
-from typing import Any, Dict, Optional
+from typing import Any, Dict, List, Optional
+import sys
 
 import requests
-from grobid_tei_xml import parse_document_xml
+from grobid_tei_xml import GrobidBiblio, parse_citation_list_xml, parse_document_xml
 
 from .ia import WaybackClient
 from .misc import gen_file_metadata
@@ -9,7 +10,7 @@ from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
 
 
 class GrobidClient(object):
-    def __init__(self, host_url: str = "http://grobid.qa.fatcat.wiki", **kwargs):
+    def __init__(self, host_url: str = "https://grobid.qa.fatcat.wiki", **kwargs):
         self.host_url = host_url
         self.consolidate_mode = int(kwargs.get("consolidate_mode", 0))
 
@@ -67,6 +68,29 @@ class GrobidClient(object):
             info["error_msg"] = grobid_response.text[:10000]
         return info
 
+    def process_citation_list(self, unstructured_list: List[str]) -> List[GrobidBiblio]:
+        if not unstructured_list:
+            return []
+        if len(unstructured_list) > 5000:
+            raise ValueError("more than 5,000 references in a batch is just too much")
+
+        try:
+            grobid_response = requests.post(
+                self.host_url + "/api/processCitationList",
+                data={
+                    "citations": unstructured_list,
+                    "consolidateCitations": 0,
+                    "includeRawCitations": 1,
+                },
+                timeout=30.0,
+            )
+        except requests.Timeout as te:
+            # TODO: handle somehow?
+            raise te
+
+        grobid_response.raise_for_status()
+        return parse_citation_list_xml(grobid_response.text)
+
     def metadata(self, result: Dict[str, Any]) -> Optional[Dict[str, Any]]:
         if result["status"] != "success":
             return None
@@ -90,6 +114,87 @@ class GrobidClient(object):
                 meta[k] = tei_json[k]
         return meta
 
+    def should_parse_crossref_ref(self, ref: Dict[str, Any]) -> bool:
+        """
+        Helper function to decide whether to run GROBID parsing on a Crossref reference.
+
+        For example, parsing can be skipped if there is already a DOI in the
+        ref metadata, if there is sufficient structured metadata, or depending
+        on the source of the DOI linkage.
+        """
+        if ref.get("DOI"):
+            return False
+        if len(ref.get("unstructured", "")) <= 6:
+            return False
+
+        # TODO: what other combinations are enough to skip parsing?
+        if (
+            ref.get("year")
+            and ref.get("author")
+            and (ref.get("article-title") or ref.get("series-title") or ref.get("volume-title"))
+        ):
+            return False
+        elif ref.get("year") and ref.get("author") and ref.get("journal-title"):
+            return False
+        elif ref.get("journal-title") and ref.get("volume") and ref.get("first-page"):
+            return False
+
+        return True
+
+    def crossref_refs(self, record: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Given a complete Crossref metadata record, inspects the reference list
+        and runs GROBID parsing on any unstructured references.
+
+        The returned dict is in the schema of the `grobid_refs` database table,
+        in dict form:
+
+            source: 'crossref'
+            source_id: doi, as lower-case string
+            source_ts: Crossref indexed timestamp, if available
+            ('updated' is not set)
+            refs_json: list of dicts
+        """
+
+        # remove API wrapper around record, if necessary
+        if "message" in record and "DOI" not in record:
+            record = record["message"]
+
+        ret = dict(
+            source="crossref",
+            source_id=record["DOI"].lower(),
+            source_ts=record["indexed"]["date-time"],
+            refs_json=[],
+        )
+        all_refs = record.get("reference", [])
+        unstructured_refs = []
+        for r in all_refs:
+            if not r.get("unstructured"):
+                continue
+            if not self.should_parse_crossref_ref(r):
+                continue
+            unstructured_refs.append(r)
+        if not unstructured_refs:
+            return ret
+
+        # some reasonable cap on length of refs per work
+        if len(unstructured_refs) > 2000:
+            print(
+                f"truncating very large reference list for doi:{record['DOI']} len:{len(unstructured_refs)}",
+                file=sys.stderr,
+            )
+            unstructured_refs = unstructured_refs[:2000]
+
+        refs = self.process_citation_list([r["unstructured"] for r in unstructured_refs])
+        assert len(refs) == len(unstructured_refs)
+        refs_json = []
+        for i in range(len(refs)):
+            refs[i].id = unstructured_refs[i].get("key")
+            assert refs[i].unstructured == unstructured_refs[i]["unstructured"]
+            refs_json.append(refs[i].to_dict())
+        ret["refs_json"] = refs_json
+        return ret
+
 
 class GrobidWorker(SandcrawlerFetchWorker):
     def __init__(
@@ -97,7 +202,7 @@ class GrobidWorker(SandcrawlerFetchWorker):
         grobid_client: GrobidClient,
         wayback_client: Optional[WaybackClient] = None,
         sink: Optional[SandcrawlerWorker] = None,
-        **kwargs
+        **kwargs,
     ):
         super().__init__(wayback_client=wayback_client)
         self.grobid_client = grobid_client
@@ -129,6 +234,18 @@ class GrobidWorker(SandcrawlerFetchWorker):
         return result
 
 
+class CrossrefRefsWorker(SandcrawlerWorker):
+    def __init__(
+        self, grobid_client: GrobidClient, sink: Optional[SandcrawlerWorker] = None, **kwargs
+    ):
+        super().__init__(**kwargs)
+        self.grobid_client = grobid_client
+        self.sink = sink
+
+    def process(self, record: Any, key: Optional[str] = None) -> Any:
+        return self.grobid_client.crossref_refs(record)
+
+
 class GrobidBlobWorker(SandcrawlerWorker):
     """
     This is sort of like GrobidWorker, except it receives blobs directly,
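Usage sketch (not part of this commit): assuming a reachable GROBID instance at the default host, the new crossref_refs() path can be exercised roughly as below. The Crossref record here is a hand-written stub with made-up DOIs and timestamp, not real API output.

    # sketch of driving the new citation-parsing path; requires network
    # access to a GROBID instance and the sandcrawler package on PYTHONPATH
    from sandcrawler.grobid import GrobidClient

    client = GrobidClient(host_url="https://grobid.qa.fatcat.wiki")

    record = {
        "DOI": "10.1234/example",
        "indexed": {"date-time": "2021-11-01T00:00:00Z"},
        "reference": [
            # already has a DOI, so should_parse_crossref_ref() skips it
            {"key": "ref0", "DOI": "10.5678/other", "unstructured": "Smith J. ..."},
            # unstructured-only, so it is sent to /api/processCitationList
            {
                "key": "ref1",
                "unstructured": "Doe, J. (2001). Some paper. Some Journal 12: 34-56.",
            },
        ],
    }

    row = client.crossref_refs(record)
    # `row` follows the grobid_refs table schema from the docstring:
    # source='crossref', source_id='10.1234/example', refs_json=[...]
    print(row["source_id"], len(row["refs_json"]))

The CrossrefRefsWorker added at the bottom of the diff wraps this same call for bulk processing, with each resulting row going to the configured sink via the usual SandcrawlerWorker plumbing.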