author     Bryan Newbold <bnewbold@archive.org>  2021-10-29 12:16:02 -0700
committer  Bryan Newbold <bnewbold@archive.org>  2021-11-04 17:19:52 -0700
commit     d3fa74e941aa11f79cee2d0adcb5cbc70884ef48 (patch)
tree       d9e50a33686ce678016f8f573d3d3995ed966a5f /python/sandcrawler
parent     13d2dc55d5be4d9579a4a242ce251cdbc82730aa (diff)
initial crossref-refs via GROBID helper routine
Diffstat (limited to 'python/sandcrawler')
-rw-r--r--  python/sandcrawler/grobid.py  125
1 file changed, 121 insertions, 4 deletions
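
For context, a minimal usage sketch of the helper routines added in this commit. The GROBID host is the default from the diff; the citation string and the Crossref record below are illustrative only (not from the source), and a running GROBID instance is assumed:

    from sandcrawler.grobid import GrobidClient

    client = GrobidClient(host_url="https://grobid.qa.fatcat.wiki")

    # parse a small batch of raw citation strings via GROBID's
    # processCitationList endpoint
    biblios = client.process_citation_list(
        ["Doe J. (2001) An example article. Example Journal 12:34-56."]
    )
    print(biblios[0].to_dict())

    # run the unstructured references of a hypothetical, minimal Crossref
    # record through GROBID; the returned dict follows the `grobid_refs`
    # table schema described in the docstring below
    record = {
        "DOI": "10.1234/Example",
        "indexed": {"date-time": "2021-10-29T12:16:02Z"},
        "reference": [
            {
                "key": "ref1",
                "unstructured": "Doe J. (2001) An example article. Example Journal 12:34-56.",
            }
        ],
    }
    row = client.crossref_refs(record)
    # row["source"] == "crossref", row["source_id"] == "10.1234/example"
    # (lower-cased), and row["refs_json"] is a list of parsed reference
    # dicts with `id` taken from each reference's "key"
    print(row)
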
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 37c4ea1..cdd2093 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -1,7 +1,8 @@
-from typing import Any, Dict, Optional
+from typing import Any, Dict, List, Optional
+import sys
import requests
-from grobid_tei_xml import parse_document_xml
+from grobid_tei_xml import GrobidBiblio, parse_citation_list_xml, parse_document_xml
from .ia import WaybackClient
from .misc import gen_file_metadata
@@ -9,7 +10,7 @@ from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
class GrobidClient(object):
- def __init__(self, host_url: str = "http://grobid.qa.fatcat.wiki", **kwargs):
+ def __init__(self, host_url: str = "https://grobid.qa.fatcat.wiki", **kwargs):
self.host_url = host_url
self.consolidate_mode = int(kwargs.get("consolidate_mode", 0))
@@ -67,6 +68,29 @@ class GrobidClient(object):
info["error_msg"] = grobid_response.text[:10000]
return info
+ def process_citation_list(self, unstructured_list: List[str]) -> List[GrobidBiblio]:
+ if not unstructured_list:
+ return []
+ if len(unstructured_list) > 5000:
+ raise ValueError("more than 5,000 references in a batch is just too much")
+
+ try:
+ grobid_response = requests.post(
+ self.host_url + "/api/processCitationList",
+ data={
+ "citations": unstructured_list,
+ "consolidateCitations": 0,
+ "includeRawCitations": 1,
+ },
+ timeout=30.0,
+ )
+ except requests.Timeout as te:
+ # TODO: handle somehow?
+ raise te
+
+ grobid_response.raise_for_status()
+ return parse_citation_list_xml(grobid_response.text)
+
def metadata(self, result: Dict[str, Any]) -> Optional[Dict[str, Any]]:
if result["status"] != "success":
return None
@@ -90,6 +114,87 @@ class GrobidClient(object):
meta[k] = tei_json[k]
return meta
+ def should_parse_crossref_ref(self, ref: Dict[str, Any]) -> bool:
+ """
+ Helper function to decide whether to run GROBID parsing on a Crossref
+ reference.
+
+ For example, if there is already a DOI in the ref metadata, parsing can
+ be skipped; the same may apply if there is sufficient structured
+ metadata, or depending on the source of the DOI linkage.
+ """
+ if ref.get("DOI"):
+ return False
+ if len(ref.get("unstructured", "")) <= 6:
+ return False
+
+ # TODO: what other combinations are enough to skip parsing?
+ if (
+ ref.get("year")
+ and ref.get("author")
+ and (ref.get("article-title") or ref.get("series-title") or ref.get("volume-title"))
+ ):
+ return False
+ elif ref.get("year") and ref.get("author") and ref.get("journal-title"):
+ return False
+ elif ref.get("journal-title") and ref.get("volume") and ref.get("first-page"):
+ return False
+
+ return True
+
+ def crossref_refs(self, record: Dict[str, Any]) -> Dict[str, Any]:
+ """
+ Given a complete Crossref metadata record, inspects the list of
+ references and runs any unstructured ones through GROBID parsing.
+
+ The returned dict is in the schema of the `grobid_refs` database table,
+ in dict form:
+
+ source: 'crossref'
+ source_id: doi, as lower-case string
+ source_ts: Crossref indexed timestamp, if available
+ ('updated' is not set)
+ refs_json: list of dicts
+ """
+
+ # remove API wrapper around record, if necessary
+ if "message" in record and "DOI" not in record:
+ record = record["message"]
+
+ ret = dict(
+ source="crossref",
+ source_id=record["DOI"].lower(),
+ source_ts=record["indexed"]["date-time"],
+ refs_json=[],
+ )
+ all_refs = record.get("reference", [])
+ unstructured_refs = []
+ for r in all_refs:
+ if not r.get("unstructured"):
+ continue
+ if not self.should_parse_crossref_ref(r):
+ continue
+ unstructured_refs.append(r)
+ if not unstructured_refs:
+ return ret
+
+ # some reasonable cap on length of refs per work
+ if len(unstructured_refs) > 2000:
+ print(
+ f"truncatin very large reference list for doi:{record['DOI']} len:{len(unstructured_refs)}",
+ file=sys.stderr,
+ )
+ unstructured_refs = unstructured_refs[:2000]
+
+ refs = self.process_citation_list([r["unstructured"] for r in unstructured_refs])
+ assert len(refs) == len(unstructured_refs)
+ refs_json = []
+ for i in range(len(refs)):
+ refs[i].id = unstructured_refs[i].get("key")
+ assert refs[i].unstructured == unstructured_refs[i]["unstructured"]
+ refs_json.append(refs[i].to_dict())
+ ret["refs_json"] = refs_json
+ return ret
+
class GrobidWorker(SandcrawlerFetchWorker):
def __init__(
@@ -97,7 +202,7 @@ class GrobidWorker(SandcrawlerFetchWorker):
grobid_client: GrobidClient,
wayback_client: Optional[WaybackClient] = None,
sink: Optional[SandcrawlerWorker] = None,
- **kwargs
+ **kwargs,
):
super().__init__(wayback_client=wayback_client)
self.grobid_client = grobid_client
@@ -129,6 +234,18 @@ class GrobidWorker(SandcrawlerFetchWorker):
return result
+class CrossrefRefsWorker(SandcrawlerWorker):
+ def __init__(
+ self, grobid_client: GrobidClient, sink: Optional[SandcrawlerWorker] = None, **kwargs
+ ):
+ super().__init__(**kwargs)
+ self.grobid_client = grobid_client
+ self.sink = sink
+
+ def process(self, record: Any, key: Optional[str] = None) -> Any:
+ return self.grobid_client.crossref_refs(record)
+
+
class GrobidBlobWorker(SandcrawlerWorker):
"""
This is sort of like GrobidWorker, except it receives blobs directly,