From 01a1978d5b9667df4ae71a7934512e6c4e3bf9a8 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Tue, 1 Jun 2021 01:04:16 -0700
Subject: schema: add 'crossref' to bundle schema, and add from_json() helper

from_json() refactor was an earlier TODO, to reduce duplication when
updating fields on this class
---
 fatcat_scholar/schema.py        | 21 ++++++++++++++++++++-
 fatcat_scholar/sim_pipeline.py  |  1 +
 fatcat_scholar/transform.py     | 30 ++++--------------------------
 fatcat_scholar/work_pipeline.py |  1 +
 fatcat_scholar/worker.py        | 15 ++-------------
 5 files changed, 28 insertions(+), 40 deletions(-)

(limited to 'fatcat_scholar')

diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index 9912a97..b8a1923 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -5,6 +5,7 @@ auto-conversion of datetime objects.
 """
 
 import re
+import json
 import datetime
 from enum import Enum
 from typing import Optional, List, Any, Dict
@@ -18,7 +19,7 @@ from pydantic import BaseModel
 # pytype: enable=import-error
 
 from fatcat_openapi_client import ReleaseEntity, ReleaseContrib
-from fatcat_scholar.api_entities import entity_to_dict
+from fatcat_scholar.api_entities import entity_to_dict, entity_from_json
 from fatcat_scholar.biblio_hacks import doi_link_domain
 
 
@@ -31,6 +32,7 @@ class IntermediateBundle(BaseModel):
     doc_type: DocType
     releases: List[ReleaseEntity]
     biblio_release_ident: Optional[str]
+    crossref: Optional[Dict[str, Any]]
     grobid_fulltext: Optional[Dict[str, Any]]
     pdftotext_fulltext: Optional[Dict[str, Any]]
     pdf_meta: Optional[Dict[str, Any]]
@@ -45,6 +47,23 @@ class IntermediateBundle(BaseModel):
             datetime.datetime: lambda dt: dt.isoformat(),
         }
 
+    @classmethod
+    def from_json(cls, obj: Dict[Any, Any]) -> "IntermediateBundle":
+        """Build a bundle from 'obj', a dict parsed from the bundle's JSON."""
+        raw = obj.get("releases", [])
+        return cls(
+            doc_type=DocType(obj.get("doc_type")),
+            # release dicts are re-serialized: entity_from_json() is passed text
+            releases=[entity_from_json(json.dumps(r), ReleaseEntity) for r in raw],
+            biblio_release_ident=obj.get("biblio_release_ident"),
+            crossref=obj.get("crossref"),
+            grobid_fulltext=obj.get("grobid_fulltext"),
+            pdftotext_fulltext=obj.get("pdftotext_fulltext"),
+            pdf_meta=obj.get("pdf_meta"),
+            sim_fulltext=obj.get("sim_fulltext"),
+            html_fulltext=obj.get("html_fulltext"),
+        )
+
 
 class AccessType(str, Enum):
     ia_sim = "ia_sim"
diff --git a/fatcat_scholar/sim_pipeline.py b/fatcat_scholar/sim_pipeline.py
index e5e2a02..d602edf 100644
--- a/fatcat_scholar/sim_pipeline.py
+++ b/fatcat_scholar/sim_pipeline.py
@@ -122,6 +122,7 @@ class SimPipeline:
                 doc_type=DocType.sim_page,
                 releases=[],
                 biblio_release_ident=None,
+                crossref=None,
                 grobid_fulltext=None,
                 pdftotext_fulltext=None,
                 sim_fulltext=dict(
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index 53f83ae..1c4b0e7 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -733,19 +733,8 @@ def run_transform(infile: Sequence) -> None:
     for line in infile:
         obj = json.loads(line)
 
-        heavy = IntermediateBundle(
-            doc_type=DocType(obj["doc_type"]),
-            releases=[
-                entity_from_json(json.dumps(re), ReleaseEntity)
-                for re in obj["releases"]
-            ],
-            biblio_release_ident=obj.get("biblio_release_ident"),
-            grobid_fulltext=obj.get("grobid_fulltext"),
-            pdftotext_fulltext=obj.get("pdftotext_fulltext"),
-            pdf_meta=obj.get("pdf_meta"),
-            sim_fulltext=obj.get("sim_fulltext"),
-            html_fulltext=obj.get("html_fulltext"),
-        )
+        heavy = IntermediateBundle.from_json(obj)
+        assert heavy.doc_type
         es_doc = transform_heavy(heavy)
         if not es_doc:
             continue
@@ -756,19 +745,8 @@ def run_refs(infile: Sequence) -> None:
     for line in infile:
         obj = json.loads(line)
 
-        heavy = IntermediateBundle(
-            doc_type=DocType(obj["doc_type"]),
-            releases=[
-                entity_from_json(json.dumps(re), ReleaseEntity)
-                for re in obj["releases"]
-            ],
-            biblio_release_ident=obj.get("biblio_release_ident"),
-            grobid_fulltext=obj.get("grobid_fulltext"),
-            pdftotext_fulltext=obj.get("pdftotext_fulltext"),
-            pdf_meta=obj.get("pdf_meta"),
-            sim_fulltext=obj.get("sim_fulltext"),
-            html_fulltext=obj.get("html_fulltext"),
-        )
+        heavy = IntermediateBundle.from_json(obj)
+        assert heavy.doc_type
         refs = refs_from_heavy(heavy)
         for ref in refs:
             print(ref.json(exclude_none=True, sort_keys=True))
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index 4c8f1be..7b477a0 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -389,6 +389,7 @@ class WorkPipeline:
             doc_type=DocType.work,
             releases=releases,
             biblio_release_ident=pref_idents[0],
+            crossref=biblio_crossref,
             grobid_fulltext=grobid_fulltext,
             pdftotext_fulltext=pdftotext_fulltext,
             pdf_meta=pdf_meta,
diff --git a/fatcat_scholar/worker.py b/fatcat_scholar/worker.py
index 823f1bd..7787d42 100644
--- a/fatcat_scholar/worker.py
+++ b/fatcat_scholar/worker.py
@@ -123,19 +123,8 @@ class IndexDocsWorker(KafkaWorker):
 
         bulk_actions = []
         for obj in batch:
-            bundle = IntermediateBundle(
-                doc_type=DocType(obj["doc_type"]),
-                releases=[
-                    entity_from_json(json.dumps(re), ReleaseEntity)
-                    for re in obj["releases"]
-                ],
-                biblio_release_ident=obj.get("biblio_release_ident"),
-                grobid_fulltext=obj.get("grobid_fulltext"),
-                pdftotext_fulltext=obj.get("pdftotext_fulltext"),
-                pdf_meta=obj.get("pdf_meta"),
-                html_fulltext=obj.get("html_fulltext"),
-                sim_fulltext=obj.get("sim_fulltext"),
-            )
+            bundle = IntermediateBundle.from_json(obj)
+            assert bundle.doc_type  # attribute access: the model has no .get()
             es_doc = transform_heavy(bundle)
             if not es_doc:
                 continue
-- 
cgit v1.2.3