author     Bryan Newbold <bnewbold@archive.org>  2021-06-01 01:04:16 -0700
committer  Bryan Newbold <bnewbold@archive.org>  2021-06-02 00:38:51 -0700
commit     01a1978d5b9667df4ae71a7934512e6c4e3bf9a8 (patch)
tree       d9d0e87e0c1bdb842696018174db4f432bab20c9
parent     86b29ed5fca70fc0c52443acf6a5ec1a398ed3f6 (diff)
schema: add 'crossref' to bundle schema, and add from_json() helper
The from_json() refactor was an earlier TODO; it reduces duplication when updating fields on this class.
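
A minimal round-trip sketch of how the new helper is meant to be used (the bundle values below are placeholders, not taken from this commit; assumes DocType.work serializes to its string value via pydantic's .json()):

    import json

    from fatcat_scholar.schema import DocType, IntermediateBundle

    bundle = IntermediateBundle(
        doc_type=DocType.work,
        releases=[],
        biblio_release_ident=None,
        crossref=None,
        grobid_fulltext=None,
        pdftotext_fulltext=None,
        pdf_meta=None,
        sim_fulltext=None,
        html_fulltext=None,
    )
    # serialize to one JSON line, then rebuild the bundle from the parsed dict
    line = bundle.json(exclude_none=True)
    roundtrip = IntermediateBundle.from_json(json.loads(line))
    assert roundtrip.doc_type == DocType.work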
-rw-r--r--  fatcat_scholar/schema.py          21
-rw-r--r--  fatcat_scholar/sim_pipeline.py     1
-rw-r--r--  fatcat_scholar/transform.py       30
-rw-r--r--  fatcat_scholar/work_pipeline.py    1
-rw-r--r--  fatcat_scholar/worker.py          15
5 files changed, 28 insertions(+), 40 deletions(-)
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index 9912a97..b8a1923 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -5,6 +5,7 @@ auto-conversion of datetime objects.
"""
import re
+import json
import datetime
from enum import Enum
from typing import Optional, List, Any, Dict
@@ -18,7 +19,7 @@ from pydantic import BaseModel
# pytype: enable=import-error
from fatcat_openapi_client import ReleaseEntity, ReleaseContrib
-from fatcat_scholar.api_entities import entity_to_dict
+from fatcat_scholar.api_entities import entity_to_dict, entity_from_json
from fatcat_scholar.biblio_hacks import doi_link_domain
@@ -31,6 +32,7 @@ class IntermediateBundle(BaseModel):
    doc_type: DocType
    releases: List[ReleaseEntity]
    biblio_release_ident: Optional[str]
+    crossref: Optional[Dict[str, Any]]
    grobid_fulltext: Optional[Dict[str, Any]]
    pdftotext_fulltext: Optional[Dict[str, Any]]
    pdf_meta: Optional[Dict[str, Any]]
@@ -45,6 +47,23 @@ class IntermediateBundle(BaseModel):
            datetime.datetime: lambda dt: dt.isoformat(),
        }
+    @classmethod
+    def from_json(cls, obj: Dict[Any, Any]) -> "IntermediateBundle":
+        return IntermediateBundle(
+            doc_type=DocType(obj.get("doc_type")),
+            releases=[
+                entity_from_json(json.dumps(re), ReleaseEntity)
+                for re in obj.get("releases", [])
+            ],
+            biblio_release_ident=obj.get("biblio_release_ident"),
+            crossref=obj.get("crossref"),
+            grobid_fulltext=obj.get("grobid_fulltext"),
+            pdftotext_fulltext=obj.get("pdftotext_fulltext"),
+            pdf_meta=obj.get("pdf_meta"),
+            sim_fulltext=obj.get("sim_fulltext"),
+            html_fulltext=obj.get("html_fulltext"),
+        )
+
class AccessType(str, Enum):
ia_sim = "ia_sim"
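
Note that releases is the one field pydantic cannot parse on its own: ReleaseEntity is a fatcat_openapi_client model, hence the per-entity json.dumps() / entity_from_json() round trip in from_json() above. A rough standalone sketch of that step, with a hypothetical release dict:

    import json

    from fatcat_openapi_client import ReleaseEntity
    from fatcat_scholar.api_entities import entity_from_json

    release_dict = {"title": "An Example Paper"}  # hypothetical; normally a full release record
    release = entity_from_json(json.dumps(release_dict), ReleaseEntity)
    assert release.title == "An Example Paper"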
diff --git a/fatcat_scholar/sim_pipeline.py b/fatcat_scholar/sim_pipeline.py
index e5e2a02..d602edf 100644
--- a/fatcat_scholar/sim_pipeline.py
+++ b/fatcat_scholar/sim_pipeline.py
@@ -122,6 +122,7 @@ class SimPipeline:
doc_type=DocType.sim_page,
releases=[],
biblio_release_ident=None,
+ crossref=None,
grobid_fulltext=None,
pdftotext_fulltext=None,
sim_fulltext=dict(
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index 53f83ae..1c4b0e7 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -733,19 +733,8 @@ def run_transform(infile: Sequence) -> None:
    for line in infile:
        obj = json.loads(line)
-        heavy = IntermediateBundle(
-            doc_type=DocType(obj["doc_type"]),
-            releases=[
-                entity_from_json(json.dumps(re), ReleaseEntity)
-                for re in obj["releases"]
-            ],
-            biblio_release_ident=obj.get("biblio_release_ident"),
-            grobid_fulltext=obj.get("grobid_fulltext"),
-            pdftotext_fulltext=obj.get("pdftotext_fulltext"),
-            pdf_meta=obj.get("pdf_meta"),
-            sim_fulltext=obj.get("sim_fulltext"),
-            html_fulltext=obj.get("html_fulltext"),
-        )
+        heavy = IntermediateBundle.from_json(obj)
+        assert heavy.doc_type
        es_doc = transform_heavy(heavy)
        if not es_doc:
            continue
@@ -756,19 +745,8 @@ def run_refs(infile: Sequence) -> None:
    for line in infile:
        obj = json.loads(line)
-        heavy = IntermediateBundle(
-            doc_type=DocType(obj["doc_type"]),
-            releases=[
-                entity_from_json(json.dumps(re), ReleaseEntity)
-                for re in obj["releases"]
-            ],
-            biblio_release_ident=obj.get("biblio_release_ident"),
-            grobid_fulltext=obj.get("grobid_fulltext"),
-            pdftotext_fulltext=obj.get("pdftotext_fulltext"),
-            pdf_meta=obj.get("pdf_meta"),
-            sim_fulltext=obj.get("sim_fulltext"),
-            html_fulltext=obj.get("html_fulltext"),
-        )
+        heavy = IntermediateBundle.from_json(obj)
+        assert heavy.doc_type
        refs = refs_from_heavy(heavy)
        for ref in refs:
            print(ref.json(exclude_none=True, sort_keys=True))
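
Both command-line paths (run_transform and run_refs) now parse input lines through IntermediateBundle.from_json(). A hedged sketch of driving one of them from a JSON-lines file of intermediate bundles (the path is hypothetical):

    from fatcat_scholar.transform import run_transform

    # each input line is one serialized IntermediateBundle; the transformed
    # documents are emitted line-by-line, analogous to run_refs above
    with open("intermediate_bundles.json") as f:
        run_transform(f)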
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index 4c8f1be..7b477a0 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -389,6 +389,7 @@ class WorkPipeline:
doc_type=DocType.work,
releases=releases,
biblio_release_ident=pref_idents[0],
+ crossref=biblio_crossref,
grobid_fulltext=grobid_fulltext,
pdftotext_fulltext=pdftotext_fulltext,
pdf_meta=pdf_meta,
diff --git a/fatcat_scholar/worker.py b/fatcat_scholar/worker.py
index 823f1bd..7787d42 100644
--- a/fatcat_scholar/worker.py
+++ b/fatcat_scholar/worker.py
@@ -123,19 +123,8 @@ class IndexDocsWorker(KafkaWorker):
        bulk_actions = []
        for obj in batch:
-            bundle = IntermediateBundle(
-                doc_type=DocType(obj["doc_type"]),
-                releases=[
-                    entity_from_json(json.dumps(re), ReleaseEntity)
-                    for re in obj["releases"]
-                ],
-                biblio_release_ident=obj.get("biblio_release_ident"),
-                grobid_fulltext=obj.get("grobid_fulltext"),
-                pdftotext_fulltext=obj.get("pdftotext_fulltext"),
-                pdf_meta=obj.get("pdf_meta"),
-                html_fulltext=obj.get("html_fulltext"),
-                sim_fulltext=obj.get("sim_fulltext"),
-            )
+            bundle = IntermediateBundle.from_json(obj)
+            assert bundle.doc_type
            es_doc = transform_heavy(bundle)
            if not es_doc:
                continue