summary | refs | log | tree | commit | diff | stats
path: root/fatcat_scholar/worker.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2021-06-01 01:04:16 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-06-02 00:38:51 -0700
commit: 01a1978d5b9667df4ae71a7934512e6c4e3bf9a8 (patch)
tree: d9d0e87e0c1bdb842696018174db4f432bab20c9 /fatcat_scholar/worker.py
parent: 86b29ed5fca70fc0c52443acf6a5ec1a398ed3f6 (diff)
download: fatcat-scholar-01a1978d5b9667df4ae71a7934512e6c4e3bf9a8.tar.gz
fatcat-scholar-01a1978d5b9667df4ae71a7934512e6c4e3bf9a8.zip
schema: add 'crossref' to bundle schema, and add from_json() helper
from_json() refactor was an earlier TODO, to reduce duplication when updating fields on this class
Diffstat (limited to 'fatcat_scholar/worker.py')
-rw-r--r-- fatcat_scholar/worker.py 15
1 files changed, 2 insertions, 13 deletions
diff --git a/fatcat_scholar/worker.py b/fatcat_scholar/worker.py
index 823f1bd..7787d42 100644
--- a/fatcat_scholar/worker.py
+++ b/fatcat_scholar/worker.py
@@ -123,19 +123,8 @@ class IndexDocsWorker(KafkaWorker):
bulk_actions = []
for obj in batch:
- bundle = IntermediateBundle(
- doc_type=DocType(obj["doc_type"]),
- releases=[
- entity_from_json(json.dumps(re), ReleaseEntity)
- for re in obj["releases"]
- ],
- biblio_release_ident=obj.get("biblio_release_ident"),
- grobid_fulltext=obj.get("grobid_fulltext"),
- pdftotext_fulltext=obj.get("pdftotext_fulltext"),
- pdf_meta=obj.get("pdf_meta"),
- html_fulltext=obj.get("html_fulltext"),
- sim_fulltext=obj.get("sim_fulltext"),
- )
+ bundle = IntermediateBundle.from_json(obj)
+ assert bundle.get('doc_type')
es_doc = transform_heavy(bundle)
if not es_doc:
continue