aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers/grobid_metadata.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools/importers/grobid_metadata.py')
-rw-r--r--python/fatcat_tools/importers/grobid_metadata.py104
1 files changed, 68 insertions, 36 deletions
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index 5e61a154..c1835b9f 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -5,12 +5,22 @@ import json
import base64
import datetime
import fatcat_client
-from .common import FatcatImporter
+from .common import EntityImporter
MAX_ABSTRACT_BYTES=4096
-class GrobidMetadataImporter(FatcatImporter):
+class GrobidMetadataImporter(EntityImporter):
+ """
+ This is a complex case: we need to parse and create both file and release entities.
+
+ The "primary" entity here is really File, not Release. If a matching File
+ exists, we bail in want(); if not we insert the Release during parsing, and
+ insert both.
+
+ TODO: should instead check if the File has any releases; if not, insert and update.
+ TODO: relaxing 'None' constraint on parse_record() might make this refactor-able.
+ """
def __init__(self, api, **kwargs):
@@ -23,10 +33,52 @@ class GrobidMetadataImporter(FatcatImporter):
editgroup_extra=eg_extra)
self.default_link_rel = kwargs.get("default_link_rel", "web")
- def parse_grobid_json(self, obj):
+ def want(self, raw_record):
+
+ fields = raw_record.split('\t')
+ sha1_key = fields[0]
+ sha1 = base64.b16encode(base64.b32decode(sha1_key.replace('sha1:', ''))).decode('ascii').lower()
+ #cdx = json.loads(fields[1])
+ #mimetype = fields[2]
+ #file_size = int(fields[3])
+ grobid_meta = json.loads(fields[4])
+
+ if not grobid_meta.get('title'):
+ return False
+
+ # lookup existing file SHA1
+ try:
+ existing_file = self.api.lookup_file(sha1=sha1)
+ except fatcat_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+ existing_file = None
+
+ # if file is already in here, presumably not actually long-tail
+ # TODO: this is where we should check if the file actually has
+ # release_ids and/or URLs associated with it
+ if existing_file and not self.bezerk_mode:
+ return False
+ return True
+
+ def parse_record(self, row):
+
+ fields = row.split('\t')
+ sha1_key = fields[0]
+ cdx = json.loads(fields[1])
+ mimetype = fields[2]
+ file_size = int(fields[3])
+ grobid_meta = json.loads(fields[4])
+ fe = self.parse_file_metadata(sha1_key, cdx, mimetype, file_size)
+ re = self.parse_grobid_json(grobid_meta)
+ assert (fe and re)
- if not obj.get('title'):
- return None
+ release_edit = self.create_release(re)
+ fe.release_ids.append(release_edit.ident)
+ return fe
+
+ def parse_grobid_json(self, obj):
+ assert obj.get('title')
extra = dict()
@@ -122,17 +174,6 @@ class GrobidMetadataImporter(FatcatImporter):
sha1 = base64.b16encode(base64.b32decode(sha1_key.replace('sha1:', ''))).decode('ascii').lower()
- # lookup existing SHA1, or create new entity
- try:
- existing_file = self.api.lookup_file(sha1=sha1)
- except fatcat_client.rest.ApiException as err:
- if err.status != 404:
- raise err
- existing_file = None
-
- if existing_file:
- # if file is already in here, presumably not actually long-tail
- return None
fe = fatcat_client.FileEntity(
sha1=sha1,
size=int(file_size),
@@ -143,6 +184,7 @@ class GrobidMetadataImporter(FatcatImporter):
# parse URLs and CDX
original = cdx['url']
+ assert len(cdx['dt']) >= 8
wayback = "https://web.archive.org/web/{}/{}".format(
cdx['dt'],
original)
@@ -154,23 +196,13 @@ class GrobidMetadataImporter(FatcatImporter):
return fe
- def create_row(self, row, editgroup_id=None):
- if not row:
- return
- fields = row.split('\t')
- sha1_key = fields[0]
- cdx = json.loads(fields[1])
- mimetype = fields[2]
- file_size = int(fields[3])
- grobid_meta = json.loads(fields[4])
- fe = self.parse_file_metadata(sha1_key, cdx, mimetype, file_size)
- re = self.parse_grobid_json(grobid_meta)
- if fe and re:
- release_entity = self.api.create_release(re, editgroup_id=editgroup_id)
- # release ident can't already be in release list because we just
- # created it
- fe.release_ids.append(release_entity.ident)
- file_entity = self.api.create_file(fe, editgroup_id=editgroup_id)
- self.counts['insert'] += 1
-
- # NB: batch mode not implemented
+ def try_update(entity):
+ # we did this in want()
+ return True
+
+ def insert_batch(self, batch):
+ self.api.create_file_batch(batch,
+ autoaccept=True,
+ description=self.editgroup_description,
+ extra=json.dumps(self.editgroup_extra))
+