| author | Bryan Newbold <bnewbold@robocracy.org> | 2019-01-23 15:02:03 -0800 |
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-01-23 15:02:03 -0800 |
| commit | 1443f05faebd9e697086132694401f6a6c42d9b5 (patch) | |
| tree | 8da8b8e7f4c957c5edccefe9188741c15697cd46 /python/fatcat_tools/importers/grobid_metadata.py | |
| parent | 1fa8f820fd3b7c64d424f55796d2b860d22e4b22 (diff) | |
| download | fatcat-1443f05faebd9e697086132694401f6a6c42d9b5.tar.gz, fatcat-1443f05faebd9e697086132694401f6a6c42d9b5.zip | |
more tests; fix some importer behavior
Diffstat (limited to 'python/fatcat_tools/importers/grobid_metadata.py')
-rw-r--r-- | python/fatcat_tools/importers/grobid_metadata.py | 50 |
1 file changed, 23 insertions, 27 deletions
```diff
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index c1835b9f..4d3b41bc 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -34,51 +34,47 @@ class GrobidMetadataImporter(EntityImporter):
         self.default_link_rel = kwargs.get("default_link_rel", "web")
 
     def want(self, raw_record):
+        return True
+
+    def parse_record(self, row):
 
-        fields = raw_record.split('\t')
+        fields = row.split('\t')
         sha1_key = fields[0]
-        sha1 = base64.b16encode(base64.b32decode(sha1_key.replace('sha1:', ''))).decode('ascii').lower()
-        #cdx = json.loads(fields[1])
-        #mimetype = fields[2]
-        #file_size = int(fields[3])
+        cdx = json.loads(fields[1])
+        mimetype = fields[2]
+        file_size = int(fields[3])
         grobid_meta = json.loads(fields[4])
+        fe = self.parse_file_metadata(sha1_key, cdx, mimetype, file_size)
+        re = self.parse_grobid_json(grobid_meta)
 
-        if not grobid_meta.get('title'):
-            return False
+        if not (fe and re):
+            return None
 
         # lookup existing file SHA1
+        existing = None
         try:
-            existing_file = self.api.lookup_file(sha1=sha1)
+            existing = self.api.lookup_file(sha1=fe.sha1)
         except fatcat_client.rest.ApiException as err:
             if err.status != 404:
                 raise err
-            existing_file = None
 
         # if file is already in here, presumably not actually long-tail
+        # HACK: this is doing an exists check in parse_record(), which is weird
         # TODO: this is where we should check if the file actually has
         # release_ids and/or URLs associated with it
-        if existing_file and not self.bezerk_mode:
-            return False
-        return True
-
-    def parse_record(self, row):
-
-        fields = row.split('\t')
-        sha1_key = fields[0]
-        cdx = json.loads(fields[1])
-        mimetype = fields[2]
-        file_size = int(fields[3])
-        grobid_meta = json.loads(fields[4])
-        fe = self.parse_file_metadata(sha1_key, cdx, mimetype, file_size)
-        re = self.parse_grobid_json(grobid_meta)
-        assert (fe and re)
+        if existing and not self.bezerk_mode:
+            self.counts['exists'] += 1
+            self.counts['skip'] -= 1
+            return None
 
         release_edit = self.create_release(re)
         fe.release_ids.append(release_edit.ident)
         return fe
 
     def parse_grobid_json(self, obj):
-        assert obj.get('title')
+
+        if not obj.get('title'):
+            return None
 
         extra = dict()
 
@@ -196,8 +192,8 @@ class GrobidMetadataImporter(EntityImporter):
 
         return fe
 
-    def try_update(entity):
-        # we did this in want()
+    def try_update(self, entity):
+        # did the exists check in 'parse_record()', because we needed to create a release
         return True
 
     def insert_batch(self, batch):
```
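For context on why the exists check moved: the `EntityImporter` base class (defined elsewhere in fatcat_tools, not shown in this diff) drives each record through `want()`, `parse_record()`, and `try_update()` before batching inserts. The sketch below is a rough, assumption-laden reconstruction of that driver loop, inferred only from the method names and the `self.counts` arithmetic visible in the diff above; the real base class almost certainly differs in detail.

```python
# Hypothetical sketch of the EntityImporter driver loop implied by this
# diff; NOT the actual fatcat base class. want()/parse_record()/try_update()
# are supplied by subclasses such as GrobidMetadataImporter.
class EntityImporter:

    def __init__(self):
        self.counts = {'skip': 0, 'exists': 0, 'insert': 0}
        self._entity_queue = []

    def push_record(self, raw_record):
        # Cheap pre-filter; after this commit GrobidMetadataImporter
        # always returns True here and defers the real checks.
        if not self.want(raw_record):
            self.counts['skip'] += 1
            return
        entity = self.parse_record(raw_record)
        if entity is None:
            # parse_record() returning None is counted as a skip here.
            # This is why the importer's "exists" branch pre-decrements
            # counts['skip']: the increment below nets out to zero, so a
            # duplicate file is counted only under 'exists'.
            self.counts['skip'] += 1
            return
        # try_update() returning True means "proceed with insert";
        # GrobidMetadataImporter already did its exists check earlier,
        # because it had to create a release before building the file.
        if self.try_update(entity):
            self._entity_queue.append(entity)
            self.counts['insert'] += 1
```

Under this reading, moving the duplicate-file check into `parse_record()` keeps the counts coherent while still letting the importer bail out before `create_release()` is called for files that already exist.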
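The long `base64` one-liner deleted from `want()` converted the base32-encoded `sha1:`-prefixed key from the input TSV into the lowercase hex digest that the fatcat API lookup expects; after this change the lookup uses `fe.sha1` from `parse_file_metadata()`, so the conversion presumably happens there instead. For illustration, here is that conversion in isolation (the sample key is made up):

```python
import base64

# A made-up example key in the 'sha1:' base32 form used in the input rows
sha1_key = "sha1:TXGCDRLEMAUWACD2IVEEBXLZPUSVSWAP"

# Strip the prefix, base32-decode to the raw 20-byte digest,
# then hex-encode and lowercase it
raw = base64.b32decode(sha1_key.replace('sha1:', ''))
sha1_hex = base64.b16encode(raw).decode('ascii').lower()

print(sha1_hex)  # 40-character lowercase hex SHA-1 digest
```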