path: root/python/fatcat_tools/importers/grobid_metadata.py
Diffstat (limited to 'python/fatcat_tools/importers/grobid_metadata.py')
-rw-r--r--  python/fatcat_tools/importers/grobid_metadata.py  123
1 file changed, 77 insertions(+), 46 deletions(-)
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index 5e61a154..9d95fe0b 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -5,12 +5,22 @@ import json
import base64
import datetime
import fatcat_client
-from .common import FatcatImporter
+from .common import EntityImporter, clean
MAX_ABSTRACT_BYTES=4096
-class GrobidMetadataImporter(FatcatImporter):
+class GrobidMetadataImporter(EntityImporter):
+ """
+ This is a complex case: we need to parse and create both file and release entities.
+
+ The "primary" entity here is really File, not Release. If a matching File
+ exists, we bail in want(); if not we insert the Release during parsing, and
+ insert both.
+
+ TODO: should instead check if the File has any releases; if not, insert and update.
+ TODO: relaxing 'None' constraint on parse_record() might make this refactor-able.
+ """
def __init__(self, api, **kwargs):
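For orientation, the EntityImporter base class drives subclasses through a push loop; below is a minimal sketch of that call order, assuming only the hook names used in this diff (want / parse_record / try_update / insert_batch). The driver function itself is hypothetical, not part of this change.

    # Minimal sketch of the push loop an EntityImporter subclass is driven by.
    # The function name and batch handling here are hypothetical; only the hook
    # order (want -> parse_record -> try_update -> insert_batch) is assumed.
    def push_records(importer, lines, batch_size=50):
        batch = []
        for line in lines:
            line = line.strip()
            if not line:
                continue
            if not importer.want(line):            # cheap pre-filter
                continue
            entity = importer.parse_record(line)   # returns None to skip a row
            if entity is None:
                continue
            if importer.try_update(entity):        # True => stage for batch insert
                batch.append(entity)
            if len(batch) >= batch_size:
                importer.insert_batch(batch)
                batch = []
        if batch:
            importer.insert_batch(batch)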
@@ -22,6 +32,45 @@ class GrobidMetadataImporter(FatcatImporter):
editgroup_description=eg_desc,
editgroup_extra=eg_extra)
self.default_link_rel = kwargs.get("default_link_rel", "web")
+ self.longtail_oa = kwargs.get("longtail_oa", False)
+
+ def want(self, raw_record):
+ return True
+
+ def parse_record(self, row):
+
+ fields = row.split('\t')
+ sha1_key = fields[0]
+ cdx = json.loads(fields[1])
+ mimetype = fields[2]
+ file_size = int(fields[3])
+ grobid_meta = json.loads(fields[4])
+ fe = self.parse_file_metadata(sha1_key, cdx, mimetype, file_size)
+ re = self.parse_grobid_json(grobid_meta)
+
+ if not (fe and re):
+ return None
+
+ # lookup existing file SHA1
+ existing = None
+ try:
+ existing = self.api.lookup_file(sha1=fe.sha1)
+ except fatcat_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+
+ # if file is already in here, presumably not actually long-tail
+ # HACK: this is doing an exists check in parse_record(), which is weird
+ # TODO: this is where we should check if the file actually has
+ # release_ids and/or URLs associated with it
+ if existing and not self.bezerk_mode:
+ self.counts['exists'] += 1
+ self.counts['skip'] -= 1
+ return None
+
+ release_edit = self.create_release(re)
+ fe.release_ids.append(release_edit.ident)
+ return fe
def parse_grobid_json(self, obj):
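For reference, each input row is a five-column tab-separated line; a hypothetical example of the shape parse_record() splits apart (all values made up):

    # sha1_key \t cdx_json \t mimetype \t size_bytes \t grobid_json
    row = "\t".join([
        "sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ",
        '{"url": "http://example.com/paper.pdf", "dt": "20170227164644"}',
        "application/pdf",
        "12345",
        '{"title": "An Example Title", "journal": {"name": "Example Journal"},'
        ' "authors": [{"name": "Jane Doe"}], "citations": []}',
    ])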
@@ -34,7 +83,7 @@ class GrobidMetadataImporter(FatcatImporter):
abobj = dict(
mimetype="text/plain",
language=None,
- content=obj.get('abstract').strip())
+ content=clean(obj.get('abstract')))
abstracts = [abobj]
else:
abstracts = None
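The clean() helper is imported from .common; roughly, it normalizes a string and returns None for empty input, along the lines of the sketch below. The real implementation may also repair mojibake and, with force_xml, handle XML entity unescaping; this is an approximation, not the actual helper.

    # Approximation of the imported clean() helper; see .common for the real one.
    def clean_sketch(thing, force_xml=False):
        if thing is None:
            return None
        fixed = " ".join(thing.split())   # collapse whitespace, strip ends
        return fixed if fixed else None   # empty strings become None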
@@ -43,17 +92,18 @@ class GrobidMetadataImporter(FatcatImporter):
for i, a in enumerate(obj.get('authors', [])):
contribs.append(fatcat_client.ReleaseContrib(
index=i,
- raw_name=a['name'],
+ raw_name=clean(a['name']),
role="author",
extra=None))
+ # XXX: why is this a dict()? not covered by tests?
refs = []
for raw in obj.get('citations', []):
cite_extra = dict()
ref = dict()
- ref['key'] = raw.get('id')
+ ref['key'] = clean(raw.get('id'))
if raw.get('title'):
- ref['title'] = raw['title'].strip()
+ ref['title'] = clean(raw['title'])
if raw.get('date'):
try:
year = int(raw['date'].strip()[:4])
@@ -62,9 +112,9 @@ class GrobidMetadataImporter(FatcatImporter):
pass
for key in ('volume', 'url', 'issue', 'publisher'):
if raw.get(key):
- cite_extra[key] = raw[key].strip()
+ cite_extra[key] = clean(raw[key])
if raw.get('authors'):
- cite_extra['authors'] = [a['name'] for a in raw['authors']]
+ cite_extra['authors'] = [clean(a['name']) for a in raw['authors']]
if cite_extra:
cite_extra = dict(grobid=cite_extra)
else:
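To make the nesting concrete, here is a hypothetical GROBID citation and what the loop above builds from it (values invented):

    raw = {
        "id": "b12",
        "title": "Prior Work",
        "date": "1998-04",
        "volume": "7",
        "authors": [{"name": "A. Smith"}, {"name": "B. Jones"}],
    }
    # ref gets key="b12" and title="Prior Work" (the year is parsed in the
    # try block above); the leftover fields are nested under a "grobid" key:
    # cite_extra == {"grobid": {"volume": "7", "authors": ["A. Smith", "B. Jones"]}}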
@@ -81,27 +131,28 @@ class GrobidMetadataImporter(FatcatImporter):
if obj.get('doi'):
extra['doi'] = obj['doi']
if obj['journal'] and obj['journal'].get('name'):
- extra['container_name'] = obj['journal']['name']
-
- extra['is_longtail_oa'] = True
+ extra['container_name'] = clean(obj['journal']['name'])
# TODO: ISSN/eISSN handling? or just journal name lookup?
+ if self.longtail_oa:
+ extra['longtail_oa'] = True
+
if extra:
extra = dict(grobid=extra)
else:
extra = None
re = fatcat_client.ReleaseEntity(
- title=obj['title'].strip(),
+ title=clean(obj['title'], force_xml=True),
release_type="article-journal",
release_date=release_date,
release_year=release_year,
contribs=contribs,
refs=refs,
- publisher=obj['journal'].get('publisher'),
- volume=obj['journal'].get('volume'),
- issue=obj['journal'].get('issue'),
+ publisher=clean(obj['journal'].get('publisher')),
+ volume=clean(obj['journal'].get('volume')),
+ issue=clean(obj['journal'].get('issue')),
abstracts=abstracts,
extra=extra)
return re
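Concretely, with longtail_oa enabled and a DOI plus journal name present, the release-level extra ends up nested like this (values hypothetical):

    extra = {
        "grobid": {
            "doi": "10.1234/example.5678",        # passed through as-is
            "container_name": "Example Journal",  # cleaned journal name
            "longtail_oa": True,                  # only when longtail_oa=True was set
        }
    }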
@@ -122,17 +173,6 @@ class GrobidMetadataImporter(FatcatImporter):
sha1 = base64.b16encode(base64.b32decode(sha1_key.replace('sha1:', ''))).decode('ascii').lower()
- # lookup existing SHA1, or create new entity
- try:
- existing_file = self.api.lookup_file(sha1=sha1)
- except fatcat_client.rest.ApiException as err:
- if err.status != 404:
- raise err
- existing_file = None
-
- if existing_file:
- # if file is already in here, presumably not actually long-tail
- return None
fe = fatcat_client.FileEntity(
sha1=sha1,
size=int(file_size),
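The sha1_key column carries the base32-encoded digest; a quick standalone check of the conversion above, using the well-known SHA-1 of the empty file:

    import base64

    key = "sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ"   # base32 form of the empty-file SHA-1
    sha1_hex = base64.b16encode(
        base64.b32decode(key.replace("sha1:", ""))
    ).decode("ascii").lower()
    assert sha1_hex == "da39a3ee5e6b4b0d3255bfef95601890afd80709"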
@@ -143,6 +183,7 @@ class GrobidMetadataImporter(FatcatImporter):
# parse URLs and CDX
original = cdx['url']
+ assert len(cdx['dt']) >= 8
wayback = "https://web.archive.org/web/{}/{}".format(
cdx['dt'],
original)
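With a hypothetical CDX record, the wayback URL comes out as:

    cdx = {"url": "http://example.com/paper.pdf", "dt": "20170227164644"}
    wayback = "https://web.archive.org/web/{}/{}".format(cdx["dt"], cdx["url"])
    # -> "https://web.archive.org/web/20170227164644/http://example.com/paper.pdf"
    # the assert above guards against truncated (shorter than YYYYMMDD) timestamps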
@@ -154,23 +195,13 @@ class GrobidMetadataImporter(FatcatImporter):
return fe
- def create_row(self, row, editgroup_id=None):
- if not row:
- return
- fields = row.split('\t')
- sha1_key = fields[0]
- cdx = json.loads(fields[1])
- mimetype = fields[2]
- file_size = int(fields[3])
- grobid_meta = json.loads(fields[4])
- fe = self.parse_file_metadata(sha1_key, cdx, mimetype, file_size)
- re = self.parse_grobid_json(grobid_meta)
- if fe and re:
- release_entity = self.api.create_release(re, editgroup_id=editgroup_id)
- # release ident can't already be in release list because we just
- # created it
- fe.release_ids.append(release_entity.ident)
- file_entity = self.api.create_file(fe, editgroup_id=editgroup_id)
- self.counts['insert'] += 1
-
- # NB: batch mode not implemented
+ def try_update(self, entity):
+ # the exists check was already done in parse_record(), since we needed to create the release there
+ return True
+
+ def insert_batch(self, batch):
+ self.api.create_file_batch(batch,
+ autoaccept=True,
+ description=self.editgroup_description,
+ extra=json.dumps(self.editgroup_extra))
+
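Finally, a hedged end-to-end usage sketch; the client construction and file path below are illustrative and not part of this change, and push_records() is the hypothetical driver sketched near the top of this diff:

    import fatcat_client

    # Hypothetical wiring; real setup (auth, host, editgroup config) lives elsewhere.
    api = fatcat_client.DefaultApi(fatcat_client.ApiClient())
    importer = GrobidMetadataImporter(api, longtail_oa=True)
    with open("grobid_metadata.tsv") as tsv:
        push_records(importer, tsv)   # want() -> parse_record() -> try_update() -> insert_batch()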