Diffstat (limited to 'python/fatcat_tools/importers/grobid_metadata.py')
-rw-r--r--    python/fatcat_tools/importers/grobid_metadata.py    136
1 file changed, 74 insertions(+), 62 deletions(-)
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index 0f666652..f7bb5357 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -7,7 +7,7 @@ import fatcat_openapi_client
 
 from .common import EntityImporter, clean, make_rel_url
 
-MAX_ABSTRACT_BYTES=4096
+MAX_ABSTRACT_BYTES = 4096
 
 
 class GrobidMetadataImporter(EntityImporter):
@@ -24,14 +24,13 @@ class GrobidMetadataImporter(EntityImporter):
 
     def __init__(self, api, **kwargs):
 
-        eg_desc = kwargs.get('editgroup_description',
-            "Import of release and file metadata, as extracted from PDFs by GROBID.")
-        eg_extra = kwargs.get('editgroup_extra', dict())
-        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.GrobidMetadataImporter')
-        super().__init__(api,
-            editgroup_description=eg_desc,
-            editgroup_extra=eg_extra,
-            **kwargs)
+        eg_desc = kwargs.get(
+            "editgroup_description",
+            "Import of release and file metadata, as extracted from PDFs by GROBID.",
+        )
+        eg_extra = kwargs.get("editgroup_extra", dict())
+        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.GrobidMetadataImporter")
+        super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
 
         self.default_link_rel = kwargs.get("default_link_rel", "web")
         self.longtail_oa = kwargs.get("longtail_oa", False)
@@ -40,7 +39,7 @@ class GrobidMetadataImporter(EntityImporter):
 
     def parse_record(self, row):
 
-        fields = row.split('\t')
+        fields = row.split("\t")
         sha1_key = fields[0]
         cdx = json.loads(fields[1])
         mimetype = fields[2]
@@ -65,8 +64,8 @@ class GrobidMetadataImporter(EntityImporter):
         # TODO: this is where we should check if the file actually has
         # release_ids and/or URLs associated with it
         if existing and not self.bezerk_mode:
-            self.counts['exists'] += 1
-            self.counts['skip'] -= 1
+            self.counts["exists"] += 1
+            self.counts["skip"] -= 1
             return None
 
         release_edit = self.create_release(re)
@@ -75,75 +74,81 @@ class GrobidMetadataImporter(EntityImporter):
 
     def parse_grobid_json(self, obj):
 
-        if not obj.get('title'):
+        if not obj.get("title"):
             return None
 
         extra_grobid = dict()
 
-        abstract = obj.get('abstract')
+        abstract = obj.get("abstract")
         if abstract and len(abstract) < MAX_ABSTRACT_BYTES and len(abstract) > 10:
             abobj = fatcat_openapi_client.ReleaseAbstract(
-                mimetype="text/plain",
-                content=clean(obj.get('abstract')))
+                mimetype="text/plain", content=clean(obj.get("abstract"))
+            )
             abstracts = [abobj]
         else:
             abstracts = None
 
         contribs = []
-        for i, a in enumerate(obj.get('authors', [])):
-            contribs.append(fatcat_openapi_client.ReleaseContrib(
-                index=i,
-                raw_name=clean(a['name']),
-                given_name=clean(a.get('given_name')),
-                surname=clean(a.get('surname')),
-                role="author",
-                extra=None))
+        for i, a in enumerate(obj.get("authors", [])):
+            contribs.append(
+                fatcat_openapi_client.ReleaseContrib(
+                    index=i,
+                    raw_name=clean(a["name"]),
+                    given_name=clean(a.get("given_name")),
+                    surname=clean(a.get("surname")),
+                    role="author",
+                    extra=None,
+                )
+            )
 
         refs = []
-        for raw in obj.get('citations', []):
+        for raw in obj.get("citations", []):
             cite_extra = dict()
             year = None
-            if raw.get('date'):
+            if raw.get("date"):
                 try:
-                    year = int(raw['date'].strip()[:4])
+                    year = int(raw["date"].strip()[:4])
                 except (IndexError, ValueError):
                     pass
-            for key in ('volume', 'url', 'issue', 'publisher'):
+            for key in ("volume", "url", "issue", "publisher"):
                 if raw.get(key):
                     cite_extra[key] = clean(raw[key])
-            if raw.get('authors'):
-                cite_extra['authors'] = [clean(a['name']) for a in raw['authors']]
+            if raw.get("authors"):
+                cite_extra["authors"] = [clean(a["name"]) for a in raw["authors"]]
             if not cite_extra:
                 cite_extra = None
-            refs.append(fatcat_openapi_client.ReleaseRef(
-                key=clean(raw.get('id')),
-                year=year,
-                title=clean(raw['title']),
-                extra=cite_extra))
+            refs.append(
+                fatcat_openapi_client.ReleaseRef(
+                    key=clean(raw.get("id")),
+                    year=year,
+                    title=clean(raw["title"]),
+                    extra=cite_extra,
+                )
+            )
 
         release_date = None
         release_year = None
-        if obj.get('date'):
+        if obj.get("date"):
             # only returns year, ever?
-            release_year = int(obj['date'][:4])
+            release_year = int(obj["date"][:4])
 
         extra = dict()
-        if obj.get('doi'):
-            extra['doi'] = obj['doi']
-        if obj['journal'] and obj['journal'].get('name'):
-            extra['container_name'] = clean(obj['journal']['name'])
+        if obj.get("doi"):
+            extra["doi"] = obj["doi"]
+        if obj["journal"] and obj["journal"].get("name"):
+            extra["container_name"] = clean(obj["journal"]["name"])
 
         # TODO: ISSN/eISSN handling? or just journal name lookup?
 
         if extra_grobid:
-            extra['grobid'] = extra_grobid
+            extra["grobid"] = extra_grobid
         if self.longtail_oa:
-            extra['longtail_oa'] = True
+            extra["longtail_oa"] = True
         if not extra:
             extra = None
 
-        title = clean(obj['title'], force_xml=True)
+        title = clean(obj["title"], force_xml=True)
         if not title or len(title) < 2:
             return None
@@ -154,17 +159,22 @@ class GrobidMetadataImporter(EntityImporter):
             release_year=release_year,
             contribs=contribs,
             refs=refs,
-            publisher=clean(obj['journal'].get('publisher')),
-            volume=clean(obj['journal'].get('volume')),
-            issue=clean(obj['journal'].get('issue')),
+            publisher=clean(obj["journal"].get("publisher")),
+            volume=clean(obj["journal"].get("volume")),
+            issue=clean(obj["journal"].get("issue")),
             abstracts=abstracts,
             ext_ids=fatcat_openapi_client.ReleaseExtIds(),
-            extra=extra)
+            extra=extra,
+        )
         return re
 
     def parse_file_metadata(self, sha1_key, cdx, mimetype, file_size):
 
-        sha1 = base64.b16encode(base64.b32decode(sha1_key.replace('sha1:', ''))).decode('ascii').lower()
+        sha1 = (
+            base64.b16encode(base64.b32decode(sha1_key.replace("sha1:", "")))
+            .decode("ascii")
+            .lower()
+        )
 
         fe = fatcat_openapi_client.FileEntity(
             sha1=sha1,
@@ -175,16 +185,15 @@ class GrobidMetadataImporter(EntityImporter):
         )
 
         # parse URLs and CDX
-        original = cdx['url']
-        assert len(cdx['dt']) >= 8
-        wayback = "https://web.archive.org/web/{}/{}".format(
-            cdx['dt'],
-            original)
-        fe.urls.append(
-            fatcat_openapi_client.FileUrl(url=wayback, rel="webarchive"))
+        original = cdx["url"]
+        assert len(cdx["dt"]) >= 8
+        wayback = "https://web.archive.org/web/{}/{}".format(cdx["dt"], original)
+        fe.urls.append(fatcat_openapi_client.FileUrl(url=wayback, rel="webarchive"))
 
         original_url = make_rel_url(original, default_link_rel=self.default_link_rel)
         if original_url is not None:
-            fe.urls.append(fatcat_openapi_client.FileUrl(rel=original_url[0], url=original_url[1]))
+            fe.urls.append(
+                fatcat_openapi_client.FileUrl(rel=original_url[0], url=original_url[1])
+            )
 
         return fe
@@ -193,8 +202,11 @@ class GrobidMetadataImporter(EntityImporter):
         return True
 
     def insert_batch(self, batch):
-        self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch(
-            editgroup=fatcat_openapi_client.Editgroup(
-                description=self.editgroup_description,
-                extra=self.editgroup_extra),
-            entity_list=batch))
+        self.api.create_file_auto_batch(
+            fatcat_openapi_client.FileAutoBatch(
+                editgroup=fatcat_openapi_client.Editgroup(
+                    description=self.editgroup_description, extra=self.editgroup_extra
+                ),
+                entity_list=batch,
+            )
+        )