about summary refs log tree commit diff stats
path: root/python/fatcat_tools/importers/grobid_metadata.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools/importers/grobid_metadata.py')
-rw-r--r-- python/fatcat_tools/importers/grobid_metadata.py 39
1 files changed, 20 insertions, 19 deletions
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index e36e1b48..3c85132c 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -7,9 +7,9 @@ from typing import Any, Dict, List, Optional
import fatcat_openapi_client
from fatcat_openapi_client import ApiClient, FileEntity, ReleaseEntity
-from .common import EntityImporter, clean, make_rel_url
+from fatcat_tools.normal import clean_doi, clean_str
-MAX_ABSTRACT_BYTES = 4096
+from .common import MAX_ABSTRACT_LENGTH, EntityImporter, make_rel_url
class GrobidMetadataImporter(EntityImporter):
@@ -82,9 +82,9 @@ class GrobidMetadataImporter(EntityImporter):
extra_grobid: Dict[str, Any] = dict()
abstract = obj.get("abstract")
- if abstract and len(abstract) < MAX_ABSTRACT_BYTES and len(abstract) > 10:
+ if abstract and len(abstract) < MAX_ABSTRACT_LENGTH and len(abstract) > 10:
abobj = fatcat_openapi_client.ReleaseAbstract(
- mimetype="text/plain", content=clean(obj.get("abstract"))
+ mimetype="text/plain", content=clean_str(obj.get("abstract"))
)
abstracts = [abobj]
else:
@@ -95,9 +95,9 @@ class GrobidMetadataImporter(EntityImporter):
contribs.append(
fatcat_openapi_client.ReleaseContrib(
index=i,
- raw_name=clean(a["name"]),
- given_name=clean(a.get("given_name")),
- surname=clean(a.get("surname")),
+ raw_name=clean_str(a["name"]),
+ given_name=clean_str(a.get("given_name")),
+ surname=clean_str(a.get("surname")),
role="author",
extra=None,
)
@@ -114,15 +114,15 @@ class GrobidMetadataImporter(EntityImporter):
pass
for key in ("volume", "url", "issue", "publisher"):
if raw.get(key):
- cite_extra[key] = clean(raw[key])
+ cite_extra[key] = clean_str(raw[key])
if raw.get("authors"):
- cite_extra["authors"] = [clean(a["name"]) for a in raw["authors"]]
+ cite_extra["authors"] = [clean_str(a["name"]) for a in raw["authors"]]
refs.append(
fatcat_openapi_client.ReleaseRef(
- key=clean(raw.get("id")),
+ key=clean_str(raw.get("id")),
year=year,
- title=clean(raw["title"]),
+ title=clean_str(raw["title"]),
extra=cite_extra or None,
)
)
@@ -133,11 +133,12 @@ class GrobidMetadataImporter(EntityImporter):
# only returns year, ever?
release_year = int(obj["date"][:4])
- extra = dict()
- if obj.get("doi"):
- extra["doi"] = obj["doi"]
+ extra: Dict[str, Any] = dict()
+ doi = clean_doi(obj.get("doi"))
+ if doi:
+ extra["doi"] = doi
if obj["journal"] and obj["journal"].get("name"):
- extra["container_name"] = clean(obj["journal"]["name"])
+ extra["container_name"] = clean_str(obj["journal"]["name"])
# TODO: ISSN/eISSN handling? or just journal name lookup?
@@ -146,7 +147,7 @@ class GrobidMetadataImporter(EntityImporter):
if self.longtail_oa:
extra["longtail_oa"] = True
- clean_title = clean(obj["title"], force_xml=True)
+ clean_title = clean_str(obj["title"], force_xml=True)
if not clean_title or len(clean_title) < 2:
return None
title = clean_title
@@ -158,9 +159,9 @@ class GrobidMetadataImporter(EntityImporter):
release_year=release_year,
contribs=contribs,
refs=refs,
- publisher=clean(obj["journal"].get("publisher")),
- volume=clean(obj["journal"].get("volume")),
- issue=clean(obj["journal"].get("issue")),
+ publisher=clean_str(obj["journal"].get("publisher")),
+ volume=clean_str(obj["journal"].get("volume")),
+ issue=clean_str(obj["journal"].get("issue")),
abstracts=abstracts or None,
ext_ids=fatcat_openapi_client.ReleaseExtIds(),
extra=extra or None,