author     Bryan Newbold <bnewbold@robocracy.org>  2021-11-10 13:23:12 -0800
committer  Bryan Newbold <bnewbold@robocracy.org>  2021-11-10 13:28:28 -0800
commit     16e9979a6f347b49764c1141209e84083ea81057 (patch)
tree       ccc3d35607cadac4933e9b28366bedf5a605c122 /python/fatcat_tools/importers/grobid_metadata.py
parent     ab4e1355bf93e3755985f1b5cd2589a78601d253 (diff)
download   fatcat-16e9979a6f347b49764c1141209e84083ea81057.tar.gz, fatcat-16e9979a6f347b49764c1141209e84083ea81057.zip
importers: refactor imports of clean() and other normalization helpers
Diffstat (limited to 'python/fatcat_tools/importers/grobid_metadata.py')
-rw-r--r--  python/fatcat_tools/importers/grobid_metadata.py | 30
1 file changed, 15 insertions(+), 15 deletions(-)
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index 7c595787..9db499a0 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -7,9 +7,9 @@ from typing import Any, Dict, List, Optional
 import fatcat_openapi_client
 from fatcat_openapi_client import ApiClient, FileEntity, ReleaseEntity
 
-from fatcat_tools.normal import clean_doi
+from fatcat_tools.normal import clean_doi, clean_str
 
-from .common import EntityImporter, clean, make_rel_url
+from .common import EntityImporter, make_rel_url
 
 MAX_ABSTRACT_BYTES = 4096
 
@@ -86,7 +86,7 @@ class GrobidMetadataImporter(EntityImporter):
         abstract = obj.get("abstract")
         if abstract and len(abstract) < MAX_ABSTRACT_BYTES and len(abstract) > 10:
             abobj = fatcat_openapi_client.ReleaseAbstract(
-                mimetype="text/plain", content=clean(obj.get("abstract"))
+                mimetype="text/plain", content=clean_str(obj.get("abstract"))
             )
             abstracts = [abobj]
         else:
@@ -97,9 +97,9 @@ class GrobidMetadataImporter(EntityImporter):
             contribs.append(
                 fatcat_openapi_client.ReleaseContrib(
                     index=i,
-                    raw_name=clean(a["name"]),
-                    given_name=clean(a.get("given_name")),
-                    surname=clean(a.get("surname")),
+                    raw_name=clean_str(a["name"]),
+                    given_name=clean_str(a.get("given_name")),
+                    surname=clean_str(a.get("surname")),
                     role="author",
                     extra=None,
                 )
@@ -116,15 +116,15 @@ class GrobidMetadataImporter(EntityImporter):
                 pass
             for key in ("volume", "url", "issue", "publisher"):
                 if raw.get(key):
-                    cite_extra[key] = clean(raw[key])
+                    cite_extra[key] = clean_str(raw[key])
             if raw.get("authors"):
-                cite_extra["authors"] = [clean(a["name"]) for a in raw["authors"]]
+                cite_extra["authors"] = [clean_str(a["name"]) for a in raw["authors"]]
 
             refs.append(
                 fatcat_openapi_client.ReleaseRef(
-                    key=clean(raw.get("id")),
+                    key=clean_str(raw.get("id")),
                     year=year,
-                    title=clean(raw["title"]),
+                    title=clean_str(raw["title"]),
                     extra=cite_extra or None,
                 )
             )
@@ -140,7 +140,7 @@ class GrobidMetadataImporter(EntityImporter):
         if doi:
             extra["doi"] = doi
         if obj["journal"] and obj["journal"].get("name"):
-            extra["container_name"] = clean(obj["journal"]["name"])
+            extra["container_name"] = clean_str(obj["journal"]["name"])
 
         # TODO: ISSN/eISSN handling? or just journal name lookup?
 
@@ -149,7 +149,7 @@ class GrobidMetadataImporter(EntityImporter):
         if self.longtail_oa:
             extra["longtail_oa"] = True
 
-        clean_title = clean(obj["title"], force_xml=True)
+        clean_title = clean_str(obj["title"], force_xml=True)
         if not clean_title or len(clean_title) < 2:
             return None
         title = clean_title
@@ -161,9 +161,9 @@ class GrobidMetadataImporter(EntityImporter):
             release_year=release_year,
             contribs=contribs,
             refs=refs,
-            publisher=clean(obj["journal"].get("publisher")),
-            volume=clean(obj["journal"].get("volume")),
-            issue=clean(obj["journal"].get("issue")),
+            publisher=clean_str(obj["journal"].get("publisher")),
+            volume=clean_str(obj["journal"].get("volume")),
+            issue=clean_str(obj["journal"].get("issue")),
             abstracts=abstracts or None,
             ext_ids=fatcat_openapi_client.ReleaseExtIds(),
             extra=extra or None,
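For readers skimming the patch, here is a minimal sketch (not part of the commit) of the call-site pattern after this refactor: the helper is now imported as clean_str() from fatcat_tools.normal rather than via the clean() name re-exported from .common. Only the import path and the force_xml keyword are taken from the diff above; the return behavior described in the comments is an assumption, not verified against fatcat_tools.normal.

# Sketch only: assumes clean_str() behaves like the old clean() helper,
# normalizing a raw metadata string and returning None when nothing
# usable remains.
from fatcat_tools.normal import clean_str

raw_title = "  An Example GROBID-Extracted Title  "
clean_title = clean_str(raw_title, force_xml=True)
if not clean_title or len(clean_title) < 2:
    clean_title = None  # the importer itself returns None here and skips the record

The remaining hunks are the same one-for-one substitution of clean() with clean_str() for contributor names, reference fields, and journal/container metadata.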