diff options
author | bnewbold <bnewbold@archive.org> | 2021-11-11 01:12:18 +0000 |
---|---|---|
committer | bnewbold <bnewbold@archive.org> | 2021-11-11 01:12:18 +0000 |
commit | 6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4 (patch) | |
tree | 1b80344125152b46ae727dc8bbff73cc12abfd3e /python/fatcat_tools/importers/grobid_metadata.py | |
parent | 7e3f91f1a49ea85707cae31125021ba761f5373d (diff) | |
parent | 6eaf4f57c1f92b6f4f46adc38e5b39fd30b65d81 (diff) | |
download | fatcat-6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4.tar.gz fatcat-6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4.zip |
Merge branch 'bnewbold-import-refactors' into 'master'
import refactors and deprecations
Some of these are from old stale branches (the datacite subject metadata patch), but most are from yesterday and today. Sort of a hodge-podge, but the general theme is getting around to deferred cleanups and refactors specific to importer code before making some behavioral changes.
The Datacite-specific stuff could use review here.
Remove unused/deprecated/dead code:
- cdl_dash_dat and wayback_static importers, which were for specific early example entities and have been superseded by other importers
- "extid map" sqlite3 feature from several importers, was only used for initial bulk imports (and maybe should not have been used)
Refactors:
- moved a number of large datastructures out of importer code and into a dedicated static file (`biblio_lookup_tables.py`). Didn't move all, just the ones that were either generic or very large (making it hard to read code)
- shuffled around relative imports and some function names ("clean_str" vs. "clean")
Some actual behavioral changes:
- remove some Datacite-specific license slugs
- stop trying to fix double-slashes in DOIs, that was causing more harm than help (some DOIs do actually have double-slashes!)
- remove some excess metadata from datacite 'extra' fields
Diffstat (limited to 'python/fatcat_tools/importers/grobid_metadata.py')
-rw-r--r-- | python/fatcat_tools/importers/grobid_metadata.py | 39 |
1 files changed, 20 insertions, 19 deletions
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py index e36e1b48..3c85132c 100644 --- a/python/fatcat_tools/importers/grobid_metadata.py +++ b/python/fatcat_tools/importers/grobid_metadata.py @@ -7,9 +7,9 @@ from typing import Any, Dict, List, Optional import fatcat_openapi_client from fatcat_openapi_client import ApiClient, FileEntity, ReleaseEntity -from .common import EntityImporter, clean, make_rel_url +from fatcat_tools.normal import clean_doi, clean_str -MAX_ABSTRACT_BYTES = 4096 +from .common import MAX_ABSTRACT_LENGTH, EntityImporter, make_rel_url class GrobidMetadataImporter(EntityImporter): @@ -82,9 +82,9 @@ class GrobidMetadataImporter(EntityImporter): extra_grobid: Dict[str, Any] = dict() abstract = obj.get("abstract") - if abstract and len(abstract) < MAX_ABSTRACT_BYTES and len(abstract) > 10: + if abstract and len(abstract) < MAX_ABSTRACT_LENGTH and len(abstract) > 10: abobj = fatcat_openapi_client.ReleaseAbstract( - mimetype="text/plain", content=clean(obj.get("abstract")) + mimetype="text/plain", content=clean_str(obj.get("abstract")) ) abstracts = [abobj] else: @@ -95,9 +95,9 @@ class GrobidMetadataImporter(EntityImporter): contribs.append( fatcat_openapi_client.ReleaseContrib( index=i, - raw_name=clean(a["name"]), - given_name=clean(a.get("given_name")), - surname=clean(a.get("surname")), + raw_name=clean_str(a["name"]), + given_name=clean_str(a.get("given_name")), + surname=clean_str(a.get("surname")), role="author", extra=None, ) @@ -114,15 +114,15 @@ class GrobidMetadataImporter(EntityImporter): pass for key in ("volume", "url", "issue", "publisher"): if raw.get(key): - cite_extra[key] = clean(raw[key]) + cite_extra[key] = clean_str(raw[key]) if raw.get("authors"): - cite_extra["authors"] = [clean(a["name"]) for a in raw["authors"]] + cite_extra["authors"] = [clean_str(a["name"]) for a in raw["authors"]] refs.append( fatcat_openapi_client.ReleaseRef( - key=clean(raw.get("id")), + key=clean_str(raw.get("id")), year=year, - title=clean(raw["title"]), + title=clean_str(raw["title"]), extra=cite_extra or None, ) ) @@ -133,11 +133,12 @@ class GrobidMetadataImporter(EntityImporter): # only returns year, ever? release_year = int(obj["date"][:4]) - extra = dict() - if obj.get("doi"): - extra["doi"] = obj["doi"] + extra: Dict[str, Any] = dict() + doi = clean_doi(obj.get("doi")) + if doi: + extra["doi"] = doi if obj["journal"] and obj["journal"].get("name"): - extra["container_name"] = clean(obj["journal"]["name"]) + extra["container_name"] = clean_str(obj["journal"]["name"]) # TODO: ISSN/eISSN handling? or just journal name lookup? @@ -146,7 +147,7 @@ class GrobidMetadataImporter(EntityImporter): if self.longtail_oa: extra["longtail_oa"] = True - clean_title = clean(obj["title"], force_xml=True) + clean_title = clean_str(obj["title"], force_xml=True) if not clean_title or len(clean_title) < 2: return None title = clean_title @@ -158,9 +159,9 @@ class GrobidMetadataImporter(EntityImporter): release_year=release_year, contribs=contribs, refs=refs, - publisher=clean(obj["journal"].get("publisher")), - volume=clean(obj["journal"].get("volume")), - issue=clean(obj["journal"].get("issue")), + publisher=clean_str(obj["journal"].get("publisher")), + volume=clean_str(obj["journal"].get("volume")), + issue=clean_str(obj["journal"].get("issue")), abstracts=abstracts or None, ext_ids=fatcat_openapi_client.ReleaseExtIds(), extra=extra or None, |