diff options
author | bnewbold <bnewbold@archive.org> | 2021-11-11 01:12:18 +0000 |
---|---|---|
committer | bnewbold <bnewbold@archive.org> | 2021-11-11 01:12:18 +0000 |
commit | 6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4 (patch) | |
tree | 1b80344125152b46ae727dc8bbff73cc12abfd3e /python/fatcat_tools/importers/jstor.py | |
parent | 7e3f91f1a49ea85707cae31125021ba761f5373d (diff) | |
parent | 6eaf4f57c1f92b6f4f46adc38e5b39fd30b65d81 (diff) | |
download | fatcat-6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4.tar.gz fatcat-6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4.zip |
Merge branch 'bnewbold-import-refactors' into 'master'
import refactors and deprecations
Some of these are from old stale branches (the datacite subject metadata patch), but most are from yesterday and today. Sort of a hodge-podge, but the general theme is getting around to deferred cleanups and refactors specific to importer code before making some behavioral changes.
The Datacite-specific stuff could use review here.
Remove unused/deprecated/dead code:
- cdl_dash_dat and wayback_static importers, which were for specific early example entities and have been superseded by other importers
- "extid map" sqlite3 feature from several importers, was only used for initial bulk imports (and maybe should not have been used)
Refactors:
- moved a number of large datastructures out of importer code and into a dedicated static file (`biblio_lookup_tables.py`). Didn't move all, just the ones that were either generic or very large (making it hard to read code)
- shuffled around relative imports and some function names ("clean_str" vs. "clean")
Some actual behavioral changes:
- remove some Datacite-specific license slugs
- stop trying to fix double-slashes in DOIs, that was causing more harm than help (some DOIs do actually have double-slashes!)
- remove some excess metadata from datacite 'extra' fields
Diffstat (limited to 'python/fatcat_tools/importers/jstor.py')
-rw-r--r-- | python/fatcat_tools/importers/jstor.py | 19 |
1 files changed, 12 insertions, 7 deletions
diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py index 2c8aa0a4..79691c9a 100644 --- a/python/fatcat_tools/importers/jstor.py +++ b/python/fatcat_tools/importers/jstor.py @@ -8,7 +8,10 @@ import fatcat_openapi_client from bs4 import BeautifulSoup from fatcat_openapi_client import ApiClient, ReleaseEntity -from .common import LANG_MAP_MARC, EntityImporter, clean +from fatcat_tools.biblio_lookup_tables import LANG_MAP_MARC +from fatcat_tools.normal import clean_doi, clean_str + +from .common import EntityImporter from .crossref import CONTAINER_TYPE_MAP # TODO: more entries? @@ -138,7 +141,7 @@ class JstorImporter(EntityImporter): issnl=issnl, publisher=publisher, container_type=self.map_container_type(release_type), - name=clean(journal_title, force_xml=True), + name=clean_str(journal_title, force_xml=True), ) ce_edit = self.create_container(ce) container_id = ce_edit.ident @@ -146,7 +149,9 @@ class JstorImporter(EntityImporter): doi = article_meta.find("article-id", {"pub-id-type": "doi"}) if doi: - doi = doi.string.lower().strip() + doi = clean_doi(doi.string.lower()) + else: + doi = None jstor_id = article_meta.find("article-id", {"pub-id-type": "jstor"}) if jstor_id: @@ -162,13 +167,13 @@ class JstorImporter(EntityImporter): for c in cgroup.find_all("contrib"): given = c.find("given-names") if given: - given = clean(given.get_text().replace("\n", " ")) + given = clean_str(given.get_text().replace("\n", " ")) surname = c.find("surname") if surname: - surname = clean(surname.get_text().replace("\n", " ")) + surname = clean_str(surname.get_text().replace("\n", " ")) raw_name = c.find("string-name") if raw_name: - raw_name = clean(raw_name.get_text().replace("\n", " ")) + raw_name = clean_str(raw_name.get_text().replace("\n", " ")) if not raw_name: if given and surname: @@ -230,7 +235,7 @@ class JstorImporter(EntityImporter): # JSTOR issue-id if article_meta.find("issue-id"): - issue_id = clean(article_meta.find("issue-id").string) + issue_id = clean_str(article_meta.find("issue-id").string) if issue_id: extra_jstor["issue_id"] = issue_id |