diff options
author | bnewbold <bnewbold@archive.org> | 2021-11-11 01:12:18 +0000 |
---|---|---|
committer | bnewbold <bnewbold@archive.org> | 2021-11-11 01:12:18 +0000 |
commit | 6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4 (patch) | |
tree | 1b80344125152b46ae727dc8bbff73cc12abfd3e /python/fatcat_tools/importers/jalc.py | |
parent | 7e3f91f1a49ea85707cae31125021ba761f5373d (diff) | |
parent | 6eaf4f57c1f92b6f4f46adc38e5b39fd30b65d81 (diff) | |
download | fatcat-6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4.tar.gz fatcat-6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4.zip |
Merge branch 'bnewbold-import-refactors' into 'master'
import refactors and deprecations
Some of these are from old stale branches (the datacite subject metadata patch), but most are from yesterday and today. Sort of a hodge-podge, but the general theme is getting around to deferred cleanups and refactors specific to importer code before making some behavioral changes.
The Datacite-specific stuff could use review here.
Remove unused/deprecated/dead code:
- cdl_dash_dat and wayback_static importers, which were for specific early example entities and have been superseded by other importers
- "extid map" sqlite3 feature from several importers, was only used for initial bulk imports (and maybe should not have been used)
Refactors:
- moved a number of large datastructures out of importer code and into a dedicated static file (`biblio_lookup_tables.py`). Didn't move all, just the ones that were either generic or very large (making it hard to read code)
- shuffled around relative imports and some function names ("clean_str" vs. "clean")
Some actual behavioral changes:
- remove some Datacite-specific license slugs
- stop trying to fix double-slashes in DOIs, that was causing more harm than help (some DOIs do actually have double-slashes!)
- remove some excess metadata from datacite 'extra' fields
Diffstat (limited to 'python/fatcat_tools/importers/jalc.py')
-rw-r--r-- | python/fatcat_tools/importers/jalc.py | 74 |
1 files changed, 11 insertions, 63 deletions
diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py index 2f10e533..9916a55f 100644 --- a/python/fatcat_tools/importers/jalc.py +++ b/python/fatcat_tools/importers/jalc.py @@ -1,5 +1,4 @@ import datetime -import sqlite3 import sys from typing import Any, Dict, List, Optional, Sequence @@ -7,9 +6,9 @@ import fatcat_openapi_client from bs4 import BeautifulSoup from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity -from fatcat_tools.normal import clean_doi +from fatcat_tools.normal import clean_doi, clean_str, is_cjk -from .common import DATE_FMT, EntityImporter, clean, is_cjk +from .common import DATE_FMT, EntityImporter # TODO: should be List[Tag] not List[Any] for full type annotations @@ -37,13 +36,13 @@ def parse_jalc_persons(raw_persons: List[Any]) -> List[ReleaseContrib]: for raw in raw_persons: name = raw.find("name") or None if name: - name = clean(name.get_text().replace("\n", " ")) + name = clean_str(name.get_text().replace("\n", " ")) surname = raw.find("familyName") or None if surname: - surname = clean(surname.get_text().replace("\n", " ")) + surname = clean_str(surname.get_text().replace("\n", " ")) given_name = raw.find("givenName") or None if given_name: - given_name = clean(given_name.get_text().replace("\n", " ")) + given_name = clean_str(given_name.get_text().replace("\n", " ")) lang = "en" if is_cjk(name): lang = "ja" @@ -117,50 +116,8 @@ class JalcImporter(EntityImporter): ) self.create_containers = kwargs.get("create_containers", True) - extid_map_file = kwargs.get("extid_map_file") - self.extid_map_db = None - if extid_map_file: - db_uri = "file:{}?mode=ro".format(extid_map_file) - print("Using external ID map: {}".format(db_uri)) - self.extid_map_db = sqlite3.connect(db_uri, uri=True) - else: - print("Not using external ID map") - self.read_issn_map_file(issn_map_file) - def lookup_ext_ids(self, doi: str) -> Dict[str, Any]: - if self.extid_map_db is None: - return dict( - core_id=None, - pmid=None, - pmcid=None, - wikidata_qid=None, - arxiv_id=None, - jstor_id=None, - ) - row = self.extid_map_db.execute( - "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()] - ).fetchone() - if row is None: - return dict( - core_id=None, - pmid=None, - pmcid=None, - wikidata_qid=None, - arxiv_id=None, - jstor_id=None, - ) - row = [str(cell or "") or None for cell in row] - return dict( - core_id=row[0], - pmid=row[1], - pmcid=row[2], - wikidata_qid=row[3], - # TODO: - arxiv_id=None, - jstor_id=None, - ) - def want(self, raw_record: Any) -> bool: return True @@ -273,16 +230,16 @@ class JalcImporter(EntityImporter): for p in record.find_all("publicationName") if p.get_text() ] - pubs = [clean(p) for p in pubs if p] + pubs = [clean_str(p) for p in pubs if p] assert pubs if len(pubs) > 1 and pubs[0] == pubs[1]: pubs = [pubs[0]] if len(pubs) > 1 and is_cjk(pubs[0]): # eng/jpn ordering is not reliable pubs = [pubs[1], pubs[0]] - container_name = clean(pubs[0]) + container_name = clean_str(pubs[0]) if len(pubs) > 1: - container_extra["original_name"] = clean(pubs[1]) + container_extra["original_name"] = clean_str(pubs[1]) if record.publisher: pubs = [ @@ -297,7 +254,7 @@ class JalcImporter(EntityImporter): # ordering is not reliable pubs = [pubs[1], pubs[0]] if pubs: - publisher = clean(pubs[0]) + publisher = clean_str(pubs[0]) if len(pubs) > 1: container_extra["publisher_aliases"] = pubs[1:] @@ -330,9 +287,6 @@ class JalcImporter(EntityImporter): # reasonable default for this collection release_type = "article-journal" - # external identifiers - extids = self.lookup_ext_ids(doi=doi) - # extra: # translation_of # aliases @@ -342,26 +296,20 @@ class JalcImporter(EntityImporter): # (informally) extra["jalc"] = extra_jalc - title = clean(title) + title = clean_str(title) if not title: return None re = ReleaseEntity( work_id=None, title=title, - original_title=clean(original_title), + original_title=clean_str(original_title), release_type=release_type, release_stage="published", release_date=release_date, release_year=release_year, ext_ids=fatcat_openapi_client.ReleaseExtIds( doi=doi, - pmid=extids["pmid"], - pmcid=extids["pmcid"], - wikidata_qid=extids["wikidata_qid"], - core=extids["core_id"], - arxiv=extids["arxiv_id"], - jstor=extids["jstor_id"], ), volume=volume, issue=issue, |