author     bnewbold <bnewbold@archive.org>  2021-11-11 01:12:18 +0000
committer  bnewbold <bnewbold@archive.org>  2021-11-11 01:12:18 +0000
commit     6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4 (patch)
tree       1b80344125152b46ae727dc8bbff73cc12abfd3e /python/fatcat_tools/importers/jalc.py
parent     7e3f91f1a49ea85707cae31125021ba761f5373d (diff)
parent     6eaf4f57c1f92b6f4f46adc38e5b39fd30b65d81 (diff)
Merge branch 'bnewbold-import-refactors' into 'master'
import refactors and deprecations

Some of these are from old stale branches (the datacite subject metadata patch), but most are from yesterday and today. Sort of a hodge-podge, but the general theme is getting around to deferred cleanups and refactors specific to importer code before making some behavioral changes. The Datacite-specific stuff could use review here.

Remove unused/deprecated/dead code:

- cdl_dash_dat and wayback_static importers, which were for specific early example entities and have been superseded by other importers
- "extid map" sqlite3 feature from several importers; it was only used for initial bulk imports (and maybe should not have been used)

Refactors:

- moved a number of large datastructures out of importer code and into a dedicated static file (`biblio_lookup_tables.py`). Didn't move all of them, just the ones that were either generic or very large (making the code hard to read)
- shuffled around relative imports and some function names ("clean_str" vs. "clean"); see the sketch after this message

Some actual behavioral changes:

- remove some Datacite-specific license slugs
- stop trying to fix double-slashes in DOIs; that was causing more harm than help (some DOIs actually do have double-slashes!)
- remove some excess metadata from datacite 'extra' fields
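For anyone tracing the "clean" → "clean_str" rename through the diff below, here is a minimal sketch of the post-refactor call pattern in this importer. It assumes `fatcat_tools` is importable and that `clean_str` behaves as its usage in the diff suggests (normalizes whitespace and returns None when nothing remains); the sample input is hypothetical, not taken from JALC data.

    from fatcat_tools.normal import clean_str, is_cjk

    # Hypothetical author name fragment, for illustration only.
    raw_name = "山田\u3000太郎\n"

    # Same pattern parse_jalc_persons() uses after the rename:
    # normalize the string, then guess the language from its script.
    name = clean_str(raw_name.replace("\n", " "))  # assumed: strips whitespace, returns None if empty
    lang = "en"
    if name and is_cjk(name):
        lang = "ja"
    print(name, lang)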
Diffstat (limited to 'python/fatcat_tools/importers/jalc.py')
-rw-r--r--  python/fatcat_tools/importers/jalc.py  74
1 file changed, 11 insertions(+), 63 deletions(-)
diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py
index 2f10e533..9916a55f 100644
--- a/python/fatcat_tools/importers/jalc.py
+++ b/python/fatcat_tools/importers/jalc.py
@@ -1,5 +1,4 @@
import datetime
-import sqlite3
import sys
from typing import Any, Dict, List, Optional, Sequence
@@ -7,9 +6,9 @@ import fatcat_openapi_client
from bs4 import BeautifulSoup
from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity
-from fatcat_tools.normal import clean_doi
+from fatcat_tools.normal import clean_doi, clean_str, is_cjk
-from .common import DATE_FMT, EntityImporter, clean, is_cjk
+from .common import DATE_FMT, EntityImporter
# TODO: should be List[Tag] not List[Any] for full type annotations
@@ -37,13 +36,13 @@ def parse_jalc_persons(raw_persons: List[Any]) -> List[ReleaseContrib]:
for raw in raw_persons:
name = raw.find("name") or None
if name:
- name = clean(name.get_text().replace("\n", " "))
+ name = clean_str(name.get_text().replace("\n", " "))
surname = raw.find("familyName") or None
if surname:
- surname = clean(surname.get_text().replace("\n", " "))
+ surname = clean_str(surname.get_text().replace("\n", " "))
given_name = raw.find("givenName") or None
if given_name:
- given_name = clean(given_name.get_text().replace("\n", " "))
+ given_name = clean_str(given_name.get_text().replace("\n", " "))
lang = "en"
if is_cjk(name):
lang = "ja"
@@ -117,50 +116,8 @@ class JalcImporter(EntityImporter):
)
self.create_containers = kwargs.get("create_containers", True)
- extid_map_file = kwargs.get("extid_map_file")
- self.extid_map_db = None
- if extid_map_file:
- db_uri = "file:{}?mode=ro".format(extid_map_file)
- print("Using external ID map: {}".format(db_uri))
- self.extid_map_db = sqlite3.connect(db_uri, uri=True)
- else:
- print("Not using external ID map")
-
self.read_issn_map_file(issn_map_file)
- def lookup_ext_ids(self, doi: str) -> Dict[str, Any]:
- if self.extid_map_db is None:
- return dict(
- core_id=None,
- pmid=None,
- pmcid=None,
- wikidata_qid=None,
- arxiv_id=None,
- jstor_id=None,
- )
- row = self.extid_map_db.execute(
- "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()]
- ).fetchone()
- if row is None:
- return dict(
- core_id=None,
- pmid=None,
- pmcid=None,
- wikidata_qid=None,
- arxiv_id=None,
- jstor_id=None,
- )
- row = [str(cell or "") or None for cell in row]
- return dict(
- core_id=row[0],
- pmid=row[1],
- pmcid=row[2],
- wikidata_qid=row[3],
- # TODO:
- arxiv_id=None,
- jstor_id=None,
- )
-
def want(self, raw_record: Any) -> bool:
return True
@@ -273,16 +230,16 @@ class JalcImporter(EntityImporter):
for p in record.find_all("publicationName")
if p.get_text()
]
- pubs = [clean(p) for p in pubs if p]
+ pubs = [clean_str(p) for p in pubs if p]
assert pubs
if len(pubs) > 1 and pubs[0] == pubs[1]:
pubs = [pubs[0]]
if len(pubs) > 1 and is_cjk(pubs[0]):
# eng/jpn ordering is not reliable
pubs = [pubs[1], pubs[0]]
- container_name = clean(pubs[0])
+ container_name = clean_str(pubs[0])
if len(pubs) > 1:
- container_extra["original_name"] = clean(pubs[1])
+ container_extra["original_name"] = clean_str(pubs[1])
if record.publisher:
pubs = [
@@ -297,7 +254,7 @@ class JalcImporter(EntityImporter):
# ordering is not reliable
pubs = [pubs[1], pubs[0]]
if pubs:
- publisher = clean(pubs[0])
+ publisher = clean_str(pubs[0])
if len(pubs) > 1:
container_extra["publisher_aliases"] = pubs[1:]
@@ -330,9 +287,6 @@ class JalcImporter(EntityImporter):
# reasonable default for this collection
release_type = "article-journal"
- # external identifiers
- extids = self.lookup_ext_ids(doi=doi)
-
# extra:
# translation_of
# aliases
@@ -342,26 +296,20 @@ class JalcImporter(EntityImporter):
# (informally)
extra["jalc"] = extra_jalc
- title = clean(title)
+ title = clean_str(title)
if not title:
return None
re = ReleaseEntity(
work_id=None,
title=title,
- original_title=clean(original_title),
+ original_title=clean_str(original_title),
release_type=release_type,
release_stage="published",
release_date=release_date,
release_year=release_year,
ext_ids=fatcat_openapi_client.ReleaseExtIds(
doi=doi,
- pmid=extids["pmid"],
- pmcid=extids["pmcid"],
- wikidata_qid=extids["wikidata_qid"],
- core=extids["core_id"],
- arxiv=extids["arxiv_id"],
- jstor=extids["jstor_id"],
),
volume=volume,
issue=issue,