author | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-09 18:10:35 -0800
committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-09 18:49:46 -0800
commit | ba7f9214d2038882952eb50cd4dc5eff4eb0e6ff (patch)
tree | 2f3ff3ba4b70f0f7d4603a224bf68cbe3892376b
parent | a6d994fbc18debcf3860e6deb12eb54234a42839 (diff)
download | fatcat-ba7f9214d2038882952eb50cd4dc5eff4eb0e6ff.tar.gz, fatcat-ba7f9214d2038882952eb50cd4dc5eff4eb0e6ff.zip
remove deprecated extid sqlite3 lookup table feature from importers
This was used during initial bulk imports, but is no longer needed and
could create serious metadata problems if invoked accidentally.
In retrospect, it also made metadata provenance less transparent, and
may have done more harm than good overall.
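For context, each affected importer implemented the same pattern: open a local sqlite3 database read-only via a URI, then map a DOI to other identifiers with a single-row lookup against an `ids` table (columns `core`, `pmid`, `pmcid`, `wikidata`). Below is a condensed sketch of the removed behavior, assembled from the deleted code in the diff; the standalone class is illustrative, since the real logic lived inline in each importer:

```python
import sqlite3
from typing import Any, Dict, Optional

# Illustrative standalone class; in the real code this logic was duplicated
# inside CrossrefImporter, DataciteImporter, and JalcImporter.
class ExtidMapLookup:
    def __init__(self, extid_map_file: Optional[str] = None):
        self.extid_map_db: Optional[sqlite3.Connection] = None
        if extid_map_file:
            # open read-only via sqlite3 URI mode, as the importers did
            db_uri = "file:{}?mode=ro".format(extid_map_file)
            self.extid_map_db = sqlite3.connect(db_uri, uri=True)

    def lookup_ext_ids(self, doi: str) -> Dict[str, Any]:
        empty = dict(
            core_id=None, pmid=None, pmcid=None,
            wikidata_qid=None, arxiv_id=None, jstor_id=None,
        )
        if self.extid_map_db is None:
            return empty
        row = self.extid_map_db.execute(
            "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1",
            [doi.lower()],
        ).fetchone()
        if row is None:
            return empty
        # coerce empty/zero cells to None, preserving the original quirk
        cells = [str(cell or "") or None for cell in row]
        return dict(
            core_id=cells[0], pmid=cells[1], pmcid=cells[2],
            wikidata_qid=cells[3],
            # arxiv/jstor columns were never wired up (left as TODO upstream)
            arxiv_id=None, jstor_id=None,
        )
```

Because identifiers pulled from this sqlite map were attached to releases with no provenance marker, they were indistinguishable from identifiers present in the upstream metadata, which is the transparency problem the commit message describes.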
-rw-r--r-- | python/README_import.md | 3
-rwxr-xr-x | python/fatcat_import.py | 22
-rw-r--r-- | python/fatcat_tools/importers/crossref.py | 54
-rw-r--r-- | python/fatcat_tools/importers/datacite.py | 54
-rw-r--r-- | python/fatcat_tools/importers/jalc.py | 52
-rw-r--r-- | python/tests/import_crossref.py | 8
-rw-r--r-- | python/tests/import_datacite.py | 2
-rw-r--r-- | python/tests/import_jalc.py | 8
-rw-r--r-- | python/tests/import_jstor.py | 8
-rw-r--r-- | python/tests/import_pubmed.py | 2
10 files changed, 10 insertions, 203 deletions
diff --git a/python/README_import.md b/python/README_import.md
index 6853a4d7..74e75e14 100644
--- a/python/README_import.md
+++ b/python/README_import.md
@@ -52,6 +52,7 @@ Usually tens of minutes on fast production machine.
 
 Usually 24 hours or so on fast production machine.
 
+    # NOTE: `--extid-map-file` was used during initial import, but is now deprecated
     time xzcat /srv/fatcat/datasets/crossref-works.2018-09-05.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py crossref - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3
 
 ## JALC
@@ -59,6 +60,7 @@ Usually 24 hours or so on fast production machine.
 
 First import a random subset single threaded to create (most) containers. On a fast machine, this takes a couple minutes.
 
+    # NOTE: `--extid-map-file` was used during initial import, but is now deprecated
     time ./fatcat_import.py jalc /srv/fatcat/datasets/JALC-LOD-20180907.sample10k.rdf /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3
 
 Then, in parallel:
@@ -116,6 +118,7 @@ Prep JSON files from sqlite (for parallel import):
 
 Run import in parallel:
 
+    # NOTE: `--extid-map-file` was used during initial import, but is now deprecated
     export FATCAT_AUTH_WORKER_CRAWL=...
     zcat /srv/fatcat/datasets/s2_doi.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py arabesque --json-file - --extid-type doi --crawl-id DIRECT-OA-CRAWL-2019 --no-require-grobid
 
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 445acde8..39ef200a 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -54,7 +54,6 @@ def run_crossref(args: argparse.Namespace) -> None:
     fci = CrossrefImporter(
         args.api,
         args.issn_map_file,
-        extid_map_file=args.extid_map_file,
         edit_batch_size=args.batch_size,
         bezerk_mode=args.bezerk_mode,
     )
@@ -72,7 +71,7 @@ def run_crossref(args: argparse.Namespace) -> None:
 
 
 def run_jalc(args: argparse.Namespace) -> None:
-    ji = JalcImporter(args.api, args.issn_map_file, extid_map_file=args.extid_map_file)
+    ji = JalcImporter(args.api, args.issn_map_file)
     Bs4XmlLinesPusher(ji, args.xml_file, "<rdf:Description").run()
 
 
@@ -370,7 +369,6 @@ def run_datacite(args: argparse.Namespace) -> None:
         edit_batch_size=args.batch_size,
         bezerk_mode=args.bezerk_mode,
         debug=args.debug,
-        extid_map_file=args.extid_map_file,
         insert_log_file=args.insert_log_file,
     )
     if args.kafka_mode:
@@ -495,12 +493,6 @@ def main() -> None:
         type=argparse.FileType("r"),
     )
     sub_crossref.add_argument(
-        "--extid-map-file",
-        help="DOI-to-other-identifiers sqlite3 database",
-        default=None,
-        type=str,
-    )
-    sub_crossref.add_argument(
         "--no-lookup-refs", action="store_true", help="skip lookup of references (PMID or DOI)"
     )
     sub_crossref.add_argument(
@@ -529,12 +521,6 @@ def main() -> None:
         default=None,
         type=argparse.FileType("r"),
     )
-    sub_jalc.add_argument(
-        "--extid-map-file",
-        help="DOI-to-other-identifiers sqlite3 database",
-        default=None,
-        type=str,
-    )
 
     sub_arxiv = subparsers.add_parser("arxiv", help="import arxiv.org metadata from XML files")
     sub_arxiv.set_defaults(
@@ -964,12 +950,6 @@ def main() -> None:
         type=argparse.FileType("r"),
     )
     sub_datacite.add_argument(
-        "--extid-map-file",
-        help="DOI-to-other-identifiers sqlite3 database",
-        default=None,
-        type=str,
-    )
-    sub_datacite.add_argument(
         "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)"
     )
     sub_datacite.add_argument(
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index a41e2bf5..9c69fee3 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -1,5 +1,4 @@
 import datetime
-import sqlite3
 from typing import Any, Dict, List, Optional, Sequence
 
 import fatcat_openapi_client
@@ -128,8 +127,6 @@ class CrossrefImporter(EntityImporter):
     """
     Importer for Crossref metadata.
 
-    Can use a local sqlite3 file for faster "external identifier" lookups
-
     See https://github.com/CrossRef/rest-api-doc for JSON schema notes
     """
 
@@ -150,50 +147,8 @@ class CrossrefImporter(EntityImporter):
         )
         self.create_containers: bool = kwargs.get("create_containers", True)
 
-        extid_map_file = kwargs.get("extid_map_file")
-        self.extid_map_db: Optional[Any] = None
-        if extid_map_file:
-            db_uri = "file:{}?mode=ro".format(extid_map_file)
-            print("Using external ID map: {}".format(db_uri))
-            self.extid_map_db = sqlite3.connect(db_uri, uri=True)
-        else:
-            print("Not using external ID map")
-
         self.read_issn_map_file(issn_map_file)
 
-    def lookup_ext_ids(self, doi: str) -> Optional[Any]:
-        if self.extid_map_db is None:
-            return dict(
-                core_id=None,
-                pmid=None,
-                pmcid=None,
-                wikidata_qid=None,
-                arxiv_id=None,
-                jstor_id=None,
-            )
-        row = self.extid_map_db.execute(
-            "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()]
-        ).fetchone()
-        if row is None:
-            return dict(
-                core_id=None,
-                pmid=None,
-                pmcid=None,
-                wikidata_qid=None,
-                arxiv_id=None,
-                jstor_id=None,
-            )
-        row = [str(cell or "") or None for cell in row]
-        return dict(
-            core_id=row[0],
-            pmid=row[1],
-            pmcid=row[2],
-            wikidata_qid=row[3],
-            # TODO:
-            arxiv_id=None,
-            jstor_id=None,
-        )
-
     def map_release_type(self, crossref_type: str) -> Optional[str]:
         return CROSSREF_TYPE_MAP.get(crossref_type)
 
@@ -473,9 +428,6 @@ class CrossrefImporter(EntityImporter):
             # unknown
             release_stage = None
 
-        # external identifiers
-        extids: Dict[str, Any] = self.lookup_ext_ids(doi=obj["DOI"].lower()) or {}
-
         # filter out unreasonably huge releases
         if len(abstracts) > 100:
             self.counts["skip-huge-abstracts"] += 1
@@ -538,13 +490,7 @@ class CrossrefImporter(EntityImporter):
             publisher=publisher,
             ext_ids=fatcat_openapi_client.ReleaseExtIds(
                 doi=obj["DOI"].lower(),
-                pmid=extids["pmid"],
-                pmcid=extids["pmcid"],
-                wikidata_qid=extids["wikidata_qid"],
                 isbn13=isbn13,
-                core=extids["core_id"],
-                arxiv=extids["arxiv_id"],
-                jstor=extids["jstor_id"],
             ),
             volume=clean(obj.get("volume")),
             issue=clean(obj.get("issue")),
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index d5622960..d4d7a9f5 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -12,7 +12,6 @@
 import collections
 import datetime
 import json
 import re
-import sqlite3
 import sys
 from typing import Any, Dict, List, Optional, Sequence, Set, Tuple
@@ -248,15 +247,6 @@ class DataciteImporter(EntityImporter):
         )
         self.create_containers = kwargs.get("create_containers", True)
 
-        extid_map_file = kwargs.get("extid_map_file")
-        self.extid_map_db = None
-        if extid_map_file:
-            db_uri = "file:{}?mode=ro".format(extid_map_file)
-            print("Using external ID map: {}".format(db_uri), file=sys.stderr)
-            self.extid_map_db = sqlite3.connect(db_uri, uri=True)
-        else:
-            print("Not using external ID map", file=sys.stderr)
-
         self.read_issn_map_file(issn_map_file)
         self.debug = debug
         self.insert_log_file = insert_log_file
@@ -264,42 +254,6 @@ class DataciteImporter(EntityImporter):
 
         print("datacite with debug={}".format(self.debug), file=sys.stderr)
 
-    def lookup_ext_ids(self, doi: str) -> Dict[str, Any]:
-        """
-        Return dictionary of identifiers referring to the same things as the given DOI.
-        """
-        if self.extid_map_db is None:
-            return dict(
-                core_id=None,
-                pmid=None,
-                pmcid=None,
-                wikidata_qid=None,
-                arxiv_id=None,
-                jstor_id=None,
-            )
-        row = self.extid_map_db.execute(
-            "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()]
-        ).fetchone()
-        if row is None:
-            return dict(
-                core_id=None,
-                pmid=None,
-                pmcid=None,
-                wikidata_qid=None,
-                arxiv_id=None,
-                jstor_id=None,
-            )
-        row = [str(cell or "") or None for cell in row]
-        return dict(
-            core_id=row[0],
-            pmid=row[1],
-            pmcid=row[2],
-            wikidata_qid=row[3],
-            # TODO:
-            arxiv_id=None,
-            jstor_id=None,
-        )
-
     def parse_record(self, obj: Dict[str, Any]) -> Optional[ReleaseEntity]:
         """
         Mapping datacite JSON to ReleaseEntity.
@@ -706,8 +660,6 @@ class DataciteImporter(EntityImporter):
         if release_month:
             extra["release_month"] = release_month
 
-        extids = self.lookup_ext_ids(doi=doi)
-
         # Assemble release.
         re = fatcat_openapi_client.ReleaseEntity(
             work_id=None,
@@ -722,12 +674,6 @@ class DataciteImporter(EntityImporter):
             publisher=publisher,
             ext_ids=fatcat_openapi_client.ReleaseExtIds(
                 doi=doi,
-                pmid=extids["pmid"],
-                pmcid=extids["pmcid"],
-                wikidata_qid=extids["wikidata_qid"],
-                core=extids["core_id"],
-                arxiv=extids["arxiv_id"],
-                jstor=extids["jstor_id"],
             ),
             contribs=contribs,
             volume=volume,
diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py
index 2f10e533..a737ac9f 100644
--- a/python/fatcat_tools/importers/jalc.py
+++ b/python/fatcat_tools/importers/jalc.py
@@ -1,5 +1,4 @@
 import datetime
-import sqlite3
 import sys
 from typing import Any, Dict, List, Optional, Sequence
 
@@ -117,50 +116,8 @@ class JalcImporter(EntityImporter):
         )
         self.create_containers = kwargs.get("create_containers", True)
 
-        extid_map_file = kwargs.get("extid_map_file")
-        self.extid_map_db = None
-        if extid_map_file:
-            db_uri = "file:{}?mode=ro".format(extid_map_file)
-            print("Using external ID map: {}".format(db_uri))
-            self.extid_map_db = sqlite3.connect(db_uri, uri=True)
-        else:
-            print("Not using external ID map")
-
         self.read_issn_map_file(issn_map_file)
 
-    def lookup_ext_ids(self, doi: str) -> Dict[str, Any]:
-        if self.extid_map_db is None:
-            return dict(
-                core_id=None,
-                pmid=None,
-                pmcid=None,
-                wikidata_qid=None,
-                arxiv_id=None,
-                jstor_id=None,
-            )
-        row = self.extid_map_db.execute(
-            "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()]
-        ).fetchone()
-        if row is None:
-            return dict(
-                core_id=None,
-                pmid=None,
-                pmcid=None,
-                wikidata_qid=None,
-                arxiv_id=None,
-                jstor_id=None,
-            )
-        row = [str(cell or "") or None for cell in row]
-        return dict(
-            core_id=row[0],
-            pmid=row[1],
-            pmcid=row[2],
-            wikidata_qid=row[3],
-            # TODO:
-            arxiv_id=None,
-            jstor_id=None,
-        )
-
     def want(self, raw_record: Any) -> bool:
         return True
 
@@ -330,9 +287,6 @@ class JalcImporter(EntityImporter):
         # reasonable default for this collection
         release_type = "article-journal"
 
-        # external identifiers
-        extids = self.lookup_ext_ids(doi=doi)
-
         # extra:
         #   translation_of
         #   aliases
@@ -356,12 +310,6 @@ class JalcImporter(EntityImporter):
             release_year=release_year,
             ext_ids=fatcat_openapi_client.ReleaseExtIds(
                 doi=doi,
-                pmid=extids["pmid"],
-                pmcid=extids["pmcid"],
-                wikidata_qid=extids["wikidata_qid"],
-                core=extids["core_id"],
-                arxiv=extids["arxiv_id"],
-                jstor=extids["jstor_id"],
             ),
             volume=volume,
             issue=issue,
diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py
index eb931eb1..5f38e73e 100644
--- a/python/tests/import_crossref.py
+++ b/python/tests/import_crossref.py
@@ -10,17 +10,13 @@ from fatcat_tools.importers import CrossrefImporter, JsonLinePusher
 @pytest.fixture(scope="function")
 def crossref_importer(api):
     with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file:
-        yield CrossrefImporter(
-            api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=True
-        )
+        yield CrossrefImporter(api, issn_file, bezerk_mode=True)
 
 
 @pytest.fixture(scope="function")
 def crossref_importer_existing(api):
     with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file:
-        yield CrossrefImporter(
-            api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=False
-        )
+        yield CrossrefImporter(api, issn_file, bezerk_mode=False)
 
 
 @pytest.mark.skip(
diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
index 220dc0f6..b15d14c3 100644
--- a/python/tests/import_datacite.py
+++ b/python/tests/import_datacite.py
@@ -30,7 +30,6 @@ def datacite_importer(api):
         yield DataciteImporter(
             api,
             issn_file,
-            extid_map_file="tests/files/example_map.sqlite3",
             bezerk_mode=True,
         )
 
@@ -41,7 +40,6 @@ def datacite_importer_existing(api):
         yield DataciteImporter(
             api,
             issn_file,
-            extid_map_file="tests/files/example_map.sqlite3",
             bezerk_mode=False,
         )
 
diff --git a/python/tests/import_jalc.py b/python/tests/import_jalc.py
index 4ebc87b4..8281b9a1 100644
--- a/python/tests/import_jalc.py
+++ b/python/tests/import_jalc.py
@@ -8,17 +8,13 @@ from fatcat_tools.importers import Bs4XmlFilePusher, Bs4XmlLinesPusher, JalcImporter
 @pytest.fixture(scope="function")
 def jalc_importer(api):
     with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file:
-        yield JalcImporter(
-            api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=True
-        )
+        yield JalcImporter(api, issn_file, bezerk_mode=True)
 
 
 @pytest.fixture(scope="function")
 def jalc_importer_existing(api):
     with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file:
-        yield JalcImporter(
-            api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=False
-        )
+        yield JalcImporter(api, issn_file, bezerk_mode=False)
 
 
 def test_jalc_importer(jalc_importer):
diff --git a/python/tests/import_jstor.py b/python/tests/import_jstor.py
index 8ad550b3..7e13c8b0 100644
--- a/python/tests/import_jstor.py
+++ b/python/tests/import_jstor.py
@@ -8,17 +8,13 @@ from fatcat_tools.importers import Bs4XmlFilePusher, JstorImporter
 @pytest.fixture(scope="function")
 def jstor_importer(api):
     with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file:
-        yield JstorImporter(
-            api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=True
-        )
+        yield JstorImporter(api, issn_file, bezerk_mode=True)
 
 
 @pytest.fixture(scope="function")
 def jstor_importer_existing(api):
     with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file:
-        yield JstorImporter(
-            api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=False
-        )
+        yield JstorImporter(api, issn_file, bezerk_mode=False)
 
 
 def test_jstor_importer(jstor_importer):
diff --git a/python/tests/import_pubmed.py b/python/tests/import_pubmed.py
index a5301f29..e783db48 100644
--- a/python/tests/import_pubmed.py
+++ b/python/tests/import_pubmed.py
@@ -11,7 +11,6 @@ def pubmed_importer(api):
         yield PubmedImporter(
             api,
             issn_file,
-            extid_map_file="tests/files/example_map.sqlite3",
             bezerk_mode=True,
             lookup_refs=True,
         )
@@ -23,7 +22,6 @@ def pubmed_importer_existing(api):
         yield PubmedImporter(
             api,
             issn_file,
-            extid_map_file="tests/files/example_map.sqlite3",
             bezerk_mode=False,
             lookup_refs=True,
         )
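After this change the importers take only the API client, the ISSN map file, and their remaining keyword arguments, as the updated test fixtures show. A minimal usage sketch under those assumptions (`api` is assumed to be an authenticated fatcat API client; the input path is hypothetical):

```python
from fatcat_tools.importers import CrossrefImporter, JsonLinePusher

# `api` is assumed: an authenticated fatcat API client, as in the test fixtures
with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file:
    importer = CrossrefImporter(api, issn_file, bezerk_mode=True)

    # feed newline-delimited Crossref JSON records, as fatcat_import.py does
    with open("crossref-works.sample.json", "r") as json_file:  # hypothetical path
        JsonLinePusher(importer, json_file).run()
```

Note that the `--extid-map-file` CLI arguments are deleted outright rather than ignored, so command-line invocations still passing that flag will now fail argument parsing; the constructors, which read the option via `kwargs.get("extid_map_file")`, simply no longer consult it.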