summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2021-11-09 18:10:35 -0800
committerBryan Newbold <bnewbold@robocracy.org>2021-11-09 18:49:46 -0800
commitba7f9214d2038882952eb50cd4dc5eff4eb0e6ff (patch)
tree2f3ff3ba4b70f0f7d4603a224bf68cbe3892376b
parenta6d994fbc18debcf3860e6deb12eb54234a42839 (diff)
downloadfatcat-ba7f9214d2038882952eb50cd4dc5eff4eb0e6ff.tar.gz
fatcat-ba7f9214d2038882952eb50cd4dc5eff4eb0e6ff.zip
remove deprecated extid sqlite3 lookup table feature from importers
This was used during initial bulk imports, but is no longer used and could create serious metadata problems if used accidentally. In retrospect, it also made metadata provenance less transparent, and may have done more harm than good overall.
-rw-r--r--python/README_import.md3
-rwxr-xr-xpython/fatcat_import.py22
-rw-r--r--python/fatcat_tools/importers/crossref.py54
-rw-r--r--python/fatcat_tools/importers/datacite.py54
-rw-r--r--python/fatcat_tools/importers/jalc.py52
-rw-r--r--python/tests/import_crossref.py8
-rw-r--r--python/tests/import_datacite.py2
-rw-r--r--python/tests/import_jalc.py8
-rw-r--r--python/tests/import_jstor.py8
-rw-r--r--python/tests/import_pubmed.py2
10 files changed, 10 insertions, 203 deletions
diff --git a/python/README_import.md b/python/README_import.md
index 6853a4d7..74e75e14 100644
--- a/python/README_import.md
+++ b/python/README_import.md
@@ -52,6 +52,7 @@ Usually tens of minutes on fast production machine.
Usually 24 hours or so on fast production machine.
+ # NOTE: `--extid-map-file` was used during initial import, but is now deprecated
time xzcat /srv/fatcat/datasets/crossref-works.2018-09-05.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py crossref - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3
## JALC
@@ -59,6 +60,7 @@ Usually 24 hours or so on fast production machine.
First import a random subset single threaded to create (most) containers. On a
fast machine, this takes a couple minutes.
+ # NOTE: `--extid-map-file` was used during initial import, but is now deprecated
time ./fatcat_import.py jalc /srv/fatcat/datasets/JALC-LOD-20180907.sample10k.rdf /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3
Then, in parallel:
@@ -116,6 +118,7 @@ Prep JSON files from sqlite (for parallel import):
Run import in parallel:
+ # NOTE: `--extid-map-file` was used during initial import, but is now deprecated
export FATCAT_AUTH_WORKER_CRAWL=...
zcat /srv/fatcat/datasets/s2_doi.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py arabesque --json-file - --extid-type doi --crawl-id DIRECT-OA-CRAWL-2019 --no-require-grobid
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 445acde8..39ef200a 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -54,7 +54,6 @@ def run_crossref(args: argparse.Namespace) -> None:
fci = CrossrefImporter(
args.api,
args.issn_map_file,
- extid_map_file=args.extid_map_file,
edit_batch_size=args.batch_size,
bezerk_mode=args.bezerk_mode,
)
@@ -72,7 +71,7 @@ def run_crossref(args: argparse.Namespace) -> None:
def run_jalc(args: argparse.Namespace) -> None:
- ji = JalcImporter(args.api, args.issn_map_file, extid_map_file=args.extid_map_file)
+ ji = JalcImporter(args.api, args.issn_map_file)
Bs4XmlLinesPusher(ji, args.xml_file, "<rdf:Description").run()
@@ -370,7 +369,6 @@ def run_datacite(args: argparse.Namespace) -> None:
edit_batch_size=args.batch_size,
bezerk_mode=args.bezerk_mode,
debug=args.debug,
- extid_map_file=args.extid_map_file,
insert_log_file=args.insert_log_file,
)
if args.kafka_mode:
@@ -495,12 +493,6 @@ def main() -> None:
type=argparse.FileType("r"),
)
sub_crossref.add_argument(
- "--extid-map-file",
- help="DOI-to-other-identifiers sqlite3 database",
- default=None,
- type=str,
- )
- sub_crossref.add_argument(
"--no-lookup-refs", action="store_true", help="skip lookup of references (PMID or DOI)"
)
sub_crossref.add_argument(
@@ -529,12 +521,6 @@ def main() -> None:
default=None,
type=argparse.FileType("r"),
)
- sub_jalc.add_argument(
- "--extid-map-file",
- help="DOI-to-other-identifiers sqlite3 database",
- default=None,
- type=str,
- )
sub_arxiv = subparsers.add_parser("arxiv", help="import arxiv.org metadata from XML files")
sub_arxiv.set_defaults(
@@ -964,12 +950,6 @@ def main() -> None:
type=argparse.FileType("r"),
)
sub_datacite.add_argument(
- "--extid-map-file",
- help="DOI-to-other-identifiers sqlite3 database",
- default=None,
- type=str,
- )
- sub_datacite.add_argument(
"--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)"
)
sub_datacite.add_argument(
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index a41e2bf5..9c69fee3 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -1,5 +1,4 @@
import datetime
-import sqlite3
from typing import Any, Dict, List, Optional, Sequence
import fatcat_openapi_client
@@ -128,8 +127,6 @@ class CrossrefImporter(EntityImporter):
"""
Importer for Crossref metadata.
- Can use a local sqlite3 file for faster "external identifier" lookups
-
See https://github.com/CrossRef/rest-api-doc for JSON schema notes
"""
@@ -150,50 +147,8 @@ class CrossrefImporter(EntityImporter):
)
self.create_containers: bool = kwargs.get("create_containers", True)
- extid_map_file = kwargs.get("extid_map_file")
- self.extid_map_db: Optional[Any] = None
- if extid_map_file:
- db_uri = "file:{}?mode=ro".format(extid_map_file)
- print("Using external ID map: {}".format(db_uri))
- self.extid_map_db = sqlite3.connect(db_uri, uri=True)
- else:
- print("Not using external ID map")
-
self.read_issn_map_file(issn_map_file)
- def lookup_ext_ids(self, doi: str) -> Optional[Any]:
- if self.extid_map_db is None:
- return dict(
- core_id=None,
- pmid=None,
- pmcid=None,
- wikidata_qid=None,
- arxiv_id=None,
- jstor_id=None,
- )
- row = self.extid_map_db.execute(
- "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()]
- ).fetchone()
- if row is None:
- return dict(
- core_id=None,
- pmid=None,
- pmcid=None,
- wikidata_qid=None,
- arxiv_id=None,
- jstor_id=None,
- )
- row = [str(cell or "") or None for cell in row]
- return dict(
- core_id=row[0],
- pmid=row[1],
- pmcid=row[2],
- wikidata_qid=row[3],
- # TODO:
- arxiv_id=None,
- jstor_id=None,
- )
-
def map_release_type(self, crossref_type: str) -> Optional[str]:
return CROSSREF_TYPE_MAP.get(crossref_type)
@@ -473,9 +428,6 @@ class CrossrefImporter(EntityImporter):
# unknown
release_stage = None
- # external identifiers
- extids: Dict[str, Any] = self.lookup_ext_ids(doi=obj["DOI"].lower()) or {}
-
# filter out unreasonably huge releases
if len(abstracts) > 100:
self.counts["skip-huge-abstracts"] += 1
@@ -538,13 +490,7 @@ class CrossrefImporter(EntityImporter):
publisher=publisher,
ext_ids=fatcat_openapi_client.ReleaseExtIds(
doi=obj["DOI"].lower(),
- pmid=extids["pmid"],
- pmcid=extids["pmcid"],
- wikidata_qid=extids["wikidata_qid"],
isbn13=isbn13,
- core=extids["core_id"],
- arxiv=extids["arxiv_id"],
- jstor=extids["jstor_id"],
),
volume=clean(obj.get("volume")),
issue=clean(obj.get("issue")),
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index d5622960..d4d7a9f5 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -12,7 +12,6 @@ import collections
import datetime
import json
import re
-import sqlite3
import sys
from typing import Any, Dict, List, Optional, Sequence, Set, Tuple
@@ -248,15 +247,6 @@ class DataciteImporter(EntityImporter):
)
self.create_containers = kwargs.get("create_containers", True)
- extid_map_file = kwargs.get("extid_map_file")
- self.extid_map_db = None
- if extid_map_file:
- db_uri = "file:{}?mode=ro".format(extid_map_file)
- print("Using external ID map: {}".format(db_uri), file=sys.stderr)
- self.extid_map_db = sqlite3.connect(db_uri, uri=True)
- else:
- print("Not using external ID map", file=sys.stderr)
-
self.read_issn_map_file(issn_map_file)
self.debug = debug
self.insert_log_file = insert_log_file
@@ -264,42 +254,6 @@ class DataciteImporter(EntityImporter):
print("datacite with debug={}".format(self.debug), file=sys.stderr)
- def lookup_ext_ids(self, doi: str) -> Dict[str, Any]:
- """
- Return dictionary of identifiers referring to the same things as the given DOI.
- """
- if self.extid_map_db is None:
- return dict(
- core_id=None,
- pmid=None,
- pmcid=None,
- wikidata_qid=None,
- arxiv_id=None,
- jstor_id=None,
- )
- row = self.extid_map_db.execute(
- "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()]
- ).fetchone()
- if row is None:
- return dict(
- core_id=None,
- pmid=None,
- pmcid=None,
- wikidata_qid=None,
- arxiv_id=None,
- jstor_id=None,
- )
- row = [str(cell or "") or None for cell in row]
- return dict(
- core_id=row[0],
- pmid=row[1],
- pmcid=row[2],
- wikidata_qid=row[3],
- # TODO:
- arxiv_id=None,
- jstor_id=None,
- )
-
def parse_record(self, obj: Dict[str, Any]) -> Optional[ReleaseEntity]:
"""
Mapping datacite JSON to ReleaseEntity.
@@ -706,8 +660,6 @@ class DataciteImporter(EntityImporter):
if release_month:
extra["release_month"] = release_month
- extids = self.lookup_ext_ids(doi=doi)
-
# Assemble release.
re = fatcat_openapi_client.ReleaseEntity(
work_id=None,
@@ -722,12 +674,6 @@ class DataciteImporter(EntityImporter):
publisher=publisher,
ext_ids=fatcat_openapi_client.ReleaseExtIds(
doi=doi,
- pmid=extids["pmid"],
- pmcid=extids["pmcid"],
- wikidata_qid=extids["wikidata_qid"],
- core=extids["core_id"],
- arxiv=extids["arxiv_id"],
- jstor=extids["jstor_id"],
),
contribs=contribs,
volume=volume,
diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py
index 2f10e533..a737ac9f 100644
--- a/python/fatcat_tools/importers/jalc.py
+++ b/python/fatcat_tools/importers/jalc.py
@@ -1,5 +1,4 @@
import datetime
-import sqlite3
import sys
from typing import Any, Dict, List, Optional, Sequence
@@ -117,50 +116,8 @@ class JalcImporter(EntityImporter):
)
self.create_containers = kwargs.get("create_containers", True)
- extid_map_file = kwargs.get("extid_map_file")
- self.extid_map_db = None
- if extid_map_file:
- db_uri = "file:{}?mode=ro".format(extid_map_file)
- print("Using external ID map: {}".format(db_uri))
- self.extid_map_db = sqlite3.connect(db_uri, uri=True)
- else:
- print("Not using external ID map")
-
self.read_issn_map_file(issn_map_file)
- def lookup_ext_ids(self, doi: str) -> Dict[str, Any]:
- if self.extid_map_db is None:
- return dict(
- core_id=None,
- pmid=None,
- pmcid=None,
- wikidata_qid=None,
- arxiv_id=None,
- jstor_id=None,
- )
- row = self.extid_map_db.execute(
- "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()]
- ).fetchone()
- if row is None:
- return dict(
- core_id=None,
- pmid=None,
- pmcid=None,
- wikidata_qid=None,
- arxiv_id=None,
- jstor_id=None,
- )
- row = [str(cell or "") or None for cell in row]
- return dict(
- core_id=row[0],
- pmid=row[1],
- pmcid=row[2],
- wikidata_qid=row[3],
- # TODO:
- arxiv_id=None,
- jstor_id=None,
- )
-
def want(self, raw_record: Any) -> bool:
return True
@@ -330,9 +287,6 @@ class JalcImporter(EntityImporter):
# reasonable default for this collection
release_type = "article-journal"
- # external identifiers
- extids = self.lookup_ext_ids(doi=doi)
-
# extra:
# translation_of
# aliases
@@ -356,12 +310,6 @@ class JalcImporter(EntityImporter):
release_year=release_year,
ext_ids=fatcat_openapi_client.ReleaseExtIds(
doi=doi,
- pmid=extids["pmid"],
- pmcid=extids["pmcid"],
- wikidata_qid=extids["wikidata_qid"],
- core=extids["core_id"],
- arxiv=extids["arxiv_id"],
- jstor=extids["jstor_id"],
),
volume=volume,
issue=issue,
diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py
index eb931eb1..5f38e73e 100644
--- a/python/tests/import_crossref.py
+++ b/python/tests/import_crossref.py
@@ -10,17 +10,13 @@ from fatcat_tools.importers import CrossrefImporter, JsonLinePusher
@pytest.fixture(scope="function")
def crossref_importer(api):
with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file:
- yield CrossrefImporter(
- api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=True
- )
+ yield CrossrefImporter(api, issn_file, bezerk_mode=True)
@pytest.fixture(scope="function")
def crossref_importer_existing(api):
with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file:
- yield CrossrefImporter(
- api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=False
- )
+ yield CrossrefImporter(api, issn_file, bezerk_mode=False)
@pytest.mark.skip(
diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
index 220dc0f6..b15d14c3 100644
--- a/python/tests/import_datacite.py
+++ b/python/tests/import_datacite.py
@@ -30,7 +30,6 @@ def datacite_importer(api):
yield DataciteImporter(
api,
issn_file,
- extid_map_file="tests/files/example_map.sqlite3",
bezerk_mode=True,
)
@@ -41,7 +40,6 @@ def datacite_importer_existing(api):
yield DataciteImporter(
api,
issn_file,
- extid_map_file="tests/files/example_map.sqlite3",
bezerk_mode=False,
)
diff --git a/python/tests/import_jalc.py b/python/tests/import_jalc.py
index 4ebc87b4..8281b9a1 100644
--- a/python/tests/import_jalc.py
+++ b/python/tests/import_jalc.py
@@ -8,17 +8,13 @@ from fatcat_tools.importers import Bs4XmlFilePusher, Bs4XmlLinesPusher, JalcImpo
@pytest.fixture(scope="function")
def jalc_importer(api):
with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file:
- yield JalcImporter(
- api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=True
- )
+ yield JalcImporter(api, issn_file, bezerk_mode=True)
@pytest.fixture(scope="function")
def jalc_importer_existing(api):
with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file:
- yield JalcImporter(
- api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=False
- )
+ yield JalcImporter(api, issn_file, bezerk_mode=False)
def test_jalc_importer(jalc_importer):
diff --git a/python/tests/import_jstor.py b/python/tests/import_jstor.py
index 8ad550b3..7e13c8b0 100644
--- a/python/tests/import_jstor.py
+++ b/python/tests/import_jstor.py
@@ -8,17 +8,13 @@ from fatcat_tools.importers import Bs4XmlFilePusher, JstorImporter
@pytest.fixture(scope="function")
def jstor_importer(api):
with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file:
- yield JstorImporter(
- api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=True
- )
+ yield JstorImporter(api, issn_file, bezerk_mode=True)
@pytest.fixture(scope="function")
def jstor_importer_existing(api):
with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file:
- yield JstorImporter(
- api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=False
- )
+ yield JstorImporter(api, issn_file, bezerk_mode=False)
def test_jstor_importer(jstor_importer):
diff --git a/python/tests/import_pubmed.py b/python/tests/import_pubmed.py
index a5301f29..e783db48 100644
--- a/python/tests/import_pubmed.py
+++ b/python/tests/import_pubmed.py
@@ -11,7 +11,6 @@ def pubmed_importer(api):
yield PubmedImporter(
api,
issn_file,
- extid_map_file="tests/files/example_map.sqlite3",
bezerk_mode=True,
lookup_refs=True,
)
@@ -23,7 +22,6 @@ def pubmed_importer_existing(api):
yield PubmedImporter(
api,
issn_file,
- extid_map_file="tests/files/example_map.sqlite3",
bezerk_mode=False,
lookup_refs=True,
)