aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2021-11-09 18:10:35 -0800
committerBryan Newbold <bnewbold@robocracy.org>2021-11-09 18:49:46 -0800
commitba7f9214d2038882952eb50cd4dc5eff4eb0e6ff (patch)
tree2f3ff3ba4b70f0f7d4603a224bf68cbe3892376b /python/fatcat_tools/importers
parenta6d994fbc18debcf3860e6deb12eb54234a42839 (diff)
downloadfatcat-ba7f9214d2038882952eb50cd4dc5eff4eb0e6ff.tar.gz
fatcat-ba7f9214d2038882952eb50cd4dc5eff4eb0e6ff.zip
remove deprecated extid sqlite3 lookup table feature from importers
This was used during initial bulk imports, but is no longer used and could create serious metadata problems if used accidentially. In retrospect, it also made metadata provenance less transparent, and may have done more harm than good overall.
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r--python/fatcat_tools/importers/crossref.py54
-rw-r--r--python/fatcat_tools/importers/datacite.py54
-rw-r--r--python/fatcat_tools/importers/jalc.py52
3 files changed, 0 insertions, 160 deletions
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index a41e2bf5..9c69fee3 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -1,5 +1,4 @@
import datetime
-import sqlite3
from typing import Any, Dict, List, Optional, Sequence
import fatcat_openapi_client
@@ -128,8 +127,6 @@ class CrossrefImporter(EntityImporter):
"""
Importer for Crossref metadata.
- Can use a local sqlite3 file for faster "external identifier" lookups
-
See https://github.com/CrossRef/rest-api-doc for JSON schema notes
"""
@@ -150,50 +147,8 @@ class CrossrefImporter(EntityImporter):
)
self.create_containers: bool = kwargs.get("create_containers", True)
- extid_map_file = kwargs.get("extid_map_file")
- self.extid_map_db: Optional[Any] = None
- if extid_map_file:
- db_uri = "file:{}?mode=ro".format(extid_map_file)
- print("Using external ID map: {}".format(db_uri))
- self.extid_map_db = sqlite3.connect(db_uri, uri=True)
- else:
- print("Not using external ID map")
-
self.read_issn_map_file(issn_map_file)
- def lookup_ext_ids(self, doi: str) -> Optional[Any]:
- if self.extid_map_db is None:
- return dict(
- core_id=None,
- pmid=None,
- pmcid=None,
- wikidata_qid=None,
- arxiv_id=None,
- jstor_id=None,
- )
- row = self.extid_map_db.execute(
- "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()]
- ).fetchone()
- if row is None:
- return dict(
- core_id=None,
- pmid=None,
- pmcid=None,
- wikidata_qid=None,
- arxiv_id=None,
- jstor_id=None,
- )
- row = [str(cell or "") or None for cell in row]
- return dict(
- core_id=row[0],
- pmid=row[1],
- pmcid=row[2],
- wikidata_qid=row[3],
- # TODO:
- arxiv_id=None,
- jstor_id=None,
- )
-
def map_release_type(self, crossref_type: str) -> Optional[str]:
return CROSSREF_TYPE_MAP.get(crossref_type)
@@ -473,9 +428,6 @@ class CrossrefImporter(EntityImporter):
# unknown
release_stage = None
- # external identifiers
- extids: Dict[str, Any] = self.lookup_ext_ids(doi=obj["DOI"].lower()) or {}
-
# filter out unreasonably huge releases
if len(abstracts) > 100:
self.counts["skip-huge-abstracts"] += 1
@@ -538,13 +490,7 @@ class CrossrefImporter(EntityImporter):
publisher=publisher,
ext_ids=fatcat_openapi_client.ReleaseExtIds(
doi=obj["DOI"].lower(),
- pmid=extids["pmid"],
- pmcid=extids["pmcid"],
- wikidata_qid=extids["wikidata_qid"],
isbn13=isbn13,
- core=extids["core_id"],
- arxiv=extids["arxiv_id"],
- jstor=extids["jstor_id"],
),
volume=clean(obj.get("volume")),
issue=clean(obj.get("issue")),
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index d5622960..d4d7a9f5 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -12,7 +12,6 @@ import collections
import datetime
import json
import re
-import sqlite3
import sys
from typing import Any, Dict, List, Optional, Sequence, Set, Tuple
@@ -248,15 +247,6 @@ class DataciteImporter(EntityImporter):
)
self.create_containers = kwargs.get("create_containers", True)
- extid_map_file = kwargs.get("extid_map_file")
- self.extid_map_db = None
- if extid_map_file:
- db_uri = "file:{}?mode=ro".format(extid_map_file)
- print("Using external ID map: {}".format(db_uri), file=sys.stderr)
- self.extid_map_db = sqlite3.connect(db_uri, uri=True)
- else:
- print("Not using external ID map", file=sys.stderr)
-
self.read_issn_map_file(issn_map_file)
self.debug = debug
self.insert_log_file = insert_log_file
@@ -264,42 +254,6 @@ class DataciteImporter(EntityImporter):
print("datacite with debug={}".format(self.debug), file=sys.stderr)
- def lookup_ext_ids(self, doi: str) -> Dict[str, Any]:
- """
- Return dictionary of identifiers referring to the same things as the given DOI.
- """
- if self.extid_map_db is None:
- return dict(
- core_id=None,
- pmid=None,
- pmcid=None,
- wikidata_qid=None,
- arxiv_id=None,
- jstor_id=None,
- )
- row = self.extid_map_db.execute(
- "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()]
- ).fetchone()
- if row is None:
- return dict(
- core_id=None,
- pmid=None,
- pmcid=None,
- wikidata_qid=None,
- arxiv_id=None,
- jstor_id=None,
- )
- row = [str(cell or "") or None for cell in row]
- return dict(
- core_id=row[0],
- pmid=row[1],
- pmcid=row[2],
- wikidata_qid=row[3],
- # TODO:
- arxiv_id=None,
- jstor_id=None,
- )
-
def parse_record(self, obj: Dict[str, Any]) -> Optional[ReleaseEntity]:
"""
Mapping datacite JSON to ReleaseEntity.
@@ -706,8 +660,6 @@ class DataciteImporter(EntityImporter):
if release_month:
extra["release_month"] = release_month
- extids = self.lookup_ext_ids(doi=doi)
-
# Assemble release.
re = fatcat_openapi_client.ReleaseEntity(
work_id=None,
@@ -722,12 +674,6 @@ class DataciteImporter(EntityImporter):
publisher=publisher,
ext_ids=fatcat_openapi_client.ReleaseExtIds(
doi=doi,
- pmid=extids["pmid"],
- pmcid=extids["pmcid"],
- wikidata_qid=extids["wikidata_qid"],
- core=extids["core_id"],
- arxiv=extids["arxiv_id"],
- jstor=extids["jstor_id"],
),
contribs=contribs,
volume=volume,
diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py
index 2f10e533..a737ac9f 100644
--- a/python/fatcat_tools/importers/jalc.py
+++ b/python/fatcat_tools/importers/jalc.py
@@ -1,5 +1,4 @@
import datetime
-import sqlite3
import sys
from typing import Any, Dict, List, Optional, Sequence
@@ -117,50 +116,8 @@ class JalcImporter(EntityImporter):
)
self.create_containers = kwargs.get("create_containers", True)
- extid_map_file = kwargs.get("extid_map_file")
- self.extid_map_db = None
- if extid_map_file:
- db_uri = "file:{}?mode=ro".format(extid_map_file)
- print("Using external ID map: {}".format(db_uri))
- self.extid_map_db = sqlite3.connect(db_uri, uri=True)
- else:
- print("Not using external ID map")
-
self.read_issn_map_file(issn_map_file)
- def lookup_ext_ids(self, doi: str) -> Dict[str, Any]:
- if self.extid_map_db is None:
- return dict(
- core_id=None,
- pmid=None,
- pmcid=None,
- wikidata_qid=None,
- arxiv_id=None,
- jstor_id=None,
- )
- row = self.extid_map_db.execute(
- "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()]
- ).fetchone()
- if row is None:
- return dict(
- core_id=None,
- pmid=None,
- pmcid=None,
- wikidata_qid=None,
- arxiv_id=None,
- jstor_id=None,
- )
- row = [str(cell or "") or None for cell in row]
- return dict(
- core_id=row[0],
- pmid=row[1],
- pmcid=row[2],
- wikidata_qid=row[3],
- # TODO:
- arxiv_id=None,
- jstor_id=None,
- )
-
def want(self, raw_record: Any) -> bool:
return True
@@ -330,9 +287,6 @@ class JalcImporter(EntityImporter):
# reasonable default for this collection
release_type = "article-journal"
- # external identifiers
- extids = self.lookup_ext_ids(doi=doi)
-
# extra:
# translation_of
# aliases
@@ -356,12 +310,6 @@ class JalcImporter(EntityImporter):
release_year=release_year,
ext_ids=fatcat_openapi_client.ReleaseExtIds(
doi=doi,
- pmid=extids["pmid"],
- pmcid=extids["pmcid"],
- wikidata_qid=extids["wikidata_qid"],
- core=extids["core_id"],
- arxiv=extids["arxiv_id"],
- jstor=extids["jstor_id"],
),
volume=volume,
issue=issue,