aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--python/README_import.md3
-rwxr-xr-xpython/fatcat_import.py22
-rw-r--r--python/fatcat_tools/importers/crossref.py54
-rw-r--r--python/fatcat_tools/importers/datacite.py54
-rw-r--r--python/fatcat_tools/importers/jalc.py52
-rw-r--r--python/tests/import_crossref.py8
-rw-r--r--python/tests/import_datacite.py2
-rw-r--r--python/tests/import_jalc.py8
-rw-r--r--python/tests/import_jstor.py8
-rw-r--r--python/tests/import_pubmed.py2
10 files changed, 10 insertions, 203 deletions
diff --git a/python/README_import.md b/python/README_import.md
index 6853a4d7..74e75e14 100644
--- a/python/README_import.md
+++ b/python/README_import.md
@@ -52,6 +52,7 @@ Usually tens of minutes on fast production machine.
Usually 24 hours or so on fast production machine.
+ # NOTE: `--extid-map-file` was used during initial import, but is now deprecated
time xzcat /srv/fatcat/datasets/crossref-works.2018-09-05.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py crossref - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3
## JALC
@@ -59,6 +60,7 @@ Usually 24 hours or so on fast production machine.
First import a random subset single threaded to create (most) containers. On a
fast machine, this takes a couple minutes.
+ # NOTE: `--extid-map-file` was used during initial import, but is now deprecated
time ./fatcat_import.py jalc /srv/fatcat/datasets/JALC-LOD-20180907.sample10k.rdf /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3
Then, in parallel:
@@ -116,6 +118,7 @@ Prep JSON files from sqlite (for parallel import):
Run import in parallel:
+ # NOTE: `--extid-map-file` was used during initial import, but is now deprecated
export FATCAT_AUTH_WORKER_CRAWL=...
zcat /srv/fatcat/datasets/s2_doi.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py arabesque --json-file - --extid-type doi --crawl-id DIRECT-OA-CRAWL-2019 --no-require-grobid
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 445acde8..39ef200a 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -54,7 +54,6 @@ def run_crossref(args: argparse.Namespace) -> None:
fci = CrossrefImporter(
args.api,
args.issn_map_file,
- extid_map_file=args.extid_map_file,
edit_batch_size=args.batch_size,
bezerk_mode=args.bezerk_mode,
)
@@ -72,7 +71,7 @@ def run_crossref(args: argparse.Namespace) -> None:
def run_jalc(args: argparse.Namespace) -> None:
- ji = JalcImporter(args.api, args.issn_map_file, extid_map_file=args.extid_map_file)
+ ji = JalcImporter(args.api, args.issn_map_file)
Bs4XmlLinesPusher(ji, args.xml_file, "<rdf:Description").run()
@@ -370,7 +369,6 @@ def run_datacite(args: argparse.Namespace) -> None:
edit_batch_size=args.batch_size,
bezerk_mode=args.bezerk_mode,
debug=args.debug,
- extid_map_file=args.extid_map_file,
insert_log_file=args.insert_log_file,
)
if args.kafka_mode:
@@ -495,12 +493,6 @@ def main() -> None:
type=argparse.FileType("r"),
)
sub_crossref.add_argument(
- "--extid-map-file",
- help="DOI-to-other-identifiers sqlite3 database",
- default=None,
- type=str,
- )
- sub_crossref.add_argument(
"--no-lookup-refs", action="store_true", help="skip lookup of references (PMID or DOI)"
)
sub_crossref.add_argument(
@@ -529,12 +521,6 @@ def main() -> None:
default=None,
type=argparse.FileType("r"),
)
- sub_jalc.add_argument(
- "--extid-map-file",
- help="DOI-to-other-identifiers sqlite3 database",
- default=None,
- type=str,
- )
sub_arxiv = subparsers.add_parser("arxiv", help="import arxiv.org metadata from XML files")
sub_arxiv.set_defaults(
@@ -964,12 +950,6 @@ def main() -> None:
type=argparse.FileType("r"),
)
sub_datacite.add_argument(
- "--extid-map-file",
- help="DOI-to-other-identifiers sqlite3 database",
- default=None,
- type=str,
- )
- sub_datacite.add_argument(
"--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)"
)
sub_datacite.add_argument(
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index a41e2bf5..9c69fee3 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -1,5 +1,4 @@
import datetime
-import sqlite3
from typing import Any, Dict, List, Optional, Sequence
import fatcat_openapi_client
@@ -128,8 +127,6 @@ class CrossrefImporter(EntityImporter):
"""
Importer for Crossref metadata.
- Can use a local sqlite3 file for faster "external identifier" lookups
-
See https://github.com/CrossRef/rest-api-doc for JSON schema notes
"""
@@ -150,50 +147,8 @@ class CrossrefImporter(EntityImporter):
)
self.create_containers: bool = kwargs.get("create_containers", True)
- extid_map_file = kwargs.get("extid_map_file")
- self.extid_map_db: Optional[Any] = None
- if extid_map_file:
- db_uri = "file:{}?mode=ro".format(extid_map_file)
- print("Using external ID map: {}".format(db_uri))
- self.extid_map_db = sqlite3.connect(db_uri, uri=True)
- else:
- print("Not using external ID map")
-
self.read_issn_map_file(issn_map_file)
- def lookup_ext_ids(self, doi: str) -> Optional[Any]:
- if self.extid_map_db is None:
- return dict(
- core_id=None,
- pmid=None,
- pmcid=None,
- wikidata_qid=None,
- arxiv_id=None,
- jstor_id=None,
- )
- row = self.extid_map_db.execute(
- "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()]
- ).fetchone()
- if row is None:
- return dict(
- core_id=None,
- pmid=None,
- pmcid=None,
- wikidata_qid=None,
- arxiv_id=None,
- jstor_id=None,
- )
- row = [str(cell or "") or None for cell in row]
- return dict(
- core_id=row[0],
- pmid=row[1],
- pmcid=row[2],
- wikidata_qid=row[3],
- # TODO:
- arxiv_id=None,
- jstor_id=None,
- )
-
def map_release_type(self, crossref_type: str) -> Optional[str]:
return CROSSREF_TYPE_MAP.get(crossref_type)
@@ -473,9 +428,6 @@ class CrossrefImporter(EntityImporter):
# unknown
release_stage = None
- # external identifiers
- extids: Dict[str, Any] = self.lookup_ext_ids(doi=obj["DOI"].lower()) or {}
-
# filter out unreasonably huge releases
if len(abstracts) > 100:
self.counts["skip-huge-abstracts"] += 1
@@ -538,13 +490,7 @@ class CrossrefImporter(EntityImporter):
publisher=publisher,
ext_ids=fatcat_openapi_client.ReleaseExtIds(
doi=obj["DOI"].lower(),
- pmid=extids["pmid"],
- pmcid=extids["pmcid"],
- wikidata_qid=extids["wikidata_qid"],
isbn13=isbn13,
- core=extids["core_id"],
- arxiv=extids["arxiv_id"],
- jstor=extids["jstor_id"],
),
volume=clean(obj.get("volume")),
issue=clean(obj.get("issue")),
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index d5622960..d4d7a9f5 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -12,7 +12,6 @@ import collections
import datetime
import json
import re
-import sqlite3
import sys
from typing import Any, Dict, List, Optional, Sequence, Set, Tuple
@@ -248,15 +247,6 @@ class DataciteImporter(EntityImporter):
)
self.create_containers = kwargs.get("create_containers", True)
- extid_map_file = kwargs.get("extid_map_file")
- self.extid_map_db = None
- if extid_map_file:
- db_uri = "file:{}?mode=ro".format(extid_map_file)
- print("Using external ID map: {}".format(db_uri), file=sys.stderr)
- self.extid_map_db = sqlite3.connect(db_uri, uri=True)
- else:
- print("Not using external ID map", file=sys.stderr)
-
self.read_issn_map_file(issn_map_file)
self.debug = debug
self.insert_log_file = insert_log_file
@@ -264,42 +254,6 @@ class DataciteImporter(EntityImporter):
print("datacite with debug={}".format(self.debug), file=sys.stderr)
- def lookup_ext_ids(self, doi: str) -> Dict[str, Any]:
- """
- Return dictionary of identifiers referring to the same things as the given DOI.
- """
- if self.extid_map_db is None:
- return dict(
- core_id=None,
- pmid=None,
- pmcid=None,
- wikidata_qid=None,
- arxiv_id=None,
- jstor_id=None,
- )
- row = self.extid_map_db.execute(
- "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()]
- ).fetchone()
- if row is None:
- return dict(
- core_id=None,
- pmid=None,
- pmcid=None,
- wikidata_qid=None,
- arxiv_id=None,
- jstor_id=None,
- )
- row = [str(cell or "") or None for cell in row]
- return dict(
- core_id=row[0],
- pmid=row[1],
- pmcid=row[2],
- wikidata_qid=row[3],
- # TODO:
- arxiv_id=None,
- jstor_id=None,
- )
-
def parse_record(self, obj: Dict[str, Any]) -> Optional[ReleaseEntity]:
"""
Mapping datacite JSON to ReleaseEntity.
@@ -706,8 +660,6 @@ class DataciteImporter(EntityImporter):
if release_month:
extra["release_month"] = release_month
- extids = self.lookup_ext_ids(doi=doi)
-
# Assemble release.
re = fatcat_openapi_client.ReleaseEntity(
work_id=None,
@@ -722,12 +674,6 @@ class DataciteImporter(EntityImporter):
publisher=publisher,
ext_ids=fatcat_openapi_client.ReleaseExtIds(
doi=doi,
- pmid=extids["pmid"],
- pmcid=extids["pmcid"],
- wikidata_qid=extids["wikidata_qid"],
- core=extids["core_id"],
- arxiv=extids["arxiv_id"],
- jstor=extids["jstor_id"],
),
contribs=contribs,
volume=volume,
diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py
index 2f10e533..a737ac9f 100644
--- a/python/fatcat_tools/importers/jalc.py
+++ b/python/fatcat_tools/importers/jalc.py
@@ -1,5 +1,4 @@
import datetime
-import sqlite3
import sys
from typing import Any, Dict, List, Optional, Sequence
@@ -117,50 +116,8 @@ class JalcImporter(EntityImporter):
)
self.create_containers = kwargs.get("create_containers", True)
- extid_map_file = kwargs.get("extid_map_file")
- self.extid_map_db = None
- if extid_map_file:
- db_uri = "file:{}?mode=ro".format(extid_map_file)
- print("Using external ID map: {}".format(db_uri))
- self.extid_map_db = sqlite3.connect(db_uri, uri=True)
- else:
- print("Not using external ID map")
-
self.read_issn_map_file(issn_map_file)
- def lookup_ext_ids(self, doi: str) -> Dict[str, Any]:
- if self.extid_map_db is None:
- return dict(
- core_id=None,
- pmid=None,
- pmcid=None,
- wikidata_qid=None,
- arxiv_id=None,
- jstor_id=None,
- )
- row = self.extid_map_db.execute(
- "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()]
- ).fetchone()
- if row is None:
- return dict(
- core_id=None,
- pmid=None,
- pmcid=None,
- wikidata_qid=None,
- arxiv_id=None,
- jstor_id=None,
- )
- row = [str(cell or "") or None for cell in row]
- return dict(
- core_id=row[0],
- pmid=row[1],
- pmcid=row[2],
- wikidata_qid=row[3],
- # TODO:
- arxiv_id=None,
- jstor_id=None,
- )
-
def want(self, raw_record: Any) -> bool:
return True
@@ -330,9 +287,6 @@ class JalcImporter(EntityImporter):
# reasonable default for this collection
release_type = "article-journal"
- # external identifiers
- extids = self.lookup_ext_ids(doi=doi)
-
# extra:
# translation_of
# aliases
@@ -356,12 +310,6 @@ class JalcImporter(EntityImporter):
release_year=release_year,
ext_ids=fatcat_openapi_client.ReleaseExtIds(
doi=doi,
- pmid=extids["pmid"],
- pmcid=extids["pmcid"],
- wikidata_qid=extids["wikidata_qid"],
- core=extids["core_id"],
- arxiv=extids["arxiv_id"],
- jstor=extids["jstor_id"],
),
volume=volume,
issue=issue,
diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py
index eb931eb1..5f38e73e 100644
--- a/python/tests/import_crossref.py
+++ b/python/tests/import_crossref.py
@@ -10,17 +10,13 @@ from fatcat_tools.importers import CrossrefImporter, JsonLinePusher
@pytest.fixture(scope="function")
def crossref_importer(api):
with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file:
- yield CrossrefImporter(
- api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=True
- )
+ yield CrossrefImporter(api, issn_file, bezerk_mode=True)
@pytest.fixture(scope="function")
def crossref_importer_existing(api):
with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file:
- yield CrossrefImporter(
- api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=False
- )
+ yield CrossrefImporter(api, issn_file, bezerk_mode=False)
@pytest.mark.skip(
diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
index 220dc0f6..b15d14c3 100644
--- a/python/tests/import_datacite.py
+++ b/python/tests/import_datacite.py
@@ -30,7 +30,6 @@ def datacite_importer(api):
yield DataciteImporter(
api,
issn_file,
- extid_map_file="tests/files/example_map.sqlite3",
bezerk_mode=True,
)
@@ -41,7 +40,6 @@ def datacite_importer_existing(api):
yield DataciteImporter(
api,
issn_file,
- extid_map_file="tests/files/example_map.sqlite3",
bezerk_mode=False,
)
diff --git a/python/tests/import_jalc.py b/python/tests/import_jalc.py
index 4ebc87b4..8281b9a1 100644
--- a/python/tests/import_jalc.py
+++ b/python/tests/import_jalc.py
@@ -8,17 +8,13 @@ from fatcat_tools.importers import Bs4XmlFilePusher, Bs4XmlLinesPusher, JalcImpo
@pytest.fixture(scope="function")
def jalc_importer(api):
with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file:
- yield JalcImporter(
- api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=True
- )
+ yield JalcImporter(api, issn_file, bezerk_mode=True)
@pytest.fixture(scope="function")
def jalc_importer_existing(api):
with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file:
- yield JalcImporter(
- api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=False
- )
+ yield JalcImporter(api, issn_file, bezerk_mode=False)
def test_jalc_importer(jalc_importer):
diff --git a/python/tests/import_jstor.py b/python/tests/import_jstor.py
index 8ad550b3..7e13c8b0 100644
--- a/python/tests/import_jstor.py
+++ b/python/tests/import_jstor.py
@@ -8,17 +8,13 @@ from fatcat_tools.importers import Bs4XmlFilePusher, JstorImporter
@pytest.fixture(scope="function")
def jstor_importer(api):
with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file:
- yield JstorImporter(
- api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=True
- )
+ yield JstorImporter(api, issn_file, bezerk_mode=True)
@pytest.fixture(scope="function")
def jstor_importer_existing(api):
with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file:
- yield JstorImporter(
- api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=False
- )
+ yield JstorImporter(api, issn_file, bezerk_mode=False)
def test_jstor_importer(jstor_importer):
diff --git a/python/tests/import_pubmed.py b/python/tests/import_pubmed.py
index a5301f29..e783db48 100644
--- a/python/tests/import_pubmed.py
+++ b/python/tests/import_pubmed.py
@@ -11,7 +11,6 @@ def pubmed_importer(api):
yield PubmedImporter(
api,
issn_file,
- extid_map_file="tests/files/example_map.sqlite3",
bezerk_mode=True,
lookup_refs=True,
)
@@ -23,7 +22,6 @@ def pubmed_importer_existing(api):
yield PubmedImporter(
api,
issn_file,
- extid_map_file="tests/files/example_map.sqlite3",
bezerk_mode=False,
lookup_refs=True,
)