diff options
-rw-r--r-- | python/README_import.md | 3 | ||||
-rwxr-xr-x | python/fatcat_import.py | 22 | ||||
-rw-r--r-- | python/fatcat_tools/importers/crossref.py | 54 | ||||
-rw-r--r-- | python/fatcat_tools/importers/datacite.py | 54 | ||||
-rw-r--r-- | python/fatcat_tools/importers/jalc.py | 52 | ||||
-rw-r--r-- | python/tests/import_crossref.py | 8 | ||||
-rw-r--r-- | python/tests/import_datacite.py | 2 | ||||
-rw-r--r-- | python/tests/import_jalc.py | 8 | ||||
-rw-r--r-- | python/tests/import_jstor.py | 8 | ||||
-rw-r--r-- | python/tests/import_pubmed.py | 2 |
10 files changed, 10 insertions, 203 deletions
diff --git a/python/README_import.md b/python/README_import.md index 6853a4d7..74e75e14 100644 --- a/python/README_import.md +++ b/python/README_import.md @@ -52,6 +52,7 @@ Usually tens of minutes on fast production machine. Usually 24 hours or so on fast production machine. + # NOTE: `--extid-map-file` was used during initial import, but is now deprecated time xzcat /srv/fatcat/datasets/crossref-works.2018-09-05.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py crossref - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3 ## JALC @@ -59,6 +60,7 @@ Usually 24 hours or so on fast production machine. First import a random subset single threaded to create (most) containers. On a fast machine, this takes a couple minutes. + # NOTE: `--extid-map-file` was used during initial import, but is now deprecated time ./fatcat_import.py jalc /srv/fatcat/datasets/JALC-LOD-20180907.sample10k.rdf /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3 Then, in parallel: @@ -116,6 +118,7 @@ Prep JSON files from sqlite (for parallel import): Run import in parallel: + # NOTE: `--extid-map-file` was used during initial import, but is now deprecated export FATCAT_AUTH_WORKER_CRAWL=... zcat /srv/fatcat/datasets/s2_doi.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py arabesque --json-file - --extid-type doi --crawl-id DIRECT-OA-CRAWL-2019 --no-require-grobid diff --git a/python/fatcat_import.py b/python/fatcat_import.py index 445acde8..39ef200a 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -54,7 +54,6 @@ def run_crossref(args: argparse.Namespace) -> None: fci = CrossrefImporter( args.api, args.issn_map_file, - extid_map_file=args.extid_map_file, edit_batch_size=args.batch_size, bezerk_mode=args.bezerk_mode, ) @@ -72,7 +71,7 @@ def run_crossref(args: argparse.Namespace) -> None: def run_jalc(args: argparse.Namespace) -> None: - ji = JalcImporter(args.api, args.issn_map_file, extid_map_file=args.extid_map_file) + ji = JalcImporter(args.api, args.issn_map_file) Bs4XmlLinesPusher(ji, args.xml_file, "<rdf:Description").run() @@ -370,7 +369,6 @@ def run_datacite(args: argparse.Namespace) -> None: edit_batch_size=args.batch_size, bezerk_mode=args.bezerk_mode, debug=args.debug, - extid_map_file=args.extid_map_file, insert_log_file=args.insert_log_file, ) if args.kafka_mode: @@ -495,12 +493,6 @@ def main() -> None: type=argparse.FileType("r"), ) sub_crossref.add_argument( - "--extid-map-file", - help="DOI-to-other-identifiers sqlite3 database", - default=None, - type=str, - ) - sub_crossref.add_argument( "--no-lookup-refs", action="store_true", help="skip lookup of references (PMID or DOI)" ) sub_crossref.add_argument( @@ -529,12 +521,6 @@ def main() -> None: default=None, type=argparse.FileType("r"), ) - sub_jalc.add_argument( - "--extid-map-file", - help="DOI-to-other-identifiers sqlite3 database", - default=None, - type=str, - ) sub_arxiv = subparsers.add_parser("arxiv", help="import arxiv.org metadata from XML files") sub_arxiv.set_defaults( @@ -964,12 +950,6 @@ def main() -> None: type=argparse.FileType("r"), ) sub_datacite.add_argument( - "--extid-map-file", - help="DOI-to-other-identifiers sqlite3 database", - default=None, - type=str, - ) - sub_datacite.add_argument( "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)" ) sub_datacite.add_argument( diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index a41e2bf5..9c69fee3 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -1,5 +1,4 @@ import datetime -import sqlite3 from typing import Any, Dict, List, Optional, Sequence import fatcat_openapi_client @@ -128,8 +127,6 @@ class CrossrefImporter(EntityImporter): """ Importer for Crossref metadata. - Can use a local sqlite3 file for faster "external identifier" lookups - See https://github.com/CrossRef/rest-api-doc for JSON schema notes """ @@ -150,50 +147,8 @@ class CrossrefImporter(EntityImporter): ) self.create_containers: bool = kwargs.get("create_containers", True) - extid_map_file = kwargs.get("extid_map_file") - self.extid_map_db: Optional[Any] = None - if extid_map_file: - db_uri = "file:{}?mode=ro".format(extid_map_file) - print("Using external ID map: {}".format(db_uri)) - self.extid_map_db = sqlite3.connect(db_uri, uri=True) - else: - print("Not using external ID map") - self.read_issn_map_file(issn_map_file) - def lookup_ext_ids(self, doi: str) -> Optional[Any]: - if self.extid_map_db is None: - return dict( - core_id=None, - pmid=None, - pmcid=None, - wikidata_qid=None, - arxiv_id=None, - jstor_id=None, - ) - row = self.extid_map_db.execute( - "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()] - ).fetchone() - if row is None: - return dict( - core_id=None, - pmid=None, - pmcid=None, - wikidata_qid=None, - arxiv_id=None, - jstor_id=None, - ) - row = [str(cell or "") or None for cell in row] - return dict( - core_id=row[0], - pmid=row[1], - pmcid=row[2], - wikidata_qid=row[3], - # TODO: - arxiv_id=None, - jstor_id=None, - ) - def map_release_type(self, crossref_type: str) -> Optional[str]: return CROSSREF_TYPE_MAP.get(crossref_type) @@ -473,9 +428,6 @@ class CrossrefImporter(EntityImporter): # unknown release_stage = None - # external identifiers - extids: Dict[str, Any] = self.lookup_ext_ids(doi=obj["DOI"].lower()) or {} - # filter out unreasonably huge releases if len(abstracts) > 100: self.counts["skip-huge-abstracts"] += 1 @@ -538,13 +490,7 @@ class CrossrefImporter(EntityImporter): publisher=publisher, ext_ids=fatcat_openapi_client.ReleaseExtIds( doi=obj["DOI"].lower(), - pmid=extids["pmid"], - pmcid=extids["pmcid"], - wikidata_qid=extids["wikidata_qid"], isbn13=isbn13, - core=extids["core_id"], - arxiv=extids["arxiv_id"], - jstor=extids["jstor_id"], ), volume=clean(obj.get("volume")), issue=clean(obj.get("issue")), diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index d5622960..d4d7a9f5 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -12,7 +12,6 @@ import collections import datetime import json import re -import sqlite3 import sys from typing import Any, Dict, List, Optional, Sequence, Set, Tuple @@ -248,15 +247,6 @@ class DataciteImporter(EntityImporter): ) self.create_containers = kwargs.get("create_containers", True) - extid_map_file = kwargs.get("extid_map_file") - self.extid_map_db = None - if extid_map_file: - db_uri = "file:{}?mode=ro".format(extid_map_file) - print("Using external ID map: {}".format(db_uri), file=sys.stderr) - self.extid_map_db = sqlite3.connect(db_uri, uri=True) - else: - print("Not using external ID map", file=sys.stderr) - self.read_issn_map_file(issn_map_file) self.debug = debug self.insert_log_file = insert_log_file @@ -264,42 +254,6 @@ class DataciteImporter(EntityImporter): print("datacite with debug={}".format(self.debug), file=sys.stderr) - def lookup_ext_ids(self, doi: str) -> Dict[str, Any]: - """ - Return dictionary of identifiers referring to the same things as the given DOI. - """ - if self.extid_map_db is None: - return dict( - core_id=None, - pmid=None, - pmcid=None, - wikidata_qid=None, - arxiv_id=None, - jstor_id=None, - ) - row = self.extid_map_db.execute( - "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()] - ).fetchone() - if row is None: - return dict( - core_id=None, - pmid=None, - pmcid=None, - wikidata_qid=None, - arxiv_id=None, - jstor_id=None, - ) - row = [str(cell or "") or None for cell in row] - return dict( - core_id=row[0], - pmid=row[1], - pmcid=row[2], - wikidata_qid=row[3], - # TODO: - arxiv_id=None, - jstor_id=None, - ) - def parse_record(self, obj: Dict[str, Any]) -> Optional[ReleaseEntity]: """ Mapping datacite JSON to ReleaseEntity. @@ -706,8 +660,6 @@ class DataciteImporter(EntityImporter): if release_month: extra["release_month"] = release_month - extids = self.lookup_ext_ids(doi=doi) - # Assemble release. re = fatcat_openapi_client.ReleaseEntity( work_id=None, @@ -722,12 +674,6 @@ class DataciteImporter(EntityImporter): publisher=publisher, ext_ids=fatcat_openapi_client.ReleaseExtIds( doi=doi, - pmid=extids["pmid"], - pmcid=extids["pmcid"], - wikidata_qid=extids["wikidata_qid"], - core=extids["core_id"], - arxiv=extids["arxiv_id"], - jstor=extids["jstor_id"], ), contribs=contribs, volume=volume, diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py index 2f10e533..a737ac9f 100644 --- a/python/fatcat_tools/importers/jalc.py +++ b/python/fatcat_tools/importers/jalc.py @@ -1,5 +1,4 @@ import datetime -import sqlite3 import sys from typing import Any, Dict, List, Optional, Sequence @@ -117,50 +116,8 @@ class JalcImporter(EntityImporter): ) self.create_containers = kwargs.get("create_containers", True) - extid_map_file = kwargs.get("extid_map_file") - self.extid_map_db = None - if extid_map_file: - db_uri = "file:{}?mode=ro".format(extid_map_file) - print("Using external ID map: {}".format(db_uri)) - self.extid_map_db = sqlite3.connect(db_uri, uri=True) - else: - print("Not using external ID map") - self.read_issn_map_file(issn_map_file) - def lookup_ext_ids(self, doi: str) -> Dict[str, Any]: - if self.extid_map_db is None: - return dict( - core_id=None, - pmid=None, - pmcid=None, - wikidata_qid=None, - arxiv_id=None, - jstor_id=None, - ) - row = self.extid_map_db.execute( - "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()] - ).fetchone() - if row is None: - return dict( - core_id=None, - pmid=None, - pmcid=None, - wikidata_qid=None, - arxiv_id=None, - jstor_id=None, - ) - row = [str(cell or "") or None for cell in row] - return dict( - core_id=row[0], - pmid=row[1], - pmcid=row[2], - wikidata_qid=row[3], - # TODO: - arxiv_id=None, - jstor_id=None, - ) - def want(self, raw_record: Any) -> bool: return True @@ -330,9 +287,6 @@ class JalcImporter(EntityImporter): # reasonable default for this collection release_type = "article-journal" - # external identifiers - extids = self.lookup_ext_ids(doi=doi) - # extra: # translation_of # aliases @@ -356,12 +310,6 @@ class JalcImporter(EntityImporter): release_year=release_year, ext_ids=fatcat_openapi_client.ReleaseExtIds( doi=doi, - pmid=extids["pmid"], - pmcid=extids["pmcid"], - wikidata_qid=extids["wikidata_qid"], - core=extids["core_id"], - arxiv=extids["arxiv_id"], - jstor=extids["jstor_id"], ), volume=volume, issue=issue, diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py index eb931eb1..5f38e73e 100644 --- a/python/tests/import_crossref.py +++ b/python/tests/import_crossref.py @@ -10,17 +10,13 @@ from fatcat_tools.importers import CrossrefImporter, JsonLinePusher @pytest.fixture(scope="function") def crossref_importer(api): with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file: - yield CrossrefImporter( - api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=True - ) + yield CrossrefImporter(api, issn_file, bezerk_mode=True) @pytest.fixture(scope="function") def crossref_importer_existing(api): with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file: - yield CrossrefImporter( - api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=False - ) + yield CrossrefImporter(api, issn_file, bezerk_mode=False) @pytest.mark.skip( diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index 220dc0f6..b15d14c3 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -30,7 +30,6 @@ def datacite_importer(api): yield DataciteImporter( api, issn_file, - extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=True, ) @@ -41,7 +40,6 @@ def datacite_importer_existing(api): yield DataciteImporter( api, issn_file, - extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=False, ) diff --git a/python/tests/import_jalc.py b/python/tests/import_jalc.py index 4ebc87b4..8281b9a1 100644 --- a/python/tests/import_jalc.py +++ b/python/tests/import_jalc.py @@ -8,17 +8,13 @@ from fatcat_tools.importers import Bs4XmlFilePusher, Bs4XmlLinesPusher, JalcImpo @pytest.fixture(scope="function") def jalc_importer(api): with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file: - yield JalcImporter( - api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=True - ) + yield JalcImporter(api, issn_file, bezerk_mode=True) @pytest.fixture(scope="function") def jalc_importer_existing(api): with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file: - yield JalcImporter( - api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=False - ) + yield JalcImporter(api, issn_file, bezerk_mode=False) def test_jalc_importer(jalc_importer): diff --git a/python/tests/import_jstor.py b/python/tests/import_jstor.py index 8ad550b3..7e13c8b0 100644 --- a/python/tests/import_jstor.py +++ b/python/tests/import_jstor.py @@ -8,17 +8,13 @@ from fatcat_tools.importers import Bs4XmlFilePusher, JstorImporter @pytest.fixture(scope="function") def jstor_importer(api): with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file: - yield JstorImporter( - api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=True - ) + yield JstorImporter(api, issn_file, bezerk_mode=True) @pytest.fixture(scope="function") def jstor_importer_existing(api): with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file: - yield JstorImporter( - api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=False - ) + yield JstorImporter(api, issn_file, bezerk_mode=False) def test_jstor_importer(jstor_importer): diff --git a/python/tests/import_pubmed.py b/python/tests/import_pubmed.py index a5301f29..e783db48 100644 --- a/python/tests/import_pubmed.py +++ b/python/tests/import_pubmed.py @@ -11,7 +11,6 @@ def pubmed_importer(api): yield PubmedImporter( api, issn_file, - extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=True, lookup_refs=True, ) @@ -23,7 +22,6 @@ def pubmed_importer_existing(api): yield PubmedImporter( api, issn_file, - extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=False, lookup_refs=True, ) |