From ba7f9214d2038882952eb50cd4dc5eff4eb0e6ff Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 9 Nov 2021 18:10:35 -0800 Subject: remove deprecated extid sqlite3 lookup table feature from importers This was used during initial bulk imports, but is no longer used and could create serious metadata problems if used accidentally. In retrospect, it also made metadata provenance less transparent, and may have done more harm than good overall. --- python/README_import.md | 3 ++ python/fatcat_import.py | 22 +------------ python/fatcat_tools/importers/crossref.py | 54 ------------------------------- python/fatcat_tools/importers/datacite.py | 54 ------------------------------- python/fatcat_tools/importers/jalc.py | 52 ----------------------------- python/tests/import_crossref.py | 8 ++--- python/tests/import_datacite.py | 2 -- python/tests/import_jalc.py | 8 ++--- python/tests/import_jstor.py | 8 ++--- python/tests/import_pubmed.py | 2 -- 10 files changed, 10 insertions(+), 203 deletions(-) diff --git a/python/README_import.md b/python/README_import.md index 6853a4d7..74e75e14 100644 --- a/python/README_import.md +++ b/python/README_import.md @@ -52,6 +52,7 @@ Usually tens of minutes on fast production machine. Usually 24 hours or so on fast production machine. + # NOTE: `--extid-map-file` was used during initial import, but is now deprecated time xzcat /srv/fatcat/datasets/crossref-works.2018-09-05.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py crossref - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3 ## JALC @@ -59,6 +60,7 @@ Usually 24 hours or so on fast production machine. First import a random subset single threaded to create (most) containers. On a fast machine, this takes a couple minutes. + # NOTE: `--extid-map-file` was used during initial import, but is now deprecated time ./fatcat_import.py jalc /srv/fatcat/datasets/JALC-LOD-20180907.sample10k.rdf /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3 Then, in parallel: @@ -116,6 +118,7 @@ Prep JSON files from sqlite (for parallel import): Run import in parallel: + # NOTE: `--extid-map-file` was used during initial import, but is now deprecated export FATCAT_AUTH_WORKER_CRAWL=...
zcat /srv/fatcat/datasets/s2_doi.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py arabesque --json-file - --extid-type doi --crawl-id DIRECT-OA-CRAWL-2019 --no-require-grobid diff --git a/python/fatcat_import.py b/python/fatcat_import.py index 445acde8..39ef200a 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -54,7 +54,6 @@ def run_crossref(args: argparse.Namespace) -> None: fci = CrossrefImporter( args.api, args.issn_map_file, - extid_map_file=args.extid_map_file, edit_batch_size=args.batch_size, bezerk_mode=args.bezerk_mode, ) @@ -72,7 +71,7 @@ def run_crossref(args: argparse.Namespace) -> None: def run_jalc(args: argparse.Namespace) -> None: - ji = JalcImporter(args.api, args.issn_map_file, extid_map_file=args.extid_map_file) + ji = JalcImporter(args.api, args.issn_map_file) Bs4XmlLinesPusher(ji, args.xml_file, " None: edit_batch_size=args.batch_size, bezerk_mode=args.bezerk_mode, debug=args.debug, - extid_map_file=args.extid_map_file, insert_log_file=args.insert_log_file, ) if args.kafka_mode: @@ -494,12 +492,6 @@ def main() -> None: default=None, type=argparse.FileType("r"), ) - sub_crossref.add_argument( - "--extid-map-file", - help="DOI-to-other-identifiers sqlite3 database", - default=None, - type=str, - ) sub_crossref.add_argument( "--no-lookup-refs", action="store_true", help="skip lookup of references (PMID or DOI)" ) @@ -529,12 +521,6 @@ def main() -> None: default=None, type=argparse.FileType("r"), ) - sub_jalc.add_argument( - "--extid-map-file", - help="DOI-to-other-identifiers sqlite3 database", - default=None, - type=str, - ) sub_arxiv = subparsers.add_parser("arxiv", help="import arxiv.org metadata from XML files") sub_arxiv.set_defaults( @@ -963,12 +949,6 @@ def main() -> None: default=None, type=argparse.FileType("r"), ) - sub_datacite.add_argument( - "--extid-map-file", - help="DOI-to-other-identifiers sqlite3 database", - default=None, - type=str, - ) sub_datacite.add_argument( "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)" ) diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index a41e2bf5..9c69fee3 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -1,5 +1,4 @@ import datetime -import sqlite3 from typing import Any, Dict, List, Optional, Sequence import fatcat_openapi_client @@ -128,8 +127,6 @@ class CrossrefImporter(EntityImporter): """ Importer for Crossref metadata. - Can use a local sqlite3 file for faster "external identifier" lookups - See https://github.com/CrossRef/rest-api-doc for JSON schema notes """ @@ -150,50 +147,8 @@ class CrossrefImporter(EntityImporter): ) self.create_containers: bool = kwargs.get("create_containers", True) - extid_map_file = kwargs.get("extid_map_file") - self.extid_map_db: Optional[Any] = None - if extid_map_file: - db_uri = "file:{}?mode=ro".format(extid_map_file) - print("Using external ID map: {}".format(db_uri)) - self.extid_map_db = sqlite3.connect(db_uri, uri=True) - else: - print("Not using external ID map") - self.read_issn_map_file(issn_map_file) - def lookup_ext_ids(self, doi: str) -> Optional[Any]: - if self.extid_map_db is None: - return dict( - core_id=None, - pmid=None, - pmcid=None, - wikidata_qid=None, - arxiv_id=None, - jstor_id=None, - ) - row = self.extid_map_db.execute( - "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? 
LIMIT 1", [doi.lower()] - ).fetchone() - if row is None: - return dict( - core_id=None, - pmid=None, - pmcid=None, - wikidata_qid=None, - arxiv_id=None, - jstor_id=None, - ) - row = [str(cell or "") or None for cell in row] - return dict( - core_id=row[0], - pmid=row[1], - pmcid=row[2], - wikidata_qid=row[3], - # TODO: - arxiv_id=None, - jstor_id=None, - ) - def map_release_type(self, crossref_type: str) -> Optional[str]: return CROSSREF_TYPE_MAP.get(crossref_type) @@ -473,9 +428,6 @@ class CrossrefImporter(EntityImporter): # unknown release_stage = None - # external identifiers - extids: Dict[str, Any] = self.lookup_ext_ids(doi=obj["DOI"].lower()) or {} - # filter out unreasonably huge releases if len(abstracts) > 100: self.counts["skip-huge-abstracts"] += 1 @@ -538,13 +490,7 @@ class CrossrefImporter(EntityImporter): publisher=publisher, ext_ids=fatcat_openapi_client.ReleaseExtIds( doi=obj["DOI"].lower(), - pmid=extids["pmid"], - pmcid=extids["pmcid"], - wikidata_qid=extids["wikidata_qid"], isbn13=isbn13, - core=extids["core_id"], - arxiv=extids["arxiv_id"], - jstor=extids["jstor_id"], ), volume=clean(obj.get("volume")), issue=clean(obj.get("issue")), diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index d5622960..d4d7a9f5 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -12,7 +12,6 @@ import collections import datetime import json import re -import sqlite3 import sys from typing import Any, Dict, List, Optional, Sequence, Set, Tuple @@ -248,15 +247,6 @@ class DataciteImporter(EntityImporter): ) self.create_containers = kwargs.get("create_containers", True) - extid_map_file = kwargs.get("extid_map_file") - self.extid_map_db = None - if extid_map_file: - db_uri = "file:{}?mode=ro".format(extid_map_file) - print("Using external ID map: {}".format(db_uri), file=sys.stderr) - self.extid_map_db = sqlite3.connect(db_uri, uri=True) - else: - print("Not using external ID map", file=sys.stderr) - self.read_issn_map_file(issn_map_file) self.debug = debug self.insert_log_file = insert_log_file @@ -264,42 +254,6 @@ class DataciteImporter(EntityImporter): print("datacite with debug={}".format(self.debug), file=sys.stderr) - def lookup_ext_ids(self, doi: str) -> Dict[str, Any]: - """ - Return dictionary of identifiers referring to the same things as the given DOI. - """ - if self.extid_map_db is None: - return dict( - core_id=None, - pmid=None, - pmcid=None, - wikidata_qid=None, - arxiv_id=None, - jstor_id=None, - ) - row = self.extid_map_db.execute( - "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()] - ).fetchone() - if row is None: - return dict( - core_id=None, - pmid=None, - pmcid=None, - wikidata_qid=None, - arxiv_id=None, - jstor_id=None, - ) - row = [str(cell or "") or None for cell in row] - return dict( - core_id=row[0], - pmid=row[1], - pmcid=row[2], - wikidata_qid=row[3], - # TODO: - arxiv_id=None, - jstor_id=None, - ) - def parse_record(self, obj: Dict[str, Any]) -> Optional[ReleaseEntity]: """ Mapping datacite JSON to ReleaseEntity. @@ -706,8 +660,6 @@ class DataciteImporter(EntityImporter): if release_month: extra["release_month"] = release_month - extids = self.lookup_ext_ids(doi=doi) - # Assemble release. 
re = fatcat_openapi_client.ReleaseEntity( work_id=None, @@ -722,12 +674,6 @@ class DataciteImporter(EntityImporter): publisher=publisher, ext_ids=fatcat_openapi_client.ReleaseExtIds( doi=doi, - pmid=extids["pmid"], - pmcid=extids["pmcid"], - wikidata_qid=extids["wikidata_qid"], - core=extids["core_id"], - arxiv=extids["arxiv_id"], - jstor=extids["jstor_id"], ), contribs=contribs, volume=volume, diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py index 2f10e533..a737ac9f 100644 --- a/python/fatcat_tools/importers/jalc.py +++ b/python/fatcat_tools/importers/jalc.py @@ -1,5 +1,4 @@ import datetime -import sqlite3 import sys from typing import Any, Dict, List, Optional, Sequence @@ -117,50 +116,8 @@ class JalcImporter(EntityImporter): ) self.create_containers = kwargs.get("create_containers", True) - extid_map_file = kwargs.get("extid_map_file") - self.extid_map_db = None - if extid_map_file: - db_uri = "file:{}?mode=ro".format(extid_map_file) - print("Using external ID map: {}".format(db_uri)) - self.extid_map_db = sqlite3.connect(db_uri, uri=True) - else: - print("Not using external ID map") - self.read_issn_map_file(issn_map_file) - def lookup_ext_ids(self, doi: str) -> Dict[str, Any]: - if self.extid_map_db is None: - return dict( - core_id=None, - pmid=None, - pmcid=None, - wikidata_qid=None, - arxiv_id=None, - jstor_id=None, - ) - row = self.extid_map_db.execute( - "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()] - ).fetchone() - if row is None: - return dict( - core_id=None, - pmid=None, - pmcid=None, - wikidata_qid=None, - arxiv_id=None, - jstor_id=None, - ) - row = [str(cell or "") or None for cell in row] - return dict( - core_id=row[0], - pmid=row[1], - pmcid=row[2], - wikidata_qid=row[3], - # TODO: - arxiv_id=None, - jstor_id=None, - ) - def want(self, raw_record: Any) -> bool: return True @@ -330,9 +287,6 @@ class JalcImporter(EntityImporter): # reasonable default for this collection release_type = "article-journal" - # external identifiers - extids = self.lookup_ext_ids(doi=doi) - # extra: # translation_of # aliases @@ -356,12 +310,6 @@ class JalcImporter(EntityImporter): release_year=release_year, ext_ids=fatcat_openapi_client.ReleaseExtIds( doi=doi, - pmid=extids["pmid"], - pmcid=extids["pmcid"], - wikidata_qid=extids["wikidata_qid"], - core=extids["core_id"], - arxiv=extids["arxiv_id"], - jstor=extids["jstor_id"], ), volume=volume, issue=issue, diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py index eb931eb1..5f38e73e 100644 --- a/python/tests/import_crossref.py +++ b/python/tests/import_crossref.py @@ -10,17 +10,13 @@ from fatcat_tools.importers import CrossrefImporter, JsonLinePusher @pytest.fixture(scope="function") def crossref_importer(api): with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file: - yield CrossrefImporter( - api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=True - ) + yield CrossrefImporter(api, issn_file, bezerk_mode=True) @pytest.fixture(scope="function") def crossref_importer_existing(api): with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file: - yield CrossrefImporter( - api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=False - ) + yield CrossrefImporter(api, issn_file, bezerk_mode=False) @pytest.mark.skip( diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index 220dc0f6..b15d14c3 100644 --- a/python/tests/import_datacite.py +++ 
b/python/tests/import_datacite.py @@ -30,7 +30,6 @@ def datacite_importer(api): yield DataciteImporter( api, issn_file, - extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=True, ) @@ -41,7 +40,6 @@ def datacite_importer_existing(api): yield DataciteImporter( api, issn_file, - extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=False, ) diff --git a/python/tests/import_jalc.py b/python/tests/import_jalc.py index 4ebc87b4..8281b9a1 100644 --- a/python/tests/import_jalc.py +++ b/python/tests/import_jalc.py @@ -8,17 +8,13 @@ from fatcat_tools.importers import Bs4XmlFilePusher, Bs4XmlLinesPusher, JalcImpo @pytest.fixture(scope="function") def jalc_importer(api): with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file: - yield JalcImporter( - api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=True - ) + yield JalcImporter(api, issn_file, bezerk_mode=True) @pytest.fixture(scope="function") def jalc_importer_existing(api): with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file: - yield JalcImporter( - api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=False - ) + yield JalcImporter(api, issn_file, bezerk_mode=False) def test_jalc_importer(jalc_importer): diff --git a/python/tests/import_jstor.py b/python/tests/import_jstor.py index 8ad550b3..7e13c8b0 100644 --- a/python/tests/import_jstor.py +++ b/python/tests/import_jstor.py @@ -8,17 +8,13 @@ from fatcat_tools.importers import Bs4XmlFilePusher, JstorImporter @pytest.fixture(scope="function") def jstor_importer(api): with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file: - yield JstorImporter( - api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=True - ) + yield JstorImporter(api, issn_file, bezerk_mode=True) @pytest.fixture(scope="function") def jstor_importer_existing(api): with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file: - yield JstorImporter( - api, issn_file, extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=False - ) + yield JstorImporter(api, issn_file, bezerk_mode=False) def test_jstor_importer(jstor_importer): diff --git a/python/tests/import_pubmed.py b/python/tests/import_pubmed.py index a5301f29..e783db48 100644 --- a/python/tests/import_pubmed.py +++ b/python/tests/import_pubmed.py @@ -11,7 +11,6 @@ def pubmed_importer(api): yield PubmedImporter( api, issn_file, - extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=True, lookup_refs=True, ) @@ -23,7 +22,6 @@ def pubmed_importer_existing(api): yield PubmedImporter( api, issn_file, - extid_map_file="tests/files/example_map.sqlite3", bezerk_mode=False, lookup_refs=True, ) -- cgit v1.2.3 From 2fd90ad2cc561fa743a617315824b2744f737575 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 9 Nov 2021 18:12:39 -0800 Subject: clean_doi: stop mutating double-slash DOIs, except for 10.1037 prefix --- python/fatcat_tools/normal.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py index 34e5c3d1..0d2c84ce 100644 --- a/python/fatcat_tools/normal.py +++ b/python/fatcat_tools/normal.py @@ -47,7 +47,7 @@ def clean_doi(raw: Optional[str]) -> Optional[str]: raw = raw[8:] if raw.startswith("dx.doi.org/"): raw = raw[11:] - if raw[7:9] == "//": + if raw[7:9] == "//" and "10.1037//" in raw: raw = raw[:8] + raw[9:] # fatcatd uses same REGEX, but Rust regex rejects these characters, while @@ -74,6 +74,7 @@ def test_clean_doi() -> None: assert 
clean_doi("10.1234/asdf ") == "10.1234/asdf" assert clean_doi("10.1037//0002-9432.72.1.50") == "10.1037/0002-9432.72.1.50" assert clean_doi("10.1037/0002-9432.72.1.50") == "10.1037/0002-9432.72.1.50" + assert clean_doi("10.1026//1616-1041.3.2.86") == "10.1026//1616-1041.3.2.86" assert clean_doi("10.23750/abm.v88i2 -s.6506") is None assert clean_doi("10.17167/mksz.2017.2.129–155") is None assert clean_doi("http://doi.org/10.1234/asdf ") == "10.1234/asdf" -- cgit v1.2.3 From 1024e688bb12d64648ceb638daf049d508f87561 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 9 Nov 2021 18:13:15 -0800 Subject: importers: use clean_doi() in many more (all?) importers --- python/fatcat_tools/importers/arxiv.py | 6 ++++-- python/fatcat_tools/importers/cdl_dash_dat.py | 6 ++++-- python/fatcat_tools/importers/common.py | 5 ++--- python/fatcat_tools/importers/crossref.py | 9 ++++++++- python/fatcat_tools/importers/grobid_metadata.py | 9 ++++++--- python/fatcat_tools/importers/jstor.py | 6 +++++- 6 files changed, 29 insertions(+), 12 deletions(-) diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py index 1d50dd9a..dd2c2284 100644 --- a/python/fatcat_tools/importers/arxiv.py +++ b/python/fatcat_tools/importers/arxiv.py @@ -9,6 +9,8 @@ from bs4 import BeautifulSoup from fatcat_openapi_client import ApiClient, ReleaseEntity from pylatexenc.latex2text import LatexNodes2Text +from fatcat_tools.normal import clean_doi + from .common import EntityImporter from .crossref import lookup_license_slug @@ -127,8 +129,8 @@ class ArxivRawImporter(EntityImporter): base_id = metadata.id.string doi = None if metadata.doi and metadata.doi.string: - doi = metadata.doi.string.lower().split()[0].strip() - if not (doi.startswith("10.") and "/" in doi and doi.split("/")[1]): + doi = clean_doi(metadata.doi.string.lower().split()[0].strip()) + if doi and not (doi.startswith("10.") and "/" in doi and doi.split("/")[1]): sys.stderr.write("BOGUS DOI: {}\n".format(doi)) doi = None title = latex_to_text(metadata.title.get_text().replace("\n", " ")) diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py index 1a4114a0..ec557e15 100755 --- a/python/fatcat_tools/importers/cdl_dash_dat.py +++ b/python/fatcat_tools/importers/cdl_dash_dat.py @@ -23,6 +23,8 @@ from fatcat_openapi_client import ( ReleaseExtIds, ) +from fatcat_tools.normal import clean_doi + from .common import clean from .crossref import lookup_license_slug @@ -78,8 +80,8 @@ def cdl_dash_release( extra = dict() assert meta["identifier"]["type"] == "DOI" - doi = meta["identifier"]["value"].lower() - assert doi.startswith("10.") + doi = clean_doi(meta["identifier"]["value"].lower()) + assert doi and doi.startswith("10.") ark_id = None for extid in meta.get("alternativeIdentifiers", []): diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index fd472d11..425b6f13 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -29,7 +29,7 @@ from fuzzycat.matching import match_release_fuzzy # TODO: refactor so remove need for this (re-imports for backwards compatibility) from fatcat_tools.normal import is_cjk # noqa: F401 -from fatcat_tools.normal import LANG_MAP_MARC, b32_hex # noqa: F401 +from fatcat_tools.normal import LANG_MAP_MARC, b32_hex, clean_doi # noqa: F401 from fatcat_tools.normal import clean_str as clean # noqa: F401 from fatcat_tools.transforms import entity_to_dict @@ -342,8 +342,7 @@ class 
EntityImporter: return creator_id def is_doi(self, doi: str) -> bool: - # TODO: replace with clean_doi() from fatcat_tools.normal - return doi.startswith("10.") and doi.count("/") >= 1 + return clean_doi(doi) is not None def lookup_doi(self, doi: str) -> Optional[str]: """Caches calls to the doi lookup API endpoint in a local dict diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index 9c69fee3..c9f251fc 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -4,6 +4,8 @@ from typing import Any, Dict, List, Optional, Sequence import fatcat_openapi_client from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity +from fatcat_tools.normal import clean_doi + from .common import EntityImporter, clean # The docs/guide should be the canonical home for these mappings; update there @@ -467,6 +469,11 @@ class CrossrefImporter(EntityImporter): self.counts["skip-blank-title"] += 1 return None + doi = clean_doi(obj["DOI"].lower()) + if not doi: + self.counts["skip-bad-doi"] += 1 + return None + subtitle = None if obj.get("subtitle"): subtitle = clean(obj["subtitle"][0], force_xml=True) @@ -489,7 +496,7 @@ class CrossrefImporter(EntityImporter): release_year=release_year, publisher=publisher, ext_ids=fatcat_openapi_client.ReleaseExtIds( - doi=obj["DOI"].lower(), + doi=doi, isbn13=isbn13, ), volume=clean(obj.get("volume")), diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py index e36e1b48..7c595787 100644 --- a/python/fatcat_tools/importers/grobid_metadata.py +++ b/python/fatcat_tools/importers/grobid_metadata.py @@ -7,6 +7,8 @@ from typing import Any, Dict, List, Optional import fatcat_openapi_client from fatcat_openapi_client import ApiClient, FileEntity, ReleaseEntity +from fatcat_tools.normal import clean_doi + from .common import EntityImporter, clean, make_rel_url MAX_ABSTRACT_BYTES = 4096 @@ -133,9 +135,10 @@ class GrobidMetadataImporter(EntityImporter): # only returns year, ever? 
release_year = int(obj["date"][:4]) - extra = dict() - if obj.get("doi"): - extra["doi"] = obj["doi"] + extra: Dict[str, Any] = dict() + doi = clean_doi(obj.get("doi")) + if doi: + extra["doi"] = doi if obj["journal"] and obj["journal"].get("name"): extra["container_name"] = clean(obj["journal"]["name"]) diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py index 2c8aa0a4..ca1f2466 100644 --- a/python/fatcat_tools/importers/jstor.py +++ b/python/fatcat_tools/importers/jstor.py @@ -8,6 +8,8 @@ import fatcat_openapi_client from bs4 import BeautifulSoup from fatcat_openapi_client import ApiClient, ReleaseEntity +from fatcat_tools.normal import clean_doi + from .common import LANG_MAP_MARC, EntityImporter, clean from .crossref import CONTAINER_TYPE_MAP @@ -146,7 +148,9 @@ class JstorImporter(EntityImporter): doi = article_meta.find("article-id", {"pub-id-type": "doi"}) if doi: - doi = doi.string.lower().strip() + doi = clean_doi(doi.string.lower()) + else: + doi = None jstor_id = article_meta.find("article-id", {"pub-id-type": "jstor"}) if jstor_id: -- cgit v1.2.3 From 23fd36a3e8505c1ed6d13367a3fb62a8bf2242d7 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 9 Nov 2021 18:14:58 -0800 Subject: add notes about 'double slash in DOI' issue --- notes/cleanups/double_slash_dois.md | 46 +++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 notes/cleanups/double_slash_dois.md diff --git a/notes/cleanups/double_slash_dois.md b/notes/cleanups/double_slash_dois.md new file mode 100644 index 00000000..d4e9ded6 --- /dev/null +++ b/notes/cleanups/double_slash_dois.md @@ -0,0 +1,46 @@ + +Relevant github issue: https://github.com/internetarchive/fatcat/issues/48 + + +## Investigate + +At least some of these DOIs actually seem valid, like +`10.1026//1616-1041.3.2.86`. So shouldn't be re-writing them! + + zcat release_extid.tsv.gz \ + | cut -f1,3 \ + | rg '\t10\.\d+//' \ + | wc -l + # 59,904 + + zcat release_extid.tsv.gz \ + | cut -f1,3 \ + | rg '\t10\.\d+//' \ + | pv -l \ + > doubleslash_dois.tsv + +Which prefixes have the most double slashes? + + cat doubleslash_dois.tsv | cut -f2 | cut -d/ -f1 | sort | uniq -c | sort -nr | head + 51220 10.1037 + 2187 10.1026 + 1316 10.1024 + 826 10.1027 + 823 10.14505 + 443 10.17010 + 186 10.46925 + 163 10.37473 + 122 10.18376 + 118 10.29392 + [...] + +All of the 10.1037 DOIs seem to be registered with Crossref, and at least some +have redirects to the not-with-double-slash versions. Not all doi.org lookups +include a redirect. + +I think the "correct thing to do" here is to add special-case handling for the +pubmed and crossref importers, and in any other case allow double slashes. + +Not clear that there are any specific cleanups to be done for now. A broader +"verify that DOIs are actually valid" push and cleanup would make sense; if +that happens checking for mangled double-slash DOIs would make sense. -- cgit v1.2.3 From c133f3077aa975aa4706a8e5ca894fc1b71fbc67 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 10 Aug 2020 17:27:26 -0700 Subject: datacite import: store less subject metadata Many of these 'subject' objects have the equivalent of several lines of text, with complex URLs that don't compress well. I think it is fine we have included these thus far instead of parsing more deeply, but going forward I don't think this nested 'extra' metadata is worth the database space. 
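
As a rough sketch of the intended effect: the `subjects` fragment below is invented for
illustration (only the `subjects` list and the `schemeUri` key come from the importer
code; the other keys are typical of DataCite records), but the filter expression is the
same one introduced in this commit.

    # hypothetical DataCite 'attributes' fragment, for illustration only
    attributes = {
        "subjects": [
            # a plain, human-entered keyword: kept
            {"subject": "Paleoclimatology"},
            # a scheme-heavy entry with long URLs: dropped
            {
                "subject": "FOS: Earth and related environmental sciences",
                "subjectScheme": "Fields of Science and Technology (FOS)",
                "schemeUri": "http://www.oecd.org/science/inno/38235147.pdf",
            },
        ]
    }

    # keep only subjects without a schemeUri, as in the diff below
    extra_subjects = [
        subj for subj in attributes["subjects"] if not subj.get("schemeUri")
    ]
    assert extra_subjects == [{"subject": "Paleoclimatology"}]
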
--- python/fatcat_tools/importers/datacite.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index d4d7a9f5..fe02cac4 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -597,7 +597,13 @@ class DataciteImporter(EntityImporter): if license_extra: extra_datacite["license"] = license_extra if attributes.get("subjects"): - extra_datacite["subjects"] = attributes["subjects"] + # these subjects with schemeUri are too much metadata, which + # doesn't compress. filter them out. + extra_subjects = [ + subj for subj in attributes["subjects"] if not subj.get("schemeUri") + ] + if extra_subjects: + extra_datacite["subjects"] = extra_subjects # Include version information. metadata_version = attributes.get("metadataVersion") or "" -- cgit v1.2.3 From ab4e1355bf93e3755985f1b5cd2589a78601d253 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 10 Nov 2021 13:08:23 -0800 Subject: remove cdl_dash_dat and wayback_static importers Cleaning out dead code. These importers were used to create demonstration fileset and webcapture entities early in development. They have been replaced by the fileset and webcapture ingest importers. --- python/fatcat_import.py | 86 ------- python/fatcat_tools/importers/__init__.py | 2 - python/fatcat_tools/importers/cdl_dash_dat.py | 221 ------------------ python/fatcat_tools/importers/wayback_static.py | 287 ------------------------ 4 files changed, 596 deletions(-) delete mode 100755 python/fatcat_tools/importers/cdl_dash_dat.py delete mode 100755 python/fatcat_tools/importers/wayback_static.py diff --git a/python/fatcat_import.py b/python/fatcat_import.py index 39ef200a..33679868 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -42,8 +42,6 @@ from fatcat_tools.importers import ( SavePaperNowWebImporter, ShadowLibraryImporter, SqlitePusher, - auto_cdl_dash_dat, - auto_wayback_static, ) # Yep, a global. 
Gets DSN from `SENTRY_DSN` environment variable @@ -315,53 +313,6 @@ def run_shadow_lib(args: argparse.Namespace) -> None: JsonLinePusher(fmi, args.json_file).run() -def run_wayback_static(args: argparse.Namespace) -> None: - api = args.api - - # find the release - if args.release_id: - release_id = args.release_id - elif args.extid: - idtype = args.extid.split(":")[0] - extid = ":".join(args.extid.split(":")[1:]) - if idtype == "doi": - release_id = api.lookup_release(doi=extid).ident - elif idtype == "pmid": - release_id = api.lookup_release(pmid=extid).ident - elif idtype == "wikidata": - release_id = api.lookup_release(wikidata_qid=extid).ident - else: - raise NotImplementedError("extid type: {}".format(idtype)) - else: - raise Exception("need either release_id or extid argument") - - # create it - (editgroup_id, wc) = auto_wayback_static( - api, release_id, args.wayback_url, editgroup_id=args.editgroup_id - ) - if not wc: - return - print("release_id: {}".format(release_id)) - print("editgroup_id: {}".format(editgroup_id)) - print("webcapture id: {}".format(wc.ident)) - print("link: https://fatcat.wiki/webcapture/{}".format(wc.ident)) - - -def run_cdl_dash_dat(args: argparse.Namespace) -> None: - api = args.api - - # create it - (editgroup_id, release, fs) = auto_cdl_dash_dat( - api, args.dat_path, release_id=args.release_id, editgroup_id=args.editgroup_id - ) - if not (fs and release): - return - print("release_id: {}".format(release.ident)) - print("editgroup_id: {}".format(editgroup_id)) - print("fileset id: {}".format(fs.ident)) - print("link: https://fatcat.wiki/fileset/{}".format(fs.ident)) - - def run_datacite(args: argparse.Namespace) -> None: dci = DataciteImporter( args.api, @@ -899,43 +850,6 @@ def main() -> None: type=argparse.FileType("r"), ) - sub_wayback_static = subparsers.add_parser( - "wayback-static", help="crude crawl+ingest tool for single-page HTML docs from wayback" - ) - sub_wayback_static.set_defaults( - func=run_wayback_static, - auth_var="FATCAT_API_AUTH_TOKEN", - ) - sub_wayback_static.add_argument( - "wayback_url", type=str, help="URL of wayback capture to extract from" - ) - sub_wayback_static.add_argument( - "--extid", type=str, help="external identifier for release lookup" - ) - sub_wayback_static.add_argument("--release-id", type=str, help="release entity identifier") - sub_wayback_static.add_argument( - "--editgroup-id", - type=str, - help="use existing editgroup (instead of creating a new one)", - ) - - sub_cdl_dash_dat = subparsers.add_parser( - "cdl-dash-dat", help="crude helper to import datasets from Dat/CDL mirror pilot project" - ) - sub_cdl_dash_dat.set_defaults( - func=run_cdl_dash_dat, - auth_var="FATCAT_API_AUTH_TOKEN", - ) - sub_cdl_dash_dat.add_argument( - "dat_path", type=str, help="local path dat to import (must be the dat discovery key)" - ) - sub_cdl_dash_dat.add_argument("--release-id", type=str, help="release entity identifier") - sub_cdl_dash_dat.add_argument( - "--editgroup-id", - type=str, - help="use existing editgroup (instead of creating a new one)", - ) - sub_datacite = subparsers.add_parser("datacite", help="import datacite.org metadata") sub_datacite.add_argument( "json_file", diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py index 06ecfd58..223ae526 100644 --- a/python/fatcat_tools/importers/__init__.py +++ b/python/fatcat_tools/importers/__init__.py @@ -13,7 +13,6 @@ To run an import you combine two classes; one each of: from .arabesque import ARABESQUE_MATCH_WHERE_CLAUSE, 
ArabesqueMatchImporter from .arxiv import ArxivRawImporter -from .cdl_dash_dat import auto_cdl_dash_dat from .chocula import ChoculaImporter from .common import ( LANG_MAP_MARC, @@ -55,4 +54,3 @@ from .matched import MatchedImporter from .orcid import OrcidImporter from .pubmed import PubmedImporter from .shadow import ShadowLibraryImporter -from .wayback_static import auto_wayback_static diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py deleted file mode 100755 index ec557e15..00000000 --- a/python/fatcat_tools/importers/cdl_dash_dat.py +++ /dev/null @@ -1,221 +0,0 @@ -#!/usr/bin/env python3 - -import hashlib -import json -import mimetypes -import os -import subprocess -import sys -import urllib -import urllib.parse -from typing import Any, Dict, List, Optional, Tuple - -import fatcat_openapi_client -import magic -from fatcat_openapi_client import ( - ApiClient, - Editgroup, - FilesetEntity, - FilesetFile, - ReleaseAbstract, - ReleaseContrib, - ReleaseEntity, - ReleaseExtIds, -) - -from fatcat_tools.normal import clean_doi - -from .common import clean -from .crossref import lookup_license_slug - - -def single_file(prefix: str, path: str) -> FilesetFile: - - full = prefix + path - size_bytes = os.stat(full).st_size - - hashes = [ - hashlib.md5(), - hashlib.sha1(), - hashlib.sha256(), - ] - with open(full, "rb") as fp: - while True: - data = fp.read(2 ** 20) - if not data: - break - for h in hashes: - h.update(data) - mime = magic.Magic(mime=True).from_file(full) - if mime == "application/octet-stream": - # magic apparently isn't that great; try using filename as well - guess = mimetypes.guess_type(full)[0] - if guess: - mime = guess - - fsf = FilesetFile( - path=path, - size=size_bytes, - md5=hashes[0].hexdigest(), - sha1=hashes[1].hexdigest(), - sha256=hashes[2].hexdigest(), - extra=dict(mimetype=mime), - ) - return fsf - - -def make_manifest(base_dir: str) -> List[FilesetFile]: - manifest = [] - for root, dirs, files in os.walk(base_dir): - for f in files: - manifest.append(single_file(root, f)) - return manifest - - -def cdl_dash_release( - meta: Dict[str, Any], extra: Optional[Dict[str, Any]] = None -) -> ReleaseEntity: - - if not extra: - extra = dict() - - assert meta["identifier"]["type"] == "DOI" - doi = clean_doi(meta["identifier"]["value"].lower()) - assert doi and doi.startswith("10.") - - ark_id = None - for extid in meta.get("alternativeIdentifiers", []): - if extid["value"].startswith("ark:"): - ark_id = extid["value"] - assert ark_id - - license_slug = lookup_license_slug(meta["rights"]["uri"]) - - abstracts = [] - for desc in meta["descriptions"]: - if desc["type"] == "abstract": - abstracts.append( - ReleaseAbstract(mimetype="text/html", content=clean(desc["value"])) - ) - # print(abstracts) - - contribs = [] - for creator in meta["creator"]: - contribs.append( - ReleaseContrib( - given_name=creator["given"], - surname=creator["family"], - # sorry everybody - raw_name="{} {}".format(creator["given"], creator["family"]), - raw_affiliation=creator.get("affiliation"), - role="author", # presumably, for these datasets? 
- ) - ) - - r = ReleaseEntity( - ext_ids=ReleaseExtIds( - doi=doi, - ark=ark_id, - ), - title=clean(meta["title"], force_xml=True), - publisher=clean(meta["publisher"]), - release_year=int(meta["publicationYear"]), - release_type="dataset", - license_slug=license_slug, - contribs=contribs, - abstracts=abstracts or None, - extra=extra, - ) - return r - - -def make_release_fileset(dat_path: str) -> Tuple[ReleaseEntity, FilesetEntity]: - - if dat_path.endswith("/"): - dat_path = dat_path[:-1] - dat_discovery = dat_path - extra = dict() - assert len(dat_discovery) == 64 - - with open(dat_path + "/cdl_dash_metadata.json", "r") as fp: - meta_dict = json.loads(fp.read()) - - release = cdl_dash_release(meta_dict) - ark_id = release.extra["ark_id"] - - dash_version = None - # really crude XML parse-out - with open(dat_path + "/stash-wrapper.xml", "r") as fp: - for line in fp: - line = line.strip() - if line.startswith(""): - dash_version = int(line[19:].split("<")[0]) - assert dash_version is not None - extra["cdl_dash"] = dict(version=dash_version) - release.extra["cdl_dash"] = dict(version=dash_version) - - manifest = make_manifest(dat_path + "/files/") - - bundle_url = dict( - url="https://merritt.cdlib.org/u/{}/{}".format( - urllib.parse.quote(ark_id, safe=""), dash_version - ), - rel="repo-bundle", - ) - repo_url = dict( - url="https://merritt.cdlib.org/d/{}/{}/".format( - urllib.parse.quote(ark_id, safe=""), dash_version - ), - rel="repo", - ) - dat_url = dict(url="dat://{}/files/".format(dat_discovery), rel="dweb") - fs = FilesetEntity( - urls=[bundle_url, repo_url, dat_url], release_ids=None, manifest=manifest, extra=extra - ) - return (release, fs) - - -def auto_cdl_dash_dat( - api: ApiClient, - dat_path: str, - release_id: Optional[str] = None, - editgroup_id: Optional[str] = None, -) -> Tuple[Optional[str], Optional[ReleaseEntity], Optional[FilesetEntity]]: - - git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8") - - (release, fileset) = make_release_fileset(dat_path) - - if not editgroup_id: - eg = api.create_editgroup( - Editgroup( - description="One-off import of dataset(s) from CDL/DASH repository (via IA, Dat dweb pilot project)", - extra=dict(git_rev=git_rev, agent="fatcat_tools.auto_cdl_dash_dat"), - ) - ) - editgroup_id = eg.editgroup_id - - if not release_id and release.ext_ids.doi: - try: - r = api.lookup_release(doi=release.ext_ids.doi) - release_id = r.ident - except fatcat_openapi_client.rest.ApiException: - pass - if not release_id: - edit = api.create_release(eg.editgroup_id, release) - release_id = edit.ident - - release = api.get_release(release_id, expand="filesets") - if len(release.filesets): - print("A fileset already exists for release {}".format(release.ident)) - return (None, None, None) - - fileset.release_ids = [release.ident] - edit = api.create_fileset(eg.editgroup_id, fileset) - fileset = api.get_fileset(edit.ident) - return (editgroup_id, release, fileset) - - -if __name__ == "__main__": - # pass this a discovery key that has been cloned to the local directory - print(make_release_fileset(sys.argv[1])) diff --git a/python/fatcat_tools/importers/wayback_static.py b/python/fatcat_tools/importers/wayback_static.py deleted file mode 100755 index 5caed2c7..00000000 --- a/python/fatcat_tools/importers/wayback_static.py +++ /dev/null @@ -1,287 +0,0 @@ -#!/usr/bin/env python3 - -""" -Helpers to create Web Capture entities from extracted wayback content. - -Works as a stand-alone script (for debugging) or as library routines. 
-""" - -import argparse -import datetime -import hashlib -import json -import subprocess -import sys -from typing import Any, Dict, List, Optional, Tuple - -import requests -from bs4 import BeautifulSoup -from fatcat_openapi_client import ( - ApiClient, - Editgroup, - EntityEdit, - WebcaptureCdxLine, - WebcaptureEntity, - WebcaptureUrl, -) - -from .common import b32_hex - -CDX_API_BASE = "https://web.archive.org/cdx/search/cdx" -GWB_URL_BASE = "https://web.archive.org/web" -REQ_SESSION = requests.Session() - - -def parse_wbm_url(url: str) -> Tuple[str, datetime.datetime, str]: - """Takes a wayback machine URL, and returns a tuple: - - (timestamp, datetime, original_url) - """ - chunks = url.split("/") - assert len(chunks) >= 6 - assert chunks[2] == "web.archive.org" - assert chunks[3] == "web" - return (chunks[4], parse_wbm_timestamp(chunks[4]), "/".join(chunks[5:])) - - -def test_parse_wbm_url() -> None: - u = "http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html" - assert parse_wbm_url(u) == ( - "20010712114837", - datetime.datetime(2001, 7, 12, 11, 48, 37), - "http://www.dlib.org/dlib/june01/reich/06reich.html", - ) - - -def parse_wbm_timestamp(timestamp: str) -> datetime.datetime: - """ - Takes a complete WBM timestamp string (like "20020327115625") and returns a - python datetime object (UTC) - """ - # strip any "im_" or "id_" suffix - if timestamp.endswith("_"): - timestamp = timestamp[:-3] - # inflexible; require the full second-precision timestamp - assert len(timestamp) == 14 - return datetime.datetime( - year=int(timestamp[0:4]), - month=int(timestamp[4:6]), - day=int(timestamp[6:8]), - hour=int(timestamp[8:10]), - minute=int(timestamp[10:12]), - second=int(timestamp[12:14]), - ) - - -def test_parse_wbm_timestamp() -> None: - assert parse_wbm_timestamp("20010712114837") == datetime.datetime(2001, 7, 12, 11, 48, 37) - - -def fetch_wbm(url: str) -> bytes: - resp = REQ_SESSION.get(url) - resp.raise_for_status() - assert resp.content - return resp.content - - -def lookup_cdx( - embed_url: str, verify_hashes: bool = True, cdx_output: Any = None -) -> Optional[WebcaptureCdxLine]: - sys.stderr.write(embed_url + "\n") - assert embed_url.startswith("/web/") - embed_url_segments = embed_url.split("/") - timestamp = embed_url_segments[2] - if timestamp.endswith("_"): - timestamp = timestamp[:-3] - url = "/".join(embed_url_segments[3:]) - # print((timestamp, url)) - params: Dict = dict( - url=url, - closest=timestamp, - sort="closest", - resolveRevisits="true", - matchType="exact", - limit=1, - ) - resp = REQ_SESSION.get( - CDX_API_BASE, - params=params, - ) - resp.raise_for_status() - # print(resp.url) - if resp.content: - hit = resp.content.decode("utf-8").split("\n")[0] - if cdx_output: - cdx_output.write(hit + "\n") - cdx_chunks = hit.split(" ") - cdx = [x if (x and x != "-") else None for x in cdx_chunks] - webcapture_cdx = WebcaptureCdxLine( - surt=cdx[0], - timestamp=parse_wbm_timestamp(cdx[1] or "").isoformat() + "Z", - url=cdx[2], - mimetype=cdx[3], - status_code=int(cdx[4] or ""), - sha1=b32_hex(cdx[5] or ""), - sha256=None, - ) - if verify_hashes: - resp = REQ_SESSION.get( - GWB_URL_BASE + "/{}id_/{}".format(cdx[1], webcapture_cdx.url) # raw timestamp - ) - resp.raise_for_status() - assert webcapture_cdx.sha1 == hashlib.sha1(resp.content).digest().hex() - webcapture_cdx.sha256 = hashlib.sha256(resp.content).digest().hex() - webcapture_cdx.size = len(resp.content) - return webcapture_cdx - else: - return None - - -def 
wayback_url_to_relative(url: str) -> Optional[str]: - """ - Wayback URLs can be relative or absolute in rewritten documents. This - function converts any form of rewritten URL to a relative (to - web.archive.org) one, or returns None if it isn't a rewritten URL at all. - """ - if url.startswith("https://web.archive.org/"): - url = url[23:] - elif url.startswith("http://web.archive.org/"): - url = url[22:] - - if url.startswith("/web/"): - return url - else: - return None - - -def extract_embeds(soup: BeautifulSoup) -> List[str]: - - embeds = set() - - # - for tag in soup.find_all("link", href=True): - if tag["rel"] not in ("stylesheet",): - continue - url = wayback_url_to_relative(tag["href"]) - if url: - embeds.add(url) - # - for tag in soup.find_all("img", src=True): - url = wayback_url_to_relative(tag["src"]) - if url: - embeds.add(url) - - #