author     bnewbold <bnewbold@archive.org>  2021-11-11 01:12:18 +0000
committer  bnewbold <bnewbold@archive.org>  2021-11-11 01:12:18 +0000
commit     6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4 (patch)
tree       1b80344125152b46ae727dc8bbff73cc12abfd3e /python/fatcat_import.py
parent     7e3f91f1a49ea85707cae31125021ba761f5373d (diff)
parent     6eaf4f57c1f92b6f4f46adc38e5b39fd30b65d81 (diff)
download   fatcat-6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4.tar.gz
           fatcat-6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4.zip
Merge branch 'bnewbold-import-refactors' into 'master'
import refactors and deprecations
Some of these are from old stale branches (the Datacite subject metadata patch), but most are from yesterday and today. It's something of a hodge-podge, but the general theme is working through deferred cleanups and refactors specific to importer code before making some behavioral changes.
The Datacite-specific stuff could use review here.
Remove unused/deprecated/dead code:
- cdl_dash_dat and wayback_static importers, which were for specific early example entities and have been superseded by other importers
- "extid map" sqlite3 feature from several importers, was only used for initial bulk imports (and maybe should not have been used)
Refactors:
- moved a number of large data structures out of importer code and into a dedicated static file (`biblio_lookup_tables.py`). Not all of them were moved, just the ones that were either generic or so large that they made the code hard to read (see the sketch after this list)
- shuffled around relative imports and renamed some functions (`clean_str` instead of `clean`)
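The shape of that refactor, as a hedged sketch: the table entries below are illustrative, not the actual contents of `biblio_lookup_tables.py`, and `slug_for_license_url` is a hypothetical helper rather than a function from the fatcat codebase.

```python
# biblio_lookup_tables.py -- dedicated home for large static tables that
# previously cluttered individual importer modules. Entries are examples only.
from typing import Optional

LICENSE_SLUG_MAP = {
    "//creativecommons.org/licenses/by/4.0": "CC-BY",
    "//creativecommons.org/publicdomain/zero/1.0": "CC-0",
}

def slug_for_license_url(url: str) -> Optional[str]:
    """Normalize a license URL to a short slug, if it is a known license."""
    # drop the scheme so http:// and https:// variants match the same key
    for scheme in ("http:", "https:"):
        if url.startswith(scheme):
            url = url[len(scheme):]
    return LICENSE_SLUG_MAP.get(url.rstrip("/"))
```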
Some actual behavioral changes:
- remove some Datacite-specific license slugs
- stop trying to fix double-slashes in DOIs; that was causing more harm than good (some DOIs really do contain double-slashes!); see the sketch after this list
- remove some excess metadata from datacite 'extra' fields
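To make the double-slash point concrete, here is a minimal sketch of a DOI cleaner consistent with the new behavior. This is an illustration with simplified validation, not the actual `clean_doi()` implementation in fatcat.

```python
from typing import Optional

def clean_doi(raw: str) -> Optional[str]:
    doi = raw.strip().lower()
    if doi.startswith("doi:"):
        doi = doi[4:]
    if not doi.startswith("10.") or "/" not in doi:
        return None
    # NOTE: no doi.replace("//", "/") here; some registered DOIs really do
    # contain double-slashes, and collapsing them broke lookups for those
    # records
    return doi

assert clean_doi("doi:10.1234/ABC//def") == "10.1234/abc//def"
```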
Diffstat (limited to 'python/fatcat_import.py')
-rwxr-xr-x  python/fatcat_import.py  108
1 file changed, 1 insertion(+), 107 deletions(-)
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 445acde8..33679868 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -42,8 +42,6 @@ from fatcat_tools.importers import (
     SavePaperNowWebImporter,
     ShadowLibraryImporter,
     SqlitePusher,
-    auto_cdl_dash_dat,
-    auto_wayback_static,
 )
 
 # Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
@@ -54,7 +52,6 @@ def run_crossref(args: argparse.Namespace) -> None:
     fci = CrossrefImporter(
         args.api,
         args.issn_map_file,
-        extid_map_file=args.extid_map_file,
         edit_batch_size=args.batch_size,
         bezerk_mode=args.bezerk_mode,
     )
@@ -72,7 +69,7 @@ def run_crossref(args: argparse.Namespace) -> None:
 
 
 def run_jalc(args: argparse.Namespace) -> None:
-    ji = JalcImporter(args.api, args.issn_map_file, extid_map_file=args.extid_map_file)
+    ji = JalcImporter(args.api, args.issn_map_file)
     Bs4XmlLinesPusher(ji, args.xml_file, "<rdf:Description").run()
 
 
@@ -316,53 +313,6 @@ def run_shadow_lib(args: argparse.Namespace) -> None:
     JsonLinePusher(fmi, args.json_file).run()
 
 
-def run_wayback_static(args: argparse.Namespace) -> None:
-    api = args.api
-
-    # find the release
-    if args.release_id:
-        release_id = args.release_id
-    elif args.extid:
-        idtype = args.extid.split(":")[0]
-        extid = ":".join(args.extid.split(":")[1:])
-        if idtype == "doi":
-            release_id = api.lookup_release(doi=extid).ident
-        elif idtype == "pmid":
-            release_id = api.lookup_release(pmid=extid).ident
-        elif idtype == "wikidata":
-            release_id = api.lookup_release(wikidata_qid=extid).ident
-        else:
-            raise NotImplementedError("extid type: {}".format(idtype))
-    else:
-        raise Exception("need either release_id or extid argument")
-
-    # create it
-    (editgroup_id, wc) = auto_wayback_static(
-        api, release_id, args.wayback_url, editgroup_id=args.editgroup_id
-    )
-    if not wc:
-        return
-    print("release_id: {}".format(release_id))
-    print("editgroup_id: {}".format(editgroup_id))
-    print("webcapture id: {}".format(wc.ident))
-    print("link: https://fatcat.wiki/webcapture/{}".format(wc.ident))
-
-
-def run_cdl_dash_dat(args: argparse.Namespace) -> None:
-    api = args.api
-
-    # create it
-    (editgroup_id, release, fs) = auto_cdl_dash_dat(
-        api, args.dat_path, release_id=args.release_id, editgroup_id=args.editgroup_id
-    )
-    if not (fs and release):
-        return
-    print("release_id: {}".format(release.ident))
-    print("editgroup_id: {}".format(editgroup_id))
-    print("fileset id: {}".format(fs.ident))
-    print("link: https://fatcat.wiki/fileset/{}".format(fs.ident))
-
-
 def run_datacite(args: argparse.Namespace) -> None:
     dci = DataciteImporter(
         args.api,
@@ -370,7 +320,6 @@ def run_datacite(args: argparse.Namespace) -> None:
         edit_batch_size=args.batch_size,
         bezerk_mode=args.bezerk_mode,
         debug=args.debug,
-        extid_map_file=args.extid_map_file,
         insert_log_file=args.insert_log_file,
     )
     if args.kafka_mode:
@@ -495,12 +444,6 @@ def main() -> None:
         type=argparse.FileType("r"),
     )
     sub_crossref.add_argument(
-        "--extid-map-file",
-        help="DOI-to-other-identifiers sqlite3 database",
-        default=None,
-        type=str,
-    )
-    sub_crossref.add_argument(
         "--no-lookup-refs", action="store_true", help="skip lookup of references (PMID or DOI)"
     )
     sub_crossref.add_argument(
@@ -529,12 +472,6 @@ def main() -> None:
         default=None,
         type=argparse.FileType("r"),
    )
-    sub_jalc.add_argument(
-        "--extid-map-file",
-        help="DOI-to-other-identifiers sqlite3 database",
-        default=None,
-        type=str,
-    )
 
     sub_arxiv = subparsers.add_parser("arxiv", help="import arxiv.org metadata from XML files")
     sub_arxiv.set_defaults(
@@ -913,43 +850,6 @@ def main() -> None:
         type=argparse.FileType("r"),
     )
 
-    sub_wayback_static = subparsers.add_parser(
-        "wayback-static", help="crude crawl+ingest tool for single-page HTML docs from wayback"
-    )
-    sub_wayback_static.set_defaults(
-        func=run_wayback_static,
-        auth_var="FATCAT_API_AUTH_TOKEN",
-    )
-    sub_wayback_static.add_argument(
-        "wayback_url", type=str, help="URL of wayback capture to extract from"
-    )
-    sub_wayback_static.add_argument(
-        "--extid", type=str, help="external identifier for release lookup"
-    )
-    sub_wayback_static.add_argument("--release-id", type=str, help="release entity identifier")
-    sub_wayback_static.add_argument(
-        "--editgroup-id",
-        type=str,
-        help="use existing editgroup (instead of creating a new one)",
-    )
-
-    sub_cdl_dash_dat = subparsers.add_parser(
-        "cdl-dash-dat", help="crude helper to import datasets from Dat/CDL mirror pilot project"
-    )
-    sub_cdl_dash_dat.set_defaults(
-        func=run_cdl_dash_dat,
-        auth_var="FATCAT_API_AUTH_TOKEN",
-    )
-    sub_cdl_dash_dat.add_argument(
-        "dat_path", type=str, help="local path dat to import (must be the dat discovery key)"
-    )
-    sub_cdl_dash_dat.add_argument("--release-id", type=str, help="release entity identifier")
-    sub_cdl_dash_dat.add_argument(
-        "--editgroup-id",
-        type=str,
-        help="use existing editgroup (instead of creating a new one)",
-    )
-
     sub_datacite = subparsers.add_parser("datacite", help="import datacite.org metadata")
     sub_datacite.add_argument(
         "json_file",
@@ -964,12 +864,6 @@ def main() -> None:
         type=argparse.FileType("r"),
     )
     sub_datacite.add_argument(
-        "--extid-map-file",
-        help="DOI-to-other-identifiers sqlite3 database",
-        default=None,
-        type=str,
-    )
-    sub_datacite.add_argument(
         "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)"
     )
     sub_datacite.add_argument(
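One practical effect of the subparser removals above: the old subcommands now fail at argument parsing rather than being silently ignored. A self-contained toy (not fatcat code) demonstrating the behavior:

```python
import argparse

parser = argparse.ArgumentParser(prog="fatcat_import.py")
subparsers = parser.add_subparsers(dest="command", required=True)

# surviving subcommand, mirroring the structure in the diff above
sub_datacite = subparsers.add_parser("datacite", help="import datacite.org metadata")
sub_datacite.add_argument("--kafka-mode", action="store_true")

# "wayback-static" is simply never registered anymore, so selecting it
# is an "invalid choice" error and argparse exits with status 2
try:
    parser.parse_args(["wayback-static", "http://example.com/capture"])
except SystemExit:
    print("removed subcommand rejected at parse time")
```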