path: root/python/fatcat_import.py
author:    bnewbold <bnewbold@archive.org>  2021-11-11 01:12:18 +0000
committer: bnewbold <bnewbold@archive.org>  2021-11-11 01:12:18 +0000
commit:    6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4 (patch)
tree:      1b80344125152b46ae727dc8bbff73cc12abfd3e /python/fatcat_import.py
parent:    7e3f91f1a49ea85707cae31125021ba761f5373d (diff)
parent:    6eaf4f57c1f92b6f4f46adc38e5b39fd30b65d81 (diff)
Merge branch 'bnewbold-import-refactors' into 'master'
import refactors and deprecations

Some of these are from old stale branches (the datacite subject metadata patch), but most are from yesterday and today. Sort of a hodge-podge, but the general theme is getting around to deferred cleanups and refactors specific to importer code before making some behavioral changes. The Datacite-specific stuff could use review here.

Remove unused/deprecated/dead code:

- cdl_dash_dat and wayback_static importers, which were for specific early example entities and have been superseded by other importers
- "extid map" sqlite3 feature from several importers, was only used for initial bulk imports (and maybe should not have been used)

Refactors:

- moved a number of large datastructures out of importer code and into a dedicated static file (`biblio_lookup_tables.py`). Didn't move all, just the ones that were either generic or very large (making it hard to read code)
- shuffled around relative imports and some function names ("clean_str" vs. "clean")

Some actual behavioral changes:

- remove some Datacite-specific license slugs
- stop trying to fix double-slashes in DOIs, that was causing more harm than help (some DOIs do actually have double-slashes!); see the sketch below
- remove some excess metadata from datacite 'extra' fields
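For the double-slash change above, a minimal before/after sketch; the function name `clean_doi` and the exact normalization steps here are illustrative assumptions, not the actual fatcat_tools code:

def clean_doi(raw: str) -> str:
    # Lowercase and strip a resolver prefix, but do NOT collapse "//":
    # some registered DOIs legitimately contain double-slashes, so the
    # old rewrite (roughly: doi = doi.replace("//", "/")) corrupted
    # those identifiers and is dropped by this commit.
    doi = raw.strip().lower()
    for prefix in ("https://doi.org/", "http://doi.org/", "doi:"):
        if doi.startswith(prefix):
            doi = doi[len(prefix):]
    return doi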
Diffstat (limited to 'python/fatcat_import.py')
-rwxr-xr-x  python/fatcat_import.py  108
1 file changed, 1 insertion, 107 deletions
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 445acde8..33679868 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -42,8 +42,6 @@ from fatcat_tools.importers import (
SavePaperNowWebImporter,
ShadowLibraryImporter,
SqlitePusher,
- auto_cdl_dash_dat,
- auto_wayback_static,
)
# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
@@ -54,7 +52,6 @@ def run_crossref(args: argparse.Namespace) -> None:
fci = CrossrefImporter(
args.api,
args.issn_map_file,
- extid_map_file=args.extid_map_file,
edit_batch_size=args.batch_size,
bezerk_mode=args.bezerk_mode,
)
@@ -72,7 +69,7 @@ def run_crossref(args: argparse.Namespace) -> None:
def run_jalc(args: argparse.Namespace) -> None:
- ji = JalcImporter(args.api, args.issn_map_file, extid_map_file=args.extid_map_file)
+ ji = JalcImporter(args.api, args.issn_map_file)
Bs4XmlLinesPusher(ji, args.xml_file, "<rdf:Description").run()
@@ -316,53 +313,6 @@ def run_shadow_lib(args: argparse.Namespace) -> None:
JsonLinePusher(fmi, args.json_file).run()
-def run_wayback_static(args: argparse.Namespace) -> None:
- api = args.api
-
- # find the release
- if args.release_id:
- release_id = args.release_id
- elif args.extid:
- idtype = args.extid.split(":")[0]
- extid = ":".join(args.extid.split(":")[1:])
- if idtype == "doi":
- release_id = api.lookup_release(doi=extid).ident
- elif idtype == "pmid":
- release_id = api.lookup_release(pmid=extid).ident
- elif idtype == "wikidata":
- release_id = api.lookup_release(wikidata_qid=extid).ident
- else:
- raise NotImplementedError("extid type: {}".format(idtype))
- else:
- raise Exception("need either release_id or extid argument")
-
- # create it
- (editgroup_id, wc) = auto_wayback_static(
- api, release_id, args.wayback_url, editgroup_id=args.editgroup_id
- )
- if not wc:
- return
- print("release_id: {}".format(release_id))
- print("editgroup_id: {}".format(editgroup_id))
- print("webcapture id: {}".format(wc.ident))
- print("link: https://fatcat.wiki/webcapture/{}".format(wc.ident))
-
-
-def run_cdl_dash_dat(args: argparse.Namespace) -> None:
- api = args.api
-
- # create it
- (editgroup_id, release, fs) = auto_cdl_dash_dat(
- api, args.dat_path, release_id=args.release_id, editgroup_id=args.editgroup_id
- )
- if not (fs and release):
- return
- print("release_id: {}".format(release.ident))
- print("editgroup_id: {}".format(editgroup_id))
- print("fileset id: {}".format(fs.ident))
- print("link: https://fatcat.wiki/fileset/{}".format(fs.ident))
-
-
def run_datacite(args: argparse.Namespace) -> None:
dci = DataciteImporter(
args.api,
@@ -370,7 +320,6 @@ def run_datacite(args: argparse.Namespace) -> None:
edit_batch_size=args.batch_size,
bezerk_mode=args.bezerk_mode,
debug=args.debug,
- extid_map_file=args.extid_map_file,
insert_log_file=args.insert_log_file,
)
if args.kafka_mode:
@@ -495,12 +444,6 @@ def main() -> None:
type=argparse.FileType("r"),
)
sub_crossref.add_argument(
- "--extid-map-file",
- help="DOI-to-other-identifiers sqlite3 database",
- default=None,
- type=str,
- )
- sub_crossref.add_argument(
"--no-lookup-refs", action="store_true", help="skip lookup of references (PMID or DOI)"
)
sub_crossref.add_argument(
@@ -529,12 +472,6 @@ def main() -> None:
default=None,
type=argparse.FileType("r"),
)
- sub_jalc.add_argument(
- "--extid-map-file",
- help="DOI-to-other-identifiers sqlite3 database",
- default=None,
- type=str,
- )
sub_arxiv = subparsers.add_parser("arxiv", help="import arxiv.org metadata from XML files")
sub_arxiv.set_defaults(
@@ -913,43 +850,6 @@ def main() -> None:
type=argparse.FileType("r"),
)
- sub_wayback_static = subparsers.add_parser(
- "wayback-static", help="crude crawl+ingest tool for single-page HTML docs from wayback"
- )
- sub_wayback_static.set_defaults(
- func=run_wayback_static,
- auth_var="FATCAT_API_AUTH_TOKEN",
- )
- sub_wayback_static.add_argument(
- "wayback_url", type=str, help="URL of wayback capture to extract from"
- )
- sub_wayback_static.add_argument(
- "--extid", type=str, help="external identifier for release lookup"
- )
- sub_wayback_static.add_argument("--release-id", type=str, help="release entity identifier")
- sub_wayback_static.add_argument(
- "--editgroup-id",
- type=str,
- help="use existing editgroup (instead of creating a new one)",
- )
-
- sub_cdl_dash_dat = subparsers.add_parser(
- "cdl-dash-dat", help="crude helper to import datasets from Dat/CDL mirror pilot project"
- )
- sub_cdl_dash_dat.set_defaults(
- func=run_cdl_dash_dat,
- auth_var="FATCAT_API_AUTH_TOKEN",
- )
- sub_cdl_dash_dat.add_argument(
- "dat_path", type=str, help="local path dat to import (must be the dat discovery key)"
- )
- sub_cdl_dash_dat.add_argument("--release-id", type=str, help="release entity identifier")
- sub_cdl_dash_dat.add_argument(
- "--editgroup-id",
- type=str,
- help="use existing editgroup (instead of creating a new one)",
- )
-
sub_datacite = subparsers.add_parser("datacite", help="import datacite.org metadata")
sub_datacite.add_argument(
"json_file",
@@ -964,12 +864,6 @@ def main() -> None:
type=argparse.FileType("r"),
)
sub_datacite.add_argument(
- "--extid-map-file",
- help="DOI-to-other-identifiers sqlite3 database",
- default=None,
- type=str,
- )
- sub_datacite.add_argument(
"--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)"
)
sub_datacite.add_argument(
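Taken together with the run_datacite() hunks above, the importer is now constructed without the extid-map keyword. A minimal sketch of the resulting call, using only the arguments visible in the diff context (anything between the two hunks is elided here):

dci = DataciteImporter(
    args.api,
    # ... arguments elided between the two hunks above ...
    edit_batch_size=args.batch_size,
    bezerk_mode=args.bezerk_mode,
    debug=args.debug,
    insert_log_file=args.insert_log_file,
    # extid_map_file=...  <- removed by this commit
)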