author     bnewbold <bnewbold@archive.org>  2021-11-11 01:12:18 +0000
committer  bnewbold <bnewbold@archive.org>  2021-11-11 01:12:18 +0000
commit     6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4 (patch)
tree       1b80344125152b46ae727dc8bbff73cc12abfd3e /python/fatcat_import.py
parent     7e3f91f1a49ea85707cae31125021ba761f5373d (diff)
parent     6eaf4f57c1f92b6f4f46adc38e5b39fd30b65d81 (diff)
download   fatcat-6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4.tar.gz
           fatcat-6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4.zip
Merge branch 'bnewbold-import-refactors' into 'master'
import refactors and deprecations
Some of these are from old stale branches (the Datacite subject metadata patch), but most are from yesterday and today. It's something of a hodge-podge, but the general theme is working through deferred cleanups and refactors specific to importer code before making some behavioral changes.
The Datacite-specific stuff could use review here.
Remove unused/deprecated/dead code:
- cdl_dash_dat and wayback_static importers, which were for specific early example entities and have been superseded by other importers
- "extid map" sqlite3 feature from several importers, was only used for initial bulk imports (and maybe should not have been used)
Refactors:
- moved a number of large data structures out of importer code and into a dedicated static file (`biblio_lookup_tables.py`). Not all of them were moved, just the ones that were either generic or so large that they made the code hard to read (see the sketch after this list)
- shuffled around relative imports and renamed some functions (`clean_str` instead of `clean`)
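The shape of that refactor, as a hedged sketch: the table entries below are illustrative, not the actual contents of `biblio_lookup_tables.py`, and `slug_for_license_url` is a hypothetical helper rather than a function from the fatcat codebase.

```python
# biblio_lookup_tables.py -- dedicated home for large static tables that
# previously cluttered individual importer modules. Entries are examples only.
from typing import Optional

LICENSE_SLUG_MAP = {
    "//creativecommons.org/licenses/by/4.0": "CC-BY",
    "//creativecommons.org/publicdomain/zero/1.0": "CC-0",
}

def slug_for_license_url(url: str) -> Optional[str]:
    """Normalize a license URL to a short slug, if it is a known license."""
    # drop the scheme so http:// and https:// variants match the same key
    for scheme in ("http:", "https:"):
        if url.startswith(scheme):
            url = url[len(scheme):]
    return LICENSE_SLUG_MAP.get(url.rstrip("/"))
```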
Some actual behavioral changes:
- remove some Datacite-specific license slugs
- stop trying to fix double-slashes in DOIs; that was causing more harm than good (some DOIs really do contain double-slashes!); see the sketch after this list
- remove some excess metadata from datacite 'extra' fields
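To make the double-slash point concrete, here is a minimal sketch of a DOI cleaner consistent with the new behavior. This is an illustration with simplified validation, not the actual `clean_doi()` implementation in fatcat.

```python
from typing import Optional

def clean_doi(raw: str) -> Optional[str]:
    doi = raw.strip().lower()
    if doi.startswith("doi:"):
        doi = doi[4:]
    if not doi.startswith("10.") or "/" not in doi:
        return None
    # NOTE: no doi.replace("//", "/") here; some registered DOIs really do
    # contain double-slashes, and collapsing them broke lookups for those
    # records
    return doi

assert clean_doi("doi:10.1234/ABC//def") == "10.1234/abc//def"
```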
Diffstat (limited to 'python/fatcat_import.py')
-rwxr-xr-x  python/fatcat_import.py  108
1 file changed, 1 insertion(+), 107 deletions(-)
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 445acde8..33679868 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -42,8 +42,6 @@ from fatcat_tools.importers import (
     SavePaperNowWebImporter,
     ShadowLibraryImporter,
     SqlitePusher,
-    auto_cdl_dash_dat,
-    auto_wayback_static,
 )
 
 # Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
@@ -54,7 +52,6 @@ def run_crossref(args: argparse.Namespace) -> None:
     fci = CrossrefImporter(
         args.api,
         args.issn_map_file,
-        extid_map_file=args.extid_map_file,
         edit_batch_size=args.batch_size,
         bezerk_mode=args.bezerk_mode,
     )
@@ -72,7 +69,7 @@ def run_crossref(args: argparse.Namespace) -> None:
 
 
 def run_jalc(args: argparse.Namespace) -> None:
-    ji = JalcImporter(args.api, args.issn_map_file, extid_map_file=args.extid_map_file)
+    ji = JalcImporter(args.api, args.issn_map_file)
     Bs4XmlLinesPusher(ji, args.xml_file, "<rdf:Description").run()
 
 
@@ -316,53 +313,6 @@ def run_shadow_lib(args: argparse.Namespace) -> None:
     JsonLinePusher(fmi, args.json_file).run()
 
 
-def run_wayback_static(args: argparse.Namespace) -> None:
-    api = args.api
-
-    # find the release
-    if args.release_id:
-        release_id = args.release_id
-    elif args.extid:
-        idtype = args.extid.split(":")[0]
-        extid = ":".join(args.extid.split(":")[1:])
-        if idtype == "doi":
-            release_id = api.lookup_release(doi=extid).ident
-        elif idtype == "pmid":
-            release_id = api.lookup_release(pmid=extid).ident
-        elif idtype == "wikidata":
-            release_id = api.lookup_release(wikidata_qid=extid).ident
-        else:
-            raise NotImplementedError("extid type: {}".format(idtype))
-    else:
-        raise Exception("need either release_id or extid argument")
-
-    # create it
-    (editgroup_id, wc) = auto_wayback_static(
-        api, release_id, args.wayback_url, editgroup_id=args.editgroup_id
-    )
-    if not wc:
-        return
-    print("release_id: {}".format(release_id))
-    print("editgroup_id: {}".format(editgroup_id))
-    print("webcapture id: {}".format(wc.ident))
-    print("link: https://fatcat.wiki/webcapture/{}".format(wc.ident))
-
-
-def run_cdl_dash_dat(args: argparse.Namespace) -> None:
-    api = args.api
-
-    # create it
-    (editgroup_id, release, fs) = auto_cdl_dash_dat(
-        api, args.dat_path, release_id=args.release_id, editgroup_id=args.editgroup_id
-    )
-    if not (fs and release):
-        return
-    print("release_id: {}".format(release.ident))
-    print("editgroup_id: {}".format(editgroup_id))
-    print("fileset id: {}".format(fs.ident))
-    print("link: https://fatcat.wiki/fileset/{}".format(fs.ident))
-
-
 def run_datacite(args: argparse.Namespace) -> None:
     dci = DataciteImporter(
         args.api,
@@ -370,7 +320,6 @@ def run_datacite(args: argparse.Namespace) -> None:
         edit_batch_size=args.batch_size,
         bezerk_mode=args.bezerk_mode,
         debug=args.debug,
-        extid_map_file=args.extid_map_file,
         insert_log_file=args.insert_log_file,
     )
     if args.kafka_mode:
@@ -495,12 +444,6 @@ def main() -> None:
         type=argparse.FileType("r"),
     )
     sub_crossref.add_argument(
-        "--extid-map-file",
-        help="DOI-to-other-identifiers sqlite3 database",
-        default=None,
-        type=str,
-    )
-    sub_crossref.add_argument(
         "--no-lookup-refs", action="store_true", help="skip lookup of references (PMID or DOI)"
     )
     sub_crossref.add_argument(
@@ -529,12 +472,6 @@ def main() -> None:
         default=None,
         type=argparse.FileType("r"),
    )
-    sub_jalc.add_argument(
-        "--extid-map-file",
-        help="DOI-to-other-identifiers sqlite3 database",
-        default=None,
-        type=str,
-    )
 
     sub_arxiv = subparsers.add_parser("arxiv", help="import arxiv.org metadata from XML files")
     sub_arxiv.set_defaults(
@@ -913,43 +850,6 @@ def main() -> None:
         type=argparse.FileType("r"),
     )
 
-    sub_wayback_static = subparsers.add_parser(
-        "wayback-static", help="crude crawl+ingest tool for single-page HTML docs from wayback"
-    )
-    sub_wayback_static.set_defaults(
-        func=run_wayback_static,
-        auth_var="FATCAT_API_AUTH_TOKEN",
-    )
-    sub_wayback_static.add_argument(
-        "wayback_url", type=str, help="URL of wayback capture to extract from"
-    )
-    sub_wayback_static.add_argument(
-        "--extid", type=str, help="external identifier for release lookup"
-    )
-    sub_wayback_static.add_argument("--release-id", type=str, help="release entity identifier")
-    sub_wayback_static.add_argument(
-        "--editgroup-id",
-        type=str,
-        help="use existing editgroup (instead of creating a new one)",
-    )
-
-    sub_cdl_dash_dat = subparsers.add_parser(
-        "cdl-dash-dat", help="crude helper to import datasets from Dat/CDL mirror pilot project"
-    )
-    sub_cdl_dash_dat.set_defaults(
-        func=run_cdl_dash_dat,
-        auth_var="FATCAT_API_AUTH_TOKEN",
-    )
-    sub_cdl_dash_dat.add_argument(
-        "dat_path", type=str, help="local path dat to import (must be the dat discovery key)"
-    )
-    sub_cdl_dash_dat.add_argument("--release-id", type=str, help="release entity identifier")
-    sub_cdl_dash_dat.add_argument(
-        "--editgroup-id",
-        type=str,
-        help="use existing editgroup (instead of creating a new one)",
-    )
-
     sub_datacite = subparsers.add_parser("datacite", help="import datacite.org metadata")
     sub_datacite.add_argument(
         "json_file",
@@ -964,12 +864,6 @@ def main() -> None:
         type=argparse.FileType("r"),
     )
     sub_datacite.add_argument(
-        "--extid-map-file",
-        help="DOI-to-other-identifiers sqlite3 database",
-        default=None,
-        type=str,
-    )
-    sub_datacite.add_argument(
         "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)"
     )
     sub_datacite.add_argument(
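One practical effect of the subparser removals above: the old subcommands now fail at argument parsing rather than being silently ignored. A self-contained toy (not fatcat code) demonstrating the behavior:

```python
import argparse

parser = argparse.ArgumentParser(prog="fatcat_import.py")
subparsers = parser.add_subparsers(dest="command", required=True)

# surviving subcommand, mirroring the structure in the diff above
sub_datacite = subparsers.add_parser("datacite", help="import datacite.org metadata")
sub_datacite.add_argument("--kafka-mode", action="store_true")

# "wayback-static" is simply never registered anymore, so selecting it
# is an "invalid choice" error and argparse exits with status 2
try:
    parser.parse_args(["wayback-static", "http://example.com/capture"])
except SystemExit:
    print("removed subcommand rejected at parse time")
```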