diff options
Diffstat (limited to 'python/fatcat_import.py')
| -rwxr-xr-x | python/fatcat_import.py | 48 | 
1 files changed, 30 insertions, 18 deletions
| diff --git a/python/fatcat_import.py b/python/fatcat_import.py index 656fe87d..04f58ff7 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -156,10 +156,8 @@ def run_cdl_dash_dat(args):      print("link: https://fatcat.wiki/fileset/{}".format(fs.ident))  def main(): -    parser = argparse.ArgumentParser() -    parser.add_argument('--debug', -        action='store_true', -        help="enable debugging interface") +    parser = argparse.ArgumentParser( +        formatter_class=argparse.ArgumentDefaultsHelpFormatter)      parser.add_argument('--host-url',          default="http://localhost:9411/v0",          help="connect to this host/port") @@ -177,7 +175,8 @@ def main():          default=None, type=str)      subparsers = parser.add_subparsers() -    sub_crossref = subparsers.add_parser('crossref') +    sub_crossref = subparsers.add_parser('crossref', +        help="import Crossref API metadata format (JSON)")      sub_crossref.set_defaults(          func=run_crossref,          auth_var="FATCAT_AUTH_WORKER_CROSSREF", @@ -201,7 +200,8 @@ def main():          action='store_true',          help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)") -    sub_jalc = subparsers.add_parser('jalc') +    sub_jalc = subparsers.add_parser('jalc', +        help="import JALC DOI metadata from XML dump")      sub_jalc.set_defaults(          func=run_jalc,          auth_var="FATCAT_AUTH_WORKER_JALC", @@ -216,7 +216,8 @@ def main():          help="DOI-to-other-identifiers sqlite3 database",          default=None, type=str) -    sub_arxiv = subparsers.add_parser('arxiv') +    sub_arxiv = subparsers.add_parser('arxiv', +        help="import arxiv.org metadata from XML files")      sub_arxiv.set_defaults(          func=run_arxiv,          auth_var="FATCAT_AUTH_WORKER_ARXIV", @@ -228,7 +229,8 @@ def main():          action='store_true',          help="consume from kafka topic (not stdin)") -    sub_pubmed = subparsers.add_parser('pubmed') +   
 sub_pubmed = subparsers.add_parser('pubmed', +        help="import MEDLINE/PubMed work-level metadata (XML)")      sub_pubmed.set_defaults(          func=run_pubmed,          auth_var="FATCAT_AUTH_WORKER_PUBMED", @@ -246,7 +248,8 @@ def main():          action='store_true',          help="consume from kafka topic (not stdin)") -    sub_jstor = subparsers.add_parser('jstor') +    sub_jstor = subparsers.add_parser('jstor', +        help="import JSTOR work-level metadata from XML dump")      sub_jstor.set_defaults(          func=run_jstor,          auth_var="FATCAT_AUTH_WORKER_JSTOR", @@ -258,7 +261,8 @@ def main():          help="ISSN to ISSN-L mapping file",          default=None, type=argparse.FileType('r')) -    sub_orcid = subparsers.add_parser('orcid') +    sub_orcid = subparsers.add_parser('orcid', +        help="import creator entities from ORCID XML dump")      sub_orcid.set_defaults(          func=run_orcid,          auth_var="FATCAT_AUTH_WORKER_ORCID" @@ -267,7 +271,8 @@ def main():          help="orcid JSON file to import from (or stdin)",          default=sys.stdin, type=argparse.FileType('r')) -    sub_journal_metadata = subparsers.add_parser('journal-metadata') +    sub_journal_metadata = subparsers.add_parser('journal-metadata', +        help="import/update container metadata from old manual munging format")      sub_journal_metadata.set_defaults(          func=run_journal_metadata,          auth_var="FATCAT_AUTH_WORKER_JOURNAL_METADATA", @@ -276,7 +281,8 @@ def main():          help="Journal JSON metadata file to import from (or stdin)",          default=sys.stdin, type=argparse.FileType('r')) -    sub_chocula = subparsers.add_parser('chocula') +    sub_chocula = subparsers.add_parser('chocula', +        help="import/update container metadata from chocula JSON export")      sub_chocula.set_defaults(          func=run_chocula,          auth_var="FATCAT_AUTH_WORKER_JOURNAL_METADATA", @@ -285,7 +291,8 @@ def main():          help="chocula JSON entities 
file (or stdin)",          default=sys.stdin, type=argparse.FileType('r')) -    sub_matched = subparsers.add_parser('matched') +    sub_matched = subparsers.add_parser('matched', +        help="add file entities matched against existing releases; custom JSON format")      sub_matched.set_defaults(          func=run_matched,          auth_var="FATCAT_API_AUTH_TOKEN", @@ -303,7 +310,8 @@ def main():          default="web",          help="default URL rel for matches (eg, 'publisher', 'web')") -    sub_arabesque_match = subparsers.add_parser('arabesque') +    sub_arabesque_match = subparsers.add_parser('arabesque', +        help="add file entities matched to releases from crawl log analysis")      sub_arabesque_match.set_defaults(          func=run_arabesque_match,          auth_var="FATCAT_AUTH_WORKER_CRAWL", @@ -328,7 +336,8 @@ def main():          default="web",          help="default URL rel for matches (eg, 'publisher', 'web')") -    sub_ingest_file = subparsers.add_parser('ingest-file-results') +    sub_ingest_file = subparsers.add_parser('ingest-file-results', +        help="add/update file entities linked to releases based on sandcrawler ingest results")      sub_ingest_file.set_defaults(          func=run_ingest_file,          auth_var="FATCAT_AUTH_WORKER_CRAWL", @@ -352,7 +361,8 @@ def main():          default="web",          help="default URL rel for matches (eg, 'publisher', 'web')") -    sub_grobid_metadata = subparsers.add_parser('grobid-metadata') +    sub_grobid_metadata = subparsers.add_parser('grobid-metadata', +        help="create release and file entities based on GROBID PDF metadata extraction")      sub_grobid_metadata.set_defaults(          func=run_grobid_metadata,          auth_var="FATCAT_API_AUTH_TOKEN", @@ -370,7 +380,8 @@ def main():          action='store_true',          help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)") -    sub_wayback_static = subparsers.add_parser('wayback-static') +    
sub_wayback_static = subparsers.add_parser('wayback-static', +        help="crude crawl+ingest tool for single-page HTML docs from wayback")      sub_wayback_static.set_defaults(          func=run_wayback_static,          auth_var="FATCAT_API_AUTH_TOKEN", @@ -388,7 +399,8 @@ def main():          type=str,          help="use existing editgroup (instead of creating a new one)") -    sub_cdl_dash_dat = subparsers.add_parser('cdl-dash-dat') +    sub_cdl_dash_dat = subparsers.add_parser('cdl-dash-dat', +        help="crude helper to import datasets from Dat/CDL mirror pilot project")      sub_cdl_dash_dat.set_defaults(          func=run_cdl_dash_dat,          auth_var="FATCAT_API_AUTH_TOKEN", | 
