diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-02-18 19:02:48 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-02-18 19:04:30 -0800 |
commit | 3f8ead3d07bed78c750b9f6a8b7e95ebffeff089 (patch) | |
tree | 319d88e3b6fed2004b168217cfffa7d76857f901 /python/scripts | |
parent | 4cf5345040b4e8a5d77ca3ceb0f7ea4f8c5778dc (diff) | |
download | sandcrawler-3f8ead3d07bed78c750b9f6a8b7e95ebffeff089.tar.gz sandcrawler-3f8ead3d07bed78c750b9f6a8b7e95ebffeff089.zip |
unpaywall2ingestrequest transform script
Diffstat (limited to 'python/scripts')
-rwxr-xr-x | python/scripts/unpaywall2ingestrequest.py | 103 |
1 files changed, 103 insertions, 0 deletions
#!/usr/bin/python3
# Source path: python/scripts/unpaywall2ingestrequest.py (new file, mode 100755)

"""
Transform an unpaywall dump (JSON) into ingest requests.

Reads one JSON object per line from the dump file named on the command line
and prints one ingest request per line (JSON, sorted keys) to stdout.
"""

import sys
import json
import argparse

# Substrings matched against a location's 'url_for_pdf'; a location whose URL
# contains any of these is skipped.
DOMAIN_BLOCKLIST = [
    # large OA publishers (we get via DOI)

    # large repos and aggregators (we crawl directly)
    "://arxiv.org/",
    "://europepmc.org/",
    "ncbi.nlm.nih.gov/",
    "semanticscholar.org/",
    "://doi.org/",
    "zenodo.org/",
    "figshare.com/",
    "://archive.org/",
    ".archive.org/",
]

# Maps the 'version' value of an unpaywall location to the 'release_stage'
# value of the ingest request; unknown or null versions map to None.
RELEASE_STAGE_MAP = {
    'draftVersion': 'draft',
    'submittedVersion': 'submitted',
    'acceptedVersion': 'accepted',
    'publishedVersion': 'published',
    'updatedVersion': 'updated',
}


def transform(obj):
    """
    Transforms from a single unpaywall object to zero or more ingest requests.
    Returns a list of dicts.

    An empty list is returned when the DOI is missing, null, or does not start
    with '10.', or when there are no OA locations. Locations without a
    'url_for_pdf', or whose URL matches DOMAIN_BLOCKLIST, are skipped.

    'version', 'evidence' and 'pmh_id' (per location) and 'oa_status' (per
    object) are treated as optional: they may be absent or null. 'host_type'
    is required on every location that yields a request (KeyError otherwise).
    """
    doi = obj.get('doi')
    if not doi or not doi.startswith('10.'):
        return []
    doi = doi.lower()

    requests = []
    for location in obj.get('oa_locations') or []:
        url = location.get('url_for_pdf')
        if not url:
            continue
        if any(domain in url for domain in DOMAIN_BLOCKLIST):
            continue

        request = {
            'base_url': url,
            'ingest_type': 'pdf',
            'link_source': 'unpaywall',
            'link_source_id': doi,
            'ingest_request_source': 'unpaywall',
            # .get() twice: a null/unknown/missing version yields None
            'release_stage': RELEASE_STAGE_MAP.get(location.get('version')),
            'rel': location['host_type'],
            'ext_ids': {
                'doi': doi,
            },
            'edit_extra': {},
        }
        if obj.get('oa_status'):
            request['edit_extra']['oa_status'] = obj['oa_status']
        if location.get('evidence'):
            request['edit_extra']['evidence'] = location['evidence']
        # .get(): not every location carries a 'pmh_id' key
        if location.get('pmh_id'):
            request['ext_ids']['pmh_id'] = location['pmh_id']
        requests.append(request)

    return requests


def run(args):
    """
    Reads JSON lines from args.json_file (any iterable of strings), transforms
    each non-blank line, and prints every resulting request as one line of
    JSON with sorted keys.
    """
    for line in args.json_file:
        if not line.strip():
            continue
        row = json.loads(line)

        for request in transform(row):
            print(json.dumps(request, sort_keys=True))


def main():
    """
    Command-line entry point: parses the dump file argument and runs the
    transform over it.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('json_file',
        help="unpaywall dump file to use",
        type=argparse.FileType('r'))

    args = parser.parse_args()

    run(args)


if __name__ == '__main__':
    main()