-rw-r--r--  python/sandcrawler/pdftrio.py              |   2 +-
-rwxr-xr-x  python/scripts/unpaywall2ingestrequest.py  | 103 ++++++++++++++++++
2 files changed, 104 insertions(+), 1 deletion(-)
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index 7a2e53c..12be9eb 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -180,7 +180,7 @@ class PdfTrioBlobWorker(SandcrawlerWorker):
         result = dict()
         result['file_meta'] = gen_file_metadata(blob)
         result['key'] = result['file_meta']['sha1hex']
-        result['pdf_trio'] = self.pdftrio_client.classify_pdf(blob, mode=mode)
+        result['pdf_trio'] = self.pdftrio_client.classify_pdf(blob, mode=self.mode)
         result['timing'] = dict(
             pdftrio_sec=result['pdf_trio'].pop('_total_sec', None),
             total_sec=time.time() - start_process,
diff --git a/python/scripts/unpaywall2ingestrequest.py b/python/scripts/unpaywall2ingestrequest.py
new file mode 100755
index 0000000..c51a152
--- /dev/null
+++ b/python/scripts/unpaywall2ingestrequest.py
@@ -0,0 +1,103 @@
+#!/usr/bin/python3
+
+"""
+Transform an unpaywall dump (JSON) into ingest requests.
+"""
+
+import sys
+import json
+import argparse
+
+DOMAIN_BLOCKLIST = [
+    # large OA publishers (we get via DOI)
+
+    # large repos and aggregators (we crawl directly)
+    "://arxiv.org/",
+    "://europepmc.org/",
+    "ncbi.nlm.nih.gov/",
+    "semanticscholar.org/",
+    "://doi.org/",
+    "zenodo.org/",
+    "figshare.com/",
+    "://archive.org/",
+    ".archive.org/",
+]
+
+RELEASE_STAGE_MAP = {
+    'draftVersion': 'draft',
+    'submittedVersion': 'submitted',
+    'acceptedVersion': 'accepted',
+    'publishedVersion': 'published',
+    'updatedVersion': 'updated',
+}
+
+
+def transform(obj):
+    """
+    Transforms from a single unpaywall object to zero or more ingest requests.
+    Returns a list of dicts.
+    """
+
+    requests = []
+    if not obj['doi'].startswith('10.'):
+        return requests
+    if not obj['oa_locations']:
+        return requests
+
+    for location in obj['oa_locations']:
+        if not location['url_for_pdf']:
+            continue
+        skip = False
+        for domain in DOMAIN_BLOCKLIST:
+            if domain in location['url_for_pdf']:
+                skip = True
+        if skip:
+            continue
+
+        request = {
+            'base_url': location['url_for_pdf'],
+            'ingest_type': 'pdf',
+            'link_source': 'unpaywall',
+            'link_source_id': obj['doi'].lower(),
+            'ingest_request_source': 'unpaywall',
+            'release_stage': RELEASE_STAGE_MAP.get(location['version']),
+            'rel': location['host_type'],
+            'ext_ids': {
+                'doi': obj['doi'].lower(),
+            },
+            'edit_extra': {},
+        }
+        if obj.get('oa_status'):
+            request['edit_extra']['oa_status'] = obj['oa_status']
+        if location.get('evidence'):
+            request['edit_extra']['evidence'] = location['evidence']
+        if location['pmh_id']:
+            request['ext_ids']['pmh_id'] = location['pmh_id']
+        requests.append(request)
+
+    return requests
+
+def run(args):
+    for l in args.json_file:
+        if not l.strip():
+            continue
+        row = json.loads(l)
+
+        requests = transform(row) or []
+        for r in requests:
+            print("{}".format(json.dumps(r, sort_keys=True)))
+
+def main():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('json_file',
+        help="unpaywall dump file to use",
+        type=argparse.FileType('r'))
+    subparsers = parser.add_subparsers()
+
+    args = parser.parse_args()
+
+    run(args)
+
+if __name__ == '__main__':
+    main()
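Note on the pdftrio.py hunk: classify_pdf() was being called with mode=mode inside process(), where presumably no local named mode exists, so the mode configured on the worker was never applied; the fix reads the instance attribute instead. A minimal sketch of the assumed setup (the constructor signature below is illustrative, not copied from the repo):

    class PdfTrioBlobWorker(SandcrawlerWorker):
        def __init__(self, pdftrio_client, mode="auto", **kwargs):
            super().__init__(**kwargs)
            self.pdftrio_client = pdftrio_client
            # stored once at construction; process() now reads self.mode
            self.mode = mode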
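A quick smoke test of the new script, using a hypothetical single-record dump trimmed to the fields transform() actually reads ("-" makes argparse's FileType read stdin):

    $ echo '{"doi": "10.123/abc", "oa_status": "gold", "oa_locations": [{"url_for_pdf": "https://journal.example.com/article.pdf", "version": "publishedVersion", "host_type": "publisher", "evidence": "oa journal", "pmh_id": null}]}' \
        | ./python/scripts/unpaywall2ingestrequest.py -
    {"base_url": "https://journal.example.com/article.pdf", "edit_extra": {"evidence": "oa journal", "oa_status": "gold"}, "ext_ids": {"doi": "10.123/abc"}, "ingest_request_source": "unpaywall", "ingest_type": "pdf", "link_source": "unpaywall", "link_source_id": "10.123/abc", "rel": "publisher", "release_stage": "published"}

One ingest request is emitted per usable oa_location; records whose DOI does not start with "10.", or whose PDF URL contains any DOMAIN_BLOCKLIST substring, produce no output.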