about | summary | refs | log | tree | commit | diff | stats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-02-18 19:02:48 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-02-18 19:04:30 -0800
commit 3f8ead3d07bed78c750b9f6a8b7e95ebffeff089 (patch)
tree 319d88e3b6fed2004b168217cfffa7d76857f901 /python
parent 4cf5345040b4e8a5d77ca3ceb0f7ea4f8c5778dc (diff)
download sandcrawler-3f8ead3d07bed78c750b9f6a8b7e95ebffeff089.tar.gz
sandcrawler-3f8ead3d07bed78c750b9f6a8b7e95ebffeff089.zip
unpaywall2ingestrequest transform script
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/pdftrio.py 2
-rwxr-xr-x python/scripts/unpaywall2ingestrequest.py 103
2 files changed, 104 insertions, 1 deletions
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index 7a2e53c..12be9eb 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -180,7 +180,7 @@ class PdfTrioBlobWorker(SandcrawlerWorker):
result = dict()
result['file_meta'] = gen_file_metadata(blob)
result['key'] = result['file_meta']['sha1hex']
- result['pdf_trio'] = self.pdftrio_client.classify_pdf(blob, mode=mode)
+ result['pdf_trio'] = self.pdftrio_client.classify_pdf(blob, mode=self.mode)
result['timing'] = dict(
pdftrio_sec=result['pdf_trio'].pop('_total_sec', None),
total_sec=time.time() - start_process,
diff --git a/python/scripts/unpaywall2ingestrequest.py b/python/scripts/unpaywall2ingestrequest.py
new file mode 100755
index 0000000..c51a152
--- /dev/null
+++ b/python/scripts/unpaywall2ingestrequest.py
@@ -0,0 +1,103 @@
+#!/usr/bin/python3
+
+"""
+Transform an unpaywall dump (JSON) into ingest requests.
+"""
+
+import sys
+import json
+import argparse
+
+# URL fragments identifying hosts whose PDFs are not turned into ingest
+# requests. transform() skips a location when its `url_for_pdf` contains any
+# of these strings; the test is a plain substring match, not host parsing, so
+# an entry starting with "://" only matches when the host begins right after
+# the scheme, while an entry such as "zenodo.org/" also matches sub-domains.
+DOMAIN_BLOCKLIST = [
+ # large OA publishers (we get via DOI)
+ # (no entries in this category yet)
+
+ # large repos and aggregators (we crawl directly)
+ "://arxiv.org/",
+ "://europepmc.org/",
+ "ncbi.nlm.nih.gov/",
+ "semanticscholar.org/",
+ "://doi.org/",
+ "zenodo.org/",
+ "figshare.com/",
+ "://archive.org/",
+ ".archive.org/",
+]
+
+# Maps the `version` value of an unpaywall OA location to the `release_stage`
+# value placed in the generated ingest request. transform() looks values up
+# with .get(), so a version not listed here results in a release_stage of None.
+RELEASE_STAGE_MAP = {
+ 'draftVersion': 'draft',
+ 'submittedVersion': 'submitted',
+ 'acceptedVersion': 'accepted',
+ 'publishedVersion': 'published',
+ 'updatedVersion': 'updated',
+}
+
+
+def transform(obj):
+    """
+    Transforms from a single unpaywall object to zero or more ingest requests.
+
+    `obj` is one parsed row (a dict) of the unpaywall dump. One request is
+    produced for every entry of `oa_locations` that has a non-empty
+    `url_for_pdf` which does not contain any DOMAIN_BLOCKLIST fragment.
+    Rows whose DOI is missing or does not start with "10.", and rows
+    without any OA location, produce no requests.
+
+    Returns a list of dicts (possibly empty).
+    """
+
+    # A missing or null DOI is treated like an invalid one instead of raising.
+    doi = obj.get('doi')
+    if not doi or not doi.startswith('10.'):
+        return []
+    # Lower-cased once; used both as link_source_id and as external id.
+    doi = doi.lower()
+
+    requests = []
+    for location in obj.get('oa_locations') or []:
+        url = location.get('url_for_pdf')
+        if not url:
+            continue
+        # Blocklist entries are plain substrings of the URL.
+        if any(domain in url for domain in DOMAIN_BLOCKLIST):
+            continue
+
+        request = {
+            'base_url': url,
+            'ingest_type': 'pdf',
+            'link_source': 'unpaywall',
+            'link_source_id': doi,
+            'ingest_request_source': 'unpaywall',
+            # Absent or unrecognized versions map to None.
+            'release_stage': RELEASE_STAGE_MAP.get(location.get('version')),
+            'rel': location.get('host_type'),
+            'ext_ids': {
+                'doi': doi,
+            },
+            'edit_extra': {},
+        }
+        if obj.get('oa_status'):
+            request['edit_extra']['oa_status'] = obj['oa_status']
+        if location.get('evidence'):
+            request['edit_extra']['evidence'] = location['evidence']
+        # Optional keys are read with .get() so that one location lacking
+        # them does not abort processing of the whole dump with a KeyError.
+        if location.get('pmh_id'):
+            request['ext_ids']['pmh_id'] = location['pmh_id']
+        requests.append(request)
+
+    return requests
+
+def run(args):
+    """
+    Streams `args.json_file` line by line: every non-blank line is parsed as
+    JSON, passed to transform(), and each resulting ingest request is printed
+    to stdout as one JSON document per line with sorted keys.
+    """
+    for line in args.json_file:
+        if line.strip():
+            record = json.loads(line)
+            # transform() may yield nothing for a row; treat that as empty
+            for request in (transform(record) or []):
+                serialized = json.dumps(request, sort_keys=True)
+                print("{}".format(serialized))
+
+def main():
+    """
+    Command-line entry point: parses the arguments and hands them to run().
+
+    The single positional argument `json_file` is opened for reading by
+    argparse.FileType before run() is called.
+    """
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('json_file',
+        help="unpaywall dump file to use",
+        type=argparse.FileType('r'))
+    # No sub-commands exist, so no sub-parser group is registered; an empty,
+    # never-used group would only clutter the generated usage text.
+
+    args = parser.parse_args()
+
+    run(args)
+
+# Run the transform only when executed as a script, not when imported.
+if __name__ == '__main__':
+ main()