aboutsummaryrefslogtreecommitdiffstats
path: root/python/scripts/unpaywall2ingestrequest.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2021-10-27 18:50:17 -0700
committerBryan Newbold <bnewbold@archive.org>2021-10-27 18:50:17 -0700
commit826c7538e091fac14d987a3cd654975da964e240 (patch)
tree90345b4cabb461c624ca5a218c2fc01dce3055cd /python/scripts/unpaywall2ingestrequest.py
parent020037d4714e7ba2ab172c7278494aed0b2148ad (diff)
downloadsandcrawler-826c7538e091fac14d987a3cd654975da964e240.tar.gz
sandcrawler-826c7538e091fac14d987a3cd654975da964e240.zip
make fmt (black 21.9b0)
Diffstat (limited to 'python/scripts/unpaywall2ingestrequest.py')
-rwxr-xr-xpython/scripts/unpaywall2ingestrequest.py63
1 files changed, 31 insertions, 32 deletions
diff --git a/python/scripts/unpaywall2ingestrequest.py b/python/scripts/unpaywall2ingestrequest.py
index b79f316..ad5353b 100755
--- a/python/scripts/unpaywall2ingestrequest.py
+++ b/python/scripts/unpaywall2ingestrequest.py
@@ -11,7 +11,6 @@ import urlcanon
DOMAIN_BLOCKLIST = [
# large OA publishers (we get via DOI)
-
# large repos and aggregators (we crawl directly)
"://arxiv.org/",
"://europepmc.org/",
@@ -25,11 +24,11 @@ DOMAIN_BLOCKLIST = [
]
RELEASE_STAGE_MAP = {
- 'draftVersion': 'draft',
- 'submittedVersion': 'submitted',
- 'acceptedVersion': 'accepted',
- 'publishedVersion': 'published',
- 'updatedVersion': 'updated',
+ "draftVersion": "draft",
+ "submittedVersion": "submitted",
+ "acceptedVersion": "accepted",
+ "publishedVersion": "published",
+ "updatedVersion": "updated",
}
@@ -45,44 +44,44 @@ def transform(obj):
"""
requests = []
- if not obj['doi'].startswith('10.'):
+ if not obj["doi"].startswith("10."):
return requests
- if not obj['oa_locations']:
+ if not obj["oa_locations"]:
return requests
- for location in obj['oa_locations']:
- if not location['url_for_pdf']:
+ for location in obj["oa_locations"]:
+ if not location["url_for_pdf"]:
continue
skip = False
for domain in DOMAIN_BLOCKLIST:
- if domain in location['url_for_pdf']:
+ if domain in location["url_for_pdf"]:
skip = True
if skip:
continue
try:
- base_url = canon(location['url_for_pdf'])
+ base_url = canon(location["url_for_pdf"])
except UnicodeEncodeError:
continue
request = {
- 'base_url': base_url,
- 'ingest_type': 'pdf',
- 'link_source': 'unpaywall',
- 'link_source_id': obj['doi'].lower(),
- 'ingest_request_source': 'unpaywall',
- 'release_stage': RELEASE_STAGE_MAP.get(location['version']),
- 'rel': location['host_type'],
- 'ext_ids': {
- 'doi': obj['doi'].lower(),
+ "base_url": base_url,
+ "ingest_type": "pdf",
+ "link_source": "unpaywall",
+ "link_source_id": obj["doi"].lower(),
+ "ingest_request_source": "unpaywall",
+ "release_stage": RELEASE_STAGE_MAP.get(location["version"]),
+ "rel": location["host_type"],
+ "ext_ids": {
+ "doi": obj["doi"].lower(),
},
- 'edit_extra': {},
+ "edit_extra": {},
}
- if obj.get('oa_status'):
- request['edit_extra']['oa_status'] = obj['oa_status']
- if location.get('evidence'):
- request['edit_extra']['evidence'] = location['evidence']
- if location['pmh_id']:
- request['ext_ids']['pmh_id'] = location['pmh_id']
+ if obj.get("oa_status"):
+ request["edit_extra"]["oa_status"] = obj["oa_status"]
+ if location.get("evidence"):
+ request["edit_extra"]["evidence"] = location["evidence"]
+ if location["pmh_id"]:
+ request["ext_ids"]["pmh_id"] = location["pmh_id"]
requests.append(request)
return requests
@@ -101,9 +100,9 @@ def run(args):
def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('json_file',
- help="unpaywall dump file to use",
- type=argparse.FileType('r'))
+ parser.add_argument(
+ "json_file", help="unpaywall dump file to use", type=argparse.FileType("r")
+ )
subparsers = parser.add_subparsers()
args = parser.parse_args()
@@ -111,5 +110,5 @@ def main():
run(args)
-if __name__ == '__main__':
+if __name__ == "__main__":
main()