From 485c8b8432d839bb3cc0bd67152adda4bbf0df20 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 1 Apr 2020 15:27:25 -0700 Subject: move scripts/ to bin/ --- bin/cord19_fatcat_enrich.py | 104 +++++++++++++++++++ bin/deliver_file2disk.py | 225 ++++++++++++++++++++++++++++++++++++++++ bin/grobid2json.py | 199 +++++++++++++++++++++++++++++++++++ bin/parse_cord19_csv.py | 15 +++ scripts/cord19_fatcat_enrich.py | 104 ------------------- scripts/deliver_file2disk.py | 225 ---------------------------------------- scripts/parse_cord19_csv.py | 15 --- 7 files changed, 543 insertions(+), 344 deletions(-) create mode 100755 bin/cord19_fatcat_enrich.py create mode 100755 bin/deliver_file2disk.py create mode 100755 bin/grobid2json.py create mode 100755 bin/parse_cord19_csv.py delete mode 100755 scripts/cord19_fatcat_enrich.py delete mode 100755 scripts/deliver_file2disk.py delete mode 100755 scripts/parse_cord19_csv.py diff --git a/bin/cord19_fatcat_enrich.py b/bin/cord19_fatcat_enrich.py new file mode 100755 index 0000000..a911007 --- /dev/null +++ b/bin/cord19_fatcat_enrich.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python3 + +""" +Takes a JSON-transformed CORD-19 *metadata* file and enriches it with fatcat +metadata. +""" + +import sys +import json +import argparse +import datetime + +import requests +from requests.adapters import HTTPAdapter +from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error + + +def requests_retry_session(retries=10, backoff_factor=3, + status_forcelist=(500, 502, 504), session=None): + """ + From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests + """ + session = session or requests.Session() + retry = Retry( + total=retries, + read=retries, + connect=retries, + backoff_factor=backoff_factor, + status_forcelist=status_forcelist, + ) + adapter = HTTPAdapter(max_retries=retry) + session.mount('http://', adapter) + session.mount('https://', adapter) + return session + + +def do_line(row, args): + + pubmed_id = row.get('pubmed_id') or None + pmcid = row.get('pmcid') or None + doi = row.get('doi') or None + fatcat_release = None + + if doi == '0.1126/science.abb7331': + doi = '10.1126/science.abb7331' + + if not fatcat_release and pmcid: + resp = args.session.get('https://api.fatcat.wiki/v0/release/lookup', + params={ + 'pmcid': pmcid, + 'expand': 'container,files,filesets,webcaptures', + 'hide': 'abstracts,references', + }) + if resp.status_code == 200: + fatcat_release = resp.json() + if not fatcat_release and doi: + resp = args.session.get('https://api.fatcat.wiki/v0/release/lookup', + params={ + 'doi': doi, + 'expand': 'container,files,filesets,webcaptures', + 'hide': 'abstracts,references', + }) + if resp.status_code == 200: + fatcat_release = resp.json() + if not fatcat_release and pubmed_id: + resp = args.session.get('https://api.fatcat.wiki/v0/release/lookup', + params={ + 'pmid': pubmed_id, + 'expand': 'container,files,filesets,webcaptures', + 'hide': 'abstracts,references', + }) + if resp.status_code == 200: + fatcat_release = resp.json() + + obj = dict( + cord19_paper=row, + ) + if fatcat_release: + obj['fatcat_release'] = fatcat_release + obj['release_id'] = fatcat_release['ident'] + obj['fatcat_url'] = "https://fatcat.wiki/release/{}".format(obj['release_id']) + print(json.dumps(obj, sort_keys=True)) + +def run(args): + for l in args.json_file: + l = json.loads(l) + do_line(l, args) + +def main(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('json_file', + help="CORD-19 parsed JSON file", + type=argparse.FileType('r')) + subparsers = parser.add_subparsers() + + args = parser.parse_args() + args.session = requests_retry_session() + + run(args) + +if __name__ == '__main__': + main() + diff --git a/bin/deliver_file2disk.py b/bin/deliver_file2disk.py new file mode 100755 index 0000000..f54ecb3 --- /dev/null +++ b/bin/deliver_file2disk.py @@ -0,0 +1,225 @@ +#!/usr/bin/env python3 +""" +Tool for downloading fatcat release PDFs to disk (assuming there is at least +one accessible PDF file entity for each release). + +Behavior: +- if no file, or not accessible, skip release +- filter files, then iterate through: + - if already exists locally on disk, skip + - try downloading from any archive.org or web.archive.org URLs + - verify SHA-1 + - write out to disk +""" + +# XXX: some broken MRO thing going on in here due to python3 object wrangling +# in `wayback` library. Means we can't run pylint. +# pylint: skip-file + +import os +import sys +import json +import magic +import hashlib +import argparse +import requests +from requests.adapters import HTTPAdapter +from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error +from collections import Counter + + +def gen_file_metadata(blob): + """ + Takes a file blob (bytestream) and returns hashes and other metadata. + + Returns a dict: size_bytes, md5hex, sha1hex, sha256hex, mimetype + """ + assert blob + mimetype = magic.Magic(mime=True).from_buffer(blob) + hashes = [ + hashlib.sha1(), + hashlib.sha256(), + hashlib.md5(), + ] + for h in hashes: + h.update(blob) + return dict( + size_bytes=len(blob), + sha1hex=hashes[0].hexdigest(), + sha256hex=hashes[1].hexdigest(), + md5hex=hashes[2].hexdigest(), + mimetype=mimetype, + ) + +def requests_retry_session(retries=2, backoff_factor=3, + status_forcelist=(500, 502, 504), session=None): + """ + From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests + """ + session = session or requests.Session() + retry = Retry( + total=retries, + read=retries, + connect=retries, + backoff_factor=backoff_factor, + status_forcelist=status_forcelist, + ) + adapter = HTTPAdapter(max_retries=retry) + session.mount('http://', adapter) + session.mount('https://', adapter) + return session + + +class DeliverFatcatDisk: + + def __init__(self, disk_dir, **kwargs): + self.count = Counter() + self.disk_dir = disk_dir + self.disk_prefix = kwargs.get('disk_prefix', 'pdf/') + self.disk_suffix = kwargs.get('disk_suffix', '.pdf') + self.session = requests_retry_session() + self.session.headers.update({ + 'User-Agent': 'Mozilla/5.0 fatcat.DeliverFatcatDisk', + }) + + def run(self, release_json_file): + sys.stderr.write("Ensuring all 256 base directories exist...\n") + for i in range(256): + fpath = "{}/{}{:02x}".format( + self.disk_dir, + self.disk_prefix, + i) + os.makedirs(fpath, exist_ok=True) + sys.stderr.write("Starting...\n") + for line in release_json_file: + self.count['total'] += 1 + if not line.startswith('{'): + self.count['skip-no-release'] += 1 + continue + #print(line) + release = json.loads(line) + assert 'ident' in release + self.fetch_release(release) + sys.stderr.write("{}\n".format(self.count)) + + def blob_path(self, sha1hex): + fpath = "{}/{}{}/{}{}".format( + self.disk_dir, + self.disk_prefix, + sha1hex[0:2], + sha1hex, + self.disk_suffix) + return fpath + + def does_file_already_exist(self, sha1hex): + return os.path.isfile(self.blob_path(sha1hex)) + + def filter_files(self, files): + """ + Takes a list of file entities and only returns the ones which are PDFs + we can download. + """ + good = [] + for f in files: + if f['mimetype'] and not 'pdf' in f['mimetype'].lower(): + continue + for url in f['urls']: + if 'archive.org/' in url['url']: + good.append(f) + break + return good + + def fetch_content(self, url): + """ + Returns tuple: (str:status, content) + Content contains bytes only if status is "success", otherwise None + """ + if '://web.archive.org/' in url: + # add id_ to URL to avoid wayback re-writing + l = url.split('/') + if l[2] == 'web.archive.org' and l[3] == 'web' and not '_' in l[4]: + l[4] = l[4] + 'id_' + url = '/'.join(l) + + try: + resp = self.session.get(url) + except requests.exceptions.RetryError: + return ('wayback-error', None) + except requests.exceptions.TooManyRedirects: + return ('too-many-redirects', None) + if resp.status_code != 200: + return ('fetch:{}'.format(resp.status_code), None) + else: + return ('success', resp.content) + + def fetch_file(self, f): + """ + Returns tuple: (status, sha1hex, file_meta) + + file_meta is a dict on success, or None otherwise + """ + sha1hex = f['sha1'] + if self.does_file_already_exist(sha1hex): + return ('exists', sha1hex, None) + status = None + for url in f['urls']: + url = url['url'] + if not 'archive.org' in url: + continue + status, content = self.fetch_content(url) + if status == 'success': + # TODO: verify sha1hex + file_meta = gen_file_metadata(content) + if file_meta['sha1hex'] != sha1hex: + status = 'sha1-mismatch' + continue + with open(self.blob_path(sha1hex), 'wb') as outf: + outf.write(content) + return ('success', sha1hex, file_meta) + if status: + return (status, sha1hex, None) + else: + return ('no-urls', sha1hex, None) + + def fetch_release(self, release): + good_files = self.filter_files(release['files']) + status = 'no-file' + sha1hex = None + for f in good_files: + status, sha1hex, file_meta = self.fetch_file(f) + if status in ('success', 'exists'): + break + else: + continue + if sha1hex: + print("{}\t{}".format(status, sha1hex)) + else: + print(status) + self.count[status] += 1 + +def main(): + + parser = argparse.ArgumentParser() + parser.add_argument('--disk-dir', + required=True, + type=str, + help='local base directory to save into') + parser.add_argument('--disk-prefix', + type=str, + default="pdf/", + help='directory prefix for items created in bucket') + parser.add_argument('--disk-suffix', + type=str, + default=".pdf", + help='file suffix for created files') + parser.add_argument('release_json_file', + help="JSON manifest of fatcat release entities", + default=sys.stdin, + type=argparse.FileType('r')) + args = parser.parse_args() + + worker = DeliverFatcatDisk(**args.__dict__) + worker.run(args.release_json_file) + +if __name__ == '__main__': # pragma: no cover + main() diff --git a/bin/grobid2json.py b/bin/grobid2json.py new file mode 100755 index 0000000..39ab222 --- /dev/null +++ b/bin/grobid2json.py @@ -0,0 +1,199 @@ +#!/usr/bin/env python3 + +""" +NB: adapted to work as a library for PDF extraction. Will probably be +re-written eventually to be correct, complete, and robust; this is just a +first iteration. + +This script tries to extract everything from a GROBID TEI XML fulltext dump: + +- header metadata +- affiliations +- references (with context) +- abstract +- fulltext +- tables, figures, equations + +A flag can be specified to disable copyright encumbered bits (--no-emcumbered): + +- abstract +- fulltext +- tables, figures, equations + +Prints JSON to stdout, errors to stderr +""" + +import io +import json +import argparse +import xml.etree.ElementTree as ET + +xml_ns = "http://www.w3.org/XML/1998/namespace" +ns = "http://www.tei-c.org/ns/1.0" + +def all_authors(elem): + names = [] + for author in elem.findall('.//{%s}author' % ns): + pn = author.find('./{%s}persName' % ns) + if not pn: + continue + given_name = pn.findtext('./{%s}forename' % ns) or None + surname = pn.findtext('./{%s}surname' % ns) or None + full_name = ' '.join(pn.itertext()) + obj = dict(name=full_name) + if given_name: + obj['given_name'] = given_name + if surname: + obj['surname'] = surname + ae = author.find('./{%s}affiliation' % ns) + if ae: + affiliation = dict() + for on in ae.findall('./{%s}orgName' % ns): + affiliation[on.get('type')] = on.text + addr_e = ae.find('./{%s}address' % ns) + if addr_e: + address = dict() + for t in addr_e.getchildren(): + address[t.tag.split('}')[-1]] = t.text + if address: + affiliation['address'] = address + #affiliation['address'] = { + # 'post_code': addr.findtext('./{%s}postCode' % ns) or None, + # 'settlement': addr.findtext('./{%s}settlement' % ns) or None, + # 'country': addr.findtext('./{%s}country' % ns) or None, + #} + obj['affiliation'] = affiliation + names.append(obj) + return names + + +def journal_info(elem): + journal = dict() + journal['name'] = elem.findtext('.//{%s}monogr/{%s}title' % (ns, ns)) + journal['publisher'] = elem.findtext('.//{%s}publicationStmt/{%s}publisher' % (ns, ns)) + if journal['publisher'] == '': + journal['publisher'] = None + journal['issn'] = elem.findtext('.//{%s}idno[@type="ISSN"]' % ns) + journal['eissn'] = elem.findtext('.//{%s}idno[@type="eISSN"]' % ns) + journal['volume'] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns) + journal['issue'] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns) + keys = list(journal.keys()) + + # remove empty/null keys + for k in keys: + if not journal[k]: + journal.pop(k) + return journal + + +def biblio_info(elem): + ref = dict() + ref['id'] = elem.attrib.get('{http://www.w3.org/XML/1998/namespace}id') + # Title stuff is messy in references... + ref['title'] = elem.findtext('.//{%s}analytic/{%s}title' % (ns, ns)) + other_title = elem.findtext('.//{%s}monogr/{%s}title' % (ns, ns)) + if other_title: + if ref['title']: + ref['journal'] = other_title + else: + ref['journal'] = None + ref['title'] = other_title + ref['authors'] = all_authors(elem) + ref['publisher'] = elem.findtext('.//{%s}publicationStmt/{%s}publisher' % (ns, ns)) + if ref['publisher'] == '': + ref['publisher'] = None + date = elem.find('.//{%s}date[@type="published"]' % ns) + ref['date'] = (date != None) and date.attrib.get('when') + ref['volume'] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns) + ref['issue'] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns) + el = elem.find('.//{%s}ptr[@target]' % ns) + if el is not None: + ref['url'] = el.attrib['target'] + # Hand correction + if ref['url'].endswith(".Lastaccessed"): + ref['url'] = ref['url'].replace(".Lastaccessed", "") + else: + ref['url'] = None + return ref + + +def teixml2json(content, encumbered=True): + + if type(content) == str: + content = io.StringIO(content) + elif type(content) == bytes: + content = io.BytesIO(content) + + info = dict() + + #print(content) + #print(content.getvalue()) + tree = ET.parse(content) + tei = tree.getroot() + + header = tei.find('.//{%s}teiHeader' % ns) + if header is None: + raise ValueError("XML does not look like TEI format") + application_tag = header.findall('.//{%s}appInfo/{%s}application' % (ns, ns))[0] + info['grobid_version'] = application_tag.attrib['version'].strip() + info['grobid_timestamp'] = application_tag.attrib['when'].strip() + info['title'] = header.findtext('.//{%s}analytic/{%s}title' % (ns, ns)) + info['authors'] = all_authors(header.find('.//{%s}sourceDesc/{%s}biblStruct' % (ns, ns))) + info['journal'] = journal_info(header) + date = header.find('.//{%s}date[@type="published"]' % ns) + info['date'] = (date != None) and date.attrib.get('when') + info['fatcat_release'] = header.findtext('.//{%s}idno[@type="fatcat"]' % ns) + info['doi'] = header.findtext('.//{%s}idno[@type="DOI"]' % ns) + if info['doi']: + info['doi'] = info['doi'].lower() + + refs = [] + for (i, bs) in enumerate(tei.findall('.//{%s}listBibl/{%s}biblStruct' % (ns, ns))): + ref = biblio_info(bs) + ref['index'] = i + refs.append(ref) + info['citations'] = refs + + text = tei.find('.//{%s}text' % (ns)) + #print(text.attrib) + if text.attrib.get('{%s}lang' % xml_ns): + info['language_code'] = text.attrib['{%s}lang' % xml_ns] # xml:lang + + if encumbered: + el = tei.find('.//{%s}profileDesc/{%s}abstract' % (ns, ns)) + info['abstract'] = (el or None) and " ".join(el.itertext()).strip() + el = tei.find('.//{%s}text/{%s}body' % (ns, ns)) + info['body'] = (el or None) and " ".join(el.itertext()).strip() + el = tei.find('.//{%s}back/{%s}div[@type="acknowledgement"]' % (ns, ns)) + info['acknowledgement'] = (el or None) and " ".join(el.itertext()).strip() + el = tei.find('.//{%s}back/{%s}div[@type="annex"]' % (ns, ns)) + info['annex'] = (el or None) and " ".join(el.itertext()).strip() + + # remove empty/null keys + keys = list(info.keys()) + for k in keys: + if not info[k]: + info.pop(k) + return info + +def main(): # pragma no cover + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description="GROBID TEI XML to JSON", + usage="%(prog)s [options] ...") + parser.add_argument("--no-encumbered", + action="store_true", + help="don't include ambiguously copyright encumbered fields (eg, abstract, body)") + parser.add_argument("teifiles", nargs='+') + + args = parser.parse_args() + + for filename in args.teifiles: + content = open(filename, 'r') + print(json.dumps( + teixml2json(content, + encumbered=(not args.no_encumbered)), + sort_keys=True)) + +if __name__=='__main__': # pragma no cover + main() diff --git a/bin/parse_cord19_csv.py b/bin/parse_cord19_csv.py new file mode 100755 index 0000000..536e5d3 --- /dev/null +++ b/bin/parse_cord19_csv.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 + +import sys +import csv +import json + +CSVFILE = sys.argv[1] + +with open(CSVFILE, newline='') as csvfile: + reader = csv.DictReader(csvfile) + for row in reader: + row = dict(row) + row['mag_id'] = row.pop('Microsoft Academic Paper ID') + row['who_covidence_id'] = row.pop('WHO #Covidence').replace('#', '') + print(json.dumps(row, sort_keys=True)) diff --git a/scripts/cord19_fatcat_enrich.py b/scripts/cord19_fatcat_enrich.py deleted file mode 100755 index 5d3a554..0000000 --- a/scripts/cord19_fatcat_enrich.py +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/env python3 - -""" -Takes a JSON-transformed CORD-19 *metadata* file and enriches it with fatcat -metadata. -""" - -import sys -import json -import argparse -import datetime - -import requests -from requests.adapters import HTTPAdapter -from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error - - -def requests_retry_session(retries=10, backoff_factor=3, - status_forcelist=(500, 502, 504), session=None): - """ - From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests - """ - session = session or requests.Session() - retry = Retry( - total=retries, - read=retries, - connect=retries, - backoff_factor=backoff_factor, - status_forcelist=status_forcelist, - ) - adapter = HTTPAdapter(max_retries=retry) - session.mount('http://', adapter) - session.mount('https://', adapter) - return session - - -def do_line(row, args): - - pubmed_id = row.get('pubmed_id') or None - pmcid = row.get('pmcid') or None - doi = row.get('doi') or None - fatcat_release = None - - if doi == '0.1126/science.abb7331': - doi = '10.1126/science.abb7331' - - if not fatcat_release and pmcid: - resp = args.session.get('https://api.fatcat.wiki/v0/release/lookup', - params={ - 'pmcid': pmcid, - 'expand': 'container,files,filesets,webcaptures', - 'hide': 'abstracts,references', - }) - if resp.status_code == 200: - fatcat_release = resp.json() - if not fatcat_release and doi: - resp = args.session.get('https://api.fatcat.wiki/v0/release/lookup', - params={ - 'doi': doi, - 'expand': 'container,files,filesets,webcaptures', - 'hide': 'abstracts,references', - }) - if resp.status_code == 200: - fatcat_release = resp.json() - if not fatcat_release and pubmed_id: - resp = args.session.get('https://api.fatcat.wiki/v0/release/lookup', - params={ - 'pmid': pubmed_id, - 'expand': 'container,files,filesets,webcaptures', - 'hide': 'abstracts,references', - }) - if resp.status_code == 200: - fatcat_release = resp.json() - - obj = dict( - cord19_paper=row, - ) - if fatcat_release: - obj['fatcat_release'] = fatcat_release - obj['release_id'] = fatcat_release['ident'] - obj['fatcat_url'] = "https://fatcat.wiki/release/{}".format(obj['release_id']) - print(json.dumps(obj, sort_keys=True)) - -def run(args): - for l in sys.stdin: - l = json.loads(l) - do_line(l, args) - -def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('json_file', - help="CORD-19 parsed JSON file", - type=argparse.FileType('r')) - subparsers = parser.add_subparsers() - - args = parser.parse_args() - args.session = requests_retry_session() - - run(args) - -if __name__ == '__main__': - main() - diff --git a/scripts/deliver_file2disk.py b/scripts/deliver_file2disk.py deleted file mode 100755 index f54ecb3..0000000 --- a/scripts/deliver_file2disk.py +++ /dev/null @@ -1,225 +0,0 @@ -#!/usr/bin/env python3 -""" -Tool for downloading fatcat release PDFs to disk (assuming there is at least -one accessible PDF file entity for each release). - -Behavior: -- if no file, or not accessible, skip release -- filter files, then iterate through: - - if already exists locally on disk, skip - - try downloading from any archive.org or web.archive.org URLs - - verify SHA-1 - - write out to disk -""" - -# XXX: some broken MRO thing going on in here due to python3 object wrangling -# in `wayback` library. Means we can't run pylint. -# pylint: skip-file - -import os -import sys -import json -import magic -import hashlib -import argparse -import requests -from requests.adapters import HTTPAdapter -from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error -from collections import Counter - - -def gen_file_metadata(blob): - """ - Takes a file blob (bytestream) and returns hashes and other metadata. - - Returns a dict: size_bytes, md5hex, sha1hex, sha256hex, mimetype - """ - assert blob - mimetype = magic.Magic(mime=True).from_buffer(blob) - hashes = [ - hashlib.sha1(), - hashlib.sha256(), - hashlib.md5(), - ] - for h in hashes: - h.update(blob) - return dict( - size_bytes=len(blob), - sha1hex=hashes[0].hexdigest(), - sha256hex=hashes[1].hexdigest(), - md5hex=hashes[2].hexdigest(), - mimetype=mimetype, - ) - -def requests_retry_session(retries=2, backoff_factor=3, - status_forcelist=(500, 502, 504), session=None): - """ - From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests - """ - session = session or requests.Session() - retry = Retry( - total=retries, - read=retries, - connect=retries, - backoff_factor=backoff_factor, - status_forcelist=status_forcelist, - ) - adapter = HTTPAdapter(max_retries=retry) - session.mount('http://', adapter) - session.mount('https://', adapter) - return session - - -class DeliverFatcatDisk: - - def __init__(self, disk_dir, **kwargs): - self.count = Counter() - self.disk_dir = disk_dir - self.disk_prefix = kwargs.get('disk_prefix', 'pdf/') - self.disk_suffix = kwargs.get('disk_suffix', '.pdf') - self.session = requests_retry_session() - self.session.headers.update({ - 'User-Agent': 'Mozilla/5.0 fatcat.DeliverFatcatDisk', - }) - - def run(self, release_json_file): - sys.stderr.write("Ensuring all 256 base directories exist...\n") - for i in range(256): - fpath = "{}/{}{:02x}".format( - self.disk_dir, - self.disk_prefix, - i) - os.makedirs(fpath, exist_ok=True) - sys.stderr.write("Starting...\n") - for line in release_json_file: - self.count['total'] += 1 - if not line.startswith('{'): - self.count['skip-no-release'] += 1 - continue - #print(line) - release = json.loads(line) - assert 'ident' in release - self.fetch_release(release) - sys.stderr.write("{}\n".format(self.count)) - - def blob_path(self, sha1hex): - fpath = "{}/{}{}/{}{}".format( - self.disk_dir, - self.disk_prefix, - sha1hex[0:2], - sha1hex, - self.disk_suffix) - return fpath - - def does_file_already_exist(self, sha1hex): - return os.path.isfile(self.blob_path(sha1hex)) - - def filter_files(self, files): - """ - Takes a list of file entities and only returns the ones which are PDFs - we can download. - """ - good = [] - for f in files: - if f['mimetype'] and not 'pdf' in f['mimetype'].lower(): - continue - for url in f['urls']: - if 'archive.org/' in url['url']: - good.append(f) - break - return good - - def fetch_content(self, url): - """ - Returns tuple: (str:status, content) - Content contains bytes only if status is "success", otherwise None - """ - if '://web.archive.org/' in url: - # add id_ to URL to avoid wayback re-writing - l = url.split('/') - if l[2] == 'web.archive.org' and l[3] == 'web' and not '_' in l[4]: - l[4] = l[4] + 'id_' - url = '/'.join(l) - - try: - resp = self.session.get(url) - except requests.exceptions.RetryError: - return ('wayback-error', None) - except requests.exceptions.TooManyRedirects: - return ('too-many-redirects', None) - if resp.status_code != 200: - return ('fetch:{}'.format(resp.status_code), None) - else: - return ('success', resp.content) - - def fetch_file(self, f): - """ - Returns tuple: (status, sha1hex, file_meta) - - file_meta is a dict on success, or None otherwise - """ - sha1hex = f['sha1'] - if self.does_file_already_exist(sha1hex): - return ('exists', sha1hex, None) - status = None - for url in f['urls']: - url = url['url'] - if not 'archive.org' in url: - continue - status, content = self.fetch_content(url) - if status == 'success': - # TODO: verify sha1hex - file_meta = gen_file_metadata(content) - if file_meta['sha1hex'] != sha1hex: - status = 'sha1-mismatch' - continue - with open(self.blob_path(sha1hex), 'wb') as outf: - outf.write(content) - return ('success', sha1hex, file_meta) - if status: - return (status, sha1hex, None) - else: - return ('no-urls', sha1hex, None) - - def fetch_release(self, release): - good_files = self.filter_files(release['files']) - status = 'no-file' - sha1hex = None - for f in good_files: - status, sha1hex, file_meta = self.fetch_file(f) - if status in ('success', 'exists'): - break - else: - continue - if sha1hex: - print("{}\t{}".format(status, sha1hex)) - else: - print(status) - self.count[status] += 1 - -def main(): - - parser = argparse.ArgumentParser() - parser.add_argument('--disk-dir', - required=True, - type=str, - help='local base directory to save into') - parser.add_argument('--disk-prefix', - type=str, - default="pdf/", - help='directory prefix for items created in bucket') - parser.add_argument('--disk-suffix', - type=str, - default=".pdf", - help='file suffix for created files') - parser.add_argument('release_json_file', - help="JSON manifest of fatcat release entities", - default=sys.stdin, - type=argparse.FileType('r')) - args = parser.parse_args() - - worker = DeliverFatcatDisk(**args.__dict__) - worker.run(args.release_json_file) - -if __name__ == '__main__': # pragma: no cover - main() diff --git a/scripts/parse_cord19_csv.py b/scripts/parse_cord19_csv.py deleted file mode 100755 index 536e5d3..0000000 --- a/scripts/parse_cord19_csv.py +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env python3 - -import sys -import csv -import json - -CSVFILE = sys.argv[1] - -with open(CSVFILE, newline='') as csvfile: - reader = csv.DictReader(csvfile) - for row in reader: - row = dict(row) - row['mag_id'] = row.pop('Microsoft Academic Paper ID') - row['who_covidence_id'] = row.pop('WHO #Covidence').replace('#', '') - print(json.dumps(row, sort_keys=True)) -- cgit v1.2.3