diff options
-rwxr-xr-x | python/fatcat_import.py | 2 | ||||
-rw-r--r-- | python/fatcat_tools/harvest/doi_registrars.py | 19 | ||||
-rw-r--r-- | python/fatcat_tools/importers/common.py | 15 | ||||
-rw-r--r-- | python/fatcat_tools/importers/crossref.py | 4 | ||||
-rw-r--r-- | python/fatcat_tools/importers/grobid_metadata.py | 13 | ||||
-rw-r--r-- | python/fatcat_tools/importers/issn.py | 9 | ||||
-rw-r--r-- | python/fatcat_tools/workers/changelog.py | 2 | ||||
-rw-r--r-- | python/fatcat_web/routes.py | 10 | ||||
-rwxr-xr-x | python/fatcat_worker.py | 2 |
9 files changed, 47 insertions, 29 deletions
diff --git a/python/fatcat_import.py b/python/fatcat_import.py index a5527b8c..cdf04db1 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -3,7 +3,7 @@ import sys import argparse from fatcat_tools.importers import CrossrefImporter, OrcidImporter, \ - IssnImporter, MatchedImporter, GrobidMetadataImporter + IssnImporter, MatchedImporter, GrobidMetadataImporter def run_crossref(args): diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py index 1a6807d2..ed80cfc9 100644 --- a/python/fatcat_tools/harvest/doi_registrars.py +++ b/python/fatcat_tools/harvest/doi_registrars.py @@ -3,6 +3,7 @@ import re import sys import csv import json +import time import requests import itertools import datetime @@ -10,6 +11,11 @@ from pykafka import KafkaClient from fatcat_tools.workers.worker_common import most_recent_message +# Skip pylint due to: +# AttributeError: 'NoneType' object has no attribute 'scope' +# in 'astroid/node_classes.py' +# pylint: skip-file + DATE_FMT = "%Y-%m-%d" @@ -79,7 +85,7 @@ class HarvestCrossrefWorker: date_str, date_str) if self.is_update_filter is not None: filter_param += ',is_update:{}'.format(bool(self.is_update_filter)) - params = { + return { 'filter': filter_param, 'rows': self.api_batch_size, 'cursor': '*', @@ -93,7 +99,7 @@ class HarvestCrossrefWorker: state_topic = self.kafka.topics[self.state_topic] produce_topic = self.kafka.topics[self.produce_topic] - + date_str = date.strftime(DATE_FMT) params = self.params(date_str) headers = { @@ -103,12 +109,12 @@ class HarvestCrossrefWorker: with produce_topic.get_producer() as producer: while True: http_resp = requests.get(self.api_host_url, params, headers=headers) - if http_resp.status_code is 503: + if http_resp.status_code == 503: # crud backoff print("got HTTP {}, pausing for 30 seconds".format(http_resp.status_code)) time.sleep(30.0) continue - assert http_resp.status_code is 200 + assert http_resp.status_code == 200 resp = http_resp.json() items = self.extract_items(resp) count += len(items) @@ -135,7 +141,7 @@ class HarvestCrossrefWorker: today_utc = datetime.datetime.utcnow().date() if self.start_date is None: self.start_date = self.get_latest_date() - if self.start_date: + if self.start_date: # if we are continuing, start day after last success self.start_date = self.start_date + datetime.timedelta(days=1) if self.start_date is None: @@ -167,7 +173,7 @@ class HarvestDataciteWorker(HarvestCrossrefWorker): """ datacite has a REST API as well as OAI-PMH endpoint. - have about 8 million + have about 8 million bulk export notes: https://github.com/datacite/datacite/issues/188 @@ -206,4 +212,3 @@ class HarvestDataciteWorker(HarvestCrossrefWorker): def update_params(self, params, resp): params['page[number]'] = resp['meta']['page'] + 1 return params - diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 9cf92b41..e1efde80 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -37,12 +37,21 @@ class FatcatImporter: print("Processed {} lines, inserted {}, updated {}.".format( self.counts['processed_lines'], self.counts['insert'], self.counts['update'])) + def create_row(self, row, editgroup_id=None): + # sub-classes expected to implement this + raise NotImplementedError + + def create_batch(self, rows, editgroup_id=None): + # sub-classes expected to implement this + raise NotImplementedError + def process_source(self, source, group_size=100): """Creates and auto-accepts editgroup every group_size rows""" eg = self.api.create_editgroup( fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae')) + i = 0 for i, row in enumerate(source): - self.create_row(row, editgroup=eg.id) + self.create_row(row, editgroup_id=eg.id) if i > 0 and (i % group_size) == 0: self.api.accept_editgroup(eg.id) eg = self.api.create_editgroup( @@ -57,7 +66,7 @@ class FatcatImporter: self.counts['processed_lines'] += len(rows) eg = self.api.create_editgroup( fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae')) - self.create_batch(rows, editgroup=eg.id) + self.create_batch(rows, editgroup_id=eg.id) def process_csv_source(self, source, group_size=100, delimiter=','): reader = csv.DictReader(source, delimiter=delimiter) @@ -85,7 +94,7 @@ class FatcatImporter: return container_id def is_orcid(self, orcid): - return self._orcid_regex.match(orcid) != None + return self._orcid_regex.match(orcid) is not None def lookup_orcid(self, orcid): """Caches calls to the Orcid lookup API endpoint in a local dict""" diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index fac8f32b..d0a69cd6 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -112,7 +112,7 @@ class CrossrefImporter(FatcatImporter): extra['sequence'] = am.get('sequence') if not extra: extra = None - assert(ctype in ("author", "editor", "translator")) + assert ctype in ("author", "editor", "translator") contribs.append(fatcat_client.ReleaseContrib( creator_id=creator_id, index=index, @@ -133,7 +133,7 @@ class CrossrefImporter(FatcatImporter): publisher = obj.get('publisher') ce = None - if (container_id is None and self.create_containers and issnl != None + if (container_id is None and self.create_containers and (issnl is not None) and obj.get('container-title') and len(obj['container-title']) > 0): ce = fatcat_client.ContainerEntity( issnl=issnl, diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py index ba8a4e6f..d525d4f7 100644 --- a/python/fatcat_tools/importers/grobid_metadata.py +++ b/python/fatcat_tools/importers/grobid_metadata.py @@ -21,7 +21,6 @@ class GrobidMetadataImporter(FatcatImporter): if not obj.get('title'): return None - release = dict() extra = dict() if obj.get('abstract') and len(obj.get('abstract')) < MAX_ABSTRACT_BYTES: @@ -35,7 +34,6 @@ class GrobidMetadataImporter(FatcatImporter): contribs = [] for i, a in enumerate(obj.get('authors', [])): - c = dict(raw_name=a['name'], role="author") contribs.append(fatcat_client.ReleaseContrib( index=i, raw_name=a['name'], @@ -67,7 +65,6 @@ class GrobidMetadataImporter(FatcatImporter): ref['extra'] = cite_extra refs.append(ref) - release_type = "article-journal" release_date = None if obj.get('date'): # TODO: only returns year, ever? how to handle? @@ -77,7 +74,7 @@ class GrobidMetadataImporter(FatcatImporter): extra['doi'] = obj['doi'] if obj['journal'] and obj['journal'].get('name'): extra['container_name'] = obj['journal']['name'] - + extra['is_longtail_oa'] = True # TODO: ISSN/eISSN handling? or just journal name lookup? @@ -89,6 +86,8 @@ class GrobidMetadataImporter(FatcatImporter): re = fatcat_client.ReleaseEntity( title=obj['title'].strip(), + release_type="article-journal", + release_date=release_date, contribs=contribs, refs=refs, publisher=obj['journal'].get('publisher'), @@ -97,7 +96,7 @@ class GrobidMetadataImporter(FatcatImporter): abstracts=abstracts, extra=extra) return re - + # TODO: make this a common function somewhere def make_url(self, raw): rel = self.default_link_rel @@ -111,7 +110,7 @@ class GrobidMetadataImporter(FatcatImporter): return fatcat_client.FileEntityUrls(url=raw, rel=rel) def parse_file_metadata(self, sha1_key, cdx, mimetype, file_size): - + sha1 = base64.b16encode(base64.b32decode(sha1_key.replace('sha1:', ''))).decode('ascii').lower() # lookup existing SHA1, or create new entity @@ -141,7 +140,7 @@ class GrobidMetadataImporter(FatcatImporter): fe.urls.append( fatcat_client.FileEntityUrls(url=wayback, rel="webarchive")) original_url = self.make_url(original) - if original_url != None: + if original_url is not None: fe.urls.append(original_url) return fe diff --git a/python/fatcat_tools/importers/issn.py b/python/fatcat_tools/importers/issn.py index 0b0efccb..f702dc60 100644 --- a/python/fatcat_tools/importers/issn.py +++ b/python/fatcat_tools/importers/issn.py @@ -17,6 +17,7 @@ def truthy(s): if s is None: return None s = s.lower() + if s in ('true', 't', 'yes', 'y', '1'): return True elif s in ('false', 'f', 'no', 'n', '0'): @@ -37,12 +38,12 @@ class IssnImporter(FatcatImporter): def parse_issn_row(self, row): """ row is a python dict (parsed from CSV). - returns a ContainerEntity + returns a ContainerEntity (or None if invalid or couldn't parse) """ title = or_none(row['title']) issnl = or_none(row['ISSN-L']) if title is None or issnl is None: - return + return None extra = dict( in_doaj=truthy(row['in_doaj']), in_road=truthy(row['in_road']), @@ -72,7 +73,7 @@ class IssnImporter(FatcatImporter): def create_batch(self, batch, editgroup=None): """Reads and processes in batches (not API-call-per-line)""" objects = [self.parse_issn_row(l) - for l in batch if l != None] - objects = [o for o in objects if o != None] + for l in batch if (l is not None)] + objects = [o for o in objects if (o is not None)] self.api.create_container_batch(objects, autoaccept="true", editgroup=editgroup) self.counts['insert'] += len(objects) diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py index e64c043b..e803e2d0 100644 --- a/python/fatcat_tools/workers/changelog.py +++ b/python/fatcat_tools/workers/changelog.py @@ -27,7 +27,7 @@ class ChangelogWorker(FatcatWorker): # topic if self.offset is None: print("Checking for most recent changelog offset...") - msg = self.most_recent_message(topic) + msg = most_recent_message(topic) if msg: self.offset = json.loads(msg.decode('utf-8'))['index'] else: diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py index 62096a93..64d5931e 100644 --- a/python/fatcat_web/routes.py +++ b/python/fatcat_web/routes.py @@ -50,11 +50,13 @@ def container_create_view(): @app.route('/container/create', methods=['POST']) def container_create(): + raise NotImplementedError params = dict() for k in request.form: if k.startswith('container_'): params[k[10:]] = request.form[k] - edit = api.create_container(params=params) + container = None + edit = api.create_container(container, params=params) return redirect("/container/{}".format(edit.ident)) @app.route('/container/lookup', methods=['GET']) @@ -174,11 +176,13 @@ def release_create_view(): @app.route('/release/create', methods=['POST']) def release_create(): + raise NotImplementedError params = dict() for k in request.form: if k.startswith('release_'): params[k[10:]] = request.form[k] - edit = api.create_release(params=params) + release = None + edit = api.create_release(release, params=params) return redirect("/release/{}".format(edit.ident)) @app.route('/release/<ident>/history', methods=['GET']) @@ -260,7 +264,7 @@ def work_view(ident): @app.route('/editgroup/current', methods=['GET']) def editgroup_current(): - raise NotImplemented() + raise NotImplementedError #eg = api.get_or_create_editgroup() #return redirect('/editgroup/{}'.format(eg.id)) diff --git a/python/fatcat_worker.py b/python/fatcat_worker.py index 2f883fe0..e0ac48d8 100755 --- a/python/fatcat_worker.py +++ b/python/fatcat_worker.py @@ -21,7 +21,7 @@ def run_entity_updates(args): def run_elasticsearch_release(args): consume_topic = "fatcat-{}.release-updates".format(args.env) - worker = ReleaseWorker(args.kafka_hosts, + worker = ElasticsearchReleaseWorker(args.kafka_hosts, consume_topic, elasticsearch_backend=args.elasticsearch_backend, elasticsearch_index=args.elasticsearch_index) worker.run() |