Diffstat (limited to 'python')

 -rwxr-xr-x  python/fatcat_import.py                            2
 -rw-r--r--  python/fatcat_tools/harvest/doi_registrars.py     19
 -rw-r--r--  python/fatcat_tools/importers/common.py           15
 -rw-r--r--  python/fatcat_tools/importers/crossref.py          4
 -rw-r--r--  python/fatcat_tools/importers/grobid_metadata.py  13
 -rw-r--r--  python/fatcat_tools/importers/issn.py              9
 -rw-r--r--  python/fatcat_tools/workers/changelog.py           2
 -rw-r--r--  python/fatcat_web/routes.py                       10
 -rwxr-xr-x  python/fatcat_worker.py                            2

9 files changed, 47 insertions(+), 29 deletions(-)
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index a5527b8c..cdf04db1 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -3,7 +3,7 @@
 import sys
 import argparse
 from fatcat_tools.importers import CrossrefImporter, OrcidImporter, \
-    IssnImporter, MatchedImporter, GrobidMetadataImporter 
+    IssnImporter, MatchedImporter, GrobidMetadataImporter
 
 
 def run_crossref(args):
diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py
index 1a6807d2..ed80cfc9 100644
--- a/python/fatcat_tools/harvest/doi_registrars.py
+++ b/python/fatcat_tools/harvest/doi_registrars.py
@@ -3,6 +3,7 @@ import re
 import sys
 import csv
 import json
+import time
 import requests
 import itertools
 import datetime
@@ -10,6 +11,11 @@ from pykafka import KafkaClient
 
 from fatcat_tools.workers.worker_common import most_recent_message
 
+# Skip pylint due to:
+#   AttributeError: 'NoneType' object has no attribute 'scope'
+# in 'astroid/node_classes.py'
+# pylint: skip-file
+
 DATE_FMT = "%Y-%m-%d"
@@ -79,7 +85,7 @@ class HarvestCrossrefWorker:
             date_str, date_str)
         if self.is_update_filter is not None:
             filter_param += ',is_update:{}'.format(bool(self.is_update_filter))
-        params = {
+        return {
            'filter': filter_param,
            'rows': self.api_batch_size,
            'cursor': '*',
@@ -93,7 +99,7 @@ class HarvestCrossrefWorker:
 
         state_topic = self.kafka.topics[self.state_topic]
         produce_topic = self.kafka.topics[self.produce_topic]
- 
+
         date_str = date.strftime(DATE_FMT)
         params = self.params(date_str)
         headers = {
@@ -103,12 +109,12 @@ class HarvestCrossrefWorker:
         with produce_topic.get_producer() as producer:
             while True:
                 http_resp = requests.get(self.api_host_url, params, headers=headers)
-                if http_resp.status_code is 503:
+                if http_resp.status_code == 503:
                     # crud backoff
                     print("got HTTP {}, pausing for 30 seconds".format(http_resp.status_code))
                     time.sleep(30.0)
                     continue
-                assert http_resp.status_code is 200
+                assert http_resp.status_code == 200
                 resp = http_resp.json()
                 items = self.extract_items(resp)
                 count += len(items)
@@ -135,7 +141,7 @@ class HarvestCrossrefWorker:
         today_utc = datetime.datetime.utcnow().date()
         if self.start_date is None:
             self.start_date = self.get_latest_date()
-            if self.start_date: 
+            if self.start_date:
                 # if we are continuing, start day after last success
                 self.start_date = self.start_date + datetime.timedelta(days=1)
         if self.start_date is None:
@@ -167,7 +173,7 @@ class HarvestDataciteWorker(HarvestCrossrefWorker):
     """
     datacite has a REST API as well as OAI-PMH endpoint.
 
-    have about 8 million 
+    have about 8 million
 
     bulk export notes: https://github.com/datacite/datacite/issues/188
@@ -206,4 +212,3 @@ class HarvestDataciteWorker(HarvestCrossrefWorker):
 
     def update_params(self, params, resp):
         params['page[number]'] = resp['meta']['page'] + 1
         return params
-
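Note on the status-code fix in doi_registrars.py above: "is" tests object identity, not equality. CPython happens to cache small integers (-5 through 256), so "status_code is 200" can appear to work, but 503 falls outside that cache and "status_code is 503" is effectively always False. A minimal sketch of the pitfall (plain Python, nothing fatcat-specific assumed):

    # identity vs. equality for ints in CPython
    code = 500 + 3          # computed at runtime, value 503
    print(code == 503)      # True: compares values
    print(code is 503)      # False (and a SyntaxWarning on Python 3.8+):
                            # 503 is outside the small-int cache, so the
                            # literal and the computed int are distinct objects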
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 9cf92b41..e1efde80 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -37,12 +37,21 @@ class FatcatImporter:
         print("Processed {} lines, inserted {}, updated {}.".format(
             self.counts['processed_lines'], self.counts['insert'], self.counts['update']))
 
+    def create_row(self, row, editgroup_id=None):
+        # sub-classes expected to implement this
+        raise NotImplementedError
+
+    def create_batch(self, rows, editgroup_id=None):
+        # sub-classes expected to implement this
+        raise NotImplementedError
+
     def process_source(self, source, group_size=100):
         """Creates and auto-accepts editgroup every group_size rows"""
         eg = self.api.create_editgroup(
             fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae'))
+        i = 0
         for i, row in enumerate(source):
-            self.create_row(row, editgroup=eg.id)
+            self.create_row(row, editgroup_id=eg.id)
             if i > 0 and (i % group_size) == 0:
                 self.api.accept_editgroup(eg.id)
                 eg = self.api.create_editgroup(
@@ -57,7 +66,7 @@ class FatcatImporter:
             self.counts['processed_lines'] += len(rows)
             eg = self.api.create_editgroup(
                 fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae'))
-            self.create_batch(rows, editgroup=eg.id)
+            self.create_batch(rows, editgroup_id=eg.id)
 
     def process_csv_source(self, source, group_size=100, delimiter=','):
         reader = csv.DictReader(source, delimiter=delimiter)
@@ -85,7 +94,7 @@ class FatcatImporter:
         return container_id
 
     def is_orcid(self, orcid):
-        return self._orcid_regex.match(orcid) != None
+        return self._orcid_regex.match(orcid) is not None
 
     def lookup_orcid(self, orcid):
         """Caches calls to the Orcid lookup API endpoint in a local dict"""
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index fac8f32b..d0a69cd6 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -112,7 +112,7 @@ class CrossrefImporter(FatcatImporter):
                     extra['sequence'] = am.get('sequence')
                 if not extra:
                     extra = None
-                assert(ctype in ("author", "editor", "translator"))
+                assert ctype in ("author", "editor", "translator")
                 contribs.append(fatcat_client.ReleaseContrib(
                     creator_id=creator_id,
                     index=index,
@@ -133,7 +133,7 @@ class CrossrefImporter(FatcatImporter):
         publisher = obj.get('publisher')
 
         ce = None
-        if (container_id is None and self.create_containers and issnl != None 
+        if (container_id is None and self.create_containers and (issnl is not None)
             and obj.get('container-title') and len(obj['container-title']) > 0):
             ce = fatcat_client.ContainerEntity(
                 issnl=issnl,
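The create_row/create_batch stubs added to common.py spell out the importer contract: the base class declares the hooks and raises NotImplementedError, and each concrete importer overrides them. A minimal sketch of the pattern under assumed names (BaseImporter/PrintImporter are illustrative, not fatcat classes):

    class BaseImporter:
        def create_row(self, row, editgroup_id=None):
            # sub-classes expected to implement this
            raise NotImplementedError

    class PrintImporter(BaseImporter):
        def create_row(self, row, editgroup_id=None):
            # concrete importer supplies the real behavior
            print("insert", row, "in editgroup", editgroup_id)

    PrintImporter().create_row({"title": "x"}, editgroup_id="eg-1")  # ok
    BaseImporter().create_row({"title": "x"})  # raises NotImplementedError

The "i = 0" initialization in process_source has a related purpose: if the source is empty, the loop variable is never bound, so initializing it avoids an UnboundLocalError in any code that reads i after the loop.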
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index ba8a4e6f..d525d4f7 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -21,7 +21,6 @@ class GrobidMetadataImporter(FatcatImporter):
         if not obj.get('title'):
             return None
 
-        release = dict()
         extra = dict()
 
         if obj.get('abstract') and len(obj.get('abstract')) < MAX_ABSTRACT_BYTES:
@@ -35,7 +34,6 @@ class GrobidMetadataImporter(FatcatImporter):
 
         contribs = []
         for i, a in enumerate(obj.get('authors', [])):
-            c = dict(raw_name=a['name'], role="author")
             contribs.append(fatcat_client.ReleaseContrib(
                 index=i,
                 raw_name=a['name'],
@@ -67,7 +65,6 @@ class GrobidMetadataImporter(FatcatImporter):
             ref['extra'] = cite_extra
             refs.append(ref)
 
-        release_type = "article-journal"
         release_date = None
         if obj.get('date'):
             # TODO: only returns year, ever? how to handle?
@@ -77,7 +74,7 @@ class GrobidMetadataImporter(FatcatImporter):
             extra['doi'] = obj['doi']
         if obj['journal'] and obj['journal'].get('name'):
             extra['container_name'] = obj['journal']['name']
- 
+
         extra['is_longtail_oa'] = True
 
         # TODO: ISSN/eISSN handling? or just journal name lookup?
@@ -89,6 +86,8 @@ class GrobidMetadataImporter(FatcatImporter):
 
         re = fatcat_client.ReleaseEntity(
             title=obj['title'].strip(),
+            release_type="article-journal",
+            release_date=release_date,
             contribs=contribs,
             refs=refs,
             publisher=obj['journal'].get('publisher'),
@@ -97,7 +96,7 @@ class GrobidMetadataImporter(FatcatImporter):
             abstracts=abstracts,
             extra=extra)
         return re
- 
+
     # TODO: make this a common function somewhere
     def make_url(self, raw):
         rel = self.default_link_rel
@@ -111,7 +110,7 @@ class GrobidMetadataImporter(FatcatImporter):
         return fatcat_client.FileEntityUrls(url=raw, rel=rel)
 
     def parse_file_metadata(self, sha1_key, cdx, mimetype, file_size):
- 
+
         sha1 = base64.b16encode(base64.b32decode(sha1_key.replace('sha1:', ''))).decode('ascii').lower()
 
         # lookup existing SHA1, or create new entity
@@ -141,7 +140,7 @@ class GrobidMetadataImporter(FatcatImporter):
         fe.urls.append(
             fatcat_client.FileEntityUrls(url=wayback, rel="webarchive"))
         original_url = self.make_url(original)
-        if original_url != None:
+        if original_url is not None:
             fe.urls.append(original_url)
         return fe
diff --git a/python/fatcat_tools/importers/issn.py b/python/fatcat_tools/importers/issn.py
index 0b0efccb..f702dc60 100644
--- a/python/fatcat_tools/importers/issn.py
+++ b/python/fatcat_tools/importers/issn.py
@@ -17,6 +17,7 @@ def truthy(s):
     if s is None:
         return None
     s = s.lower()
+
     if s in ('true', 't', 'yes', 'y', '1'):
         return True
     elif s in ('false', 'f', 'no', 'n', '0'):
@@ -37,12 +38,12 @@ class IssnImporter(FatcatImporter):
     def parse_issn_row(self, row):
         """
         row is a python dict (parsed from CSV).
-        returns a ContainerEntity
+        returns a ContainerEntity (or None if invalid or couldn't parse)
         """
         title = or_none(row['title'])
         issnl = or_none(row['ISSN-L'])
         if title is None or issnl is None:
-            return
+            return None
         extra = dict(
             in_doaj=truthy(row['in_doaj']),
             in_road=truthy(row['in_road']),
@@ -72,7 +73,7 @@ class IssnImporter(FatcatImporter):
     def create_batch(self, batch, editgroup=None):
         """Reads and processes in batches (not API-call-per-line)"""
         objects = [self.parse_issn_row(l)
-                   for l in batch if l != None]
-        objects = [o for o in objects if o != None]
+                   for l in batch if (l is not None)]
+        objects = [o for o in objects if (o is not None)]
         self.api.create_container_batch(objects, autoaccept="true", editgroup=editgroup)
         self.counts['insert'] += len(objects)
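Several hunks above (common.py, grobid_metadata.py, issn.py) replace "!= None" comparisons with "is not None", as PEP 8 recommends: the equality operators dispatch to a class's __eq__, which can misreport comparisons against None, while an identity test cannot be overridden. A small illustration (the Weird class is hypothetical):

    class Weird:
        def __eq__(self, other):
            return True   # claims equality with everything, None included

    w = Weird()
    print(w != None)       # False: __eq__ hijacks the comparison
    print(w is not None)   # True: identity is not overridable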
diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py
index e64c043b..e803e2d0 100644
--- a/python/fatcat_tools/workers/changelog.py
+++ b/python/fatcat_tools/workers/changelog.py
@@ -27,7 +27,7 @@ class ChangelogWorker(FatcatWorker):
         # topic
         if self.offset is None:
             print("Checking for most recent changelog offset...")
-            msg = self.most_recent_message(topic)
+            msg = most_recent_message(topic)
             if msg:
                 self.offset = json.loads(msg.decode('utf-8'))['index']
             else:
diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py
index 62096a93..64d5931e 100644
--- a/python/fatcat_web/routes.py
+++ b/python/fatcat_web/routes.py
@@ -50,11 +50,13 @@ def container_create_view():
 
 @app.route('/container/create', methods=['POST'])
 def container_create():
+    raise NotImplementedError
     params = dict()
     for k in request.form:
         if k.startswith('container_'):
             params[k[10:]] = request.form[k]
-    edit = api.create_container(params=params)
+    container = None
+    edit = api.create_container(container, params=params)
     return redirect("/container/{}".format(edit.ident))
 
 @app.route('/container/lookup', methods=['GET'])
@@ -174,11 +176,13 @@ def release_create_view():
 
 @app.route('/release/create', methods=['POST'])
 def release_create():
+    raise NotImplementedError
     params = dict()
     for k in request.form:
         if k.startswith('release_'):
             params[k[10:]] = request.form[k]
-    edit = api.create_release(params=params)
+    release = None
+    edit = api.create_release(release, params=params)
     return redirect("/release/{}".format(edit.ident))
 
 @app.route('/release/<ident>/history', methods=['GET'])
@@ -260,7 +264,7 @@ def work_view(ident):
 
 @app.route('/editgroup/current', methods=['GET'])
 def editgroup_current():
-    raise NotImplemented()
+    raise NotImplementedError
     #eg = api.get_or_create_editgroup()
     #return redirect('/editgroup/{}'.format(eg.id))
diff --git a/python/fatcat_worker.py b/python/fatcat_worker.py
index 2f883fe0..e0ac48d8 100755
--- a/python/fatcat_worker.py
+++ b/python/fatcat_worker.py
@@ -21,7 +21,7 @@ def run_entity_updates(args):
 
 def run_elasticsearch_release(args):
     consume_topic = "fatcat-{}.release-updates".format(args.env)
-    worker = ReleaseWorker(args.kafka_hosts,
+    worker = ElasticsearchReleaseWorker(args.kafka_hosts,
         consume_topic, elasticsearch_backend=args.elasticsearch_backend,
         elasticsearch_index=args.elasticsearch_index)
     worker.run()
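The routes.py hunk also swaps "raise NotImplemented()" for "raise NotImplementedError". NotImplemented is a sentinel value meant to be returned from binary special methods such as __eq__, not an exception, so calling and raising it fails with an unrelated TypeError instead of signaling "not implemented yet". Quick sketch:

    def stub_broken():
        raise NotImplemented()     # TypeError: 'NotImplementedType' object is not callable

    def stub_fixed():
        raise NotImplementedError  # the intended exception for unfinished code

    try:
        stub_fixed()
    except NotImplementedError:
        print("stub raises the right exception")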
