43 files changed, 4020 insertions, 3194 deletions
diff --git a/python/fatcat_tools/__init__.py b/python/fatcat_tools/__init__.py
index ec38a17b..6f9ee7d8 100644
--- a/python/fatcat_tools/__init__.py
+++ b/python/fatcat_tools/__init__.py
@@ -1,4 +1,3 @@
-
 from .api_auth import authenticated_api, public_api
 from .fcid import fcid2uuid, uuid2fcid
 from .kafka import kafka_fail_fast, simple_kafka_producer
diff --git a/python/fatcat_tools/api_auth.py b/python/fatcat_tools/api_auth.py
index bbf059c0..d8f0c46d 100644
--- a/python/fatcat_tools/api_auth.py
+++ b/python/fatcat_tools/api_auth.py
@@ -1,4 +1,3 @@
-
 import os
 import sys
 
@@ -15,6 +14,7 @@ def public_api(host_uri):
     conf.host = host_uri
     return fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(conf))
 
+
 def authenticated_api(host_uri, token=None):
     """
     Note: if this helper is called, it's implied that an actual API connection
@@ -24,10 +24,11 @@ def authenticated_api(host_uri, token=None):
     conf = fatcat_openapi_client.Configuration()
     conf.host = host_uri
     if not token:
-        token = os.environ['FATCAT_API_AUTH_TOKEN']
+        token = os.environ["FATCAT_API_AUTH_TOKEN"]
     if not token:
         sys.stderr.write(
-            'This client requires a fatcat API token (eg, in env var FATCAT_API_AUTH_TOKEN)\n')
+            "This client requires a fatcat API token (eg, in env var FATCAT_API_AUTH_TOKEN)\n"
+        )
         sys.exit(-1)
 
     conf.api_key["Authorization"] = token
diff --git a/python/fatcat_tools/cleanups/__init__.py b/python/fatcat_tools/cleanups/__init__.py
index 587c7b9b..0aeec977 100644
--- a/python/fatcat_tools/cleanups/__init__.py
+++ b/python/fatcat_tools/cleanups/__init__.py
@@ -1,3 +1,2 @@
-
 from .common import EntityCleaner
 from .files import FileCleaner
diff --git a/python/fatcat_tools/cleanups/common.py b/python/fatcat_tools/cleanups/common.py
index d0fcc761..26ca7bd6 100644
--- a/python/fatcat_tools/cleanups/common.py
+++ b/python/fatcat_tools/cleanups/common.py
@@ -1,4 +1,3 @@
-
 import copy
 import json
 import subprocess
@@ -30,16 +29,19 @@ class EntityCleaner:
 
     def __init__(self, api, entity_type, **kwargs):
 
-        eg_extra = kwargs.get('editgroup_extra', dict())
-        eg_extra['git_rev'] = eg_extra.get('git_rev',
-            subprocess.check_output(["git", "describe", "--always"]).strip()).decode('utf-8')
-        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.EntityCleaner')
+        eg_extra = kwargs.get("editgroup_extra", dict())
+        eg_extra["git_rev"] = eg_extra.get(
+            "git_rev", subprocess.check_output(["git", "describe", "--always"]).strip()
+        ).decode("utf-8")
+        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.EntityCleaner")
 
         self.api = api
         self.entity_type = entity_type
-        self.dry_run_mode = kwargs.get('dry_run_mode', True)
-        self.edit_batch_size = kwargs.get('edit_batch_size', 50)
-        self.editgroup_description = kwargs.get('editgroup_description', "Generic Entity Cleaner Bot")
+        self.dry_run_mode = kwargs.get("dry_run_mode", True)
+        self.edit_batch_size = kwargs.get("edit_batch_size", 50)
+        self.editgroup_description = kwargs.get(
+            "editgroup_description", "Generic Entity Cleaner Bot"
+        )
         self.editgroup_extra = eg_extra
         self.reset()
         self.ac = ApiClient()
@@ -48,7 +50,7 @@ class EntityCleaner:
             print("Running in dry-run mode!")
 
     def reset(self):
-        self.counts = Counter({'lines': 0, 'cleaned': 0, 'updated': 0})
+
       self.counts = Counter({"lines": 0, "cleaned": 0, "updated": 0})          self._edit_count = 0          self._editgroup_id = None          self._entity_queue = [] @@ -63,23 +65,23 @@ class EntityCleaner:          Returns nothing.          """ -        self.counts['lines'] += 1 -        if (not record): -            self.counts['skip-null'] += 1 +        self.counts["lines"] += 1 +        if not record: +            self.counts["skip-null"] += 1              return          entity = entity_from_dict(record, self.entity_type, api_client=self.ac) -        if entity.state != 'active': -            self.counts['skip-inactive'] += 1 +        if entity.state != "active": +            self.counts["skip-inactive"] += 1              return          cleaned = self.clean_entity(copy.deepcopy(entity))          if entity == cleaned: -            self.counts['skip-clean'] += 1 +            self.counts["skip-clean"] += 1              return          else: -            self.counts['cleaned'] += 1 +            self.counts["cleaned"] += 1          if self.dry_run_mode:              entity_dict = entity_to_dict(entity, api_client=self.ac) @@ -87,11 +89,13 @@ class EntityCleaner:              return          if entity.ident in self._idents_inflight: -            raise ValueError("Entity already part of in-process update: {}".format(entity.ident)) +            raise ValueError( +                "Entity already part of in-process update: {}".format(entity.ident) +            )          updated = self.try_update(cleaned)          if updated: -            self.counts['updated'] += updated +            self.counts["updated"] += updated              self._edit_count += updated              self._idents_inflight.append(entity.ident) @@ -132,9 +136,8 @@ class EntityCleaner:          if not self._editgroup_id:              eg = self.api.create_editgroup( -                Editgroup( -                    description=self.editgroup_description, -                    extra=self.editgroup_extra)) +                Editgroup(description=self.editgroup_description, extra=self.editgroup_extra) +            )              self._editgroup_id = eg.editgroup_id          return self._editgroup_id diff --git a/python/fatcat_tools/cleanups/files.py b/python/fatcat_tools/cleanups/files.py index 0d275ba6..d378a91f 100644 --- a/python/fatcat_tools/cleanups/files.py +++ b/python/fatcat_tools/cleanups/files.py @@ -1,4 +1,3 @@ -  from fatcat_openapi_client.models import FileEntity  from fatcat_openapi_client.rest import ApiException @@ -12,14 +11,19 @@ class FileCleaner(EntityCleaner):      def __init__(self, api, **kwargs): -        eg_desc = kwargs.pop('editgroup_description', None) or "Automated cleanup of file entities (eg, remove bad URLs)" -        eg_extra = kwargs.pop('editgroup_extra', dict()) -        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.FileCleaner') -        super().__init__(api, +        eg_desc = ( +            kwargs.pop("editgroup_description", None) +            or "Automated cleanup of file entities (eg, remove bad URLs)" +        ) +        eg_extra = kwargs.pop("editgroup_extra", dict()) +        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.FileCleaner") +        super().__init__( +            api,              entity_type=FileEntity,              editgroup_description=eg_desc,              editgroup_extra=eg_extra, -            **kwargs) +            **kwargs +        )      def clean_entity(self, entity):          """ @@ -27,24 +31,24 @@ class FileCleaner(EntityCleaner):          """  
        # URL has ://web.archive.org/web/None/ link => delete URL -        entity.urls = [u for u in entity.urls if '://web.archive.org/web/None/' not in u.url] +        entity.urls = [u for u in entity.urls if "://web.archive.org/web/None/" not in u.url]          # URL has ://archive.org/ link with rel=repository => rel=archive          for u in entity.urls: -            if '://archive.org/' in u.url and u.rel == 'repository': -                u.rel = 'archive' +            if "://archive.org/" in u.url and u.rel == "repository": +                u.rel = "archive"          # URL has short wayback date ("2017") and another url with that as prefix => delete URL          stub_wayback_urls = []          full_wayback_urls = []          for u in entity.urls: -            if '://web.archive.org/web/' in u.url: -                if len(u.url.split('/')[4]) <= 8: +            if "://web.archive.org/web/" in u.url: +                if len(u.url.split("/")[4]) <= 8:                      stub_wayback_urls.append(u.url)                  else: -                    full_wayback_urls.append('/'.join(u.url.split('/')[5:])) +                    full_wayback_urls.append("/".join(u.url.split("/")[5:]))          for stub in stub_wayback_urls: -            target = '/'.join(stub.split('/')[5:]) +            target = "/".join(stub.split("/")[5:])              if target in full_wayback_urls:                  entity.urls = [u for u in entity.urls if u.url != stub] @@ -57,14 +61,14 @@ class FileCleaner(EntityCleaner):          except ApiException as err:              if err.status != 404:                  raise err -            self.counts['skip-not-found'] += 1 +            self.counts["skip-not-found"] += 1              return 0 -        if existing.state != 'active': -            self.counts['skip-existing-inactive'] += 1 +        if existing.state != "active": +            self.counts["skip-existing-inactive"] += 1              return 0          if existing.revision != entity.revision: -            self.counts['skip-revision'] += 1 +            self.counts["skip-revision"] += 1              return 0          self.api.update_file(self.get_editgroup_id(), entity.ident, entity) diff --git a/python/fatcat_tools/fcid.py b/python/fatcat_tools/fcid.py index 0987d10d..53891e5a 100644 --- a/python/fatcat_tools/fcid.py +++ b/python/fatcat_tools/fcid.py @@ -1,4 +1,3 @@ -  import base64  import uuid @@ -7,18 +6,20 @@ def fcid2uuid(s):      """      Converts a fatcat identifier (base32 encoded string) to a uuid.UUID object      """ -    s = s.split('_')[-1].upper().encode('utf-8') +    s = s.split("_")[-1].upper().encode("utf-8")      assert len(s) == 26      raw = base64.b32decode(s + b"======")      return str(uuid.UUID(bytes=raw)).lower() +  def uuid2fcid(s):      """      Converts a uuid.UUID object to a fatcat identifier (base32 encoded string)      """      raw = uuid.UUID(s).bytes -    return base64.b32encode(raw)[:26].lower().decode('utf-8') +    return base64.b32encode(raw)[:26].lower().decode("utf-8") +  def test_fcid(): -    test_uuid = '00000000-0000-0000-3333-000000000001' +    test_uuid = "00000000-0000-0000-3333-000000000001"      assert test_uuid == fcid2uuid(uuid2fcid(test_uuid)) diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py index d441d495..dd48e256 100644 --- a/python/fatcat_tools/harvest/doi_registrars.py +++ b/python/fatcat_tools/harvest/doi_registrars.py @@ -1,4 +1,3 @@ -  import json  import sys  import time @@ -59,29 +58,35 @@ class 
HarvestCrossrefWorker:      to be careful how state is serialized back into kafka.      """ -    def __init__(self, kafka_hosts, produce_topic, state_topic, contact_email, -            api_host_url="https://api.crossref.org/works", start_date=None, -            end_date=None): +    def __init__( +        self, +        kafka_hosts, +        produce_topic, +        state_topic, +        contact_email, +        api_host_url="https://api.crossref.org/works", +        start_date=None, +        end_date=None, +    ):          self.api_host_url = api_host_url          self.produce_topic = produce_topic          self.state_topic = state_topic          self.contact_email = contact_email          self.kafka_config = { -            'bootstrap.servers': kafka_hosts, -            'message.max.bytes': 20000000, # ~20 MBytes; broker is ~50 MBytes +            "bootstrap.servers": kafka_hosts, +            "message.max.bytes": 20000000,  # ~20 MBytes; broker is ~50 MBytes          }          self.state = HarvestState(start_date, end_date)          self.state.initialize_from_kafka(self.state_topic, self.kafka_config) -        self.loop_sleep = 60*60 # how long to wait, in seconds, between date checks +        self.loop_sleep = 60 * 60  # how long to wait, in seconds, between date checks          self.api_batch_size = 50          self.name = "Crossref"          self.producer = self._kafka_producer()      def _kafka_producer(self): -          def fail_fast(err, msg):              if err is not None:                  print("Kafka producer delivery error: {}".format(err), file=sys.stderr) @@ -92,46 +97,53 @@ class HarvestCrossrefWorker:          self._kafka_fail_fast = fail_fast          producer_conf = self.kafka_config.copy() -        producer_conf.update({ -            'delivery.report.only.error': True, -            'default.topic.config': { -                'request.required.acks': -1, # all brokers must confirm -            }, -        }) +        producer_conf.update( +            { +                "delivery.report.only.error": True, +                "default.topic.config": { +                    "request.required.acks": -1,  # all brokers must confirm +                }, +            } +        )          return Producer(producer_conf)      def params(self, date_str): -        filter_param = 'from-update-date:{},until-update-date:{}'.format( -            date_str, date_str) +        filter_param = "from-update-date:{},until-update-date:{}".format(date_str, date_str)          return { -            'filter': filter_param, -            'rows': self.api_batch_size, -            'cursor': '*', +            "filter": filter_param, +            "rows": self.api_batch_size, +            "cursor": "*",          }      def update_params(self, params, resp): -        params['cursor'] = resp['message']['next-cursor'] +        params["cursor"] = resp["message"]["next-cursor"]          return params      def extract_key(self, obj): -        return obj['DOI'].encode('utf-8') +        return obj["DOI"].encode("utf-8")      def fetch_date(self, date):          date_str = date.isoformat()          params = self.params(date_str)          http_session = requests_retry_session() -        http_session.headers.update({ -            'User-Agent': 'fatcat_tools/0.1.0 (https://fatcat.wiki; mailto:{}) python-requests'.format( -                self.contact_email), -        }) +        http_session.headers.update( +            { +                "User-Agent": "fatcat_tools/0.1.0 (https://fatcat.wiki; mailto:{}) 
python-requests".format( +                    self.contact_email +                ), +            } +        )          count = 0          while True:              http_resp = http_session.get(self.api_host_url, params=params)              if http_resp.status_code == 503:                  # crude backoff; now redundant with session exponential                  # backoff, but allows for longer backoff/downtime on remote end -                print("got HTTP {}, pausing for 30 seconds".format(http_resp.status_code), file=sys.stderr) +                print( +                    "got HTTP {}, pausing for 30 seconds".format(http_resp.status_code), +                    file=sys.stderr, +                )                  # keep kafka producer connection alive                  self.producer.poll(0)                  time.sleep(30.0) @@ -143,19 +155,27 @@ class HarvestCrossrefWorker:              except json.JSONDecodeError as exc:                  # Datacite API returned HTTP 200, but JSON seemed unparseable.                  # It might be a glitch, so we retry. -                print("failed to decode body from {}: {}".format(http_resp.url, resp_body), file=sys.stderr) +                print( +                    "failed to decode body from {}: {}".format(http_resp.url, resp_body), +                    file=sys.stderr, +                )                  raise exc              items = self.extract_items(resp)              count += len(items) -            print("... got {} ({} of {}), HTTP fetch took {}".format(len(items), count, -                self.extract_total(resp), http_resp.elapsed), file=sys.stderr) -            #print(json.dumps(resp)) +            print( +                "... got {} ({} of {}), HTTP fetch took {}".format( +                    len(items), count, self.extract_total(resp), http_resp.elapsed +                ), +                file=sys.stderr, +            ) +            # print(json.dumps(resp))              for work in items:                  self.producer.produce(                      self.produce_topic, -                    json.dumps(work).encode('utf-8'), +                    json.dumps(work).encode("utf-8"),                      key=self.extract_key(work), -                    on_delivery=self._kafka_fail_fast) +                    on_delivery=self._kafka_fail_fast, +                )              self.producer.poll(0)              if len(items) < self.api_batch_size:                  break @@ -163,10 +183,10 @@ class HarvestCrossrefWorker:          self.producer.flush()      def extract_items(self, resp): -        return resp['message']['items'] +        return resp["message"]["items"]      def extract_total(self, resp): -        return resp['message']['total-results'] +        return resp["message"]["total-results"]      def run(self, continuous=False): @@ -175,9 +195,9 @@ class HarvestCrossrefWorker:              if current:                  print("Fetching DOIs updated on {} (UTC)".format(current), file=sys.stderr)                  self.fetch_date(current) -                self.state.complete(current, -                    kafka_topic=self.state_topic, -                    kafka_config=self.kafka_config) +                self.state.complete( +                    current, kafka_topic=self.state_topic, kafka_config=self.kafka_config +                )                  continue              if continuous: @@ -200,16 +220,25 @@ class HarvestDataciteWorker(HarvestCrossrefWorker):      could/should use this script for that, and dump to JSON?      
""" -    def __init__(self, kafka_hosts, produce_topic, state_topic, contact_email, -            api_host_url="https://api.datacite.org/dois", -            start_date=None, end_date=None): -        super().__init__(kafka_hosts=kafka_hosts, -                         produce_topic=produce_topic, -                         state_topic=state_topic, -                         api_host_url=api_host_url, -                         contact_email=contact_email, -                         start_date=start_date, -                         end_date=end_date) +    def __init__( +        self, +        kafka_hosts, +        produce_topic, +        state_topic, +        contact_email, +        api_host_url="https://api.datacite.org/dois", +        start_date=None, +        end_date=None, +    ): +        super().__init__( +            kafka_hosts=kafka_hosts, +            produce_topic=produce_topic, +            state_topic=state_topic, +            api_host_url=api_host_url, +            contact_email=contact_email, +            start_date=start_date, +            end_date=end_date, +        )          # for datecite, it's "from-update-date"          self.name = "Datacite" @@ -219,19 +248,21 @@ class HarvestDataciteWorker(HarvestCrossrefWorker):          Dates have to be supplied in 2018-10-27T22:36:30.000Z format.          """          return { -            'query': 'updated:[{}T00:00:00.000Z TO {}T23:59:59.999Z]'.format(date_str, date_str), -            'page[size]': self.api_batch_size, -            'page[cursor]': 1, +            "query": "updated:[{}T00:00:00.000Z TO {}T23:59:59.999Z]".format( +                date_str, date_str +            ), +            "page[size]": self.api_batch_size, +            "page[cursor]": 1,          }      def extract_items(self, resp): -        return resp['data'] +        return resp["data"]      def extract_total(self, resp): -        return resp['meta']['total'] +        return resp["meta"]["total"]      def extract_key(self, obj): -        return obj['attributes']['doi'].encode('utf-8') +        return obj["attributes"]["doi"].encode("utf-8")      def update_params(self, params, resp):          """ @@ -245,9 +276,9 @@ class HarvestDataciteWorker(HarvestCrossrefWorker):              https://github.com/datacite/datacite/issues/897 (HTTP 400)              https://github.com/datacite/datacite/issues/898 (HTTP 500)          """ -        parsed = urlparse(resp['links']['next']) -        page_cursor = parse_qs(parsed.query).get('page[cursor]') +        parsed = urlparse(resp["links"]["next"]) +        page_cursor = parse_qs(parsed.query).get("page[cursor]")          if not page_cursor: -            raise ValueError('no page[cursor] in .links.next') -        params['page[cursor]'] = page_cursor[0] +            raise ValueError("no page[cursor] in .links.next") +        params["page[cursor]"] = page_cursor[0]          return params diff --git a/python/fatcat_tools/harvest/harvest_common.py b/python/fatcat_tools/harvest/harvest_common.py index 45c2b8ea..fda0dc62 100644 --- a/python/fatcat_tools/harvest/harvest_common.py +++ b/python/fatcat_tools/harvest/harvest_common.py @@ -1,4 +1,3 @@ -  import datetime  import json  import sys @@ -14,8 +13,10 @@ from requests.packages.urllib3.util.retry import Retry  # pylint: disable=import  # Used for parsing ISO date format (YYYY-MM-DD)  DATE_FMT = "%Y-%m-%d" -def requests_retry_session(retries=10, backoff_factor=3, -        status_forcelist=(500, 502, 504), session=None): + +def requests_retry_session( +    retries=10, 
backoff_factor=3, status_forcelist=(500, 502, 504), session=None +):      """      From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests      """ @@ -28,10 +29,11 @@ def requests_retry_session(retries=10, backoff_factor=3,          status_forcelist=status_forcelist,      )      adapter = HTTPAdapter(max_retries=retry) -    session.mount('http://', adapter) -    session.mount('https://', adapter) +    session.mount("http://", adapter) +    session.mount("https://", adapter)      return session +  class HarvestState:      """      First version of this works with full days (dates) @@ -57,8 +59,9 @@ class HarvestState:              self.enqueue_period(start_date, end_date, catchup_days)      def __str__(self): -        return '<HarvestState to_process={}, completed={}>'.format( -            len(self.to_process), len(self.completed)) +        return "<HarvestState to_process={}, completed={}>".format( +            len(self.to_process), len(self.completed) +        )      def enqueue_period(self, start_date=None, end_date=None, catchup_days=14):          """ @@ -92,7 +95,9 @@ class HarvestState:          """          if continuous:              # enqueue yesterday -            self.enqueue_period(start_date=datetime.datetime.utcnow().date() - datetime.timedelta(days=1)) +            self.enqueue_period( +                start_date=datetime.datetime.utcnow().date() - datetime.timedelta(days=1) +            )          if not self.to_process:              return None          return sorted(list(self.to_process))[0] @@ -105,8 +110,8 @@ class HarvestState:          state stored on disk or in Kafka.          """          state = json.loads(state_json) -        if 'completed-date' in state: -            date = datetime.datetime.strptime(state['completed-date'], DATE_FMT).date() +        if "completed-date" in state: +            date = datetime.datetime.strptime(state["completed-date"], DATE_FMT).date()              self.complete(date)      def complete(self, date, kafka_topic=None, kafka_config=None): @@ -123,12 +128,14 @@ class HarvestState:          except KeyError:              pass          self.completed.add(date) -        state_json = json.dumps({ -            'in-progress-dates': [str(d) for d in self.to_process], -            'completed-date': str(date), -        }).encode('utf-8') +        state_json = json.dumps( +            { +                "in-progress-dates": [str(d) for d in self.to_process], +                "completed-date": str(date), +            } +        ).encode("utf-8")          if kafka_topic: -            assert(kafka_config) +            assert kafka_config              def fail_fast(err, msg):                  if err: @@ -136,17 +143,16 @@ class HarvestState:              print("Committing status to Kafka: {}".format(kafka_topic), file=sys.stderr)              producer_conf = kafka_config.copy() -            producer_conf.update({ -                'delivery.report.only.error': True, -                'default.topic.config': { -                    'request.required.acks': -1, # all brokers must confirm -                }, -            }) +            producer_conf.update( +                { +                    "delivery.report.only.error": True, +                    "default.topic.config": { +                        "request.required.acks": -1,  # all brokers must confirm +                    }, +                } +            )              producer = Producer(producer_conf) -            producer.produce( -                kafka_topic, -              
  state_json, -                on_delivery=fail_fast) +            producer.produce(kafka_topic, state_json, on_delivery=fail_fast)              producer.flush()          return state_json @@ -166,22 +172,25 @@ class HarvestState:                  raise KafkaException(err)          conf = kafka_config.copy() -        conf.update({ -            'group.id': 'dummy_init_group', # should never be committed -            'enable.auto.commit': False, -            'auto.offset.reset': 'earliest', -            'session.timeout.ms': 10000, -        }) +        conf.update( +            { +                "group.id": "dummy_init_group",  # should never be committed +                "enable.auto.commit": False, +                "auto.offset.reset": "earliest", +                "session.timeout.ms": 10000, +            } +        )          consumer = Consumer(conf)          # this watermark fetch is mostly to ensure we are connected to broker and          # fail fast if not, but we also confirm that we read to end below.          hwm = consumer.get_watermark_offsets( -            TopicPartition(kafka_topic, 0), -            timeout=5.0, -            cached=False) +            TopicPartition(kafka_topic, 0), timeout=5.0, cached=False +        )          if not hwm: -            raise Exception("Kafka consumer timeout, or topic {} doesn't exist".format(kafka_topic)) +            raise Exception( +                "Kafka consumer timeout, or topic {} doesn't exist".format(kafka_topic) +            )          consumer.assign([TopicPartition(kafka_topic, 0, 0)])          c = 0 @@ -191,8 +200,8 @@ class HarvestState:                  break              if msg.error():                  raise KafkaException(msg.error()) -            #sys.stdout.write('.') -            self.update(msg.value().decode('utf-8')) +            # sys.stdout.write('.') +            self.update(msg.value().decode("utf-8"))              c += 1          consumer.close() diff --git a/python/fatcat_tools/harvest/oaipmh.py b/python/fatcat_tools/harvest/oaipmh.py index 0eb0343d..40d1c853 100644 --- a/python/fatcat_tools/harvest/oaipmh.py +++ b/python/fatcat_tools/harvest/oaipmh.py @@ -1,4 +1,3 @@ -  import sys  import time @@ -25,19 +24,18 @@ class HarvestOaiPmhWorker:      would want something similar operationally. Oh well!      
""" -    def __init__(self, kafka_hosts, produce_topic, state_topic, -            start_date=None, end_date=None): +    def __init__(self, kafka_hosts, produce_topic, state_topic, start_date=None, end_date=None):          self.produce_topic = produce_topic          self.state_topic = state_topic          self.kafka_config = { -            'bootstrap.servers': kafka_hosts, -            'message.max.bytes': 20000000, # ~20 MBytes; broker is ~50 MBytes +            "bootstrap.servers": kafka_hosts, +            "message.max.bytes": 20000000,  # ~20 MBytes; broker is ~50 MBytes          } -        self.loop_sleep = 60*60 # how long to wait, in seconds, between date checks +        self.loop_sleep = 60 * 60  # how long to wait, in seconds, between date checks -        self.endpoint_url = None # needs override +        self.endpoint_url = None  # needs override          self.metadata_prefix = None  # needs override          self.name = "unnamed"          self.state = HarvestState(start_date, end_date) @@ -45,7 +43,6 @@ class HarvestOaiPmhWorker:          print(self.state, file=sys.stderr)      def fetch_date(self, date): -          def fail_fast(err, msg):              if err is not None:                  print("Kafka producer delivery error: {}".format(err), file=sys.stderr) @@ -54,12 +51,14 @@ class HarvestOaiPmhWorker:                  raise KafkaException(err)          producer_conf = self.kafka_config.copy() -        producer_conf.update({ -            'delivery.report.only.error': True, -            'default.topic.config': { -                'request.required.acks': -1, # all brokers must confirm -            }, -        }) +        producer_conf.update( +            { +                "delivery.report.only.error": True, +                "default.topic.config": { +                    "request.required.acks": -1,  # all brokers must confirm +                }, +            } +        )          producer = Producer(producer_conf)          api = sickle.Sickle(self.endpoint_url, max_retries=5, retry_status_codes=[503]) @@ -67,13 +66,18 @@ class HarvestOaiPmhWorker:          # this dict kwargs hack is to work around 'from' as a reserved python keyword          # recommended by sickle docs          try: -            records = api.ListRecords(**{ -                'metadataPrefix': self.metadata_prefix, -                'from': date_str, -                'until': date_str, -            }) +            records = api.ListRecords( +                **{ +                    "metadataPrefix": self.metadata_prefix, +                    "from": date_str, +                    "until": date_str, +                } +            )          except sickle.oaiexceptions.NoRecordsMatch: -            print("WARN: no OAI-PMH records for this date: {} (UTC)".format(date_str), file=sys.stderr) +            print( +                "WARN: no OAI-PMH records for this date: {} (UTC)".format(date_str), +                file=sys.stderr, +            )              return          count = 0 @@ -83,9 +87,10 @@ class HarvestOaiPmhWorker:                  print("... 
up to {}".format(count), file=sys.stderr)              producer.produce(                  self.produce_topic, -                item.raw.encode('utf-8'), -                key=item.header.identifier.encode('utf-8'), -                on_delivery=fail_fast) +                item.raw.encode("utf-8"), +                key=item.header.identifier.encode("utf-8"), +                on_delivery=fail_fast, +            )          producer.flush()      def run(self, continuous=False): @@ -95,9 +100,9 @@ class HarvestOaiPmhWorker:              if current:                  print("Fetching DOIs updated on {} (UTC)".format(current), file=sys.stderr)                  self.fetch_date(current) -                self.state.complete(current, -                    kafka_topic=self.state_topic, -                    kafka_config=self.kafka_config) +                self.state.complete( +                    current, kafka_topic=self.state_topic, kafka_config=self.kafka_config +                )                  continue              if continuous: diff --git a/python/fatcat_tools/harvest/pubmed.py b/python/fatcat_tools/harvest/pubmed.py index ee55f4eb..0f33f334 100644 --- a/python/fatcat_tools/harvest/pubmed.py +++ b/python/fatcat_tools/harvest/pubmed.py @@ -60,14 +60,15 @@ class PubmedFTPWorker:          <tr>      """ +      def __init__(self, kafka_hosts, produce_topic, state_topic, start_date=None, end_date=None): -        self.name = 'Pubmed' -        self.host = 'ftp.ncbi.nlm.nih.gov' +        self.name = "Pubmed" +        self.host = "ftp.ncbi.nlm.nih.gov"          self.produce_topic = produce_topic          self.state_topic = state_topic          self.kafka_config = { -            'bootstrap.servers': kafka_hosts, -            'message.max.bytes': 20000000,  # ~20 MBytes; broker is ~50 MBytes +            "bootstrap.servers": kafka_hosts, +            "message.max.bytes": 20000000,  # ~20 MBytes; broker is ~50 MBytes          }          self.loop_sleep = 60 * 60  # how long to wait, in seconds, between date checks          self.state = HarvestState(start_date, end_date) @@ -86,12 +87,14 @@ class PubmedFTPWorker:          self._kafka_fail_fast = fail_fast          producer_conf = self.kafka_config.copy() -        producer_conf.update({ -            'delivery.report.only.error': True, -            'default.topic.config': { -                'request.required.acks': -1,  # all brokers must confirm -            }, -        }) +        producer_conf.update( +            { +                "delivery.report.only.error": True, +                "default.topic.config": { +                    "request.required.acks": -1,  # all brokers must confirm +                }, +            } +        )          return Producer(producer_conf)      def fetch_date(self, date): @@ -105,24 +108,35 @@ class PubmedFTPWorker:          if self.date_file_map is None:              raise ValueError("cannot fetch date without date file mapping") -        date_str = date.strftime('%Y-%m-%d') +        date_str = date.strftime("%Y-%m-%d")          paths = self.date_file_map.get(date_str)          if paths is None: -            print("WARN: no pubmed update for this date: {} (UTC), available dates were: {}".format(date_str, self.date_file_map), file=sys.stderr) +            print( +                "WARN: no pubmed update for this date: {} (UTC), available dates were: {}".format( +                    date_str, self.date_file_map +                ), +                file=sys.stderr, +            )              return False          count = 0          
for path in paths:              # Fetch and decompress file.              url = "ftp://{}{}".format(self.host, path) -            filename = ftpretr(url, proxy_hostport="159.69.240.245:15201") # TODO: proxy obsolete, when networking issue is resolved -            with tempfile.NamedTemporaryFile(prefix='fatcat-ftp-tmp-', delete=False) as decomp: +            filename = ftpretr( +                url, proxy_hostport="159.69.240.245:15201" +            )  # TODO: proxy obsolete, when networking issue is resolved +            with tempfile.NamedTemporaryFile(prefix="fatcat-ftp-tmp-", delete=False) as decomp:                  try:                      gzf = gzip.open(filename)                      shutil.copyfileobj(gzf, decomp)                  except zlib.error as exc: -                    print('[skip] retrieving {} failed with {} (maybe empty, missing or broken gzip)'.format( -                        url, exc), file=sys.stderr) +                    print( +                        "[skip] retrieving {} failed with {} (maybe empty, missing or broken gzip)".format( +                            url, exc +                        ), +                        file=sys.stderr, +                    )                      continue              # Here, blob is the unparsed XML; we peek into it to use PMID as @@ -131,15 +145,17 @@ class PubmedFTPWorker:              # WARNING: Parsing foreign XML exposes us at some              # https://docs.python.org/3/library/xml.html#xml-vulnerabilities              # here. -            for blob in xmlstream(decomp.name, 'PubmedArticle', encoding='utf-8'): -                soup = BeautifulSoup(blob, 'xml') -                pmid = soup.find('PMID') +            for blob in xmlstream(decomp.name, "PubmedArticle", encoding="utf-8"): +                soup = BeautifulSoup(blob, "xml") +                pmid = soup.find("PMID")                  if pmid is None:                      raise ValueError("no PMID found, please adjust identifier extraction")                  count += 1                  if count % 50 == 0:                      print("... up to {}".format(count), file=sys.stderr) -                self.producer.produce(self.produce_topic, blob, key=pmid.text, on_delivery=self._kafka_fail_fast) +                self.producer.produce( +                    self.produce_topic, blob, key=pmid.text, on_delivery=self._kafka_fail_fast +                )              self.producer.flush()              os.remove(filename) @@ -151,13 +167,17 @@ class PubmedFTPWorker:          while True:              self.date_file_map = generate_date_file_map(host=self.host)              if len(self.date_file_map) == 0: -                raise ValueError("map from dates to files should not be empty, maybe the HTML changed?") +                raise ValueError( +                    "map from dates to files should not be empty, maybe the HTML changed?" 
+                )              current = self.state.next_span(continuous)              if current:                  print("Fetching citations updated on {} (UTC)".format(current), file=sys.stderr)                  self.fetch_date(current) -                self.state.complete(current, kafka_topic=self.state_topic, kafka_config=self.kafka_config) +                self.state.complete( +                    current, kafka_topic=self.state_topic, kafka_config=self.kafka_config +                )                  continue              if continuous: @@ -168,7 +188,7 @@ class PubmedFTPWorker:          print("{} FTP ingest caught up".format(self.name)) -def generate_date_file_map(host='ftp.ncbi.nlm.nih.gov'): +def generate_date_file_map(host="ftp.ncbi.nlm.nih.gov"):      """      Generate a DefaultDict[string, set] mapping dates to absolute filepaths on      the server (mostly we have one file, but sometimes more). @@ -176,14 +196,14 @@ def generate_date_file_map(host='ftp.ncbi.nlm.nih.gov'):      Example: {"2020-01-02": set(["/pubmed/updatefiles/pubmed20n1016.xml.gz"]), ...}      """      mapping = collections.defaultdict(set) -    pattern = re.compile(r'Filename: ([^ ]*.xml) -- Created: ([^<]*)') +    pattern = re.compile(r"Filename: ([^ ]*.xml) -- Created: ([^<]*)")      ftp = ftplib.FTP(host)      ftp.login() -    filenames = ftp.nlst('/pubmed/updatefiles') +    filenames = ftp.nlst("/pubmed/updatefiles")      retries, retry_delay = 10, 60      for name in filenames: -        if not name.endswith('.html'): +        if not name.endswith(".html"):              continue          sio = io.StringIO()          for i in range(retries): @@ -201,10 +221,14 @@ def generate_date_file_map(host='ftp.ncbi.nlm.nih.gov'):                      ftp = ftplib.FTP(host)                      ftp.login()                      sio.truncate(0) -                ftp.retrlines('RETR {}'.format(name), sio.write) +                ftp.retrlines("RETR {}".format(name), sio.write)              except (EOFError, ftplib.error_temp, socket.gaierror, BrokenPipeError) as exc: -                print("ftp retr on {} failed with {} ({}) ({} retries left)".format( -                    name, exc, type(exc), retries - (i + 1)), file=sys.stderr) +                print( +                    "ftp retr on {} failed with {} ({}) ({} retries left)".format( +                        name, exc, type(exc), retries - (i + 1) +                    ), +                    file=sys.stderr, +                )                  if i + 1 == retries:                      raise                  else: @@ -214,16 +238,24 @@ def generate_date_file_map(host='ftp.ncbi.nlm.nih.gov'):          contents = sio.getvalue()          match = pattern.search(contents)          if match is None: -            print('pattern miss in {} on: {}, may need to adjust pattern: {}'.format(name, contents, pattern), file=sys.stderr) +            print( +                "pattern miss in {} on: {}, may need to adjust pattern: {}".format( +                    name, contents, pattern +                ), +                file=sys.stderr, +            )              continue -        filename, filedate = match.groups()  # ('pubmed20n1017.xml', 'Tue Dec 17 15:23:32 EST 2019') +        ( +            filename, +            filedate, +        ) = match.groups()  # ('pubmed20n1017.xml', 'Tue Dec 17 15:23:32 EST 2019')          date = dateparser.parse(filedate) -        fullpath = '/pubmed/updatefiles/{}.gz'.format(filename) -        date_str = date.strftime('%Y-%m-%d') +        fullpath = 
"/pubmed/updatefiles/{}.gz".format(filename) +        date_str = date.strftime("%Y-%m-%d")          mapping[date_str].add(fullpath) -        print('added entry for {}: {}'.format(date_str, fullpath), file=sys.stderr) +        print("added entry for {}: {}".format(date_str, fullpath), file=sys.stderr) -    print('generated date-file mapping for {} dates'.format(len(mapping)), file=sys.stderr) +    print("generated date-file mapping for {} dates".format(len(mapping)), file=sys.stderr)      return mapping @@ -241,20 +273,29 @@ def ftpretr(url, max_retries=10, retry_delay=1, proxy_hostport=None):      when we encountered EOFError while talking to the FTP server. Retry delay in seconds.      """      if proxy_hostport is not None: -        return ftpretr_via_http_proxy(url, proxy_hostport, max_retries=max_retries, retry_delay=retry_delay) +        return ftpretr_via_http_proxy( +            url, proxy_hostport, max_retries=max_retries, retry_delay=retry_delay +        )      parsed = urlparse(url)      server, path = parsed.netloc, parsed.path      for i in range(max_retries):          try:              ftp = ftplib.FTP(server)              ftp.login() -            with tempfile.NamedTemporaryFile(prefix='fatcat-ftp-tmp-', delete=False) as f: -                print('retrieving {} from {} to {} ...'.format(path, server, f.name), file=sys.stderr) -                ftp.retrbinary('RETR %s' % path, f.write) +            with tempfile.NamedTemporaryFile(prefix="fatcat-ftp-tmp-", delete=False) as f: +                print( +                    "retrieving {} from {} to {} ...".format(path, server, f.name), +                    file=sys.stderr, +                ) +                ftp.retrbinary("RETR %s" % path, f.write)              ftp.close()          except EOFError as exc: -            print("ftp retrbinary on {} failed with {} ({}) ({} retries left)".format( -                path, exc, type(exc), max_retries - (i + 1)), file=sys.stderr) +            print( +                "ftp retrbinary on {} failed with {} ({}) ({} retries left)".format( +                    path, exc, type(exc), max_retries - (i + 1) +                ), +                file=sys.stderr, +            )              if i + 1 == max_retries:                  raise              else: @@ -263,7 +304,9 @@ def ftpretr(url, max_retries=10, retry_delay=1, proxy_hostport=None):              return f.name -def ftpretr_via_http_proxy(url, proxy_hostport="ftp.ncbi.nlm.nih.gov", max_retries=10, retry_delay=1): +def ftpretr_via_http_proxy( +    url, proxy_hostport="ftp.ncbi.nlm.nih.gov", max_retries=10, retry_delay=1 +):      """      Fetch file from FTP via external HTTP proxy, e.g. 
ftp.host.com:/a/b/c would      be retrievable via proxy.com/a/b/c; (in 09/2021 we used @@ -276,19 +319,23 @@ def ftpretr_via_http_proxy(url, proxy_hostport="ftp.ncbi.nlm.nih.gov", max_retri          try:              url = "http://{}{}".format(proxy_hostport, path)              print("retrieving file via proxy (ftpup) from {}".format(url), file=sys.stderr) -            with tempfile.NamedTemporaryFile(prefix='fatcat-ftp-tmp-', delete=False) as f: +            with tempfile.NamedTemporaryFile(prefix="fatcat-ftp-tmp-", delete=False) as f:                  cmd = ["wget", "-c", url, "-O", f.name]                  result = subprocess.run(cmd)                  return f.name          except (subprocess.CalledProcessError, OSError, ValueError) as exc: -            print("ftp fetch {} failed with {} ({}) ({} retries left)".format( -                url, exc, type(exc), max_retries - (i + 1)), file=sys.stderr) +            print( +                "ftp fetch {} failed with {} ({}) ({} retries left)".format( +                    url, exc, type(exc), max_retries - (i + 1) +                ), +                file=sys.stderr, +            )              if i + 1 == max_retries:                  raise              time.sleep(retry_delay) -def xmlstream(filename, tag, encoding='utf-8'): +def xmlstream(filename, tag, encoding="utf-8"):      """      Note: This might move into a generic place in the future. @@ -300,23 +347,29 @@ def xmlstream(filename, tag, encoding='utf-8'):      Known vulnerabilities: https://docs.python.org/3/library/xml.html#xml-vulnerabilities      """ +      def strip_ns(tag): -        if '}' not in tag: +        if "}" not in tag:              return tag -        return tag.split('}')[1] +        return tag.split("}")[1]      # https://stackoverflow.com/a/13261805, http://effbot.org/elementtree/iterparse.htm -    context = iter(ET.iterparse(filename, events=( -        'start', -        'end', -    ))) +    context = iter( +        ET.iterparse( +            filename, +            events=( +                "start", +                "end", +            ), +        ) +    )      try:          _, root = next(context)      except StopIteration:          return      for event, elem in context: -        if not strip_ns(elem.tag) == tag or event == 'start': +        if not strip_ns(elem.tag) == tag or event == "start":              continue          yield ET.tostring(elem, encoding=encoding) diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py index 2b0ff7ec..ae4f9049 100644 --- a/python/fatcat_tools/importers/arabesque.py +++ b/python/fatcat_tools/importers/arabesque.py @@ -1,9 +1,9 @@ -  import fatcat_openapi_client  from .common import SANE_MAX_RELEASES, SANE_MAX_URLS, EntityImporter, b32_hex, make_rel_url -ARABESQUE_MATCH_WHERE_CLAUSE='WHERE hit = 1 AND identifier IS NOT NULL' +ARABESQUE_MATCH_WHERE_CLAUSE = "WHERE hit = 1 AND identifier IS NOT NULL" +  class ArabesqueMatchImporter(EntityImporter):      """ @@ -38,17 +38,17 @@ class ArabesqueMatchImporter(EntityImporter):      def __init__(self, api, extid_type, require_grobid=True, **kwargs): -        eg_desc = kwargs.get('editgroup_description', None) or "Match web crawl files to releases based on identifier/URL seedlist" -        eg_extra = kwargs.get('editgroup_extra', dict()) -        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ArabesqueMatchImporter') -        if kwargs.get('crawl_id'): -            eg_extra['crawl_id'] = kwargs.get('crawl_id') -        kwargs['do_updates'] 
= kwargs.get("do_updates", False) -        super().__init__(api, -            editgroup_description=eg_desc, -            editgroup_extra=eg_extra, -            **kwargs) -        assert extid_type in ('doi', 'pmcid', 'pmid') +        eg_desc = ( +            kwargs.get("editgroup_description", None) +            or "Match web crawl files to releases based on identifier/URL seedlist" +        ) +        eg_extra = kwargs.get("editgroup_extra", dict()) +        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ArabesqueMatchImporter") +        if kwargs.get("crawl_id"): +            eg_extra["crawl_id"] = kwargs.get("crawl_id") +        kwargs["do_updates"] = kwargs.get("do_updates", False) +        super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) +        assert extid_type in ("doi", "pmcid", "pmid")          self.extid_type = extid_type          self.default_link_rel = kwargs.get("default_link_rel", "web")          assert self.default_link_rel @@ -60,33 +60,35 @@ class ArabesqueMatchImporter(EntityImporter):              print("NOT checking GROBID status column")      def want(self, row): -        if self.require_grobid and not row['postproc_status'] == "200": +        if self.require_grobid and not row["postproc_status"] == "200":              return False -        if (bool(row['hit']) is True -                and row['final_sha1'] -                and row['final_timestamp'] -                and row['final_timestamp'] != "-" -                and len(row['final_timestamp']) == 14 -                and row['final_mimetype'] -                and bool(row['hit']) is True -                and row['identifier']): +        if ( +            bool(row["hit"]) is True +            and row["final_sha1"] +            and row["final_timestamp"] +            and row["final_timestamp"] != "-" +            and len(row["final_timestamp"]) == 14 +            and row["final_mimetype"] +            and bool(row["hit"]) is True +            and row["identifier"] +        ):              return True          else:              return False      def parse_record(self, row): -        extid = row['identifier'].strip() +        extid = row["identifier"].strip()          # check/cleanup DOI -        if self.extid_type == 'doi': +        if self.extid_type == "doi":              extid = extid.lower() -            extid.replace('http://doi.org/', '') -            extid.replace('https://doi.org/', '') -            if extid.startswith('doi:'): +            extid.replace("http://doi.org/", "") +            extid.replace("https://doi.org/", "") +            if extid.startswith("doi:"):                  extid = extid[4:] -            if not extid.startswith('10.'): -                self.counts['skip-extid-invalid'] +            if not extid.startswith("10."): +                self.counts["skip-extid-invalid"]                  return None          # lookup extid @@ -95,35 +97,35 @@ class ArabesqueMatchImporter(EntityImporter):          except fatcat_openapi_client.rest.ApiException as err:              if err.status == 404:                  # bail on 404 (release not in DB) -                self.counts['skip-extid-not-found'] += 1 +                self.counts["skip-extid-not-found"] += 1                  return None              elif err.status == 400: -                self.counts['skip-extid-invalid'] += 1 +                self.counts["skip-extid-invalid"] += 1                  return None              else:                  raise err -        url = 
make_rel_url(row['final_url'], self.default_link_rel) +        url = make_rel_url(row["final_url"], self.default_link_rel)          if not url: -            self.counts['skip-url'] += 1 +            self.counts["skip-url"] += 1              return None -        if not row['final_timestamp']: -            self.counts['skip-missing-timestamp'] += 1 +        if not row["final_timestamp"]: +            self.counts["skip-missing-timestamp"] += 1              return None          wayback = "https://web.archive.org/web/{}/{}".format( -            row['final_timestamp'], -            row['final_url']) +            row["final_timestamp"], row["final_url"] +        )          urls = [url, ("webarchive", wayback)]          urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls]          if len(urls) > SANE_MAX_URLS: -            self.counts['skip-too-many-url'] += 1 +            self.counts["skip-too-many-url"] += 1              return None          fe = fatcat_openapi_client.FileEntity( -            sha1=b32_hex(row['final_sha1']), -            mimetype=row['final_mimetype'] or self.default_mimetype, +            sha1=b32_hex(row["final_sha1"]), +            mimetype=row["final_mimetype"] or self.default_mimetype,              release_ids=[re.ident],              urls=urls,          ) @@ -143,15 +145,15 @@ class ArabesqueMatchImporter(EntityImporter):          if (fe.release_ids[0] in existing.release_ids) and existing.urls:              # TODO: could still, in theory update with the new URL? -            self.counts['exists'] += 1 +            self.counts["exists"] += 1              return False          if not self.do_updates: -            self.counts['skip-update-disabled'] += 1 +            self.counts["skip-update-disabled"] += 1              return False          if existing.ident in [e.ident for e in self._edits_inflight]: -            self.counts['skip-update-inflight'] += 1 +            self.counts["skip-update-inflight"] += 1              return False          # TODO: this code path never gets hit because of the check above @@ -159,28 +161,33 @@ class ArabesqueMatchImporter(EntityImporter):              existing_urls = set([u.url for u in existing.urls])              new_urls = set([u.url for u in fe.urls])              if existing_urls.issuperset(new_urls): -                self.counts['skip-update-nothing-new'] += 1 +                self.counts["skip-update-nothing-new"] += 1                  return False          # merge the existing into this one and update          existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls])) -        existing.urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in existing.urls] +        existing.urls = [ +            fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in existing.urls +        ]          if len(existing.urls) > SANE_MAX_URLS: -            self.counts['skip-update-too-many-url'] += 1 +            self.counts["skip-update-too-many-url"] += 1              return None          existing.release_ids = list(set(fe.release_ids + existing.release_ids))          if len(existing.release_ids) > SANE_MAX_RELEASES: -            self.counts['skip-update-too-many-url'] += 1 +            self.counts["skip-update-too-many-url"] += 1              return None          existing.mimetype = existing.mimetype or fe.mimetype          edit = self.api.update_file(self.get_editgroup_id(), existing.ident, existing)          self._edits_inflight.append(edit) -        self.counts['update'] += 1 + 
       self.counts["update"] += 1          return False      def insert_batch(self, batch): -        self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch( -            editgroup=fatcat_openapi_client.Editgroup( -                description=self.editgroup_description, -                extra=self.editgroup_extra), -            entity_list=batch)) +        self.api.create_file_auto_batch( +            fatcat_openapi_client.FileAutoBatch( +                editgroup=fatcat_openapi_client.Editgroup( +                    description=self.editgroup_description, extra=self.editgroup_extra +                ), +                entity_list=batch, +            ) +        ) diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py index fc429fb0..7a689ed2 100644 --- a/python/fatcat_tools/importers/arxiv.py +++ b/python/fatcat_tools/importers/arxiv.py @@ -1,4 +1,3 @@ -  import datetime  import json  import re @@ -13,6 +12,7 @@ from .crossref import lookup_license_slug  latex2text = LatexNodes2Text() +  def latex_to_text(raw):      try:          return latex2text.latex_to_text(raw).strip() @@ -21,13 +21,14 @@ def latex_to_text(raw):      except IndexError:          return raw.strip() +  def parse_arxiv_authors(raw):      if not raw:          return [] -    raw = raw.replace('*', '') -    if '(' in raw: -        raw = re.sub(r'\(.*\)', '', raw) -    authors = raw.split(', ') +    raw = raw.replace("*", "") +    if "(" in raw: +        raw = re.sub(r"\(.*\)", "", raw) +    authors = raw.split(", ")      if authors:          last = authors[-1].split(" and ")          if len(last) == 2: @@ -39,9 +40,12 @@ def parse_arxiv_authors(raw):      authors = [a for a in authors if a]      return authors +  def test_parse_arxiv_authors(): -    assert parse_arxiv_authors("Raphael Chetrite, Shamik Gupta, Izaak Neri and \\'Edgar Rold\\'an") == [ +    assert parse_arxiv_authors( +        "Raphael Chetrite, Shamik Gupta, Izaak Neri and \\'Edgar Rold\\'an" +    ) == [          "Raphael Chetrite",          "Shamik Gupta",          "Izaak Neri", @@ -63,7 +67,9 @@ def test_parse_arxiv_authors():          "Raphael Chetrite Shamik Gupta",      ] -    assert parse_arxiv_authors("B. P. Lanyon, T. J. Weinhold, N. K. Langford, M. Barbieri, D. F. V.  James*, A. Gilchrist, and A. G. White (University of Queensland, *University of Toronto)") == [ +    assert parse_arxiv_authors( +        "B. P. Lanyon, T. J. Weinhold, N. K. Langford, M. Barbieri, D. F. V.  James*, A. Gilchrist, and A. G. White (University of Queensland, *University of Toronto)" +    ) == [          "B. P. Lanyon",          "T. J. Weinhold",          "N. K. Langford", @@ -84,17 +90,21 @@ class ArxivRawImporter(EntityImporter):      def __init__(self, api, **kwargs): -        eg_desc = kwargs.get('editgroup_description', -            "Automated import of arxiv metadata via arXivRaw OAI-PMH feed") -        eg_extra = kwargs.get('editgroup_extra', dict()) -        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ArxivRawImporter') +        eg_desc = kwargs.get( +            "editgroup_description", +            "Automated import of arxiv metadata via arXivRaw OAI-PMH feed", +        ) +        eg_extra = kwargs.get("editgroup_extra", dict()) +        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ArxivRawImporter")          # lower batch size, because multiple versions per entry (guessing 2-3 on average?) 
-        batch_size = kwargs.get('edit_batch_size', 50) -        super().__init__(api, +        batch_size = kwargs.get("edit_batch_size", 50) +        super().__init__( +            api,              editgroup_description=eg_desc,              editgroup_extra=eg_extra,              batch_size=batch_size, -            **kwargs) +            **kwargs +        )          self._test_override = False      def parse_record(self, record): @@ -114,53 +124,56 @@ class ArxivRawImporter(EntityImporter):          doi = None          if metadata.doi and metadata.doi.string:              doi = metadata.doi.string.lower().split()[0].strip() -            if not (doi.startswith('10.') and '/' in doi and doi.split('/')[1]): +            if not (doi.startswith("10.") and "/" in doi and doi.split("/")[1]):                  sys.stderr.write("BOGUS DOI: {}\n".format(doi))                  doi = None -        title = latex_to_text(metadata.title.get_text().replace('\n', ' ')) -        authors = parse_arxiv_authors(metadata.authors.get_text().replace('\n', ' ')) -        contribs = [fatcat_openapi_client.ReleaseContrib(index=i, raw_name=a, role='author') for i, a in enumerate(authors)] - -        lang = "en"     # the vast majority in english +        title = latex_to_text(metadata.title.get_text().replace("\n", " ")) +        authors = parse_arxiv_authors(metadata.authors.get_text().replace("\n", " ")) +        contribs = [ +            fatcat_openapi_client.ReleaseContrib(index=i, raw_name=a, role="author") +            for i, a in enumerate(authors) +        ] + +        lang = "en"  # the vast majority in english          if metadata.comments and metadata.comments.get_text(): -            comments = metadata.comments.get_text().replace('\n', ' ').strip() -            extra_arxiv['comments'] = comments -            if 'in french' in comments.lower(): -                lang = 'fr' -            elif 'in spanish' in comments.lower(): -                lang = 'es' -            elif 'in portuguese' in comments.lower(): -                lang = 'pt' -            elif 'in hindi' in comments.lower(): -                lang = 'hi' -            elif 'in japanese' in comments.lower(): -                lang = 'ja' -            elif 'in german' in comments.lower(): -                lang = 'de' -            elif 'simplified chinese' in comments.lower(): -                lang = 'zh' -            elif 'in russian' in comments.lower(): -                lang = 'ru' +            comments = metadata.comments.get_text().replace("\n", " ").strip() +            extra_arxiv["comments"] = comments +            if "in french" in comments.lower(): +                lang = "fr" +            elif "in spanish" in comments.lower(): +                lang = "es" +            elif "in portuguese" in comments.lower(): +                lang = "pt" +            elif "in hindi" in comments.lower(): +                lang = "hi" +            elif "in japanese" in comments.lower(): +                lang = "ja" +            elif "in german" in comments.lower(): +                lang = "de" +            elif "simplified chinese" in comments.lower(): +                lang = "zh" +            elif "in russian" in comments.lower(): +                lang = "ru"              # more languages?          
number = None -        if metadata.find('journal-ref') and metadata.find('journal-ref').get_text(): -            journal_ref = metadata.find('journal-ref').get_text().replace('\n', ' ').strip() -            extra_arxiv['journal_ref'] = journal_ref +        if metadata.find("journal-ref") and metadata.find("journal-ref").get_text(): +            journal_ref = metadata.find("journal-ref").get_text().replace("\n", " ").strip() +            extra_arxiv["journal_ref"] = journal_ref              if "conf." in journal_ref.lower() or "proc." in journal_ref.lower():                  release_type = "paper-conference" -        if metadata.find('report-no') and metadata.find('report-no').string: -            number = metadata.find('report-no').string.strip() +        if metadata.find("report-no") and metadata.find("report-no").string: +            number = metadata.find("report-no").string.strip()              # at least some people plop extra metadata in here. hrmf! -            if 'ISSN ' in number or 'ISBN ' in number or len(number.split()) > 2: -                extra_arxiv['report-no'] = number +            if "ISSN " in number or "ISBN " in number or len(number.split()) > 2: +                extra_arxiv["report-no"] = number                  number = None              else:                  release_type = "report" -        if metadata.find('acm-class') and metadata.find('acm-class').string: -            extra_arxiv['acm_class'] = metadata.find('acm-class').string.strip() +        if metadata.find("acm-class") and metadata.find("acm-class").string: +            extra_arxiv["acm_class"] = metadata.find("acm-class").string.strip()          if metadata.categories and metadata.categories.get_text(): -            extra_arxiv['categories'] = metadata.categories.get_text().split() +            extra_arxiv["categories"] = metadata.categories.get_text().split()          license_slug = None          if metadata.license and metadata.license.get_text():              license_slug = lookup_license_slug(metadata.license.get_text()) @@ -170,21 +183,29 @@ class ArxivRawImporter(EntityImporter):              abstracts = []              abst = metadata.abstract.get_text().strip()              orig = None -            if '-----' in abst: -                both = abst.split('-----') +            if "-----" in abst: +                both = abst.split("-----")                  abst = both[0].strip()                  orig = both[1].strip() -            if '$' in abst or '{' in abst: +            if "$" in abst or "{" in abst:                  mime = "application/x-latex"                  abst_plain = latex_to_text(abst) -                abstracts.append(fatcat_openapi_client.ReleaseAbstract(content=abst_plain, mimetype="text/plain", lang="en")) +                abstracts.append( +                    fatcat_openapi_client.ReleaseAbstract( +                        content=abst_plain, mimetype="text/plain", lang="en" +                    ) +                )              else:                  mime = "text/plain" -            abstracts.append(fatcat_openapi_client.ReleaseAbstract(content=abst, mimetype=mime, lang="en")) +            abstracts.append( +                fatcat_openapi_client.ReleaseAbstract(content=abst, mimetype=mime, lang="en") +            )              if orig: -                abstracts.append(fatcat_openapi_client.ReleaseAbstract(content=orig, mimetype=mime)) +                abstracts.append( +                    fatcat_openapi_client.ReleaseAbstract(content=orig, mimetype=mime) +                )       
           # indicates that fulltext probably isn't english either -                if lang == 'en': +                if lang == "en":                      lang = None          # extra: @@ -195,39 +216,43 @@ class ArxivRawImporter(EntityImporter):          #   container_name          #   group-title          #   arxiv: comments, categories, etc -        extra_arxiv['base_id'] = base_id -        extra['superceded'] = True -        extra['arxiv'] = extra_arxiv +        extra_arxiv["base_id"] = base_id +        extra["superceded"] = True +        extra["arxiv"] = extra_arxiv          versions = [] -        for version in metadata.find_all('version'): -            arxiv_id = base_id + version['version'] +        for version in metadata.find_all("version"): +            arxiv_id = base_id + version["version"]              release_date = version.date.string.strip() -            release_date = datetime.datetime.strptime(release_date, "%a, %d %b %Y %H:%M:%S %Z").date() +            release_date = datetime.datetime.strptime( +                release_date, "%a, %d %b %Y %H:%M:%S %Z" +            ).date()              # TODO: source_type? -            versions.append(fatcat_openapi_client.ReleaseEntity( -                work_id=None, -                title=title, -                #original_title -                version=version['version'], -                release_type=release_type, -                release_stage='submitted', -                release_date=release_date.isoformat(), -                release_year=release_date.year, -                ext_ids=fatcat_openapi_client.ReleaseExtIds( -                    arxiv=arxiv_id, -                ), -                number=number, -                language=lang, -                license_slug=license_slug, -                abstracts=abstracts, -                contribs=contribs, -                extra=extra.copy(), -            )) +            versions.append( +                fatcat_openapi_client.ReleaseEntity( +                    work_id=None, +                    title=title, +                    # original_title +                    version=version["version"], +                    release_type=release_type, +                    release_stage="submitted", +                    release_date=release_date.isoformat(), +                    release_year=release_date.year, +                    ext_ids=fatcat_openapi_client.ReleaseExtIds( +                        arxiv=arxiv_id, +                    ), +                    number=number, +                    language=lang, +                    license_slug=license_slug, +                    abstracts=abstracts, +                    contribs=contribs, +                    extra=extra.copy(), +                ) +            )          # TODO: assert that versions are actually in order?          
assert versions -        versions[-1].extra.pop('superceded') +        versions[-1].extra.pop("superceded")          # only apply DOI to most recent version (HACK)          if doi: @@ -306,7 +331,7 @@ class ArxivRawImporter(EntityImporter):          for v in versions:              if v._existing_work_id:                  if not v._updated: -                    self.counts['exists'] += 1 +                    self.counts["exists"] += 1                  continue              if not any_work_id and last_edit:                  # fetch the last inserted release from this group @@ -315,7 +340,7 @@ class ArxivRawImporter(EntityImporter):                  any_work_id = r.work_id              v.work_id = any_work_id              last_edit = self.api.create_release(self.get_editgroup_id(), v) -            self.counts['insert'] += 1 +            self.counts["insert"] += 1          return False @@ -323,12 +348,15 @@ class ArxivRawImporter(EntityImporter):          # there is no batch/bezerk mode for arxiv importer, except for testing          if self._test_override:              for batch in batch_batch: -                self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( -                    editgroup=fatcat_openapi_client.Editgroup( -                        description=self.editgroup_description, -                        extra=self.editgroup_extra), -                    entity_list=batch)) -                self.counts['insert'] += len(batch) - 1 +                self.api.create_release_auto_batch( +                    fatcat_openapi_client.ReleaseAutoBatch( +                        editgroup=fatcat_openapi_client.Editgroup( +                            description=self.editgroup_description, extra=self.editgroup_extra +                        ), +                        entity_list=batch, +                    ) +                ) +                self.counts["insert"] += len(batch) - 1          else:              raise NotImplementedError() @@ -341,9 +369,9 @@ class ArxivRawImporter(EntityImporter):          for article in soup.find_all("record"):              resp = self.parse_record(article)              print(json.dumps(resp)) -            #sys.exit(-1) +            # sys.exit(-1) -if __name__ == '__main__': +if __name__ == "__main__":      parser = ArxivRawImporter(None)      parser.parse_file(open(sys.argv[1])) diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py index 0340f6a3..e9de42fc 100755 --- a/python/fatcat_tools/importers/cdl_dash_dat.py +++ b/python/fatcat_tools/importers/cdl_dash_dat.py @@ -34,15 +34,15 @@ def single_file(prefix, path):          hashlib.sha1(),          hashlib.sha256(),      ] -    with open(full, 'rb') as fp: +    with open(full, "rb") as fp:          while True: -            data = fp.read(2**20) +            data = fp.read(2 ** 20)              if not data:                  break              for h in hashes:                  h.update(data)      mime = magic.Magic(mime=True).from_file(full) -    if mime == 'application/octet-stream': +    if mime == "application/octet-stream":          # magic apparently isn't that great; try using filename as well          guess = mimetypes.guess_type(full)[0]          if guess: @@ -54,9 +54,11 @@ def single_file(prefix, path):          md5=hashes[0].hexdigest(),          sha1=hashes[1].hexdigest(),          sha256=hashes[2].hexdigest(), -        extra=dict(mimetype=mime)) +        extra=dict(mimetype=mime), +    )      return fsf +  def 
make_manifest(base_dir):      manifest = []      for root, dirs, files in os.walk(base_dir): @@ -70,47 +72,49 @@ def cdl_dash_release(meta, extra=None):      if not extra:          extra = dict() -    assert meta['identifier']['type'] == 'DOI' -    doi = meta['identifier']['value'].lower() -    assert doi.startswith('10.') +    assert meta["identifier"]["type"] == "DOI" +    doi = meta["identifier"]["value"].lower() +    assert doi.startswith("10.")      ark_id = None -    for extid in meta.get('alternativeIdentifiers', []): -        if extid['value'].startswith('ark:'): -            ark_id = extid['value'] +    for extid in meta.get("alternativeIdentifiers", []): +        if extid["value"].startswith("ark:"): +            ark_id = extid["value"]      assert ark_id -    license_slug = lookup_license_slug(meta['rights']['uri']) +    license_slug = lookup_license_slug(meta["rights"]["uri"])      abstracts = [] -    for desc in meta['descriptions']: -        if desc['type'] == "abstract": -            abstracts.append(ReleaseAbstract( -                mimetype="text/html", -                content=clean(desc['value']))) -            #print(abstracts) +    for desc in meta["descriptions"]: +        if desc["type"] == "abstract": +            abstracts.append( +                ReleaseAbstract(mimetype="text/html", content=clean(desc["value"])) +            ) +            # print(abstracts)      if not abstracts:          abstracts = None      contribs = [] -    for creator in meta['creator']: -        contribs.append(ReleaseContrib( -            given_name=creator['given'], -            surname=creator['family'], -            # sorry everybody -            raw_name="{} {}".format(creator['given'], creator['family']), -            raw_affiliation=creator.get('affiliation'), -            role="author", # presumably, for these datasets? -        )) +    for creator in meta["creator"]: +        contribs.append( +            ReleaseContrib( +                given_name=creator["given"], +                surname=creator["family"], +                # sorry everybody +                raw_name="{} {}".format(creator["given"], creator["family"]), +                raw_affiliation=creator.get("affiliation"), +                role="author",  # presumably, for these datasets? 
+            ) +        )      r = ReleaseEntity(          ext_ids=ReleaseExtIds(              doi=doi,              ark=ark_id,          ), -        title=clean(meta['title'], force_xml=True), -        publisher=clean(meta['publisher']), -        release_year=int(meta['publicationYear']), +        title=clean(meta["title"], force_xml=True), +        publisher=clean(meta["publisher"]), +        release_year=int(meta["publicationYear"]),          release_type="dataset",          license_slug=license_slug,          contribs=contribs, @@ -119,66 +123,66 @@ def cdl_dash_release(meta, extra=None):      )      return r +  def make_release_fileset(dat_path): -    if dat_path.endswith('/'): +    if dat_path.endswith("/"):          dat_path = dat_path[:-1]      dat_discovery = dat_path      extra = dict()      assert len(dat_discovery) == 64 -    with open(dat_path + "/cdl_dash_metadata.json", 'r') as fp: +    with open(dat_path + "/cdl_dash_metadata.json", "r") as fp:          meta_dict = json.loads(fp.read())      release = cdl_dash_release(meta_dict) -    ark_id = release.extra['ark_id'] +    ark_id = release.extra["ark_id"]      dash_version = None      # really crude XML parse-out -    with open(dat_path + "/stash-wrapper.xml", 'r') as fp: +    with open(dat_path + "/stash-wrapper.xml", "r") as fp:          for line in fp:              line = line.strip()              if line.startswith("<st:version_number>"): -                dash_version = int(line[19:].split('<')[0]) +                dash_version = int(line[19:].split("<")[0])      assert dash_version is not None -    extra['cdl_dash'] = dict(version=dash_version) -    release.extra['cdl_dash'] = dict(version=dash_version) +    extra["cdl_dash"] = dict(version=dash_version) +    release.extra["cdl_dash"] = dict(version=dash_version)      manifest = make_manifest(dat_path + "/files/")      bundle_url = dict(          url="https://merritt.cdlib.org/u/{}/{}".format( -            urllib.parse.quote(ark_id, safe=''), -            dash_version), -        rel="repo-bundle") +            urllib.parse.quote(ark_id, safe=""), dash_version +        ), +        rel="repo-bundle", +    )      repo_url = dict(          url="https://merritt.cdlib.org/d/{}/{}/".format( -            urllib.parse.quote(ark_id, safe=''), -            dash_version), -        rel="repo") -    dat_url = dict( -        url="dat://{}/files/".format(dat_discovery), -        rel="dweb") +            urllib.parse.quote(ark_id, safe=""), dash_version +        ), +        rel="repo", +    ) +    dat_url = dict(url="dat://{}/files/".format(dat_discovery), rel="dweb")      fs = FilesetEntity( -        urls=[bundle_url, repo_url, dat_url], -        release_ids=None, -        manifest=manifest, -        extra=extra) +        urls=[bundle_url, repo_url, dat_url], release_ids=None, manifest=manifest, extra=extra +    )      return (release, fs) +  def auto_cdl_dash_dat(api, dat_path, release_id=None, editgroup_id=None): -    git_rev = subprocess.check_output( -        ["git", "describe", "--always"]).strip().decode('utf-8') +    git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")      (release, fileset) = make_release_fileset(dat_path)      if not editgroup_id: -        eg = api.create_editgroup(Editgroup( -            description="One-off import of dataset(s) from CDL/DASH repository (via IA, Dat dweb pilot project)", -            extra=dict( -                git_rev=git_rev, -                agent="fatcat_tools.auto_cdl_dash_dat"))) +        eg = 
api.create_editgroup( +            Editgroup( +                description="One-off import of dataset(s) from CDL/DASH repository (via IA, Dat dweb pilot project)", +                extra=dict(git_rev=git_rev, agent="fatcat_tools.auto_cdl_dash_dat"), +            ) +        )          editgroup_id = eg.editgroup_id      if not release_id and release.ext_ids.doi: @@ -201,6 +205,7 @@ def auto_cdl_dash_dat(api, dat_path, release_id=None, editgroup_id=None):      fileset = api.get_fileset(edit.ident)      return (editgroup_id, release, fileset) -if __name__=='__main__': + +if __name__ == "__main__":      # pass this a discovery key that has been cloned to the local directory      print(make_release_fileset(sys.argv[1])) diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py index 0b634e73..8d2a89b6 100644 --- a/python/fatcat_tools/importers/chocula.py +++ b/python/fatcat_tools/importers/chocula.py @@ -1,4 +1,3 @@ -  import fatcat_openapi_client  from .common import EntityImporter, clean @@ -15,20 +14,19 @@ class ChoculaImporter(EntityImporter):      def __init__(self, api, **kwargs): -        eg_desc = kwargs.get('editgroup_description', -            "Automated import of container-level metadata from Chocula tool.") -        eg_extra = kwargs.get('editgroup_extra', dict()) -        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ChoculaImporter') -        super().__init__(api, -            editgroup_description=eg_desc, -            editgroup_extra=eg_extra, -            **kwargs) +        eg_desc = kwargs.get( +            "editgroup_description", +            "Automated import of container-level metadata from Chocula tool.", +        ) +        eg_extra = kwargs.get("editgroup_extra", dict()) +        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ChoculaImporter") +        super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)      def want(self, raw_record): -        if not raw_record.get('ident') and not raw_record.get('_known_issnl'): -            self.counts['skip-unknown-new-issnl'] += 1 +        if not raw_record.get("ident") and not raw_record.get("_known_issnl"): +            self.counts["skip-unknown-new-issnl"] += 1              return False -        if raw_record.get('issnl') and raw_record.get('name'): +        if raw_record.get("issnl") and raw_record.get("name"):              return True          return False @@ -39,42 +37,55 @@ class ChoculaImporter(EntityImporter):          returns a ContainerEntity (or None if invalid or couldn't parse)          """ -        name = clean(row.get('name')) +        name = clean(row.get("name"))          if not name:              # Name is required (by schema)              return None          name = name.strip() -        if name.endswith(',  Proceedings of the'): -            name = "Proceedings of the " + name.split(',')[0] +        if name.endswith(",  Proceedings of the"): +            name = "Proceedings of the " + name.split(",")[0] -        if name.endswith('.'): +        if name.endswith("."):              name = name[:-1]          extra = dict() -        for k in ('urls', 'webarchive_urls', 'country', -                  'sherpa_romeo', 'ezb', 'szczepanski', 'doaj', 'languages', -                  'ia', 'scielo', 'kbart', 'publisher_type', 'platform'): -            if row['extra'].get(k): -                extra[k] = row['extra'][k] +        for k in ( +            "urls", +            "webarchive_urls", +            "country", +            
"sherpa_romeo", +            "ezb", +            "szczepanski", +            "doaj", +            "languages", +            "ia", +            "scielo", +            "kbart", +            "publisher_type", +            "platform", +        ): +            if row["extra"].get(k): +                extra[k] = row["extra"][k]          container_type = None -        if 'proceedings' in name.lower(): -            container_type = 'proceedings' -        elif 'journal ' in name.lower(): -            container_type = 'journal' +        if "proceedings" in name.lower(): +            container_type = "proceedings" +        elif "journal " in name.lower(): +            container_type = "journal"          ce = fatcat_openapi_client.ContainerEntity( -            issnl=row['issnl'], -            issnp=row['extra'].get('issnp'), -            issne=row['extra'].get('issne'), -            ident=row['ident'], +            issnl=row["issnl"], +            issnp=row["extra"].get("issnp"), +            issne=row["extra"].get("issne"), +            ident=row["ident"],              name=name,              container_type=container_type, -            publisher=clean(row.get('publisher')), -            wikidata_qid=row.get('wikidata_qid'), -            extra=extra) +            publisher=clean(row.get("publisher")), +            wikidata_qid=row.get("wikidata_qid"), +            extra=extra, +        )          return ce      def try_update(self, ce): @@ -86,12 +97,12 @@ class ChoculaImporter(EntityImporter):              except fatcat_openapi_client.rest.ApiException as err:                  if err.status != 404:                      raise err -                self.counts['exists'] += 1 -                self.counts['exists-not-found'] += 1 +                self.counts["exists"] += 1 +                self.counts["exists-not-found"] += 1                  return False -            if existing.state != 'active': -                self.counts['exists'] += 1 -                self.counts['exists-inactive'] += 1 +            if existing.state != "active": +                self.counts["exists"] += 1 +                self.counts["exists-inactive"] += 1                  return False          if not existing: @@ -102,8 +113,8 @@ class ChoculaImporter(EntityImporter):                  if err.status != 404:                      raise err              if existing: -                self.counts['exists'] += 1 -                self.counts['exists-by-issnl'] += 1 +                self.counts["exists"] += 1 +                self.counts["exists-by-issnl"] += 1                  return False              # doesn't exist, always create              return True @@ -111,18 +122,22 @@ class ChoculaImporter(EntityImporter):          # decide whether to update          do_update = False          if not self.do_updates: -            self.counts['exists'] += 1 +            self.counts["exists"] += 1              return False          if not existing.extra:              existing.extra = dict() -        if ce.extra.get('urls') and set(ce.extra.get('urls', [])) != set(existing.extra.get('urls', [])): +        if ce.extra.get("urls") and set(ce.extra.get("urls", [])) != set( +            existing.extra.get("urls", []) +        ):              do_update = True -        if ce.extra.get('webarchive_urls') and set(ce.extra.get('webarchive_urls', [])) != set(existing.extra.get('webarchive_urls', [])): +        if ce.extra.get("webarchive_urls") and set(ce.extra.get("webarchive_urls", [])) != set( +            existing.extra.get("webarchive_urls", []) +    
    ):              do_update = True -        for k in ('ezb', 'szczepanski', 'publisher_type', 'platform'): +        for k in ("ezb", "szczepanski", "publisher_type", "platform"):              if ce.extra.get(k) and not existing.extra.get(k):                  do_update = True -        for k in ('kbart', 'ia', 'doaj'): +        for k in ("kbart", "ia", "doaj"):              # always update these fields if not equal (chocula override)              if ce.extra.get(k) and ce.extra[k] != existing.extra.get(k):                  do_update = True @@ -137,41 +152,53 @@ class ChoculaImporter(EntityImporter):              existing.container_type = existing.container_type or ce.container_type              existing.issne = existing.issne or ce.issne              existing.issnp = existing.issnp or ce.issnp -            for k in ('urls', 'webarchive_urls'): +            for k in ("urls", "webarchive_urls"):                  # be conservative about URL updates; don't clobber existing URL lists                  # may want to make this behavior more sophisticated in the                  # future, or at least a config flag                  if ce.extra.get(k) and not existing.extra.get(k):                      existing.extra[k] = ce.extra.get(k, []) -            for k in ('sherpa_romeo', 'ezb', 'szczepanski', 'doaj', 'ia', -                      'scielo', 'kbart', 'publisher_type', 'platform'): +            for k in ( +                "sherpa_romeo", +                "ezb", +                "szczepanski", +                "doaj", +                "ia", +                "scielo", +                "kbart", +                "publisher_type", +                "platform", +            ):                  # always update (chocula over-rides)                  if ce.extra.get(k):                      existing.extra[k] = ce.extra[k] -            for k in ('country',): +            for k in ("country",):                  # only include if not set (don't clobber human edits)                  if ce.extra.get(k) and not existing.extra.get(k):                      existing.extra[k] = ce.extra[k] -            if ce.extra.get('languages'): -                if not existing.extra.get('languages'): -                    existing.extra['languages'] = ce.extra['languages'] -                elif not ce.extra['languages'][0] in existing.extra['languages']: -                    existing.extra['languages'].append(ce.extra['languages'][0]) +            if ce.extra.get("languages"): +                if not existing.extra.get("languages"): +                    existing.extra["languages"] = ce.extra["languages"] +                elif not ce.extra["languages"][0] in existing.extra["languages"]: +                    existing.extra["languages"].append(ce.extra["languages"][0])              self.api.update_container(self.get_editgroup_id(), existing.ident, existing) -            self.counts['update'] += 1 +            self.counts["update"] += 1              return False          else: -            self.counts['exists'] += 1 -            self.counts['exists-skip-update'] += 1 +            self.counts["exists"] += 1 +            self.counts["exists-skip-update"] += 1              return False          # if we got this far, it's a bug          raise NotImplementedError      def insert_batch(self, batch): -        self.api.create_container_auto_batch(fatcat_openapi_client.ContainerAutoBatch( -            editgroup=fatcat_openapi_client.Editgroup( -                description=self.editgroup_description, -                
extra=self.editgroup_extra), -            entity_list=batch)) +        self.api.create_container_auto_batch( +            fatcat_openapi_client.ContainerAutoBatch( +                editgroup=fatcat_openapi_client.Editgroup( +                    description=self.editgroup_description, extra=self.editgroup_extra +                ), +                entity_list=batch, +            ) +        ) diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index e33a2012..2639c85a 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -1,4 +1,3 @@ -  import csv  import datetime  import json @@ -34,7 +33,6 @@ SANE_MAX_URLS: int = 100  DOMAIN_REL_MAP: Dict[str, str] = {      "archive.org": "archive",      # LOCKSS, Portico, DuraSpace, etc would also be "archive" -      "arxiv.org": "repository",      "babel.hathitrust.org": "repository",      "cds.cern.ch": "repository", @@ -53,7 +51,6 @@ DOMAIN_REL_MAP: Dict[str, str] = {      "zenodo.org": "repository",      "www.biorxiv.org": "repository",      "www.medrxiv.org": "repository", -      "citeseerx.ist.psu.edu": "aggregator",      "publisher-connector.core.ac.uk": "aggregator",      "core.ac.uk": "aggregator", @@ -62,7 +59,6 @@ DOMAIN_REL_MAP: Dict[str, str] = {      "pdfs.semanticscholar.org": "aggregator",      "semanticscholar.org": "aggregator",      "www.semanticscholar.org": "aggregator", -      "academic.oup.com": "publisher",      "cdn.elifesciences.org": "publisher",      "cell.com": "publisher", @@ -86,15 +82,14 @@ DOMAIN_REL_MAP: Dict[str, str] = {      "ehp.niehs.nih.gov": "publisher",      "journals.tsu.ru": "publisher",      "www.cogentoa.com": "publisher", -      "www.researchgate.net": "academicsocial",      "academia.edu": "academicsocial", -      "wayback.archive-it.org": "webarchive",      "web.archive.org": "webarchive",      "archive.is": "webarchive",  } +  def make_rel_url(raw_url: str, default_link_rel: str = "web"):      # this is where we map specific domains to rel types, and also filter out      # bad domains, invalid URLs, etc @@ -105,12 +100,17 @@ def make_rel_url(raw_url: str, default_link_rel: str = "web"):              break      return (rel, raw_url) +  def test_make_rel_url():      assert make_rel_url("http://example.com/thing.pdf")[0] == "web"      assert make_rel_url("http://example.com/thing.pdf", default_link_rel="jeans")[0] == "jeans" -    assert make_rel_url("https://web.archive.org/web/*/http://example.com/thing.pdf")[0] == "webarchive" +    assert ( +        make_rel_url("https://web.archive.org/web/*/http://example.com/thing.pdf")[0] +        == "webarchive" +    )      assert make_rel_url("http://cell.com/thing.pdf")[0] == "publisher" +  class EntityImporter:      """      Base class for fatcat entity importers. 
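
For context, the importers touched in this commit all subclass EntityImporter and implement the same four hooks. Below is a minimal, hypothetical sketch of that pattern: the class name and record fields are invented for illustration, while the hook names, the counts semantics, and the auto-batch call mirror code shown elsewhere in this diff.

    import fatcat_openapi_client
    from fatcat_tools.importers.common import EntityImporter


    class MinimalFileImporter(EntityImporter):
        """Hypothetical sketch of the EntityImporter hook pattern."""

        def want(self, raw_record):
            # cheap pre-filter; records rejected here are counted as 'skip'
            return bool(raw_record.get("sha1"))

        def parse_record(self, row):
            # build the entity to insert (or return None to skip the record)
            return fatcat_openapi_client.FileEntity(
                sha1=row["sha1"],
                urls=[fatcat_openapi_client.FileUrl(rel="web", url=row["url"])],
            )

        def try_update(self, fe):
            # returning True queues the entity for insert_batch();
            # returning False means it was already handled (exists/updated/skipped)
            return True

        def insert_batch(self, batch):
            self.api.create_file_auto_batch(
                fatcat_openapi_client.FileAutoBatch(
                    editgroup=fatcat_openapi_client.Editgroup(
                        description=self.editgroup_description, extra=self.editgroup_extra
                    ),
                    entity_list=batch,
                )
            )
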
@@ -147,23 +147,26 @@ class EntityImporter:      def __init__(self, api, **kwargs): -        eg_extra = kwargs.get('editgroup_extra', dict()) -        eg_extra['git_rev'] = eg_extra.get('git_rev', -            subprocess.check_output(["git", "describe", "--always"]).strip()).decode('utf-8') -        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.EntityImporter') +        eg_extra = kwargs.get("editgroup_extra", dict()) +        eg_extra["git_rev"] = eg_extra.get( +            "git_rev", subprocess.check_output(["git", "describe", "--always"]).strip() +        ).decode("utf-8") +        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.EntityImporter")          self.api = api -        self.do_updates = bool(kwargs.get('do_updates', True)) -        self.do_fuzzy_match: bool = kwargs.get('do_fuzzy_match', True) -        self.bezerk_mode: bool = kwargs.get('bezerk_mode', False) -        self.submit_mode: bool = kwargs.get('submit_mode', False) -        self.edit_batch_size: int = kwargs.get('edit_batch_size', 100) -        self.editgroup_description: Optional[str] = kwargs.get('editgroup_description') +        self.do_updates = bool(kwargs.get("do_updates", True)) +        self.do_fuzzy_match: bool = kwargs.get("do_fuzzy_match", True) +        self.bezerk_mode: bool = kwargs.get("bezerk_mode", False) +        self.submit_mode: bool = kwargs.get("submit_mode", False) +        self.edit_batch_size: int = kwargs.get("edit_batch_size", 100) +        self.editgroup_description: Optional[str] = kwargs.get("editgroup_description")          self.editgroup_extra: Optional[Any] = eg_extra -        self.es_client = kwargs.get('es_client') +        self.es_client = kwargs.get("es_client")          if not self.es_client: -            self.es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki", timeout=120) +            self.es_client = elasticsearch.Elasticsearch( +                "https://search.fatcat.wiki", timeout=120 +            )          self._issnl_id_map: Dict[str, Any] = dict()          self._orcid_id_map: Dict[str, Any] = dict() @@ -174,7 +177,7 @@ class EntityImporter:          self.reset()      def reset(self) -> None: -        self.counts = Counter({'total': 0, 'skip': 0, 'insert': 0, 'update': 0, 'exists': 0}) +        self.counts = Counter({"total": 0, "skip": 0, "insert": 0, "update": 0, "exists": 0})          self._edit_count: int = 0          self._editgroup_id: Optional[str] = None          self._entity_queue: List[Any] = [] @@ -184,13 +187,13 @@ class EntityImporter:          """          Returns nothing.          
""" -        self.counts['total'] += 1 +        self.counts["total"] += 1          if (not raw_record) or (not self.want(raw_record)): -            self.counts['skip'] += 1 +            self.counts["skip"] += 1              return          entity = self.parse_record(raw_record)          if not entity: -            self.counts['skip'] += 1 +            self.counts["skip"] += 1              return          if self.bezerk_mode:              self.push_entity(entity) @@ -230,7 +233,7 @@ class EntityImporter:          if self._entity_queue:              self.insert_batch(self._entity_queue) -            self.counts['insert'] += len(self._entity_queue) +            self.counts["insert"] += len(self._entity_queue)              self._entity_queue = []          return self.counts @@ -248,8 +251,9 @@ class EntityImporter:          if not self._editgroup_id:              eg = self.api.create_editgroup(                  fatcat_openapi_client.Editgroup( -                    description=self.editgroup_description, -                    extra=self.editgroup_extra)) +                    description=self.editgroup_description, extra=self.editgroup_extra +                ) +            )              self._editgroup_id = eg.editgroup_id          self._edit_count += edits @@ -257,30 +261,30 @@ class EntityImporter:      def create_container(self, entity):          eg_id = self.get_editgroup_id() -        self.counts['inserted.container'] += 1 +        self.counts["inserted.container"] += 1          return self.api.create_container(eg_id, entity)      def create_release(self, entity):          eg_id = self.get_editgroup_id() -        self.counts['inserted.release'] += 1 +        self.counts["inserted.release"] += 1          return self.api.create_release(eg_id, entity)      def create_file(self, entity):          eg_id = self.get_editgroup_id() -        self.counts['inserted.file'] += 1 +        self.counts["inserted.file"] += 1          return self.api.create_file(eg_id, entity)      def updated(self):          """          Implementations should call this from try_update() if the update was successful          """ -        self.counts['update'] += 1 +        self.counts["update"] += 1      def push_entity(self, entity):          self._entity_queue.append(entity)          if len(self._entity_queue) >= self.edit_batch_size:              self.insert_batch(self._entity_queue) -            self.counts['insert'] += len(self._entity_queue) +            self.counts["insert"] += len(self._entity_queue)              self._entity_queue = []      def want(self, raw_record: Any) -> bool: @@ -324,7 +328,7 @@ class EntityImporter:              # If anything other than a 404 (not found), something is wrong              if ae.status != 404:                  raise ae -        self._orcid_id_map[orcid] = creator_id # might be None +        self._orcid_id_map[orcid] = creator_id  # might be None          return creator_id      def is_doi(self, doi: str) -> bool: @@ -347,7 +351,7 @@ class EntityImporter:              # If anything other than a 404 (not found), something is wrong              if ae.status != 404:                  raise ae -        self._doi_id_map[doi] = release_id # might be None +        self._doi_id_map[doi] = release_id  # might be None          return release_id      def lookup_pmid(self, pmid: str): @@ -364,11 +368,11 @@ class EntityImporter:              # If anything other than a 404 (not found), something is wrong              if ae.status != 404:                  raise ae -        self._pmid_id_map[pmid] 
= release_id # might be None +        self._pmid_id_map[pmid] = release_id  # might be None          return release_id      def is_issnl(self, issnl: str) -> bool: -        return len(issnl) == 9 and issnl[4] == '-' +        return len(issnl) == 9 and issnl[4] == "-"      def lookup_issnl(self, issnl: str):          """Caches calls to the ISSN-L lookup API endpoint in a local dict""" @@ -382,7 +386,7 @@ class EntityImporter:              # If anything other than a 404 (not found), something is wrong              if ae.status != 404:                  raise ae -        self._issnl_id_map[issnl] = container_id # might be None +        self._issnl_id_map[issnl] = container_id  # might be None          return container_id      def read_issn_map_file(self, issn_map_file): @@ -417,26 +421,26 @@ class EntityImporter:          # update old/deprecated 'rel' on URLs          for i in range(len(existing.urls)):              u = existing.urls[i] -            if u.rel == 'repository' and '://archive.org/download/' in u.url: -                existing.urls[i].rel = 'archive' -            if u.rel == 'social': -                u.rel = 'academicsocial' +            if u.rel == "repository" and "://archive.org/download/" in u.url: +                existing.urls[i].rel = "archive" +            if u.rel == "social": +                u.rel = "academicsocial"          # remove URLs which are near-duplicates          redundant_urls = []          all_urls = [u.url for u in existing.urls] -        all_wayback_urls = [u.url for u in existing.urls if '://web.archive.org/web/' in u.url] +        all_wayback_urls = [u.url for u in existing.urls if "://web.archive.org/web/" in u.url]          for url in all_urls:              # https/http redundancy -            if url.startswith('http://') and url.replace('http://', 'https://', 1) in all_urls: +            if url.startswith("http://") and url.replace("http://", "https://", 1) in all_urls:                  redundant_urls.append(url)                  continue              # default HTTP port included and not included -            if ':80/' in url and url.replace(':80', '', 1) in all_urls: +            if ":80/" in url and url.replace(":80", "", 1) in all_urls:                  redundant_urls.append(url)                  continue              # partial and complete wayback timestamps -            if '://web.archive.org/web/2017/' in url: +            if "://web.archive.org/web/2017/" in url:                  original_url = "/".join(url.split("/")[5:])                  assert len(original_url) > 5                  for wb_url in all_wayback_urls: @@ -452,7 +456,9 @@ class EntityImporter:      def generic_fileset_cleanups(existing):          return existing -    def match_existing_release_fuzzy(self, release: ReleaseEntity) -> Optional[Tuple[str, str, ReleaseEntity]]: +    def match_existing_release_fuzzy( +        self, release: ReleaseEntity +    ) -> Optional[Tuple[str, str, ReleaseEntity]]:          """          This helper function uses fuzzycat (and elasticsearch) to look for          existing release entities with similar metadata. 
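
As a usage sketch of the fuzzy-match selection reformatted in the next hunk: each candidate is paired with a fuzzycat verification result, and the "closest" match is the one whose status sorts first. In the sketch below, candidate_dicts are releases already converted with entity_to_dict, and status_sort stands in for this module's STATUS_SORT table; both names are placeholders for illustration.

    import fuzzycat.verify

    def pick_closest(release_dict, candidate_dicts, status_sort):
        # pair each candidate with its verification result, then take the
        # candidate whose status ranks best under the given sort table
        verified = [(fuzzycat.verify.verify(release_dict, cd), cd) for cd in candidate_dicts]
        return sorted(verified, key=lambda v: status_sort[v[0].status])[0]
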
@@ -488,7 +494,15 @@ class EntityImporter:              return None          release_dict = entity_to_dict(release, api_client=self.api.api_client) -        verified = [(fuzzycat.verify.verify(release_dict, entity_to_dict(c, api_client=self.api.api_client)), c) for c in candidates] +        verified = [ +            ( +                fuzzycat.verify.verify( +                    release_dict, entity_to_dict(c, api_client=self.api.api_client) +                ), +                c, +            ) +            for c in candidates +        ]          # chose the "closest" match          closest = sorted(verified, key=lambda v: STATUS_SORT[v[0].status])[0] @@ -522,7 +536,6 @@ class RecordPusher:  class JsonLinePusher(RecordPusher): -      def __init__(self, importer, json_file, **kwargs):          self.importer = importer          self.json_file = json_file @@ -539,10 +552,9 @@ class JsonLinePusher(RecordPusher):  class CsvPusher(RecordPusher): -      def __init__(self, importer, csv_file, **kwargs):          self.importer = importer -        self.reader = csv.DictReader(csv_file, delimiter=kwargs.get('delimiter', ',')) +        self.reader = csv.DictReader(csv_file, delimiter=kwargs.get("delimiter", ","))      def run(self):          for line in self.reader: @@ -555,7 +567,6 @@ class CsvPusher(RecordPusher):  class LinePusher(RecordPusher): -      def __init__(self, importer, text_file, **kwargs):          self.importer = importer          self.text_file = text_file @@ -571,17 +582,15 @@ class LinePusher(RecordPusher):  class SqlitePusher(RecordPusher): -      def __init__(self, importer, db_file, table_name, where_clause="", **kwargs):          self.importer = importer -        self.db = sqlite3.connect(db_file, isolation_level='EXCLUSIVE') +        self.db = sqlite3.connect(db_file, isolation_level="EXCLUSIVE")          self.db.row_factory = sqlite3.Row          self.table_name = table_name          self.where_clause = where_clause      def run(self): -        cur = self.db.execute("SELECT * FROM {} {};".format( -            self.table_name, self.where_clause)) +        cur = self.db.execute("SELECT * FROM {} {};".format(self.table_name, self.where_clause))          for row in cur:              self.importer.push_record(row)          counts = self.importer.finish() @@ -590,7 +599,6 @@ class SqlitePusher(RecordPusher):  class Bs4XmlLinesPusher(RecordPusher): -      def __init__(self, importer, xml_file, prefix_filter=None, **kwargs):          self.importer = importer          self.xml_file = xml_file @@ -611,7 +619,6 @@ class Bs4XmlLinesPusher(RecordPusher):  class Bs4XmlFilePusher(RecordPusher): -      def __init__(self, importer, xml_file, record_tag, **kwargs):          self.importer = importer          self.xml_file = xml_file @@ -684,7 +691,6 @@ class Bs4XmlLargeFilePusher(RecordPusher):  class Bs4XmlFileListPusher(RecordPusher): -      def __init__(self, importer, list_file, record_tag, **kwargs):          self.importer = importer          self.list_file = list_file @@ -695,7 +701,7 @@ class Bs4XmlFileListPusher(RecordPusher):              xml_path = xml_path.strip()              if not xml_path or xml_path.startswith("#"):                  continue -            with open(xml_path, 'r') as xml_file: +            with open(xml_path, "r") as xml_file:                  soup = BeautifulSoup(xml_file, "xml")                  for record in soup.find_all(self.record_tag):                      self.importer.push_record(record) @@ -705,10 +711,12 @@ class Bs4XmlFileListPusher(RecordPusher):     
     print(counts)          return counts +  class KafkaBs4XmlPusher(RecordPusher):      """      Fetch XML for an article from Kafka, parse via Bs4.      """ +      def __init__(self, importer, kafka_hosts, kafka_env, topic_suffix, group, **kwargs):          self.importer = importer          self.consumer = make_kafka_consumer( @@ -716,10 +724,10 @@ class KafkaBs4XmlPusher(RecordPusher):              kafka_env,              topic_suffix,              group, -            kafka_namespace=kwargs.get('kafka_namespace', 'fatcat') +            kafka_namespace=kwargs.get("kafka_namespace", "fatcat"),          ) -        self.poll_interval = kwargs.get('poll_interval', 5.0) -        self.consume_batch_size = kwargs.get('consume_batch_size', 25) +        self.poll_interval = kwargs.get("poll_interval", 5.0) +        self.consume_batch_size = kwargs.get("consume_batch_size", 25)      def run(self):          count = 0 @@ -735,16 +743,19 @@ class KafkaBs4XmlPusher(RecordPusher):              # outstanding editgroups every 5 minutes, but there is still that              # window when editgroups might be hanging (unsubmitted).              batch = self.consumer.consume( -                num_messages=self.consume_batch_size, -                timeout=self.poll_interval) -            print("... got {} kafka messages ({}sec poll interval) {}".format( -                len(batch), self.poll_interval, self.importer.counts)) +                num_messages=self.consume_batch_size, timeout=self.poll_interval +            ) +            print( +                "... got {} kafka messages ({}sec poll interval) {}".format( +                    len(batch), self.poll_interval, self.importer.counts +                ) +            )              if not batch:                  if datetime.datetime.now() - last_push > datetime.timedelta(minutes=5):                      # it has been some time, so flush any current editgroup                      self.importer.finish()                      last_push = datetime.datetime.now() -                    #print("Flushed any partial import batch: {}".format(self.importer.counts)) +                    # print("Flushed any partial import batch: {}".format(self.importer.counts))                  continue              # first check errors on entire batch...              for msg in batch: @@ -752,7 +763,7 @@ class KafkaBs4XmlPusher(RecordPusher):                      raise KafkaException(msg.error())              # ... 
then process              for msg in batch: -                soup = BeautifulSoup(msg.value().decode('utf-8'), "xml") +                soup = BeautifulSoup(msg.value().decode("utf-8"), "xml")                  self.importer.push_record(soup)                  soup.decompose()                  count += 1 @@ -771,8 +782,8 @@ class KafkaBs4XmlPusher(RecordPusher):          self.consumer.close()          return counts -class KafkaJsonPusher(RecordPusher): +class KafkaJsonPusher(RecordPusher):      def __init__(self, importer, kafka_hosts, kafka_env, topic_suffix, group, **kwargs):          self.importer = importer          self.consumer = make_kafka_consumer( @@ -780,11 +791,11 @@ class KafkaJsonPusher(RecordPusher):              kafka_env,              topic_suffix,              group, -            kafka_namespace=kwargs.get('kafka_namespace', 'fatcat') +            kafka_namespace=kwargs.get("kafka_namespace", "fatcat"),          ) -        self.poll_interval = kwargs.get('poll_interval', 5.0) -        self.consume_batch_size = kwargs.get('consume_batch_size', 100) -        self.force_flush = kwargs.get('force_flush', False) +        self.poll_interval = kwargs.get("poll_interval", 5.0) +        self.consume_batch_size = kwargs.get("consume_batch_size", 100) +        self.force_flush = kwargs.get("force_flush", False)      def run(self):          count = 0 @@ -801,10 +812,13 @@ class KafkaJsonPusher(RecordPusher):              # outstanding editgroups every 5 minutes, but there is still that              # window when editgroups might be hanging (unsubmitted).              batch = self.consumer.consume( -                num_messages=self.consume_batch_size, -                timeout=self.poll_interval) -            print("... got {} kafka messages ({}sec poll interval) {}".format( -                len(batch), self.poll_interval, self.importer.counts)) +                num_messages=self.consume_batch_size, timeout=self.poll_interval +            ) +            print( +                "... got {} kafka messages ({}sec poll interval) {}".format( +                    len(batch), self.poll_interval, self.importer.counts +                ) +            )              if self.force_flush:                  # this flushing happens even if there have been 'push' events                  # more recently. it is intended for, eg, importers off the @@ -821,7 +835,7 @@ class KafkaJsonPusher(RecordPusher):                      self.importer.finish()                      last_push = datetime.datetime.now()                      last_force_flush = datetime.datetime.now() -                    #print("Flushed any partial import batch: {}".format(self.importer.counts)) +                    # print("Flushed any partial import batch: {}".format(self.importer.counts))                  continue              # first check errors on entire batch...              for msg in batch: @@ -829,7 +843,7 @@ class KafkaJsonPusher(RecordPusher):                      raise KafkaException(msg.error())              # ... then process              for msg in batch: -                record = json.loads(msg.value().decode('utf-8')) +                record = json.loads(msg.value().decode("utf-8"))                  self.importer.push_record(record)                  count += 1                  if count % 500 == 0: @@ -864,25 +878,25 @@ def make_kafka_consumer(hosts, env, topic_suffix, group, kafka_namespace="fatcat                  print("Bailing out...")                  # TODO: should it be sys.exit(-1)?                  
raise KafkaException(p.error) -        #print("Kafka consumer commit successful") +        # print("Kafka consumer commit successful")          pass      # previously, using pykafka -    #auto_commit_enable=True, -    #auto_commit_interval_ms=30000, # 30 seconds +    # auto_commit_enable=True, +    # auto_commit_interval_ms=30000, # 30 seconds      conf = { -        'bootstrap.servers': hosts, -        'group.id': group, -        'on_commit': fail_fast, +        "bootstrap.servers": hosts, +        "group.id": group, +        "on_commit": fail_fast,          # messages don't have offset marked as stored until pushed to          # elastic, but we do auto-commit stored offsets to broker -        'enable.auto.offset.store': False, -        'enable.auto.commit': True, +        "enable.auto.offset.store": False, +        "enable.auto.commit": True,          # user code timeout; if no poll after this long, assume user code          # hung and rebalance (default: 5min) -        'max.poll.interval.ms': 120000, -        'default.topic.config': { -            'auto.offset.reset': 'latest', +        "max.poll.interval.ms": 120000, +        "default.topic.config": { +            "auto.offset.reset": "latest",          },      } @@ -890,13 +904,13 @@ def make_kafka_consumer(hosts, env, topic_suffix, group, kafka_namespace="fatcat          for p in partitions:              if p.error:                  raise KafkaException(p.error) -        print("Kafka partitions rebalanced: {} / {}".format( -            consumer, partitions)) +        print("Kafka partitions rebalanced: {} / {}".format(consumer, partitions))      consumer = Consumer(conf)      # NOTE: it's actually important that topic_name *not* be bytes (UTF-8      # encoded) -    consumer.subscribe([topic_name], +    consumer.subscribe( +        [topic_name],          on_assign=on_rebalance,          on_revoke=on_rebalance,      ) diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index fd6936a4..606d4bb1 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -1,4 +1,3 @@ -  import datetime  import sqlite3  from typing import Any, Dict, Optional @@ -13,30 +12,30 @@ from .common import EntityImporter, clean  # Can get a list of Crossref types (with counts) via API:  # https://api.crossref.org/works?rows=0&facet=type-name:*  CROSSREF_TYPE_MAP: Dict[str, Optional[str]] = { -    'book': 'book', -    'book-chapter': 'chapter', -    'book-part': 'chapter', -    'book-section': 'chapter', -    'component': 'component', -    'dataset': 'dataset', -    'dissertation': 'thesis', -    'edited-book': 'book', -    'journal-article': 'article-journal', -    'monograph': 'book', -    'other': None, -    'peer-review': 'peer_review', -    'posted-content': 'post', -    'proceedings-article': 'paper-conference', -    'reference-book': 'book', -    'reference-entry': 'entry', -    'report': 'report', -    'standard': 'standard', +    "book": "book", +    "book-chapter": "chapter", +    "book-part": "chapter", +    "book-section": "chapter", +    "component": "component", +    "dataset": "dataset", +    "dissertation": "thesis", +    "edited-book": "book", +    "journal-article": "article-journal", +    "monograph": "book", +    "other": None, +    "peer-review": "peer_review", +    "posted-content": "post", +    "proceedings-article": "paper-conference", +    "reference-book": "book", +    "reference-entry": "entry", +    "report": "report", +    "standard": "standard", 
 }  CONTAINER_TYPE_MAP: Dict[str, str] = { -    'article-journal': 'journal', -    'paper-conference': 'conference', -    'book': 'book-series', +    "article-journal": "journal", +    "paper-conference": "conference", +    "book": "book-series",  }  # These are based, informally, on sorting the most popular licenses found in @@ -90,29 +89,41 @@ LICENSE_SLUG_MAP: Dict[str, str] = {      "//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0",  } +  def lookup_license_slug(raw: str) -> Optional[str]:      if not raw:          return None -    raw = raw.strip().replace('http://', '//').replace('https://', '//') -    if 'creativecommons.org' in raw.lower(): +    raw = raw.strip().replace("http://", "//").replace("https://", "//") +    if "creativecommons.org" in raw.lower():          raw = raw.lower() -        raw = raw.replace('/legalcode', '/').replace('/uk', '') -        if not raw.endswith('/'): -            raw = raw + '/' +        raw = raw.replace("/legalcode", "/").replace("/uk", "") +        if not raw.endswith("/"): +            raw = raw + "/"      return LICENSE_SLUG_MAP.get(raw) +  def test_lookup_license_slug():      assert lookup_license_slug("https://creativecommons.org/licenses/by-nc/3.0/") == "CC-BY-NC" -    assert lookup_license_slug("http://creativecommons.org/licenses/by/2.0/uk/legalcode") == "CC-BY" -    assert lookup_license_slug("https://creativecommons.org/publicdomain/zero/1.0/legalcode") == "CC-0" +    assert ( +        lookup_license_slug("http://creativecommons.org/licenses/by/2.0/uk/legalcode") +        == "CC-BY" +    ) +    assert ( +        lookup_license_slug("https://creativecommons.org/publicdomain/zero/1.0/legalcode") +        == "CC-0" +    )      assert lookup_license_slug("http://creativecommons.org/licenses/by/4.0") == "CC-BY" -    assert lookup_license_slug("https://creativecommons.org/licenses/by-nc-sa/4.0/") == "CC-BY-NC-SA" +    assert ( +        lookup_license_slug("https://creativecommons.org/licenses/by-nc-sa/4.0/") +        == "CC-BY-NC-SA" +    )      assert lookup_license_slug("https://www.ametsoc.org/PUBSReuseLicenses") == "AMETSOC"      assert lookup_license_slug("https://www.amec.org/PUBSReuseLicenses") is None      assert lookup_license_slug("") is None      assert lookup_license_slug(None) is None +  class CrossrefImporter(EntityImporter):      """      Importer for Crossref metadata. 
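
A small usage example of the type-mapping tables reformatted above, assuming the module is importable as fatcat_tools.importers.crossref; the values come straight from CROSSREF_TYPE_MAP and CONTAINER_TYPE_MAP in this file.

    from fatcat_tools.importers.crossref import CONTAINER_TYPE_MAP, CROSSREF_TYPE_MAP

    release_type = CROSSREF_TYPE_MAP.get("proceedings-article")  # -> "paper-conference"
    container_type = CONTAINER_TYPE_MAP.get(release_type)        # -> "conference"
    assert (release_type, container_type) == ("paper-conference", "conference")
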
@@ -124,18 +135,22 @@ class CrossrefImporter(EntityImporter):      def __init__(self, api, issn_map_file, **kwargs): -        eg_desc: Optional[str] = kwargs.get('editgroup_description', -            "Automated import of Crossref DOI metadata, harvested from REST API") -        eg_extra: Optional[dict] = kwargs.get('editgroup_extra', dict()) -        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.CrossrefImporter') -        super().__init__(api, +        eg_desc: Optional[str] = kwargs.get( +            "editgroup_description", +            "Automated import of Crossref DOI metadata, harvested from REST API", +        ) +        eg_extra: Optional[dict] = kwargs.get("editgroup_extra", dict()) +        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.CrossrefImporter") +        super().__init__( +            api,              issn_map_file=issn_map_file,              editgroup_description=eg_desc,              editgroup_extra=eg_extra, -            **kwargs) +            **kwargs +        ) -        self.create_containers: bool = kwargs.get('create_containers', True) -        extid_map_file = kwargs.get('extid_map_file') +        self.create_containers: bool = kwargs.get("create_containers", True) +        extid_map_file = kwargs.get("extid_map_file")          self.extid_map_db: Optional[Any] = None          if extid_map_file:              db_uri = "file:{}?mode=ro".format(extid_map_file) @@ -148,12 +163,27 @@ class CrossrefImporter(EntityImporter):      def lookup_ext_ids(self, doi: str) -> Optional[Any]:          if self.extid_map_db is None: -            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) -        row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", -            [doi.lower()]).fetchone() +            return dict( +                core_id=None, +                pmid=None, +                pmcid=None, +                wikidata_qid=None, +                arxiv_id=None, +                jstor_id=None, +            ) +        row = self.extid_map_db.execute( +            "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? 
LIMIT 1", [doi.lower()] +        ).fetchone()          if row is None: -            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) -        row = [str(cell or '') or None for cell in row] +            return dict( +                core_id=None, +                pmid=None, +                pmcid=None, +                wikidata_qid=None, +                arxiv_id=None, +                jstor_id=None, +            ) +        row = [str(cell or "") or None for cell in row]          return dict(              core_id=row[0],              pmid=row[1], @@ -173,17 +203,17 @@ class CrossrefImporter(EntityImporter):          return CONTAINER_TYPE_MAP.get(crossref_type)      def want(self, obj: Dict[str, Any]) -> bool: -        if not obj.get('title'): -            self.counts['skip-blank-title'] += 1 +        if not obj.get("title"): +            self.counts["skip-blank-title"] += 1              return False          # these are pre-registered DOIs before the actual record is ready          # title is a list of titles -        titles = obj.get('title') +        titles = obj.get("title")          if titles is not None and titles[0].strip().lower() in [ -                "OUP accepted manuscript".lower(), -            ]: -            self.counts['skip-stub-title'] += 1 +            "OUP accepted manuscript".lower(), +        ]: +            self.counts["skip-stub-title"] += 1              return False          # do most of these checks in-line below @@ -197,86 +227,105 @@ class CrossrefImporter(EntityImporter):          # Ways to be out of scope (provisionally)          # journal-issue and journal-volume map to None, but allowed for now -        if obj.get('type') in (None, 'journal', 'proceedings', -                'standard-series', 'report-series', 'book-series', 'book-set', -                'book-track', 'proceedings-series'): -            self.counts['skip-release-type'] += 1 +        if obj.get("type") in ( +            None, +            "journal", +            "proceedings", +            "standard-series", +            "report-series", +            "book-series", +            "book-set", +            "book-track", +            "proceedings-series", +        ): +            self.counts["skip-release-type"] += 1              return None          # Do require the 'title' keys to exist, as release entities do -        if ('title' not in obj) or (not obj['title']): -            self.counts['skip-blank-title'] += 1 +        if ("title" not in obj) or (not obj["title"]): +            self.counts["skip-blank-title"] += 1              return None -        release_type = self.map_release_type(obj['type']) +        release_type = self.map_release_type(obj["type"])          # contribs          def do_contribs(obj_list, ctype):              contribs = []              for i, am in enumerate(obj_list):                  creator_id = None -                if 'ORCID' in am.keys(): -                    creator_id = self.lookup_orcid(am['ORCID'].split('/')[-1]) +                if "ORCID" in am.keys(): +                    creator_id = self.lookup_orcid(am["ORCID"].split("/")[-1])                  # Sorry humans :( -                if am.get('given') and am.get('family'): -                    raw_name = "{} {}".format(am['given'], am['family']) -                elif am.get('family'): -                    raw_name = am['family'] +                if am.get("given") and am.get("family"): +                    raw_name = "{} {}".format(am["given"], am["family"]) +       
         elif am.get("family"): +                    raw_name = am["family"]                  else:                      # TODO: can end up empty -                    raw_name = am.get('name') or am.get('given') +                    raw_name = am.get("name") or am.get("given")                  extra = dict()                  if ctype == "author":                      index = i                  else:                      index = None                  raw_affiliation = None -                if am.get('affiliation'): -                    if len(am.get('affiliation')) > 0: -                        raw_affiliation = am.get('affiliation')[0]['name'] -                    if len(am.get('affiliation')) > 1: +                if am.get("affiliation"): +                    if len(am.get("affiliation")) > 0: +                        raw_affiliation = am.get("affiliation")[0]["name"] +                    if len(am.get("affiliation")) > 1:                          # note: affiliation => more_affiliations -                        extra['more_affiliations'] = [clean(a['name']) for a in am.get('affiliation')[1:]] -                if am.get('sequence') and am.get('sequence') != "additional": -                    extra['seq'] = clean(am.get('sequence')) +                        extra["more_affiliations"] = [ +                            clean(a["name"]) for a in am.get("affiliation")[1:] +                        ] +                if am.get("sequence") and am.get("sequence") != "additional": +                    extra["seq"] = clean(am.get("sequence"))                  if not extra:                      extra = None                  assert ctype in ("author", "editor", "translator")                  raw_name = clean(raw_name) -                contribs.append(fatcat_openapi_client.ReleaseContrib( -                    creator_id=creator_id, -                    index=index, -                    raw_name=raw_name, -                    given_name=clean(am.get('given')), -                    surname=clean(am.get('family')), -                    raw_affiliation=clean(raw_affiliation), -                    role=ctype, -                    extra=extra)) +                contribs.append( +                    fatcat_openapi_client.ReleaseContrib( +                        creator_id=creator_id, +                        index=index, +                        raw_name=raw_name, +                        given_name=clean(am.get("given")), +                        surname=clean(am.get("family")), +                        raw_affiliation=clean(raw_affiliation), +                        role=ctype, +                        extra=extra, +                    ) +                )              return contribs -        contribs = do_contribs(obj.get('author', []), "author") -        contribs.extend(do_contribs(obj.get('editor', []), "editor")) -        contribs.extend(do_contribs(obj.get('translator', []), "translator")) + +        contribs = do_contribs(obj.get("author", []), "author") +        contribs.extend(do_contribs(obj.get("editor", []), "editor")) +        contribs.extend(do_contribs(obj.get("translator", []), "translator"))          # container -        issn = obj.get('ISSN', [None])[0] +        issn = obj.get("ISSN", [None])[0]          issnl = self.issn2issnl(issn)          container_id = None          if issnl:              container_id = self.lookup_issnl(issnl) -        publisher = clean(obj.get('publisher')) +        publisher = clean(obj.get("publisher")) -        container_name = obj.get('container-title') +        
container_name = obj.get("container-title")          if container_name:              container_name = clean(container_name[0], force_xml=True)          if not container_name:              container_name = None -        if (container_id is None and self.create_containers and (issnl is not None) -                and container_name): +        if ( +            container_id is None +            and self.create_containers +            and (issnl is not None) +            and container_name +        ):              ce = fatcat_openapi_client.ContainerEntity(                  issnl=issnl,                  publisher=publisher,                  container_type=self.map_container_type(release_type), -                name=container_name) +                name=container_name, +            )              ce_edit = self.create_container(ce)              container_id = ce_edit.ident              self._issnl_id_map[issnl] = container_id @@ -284,21 +333,21 @@ class CrossrefImporter(EntityImporter):          # license slug          license_slug = None          license_extra = [] -        for lic in obj.get('license', []): -            if lic['content-version'] not in ('vor', 'unspecified'): +        for lic in obj.get("license", []): +            if lic["content-version"] not in ("vor", "unspecified"):                  continue -            slug = lookup_license_slug(lic['URL']) +            slug = lookup_license_slug(lic["URL"])              if slug:                  license_slug = slug -            if 'start' in lic: -                lic['start'] = lic['start']['date-time'] +            if "start" in lic: +                lic["start"] = lic["start"]["date-time"]              license_extra.append(lic)          # references          refs = [] -        for i, rm in enumerate(obj.get('reference', [])): +        for i, rm in enumerate(obj.get("reference", [])):              try: -                year: Optional[int] = int(rm.get('year')) +                year: Optional[int] = int(rm.get("year"))                  # TODO: will need to update/config in the future!                  # NOTE: are there crossref works with year < 100?                  
if year is not None: @@ -307,56 +356,78 @@ class CrossrefImporter(EntityImporter):              except (TypeError, ValueError):                  year = None              ref_extra: Dict[str, Any] = dict() -            key = rm.get('key') -            if key and key.startswith(obj['DOI'].upper()): -                key = key.replace(obj['DOI'].upper() + "-", '') -                key = key.replace(obj['DOI'].upper(), '') -            ref_container_name = rm.get('volume-title') +            key = rm.get("key") +            if key and key.startswith(obj["DOI"].upper()): +                key = key.replace(obj["DOI"].upper() + "-", "") +                key = key.replace(obj["DOI"].upper(), "") +            ref_container_name = rm.get("volume-title")              if not ref_container_name: -                ref_container_name = rm.get('journal-title') -            elif rm.get('journal-title'): -                ref_extra['journal-title'] = rm['journal-title'] -            if rm.get('DOI'): -                ref_extra['doi'] = rm.get('DOI').lower() -            author = clean(rm.get('author')) +                ref_container_name = rm.get("journal-title") +            elif rm.get("journal-title"): +                ref_extra["journal-title"] = rm["journal-title"] +            if rm.get("DOI"): +                ref_extra["doi"] = rm.get("DOI").lower() +            author = clean(rm.get("author"))              if author: -                ref_extra['authors'] = [author] -            for k in ('editor', 'edition', 'authority', 'version', 'genre', -                    'url', 'event', 'issue', 'volume', 'date', 'accessed_date', -                    'issued', 'page', 'medium', 'collection_title', 'chapter_number', -                    'unstructured', 'series-title', 'volume-title'): +                ref_extra["authors"] = [author] +            for k in ( +                "editor", +                "edition", +                "authority", +                "version", +                "genre", +                "url", +                "event", +                "issue", +                "volume", +                "date", +                "accessed_date", +                "issued", +                "page", +                "medium", +                "collection_title", +                "chapter_number", +                "unstructured", +                "series-title", +                "volume-title", +            ):                  if clean(rm.get(k)):                      ref_extra[k] = clean(rm[k])              if not ref_extra:                  ref_extra = None -            refs.append(fatcat_openapi_client.ReleaseRef( -                index=i, -                # doing lookups would be a second import pass -                target_release_id=None, -                key=key, -                year=year, -                container_name=clean(ref_container_name), -                title=clean(rm.get('article-title')), -                locator=clean(rm.get('first-page')), -                # TODO: just dump JSON somewhere here? 
-                extra=ref_extra)) +            refs.append( +                fatcat_openapi_client.ReleaseRef( +                    index=i, +                    # doing lookups would be a second import pass +                    target_release_id=None, +                    key=key, +                    year=year, +                    container_name=clean(ref_container_name), +                    title=clean(rm.get("article-title")), +                    locator=clean(rm.get("first-page")), +                    # TODO: just dump JSON somewhere here? +                    extra=ref_extra, +                ) +            )          # abstracts          abstracts = [] -        abstract = clean(obj.get('abstract')) +        abstract = clean(obj.get("abstract"))          if abstract and len(abstract) > 10: -            abstracts.append(fatcat_openapi_client.ReleaseAbstract( -                mimetype="application/xml+jats", -                content=abstract)) +            abstracts.append( +                fatcat_openapi_client.ReleaseAbstract( +                    mimetype="application/xml+jats", content=abstract +                ) +            )          # extra fields          extra = dict()          extra_crossref = dict()          # top-level extra keys          if not container_id: -            if obj.get('container-title'): -                extra['container_name'] = container_name -        for key in ('group-title'): +            if obj.get("container-title"): +                extra["container_name"] = container_name +        for key in "group-title":              val = obj.get(key)              if val:                  if type(val) == list: @@ -368,7 +439,7 @@ class CrossrefImporter(EntityImporter):                  else:                      extra[key] = val          # crossref-nested extra keys -        for key in ('subject', 'type', 'alternative-id', 'archive', 'funder'): +        for key in ("subject", "type", "alternative-id", "archive", "funder"):              val = obj.get(key)              if val:                  if type(val) == str: @@ -376,46 +447,51 @@ class CrossrefImporter(EntityImporter):                  else:                      extra_crossref[key] = val          if license_extra: -            extra_crossref['license'] = license_extra +            extra_crossref["license"] = license_extra -        if len(obj['title']) > 1: -            aliases = [clean(t) for t in obj['title'][1:]] +        if len(obj["title"]) > 1: +            aliases = [clean(t) for t in obj["title"][1:]]              aliases = [t for t in aliases if t]              if aliases: -                extra['aliases'] = aliases +                extra["aliases"] = aliases          # ISBN          isbn13 = None -        for raw in obj.get('ISBN', []): +        for raw in obj.get("ISBN", []):              # TODO: convert if not ISBN-13 format              if len(raw) == 17:                  isbn13 = raw                  break          # release status -        if obj['type'] in ('journal-article', 'conference-proceeding', 'book', -                'dissertation', 'book-chapter'): +        if obj["type"] in ( +            "journal-article", +            "conference-proceeding", +            "book", +            "dissertation", +            "book-chapter", +        ):              release_stage = "published"          else:              # unknown              release_stage = None          # external identifiers -        extids: Dict[str, Any] = self.lookup_ext_ids(doi=obj['DOI'].lower()) +        extids: Dict[str, 
Any] = self.lookup_ext_ids(doi=obj["DOI"].lower())          # filter out unreasonably huge releases          if len(abstracts) > 100: -            self.counts['skip-huge-abstracts'] += 1 +            self.counts["skip-huge-abstracts"] += 1              return None          if len(contribs) > 2000: -            self.counts['skip-huge-contribs'] += 1 +            self.counts["skip-huge-contribs"] += 1              return None          if len(refs) > 5000: -            self.counts['skip-huge-refs'] += 1 +            self.counts["skip-huge-refs"] += 1              return None          # release date parsing is amazingly complex -        raw_date = obj['issued']['date-parts'][0] +        raw_date = obj["issued"]["date-parts"][0]          if not raw_date or not raw_date[0]:              # got some NoneType, even though at least year is supposed to be set              release_year = None @@ -429,28 +505,28 @@ class CrossrefImporter(EntityImporter):              release_date = None          original_title: Optional[str] = None -        if obj.get('original-title'): -            ot = obj.get('original-title') +        if obj.get("original-title"): +            ot = obj.get("original-title")              if ot is not None:                  original_title = clean(ot[0], force_xml=True)          title: Optional[str] = None -        if obj.get('title'): -            title = clean(obj.get('title')[0], force_xml=True) +        if obj.get("title"): +            title = clean(obj.get("title")[0], force_xml=True)              if not title or len(title) <= 1:                  # title can't be just a single character -                self.counts['skip-blank-title'] += 1 +                self.counts["skip-blank-title"] += 1                  return None          subtitle = None -        if obj.get('subtitle'): -            subtitle = clean(obj.get('subtitle')[0], force_xml=True) +        if obj.get("subtitle"): +            subtitle = clean(obj.get("subtitle")[0], force_xml=True)              if not subtitle or len(subtitle) <= 1:                  # subtitle can't be just a single character                  subtitle = None          if extra_crossref: -            extra['crossref'] = extra_crossref +            extra["crossref"] = extra_crossref          if not extra:              extra = None @@ -466,19 +542,19 @@ class CrossrefImporter(EntityImporter):              release_year=release_year,              publisher=publisher,              ext_ids=fatcat_openapi_client.ReleaseExtIds( -                doi=obj['DOI'].lower(), -                pmid=extids['pmid'], -                pmcid=extids['pmcid'], -                wikidata_qid=extids['wikidata_qid'], +                doi=obj["DOI"].lower(), +                pmid=extids["pmid"], +                pmcid=extids["pmcid"], +                wikidata_qid=extids["wikidata_qid"],                  isbn13=isbn13, -                core=extids['core_id'], -                arxiv=extids['arxiv_id'], -                jstor=extids['jstor_id'], +                core=extids["core_id"], +                arxiv=extids["arxiv_id"], +                jstor=extids["jstor_id"],              ), -            volume=clean(obj.get('volume')), -            issue=clean(obj.get('issue')), -            pages=clean(obj.get('page')), -            language=clean(obj.get('language')), +            volume=clean(obj.get("volume")), +            issue=clean(obj.get("issue")), +            pages=clean(obj.get("page")), +            language=clean(obj.get("language")),              
license_slug=license_slug,              extra=extra,              abstracts=abstracts, @@ -500,14 +576,17 @@ class CrossrefImporter(EntityImporter):          # eventually we'll want to support "updates", but for now just skip if          # entity already exists          if existing: -            self.counts['exists'] += 1 +            self.counts["exists"] += 1              return False          return True      def insert_batch(self, batch): -        self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( -            editgroup=fatcat_openapi_client.Editgroup( -                description=self.editgroup_description, -                extra=self.editgroup_extra), -            entity_list=batch)) +        self.api.create_release_auto_batch( +            fatcat_openapi_client.ReleaseAutoBatch( +                editgroup=fatcat_openapi_client.Editgroup( +                    description=self.editgroup_description, extra=self.editgroup_extra +                ), +                entity_list=batch, +            ) +        ) diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index a06c68a4..4c174b0b 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -30,126 +30,130 @@ MAX_ABSTRACT_LENGTH = 2048  # https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary  CONTAINER_TYPE_MAP = { -    'Journal': 'journal', -    'Series': 'journal', -    'Book Series': 'book-series', +    "Journal": "journal", +    "Series": "journal", +    "Book Series": "book-series",  }  # The docs/guide should be the canonical home for these mappings; update there  # first.  Map various datacite type types to CSL-ish types. None means TODO or  # remove.  DATACITE_TYPE_MAP = { -    'ris': { -        'THES': 'thesis', -        'SOUND': 'song', # 99.9% maps to citeproc song, so use that (exception: report) -        'CHAP': 'chapter', -        'FIGURE': 'figure', -        'RPRT': 'report', -        'JOUR': 'article-journal', -        'MPCT': 'motion_picture', -        'GEN': 'article-journal', # GEN consist of 99% article and report, post-weblog, misc - and one dataset -        'BOOK': 'book', -        'DATA': 'dataset', -        'COMP': 'software', +    "ris": { +        "THES": "thesis", +        "SOUND": "song",  # 99.9% maps to citeproc song, so use that (exception: report) +        "CHAP": "chapter", +        "FIGURE": "figure", +        "RPRT": "report", +        "JOUR": "article-journal", +        "MPCT": "motion_picture", +        "GEN": "article-journal",  # GEN consist of 99% article and report, post-weblog, misc - and one dataset +        "BOOK": "book", +        "DATA": "dataset", +        "COMP": "software",      }, -    'schemaOrg': { -        'Dataset': 'dataset', -        'Book': 'book', -        'ScholarlyArticle': 'article-journal', -        'ImageObject': 'graphic', -        'Collection': None, -        'MediaObject': None, -        'Event': None, -        'SoftwareSourceCode': 'software', -        'Chapter': 'chapter', -        'CreativeWork': None, # Seems to be a catch-all resourceType, from PGRFA Material, Pamphlet, to music score. 
-        'PublicationIssue': 'article', -        'AudioObject': None, -        'Thesis': 'thesis', +    "schemaOrg": { +        "Dataset": "dataset", +        "Book": "book", +        "ScholarlyArticle": "article-journal", +        "ImageObject": "graphic", +        "Collection": None, +        "MediaObject": None, +        "Event": None, +        "SoftwareSourceCode": "software", +        "Chapter": "chapter", +        "CreativeWork": None,  # Seems to be a catch-all resourceType, from PGRFA Material, Pamphlet, to music score. +        "PublicationIssue": "article", +        "AudioObject": None, +        "Thesis": "thesis",      }, -    'citeproc': { -        'article': 'article', -        'article-journal': 'article-journal', -        'article-magazine': 'article-magazine', -        'article-newspaper': 'article-newspaper', -        'bill': 'bill', -        'book': 'book', -        'broadcast': 'broadcast', -        'chapter': 'chapter', -        'dataset': 'dataset', -        'entry-dictionary': 'entry-dictionary', -        'entry-encyclopedia': 'entry-encyclopedia', -        'entry': 'entry', -        'figure': 'figure', -        'graphic': 'graphic', -        'interview': 'interview', -        'legal_case': 'legal_case', -        'legislation': 'legislation', -        'manuscript': 'manuscript', -        'map': 'map', -        'motion_picture': 'motion_picture', -        'musical_score': 'musical_score', -        'pamphlet': 'pamphlet', -        'paper-conference': 'paper-conference', -        'patent': 'patent', -        'personal_communication': 'personal_communication', -        'post': 'post', -        'post-weblog': 'post-weblog', -        'report': 'report', -        'review-book': 'review-book', -        'review': 'review', -        'song': 'song', -        'speech': 'speech', -        'thesis': 'thesis', -        'treaty': 'treaty', -        'webpage': 'webpage', +    "citeproc": { +        "article": "article", +        "article-journal": "article-journal", +        "article-magazine": "article-magazine", +        "article-newspaper": "article-newspaper", +        "bill": "bill", +        "book": "book", +        "broadcast": "broadcast", +        "chapter": "chapter", +        "dataset": "dataset", +        "entry-dictionary": "entry-dictionary", +        "entry-encyclopedia": "entry-encyclopedia", +        "entry": "entry", +        "figure": "figure", +        "graphic": "graphic", +        "interview": "interview", +        "legal_case": "legal_case", +        "legislation": "legislation", +        "manuscript": "manuscript", +        "map": "map", +        "motion_picture": "motion_picture", +        "musical_score": "musical_score", +        "pamphlet": "pamphlet", +        "paper-conference": "paper-conference", +        "patent": "patent", +        "personal_communication": "personal_communication", +        "post": "post", +        "post-weblog": "post-weblog", +        "report": "report", +        "review-book": "review-book", +        "review": "review", +        "song": "song", +        "speech": "speech", +        "thesis": "thesis", +        "treaty": "treaty", +        "webpage": "webpage",      },  # https://docs.citationstyles.org/en/master/specification.html#appendix-iii-types -    'bibtex': { -        'phdthesis': 'thesis', -        'inbook': 'chapter', -        'misc': None, -        'article': 'article-journal', -        'book': 'book', +    "bibtex": { +        "phdthesis": "thesis", +        "inbook": "chapter", +        "misc": None, +        
"article": "article-journal", +        "book": "book",      }, -    'resourceTypeGeneral': { -        'Image': 'graphic', -        'Dataset': 'dataset', -        'PhysicalObject': None, -        'Collection': None, -        'Text': None, # "Greyliterature, labnotes, accompanyingmaterials" -        'Sound': None, -        'InteractiveResource': None, -        'Event': None, -        'Software': 'software', -        'Other': None, -        'Workflow': None, -        'Audiovisual': None, -    } # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32 +    "resourceTypeGeneral": { +        "Image": "graphic", +        "Dataset": "dataset", +        "PhysicalObject": None, +        "Collection": None, +        "Text": None,  # "Greyliterature, labnotes, accompanyingmaterials" +        "Sound": None, +        "InteractiveResource": None, +        "Event": None, +        "Software": "software", +        "Other": None, +        "Workflow": None, +        "Audiovisual": None, +    },  # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32  }  # DATACITE_UNKNOWN_MARKERS via https://support.datacite.org/docs/schema-values-unknown-information-v43.  DATACITE_UNKNOWN_MARKERS = ( -    '(:unac)',  # temporarily inaccessible -    '(:unal)',  # unallowed, suppressed intentionally -    '(:unap)',  # not applicable, makes no sense -    '(:unas)',  # value unassigned (e.g., Untitled) -    '(:unav)',  # value unavailable, possibly unknown -    '(:unkn)',  # known to be unknown (e.g., Anonymous, Inconnue) -    '(:none)',  # never had a value, never will -    '(:null)',  # explicitly and meaningfully empty -    '(:tba)',  # to be assigned or announced later -    '(:etal)',  # too numerous to list (et alia) +    "(:unac)",  # temporarily inaccessible +    "(:unal)",  # unallowed, suppressed intentionally +    "(:unap)",  # not applicable, makes no sense +    "(:unas)",  # value unassigned (e.g., Untitled) +    "(:unav)",  # value unavailable, possibly unknown +    "(:unkn)",  # known to be unknown (e.g., Anonymous, Inconnue) +    "(:none)",  # never had a value, never will +    "(:null)",  # explicitly and meaningfully empty +    "(:tba)",  # to be assigned or announced later +    "(:etal)",  # too numerous to list (et alia)  )  # UNKNOWN_MARKERS joins official datacite markers with a generic tokens marking  # unknown values. -UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union(set(( -    'NA', -    'NN', -    'n.a.', -    '[s.n.]', -    'Unknown', -))) +UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union( +    set( +        ( +            "NA", +            "NN", +            "n.a.", +            "[s.n.]", +            "Unknown", +        ) +    ) +)  # UNKNOWN_MARKERS_LOWER are lowercase version of UNKNOWN blocklist.  
UNKNOWN_MARKERS_LOWER = set((v.lower() for v in UNKNOWN_MARKERS)) @@ -157,8 +161,20 @@ UNKNOWN_MARKERS_LOWER = set((v.lower() for v in UNKNOWN_MARKERS))  # Any "min" number of "tokens" will signal "spam", https://fatcat.wiki/release/rzcpjwukobd4pj36ipla22cnoi  DATACITE_TITLE_SPAM_WORDGROUPS = [      { -        "tokens": ('full', 'movies', 'movie', 'watch', 'streaming', 'online', -                   'free', 'hd', 'download', 'english', 'subtitle', 'bluray'), +        "tokens": ( +            "full", +            "movies", +            "movie", +            "watch", +            "streaming", +            "online", +            "free", +            "hd", +            "download", +            "english", +            "subtitle", +            "bluray", +        ),          "min": 4,      }  ] @@ -205,28 +221,25 @@ class DataciteImporter(EntityImporter):      """      Importer for datacite records.      """ -    def __init__(self, -                 api, -                 issn_map_file, -                 debug=False, -                 insert_log_file=None, -                 **kwargs): + +    def __init__(self, api, issn_map_file, debug=False, insert_log_file=None, **kwargs):          eg_desc = kwargs.get( -            'editgroup_description', -            "Automated import of Datacite DOI metadata, harvested from REST API" +            "editgroup_description", +            "Automated import of Datacite DOI metadata, harvested from REST API",          ) -        eg_extra = kwargs.get('editgroup_extra', dict()) -        eg_extra['agent'] = eg_extra.get('agent', -                                         'fatcat_tools.DataciteImporter') -        super().__init__(api, -                         issn_map_file=issn_map_file, -                         editgroup_description=eg_desc, -                         editgroup_extra=eg_extra, -                         **kwargs) - -        self.create_containers = kwargs.get('create_containers', True) -        extid_map_file = kwargs.get('extid_map_file') +        eg_extra = kwargs.get("editgroup_extra", dict()) +        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.DataciteImporter") +        super().__init__( +            api, +            issn_map_file=issn_map_file, +            editgroup_description=eg_desc, +            editgroup_extra=eg_extra, +            **kwargs +        ) + +        self.create_containers = kwargs.get("create_containers", True) +        extid_map_file = kwargs.get("extid_map_file")          self.extid_map_db = None          if extid_map_file:              db_uri = "file:{}?mode=ro".format(extid_map_file) @@ -240,30 +253,34 @@ class DataciteImporter(EntityImporter):          self.insert_log_file = insert_log_file          self.this_year = datetime.datetime.now().year -        print('datacite with debug={}'.format(self.debug), file=sys.stderr) +        print("datacite with debug={}".format(self.debug), file=sys.stderr)      def lookup_ext_ids(self, doi):          """          Return dictionary of identifiers referring to the same things as the given DOI.          
"""          if self.extid_map_db is None: -            return dict(core_id=None, -                        pmid=None, -                        pmcid=None, -                        wikidata_qid=None, -                        arxiv_id=None, -                        jstor_id=None) +            return dict( +                core_id=None, +                pmid=None, +                pmcid=None, +                wikidata_qid=None, +                arxiv_id=None, +                jstor_id=None, +            )          row = self.extid_map_db.execute( -            "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", -            [doi.lower()]).fetchone() +            "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()] +        ).fetchone()          if row is None: -            return dict(core_id=None, -                        pmid=None, -                        pmcid=None, -                        wikidata_qid=None, -                        arxiv_id=None, -                        jstor_id=None) -        row = [str(cell or '') or None for cell in row] +            return dict( +                core_id=None, +                pmid=None, +                pmcid=None, +                wikidata_qid=None, +                arxiv_id=None, +                jstor_id=None, +            ) +        row = [str(cell or "") or None for cell in row]          return dict(              core_id=row[0],              pmid=row[1], @@ -280,22 +297,22 @@ class DataciteImporter(EntityImporter):          """          if not obj or not isinstance(obj, dict):              return None -        if 'attributes' not in obj: +        if "attributes" not in obj:              return None -        attributes = obj['attributes'] -        doi = clean_doi(attributes.get('doi', '').lower()) +        attributes = obj["attributes"] +        doi = clean_doi(attributes.get("doi", "").lower())          if not doi: -            print('skipping record without a DOI', file=sys.stderr) +            print("skipping record without a DOI", file=sys.stderr)              return          if not str.isascii(doi): -            print('[{}] skipping non-ascii doi for now'.format(doi)) +            print("[{}] skipping non-ascii doi for now".format(doi))              return None -        creators = attributes.get('creators', []) or [] -        contributors = attributes.get('contributors', []) or []  # Much fewer than creators. +        creators = attributes.get("creators", []) or [] +        contributors = attributes.get("contributors", []) or []  # Much fewer than creators.          contribs = self.parse_datacite_creators(creators, doi=doi) @@ -323,7 +340,9 @@ class DataciteImporter(EntityImporter):          # Related: https://guide.fatcat.wiki/entity_release.html -- role          # (string, of a set): the type of contribution, from a controlled          # vocabulary. TODO: vocabulary needs review. -        contribs_extra_contributors = self.parse_datacite_creators(contributors, set_index=False, doi=doi) +        contribs_extra_contributors = self.parse_datacite_creators( +            contributors, set_index=False, doi=doi +        )          # Unfortunately, creators and contributors might overlap, refs GH59.          
for cc in contribs_extra_contributors: @@ -333,17 +352,16 @@ class DataciteImporter(EntityImporter):          # Title, may come with "attributes.titles[].titleType", like          # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle" -        titles = attributes.get('titles', []) or [] -        title, original_language_title, subtitle = parse_datacite_titles( -            titles) +        titles = attributes.get("titles", []) or [] +        title, original_language_title, subtitle = parse_datacite_titles(titles)          if title is None: -            print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr) +            print("[{}] skipping record w/o title: {}".format(doi, obj), file=sys.stderr)              return False          title = clean(title)          if not title: -            print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr) +            print("[{}] skipping record w/o title: {}".format(doi, obj), file=sys.stderr)              return False          # check for blocklisted "spam", e.g. "FULL MOVIE" @@ -367,10 +385,13 @@ class DataciteImporter(EntityImporter):          # "Collected", "Copyrighted", "Created", "Issued", "Submitted",          # "Updated", "Valid".          release_date, release_month, release_year = parse_datacite_dates( -            attributes.get('dates', [])) +            attributes.get("dates", []) +        )          # block bogus far-future years/dates -        if release_year is not None and (release_year > (self.this_year + 5) or release_year < 1000): +        if release_year is not None and ( +            release_year > (self.this_year + 5) or release_year < 1000 +        ):              release_date = None              release_month = None              release_year = None @@ -378,26 +399,30 @@ class DataciteImporter(EntityImporter):          # Some records do not use the "dates" field (e.g. micropub), but:          # "attributes.published" or "attributes.publicationYear"          if not any((release_date, release_month, release_year)): -            release_date, release_month, release_year = parse_single_date(attributes.get('publicationYear')) +            release_date, release_month, release_year = parse_single_date( +                attributes.get("publicationYear") +            )              if not any((release_date, release_month, release_year)): -                release_date, release_month, release_year = parse_single_date(attributes.get('published')) +                release_date, release_month, release_year = parse_single_date( +                    attributes.get("published") +                )          if not any((release_date, release_month, release_year)): -            print('[{}] record w/o date: {}'.format(doi, obj), file=sys.stderr) +            print("[{}] record w/o date: {}".format(doi, obj), file=sys.stderr)          # Start with clear stages, e.g. published. TODO(martin): we could          # probably infer a bit more from the relations, e.g.          # "IsPreviousVersionOf" or "IsNewVersionOf". -        release_stage = 'published' +        release_stage = "published"          # TODO(martin): If 'state' is not 'findable' or 'isActive' is not true,          # we might want something else than 'published'. See also:          # https://support.datacite.org/docs/doi-states.          # Publisher. A few NA values. A few bogus values. 
-        publisher = attributes.get('publisher') +        publisher = attributes.get("publisher") -        if publisher in UNKNOWN_MARKERS | set(('Unpublished', 'Unknown')): +        if publisher in UNKNOWN_MARKERS | set(("Unpublished", "Unknown")):              publisher = None              release_stage = None          if publisher is not None and len(publisher) > 80: @@ -416,24 +441,26 @@ class DataciteImporter(EntityImporter):          container_id = None          container_name = None -        container = attributes.get('container', {}) or {} -        if container.get('type') in CONTAINER_TYPE_MAP.keys(): -            container_type = CONTAINER_TYPE_MAP.get(container['type']) -            if container.get('identifier') and container.get( -                    'identifierType') == 'ISSN': -                issn = container.get('identifier') +        container = attributes.get("container", {}) or {} +        if container.get("type") in CONTAINER_TYPE_MAP.keys(): +            container_type = CONTAINER_TYPE_MAP.get(container["type"]) +            if container.get("identifier") and container.get("identifierType") == "ISSN": +                issn = container.get("identifier")                  if len(issn) == 8:                      issn = issn[:4] + "-" + issn[4:]                  issnl = self.issn2issnl(issn)                  if issnl is not None:                      container_id = self.lookup_issnl(issnl) -                    if container_id is None and container.get('title'): -                        container_name = container.get('title') +                    if container_id is None and container.get("title"): +                        container_name = container.get("title")                          if isinstance(container_name, list):                              if len(container_name) > 0: -                                print('[{}] too many container titles: {}'.format(doi, -                                    len(container_name))) +                                print( +                                    "[{}] too many container titles: {}".format( +                                        doi, len(container_name) +                                    ) +                                )                                  container_name = container_name[0]                          assert isinstance(container_name, str)                          ce = fatcat_openapi_client.ContainerEntity( @@ -447,21 +474,24 @@ class DataciteImporter(EntityImporter):                  else:                      # TODO(martin): factor this out into a testable function.                      # TODO(martin): "container_name": "№1(1) (2018)" / 10.26087/inasan.2018.1.1.013 -                    container_name = container.get('title') +                    container_name = container.get("title")                      if isinstance(container_name, list):                          if len(container_name) > 0: -                            print('[{}] too many container titles: {}'.format(doi, -                                len(container_name))) +                            print( +                                "[{}] too many container titles: {}".format( +                                    doi, len(container_name) +                                ) +                            )                              container_name = container_name[0]          # Exception: https://www.micropublication.org/, see: !MR24.          
if container_id is None and container_name is None: -            if publisher and publisher.lower().startswith('micropublication'): +            if publisher and publisher.lower().startswith("micropublication"):                  container_name = publisher          # Volume and issue. -        volume = container.get('volume') -        issue = container.get('issue') +        volume = container.get("volume") +        issue = container.get("issue")          if volume:              volume = clean(volume) @@ -472,13 +502,13 @@ class DataciteImporter(EntityImporter):          # Pages.          pages = None -        first_page = container.get('firstPage') -        last_page = container.get('lastPage') +        first_page = container.get("firstPage") +        last_page = container.get("lastPage")          if first_page and last_page:              try:                  _ = int(first_page) < int(last_page) -                pages = '{}-{}'.format(first_page, last_page) +                pages = "{}-{}".format(first_page, last_page)              except ValueError as err:  # noqa: F841                  # TODO(martin): This is more debug than info.                  # print('[{}] {}'.format(doi, err), file=sys.stderr) @@ -491,8 +521,8 @@ class DataciteImporter(EntityImporter):          license_slug = None          license_extra = [] -        for lic in attributes.get('rightsList', []): -            slug = lookup_license_slug(lic.get('rightsUri')) +        for lic in attributes.get("rightsList", []): +            slug = lookup_license_slug(lic.get("rightsUri"))              if slug:                  license_slug = slug              license_extra.append(lic) @@ -506,7 +536,7 @@ class DataciteImporter(EntityImporter):          # library solves it for you." -- TODO(martin): We need more of these.          language = None -        value = attributes.get('language', '') or '' +        value = attributes.get("language", "") or ""          try:              language = pycountry.languages.lookup(value).alpha_2          except (LookupError, AttributeError) as err:  # noqa: F841 @@ -520,22 +550,22 @@ class DataciteImporter(EntityImporter):          # "Other" fields might contain references or related articles (with          # DOI). TODO(martin): maybe try to parse out some of those refs.          abstracts = [] -        descs = attributes.get('descriptions', []) or [] +        descs = attributes.get("descriptions", []) or []          for desc in descs: -            if not desc.get('descriptionType') == 'Abstract': +            if not desc.get("descriptionType") == "Abstract":                  continue              # Description maybe a string, int or list. -            text = desc.get('description', '') +            text = desc.get("description", "")              if not text:                  continue              if isinstance(text, int): -                text = '{}'.format(text) +                text = "{}".format(text)              if isinstance(text, list):                  try:                      text = "\n".join(text)                  except TypeError: -                    continue # Bail out, if it is not a list of strings. +                    continue  # Bail out, if it is not a list of strings.              # Limit length.              
if len(text) < 10: @@ -548,7 +578,10 @@ class DataciteImporter(EntityImporter):              try:                  lang = langdetect.detect(text)              except (langdetect.lang_detect_exception.LangDetectException, TypeError) as err: -                print('[{}] language detection failed with {} on {}'.format(doi, err, text), file=sys.stderr) +                print( +                    "[{}] language detection failed with {} on {}".format(doi, err, text), +                    file=sys.stderr, +                )              abstract_text = clean(text)              if not abstract_text:                  continue @@ -557,7 +590,8 @@ class DataciteImporter(EntityImporter):                      mimetype="text/plain",                      content=abstract_text,                      lang=lang, -                )) +                ) +            )          # References and relations. Datacite include many relation types in          # "attributes.relatedIdentifiers[].relationType", e.g. @@ -570,67 +604,76 @@ class DataciteImporter(EntityImporter):          # For the moment, we only care about References.          refs, ref_index = [], 0 -        relIds = attributes.get('relatedIdentifiers', []) or [] +        relIds = attributes.get("relatedIdentifiers", []) or []          for rel in relIds: -            if not rel.get('relationType', '') in ('References', 'Cites'): +            if not rel.get("relationType", "") in ("References", "Cites"):                  continue              ref_extra = dict() -            if rel.get('relatedIdentifierType', '') == 'DOI': -                ref_extra['doi'] = rel.get('relatedIdentifier') +            if rel.get("relatedIdentifierType", "") == "DOI": +                ref_extra["doi"] = rel.get("relatedIdentifier")              if not ref_extra:                  ref_extra = None              refs.append(                  fatcat_openapi_client.ReleaseRef(                      index=ref_index,                      extra=ref_extra, -                )) +                ) +            )              ref_index += 1          # More specific release_type via 'Reviews' relationsship.          for rel in relIds: -            if rel.get('relatedIdentifierType', '') != 'Reviews': +            if rel.get("relatedIdentifierType", "") != "Reviews":                  continue -            release_type = 'review' +            release_type = "review"          # Extra information.          extra_datacite = dict()          if license_extra: -            extra_datacite['license'] = license_extra -        if attributes.get('subjects'): -            extra_datacite['subjects'] = attributes['subjects'] +            extra_datacite["license"] = license_extra +        if attributes.get("subjects"): +            extra_datacite["subjects"] = attributes["subjects"]          # Include version information. -        metadata_version = attributes.get('metadataVersion') or '' +        metadata_version = attributes.get("metadataVersion") or ""          if metadata_version: -            extra_datacite['metadataVersion'] = metadata_version +            extra_datacite["metadataVersion"] = metadata_version          # Include resource types. 
-        types = attributes.get('types', {}) or {} -        resource_type = types.get('resourceType', '') or '' -        resource_type_general = types.get('resourceTypeGeneral', '') or '' +        types = attributes.get("types", {}) or {} +        resource_type = types.get("resourceType", "") or "" +        resource_type_general = types.get("resourceTypeGeneral", "") or ""          if resource_type and resource_type.lower() not in UNKNOWN_MARKERS_LOWER: -            extra_datacite['resourceType'] = resource_type +            extra_datacite["resourceType"] = resource_type          if resource_type_general and resource_type_general.lower() not in UNKNOWN_MARKERS_LOWER: -            extra_datacite['resourceTypeGeneral'] = resource_type_general +            extra_datacite["resourceTypeGeneral"] = resource_type_general          # Include certain relations from relatedIdentifiers. Keeping the          # original structure of data here, which is a list of dicts, with          # relation type, identifier and identifier type (mostly).          relations = []          for rel in relIds: -            if rel.get('relationType') in ('IsPartOf', 'Reviews', 'Continues', -                                           'IsVariantFormOf', 'IsSupplementTo', -                                           'HasVersion', 'IsMetadataFor', -                                           'IsNewVersionOf', 'IsIdenticalTo', -                                           'IsVersionOf', 'IsDerivedFrom', -                                           'IsSourceOf'): +            if rel.get("relationType") in ( +                "IsPartOf", +                "Reviews", +                "Continues", +                "IsVariantFormOf", +                "IsSupplementTo", +                "HasVersion", +                "IsMetadataFor", +                "IsNewVersionOf", +                "IsIdenticalTo", +                "IsVersionOf", +                "IsDerivedFrom", +                "IsSourceOf", +            ):                  relations.append(rel)          if relations: -            extra_datacite['relations'] = relations +            extra_datacite["relations"] = relations          extra = dict() @@ -640,18 +683,18 @@ class DataciteImporter(EntityImporter):          # Edition", "20191024", "v2.0.0", "v0.9.3", "10149", "2.0", null,          # "v0.1.1", "3.0", "1.0", "3", "v1.12.2", "20191018", "v0.3.1", "v1.0",          # "10161", "10010691", "10780", # "Presentación" -        version = attributes.get('version') or None +        version = attributes.get("version") or None          # top-level extra keys          if not container_id and container_name: -            extra['container_name'] = container_name +            extra["container_name"] = container_name          # Always include datacite key, even if value is empty (dict). -        extra['datacite'] = extra_datacite +        extra["datacite"] = extra_datacite          # Preparation for a schema update.          
if release_month: -            extra['release_month'] = release_month +            extra["release_month"] = release_month          extids = self.lookup_ext_ids(doi=doi) @@ -669,12 +712,12 @@ class DataciteImporter(EntityImporter):              publisher=publisher,              ext_ids=fatcat_openapi_client.ReleaseExtIds(                  doi=doi, -                pmid=extids['pmid'], -                pmcid=extids['pmcid'], -                wikidata_qid=extids['wikidata_qid'], -                core=extids['core_id'], -                arxiv=extids['arxiv_id'], -                jstor=extids['jstor_id'], +                pmid=extids["pmid"], +                pmcid=extids["pmcid"], +                wikidata_qid=extids["wikidata_qid"], +                core=extids["core_id"], +                arxiv=extids["arxiv_id"], +                jstor=extids["jstor_id"],              ),              contribs=contribs,              volume=volume, @@ -702,19 +745,19 @@ class DataciteImporter(EntityImporter):          """          release_type = None -        if not attributes.get('types'): +        if not attributes.get("types"):              return None -        types = attributes['types'] +        types = attributes["types"] -        for typeType in ('citeproc', 'ris', 'schemaOrg', 'bibtex', 'resourceTypeGeneral'): +        for typeType in ("citeproc", "ris", "schemaOrg", "bibtex", "resourceTypeGeneral"):              value = types.get(typeType)              release_type = DATACITE_TYPE_MAP.get(typeType, {}).get(value)              if release_type is not None:                  break          # special case: figshare "collections" which group other entities -        if doi.startswith('10.6084/') or doi.startswith('10.25384'): -            if types.get('resourceType') == "Collection": +        if doi.startswith("10.6084/") or doi.startswith("10.25384"): +            if types.get("resourceType") == "Collection":                  release_type = "stub"          if release_type is None: @@ -736,35 +779,41 @@ class DataciteImporter(EntityImporter):          # publishes highly interesting datasets, but titles are mostly the same          # ("GBIF Occurrence Download" or "Occurrence Download"); set          # release_type to "stub" (CSL/FC). -        if re.title == 'GBIF Occurrence Download' and re.ext_ids.doi.startswith('10.15468/dl.'): -            re.release_type = 'stub' +        if re.title == "GBIF Occurrence Download" and re.ext_ids.doi.startswith("10.15468/dl."): +            re.release_type = "stub"          # release_type exception: lots of "Experimental Crystal Structure Determination"          # publisher: "Cambridge Crystallographic Data Centre" -        if re.ext_ids.doi.startswith('10.5517/'): -            re.release_type = 'entry' +        if re.ext_ids.doi.startswith("10.5517/"): +            re.release_type = "entry"          # Supplement files, e.g. "Additional file 1: ASE constructs in questionnaire." 
-        if re.title.lower().startswith('additional file') and re.release_type in ('article', 'article-journal'): -            re.release_type = 'component' +        if re.title.lower().startswith("additional file") and re.release_type in ( +            "article", +            "article-journal", +        ): +            re.release_type = "component"          # figshare -        if re.ext_ids.doi.startswith('10.6084/') or re.ext_ids.doi.startswith('10.25384'): +        if re.ext_ids.doi.startswith("10.6084/") or re.ext_ids.doi.startswith("10.25384"):              # set version if DOI ends with versioned suffix -            doi_suffix = re.ext_ids.doi.split('.')[-1] -            if doi_suffix and doi_suffix.startswith('v') and doi_suffix[1:].isdigit(): +            doi_suffix = re.ext_ids.doi.split(".")[-1] +            if doi_suffix and doi_suffix.startswith("v") and doi_suffix[1:].isdigit():                  re.version = doi_suffix              # "Figure 123 from " -> component              # "Table S1. ;Figure S1;Figure S2. ;Figure S3. ;Figure S4. from Use of organic exudates from two polar diatoms by bacterial isolates from the Arctic ocean" -            if " from " in re.title and re.release_type not in ('stub', 'graphic'): +            if " from " in re.title and re.release_type not in ("stub", "graphic"):                  if re.title.startswith("Figure "):                      re.release_type = "component"                  elif re.title.startswith("Table "):                      re.release_type = "component"          # figshare.com -        if re.ext_ids.doi.startswith('10.6084/m9.figshare.') and re.extra.get('container_name') is None: -            re.extra['container_name'] = "figshare.com" +        if ( +            re.ext_ids.doi.startswith("10.6084/m9.figshare.") +            and re.extra.get("container_name") is None +        ): +            re.extra["container_name"] = "figshare.com"          return re @@ -788,26 +837,28 @@ class DataciteImporter(EntityImporter):          # eventually we'll want to support "updates", but for now just skip if          # entity already exists          if existing: -            self.counts['exists'] += 1 +            self.counts["exists"] += 1              return False          return True      def insert_batch(self, batch): -        print('inserting batch ({})'.format(len(batch)), file=sys.stderr) +        print("inserting batch ({})".format(len(batch)), file=sys.stderr)          if self.insert_log_file: -            with open(self.insert_log_file, 'a') as f: +            with open(self.insert_log_file, "a") as f:                  for doc in batch:                      json.dump(entity_to_dict(doc, api_client=None), f) -                    f.write('\n') +                    f.write("\n")          self.api.create_release_auto_batch(              fatcat_openapi_client.ReleaseAutoBatch(                  editgroup=fatcat_openapi_client.Editgroup( -                    description=self.editgroup_description, -                    extra=self.editgroup_extra), -                entity_list=batch)) +                    description=self.editgroup_description, extra=self.editgroup_extra +                ), +                entity_list=batch, +            ) +        ) -    def parse_datacite_creators(self, creators, role='author', set_index=True, doi=None): +    def parse_datacite_creators(self, creators, role="author", set_index=True, doi=None):          """          Parses a list of creators into a list of ReleaseContrib objects. 
Set          set_index to False, if the index contrib field should be left blank. @@ -820,48 +871,53 @@ class DataciteImporter(EntityImporter):          contribs = []          # Names, that should be ignored right away. -        name_blocklist = set(('Occdownload Gbif.Org',)) +        name_blocklist = set(("Occdownload Gbif.Org",))          i = 0          for c in creators:              if not set_index:                  i = None -            nameType = c.get('nameType', '') or '' -            if nameType in ('', 'Personal'): +            nameType = c.get("nameType", "") or "" +            if nameType in ("", "Personal"):                  creator_id = None -                for nid in c.get('nameIdentifiers', []) or []: +                for nid in c.get("nameIdentifiers", []) or []:                      if not isinstance(nid, dict):                          # see: fatcat-workers/issues/44035/ -                        print('unexpected nameIdentifiers, expected list of dicts, got: {}'.format(nid), file=sys.stderr) +                        print( +                            "unexpected nameIdentifiers, expected list of dicts, got: {}".format( +                                nid +                            ), +                            file=sys.stderr, +                        )                          continue -                    name_scheme = nid.get('nameIdentifierScheme', '') or '' +                    name_scheme = nid.get("nameIdentifierScheme", "") or ""                      if not name_scheme.lower() == "orcid":                          continue -                    orcid = nid.get('nameIdentifier') or '' -                    orcid = orcid.replace('https://orcid.org/', '') +                    orcid = nid.get("nameIdentifier") or "" +                    orcid = orcid.replace("https://orcid.org/", "")                      if not orcid:                          continue                      creator_id = self.lookup_orcid(orcid)                      # TODO(martin): If creator_id is None, should we create creators?                  # If there are multiple affiliation strings, use the first one. 
-                affiliations = c.get('affiliation', []) or [] +                affiliations = c.get("affiliation", []) or []                  raw_affiliation = None                  if len(affiliations) == 0:                      raw_affiliation = None                  else:                      raw_affiliation = clean(affiliations[0]) -                name = c.get('name') -                given_name = c.get('givenName') -                surname = c.get('familyName') +                name = c.get("name") +                given_name = c.get("givenName") +                surname = c.get("familyName")                  if name:                      name = clean(name)                  if not any((name, given_name, surname)):                      continue                  if not name: -                    name = "{} {}".format(given_name or '', surname or '').strip() +                    name = "{} {}".format(given_name or "", surname or "").strip()                  if name in name_blocklist:                      continue                  if name.lower() in UNKNOWN_MARKERS_LOWER: @@ -881,7 +937,7 @@ class DataciteImporter(EntityImporter):                  if not name:                      continue -                if raw_affiliation == '': +                if raw_affiliation == "":                      continue                  extra = None @@ -891,39 +947,38 @@ class DataciteImporter(EntityImporter):                  # "RelatedPerson", "ProjectLeader", "Editor", "Other",                  # "ProjectMember", "Funder", "RightsHolder", "DataCollector",                  # "Supervisor", "Producer", "HostingInstitution", "ResearchGroup" -                contributorType = c.get('contributorType', '') or '' +                contributorType = c.get("contributorType", "") or ""                  if contributorType: -                    extra = {'type': contributorType} +                    extra = {"type": contributorType}                  rc = fatcat_openapi_client.ReleaseContrib( -                        creator_id=creator_id, -                        index=i, -                        raw_name=name, -                        given_name=given_name, -                        surname=surname, -                        role=role, -                        raw_affiliation=raw_affiliation, -                        extra=extra, -                    ) +                    creator_id=creator_id, +                    index=i, +                    raw_name=name, +                    given_name=given_name, +                    surname=surname, +                    role=role, +                    raw_affiliation=raw_affiliation, +                    extra=extra, +                )                  # Filter out duplicates early.                  
if not contributor_list_contains_contributor(contribs, rc):                      contribs.append(rc)                      if i is not None:                          i += 1 -            elif nameType == 'Organizational': -                name = c.get('name', '') or '' +            elif nameType == "Organizational": +                name = c.get("name", "") or ""                  if name in UNKNOWN_MARKERS:                      continue                  if len(name) < 3:                      continue -                extra = {'organization': name} -                contribs.append(fatcat_openapi_client.ReleaseContrib( -                    index=i, extra=extra)) +                extra = {"organization": name} +                contribs.append(fatcat_openapi_client.ReleaseContrib(index=i, extra=extra))                  if i is not None:                      i += 1              else: -                print('[{}] unknown name type: {}'.format(doi, nameType), file=sys.stderr) +                print("[{}] unknown name type: {}".format(doi, nameType), file=sys.stderr)          return contribs @@ -935,8 +990,8 @@ def contributor_list_contains_contributor(contributor_list, contributor):      for cc in contributor_list:          if cc.raw_name != contributor.raw_name:              continue -        cc_role = cc.role or 'author' -        contributor_role = contributor.role or 'author' +        cc_role = cc.role or "author" +        contributor_role = contributor.role or "author"          if cc_role != contributor_role:              continue          return True @@ -952,91 +1007,97 @@ def lookup_license_slug(raw):      if not raw:          return None -    if 'creativecommons.org/publicdomain/zero' in raw: -        return 'CC-0' -    if raw.lower().endswith('/cc0'): -        return 'CC-0' +    if "creativecommons.org/publicdomain/zero" in raw: +        return "CC-0" +    if raw.lower().endswith("/cc0"): +        return "CC-0" -    if 'creativecommons' in raw: +    if "creativecommons" in raw:          # https://creativecommons.org/publicdomain/mark/1.0/deed.de -        if 'creativecommons.org/publicdomain' in raw: -            return 'CC-PUBLICDOMAIN' -        if 'creativecommons.org/share-your-work/public-domain/cc0' in raw: -            return 'CC-0' +        if "creativecommons.org/publicdomain" in raw: +            return "CC-PUBLICDOMAIN" +        if "creativecommons.org/share-your-work/public-domain/cc0" in raw: +            return "CC-0"          # https://creativecommons.org/licenses/by/4.0/deed.es_ES          raw = raw.lower() -        match = re.search(r'creativecommons.org/licen[sc]es/(?P<name>[a-z-]+)', raw, re.IGNORECASE) +        match = re.search( +            r"creativecommons.org/licen[sc]es/(?P<name>[a-z-]+)", raw, re.IGNORECASE +        )          if not match: -            print('missed potential license: {}'.format(raw), file=sys.stderr) +            print("missed potential license: {}".format(raw), file=sys.stderr)              return None -        name = match.groupdict().get('name') +        name = match.groupdict().get("name")          if not name:              return None -        if not name.startswith('cc'): -            name = 'cc-{}'.format(name) +        if not name.startswith("cc"): +            name = "cc-{}".format(name)          return name.upper() -    if 'opensource.org' in raw: +    if "opensource.org" in raw:          # https://opensource.org/licenses/alphabetical, e.g. 
opensource.org/licenses/EUPL-1.2 -        match = re.search(r'opensource.org/licenses/(?P<name>[^/]+)', raw, re.IGNORECASE) +        match = re.search(r"opensource.org/licenses/(?P<name>[^/]+)", raw, re.IGNORECASE)          if not match: -            print('missed potential license: {}'.format(raw), file=sys.stderr) +            print("missed potential license: {}".format(raw), file=sys.stderr)              return None -        name = match.groupdict().get('name') +        name = match.groupdict().get("name")          if not name:              return None          if len(name) > 11:              return None          return name.upper() -    if 'gnu.org' in raw: +    if "gnu.org" in raw:          # http://www.gnu.org/copyleft/gpl, https://www.gnu.org/licenses/old-licenses/lgpl-2.1.en.html -        match = re.search(r'/(?P<name>fdl(-[0-9.]*[0-9]+)?|gpl(-[0-9.]*[0-9]+)?|lgpl(-[0-9.]*[0-9]+)|aglp(-[0-9.]*[0-9]+)?)', raw, re.IGNORECASE) +        match = re.search( +            r"/(?P<name>fdl(-[0-9.]*[0-9]+)?|gpl(-[0-9.]*[0-9]+)?|lgpl(-[0-9.]*[0-9]+)|aglp(-[0-9.]*[0-9]+)?)", +            raw, +            re.IGNORECASE, +        )          if not match: -            print('missed potential license: {}'.format(raw), file=sys.stderr) +            print("missed potential license: {}".format(raw), file=sys.stderr)              return None -        name = match.groupdict().get('name') +        name = match.groupdict().get("name")          if not name:              return None          if len(name) > 8:              return None          return name.upper() -    if 'spdx.org' in raw: -        if 'spdx.org/licenses/CC0' in raw: -            return 'CC-0' +    if "spdx.org" in raw: +        if "spdx.org/licenses/CC0" in raw: +            return "CC-0"          # https://spdx.org/licenses/CC-BY-NC-ND-4.0.html -        match = re.search(r'spdx.org/licenses/(?P<name>[a-z0-9-]+)', raw, re.IGNORECASE) +        match = re.search(r"spdx.org/licenses/(?P<name>[a-z0-9-]+)", raw, re.IGNORECASE)          if not match: -            print('missed potential license: {}'.format(raw), file=sys.stderr) +            print("missed potential license: {}".format(raw), file=sys.stderr)              return None -        name = match.groupdict().get('name') +        name = match.groupdict().get("name")          if not name:              return None          if len(name) > 36:              return None          # cleanup version and extensions -        name = re.sub('(-[0-9])?[.]?[0-9]?(.json|.html)?', '', name.lower()) +        name = re.sub("(-[0-9])?[.]?[0-9]?(.json|.html)?", "", name.lower())          return name.upper() -    if 'rightsstatements.org' in raw: +    if "rightsstatements.org" in raw:          # http://rightsstatements.org/vocab/InC/1.0/ -        match = re.search(r'rightsstatements.org/(vocab|page)/(?P<name>[^/]*)', raw) +        match = re.search(r"rightsstatements.org/(vocab|page)/(?P<name>[^/]*)", raw)          if not match: -            print('missed potential license: {}'.format(raw), file=sys.stderr) +            print("missed potential license: {}".format(raw), file=sys.stderr)              return None -        name = match.groupdict().get('name') +        name = match.groupdict().get("name")          if not name:              return None          if len(name) > 9:              return None -        return 'RS-{}'.format(name.upper()) +        return "RS-{}".format(name.upper())      # Fallback to mapped values.      
raw = raw.lower() -    raw = raw.strip().replace('http://', '//').replace('https://', '//') -    if not raw.endswith('/'): -        raw = raw + '/' +    raw = raw.strip().replace("http://", "//").replace("https://", "//") +    if not raw.endswith("/"): +        raw = raw + "/"      return LICENSE_SLUG_MAP.get(raw) @@ -1046,23 +1107,21 @@ def find_original_language_title(item, min_length=4, max_questionmarks=3):      Example input: {'title': 'Some title', 'original_language_title': 'Some title'}      """ -    if 'original_language_title' not in item: +    if "original_language_title" not in item:          return None -    title = item.get('title') +    title = item.get("title")      if not title:          return None -    original_language_title = item.get('original_language_title') -    if isinstance(original_language_title, -                  str) and title != original_language_title: +    original_language_title = item.get("original_language_title") +    if isinstance(original_language_title, str) and title != original_language_title:          if len(original_language_title) < min_length:              return None -        if original_language_title.count('?') > max_questionmarks: +        if original_language_title.count("?") > max_questionmarks:              return None          return original_language_title      if isinstance(original_language_title, dict): -        content = original_language_title.get('__content__', '') or '' -        if content and content != title and not content.count( -                '?') > max_questionmarks: +        content = original_language_title.get("__content__", "") or "" +        if content and content != title and not content.count("?") > max_questionmarks:              return content      return None @@ -1082,23 +1141,23 @@ def parse_datacite_titles(titles):          return title, original_language_title, subtitle      elif len(titles) == 1:          original_language_title = find_original_language_title(titles[0]) -        title = titles[0].get('title', '') or '' +        title = titles[0].get("title", "") or ""          title = title.strip()          if not title:              title = None          return title, original_language_title, subtitle      else:          for entry in titles: -            if not title and ('titleType' not in entry -                              or not entry.get('titleType')): -                title = (entry.get('title') or '').strip() -            if not subtitle and entry.get('titleType') == 'Subtitle': -                subtitle = entry.get('title', '').strip() +            if not title and ("titleType" not in entry or not entry.get("titleType")): +                title = (entry.get("title") or "").strip() +            if not subtitle and entry.get("titleType") == "Subtitle": +                subtitle = entry.get("title", "").strip()              if not original_language_title:                  original_language_title = find_original_language_title(entry)      return title, original_language_title, subtitle +  def parse_single_date(value):      """      Given a single string containing a date in arbitrary format, try to return @@ -1113,11 +1172,11 @@ def parse_single_date(value):          # Results in a dict with keys: date_obj, period, locale.          parse_result = parser.get_date_data(value)          # A datetime object, later we need a date, only. 
-        result = parse_result['date_obj'] +        result = parse_result["date_obj"]          if result is not None: -            if parse_result['period'] == 'year': +            if parse_result["period"] == "year":                  return None, None, result.year -            elif parse_result['period'] == 'month': +            elif parse_result["period"] == "month":                  return None, result.month, result.year              else:                  return result.date(), result.month, result.year @@ -1126,6 +1185,7 @@ def parse_single_date(value):      return None, None, None +  def parse_datacite_dates(dates):      """      Given a list of date fields (under .dates), return tuple, (release_date, @@ -1137,37 +1197,37 @@ def parse_datacite_dates(dates):          return release_date, release_month, release_year      if not isinstance(dates, list): -        raise ValueError('expected a list of date items') +        raise ValueError("expected a list of date items")      # Observed values: "Available", "Submitted", "Valid", "Issued", "Accepted",      # "Collected", "Updated", "Copyrighted", "Created"      # Ignored for now: "Collected", "Issued"      date_type_prio = ( -        'Valid', -        'Available', -        'Accepted', -        'Submitted', -        'Copyrighted', -        'Created', -        'Updated', +        "Valid", +        "Available", +        "Accepted", +        "Submitted", +        "Copyrighted", +        "Created", +        "Updated",      )      # We need to note the granularity, since a string like "2019" would be      # parsed into "2019-01-01", even though the month is unknown. Use 3      # granularity types: 'y', 'm', 'd'. -    Pattern = collections.namedtuple('Pattern', 'layout granularity') +    Pattern = collections.namedtuple("Pattern", "layout granularity")      # Before using (expensive) dateparser, try a few common patterns.      common_patterns = ( -        Pattern('%Y-%m-%d', 'd'), -        Pattern('%Y-%m', 'm'), -        Pattern('%Y-%m-%dT%H:%M:%SZ', 'd'), -        Pattern('%Y-%m-%dT%H:%M:%S', 'd'), -        Pattern('%Y', 'y'), +        Pattern("%Y-%m-%d", "d"), +        Pattern("%Y-%m", "m"), +        Pattern("%Y-%m-%dT%H:%M:%SZ", "d"), +        Pattern("%Y-%m-%dT%H:%M:%S", "d"), +        Pattern("%Y", "y"),      )      def parse_item(item): -        result, value, year_only = None, str(item.get('date', '')) or '', False +        result, value, year_only = None, str(item.get("date", "")) or "", False          release_date, release_month, release_year = None, None, None          for layout, granularity in common_patterns: @@ -1176,22 +1236,22 @@ def parse_datacite_dates(dates):              except ValueError:                  continue              else: -                if granularity == 'y': +                if granularity == "y":                      year_only = True                  break          if result is None: -            print('fallback for {}'.format(value), file=sys.stderr) +            print("fallback for {}".format(value), file=sys.stderr)              release_date, release_month, release_year = parse_single_date(value)          if result is None:              # Unparsable date.              
return release_date, release_month, release_year -        if granularity != 'y': +        if granularity != "y":              release_date = result.date()          release_year = result.year -        if granularity in ('m', 'd'): +        if granularity in ("m", "d"):              release_month = result.month          return release_date, release_month, release_year @@ -1200,7 +1260,7 @@ def parse_datacite_dates(dates):      for prio in date_type_prio:          for item in dates: -            if not item.get('dateType') == prio: +            if not item.get("dateType") == prio:                  continue              release_date, release_month, release_year = parse_item(item) @@ -1224,45 +1284,49 @@ def parse_datacite_dates(dates):      return release_date, release_month, release_year +  def index_form_to_display_name(s):      """      Try to convert an index form name, like 'Razis, Panos A' into display_name,      e.g. 'Panos A Razis'.      """ -    if ',' not in s: +    if "," not in s:          return s -    skip_on_chars = ['(', ')', '*'] +    skip_on_chars = ["(", ")", "*"]      for char in skip_on_chars:          if char in s:              return s -    if s.count(',') > 1: +    if s.count(",") > 1:          # "Dr. Hina, Dr. Muhammad Usman Shahid, Dr. Muhammad Zeeshan Khan"          return s      # Not names, but sprinkled in fields where authors live. -    stopwords = [s.lower() for s in ( -        'Archive', -        'Collection', -        'Coordinator', -        'Department', -        'Germany', -        'International', -        'National', -        'Netherlands', -        'Office', -        'Organisation', -        'Organization', -        'Service', -        'Services', -        'United States', -        'University', -        'Verein', -        'Volkshochschule', -    )] +    stopwords = [ +        s.lower() +        for s in ( +            "Archive", +            "Collection", +            "Coordinator", +            "Department", +            "Germany", +            "International", +            "National", +            "Netherlands", +            "Office", +            "Organisation", +            "Organization", +            "Service", +            "Services", +            "United States", +            "University", +            "Verein", +            "Volkshochschule", +        ) +    ]      lower = s.lower()      for stop in stopwords:          if stop in lower:              return s -    a, b = s.split(',') -    return '{} {}'.format(b.strip(), a.strip()) +    a, b = s.split(",") +    return "{} {}".format(b.strip(), a.strip()) diff --git a/python/fatcat_tools/importers/dblp_container.py b/python/fatcat_tools/importers/dblp_container.py index 3d280fb7..603a6271 100644 --- a/python/fatcat_tools/importers/dblp_container.py +++ b/python/fatcat_tools/importers/dblp_container.py @@ -1,4 +1,3 @@ -  """  Importer for DBLP container-level (journal/conference/series) metadata,  pre-scraped in to JSON from HTML pages. 
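The dblp_container.py hunks that follow appear to be formatting-only; the logic they touch maps a dblp key prefix to a fatcat container type inside DblpContainerImporter.parse_record(). A minimal sketch of that mapping, assuming nothing beyond what the hunks below show (the helper name is ours, for illustration only):

    def container_type_for_dblp_prefix(dblp_prefix):
        # conf/... prefixes become conference series, journals/... become
        # journals, series/... become book series; anything else stays untyped,
        # mirroring the branches in parse_record() shown in the hunks below.
        if dblp_prefix.startswith("conf/"):
            return "conference-series"
        if dblp_prefix.startswith("journals/"):
            return "journal"
        if dblp_prefix.startswith("series/"):
            return "book-series"
        return None

For example, a prefix like "journals/cacm" (illustrative) would map to "journal".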
@@ -13,17 +12,17 @@ from fatcat_tools.normal import clean_str  class DblpContainerImporter(EntityImporter): +    def __init__( +        self, api, issn_map_file, dblp_container_map_file, dblp_container_map_output, **kwargs +    ): -    def __init__(self, api, issn_map_file, dblp_container_map_file, dblp_container_map_output, **kwargs): - -        eg_desc = kwargs.get('editgroup_description', -            "Automated import of container-level metadata scraped from dblp HTML") -        eg_extra = kwargs.get('editgroup_extra', dict()) -        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.DblpContainerImporter') -        super().__init__(api, -            editgroup_description=eg_desc, -            editgroup_extra=eg_extra, -            **kwargs) +        eg_desc = kwargs.get( +            "editgroup_description", +            "Automated import of container-level metadata scraped from dblp HTML", +        ) +        eg_extra = kwargs.get("editgroup_extra", dict()) +        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.DblpContainerImporter") +        super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)          self.dblp_container_map_output = dblp_container_map_output          self.read_dblp_container_map_file(dblp_container_map_file) @@ -40,7 +39,10 @@ class DblpContainerImporter(EntityImporter):              assert len(container_id) == 26              self._dblp_container_map[prefix] = container_id              print("\t".join([prefix, container_id]), file=self.dblp_container_map_output) -        print("Got {} existing dblp container mappings.".format(len(self._dblp_container_map)), file=sys.stderr) +        print( +            "Got {} existing dblp container mappings.".format(len(self._dblp_container_map)), +            file=sys.stderr, +        )      def lookup_dblp_prefix(self, prefix):          if not prefix: @@ -57,48 +59,48 @@ class DblpContainerImporter(EntityImporter):          returns a ContainerEntity (or None if invalid or couldn't parse)          """ -        dblp_prefix = row.get('key') or row.get('dblp_prefix') +        dblp_prefix = row.get("key") or row.get("dblp_prefix")          assert dblp_prefix -        assert row['title'] +        assert row["title"]          container_type = None -        if dblp_prefix.startswith('conf/'): +        if dblp_prefix.startswith("conf/"):              container_type = "conference-series" -        elif dblp_prefix.startswith('journals/'): +        elif dblp_prefix.startswith("journals/"):              container_type = "journal" -        elif dblp_prefix.startswith('series/'): +        elif dblp_prefix.startswith("series/"):              container_type = "book-series"          issnl = None -        for issn in row.get('issns', []): +        for issn in row.get("issns", []):              issnl = self.issn2issnl(issn)              if issnl:                  break          extra = { -            'dblp': { -                'prefix': dblp_prefix, +            "dblp": { +                "prefix": dblp_prefix,              },          } -        if row.get('homepage_url'): -            extra['urls'] = [row['homepage_url']] +        if row.get("homepage_url"): +            extra["urls"] = [row["homepage_url"]] -        if row.get('acronym'): -            extra['acronym'] = row['acronym'] +        if row.get("acronym"): +            extra["acronym"] = row["acronym"]          ce = fatcat_openapi_client.ContainerEntity( -            name=clean_str(row['title']), +            
name=clean_str(row["title"]),              container_type=container_type,              issnl=issnl, -            wikidata_qid=row.get('wikidata_qid'), +            wikidata_qid=row.get("wikidata_qid"),              extra=extra,          )          return ce      def try_update(self, ce): -        dblp_prefix = ce.extra['dblp']['prefix'] +        dblp_prefix = ce.extra["dblp"]["prefix"]          existing = None          existing_container_id = self.lookup_dblp_prefix(dblp_prefix)          if existing_container_id: @@ -123,8 +125,11 @@ class DblpContainerImporter(EntityImporter):              return True          if existing: -            self.counts['exists'] += 1 -            print("\t".join([ce.extra['dblp']['prefix'], existing.ident]), file=self.dblp_container_map_output) +            self.counts["exists"] += 1 +            print( +                "\t".join([ce.extra["dblp"]["prefix"], existing.ident]), +                file=self.dblp_container_map_output, +            )              return False          # shouldn't get here @@ -135,11 +140,17 @@ class DblpContainerImporter(EntityImporter):          Because we want to print a prefix/container_id match for each row, we          require a special batch insert method          """ -        eg = self.api.create_container_auto_batch(fatcat_openapi_client.ContainerAutoBatch( -            editgroup=fatcat_openapi_client.Editgroup( -                description=self.editgroup_description, -                extra=self.editgroup_extra), -            entity_list=batch)) +        eg = self.api.create_container_auto_batch( +            fatcat_openapi_client.ContainerAutoBatch( +                editgroup=fatcat_openapi_client.Editgroup( +                    description=self.editgroup_description, extra=self.editgroup_extra +                ), +                entity_list=batch, +            ) +        )          for c_edit in eg.edits.containers:              c = self.api.get_container(c_edit.ident) -            print("\t".join([c.extra['dblp']['prefix'], c.ident]), file=self.dblp_container_map_output) +            print( +                "\t".join([c.extra["dblp"]["prefix"], c.ident]), +                file=self.dblp_container_map_output, +            ) diff --git a/python/fatcat_tools/importers/dblp_release.py b/python/fatcat_tools/importers/dblp_release.py index 6d028f2f..5baa6cd6 100644 --- a/python/fatcat_tools/importers/dblp_release.py +++ b/python/fatcat_tools/importers/dblp_release.py @@ -1,4 +1,3 @@ -  """  Importer for DBLP release-level (article/paper/etc) XML metadata. 
@@ -44,25 +43,16 @@ from fatcat_tools.transforms import entity_to_dict  class DblpReleaseImporter(EntityImporter): - -    def __init__(self, -                 api, -                 dblp_container_map_file=None, -                 **kwargs): +    def __init__(self, api, dblp_container_map_file=None, **kwargs):          eg_desc = kwargs.get( -            'editgroup_description', -            "Automated import of dblp metadata via XML records" +            "editgroup_description", "Automated import of dblp metadata via XML records"          ) -        eg_extra = kwargs.get('editgroup_extra', dict()) -        eg_extra['agent'] = eg_extra.get('agent', -                                         'fatcat_tools.DblpReleaseImporter') +        eg_extra = kwargs.get("editgroup_extra", dict()) +        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.DblpReleaseImporter")          # ensure default is to not do updates with this worker (override super() default) -        kwargs['do_updates'] = kwargs.get("do_updates", False) -        super().__init__(api, -                         editgroup_description=eg_desc, -                         editgroup_extra=eg_extra, -                         **kwargs) +        kwargs["do_updates"] = kwargs.get("do_updates", False) +        super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)          self.dump_json_mode = kwargs.get("dump_json_mode", False)          self.this_year = datetime.datetime.now().year @@ -76,13 +66,16 @@ class DblpReleaseImporter(EntityImporter):          "phdthesis",          "mastersthesis",          "www", -        #"data",  # no instances in 2020-11 dump +        # "data",  # no instances in 2020-11 dump      ]      def read_dblp_container_map_file(self, dblp_container_map_file) -> None:          self._dblp_container_map = dict()          if not dblp_container_map_file: -            print("Not loading a dblp prefix container map file; entities will fail to import", file=sys.stderr) +            print( +                "Not loading a dblp prefix container map file; entities will fail to import", +                file=sys.stderr, +            )              return          print("Loading dblp prefix container map file...", file=sys.stderr)          for line in dblp_container_map_file: @@ -92,7 +85,10 @@ class DblpReleaseImporter(EntityImporter):              container_id = container_id.strip()              assert len(container_id) == 26              self._dblp_container_map[prefix] = container_id -        print("Got {} dblp container mappings.".format(len(self._dblp_container_map)), file=sys.stderr) +        print( +            "Got {} dblp container mappings.".format(len(self._dblp_container_map)), +            file=sys.stderr, +        )      def lookup_dblp_prefix(self, prefix):          if not prefix: @@ -101,13 +97,13 @@ class DblpReleaseImporter(EntityImporter):      def want(self, xml_elem):          if xml_elem.name not in self.ELEMENT_TYPES: -            self.counts['skip-type'] += 1 +            self.counts["skip-type"] += 1              return False -        if not xml_elem.get('key'): -            self.counts['skip-no-key'] += 1 +        if not xml_elem.get("key"): +            self.counts["skip-no-key"] += 1              return False -        if xml_elem['key'].startswith('homepage/'): -            self.counts['skip-type-homepage'] += 1 +        if xml_elem["key"].startswith("homepage/"): +            self.counts["skip-type-homepage"] += 1              return False          return True @@ 
-127,88 +123,88 @@ class DblpReleaseImporter(EntityImporter):          - isbn          """ -        dblp_key = xml_elem.get('key') +        dblp_key = xml_elem.get("key")          if not dblp_key: -            self.counts['skip-empty-key'] += 1 +            self.counts["skip-empty-key"] += 1              return False -        dblp_key_type = dblp_key.split('/')[0] +        dblp_key_type = dblp_key.split("/")[0]          # dblp_prefix may be used for container lookup          dblp_prefix = None -        if dblp_key_type in ('journals', 'conf'): -            dblp_prefix = '/'.join(dblp_key.split('/')[:2]) -        elif dblp_key_type in ('series', 'reference', 'tr', 'books'): -            dblp_prefix = '/'.join(dblp_key.split('/')[:-1]) +        if dblp_key_type in ("journals", "conf"): +            dblp_prefix = "/".join(dblp_key.split("/")[:2]) +        elif dblp_key_type in ("series", "reference", "tr", "books"): +            dblp_prefix = "/".join(dblp_key.split("/")[:-1]) -        publtype = xml_elem.get('publtype') or None +        publtype = xml_elem.get("publtype") or None          dblp_type = xml_elem.name          if dblp_type not in self.ELEMENT_TYPES: -            self.counts[f'skip-dblp-type:{dblp_type}'] += 1 +            self.counts[f"skip-dblp-type:{dblp_type}"] += 1 -        if dblp_key_type in ('homepages', 'persons', 'dblpnote'): -            self.counts['skip-key-type'] += 1 +        if dblp_key_type in ("homepages", "persons", "dblpnote"): +            self.counts["skip-key-type"] += 1              return False -        if dblp_key.startswith('journals/corr/'): -            self.counts['skip-arxiv-corr'] += 1 +        if dblp_key.startswith("journals/corr/"): +            self.counts["skip-arxiv-corr"] += 1              return False          title = clean_str(" ".join(xml_elem.title.stripped_strings), force_xml=True)          if not title: -            self.counts['skip-title'] += 1 +            self.counts["skip-title"] += 1              return False -        if title.endswith('.'): +        if title.endswith("."):              title = title[:-1]          release_type = None -        release_stage = 'published' +        release_stage = "published"          withdrawn_status = None          # primary releae_type detection: type of XML element, then prefix of key for granularity -        if dblp_type == 'article': -            release_type = 'article' -            if dblp_key_type == 'journals' and publtype != 'informal': -                release_type = 'article-journal' -            elif dblp_key_type == 'tr': -                release_type = 'report' +        if dblp_type == "article": +            release_type = "article" +            if dblp_key_type == "journals" and publtype != "informal": +                release_type = "article-journal" +            elif dblp_key_type == "tr": +                release_type = "report"              elif title.startswith("Review:"): -                release_type = 'review' -        elif dblp_type == 'inproceedings': -            release_type = 'paper-conference' -        elif dblp_type == 'book': -            release_type = 'book' -        elif dblp_type == 'incollection': +                release_type = "review" +        elif dblp_type == "inproceedings": +            release_type = "paper-conference" +        elif dblp_type == "book": +            release_type = "book" +        elif dblp_type == "incollection":              # XXX: part vs. chapter? 
-            release_type = 'chapter' -        elif dblp_type == 'data': -            release_type = 'dataset' -        elif dblp_type in ('mastersthesis', 'phdthesis'): -            release_type = 'thesis' +            release_type = "chapter" +        elif dblp_type == "data": +            release_type = "dataset" +        elif dblp_type in ("mastersthesis", "phdthesis"): +            release_type = "thesis"          # overrides/extensions of the above -        if publtype == 'informal': +        if publtype == "informal":              # for conferences, seems to indicate peer-review status              # for journals, seems to indicate things like book reviews; split out above              pass -        elif publtype == 'encyclopedia': -            release_type = 'entry-encyclopedia' -        elif publtype == 'edited': +        elif publtype == "encyclopedia": +            release_type = "entry-encyclopedia" +        elif publtype == "edited":              # XXX: article? -            release_type = 'editorial' -        elif publtype == 'data': -            release_type = 'dataset' -        elif publtype == 'data': -            release_type = 'dataset' -        elif publtype == 'software': -            release_type = 'software' -        elif publtype == 'widthdrawn': -            withdrawn_status = 'widthdrawn' -        elif publtype == 'survey': +            release_type = "editorial" +        elif publtype == "data": +            release_type = "dataset" +        elif publtype == "data": +            release_type = "dataset" +        elif publtype == "software": +            release_type = "software" +        elif publtype == "widthdrawn": +            withdrawn_status = "widthdrawn" +        elif publtype == "survey":              # XXX: flag as a review/survey article?              
pass -        #print((release_type, dblp_type, dblp_key_type, publtype), file=sys.stderr) +        # print((release_type, dblp_type, dblp_key_type, publtype), file=sys.stderr)          container_name = None          booktitle = clean_str(xml_elem.booktitle and xml_elem.booktitle.text) @@ -236,7 +232,9 @@ class DblpReleaseImporter(EntityImporter):          part_of_key = clean_str(xml_elem.crossref and xml_elem.crossref.text)          # block bogus far-future years/dates -        if release_year is not None and (release_year > (self.this_year + 5) or release_year < 1000): +        if release_year is not None and ( +            release_year > (self.this_year + 5) or release_year < 1000 +        ):              release_month = None              release_year = None @@ -245,39 +243,39 @@ class DblpReleaseImporter(EntityImporter):          if isbn:              ext_ids.isbn13 = isbn          if ext_ids.doi: -            self.counts['has-doi'] += 1 +            self.counts["has-doi"] += 1          # dblp-specific extra          dblp_extra = dict(type=dblp_type)          note = clean_str(xml_elem.note and xml_elem.note.text) -        if note and 'base-search.net' not in note: -            dblp_extra['note'] = note +        if note and "base-search.net" not in note: +            dblp_extra["note"] = note          if part_of_key: -            dblp_extra['part_of_key'] = part_of_key +            dblp_extra["part_of_key"] = part_of_key          # generic extra          extra = dict()          if not container_id and container_name: -            extra['container_name'] = container_name +            extra["container_name"] = container_name -        if series and (dblp_key_type == 'series' or dblp_type == 'book'): -            extra['series-title'] = series +        if series and (dblp_key_type == "series" or dblp_type == "book"): +            extra["series-title"] = series          elif series: -            dblp_extra['series'] = series +            dblp_extra["series"] = series -        if booktitle and dblp_key_type == 'series': -            extra['container-title'] = booktitle -        elif booktitle and dblp_key_type == 'conf': -            extra['event'] = booktitle +        if booktitle and dblp_key_type == "series": +            extra["container-title"] = booktitle +        elif booktitle and dblp_key_type == "conf": +            extra["event"] = booktitle          elif booktitle: -            dblp_extra['booktitle'] = booktitle +            dblp_extra["booktitle"] = booktitle          if release_year and release_month:              # TODO: release_month schema migration -            extra['release_month'] = release_month +            extra["release_month"] = release_month          if dblp_extra: -            extra['dblp'] = dblp_extra +            extra["dblp"] = dblp_extra          if not extra:              extra = None @@ -289,7 +287,7 @@ class DblpReleaseImporter(EntityImporter):              withdrawn_status=withdrawn_status,              title=title,              release_year=release_year, -            #release_date, +            # release_date,              publisher=publisher,              ext_ids=ext_ids,              contribs=contribs, @@ -302,8 +300,8 @@ class DblpReleaseImporter(EntityImporter):          if self.dump_json_mode:              re_dict = entity_to_dict(re, api_client=self.api.api_client) -            re_dict['_dblp_ee_urls'] = self.dblp_ext_urls(xml_elem) -            re_dict['_dblp_prefix'] = dblp_prefix +            re_dict["_dblp_ee_urls"] = 
self.dblp_ext_urls(xml_elem) +            re_dict["_dblp_prefix"] = dblp_prefix              print(json.dumps(re_dict, sort_keys=True))              return False @@ -341,11 +339,11 @@ class DblpReleaseImporter(EntityImporter):          # then try other ext_id lookups          if not existing: -            for extid_type in ('doi', 'wikidata_qid', 'isbn13', 'arxiv'): +            for extid_type in ("doi", "wikidata_qid", "isbn13", "arxiv"):                  extid_val = getattr(re.ext_ids, extid_type)                  if not extid_val:                      continue -                #print(f"  lookup release type: {extid_type} val: {extid_val}") +                # print(f"  lookup release type: {extid_type} val: {extid_val}")                  try:                      existing = self.api.lookup_release(**{extid_type: extid_val})                  except fatcat_openapi_client.rest.ApiException as err: @@ -373,12 +371,14 @@ class DblpReleaseImporter(EntityImporter):              return True          if not self.do_updates or existing.ext_ids.dblp: -            self.counts['exists'] += 1 +            self.counts["exists"] += 1              return False          # logic for whether to do update or skip -        if (existing.container_id and existing.release_type and existing.release_stage) or existing.ext_ids.arxiv: -            self.counts['skip-update'] += 1 +        if ( +            existing.container_id and existing.release_type and existing.release_stage +        ) or existing.ext_ids.arxiv: +            self.counts["skip-update"] += 1              return False          # fields to copy over for update @@ -390,20 +390,20 @@ class DblpReleaseImporter(EntityImporter):          existing.release_stage = existing.release_stage or re.release_stage          existing.withdrawn_status = existing.withdrawn_status or re.withdrawn_status          existing.container_id = existing.container_id or re.container_id -        existing.extra['dblp'] = re.extra['dblp'] +        existing.extra["dblp"] = re.extra["dblp"]          existing.volume = existing.volume or re.volume          existing.issue = existing.issue or re.issue          existing.pages = existing.pages or re.pages          try:              self.api.update_release(self.get_editgroup_id(), existing.ident, existing) -            self.counts['update'] += 1 +            self.counts["update"] += 1          except fatcat_openapi_client.rest.ApiException as err:              # there is a code path where we try to update the same release              # twice in a row; if that happens, just skip              # NOTE: API behavior might change in the future?              
if "release_edit_editgroup_id_ident_id_key" in err.body: -                self.counts['skip-update-conflict'] += 1 +                self.counts["skip-update-conflict"] += 1                  return False              else:                  raise err @@ -411,11 +411,14 @@ class DblpReleaseImporter(EntityImporter):          return False      def insert_batch(self, batch): -        self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( -            editgroup=fatcat_openapi_client.Editgroup( -                description=self.editgroup_description, -                extra=self.editgroup_extra), -            entity_list=batch)) +        self.api.create_release_auto_batch( +            fatcat_openapi_client.ReleaseAutoBatch( +                editgroup=fatcat_openapi_client.Editgroup( +                    description=self.editgroup_description, extra=self.editgroup_extra +                ), +                entity_list=batch, +            ) +        )      def dblp_contribs(self, authors: List[dict]) -> List[fatcat_openapi_client.ReleaseContrib]:          """ @@ -428,14 +431,14 @@ class DblpReleaseImporter(EntityImporter):          """          contribs = []          index = 0 -        for elem in authors.find_all('author'): +        for elem in authors.find_all("author"):              contrib = self.dblp_contrib_single(elem)              contrib.role = "author"              contrib.index = index              contribs.append(contrib)              index += 1 -        for elem in authors.find_all('editor'): +        for elem in authors.find_all("editor"):              contrib = self.dblp_contrib_single(elem)              contrib.role = "editor"              contribs.append(contrib) @@ -459,10 +462,10 @@ class DblpReleaseImporter(EntityImporter):          # remove number in author name, if present          if raw_name.split()[-1].isdigit(): -            raw_name = ' '.join(raw_name.split()[:-1]) +            raw_name = " ".join(raw_name.split()[:-1]) -        if elem.get('orcid'): -            orcid = clean_orcid(elem['orcid']) +        if elem.get("orcid"): +            orcid = clean_orcid(elem["orcid"])              if orcid:                  creator_id = self.lookup_orcid(orcid)                  if not creator_id: @@ -491,22 +494,26 @@ class DblpReleaseImporter(EntityImporter):          wikidata_qid: Optional[str] = None          arxiv_id: Optional[str] = None          hdl: Optional[str] = None -        for ee in xml_elem.find_all('ee'): +        for ee in xml_elem.find_all("ee"):              url = ee.text              # convert DOI-like domains, which mostly have DOIs anyways -            if '://doi.acm.org/' in url: -                url = url.replace('://doi.acm.org/', '://doi.org/') -            elif '://doi.ieeecomputersociety.org/' in url: -                url = url.replace('://doi.ieeecomputersociety.org/', '://doi.org/') +            if "://doi.acm.org/" in url: +                url = url.replace("://doi.acm.org/", "://doi.org/") +            elif "://doi.ieeecomputersociety.org/" in url: +                url = url.replace("://doi.ieeecomputersociety.org/", "://doi.org/") -            if 'doi.org/10.' in url and not doi: +            if "doi.org/10." 
in url and not doi:                  doi = clean_doi(url) -            elif 'wikidata.org/entity/Q' in url and not wikidata_qid: +            elif "wikidata.org/entity/Q" in url and not wikidata_qid:                  wikidata_qid = clean_wikidata_qid(url) -            elif '://arxiv.org/abs/' in url and not arxiv_id: -                arxiv_id = url.replace('http://', '').replace('https://', '').replace('arxiv.org/abs/', '') +            elif "://arxiv.org/abs/" in url and not arxiv_id: +                arxiv_id = ( +                    url.replace("http://", "") +                    .replace("https://", "") +                    .replace("arxiv.org/abs/", "") +                )                  arxiv_id = clean_arxiv_id(arxiv_id) -            elif '://hdl.handle.net' in url and not hdl: +            elif "://hdl.handle.net" in url and not hdl:                  hdl = clean_hdl(url)          return fatcat_openapi_client.ReleaseExtIds( @@ -525,14 +532,14 @@ class DblpReleaseImporter(EntityImporter):          sandcrawler ingest requests.          """          EXTID_PATTERNS = [ -            '://doi.acm.org/', -            '://doi.ieeecomputersociety.org/', -            'doi.org/10.', -            'wikidata.org/entity/Q', -            '://arxiv.org/abs/', +            "://doi.acm.org/", +            "://doi.ieeecomputersociety.org/", +            "doi.org/10.", +            "wikidata.org/entity/Q", +            "://arxiv.org/abs/",          ]          urls = [] -        for ee in xml_elem.find_all('ee'): +        for ee in xml_elem.find_all("ee"):              url = ee.text              skip = False              for pattern in EXTID_PATTERNS: diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py index 1831c4cd..cd063337 100644 --- a/python/fatcat_tools/importers/doaj_article.py +++ b/python/fatcat_tools/importers/doaj_article.py @@ -28,26 +28,23 @@ MAX_ABSTRACT_LENGTH = 2048  class DoajArticleImporter(EntityImporter): - -    def __init__(self, -                 api, -                 issn_map_file, -                 **kwargs): +    def __init__(self, api, issn_map_file, **kwargs):          eg_desc = kwargs.get( -            'editgroup_description', -            "Automated import of DOAJ article metadata, harvested from REST API or bulk dumps" +            "editgroup_description", +            "Automated import of DOAJ article metadata, harvested from REST API or bulk dumps",          ) -        eg_extra = kwargs.get('editgroup_extra', dict()) -        eg_extra['agent'] = eg_extra.get('agent', -                                         'fatcat_tools.DoajArticleImporter') +        eg_extra = kwargs.get("editgroup_extra", dict()) +        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.DoajArticleImporter")          # ensure default is to not do updates with this worker (override super() default) -        kwargs['do_updates'] = kwargs.get("do_updates", False) -        super().__init__(api, -                         issn_map_file=issn_map_file, -                         editgroup_description=eg_desc, -                         editgroup_extra=eg_extra, -                         **kwargs) +        kwargs["do_updates"] = kwargs.get("do_updates", False) +        super().__init__( +            api, +            issn_map_file=issn_map_file, +            editgroup_description=eg_desc, +            editgroup_extra=eg_extra, +            **kwargs, +        )          self.this_year = datetime.datetime.now().year          
self.read_issn_map_file(issn_map_file) @@ -82,21 +79,21 @@ class DoajArticleImporter(EntityImporter):          }          """ -        if not obj or not isinstance(obj, dict) or 'bibjson' not in obj: -            self.counts['skip-empty'] += 1 +        if not obj or not isinstance(obj, dict) or "bibjson" not in obj: +            self.counts["skip-empty"] += 1              return None -        bibjson = obj['bibjson'] +        bibjson = obj["bibjson"] -        title = clean_str(bibjson.get('title'), force_xml=True) +        title = clean_str(bibjson.get("title"), force_xml=True)          if not title: -            self.counts['skip-title'] += 1 +            self.counts["skip-title"] += 1              return False -        container_name = clean_str(bibjson['journal']['title']) +        container_name = clean_str(bibjson["journal"]["title"])          container_id = None          # NOTE: 'issns' not documented in API schema -        for issn in bibjson['journal']['issns']: +        for issn in bibjson["journal"]["issns"]:              issnl = self.issn2issnl(issn)              if issnl:                  container_id = self.lookup_issnl(self.issn2issnl(issn)) @@ -105,75 +102,83 @@ class DoajArticleImporter(EntityImporter):                  container_name = None                  break -        volume = clean_str(bibjson['journal'].get('volume')) +        volume = clean_str(bibjson["journal"].get("volume"))          # NOTE: this schema seems to use "number" as "issue number" -        issue = clean_str(bibjson['journal'].get('number')) -        publisher = clean_str(bibjson['journal'].get('publisher')) +        issue = clean_str(bibjson["journal"].get("number")) +        publisher = clean_str(bibjson["journal"].get("publisher"))          try: -            release_year = int(bibjson.get('year')) +            release_year = int(bibjson.get("year"))          except (TypeError, ValueError):              release_year = None -        release_month = parse_month(clean_str(bibjson.get('month'))) +        release_month = parse_month(clean_str(bibjson.get("month")))          # block bogus far-future years/dates -        if release_year is not None and (release_year > (self.this_year + 5) or release_year < 1000): +        if release_year is not None and ( +            release_year > (self.this_year + 5) or release_year < 1000 +        ):              release_month = None              release_year = None -        license_slug = self.doaj_license_slug(bibjson['journal'].get('license')) -        country = parse_country_name(bibjson['journal'].get('country')) +        license_slug = self.doaj_license_slug(bibjson["journal"].get("license")) +        country = parse_country_name(bibjson["journal"].get("country"))          language = None -        for raw in bibjson['journal'].get('language') or []: +        for raw in bibjson["journal"].get("language") or []:              language = parse_lang_name(raw)              if language:                  break          # pages          # NOTE: error in API docs? 
seems like start_page not under 'journal' object -        start_page = clean_str(bibjson['journal'].get('start_page')) or clean_str(bibjson.get('start_page')) -        end_page = clean_str(bibjson['journal'].get('end_page')) or clean_str(bibjson.get('end_page')) +        start_page = clean_str(bibjson["journal"].get("start_page")) or clean_str( +            bibjson.get("start_page") +        ) +        end_page = clean_str(bibjson["journal"].get("end_page")) or clean_str( +            bibjson.get("end_page") +        )          pages: Optional[str] = None          if start_page and end_page:              pages = f"{start_page}-{end_page}"          elif start_page:              pages = start_page -        doaj_article_id = obj['id'].lower() -        ext_ids = self.doaj_ext_ids(bibjson['identifier'], doaj_article_id) +        doaj_article_id = obj["id"].lower() +        ext_ids = self.doaj_ext_ids(bibjson["identifier"], doaj_article_id)          abstracts = self.doaj_abstracts(bibjson) -        contribs = self.doaj_contribs(bibjson.get('author') or []) +        contribs = self.doaj_contribs(bibjson.get("author") or [])          # DOAJ-specific extra          doaj_extra = dict() -        if bibjson.get('subject'): -            doaj_extra['subject'] = bibjson.get('subject') -        if bibjson.get('keywords'): -            doaj_extra['keywords'] = [k for k in [clean_str(s) for s in bibjson.get('keywords')] if k] +        if bibjson.get("subject"): +            doaj_extra["subject"] = bibjson.get("subject") +        if bibjson.get("keywords"): +            doaj_extra["keywords"] = [ +                k for k in [clean_str(s) for s in bibjson.get("keywords")] if k +            ]          # generic extra          extra = dict()          if country: -            extra['country'] = country +            extra["country"] = country          if not container_id and container_name: -            extra['container_name'] = container_name +            extra["container_name"] = container_name          if release_year and release_month:              # TODO: schema migration -            extra['release_month'] = release_month +            extra["release_month"] = release_month          if doaj_extra: -            extra['doaj'] = doaj_extra +            extra["doaj"] = doaj_extra          if not extra:              extra = None          re = fatcat_openapi_client.ReleaseEntity(              work_id=None,              container_id=container_id, -            release_type='article-journal', -            release_stage='published', +            release_type="article-journal", +            release_stage="published",              title=title,              release_year=release_year, -            #release_date, +            # release_date,              publisher=publisher,              ext_ids=ext_ids,              contribs=contribs, @@ -208,11 +213,11 @@ class DoajArticleImporter(EntityImporter):          # then try other ext_id lookups          if not existing: -            for extid_type in ('doi', 'pmid', 'pmcid'): +            for extid_type in ("doi", "pmid", "pmcid"):                  extid_val = getattr(re.ext_ids, extid_type)                  if not extid_val:                      continue -                #print(f"  lookup release type: {extid_type} val: {extid_val}") +                # print(f"  lookup release type: {extid_type} val: {extid_val}")                  try:                      existing = self.api.lookup_release(**{extid_type: extid_val})                  except 
fatcat_openapi_client.rest.ApiException as err: @@ -241,7 +246,7 @@ class DoajArticleImporter(EntityImporter):          # other logic could go here about skipping updates          if not self.do_updates or existing.ext_ids.doaj: -            self.counts['exists'] += 1 +            self.counts["exists"] += 1              return False          # fields to copy over for update @@ -250,7 +255,7 @@ class DoajArticleImporter(EntityImporter):          existing.release_stage = existing.release_stage or re.release_stage          existing.container_id = existing.container_id or re.container_id          existing.abstracts = existing.abstracts or re.abstracts -        existing.extra['doaj'] = re.extra['doaj'] +        existing.extra["doaj"] = re.extra["doaj"]          existing.volume = existing.volume or re.volume          existing.issue = existing.issue or re.issue          existing.pages = existing.pages or re.pages @@ -258,13 +263,13 @@ class DoajArticleImporter(EntityImporter):          try:              self.api.update_release(self.get_editgroup_id(), existing.ident, existing) -            self.counts['update'] += 1 +            self.counts["update"] += 1          except fatcat_openapi_client.rest.ApiException as err:              # there is a code path where we try to update the same release              # twice in a row; if that happens, just skip              # NOTE: API behavior might change in the future?              if "release_edit_editgroup_id_ident_id_key" in err.body: -                self.counts['skip-update-conflict'] += 1 +                self.counts["skip-update-conflict"] += 1                  return False              else:                  raise err @@ -272,14 +277,17 @@ class DoajArticleImporter(EntityImporter):          return False      def insert_batch(self, batch): -        self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( -            editgroup=fatcat_openapi_client.Editgroup( -                description=self.editgroup_description, -                extra=self.editgroup_extra), -            entity_list=batch)) +        self.api.create_release_auto_batch( +            fatcat_openapi_client.ReleaseAutoBatch( +                editgroup=fatcat_openapi_client.Editgroup( +                    description=self.editgroup_description, extra=self.editgroup_extra +                ), +                entity_list=batch, +            ) +        )      def doaj_abstracts(self, bibjson: dict) -> List[fatcat_openapi_client.ReleaseAbstract]: -        text = clean_str(bibjson.get('abstract')) +        text = clean_str(bibjson.get("abstract"))          if not text or len(text) < 10:              return []          if len(text) > MAX_ABSTRACT_LENGTH: @@ -293,7 +301,9 @@ class DoajArticleImporter(EntityImporter):              lang=lang,          ) -        return [abstract,] +        return [ +            abstract, +        ]      def doaj_contribs(self, authors: List[dict]) -> List[fatcat_openapi_client.ReleaseContrib]:          """ @@ -306,23 +316,27 @@ class DoajArticleImporter(EntityImporter):          contribs = []          index = 0          for author in authors: -            if not author.get('name'): +            if not author.get("name"):                  continue              creator_id = None -            orcid = clean_orcid(author.get('orcid_id')) +            orcid = clean_orcid(author.get("orcid_id"))              if orcid:                  creator_id = self.lookup_orcid(orcid) -            contribs.append(fatcat_openapi_client.ReleaseContrib( -            
    raw_name=author.get('name'), -                role='author', -                index=index, -                creator_id=creator_id, -                raw_affiliation=clean_str(author.get('affiliation')), -            )) +            contribs.append( +                fatcat_openapi_client.ReleaseContrib( +                    raw_name=author.get("name"), +                    role="author", +                    index=index, +                    creator_id=creator_id, +                    raw_affiliation=clean_str(author.get("affiliation")), +                ) +            )              index += 1          return contribs -    def doaj_ext_ids(self, identifiers: List[dict], doaj_article_id: str) -> fatcat_openapi_client.ReleaseExtIds: +    def doaj_ext_ids( +        self, identifiers: List[dict], doaj_article_id: str +    ) -> fatcat_openapi_client.ReleaseExtIds:          """          bibjson.identifier {              id (string), @@ -336,14 +350,14 @@ class DoajArticleImporter(EntityImporter):          pmid: Optional[str] = None          pmcid: Optional[str] = None          for id_obj in identifiers: -            if not id_obj.get('id'): +            if not id_obj.get("id"):                  continue -            if id_obj['type'].lower() == 'doi': -                doi = clean_doi(id_obj['id']) -            elif id_obj['type'].lower() == 'pmid': -                pmid = clean_pmid(id_obj['id']) -            elif id_obj['type'].lower() == 'pmcid': -                pmcid = clean_pmcid(id_obj['id']) +            if id_obj["type"].lower() == "doi": +                doi = clean_doi(id_obj["id"]) +            elif id_obj["type"].lower() == "pmid": +                pmid = clean_pmid(id_obj["id"]) +            elif id_obj["type"].lower() == "pmcid": +                pmcid = clean_pmcid(id_obj["id"])          return fatcat_openapi_client.ReleaseExtIds(              doaj=doaj_article_id, @@ -365,10 +379,10 @@ class DoajArticleImporter(EntityImporter):          if not license_list:              return None          for license in license_list: -            if not license.get('open_access'): +            if not license.get("open_access"):                  continue -            slug = license.get('type') -            if slug.startswith('CC '): -                slug = slug.replace('CC ', 'cc-').lower() +            slug = license.get("type") +            if slug.startswith("CC "): +                slug = slug.replace("CC ", "cc-").lower()                  return slug          return None diff --git a/python/fatcat_tools/importers/file_meta.py b/python/fatcat_tools/importers/file_meta.py index 0951ed84..26584ff3 100644 --- a/python/fatcat_tools/importers/file_meta.py +++ b/python/fatcat_tools/importers/file_meta.py @@ -1,4 +1,3 @@ -  import fatcat_openapi_client  from .common import EntityImporter @@ -17,19 +16,16 @@ class FileMetaImporter(EntityImporter):      def __init__(self, api, require_grobid=True, **kwargs): -        eg_desc = kwargs.pop('editgroup_description', None) or "File metadata updates" -        eg_extra = kwargs.pop('editgroup_extra', dict()) -        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.FileMetaImporter') -        kwargs['do_updates'] = kwargs.get("do_updates", True) -        super().__init__(api, -            editgroup_description=eg_desc, -            editgroup_extra=eg_extra, -            **kwargs) +        eg_desc = kwargs.pop("editgroup_description", None) or "File metadata updates" +        eg_extra = kwargs.pop("editgroup_extra", dict()) +        
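
# Sketch of the DOAJ license slug normalization shown above: open-access
# license strings like "CC BY-NC-ND" become lowercase "cc-by-nc-nd" slugs, and
# the first open_access license with a "CC " prefix wins. This illustrates the
# loop above; it is not a complete list of DOAJ license types.
from typing import List, Optional


def license_slug(license_list: List[dict]) -> Optional[str]:
    for lic in license_list:
        if not lic.get("open_access"):
            continue
        slug = lic.get("type") or ""
        if slug.startswith("CC "):
            return slug.replace("CC ", "cc-").lower()
    return None


assert license_slug([{"type": "CC BY", "open_access": True}]) == "cc-by"
assert license_slug([{"type": "CC BY-NC-ND", "open_access": False}]) is None
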
eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.FileMetaImporter") +        kwargs["do_updates"] = kwargs.get("do_updates", True) +        super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)      def want(self, row): -        for k in ('sha1hex', 'sha256hex', 'md5hex', 'size_bytes', 'mimetype'): +        for k in ("sha1hex", "sha256hex", "md5hex", "size_bytes", "mimetype"):              if not row.get(k): -                self.counts['skip-missing-field'] += 1 +                self.counts["skip-missing-field"] += 1                  return False          return True @@ -40,11 +36,11 @@ class FileMetaImporter(EntityImporter):          file_meta = row          fe = fatcat_openapi_client.FileEntity( -            md5=file_meta['md5hex'], -            sha1=file_meta['sha1hex'], -            sha256=file_meta['sha256hex'], -            size=file_meta['size_bytes'], -            mimetype=file_meta['mimetype'], +            md5=file_meta["md5hex"], +            sha1=file_meta["sha1hex"], +            sha256=file_meta["sha256hex"], +            size=file_meta["size_bytes"], +            mimetype=file_meta["mimetype"],          )          return fe @@ -59,11 +55,11 @@ class FileMetaImporter(EntityImporter):                  raise err          if not existing: -            self.counts['skip-no-match'] += 1 +            self.counts["skip-no-match"] += 1              return False -        if (existing.md5 and existing.sha256 and existing.size and existing.mimetype): -            self.counts['skip-existing-complete'] += 1 +        if existing.md5 and existing.sha256 and existing.size and existing.mimetype: +            self.counts["skip-existing-complete"] += 1              return False          existing.md5 = existing.md5 or fe.md5 @@ -75,5 +71,5 @@ class FileMetaImporter(EntityImporter):          existing = self.generic_file_cleanups(existing)          self.api.update_file(self.get_editgroup_id(), existing.ident, existing) -        self.counts['update'] += 1 +        self.counts["update"] += 1          return False diff --git a/python/fatcat_tools/importers/fileset_generic.py b/python/fatcat_tools/importers/fileset_generic.py index 43c2a49c..dd8f5600 100644 --- a/python/fatcat_tools/importers/fileset_generic.py +++ b/python/fatcat_tools/importers/fileset_generic.py @@ -1,4 +1,3 @@ -  import fatcat_openapi_client  from fatcat_tools import entity_from_dict @@ -20,34 +19,31 @@ class FilesetImporter(EntityImporter):      def __init__(self, api, **kwargs): -        eg_desc = kwargs.pop('editgroup_description', None) or "Generic Fileset entity import" -        eg_extra = kwargs.pop('editgroup_extra', dict()) -        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.FilesetImporter') -        kwargs['do_updates'] = bool(kwargs.get("do_updates", False)) +        eg_desc = kwargs.pop("editgroup_description", None) or "Generic Fileset entity import" +        eg_extra = kwargs.pop("editgroup_extra", dict()) +        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.FilesetImporter") +        kwargs["do_updates"] = bool(kwargs.get("do_updates", False))          self.skip_release_fileset_check = bool(kwargs.get("skip_release_fileset_check", False)) -        super().__init__(api, -            editgroup_description=eg_desc, -            editgroup_extra=eg_extra, -            **kwargs) +        super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)          # bezerk mode doesn't make sense for this importer          assert 
self.bezerk_mode is False      def want(self, row): -        if not row.get('release_ids'): -            self.counts['skip-no-release-ids'] += 1 +        if not row.get("release_ids"): +            self.counts["skip-no-release-ids"] += 1              return False -        if not row.get('urls'): -            self.counts['skip-no-urls'] += 1 +        if not row.get("urls"): +            self.counts["skip-no-urls"] += 1              return False -        if not row.get('manifest'): -            self.counts['skip-no-files'] += 1 +        if not row.get("manifest"): +            self.counts["skip-no-files"] += 1              return False -        for f in row.get('manifest'): -            for k in ('sha1', 'md5'): +        for f in row.get("manifest"): +            for k in ("sha1", "md5"):                  if not f.get(k): -                    self.counts['skip-missing-file-field'] += 1 +                    self.counts["skip-missing-file-field"] += 1                      return False          return True @@ -66,19 +62,24 @@ class FilesetImporter(EntityImporter):          if not self.skip_release_fileset_check:              for release_id in fse.release_ids:                  # don't catch 404, that would be an error -                release = self.api.get_release(release_id, expand='filesets', hide='abstracts,refs') -                assert release.state == 'active' +                release = self.api.get_release( +                    release_id, expand="filesets", hide="abstracts,refs" +                ) +                assert release.state == "active"                  if release.filesets: -                    self.counts['exists'] += 1 -                    self.counts['exists-via-release-filesets'] += 1 +                    self.counts["exists"] += 1 +                    self.counts["exists-via-release-filesets"] += 1                      return False          # do the insert          return True      def insert_batch(self, batch): -        self.api.create_fileset_auto_batch(fatcat_openapi_client.FilesetAutoBatch( -            editgroup=fatcat_openapi_client.Editgroup( -                description=self.editgroup_description, -                extra=self.editgroup_extra), -            entity_list=batch)) +        self.api.create_fileset_auto_batch( +            fatcat_openapi_client.FilesetAutoBatch( +                editgroup=fatcat_openapi_client.Editgroup( +                    description=self.editgroup_description, extra=self.editgroup_extra +                ), +                entity_list=batch, +            ) +        ) diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py index 0f666652..f7bb5357 100644 --- a/python/fatcat_tools/importers/grobid_metadata.py +++ b/python/fatcat_tools/importers/grobid_metadata.py @@ -7,7 +7,7 @@ import fatcat_openapi_client  from .common import EntityImporter, clean, make_rel_url -MAX_ABSTRACT_BYTES=4096 +MAX_ABSTRACT_BYTES = 4096  class GrobidMetadataImporter(EntityImporter): @@ -24,14 +24,13 @@ class GrobidMetadataImporter(EntityImporter):      def __init__(self, api, **kwargs): -        eg_desc = kwargs.get('editgroup_description', -            "Import of release and file metadata, as extracted from PDFs by GROBID.") -        eg_extra = kwargs.get('editgroup_extra', dict()) -        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.GrobidMetadataImporter') -        super().__init__(api, -            editgroup_description=eg_desc, -            editgroup_extra=eg_extra, -            
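
# Sketch of the FilesetImporter.want() checks above: a generic fileset row
# needs release_ids, urls, and a manifest, and every manifest entry must carry
# at least sha1 and md5 checksums before it is worth importing. Plain dicts
# stand in for the real rows.
def want_fileset_row(row: dict) -> bool:
    if not (row.get("release_ids") and row.get("urls") and row.get("manifest")):
        return False
    for f in row["manifest"]:
        if not (f.get("sha1") and f.get("md5")):
            return False
    return True


ok_row = {
    "release_ids": ["aaaaaaaaaaaaaaaaaaaaaaaaaa"],
    "urls": [{"url": "https://example.com/dataset/", "rel": "web"}],
    "manifest": [{"path": "data.csv", "sha1": "a" * 40, "md5": "b" * 32}],
}
assert want_fileset_row(ok_row)
assert not want_fileset_row({**ok_row, "manifest": [{"path": "data.csv"}]})
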
**kwargs) +        eg_desc = kwargs.get( +            "editgroup_description", +            "Import of release and file metadata, as extracted from PDFs by GROBID.", +        ) +        eg_extra = kwargs.get("editgroup_extra", dict()) +        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.GrobidMetadataImporter") +        super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)          self.default_link_rel = kwargs.get("default_link_rel", "web")          self.longtail_oa = kwargs.get("longtail_oa", False) @@ -40,7 +39,7 @@ class GrobidMetadataImporter(EntityImporter):      def parse_record(self, row): -        fields = row.split('\t') +        fields = row.split("\t")          sha1_key = fields[0]          cdx = json.loads(fields[1])          mimetype = fields[2] @@ -65,8 +64,8 @@ class GrobidMetadataImporter(EntityImporter):          # TODO: this is where we should check if the file actually has          # release_ids and/or URLs associated with it          if existing and not self.bezerk_mode: -            self.counts['exists'] += 1 -            self.counts['skip'] -= 1 +            self.counts["exists"] += 1 +            self.counts["skip"] -= 1              return None          release_edit = self.create_release(re) @@ -75,75 +74,81 @@ class GrobidMetadataImporter(EntityImporter):      def parse_grobid_json(self, obj): -        if not obj.get('title'): +        if not obj.get("title"):              return None          extra_grobid = dict() -        abstract = obj.get('abstract') +        abstract = obj.get("abstract")          if abstract and len(abstract) < MAX_ABSTRACT_BYTES and len(abstract) > 10:              abobj = fatcat_openapi_client.ReleaseAbstract( -                mimetype="text/plain", -                content=clean(obj.get('abstract'))) +                mimetype="text/plain", content=clean(obj.get("abstract")) +            )              abstracts = [abobj]          else:              abstracts = None          contribs = [] -        for i, a in enumerate(obj.get('authors', [])): -            contribs.append(fatcat_openapi_client.ReleaseContrib( -                index=i, -                raw_name=clean(a['name']), -                given_name=clean(a.get('given_name')), -                surname=clean(a.get('surname')), -                role="author", -                extra=None)) +        for i, a in enumerate(obj.get("authors", [])): +            contribs.append( +                fatcat_openapi_client.ReleaseContrib( +                    index=i, +                    raw_name=clean(a["name"]), +                    given_name=clean(a.get("given_name")), +                    surname=clean(a.get("surname")), +                    role="author", +                    extra=None, +                ) +            )          refs = [] -        for raw in obj.get('citations', []): +        for raw in obj.get("citations", []):              cite_extra = dict()              year = None -            if raw.get('date'): +            if raw.get("date"):                  try: -                    year = int(raw['date'].strip()[:4]) +                    year = int(raw["date"].strip()[:4])                  except (IndexError, ValueError):                      pass -            for key in ('volume', 'url', 'issue', 'publisher'): +            for key in ("volume", "url", "issue", "publisher"):                  if raw.get(key):                      cite_extra[key] = clean(raw[key]) -            if raw.get('authors'): -                
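
# Minimal sketch of the GrobidMetadataImporter.parse_record() input format
# above: each row of the GROBID metadata dump is tab-separated, with a
# "sha1:<base32>" key first, a JSON-encoded CDX record second, and the mimetype
# third (any further columns are not shown in this hunk).
import json

row = (
    "sha1:TESTKEY\t"
    + json.dumps({"url": "https://example.com/paper.pdf", "dt": "20200101120000"})
    + "\tapplication/pdf"
)
fields = row.split("\t")
sha1_key, cdx, mimetype = fields[0], json.loads(fields[1]), fields[2]
assert sha1_key.startswith("sha1:")
assert cdx["dt"] == "20200101120000"
assert mimetype == "application/pdf"
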
cite_extra['authors'] = [clean(a['name']) for a in raw['authors']] +            if raw.get("authors"): +                cite_extra["authors"] = [clean(a["name"]) for a in raw["authors"]]              if not cite_extra:                  cite_extra = None -            refs.append(fatcat_openapi_client.ReleaseRef( -                key=clean(raw.get('id')), -                year=year, -                title=clean(raw['title']), -                extra=cite_extra)) +            refs.append( +                fatcat_openapi_client.ReleaseRef( +                    key=clean(raw.get("id")), +                    year=year, +                    title=clean(raw["title"]), +                    extra=cite_extra, +                ) +            )          release_date = None          release_year = None -        if obj.get('date'): +        if obj.get("date"):              # only returns year, ever? -            release_year = int(obj['date'][:4]) +            release_year = int(obj["date"][:4])          extra = dict() -        if obj.get('doi'): -            extra['doi'] = obj['doi'] -        if obj['journal'] and obj['journal'].get('name'): -            extra['container_name'] = clean(obj['journal']['name']) +        if obj.get("doi"): +            extra["doi"] = obj["doi"] +        if obj["journal"] and obj["journal"].get("name"): +            extra["container_name"] = clean(obj["journal"]["name"])          # TODO: ISSN/eISSN handling? or just journal name lookup?          if extra_grobid: -            extra['grobid'] = extra_grobid +            extra["grobid"] = extra_grobid          if self.longtail_oa: -            extra['longtail_oa'] = True +            extra["longtail_oa"] = True          if not extra:              extra = None -        title = clean(obj['title'], force_xml=True) +        title = clean(obj["title"], force_xml=True)          if not title or len(title) < 2:              return None @@ -154,17 +159,22 @@ class GrobidMetadataImporter(EntityImporter):              release_year=release_year,              contribs=contribs,              refs=refs, -            publisher=clean(obj['journal'].get('publisher')), -            volume=clean(obj['journal'].get('volume')), -            issue=clean(obj['journal'].get('issue')), +            publisher=clean(obj["journal"].get("publisher")), +            volume=clean(obj["journal"].get("volume")), +            issue=clean(obj["journal"].get("issue")),              abstracts=abstracts,              ext_ids=fatcat_openapi_client.ReleaseExtIds(), -            extra=extra) +            extra=extra, +        )          return re      def parse_file_metadata(self, sha1_key, cdx, mimetype, file_size): -        sha1 = base64.b16encode(base64.b32decode(sha1_key.replace('sha1:', ''))).decode('ascii').lower() +        sha1 = ( +            base64.b16encode(base64.b32decode(sha1_key.replace("sha1:", ""))) +            .decode("ascii") +            .lower() +        )          fe = fatcat_openapi_client.FileEntity(              sha1=sha1, @@ -175,16 +185,15 @@ class GrobidMetadataImporter(EntityImporter):          )          # parse URLs and CDX -        original = cdx['url'] -        assert len(cdx['dt']) >= 8 -        wayback = "https://web.archive.org/web/{}/{}".format( -            cdx['dt'], -            original) -        fe.urls.append( -            fatcat_openapi_client.FileUrl(url=wayback, rel="webarchive")) +        original = cdx["url"] +        assert len(cdx["dt"]) >= 8 +        wayback = "https://web.archive.org/web/{}/{}".format(cdx["dt"], 
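
# Self-contained sketch of the sha1 key conversion in parse_file_metadata()
# above: CDX-style "sha1:<base32>" keys are decoded and re-encoded as the
# lowercase hex digest fatcat stores. The round-trip against hashlib shows the
# two encodings agree.
import base64
import hashlib


def b32_sha1_to_hex(sha1_key: str) -> str:
    b32 = sha1_key.replace("sha1:", "")
    return base64.b16encode(base64.b32decode(b32)).decode("ascii").lower()


digest = hashlib.sha1(b"example bytes").digest()
key = "sha1:" + base64.b32encode(digest).decode("ascii")
assert b32_sha1_to_hex(key) == hashlib.sha1(b"example bytes").hexdigest()
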
original) +        fe.urls.append(fatcat_openapi_client.FileUrl(url=wayback, rel="webarchive"))          original_url = make_rel_url(original, default_link_rel=self.default_link_rel)          if original_url is not None: -            fe.urls.append(fatcat_openapi_client.FileUrl(rel=original_url[0], url=original_url[1])) +            fe.urls.append( +                fatcat_openapi_client.FileUrl(rel=original_url[0], url=original_url[1]) +            )          return fe @@ -193,8 +202,11 @@ class GrobidMetadataImporter(EntityImporter):          return True      def insert_batch(self, batch): -        self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch( -            editgroup=fatcat_openapi_client.Editgroup( -                description=self.editgroup_description, -                extra=self.editgroup_extra), -            entity_list=batch)) +        self.api.create_file_auto_batch( +            fatcat_openapi_client.FileAutoBatch( +                editgroup=fatcat_openapi_client.Editgroup( +                    description=self.editgroup_description, extra=self.editgroup_extra +                ), +                entity_list=batch, +            ) +        ) diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index f0943c1e..e0a6c3f5 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -1,4 +1,3 @@ -  import datetime  import fatcat_openapi_client @@ -7,17 +6,16 @@ from .common import EntityImporter, make_rel_url  class IngestFileResultImporter(EntityImporter): -      def __init__(self, api, require_grobid=True, **kwargs): -        eg_desc = kwargs.pop('editgroup_description', None) or "Files crawled from web using sandcrawler ingest tool" -        eg_extra = kwargs.pop('editgroup_extra', dict()) -        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestFileResultImporter') -        kwargs['do_updates'] = kwargs.get("do_updates", False) -        super().__init__(api, -            editgroup_description=eg_desc, -            editgroup_extra=eg_extra, -            **kwargs) +        eg_desc = ( +            kwargs.pop("editgroup_description", None) +            or "Files crawled from web using sandcrawler ingest tool" +        ) +        eg_extra = kwargs.pop("editgroup_extra", dict()) +        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.IngestFileResultImporter") +        kwargs["do_updates"] = kwargs.get("do_updates", False) +        super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)          self.use_glutton_match = False          self.default_link_rel = kwargs.get("default_link_rel", "web")          assert self.default_link_rel @@ -27,20 +25,20 @@ class IngestFileResultImporter(EntityImporter):          else:              print("NOT checking GROBID success")          self.ingest_request_source_allowlist = [ -            'fatcat-changelog', -            'fatcat-ingest-container', -            'fatcat-ingest', -            'arabesque', +            "fatcat-changelog", +            "fatcat-ingest-container", +            "fatcat-ingest", +            "arabesque",              #'mag-corpus',              #'mag', -            'unpaywall-corpus', -            'unpaywall', +            "unpaywall-corpus", +            "unpaywall",              #'s2-corpus',              #'s2', -            'doaj', -            'dblp', +            "doaj", +            "dblp",          ] -        if kwargs.get('skip_source_allowlist', False): +      
  if kwargs.get("skip_source_allowlist", False):              self.ingest_request_source_allowlist = []      def want_file(self, row) -> bool: @@ -48,28 +46,32 @@ class IngestFileResultImporter(EntityImporter):          File-specific part of want(). Generic across general ingest and save-paper-now.          """ -        if not row.get('file_meta'): -            self.counts['skip-file-meta'] += 1 +        if not row.get("file_meta"): +            self.counts["skip-file-meta"] += 1              return False          # type-specific filters -        if row['request'].get('ingest_type') == 'pdf': -            if self.require_grobid and row.get('grobid', {}).get('status_code') != 200: -                self.counts['skip-grobid'] += 1 +        if row["request"].get("ingest_type") == "pdf": +            if self.require_grobid and row.get("grobid", {}).get("status_code") != 200: +                self.counts["skip-grobid"] += 1                  return False -            if row['file_meta'].get('mimetype') not in ("application/pdf",): -                self.counts['skip-mimetype'] += 1 +            if row["file_meta"].get("mimetype") not in ("application/pdf",): +                self.counts["skip-mimetype"] += 1                  return False -        elif row['request'].get('ingest_type') == 'xml': -            if row['file_meta'].get('mimetype') not in ("application/xml", -                    "application/jats+xml", "application/tei+xml", "text/xml"): -                self.counts['skip-mimetype'] += 1 +        elif row["request"].get("ingest_type") == "xml": +            if row["file_meta"].get("mimetype") not in ( +                "application/xml", +                "application/jats+xml", +                "application/tei+xml", +                "text/xml", +            ): +                self.counts["skip-mimetype"] += 1                  return False -        elif row['request'].get('ingest_type') in ['component', 'src', 'dataset-file']: +        elif row["request"].get("ingest_type") in ["component", "src", "dataset-file"]:              # we rely on sandcrawler for these checks              pass          else: -            self.counts['skip-ingest-type'] += 1 +            self.counts["skip-ingest-type"] += 1              return False          return True @@ -79,24 +81,36 @@ class IngestFileResultImporter(EntityImporter):          Sandcrawler ingest-specific part of want(). Generic across file and          webcapture ingest.          
""" -        if row.get('hit') is not True: -            self.counts['skip-hit'] += 1 +        if row.get("hit") is not True: +            self.counts["skip-hit"] += 1              return False -        source = row['request'].get('ingest_request_source') +        source = row["request"].get("ingest_request_source")          if not source: -            self.counts['skip-ingest_request_source'] += 1 +            self.counts["skip-ingest_request_source"] += 1              return False -        if self.ingest_request_source_allowlist and source not in self.ingest_request_source_allowlist: -            self.counts['skip-ingest_request_source'] += 1 +        if ( +            self.ingest_request_source_allowlist +            and source not in self.ingest_request_source_allowlist +        ): +            self.counts["skip-ingest_request_source"] += 1              return False -        if row['request'].get('link_source') not in ('arxiv', 'pmc', 'unpaywall', 'doi', 'mag', 's2', 'doaj', 'dblp'): -            self.counts['skip-link-source'] += 1 +        if row["request"].get("link_source") not in ( +            "arxiv", +            "pmc", +            "unpaywall", +            "doi", +            "mag", +            "s2", +            "doaj", +            "dblp", +        ): +            self.counts["skip-link-source"] += 1              return False -        if source.startswith('savepapernow'): +        if source.startswith("savepapernow"):              # never process async savepapernow requests -            self.counts['skip-savepapernow'] += 1 +            self.counts["skip-savepapernow"] += 1              return False          return True @@ -125,19 +139,19 @@ class IngestFileResultImporter(EntityImporter):      def parse_ingest_release_ident(self, row): -        request = row['request'] -        fatcat = request.get('fatcat') +        request = row["request"] +        fatcat = request.get("fatcat")          release_ident = None -        if fatcat and fatcat.get('release_ident'): -            release_ident = fatcat.get('release_ident') -        elif request.get('ext_ids'): +        if fatcat and fatcat.get("release_ident"): +            release_ident = fatcat.get("release_ident") +        elif request.get("ext_ids"):              # if no fatcat ident, try extids -            for extid_type in ('doi', 'pmid', 'pmcid', 'arxiv', 'doaj', 'dblp'): -                extid = request['ext_ids'].get(extid_type) +            for extid_type in ("doi", "pmid", "pmcid", "arxiv", "doaj", "dblp"): +                extid = request["ext_ids"].get(extid_type)                  if not extid:                      continue -                if extid_type == 'doi': +                if extid_type == "doi":                      extid = extid.lower()                  try:                      release = self.api.lookup_release(**{extid_type: extid}) @@ -145,66 +159,69 @@ class IngestFileResultImporter(EntityImporter):                      if err.status == 404:                          continue                      elif err.status == 400: -                        self.counts['warn-extid-invalid'] += 1 +                        self.counts["warn-extid-invalid"] += 1                          continue                      raise err                  # verify release_stage -                if request.get('release_stage') and release.release_stage: -                    if request['release_stage'] != release.release_stage: -                        self.counts['skip-release-stage'] += 1 +                if 
request.get("release_stage") and release.release_stage: +                    if request["release_stage"] != release.release_stage: +                        self.counts["skip-release-stage"] += 1                          return None                  release_ident = release.ident                  break -        if self.use_glutton_match and not release_ident and row.get('grobid'): +        if self.use_glutton_match and not release_ident and row.get("grobid"):              # try biblio-glutton extracted hit -            if row['grobid'].get('fatcat_release'): -                release_ident = row['grobid']['fatcat_release'].split('_')[-1] -                self.counts['glutton-match'] += 1 +            if row["grobid"].get("fatcat_release"): +                release_ident = row["grobid"]["fatcat_release"].split("_")[-1] +                self.counts["glutton-match"] += 1          return release_ident      def parse_terminal(self, row): -        terminal = row.get('terminal') +        terminal = row.get("terminal")          if not terminal:              # support old cdx-only ingest results -            cdx = row.get('cdx') +            cdx = row.get("cdx")              if not cdx:                  return None              else:                  terminal = { -                    'terminal_url': cdx['url'], -                    'terminal_dt': cdx['datetime'], -                    'terminal_status_code': cdx.get('status_code') or cdx.get('http_status'), +                    "terminal_url": cdx["url"], +                    "terminal_dt": cdx["datetime"], +                    "terminal_status_code": cdx.get("status_code") or cdx.get("http_status"),                  }          # work around old schema -        if 'terminal_url' not in terminal: -            terminal['terminal_url'] = terminal['url'] -        if 'terminal_dt' not in terminal: -            terminal['terminal_dt'] = terminal['dt'] +        if "terminal_url" not in terminal: +            terminal["terminal_url"] = terminal["url"] +        if "terminal_dt" not in terminal: +            terminal["terminal_dt"] = terminal["dt"]          # convert CDX-style digits to ISO-style timestamp -        assert len(terminal['terminal_dt']) == 14 -        terminal['terminal_timestamp'] = datetime.datetime.strptime(terminal['terminal_dt'], "%Y%m%d%H%M%S").isoformat() + "Z" +        assert len(terminal["terminal_dt"]) == 14 +        terminal["terminal_timestamp"] = ( +            datetime.datetime.strptime(terminal["terminal_dt"], "%Y%m%d%H%M%S").isoformat() +            + "Z" +        )          return terminal      def parse_urls(self, row, terminal): -        request = row['request'] +        request = row["request"]          default_rel = self.default_link_rel -        if request.get('link_source') == 'doi': -            default_rel = 'publisher' -        default_rel = request.get('rel', default_rel) -        url = make_rel_url(terminal['terminal_url'], default_rel) +        if request.get("link_source") == "doi": +            default_rel = "publisher" +        default_rel = request.get("rel", default_rel) +        url = make_rel_url(terminal["terminal_url"], default_rel)          if not url: -            self.counts['skip-url'] += 1 +            self.counts["skip-url"] += 1              return None          wayback = "https://web.archive.org/web/{}/{}".format( -            terminal['terminal_dt'], -            terminal['terminal_url']) +            terminal["terminal_dt"], terminal["terminal_url"] +        )          urls = [url, ("webarchive", 
wayback)]          urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls] @@ -212,38 +229,38 @@ class IngestFileResultImporter(EntityImporter):      def parse_edit_extra(self, row): -        request = row['request'] +        request = row["request"]          edit_extra = dict() -        if request.get('edit_extra'): -            edit_extra = request['edit_extra'] +        if request.get("edit_extra"): +            edit_extra = request["edit_extra"] -        if request.get('ingest_request_source'): -            edit_extra['ingest_request_source'] = request['ingest_request_source'] -        if request.get('link_source') and request.get('link_source_id'): -            edit_extra['link_source'] = request['link_source'] -            edit_extra['link_source_id'] = request['link_source_id'] -            if edit_extra['link_source'] == 'doi': -                edit_extra['link_source_id'] = edit_extra['link_source_id'].lower() +        if request.get("ingest_request_source"): +            edit_extra["ingest_request_source"] = request["ingest_request_source"] +        if request.get("link_source") and request.get("link_source_id"): +            edit_extra["link_source"] = request["link_source"] +            edit_extra["link_source_id"] = request["link_source_id"] +            if edit_extra["link_source"] == "doi": +                edit_extra["link_source_id"] = edit_extra["link_source_id"].lower()          # GROBID metadata, for SPN requests (when there might not be 'success') -        if request.get('ingest_type') == 'pdf': -            if row.get('grobid') and row['grobid'].get('status') != 'success': -                edit_extra['grobid_status_code'] = row['grobid']['status_code'] -                edit_extra['grobid_version'] = row['grobid'].get('grobid_version') +        if request.get("ingest_type") == "pdf": +            if row.get("grobid") and row["grobid"].get("status") != "success": +                edit_extra["grobid_status_code"] = row["grobid"]["status_code"] +                edit_extra["grobid_version"] = row["grobid"].get("grobid_version")          return edit_extra      def parse_record(self, row): -        request = row['request'] -        file_meta = row['file_meta'] +        request = row["request"] +        file_meta = row["file_meta"]          # double check that want() filtered request correctly (eg, old requests) -        if request.get('ingest_type') not in ('pdf', 'xml'): -            self.counts['skip-ingest-type'] += 1 +        if request.get("ingest_type") not in ("pdf", "xml"): +            self.counts["skip-ingest-type"] += 1              return None -        assert (request['ingest_type'], file_meta['mimetype']) in [ +        assert (request["ingest_type"], file_meta["mimetype"]) in [              ("pdf", "application/pdf"),              ("xml", "application/xml"),              ("xml", "application/jats+xml"), @@ -255,23 +272,23 @@ class IngestFileResultImporter(EntityImporter):          release_ident = self.parse_ingest_release_ident(row)          if not release_ident: -            self.counts['skip-release-not-found'] += 1 +            self.counts["skip-release-not-found"] += 1              return None          terminal = self.parse_terminal(row)          if not terminal:              # TODO: support archive.org hits? 
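
# Sketch of parse_edit_extra() above: per-edit provenance is carried in a small
# dict, recording where the ingest request came from and which external
# identifier linked it (DOIs lowercased), so editgroup reviewers can trace each
# file edit back to its source request.
def build_edit_extra(request: dict) -> dict:
    edit_extra = dict(request.get("edit_extra") or {})
    if request.get("ingest_request_source"):
        edit_extra["ingest_request_source"] = request["ingest_request_source"]
    if request.get("link_source") and request.get("link_source_id"):
        edit_extra["link_source"] = request["link_source"]
        link_source_id = request["link_source_id"]
        if request["link_source"] == "doi":
            link_source_id = link_source_id.lower()
        edit_extra["link_source_id"] = link_source_id
    return edit_extra


extra = build_edit_extra(
    {
        "ingest_request_source": "fatcat-changelog",
        "link_source": "doi",
        "link_source_id": "10.1234/ABC",
    }
)
assert extra["link_source_id"] == "10.1234/abc"
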
-            self.counts['skip-no-terminal'] += 1 +            self.counts["skip-no-terminal"] += 1              return None          urls = self.parse_urls(row, terminal)          fe = fatcat_openapi_client.FileEntity( -            md5=file_meta['md5hex'], -            sha1=file_meta['sha1hex'], -            sha256=file_meta['sha256hex'], -            size=file_meta['size_bytes'], -            mimetype=file_meta['mimetype'], +            md5=file_meta["md5hex"], +            sha1=file_meta["sha1hex"], +            sha256=file_meta["sha256hex"], +            size=file_meta["size_bytes"], +            mimetype=file_meta["mimetype"],              release_ids=[release_ident],              urls=urls,          ) @@ -293,7 +310,7 @@ class IngestFileResultImporter(EntityImporter):          # check for existing edits-in-progress with same file hash          for other in self._entity_queue:              if other.sha1 == fe.sha1: -                self.counts['skip-in-queue'] += 1 +                self.counts["skip-in-queue"] += 1                  return False          if not existing: @@ -302,31 +319,36 @@ class IngestFileResultImporter(EntityImporter):          # NOTE: the following checks all assume there is an existing item          if (fe.release_ids[0] in existing.release_ids) and existing.urls:              # TODO: could still, in theory update with the new URL? -            self.counts['exists'] += 1 +            self.counts["exists"] += 1              return False          if not self.do_updates: -            self.counts['skip-update-disabled'] += 1 +            self.counts["skip-update-disabled"] += 1              return False          # TODO: for now, never update -        self.counts['skip-update-disabled'] += 1 +        self.counts["skip-update-disabled"] += 1          return False      def insert_batch(self, batch):          if self.submit_mode: -            eg = self.api.create_editgroup(fatcat_openapi_client.Editgroup( -                description=self.editgroup_description, -                extra=self.editgroup_extra)) +            eg = self.api.create_editgroup( +                fatcat_openapi_client.Editgroup( +                    description=self.editgroup_description, extra=self.editgroup_extra +                ) +            )              for fe in batch:                  self.api.create_file(eg.editgroup_id, fe)              self.api.update_editgroup(eg.editgroup_id, eg, submit=True)          else: -            self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch( -                editgroup=fatcat_openapi_client.Editgroup( -                    description=self.editgroup_description, -                    extra=self.editgroup_extra), -                entity_list=batch)) +            self.api.create_file_auto_batch( +                fatcat_openapi_client.FileAutoBatch( +                    editgroup=fatcat_openapi_client.Editgroup( +                        description=self.editgroup_description, extra=self.editgroup_extra +                    ), +                    entity_list=batch, +                ) +            )  class SavePaperNowFileImporter(IngestFileResultImporter): @@ -338,29 +360,29 @@ class SavePaperNowFileImporter(IngestFileResultImporter):      def __init__(self, api, submit_mode=True, **kwargs): -        eg_desc = kwargs.pop('editgroup_description', None) or "Files crawled after a public 'Save Paper Now' request" -        eg_extra = kwargs.pop('editgroup_extra', dict()) -        eg_extra['agent'] = eg_extra.get('agent', 
'fatcat_tools.SavePaperNowFileImporter') -        kwargs['submit_mode'] = submit_mode -        kwargs['require_grobid'] = False -        kwargs['do_updates'] = False -        super().__init__(api, -            editgroup_description=eg_desc, -            editgroup_extra=eg_extra, -            **kwargs) +        eg_desc = ( +            kwargs.pop("editgroup_description", None) +            or "Files crawled after a public 'Save Paper Now' request" +        ) +        eg_extra = kwargs.pop("editgroup_extra", dict()) +        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.SavePaperNowFileImporter") +        kwargs["submit_mode"] = submit_mode +        kwargs["require_grobid"] = False +        kwargs["do_updates"] = False +        super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)      def want(self, row): -        source = row['request'].get('ingest_request_source') +        source = row["request"].get("ingest_request_source")          if not source: -            self.counts['skip-ingest_request_source'] += 1 +            self.counts["skip-ingest_request_source"] += 1              return False -        if not source.startswith('savepapernow'): -            self.counts['skip-not-savepapernow'] += 1 +        if not source.startswith("savepapernow"): +            self.counts["skip-not-savepapernow"] += 1              return False -        if row.get('hit') is not True: -            self.counts['skip-hit'] += 1 +        if row.get("hit") is not True: +            self.counts["skip-hit"] += 1              return False          if not self.want_file(row): @@ -377,14 +399,14 @@ class IngestWebResultImporter(IngestFileResultImporter):      def __init__(self, api, **kwargs): -        eg_desc = kwargs.pop('editgroup_description', None) or "Webcaptures crawled from web using sandcrawler ingest tool" -        eg_extra = kwargs.pop('editgroup_extra', dict()) -        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestWebResultImporter') -        kwargs['do_updates'] = False -        super().__init__(api, -            editgroup_description=eg_desc, -            editgroup_extra=eg_extra, -            **kwargs) +        eg_desc = ( +            kwargs.pop("editgroup_description", None) +            or "Webcaptures crawled from web using sandcrawler ingest tool" +        ) +        eg_extra = kwargs.pop("editgroup_extra", dict()) +        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.IngestWebResultImporter") +        kwargs["do_updates"] = False +        super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)      def want(self, row): @@ -392,91 +414,95 @@ class IngestWebResultImporter(IngestFileResultImporter):              return False          # webcapture-specific filters -        if row['request'].get('ingest_type') != 'html': -            self.counts['skip-ingest-type'] += 1 +        if row["request"].get("ingest_type") != "html": +            self.counts["skip-ingest-type"] += 1              return False -        if not row.get('file_meta'): -            self.counts['skip-file-meta'] += 1 +        if not row.get("file_meta"): +            self.counts["skip-file-meta"] += 1              return False -        if row['file_meta'].get('mimetype') not in ("text/html", "application/xhtml+xml"): -            self.counts['skip-mimetype'] += 1 +        if row["file_meta"].get("mimetype") not in ("text/html", "application/xhtml+xml"): +            self.counts["skip-mimetype"] += 1              return False       
   return True      def parse_record(self, row): -        request = row['request'] -        file_meta = row['file_meta'] +        request = row["request"] +        file_meta = row["file_meta"]          # double check that want() filtered request correctly (eg, old requests) -        if request.get('ingest_type') != "html": -            self.counts['skip-ingest-type'] += 1 +        if request.get("ingest_type") != "html": +            self.counts["skip-ingest-type"] += 1              return None -        if file_meta['mimetype'] not in ("text/html", "application/xhtml+xml"): -            self.counts['skip-mimetype'] += 1 +        if file_meta["mimetype"] not in ("text/html", "application/xhtml+xml"): +            self.counts["skip-mimetype"] += 1              return None          # identify release by fatcat ident, or extid lookup          release_ident = self.parse_ingest_release_ident(row)          if not release_ident: -            self.counts['skip-release-not-found'] += 1 +            self.counts["skip-release-not-found"] += 1              return None          terminal = self.parse_terminal(row)          if not terminal:              # TODO: support archive.org hits? -            self.counts['skip-no-terminal'] += 1 +            self.counts["skip-no-terminal"] += 1              return None          urls = self.parse_urls(row, terminal) -        archive_urls = [u for u in urls if u.rel == 'webarchive'] +        archive_urls = [u for u in urls if u.rel == "webarchive"] -        if terminal['terminal_status_code'] != 200: -            self.counts['skip-terminal-status-code'] += 1 +        if terminal["terminal_status_code"] != 200: +            self.counts["skip-terminal-status-code"] += 1              return None -        terminal_cdx = row['cdx'] -        if 'revisit_cdx' in row: -            terminal_cdx = row['revisit_cdx'] -        assert terminal_cdx['surt'] -        if terminal_cdx['url'] != terminal['terminal_url']: -            self.counts['skip-terminal-url-mismatch'] += 1 +        terminal_cdx = row["cdx"] +        if "revisit_cdx" in row: +            terminal_cdx = row["revisit_cdx"] +        assert terminal_cdx["surt"] +        if terminal_cdx["url"] != terminal["terminal_url"]: +            self.counts["skip-terminal-url-mismatch"] += 1              return None          wc_cdx = []          # primary resource first -        wc_cdx.append(fatcat_openapi_client.WebcaptureCdxLine( -            surt=terminal_cdx['surt'], -            timestamp=terminal['terminal_timestamp'], -            url=terminal['terminal_url'], -            mimetype=file_meta['mimetype'], -            status_code=terminal['terminal_status_code'], -            sha1=file_meta['sha1hex'], -            sha256=file_meta['sha256hex'], -            size=file_meta['size_bytes'], -        )) - -        for resource in row.get('html_resources', []): -            timestamp = resource['timestamp'] +        wc_cdx.append( +            fatcat_openapi_client.WebcaptureCdxLine( +                surt=terminal_cdx["surt"], +                timestamp=terminal["terminal_timestamp"], +                url=terminal["terminal_url"], +                mimetype=file_meta["mimetype"], +                status_code=terminal["terminal_status_code"], +                sha1=file_meta["sha1hex"], +                sha256=file_meta["sha256hex"], +                size=file_meta["size_bytes"], +            ) +        ) + +        for resource in row.get("html_resources", []): +            timestamp = resource["timestamp"]              if "+" 
not in timestamp and "Z" not in timestamp:                  timestamp += "Z" -            wc_cdx.append(fatcat_openapi_client.WebcaptureCdxLine( -                surt=resource['surt'], -                timestamp=timestamp, -                url=resource['url'], -                mimetype=resource.get('mimetype'), -                size=resource.get('size'), -                sha1=resource.get('sha1hex'), -                sha256=resource.get('sha256hex'), -            )) +            wc_cdx.append( +                fatcat_openapi_client.WebcaptureCdxLine( +                    surt=resource["surt"], +                    timestamp=timestamp, +                    url=resource["url"], +                    mimetype=resource.get("mimetype"), +                    size=resource.get("size"), +                    sha1=resource.get("sha1hex"), +                    sha256=resource.get("sha256hex"), +                ) +            )          wc = fatcat_openapi_client.WebcaptureEntity(              cdx=wc_cdx,              archive_urls=archive_urls, -            original_url=terminal['terminal_url'], -            timestamp=terminal['terminal_timestamp'], +            original_url=terminal["terminal_url"], +            timestamp=terminal["terminal_timestamp"],              release_ids=[release_ident],          ) @@ -491,11 +517,11 @@ class IngestWebResultImporter(IngestFileResultImporter):          # check for existing edits-in-progress with same URL          for other in self._entity_queue:              if other.original_url == wc.original_url: -                self.counts['skip-in-queue'] += 1 +                self.counts["skip-in-queue"] += 1                  return False          # lookup sha1, or create new entity (TODO: API doesn't support this yet) -        #existing = None +        # existing = None          # TODO: currently only allow one release per webcapture          release = self.api.get_release(wc.release_ids[0], expand="webcaptures") @@ -504,9 +530,9 @@ class IngestWebResultImporter(IngestFileResultImporter):              for other in release.webcaptures:                  if wc.original_url == other.original_url:                      # TODO: compare very similar timestamps of same time (different formats) -                    self.counts['exists'] += 1 +                    self.counts["exists"] += 1                      return False -            self.counts['skip-release-has-webcapture'] += 1 +            self.counts["skip-release-has-webcapture"] += 1              return False          # Ok, if we got here then no existing web capture for (first) release, @@ -515,18 +541,24 @@ class IngestWebResultImporter(IngestFileResultImporter):      def insert_batch(self, batch):          if self.submit_mode: -            eg = self.api.create_editgroup(fatcat_openapi_client.Editgroup( -                description=self.editgroup_description, -                extra=self.editgroup_extra)) +            eg = self.api.create_editgroup( +                fatcat_openapi_client.Editgroup( +                    description=self.editgroup_description, extra=self.editgroup_extra +                ) +            )              for fe in batch:                  self.api.create_webcapture(eg.editgroup_id, fe)              self.api.update_editgroup(eg.editgroup_id, eg, submit=True)          else: -            self.api.create_webcapture_auto_batch(fatcat_openapi_client.WebcaptureAutoBatch( -                editgroup=fatcat_openapi_client.Editgroup( -                    description=self.editgroup_description, -             
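
# Sketch of the timestamp normalization above: sandcrawler HTML resource
# timestamps may arrive without an explicit timezone, and bare values are
# treated as UTC by appending "Z" before building WebcaptureCdxLine rows.
def ensure_utc_suffix(timestamp: str) -> str:
    if "+" not in timestamp and "Z" not in timestamp:
        timestamp += "Z"
    return timestamp


assert ensure_utc_suffix("2020-01-01T12:30:00") == "2020-01-01T12:30:00Z"
assert ensure_utc_suffix("2020-01-01T12:30:00+00:00") == "2020-01-01T12:30:00+00:00"
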
       extra=self.editgroup_extra), -                entity_list=batch)) +            self.api.create_webcapture_auto_batch( +                fatcat_openapi_client.WebcaptureAutoBatch( +                    editgroup=fatcat_openapi_client.Editgroup( +                        description=self.editgroup_description, extra=self.editgroup_extra +                    ), +                    entity_list=batch, +                ) +            ) +  class SavePaperNowWebImporter(IngestWebResultImporter):      """ @@ -535,15 +567,15 @@ class SavePaperNowWebImporter(IngestWebResultImporter):      def __init__(self, api, submit_mode=True, **kwargs): -        eg_desc = kwargs.pop('editgroup_description', None) or "Webcaptures crawled after a public 'Save Paper Now' request" -        eg_extra = kwargs.pop('editgroup_extra', dict()) -        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.SavePaperNowWebImporter') -        kwargs['submit_mode'] = submit_mode -        kwargs['do_updates'] = False -        super().__init__(api, -            editgroup_description=eg_desc, -            editgroup_extra=eg_extra, -            **kwargs) +        eg_desc = ( +            kwargs.pop("editgroup_description", None) +            or "Webcaptures crawled after a public 'Save Paper Now' request" +        ) +        eg_extra = kwargs.pop("editgroup_extra", dict()) +        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.SavePaperNowWebImporter") +        kwargs["submit_mode"] = submit_mode +        kwargs["do_updates"] = False +        super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)      def want(self, row):          """ @@ -553,27 +585,27 @@ class SavePaperNowWebImporter(IngestWebResultImporter):          path, which means allowing hit=false.          
""" -        source = row['request'].get('ingest_request_source') +        source = row["request"].get("ingest_request_source")          if not source: -            self.counts['skip-ingest_request_source'] += 1 +            self.counts["skip-ingest_request_source"] += 1              return False -        if not source.startswith('savepapernow'): -            self.counts['skip-not-savepapernow'] += 1 +        if not source.startswith("savepapernow"): +            self.counts["skip-not-savepapernow"] += 1              return False          # webcapture-specific filters -        if row['request'].get('ingest_type') != 'html': -            self.counts['skip-ingest-type'] += 1 +        if row["request"].get("ingest_type") != "html": +            self.counts["skip-ingest-type"] += 1              return False -        if not row.get('file_meta'): -            self.counts['skip-file-meta'] += 1 +        if not row.get("file_meta"): +            self.counts["skip-file-meta"] += 1              return False -        if row['file_meta'].get('mimetype') not in ("text/html", "application/xhtml+xml"): -            self.counts['skip-mimetype'] += 1 +        if row["file_meta"].get("mimetype") not in ("text/html", "application/xhtml+xml"): +            self.counts["skip-mimetype"] += 1              return False -        if row.get('status') not in ['success', 'unknown-scope']: -            self.counts['skip-hit'] += 1 +        if row.get("status") not in ["success", "unknown-scope"]: +            self.counts["skip-hit"] += 1              return False          return True @@ -587,28 +619,28 @@ class IngestFilesetResultImporter(IngestFileResultImporter):      def __init__(self, api, **kwargs): -        eg_desc = kwargs.pop('editgroup_description', None) or "Filesets crawled from web using sandcrawler ingest tool" -        eg_extra = kwargs.pop('editgroup_extra', dict()) -        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestFilesetResultImporter') -        kwargs['do_updates'] = False -        super().__init__(api, -            editgroup_description=eg_desc, -            editgroup_extra=eg_extra, -            **kwargs) +        eg_desc = ( +            kwargs.pop("editgroup_description", None) +            or "Filesets crawled from web using sandcrawler ingest tool" +        ) +        eg_extra = kwargs.pop("editgroup_extra", dict()) +        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.IngestFilesetResultImporter") +        kwargs["do_updates"] = False +        super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)          self.max_file_count = 300      def want_fileset(self, row): -        if not row.get('manifest') or len(row.get('manifest')) == 0: -            self.counts['skip-empty-manifest'] += 1 +        if not row.get("manifest") or len(row.get("manifest")) == 0: +            self.counts["skip-empty-manifest"] += 1              return False -        if len(row.get('manifest')) == 1: -            self.counts['skip-single-file'] += 1 +        if len(row.get("manifest")) == 1: +            self.counts["skip-single-file"] += 1              return False -        if len(row.get('manifest')) > self.max_file_count: -            self.counts['skip-too-many-files'] += 1 +        if len(row.get("manifest")) > self.max_file_count: +            self.counts["skip-too-many-files"] += 1              return False          return True @@ -619,8 +651,10 @@ class IngestFilesetResultImporter(IngestFileResultImporter):              return False          # 
fileset-specific filters -        if row['request'].get('ingest_type') not in ['dataset',]: -            self.counts['skip-ingest-type'] += 1 +        if row["request"].get("ingest_type") not in [ +            "dataset", +        ]: +            self.counts["skip-ingest-type"] += 1              return False          if not self.want_fileset(row): @@ -629,102 +663,118 @@ class IngestFilesetResultImporter(IngestFileResultImporter):          return True      def parse_fileset_urls(self, row): -        if not row.get('strategy'): +        if not row.get("strategy"):              return [] -        strategy = row['strategy'] +        strategy = row["strategy"]          urls = [] -        if strategy == 'archiveorg-fileset' and row.get('archiveorg_item_name'): -            urls.append(fatcat_openapi_client.FilesetUrl( -                url=f"https://archive.org/download/{row['archiveorg_item_name']}/", -                rel="archive-base", -            )) -        if row['strategy'].startswith('web-') and row.get('platform_base_url'): -            urls.append(fatcat_openapi_client.FilesetUrl( -                url=f"https://web.archive.org/web/{row['web_base_url_dt']}/{row['web_base_url']}", -                rel="webarchive-base", -            )) +        if strategy == "archiveorg-fileset" and row.get("archiveorg_item_name"): +            urls.append( +                fatcat_openapi_client.FilesetUrl( +                    url=f"https://archive.org/download/{row['archiveorg_item_name']}/", +                    rel="archive-base", +                ) +            ) +        if row["strategy"].startswith("web-") and row.get("platform_base_url"): +            urls.append( +                fatcat_openapi_client.FilesetUrl( +                    url=f"https://web.archive.org/web/{row['web_base_url_dt']}/{row['web_base_url']}", +                    rel="webarchive-base", +                ) +            )          # TODO: repository-base          # TODO: web-base -        if row['strategy'] == 'archiveorg-fileset-bundle' and row.get('archiveorg_item_name'): -            urls.append(fatcat_openapi_client.FilesetUrl( -                url=f"https://archive.org/download/{row['archiveorg_item_name']}/{row['archiveorg_bundle_path']}", -                rel="archive-bundle", -            )) +        if row["strategy"] == "archiveorg-fileset-bundle" and row.get("archiveorg_item_name"): +            urls.append( +                fatcat_openapi_client.FilesetUrl( +                    url=f"https://archive.org/download/{row['archiveorg_item_name']}/{row['archiveorg_bundle_path']}", +                    rel="archive-bundle", +                ) +            ) -        if row['strategy'] == 'web-fileset-bundle' and row.get('platform_bundle_url'): -            urls.append(fatcat_openapi_client.FilesetUrl( -                url=f"https://web.archive.org/web/{row['web_bundle_url_dt']}/{row['web_bundle_url']}", -                rel="webarchive-bundle", -            )) +        if row["strategy"] == "web-fileset-bundle" and row.get("platform_bundle_url"): +            urls.append( +                fatcat_openapi_client.FilesetUrl( +                    url=f"https://web.archive.org/web/{row['web_bundle_url_dt']}/{row['web_bundle_url']}", +                    rel="webarchive-bundle", +                ) +            )          # add any additional / platform URLs here -        if row.get('platform_bundle_url'): -            urls.append(fatcat_openapi_client.FilesetUrl( -                url=row['platform_bundle_url'], -             
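
# Sketch of the access-URL mapping in parse_fileset_urls() above: each ingest
# strategy contributes a differently shaped archival URL with its own rel tag.
# The two helpers below only illustrate the archive.org-backed strategies shown
# in the hunk; field names mirror the sandcrawler result row.
def archiveorg_base_url(item_name: str) -> tuple:
    return (f"https://archive.org/download/{item_name}/", "archive-base")


def webarchive_base_url(dt: str, url: str) -> tuple:
    return (f"https://web.archive.org/web/{dt}/{url}", "webarchive-base")


assert archiveorg_base_url("some-item")[0] == "https://archive.org/download/some-item/"
assert webarchive_base_url("20200101120000", "https://example.com/data/")[1] == "webarchive-base"
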
   rel="repository-bundle", -            )) -        if row.get('platform_base_url'): -            urls.append(fatcat_openapi_client.FilesetUrl( -                url=row['platform_bundle_url'], -                rel="repository-base", -            )) +        if row.get("platform_bundle_url"): +            urls.append( +                fatcat_openapi_client.FilesetUrl( +                    url=row["platform_bundle_url"], +                    rel="repository-bundle", +                ) +            ) +        if row.get("platform_base_url"): +            urls.append( +                fatcat_openapi_client.FilesetUrl( +                    url=row["platform_bundle_url"], +                    rel="repository-base", +                ) +            )          return urls      def parse_record(self, row): -        request = row['request'] +        request = row["request"]          # double check that want() filtered request correctly -        if request.get('ingest_type') not in ["dataset",]: -            self.counts['skip-ingest-type'] += 1 +        if request.get("ingest_type") not in [ +            "dataset", +        ]: +            self.counts["skip-ingest-type"] += 1              return None          # identify release by fatcat ident, or extid lookup          release_ident = self.parse_ingest_release_ident(row)          if not release_ident: -            self.counts['skip-release-not-found'] += 1 +            self.counts["skip-release-not-found"] += 1              return None          entity_extra = dict()          edit_extra = self.parse_edit_extra(row) -        edit_extra['ingest_strategy'] = row['ingest_strategy'] -        if row.get('platform'): -            edit_extra['platform'] = row['platform'] -        if row.get('platform_id'): -            edit_extra['platform_id'] = row['platform_id'] +        edit_extra["ingest_strategy"] = row["ingest_strategy"] +        if row.get("platform"): +            edit_extra["platform"] = row["platform"] +        if row.get("platform_id"): +            edit_extra["platform_id"] = row["platform_id"]          entity_urls = self.parse_fileset_urls(row)          if not entity_urls: -            self.counts['skip-no-access-url'] += 1 +            self.counts["skip-no-access-url"] += 1              return None -        assert row['file_count'] == len(row['manifest']) -        if row['file_count'] > self.max_file_count: -            self.counts['skip-too-many-manifest-files'] += 1 +        assert row["file_count"] == len(row["manifest"]) +        if row["file_count"] > self.max_file_count: +            self.counts["skip-too-many-manifest-files"] += 1              return None          manifest = [] -        for ingest_file in row['manifest']: +        for ingest_file in row["manifest"]:              fsf = fatcat_openapi_client.FilesetFile( -                path=ingest_file['path'], -                size=ingest_file['size'], -                md5=ingest_file['md5'], -                sha1=ingest_file['sha1'], -                sha256=ingest_file.get('sha256'), +                path=ingest_file["path"], +                size=ingest_file["size"], +                md5=ingest_file["md5"], +                sha1=ingest_file["sha1"], +                sha256=ingest_file.get("sha256"),                  extra=dict( -                    mimetype=ingest_file['mimetype'], +                    mimetype=ingest_file["mimetype"],                  ),              )              if not (fsf.md5 and fsf.sha1 and fsf.path and fsf.size): -                
self.counts['skip-partial-file-info'] += 1 +                self.counts["skip-partial-file-info"] += 1                  return None -            if ingest_file.get('platform_url'): +            if ingest_file.get("platform_url"):                  # XXX: should we include this? -                fsf.extra['original_url'] = ingest_file['platform_url'] -            if ingest_file.get('terminal_url') and ingest_file.get('terminal_dt'): -                fsf.extra['wayback_url'] = f"https://web.archive.org/web/{ingest_file['terminal_dt']}/{ingest_file['terminal_url']}" +                fsf.extra["original_url"] = ingest_file["platform_url"] +            if ingest_file.get("terminal_url") and ingest_file.get("terminal_dt"): +                fsf.extra[ +                    "wayback_url" +                ] = f"https://web.archive.org/web/{ingest_file['terminal_dt']}/{ingest_file['terminal_url']}"              manifest.append(fsf)          fe = fatcat_openapi_client.FilesetEntity( @@ -745,11 +795,11 @@ class IngestFilesetResultImporter(IngestFileResultImporter):          for other in self._entity_queue:              # XXX: how to duplicate check?              if other.original_url == wc.original_url: -                self.counts['skip-in-queue'] += 1 +                self.counts["skip-in-queue"] += 1                  return False          # lookup sha1, or create new entity (TODO: API doesn't support this yet) -        #existing = None +        # existing = None          # NOTE: in lieu of existing checks (by lookup), only allow one fileset per release          release = self.api.get_release(wc.release_ids[0], expand="filesets") @@ -759,27 +809,32 @@ class IngestFilesetResultImporter(IngestFileResultImporter):              for other in release.filesets:                  if wc.original_url == other.original_url:                      # TODO: compare very similar timestamps of same time (different formats) -                    self.counts['exists'] += 1 +                    self.counts["exists"] += 1                      return False -            self.counts['skip-release-has-fileset'] += 1 +            self.counts["skip-release-has-fileset"] += 1              return False          return True      def insert_batch(self, batch):          if self.submit_mode: -            eg = self.api.create_editgroup(fatcat_openapi_client.Editgroup( -                description=self.editgroup_description, -                extra=self.editgroup_extra)) +            eg = self.api.create_editgroup( +                fatcat_openapi_client.Editgroup( +                    description=self.editgroup_description, extra=self.editgroup_extra +                ) +            )              for fe in batch:                  self.api.create_fileset(eg.editgroup_id, fe)              self.api.update_editgroup(eg.editgroup_id, eg, submit=True)          else: -            self.api.create_fileset_auto_batch(fatcat_openapi_client.FilesetAutoBatch( -                editgroup=fatcat_openapi_client.Editgroup( -                    description=self.editgroup_description, -                    extra=self.editgroup_extra), -                entity_list=batch)) +            self.api.create_fileset_auto_batch( +                fatcat_openapi_client.FilesetAutoBatch( +                    editgroup=fatcat_openapi_client.Editgroup( +                        description=self.editgroup_description, extra=self.editgroup_extra +                    ), +                    entity_list=batch, +                ) +            )  class 
SavePaperNowFilesetImporter(IngestFilesetResultImporter): @@ -789,28 +844,28 @@ class SavePaperNowFilesetImporter(IngestFilesetResultImporter):      def __init__(self, api, submit_mode=True, **kwargs): -        eg_desc = kwargs.pop('editgroup_description', None) or "Fileset crawled after a public 'Save Paper Now' request" -        eg_extra = kwargs.pop('editgroup_extra', dict()) -        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.SavePaperNowFilesetImporter') -        kwargs['submit_mode'] = submit_mode -        kwargs['do_updates'] = False -        super().__init__(api, -            editgroup_description=eg_desc, -            editgroup_extra=eg_extra, -            **kwargs) +        eg_desc = ( +            kwargs.pop("editgroup_description", None) +            or "Fileset crawled after a public 'Save Paper Now' request" +        ) +        eg_extra = kwargs.pop("editgroup_extra", dict()) +        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.SavePaperNowFilesetImporter") +        kwargs["submit_mode"] = submit_mode +        kwargs["do_updates"] = False +        super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)      def want(self, row): -        source = row['request'].get('ingest_request_source') +        source = row["request"].get("ingest_request_source")          if not source: -            self.counts['skip-ingest_request_source'] += 1 +            self.counts["skip-ingest_request_source"] += 1              return False -        if not source.startswith('savepapernow'): -            self.counts['skip-not-savepapernow'] += 1 +        if not source.startswith("savepapernow"): +            self.counts["skip-not-savepapernow"] += 1              return False -        if row.get('hit') is not True: -            self.counts['skip-hit'] += 1 +        if row.get("hit") is not True: +            self.counts["skip-hit"] += 1              return False          if not self.want_fileset(row): diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py index 0a983c5e..8e3af416 100644 --- a/python/fatcat_tools/importers/jalc.py +++ b/python/fatcat_tools/importers/jalc.py @@ -1,4 +1,3 @@ -  import datetime  import sqlite3  import sys @@ -33,26 +32,24 @@ def parse_jalc_persons(raw_persons):      # first parse out into language-agnostic dics      for raw in raw_persons: -        name = raw.find('name') or None +        name = raw.find("name") or None          if name: -            name = clean(name.get_text().replace('\n', ' ')) -        surname = raw.find('familyName') or None +            name = clean(name.get_text().replace("\n", " ")) +        surname = raw.find("familyName") or None          if surname: -            surname = clean(surname.get_text().replace('\n', ' ')) -        given_name = raw.find('givenName') or None +            surname = clean(surname.get_text().replace("\n", " ")) +        given_name = raw.find("givenName") or None          if given_name: -            given_name = clean(given_name.get_text().replace('\n', ' ')) -        lang = 'en' +            given_name = clean(given_name.get_text().replace("\n", " ")) +        lang = "en"          if is_cjk(name): -            lang = 'ja' -        if lang == 'en' and surname and given_name: +            lang = "ja" +        if lang == "en" and surname and given_name:              # english names order is flipped              name = "{} {}".format(given_name, surname)          rc = fatcat_openapi_client.ReleaseContrib( -            raw_name=name, 
-            surname=surname, -            given_name=given_name, -            role="author") +            raw_name=name, surname=surname, given_name=given_name, role="author" +        )          # add an extra hint field; won't end up in serialized object          rc._lang = lang          persons.append(rc) @@ -60,12 +57,12 @@ def parse_jalc_persons(raw_persons):      if not persons:          return [] -    if all([p._lang == 'en' for p in persons]) or all([p._lang == 'ja' for p in persons]): +    if all([p._lang == "en" for p in persons]) or all([p._lang == "ja" for p in persons]):          # all english names, or all japanese names          return persons      # for debugging -    #if len([1 for p in persons if p._lang == 'en']) != len([1 for p in persons if p._lang == 'ja']): +    # if len([1 for p in persons if p._lang == 'en']) != len([1 for p in persons if p._lang == 'ja']):      #    print("INTERESTING: {}".format(persons[0]))      start_lang = persons[0]._lang @@ -74,10 +71,10 @@ def parse_jalc_persons(raw_persons):          if p._lang == start_lang:              contribs.append(p)          else: -            if p._lang == 'en' and contribs[-1]._lang == 'ja': +            if p._lang == "en" and contribs[-1]._lang == "ja":                  eng = p                  jpn = contribs[-1] -            elif p._lang == 'ja' and contribs[-1]._lang == 'en': +            elif p._lang == "ja" and contribs[-1]._lang == "en":                  eng = contribs[-1]                  jpn = p              else: @@ -85,11 +82,11 @@ def parse_jalc_persons(raw_persons):                  contribs.append(p)                  continue              eng.extra = { -                'original_name': { -                    'lang': jpn._lang, -                    'raw_name': jpn.raw_name, -                    'given_name': jpn.given_name, -                    'surname': jpn.surname, +                "original_name": { +                    "lang": jpn._lang, +                    "raw_name": jpn.raw_name, +                    "given_name": jpn.given_name, +                    "surname": jpn.surname,                  },              }              contribs[-1] = eng @@ -105,18 +102,19 @@ class JalcImporter(EntityImporter):      def __init__(self, api, issn_map_file, **kwargs): -        eg_desc = kwargs.get('editgroup_description', -            "Automated import of JALC DOI metadata") -        eg_extra = kwargs.get('editgroup_extra', dict()) -        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.JalcImporter') -        super().__init__(api, +        eg_desc = kwargs.get("editgroup_description", "Automated import of JALC DOI metadata") +        eg_extra = kwargs.get("editgroup_extra", dict()) +        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.JalcImporter") +        super().__init__( +            api,              issn_map_file=issn_map_file,              editgroup_description=eg_desc,              editgroup_extra=eg_extra, -            **kwargs) +            **kwargs +        ) -        self.create_containers = kwargs.get('create_containers', True) -        extid_map_file = kwargs.get('extid_map_file') +        self.create_containers = kwargs.get("create_containers", True) +        extid_map_file = kwargs.get("extid_map_file")          self.extid_map_db = None          if extid_map_file:              db_uri = "file:{}?mode=ro".format(extid_map_file) @@ -129,12 +127,27 @@ class JalcImporter(EntityImporter):      def lookup_ext_ids(self, doi):          if self.extid_map_db is None: -            
return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) -        row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", -            [doi.lower()]).fetchone() +            return dict( +                core_id=None, +                pmid=None, +                pmcid=None, +                wikidata_qid=None, +                arxiv_id=None, +                jstor_id=None, +            ) +        row = self.extid_map_db.execute( +            "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()] +        ).fetchone()          if row is None: -            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) -        row = [str(cell or '') or None for cell in row] +            return dict( +                core_id=None, +                pmid=None, +                pmcid=None, +                wikidata_qid=None, +                arxiv_id=None, +                jstor_id=None, +            ) +        row = [str(cell or "") or None for cell in row]          return dict(              core_id=row[0],              pmid=row[1], @@ -163,27 +176,27 @@ class JalcImporter(EntityImporter):          titles = record.find_all("title")          if not titles:              return None -        title = titles[0].get_text().replace('\n', ' ').strip() +        title = titles[0].get_text().replace("\n", " ").strip()          original_title = None -        if title.endswith('.'): +        if title.endswith("."):              title = title[:-1]          if len(titles) > 1: -            original_title = titles[1].get_text().replace('\n', ' ').strip() -            if original_title.endswith('.'): +            original_title = titles[1].get_text().replace("\n", " ").strip() +            if original_title.endswith("."):                  original_title = original_title[:-1]          doi = None          if record.doi:              doi = clean_doi(record.doi.string.strip().lower()) -            if doi.startswith('http://dx.doi.org/'): -                doi = doi.replace('http://dx.doi.org/', '') -            elif doi.startswith('https://dx.doi.org/'): -                doi = doi.replace('https://dx.doi.org/', '') -            elif doi.startswith('http://doi.org/'): -                doi = doi.replace('http://doi.org/', '') -            elif doi.startswith('https://doi.org/'): -                doi = doi.replace('https://doi.org/', '') -            if not (doi.startswith('10.') and '/' in doi): +            if doi.startswith("http://dx.doi.org/"): +                doi = doi.replace("http://dx.doi.org/", "") +            elif doi.startswith("https://dx.doi.org/"): +                doi = doi.replace("https://dx.doi.org/", "") +            elif doi.startswith("http://doi.org/"): +                doi = doi.replace("http://doi.org/", "") +            elif doi.startswith("https://doi.org/"): +                doi = doi.replace("https://doi.org/", "") +            if not (doi.startswith("10.") and "/" in doi):                  sys.stderr.write("bogus JALC DOI: {}\n".format(doi))                  doi = None          if not doi: @@ -202,7 +215,9 @@ class JalcImporter(EntityImporter):          if date:              date = date.string              if len(date) == 10: -                release_date = datetime.datetime.strptime(date['completed-date'], DATE_FMT).date() +                release_date = datetime.datetime.strptime( +                    date["completed-date"], DATE_FMT +          
      ).date()                  release_year = release_date.year                  release_date = release_date.isoformat()              elif len(date) == 4 and date.isdigit(): @@ -214,7 +229,7 @@ class JalcImporter(EntityImporter):              if record.endingPage and record.endingPage.string.strip():                  pages = "{}-{}".format(pages, record.endingPage.string.strip())          # double check to prevent "-" as pages -        if pages and pages.strip() == '-': +        if pages and pages.strip() == "-":              pages = None          volume = None @@ -242,9 +257,13 @@ class JalcImporter(EntityImporter):          container_extra = dict()          if record.publicationName: -            pubs = [p.get_text().replace('\n', ' ').strip() for p in record.find_all("publicationName") if p.get_text()] +            pubs = [ +                p.get_text().replace("\n", " ").strip() +                for p in record.find_all("publicationName") +                if p.get_text() +            ]              pubs = [clean(p) for p in pubs if p] -            assert(pubs) +            assert pubs              if len(pubs) > 1 and pubs[0] == pubs[1]:                  pubs = [pubs[0]]              if len(pubs) > 1 and is_cjk(pubs[0]): @@ -252,10 +271,14 @@ class JalcImporter(EntityImporter):                  pubs = [pubs[1], pubs[0]]              container_name = clean(pubs[0])              if len(pubs) > 1: -                container_extra['original_name'] = clean(pubs[1]) +                container_extra["original_name"] = clean(pubs[1])          if record.publisher: -            pubs = [p.get_text().replace('\n', ' ').strip() for p in record.find_all("publisher") if p.get_text()] +            pubs = [ +                p.get_text().replace("\n", " ").strip() +                for p in record.find_all("publisher") +                if p.get_text() +            ]              pubs = [p for p in pubs if p]              if len(pubs) > 1 and pubs[0] == pubs[1]:                  pubs = [pubs[0]] @@ -265,20 +288,25 @@ class JalcImporter(EntityImporter):              if pubs:                  publisher = clean(pubs[0])                  if len(pubs) > 1: -                    container_extra['publisher_aliases'] = pubs[1:] - -        if (container_id is None and self.create_containers and (issnl is not None) -                and container_name): +                    container_extra["publisher_aliases"] = pubs[1:] + +        if ( +            container_id is None +            and self.create_containers +            and (issnl is not None) +            and container_name +        ):              # name, type, publisher, issnl              # extra: issnp, issne, original_name, languages, country -            container_extra['country'] = 'jp' -            container_extra['languages'] = ['ja'] +            container_extra["country"] = "jp" +            container_extra["languages"] = ["ja"]              ce = fatcat_openapi_client.ContainerEntity(                  name=container_name, -                container_type='journal', +                container_type="journal",                  publisher=publisher,                  issnl=issnl, -                extra=(container_extra or None)) +                extra=(container_extra or None), +            )              ce_edit = self.create_container(ce)              container_id = ce_edit.ident              # short-cut future imports in same batch @@ -301,7 +329,7 @@ class JalcImporter(EntityImporter):          #   group-title          # always put at least an empty dict 
here to indicate the DOI registrar          # (informally) -        extra['jalc'] = extra_jalc +        extra["jalc"] = extra_jalc          title = clean(title)          if not title: @@ -312,24 +340,24 @@ class JalcImporter(EntityImporter):              title=title,              original_title=clean(original_title),              release_type=release_type, -            release_stage='published', +            release_stage="published",              release_date=release_date,              release_year=release_year,              ext_ids=fatcat_openapi_client.ReleaseExtIds(                  doi=doi, -                pmid=extids['pmid'], -                pmcid=extids['pmcid'], -                wikidata_qid=extids['wikidata_qid'], -                core=extids['core_id'], -                arxiv=extids['arxiv_id'], -                jstor=extids['jstor_id'], +                pmid=extids["pmid"], +                pmcid=extids["pmcid"], +                wikidata_qid=extids["wikidata_qid"], +                core=extids["core_id"], +                arxiv=extids["arxiv_id"], +                jstor=extids["jstor_id"],              ),              volume=volume,              issue=issue,              pages=pages,              publisher=publisher,              language=lang, -            #license_slug +            # license_slug              container_id=container_id,              contribs=contribs,              extra=extra, @@ -351,17 +379,20 @@ class JalcImporter(EntityImporter):          # eventually we'll want to support "updates", but for now just skip if          # entity already exists          if existing: -            self.counts['exists'] += 1 +            self.counts["exists"] += 1              return False          return True      def insert_batch(self, batch): -        self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( -            editgroup=fatcat_openapi_client.Editgroup( -                description=self.editgroup_description, -                extra=self.editgroup_extra), -            entity_list=batch)) +        self.api.create_release_auto_batch( +            fatcat_openapi_client.ReleaseAutoBatch( +                editgroup=fatcat_openapi_client.Editgroup( +                    description=self.editgroup_description, extra=self.editgroup_extra +                ), +                entity_list=batch, +            ) +        )      def parse_file(self, handle):          """ @@ -374,11 +405,11 @@ class JalcImporter(EntityImporter):          # 2. 
iterate over articles, call parse_article on each          for record in soup.find_all("Description"):              resp = self.parse_record(record) -            #print(json.dumps(resp)) +            # print(json.dumps(resp))              print(resp) -            #sys.exit(-1) +            # sys.exit(-1) -if __name__=='__main__': +if __name__ == "__main__":      parser = JalcImporter(None, None)      parser.parse_file(open(sys.argv[1])) diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py index 25d7b3b5..6d1fefa3 100644 --- a/python/fatcat_tools/importers/journal_metadata.py +++ b/python/fatcat_tools/importers/journal_metadata.py @@ -1,4 +1,3 @@ -  import fatcat_openapi_client  from .common import EntityImporter, clean @@ -11,18 +10,20 @@ def or_none(s):          return None      return s +  def truthy(s):      if s is None:          return None      s = s.lower() -    if s in ('true', 't', 'yes', 'y', '1'): +    if s in ("true", "t", "yes", "y", "1"):          return True -    elif s in ('false', 'f', 'no', 'n', '0'): +    elif s in ("false", "f", "no", "n", "0"):          return False      else:          return None +  class JournalMetadataImporter(EntityImporter):      """      Imports journal metadata ("containers") by ISSN, currently from a custom @@ -33,17 +34,16 @@ class JournalMetadataImporter(EntityImporter):      def __init__(self, api, **kwargs): -        eg_desc = kwargs.get('editgroup_description', -            "Automated import of container-level metadata, by ISSN. Metadata from Internet Archive munging.") -        eg_extra = kwargs.get('editgroup_extra', dict()) -        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.JournalMetadataImporter') -        super().__init__(api, -            editgroup_description=eg_desc, -            editgroup_extra=eg_extra, -            **kwargs) +        eg_desc = kwargs.get( +            "editgroup_description", +            "Automated import of container-level metadata, by ISSN. 
Metadata from Internet Archive munging.", +        ) +        eg_extra = kwargs.get("editgroup_extra", dict()) +        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.JournalMetadataImporter") +        super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)      def want(self, raw_record): -        if raw_record.get('issnl') and raw_record.get('name'): +        if raw_record.get("issnl") and raw_record.get("name"):              return True          return False @@ -54,52 +54,68 @@ class JournalMetadataImporter(EntityImporter):          returns a ContainerEntity (or None if invalid or couldn't parse)          """ -        if not row.get('name'): +        if not row.get("name"):              # Name is required (by schema)              return None          extra = dict() -        for key in ('issne', 'issnp', 'languages', 'country', 'urls', 'abbrev', -            'coden', 'aliases', 'original_name', 'first_year', 'last_year', -            'platform', 'default_license', 'road', 'mimetypes', -            'sherpa_romeo', 'kbart'): +        for key in ( +            "issne", +            "issnp", +            "languages", +            "country", +            "urls", +            "abbrev", +            "coden", +            "aliases", +            "original_name", +            "first_year", +            "last_year", +            "platform", +            "default_license", +            "road", +            "mimetypes", +            "sherpa_romeo", +            "kbart", +        ):              if row.get(key):                  extra[key] = row[key]          # TODO: not including for now: norwegian, dois/crossref, ia          extra_doaj = dict() -        if row.get('doaj'): -            if row['doaj'].get('as_of'): -                extra_doaj['as_of'] = row['doaj']['as_of'] -            if row['doaj'].get('works'): -                extra_doaj['works'] = row['doaj']['works'] +        if row.get("doaj"): +            if row["doaj"].get("as_of"): +                extra_doaj["as_of"] = row["doaj"]["as_of"] +            if row["doaj"].get("works"): +                extra_doaj["works"] = row["doaj"]["works"]          if extra_doaj: -            extra['doaj'] = extra_doaj +            extra["doaj"] = extra_doaj          extra_ia = dict()          # TODO: would like an ia.longtail_ia flag -        if row.get('sim'): +        if row.get("sim"):              # NB: None case of the .get() here is blech, but othrwise              # extra['ia'].get('sim') would be false-y, breaking 'any_ia_sim' later on -            extra_ia['sim'] = { -                'year_spans': row['sim'].get('year_spans'), +            extra_ia["sim"] = { +                "year_spans": row["sim"].get("year_spans"),              }          if extra_ia: -            extra['ia'] = extra_ia +            extra["ia"] = extra_ia -        name = clean(row.get('name')) +        name = clean(row.get("name"))          if not name:              return None          ce = fatcat_openapi_client.ContainerEntity( -            issnl=row['issnl'], -            issne=row.get('issne'), -            issnp=row.get('issnp'), -            container_type=None, # TODO +            issnl=row["issnl"], +            issne=row.get("issne"), +            issnp=row.get("issnp"), +            container_type=None,  # TODO              name=name, -            publisher=clean(row.get('publisher')), -            wikidata_qid=None, # TODO -            extra=extra) +            publisher=clean(row.get("publisher")), +            
wikidata_qid=None,  # TODO +            extra=extra, +        )          return ce      def try_update(self, ce): @@ -118,23 +134,26 @@ class JournalMetadataImporter(EntityImporter):          # for now, only update KBART, and only if there is new content          if not existing.extra:              existing.extra = dict() -        if ce.extra.get('kbart') and (existing.extra.get('kbart') != ce.extra['kbart']): -            if not existing.extra.get('kbart'): -                existing.extra['kbart'] = {} -            existing.extra['kbart'].update(ce.extra['kbart']) +        if ce.extra.get("kbart") and (existing.extra.get("kbart") != ce.extra["kbart"]): +            if not existing.extra.get("kbart"): +                existing.extra["kbart"] = {} +            existing.extra["kbart"].update(ce.extra["kbart"])              self.api.update_container(self.get_editgroup_id(), existing.ident, existing) -            self.counts['update'] += 1 +            self.counts["update"] += 1              return False          else: -            self.counts['exists'] += 1 +            self.counts["exists"] += 1              return False          # if we got this far, it's a bug          raise NotImplementedError      def insert_batch(self, batch): -        self.api.create_container_auto_batch(fatcat_openapi_client.ContainerAutoBatch( -            editgroup=fatcat_openapi_client.Editgroup( -                description=self.editgroup_description, -                extra=self.editgroup_extra), -            entity_list=batch)) +        self.api.create_container_auto_batch( +            fatcat_openapi_client.ContainerAutoBatch( +                editgroup=fatcat_openapi_client.Editgroup( +                    description=self.editgroup_description, extra=self.editgroup_extra +                ), +                entity_list=batch, +            ) +        ) diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py index d37424d6..8c7bfad4 100644 --- a/python/fatcat_tools/importers/jstor.py +++ b/python/fatcat_tools/importers/jstor.py @@ -1,4 +1,3 @@ -  import datetime  import json  import sys @@ -12,10 +11,10 @@ from .crossref import CONTAINER_TYPE_MAP  # TODO: more entries?  
JSTOR_CONTRIB_MAP = { -    'author': 'author', -    'editor': 'editor', -    'translator': 'translator', -    'illustrator': 'illustrator', +    "author": "author", +    "editor": "editor", +    "translator": "translator", +    "illustrator": "illustrator",  }  JSTOR_TYPE_MAP = { @@ -26,6 +25,7 @@ JSTOR_TYPE_MAP = {      "research-article": "article-journal",  } +  class JstorImporter(EntityImporter):      """      Importer for JSTOR bulk XML metadata (eg, from their Early Journals @@ -34,17 +34,18 @@ class JstorImporter(EntityImporter):      def __init__(self, api, issn_map_file, **kwargs): -        eg_desc = kwargs.get('editgroup_description', -            "Automated import of JSTOR XML metadata") -        eg_extra = kwargs.get('editgroup_extra', dict()) -        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.JstorImporter') -        super().__init__(api, +        eg_desc = kwargs.get("editgroup_description", "Automated import of JSTOR XML metadata") +        eg_extra = kwargs.get("editgroup_extra", dict()) +        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.JstorImporter") +        super().__init__( +            api,              issn_map_file=issn_map_file,              editgroup_description=eg_desc,              editgroup_extra=eg_extra, -            **kwargs) +            **kwargs +        ) -        self.create_containers = kwargs.get('create_containers', True) +        self.create_containers = kwargs.get("create_containers", True)          self.read_issn_map_file(issn_map_file) @@ -62,20 +63,22 @@ class JstorImporter(EntityImporter):          extra = dict()          extra_jstor = dict() -        release_type = JSTOR_TYPE_MAP.get(article['article-type']) +        release_type = JSTOR_TYPE_MAP.get(article["article-type"])          title = article_meta.find("article-title")          if title and title.get_text(): -            title = title.get_text().replace('\n', ' ').strip() +            title = title.get_text().replace("\n", " ").strip()          elif title and not title.get_text():              title = None -        if not title and release_type.startswith('review') and article_meta.product.source: -            title = "Review: {}".format(article_meta.product.source.replace('\n', ' ').get_text()) +        if not title and release_type.startswith("review") and article_meta.product.source: +            title = "Review: {}".format( +                article_meta.product.source.replace("\n", " ").get_text() +            )          if not title:              return None -        if title.endswith('.'): +        if title.endswith("."):              title = title[:-1]          if "[Abstract]" in title: @@ -93,12 +96,12 @@ class JstorImporter(EntityImporter):              title = title[1:-1]          # JSTOR journal-id -        journal_ids = [j.string for j in journal_meta.find_all('journal-id')] +        journal_ids = [j.string for j in journal_meta.find_all("journal-id")]          if journal_ids: -            extra_jstor['journal_ids'] = journal_ids +            extra_jstor["journal_ids"] = journal_ids -        journal_title = journal_meta.find("journal-title").get_text().replace('\n', ' ') -        publisher = journal_meta.find("publisher-name").get_text().replace('\n', ' ') +        journal_title = journal_meta.find("journal-title").get_text().replace("\n", " ") +        publisher = journal_meta.find("publisher-name").get_text().replace("\n", " ")          issn = journal_meta.find("issn")          if issn:              issn = issn.string @@ -113,13 +116,18 @@ 
class JstorImporter(EntityImporter):              container_id = self.lookup_issnl(issnl)          # create container if it doesn't exist -        if (container_id is None and self.create_containers and (issnl is not None) -                and journal_title): +        if ( +            container_id is None +            and self.create_containers +            and (issnl is not None) +            and journal_title +        ):              ce = fatcat_openapi_client.ContainerEntity(                  issnl=issnl,                  publisher=publisher,                  container_type=self.map_container_type(release_type), -                name=clean(journal_title, force_xml=True)) +                name=clean(journal_title, force_xml=True), +            )              ce_edit = self.create_container(ce)              container_id = ce_edit.ident              self._issnl_id_map[issnl] = container_id @@ -132,8 +140,8 @@ class JstorImporter(EntityImporter):          if jstor_id:              jstor_id = jstor_id.string.strip()          if not jstor_id and doi: -            assert doi.startswith('10.2307/') -            jstor_id = doi.replace('10.2307/', '') +            assert doi.startswith("10.2307/") +            jstor_id = doi.replace("10.2307/", "")          assert jstor_id and int(jstor_id)          contribs = [] @@ -142,13 +150,13 @@ class JstorImporter(EntityImporter):              for c in cgroup.find_all("contrib"):                  given = c.find("given-names")                  if given: -                    given = clean(given.get_text().replace('\n', ' ')) +                    given = clean(given.get_text().replace("\n", " "))                  surname = c.find("surname")                  if surname: -                    surname = clean(surname.get_text().replace('\n', ' ')) +                    surname = clean(surname.get_text().replace("\n", " "))                  raw_name = c.find("string-name")                  if raw_name: -                    raw_name = clean(raw_name.get_text().replace('\n', ' ')) +                    raw_name = clean(raw_name.get_text().replace("\n", " "))                  if not raw_name:                      if given and surname: @@ -156,15 +164,17 @@ class JstorImporter(EntityImporter):                      elif surname:                          raw_name = surname -                role = JSTOR_CONTRIB_MAP.get(c.get('contrib-type', 'author')) -                if not role and c.get('contrib-type'): -                    sys.stderr.write("NOT IN JSTOR_CONTRIB_MAP: {}\n".format(c['contrib-type'])) -                contribs.append(fatcat_openapi_client.ReleaseContrib( -                    role=role, -                    raw_name=raw_name, -                    given_name=given, -                    surname=surname, -                )) +                role = JSTOR_CONTRIB_MAP.get(c.get("contrib-type", "author")) +                if not role and c.get("contrib-type"): +                    sys.stderr.write("NOT IN JSTOR_CONTRIB_MAP: {}\n".format(c["contrib-type"])) +                contribs.append( +                    fatcat_openapi_client.ReleaseContrib( +                        role=role, +                        raw_name=raw_name, +                        given_name=given, +                        surname=surname, +                    ) +                )          for i, contrib in enumerate(contribs):              if contrib.raw_name != "et al.": @@ -172,14 +182,13 @@ class JstorImporter(EntityImporter):          release_year = None          release_date = None -    
    pub_date = article_meta.find('pub-date') +        pub_date = article_meta.find("pub-date")          if pub_date and pub_date.year:              release_year = int(pub_date.year.string)              if pub_date.month and pub_date.day:                  release_date = datetime.date( -                    release_year, -                    int(pub_date.month.string), -                    int(pub_date.day.string)) +                    release_year, int(pub_date.month.string), int(pub_date.day.string) +                )                  if release_date.day == 1 and release_date.month == 1:                      # suspect jan 1st dates get set by JSTOR when actual                      # date not known (citation needed), so drop them @@ -208,10 +217,10 @@ class JstorImporter(EntityImporter):                  warnings.warn("MISSING MARC LANG: {}".format(cm.find("meta-value").string))          # JSTOR issue-id -        if article_meta.find('issue-id'): -            issue_id = clean(article_meta.find('issue-id').string) +        if article_meta.find("issue-id"): +            issue_id = clean(article_meta.find("issue-id").string)              if issue_id: -                extra_jstor['issue_id'] = issue_id +                extra_jstor["issue_id"] = issue_id          # everything in JSTOR is published          release_stage = "published" @@ -225,14 +234,14 @@ class JstorImporter(EntityImporter):          #   group-title          #   pubmed: retraction refs          if extra_jstor: -            extra['jstor'] = extra_jstor +            extra["jstor"] = extra_jstor          if not extra:              extra = None          re = fatcat_openapi_client.ReleaseEntity( -            #work_id +            # work_id              title=title, -            #original_title +            # original_title              release_type=release_type,              release_stage=release_stage,              release_date=release_date, @@ -246,21 +255,16 @@ class JstorImporter(EntityImporter):              pages=pages,              publisher=publisher,              language=language, -            #license_slug - +            # license_slug              # content, mimetype, lang -            #abstracts=abstracts, - +            # abstracts=abstracts,              contribs=contribs, -              # key, year, container_name, title, locator              # extra: volume, authors, issue, publisher, identifiers -            #refs=refs, - +            # refs=refs,              #   name, type, publisher, issnl              #   extra: issnp, issne, original_name, languages, country              container_id=container_id, -              extra=extra,          )          return re @@ -289,12 +293,12 @@ class JstorImporter(EntityImporter):          if existing and existing.ext_ids.jstor:              # don't update if it already has JSTOR ID -            self.counts['exists'] += 1 +            self.counts["exists"] += 1              return False          elif existing:              # but do update if only DOI was set              existing.ext_ids.jstor = re.ext_ids.jstor -            existing.extra['jstor'] = re.extra['jstor'] +            existing.extra["jstor"] = re.extra["jstor"]              # better release_type detection, and some other fields              # TODO: don't do this over-writing in the future? 
assuming here              # this is a one-time batch import over/extending bootstrap crossref @@ -304,17 +308,20 @@ class JstorImporter(EntityImporter):              existing.contribs = re.contribs              existing.language = re.language              self.api.update_release(self.get_editgroup_id(), existing.ident, existing) -            self.counts['update'] += 1 +            self.counts["update"] += 1              return False          return True      def insert_batch(self, batch): -        self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( -            editgroup=fatcat_openapi_client.Editgroup( -                description=self.editgroup_description, -                extra=self.editgroup_extra), -            entity_list=batch)) +        self.api.create_release_auto_batch( +            fatcat_openapi_client.ReleaseAutoBatch( +                editgroup=fatcat_openapi_client.Editgroup( +                    description=self.editgroup_description, extra=self.editgroup_extra +                ), +                entity_list=batch, +            ) +        )      def parse_file(self, handle): @@ -325,8 +332,9 @@ class JstorImporter(EntityImporter):          for article in soup.find_all("article"):              resp = self.parse_record(article)              print(json.dumps(resp)) -            #sys.exit(-1) +            # sys.exit(-1) + -if __name__=='__main__': +if __name__ == "__main__":      parser = JstorImporter(None, None)      parser.parse_file(open(sys.argv[1])) diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py index 09807276..7c2a6a87 100644 --- a/python/fatcat_tools/importers/matched.py +++ b/python/fatcat_tools/importers/matched.py @@ -1,4 +1,3 @@ -  import fatcat_openapi_client  from fatcat_tools.normal import clean_doi @@ -32,13 +31,13 @@ class MatchedImporter(EntityImporter):      def __init__(self, api, **kwargs): -        eg_desc = kwargs.pop('editgroup_description', None) or "Import of large-scale file-to-release match results. Source of metadata varies." -        eg_extra = kwargs.pop('editgroup_extra', dict()) -        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.MatchedImporter') -        super().__init__(api, -            editgroup_description=eg_desc, -            editgroup_extra=eg_extra, -            **kwargs) +        eg_desc = ( +            kwargs.pop("editgroup_description", None) +            or "Import of large-scale file-to-release match results. Source of metadata varies." 
+        ) +        eg_extra = kwargs.pop("editgroup_extra", dict()) +        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.MatchedImporter") +        super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)          self.default_link_rel = kwargs.get("default_link_rel", "web")          self.default_mimetype = kwargs.get("default_mimetype", None) @@ -46,14 +45,14 @@ class MatchedImporter(EntityImporter):          return True      def parse_record(self, obj): -        dois = [d.lower() for d in obj.get('dois', [])] +        dois = [d.lower() for d in obj.get("dois", [])]          # lookup dois          re_list = set()          for doi in dois:              doi = clean_doi(doi)              if not doi: -                self.counts['skip-bad-doi'] += 1 +                self.counts["skip-bad-doi"] += 1                  return None              try:                  re = self.api.lookup_release(doi=doi) @@ -62,13 +61,22 @@ class MatchedImporter(EntityImporter):                      raise err                  re = None              if re is None: -                #print("DOI not found: {}".format(doi)) +                # print("DOI not found: {}".format(doi))                  pass              else:                  re_list.add(re.ident)          # look up other external ids -        for extid_type in ('arxiv', 'pmid', 'pmcid', 'jstor', 'wikidata_qid', 'core', 'isbn13', 'ark'): +        for extid_type in ( +            "arxiv", +            "pmid", +            "pmcid", +            "jstor", +            "wikidata_qid", +            "core", +            "isbn13", +            "ark", +        ):              extid = obj.get(extid_type)              if extid:                  try: @@ -84,49 +92,47 @@ class MatchedImporter(EntityImporter):          release_ids = list(re_list)          if len(release_ids) == 0: -            self.counts['skip-no-releases'] += 1 +            self.counts["skip-no-releases"] += 1              return None          if len(release_ids) > SANE_MAX_RELEASES: -            self.counts['skip-too-many-releases'] += 1 +            self.counts["skip-too-many-releases"] += 1              return None          # parse URLs and CDX          urls = set() -        for url in obj.get('urls', []): +        for url in obj.get("urls", []):              url = make_rel_url(url, default_link_rel=self.default_link_rel)              if url is not None:                  urls.add(url) -        for cdx in obj.get('cdx', []): -            original = cdx['url'] -            if cdx.get('dt'): -                wayback = "https://web.archive.org/web/{}/{}".format( -                    cdx['dt'], -                    original) +        for cdx in obj.get("cdx", []): +            original = cdx["url"] +            if cdx.get("dt"): +                wayback = "https://web.archive.org/web/{}/{}".format(cdx["dt"], original)                  urls.add(("webarchive", wayback))              url = make_rel_url(original, default_link_rel=self.default_link_rel)              if url is not None:                  urls.add(url)          urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls]          if len(urls) == 0: -            self.counts['skip-no-urls'] += 1 +            self.counts["skip-no-urls"] += 1              return None          if len(urls) > SANE_MAX_URLS: -            self.counts['skip-too-many-urls'] += 1 +            self.counts["skip-too-many-urls"] += 1              return None -        size = obj.get('size') +        size = 
obj.get("size")          if size:              size = int(size) -        mimetype = obj.get('mimetype', self.default_mimetype) +        mimetype = obj.get("mimetype", self.default_mimetype)          if not mimetype and urls: -            if urls[0].url.endswith('.pdf'): -                mimetype = 'application/pdf' +            if urls[0].url.endswith(".pdf"): +                mimetype = "application/pdf"          fe = fatcat_openapi_client.FileEntity( -            md5=obj.get('md5'), -            sha1=obj['sha1'], -            sha256=obj.get('sha256'), +            md5=obj.get("md5"), +            sha1=obj["sha1"], +            sha256=obj.get("sha256"),              size=size,              mimetype=mimetype,              release_ids=release_ids, @@ -149,28 +155,30 @@ class MatchedImporter(EntityImporter):          combined_release_ids = list(set(fe.release_ids + existing.release_ids))          if set(combined_release_ids) == set(existing.release_ids) and len(existing.urls) > 0:              # no new release matches *and* there are already existing URLs -            self.counts['exists'] += 1 +            self.counts["exists"] += 1              return False          # check for edit conflicts          if existing.ident in [e.ident for e in self._edits_inflight]: -            self.counts['skip-update-inflight'] += 1 +            self.counts["skip-update-inflight"] += 1              return False          # minimum viable "existing" URL cleanup to fix dupes and broken links:          # remove 'None' wayback URLs, and set archive.org rel 'archive' -        existing.urls = [u for u in existing.urls if not ('://web.archive.org/web/None/' in u.url)] +        existing.urls = [ +            u for u in existing.urls if not ("://web.archive.org/web/None/" in u.url) +        ]          for i in range(len(existing.urls)):              u = existing.urls[i] -            if u.rel == 'repository' and '://archive.org/download/' in u.url: -                existing.urls[i].rel = 'archive' +            if u.rel == "repository" and "://archive.org/download/" in u.url: +                existing.urls[i].rel = "archive"          # special case: if importing *new* from archive.org arxiv collections,          # blow away any existing release_id mappings; this is a direct arxiv_id          # map. This *should* be safe to run in all matched imports.          
is_arxiv = False          for u in fe.urls: -            if 'archive.org/download/arxiv' in u.url.lower(): +            if "archive.org/download/arxiv" in u.url.lower():                  is_arxiv = True                  break          if is_arxiv and fe.release_ids: @@ -178,14 +186,16 @@ class MatchedImporter(EntityImporter):          # merge the existing into this one and update          existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls])) -        existing.urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in existing.urls] +        existing.urls = [ +            fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in existing.urls +        ]          if len(existing.urls) > SANE_MAX_URLS: -            self.counts['skip-update-too-many-url'] += 1 +            self.counts["skip-update-too-many-url"] += 1              return None          existing.release_ids = list(set(fe.release_ids + existing.release_ids))          if len(existing.release_ids) > SANE_MAX_RELEASES: -            self.counts['skip-update-too-many-releases'] += 1 +            self.counts["skip-update-too-many-releases"] += 1              return None          existing.mimetype = existing.mimetype or fe.mimetype          existing.size = existing.size or fe.size @@ -194,12 +204,15 @@ class MatchedImporter(EntityImporter):          existing.sha256 = existing.sha256 or fe.sha256          edit = self.api.update_file(self.get_editgroup_id(), existing.ident, existing)          self._edits_inflight.append(edit) -        self.counts['update'] += 1 +        self.counts["update"] += 1          return False      def insert_batch(self, batch): -        self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch( -            editgroup=fatcat_openapi_client.Editgroup( -                description=self.editgroup_description, -                extra=self.editgroup_extra), -            entity_list=batch)) +        self.api.create_file_auto_batch( +            fatcat_openapi_client.FileAutoBatch( +                editgroup=fatcat_openapi_client.Editgroup( +                    description=self.editgroup_description, extra=self.editgroup_extra +                ), +                entity_list=batch, +            ) +        ) diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py index 3bdd23a1..b514e6e5 100644 --- a/python/fatcat_tools/importers/orcid.py +++ b/python/fatcat_tools/importers/orcid.py @@ -1,4 +1,3 @@ -  import sys  import fatcat_openapi_client @@ -8,7 +7,7 @@ from .common import EntityImporter, clean  def value_or_none(e):      if type(e) == dict: -        e = e.get('value') +        e = e.get("value")      if type(e) == str and len(e) == 0:          e = None      # TODO: this is probably bogus; patched in desperation; remove? 
@@ -21,18 +20,17 @@ def value_or_none(e):              return None      return e -class OrcidImporter(EntityImporter): +class OrcidImporter(EntityImporter):      def __init__(self, api, **kwargs): -        eg_desc = kwargs.get('editgroup_description', -            "Automated import of ORCID metadata, from official bulk releases.") -        eg_extra = kwargs.get('editgroup_extra', dict()) -        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.OrcidImporter') -        super().__init__(api, -            editgroup_description=eg_desc, -            editgroup_extra=eg_extra, -            **kwargs) +        eg_desc = kwargs.get( +            "editgroup_description", +            "Automated import of ORCID metadata, from official bulk releases.", +        ) +        eg_extra = kwargs.get("editgroup_extra", dict()) +        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.OrcidImporter") +        super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)      def want(self, raw_record):          return True @@ -43,16 +41,16 @@ class OrcidImporter(EntityImporter):          returns a CreatorEntity          """ -        if 'person' not in obj: +        if "person" not in obj:              return False -        name = obj['person']['name'] +        name = obj["person"]["name"]          if not name:              return None          extra = None -        given = value_or_none(name.get('given-names')) -        sur = value_or_none(name.get('family-name')) -        display = value_or_none(name.get('credit-name')) +        given = value_or_none(name.get("given-names")) +        sur = value_or_none(name.get("family-name")) +        display = value_or_none(name.get("credit-name"))          if display is None:              # TODO: sorry human beings              if given and sur: @@ -61,7 +59,7 @@ class OrcidImporter(EntityImporter):                  display = sur              elif given:                  display = given -        orcid = obj['orcid-identifier']['path'] +        orcid = obj["orcid-identifier"]["path"]          if not self.is_orcid(orcid):              sys.stderr.write("Bad ORCID: {}\n".format(orcid))              return None @@ -74,7 +72,8 @@ class OrcidImporter(EntityImporter):              given_name=clean(given),              surname=clean(sur),              display_name=display, -            extra=extra) +            extra=extra, +        )          return ce      def try_update(self, raw_record): @@ -88,14 +87,17 @@ class OrcidImporter(EntityImporter):          # eventually we'll want to support "updates", but for now just skip if          # entity already exists          if existing: -            self.counts['exists'] += 1 +            self.counts["exists"] += 1              return False          return True      def insert_batch(self, batch): -        self.api.create_creator_auto_batch(fatcat_openapi_client.CreatorAutoBatch( -            editgroup=fatcat_openapi_client.Editgroup( -                description=self.editgroup_description, -                extra=self.editgroup_extra), -            entity_list=batch)) +        self.api.create_creator_auto_batch( +            fatcat_openapi_client.CreatorAutoBatch( +                editgroup=fatcat_openapi_client.Editgroup( +                    description=self.editgroup_description, extra=self.editgroup_extra +                ), +                entity_list=batch, +            ) +        ) diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py index 
00ad54d0..cfdafcf7 100644 --- a/python/fatcat_tools/importers/pubmed.py +++ b/python/fatcat_tools/importers/pubmed.py @@ -1,4 +1,3 @@ -  import datetime  import json  import sys @@ -13,42 +12,42 @@ from .common import LANG_MAP_MARC, EntityImporter, clean  # from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly  PUBMED_RELEASE_TYPE_MAP = { -    #Adaptive Clinical Trial +    # Adaptive Clinical Trial      "Address": "speech",      "Autobiography": "book", -    #Bibliography +    # Bibliography      "Biography": "book", -    #Case Reports +    # Case Reports      "Classical Article": "article-journal", -    #Clinical Conference -    #Clinical Study -    #Clinical Trial -    #Clinical Trial, Phase I -    #Clinical Trial, Phase II -    #Clinical Trial, Phase III -    #Clinical Trial, Phase IV -    #Clinical Trial Protocol -    #Clinical Trial, Veterinary -    #Collected Works -    #Comparative Study -    #Congress -    #Consensus Development Conference -    #Consensus Development Conference, NIH -    #Controlled Clinical Trial +    # Clinical Conference +    # Clinical Study +    # Clinical Trial +    # Clinical Trial, Phase I +    # Clinical Trial, Phase II +    # Clinical Trial, Phase III +    # Clinical Trial, Phase IV +    # Clinical Trial Protocol +    # Clinical Trial, Veterinary +    # Collected Works +    # Comparative Study +    # Congress +    # Consensus Development Conference +    # Consensus Development Conference, NIH +    # Controlled Clinical Trial      "Dataset": "dataset", -    #Dictionary -    #Directory -    #Duplicate Publication +    # Dictionary +    # Directory +    # Duplicate Publication      "Editorial": "editorial", -    #English Abstract   # doesn't indicate that this is abstract-only -    #Equivalence Trial -    #Evaluation Studies -    #Expression of Concern -    #Festschrift -    #Government Document -    #Guideline +    # English Abstract   # doesn't indicate that this is abstract-only +    # Equivalence Trial +    # Evaluation Studies +    # Expression of Concern +    # Festschrift +    # Government Document +    # Guideline      "Historical Article": "article-journal", -    #Interactive Tutorial +    # Interactive Tutorial      "Interview": "interview",      "Introductory Journal Article": "article-journal",      "Journal Article": "article-journal", @@ -56,53 +55,65 @@ PUBMED_RELEASE_TYPE_MAP = {      "Legal Case": "legal_case",      "Legislation": "legislation",      "Letter": "letter", -    #Meta-Analysis -    #Multicenter Study -    #News +    # Meta-Analysis +    # Multicenter Study +    # News      "Newspaper Article": "article-newspaper", -    #Observational Study -    #Observational Study, Veterinary -    #Overall -    #Patient Education Handout -    #Periodical Index -    #Personal Narrative -    #Portrait -    #Practice Guideline -    #Pragmatic Clinical Trial -    #Publication Components -    #Publication Formats -    #Publication Type Category -    #Randomized Controlled Trial -    #Research Support, American Recovery and Reinvestment Act -    #Research Support, N.I.H., Extramural -    #Research Support, N.I.H., Intramural -    #Research Support, Non-U.S. Gov't Research Support, U.S. Gov't, Non-P.H.S. -    #Research Support, U.S. Gov't, P.H.S. 
-    #Review     # in the "literature review" sense, not "product review" -    #Scientific Integrity Review -    #Study Characteristics -    #Support of Research -    #Systematic Review +    # Observational Study +    # Observational Study, Veterinary +    # Overall +    # Patient Education Handout +    # Periodical Index +    # Personal Narrative +    # Portrait +    # Practice Guideline +    # Pragmatic Clinical Trial +    # Publication Components +    # Publication Formats +    # Publication Type Category +    # Randomized Controlled Trial +    # Research Support, American Recovery and Reinvestment Act +    # Research Support, N.I.H., Extramural +    # Research Support, N.I.H., Intramural +    # Research Support, Non-U.S. Gov't Research Support, U.S. Gov't, Non-P.H.S. +    # Research Support, U.S. Gov't, P.H.S. +    # Review     # in the "literature review" sense, not "product review" +    # Scientific Integrity Review +    # Study Characteristics +    # Support of Research +    # Systematic Review      "Technical Report": "report", -    #Twin Study -    #Validation Studies -    #Video-Audio Media -    #Webcasts +    # Twin Study +    # Validation Studies +    # Video-Audio Media +    # Webcasts  }  MONTH_ABBR_MAP = { -    "Jan":  1, "01":  1, -    "Feb":  2, "02":  2, -    "Mar":  3, "03":  3, -    "Apr":  4, "04":  4, -    "May":  5, "05":  5, -    "Jun":  6, "06":  6, -    "Jul":  7, "07":  7, -    "Aug":  8, "08":  8, -    "Sep":  9, "09":  9, -    "Oct": 10, "10": 10, -    "Nov": 11, "11": 11, -    "Dec": 12, "12": 12, +    "Jan": 1, +    "01": 1, +    "Feb": 2, +    "02": 2, +    "Mar": 3, +    "03": 3, +    "Apr": 4, +    "04": 4, +    "May": 5, +    "05": 5, +    "Jun": 6, +    "06": 6, +    "Jul": 7, +    "07": 7, +    "Aug": 8, +    "08": 8, +    "Sep": 9, +    "09": 9, +    "Oct": 10, +    "10": 10, +    "Nov": 11, +    "11": 11, +    "Dec": 12, +    "12": 12,  }  # From: https://www.ncbi.nlm.nih.gov/books/NBK7249/ @@ -295,11 +306,10 @@ COUNTRY_NAME_MAP = {      "United Kingdom": "gb",      "United States": "us",      "Uruguay": "uy", -      # Additions from running over large files      "Bosnia and Herzegovina": "ba", -    #"International" -    "China (Republic : 1949- )": "tw", # pretty sure this is tw not cn +    # "International" +    "China (Republic : 1949- )": "tw",  # pretty sure this is tw not cn      "Russia (Federation)": "ru",      "Scotland": "gb",      "England": "gb", @@ -320,18 +330,21 @@ class PubmedImporter(EntityImporter):      def __init__(self, api, issn_map_file, lookup_refs=True, **kwargs): -        eg_desc = kwargs.get('editgroup_description', -            "Automated import of PubMed/MEDLINE XML metadata") -        eg_extra = kwargs.get('editgroup_extra', dict()) -        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.PubmedImporter') -        super().__init__(api, +        eg_desc = kwargs.get( +            "editgroup_description", "Automated import of PubMed/MEDLINE XML metadata" +        ) +        eg_extra = kwargs.get("editgroup_extra", dict()) +        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.PubmedImporter") +        super().__init__( +            api,              issn_map_file=issn_map_file,              editgroup_description=eg_desc,              editgroup_extra=eg_extra, -            **kwargs) +            **kwargs +        )          self.lookup_refs = lookup_refs -        self.create_containers = kwargs.get('create_containers', True) +        self.create_containers = kwargs.get("create_containers", True)          
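As a worked illustration of the two lookup tables reformatted above, a short sketch; the specific date is hypothetical, but the expected values follow directly from the visible table entries.

    import datetime

    from fatcat_tools.importers.pubmed import MONTH_ABBR_MAP, PUBMED_RELEASE_TYPE_MAP

    # PublicationType strings that stay commented out in the map simply
    # yield no release_type.
    PUBMED_RELEASE_TYPE_MAP.get("Journal Article")  # -> "article-journal"
    PUBMED_RELEASE_TYPE_MAP.get("Meta-Analysis")    # -> None (commented out)

    # MONTH_ABBR_MAP accepts both "Apr"-style and zero-padded numeric months,
    # which is how parse_record() below turns an ArticleDate into a release_date:
    datetime.date(2019, MONTH_ABBR_MAP["Apr"], 3).isoformat()  # -> "2019-04-03"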
self.read_issn_map_file(issn_map_file)      def want(self, obj): @@ -365,15 +378,15 @@ class PubmedImporter(EntityImporter):                  release_type = PUBMED_RELEASE_TYPE_MAP[pub_type.string]                  break          if pub_types: -            extra_pubmed['pub_types'] = pub_types +            extra_pubmed["pub_types"] = pub_types          if medline.Article.PublicationTypeList.find(string="Retraction of Publication"):              release_type = "retraction"              retraction_of = medline.find("CommentsCorrections", RefType="RetractionOf")              if retraction_of:                  if retraction_of.RefSource: -                    extra_pubmed['retraction_of_raw'] = retraction_of.RefSource.string +                    extra_pubmed["retraction_of_raw"] = retraction_of.RefSource.string                  if retraction_of.PMID: -                    extra_pubmed['retraction_of_pmid'] = retraction_of.PMID.string +                    extra_pubmed["retraction_of_pmid"] = retraction_of.PMID.string          # everything in medline is published          release_stage = "published" @@ -388,18 +401,18 @@ class PubmedImporter(EntityImporter):          elif medline.find("CommentsCorrections", RefType="ExpressionOfConcernIn"):              withdrawn_status = "concern" -        pages = medline.find('MedlinePgn') +        pages = medline.find("MedlinePgn")          if pages:              pages = pages.string -        title = medline.Article.ArticleTitle.get_text() # always present +        title = medline.Article.ArticleTitle.get_text()  # always present          if title: -            title = title.replace('\n', ' ') -            if title.endswith('.'): +            title = title.replace("\n", " ") +            if title.endswith("."):                  title = title[:-1]              # this hides some "special" titles, but the vast majority are              # translations; translations don't always include the original_title -            if title.startswith('[') and title.endswith(']'): +            if title.startswith("[") and title.endswith("]"):                  title = title[1:-1]          else:              # will filter out later @@ -408,8 +421,8 @@ class PubmedImporter(EntityImporter):          original_title = medline.Article.find("VernacularTitle", recurse=False)          if original_title:              original_title = original_title.get_text() or None -            original_title = original_title.replace('\n', ' ') -            if original_title and original_title.endswith('.'): +            original_title = original_title.replace("\n", " ") +            if original_title and original_title.endswith("."):                  original_title = original_title[:-1]          if original_title and not title: @@ -428,7 +441,9 @@ class PubmedImporter(EntityImporter):              else:                  language = LANG_MAP_MARC.get(language)                  if not language and not (medline.Article.Language.get_text() in LANG_MAP_MARC): -                    warnings.warn("MISSING MARC LANG: {}".format(medline.Article.Language.string)) +                    warnings.warn( +                        "MISSING MARC LANG: {}".format(medline.Article.Language.string) +                    )          ### Journal/Issue Metadata          # MedlineJournalInfo is always present @@ -441,9 +456,9 @@ class PubmedImporter(EntityImporter):              country_name = mji.Country.string.strip()              country_code = COUNTRY_NAME_MAP.get(country_name)              if country_code: -                
container_extra['country'] = country_code +                container_extra["country"] = country_code              elif country_name: -                container_extra['country_name'] = country_name +                container_extra["country_name"] = country_name          if mji.find("ISSNLinking"):              issnl = mji.ISSNLinking.string @@ -462,7 +477,7 @@ class PubmedImporter(EntityImporter):          if issnl:              container_id = self.lookup_issnl(issnl) -        pub_date = medline.Article.find('ArticleDate') +        pub_date = medline.Article.find("ArticleDate")          if not pub_date:              pub_date = journal.PubDate          if not pub_date: @@ -476,7 +491,8 @@ class PubmedImporter(EntityImporter):                      release_date = datetime.date(                          release_year,                          MONTH_ABBR_MAP[pub_date.Month.string], -                        int(pub_date.Day.string)) +                        int(pub_date.Day.string), +                    )                      release_date = release_date.isoformat()                  except ValueError as ve:                      print("bad date, skipping: {}".format(ve), file=sys.stderr) @@ -486,25 +502,35 @@ class PubmedImporter(EntityImporter):              if len(medline_date) >= 4 and medline_date[:4].isdigit():                  release_year = int(medline_date[:4])                  if release_year < 1300 or release_year > 2040: -                    print("bad medline year, skipping: {}".format(release_year), file=sys.stderr) +                    print( +                        "bad medline year, skipping: {}".format(release_year), file=sys.stderr +                    )                      release_year = None              else: -                print("unparsable medline date, skipping: {}".format(medline_date), file=sys.stderr) +                print( +                    "unparsable medline date, skipping: {}".format(medline_date), +                    file=sys.stderr, +                )          if journal.find("Title"):              container_name = journal.Title.get_text() -        if (container_id is None and self.create_containers and (issnl is not None) -                and container_name): +        if ( +            container_id is None +            and self.create_containers +            and (issnl is not None) +            and container_name +        ):              # name, type, publisher, issnl              # extra: original_name, languages, country              ce = fatcat_openapi_client.ContainerEntity(                  name=container_name, -                container_type='journal', -                #NOTE: publisher not included +                container_type="journal", +                # NOTE: publisher not included                  issnl=issnl,                  issnp=issnp, -                extra=(container_extra or None)) +                extra=(container_extra or None), +            )              ce_edit = self.create_container(ce)              container_id = ce_edit.ident              self._issnl_id_map[issnl] = container_id @@ -521,8 +547,10 @@ class PubmedImporter(EntityImporter):          # "All abstracts are in English"          abstracts = []          primary_abstract = medline.find("Abstract") -        if primary_abstract and primary_abstract.AbstractText.get('NlmCategory'): -            joined = "\n".join([m.get_text() for m in primary_abstract.find_all("AbstractText")]) +        if primary_abstract and primary_abstract.AbstractText.get("NlmCategory"): +            
joined = "\n".join( +                [m.get_text() for m in primary_abstract.find_all("AbstractText")] +            )              abst = fatcat_openapi_client.ReleaseAbstract(                  content=joined,                  mimetype="text/plain", @@ -539,7 +567,7 @@ class PubmedImporter(EntityImporter):                  )                  if abst.content:                      abstracts.append(abst) -                if abstract.find('math'): +                if abstract.find("math"):                      abst = fatcat_openapi_client.ReleaseAbstract(                          # strip the <AbstractText> tags                          content=str(abstract)[14:-15], @@ -551,8 +579,8 @@ class PubmedImporter(EntityImporter):          other_abstracts = medline.find_all("OtherAbstract")          for other in other_abstracts:              lang = "en" -            if other.get('Language'): -                lang = LANG_MAP_MARC.get(other['Language']) +            if other.get("Language"): +                lang = LANG_MAP_MARC.get(other["Language"])              abst = fatcat_openapi_client.ReleaseAbstract(                  content=other.AbstractText.get_text().strip(),                  mimetype="text/plain", @@ -572,15 +600,15 @@ class PubmedImporter(EntityImporter):                  surname = None                  raw_name = None                  if author.ForeName: -                    given_name = author.ForeName.get_text().replace('\n', ' ') +                    given_name = author.ForeName.get_text().replace("\n", " ")                  if author.LastName: -                    surname = author.LastName.get_text().replace('\n', ' ') +                    surname = author.LastName.get_text().replace("\n", " ")                  if given_name and surname:                      raw_name = "{} {}".format(given_name, surname)                  elif surname:                      raw_name = surname                  if not raw_name and author.CollectiveName and author.CollectiveName.get_text(): -                    raw_name = author.CollectiveName.get_text().replace('\n', ' ') +                    raw_name = author.CollectiveName.get_text().replace("\n", " ")                  contrib_extra = dict()                  orcid = author.find("Identifier", Source="ORCID")                  if orcid: @@ -590,7 +618,7 @@ class PubmedImporter(EntityImporter):                          orcid = orcid.replace("http://orcid.org/", "")                      elif orcid.startswith("https://orcid.org/"):                          orcid = orcid.replace("https://orcid.org/", "") -                    elif '-' not in orcid: +                    elif "-" not in orcid:                          orcid = "{}-{}-{}-{}".format(                              orcid[0:4],                              orcid[4:8], @@ -598,27 +626,31 @@ class PubmedImporter(EntityImporter):                              orcid[12:16],                          )                      creator_id = self.lookup_orcid(orcid) -                    contrib_extra['orcid'] = orcid +                    contrib_extra["orcid"] = orcid                  affiliations = author.find_all("Affiliation")                  raw_affiliation = None                  if affiliations: -                    raw_affiliation = affiliations[0].get_text().replace('\n', ' ') +                    raw_affiliation = affiliations[0].get_text().replace("\n", " ")                      if len(affiliations) > 1: -                        contrib_extra['more_affiliations'] = [ra.get_text().replace('\n', ' ') for 
ra in affiliations[1:]] +                        contrib_extra["more_affiliations"] = [ +                            ra.get_text().replace("\n", " ") for ra in affiliations[1:] +                        ]                  if author.find("EqualContrib"):                      # TODO: schema for this? -                    contrib_extra['equal'] = True -                contribs.append(fatcat_openapi_client.ReleaseContrib( -                    raw_name=raw_name, -                    given_name=given_name, -                    surname=surname, -                    role="author", -                    raw_affiliation=raw_affiliation, -                    creator_id=creator_id, -                    extra=contrib_extra, -                )) - -            if medline.AuthorList['CompleteYN'] == 'N': +                    contrib_extra["equal"] = True +                contribs.append( +                    fatcat_openapi_client.ReleaseContrib( +                        raw_name=raw_name, +                        given_name=given_name, +                        surname=surname, +                        role="author", +                        raw_affiliation=raw_affiliation, +                        creator_id=creator_id, +                        extra=contrib_extra, +                    ) +                ) + +            if medline.AuthorList["CompleteYN"] == "N":                  contribs.append(fatcat_openapi_client.ReleaseContrib(raw_name="et al."))          for i, contrib in enumerate(contribs): @@ -633,7 +665,7 @@ class PubmedImporter(EntityImporter):              # note that Reference always exists within a ReferenceList, but              # that there may be multiple ReferenceList (eg, sometimes one per              # Reference) -            for ref in pubmed.find_all('Reference'): +            for ref in pubmed.find_all("Reference"):                  ref_extra = dict()                  ref_doi = ref.find("ArticleId", IdType="doi")                  if ref_doi: @@ -643,22 +675,24 @@ class PubmedImporter(EntityImporter):                      ref_pmid = clean_pmid(ref_pmid.string)                  ref_release_id = None                  if ref_doi: -                    ref_extra['doi'] = ref_doi +                    ref_extra["doi"] = ref_doi                      if self.lookup_refs:                          ref_release_id = self.lookup_doi(ref_doi)                  if ref_pmid: -                    ref_extra['pmid'] = ref_pmid +                    ref_extra["pmid"] = ref_pmid                      if self.lookup_refs:                          ref_release_id = self.lookup_pmid(ref_pmid)                  ref_raw = ref.Citation                  if ref_raw: -                    ref_extra['unstructured'] = ref_raw.get_text() +                    ref_extra["unstructured"] = ref_raw.get_text()                  if not ref_extra:                      ref_extra = None -                refs.append(fatcat_openapi_client.ReleaseRef( -                    target_release_id=ref_release_id, -                    extra=ref_extra, -                )) +                refs.append( +                    fatcat_openapi_client.ReleaseRef( +                        target_release_id=ref_release_id, +                        extra=ref_extra, +                    ) +                )          if not refs:              refs = None @@ -669,7 +703,7 @@ class PubmedImporter(EntityImporter):          #   group-title          #   pubmed: retraction refs          if extra_pubmed: -            extra['pubmed'] = extra_pubmed +            
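The ORCID handling reformatted in this hunk strips URL prefixes and re-hyphenates bare 16-character identifiers before the lookup_orcid() call; here is the same transformation as a standalone sketch (simplified: the original also strips wayback-prefixed URLs, and the sample value is hypothetical).

    def normalize_orcid(raw: str) -> str:
        # Strip the plain orcid.org URL prefixes, as in the contrib loop above.
        for prefix in ("http://orcid.org/", "https://orcid.org/"):
            if raw.startswith(prefix):
                raw = raw[len(prefix):]
        # Re-insert dashes into a bare 16-character identifier.
        if "-" not in raw:
            raw = "{}-{}-{}-{}".format(raw[0:4], raw[4:8], raw[8:12], raw[12:16])
        return raw

    normalize_orcid("https://orcid.org/0000000212345678")  # -> "0000-0002-1234-5678"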
extra["pubmed"] = extra_pubmed          if not extra:              extra = None @@ -690,14 +724,14 @@ class PubmedImporter(EntityImporter):                  doi=doi,                  pmid=pmid,                  pmcid=pmcid, -                #isbn13     # never in Article +                # isbn13     # never in Article              ),              volume=volume,              issue=issue,              pages=pages, -            #publisher  # not included? +            # publisher  # not included?              language=language, -            #license_slug   # not in MEDLINE +            # license_slug   # not in MEDLINE              abstracts=abstracts,              contribs=contribs,              refs=refs, @@ -725,21 +759,22 @@ class PubmedImporter(EntityImporter):                      raise err              if existing and existing.ext_ids.pmid and existing.ext_ids.pmid != re.ext_ids.pmid:                  warn_str = "PMID/DOI mismatch: release {}, pmid {} != {}".format( -                    existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid) +                    existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid +                )                  warnings.warn(warn_str) -                self.counts['warn-pmid-doi-mismatch'] += 1 +                self.counts["warn-pmid-doi-mismatch"] += 1                  # don't clobber DOI, but do group together                  re.ext_ids.doi = None                  re.work_id = existing.work_id          if existing and not self.do_updates: -            self.counts['exists'] += 1 +            self.counts["exists"] += 1              return False          if existing and existing.ext_ids.pmid and (existing.refs or not re.refs):              # TODO: any other reasons to do an update?              # don't update if it already has PMID -            self.counts['exists'] += 1 +            self.counts["exists"] += 1              return False          elif existing:              # but do update if only DOI was set @@ -750,12 +785,12 @@ class PubmedImporter(EntityImporter):              existing.container_id = existing.container_id or re.container_id              existing.refs = existing.refs or re.refs              existing.abstracts = existing.abstracts or re.abstracts -            existing.extra['pubmed'] = re.extra['pubmed'] +            existing.extra["pubmed"] = re.extra["pubmed"]              # fix stub titles              if existing.title in [ -                    "OUP accepted manuscript", -                ]: +                "OUP accepted manuscript", +            ]:                  existing.title = re.title              existing.original_title = existing.original_title or re.original_title @@ -770,8 +805,8 @@ class PubmedImporter(EntityImporter):              existing.language = existing.language or re.language              # update subtitle in-place first -            if not existing.subtitle and existing.extra.get('subtitle'): -                subtitle = existing.extra.pop('subtitle') +            if not existing.subtitle and existing.extra.get("subtitle"): +                subtitle = existing.extra.pop("subtitle")                  if type(subtitle) == list:                      subtitle = subtitle[0]                  if subtitle: @@ -781,13 +816,13 @@ class PubmedImporter(EntityImporter):              try:                  self.api.update_release(self.get_editgroup_id(), existing.ident, existing) -                self.counts['update'] += 1 +                self.counts["update"] += 1              except 
fatcat_openapi_client.rest.ApiException as err:                  # there is a code path where we try to update the same release                  # twice in a row; if that happens, just skip                  # NOTE: API behavior might change in the future?                  if "release_edit_editgroup_id_ident_id_key" in err.body: -                    self.counts['skip-update-conflict'] += 1 +                    self.counts["skip-update-conflict"] += 1                      return False                  else:                      raise err @@ -797,11 +832,14 @@ class PubmedImporter(EntityImporter):          return True      def insert_batch(self, batch): -        self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( -            editgroup=fatcat_openapi_client.Editgroup( -                description=self.editgroup_description, -                extra=self.editgroup_extra), -            entity_list=batch)) +        self.api.create_release_auto_batch( +            fatcat_openapi_client.ReleaseAutoBatch( +                editgroup=fatcat_openapi_client.Editgroup( +                    description=self.editgroup_description, extra=self.editgroup_extra +                ), +                entity_list=batch, +            ) +        )      def parse_file(self, handle): @@ -812,8 +850,9 @@ class PubmedImporter(EntityImporter):          for article in soup.find_all("PubmedArticle"):              resp = self.parse_record(article)              print(json.dumps(resp)) -            #sys.exit(-1) +            # sys.exit(-1) + -if __name__=='__main__': +if __name__ == "__main__":      parser = PubmedImporter(None, None)      parser.parse_file(open(sys.argv[1])) diff --git a/python/fatcat_tools/importers/shadow.py b/python/fatcat_tools/importers/shadow.py index 77205cee..78eeec7a 100644 --- a/python/fatcat_tools/importers/shadow.py +++ b/python/fatcat_tools/importers/shadow.py @@ -1,4 +1,3 @@ -  import fatcat_openapi_client  from fatcat_tools.normal import clean_doi, clean_isbn13, clean_pmid @@ -30,25 +29,25 @@ class ShadowLibraryImporter(EntityImporter):      def __init__(self, api, **kwargs): -        eg_desc = kwargs.pop('editgroup_description', None) or "Import of 'Shadow Library' file/release matches" -        eg_extra = kwargs.pop('editgroup_extra', dict()) -        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ShadowLibraryImporter') -        super().__init__(api, -            editgroup_description=eg_desc, -            editgroup_extra=eg_extra, -            **kwargs) +        eg_desc = ( +            kwargs.pop("editgroup_description", None) +            or "Import of 'Shadow Library' file/release matches" +        ) +        eg_extra = kwargs.pop("editgroup_extra", dict()) +        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ShadowLibraryImporter") +        super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)          self.default_link_rel = kwargs.get("default_link_rel", "web")      def want(self, raw_record):          """          Only want to import records with complete file-level metadata          """ -        fm = raw_record['file_meta'] -        if not (fm['mimetype'] and fm['md5hex'] and fm['sha256hex'] and fm['size_bytes']): -            self.counts['skip-file-meta-incomplete'] += 1 +        fm = raw_record["file_meta"] +        if not (fm["mimetype"] and fm["md5hex"] and fm["sha256hex"] and fm["size_bytes"]): +            self.counts["skip-file-meta-incomplete"] += 1              return False -        if 
fm['mimetype'] != 'application/pdf': -            self.counts['skip-not-pdf'] += 1 +        if fm["mimetype"] != "application/pdf": +            self.counts["skip-not-pdf"] += 1              return False          return True @@ -57,23 +56,23 @@ class ShadowLibraryImporter(EntityImporter):          We do the release lookup in this method. Try DOI, then PMID, last ISBN13.          """ -        shadow_corpus = obj['shadow']['shadow_corpus'] +        shadow_corpus = obj["shadow"]["shadow_corpus"]          assert shadow_corpus == shadow_corpus.strip().lower() -        doi = clean_doi(obj['shadow'].get('doi')) -        pmid = clean_pmid(obj['shadow'].get('pmid')) -        isbn13 = clean_isbn13(obj['shadow'].get('isbn13')) -        shadow_id = obj['shadow'].get('shadow_id').strip() +        doi = clean_doi(obj["shadow"].get("doi")) +        pmid = clean_pmid(obj["shadow"].get("pmid")) +        isbn13 = clean_isbn13(obj["shadow"].get("isbn13")) +        shadow_id = obj["shadow"].get("shadow_id").strip()          assert shadow_id -        extra = { '{}_id'.format(shadow_corpus): shadow_id } -        for (ext_type, ext_id) in [('doi', doi), ('pmid', pmid), ('isbn13', isbn13)]: +        extra = {"{}_id".format(shadow_corpus): shadow_id} +        for (ext_type, ext_id) in [("doi", doi), ("pmid", pmid), ("isbn13", isbn13)]:              if not ext_id:                  continue -            extra['{}_{}'.format(shadow_corpus, ext_type)] = ext_id +            extra["{}_{}".format(shadow_corpus, ext_type)] = ext_id          # lookup release via several idents          re = None -        for (ext_type, ext_id) in [('doi', doi), ('pmid', pmid), ('isbn13', isbn13)]: +        for (ext_type, ext_id) in [("doi", doi), ("pmid", pmid), ("isbn13", isbn13)]:              if not ext_id:                  continue              try: @@ -86,29 +85,31 @@ class ShadowLibraryImporter(EntityImporter):                  break          if not re: -            self.counts['skip-release-not-found'] += 1 +            self.counts["skip-release-not-found"] += 1              return None -        release_ids = [re.ident,] +        release_ids = [ +            re.ident, +        ]          # parse single CDX into URLs (if exists)          urls = [] -        if obj.get('cdx'): -            url = make_rel_url(obj['cdx']['url'], default_link_rel=self.default_link_rel) +        if obj.get("cdx"): +            url = make_rel_url(obj["cdx"]["url"], default_link_rel=self.default_link_rel)              if url is not None:                  urls.append(url)              wayback = "https://web.archive.org/web/{}/{}".format( -                obj['cdx']['datetime'], -                obj['cdx']['url']) +                obj["cdx"]["datetime"], obj["cdx"]["url"] +            )              urls.append(("webarchive", wayback))          urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls]          fe = fatcat_openapi_client.FileEntity( -            md5=obj['file_meta']['md5hex'], -            sha1=obj['file_meta']['sha1hex'], -            sha256=obj['file_meta']['sha256hex'], -            size=int(obj['file_meta']['size_bytes']), -            mimetype=obj['file_meta']['mimetype'] or None, +            md5=obj["file_meta"]["md5hex"], +            sha1=obj["file_meta"]["sha1hex"], +            sha256=obj["file_meta"]["sha256hex"], +            size=int(obj["file_meta"]["size_bytes"]), +            mimetype=obj["file_meta"]["mimetype"] or None,              release_ids=release_ids,              urls=urls,              
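For context on what parse_record() assembles here, a small sketch of the per-corpus "shadows" extra metadata and the wayback URL built from the CDX row; the corpus name, identifiers, and capture values below are made up.

    # Hypothetical input fragment; key names follow the importer above.
    shadow = {"shadow_corpus": "scimag", "shadow_id": "12345", "doi": "10.1234/abcd"}

    # Matches are recorded under per-corpus keys, and release lookup is
    # attempted in DOI -> PMID -> ISBN13 order:
    extra = {"scimag_id": "12345", "scimag_doi": "10.1234/abcd"}

    # A single CDX row, when present, becomes a wayback URL:
    wayback = "https://web.archive.org/web/{}/{}".format(
        "20200101000000", "https://example.com/paper.pdf"
    )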
extra=dict(shadows=extra), @@ -130,45 +131,50 @@ class ShadowLibraryImporter(EntityImporter):          if not existing.extra:              existing.extra = {} -        if existing.extra.get('shadows') and list(fe.extra['shadows'].keys())[0] in existing.extra['shadows']: +        if ( +            existing.extra.get("shadows") +            and list(fe.extra["shadows"].keys())[0] in existing.extra["shadows"] +        ):              # already imported from this shadow library; skip -            self.counts['exists'] += 1 +            self.counts["exists"] += 1              return False          # check for edit conflicts          if existing.ident in [e.ident for e in self._edits_inflight]: -            self.counts['skip-update-inflight'] += 1 +            self.counts["skip-update-inflight"] += 1              return False          if fe.sha1 in [e.sha1 for e in self._edits_inflight]:              raise Exception("Inflight insert; shouldn't happen")          # minimum viable "existing" URL cleanup to fix dupes and broken links:          # remove 'None' wayback URLs, and set archive.org rel 'archive' -        existing.urls = [u for u in existing.urls if not ('://web.archive.org/web/None/' in u.url)] +        existing.urls = [ +            u for u in existing.urls if not ("://web.archive.org/web/None/" in u.url) +        ]          for i in range(len(existing.urls)):              u = existing.urls[i] -            if u.rel == 'repository' and '://archive.org/download/' in u.url: -                existing.urls[i].rel = 'archive' -            if u.rel == 'social': -                u.rel = 'academicsocial' +            if u.rel == "repository" and "://archive.org/download/" in u.url: +                existing.urls[i].rel = "archive" +            if u.rel == "social": +                u.rel = "academicsocial"          # merge the existing into this one and update          merged_urls = {}          for u in fe.urls + existing.urls:              merged_urls[u.url] = u          existing.urls = list(merged_urls.values()) -        if not existing.extra.get('shadows'): -            existing.extra['shadows'] = fe.extra['shadows'] +        if not existing.extra.get("shadows"): +            existing.extra["shadows"] = fe.extra["shadows"]          else: -            existing.extra['shadows'].update(fe.extra['shadows']) +            existing.extra["shadows"].update(fe.extra["shadows"])          # do these "plus ones" because we really want to do these updates when possible          if len(existing.urls) > SANE_MAX_URLS + 1: -            self.counts['skip-update-too-many-url'] += 1 +            self.counts["skip-update-too-many-url"] += 1              return None          existing.release_ids = list(set(fe.release_ids + existing.release_ids))          if len(existing.release_ids) > SANE_MAX_RELEASES + 1: -            self.counts['skip-update-too-many-releases'] += 1 +            self.counts["skip-update-too-many-releases"] += 1              return None          existing.mimetype = existing.mimetype or fe.mimetype          existing.size = existing.size or fe.size @@ -180,12 +186,15 @@ class ShadowLibraryImporter(EntityImporter):          # group-level de-dupe          edit.sha1 = existing.sha1          self._edits_inflight.append(edit) -        self.counts['update'] += 1 +        self.counts["update"] += 1          return False      def insert_batch(self, batch): -        self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch( -            editgroup=fatcat_openapi_client.Editgroup( -                
description=self.editgroup_description, -                extra=self.editgroup_extra), -            entity_list=batch)) +        self.api.create_file_auto_batch( +            fatcat_openapi_client.FileAutoBatch( +                editgroup=fatcat_openapi_client.Editgroup( +                    description=self.editgroup_description, extra=self.editgroup_extra +                ), +                entity_list=batch, +            ) +        ) diff --git a/python/fatcat_tools/importers/wayback_static.py b/python/fatcat_tools/importers/wayback_static.py index 196f86ff..22fefad3 100755 --- a/python/fatcat_tools/importers/wayback_static.py +++ b/python/fatcat_tools/importers/wayback_static.py @@ -33,22 +33,23 @@ REQ_SESSION = requests.Session()  def parse_wbm_url(url):      """Takes a wayback machine URL, and returns a tuple: -        (timestamp, datetime, original_url) +    (timestamp, datetime, original_url)      """ -    chunks = url.split('/') +    chunks = url.split("/")      assert len(chunks) >= 6 -    assert chunks[2] == 'web.archive.org' -    assert chunks[3] == 'web' -    return (chunks[4], -            parse_wbm_timestamp(chunks[4]), -            '/'.join(chunks[5:])) +    assert chunks[2] == "web.archive.org" +    assert chunks[3] == "web" +    return (chunks[4], parse_wbm_timestamp(chunks[4]), "/".join(chunks[5:])) +  def test_parse_wbm_url():      u = "http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html"      assert parse_wbm_url(u) == (          "20010712114837",          datetime.datetime(2001, 7, 12, 11, 48, 37), -        "http://www.dlib.org/dlib/june01/reich/06reich.html") +        "http://www.dlib.org/dlib/june01/reich/06reich.html", +    ) +  def parse_wbm_timestamp(timestamp):      """ @@ -56,7 +57,7 @@ def parse_wbm_timestamp(timestamp):      python datetime object (UTC)      """      # strip any "im_" or "id_" suffix -    if timestamp.endswith('_'): +    if timestamp.endswith("_"):          timestamp = timestamp[:-3]      # inflexible; require the full second-precision timestamp      assert len(timestamp) == 14 @@ -66,11 +67,13 @@ def parse_wbm_timestamp(timestamp):          day=int(timestamp[6:8]),          hour=int(timestamp[8:10]),          minute=int(timestamp[10:12]), -        second=int(timestamp[12:14])) +        second=int(timestamp[12:14]), +    ) +  def test_parse_wbm_timestamp(): -    assert parse_wbm_timestamp("20010712114837") == \ -        datetime.datetime(2001, 7, 12, 11, 48, 37) +    assert parse_wbm_timestamp("20010712114837") == datetime.datetime(2001, 7, 12, 11, 48, 37) +  def fetch_wbm(url):      resp = REQ_SESSION.get(url) @@ -78,31 +81,35 @@ def fetch_wbm(url):      assert resp.content      return resp.content +  def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None):      sys.stderr.write(embed_url + "\n") -    assert embed_url.startswith('/web/') -    embed_url = embed_url.split('/') +    assert embed_url.startswith("/web/") +    embed_url = embed_url.split("/")      timestamp = embed_url[2] -    if timestamp.endswith('_'): +    if timestamp.endswith("_"):          timestamp = timestamp[:-3] -    url = '/'.join(embed_url[3:]) -    #print((timestamp, url)) -    resp = REQ_SESSION.get(CDX_API_BASE, params=dict( -        url=url, -        closest=timestamp, -        sort="closest", -        resolveRevisits="true", -        matchType="exact", -        limit=1, -    )) +    url = "/".join(embed_url[3:]) +    # print((timestamp, url)) +    resp = REQ_SESSION.get( +        CDX_API_BASE, +        params=dict( + 
           url=url, +            closest=timestamp, +            sort="closest", +            resolveRevisits="true", +            matchType="exact", +            limit=1, +        ), +    )      resp.raise_for_status() -    #print(resp.url) +    # print(resp.url)      if resp.content: -        hit = resp.content.decode('utf-8').split('\n')[0] +        hit = resp.content.decode("utf-8").split("\n")[0]          if cdx_output:              cdx_output.write(hit + "\n") -        cdx = hit.split(' ') -        cdx = [x if (x and x != '-') else None for x in cdx] +        cdx = hit.split(" ") +        cdx = [x if (x and x != "-") else None for x in cdx]          webcapture_cdx = WebcaptureCdxLine(              surt=cdx[0],              timestamp=parse_wbm_timestamp(cdx[1]).isoformat() + "Z", @@ -113,9 +120,9 @@ def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None):              sha256=None,          )          if verify_hashes: -            resp = REQ_SESSION.get(GWB_URL_BASE + "/{}id_/{}".format( -                cdx[1], # raw timestamp -                webcapture_cdx.url)) +            resp = REQ_SESSION.get( +                GWB_URL_BASE + "/{}id_/{}".format(cdx[1], webcapture_cdx.url)  # raw timestamp +            )              resp.raise_for_status()              assert webcapture_cdx.sha1 == hashlib.sha1(resp.content).digest().hex()              webcapture_cdx.sha256 = hashlib.sha256(resp.content).digest().hex() @@ -124,47 +131,50 @@ def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None):      else:          return None +  def wayback_url_to_relative(url):      """      Wayback URLs can be relative or absolute in rewritten documents. This      function converts any form of rewritten URL to a relative (to      web.archive.org) one, or returns None if it isn't a rewritten URL at all.      
""" -    if url.startswith('https://web.archive.org/'): +    if url.startswith("https://web.archive.org/"):          url = url[23:] -    elif url.startswith('http://web.archive.org/'): +    elif url.startswith("http://web.archive.org/"):          url = url[22:] -    if url.startswith('/web/'): +    if url.startswith("/web/"):          return url      else:          return None +  def extract_embeds(soup):      embeds = set()      # <link href=""> -    for tag in soup.find_all('link', href=True): -        if tag['rel'] not in ('stylesheet',): +    for tag in soup.find_all("link", href=True): +        if tag["rel"] not in ("stylesheet",):              continue -        url = wayback_url_to_relative(tag['href']) +        url = wayback_url_to_relative(tag["href"])          if url:              embeds.add(url)      # <img src=""> -    for tag in soup.find_all('img', src=True): -        url = wayback_url_to_relative(tag['src']) +    for tag in soup.find_all("img", src=True): +        url = wayback_url_to_relative(tag["src"])          if url:              embeds.add(url)      # <script src=""> -    for tag in soup.find_all('script', src=True): -        url = wayback_url_to_relative(tag['src']) +    for tag in soup.find_all("script", src=True): +        url = wayback_url_to_relative(tag["src"])          if url:              embeds.add(url)      return list(embeds) +  def static_wayback_webcapture(wayback_url, cdx_output=None):      """      Given a complete wayback machine capture URL, like: @@ -177,36 +187,40 @@ def static_wayback_webcapture(wayback_url, cdx_output=None):      wbm_html = fetch_wbm(wayback_url)      raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url) -    #with open(rewritten_path, 'r') as fp: +    # with open(rewritten_path, 'r') as fp:      #    soup = BeautifulSoup(fp, "lxml")      soup = BeautifulSoup(wbm_html, "lxml")      embeds = extract_embeds(soup) -    cdx_obj = lookup_cdx("/web/{}/{}".format(raw_timestamp, original_url), -        cdx_output=cdx_output) +    cdx_obj = lookup_cdx( +        "/web/{}/{}".format(raw_timestamp, original_url), cdx_output=cdx_output +    )      cdx_list = [cdx_obj]      for url in embeds:          cdx_obj = lookup_cdx(url, cdx_output=cdx_output)          cdx_list.append(cdx_obj) -    archive_urls = [WebcaptureUrl( -        rel="wayback", -        url="https://web.archive.org/web/", -    )] +    archive_urls = [ +        WebcaptureUrl( +            rel="wayback", +            url="https://web.archive.org/web/", +        ) +    ]      wc = WebcaptureEntity(          cdx=cdx_list,          timestamp=timestamp.isoformat() + "Z",          original_url=original_url,          archive_urls=archive_urls, -        release_ids=None) +        release_ids=None, +    )      return wc +  def auto_wayback_static(api, release_id, wayback_url, editgroup_id=None):      """      Returns a tuple: (editgroup_id, edit). 
If failed, both are None      """      raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url) -    git_rev = subprocess.check_output( -        ["git", "describe", "--always"]).strip().decode('utf-8') +    git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")      release = api.get_release(release_id, expand="webcaptures") @@ -214,37 +228,44 @@ def auto_wayback_static(api, release_id, wayback_url, editgroup_id=None):      for wc in release.webcaptures:          if wc.original_url == original_url and wc.timestamp.date() == timestamp.date():              # skipping: already existed -            print("release {} already had webcapture {} {}".format( -                release_id, raw_timestamp, original_url)) +            print( +                "release {} already had webcapture {} {}".format( +                    release_id, raw_timestamp, original_url +                ) +            )              return (None, None)      wc = static_wayback_webcapture(wayback_url)      assert len(wc.cdx) >= 1      wc.release_ids = [release_id]      if not editgroup_id: -        eg = api.create_editgroup(Editgroup( -            description="One-off import of static web content from wayback machine", -            extra=dict( -                git_rev=git_rev, -                agent="fatcat_tools.auto_wayback_static"))) +        eg = api.create_editgroup( +            Editgroup( +                description="One-off import of static web content from wayback machine", +                extra=dict(git_rev=git_rev, agent="fatcat_tools.auto_wayback_static"), +            ) +        )          editgroup_id = eg.editgroup_id      edit = api.create_webcapture(eg.editgroup_id, wc)      return (editgroup_id, edit) +  def main():      parser = argparse.ArgumentParser() -    parser.add_argument('--verbose', -        action='store_true', -        help="verbose output") -    parser.add_argument('wayback_url', -        type=str, -        help="URL of wayback capture to extract from") -    parser.add_argument('--json-output', -        type=argparse.FileType('w'), default=sys.stdout, -        help="where to write out webcapture entity (as JSON)") -    parser.add_argument('--cdx-output', -        type=argparse.FileType('w'), default=None, -        help="(optional) file to write out CDX stub") +    parser.add_argument("--verbose", action="store_true", help="verbose output") +    parser.add_argument("wayback_url", type=str, help="URL of wayback capture to extract from") +    parser.add_argument( +        "--json-output", +        type=argparse.FileType("w"), +        default=sys.stdout, +        help="where to write out webcapture entity (as JSON)", +    ) +    parser.add_argument( +        "--cdx-output", +        type=argparse.FileType("w"), +        default=None, +        help="(optional) file to write out CDX stub", +    )      args = parser.parse_args() @@ -254,5 +275,6 @@ def main():      wc_dict = api_client.sanitize_for_serialization(wc)      print(json.dumps(wc_dict)) -if __name__ == '__main__': + +if __name__ == "__main__":      main() diff --git a/python/fatcat_tools/kafka.py b/python/fatcat_tools/kafka.py index 32749db2..2a4451ad 100644 --- a/python/fatcat_tools/kafka.py +++ b/python/fatcat_tools/kafka.py @@ -1,4 +1,3 @@ -  from confluent_kafka import KafkaException, Producer @@ -9,14 +8,15 @@ def kafka_fail_fast(err, msg):          # TODO: should it be sys.exit(-1)?          
raise KafkaException(err) +  def simple_kafka_producer(kafka_hosts):      kafka_config = { -        'bootstrap.servers': kafka_hosts, -        'message.max.bytes': 20000000, # ~20 MBytes; broker-side max is ~50 MBytes -        'delivery.report.only.error': True, -        'default.topic.config': { -            'request.required.acks': -1, +        "bootstrap.servers": kafka_hosts, +        "message.max.bytes": 20000000,  # ~20 MBytes; broker-side max is ~50 MBytes +        "delivery.report.only.error": True, +        "default.topic.config": { +            "request.required.acks": -1,          },      }      return Producer(kafka_config) diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py index 9b65e768..12c58829 100644 --- a/python/fatcat_tools/normal.py +++ b/python/fatcat_tools/normal.py @@ -1,4 +1,3 @@ -  """  A bunch of helpers to parse and normalize strings: external identifiers,  free-form input, titles, etc. @@ -32,7 +31,7 @@ def clean_doi(raw: str) -> Optional[str]:      if not raw:          return None      raw = raw.strip().lower() -    if '\u2013' in raw: +    if "\u2013" in raw:          # Do not attempt to normalize "en dash" and since FC does not allow          # unicode in DOI, treat this as invalid.          return None @@ -54,7 +53,7 @@ def clean_doi(raw: str) -> Optional[str]:      # fatcatd uses same REGEX, but Rust regex rejects these characters, while      # python doesn't. DOIs are syntaxtually valid, but very likely to be typos;      # for now filter them out. -    for c in ('¬', ): +    for c in ("¬",):          if c in raw:              return None @@ -70,6 +69,7 @@ def clean_doi(raw: str) -> Optional[str]:          return None      return raw +  def test_clean_doi():      assert clean_doi("10.1234/asdf ") == "10.1234/asdf"      assert clean_doi("10.1037//0002-9432.72.1.50") == "10.1037/0002-9432.72.1.50" @@ -81,7 +81,9 @@ def test_clean_doi():      assert clean_doi("doi:10.1234/asdf ") == "10.1234/asdf"      assert clean_doi("doi:10.1234/ asdf ") is None      assert clean_doi("10.4149/gpb¬_2017042") is None  # "logical negation" character -    assert clean_doi("10.6002/ect.2020.häyry") is None  # this example via pubmed (pmid:32519616) +    assert ( +        clean_doi("10.6002/ect.2020.häyry") is None +    )  # this example via pubmed (pmid:32519616)      assert clean_doi("10.30466/vrf.2019.98547.2350\u200e") is None      assert clean_doi("10.12016/j.issn.2096⁃1456.2017.06.014") is None      assert clean_doi("10.4025/diálogos.v17i2.36030") is None @@ -92,6 +94,7 @@ def test_clean_doi():  ARXIV_ID_REGEX = re.compile(r"^(\d{4}.\d{4,5}|[a-z\-]+(\.[A-Z]{2})?/\d{7})(v\d+)?$") +  def clean_arxiv_id(raw: str) -> Optional[str]:      """      Removes any: @@ -113,6 +116,7 @@ def clean_arxiv_id(raw: str) -> Optional[str]:          return None      return raw +  def test_clean_arxiv_id():      assert clean_arxiv_id("0806.2878v1") == "0806.2878v1"      assert clean_arxiv_id("0806.2878") == "0806.2878" @@ -141,16 +145,18 @@ def test_clean_arxiv_id():      assert clean_arxiv_id("0806.v1") is None      assert clean_arxiv_id("08062878v1") is None +  def clean_wikidata_qid(raw):      if not raw:          return None      raw = raw.strip()      if len(raw.split()) != 1 or len(raw) < 2:          return None -    if raw[0] == 'Q' and raw[1] != '0' and raw[1:].isdigit(): +    if raw[0] == "Q" and raw[1] != "0" and raw[1:].isdigit():          return raw      return None +  def test_clean_wikidata_qid():      assert clean_wikidata_qid("Q1234") == "Q1234"      
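Stepping back to the simple_kafka_producer() configuration reformatted above, a usage sketch; the broker address and topic name are placeholders.

    from fatcat_tools.kafka import kafka_fail_fast, simple_kafka_producer

    producer = simple_kafka_producer("localhost:9092")  # placeholder brokers
    producer.produce(
        "example.topic",               # placeholder topic name
        b'{"example": "message"}',
        on_delivery=kafka_fail_fast,   # raises on delivery errors
    )
    producer.flush()
    # Because delivery.report.only.error is set, kafka_fail_fast() is only
    # invoked for failed deliveries.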
assert clean_wikidata_qid("Q1") == "Q1" @@ -163,6 +169,7 @@ def test_clean_wikidata_qid():      assert clean_wikidata_qid("qfba3") is None      assert clean_wikidata_qid("") is None +  def clean_pmid(raw: str) -> Optional[str]:      if not raw:          return None @@ -173,6 +180,7 @@ def clean_pmid(raw: str) -> Optional[str]:          return raw      return None +  def test_clean_pmid():      assert clean_pmid("1234") == "1234"      assert clean_pmid("1234 ") == "1234" @@ -180,6 +188,7 @@ def test_clean_pmid():      assert clean_pmid("qfba3") is None      assert clean_pmid("") is None +  def clean_pmcid(raw: str) -> Optional[str]:      if not raw:          return None @@ -190,6 +199,7 @@ def clean_pmcid(raw: str) -> Optional[str]:          return raw      return None +  def clean_sha1(raw: str) -> Optional[str]:      if not raw:          return None @@ -203,13 +213,21 @@ def clean_sha1(raw: str) -> Optional[str]:              return None      return raw +  def test_clean_sha1(): -    assert clean_sha1("0fba3fba0e1937aa0297de3836b768b5dfb23d7b") == "0fba3fba0e1937aa0297de3836b768b5dfb23d7b" -    assert clean_sha1("0fba3fba0e1937aa0297de3836b768b5dfb23d7b ") == "0fba3fba0e1937aa0297de3836b768b5dfb23d7b" +    assert ( +        clean_sha1("0fba3fba0e1937aa0297de3836b768b5dfb23d7b") +        == "0fba3fba0e1937aa0297de3836b768b5dfb23d7b" +    ) +    assert ( +        clean_sha1("0fba3fba0e1937aa0297de3836b768b5dfb23d7b ") +        == "0fba3fba0e1937aa0297de3836b768b5dfb23d7b" +    )      assert clean_sha1("fba3fba0e1937aa0297de3836b768b5dfb23d7b") is None      assert clean_sha1("qfba3fba0e1937aa0297de3836b768b5dfb23d7b") is None      assert clean_sha1("0fba3fb a0e1937aa0297de3836b768b5dfb23d7b") is None +  def clean_sha256(raw: str) -> Optional[str]:      raw = raw.strip().lower()      if len(raw.split()) != 1: @@ -221,12 +239,18 @@ def clean_sha256(raw: str) -> Optional[str]:              return None      return raw +  def test_clean_sha256(): -    assert clean_sha256("6cc853f2ae75696b2e45f476c76b946b0fc2df7c52bb38287cb074aceb77bc7f") == "6cc853f2ae75696b2e45f476c76b946b0fc2df7c52bb38287cb074aceb77bc7f" +    assert ( +        clean_sha256("6cc853f2ae75696b2e45f476c76b946b0fc2df7c52bb38287cb074aceb77bc7f") +        == "6cc853f2ae75696b2e45f476c76b946b0fc2df7c52bb38287cb074aceb77bc7f" +    )      assert clean_sha256("0fba3fba0e1937aa0297de3836b768b5dfb23d7b") is None +  ISSN_REGEX = re.compile(r"^\d{4}-\d{3}[0-9X]$") +  def clean_issn(raw: str) -> Optional[str]:      if not raw:          return None @@ -237,14 +261,17 @@ def clean_issn(raw: str) -> Optional[str]:          return None      return raw +  def test_clean_issn():      assert clean_issn("1234-4567") == "1234-4567"      assert clean_issn("1234-456X") == "1234-456X"      assert clean_issn("134-4567") is None      assert clean_issn("123X-4567") is None +  ISBN13_REGEX = re.compile(r"^97(?:8|9)-\d{1,5}-\d{1,7}-\d{1,6}-\d$") +  def clean_isbn13(raw: str) -> Optional[str]:      if not raw:          return None @@ -253,14 +280,17 @@ def clean_isbn13(raw: str) -> Optional[str]:          return None      return raw +  def test_clean_isbn13():      assert clean_isbn13("978-1-56619-909-4") == "978-1-56619-909-4"      assert clean_isbn13("978-1-4028-9462-6") == "978-1-4028-9462-6"      assert clean_isbn13("978-1-56619-909-4 ") == "978-1-56619-909-4"      assert clean_isbn13("9781566199094") is None +  ORCID_REGEX = re.compile(r"^\d{4}-\d{4}-\d{4}-\d{3}[\dX]$") +  def clean_orcid(raw: str) -> Optional[str]:      if not raw:          return None @@ 
-269,6 +299,7 @@ def clean_orcid(raw: str) -> Optional[str]:          return None      return raw +  def test_clean_orcid():      assert clean_orcid("0123-4567-3456-6789") == "0123-4567-3456-6789"      assert clean_orcid("0123-4567-3456-678X") == "0123-4567-3456-678X" @@ -279,6 +310,7 @@ def test_clean_orcid():  HDL_REGEX = re.compile(r"^\d+(\.\d+)*/\S+$") +  def clean_hdl(raw):      if not raw:          return None @@ -293,14 +325,17 @@ def clean_hdl(raw):          raw = raw[15:]      if not HDL_REGEX.fullmatch(raw):          return None -    if raw.startswith('10.'): +    if raw.startswith("10."):          return None      return raw +  def test_clean_hdl():      assert clean_hdl("20.500.23456/ABC/DUMMY") == "20.500.23456/abc/dummy"      assert clean_hdl("hdl:20.500.23456/ABC/DUMMY") == "20.500.23456/abc/dummy" -    assert clean_hdl("https://hdl.handle.net/20.500.23456/ABC/DUMMY") == "20.500.23456/abc/dummy" +    assert ( +        clean_hdl("https://hdl.handle.net/20.500.23456/ABC/DUMMY") == "20.500.23456/abc/dummy" +    )      assert clean_hdl("http://hdl.handle.net/20.500.23456/ABC/DUMMY") == "20.500.23456/abc/dummy"      assert clean_hdl("21.1234/aksjdfh") == "21.1234/aksjdfh"      assert clean_hdl("2381/12775") == "2381/12775" @@ -326,7 +361,7 @@ def clean_str(thing: Optional[str], force_xml: bool = False) -> Optional[str]:      """      if not thing:          return None -    unescape_html: Union[str, bool] = 'auto' +    unescape_html: Union[str, bool] = "auto"      if force_xml:          unescape_html = True      fixed = ftfy.fix_text(thing, unescape_html=unescape_html).strip() @@ -335,15 +370,17 @@ def clean_str(thing: Optional[str], force_xml: bool = False) -> Optional[str]:          return None      return fixed +  def test_clean_str():      assert clean_str(None) is None -    assert clean_str('') is None -    assert clean_str('1') is None -    assert clean_str('123') == '123' -    assert clean_str('a&b') == 'a&b' -    assert clean_str('<b>a&b</b>') == '<b>a&b</b>' -    assert clean_str('<b>a&b</b>', force_xml=True) == '<b>a&b</b>' +    assert clean_str("") is None +    assert clean_str("1") is None +    assert clean_str("123") == "123" +    assert clean_str("a&b") == "a&b" +    assert clean_str("<b>a&b</b>") == "<b>a&b</b>" +    assert clean_str("<b>a&b</b>", force_xml=True) == "<b>a&b</b>" +  def b32_hex(s):      s = s.strip().split()[0].lower() @@ -351,7 +388,8 @@ def b32_hex(s):          s = s[5:]      if len(s) != 32:          return s -    return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8') +    return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8") +  def is_cjk(s):      if not s: @@ -359,38 +397,53 @@ def is_cjk(s):      for c in s:          if c.isalpha():              lang_prefix = unicodedata.name(c).split()[0] -            return lang_prefix in ('CJK', 'HIRAGANA', 'KATAKANA', 'HANGUL') +            return lang_prefix in ("CJK", "HIRAGANA", "KATAKANA", "HANGUL")      return False +  def test_is_cjk():      assert is_cjk(None) is False -    assert is_cjk('') is False -    assert is_cjk('blah') is False -    assert is_cjk('岡, 鹿, 梨, 阜, 埼') is True -    assert is_cjk('[岡, 鹿, 梨, 阜, 埼]') is True -    assert is_cjk('菊') is True -    assert is_cjk('岡, 鹿, 梨, 阜, 埼 with eng after') is True -    assert is_cjk('水道') is True -    assert is_cjk('オウ, イク') is True # kanji -    assert is_cjk('ひヒ') is True -    assert is_cjk('き゚ゅ') is True -    assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') is True +    assert is_cjk("") is False +    assert is_cjk("blah") 
is False +    assert is_cjk("岡, 鹿, 梨, 阜, 埼") is True +    assert is_cjk("[岡, 鹿, 梨, 阜, 埼]") is True +    assert is_cjk("菊") is True +    assert is_cjk("岡, 鹿, 梨, 阜, 埼 with eng after") is True +    assert is_cjk("水道") is True +    assert is_cjk("オウ, イク") is True  # kanji +    assert is_cjk("ひヒ") is True +    assert is_cjk("き゚ゅ") is True +    assert is_cjk("ㄴ, ㄹ, ㅁ, ㅂ, ㅅ") is True +  MONTH_MAP = { -    "jan":  1, "january":   1, -    "feb":  2, "febuary":   2, -    "mar":  3, "march":     3, -    "apr":  4, "april":     4, -    "may":  5, "may":       5, -    "jun":  6, "june":      6, -    "jul":  7, "july":      7, -    "aug":  8, "august":    8, -    "sep":  9, "september": 9, -    "oct": 10, "october":   10, -    "nov": 11, "nov":       11, -    "dec": 12, "december":  12, +    "jan": 1, +    "january": 1, +    "feb": 2, +    "febuary": 2, +    "mar": 3, +    "march": 3, +    "apr": 4, +    "april": 4, +    "may": 5, +    "may": 5, +    "jun": 6, +    "june": 6, +    "jul": 7, +    "july": 7, +    "aug": 8, +    "august": 8, +    "sep": 9, +    "september": 9, +    "oct": 10, +    "october": 10, +    "nov": 11, +    "nov": 11, +    "dec": 12, +    "december": 12,  } +  def parse_month(raw: Optional[str]) -> Optional[int]:      """      Parses a string into a month number (1 to 12) @@ -408,6 +461,7 @@ def parse_month(raw: Optional[str]) -> Optional[int]:          return MONTH_MAP[raw]      return None +  def test_parse_month() -> None:      assert parse_month(None) is None @@ -417,6 +471,7 @@ def test_parse_month() -> None:      assert parse_month("jan") == 1      assert parse_month("September") == 9 +  def detect_text_lang(raw: str) -> Optional[str]:      """      Tries to determine language of, eg, an abstract. @@ -427,13 +482,14 @@ def detect_text_lang(raw: str) -> Optional[str]:          return None      try:          lang = langdetect.detect(raw) -        lang = lang.split('-')[0] +        lang = lang.split("-")[0]          assert len(lang) == 2          return lang      except (langdetect.lang_detect_exception.LangDetectException, TypeError):          return None      return None +  def test_detect_text_lang() -> None:      assert detect_text_lang("") is None      EN_SAMPLE = "this is a string of English text for testing" @@ -444,6 +500,7 @@ def test_detect_text_lang() -> None:      # XXX: why does this detect as `ko` sometimes?      assert detect_text_lang(ZH_SAMPLE) in ("zh", "ko") +  def parse_lang_name(raw: Optional[str]) -> Optional[str]:      """      Parses a language name and returns a 2-char ISO 631 language code. @@ -456,13 +513,14 @@ def parse_lang_name(raw: Optional[str]) -> Optional[str]:              return None          return lang.alpha_2.lower()      except LookupError: -        #print(f"  unknown language: '{raw}', file=sys.stderr) +        # print(f"  unknown language: '{raw}', file=sys.stderr)          return None      except AttributeError: -        #print(f"  partial language metadata: '{lang}', file=sys.stderr) +        # print(f"  partial language metadata: '{lang}', file=sys.stderr)          return None      return None +  def test_parse_lang_name() -> None:      assert parse_lang_name(None) is None @@ -544,86 +602,85 @@ def test_parse_country_name():      assert parse_country_name("Russia") == "ru"      assert parse_country_name("Japan") == "jp" +  # These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of  # 2/T and 2/B?  
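A few further examples of the month parsing backed by MONTH_MAP (reformatted above), complementing the asserts in test_parse_month(); the expected results follow from the visible table entries, and lookups are case-insensitive per the "September" test.

    from fatcat_tools.normal import parse_month

    parse_month("August")       # -> 8
    parse_month("mar")          # -> 3
    parse_month("not a month")  # -> None
    parse_month(None)           # -> None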
# PubMed/MEDLINE and JSTOR use these MARC codes  # https://www.loc.gov/marc/languages/language_name.html  LANG_MAP_MARC = { -    'afr': 'af', -    'alb': 'sq', -    'amh': 'am', -    'ara': 'ar', -    'arm': 'hy', -    'aze': 'az', -    'ben': 'bn', -    'bos': 'bs', -    'bul': 'bg', -    'cat': 'ca', -    'chi': 'zh', -    'cze': 'cs', -    'dan': 'da', -    'dut': 'nl', -    'eng': 'en', -    'epo': 'eo', -    'est': 'et', -    'fin': 'fi', -    'fre': 'fr', -    'geo': 'ka', -    'ger': 'de', -    'gla': 'gd', -    'gre': 'el', -    'heb': 'he', -    'hin': 'hi', -    'hrv': 'hr', -    'hun': 'hu', -    'ice': 'is', -    'ind': 'id', -    'ita': 'it', -    'jpn': 'ja', -    'kin': 'rw', -    'kor': 'ko', -    'lat': 'la', -    'lav': 'lv', -    'lit': 'lt', -    'mac': 'mk', -    'mal': 'ml', -    'mao': 'mi', -    'may': 'ms', -    'nor': 'no', -    'per': 'fa', -    'per': 'fa', -    'pol': 'pl', -    'por': 'pt', -    'pus': 'ps', -    'rum': 'ro', -    'rus': 'ru', -    'san': 'sa', -    'slo': 'sk', -    'slv': 'sl', -    'spa': 'es', -    'srp': 'sr', -    'swe': 'sv', -    'tha': 'th', -    'tur': 'tr', -    'ukr': 'uk', -    'urd': 'ur', -    'vie': 'vi', -    'wel': 'cy', - -# additions -    'gle': 'ga', # "Irish" (Gaelic) -    'jav': 'jv', # Javanese -    'welsh': 'cy', # Welsh -    'oci': 'oc', # Occitan - -# Don't have ISO 639-1 codes -    'grc': 'el', # Ancient Greek; map to modern greek -    'map': None, # Austronesian (collection) -    'syr': None, # Syriac, Modern -    'gem': None, # Old Saxon -    'non': None, # Old Norse -    'emg': None, # Eastern Meohang -    'neg': None, # Negidal -    'mul': None, # Multiple languages -    'und': None, # Undetermined +    "afr": "af", +    "alb": "sq", +    "amh": "am", +    "ara": "ar", +    "arm": "hy", +    "aze": "az", +    "ben": "bn", +    "bos": "bs", +    "bul": "bg", +    "cat": "ca", +    "chi": "zh", +    "cze": "cs", +    "dan": "da", +    "dut": "nl", +    "eng": "en", +    "epo": "eo", +    "est": "et", +    "fin": "fi", +    "fre": "fr", +    "geo": "ka", +    "ger": "de", +    "gla": "gd", +    "gre": "el", +    "heb": "he", +    "hin": "hi", +    "hrv": "hr", +    "hun": "hu", +    "ice": "is", +    "ind": "id", +    "ita": "it", +    "jpn": "ja", +    "kin": "rw", +    "kor": "ko", +    "lat": "la", +    "lav": "lv", +    "lit": "lt", +    "mac": "mk", +    "mal": "ml", +    "mao": "mi", +    "may": "ms", +    "nor": "no", +    "per": "fa", +    "per": "fa", +    "pol": "pl", +    "por": "pt", +    "pus": "ps", +    "rum": "ro", +    "rus": "ru", +    "san": "sa", +    "slo": "sk", +    "slv": "sl", +    "spa": "es", +    "srp": "sr", +    "swe": "sv", +    "tha": "th", +    "tur": "tr", +    "ukr": "uk", +    "urd": "ur", +    "vie": "vi", +    "wel": "cy", +    # additions +    "gle": "ga",  # "Irish" (Gaelic) +    "jav": "jv",  # Javanese +    "welsh": "cy",  # Welsh +    "oci": "oc",  # Occitan +    # Don't have ISO 639-1 codes +    "grc": "el",  # Ancient Greek; map to modern greek +    "map": None,  # Austronesian (collection) +    "syr": None,  # Syriac, Modern +    "gem": None,  # Old Saxon +    "non": None,  # Old Norse +    "emg": None,  # Eastern Meohang +    "neg": None,  # Negidal +    "mul": None,  # Multiple languages +    "und": None,  # Undetermined  } diff --git a/python/fatcat_tools/references.py b/python/fatcat_tools/references.py index 8361b260..6fd9ca49 100644 --- a/python/fatcat_tools/references.py +++ b/python/fatcat_tools/references.py @@ -22,6 +22,7 @@ from 
fatcat_tools.transforms.entities import entity_to_dict  class BiblioRef(BaseModel):      """bibliographic reference""" +      # ("release", source_release_ident, ref_index)      # ("wikipedia", source_wikipedia_article, ref_index)      _key: Optional[str] @@ -37,7 +38,7 @@ class BiblioRef(BaseModel):      # context of the reference itself      # 1-indexed, not 0-indexed -    ref_index: Optional[int] # TODO: actually optional? +    ref_index: Optional[int]  # TODO: actually optional?      # eg, "Lee86", "BIB23"      ref_key: Optional[str]      # eg, page number @@ -74,16 +75,20 @@ class BiblioRef(BaseModel):          # work-arounds for bad/weird ref_key          if self.ref_key:              self.ref_key = self.ref_key.strip() -            if self.ref_key[0] in ['/', '_']: +            if self.ref_key[0] in ["/", "_"]:                  self.ref_key = self.ref_key[1:] -            if self.ref_key.startswith("10.") and 'SICI' in self.ref_key and '-' in self.ref_key: -                self.ref_key = self.ref_key.split('-')[-1] -            if self.ref_key.startswith("10.") and '_' in self.ref_key: -                self.ref_key = self.ref_key.split('_')[-1] +            if ( +                self.ref_key.startswith("10.") +                and "SICI" in self.ref_key +                and "-" in self.ref_key +            ): +                self.ref_key = self.ref_key.split("-")[-1] +            if self.ref_key.startswith("10.") and "_" in self.ref_key: +                self.ref_key = self.ref_key.split("_")[-1]              if len(self.ref_key) > 10 and "#" in self.ref_key: -                self.ref_key = self.ref_key.split('#')[-1] +                self.ref_key = self.ref_key.split("#")[-1]              if len(self.ref_key) > 10 and "_" in self.ref_key: -                self.ref_key = self.ref_key.split('_')[-1] +                self.ref_key = self.ref_key.split("_")[-1]          if not self.ref_key and self.ref_index is not None:              self.ref_key = str(self.ref_index)          return self @@ -98,7 +103,7 @@ class EnrichedBiblioRef(BaseModel):      # TODO: openlibrary work?      
access: List[AccessOption] -    @validator('release') +    @validator("release")      @classmethod      def check_release(cls, v):          if v is not None and not isinstance(v, ReleaseEntity): @@ -119,7 +124,7 @@ class RefHits(BaseModel):      limit: int      query_time_ms: int      query_wall_time_ms: int -    result_refs: List[Union[BiblioRef,EnrichedBiblioRef]] +    result_refs: List[Union[BiblioRef, EnrichedBiblioRef]]      class Config:          json_encoders = { @@ -145,22 +150,22 @@ def _execute_ref_query(search: Any, limit: int, offset: Optional[int] = None) ->      except elasticsearch.exceptions.RequestError as e_raw:          # this is a "user" error          e: Any = e_raw -        #logging.warn("elasticsearch 400: " + str(e.info)) +        # logging.warn("elasticsearch 400: " + str(e.info))          if e.info.get("error", {}).get("root_cause", {}):              raise ValueError(str(e.info["error"]["root_cause"][0].get("reason"))) from e          else:              raise ValueError(str(e.info)) from e      except elasticsearch.exceptions.TransportError as e:          # all other errors -        #logging.warn(f"elasticsearch non-200 status code: {e.info}") +        # logging.warn(f"elasticsearch non-200 status code: {e.info}")          raise IOError(str(e.info)) from e      query_delta = datetime.datetime.now() - query_start      result_refs = []      for h in resp.hits:          # might be a list because of consolidation -        if isinstance(h._d_.get('source_work_ident'), list): -            h._d_['source_work_ident'] = h._d_['source_work_ident'][0] +        if isinstance(h._d_.get("source_work_ident"), list): +            h._d_["source_work_ident"] = h._d_["source_work_ident"][0]          result_refs.append(BiblioRef.parse_obj(h._d_).hacks())      return RefHits( @@ -224,7 +229,10 @@ def get_inbound_refs(          search = search.extra(              collapse={                  "field": "source_work_ident", -                "inner_hits": {"name": "source_more", "size": 0,}, +                "inner_hits": { +                    "name": "source_more", +                    "size": 0, +                },              }          ) @@ -281,61 +289,87 @@ def count_inbound_refs(  # run fatcat API fetches for each ref and return "enriched" refs -def enrich_inbound_refs(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[EnrichedBiblioRef]: +def enrich_inbound_refs( +    refs: List[BiblioRef], +    fatcat_api_client: Any, +    hide: Optional[str] = "refs", +    expand: Optional[str] = "container,files,webcaptures,filesets", +) -> List[EnrichedBiblioRef]:      enriched = []      for ref in refs:          release = None          access = []          if ref.source_release_ident: -            release = fatcat_api_client.get_release(ref.source_release_ident, hide=hide, expand=expand) +            release = fatcat_api_client.get_release( +                ref.source_release_ident, hide=hide, expand=expand +            )              access = release_access_options(release)          if ref.source_wikipedia_article: -            wiki_lang = ref.source_wikipedia_article.split(':')[0] -            wiki_article = ':'.join(ref.source_wikipedia_article.split(':')[1:]).replace(' ', '_') -            access.append(AccessOption( -                access_type="wikipedia", -                access_url=f"https://{wiki_lang}.wikipedia.org/wiki/{wiki_article}", -                mimetype=None, -                
size_bytes=None, -                thumbnail_url=None -            )) -        enriched.append(EnrichedBiblioRef( -            ref=ref, -            access=access, -            release=release, -        )) +            wiki_lang = ref.source_wikipedia_article.split(":")[0] +            wiki_article = ":".join(ref.source_wikipedia_article.split(":")[1:]).replace( +                " ", "_" +            ) +            access.append( +                AccessOption( +                    access_type="wikipedia", +                    access_url=f"https://{wiki_lang}.wikipedia.org/wiki/{wiki_article}", +                    mimetype=None, +                    size_bytes=None, +                    thumbnail_url=None, +                ) +            ) +        enriched.append( +            EnrichedBiblioRef( +                ref=ref, +                access=access, +                release=release, +            ) +        )      return enriched -def enrich_outbound_refs(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[EnrichedBiblioRef]: +def enrich_outbound_refs( +    refs: List[BiblioRef], +    fatcat_api_client: Any, +    hide: Optional[str] = "refs", +    expand: Optional[str] = "container,files,webcaptures,filesets", +) -> List[EnrichedBiblioRef]:      enriched = []      for ref in refs:          release = None          access = []          if ref.target_release_ident: -            release = fatcat_api_client.get_release(ref.target_release_ident, hide=hide, expand=expand) +            release = fatcat_api_client.get_release( +                ref.target_release_ident, hide=hide, expand=expand +            )              access = release_access_options(release)          if ref.target_openlibrary_work: -            access.append(AccessOption( -                access_type="openlibrary", -                access_url=f"https://openlibrary.org/works/{ref.target_openlibrary_work}", -                mimetype=None, -                size_bytes=None, -                thumbnail_url=None -            )) -        if ref.target_url and '://web.archive.org/' in ref.target_url: -            access.append(AccessOption( -                access_type="wayback", -                access_url=ref.target_url, -                mimetype=None, -                size_bytes=None, -                thumbnail_url=None -            )) -        enriched.append(EnrichedBiblioRef( -            ref=ref, -            access=access, -            release=release, -        )) +            access.append( +                AccessOption( +                    access_type="openlibrary", +                    access_url=f"https://openlibrary.org/works/{ref.target_openlibrary_work}", +                    mimetype=None, +                    size_bytes=None, +                    thumbnail_url=None, +                ) +            ) +        if ref.target_url and "://web.archive.org/" in ref.target_url: +            access.append( +                AccessOption( +                    access_type="wayback", +                    access_url=ref.target_url, +                    mimetype=None, +                    size_bytes=None, +                    thumbnail_url=None, +                ) +            ) +        enriched.append( +            EnrichedBiblioRef( +                ref=ref, +                access=access, +                release=release, +            ) +        )      return enriched @@ -346,21 +380,29 @@ def run_ref_query(args) -> None:      
release_ident = None      work_ident = None      if args.ident.startswith("release_"): -        release_ident = args.ident.split('_')[1] +        release_ident = args.ident.split("_")[1]      elif args.ident.startswith("work_"): -        work_ident = args.ident.split('_')[1] +        work_ident = args.ident.split("_")[1]      else:          release_ident = args.ident      print("## Outbound References") -    hits = get_outbound_refs(release_ident=release_ident, work_ident=work_ident, es_client=args.es_client) -    print(f"Total: {hits.count_total}  Time: {hits.query_wall_time_ms}ms; {hits.query_time_ms}ms") +    hits = get_outbound_refs( +        release_ident=release_ident, work_ident=work_ident, es_client=args.es_client +    ) +    print( +        f"Total: {hits.count_total}  Time: {hits.query_wall_time_ms}ms; {hits.query_time_ms}ms" +    )      if args.enrich == "fatcat": -        enriched = enrich_outbound_refs(hits.result_refs, hide='refs,abstracts', fatcat_api_client=args.fatcat_api_client) +        enriched = enrich_outbound_refs( +            hits.result_refs, hide="refs,abstracts", fatcat_api_client=args.fatcat_api_client +        )          for ref in enriched:              if ref.release: -                print(f"{ref.ref.ref_index or '-'}\trelease_{ref.release.ident}\t{ref.ref.match_provenance}/{ref.ref.match_status}\t{ref.release.release_year or '-'}\t{ref.release.title}\t{ref.release.ext_ids.pmid or ref.release.ext_ids.doi or '-'}") +                print( +                    f"{ref.ref.ref_index or '-'}\trelease_{ref.release.ident}\t{ref.ref.match_provenance}/{ref.ref.match_status}\t{ref.release.release_year or '-'}\t{ref.release.title}\t{ref.release.ext_ids.pmid or ref.release.ext_ids.doi or '-'}" +                )              else:                  print(f"{ref.ref.ref_index or '-'}\trelease_{ref.target_release_ident}")      else: @@ -369,21 +411,30 @@ def run_ref_query(args) -> None:      print()      print("## Inbound References") -    hits = get_inbound_refs(release_ident=release_ident, work_ident=work_ident, es_client=args.es_client) +    hits = get_inbound_refs( +        release_ident=release_ident, work_ident=work_ident, es_client=args.es_client +    ) -    print(f"Total: {hits.count_total}  Time: {hits.query_wall_time_ms}ms; {hits.query_time_ms}ms") +    print( +        f"Total: {hits.count_total}  Time: {hits.query_wall_time_ms}ms; {hits.query_time_ms}ms" +    )      if args.enrich == "fatcat": -        enriched = enrich_inbound_refs(hits.result_refs, hide='refs,abstracts', fatcat_api_client=args.fatcat_api_client) +        enriched = enrich_inbound_refs( +            hits.result_refs, hide="refs,abstracts", fatcat_api_client=args.fatcat_api_client +        )          for ref in enriched:              if ref.release: -                print(f"release_{ref.release.ident}\t{ref.ref.match_provenance}/{ref.ref.match_status}\t{ref.release.release_year or '-'}\t{ref.release.title}\t{ref.release.ext_ids.pmid or ref.release.ext_ids.doi or '-'}") +                print( +                    f"release_{ref.release.ident}\t{ref.ref.match_provenance}/{ref.ref.match_status}\t{ref.release.release_year or '-'}\t{ref.release.title}\t{ref.release.ext_ids.pmid or ref.release.ext_ids.doi or '-'}" +                )              else:                  print(f"release_{ref.target_release_ident}")      else:          for ref in hits.result_refs:              print(f"work_{ref.source_work_ident}\trelease_{ref.source_release_ident}") +  def main() -> None:      """      Run this utility 
like: @@ -395,9 +446,7 @@ def main() -> None:          python -m fatcat_tools.references query release_pfrind3kh5hqhgqkueulk2tply      """ -    parser = argparse.ArgumentParser( -        formatter_class=argparse.ArgumentDefaultsHelpFormatter -    ) +    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)      subparsers = parser.add_subparsers()      parser.add_argument("--fatcat-api-base", default="https://api.fatcat.wiki/v0") @@ -425,5 +474,6 @@ def main() -> None:      else:          raise NotImplementedError(args.func) +  if __name__ == "__main__":      main() diff --git a/python/fatcat_tools/reviewers/review_common.py b/python/fatcat_tools/reviewers/review_common.py index 867d826d..59ff1c4e 100644 --- a/python/fatcat_tools/reviewers/review_common.py +++ b/python/fatcat_tools/reviewers/review_common.py @@ -1,4 +1,3 @@ -  import datetime  import subprocess  import time @@ -34,8 +33,8 @@ class CheckResult:          self.status = status          self.check_type = check_type          self.description = description -        self.ident = kwargs.get('ident') -        self.rev = kwargs.get('rev') +        self.ident = kwargs.get("ident") +        self.rev = kwargs.get("rev")      def __repr__(self):          return str(self.__dict__) @@ -72,17 +71,17 @@ class EditCheck:  class ReviewBot: -      def __init__(self, api, verbose=False, **kwargs):          self.api = api          self.checks = []          self.verbose = verbose -        self.extra = kwargs.get('extra', dict()) -        self.extra['git_rev'] = self.extra.get('git_rev', -            subprocess.check_output(["git", "describe", "--always"]).strip()).decode('utf-8') -        self.extra['agent'] = self.extra.get('agent', 'fatcat_tools.ReviewBot') -        self.poll_interval = kwargs.get('poll_interval', 10.0) +        self.extra = kwargs.get("extra", dict()) +        self.extra["git_rev"] = self.extra.get( +            "git_rev", subprocess.check_output(["git", "describe", "--always"]).strip() +        ).decode("utf-8") +        self.extra["agent"] = self.extra.get("agent", "fatcat_tools.ReviewBot") +        self.poll_interval = kwargs.get("poll_interval", 10.0)      def run_single(self, editgroup_id, annotate=True):          eg = self.api.get_editgroup(editgroup_id) @@ -96,7 +95,9 @@ class ReviewBot:              since = datetime.datetime.utcnow()          while True:              # XXX: better isoformat conversion? 
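# One possible answer to the XXX above, sketched here rather than changed in place: for
# the naive UTC datetime produced by datetime.datetime.utcnow(), zeroing out microseconds
# yields the same second-resolution string as the isoformat()[:19] slice used below,
# without the magic number.
import datetime

since = datetime.datetime.utcnow()
# identical to since.isoformat()[:19] + "Z" for a naive (tzinfo-less) datetime:
since_str = since.replace(microsecond=0).isoformat() + "Z"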
-            eg_list = self.api.get_editgroups_reviewable(since=since.isoformat()[:19] + "Z", limit=100) +            eg_list = self.api.get_editgroups_reviewable( +                since=since.isoformat()[:19] + "Z", limit=100 +            )              if not eg_list:                  print("Sleeping {} seconds...".format(self.poll_interval))                  time.sleep(self.poll_interval) @@ -104,8 +105,11 @@ class ReviewBot:              for eg in eg_list:                  # TODO: fetch annotations to ensure we haven't already annotated                  annotation = self.review_editgroup(eg) -                print("Reviewed {} disposition:{}".format( -                    eg.editgroup_id, annotation.extra['disposition'])) +                print( +                    "Reviewed {} disposition:{}".format( +                        eg.editgroup_id, annotation.extra["disposition"] +                    ) +                )                  self.api.create_editgroup_annotation(eg.editgroup_id, annotation)                  since = eg.submitted              # to prevent busy loops (TODO: needs review/rethink; multiple @@ -125,10 +129,9 @@ class ReviewBot:          else:              raise ValueError -        for (status, title) in (('fail', 'Failed check'), ('warning', 'Warnings')): +        for (status, title) in (("fail", "Failed check"), ("warning", "Warnings")):              if result_counts[status] > 0: -                comment += "\n\n### {} ({}):\n".format( -                    status, result_counts[status]) +                comment += "\n\n### {} ({}):\n".format(status, result_counts[status])              for result in results:                  if result.status == status and result.check_type == "editgroup":                      comment += "\n- {description}".format(description=result.description) @@ -137,15 +140,18 @@ class ReviewBot:                          check_type=result.check_type,                          rev=result.rev,                          entity_type=result.check_type, -                        description=result.description) +                        description=result.description, +                    )          extra = self.extra.copy() -        extra.update({ -            "disposition": disposition, -            "submit_timestamp": editgroup.submitted.isoformat(), -            "checks": [check.name for check in self.checks], -            "result_counts": dict(result_counts), -        }) +        extra.update( +            { +                "disposition": disposition, +                "submit_timestamp": editgroup.submitted.isoformat(), +                "checks": [check.name for check in self.checks], +                "result_counts": dict(result_counts), +            } +        )          annotation = fatcat_openapi_client.EditgroupAnnotation(              comment_markdown=comment,              editgroup_id=editgroup.editgroup_id, @@ -156,7 +162,7 @@ class ReviewBot:      def result_counts(self, results):          counts = Counter()          for result in results: -            counts['total'] += 1 +            counts["total"] += 1              counts[result.status] += 1          return counts @@ -217,13 +223,18 @@ class DummyCheck(EditCheck):      name = "DummyCheck"      def check_editgroup(self, editgroup): -        return CheckResult("pass", "editgroup", +        return CheckResult( +            "pass", +            "editgroup",              "every edit is precious, thanks [editor {editor_id}](/editor/{editor_id})!".format( -                
editor_id=editgroup.editor_id)) +                editor_id=editgroup.editor_id +            ), +        )      def check_work(self, entity, edit):          return CheckResult("pass", "work", "this work edit is beautiful") +  class DummyReviewBot(ReviewBot):      """      This bot reviews everything and always passes. diff --git a/python/fatcat_tools/transforms/access.py b/python/fatcat_tools/transforms/access.py index ae9880e7..34212a6a 100644 --- a/python/fatcat_tools/transforms/access.py +++ b/python/fatcat_tools/transforms/access.py @@ -1,4 +1,3 @@ -  from enum import Enum  from typing import List, Optional @@ -16,6 +15,7 @@ class AccessType(str, Enum):      openlibrary = "openlibrary"      wikipedia = "wikipedia" +  class AccessOption(BaseModel):      access_type: AccessType @@ -40,27 +40,31 @@ def release_access_options(release: ReleaseEntity) -> List[AccessOption]:      option found      """      options = [] -    for f in (release.files or []): +    for f in release.files or []:          thumbnail_url = None -        if f.mimetype == 'application/pdf' and f.sha1 and f.urls: +        if f.mimetype == "application/pdf" and f.sha1 and f.urls:              # NOTE: scholar.archive.org does an actual database check before              # generating these URLs, but we skip that for speed              thumbnail_url = f"https://blobs.fatcat.wiki/thumbnail/pdf/{f.sha1[0:2]}/{f.sha1[2:4]}/{f.sha1}.180px.jpg" -        for u in (f.urls or []): -            if '://web.archive.org/' in u.url: -                return [AccessOption( -                    access_type="wayback", -                    access_url=u.url, -                    mimetype=f.mimetype, -                    size_bytes=f.size, -                    thumbnail_url=thumbnail_url, -                )] -            elif '://archive.org/' in u.url: -                return [AccessOption( -                    access_type="ia_file", -                    access_url=u.url, -                    mimetype=f.mimetype, -                    size_bytes=f.size, -                    thumbnail_url=thumbnail_url, -                )] +        for u in f.urls or []: +            if "://web.archive.org/" in u.url: +                return [ +                    AccessOption( +                        access_type="wayback", +                        access_url=u.url, +                        mimetype=f.mimetype, +                        size_bytes=f.size, +                        thumbnail_url=thumbnail_url, +                    ) +                ] +            elif "://archive.org/" in u.url: +                return [ +                    AccessOption( +                        access_type="ia_file", +                        access_url=u.url, +                        mimetype=f.mimetype, +                        size_bytes=f.size, +                        thumbnail_url=thumbnail_url, +                    ) +                ]      return options diff --git a/python/fatcat_tools/transforms/csl.py b/python/fatcat_tools/transforms/csl.py index f8b26bce..2b39068a 100644 --- a/python/fatcat_tools/transforms/csl.py +++ b/python/fatcat_tools/transforms/csl.py @@ -1,4 +1,3 @@ -  import json  from citeproc import ( @@ -13,10 +12,10 @@ from citeproc_styles import get_style_filepath  def contribs_by_role(contribs, role): -    ret = [c.copy() for c in contribs if c['role'] == role] -    [c.pop('role') for c in ret] +    ret = [c.copy() for c in contribs if c["role"] == role] +    [c.pop("role") for c in ret]      # TODO: some note to self here -    [c.pop('literal') for c in 
ret if 'literal' in c] +    [c.pop("literal") for c in ret if "literal" in c]      if not ret:          return None      else: @@ -33,26 +32,30 @@ def release_to_csl(entity):      Follows, but not enforced by: https://github.com/citation-style-language/schema/blob/master/csl-data.json      """      contribs = [] -    for contrib in (entity.contribs or []): +    for contrib in entity.contribs or []:          if contrib.creator:              # Default to "local" (publication-specific) metadata; fall back to              # creator-level -            family = contrib.creator.surname or contrib.surname or (contrib.raw_name and contrib.raw_name.split()[-1]) +            family = ( +                contrib.creator.surname +                or contrib.surname +                or (contrib.raw_name and contrib.raw_name.split()[-1]) +            )              if not family:                  # CSL requires some surname (family name)                  continue              c = dict(                  family=family,                  given=contrib.creator.given_name or contrib.given_name, -                #dropping-particle -                #non-dropping-particle -                #suffix -                #comma-suffix -                #static-ordering +                # dropping-particle +                # non-dropping-particle +                # suffix +                # comma-suffix +                # static-ordering                  literal=contrib.creator.display_name or contrib.raw_name, -                #parse-names, +                # parse-names,                  # role must be defined; default to author -                role=contrib.role or 'author', +                role=contrib.role or "author",              )          else:              family = contrib.surname or (contrib.raw_name and contrib.raw_name.split()[-1]) @@ -64,7 +67,7 @@ def release_to_csl(entity):                  given=contrib.given_name,                  literal=contrib.raw_name,                  # role must be defined; default to author -                role=contrib.role or 'author', +                role=contrib.role or "author",              )          for k in list(c.keys()):              if not c[k]: @@ -78,93 +81,108 @@ def release_to_csl(entity):      issued_date = None      if entity.release_date: -        issued_date = {"date-parts": [[ -            entity.release_date.year, -            entity.release_date.month, -            entity.release_date.day, -        ]]} +        issued_date = { +            "date-parts": [ +                [ +                    entity.release_date.year, +                    entity.release_date.month, +                    entity.release_date.day, +                ] +            ] +        }      elif entity.release_year:          issued_date = {"date-parts": [[entity.release_year]]}      csl = dict( -        #id, -        #categories -        type=entity.release_type or "article", # can't be blank +        # id, +        # categories +        type=entity.release_type or "article",  # can't be blank          language=entity.language, -        #journalAbbreviation -        #shortTitle +        # journalAbbreviation +        # shortTitle          ## see below for all contrib roles -        #accessed -        #container -        #event-date +        # accessed +        # container +        # event-date          issued=issued_date, -        #original-date -        #submitted +        # original-date +        # submitted          abstract=abstract, -        #annote -        #archive -        
#archive_location -        #archive-place -        #authority -        #call-number -        #chapter-number -        #citation-number -        #citation-label -        #collection-number -        #collection-title +        # annote +        # archive +        # archive_location +        # archive-place +        # authority +        # call-number +        # chapter-number +        # citation-number +        # citation-label +        # collection-number +        # collection-title          container_title=entity.container and entity.container.name, -        #container-title-short -        #dimensions +        # container-title-short +        # dimensions          DOI=entity.ext_ids.doi, -        #edition -        #event -        #event-place -        #first-reference-note-number -        #genre +        # edition +        # event +        # event-place +        # first-reference-note-number +        # genre          ISBN=entity.ext_ids.isbn13,          ISSN=entity.container and entity.container.issnl,          issue=entity.issue, -        #jurisdiction -        #keyword -        #locator -        #medium -        #note -        #number -        #number-of-pages -        #number-of-volumes -        #original-publisher -        #original-publisher-place -        #original-title +        # jurisdiction +        # keyword +        # locator +        # medium +        # note +        # number +        # number-of-pages +        # number-of-volumes +        # original-publisher +        # original-publisher-place +        # original-title          # TODO: page=entity.pages, -        page_first=entity.pages and entity.pages.split('-')[0], +        page_first=entity.pages and entity.pages.split("-")[0],          PMCID=entity.ext_ids.pmcid,          PMID=entity.ext_ids.pmid,          publisher=(entity.container and entity.container.publisher) or entity.publisher, -        #publisher-place -        #references -        #reviewed-title -        #scale -        #section -        #source -        #status +        # publisher-place +        # references +        # reviewed-title +        # scale +        # section +        # source +        # status          title=entity.title, -        #title-short -        #URL -        #version +        # title-short +        # URL +        # version          volume=entity.volume, -        #year-suffix +        # year-suffix      ) -    for role in ['author', 'collection-editor', 'composer', 'container-author', -            'director', 'editor', 'editorial-director', 'interviewer', -            'illustrator', 'original-author', 'recipient', 'reviewed-author', -            'translator']: +    for role in [ +        "author", +        "collection-editor", +        "composer", +        "container-author", +        "director", +        "editor", +        "editorial-director", +        "interviewer", +        "illustrator", +        "original-author", +        "recipient", +        "reviewed-author", +        "translator", +    ]:          cbr = contribs_by_role(contribs, role)          if cbr:              csl[role] = cbr      # underline-to-dash -    csl['container-title'] = csl.pop('container_title') -    csl['page-first'] = csl.pop('page_first') -    empty_keys = [k for k,v in csl.items() if not v] +    csl["container-title"] = csl.pop("container_title") +    csl["page-first"] = csl.pop("page_first") +    empty_keys = [k for k, v in csl.items() if not v]      for k in empty_keys:          csl.pop(k)      return csl @@ -184,10 +202,11 @@ def refs_to_csl(entity):         
         title=ref.title,                  issued=issued_date,              ) -        csl['id'] = ref.key or ref.index, # zero- or one-indexed? +        csl["id"] = (ref.key or ref.index,)  # zero- or one-indexed?          ret.append(csl)      return ret +  def citeproc_csl(csl_json, style, html=False):      """      Renders a release entity to a styled citation. @@ -200,8 +219,8 @@ def citeproc_csl(csl_json, style, html=False):      Returns a string; if the html flag is set, and the style isn't 'csl-json'      or 'bibtex', it will be HTML. Otherwise plain text.      """ -    if not csl_json.get('id'): -        csl_json['id'] = "unknown" +    if not csl_json.get("id"): +        csl_json["id"] = "unknown"      if style == "csl-json":          return json.dumps(csl_json)      bib_src = CiteProcJSON([csl_json]) @@ -211,7 +230,7 @@ def citeproc_csl(csl_json, style, html=False):      style_path = get_style_filepath(style)      bib_style = CitationStylesStyle(style_path, validate=False)      bib = CitationStylesBibliography(bib_style, bib_src, form) -    bib.register(Citation([CitationItem(csl_json['id'])])) +    bib.register(Citation([CitationItem(csl_json["id"])]))      lines = bib.bibliography()[0]      if style == "bibtex":          out = "" @@ -222,6 +241,6 @@ def citeproc_csl(csl_json, style, html=False):                  out += "\n " + line              else:                  out += line -        return ''.join(out) +        return "".join(out)      else: -        return ''.join(lines) +        return "".join(lines) diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index 1826d4eb..e39e9ea4 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -1,4 +1,3 @@ -  import datetime  from typing import Any, Dict, Optional @@ -13,13 +12,14 @@ from fatcat_openapi_client import (  def check_kbart(year: int, archive: dict) -> Optional[bool]: -    if not archive or not archive.get('year_spans'): +    if not archive or not archive.get("year_spans"):          return None -    for span in archive['year_spans']: +    for span in archive["year_spans"]:          if year >= span[0] and year <= span[1]:              return True      return False +  def test_check_kbart() -> None:      assert check_kbart(1990, dict()) is None @@ -40,87 +40,89 @@ def release_to_elasticsearch(entity: ReleaseEntity, force_bool: bool = True) ->      Raises exception on error (never returns None)      """ -    if entity.state in ('redirect', 'deleted'): +    if entity.state in ("redirect", "deleted"):          return dict( -            ident = entity.ident, -            state = entity.state, +            ident=entity.ident, +            state=entity.state,          ) -    elif entity.state != 'active': +    elif entity.state != "active":          raise ValueError("Unhandled entity state: {}".format(entity.state))      # First, the easy ones (direct copy)      release = entity      t: Dict[str, Any] = dict( -        doc_index_ts=datetime.datetime.utcnow().isoformat()+"Z", -        ident = release.ident, -        state = release.state, -        revision = release.revision, -        work_id = release.work_id, -        title = release.title, -        subtitle = release.subtitle, -        original_title = release.original_title, -        release_type = release.release_type, -        release_stage = release.release_stage, -        withdrawn_status = release.withdrawn_status, -        language = release.language, -  
      volume = release.volume, -        issue = release.issue, -        pages = release.pages, -        number = release.number, -        license = release.license_slug, -        version = release.version, -        doi = release.ext_ids.doi, -        pmid = release.ext_ids.pmid, -        pmcid = release.ext_ids.pmcid, -        isbn13 = release.ext_ids.isbn13, -        wikidata_qid = release.ext_ids.wikidata_qid, -        core_id = release.ext_ids.core, -        arxiv_id = release.ext_ids.arxiv, -        jstor_id = release.ext_ids.jstor, -        ark_id = release.ext_ids.ark, -        mag_id = release.ext_ids.mag, -        dblp_id = release.ext_ids.dblp, -        doaj_id = release.ext_ids.doaj, -        hdl = release.ext_ids.hdl, -        tags = [], +        doc_index_ts=datetime.datetime.utcnow().isoformat() + "Z", +        ident=release.ident, +        state=release.state, +        revision=release.revision, +        work_id=release.work_id, +        title=release.title, +        subtitle=release.subtitle, +        original_title=release.original_title, +        release_type=release.release_type, +        release_stage=release.release_stage, +        withdrawn_status=release.withdrawn_status, +        language=release.language, +        volume=release.volume, +        issue=release.issue, +        pages=release.pages, +        number=release.number, +        license=release.license_slug, +        version=release.version, +        doi=release.ext_ids.doi, +        pmid=release.ext_ids.pmid, +        pmcid=release.ext_ids.pmcid, +        isbn13=release.ext_ids.isbn13, +        wikidata_qid=release.ext_ids.wikidata_qid, +        core_id=release.ext_ids.core, +        arxiv_id=release.ext_ids.arxiv, +        jstor_id=release.ext_ids.jstor, +        ark_id=release.ext_ids.ark, +        mag_id=release.ext_ids.mag, +        dblp_id=release.ext_ids.dblp, +        doaj_id=release.ext_ids.doaj, +        hdl=release.ext_ids.hdl, +        tags=[],      ) -    t.update(dict( -        is_oa = None, -        is_longtail_oa = None, -        is_preserved = None, -        in_web = False, -        in_dweb = False, -        in_ia = False, -        in_ia_sim = False, -        in_kbart = None, -        in_jstor = False, -        in_doaj= bool(release.ext_ids.doaj), -        in_shadows = False, -    )) +    t.update( +        dict( +            is_oa=None, +            is_longtail_oa=None, +            is_preserved=None, +            in_web=False, +            in_dweb=False, +            in_ia=False, +            in_ia_sim=False, +            in_kbart=None, +            in_jstor=False, +            in_doaj=bool(release.ext_ids.doaj), +            in_shadows=False, +        ) +    )      release_year = release.release_year      if release.release_date:          # .isoformat() results in, eg, '2010-10-22' (YYYY-MM-DD) -        t['release_date'] = release.release_date.isoformat() +        t["release_date"] = release.release_date.isoformat()          if not release_year:              release_year = release.release_date.year      if release_year: -        t['release_year'] = release_year +        t["release_year"] = release_year -    t['any_abstract'] = len(release.abstracts or []) > 0 -    t['ref_count'] = len(release.refs or []) +    t["any_abstract"] = len(release.abstracts or []) > 0 +    t["ref_count"] = len(release.refs or [])      ref_release_ids = [] -    for r in (release.refs or []): +    for r in release.refs or []:          if r.target_release_id:              ref_release_ids.append(r.target_release_id) 
-    t['ref_release_ids'] = ref_release_ids -    t['ref_linked_count'] = len(ref_release_ids) -    t['contrib_count'] = len(release.contribs or []) +    t["ref_release_ids"] = ref_release_ids +    t["ref_linked_count"] = len(ref_release_ids) +    t["contrib_count"] = len(release.contribs or [])      contrib_names = []      contrib_affiliations = []      creator_ids = [] -    for c in (release.contribs or []): +    for c in release.contribs or []:          if c.creator and c.creator.display_name:              contrib_names.append(c.creator.display_name)          elif c.raw_name: @@ -132,193 +134,218 @@ def release_to_elasticsearch(entity: ReleaseEntity, force_bool: bool = True) ->              creator_ids.append(c.creator_id)          if c.raw_affiliation:              contrib_affiliations.append(c.raw_affiliation) -    t['contrib_names'] = contrib_names -    t['creator_ids'] = creator_ids -    t['affiliations'] = contrib_affiliations +    t["contrib_names"] = contrib_names +    t["creator_ids"] = creator_ids +    t["affiliations"] = contrib_affiliations      # TODO: mapping... probably by lookup? -    t['affiliation_rors'] = None +    t["affiliation_rors"] = None      if release.container:          t.update(_rte_container_helper(release.container, release_year))      # fall back to release-level container metadata if container not linked or      # missing context -    if not t.get('publisher'): -        t['publisher'] = release.publisher -    if not t.get('container_name') and release.extra: -        t['container_name'] = release.extra.get('container_name') +    if not t.get("publisher"): +        t["publisher"] = release.publisher +    if not t.get("container_name") and release.extra: +        t["container_name"] = release.extra.get("container_name") -    if release.ext_ids.jstor or (release.ext_ids.doi and release.ext_ids.doi.startswith('10.2307/')): -        t['in_jstor'] = True +    if release.ext_ids.jstor or ( +        release.ext_ids.doi and release.ext_ids.doi.startswith("10.2307/") +    ): +        t["in_jstor"] = True      # transform file/fileset/webcapture related fields      t.update(_rte_content_helper(release))      if release.ext_ids.doaj: -        t['is_oa'] = True +        t["is_oa"] = True      if release.license_slug:          # TODO: more/better checks here, particularly strict *not* OA licenses          if release.license_slug.startswith("CC-"): -            t['is_oa'] = True +            t["is_oa"] = True          if release.license_slug.startswith("ARXIV-"): -            t['is_oa'] = True +            t["is_oa"] = True -    t['is_work_alias'] = None +    t["is_work_alias"] = None      extra = release.extra or dict()      if extra: -        if extra.get('is_oa'): +        if extra.get("is_oa"):              # NOTE: not actually setting this anywhere... 
but could -            t['is_oa'] = True -        if extra.get('is_work_alias') is not None: -            t['is_work_alias'] = bool(extra.get('is_work_alias')) -        if extra.get('longtail_oa'): +            t["is_oa"] = True +        if extra.get("is_work_alias") is not None: +            t["is_work_alias"] = bool(extra.get("is_work_alias")) +        if extra.get("longtail_oa"):              # sometimes set by GROBID/matcher -            t['is_oa'] = True -            t['is_longtail_oa'] = True -        if not t.get('container_name'): -            t['container_name'] = extra.get('container_name') -        if extra.get('crossref'): -            if extra['crossref'].get('archive'): +            t["is_oa"] = True +            t["is_longtail_oa"] = True +        if not t.get("container_name"): +            t["container_name"] = extra.get("container_name") +        if extra.get("crossref"): +            if extra["crossref"].get("archive"):                  # all crossref archives are KBART, I believe -                t['in_kbart'] = True +                t["in_kbart"] = True          # backwards compatible subtitle fetching -        if not t['subtitle'] and extra.get('subtitle'): -            if type(extra['subtitle']) == list: -                t['subtitle'] = extra['subtitle'][0] +        if not t["subtitle"] and extra.get("subtitle"): +            if type(extra["subtitle"]) == list: +                t["subtitle"] = extra["subtitle"][0]              else: -                t['subtitle'] = extra['subtitle'] +                t["subtitle"] = extra["subtitle"] -    t['first_page'] = None +    t["first_page"] = None      if release.pages: -        first = release.pages.split('-')[0] -        first = first.replace('p', '') +        first = release.pages.split("-")[0] +        first = first.replace("p", "")          if first.isdigit(): -            t['first_page'] = first +            t["first_page"] = first          # TODO: non-numerical first pages -    t['ia_microfilm_url'] = None -    if t['in_ia_sim']: +    t["ia_microfilm_url"] = None +    if t["in_ia_sim"]:          # TODO: determine URL somehow? I think this is in flux. Will probably          # need extra metadata in the container extra field.          # special case as a demo for now. 
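# For illustration of the special case below, with hypothetical values
# (release_year=2013, issue="4", first_page="123"): the issue number is decremented by
# one and zero-padded before being interpolated into the archive.org item path.
url = "https://archive.org/details/sim_bjog_{}-{:02d}/page/n{}".format(2013, int("4") - 1, "123")
# -> "https://archive.org/details/sim_bjog_2013-03/page/n123"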
-        if release.container_id == "hl5g6d5msjcl7hlbyyvcsbhc2u" \ -                and release.release_year in (2011, 2013) \ -                and release.issue \ -                and release.issue.isdigit() \ -                and t['first_page']: -            t['ia_microfilm_url'] = "https://archive.org/details/sim_bjog_{}-{:02d}/page/n{}".format( +        if ( +            release.container_id == "hl5g6d5msjcl7hlbyyvcsbhc2u" +            and release.release_year in (2011, 2013) +            and release.issue +            and release.issue.isdigit() +            and t["first_page"] +        ): +            t[ +                "ia_microfilm_url" +            ] = "https://archive.org/details/sim_bjog_{}-{:02d}/page/n{}".format(                  release.release_year,                  int(release.issue) - 1, -                t['first_page'], +                t["first_page"],              ) -    t['doi_registrar'] = None -    if extra and t['doi']: -        for k in ('crossref', 'datacite', 'jalc'): +    t["doi_registrar"] = None +    if extra and t["doi"]: +        for k in ("crossref", "datacite", "jalc"):              if k in extra: -                t['doi_registrar'] = k -        if 'doi_registrar' not in t: -            t['doi_registrar'] = 'crossref' +                t["doi_registrar"] = k +        if "doi_registrar" not in t: +            t["doi_registrar"] = "crossref" -    if t['doi']: -        t['doi_prefix'] = t['doi'].split('/')[0] +    if t["doi"]: +        t["doi_prefix"] = t["doi"].split("/")[0] -    if t['is_longtail_oa']: -        t['is_oa'] = True +    if t["is_longtail_oa"]: +        t["is_oa"] = True      # optionally coerce all flags from Optional[bool] to bool      if force_bool: -        for k in ('is_oa', 'is_longtail_oa', 'in_kbart', 'in_ia_sim', -                  'in_jstor', 'in_web', 'in_dweb', 'in_shadows', -                  'is_work_alias'): +        for k in ( +            "is_oa", +            "is_longtail_oa", +            "in_kbart", +            "in_ia_sim", +            "in_jstor", +            "in_web", +            "in_dweb", +            "in_shadows", +            "is_work_alias", +        ):              t[k] = bool(t[k]) -    t['in_ia'] = bool(t['in_ia']) -    t['is_preserved'] = bool( -        t['is_preserved'] -        or t['in_ia'] -        or t['in_kbart'] -        or t['in_jstor'] -        or t.get('pmcid') -        or t.get('arxiv_id') +    t["in_ia"] = bool(t["in_ia"]) +    t["is_preserved"] = bool( +        t["is_preserved"] +        or t["in_ia"] +        or t["in_kbart"] +        or t["in_jstor"] +        or t.get("pmcid") +        or t.get("arxiv_id")      ) -    if t['in_ia']: -        t['preservation'] = 'bright' -    elif t['is_preserved']: -        t['preservation'] = 'dark' -    elif t['in_shadows']: -        t['preservation'] = 'shadows_only' +    if t["in_ia"]: +        t["preservation"] = "bright" +    elif t["is_preserved"]: +        t["preservation"] = "dark" +    elif t["in_shadows"]: +        t["preservation"] = "shadows_only"      else: -        t['preservation'] = 'none' +        t["preservation"] = "none"      return t +  def _rte_container_helper(container: ContainerEntity, release_year: Optional[int]) -> dict:      """      Container metadata sub-section of release_to_elasticsearch()      """      this_year = datetime.date.today().year      t = dict() -    t['publisher'] = container.publisher -    t['container_name'] = container.name +    t["publisher"] = container.publisher +    t["container_name"] = container.name      # 
this is container.ident, not release.container_id, because there may      # be a redirect involved -    t['container_id'] = container.ident -    t['container_issnl'] = container.issnl +    t["container_id"] = container.ident +    t["container_issnl"] = container.issnl      issns = [container.issnl, container.issne, container.issnp]      issns = list(set([i for i in issns if i])) -    t['container_issns'] = issns -    t['container_type'] = container.container_type -    t['container_publication_status'] = container.publication_status +    t["container_issns"] = issns +    t["container_type"] = container.container_type +    t["container_publication_status"] = container.publication_status      if container.extra:          c_extra = container.extra -        if c_extra.get('kbart') and release_year: -            if check_kbart(release_year, c_extra['kbart'].get('jstor')): -                t['in_jstor'] = True -            if t.get('in_kbart') or t.get('in_jstor'): -                t['in_kbart'] = True -            for archive in ('portico', 'lockss', 'clockss', 'pkp_pln', -                            'hathitrust', 'scholarsportal', 'cariniana'): -                t['in_kbart'] = t.get('in_kbart') or check_kbart(release_year, c_extra['kbart'].get(archive)) +        if c_extra.get("kbart") and release_year: +            if check_kbart(release_year, c_extra["kbart"].get("jstor")): +                t["in_jstor"] = True +            if t.get("in_kbart") or t.get("in_jstor"): +                t["in_kbart"] = True +            for archive in ( +                "portico", +                "lockss", +                "clockss", +                "pkp_pln", +                "hathitrust", +                "scholarsportal", +                "cariniana", +            ): +                t["in_kbart"] = t.get("in_kbart") or check_kbart( +                    release_year, c_extra["kbart"].get(archive) +                )                  # recent KBART coverage is often not updated for the                  # current year. 
So for current-year publications, consider                  # coverage from *last* year to also be included in the                  # Keeper -                if not t.get('in_kbart') and release_year == this_year: -                    t['in_kbart'] = check_kbart(this_year - 1, c_extra['kbart'].get(archive)) - -        if c_extra.get('ia'): -            if c_extra['ia'].get('sim') and release_year: -                t['in_ia_sim'] = check_kbart(release_year, c_extra['ia']['sim']) -            if c_extra['ia'].get('longtail_oa'): -                t['is_longtail_oa'] = True -        if c_extra.get('sherpa_romeo'): -            if c_extra['sherpa_romeo'].get('color') == 'white': -                t['is_oa'] = False -        if c_extra.get('default_license') and c_extra.get('default_license').startswith('CC-'): -            t['is_oa'] = True -        if c_extra.get('doaj'): -            if c_extra['doaj'].get('as_of'): -                t['is_oa'] = True -                t['in_doaj'] = True -        if c_extra.get('road'): -            if c_extra['road'].get('as_of'): -                t['is_oa'] = True -        if c_extra.get('szczepanski'): -            if c_extra['szczepanski'].get('as_of'): -                t['is_oa'] = True -        if c_extra.get('country'): -            t['country_code'] = c_extra['country'] -            t['country_code_upper'] = c_extra['country'].upper() -        if c_extra.get('publisher_type'): -            t['publisher_type'] = c_extra['publisher_type'] -        if c_extra.get('discipline'): -            t['discipline'] = c_extra['discipline'] +                if not t.get("in_kbart") and release_year == this_year: +                    t["in_kbart"] = check_kbart(this_year - 1, c_extra["kbart"].get(archive)) + +        if c_extra.get("ia"): +            if c_extra["ia"].get("sim") and release_year: +                t["in_ia_sim"] = check_kbart(release_year, c_extra["ia"]["sim"]) +            if c_extra["ia"].get("longtail_oa"): +                t["is_longtail_oa"] = True +        if c_extra.get("sherpa_romeo"): +            if c_extra["sherpa_romeo"].get("color") == "white": +                t["is_oa"] = False +        if c_extra.get("default_license") and c_extra.get("default_license").startswith("CC-"): +            t["is_oa"] = True +        if c_extra.get("doaj"): +            if c_extra["doaj"].get("as_of"): +                t["is_oa"] = True +                t["in_doaj"] = True +        if c_extra.get("road"): +            if c_extra["road"].get("as_of"): +                t["is_oa"] = True +        if c_extra.get("szczepanski"): +            if c_extra["szczepanski"].get("as_of"): +                t["is_oa"] = True +        if c_extra.get("country"): +            t["country_code"] = c_extra["country"] +            t["country_code_upper"] = c_extra["country"].upper() +        if c_extra.get("publisher_type"): +            t["publisher_type"] = c_extra["publisher_type"] +        if c_extra.get("discipline"): +            t["discipline"] = c_extra["discipline"]      return t +  def _rte_content_helper(release: ReleaseEntity) -> dict:      """      File/FileSet/WebCapture sub-section of release_to_elasticsearch() @@ -329,9 +356,9 @@ def _rte_content_helper(release: ReleaseEntity) -> dict:      - any other URL      """      t = dict( -        file_count = len(release.files or []), -        fileset_count = len(release.filesets or []), -        webcapture_count = len(release.webcaptures or []), +        file_count=len(release.files or []), +        
fileset_count=len(release.filesets or []), +        webcapture_count=len(release.webcaptures or []),      )      any_pdf_url = None @@ -340,38 +367,42 @@ def _rte_content_helper(release: ReleaseEntity) -> dict:      ia_pdf_url = None      for f in release.files or []: -        if f.extra and f.extra.get('shadows'): -            t['in_shadows'] = True -        is_pdf = 'pdf' in (f.mimetype or '') -        for release_url in (f.urls or []): +        if f.extra and f.extra.get("shadows"): +            t["in_shadows"] = True +        is_pdf = "pdf" in (f.mimetype or "") +        for release_url in f.urls or []:              # first generic flags              t.update(_rte_url_helper(release_url))              # then PDF specific stuff (for generating "best URL" fields) -            if not f.mimetype and 'pdf' in release_url.url.lower(): +            if not f.mimetype and "pdf" in release_url.url.lower():                  is_pdf = True              if is_pdf:                  any_pdf_url = release_url.url -                if release_url.rel in ('webarchive', 'repository', 'repo'): +                if release_url.rel in ("webarchive", "repository", "repo"):                      good_pdf_url = release_url.url -                if '//web.archive.org/' in release_url.url or '//archive.org/' in release_url.url: +                if ( +                    "//web.archive.org/" in release_url.url +                    or "//archive.org/" in release_url.url +                ):                      best_pdf_url = release_url.url                      ia_pdf_url = release_url.url      # here is where we bake-in PDF url priority; IA-specific -    t['best_pdf_url'] = best_pdf_url or good_pdf_url or any_pdf_url -    t['ia_pdf_url'] = ia_pdf_url +    t["best_pdf_url"] = best_pdf_url or good_pdf_url or any_pdf_url +    t["ia_pdf_url"] = ia_pdf_url      for fs in release.filesets or []: -        for url_obj in (fs.urls or []): +        for url_obj in fs.urls or []:              t.update(_rte_url_helper(url_obj))      for wc in release.webcaptures or []: -        for url_obj in (wc.archive_urls or []): +        for url_obj in wc.archive_urls or []:              t.update(_rte_url_helper(url_obj))      return t +  def _rte_url_helper(url_obj) -> dict:      """      Takes a location URL ('url' and 'rel' keys) and returns generic preservation status. @@ -382,17 +413,17 @@ def _rte_url_helper(url_obj) -> dict:      these will be iteratively update() into the overal object.      
"""      t = dict() -    if url_obj.rel in ('webarchive', 'repository', 'archive', 'repo'): -        t['is_preserved'] = True -    if '//web.archive.org/' in url_obj.url or '//archive.org/' in url_obj.url: -        t['in_ia'] = True -    if url_obj.url.lower().startswith('http') or url_obj.url.lower().startswith('ftp'): -        t['in_web'] = True -    if url_obj.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'): +    if url_obj.rel in ("webarchive", "repository", "archive", "repo"): +        t["is_preserved"] = True +    if "//web.archive.org/" in url_obj.url or "//archive.org/" in url_obj.url: +        t["in_ia"] = True +    if url_obj.url.lower().startswith("http") or url_obj.url.lower().startswith("ftp"): +        t["in_web"] = True +    if url_obj.rel in ("dweb", "p2p", "ipfs", "dat", "torrent"):          # not sure what rel will be for this stuff -        t['in_dweb'] = True -    if '//www.jstor.org/' in url_obj.url: -        t['in_jstor'] = True +        t["in_dweb"] = True +    if "//www.jstor.org/" in url_obj.url: +        t["in_jstor"] = True      return t @@ -404,50 +435,59 @@ def container_to_elasticsearch(entity, force_bool=True, stats=None):      Raises exception on error (never returns None)      """ -    if entity.state in ('redirect', 'deleted'): +    if entity.state in ("redirect", "deleted"):          return dict( -            ident = entity.ident, -            state = entity.state, +            ident=entity.ident, +            state=entity.state,          ) -    elif entity.state != 'active': +    elif entity.state != "active":          raise ValueError("Unhandled entity state: {}".format(entity.state))      # First, the easy ones (direct copy)      t = dict( -        doc_index_ts=datetime.datetime.utcnow().isoformat()+"Z", -        ident = entity.ident, -        state = entity.state, -        revision = entity.revision, - -        name = entity.name, -        publisher = entity.publisher, -        container_type = entity.container_type, -        publication_status= entity.publication_status, -        issnl = entity.issnl, -        issne = entity.issne, -        issnp = entity.issnp, -        wikidata_qid = entity.wikidata_qid, +        doc_index_ts=datetime.datetime.utcnow().isoformat() + "Z", +        ident=entity.ident, +        state=entity.state, +        revision=entity.revision, +        name=entity.name, +        publisher=entity.publisher, +        container_type=entity.container_type, +        publication_status=entity.publication_status, +        issnl=entity.issnl, +        issne=entity.issne, +        issnp=entity.issnp, +        wikidata_qid=entity.wikidata_qid,      )      if not entity.extra:          entity.extra = dict() -    for key in ('country', 'languages', 'mimetypes', 'original_name', -                'first_year', 'last_year', 'aliases', 'abbrev', 'region', -                'discipline', 'publisher_type'): +    for key in ( +        "country", +        "languages", +        "mimetypes", +        "original_name", +        "first_year", +        "last_year", +        "aliases", +        "abbrev", +        "region", +        "discipline", +        "publisher_type", +    ):          if entity.extra.get(key):              t[key] = entity.extra[key] -    if entity.extra.get('dblp') and entity.extra['dblp'].get('prefix'): -        t['dblp_prefix'] = entity.extra['dblp']['prefix'] +    if entity.extra.get("dblp") and entity.extra["dblp"].get("prefix"): +        t["dblp_prefix"] = entity.extra["dblp"]["prefix"] -    if 'country' in t: -        
t['country_code'] = t.pop('country') +    if "country" in t: +        t["country_code"] = t.pop("country") -    t['issns'] = [entity.issnl, entity.issne, entity.issnp] -    for key in ('issnp', 'issne'): +    t["issns"] = [entity.issnl, entity.issne, entity.issnp] +    for key in ("issnp", "issne"):          if entity.extra.get(key): -            t['issns'].append(entity.extra[key]) -    t['issns'] = list(set([i for i in t['issns'] if i])) +            t["issns"].append(entity.extra[key]) +    t["issns"] = list(set([i for i in t["issns"] if i]))      in_doaj = None      in_road = None @@ -459,72 +499,72 @@ def container_to_elasticsearch(entity, force_bool=True, stats=None):      keepers = []      extra = entity.extra -    if extra.get('doaj'): -        if extra['doaj'].get('as_of'): +    if extra.get("doaj"): +        if extra["doaj"].get("as_of"):              in_doaj = True -    if extra.get('road'): -        if extra['road'].get('as_of'): +    if extra.get("road"): +        if extra["road"].get("as_of"):              in_road = True -    if extra.get('szczepanski'): -        if extra['szczepanski'].get('as_of'): +    if extra.get("szczepanski"): +        if extra["szczepanski"].get("as_of"):              is_oa = True -    if extra.get('default_license'): -        if extra['default_license'].startswith('CC-'): +    if extra.get("default_license"): +        if extra["default_license"].startswith("CC-"):              is_oa = True -    t['sherpa_romeo_color'] = None -    if extra.get('sherpa_romeo'): -        t['sherpa_romeo_color'] = extra['sherpa_romeo'].get('color') -        if extra['sherpa_romeo'].get('color') == 'white': +    t["sherpa_romeo_color"] = None +    if extra.get("sherpa_romeo"): +        t["sherpa_romeo_color"] = extra["sherpa_romeo"].get("color") +        if extra["sherpa_romeo"].get("color") == "white":              is_oa = False -    if extra.get('kbart'): +    if extra.get("kbart"):          any_kbart = True -        if extra['kbart'].get('jstor'): +        if extra["kbart"].get("jstor"):              any_jstor = True -        for k, v in extra['kbart'].items(): +        for k, v in extra["kbart"].items():              if v and isinstance(v, dict):                  keepers.append(k) -    if extra.get('ia'): -        if extra['ia'].get('sim'): +    if extra.get("ia"): +        if extra["ia"].get("sim"):              any_ia_sim = True -        if extra['ia'].get('longtail_oa'): +        if extra["ia"].get("longtail_oa"):              is_longtail_oa = True -    t['is_superceded'] = bool(extra.get('superceded')) +    t["is_superceded"] = bool(extra.get("superceded")) -    t['keepers'] = keepers -    t['in_doaj'] = bool(in_doaj) -    t['in_road'] = bool(in_road) -    t['any_kbart'] = bool(any_kbart) +    t["keepers"] = keepers +    t["in_doaj"] = bool(in_doaj) +    t["in_road"] = bool(in_road) +    t["any_kbart"] = bool(any_kbart)      if force_bool: -        t['is_oa'] = bool(in_doaj or in_road or is_oa) -        t['is_longtail_oa'] = bool(is_longtail_oa) -        t['any_jstor'] = bool(any_jstor) -        t['any_ia_sim'] = bool(any_ia_sim) +        t["is_oa"] = bool(in_doaj or in_road or is_oa) +        t["is_longtail_oa"] = bool(is_longtail_oa) +        t["any_jstor"] = bool(any_jstor) +        t["any_ia_sim"] = bool(any_ia_sim)      else: -        t['is_oa'] = in_doaj or in_road or is_oa -        t['is_longtail_oa'] = is_longtail_oa -        t['any_jstor'] = any_jstor -        t['any_ia_sim'] = any_ia_sim +        t["is_oa"] = in_doaj or in_road or is_oa +        
t["is_longtail_oa"] = is_longtail_oa +        t["any_jstor"] = any_jstor +        t["any_ia_sim"] = any_ia_sim      # mix in stats, if provided      if stats: -        t['releases_total'] = stats['total'] -        t['preservation_bright'] = stats['preservation']['bright'] -        t['preservation_dark'] = stats['preservation']['dark'] -        t['preservation_shadows_only'] = stats['preservation']['shadows_only'] -        t['preservation_none'] = stats['preservation']['none'] +        t["releases_total"] = stats["total"] +        t["preservation_bright"] = stats["preservation"]["bright"] +        t["preservation_dark"] = stats["preservation"]["dark"] +        t["preservation_shadows_only"] = stats["preservation"]["shadows_only"] +        t["preservation_none"] = stats["preservation"]["none"]      return t  def _type_of_edit(edit: EntityEdit) -> str:      if edit.revision is None and edit.redirect_ident is None: -        return 'delete' +        return "delete"      elif edit.redirect_ident:          # redirect -        return 'update' +        return "update"      elif edit.prev_revision is None and edit.redirect_ident is None and edit.revision: -        return 'create' +        return "create"      else: -        return 'update' +        return "update"  def changelog_to_elasticsearch(entity: ChangelogEntry) -> Dict[str, Any]: @@ -536,7 +576,7 @@ def changelog_to_elasticsearch(entity: ChangelogEntry) -> Dict[str, Any]:      editgroup = entity.editgroup      t = dict( -        doc_index_ts=datetime.datetime.utcnow().isoformat()+"Z", +        doc_index_ts=datetime.datetime.utcnow().isoformat() + "Z",          index=entity.index,          editgroup_id=entity.editgroup_id,          timestamp=entity.timestamp.isoformat(), @@ -547,8 +587,8 @@ def changelog_to_elasticsearch(entity: ChangelogEntry) -> Dict[str, Any]:      )      extra = editgroup.extra or dict() -    if extra.get('agent'): -        t['agent'] = extra['agent'] +    if extra.get("agent"): +        t["agent"] = extra["agent"]      containers = [_type_of_edit(e) for e in editgroup.edits.containers]      creators = [_type_of_edit(e) for e in editgroup.edits.creators] @@ -558,27 +598,27 @@ def changelog_to_elasticsearch(entity: ChangelogEntry) -> Dict[str, Any]:      releases = [_type_of_edit(e) for e in editgroup.edits.releases]      works = [_type_of_edit(e) for e in editgroup.edits.works] -    t['containers'] = len(containers) -    t['new_containers'] = len([e for e in containers if e == 'create']) -    t['creators'] = len(creators) -    t['new_creators'] = len([e for e in creators if e == 'create']) -    t['files'] = len(files) -    t['new_files'] = len([e for e in files if e == 'create']) -    t['filesets'] = len(filesets) -    t['new_filesets'] = len([e for e in filesets if e == 'create']) -    t['webcaptures'] = len(webcaptures) -    t['new_webcaptures'] = len([e for e in webcaptures if e == 'create']) -    t['releases'] = len(releases) -    t['new_releases'] = len([e for e in releases if e == 'create']) -    t['works'] = len(works) -    t['new_works'] = len([e for e in works if e == 'create']) +    t["containers"] = len(containers) +    t["new_containers"] = len([e for e in containers if e == "create"]) +    t["creators"] = len(creators) +    t["new_creators"] = len([e for e in creators if e == "create"]) +    t["files"] = len(files) +    t["new_files"] = len([e for e in files if e == "create"]) +    t["filesets"] = len(filesets) +    t["new_filesets"] = len([e for e in filesets if e == "create"]) +    t["webcaptures"] = 
len(webcaptures) +    t["new_webcaptures"] = len([e for e in webcaptures if e == "create"]) +    t["releases"] = len(releases) +    t["new_releases"] = len([e for e in releases if e == "create"]) +    t["works"] = len(works) +    t["new_works"] = len([e for e in works if e == "create"])      all_edits = containers + creators + files + filesets + webcaptures + releases + works -    t['created'] = len([e for e in all_edits if e == 'create']) -    t['updated'] = len([e for e in all_edits if e == 'update']) -    t['deleted'] = len([e for e in all_edits if e == 'delete']) -    t['total'] = len(all_edits) +    t["created"] = len([e for e in all_edits if e == "create"]) +    t["updated"] = len([e for e in all_edits if e == "update"]) +    t["deleted"] = len([e for e in all_edits if e == "delete"]) +    t["total"] = len(all_edits)      return t @@ -590,47 +630,47 @@ def file_to_elasticsearch(entity: FileEntity) -> Dict[str, Any]:      Raises exception on error (never returns None)      """ -    if entity.state in ('redirect', 'deleted'): +    if entity.state in ("redirect", "deleted"):          return dict( -            ident = entity.ident, -            state = entity.state, +            ident=entity.ident, +            state=entity.state,          ) -    elif entity.state != 'active': +    elif entity.state != "active":          raise ValueError("Unhandled entity state: {}".format(entity.state))      # First, the easy ones (direct copy)      t = dict( -        doc_index_ts=datetime.datetime.utcnow().isoformat()+"Z", -        ident = entity.ident, -        state = entity.state, -        revision = entity.revision, -        release_ids = entity.release_ids, -        release_count = len(entity.release_ids), -        mimetype = entity.mimetype, -        size_bytes = entity.size, -        sha1 = entity.sha1, -        sha256 = entity.sha256, -        md5 = entity.md5, +        doc_index_ts=datetime.datetime.utcnow().isoformat() + "Z", +        ident=entity.ident, +        state=entity.state, +        revision=entity.revision, +        release_ids=entity.release_ids, +        release_count=len(entity.release_ids), +        mimetype=entity.mimetype, +        size_bytes=entity.size, +        sha1=entity.sha1, +        sha256=entity.sha256, +        md5=entity.md5,      )      parsed_urls = [tldextract.extract(u.url) for u in entity.urls] -    t['hosts'] = list(set(['.'.join([seg for seg in pu if seg]) for pu in parsed_urls])) -    t['domains'] = list(set([pu.registered_domain for pu in parsed_urls])) -    t['rels'] = list(set([u.rel for u in entity.urls])) +    t["hosts"] = list(set([".".join([seg for seg in pu if seg]) for pu in parsed_urls])) +    t["domains"] = list(set([pu.registered_domain for pu in parsed_urls])) +    t["rels"] = list(set([u.rel for u in entity.urls])) -    t['in_ia'] = bool('archive.org' in t['domains']) -    t['in_ia_petabox'] = bool('archive.org' in t['hosts']) +    t["in_ia"] = bool("archive.org" in t["domains"]) +    t["in_ia_petabox"] = bool("archive.org" in t["hosts"])      any_url = None      good_url = None      best_url = None -    for release_url in (entity.urls or []): +    for release_url in entity.urls or []:          any_url = release_url.url -        if release_url.rel in ('webarchive', 'repository'): +        if release_url.rel in ("webarchive", "repository"):              good_url = release_url.url -        if '//web.archive.org/' in release_url.url or '//archive.org/' in release_url.url: +        if "//web.archive.org/" in release_url.url or "//archive.org/" in 
release_url.url:              best_url = release_url.url      # here is where we bake-in priority; IA-specific -    t['best_url'] = best_url or good_url or any_url +    t["best_url"] = best_url or good_url or any_url      return t diff --git a/python/fatcat_tools/transforms/ingest.py b/python/fatcat_tools/transforms/ingest.py index 9101a4ec..30b5b190 100644 --- a/python/fatcat_tools/transforms/ingest.py +++ b/python/fatcat_tools/transforms/ingest.py @@ -1,4 +1,3 @@ -  INGEST_TYPE_CONTAINER_MAP = {      # Optica      "twtpsm6ytje3nhuqfu3pa7ca7u": "html", @@ -14,7 +13,8 @@ INGEST_TYPE_CONTAINER_MAP = {      "lovwr7ladjagzkhmoaszg7efqu": "html",  } -def release_ingest_request(release, ingest_request_source='fatcat', ingest_type=None): + +def release_ingest_request(release, ingest_request_source="fatcat", ingest_type=None):      """      Takes a full release entity object and returns an ingest request (as dict),      or None if it seems like this release shouldn't be ingested. @@ -27,27 +27,35 @@ def release_ingest_request(release, ingest_request_source='fatcat', ingest_type=      calling code should check the returned type field.      """ -    if release.state != 'active': +    if release.state != "active":          return None      if (not ingest_type) and release.container_id:          ingest_type = INGEST_TYPE_CONTAINER_MAP.get(release.container_id)      if not ingest_type: -        if release.release_type == 'stub': +        if release.release_type == "stub":              return None -        elif release.release_type in ['component', 'graphic']: -            ingest_type = 'component' -        elif release.release_type == 'dataset': -            ingest_type = 'dataset' -        elif release.release_type == 'software': -            ingest_type = 'software' -        elif release.release_type == 'post-weblog': -            ingest_type = 'html' -        elif release.release_type in ['article-journal', 'article', 'chapter', 'paper-conference', 'book', 'report', 'thesis']: -            ingest_type = 'pdf' +        elif release.release_type in ["component", "graphic"]: +            ingest_type = "component" +        elif release.release_type == "dataset": +            ingest_type = "dataset" +        elif release.release_type == "software": +            ingest_type = "software" +        elif release.release_type == "post-weblog": +            ingest_type = "html" +        elif release.release_type in [ +            "article-journal", +            "article", +            "chapter", +            "paper-conference", +            "book", +            "report", +            "thesis", +        ]: +            ingest_type = "pdf"          else: -            ingest_type = 'pdf' +            ingest_type = "pdf"      # generate a URL where we expect to find fulltext      url = None @@ -59,8 +67,10 @@ def release_ingest_request(release, ingest_request_source='fatcat', ingest_type=          link_source_id = release.ext_ids.arxiv      elif release.ext_ids.pmcid and ingest_type == "pdf":          # TODO: how to tell if an author manuscript in PMC vs. published? 
-        #url = "https://www.ncbi.nlm.nih.gov/pmc/articles/{}/pdf/".format(release.ext_ids.pmcid) -        url = "http://europepmc.org/backend/ptpmcrender.fcgi?accid={}&blobtype=pdf".format(release.ext_ids.pmcid) +        # url = "https://www.ncbi.nlm.nih.gov/pmc/articles/{}/pdf/".format(release.ext_ids.pmcid) +        url = "http://europepmc.org/backend/ptpmcrender.fcgi?accid={}&blobtype=pdf".format( +            release.ext_ids.pmcid +        )          link_source = "pmc"          link_source_id = release.ext_ids.pmcid      elif release.ext_ids.doi: @@ -75,19 +85,19 @@ def release_ingest_request(release, ingest_request_source='fatcat', ingest_type=      ext_ids = dict([(k, v) for (k, v) in ext_ids.items() if v])      ingest_request = { -        'ingest_type': ingest_type, -        'ingest_request_source': ingest_request_source, -        'base_url': url, -        'release_stage': release.release_stage, -        'fatcat': { -            'release_ident': release.ident, -            'work_ident': release.work_id, +        "ingest_type": ingest_type, +        "ingest_request_source": ingest_request_source, +        "base_url": url, +        "release_stage": release.release_stage, +        "fatcat": { +            "release_ident": release.ident, +            "work_ident": release.work_id,          }, -        'ext_ids': ext_ids, +        "ext_ids": ext_ids,      }      if link_source and link_source_id: -        ingest_request['link_source'] = link_source -        ingest_request['link_source_id'] = link_source_id +        ingest_request["link_source"] = link_source +        ingest_request["link_source_id"] = link_source_id      return ingest_request diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py index a61e364c..1e4cb41d 100644 --- a/python/fatcat_tools/workers/changelog.py +++ b/python/fatcat_tools/workers/changelog.py @@ -1,4 +1,3 @@ -  import json  import time @@ -16,11 +15,9 @@ class ChangelogWorker(FatcatWorker):      """      def __init__(self, api, kafka_hosts, produce_topic, poll_interval=10.0, offset=None): -        super().__init__(kafka_hosts=kafka_hosts, -                         produce_topic=produce_topic, -                         api=api) +        super().__init__(kafka_hosts=kafka_hosts, produce_topic=produce_topic, api=api)          self.poll_interval = poll_interval -        self.offset = offset    # the fatcat changelog offset, not the kafka offset +        self.offset = offset  # the fatcat changelog offset, not the kafka offset      def run(self): @@ -31,7 +28,7 @@ class ChangelogWorker(FatcatWorker):              print("Checking for most recent changelog offset...")              msg = most_recent_message(self.produce_topic, self.kafka_config)              if msg: -                self.offset = json.loads(msg.decode('utf-8'))['index'] +                self.offset = json.loads(msg.decode("utf-8"))["index"]              else:                  self.offset = 0              print("Most recent changelog index in Kafka seems to be {}".format(self.offset)) @@ -44,28 +41,29 @@ class ChangelogWorker(FatcatWorker):                  raise KafkaException(err)          producer_conf = self.kafka_config.copy() -        producer_conf.update({ -            'delivery.report.only.error': True, -            'default.topic.config': { -                'request.required.acks': -1, # all brokers must confirm -            }, -        }) +        producer_conf.update( +            { +                "delivery.report.only.error": True, +             
   "default.topic.config": { +                    "request.required.acks": -1,  # all brokers must confirm +                }, +            } +        )          producer = Producer(producer_conf)          while True:              latest = int(self.api.get_changelog(limit=1)[0].index)              if latest > self.offset: -                print("Fetching changelogs from {} through {}".format( -                    self.offset+1, latest)) -            for i in range(self.offset+1, latest+1): +                print("Fetching changelogs from {} through {}".format(self.offset + 1, latest)) +            for i in range(self.offset + 1, latest + 1):                  cle = self.api.get_changelog_entry(i)                  obj = self.api.api_client.sanitize_for_serialization(cle)                  producer.produce(                      self.produce_topic, -                    json.dumps(obj).encode('utf-8'), +                    json.dumps(obj).encode("utf-8"),                      key=str(i),                      on_delivery=fail_fast, -                    #NOTE timestamp could be timestamp=cle.timestamp (?) +                    # NOTE timestamp could be timestamp=cle.timestamp (?)                  )                  self.offset = i              producer.flush() @@ -79,12 +77,19 @@ class EntityUpdatesWorker(FatcatWorker):      from API) to update topics.      """ -    def __init__(self, api, kafka_hosts, consume_topic, release_topic, -            file_topic, container_topic, ingest_file_request_topic, -            work_ident_topic, poll_interval=5.0): -        super().__init__(kafka_hosts=kafka_hosts, -                         consume_topic=consume_topic, -                         api=api) +    def __init__( +        self, +        api, +        kafka_hosts, +        consume_topic, +        release_topic, +        file_topic, +        container_topic, +        ingest_file_request_topic, +        work_ident_topic, +        poll_interval=5.0, +    ): +        super().__init__(kafka_hosts=kafka_hosts, consume_topic=consume_topic, api=api)          self.release_topic = release_topic          self.file_topic = file_topic          self.container_topic = container_topic @@ -150,7 +155,7 @@ class EntityUpdatesWorker(FatcatWorker):              # Transactions of the Japan Society of Mechanical Engineers              "10.1299/kikai",              # protocols.io -            "10.17504/" +            "10.17504/",          ]      def want_live_ingest(self, release, ingest_request): @@ -163,40 +168,40 @@ class EntityUpdatesWorker(FatcatWorker):          ingest crawling (via wayback SPN).          
""" -        link_source = ingest_request.get('ingest_request') -        ingest_type = ingest_request.get('ingest_type') -        doi = ingest_request.get('ext_ids', {}).get('doi') +        link_source = ingest_request.get("ingest_request") +        ingest_type = ingest_request.get("ingest_type") +        doi = ingest_request.get("ext_ids", {}).get("doi")          es = release_to_elasticsearch(release)          is_document = release.release_type in ( -            'article', -            'article-journal', -            'article-newspaper', -            'book', -            'chapter', -            'editorial', -            'interview', -            'legal_case', -            'legislation', -            'letter', -            'manuscript', -            'paper-conference', -            'patent', -            'peer_review', -            'post', -            'report', -            'retraction', -            'review', -            'review-book', -            'thesis', +            "article", +            "article-journal", +            "article-newspaper", +            "book", +            "chapter", +            "editorial", +            "interview", +            "legal_case", +            "legislation", +            "letter", +            "manuscript", +            "paper-conference", +            "patent", +            "peer_review", +            "post", +            "report", +            "retraction", +            "review", +            "review-book", +            "thesis",          )          is_not_pdf = release.release_type in ( -            'component', -            'dataset', -            'figure', -            'graphic', -            'software', -            'stub', +            "component", +            "dataset", +            "figure", +            "graphic", +            "software", +            "stub",          )          # accept list sets a default "crawl it" despite OA metadata for @@ -207,19 +212,23 @@ class EntityUpdatesWorker(FatcatWorker):                  if doi.startswith(prefix):                      in_acceptlist = True -        if self.ingest_oa_only and link_source not in ('arxiv', 'pmc'): +        if self.ingest_oa_only and link_source not in ("arxiv", "pmc"):              # most datacite documents are in IRs and should be crawled              is_datacite_doc = False -            if release.extra and ('datacite' in release.extra) and is_document: +            if release.extra and ("datacite" in release.extra) and is_document:                  is_datacite_doc = True -            if not (es['is_oa'] or in_acceptlist or is_datacite_doc): +            if not (es["is_oa"] or in_acceptlist or is_datacite_doc):                  return False          # big publishers *generally* have accurate OA metadata, use          # preservation networks, and block our crawlers. 
So unless OA, or          # explicitly on accept list, or not preserved, skip crawling -        if es.get('publisher_type') == 'big5' and es.get('is_preserved') and not (es['is_oa'] or in_acceptlist): +        if ( +            es.get("publisher_type") == "big5" +            and es.get("is_preserved") +            and not (es["is_oa"] or in_acceptlist) +        ):              return False          # if ingest_type is pdf but release_type is almost certainly not a PDF, @@ -233,23 +242,24 @@ class EntityUpdatesWorker(FatcatWorker):                      return False          # figshare -        if doi and (doi.startswith('10.6084/') or doi.startswith('10.25384/')): +        if doi and (doi.startswith("10.6084/") or doi.startswith("10.25384/")):              # don't crawl "most recent version" (aka "group") DOIs              if not release.version:                  return False          # zenodo -        if doi and doi.startswith('10.5281/'): +        if doi and doi.startswith("10.5281/"):              # if this is a "grouping" DOI of multiple "version" DOIs, do not crawl (will crawl the versioned DOIs) -            if release.extra and release.extra.get('relations'): -                for rel in release.extra['relations']: -                    if (rel.get('relationType') == 'HasVersion' and rel.get('relatedIdentifier', '').startswith('10.5281/')): +            if release.extra and release.extra.get("relations"): +                for rel in release.extra["relations"]: +                    if rel.get("relationType") == "HasVersion" and rel.get( +                        "relatedIdentifier", "" +                    ).startswith("10.5281/"):                          return False          return True      def run(self): -          def fail_fast(err, msg):              if err is not None:                  print("Kafka producer delivery error: {}".format(err)) @@ -278,36 +288,40 @@ class EntityUpdatesWorker(FatcatWorker):              for p in partitions:                  if p.error:                      raise KafkaException(p.error) -            print("Kafka partitions rebalanced: {} / {}".format( -                consumer, partitions)) +            print("Kafka partitions rebalanced: {} / {}".format(consumer, partitions))          consumer_conf = self.kafka_config.copy() -        consumer_conf.update({ -            'group.id': self.consumer_group, -            'on_commit': fail_fast, -            # messages don't have offset marked as stored until pushed to -            # elastic, but we do auto-commit stored offsets to broker -            'enable.auto.commit': True, -            'enable.auto.offset.store': False, -            # user code timeout; if no poll after this long, assume user code -            # hung and rebalance (default: 5min) -            'max.poll.interval.ms': 180000, -            'default.topic.config': { -                'auto.offset.reset': 'latest', -            }, -        }) +        consumer_conf.update( +            { +                "group.id": self.consumer_group, +                "on_commit": fail_fast, +                # messages don't have offset marked as stored until pushed to +                # elastic, but we do auto-commit stored offsets to broker +                "enable.auto.commit": True, +                "enable.auto.offset.store": False, +                # user code timeout; if no poll after this long, assume user code +                # hung and rebalance (default: 5min) +                "max.poll.interval.ms": 180000, +                
"default.topic.config": { +                    "auto.offset.reset": "latest", +                }, +            } +        )          consumer = Consumer(consumer_conf)          producer_conf = self.kafka_config.copy() -        producer_conf.update({ -            'delivery.report.only.error': True, -            'default.topic.config': { -                'request.required.acks': -1, # all brokers must confirm -            }, -        }) +        producer_conf.update( +            { +                "delivery.report.only.error": True, +                "default.topic.config": { +                    "request.required.acks": -1,  # all brokers must confirm +                }, +            } +        )          producer = Producer(producer_conf) -        consumer.subscribe([self.consume_topic], +        consumer.subscribe( +            [self.consume_topic],              on_assign=on_rebalance,              on_revoke=on_rebalance,          ) @@ -316,14 +330,16 @@ class EntityUpdatesWorker(FatcatWorker):          while True:              msg = consumer.poll(self.poll_interval)              if not msg: -                print("nothing new from kafka (poll_interval: {} sec)".format(self.poll_interval)) +                print( +                    "nothing new from kafka (poll_interval: {} sec)".format(self.poll_interval) +                )                  continue              if msg.error():                  raise KafkaException(msg.error()) -            cle = json.loads(msg.value().decode('utf-8')) -            #print(cle) -            print("processing changelog index {}".format(cle['index'])) +            cle = json.loads(msg.value().decode("utf-8")) +            # print(cle) +            print("processing changelog index {}".format(cle["index"]))              release_ids = []              new_release_ids = []              file_ids = [] @@ -331,27 +347,27 @@ class EntityUpdatesWorker(FatcatWorker):              webcapture_ids = []              container_ids = []              work_ids = [] -            release_edits = cle['editgroup']['edits']['releases'] +            release_edits = cle["editgroup"]["edits"]["releases"]              for re in release_edits: -                release_ids.append(re['ident']) +                release_ids.append(re["ident"])                  # filter to direct release edits which are not updates -                if not re.get('prev_revision') and not re.get('redirect_ident'): -                    new_release_ids.append(re['ident']) -            file_edits = cle['editgroup']['edits']['files'] +                if not re.get("prev_revision") and not re.get("redirect_ident"): +                    new_release_ids.append(re["ident"]) +            file_edits = cle["editgroup"]["edits"]["files"]              for e in file_edits: -                file_ids.append(e['ident']) -            fileset_edits = cle['editgroup']['edits']['filesets'] +                file_ids.append(e["ident"]) +            fileset_edits = cle["editgroup"]["edits"]["filesets"]              for e in fileset_edits: -                fileset_ids.append(e['ident']) -            webcapture_edits = cle['editgroup']['edits']['webcaptures'] +                fileset_ids.append(e["ident"]) +            webcapture_edits = cle["editgroup"]["edits"]["webcaptures"]              for e in webcapture_edits: -                webcapture_ids.append(e['ident']) -            container_edits = cle['editgroup']['edits']['containers'] +                webcapture_ids.append(e["ident"]) +            container_edits = 
cle["editgroup"]["edits"]["containers"]              for e in container_edits: -                container_ids.append(e['ident']) -            work_edits = cle['editgroup']['edits']['works'] +                container_ids.append(e["ident"]) +            work_edits = cle["editgroup"]["edits"]["works"]              for e in work_edits: -                work_ids.append(e['ident']) +                work_ids.append(e["ident"])              # TODO: do these fetches in parallel using a thread pool?              for ident in set(file_ids): @@ -363,8 +379,8 @@ class EntityUpdatesWorker(FatcatWorker):                  file_dict = self.api.api_client.sanitize_for_serialization(file_entity)                  producer.produce(                      self.file_topic, -                    json.dumps(file_dict).encode('utf-8'), -                    key=ident.encode('utf-8'), +                    json.dumps(file_dict).encode("utf-8"), +                    key=ident.encode("utf-8"),                      on_delivery=fail_fast,                  ) @@ -385,30 +401,34 @@ class EntityUpdatesWorker(FatcatWorker):                  container_dict = self.api.api_client.sanitize_for_serialization(container)                  producer.produce(                      self.container_topic, -                    json.dumps(container_dict).encode('utf-8'), -                    key=ident.encode('utf-8'), +                    json.dumps(container_dict).encode("utf-8"), +                    key=ident.encode("utf-8"),                      on_delivery=fail_fast,                  )              for ident in set(release_ids): -                release = self.api.get_release(ident, expand="files,filesets,webcaptures,container") +                release = self.api.get_release( +                    ident, expand="files,filesets,webcaptures,container" +                )                  if release.work_id:                      work_ids.append(release.work_id)                  release_dict = self.api.api_client.sanitize_for_serialization(release)                  producer.produce(                      self.release_topic, -                    json.dumps(release_dict).encode('utf-8'), -                    key=ident.encode('utf-8'), +                    json.dumps(release_dict).encode("utf-8"), +                    key=ident.encode("utf-8"),                      on_delivery=fail_fast,                  )                  # for ingest requests, filter to "new" active releases with no matched files                  if release.ident in new_release_ids: -                    ir = release_ingest_request(release, ingest_request_source='fatcat-changelog') +                    ir = release_ingest_request( +                        release, ingest_request_source="fatcat-changelog" +                    )                      if ir and not release.files and self.want_live_ingest(release, ir):                          producer.produce(                              self.ingest_file_request_topic, -                            json.dumps(ir).encode('utf-8'), -                            #key=None, +                            json.dumps(ir).encode("utf-8"), +                            # key=None,                              on_delivery=fail_fast,                          ) @@ -420,13 +440,13 @@ class EntityUpdatesWorker(FatcatWorker):                      key=key,                      type="fatcat_work",                      work_ident=ident, -                    updated=cle['timestamp'], -                    fatcat_changelog_index=cle['index'], +                
    updated=cle["timestamp"], +                    fatcat_changelog_index=cle["index"],                  )                  producer.produce(                      self.work_ident_topic, -                    json.dumps(work_ident_dict).encode('utf-8'), -                    key=key.encode('utf-8'), +                    json.dumps(work_ident_dict).encode("utf-8"), +                    key=key.encode("utf-8"),                      on_delivery=fail_fast,                  ) diff --git a/python/fatcat_tools/workers/elasticsearch.py b/python/fatcat_tools/workers/elasticsearch.py index f411073d..0d75f964 100644 --- a/python/fatcat_tools/workers/elasticsearch.py +++ b/python/fatcat_tools/workers/elasticsearch.py @@ -1,4 +1,3 @@ -  import json  import sys @@ -26,12 +25,20 @@ class ElasticsearchReleaseWorker(FatcatWorker):      Uses a consumer group to manage offset.      """ -    def __init__(self, kafka_hosts, consume_topic, poll_interval=10.0, offset=None, -            elasticsearch_backend="http://localhost:9200", elasticsearch_index="fatcat", -            elasticsearch_release_index="fatcat_releases", -            batch_size=200, api_host="https://api.fatcat.wiki/v0", query_stats=False): -        super().__init__(kafka_hosts=kafka_hosts, -                         consume_topic=consume_topic) +    def __init__( +        self, +        kafka_hosts, +        consume_topic, +        poll_interval=10.0, +        offset=None, +        elasticsearch_backend="http://localhost:9200", +        elasticsearch_index="fatcat", +        elasticsearch_release_index="fatcat_releases", +        batch_size=200, +        api_host="https://api.fatcat.wiki/v0", +        query_stats=False, +    ): +        super().__init__(kafka_hosts=kafka_hosts, consume_topic=consume_topic)          self.consumer_group = "elasticsearch-updates3"          self.batch_size = batch_size          self.poll_interval = poll_interval @@ -63,45 +70,53 @@ class ElasticsearchReleaseWorker(FatcatWorker):                      print("Bailing out...", file=sys.stderr)                      # TODO: should it be sys.exit(-1)?                      
raise KafkaException(p.error) -            #print("Kafka consumer commit successful") +            # print("Kafka consumer commit successful")              pass          def on_rebalance(consumer, partitions):              for p in partitions:                  if p.error:                      raise KafkaException(p.error) -            print("Kafka partitions rebalanced: {} / {}".format( -                consumer, partitions), file=sys.stderr) +            print( +                "Kafka partitions rebalanced: {} / {}".format(consumer, partitions), +                file=sys.stderr, +            )          consumer_conf = self.kafka_config.copy() -        consumer_conf.update({ -            'group.id': self.consumer_group, -            'on_commit': fail_fast, -            # messages don't have offset marked as stored until pushed to -            # elastic, but we do auto-commit stored offsets to broker -            'enable.auto.commit': True, -            'enable.auto.offset.store': False, -            # user code timeout; if no poll after this long, assume user code -            # hung and rebalance (default: 5min) -            'max.poll.interval.ms': 60000, -            'default.topic.config': { -                'auto.offset.reset': 'latest', -            }, -        }) +        consumer_conf.update( +            { +                "group.id": self.consumer_group, +                "on_commit": fail_fast, +                # messages don't have offset marked as stored until pushed to +                # elastic, but we do auto-commit stored offsets to broker +                "enable.auto.commit": True, +                "enable.auto.offset.store": False, +                # user code timeout; if no poll after this long, assume user code +                # hung and rebalance (default: 5min) +                "max.poll.interval.ms": 60000, +                "default.topic.config": { +                    "auto.offset.reset": "latest", +                }, +            } +        )          consumer = Consumer(consumer_conf) -        consumer.subscribe([self.consume_topic], +        consumer.subscribe( +            [self.consume_topic],              on_assign=on_rebalance,              on_revoke=on_rebalance,          )          while True: -            batch = consumer.consume( -                num_messages=self.batch_size, -                timeout=self.poll_interval) +            batch = consumer.consume(num_messages=self.batch_size, timeout=self.poll_interval)              if not batch:                  if not consumer.assignment():                      print("... no Kafka consumer partitions assigned yet", file=sys.stderr) -                print("... nothing new from kafka, try again (interval: {}".format(self.poll_interval), file=sys.stderr) +                print( +                    "... nothing new from kafka, try again (interval: {}".format( +                        self.poll_interval +                    ), +                    file=sys.stderr, +                )                  continue              print("... got {} kafka messages".format(len(batch)), file=sys.stderr)              # first check errors on entire batch... @@ -111,19 +126,24 @@ class ElasticsearchReleaseWorker(FatcatWorker):              # ... 
then process              bulk_actions = []              for msg in batch: -                json_str = msg.value().decode('utf-8') +                json_str = msg.value().decode("utf-8")                  entity = entity_from_json(json_str, self.entity_type, api_client=ac)                  assert isinstance(entity, self.entity_type)                  if self.entity_type == ChangelogEntry:                      key = entity.index                      # might need to fetch from API -                    if not (entity.editgroup and entity.editgroup.editor): # pylint: disable=no-member # (TODO) +                    if not ( +                        entity.editgroup and entity.editgroup.editor +                    ):  # pylint: disable=no-member # (TODO)                          entity = api.get_changelog_entry(entity.index)                  else:                      key = entity.ident  # pylint: disable=no-member # (TODO) -                if self.entity_type != ChangelogEntry and entity.state == 'wip': -                    print(f"WARNING: skipping state=wip entity: {self.entity_type.__name__} {entity.ident}", file=sys.stderr) +                if self.entity_type != ChangelogEntry and entity.state == "wip": +                    print( +                        f"WARNING: skipping state=wip entity: {self.entity_type.__name__} {entity.ident}", +                        file=sys.stderr, +                    )                      continue                  if self.entity_type == ContainerEntity and self.query_stats: @@ -138,9 +158,15 @@ class ElasticsearchReleaseWorker(FatcatWorker):                      doc_dict = self.transform_func(entity)                  # TODO: handle deletions from index -                bulk_actions.append(json.dumps({ -                    "index": { "_id": key, }, -                })) +                bulk_actions.append( +                    json.dumps( +                        { +                            "index": { +                                "_id": key, +                            }, +                        } +                    ) +                )                  bulk_actions.append(json.dumps(doc_dict))              # if only WIP entities, then skip @@ -149,15 +175,22 @@ class ElasticsearchReleaseWorker(FatcatWorker):                      consumer.store_offsets(message=msg)                  continue -            print("Upserting, eg, {} (of {} {} in elasticsearch)".format(key, len(batch), self.entity_type.__name__), file=sys.stderr) +            print( +                "Upserting, eg, {} (of {} {} in elasticsearch)".format( +                    key, len(batch), self.entity_type.__name__ +                ), +                file=sys.stderr, +            )              elasticsearch_endpoint = "{}/{}/_bulk".format( -                self.elasticsearch_backend, -                self.elasticsearch_index) -            resp = requests.post(elasticsearch_endpoint, +                self.elasticsearch_backend, self.elasticsearch_index +            ) +            resp = requests.post( +                elasticsearch_endpoint,                  headers={"Content-Type": "application/x-ndjson"}, -                data="\n".join(bulk_actions) + "\n") +                data="\n".join(bulk_actions) + "\n", +            )              resp.raise_for_status() -            if resp.json()['errors']: +            if resp.json()["errors"]:                  desc = "Elasticsearch errors from post to {}:".format(elasticsearch_endpoint)                  print(desc, file=sys.stderr)        
          print(resp.content, file=sys.stderr) @@ -169,20 +202,29 @@ class ElasticsearchReleaseWorker(FatcatWorker):  class ElasticsearchContainerWorker(ElasticsearchReleaseWorker): - -    def __init__(self, kafka_hosts, consume_topic, poll_interval=10.0, offset=None, -            query_stats=False, elasticsearch_release_index="fatcat_release", -            elasticsearch_backend="http://localhost:9200", elasticsearch_index="fatcat", -            batch_size=200): -        super().__init__(kafka_hosts=kafka_hosts, -                         consume_topic=consume_topic, -                         poll_interval=poll_interval, -                         offset=offset, -                         elasticsearch_backend=elasticsearch_backend, -                         elasticsearch_index=elasticsearch_index, -                         elasticsearch_release_index=elasticsearch_release_index, -                         query_stats=query_stats, -                         batch_size=batch_size) +    def __init__( +        self, +        kafka_hosts, +        consume_topic, +        poll_interval=10.0, +        offset=None, +        query_stats=False, +        elasticsearch_release_index="fatcat_release", +        elasticsearch_backend="http://localhost:9200", +        elasticsearch_index="fatcat", +        batch_size=200, +    ): +        super().__init__( +            kafka_hosts=kafka_hosts, +            consume_topic=consume_topic, +            poll_interval=poll_interval, +            offset=offset, +            elasticsearch_backend=elasticsearch_backend, +            elasticsearch_index=elasticsearch_index, +            elasticsearch_release_index=elasticsearch_release_index, +            query_stats=query_stats, +            batch_size=batch_size, +        )          # previous group got corrupted (by pykafka library?)          self.consumer_group = "elasticsearch-updates3"          self.entity_type = ContainerEntity @@ -196,11 +238,18 @@ class ElasticsearchChangelogWorker(ElasticsearchReleaseWorker):      Note: Very early versions of changelog entries did not contain details      about the editor or extra fields.      
""" -    def __init__(self, kafka_hosts, consume_topic, poll_interval=10.0, offset=None, -            elasticsearch_backend="http://localhost:9200", elasticsearch_index="fatcat_changelog", -            batch_size=200): -        super().__init__(kafka_hosts=kafka_hosts, -                         consume_topic=consume_topic) + +    def __init__( +        self, +        kafka_hosts, +        consume_topic, +        poll_interval=10.0, +        offset=None, +        elasticsearch_backend="http://localhost:9200", +        elasticsearch_index="fatcat_changelog", +        batch_size=200, +    ): +        super().__init__(kafka_hosts=kafka_hosts, consume_topic=consume_topic)          self.consumer_group = "elasticsearch-updates3"          self.batch_size = batch_size          self.poll_interval = poll_interval diff --git a/python/fatcat_tools/workers/worker_common.py b/python/fatcat_tools/workers/worker_common.py index 8c2936be..baec44f4 100644 --- a/python/fatcat_tools/workers/worker_common.py +++ b/python/fatcat_tools/workers/worker_common.py @@ -1,4 +1,3 @@ -  from confluent_kafka import Consumer, KafkaException, TopicPartition @@ -13,22 +12,21 @@ def most_recent_message(topic, kafka_config):      print("Fetching most Kafka message from {}".format(topic))      conf = kafka_config.copy() -    conf.update({ -        'group.id': 'worker-init-last-msg', # should never commit -        'delivery.report.only.error': True, -        'enable.auto.commit': False, -        'default.topic.config': { -            'request.required.acks': -1, -            'auto.offset.reset': 'latest', -        }, -    }) +    conf.update( +        { +            "group.id": "worker-init-last-msg",  # should never commit +            "delivery.report.only.error": True, +            "enable.auto.commit": False, +            "default.topic.config": { +                "request.required.acks": -1, +                "auto.offset.reset": "latest", +            }, +        } +    )      consumer = Consumer(conf) -    hwm = consumer.get_watermark_offsets( -        TopicPartition(topic, 0), -        timeout=5.0, -        cached=False) +    hwm = consumer.get_watermark_offsets(TopicPartition(topic, 0), timeout=5.0, cached=False)      if not hwm:          raise Exception("Kafka consumer timeout, or topic {} doesn't exist".format(topic))      print("High watermarks: {}".format(hwm)) @@ -37,7 +35,7 @@ def most_recent_message(topic, kafka_config):          print("topic is new; not 'most recent message'")          return None -    consumer.assign([TopicPartition(topic, 0, hwm[1]-1)]) +    consumer.assign([TopicPartition(topic, 0, hwm[1] - 1)])      msg = consumer.poll(2.0)      consumer.close()      if not msg: @@ -56,8 +54,8 @@ class FatcatWorker:          if api:              self.api = api          self.kafka_config = { -            'bootstrap.servers': kafka_hosts, -            'message.max.bytes': 20000000, # ~20 MBytes; broker-side max is ~50 MBytes +            "bootstrap.servers": kafka_hosts, +            "message.max.bytes": 20000000,  # ~20 MBytes; broker-side max is ~50 MBytes          }          self.produce_topic = produce_topic          self.consume_topic = consume_topic  | 
