diff options
43 files changed, 4020 insertions, 3194 deletions
diff --git a/python/fatcat_tools/__init__.py b/python/fatcat_tools/__init__.py index ec38a17b..6f9ee7d8 100644 --- a/python/fatcat_tools/__init__.py +++ b/python/fatcat_tools/__init__.py @@ -1,4 +1,3 @@ - from .api_auth import authenticated_api, public_api from .fcid import fcid2uuid, uuid2fcid from .kafka import kafka_fail_fast, simple_kafka_producer diff --git a/python/fatcat_tools/api_auth.py b/python/fatcat_tools/api_auth.py index bbf059c0..d8f0c46d 100644 --- a/python/fatcat_tools/api_auth.py +++ b/python/fatcat_tools/api_auth.py @@ -1,4 +1,3 @@ - import os import sys @@ -15,6 +14,7 @@ def public_api(host_uri): conf.host = host_uri return fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(conf)) + def authenticated_api(host_uri, token=None): """ Note: if this helper is called, it's implied that an actual API connection @@ -24,10 +24,11 @@ def authenticated_api(host_uri, token=None): conf = fatcat_openapi_client.Configuration() conf.host = host_uri if not token: - token = os.environ['FATCAT_API_AUTH_TOKEN'] + token = os.environ["FATCAT_API_AUTH_TOKEN"] if not token: sys.stderr.write( - 'This client requires a fatcat API token (eg, in env var FATCAT_API_AUTH_TOKEN)\n') + "This client requires a fatcat API token (eg, in env var FATCAT_API_AUTH_TOKEN)\n" + ) sys.exit(-1) conf.api_key["Authorization"] = token diff --git a/python/fatcat_tools/cleanups/__init__.py b/python/fatcat_tools/cleanups/__init__.py index 587c7b9b..0aeec977 100644 --- a/python/fatcat_tools/cleanups/__init__.py +++ b/python/fatcat_tools/cleanups/__init__.py @@ -1,3 +1,2 @@ - from .common import EntityCleaner from .files import FileCleaner diff --git a/python/fatcat_tools/cleanups/common.py b/python/fatcat_tools/cleanups/common.py index d0fcc761..26ca7bd6 100644 --- a/python/fatcat_tools/cleanups/common.py +++ b/python/fatcat_tools/cleanups/common.py @@ -1,4 +1,3 @@ - import copy import json import subprocess @@ -30,16 +29,19 @@ class EntityCleaner: def __init__(self, api, entity_type, **kwargs): - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['git_rev'] = eg_extra.get('git_rev', - subprocess.check_output(["git", "describe", "--always"]).strip()).decode('utf-8') - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.EntityCleaner') + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["git_rev"] = eg_extra.get( + "git_rev", subprocess.check_output(["git", "describe", "--always"]).strip() + ).decode("utf-8") + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.EntityCleaner") self.api = api self.entity_type = entity_type - self.dry_run_mode = kwargs.get('dry_run_mode', True) - self.edit_batch_size = kwargs.get('edit_batch_size', 50) - self.editgroup_description = kwargs.get('editgroup_description', "Generic Entity Cleaner Bot") + self.dry_run_mode = kwargs.get("dry_run_mode", True) + self.edit_batch_size = kwargs.get("edit_batch_size", 50) + self.editgroup_description = kwargs.get( + "editgroup_description", "Generic Entity Cleaner Bot" + ) self.editgroup_extra = eg_extra self.reset() self.ac = ApiClient() @@ -48,7 +50,7 @@ class EntityCleaner: print("Running in dry-run mode!") def reset(self): - self.counts = Counter({'lines': 0, 'cleaned': 0, 'updated': 0}) + self.counts = Counter({"lines": 0, "cleaned": 0, "updated": 0}) self._edit_count = 0 self._editgroup_id = None self._entity_queue = [] @@ -63,23 +65,23 @@ class EntityCleaner: Returns nothing. """ - self.counts['lines'] += 1 - if (not record): - self.counts['skip-null'] += 1 + self.counts["lines"] += 1 + if not record: + self.counts["skip-null"] += 1 return entity = entity_from_dict(record, self.entity_type, api_client=self.ac) - if entity.state != 'active': - self.counts['skip-inactive'] += 1 + if entity.state != "active": + self.counts["skip-inactive"] += 1 return cleaned = self.clean_entity(copy.deepcopy(entity)) if entity == cleaned: - self.counts['skip-clean'] += 1 + self.counts["skip-clean"] += 1 return else: - self.counts['cleaned'] += 1 + self.counts["cleaned"] += 1 if self.dry_run_mode: entity_dict = entity_to_dict(entity, api_client=self.ac) @@ -87,11 +89,13 @@ class EntityCleaner: return if entity.ident in self._idents_inflight: - raise ValueError("Entity already part of in-process update: {}".format(entity.ident)) + raise ValueError( + "Entity already part of in-process update: {}".format(entity.ident) + ) updated = self.try_update(cleaned) if updated: - self.counts['updated'] += updated + self.counts["updated"] += updated self._edit_count += updated self._idents_inflight.append(entity.ident) @@ -132,9 +136,8 @@ class EntityCleaner: if not self._editgroup_id: eg = self.api.create_editgroup( - Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra)) + Editgroup(description=self.editgroup_description, extra=self.editgroup_extra) + ) self._editgroup_id = eg.editgroup_id return self._editgroup_id diff --git a/python/fatcat_tools/cleanups/files.py b/python/fatcat_tools/cleanups/files.py index 0d275ba6..d378a91f 100644 --- a/python/fatcat_tools/cleanups/files.py +++ b/python/fatcat_tools/cleanups/files.py @@ -1,4 +1,3 @@ - from fatcat_openapi_client.models import FileEntity from fatcat_openapi_client.rest import ApiException @@ -12,14 +11,19 @@ class FileCleaner(EntityCleaner): def __init__(self, api, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "Automated cleanup of file entities (eg, remove bad URLs)" - eg_extra = kwargs.pop('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.FileCleaner') - super().__init__(api, + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Automated cleanup of file entities (eg, remove bad URLs)" + ) + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.FileCleaner") + super().__init__( + api, entity_type=FileEntity, editgroup_description=eg_desc, editgroup_extra=eg_extra, - **kwargs) + **kwargs + ) def clean_entity(self, entity): """ @@ -27,24 +31,24 @@ class FileCleaner(EntityCleaner): """ # URL has ://web.archive.org/web/None/ link => delete URL - entity.urls = [u for u in entity.urls if '://web.archive.org/web/None/' not in u.url] + entity.urls = [u for u in entity.urls if "://web.archive.org/web/None/" not in u.url] # URL has ://archive.org/ link with rel=repository => rel=archive for u in entity.urls: - if '://archive.org/' in u.url and u.rel == 'repository': - u.rel = 'archive' + if "://archive.org/" in u.url and u.rel == "repository": + u.rel = "archive" # URL has short wayback date ("2017") and another url with that as prefix => delete URL stub_wayback_urls = [] full_wayback_urls = [] for u in entity.urls: - if '://web.archive.org/web/' in u.url: - if len(u.url.split('/')[4]) <= 8: + if "://web.archive.org/web/" in u.url: + if len(u.url.split("/")[4]) <= 8: stub_wayback_urls.append(u.url) else: - full_wayback_urls.append('/'.join(u.url.split('/')[5:])) + full_wayback_urls.append("/".join(u.url.split("/")[5:])) for stub in stub_wayback_urls: - target = '/'.join(stub.split('/')[5:]) + target = "/".join(stub.split("/")[5:]) if target in full_wayback_urls: entity.urls = [u for u in entity.urls if u.url != stub] @@ -57,14 +61,14 @@ class FileCleaner(EntityCleaner): except ApiException as err: if err.status != 404: raise err - self.counts['skip-not-found'] += 1 + self.counts["skip-not-found"] += 1 return 0 - if existing.state != 'active': - self.counts['skip-existing-inactive'] += 1 + if existing.state != "active": + self.counts["skip-existing-inactive"] += 1 return 0 if existing.revision != entity.revision: - self.counts['skip-revision'] += 1 + self.counts["skip-revision"] += 1 return 0 self.api.update_file(self.get_editgroup_id(), entity.ident, entity) diff --git a/python/fatcat_tools/fcid.py b/python/fatcat_tools/fcid.py index 0987d10d..53891e5a 100644 --- a/python/fatcat_tools/fcid.py +++ b/python/fatcat_tools/fcid.py @@ -1,4 +1,3 @@ - import base64 import uuid @@ -7,18 +6,20 @@ def fcid2uuid(s): """ Converts a fatcat identifier (base32 encoded string) to a uuid.UUID object """ - s = s.split('_')[-1].upper().encode('utf-8') + s = s.split("_")[-1].upper().encode("utf-8") assert len(s) == 26 raw = base64.b32decode(s + b"======") return str(uuid.UUID(bytes=raw)).lower() + def uuid2fcid(s): """ Converts a uuid.UUID object to a fatcat identifier (base32 encoded string) """ raw = uuid.UUID(s).bytes - return base64.b32encode(raw)[:26].lower().decode('utf-8') + return base64.b32encode(raw)[:26].lower().decode("utf-8") + def test_fcid(): - test_uuid = '00000000-0000-0000-3333-000000000001' + test_uuid = "00000000-0000-0000-3333-000000000001" assert test_uuid == fcid2uuid(uuid2fcid(test_uuid)) diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py index d441d495..dd48e256 100644 --- a/python/fatcat_tools/harvest/doi_registrars.py +++ b/python/fatcat_tools/harvest/doi_registrars.py @@ -1,4 +1,3 @@ - import json import sys import time @@ -59,29 +58,35 @@ class HarvestCrossrefWorker: to be careful how state is serialized back into kafka. """ - def __init__(self, kafka_hosts, produce_topic, state_topic, contact_email, - api_host_url="https://api.crossref.org/works", start_date=None, - end_date=None): + def __init__( + self, + kafka_hosts, + produce_topic, + state_topic, + contact_email, + api_host_url="https://api.crossref.org/works", + start_date=None, + end_date=None, + ): self.api_host_url = api_host_url self.produce_topic = produce_topic self.state_topic = state_topic self.contact_email = contact_email self.kafka_config = { - 'bootstrap.servers': kafka_hosts, - 'message.max.bytes': 20000000, # ~20 MBytes; broker is ~50 MBytes + "bootstrap.servers": kafka_hosts, + "message.max.bytes": 20000000, # ~20 MBytes; broker is ~50 MBytes } self.state = HarvestState(start_date, end_date) self.state.initialize_from_kafka(self.state_topic, self.kafka_config) - self.loop_sleep = 60*60 # how long to wait, in seconds, between date checks + self.loop_sleep = 60 * 60 # how long to wait, in seconds, between date checks self.api_batch_size = 50 self.name = "Crossref" self.producer = self._kafka_producer() def _kafka_producer(self): - def fail_fast(err, msg): if err is not None: print("Kafka producer delivery error: {}".format(err), file=sys.stderr) @@ -92,46 +97,53 @@ class HarvestCrossrefWorker: self._kafka_fail_fast = fail_fast producer_conf = self.kafka_config.copy() - producer_conf.update({ - 'delivery.report.only.error': True, - 'default.topic.config': { - 'request.required.acks': -1, # all brokers must confirm - }, - }) + producer_conf.update( + { + "delivery.report.only.error": True, + "default.topic.config": { + "request.required.acks": -1, # all brokers must confirm + }, + } + ) return Producer(producer_conf) def params(self, date_str): - filter_param = 'from-update-date:{},until-update-date:{}'.format( - date_str, date_str) + filter_param = "from-update-date:{},until-update-date:{}".format(date_str, date_str) return { - 'filter': filter_param, - 'rows': self.api_batch_size, - 'cursor': '*', + "filter": filter_param, + "rows": self.api_batch_size, + "cursor": "*", } def update_params(self, params, resp): - params['cursor'] = resp['message']['next-cursor'] + params["cursor"] = resp["message"]["next-cursor"] return params def extract_key(self, obj): - return obj['DOI'].encode('utf-8') + return obj["DOI"].encode("utf-8") def fetch_date(self, date): date_str = date.isoformat() params = self.params(date_str) http_session = requests_retry_session() - http_session.headers.update({ - 'User-Agent': 'fatcat_tools/0.1.0 (https://fatcat.wiki; mailto:{}) python-requests'.format( - self.contact_email), - }) + http_session.headers.update( + { + "User-Agent": "fatcat_tools/0.1.0 (https://fatcat.wiki; mailto:{}) python-requests".format( + self.contact_email + ), + } + ) count = 0 while True: http_resp = http_session.get(self.api_host_url, params=params) if http_resp.status_code == 503: # crude backoff; now redundant with session exponential # backoff, but allows for longer backoff/downtime on remote end - print("got HTTP {}, pausing for 30 seconds".format(http_resp.status_code), file=sys.stderr) + print( + "got HTTP {}, pausing for 30 seconds".format(http_resp.status_code), + file=sys.stderr, + ) # keep kafka producer connection alive self.producer.poll(0) time.sleep(30.0) @@ -143,19 +155,27 @@ class HarvestCrossrefWorker: except json.JSONDecodeError as exc: # Datacite API returned HTTP 200, but JSON seemed unparseable. # It might be a glitch, so we retry. - print("failed to decode body from {}: {}".format(http_resp.url, resp_body), file=sys.stderr) + print( + "failed to decode body from {}: {}".format(http_resp.url, resp_body), + file=sys.stderr, + ) raise exc items = self.extract_items(resp) count += len(items) - print("... got {} ({} of {}), HTTP fetch took {}".format(len(items), count, - self.extract_total(resp), http_resp.elapsed), file=sys.stderr) - #print(json.dumps(resp)) + print( + "... got {} ({} of {}), HTTP fetch took {}".format( + len(items), count, self.extract_total(resp), http_resp.elapsed + ), + file=sys.stderr, + ) + # print(json.dumps(resp)) for work in items: self.producer.produce( self.produce_topic, - json.dumps(work).encode('utf-8'), + json.dumps(work).encode("utf-8"), key=self.extract_key(work), - on_delivery=self._kafka_fail_fast) + on_delivery=self._kafka_fail_fast, + ) self.producer.poll(0) if len(items) < self.api_batch_size: break @@ -163,10 +183,10 @@ class HarvestCrossrefWorker: self.producer.flush() def extract_items(self, resp): - return resp['message']['items'] + return resp["message"]["items"] def extract_total(self, resp): - return resp['message']['total-results'] + return resp["message"]["total-results"] def run(self, continuous=False): @@ -175,9 +195,9 @@ class HarvestCrossrefWorker: if current: print("Fetching DOIs updated on {} (UTC)".format(current), file=sys.stderr) self.fetch_date(current) - self.state.complete(current, - kafka_topic=self.state_topic, - kafka_config=self.kafka_config) + self.state.complete( + current, kafka_topic=self.state_topic, kafka_config=self.kafka_config + ) continue if continuous: @@ -200,16 +220,25 @@ class HarvestDataciteWorker(HarvestCrossrefWorker): could/should use this script for that, and dump to JSON? """ - def __init__(self, kafka_hosts, produce_topic, state_topic, contact_email, - api_host_url="https://api.datacite.org/dois", - start_date=None, end_date=None): - super().__init__(kafka_hosts=kafka_hosts, - produce_topic=produce_topic, - state_topic=state_topic, - api_host_url=api_host_url, - contact_email=contact_email, - start_date=start_date, - end_date=end_date) + def __init__( + self, + kafka_hosts, + produce_topic, + state_topic, + contact_email, + api_host_url="https://api.datacite.org/dois", + start_date=None, + end_date=None, + ): + super().__init__( + kafka_hosts=kafka_hosts, + produce_topic=produce_topic, + state_topic=state_topic, + api_host_url=api_host_url, + contact_email=contact_email, + start_date=start_date, + end_date=end_date, + ) # for datecite, it's "from-update-date" self.name = "Datacite" @@ -219,19 +248,21 @@ class HarvestDataciteWorker(HarvestCrossrefWorker): Dates have to be supplied in 2018-10-27T22:36:30.000Z format. """ return { - 'query': 'updated:[{}T00:00:00.000Z TO {}T23:59:59.999Z]'.format(date_str, date_str), - 'page[size]': self.api_batch_size, - 'page[cursor]': 1, + "query": "updated:[{}T00:00:00.000Z TO {}T23:59:59.999Z]".format( + date_str, date_str + ), + "page[size]": self.api_batch_size, + "page[cursor]": 1, } def extract_items(self, resp): - return resp['data'] + return resp["data"] def extract_total(self, resp): - return resp['meta']['total'] + return resp["meta"]["total"] def extract_key(self, obj): - return obj['attributes']['doi'].encode('utf-8') + return obj["attributes"]["doi"].encode("utf-8") def update_params(self, params, resp): """ @@ -245,9 +276,9 @@ class HarvestDataciteWorker(HarvestCrossrefWorker): https://github.com/datacite/datacite/issues/897 (HTTP 400) https://github.com/datacite/datacite/issues/898 (HTTP 500) """ - parsed = urlparse(resp['links']['next']) - page_cursor = parse_qs(parsed.query).get('page[cursor]') + parsed = urlparse(resp["links"]["next"]) + page_cursor = parse_qs(parsed.query).get("page[cursor]") if not page_cursor: - raise ValueError('no page[cursor] in .links.next') - params['page[cursor]'] = page_cursor[0] + raise ValueError("no page[cursor] in .links.next") + params["page[cursor]"] = page_cursor[0] return params diff --git a/python/fatcat_tools/harvest/harvest_common.py b/python/fatcat_tools/harvest/harvest_common.py index 45c2b8ea..fda0dc62 100644 --- a/python/fatcat_tools/harvest/harvest_common.py +++ b/python/fatcat_tools/harvest/harvest_common.py @@ -1,4 +1,3 @@ - import datetime import json import sys @@ -14,8 +13,10 @@ from requests.packages.urllib3.util.retry import Retry # pylint: disable=import # Used for parsing ISO date format (YYYY-MM-DD) DATE_FMT = "%Y-%m-%d" -def requests_retry_session(retries=10, backoff_factor=3, - status_forcelist=(500, 502, 504), session=None): + +def requests_retry_session( + retries=10, backoff_factor=3, status_forcelist=(500, 502, 504), session=None +): """ From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests """ @@ -28,10 +29,11 @@ def requests_retry_session(retries=10, backoff_factor=3, status_forcelist=status_forcelist, ) adapter = HTTPAdapter(max_retries=retry) - session.mount('http://', adapter) - session.mount('https://', adapter) + session.mount("http://", adapter) + session.mount("https://", adapter) return session + class HarvestState: """ First version of this works with full days (dates) @@ -57,8 +59,9 @@ class HarvestState: self.enqueue_period(start_date, end_date, catchup_days) def __str__(self): - return '<HarvestState to_process={}, completed={}>'.format( - len(self.to_process), len(self.completed)) + return "<HarvestState to_process={}, completed={}>".format( + len(self.to_process), len(self.completed) + ) def enqueue_period(self, start_date=None, end_date=None, catchup_days=14): """ @@ -92,7 +95,9 @@ class HarvestState: """ if continuous: # enqueue yesterday - self.enqueue_period(start_date=datetime.datetime.utcnow().date() - datetime.timedelta(days=1)) + self.enqueue_period( + start_date=datetime.datetime.utcnow().date() - datetime.timedelta(days=1) + ) if not self.to_process: return None return sorted(list(self.to_process))[0] @@ -105,8 +110,8 @@ class HarvestState: state stored on disk or in Kafka. """ state = json.loads(state_json) - if 'completed-date' in state: - date = datetime.datetime.strptime(state['completed-date'], DATE_FMT).date() + if "completed-date" in state: + date = datetime.datetime.strptime(state["completed-date"], DATE_FMT).date() self.complete(date) def complete(self, date, kafka_topic=None, kafka_config=None): @@ -123,12 +128,14 @@ class HarvestState: except KeyError: pass self.completed.add(date) - state_json = json.dumps({ - 'in-progress-dates': [str(d) for d in self.to_process], - 'completed-date': str(date), - }).encode('utf-8') + state_json = json.dumps( + { + "in-progress-dates": [str(d) for d in self.to_process], + "completed-date": str(date), + } + ).encode("utf-8") if kafka_topic: - assert(kafka_config) + assert kafka_config def fail_fast(err, msg): if err: @@ -136,17 +143,16 @@ class HarvestState: print("Committing status to Kafka: {}".format(kafka_topic), file=sys.stderr) producer_conf = kafka_config.copy() - producer_conf.update({ - 'delivery.report.only.error': True, - 'default.topic.config': { - 'request.required.acks': -1, # all brokers must confirm - }, - }) + producer_conf.update( + { + "delivery.report.only.error": True, + "default.topic.config": { + "request.required.acks": -1, # all brokers must confirm + }, + } + ) producer = Producer(producer_conf) - producer.produce( - kafka_topic, - state_json, - on_delivery=fail_fast) + producer.produce(kafka_topic, state_json, on_delivery=fail_fast) producer.flush() return state_json @@ -166,22 +172,25 @@ class HarvestState: raise KafkaException(err) conf = kafka_config.copy() - conf.update({ - 'group.id': 'dummy_init_group', # should never be committed - 'enable.auto.commit': False, - 'auto.offset.reset': 'earliest', - 'session.timeout.ms': 10000, - }) + conf.update( + { + "group.id": "dummy_init_group", # should never be committed + "enable.auto.commit": False, + "auto.offset.reset": "earliest", + "session.timeout.ms": 10000, + } + ) consumer = Consumer(conf) # this watermark fetch is mostly to ensure we are connected to broker and # fail fast if not, but we also confirm that we read to end below. hwm = consumer.get_watermark_offsets( - TopicPartition(kafka_topic, 0), - timeout=5.0, - cached=False) + TopicPartition(kafka_topic, 0), timeout=5.0, cached=False + ) if not hwm: - raise Exception("Kafka consumer timeout, or topic {} doesn't exist".format(kafka_topic)) + raise Exception( + "Kafka consumer timeout, or topic {} doesn't exist".format(kafka_topic) + ) consumer.assign([TopicPartition(kafka_topic, 0, 0)]) c = 0 @@ -191,8 +200,8 @@ class HarvestState: break if msg.error(): raise KafkaException(msg.error()) - #sys.stdout.write('.') - self.update(msg.value().decode('utf-8')) + # sys.stdout.write('.') + self.update(msg.value().decode("utf-8")) c += 1 consumer.close() diff --git a/python/fatcat_tools/harvest/oaipmh.py b/python/fatcat_tools/harvest/oaipmh.py index 0eb0343d..40d1c853 100644 --- a/python/fatcat_tools/harvest/oaipmh.py +++ b/python/fatcat_tools/harvest/oaipmh.py @@ -1,4 +1,3 @@ - import sys import time @@ -25,19 +24,18 @@ class HarvestOaiPmhWorker: would want something similar operationally. Oh well! """ - def __init__(self, kafka_hosts, produce_topic, state_topic, - start_date=None, end_date=None): + def __init__(self, kafka_hosts, produce_topic, state_topic, start_date=None, end_date=None): self.produce_topic = produce_topic self.state_topic = state_topic self.kafka_config = { - 'bootstrap.servers': kafka_hosts, - 'message.max.bytes': 20000000, # ~20 MBytes; broker is ~50 MBytes + "bootstrap.servers": kafka_hosts, + "message.max.bytes": 20000000, # ~20 MBytes; broker is ~50 MBytes } - self.loop_sleep = 60*60 # how long to wait, in seconds, between date checks + self.loop_sleep = 60 * 60 # how long to wait, in seconds, between date checks - self.endpoint_url = None # needs override + self.endpoint_url = None # needs override self.metadata_prefix = None # needs override self.name = "unnamed" self.state = HarvestState(start_date, end_date) @@ -45,7 +43,6 @@ class HarvestOaiPmhWorker: print(self.state, file=sys.stderr) def fetch_date(self, date): - def fail_fast(err, msg): if err is not None: print("Kafka producer delivery error: {}".format(err), file=sys.stderr) @@ -54,12 +51,14 @@ class HarvestOaiPmhWorker: raise KafkaException(err) producer_conf = self.kafka_config.copy() - producer_conf.update({ - 'delivery.report.only.error': True, - 'default.topic.config': { - 'request.required.acks': -1, # all brokers must confirm - }, - }) + producer_conf.update( + { + "delivery.report.only.error": True, + "default.topic.config": { + "request.required.acks": -1, # all brokers must confirm + }, + } + ) producer = Producer(producer_conf) api = sickle.Sickle(self.endpoint_url, max_retries=5, retry_status_codes=[503]) @@ -67,13 +66,18 @@ class HarvestOaiPmhWorker: # this dict kwargs hack is to work around 'from' as a reserved python keyword # recommended by sickle docs try: - records = api.ListRecords(**{ - 'metadataPrefix': self.metadata_prefix, - 'from': date_str, - 'until': date_str, - }) + records = api.ListRecords( + **{ + "metadataPrefix": self.metadata_prefix, + "from": date_str, + "until": date_str, + } + ) except sickle.oaiexceptions.NoRecordsMatch: - print("WARN: no OAI-PMH records for this date: {} (UTC)".format(date_str), file=sys.stderr) + print( + "WARN: no OAI-PMH records for this date: {} (UTC)".format(date_str), + file=sys.stderr, + ) return count = 0 @@ -83,9 +87,10 @@ class HarvestOaiPmhWorker: print("... up to {}".format(count), file=sys.stderr) producer.produce( self.produce_topic, - item.raw.encode('utf-8'), - key=item.header.identifier.encode('utf-8'), - on_delivery=fail_fast) + item.raw.encode("utf-8"), + key=item.header.identifier.encode("utf-8"), + on_delivery=fail_fast, + ) producer.flush() def run(self, continuous=False): @@ -95,9 +100,9 @@ class HarvestOaiPmhWorker: if current: print("Fetching DOIs updated on {} (UTC)".format(current), file=sys.stderr) self.fetch_date(current) - self.state.complete(current, - kafka_topic=self.state_topic, - kafka_config=self.kafka_config) + self.state.complete( + current, kafka_topic=self.state_topic, kafka_config=self.kafka_config + ) continue if continuous: diff --git a/python/fatcat_tools/harvest/pubmed.py b/python/fatcat_tools/harvest/pubmed.py index ee55f4eb..0f33f334 100644 --- a/python/fatcat_tools/harvest/pubmed.py +++ b/python/fatcat_tools/harvest/pubmed.py @@ -60,14 +60,15 @@ class PubmedFTPWorker: <tr> """ + def __init__(self, kafka_hosts, produce_topic, state_topic, start_date=None, end_date=None): - self.name = 'Pubmed' - self.host = 'ftp.ncbi.nlm.nih.gov' + self.name = "Pubmed" + self.host = "ftp.ncbi.nlm.nih.gov" self.produce_topic = produce_topic self.state_topic = state_topic self.kafka_config = { - 'bootstrap.servers': kafka_hosts, - 'message.max.bytes': 20000000, # ~20 MBytes; broker is ~50 MBytes + "bootstrap.servers": kafka_hosts, + "message.max.bytes": 20000000, # ~20 MBytes; broker is ~50 MBytes } self.loop_sleep = 60 * 60 # how long to wait, in seconds, between date checks self.state = HarvestState(start_date, end_date) @@ -86,12 +87,14 @@ class PubmedFTPWorker: self._kafka_fail_fast = fail_fast producer_conf = self.kafka_config.copy() - producer_conf.update({ - 'delivery.report.only.error': True, - 'default.topic.config': { - 'request.required.acks': -1, # all brokers must confirm - }, - }) + producer_conf.update( + { + "delivery.report.only.error": True, + "default.topic.config": { + "request.required.acks": -1, # all brokers must confirm + }, + } + ) return Producer(producer_conf) def fetch_date(self, date): @@ -105,24 +108,35 @@ class PubmedFTPWorker: if self.date_file_map is None: raise ValueError("cannot fetch date without date file mapping") - date_str = date.strftime('%Y-%m-%d') + date_str = date.strftime("%Y-%m-%d") paths = self.date_file_map.get(date_str) if paths is None: - print("WARN: no pubmed update for this date: {} (UTC), available dates were: {}".format(date_str, self.date_file_map), file=sys.stderr) + print( + "WARN: no pubmed update for this date: {} (UTC), available dates were: {}".format( + date_str, self.date_file_map + ), + file=sys.stderr, + ) return False count = 0 for path in paths: # Fetch and decompress file. url = "ftp://{}{}".format(self.host, path) - filename = ftpretr(url, proxy_hostport="159.69.240.245:15201") # TODO: proxy obsolete, when networking issue is resolved - with tempfile.NamedTemporaryFile(prefix='fatcat-ftp-tmp-', delete=False) as decomp: + filename = ftpretr( + url, proxy_hostport="159.69.240.245:15201" + ) # TODO: proxy obsolete, when networking issue is resolved + with tempfile.NamedTemporaryFile(prefix="fatcat-ftp-tmp-", delete=False) as decomp: try: gzf = gzip.open(filename) shutil.copyfileobj(gzf, decomp) except zlib.error as exc: - print('[skip] retrieving {} failed with {} (maybe empty, missing or broken gzip)'.format( - url, exc), file=sys.stderr) + print( + "[skip] retrieving {} failed with {} (maybe empty, missing or broken gzip)".format( + url, exc + ), + file=sys.stderr, + ) continue # Here, blob is the unparsed XML; we peek into it to use PMID as @@ -131,15 +145,17 @@ class PubmedFTPWorker: # WARNING: Parsing foreign XML exposes us at some # https://docs.python.org/3/library/xml.html#xml-vulnerabilities # here. - for blob in xmlstream(decomp.name, 'PubmedArticle', encoding='utf-8'): - soup = BeautifulSoup(blob, 'xml') - pmid = soup.find('PMID') + for blob in xmlstream(decomp.name, "PubmedArticle", encoding="utf-8"): + soup = BeautifulSoup(blob, "xml") + pmid = soup.find("PMID") if pmid is None: raise ValueError("no PMID found, please adjust identifier extraction") count += 1 if count % 50 == 0: print("... up to {}".format(count), file=sys.stderr) - self.producer.produce(self.produce_topic, blob, key=pmid.text, on_delivery=self._kafka_fail_fast) + self.producer.produce( + self.produce_topic, blob, key=pmid.text, on_delivery=self._kafka_fail_fast + ) self.producer.flush() os.remove(filename) @@ -151,13 +167,17 @@ class PubmedFTPWorker: while True: self.date_file_map = generate_date_file_map(host=self.host) if len(self.date_file_map) == 0: - raise ValueError("map from dates to files should not be empty, maybe the HTML changed?") + raise ValueError( + "map from dates to files should not be empty, maybe the HTML changed?" + ) current = self.state.next_span(continuous) if current: print("Fetching citations updated on {} (UTC)".format(current), file=sys.stderr) self.fetch_date(current) - self.state.complete(current, kafka_topic=self.state_topic, kafka_config=self.kafka_config) + self.state.complete( + current, kafka_topic=self.state_topic, kafka_config=self.kafka_config + ) continue if continuous: @@ -168,7 +188,7 @@ class PubmedFTPWorker: print("{} FTP ingest caught up".format(self.name)) -def generate_date_file_map(host='ftp.ncbi.nlm.nih.gov'): +def generate_date_file_map(host="ftp.ncbi.nlm.nih.gov"): """ Generate a DefaultDict[string, set] mapping dates to absolute filepaths on the server (mostly we have one file, but sometimes more). @@ -176,14 +196,14 @@ def generate_date_file_map(host='ftp.ncbi.nlm.nih.gov'): Example: {"2020-01-02": set(["/pubmed/updatefiles/pubmed20n1016.xml.gz"]), ...} """ mapping = collections.defaultdict(set) - pattern = re.compile(r'Filename: ([^ ]*.xml) -- Created: ([^<]*)') + pattern = re.compile(r"Filename: ([^ ]*.xml) -- Created: ([^<]*)") ftp = ftplib.FTP(host) ftp.login() - filenames = ftp.nlst('/pubmed/updatefiles') + filenames = ftp.nlst("/pubmed/updatefiles") retries, retry_delay = 10, 60 for name in filenames: - if not name.endswith('.html'): + if not name.endswith(".html"): continue sio = io.StringIO() for i in range(retries): @@ -201,10 +221,14 @@ def generate_date_file_map(host='ftp.ncbi.nlm.nih.gov'): ftp = ftplib.FTP(host) ftp.login() sio.truncate(0) - ftp.retrlines('RETR {}'.format(name), sio.write) + ftp.retrlines("RETR {}".format(name), sio.write) except (EOFError, ftplib.error_temp, socket.gaierror, BrokenPipeError) as exc: - print("ftp retr on {} failed with {} ({}) ({} retries left)".format( - name, exc, type(exc), retries - (i + 1)), file=sys.stderr) + print( + "ftp retr on {} failed with {} ({}) ({} retries left)".format( + name, exc, type(exc), retries - (i + 1) + ), + file=sys.stderr, + ) if i + 1 == retries: raise else: @@ -214,16 +238,24 @@ def generate_date_file_map(host='ftp.ncbi.nlm.nih.gov'): contents = sio.getvalue() match = pattern.search(contents) if match is None: - print('pattern miss in {} on: {}, may need to adjust pattern: {}'.format(name, contents, pattern), file=sys.stderr) + print( + "pattern miss in {} on: {}, may need to adjust pattern: {}".format( + name, contents, pattern + ), + file=sys.stderr, + ) continue - filename, filedate = match.groups() # ('pubmed20n1017.xml', 'Tue Dec 17 15:23:32 EST 2019') + ( + filename, + filedate, + ) = match.groups() # ('pubmed20n1017.xml', 'Tue Dec 17 15:23:32 EST 2019') date = dateparser.parse(filedate) - fullpath = '/pubmed/updatefiles/{}.gz'.format(filename) - date_str = date.strftime('%Y-%m-%d') + fullpath = "/pubmed/updatefiles/{}.gz".format(filename) + date_str = date.strftime("%Y-%m-%d") mapping[date_str].add(fullpath) - print('added entry for {}: {}'.format(date_str, fullpath), file=sys.stderr) + print("added entry for {}: {}".format(date_str, fullpath), file=sys.stderr) - print('generated date-file mapping for {} dates'.format(len(mapping)), file=sys.stderr) + print("generated date-file mapping for {} dates".format(len(mapping)), file=sys.stderr) return mapping @@ -241,20 +273,29 @@ def ftpretr(url, max_retries=10, retry_delay=1, proxy_hostport=None): when we encountered EOFError while talking to the FTP server. Retry delay in seconds. """ if proxy_hostport is not None: - return ftpretr_via_http_proxy(url, proxy_hostport, max_retries=max_retries, retry_delay=retry_delay) + return ftpretr_via_http_proxy( + url, proxy_hostport, max_retries=max_retries, retry_delay=retry_delay + ) parsed = urlparse(url) server, path = parsed.netloc, parsed.path for i in range(max_retries): try: ftp = ftplib.FTP(server) ftp.login() - with tempfile.NamedTemporaryFile(prefix='fatcat-ftp-tmp-', delete=False) as f: - print('retrieving {} from {} to {} ...'.format(path, server, f.name), file=sys.stderr) - ftp.retrbinary('RETR %s' % path, f.write) + with tempfile.NamedTemporaryFile(prefix="fatcat-ftp-tmp-", delete=False) as f: + print( + "retrieving {} from {} to {} ...".format(path, server, f.name), + file=sys.stderr, + ) + ftp.retrbinary("RETR %s" % path, f.write) ftp.close() except EOFError as exc: - print("ftp retrbinary on {} failed with {} ({}) ({} retries left)".format( - path, exc, type(exc), max_retries - (i + 1)), file=sys.stderr) + print( + "ftp retrbinary on {} failed with {} ({}) ({} retries left)".format( + path, exc, type(exc), max_retries - (i + 1) + ), + file=sys.stderr, + ) if i + 1 == max_retries: raise else: @@ -263,7 +304,9 @@ def ftpretr(url, max_retries=10, retry_delay=1, proxy_hostport=None): return f.name -def ftpretr_via_http_proxy(url, proxy_hostport="ftp.ncbi.nlm.nih.gov", max_retries=10, retry_delay=1): +def ftpretr_via_http_proxy( + url, proxy_hostport="ftp.ncbi.nlm.nih.gov", max_retries=10, retry_delay=1 +): """ Fetch file from FTP via external HTTP proxy, e.g. ftp.host.com:/a/b/c would be retrievable via proxy.com/a/b/c; (in 09/2021 we used @@ -276,19 +319,23 @@ def ftpretr_via_http_proxy(url, proxy_hostport="ftp.ncbi.nlm.nih.gov", max_retri try: url = "http://{}{}".format(proxy_hostport, path) print("retrieving file via proxy (ftpup) from {}".format(url), file=sys.stderr) - with tempfile.NamedTemporaryFile(prefix='fatcat-ftp-tmp-', delete=False) as f: + with tempfile.NamedTemporaryFile(prefix="fatcat-ftp-tmp-", delete=False) as f: cmd = ["wget", "-c", url, "-O", f.name] result = subprocess.run(cmd) return f.name except (subprocess.CalledProcessError, OSError, ValueError) as exc: - print("ftp fetch {} failed with {} ({}) ({} retries left)".format( - url, exc, type(exc), max_retries - (i + 1)), file=sys.stderr) + print( + "ftp fetch {} failed with {} ({}) ({} retries left)".format( + url, exc, type(exc), max_retries - (i + 1) + ), + file=sys.stderr, + ) if i + 1 == max_retries: raise time.sleep(retry_delay) -def xmlstream(filename, tag, encoding='utf-8'): +def xmlstream(filename, tag, encoding="utf-8"): """ Note: This might move into a generic place in the future. @@ -300,23 +347,29 @@ def xmlstream(filename, tag, encoding='utf-8'): Known vulnerabilities: https://docs.python.org/3/library/xml.html#xml-vulnerabilities """ + def strip_ns(tag): - if '}' not in tag: + if "}" not in tag: return tag - return tag.split('}')[1] + return tag.split("}")[1] # https://stackoverflow.com/a/13261805, http://effbot.org/elementtree/iterparse.htm - context = iter(ET.iterparse(filename, events=( - 'start', - 'end', - ))) + context = iter( + ET.iterparse( + filename, + events=( + "start", + "end", + ), + ) + ) try: _, root = next(context) except StopIteration: return for event, elem in context: - if not strip_ns(elem.tag) == tag or event == 'start': + if not strip_ns(elem.tag) == tag or event == "start": continue yield ET.tostring(elem, encoding=encoding) diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py index 2b0ff7ec..ae4f9049 100644 --- a/python/fatcat_tools/importers/arabesque.py +++ b/python/fatcat_tools/importers/arabesque.py @@ -1,9 +1,9 @@ - import fatcat_openapi_client from .common import SANE_MAX_RELEASES, SANE_MAX_URLS, EntityImporter, b32_hex, make_rel_url -ARABESQUE_MATCH_WHERE_CLAUSE='WHERE hit = 1 AND identifier IS NOT NULL' +ARABESQUE_MATCH_WHERE_CLAUSE = "WHERE hit = 1 AND identifier IS NOT NULL" + class ArabesqueMatchImporter(EntityImporter): """ @@ -38,17 +38,17 @@ class ArabesqueMatchImporter(EntityImporter): def __init__(self, api, extid_type, require_grobid=True, **kwargs): - eg_desc = kwargs.get('editgroup_description', None) or "Match web crawl files to releases based on identifier/URL seedlist" - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ArabesqueMatchImporter') - if kwargs.get('crawl_id'): - eg_extra['crawl_id'] = kwargs.get('crawl_id') - kwargs['do_updates'] = kwargs.get("do_updates", False) - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) - assert extid_type in ('doi', 'pmcid', 'pmid') + eg_desc = ( + kwargs.get("editgroup_description", None) + or "Match web crawl files to releases based on identifier/URL seedlist" + ) + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ArabesqueMatchImporter") + if kwargs.get("crawl_id"): + eg_extra["crawl_id"] = kwargs.get("crawl_id") + kwargs["do_updates"] = kwargs.get("do_updates", False) + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) + assert extid_type in ("doi", "pmcid", "pmid") self.extid_type = extid_type self.default_link_rel = kwargs.get("default_link_rel", "web") assert self.default_link_rel @@ -60,33 +60,35 @@ class ArabesqueMatchImporter(EntityImporter): print("NOT checking GROBID status column") def want(self, row): - if self.require_grobid and not row['postproc_status'] == "200": + if self.require_grobid and not row["postproc_status"] == "200": return False - if (bool(row['hit']) is True - and row['final_sha1'] - and row['final_timestamp'] - and row['final_timestamp'] != "-" - and len(row['final_timestamp']) == 14 - and row['final_mimetype'] - and bool(row['hit']) is True - and row['identifier']): + if ( + bool(row["hit"]) is True + and row["final_sha1"] + and row["final_timestamp"] + and row["final_timestamp"] != "-" + and len(row["final_timestamp"]) == 14 + and row["final_mimetype"] + and bool(row["hit"]) is True + and row["identifier"] + ): return True else: return False def parse_record(self, row): - extid = row['identifier'].strip() + extid = row["identifier"].strip() # check/cleanup DOI - if self.extid_type == 'doi': + if self.extid_type == "doi": extid = extid.lower() - extid.replace('http://doi.org/', '') - extid.replace('https://doi.org/', '') - if extid.startswith('doi:'): + extid.replace("http://doi.org/", "") + extid.replace("https://doi.org/", "") + if extid.startswith("doi:"): extid = extid[4:] - if not extid.startswith('10.'): - self.counts['skip-extid-invalid'] + if not extid.startswith("10."): + self.counts["skip-extid-invalid"] return None # lookup extid @@ -95,35 +97,35 @@ class ArabesqueMatchImporter(EntityImporter): except fatcat_openapi_client.rest.ApiException as err: if err.status == 404: # bail on 404 (release not in DB) - self.counts['skip-extid-not-found'] += 1 + self.counts["skip-extid-not-found"] += 1 return None elif err.status == 400: - self.counts['skip-extid-invalid'] += 1 + self.counts["skip-extid-invalid"] += 1 return None else: raise err - url = make_rel_url(row['final_url'], self.default_link_rel) + url = make_rel_url(row["final_url"], self.default_link_rel) if not url: - self.counts['skip-url'] += 1 + self.counts["skip-url"] += 1 return None - if not row['final_timestamp']: - self.counts['skip-missing-timestamp'] += 1 + if not row["final_timestamp"]: + self.counts["skip-missing-timestamp"] += 1 return None wayback = "https://web.archive.org/web/{}/{}".format( - row['final_timestamp'], - row['final_url']) + row["final_timestamp"], row["final_url"] + ) urls = [url, ("webarchive", wayback)] urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls] if len(urls) > SANE_MAX_URLS: - self.counts['skip-too-many-url'] += 1 + self.counts["skip-too-many-url"] += 1 return None fe = fatcat_openapi_client.FileEntity( - sha1=b32_hex(row['final_sha1']), - mimetype=row['final_mimetype'] or self.default_mimetype, + sha1=b32_hex(row["final_sha1"]), + mimetype=row["final_mimetype"] or self.default_mimetype, release_ids=[re.ident], urls=urls, ) @@ -143,15 +145,15 @@ class ArabesqueMatchImporter(EntityImporter): if (fe.release_ids[0] in existing.release_ids) and existing.urls: # TODO: could still, in theory update with the new URL? - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False if not self.do_updates: - self.counts['skip-update-disabled'] += 1 + self.counts["skip-update-disabled"] += 1 return False if existing.ident in [e.ident for e in self._edits_inflight]: - self.counts['skip-update-inflight'] += 1 + self.counts["skip-update-inflight"] += 1 return False # TODO: this code path never gets hit because of the check above @@ -159,28 +161,33 @@ class ArabesqueMatchImporter(EntityImporter): existing_urls = set([u.url for u in existing.urls]) new_urls = set([u.url for u in fe.urls]) if existing_urls.issuperset(new_urls): - self.counts['skip-update-nothing-new'] += 1 + self.counts["skip-update-nothing-new"] += 1 return False # merge the existing into this one and update existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls])) - existing.urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in existing.urls] + existing.urls = [ + fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in existing.urls + ] if len(existing.urls) > SANE_MAX_URLS: - self.counts['skip-update-too-many-url'] += 1 + self.counts["skip-update-too-many-url"] += 1 return None existing.release_ids = list(set(fe.release_ids + existing.release_ids)) if len(existing.release_ids) > SANE_MAX_RELEASES: - self.counts['skip-update-too-many-url'] += 1 + self.counts["skip-update-too-many-url"] += 1 return None existing.mimetype = existing.mimetype or fe.mimetype edit = self.api.update_file(self.get_editgroup_id(), existing.ident, existing) self._edits_inflight.append(edit) - self.counts['update'] += 1 + self.counts["update"] += 1 return False def insert_batch(self, batch): - self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_file_auto_batch( + fatcat_openapi_client.FileAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py index fc429fb0..7a689ed2 100644 --- a/python/fatcat_tools/importers/arxiv.py +++ b/python/fatcat_tools/importers/arxiv.py @@ -1,4 +1,3 @@ - import datetime import json import re @@ -13,6 +12,7 @@ from .crossref import lookup_license_slug latex2text = LatexNodes2Text() + def latex_to_text(raw): try: return latex2text.latex_to_text(raw).strip() @@ -21,13 +21,14 @@ def latex_to_text(raw): except IndexError: return raw.strip() + def parse_arxiv_authors(raw): if not raw: return [] - raw = raw.replace('*', '') - if '(' in raw: - raw = re.sub(r'\(.*\)', '', raw) - authors = raw.split(', ') + raw = raw.replace("*", "") + if "(" in raw: + raw = re.sub(r"\(.*\)", "", raw) + authors = raw.split(", ") if authors: last = authors[-1].split(" and ") if len(last) == 2: @@ -39,9 +40,12 @@ def parse_arxiv_authors(raw): authors = [a for a in authors if a] return authors + def test_parse_arxiv_authors(): - assert parse_arxiv_authors("Raphael Chetrite, Shamik Gupta, Izaak Neri and \\'Edgar Rold\\'an") == [ + assert parse_arxiv_authors( + "Raphael Chetrite, Shamik Gupta, Izaak Neri and \\'Edgar Rold\\'an" + ) == [ "Raphael Chetrite", "Shamik Gupta", "Izaak Neri", @@ -63,7 +67,9 @@ def test_parse_arxiv_authors(): "Raphael Chetrite Shamik Gupta", ] - assert parse_arxiv_authors("B. P. Lanyon, T. J. Weinhold, N. K. Langford, M. Barbieri, D. F. V. James*, A. Gilchrist, and A. G. White (University of Queensland, *University of Toronto)") == [ + assert parse_arxiv_authors( + "B. P. Lanyon, T. J. Weinhold, N. K. Langford, M. Barbieri, D. F. V. James*, A. Gilchrist, and A. G. White (University of Queensland, *University of Toronto)" + ) == [ "B. P. Lanyon", "T. J. Weinhold", "N. K. Langford", @@ -84,17 +90,21 @@ class ArxivRawImporter(EntityImporter): def __init__(self, api, **kwargs): - eg_desc = kwargs.get('editgroup_description', - "Automated import of arxiv metadata via arXivRaw OAI-PMH feed") - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ArxivRawImporter') + eg_desc = kwargs.get( + "editgroup_description", + "Automated import of arxiv metadata via arXivRaw OAI-PMH feed", + ) + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ArxivRawImporter") # lower batch size, because multiple versions per entry (guessing 2-3 on average?) - batch_size = kwargs.get('edit_batch_size', 50) - super().__init__(api, + batch_size = kwargs.get("edit_batch_size", 50) + super().__init__( + api, editgroup_description=eg_desc, editgroup_extra=eg_extra, batch_size=batch_size, - **kwargs) + **kwargs + ) self._test_override = False def parse_record(self, record): @@ -114,53 +124,56 @@ class ArxivRawImporter(EntityImporter): doi = None if metadata.doi and metadata.doi.string: doi = metadata.doi.string.lower().split()[0].strip() - if not (doi.startswith('10.') and '/' in doi and doi.split('/')[1]): + if not (doi.startswith("10.") and "/" in doi and doi.split("/")[1]): sys.stderr.write("BOGUS DOI: {}\n".format(doi)) doi = None - title = latex_to_text(metadata.title.get_text().replace('\n', ' ')) - authors = parse_arxiv_authors(metadata.authors.get_text().replace('\n', ' ')) - contribs = [fatcat_openapi_client.ReleaseContrib(index=i, raw_name=a, role='author') for i, a in enumerate(authors)] - - lang = "en" # the vast majority in english + title = latex_to_text(metadata.title.get_text().replace("\n", " ")) + authors = parse_arxiv_authors(metadata.authors.get_text().replace("\n", " ")) + contribs = [ + fatcat_openapi_client.ReleaseContrib(index=i, raw_name=a, role="author") + for i, a in enumerate(authors) + ] + + lang = "en" # the vast majority in english if metadata.comments and metadata.comments.get_text(): - comments = metadata.comments.get_text().replace('\n', ' ').strip() - extra_arxiv['comments'] = comments - if 'in french' in comments.lower(): - lang = 'fr' - elif 'in spanish' in comments.lower(): - lang = 'es' - elif 'in portuguese' in comments.lower(): - lang = 'pt' - elif 'in hindi' in comments.lower(): - lang = 'hi' - elif 'in japanese' in comments.lower(): - lang = 'ja' - elif 'in german' in comments.lower(): - lang = 'de' - elif 'simplified chinese' in comments.lower(): - lang = 'zh' - elif 'in russian' in comments.lower(): - lang = 'ru' + comments = metadata.comments.get_text().replace("\n", " ").strip() + extra_arxiv["comments"] = comments + if "in french" in comments.lower(): + lang = "fr" + elif "in spanish" in comments.lower(): + lang = "es" + elif "in portuguese" in comments.lower(): + lang = "pt" + elif "in hindi" in comments.lower(): + lang = "hi" + elif "in japanese" in comments.lower(): + lang = "ja" + elif "in german" in comments.lower(): + lang = "de" + elif "simplified chinese" in comments.lower(): + lang = "zh" + elif "in russian" in comments.lower(): + lang = "ru" # more languages? number = None - if metadata.find('journal-ref') and metadata.find('journal-ref').get_text(): - journal_ref = metadata.find('journal-ref').get_text().replace('\n', ' ').strip() - extra_arxiv['journal_ref'] = journal_ref + if metadata.find("journal-ref") and metadata.find("journal-ref").get_text(): + journal_ref = metadata.find("journal-ref").get_text().replace("\n", " ").strip() + extra_arxiv["journal_ref"] = journal_ref if "conf." in journal_ref.lower() or "proc." in journal_ref.lower(): release_type = "paper-conference" - if metadata.find('report-no') and metadata.find('report-no').string: - number = metadata.find('report-no').string.strip() + if metadata.find("report-no") and metadata.find("report-no").string: + number = metadata.find("report-no").string.strip() # at least some people plop extra metadata in here. hrmf! - if 'ISSN ' in number or 'ISBN ' in number or len(number.split()) > 2: - extra_arxiv['report-no'] = number + if "ISSN " in number or "ISBN " in number or len(number.split()) > 2: + extra_arxiv["report-no"] = number number = None else: release_type = "report" - if metadata.find('acm-class') and metadata.find('acm-class').string: - extra_arxiv['acm_class'] = metadata.find('acm-class').string.strip() + if metadata.find("acm-class") and metadata.find("acm-class").string: + extra_arxiv["acm_class"] = metadata.find("acm-class").string.strip() if metadata.categories and metadata.categories.get_text(): - extra_arxiv['categories'] = metadata.categories.get_text().split() + extra_arxiv["categories"] = metadata.categories.get_text().split() license_slug = None if metadata.license and metadata.license.get_text(): license_slug = lookup_license_slug(metadata.license.get_text()) @@ -170,21 +183,29 @@ class ArxivRawImporter(EntityImporter): abstracts = [] abst = metadata.abstract.get_text().strip() orig = None - if '-----' in abst: - both = abst.split('-----') + if "-----" in abst: + both = abst.split("-----") abst = both[0].strip() orig = both[1].strip() - if '$' in abst or '{' in abst: + if "$" in abst or "{" in abst: mime = "application/x-latex" abst_plain = latex_to_text(abst) - abstracts.append(fatcat_openapi_client.ReleaseAbstract(content=abst_plain, mimetype="text/plain", lang="en")) + abstracts.append( + fatcat_openapi_client.ReleaseAbstract( + content=abst_plain, mimetype="text/plain", lang="en" + ) + ) else: mime = "text/plain" - abstracts.append(fatcat_openapi_client.ReleaseAbstract(content=abst, mimetype=mime, lang="en")) + abstracts.append( + fatcat_openapi_client.ReleaseAbstract(content=abst, mimetype=mime, lang="en") + ) if orig: - abstracts.append(fatcat_openapi_client.ReleaseAbstract(content=orig, mimetype=mime)) + abstracts.append( + fatcat_openapi_client.ReleaseAbstract(content=orig, mimetype=mime) + ) # indicates that fulltext probably isn't english either - if lang == 'en': + if lang == "en": lang = None # extra: @@ -195,39 +216,43 @@ class ArxivRawImporter(EntityImporter): # container_name # group-title # arxiv: comments, categories, etc - extra_arxiv['base_id'] = base_id - extra['superceded'] = True - extra['arxiv'] = extra_arxiv + extra_arxiv["base_id"] = base_id + extra["superceded"] = True + extra["arxiv"] = extra_arxiv versions = [] - for version in metadata.find_all('version'): - arxiv_id = base_id + version['version'] + for version in metadata.find_all("version"): + arxiv_id = base_id + version["version"] release_date = version.date.string.strip() - release_date = datetime.datetime.strptime(release_date, "%a, %d %b %Y %H:%M:%S %Z").date() + release_date = datetime.datetime.strptime( + release_date, "%a, %d %b %Y %H:%M:%S %Z" + ).date() # TODO: source_type? - versions.append(fatcat_openapi_client.ReleaseEntity( - work_id=None, - title=title, - #original_title - version=version['version'], - release_type=release_type, - release_stage='submitted', - release_date=release_date.isoformat(), - release_year=release_date.year, - ext_ids=fatcat_openapi_client.ReleaseExtIds( - arxiv=arxiv_id, - ), - number=number, - language=lang, - license_slug=license_slug, - abstracts=abstracts, - contribs=contribs, - extra=extra.copy(), - )) + versions.append( + fatcat_openapi_client.ReleaseEntity( + work_id=None, + title=title, + # original_title + version=version["version"], + release_type=release_type, + release_stage="submitted", + release_date=release_date.isoformat(), + release_year=release_date.year, + ext_ids=fatcat_openapi_client.ReleaseExtIds( + arxiv=arxiv_id, + ), + number=number, + language=lang, + license_slug=license_slug, + abstracts=abstracts, + contribs=contribs, + extra=extra.copy(), + ) + ) # TODO: assert that versions are actually in order? assert versions - versions[-1].extra.pop('superceded') + versions[-1].extra.pop("superceded") # only apply DOI to most recent version (HACK) if doi: @@ -306,7 +331,7 @@ class ArxivRawImporter(EntityImporter): for v in versions: if v._existing_work_id: if not v._updated: - self.counts['exists'] += 1 + self.counts["exists"] += 1 continue if not any_work_id and last_edit: # fetch the last inserted release from this group @@ -315,7 +340,7 @@ class ArxivRawImporter(EntityImporter): any_work_id = r.work_id v.work_id = any_work_id last_edit = self.api.create_release(self.get_editgroup_id(), v) - self.counts['insert'] += 1 + self.counts["insert"] += 1 return False @@ -323,12 +348,15 @@ class ArxivRawImporter(EntityImporter): # there is no batch/bezerk mode for arxiv importer, except for testing if self._test_override: for batch in batch_batch: - self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) - self.counts['insert'] += len(batch) - 1 + self.api.create_release_auto_batch( + fatcat_openapi_client.ReleaseAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) + self.counts["insert"] += len(batch) - 1 else: raise NotImplementedError() @@ -341,9 +369,9 @@ class ArxivRawImporter(EntityImporter): for article in soup.find_all("record"): resp = self.parse_record(article) print(json.dumps(resp)) - #sys.exit(-1) + # sys.exit(-1) -if __name__ == '__main__': +if __name__ == "__main__": parser = ArxivRawImporter(None) parser.parse_file(open(sys.argv[1])) diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py index 0340f6a3..e9de42fc 100755 --- a/python/fatcat_tools/importers/cdl_dash_dat.py +++ b/python/fatcat_tools/importers/cdl_dash_dat.py @@ -34,15 +34,15 @@ def single_file(prefix, path): hashlib.sha1(), hashlib.sha256(), ] - with open(full, 'rb') as fp: + with open(full, "rb") as fp: while True: - data = fp.read(2**20) + data = fp.read(2 ** 20) if not data: break for h in hashes: h.update(data) mime = magic.Magic(mime=True).from_file(full) - if mime == 'application/octet-stream': + if mime == "application/octet-stream": # magic apparently isn't that great; try using filename as well guess = mimetypes.guess_type(full)[0] if guess: @@ -54,9 +54,11 @@ def single_file(prefix, path): md5=hashes[0].hexdigest(), sha1=hashes[1].hexdigest(), sha256=hashes[2].hexdigest(), - extra=dict(mimetype=mime)) + extra=dict(mimetype=mime), + ) return fsf + def make_manifest(base_dir): manifest = [] for root, dirs, files in os.walk(base_dir): @@ -70,47 +72,49 @@ def cdl_dash_release(meta, extra=None): if not extra: extra = dict() - assert meta['identifier']['type'] == 'DOI' - doi = meta['identifier']['value'].lower() - assert doi.startswith('10.') + assert meta["identifier"]["type"] == "DOI" + doi = meta["identifier"]["value"].lower() + assert doi.startswith("10.") ark_id = None - for extid in meta.get('alternativeIdentifiers', []): - if extid['value'].startswith('ark:'): - ark_id = extid['value'] + for extid in meta.get("alternativeIdentifiers", []): + if extid["value"].startswith("ark:"): + ark_id = extid["value"] assert ark_id - license_slug = lookup_license_slug(meta['rights']['uri']) + license_slug = lookup_license_slug(meta["rights"]["uri"]) abstracts = [] - for desc in meta['descriptions']: - if desc['type'] == "abstract": - abstracts.append(ReleaseAbstract( - mimetype="text/html", - content=clean(desc['value']))) - #print(abstracts) + for desc in meta["descriptions"]: + if desc["type"] == "abstract": + abstracts.append( + ReleaseAbstract(mimetype="text/html", content=clean(desc["value"])) + ) + # print(abstracts) if not abstracts: abstracts = None contribs = [] - for creator in meta['creator']: - contribs.append(ReleaseContrib( - given_name=creator['given'], - surname=creator['family'], - # sorry everybody - raw_name="{} {}".format(creator['given'], creator['family']), - raw_affiliation=creator.get('affiliation'), - role="author", # presumably, for these datasets? - )) + for creator in meta["creator"]: + contribs.append( + ReleaseContrib( + given_name=creator["given"], + surname=creator["family"], + # sorry everybody + raw_name="{} {}".format(creator["given"], creator["family"]), + raw_affiliation=creator.get("affiliation"), + role="author", # presumably, for these datasets? + ) + ) r = ReleaseEntity( ext_ids=ReleaseExtIds( doi=doi, ark=ark_id, ), - title=clean(meta['title'], force_xml=True), - publisher=clean(meta['publisher']), - release_year=int(meta['publicationYear']), + title=clean(meta["title"], force_xml=True), + publisher=clean(meta["publisher"]), + release_year=int(meta["publicationYear"]), release_type="dataset", license_slug=license_slug, contribs=contribs, @@ -119,66 +123,66 @@ def cdl_dash_release(meta, extra=None): ) return r + def make_release_fileset(dat_path): - if dat_path.endswith('/'): + if dat_path.endswith("/"): dat_path = dat_path[:-1] dat_discovery = dat_path extra = dict() assert len(dat_discovery) == 64 - with open(dat_path + "/cdl_dash_metadata.json", 'r') as fp: + with open(dat_path + "/cdl_dash_metadata.json", "r") as fp: meta_dict = json.loads(fp.read()) release = cdl_dash_release(meta_dict) - ark_id = release.extra['ark_id'] + ark_id = release.extra["ark_id"] dash_version = None # really crude XML parse-out - with open(dat_path + "/stash-wrapper.xml", 'r') as fp: + with open(dat_path + "/stash-wrapper.xml", "r") as fp: for line in fp: line = line.strip() if line.startswith("<st:version_number>"): - dash_version = int(line[19:].split('<')[0]) + dash_version = int(line[19:].split("<")[0]) assert dash_version is not None - extra['cdl_dash'] = dict(version=dash_version) - release.extra['cdl_dash'] = dict(version=dash_version) + extra["cdl_dash"] = dict(version=dash_version) + release.extra["cdl_dash"] = dict(version=dash_version) manifest = make_manifest(dat_path + "/files/") bundle_url = dict( url="https://merritt.cdlib.org/u/{}/{}".format( - urllib.parse.quote(ark_id, safe=''), - dash_version), - rel="repo-bundle") + urllib.parse.quote(ark_id, safe=""), dash_version + ), + rel="repo-bundle", + ) repo_url = dict( url="https://merritt.cdlib.org/d/{}/{}/".format( - urllib.parse.quote(ark_id, safe=''), - dash_version), - rel="repo") - dat_url = dict( - url="dat://{}/files/".format(dat_discovery), - rel="dweb") + urllib.parse.quote(ark_id, safe=""), dash_version + ), + rel="repo", + ) + dat_url = dict(url="dat://{}/files/".format(dat_discovery), rel="dweb") fs = FilesetEntity( - urls=[bundle_url, repo_url, dat_url], - release_ids=None, - manifest=manifest, - extra=extra) + urls=[bundle_url, repo_url, dat_url], release_ids=None, manifest=manifest, extra=extra + ) return (release, fs) + def auto_cdl_dash_dat(api, dat_path, release_id=None, editgroup_id=None): - git_rev = subprocess.check_output( - ["git", "describe", "--always"]).strip().decode('utf-8') + git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8") (release, fileset) = make_release_fileset(dat_path) if not editgroup_id: - eg = api.create_editgroup(Editgroup( - description="One-off import of dataset(s) from CDL/DASH repository (via IA, Dat dweb pilot project)", - extra=dict( - git_rev=git_rev, - agent="fatcat_tools.auto_cdl_dash_dat"))) + eg = api.create_editgroup( + Editgroup( + description="One-off import of dataset(s) from CDL/DASH repository (via IA, Dat dweb pilot project)", + extra=dict(git_rev=git_rev, agent="fatcat_tools.auto_cdl_dash_dat"), + ) + ) editgroup_id = eg.editgroup_id if not release_id and release.ext_ids.doi: @@ -201,6 +205,7 @@ def auto_cdl_dash_dat(api, dat_path, release_id=None, editgroup_id=None): fileset = api.get_fileset(edit.ident) return (editgroup_id, release, fileset) -if __name__=='__main__': + +if __name__ == "__main__": # pass this a discovery key that has been cloned to the local directory print(make_release_fileset(sys.argv[1])) diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py index 0b634e73..8d2a89b6 100644 --- a/python/fatcat_tools/importers/chocula.py +++ b/python/fatcat_tools/importers/chocula.py @@ -1,4 +1,3 @@ - import fatcat_openapi_client from .common import EntityImporter, clean @@ -15,20 +14,19 @@ class ChoculaImporter(EntityImporter): def __init__(self, api, **kwargs): - eg_desc = kwargs.get('editgroup_description', - "Automated import of container-level metadata from Chocula tool.") - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ChoculaImporter') - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = kwargs.get( + "editgroup_description", + "Automated import of container-level metadata from Chocula tool.", + ) + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ChoculaImporter") + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) def want(self, raw_record): - if not raw_record.get('ident') and not raw_record.get('_known_issnl'): - self.counts['skip-unknown-new-issnl'] += 1 + if not raw_record.get("ident") and not raw_record.get("_known_issnl"): + self.counts["skip-unknown-new-issnl"] += 1 return False - if raw_record.get('issnl') and raw_record.get('name'): + if raw_record.get("issnl") and raw_record.get("name"): return True return False @@ -39,42 +37,55 @@ class ChoculaImporter(EntityImporter): returns a ContainerEntity (or None if invalid or couldn't parse) """ - name = clean(row.get('name')) + name = clean(row.get("name")) if not name: # Name is required (by schema) return None name = name.strip() - if name.endswith(', Proceedings of the'): - name = "Proceedings of the " + name.split(',')[0] + if name.endswith(", Proceedings of the"): + name = "Proceedings of the " + name.split(",")[0] - if name.endswith('.'): + if name.endswith("."): name = name[:-1] extra = dict() - for k in ('urls', 'webarchive_urls', 'country', - 'sherpa_romeo', 'ezb', 'szczepanski', 'doaj', 'languages', - 'ia', 'scielo', 'kbart', 'publisher_type', 'platform'): - if row['extra'].get(k): - extra[k] = row['extra'][k] + for k in ( + "urls", + "webarchive_urls", + "country", + "sherpa_romeo", + "ezb", + "szczepanski", + "doaj", + "languages", + "ia", + "scielo", + "kbart", + "publisher_type", + "platform", + ): + if row["extra"].get(k): + extra[k] = row["extra"][k] container_type = None - if 'proceedings' in name.lower(): - container_type = 'proceedings' - elif 'journal ' in name.lower(): - container_type = 'journal' + if "proceedings" in name.lower(): + container_type = "proceedings" + elif "journal " in name.lower(): + container_type = "journal" ce = fatcat_openapi_client.ContainerEntity( - issnl=row['issnl'], - issnp=row['extra'].get('issnp'), - issne=row['extra'].get('issne'), - ident=row['ident'], + issnl=row["issnl"], + issnp=row["extra"].get("issnp"), + issne=row["extra"].get("issne"), + ident=row["ident"], name=name, container_type=container_type, - publisher=clean(row.get('publisher')), - wikidata_qid=row.get('wikidata_qid'), - extra=extra) + publisher=clean(row.get("publisher")), + wikidata_qid=row.get("wikidata_qid"), + extra=extra, + ) return ce def try_update(self, ce): @@ -86,12 +97,12 @@ class ChoculaImporter(EntityImporter): except fatcat_openapi_client.rest.ApiException as err: if err.status != 404: raise err - self.counts['exists'] += 1 - self.counts['exists-not-found'] += 1 + self.counts["exists"] += 1 + self.counts["exists-not-found"] += 1 return False - if existing.state != 'active': - self.counts['exists'] += 1 - self.counts['exists-inactive'] += 1 + if existing.state != "active": + self.counts["exists"] += 1 + self.counts["exists-inactive"] += 1 return False if not existing: @@ -102,8 +113,8 @@ class ChoculaImporter(EntityImporter): if err.status != 404: raise err if existing: - self.counts['exists'] += 1 - self.counts['exists-by-issnl'] += 1 + self.counts["exists"] += 1 + self.counts["exists-by-issnl"] += 1 return False # doesn't exist, always create return True @@ -111,18 +122,22 @@ class ChoculaImporter(EntityImporter): # decide whether to update do_update = False if not self.do_updates: - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False if not existing.extra: existing.extra = dict() - if ce.extra.get('urls') and set(ce.extra.get('urls', [])) != set(existing.extra.get('urls', [])): + if ce.extra.get("urls") and set(ce.extra.get("urls", [])) != set( + existing.extra.get("urls", []) + ): do_update = True - if ce.extra.get('webarchive_urls') and set(ce.extra.get('webarchive_urls', [])) != set(existing.extra.get('webarchive_urls', [])): + if ce.extra.get("webarchive_urls") and set(ce.extra.get("webarchive_urls", [])) != set( + existing.extra.get("webarchive_urls", []) + ): do_update = True - for k in ('ezb', 'szczepanski', 'publisher_type', 'platform'): + for k in ("ezb", "szczepanski", "publisher_type", "platform"): if ce.extra.get(k) and not existing.extra.get(k): do_update = True - for k in ('kbart', 'ia', 'doaj'): + for k in ("kbart", "ia", "doaj"): # always update these fields if not equal (chocula override) if ce.extra.get(k) and ce.extra[k] != existing.extra.get(k): do_update = True @@ -137,41 +152,53 @@ class ChoculaImporter(EntityImporter): existing.container_type = existing.container_type or ce.container_type existing.issne = existing.issne or ce.issne existing.issnp = existing.issnp or ce.issnp - for k in ('urls', 'webarchive_urls'): + for k in ("urls", "webarchive_urls"): # be conservative about URL updates; don't clobber existing URL lists # may want to make this behavior more sophisticated in the # future, or at least a config flag if ce.extra.get(k) and not existing.extra.get(k): existing.extra[k] = ce.extra.get(k, []) - for k in ('sherpa_romeo', 'ezb', 'szczepanski', 'doaj', 'ia', - 'scielo', 'kbart', 'publisher_type', 'platform'): + for k in ( + "sherpa_romeo", + "ezb", + "szczepanski", + "doaj", + "ia", + "scielo", + "kbart", + "publisher_type", + "platform", + ): # always update (chocula over-rides) if ce.extra.get(k): existing.extra[k] = ce.extra[k] - for k in ('country',): + for k in ("country",): # only include if not set (don't clobber human edits) if ce.extra.get(k) and not existing.extra.get(k): existing.extra[k] = ce.extra[k] - if ce.extra.get('languages'): - if not existing.extra.get('languages'): - existing.extra['languages'] = ce.extra['languages'] - elif not ce.extra['languages'][0] in existing.extra['languages']: - existing.extra['languages'].append(ce.extra['languages'][0]) + if ce.extra.get("languages"): + if not existing.extra.get("languages"): + existing.extra["languages"] = ce.extra["languages"] + elif not ce.extra["languages"][0] in existing.extra["languages"]: + existing.extra["languages"].append(ce.extra["languages"][0]) self.api.update_container(self.get_editgroup_id(), existing.ident, existing) - self.counts['update'] += 1 + self.counts["update"] += 1 return False else: - self.counts['exists'] += 1 - self.counts['exists-skip-update'] += 1 + self.counts["exists"] += 1 + self.counts["exists-skip-update"] += 1 return False # if we got this far, it's a bug raise NotImplementedError def insert_batch(self, batch): - self.api.create_container_auto_batch(fatcat_openapi_client.ContainerAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_container_auto_batch( + fatcat_openapi_client.ContainerAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index e33a2012..2639c85a 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -1,4 +1,3 @@ - import csv import datetime import json @@ -34,7 +33,6 @@ SANE_MAX_URLS: int = 100 DOMAIN_REL_MAP: Dict[str, str] = { "archive.org": "archive", # LOCKSS, Portico, DuraSpace, etc would also be "archive" - "arxiv.org": "repository", "babel.hathitrust.org": "repository", "cds.cern.ch": "repository", @@ -53,7 +51,6 @@ DOMAIN_REL_MAP: Dict[str, str] = { "zenodo.org": "repository", "www.biorxiv.org": "repository", "www.medrxiv.org": "repository", - "citeseerx.ist.psu.edu": "aggregator", "publisher-connector.core.ac.uk": "aggregator", "core.ac.uk": "aggregator", @@ -62,7 +59,6 @@ DOMAIN_REL_MAP: Dict[str, str] = { "pdfs.semanticscholar.org": "aggregator", "semanticscholar.org": "aggregator", "www.semanticscholar.org": "aggregator", - "academic.oup.com": "publisher", "cdn.elifesciences.org": "publisher", "cell.com": "publisher", @@ -86,15 +82,14 @@ DOMAIN_REL_MAP: Dict[str, str] = { "ehp.niehs.nih.gov": "publisher", "journals.tsu.ru": "publisher", "www.cogentoa.com": "publisher", - "www.researchgate.net": "academicsocial", "academia.edu": "academicsocial", - "wayback.archive-it.org": "webarchive", "web.archive.org": "webarchive", "archive.is": "webarchive", } + def make_rel_url(raw_url: str, default_link_rel: str = "web"): # this is where we map specific domains to rel types, and also filter out # bad domains, invalid URLs, etc @@ -105,12 +100,17 @@ def make_rel_url(raw_url: str, default_link_rel: str = "web"): break return (rel, raw_url) + def test_make_rel_url(): assert make_rel_url("http://example.com/thing.pdf")[0] == "web" assert make_rel_url("http://example.com/thing.pdf", default_link_rel="jeans")[0] == "jeans" - assert make_rel_url("https://web.archive.org/web/*/http://example.com/thing.pdf")[0] == "webarchive" + assert ( + make_rel_url("https://web.archive.org/web/*/http://example.com/thing.pdf")[0] + == "webarchive" + ) assert make_rel_url("http://cell.com/thing.pdf")[0] == "publisher" + class EntityImporter: """ Base class for fatcat entity importers. @@ -147,23 +147,26 @@ class EntityImporter: def __init__(self, api, **kwargs): - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['git_rev'] = eg_extra.get('git_rev', - subprocess.check_output(["git", "describe", "--always"]).strip()).decode('utf-8') - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.EntityImporter') + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["git_rev"] = eg_extra.get( + "git_rev", subprocess.check_output(["git", "describe", "--always"]).strip() + ).decode("utf-8") + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.EntityImporter") self.api = api - self.do_updates = bool(kwargs.get('do_updates', True)) - self.do_fuzzy_match: bool = kwargs.get('do_fuzzy_match', True) - self.bezerk_mode: bool = kwargs.get('bezerk_mode', False) - self.submit_mode: bool = kwargs.get('submit_mode', False) - self.edit_batch_size: int = kwargs.get('edit_batch_size', 100) - self.editgroup_description: Optional[str] = kwargs.get('editgroup_description') + self.do_updates = bool(kwargs.get("do_updates", True)) + self.do_fuzzy_match: bool = kwargs.get("do_fuzzy_match", True) + self.bezerk_mode: bool = kwargs.get("bezerk_mode", False) + self.submit_mode: bool = kwargs.get("submit_mode", False) + self.edit_batch_size: int = kwargs.get("edit_batch_size", 100) + self.editgroup_description: Optional[str] = kwargs.get("editgroup_description") self.editgroup_extra: Optional[Any] = eg_extra - self.es_client = kwargs.get('es_client') + self.es_client = kwargs.get("es_client") if not self.es_client: - self.es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki", timeout=120) + self.es_client = elasticsearch.Elasticsearch( + "https://search.fatcat.wiki", timeout=120 + ) self._issnl_id_map: Dict[str, Any] = dict() self._orcid_id_map: Dict[str, Any] = dict() @@ -174,7 +177,7 @@ class EntityImporter: self.reset() def reset(self) -> None: - self.counts = Counter({'total': 0, 'skip': 0, 'insert': 0, 'update': 0, 'exists': 0}) + self.counts = Counter({"total": 0, "skip": 0, "insert": 0, "update": 0, "exists": 0}) self._edit_count: int = 0 self._editgroup_id: Optional[str] = None self._entity_queue: List[Any] = [] @@ -184,13 +187,13 @@ class EntityImporter: """ Returns nothing. """ - self.counts['total'] += 1 + self.counts["total"] += 1 if (not raw_record) or (not self.want(raw_record)): - self.counts['skip'] += 1 + self.counts["skip"] += 1 return entity = self.parse_record(raw_record) if not entity: - self.counts['skip'] += 1 + self.counts["skip"] += 1 return if self.bezerk_mode: self.push_entity(entity) @@ -230,7 +233,7 @@ class EntityImporter: if self._entity_queue: self.insert_batch(self._entity_queue) - self.counts['insert'] += len(self._entity_queue) + self.counts["insert"] += len(self._entity_queue) self._entity_queue = [] return self.counts @@ -248,8 +251,9 @@ class EntityImporter: if not self._editgroup_id: eg = self.api.create_editgroup( fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra)) + description=self.editgroup_description, extra=self.editgroup_extra + ) + ) self._editgroup_id = eg.editgroup_id self._edit_count += edits @@ -257,30 +261,30 @@ class EntityImporter: def create_container(self, entity): eg_id = self.get_editgroup_id() - self.counts['inserted.container'] += 1 + self.counts["inserted.container"] += 1 return self.api.create_container(eg_id, entity) def create_release(self, entity): eg_id = self.get_editgroup_id() - self.counts['inserted.release'] += 1 + self.counts["inserted.release"] += 1 return self.api.create_release(eg_id, entity) def create_file(self, entity): eg_id = self.get_editgroup_id() - self.counts['inserted.file'] += 1 + self.counts["inserted.file"] += 1 return self.api.create_file(eg_id, entity) def updated(self): """ Implementations should call this from try_update() if the update was successful """ - self.counts['update'] += 1 + self.counts["update"] += 1 def push_entity(self, entity): self._entity_queue.append(entity) if len(self._entity_queue) >= self.edit_batch_size: self.insert_batch(self._entity_queue) - self.counts['insert'] += len(self._entity_queue) + self.counts["insert"] += len(self._entity_queue) self._entity_queue = [] def want(self, raw_record: Any) -> bool: @@ -324,7 +328,7 @@ class EntityImporter: # If anything other than a 404 (not found), something is wrong if ae.status != 404: raise ae - self._orcid_id_map[orcid] = creator_id # might be None + self._orcid_id_map[orcid] = creator_id # might be None return creator_id def is_doi(self, doi: str) -> bool: @@ -347,7 +351,7 @@ class EntityImporter: # If anything other than a 404 (not found), something is wrong if ae.status != 404: raise ae - self._doi_id_map[doi] = release_id # might be None + self._doi_id_map[doi] = release_id # might be None return release_id def lookup_pmid(self, pmid: str): @@ -364,11 +368,11 @@ class EntityImporter: # If anything other than a 404 (not found), something is wrong if ae.status != 404: raise ae - self._pmid_id_map[pmid] = release_id # might be None + self._pmid_id_map[pmid] = release_id # might be None return release_id def is_issnl(self, issnl: str) -> bool: - return len(issnl) == 9 and issnl[4] == '-' + return len(issnl) == 9 and issnl[4] == "-" def lookup_issnl(self, issnl: str): """Caches calls to the ISSN-L lookup API endpoint in a local dict""" @@ -382,7 +386,7 @@ class EntityImporter: # If anything other than a 404 (not found), something is wrong if ae.status != 404: raise ae - self._issnl_id_map[issnl] = container_id # might be None + self._issnl_id_map[issnl] = container_id # might be None return container_id def read_issn_map_file(self, issn_map_file): @@ -417,26 +421,26 @@ class EntityImporter: # update old/deprecated 'rel' on URLs for i in range(len(existing.urls)): u = existing.urls[i] - if u.rel == 'repository' and '://archive.org/download/' in u.url: - existing.urls[i].rel = 'archive' - if u.rel == 'social': - u.rel = 'academicsocial' + if u.rel == "repository" and "://archive.org/download/" in u.url: + existing.urls[i].rel = "archive" + if u.rel == "social": + u.rel = "academicsocial" # remove URLs which are near-duplicates redundant_urls = [] all_urls = [u.url for u in existing.urls] - all_wayback_urls = [u.url for u in existing.urls if '://web.archive.org/web/' in u.url] + all_wayback_urls = [u.url for u in existing.urls if "://web.archive.org/web/" in u.url] for url in all_urls: # https/http redundancy - if url.startswith('http://') and url.replace('http://', 'https://', 1) in all_urls: + if url.startswith("http://") and url.replace("http://", "https://", 1) in all_urls: redundant_urls.append(url) continue # default HTTP port included and not included - if ':80/' in url and url.replace(':80', '', 1) in all_urls: + if ":80/" in url and url.replace(":80", "", 1) in all_urls: redundant_urls.append(url) continue # partial and complete wayback timestamps - if '://web.archive.org/web/2017/' in url: + if "://web.archive.org/web/2017/" in url: original_url = "/".join(url.split("/")[5:]) assert len(original_url) > 5 for wb_url in all_wayback_urls: @@ -452,7 +456,9 @@ class EntityImporter: def generic_fileset_cleanups(existing): return existing - def match_existing_release_fuzzy(self, release: ReleaseEntity) -> Optional[Tuple[str, str, ReleaseEntity]]: + def match_existing_release_fuzzy( + self, release: ReleaseEntity + ) -> Optional[Tuple[str, str, ReleaseEntity]]: """ This helper function uses fuzzycat (and elasticsearch) to look for existing release entities with similar metadata. @@ -488,7 +494,15 @@ class EntityImporter: return None release_dict = entity_to_dict(release, api_client=self.api.api_client) - verified = [(fuzzycat.verify.verify(release_dict, entity_to_dict(c, api_client=self.api.api_client)), c) for c in candidates] + verified = [ + ( + fuzzycat.verify.verify( + release_dict, entity_to_dict(c, api_client=self.api.api_client) + ), + c, + ) + for c in candidates + ] # chose the "closest" match closest = sorted(verified, key=lambda v: STATUS_SORT[v[0].status])[0] @@ -522,7 +536,6 @@ class RecordPusher: class JsonLinePusher(RecordPusher): - def __init__(self, importer, json_file, **kwargs): self.importer = importer self.json_file = json_file @@ -539,10 +552,9 @@ class JsonLinePusher(RecordPusher): class CsvPusher(RecordPusher): - def __init__(self, importer, csv_file, **kwargs): self.importer = importer - self.reader = csv.DictReader(csv_file, delimiter=kwargs.get('delimiter', ',')) + self.reader = csv.DictReader(csv_file, delimiter=kwargs.get("delimiter", ",")) def run(self): for line in self.reader: @@ -555,7 +567,6 @@ class CsvPusher(RecordPusher): class LinePusher(RecordPusher): - def __init__(self, importer, text_file, **kwargs): self.importer = importer self.text_file = text_file @@ -571,17 +582,15 @@ class LinePusher(RecordPusher): class SqlitePusher(RecordPusher): - def __init__(self, importer, db_file, table_name, where_clause="", **kwargs): self.importer = importer - self.db = sqlite3.connect(db_file, isolation_level='EXCLUSIVE') + self.db = sqlite3.connect(db_file, isolation_level="EXCLUSIVE") self.db.row_factory = sqlite3.Row self.table_name = table_name self.where_clause = where_clause def run(self): - cur = self.db.execute("SELECT * FROM {} {};".format( - self.table_name, self.where_clause)) + cur = self.db.execute("SELECT * FROM {} {};".format(self.table_name, self.where_clause)) for row in cur: self.importer.push_record(row) counts = self.importer.finish() @@ -590,7 +599,6 @@ class SqlitePusher(RecordPusher): class Bs4XmlLinesPusher(RecordPusher): - def __init__(self, importer, xml_file, prefix_filter=None, **kwargs): self.importer = importer self.xml_file = xml_file @@ -611,7 +619,6 @@ class Bs4XmlLinesPusher(RecordPusher): class Bs4XmlFilePusher(RecordPusher): - def __init__(self, importer, xml_file, record_tag, **kwargs): self.importer = importer self.xml_file = xml_file @@ -684,7 +691,6 @@ class Bs4XmlLargeFilePusher(RecordPusher): class Bs4XmlFileListPusher(RecordPusher): - def __init__(self, importer, list_file, record_tag, **kwargs): self.importer = importer self.list_file = list_file @@ -695,7 +701,7 @@ class Bs4XmlFileListPusher(RecordPusher): xml_path = xml_path.strip() if not xml_path or xml_path.startswith("#"): continue - with open(xml_path, 'r') as xml_file: + with open(xml_path, "r") as xml_file: soup = BeautifulSoup(xml_file, "xml") for record in soup.find_all(self.record_tag): self.importer.push_record(record) @@ -705,10 +711,12 @@ class Bs4XmlFileListPusher(RecordPusher): print(counts) return counts + class KafkaBs4XmlPusher(RecordPusher): """ Fetch XML for an article from Kafka, parse via Bs4. """ + def __init__(self, importer, kafka_hosts, kafka_env, topic_suffix, group, **kwargs): self.importer = importer self.consumer = make_kafka_consumer( @@ -716,10 +724,10 @@ class KafkaBs4XmlPusher(RecordPusher): kafka_env, topic_suffix, group, - kafka_namespace=kwargs.get('kafka_namespace', 'fatcat') + kafka_namespace=kwargs.get("kafka_namespace", "fatcat"), ) - self.poll_interval = kwargs.get('poll_interval', 5.0) - self.consume_batch_size = kwargs.get('consume_batch_size', 25) + self.poll_interval = kwargs.get("poll_interval", 5.0) + self.consume_batch_size = kwargs.get("consume_batch_size", 25) def run(self): count = 0 @@ -735,16 +743,19 @@ class KafkaBs4XmlPusher(RecordPusher): # outstanding editgroups every 5 minutes, but there is still that # window when editgroups might be hanging (unsubmitted). batch = self.consumer.consume( - num_messages=self.consume_batch_size, - timeout=self.poll_interval) - print("... got {} kafka messages ({}sec poll interval) {}".format( - len(batch), self.poll_interval, self.importer.counts)) + num_messages=self.consume_batch_size, timeout=self.poll_interval + ) + print( + "... got {} kafka messages ({}sec poll interval) {}".format( + len(batch), self.poll_interval, self.importer.counts + ) + ) if not batch: if datetime.datetime.now() - last_push > datetime.timedelta(minutes=5): # it has been some time, so flush any current editgroup self.importer.finish() last_push = datetime.datetime.now() - #print("Flushed any partial import batch: {}".format(self.importer.counts)) + # print("Flushed any partial import batch: {}".format(self.importer.counts)) continue # first check errors on entire batch... for msg in batch: @@ -752,7 +763,7 @@ class KafkaBs4XmlPusher(RecordPusher): raise KafkaException(msg.error()) # ... then process for msg in batch: - soup = BeautifulSoup(msg.value().decode('utf-8'), "xml") + soup = BeautifulSoup(msg.value().decode("utf-8"), "xml") self.importer.push_record(soup) soup.decompose() count += 1 @@ -771,8 +782,8 @@ class KafkaBs4XmlPusher(RecordPusher): self.consumer.close() return counts -class KafkaJsonPusher(RecordPusher): +class KafkaJsonPusher(RecordPusher): def __init__(self, importer, kafka_hosts, kafka_env, topic_suffix, group, **kwargs): self.importer = importer self.consumer = make_kafka_consumer( @@ -780,11 +791,11 @@ class KafkaJsonPusher(RecordPusher): kafka_env, topic_suffix, group, - kafka_namespace=kwargs.get('kafka_namespace', 'fatcat') + kafka_namespace=kwargs.get("kafka_namespace", "fatcat"), ) - self.poll_interval = kwargs.get('poll_interval', 5.0) - self.consume_batch_size = kwargs.get('consume_batch_size', 100) - self.force_flush = kwargs.get('force_flush', False) + self.poll_interval = kwargs.get("poll_interval", 5.0) + self.consume_batch_size = kwargs.get("consume_batch_size", 100) + self.force_flush = kwargs.get("force_flush", False) def run(self): count = 0 @@ -801,10 +812,13 @@ class KafkaJsonPusher(RecordPusher): # outstanding editgroups every 5 minutes, but there is still that # window when editgroups might be hanging (unsubmitted). batch = self.consumer.consume( - num_messages=self.consume_batch_size, - timeout=self.poll_interval) - print("... got {} kafka messages ({}sec poll interval) {}".format( - len(batch), self.poll_interval, self.importer.counts)) + num_messages=self.consume_batch_size, timeout=self.poll_interval + ) + print( + "... got {} kafka messages ({}sec poll interval) {}".format( + len(batch), self.poll_interval, self.importer.counts + ) + ) if self.force_flush: # this flushing happens even if there have been 'push' events # more recently. it is intended for, eg, importers off the @@ -821,7 +835,7 @@ class KafkaJsonPusher(RecordPusher): self.importer.finish() last_push = datetime.datetime.now() last_force_flush = datetime.datetime.now() - #print("Flushed any partial import batch: {}".format(self.importer.counts)) + # print("Flushed any partial import batch: {}".format(self.importer.counts)) continue # first check errors on entire batch... for msg in batch: @@ -829,7 +843,7 @@ class KafkaJsonPusher(RecordPusher): raise KafkaException(msg.error()) # ... then process for msg in batch: - record = json.loads(msg.value().decode('utf-8')) + record = json.loads(msg.value().decode("utf-8")) self.importer.push_record(record) count += 1 if count % 500 == 0: @@ -864,25 +878,25 @@ def make_kafka_consumer(hosts, env, topic_suffix, group, kafka_namespace="fatcat print("Bailing out...") # TODO: should it be sys.exit(-1)? raise KafkaException(p.error) - #print("Kafka consumer commit successful") + # print("Kafka consumer commit successful") pass # previously, using pykafka - #auto_commit_enable=True, - #auto_commit_interval_ms=30000, # 30 seconds + # auto_commit_enable=True, + # auto_commit_interval_ms=30000, # 30 seconds conf = { - 'bootstrap.servers': hosts, - 'group.id': group, - 'on_commit': fail_fast, + "bootstrap.servers": hosts, + "group.id": group, + "on_commit": fail_fast, # messages don't have offset marked as stored until pushed to # elastic, but we do auto-commit stored offsets to broker - 'enable.auto.offset.store': False, - 'enable.auto.commit': True, + "enable.auto.offset.store": False, + "enable.auto.commit": True, # user code timeout; if no poll after this long, assume user code # hung and rebalance (default: 5min) - 'max.poll.interval.ms': 120000, - 'default.topic.config': { - 'auto.offset.reset': 'latest', + "max.poll.interval.ms": 120000, + "default.topic.config": { + "auto.offset.reset": "latest", }, } @@ -890,13 +904,13 @@ def make_kafka_consumer(hosts, env, topic_suffix, group, kafka_namespace="fatcat for p in partitions: if p.error: raise KafkaException(p.error) - print("Kafka partitions rebalanced: {} / {}".format( - consumer, partitions)) + print("Kafka partitions rebalanced: {} / {}".format(consumer, partitions)) consumer = Consumer(conf) # NOTE: it's actually important that topic_name *not* be bytes (UTF-8 # encoded) - consumer.subscribe([topic_name], + consumer.subscribe( + [topic_name], on_assign=on_rebalance, on_revoke=on_rebalance, ) diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index fd6936a4..606d4bb1 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -1,4 +1,3 @@ - import datetime import sqlite3 from typing import Any, Dict, Optional @@ -13,30 +12,30 @@ from .common import EntityImporter, clean # Can get a list of Crossref types (with counts) via API: # https://api.crossref.org/works?rows=0&facet=type-name:* CROSSREF_TYPE_MAP: Dict[str, Optional[str]] = { - 'book': 'book', - 'book-chapter': 'chapter', - 'book-part': 'chapter', - 'book-section': 'chapter', - 'component': 'component', - 'dataset': 'dataset', - 'dissertation': 'thesis', - 'edited-book': 'book', - 'journal-article': 'article-journal', - 'monograph': 'book', - 'other': None, - 'peer-review': 'peer_review', - 'posted-content': 'post', - 'proceedings-article': 'paper-conference', - 'reference-book': 'book', - 'reference-entry': 'entry', - 'report': 'report', - 'standard': 'standard', + "book": "book", + "book-chapter": "chapter", + "book-part": "chapter", + "book-section": "chapter", + "component": "component", + "dataset": "dataset", + "dissertation": "thesis", + "edited-book": "book", + "journal-article": "article-journal", + "monograph": "book", + "other": None, + "peer-review": "peer_review", + "posted-content": "post", + "proceedings-article": "paper-conference", + "reference-book": "book", + "reference-entry": "entry", + "report": "report", + "standard": "standard", } CONTAINER_TYPE_MAP: Dict[str, str] = { - 'article-journal': 'journal', - 'paper-conference': 'conference', - 'book': 'book-series', + "article-journal": "journal", + "paper-conference": "conference", + "book": "book-series", } # These are based, informally, on sorting the most popular licenses found in @@ -90,29 +89,41 @@ LICENSE_SLUG_MAP: Dict[str, str] = { "//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0", } + def lookup_license_slug(raw: str) -> Optional[str]: if not raw: return None - raw = raw.strip().replace('http://', '//').replace('https://', '//') - if 'creativecommons.org' in raw.lower(): + raw = raw.strip().replace("http://", "//").replace("https://", "//") + if "creativecommons.org" in raw.lower(): raw = raw.lower() - raw = raw.replace('/legalcode', '/').replace('/uk', '') - if not raw.endswith('/'): - raw = raw + '/' + raw = raw.replace("/legalcode", "/").replace("/uk", "") + if not raw.endswith("/"): + raw = raw + "/" return LICENSE_SLUG_MAP.get(raw) + def test_lookup_license_slug(): assert lookup_license_slug("https://creativecommons.org/licenses/by-nc/3.0/") == "CC-BY-NC" - assert lookup_license_slug("http://creativecommons.org/licenses/by/2.0/uk/legalcode") == "CC-BY" - assert lookup_license_slug("https://creativecommons.org/publicdomain/zero/1.0/legalcode") == "CC-0" + assert ( + lookup_license_slug("http://creativecommons.org/licenses/by/2.0/uk/legalcode") + == "CC-BY" + ) + assert ( + lookup_license_slug("https://creativecommons.org/publicdomain/zero/1.0/legalcode") + == "CC-0" + ) assert lookup_license_slug("http://creativecommons.org/licenses/by/4.0") == "CC-BY" - assert lookup_license_slug("https://creativecommons.org/licenses/by-nc-sa/4.0/") == "CC-BY-NC-SA" + assert ( + lookup_license_slug("https://creativecommons.org/licenses/by-nc-sa/4.0/") + == "CC-BY-NC-SA" + ) assert lookup_license_slug("https://www.ametsoc.org/PUBSReuseLicenses") == "AMETSOC" assert lookup_license_slug("https://www.amec.org/PUBSReuseLicenses") is None assert lookup_license_slug("") is None assert lookup_license_slug(None) is None + class CrossrefImporter(EntityImporter): """ Importer for Crossref metadata. @@ -124,18 +135,22 @@ class CrossrefImporter(EntityImporter): def __init__(self, api, issn_map_file, **kwargs): - eg_desc: Optional[str] = kwargs.get('editgroup_description', - "Automated import of Crossref DOI metadata, harvested from REST API") - eg_extra: Optional[dict] = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.CrossrefImporter') - super().__init__(api, + eg_desc: Optional[str] = kwargs.get( + "editgroup_description", + "Automated import of Crossref DOI metadata, harvested from REST API", + ) + eg_extra: Optional[dict] = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.CrossrefImporter") + super().__init__( + api, issn_map_file=issn_map_file, editgroup_description=eg_desc, editgroup_extra=eg_extra, - **kwargs) + **kwargs + ) - self.create_containers: bool = kwargs.get('create_containers', True) - extid_map_file = kwargs.get('extid_map_file') + self.create_containers: bool = kwargs.get("create_containers", True) + extid_map_file = kwargs.get("extid_map_file") self.extid_map_db: Optional[Any] = None if extid_map_file: db_uri = "file:{}?mode=ro".format(extid_map_file) @@ -148,12 +163,27 @@ class CrossrefImporter(EntityImporter): def lookup_ext_ids(self, doi: str) -> Optional[Any]: if self.extid_map_db is None: - return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) - row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", - [doi.lower()]).fetchone() + return dict( + core_id=None, + pmid=None, + pmcid=None, + wikidata_qid=None, + arxiv_id=None, + jstor_id=None, + ) + row = self.extid_map_db.execute( + "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()] + ).fetchone() if row is None: - return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) - row = [str(cell or '') or None for cell in row] + return dict( + core_id=None, + pmid=None, + pmcid=None, + wikidata_qid=None, + arxiv_id=None, + jstor_id=None, + ) + row = [str(cell or "") or None for cell in row] return dict( core_id=row[0], pmid=row[1], @@ -173,17 +203,17 @@ class CrossrefImporter(EntityImporter): return CONTAINER_TYPE_MAP.get(crossref_type) def want(self, obj: Dict[str, Any]) -> bool: - if not obj.get('title'): - self.counts['skip-blank-title'] += 1 + if not obj.get("title"): + self.counts["skip-blank-title"] += 1 return False # these are pre-registered DOIs before the actual record is ready # title is a list of titles - titles = obj.get('title') + titles = obj.get("title") if titles is not None and titles[0].strip().lower() in [ - "OUP accepted manuscript".lower(), - ]: - self.counts['skip-stub-title'] += 1 + "OUP accepted manuscript".lower(), + ]: + self.counts["skip-stub-title"] += 1 return False # do most of these checks in-line below @@ -197,86 +227,105 @@ class CrossrefImporter(EntityImporter): # Ways to be out of scope (provisionally) # journal-issue and journal-volume map to None, but allowed for now - if obj.get('type') in (None, 'journal', 'proceedings', - 'standard-series', 'report-series', 'book-series', 'book-set', - 'book-track', 'proceedings-series'): - self.counts['skip-release-type'] += 1 + if obj.get("type") in ( + None, + "journal", + "proceedings", + "standard-series", + "report-series", + "book-series", + "book-set", + "book-track", + "proceedings-series", + ): + self.counts["skip-release-type"] += 1 return None # Do require the 'title' keys to exist, as release entities do - if ('title' not in obj) or (not obj['title']): - self.counts['skip-blank-title'] += 1 + if ("title" not in obj) or (not obj["title"]): + self.counts["skip-blank-title"] += 1 return None - release_type = self.map_release_type(obj['type']) + release_type = self.map_release_type(obj["type"]) # contribs def do_contribs(obj_list, ctype): contribs = [] for i, am in enumerate(obj_list): creator_id = None - if 'ORCID' in am.keys(): - creator_id = self.lookup_orcid(am['ORCID'].split('/')[-1]) + if "ORCID" in am.keys(): + creator_id = self.lookup_orcid(am["ORCID"].split("/")[-1]) # Sorry humans :( - if am.get('given') and am.get('family'): - raw_name = "{} {}".format(am['given'], am['family']) - elif am.get('family'): - raw_name = am['family'] + if am.get("given") and am.get("family"): + raw_name = "{} {}".format(am["given"], am["family"]) + elif am.get("family"): + raw_name = am["family"] else: # TODO: can end up empty - raw_name = am.get('name') or am.get('given') + raw_name = am.get("name") or am.get("given") extra = dict() if ctype == "author": index = i else: index = None raw_affiliation = None - if am.get('affiliation'): - if len(am.get('affiliation')) > 0: - raw_affiliation = am.get('affiliation')[0]['name'] - if len(am.get('affiliation')) > 1: + if am.get("affiliation"): + if len(am.get("affiliation")) > 0: + raw_affiliation = am.get("affiliation")[0]["name"] + if len(am.get("affiliation")) > 1: # note: affiliation => more_affiliations - extra['more_affiliations'] = [clean(a['name']) for a in am.get('affiliation')[1:]] - if am.get('sequence') and am.get('sequence') != "additional": - extra['seq'] = clean(am.get('sequence')) + extra["more_affiliations"] = [ + clean(a["name"]) for a in am.get("affiliation")[1:] + ] + if am.get("sequence") and am.get("sequence") != "additional": + extra["seq"] = clean(am.get("sequence")) if not extra: extra = None assert ctype in ("author", "editor", "translator") raw_name = clean(raw_name) - contribs.append(fatcat_openapi_client.ReleaseContrib( - creator_id=creator_id, - index=index, - raw_name=raw_name, - given_name=clean(am.get('given')), - surname=clean(am.get('family')), - raw_affiliation=clean(raw_affiliation), - role=ctype, - extra=extra)) + contribs.append( + fatcat_openapi_client.ReleaseContrib( + creator_id=creator_id, + index=index, + raw_name=raw_name, + given_name=clean(am.get("given")), + surname=clean(am.get("family")), + raw_affiliation=clean(raw_affiliation), + role=ctype, + extra=extra, + ) + ) return contribs - contribs = do_contribs(obj.get('author', []), "author") - contribs.extend(do_contribs(obj.get('editor', []), "editor")) - contribs.extend(do_contribs(obj.get('translator', []), "translator")) + + contribs = do_contribs(obj.get("author", []), "author") + contribs.extend(do_contribs(obj.get("editor", []), "editor")) + contribs.extend(do_contribs(obj.get("translator", []), "translator")) # container - issn = obj.get('ISSN', [None])[0] + issn = obj.get("ISSN", [None])[0] issnl = self.issn2issnl(issn) container_id = None if issnl: container_id = self.lookup_issnl(issnl) - publisher = clean(obj.get('publisher')) + publisher = clean(obj.get("publisher")) - container_name = obj.get('container-title') + container_name = obj.get("container-title") if container_name: container_name = clean(container_name[0], force_xml=True) if not container_name: container_name = None - if (container_id is None and self.create_containers and (issnl is not None) - and container_name): + if ( + container_id is None + and self.create_containers + and (issnl is not None) + and container_name + ): ce = fatcat_openapi_client.ContainerEntity( issnl=issnl, publisher=publisher, container_type=self.map_container_type(release_type), - name=container_name) + name=container_name, + ) ce_edit = self.create_container(ce) container_id = ce_edit.ident self._issnl_id_map[issnl] = container_id @@ -284,21 +333,21 @@ class CrossrefImporter(EntityImporter): # license slug license_slug = None license_extra = [] - for lic in obj.get('license', []): - if lic['content-version'] not in ('vor', 'unspecified'): + for lic in obj.get("license", []): + if lic["content-version"] not in ("vor", "unspecified"): continue - slug = lookup_license_slug(lic['URL']) + slug = lookup_license_slug(lic["URL"]) if slug: license_slug = slug - if 'start' in lic: - lic['start'] = lic['start']['date-time'] + if "start" in lic: + lic["start"] = lic["start"]["date-time"] license_extra.append(lic) # references refs = [] - for i, rm in enumerate(obj.get('reference', [])): + for i, rm in enumerate(obj.get("reference", [])): try: - year: Optional[int] = int(rm.get('year')) + year: Optional[int] = int(rm.get("year")) # TODO: will need to update/config in the future! # NOTE: are there crossref works with year < 100? if year is not None: @@ -307,56 +356,78 @@ class CrossrefImporter(EntityImporter): except (TypeError, ValueError): year = None ref_extra: Dict[str, Any] = dict() - key = rm.get('key') - if key and key.startswith(obj['DOI'].upper()): - key = key.replace(obj['DOI'].upper() + "-", '') - key = key.replace(obj['DOI'].upper(), '') - ref_container_name = rm.get('volume-title') + key = rm.get("key") + if key and key.startswith(obj["DOI"].upper()): + key = key.replace(obj["DOI"].upper() + "-", "") + key = key.replace(obj["DOI"].upper(), "") + ref_container_name = rm.get("volume-title") if not ref_container_name: - ref_container_name = rm.get('journal-title') - elif rm.get('journal-title'): - ref_extra['journal-title'] = rm['journal-title'] - if rm.get('DOI'): - ref_extra['doi'] = rm.get('DOI').lower() - author = clean(rm.get('author')) + ref_container_name = rm.get("journal-title") + elif rm.get("journal-title"): + ref_extra["journal-title"] = rm["journal-title"] + if rm.get("DOI"): + ref_extra["doi"] = rm.get("DOI").lower() + author = clean(rm.get("author")) if author: - ref_extra['authors'] = [author] - for k in ('editor', 'edition', 'authority', 'version', 'genre', - 'url', 'event', 'issue', 'volume', 'date', 'accessed_date', - 'issued', 'page', 'medium', 'collection_title', 'chapter_number', - 'unstructured', 'series-title', 'volume-title'): + ref_extra["authors"] = [author] + for k in ( + "editor", + "edition", + "authority", + "version", + "genre", + "url", + "event", + "issue", + "volume", + "date", + "accessed_date", + "issued", + "page", + "medium", + "collection_title", + "chapter_number", + "unstructured", + "series-title", + "volume-title", + ): if clean(rm.get(k)): ref_extra[k] = clean(rm[k]) if not ref_extra: ref_extra = None - refs.append(fatcat_openapi_client.ReleaseRef( - index=i, - # doing lookups would be a second import pass - target_release_id=None, - key=key, - year=year, - container_name=clean(ref_container_name), - title=clean(rm.get('article-title')), - locator=clean(rm.get('first-page')), - # TODO: just dump JSON somewhere here? - extra=ref_extra)) + refs.append( + fatcat_openapi_client.ReleaseRef( + index=i, + # doing lookups would be a second import pass + target_release_id=None, + key=key, + year=year, + container_name=clean(ref_container_name), + title=clean(rm.get("article-title")), + locator=clean(rm.get("first-page")), + # TODO: just dump JSON somewhere here? + extra=ref_extra, + ) + ) # abstracts abstracts = [] - abstract = clean(obj.get('abstract')) + abstract = clean(obj.get("abstract")) if abstract and len(abstract) > 10: - abstracts.append(fatcat_openapi_client.ReleaseAbstract( - mimetype="application/xml+jats", - content=abstract)) + abstracts.append( + fatcat_openapi_client.ReleaseAbstract( + mimetype="application/xml+jats", content=abstract + ) + ) # extra fields extra = dict() extra_crossref = dict() # top-level extra keys if not container_id: - if obj.get('container-title'): - extra['container_name'] = container_name - for key in ('group-title'): + if obj.get("container-title"): + extra["container_name"] = container_name + for key in "group-title": val = obj.get(key) if val: if type(val) == list: @@ -368,7 +439,7 @@ class CrossrefImporter(EntityImporter): else: extra[key] = val # crossref-nested extra keys - for key in ('subject', 'type', 'alternative-id', 'archive', 'funder'): + for key in ("subject", "type", "alternative-id", "archive", "funder"): val = obj.get(key) if val: if type(val) == str: @@ -376,46 +447,51 @@ class CrossrefImporter(EntityImporter): else: extra_crossref[key] = val if license_extra: - extra_crossref['license'] = license_extra + extra_crossref["license"] = license_extra - if len(obj['title']) > 1: - aliases = [clean(t) for t in obj['title'][1:]] + if len(obj["title"]) > 1: + aliases = [clean(t) for t in obj["title"][1:]] aliases = [t for t in aliases if t] if aliases: - extra['aliases'] = aliases + extra["aliases"] = aliases # ISBN isbn13 = None - for raw in obj.get('ISBN', []): + for raw in obj.get("ISBN", []): # TODO: convert if not ISBN-13 format if len(raw) == 17: isbn13 = raw break # release status - if obj['type'] in ('journal-article', 'conference-proceeding', 'book', - 'dissertation', 'book-chapter'): + if obj["type"] in ( + "journal-article", + "conference-proceeding", + "book", + "dissertation", + "book-chapter", + ): release_stage = "published" else: # unknown release_stage = None # external identifiers - extids: Dict[str, Any] = self.lookup_ext_ids(doi=obj['DOI'].lower()) + extids: Dict[str, Any] = self.lookup_ext_ids(doi=obj["DOI"].lower()) # filter out unreasonably huge releases if len(abstracts) > 100: - self.counts['skip-huge-abstracts'] += 1 + self.counts["skip-huge-abstracts"] += 1 return None if len(contribs) > 2000: - self.counts['skip-huge-contribs'] += 1 + self.counts["skip-huge-contribs"] += 1 return None if len(refs) > 5000: - self.counts['skip-huge-refs'] += 1 + self.counts["skip-huge-refs"] += 1 return None # release date parsing is amazingly complex - raw_date = obj['issued']['date-parts'][0] + raw_date = obj["issued"]["date-parts"][0] if not raw_date or not raw_date[0]: # got some NoneType, even though at least year is supposed to be set release_year = None @@ -429,28 +505,28 @@ class CrossrefImporter(EntityImporter): release_date = None original_title: Optional[str] = None - if obj.get('original-title'): - ot = obj.get('original-title') + if obj.get("original-title"): + ot = obj.get("original-title") if ot is not None: original_title = clean(ot[0], force_xml=True) title: Optional[str] = None - if obj.get('title'): - title = clean(obj.get('title')[0], force_xml=True) + if obj.get("title"): + title = clean(obj.get("title")[0], force_xml=True) if not title or len(title) <= 1: # title can't be just a single character - self.counts['skip-blank-title'] += 1 + self.counts["skip-blank-title"] += 1 return None subtitle = None - if obj.get('subtitle'): - subtitle = clean(obj.get('subtitle')[0], force_xml=True) + if obj.get("subtitle"): + subtitle = clean(obj.get("subtitle")[0], force_xml=True) if not subtitle or len(subtitle) <= 1: # subtitle can't be just a single character subtitle = None if extra_crossref: - extra['crossref'] = extra_crossref + extra["crossref"] = extra_crossref if not extra: extra = None @@ -466,19 +542,19 @@ class CrossrefImporter(EntityImporter): release_year=release_year, publisher=publisher, ext_ids=fatcat_openapi_client.ReleaseExtIds( - doi=obj['DOI'].lower(), - pmid=extids['pmid'], - pmcid=extids['pmcid'], - wikidata_qid=extids['wikidata_qid'], + doi=obj["DOI"].lower(), + pmid=extids["pmid"], + pmcid=extids["pmcid"], + wikidata_qid=extids["wikidata_qid"], isbn13=isbn13, - core=extids['core_id'], - arxiv=extids['arxiv_id'], - jstor=extids['jstor_id'], + core=extids["core_id"], + arxiv=extids["arxiv_id"], + jstor=extids["jstor_id"], ), - volume=clean(obj.get('volume')), - issue=clean(obj.get('issue')), - pages=clean(obj.get('page')), - language=clean(obj.get('language')), + volume=clean(obj.get("volume")), + issue=clean(obj.get("issue")), + pages=clean(obj.get("page")), + language=clean(obj.get("language")), license_slug=license_slug, extra=extra, abstracts=abstracts, @@ -500,14 +576,17 @@ class CrossrefImporter(EntityImporter): # eventually we'll want to support "updates", but for now just skip if # entity already exists if existing: - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False return True def insert_batch(self, batch): - self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_release_auto_batch( + fatcat_openapi_client.ReleaseAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index a06c68a4..4c174b0b 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -30,126 +30,130 @@ MAX_ABSTRACT_LENGTH = 2048 # https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary CONTAINER_TYPE_MAP = { - 'Journal': 'journal', - 'Series': 'journal', - 'Book Series': 'book-series', + "Journal": "journal", + "Series": "journal", + "Book Series": "book-series", } # The docs/guide should be the canonical home for these mappings; update there # first. Map various datacite type types to CSL-ish types. None means TODO or # remove. DATACITE_TYPE_MAP = { - 'ris': { - 'THES': 'thesis', - 'SOUND': 'song', # 99.9% maps to citeproc song, so use that (exception: report) - 'CHAP': 'chapter', - 'FIGURE': 'figure', - 'RPRT': 'report', - 'JOUR': 'article-journal', - 'MPCT': 'motion_picture', - 'GEN': 'article-journal', # GEN consist of 99% article and report, post-weblog, misc - and one dataset - 'BOOK': 'book', - 'DATA': 'dataset', - 'COMP': 'software', + "ris": { + "THES": "thesis", + "SOUND": "song", # 99.9% maps to citeproc song, so use that (exception: report) + "CHAP": "chapter", + "FIGURE": "figure", + "RPRT": "report", + "JOUR": "article-journal", + "MPCT": "motion_picture", + "GEN": "article-journal", # GEN consist of 99% article and report, post-weblog, misc - and one dataset + "BOOK": "book", + "DATA": "dataset", + "COMP": "software", }, - 'schemaOrg': { - 'Dataset': 'dataset', - 'Book': 'book', - 'ScholarlyArticle': 'article-journal', - 'ImageObject': 'graphic', - 'Collection': None, - 'MediaObject': None, - 'Event': None, - 'SoftwareSourceCode': 'software', - 'Chapter': 'chapter', - 'CreativeWork': None, # Seems to be a catch-all resourceType, from PGRFA Material, Pamphlet, to music score. - 'PublicationIssue': 'article', - 'AudioObject': None, - 'Thesis': 'thesis', + "schemaOrg": { + "Dataset": "dataset", + "Book": "book", + "ScholarlyArticle": "article-journal", + "ImageObject": "graphic", + "Collection": None, + "MediaObject": None, + "Event": None, + "SoftwareSourceCode": "software", + "Chapter": "chapter", + "CreativeWork": None, # Seems to be a catch-all resourceType, from PGRFA Material, Pamphlet, to music score. + "PublicationIssue": "article", + "AudioObject": None, + "Thesis": "thesis", }, - 'citeproc': { - 'article': 'article', - 'article-journal': 'article-journal', - 'article-magazine': 'article-magazine', - 'article-newspaper': 'article-newspaper', - 'bill': 'bill', - 'book': 'book', - 'broadcast': 'broadcast', - 'chapter': 'chapter', - 'dataset': 'dataset', - 'entry-dictionary': 'entry-dictionary', - 'entry-encyclopedia': 'entry-encyclopedia', - 'entry': 'entry', - 'figure': 'figure', - 'graphic': 'graphic', - 'interview': 'interview', - 'legal_case': 'legal_case', - 'legislation': 'legislation', - 'manuscript': 'manuscript', - 'map': 'map', - 'motion_picture': 'motion_picture', - 'musical_score': 'musical_score', - 'pamphlet': 'pamphlet', - 'paper-conference': 'paper-conference', - 'patent': 'patent', - 'personal_communication': 'personal_communication', - 'post': 'post', - 'post-weblog': 'post-weblog', - 'report': 'report', - 'review-book': 'review-book', - 'review': 'review', - 'song': 'song', - 'speech': 'speech', - 'thesis': 'thesis', - 'treaty': 'treaty', - 'webpage': 'webpage', + "citeproc": { + "article": "article", + "article-journal": "article-journal", + "article-magazine": "article-magazine", + "article-newspaper": "article-newspaper", + "bill": "bill", + "book": "book", + "broadcast": "broadcast", + "chapter": "chapter", + "dataset": "dataset", + "entry-dictionary": "entry-dictionary", + "entry-encyclopedia": "entry-encyclopedia", + "entry": "entry", + "figure": "figure", + "graphic": "graphic", + "interview": "interview", + "legal_case": "legal_case", + "legislation": "legislation", + "manuscript": "manuscript", + "map": "map", + "motion_picture": "motion_picture", + "musical_score": "musical_score", + "pamphlet": "pamphlet", + "paper-conference": "paper-conference", + "patent": "patent", + "personal_communication": "personal_communication", + "post": "post", + "post-weblog": "post-weblog", + "report": "report", + "review-book": "review-book", + "review": "review", + "song": "song", + "speech": "speech", + "thesis": "thesis", + "treaty": "treaty", + "webpage": "webpage", }, # https://docs.citationstyles.org/en/master/specification.html#appendix-iii-types - 'bibtex': { - 'phdthesis': 'thesis', - 'inbook': 'chapter', - 'misc': None, - 'article': 'article-journal', - 'book': 'book', + "bibtex": { + "phdthesis": "thesis", + "inbook": "chapter", + "misc": None, + "article": "article-journal", + "book": "book", }, - 'resourceTypeGeneral': { - 'Image': 'graphic', - 'Dataset': 'dataset', - 'PhysicalObject': None, - 'Collection': None, - 'Text': None, # "Greyliterature, labnotes, accompanyingmaterials" - 'Sound': None, - 'InteractiveResource': None, - 'Event': None, - 'Software': 'software', - 'Other': None, - 'Workflow': None, - 'Audiovisual': None, - } # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32 + "resourceTypeGeneral": { + "Image": "graphic", + "Dataset": "dataset", + "PhysicalObject": None, + "Collection": None, + "Text": None, # "Greyliterature, labnotes, accompanyingmaterials" + "Sound": None, + "InteractiveResource": None, + "Event": None, + "Software": "software", + "Other": None, + "Workflow": None, + "Audiovisual": None, + }, # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32 } # DATACITE_UNKNOWN_MARKERS via https://support.datacite.org/docs/schema-values-unknown-information-v43. DATACITE_UNKNOWN_MARKERS = ( - '(:unac)', # temporarily inaccessible - '(:unal)', # unallowed, suppressed intentionally - '(:unap)', # not applicable, makes no sense - '(:unas)', # value unassigned (e.g., Untitled) - '(:unav)', # value unavailable, possibly unknown - '(:unkn)', # known to be unknown (e.g., Anonymous, Inconnue) - '(:none)', # never had a value, never will - '(:null)', # explicitly and meaningfully empty - '(:tba)', # to be assigned or announced later - '(:etal)', # too numerous to list (et alia) + "(:unac)", # temporarily inaccessible + "(:unal)", # unallowed, suppressed intentionally + "(:unap)", # not applicable, makes no sense + "(:unas)", # value unassigned (e.g., Untitled) + "(:unav)", # value unavailable, possibly unknown + "(:unkn)", # known to be unknown (e.g., Anonymous, Inconnue) + "(:none)", # never had a value, never will + "(:null)", # explicitly and meaningfully empty + "(:tba)", # to be assigned or announced later + "(:etal)", # too numerous to list (et alia) ) # UNKNOWN_MARKERS joins official datacite markers with a generic tokens marking # unknown values. -UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union(set(( - 'NA', - 'NN', - 'n.a.', - '[s.n.]', - 'Unknown', -))) +UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union( + set( + ( + "NA", + "NN", + "n.a.", + "[s.n.]", + "Unknown", + ) + ) +) # UNKNOWN_MARKERS_LOWER are lowercase version of UNKNOWN blocklist. UNKNOWN_MARKERS_LOWER = set((v.lower() for v in UNKNOWN_MARKERS)) @@ -157,8 +161,20 @@ UNKNOWN_MARKERS_LOWER = set((v.lower() for v in UNKNOWN_MARKERS)) # Any "min" number of "tokens" will signal "spam", https://fatcat.wiki/release/rzcpjwukobd4pj36ipla22cnoi DATACITE_TITLE_SPAM_WORDGROUPS = [ { - "tokens": ('full', 'movies', 'movie', 'watch', 'streaming', 'online', - 'free', 'hd', 'download', 'english', 'subtitle', 'bluray'), + "tokens": ( + "full", + "movies", + "movie", + "watch", + "streaming", + "online", + "free", + "hd", + "download", + "english", + "subtitle", + "bluray", + ), "min": 4, } ] @@ -205,28 +221,25 @@ class DataciteImporter(EntityImporter): """ Importer for datacite records. """ - def __init__(self, - api, - issn_map_file, - debug=False, - insert_log_file=None, - **kwargs): + + def __init__(self, api, issn_map_file, debug=False, insert_log_file=None, **kwargs): eg_desc = kwargs.get( - 'editgroup_description', - "Automated import of Datacite DOI metadata, harvested from REST API" + "editgroup_description", + "Automated import of Datacite DOI metadata, harvested from REST API", ) - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', - 'fatcat_tools.DataciteImporter') - super().__init__(api, - issn_map_file=issn_map_file, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) - - self.create_containers = kwargs.get('create_containers', True) - extid_map_file = kwargs.get('extid_map_file') + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.DataciteImporter") + super().__init__( + api, + issn_map_file=issn_map_file, + editgroup_description=eg_desc, + editgroup_extra=eg_extra, + **kwargs + ) + + self.create_containers = kwargs.get("create_containers", True) + extid_map_file = kwargs.get("extid_map_file") self.extid_map_db = None if extid_map_file: db_uri = "file:{}?mode=ro".format(extid_map_file) @@ -240,30 +253,34 @@ class DataciteImporter(EntityImporter): self.insert_log_file = insert_log_file self.this_year = datetime.datetime.now().year - print('datacite with debug={}'.format(self.debug), file=sys.stderr) + print("datacite with debug={}".format(self.debug), file=sys.stderr) def lookup_ext_ids(self, doi): """ Return dictionary of identifiers referring to the same things as the given DOI. """ if self.extid_map_db is None: - return dict(core_id=None, - pmid=None, - pmcid=None, - wikidata_qid=None, - arxiv_id=None, - jstor_id=None) + return dict( + core_id=None, + pmid=None, + pmcid=None, + wikidata_qid=None, + arxiv_id=None, + jstor_id=None, + ) row = self.extid_map_db.execute( - "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", - [doi.lower()]).fetchone() + "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()] + ).fetchone() if row is None: - return dict(core_id=None, - pmid=None, - pmcid=None, - wikidata_qid=None, - arxiv_id=None, - jstor_id=None) - row = [str(cell or '') or None for cell in row] + return dict( + core_id=None, + pmid=None, + pmcid=None, + wikidata_qid=None, + arxiv_id=None, + jstor_id=None, + ) + row = [str(cell or "") or None for cell in row] return dict( core_id=row[0], pmid=row[1], @@ -280,22 +297,22 @@ class DataciteImporter(EntityImporter): """ if not obj or not isinstance(obj, dict): return None - if 'attributes' not in obj: + if "attributes" not in obj: return None - attributes = obj['attributes'] - doi = clean_doi(attributes.get('doi', '').lower()) + attributes = obj["attributes"] + doi = clean_doi(attributes.get("doi", "").lower()) if not doi: - print('skipping record without a DOI', file=sys.stderr) + print("skipping record without a DOI", file=sys.stderr) return if not str.isascii(doi): - print('[{}] skipping non-ascii doi for now'.format(doi)) + print("[{}] skipping non-ascii doi for now".format(doi)) return None - creators = attributes.get('creators', []) or [] - contributors = attributes.get('contributors', []) or [] # Much fewer than creators. + creators = attributes.get("creators", []) or [] + contributors = attributes.get("contributors", []) or [] # Much fewer than creators. contribs = self.parse_datacite_creators(creators, doi=doi) @@ -323,7 +340,9 @@ class DataciteImporter(EntityImporter): # Related: https://guide.fatcat.wiki/entity_release.html -- role # (string, of a set): the type of contribution, from a controlled # vocabulary. TODO: vocabulary needs review. - contribs_extra_contributors = self.parse_datacite_creators(contributors, set_index=False, doi=doi) + contribs_extra_contributors = self.parse_datacite_creators( + contributors, set_index=False, doi=doi + ) # Unfortunately, creators and contributors might overlap, refs GH59. for cc in contribs_extra_contributors: @@ -333,17 +352,16 @@ class DataciteImporter(EntityImporter): # Title, may come with "attributes.titles[].titleType", like # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle" - titles = attributes.get('titles', []) or [] - title, original_language_title, subtitle = parse_datacite_titles( - titles) + titles = attributes.get("titles", []) or [] + title, original_language_title, subtitle = parse_datacite_titles(titles) if title is None: - print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr) + print("[{}] skipping record w/o title: {}".format(doi, obj), file=sys.stderr) return False title = clean(title) if not title: - print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr) + print("[{}] skipping record w/o title: {}".format(doi, obj), file=sys.stderr) return False # check for blocklisted "spam", e.g. "FULL MOVIE" @@ -367,10 +385,13 @@ class DataciteImporter(EntityImporter): # "Collected", "Copyrighted", "Created", "Issued", "Submitted", # "Updated", "Valid". release_date, release_month, release_year = parse_datacite_dates( - attributes.get('dates', [])) + attributes.get("dates", []) + ) # block bogus far-future years/dates - if release_year is not None and (release_year > (self.this_year + 5) or release_year < 1000): + if release_year is not None and ( + release_year > (self.this_year + 5) or release_year < 1000 + ): release_date = None release_month = None release_year = None @@ -378,26 +399,30 @@ class DataciteImporter(EntityImporter): # Some records do not use the "dates" field (e.g. micropub), but: # "attributes.published" or "attributes.publicationYear" if not any((release_date, release_month, release_year)): - release_date, release_month, release_year = parse_single_date(attributes.get('publicationYear')) + release_date, release_month, release_year = parse_single_date( + attributes.get("publicationYear") + ) if not any((release_date, release_month, release_year)): - release_date, release_month, release_year = parse_single_date(attributes.get('published')) + release_date, release_month, release_year = parse_single_date( + attributes.get("published") + ) if not any((release_date, release_month, release_year)): - print('[{}] record w/o date: {}'.format(doi, obj), file=sys.stderr) + print("[{}] record w/o date: {}".format(doi, obj), file=sys.stderr) # Start with clear stages, e.g. published. TODO(martin): we could # probably infer a bit more from the relations, e.g. # "IsPreviousVersionOf" or "IsNewVersionOf". - release_stage = 'published' + release_stage = "published" # TODO(martin): If 'state' is not 'findable' or 'isActive' is not true, # we might want something else than 'published'. See also: # https://support.datacite.org/docs/doi-states. # Publisher. A few NA values. A few bogus values. - publisher = attributes.get('publisher') + publisher = attributes.get("publisher") - if publisher in UNKNOWN_MARKERS | set(('Unpublished', 'Unknown')): + if publisher in UNKNOWN_MARKERS | set(("Unpublished", "Unknown")): publisher = None release_stage = None if publisher is not None and len(publisher) > 80: @@ -416,24 +441,26 @@ class DataciteImporter(EntityImporter): container_id = None container_name = None - container = attributes.get('container', {}) or {} - if container.get('type') in CONTAINER_TYPE_MAP.keys(): - container_type = CONTAINER_TYPE_MAP.get(container['type']) - if container.get('identifier') and container.get( - 'identifierType') == 'ISSN': - issn = container.get('identifier') + container = attributes.get("container", {}) or {} + if container.get("type") in CONTAINER_TYPE_MAP.keys(): + container_type = CONTAINER_TYPE_MAP.get(container["type"]) + if container.get("identifier") and container.get("identifierType") == "ISSN": + issn = container.get("identifier") if len(issn) == 8: issn = issn[:4] + "-" + issn[4:] issnl = self.issn2issnl(issn) if issnl is not None: container_id = self.lookup_issnl(issnl) - if container_id is None and container.get('title'): - container_name = container.get('title') + if container_id is None and container.get("title"): + container_name = container.get("title") if isinstance(container_name, list): if len(container_name) > 0: - print('[{}] too many container titles: {}'.format(doi, - len(container_name))) + print( + "[{}] too many container titles: {}".format( + doi, len(container_name) + ) + ) container_name = container_name[0] assert isinstance(container_name, str) ce = fatcat_openapi_client.ContainerEntity( @@ -447,21 +474,24 @@ class DataciteImporter(EntityImporter): else: # TODO(martin): factor this out into a testable function. # TODO(martin): "container_name": "№1(1) (2018)" / 10.26087/inasan.2018.1.1.013 - container_name = container.get('title') + container_name = container.get("title") if isinstance(container_name, list): if len(container_name) > 0: - print('[{}] too many container titles: {}'.format(doi, - len(container_name))) + print( + "[{}] too many container titles: {}".format( + doi, len(container_name) + ) + ) container_name = container_name[0] # Exception: https://www.micropublication.org/, see: !MR24. if container_id is None and container_name is None: - if publisher and publisher.lower().startswith('micropublication'): + if publisher and publisher.lower().startswith("micropublication"): container_name = publisher # Volume and issue. - volume = container.get('volume') - issue = container.get('issue') + volume = container.get("volume") + issue = container.get("issue") if volume: volume = clean(volume) @@ -472,13 +502,13 @@ class DataciteImporter(EntityImporter): # Pages. pages = None - first_page = container.get('firstPage') - last_page = container.get('lastPage') + first_page = container.get("firstPage") + last_page = container.get("lastPage") if first_page and last_page: try: _ = int(first_page) < int(last_page) - pages = '{}-{}'.format(first_page, last_page) + pages = "{}-{}".format(first_page, last_page) except ValueError as err: # noqa: F841 # TODO(martin): This is more debug than info. # print('[{}] {}'.format(doi, err), file=sys.stderr) @@ -491,8 +521,8 @@ class DataciteImporter(EntityImporter): license_slug = None license_extra = [] - for lic in attributes.get('rightsList', []): - slug = lookup_license_slug(lic.get('rightsUri')) + for lic in attributes.get("rightsList", []): + slug = lookup_license_slug(lic.get("rightsUri")) if slug: license_slug = slug license_extra.append(lic) @@ -506,7 +536,7 @@ class DataciteImporter(EntityImporter): # library solves it for you." -- TODO(martin): We need more of these. language = None - value = attributes.get('language', '') or '' + value = attributes.get("language", "") or "" try: language = pycountry.languages.lookup(value).alpha_2 except (LookupError, AttributeError) as err: # noqa: F841 @@ -520,22 +550,22 @@ class DataciteImporter(EntityImporter): # "Other" fields might contain references or related articles (with # DOI). TODO(martin): maybe try to parse out some of those refs. abstracts = [] - descs = attributes.get('descriptions', []) or [] + descs = attributes.get("descriptions", []) or [] for desc in descs: - if not desc.get('descriptionType') == 'Abstract': + if not desc.get("descriptionType") == "Abstract": continue # Description maybe a string, int or list. - text = desc.get('description', '') + text = desc.get("description", "") if not text: continue if isinstance(text, int): - text = '{}'.format(text) + text = "{}".format(text) if isinstance(text, list): try: text = "\n".join(text) except TypeError: - continue # Bail out, if it is not a list of strings. + continue # Bail out, if it is not a list of strings. # Limit length. if len(text) < 10: @@ -548,7 +578,10 @@ class DataciteImporter(EntityImporter): try: lang = langdetect.detect(text) except (langdetect.lang_detect_exception.LangDetectException, TypeError) as err: - print('[{}] language detection failed with {} on {}'.format(doi, err, text), file=sys.stderr) + print( + "[{}] language detection failed with {} on {}".format(doi, err, text), + file=sys.stderr, + ) abstract_text = clean(text) if not abstract_text: continue @@ -557,7 +590,8 @@ class DataciteImporter(EntityImporter): mimetype="text/plain", content=abstract_text, lang=lang, - )) + ) + ) # References and relations. Datacite include many relation types in # "attributes.relatedIdentifiers[].relationType", e.g. @@ -570,67 +604,76 @@ class DataciteImporter(EntityImporter): # For the moment, we only care about References. refs, ref_index = [], 0 - relIds = attributes.get('relatedIdentifiers', []) or [] + relIds = attributes.get("relatedIdentifiers", []) or [] for rel in relIds: - if not rel.get('relationType', '') in ('References', 'Cites'): + if not rel.get("relationType", "") in ("References", "Cites"): continue ref_extra = dict() - if rel.get('relatedIdentifierType', '') == 'DOI': - ref_extra['doi'] = rel.get('relatedIdentifier') + if rel.get("relatedIdentifierType", "") == "DOI": + ref_extra["doi"] = rel.get("relatedIdentifier") if not ref_extra: ref_extra = None refs.append( fatcat_openapi_client.ReleaseRef( index=ref_index, extra=ref_extra, - )) + ) + ) ref_index += 1 # More specific release_type via 'Reviews' relationsship. for rel in relIds: - if rel.get('relatedIdentifierType', '') != 'Reviews': + if rel.get("relatedIdentifierType", "") != "Reviews": continue - release_type = 'review' + release_type = "review" # Extra information. extra_datacite = dict() if license_extra: - extra_datacite['license'] = license_extra - if attributes.get('subjects'): - extra_datacite['subjects'] = attributes['subjects'] + extra_datacite["license"] = license_extra + if attributes.get("subjects"): + extra_datacite["subjects"] = attributes["subjects"] # Include version information. - metadata_version = attributes.get('metadataVersion') or '' + metadata_version = attributes.get("metadataVersion") or "" if metadata_version: - extra_datacite['metadataVersion'] = metadata_version + extra_datacite["metadataVersion"] = metadata_version # Include resource types. - types = attributes.get('types', {}) or {} - resource_type = types.get('resourceType', '') or '' - resource_type_general = types.get('resourceTypeGeneral', '') or '' + types = attributes.get("types", {}) or {} + resource_type = types.get("resourceType", "") or "" + resource_type_general = types.get("resourceTypeGeneral", "") or "" if resource_type and resource_type.lower() not in UNKNOWN_MARKERS_LOWER: - extra_datacite['resourceType'] = resource_type + extra_datacite["resourceType"] = resource_type if resource_type_general and resource_type_general.lower() not in UNKNOWN_MARKERS_LOWER: - extra_datacite['resourceTypeGeneral'] = resource_type_general + extra_datacite["resourceTypeGeneral"] = resource_type_general # Include certain relations from relatedIdentifiers. Keeping the # original structure of data here, which is a list of dicts, with # relation type, identifier and identifier type (mostly). relations = [] for rel in relIds: - if rel.get('relationType') in ('IsPartOf', 'Reviews', 'Continues', - 'IsVariantFormOf', 'IsSupplementTo', - 'HasVersion', 'IsMetadataFor', - 'IsNewVersionOf', 'IsIdenticalTo', - 'IsVersionOf', 'IsDerivedFrom', - 'IsSourceOf'): + if rel.get("relationType") in ( + "IsPartOf", + "Reviews", + "Continues", + "IsVariantFormOf", + "IsSupplementTo", + "HasVersion", + "IsMetadataFor", + "IsNewVersionOf", + "IsIdenticalTo", + "IsVersionOf", + "IsDerivedFrom", + "IsSourceOf", + ): relations.append(rel) if relations: - extra_datacite['relations'] = relations + extra_datacite["relations"] = relations extra = dict() @@ -640,18 +683,18 @@ class DataciteImporter(EntityImporter): # Edition", "20191024", "v2.0.0", "v0.9.3", "10149", "2.0", null, # "v0.1.1", "3.0", "1.0", "3", "v1.12.2", "20191018", "v0.3.1", "v1.0", # "10161", "10010691", "10780", # "Presentación" - version = attributes.get('version') or None + version = attributes.get("version") or None # top-level extra keys if not container_id and container_name: - extra['container_name'] = container_name + extra["container_name"] = container_name # Always include datacite key, even if value is empty (dict). - extra['datacite'] = extra_datacite + extra["datacite"] = extra_datacite # Preparation for a schema update. if release_month: - extra['release_month'] = release_month + extra["release_month"] = release_month extids = self.lookup_ext_ids(doi=doi) @@ -669,12 +712,12 @@ class DataciteImporter(EntityImporter): publisher=publisher, ext_ids=fatcat_openapi_client.ReleaseExtIds( doi=doi, - pmid=extids['pmid'], - pmcid=extids['pmcid'], - wikidata_qid=extids['wikidata_qid'], - core=extids['core_id'], - arxiv=extids['arxiv_id'], - jstor=extids['jstor_id'], + pmid=extids["pmid"], + pmcid=extids["pmcid"], + wikidata_qid=extids["wikidata_qid"], + core=extids["core_id"], + arxiv=extids["arxiv_id"], + jstor=extids["jstor_id"], ), contribs=contribs, volume=volume, @@ -702,19 +745,19 @@ class DataciteImporter(EntityImporter): """ release_type = None - if not attributes.get('types'): + if not attributes.get("types"): return None - types = attributes['types'] + types = attributes["types"] - for typeType in ('citeproc', 'ris', 'schemaOrg', 'bibtex', 'resourceTypeGeneral'): + for typeType in ("citeproc", "ris", "schemaOrg", "bibtex", "resourceTypeGeneral"): value = types.get(typeType) release_type = DATACITE_TYPE_MAP.get(typeType, {}).get(value) if release_type is not None: break # special case: figshare "collections" which group other entities - if doi.startswith('10.6084/') or doi.startswith('10.25384'): - if types.get('resourceType') == "Collection": + if doi.startswith("10.6084/") or doi.startswith("10.25384"): + if types.get("resourceType") == "Collection": release_type = "stub" if release_type is None: @@ -736,35 +779,41 @@ class DataciteImporter(EntityImporter): # publishes highly interesting datasets, but titles are mostly the same # ("GBIF Occurrence Download" or "Occurrence Download"); set # release_type to "stub" (CSL/FC). - if re.title == 'GBIF Occurrence Download' and re.ext_ids.doi.startswith('10.15468/dl.'): - re.release_type = 'stub' + if re.title == "GBIF Occurrence Download" and re.ext_ids.doi.startswith("10.15468/dl."): + re.release_type = "stub" # release_type exception: lots of "Experimental Crystal Structure Determination" # publisher: "Cambridge Crystallographic Data Centre" - if re.ext_ids.doi.startswith('10.5517/'): - re.release_type = 'entry' + if re.ext_ids.doi.startswith("10.5517/"): + re.release_type = "entry" # Supplement files, e.g. "Additional file 1: ASE constructs in questionnaire." - if re.title.lower().startswith('additional file') and re.release_type in ('article', 'article-journal'): - re.release_type = 'component' + if re.title.lower().startswith("additional file") and re.release_type in ( + "article", + "article-journal", + ): + re.release_type = "component" # figshare - if re.ext_ids.doi.startswith('10.6084/') or re.ext_ids.doi.startswith('10.25384'): + if re.ext_ids.doi.startswith("10.6084/") or re.ext_ids.doi.startswith("10.25384"): # set version if DOI ends with versioned suffix - doi_suffix = re.ext_ids.doi.split('.')[-1] - if doi_suffix and doi_suffix.startswith('v') and doi_suffix[1:].isdigit(): + doi_suffix = re.ext_ids.doi.split(".")[-1] + if doi_suffix and doi_suffix.startswith("v") and doi_suffix[1:].isdigit(): re.version = doi_suffix # "Figure 123 from " -> component # "Table S1. ;Figure S1;Figure S2. ;Figure S3. ;Figure S4. from Use of organic exudates from two polar diatoms by bacterial isolates from the Arctic ocean" - if " from " in re.title and re.release_type not in ('stub', 'graphic'): + if " from " in re.title and re.release_type not in ("stub", "graphic"): if re.title.startswith("Figure "): re.release_type = "component" elif re.title.startswith("Table "): re.release_type = "component" # figshare.com - if re.ext_ids.doi.startswith('10.6084/m9.figshare.') and re.extra.get('container_name') is None: - re.extra['container_name'] = "figshare.com" + if ( + re.ext_ids.doi.startswith("10.6084/m9.figshare.") + and re.extra.get("container_name") is None + ): + re.extra["container_name"] = "figshare.com" return re @@ -788,26 +837,28 @@ class DataciteImporter(EntityImporter): # eventually we'll want to support "updates", but for now just skip if # entity already exists if existing: - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False return True def insert_batch(self, batch): - print('inserting batch ({})'.format(len(batch)), file=sys.stderr) + print("inserting batch ({})".format(len(batch)), file=sys.stderr) if self.insert_log_file: - with open(self.insert_log_file, 'a') as f: + with open(self.insert_log_file, "a") as f: for doc in batch: json.dump(entity_to_dict(doc, api_client=None), f) - f.write('\n') + f.write("\n") self.api.create_release_auto_batch( fatcat_openapi_client.ReleaseAutoBatch( editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) - def parse_datacite_creators(self, creators, role='author', set_index=True, doi=None): + def parse_datacite_creators(self, creators, role="author", set_index=True, doi=None): """ Parses a list of creators into a list of ReleaseContrib objects. Set set_index to False, if the index contrib field should be left blank. @@ -820,48 +871,53 @@ class DataciteImporter(EntityImporter): contribs = [] # Names, that should be ignored right away. - name_blocklist = set(('Occdownload Gbif.Org',)) + name_blocklist = set(("Occdownload Gbif.Org",)) i = 0 for c in creators: if not set_index: i = None - nameType = c.get('nameType', '') or '' - if nameType in ('', 'Personal'): + nameType = c.get("nameType", "") or "" + if nameType in ("", "Personal"): creator_id = None - for nid in c.get('nameIdentifiers', []) or []: + for nid in c.get("nameIdentifiers", []) or []: if not isinstance(nid, dict): # see: fatcat-workers/issues/44035/ - print('unexpected nameIdentifiers, expected list of dicts, got: {}'.format(nid), file=sys.stderr) + print( + "unexpected nameIdentifiers, expected list of dicts, got: {}".format( + nid + ), + file=sys.stderr, + ) continue - name_scheme = nid.get('nameIdentifierScheme', '') or '' + name_scheme = nid.get("nameIdentifierScheme", "") or "" if not name_scheme.lower() == "orcid": continue - orcid = nid.get('nameIdentifier') or '' - orcid = orcid.replace('https://orcid.org/', '') + orcid = nid.get("nameIdentifier") or "" + orcid = orcid.replace("https://orcid.org/", "") if not orcid: continue creator_id = self.lookup_orcid(orcid) # TODO(martin): If creator_id is None, should we create creators? # If there are multiple affiliation strings, use the first one. - affiliations = c.get('affiliation', []) or [] + affiliations = c.get("affiliation", []) or [] raw_affiliation = None if len(affiliations) == 0: raw_affiliation = None else: raw_affiliation = clean(affiliations[0]) - name = c.get('name') - given_name = c.get('givenName') - surname = c.get('familyName') + name = c.get("name") + given_name = c.get("givenName") + surname = c.get("familyName") if name: name = clean(name) if not any((name, given_name, surname)): continue if not name: - name = "{} {}".format(given_name or '', surname or '').strip() + name = "{} {}".format(given_name or "", surname or "").strip() if name in name_blocklist: continue if name.lower() in UNKNOWN_MARKERS_LOWER: @@ -881,7 +937,7 @@ class DataciteImporter(EntityImporter): if not name: continue - if raw_affiliation == '': + if raw_affiliation == "": continue extra = None @@ -891,39 +947,38 @@ class DataciteImporter(EntityImporter): # "RelatedPerson", "ProjectLeader", "Editor", "Other", # "ProjectMember", "Funder", "RightsHolder", "DataCollector", # "Supervisor", "Producer", "HostingInstitution", "ResearchGroup" - contributorType = c.get('contributorType', '') or '' + contributorType = c.get("contributorType", "") or "" if contributorType: - extra = {'type': contributorType} + extra = {"type": contributorType} rc = fatcat_openapi_client.ReleaseContrib( - creator_id=creator_id, - index=i, - raw_name=name, - given_name=given_name, - surname=surname, - role=role, - raw_affiliation=raw_affiliation, - extra=extra, - ) + creator_id=creator_id, + index=i, + raw_name=name, + given_name=given_name, + surname=surname, + role=role, + raw_affiliation=raw_affiliation, + extra=extra, + ) # Filter out duplicates early. if not contributor_list_contains_contributor(contribs, rc): contribs.append(rc) if i is not None: i += 1 - elif nameType == 'Organizational': - name = c.get('name', '') or '' + elif nameType == "Organizational": + name = c.get("name", "") or "" if name in UNKNOWN_MARKERS: continue if len(name) < 3: continue - extra = {'organization': name} - contribs.append(fatcat_openapi_client.ReleaseContrib( - index=i, extra=extra)) + extra = {"organization": name} + contribs.append(fatcat_openapi_client.ReleaseContrib(index=i, extra=extra)) if i is not None: i += 1 else: - print('[{}] unknown name type: {}'.format(doi, nameType), file=sys.stderr) + print("[{}] unknown name type: {}".format(doi, nameType), file=sys.stderr) return contribs @@ -935,8 +990,8 @@ def contributor_list_contains_contributor(contributor_list, contributor): for cc in contributor_list: if cc.raw_name != contributor.raw_name: continue - cc_role = cc.role or 'author' - contributor_role = contributor.role or 'author' + cc_role = cc.role or "author" + contributor_role = contributor.role or "author" if cc_role != contributor_role: continue return True @@ -952,91 +1007,97 @@ def lookup_license_slug(raw): if not raw: return None - if 'creativecommons.org/publicdomain/zero' in raw: - return 'CC-0' - if raw.lower().endswith('/cc0'): - return 'CC-0' + if "creativecommons.org/publicdomain/zero" in raw: + return "CC-0" + if raw.lower().endswith("/cc0"): + return "CC-0" - if 'creativecommons' in raw: + if "creativecommons" in raw: # https://creativecommons.org/publicdomain/mark/1.0/deed.de - if 'creativecommons.org/publicdomain' in raw: - return 'CC-PUBLICDOMAIN' - if 'creativecommons.org/share-your-work/public-domain/cc0' in raw: - return 'CC-0' + if "creativecommons.org/publicdomain" in raw: + return "CC-PUBLICDOMAIN" + if "creativecommons.org/share-your-work/public-domain/cc0" in raw: + return "CC-0" # https://creativecommons.org/licenses/by/4.0/deed.es_ES raw = raw.lower() - match = re.search(r'creativecommons.org/licen[sc]es/(?P<name>[a-z-]+)', raw, re.IGNORECASE) + match = re.search( + r"creativecommons.org/licen[sc]es/(?P<name>[a-z-]+)", raw, re.IGNORECASE + ) if not match: - print('missed potential license: {}'.format(raw), file=sys.stderr) + print("missed potential license: {}".format(raw), file=sys.stderr) return None - name = match.groupdict().get('name') + name = match.groupdict().get("name") if not name: return None - if not name.startswith('cc'): - name = 'cc-{}'.format(name) + if not name.startswith("cc"): + name = "cc-{}".format(name) return name.upper() - if 'opensource.org' in raw: + if "opensource.org" in raw: # https://opensource.org/licenses/alphabetical, e.g. opensource.org/licenses/EUPL-1.2 - match = re.search(r'opensource.org/licenses/(?P<name>[^/]+)', raw, re.IGNORECASE) + match = re.search(r"opensource.org/licenses/(?P<name>[^/]+)", raw, re.IGNORECASE) if not match: - print('missed potential license: {}'.format(raw), file=sys.stderr) + print("missed potential license: {}".format(raw), file=sys.stderr) return None - name = match.groupdict().get('name') + name = match.groupdict().get("name") if not name: return None if len(name) > 11: return None return name.upper() - if 'gnu.org' in raw: + if "gnu.org" in raw: # http://www.gnu.org/copyleft/gpl, https://www.gnu.org/licenses/old-licenses/lgpl-2.1.en.html - match = re.search(r'/(?P<name>fdl(-[0-9.]*[0-9]+)?|gpl(-[0-9.]*[0-9]+)?|lgpl(-[0-9.]*[0-9]+)|aglp(-[0-9.]*[0-9]+)?)', raw, re.IGNORECASE) + match = re.search( + r"/(?P<name>fdl(-[0-9.]*[0-9]+)?|gpl(-[0-9.]*[0-9]+)?|lgpl(-[0-9.]*[0-9]+)|aglp(-[0-9.]*[0-9]+)?)", + raw, + re.IGNORECASE, + ) if not match: - print('missed potential license: {}'.format(raw), file=sys.stderr) + print("missed potential license: {}".format(raw), file=sys.stderr) return None - name = match.groupdict().get('name') + name = match.groupdict().get("name") if not name: return None if len(name) > 8: return None return name.upper() - if 'spdx.org' in raw: - if 'spdx.org/licenses/CC0' in raw: - return 'CC-0' + if "spdx.org" in raw: + if "spdx.org/licenses/CC0" in raw: + return "CC-0" # https://spdx.org/licenses/CC-BY-NC-ND-4.0.html - match = re.search(r'spdx.org/licenses/(?P<name>[a-z0-9-]+)', raw, re.IGNORECASE) + match = re.search(r"spdx.org/licenses/(?P<name>[a-z0-9-]+)", raw, re.IGNORECASE) if not match: - print('missed potential license: {}'.format(raw), file=sys.stderr) + print("missed potential license: {}".format(raw), file=sys.stderr) return None - name = match.groupdict().get('name') + name = match.groupdict().get("name") if not name: return None if len(name) > 36: return None # cleanup version and extensions - name = re.sub('(-[0-9])?[.]?[0-9]?(.json|.html)?', '', name.lower()) + name = re.sub("(-[0-9])?[.]?[0-9]?(.json|.html)?", "", name.lower()) return name.upper() - if 'rightsstatements.org' in raw: + if "rightsstatements.org" in raw: # http://rightsstatements.org/vocab/InC/1.0/ - match = re.search(r'rightsstatements.org/(vocab|page)/(?P<name>[^/]*)', raw) + match = re.search(r"rightsstatements.org/(vocab|page)/(?P<name>[^/]*)", raw) if not match: - print('missed potential license: {}'.format(raw), file=sys.stderr) + print("missed potential license: {}".format(raw), file=sys.stderr) return None - name = match.groupdict().get('name') + name = match.groupdict().get("name") if not name: return None if len(name) > 9: return None - return 'RS-{}'.format(name.upper()) + return "RS-{}".format(name.upper()) # Fallback to mapped values. raw = raw.lower() - raw = raw.strip().replace('http://', '//').replace('https://', '//') - if not raw.endswith('/'): - raw = raw + '/' + raw = raw.strip().replace("http://", "//").replace("https://", "//") + if not raw.endswith("/"): + raw = raw + "/" return LICENSE_SLUG_MAP.get(raw) @@ -1046,23 +1107,21 @@ def find_original_language_title(item, min_length=4, max_questionmarks=3): Example input: {'title': 'Some title', 'original_language_title': 'Some title'} """ - if 'original_language_title' not in item: + if "original_language_title" not in item: return None - title = item.get('title') + title = item.get("title") if not title: return None - original_language_title = item.get('original_language_title') - if isinstance(original_language_title, - str) and title != original_language_title: + original_language_title = item.get("original_language_title") + if isinstance(original_language_title, str) and title != original_language_title: if len(original_language_title) < min_length: return None - if original_language_title.count('?') > max_questionmarks: + if original_language_title.count("?") > max_questionmarks: return None return original_language_title if isinstance(original_language_title, dict): - content = original_language_title.get('__content__', '') or '' - if content and content != title and not content.count( - '?') > max_questionmarks: + content = original_language_title.get("__content__", "") or "" + if content and content != title and not content.count("?") > max_questionmarks: return content return None @@ -1082,23 +1141,23 @@ def parse_datacite_titles(titles): return title, original_language_title, subtitle elif len(titles) == 1: original_language_title = find_original_language_title(titles[0]) - title = titles[0].get('title', '') or '' + title = titles[0].get("title", "") or "" title = title.strip() if not title: title = None return title, original_language_title, subtitle else: for entry in titles: - if not title and ('titleType' not in entry - or not entry.get('titleType')): - title = (entry.get('title') or '').strip() - if not subtitle and entry.get('titleType') == 'Subtitle': - subtitle = entry.get('title', '').strip() + if not title and ("titleType" not in entry or not entry.get("titleType")): + title = (entry.get("title") or "").strip() + if not subtitle and entry.get("titleType") == "Subtitle": + subtitle = entry.get("title", "").strip() if not original_language_title: original_language_title = find_original_language_title(entry) return title, original_language_title, subtitle + def parse_single_date(value): """ Given a single string containing a date in arbitrary format, try to return @@ -1113,11 +1172,11 @@ def parse_single_date(value): # Results in a dict with keys: date_obj, period, locale. parse_result = parser.get_date_data(value) # A datetime object, later we need a date, only. - result = parse_result['date_obj'] + result = parse_result["date_obj"] if result is not None: - if parse_result['period'] == 'year': + if parse_result["period"] == "year": return None, None, result.year - elif parse_result['period'] == 'month': + elif parse_result["period"] == "month": return None, result.month, result.year else: return result.date(), result.month, result.year @@ -1126,6 +1185,7 @@ def parse_single_date(value): return None, None, None + def parse_datacite_dates(dates): """ Given a list of date fields (under .dates), return tuple, (release_date, @@ -1137,37 +1197,37 @@ def parse_datacite_dates(dates): return release_date, release_month, release_year if not isinstance(dates, list): - raise ValueError('expected a list of date items') + raise ValueError("expected a list of date items") # Observed values: "Available", "Submitted", "Valid", "Issued", "Accepted", # "Collected", "Updated", "Copyrighted", "Created" # Ignored for now: "Collected", "Issued" date_type_prio = ( - 'Valid', - 'Available', - 'Accepted', - 'Submitted', - 'Copyrighted', - 'Created', - 'Updated', + "Valid", + "Available", + "Accepted", + "Submitted", + "Copyrighted", + "Created", + "Updated", ) # We need to note the granularity, since a string like "2019" would be # parsed into "2019-01-01", even though the month is unknown. Use 3 # granularity types: 'y', 'm', 'd'. - Pattern = collections.namedtuple('Pattern', 'layout granularity') + Pattern = collections.namedtuple("Pattern", "layout granularity") # Before using (expensive) dateparser, try a few common patterns. common_patterns = ( - Pattern('%Y-%m-%d', 'd'), - Pattern('%Y-%m', 'm'), - Pattern('%Y-%m-%dT%H:%M:%SZ', 'd'), - Pattern('%Y-%m-%dT%H:%M:%S', 'd'), - Pattern('%Y', 'y'), + Pattern("%Y-%m-%d", "d"), + Pattern("%Y-%m", "m"), + Pattern("%Y-%m-%dT%H:%M:%SZ", "d"), + Pattern("%Y-%m-%dT%H:%M:%S", "d"), + Pattern("%Y", "y"), ) def parse_item(item): - result, value, year_only = None, str(item.get('date', '')) or '', False + result, value, year_only = None, str(item.get("date", "")) or "", False release_date, release_month, release_year = None, None, None for layout, granularity in common_patterns: @@ -1176,22 +1236,22 @@ def parse_datacite_dates(dates): except ValueError: continue else: - if granularity == 'y': + if granularity == "y": year_only = True break if result is None: - print('fallback for {}'.format(value), file=sys.stderr) + print("fallback for {}".format(value), file=sys.stderr) release_date, release_month, release_year = parse_single_date(value) if result is None: # Unparsable date. return release_date, release_month, release_year - if granularity != 'y': + if granularity != "y": release_date = result.date() release_year = result.year - if granularity in ('m', 'd'): + if granularity in ("m", "d"): release_month = result.month return release_date, release_month, release_year @@ -1200,7 +1260,7 @@ def parse_datacite_dates(dates): for prio in date_type_prio: for item in dates: - if not item.get('dateType') == prio: + if not item.get("dateType") == prio: continue release_date, release_month, release_year = parse_item(item) @@ -1224,45 +1284,49 @@ def parse_datacite_dates(dates): return release_date, release_month, release_year + def index_form_to_display_name(s): """ Try to convert an index form name, like 'Razis, Panos A' into display_name, e.g. 'Panos A Razis'. """ - if ',' not in s: + if "," not in s: return s - skip_on_chars = ['(', ')', '*'] + skip_on_chars = ["(", ")", "*"] for char in skip_on_chars: if char in s: return s - if s.count(',') > 1: + if s.count(",") > 1: # "Dr. Hina, Dr. Muhammad Usman Shahid, Dr. Muhammad Zeeshan Khan" return s # Not names, but sprinkled in fields where authors live. - stopwords = [s.lower() for s in ( - 'Archive', - 'Collection', - 'Coordinator', - 'Department', - 'Germany', - 'International', - 'National', - 'Netherlands', - 'Office', - 'Organisation', - 'Organization', - 'Service', - 'Services', - 'United States', - 'University', - 'Verein', - 'Volkshochschule', - )] + stopwords = [ + s.lower() + for s in ( + "Archive", + "Collection", + "Coordinator", + "Department", + "Germany", + "International", + "National", + "Netherlands", + "Office", + "Organisation", + "Organization", + "Service", + "Services", + "United States", + "University", + "Verein", + "Volkshochschule", + ) + ] lower = s.lower() for stop in stopwords: if stop in lower: return s - a, b = s.split(',') - return '{} {}'.format(b.strip(), a.strip()) + a, b = s.split(",") + return "{} {}".format(b.strip(), a.strip()) diff --git a/python/fatcat_tools/importers/dblp_container.py b/python/fatcat_tools/importers/dblp_container.py index 3d280fb7..603a6271 100644 --- a/python/fatcat_tools/importers/dblp_container.py +++ b/python/fatcat_tools/importers/dblp_container.py @@ -1,4 +1,3 @@ - """ Importer for DBLP container-level (journal/conference/series) metadata, pre-scraped in to JSON from HTML pages. @@ -13,17 +12,17 @@ from fatcat_tools.normal import clean_str class DblpContainerImporter(EntityImporter): + def __init__( + self, api, issn_map_file, dblp_container_map_file, dblp_container_map_output, **kwargs + ): - def __init__(self, api, issn_map_file, dblp_container_map_file, dblp_container_map_output, **kwargs): - - eg_desc = kwargs.get('editgroup_description', - "Automated import of container-level metadata scraped from dblp HTML") - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.DblpContainerImporter') - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = kwargs.get( + "editgroup_description", + "Automated import of container-level metadata scraped from dblp HTML", + ) + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.DblpContainerImporter") + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) self.dblp_container_map_output = dblp_container_map_output self.read_dblp_container_map_file(dblp_container_map_file) @@ -40,7 +39,10 @@ class DblpContainerImporter(EntityImporter): assert len(container_id) == 26 self._dblp_container_map[prefix] = container_id print("\t".join([prefix, container_id]), file=self.dblp_container_map_output) - print("Got {} existing dblp container mappings.".format(len(self._dblp_container_map)), file=sys.stderr) + print( + "Got {} existing dblp container mappings.".format(len(self._dblp_container_map)), + file=sys.stderr, + ) def lookup_dblp_prefix(self, prefix): if not prefix: @@ -57,48 +59,48 @@ class DblpContainerImporter(EntityImporter): returns a ContainerEntity (or None if invalid or couldn't parse) """ - dblp_prefix = row.get('key') or row.get('dblp_prefix') + dblp_prefix = row.get("key") or row.get("dblp_prefix") assert dblp_prefix - assert row['title'] + assert row["title"] container_type = None - if dblp_prefix.startswith('conf/'): + if dblp_prefix.startswith("conf/"): container_type = "conference-series" - elif dblp_prefix.startswith('journals/'): + elif dblp_prefix.startswith("journals/"): container_type = "journal" - elif dblp_prefix.startswith('series/'): + elif dblp_prefix.startswith("series/"): container_type = "book-series" issnl = None - for issn in row.get('issns', []): + for issn in row.get("issns", []): issnl = self.issn2issnl(issn) if issnl: break extra = { - 'dblp': { - 'prefix': dblp_prefix, + "dblp": { + "prefix": dblp_prefix, }, } - if row.get('homepage_url'): - extra['urls'] = [row['homepage_url']] + if row.get("homepage_url"): + extra["urls"] = [row["homepage_url"]] - if row.get('acronym'): - extra['acronym'] = row['acronym'] + if row.get("acronym"): + extra["acronym"] = row["acronym"] ce = fatcat_openapi_client.ContainerEntity( - name=clean_str(row['title']), + name=clean_str(row["title"]), container_type=container_type, issnl=issnl, - wikidata_qid=row.get('wikidata_qid'), + wikidata_qid=row.get("wikidata_qid"), extra=extra, ) return ce def try_update(self, ce): - dblp_prefix = ce.extra['dblp']['prefix'] + dblp_prefix = ce.extra["dblp"]["prefix"] existing = None existing_container_id = self.lookup_dblp_prefix(dblp_prefix) if existing_container_id: @@ -123,8 +125,11 @@ class DblpContainerImporter(EntityImporter): return True if existing: - self.counts['exists'] += 1 - print("\t".join([ce.extra['dblp']['prefix'], existing.ident]), file=self.dblp_container_map_output) + self.counts["exists"] += 1 + print( + "\t".join([ce.extra["dblp"]["prefix"], existing.ident]), + file=self.dblp_container_map_output, + ) return False # shouldn't get here @@ -135,11 +140,17 @@ class DblpContainerImporter(EntityImporter): Because we want to print a prefix/container_id match for each row, we require a special batch insert method """ - eg = self.api.create_container_auto_batch(fatcat_openapi_client.ContainerAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + eg = self.api.create_container_auto_batch( + fatcat_openapi_client.ContainerAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) for c_edit in eg.edits.containers: c = self.api.get_container(c_edit.ident) - print("\t".join([c.extra['dblp']['prefix'], c.ident]), file=self.dblp_container_map_output) + print( + "\t".join([c.extra["dblp"]["prefix"], c.ident]), + file=self.dblp_container_map_output, + ) diff --git a/python/fatcat_tools/importers/dblp_release.py b/python/fatcat_tools/importers/dblp_release.py index 6d028f2f..5baa6cd6 100644 --- a/python/fatcat_tools/importers/dblp_release.py +++ b/python/fatcat_tools/importers/dblp_release.py @@ -1,4 +1,3 @@ - """ Importer for DBLP release-level (article/paper/etc) XML metadata. @@ -44,25 +43,16 @@ from fatcat_tools.transforms import entity_to_dict class DblpReleaseImporter(EntityImporter): - - def __init__(self, - api, - dblp_container_map_file=None, - **kwargs): + def __init__(self, api, dblp_container_map_file=None, **kwargs): eg_desc = kwargs.get( - 'editgroup_description', - "Automated import of dblp metadata via XML records" + "editgroup_description", "Automated import of dblp metadata via XML records" ) - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', - 'fatcat_tools.DblpReleaseImporter') + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.DblpReleaseImporter") # ensure default is to not do updates with this worker (override super() default) - kwargs['do_updates'] = kwargs.get("do_updates", False) - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + kwargs["do_updates"] = kwargs.get("do_updates", False) + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) self.dump_json_mode = kwargs.get("dump_json_mode", False) self.this_year = datetime.datetime.now().year @@ -76,13 +66,16 @@ class DblpReleaseImporter(EntityImporter): "phdthesis", "mastersthesis", "www", - #"data", # no instances in 2020-11 dump + # "data", # no instances in 2020-11 dump ] def read_dblp_container_map_file(self, dblp_container_map_file) -> None: self._dblp_container_map = dict() if not dblp_container_map_file: - print("Not loading a dblp prefix container map file; entities will fail to import", file=sys.stderr) + print( + "Not loading a dblp prefix container map file; entities will fail to import", + file=sys.stderr, + ) return print("Loading dblp prefix container map file...", file=sys.stderr) for line in dblp_container_map_file: @@ -92,7 +85,10 @@ class DblpReleaseImporter(EntityImporter): container_id = container_id.strip() assert len(container_id) == 26 self._dblp_container_map[prefix] = container_id - print("Got {} dblp container mappings.".format(len(self._dblp_container_map)), file=sys.stderr) + print( + "Got {} dblp container mappings.".format(len(self._dblp_container_map)), + file=sys.stderr, + ) def lookup_dblp_prefix(self, prefix): if not prefix: @@ -101,13 +97,13 @@ class DblpReleaseImporter(EntityImporter): def want(self, xml_elem): if xml_elem.name not in self.ELEMENT_TYPES: - self.counts['skip-type'] += 1 + self.counts["skip-type"] += 1 return False - if not xml_elem.get('key'): - self.counts['skip-no-key'] += 1 + if not xml_elem.get("key"): + self.counts["skip-no-key"] += 1 return False - if xml_elem['key'].startswith('homepage/'): - self.counts['skip-type-homepage'] += 1 + if xml_elem["key"].startswith("homepage/"): + self.counts["skip-type-homepage"] += 1 return False return True @@ -127,88 +123,88 @@ class DblpReleaseImporter(EntityImporter): - isbn """ - dblp_key = xml_elem.get('key') + dblp_key = xml_elem.get("key") if not dblp_key: - self.counts['skip-empty-key'] += 1 + self.counts["skip-empty-key"] += 1 return False - dblp_key_type = dblp_key.split('/')[0] + dblp_key_type = dblp_key.split("/")[0] # dblp_prefix may be used for container lookup dblp_prefix = None - if dblp_key_type in ('journals', 'conf'): - dblp_prefix = '/'.join(dblp_key.split('/')[:2]) - elif dblp_key_type in ('series', 'reference', 'tr', 'books'): - dblp_prefix = '/'.join(dblp_key.split('/')[:-1]) + if dblp_key_type in ("journals", "conf"): + dblp_prefix = "/".join(dblp_key.split("/")[:2]) + elif dblp_key_type in ("series", "reference", "tr", "books"): + dblp_prefix = "/".join(dblp_key.split("/")[:-1]) - publtype = xml_elem.get('publtype') or None + publtype = xml_elem.get("publtype") or None dblp_type = xml_elem.name if dblp_type not in self.ELEMENT_TYPES: - self.counts[f'skip-dblp-type:{dblp_type}'] += 1 + self.counts[f"skip-dblp-type:{dblp_type}"] += 1 - if dblp_key_type in ('homepages', 'persons', 'dblpnote'): - self.counts['skip-key-type'] += 1 + if dblp_key_type in ("homepages", "persons", "dblpnote"): + self.counts["skip-key-type"] += 1 return False - if dblp_key.startswith('journals/corr/'): - self.counts['skip-arxiv-corr'] += 1 + if dblp_key.startswith("journals/corr/"): + self.counts["skip-arxiv-corr"] += 1 return False title = clean_str(" ".join(xml_elem.title.stripped_strings), force_xml=True) if not title: - self.counts['skip-title'] += 1 + self.counts["skip-title"] += 1 return False - if title.endswith('.'): + if title.endswith("."): title = title[:-1] release_type = None - release_stage = 'published' + release_stage = "published" withdrawn_status = None # primary releae_type detection: type of XML element, then prefix of key for granularity - if dblp_type == 'article': - release_type = 'article' - if dblp_key_type == 'journals' and publtype != 'informal': - release_type = 'article-journal' - elif dblp_key_type == 'tr': - release_type = 'report' + if dblp_type == "article": + release_type = "article" + if dblp_key_type == "journals" and publtype != "informal": + release_type = "article-journal" + elif dblp_key_type == "tr": + release_type = "report" elif title.startswith("Review:"): - release_type = 'review' - elif dblp_type == 'inproceedings': - release_type = 'paper-conference' - elif dblp_type == 'book': - release_type = 'book' - elif dblp_type == 'incollection': + release_type = "review" + elif dblp_type == "inproceedings": + release_type = "paper-conference" + elif dblp_type == "book": + release_type = "book" + elif dblp_type == "incollection": # XXX: part vs. chapter? - release_type = 'chapter' - elif dblp_type == 'data': - release_type = 'dataset' - elif dblp_type in ('mastersthesis', 'phdthesis'): - release_type = 'thesis' + release_type = "chapter" + elif dblp_type == "data": + release_type = "dataset" + elif dblp_type in ("mastersthesis", "phdthesis"): + release_type = "thesis" # overrides/extensions of the above - if publtype == 'informal': + if publtype == "informal": # for conferences, seems to indicate peer-review status # for journals, seems to indicate things like book reviews; split out above pass - elif publtype == 'encyclopedia': - release_type = 'entry-encyclopedia' - elif publtype == 'edited': + elif publtype == "encyclopedia": + release_type = "entry-encyclopedia" + elif publtype == "edited": # XXX: article? - release_type = 'editorial' - elif publtype == 'data': - release_type = 'dataset' - elif publtype == 'data': - release_type = 'dataset' - elif publtype == 'software': - release_type = 'software' - elif publtype == 'widthdrawn': - withdrawn_status = 'widthdrawn' - elif publtype == 'survey': + release_type = "editorial" + elif publtype == "data": + release_type = "dataset" + elif publtype == "data": + release_type = "dataset" + elif publtype == "software": + release_type = "software" + elif publtype == "widthdrawn": + withdrawn_status = "widthdrawn" + elif publtype == "survey": # XXX: flag as a review/survey article? pass - #print((release_type, dblp_type, dblp_key_type, publtype), file=sys.stderr) + # print((release_type, dblp_type, dblp_key_type, publtype), file=sys.stderr) container_name = None booktitle = clean_str(xml_elem.booktitle and xml_elem.booktitle.text) @@ -236,7 +232,9 @@ class DblpReleaseImporter(EntityImporter): part_of_key = clean_str(xml_elem.crossref and xml_elem.crossref.text) # block bogus far-future years/dates - if release_year is not None and (release_year > (self.this_year + 5) or release_year < 1000): + if release_year is not None and ( + release_year > (self.this_year + 5) or release_year < 1000 + ): release_month = None release_year = None @@ -245,39 +243,39 @@ class DblpReleaseImporter(EntityImporter): if isbn: ext_ids.isbn13 = isbn if ext_ids.doi: - self.counts['has-doi'] += 1 + self.counts["has-doi"] += 1 # dblp-specific extra dblp_extra = dict(type=dblp_type) note = clean_str(xml_elem.note and xml_elem.note.text) - if note and 'base-search.net' not in note: - dblp_extra['note'] = note + if note and "base-search.net" not in note: + dblp_extra["note"] = note if part_of_key: - dblp_extra['part_of_key'] = part_of_key + dblp_extra["part_of_key"] = part_of_key # generic extra extra = dict() if not container_id and container_name: - extra['container_name'] = container_name + extra["container_name"] = container_name - if series and (dblp_key_type == 'series' or dblp_type == 'book'): - extra['series-title'] = series + if series and (dblp_key_type == "series" or dblp_type == "book"): + extra["series-title"] = series elif series: - dblp_extra['series'] = series + dblp_extra["series"] = series - if booktitle and dblp_key_type == 'series': - extra['container-title'] = booktitle - elif booktitle and dblp_key_type == 'conf': - extra['event'] = booktitle + if booktitle and dblp_key_type == "series": + extra["container-title"] = booktitle + elif booktitle and dblp_key_type == "conf": + extra["event"] = booktitle elif booktitle: - dblp_extra['booktitle'] = booktitle + dblp_extra["booktitle"] = booktitle if release_year and release_month: # TODO: release_month schema migration - extra['release_month'] = release_month + extra["release_month"] = release_month if dblp_extra: - extra['dblp'] = dblp_extra + extra["dblp"] = dblp_extra if not extra: extra = None @@ -289,7 +287,7 @@ class DblpReleaseImporter(EntityImporter): withdrawn_status=withdrawn_status, title=title, release_year=release_year, - #release_date, + # release_date, publisher=publisher, ext_ids=ext_ids, contribs=contribs, @@ -302,8 +300,8 @@ class DblpReleaseImporter(EntityImporter): if self.dump_json_mode: re_dict = entity_to_dict(re, api_client=self.api.api_client) - re_dict['_dblp_ee_urls'] = self.dblp_ext_urls(xml_elem) - re_dict['_dblp_prefix'] = dblp_prefix + re_dict["_dblp_ee_urls"] = self.dblp_ext_urls(xml_elem) + re_dict["_dblp_prefix"] = dblp_prefix print(json.dumps(re_dict, sort_keys=True)) return False @@ -341,11 +339,11 @@ class DblpReleaseImporter(EntityImporter): # then try other ext_id lookups if not existing: - for extid_type in ('doi', 'wikidata_qid', 'isbn13', 'arxiv'): + for extid_type in ("doi", "wikidata_qid", "isbn13", "arxiv"): extid_val = getattr(re.ext_ids, extid_type) if not extid_val: continue - #print(f" lookup release type: {extid_type} val: {extid_val}") + # print(f" lookup release type: {extid_type} val: {extid_val}") try: existing = self.api.lookup_release(**{extid_type: extid_val}) except fatcat_openapi_client.rest.ApiException as err: @@ -373,12 +371,14 @@ class DblpReleaseImporter(EntityImporter): return True if not self.do_updates or existing.ext_ids.dblp: - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False # logic for whether to do update or skip - if (existing.container_id and existing.release_type and existing.release_stage) or existing.ext_ids.arxiv: - self.counts['skip-update'] += 1 + if ( + existing.container_id and existing.release_type and existing.release_stage + ) or existing.ext_ids.arxiv: + self.counts["skip-update"] += 1 return False # fields to copy over for update @@ -390,20 +390,20 @@ class DblpReleaseImporter(EntityImporter): existing.release_stage = existing.release_stage or re.release_stage existing.withdrawn_status = existing.withdrawn_status or re.withdrawn_status existing.container_id = existing.container_id or re.container_id - existing.extra['dblp'] = re.extra['dblp'] + existing.extra["dblp"] = re.extra["dblp"] existing.volume = existing.volume or re.volume existing.issue = existing.issue or re.issue existing.pages = existing.pages or re.pages try: self.api.update_release(self.get_editgroup_id(), existing.ident, existing) - self.counts['update'] += 1 + self.counts["update"] += 1 except fatcat_openapi_client.rest.ApiException as err: # there is a code path where we try to update the same release # twice in a row; if that happens, just skip # NOTE: API behavior might change in the future? if "release_edit_editgroup_id_ident_id_key" in err.body: - self.counts['skip-update-conflict'] += 1 + self.counts["skip-update-conflict"] += 1 return False else: raise err @@ -411,11 +411,14 @@ class DblpReleaseImporter(EntityImporter): return False def insert_batch(self, batch): - self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_release_auto_batch( + fatcat_openapi_client.ReleaseAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) def dblp_contribs(self, authors: List[dict]) -> List[fatcat_openapi_client.ReleaseContrib]: """ @@ -428,14 +431,14 @@ class DblpReleaseImporter(EntityImporter): """ contribs = [] index = 0 - for elem in authors.find_all('author'): + for elem in authors.find_all("author"): contrib = self.dblp_contrib_single(elem) contrib.role = "author" contrib.index = index contribs.append(contrib) index += 1 - for elem in authors.find_all('editor'): + for elem in authors.find_all("editor"): contrib = self.dblp_contrib_single(elem) contrib.role = "editor" contribs.append(contrib) @@ -459,10 +462,10 @@ class DblpReleaseImporter(EntityImporter): # remove number in author name, if present if raw_name.split()[-1].isdigit(): - raw_name = ' '.join(raw_name.split()[:-1]) + raw_name = " ".join(raw_name.split()[:-1]) - if elem.get('orcid'): - orcid = clean_orcid(elem['orcid']) + if elem.get("orcid"): + orcid = clean_orcid(elem["orcid"]) if orcid: creator_id = self.lookup_orcid(orcid) if not creator_id: @@ -491,22 +494,26 @@ class DblpReleaseImporter(EntityImporter): wikidata_qid: Optional[str] = None arxiv_id: Optional[str] = None hdl: Optional[str] = None - for ee in xml_elem.find_all('ee'): + for ee in xml_elem.find_all("ee"): url = ee.text # convert DOI-like domains, which mostly have DOIs anyways - if '://doi.acm.org/' in url: - url = url.replace('://doi.acm.org/', '://doi.org/') - elif '://doi.ieeecomputersociety.org/' in url: - url = url.replace('://doi.ieeecomputersociety.org/', '://doi.org/') + if "://doi.acm.org/" in url: + url = url.replace("://doi.acm.org/", "://doi.org/") + elif "://doi.ieeecomputersociety.org/" in url: + url = url.replace("://doi.ieeecomputersociety.org/", "://doi.org/") - if 'doi.org/10.' in url and not doi: + if "doi.org/10." in url and not doi: doi = clean_doi(url) - elif 'wikidata.org/entity/Q' in url and not wikidata_qid: + elif "wikidata.org/entity/Q" in url and not wikidata_qid: wikidata_qid = clean_wikidata_qid(url) - elif '://arxiv.org/abs/' in url and not arxiv_id: - arxiv_id = url.replace('http://', '').replace('https://', '').replace('arxiv.org/abs/', '') + elif "://arxiv.org/abs/" in url and not arxiv_id: + arxiv_id = ( + url.replace("http://", "") + .replace("https://", "") + .replace("arxiv.org/abs/", "") + ) arxiv_id = clean_arxiv_id(arxiv_id) - elif '://hdl.handle.net' in url and not hdl: + elif "://hdl.handle.net" in url and not hdl: hdl = clean_hdl(url) return fatcat_openapi_client.ReleaseExtIds( @@ -525,14 +532,14 @@ class DblpReleaseImporter(EntityImporter): sandcrawler ingest requests. """ EXTID_PATTERNS = [ - '://doi.acm.org/', - '://doi.ieeecomputersociety.org/', - 'doi.org/10.', - 'wikidata.org/entity/Q', - '://arxiv.org/abs/', + "://doi.acm.org/", + "://doi.ieeecomputersociety.org/", + "doi.org/10.", + "wikidata.org/entity/Q", + "://arxiv.org/abs/", ] urls = [] - for ee in xml_elem.find_all('ee'): + for ee in xml_elem.find_all("ee"): url = ee.text skip = False for pattern in EXTID_PATTERNS: diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py index 1831c4cd..cd063337 100644 --- a/python/fatcat_tools/importers/doaj_article.py +++ b/python/fatcat_tools/importers/doaj_article.py @@ -28,26 +28,23 @@ MAX_ABSTRACT_LENGTH = 2048 class DoajArticleImporter(EntityImporter): - - def __init__(self, - api, - issn_map_file, - **kwargs): + def __init__(self, api, issn_map_file, **kwargs): eg_desc = kwargs.get( - 'editgroup_description', - "Automated import of DOAJ article metadata, harvested from REST API or bulk dumps" + "editgroup_description", + "Automated import of DOAJ article metadata, harvested from REST API or bulk dumps", ) - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', - 'fatcat_tools.DoajArticleImporter') + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.DoajArticleImporter") # ensure default is to not do updates with this worker (override super() default) - kwargs['do_updates'] = kwargs.get("do_updates", False) - super().__init__(api, - issn_map_file=issn_map_file, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + kwargs["do_updates"] = kwargs.get("do_updates", False) + super().__init__( + api, + issn_map_file=issn_map_file, + editgroup_description=eg_desc, + editgroup_extra=eg_extra, + **kwargs, + ) self.this_year = datetime.datetime.now().year self.read_issn_map_file(issn_map_file) @@ -82,21 +79,21 @@ class DoajArticleImporter(EntityImporter): } """ - if not obj or not isinstance(obj, dict) or 'bibjson' not in obj: - self.counts['skip-empty'] += 1 + if not obj or not isinstance(obj, dict) or "bibjson" not in obj: + self.counts["skip-empty"] += 1 return None - bibjson = obj['bibjson'] + bibjson = obj["bibjson"] - title = clean_str(bibjson.get('title'), force_xml=True) + title = clean_str(bibjson.get("title"), force_xml=True) if not title: - self.counts['skip-title'] += 1 + self.counts["skip-title"] += 1 return False - container_name = clean_str(bibjson['journal']['title']) + container_name = clean_str(bibjson["journal"]["title"]) container_id = None # NOTE: 'issns' not documented in API schema - for issn in bibjson['journal']['issns']: + for issn in bibjson["journal"]["issns"]: issnl = self.issn2issnl(issn) if issnl: container_id = self.lookup_issnl(self.issn2issnl(issn)) @@ -105,75 +102,83 @@ class DoajArticleImporter(EntityImporter): container_name = None break - volume = clean_str(bibjson['journal'].get('volume')) + volume = clean_str(bibjson["journal"].get("volume")) # NOTE: this schema seems to use "number" as "issue number" - issue = clean_str(bibjson['journal'].get('number')) - publisher = clean_str(bibjson['journal'].get('publisher')) + issue = clean_str(bibjson["journal"].get("number")) + publisher = clean_str(bibjson["journal"].get("publisher")) try: - release_year = int(bibjson.get('year')) + release_year = int(bibjson.get("year")) except (TypeError, ValueError): release_year = None - release_month = parse_month(clean_str(bibjson.get('month'))) + release_month = parse_month(clean_str(bibjson.get("month"))) # block bogus far-future years/dates - if release_year is not None and (release_year > (self.this_year + 5) or release_year < 1000): + if release_year is not None and ( + release_year > (self.this_year + 5) or release_year < 1000 + ): release_month = None release_year = None - license_slug = self.doaj_license_slug(bibjson['journal'].get('license')) - country = parse_country_name(bibjson['journal'].get('country')) + license_slug = self.doaj_license_slug(bibjson["journal"].get("license")) + country = parse_country_name(bibjson["journal"].get("country")) language = None - for raw in bibjson['journal'].get('language') or []: + for raw in bibjson["journal"].get("language") or []: language = parse_lang_name(raw) if language: break # pages # NOTE: error in API docs? seems like start_page not under 'journal' object - start_page = clean_str(bibjson['journal'].get('start_page')) or clean_str(bibjson.get('start_page')) - end_page = clean_str(bibjson['journal'].get('end_page')) or clean_str(bibjson.get('end_page')) + start_page = clean_str(bibjson["journal"].get("start_page")) or clean_str( + bibjson.get("start_page") + ) + end_page = clean_str(bibjson["journal"].get("end_page")) or clean_str( + bibjson.get("end_page") + ) pages: Optional[str] = None if start_page and end_page: pages = f"{start_page}-{end_page}" elif start_page: pages = start_page - doaj_article_id = obj['id'].lower() - ext_ids = self.doaj_ext_ids(bibjson['identifier'], doaj_article_id) + doaj_article_id = obj["id"].lower() + ext_ids = self.doaj_ext_ids(bibjson["identifier"], doaj_article_id) abstracts = self.doaj_abstracts(bibjson) - contribs = self.doaj_contribs(bibjson.get('author') or []) + contribs = self.doaj_contribs(bibjson.get("author") or []) # DOAJ-specific extra doaj_extra = dict() - if bibjson.get('subject'): - doaj_extra['subject'] = bibjson.get('subject') - if bibjson.get('keywords'): - doaj_extra['keywords'] = [k for k in [clean_str(s) for s in bibjson.get('keywords')] if k] + if bibjson.get("subject"): + doaj_extra["subject"] = bibjson.get("subject") + if bibjson.get("keywords"): + doaj_extra["keywords"] = [ + k for k in [clean_str(s) for s in bibjson.get("keywords")] if k + ] # generic extra extra = dict() if country: - extra['country'] = country + extra["country"] = country if not container_id and container_name: - extra['container_name'] = container_name + extra["container_name"] = container_name if release_year and release_month: # TODO: schema migration - extra['release_month'] = release_month + extra["release_month"] = release_month if doaj_extra: - extra['doaj'] = doaj_extra + extra["doaj"] = doaj_extra if not extra: extra = None re = fatcat_openapi_client.ReleaseEntity( work_id=None, container_id=container_id, - release_type='article-journal', - release_stage='published', + release_type="article-journal", + release_stage="published", title=title, release_year=release_year, - #release_date, + # release_date, publisher=publisher, ext_ids=ext_ids, contribs=contribs, @@ -208,11 +213,11 @@ class DoajArticleImporter(EntityImporter): # then try other ext_id lookups if not existing: - for extid_type in ('doi', 'pmid', 'pmcid'): + for extid_type in ("doi", "pmid", "pmcid"): extid_val = getattr(re.ext_ids, extid_type) if not extid_val: continue - #print(f" lookup release type: {extid_type} val: {extid_val}") + # print(f" lookup release type: {extid_type} val: {extid_val}") try: existing = self.api.lookup_release(**{extid_type: extid_val}) except fatcat_openapi_client.rest.ApiException as err: @@ -241,7 +246,7 @@ class DoajArticleImporter(EntityImporter): # other logic could go here about skipping updates if not self.do_updates or existing.ext_ids.doaj: - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False # fields to copy over for update @@ -250,7 +255,7 @@ class DoajArticleImporter(EntityImporter): existing.release_stage = existing.release_stage or re.release_stage existing.container_id = existing.container_id or re.container_id existing.abstracts = existing.abstracts or re.abstracts - existing.extra['doaj'] = re.extra['doaj'] + existing.extra["doaj"] = re.extra["doaj"] existing.volume = existing.volume or re.volume existing.issue = existing.issue or re.issue existing.pages = existing.pages or re.pages @@ -258,13 +263,13 @@ class DoajArticleImporter(EntityImporter): try: self.api.update_release(self.get_editgroup_id(), existing.ident, existing) - self.counts['update'] += 1 + self.counts["update"] += 1 except fatcat_openapi_client.rest.ApiException as err: # there is a code path where we try to update the same release # twice in a row; if that happens, just skip # NOTE: API behavior might change in the future? if "release_edit_editgroup_id_ident_id_key" in err.body: - self.counts['skip-update-conflict'] += 1 + self.counts["skip-update-conflict"] += 1 return False else: raise err @@ -272,14 +277,17 @@ class DoajArticleImporter(EntityImporter): return False def insert_batch(self, batch): - self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_release_auto_batch( + fatcat_openapi_client.ReleaseAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) def doaj_abstracts(self, bibjson: dict) -> List[fatcat_openapi_client.ReleaseAbstract]: - text = clean_str(bibjson.get('abstract')) + text = clean_str(bibjson.get("abstract")) if not text or len(text) < 10: return [] if len(text) > MAX_ABSTRACT_LENGTH: @@ -293,7 +301,9 @@ class DoajArticleImporter(EntityImporter): lang=lang, ) - return [abstract,] + return [ + abstract, + ] def doaj_contribs(self, authors: List[dict]) -> List[fatcat_openapi_client.ReleaseContrib]: """ @@ -306,23 +316,27 @@ class DoajArticleImporter(EntityImporter): contribs = [] index = 0 for author in authors: - if not author.get('name'): + if not author.get("name"): continue creator_id = None - orcid = clean_orcid(author.get('orcid_id')) + orcid = clean_orcid(author.get("orcid_id")) if orcid: creator_id = self.lookup_orcid(orcid) - contribs.append(fatcat_openapi_client.ReleaseContrib( - raw_name=author.get('name'), - role='author', - index=index, - creator_id=creator_id, - raw_affiliation=clean_str(author.get('affiliation')), - )) + contribs.append( + fatcat_openapi_client.ReleaseContrib( + raw_name=author.get("name"), + role="author", + index=index, + creator_id=creator_id, + raw_affiliation=clean_str(author.get("affiliation")), + ) + ) index += 1 return contribs - def doaj_ext_ids(self, identifiers: List[dict], doaj_article_id: str) -> fatcat_openapi_client.ReleaseExtIds: + def doaj_ext_ids( + self, identifiers: List[dict], doaj_article_id: str + ) -> fatcat_openapi_client.ReleaseExtIds: """ bibjson.identifier { id (string), @@ -336,14 +350,14 @@ class DoajArticleImporter(EntityImporter): pmid: Optional[str] = None pmcid: Optional[str] = None for id_obj in identifiers: - if not id_obj.get('id'): + if not id_obj.get("id"): continue - if id_obj['type'].lower() == 'doi': - doi = clean_doi(id_obj['id']) - elif id_obj['type'].lower() == 'pmid': - pmid = clean_pmid(id_obj['id']) - elif id_obj['type'].lower() == 'pmcid': - pmcid = clean_pmcid(id_obj['id']) + if id_obj["type"].lower() == "doi": + doi = clean_doi(id_obj["id"]) + elif id_obj["type"].lower() == "pmid": + pmid = clean_pmid(id_obj["id"]) + elif id_obj["type"].lower() == "pmcid": + pmcid = clean_pmcid(id_obj["id"]) return fatcat_openapi_client.ReleaseExtIds( doaj=doaj_article_id, @@ -365,10 +379,10 @@ class DoajArticleImporter(EntityImporter): if not license_list: return None for license in license_list: - if not license.get('open_access'): + if not license.get("open_access"): continue - slug = license.get('type') - if slug.startswith('CC '): - slug = slug.replace('CC ', 'cc-').lower() + slug = license.get("type") + if slug.startswith("CC "): + slug = slug.replace("CC ", "cc-").lower() return slug return None diff --git a/python/fatcat_tools/importers/file_meta.py b/python/fatcat_tools/importers/file_meta.py index 0951ed84..26584ff3 100644 --- a/python/fatcat_tools/importers/file_meta.py +++ b/python/fatcat_tools/importers/file_meta.py @@ -1,4 +1,3 @@ - import fatcat_openapi_client from .common import EntityImporter @@ -17,19 +16,16 @@ class FileMetaImporter(EntityImporter): def __init__(self, api, require_grobid=True, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "File metadata updates" - eg_extra = kwargs.pop('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.FileMetaImporter') - kwargs['do_updates'] = kwargs.get("do_updates", True) - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = kwargs.pop("editgroup_description", None) or "File metadata updates" + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.FileMetaImporter") + kwargs["do_updates"] = kwargs.get("do_updates", True) + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) def want(self, row): - for k in ('sha1hex', 'sha256hex', 'md5hex', 'size_bytes', 'mimetype'): + for k in ("sha1hex", "sha256hex", "md5hex", "size_bytes", "mimetype"): if not row.get(k): - self.counts['skip-missing-field'] += 1 + self.counts["skip-missing-field"] += 1 return False return True @@ -40,11 +36,11 @@ class FileMetaImporter(EntityImporter): file_meta = row fe = fatcat_openapi_client.FileEntity( - md5=file_meta['md5hex'], - sha1=file_meta['sha1hex'], - sha256=file_meta['sha256hex'], - size=file_meta['size_bytes'], - mimetype=file_meta['mimetype'], + md5=file_meta["md5hex"], + sha1=file_meta["sha1hex"], + sha256=file_meta["sha256hex"], + size=file_meta["size_bytes"], + mimetype=file_meta["mimetype"], ) return fe @@ -59,11 +55,11 @@ class FileMetaImporter(EntityImporter): raise err if not existing: - self.counts['skip-no-match'] += 1 + self.counts["skip-no-match"] += 1 return False - if (existing.md5 and existing.sha256 and existing.size and existing.mimetype): - self.counts['skip-existing-complete'] += 1 + if existing.md5 and existing.sha256 and existing.size and existing.mimetype: + self.counts["skip-existing-complete"] += 1 return False existing.md5 = existing.md5 or fe.md5 @@ -75,5 +71,5 @@ class FileMetaImporter(EntityImporter): existing = self.generic_file_cleanups(existing) self.api.update_file(self.get_editgroup_id(), existing.ident, existing) - self.counts['update'] += 1 + self.counts["update"] += 1 return False diff --git a/python/fatcat_tools/importers/fileset_generic.py b/python/fatcat_tools/importers/fileset_generic.py index 43c2a49c..dd8f5600 100644 --- a/python/fatcat_tools/importers/fileset_generic.py +++ b/python/fatcat_tools/importers/fileset_generic.py @@ -1,4 +1,3 @@ - import fatcat_openapi_client from fatcat_tools import entity_from_dict @@ -20,34 +19,31 @@ class FilesetImporter(EntityImporter): def __init__(self, api, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "Generic Fileset entity import" - eg_extra = kwargs.pop('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.FilesetImporter') - kwargs['do_updates'] = bool(kwargs.get("do_updates", False)) + eg_desc = kwargs.pop("editgroup_description", None) or "Generic Fileset entity import" + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.FilesetImporter") + kwargs["do_updates"] = bool(kwargs.get("do_updates", False)) self.skip_release_fileset_check = bool(kwargs.get("skip_release_fileset_check", False)) - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) # bezerk mode doesn't make sense for this importer assert self.bezerk_mode is False def want(self, row): - if not row.get('release_ids'): - self.counts['skip-no-release-ids'] += 1 + if not row.get("release_ids"): + self.counts["skip-no-release-ids"] += 1 return False - if not row.get('urls'): - self.counts['skip-no-urls'] += 1 + if not row.get("urls"): + self.counts["skip-no-urls"] += 1 return False - if not row.get('manifest'): - self.counts['skip-no-files'] += 1 + if not row.get("manifest"): + self.counts["skip-no-files"] += 1 return False - for f in row.get('manifest'): - for k in ('sha1', 'md5'): + for f in row.get("manifest"): + for k in ("sha1", "md5"): if not f.get(k): - self.counts['skip-missing-file-field'] += 1 + self.counts["skip-missing-file-field"] += 1 return False return True @@ -66,19 +62,24 @@ class FilesetImporter(EntityImporter): if not self.skip_release_fileset_check: for release_id in fse.release_ids: # don't catch 404, that would be an error - release = self.api.get_release(release_id, expand='filesets', hide='abstracts,refs') - assert release.state == 'active' + release = self.api.get_release( + release_id, expand="filesets", hide="abstracts,refs" + ) + assert release.state == "active" if release.filesets: - self.counts['exists'] += 1 - self.counts['exists-via-release-filesets'] += 1 + self.counts["exists"] += 1 + self.counts["exists-via-release-filesets"] += 1 return False # do the insert return True def insert_batch(self, batch): - self.api.create_fileset_auto_batch(fatcat_openapi_client.FilesetAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_fileset_auto_batch( + fatcat_openapi_client.FilesetAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py index 0f666652..f7bb5357 100644 --- a/python/fatcat_tools/importers/grobid_metadata.py +++ b/python/fatcat_tools/importers/grobid_metadata.py @@ -7,7 +7,7 @@ import fatcat_openapi_client from .common import EntityImporter, clean, make_rel_url -MAX_ABSTRACT_BYTES=4096 +MAX_ABSTRACT_BYTES = 4096 class GrobidMetadataImporter(EntityImporter): @@ -24,14 +24,13 @@ class GrobidMetadataImporter(EntityImporter): def __init__(self, api, **kwargs): - eg_desc = kwargs.get('editgroup_description', - "Import of release and file metadata, as extracted from PDFs by GROBID.") - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.GrobidMetadataImporter') - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = kwargs.get( + "editgroup_description", + "Import of release and file metadata, as extracted from PDFs by GROBID.", + ) + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.GrobidMetadataImporter") + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) self.default_link_rel = kwargs.get("default_link_rel", "web") self.longtail_oa = kwargs.get("longtail_oa", False) @@ -40,7 +39,7 @@ class GrobidMetadataImporter(EntityImporter): def parse_record(self, row): - fields = row.split('\t') + fields = row.split("\t") sha1_key = fields[0] cdx = json.loads(fields[1]) mimetype = fields[2] @@ -65,8 +64,8 @@ class GrobidMetadataImporter(EntityImporter): # TODO: this is where we should check if the file actually has # release_ids and/or URLs associated with it if existing and not self.bezerk_mode: - self.counts['exists'] += 1 - self.counts['skip'] -= 1 + self.counts["exists"] += 1 + self.counts["skip"] -= 1 return None release_edit = self.create_release(re) @@ -75,75 +74,81 @@ class GrobidMetadataImporter(EntityImporter): def parse_grobid_json(self, obj): - if not obj.get('title'): + if not obj.get("title"): return None extra_grobid = dict() - abstract = obj.get('abstract') + abstract = obj.get("abstract") if abstract and len(abstract) < MAX_ABSTRACT_BYTES and len(abstract) > 10: abobj = fatcat_openapi_client.ReleaseAbstract( - mimetype="text/plain", - content=clean(obj.get('abstract'))) + mimetype="text/plain", content=clean(obj.get("abstract")) + ) abstracts = [abobj] else: abstracts = None contribs = [] - for i, a in enumerate(obj.get('authors', [])): - contribs.append(fatcat_openapi_client.ReleaseContrib( - index=i, - raw_name=clean(a['name']), - given_name=clean(a.get('given_name')), - surname=clean(a.get('surname')), - role="author", - extra=None)) + for i, a in enumerate(obj.get("authors", [])): + contribs.append( + fatcat_openapi_client.ReleaseContrib( + index=i, + raw_name=clean(a["name"]), + given_name=clean(a.get("given_name")), + surname=clean(a.get("surname")), + role="author", + extra=None, + ) + ) refs = [] - for raw in obj.get('citations', []): + for raw in obj.get("citations", []): cite_extra = dict() year = None - if raw.get('date'): + if raw.get("date"): try: - year = int(raw['date'].strip()[:4]) + year = int(raw["date"].strip()[:4]) except (IndexError, ValueError): pass - for key in ('volume', 'url', 'issue', 'publisher'): + for key in ("volume", "url", "issue", "publisher"): if raw.get(key): cite_extra[key] = clean(raw[key]) - if raw.get('authors'): - cite_extra['authors'] = [clean(a['name']) for a in raw['authors']] + if raw.get("authors"): + cite_extra["authors"] = [clean(a["name"]) for a in raw["authors"]] if not cite_extra: cite_extra = None - refs.append(fatcat_openapi_client.ReleaseRef( - key=clean(raw.get('id')), - year=year, - title=clean(raw['title']), - extra=cite_extra)) + refs.append( + fatcat_openapi_client.ReleaseRef( + key=clean(raw.get("id")), + year=year, + title=clean(raw["title"]), + extra=cite_extra, + ) + ) release_date = None release_year = None - if obj.get('date'): + if obj.get("date"): # only returns year, ever? - release_year = int(obj['date'][:4]) + release_year = int(obj["date"][:4]) extra = dict() - if obj.get('doi'): - extra['doi'] = obj['doi'] - if obj['journal'] and obj['journal'].get('name'): - extra['container_name'] = clean(obj['journal']['name']) + if obj.get("doi"): + extra["doi"] = obj["doi"] + if obj["journal"] and obj["journal"].get("name"): + extra["container_name"] = clean(obj["journal"]["name"]) # TODO: ISSN/eISSN handling? or just journal name lookup? if extra_grobid: - extra['grobid'] = extra_grobid + extra["grobid"] = extra_grobid if self.longtail_oa: - extra['longtail_oa'] = True + extra["longtail_oa"] = True if not extra: extra = None - title = clean(obj['title'], force_xml=True) + title = clean(obj["title"], force_xml=True) if not title or len(title) < 2: return None @@ -154,17 +159,22 @@ class GrobidMetadataImporter(EntityImporter): release_year=release_year, contribs=contribs, refs=refs, - publisher=clean(obj['journal'].get('publisher')), - volume=clean(obj['journal'].get('volume')), - issue=clean(obj['journal'].get('issue')), + publisher=clean(obj["journal"].get("publisher")), + volume=clean(obj["journal"].get("volume")), + issue=clean(obj["journal"].get("issue")), abstracts=abstracts, ext_ids=fatcat_openapi_client.ReleaseExtIds(), - extra=extra) + extra=extra, + ) return re def parse_file_metadata(self, sha1_key, cdx, mimetype, file_size): - sha1 = base64.b16encode(base64.b32decode(sha1_key.replace('sha1:', ''))).decode('ascii').lower() + sha1 = ( + base64.b16encode(base64.b32decode(sha1_key.replace("sha1:", ""))) + .decode("ascii") + .lower() + ) fe = fatcat_openapi_client.FileEntity( sha1=sha1, @@ -175,16 +185,15 @@ class GrobidMetadataImporter(EntityImporter): ) # parse URLs and CDX - original = cdx['url'] - assert len(cdx['dt']) >= 8 - wayback = "https://web.archive.org/web/{}/{}".format( - cdx['dt'], - original) - fe.urls.append( - fatcat_openapi_client.FileUrl(url=wayback, rel="webarchive")) + original = cdx["url"] + assert len(cdx["dt"]) >= 8 + wayback = "https://web.archive.org/web/{}/{}".format(cdx["dt"], original) + fe.urls.append(fatcat_openapi_client.FileUrl(url=wayback, rel="webarchive")) original_url = make_rel_url(original, default_link_rel=self.default_link_rel) if original_url is not None: - fe.urls.append(fatcat_openapi_client.FileUrl(rel=original_url[0], url=original_url[1])) + fe.urls.append( + fatcat_openapi_client.FileUrl(rel=original_url[0], url=original_url[1]) + ) return fe @@ -193,8 +202,11 @@ class GrobidMetadataImporter(EntityImporter): return True def insert_batch(self, batch): - self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_file_auto_batch( + fatcat_openapi_client.FileAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index f0943c1e..e0a6c3f5 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -1,4 +1,3 @@ - import datetime import fatcat_openapi_client @@ -7,17 +6,16 @@ from .common import EntityImporter, make_rel_url class IngestFileResultImporter(EntityImporter): - def __init__(self, api, require_grobid=True, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "Files crawled from web using sandcrawler ingest tool" - eg_extra = kwargs.pop('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestFileResultImporter') - kwargs['do_updates'] = kwargs.get("do_updates", False) - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Files crawled from web using sandcrawler ingest tool" + ) + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.IngestFileResultImporter") + kwargs["do_updates"] = kwargs.get("do_updates", False) + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) self.use_glutton_match = False self.default_link_rel = kwargs.get("default_link_rel", "web") assert self.default_link_rel @@ -27,20 +25,20 @@ class IngestFileResultImporter(EntityImporter): else: print("NOT checking GROBID success") self.ingest_request_source_allowlist = [ - 'fatcat-changelog', - 'fatcat-ingest-container', - 'fatcat-ingest', - 'arabesque', + "fatcat-changelog", + "fatcat-ingest-container", + "fatcat-ingest", + "arabesque", #'mag-corpus', #'mag', - 'unpaywall-corpus', - 'unpaywall', + "unpaywall-corpus", + "unpaywall", #'s2-corpus', #'s2', - 'doaj', - 'dblp', + "doaj", + "dblp", ] - if kwargs.get('skip_source_allowlist', False): + if kwargs.get("skip_source_allowlist", False): self.ingest_request_source_allowlist = [] def want_file(self, row) -> bool: @@ -48,28 +46,32 @@ class IngestFileResultImporter(EntityImporter): File-specific part of want(). Generic across general ingest and save-paper-now. """ - if not row.get('file_meta'): - self.counts['skip-file-meta'] += 1 + if not row.get("file_meta"): + self.counts["skip-file-meta"] += 1 return False # type-specific filters - if row['request'].get('ingest_type') == 'pdf': - if self.require_grobid and row.get('grobid', {}).get('status_code') != 200: - self.counts['skip-grobid'] += 1 + if row["request"].get("ingest_type") == "pdf": + if self.require_grobid and row.get("grobid", {}).get("status_code") != 200: + self.counts["skip-grobid"] += 1 return False - if row['file_meta'].get('mimetype') not in ("application/pdf",): - self.counts['skip-mimetype'] += 1 + if row["file_meta"].get("mimetype") not in ("application/pdf",): + self.counts["skip-mimetype"] += 1 return False - elif row['request'].get('ingest_type') == 'xml': - if row['file_meta'].get('mimetype') not in ("application/xml", - "application/jats+xml", "application/tei+xml", "text/xml"): - self.counts['skip-mimetype'] += 1 + elif row["request"].get("ingest_type") == "xml": + if row["file_meta"].get("mimetype") not in ( + "application/xml", + "application/jats+xml", + "application/tei+xml", + "text/xml", + ): + self.counts["skip-mimetype"] += 1 return False - elif row['request'].get('ingest_type') in ['component', 'src', 'dataset-file']: + elif row["request"].get("ingest_type") in ["component", "src", "dataset-file"]: # we rely on sandcrawler for these checks pass else: - self.counts['skip-ingest-type'] += 1 + self.counts["skip-ingest-type"] += 1 return False return True @@ -79,24 +81,36 @@ class IngestFileResultImporter(EntityImporter): Sandcrawler ingest-specific part of want(). Generic across file and webcapture ingest. """ - if row.get('hit') is not True: - self.counts['skip-hit'] += 1 + if row.get("hit") is not True: + self.counts["skip-hit"] += 1 return False - source = row['request'].get('ingest_request_source') + source = row["request"].get("ingest_request_source") if not source: - self.counts['skip-ingest_request_source'] += 1 + self.counts["skip-ingest_request_source"] += 1 return False - if self.ingest_request_source_allowlist and source not in self.ingest_request_source_allowlist: - self.counts['skip-ingest_request_source'] += 1 + if ( + self.ingest_request_source_allowlist + and source not in self.ingest_request_source_allowlist + ): + self.counts["skip-ingest_request_source"] += 1 return False - if row['request'].get('link_source') not in ('arxiv', 'pmc', 'unpaywall', 'doi', 'mag', 's2', 'doaj', 'dblp'): - self.counts['skip-link-source'] += 1 + if row["request"].get("link_source") not in ( + "arxiv", + "pmc", + "unpaywall", + "doi", + "mag", + "s2", + "doaj", + "dblp", + ): + self.counts["skip-link-source"] += 1 return False - if source.startswith('savepapernow'): + if source.startswith("savepapernow"): # never process async savepapernow requests - self.counts['skip-savepapernow'] += 1 + self.counts["skip-savepapernow"] += 1 return False return True @@ -125,19 +139,19 @@ class IngestFileResultImporter(EntityImporter): def parse_ingest_release_ident(self, row): - request = row['request'] - fatcat = request.get('fatcat') + request = row["request"] + fatcat = request.get("fatcat") release_ident = None - if fatcat and fatcat.get('release_ident'): - release_ident = fatcat.get('release_ident') - elif request.get('ext_ids'): + if fatcat and fatcat.get("release_ident"): + release_ident = fatcat.get("release_ident") + elif request.get("ext_ids"): # if no fatcat ident, try extids - for extid_type in ('doi', 'pmid', 'pmcid', 'arxiv', 'doaj', 'dblp'): - extid = request['ext_ids'].get(extid_type) + for extid_type in ("doi", "pmid", "pmcid", "arxiv", "doaj", "dblp"): + extid = request["ext_ids"].get(extid_type) if not extid: continue - if extid_type == 'doi': + if extid_type == "doi": extid = extid.lower() try: release = self.api.lookup_release(**{extid_type: extid}) @@ -145,66 +159,69 @@ class IngestFileResultImporter(EntityImporter): if err.status == 404: continue elif err.status == 400: - self.counts['warn-extid-invalid'] += 1 + self.counts["warn-extid-invalid"] += 1 continue raise err # verify release_stage - if request.get('release_stage') and release.release_stage: - if request['release_stage'] != release.release_stage: - self.counts['skip-release-stage'] += 1 + if request.get("release_stage") and release.release_stage: + if request["release_stage"] != release.release_stage: + self.counts["skip-release-stage"] += 1 return None release_ident = release.ident break - if self.use_glutton_match and not release_ident and row.get('grobid'): + if self.use_glutton_match and not release_ident and row.get("grobid"): # try biblio-glutton extracted hit - if row['grobid'].get('fatcat_release'): - release_ident = row['grobid']['fatcat_release'].split('_')[-1] - self.counts['glutton-match'] += 1 + if row["grobid"].get("fatcat_release"): + release_ident = row["grobid"]["fatcat_release"].split("_")[-1] + self.counts["glutton-match"] += 1 return release_ident def parse_terminal(self, row): - terminal = row.get('terminal') + terminal = row.get("terminal") if not terminal: # support old cdx-only ingest results - cdx = row.get('cdx') + cdx = row.get("cdx") if not cdx: return None else: terminal = { - 'terminal_url': cdx['url'], - 'terminal_dt': cdx['datetime'], - 'terminal_status_code': cdx.get('status_code') or cdx.get('http_status'), + "terminal_url": cdx["url"], + "terminal_dt": cdx["datetime"], + "terminal_status_code": cdx.get("status_code") or cdx.get("http_status"), } # work around old schema - if 'terminal_url' not in terminal: - terminal['terminal_url'] = terminal['url'] - if 'terminal_dt' not in terminal: - terminal['terminal_dt'] = terminal['dt'] + if "terminal_url" not in terminal: + terminal["terminal_url"] = terminal["url"] + if "terminal_dt" not in terminal: + terminal["terminal_dt"] = terminal["dt"] # convert CDX-style digits to ISO-style timestamp - assert len(terminal['terminal_dt']) == 14 - terminal['terminal_timestamp'] = datetime.datetime.strptime(terminal['terminal_dt'], "%Y%m%d%H%M%S").isoformat() + "Z" + assert len(terminal["terminal_dt"]) == 14 + terminal["terminal_timestamp"] = ( + datetime.datetime.strptime(terminal["terminal_dt"], "%Y%m%d%H%M%S").isoformat() + + "Z" + ) return terminal def parse_urls(self, row, terminal): - request = row['request'] + request = row["request"] default_rel = self.default_link_rel - if request.get('link_source') == 'doi': - default_rel = 'publisher' - default_rel = request.get('rel', default_rel) - url = make_rel_url(terminal['terminal_url'], default_rel) + if request.get("link_source") == "doi": + default_rel = "publisher" + default_rel = request.get("rel", default_rel) + url = make_rel_url(terminal["terminal_url"], default_rel) if not url: - self.counts['skip-url'] += 1 + self.counts["skip-url"] += 1 return None wayback = "https://web.archive.org/web/{}/{}".format( - terminal['terminal_dt'], - terminal['terminal_url']) + terminal["terminal_dt"], terminal["terminal_url"] + ) urls = [url, ("webarchive", wayback)] urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls] @@ -212,38 +229,38 @@ class IngestFileResultImporter(EntityImporter): def parse_edit_extra(self, row): - request = row['request'] + request = row["request"] edit_extra = dict() - if request.get('edit_extra'): - edit_extra = request['edit_extra'] + if request.get("edit_extra"): + edit_extra = request["edit_extra"] - if request.get('ingest_request_source'): - edit_extra['ingest_request_source'] = request['ingest_request_source'] - if request.get('link_source') and request.get('link_source_id'): - edit_extra['link_source'] = request['link_source'] - edit_extra['link_source_id'] = request['link_source_id'] - if edit_extra['link_source'] == 'doi': - edit_extra['link_source_id'] = edit_extra['link_source_id'].lower() + if request.get("ingest_request_source"): + edit_extra["ingest_request_source"] = request["ingest_request_source"] + if request.get("link_source") and request.get("link_source_id"): + edit_extra["link_source"] = request["link_source"] + edit_extra["link_source_id"] = request["link_source_id"] + if edit_extra["link_source"] == "doi": + edit_extra["link_source_id"] = edit_extra["link_source_id"].lower() # GROBID metadata, for SPN requests (when there might not be 'success') - if request.get('ingest_type') == 'pdf': - if row.get('grobid') and row['grobid'].get('status') != 'success': - edit_extra['grobid_status_code'] = row['grobid']['status_code'] - edit_extra['grobid_version'] = row['grobid'].get('grobid_version') + if request.get("ingest_type") == "pdf": + if row.get("grobid") and row["grobid"].get("status") != "success": + edit_extra["grobid_status_code"] = row["grobid"]["status_code"] + edit_extra["grobid_version"] = row["grobid"].get("grobid_version") return edit_extra def parse_record(self, row): - request = row['request'] - file_meta = row['file_meta'] + request = row["request"] + file_meta = row["file_meta"] # double check that want() filtered request correctly (eg, old requests) - if request.get('ingest_type') not in ('pdf', 'xml'): - self.counts['skip-ingest-type'] += 1 + if request.get("ingest_type") not in ("pdf", "xml"): + self.counts["skip-ingest-type"] += 1 return None - assert (request['ingest_type'], file_meta['mimetype']) in [ + assert (request["ingest_type"], file_meta["mimetype"]) in [ ("pdf", "application/pdf"), ("xml", "application/xml"), ("xml", "application/jats+xml"), @@ -255,23 +272,23 @@ class IngestFileResultImporter(EntityImporter): release_ident = self.parse_ingest_release_ident(row) if not release_ident: - self.counts['skip-release-not-found'] += 1 + self.counts["skip-release-not-found"] += 1 return None terminal = self.parse_terminal(row) if not terminal: # TODO: support archive.org hits? - self.counts['skip-no-terminal'] += 1 + self.counts["skip-no-terminal"] += 1 return None urls = self.parse_urls(row, terminal) fe = fatcat_openapi_client.FileEntity( - md5=file_meta['md5hex'], - sha1=file_meta['sha1hex'], - sha256=file_meta['sha256hex'], - size=file_meta['size_bytes'], - mimetype=file_meta['mimetype'], + md5=file_meta["md5hex"], + sha1=file_meta["sha1hex"], + sha256=file_meta["sha256hex"], + size=file_meta["size_bytes"], + mimetype=file_meta["mimetype"], release_ids=[release_ident], urls=urls, ) @@ -293,7 +310,7 @@ class IngestFileResultImporter(EntityImporter): # check for existing edits-in-progress with same file hash for other in self._entity_queue: if other.sha1 == fe.sha1: - self.counts['skip-in-queue'] += 1 + self.counts["skip-in-queue"] += 1 return False if not existing: @@ -302,31 +319,36 @@ class IngestFileResultImporter(EntityImporter): # NOTE: the following checks all assume there is an existing item if (fe.release_ids[0] in existing.release_ids) and existing.urls: # TODO: could still, in theory update with the new URL? - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False if not self.do_updates: - self.counts['skip-update-disabled'] += 1 + self.counts["skip-update-disabled"] += 1 return False # TODO: for now, never update - self.counts['skip-update-disabled'] += 1 + self.counts["skip-update-disabled"] += 1 return False def insert_batch(self, batch): if self.submit_mode: - eg = self.api.create_editgroup(fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra)) + eg = self.api.create_editgroup( + fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ) + ) for fe in batch: self.api.create_file(eg.editgroup_id, fe) self.api.update_editgroup(eg.editgroup_id, eg, submit=True) else: - self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_file_auto_batch( + fatcat_openapi_client.FileAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) class SavePaperNowFileImporter(IngestFileResultImporter): @@ -338,29 +360,29 @@ class SavePaperNowFileImporter(IngestFileResultImporter): def __init__(self, api, submit_mode=True, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "Files crawled after a public 'Save Paper Now' request" - eg_extra = kwargs.pop('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.SavePaperNowFileImporter') - kwargs['submit_mode'] = submit_mode - kwargs['require_grobid'] = False - kwargs['do_updates'] = False - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Files crawled after a public 'Save Paper Now' request" + ) + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.SavePaperNowFileImporter") + kwargs["submit_mode"] = submit_mode + kwargs["require_grobid"] = False + kwargs["do_updates"] = False + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) def want(self, row): - source = row['request'].get('ingest_request_source') + source = row["request"].get("ingest_request_source") if not source: - self.counts['skip-ingest_request_source'] += 1 + self.counts["skip-ingest_request_source"] += 1 return False - if not source.startswith('savepapernow'): - self.counts['skip-not-savepapernow'] += 1 + if not source.startswith("savepapernow"): + self.counts["skip-not-savepapernow"] += 1 return False - if row.get('hit') is not True: - self.counts['skip-hit'] += 1 + if row.get("hit") is not True: + self.counts["skip-hit"] += 1 return False if not self.want_file(row): @@ -377,14 +399,14 @@ class IngestWebResultImporter(IngestFileResultImporter): def __init__(self, api, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "Webcaptures crawled from web using sandcrawler ingest tool" - eg_extra = kwargs.pop('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestWebResultImporter') - kwargs['do_updates'] = False - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Webcaptures crawled from web using sandcrawler ingest tool" + ) + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.IngestWebResultImporter") + kwargs["do_updates"] = False + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) def want(self, row): @@ -392,91 +414,95 @@ class IngestWebResultImporter(IngestFileResultImporter): return False # webcapture-specific filters - if row['request'].get('ingest_type') != 'html': - self.counts['skip-ingest-type'] += 1 + if row["request"].get("ingest_type") != "html": + self.counts["skip-ingest-type"] += 1 return False - if not row.get('file_meta'): - self.counts['skip-file-meta'] += 1 + if not row.get("file_meta"): + self.counts["skip-file-meta"] += 1 return False - if row['file_meta'].get('mimetype') not in ("text/html", "application/xhtml+xml"): - self.counts['skip-mimetype'] += 1 + if row["file_meta"].get("mimetype") not in ("text/html", "application/xhtml+xml"): + self.counts["skip-mimetype"] += 1 return False return True def parse_record(self, row): - request = row['request'] - file_meta = row['file_meta'] + request = row["request"] + file_meta = row["file_meta"] # double check that want() filtered request correctly (eg, old requests) - if request.get('ingest_type') != "html": - self.counts['skip-ingest-type'] += 1 + if request.get("ingest_type") != "html": + self.counts["skip-ingest-type"] += 1 return None - if file_meta['mimetype'] not in ("text/html", "application/xhtml+xml"): - self.counts['skip-mimetype'] += 1 + if file_meta["mimetype"] not in ("text/html", "application/xhtml+xml"): + self.counts["skip-mimetype"] += 1 return None # identify release by fatcat ident, or extid lookup release_ident = self.parse_ingest_release_ident(row) if not release_ident: - self.counts['skip-release-not-found'] += 1 + self.counts["skip-release-not-found"] += 1 return None terminal = self.parse_terminal(row) if not terminal: # TODO: support archive.org hits? - self.counts['skip-no-terminal'] += 1 + self.counts["skip-no-terminal"] += 1 return None urls = self.parse_urls(row, terminal) - archive_urls = [u for u in urls if u.rel == 'webarchive'] + archive_urls = [u for u in urls if u.rel == "webarchive"] - if terminal['terminal_status_code'] != 200: - self.counts['skip-terminal-status-code'] += 1 + if terminal["terminal_status_code"] != 200: + self.counts["skip-terminal-status-code"] += 1 return None - terminal_cdx = row['cdx'] - if 'revisit_cdx' in row: - terminal_cdx = row['revisit_cdx'] - assert terminal_cdx['surt'] - if terminal_cdx['url'] != terminal['terminal_url']: - self.counts['skip-terminal-url-mismatch'] += 1 + terminal_cdx = row["cdx"] + if "revisit_cdx" in row: + terminal_cdx = row["revisit_cdx"] + assert terminal_cdx["surt"] + if terminal_cdx["url"] != terminal["terminal_url"]: + self.counts["skip-terminal-url-mismatch"] += 1 return None wc_cdx = [] # primary resource first - wc_cdx.append(fatcat_openapi_client.WebcaptureCdxLine( - surt=terminal_cdx['surt'], - timestamp=terminal['terminal_timestamp'], - url=terminal['terminal_url'], - mimetype=file_meta['mimetype'], - status_code=terminal['terminal_status_code'], - sha1=file_meta['sha1hex'], - sha256=file_meta['sha256hex'], - size=file_meta['size_bytes'], - )) - - for resource in row.get('html_resources', []): - timestamp = resource['timestamp'] + wc_cdx.append( + fatcat_openapi_client.WebcaptureCdxLine( + surt=terminal_cdx["surt"], + timestamp=terminal["terminal_timestamp"], + url=terminal["terminal_url"], + mimetype=file_meta["mimetype"], + status_code=terminal["terminal_status_code"], + sha1=file_meta["sha1hex"], + sha256=file_meta["sha256hex"], + size=file_meta["size_bytes"], + ) + ) + + for resource in row.get("html_resources", []): + timestamp = resource["timestamp"] if "+" not in timestamp and "Z" not in timestamp: timestamp += "Z" - wc_cdx.append(fatcat_openapi_client.WebcaptureCdxLine( - surt=resource['surt'], - timestamp=timestamp, - url=resource['url'], - mimetype=resource.get('mimetype'), - size=resource.get('size'), - sha1=resource.get('sha1hex'), - sha256=resource.get('sha256hex'), - )) + wc_cdx.append( + fatcat_openapi_client.WebcaptureCdxLine( + surt=resource["surt"], + timestamp=timestamp, + url=resource["url"], + mimetype=resource.get("mimetype"), + size=resource.get("size"), + sha1=resource.get("sha1hex"), + sha256=resource.get("sha256hex"), + ) + ) wc = fatcat_openapi_client.WebcaptureEntity( cdx=wc_cdx, archive_urls=archive_urls, - original_url=terminal['terminal_url'], - timestamp=terminal['terminal_timestamp'], + original_url=terminal["terminal_url"], + timestamp=terminal["terminal_timestamp"], release_ids=[release_ident], ) @@ -491,11 +517,11 @@ class IngestWebResultImporter(IngestFileResultImporter): # check for existing edits-in-progress with same URL for other in self._entity_queue: if other.original_url == wc.original_url: - self.counts['skip-in-queue'] += 1 + self.counts["skip-in-queue"] += 1 return False # lookup sha1, or create new entity (TODO: API doesn't support this yet) - #existing = None + # existing = None # TODO: currently only allow one release per webcapture release = self.api.get_release(wc.release_ids[0], expand="webcaptures") @@ -504,9 +530,9 @@ class IngestWebResultImporter(IngestFileResultImporter): for other in release.webcaptures: if wc.original_url == other.original_url: # TODO: compare very similar timestamps of same time (different formats) - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False - self.counts['skip-release-has-webcapture'] += 1 + self.counts["skip-release-has-webcapture"] += 1 return False # Ok, if we got here then no existing web capture for (first) release, @@ -515,18 +541,24 @@ class IngestWebResultImporter(IngestFileResultImporter): def insert_batch(self, batch): if self.submit_mode: - eg = self.api.create_editgroup(fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra)) + eg = self.api.create_editgroup( + fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ) + ) for fe in batch: self.api.create_webcapture(eg.editgroup_id, fe) self.api.update_editgroup(eg.editgroup_id, eg, submit=True) else: - self.api.create_webcapture_auto_batch(fatcat_openapi_client.WebcaptureAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_webcapture_auto_batch( + fatcat_openapi_client.WebcaptureAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) + class SavePaperNowWebImporter(IngestWebResultImporter): """ @@ -535,15 +567,15 @@ class SavePaperNowWebImporter(IngestWebResultImporter): def __init__(self, api, submit_mode=True, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "Webcaptures crawled after a public 'Save Paper Now' request" - eg_extra = kwargs.pop('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.SavePaperNowWebImporter') - kwargs['submit_mode'] = submit_mode - kwargs['do_updates'] = False - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Webcaptures crawled after a public 'Save Paper Now' request" + ) + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.SavePaperNowWebImporter") + kwargs["submit_mode"] = submit_mode + kwargs["do_updates"] = False + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) def want(self, row): """ @@ -553,27 +585,27 @@ class SavePaperNowWebImporter(IngestWebResultImporter): path, which means allowing hit=false. """ - source = row['request'].get('ingest_request_source') + source = row["request"].get("ingest_request_source") if not source: - self.counts['skip-ingest_request_source'] += 1 + self.counts["skip-ingest_request_source"] += 1 return False - if not source.startswith('savepapernow'): - self.counts['skip-not-savepapernow'] += 1 + if not source.startswith("savepapernow"): + self.counts["skip-not-savepapernow"] += 1 return False # webcapture-specific filters - if row['request'].get('ingest_type') != 'html': - self.counts['skip-ingest-type'] += 1 + if row["request"].get("ingest_type") != "html": + self.counts["skip-ingest-type"] += 1 return False - if not row.get('file_meta'): - self.counts['skip-file-meta'] += 1 + if not row.get("file_meta"): + self.counts["skip-file-meta"] += 1 return False - if row['file_meta'].get('mimetype') not in ("text/html", "application/xhtml+xml"): - self.counts['skip-mimetype'] += 1 + if row["file_meta"].get("mimetype") not in ("text/html", "application/xhtml+xml"): + self.counts["skip-mimetype"] += 1 return False - if row.get('status') not in ['success', 'unknown-scope']: - self.counts['skip-hit'] += 1 + if row.get("status") not in ["success", "unknown-scope"]: + self.counts["skip-hit"] += 1 return False return True @@ -587,28 +619,28 @@ class IngestFilesetResultImporter(IngestFileResultImporter): def __init__(self, api, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "Filesets crawled from web using sandcrawler ingest tool" - eg_extra = kwargs.pop('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestFilesetResultImporter') - kwargs['do_updates'] = False - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Filesets crawled from web using sandcrawler ingest tool" + ) + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.IngestFilesetResultImporter") + kwargs["do_updates"] = False + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) self.max_file_count = 300 def want_fileset(self, row): - if not row.get('manifest') or len(row.get('manifest')) == 0: - self.counts['skip-empty-manifest'] += 1 + if not row.get("manifest") or len(row.get("manifest")) == 0: + self.counts["skip-empty-manifest"] += 1 return False - if len(row.get('manifest')) == 1: - self.counts['skip-single-file'] += 1 + if len(row.get("manifest")) == 1: + self.counts["skip-single-file"] += 1 return False - if len(row.get('manifest')) > self.max_file_count: - self.counts['skip-too-many-files'] += 1 + if len(row.get("manifest")) > self.max_file_count: + self.counts["skip-too-many-files"] += 1 return False return True @@ -619,8 +651,10 @@ class IngestFilesetResultImporter(IngestFileResultImporter): return False # fileset-specific filters - if row['request'].get('ingest_type') not in ['dataset',]: - self.counts['skip-ingest-type'] += 1 + if row["request"].get("ingest_type") not in [ + "dataset", + ]: + self.counts["skip-ingest-type"] += 1 return False if not self.want_fileset(row): @@ -629,102 +663,118 @@ class IngestFilesetResultImporter(IngestFileResultImporter): return True def parse_fileset_urls(self, row): - if not row.get('strategy'): + if not row.get("strategy"): return [] - strategy = row['strategy'] + strategy = row["strategy"] urls = [] - if strategy == 'archiveorg-fileset' and row.get('archiveorg_item_name'): - urls.append(fatcat_openapi_client.FilesetUrl( - url=f"https://archive.org/download/{row['archiveorg_item_name']}/", - rel="archive-base", - )) - if row['strategy'].startswith('web-') and row.get('platform_base_url'): - urls.append(fatcat_openapi_client.FilesetUrl( - url=f"https://web.archive.org/web/{row['web_base_url_dt']}/{row['web_base_url']}", - rel="webarchive-base", - )) + if strategy == "archiveorg-fileset" and row.get("archiveorg_item_name"): + urls.append( + fatcat_openapi_client.FilesetUrl( + url=f"https://archive.org/download/{row['archiveorg_item_name']}/", + rel="archive-base", + ) + ) + if row["strategy"].startswith("web-") and row.get("platform_base_url"): + urls.append( + fatcat_openapi_client.FilesetUrl( + url=f"https://web.archive.org/web/{row['web_base_url_dt']}/{row['web_base_url']}", + rel="webarchive-base", + ) + ) # TODO: repository-base # TODO: web-base - if row['strategy'] == 'archiveorg-fileset-bundle' and row.get('archiveorg_item_name'): - urls.append(fatcat_openapi_client.FilesetUrl( - url=f"https://archive.org/download/{row['archiveorg_item_name']}/{row['archiveorg_bundle_path']}", - rel="archive-bundle", - )) + if row["strategy"] == "archiveorg-fileset-bundle" and row.get("archiveorg_item_name"): + urls.append( + fatcat_openapi_client.FilesetUrl( + url=f"https://archive.org/download/{row['archiveorg_item_name']}/{row['archiveorg_bundle_path']}", + rel="archive-bundle", + ) + ) - if row['strategy'] == 'web-fileset-bundle' and row.get('platform_bundle_url'): - urls.append(fatcat_openapi_client.FilesetUrl( - url=f"https://web.archive.org/web/{row['web_bundle_url_dt']}/{row['web_bundle_url']}", - rel="webarchive-bundle", - )) + if row["strategy"] == "web-fileset-bundle" and row.get("platform_bundle_url"): + urls.append( + fatcat_openapi_client.FilesetUrl( + url=f"https://web.archive.org/web/{row['web_bundle_url_dt']}/{row['web_bundle_url']}", + rel="webarchive-bundle", + ) + ) # add any additional / platform URLs here - if row.get('platform_bundle_url'): - urls.append(fatcat_openapi_client.FilesetUrl( - url=row['platform_bundle_url'], - rel="repository-bundle", - )) - if row.get('platform_base_url'): - urls.append(fatcat_openapi_client.FilesetUrl( - url=row['platform_bundle_url'], - rel="repository-base", - )) + if row.get("platform_bundle_url"): + urls.append( + fatcat_openapi_client.FilesetUrl( + url=row["platform_bundle_url"], + rel="repository-bundle", + ) + ) + if row.get("platform_base_url"): + urls.append( + fatcat_openapi_client.FilesetUrl( + url=row["platform_bundle_url"], + rel="repository-base", + ) + ) return urls def parse_record(self, row): - request = row['request'] + request = row["request"] # double check that want() filtered request correctly - if request.get('ingest_type') not in ["dataset",]: - self.counts['skip-ingest-type'] += 1 + if request.get("ingest_type") not in [ + "dataset", + ]: + self.counts["skip-ingest-type"] += 1 return None # identify release by fatcat ident, or extid lookup release_ident = self.parse_ingest_release_ident(row) if not release_ident: - self.counts['skip-release-not-found'] += 1 + self.counts["skip-release-not-found"] += 1 return None entity_extra = dict() edit_extra = self.parse_edit_extra(row) - edit_extra['ingest_strategy'] = row['ingest_strategy'] - if row.get('platform'): - edit_extra['platform'] = row['platform'] - if row.get('platform_id'): - edit_extra['platform_id'] = row['platform_id'] + edit_extra["ingest_strategy"] = row["ingest_strategy"] + if row.get("platform"): + edit_extra["platform"] = row["platform"] + if row.get("platform_id"): + edit_extra["platform_id"] = row["platform_id"] entity_urls = self.parse_fileset_urls(row) if not entity_urls: - self.counts['skip-no-access-url'] += 1 + self.counts["skip-no-access-url"] += 1 return None - assert row['file_count'] == len(row['manifest']) - if row['file_count'] > self.max_file_count: - self.counts['skip-too-many-manifest-files'] += 1 + assert row["file_count"] == len(row["manifest"]) + if row["file_count"] > self.max_file_count: + self.counts["skip-too-many-manifest-files"] += 1 return None manifest = [] - for ingest_file in row['manifest']: + for ingest_file in row["manifest"]: fsf = fatcat_openapi_client.FilesetFile( - path=ingest_file['path'], - size=ingest_file['size'], - md5=ingest_file['md5'], - sha1=ingest_file['sha1'], - sha256=ingest_file.get('sha256'), + path=ingest_file["path"], + size=ingest_file["size"], + md5=ingest_file["md5"], + sha1=ingest_file["sha1"], + sha256=ingest_file.get("sha256"), extra=dict( - mimetype=ingest_file['mimetype'], + mimetype=ingest_file["mimetype"], ), ) if not (fsf.md5 and fsf.sha1 and fsf.path and fsf.size): - self.counts['skip-partial-file-info'] += 1 + self.counts["skip-partial-file-info"] += 1 return None - if ingest_file.get('platform_url'): + if ingest_file.get("platform_url"): # XXX: should we include this? - fsf.extra['original_url'] = ingest_file['platform_url'] - if ingest_file.get('terminal_url') and ingest_file.get('terminal_dt'): - fsf.extra['wayback_url'] = f"https://web.archive.org/web/{ingest_file['terminal_dt']}/{ingest_file['terminal_url']}" + fsf.extra["original_url"] = ingest_file["platform_url"] + if ingest_file.get("terminal_url") and ingest_file.get("terminal_dt"): + fsf.extra[ + "wayback_url" + ] = f"https://web.archive.org/web/{ingest_file['terminal_dt']}/{ingest_file['terminal_url']}" manifest.append(fsf) fe = fatcat_openapi_client.FilesetEntity( @@ -745,11 +795,11 @@ class IngestFilesetResultImporter(IngestFileResultImporter): for other in self._entity_queue: # XXX: how to duplicate check? if other.original_url == wc.original_url: - self.counts['skip-in-queue'] += 1 + self.counts["skip-in-queue"] += 1 return False # lookup sha1, or create new entity (TODO: API doesn't support this yet) - #existing = None + # existing = None # NOTE: in lieu of existing checks (by lookup), only allow one fileset per release release = self.api.get_release(wc.release_ids[0], expand="filesets") @@ -759,27 +809,32 @@ class IngestFilesetResultImporter(IngestFileResultImporter): for other in release.filesets: if wc.original_url == other.original_url: # TODO: compare very similar timestamps of same time (different formats) - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False - self.counts['skip-release-has-fileset'] += 1 + self.counts["skip-release-has-fileset"] += 1 return False return True def insert_batch(self, batch): if self.submit_mode: - eg = self.api.create_editgroup(fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra)) + eg = self.api.create_editgroup( + fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ) + ) for fe in batch: self.api.create_fileset(eg.editgroup_id, fe) self.api.update_editgroup(eg.editgroup_id, eg, submit=True) else: - self.api.create_fileset_auto_batch(fatcat_openapi_client.FilesetAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_fileset_auto_batch( + fatcat_openapi_client.FilesetAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) class SavePaperNowFilesetImporter(IngestFilesetResultImporter): @@ -789,28 +844,28 @@ class SavePaperNowFilesetImporter(IngestFilesetResultImporter): def __init__(self, api, submit_mode=True, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "Fileset crawled after a public 'Save Paper Now' request" - eg_extra = kwargs.pop('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.SavePaperNowFilesetImporter') - kwargs['submit_mode'] = submit_mode - kwargs['do_updates'] = False - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Fileset crawled after a public 'Save Paper Now' request" + ) + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.SavePaperNowFilesetImporter") + kwargs["submit_mode"] = submit_mode + kwargs["do_updates"] = False + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) def want(self, row): - source = row['request'].get('ingest_request_source') + source = row["request"].get("ingest_request_source") if not source: - self.counts['skip-ingest_request_source'] += 1 + self.counts["skip-ingest_request_source"] += 1 return False - if not source.startswith('savepapernow'): - self.counts['skip-not-savepapernow'] += 1 + if not source.startswith("savepapernow"): + self.counts["skip-not-savepapernow"] += 1 return False - if row.get('hit') is not True: - self.counts['skip-hit'] += 1 + if row.get("hit") is not True: + self.counts["skip-hit"] += 1 return False if not self.want_fileset(row): diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py index 0a983c5e..8e3af416 100644 --- a/python/fatcat_tools/importers/jalc.py +++ b/python/fatcat_tools/importers/jalc.py @@ -1,4 +1,3 @@ - import datetime import sqlite3 import sys @@ -33,26 +32,24 @@ def parse_jalc_persons(raw_persons): # first parse out into language-agnostic dics for raw in raw_persons: - name = raw.find('name') or None + name = raw.find("name") or None if name: - name = clean(name.get_text().replace('\n', ' ')) - surname = raw.find('familyName') or None + name = clean(name.get_text().replace("\n", " ")) + surname = raw.find("familyName") or None if surname: - surname = clean(surname.get_text().replace('\n', ' ')) - given_name = raw.find('givenName') or None + surname = clean(surname.get_text().replace("\n", " ")) + given_name = raw.find("givenName") or None if given_name: - given_name = clean(given_name.get_text().replace('\n', ' ')) - lang = 'en' + given_name = clean(given_name.get_text().replace("\n", " ")) + lang = "en" if is_cjk(name): - lang = 'ja' - if lang == 'en' and surname and given_name: + lang = "ja" + if lang == "en" and surname and given_name: # english names order is flipped name = "{} {}".format(given_name, surname) rc = fatcat_openapi_client.ReleaseContrib( - raw_name=name, - surname=surname, - given_name=given_name, - role="author") + raw_name=name, surname=surname, given_name=given_name, role="author" + ) # add an extra hint field; won't end up in serialized object rc._lang = lang persons.append(rc) @@ -60,12 +57,12 @@ def parse_jalc_persons(raw_persons): if not persons: return [] - if all([p._lang == 'en' for p in persons]) or all([p._lang == 'ja' for p in persons]): + if all([p._lang == "en" for p in persons]) or all([p._lang == "ja" for p in persons]): # all english names, or all japanese names return persons # for debugging - #if len([1 for p in persons if p._lang == 'en']) != len([1 for p in persons if p._lang == 'ja']): + # if len([1 for p in persons if p._lang == 'en']) != len([1 for p in persons if p._lang == 'ja']): # print("INTERESTING: {}".format(persons[0])) start_lang = persons[0]._lang @@ -74,10 +71,10 @@ def parse_jalc_persons(raw_persons): if p._lang == start_lang: contribs.append(p) else: - if p._lang == 'en' and contribs[-1]._lang == 'ja': + if p._lang == "en" and contribs[-1]._lang == "ja": eng = p jpn = contribs[-1] - elif p._lang == 'ja' and contribs[-1]._lang == 'en': + elif p._lang == "ja" and contribs[-1]._lang == "en": eng = contribs[-1] jpn = p else: @@ -85,11 +82,11 @@ def parse_jalc_persons(raw_persons): contribs.append(p) continue eng.extra = { - 'original_name': { - 'lang': jpn._lang, - 'raw_name': jpn.raw_name, - 'given_name': jpn.given_name, - 'surname': jpn.surname, + "original_name": { + "lang": jpn._lang, + "raw_name": jpn.raw_name, + "given_name": jpn.given_name, + "surname": jpn.surname, }, } contribs[-1] = eng @@ -105,18 +102,19 @@ class JalcImporter(EntityImporter): def __init__(self, api, issn_map_file, **kwargs): - eg_desc = kwargs.get('editgroup_description', - "Automated import of JALC DOI metadata") - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.JalcImporter') - super().__init__(api, + eg_desc = kwargs.get("editgroup_description", "Automated import of JALC DOI metadata") + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.JalcImporter") + super().__init__( + api, issn_map_file=issn_map_file, editgroup_description=eg_desc, editgroup_extra=eg_extra, - **kwargs) + **kwargs + ) - self.create_containers = kwargs.get('create_containers', True) - extid_map_file = kwargs.get('extid_map_file') + self.create_containers = kwargs.get("create_containers", True) + extid_map_file = kwargs.get("extid_map_file") self.extid_map_db = None if extid_map_file: db_uri = "file:{}?mode=ro".format(extid_map_file) @@ -129,12 +127,27 @@ class JalcImporter(EntityImporter): def lookup_ext_ids(self, doi): if self.extid_map_db is None: - return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) - row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", - [doi.lower()]).fetchone() + return dict( + core_id=None, + pmid=None, + pmcid=None, + wikidata_qid=None, + arxiv_id=None, + jstor_id=None, + ) + row = self.extid_map_db.execute( + "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()] + ).fetchone() if row is None: - return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) - row = [str(cell or '') or None for cell in row] + return dict( + core_id=None, + pmid=None, + pmcid=None, + wikidata_qid=None, + arxiv_id=None, + jstor_id=None, + ) + row = [str(cell or "") or None for cell in row] return dict( core_id=row[0], pmid=row[1], @@ -163,27 +176,27 @@ class JalcImporter(EntityImporter): titles = record.find_all("title") if not titles: return None - title = titles[0].get_text().replace('\n', ' ').strip() + title = titles[0].get_text().replace("\n", " ").strip() original_title = None - if title.endswith('.'): + if title.endswith("."): title = title[:-1] if len(titles) > 1: - original_title = titles[1].get_text().replace('\n', ' ').strip() - if original_title.endswith('.'): + original_title = titles[1].get_text().replace("\n", " ").strip() + if original_title.endswith("."): original_title = original_title[:-1] doi = None if record.doi: doi = clean_doi(record.doi.string.strip().lower()) - if doi.startswith('http://dx.doi.org/'): - doi = doi.replace('http://dx.doi.org/', '') - elif doi.startswith('https://dx.doi.org/'): - doi = doi.replace('https://dx.doi.org/', '') - elif doi.startswith('http://doi.org/'): - doi = doi.replace('http://doi.org/', '') - elif doi.startswith('https://doi.org/'): - doi = doi.replace('https://doi.org/', '') - if not (doi.startswith('10.') and '/' in doi): + if doi.startswith("http://dx.doi.org/"): + doi = doi.replace("http://dx.doi.org/", "") + elif doi.startswith("https://dx.doi.org/"): + doi = doi.replace("https://dx.doi.org/", "") + elif doi.startswith("http://doi.org/"): + doi = doi.replace("http://doi.org/", "") + elif doi.startswith("https://doi.org/"): + doi = doi.replace("https://doi.org/", "") + if not (doi.startswith("10.") and "/" in doi): sys.stderr.write("bogus JALC DOI: {}\n".format(doi)) doi = None if not doi: @@ -202,7 +215,9 @@ class JalcImporter(EntityImporter): if date: date = date.string if len(date) == 10: - release_date = datetime.datetime.strptime(date['completed-date'], DATE_FMT).date() + release_date = datetime.datetime.strptime( + date["completed-date"], DATE_FMT + ).date() release_year = release_date.year release_date = release_date.isoformat() elif len(date) == 4 and date.isdigit(): @@ -214,7 +229,7 @@ class JalcImporter(EntityImporter): if record.endingPage and record.endingPage.string.strip(): pages = "{}-{}".format(pages, record.endingPage.string.strip()) # double check to prevent "-" as pages - if pages and pages.strip() == '-': + if pages and pages.strip() == "-": pages = None volume = None @@ -242,9 +257,13 @@ class JalcImporter(EntityImporter): container_extra = dict() if record.publicationName: - pubs = [p.get_text().replace('\n', ' ').strip() for p in record.find_all("publicationName") if p.get_text()] + pubs = [ + p.get_text().replace("\n", " ").strip() + for p in record.find_all("publicationName") + if p.get_text() + ] pubs = [clean(p) for p in pubs if p] - assert(pubs) + assert pubs if len(pubs) > 1 and pubs[0] == pubs[1]: pubs = [pubs[0]] if len(pubs) > 1 and is_cjk(pubs[0]): @@ -252,10 +271,14 @@ class JalcImporter(EntityImporter): pubs = [pubs[1], pubs[0]] container_name = clean(pubs[0]) if len(pubs) > 1: - container_extra['original_name'] = clean(pubs[1]) + container_extra["original_name"] = clean(pubs[1]) if record.publisher: - pubs = [p.get_text().replace('\n', ' ').strip() for p in record.find_all("publisher") if p.get_text()] + pubs = [ + p.get_text().replace("\n", " ").strip() + for p in record.find_all("publisher") + if p.get_text() + ] pubs = [p for p in pubs if p] if len(pubs) > 1 and pubs[0] == pubs[1]: pubs = [pubs[0]] @@ -265,20 +288,25 @@ class JalcImporter(EntityImporter): if pubs: publisher = clean(pubs[0]) if len(pubs) > 1: - container_extra['publisher_aliases'] = pubs[1:] - - if (container_id is None and self.create_containers and (issnl is not None) - and container_name): + container_extra["publisher_aliases"] = pubs[1:] + + if ( + container_id is None + and self.create_containers + and (issnl is not None) + and container_name + ): # name, type, publisher, issnl # extra: issnp, issne, original_name, languages, country - container_extra['country'] = 'jp' - container_extra['languages'] = ['ja'] + container_extra["country"] = "jp" + container_extra["languages"] = ["ja"] ce = fatcat_openapi_client.ContainerEntity( name=container_name, - container_type='journal', + container_type="journal", publisher=publisher, issnl=issnl, - extra=(container_extra or None)) + extra=(container_extra or None), + ) ce_edit = self.create_container(ce) container_id = ce_edit.ident # short-cut future imports in same batch @@ -301,7 +329,7 @@ class JalcImporter(EntityImporter): # group-title # always put at least an empty dict here to indicate the DOI registrar # (informally) - extra['jalc'] = extra_jalc + extra["jalc"] = extra_jalc title = clean(title) if not title: @@ -312,24 +340,24 @@ class JalcImporter(EntityImporter): title=title, original_title=clean(original_title), release_type=release_type, - release_stage='published', + release_stage="published", release_date=release_date, release_year=release_year, ext_ids=fatcat_openapi_client.ReleaseExtIds( doi=doi, - pmid=extids['pmid'], - pmcid=extids['pmcid'], - wikidata_qid=extids['wikidata_qid'], - core=extids['core_id'], - arxiv=extids['arxiv_id'], - jstor=extids['jstor_id'], + pmid=extids["pmid"], + pmcid=extids["pmcid"], + wikidata_qid=extids["wikidata_qid"], + core=extids["core_id"], + arxiv=extids["arxiv_id"], + jstor=extids["jstor_id"], ), volume=volume, issue=issue, pages=pages, publisher=publisher, language=lang, - #license_slug + # license_slug container_id=container_id, contribs=contribs, extra=extra, @@ -351,17 +379,20 @@ class JalcImporter(EntityImporter): # eventually we'll want to support "updates", but for now just skip if # entity already exists if existing: - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False return True def insert_batch(self, batch): - self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_release_auto_batch( + fatcat_openapi_client.ReleaseAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) def parse_file(self, handle): """ @@ -374,11 +405,11 @@ class JalcImporter(EntityImporter): # 2. iterate over articles, call parse_article on each for record in soup.find_all("Description"): resp = self.parse_record(record) - #print(json.dumps(resp)) + # print(json.dumps(resp)) print(resp) - #sys.exit(-1) + # sys.exit(-1) -if __name__=='__main__': +if __name__ == "__main__": parser = JalcImporter(None, None) parser.parse_file(open(sys.argv[1])) diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py index 25d7b3b5..6d1fefa3 100644 --- a/python/fatcat_tools/importers/journal_metadata.py +++ b/python/fatcat_tools/importers/journal_metadata.py @@ -1,4 +1,3 @@ - import fatcat_openapi_client from .common import EntityImporter, clean @@ -11,18 +10,20 @@ def or_none(s): return None return s + def truthy(s): if s is None: return None s = s.lower() - if s in ('true', 't', 'yes', 'y', '1'): + if s in ("true", "t", "yes", "y", "1"): return True - elif s in ('false', 'f', 'no', 'n', '0'): + elif s in ("false", "f", "no", "n", "0"): return False else: return None + class JournalMetadataImporter(EntityImporter): """ Imports journal metadata ("containers") by ISSN, currently from a custom @@ -33,17 +34,16 @@ class JournalMetadataImporter(EntityImporter): def __init__(self, api, **kwargs): - eg_desc = kwargs.get('editgroup_description', - "Automated import of container-level metadata, by ISSN. Metadata from Internet Archive munging.") - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.JournalMetadataImporter') - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = kwargs.get( + "editgroup_description", + "Automated import of container-level metadata, by ISSN. Metadata from Internet Archive munging.", + ) + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.JournalMetadataImporter") + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) def want(self, raw_record): - if raw_record.get('issnl') and raw_record.get('name'): + if raw_record.get("issnl") and raw_record.get("name"): return True return False @@ -54,52 +54,68 @@ class JournalMetadataImporter(EntityImporter): returns a ContainerEntity (or None if invalid or couldn't parse) """ - if not row.get('name'): + if not row.get("name"): # Name is required (by schema) return None extra = dict() - for key in ('issne', 'issnp', 'languages', 'country', 'urls', 'abbrev', - 'coden', 'aliases', 'original_name', 'first_year', 'last_year', - 'platform', 'default_license', 'road', 'mimetypes', - 'sherpa_romeo', 'kbart'): + for key in ( + "issne", + "issnp", + "languages", + "country", + "urls", + "abbrev", + "coden", + "aliases", + "original_name", + "first_year", + "last_year", + "platform", + "default_license", + "road", + "mimetypes", + "sherpa_romeo", + "kbart", + ): if row.get(key): extra[key] = row[key] # TODO: not including for now: norwegian, dois/crossref, ia extra_doaj = dict() - if row.get('doaj'): - if row['doaj'].get('as_of'): - extra_doaj['as_of'] = row['doaj']['as_of'] - if row['doaj'].get('works'): - extra_doaj['works'] = row['doaj']['works'] + if row.get("doaj"): + if row["doaj"].get("as_of"): + extra_doaj["as_of"] = row["doaj"]["as_of"] + if row["doaj"].get("works"): + extra_doaj["works"] = row["doaj"]["works"] if extra_doaj: - extra['doaj'] = extra_doaj + extra["doaj"] = extra_doaj extra_ia = dict() # TODO: would like an ia.longtail_ia flag - if row.get('sim'): + if row.get("sim"): # NB: None case of the .get() here is blech, but othrwise # extra['ia'].get('sim') would be false-y, breaking 'any_ia_sim' later on - extra_ia['sim'] = { - 'year_spans': row['sim'].get('year_spans'), + extra_ia["sim"] = { + "year_spans": row["sim"].get("year_spans"), } if extra_ia: - extra['ia'] = extra_ia + extra["ia"] = extra_ia - name = clean(row.get('name')) + name = clean(row.get("name")) if not name: return None ce = fatcat_openapi_client.ContainerEntity( - issnl=row['issnl'], - issne=row.get('issne'), - issnp=row.get('issnp'), - container_type=None, # TODO + issnl=row["issnl"], + issne=row.get("issne"), + issnp=row.get("issnp"), + container_type=None, # TODO name=name, - publisher=clean(row.get('publisher')), - wikidata_qid=None, # TODO - extra=extra) + publisher=clean(row.get("publisher")), + wikidata_qid=None, # TODO + extra=extra, + ) return ce def try_update(self, ce): @@ -118,23 +134,26 @@ class JournalMetadataImporter(EntityImporter): # for now, only update KBART, and only if there is new content if not existing.extra: existing.extra = dict() - if ce.extra.get('kbart') and (existing.extra.get('kbart') != ce.extra['kbart']): - if not existing.extra.get('kbart'): - existing.extra['kbart'] = {} - existing.extra['kbart'].update(ce.extra['kbart']) + if ce.extra.get("kbart") and (existing.extra.get("kbart") != ce.extra["kbart"]): + if not existing.extra.get("kbart"): + existing.extra["kbart"] = {} + existing.extra["kbart"].update(ce.extra["kbart"]) self.api.update_container(self.get_editgroup_id(), existing.ident, existing) - self.counts['update'] += 1 + self.counts["update"] += 1 return False else: - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False # if we got this far, it's a bug raise NotImplementedError def insert_batch(self, batch): - self.api.create_container_auto_batch(fatcat_openapi_client.ContainerAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_container_auto_batch( + fatcat_openapi_client.ContainerAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py index d37424d6..8c7bfad4 100644 --- a/python/fatcat_tools/importers/jstor.py +++ b/python/fatcat_tools/importers/jstor.py @@ -1,4 +1,3 @@ - import datetime import json import sys @@ -12,10 +11,10 @@ from .crossref import CONTAINER_TYPE_MAP # TODO: more entries? JSTOR_CONTRIB_MAP = { - 'author': 'author', - 'editor': 'editor', - 'translator': 'translator', - 'illustrator': 'illustrator', + "author": "author", + "editor": "editor", + "translator": "translator", + "illustrator": "illustrator", } JSTOR_TYPE_MAP = { @@ -26,6 +25,7 @@ JSTOR_TYPE_MAP = { "research-article": "article-journal", } + class JstorImporter(EntityImporter): """ Importer for JSTOR bulk XML metadata (eg, from their Early Journals @@ -34,17 +34,18 @@ class JstorImporter(EntityImporter): def __init__(self, api, issn_map_file, **kwargs): - eg_desc = kwargs.get('editgroup_description', - "Automated import of JSTOR XML metadata") - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.JstorImporter') - super().__init__(api, + eg_desc = kwargs.get("editgroup_description", "Automated import of JSTOR XML metadata") + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.JstorImporter") + super().__init__( + api, issn_map_file=issn_map_file, editgroup_description=eg_desc, editgroup_extra=eg_extra, - **kwargs) + **kwargs + ) - self.create_containers = kwargs.get('create_containers', True) + self.create_containers = kwargs.get("create_containers", True) self.read_issn_map_file(issn_map_file) @@ -62,20 +63,22 @@ class JstorImporter(EntityImporter): extra = dict() extra_jstor = dict() - release_type = JSTOR_TYPE_MAP.get(article['article-type']) + release_type = JSTOR_TYPE_MAP.get(article["article-type"]) title = article_meta.find("article-title") if title and title.get_text(): - title = title.get_text().replace('\n', ' ').strip() + title = title.get_text().replace("\n", " ").strip() elif title and not title.get_text(): title = None - if not title and release_type.startswith('review') and article_meta.product.source: - title = "Review: {}".format(article_meta.product.source.replace('\n', ' ').get_text()) + if not title and release_type.startswith("review") and article_meta.product.source: + title = "Review: {}".format( + article_meta.product.source.replace("\n", " ").get_text() + ) if not title: return None - if title.endswith('.'): + if title.endswith("."): title = title[:-1] if "[Abstract]" in title: @@ -93,12 +96,12 @@ class JstorImporter(EntityImporter): title = title[1:-1] # JSTOR journal-id - journal_ids = [j.string for j in journal_meta.find_all('journal-id')] + journal_ids = [j.string for j in journal_meta.find_all("journal-id")] if journal_ids: - extra_jstor['journal_ids'] = journal_ids + extra_jstor["journal_ids"] = journal_ids - journal_title = journal_meta.find("journal-title").get_text().replace('\n', ' ') - publisher = journal_meta.find("publisher-name").get_text().replace('\n', ' ') + journal_title = journal_meta.find("journal-title").get_text().replace("\n", " ") + publisher = journal_meta.find("publisher-name").get_text().replace("\n", " ") issn = journal_meta.find("issn") if issn: issn = issn.string @@ -113,13 +116,18 @@ class JstorImporter(EntityImporter): container_id = self.lookup_issnl(issnl) # create container if it doesn't exist - if (container_id is None and self.create_containers and (issnl is not None) - and journal_title): + if ( + container_id is None + and self.create_containers + and (issnl is not None) + and journal_title + ): ce = fatcat_openapi_client.ContainerEntity( issnl=issnl, publisher=publisher, container_type=self.map_container_type(release_type), - name=clean(journal_title, force_xml=True)) + name=clean(journal_title, force_xml=True), + ) ce_edit = self.create_container(ce) container_id = ce_edit.ident self._issnl_id_map[issnl] = container_id @@ -132,8 +140,8 @@ class JstorImporter(EntityImporter): if jstor_id: jstor_id = jstor_id.string.strip() if not jstor_id and doi: - assert doi.startswith('10.2307/') - jstor_id = doi.replace('10.2307/', '') + assert doi.startswith("10.2307/") + jstor_id = doi.replace("10.2307/", "") assert jstor_id and int(jstor_id) contribs = [] @@ -142,13 +150,13 @@ class JstorImporter(EntityImporter): for c in cgroup.find_all("contrib"): given = c.find("given-names") if given: - given = clean(given.get_text().replace('\n', ' ')) + given = clean(given.get_text().replace("\n", " ")) surname = c.find("surname") if surname: - surname = clean(surname.get_text().replace('\n', ' ')) + surname = clean(surname.get_text().replace("\n", " ")) raw_name = c.find("string-name") if raw_name: - raw_name = clean(raw_name.get_text().replace('\n', ' ')) + raw_name = clean(raw_name.get_text().replace("\n", " ")) if not raw_name: if given and surname: @@ -156,15 +164,17 @@ class JstorImporter(EntityImporter): elif surname: raw_name = surname - role = JSTOR_CONTRIB_MAP.get(c.get('contrib-type', 'author')) - if not role and c.get('contrib-type'): - sys.stderr.write("NOT IN JSTOR_CONTRIB_MAP: {}\n".format(c['contrib-type'])) - contribs.append(fatcat_openapi_client.ReleaseContrib( - role=role, - raw_name=raw_name, - given_name=given, - surname=surname, - )) + role = JSTOR_CONTRIB_MAP.get(c.get("contrib-type", "author")) + if not role and c.get("contrib-type"): + sys.stderr.write("NOT IN JSTOR_CONTRIB_MAP: {}\n".format(c["contrib-type"])) + contribs.append( + fatcat_openapi_client.ReleaseContrib( + role=role, + raw_name=raw_name, + given_name=given, + surname=surname, + ) + ) for i, contrib in enumerate(contribs): if contrib.raw_name != "et al.": @@ -172,14 +182,13 @@ class JstorImporter(EntityImporter): release_year = None release_date = None - pub_date = article_meta.find('pub-date') + pub_date = article_meta.find("pub-date") if pub_date and pub_date.year: release_year = int(pub_date.year.string) if pub_date.month and pub_date.day: release_date = datetime.date( - release_year, - int(pub_date.month.string), - int(pub_date.day.string)) + release_year, int(pub_date.month.string), int(pub_date.day.string) + ) if release_date.day == 1 and release_date.month == 1: # suspect jan 1st dates get set by JSTOR when actual # date not known (citation needed), so drop them @@ -208,10 +217,10 @@ class JstorImporter(EntityImporter): warnings.warn("MISSING MARC LANG: {}".format(cm.find("meta-value").string)) # JSTOR issue-id - if article_meta.find('issue-id'): - issue_id = clean(article_meta.find('issue-id').string) + if article_meta.find("issue-id"): + issue_id = clean(article_meta.find("issue-id").string) if issue_id: - extra_jstor['issue_id'] = issue_id + extra_jstor["issue_id"] = issue_id # everything in JSTOR is published release_stage = "published" @@ -225,14 +234,14 @@ class JstorImporter(EntityImporter): # group-title # pubmed: retraction refs if extra_jstor: - extra['jstor'] = extra_jstor + extra["jstor"] = extra_jstor if not extra: extra = None re = fatcat_openapi_client.ReleaseEntity( - #work_id + # work_id title=title, - #original_title + # original_title release_type=release_type, release_stage=release_stage, release_date=release_date, @@ -246,21 +255,16 @@ class JstorImporter(EntityImporter): pages=pages, publisher=publisher, language=language, - #license_slug - + # license_slug # content, mimetype, lang - #abstracts=abstracts, - + # abstracts=abstracts, contribs=contribs, - # key, year, container_name, title, locator # extra: volume, authors, issue, publisher, identifiers - #refs=refs, - + # refs=refs, # name, type, publisher, issnl # extra: issnp, issne, original_name, languages, country container_id=container_id, - extra=extra, ) return re @@ -289,12 +293,12 @@ class JstorImporter(EntityImporter): if existing and existing.ext_ids.jstor: # don't update if it already has JSTOR ID - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False elif existing: # but do update if only DOI was set existing.ext_ids.jstor = re.ext_ids.jstor - existing.extra['jstor'] = re.extra['jstor'] + existing.extra["jstor"] = re.extra["jstor"] # better release_type detection, and some other fields # TODO: don't do this over-writing in the future? assuming here # this is a one-time batch import over/extending bootstrap crossref @@ -304,17 +308,20 @@ class JstorImporter(EntityImporter): existing.contribs = re.contribs existing.language = re.language self.api.update_release(self.get_editgroup_id(), existing.ident, existing) - self.counts['update'] += 1 + self.counts["update"] += 1 return False return True def insert_batch(self, batch): - self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_release_auto_batch( + fatcat_openapi_client.ReleaseAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) def parse_file(self, handle): @@ -325,8 +332,9 @@ class JstorImporter(EntityImporter): for article in soup.find_all("article"): resp = self.parse_record(article) print(json.dumps(resp)) - #sys.exit(-1) + # sys.exit(-1) + -if __name__=='__main__': +if __name__ == "__main__": parser = JstorImporter(None, None) parser.parse_file(open(sys.argv[1])) diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py index 09807276..7c2a6a87 100644 --- a/python/fatcat_tools/importers/matched.py +++ b/python/fatcat_tools/importers/matched.py @@ -1,4 +1,3 @@ - import fatcat_openapi_client from fatcat_tools.normal import clean_doi @@ -32,13 +31,13 @@ class MatchedImporter(EntityImporter): def __init__(self, api, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "Import of large-scale file-to-release match results. Source of metadata varies." - eg_extra = kwargs.pop('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.MatchedImporter') - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Import of large-scale file-to-release match results. Source of metadata varies." + ) + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.MatchedImporter") + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) self.default_link_rel = kwargs.get("default_link_rel", "web") self.default_mimetype = kwargs.get("default_mimetype", None) @@ -46,14 +45,14 @@ class MatchedImporter(EntityImporter): return True def parse_record(self, obj): - dois = [d.lower() for d in obj.get('dois', [])] + dois = [d.lower() for d in obj.get("dois", [])] # lookup dois re_list = set() for doi in dois: doi = clean_doi(doi) if not doi: - self.counts['skip-bad-doi'] += 1 + self.counts["skip-bad-doi"] += 1 return None try: re = self.api.lookup_release(doi=doi) @@ -62,13 +61,22 @@ class MatchedImporter(EntityImporter): raise err re = None if re is None: - #print("DOI not found: {}".format(doi)) + # print("DOI not found: {}".format(doi)) pass else: re_list.add(re.ident) # look up other external ids - for extid_type in ('arxiv', 'pmid', 'pmcid', 'jstor', 'wikidata_qid', 'core', 'isbn13', 'ark'): + for extid_type in ( + "arxiv", + "pmid", + "pmcid", + "jstor", + "wikidata_qid", + "core", + "isbn13", + "ark", + ): extid = obj.get(extid_type) if extid: try: @@ -84,49 +92,47 @@ class MatchedImporter(EntityImporter): release_ids = list(re_list) if len(release_ids) == 0: - self.counts['skip-no-releases'] += 1 + self.counts["skip-no-releases"] += 1 return None if len(release_ids) > SANE_MAX_RELEASES: - self.counts['skip-too-many-releases'] += 1 + self.counts["skip-too-many-releases"] += 1 return None # parse URLs and CDX urls = set() - for url in obj.get('urls', []): + for url in obj.get("urls", []): url = make_rel_url(url, default_link_rel=self.default_link_rel) if url is not None: urls.add(url) - for cdx in obj.get('cdx', []): - original = cdx['url'] - if cdx.get('dt'): - wayback = "https://web.archive.org/web/{}/{}".format( - cdx['dt'], - original) + for cdx in obj.get("cdx", []): + original = cdx["url"] + if cdx.get("dt"): + wayback = "https://web.archive.org/web/{}/{}".format(cdx["dt"], original) urls.add(("webarchive", wayback)) url = make_rel_url(original, default_link_rel=self.default_link_rel) if url is not None: urls.add(url) urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls] if len(urls) == 0: - self.counts['skip-no-urls'] += 1 + self.counts["skip-no-urls"] += 1 return None if len(urls) > SANE_MAX_URLS: - self.counts['skip-too-many-urls'] += 1 + self.counts["skip-too-many-urls"] += 1 return None - size = obj.get('size') + size = obj.get("size") if size: size = int(size) - mimetype = obj.get('mimetype', self.default_mimetype) + mimetype = obj.get("mimetype", self.default_mimetype) if not mimetype and urls: - if urls[0].url.endswith('.pdf'): - mimetype = 'application/pdf' + if urls[0].url.endswith(".pdf"): + mimetype = "application/pdf" fe = fatcat_openapi_client.FileEntity( - md5=obj.get('md5'), - sha1=obj['sha1'], - sha256=obj.get('sha256'), + md5=obj.get("md5"), + sha1=obj["sha1"], + sha256=obj.get("sha256"), size=size, mimetype=mimetype, release_ids=release_ids, @@ -149,28 +155,30 @@ class MatchedImporter(EntityImporter): combined_release_ids = list(set(fe.release_ids + existing.release_ids)) if set(combined_release_ids) == set(existing.release_ids) and len(existing.urls) > 0: # no new release matches *and* there are already existing URLs - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False # check for edit conflicts if existing.ident in [e.ident for e in self._edits_inflight]: - self.counts['skip-update-inflight'] += 1 + self.counts["skip-update-inflight"] += 1 return False # minimum viable "existing" URL cleanup to fix dupes and broken links: # remove 'None' wayback URLs, and set archive.org rel 'archive' - existing.urls = [u for u in existing.urls if not ('://web.archive.org/web/None/' in u.url)] + existing.urls = [ + u for u in existing.urls if not ("://web.archive.org/web/None/" in u.url) + ] for i in range(len(existing.urls)): u = existing.urls[i] - if u.rel == 'repository' and '://archive.org/download/' in u.url: - existing.urls[i].rel = 'archive' + if u.rel == "repository" and "://archive.org/download/" in u.url: + existing.urls[i].rel = "archive" # special case: if importing *new* from archive.org arxiv collections, # blow away any existing release_id mappings; this is a direct arxiv_id # map. This *should* be safe to run in all matched imports. is_arxiv = False for u in fe.urls: - if 'archive.org/download/arxiv' in u.url.lower(): + if "archive.org/download/arxiv" in u.url.lower(): is_arxiv = True break if is_arxiv and fe.release_ids: @@ -178,14 +186,16 @@ class MatchedImporter(EntityImporter): # merge the existing into this one and update existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls])) - existing.urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in existing.urls] + existing.urls = [ + fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in existing.urls + ] if len(existing.urls) > SANE_MAX_URLS: - self.counts['skip-update-too-many-url'] += 1 + self.counts["skip-update-too-many-url"] += 1 return None existing.release_ids = list(set(fe.release_ids + existing.release_ids)) if len(existing.release_ids) > SANE_MAX_RELEASES: - self.counts['skip-update-too-many-releases'] += 1 + self.counts["skip-update-too-many-releases"] += 1 return None existing.mimetype = existing.mimetype or fe.mimetype existing.size = existing.size or fe.size @@ -194,12 +204,15 @@ class MatchedImporter(EntityImporter): existing.sha256 = existing.sha256 or fe.sha256 edit = self.api.update_file(self.get_editgroup_id(), existing.ident, existing) self._edits_inflight.append(edit) - self.counts['update'] += 1 + self.counts["update"] += 1 return False def insert_batch(self, batch): - self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_file_auto_batch( + fatcat_openapi_client.FileAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py index 3bdd23a1..b514e6e5 100644 --- a/python/fatcat_tools/importers/orcid.py +++ b/python/fatcat_tools/importers/orcid.py @@ -1,4 +1,3 @@ - import sys import fatcat_openapi_client @@ -8,7 +7,7 @@ from .common import EntityImporter, clean def value_or_none(e): if type(e) == dict: - e = e.get('value') + e = e.get("value") if type(e) == str and len(e) == 0: e = None # TODO: this is probably bogus; patched in desperation; remove? @@ -21,18 +20,17 @@ def value_or_none(e): return None return e -class OrcidImporter(EntityImporter): +class OrcidImporter(EntityImporter): def __init__(self, api, **kwargs): - eg_desc = kwargs.get('editgroup_description', - "Automated import of ORCID metadata, from official bulk releases.") - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.OrcidImporter') - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = kwargs.get( + "editgroup_description", + "Automated import of ORCID metadata, from official bulk releases.", + ) + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.OrcidImporter") + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) def want(self, raw_record): return True @@ -43,16 +41,16 @@ class OrcidImporter(EntityImporter): returns a CreatorEntity """ - if 'person' not in obj: + if "person" not in obj: return False - name = obj['person']['name'] + name = obj["person"]["name"] if not name: return None extra = None - given = value_or_none(name.get('given-names')) - sur = value_or_none(name.get('family-name')) - display = value_or_none(name.get('credit-name')) + given = value_or_none(name.get("given-names")) + sur = value_or_none(name.get("family-name")) + display = value_or_none(name.get("credit-name")) if display is None: # TODO: sorry human beings if given and sur: @@ -61,7 +59,7 @@ class OrcidImporter(EntityImporter): display = sur elif given: display = given - orcid = obj['orcid-identifier']['path'] + orcid = obj["orcid-identifier"]["path"] if not self.is_orcid(orcid): sys.stderr.write("Bad ORCID: {}\n".format(orcid)) return None @@ -74,7 +72,8 @@ class OrcidImporter(EntityImporter): given_name=clean(given), surname=clean(sur), display_name=display, - extra=extra) + extra=extra, + ) return ce def try_update(self, raw_record): @@ -88,14 +87,17 @@ class OrcidImporter(EntityImporter): # eventually we'll want to support "updates", but for now just skip if # entity already exists if existing: - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False return True def insert_batch(self, batch): - self.api.create_creator_auto_batch(fatcat_openapi_client.CreatorAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_creator_auto_batch( + fatcat_openapi_client.CreatorAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py index 00ad54d0..cfdafcf7 100644 --- a/python/fatcat_tools/importers/pubmed.py +++ b/python/fatcat_tools/importers/pubmed.py @@ -1,4 +1,3 @@ - import datetime import json import sys @@ -13,42 +12,42 @@ from .common import LANG_MAP_MARC, EntityImporter, clean # from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly PUBMED_RELEASE_TYPE_MAP = { - #Adaptive Clinical Trial + # Adaptive Clinical Trial "Address": "speech", "Autobiography": "book", - #Bibliography + # Bibliography "Biography": "book", - #Case Reports + # Case Reports "Classical Article": "article-journal", - #Clinical Conference - #Clinical Study - #Clinical Trial - #Clinical Trial, Phase I - #Clinical Trial, Phase II - #Clinical Trial, Phase III - #Clinical Trial, Phase IV - #Clinical Trial Protocol - #Clinical Trial, Veterinary - #Collected Works - #Comparative Study - #Congress - #Consensus Development Conference - #Consensus Development Conference, NIH - #Controlled Clinical Trial + # Clinical Conference + # Clinical Study + # Clinical Trial + # Clinical Trial, Phase I + # Clinical Trial, Phase II + # Clinical Trial, Phase III + # Clinical Trial, Phase IV + # Clinical Trial Protocol + # Clinical Trial, Veterinary + # Collected Works + # Comparative Study + # Congress + # Consensus Development Conference + # Consensus Development Conference, NIH + # Controlled Clinical Trial "Dataset": "dataset", - #Dictionary - #Directory - #Duplicate Publication + # Dictionary + # Directory + # Duplicate Publication "Editorial": "editorial", - #English Abstract # doesn't indicate that this is abstract-only - #Equivalence Trial - #Evaluation Studies - #Expression of Concern - #Festschrift - #Government Document - #Guideline + # English Abstract # doesn't indicate that this is abstract-only + # Equivalence Trial + # Evaluation Studies + # Expression of Concern + # Festschrift + # Government Document + # Guideline "Historical Article": "article-journal", - #Interactive Tutorial + # Interactive Tutorial "Interview": "interview", "Introductory Journal Article": "article-journal", "Journal Article": "article-journal", @@ -56,53 +55,65 @@ PUBMED_RELEASE_TYPE_MAP = { "Legal Case": "legal_case", "Legislation": "legislation", "Letter": "letter", - #Meta-Analysis - #Multicenter Study - #News + # Meta-Analysis + # Multicenter Study + # News "Newspaper Article": "article-newspaper", - #Observational Study - #Observational Study, Veterinary - #Overall - #Patient Education Handout - #Periodical Index - #Personal Narrative - #Portrait - #Practice Guideline - #Pragmatic Clinical Trial - #Publication Components - #Publication Formats - #Publication Type Category - #Randomized Controlled Trial - #Research Support, American Recovery and Reinvestment Act - #Research Support, N.I.H., Extramural - #Research Support, N.I.H., Intramural - #Research Support, Non-U.S. Gov't Research Support, U.S. Gov't, Non-P.H.S. - #Research Support, U.S. Gov't, P.H.S. - #Review # in the "literature review" sense, not "product review" - #Scientific Integrity Review - #Study Characteristics - #Support of Research - #Systematic Review + # Observational Study + # Observational Study, Veterinary + # Overall + # Patient Education Handout + # Periodical Index + # Personal Narrative + # Portrait + # Practice Guideline + # Pragmatic Clinical Trial + # Publication Components + # Publication Formats + # Publication Type Category + # Randomized Controlled Trial + # Research Support, American Recovery and Reinvestment Act + # Research Support, N.I.H., Extramural + # Research Support, N.I.H., Intramural + # Research Support, Non-U.S. Gov't Research Support, U.S. Gov't, Non-P.H.S. + # Research Support, U.S. Gov't, P.H.S. + # Review # in the "literature review" sense, not "product review" + # Scientific Integrity Review + # Study Characteristics + # Support of Research + # Systematic Review "Technical Report": "report", - #Twin Study - #Validation Studies - #Video-Audio Media - #Webcasts + # Twin Study + # Validation Studies + # Video-Audio Media + # Webcasts } MONTH_ABBR_MAP = { - "Jan": 1, "01": 1, - "Feb": 2, "02": 2, - "Mar": 3, "03": 3, - "Apr": 4, "04": 4, - "May": 5, "05": 5, - "Jun": 6, "06": 6, - "Jul": 7, "07": 7, - "Aug": 8, "08": 8, - "Sep": 9, "09": 9, - "Oct": 10, "10": 10, - "Nov": 11, "11": 11, - "Dec": 12, "12": 12, + "Jan": 1, + "01": 1, + "Feb": 2, + "02": 2, + "Mar": 3, + "03": 3, + "Apr": 4, + "04": 4, + "May": 5, + "05": 5, + "Jun": 6, + "06": 6, + "Jul": 7, + "07": 7, + "Aug": 8, + "08": 8, + "Sep": 9, + "09": 9, + "Oct": 10, + "10": 10, + "Nov": 11, + "11": 11, + "Dec": 12, + "12": 12, } # From: https://www.ncbi.nlm.nih.gov/books/NBK7249/ @@ -295,11 +306,10 @@ COUNTRY_NAME_MAP = { "United Kingdom": "gb", "United States": "us", "Uruguay": "uy", - # Additions from running over large files "Bosnia and Herzegovina": "ba", - #"International" - "China (Republic : 1949- )": "tw", # pretty sure this is tw not cn + # "International" + "China (Republic : 1949- )": "tw", # pretty sure this is tw not cn "Russia (Federation)": "ru", "Scotland": "gb", "England": "gb", @@ -320,18 +330,21 @@ class PubmedImporter(EntityImporter): def __init__(self, api, issn_map_file, lookup_refs=True, **kwargs): - eg_desc = kwargs.get('editgroup_description', - "Automated import of PubMed/MEDLINE XML metadata") - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.PubmedImporter') - super().__init__(api, + eg_desc = kwargs.get( + "editgroup_description", "Automated import of PubMed/MEDLINE XML metadata" + ) + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.PubmedImporter") + super().__init__( + api, issn_map_file=issn_map_file, editgroup_description=eg_desc, editgroup_extra=eg_extra, - **kwargs) + **kwargs + ) self.lookup_refs = lookup_refs - self.create_containers = kwargs.get('create_containers', True) + self.create_containers = kwargs.get("create_containers", True) self.read_issn_map_file(issn_map_file) def want(self, obj): @@ -365,15 +378,15 @@ class PubmedImporter(EntityImporter): release_type = PUBMED_RELEASE_TYPE_MAP[pub_type.string] break if pub_types: - extra_pubmed['pub_types'] = pub_types + extra_pubmed["pub_types"] = pub_types if medline.Article.PublicationTypeList.find(string="Retraction of Publication"): release_type = "retraction" retraction_of = medline.find("CommentsCorrections", RefType="RetractionOf") if retraction_of: if retraction_of.RefSource: - extra_pubmed['retraction_of_raw'] = retraction_of.RefSource.string + extra_pubmed["retraction_of_raw"] = retraction_of.RefSource.string if retraction_of.PMID: - extra_pubmed['retraction_of_pmid'] = retraction_of.PMID.string + extra_pubmed["retraction_of_pmid"] = retraction_of.PMID.string # everything in medline is published release_stage = "published" @@ -388,18 +401,18 @@ class PubmedImporter(EntityImporter): elif medline.find("CommentsCorrections", RefType="ExpressionOfConcernIn"): withdrawn_status = "concern" - pages = medline.find('MedlinePgn') + pages = medline.find("MedlinePgn") if pages: pages = pages.string - title = medline.Article.ArticleTitle.get_text() # always present + title = medline.Article.ArticleTitle.get_text() # always present if title: - title = title.replace('\n', ' ') - if title.endswith('.'): + title = title.replace("\n", " ") + if title.endswith("."): title = title[:-1] # this hides some "special" titles, but the vast majority are # translations; translations don't always include the original_title - if title.startswith('[') and title.endswith(']'): + if title.startswith("[") and title.endswith("]"): title = title[1:-1] else: # will filter out later @@ -408,8 +421,8 @@ class PubmedImporter(EntityImporter): original_title = medline.Article.find("VernacularTitle", recurse=False) if original_title: original_title = original_title.get_text() or None - original_title = original_title.replace('\n', ' ') - if original_title and original_title.endswith('.'): + original_title = original_title.replace("\n", " ") + if original_title and original_title.endswith("."): original_title = original_title[:-1] if original_title and not title: @@ -428,7 +441,9 @@ class PubmedImporter(EntityImporter): else: language = LANG_MAP_MARC.get(language) if not language and not (medline.Article.Language.get_text() in LANG_MAP_MARC): - warnings.warn("MISSING MARC LANG: {}".format(medline.Article.Language.string)) + warnings.warn( + "MISSING MARC LANG: {}".format(medline.Article.Language.string) + ) ### Journal/Issue Metadata # MedlineJournalInfo is always present @@ -441,9 +456,9 @@ class PubmedImporter(EntityImporter): country_name = mji.Country.string.strip() country_code = COUNTRY_NAME_MAP.get(country_name) if country_code: - container_extra['country'] = country_code + container_extra["country"] = country_code elif country_name: - container_extra['country_name'] = country_name + container_extra["country_name"] = country_name if mji.find("ISSNLinking"): issnl = mji.ISSNLinking.string @@ -462,7 +477,7 @@ class PubmedImporter(EntityImporter): if issnl: container_id = self.lookup_issnl(issnl) - pub_date = medline.Article.find('ArticleDate') + pub_date = medline.Article.find("ArticleDate") if not pub_date: pub_date = journal.PubDate if not pub_date: @@ -476,7 +491,8 @@ class PubmedImporter(EntityImporter): release_date = datetime.date( release_year, MONTH_ABBR_MAP[pub_date.Month.string], - int(pub_date.Day.string)) + int(pub_date.Day.string), + ) release_date = release_date.isoformat() except ValueError as ve: print("bad date, skipping: {}".format(ve), file=sys.stderr) @@ -486,25 +502,35 @@ class PubmedImporter(EntityImporter): if len(medline_date) >= 4 and medline_date[:4].isdigit(): release_year = int(medline_date[:4]) if release_year < 1300 or release_year > 2040: - print("bad medline year, skipping: {}".format(release_year), file=sys.stderr) + print( + "bad medline year, skipping: {}".format(release_year), file=sys.stderr + ) release_year = None else: - print("unparsable medline date, skipping: {}".format(medline_date), file=sys.stderr) + print( + "unparsable medline date, skipping: {}".format(medline_date), + file=sys.stderr, + ) if journal.find("Title"): container_name = journal.Title.get_text() - if (container_id is None and self.create_containers and (issnl is not None) - and container_name): + if ( + container_id is None + and self.create_containers + and (issnl is not None) + and container_name + ): # name, type, publisher, issnl # extra: original_name, languages, country ce = fatcat_openapi_client.ContainerEntity( name=container_name, - container_type='journal', - #NOTE: publisher not included + container_type="journal", + # NOTE: publisher not included issnl=issnl, issnp=issnp, - extra=(container_extra or None)) + extra=(container_extra or None), + ) ce_edit = self.create_container(ce) container_id = ce_edit.ident self._issnl_id_map[issnl] = container_id @@ -521,8 +547,10 @@ class PubmedImporter(EntityImporter): # "All abstracts are in English" abstracts = [] primary_abstract = medline.find("Abstract") - if primary_abstract and primary_abstract.AbstractText.get('NlmCategory'): - joined = "\n".join([m.get_text() for m in primary_abstract.find_all("AbstractText")]) + if primary_abstract and primary_abstract.AbstractText.get("NlmCategory"): + joined = "\n".join( + [m.get_text() for m in primary_abstract.find_all("AbstractText")] + ) abst = fatcat_openapi_client.ReleaseAbstract( content=joined, mimetype="text/plain", @@ -539,7 +567,7 @@ class PubmedImporter(EntityImporter): ) if abst.content: abstracts.append(abst) - if abstract.find('math'): + if abstract.find("math"): abst = fatcat_openapi_client.ReleaseAbstract( # strip the <AbstractText> tags content=str(abstract)[14:-15], @@ -551,8 +579,8 @@ class PubmedImporter(EntityImporter): other_abstracts = medline.find_all("OtherAbstract") for other in other_abstracts: lang = "en" - if other.get('Language'): - lang = LANG_MAP_MARC.get(other['Language']) + if other.get("Language"): + lang = LANG_MAP_MARC.get(other["Language"]) abst = fatcat_openapi_client.ReleaseAbstract( content=other.AbstractText.get_text().strip(), mimetype="text/plain", @@ -572,15 +600,15 @@ class PubmedImporter(EntityImporter): surname = None raw_name = None if author.ForeName: - given_name = author.ForeName.get_text().replace('\n', ' ') + given_name = author.ForeName.get_text().replace("\n", " ") if author.LastName: - surname = author.LastName.get_text().replace('\n', ' ') + surname = author.LastName.get_text().replace("\n", " ") if given_name and surname: raw_name = "{} {}".format(given_name, surname) elif surname: raw_name = surname if not raw_name and author.CollectiveName and author.CollectiveName.get_text(): - raw_name = author.CollectiveName.get_text().replace('\n', ' ') + raw_name = author.CollectiveName.get_text().replace("\n", " ") contrib_extra = dict() orcid = author.find("Identifier", Source="ORCID") if orcid: @@ -590,7 +618,7 @@ class PubmedImporter(EntityImporter): orcid = orcid.replace("http://orcid.org/", "") elif orcid.startswith("https://orcid.org/"): orcid = orcid.replace("https://orcid.org/", "") - elif '-' not in orcid: + elif "-" not in orcid: orcid = "{}-{}-{}-{}".format( orcid[0:4], orcid[4:8], @@ -598,27 +626,31 @@ class PubmedImporter(EntityImporter): orcid[12:16], ) creator_id = self.lookup_orcid(orcid) - contrib_extra['orcid'] = orcid + contrib_extra["orcid"] = orcid affiliations = author.find_all("Affiliation") raw_affiliation = None if affiliations: - raw_affiliation = affiliations[0].get_text().replace('\n', ' ') + raw_affiliation = affiliations[0].get_text().replace("\n", " ") if len(affiliations) > 1: - contrib_extra['more_affiliations'] = [ra.get_text().replace('\n', ' ') for ra in affiliations[1:]] + contrib_extra["more_affiliations"] = [ + ra.get_text().replace("\n", " ") for ra in affiliations[1:] + ] if author.find("EqualContrib"): # TODO: schema for this? - contrib_extra['equal'] = True - contribs.append(fatcat_openapi_client.ReleaseContrib( - raw_name=raw_name, - given_name=given_name, - surname=surname, - role="author", - raw_affiliation=raw_affiliation, - creator_id=creator_id, - extra=contrib_extra, - )) - - if medline.AuthorList['CompleteYN'] == 'N': + contrib_extra["equal"] = True + contribs.append( + fatcat_openapi_client.ReleaseContrib( + raw_name=raw_name, + given_name=given_name, + surname=surname, + role="author", + raw_affiliation=raw_affiliation, + creator_id=creator_id, + extra=contrib_extra, + ) + ) + + if medline.AuthorList["CompleteYN"] == "N": contribs.append(fatcat_openapi_client.ReleaseContrib(raw_name="et al.")) for i, contrib in enumerate(contribs): @@ -633,7 +665,7 @@ class PubmedImporter(EntityImporter): # note that Reference always exists within a ReferenceList, but # that there may be multiple ReferenceList (eg, sometimes one per # Reference) - for ref in pubmed.find_all('Reference'): + for ref in pubmed.find_all("Reference"): ref_extra = dict() ref_doi = ref.find("ArticleId", IdType="doi") if ref_doi: @@ -643,22 +675,24 @@ class PubmedImporter(EntityImporter): ref_pmid = clean_pmid(ref_pmid.string) ref_release_id = None if ref_doi: - ref_extra['doi'] = ref_doi + ref_extra["doi"] = ref_doi if self.lookup_refs: ref_release_id = self.lookup_doi(ref_doi) if ref_pmid: - ref_extra['pmid'] = ref_pmid + ref_extra["pmid"] = ref_pmid if self.lookup_refs: ref_release_id = self.lookup_pmid(ref_pmid) ref_raw = ref.Citation if ref_raw: - ref_extra['unstructured'] = ref_raw.get_text() + ref_extra["unstructured"] = ref_raw.get_text() if not ref_extra: ref_extra = None - refs.append(fatcat_openapi_client.ReleaseRef( - target_release_id=ref_release_id, - extra=ref_extra, - )) + refs.append( + fatcat_openapi_client.ReleaseRef( + target_release_id=ref_release_id, + extra=ref_extra, + ) + ) if not refs: refs = None @@ -669,7 +703,7 @@ class PubmedImporter(EntityImporter): # group-title # pubmed: retraction refs if extra_pubmed: - extra['pubmed'] = extra_pubmed + extra["pubmed"] = extra_pubmed if not extra: extra = None @@ -690,14 +724,14 @@ class PubmedImporter(EntityImporter): doi=doi, pmid=pmid, pmcid=pmcid, - #isbn13 # never in Article + # isbn13 # never in Article ), volume=volume, issue=issue, pages=pages, - #publisher # not included? + # publisher # not included? language=language, - #license_slug # not in MEDLINE + # license_slug # not in MEDLINE abstracts=abstracts, contribs=contribs, refs=refs, @@ -725,21 +759,22 @@ class PubmedImporter(EntityImporter): raise err if existing and existing.ext_ids.pmid and existing.ext_ids.pmid != re.ext_ids.pmid: warn_str = "PMID/DOI mismatch: release {}, pmid {} != {}".format( - existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid) + existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid + ) warnings.warn(warn_str) - self.counts['warn-pmid-doi-mismatch'] += 1 + self.counts["warn-pmid-doi-mismatch"] += 1 # don't clobber DOI, but do group together re.ext_ids.doi = None re.work_id = existing.work_id if existing and not self.do_updates: - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False if existing and existing.ext_ids.pmid and (existing.refs or not re.refs): # TODO: any other reasons to do an update? # don't update if it already has PMID - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False elif existing: # but do update if only DOI was set @@ -750,12 +785,12 @@ class PubmedImporter(EntityImporter): existing.container_id = existing.container_id or re.container_id existing.refs = existing.refs or re.refs existing.abstracts = existing.abstracts or re.abstracts - existing.extra['pubmed'] = re.extra['pubmed'] + existing.extra["pubmed"] = re.extra["pubmed"] # fix stub titles if existing.title in [ - "OUP accepted manuscript", - ]: + "OUP accepted manuscript", + ]: existing.title = re.title existing.original_title = existing.original_title or re.original_title @@ -770,8 +805,8 @@ class PubmedImporter(EntityImporter): existing.language = existing.language or re.language # update subtitle in-place first - if not existing.subtitle and existing.extra.get('subtitle'): - subtitle = existing.extra.pop('subtitle') + if not existing.subtitle and existing.extra.get("subtitle"): + subtitle = existing.extra.pop("subtitle") if type(subtitle) == list: subtitle = subtitle[0] if subtitle: @@ -781,13 +816,13 @@ class PubmedImporter(EntityImporter): try: self.api.update_release(self.get_editgroup_id(), existing.ident, existing) - self.counts['update'] += 1 + self.counts["update"] += 1 except fatcat_openapi_client.rest.ApiException as err: # there is a code path where we try to update the same release # twice in a row; if that happens, just skip # NOTE: API behavior might change in the future? if "release_edit_editgroup_id_ident_id_key" in err.body: - self.counts['skip-update-conflict'] += 1 + self.counts["skip-update-conflict"] += 1 return False else: raise err @@ -797,11 +832,14 @@ class PubmedImporter(EntityImporter): return True def insert_batch(self, batch): - self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_release_auto_batch( + fatcat_openapi_client.ReleaseAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) def parse_file(self, handle): @@ -812,8 +850,9 @@ class PubmedImporter(EntityImporter): for article in soup.find_all("PubmedArticle"): resp = self.parse_record(article) print(json.dumps(resp)) - #sys.exit(-1) + # sys.exit(-1) + -if __name__=='__main__': +if __name__ == "__main__": parser = PubmedImporter(None, None) parser.parse_file(open(sys.argv[1])) diff --git a/python/fatcat_tools/importers/shadow.py b/python/fatcat_tools/importers/shadow.py index 77205cee..78eeec7a 100644 --- a/python/fatcat_tools/importers/shadow.py +++ b/python/fatcat_tools/importers/shadow.py @@ -1,4 +1,3 @@ - import fatcat_openapi_client from fatcat_tools.normal import clean_doi, clean_isbn13, clean_pmid @@ -30,25 +29,25 @@ class ShadowLibraryImporter(EntityImporter): def __init__(self, api, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "Import of 'Shadow Library' file/release matches" - eg_extra = kwargs.pop('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ShadowLibraryImporter') - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Import of 'Shadow Library' file/release matches" + ) + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ShadowLibraryImporter") + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) self.default_link_rel = kwargs.get("default_link_rel", "web") def want(self, raw_record): """ Only want to import records with complete file-level metadata """ - fm = raw_record['file_meta'] - if not (fm['mimetype'] and fm['md5hex'] and fm['sha256hex'] and fm['size_bytes']): - self.counts['skip-file-meta-incomplete'] += 1 + fm = raw_record["file_meta"] + if not (fm["mimetype"] and fm["md5hex"] and fm["sha256hex"] and fm["size_bytes"]): + self.counts["skip-file-meta-incomplete"] += 1 return False - if fm['mimetype'] != 'application/pdf': - self.counts['skip-not-pdf'] += 1 + if fm["mimetype"] != "application/pdf": + self.counts["skip-not-pdf"] += 1 return False return True @@ -57,23 +56,23 @@ class ShadowLibraryImporter(EntityImporter): We do the release lookup in this method. Try DOI, then PMID, last ISBN13. """ - shadow_corpus = obj['shadow']['shadow_corpus'] + shadow_corpus = obj["shadow"]["shadow_corpus"] assert shadow_corpus == shadow_corpus.strip().lower() - doi = clean_doi(obj['shadow'].get('doi')) - pmid = clean_pmid(obj['shadow'].get('pmid')) - isbn13 = clean_isbn13(obj['shadow'].get('isbn13')) - shadow_id = obj['shadow'].get('shadow_id').strip() + doi = clean_doi(obj["shadow"].get("doi")) + pmid = clean_pmid(obj["shadow"].get("pmid")) + isbn13 = clean_isbn13(obj["shadow"].get("isbn13")) + shadow_id = obj["shadow"].get("shadow_id").strip() assert shadow_id - extra = { '{}_id'.format(shadow_corpus): shadow_id } - for (ext_type, ext_id) in [('doi', doi), ('pmid', pmid), ('isbn13', isbn13)]: + extra = {"{}_id".format(shadow_corpus): shadow_id} + for (ext_type, ext_id) in [("doi", doi), ("pmid", pmid), ("isbn13", isbn13)]: if not ext_id: continue - extra['{}_{}'.format(shadow_corpus, ext_type)] = ext_id + extra["{}_{}".format(shadow_corpus, ext_type)] = ext_id # lookup release via several idents re = None - for (ext_type, ext_id) in [('doi', doi), ('pmid', pmid), ('isbn13', isbn13)]: + for (ext_type, ext_id) in [("doi", doi), ("pmid", pmid), ("isbn13", isbn13)]: if not ext_id: continue try: @@ -86,29 +85,31 @@ class ShadowLibraryImporter(EntityImporter): break if not re: - self.counts['skip-release-not-found'] += 1 + self.counts["skip-release-not-found"] += 1 return None - release_ids = [re.ident,] + release_ids = [ + re.ident, + ] # parse single CDX into URLs (if exists) urls = [] - if obj.get('cdx'): - url = make_rel_url(obj['cdx']['url'], default_link_rel=self.default_link_rel) + if obj.get("cdx"): + url = make_rel_url(obj["cdx"]["url"], default_link_rel=self.default_link_rel) if url is not None: urls.append(url) wayback = "https://web.archive.org/web/{}/{}".format( - obj['cdx']['datetime'], - obj['cdx']['url']) + obj["cdx"]["datetime"], obj["cdx"]["url"] + ) urls.append(("webarchive", wayback)) urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls] fe = fatcat_openapi_client.FileEntity( - md5=obj['file_meta']['md5hex'], - sha1=obj['file_meta']['sha1hex'], - sha256=obj['file_meta']['sha256hex'], - size=int(obj['file_meta']['size_bytes']), - mimetype=obj['file_meta']['mimetype'] or None, + md5=obj["file_meta"]["md5hex"], + sha1=obj["file_meta"]["sha1hex"], + sha256=obj["file_meta"]["sha256hex"], + size=int(obj["file_meta"]["size_bytes"]), + mimetype=obj["file_meta"]["mimetype"] or None, release_ids=release_ids, urls=urls, extra=dict(shadows=extra), @@ -130,45 +131,50 @@ class ShadowLibraryImporter(EntityImporter): if not existing.extra: existing.extra = {} - if existing.extra.get('shadows') and list(fe.extra['shadows'].keys())[0] in existing.extra['shadows']: + if ( + existing.extra.get("shadows") + and list(fe.extra["shadows"].keys())[0] in existing.extra["shadows"] + ): # already imported from this shadow library; skip - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False # check for edit conflicts if existing.ident in [e.ident for e in self._edits_inflight]: - self.counts['skip-update-inflight'] += 1 + self.counts["skip-update-inflight"] += 1 return False if fe.sha1 in [e.sha1 for e in self._edits_inflight]: raise Exception("Inflight insert; shouldn't happen") # minimum viable "existing" URL cleanup to fix dupes and broken links: # remove 'None' wayback URLs, and set archive.org rel 'archive' - existing.urls = [u for u in existing.urls if not ('://web.archive.org/web/None/' in u.url)] + existing.urls = [ + u for u in existing.urls if not ("://web.archive.org/web/None/" in u.url) + ] for i in range(len(existing.urls)): u = existing.urls[i] - if u.rel == 'repository' and '://archive.org/download/' in u.url: - existing.urls[i].rel = 'archive' - if u.rel == 'social': - u.rel = 'academicsocial' + if u.rel == "repository" and "://archive.org/download/" in u.url: + existing.urls[i].rel = "archive" + if u.rel == "social": + u.rel = "academicsocial" # merge the existing into this one and update merged_urls = {} for u in fe.urls + existing.urls: merged_urls[u.url] = u existing.urls = list(merged_urls.values()) - if not existing.extra.get('shadows'): - existing.extra['shadows'] = fe.extra['shadows'] + if not existing.extra.get("shadows"): + existing.extra["shadows"] = fe.extra["shadows"] else: - existing.extra['shadows'].update(fe.extra['shadows']) + existing.extra["shadows"].update(fe.extra["shadows"]) # do these "plus ones" because we really want to do these updates when possible if len(existing.urls) > SANE_MAX_URLS + 1: - self.counts['skip-update-too-many-url'] += 1 + self.counts["skip-update-too-many-url"] += 1 return None existing.release_ids = list(set(fe.release_ids + existing.release_ids)) if len(existing.release_ids) > SANE_MAX_RELEASES + 1: - self.counts['skip-update-too-many-releases'] += 1 + self.counts["skip-update-too-many-releases"] += 1 return None existing.mimetype = existing.mimetype or fe.mimetype existing.size = existing.size or fe.size @@ -180,12 +186,15 @@ class ShadowLibraryImporter(EntityImporter): # group-level de-dupe edit.sha1 = existing.sha1 self._edits_inflight.append(edit) - self.counts['update'] += 1 + self.counts["update"] += 1 return False def insert_batch(self, batch): - self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_file_auto_batch( + fatcat_openapi_client.FileAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) diff --git a/python/fatcat_tools/importers/wayback_static.py b/python/fatcat_tools/importers/wayback_static.py index 196f86ff..22fefad3 100755 --- a/python/fatcat_tools/importers/wayback_static.py +++ b/python/fatcat_tools/importers/wayback_static.py @@ -33,22 +33,23 @@ REQ_SESSION = requests.Session() def parse_wbm_url(url): """Takes a wayback machine URL, and returns a tuple: - (timestamp, datetime, original_url) + (timestamp, datetime, original_url) """ - chunks = url.split('/') + chunks = url.split("/") assert len(chunks) >= 6 - assert chunks[2] == 'web.archive.org' - assert chunks[3] == 'web' - return (chunks[4], - parse_wbm_timestamp(chunks[4]), - '/'.join(chunks[5:])) + assert chunks[2] == "web.archive.org" + assert chunks[3] == "web" + return (chunks[4], parse_wbm_timestamp(chunks[4]), "/".join(chunks[5:])) + def test_parse_wbm_url(): u = "http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html" assert parse_wbm_url(u) == ( "20010712114837", datetime.datetime(2001, 7, 12, 11, 48, 37), - "http://www.dlib.org/dlib/june01/reich/06reich.html") + "http://www.dlib.org/dlib/june01/reich/06reich.html", + ) + def parse_wbm_timestamp(timestamp): """ @@ -56,7 +57,7 @@ def parse_wbm_timestamp(timestamp): python datetime object (UTC) """ # strip any "im_" or "id_" suffix - if timestamp.endswith('_'): + if timestamp.endswith("_"): timestamp = timestamp[:-3] # inflexible; require the full second-precision timestamp assert len(timestamp) == 14 @@ -66,11 +67,13 @@ def parse_wbm_timestamp(timestamp): day=int(timestamp[6:8]), hour=int(timestamp[8:10]), minute=int(timestamp[10:12]), - second=int(timestamp[12:14])) + second=int(timestamp[12:14]), + ) + def test_parse_wbm_timestamp(): - assert parse_wbm_timestamp("20010712114837") == \ - datetime.datetime(2001, 7, 12, 11, 48, 37) + assert parse_wbm_timestamp("20010712114837") == datetime.datetime(2001, 7, 12, 11, 48, 37) + def fetch_wbm(url): resp = REQ_SESSION.get(url) @@ -78,31 +81,35 @@ def fetch_wbm(url): assert resp.content return resp.content + def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None): sys.stderr.write(embed_url + "\n") - assert embed_url.startswith('/web/') - embed_url = embed_url.split('/') + assert embed_url.startswith("/web/") + embed_url = embed_url.split("/") timestamp = embed_url[2] - if timestamp.endswith('_'): + if timestamp.endswith("_"): timestamp = timestamp[:-3] - url = '/'.join(embed_url[3:]) - #print((timestamp, url)) - resp = REQ_SESSION.get(CDX_API_BASE, params=dict( - url=url, - closest=timestamp, - sort="closest", - resolveRevisits="true", - matchType="exact", - limit=1, - )) + url = "/".join(embed_url[3:]) + # print((timestamp, url)) + resp = REQ_SESSION.get( + CDX_API_BASE, + params=dict( + url=url, + closest=timestamp, + sort="closest", + resolveRevisits="true", + matchType="exact", + limit=1, + ), + ) resp.raise_for_status() - #print(resp.url) + # print(resp.url) if resp.content: - hit = resp.content.decode('utf-8').split('\n')[0] + hit = resp.content.decode("utf-8").split("\n")[0] if cdx_output: cdx_output.write(hit + "\n") - cdx = hit.split(' ') - cdx = [x if (x and x != '-') else None for x in cdx] + cdx = hit.split(" ") + cdx = [x if (x and x != "-") else None for x in cdx] webcapture_cdx = WebcaptureCdxLine( surt=cdx[0], timestamp=parse_wbm_timestamp(cdx[1]).isoformat() + "Z", @@ -113,9 +120,9 @@ def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None): sha256=None, ) if verify_hashes: - resp = REQ_SESSION.get(GWB_URL_BASE + "/{}id_/{}".format( - cdx[1], # raw timestamp - webcapture_cdx.url)) + resp = REQ_SESSION.get( + GWB_URL_BASE + "/{}id_/{}".format(cdx[1], webcapture_cdx.url) # raw timestamp + ) resp.raise_for_status() assert webcapture_cdx.sha1 == hashlib.sha1(resp.content).digest().hex() webcapture_cdx.sha256 = hashlib.sha256(resp.content).digest().hex() @@ -124,47 +131,50 @@ def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None): else: return None + def wayback_url_to_relative(url): """ Wayback URLs can be relative or absolute in rewritten documents. This function converts any form of rewritten URL to a relative (to web.archive.org) one, or returns None if it isn't a rewritten URL at all. """ - if url.startswith('https://web.archive.org/'): + if url.startswith("https://web.archive.org/"): url = url[23:] - elif url.startswith('http://web.archive.org/'): + elif url.startswith("http://web.archive.org/"): url = url[22:] - if url.startswith('/web/'): + if url.startswith("/web/"): return url else: return None + def extract_embeds(soup): embeds = set() # <link href=""> - for tag in soup.find_all('link', href=True): - if tag['rel'] not in ('stylesheet',): + for tag in soup.find_all("link", href=True): + if tag["rel"] not in ("stylesheet",): continue - url = wayback_url_to_relative(tag['href']) + url = wayback_url_to_relative(tag["href"]) if url: embeds.add(url) # <img src=""> - for tag in soup.find_all('img', src=True): - url = wayback_url_to_relative(tag['src']) + for tag in soup.find_all("img", src=True): + url = wayback_url_to_relative(tag["src"]) if url: embeds.add(url) # <script src=""> - for tag in soup.find_all('script', src=True): - url = wayback_url_to_relative(tag['src']) + for tag in soup.find_all("script", src=True): + url = wayback_url_to_relative(tag["src"]) if url: embeds.add(url) return list(embeds) + def static_wayback_webcapture(wayback_url, cdx_output=None): """ Given a complete wayback machine capture URL, like: @@ -177,36 +187,40 @@ def static_wayback_webcapture(wayback_url, cdx_output=None): wbm_html = fetch_wbm(wayback_url) raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url) - #with open(rewritten_path, 'r') as fp: + # with open(rewritten_path, 'r') as fp: # soup = BeautifulSoup(fp, "lxml") soup = BeautifulSoup(wbm_html, "lxml") embeds = extract_embeds(soup) - cdx_obj = lookup_cdx("/web/{}/{}".format(raw_timestamp, original_url), - cdx_output=cdx_output) + cdx_obj = lookup_cdx( + "/web/{}/{}".format(raw_timestamp, original_url), cdx_output=cdx_output + ) cdx_list = [cdx_obj] for url in embeds: cdx_obj = lookup_cdx(url, cdx_output=cdx_output) cdx_list.append(cdx_obj) - archive_urls = [WebcaptureUrl( - rel="wayback", - url="https://web.archive.org/web/", - )] + archive_urls = [ + WebcaptureUrl( + rel="wayback", + url="https://web.archive.org/web/", + ) + ] wc = WebcaptureEntity( cdx=cdx_list, timestamp=timestamp.isoformat() + "Z", original_url=original_url, archive_urls=archive_urls, - release_ids=None) + release_ids=None, + ) return wc + def auto_wayback_static(api, release_id, wayback_url, editgroup_id=None): """ Returns a tuple: (editgroup_id, edit). If failed, both are None """ raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url) - git_rev = subprocess.check_output( - ["git", "describe", "--always"]).strip().decode('utf-8') + git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8") release = api.get_release(release_id, expand="webcaptures") @@ -214,37 +228,44 @@ def auto_wayback_static(api, release_id, wayback_url, editgroup_id=None): for wc in release.webcaptures: if wc.original_url == original_url and wc.timestamp.date() == timestamp.date(): # skipping: already existed - print("release {} already had webcapture {} {}".format( - release_id, raw_timestamp, original_url)) + print( + "release {} already had webcapture {} {}".format( + release_id, raw_timestamp, original_url + ) + ) return (None, None) wc = static_wayback_webcapture(wayback_url) assert len(wc.cdx) >= 1 wc.release_ids = [release_id] if not editgroup_id: - eg = api.create_editgroup(Editgroup( - description="One-off import of static web content from wayback machine", - extra=dict( - git_rev=git_rev, - agent="fatcat_tools.auto_wayback_static"))) + eg = api.create_editgroup( + Editgroup( + description="One-off import of static web content from wayback machine", + extra=dict(git_rev=git_rev, agent="fatcat_tools.auto_wayback_static"), + ) + ) editgroup_id = eg.editgroup_id edit = api.create_webcapture(eg.editgroup_id, wc) return (editgroup_id, edit) + def main(): parser = argparse.ArgumentParser() - parser.add_argument('--verbose', - action='store_true', - help="verbose output") - parser.add_argument('wayback_url', - type=str, - help="URL of wayback capture to extract from") - parser.add_argument('--json-output', - type=argparse.FileType('w'), default=sys.stdout, - help="where to write out webcapture entity (as JSON)") - parser.add_argument('--cdx-output', - type=argparse.FileType('w'), default=None, - help="(optional) file to write out CDX stub") + parser.add_argument("--verbose", action="store_true", help="verbose output") + parser.add_argument("wayback_url", type=str, help="URL of wayback capture to extract from") + parser.add_argument( + "--json-output", + type=argparse.FileType("w"), + default=sys.stdout, + help="where to write out webcapture entity (as JSON)", + ) + parser.add_argument( + "--cdx-output", + type=argparse.FileType("w"), + default=None, + help="(optional) file to write out CDX stub", + ) args = parser.parse_args() @@ -254,5 +275,6 @@ def main(): wc_dict = api_client.sanitize_for_serialization(wc) print(json.dumps(wc_dict)) -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/python/fatcat_tools/kafka.py b/python/fatcat_tools/kafka.py index 32749db2..2a4451ad 100644 --- a/python/fatcat_tools/kafka.py +++ b/python/fatcat_tools/kafka.py @@ -1,4 +1,3 @@ - from confluent_kafka import KafkaException, Producer @@ -9,14 +8,15 @@ def kafka_fail_fast(err, msg): # TODO: should it be sys.exit(-1)? raise KafkaException(err) + def simple_kafka_producer(kafka_hosts): kafka_config = { - 'bootstrap.servers': kafka_hosts, - 'message.max.bytes': 20000000, # ~20 MBytes; broker-side max is ~50 MBytes - 'delivery.report.only.error': True, - 'default.topic.config': { - 'request.required.acks': -1, + "bootstrap.servers": kafka_hosts, + "message.max.bytes": 20000000, # ~20 MBytes; broker-side max is ~50 MBytes + "delivery.report.only.error": True, + "default.topic.config": { + "request.required.acks": -1, }, } return Producer(kafka_config) diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py index 9b65e768..12c58829 100644 --- a/python/fatcat_tools/normal.py +++ b/python/fatcat_tools/normal.py @@ -1,4 +1,3 @@ - """ A bunch of helpers to parse and normalize strings: external identifiers, free-form input, titles, etc. @@ -32,7 +31,7 @@ def clean_doi(raw: str) -> Optional[str]: if not raw: return None raw = raw.strip().lower() - if '\u2013' in raw: + if "\u2013" in raw: # Do not attempt to normalize "en dash" and since FC does not allow # unicode in DOI, treat this as invalid. return None @@ -54,7 +53,7 @@ def clean_doi(raw: str) -> Optional[str]: # fatcatd uses same REGEX, but Rust regex rejects these characters, while # python doesn't. DOIs are syntaxtually valid, but very likely to be typos; # for now filter them out. - for c in ('¬', ): + for c in ("¬",): if c in raw: return None @@ -70,6 +69,7 @@ def clean_doi(raw: str) -> Optional[str]: return None return raw + def test_clean_doi(): assert clean_doi("10.1234/asdf ") == "10.1234/asdf" assert clean_doi("10.1037//0002-9432.72.1.50") == "10.1037/0002-9432.72.1.50" @@ -81,7 +81,9 @@ def test_clean_doi(): assert clean_doi("doi:10.1234/asdf ") == "10.1234/asdf" assert clean_doi("doi:10.1234/ asdf ") is None assert clean_doi("10.4149/gpb¬_2017042") is None # "logical negation" character - assert clean_doi("10.6002/ect.2020.häyry") is None # this example via pubmed (pmid:32519616) + assert ( + clean_doi("10.6002/ect.2020.häyry") is None + ) # this example via pubmed (pmid:32519616) assert clean_doi("10.30466/vrf.2019.98547.2350\u200e") is None assert clean_doi("10.12016/j.issn.2096⁃1456.2017.06.014") is None assert clean_doi("10.4025/diálogos.v17i2.36030") is None @@ -92,6 +94,7 @@ def test_clean_doi(): ARXIV_ID_REGEX = re.compile(r"^(\d{4}.\d{4,5}|[a-z\-]+(\.[A-Z]{2})?/\d{7})(v\d+)?$") + def clean_arxiv_id(raw: str) -> Optional[str]: """ Removes any: @@ -113,6 +116,7 @@ def clean_arxiv_id(raw: str) -> Optional[str]: return None return raw + def test_clean_arxiv_id(): assert clean_arxiv_id("0806.2878v1") == "0806.2878v1" assert clean_arxiv_id("0806.2878") == "0806.2878" @@ -141,16 +145,18 @@ def test_clean_arxiv_id(): assert clean_arxiv_id("0806.v1") is None assert clean_arxiv_id("08062878v1") is None + def clean_wikidata_qid(raw): if not raw: return None raw = raw.strip() if len(raw.split()) != 1 or len(raw) < 2: return None - if raw[0] == 'Q' and raw[1] != '0' and raw[1:].isdigit(): + if raw[0] == "Q" and raw[1] != "0" and raw[1:].isdigit(): return raw return None + def test_clean_wikidata_qid(): assert clean_wikidata_qid("Q1234") == "Q1234" assert clean_wikidata_qid("Q1") == "Q1" @@ -163,6 +169,7 @@ def test_clean_wikidata_qid(): assert clean_wikidata_qid("qfba3") is None assert clean_wikidata_qid("") is None + def clean_pmid(raw: str) -> Optional[str]: if not raw: return None @@ -173,6 +180,7 @@ def clean_pmid(raw: str) -> Optional[str]: return raw return None + def test_clean_pmid(): assert clean_pmid("1234") == "1234" assert clean_pmid("1234 ") == "1234" @@ -180,6 +188,7 @@ def test_clean_pmid(): assert clean_pmid("qfba3") is None assert clean_pmid("") is None + def clean_pmcid(raw: str) -> Optional[str]: if not raw: return None @@ -190,6 +199,7 @@ def clean_pmcid(raw: str) -> Optional[str]: return raw return None + def clean_sha1(raw: str) -> Optional[str]: if not raw: return None @@ -203,13 +213,21 @@ def clean_sha1(raw: str) -> Optional[str]: return None return raw + def test_clean_sha1(): - assert clean_sha1("0fba3fba0e1937aa0297de3836b768b5dfb23d7b") == "0fba3fba0e1937aa0297de3836b768b5dfb23d7b" - assert clean_sha1("0fba3fba0e1937aa0297de3836b768b5dfb23d7b ") == "0fba3fba0e1937aa0297de3836b768b5dfb23d7b" + assert ( + clean_sha1("0fba3fba0e1937aa0297de3836b768b5dfb23d7b") + == "0fba3fba0e1937aa0297de3836b768b5dfb23d7b" + ) + assert ( + clean_sha1("0fba3fba0e1937aa0297de3836b768b5dfb23d7b ") + == "0fba3fba0e1937aa0297de3836b768b5dfb23d7b" + ) assert clean_sha1("fba3fba0e1937aa0297de3836b768b5dfb23d7b") is None assert clean_sha1("qfba3fba0e1937aa0297de3836b768b5dfb23d7b") is None assert clean_sha1("0fba3fb a0e1937aa0297de3836b768b5dfb23d7b") is None + def clean_sha256(raw: str) -> Optional[str]: raw = raw.strip().lower() if len(raw.split()) != 1: @@ -221,12 +239,18 @@ def clean_sha256(raw: str) -> Optional[str]: return None return raw + def test_clean_sha256(): - assert clean_sha256("6cc853f2ae75696b2e45f476c76b946b0fc2df7c52bb38287cb074aceb77bc7f") == "6cc853f2ae75696b2e45f476c76b946b0fc2df7c52bb38287cb074aceb77bc7f" + assert ( + clean_sha256("6cc853f2ae75696b2e45f476c76b946b0fc2df7c52bb38287cb074aceb77bc7f") + == "6cc853f2ae75696b2e45f476c76b946b0fc2df7c52bb38287cb074aceb77bc7f" + ) assert clean_sha256("0fba3fba0e1937aa0297de3836b768b5dfb23d7b") is None + ISSN_REGEX = re.compile(r"^\d{4}-\d{3}[0-9X]$") + def clean_issn(raw: str) -> Optional[str]: if not raw: return None @@ -237,14 +261,17 @@ def clean_issn(raw: str) -> Optional[str]: return None return raw + def test_clean_issn(): assert clean_issn("1234-4567") == "1234-4567" assert clean_issn("1234-456X") == "1234-456X" assert clean_issn("134-4567") is None assert clean_issn("123X-4567") is None + ISBN13_REGEX = re.compile(r"^97(?:8|9)-\d{1,5}-\d{1,7}-\d{1,6}-\d$") + def clean_isbn13(raw: str) -> Optional[str]: if not raw: return None @@ -253,14 +280,17 @@ def clean_isbn13(raw: str) -> Optional[str]: return None return raw + def test_clean_isbn13(): assert clean_isbn13("978-1-56619-909-4") == "978-1-56619-909-4" assert clean_isbn13("978-1-4028-9462-6") == "978-1-4028-9462-6" assert clean_isbn13("978-1-56619-909-4 ") == "978-1-56619-909-4" assert clean_isbn13("9781566199094") is None + ORCID_REGEX = re.compile(r"^\d{4}-\d{4}-\d{4}-\d{3}[\dX]$") + def clean_orcid(raw: str) -> Optional[str]: if not raw: return None @@ -269,6 +299,7 @@ def clean_orcid(raw: str) -> Optional[str]: return None return raw + def test_clean_orcid(): assert clean_orcid("0123-4567-3456-6789") == "0123-4567-3456-6789" assert clean_orcid("0123-4567-3456-678X") == "0123-4567-3456-678X" @@ -279,6 +310,7 @@ def test_clean_orcid(): HDL_REGEX = re.compile(r"^\d+(\.\d+)*/\S+$") + def clean_hdl(raw): if not raw: return None @@ -293,14 +325,17 @@ def clean_hdl(raw): raw = raw[15:] if not HDL_REGEX.fullmatch(raw): return None - if raw.startswith('10.'): + if raw.startswith("10."): return None return raw + def test_clean_hdl(): assert clean_hdl("20.500.23456/ABC/DUMMY") == "20.500.23456/abc/dummy" assert clean_hdl("hdl:20.500.23456/ABC/DUMMY") == "20.500.23456/abc/dummy" - assert clean_hdl("https://hdl.handle.net/20.500.23456/ABC/DUMMY") == "20.500.23456/abc/dummy" + assert ( + clean_hdl("https://hdl.handle.net/20.500.23456/ABC/DUMMY") == "20.500.23456/abc/dummy" + ) assert clean_hdl("http://hdl.handle.net/20.500.23456/ABC/DUMMY") == "20.500.23456/abc/dummy" assert clean_hdl("21.1234/aksjdfh") == "21.1234/aksjdfh" assert clean_hdl("2381/12775") == "2381/12775" @@ -326,7 +361,7 @@ def clean_str(thing: Optional[str], force_xml: bool = False) -> Optional[str]: """ if not thing: return None - unescape_html: Union[str, bool] = 'auto' + unescape_html: Union[str, bool] = "auto" if force_xml: unescape_html = True fixed = ftfy.fix_text(thing, unescape_html=unescape_html).strip() @@ -335,15 +370,17 @@ def clean_str(thing: Optional[str], force_xml: bool = False) -> Optional[str]: return None return fixed + def test_clean_str(): assert clean_str(None) is None - assert clean_str('') is None - assert clean_str('1') is None - assert clean_str('123') == '123' - assert clean_str('a&b') == 'a&b' - assert clean_str('<b>a&b</b>') == '<b>a&b</b>' - assert clean_str('<b>a&b</b>', force_xml=True) == '<b>a&b</b>' + assert clean_str("") is None + assert clean_str("1") is None + assert clean_str("123") == "123" + assert clean_str("a&b") == "a&b" + assert clean_str("<b>a&b</b>") == "<b>a&b</b>" + assert clean_str("<b>a&b</b>", force_xml=True) == "<b>a&b</b>" + def b32_hex(s): s = s.strip().split()[0].lower() @@ -351,7 +388,8 @@ def b32_hex(s): s = s[5:] if len(s) != 32: return s - return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8') + return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8") + def is_cjk(s): if not s: @@ -359,38 +397,53 @@ def is_cjk(s): for c in s: if c.isalpha(): lang_prefix = unicodedata.name(c).split()[0] - return lang_prefix in ('CJK', 'HIRAGANA', 'KATAKANA', 'HANGUL') + return lang_prefix in ("CJK", "HIRAGANA", "KATAKANA", "HANGUL") return False + def test_is_cjk(): assert is_cjk(None) is False - assert is_cjk('') is False - assert is_cjk('blah') is False - assert is_cjk('岡, 鹿, 梨, 阜, 埼') is True - assert is_cjk('[岡, 鹿, 梨, 阜, 埼]') is True - assert is_cjk('菊') is True - assert is_cjk('岡, 鹿, 梨, 阜, 埼 with eng after') is True - assert is_cjk('水道') is True - assert is_cjk('オウ, イク') is True # kanji - assert is_cjk('ひヒ') is True - assert is_cjk('き゚ゅ') is True - assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') is True + assert is_cjk("") is False + assert is_cjk("blah") is False + assert is_cjk("岡, 鹿, 梨, 阜, 埼") is True + assert is_cjk("[岡, 鹿, 梨, 阜, 埼]") is True + assert is_cjk("菊") is True + assert is_cjk("岡, 鹿, 梨, 阜, 埼 with eng after") is True + assert is_cjk("水道") is True + assert is_cjk("オウ, イク") is True # kanji + assert is_cjk("ひヒ") is True + assert is_cjk("き゚ゅ") is True + assert is_cjk("ㄴ, ㄹ, ㅁ, ㅂ, ㅅ") is True + MONTH_MAP = { - "jan": 1, "january": 1, - "feb": 2, "febuary": 2, - "mar": 3, "march": 3, - "apr": 4, "april": 4, - "may": 5, "may": 5, - "jun": 6, "june": 6, - "jul": 7, "july": 7, - "aug": 8, "august": 8, - "sep": 9, "september": 9, - "oct": 10, "october": 10, - "nov": 11, "nov": 11, - "dec": 12, "december": 12, + "jan": 1, + "january": 1, + "feb": 2, + "febuary": 2, + "mar": 3, + "march": 3, + "apr": 4, + "april": 4, + "may": 5, + "may": 5, + "jun": 6, + "june": 6, + "jul": 7, + "july": 7, + "aug": 8, + "august": 8, + "sep": 9, + "september": 9, + "oct": 10, + "october": 10, + "nov": 11, + "nov": 11, + "dec": 12, + "december": 12, } + def parse_month(raw: Optional[str]) -> Optional[int]: """ Parses a string into a month number (1 to 12) @@ -408,6 +461,7 @@ def parse_month(raw: Optional[str]) -> Optional[int]: return MONTH_MAP[raw] return None + def test_parse_month() -> None: assert parse_month(None) is None @@ -417,6 +471,7 @@ def test_parse_month() -> None: assert parse_month("jan") == 1 assert parse_month("September") == 9 + def detect_text_lang(raw: str) -> Optional[str]: """ Tries to determine language of, eg, an abstract. @@ -427,13 +482,14 @@ def detect_text_lang(raw: str) -> Optional[str]: return None try: lang = langdetect.detect(raw) - lang = lang.split('-')[0] + lang = lang.split("-")[0] assert len(lang) == 2 return lang except (langdetect.lang_detect_exception.LangDetectException, TypeError): return None return None + def test_detect_text_lang() -> None: assert detect_text_lang("") is None EN_SAMPLE = "this is a string of English text for testing" @@ -444,6 +500,7 @@ def test_detect_text_lang() -> None: # XXX: why does this detect as `ko` sometimes? assert detect_text_lang(ZH_SAMPLE) in ("zh", "ko") + def parse_lang_name(raw: Optional[str]) -> Optional[str]: """ Parses a language name and returns a 2-char ISO 631 language code. @@ -456,13 +513,14 @@ def parse_lang_name(raw: Optional[str]) -> Optional[str]: return None return lang.alpha_2.lower() except LookupError: - #print(f" unknown language: '{raw}', file=sys.stderr) + # print(f" unknown language: '{raw}', file=sys.stderr) return None except AttributeError: - #print(f" partial language metadata: '{lang}', file=sys.stderr) + # print(f" partial language metadata: '{lang}', file=sys.stderr) return None return None + def test_parse_lang_name() -> None: assert parse_lang_name(None) is None @@ -544,86 +602,85 @@ def test_parse_country_name(): assert parse_country_name("Russia") == "ru" assert parse_country_name("Japan") == "jp" + # These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of # 2/T and 2/B? # PubMed/MEDLINE and JSTOR use these MARC codes # https://www.loc.gov/marc/languages/language_name.html LANG_MAP_MARC = { - 'afr': 'af', - 'alb': 'sq', - 'amh': 'am', - 'ara': 'ar', - 'arm': 'hy', - 'aze': 'az', - 'ben': 'bn', - 'bos': 'bs', - 'bul': 'bg', - 'cat': 'ca', - 'chi': 'zh', - 'cze': 'cs', - 'dan': 'da', - 'dut': 'nl', - 'eng': 'en', - 'epo': 'eo', - 'est': 'et', - 'fin': 'fi', - 'fre': 'fr', - 'geo': 'ka', - 'ger': 'de', - 'gla': 'gd', - 'gre': 'el', - 'heb': 'he', - 'hin': 'hi', - 'hrv': 'hr', - 'hun': 'hu', - 'ice': 'is', - 'ind': 'id', - 'ita': 'it', - 'jpn': 'ja', - 'kin': 'rw', - 'kor': 'ko', - 'lat': 'la', - 'lav': 'lv', - 'lit': 'lt', - 'mac': 'mk', - 'mal': 'ml', - 'mao': 'mi', - 'may': 'ms', - 'nor': 'no', - 'per': 'fa', - 'per': 'fa', - 'pol': 'pl', - 'por': 'pt', - 'pus': 'ps', - 'rum': 'ro', - 'rus': 'ru', - 'san': 'sa', - 'slo': 'sk', - 'slv': 'sl', - 'spa': 'es', - 'srp': 'sr', - 'swe': 'sv', - 'tha': 'th', - 'tur': 'tr', - 'ukr': 'uk', - 'urd': 'ur', - 'vie': 'vi', - 'wel': 'cy', - -# additions - 'gle': 'ga', # "Irish" (Gaelic) - 'jav': 'jv', # Javanese - 'welsh': 'cy', # Welsh - 'oci': 'oc', # Occitan - -# Don't have ISO 639-1 codes - 'grc': 'el', # Ancient Greek; map to modern greek - 'map': None, # Austronesian (collection) - 'syr': None, # Syriac, Modern - 'gem': None, # Old Saxon - 'non': None, # Old Norse - 'emg': None, # Eastern Meohang - 'neg': None, # Negidal - 'mul': None, # Multiple languages - 'und': None, # Undetermined + "afr": "af", + "alb": "sq", + "amh": "am", + "ara": "ar", + "arm": "hy", + "aze": "az", + "ben": "bn", + "bos": "bs", + "bul": "bg", + "cat": "ca", + "chi": "zh", + "cze": "cs", + "dan": "da", + "dut": "nl", + "eng": "en", + "epo": "eo", + "est": "et", + "fin": "fi", + "fre": "fr", + "geo": "ka", + "ger": "de", + "gla": "gd", + "gre": "el", + "heb": "he", + "hin": "hi", + "hrv": "hr", + "hun": "hu", + "ice": "is", + "ind": "id", + "ita": "it", + "jpn": "ja", + "kin": "rw", + "kor": "ko", + "lat": "la", + "lav": "lv", + "lit": "lt", + "mac": "mk", + "mal": "ml", + "mao": "mi", + "may": "ms", + "nor": "no", + "per": "fa", + "per": "fa", + "pol": "pl", + "por": "pt", + "pus": "ps", + "rum": "ro", + "rus": "ru", + "san": "sa", + "slo": "sk", + "slv": "sl", + "spa": "es", + "srp": "sr", + "swe": "sv", + "tha": "th", + "tur": "tr", + "ukr": "uk", + "urd": "ur", + "vie": "vi", + "wel": "cy", + # additions + "gle": "ga", # "Irish" (Gaelic) + "jav": "jv", # Javanese + "welsh": "cy", # Welsh + "oci": "oc", # Occitan + # Don't have ISO 639-1 codes + "grc": "el", # Ancient Greek; map to modern greek + "map": None, # Austronesian (collection) + "syr": None, # Syriac, Modern + "gem": None, # Old Saxon + "non": None, # Old Norse + "emg": None, # Eastern Meohang + "neg": None, # Negidal + "mul": None, # Multiple languages + "und": None, # Undetermined } diff --git a/python/fatcat_tools/references.py b/python/fatcat_tools/references.py index 8361b260..6fd9ca49 100644 --- a/python/fatcat_tools/references.py +++ b/python/fatcat_tools/references.py @@ -22,6 +22,7 @@ from fatcat_tools.transforms.entities import entity_to_dict class BiblioRef(BaseModel): """bibliographic reference""" + # ("release", source_release_ident, ref_index) # ("wikipedia", source_wikipedia_article, ref_index) _key: Optional[str] @@ -37,7 +38,7 @@ class BiblioRef(BaseModel): # context of the reference itself # 1-indexed, not 0-indexed - ref_index: Optional[int] # TODO: actually optional? + ref_index: Optional[int] # TODO: actually optional? # eg, "Lee86", "BIB23" ref_key: Optional[str] # eg, page number @@ -74,16 +75,20 @@ class BiblioRef(BaseModel): # work-arounds for bad/weird ref_key if self.ref_key: self.ref_key = self.ref_key.strip() - if self.ref_key[0] in ['/', '_']: + if self.ref_key[0] in ["/", "_"]: self.ref_key = self.ref_key[1:] - if self.ref_key.startswith("10.") and 'SICI' in self.ref_key and '-' in self.ref_key: - self.ref_key = self.ref_key.split('-')[-1] - if self.ref_key.startswith("10.") and '_' in self.ref_key: - self.ref_key = self.ref_key.split('_')[-1] + if ( + self.ref_key.startswith("10.") + and "SICI" in self.ref_key + and "-" in self.ref_key + ): + self.ref_key = self.ref_key.split("-")[-1] + if self.ref_key.startswith("10.") and "_" in self.ref_key: + self.ref_key = self.ref_key.split("_")[-1] if len(self.ref_key) > 10 and "#" in self.ref_key: - self.ref_key = self.ref_key.split('#')[-1] + self.ref_key = self.ref_key.split("#")[-1] if len(self.ref_key) > 10 and "_" in self.ref_key: - self.ref_key = self.ref_key.split('_')[-1] + self.ref_key = self.ref_key.split("_")[-1] if not self.ref_key and self.ref_index is not None: self.ref_key = str(self.ref_index) return self @@ -98,7 +103,7 @@ class EnrichedBiblioRef(BaseModel): # TODO: openlibrary work? access: List[AccessOption] - @validator('release') + @validator("release") @classmethod def check_release(cls, v): if v is not None and not isinstance(v, ReleaseEntity): @@ -119,7 +124,7 @@ class RefHits(BaseModel): limit: int query_time_ms: int query_wall_time_ms: int - result_refs: List[Union[BiblioRef,EnrichedBiblioRef]] + result_refs: List[Union[BiblioRef, EnrichedBiblioRef]] class Config: json_encoders = { @@ -145,22 +150,22 @@ def _execute_ref_query(search: Any, limit: int, offset: Optional[int] = None) -> except elasticsearch.exceptions.RequestError as e_raw: # this is a "user" error e: Any = e_raw - #logging.warn("elasticsearch 400: " + str(e.info)) + # logging.warn("elasticsearch 400: " + str(e.info)) if e.info.get("error", {}).get("root_cause", {}): raise ValueError(str(e.info["error"]["root_cause"][0].get("reason"))) from e else: raise ValueError(str(e.info)) from e except elasticsearch.exceptions.TransportError as e: # all other errors - #logging.warn(f"elasticsearch non-200 status code: {e.info}") + # logging.warn(f"elasticsearch non-200 status code: {e.info}") raise IOError(str(e.info)) from e query_delta = datetime.datetime.now() - query_start result_refs = [] for h in resp.hits: # might be a list because of consolidation - if isinstance(h._d_.get('source_work_ident'), list): - h._d_['source_work_ident'] = h._d_['source_work_ident'][0] + if isinstance(h._d_.get("source_work_ident"), list): + h._d_["source_work_ident"] = h._d_["source_work_ident"][0] result_refs.append(BiblioRef.parse_obj(h._d_).hacks()) return RefHits( @@ -224,7 +229,10 @@ def get_inbound_refs( search = search.extra( collapse={ "field": "source_work_ident", - "inner_hits": {"name": "source_more", "size": 0,}, + "inner_hits": { + "name": "source_more", + "size": 0, + }, } ) @@ -281,61 +289,87 @@ def count_inbound_refs( # run fatcat API fetches for each ref and return "enriched" refs -def enrich_inbound_refs(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[EnrichedBiblioRef]: +def enrich_inbound_refs( + refs: List[BiblioRef], + fatcat_api_client: Any, + hide: Optional[str] = "refs", + expand: Optional[str] = "container,files,webcaptures,filesets", +) -> List[EnrichedBiblioRef]: enriched = [] for ref in refs: release = None access = [] if ref.source_release_ident: - release = fatcat_api_client.get_release(ref.source_release_ident, hide=hide, expand=expand) + release = fatcat_api_client.get_release( + ref.source_release_ident, hide=hide, expand=expand + ) access = release_access_options(release) if ref.source_wikipedia_article: - wiki_lang = ref.source_wikipedia_article.split(':')[0] - wiki_article = ':'.join(ref.source_wikipedia_article.split(':')[1:]).replace(' ', '_') - access.append(AccessOption( - access_type="wikipedia", - access_url=f"https://{wiki_lang}.wikipedia.org/wiki/{wiki_article}", - mimetype=None, - size_bytes=None, - thumbnail_url=None - )) - enriched.append(EnrichedBiblioRef( - ref=ref, - access=access, - release=release, - )) + wiki_lang = ref.source_wikipedia_article.split(":")[0] + wiki_article = ":".join(ref.source_wikipedia_article.split(":")[1:]).replace( + " ", "_" + ) + access.append( + AccessOption( + access_type="wikipedia", + access_url=f"https://{wiki_lang}.wikipedia.org/wiki/{wiki_article}", + mimetype=None, + size_bytes=None, + thumbnail_url=None, + ) + ) + enriched.append( + EnrichedBiblioRef( + ref=ref, + access=access, + release=release, + ) + ) return enriched -def enrich_outbound_refs(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[EnrichedBiblioRef]: +def enrich_outbound_refs( + refs: List[BiblioRef], + fatcat_api_client: Any, + hide: Optional[str] = "refs", + expand: Optional[str] = "container,files,webcaptures,filesets", +) -> List[EnrichedBiblioRef]: enriched = [] for ref in refs: release = None access = [] if ref.target_release_ident: - release = fatcat_api_client.get_release(ref.target_release_ident, hide=hide, expand=expand) + release = fatcat_api_client.get_release( + ref.target_release_ident, hide=hide, expand=expand + ) access = release_access_options(release) if ref.target_openlibrary_work: - access.append(AccessOption( - access_type="openlibrary", - access_url=f"https://openlibrary.org/works/{ref.target_openlibrary_work}", - mimetype=None, - size_bytes=None, - thumbnail_url=None - )) - if ref.target_url and '://web.archive.org/' in ref.target_url: - access.append(AccessOption( - access_type="wayback", - access_url=ref.target_url, - mimetype=None, - size_bytes=None, - thumbnail_url=None - )) - enriched.append(EnrichedBiblioRef( - ref=ref, - access=access, - release=release, - )) + access.append( + AccessOption( + access_type="openlibrary", + access_url=f"https://openlibrary.org/works/{ref.target_openlibrary_work}", + mimetype=None, + size_bytes=None, + thumbnail_url=None, + ) + ) + if ref.target_url and "://web.archive.org/" in ref.target_url: + access.append( + AccessOption( + access_type="wayback", + access_url=ref.target_url, + mimetype=None, + size_bytes=None, + thumbnail_url=None, + ) + ) + enriched.append( + EnrichedBiblioRef( + ref=ref, + access=access, + release=release, + ) + ) return enriched @@ -346,21 +380,29 @@ def run_ref_query(args) -> None: release_ident = None work_ident = None if args.ident.startswith("release_"): - release_ident = args.ident.split('_')[1] + release_ident = args.ident.split("_")[1] elif args.ident.startswith("work_"): - work_ident = args.ident.split('_')[1] + work_ident = args.ident.split("_")[1] else: release_ident = args.ident print("## Outbound References") - hits = get_outbound_refs(release_ident=release_ident, work_ident=work_ident, es_client=args.es_client) - print(f"Total: {hits.count_total} Time: {hits.query_wall_time_ms}ms; {hits.query_time_ms}ms") + hits = get_outbound_refs( + release_ident=release_ident, work_ident=work_ident, es_client=args.es_client + ) + print( + f"Total: {hits.count_total} Time: {hits.query_wall_time_ms}ms; {hits.query_time_ms}ms" + ) if args.enrich == "fatcat": - enriched = enrich_outbound_refs(hits.result_refs, hide='refs,abstracts', fatcat_api_client=args.fatcat_api_client) + enriched = enrich_outbound_refs( + hits.result_refs, hide="refs,abstracts", fatcat_api_client=args.fatcat_api_client + ) for ref in enriched: if ref.release: - print(f"{ref.ref.ref_index or '-'}\trelease_{ref.release.ident}\t{ref.ref.match_provenance}/{ref.ref.match_status}\t{ref.release.release_year or '-'}\t{ref.release.title}\t{ref.release.ext_ids.pmid or ref.release.ext_ids.doi or '-'}") + print( + f"{ref.ref.ref_index or '-'}\trelease_{ref.release.ident}\t{ref.ref.match_provenance}/{ref.ref.match_status}\t{ref.release.release_year or '-'}\t{ref.release.title}\t{ref.release.ext_ids.pmid or ref.release.ext_ids.doi or '-'}" + ) else: print(f"{ref.ref.ref_index or '-'}\trelease_{ref.target_release_ident}") else: @@ -369,21 +411,30 @@ def run_ref_query(args) -> None: print() print("## Inbound References") - hits = get_inbound_refs(release_ident=release_ident, work_ident=work_ident, es_client=args.es_client) + hits = get_inbound_refs( + release_ident=release_ident, work_ident=work_ident, es_client=args.es_client + ) - print(f"Total: {hits.count_total} Time: {hits.query_wall_time_ms}ms; {hits.query_time_ms}ms") + print( + f"Total: {hits.count_total} Time: {hits.query_wall_time_ms}ms; {hits.query_time_ms}ms" + ) if args.enrich == "fatcat": - enriched = enrich_inbound_refs(hits.result_refs, hide='refs,abstracts', fatcat_api_client=args.fatcat_api_client) + enriched = enrich_inbound_refs( + hits.result_refs, hide="refs,abstracts", fatcat_api_client=args.fatcat_api_client + ) for ref in enriched: if ref.release: - print(f"release_{ref.release.ident}\t{ref.ref.match_provenance}/{ref.ref.match_status}\t{ref.release.release_year or '-'}\t{ref.release.title}\t{ref.release.ext_ids.pmid or ref.release.ext_ids.doi or '-'}") + print( + f"release_{ref.release.ident}\t{ref.ref.match_provenance}/{ref.ref.match_status}\t{ref.release.release_year or '-'}\t{ref.release.title}\t{ref.release.ext_ids.pmid or ref.release.ext_ids.doi or '-'}" + ) else: print(f"release_{ref.target_release_ident}") else: for ref in hits.result_refs: print(f"work_{ref.source_work_ident}\trelease_{ref.source_release_ident}") + def main() -> None: """ Run this utility like: @@ -395,9 +446,7 @@ def main() -> None: python -m fatcat_tools.references query release_pfrind3kh5hqhgqkueulk2tply """ - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) subparsers = parser.add_subparsers() parser.add_argument("--fatcat-api-base", default="https://api.fatcat.wiki/v0") @@ -425,5 +474,6 @@ def main() -> None: else: raise NotImplementedError(args.func) + if __name__ == "__main__": main() diff --git a/python/fatcat_tools/reviewers/review_common.py b/python/fatcat_tools/reviewers/review_common.py index 867d826d..59ff1c4e 100644 --- a/python/fatcat_tools/reviewers/review_common.py +++ b/python/fatcat_tools/reviewers/review_common.py @@ -1,4 +1,3 @@ - import datetime import subprocess import time @@ -34,8 +33,8 @@ class CheckResult: self.status = status self.check_type = check_type self.description = description - self.ident = kwargs.get('ident') - self.rev = kwargs.get('rev') + self.ident = kwargs.get("ident") + self.rev = kwargs.get("rev") def __repr__(self): return str(self.__dict__) @@ -72,17 +71,17 @@ class EditCheck: class ReviewBot: - def __init__(self, api, verbose=False, **kwargs): self.api = api self.checks = [] self.verbose = verbose - self.extra = kwargs.get('extra', dict()) - self.extra['git_rev'] = self.extra.get('git_rev', - subprocess.check_output(["git", "describe", "--always"]).strip()).decode('utf-8') - self.extra['agent'] = self.extra.get('agent', 'fatcat_tools.ReviewBot') - self.poll_interval = kwargs.get('poll_interval', 10.0) + self.extra = kwargs.get("extra", dict()) + self.extra["git_rev"] = self.extra.get( + "git_rev", subprocess.check_output(["git", "describe", "--always"]).strip() + ).decode("utf-8") + self.extra["agent"] = self.extra.get("agent", "fatcat_tools.ReviewBot") + self.poll_interval = kwargs.get("poll_interval", 10.0) def run_single(self, editgroup_id, annotate=True): eg = self.api.get_editgroup(editgroup_id) @@ -96,7 +95,9 @@ class ReviewBot: since = datetime.datetime.utcnow() while True: # XXX: better isoformat conversion? - eg_list = self.api.get_editgroups_reviewable(since=since.isoformat()[:19] + "Z", limit=100) + eg_list = self.api.get_editgroups_reviewable( + since=since.isoformat()[:19] + "Z", limit=100 + ) if not eg_list: print("Sleeping {} seconds...".format(self.poll_interval)) time.sleep(self.poll_interval) @@ -104,8 +105,11 @@ class ReviewBot: for eg in eg_list: # TODO: fetch annotations to ensure we haven't already annotated annotation = self.review_editgroup(eg) - print("Reviewed {} disposition:{}".format( - eg.editgroup_id, annotation.extra['disposition'])) + print( + "Reviewed {} disposition:{}".format( + eg.editgroup_id, annotation.extra["disposition"] + ) + ) self.api.create_editgroup_annotation(eg.editgroup_id, annotation) since = eg.submitted # to prevent busy loops (TODO: needs review/rethink; multiple @@ -125,10 +129,9 @@ class ReviewBot: else: raise ValueError - for (status, title) in (('fail', 'Failed check'), ('warning', 'Warnings')): + for (status, title) in (("fail", "Failed check"), ("warning", "Warnings")): if result_counts[status] > 0: - comment += "\n\n### {} ({}):\n".format( - status, result_counts[status]) + comment += "\n\n### {} ({}):\n".format(status, result_counts[status]) for result in results: if result.status == status and result.check_type == "editgroup": comment += "\n- {description}".format(description=result.description) @@ -137,15 +140,18 @@ class ReviewBot: check_type=result.check_type, rev=result.rev, entity_type=result.check_type, - description=result.description) + description=result.description, + ) extra = self.extra.copy() - extra.update({ - "disposition": disposition, - "submit_timestamp": editgroup.submitted.isoformat(), - "checks": [check.name for check in self.checks], - "result_counts": dict(result_counts), - }) + extra.update( + { + "disposition": disposition, + "submit_timestamp": editgroup.submitted.isoformat(), + "checks": [check.name for check in self.checks], + "result_counts": dict(result_counts), + } + ) annotation = fatcat_openapi_client.EditgroupAnnotation( comment_markdown=comment, editgroup_id=editgroup.editgroup_id, @@ -156,7 +162,7 @@ class ReviewBot: def result_counts(self, results): counts = Counter() for result in results: - counts['total'] += 1 + counts["total"] += 1 counts[result.status] += 1 return counts @@ -217,13 +223,18 @@ class DummyCheck(EditCheck): name = "DummyCheck" def check_editgroup(self, editgroup): - return CheckResult("pass", "editgroup", + return CheckResult( + "pass", + "editgroup", "every edit is precious, thanks [editor {editor_id}](/editor/{editor_id})!".format( - editor_id=editgroup.editor_id)) + editor_id=editgroup.editor_id + ), + ) def check_work(self, entity, edit): return CheckResult("pass", "work", "this work edit is beautiful") + class DummyReviewBot(ReviewBot): """ This bot reviews everything and always passes. diff --git a/python/fatcat_tools/transforms/access.py b/python/fatcat_tools/transforms/access.py index ae9880e7..34212a6a 100644 --- a/python/fatcat_tools/transforms/access.py +++ b/python/fatcat_tools/transforms/access.py @@ -1,4 +1,3 @@ - from enum import Enum from typing import List, Optional @@ -16,6 +15,7 @@ class AccessType(str, Enum): openlibrary = "openlibrary" wikipedia = "wikipedia" + class AccessOption(BaseModel): access_type: AccessType @@ -40,27 +40,31 @@ def release_access_options(release: ReleaseEntity) -> List[AccessOption]: option found """ options = [] - for f in (release.files or []): + for f in release.files or []: thumbnail_url = None - if f.mimetype == 'application/pdf' and f.sha1 and f.urls: + if f.mimetype == "application/pdf" and f.sha1 and f.urls: # NOTE: scholar.archive.org does an actual database check before # generating these URLs, but we skip that for speed thumbnail_url = f"https://blobs.fatcat.wiki/thumbnail/pdf/{f.sha1[0:2]}/{f.sha1[2:4]}/{f.sha1}.180px.jpg" - for u in (f.urls or []): - if '://web.archive.org/' in u.url: - return [AccessOption( - access_type="wayback", - access_url=u.url, - mimetype=f.mimetype, - size_bytes=f.size, - thumbnail_url=thumbnail_url, - )] - elif '://archive.org/' in u.url: - return [AccessOption( - access_type="ia_file", - access_url=u.url, - mimetype=f.mimetype, - size_bytes=f.size, - thumbnail_url=thumbnail_url, - )] + for u in f.urls or []: + if "://web.archive.org/" in u.url: + return [ + AccessOption( + access_type="wayback", + access_url=u.url, + mimetype=f.mimetype, + size_bytes=f.size, + thumbnail_url=thumbnail_url, + ) + ] + elif "://archive.org/" in u.url: + return [ + AccessOption( + access_type="ia_file", + access_url=u.url, + mimetype=f.mimetype, + size_bytes=f.size, + thumbnail_url=thumbnail_url, + ) + ] return options diff --git a/python/fatcat_tools/transforms/csl.py b/python/fatcat_tools/transforms/csl.py index f8b26bce..2b39068a 100644 --- a/python/fatcat_tools/transforms/csl.py +++ b/python/fatcat_tools/transforms/csl.py @@ -1,4 +1,3 @@ - import json from citeproc import ( @@ -13,10 +12,10 @@ from citeproc_styles import get_style_filepath def contribs_by_role(contribs, role): - ret = [c.copy() for c in contribs if c['role'] == role] - [c.pop('role') for c in ret] + ret = [c.copy() for c in contribs if c["role"] == role] + [c.pop("role") for c in ret] # TODO: some note to self here - [c.pop('literal') for c in ret if 'literal' in c] + [c.pop("literal") for c in ret if "literal" in c] if not ret: return None else: @@ -33,26 +32,30 @@ def release_to_csl(entity): Follows, but not enforced by: https://github.com/citation-style-language/schema/blob/master/csl-data.json """ contribs = [] - for contrib in (entity.contribs or []): + for contrib in entity.contribs or []: if contrib.creator: # Default to "local" (publication-specific) metadata; fall back to # creator-level - family = contrib.creator.surname or contrib.surname or (contrib.raw_name and contrib.raw_name.split()[-1]) + family = ( + contrib.creator.surname + or contrib.surname + or (contrib.raw_name and contrib.raw_name.split()[-1]) + ) if not family: # CSL requires some surname (family name) continue c = dict( family=family, given=contrib.creator.given_name or contrib.given_name, - #dropping-particle - #non-dropping-particle - #suffix - #comma-suffix - #static-ordering + # dropping-particle + # non-dropping-particle + # suffix + # comma-suffix + # static-ordering literal=contrib.creator.display_name or contrib.raw_name, - #parse-names, + # parse-names, # role must be defined; default to author - role=contrib.role or 'author', + role=contrib.role or "author", ) else: family = contrib.surname or (contrib.raw_name and contrib.raw_name.split()[-1]) @@ -64,7 +67,7 @@ def release_to_csl(entity): given=contrib.given_name, literal=contrib.raw_name, # role must be defined; default to author - role=contrib.role or 'author', + role=contrib.role or "author", ) for k in list(c.keys()): if not c[k]: @@ -78,93 +81,108 @@ def release_to_csl(entity): issued_date = None if entity.release_date: - issued_date = {"date-parts": [[ - entity.release_date.year, - entity.release_date.month, - entity.release_date.day, - ]]} + issued_date = { + "date-parts": [ + [ + entity.release_date.year, + entity.release_date.month, + entity.release_date.day, + ] + ] + } elif entity.release_year: issued_date = {"date-parts": [[entity.release_year]]} csl = dict( - #id, - #categories - type=entity.release_type or "article", # can't be blank + # id, + # categories + type=entity.release_type or "article", # can't be blank language=entity.language, - #journalAbbreviation - #shortTitle + # journalAbbreviation + # shortTitle ## see below for all contrib roles - #accessed - #container - #event-date + # accessed + # container + # event-date issued=issued_date, - #original-date - #submitted + # original-date + # submitted abstract=abstract, - #annote - #archive - #archive_location - #archive-place - #authority - #call-number - #chapter-number - #citation-number - #citation-label - #collection-number - #collection-title + # annote + # archive + # archive_location + # archive-place + # authority + # call-number + # chapter-number + # citation-number + # citation-label + # collection-number + # collection-title container_title=entity.container and entity.container.name, - #container-title-short - #dimensions + # container-title-short + # dimensions DOI=entity.ext_ids.doi, - #edition - #event - #event-place - #first-reference-note-number - #genre + # edition + # event + # event-place + # first-reference-note-number + # genre ISBN=entity.ext_ids.isbn13, ISSN=entity.container and entity.container.issnl, issue=entity.issue, - #jurisdiction - #keyword - #locator - #medium - #note - #number - #number-of-pages - #number-of-volumes - #original-publisher - #original-publisher-place - #original-title + # jurisdiction + # keyword + # locator + # medium + # note + # number + # number-of-pages + # number-of-volumes + # original-publisher + # original-publisher-place + # original-title # TODO: page=entity.pages, - page_first=entity.pages and entity.pages.split('-')[0], + page_first=entity.pages and entity.pages.split("-")[0], PMCID=entity.ext_ids.pmcid, PMID=entity.ext_ids.pmid, publisher=(entity.container and entity.container.publisher) or entity.publisher, - #publisher-place - #references - #reviewed-title - #scale - #section - #source - #status + # publisher-place + # references + # reviewed-title + # scale + # section + # source + # status title=entity.title, - #title-short - #URL - #version + # title-short + # URL + # version volume=entity.volume, - #year-suffix + # year-suffix ) - for role in ['author', 'collection-editor', 'composer', 'container-author', - 'director', 'editor', 'editorial-director', 'interviewer', - 'illustrator', 'original-author', 'recipient', 'reviewed-author', - 'translator']: + for role in [ + "author", + "collection-editor", + "composer", + "container-author", + "director", + "editor", + "editorial-director", + "interviewer", + "illustrator", + "original-author", + "recipient", + "reviewed-author", + "translator", + ]: cbr = contribs_by_role(contribs, role) if cbr: csl[role] = cbr # underline-to-dash - csl['container-title'] = csl.pop('container_title') - csl['page-first'] = csl.pop('page_first') - empty_keys = [k for k,v in csl.items() if not v] + csl["container-title"] = csl.pop("container_title") + csl["page-first"] = csl.pop("page_first") + empty_keys = [k for k, v in csl.items() if not v] for k in empty_keys: csl.pop(k) return csl @@ -184,10 +202,11 @@ def refs_to_csl(entity): title=ref.title, issued=issued_date, ) - csl['id'] = ref.key or ref.index, # zero- or one-indexed? + csl["id"] = (ref.key or ref.index,) # zero- or one-indexed? ret.append(csl) return ret + def citeproc_csl(csl_json, style, html=False): """ Renders a release entity to a styled citation. @@ -200,8 +219,8 @@ def citeproc_csl(csl_json, style, html=False): Returns a string; if the html flag is set, and the style isn't 'csl-json' or 'bibtex', it will be HTML. Otherwise plain text. """ - if not csl_json.get('id'): - csl_json['id'] = "unknown" + if not csl_json.get("id"): + csl_json["id"] = "unknown" if style == "csl-json": return json.dumps(csl_json) bib_src = CiteProcJSON([csl_json]) @@ -211,7 +230,7 @@ def citeproc_csl(csl_json, style, html=False): style_path = get_style_filepath(style) bib_style = CitationStylesStyle(style_path, validate=False) bib = CitationStylesBibliography(bib_style, bib_src, form) - bib.register(Citation([CitationItem(csl_json['id'])])) + bib.register(Citation([CitationItem(csl_json["id"])])) lines = bib.bibliography()[0] if style == "bibtex": out = "" @@ -222,6 +241,6 @@ def citeproc_csl(csl_json, style, html=False): out += "\n " + line else: out += line - return ''.join(out) + return "".join(out) else: - return ''.join(lines) + return "".join(lines) diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index 1826d4eb..e39e9ea4 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -1,4 +1,3 @@ - import datetime from typing import Any, Dict, Optional @@ -13,13 +12,14 @@ from fatcat_openapi_client import ( def check_kbart(year: int, archive: dict) -> Optional[bool]: - if not archive or not archive.get('year_spans'): + if not archive or not archive.get("year_spans"): return None - for span in archive['year_spans']: + for span in archive["year_spans"]: if year >= span[0] and year <= span[1]: return True return False + def test_check_kbart() -> None: assert check_kbart(1990, dict()) is None @@ -40,87 +40,89 @@ def release_to_elasticsearch(entity: ReleaseEntity, force_bool: bool = True) -> Raises exception on error (never returns None) """ - if entity.state in ('redirect', 'deleted'): + if entity.state in ("redirect", "deleted"): return dict( - ident = entity.ident, - state = entity.state, + ident=entity.ident, + state=entity.state, ) - elif entity.state != 'active': + elif entity.state != "active": raise ValueError("Unhandled entity state: {}".format(entity.state)) # First, the easy ones (direct copy) release = entity t: Dict[str, Any] = dict( - doc_index_ts=datetime.datetime.utcnow().isoformat()+"Z", - ident = release.ident, - state = release.state, - revision = release.revision, - work_id = release.work_id, - title = release.title, - subtitle = release.subtitle, - original_title = release.original_title, - release_type = release.release_type, - release_stage = release.release_stage, - withdrawn_status = release.withdrawn_status, - language = release.language, - volume = release.volume, - issue = release.issue, - pages = release.pages, - number = release.number, - license = release.license_slug, - version = release.version, - doi = release.ext_ids.doi, - pmid = release.ext_ids.pmid, - pmcid = release.ext_ids.pmcid, - isbn13 = release.ext_ids.isbn13, - wikidata_qid = release.ext_ids.wikidata_qid, - core_id = release.ext_ids.core, - arxiv_id = release.ext_ids.arxiv, - jstor_id = release.ext_ids.jstor, - ark_id = release.ext_ids.ark, - mag_id = release.ext_ids.mag, - dblp_id = release.ext_ids.dblp, - doaj_id = release.ext_ids.doaj, - hdl = release.ext_ids.hdl, - tags = [], + doc_index_ts=datetime.datetime.utcnow().isoformat() + "Z", + ident=release.ident, + state=release.state, + revision=release.revision, + work_id=release.work_id, + title=release.title, + subtitle=release.subtitle, + original_title=release.original_title, + release_type=release.release_type, + release_stage=release.release_stage, + withdrawn_status=release.withdrawn_status, + language=release.language, + volume=release.volume, + issue=release.issue, + pages=release.pages, + number=release.number, + license=release.license_slug, + version=release.version, + doi=release.ext_ids.doi, + pmid=release.ext_ids.pmid, + pmcid=release.ext_ids.pmcid, + isbn13=release.ext_ids.isbn13, + wikidata_qid=release.ext_ids.wikidata_qid, + core_id=release.ext_ids.core, + arxiv_id=release.ext_ids.arxiv, + jstor_id=release.ext_ids.jstor, + ark_id=release.ext_ids.ark, + mag_id=release.ext_ids.mag, + dblp_id=release.ext_ids.dblp, + doaj_id=release.ext_ids.doaj, + hdl=release.ext_ids.hdl, + tags=[], ) - t.update(dict( - is_oa = None, - is_longtail_oa = None, - is_preserved = None, - in_web = False, - in_dweb = False, - in_ia = False, - in_ia_sim = False, - in_kbart = None, - in_jstor = False, - in_doaj= bool(release.ext_ids.doaj), - in_shadows = False, - )) + t.update( + dict( + is_oa=None, + is_longtail_oa=None, + is_preserved=None, + in_web=False, + in_dweb=False, + in_ia=False, + in_ia_sim=False, + in_kbart=None, + in_jstor=False, + in_doaj=bool(release.ext_ids.doaj), + in_shadows=False, + ) + ) release_year = release.release_year if release.release_date: # .isoformat() results in, eg, '2010-10-22' (YYYY-MM-DD) - t['release_date'] = release.release_date.isoformat() + t["release_date"] = release.release_date.isoformat() if not release_year: release_year = release.release_date.year if release_year: - t['release_year'] = release_year + t["release_year"] = release_year - t['any_abstract'] = len(release.abstracts or []) > 0 - t['ref_count'] = len(release.refs or []) + t["any_abstract"] = len(release.abstracts or []) > 0 + t["ref_count"] = len(release.refs or []) ref_release_ids = [] - for r in (release.refs or []): + for r in release.refs or []: if r.target_release_id: ref_release_ids.append(r.target_release_id) - t['ref_release_ids'] = ref_release_ids - t['ref_linked_count'] = len(ref_release_ids) - t['contrib_count'] = len(release.contribs or []) + t["ref_release_ids"] = ref_release_ids + t["ref_linked_count"] = len(ref_release_ids) + t["contrib_count"] = len(release.contribs or []) contrib_names = [] contrib_affiliations = [] creator_ids = [] - for c in (release.contribs or []): + for c in release.contribs or []: if c.creator and c.creator.display_name: contrib_names.append(c.creator.display_name) elif c.raw_name: @@ -132,193 +134,218 @@ def release_to_elasticsearch(entity: ReleaseEntity, force_bool: bool = True) -> creator_ids.append(c.creator_id) if c.raw_affiliation: contrib_affiliations.append(c.raw_affiliation) - t['contrib_names'] = contrib_names - t['creator_ids'] = creator_ids - t['affiliations'] = contrib_affiliations + t["contrib_names"] = contrib_names + t["creator_ids"] = creator_ids + t["affiliations"] = contrib_affiliations # TODO: mapping... probably by lookup? - t['affiliation_rors'] = None + t["affiliation_rors"] = None if release.container: t.update(_rte_container_helper(release.container, release_year)) # fall back to release-level container metadata if container not linked or # missing context - if not t.get('publisher'): - t['publisher'] = release.publisher - if not t.get('container_name') and release.extra: - t['container_name'] = release.extra.get('container_name') + if not t.get("publisher"): + t["publisher"] = release.publisher + if not t.get("container_name") and release.extra: + t["container_name"] = release.extra.get("container_name") - if release.ext_ids.jstor or (release.ext_ids.doi and release.ext_ids.doi.startswith('10.2307/')): - t['in_jstor'] = True + if release.ext_ids.jstor or ( + release.ext_ids.doi and release.ext_ids.doi.startswith("10.2307/") + ): + t["in_jstor"] = True # transform file/fileset/webcapture related fields t.update(_rte_content_helper(release)) if release.ext_ids.doaj: - t['is_oa'] = True + t["is_oa"] = True if release.license_slug: # TODO: more/better checks here, particularly strict *not* OA licenses if release.license_slug.startswith("CC-"): - t['is_oa'] = True + t["is_oa"] = True if release.license_slug.startswith("ARXIV-"): - t['is_oa'] = True + t["is_oa"] = True - t['is_work_alias'] = None + t["is_work_alias"] = None extra = release.extra or dict() if extra: - if extra.get('is_oa'): + if extra.get("is_oa"): # NOTE: not actually setting this anywhere... but could - t['is_oa'] = True - if extra.get('is_work_alias') is not None: - t['is_work_alias'] = bool(extra.get('is_work_alias')) - if extra.get('longtail_oa'): + t["is_oa"] = True + if extra.get("is_work_alias") is not None: + t["is_work_alias"] = bool(extra.get("is_work_alias")) + if extra.get("longtail_oa"): # sometimes set by GROBID/matcher - t['is_oa'] = True - t['is_longtail_oa'] = True - if not t.get('container_name'): - t['container_name'] = extra.get('container_name') - if extra.get('crossref'): - if extra['crossref'].get('archive'): + t["is_oa"] = True + t["is_longtail_oa"] = True + if not t.get("container_name"): + t["container_name"] = extra.get("container_name") + if extra.get("crossref"): + if extra["crossref"].get("archive"): # all crossref archives are KBART, I believe - t['in_kbart'] = True + t["in_kbart"] = True # backwards compatible subtitle fetching - if not t['subtitle'] and extra.get('subtitle'): - if type(extra['subtitle']) == list: - t['subtitle'] = extra['subtitle'][0] + if not t["subtitle"] and extra.get("subtitle"): + if type(extra["subtitle"]) == list: + t["subtitle"] = extra["subtitle"][0] else: - t['subtitle'] = extra['subtitle'] + t["subtitle"] = extra["subtitle"] - t['first_page'] = None + t["first_page"] = None if release.pages: - first = release.pages.split('-')[0] - first = first.replace('p', '') + first = release.pages.split("-")[0] + first = first.replace("p", "") if first.isdigit(): - t['first_page'] = first + t["first_page"] = first # TODO: non-numerical first pages - t['ia_microfilm_url'] = None - if t['in_ia_sim']: + t["ia_microfilm_url"] = None + if t["in_ia_sim"]: # TODO: determine URL somehow? I think this is in flux. Will probably # need extra metadata in the container extra field. # special case as a demo for now. - if release.container_id == "hl5g6d5msjcl7hlbyyvcsbhc2u" \ - and release.release_year in (2011, 2013) \ - and release.issue \ - and release.issue.isdigit() \ - and t['first_page']: - t['ia_microfilm_url'] = "https://archive.org/details/sim_bjog_{}-{:02d}/page/n{}".format( + if ( + release.container_id == "hl5g6d5msjcl7hlbyyvcsbhc2u" + and release.release_year in (2011, 2013) + and release.issue + and release.issue.isdigit() + and t["first_page"] + ): + t[ + "ia_microfilm_url" + ] = "https://archive.org/details/sim_bjog_{}-{:02d}/page/n{}".format( release.release_year, int(release.issue) - 1, - t['first_page'], + t["first_page"], ) - t['doi_registrar'] = None - if extra and t['doi']: - for k in ('crossref', 'datacite', 'jalc'): + t["doi_registrar"] = None + if extra and t["doi"]: + for k in ("crossref", "datacite", "jalc"): if k in extra: - t['doi_registrar'] = k - if 'doi_registrar' not in t: - t['doi_registrar'] = 'crossref' + t["doi_registrar"] = k + if "doi_registrar" not in t: + t["doi_registrar"] = "crossref" - if t['doi']: - t['doi_prefix'] = t['doi'].split('/')[0] + if t["doi"]: + t["doi_prefix"] = t["doi"].split("/")[0] - if t['is_longtail_oa']: - t['is_oa'] = True + if t["is_longtail_oa"]: + t["is_oa"] = True # optionally coerce all flags from Optional[bool] to bool if force_bool: - for k in ('is_oa', 'is_longtail_oa', 'in_kbart', 'in_ia_sim', - 'in_jstor', 'in_web', 'in_dweb', 'in_shadows', - 'is_work_alias'): + for k in ( + "is_oa", + "is_longtail_oa", + "in_kbart", + "in_ia_sim", + "in_jstor", + "in_web", + "in_dweb", + "in_shadows", + "is_work_alias", + ): t[k] = bool(t[k]) - t['in_ia'] = bool(t['in_ia']) - t['is_preserved'] = bool( - t['is_preserved'] - or t['in_ia'] - or t['in_kbart'] - or t['in_jstor'] - or t.get('pmcid') - or t.get('arxiv_id') + t["in_ia"] = bool(t["in_ia"]) + t["is_preserved"] = bool( + t["is_preserved"] + or t["in_ia"] + or t["in_kbart"] + or t["in_jstor"] + or t.get("pmcid") + or t.get("arxiv_id") ) - if t['in_ia']: - t['preservation'] = 'bright' - elif t['is_preserved']: - t['preservation'] = 'dark' - elif t['in_shadows']: - t['preservation'] = 'shadows_only' + if t["in_ia"]: + t["preservation"] = "bright" + elif t["is_preserved"]: + t["preservation"] = "dark" + elif t["in_shadows"]: + t["preservation"] = "shadows_only" else: - t['preservation'] = 'none' + t["preservation"] = "none" return t + def _rte_container_helper(container: ContainerEntity, release_year: Optional[int]) -> dict: """ Container metadata sub-section of release_to_elasticsearch() """ this_year = datetime.date.today().year t = dict() - t['publisher'] = container.publisher - t['container_name'] = container.name + t["publisher"] = container.publisher + t["container_name"] = container.name # this is container.ident, not release.container_id, because there may # be a redirect involved - t['container_id'] = container.ident - t['container_issnl'] = container.issnl + t["container_id"] = container.ident + t["container_issnl"] = container.issnl issns = [container.issnl, container.issne, container.issnp] issns = list(set([i for i in issns if i])) - t['container_issns'] = issns - t['container_type'] = container.container_type - t['container_publication_status'] = container.publication_status + t["container_issns"] = issns + t["container_type"] = container.container_type + t["container_publication_status"] = container.publication_status if container.extra: c_extra = container.extra - if c_extra.get('kbart') and release_year: - if check_kbart(release_year, c_extra['kbart'].get('jstor')): - t['in_jstor'] = True - if t.get('in_kbart') or t.get('in_jstor'): - t['in_kbart'] = True - for archive in ('portico', 'lockss', 'clockss', 'pkp_pln', - 'hathitrust', 'scholarsportal', 'cariniana'): - t['in_kbart'] = t.get('in_kbart') or check_kbart(release_year, c_extra['kbart'].get(archive)) + if c_extra.get("kbart") and release_year: + if check_kbart(release_year, c_extra["kbart"].get("jstor")): + t["in_jstor"] = True + if t.get("in_kbart") or t.get("in_jstor"): + t["in_kbart"] = True + for archive in ( + "portico", + "lockss", + "clockss", + "pkp_pln", + "hathitrust", + "scholarsportal", + "cariniana", + ): + t["in_kbart"] = t.get("in_kbart") or check_kbart( + release_year, c_extra["kbart"].get(archive) + ) # recent KBART coverage is often not updated for the # current year. So for current-year publications, consider # coverage from *last* year to also be included in the # Keeper - if not t.get('in_kbart') and release_year == this_year: - t['in_kbart'] = check_kbart(this_year - 1, c_extra['kbart'].get(archive)) - - if c_extra.get('ia'): - if c_extra['ia'].get('sim') and release_year: - t['in_ia_sim'] = check_kbart(release_year, c_extra['ia']['sim']) - if c_extra['ia'].get('longtail_oa'): - t['is_longtail_oa'] = True - if c_extra.get('sherpa_romeo'): - if c_extra['sherpa_romeo'].get('color') == 'white': - t['is_oa'] = False - if c_extra.get('default_license') and c_extra.get('default_license').startswith('CC-'): - t['is_oa'] = True - if c_extra.get('doaj'): - if c_extra['doaj'].get('as_of'): - t['is_oa'] = True - t['in_doaj'] = True - if c_extra.get('road'): - if c_extra['road'].get('as_of'): - t['is_oa'] = True - if c_extra.get('szczepanski'): - if c_extra['szczepanski'].get('as_of'): - t['is_oa'] = True - if c_extra.get('country'): - t['country_code'] = c_extra['country'] - t['country_code_upper'] = c_extra['country'].upper() - if c_extra.get('publisher_type'): - t['publisher_type'] = c_extra['publisher_type'] - if c_extra.get('discipline'): - t['discipline'] = c_extra['discipline'] + if not t.get("in_kbart") and release_year == this_year: + t["in_kbart"] = check_kbart(this_year - 1, c_extra["kbart"].get(archive)) + + if c_extra.get("ia"): + if c_extra["ia"].get("sim") and release_year: + t["in_ia_sim"] = check_kbart(release_year, c_extra["ia"]["sim"]) + if c_extra["ia"].get("longtail_oa"): + t["is_longtail_oa"] = True + if c_extra.get("sherpa_romeo"): + if c_extra["sherpa_romeo"].get("color") == "white": + t["is_oa"] = False + if c_extra.get("default_license") and c_extra.get("default_license").startswith("CC-"): + t["is_oa"] = True + if c_extra.get("doaj"): + if c_extra["doaj"].get("as_of"): + t["is_oa"] = True + t["in_doaj"] = True + if c_extra.get("road"): + if c_extra["road"].get("as_of"): + t["is_oa"] = True + if c_extra.get("szczepanski"): + if c_extra["szczepanski"].get("as_of"): + t["is_oa"] = True + if c_extra.get("country"): + t["country_code"] = c_extra["country"] + t["country_code_upper"] = c_extra["country"].upper() + if c_extra.get("publisher_type"): + t["publisher_type"] = c_extra["publisher_type"] + if c_extra.get("discipline"): + t["discipline"] = c_extra["discipline"] return t + def _rte_content_helper(release: ReleaseEntity) -> dict: """ File/FileSet/WebCapture sub-section of release_to_elasticsearch() @@ -329,9 +356,9 @@ def _rte_content_helper(release: ReleaseEntity) -> dict: - any other URL """ t = dict( - file_count = len(release.files or []), - fileset_count = len(release.filesets or []), - webcapture_count = len(release.webcaptures or []), + file_count=len(release.files or []), + fileset_count=len(release.filesets or []), + webcapture_count=len(release.webcaptures or []), ) any_pdf_url = None @@ -340,38 +367,42 @@ def _rte_content_helper(release: ReleaseEntity) -> dict: ia_pdf_url = None for f in release.files or []: - if f.extra and f.extra.get('shadows'): - t['in_shadows'] = True - is_pdf = 'pdf' in (f.mimetype or '') - for release_url in (f.urls or []): + if f.extra and f.extra.get("shadows"): + t["in_shadows"] = True + is_pdf = "pdf" in (f.mimetype or "") + for release_url in f.urls or []: # first generic flags t.update(_rte_url_helper(release_url)) # then PDF specific stuff (for generating "best URL" fields) - if not f.mimetype and 'pdf' in release_url.url.lower(): + if not f.mimetype and "pdf" in release_url.url.lower(): is_pdf = True if is_pdf: any_pdf_url = release_url.url - if release_url.rel in ('webarchive', 'repository', 'repo'): + if release_url.rel in ("webarchive", "repository", "repo"): good_pdf_url = release_url.url - if '//web.archive.org/' in release_url.url or '//archive.org/' in release_url.url: + if ( + "//web.archive.org/" in release_url.url + or "//archive.org/" in release_url.url + ): best_pdf_url = release_url.url ia_pdf_url = release_url.url # here is where we bake-in PDF url priority; IA-specific - t['best_pdf_url'] = best_pdf_url or good_pdf_url or any_pdf_url - t['ia_pdf_url'] = ia_pdf_url + t["best_pdf_url"] = best_pdf_url or good_pdf_url or any_pdf_url + t["ia_pdf_url"] = ia_pdf_url for fs in release.filesets or []: - for url_obj in (fs.urls or []): + for url_obj in fs.urls or []: t.update(_rte_url_helper(url_obj)) for wc in release.webcaptures or []: - for url_obj in (wc.archive_urls or []): + for url_obj in wc.archive_urls or []: t.update(_rte_url_helper(url_obj)) return t + def _rte_url_helper(url_obj) -> dict: """ Takes a location URL ('url' and 'rel' keys) and returns generic preservation status. @@ -382,17 +413,17 @@ def _rte_url_helper(url_obj) -> dict: these will be iteratively update() into the overal object. """ t = dict() - if url_obj.rel in ('webarchive', 'repository', 'archive', 'repo'): - t['is_preserved'] = True - if '//web.archive.org/' in url_obj.url or '//archive.org/' in url_obj.url: - t['in_ia'] = True - if url_obj.url.lower().startswith('http') or url_obj.url.lower().startswith('ftp'): - t['in_web'] = True - if url_obj.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'): + if url_obj.rel in ("webarchive", "repository", "archive", "repo"): + t["is_preserved"] = True + if "//web.archive.org/" in url_obj.url or "//archive.org/" in url_obj.url: + t["in_ia"] = True + if url_obj.url.lower().startswith("http") or url_obj.url.lower().startswith("ftp"): + t["in_web"] = True + if url_obj.rel in ("dweb", "p2p", "ipfs", "dat", "torrent"): # not sure what rel will be for this stuff - t['in_dweb'] = True - if '//www.jstor.org/' in url_obj.url: - t['in_jstor'] = True + t["in_dweb"] = True + if "//www.jstor.org/" in url_obj.url: + t["in_jstor"] = True return t @@ -404,50 +435,59 @@ def container_to_elasticsearch(entity, force_bool=True, stats=None): Raises exception on error (never returns None) """ - if entity.state in ('redirect', 'deleted'): + if entity.state in ("redirect", "deleted"): return dict( - ident = entity.ident, - state = entity.state, + ident=entity.ident, + state=entity.state, ) - elif entity.state != 'active': + elif entity.state != "active": raise ValueError("Unhandled entity state: {}".format(entity.state)) # First, the easy ones (direct copy) t = dict( - doc_index_ts=datetime.datetime.utcnow().isoformat()+"Z", - ident = entity.ident, - state = entity.state, - revision = entity.revision, - - name = entity.name, - publisher = entity.publisher, - container_type = entity.container_type, - publication_status= entity.publication_status, - issnl = entity.issnl, - issne = entity.issne, - issnp = entity.issnp, - wikidata_qid = entity.wikidata_qid, + doc_index_ts=datetime.datetime.utcnow().isoformat() + "Z", + ident=entity.ident, + state=entity.state, + revision=entity.revision, + name=entity.name, + publisher=entity.publisher, + container_type=entity.container_type, + publication_status=entity.publication_status, + issnl=entity.issnl, + issne=entity.issne, + issnp=entity.issnp, + wikidata_qid=entity.wikidata_qid, ) if not entity.extra: entity.extra = dict() - for key in ('country', 'languages', 'mimetypes', 'original_name', - 'first_year', 'last_year', 'aliases', 'abbrev', 'region', - 'discipline', 'publisher_type'): + for key in ( + "country", + "languages", + "mimetypes", + "original_name", + "first_year", + "last_year", + "aliases", + "abbrev", + "region", + "discipline", + "publisher_type", + ): if entity.extra.get(key): t[key] = entity.extra[key] - if entity.extra.get('dblp') and entity.extra['dblp'].get('prefix'): - t['dblp_prefix'] = entity.extra['dblp']['prefix'] + if entity.extra.get("dblp") and entity.extra["dblp"].get("prefix"): + t["dblp_prefix"] = entity.extra["dblp"]["prefix"] - if 'country' in t: - t['country_code'] = t.pop('country') + if "country" in t: + t["country_code"] = t.pop("country") - t['issns'] = [entity.issnl, entity.issne, entity.issnp] - for key in ('issnp', 'issne'): + t["issns"] = [entity.issnl, entity.issne, entity.issnp] + for key in ("issnp", "issne"): if entity.extra.get(key): - t['issns'].append(entity.extra[key]) - t['issns'] = list(set([i for i in t['issns'] if i])) + t["issns"].append(entity.extra[key]) + t["issns"] = list(set([i for i in t["issns"] if i])) in_doaj = None in_road = None @@ -459,72 +499,72 @@ def container_to_elasticsearch(entity, force_bool=True, stats=None): keepers = [] extra = entity.extra - if extra.get('doaj'): - if extra['doaj'].get('as_of'): + if extra.get("doaj"): + if extra["doaj"].get("as_of"): in_doaj = True - if extra.get('road'): - if extra['road'].get('as_of'): + if extra.get("road"): + if extra["road"].get("as_of"): in_road = True - if extra.get('szczepanski'): - if extra['szczepanski'].get('as_of'): + if extra.get("szczepanski"): + if extra["szczepanski"].get("as_of"): is_oa = True - if extra.get('default_license'): - if extra['default_license'].startswith('CC-'): + if extra.get("default_license"): + if extra["default_license"].startswith("CC-"): is_oa = True - t['sherpa_romeo_color'] = None - if extra.get('sherpa_romeo'): - t['sherpa_romeo_color'] = extra['sherpa_romeo'].get('color') - if extra['sherpa_romeo'].get('color') == 'white': + t["sherpa_romeo_color"] = None + if extra.get("sherpa_romeo"): + t["sherpa_romeo_color"] = extra["sherpa_romeo"].get("color") + if extra["sherpa_romeo"].get("color") == "white": is_oa = False - if extra.get('kbart'): + if extra.get("kbart"): any_kbart = True - if extra['kbart'].get('jstor'): + if extra["kbart"].get("jstor"): any_jstor = True - for k, v in extra['kbart'].items(): + for k, v in extra["kbart"].items(): if v and isinstance(v, dict): keepers.append(k) - if extra.get('ia'): - if extra['ia'].get('sim'): + if extra.get("ia"): + if extra["ia"].get("sim"): any_ia_sim = True - if extra['ia'].get('longtail_oa'): + if extra["ia"].get("longtail_oa"): is_longtail_oa = True - t['is_superceded'] = bool(extra.get('superceded')) + t["is_superceded"] = bool(extra.get("superceded")) - t['keepers'] = keepers - t['in_doaj'] = bool(in_doaj) - t['in_road'] = bool(in_road) - t['any_kbart'] = bool(any_kbart) + t["keepers"] = keepers + t["in_doaj"] = bool(in_doaj) + t["in_road"] = bool(in_road) + t["any_kbart"] = bool(any_kbart) if force_bool: - t['is_oa'] = bool(in_doaj or in_road or is_oa) - t['is_longtail_oa'] = bool(is_longtail_oa) - t['any_jstor'] = bool(any_jstor) - t['any_ia_sim'] = bool(any_ia_sim) + t["is_oa"] = bool(in_doaj or in_road or is_oa) + t["is_longtail_oa"] = bool(is_longtail_oa) + t["any_jstor"] = bool(any_jstor) + t["any_ia_sim"] = bool(any_ia_sim) else: - t['is_oa'] = in_doaj or in_road or is_oa - t['is_longtail_oa'] = is_longtail_oa - t['any_jstor'] = any_jstor - t['any_ia_sim'] = any_ia_sim + t["is_oa"] = in_doaj or in_road or is_oa + t["is_longtail_oa"] = is_longtail_oa + t["any_jstor"] = any_jstor + t["any_ia_sim"] = any_ia_sim # mix in stats, if provided if stats: - t['releases_total'] = stats['total'] - t['preservation_bright'] = stats['preservation']['bright'] - t['preservation_dark'] = stats['preservation']['dark'] - t['preservation_shadows_only'] = stats['preservation']['shadows_only'] - t['preservation_none'] = stats['preservation']['none'] + t["releases_total"] = stats["total"] + t["preservation_bright"] = stats["preservation"]["bright"] + t["preservation_dark"] = stats["preservation"]["dark"] + t["preservation_shadows_only"] = stats["preservation"]["shadows_only"] + t["preservation_none"] = stats["preservation"]["none"] return t def _type_of_edit(edit: EntityEdit) -> str: if edit.revision is None and edit.redirect_ident is None: - return 'delete' + return "delete" elif edit.redirect_ident: # redirect - return 'update' + return "update" elif edit.prev_revision is None and edit.redirect_ident is None and edit.revision: - return 'create' + return "create" else: - return 'update' + return "update" def changelog_to_elasticsearch(entity: ChangelogEntry) -> Dict[str, Any]: @@ -536,7 +576,7 @@ def changelog_to_elasticsearch(entity: ChangelogEntry) -> Dict[str, Any]: editgroup = entity.editgroup t = dict( - doc_index_ts=datetime.datetime.utcnow().isoformat()+"Z", + doc_index_ts=datetime.datetime.utcnow().isoformat() + "Z", index=entity.index, editgroup_id=entity.editgroup_id, timestamp=entity.timestamp.isoformat(), @@ -547,8 +587,8 @@ def changelog_to_elasticsearch(entity: ChangelogEntry) -> Dict[str, Any]: ) extra = editgroup.extra or dict() - if extra.get('agent'): - t['agent'] = extra['agent'] + if extra.get("agent"): + t["agent"] = extra["agent"] containers = [_type_of_edit(e) for e in editgroup.edits.containers] creators = [_type_of_edit(e) for e in editgroup.edits.creators] @@ -558,27 +598,27 @@ def changelog_to_elasticsearch(entity: ChangelogEntry) -> Dict[str, Any]: releases = [_type_of_edit(e) for e in editgroup.edits.releases] works = [_type_of_edit(e) for e in editgroup.edits.works] - t['containers'] = len(containers) - t['new_containers'] = len([e for e in containers if e == 'create']) - t['creators'] = len(creators) - t['new_creators'] = len([e for e in creators if e == 'create']) - t['files'] = len(files) - t['new_files'] = len([e for e in files if e == 'create']) - t['filesets'] = len(filesets) - t['new_filesets'] = len([e for e in filesets if e == 'create']) - t['webcaptures'] = len(webcaptures) - t['new_webcaptures'] = len([e for e in webcaptures if e == 'create']) - t['releases'] = len(releases) - t['new_releases'] = len([e for e in releases if e == 'create']) - t['works'] = len(works) - t['new_works'] = len([e for e in works if e == 'create']) + t["containers"] = len(containers) + t["new_containers"] = len([e for e in containers if e == "create"]) + t["creators"] = len(creators) + t["new_creators"] = len([e for e in creators if e == "create"]) + t["files"] = len(files) + t["new_files"] = len([e for e in files if e == "create"]) + t["filesets"] = len(filesets) + t["new_filesets"] = len([e for e in filesets if e == "create"]) + t["webcaptures"] = len(webcaptures) + t["new_webcaptures"] = len([e for e in webcaptures if e == "create"]) + t["releases"] = len(releases) + t["new_releases"] = len([e for e in releases if e == "create"]) + t["works"] = len(works) + t["new_works"] = len([e for e in works if e == "create"]) all_edits = containers + creators + files + filesets + webcaptures + releases + works - t['created'] = len([e for e in all_edits if e == 'create']) - t['updated'] = len([e for e in all_edits if e == 'update']) - t['deleted'] = len([e for e in all_edits if e == 'delete']) - t['total'] = len(all_edits) + t["created"] = len([e for e in all_edits if e == "create"]) + t["updated"] = len([e for e in all_edits if e == "update"]) + t["deleted"] = len([e for e in all_edits if e == "delete"]) + t["total"] = len(all_edits) return t @@ -590,47 +630,47 @@ def file_to_elasticsearch(entity: FileEntity) -> Dict[str, Any]: Raises exception on error (never returns None) """ - if entity.state in ('redirect', 'deleted'): + if entity.state in ("redirect", "deleted"): return dict( - ident = entity.ident, - state = entity.state, + ident=entity.ident, + state=entity.state, ) - elif entity.state != 'active': + elif entity.state != "active": raise ValueError("Unhandled entity state: {}".format(entity.state)) # First, the easy ones (direct copy) t = dict( - doc_index_ts=datetime.datetime.utcnow().isoformat()+"Z", - ident = entity.ident, - state = entity.state, - revision = entity.revision, - release_ids = entity.release_ids, - release_count = len(entity.release_ids), - mimetype = entity.mimetype, - size_bytes = entity.size, - sha1 = entity.sha1, - sha256 = entity.sha256, - md5 = entity.md5, + doc_index_ts=datetime.datetime.utcnow().isoformat() + "Z", + ident=entity.ident, + state=entity.state, + revision=entity.revision, + release_ids=entity.release_ids, + release_count=len(entity.release_ids), + mimetype=entity.mimetype, + size_bytes=entity.size, + sha1=entity.sha1, + sha256=entity.sha256, + md5=entity.md5, ) parsed_urls = [tldextract.extract(u.url) for u in entity.urls] - t['hosts'] = list(set(['.'.join([seg for seg in pu if seg]) for pu in parsed_urls])) - t['domains'] = list(set([pu.registered_domain for pu in parsed_urls])) - t['rels'] = list(set([u.rel for u in entity.urls])) + t["hosts"] = list(set([".".join([seg for seg in pu if seg]) for pu in parsed_urls])) + t["domains"] = list(set([pu.registered_domain for pu in parsed_urls])) + t["rels"] = list(set([u.rel for u in entity.urls])) - t['in_ia'] = bool('archive.org' in t['domains']) - t['in_ia_petabox'] = bool('archive.org' in t['hosts']) + t["in_ia"] = bool("archive.org" in t["domains"]) + t["in_ia_petabox"] = bool("archive.org" in t["hosts"]) any_url = None good_url = None best_url = None - for release_url in (entity.urls or []): + for release_url in entity.urls or []: any_url = release_url.url - if release_url.rel in ('webarchive', 'repository'): + if release_url.rel in ("webarchive", "repository"): good_url = release_url.url - if '//web.archive.org/' in release_url.url or '//archive.org/' in release_url.url: + if "//web.archive.org/" in release_url.url or "//archive.org/" in release_url.url: best_url = release_url.url # here is where we bake-in priority; IA-specific - t['best_url'] = best_url or good_url or any_url + t["best_url"] = best_url or good_url or any_url return t diff --git a/python/fatcat_tools/transforms/ingest.py b/python/fatcat_tools/transforms/ingest.py index 9101a4ec..30b5b190 100644 --- a/python/fatcat_tools/transforms/ingest.py +++ b/python/fatcat_tools/transforms/ingest.py @@ -1,4 +1,3 @@ - INGEST_TYPE_CONTAINER_MAP = { # Optica "twtpsm6ytje3nhuqfu3pa7ca7u": "html", @@ -14,7 +13,8 @@ INGEST_TYPE_CONTAINER_MAP = { "lovwr7ladjagzkhmoaszg7efqu": "html", } -def release_ingest_request(release, ingest_request_source='fatcat', ingest_type=None): + +def release_ingest_request(release, ingest_request_source="fatcat", ingest_type=None): """ Takes a full release entity object and returns an ingest request (as dict), or None if it seems like this release shouldn't be ingested. @@ -27,27 +27,35 @@ def release_ingest_request(release, ingest_request_source='fatcat', ingest_type= calling code should check the returned type field. """ - if release.state != 'active': + if release.state != "active": return None if (not ingest_type) and release.container_id: ingest_type = INGEST_TYPE_CONTAINER_MAP.get(release.container_id) if not ingest_type: - if release.release_type == 'stub': + if release.release_type == "stub": return None - elif release.release_type in ['component', 'graphic']: - ingest_type = 'component' - elif release.release_type == 'dataset': - ingest_type = 'dataset' - elif release.release_type == 'software': - ingest_type = 'software' - elif release.release_type == 'post-weblog': - ingest_type = 'html' - elif release.release_type in ['article-journal', 'article', 'chapter', 'paper-conference', 'book', 'report', 'thesis']: - ingest_type = 'pdf' + elif release.release_type in ["component", "graphic"]: + ingest_type = "component" + elif release.release_type == "dataset": + ingest_type = "dataset" + elif release.release_type == "software": + ingest_type = "software" + elif release.release_type == "post-weblog": + ingest_type = "html" + elif release.release_type in [ + "article-journal", + "article", + "chapter", + "paper-conference", + "book", + "report", + "thesis", + ]: + ingest_type = "pdf" else: - ingest_type = 'pdf' + ingest_type = "pdf" # generate a URL where we expect to find fulltext url = None @@ -59,8 +67,10 @@ def release_ingest_request(release, ingest_request_source='fatcat', ingest_type= link_source_id = release.ext_ids.arxiv elif release.ext_ids.pmcid and ingest_type == "pdf": # TODO: how to tell if an author manuscript in PMC vs. published? - #url = "https://www.ncbi.nlm.nih.gov/pmc/articles/{}/pdf/".format(release.ext_ids.pmcid) - url = "http://europepmc.org/backend/ptpmcrender.fcgi?accid={}&blobtype=pdf".format(release.ext_ids.pmcid) + # url = "https://www.ncbi.nlm.nih.gov/pmc/articles/{}/pdf/".format(release.ext_ids.pmcid) + url = "http://europepmc.org/backend/ptpmcrender.fcgi?accid={}&blobtype=pdf".format( + release.ext_ids.pmcid + ) link_source = "pmc" link_source_id = release.ext_ids.pmcid elif release.ext_ids.doi: @@ -75,19 +85,19 @@ def release_ingest_request(release, ingest_request_source='fatcat', ingest_type= ext_ids = dict([(k, v) for (k, v) in ext_ids.items() if v]) ingest_request = { - 'ingest_type': ingest_type, - 'ingest_request_source': ingest_request_source, - 'base_url': url, - 'release_stage': release.release_stage, - 'fatcat': { - 'release_ident': release.ident, - 'work_ident': release.work_id, + "ingest_type": ingest_type, + "ingest_request_source": ingest_request_source, + "base_url": url, + "release_stage": release.release_stage, + "fatcat": { + "release_ident": release.ident, + "work_ident": release.work_id, }, - 'ext_ids': ext_ids, + "ext_ids": ext_ids, } if link_source and link_source_id: - ingest_request['link_source'] = link_source - ingest_request['link_source_id'] = link_source_id + ingest_request["link_source"] = link_source + ingest_request["link_source_id"] = link_source_id return ingest_request diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py index a61e364c..1e4cb41d 100644 --- a/python/fatcat_tools/workers/changelog.py +++ b/python/fatcat_tools/workers/changelog.py @@ -1,4 +1,3 @@ - import json import time @@ -16,11 +15,9 @@ class ChangelogWorker(FatcatWorker): """ def __init__(self, api, kafka_hosts, produce_topic, poll_interval=10.0, offset=None): - super().__init__(kafka_hosts=kafka_hosts, - produce_topic=produce_topic, - api=api) + super().__init__(kafka_hosts=kafka_hosts, produce_topic=produce_topic, api=api) self.poll_interval = poll_interval - self.offset = offset # the fatcat changelog offset, not the kafka offset + self.offset = offset # the fatcat changelog offset, not the kafka offset def run(self): @@ -31,7 +28,7 @@ class ChangelogWorker(FatcatWorker): print("Checking for most recent changelog offset...") msg = most_recent_message(self.produce_topic, self.kafka_config) if msg: - self.offset = json.loads(msg.decode('utf-8'))['index'] + self.offset = json.loads(msg.decode("utf-8"))["index"] else: self.offset = 0 print("Most recent changelog index in Kafka seems to be {}".format(self.offset)) @@ -44,28 +41,29 @@ class ChangelogWorker(FatcatWorker): raise KafkaException(err) producer_conf = self.kafka_config.copy() - producer_conf.update({ - 'delivery.report.only.error': True, - 'default.topic.config': { - 'request.required.acks': -1, # all brokers must confirm - }, - }) + producer_conf.update( + { + "delivery.report.only.error": True, + "default.topic.config": { + "request.required.acks": -1, # all brokers must confirm + }, + } + ) producer = Producer(producer_conf) while True: latest = int(self.api.get_changelog(limit=1)[0].index) if latest > self.offset: - print("Fetching changelogs from {} through {}".format( - self.offset+1, latest)) - for i in range(self.offset+1, latest+1): + print("Fetching changelogs from {} through {}".format(self.offset + 1, latest)) + for i in range(self.offset + 1, latest + 1): cle = self.api.get_changelog_entry(i) obj = self.api.api_client.sanitize_for_serialization(cle) producer.produce( self.produce_topic, - json.dumps(obj).encode('utf-8'), + json.dumps(obj).encode("utf-8"), key=str(i), on_delivery=fail_fast, - #NOTE timestamp could be timestamp=cle.timestamp (?) + # NOTE timestamp could be timestamp=cle.timestamp (?) ) self.offset = i producer.flush() @@ -79,12 +77,19 @@ class EntityUpdatesWorker(FatcatWorker): from API) to update topics. """ - def __init__(self, api, kafka_hosts, consume_topic, release_topic, - file_topic, container_topic, ingest_file_request_topic, - work_ident_topic, poll_interval=5.0): - super().__init__(kafka_hosts=kafka_hosts, - consume_topic=consume_topic, - api=api) + def __init__( + self, + api, + kafka_hosts, + consume_topic, + release_topic, + file_topic, + container_topic, + ingest_file_request_topic, + work_ident_topic, + poll_interval=5.0, + ): + super().__init__(kafka_hosts=kafka_hosts, consume_topic=consume_topic, api=api) self.release_topic = release_topic self.file_topic = file_topic self.container_topic = container_topic @@ -150,7 +155,7 @@ class EntityUpdatesWorker(FatcatWorker): # Transactions of the Japan Society of Mechanical Engineers "10.1299/kikai", # protocols.io - "10.17504/" + "10.17504/", ] def want_live_ingest(self, release, ingest_request): @@ -163,40 +168,40 @@ class EntityUpdatesWorker(FatcatWorker): ingest crawling (via wayback SPN). """ - link_source = ingest_request.get('ingest_request') - ingest_type = ingest_request.get('ingest_type') - doi = ingest_request.get('ext_ids', {}).get('doi') + link_source = ingest_request.get("ingest_request") + ingest_type = ingest_request.get("ingest_type") + doi = ingest_request.get("ext_ids", {}).get("doi") es = release_to_elasticsearch(release) is_document = release.release_type in ( - 'article', - 'article-journal', - 'article-newspaper', - 'book', - 'chapter', - 'editorial', - 'interview', - 'legal_case', - 'legislation', - 'letter', - 'manuscript', - 'paper-conference', - 'patent', - 'peer_review', - 'post', - 'report', - 'retraction', - 'review', - 'review-book', - 'thesis', + "article", + "article-journal", + "article-newspaper", + "book", + "chapter", + "editorial", + "interview", + "legal_case", + "legislation", + "letter", + "manuscript", + "paper-conference", + "patent", + "peer_review", + "post", + "report", + "retraction", + "review", + "review-book", + "thesis", ) is_not_pdf = release.release_type in ( - 'component', - 'dataset', - 'figure', - 'graphic', - 'software', - 'stub', + "component", + "dataset", + "figure", + "graphic", + "software", + "stub", ) # accept list sets a default "crawl it" despite OA metadata for @@ -207,19 +212,23 @@ class EntityUpdatesWorker(FatcatWorker): if doi.startswith(prefix): in_acceptlist = True - if self.ingest_oa_only and link_source not in ('arxiv', 'pmc'): + if self.ingest_oa_only and link_source not in ("arxiv", "pmc"): # most datacite documents are in IRs and should be crawled is_datacite_doc = False - if release.extra and ('datacite' in release.extra) and is_document: + if release.extra and ("datacite" in release.extra) and is_document: is_datacite_doc = True - if not (es['is_oa'] or in_acceptlist or is_datacite_doc): + if not (es["is_oa"] or in_acceptlist or is_datacite_doc): return False # big publishers *generally* have accurate OA metadata, use # preservation networks, and block our crawlers. So unless OA, or # explicitly on accept list, or not preserved, skip crawling - if es.get('publisher_type') == 'big5' and es.get('is_preserved') and not (es['is_oa'] or in_acceptlist): + if ( + es.get("publisher_type") == "big5" + and es.get("is_preserved") + and not (es["is_oa"] or in_acceptlist) + ): return False # if ingest_type is pdf but release_type is almost certainly not a PDF, @@ -233,23 +242,24 @@ class EntityUpdatesWorker(FatcatWorker): return False # figshare - if doi and (doi.startswith('10.6084/') or doi.startswith('10.25384/')): + if doi and (doi.startswith("10.6084/") or doi.startswith("10.25384/")): # don't crawl "most recent version" (aka "group") DOIs if not release.version: return False # zenodo - if doi and doi.startswith('10.5281/'): + if doi and doi.startswith("10.5281/"): # if this is a "grouping" DOI of multiple "version" DOIs, do not crawl (will crawl the versioned DOIs) - if release.extra and release.extra.get('relations'): - for rel in release.extra['relations']: - if (rel.get('relationType') == 'HasVersion' and rel.get('relatedIdentifier', '').startswith('10.5281/')): + if release.extra and release.extra.get("relations"): + for rel in release.extra["relations"]: + if rel.get("relationType") == "HasVersion" and rel.get( + "relatedIdentifier", "" + ).startswith("10.5281/"): return False return True def run(self): - def fail_fast(err, msg): if err is not None: print("Kafka producer delivery error: {}".format(err)) @@ -278,36 +288,40 @@ class EntityUpdatesWorker(FatcatWorker): for p in partitions: if p.error: raise KafkaException(p.error) - print("Kafka partitions rebalanced: {} / {}".format( - consumer, partitions)) + print("Kafka partitions rebalanced: {} / {}".format(consumer, partitions)) consumer_conf = self.kafka_config.copy() - consumer_conf.update({ - 'group.id': self.consumer_group, - 'on_commit': fail_fast, - # messages don't have offset marked as stored until pushed to - # elastic, but we do auto-commit stored offsets to broker - 'enable.auto.commit': True, - 'enable.auto.offset.store': False, - # user code timeout; if no poll after this long, assume user code - # hung and rebalance (default: 5min) - 'max.poll.interval.ms': 180000, - 'default.topic.config': { - 'auto.offset.reset': 'latest', - }, - }) + consumer_conf.update( + { + "group.id": self.consumer_group, + "on_commit": fail_fast, + # messages don't have offset marked as stored until pushed to + # elastic, but we do auto-commit stored offsets to broker + "enable.auto.commit": True, + "enable.auto.offset.store": False, + # user code timeout; if no poll after this long, assume user code + # hung and rebalance (default: 5min) + "max.poll.interval.ms": 180000, + "default.topic.config": { + "auto.offset.reset": "latest", + }, + } + ) consumer = Consumer(consumer_conf) producer_conf = self.kafka_config.copy() - producer_conf.update({ - 'delivery.report.only.error': True, - 'default.topic.config': { - 'request.required.acks': -1, # all brokers must confirm - }, - }) + producer_conf.update( + { + "delivery.report.only.error": True, + "default.topic.config": { + "request.required.acks": -1, # all brokers must confirm + }, + } + ) producer = Producer(producer_conf) - consumer.subscribe([self.consume_topic], + consumer.subscribe( + [self.consume_topic], on_assign=on_rebalance, on_revoke=on_rebalance, ) @@ -316,14 +330,16 @@ class EntityUpdatesWorker(FatcatWorker): while True: msg = consumer.poll(self.poll_interval) if not msg: - print("nothing new from kafka (poll_interval: {} sec)".format(self.poll_interval)) + print( + "nothing new from kafka (poll_interval: {} sec)".format(self.poll_interval) + ) continue if msg.error(): raise KafkaException(msg.error()) - cle = json.loads(msg.value().decode('utf-8')) - #print(cle) - print("processing changelog index {}".format(cle['index'])) + cle = json.loads(msg.value().decode("utf-8")) + # print(cle) + print("processing changelog index {}".format(cle["index"])) release_ids = [] new_release_ids = [] file_ids = [] @@ -331,27 +347,27 @@ class EntityUpdatesWorker(FatcatWorker): webcapture_ids = [] container_ids = [] work_ids = [] - release_edits = cle['editgroup']['edits']['releases'] + release_edits = cle["editgroup"]["edits"]["releases"] for re in release_edits: - release_ids.append(re['ident']) + release_ids.append(re["ident"]) # filter to direct release edits which are not updates - if not re.get('prev_revision') and not re.get('redirect_ident'): - new_release_ids.append(re['ident']) - file_edits = cle['editgroup']['edits']['files'] + if not re.get("prev_revision") and not re.get("redirect_ident"): + new_release_ids.append(re["ident"]) + file_edits = cle["editgroup"]["edits"]["files"] for e in file_edits: - file_ids.append(e['ident']) - fileset_edits = cle['editgroup']['edits']['filesets'] + file_ids.append(e["ident"]) + fileset_edits = cle["editgroup"]["edits"]["filesets"] for e in fileset_edits: - fileset_ids.append(e['ident']) - webcapture_edits = cle['editgroup']['edits']['webcaptures'] + fileset_ids.append(e["ident"]) + webcapture_edits = cle["editgroup"]["edits"]["webcaptures"] for e in webcapture_edits: - webcapture_ids.append(e['ident']) - container_edits = cle['editgroup']['edits']['containers'] + webcapture_ids.append(e["ident"]) + container_edits = cle["editgroup"]["edits"]["containers"] for e in container_edits: - container_ids.append(e['ident']) - work_edits = cle['editgroup']['edits']['works'] + container_ids.append(e["ident"]) + work_edits = cle["editgroup"]["edits"]["works"] for e in work_edits: - work_ids.append(e['ident']) + work_ids.append(e["ident"]) # TODO: do these fetches in parallel using a thread pool? for ident in set(file_ids): @@ -363,8 +379,8 @@ class EntityUpdatesWorker(FatcatWorker): file_dict = self.api.api_client.sanitize_for_serialization(file_entity) producer.produce( self.file_topic, - json.dumps(file_dict).encode('utf-8'), - key=ident.encode('utf-8'), + json.dumps(file_dict).encode("utf-8"), + key=ident.encode("utf-8"), on_delivery=fail_fast, ) @@ -385,30 +401,34 @@ class EntityUpdatesWorker(FatcatWorker): container_dict = self.api.api_client.sanitize_for_serialization(container) producer.produce( self.container_topic, - json.dumps(container_dict).encode('utf-8'), - key=ident.encode('utf-8'), + json.dumps(container_dict).encode("utf-8"), + key=ident.encode("utf-8"), on_delivery=fail_fast, ) for ident in set(release_ids): - release = self.api.get_release(ident, expand="files,filesets,webcaptures,container") + release = self.api.get_release( + ident, expand="files,filesets,webcaptures,container" + ) if release.work_id: work_ids.append(release.work_id) release_dict = self.api.api_client.sanitize_for_serialization(release) producer.produce( self.release_topic, - json.dumps(release_dict).encode('utf-8'), - key=ident.encode('utf-8'), + json.dumps(release_dict).encode("utf-8"), + key=ident.encode("utf-8"), on_delivery=fail_fast, ) # for ingest requests, filter to "new" active releases with no matched files if release.ident in new_release_ids: - ir = release_ingest_request(release, ingest_request_source='fatcat-changelog') + ir = release_ingest_request( + release, ingest_request_source="fatcat-changelog" + ) if ir and not release.files and self.want_live_ingest(release, ir): producer.produce( self.ingest_file_request_topic, - json.dumps(ir).encode('utf-8'), - #key=None, + json.dumps(ir).encode("utf-8"), + # key=None, on_delivery=fail_fast, ) @@ -420,13 +440,13 @@ class EntityUpdatesWorker(FatcatWorker): key=key, type="fatcat_work", work_ident=ident, - updated=cle['timestamp'], - fatcat_changelog_index=cle['index'], + updated=cle["timestamp"], + fatcat_changelog_index=cle["index"], ) producer.produce( self.work_ident_topic, - json.dumps(work_ident_dict).encode('utf-8'), - key=key.encode('utf-8'), + json.dumps(work_ident_dict).encode("utf-8"), + key=key.encode("utf-8"), on_delivery=fail_fast, ) diff --git a/python/fatcat_tools/workers/elasticsearch.py b/python/fatcat_tools/workers/elasticsearch.py index f411073d..0d75f964 100644 --- a/python/fatcat_tools/workers/elasticsearch.py +++ b/python/fatcat_tools/workers/elasticsearch.py @@ -1,4 +1,3 @@ - import json import sys @@ -26,12 +25,20 @@ class ElasticsearchReleaseWorker(FatcatWorker): Uses a consumer group to manage offset. """ - def __init__(self, kafka_hosts, consume_topic, poll_interval=10.0, offset=None, - elasticsearch_backend="http://localhost:9200", elasticsearch_index="fatcat", - elasticsearch_release_index="fatcat_releases", - batch_size=200, api_host="https://api.fatcat.wiki/v0", query_stats=False): - super().__init__(kafka_hosts=kafka_hosts, - consume_topic=consume_topic) + def __init__( + self, + kafka_hosts, + consume_topic, + poll_interval=10.0, + offset=None, + elasticsearch_backend="http://localhost:9200", + elasticsearch_index="fatcat", + elasticsearch_release_index="fatcat_releases", + batch_size=200, + api_host="https://api.fatcat.wiki/v0", + query_stats=False, + ): + super().__init__(kafka_hosts=kafka_hosts, consume_topic=consume_topic) self.consumer_group = "elasticsearch-updates3" self.batch_size = batch_size self.poll_interval = poll_interval @@ -63,45 +70,53 @@ class ElasticsearchReleaseWorker(FatcatWorker): print("Bailing out...", file=sys.stderr) # TODO: should it be sys.exit(-1)? raise KafkaException(p.error) - #print("Kafka consumer commit successful") + # print("Kafka consumer commit successful") pass def on_rebalance(consumer, partitions): for p in partitions: if p.error: raise KafkaException(p.error) - print("Kafka partitions rebalanced: {} / {}".format( - consumer, partitions), file=sys.stderr) + print( + "Kafka partitions rebalanced: {} / {}".format(consumer, partitions), + file=sys.stderr, + ) consumer_conf = self.kafka_config.copy() - consumer_conf.update({ - 'group.id': self.consumer_group, - 'on_commit': fail_fast, - # messages don't have offset marked as stored until pushed to - # elastic, but we do auto-commit stored offsets to broker - 'enable.auto.commit': True, - 'enable.auto.offset.store': False, - # user code timeout; if no poll after this long, assume user code - # hung and rebalance (default: 5min) - 'max.poll.interval.ms': 60000, - 'default.topic.config': { - 'auto.offset.reset': 'latest', - }, - }) + consumer_conf.update( + { + "group.id": self.consumer_group, + "on_commit": fail_fast, + # messages don't have offset marked as stored until pushed to + # elastic, but we do auto-commit stored offsets to broker + "enable.auto.commit": True, + "enable.auto.offset.store": False, + # user code timeout; if no poll after this long, assume user code + # hung and rebalance (default: 5min) + "max.poll.interval.ms": 60000, + "default.topic.config": { + "auto.offset.reset": "latest", + }, + } + ) consumer = Consumer(consumer_conf) - consumer.subscribe([self.consume_topic], + consumer.subscribe( + [self.consume_topic], on_assign=on_rebalance, on_revoke=on_rebalance, ) while True: - batch = consumer.consume( - num_messages=self.batch_size, - timeout=self.poll_interval) + batch = consumer.consume(num_messages=self.batch_size, timeout=self.poll_interval) if not batch: if not consumer.assignment(): print("... no Kafka consumer partitions assigned yet", file=sys.stderr) - print("... nothing new from kafka, try again (interval: {}".format(self.poll_interval), file=sys.stderr) + print( + "... nothing new from kafka, try again (interval: {}".format( + self.poll_interval + ), + file=sys.stderr, + ) continue print("... got {} kafka messages".format(len(batch)), file=sys.stderr) # first check errors on entire batch... @@ -111,19 +126,24 @@ class ElasticsearchReleaseWorker(FatcatWorker): # ... then process bulk_actions = [] for msg in batch: - json_str = msg.value().decode('utf-8') + json_str = msg.value().decode("utf-8") entity = entity_from_json(json_str, self.entity_type, api_client=ac) assert isinstance(entity, self.entity_type) if self.entity_type == ChangelogEntry: key = entity.index # might need to fetch from API - if not (entity.editgroup and entity.editgroup.editor): # pylint: disable=no-member # (TODO) + if not ( + entity.editgroup and entity.editgroup.editor + ): # pylint: disable=no-member # (TODO) entity = api.get_changelog_entry(entity.index) else: key = entity.ident # pylint: disable=no-member # (TODO) - if self.entity_type != ChangelogEntry and entity.state == 'wip': - print(f"WARNING: skipping state=wip entity: {self.entity_type.__name__} {entity.ident}", file=sys.stderr) + if self.entity_type != ChangelogEntry and entity.state == "wip": + print( + f"WARNING: skipping state=wip entity: {self.entity_type.__name__} {entity.ident}", + file=sys.stderr, + ) continue if self.entity_type == ContainerEntity and self.query_stats: @@ -138,9 +158,15 @@ class ElasticsearchReleaseWorker(FatcatWorker): doc_dict = self.transform_func(entity) # TODO: handle deletions from index - bulk_actions.append(json.dumps({ - "index": { "_id": key, }, - })) + bulk_actions.append( + json.dumps( + { + "index": { + "_id": key, + }, + } + ) + ) bulk_actions.append(json.dumps(doc_dict)) # if only WIP entities, then skip @@ -149,15 +175,22 @@ class ElasticsearchReleaseWorker(FatcatWorker): consumer.store_offsets(message=msg) continue - print("Upserting, eg, {} (of {} {} in elasticsearch)".format(key, len(batch), self.entity_type.__name__), file=sys.stderr) + print( + "Upserting, eg, {} (of {} {} in elasticsearch)".format( + key, len(batch), self.entity_type.__name__ + ), + file=sys.stderr, + ) elasticsearch_endpoint = "{}/{}/_bulk".format( - self.elasticsearch_backend, - self.elasticsearch_index) - resp = requests.post(elasticsearch_endpoint, + self.elasticsearch_backend, self.elasticsearch_index + ) + resp = requests.post( + elasticsearch_endpoint, headers={"Content-Type": "application/x-ndjson"}, - data="\n".join(bulk_actions) + "\n") + data="\n".join(bulk_actions) + "\n", + ) resp.raise_for_status() - if resp.json()['errors']: + if resp.json()["errors"]: desc = "Elasticsearch errors from post to {}:".format(elasticsearch_endpoint) print(desc, file=sys.stderr) print(resp.content, file=sys.stderr) @@ -169,20 +202,29 @@ class ElasticsearchReleaseWorker(FatcatWorker): class ElasticsearchContainerWorker(ElasticsearchReleaseWorker): - - def __init__(self, kafka_hosts, consume_topic, poll_interval=10.0, offset=None, - query_stats=False, elasticsearch_release_index="fatcat_release", - elasticsearch_backend="http://localhost:9200", elasticsearch_index="fatcat", - batch_size=200): - super().__init__(kafka_hosts=kafka_hosts, - consume_topic=consume_topic, - poll_interval=poll_interval, - offset=offset, - elasticsearch_backend=elasticsearch_backend, - elasticsearch_index=elasticsearch_index, - elasticsearch_release_index=elasticsearch_release_index, - query_stats=query_stats, - batch_size=batch_size) + def __init__( + self, + kafka_hosts, + consume_topic, + poll_interval=10.0, + offset=None, + query_stats=False, + elasticsearch_release_index="fatcat_release", + elasticsearch_backend="http://localhost:9200", + elasticsearch_index="fatcat", + batch_size=200, + ): + super().__init__( + kafka_hosts=kafka_hosts, + consume_topic=consume_topic, + poll_interval=poll_interval, + offset=offset, + elasticsearch_backend=elasticsearch_backend, + elasticsearch_index=elasticsearch_index, + elasticsearch_release_index=elasticsearch_release_index, + query_stats=query_stats, + batch_size=batch_size, + ) # previous group got corrupted (by pykafka library?) self.consumer_group = "elasticsearch-updates3" self.entity_type = ContainerEntity @@ -196,11 +238,18 @@ class ElasticsearchChangelogWorker(ElasticsearchReleaseWorker): Note: Very early versions of changelog entries did not contain details about the editor or extra fields. """ - def __init__(self, kafka_hosts, consume_topic, poll_interval=10.0, offset=None, - elasticsearch_backend="http://localhost:9200", elasticsearch_index="fatcat_changelog", - batch_size=200): - super().__init__(kafka_hosts=kafka_hosts, - consume_topic=consume_topic) + + def __init__( + self, + kafka_hosts, + consume_topic, + poll_interval=10.0, + offset=None, + elasticsearch_backend="http://localhost:9200", + elasticsearch_index="fatcat_changelog", + batch_size=200, + ): + super().__init__(kafka_hosts=kafka_hosts, consume_topic=consume_topic) self.consumer_group = "elasticsearch-updates3" self.batch_size = batch_size self.poll_interval = poll_interval diff --git a/python/fatcat_tools/workers/worker_common.py b/python/fatcat_tools/workers/worker_common.py index 8c2936be..baec44f4 100644 --- a/python/fatcat_tools/workers/worker_common.py +++ b/python/fatcat_tools/workers/worker_common.py @@ -1,4 +1,3 @@ - from confluent_kafka import Consumer, KafkaException, TopicPartition @@ -13,22 +12,21 @@ def most_recent_message(topic, kafka_config): print("Fetching most Kafka message from {}".format(topic)) conf = kafka_config.copy() - conf.update({ - 'group.id': 'worker-init-last-msg', # should never commit - 'delivery.report.only.error': True, - 'enable.auto.commit': False, - 'default.topic.config': { - 'request.required.acks': -1, - 'auto.offset.reset': 'latest', - }, - }) + conf.update( + { + "group.id": "worker-init-last-msg", # should never commit + "delivery.report.only.error": True, + "enable.auto.commit": False, + "default.topic.config": { + "request.required.acks": -1, + "auto.offset.reset": "latest", + }, + } + ) consumer = Consumer(conf) - hwm = consumer.get_watermark_offsets( - TopicPartition(topic, 0), - timeout=5.0, - cached=False) + hwm = consumer.get_watermark_offsets(TopicPartition(topic, 0), timeout=5.0, cached=False) if not hwm: raise Exception("Kafka consumer timeout, or topic {} doesn't exist".format(topic)) print("High watermarks: {}".format(hwm)) @@ -37,7 +35,7 @@ def most_recent_message(topic, kafka_config): print("topic is new; not 'most recent message'") return None - consumer.assign([TopicPartition(topic, 0, hwm[1]-1)]) + consumer.assign([TopicPartition(topic, 0, hwm[1] - 1)]) msg = consumer.poll(2.0) consumer.close() if not msg: @@ -56,8 +54,8 @@ class FatcatWorker: if api: self.api = api self.kafka_config = { - 'bootstrap.servers': kafka_hosts, - 'message.max.bytes': 20000000, # ~20 MBytes; broker-side max is ~50 MBytes + "bootstrap.servers": kafka_hosts, + "message.max.bytes": 20000000, # ~20 MBytes; broker-side max is ~50 MBytes } self.produce_topic = produce_topic self.consume_topic = consume_topic |