path: root/python
author     Bryan Newbold <bnewbold@robocracy.org>  2021-11-02 18:14:59 -0700
committer  Bryan Newbold <bnewbold@robocracy.org>  2021-11-02 18:14:59 -0700
commit     31d1a6a713d177990609767d508209ced19ca396 (patch)
tree       a628a57bdb373669394a6b520102b1b4b5ffe7da /python
parent     9dc891b8098542bb089c8c47098b60a8beb76a53 (diff)
download   fatcat-31d1a6a713d177990609767d508209ced19ca396.tar.gz
           fatcat-31d1a6a713d177990609767d508209ced19ca396.zip
fmt (black): fatcat_tools/
Diffstat (limited to 'python')
-rw-r--r--  python/fatcat_tools/__init__.py  1
-rw-r--r--  python/fatcat_tools/api_auth.py  7
-rw-r--r--  python/fatcat_tools/cleanups/__init__.py  1
-rw-r--r--  python/fatcat_tools/cleanups/common.py  45
-rw-r--r--  python/fatcat_tools/cleanups/files.py  38
-rw-r--r--  python/fatcat_tools/fcid.py  9
-rw-r--r--  python/fatcat_tools/harvest/doi_registrars.py  145
-rw-r--r--  python/fatcat_tools/harvest/harvest_common.py  83
-rw-r--r--  python/fatcat_tools/harvest/oaipmh.py  57
-rw-r--r--  python/fatcat_tools/harvest/pubmed.py  159
-rw-r--r--  python/fatcat_tools/importers/arabesque.py  113
-rw-r--r--  python/fatcat_tools/importers/arxiv.py  210
-rwxr-xr-x  python/fatcat_tools/importers/cdl_dash_dat.py  119
-rw-r--r--  python/fatcat_tools/importers/chocula.py  149
-rw-r--r--  python/fatcat_tools/importers/common.py  198
-rw-r--r--  python/fatcat_tools/importers/crossref.py  413
-rw-r--r--  python/fatcat_tools/importers/datacite.py  824
-rw-r--r--  python/fatcat_tools/importers/dblp_container.py  81
-rw-r--r--  python/fatcat_tools/importers/dblp_release.py  257
-rw-r--r--  python/fatcat_tools/importers/doaj_article.py  178
-rw-r--r--  python/fatcat_tools/importers/file_meta.py  36
-rw-r--r--  python/fatcat_tools/importers/fileset_generic.py  55
-rw-r--r--  python/fatcat_tools/importers/grobid_metadata.py  136
-rw-r--r--  python/fatcat_tools/importers/ingest.py  693
-rw-r--r--  python/fatcat_tools/importers/jalc.py  193
-rw-r--r--  python/fatcat_tools/importers/journal_metadata.py  111
-rw-r--r--  python/fatcat_tools/importers/jstor.py  140
-rw-r--r--  python/fatcat_tools/importers/matched.py  103
-rw-r--r--  python/fatcat_tools/importers/orcid.py  50
-rw-r--r--  python/fatcat_tools/importers/pubmed.py  355
-rw-r--r--  python/fatcat_tools/importers/shadow.py  113
-rwxr-xr-x  python/fatcat_tools/importers/wayback_static.py  166
-rw-r--r--  python/fatcat_tools/kafka.py  12
-rw-r--r--  python/fatcat_tools/normal.py  301
-rw-r--r--  python/fatcat_tools/references.py  182
-rw-r--r--  python/fatcat_tools/reviewers/review_common.py  61
-rw-r--r--  python/fatcat_tools/transforms/access.py  44
-rw-r--r--  python/fatcat_tools/transforms/csl.py  185
-rw-r--r--  python/fatcat_tools/transforms/elasticsearch.py  668
-rw-r--r--  python/fatcat_tools/transforms/ingest.py  64
-rw-r--r--  python/fatcat_tools/workers/changelog.py  256
-rw-r--r--  python/fatcat_tools/workers/elasticsearch.py  171
-rw-r--r--  python/fatcat_tools/workers/worker_common.py  32
43 files changed, 4020 insertions, 3194 deletions
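Note: the commit message and the uniform shape of the hunks below indicate a mechanical reformat of python/fatcat_tools/ with black, with no intended behavior changes. As a hedged illustration of the kind of rewrite involved (assuming black's default 88-character line length and double-quote normalization; the function and names here are invented for illustration, not taken from the commit):

# Illustrative sketch only, not part of the commit.

# Before formatting: single quotes, long call squeezed onto one line.
def get_greeting(config, name=None):
    return config.get('greeting_template', 'Hello, {}! Welcome to the fatcat tools.').format(name or 'anonymous user')

# After formatting: quotes normalized to double quotes, the long call wrapped
# with a hanging indent so each line fits the default 88-character limit.
def get_greeting_formatted(config, name=None):
    return config.get(
        "greeting_template", "Hello, {}! Welcome to the fatcat tools."
    ).format(name or "anonymous user")

The hunks that follow repeat the same two patterns throughout: single quotes replaced with double quotes, and long calls or literals re-wrapped with hanging indents and trailing commas.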
diff --git a/python/fatcat_tools/__init__.py b/python/fatcat_tools/__init__.py
index ec38a17b..6f9ee7d8 100644
--- a/python/fatcat_tools/__init__.py
+++ b/python/fatcat_tools/__init__.py
@@ -1,4 +1,3 @@
-
from .api_auth import authenticated_api, public_api
from .fcid import fcid2uuid, uuid2fcid
from .kafka import kafka_fail_fast, simple_kafka_producer
diff --git a/python/fatcat_tools/api_auth.py b/python/fatcat_tools/api_auth.py
index bbf059c0..d8f0c46d 100644
--- a/python/fatcat_tools/api_auth.py
+++ b/python/fatcat_tools/api_auth.py
@@ -1,4 +1,3 @@
-
import os
import sys
@@ -15,6 +14,7 @@ def public_api(host_uri):
conf.host = host_uri
return fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(conf))
+
def authenticated_api(host_uri, token=None):
"""
Note: if this helper is called, it's implied that an actual API connection
@@ -24,10 +24,11 @@ def authenticated_api(host_uri, token=None):
conf = fatcat_openapi_client.Configuration()
conf.host = host_uri
if not token:
- token = os.environ['FATCAT_API_AUTH_TOKEN']
+ token = os.environ["FATCAT_API_AUTH_TOKEN"]
if not token:
sys.stderr.write(
- 'This client requires a fatcat API token (eg, in env var FATCAT_API_AUTH_TOKEN)\n')
+ "This client requires a fatcat API token (eg, in env var FATCAT_API_AUTH_TOKEN)\n"
+ )
sys.exit(-1)
conf.api_key["Authorization"] = token
diff --git a/python/fatcat_tools/cleanups/__init__.py b/python/fatcat_tools/cleanups/__init__.py
index 587c7b9b..0aeec977 100644
--- a/python/fatcat_tools/cleanups/__init__.py
+++ b/python/fatcat_tools/cleanups/__init__.py
@@ -1,3 +1,2 @@
-
from .common import EntityCleaner
from .files import FileCleaner
diff --git a/python/fatcat_tools/cleanups/common.py b/python/fatcat_tools/cleanups/common.py
index d0fcc761..26ca7bd6 100644
--- a/python/fatcat_tools/cleanups/common.py
+++ b/python/fatcat_tools/cleanups/common.py
@@ -1,4 +1,3 @@
-
import copy
import json
import subprocess
@@ -30,16 +29,19 @@ class EntityCleaner:
def __init__(self, api, entity_type, **kwargs):
- eg_extra = kwargs.get('editgroup_extra', dict())
- eg_extra['git_rev'] = eg_extra.get('git_rev',
- subprocess.check_output(["git", "describe", "--always"]).strip()).decode('utf-8')
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.EntityCleaner')
+ eg_extra = kwargs.get("editgroup_extra", dict())
+ eg_extra["git_rev"] = eg_extra.get(
+ "git_rev", subprocess.check_output(["git", "describe", "--always"]).strip()
+ ).decode("utf-8")
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.EntityCleaner")
self.api = api
self.entity_type = entity_type
- self.dry_run_mode = kwargs.get('dry_run_mode', True)
- self.edit_batch_size = kwargs.get('edit_batch_size', 50)
- self.editgroup_description = kwargs.get('editgroup_description', "Generic Entity Cleaner Bot")
+ self.dry_run_mode = kwargs.get("dry_run_mode", True)
+ self.edit_batch_size = kwargs.get("edit_batch_size", 50)
+ self.editgroup_description = kwargs.get(
+ "editgroup_description", "Generic Entity Cleaner Bot"
+ )
self.editgroup_extra = eg_extra
self.reset()
self.ac = ApiClient()
@@ -48,7 +50,7 @@ class EntityCleaner:
print("Running in dry-run mode!")
def reset(self):
- self.counts = Counter({'lines': 0, 'cleaned': 0, 'updated': 0})
+ self.counts = Counter({"lines": 0, "cleaned": 0, "updated": 0})
self._edit_count = 0
self._editgroup_id = None
self._entity_queue = []
@@ -63,23 +65,23 @@ class EntityCleaner:
Returns nothing.
"""
- self.counts['lines'] += 1
- if (not record):
- self.counts['skip-null'] += 1
+ self.counts["lines"] += 1
+ if not record:
+ self.counts["skip-null"] += 1
return
entity = entity_from_dict(record, self.entity_type, api_client=self.ac)
- if entity.state != 'active':
- self.counts['skip-inactive'] += 1
+ if entity.state != "active":
+ self.counts["skip-inactive"] += 1
return
cleaned = self.clean_entity(copy.deepcopy(entity))
if entity == cleaned:
- self.counts['skip-clean'] += 1
+ self.counts["skip-clean"] += 1
return
else:
- self.counts['cleaned'] += 1
+ self.counts["cleaned"] += 1
if self.dry_run_mode:
entity_dict = entity_to_dict(entity, api_client=self.ac)
@@ -87,11 +89,13 @@ class EntityCleaner:
return
if entity.ident in self._idents_inflight:
- raise ValueError("Entity already part of in-process update: {}".format(entity.ident))
+ raise ValueError(
+ "Entity already part of in-process update: {}".format(entity.ident)
+ )
updated = self.try_update(cleaned)
if updated:
- self.counts['updated'] += updated
+ self.counts["updated"] += updated
self._edit_count += updated
self._idents_inflight.append(entity.ident)
@@ -132,9 +136,8 @@ class EntityCleaner:
if not self._editgroup_id:
eg = self.api.create_editgroup(
- Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra))
+ Editgroup(description=self.editgroup_description, extra=self.editgroup_extra)
+ )
self._editgroup_id = eg.editgroup_id
return self._editgroup_id
diff --git a/python/fatcat_tools/cleanups/files.py b/python/fatcat_tools/cleanups/files.py
index 0d275ba6..d378a91f 100644
--- a/python/fatcat_tools/cleanups/files.py
+++ b/python/fatcat_tools/cleanups/files.py
@@ -1,4 +1,3 @@
-
from fatcat_openapi_client.models import FileEntity
from fatcat_openapi_client.rest import ApiException
@@ -12,14 +11,19 @@ class FileCleaner(EntityCleaner):
def __init__(self, api, **kwargs):
- eg_desc = kwargs.pop('editgroup_description', None) or "Automated cleanup of file entities (eg, remove bad URLs)"
- eg_extra = kwargs.pop('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.FileCleaner')
- super().__init__(api,
+ eg_desc = (
+ kwargs.pop("editgroup_description", None)
+ or "Automated cleanup of file entities (eg, remove bad URLs)"
+ )
+ eg_extra = kwargs.pop("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.FileCleaner")
+ super().__init__(
+ api,
entity_type=FileEntity,
editgroup_description=eg_desc,
editgroup_extra=eg_extra,
- **kwargs)
+ **kwargs
+ )
def clean_entity(self, entity):
"""
@@ -27,24 +31,24 @@ class FileCleaner(EntityCleaner):
"""
# URL has ://web.archive.org/web/None/ link => delete URL
- entity.urls = [u for u in entity.urls if '://web.archive.org/web/None/' not in u.url]
+ entity.urls = [u for u in entity.urls if "://web.archive.org/web/None/" not in u.url]
# URL has ://archive.org/ link with rel=repository => rel=archive
for u in entity.urls:
- if '://archive.org/' in u.url and u.rel == 'repository':
- u.rel = 'archive'
+ if "://archive.org/" in u.url and u.rel == "repository":
+ u.rel = "archive"
# URL has short wayback date ("2017") and another url with that as prefix => delete URL
stub_wayback_urls = []
full_wayback_urls = []
for u in entity.urls:
- if '://web.archive.org/web/' in u.url:
- if len(u.url.split('/')[4]) <= 8:
+ if "://web.archive.org/web/" in u.url:
+ if len(u.url.split("/")[4]) <= 8:
stub_wayback_urls.append(u.url)
else:
- full_wayback_urls.append('/'.join(u.url.split('/')[5:]))
+ full_wayback_urls.append("/".join(u.url.split("/")[5:]))
for stub in stub_wayback_urls:
- target = '/'.join(stub.split('/')[5:])
+ target = "/".join(stub.split("/")[5:])
if target in full_wayback_urls:
entity.urls = [u for u in entity.urls if u.url != stub]
@@ -57,14 +61,14 @@ class FileCleaner(EntityCleaner):
except ApiException as err:
if err.status != 404:
raise err
- self.counts['skip-not-found'] += 1
+ self.counts["skip-not-found"] += 1
return 0
- if existing.state != 'active':
- self.counts['skip-existing-inactive'] += 1
+ if existing.state != "active":
+ self.counts["skip-existing-inactive"] += 1
return 0
if existing.revision != entity.revision:
- self.counts['skip-revision'] += 1
+ self.counts["skip-revision"] += 1
return 0
self.api.update_file(self.get_editgroup_id(), entity.ident, entity)
diff --git a/python/fatcat_tools/fcid.py b/python/fatcat_tools/fcid.py
index 0987d10d..53891e5a 100644
--- a/python/fatcat_tools/fcid.py
+++ b/python/fatcat_tools/fcid.py
@@ -1,4 +1,3 @@
-
import base64
import uuid
@@ -7,18 +6,20 @@ def fcid2uuid(s):
"""
Converts a fatcat identifier (base32 encoded string) to a uuid.UUID object
"""
- s = s.split('_')[-1].upper().encode('utf-8')
+ s = s.split("_")[-1].upper().encode("utf-8")
assert len(s) == 26
raw = base64.b32decode(s + b"======")
return str(uuid.UUID(bytes=raw)).lower()
+
def uuid2fcid(s):
"""
Converts a uuid.UUID object to a fatcat identifier (base32 encoded string)
"""
raw = uuid.UUID(s).bytes
- return base64.b32encode(raw)[:26].lower().decode('utf-8')
+ return base64.b32encode(raw)[:26].lower().decode("utf-8")
+
def test_fcid():
- test_uuid = '00000000-0000-0000-3333-000000000001'
+ test_uuid = "00000000-0000-0000-3333-000000000001"
assert test_uuid == fcid2uuid(uuid2fcid(test_uuid))
diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py
index d441d495..dd48e256 100644
--- a/python/fatcat_tools/harvest/doi_registrars.py
+++ b/python/fatcat_tools/harvest/doi_registrars.py
@@ -1,4 +1,3 @@
-
import json
import sys
import time
@@ -59,29 +58,35 @@ class HarvestCrossrefWorker:
to be careful how state is serialized back into kafka.
"""
- def __init__(self, kafka_hosts, produce_topic, state_topic, contact_email,
- api_host_url="https://api.crossref.org/works", start_date=None,
- end_date=None):
+ def __init__(
+ self,
+ kafka_hosts,
+ produce_topic,
+ state_topic,
+ contact_email,
+ api_host_url="https://api.crossref.org/works",
+ start_date=None,
+ end_date=None,
+ ):
self.api_host_url = api_host_url
self.produce_topic = produce_topic
self.state_topic = state_topic
self.contact_email = contact_email
self.kafka_config = {
- 'bootstrap.servers': kafka_hosts,
- 'message.max.bytes': 20000000, # ~20 MBytes; broker is ~50 MBytes
+ "bootstrap.servers": kafka_hosts,
+ "message.max.bytes": 20000000, # ~20 MBytes; broker is ~50 MBytes
}
self.state = HarvestState(start_date, end_date)
self.state.initialize_from_kafka(self.state_topic, self.kafka_config)
- self.loop_sleep = 60*60 # how long to wait, in seconds, between date checks
+ self.loop_sleep = 60 * 60 # how long to wait, in seconds, between date checks
self.api_batch_size = 50
self.name = "Crossref"
self.producer = self._kafka_producer()
def _kafka_producer(self):
-
def fail_fast(err, msg):
if err is not None:
print("Kafka producer delivery error: {}".format(err), file=sys.stderr)
@@ -92,46 +97,53 @@ class HarvestCrossrefWorker:
self._kafka_fail_fast = fail_fast
producer_conf = self.kafka_config.copy()
- producer_conf.update({
- 'delivery.report.only.error': True,
- 'default.topic.config': {
- 'request.required.acks': -1, # all brokers must confirm
- },
- })
+ producer_conf.update(
+ {
+ "delivery.report.only.error": True,
+ "default.topic.config": {
+ "request.required.acks": -1, # all brokers must confirm
+ },
+ }
+ )
return Producer(producer_conf)
def params(self, date_str):
- filter_param = 'from-update-date:{},until-update-date:{}'.format(
- date_str, date_str)
+ filter_param = "from-update-date:{},until-update-date:{}".format(date_str, date_str)
return {
- 'filter': filter_param,
- 'rows': self.api_batch_size,
- 'cursor': '*',
+ "filter": filter_param,
+ "rows": self.api_batch_size,
+ "cursor": "*",
}
def update_params(self, params, resp):
- params['cursor'] = resp['message']['next-cursor']
+ params["cursor"] = resp["message"]["next-cursor"]
return params
def extract_key(self, obj):
- return obj['DOI'].encode('utf-8')
+ return obj["DOI"].encode("utf-8")
def fetch_date(self, date):
date_str = date.isoformat()
params = self.params(date_str)
http_session = requests_retry_session()
- http_session.headers.update({
- 'User-Agent': 'fatcat_tools/0.1.0 (https://fatcat.wiki; mailto:{}) python-requests'.format(
- self.contact_email),
- })
+ http_session.headers.update(
+ {
+ "User-Agent": "fatcat_tools/0.1.0 (https://fatcat.wiki; mailto:{}) python-requests".format(
+ self.contact_email
+ ),
+ }
+ )
count = 0
while True:
http_resp = http_session.get(self.api_host_url, params=params)
if http_resp.status_code == 503:
# crude backoff; now redundant with session exponential
# backoff, but allows for longer backoff/downtime on remote end
- print("got HTTP {}, pausing for 30 seconds".format(http_resp.status_code), file=sys.stderr)
+ print(
+ "got HTTP {}, pausing for 30 seconds".format(http_resp.status_code),
+ file=sys.stderr,
+ )
# keep kafka producer connection alive
self.producer.poll(0)
time.sleep(30.0)
@@ -143,19 +155,27 @@ class HarvestCrossrefWorker:
except json.JSONDecodeError as exc:
# Datacite API returned HTTP 200, but JSON seemed unparseable.
# It might be a glitch, so we retry.
- print("failed to decode body from {}: {}".format(http_resp.url, resp_body), file=sys.stderr)
+ print(
+ "failed to decode body from {}: {}".format(http_resp.url, resp_body),
+ file=sys.stderr,
+ )
raise exc
items = self.extract_items(resp)
count += len(items)
- print("... got {} ({} of {}), HTTP fetch took {}".format(len(items), count,
- self.extract_total(resp), http_resp.elapsed), file=sys.stderr)
- #print(json.dumps(resp))
+ print(
+ "... got {} ({} of {}), HTTP fetch took {}".format(
+ len(items), count, self.extract_total(resp), http_resp.elapsed
+ ),
+ file=sys.stderr,
+ )
+ # print(json.dumps(resp))
for work in items:
self.producer.produce(
self.produce_topic,
- json.dumps(work).encode('utf-8'),
+ json.dumps(work).encode("utf-8"),
key=self.extract_key(work),
- on_delivery=self._kafka_fail_fast)
+ on_delivery=self._kafka_fail_fast,
+ )
self.producer.poll(0)
if len(items) < self.api_batch_size:
break
@@ -163,10 +183,10 @@ class HarvestCrossrefWorker:
self.producer.flush()
def extract_items(self, resp):
- return resp['message']['items']
+ return resp["message"]["items"]
def extract_total(self, resp):
- return resp['message']['total-results']
+ return resp["message"]["total-results"]
def run(self, continuous=False):
@@ -175,9 +195,9 @@ class HarvestCrossrefWorker:
if current:
print("Fetching DOIs updated on {} (UTC)".format(current), file=sys.stderr)
self.fetch_date(current)
- self.state.complete(current,
- kafka_topic=self.state_topic,
- kafka_config=self.kafka_config)
+ self.state.complete(
+ current, kafka_topic=self.state_topic, kafka_config=self.kafka_config
+ )
continue
if continuous:
@@ -200,16 +220,25 @@ class HarvestDataciteWorker(HarvestCrossrefWorker):
could/should use this script for that, and dump to JSON?
"""
- def __init__(self, kafka_hosts, produce_topic, state_topic, contact_email,
- api_host_url="https://api.datacite.org/dois",
- start_date=None, end_date=None):
- super().__init__(kafka_hosts=kafka_hosts,
- produce_topic=produce_topic,
- state_topic=state_topic,
- api_host_url=api_host_url,
- contact_email=contact_email,
- start_date=start_date,
- end_date=end_date)
+ def __init__(
+ self,
+ kafka_hosts,
+ produce_topic,
+ state_topic,
+ contact_email,
+ api_host_url="https://api.datacite.org/dois",
+ start_date=None,
+ end_date=None,
+ ):
+ super().__init__(
+ kafka_hosts=kafka_hosts,
+ produce_topic=produce_topic,
+ state_topic=state_topic,
+ api_host_url=api_host_url,
+ contact_email=contact_email,
+ start_date=start_date,
+ end_date=end_date,
+ )
# for datecite, it's "from-update-date"
self.name = "Datacite"
@@ -219,19 +248,21 @@ class HarvestDataciteWorker(HarvestCrossrefWorker):
Dates have to be supplied in 2018-10-27T22:36:30.000Z format.
"""
return {
- 'query': 'updated:[{}T00:00:00.000Z TO {}T23:59:59.999Z]'.format(date_str, date_str),
- 'page[size]': self.api_batch_size,
- 'page[cursor]': 1,
+ "query": "updated:[{}T00:00:00.000Z TO {}T23:59:59.999Z]".format(
+ date_str, date_str
+ ),
+ "page[size]": self.api_batch_size,
+ "page[cursor]": 1,
}
def extract_items(self, resp):
- return resp['data']
+ return resp["data"]
def extract_total(self, resp):
- return resp['meta']['total']
+ return resp["meta"]["total"]
def extract_key(self, obj):
- return obj['attributes']['doi'].encode('utf-8')
+ return obj["attributes"]["doi"].encode("utf-8")
def update_params(self, params, resp):
"""
@@ -245,9 +276,9 @@ class HarvestDataciteWorker(HarvestCrossrefWorker):
https://github.com/datacite/datacite/issues/897 (HTTP 400)
https://github.com/datacite/datacite/issues/898 (HTTP 500)
"""
- parsed = urlparse(resp['links']['next'])
- page_cursor = parse_qs(parsed.query).get('page[cursor]')
+ parsed = urlparse(resp["links"]["next"])
+ page_cursor = parse_qs(parsed.query).get("page[cursor]")
if not page_cursor:
- raise ValueError('no page[cursor] in .links.next')
- params['page[cursor]'] = page_cursor[0]
+ raise ValueError("no page[cursor] in .links.next")
+ params["page[cursor]"] = page_cursor[0]
return params
diff --git a/python/fatcat_tools/harvest/harvest_common.py b/python/fatcat_tools/harvest/harvest_common.py
index 45c2b8ea..fda0dc62 100644
--- a/python/fatcat_tools/harvest/harvest_common.py
+++ b/python/fatcat_tools/harvest/harvest_common.py
@@ -1,4 +1,3 @@
-
import datetime
import json
import sys
@@ -14,8 +13,10 @@ from requests.packages.urllib3.util.retry import Retry # pylint: disable=import
# Used for parsing ISO date format (YYYY-MM-DD)
DATE_FMT = "%Y-%m-%d"
-def requests_retry_session(retries=10, backoff_factor=3,
- status_forcelist=(500, 502, 504), session=None):
+
+def requests_retry_session(
+ retries=10, backoff_factor=3, status_forcelist=(500, 502, 504), session=None
+):
"""
From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests
"""
@@ -28,10 +29,11 @@ def requests_retry_session(retries=10, backoff_factor=3,
status_forcelist=status_forcelist,
)
adapter = HTTPAdapter(max_retries=retry)
- session.mount('http://', adapter)
- session.mount('https://', adapter)
+ session.mount("http://", adapter)
+ session.mount("https://", adapter)
return session
+
class HarvestState:
"""
First version of this works with full days (dates)
@@ -57,8 +59,9 @@ class HarvestState:
self.enqueue_period(start_date, end_date, catchup_days)
def __str__(self):
- return '<HarvestState to_process={}, completed={}>'.format(
- len(self.to_process), len(self.completed))
+ return "<HarvestState to_process={}, completed={}>".format(
+ len(self.to_process), len(self.completed)
+ )
def enqueue_period(self, start_date=None, end_date=None, catchup_days=14):
"""
@@ -92,7 +95,9 @@ class HarvestState:
"""
if continuous:
# enqueue yesterday
- self.enqueue_period(start_date=datetime.datetime.utcnow().date() - datetime.timedelta(days=1))
+ self.enqueue_period(
+ start_date=datetime.datetime.utcnow().date() - datetime.timedelta(days=1)
+ )
if not self.to_process:
return None
return sorted(list(self.to_process))[0]
@@ -105,8 +110,8 @@ class HarvestState:
state stored on disk or in Kafka.
"""
state = json.loads(state_json)
- if 'completed-date' in state:
- date = datetime.datetime.strptime(state['completed-date'], DATE_FMT).date()
+ if "completed-date" in state:
+ date = datetime.datetime.strptime(state["completed-date"], DATE_FMT).date()
self.complete(date)
def complete(self, date, kafka_topic=None, kafka_config=None):
@@ -123,12 +128,14 @@ class HarvestState:
except KeyError:
pass
self.completed.add(date)
- state_json = json.dumps({
- 'in-progress-dates': [str(d) for d in self.to_process],
- 'completed-date': str(date),
- }).encode('utf-8')
+ state_json = json.dumps(
+ {
+ "in-progress-dates": [str(d) for d in self.to_process],
+ "completed-date": str(date),
+ }
+ ).encode("utf-8")
if kafka_topic:
- assert(kafka_config)
+ assert kafka_config
def fail_fast(err, msg):
if err:
@@ -136,17 +143,16 @@ class HarvestState:
print("Committing status to Kafka: {}".format(kafka_topic), file=sys.stderr)
producer_conf = kafka_config.copy()
- producer_conf.update({
- 'delivery.report.only.error': True,
- 'default.topic.config': {
- 'request.required.acks': -1, # all brokers must confirm
- },
- })
+ producer_conf.update(
+ {
+ "delivery.report.only.error": True,
+ "default.topic.config": {
+ "request.required.acks": -1, # all brokers must confirm
+ },
+ }
+ )
producer = Producer(producer_conf)
- producer.produce(
- kafka_topic,
- state_json,
- on_delivery=fail_fast)
+ producer.produce(kafka_topic, state_json, on_delivery=fail_fast)
producer.flush()
return state_json
@@ -166,22 +172,25 @@ class HarvestState:
raise KafkaException(err)
conf = kafka_config.copy()
- conf.update({
- 'group.id': 'dummy_init_group', # should never be committed
- 'enable.auto.commit': False,
- 'auto.offset.reset': 'earliest',
- 'session.timeout.ms': 10000,
- })
+ conf.update(
+ {
+ "group.id": "dummy_init_group", # should never be committed
+ "enable.auto.commit": False,
+ "auto.offset.reset": "earliest",
+ "session.timeout.ms": 10000,
+ }
+ )
consumer = Consumer(conf)
# this watermark fetch is mostly to ensure we are connected to broker and
# fail fast if not, but we also confirm that we read to end below.
hwm = consumer.get_watermark_offsets(
- TopicPartition(kafka_topic, 0),
- timeout=5.0,
- cached=False)
+ TopicPartition(kafka_topic, 0), timeout=5.0, cached=False
+ )
if not hwm:
- raise Exception("Kafka consumer timeout, or topic {} doesn't exist".format(kafka_topic))
+ raise Exception(
+ "Kafka consumer timeout, or topic {} doesn't exist".format(kafka_topic)
+ )
consumer.assign([TopicPartition(kafka_topic, 0, 0)])
c = 0
@@ -191,8 +200,8 @@ class HarvestState:
break
if msg.error():
raise KafkaException(msg.error())
- #sys.stdout.write('.')
- self.update(msg.value().decode('utf-8'))
+ # sys.stdout.write('.')
+ self.update(msg.value().decode("utf-8"))
c += 1
consumer.close()
diff --git a/python/fatcat_tools/harvest/oaipmh.py b/python/fatcat_tools/harvest/oaipmh.py
index 0eb0343d..40d1c853 100644
--- a/python/fatcat_tools/harvest/oaipmh.py
+++ b/python/fatcat_tools/harvest/oaipmh.py
@@ -1,4 +1,3 @@
-
import sys
import time
@@ -25,19 +24,18 @@ class HarvestOaiPmhWorker:
would want something similar operationally. Oh well!
"""
- def __init__(self, kafka_hosts, produce_topic, state_topic,
- start_date=None, end_date=None):
+ def __init__(self, kafka_hosts, produce_topic, state_topic, start_date=None, end_date=None):
self.produce_topic = produce_topic
self.state_topic = state_topic
self.kafka_config = {
- 'bootstrap.servers': kafka_hosts,
- 'message.max.bytes': 20000000, # ~20 MBytes; broker is ~50 MBytes
+ "bootstrap.servers": kafka_hosts,
+ "message.max.bytes": 20000000, # ~20 MBytes; broker is ~50 MBytes
}
- self.loop_sleep = 60*60 # how long to wait, in seconds, between date checks
+ self.loop_sleep = 60 * 60 # how long to wait, in seconds, between date checks
- self.endpoint_url = None # needs override
+ self.endpoint_url = None # needs override
self.metadata_prefix = None # needs override
self.name = "unnamed"
self.state = HarvestState(start_date, end_date)
@@ -45,7 +43,6 @@ class HarvestOaiPmhWorker:
print(self.state, file=sys.stderr)
def fetch_date(self, date):
-
def fail_fast(err, msg):
if err is not None:
print("Kafka producer delivery error: {}".format(err), file=sys.stderr)
@@ -54,12 +51,14 @@ class HarvestOaiPmhWorker:
raise KafkaException(err)
producer_conf = self.kafka_config.copy()
- producer_conf.update({
- 'delivery.report.only.error': True,
- 'default.topic.config': {
- 'request.required.acks': -1, # all brokers must confirm
- },
- })
+ producer_conf.update(
+ {
+ "delivery.report.only.error": True,
+ "default.topic.config": {
+ "request.required.acks": -1, # all brokers must confirm
+ },
+ }
+ )
producer = Producer(producer_conf)
api = sickle.Sickle(self.endpoint_url, max_retries=5, retry_status_codes=[503])
@@ -67,13 +66,18 @@ class HarvestOaiPmhWorker:
# this dict kwargs hack is to work around 'from' as a reserved python keyword
# recommended by sickle docs
try:
- records = api.ListRecords(**{
- 'metadataPrefix': self.metadata_prefix,
- 'from': date_str,
- 'until': date_str,
- })
+ records = api.ListRecords(
+ **{
+ "metadataPrefix": self.metadata_prefix,
+ "from": date_str,
+ "until": date_str,
+ }
+ )
except sickle.oaiexceptions.NoRecordsMatch:
- print("WARN: no OAI-PMH records for this date: {} (UTC)".format(date_str), file=sys.stderr)
+ print(
+ "WARN: no OAI-PMH records for this date: {} (UTC)".format(date_str),
+ file=sys.stderr,
+ )
return
count = 0
@@ -83,9 +87,10 @@ class HarvestOaiPmhWorker:
print("... up to {}".format(count), file=sys.stderr)
producer.produce(
self.produce_topic,
- item.raw.encode('utf-8'),
- key=item.header.identifier.encode('utf-8'),
- on_delivery=fail_fast)
+ item.raw.encode("utf-8"),
+ key=item.header.identifier.encode("utf-8"),
+ on_delivery=fail_fast,
+ )
producer.flush()
def run(self, continuous=False):
@@ -95,9 +100,9 @@ class HarvestOaiPmhWorker:
if current:
print("Fetching DOIs updated on {} (UTC)".format(current), file=sys.stderr)
self.fetch_date(current)
- self.state.complete(current,
- kafka_topic=self.state_topic,
- kafka_config=self.kafka_config)
+ self.state.complete(
+ current, kafka_topic=self.state_topic, kafka_config=self.kafka_config
+ )
continue
if continuous:
diff --git a/python/fatcat_tools/harvest/pubmed.py b/python/fatcat_tools/harvest/pubmed.py
index ee55f4eb..0f33f334 100644
--- a/python/fatcat_tools/harvest/pubmed.py
+++ b/python/fatcat_tools/harvest/pubmed.py
@@ -60,14 +60,15 @@ class PubmedFTPWorker:
<tr>
"""
+
def __init__(self, kafka_hosts, produce_topic, state_topic, start_date=None, end_date=None):
- self.name = 'Pubmed'
- self.host = 'ftp.ncbi.nlm.nih.gov'
+ self.name = "Pubmed"
+ self.host = "ftp.ncbi.nlm.nih.gov"
self.produce_topic = produce_topic
self.state_topic = state_topic
self.kafka_config = {
- 'bootstrap.servers': kafka_hosts,
- 'message.max.bytes': 20000000, # ~20 MBytes; broker is ~50 MBytes
+ "bootstrap.servers": kafka_hosts,
+ "message.max.bytes": 20000000, # ~20 MBytes; broker is ~50 MBytes
}
self.loop_sleep = 60 * 60 # how long to wait, in seconds, between date checks
self.state = HarvestState(start_date, end_date)
@@ -86,12 +87,14 @@ class PubmedFTPWorker:
self._kafka_fail_fast = fail_fast
producer_conf = self.kafka_config.copy()
- producer_conf.update({
- 'delivery.report.only.error': True,
- 'default.topic.config': {
- 'request.required.acks': -1, # all brokers must confirm
- },
- })
+ producer_conf.update(
+ {
+ "delivery.report.only.error": True,
+ "default.topic.config": {
+ "request.required.acks": -1, # all brokers must confirm
+ },
+ }
+ )
return Producer(producer_conf)
def fetch_date(self, date):
@@ -105,24 +108,35 @@ class PubmedFTPWorker:
if self.date_file_map is None:
raise ValueError("cannot fetch date without date file mapping")
- date_str = date.strftime('%Y-%m-%d')
+ date_str = date.strftime("%Y-%m-%d")
paths = self.date_file_map.get(date_str)
if paths is None:
- print("WARN: no pubmed update for this date: {} (UTC), available dates were: {}".format(date_str, self.date_file_map), file=sys.stderr)
+ print(
+ "WARN: no pubmed update for this date: {} (UTC), available dates were: {}".format(
+ date_str, self.date_file_map
+ ),
+ file=sys.stderr,
+ )
return False
count = 0
for path in paths:
# Fetch and decompress file.
url = "ftp://{}{}".format(self.host, path)
- filename = ftpretr(url, proxy_hostport="159.69.240.245:15201") # TODO: proxy obsolete, when networking issue is resolved
- with tempfile.NamedTemporaryFile(prefix='fatcat-ftp-tmp-', delete=False) as decomp:
+ filename = ftpretr(
+ url, proxy_hostport="159.69.240.245:15201"
+ ) # TODO: proxy obsolete, when networking issue is resolved
+ with tempfile.NamedTemporaryFile(prefix="fatcat-ftp-tmp-", delete=False) as decomp:
try:
gzf = gzip.open(filename)
shutil.copyfileobj(gzf, decomp)
except zlib.error as exc:
- print('[skip] retrieving {} failed with {} (maybe empty, missing or broken gzip)'.format(
- url, exc), file=sys.stderr)
+ print(
+ "[skip] retrieving {} failed with {} (maybe empty, missing or broken gzip)".format(
+ url, exc
+ ),
+ file=sys.stderr,
+ )
continue
# Here, blob is the unparsed XML; we peek into it to use PMID as
@@ -131,15 +145,17 @@ class PubmedFTPWorker:
# WARNING: Parsing foreign XML exposes us at some
# https://docs.python.org/3/library/xml.html#xml-vulnerabilities
# here.
- for blob in xmlstream(decomp.name, 'PubmedArticle', encoding='utf-8'):
- soup = BeautifulSoup(blob, 'xml')
- pmid = soup.find('PMID')
+ for blob in xmlstream(decomp.name, "PubmedArticle", encoding="utf-8"):
+ soup = BeautifulSoup(blob, "xml")
+ pmid = soup.find("PMID")
if pmid is None:
raise ValueError("no PMID found, please adjust identifier extraction")
count += 1
if count % 50 == 0:
print("... up to {}".format(count), file=sys.stderr)
- self.producer.produce(self.produce_topic, blob, key=pmid.text, on_delivery=self._kafka_fail_fast)
+ self.producer.produce(
+ self.produce_topic, blob, key=pmid.text, on_delivery=self._kafka_fail_fast
+ )
self.producer.flush()
os.remove(filename)
@@ -151,13 +167,17 @@ class PubmedFTPWorker:
while True:
self.date_file_map = generate_date_file_map(host=self.host)
if len(self.date_file_map) == 0:
- raise ValueError("map from dates to files should not be empty, maybe the HTML changed?")
+ raise ValueError(
+ "map from dates to files should not be empty, maybe the HTML changed?"
+ )
current = self.state.next_span(continuous)
if current:
print("Fetching citations updated on {} (UTC)".format(current), file=sys.stderr)
self.fetch_date(current)
- self.state.complete(current, kafka_topic=self.state_topic, kafka_config=self.kafka_config)
+ self.state.complete(
+ current, kafka_topic=self.state_topic, kafka_config=self.kafka_config
+ )
continue
if continuous:
@@ -168,7 +188,7 @@ class PubmedFTPWorker:
print("{} FTP ingest caught up".format(self.name))
-def generate_date_file_map(host='ftp.ncbi.nlm.nih.gov'):
+def generate_date_file_map(host="ftp.ncbi.nlm.nih.gov"):
"""
Generate a DefaultDict[string, set] mapping dates to absolute filepaths on
the server (mostly we have one file, but sometimes more).
@@ -176,14 +196,14 @@ def generate_date_file_map(host='ftp.ncbi.nlm.nih.gov'):
Example: {"2020-01-02": set(["/pubmed/updatefiles/pubmed20n1016.xml.gz"]), ...}
"""
mapping = collections.defaultdict(set)
- pattern = re.compile(r'Filename: ([^ ]*.xml) -- Created: ([^<]*)')
+ pattern = re.compile(r"Filename: ([^ ]*.xml) -- Created: ([^<]*)")
ftp = ftplib.FTP(host)
ftp.login()
- filenames = ftp.nlst('/pubmed/updatefiles')
+ filenames = ftp.nlst("/pubmed/updatefiles")
retries, retry_delay = 10, 60
for name in filenames:
- if not name.endswith('.html'):
+ if not name.endswith(".html"):
continue
sio = io.StringIO()
for i in range(retries):
@@ -201,10 +221,14 @@ def generate_date_file_map(host='ftp.ncbi.nlm.nih.gov'):
ftp = ftplib.FTP(host)
ftp.login()
sio.truncate(0)
- ftp.retrlines('RETR {}'.format(name), sio.write)
+ ftp.retrlines("RETR {}".format(name), sio.write)
except (EOFError, ftplib.error_temp, socket.gaierror, BrokenPipeError) as exc:
- print("ftp retr on {} failed with {} ({}) ({} retries left)".format(
- name, exc, type(exc), retries - (i + 1)), file=sys.stderr)
+ print(
+ "ftp retr on {} failed with {} ({}) ({} retries left)".format(
+ name, exc, type(exc), retries - (i + 1)
+ ),
+ file=sys.stderr,
+ )
if i + 1 == retries:
raise
else:
@@ -214,16 +238,24 @@ def generate_date_file_map(host='ftp.ncbi.nlm.nih.gov'):
contents = sio.getvalue()
match = pattern.search(contents)
if match is None:
- print('pattern miss in {} on: {}, may need to adjust pattern: {}'.format(name, contents, pattern), file=sys.stderr)
+ print(
+ "pattern miss in {} on: {}, may need to adjust pattern: {}".format(
+ name, contents, pattern
+ ),
+ file=sys.stderr,
+ )
continue
- filename, filedate = match.groups() # ('pubmed20n1017.xml', 'Tue Dec 17 15:23:32 EST 2019')
+ (
+ filename,
+ filedate,
+ ) = match.groups() # ('pubmed20n1017.xml', 'Tue Dec 17 15:23:32 EST 2019')
date = dateparser.parse(filedate)
- fullpath = '/pubmed/updatefiles/{}.gz'.format(filename)
- date_str = date.strftime('%Y-%m-%d')
+ fullpath = "/pubmed/updatefiles/{}.gz".format(filename)
+ date_str = date.strftime("%Y-%m-%d")
mapping[date_str].add(fullpath)
- print('added entry for {}: {}'.format(date_str, fullpath), file=sys.stderr)
+ print("added entry for {}: {}".format(date_str, fullpath), file=sys.stderr)
- print('generated date-file mapping for {} dates'.format(len(mapping)), file=sys.stderr)
+ print("generated date-file mapping for {} dates".format(len(mapping)), file=sys.stderr)
return mapping
@@ -241,20 +273,29 @@ def ftpretr(url, max_retries=10, retry_delay=1, proxy_hostport=None):
when we encountered EOFError while talking to the FTP server. Retry delay in seconds.
"""
if proxy_hostport is not None:
- return ftpretr_via_http_proxy(url, proxy_hostport, max_retries=max_retries, retry_delay=retry_delay)
+ return ftpretr_via_http_proxy(
+ url, proxy_hostport, max_retries=max_retries, retry_delay=retry_delay
+ )
parsed = urlparse(url)
server, path = parsed.netloc, parsed.path
for i in range(max_retries):
try:
ftp = ftplib.FTP(server)
ftp.login()
- with tempfile.NamedTemporaryFile(prefix='fatcat-ftp-tmp-', delete=False) as f:
- print('retrieving {} from {} to {} ...'.format(path, server, f.name), file=sys.stderr)
- ftp.retrbinary('RETR %s' % path, f.write)
+ with tempfile.NamedTemporaryFile(prefix="fatcat-ftp-tmp-", delete=False) as f:
+ print(
+ "retrieving {} from {} to {} ...".format(path, server, f.name),
+ file=sys.stderr,
+ )
+ ftp.retrbinary("RETR %s" % path, f.write)
ftp.close()
except EOFError as exc:
- print("ftp retrbinary on {} failed with {} ({}) ({} retries left)".format(
- path, exc, type(exc), max_retries - (i + 1)), file=sys.stderr)
+ print(
+ "ftp retrbinary on {} failed with {} ({}) ({} retries left)".format(
+ path, exc, type(exc), max_retries - (i + 1)
+ ),
+ file=sys.stderr,
+ )
if i + 1 == max_retries:
raise
else:
@@ -263,7 +304,9 @@ def ftpretr(url, max_retries=10, retry_delay=1, proxy_hostport=None):
return f.name
-def ftpretr_via_http_proxy(url, proxy_hostport="ftp.ncbi.nlm.nih.gov", max_retries=10, retry_delay=1):
+def ftpretr_via_http_proxy(
+ url, proxy_hostport="ftp.ncbi.nlm.nih.gov", max_retries=10, retry_delay=1
+):
"""
Fetch file from FTP via external HTTP proxy, e.g. ftp.host.com:/a/b/c would
be retrievable via proxy.com/a/b/c; (in 09/2021 we used
@@ -276,19 +319,23 @@ def ftpretr_via_http_proxy(url, proxy_hostport="ftp.ncbi.nlm.nih.gov", max_retri
try:
url = "http://{}{}".format(proxy_hostport, path)
print("retrieving file via proxy (ftpup) from {}".format(url), file=sys.stderr)
- with tempfile.NamedTemporaryFile(prefix='fatcat-ftp-tmp-', delete=False) as f:
+ with tempfile.NamedTemporaryFile(prefix="fatcat-ftp-tmp-", delete=False) as f:
cmd = ["wget", "-c", url, "-O", f.name]
result = subprocess.run(cmd)
return f.name
except (subprocess.CalledProcessError, OSError, ValueError) as exc:
- print("ftp fetch {} failed with {} ({}) ({} retries left)".format(
- url, exc, type(exc), max_retries - (i + 1)), file=sys.stderr)
+ print(
+ "ftp fetch {} failed with {} ({}) ({} retries left)".format(
+ url, exc, type(exc), max_retries - (i + 1)
+ ),
+ file=sys.stderr,
+ )
if i + 1 == max_retries:
raise
time.sleep(retry_delay)
-def xmlstream(filename, tag, encoding='utf-8'):
+def xmlstream(filename, tag, encoding="utf-8"):
"""
Note: This might move into a generic place in the future.
@@ -300,23 +347,29 @@ def xmlstream(filename, tag, encoding='utf-8'):
Known vulnerabilities: https://docs.python.org/3/library/xml.html#xml-vulnerabilities
"""
+
def strip_ns(tag):
- if '}' not in tag:
+ if "}" not in tag:
return tag
- return tag.split('}')[1]
+ return tag.split("}")[1]
# https://stackoverflow.com/a/13261805, http://effbot.org/elementtree/iterparse.htm
- context = iter(ET.iterparse(filename, events=(
- 'start',
- 'end',
- )))
+ context = iter(
+ ET.iterparse(
+ filename,
+ events=(
+ "start",
+ "end",
+ ),
+ )
+ )
try:
_, root = next(context)
except StopIteration:
return
for event, elem in context:
- if not strip_ns(elem.tag) == tag or event == 'start':
+ if not strip_ns(elem.tag) == tag or event == "start":
continue
yield ET.tostring(elem, encoding=encoding)
diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py
index 2b0ff7ec..ae4f9049 100644
--- a/python/fatcat_tools/importers/arabesque.py
+++ b/python/fatcat_tools/importers/arabesque.py
@@ -1,9 +1,9 @@
-
import fatcat_openapi_client
from .common import SANE_MAX_RELEASES, SANE_MAX_URLS, EntityImporter, b32_hex, make_rel_url
-ARABESQUE_MATCH_WHERE_CLAUSE='WHERE hit = 1 AND identifier IS NOT NULL'
+ARABESQUE_MATCH_WHERE_CLAUSE = "WHERE hit = 1 AND identifier IS NOT NULL"
+
class ArabesqueMatchImporter(EntityImporter):
"""
@@ -38,17 +38,17 @@ class ArabesqueMatchImporter(EntityImporter):
def __init__(self, api, extid_type, require_grobid=True, **kwargs):
- eg_desc = kwargs.get('editgroup_description', None) or "Match web crawl files to releases based on identifier/URL seedlist"
- eg_extra = kwargs.get('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ArabesqueMatchImporter')
- if kwargs.get('crawl_id'):
- eg_extra['crawl_id'] = kwargs.get('crawl_id')
- kwargs['do_updates'] = kwargs.get("do_updates", False)
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
- assert extid_type in ('doi', 'pmcid', 'pmid')
+ eg_desc = (
+ kwargs.get("editgroup_description", None)
+ or "Match web crawl files to releases based on identifier/URL seedlist"
+ )
+ eg_extra = kwargs.get("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ArabesqueMatchImporter")
+ if kwargs.get("crawl_id"):
+ eg_extra["crawl_id"] = kwargs.get("crawl_id")
+ kwargs["do_updates"] = kwargs.get("do_updates", False)
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
+ assert extid_type in ("doi", "pmcid", "pmid")
self.extid_type = extid_type
self.default_link_rel = kwargs.get("default_link_rel", "web")
assert self.default_link_rel
@@ -60,33 +60,35 @@ class ArabesqueMatchImporter(EntityImporter):
print("NOT checking GROBID status column")
def want(self, row):
- if self.require_grobid and not row['postproc_status'] == "200":
+ if self.require_grobid and not row["postproc_status"] == "200":
return False
- if (bool(row['hit']) is True
- and row['final_sha1']
- and row['final_timestamp']
- and row['final_timestamp'] != "-"
- and len(row['final_timestamp']) == 14
- and row['final_mimetype']
- and bool(row['hit']) is True
- and row['identifier']):
+ if (
+ bool(row["hit"]) is True
+ and row["final_sha1"]
+ and row["final_timestamp"]
+ and row["final_timestamp"] != "-"
+ and len(row["final_timestamp"]) == 14
+ and row["final_mimetype"]
+ and bool(row["hit"]) is True
+ and row["identifier"]
+ ):
return True
else:
return False
def parse_record(self, row):
- extid = row['identifier'].strip()
+ extid = row["identifier"].strip()
# check/cleanup DOI
- if self.extid_type == 'doi':
+ if self.extid_type == "doi":
extid = extid.lower()
- extid.replace('http://doi.org/', '')
- extid.replace('https://doi.org/', '')
- if extid.startswith('doi:'):
+ extid.replace("http://doi.org/", "")
+ extid.replace("https://doi.org/", "")
+ if extid.startswith("doi:"):
extid = extid[4:]
- if not extid.startswith('10.'):
- self.counts['skip-extid-invalid']
+ if not extid.startswith("10."):
+ self.counts["skip-extid-invalid"]
return None
# lookup extid
@@ -95,35 +97,35 @@ class ArabesqueMatchImporter(EntityImporter):
except fatcat_openapi_client.rest.ApiException as err:
if err.status == 404:
# bail on 404 (release not in DB)
- self.counts['skip-extid-not-found'] += 1
+ self.counts["skip-extid-not-found"] += 1
return None
elif err.status == 400:
- self.counts['skip-extid-invalid'] += 1
+ self.counts["skip-extid-invalid"] += 1
return None
else:
raise err
- url = make_rel_url(row['final_url'], self.default_link_rel)
+ url = make_rel_url(row["final_url"], self.default_link_rel)
if not url:
- self.counts['skip-url'] += 1
+ self.counts["skip-url"] += 1
return None
- if not row['final_timestamp']:
- self.counts['skip-missing-timestamp'] += 1
+ if not row["final_timestamp"]:
+ self.counts["skip-missing-timestamp"] += 1
return None
wayback = "https://web.archive.org/web/{}/{}".format(
- row['final_timestamp'],
- row['final_url'])
+ row["final_timestamp"], row["final_url"]
+ )
urls = [url, ("webarchive", wayback)]
urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls]
if len(urls) > SANE_MAX_URLS:
- self.counts['skip-too-many-url'] += 1
+ self.counts["skip-too-many-url"] += 1
return None
fe = fatcat_openapi_client.FileEntity(
- sha1=b32_hex(row['final_sha1']),
- mimetype=row['final_mimetype'] or self.default_mimetype,
+ sha1=b32_hex(row["final_sha1"]),
+ mimetype=row["final_mimetype"] or self.default_mimetype,
release_ids=[re.ident],
urls=urls,
)
@@ -143,15 +145,15 @@ class ArabesqueMatchImporter(EntityImporter):
if (fe.release_ids[0] in existing.release_ids) and existing.urls:
# TODO: could still, in theory update with the new URL?
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
if not self.do_updates:
- self.counts['skip-update-disabled'] += 1
+ self.counts["skip-update-disabled"] += 1
return False
if existing.ident in [e.ident for e in self._edits_inflight]:
- self.counts['skip-update-inflight'] += 1
+ self.counts["skip-update-inflight"] += 1
return False
# TODO: this code path never gets hit because of the check above
@@ -159,28 +161,33 @@ class ArabesqueMatchImporter(EntityImporter):
existing_urls = set([u.url for u in existing.urls])
new_urls = set([u.url for u in fe.urls])
if existing_urls.issuperset(new_urls):
- self.counts['skip-update-nothing-new'] += 1
+ self.counts["skip-update-nothing-new"] += 1
return False
# merge the existing into this one and update
existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls]))
- existing.urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in existing.urls]
+ existing.urls = [
+ fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in existing.urls
+ ]
if len(existing.urls) > SANE_MAX_URLS:
- self.counts['skip-update-too-many-url'] += 1
+ self.counts["skip-update-too-many-url"] += 1
return None
existing.release_ids = list(set(fe.release_ids + existing.release_ids))
if len(existing.release_ids) > SANE_MAX_RELEASES:
- self.counts['skip-update-too-many-url'] += 1
+ self.counts["skip-update-too-many-url"] += 1
return None
existing.mimetype = existing.mimetype or fe.mimetype
edit = self.api.update_file(self.get_editgroup_id(), existing.ident, existing)
self._edits_inflight.append(edit)
- self.counts['update'] += 1
+ self.counts["update"] += 1
return False
def insert_batch(self, batch):
- self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_file_auto_batch(
+ fatcat_openapi_client.FileAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py
index fc429fb0..7a689ed2 100644
--- a/python/fatcat_tools/importers/arxiv.py
+++ b/python/fatcat_tools/importers/arxiv.py
@@ -1,4 +1,3 @@
-
import datetime
import json
import re
@@ -13,6 +12,7 @@ from .crossref import lookup_license_slug
latex2text = LatexNodes2Text()
+
def latex_to_text(raw):
try:
return latex2text.latex_to_text(raw).strip()
@@ -21,13 +21,14 @@ def latex_to_text(raw):
except IndexError:
return raw.strip()
+
def parse_arxiv_authors(raw):
if not raw:
return []
- raw = raw.replace('*', '')
- if '(' in raw:
- raw = re.sub(r'\(.*\)', '', raw)
- authors = raw.split(', ')
+ raw = raw.replace("*", "")
+ if "(" in raw:
+ raw = re.sub(r"\(.*\)", "", raw)
+ authors = raw.split(", ")
if authors:
last = authors[-1].split(" and ")
if len(last) == 2:
@@ -39,9 +40,12 @@ def parse_arxiv_authors(raw):
authors = [a for a in authors if a]
return authors
+
def test_parse_arxiv_authors():
- assert parse_arxiv_authors("Raphael Chetrite, Shamik Gupta, Izaak Neri and \\'Edgar Rold\\'an") == [
+ assert parse_arxiv_authors(
+ "Raphael Chetrite, Shamik Gupta, Izaak Neri and \\'Edgar Rold\\'an"
+ ) == [
"Raphael Chetrite",
"Shamik Gupta",
"Izaak Neri",
@@ -63,7 +67,9 @@ def test_parse_arxiv_authors():
"Raphael Chetrite Shamik Gupta",
]
- assert parse_arxiv_authors("B. P. Lanyon, T. J. Weinhold, N. K. Langford, M. Barbieri, D. F. V. James*, A. Gilchrist, and A. G. White (University of Queensland, *University of Toronto)") == [
+ assert parse_arxiv_authors(
+ "B. P. Lanyon, T. J. Weinhold, N. K. Langford, M. Barbieri, D. F. V. James*, A. Gilchrist, and A. G. White (University of Queensland, *University of Toronto)"
+ ) == [
"B. P. Lanyon",
"T. J. Weinhold",
"N. K. Langford",
@@ -84,17 +90,21 @@ class ArxivRawImporter(EntityImporter):
def __init__(self, api, **kwargs):
- eg_desc = kwargs.get('editgroup_description',
- "Automated import of arxiv metadata via arXivRaw OAI-PMH feed")
- eg_extra = kwargs.get('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ArxivRawImporter')
+ eg_desc = kwargs.get(
+ "editgroup_description",
+ "Automated import of arxiv metadata via arXivRaw OAI-PMH feed",
+ )
+ eg_extra = kwargs.get("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ArxivRawImporter")
# lower batch size, because multiple versions per entry (guessing 2-3 on average?)
- batch_size = kwargs.get('edit_batch_size', 50)
- super().__init__(api,
+ batch_size = kwargs.get("edit_batch_size", 50)
+ super().__init__(
+ api,
editgroup_description=eg_desc,
editgroup_extra=eg_extra,
batch_size=batch_size,
- **kwargs)
+ **kwargs
+ )
self._test_override = False
def parse_record(self, record):
@@ -114,53 +124,56 @@ class ArxivRawImporter(EntityImporter):
doi = None
if metadata.doi and metadata.doi.string:
doi = metadata.doi.string.lower().split()[0].strip()
- if not (doi.startswith('10.') and '/' in doi and doi.split('/')[1]):
+ if not (doi.startswith("10.") and "/" in doi and doi.split("/")[1]):
sys.stderr.write("BOGUS DOI: {}\n".format(doi))
doi = None
- title = latex_to_text(metadata.title.get_text().replace('\n', ' '))
- authors = parse_arxiv_authors(metadata.authors.get_text().replace('\n', ' '))
- contribs = [fatcat_openapi_client.ReleaseContrib(index=i, raw_name=a, role='author') for i, a in enumerate(authors)]
-
- lang = "en" # the vast majority in english
+ title = latex_to_text(metadata.title.get_text().replace("\n", " "))
+ authors = parse_arxiv_authors(metadata.authors.get_text().replace("\n", " "))
+ contribs = [
+ fatcat_openapi_client.ReleaseContrib(index=i, raw_name=a, role="author")
+ for i, a in enumerate(authors)
+ ]
+
+ lang = "en" # the vast majority in english
if metadata.comments and metadata.comments.get_text():
- comments = metadata.comments.get_text().replace('\n', ' ').strip()
- extra_arxiv['comments'] = comments
- if 'in french' in comments.lower():
- lang = 'fr'
- elif 'in spanish' in comments.lower():
- lang = 'es'
- elif 'in portuguese' in comments.lower():
- lang = 'pt'
- elif 'in hindi' in comments.lower():
- lang = 'hi'
- elif 'in japanese' in comments.lower():
- lang = 'ja'
- elif 'in german' in comments.lower():
- lang = 'de'
- elif 'simplified chinese' in comments.lower():
- lang = 'zh'
- elif 'in russian' in comments.lower():
- lang = 'ru'
+ comments = metadata.comments.get_text().replace("\n", " ").strip()
+ extra_arxiv["comments"] = comments
+ if "in french" in comments.lower():
+ lang = "fr"
+ elif "in spanish" in comments.lower():
+ lang = "es"
+ elif "in portuguese" in comments.lower():
+ lang = "pt"
+ elif "in hindi" in comments.lower():
+ lang = "hi"
+ elif "in japanese" in comments.lower():
+ lang = "ja"
+ elif "in german" in comments.lower():
+ lang = "de"
+ elif "simplified chinese" in comments.lower():
+ lang = "zh"
+ elif "in russian" in comments.lower():
+ lang = "ru"
# more languages?
number = None
- if metadata.find('journal-ref') and metadata.find('journal-ref').get_text():
- journal_ref = metadata.find('journal-ref').get_text().replace('\n', ' ').strip()
- extra_arxiv['journal_ref'] = journal_ref
+ if metadata.find("journal-ref") and metadata.find("journal-ref").get_text():
+ journal_ref = metadata.find("journal-ref").get_text().replace("\n", " ").strip()
+ extra_arxiv["journal_ref"] = journal_ref
if "conf." in journal_ref.lower() or "proc." in journal_ref.lower():
release_type = "paper-conference"
- if metadata.find('report-no') and metadata.find('report-no').string:
- number = metadata.find('report-no').string.strip()
+ if metadata.find("report-no") and metadata.find("report-no").string:
+ number = metadata.find("report-no").string.strip()
# at least some people plop extra metadata in here. hrmf!
- if 'ISSN ' in number or 'ISBN ' in number or len(number.split()) > 2:
- extra_arxiv['report-no'] = number
+ if "ISSN " in number or "ISBN " in number or len(number.split()) > 2:
+ extra_arxiv["report-no"] = number
number = None
else:
release_type = "report"
- if metadata.find('acm-class') and metadata.find('acm-class').string:
- extra_arxiv['acm_class'] = metadata.find('acm-class').string.strip()
+ if metadata.find("acm-class") and metadata.find("acm-class").string:
+ extra_arxiv["acm_class"] = metadata.find("acm-class").string.strip()
if metadata.categories and metadata.categories.get_text():
- extra_arxiv['categories'] = metadata.categories.get_text().split()
+ extra_arxiv["categories"] = metadata.categories.get_text().split()
license_slug = None
if metadata.license and metadata.license.get_text():
license_slug = lookup_license_slug(metadata.license.get_text())
@@ -170,21 +183,29 @@ class ArxivRawImporter(EntityImporter):
abstracts = []
abst = metadata.abstract.get_text().strip()
orig = None
- if '-----' in abst:
- both = abst.split('-----')
+ if "-----" in abst:
+ both = abst.split("-----")
abst = both[0].strip()
orig = both[1].strip()
- if '$' in abst or '{' in abst:
+ if "$" in abst or "{" in abst:
mime = "application/x-latex"
abst_plain = latex_to_text(abst)
- abstracts.append(fatcat_openapi_client.ReleaseAbstract(content=abst_plain, mimetype="text/plain", lang="en"))
+ abstracts.append(
+ fatcat_openapi_client.ReleaseAbstract(
+ content=abst_plain, mimetype="text/plain", lang="en"
+ )
+ )
else:
mime = "text/plain"
- abstracts.append(fatcat_openapi_client.ReleaseAbstract(content=abst, mimetype=mime, lang="en"))
+ abstracts.append(
+ fatcat_openapi_client.ReleaseAbstract(content=abst, mimetype=mime, lang="en")
+ )
if orig:
- abstracts.append(fatcat_openapi_client.ReleaseAbstract(content=orig, mimetype=mime))
+ abstracts.append(
+ fatcat_openapi_client.ReleaseAbstract(content=orig, mimetype=mime)
+ )
# indicates that fulltext probably isn't english either
- if lang == 'en':
+ if lang == "en":
lang = None
# extra:
@@ -195,39 +216,43 @@ class ArxivRawImporter(EntityImporter):
# container_name
# group-title
# arxiv: comments, categories, etc
- extra_arxiv['base_id'] = base_id
- extra['superceded'] = True
- extra['arxiv'] = extra_arxiv
+ extra_arxiv["base_id"] = base_id
+ extra["superceded"] = True
+ extra["arxiv"] = extra_arxiv
versions = []
- for version in metadata.find_all('version'):
- arxiv_id = base_id + version['version']
+ for version in metadata.find_all("version"):
+ arxiv_id = base_id + version["version"]
release_date = version.date.string.strip()
- release_date = datetime.datetime.strptime(release_date, "%a, %d %b %Y %H:%M:%S %Z").date()
+ release_date = datetime.datetime.strptime(
+ release_date, "%a, %d %b %Y %H:%M:%S %Z"
+ ).date()
# TODO: source_type?
- versions.append(fatcat_openapi_client.ReleaseEntity(
- work_id=None,
- title=title,
- #original_title
- version=version['version'],
- release_type=release_type,
- release_stage='submitted',
- release_date=release_date.isoformat(),
- release_year=release_date.year,
- ext_ids=fatcat_openapi_client.ReleaseExtIds(
- arxiv=arxiv_id,
- ),
- number=number,
- language=lang,
- license_slug=license_slug,
- abstracts=abstracts,
- contribs=contribs,
- extra=extra.copy(),
- ))
+ versions.append(
+ fatcat_openapi_client.ReleaseEntity(
+ work_id=None,
+ title=title,
+ # original_title
+ version=version["version"],
+ release_type=release_type,
+ release_stage="submitted",
+ release_date=release_date.isoformat(),
+ release_year=release_date.year,
+ ext_ids=fatcat_openapi_client.ReleaseExtIds(
+ arxiv=arxiv_id,
+ ),
+ number=number,
+ language=lang,
+ license_slug=license_slug,
+ abstracts=abstracts,
+ contribs=contribs,
+ extra=extra.copy(),
+ )
+ )
# TODO: assert that versions are actually in order?
assert versions
- versions[-1].extra.pop('superceded')
+ versions[-1].extra.pop("superceded")
# only apply DOI to most recent version (HACK)
if doi:
@@ -306,7 +331,7 @@ class ArxivRawImporter(EntityImporter):
for v in versions:
if v._existing_work_id:
if not v._updated:
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
continue
if not any_work_id and last_edit:
# fetch the last inserted release from this group
@@ -315,7 +340,7 @@ class ArxivRawImporter(EntityImporter):
any_work_id = r.work_id
v.work_id = any_work_id
last_edit = self.api.create_release(self.get_editgroup_id(), v)
- self.counts['insert'] += 1
+ self.counts["insert"] += 1
return False
@@ -323,12 +348,15 @@ class ArxivRawImporter(EntityImporter):
# there is no batch/bezerk mode for arxiv importer, except for testing
if self._test_override:
for batch in batch_batch:
- self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
- self.counts['insert'] += len(batch) - 1
+ self.api.create_release_auto_batch(
+ fatcat_openapi_client.ReleaseAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
+ self.counts["insert"] += len(batch) - 1
else:
raise NotImplementedError()
@@ -341,9 +369,9 @@ class ArxivRawImporter(EntityImporter):
for article in soup.find_all("record"):
resp = self.parse_record(article)
print(json.dumps(resp))
- #sys.exit(-1)
+ # sys.exit(-1)
-if __name__ == '__main__':
+if __name__ == "__main__":
parser = ArxivRawImporter(None)
parser.parse_file(open(sys.argv[1]))
diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py
index 0340f6a3..e9de42fc 100755
--- a/python/fatcat_tools/importers/cdl_dash_dat.py
+++ b/python/fatcat_tools/importers/cdl_dash_dat.py
@@ -34,15 +34,15 @@ def single_file(prefix, path):
hashlib.sha1(),
hashlib.sha256(),
]
- with open(full, 'rb') as fp:
+ with open(full, "rb") as fp:
while True:
- data = fp.read(2**20)
+ data = fp.read(2 ** 20)
if not data:
break
for h in hashes:
h.update(data)
mime = magic.Magic(mime=True).from_file(full)
- if mime == 'application/octet-stream':
+ if mime == "application/octet-stream":
# magic apparently isn't that great; try using filename as well
guess = mimetypes.guess_type(full)[0]
if guess:
@@ -54,9 +54,11 @@ def single_file(prefix, path):
md5=hashes[0].hexdigest(),
sha1=hashes[1].hexdigest(),
sha256=hashes[2].hexdigest(),
- extra=dict(mimetype=mime))
+ extra=dict(mimetype=mime),
+ )
return fsf
+
def make_manifest(base_dir):
manifest = []
for root, dirs, files in os.walk(base_dir):
@@ -70,47 +72,49 @@ def cdl_dash_release(meta, extra=None):
if not extra:
extra = dict()
- assert meta['identifier']['type'] == 'DOI'
- doi = meta['identifier']['value'].lower()
- assert doi.startswith('10.')
+ assert meta["identifier"]["type"] == "DOI"
+ doi = meta["identifier"]["value"].lower()
+ assert doi.startswith("10.")
ark_id = None
- for extid in meta.get('alternativeIdentifiers', []):
- if extid['value'].startswith('ark:'):
- ark_id = extid['value']
+ for extid in meta.get("alternativeIdentifiers", []):
+ if extid["value"].startswith("ark:"):
+ ark_id = extid["value"]
assert ark_id
- license_slug = lookup_license_slug(meta['rights']['uri'])
+ license_slug = lookup_license_slug(meta["rights"]["uri"])
abstracts = []
- for desc in meta['descriptions']:
- if desc['type'] == "abstract":
- abstracts.append(ReleaseAbstract(
- mimetype="text/html",
- content=clean(desc['value'])))
- #print(abstracts)
+ for desc in meta["descriptions"]:
+ if desc["type"] == "abstract":
+ abstracts.append(
+ ReleaseAbstract(mimetype="text/html", content=clean(desc["value"]))
+ )
+ # print(abstracts)
if not abstracts:
abstracts = None
contribs = []
- for creator in meta['creator']:
- contribs.append(ReleaseContrib(
- given_name=creator['given'],
- surname=creator['family'],
- # sorry everybody
- raw_name="{} {}".format(creator['given'], creator['family']),
- raw_affiliation=creator.get('affiliation'),
- role="author", # presumably, for these datasets?
- ))
+ for creator in meta["creator"]:
+ contribs.append(
+ ReleaseContrib(
+ given_name=creator["given"],
+ surname=creator["family"],
+ # sorry everybody
+ raw_name="{} {}".format(creator["given"], creator["family"]),
+ raw_affiliation=creator.get("affiliation"),
+ role="author", # presumably, for these datasets?
+ )
+ )
r = ReleaseEntity(
ext_ids=ReleaseExtIds(
doi=doi,
ark=ark_id,
),
- title=clean(meta['title'], force_xml=True),
- publisher=clean(meta['publisher']),
- release_year=int(meta['publicationYear']),
+ title=clean(meta["title"], force_xml=True),
+ publisher=clean(meta["publisher"]),
+ release_year=int(meta["publicationYear"]),
release_type="dataset",
license_slug=license_slug,
contribs=contribs,
@@ -119,66 +123,66 @@ def cdl_dash_release(meta, extra=None):
)
return r
+
def make_release_fileset(dat_path):
- if dat_path.endswith('/'):
+ if dat_path.endswith("/"):
dat_path = dat_path[:-1]
dat_discovery = dat_path
extra = dict()
assert len(dat_discovery) == 64
- with open(dat_path + "/cdl_dash_metadata.json", 'r') as fp:
+ with open(dat_path + "/cdl_dash_metadata.json", "r") as fp:
meta_dict = json.loads(fp.read())
release = cdl_dash_release(meta_dict)
- ark_id = release.extra['ark_id']
+ ark_id = release.extra["ark_id"]
dash_version = None
# really crude XML parse-out
- with open(dat_path + "/stash-wrapper.xml", 'r') as fp:
+ with open(dat_path + "/stash-wrapper.xml", "r") as fp:
for line in fp:
line = line.strip()
if line.startswith("<st:version_number>"):
- dash_version = int(line[19:].split('<')[0])
+ dash_version = int(line[19:].split("<")[0])
assert dash_version is not None
- extra['cdl_dash'] = dict(version=dash_version)
- release.extra['cdl_dash'] = dict(version=dash_version)
+ extra["cdl_dash"] = dict(version=dash_version)
+ release.extra["cdl_dash"] = dict(version=dash_version)
manifest = make_manifest(dat_path + "/files/")
bundle_url = dict(
url="https://merritt.cdlib.org/u/{}/{}".format(
- urllib.parse.quote(ark_id, safe=''),
- dash_version),
- rel="repo-bundle")
+ urllib.parse.quote(ark_id, safe=""), dash_version
+ ),
+ rel="repo-bundle",
+ )
repo_url = dict(
url="https://merritt.cdlib.org/d/{}/{}/".format(
- urllib.parse.quote(ark_id, safe=''),
- dash_version),
- rel="repo")
- dat_url = dict(
- url="dat://{}/files/".format(dat_discovery),
- rel="dweb")
+ urllib.parse.quote(ark_id, safe=""), dash_version
+ ),
+ rel="repo",
+ )
+ dat_url = dict(url="dat://{}/files/".format(dat_discovery), rel="dweb")
fs = FilesetEntity(
- urls=[bundle_url, repo_url, dat_url],
- release_ids=None,
- manifest=manifest,
- extra=extra)
+ urls=[bundle_url, repo_url, dat_url], release_ids=None, manifest=manifest, extra=extra
+ )
return (release, fs)
+
def auto_cdl_dash_dat(api, dat_path, release_id=None, editgroup_id=None):
- git_rev = subprocess.check_output(
- ["git", "describe", "--always"]).strip().decode('utf-8')
+ git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
(release, fileset) = make_release_fileset(dat_path)
if not editgroup_id:
- eg = api.create_editgroup(Editgroup(
- description="One-off import of dataset(s) from CDL/DASH repository (via IA, Dat dweb pilot project)",
- extra=dict(
- git_rev=git_rev,
- agent="fatcat_tools.auto_cdl_dash_dat")))
+ eg = api.create_editgroup(
+ Editgroup(
+ description="One-off import of dataset(s) from CDL/DASH repository (via IA, Dat dweb pilot project)",
+ extra=dict(git_rev=git_rev, agent="fatcat_tools.auto_cdl_dash_dat"),
+ )
+ )
editgroup_id = eg.editgroup_id
if not release_id and release.ext_ids.doi:
@@ -201,6 +205,7 @@ def auto_cdl_dash_dat(api, dat_path, release_id=None, editgroup_id=None):
fileset = api.get_fileset(edit.ident)
return (editgroup_id, release, fileset)
-if __name__=='__main__':
+
+if __name__ == "__main__":
# pass this a discovery key that has been cloned to the local directory
print(make_release_fileset(sys.argv[1]))
diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py
index 0b634e73..8d2a89b6 100644
--- a/python/fatcat_tools/importers/chocula.py
+++ b/python/fatcat_tools/importers/chocula.py
@@ -1,4 +1,3 @@
-
import fatcat_openapi_client
from .common import EntityImporter, clean
@@ -15,20 +14,19 @@ class ChoculaImporter(EntityImporter):
def __init__(self, api, **kwargs):
- eg_desc = kwargs.get('editgroup_description',
- "Automated import of container-level metadata from Chocula tool.")
- eg_extra = kwargs.get('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ChoculaImporter')
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ eg_desc = kwargs.get(
+ "editgroup_description",
+ "Automated import of container-level metadata from Chocula tool.",
+ )
+ eg_extra = kwargs.get("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ChoculaImporter")
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
def want(self, raw_record):
- if not raw_record.get('ident') and not raw_record.get('_known_issnl'):
- self.counts['skip-unknown-new-issnl'] += 1
+ if not raw_record.get("ident") and not raw_record.get("_known_issnl"):
+ self.counts["skip-unknown-new-issnl"] += 1
return False
- if raw_record.get('issnl') and raw_record.get('name'):
+ if raw_record.get("issnl") and raw_record.get("name"):
return True
return False
@@ -39,42 +37,55 @@ class ChoculaImporter(EntityImporter):
returns a ContainerEntity (or None if invalid or couldn't parse)
"""
- name = clean(row.get('name'))
+ name = clean(row.get("name"))
if not name:
# Name is required (by schema)
return None
name = name.strip()
- if name.endswith(', Proceedings of the'):
- name = "Proceedings of the " + name.split(',')[0]
+ if name.endswith(", Proceedings of the"):
+ name = "Proceedings of the " + name.split(",")[0]
- if name.endswith('.'):
+ if name.endswith("."):
name = name[:-1]
extra = dict()
- for k in ('urls', 'webarchive_urls', 'country',
- 'sherpa_romeo', 'ezb', 'szczepanski', 'doaj', 'languages',
- 'ia', 'scielo', 'kbart', 'publisher_type', 'platform'):
- if row['extra'].get(k):
- extra[k] = row['extra'][k]
+ for k in (
+ "urls",
+ "webarchive_urls",
+ "country",
+ "sherpa_romeo",
+ "ezb",
+ "szczepanski",
+ "doaj",
+ "languages",
+ "ia",
+ "scielo",
+ "kbart",
+ "publisher_type",
+ "platform",
+ ):
+ if row["extra"].get(k):
+ extra[k] = row["extra"][k]
container_type = None
- if 'proceedings' in name.lower():
- container_type = 'proceedings'
- elif 'journal ' in name.lower():
- container_type = 'journal'
+ if "proceedings" in name.lower():
+ container_type = "proceedings"
+ elif "journal " in name.lower():
+ container_type = "journal"
ce = fatcat_openapi_client.ContainerEntity(
- issnl=row['issnl'],
- issnp=row['extra'].get('issnp'),
- issne=row['extra'].get('issne'),
- ident=row['ident'],
+ issnl=row["issnl"],
+ issnp=row["extra"].get("issnp"),
+ issne=row["extra"].get("issne"),
+ ident=row["ident"],
name=name,
container_type=container_type,
- publisher=clean(row.get('publisher')),
- wikidata_qid=row.get('wikidata_qid'),
- extra=extra)
+ publisher=clean(row.get("publisher")),
+ wikidata_qid=row.get("wikidata_qid"),
+ extra=extra,
+ )
return ce
def try_update(self, ce):
@@ -86,12 +97,12 @@ class ChoculaImporter(EntityImporter):
except fatcat_openapi_client.rest.ApiException as err:
if err.status != 404:
raise err
- self.counts['exists'] += 1
- self.counts['exists-not-found'] += 1
+ self.counts["exists"] += 1
+ self.counts["exists-not-found"] += 1
return False
- if existing.state != 'active':
- self.counts['exists'] += 1
- self.counts['exists-inactive'] += 1
+ if existing.state != "active":
+ self.counts["exists"] += 1
+ self.counts["exists-inactive"] += 1
return False
if not existing:
@@ -102,8 +113,8 @@ class ChoculaImporter(EntityImporter):
if err.status != 404:
raise err
if existing:
- self.counts['exists'] += 1
- self.counts['exists-by-issnl'] += 1
+ self.counts["exists"] += 1
+ self.counts["exists-by-issnl"] += 1
return False
# doesn't exist, always create
return True
@@ -111,18 +122,22 @@ class ChoculaImporter(EntityImporter):
# decide whether to update
do_update = False
if not self.do_updates:
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
if not existing.extra:
existing.extra = dict()
- if ce.extra.get('urls') and set(ce.extra.get('urls', [])) != set(existing.extra.get('urls', [])):
+ if ce.extra.get("urls") and set(ce.extra.get("urls", [])) != set(
+ existing.extra.get("urls", [])
+ ):
do_update = True
- if ce.extra.get('webarchive_urls') and set(ce.extra.get('webarchive_urls', [])) != set(existing.extra.get('webarchive_urls', [])):
+ if ce.extra.get("webarchive_urls") and set(ce.extra.get("webarchive_urls", [])) != set(
+ existing.extra.get("webarchive_urls", [])
+ ):
do_update = True
- for k in ('ezb', 'szczepanski', 'publisher_type', 'platform'):
+ for k in ("ezb", "szczepanski", "publisher_type", "platform"):
if ce.extra.get(k) and not existing.extra.get(k):
do_update = True
- for k in ('kbart', 'ia', 'doaj'):
+ for k in ("kbart", "ia", "doaj"):
# always update these fields if not equal (chocula override)
if ce.extra.get(k) and ce.extra[k] != existing.extra.get(k):
do_update = True
@@ -137,41 +152,53 @@ class ChoculaImporter(EntityImporter):
existing.container_type = existing.container_type or ce.container_type
existing.issne = existing.issne or ce.issne
existing.issnp = existing.issnp or ce.issnp
- for k in ('urls', 'webarchive_urls'):
+ for k in ("urls", "webarchive_urls"):
# be conservative about URL updates; don't clobber existing URL lists
# may want to make this behavior more sophisticated in the
# future, or at least a config flag
if ce.extra.get(k) and not existing.extra.get(k):
existing.extra[k] = ce.extra.get(k, [])
- for k in ('sherpa_romeo', 'ezb', 'szczepanski', 'doaj', 'ia',
- 'scielo', 'kbart', 'publisher_type', 'platform'):
+ for k in (
+ "sherpa_romeo",
+ "ezb",
+ "szczepanski",
+ "doaj",
+ "ia",
+ "scielo",
+ "kbart",
+ "publisher_type",
+ "platform",
+ ):
# always update (chocula over-rides)
if ce.extra.get(k):
existing.extra[k] = ce.extra[k]
- for k in ('country',):
+ for k in ("country",):
# only include if not set (don't clobber human edits)
if ce.extra.get(k) and not existing.extra.get(k):
existing.extra[k] = ce.extra[k]
- if ce.extra.get('languages'):
- if not existing.extra.get('languages'):
- existing.extra['languages'] = ce.extra['languages']
- elif not ce.extra['languages'][0] in existing.extra['languages']:
- existing.extra['languages'].append(ce.extra['languages'][0])
+ if ce.extra.get("languages"):
+ if not existing.extra.get("languages"):
+ existing.extra["languages"] = ce.extra["languages"]
+ elif not ce.extra["languages"][0] in existing.extra["languages"]:
+ existing.extra["languages"].append(ce.extra["languages"][0])
self.api.update_container(self.get_editgroup_id(), existing.ident, existing)
- self.counts['update'] += 1
+ self.counts["update"] += 1
return False
else:
- self.counts['exists'] += 1
- self.counts['exists-skip-update'] += 1
+ self.counts["exists"] += 1
+ self.counts["exists-skip-update"] += 1
return False
# if we got this far, it's a bug
raise NotImplementedError
def insert_batch(self, batch):
- self.api.create_container_auto_batch(fatcat_openapi_client.ContainerAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_container_auto_batch(
+ fatcat_openapi_client.ContainerAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index e33a2012..2639c85a 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -1,4 +1,3 @@
-
import csv
import datetime
import json
@@ -34,7 +33,6 @@ SANE_MAX_URLS: int = 100
DOMAIN_REL_MAP: Dict[str, str] = {
"archive.org": "archive",
# LOCKSS, Portico, DuraSpace, etc would also be "archive"
-
"arxiv.org": "repository",
"babel.hathitrust.org": "repository",
"cds.cern.ch": "repository",
@@ -53,7 +51,6 @@ DOMAIN_REL_MAP: Dict[str, str] = {
"zenodo.org": "repository",
"www.biorxiv.org": "repository",
"www.medrxiv.org": "repository",
-
"citeseerx.ist.psu.edu": "aggregator",
"publisher-connector.core.ac.uk": "aggregator",
"core.ac.uk": "aggregator",
@@ -62,7 +59,6 @@ DOMAIN_REL_MAP: Dict[str, str] = {
"pdfs.semanticscholar.org": "aggregator",
"semanticscholar.org": "aggregator",
"www.semanticscholar.org": "aggregator",
-
"academic.oup.com": "publisher",
"cdn.elifesciences.org": "publisher",
"cell.com": "publisher",
@@ -86,15 +82,14 @@ DOMAIN_REL_MAP: Dict[str, str] = {
"ehp.niehs.nih.gov": "publisher",
"journals.tsu.ru": "publisher",
"www.cogentoa.com": "publisher",
-
"www.researchgate.net": "academicsocial",
"academia.edu": "academicsocial",
-
"wayback.archive-it.org": "webarchive",
"web.archive.org": "webarchive",
"archive.is": "webarchive",
}
+
def make_rel_url(raw_url: str, default_link_rel: str = "web"):
# this is where we map specific domains to rel types, and also filter out
# bad domains, invalid URLs, etc
@@ -105,12 +100,17 @@ def make_rel_url(raw_url: str, default_link_rel: str = "web"):
break
return (rel, raw_url)
+
def test_make_rel_url():
assert make_rel_url("http://example.com/thing.pdf")[0] == "web"
assert make_rel_url("http://example.com/thing.pdf", default_link_rel="jeans")[0] == "jeans"
- assert make_rel_url("https://web.archive.org/web/*/http://example.com/thing.pdf")[0] == "webarchive"
+ assert (
+ make_rel_url("https://web.archive.org/web/*/http://example.com/thing.pdf")[0]
+ == "webarchive"
+ )
assert make_rel_url("http://cell.com/thing.pdf")[0] == "publisher"
+
class EntityImporter:
"""
Base class for fatcat entity importers.
@@ -147,23 +147,26 @@ class EntityImporter:
def __init__(self, api, **kwargs):
- eg_extra = kwargs.get('editgroup_extra', dict())
- eg_extra['git_rev'] = eg_extra.get('git_rev',
- subprocess.check_output(["git", "describe", "--always"]).strip()).decode('utf-8')
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.EntityImporter')
+ eg_extra = kwargs.get("editgroup_extra", dict())
+ eg_extra["git_rev"] = eg_extra.get(
+ "git_rev", subprocess.check_output(["git", "describe", "--always"]).strip()
+ ).decode("utf-8")
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.EntityImporter")
self.api = api
- self.do_updates = bool(kwargs.get('do_updates', True))
- self.do_fuzzy_match: bool = kwargs.get('do_fuzzy_match', True)
- self.bezerk_mode: bool = kwargs.get('bezerk_mode', False)
- self.submit_mode: bool = kwargs.get('submit_mode', False)
- self.edit_batch_size: int = kwargs.get('edit_batch_size', 100)
- self.editgroup_description: Optional[str] = kwargs.get('editgroup_description')
+ self.do_updates = bool(kwargs.get("do_updates", True))
+ self.do_fuzzy_match: bool = kwargs.get("do_fuzzy_match", True)
+ self.bezerk_mode: bool = kwargs.get("bezerk_mode", False)
+ self.submit_mode: bool = kwargs.get("submit_mode", False)
+ self.edit_batch_size: int = kwargs.get("edit_batch_size", 100)
+ self.editgroup_description: Optional[str] = kwargs.get("editgroup_description")
self.editgroup_extra: Optional[Any] = eg_extra
- self.es_client = kwargs.get('es_client')
+ self.es_client = kwargs.get("es_client")
if not self.es_client:
- self.es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki", timeout=120)
+ self.es_client = elasticsearch.Elasticsearch(
+ "https://search.fatcat.wiki", timeout=120
+ )
self._issnl_id_map: Dict[str, Any] = dict()
self._orcid_id_map: Dict[str, Any] = dict()
@@ -174,7 +177,7 @@ class EntityImporter:
self.reset()
def reset(self) -> None:
- self.counts = Counter({'total': 0, 'skip': 0, 'insert': 0, 'update': 0, 'exists': 0})
+ self.counts = Counter({"total": 0, "skip": 0, "insert": 0, "update": 0, "exists": 0})
self._edit_count: int = 0
self._editgroup_id: Optional[str] = None
self._entity_queue: List[Any] = []
@@ -184,13 +187,13 @@ class EntityImporter:
"""
Returns nothing.
"""
- self.counts['total'] += 1
+ self.counts["total"] += 1
if (not raw_record) or (not self.want(raw_record)):
- self.counts['skip'] += 1
+ self.counts["skip"] += 1
return
entity = self.parse_record(raw_record)
if not entity:
- self.counts['skip'] += 1
+ self.counts["skip"] += 1
return
if self.bezerk_mode:
self.push_entity(entity)
@@ -230,7 +233,7 @@ class EntityImporter:
if self._entity_queue:
self.insert_batch(self._entity_queue)
- self.counts['insert'] += len(self._entity_queue)
+ self.counts["insert"] += len(self._entity_queue)
self._entity_queue = []
return self.counts
@@ -248,8 +251,9 @@ class EntityImporter:
if not self._editgroup_id:
eg = self.api.create_editgroup(
fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra))
+ description=self.editgroup_description, extra=self.editgroup_extra
+ )
+ )
self._editgroup_id = eg.editgroup_id
self._edit_count += edits
@@ -257,30 +261,30 @@ class EntityImporter:
def create_container(self, entity):
eg_id = self.get_editgroup_id()
- self.counts['inserted.container'] += 1
+ self.counts["inserted.container"] += 1
return self.api.create_container(eg_id, entity)
def create_release(self, entity):
eg_id = self.get_editgroup_id()
- self.counts['inserted.release'] += 1
+ self.counts["inserted.release"] += 1
return self.api.create_release(eg_id, entity)
def create_file(self, entity):
eg_id = self.get_editgroup_id()
- self.counts['inserted.file'] += 1
+ self.counts["inserted.file"] += 1
return self.api.create_file(eg_id, entity)
def updated(self):
"""
Implementations should call this from try_update() if the update was successful
"""
- self.counts['update'] += 1
+ self.counts["update"] += 1
def push_entity(self, entity):
self._entity_queue.append(entity)
if len(self._entity_queue) >= self.edit_batch_size:
self.insert_batch(self._entity_queue)
- self.counts['insert'] += len(self._entity_queue)
+ self.counts["insert"] += len(self._entity_queue)
self._entity_queue = []
def want(self, raw_record: Any) -> bool:
@@ -324,7 +328,7 @@ class EntityImporter:
# If anything other than a 404 (not found), something is wrong
if ae.status != 404:
raise ae
- self._orcid_id_map[orcid] = creator_id # might be None
+ self._orcid_id_map[orcid] = creator_id # might be None
return creator_id
def is_doi(self, doi: str) -> bool:
@@ -347,7 +351,7 @@ class EntityImporter:
# If anything other than a 404 (not found), something is wrong
if ae.status != 404:
raise ae
- self._doi_id_map[doi] = release_id # might be None
+ self._doi_id_map[doi] = release_id # might be None
return release_id
def lookup_pmid(self, pmid: str):
@@ -364,11 +368,11 @@ class EntityImporter:
# If anything other than a 404 (not found), something is wrong
if ae.status != 404:
raise ae
- self._pmid_id_map[pmid] = release_id # might be None
+ self._pmid_id_map[pmid] = release_id # might be None
return release_id
def is_issnl(self, issnl: str) -> bool:
- return len(issnl) == 9 and issnl[4] == '-'
+ return len(issnl) == 9 and issnl[4] == "-"
def lookup_issnl(self, issnl: str):
"""Caches calls to the ISSN-L lookup API endpoint in a local dict"""
@@ -382,7 +386,7 @@ class EntityImporter:
# If anything other than a 404 (not found), something is wrong
if ae.status != 404:
raise ae
- self._issnl_id_map[issnl] = container_id # might be None
+ self._issnl_id_map[issnl] = container_id # might be None
return container_id
def read_issn_map_file(self, issn_map_file):
@@ -417,26 +421,26 @@ class EntityImporter:
# update old/deprecated 'rel' on URLs
for i in range(len(existing.urls)):
u = existing.urls[i]
- if u.rel == 'repository' and '://archive.org/download/' in u.url:
- existing.urls[i].rel = 'archive'
- if u.rel == 'social':
- u.rel = 'academicsocial'
+ if u.rel == "repository" and "://archive.org/download/" in u.url:
+ existing.urls[i].rel = "archive"
+ if u.rel == "social":
+ u.rel = "academicsocial"
# remove URLs which are near-duplicates
redundant_urls = []
all_urls = [u.url for u in existing.urls]
- all_wayback_urls = [u.url for u in existing.urls if '://web.archive.org/web/' in u.url]
+ all_wayback_urls = [u.url for u in existing.urls if "://web.archive.org/web/" in u.url]
for url in all_urls:
# https/http redundancy
- if url.startswith('http://') and url.replace('http://', 'https://', 1) in all_urls:
+ if url.startswith("http://") and url.replace("http://", "https://", 1) in all_urls:
redundant_urls.append(url)
continue
# default HTTP port included and not included
- if ':80/' in url and url.replace(':80', '', 1) in all_urls:
+ if ":80/" in url and url.replace(":80", "", 1) in all_urls:
redundant_urls.append(url)
continue
# partial and complete wayback timestamps
- if '://web.archive.org/web/2017/' in url:
+ if "://web.archive.org/web/2017/" in url:
original_url = "/".join(url.split("/")[5:])
assert len(original_url) > 5
for wb_url in all_wayback_urls:
@@ -452,7 +456,9 @@ class EntityImporter:
def generic_fileset_cleanups(existing):
return existing
- def match_existing_release_fuzzy(self, release: ReleaseEntity) -> Optional[Tuple[str, str, ReleaseEntity]]:
+ def match_existing_release_fuzzy(
+ self, release: ReleaseEntity
+ ) -> Optional[Tuple[str, str, ReleaseEntity]]:
"""
This helper function uses fuzzycat (and elasticsearch) to look for
existing release entities with similar metadata.
@@ -488,7 +494,15 @@ class EntityImporter:
return None
release_dict = entity_to_dict(release, api_client=self.api.api_client)
- verified = [(fuzzycat.verify.verify(release_dict, entity_to_dict(c, api_client=self.api.api_client)), c) for c in candidates]
+ verified = [
+ (
+ fuzzycat.verify.verify(
+ release_dict, entity_to_dict(c, api_client=self.api.api_client)
+ ),
+ c,
+ )
+ for c in candidates
+ ]
# chose the "closest" match
closest = sorted(verified, key=lambda v: STATUS_SORT[v[0].status])[0]
@@ -522,7 +536,6 @@ class RecordPusher:
class JsonLinePusher(RecordPusher):
-
def __init__(self, importer, json_file, **kwargs):
self.importer = importer
self.json_file = json_file
@@ -539,10 +552,9 @@ class JsonLinePusher(RecordPusher):
class CsvPusher(RecordPusher):
-
def __init__(self, importer, csv_file, **kwargs):
self.importer = importer
- self.reader = csv.DictReader(csv_file, delimiter=kwargs.get('delimiter', ','))
+ self.reader = csv.DictReader(csv_file, delimiter=kwargs.get("delimiter", ","))
def run(self):
for line in self.reader:
@@ -555,7 +567,6 @@ class CsvPusher(RecordPusher):
class LinePusher(RecordPusher):
-
def __init__(self, importer, text_file, **kwargs):
self.importer = importer
self.text_file = text_file
@@ -571,17 +582,15 @@ class LinePusher(RecordPusher):
class SqlitePusher(RecordPusher):
-
def __init__(self, importer, db_file, table_name, where_clause="", **kwargs):
self.importer = importer
- self.db = sqlite3.connect(db_file, isolation_level='EXCLUSIVE')
+ self.db = sqlite3.connect(db_file, isolation_level="EXCLUSIVE")
self.db.row_factory = sqlite3.Row
self.table_name = table_name
self.where_clause = where_clause
def run(self):
- cur = self.db.execute("SELECT * FROM {} {};".format(
- self.table_name, self.where_clause))
+ cur = self.db.execute("SELECT * FROM {} {};".format(self.table_name, self.where_clause))
for row in cur:
self.importer.push_record(row)
counts = self.importer.finish()
@@ -590,7 +599,6 @@ class SqlitePusher(RecordPusher):
class Bs4XmlLinesPusher(RecordPusher):
-
def __init__(self, importer, xml_file, prefix_filter=None, **kwargs):
self.importer = importer
self.xml_file = xml_file
@@ -611,7 +619,6 @@ class Bs4XmlLinesPusher(RecordPusher):
class Bs4XmlFilePusher(RecordPusher):
-
def __init__(self, importer, xml_file, record_tag, **kwargs):
self.importer = importer
self.xml_file = xml_file
@@ -684,7 +691,6 @@ class Bs4XmlLargeFilePusher(RecordPusher):
class Bs4XmlFileListPusher(RecordPusher):
-
def __init__(self, importer, list_file, record_tag, **kwargs):
self.importer = importer
self.list_file = list_file
@@ -695,7 +701,7 @@ class Bs4XmlFileListPusher(RecordPusher):
xml_path = xml_path.strip()
if not xml_path or xml_path.startswith("#"):
continue
- with open(xml_path, 'r') as xml_file:
+ with open(xml_path, "r") as xml_file:
soup = BeautifulSoup(xml_file, "xml")
for record in soup.find_all(self.record_tag):
self.importer.push_record(record)
@@ -705,10 +711,12 @@ class Bs4XmlFileListPusher(RecordPusher):
print(counts)
return counts
+
class KafkaBs4XmlPusher(RecordPusher):
"""
Fetch XML for an article from Kafka, parse via Bs4.
"""
+
def __init__(self, importer, kafka_hosts, kafka_env, topic_suffix, group, **kwargs):
self.importer = importer
self.consumer = make_kafka_consumer(
@@ -716,10 +724,10 @@ class KafkaBs4XmlPusher(RecordPusher):
kafka_env,
topic_suffix,
group,
- kafka_namespace=kwargs.get('kafka_namespace', 'fatcat')
+ kafka_namespace=kwargs.get("kafka_namespace", "fatcat"),
)
- self.poll_interval = kwargs.get('poll_interval', 5.0)
- self.consume_batch_size = kwargs.get('consume_batch_size', 25)
+ self.poll_interval = kwargs.get("poll_interval", 5.0)
+ self.consume_batch_size = kwargs.get("consume_batch_size", 25)
def run(self):
count = 0
@@ -735,16 +743,19 @@ class KafkaBs4XmlPusher(RecordPusher):
# outstanding editgroups every 5 minutes, but there is still that
# window when editgroups might be hanging (unsubmitted).
batch = self.consumer.consume(
- num_messages=self.consume_batch_size,
- timeout=self.poll_interval)
- print("... got {} kafka messages ({}sec poll interval) {}".format(
- len(batch), self.poll_interval, self.importer.counts))
+ num_messages=self.consume_batch_size, timeout=self.poll_interval
+ )
+ print(
+ "... got {} kafka messages ({}sec poll interval) {}".format(
+ len(batch), self.poll_interval, self.importer.counts
+ )
+ )
if not batch:
if datetime.datetime.now() - last_push > datetime.timedelta(minutes=5):
# it has been some time, so flush any current editgroup
self.importer.finish()
last_push = datetime.datetime.now()
- #print("Flushed any partial import batch: {}".format(self.importer.counts))
+ # print("Flushed any partial import batch: {}".format(self.importer.counts))
continue
# first check errors on entire batch...
for msg in batch:
@@ -752,7 +763,7 @@ class KafkaBs4XmlPusher(RecordPusher):
raise KafkaException(msg.error())
# ... then process
for msg in batch:
- soup = BeautifulSoup(msg.value().decode('utf-8'), "xml")
+ soup = BeautifulSoup(msg.value().decode("utf-8"), "xml")
self.importer.push_record(soup)
soup.decompose()
count += 1
@@ -771,8 +782,8 @@ class KafkaBs4XmlPusher(RecordPusher):
self.consumer.close()
return counts
-class KafkaJsonPusher(RecordPusher):
+class KafkaJsonPusher(RecordPusher):
def __init__(self, importer, kafka_hosts, kafka_env, topic_suffix, group, **kwargs):
self.importer = importer
self.consumer = make_kafka_consumer(
@@ -780,11 +791,11 @@ class KafkaJsonPusher(RecordPusher):
kafka_env,
topic_suffix,
group,
- kafka_namespace=kwargs.get('kafka_namespace', 'fatcat')
+ kafka_namespace=kwargs.get("kafka_namespace", "fatcat"),
)
- self.poll_interval = kwargs.get('poll_interval', 5.0)
- self.consume_batch_size = kwargs.get('consume_batch_size', 100)
- self.force_flush = kwargs.get('force_flush', False)
+ self.poll_interval = kwargs.get("poll_interval", 5.0)
+ self.consume_batch_size = kwargs.get("consume_batch_size", 100)
+ self.force_flush = kwargs.get("force_flush", False)
def run(self):
count = 0
@@ -801,10 +812,13 @@ class KafkaJsonPusher(RecordPusher):
# outstanding editgroups every 5 minutes, but there is still that
# window when editgroups might be hanging (unsubmitted).
batch = self.consumer.consume(
- num_messages=self.consume_batch_size,
- timeout=self.poll_interval)
- print("... got {} kafka messages ({}sec poll interval) {}".format(
- len(batch), self.poll_interval, self.importer.counts))
+ num_messages=self.consume_batch_size, timeout=self.poll_interval
+ )
+ print(
+ "... got {} kafka messages ({}sec poll interval) {}".format(
+ len(batch), self.poll_interval, self.importer.counts
+ )
+ )
if self.force_flush:
# this flushing happens even if there have been 'push' events
# more recently. it is intended for, eg, importers off the
@@ -821,7 +835,7 @@ class KafkaJsonPusher(RecordPusher):
self.importer.finish()
last_push = datetime.datetime.now()
last_force_flush = datetime.datetime.now()
- #print("Flushed any partial import batch: {}".format(self.importer.counts))
+ # print("Flushed any partial import batch: {}".format(self.importer.counts))
continue
# first check errors on entire batch...
for msg in batch:
@@ -829,7 +843,7 @@ class KafkaJsonPusher(RecordPusher):
raise KafkaException(msg.error())
# ... then process
for msg in batch:
- record = json.loads(msg.value().decode('utf-8'))
+ record = json.loads(msg.value().decode("utf-8"))
self.importer.push_record(record)
count += 1
if count % 500 == 0:
@@ -864,25 +878,25 @@ def make_kafka_consumer(hosts, env, topic_suffix, group, kafka_namespace="fatcat
print("Bailing out...")
# TODO: should it be sys.exit(-1)?
raise KafkaException(p.error)
- #print("Kafka consumer commit successful")
+ # print("Kafka consumer commit successful")
pass
# previously, using pykafka
- #auto_commit_enable=True,
- #auto_commit_interval_ms=30000, # 30 seconds
+ # auto_commit_enable=True,
+ # auto_commit_interval_ms=30000, # 30 seconds
conf = {
- 'bootstrap.servers': hosts,
- 'group.id': group,
- 'on_commit': fail_fast,
+ "bootstrap.servers": hosts,
+ "group.id": group,
+ "on_commit": fail_fast,
# messages don't have offset marked as stored until pushed to
# elastic, but we do auto-commit stored offsets to broker
- 'enable.auto.offset.store': False,
- 'enable.auto.commit': True,
+ "enable.auto.offset.store": False,
+ "enable.auto.commit": True,
# user code timeout; if no poll after this long, assume user code
# hung and rebalance (default: 5min)
- 'max.poll.interval.ms': 120000,
- 'default.topic.config': {
- 'auto.offset.reset': 'latest',
+ "max.poll.interval.ms": 120000,
+ "default.topic.config": {
+ "auto.offset.reset": "latest",
},
}
@@ -890,13 +904,13 @@ def make_kafka_consumer(hosts, env, topic_suffix, group, kafka_namespace="fatcat
for p in partitions:
if p.error:
raise KafkaException(p.error)
- print("Kafka partitions rebalanced: {} / {}".format(
- consumer, partitions))
+ print("Kafka partitions rebalanced: {} / {}".format(consumer, partitions))
consumer = Consumer(conf)
# NOTE: it's actually important that topic_name *not* be bytes (UTF-8
# encoded)
- consumer.subscribe([topic_name],
+ consumer.subscribe(
+ [topic_name],
on_assign=on_rebalance,
on_revoke=on_rebalance,
)
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index fd6936a4..606d4bb1 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -1,4 +1,3 @@
-
import datetime
import sqlite3
from typing import Any, Dict, Optional
@@ -13,30 +12,30 @@ from .common import EntityImporter, clean
# Can get a list of Crossref types (with counts) via API:
# https://api.crossref.org/works?rows=0&facet=type-name:*
CROSSREF_TYPE_MAP: Dict[str, Optional[str]] = {
- 'book': 'book',
- 'book-chapter': 'chapter',
- 'book-part': 'chapter',
- 'book-section': 'chapter',
- 'component': 'component',
- 'dataset': 'dataset',
- 'dissertation': 'thesis',
- 'edited-book': 'book',
- 'journal-article': 'article-journal',
- 'monograph': 'book',
- 'other': None,
- 'peer-review': 'peer_review',
- 'posted-content': 'post',
- 'proceedings-article': 'paper-conference',
- 'reference-book': 'book',
- 'reference-entry': 'entry',
- 'report': 'report',
- 'standard': 'standard',
+ "book": "book",
+ "book-chapter": "chapter",
+ "book-part": "chapter",
+ "book-section": "chapter",
+ "component": "component",
+ "dataset": "dataset",
+ "dissertation": "thesis",
+ "edited-book": "book",
+ "journal-article": "article-journal",
+ "monograph": "book",
+ "other": None,
+ "peer-review": "peer_review",
+ "posted-content": "post",
+ "proceedings-article": "paper-conference",
+ "reference-book": "book",
+ "reference-entry": "entry",
+ "report": "report",
+ "standard": "standard",
}
CONTAINER_TYPE_MAP: Dict[str, str] = {
- 'article-journal': 'journal',
- 'paper-conference': 'conference',
- 'book': 'book-series',
+ "article-journal": "journal",
+ "paper-conference": "conference",
+ "book": "book-series",
}
# These are based, informally, on sorting the most popular licenses found in
@@ -90,29 +89,41 @@ LICENSE_SLUG_MAP: Dict[str, str] = {
"//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0",
}
+
def lookup_license_slug(raw: str) -> Optional[str]:
if not raw:
return None
- raw = raw.strip().replace('http://', '//').replace('https://', '//')
- if 'creativecommons.org' in raw.lower():
+ raw = raw.strip().replace("http://", "//").replace("https://", "//")
+ if "creativecommons.org" in raw.lower():
raw = raw.lower()
- raw = raw.replace('/legalcode', '/').replace('/uk', '')
- if not raw.endswith('/'):
- raw = raw + '/'
+ raw = raw.replace("/legalcode", "/").replace("/uk", "")
+ if not raw.endswith("/"):
+ raw = raw + "/"
return LICENSE_SLUG_MAP.get(raw)
+
def test_lookup_license_slug():
assert lookup_license_slug("https://creativecommons.org/licenses/by-nc/3.0/") == "CC-BY-NC"
- assert lookup_license_slug("http://creativecommons.org/licenses/by/2.0/uk/legalcode") == "CC-BY"
- assert lookup_license_slug("https://creativecommons.org/publicdomain/zero/1.0/legalcode") == "CC-0"
+ assert (
+ lookup_license_slug("http://creativecommons.org/licenses/by/2.0/uk/legalcode")
+ == "CC-BY"
+ )
+ assert (
+ lookup_license_slug("https://creativecommons.org/publicdomain/zero/1.0/legalcode")
+ == "CC-0"
+ )
assert lookup_license_slug("http://creativecommons.org/licenses/by/4.0") == "CC-BY"
- assert lookup_license_slug("https://creativecommons.org/licenses/by-nc-sa/4.0/") == "CC-BY-NC-SA"
+ assert (
+ lookup_license_slug("https://creativecommons.org/licenses/by-nc-sa/4.0/")
+ == "CC-BY-NC-SA"
+ )
assert lookup_license_slug("https://www.ametsoc.org/PUBSReuseLicenses") == "AMETSOC"
assert lookup_license_slug("https://www.amec.org/PUBSReuseLicenses") is None
assert lookup_license_slug("") is None
assert lookup_license_slug(None) is None
+
class CrossrefImporter(EntityImporter):
"""
Importer for Crossref metadata.
@@ -124,18 +135,22 @@ class CrossrefImporter(EntityImporter):
def __init__(self, api, issn_map_file, **kwargs):
- eg_desc: Optional[str] = kwargs.get('editgroup_description',
- "Automated import of Crossref DOI metadata, harvested from REST API")
- eg_extra: Optional[dict] = kwargs.get('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.CrossrefImporter')
- super().__init__(api,
+ eg_desc: Optional[str] = kwargs.get(
+ "editgroup_description",
+ "Automated import of Crossref DOI metadata, harvested from REST API",
+ )
+ eg_extra: Optional[dict] = kwargs.get("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.CrossrefImporter")
+ super().__init__(
+ api,
issn_map_file=issn_map_file,
editgroup_description=eg_desc,
editgroup_extra=eg_extra,
- **kwargs)
+ **kwargs
+ )
- self.create_containers: bool = kwargs.get('create_containers', True)
- extid_map_file = kwargs.get('extid_map_file')
+ self.create_containers: bool = kwargs.get("create_containers", True)
+ extid_map_file = kwargs.get("extid_map_file")
self.extid_map_db: Optional[Any] = None
if extid_map_file:
db_uri = "file:{}?mode=ro".format(extid_map_file)
@@ -148,12 +163,27 @@ class CrossrefImporter(EntityImporter):
def lookup_ext_ids(self, doi: str) -> Optional[Any]:
if self.extid_map_db is None:
- return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
- row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1",
- [doi.lower()]).fetchone()
+ return dict(
+ core_id=None,
+ pmid=None,
+ pmcid=None,
+ wikidata_qid=None,
+ arxiv_id=None,
+ jstor_id=None,
+ )
+ row = self.extid_map_db.execute(
+ "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()]
+ ).fetchone()
if row is None:
- return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
- row = [str(cell or '') or None for cell in row]
+ return dict(
+ core_id=None,
+ pmid=None,
+ pmcid=None,
+ wikidata_qid=None,
+ arxiv_id=None,
+ jstor_id=None,
+ )
+ row = [str(cell or "") or None for cell in row]
return dict(
core_id=row[0],
pmid=row[1],
@@ -173,17 +203,17 @@ class CrossrefImporter(EntityImporter):
return CONTAINER_TYPE_MAP.get(crossref_type)
def want(self, obj: Dict[str, Any]) -> bool:
- if not obj.get('title'):
- self.counts['skip-blank-title'] += 1
+ if not obj.get("title"):
+ self.counts["skip-blank-title"] += 1
return False
# these are pre-registered DOIs before the actual record is ready
# title is a list of titles
- titles = obj.get('title')
+ titles = obj.get("title")
if titles is not None and titles[0].strip().lower() in [
- "OUP accepted manuscript".lower(),
- ]:
- self.counts['skip-stub-title'] += 1
+ "OUP accepted manuscript".lower(),
+ ]:
+ self.counts["skip-stub-title"] += 1
return False
# do most of these checks in-line below
@@ -197,86 +227,105 @@ class CrossrefImporter(EntityImporter):
# Ways to be out of scope (provisionally)
# journal-issue and journal-volume map to None, but allowed for now
- if obj.get('type') in (None, 'journal', 'proceedings',
- 'standard-series', 'report-series', 'book-series', 'book-set',
- 'book-track', 'proceedings-series'):
- self.counts['skip-release-type'] += 1
+ if obj.get("type") in (
+ None,
+ "journal",
+ "proceedings",
+ "standard-series",
+ "report-series",
+ "book-series",
+ "book-set",
+ "book-track",
+ "proceedings-series",
+ ):
+ self.counts["skip-release-type"] += 1
return None
# Do require the 'title' keys to exist, as release entities do
- if ('title' not in obj) or (not obj['title']):
- self.counts['skip-blank-title'] += 1
+ if ("title" not in obj) or (not obj["title"]):
+ self.counts["skip-blank-title"] += 1
return None
- release_type = self.map_release_type(obj['type'])
+ release_type = self.map_release_type(obj["type"])
# contribs
def do_contribs(obj_list, ctype):
contribs = []
for i, am in enumerate(obj_list):
creator_id = None
- if 'ORCID' in am.keys():
- creator_id = self.lookup_orcid(am['ORCID'].split('/')[-1])
+ if "ORCID" in am.keys():
+ creator_id = self.lookup_orcid(am["ORCID"].split("/")[-1])
# Sorry humans :(
- if am.get('given') and am.get('family'):
- raw_name = "{} {}".format(am['given'], am['family'])
- elif am.get('family'):
- raw_name = am['family']
+ if am.get("given") and am.get("family"):
+ raw_name = "{} {}".format(am["given"], am["family"])
+ elif am.get("family"):
+ raw_name = am["family"]
else:
# TODO: can end up empty
- raw_name = am.get('name') or am.get('given')
+ raw_name = am.get("name") or am.get("given")
extra = dict()
if ctype == "author":
index = i
else:
index = None
raw_affiliation = None
- if am.get('affiliation'):
- if len(am.get('affiliation')) > 0:
- raw_affiliation = am.get('affiliation')[0]['name']
- if len(am.get('affiliation')) > 1:
+ if am.get("affiliation"):
+ if len(am.get("affiliation")) > 0:
+ raw_affiliation = am.get("affiliation")[0]["name"]
+ if len(am.get("affiliation")) > 1:
# note: affiliation => more_affiliations
- extra['more_affiliations'] = [clean(a['name']) for a in am.get('affiliation')[1:]]
- if am.get('sequence') and am.get('sequence') != "additional":
- extra['seq'] = clean(am.get('sequence'))
+ extra["more_affiliations"] = [
+ clean(a["name"]) for a in am.get("affiliation")[1:]
+ ]
+ if am.get("sequence") and am.get("sequence") != "additional":
+ extra["seq"] = clean(am.get("sequence"))
if not extra:
extra = None
assert ctype in ("author", "editor", "translator")
raw_name = clean(raw_name)
- contribs.append(fatcat_openapi_client.ReleaseContrib(
- creator_id=creator_id,
- index=index,
- raw_name=raw_name,
- given_name=clean(am.get('given')),
- surname=clean(am.get('family')),
- raw_affiliation=clean(raw_affiliation),
- role=ctype,
- extra=extra))
+ contribs.append(
+ fatcat_openapi_client.ReleaseContrib(
+ creator_id=creator_id,
+ index=index,
+ raw_name=raw_name,
+ given_name=clean(am.get("given")),
+ surname=clean(am.get("family")),
+ raw_affiliation=clean(raw_affiliation),
+ role=ctype,
+ extra=extra,
+ )
+ )
return contribs
- contribs = do_contribs(obj.get('author', []), "author")
- contribs.extend(do_contribs(obj.get('editor', []), "editor"))
- contribs.extend(do_contribs(obj.get('translator', []), "translator"))
+
+ contribs = do_contribs(obj.get("author", []), "author")
+ contribs.extend(do_contribs(obj.get("editor", []), "editor"))
+ contribs.extend(do_contribs(obj.get("translator", []), "translator"))
# container
- issn = obj.get('ISSN', [None])[0]
+ issn = obj.get("ISSN", [None])[0]
issnl = self.issn2issnl(issn)
container_id = None
if issnl:
container_id = self.lookup_issnl(issnl)
- publisher = clean(obj.get('publisher'))
+ publisher = clean(obj.get("publisher"))
- container_name = obj.get('container-title')
+ container_name = obj.get("container-title")
if container_name:
container_name = clean(container_name[0], force_xml=True)
if not container_name:
container_name = None
- if (container_id is None and self.create_containers and (issnl is not None)
- and container_name):
+ if (
+ container_id is None
+ and self.create_containers
+ and (issnl is not None)
+ and container_name
+ ):
ce = fatcat_openapi_client.ContainerEntity(
issnl=issnl,
publisher=publisher,
container_type=self.map_container_type(release_type),
- name=container_name)
+ name=container_name,
+ )
ce_edit = self.create_container(ce)
container_id = ce_edit.ident
self._issnl_id_map[issnl] = container_id
@@ -284,21 +333,21 @@ class CrossrefImporter(EntityImporter):
# license slug
license_slug = None
license_extra = []
- for lic in obj.get('license', []):
- if lic['content-version'] not in ('vor', 'unspecified'):
+ for lic in obj.get("license", []):
+ if lic["content-version"] not in ("vor", "unspecified"):
continue
- slug = lookup_license_slug(lic['URL'])
+ slug = lookup_license_slug(lic["URL"])
if slug:
license_slug = slug
- if 'start' in lic:
- lic['start'] = lic['start']['date-time']
+ if "start" in lic:
+ lic["start"] = lic["start"]["date-time"]
license_extra.append(lic)
# references
refs = []
- for i, rm in enumerate(obj.get('reference', [])):
+ for i, rm in enumerate(obj.get("reference", [])):
try:
- year: Optional[int] = int(rm.get('year'))
+ year: Optional[int] = int(rm.get("year"))
# TODO: will need to update/config in the future!
# NOTE: are there crossref works with year < 100?
if year is not None:
@@ -307,56 +356,78 @@ class CrossrefImporter(EntityImporter):
except (TypeError, ValueError):
year = None
ref_extra: Dict[str, Any] = dict()
- key = rm.get('key')
- if key and key.startswith(obj['DOI'].upper()):
- key = key.replace(obj['DOI'].upper() + "-", '')
- key = key.replace(obj['DOI'].upper(), '')
- ref_container_name = rm.get('volume-title')
+ key = rm.get("key")
+ if key and key.startswith(obj["DOI"].upper()):
+ key = key.replace(obj["DOI"].upper() + "-", "")
+ key = key.replace(obj["DOI"].upper(), "")
+ ref_container_name = rm.get("volume-title")
if not ref_container_name:
- ref_container_name = rm.get('journal-title')
- elif rm.get('journal-title'):
- ref_extra['journal-title'] = rm['journal-title']
- if rm.get('DOI'):
- ref_extra['doi'] = rm.get('DOI').lower()
- author = clean(rm.get('author'))
+ ref_container_name = rm.get("journal-title")
+ elif rm.get("journal-title"):
+ ref_extra["journal-title"] = rm["journal-title"]
+ if rm.get("DOI"):
+ ref_extra["doi"] = rm.get("DOI").lower()
+ author = clean(rm.get("author"))
if author:
- ref_extra['authors'] = [author]
- for k in ('editor', 'edition', 'authority', 'version', 'genre',
- 'url', 'event', 'issue', 'volume', 'date', 'accessed_date',
- 'issued', 'page', 'medium', 'collection_title', 'chapter_number',
- 'unstructured', 'series-title', 'volume-title'):
+ ref_extra["authors"] = [author]
+ for k in (
+ "editor",
+ "edition",
+ "authority",
+ "version",
+ "genre",
+ "url",
+ "event",
+ "issue",
+ "volume",
+ "date",
+ "accessed_date",
+ "issued",
+ "page",
+ "medium",
+ "collection_title",
+ "chapter_number",
+ "unstructured",
+ "series-title",
+ "volume-title",
+ ):
if clean(rm.get(k)):
ref_extra[k] = clean(rm[k])
if not ref_extra:
ref_extra = None
- refs.append(fatcat_openapi_client.ReleaseRef(
- index=i,
- # doing lookups would be a second import pass
- target_release_id=None,
- key=key,
- year=year,
- container_name=clean(ref_container_name),
- title=clean(rm.get('article-title')),
- locator=clean(rm.get('first-page')),
- # TODO: just dump JSON somewhere here?
- extra=ref_extra))
+ refs.append(
+ fatcat_openapi_client.ReleaseRef(
+ index=i,
+ # doing lookups would be a second import pass
+ target_release_id=None,
+ key=key,
+ year=year,
+ container_name=clean(ref_container_name),
+ title=clean(rm.get("article-title")),
+ locator=clean(rm.get("first-page")),
+ # TODO: just dump JSON somewhere here?
+ extra=ref_extra,
+ )
+ )
# abstracts
abstracts = []
- abstract = clean(obj.get('abstract'))
+ abstract = clean(obj.get("abstract"))
if abstract and len(abstract) > 10:
- abstracts.append(fatcat_openapi_client.ReleaseAbstract(
- mimetype="application/xml+jats",
- content=abstract))
+ abstracts.append(
+ fatcat_openapi_client.ReleaseAbstract(
+ mimetype="application/xml+jats", content=abstract
+ )
+ )
# extra fields
extra = dict()
extra_crossref = dict()
# top-level extra keys
if not container_id:
- if obj.get('container-title'):
- extra['container_name'] = container_name
- for key in ('group-title'):
+ if obj.get("container-title"):
+ extra["container_name"] = container_name
+ for key in "group-title":
val = obj.get(key)
if val:
if type(val) == list:
@@ -368,7 +439,7 @@ class CrossrefImporter(EntityImporter):
else:
extra[key] = val
# crossref-nested extra keys
- for key in ('subject', 'type', 'alternative-id', 'archive', 'funder'):
+ for key in ("subject", "type", "alternative-id", "archive", "funder"):
val = obj.get(key)
if val:
if type(val) == str:
@@ -376,46 +447,51 @@ class CrossrefImporter(EntityImporter):
else:
extra_crossref[key] = val
if license_extra:
- extra_crossref['license'] = license_extra
+ extra_crossref["license"] = license_extra
- if len(obj['title']) > 1:
- aliases = [clean(t) for t in obj['title'][1:]]
+ if len(obj["title"]) > 1:
+ aliases = [clean(t) for t in obj["title"][1:]]
aliases = [t for t in aliases if t]
if aliases:
- extra['aliases'] = aliases
+ extra["aliases"] = aliases
# ISBN
isbn13 = None
- for raw in obj.get('ISBN', []):
+ for raw in obj.get("ISBN", []):
# TODO: convert if not ISBN-13 format
if len(raw) == 17:
isbn13 = raw
break
# release status
- if obj['type'] in ('journal-article', 'conference-proceeding', 'book',
- 'dissertation', 'book-chapter'):
+ if obj["type"] in (
+ "journal-article",
+ "conference-proceeding",
+ "book",
+ "dissertation",
+ "book-chapter",
+ ):
release_stage = "published"
else:
# unknown
release_stage = None
# external identifiers
- extids: Dict[str, Any] = self.lookup_ext_ids(doi=obj['DOI'].lower())
+ extids: Dict[str, Any] = self.lookup_ext_ids(doi=obj["DOI"].lower())
# filter out unreasonably huge releases
if len(abstracts) > 100:
- self.counts['skip-huge-abstracts'] += 1
+ self.counts["skip-huge-abstracts"] += 1
return None
if len(contribs) > 2000:
- self.counts['skip-huge-contribs'] += 1
+ self.counts["skip-huge-contribs"] += 1
return None
if len(refs) > 5000:
- self.counts['skip-huge-refs'] += 1
+ self.counts["skip-huge-refs"] += 1
return None
# release date parsing is amazingly complex
- raw_date = obj['issued']['date-parts'][0]
+ raw_date = obj["issued"]["date-parts"][0]
if not raw_date or not raw_date[0]:
# got some NoneType, even though at least year is supposed to be set
release_year = None
@@ -429,28 +505,28 @@ class CrossrefImporter(EntityImporter):
release_date = None
original_title: Optional[str] = None
- if obj.get('original-title'):
- ot = obj.get('original-title')
+ if obj.get("original-title"):
+ ot = obj.get("original-title")
if ot is not None:
original_title = clean(ot[0], force_xml=True)
title: Optional[str] = None
- if obj.get('title'):
- title = clean(obj.get('title')[0], force_xml=True)
+ if obj.get("title"):
+ title = clean(obj.get("title")[0], force_xml=True)
if not title or len(title) <= 1:
# title can't be just a single character
- self.counts['skip-blank-title'] += 1
+ self.counts["skip-blank-title"] += 1
return None
subtitle = None
- if obj.get('subtitle'):
- subtitle = clean(obj.get('subtitle')[0], force_xml=True)
+ if obj.get("subtitle"):
+ subtitle = clean(obj.get("subtitle")[0], force_xml=True)
if not subtitle or len(subtitle) <= 1:
# subtitle can't be just a single character
subtitle = None
if extra_crossref:
- extra['crossref'] = extra_crossref
+ extra["crossref"] = extra_crossref
if not extra:
extra = None
@@ -466,19 +542,19 @@ class CrossrefImporter(EntityImporter):
release_year=release_year,
publisher=publisher,
ext_ids=fatcat_openapi_client.ReleaseExtIds(
- doi=obj['DOI'].lower(),
- pmid=extids['pmid'],
- pmcid=extids['pmcid'],
- wikidata_qid=extids['wikidata_qid'],
+ doi=obj["DOI"].lower(),
+ pmid=extids["pmid"],
+ pmcid=extids["pmcid"],
+ wikidata_qid=extids["wikidata_qid"],
isbn13=isbn13,
- core=extids['core_id'],
- arxiv=extids['arxiv_id'],
- jstor=extids['jstor_id'],
+ core=extids["core_id"],
+ arxiv=extids["arxiv_id"],
+ jstor=extids["jstor_id"],
),
- volume=clean(obj.get('volume')),
- issue=clean(obj.get('issue')),
- pages=clean(obj.get('page')),
- language=clean(obj.get('language')),
+ volume=clean(obj.get("volume")),
+ issue=clean(obj.get("issue")),
+ pages=clean(obj.get("page")),
+ language=clean(obj.get("language")),
license_slug=license_slug,
extra=extra,
abstracts=abstracts,
@@ -500,14 +576,17 @@ class CrossrefImporter(EntityImporter):
# eventually we'll want to support "updates", but for now just skip if
# entity already exists
if existing:
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
return True
def insert_batch(self, batch):
- self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_release_auto_batch(
+ fatcat_openapi_client.ReleaseAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index a06c68a4..4c174b0b 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -30,126 +30,130 @@ MAX_ABSTRACT_LENGTH = 2048
# https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary
CONTAINER_TYPE_MAP = {
- 'Journal': 'journal',
- 'Series': 'journal',
- 'Book Series': 'book-series',
+ "Journal": "journal",
+ "Series": "journal",
+ "Book Series": "book-series",
}
# The docs/guide should be the canonical home for these mappings; update there
# first. Map various datacite type types to CSL-ish types. None means TODO or
# remove.
DATACITE_TYPE_MAP = {
- 'ris': {
- 'THES': 'thesis',
- 'SOUND': 'song', # 99.9% maps to citeproc song, so use that (exception: report)
- 'CHAP': 'chapter',
- 'FIGURE': 'figure',
- 'RPRT': 'report',
- 'JOUR': 'article-journal',
- 'MPCT': 'motion_picture',
- 'GEN': 'article-journal', # GEN consist of 99% article and report, post-weblog, misc - and one dataset
- 'BOOK': 'book',
- 'DATA': 'dataset',
- 'COMP': 'software',
+ "ris": {
+ "THES": "thesis",
+ "SOUND": "song", # 99.9% maps to citeproc song, so use that (exception: report)
+ "CHAP": "chapter",
+ "FIGURE": "figure",
+ "RPRT": "report",
+ "JOUR": "article-journal",
+ "MPCT": "motion_picture",
+ "GEN": "article-journal", # GEN consist of 99% article and report, post-weblog, misc - and one dataset
+ "BOOK": "book",
+ "DATA": "dataset",
+ "COMP": "software",
},
- 'schemaOrg': {
- 'Dataset': 'dataset',
- 'Book': 'book',
- 'ScholarlyArticle': 'article-journal',
- 'ImageObject': 'graphic',
- 'Collection': None,
- 'MediaObject': None,
- 'Event': None,
- 'SoftwareSourceCode': 'software',
- 'Chapter': 'chapter',
- 'CreativeWork': None, # Seems to be a catch-all resourceType, from PGRFA Material, Pamphlet, to music score.
- 'PublicationIssue': 'article',
- 'AudioObject': None,
- 'Thesis': 'thesis',
+ "schemaOrg": {
+ "Dataset": "dataset",
+ "Book": "book",
+ "ScholarlyArticle": "article-journal",
+ "ImageObject": "graphic",
+ "Collection": None,
+ "MediaObject": None,
+ "Event": None,
+ "SoftwareSourceCode": "software",
+ "Chapter": "chapter",
+ "CreativeWork": None, # Seems to be a catch-all resourceType, from PGRFA Material, Pamphlet, to music score.
+ "PublicationIssue": "article",
+ "AudioObject": None,
+ "Thesis": "thesis",
},
- 'citeproc': {
- 'article': 'article',
- 'article-journal': 'article-journal',
- 'article-magazine': 'article-magazine',
- 'article-newspaper': 'article-newspaper',
- 'bill': 'bill',
- 'book': 'book',
- 'broadcast': 'broadcast',
- 'chapter': 'chapter',
- 'dataset': 'dataset',
- 'entry-dictionary': 'entry-dictionary',
- 'entry-encyclopedia': 'entry-encyclopedia',
- 'entry': 'entry',
- 'figure': 'figure',
- 'graphic': 'graphic',
- 'interview': 'interview',
- 'legal_case': 'legal_case',
- 'legislation': 'legislation',
- 'manuscript': 'manuscript',
- 'map': 'map',
- 'motion_picture': 'motion_picture',
- 'musical_score': 'musical_score',
- 'pamphlet': 'pamphlet',
- 'paper-conference': 'paper-conference',
- 'patent': 'patent',
- 'personal_communication': 'personal_communication',
- 'post': 'post',
- 'post-weblog': 'post-weblog',
- 'report': 'report',
- 'review-book': 'review-book',
- 'review': 'review',
- 'song': 'song',
- 'speech': 'speech',
- 'thesis': 'thesis',
- 'treaty': 'treaty',
- 'webpage': 'webpage',
+ "citeproc": {
+ "article": "article",
+ "article-journal": "article-journal",
+ "article-magazine": "article-magazine",
+ "article-newspaper": "article-newspaper",
+ "bill": "bill",
+ "book": "book",
+ "broadcast": "broadcast",
+ "chapter": "chapter",
+ "dataset": "dataset",
+ "entry-dictionary": "entry-dictionary",
+ "entry-encyclopedia": "entry-encyclopedia",
+ "entry": "entry",
+ "figure": "figure",
+ "graphic": "graphic",
+ "interview": "interview",
+ "legal_case": "legal_case",
+ "legislation": "legislation",
+ "manuscript": "manuscript",
+ "map": "map",
+ "motion_picture": "motion_picture",
+ "musical_score": "musical_score",
+ "pamphlet": "pamphlet",
+ "paper-conference": "paper-conference",
+ "patent": "patent",
+ "personal_communication": "personal_communication",
+ "post": "post",
+ "post-weblog": "post-weblog",
+ "report": "report",
+ "review-book": "review-book",
+ "review": "review",
+ "song": "song",
+ "speech": "speech",
+ "thesis": "thesis",
+ "treaty": "treaty",
+ "webpage": "webpage",
}, # https://docs.citationstyles.org/en/master/specification.html#appendix-iii-types
- 'bibtex': {
- 'phdthesis': 'thesis',
- 'inbook': 'chapter',
- 'misc': None,
- 'article': 'article-journal',
- 'book': 'book',
+ "bibtex": {
+ "phdthesis": "thesis",
+ "inbook": "chapter",
+ "misc": None,
+ "article": "article-journal",
+ "book": "book",
},
- 'resourceTypeGeneral': {
- 'Image': 'graphic',
- 'Dataset': 'dataset',
- 'PhysicalObject': None,
- 'Collection': None,
- 'Text': None, # "Greyliterature, labnotes, accompanyingmaterials"
- 'Sound': None,
- 'InteractiveResource': None,
- 'Event': None,
- 'Software': 'software',
- 'Other': None,
- 'Workflow': None,
- 'Audiovisual': None,
- } # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32
+ "resourceTypeGeneral": {
+ "Image": "graphic",
+ "Dataset": "dataset",
+ "PhysicalObject": None,
+ "Collection": None,
+ "Text": None, # "Greyliterature, labnotes, accompanyingmaterials"
+ "Sound": None,
+ "InteractiveResource": None,
+ "Event": None,
+ "Software": "software",
+ "Other": None,
+ "Workflow": None,
+ "Audiovisual": None,
+ }, # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32
}
# DATACITE_UNKNOWN_MARKERS via https://support.datacite.org/docs/schema-values-unknown-information-v43.
DATACITE_UNKNOWN_MARKERS = (
- '(:unac)', # temporarily inaccessible
- '(:unal)', # unallowed, suppressed intentionally
- '(:unap)', # not applicable, makes no sense
- '(:unas)', # value unassigned (e.g., Untitled)
- '(:unav)', # value unavailable, possibly unknown
- '(:unkn)', # known to be unknown (e.g., Anonymous, Inconnue)
- '(:none)', # never had a value, never will
- '(:null)', # explicitly and meaningfully empty
- '(:tba)', # to be assigned or announced later
- '(:etal)', # too numerous to list (et alia)
+ "(:unac)", # temporarily inaccessible
+ "(:unal)", # unallowed, suppressed intentionally
+ "(:unap)", # not applicable, makes no sense
+ "(:unas)", # value unassigned (e.g., Untitled)
+ "(:unav)", # value unavailable, possibly unknown
+ "(:unkn)", # known to be unknown (e.g., Anonymous, Inconnue)
+ "(:none)", # never had a value, never will
+ "(:null)", # explicitly and meaningfully empty
+ "(:tba)", # to be assigned or announced later
+ "(:etal)", # too numerous to list (et alia)
)
# UNKNOWN_MARKERS joins official datacite markers with a generic tokens marking
# unknown values.
-UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union(set((
- 'NA',
- 'NN',
- 'n.a.',
- '[s.n.]',
- 'Unknown',
-)))
+UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union(
+ set(
+ (
+ "NA",
+ "NN",
+ "n.a.",
+ "[s.n.]",
+ "Unknown",
+ )
+ )
+)
# UNKNOWN_MARKERS_LOWER are lowercase version of UNKNOWN blocklist.
UNKNOWN_MARKERS_LOWER = set((v.lower() for v in UNKNOWN_MARKERS))
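
Side note on how these marker sets are consumed: field values are compared case-insensitively against UNKNOWN_MARKERS_LOWER before being accepted. A minimal sketch under that assumption (the helper name is illustrative; the importer inlines equivalent per-field checks):

    from fatcat_tools.importers.datacite import UNKNOWN_MARKERS_LOWER

    def is_unknown_marker(value: str) -> bool:
        # illustrative helper: treat placeholder strings as "no value"
        return value.strip().lower() in UNKNOWN_MARKERS_LOWER

    assert is_unknown_marker("(:unav)")
    assert is_unknown_marker("n.a.")
    assert not is_unknown_marker("Nature")
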
@@ -157,8 +161,20 @@ UNKNOWN_MARKERS_LOWER = set((v.lower() for v in UNKNOWN_MARKERS))
# Any "min" number of "tokens" will signal "spam", https://fatcat.wiki/release/rzcpjwukobd4pj36ipla22cnoi
DATACITE_TITLE_SPAM_WORDGROUPS = [
{
- "tokens": ('full', 'movies', 'movie', 'watch', 'streaming', 'online',
- 'free', 'hd', 'download', 'english', 'subtitle', 'bluray'),
+ "tokens": (
+ "full",
+ "movies",
+ "movie",
+ "watch",
+ "streaming",
+ "online",
+ "free",
+ "hd",
+ "download",
+ "english",
+ "subtitle",
+ "bluray",
+ ),
"min": 4,
}
]
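
For context, a wordgroup entry flags a title when at least "min" of its tokens appear in the lowercased title. A standalone sketch under that assumption (the function name and exact token matching are not taken from this diff):

    from fatcat_tools.importers.datacite import DATACITE_TITLE_SPAM_WORDGROUPS

    def looks_like_title_spam(title: str) -> bool:
        # count distinct token hits per wordgroup; flag when a group's "min" is reached
        words = set(title.lower().split())
        for group in DATACITE_TITLE_SPAM_WORDGROUPS:
            hits = sum(1 for token in group["tokens"] if token in words)
            if hits >= group["min"]:
                return True
        return False

    assert looks_like_title_spam("Watch Full Movie Online Free HD Download")
    assert not looks_like_title_spam("Occurrence records of Arctic diatoms")
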
@@ -205,28 +221,25 @@ class DataciteImporter(EntityImporter):
"""
Importer for datacite records.
"""
- def __init__(self,
- api,
- issn_map_file,
- debug=False,
- insert_log_file=None,
- **kwargs):
+
+ def __init__(self, api, issn_map_file, debug=False, insert_log_file=None, **kwargs):
eg_desc = kwargs.get(
- 'editgroup_description',
- "Automated import of Datacite DOI metadata, harvested from REST API"
+ "editgroup_description",
+ "Automated import of Datacite DOI metadata, harvested from REST API",
)
- eg_extra = kwargs.get('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent',
- 'fatcat_tools.DataciteImporter')
- super().__init__(api,
- issn_map_file=issn_map_file,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
-
- self.create_containers = kwargs.get('create_containers', True)
- extid_map_file = kwargs.get('extid_map_file')
+ eg_extra = kwargs.get("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.DataciteImporter")
+ super().__init__(
+ api,
+ issn_map_file=issn_map_file,
+ editgroup_description=eg_desc,
+ editgroup_extra=eg_extra,
+ **kwargs
+ )
+
+ self.create_containers = kwargs.get("create_containers", True)
+ extid_map_file = kwargs.get("extid_map_file")
self.extid_map_db = None
if extid_map_file:
db_uri = "file:{}?mode=ro".format(extid_map_file)
@@ -240,30 +253,34 @@ class DataciteImporter(EntityImporter):
self.insert_log_file = insert_log_file
self.this_year = datetime.datetime.now().year
- print('datacite with debug={}'.format(self.debug), file=sys.stderr)
+ print("datacite with debug={}".format(self.debug), file=sys.stderr)
def lookup_ext_ids(self, doi):
"""
Return dictionary of identifiers referring to the same things as the given DOI.
"""
if self.extid_map_db is None:
- return dict(core_id=None,
- pmid=None,
- pmcid=None,
- wikidata_qid=None,
- arxiv_id=None,
- jstor_id=None)
+ return dict(
+ core_id=None,
+ pmid=None,
+ pmcid=None,
+ wikidata_qid=None,
+ arxiv_id=None,
+ jstor_id=None,
+ )
row = self.extid_map_db.execute(
- "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1",
- [doi.lower()]).fetchone()
+ "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()]
+ ).fetchone()
if row is None:
- return dict(core_id=None,
- pmid=None,
- pmcid=None,
- wikidata_qid=None,
- arxiv_id=None,
- jstor_id=None)
- row = [str(cell or '') or None for cell in row]
+ return dict(
+ core_id=None,
+ pmid=None,
+ pmcid=None,
+ wikidata_qid=None,
+ arxiv_id=None,
+ jstor_id=None,
+ )
+ row = [str(cell or "") or None for cell in row]
return dict(
core_id=row[0],
pmid=row[1],
@@ -280,22 +297,22 @@ class DataciteImporter(EntityImporter):
"""
if not obj or not isinstance(obj, dict):
return None
- if 'attributes' not in obj:
+ if "attributes" not in obj:
return None
- attributes = obj['attributes']
- doi = clean_doi(attributes.get('doi', '').lower())
+ attributes = obj["attributes"]
+ doi = clean_doi(attributes.get("doi", "").lower())
if not doi:
- print('skipping record without a DOI', file=sys.stderr)
+ print("skipping record without a DOI", file=sys.stderr)
return
if not str.isascii(doi):
- print('[{}] skipping non-ascii doi for now'.format(doi))
+ print("[{}] skipping non-ascii doi for now".format(doi))
return None
- creators = attributes.get('creators', []) or []
- contributors = attributes.get('contributors', []) or [] # Much fewer than creators.
+ creators = attributes.get("creators", []) or []
+ contributors = attributes.get("contributors", []) or [] # Much fewer than creators.
contribs = self.parse_datacite_creators(creators, doi=doi)
@@ -323,7 +340,9 @@ class DataciteImporter(EntityImporter):
# Related: https://guide.fatcat.wiki/entity_release.html -- role
# (string, of a set): the type of contribution, from a controlled
# vocabulary. TODO: vocabulary needs review.
- contribs_extra_contributors = self.parse_datacite_creators(contributors, set_index=False, doi=doi)
+ contribs_extra_contributors = self.parse_datacite_creators(
+ contributors, set_index=False, doi=doi
+ )
# Unfortunately, creators and contributors might overlap, refs GH59.
for cc in contribs_extra_contributors:
@@ -333,17 +352,16 @@ class DataciteImporter(EntityImporter):
# Title, may come with "attributes.titles[].titleType", like
# "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle"
- titles = attributes.get('titles', []) or []
- title, original_language_title, subtitle = parse_datacite_titles(
- titles)
+ titles = attributes.get("titles", []) or []
+ title, original_language_title, subtitle = parse_datacite_titles(titles)
if title is None:
- print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr)
+ print("[{}] skipping record w/o title: {}".format(doi, obj), file=sys.stderr)
return False
title = clean(title)
if not title:
- print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr)
+ print("[{}] skipping record w/o title: {}".format(doi, obj), file=sys.stderr)
return False
# check for blocklisted "spam", e.g. "FULL MOVIE"
@@ -367,10 +385,13 @@ class DataciteImporter(EntityImporter):
# "Collected", "Copyrighted", "Created", "Issued", "Submitted",
# "Updated", "Valid".
release_date, release_month, release_year = parse_datacite_dates(
- attributes.get('dates', []))
+ attributes.get("dates", [])
+ )
# block bogus far-future years/dates
- if release_year is not None and (release_year > (self.this_year + 5) or release_year < 1000):
+ if release_year is not None and (
+ release_year > (self.this_year + 5) or release_year < 1000
+ ):
release_date = None
release_month = None
release_year = None
@@ -378,26 +399,30 @@ class DataciteImporter(EntityImporter):
# Some records do not use the "dates" field (e.g. micropub), but:
# "attributes.published" or "attributes.publicationYear"
if not any((release_date, release_month, release_year)):
- release_date, release_month, release_year = parse_single_date(attributes.get('publicationYear'))
+ release_date, release_month, release_year = parse_single_date(
+ attributes.get("publicationYear")
+ )
if not any((release_date, release_month, release_year)):
- release_date, release_month, release_year = parse_single_date(attributes.get('published'))
+ release_date, release_month, release_year = parse_single_date(
+ attributes.get("published")
+ )
if not any((release_date, release_month, release_year)):
- print('[{}] record w/o date: {}'.format(doi, obj), file=sys.stderr)
+ print("[{}] record w/o date: {}".format(doi, obj), file=sys.stderr)
# Start with clear stages, e.g. published. TODO(martin): we could
# probably infer a bit more from the relations, e.g.
# "IsPreviousVersionOf" or "IsNewVersionOf".
- release_stage = 'published'
+ release_stage = "published"
# TODO(martin): If 'state' is not 'findable' or 'isActive' is not true,
# we might want something else than 'published'. See also:
# https://support.datacite.org/docs/doi-states.
# Publisher. A few NA values. A few bogus values.
- publisher = attributes.get('publisher')
+ publisher = attributes.get("publisher")
- if publisher in UNKNOWN_MARKERS | set(('Unpublished', 'Unknown')):
+ if publisher in UNKNOWN_MARKERS | set(("Unpublished", "Unknown")):
publisher = None
release_stage = None
if publisher is not None and len(publisher) > 80:
@@ -416,24 +441,26 @@ class DataciteImporter(EntityImporter):
container_id = None
container_name = None
- container = attributes.get('container', {}) or {}
- if container.get('type') in CONTAINER_TYPE_MAP.keys():
- container_type = CONTAINER_TYPE_MAP.get(container['type'])
- if container.get('identifier') and container.get(
- 'identifierType') == 'ISSN':
- issn = container.get('identifier')
+ container = attributes.get("container", {}) or {}
+ if container.get("type") in CONTAINER_TYPE_MAP.keys():
+ container_type = CONTAINER_TYPE_MAP.get(container["type"])
+ if container.get("identifier") and container.get("identifierType") == "ISSN":
+ issn = container.get("identifier")
if len(issn) == 8:
issn = issn[:4] + "-" + issn[4:]
issnl = self.issn2issnl(issn)
if issnl is not None:
container_id = self.lookup_issnl(issnl)
- if container_id is None and container.get('title'):
- container_name = container.get('title')
+ if container_id is None and container.get("title"):
+ container_name = container.get("title")
if isinstance(container_name, list):
if len(container_name) > 0:
- print('[{}] too many container titles: {}'.format(doi,
- len(container_name)))
+ print(
+ "[{}] too many container titles: {}".format(
+ doi, len(container_name)
+ )
+ )
container_name = container_name[0]
assert isinstance(container_name, str)
ce = fatcat_openapi_client.ContainerEntity(
@@ -447,21 +474,24 @@ class DataciteImporter(EntityImporter):
else:
# TODO(martin): factor this out into a testable function.
# TODO(martin): "container_name": "№1(1) (2018)" / 10.26087/inasan.2018.1.1.013
- container_name = container.get('title')
+ container_name = container.get("title")
if isinstance(container_name, list):
if len(container_name) > 0:
- print('[{}] too many container titles: {}'.format(doi,
- len(container_name)))
+ print(
+ "[{}] too many container titles: {}".format(
+ doi, len(container_name)
+ )
+ )
container_name = container_name[0]
# Exception: https://www.micropublication.org/, see: !MR24.
if container_id is None and container_name is None:
- if publisher and publisher.lower().startswith('micropublication'):
+ if publisher and publisher.lower().startswith("micropublication"):
container_name = publisher
# Volume and issue.
- volume = container.get('volume')
- issue = container.get('issue')
+ volume = container.get("volume")
+ issue = container.get("issue")
if volume:
volume = clean(volume)
@@ -472,13 +502,13 @@ class DataciteImporter(EntityImporter):
# Pages.
pages = None
- first_page = container.get('firstPage')
- last_page = container.get('lastPage')
+ first_page = container.get("firstPage")
+ last_page = container.get("lastPage")
if first_page and last_page:
try:
_ = int(first_page) < int(last_page)
- pages = '{}-{}'.format(first_page, last_page)
+ pages = "{}-{}".format(first_page, last_page)
except ValueError as err: # noqa: F841
# TODO(martin): This is more debug than info.
# print('[{}] {}'.format(doi, err), file=sys.stderr)
@@ -491,8 +521,8 @@ class DataciteImporter(EntityImporter):
license_slug = None
license_extra = []
- for lic in attributes.get('rightsList', []):
- slug = lookup_license_slug(lic.get('rightsUri'))
+ for lic in attributes.get("rightsList", []):
+ slug = lookup_license_slug(lic.get("rightsUri"))
if slug:
license_slug = slug
license_extra.append(lic)
@@ -506,7 +536,7 @@ class DataciteImporter(EntityImporter):
# library solves it for you." -- TODO(martin): We need more of these.
language = None
- value = attributes.get('language', '') or ''
+ value = attributes.get("language", "") or ""
try:
language = pycountry.languages.lookup(value).alpha_2
except (LookupError, AttributeError) as err: # noqa: F841
@@ -520,22 +550,22 @@ class DataciteImporter(EntityImporter):
# "Other" fields might contain references or related articles (with
# DOI). TODO(martin): maybe try to parse out some of those refs.
abstracts = []
- descs = attributes.get('descriptions', []) or []
+ descs = attributes.get("descriptions", []) or []
for desc in descs:
- if not desc.get('descriptionType') == 'Abstract':
+ if not desc.get("descriptionType") == "Abstract":
continue
# Description maybe a string, int or list.
- text = desc.get('description', '')
+ text = desc.get("description", "")
if not text:
continue
if isinstance(text, int):
- text = '{}'.format(text)
+ text = "{}".format(text)
if isinstance(text, list):
try:
text = "\n".join(text)
except TypeError:
- continue # Bail out, if it is not a list of strings.
+ continue # Bail out, if it is not a list of strings.
# Limit length.
if len(text) < 10:
@@ -548,7 +578,10 @@ class DataciteImporter(EntityImporter):
try:
lang = langdetect.detect(text)
except (langdetect.lang_detect_exception.LangDetectException, TypeError) as err:
- print('[{}] language detection failed with {} on {}'.format(doi, err, text), file=sys.stderr)
+ print(
+ "[{}] language detection failed with {} on {}".format(doi, err, text),
+ file=sys.stderr,
+ )
abstract_text = clean(text)
if not abstract_text:
continue
@@ -557,7 +590,8 @@ class DataciteImporter(EntityImporter):
mimetype="text/plain",
content=abstract_text,
lang=lang,
- ))
+ )
+ )
# References and relations. Datacite include many relation types in
# "attributes.relatedIdentifiers[].relationType", e.g.
@@ -570,67 +604,76 @@ class DataciteImporter(EntityImporter):
# For the moment, we only care about References.
refs, ref_index = [], 0
- relIds = attributes.get('relatedIdentifiers', []) or []
+ relIds = attributes.get("relatedIdentifiers", []) or []
for rel in relIds:
- if not rel.get('relationType', '') in ('References', 'Cites'):
+ if not rel.get("relationType", "") in ("References", "Cites"):
continue
ref_extra = dict()
- if rel.get('relatedIdentifierType', '') == 'DOI':
- ref_extra['doi'] = rel.get('relatedIdentifier')
+ if rel.get("relatedIdentifierType", "") == "DOI":
+ ref_extra["doi"] = rel.get("relatedIdentifier")
if not ref_extra:
ref_extra = None
refs.append(
fatcat_openapi_client.ReleaseRef(
index=ref_index,
extra=ref_extra,
- ))
+ )
+ )
ref_index += 1
         # More specific release_type via 'Reviews' relationship.
for rel in relIds:
- if rel.get('relatedIdentifierType', '') != 'Reviews':
+ if rel.get("relatedIdentifierType", "") != "Reviews":
continue
- release_type = 'review'
+ release_type = "review"
# Extra information.
extra_datacite = dict()
if license_extra:
- extra_datacite['license'] = license_extra
- if attributes.get('subjects'):
- extra_datacite['subjects'] = attributes['subjects']
+ extra_datacite["license"] = license_extra
+ if attributes.get("subjects"):
+ extra_datacite["subjects"] = attributes["subjects"]
# Include version information.
- metadata_version = attributes.get('metadataVersion') or ''
+ metadata_version = attributes.get("metadataVersion") or ""
if metadata_version:
- extra_datacite['metadataVersion'] = metadata_version
+ extra_datacite["metadataVersion"] = metadata_version
# Include resource types.
- types = attributes.get('types', {}) or {}
- resource_type = types.get('resourceType', '') or ''
- resource_type_general = types.get('resourceTypeGeneral', '') or ''
+ types = attributes.get("types", {}) or {}
+ resource_type = types.get("resourceType", "") or ""
+ resource_type_general = types.get("resourceTypeGeneral", "") or ""
if resource_type and resource_type.lower() not in UNKNOWN_MARKERS_LOWER:
- extra_datacite['resourceType'] = resource_type
+ extra_datacite["resourceType"] = resource_type
if resource_type_general and resource_type_general.lower() not in UNKNOWN_MARKERS_LOWER:
- extra_datacite['resourceTypeGeneral'] = resource_type_general
+ extra_datacite["resourceTypeGeneral"] = resource_type_general
# Include certain relations from relatedIdentifiers. Keeping the
# original structure of data here, which is a list of dicts, with
# relation type, identifier and identifier type (mostly).
relations = []
for rel in relIds:
- if rel.get('relationType') in ('IsPartOf', 'Reviews', 'Continues',
- 'IsVariantFormOf', 'IsSupplementTo',
- 'HasVersion', 'IsMetadataFor',
- 'IsNewVersionOf', 'IsIdenticalTo',
- 'IsVersionOf', 'IsDerivedFrom',
- 'IsSourceOf'):
+ if rel.get("relationType") in (
+ "IsPartOf",
+ "Reviews",
+ "Continues",
+ "IsVariantFormOf",
+ "IsSupplementTo",
+ "HasVersion",
+ "IsMetadataFor",
+ "IsNewVersionOf",
+ "IsIdenticalTo",
+ "IsVersionOf",
+ "IsDerivedFrom",
+ "IsSourceOf",
+ ):
relations.append(rel)
if relations:
- extra_datacite['relations'] = relations
+ extra_datacite["relations"] = relations
extra = dict()
@@ -640,18 +683,18 @@ class DataciteImporter(EntityImporter):
# Edition", "20191024", "v2.0.0", "v0.9.3", "10149", "2.0", null,
# "v0.1.1", "3.0", "1.0", "3", "v1.12.2", "20191018", "v0.3.1", "v1.0",
# "10161", "10010691", "10780", # "Presentación"
- version = attributes.get('version') or None
+ version = attributes.get("version") or None
# top-level extra keys
if not container_id and container_name:
- extra['container_name'] = container_name
+ extra["container_name"] = container_name
# Always include datacite key, even if value is empty (dict).
- extra['datacite'] = extra_datacite
+ extra["datacite"] = extra_datacite
# Preparation for a schema update.
if release_month:
- extra['release_month'] = release_month
+ extra["release_month"] = release_month
extids = self.lookup_ext_ids(doi=doi)
@@ -669,12 +712,12 @@ class DataciteImporter(EntityImporter):
publisher=publisher,
ext_ids=fatcat_openapi_client.ReleaseExtIds(
doi=doi,
- pmid=extids['pmid'],
- pmcid=extids['pmcid'],
- wikidata_qid=extids['wikidata_qid'],
- core=extids['core_id'],
- arxiv=extids['arxiv_id'],
- jstor=extids['jstor_id'],
+ pmid=extids["pmid"],
+ pmcid=extids["pmcid"],
+ wikidata_qid=extids["wikidata_qid"],
+ core=extids["core_id"],
+ arxiv=extids["arxiv_id"],
+ jstor=extids["jstor_id"],
),
contribs=contribs,
volume=volume,
@@ -702,19 +745,19 @@ class DataciteImporter(EntityImporter):
"""
release_type = None
- if not attributes.get('types'):
+ if not attributes.get("types"):
return None
- types = attributes['types']
+ types = attributes["types"]
- for typeType in ('citeproc', 'ris', 'schemaOrg', 'bibtex', 'resourceTypeGeneral'):
+ for typeType in ("citeproc", "ris", "schemaOrg", "bibtex", "resourceTypeGeneral"):
value = types.get(typeType)
release_type = DATACITE_TYPE_MAP.get(typeType, {}).get(value)
if release_type is not None:
break
# special case: figshare "collections" which group other entities
- if doi.startswith('10.6084/') or doi.startswith('10.25384'):
- if types.get('resourceType') == "Collection":
+ if doi.startswith("10.6084/") or doi.startswith("10.25384"):
+ if types.get("resourceType") == "Collection":
release_type = "stub"
if release_type is None:
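
Note between hunks: the lookup above walks the type vocabularies in a fixed priority order and takes the first mapped value. A module-level rendering of the same loop, for illustration (the wrapper name is an assumption):

    from fatcat_tools.importers.datacite import DATACITE_TYPE_MAP

    def release_type_from_types(types: dict):
        # same priority order as above: citeproc first, resourceTypeGeneral last
        for type_type in ("citeproc", "ris", "schemaOrg", "bibtex", "resourceTypeGeneral"):
            release_type = DATACITE_TYPE_MAP.get(type_type, {}).get(types.get(type_type))
            if release_type is not None:
                return release_type
        return None

    assert release_type_from_types({"ris": "THES", "resourceTypeGeneral": "Text"}) == "thesis"
    assert release_type_from_types({"resourceTypeGeneral": "Software"}) == "software"
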
@@ -736,35 +779,41 @@ class DataciteImporter(EntityImporter):
# publishes highly interesting datasets, but titles are mostly the same
# ("GBIF Occurrence Download" or "Occurrence Download"); set
# release_type to "stub" (CSL/FC).
- if re.title == 'GBIF Occurrence Download' and re.ext_ids.doi.startswith('10.15468/dl.'):
- re.release_type = 'stub'
+ if re.title == "GBIF Occurrence Download" and re.ext_ids.doi.startswith("10.15468/dl."):
+ re.release_type = "stub"
# release_type exception: lots of "Experimental Crystal Structure Determination"
# publisher: "Cambridge Crystallographic Data Centre"
- if re.ext_ids.doi.startswith('10.5517/'):
- re.release_type = 'entry'
+ if re.ext_ids.doi.startswith("10.5517/"):
+ re.release_type = "entry"
# Supplement files, e.g. "Additional file 1: ASE constructs in questionnaire."
- if re.title.lower().startswith('additional file') and re.release_type in ('article', 'article-journal'):
- re.release_type = 'component'
+ if re.title.lower().startswith("additional file") and re.release_type in (
+ "article",
+ "article-journal",
+ ):
+ re.release_type = "component"
# figshare
- if re.ext_ids.doi.startswith('10.6084/') or re.ext_ids.doi.startswith('10.25384'):
+ if re.ext_ids.doi.startswith("10.6084/") or re.ext_ids.doi.startswith("10.25384"):
# set version if DOI ends with versioned suffix
- doi_suffix = re.ext_ids.doi.split('.')[-1]
- if doi_suffix and doi_suffix.startswith('v') and doi_suffix[1:].isdigit():
+ doi_suffix = re.ext_ids.doi.split(".")[-1]
+ if doi_suffix and doi_suffix.startswith("v") and doi_suffix[1:].isdigit():
re.version = doi_suffix
# "Figure 123 from " -> component
# "Table S1. ;Figure S1;Figure S2. ;Figure S3. ;Figure S4. from Use of organic exudates from two polar diatoms by bacterial isolates from the Arctic ocean"
- if " from " in re.title and re.release_type not in ('stub', 'graphic'):
+ if " from " in re.title and re.release_type not in ("stub", "graphic"):
if re.title.startswith("Figure "):
re.release_type = "component"
elif re.title.startswith("Table "):
re.release_type = "component"
# figshare.com
- if re.ext_ids.doi.startswith('10.6084/m9.figshare.') and re.extra.get('container_name') is None:
- re.extra['container_name'] = "figshare.com"
+ if (
+ re.ext_ids.doi.startswith("10.6084/m9.figshare.")
+ and re.extra.get("container_name") is None
+ ):
+ re.extra["container_name"] = "figshare.com"
return re
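
The figshare-specific cleanup above derives a version from a trailing DOI suffix like ".v2". A small standalone sketch of that rule (helper name and example DOIs are illustrative):

    def figshare_version_from_doi(doi: str):
        # last dot-separated component must look like "v<digits>"
        suffix = doi.split(".")[-1]
        if suffix.startswith("v") and suffix[1:].isdigit():
            return suffix
        return None

    assert figshare_version_from_doi("10.6084/m9.figshare.1234567.v2") == "v2"
    assert figshare_version_from_doi("10.6084/m9.figshare.1234567") is None
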
@@ -788,26 +837,28 @@ class DataciteImporter(EntityImporter):
# eventually we'll want to support "updates", but for now just skip if
# entity already exists
if existing:
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
return True
def insert_batch(self, batch):
- print('inserting batch ({})'.format(len(batch)), file=sys.stderr)
+ print("inserting batch ({})".format(len(batch)), file=sys.stderr)
if self.insert_log_file:
- with open(self.insert_log_file, 'a') as f:
+ with open(self.insert_log_file, "a") as f:
for doc in batch:
json.dump(entity_to_dict(doc, api_client=None), f)
- f.write('\n')
+ f.write("\n")
self.api.create_release_auto_batch(
fatcat_openapi_client.ReleaseAutoBatch(
editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
- def parse_datacite_creators(self, creators, role='author', set_index=True, doi=None):
+ def parse_datacite_creators(self, creators, role="author", set_index=True, doi=None):
"""
Parses a list of creators into a list of ReleaseContrib objects. Set
set_index to False, if the index contrib field should be left blank.
@@ -820,48 +871,53 @@ class DataciteImporter(EntityImporter):
contribs = []
# Names, that should be ignored right away.
- name_blocklist = set(('Occdownload Gbif.Org',))
+ name_blocklist = set(("Occdownload Gbif.Org",))
i = 0
for c in creators:
if not set_index:
i = None
- nameType = c.get('nameType', '') or ''
- if nameType in ('', 'Personal'):
+ nameType = c.get("nameType", "") or ""
+ if nameType in ("", "Personal"):
creator_id = None
- for nid in c.get('nameIdentifiers', []) or []:
+ for nid in c.get("nameIdentifiers", []) or []:
if not isinstance(nid, dict):
# see: fatcat-workers/issues/44035/
- print('unexpected nameIdentifiers, expected list of dicts, got: {}'.format(nid), file=sys.stderr)
+ print(
+ "unexpected nameIdentifiers, expected list of dicts, got: {}".format(
+ nid
+ ),
+ file=sys.stderr,
+ )
continue
- name_scheme = nid.get('nameIdentifierScheme', '') or ''
+ name_scheme = nid.get("nameIdentifierScheme", "") or ""
if not name_scheme.lower() == "orcid":
continue
- orcid = nid.get('nameIdentifier') or ''
- orcid = orcid.replace('https://orcid.org/', '')
+ orcid = nid.get("nameIdentifier") or ""
+ orcid = orcid.replace("https://orcid.org/", "")
if not orcid:
continue
creator_id = self.lookup_orcid(orcid)
# TODO(martin): If creator_id is None, should we create creators?
# If there are multiple affiliation strings, use the first one.
- affiliations = c.get('affiliation', []) or []
+ affiliations = c.get("affiliation", []) or []
raw_affiliation = None
if len(affiliations) == 0:
raw_affiliation = None
else:
raw_affiliation = clean(affiliations[0])
- name = c.get('name')
- given_name = c.get('givenName')
- surname = c.get('familyName')
+ name = c.get("name")
+ given_name = c.get("givenName")
+ surname = c.get("familyName")
if name:
name = clean(name)
if not any((name, given_name, surname)):
continue
if not name:
- name = "{} {}".format(given_name or '', surname or '').strip()
+ name = "{} {}".format(given_name or "", surname or "").strip()
if name in name_blocklist:
continue
if name.lower() in UNKNOWN_MARKERS_LOWER:
@@ -881,7 +937,7 @@ class DataciteImporter(EntityImporter):
if not name:
continue
- if raw_affiliation == '':
+ if raw_affiliation == "":
continue
extra = None
@@ -891,39 +947,38 @@ class DataciteImporter(EntityImporter):
# "RelatedPerson", "ProjectLeader", "Editor", "Other",
# "ProjectMember", "Funder", "RightsHolder", "DataCollector",
# "Supervisor", "Producer", "HostingInstitution", "ResearchGroup"
- contributorType = c.get('contributorType', '') or ''
+ contributorType = c.get("contributorType", "") or ""
if contributorType:
- extra = {'type': contributorType}
+ extra = {"type": contributorType}
rc = fatcat_openapi_client.ReleaseContrib(
- creator_id=creator_id,
- index=i,
- raw_name=name,
- given_name=given_name,
- surname=surname,
- role=role,
- raw_affiliation=raw_affiliation,
- extra=extra,
- )
+ creator_id=creator_id,
+ index=i,
+ raw_name=name,
+ given_name=given_name,
+ surname=surname,
+ role=role,
+ raw_affiliation=raw_affiliation,
+ extra=extra,
+ )
# Filter out duplicates early.
if not contributor_list_contains_contributor(contribs, rc):
contribs.append(rc)
if i is not None:
i += 1
- elif nameType == 'Organizational':
- name = c.get('name', '') or ''
+ elif nameType == "Organizational":
+ name = c.get("name", "") or ""
if name in UNKNOWN_MARKERS:
continue
if len(name) < 3:
continue
- extra = {'organization': name}
- contribs.append(fatcat_openapi_client.ReleaseContrib(
- index=i, extra=extra))
+ extra = {"organization": name}
+ contribs.append(fatcat_openapi_client.ReleaseContrib(index=i, extra=extra))
if i is not None:
i += 1
else:
- print('[{}] unknown name type: {}'.format(doi, nameType), file=sys.stderr)
+ print("[{}] unknown name type: {}".format(doi, nameType), file=sys.stderr)
return contribs
@@ -935,8 +990,8 @@ def contributor_list_contains_contributor(contributor_list, contributor):
for cc in contributor_list:
if cc.raw_name != contributor.raw_name:
continue
- cc_role = cc.role or 'author'
- contributor_role = contributor.role or 'author'
+ cc_role = cc.role or "author"
+ contributor_role = contributor.role or "author"
if cc_role != contributor_role:
continue
return True
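
A usage sketch of the duplicate check above: contributors match when raw_name is equal and their roles (defaulting to "author") agree. Expected results are inferred from the code shown:

    from fatcat_openapi_client import ReleaseContrib
    from fatcat_tools.importers.datacite import contributor_list_contains_contributor

    existing = [ReleaseContrib(raw_name="Jane Doe", role="author")]
    candidate = ReleaseContrib(raw_name="Jane Doe")  # role is None, treated as "author"
    assert contributor_list_contains_contributor(existing, candidate)

    editor = ReleaseContrib(raw_name="Jane Doe", role="editor")
    assert not contributor_list_contains_contributor(existing, editor)
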
@@ -952,91 +1007,97 @@ def lookup_license_slug(raw):
if not raw:
return None
- if 'creativecommons.org/publicdomain/zero' in raw:
- return 'CC-0'
- if raw.lower().endswith('/cc0'):
- return 'CC-0'
+ if "creativecommons.org/publicdomain/zero" in raw:
+ return "CC-0"
+ if raw.lower().endswith("/cc0"):
+ return "CC-0"
- if 'creativecommons' in raw:
+ if "creativecommons" in raw:
# https://creativecommons.org/publicdomain/mark/1.0/deed.de
- if 'creativecommons.org/publicdomain' in raw:
- return 'CC-PUBLICDOMAIN'
- if 'creativecommons.org/share-your-work/public-domain/cc0' in raw:
- return 'CC-0'
+ if "creativecommons.org/publicdomain" in raw:
+ return "CC-PUBLICDOMAIN"
+ if "creativecommons.org/share-your-work/public-domain/cc0" in raw:
+ return "CC-0"
# https://creativecommons.org/licenses/by/4.0/deed.es_ES
raw = raw.lower()
- match = re.search(r'creativecommons.org/licen[sc]es/(?P<name>[a-z-]+)', raw, re.IGNORECASE)
+ match = re.search(
+ r"creativecommons.org/licen[sc]es/(?P<name>[a-z-]+)", raw, re.IGNORECASE
+ )
if not match:
- print('missed potential license: {}'.format(raw), file=sys.stderr)
+ print("missed potential license: {}".format(raw), file=sys.stderr)
return None
- name = match.groupdict().get('name')
+ name = match.groupdict().get("name")
if not name:
return None
- if not name.startswith('cc'):
- name = 'cc-{}'.format(name)
+ if not name.startswith("cc"):
+ name = "cc-{}".format(name)
return name.upper()
- if 'opensource.org' in raw:
+ if "opensource.org" in raw:
# https://opensource.org/licenses/alphabetical, e.g. opensource.org/licenses/EUPL-1.2
- match = re.search(r'opensource.org/licenses/(?P<name>[^/]+)', raw, re.IGNORECASE)
+ match = re.search(r"opensource.org/licenses/(?P<name>[^/]+)", raw, re.IGNORECASE)
if not match:
- print('missed potential license: {}'.format(raw), file=sys.stderr)
+ print("missed potential license: {}".format(raw), file=sys.stderr)
return None
- name = match.groupdict().get('name')
+ name = match.groupdict().get("name")
if not name:
return None
if len(name) > 11:
return None
return name.upper()
- if 'gnu.org' in raw:
+ if "gnu.org" in raw:
# http://www.gnu.org/copyleft/gpl, https://www.gnu.org/licenses/old-licenses/lgpl-2.1.en.html
- match = re.search(r'/(?P<name>fdl(-[0-9.]*[0-9]+)?|gpl(-[0-9.]*[0-9]+)?|lgpl(-[0-9.]*[0-9]+)|aglp(-[0-9.]*[0-9]+)?)', raw, re.IGNORECASE)
+ match = re.search(
+ r"/(?P<name>fdl(-[0-9.]*[0-9]+)?|gpl(-[0-9.]*[0-9]+)?|lgpl(-[0-9.]*[0-9]+)|aglp(-[0-9.]*[0-9]+)?)",
+ raw,
+ re.IGNORECASE,
+ )
if not match:
- print('missed potential license: {}'.format(raw), file=sys.stderr)
+ print("missed potential license: {}".format(raw), file=sys.stderr)
return None
- name = match.groupdict().get('name')
+ name = match.groupdict().get("name")
if not name:
return None
if len(name) > 8:
return None
return name.upper()
- if 'spdx.org' in raw:
- if 'spdx.org/licenses/CC0' in raw:
- return 'CC-0'
+ if "spdx.org" in raw:
+ if "spdx.org/licenses/CC0" in raw:
+ return "CC-0"
# https://spdx.org/licenses/CC-BY-NC-ND-4.0.html
- match = re.search(r'spdx.org/licenses/(?P<name>[a-z0-9-]+)', raw, re.IGNORECASE)
+ match = re.search(r"spdx.org/licenses/(?P<name>[a-z0-9-]+)", raw, re.IGNORECASE)
if not match:
- print('missed potential license: {}'.format(raw), file=sys.stderr)
+ print("missed potential license: {}".format(raw), file=sys.stderr)
return None
- name = match.groupdict().get('name')
+ name = match.groupdict().get("name")
if not name:
return None
if len(name) > 36:
return None
# cleanup version and extensions
- name = re.sub('(-[0-9])?[.]?[0-9]?(.json|.html)?', '', name.lower())
+ name = re.sub("(-[0-9])?[.]?[0-9]?(.json|.html)?", "", name.lower())
return name.upper()
- if 'rightsstatements.org' in raw:
+ if "rightsstatements.org" in raw:
# http://rightsstatements.org/vocab/InC/1.0/
- match = re.search(r'rightsstatements.org/(vocab|page)/(?P<name>[^/]*)', raw)
+ match = re.search(r"rightsstatements.org/(vocab|page)/(?P<name>[^/]*)", raw)
if not match:
- print('missed potential license: {}'.format(raw), file=sys.stderr)
+ print("missed potential license: {}".format(raw), file=sys.stderr)
return None
- name = match.groupdict().get('name')
+ name = match.groupdict().get("name")
if not name:
return None
if len(name) > 9:
return None
- return 'RS-{}'.format(name.upper())
+ return "RS-{}".format(name.upper())
# Fallback to mapped values.
raw = raw.lower()
- raw = raw.strip().replace('http://', '//').replace('https://', '//')
- if not raw.endswith('/'):
- raw = raw + '/'
+ raw = raw.strip().replace("http://", "//").replace("https://", "//")
+ if not raw.endswith("/"):
+ raw = raw + "/"
return LICENSE_SLUG_MAP.get(raw)
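
A few worked examples for lookup_license_slug(), with expected slugs inferred from the branches and regexes above (not independently verified against the mapping table):

    from fatcat_tools.importers.datacite import lookup_license_slug

    assert lookup_license_slug("https://creativecommons.org/licenses/by/4.0/deed.es_ES") == "CC-BY"
    assert lookup_license_slug("https://creativecommons.org/publicdomain/zero/1.0/") == "CC-0"
    assert lookup_license_slug("https://spdx.org/licenses/CC-BY-NC-ND-4.0.html") == "CC-BY-NC-ND"
    assert lookup_license_slug("http://rightsstatements.org/vocab/InC/1.0/") == "RS-INC"
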
@@ -1046,23 +1107,21 @@ def find_original_language_title(item, min_length=4, max_questionmarks=3):
Example input: {'title': 'Some title', 'original_language_title': 'Some title'}
"""
- if 'original_language_title' not in item:
+ if "original_language_title" not in item:
return None
- title = item.get('title')
+ title = item.get("title")
if not title:
return None
- original_language_title = item.get('original_language_title')
- if isinstance(original_language_title,
- str) and title != original_language_title:
+ original_language_title = item.get("original_language_title")
+ if isinstance(original_language_title, str) and title != original_language_title:
if len(original_language_title) < min_length:
return None
- if original_language_title.count('?') > max_questionmarks:
+ if original_language_title.count("?") > max_questionmarks:
return None
return original_language_title
if isinstance(original_language_title, dict):
- content = original_language_title.get('__content__', '') or ''
- if content and content != title and not content.count(
- '?') > max_questionmarks:
+ content = original_language_title.get("__content__", "") or ""
+ if content and content != title and not content.count("?") > max_questionmarks:
return content
return None
@@ -1082,23 +1141,23 @@ def parse_datacite_titles(titles):
return title, original_language_title, subtitle
elif len(titles) == 1:
original_language_title = find_original_language_title(titles[0])
- title = titles[0].get('title', '') or ''
+ title = titles[0].get("title", "") or ""
title = title.strip()
if not title:
title = None
return title, original_language_title, subtitle
else:
for entry in titles:
- if not title and ('titleType' not in entry
- or not entry.get('titleType')):
- title = (entry.get('title') or '').strip()
- if not subtitle and entry.get('titleType') == 'Subtitle':
- subtitle = entry.get('title', '').strip()
+ if not title and ("titleType" not in entry or not entry.get("titleType")):
+ title = (entry.get("title") or "").strip()
+ if not subtitle and entry.get("titleType") == "Subtitle":
+ subtitle = entry.get("title", "").strip()
if not original_language_title:
original_language_title = find_original_language_title(entry)
return title, original_language_title, subtitle
+
def parse_single_date(value):
"""
Given a single string containing a date in arbitrary format, try to return
@@ -1113,11 +1172,11 @@ def parse_single_date(value):
# Results in a dict with keys: date_obj, period, locale.
parse_result = parser.get_date_data(value)
# A datetime object, later we need a date, only.
- result = parse_result['date_obj']
+ result = parse_result["date_obj"]
if result is not None:
- if parse_result['period'] == 'year':
+ if parse_result["period"] == "year":
return None, None, result.year
- elif parse_result['period'] == 'month':
+ elif parse_result["period"] == "month":
return None, result.month, result.year
else:
return result.date(), result.month, result.year
@@ -1126,6 +1185,7 @@ def parse_single_date(value):
return None, None, None
+
def parse_datacite_dates(dates):
"""
Given a list of date fields (under .dates), return tuple, (release_date,
@@ -1137,37 +1197,37 @@ def parse_datacite_dates(dates):
return release_date, release_month, release_year
if not isinstance(dates, list):
- raise ValueError('expected a list of date items')
+ raise ValueError("expected a list of date items")
# Observed values: "Available", "Submitted", "Valid", "Issued", "Accepted",
# "Collected", "Updated", "Copyrighted", "Created"
# Ignored for now: "Collected", "Issued"
date_type_prio = (
- 'Valid',
- 'Available',
- 'Accepted',
- 'Submitted',
- 'Copyrighted',
- 'Created',
- 'Updated',
+ "Valid",
+ "Available",
+ "Accepted",
+ "Submitted",
+ "Copyrighted",
+ "Created",
+ "Updated",
)
# We need to note the granularity, since a string like "2019" would be
# parsed into "2019-01-01", even though the month is unknown. Use 3
# granularity types: 'y', 'm', 'd'.
- Pattern = collections.namedtuple('Pattern', 'layout granularity')
+ Pattern = collections.namedtuple("Pattern", "layout granularity")
# Before using (expensive) dateparser, try a few common patterns.
common_patterns = (
- Pattern('%Y-%m-%d', 'd'),
- Pattern('%Y-%m', 'm'),
- Pattern('%Y-%m-%dT%H:%M:%SZ', 'd'),
- Pattern('%Y-%m-%dT%H:%M:%S', 'd'),
- Pattern('%Y', 'y'),
+ Pattern("%Y-%m-%d", "d"),
+ Pattern("%Y-%m", "m"),
+ Pattern("%Y-%m-%dT%H:%M:%SZ", "d"),
+ Pattern("%Y-%m-%dT%H:%M:%S", "d"),
+ Pattern("%Y", "y"),
)
def parse_item(item):
- result, value, year_only = None, str(item.get('date', '')) or '', False
+ result, value, year_only = None, str(item.get("date", "")) or "", False
release_date, release_month, release_year = None, None, None
for layout, granularity in common_patterns:
@@ -1176,22 +1236,22 @@ def parse_datacite_dates(dates):
except ValueError:
continue
else:
- if granularity == 'y':
+ if granularity == "y":
year_only = True
break
if result is None:
- print('fallback for {}'.format(value), file=sys.stderr)
+ print("fallback for {}".format(value), file=sys.stderr)
release_date, release_month, release_year = parse_single_date(value)
if result is None:
# Unparsable date.
return release_date, release_month, release_year
- if granularity != 'y':
+ if granularity != "y":
release_date = result.date()
release_year = result.year
- if granularity in ('m', 'd'):
+ if granularity in ("m", "d"):
release_month = result.month
return release_date, release_month, release_year
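
The granularity bookkeeping above is the important part: a bare year must not be reported as a full date. A standalone sketch that mirrors the pattern table, omitting the dateparser fallback and the dateType priority handling:

    import collections
    import datetime

    Pattern = collections.namedtuple("Pattern", "layout granularity")
    COMMON_PATTERNS = (
        Pattern("%Y-%m-%d", "d"),
        Pattern("%Y-%m", "m"),
        Pattern("%Y-%m-%dT%H:%M:%SZ", "d"),
        Pattern("%Y-%m-%dT%H:%M:%S", "d"),
        Pattern("%Y", "y"),
    )

    def parse_with_granularity(value):
        # return (date, month, year), filling only what the matched granularity supports
        for layout, granularity in COMMON_PATTERNS:
            try:
                result = datetime.datetime.strptime(value, layout)
            except ValueError:
                continue
            if granularity == "y":
                return None, None, result.year
            if granularity == "m":
                return None, result.month, result.year
            return result.date(), result.month, result.year
        return None, None, None

    assert parse_with_granularity("2019") == (None, None, 2019)
    assert parse_with_granularity("2019-03") == (None, 3, 2019)
    assert parse_with_granularity("2019-03-01") == (datetime.date(2019, 3, 1), 3, 2019)
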
@@ -1200,7 +1260,7 @@ def parse_datacite_dates(dates):
for prio in date_type_prio:
for item in dates:
- if not item.get('dateType') == prio:
+ if not item.get("dateType") == prio:
continue
release_date, release_month, release_year = parse_item(item)
@@ -1224,45 +1284,49 @@ def parse_datacite_dates(dates):
return release_date, release_month, release_year
+
def index_form_to_display_name(s):
"""
Try to convert an index form name, like 'Razis, Panos A' into display_name,
e.g. 'Panos A Razis'.
"""
- if ',' not in s:
+ if "," not in s:
return s
- skip_on_chars = ['(', ')', '*']
+ skip_on_chars = ["(", ")", "*"]
for char in skip_on_chars:
if char in s:
return s
- if s.count(',') > 1:
+ if s.count(",") > 1:
# "Dr. Hina, Dr. Muhammad Usman Shahid, Dr. Muhammad Zeeshan Khan"
return s
# Not names, but sprinkled in fields where authors live.
- stopwords = [s.lower() for s in (
- 'Archive',
- 'Collection',
- 'Coordinator',
- 'Department',
- 'Germany',
- 'International',
- 'National',
- 'Netherlands',
- 'Office',
- 'Organisation',
- 'Organization',
- 'Service',
- 'Services',
- 'United States',
- 'University',
- 'Verein',
- 'Volkshochschule',
- )]
+ stopwords = [
+ s.lower()
+ for s in (
+ "Archive",
+ "Collection",
+ "Coordinator",
+ "Department",
+ "Germany",
+ "International",
+ "National",
+ "Netherlands",
+ "Office",
+ "Organisation",
+ "Organization",
+ "Service",
+ "Services",
+ "United States",
+ "University",
+ "Verein",
+ "Volkshochschule",
+ )
+ ]
lower = s.lower()
for stop in stopwords:
if stop in lower:
return s
- a, b = s.split(',')
- return '{} {}'.format(b.strip(), a.strip())
+ a, b = s.split(",")
+ return "{} {}".format(b.strip(), a.strip())
diff --git a/python/fatcat_tools/importers/dblp_container.py b/python/fatcat_tools/importers/dblp_container.py
index 3d280fb7..603a6271 100644
--- a/python/fatcat_tools/importers/dblp_container.py
+++ b/python/fatcat_tools/importers/dblp_container.py
@@ -1,4 +1,3 @@
-
"""
Importer for DBLP container-level (journal/conference/series) metadata,
pre-scraped in to JSON from HTML pages.
@@ -13,17 +12,17 @@ from fatcat_tools.normal import clean_str
class DblpContainerImporter(EntityImporter):
-    def __init__(self, api, issn_map_file, dblp_container_map_file, dblp_container_map_output, **kwargs):
-
-        eg_desc = kwargs.get('editgroup_description',
-            "Automated import of container-level metadata scraped from dblp HTML")
-        eg_extra = kwargs.get('editgroup_extra', dict())
-        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.DblpContainerImporter')
-        super().__init__(api,
-            editgroup_description=eg_desc,
-            editgroup_extra=eg_extra,
-            **kwargs)
+    def __init__(
+        self, api, issn_map_file, dblp_container_map_file, dblp_container_map_output, **kwargs
+    ):
+ eg_desc = kwargs.get(
+ "editgroup_description",
+ "Automated import of container-level metadata scraped from dblp HTML",
+ )
+ eg_extra = kwargs.get("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.DblpContainerImporter")
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
self.dblp_container_map_output = dblp_container_map_output
self.read_dblp_container_map_file(dblp_container_map_file)
@@ -40,7 +39,10 @@ class DblpContainerImporter(EntityImporter):
assert len(container_id) == 26
self._dblp_container_map[prefix] = container_id
print("\t".join([prefix, container_id]), file=self.dblp_container_map_output)
- print("Got {} existing dblp container mappings.".format(len(self._dblp_container_map)), file=sys.stderr)
+ print(
+ "Got {} existing dblp container mappings.".format(len(self._dblp_container_map)),
+ file=sys.stderr,
+ )
def lookup_dblp_prefix(self, prefix):
if not prefix:
@@ -57,48 +59,48 @@ class DblpContainerImporter(EntityImporter):
returns a ContainerEntity (or None if invalid or couldn't parse)
"""
- dblp_prefix = row.get('key') or row.get('dblp_prefix')
+ dblp_prefix = row.get("key") or row.get("dblp_prefix")
assert dblp_prefix
- assert row['title']
+ assert row["title"]
container_type = None
- if dblp_prefix.startswith('conf/'):
+ if dblp_prefix.startswith("conf/"):
container_type = "conference-series"
- elif dblp_prefix.startswith('journals/'):
+ elif dblp_prefix.startswith("journals/"):
container_type = "journal"
- elif dblp_prefix.startswith('series/'):
+ elif dblp_prefix.startswith("series/"):
container_type = "book-series"
issnl = None
- for issn in row.get('issns', []):
+ for issn in row.get("issns", []):
issnl = self.issn2issnl(issn)
if issnl:
break
extra = {
- 'dblp': {
- 'prefix': dblp_prefix,
+ "dblp": {
+ "prefix": dblp_prefix,
},
}
- if row.get('homepage_url'):
- extra['urls'] = [row['homepage_url']]
+ if row.get("homepage_url"):
+ extra["urls"] = [row["homepage_url"]]
- if row.get('acronym'):
- extra['acronym'] = row['acronym']
+ if row.get("acronym"):
+ extra["acronym"] = row["acronym"]
ce = fatcat_openapi_client.ContainerEntity(
- name=clean_str(row['title']),
+ name=clean_str(row["title"]),
container_type=container_type,
issnl=issnl,
- wikidata_qid=row.get('wikidata_qid'),
+ wikidata_qid=row.get("wikidata_qid"),
extra=extra,
)
return ce
def try_update(self, ce):
- dblp_prefix = ce.extra['dblp']['prefix']
+ dblp_prefix = ce.extra["dblp"]["prefix"]
existing = None
existing_container_id = self.lookup_dblp_prefix(dblp_prefix)
if existing_container_id:
@@ -123,8 +125,11 @@ class DblpContainerImporter(EntityImporter):
return True
if existing:
- self.counts['exists'] += 1
- print("\t".join([ce.extra['dblp']['prefix'], existing.ident]), file=self.dblp_container_map_output)
+ self.counts["exists"] += 1
+ print(
+ "\t".join([ce.extra["dblp"]["prefix"], existing.ident]),
+ file=self.dblp_container_map_output,
+ )
return False
# shouldn't get here
@@ -135,11 +140,17 @@ class DblpContainerImporter(EntityImporter):
Because we want to print a prefix/container_id match for each row, we
require a special batch insert method
"""
- eg = self.api.create_container_auto_batch(fatcat_openapi_client.ContainerAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ eg = self.api.create_container_auto_batch(
+ fatcat_openapi_client.ContainerAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
for c_edit in eg.edits.containers:
c = self.api.get_container(c_edit.ident)
- print("\t".join([c.extra['dblp']['prefix'], c.ident]), file=self.dblp_container_map_output)
+ print(
+ "\t".join([c.extra["dblp"]["prefix"], c.ident]),
+ file=self.dblp_container_map_output,
+ )
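
The prefix/ident pairs printed above form a simple tab-separated map: a dblp prefix, then a 26-character fatcat container ident, one pair per line. A minimal reader sketch under that assumption; the function name and example ident are hypothetical:

    def read_dblp_container_map(path: str) -> dict:
        mapping = {}
        with open(path) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                prefix, container_id = line.split("\t")
                assert len(container_id) == 26
                mapping[prefix] = container_id
        return mapping

    # e.g. a line "journals/cacm\taaaaaaaaaaaaaaaaaaaaaaaaaa" (hypothetical ident)
    # yields mapping["journals/cacm"] == "aaaaaaaaaaaaaaaaaaaaaaaaaa"
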
diff --git a/python/fatcat_tools/importers/dblp_release.py b/python/fatcat_tools/importers/dblp_release.py
index 6d028f2f..5baa6cd6 100644
--- a/python/fatcat_tools/importers/dblp_release.py
+++ b/python/fatcat_tools/importers/dblp_release.py
@@ -1,4 +1,3 @@
-
"""
Importer for DBLP release-level (article/paper/etc) XML metadata.
@@ -44,25 +43,16 @@ from fatcat_tools.transforms import entity_to_dict
class DblpReleaseImporter(EntityImporter):
-
- def __init__(self,
- api,
- dblp_container_map_file=None,
- **kwargs):
+ def __init__(self, api, dblp_container_map_file=None, **kwargs):
eg_desc = kwargs.get(
- 'editgroup_description',
- "Automated import of dblp metadata via XML records"
+ "editgroup_description", "Automated import of dblp metadata via XML records"
)
- eg_extra = kwargs.get('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent',
- 'fatcat_tools.DblpReleaseImporter')
+ eg_extra = kwargs.get("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.DblpReleaseImporter")
# ensure default is to not do updates with this worker (override super() default)
- kwargs['do_updates'] = kwargs.get("do_updates", False)
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ kwargs["do_updates"] = kwargs.get("do_updates", False)
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
self.dump_json_mode = kwargs.get("dump_json_mode", False)
self.this_year = datetime.datetime.now().year
@@ -76,13 +66,16 @@ class DblpReleaseImporter(EntityImporter):
"phdthesis",
"mastersthesis",
"www",
- #"data", # no instances in 2020-11 dump
+ # "data", # no instances in 2020-11 dump
]
def read_dblp_container_map_file(self, dblp_container_map_file) -> None:
self._dblp_container_map = dict()
if not dblp_container_map_file:
- print("Not loading a dblp prefix container map file; entities will fail to import", file=sys.stderr)
+ print(
+ "Not loading a dblp prefix container map file; entities will fail to import",
+ file=sys.stderr,
+ )
return
print("Loading dblp prefix container map file...", file=sys.stderr)
for line in dblp_container_map_file:
@@ -92,7 +85,10 @@ class DblpReleaseImporter(EntityImporter):
container_id = container_id.strip()
assert len(container_id) == 26
self._dblp_container_map[prefix] = container_id
- print("Got {} dblp container mappings.".format(len(self._dblp_container_map)), file=sys.stderr)
+ print(
+ "Got {} dblp container mappings.".format(len(self._dblp_container_map)),
+ file=sys.stderr,
+ )
def lookup_dblp_prefix(self, prefix):
if not prefix:
@@ -101,13 +97,13 @@ class DblpReleaseImporter(EntityImporter):
def want(self, xml_elem):
if xml_elem.name not in self.ELEMENT_TYPES:
- self.counts['skip-type'] += 1
+ self.counts["skip-type"] += 1
return False
- if not xml_elem.get('key'):
- self.counts['skip-no-key'] += 1
+ if not xml_elem.get("key"):
+ self.counts["skip-no-key"] += 1
return False
- if xml_elem['key'].startswith('homepage/'):
- self.counts['skip-type-homepage'] += 1
+ if xml_elem["key"].startswith("homepage/"):
+ self.counts["skip-type-homepage"] += 1
return False
return True
@@ -127,88 +123,88 @@ class DblpReleaseImporter(EntityImporter):
- isbn
"""
- dblp_key = xml_elem.get('key')
+ dblp_key = xml_elem.get("key")
if not dblp_key:
- self.counts['skip-empty-key'] += 1
+ self.counts["skip-empty-key"] += 1
return False
- dblp_key_type = dblp_key.split('/')[0]
+ dblp_key_type = dblp_key.split("/")[0]
# dblp_prefix may be used for container lookup
dblp_prefix = None
- if dblp_key_type in ('journals', 'conf'):
- dblp_prefix = '/'.join(dblp_key.split('/')[:2])
- elif dblp_key_type in ('series', 'reference', 'tr', 'books'):
- dblp_prefix = '/'.join(dblp_key.split('/')[:-1])
+ if dblp_key_type in ("journals", "conf"):
+ dblp_prefix = "/".join(dblp_key.split("/")[:2])
+ elif dblp_key_type in ("series", "reference", "tr", "books"):
+ dblp_prefix = "/".join(dblp_key.split("/")[:-1])
- publtype = xml_elem.get('publtype') or None
+ publtype = xml_elem.get("publtype") or None
dblp_type = xml_elem.name
if dblp_type not in self.ELEMENT_TYPES:
- self.counts[f'skip-dblp-type:{dblp_type}'] += 1
+ self.counts[f"skip-dblp-type:{dblp_type}"] += 1
- if dblp_key_type in ('homepages', 'persons', 'dblpnote'):
- self.counts['skip-key-type'] += 1
+ if dblp_key_type in ("homepages", "persons", "dblpnote"):
+ self.counts["skip-key-type"] += 1
return False
- if dblp_key.startswith('journals/corr/'):
- self.counts['skip-arxiv-corr'] += 1
+ if dblp_key.startswith("journals/corr/"):
+ self.counts["skip-arxiv-corr"] += 1
return False
title = clean_str(" ".join(xml_elem.title.stripped_strings), force_xml=True)
if not title:
- self.counts['skip-title'] += 1
+ self.counts["skip-title"] += 1
return False
- if title.endswith('.'):
+ if title.endswith("."):
title = title[:-1]
release_type = None
- release_stage = 'published'
+ release_stage = "published"
withdrawn_status = None
         # primary release_type detection: type of XML element, then prefix of key for granularity
- if dblp_type == 'article':
- release_type = 'article'
- if dblp_key_type == 'journals' and publtype != 'informal':
- release_type = 'article-journal'
- elif dblp_key_type == 'tr':
- release_type = 'report'
+ if dblp_type == "article":
+ release_type = "article"
+ if dblp_key_type == "journals" and publtype != "informal":
+ release_type = "article-journal"
+ elif dblp_key_type == "tr":
+ release_type = "report"
elif title.startswith("Review:"):
- release_type = 'review'
- elif dblp_type == 'inproceedings':
- release_type = 'paper-conference'
- elif dblp_type == 'book':
- release_type = 'book'
- elif dblp_type == 'incollection':
+ release_type = "review"
+ elif dblp_type == "inproceedings":
+ release_type = "paper-conference"
+ elif dblp_type == "book":
+ release_type = "book"
+ elif dblp_type == "incollection":
# XXX: part vs. chapter?
- release_type = 'chapter'
- elif dblp_type == 'data':
- release_type = 'dataset'
- elif dblp_type in ('mastersthesis', 'phdthesis'):
- release_type = 'thesis'
+ release_type = "chapter"
+ elif dblp_type == "data":
+ release_type = "dataset"
+ elif dblp_type in ("mastersthesis", "phdthesis"):
+ release_type = "thesis"
# overrides/extensions of the above
- if publtype == 'informal':
+ if publtype == "informal":
# for conferences, seems to indicate peer-review status
# for journals, seems to indicate things like book reviews; split out above
pass
- elif publtype == 'encyclopedia':
- release_type = 'entry-encyclopedia'
- elif publtype == 'edited':
+ elif publtype == "encyclopedia":
+ release_type = "entry-encyclopedia"
+ elif publtype == "edited":
# XXX: article?
- release_type = 'editorial'
- elif publtype == 'data':
- release_type = 'dataset'
- elif publtype == 'data':
- release_type = 'dataset'
- elif publtype == 'software':
- release_type = 'software'
- elif publtype == 'widthdrawn':
- withdrawn_status = 'widthdrawn'
- elif publtype == 'survey':
+ release_type = "editorial"
+ elif publtype == "data":
+ release_type = "dataset"
+ elif publtype == "data":
+ release_type = "dataset"
+ elif publtype == "software":
+ release_type = "software"
+ elif publtype == "widthdrawn":
+ withdrawn_status = "widthdrawn"
+ elif publtype == "survey":
# XXX: flag as a review/survey article?
pass
- #print((release_type, dblp_type, dblp_key_type, publtype), file=sys.stderr)
+ # print((release_type, dblp_type, dblp_key_type, publtype), file=sys.stderr)
container_name = None
booktitle = clean_str(xml_elem.booktitle and xml_elem.booktitle.text)
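
Note between hunks: the key-to-prefix rule above keeps two path components for journals/conf keys and drops only the final component for series/reference/tr/books keys. A standalone sketch (function name and example keys are illustrative):

    def dblp_prefix_from_key(key: str):
        key_type = key.split("/")[0]
        if key_type in ("journals", "conf"):
            return "/".join(key.split("/")[:2])
        if key_type in ("series", "reference", "tr", "books"):
            return "/".join(key.split("/")[:-1])
        return None

    assert dblp_prefix_from_key("journals/cacm/SomePaper74") == "journals/cacm"
    assert dblp_prefix_from_key("conf/sigmod/ExamplePaper09") == "conf/sigmod"
    assert dblp_prefix_from_key("series/lncs/SomeVolume") == "series/lncs"
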
@@ -236,7 +232,9 @@ class DblpReleaseImporter(EntityImporter):
part_of_key = clean_str(xml_elem.crossref and xml_elem.crossref.text)
# block bogus far-future years/dates
- if release_year is not None and (release_year > (self.this_year + 5) or release_year < 1000):
+ if release_year is not None and (
+ release_year > (self.this_year + 5) or release_year < 1000
+ ):
release_month = None
release_year = None
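
Both importers apply the same sanity window to parsed years: reject anything before 1000 or more than five years in the future. A tiny sketch, with an illustrative helper name:

    import datetime

    def plausible_release_year(year, max_future_years=5):
        this_year = datetime.datetime.now().year
        return year is not None and 1000 <= year <= this_year + max_future_years

    assert plausible_release_year(2019)
    assert not plausible_release_year(9999)
    assert not plausible_release_year(999)
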
@@ -245,39 +243,39 @@ class DblpReleaseImporter(EntityImporter):
if isbn:
ext_ids.isbn13 = isbn
if ext_ids.doi:
- self.counts['has-doi'] += 1
+ self.counts["has-doi"] += 1
# dblp-specific extra
dblp_extra = dict(type=dblp_type)
note = clean_str(xml_elem.note and xml_elem.note.text)
- if note and 'base-search.net' not in note:
- dblp_extra['note'] = note
+ if note and "base-search.net" not in note:
+ dblp_extra["note"] = note
if part_of_key:
- dblp_extra['part_of_key'] = part_of_key
+ dblp_extra["part_of_key"] = part_of_key
# generic extra
extra = dict()
if not container_id and container_name:
- extra['container_name'] = container_name
+ extra["container_name"] = container_name
- if series and (dblp_key_type == 'series' or dblp_type == 'book'):
- extra['series-title'] = series
+ if series and (dblp_key_type == "series" or dblp_type == "book"):
+ extra["series-title"] = series
elif series:
- dblp_extra['series'] = series
+ dblp_extra["series"] = series
- if booktitle and dblp_key_type == 'series':
- extra['container-title'] = booktitle
- elif booktitle and dblp_key_type == 'conf':
- extra['event'] = booktitle
+ if booktitle and dblp_key_type == "series":
+ extra["container-title"] = booktitle
+ elif booktitle and dblp_key_type == "conf":
+ extra["event"] = booktitle
elif booktitle:
- dblp_extra['booktitle'] = booktitle
+ dblp_extra["booktitle"] = booktitle
if release_year and release_month:
# TODO: release_month schema migration
- extra['release_month'] = release_month
+ extra["release_month"] = release_month
if dblp_extra:
- extra['dblp'] = dblp_extra
+ extra["dblp"] = dblp_extra
if not extra:
extra = None
@@ -289,7 +287,7 @@ class DblpReleaseImporter(EntityImporter):
withdrawn_status=withdrawn_status,
title=title,
release_year=release_year,
- #release_date,
+ # release_date,
publisher=publisher,
ext_ids=ext_ids,
contribs=contribs,
@@ -302,8 +300,8 @@ class DblpReleaseImporter(EntityImporter):
if self.dump_json_mode:
re_dict = entity_to_dict(re, api_client=self.api.api_client)
- re_dict['_dblp_ee_urls'] = self.dblp_ext_urls(xml_elem)
- re_dict['_dblp_prefix'] = dblp_prefix
+ re_dict["_dblp_ee_urls"] = self.dblp_ext_urls(xml_elem)
+ re_dict["_dblp_prefix"] = dblp_prefix
print(json.dumps(re_dict, sort_keys=True))
return False
@@ -341,11 +339,11 @@ class DblpReleaseImporter(EntityImporter):
# then try other ext_id lookups
if not existing:
- for extid_type in ('doi', 'wikidata_qid', 'isbn13', 'arxiv'):
+ for extid_type in ("doi", "wikidata_qid", "isbn13", "arxiv"):
extid_val = getattr(re.ext_ids, extid_type)
if not extid_val:
continue
- #print(f" lookup release type: {extid_type} val: {extid_val}")
+ # print(f" lookup release type: {extid_type} val: {extid_val}")
try:
existing = self.api.lookup_release(**{extid_type: extid_val})
except fatcat_openapi_client.rest.ApiException as err:
@@ -373,12 +371,14 @@ class DblpReleaseImporter(EntityImporter):
return True
if not self.do_updates or existing.ext_ids.dblp:
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
# logic for whether to do update or skip
- if (existing.container_id and existing.release_type and existing.release_stage) or existing.ext_ids.arxiv:
- self.counts['skip-update'] += 1
+ if (
+ existing.container_id and existing.release_type and existing.release_stage
+ ) or existing.ext_ids.arxiv:
+ self.counts["skip-update"] += 1
return False
# fields to copy over for update
@@ -390,20 +390,20 @@ class DblpReleaseImporter(EntityImporter):
existing.release_stage = existing.release_stage or re.release_stage
existing.withdrawn_status = existing.withdrawn_status or re.withdrawn_status
existing.container_id = existing.container_id or re.container_id
- existing.extra['dblp'] = re.extra['dblp']
+ existing.extra["dblp"] = re.extra["dblp"]
existing.volume = existing.volume or re.volume
existing.issue = existing.issue or re.issue
existing.pages = existing.pages or re.pages
try:
self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
- self.counts['update'] += 1
+ self.counts["update"] += 1
except fatcat_openapi_client.rest.ApiException as err:
# there is a code path where we try to update the same release
# twice in a row; if that happens, just skip
# NOTE: API behavior might change in the future?
if "release_edit_editgroup_id_ident_id_key" in err.body:
- self.counts['skip-update-conflict'] += 1
+ self.counts["skip-update-conflict"] += 1
return False
else:
raise err
@@ -411,11 +411,14 @@ class DblpReleaseImporter(EntityImporter):
return False
def insert_batch(self, batch):
- self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_release_auto_batch(
+ fatcat_openapi_client.ReleaseAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
def dblp_contribs(self, authors: List[dict]) -> List[fatcat_openapi_client.ReleaseContrib]:
"""
@@ -428,14 +431,14 @@ class DblpReleaseImporter(EntityImporter):
"""
contribs = []
index = 0
- for elem in authors.find_all('author'):
+ for elem in authors.find_all("author"):
contrib = self.dblp_contrib_single(elem)
contrib.role = "author"
contrib.index = index
contribs.append(contrib)
index += 1
- for elem in authors.find_all('editor'):
+ for elem in authors.find_all("editor"):
contrib = self.dblp_contrib_single(elem)
contrib.role = "editor"
contribs.append(contrib)
@@ -459,10 +462,10 @@ class DblpReleaseImporter(EntityImporter):
# remove number in author name, if present
if raw_name.split()[-1].isdigit():
- raw_name = ' '.join(raw_name.split()[:-1])
+ raw_name = " ".join(raw_name.split()[:-1])
- if elem.get('orcid'):
- orcid = clean_orcid(elem['orcid'])
+ if elem.get("orcid"):
+ orcid = clean_orcid(elem["orcid"])
if orcid:
creator_id = self.lookup_orcid(orcid)
if not creator_id:
@@ -491,22 +494,26 @@ class DblpReleaseImporter(EntityImporter):
wikidata_qid: Optional[str] = None
arxiv_id: Optional[str] = None
hdl: Optional[str] = None
- for ee in xml_elem.find_all('ee'):
+ for ee in xml_elem.find_all("ee"):
url = ee.text
# convert DOI-like domains, which mostly have DOIs anyways
- if '://doi.acm.org/' in url:
- url = url.replace('://doi.acm.org/', '://doi.org/')
- elif '://doi.ieeecomputersociety.org/' in url:
- url = url.replace('://doi.ieeecomputersociety.org/', '://doi.org/')
+ if "://doi.acm.org/" in url:
+ url = url.replace("://doi.acm.org/", "://doi.org/")
+ elif "://doi.ieeecomputersociety.org/" in url:
+ url = url.replace("://doi.ieeecomputersociety.org/", "://doi.org/")
- if 'doi.org/10.' in url and not doi:
+ if "doi.org/10." in url and not doi:
doi = clean_doi(url)
- elif 'wikidata.org/entity/Q' in url and not wikidata_qid:
+ elif "wikidata.org/entity/Q" in url and not wikidata_qid:
wikidata_qid = clean_wikidata_qid(url)
- elif '://arxiv.org/abs/' in url and not arxiv_id:
- arxiv_id = url.replace('http://', '').replace('https://', '').replace('arxiv.org/abs/', '')
+ elif "://arxiv.org/abs/" in url and not arxiv_id:
+ arxiv_id = (
+ url.replace("http://", "")
+ .replace("https://", "")
+ .replace("arxiv.org/abs/", "")
+ )
arxiv_id = clean_arxiv_id(arxiv_id)
- elif '://hdl.handle.net' in url and not hdl:
+ elif "://hdl.handle.net" in url and not hdl:
hdl = clean_hdl(url)
return fatcat_openapi_client.ReleaseExtIds(
@@ -525,14 +532,14 @@ class DblpReleaseImporter(EntityImporter):
sandcrawler ingest requests.
"""
EXTID_PATTERNS = [
- '://doi.acm.org/',
- '://doi.ieeecomputersociety.org/',
- 'doi.org/10.',
- 'wikidata.org/entity/Q',
- '://arxiv.org/abs/',
+ "://doi.acm.org/",
+ "://doi.ieeecomputersociety.org/",
+ "doi.org/10.",
+ "wikidata.org/entity/Q",
+ "://arxiv.org/abs/",
]
urls = []
- for ee in xml_elem.find_all('ee'):
+ for ee in xml_elem.find_all("ee"):
url = ee.text
skip = False
for pattern in EXTID_PATTERNS:
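The `ee` URL handling above (in dblp_ext_ids and the EXTID_PATTERNS filter) comes down to a small amount of string matching. The following standalone sketch restates that classification outside the importer class; the plain dict result and the naive split-based extraction stand in for fatcat's ReleaseExtIds and clean_* helpers, so read it as an illustration of the pattern rather than the importer's exact behavior.

    from typing import Dict, List, Optional

    def extract_ext_ids(ee_urls: List[str]) -> Dict[str, Optional[str]]:
        """Sketch of dblp 'ee' URL classification: DOI, Wikidata QID, arXiv id, handle."""
        ids: Dict[str, Optional[str]] = dict(doi=None, wikidata_qid=None, arxiv=None, hdl=None)
        for url in ee_urls:
            # normalize DOI-resolver domains to doi.org, as in dblp_ext_ids() above
            url = url.replace("://doi.acm.org/", "://doi.org/")
            url = url.replace("://doi.ieeecomputersociety.org/", "://doi.org/")
            if "doi.org/10." in url and not ids["doi"]:
                ids["doi"] = url.split("doi.org/")[-1].lower()
            elif "wikidata.org/entity/Q" in url and not ids["wikidata_qid"]:
                ids["wikidata_qid"] = url.split("/entity/")[-1]
            elif "://arxiv.org/abs/" in url and not ids["arxiv"]:
                ids["arxiv"] = url.split("arxiv.org/abs/")[-1]
            elif "://hdl.handle.net" in url and not ids["hdl"]:
                ids["hdl"] = url.split("hdl.handle.net/")[-1]
        return ids

    example = extract_ext_ids([
        "https://doi.acm.org/10.1145/1234567.1234568",
        "https://arxiv.org/abs/2101.00001",
    ])
    assert example["doi"] == "10.1145/1234567.1234568"
    assert example["arxiv"] == "2101.00001"
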
diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py
index 1831c4cd..cd063337 100644
--- a/python/fatcat_tools/importers/doaj_article.py
+++ b/python/fatcat_tools/importers/doaj_article.py
@@ -28,26 +28,23 @@ MAX_ABSTRACT_LENGTH = 2048
class DoajArticleImporter(EntityImporter):
-
- def __init__(self,
- api,
- issn_map_file,
- **kwargs):
+ def __init__(self, api, issn_map_file, **kwargs):
eg_desc = kwargs.get(
- 'editgroup_description',
- "Automated import of DOAJ article metadata, harvested from REST API or bulk dumps"
+ "editgroup_description",
+ "Automated import of DOAJ article metadata, harvested from REST API or bulk dumps",
)
- eg_extra = kwargs.get('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent',
- 'fatcat_tools.DoajArticleImporter')
+ eg_extra = kwargs.get("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.DoajArticleImporter")
# ensure default is to not do updates with this worker (override super() default)
- kwargs['do_updates'] = kwargs.get("do_updates", False)
- super().__init__(api,
- issn_map_file=issn_map_file,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ kwargs["do_updates"] = kwargs.get("do_updates", False)
+ super().__init__(
+ api,
+ issn_map_file=issn_map_file,
+ editgroup_description=eg_desc,
+ editgroup_extra=eg_extra,
+ **kwargs,
+ )
self.this_year = datetime.datetime.now().year
self.read_issn_map_file(issn_map_file)
@@ -82,21 +79,21 @@ class DoajArticleImporter(EntityImporter):
}
"""
- if not obj or not isinstance(obj, dict) or 'bibjson' not in obj:
- self.counts['skip-empty'] += 1
+ if not obj or not isinstance(obj, dict) or "bibjson" not in obj:
+ self.counts["skip-empty"] += 1
return None
- bibjson = obj['bibjson']
+ bibjson = obj["bibjson"]
- title = clean_str(bibjson.get('title'), force_xml=True)
+ title = clean_str(bibjson.get("title"), force_xml=True)
if not title:
- self.counts['skip-title'] += 1
+ self.counts["skip-title"] += 1
return False
- container_name = clean_str(bibjson['journal']['title'])
+ container_name = clean_str(bibjson["journal"]["title"])
container_id = None
# NOTE: 'issns' not documented in API schema
- for issn in bibjson['journal']['issns']:
+ for issn in bibjson["journal"]["issns"]:
issnl = self.issn2issnl(issn)
if issnl:
container_id = self.lookup_issnl(self.issn2issnl(issn))
@@ -105,75 +102,83 @@ class DoajArticleImporter(EntityImporter):
container_name = None
break
- volume = clean_str(bibjson['journal'].get('volume'))
+ volume = clean_str(bibjson["journal"].get("volume"))
# NOTE: this schema seems to use "number" as "issue number"
- issue = clean_str(bibjson['journal'].get('number'))
- publisher = clean_str(bibjson['journal'].get('publisher'))
+ issue = clean_str(bibjson["journal"].get("number"))
+ publisher = clean_str(bibjson["journal"].get("publisher"))
try:
- release_year = int(bibjson.get('year'))
+ release_year = int(bibjson.get("year"))
except (TypeError, ValueError):
release_year = None
- release_month = parse_month(clean_str(bibjson.get('month')))
+ release_month = parse_month(clean_str(bibjson.get("month")))
# block bogus far-future years/dates
- if release_year is not None and (release_year > (self.this_year + 5) or release_year < 1000):
+ if release_year is not None and (
+ release_year > (self.this_year + 5) or release_year < 1000
+ ):
release_month = None
release_year = None
- license_slug = self.doaj_license_slug(bibjson['journal'].get('license'))
- country = parse_country_name(bibjson['journal'].get('country'))
+ license_slug = self.doaj_license_slug(bibjson["journal"].get("license"))
+ country = parse_country_name(bibjson["journal"].get("country"))
language = None
- for raw in bibjson['journal'].get('language') or []:
+ for raw in bibjson["journal"].get("language") or []:
language = parse_lang_name(raw)
if language:
break
# pages
# NOTE: error in API docs? seems like start_page not under 'journal' object
- start_page = clean_str(bibjson['journal'].get('start_page')) or clean_str(bibjson.get('start_page'))
- end_page = clean_str(bibjson['journal'].get('end_page')) or clean_str(bibjson.get('end_page'))
+ start_page = clean_str(bibjson["journal"].get("start_page")) or clean_str(
+ bibjson.get("start_page")
+ )
+ end_page = clean_str(bibjson["journal"].get("end_page")) or clean_str(
+ bibjson.get("end_page")
+ )
pages: Optional[str] = None
if start_page and end_page:
pages = f"{start_page}-{end_page}"
elif start_page:
pages = start_page
- doaj_article_id = obj['id'].lower()
- ext_ids = self.doaj_ext_ids(bibjson['identifier'], doaj_article_id)
+ doaj_article_id = obj["id"].lower()
+ ext_ids = self.doaj_ext_ids(bibjson["identifier"], doaj_article_id)
abstracts = self.doaj_abstracts(bibjson)
- contribs = self.doaj_contribs(bibjson.get('author') or [])
+ contribs = self.doaj_contribs(bibjson.get("author") or [])
# DOAJ-specific extra
doaj_extra = dict()
- if bibjson.get('subject'):
- doaj_extra['subject'] = bibjson.get('subject')
- if bibjson.get('keywords'):
- doaj_extra['keywords'] = [k for k in [clean_str(s) for s in bibjson.get('keywords')] if k]
+ if bibjson.get("subject"):
+ doaj_extra["subject"] = bibjson.get("subject")
+ if bibjson.get("keywords"):
+ doaj_extra["keywords"] = [
+ k for k in [clean_str(s) for s in bibjson.get("keywords")] if k
+ ]
# generic extra
extra = dict()
if country:
- extra['country'] = country
+ extra["country"] = country
if not container_id and container_name:
- extra['container_name'] = container_name
+ extra["container_name"] = container_name
if release_year and release_month:
# TODO: schema migration
- extra['release_month'] = release_month
+ extra["release_month"] = release_month
if doaj_extra:
- extra['doaj'] = doaj_extra
+ extra["doaj"] = doaj_extra
if not extra:
extra = None
re = fatcat_openapi_client.ReleaseEntity(
work_id=None,
container_id=container_id,
- release_type='article-journal',
- release_stage='published',
+ release_type="article-journal",
+ release_stage="published",
title=title,
release_year=release_year,
- #release_date,
+ # release_date,
publisher=publisher,
ext_ids=ext_ids,
contribs=contribs,
@@ -208,11 +213,11 @@ class DoajArticleImporter(EntityImporter):
# then try other ext_id lookups
if not existing:
- for extid_type in ('doi', 'pmid', 'pmcid'):
+ for extid_type in ("doi", "pmid", "pmcid"):
extid_val = getattr(re.ext_ids, extid_type)
if not extid_val:
continue
- #print(f" lookup release type: {extid_type} val: {extid_val}")
+ # print(f" lookup release type: {extid_type} val: {extid_val}")
try:
existing = self.api.lookup_release(**{extid_type: extid_val})
except fatcat_openapi_client.rest.ApiException as err:
@@ -241,7 +246,7 @@ class DoajArticleImporter(EntityImporter):
# other logic could go here about skipping updates
if not self.do_updates or existing.ext_ids.doaj:
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
# fields to copy over for update
@@ -250,7 +255,7 @@ class DoajArticleImporter(EntityImporter):
existing.release_stage = existing.release_stage or re.release_stage
existing.container_id = existing.container_id or re.container_id
existing.abstracts = existing.abstracts or re.abstracts
- existing.extra['doaj'] = re.extra['doaj']
+ existing.extra["doaj"] = re.extra["doaj"]
existing.volume = existing.volume or re.volume
existing.issue = existing.issue or re.issue
existing.pages = existing.pages or re.pages
@@ -258,13 +263,13 @@ class DoajArticleImporter(EntityImporter):
try:
self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
- self.counts['update'] += 1
+ self.counts["update"] += 1
except fatcat_openapi_client.rest.ApiException as err:
# there is a code path where we try to update the same release
# twice in a row; if that happens, just skip
# NOTE: API behavior might change in the future?
if "release_edit_editgroup_id_ident_id_key" in err.body:
- self.counts['skip-update-conflict'] += 1
+ self.counts["skip-update-conflict"] += 1
return False
else:
raise err
@@ -272,14 +277,17 @@ class DoajArticleImporter(EntityImporter):
return False
def insert_batch(self, batch):
- self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_release_auto_batch(
+ fatcat_openapi_client.ReleaseAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
def doaj_abstracts(self, bibjson: dict) -> List[fatcat_openapi_client.ReleaseAbstract]:
- text = clean_str(bibjson.get('abstract'))
+ text = clean_str(bibjson.get("abstract"))
if not text or len(text) < 10:
return []
if len(text) > MAX_ABSTRACT_LENGTH:
@@ -293,7 +301,9 @@ class DoajArticleImporter(EntityImporter):
lang=lang,
)
- return [abstract,]
+ return [
+ abstract,
+ ]
def doaj_contribs(self, authors: List[dict]) -> List[fatcat_openapi_client.ReleaseContrib]:
"""
@@ -306,23 +316,27 @@ class DoajArticleImporter(EntityImporter):
contribs = []
index = 0
for author in authors:
- if not author.get('name'):
+ if not author.get("name"):
continue
creator_id = None
- orcid = clean_orcid(author.get('orcid_id'))
+ orcid = clean_orcid(author.get("orcid_id"))
if orcid:
creator_id = self.lookup_orcid(orcid)
- contribs.append(fatcat_openapi_client.ReleaseContrib(
- raw_name=author.get('name'),
- role='author',
- index=index,
- creator_id=creator_id,
- raw_affiliation=clean_str(author.get('affiliation')),
- ))
+ contribs.append(
+ fatcat_openapi_client.ReleaseContrib(
+ raw_name=author.get("name"),
+ role="author",
+ index=index,
+ creator_id=creator_id,
+ raw_affiliation=clean_str(author.get("affiliation")),
+ )
+ )
index += 1
return contribs
- def doaj_ext_ids(self, identifiers: List[dict], doaj_article_id: str) -> fatcat_openapi_client.ReleaseExtIds:
+ def doaj_ext_ids(
+ self, identifiers: List[dict], doaj_article_id: str
+ ) -> fatcat_openapi_client.ReleaseExtIds:
"""
bibjson.identifier {
id (string),
@@ -336,14 +350,14 @@ class DoajArticleImporter(EntityImporter):
pmid: Optional[str] = None
pmcid: Optional[str] = None
for id_obj in identifiers:
- if not id_obj.get('id'):
+ if not id_obj.get("id"):
continue
- if id_obj['type'].lower() == 'doi':
- doi = clean_doi(id_obj['id'])
- elif id_obj['type'].lower() == 'pmid':
- pmid = clean_pmid(id_obj['id'])
- elif id_obj['type'].lower() == 'pmcid':
- pmcid = clean_pmcid(id_obj['id'])
+ if id_obj["type"].lower() == "doi":
+ doi = clean_doi(id_obj["id"])
+ elif id_obj["type"].lower() == "pmid":
+ pmid = clean_pmid(id_obj["id"])
+ elif id_obj["type"].lower() == "pmcid":
+ pmcid = clean_pmcid(id_obj["id"])
return fatcat_openapi_client.ReleaseExtIds(
doaj=doaj_article_id,
@@ -365,10 +379,10 @@ class DoajArticleImporter(EntityImporter):
if not license_list:
return None
for license in license_list:
- if not license.get('open_access'):
+ if not license.get("open_access"):
continue
- slug = license.get('type')
- if slug.startswith('CC '):
- slug = slug.replace('CC ', 'cc-').lower()
+ slug = license.get("type")
+ if slug.startswith("CC "):
+ slug = slug.replace("CC ", "cc-").lower()
return slug
return None
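The license handling at the end of the DOAJ importer is compact enough to restate on its own. A minimal sketch of that slug normalization, assuming DOAJ-style license dicts with "type" and "open_access" keys (the sample inputs are invented):

    from typing import List, Optional

    def doaj_license_slug(license_list: Optional[List[dict]]) -> Optional[str]:
        # mirror of the importer method above: the first open-access license wins,
        # and "CC XY" style labels become "cc-xy" slugs
        if not license_list:
            return None
        for lic in license_list:
            if not lic.get("open_access"):
                continue
            slug = lic.get("type")
            if slug and slug.startswith("CC "):
                slug = slug.replace("CC ", "cc-").lower()
            return slug
        return None

    assert doaj_license_slug([{"type": "CC BY-NC-ND", "open_access": True}]) == "cc-by-nc-nd"
    assert doaj_license_slug([{"type": "CC BY", "open_access": False}]) is None
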
diff --git a/python/fatcat_tools/importers/file_meta.py b/python/fatcat_tools/importers/file_meta.py
index 0951ed84..26584ff3 100644
--- a/python/fatcat_tools/importers/file_meta.py
+++ b/python/fatcat_tools/importers/file_meta.py
@@ -1,4 +1,3 @@
-
import fatcat_openapi_client
from .common import EntityImporter
@@ -17,19 +16,16 @@ class FileMetaImporter(EntityImporter):
def __init__(self, api, require_grobid=True, **kwargs):
- eg_desc = kwargs.pop('editgroup_description', None) or "File metadata updates"
- eg_extra = kwargs.pop('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.FileMetaImporter')
- kwargs['do_updates'] = kwargs.get("do_updates", True)
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ eg_desc = kwargs.pop("editgroup_description", None) or "File metadata updates"
+ eg_extra = kwargs.pop("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.FileMetaImporter")
+ kwargs["do_updates"] = kwargs.get("do_updates", True)
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
def want(self, row):
- for k in ('sha1hex', 'sha256hex', 'md5hex', 'size_bytes', 'mimetype'):
+ for k in ("sha1hex", "sha256hex", "md5hex", "size_bytes", "mimetype"):
if not row.get(k):
- self.counts['skip-missing-field'] += 1
+ self.counts["skip-missing-field"] += 1
return False
return True
@@ -40,11 +36,11 @@ class FileMetaImporter(EntityImporter):
file_meta = row
fe = fatcat_openapi_client.FileEntity(
- md5=file_meta['md5hex'],
- sha1=file_meta['sha1hex'],
- sha256=file_meta['sha256hex'],
- size=file_meta['size_bytes'],
- mimetype=file_meta['mimetype'],
+ md5=file_meta["md5hex"],
+ sha1=file_meta["sha1hex"],
+ sha256=file_meta["sha256hex"],
+ size=file_meta["size_bytes"],
+ mimetype=file_meta["mimetype"],
)
return fe
@@ -59,11 +55,11 @@ class FileMetaImporter(EntityImporter):
raise err
if not existing:
- self.counts['skip-no-match'] += 1
+ self.counts["skip-no-match"] += 1
return False
- if (existing.md5 and existing.sha256 and existing.size and existing.mimetype):
- self.counts['skip-existing-complete'] += 1
+ if existing.md5 and existing.sha256 and existing.size and existing.mimetype:
+ self.counts["skip-existing-complete"] += 1
return False
existing.md5 = existing.md5 or fe.md5
@@ -75,5 +71,5 @@ class FileMetaImporter(EntityImporter):
existing = self.generic_file_cleanups(existing)
self.api.update_file(self.get_editgroup_id(), existing.ident, existing)
- self.counts['update'] += 1
+ self.counts["update"] += 1
return False
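For reference, the file_meta importer expects sandcrawler-style rows keyed by hex digests plus size and mimetype. A rough sketch of the want() gate with an invented sample row (digest values are placeholders, not real file hashes):

    REQUIRED_FILE_META_KEYS = ("sha1hex", "sha256hex", "md5hex", "size_bytes", "mimetype")

    def want_file_meta(row: dict) -> bool:
        # same rule as FileMetaImporter.want(): every field must be present and truthy
        return all(row.get(k) for k in REQUIRED_FILE_META_KEYS)

    sample_row = {
        "sha1hex": "da39a3ee5e6b4b0d3255bfef95601890afd80709",
        "sha256hex": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
        "md5hex": "d41d8cd98f00b204e9800998ecf8427e",
        "size_bytes": 12345,
        "mimetype": "application/pdf",
    }
    assert want_file_meta(sample_row)
    assert not want_file_meta({"sha1hex": sample_row["sha1hex"]})
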
diff --git a/python/fatcat_tools/importers/fileset_generic.py b/python/fatcat_tools/importers/fileset_generic.py
index 43c2a49c..dd8f5600 100644
--- a/python/fatcat_tools/importers/fileset_generic.py
+++ b/python/fatcat_tools/importers/fileset_generic.py
@@ -1,4 +1,3 @@
-
import fatcat_openapi_client
from fatcat_tools import entity_from_dict
@@ -20,34 +19,31 @@ class FilesetImporter(EntityImporter):
def __init__(self, api, **kwargs):
- eg_desc = kwargs.pop('editgroup_description', None) or "Generic Fileset entity import"
- eg_extra = kwargs.pop('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.FilesetImporter')
- kwargs['do_updates'] = bool(kwargs.get("do_updates", False))
+ eg_desc = kwargs.pop("editgroup_description", None) or "Generic Fileset entity import"
+ eg_extra = kwargs.pop("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.FilesetImporter")
+ kwargs["do_updates"] = bool(kwargs.get("do_updates", False))
self.skip_release_fileset_check = bool(kwargs.get("skip_release_fileset_check", False))
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
# bezerk mode doesn't make sense for this importer
assert self.bezerk_mode is False
def want(self, row):
- if not row.get('release_ids'):
- self.counts['skip-no-release-ids'] += 1
+ if not row.get("release_ids"):
+ self.counts["skip-no-release-ids"] += 1
return False
- if not row.get('urls'):
- self.counts['skip-no-urls'] += 1
+ if not row.get("urls"):
+ self.counts["skip-no-urls"] += 1
return False
- if not row.get('manifest'):
- self.counts['skip-no-files'] += 1
+ if not row.get("manifest"):
+ self.counts["skip-no-files"] += 1
return False
- for f in row.get('manifest'):
- for k in ('sha1', 'md5'):
+ for f in row.get("manifest"):
+ for k in ("sha1", "md5"):
if not f.get(k):
- self.counts['skip-missing-file-field'] += 1
+ self.counts["skip-missing-file-field"] += 1
return False
return True
@@ -66,19 +62,24 @@ class FilesetImporter(EntityImporter):
if not self.skip_release_fileset_check:
for release_id in fse.release_ids:
# don't catch 404, that would be an error
- release = self.api.get_release(release_id, expand='filesets', hide='abstracts,refs')
- assert release.state == 'active'
+ release = self.api.get_release(
+ release_id, expand="filesets", hide="abstracts,refs"
+ )
+ assert release.state == "active"
if release.filesets:
- self.counts['exists'] += 1
- self.counts['exists-via-release-filesets'] += 1
+ self.counts["exists"] += 1
+ self.counts["exists-via-release-filesets"] += 1
return False
# do the insert
return True
def insert_batch(self, batch):
- self.api.create_fileset_auto_batch(fatcat_openapi_client.FilesetAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_fileset_auto_batch(
+ fatcat_openapi_client.FilesetAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
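The FilesetImporter want() logic above boils down to a handful of structural checks on the incoming JSON. A standalone sketch of those checks, with a hypothetical minimal row (the ident, URL, and manifest values are illustrative only):

    def want_fileset_row(row: dict) -> bool:
        # mirrors FilesetImporter.want(): need release_ids, urls, and a manifest
        # in which every file carries both sha1 and md5
        if not row.get("release_ids") or not row.get("urls") or not row.get("manifest"):
            return False
        return all(f.get("sha1") and f.get("md5") for f in row["manifest"])

    row = {
        "release_ids": ["aaaaaaaaaaaaarceaaaaaaaaai"],  # placeholder fatcat release ident
        "urls": [{"url": "https://archive.org/download/example-item/", "rel": "archive-base"}],
        "manifest": [
            {"path": "data.csv", "sha1": "0" * 40, "md5": "0" * 32, "size": 123},
        ],
    }
    assert want_fileset_row(row)
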
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index 0f666652..f7bb5357 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -7,7 +7,7 @@ import fatcat_openapi_client
from .common import EntityImporter, clean, make_rel_url
-MAX_ABSTRACT_BYTES=4096
+MAX_ABSTRACT_BYTES = 4096
class GrobidMetadataImporter(EntityImporter):
@@ -24,14 +24,13 @@ class GrobidMetadataImporter(EntityImporter):
def __init__(self, api, **kwargs):
- eg_desc = kwargs.get('editgroup_description',
- "Import of release and file metadata, as extracted from PDFs by GROBID.")
- eg_extra = kwargs.get('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.GrobidMetadataImporter')
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ eg_desc = kwargs.get(
+ "editgroup_description",
+ "Import of release and file metadata, as extracted from PDFs by GROBID.",
+ )
+ eg_extra = kwargs.get("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.GrobidMetadataImporter")
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
self.default_link_rel = kwargs.get("default_link_rel", "web")
self.longtail_oa = kwargs.get("longtail_oa", False)
@@ -40,7 +39,7 @@ class GrobidMetadataImporter(EntityImporter):
def parse_record(self, row):
- fields = row.split('\t')
+ fields = row.split("\t")
sha1_key = fields[0]
cdx = json.loads(fields[1])
mimetype = fields[2]
@@ -65,8 +64,8 @@ class GrobidMetadataImporter(EntityImporter):
# TODO: this is where we should check if the file actually has
# release_ids and/or URLs associated with it
if existing and not self.bezerk_mode:
- self.counts['exists'] += 1
- self.counts['skip'] -= 1
+ self.counts["exists"] += 1
+ self.counts["skip"] -= 1
return None
release_edit = self.create_release(re)
@@ -75,75 +74,81 @@ class GrobidMetadataImporter(EntityImporter):
def parse_grobid_json(self, obj):
- if not obj.get('title'):
+ if not obj.get("title"):
return None
extra_grobid = dict()
- abstract = obj.get('abstract')
+ abstract = obj.get("abstract")
if abstract and len(abstract) < MAX_ABSTRACT_BYTES and len(abstract) > 10:
abobj = fatcat_openapi_client.ReleaseAbstract(
- mimetype="text/plain",
- content=clean(obj.get('abstract')))
+ mimetype="text/plain", content=clean(obj.get("abstract"))
+ )
abstracts = [abobj]
else:
abstracts = None
contribs = []
- for i, a in enumerate(obj.get('authors', [])):
- contribs.append(fatcat_openapi_client.ReleaseContrib(
- index=i,
- raw_name=clean(a['name']),
- given_name=clean(a.get('given_name')),
- surname=clean(a.get('surname')),
- role="author",
- extra=None))
+ for i, a in enumerate(obj.get("authors", [])):
+ contribs.append(
+ fatcat_openapi_client.ReleaseContrib(
+ index=i,
+ raw_name=clean(a["name"]),
+ given_name=clean(a.get("given_name")),
+ surname=clean(a.get("surname")),
+ role="author",
+ extra=None,
+ )
+ )
refs = []
- for raw in obj.get('citations', []):
+ for raw in obj.get("citations", []):
cite_extra = dict()
year = None
- if raw.get('date'):
+ if raw.get("date"):
try:
- year = int(raw['date'].strip()[:4])
+ year = int(raw["date"].strip()[:4])
except (IndexError, ValueError):
pass
- for key in ('volume', 'url', 'issue', 'publisher'):
+ for key in ("volume", "url", "issue", "publisher"):
if raw.get(key):
cite_extra[key] = clean(raw[key])
- if raw.get('authors'):
- cite_extra['authors'] = [clean(a['name']) for a in raw['authors']]
+ if raw.get("authors"):
+ cite_extra["authors"] = [clean(a["name"]) for a in raw["authors"]]
if not cite_extra:
cite_extra = None
- refs.append(fatcat_openapi_client.ReleaseRef(
- key=clean(raw.get('id')),
- year=year,
- title=clean(raw['title']),
- extra=cite_extra))
+ refs.append(
+ fatcat_openapi_client.ReleaseRef(
+ key=clean(raw.get("id")),
+ year=year,
+ title=clean(raw["title"]),
+ extra=cite_extra,
+ )
+ )
release_date = None
release_year = None
- if obj.get('date'):
+ if obj.get("date"):
# only returns year, ever?
- release_year = int(obj['date'][:4])
+ release_year = int(obj["date"][:4])
extra = dict()
- if obj.get('doi'):
- extra['doi'] = obj['doi']
- if obj['journal'] and obj['journal'].get('name'):
- extra['container_name'] = clean(obj['journal']['name'])
+ if obj.get("doi"):
+ extra["doi"] = obj["doi"]
+ if obj["journal"] and obj["journal"].get("name"):
+ extra["container_name"] = clean(obj["journal"]["name"])
# TODO: ISSN/eISSN handling? or just journal name lookup?
if extra_grobid:
- extra['grobid'] = extra_grobid
+ extra["grobid"] = extra_grobid
if self.longtail_oa:
- extra['longtail_oa'] = True
+ extra["longtail_oa"] = True
if not extra:
extra = None
- title = clean(obj['title'], force_xml=True)
+ title = clean(obj["title"], force_xml=True)
if not title or len(title) < 2:
return None
@@ -154,17 +159,22 @@ class GrobidMetadataImporter(EntityImporter):
release_year=release_year,
contribs=contribs,
refs=refs,
- publisher=clean(obj['journal'].get('publisher')),
- volume=clean(obj['journal'].get('volume')),
- issue=clean(obj['journal'].get('issue')),
+ publisher=clean(obj["journal"].get("publisher")),
+ volume=clean(obj["journal"].get("volume")),
+ issue=clean(obj["journal"].get("issue")),
abstracts=abstracts,
ext_ids=fatcat_openapi_client.ReleaseExtIds(),
- extra=extra)
+ extra=extra,
+ )
return re
def parse_file_metadata(self, sha1_key, cdx, mimetype, file_size):
- sha1 = base64.b16encode(base64.b32decode(sha1_key.replace('sha1:', ''))).decode('ascii').lower()
+ sha1 = (
+ base64.b16encode(base64.b32decode(sha1_key.replace("sha1:", "")))
+ .decode("ascii")
+ .lower()
+ )
fe = fatcat_openapi_client.FileEntity(
sha1=sha1,
@@ -175,16 +185,15 @@ class GrobidMetadataImporter(EntityImporter):
)
# parse URLs and CDX
- original = cdx['url']
- assert len(cdx['dt']) >= 8
- wayback = "https://web.archive.org/web/{}/{}".format(
- cdx['dt'],
- original)
- fe.urls.append(
- fatcat_openapi_client.FileUrl(url=wayback, rel="webarchive"))
+ original = cdx["url"]
+ assert len(cdx["dt"]) >= 8
+ wayback = "https://web.archive.org/web/{}/{}".format(cdx["dt"], original)
+ fe.urls.append(fatcat_openapi_client.FileUrl(url=wayback, rel="webarchive"))
original_url = make_rel_url(original, default_link_rel=self.default_link_rel)
if original_url is not None:
- fe.urls.append(fatcat_openapi_client.FileUrl(rel=original_url[0], url=original_url[1]))
+ fe.urls.append(
+ fatcat_openapi_client.FileUrl(rel=original_url[0], url=original_url[1])
+ )
return fe
@@ -193,8 +202,11 @@ class GrobidMetadataImporter(EntityImporter):
return True
def insert_batch(self, batch):
- self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_file_auto_batch(
+ fatcat_openapi_client.FileAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
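The parse_file_metadata() digest handling above converts sandcrawler's base32 "sha1:" keys into lower-case hex before building the FileEntity, and pairs the CDX capture with a wayback URL. A self-contained round-trip of that conversion (the input bytes and timestamp are invented):

    import base64
    import hashlib

    data = b"example pdf bytes"
    sha1_key = "sha1:" + base64.b32encode(hashlib.sha1(data).digest()).decode("ascii")

    # same conversion as parse_file_metadata(): base32 key -> raw digest -> hex string
    sha1_hex = (
        base64.b16encode(base64.b32decode(sha1_key.replace("sha1:", "")))
        .decode("ascii")
        .lower()
    )
    assert sha1_hex == hashlib.sha1(data).hexdigest()

    # wayback URL construction, as in the CDX handling above
    wayback = "https://web.archive.org/web/{}/{}".format(
        "20200101000000", "https://example.com/paper.pdf"
    )
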
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index f0943c1e..e0a6c3f5 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -1,4 +1,3 @@
-
import datetime
import fatcat_openapi_client
@@ -7,17 +6,16 @@ from .common import EntityImporter, make_rel_url
class IngestFileResultImporter(EntityImporter):
-
def __init__(self, api, require_grobid=True, **kwargs):
- eg_desc = kwargs.pop('editgroup_description', None) or "Files crawled from web using sandcrawler ingest tool"
- eg_extra = kwargs.pop('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestFileResultImporter')
- kwargs['do_updates'] = kwargs.get("do_updates", False)
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ eg_desc = (
+ kwargs.pop("editgroup_description", None)
+ or "Files crawled from web using sandcrawler ingest tool"
+ )
+ eg_extra = kwargs.pop("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.IngestFileResultImporter")
+ kwargs["do_updates"] = kwargs.get("do_updates", False)
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
self.use_glutton_match = False
self.default_link_rel = kwargs.get("default_link_rel", "web")
assert self.default_link_rel
@@ -27,20 +25,20 @@ class IngestFileResultImporter(EntityImporter):
else:
print("NOT checking GROBID success")
self.ingest_request_source_allowlist = [
- 'fatcat-changelog',
- 'fatcat-ingest-container',
- 'fatcat-ingest',
- 'arabesque',
+ "fatcat-changelog",
+ "fatcat-ingest-container",
+ "fatcat-ingest",
+ "arabesque",
#'mag-corpus',
#'mag',
- 'unpaywall-corpus',
- 'unpaywall',
+ "unpaywall-corpus",
+ "unpaywall",
#'s2-corpus',
#'s2',
- 'doaj',
- 'dblp',
+ "doaj",
+ "dblp",
]
- if kwargs.get('skip_source_allowlist', False):
+ if kwargs.get("skip_source_allowlist", False):
self.ingest_request_source_allowlist = []
def want_file(self, row) -> bool:
@@ -48,28 +46,32 @@ class IngestFileResultImporter(EntityImporter):
File-specific part of want(). Generic across general ingest and save-paper-now.
"""
- if not row.get('file_meta'):
- self.counts['skip-file-meta'] += 1
+ if not row.get("file_meta"):
+ self.counts["skip-file-meta"] += 1
return False
# type-specific filters
- if row['request'].get('ingest_type') == 'pdf':
- if self.require_grobid and row.get('grobid', {}).get('status_code') != 200:
- self.counts['skip-grobid'] += 1
+ if row["request"].get("ingest_type") == "pdf":
+ if self.require_grobid and row.get("grobid", {}).get("status_code") != 200:
+ self.counts["skip-grobid"] += 1
return False
- if row['file_meta'].get('mimetype') not in ("application/pdf",):
- self.counts['skip-mimetype'] += 1
+ if row["file_meta"].get("mimetype") not in ("application/pdf",):
+ self.counts["skip-mimetype"] += 1
return False
- elif row['request'].get('ingest_type') == 'xml':
- if row['file_meta'].get('mimetype') not in ("application/xml",
- "application/jats+xml", "application/tei+xml", "text/xml"):
- self.counts['skip-mimetype'] += 1
+ elif row["request"].get("ingest_type") == "xml":
+ if row["file_meta"].get("mimetype") not in (
+ "application/xml",
+ "application/jats+xml",
+ "application/tei+xml",
+ "text/xml",
+ ):
+ self.counts["skip-mimetype"] += 1
return False
- elif row['request'].get('ingest_type') in ['component', 'src', 'dataset-file']:
+ elif row["request"].get("ingest_type") in ["component", "src", "dataset-file"]:
# we rely on sandcrawler for these checks
pass
else:
- self.counts['skip-ingest-type'] += 1
+ self.counts["skip-ingest-type"] += 1
return False
return True
@@ -79,24 +81,36 @@ class IngestFileResultImporter(EntityImporter):
Sandcrawler ingest-specific part of want(). Generic across file and
webcapture ingest.
"""
- if row.get('hit') is not True:
- self.counts['skip-hit'] += 1
+ if row.get("hit") is not True:
+ self.counts["skip-hit"] += 1
return False
- source = row['request'].get('ingest_request_source')
+ source = row["request"].get("ingest_request_source")
if not source:
- self.counts['skip-ingest_request_source'] += 1
+ self.counts["skip-ingest_request_source"] += 1
return False
- if self.ingest_request_source_allowlist and source not in self.ingest_request_source_allowlist:
- self.counts['skip-ingest_request_source'] += 1
+ if (
+ self.ingest_request_source_allowlist
+ and source not in self.ingest_request_source_allowlist
+ ):
+ self.counts["skip-ingest_request_source"] += 1
return False
- if row['request'].get('link_source') not in ('arxiv', 'pmc', 'unpaywall', 'doi', 'mag', 's2', 'doaj', 'dblp'):
- self.counts['skip-link-source'] += 1
+ if row["request"].get("link_source") not in (
+ "arxiv",
+ "pmc",
+ "unpaywall",
+ "doi",
+ "mag",
+ "s2",
+ "doaj",
+ "dblp",
+ ):
+ self.counts["skip-link-source"] += 1
return False
- if source.startswith('savepapernow'):
+ if source.startswith("savepapernow"):
# never process async savepapernow requests
- self.counts['skip-savepapernow'] += 1
+ self.counts["skip-savepapernow"] += 1
return False
return True
@@ -125,19 +139,19 @@ class IngestFileResultImporter(EntityImporter):
def parse_ingest_release_ident(self, row):
- request = row['request']
- fatcat = request.get('fatcat')
+ request = row["request"]
+ fatcat = request.get("fatcat")
release_ident = None
- if fatcat and fatcat.get('release_ident'):
- release_ident = fatcat.get('release_ident')
- elif request.get('ext_ids'):
+ if fatcat and fatcat.get("release_ident"):
+ release_ident = fatcat.get("release_ident")
+ elif request.get("ext_ids"):
# if no fatcat ident, try extids
- for extid_type in ('doi', 'pmid', 'pmcid', 'arxiv', 'doaj', 'dblp'):
- extid = request['ext_ids'].get(extid_type)
+ for extid_type in ("doi", "pmid", "pmcid", "arxiv", "doaj", "dblp"):
+ extid = request["ext_ids"].get(extid_type)
if not extid:
continue
- if extid_type == 'doi':
+ if extid_type == "doi":
extid = extid.lower()
try:
release = self.api.lookup_release(**{extid_type: extid})
@@ -145,66 +159,69 @@ class IngestFileResultImporter(EntityImporter):
if err.status == 404:
continue
elif err.status == 400:
- self.counts['warn-extid-invalid'] += 1
+ self.counts["warn-extid-invalid"] += 1
continue
raise err
# verify release_stage
- if request.get('release_stage') and release.release_stage:
- if request['release_stage'] != release.release_stage:
- self.counts['skip-release-stage'] += 1
+ if request.get("release_stage") and release.release_stage:
+ if request["release_stage"] != release.release_stage:
+ self.counts["skip-release-stage"] += 1
return None
release_ident = release.ident
break
- if self.use_glutton_match and not release_ident and row.get('grobid'):
+ if self.use_glutton_match and not release_ident and row.get("grobid"):
# try biblio-glutton extracted hit
- if row['grobid'].get('fatcat_release'):
- release_ident = row['grobid']['fatcat_release'].split('_')[-1]
- self.counts['glutton-match'] += 1
+ if row["grobid"].get("fatcat_release"):
+ release_ident = row["grobid"]["fatcat_release"].split("_")[-1]
+ self.counts["glutton-match"] += 1
return release_ident
def parse_terminal(self, row):
- terminal = row.get('terminal')
+ terminal = row.get("terminal")
if not terminal:
# support old cdx-only ingest results
- cdx = row.get('cdx')
+ cdx = row.get("cdx")
if not cdx:
return None
else:
terminal = {
- 'terminal_url': cdx['url'],
- 'terminal_dt': cdx['datetime'],
- 'terminal_status_code': cdx.get('status_code') or cdx.get('http_status'),
+ "terminal_url": cdx["url"],
+ "terminal_dt": cdx["datetime"],
+ "terminal_status_code": cdx.get("status_code") or cdx.get("http_status"),
}
# work around old schema
- if 'terminal_url' not in terminal:
- terminal['terminal_url'] = terminal['url']
- if 'terminal_dt' not in terminal:
- terminal['terminal_dt'] = terminal['dt']
+ if "terminal_url" not in terminal:
+ terminal["terminal_url"] = terminal["url"]
+ if "terminal_dt" not in terminal:
+ terminal["terminal_dt"] = terminal["dt"]
# convert CDX-style digits to ISO-style timestamp
- assert len(terminal['terminal_dt']) == 14
- terminal['terminal_timestamp'] = datetime.datetime.strptime(terminal['terminal_dt'], "%Y%m%d%H%M%S").isoformat() + "Z"
+ assert len(terminal["terminal_dt"]) == 14
+ terminal["terminal_timestamp"] = (
+ datetime.datetime.strptime(terminal["terminal_dt"], "%Y%m%d%H%M%S").isoformat()
+ + "Z"
+ )
return terminal
def parse_urls(self, row, terminal):
- request = row['request']
+ request = row["request"]
default_rel = self.default_link_rel
- if request.get('link_source') == 'doi':
- default_rel = 'publisher'
- default_rel = request.get('rel', default_rel)
- url = make_rel_url(terminal['terminal_url'], default_rel)
+ if request.get("link_source") == "doi":
+ default_rel = "publisher"
+ default_rel = request.get("rel", default_rel)
+ url = make_rel_url(terminal["terminal_url"], default_rel)
if not url:
- self.counts['skip-url'] += 1
+ self.counts["skip-url"] += 1
return None
wayback = "https://web.archive.org/web/{}/{}".format(
- terminal['terminal_dt'],
- terminal['terminal_url'])
+ terminal["terminal_dt"], terminal["terminal_url"]
+ )
urls = [url, ("webarchive", wayback)]
urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls]
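The terminal handling in the hunk above turns 14-digit CDX datetimes into ISO timestamps and wayback URLs. A quick standalone illustration with an invented capture:

    import datetime

    terminal_dt = "20200522084500"  # CDX-style 14-digit capture timestamp (example value)
    assert len(terminal_dt) == 14
    terminal_timestamp = (
        datetime.datetime.strptime(terminal_dt, "%Y%m%d%H%M%S").isoformat() + "Z"
    )
    assert terminal_timestamp == "2020-05-22T08:45:00Z"

    wayback = "https://web.archive.org/web/{}/{}".format(
        terminal_dt, "https://example.com/paper.pdf"
    )
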
@@ -212,38 +229,38 @@ class IngestFileResultImporter(EntityImporter):
def parse_edit_extra(self, row):
- request = row['request']
+ request = row["request"]
edit_extra = dict()
- if request.get('edit_extra'):
- edit_extra = request['edit_extra']
+ if request.get("edit_extra"):
+ edit_extra = request["edit_extra"]
- if request.get('ingest_request_source'):
- edit_extra['ingest_request_source'] = request['ingest_request_source']
- if request.get('link_source') and request.get('link_source_id'):
- edit_extra['link_source'] = request['link_source']
- edit_extra['link_source_id'] = request['link_source_id']
- if edit_extra['link_source'] == 'doi':
- edit_extra['link_source_id'] = edit_extra['link_source_id'].lower()
+ if request.get("ingest_request_source"):
+ edit_extra["ingest_request_source"] = request["ingest_request_source"]
+ if request.get("link_source") and request.get("link_source_id"):
+ edit_extra["link_source"] = request["link_source"]
+ edit_extra["link_source_id"] = request["link_source_id"]
+ if edit_extra["link_source"] == "doi":
+ edit_extra["link_source_id"] = edit_extra["link_source_id"].lower()
# GROBID metadata, for SPN requests (when there might not be 'success')
- if request.get('ingest_type') == 'pdf':
- if row.get('grobid') and row['grobid'].get('status') != 'success':
- edit_extra['grobid_status_code'] = row['grobid']['status_code']
- edit_extra['grobid_version'] = row['grobid'].get('grobid_version')
+ if request.get("ingest_type") == "pdf":
+ if row.get("grobid") and row["grobid"].get("status") != "success":
+ edit_extra["grobid_status_code"] = row["grobid"]["status_code"]
+ edit_extra["grobid_version"] = row["grobid"].get("grobid_version")
return edit_extra
def parse_record(self, row):
- request = row['request']
- file_meta = row['file_meta']
+ request = row["request"]
+ file_meta = row["file_meta"]
# double check that want() filtered request correctly (eg, old requests)
- if request.get('ingest_type') not in ('pdf', 'xml'):
- self.counts['skip-ingest-type'] += 1
+ if request.get("ingest_type") not in ("pdf", "xml"):
+ self.counts["skip-ingest-type"] += 1
return None
- assert (request['ingest_type'], file_meta['mimetype']) in [
+ assert (request["ingest_type"], file_meta["mimetype"]) in [
("pdf", "application/pdf"),
("xml", "application/xml"),
("xml", "application/jats+xml"),
@@ -255,23 +272,23 @@ class IngestFileResultImporter(EntityImporter):
release_ident = self.parse_ingest_release_ident(row)
if not release_ident:
- self.counts['skip-release-not-found'] += 1
+ self.counts["skip-release-not-found"] += 1
return None
terminal = self.parse_terminal(row)
if not terminal:
# TODO: support archive.org hits?
- self.counts['skip-no-terminal'] += 1
+ self.counts["skip-no-terminal"] += 1
return None
urls = self.parse_urls(row, terminal)
fe = fatcat_openapi_client.FileEntity(
- md5=file_meta['md5hex'],
- sha1=file_meta['sha1hex'],
- sha256=file_meta['sha256hex'],
- size=file_meta['size_bytes'],
- mimetype=file_meta['mimetype'],
+ md5=file_meta["md5hex"],
+ sha1=file_meta["sha1hex"],
+ sha256=file_meta["sha256hex"],
+ size=file_meta["size_bytes"],
+ mimetype=file_meta["mimetype"],
release_ids=[release_ident],
urls=urls,
)
@@ -293,7 +310,7 @@ class IngestFileResultImporter(EntityImporter):
# check for existing edits-in-progress with same file hash
for other in self._entity_queue:
if other.sha1 == fe.sha1:
- self.counts['skip-in-queue'] += 1
+ self.counts["skip-in-queue"] += 1
return False
if not existing:
@@ -302,31 +319,36 @@ class IngestFileResultImporter(EntityImporter):
# NOTE: the following checks all assume there is an existing item
if (fe.release_ids[0] in existing.release_ids) and existing.urls:
# TODO: could still, in theory update with the new URL?
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
if not self.do_updates:
- self.counts['skip-update-disabled'] += 1
+ self.counts["skip-update-disabled"] += 1
return False
# TODO: for now, never update
- self.counts['skip-update-disabled'] += 1
+ self.counts["skip-update-disabled"] += 1
return False
def insert_batch(self, batch):
if self.submit_mode:
- eg = self.api.create_editgroup(fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra))
+ eg = self.api.create_editgroup(
+ fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ )
+ )
for fe in batch:
self.api.create_file(eg.editgroup_id, fe)
self.api.update_editgroup(eg.editgroup_id, eg, submit=True)
else:
- self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_file_auto_batch(
+ fatcat_openapi_client.FileAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
class SavePaperNowFileImporter(IngestFileResultImporter):
@@ -338,29 +360,29 @@ class SavePaperNowFileImporter(IngestFileResultImporter):
def __init__(self, api, submit_mode=True, **kwargs):
- eg_desc = kwargs.pop('editgroup_description', None) or "Files crawled after a public 'Save Paper Now' request"
- eg_extra = kwargs.pop('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.SavePaperNowFileImporter')
- kwargs['submit_mode'] = submit_mode
- kwargs['require_grobid'] = False
- kwargs['do_updates'] = False
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ eg_desc = (
+ kwargs.pop("editgroup_description", None)
+ or "Files crawled after a public 'Save Paper Now' request"
+ )
+ eg_extra = kwargs.pop("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.SavePaperNowFileImporter")
+ kwargs["submit_mode"] = submit_mode
+ kwargs["require_grobid"] = False
+ kwargs["do_updates"] = False
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
def want(self, row):
- source = row['request'].get('ingest_request_source')
+ source = row["request"].get("ingest_request_source")
if not source:
- self.counts['skip-ingest_request_source'] += 1
+ self.counts["skip-ingest_request_source"] += 1
return False
- if not source.startswith('savepapernow'):
- self.counts['skip-not-savepapernow'] += 1
+ if not source.startswith("savepapernow"):
+ self.counts["skip-not-savepapernow"] += 1
return False
- if row.get('hit') is not True:
- self.counts['skip-hit'] += 1
+ if row.get("hit") is not True:
+ self.counts["skip-hit"] += 1
return False
if not self.want_file(row):
@@ -377,14 +399,14 @@ class IngestWebResultImporter(IngestFileResultImporter):
def __init__(self, api, **kwargs):
- eg_desc = kwargs.pop('editgroup_description', None) or "Webcaptures crawled from web using sandcrawler ingest tool"
- eg_extra = kwargs.pop('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestWebResultImporter')
- kwargs['do_updates'] = False
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ eg_desc = (
+ kwargs.pop("editgroup_description", None)
+ or "Webcaptures crawled from web using sandcrawler ingest tool"
+ )
+ eg_extra = kwargs.pop("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.IngestWebResultImporter")
+ kwargs["do_updates"] = False
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
def want(self, row):
@@ -392,91 +414,95 @@ class IngestWebResultImporter(IngestFileResultImporter):
return False
# webcapture-specific filters
- if row['request'].get('ingest_type') != 'html':
- self.counts['skip-ingest-type'] += 1
+ if row["request"].get("ingest_type") != "html":
+ self.counts["skip-ingest-type"] += 1
return False
- if not row.get('file_meta'):
- self.counts['skip-file-meta'] += 1
+ if not row.get("file_meta"):
+ self.counts["skip-file-meta"] += 1
return False
- if row['file_meta'].get('mimetype') not in ("text/html", "application/xhtml+xml"):
- self.counts['skip-mimetype'] += 1
+ if row["file_meta"].get("mimetype") not in ("text/html", "application/xhtml+xml"):
+ self.counts["skip-mimetype"] += 1
return False
return True
def parse_record(self, row):
- request = row['request']
- file_meta = row['file_meta']
+ request = row["request"]
+ file_meta = row["file_meta"]
# double check that want() filtered request correctly (eg, old requests)
- if request.get('ingest_type') != "html":
- self.counts['skip-ingest-type'] += 1
+ if request.get("ingest_type") != "html":
+ self.counts["skip-ingest-type"] += 1
return None
- if file_meta['mimetype'] not in ("text/html", "application/xhtml+xml"):
- self.counts['skip-mimetype'] += 1
+ if file_meta["mimetype"] not in ("text/html", "application/xhtml+xml"):
+ self.counts["skip-mimetype"] += 1
return None
# identify release by fatcat ident, or extid lookup
release_ident = self.parse_ingest_release_ident(row)
if not release_ident:
- self.counts['skip-release-not-found'] += 1
+ self.counts["skip-release-not-found"] += 1
return None
terminal = self.parse_terminal(row)
if not terminal:
# TODO: support archive.org hits?
- self.counts['skip-no-terminal'] += 1
+ self.counts["skip-no-terminal"] += 1
return None
urls = self.parse_urls(row, terminal)
- archive_urls = [u for u in urls if u.rel == 'webarchive']
+ archive_urls = [u for u in urls if u.rel == "webarchive"]
- if terminal['terminal_status_code'] != 200:
- self.counts['skip-terminal-status-code'] += 1
+ if terminal["terminal_status_code"] != 200:
+ self.counts["skip-terminal-status-code"] += 1
return None
- terminal_cdx = row['cdx']
- if 'revisit_cdx' in row:
- terminal_cdx = row['revisit_cdx']
- assert terminal_cdx['surt']
- if terminal_cdx['url'] != terminal['terminal_url']:
- self.counts['skip-terminal-url-mismatch'] += 1
+ terminal_cdx = row["cdx"]
+ if "revisit_cdx" in row:
+ terminal_cdx = row["revisit_cdx"]
+ assert terminal_cdx["surt"]
+ if terminal_cdx["url"] != terminal["terminal_url"]:
+ self.counts["skip-terminal-url-mismatch"] += 1
return None
wc_cdx = []
# primary resource first
- wc_cdx.append(fatcat_openapi_client.WebcaptureCdxLine(
- surt=terminal_cdx['surt'],
- timestamp=terminal['terminal_timestamp'],
- url=terminal['terminal_url'],
- mimetype=file_meta['mimetype'],
- status_code=terminal['terminal_status_code'],
- sha1=file_meta['sha1hex'],
- sha256=file_meta['sha256hex'],
- size=file_meta['size_bytes'],
- ))
-
- for resource in row.get('html_resources', []):
- timestamp = resource['timestamp']
+ wc_cdx.append(
+ fatcat_openapi_client.WebcaptureCdxLine(
+ surt=terminal_cdx["surt"],
+ timestamp=terminal["terminal_timestamp"],
+ url=terminal["terminal_url"],
+ mimetype=file_meta["mimetype"],
+ status_code=terminal["terminal_status_code"],
+ sha1=file_meta["sha1hex"],
+ sha256=file_meta["sha256hex"],
+ size=file_meta["size_bytes"],
+ )
+ )
+
+ for resource in row.get("html_resources", []):
+ timestamp = resource["timestamp"]
if "+" not in timestamp and "Z" not in timestamp:
timestamp += "Z"
- wc_cdx.append(fatcat_openapi_client.WebcaptureCdxLine(
- surt=resource['surt'],
- timestamp=timestamp,
- url=resource['url'],
- mimetype=resource.get('mimetype'),
- size=resource.get('size'),
- sha1=resource.get('sha1hex'),
- sha256=resource.get('sha256hex'),
- ))
+ wc_cdx.append(
+ fatcat_openapi_client.WebcaptureCdxLine(
+ surt=resource["surt"],
+ timestamp=timestamp,
+ url=resource["url"],
+ mimetype=resource.get("mimetype"),
+ size=resource.get("size"),
+ sha1=resource.get("sha1hex"),
+ sha256=resource.get("sha256hex"),
+ )
+ )
wc = fatcat_openapi_client.WebcaptureEntity(
cdx=wc_cdx,
archive_urls=archive_urls,
- original_url=terminal['terminal_url'],
- timestamp=terminal['terminal_timestamp'],
+ original_url=terminal["terminal_url"],
+ timestamp=terminal["terminal_timestamp"],
release_ids=[release_ident],
)
@@ -491,11 +517,11 @@ class IngestWebResultImporter(IngestFileResultImporter):
# check for existing edits-in-progress with same URL
for other in self._entity_queue:
if other.original_url == wc.original_url:
- self.counts['skip-in-queue'] += 1
+ self.counts["skip-in-queue"] += 1
return False
# lookup sha1, or create new entity (TODO: API doesn't support this yet)
- #existing = None
+ # existing = None
# TODO: currently only allow one release per webcapture
release = self.api.get_release(wc.release_ids[0], expand="webcaptures")
@@ -504,9 +530,9 @@ class IngestWebResultImporter(IngestFileResultImporter):
for other in release.webcaptures:
if wc.original_url == other.original_url:
# TODO: compare very similar timestamps of same time (different formats)
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
- self.counts['skip-release-has-webcapture'] += 1
+ self.counts["skip-release-has-webcapture"] += 1
return False
# Ok, if we got here then no existing web capture for (first) release,
@@ -515,18 +541,24 @@ class IngestWebResultImporter(IngestFileResultImporter):
def insert_batch(self, batch):
if self.submit_mode:
- eg = self.api.create_editgroup(fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra))
+ eg = self.api.create_editgroup(
+ fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ )
+ )
for fe in batch:
self.api.create_webcapture(eg.editgroup_id, fe)
self.api.update_editgroup(eg.editgroup_id, eg, submit=True)
else:
- self.api.create_webcapture_auto_batch(fatcat_openapi_client.WebcaptureAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_webcapture_auto_batch(
+ fatcat_openapi_client.WebcaptureAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
+
class SavePaperNowWebImporter(IngestWebResultImporter):
"""
@@ -535,15 +567,15 @@ class SavePaperNowWebImporter(IngestWebResultImporter):
def __init__(self, api, submit_mode=True, **kwargs):
- eg_desc = kwargs.pop('editgroup_description', None) or "Webcaptures crawled after a public 'Save Paper Now' request"
- eg_extra = kwargs.pop('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.SavePaperNowWebImporter')
- kwargs['submit_mode'] = submit_mode
- kwargs['do_updates'] = False
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ eg_desc = (
+ kwargs.pop("editgroup_description", None)
+ or "Webcaptures crawled after a public 'Save Paper Now' request"
+ )
+ eg_extra = kwargs.pop("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.SavePaperNowWebImporter")
+ kwargs["submit_mode"] = submit_mode
+ kwargs["do_updates"] = False
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
def want(self, row):
"""
@@ -553,27 +585,27 @@ class SavePaperNowWebImporter(IngestWebResultImporter):
path, which means allowing hit=false.
"""
- source = row['request'].get('ingest_request_source')
+ source = row["request"].get("ingest_request_source")
if not source:
- self.counts['skip-ingest_request_source'] += 1
+ self.counts["skip-ingest_request_source"] += 1
return False
- if not source.startswith('savepapernow'):
- self.counts['skip-not-savepapernow'] += 1
+ if not source.startswith("savepapernow"):
+ self.counts["skip-not-savepapernow"] += 1
return False
# webcapture-specific filters
- if row['request'].get('ingest_type') != 'html':
- self.counts['skip-ingest-type'] += 1
+ if row["request"].get("ingest_type") != "html":
+ self.counts["skip-ingest-type"] += 1
return False
- if not row.get('file_meta'):
- self.counts['skip-file-meta'] += 1
+ if not row.get("file_meta"):
+ self.counts["skip-file-meta"] += 1
return False
- if row['file_meta'].get('mimetype') not in ("text/html", "application/xhtml+xml"):
- self.counts['skip-mimetype'] += 1
+ if row["file_meta"].get("mimetype") not in ("text/html", "application/xhtml+xml"):
+ self.counts["skip-mimetype"] += 1
return False
- if row.get('status') not in ['success', 'unknown-scope']:
- self.counts['skip-hit'] += 1
+ if row.get("status") not in ["success", "unknown-scope"]:
+ self.counts["skip-hit"] += 1
return False
return True
@@ -587,28 +619,28 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
def __init__(self, api, **kwargs):
- eg_desc = kwargs.pop('editgroup_description', None) or "Filesets crawled from web using sandcrawler ingest tool"
- eg_extra = kwargs.pop('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestFilesetResultImporter')
- kwargs['do_updates'] = False
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ eg_desc = (
+ kwargs.pop("editgroup_description", None)
+ or "Filesets crawled from web using sandcrawler ingest tool"
+ )
+ eg_extra = kwargs.pop("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.IngestFilesetResultImporter")
+ kwargs["do_updates"] = False
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
self.max_file_count = 300
def want_fileset(self, row):
- if not row.get('manifest') or len(row.get('manifest')) == 0:
- self.counts['skip-empty-manifest'] += 1
+ if not row.get("manifest") or len(row.get("manifest")) == 0:
+ self.counts["skip-empty-manifest"] += 1
return False
- if len(row.get('manifest')) == 1:
- self.counts['skip-single-file'] += 1
+ if len(row.get("manifest")) == 1:
+ self.counts["skip-single-file"] += 1
return False
- if len(row.get('manifest')) > self.max_file_count:
- self.counts['skip-too-many-files'] += 1
+ if len(row.get("manifest")) > self.max_file_count:
+ self.counts["skip-too-many-files"] += 1
return False
return True
@@ -619,8 +651,10 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
return False
# fileset-specific filters
- if row['request'].get('ingest_type') not in ['dataset',]:
- self.counts['skip-ingest-type'] += 1
+ if row["request"].get("ingest_type") not in [
+ "dataset",
+ ]:
+ self.counts["skip-ingest-type"] += 1
return False
if not self.want_fileset(row):
@@ -629,102 +663,118 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
return True
def parse_fileset_urls(self, row):
- if not row.get('strategy'):
+ if not row.get("strategy"):
return []
- strategy = row['strategy']
+ strategy = row["strategy"]
urls = []
- if strategy == 'archiveorg-fileset' and row.get('archiveorg_item_name'):
- urls.append(fatcat_openapi_client.FilesetUrl(
- url=f"https://archive.org/download/{row['archiveorg_item_name']}/",
- rel="archive-base",
- ))
- if row['strategy'].startswith('web-') and row.get('platform_base_url'):
- urls.append(fatcat_openapi_client.FilesetUrl(
- url=f"https://web.archive.org/web/{row['web_base_url_dt']}/{row['web_base_url']}",
- rel="webarchive-base",
- ))
+ if strategy == "archiveorg-fileset" and row.get("archiveorg_item_name"):
+ urls.append(
+ fatcat_openapi_client.FilesetUrl(
+ url=f"https://archive.org/download/{row['archiveorg_item_name']}/",
+ rel="archive-base",
+ )
+ )
+ if row["strategy"].startswith("web-") and row.get("platform_base_url"):
+ urls.append(
+ fatcat_openapi_client.FilesetUrl(
+ url=f"https://web.archive.org/web/{row['web_base_url_dt']}/{row['web_base_url']}",
+ rel="webarchive-base",
+ )
+ )
# TODO: repository-base
# TODO: web-base
- if row['strategy'] == 'archiveorg-fileset-bundle' and row.get('archiveorg_item_name'):
- urls.append(fatcat_openapi_client.FilesetUrl(
- url=f"https://archive.org/download/{row['archiveorg_item_name']}/{row['archiveorg_bundle_path']}",
- rel="archive-bundle",
- ))
+ if row["strategy"] == "archiveorg-fileset-bundle" and row.get("archiveorg_item_name"):
+ urls.append(
+ fatcat_openapi_client.FilesetUrl(
+ url=f"https://archive.org/download/{row['archiveorg_item_name']}/{row['archiveorg_bundle_path']}",
+ rel="archive-bundle",
+ )
+ )
- if row['strategy'] == 'web-fileset-bundle' and row.get('platform_bundle_url'):
- urls.append(fatcat_openapi_client.FilesetUrl(
- url=f"https://web.archive.org/web/{row['web_bundle_url_dt']}/{row['web_bundle_url']}",
- rel="webarchive-bundle",
- ))
+ if row["strategy"] == "web-fileset-bundle" and row.get("platform_bundle_url"):
+ urls.append(
+ fatcat_openapi_client.FilesetUrl(
+ url=f"https://web.archive.org/web/{row['web_bundle_url_dt']}/{row['web_bundle_url']}",
+ rel="webarchive-bundle",
+ )
+ )
# add any additional / platform URLs here
- if row.get('platform_bundle_url'):
- urls.append(fatcat_openapi_client.FilesetUrl(
- url=row['platform_bundle_url'],
- rel="repository-bundle",
- ))
- if row.get('platform_base_url'):
- urls.append(fatcat_openapi_client.FilesetUrl(
- url=row['platform_bundle_url'],
- rel="repository-base",
- ))
+ if row.get("platform_bundle_url"):
+ urls.append(
+ fatcat_openapi_client.FilesetUrl(
+ url=row["platform_bundle_url"],
+ rel="repository-bundle",
+ )
+ )
+ if row.get("platform_base_url"):
+ urls.append(
+ fatcat_openapi_client.FilesetUrl(
+                    url=row["platform_base_url"],
+ rel="repository-base",
+ )
+ )
return urls
def parse_record(self, row):
- request = row['request']
+ request = row["request"]
# double check that want() filtered request correctly
- if request.get('ingest_type') not in ["dataset",]:
- self.counts['skip-ingest-type'] += 1
+ if request.get("ingest_type") not in [
+ "dataset",
+ ]:
+ self.counts["skip-ingest-type"] += 1
return None
# identify release by fatcat ident, or extid lookup
release_ident = self.parse_ingest_release_ident(row)
if not release_ident:
- self.counts['skip-release-not-found'] += 1
+ self.counts["skip-release-not-found"] += 1
return None
entity_extra = dict()
edit_extra = self.parse_edit_extra(row)
- edit_extra['ingest_strategy'] = row['ingest_strategy']
- if row.get('platform'):
- edit_extra['platform'] = row['platform']
- if row.get('platform_id'):
- edit_extra['platform_id'] = row['platform_id']
+ edit_extra["ingest_strategy"] = row["ingest_strategy"]
+ if row.get("platform"):
+ edit_extra["platform"] = row["platform"]
+ if row.get("platform_id"):
+ edit_extra["platform_id"] = row["platform_id"]
entity_urls = self.parse_fileset_urls(row)
if not entity_urls:
- self.counts['skip-no-access-url'] += 1
+ self.counts["skip-no-access-url"] += 1
return None
- assert row['file_count'] == len(row['manifest'])
- if row['file_count'] > self.max_file_count:
- self.counts['skip-too-many-manifest-files'] += 1
+ assert row["file_count"] == len(row["manifest"])
+ if row["file_count"] > self.max_file_count:
+ self.counts["skip-too-many-manifest-files"] += 1
return None
manifest = []
- for ingest_file in row['manifest']:
+ for ingest_file in row["manifest"]:
fsf = fatcat_openapi_client.FilesetFile(
- path=ingest_file['path'],
- size=ingest_file['size'],
- md5=ingest_file['md5'],
- sha1=ingest_file['sha1'],
- sha256=ingest_file.get('sha256'),
+ path=ingest_file["path"],
+ size=ingest_file["size"],
+ md5=ingest_file["md5"],
+ sha1=ingest_file["sha1"],
+ sha256=ingest_file.get("sha256"),
extra=dict(
- mimetype=ingest_file['mimetype'],
+ mimetype=ingest_file["mimetype"],
),
)
if not (fsf.md5 and fsf.sha1 and fsf.path and fsf.size):
- self.counts['skip-partial-file-info'] += 1
+ self.counts["skip-partial-file-info"] += 1
return None
- if ingest_file.get('platform_url'):
+ if ingest_file.get("platform_url"):
# XXX: should we include this?
- fsf.extra['original_url'] = ingest_file['platform_url']
- if ingest_file.get('terminal_url') and ingest_file.get('terminal_dt'):
- fsf.extra['wayback_url'] = f"https://web.archive.org/web/{ingest_file['terminal_dt']}/{ingest_file['terminal_url']}"
+ fsf.extra["original_url"] = ingest_file["platform_url"]
+ if ingest_file.get("terminal_url") and ingest_file.get("terminal_dt"):
+ fsf.extra[
+ "wayback_url"
+ ] = f"https://web.archive.org/web/{ingest_file['terminal_dt']}/{ingest_file['terminal_url']}"
manifest.append(fsf)
fe = fatcat_openapi_client.FilesetEntity(
@@ -745,11 +795,11 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
for other in self._entity_queue:
# XXX: how to duplicate check?
if other.original_url == wc.original_url:
- self.counts['skip-in-queue'] += 1
+ self.counts["skip-in-queue"] += 1
return False
# lookup sha1, or create new entity (TODO: API doesn't support this yet)
- #existing = None
+ # existing = None
# NOTE: in lieu of existing checks (by lookup), only allow one fileset per release
release = self.api.get_release(wc.release_ids[0], expand="filesets")
@@ -759,27 +809,32 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
for other in release.filesets:
if wc.original_url == other.original_url:
                     # TODO: compare very similar timestamps of the same time (different formats)
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
- self.counts['skip-release-has-fileset'] += 1
+ self.counts["skip-release-has-fileset"] += 1
return False
return True
def insert_batch(self, batch):
if self.submit_mode:
- eg = self.api.create_editgroup(fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra))
+ eg = self.api.create_editgroup(
+ fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ )
+ )
for fe in batch:
self.api.create_fileset(eg.editgroup_id, fe)
self.api.update_editgroup(eg.editgroup_id, eg, submit=True)
else:
- self.api.create_fileset_auto_batch(fatcat_openapi_client.FilesetAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_fileset_auto_batch(
+ fatcat_openapi_client.FilesetAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
class SavePaperNowFilesetImporter(IngestFilesetResultImporter):
@@ -789,28 +844,28 @@ class SavePaperNowFilesetImporter(IngestFilesetResultImporter):
def __init__(self, api, submit_mode=True, **kwargs):
- eg_desc = kwargs.pop('editgroup_description', None) or "Fileset crawled after a public 'Save Paper Now' request"
- eg_extra = kwargs.pop('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.SavePaperNowFilesetImporter')
- kwargs['submit_mode'] = submit_mode
- kwargs['do_updates'] = False
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ eg_desc = (
+ kwargs.pop("editgroup_description", None)
+ or "Fileset crawled after a public 'Save Paper Now' request"
+ )
+ eg_extra = kwargs.pop("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.SavePaperNowFilesetImporter")
+ kwargs["submit_mode"] = submit_mode
+ kwargs["do_updates"] = False
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
def want(self, row):
- source = row['request'].get('ingest_request_source')
+ source = row["request"].get("ingest_request_source")
if not source:
- self.counts['skip-ingest_request_source'] += 1
+ self.counts["skip-ingest_request_source"] += 1
return False
- if not source.startswith('savepapernow'):
- self.counts['skip-not-savepapernow'] += 1
+ if not source.startswith("savepapernow"):
+ self.counts["skip-not-savepapernow"] += 1
return False
- if row.get('hit') is not True:
- self.counts['skip-hit'] += 1
+ if row.get("hit") is not True:
+ self.counts["skip-hit"] += 1
return False
if not self.want_fileset(row):
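
The insert_batch() hunks above keep two submission paths: with submit_mode, one editgroup is created per batch and submitted for human review (the "Save Paper Now" flow); otherwise the whole batch goes through the auto-batch endpoint. A minimal sketch of that pattern, using only the client calls visible in the diff; the standalone insert_fileset_batch() helper itself is hypothetical:

import fatcat_openapi_client


def insert_fileset_batch(api, batch, description, extra, submit_mode=False):
    # sketch of the two paths in IngestFilesetResultImporter.insert_batch()
    editgroup = fatcat_openapi_client.Editgroup(description=description, extra=extra)
    if submit_mode:
        # "Save Paper Now" style: one editgroup per batch, submitted for review
        eg = api.create_editgroup(editgroup)
        for entity in batch:
            api.create_fileset(eg.editgroup_id, entity)
        api.update_editgroup(eg.editgroup_id, eg, submit=True)
    else:
        # bot-style path: create and auto-accept the whole batch in one call
        api.create_fileset_auto_batch(
            fatcat_openapi_client.FilesetAutoBatch(editgroup=editgroup, entity_list=batch)
        )
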
diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py
index 0a983c5e..8e3af416 100644
--- a/python/fatcat_tools/importers/jalc.py
+++ b/python/fatcat_tools/importers/jalc.py
@@ -1,4 +1,3 @@
-
import datetime
import sqlite3
import sys
@@ -33,26 +32,24 @@ def parse_jalc_persons(raw_persons):
     # first parse out into language-agnostic dicts
for raw in raw_persons:
- name = raw.find('name') or None
+ name = raw.find("name") or None
if name:
- name = clean(name.get_text().replace('\n', ' '))
- surname = raw.find('familyName') or None
+ name = clean(name.get_text().replace("\n", " "))
+ surname = raw.find("familyName") or None
if surname:
- surname = clean(surname.get_text().replace('\n', ' '))
- given_name = raw.find('givenName') or None
+ surname = clean(surname.get_text().replace("\n", " "))
+ given_name = raw.find("givenName") or None
if given_name:
- given_name = clean(given_name.get_text().replace('\n', ' '))
- lang = 'en'
+ given_name = clean(given_name.get_text().replace("\n", " "))
+ lang = "en"
if is_cjk(name):
- lang = 'ja'
- if lang == 'en' and surname and given_name:
+ lang = "ja"
+ if lang == "en" and surname and given_name:
# english names order is flipped
name = "{} {}".format(given_name, surname)
rc = fatcat_openapi_client.ReleaseContrib(
- raw_name=name,
- surname=surname,
- given_name=given_name,
- role="author")
+ raw_name=name, surname=surname, given_name=given_name, role="author"
+ )
# add an extra hint field; won't end up in serialized object
rc._lang = lang
persons.append(rc)
@@ -60,12 +57,12 @@ def parse_jalc_persons(raw_persons):
if not persons:
return []
- if all([p._lang == 'en' for p in persons]) or all([p._lang == 'ja' for p in persons]):
+ if all([p._lang == "en" for p in persons]) or all([p._lang == "ja" for p in persons]):
# all english names, or all japanese names
return persons
# for debugging
- #if len([1 for p in persons if p._lang == 'en']) != len([1 for p in persons if p._lang == 'ja']):
+ # if len([1 for p in persons if p._lang == 'en']) != len([1 for p in persons if p._lang == 'ja']):
# print("INTERESTING: {}".format(persons[0]))
start_lang = persons[0]._lang
@@ -74,10 +71,10 @@ def parse_jalc_persons(raw_persons):
if p._lang == start_lang:
contribs.append(p)
else:
- if p._lang == 'en' and contribs[-1]._lang == 'ja':
+ if p._lang == "en" and contribs[-1]._lang == "ja":
eng = p
jpn = contribs[-1]
- elif p._lang == 'ja' and contribs[-1]._lang == 'en':
+ elif p._lang == "ja" and contribs[-1]._lang == "en":
eng = contribs[-1]
jpn = p
else:
@@ -85,11 +82,11 @@ def parse_jalc_persons(raw_persons):
contribs.append(p)
continue
eng.extra = {
- 'original_name': {
- 'lang': jpn._lang,
- 'raw_name': jpn.raw_name,
- 'given_name': jpn.given_name,
- 'surname': jpn.surname,
+ "original_name": {
+ "lang": jpn._lang,
+ "raw_name": jpn.raw_name,
+ "given_name": jpn.given_name,
+ "surname": jpn.surname,
},
}
contribs[-1] = eng
@@ -105,18 +102,19 @@ class JalcImporter(EntityImporter):
def __init__(self, api, issn_map_file, **kwargs):
- eg_desc = kwargs.get('editgroup_description',
- "Automated import of JALC DOI metadata")
- eg_extra = kwargs.get('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.JalcImporter')
- super().__init__(api,
+ eg_desc = kwargs.get("editgroup_description", "Automated import of JALC DOI metadata")
+ eg_extra = kwargs.get("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.JalcImporter")
+ super().__init__(
+ api,
issn_map_file=issn_map_file,
editgroup_description=eg_desc,
editgroup_extra=eg_extra,
- **kwargs)
+ **kwargs
+ )
- self.create_containers = kwargs.get('create_containers', True)
- extid_map_file = kwargs.get('extid_map_file')
+ self.create_containers = kwargs.get("create_containers", True)
+ extid_map_file = kwargs.get("extid_map_file")
self.extid_map_db = None
if extid_map_file:
db_uri = "file:{}?mode=ro".format(extid_map_file)
@@ -129,12 +127,27 @@ class JalcImporter(EntityImporter):
def lookup_ext_ids(self, doi):
if self.extid_map_db is None:
- return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
- row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1",
- [doi.lower()]).fetchone()
+ return dict(
+ core_id=None,
+ pmid=None,
+ pmcid=None,
+ wikidata_qid=None,
+ arxiv_id=None,
+ jstor_id=None,
+ )
+ row = self.extid_map_db.execute(
+ "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()]
+ ).fetchone()
if row is None:
- return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
- row = [str(cell or '') or None for cell in row]
+ return dict(
+ core_id=None,
+ pmid=None,
+ pmcid=None,
+ wikidata_qid=None,
+ arxiv_id=None,
+ jstor_id=None,
+ )
+ row = [str(cell or "") or None for cell in row]
return dict(
core_id=row[0],
pmid=row[1],
@@ -163,27 +176,27 @@ class JalcImporter(EntityImporter):
titles = record.find_all("title")
if not titles:
return None
- title = titles[0].get_text().replace('\n', ' ').strip()
+ title = titles[0].get_text().replace("\n", " ").strip()
original_title = None
- if title.endswith('.'):
+ if title.endswith("."):
title = title[:-1]
if len(titles) > 1:
- original_title = titles[1].get_text().replace('\n', ' ').strip()
- if original_title.endswith('.'):
+ original_title = titles[1].get_text().replace("\n", " ").strip()
+ if original_title.endswith("."):
original_title = original_title[:-1]
doi = None
if record.doi:
doi = clean_doi(record.doi.string.strip().lower())
- if doi.startswith('http://dx.doi.org/'):
- doi = doi.replace('http://dx.doi.org/', '')
- elif doi.startswith('https://dx.doi.org/'):
- doi = doi.replace('https://dx.doi.org/', '')
- elif doi.startswith('http://doi.org/'):
- doi = doi.replace('http://doi.org/', '')
- elif doi.startswith('https://doi.org/'):
- doi = doi.replace('https://doi.org/', '')
- if not (doi.startswith('10.') and '/' in doi):
+ if doi.startswith("http://dx.doi.org/"):
+ doi = doi.replace("http://dx.doi.org/", "")
+ elif doi.startswith("https://dx.doi.org/"):
+ doi = doi.replace("https://dx.doi.org/", "")
+ elif doi.startswith("http://doi.org/"):
+ doi = doi.replace("http://doi.org/", "")
+ elif doi.startswith("https://doi.org/"):
+ doi = doi.replace("https://doi.org/", "")
+ if not (doi.startswith("10.") and "/" in doi):
sys.stderr.write("bogus JALC DOI: {}\n".format(doi))
doi = None
if not doi:
@@ -202,7 +215,9 @@ class JalcImporter(EntityImporter):
if date:
date = date.string
if len(date) == 10:
- release_date = datetime.datetime.strptime(date['completed-date'], DATE_FMT).date()
+                release_date = datetime.datetime.strptime(date, DATE_FMT).date()
release_year = release_date.year
release_date = release_date.isoformat()
elif len(date) == 4 and date.isdigit():
@@ -214,7 +229,7 @@ class JalcImporter(EntityImporter):
if record.endingPage and record.endingPage.string.strip():
pages = "{}-{}".format(pages, record.endingPage.string.strip())
# double check to prevent "-" as pages
- if pages and pages.strip() == '-':
+ if pages and pages.strip() == "-":
pages = None
volume = None
@@ -242,9 +257,13 @@ class JalcImporter(EntityImporter):
container_extra = dict()
if record.publicationName:
- pubs = [p.get_text().replace('\n', ' ').strip() for p in record.find_all("publicationName") if p.get_text()]
+ pubs = [
+ p.get_text().replace("\n", " ").strip()
+ for p in record.find_all("publicationName")
+ if p.get_text()
+ ]
pubs = [clean(p) for p in pubs if p]
- assert(pubs)
+ assert pubs
if len(pubs) > 1 and pubs[0] == pubs[1]:
pubs = [pubs[0]]
if len(pubs) > 1 and is_cjk(pubs[0]):
@@ -252,10 +271,14 @@ class JalcImporter(EntityImporter):
pubs = [pubs[1], pubs[0]]
container_name = clean(pubs[0])
if len(pubs) > 1:
- container_extra['original_name'] = clean(pubs[1])
+ container_extra["original_name"] = clean(pubs[1])
if record.publisher:
- pubs = [p.get_text().replace('\n', ' ').strip() for p in record.find_all("publisher") if p.get_text()]
+ pubs = [
+ p.get_text().replace("\n", " ").strip()
+ for p in record.find_all("publisher")
+ if p.get_text()
+ ]
pubs = [p for p in pubs if p]
if len(pubs) > 1 and pubs[0] == pubs[1]:
pubs = [pubs[0]]
@@ -265,20 +288,25 @@ class JalcImporter(EntityImporter):
if pubs:
publisher = clean(pubs[0])
if len(pubs) > 1:
- container_extra['publisher_aliases'] = pubs[1:]
-
- if (container_id is None and self.create_containers and (issnl is not None)
- and container_name):
+ container_extra["publisher_aliases"] = pubs[1:]
+
+ if (
+ container_id is None
+ and self.create_containers
+ and (issnl is not None)
+ and container_name
+ ):
# name, type, publisher, issnl
# extra: issnp, issne, original_name, languages, country
- container_extra['country'] = 'jp'
- container_extra['languages'] = ['ja']
+ container_extra["country"] = "jp"
+ container_extra["languages"] = ["ja"]
ce = fatcat_openapi_client.ContainerEntity(
name=container_name,
- container_type='journal',
+ container_type="journal",
publisher=publisher,
issnl=issnl,
- extra=(container_extra or None))
+ extra=(container_extra or None),
+ )
ce_edit = self.create_container(ce)
container_id = ce_edit.ident
# short-cut future imports in same batch
@@ -301,7 +329,7 @@ class JalcImporter(EntityImporter):
# group-title
# always put at least an empty dict here to indicate the DOI registrar
# (informally)
- extra['jalc'] = extra_jalc
+ extra["jalc"] = extra_jalc
title = clean(title)
if not title:
@@ -312,24 +340,24 @@ class JalcImporter(EntityImporter):
title=title,
original_title=clean(original_title),
release_type=release_type,
- release_stage='published',
+ release_stage="published",
release_date=release_date,
release_year=release_year,
ext_ids=fatcat_openapi_client.ReleaseExtIds(
doi=doi,
- pmid=extids['pmid'],
- pmcid=extids['pmcid'],
- wikidata_qid=extids['wikidata_qid'],
- core=extids['core_id'],
- arxiv=extids['arxiv_id'],
- jstor=extids['jstor_id'],
+ pmid=extids["pmid"],
+ pmcid=extids["pmcid"],
+ wikidata_qid=extids["wikidata_qid"],
+ core=extids["core_id"],
+ arxiv=extids["arxiv_id"],
+ jstor=extids["jstor_id"],
),
volume=volume,
issue=issue,
pages=pages,
publisher=publisher,
language=lang,
- #license_slug
+ # license_slug
container_id=container_id,
contribs=contribs,
extra=extra,
@@ -351,17 +379,20 @@ class JalcImporter(EntityImporter):
# eventually we'll want to support "updates", but for now just skip if
# entity already exists
if existing:
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
return True
def insert_batch(self, batch):
- self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_release_auto_batch(
+ fatcat_openapi_client.ReleaseAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
def parse_file(self, handle):
"""
@@ -374,11 +405,11 @@ class JalcImporter(EntityImporter):
# 2. iterate over articles, call parse_article on each
for record in soup.find_all("Description"):
resp = self.parse_record(record)
- #print(json.dumps(resp))
+ # print(json.dumps(resp))
print(resp)
- #sys.exit(-1)
+ # sys.exit(-1)
-if __name__=='__main__':
+if __name__ == "__main__":
parser = JalcImporter(None, None)
parser.parse_file(open(sys.argv[1]))
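
The DOI handling above strips several resolver prefixes before sanity-checking the identifier (the importer also runs clean_doi() first). A minimal standalone sketch of that normalization; the normalize_jalc_doi() helper name is hypothetical:

from typing import Optional


def normalize_jalc_doi(raw: str) -> Optional[str]:
    """Strip resolver prefixes and validate a bare DOI, mirroring JalcImporter.parse_record()."""
    doi = raw.strip().lower()
    for prefix in (
        "http://dx.doi.org/",
        "https://dx.doi.org/",
        "http://doi.org/",
        "https://doi.org/",
    ):
        if doi.startswith(prefix):
            doi = doi[len(prefix) :]
            break
    if not (doi.startswith("10.") and "/" in doi):
        return None  # bogus DOI; the importer logs and drops these
    return doi


assert normalize_jalc_doi("https://doi.org/10.11501/1234567") == "10.11501/1234567"
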
diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py
index 25d7b3b5..6d1fefa3 100644
--- a/python/fatcat_tools/importers/journal_metadata.py
+++ b/python/fatcat_tools/importers/journal_metadata.py
@@ -1,4 +1,3 @@
-
import fatcat_openapi_client
from .common import EntityImporter, clean
@@ -11,18 +10,20 @@ def or_none(s):
return None
return s
+
def truthy(s):
if s is None:
return None
s = s.lower()
- if s in ('true', 't', 'yes', 'y', '1'):
+ if s in ("true", "t", "yes", "y", "1"):
return True
- elif s in ('false', 'f', 'no', 'n', '0'):
+ elif s in ("false", "f", "no", "n", "0"):
return False
else:
return None
+
class JournalMetadataImporter(EntityImporter):
"""
Imports journal metadata ("containers") by ISSN, currently from a custom
@@ -33,17 +34,16 @@ class JournalMetadataImporter(EntityImporter):
def __init__(self, api, **kwargs):
- eg_desc = kwargs.get('editgroup_description',
- "Automated import of container-level metadata, by ISSN. Metadata from Internet Archive munging.")
- eg_extra = kwargs.get('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.JournalMetadataImporter')
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ eg_desc = kwargs.get(
+ "editgroup_description",
+ "Automated import of container-level metadata, by ISSN. Metadata from Internet Archive munging.",
+ )
+ eg_extra = kwargs.get("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.JournalMetadataImporter")
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
def want(self, raw_record):
- if raw_record.get('issnl') and raw_record.get('name'):
+ if raw_record.get("issnl") and raw_record.get("name"):
return True
return False
@@ -54,52 +54,68 @@ class JournalMetadataImporter(EntityImporter):
returns a ContainerEntity (or None if invalid or couldn't parse)
"""
- if not row.get('name'):
+ if not row.get("name"):
# Name is required (by schema)
return None
extra = dict()
- for key in ('issne', 'issnp', 'languages', 'country', 'urls', 'abbrev',
- 'coden', 'aliases', 'original_name', 'first_year', 'last_year',
- 'platform', 'default_license', 'road', 'mimetypes',
- 'sherpa_romeo', 'kbart'):
+ for key in (
+ "issne",
+ "issnp",
+ "languages",
+ "country",
+ "urls",
+ "abbrev",
+ "coden",
+ "aliases",
+ "original_name",
+ "first_year",
+ "last_year",
+ "platform",
+ "default_license",
+ "road",
+ "mimetypes",
+ "sherpa_romeo",
+ "kbart",
+ ):
if row.get(key):
extra[key] = row[key]
# TODO: not including for now: norwegian, dois/crossref, ia
extra_doaj = dict()
- if row.get('doaj'):
- if row['doaj'].get('as_of'):
- extra_doaj['as_of'] = row['doaj']['as_of']
- if row['doaj'].get('works'):
- extra_doaj['works'] = row['doaj']['works']
+ if row.get("doaj"):
+ if row["doaj"].get("as_of"):
+ extra_doaj["as_of"] = row["doaj"]["as_of"]
+ if row["doaj"].get("works"):
+ extra_doaj["works"] = row["doaj"]["works"]
if extra_doaj:
- extra['doaj'] = extra_doaj
+ extra["doaj"] = extra_doaj
extra_ia = dict()
# TODO: would like an ia.longtail_ia flag
- if row.get('sim'):
+ if row.get("sim"):
             # NB: None case of the .get() here is blech, but otherwise
# extra['ia'].get('sim') would be false-y, breaking 'any_ia_sim' later on
- extra_ia['sim'] = {
- 'year_spans': row['sim'].get('year_spans'),
+ extra_ia["sim"] = {
+ "year_spans": row["sim"].get("year_spans"),
}
if extra_ia:
- extra['ia'] = extra_ia
+ extra["ia"] = extra_ia
- name = clean(row.get('name'))
+ name = clean(row.get("name"))
if not name:
return None
ce = fatcat_openapi_client.ContainerEntity(
- issnl=row['issnl'],
- issne=row.get('issne'),
- issnp=row.get('issnp'),
- container_type=None, # TODO
+ issnl=row["issnl"],
+ issne=row.get("issne"),
+ issnp=row.get("issnp"),
+ container_type=None, # TODO
name=name,
- publisher=clean(row.get('publisher')),
- wikidata_qid=None, # TODO
- extra=extra)
+ publisher=clean(row.get("publisher")),
+ wikidata_qid=None, # TODO
+ extra=extra,
+ )
return ce
def try_update(self, ce):
@@ -118,23 +134,26 @@ class JournalMetadataImporter(EntityImporter):
# for now, only update KBART, and only if there is new content
if not existing.extra:
existing.extra = dict()
- if ce.extra.get('kbart') and (existing.extra.get('kbart') != ce.extra['kbart']):
- if not existing.extra.get('kbart'):
- existing.extra['kbart'] = {}
- existing.extra['kbart'].update(ce.extra['kbart'])
+ if ce.extra.get("kbart") and (existing.extra.get("kbart") != ce.extra["kbart"]):
+ if not existing.extra.get("kbart"):
+ existing.extra["kbart"] = {}
+ existing.extra["kbart"].update(ce.extra["kbart"])
self.api.update_container(self.get_editgroup_id(), existing.ident, existing)
- self.counts['update'] += 1
+ self.counts["update"] += 1
return False
else:
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
# if we got this far, it's a bug
raise NotImplementedError
def insert_batch(self, batch):
- self.api.create_container_auto_batch(fatcat_openapi_client.ContainerAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_container_auto_batch(
+ fatcat_openapi_client.ContainerAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
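
The try_update() hunk above only merges new KBART preservation coverage into an existing container and leaves other fields alone. A small sketch of just that merge step, with plain dicts standing in for the entities' extra fields; the helper and the sample payload shape are illustrative assumptions:

def merge_kbart(existing_extra: dict, incoming_extra: dict) -> bool:
    """Merge incoming 'kbart' coverage into an existing extra dict.

    Returns True when something changed and an update should be pushed,
    mirroring the update-only-KBART branch of try_update() above.
    """
    if not incoming_extra.get("kbart"):
        return False
    if existing_extra.get("kbart") == incoming_extra["kbart"]:
        return False
    existing_extra.setdefault("kbart", {}).update(incoming_extra["kbart"])
    return True


extra = {"kbart": {"lockss": {"year_spans": [[2000, 2005]]}}}
assert merge_kbart(extra, {"kbart": {"portico": {"year_spans": [[2006, 2010]]}}})
assert set(extra["kbart"]) == {"lockss", "portico"}
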
diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py
index d37424d6..8c7bfad4 100644
--- a/python/fatcat_tools/importers/jstor.py
+++ b/python/fatcat_tools/importers/jstor.py
@@ -1,4 +1,3 @@
-
import datetime
import json
import sys
@@ -12,10 +11,10 @@ from .crossref import CONTAINER_TYPE_MAP
# TODO: more entries?
JSTOR_CONTRIB_MAP = {
- 'author': 'author',
- 'editor': 'editor',
- 'translator': 'translator',
- 'illustrator': 'illustrator',
+ "author": "author",
+ "editor": "editor",
+ "translator": "translator",
+ "illustrator": "illustrator",
}
JSTOR_TYPE_MAP = {
@@ -26,6 +25,7 @@ JSTOR_TYPE_MAP = {
"research-article": "article-journal",
}
+
class JstorImporter(EntityImporter):
"""
Importer for JSTOR bulk XML metadata (eg, from their Early Journals
@@ -34,17 +34,18 @@ class JstorImporter(EntityImporter):
def __init__(self, api, issn_map_file, **kwargs):
- eg_desc = kwargs.get('editgroup_description',
- "Automated import of JSTOR XML metadata")
- eg_extra = kwargs.get('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.JstorImporter')
- super().__init__(api,
+ eg_desc = kwargs.get("editgroup_description", "Automated import of JSTOR XML metadata")
+ eg_extra = kwargs.get("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.JstorImporter")
+ super().__init__(
+ api,
issn_map_file=issn_map_file,
editgroup_description=eg_desc,
editgroup_extra=eg_extra,
- **kwargs)
+ **kwargs
+ )
- self.create_containers = kwargs.get('create_containers', True)
+ self.create_containers = kwargs.get("create_containers", True)
self.read_issn_map_file(issn_map_file)
@@ -62,20 +63,22 @@ class JstorImporter(EntityImporter):
extra = dict()
extra_jstor = dict()
- release_type = JSTOR_TYPE_MAP.get(article['article-type'])
+ release_type = JSTOR_TYPE_MAP.get(article["article-type"])
title = article_meta.find("article-title")
if title and title.get_text():
- title = title.get_text().replace('\n', ' ').strip()
+ title = title.get_text().replace("\n", " ").strip()
elif title and not title.get_text():
title = None
- if not title and release_type.startswith('review') and article_meta.product.source:
- title = "Review: {}".format(article_meta.product.source.replace('\n', ' ').get_text())
+ if not title and release_type.startswith("review") and article_meta.product.source:
+ title = "Review: {}".format(
+ article_meta.product.source.replace("\n", " ").get_text()
+ )
if not title:
return None
- if title.endswith('.'):
+ if title.endswith("."):
title = title[:-1]
if "[Abstract]" in title:
@@ -93,12 +96,12 @@ class JstorImporter(EntityImporter):
title = title[1:-1]
# JSTOR journal-id
- journal_ids = [j.string for j in journal_meta.find_all('journal-id')]
+ journal_ids = [j.string for j in journal_meta.find_all("journal-id")]
if journal_ids:
- extra_jstor['journal_ids'] = journal_ids
+ extra_jstor["journal_ids"] = journal_ids
- journal_title = journal_meta.find("journal-title").get_text().replace('\n', ' ')
- publisher = journal_meta.find("publisher-name").get_text().replace('\n', ' ')
+ journal_title = journal_meta.find("journal-title").get_text().replace("\n", " ")
+ publisher = journal_meta.find("publisher-name").get_text().replace("\n", " ")
issn = journal_meta.find("issn")
if issn:
issn = issn.string
@@ -113,13 +116,18 @@ class JstorImporter(EntityImporter):
container_id = self.lookup_issnl(issnl)
# create container if it doesn't exist
- if (container_id is None and self.create_containers and (issnl is not None)
- and journal_title):
+ if (
+ container_id is None
+ and self.create_containers
+ and (issnl is not None)
+ and journal_title
+ ):
ce = fatcat_openapi_client.ContainerEntity(
issnl=issnl,
publisher=publisher,
container_type=self.map_container_type(release_type),
- name=clean(journal_title, force_xml=True))
+ name=clean(journal_title, force_xml=True),
+ )
ce_edit = self.create_container(ce)
container_id = ce_edit.ident
self._issnl_id_map[issnl] = container_id
@@ -132,8 +140,8 @@ class JstorImporter(EntityImporter):
if jstor_id:
jstor_id = jstor_id.string.strip()
if not jstor_id and doi:
- assert doi.startswith('10.2307/')
- jstor_id = doi.replace('10.2307/', '')
+ assert doi.startswith("10.2307/")
+ jstor_id = doi.replace("10.2307/", "")
assert jstor_id and int(jstor_id)
contribs = []
@@ -142,13 +150,13 @@ class JstorImporter(EntityImporter):
for c in cgroup.find_all("contrib"):
given = c.find("given-names")
if given:
- given = clean(given.get_text().replace('\n', ' '))
+ given = clean(given.get_text().replace("\n", " "))
surname = c.find("surname")
if surname:
- surname = clean(surname.get_text().replace('\n', ' '))
+ surname = clean(surname.get_text().replace("\n", " "))
raw_name = c.find("string-name")
if raw_name:
- raw_name = clean(raw_name.get_text().replace('\n', ' '))
+ raw_name = clean(raw_name.get_text().replace("\n", " "))
if not raw_name:
if given and surname:
@@ -156,15 +164,17 @@ class JstorImporter(EntityImporter):
elif surname:
raw_name = surname
- role = JSTOR_CONTRIB_MAP.get(c.get('contrib-type', 'author'))
- if not role and c.get('contrib-type'):
- sys.stderr.write("NOT IN JSTOR_CONTRIB_MAP: {}\n".format(c['contrib-type']))
- contribs.append(fatcat_openapi_client.ReleaseContrib(
- role=role,
- raw_name=raw_name,
- given_name=given,
- surname=surname,
- ))
+ role = JSTOR_CONTRIB_MAP.get(c.get("contrib-type", "author"))
+ if not role and c.get("contrib-type"):
+ sys.stderr.write("NOT IN JSTOR_CONTRIB_MAP: {}\n".format(c["contrib-type"]))
+ contribs.append(
+ fatcat_openapi_client.ReleaseContrib(
+ role=role,
+ raw_name=raw_name,
+ given_name=given,
+ surname=surname,
+ )
+ )
for i, contrib in enumerate(contribs):
if contrib.raw_name != "et al.":
@@ -172,14 +182,13 @@ class JstorImporter(EntityImporter):
release_year = None
release_date = None
- pub_date = article_meta.find('pub-date')
+ pub_date = article_meta.find("pub-date")
if pub_date and pub_date.year:
release_year = int(pub_date.year.string)
if pub_date.month and pub_date.day:
release_date = datetime.date(
- release_year,
- int(pub_date.month.string),
- int(pub_date.day.string))
+ release_year, int(pub_date.month.string), int(pub_date.day.string)
+ )
if release_date.day == 1 and release_date.month == 1:
# suspect jan 1st dates get set by JSTOR when actual
# date not known (citation needed), so drop them
@@ -208,10 +217,10 @@ class JstorImporter(EntityImporter):
warnings.warn("MISSING MARC LANG: {}".format(cm.find("meta-value").string))
# JSTOR issue-id
- if article_meta.find('issue-id'):
- issue_id = clean(article_meta.find('issue-id').string)
+ if article_meta.find("issue-id"):
+ issue_id = clean(article_meta.find("issue-id").string)
if issue_id:
- extra_jstor['issue_id'] = issue_id
+ extra_jstor["issue_id"] = issue_id
# everything in JSTOR is published
release_stage = "published"
@@ -225,14 +234,14 @@ class JstorImporter(EntityImporter):
# group-title
# pubmed: retraction refs
if extra_jstor:
- extra['jstor'] = extra_jstor
+ extra["jstor"] = extra_jstor
if not extra:
extra = None
re = fatcat_openapi_client.ReleaseEntity(
- #work_id
+ # work_id
title=title,
- #original_title
+ # original_title
release_type=release_type,
release_stage=release_stage,
release_date=release_date,
@@ -246,21 +255,16 @@ class JstorImporter(EntityImporter):
pages=pages,
publisher=publisher,
language=language,
- #license_slug
-
+ # license_slug
# content, mimetype, lang
- #abstracts=abstracts,
-
+ # abstracts=abstracts,
contribs=contribs,
-
# key, year, container_name, title, locator
# extra: volume, authors, issue, publisher, identifiers
- #refs=refs,
-
+ # refs=refs,
# name, type, publisher, issnl
# extra: issnp, issne, original_name, languages, country
container_id=container_id,
-
extra=extra,
)
return re
@@ -289,12 +293,12 @@ class JstorImporter(EntityImporter):
if existing and existing.ext_ids.jstor:
# don't update if it already has JSTOR ID
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
elif existing:
# but do update if only DOI was set
existing.ext_ids.jstor = re.ext_ids.jstor
- existing.extra['jstor'] = re.extra['jstor']
+ existing.extra["jstor"] = re.extra["jstor"]
# better release_type detection, and some other fields
# TODO: don't do this over-writing in the future? assuming here
# this is a one-time batch import over/extending bootstrap crossref
@@ -304,17 +308,20 @@ class JstorImporter(EntityImporter):
existing.contribs = re.contribs
existing.language = re.language
self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
- self.counts['update'] += 1
+ self.counts["update"] += 1
return False
return True
def insert_batch(self, batch):
- self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_release_auto_batch(
+ fatcat_openapi_client.ReleaseAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
def parse_file(self, handle):
@@ -325,8 +332,9 @@ class JstorImporter(EntityImporter):
for article in soup.find_all("article"):
resp = self.parse_record(article)
print(json.dumps(resp))
- #sys.exit(-1)
+ # sys.exit(-1)
+
-if __name__=='__main__':
+if __name__ == "__main__":
parser = JstorImporter(None, None)
parser.parse_file(open(sys.argv[1]))
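
The hunk above derives the JSTOR identifier from a 10.2307/ DOI when no explicit JSTOR article-id was found. A tiny sketch of that fallback; the jstor_id_from_doi() helper is hypothetical:

def jstor_id_from_doi(doi: str) -> str:
    """Fallback used in JstorImporter.parse_record() when only a DOI is available."""
    assert doi.startswith("10.2307/")
    jstor_id = doi.replace("10.2307/", "")
    assert jstor_id and int(jstor_id)  # ids in this corpus are numeric strings
    return jstor_id


assert jstor_id_from_doi("10.2307/1234567") == "1234567"
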
diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py
index 09807276..7c2a6a87 100644
--- a/python/fatcat_tools/importers/matched.py
+++ b/python/fatcat_tools/importers/matched.py
@@ -1,4 +1,3 @@
-
import fatcat_openapi_client
from fatcat_tools.normal import clean_doi
@@ -32,13 +31,13 @@ class MatchedImporter(EntityImporter):
def __init__(self, api, **kwargs):
- eg_desc = kwargs.pop('editgroup_description', None) or "Import of large-scale file-to-release match results. Source of metadata varies."
- eg_extra = kwargs.pop('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.MatchedImporter')
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ eg_desc = (
+ kwargs.pop("editgroup_description", None)
+ or "Import of large-scale file-to-release match results. Source of metadata varies."
+ )
+ eg_extra = kwargs.pop("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.MatchedImporter")
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
self.default_link_rel = kwargs.get("default_link_rel", "web")
self.default_mimetype = kwargs.get("default_mimetype", None)
@@ -46,14 +45,14 @@ class MatchedImporter(EntityImporter):
return True
def parse_record(self, obj):
- dois = [d.lower() for d in obj.get('dois', [])]
+ dois = [d.lower() for d in obj.get("dois", [])]
# lookup dois
re_list = set()
for doi in dois:
doi = clean_doi(doi)
if not doi:
- self.counts['skip-bad-doi'] += 1
+ self.counts["skip-bad-doi"] += 1
return None
try:
re = self.api.lookup_release(doi=doi)
@@ -62,13 +61,22 @@ class MatchedImporter(EntityImporter):
raise err
re = None
if re is None:
- #print("DOI not found: {}".format(doi))
+ # print("DOI not found: {}".format(doi))
pass
else:
re_list.add(re.ident)
# look up other external ids
- for extid_type in ('arxiv', 'pmid', 'pmcid', 'jstor', 'wikidata_qid', 'core', 'isbn13', 'ark'):
+ for extid_type in (
+ "arxiv",
+ "pmid",
+ "pmcid",
+ "jstor",
+ "wikidata_qid",
+ "core",
+ "isbn13",
+ "ark",
+ ):
extid = obj.get(extid_type)
if extid:
try:
@@ -84,49 +92,47 @@ class MatchedImporter(EntityImporter):
release_ids = list(re_list)
if len(release_ids) == 0:
- self.counts['skip-no-releases'] += 1
+ self.counts["skip-no-releases"] += 1
return None
if len(release_ids) > SANE_MAX_RELEASES:
- self.counts['skip-too-many-releases'] += 1
+ self.counts["skip-too-many-releases"] += 1
return None
# parse URLs and CDX
urls = set()
- for url in obj.get('urls', []):
+ for url in obj.get("urls", []):
url = make_rel_url(url, default_link_rel=self.default_link_rel)
if url is not None:
urls.add(url)
- for cdx in obj.get('cdx', []):
- original = cdx['url']
- if cdx.get('dt'):
- wayback = "https://web.archive.org/web/{}/{}".format(
- cdx['dt'],
- original)
+ for cdx in obj.get("cdx", []):
+ original = cdx["url"]
+ if cdx.get("dt"):
+ wayback = "https://web.archive.org/web/{}/{}".format(cdx["dt"], original)
urls.add(("webarchive", wayback))
url = make_rel_url(original, default_link_rel=self.default_link_rel)
if url is not None:
urls.add(url)
urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls]
if len(urls) == 0:
- self.counts['skip-no-urls'] += 1
+ self.counts["skip-no-urls"] += 1
return None
if len(urls) > SANE_MAX_URLS:
- self.counts['skip-too-many-urls'] += 1
+ self.counts["skip-too-many-urls"] += 1
return None
- size = obj.get('size')
+ size = obj.get("size")
if size:
size = int(size)
- mimetype = obj.get('mimetype', self.default_mimetype)
+ mimetype = obj.get("mimetype", self.default_mimetype)
if not mimetype and urls:
- if urls[0].url.endswith('.pdf'):
- mimetype = 'application/pdf'
+ if urls[0].url.endswith(".pdf"):
+ mimetype = "application/pdf"
fe = fatcat_openapi_client.FileEntity(
- md5=obj.get('md5'),
- sha1=obj['sha1'],
- sha256=obj.get('sha256'),
+ md5=obj.get("md5"),
+ sha1=obj["sha1"],
+ sha256=obj.get("sha256"),
size=size,
mimetype=mimetype,
release_ids=release_ids,
@@ -149,28 +155,30 @@ class MatchedImporter(EntityImporter):
combined_release_ids = list(set(fe.release_ids + existing.release_ids))
if set(combined_release_ids) == set(existing.release_ids) and len(existing.urls) > 0:
# no new release matches *and* there are already existing URLs
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
# check for edit conflicts
if existing.ident in [e.ident for e in self._edits_inflight]:
- self.counts['skip-update-inflight'] += 1
+ self.counts["skip-update-inflight"] += 1
return False
# minimum viable "existing" URL cleanup to fix dupes and broken links:
# remove 'None' wayback URLs, and set archive.org rel 'archive'
- existing.urls = [u for u in existing.urls if not ('://web.archive.org/web/None/' in u.url)]
+ existing.urls = [
+ u for u in existing.urls if not ("://web.archive.org/web/None/" in u.url)
+ ]
for i in range(len(existing.urls)):
u = existing.urls[i]
- if u.rel == 'repository' and '://archive.org/download/' in u.url:
- existing.urls[i].rel = 'archive'
+ if u.rel == "repository" and "://archive.org/download/" in u.url:
+ existing.urls[i].rel = "archive"
# special case: if importing *new* from archive.org arxiv collections,
# blow away any existing release_id mappings; this is a direct arxiv_id
# map. This *should* be safe to run in all matched imports.
is_arxiv = False
for u in fe.urls:
- if 'archive.org/download/arxiv' in u.url.lower():
+ if "archive.org/download/arxiv" in u.url.lower():
is_arxiv = True
break
if is_arxiv and fe.release_ids:
@@ -178,14 +186,16 @@ class MatchedImporter(EntityImporter):
# merge the existing into this one and update
existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls]))
- existing.urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in existing.urls]
+ existing.urls = [
+ fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in existing.urls
+ ]
if len(existing.urls) > SANE_MAX_URLS:
- self.counts['skip-update-too-many-url'] += 1
+ self.counts["skip-update-too-many-url"] += 1
return None
existing.release_ids = list(set(fe.release_ids + existing.release_ids))
if len(existing.release_ids) > SANE_MAX_RELEASES:
- self.counts['skip-update-too-many-releases'] += 1
+ self.counts["skip-update-too-many-releases"] += 1
return None
existing.mimetype = existing.mimetype or fe.mimetype
existing.size = existing.size or fe.size
@@ -194,12 +204,15 @@ class MatchedImporter(EntityImporter):
existing.sha256 = existing.sha256 or fe.sha256
edit = self.api.update_file(self.get_editgroup_id(), existing.ident, existing)
self._edits_inflight.append(edit)
- self.counts['update'] += 1
+ self.counts["update"] += 1
return False
def insert_batch(self, batch):
- self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_file_auto_batch(
+ fatcat_openapi_client.FileAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
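
The MatchedImporter hunks above dedupe file URLs by round-tripping (rel, url) tuples through a set before rebuilding FileUrl objects. A minimal sketch of that merge, with a namedtuple standing in for fatcat_openapi_client.FileUrl so the snippet runs without the client installed:

from collections import namedtuple

# stand-in for fatcat_openapi_client.FileUrl, for this sketch only
FileUrl = namedtuple("FileUrl", ["rel", "url"])


def merge_file_urls(new_urls, existing_urls):
    """Union two FileUrl lists without duplicates, as in the MatchedImporter hunks above."""
    merged = set((u.rel, u.url) for u in list(new_urls) + list(existing_urls))
    return [FileUrl(rel=rel, url=url) for (rel, url) in merged]


wayback = FileUrl("webarchive", "https://web.archive.org/web/2020/https://example.com/a.pdf")
web = FileUrl("web", "https://example.com/a.pdf")
assert len(merge_file_urls([web], [web, wayback])) == 2
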
diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py
index 3bdd23a1..b514e6e5 100644
--- a/python/fatcat_tools/importers/orcid.py
+++ b/python/fatcat_tools/importers/orcid.py
@@ -1,4 +1,3 @@
-
import sys
import fatcat_openapi_client
@@ -8,7 +7,7 @@ from .common import EntityImporter, clean
def value_or_none(e):
if type(e) == dict:
- e = e.get('value')
+ e = e.get("value")
if type(e) == str and len(e) == 0:
e = None
# TODO: this is probably bogus; patched in desperation; remove?
@@ -21,18 +20,17 @@ def value_or_none(e):
return None
return e
-class OrcidImporter(EntityImporter):
+class OrcidImporter(EntityImporter):
def __init__(self, api, **kwargs):
- eg_desc = kwargs.get('editgroup_description',
- "Automated import of ORCID metadata, from official bulk releases.")
- eg_extra = kwargs.get('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.OrcidImporter')
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ eg_desc = kwargs.get(
+ "editgroup_description",
+ "Automated import of ORCID metadata, from official bulk releases.",
+ )
+ eg_extra = kwargs.get("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.OrcidImporter")
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
def want(self, raw_record):
return True
@@ -43,16 +41,16 @@ class OrcidImporter(EntityImporter):
returns a CreatorEntity
"""
- if 'person' not in obj:
+ if "person" not in obj:
return False
- name = obj['person']['name']
+ name = obj["person"]["name"]
if not name:
return None
extra = None
- given = value_or_none(name.get('given-names'))
- sur = value_or_none(name.get('family-name'))
- display = value_or_none(name.get('credit-name'))
+ given = value_or_none(name.get("given-names"))
+ sur = value_or_none(name.get("family-name"))
+ display = value_or_none(name.get("credit-name"))
if display is None:
# TODO: sorry human beings
if given and sur:
@@ -61,7 +59,7 @@ class OrcidImporter(EntityImporter):
display = sur
elif given:
display = given
- orcid = obj['orcid-identifier']['path']
+ orcid = obj["orcid-identifier"]["path"]
if not self.is_orcid(orcid):
sys.stderr.write("Bad ORCID: {}\n".format(orcid))
return None
@@ -74,7 +72,8 @@ class OrcidImporter(EntityImporter):
given_name=clean(given),
surname=clean(sur),
display_name=display,
- extra=extra)
+ extra=extra,
+ )
return ce
def try_update(self, raw_record):
@@ -88,14 +87,17 @@ class OrcidImporter(EntityImporter):
# eventually we'll want to support "updates", but for now just skip if
# entity already exists
if existing:
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
return True
def insert_batch(self, batch):
- self.api.create_creator_auto_batch(fatcat_openapi_client.CreatorAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_creator_auto_batch(
+ fatcat_openapi_client.CreatorAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
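
The OrcidImporter hunks above prefer the ORCID credit-name and fall back to "given family", then to a lone surname or given name. A compact sketch of that fallback, with an abbreviated value_or_none() as in the hunk near the top of the file; the display_name() helper is hypothetical:

def value_or_none(e):
    # abbreviated from the orcid.py hunk above: unwrap {'value': ...} dicts, drop empty strings
    if type(e) == dict:
        e = e.get("value")
    if type(e) == str and len(e) == 0:
        e = None
    return e


def display_name(name: dict):
    """Pick a display name the way the OrcidImporter hunks above do."""
    given = value_or_none(name.get("given-names"))
    sur = value_or_none(name.get("family-name"))
    display = value_or_none(name.get("credit-name"))
    if display is None:
        if given and sur:
            display = "{} {}".format(given, sur)
        elif sur:
            display = sur
        elif given:
            display = given
    return display


assert display_name({"given-names": {"value": "Ada"}, "family-name": {"value": "Lovelace"}}) == "Ada Lovelace"
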
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index 00ad54d0..cfdafcf7 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -1,4 +1,3 @@
-
import datetime
import json
import sys
@@ -13,42 +12,42 @@ from .common import LANG_MAP_MARC, EntityImporter, clean
# from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly
PUBMED_RELEASE_TYPE_MAP = {
- #Adaptive Clinical Trial
+ # Adaptive Clinical Trial
"Address": "speech",
"Autobiography": "book",
- #Bibliography
+ # Bibliography
"Biography": "book",
- #Case Reports
+ # Case Reports
"Classical Article": "article-journal",
- #Clinical Conference
- #Clinical Study
- #Clinical Trial
- #Clinical Trial, Phase I
- #Clinical Trial, Phase II
- #Clinical Trial, Phase III
- #Clinical Trial, Phase IV
- #Clinical Trial Protocol
- #Clinical Trial, Veterinary
- #Collected Works
- #Comparative Study
- #Congress
- #Consensus Development Conference
- #Consensus Development Conference, NIH
- #Controlled Clinical Trial
+ # Clinical Conference
+ # Clinical Study
+ # Clinical Trial
+ # Clinical Trial, Phase I
+ # Clinical Trial, Phase II
+ # Clinical Trial, Phase III
+ # Clinical Trial, Phase IV
+ # Clinical Trial Protocol
+ # Clinical Trial, Veterinary
+ # Collected Works
+ # Comparative Study
+ # Congress
+ # Consensus Development Conference
+ # Consensus Development Conference, NIH
+ # Controlled Clinical Trial
"Dataset": "dataset",
- #Dictionary
- #Directory
- #Duplicate Publication
+ # Dictionary
+ # Directory
+ # Duplicate Publication
"Editorial": "editorial",
- #English Abstract # doesn't indicate that this is abstract-only
- #Equivalence Trial
- #Evaluation Studies
- #Expression of Concern
- #Festschrift
- #Government Document
- #Guideline
+ # English Abstract # doesn't indicate that this is abstract-only
+ # Equivalence Trial
+ # Evaluation Studies
+ # Expression of Concern
+ # Festschrift
+ # Government Document
+ # Guideline
"Historical Article": "article-journal",
- #Interactive Tutorial
+ # Interactive Tutorial
"Interview": "interview",
"Introductory Journal Article": "article-journal",
"Journal Article": "article-journal",
@@ -56,53 +55,65 @@ PUBMED_RELEASE_TYPE_MAP = {
"Legal Case": "legal_case",
"Legislation": "legislation",
"Letter": "letter",
- #Meta-Analysis
- #Multicenter Study
- #News
+ # Meta-Analysis
+ # Multicenter Study
+ # News
"Newspaper Article": "article-newspaper",
- #Observational Study
- #Observational Study, Veterinary
- #Overall
- #Patient Education Handout
- #Periodical Index
- #Personal Narrative
- #Portrait
- #Practice Guideline
- #Pragmatic Clinical Trial
- #Publication Components
- #Publication Formats
- #Publication Type Category
- #Randomized Controlled Trial
- #Research Support, American Recovery and Reinvestment Act
- #Research Support, N.I.H., Extramural
- #Research Support, N.I.H., Intramural
- #Research Support, Non-U.S. Gov't Research Support, U.S. Gov't, Non-P.H.S.
- #Research Support, U.S. Gov't, P.H.S.
- #Review # in the "literature review" sense, not "product review"
- #Scientific Integrity Review
- #Study Characteristics
- #Support of Research
- #Systematic Review
+ # Observational Study
+ # Observational Study, Veterinary
+ # Overall
+ # Patient Education Handout
+ # Periodical Index
+ # Personal Narrative
+ # Portrait
+ # Practice Guideline
+ # Pragmatic Clinical Trial
+ # Publication Components
+ # Publication Formats
+ # Publication Type Category
+ # Randomized Controlled Trial
+ # Research Support, American Recovery and Reinvestment Act
+ # Research Support, N.I.H., Extramural
+ # Research Support, N.I.H., Intramural
+ # Research Support, Non-U.S. Gov't Research Support, U.S. Gov't, Non-P.H.S.
+ # Research Support, U.S. Gov't, P.H.S.
+ # Review # in the "literature review" sense, not "product review"
+ # Scientific Integrity Review
+ # Study Characteristics
+ # Support of Research
+ # Systematic Review
"Technical Report": "report",
- #Twin Study
- #Validation Studies
- #Video-Audio Media
- #Webcasts
+ # Twin Study
+ # Validation Studies
+ # Video-Audio Media
+ # Webcasts
}
MONTH_ABBR_MAP = {
- "Jan": 1, "01": 1,
- "Feb": 2, "02": 2,
- "Mar": 3, "03": 3,
- "Apr": 4, "04": 4,
- "May": 5, "05": 5,
- "Jun": 6, "06": 6,
- "Jul": 7, "07": 7,
- "Aug": 8, "08": 8,
- "Sep": 9, "09": 9,
- "Oct": 10, "10": 10,
- "Nov": 11, "11": 11,
- "Dec": 12, "12": 12,
+ "Jan": 1,
+ "01": 1,
+ "Feb": 2,
+ "02": 2,
+ "Mar": 3,
+ "03": 3,
+ "Apr": 4,
+ "04": 4,
+ "May": 5,
+ "05": 5,
+ "Jun": 6,
+ "06": 6,
+ "Jul": 7,
+ "07": 7,
+ "Aug": 8,
+ "08": 8,
+ "Sep": 9,
+ "09": 9,
+ "Oct": 10,
+ "10": 10,
+ "Nov": 11,
+ "11": 11,
+ "Dec": 12,
+ "12": 12,
}
# From: https://www.ncbi.nlm.nih.gov/books/NBK7249/
@@ -295,11 +306,10 @@ COUNTRY_NAME_MAP = {
"United Kingdom": "gb",
"United States": "us",
"Uruguay": "uy",
-
# Additions from running over large files
"Bosnia and Herzegovina": "ba",
- #"International"
- "China (Republic : 1949- )": "tw", # pretty sure this is tw not cn
+ # "International"
+ "China (Republic : 1949- )": "tw", # pretty sure this is tw not cn
"Russia (Federation)": "ru",
"Scotland": "gb",
"England": "gb",
@@ -320,18 +330,21 @@ class PubmedImporter(EntityImporter):
def __init__(self, api, issn_map_file, lookup_refs=True, **kwargs):
- eg_desc = kwargs.get('editgroup_description',
- "Automated import of PubMed/MEDLINE XML metadata")
- eg_extra = kwargs.get('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.PubmedImporter')
- super().__init__(api,
+ eg_desc = kwargs.get(
+ "editgroup_description", "Automated import of PubMed/MEDLINE XML metadata"
+ )
+ eg_extra = kwargs.get("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.PubmedImporter")
+ super().__init__(
+ api,
issn_map_file=issn_map_file,
editgroup_description=eg_desc,
editgroup_extra=eg_extra,
- **kwargs)
+ **kwargs
+ )
self.lookup_refs = lookup_refs
- self.create_containers = kwargs.get('create_containers', True)
+ self.create_containers = kwargs.get("create_containers", True)
self.read_issn_map_file(issn_map_file)
def want(self, obj):
@@ -365,15 +378,15 @@ class PubmedImporter(EntityImporter):
release_type = PUBMED_RELEASE_TYPE_MAP[pub_type.string]
break
if pub_types:
- extra_pubmed['pub_types'] = pub_types
+ extra_pubmed["pub_types"] = pub_types
if medline.Article.PublicationTypeList.find(string="Retraction of Publication"):
release_type = "retraction"
retraction_of = medline.find("CommentsCorrections", RefType="RetractionOf")
if retraction_of:
if retraction_of.RefSource:
- extra_pubmed['retraction_of_raw'] = retraction_of.RefSource.string
+ extra_pubmed["retraction_of_raw"] = retraction_of.RefSource.string
if retraction_of.PMID:
- extra_pubmed['retraction_of_pmid'] = retraction_of.PMID.string
+ extra_pubmed["retraction_of_pmid"] = retraction_of.PMID.string
# everything in medline is published
release_stage = "published"
@@ -388,18 +401,18 @@ class PubmedImporter(EntityImporter):
elif medline.find("CommentsCorrections", RefType="ExpressionOfConcernIn"):
withdrawn_status = "concern"
- pages = medline.find('MedlinePgn')
+ pages = medline.find("MedlinePgn")
if pages:
pages = pages.string
- title = medline.Article.ArticleTitle.get_text() # always present
+ title = medline.Article.ArticleTitle.get_text() # always present
if title:
- title = title.replace('\n', ' ')
- if title.endswith('.'):
+ title = title.replace("\n", " ")
+ if title.endswith("."):
title = title[:-1]
# this hides some "special" titles, but the vast majority are
# translations; translations don't always include the original_title
- if title.startswith('[') and title.endswith(']'):
+ if title.startswith("[") and title.endswith("]"):
title = title[1:-1]
else:
# will filter out later
@@ -408,8 +421,8 @@ class PubmedImporter(EntityImporter):
original_title = medline.Article.find("VernacularTitle", recurse=False)
if original_title:
original_title = original_title.get_text() or None
- original_title = original_title.replace('\n', ' ')
- if original_title and original_title.endswith('.'):
+ original_title = original_title.replace("\n", " ")
+ if original_title and original_title.endswith("."):
original_title = original_title[:-1]
if original_title and not title:
@@ -428,7 +441,9 @@ class PubmedImporter(EntityImporter):
else:
language = LANG_MAP_MARC.get(language)
if not language and not (medline.Article.Language.get_text() in LANG_MAP_MARC):
- warnings.warn("MISSING MARC LANG: {}".format(medline.Article.Language.string))
+ warnings.warn(
+ "MISSING MARC LANG: {}".format(medline.Article.Language.string)
+ )
### Journal/Issue Metadata
# MedlineJournalInfo is always present
@@ -441,9 +456,9 @@ class PubmedImporter(EntityImporter):
country_name = mji.Country.string.strip()
country_code = COUNTRY_NAME_MAP.get(country_name)
if country_code:
- container_extra['country'] = country_code
+ container_extra["country"] = country_code
elif country_name:
- container_extra['country_name'] = country_name
+ container_extra["country_name"] = country_name
if mji.find("ISSNLinking"):
issnl = mji.ISSNLinking.string
@@ -462,7 +477,7 @@ class PubmedImporter(EntityImporter):
if issnl:
container_id = self.lookup_issnl(issnl)
- pub_date = medline.Article.find('ArticleDate')
+ pub_date = medline.Article.find("ArticleDate")
if not pub_date:
pub_date = journal.PubDate
if not pub_date:
@@ -476,7 +491,8 @@ class PubmedImporter(EntityImporter):
release_date = datetime.date(
release_year,
MONTH_ABBR_MAP[pub_date.Month.string],
- int(pub_date.Day.string))
+ int(pub_date.Day.string),
+ )
release_date = release_date.isoformat()
except ValueError as ve:
print("bad date, skipping: {}".format(ve), file=sys.stderr)
@@ -486,25 +502,35 @@ class PubmedImporter(EntityImporter):
if len(medline_date) >= 4 and medline_date[:4].isdigit():
release_year = int(medline_date[:4])
if release_year < 1300 or release_year > 2040:
- print("bad medline year, skipping: {}".format(release_year), file=sys.stderr)
+ print(
+ "bad medline year, skipping: {}".format(release_year), file=sys.stderr
+ )
release_year = None
else:
- print("unparsable medline date, skipping: {}".format(medline_date), file=sys.stderr)
+ print(
+ "unparsable medline date, skipping: {}".format(medline_date),
+ file=sys.stderr,
+ )
if journal.find("Title"):
container_name = journal.Title.get_text()
- if (container_id is None and self.create_containers and (issnl is not None)
- and container_name):
+ if (
+ container_id is None
+ and self.create_containers
+ and (issnl is not None)
+ and container_name
+ ):
# name, type, publisher, issnl
# extra: original_name, languages, country
ce = fatcat_openapi_client.ContainerEntity(
name=container_name,
- container_type='journal',
- #NOTE: publisher not included
+ container_type="journal",
+ # NOTE: publisher not included
issnl=issnl,
issnp=issnp,
- extra=(container_extra or None))
+ extra=(container_extra or None),
+ )
ce_edit = self.create_container(ce)
container_id = ce_edit.ident
self._issnl_id_map[issnl] = container_id
@@ -521,8 +547,10 @@ class PubmedImporter(EntityImporter):
# "All abstracts are in English"
abstracts = []
primary_abstract = medline.find("Abstract")
- if primary_abstract and primary_abstract.AbstractText.get('NlmCategory'):
- joined = "\n".join([m.get_text() for m in primary_abstract.find_all("AbstractText")])
+ if primary_abstract and primary_abstract.AbstractText.get("NlmCategory"):
+ joined = "\n".join(
+ [m.get_text() for m in primary_abstract.find_all("AbstractText")]
+ )
abst = fatcat_openapi_client.ReleaseAbstract(
content=joined,
mimetype="text/plain",
@@ -539,7 +567,7 @@ class PubmedImporter(EntityImporter):
)
if abst.content:
abstracts.append(abst)
- if abstract.find('math'):
+ if abstract.find("math"):
abst = fatcat_openapi_client.ReleaseAbstract(
# strip the <AbstractText> tags
content=str(abstract)[14:-15],
@@ -551,8 +579,8 @@ class PubmedImporter(EntityImporter):
other_abstracts = medline.find_all("OtherAbstract")
for other in other_abstracts:
lang = "en"
- if other.get('Language'):
- lang = LANG_MAP_MARC.get(other['Language'])
+ if other.get("Language"):
+ lang = LANG_MAP_MARC.get(other["Language"])
abst = fatcat_openapi_client.ReleaseAbstract(
content=other.AbstractText.get_text().strip(),
mimetype="text/plain",
@@ -572,15 +600,15 @@ class PubmedImporter(EntityImporter):
surname = None
raw_name = None
if author.ForeName:
- given_name = author.ForeName.get_text().replace('\n', ' ')
+ given_name = author.ForeName.get_text().replace("\n", " ")
if author.LastName:
- surname = author.LastName.get_text().replace('\n', ' ')
+ surname = author.LastName.get_text().replace("\n", " ")
if given_name and surname:
raw_name = "{} {}".format(given_name, surname)
elif surname:
raw_name = surname
if not raw_name and author.CollectiveName and author.CollectiveName.get_text():
- raw_name = author.CollectiveName.get_text().replace('\n', ' ')
+ raw_name = author.CollectiveName.get_text().replace("\n", " ")
contrib_extra = dict()
orcid = author.find("Identifier", Source="ORCID")
if orcid:
@@ -590,7 +618,7 @@ class PubmedImporter(EntityImporter):
orcid = orcid.replace("http://orcid.org/", "")
elif orcid.startswith("https://orcid.org/"):
orcid = orcid.replace("https://orcid.org/", "")
- elif '-' not in orcid:
+ elif "-" not in orcid:
orcid = "{}-{}-{}-{}".format(
orcid[0:4],
orcid[4:8],
@@ -598,27 +626,31 @@ class PubmedImporter(EntityImporter):
orcid[12:16],
)
creator_id = self.lookup_orcid(orcid)
- contrib_extra['orcid'] = orcid
+ contrib_extra["orcid"] = orcid
affiliations = author.find_all("Affiliation")
raw_affiliation = None
if affiliations:
- raw_affiliation = affiliations[0].get_text().replace('\n', ' ')
+ raw_affiliation = affiliations[0].get_text().replace("\n", " ")
if len(affiliations) > 1:
- contrib_extra['more_affiliations'] = [ra.get_text().replace('\n', ' ') for ra in affiliations[1:]]
+ contrib_extra["more_affiliations"] = [
+ ra.get_text().replace("\n", " ") for ra in affiliations[1:]
+ ]
if author.find("EqualContrib"):
# TODO: schema for this?
- contrib_extra['equal'] = True
- contribs.append(fatcat_openapi_client.ReleaseContrib(
- raw_name=raw_name,
- given_name=given_name,
- surname=surname,
- role="author",
- raw_affiliation=raw_affiliation,
- creator_id=creator_id,
- extra=contrib_extra,
- ))
-
- if medline.AuthorList['CompleteYN'] == 'N':
+ contrib_extra["equal"] = True
+ contribs.append(
+ fatcat_openapi_client.ReleaseContrib(
+ raw_name=raw_name,
+ given_name=given_name,
+ surname=surname,
+ role="author",
+ raw_affiliation=raw_affiliation,
+ creator_id=creator_id,
+ extra=contrib_extra,
+ )
+ )
+
+ if medline.AuthorList["CompleteYN"] == "N":
contribs.append(fatcat_openapi_client.ReleaseContrib(raw_name="et al."))
for i, contrib in enumerate(contribs):
@@ -633,7 +665,7 @@ class PubmedImporter(EntityImporter):
# note that Reference always exists within a ReferenceList, but
# that there may be multiple ReferenceList (eg, sometimes one per
# Reference)
- for ref in pubmed.find_all('Reference'):
+ for ref in pubmed.find_all("Reference"):
ref_extra = dict()
ref_doi = ref.find("ArticleId", IdType="doi")
if ref_doi:
@@ -643,22 +675,24 @@ class PubmedImporter(EntityImporter):
ref_pmid = clean_pmid(ref_pmid.string)
ref_release_id = None
if ref_doi:
- ref_extra['doi'] = ref_doi
+ ref_extra["doi"] = ref_doi
if self.lookup_refs:
ref_release_id = self.lookup_doi(ref_doi)
if ref_pmid:
- ref_extra['pmid'] = ref_pmid
+ ref_extra["pmid"] = ref_pmid
if self.lookup_refs:
ref_release_id = self.lookup_pmid(ref_pmid)
ref_raw = ref.Citation
if ref_raw:
- ref_extra['unstructured'] = ref_raw.get_text()
+ ref_extra["unstructured"] = ref_raw.get_text()
if not ref_extra:
ref_extra = None
- refs.append(fatcat_openapi_client.ReleaseRef(
- target_release_id=ref_release_id,
- extra=ref_extra,
- ))
+ refs.append(
+ fatcat_openapi_client.ReleaseRef(
+ target_release_id=ref_release_id,
+ extra=ref_extra,
+ )
+ )
if not refs:
refs = None
@@ -669,7 +703,7 @@ class PubmedImporter(EntityImporter):
# group-title
# pubmed: retraction refs
if extra_pubmed:
- extra['pubmed'] = extra_pubmed
+ extra["pubmed"] = extra_pubmed
if not extra:
extra = None
@@ -690,14 +724,14 @@ class PubmedImporter(EntityImporter):
doi=doi,
pmid=pmid,
pmcid=pmcid,
- #isbn13 # never in Article
+ # isbn13 # never in Article
),
volume=volume,
issue=issue,
pages=pages,
- #publisher # not included?
+ # publisher # not included?
language=language,
- #license_slug # not in MEDLINE
+ # license_slug # not in MEDLINE
abstracts=abstracts,
contribs=contribs,
refs=refs,
@@ -725,21 +759,22 @@ class PubmedImporter(EntityImporter):
raise err
if existing and existing.ext_ids.pmid and existing.ext_ids.pmid != re.ext_ids.pmid:
warn_str = "PMID/DOI mismatch: release {}, pmid {} != {}".format(
- existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid)
+ existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid
+ )
warnings.warn(warn_str)
- self.counts['warn-pmid-doi-mismatch'] += 1
+ self.counts["warn-pmid-doi-mismatch"] += 1
# don't clobber DOI, but do group together
re.ext_ids.doi = None
re.work_id = existing.work_id
if existing and not self.do_updates:
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
if existing and existing.ext_ids.pmid and (existing.refs or not re.refs):
# TODO: any other reasons to do an update?
# don't update if it already has PMID
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
elif existing:
# but do update if only DOI was set
@@ -750,12 +785,12 @@ class PubmedImporter(EntityImporter):
existing.container_id = existing.container_id or re.container_id
existing.refs = existing.refs or re.refs
existing.abstracts = existing.abstracts or re.abstracts
- existing.extra['pubmed'] = re.extra['pubmed']
+ existing.extra["pubmed"] = re.extra["pubmed"]
# fix stub titles
if existing.title in [
- "OUP accepted manuscript",
- ]:
+ "OUP accepted manuscript",
+ ]:
existing.title = re.title
existing.original_title = existing.original_title or re.original_title
@@ -770,8 +805,8 @@ class PubmedImporter(EntityImporter):
existing.language = existing.language or re.language
# update subtitle in-place first
- if not existing.subtitle and existing.extra.get('subtitle'):
- subtitle = existing.extra.pop('subtitle')
+ if not existing.subtitle and existing.extra.get("subtitle"):
+ subtitle = existing.extra.pop("subtitle")
if type(subtitle) == list:
subtitle = subtitle[0]
if subtitle:
@@ -781,13 +816,13 @@ class PubmedImporter(EntityImporter):
try:
self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
- self.counts['update'] += 1
+ self.counts["update"] += 1
except fatcat_openapi_client.rest.ApiException as err:
# there is a code path where we try to update the same release
# twice in a row; if that happens, just skip
# NOTE: API behavior might change in the future?
if "release_edit_editgroup_id_ident_id_key" in err.body:
- self.counts['skip-update-conflict'] += 1
+ self.counts["skip-update-conflict"] += 1
return False
else:
raise err
@@ -797,11 +832,14 @@ class PubmedImporter(EntityImporter):
return True
def insert_batch(self, batch):
- self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_release_auto_batch(
+ fatcat_openapi_client.ReleaseAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
def parse_file(self, handle):
@@ -812,8 +850,9 @@ class PubmedImporter(EntityImporter):
for article in soup.find_all("PubmedArticle"):
resp = self.parse_record(article)
print(json.dumps(resp))
- #sys.exit(-1)
+ # sys.exit(-1)
+
-if __name__=='__main__':
+if __name__ == "__main__":
parser = PubmedImporter(None, None)
parser.parse_file(open(sys.argv[1]))
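The hunks above are purely mechanical rewrapping by black. As a rough sketch (not part of the commit), the same transformation can be reproduced with black's Python API; the line length of 96 is an assumption inferred from where the wrapped lines land, not something stated in this diff.

import black

# One of the over-long calls from the importer above, squeezed back onto one line.
src = (
    'contribs.append(fatcat_openapi_client.ReleaseContrib(raw_name=raw_name, '
    'given_name=given_name, surname=surname, role="author", '
    'raw_affiliation=raw_affiliation, creator_id=creator_id, extra=contrib_extra))\n'
)
# format_str() should re-wrap the call across lines with a trailing comma,
# roughly matching the "+" lines in the hunk above.
print(black.format_str(src, mode=black.Mode(line_length=96)))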
diff --git a/python/fatcat_tools/importers/shadow.py b/python/fatcat_tools/importers/shadow.py
index 77205cee..78eeec7a 100644
--- a/python/fatcat_tools/importers/shadow.py
+++ b/python/fatcat_tools/importers/shadow.py
@@ -1,4 +1,3 @@
-
import fatcat_openapi_client
from fatcat_tools.normal import clean_doi, clean_isbn13, clean_pmid
@@ -30,25 +29,25 @@ class ShadowLibraryImporter(EntityImporter):
def __init__(self, api, **kwargs):
- eg_desc = kwargs.pop('editgroup_description', None) or "Import of 'Shadow Library' file/release matches"
- eg_extra = kwargs.pop('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ShadowLibraryImporter')
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ eg_desc = (
+ kwargs.pop("editgroup_description", None)
+ or "Import of 'Shadow Library' file/release matches"
+ )
+ eg_extra = kwargs.pop("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ShadowLibraryImporter")
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
self.default_link_rel = kwargs.get("default_link_rel", "web")
def want(self, raw_record):
"""
Only want to import records with complete file-level metadata
"""
- fm = raw_record['file_meta']
- if not (fm['mimetype'] and fm['md5hex'] and fm['sha256hex'] and fm['size_bytes']):
- self.counts['skip-file-meta-incomplete'] += 1
+ fm = raw_record["file_meta"]
+ if not (fm["mimetype"] and fm["md5hex"] and fm["sha256hex"] and fm["size_bytes"]):
+ self.counts["skip-file-meta-incomplete"] += 1
return False
- if fm['mimetype'] != 'application/pdf':
- self.counts['skip-not-pdf'] += 1
+ if fm["mimetype"] != "application/pdf":
+ self.counts["skip-not-pdf"] += 1
return False
return True
@@ -57,23 +56,23 @@ class ShadowLibraryImporter(EntityImporter):
We do the release lookup in this method. Try DOI, then PMID, last ISBN13.
"""
- shadow_corpus = obj['shadow']['shadow_corpus']
+ shadow_corpus = obj["shadow"]["shadow_corpus"]
assert shadow_corpus == shadow_corpus.strip().lower()
- doi = clean_doi(obj['shadow'].get('doi'))
- pmid = clean_pmid(obj['shadow'].get('pmid'))
- isbn13 = clean_isbn13(obj['shadow'].get('isbn13'))
- shadow_id = obj['shadow'].get('shadow_id').strip()
+ doi = clean_doi(obj["shadow"].get("doi"))
+ pmid = clean_pmid(obj["shadow"].get("pmid"))
+ isbn13 = clean_isbn13(obj["shadow"].get("isbn13"))
+ shadow_id = obj["shadow"].get("shadow_id").strip()
assert shadow_id
- extra = { '{}_id'.format(shadow_corpus): shadow_id }
- for (ext_type, ext_id) in [('doi', doi), ('pmid', pmid), ('isbn13', isbn13)]:
+ extra = {"{}_id".format(shadow_corpus): shadow_id}
+ for (ext_type, ext_id) in [("doi", doi), ("pmid", pmid), ("isbn13", isbn13)]:
if not ext_id:
continue
- extra['{}_{}'.format(shadow_corpus, ext_type)] = ext_id
+ extra["{}_{}".format(shadow_corpus, ext_type)] = ext_id
# lookup release via several idents
re = None
- for (ext_type, ext_id) in [('doi', doi), ('pmid', pmid), ('isbn13', isbn13)]:
+ for (ext_type, ext_id) in [("doi", doi), ("pmid", pmid), ("isbn13", isbn13)]:
if not ext_id:
continue
try:
@@ -86,29 +85,31 @@ class ShadowLibraryImporter(EntityImporter):
break
if not re:
- self.counts['skip-release-not-found'] += 1
+ self.counts["skip-release-not-found"] += 1
return None
- release_ids = [re.ident,]
+ release_ids = [
+ re.ident,
+ ]
# parse single CDX into URLs (if exists)
urls = []
- if obj.get('cdx'):
- url = make_rel_url(obj['cdx']['url'], default_link_rel=self.default_link_rel)
+ if obj.get("cdx"):
+ url = make_rel_url(obj["cdx"]["url"], default_link_rel=self.default_link_rel)
if url is not None:
urls.append(url)
wayback = "https://web.archive.org/web/{}/{}".format(
- obj['cdx']['datetime'],
- obj['cdx']['url'])
+ obj["cdx"]["datetime"], obj["cdx"]["url"]
+ )
urls.append(("webarchive", wayback))
urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls]
fe = fatcat_openapi_client.FileEntity(
- md5=obj['file_meta']['md5hex'],
- sha1=obj['file_meta']['sha1hex'],
- sha256=obj['file_meta']['sha256hex'],
- size=int(obj['file_meta']['size_bytes']),
- mimetype=obj['file_meta']['mimetype'] or None,
+ md5=obj["file_meta"]["md5hex"],
+ sha1=obj["file_meta"]["sha1hex"],
+ sha256=obj["file_meta"]["sha256hex"],
+ size=int(obj["file_meta"]["size_bytes"]),
+ mimetype=obj["file_meta"]["mimetype"] or None,
release_ids=release_ids,
urls=urls,
extra=dict(shadows=extra),
@@ -130,45 +131,50 @@ class ShadowLibraryImporter(EntityImporter):
if not existing.extra:
existing.extra = {}
- if existing.extra.get('shadows') and list(fe.extra['shadows'].keys())[0] in existing.extra['shadows']:
+ if (
+ existing.extra.get("shadows")
+ and list(fe.extra["shadows"].keys())[0] in existing.extra["shadows"]
+ ):
# already imported from this shadow library; skip
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
# check for edit conflicts
if existing.ident in [e.ident for e in self._edits_inflight]:
- self.counts['skip-update-inflight'] += 1
+ self.counts["skip-update-inflight"] += 1
return False
if fe.sha1 in [e.sha1 for e in self._edits_inflight]:
raise Exception("Inflight insert; shouldn't happen")
# minimum viable "existing" URL cleanup to fix dupes and broken links:
# remove 'None' wayback URLs, and set archive.org rel 'archive'
- existing.urls = [u for u in existing.urls if not ('://web.archive.org/web/None/' in u.url)]
+ existing.urls = [
+ u for u in existing.urls if not ("://web.archive.org/web/None/" in u.url)
+ ]
for i in range(len(existing.urls)):
u = existing.urls[i]
- if u.rel == 'repository' and '://archive.org/download/' in u.url:
- existing.urls[i].rel = 'archive'
- if u.rel == 'social':
- u.rel = 'academicsocial'
+ if u.rel == "repository" and "://archive.org/download/" in u.url:
+ existing.urls[i].rel = "archive"
+ if u.rel == "social":
+ u.rel = "academicsocial"
# merge the existing into this one and update
merged_urls = {}
for u in fe.urls + existing.urls:
merged_urls[u.url] = u
existing.urls = list(merged_urls.values())
- if not existing.extra.get('shadows'):
- existing.extra['shadows'] = fe.extra['shadows']
+ if not existing.extra.get("shadows"):
+ existing.extra["shadows"] = fe.extra["shadows"]
else:
- existing.extra['shadows'].update(fe.extra['shadows'])
+ existing.extra["shadows"].update(fe.extra["shadows"])
# do these "plus ones" because we really want to do these updates when possible
if len(existing.urls) > SANE_MAX_URLS + 1:
- self.counts['skip-update-too-many-url'] += 1
+ self.counts["skip-update-too-many-url"] += 1
return None
existing.release_ids = list(set(fe.release_ids + existing.release_ids))
if len(existing.release_ids) > SANE_MAX_RELEASES + 1:
- self.counts['skip-update-too-many-releases'] += 1
+ self.counts["skip-update-too-many-releases"] += 1
return None
existing.mimetype = existing.mimetype or fe.mimetype
existing.size = existing.size or fe.size
@@ -180,12 +186,15 @@ class ShadowLibraryImporter(EntityImporter):
# group-level de-dupe
edit.sha1 = existing.sha1
self._edits_inflight.append(edit)
- self.counts['update'] += 1
+ self.counts["update"] += 1
return False
def insert_batch(self, batch):
- self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_file_auto_batch(
+ fatcat_openapi_client.FileAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
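For reference, a hypothetical input record with the fields the want() and parse_record() methods above read; every value here is made up for illustration:

record = {
    "shadow": {
        "shadow_corpus": "scimag",  # corpus name is illustrative
        "shadow_id": "12345",
        "doi": "10.1234/example-doi",
        "pmid": None,
        "isbn13": None,
    },
    "file_meta": {
        "mimetype": "application/pdf",
        "md5hex": "d41d8cd98f00b204e9800998ecf8427e",
        "sha1hex": "da39a3ee5e6b4b0d3255bfef95601890afd80709",
        "sha256hex": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
        "size_bytes": 1024,
    },
    "cdx": {
        "url": "https://example.com/fulltext.pdf",
        "datetime": "20200101000000",
    },
}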
diff --git a/python/fatcat_tools/importers/wayback_static.py b/python/fatcat_tools/importers/wayback_static.py
index 196f86ff..22fefad3 100755
--- a/python/fatcat_tools/importers/wayback_static.py
+++ b/python/fatcat_tools/importers/wayback_static.py
@@ -33,22 +33,23 @@ REQ_SESSION = requests.Session()
def parse_wbm_url(url):
"""Takes a wayback machine URL, and returns a tuple:
- (timestamp, datetime, original_url)
+ (timestamp, datetime, original_url)
"""
- chunks = url.split('/')
+ chunks = url.split("/")
assert len(chunks) >= 6
- assert chunks[2] == 'web.archive.org'
- assert chunks[3] == 'web'
- return (chunks[4],
- parse_wbm_timestamp(chunks[4]),
- '/'.join(chunks[5:]))
+ assert chunks[2] == "web.archive.org"
+ assert chunks[3] == "web"
+ return (chunks[4], parse_wbm_timestamp(chunks[4]), "/".join(chunks[5:]))
+
def test_parse_wbm_url():
u = "http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html"
assert parse_wbm_url(u) == (
"20010712114837",
datetime.datetime(2001, 7, 12, 11, 48, 37),
- "http://www.dlib.org/dlib/june01/reich/06reich.html")
+ "http://www.dlib.org/dlib/june01/reich/06reich.html",
+ )
+
def parse_wbm_timestamp(timestamp):
"""
@@ -56,7 +57,7 @@ def parse_wbm_timestamp(timestamp):
python datetime object (UTC)
"""
# strip any "im_" or "id_" suffix
- if timestamp.endswith('_'):
+ if timestamp.endswith("_"):
timestamp = timestamp[:-3]
# inflexible; require the full second-precision timestamp
assert len(timestamp) == 14
@@ -66,11 +67,13 @@ def parse_wbm_timestamp(timestamp):
day=int(timestamp[6:8]),
hour=int(timestamp[8:10]),
minute=int(timestamp[10:12]),
- second=int(timestamp[12:14]))
+ second=int(timestamp[12:14]),
+ )
+
def test_parse_wbm_timestamp():
- assert parse_wbm_timestamp("20010712114837") == \
- datetime.datetime(2001, 7, 12, 11, 48, 37)
+ assert parse_wbm_timestamp("20010712114837") == datetime.datetime(2001, 7, 12, 11, 48, 37)
+
def fetch_wbm(url):
resp = REQ_SESSION.get(url)
@@ -78,31 +81,35 @@ def fetch_wbm(url):
assert resp.content
return resp.content
+
def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None):
sys.stderr.write(embed_url + "\n")
- assert embed_url.startswith('/web/')
- embed_url = embed_url.split('/')
+ assert embed_url.startswith("/web/")
+ embed_url = embed_url.split("/")
timestamp = embed_url[2]
- if timestamp.endswith('_'):
+ if timestamp.endswith("_"):
timestamp = timestamp[:-3]
- url = '/'.join(embed_url[3:])
- #print((timestamp, url))
- resp = REQ_SESSION.get(CDX_API_BASE, params=dict(
- url=url,
- closest=timestamp,
- sort="closest",
- resolveRevisits="true",
- matchType="exact",
- limit=1,
- ))
+ url = "/".join(embed_url[3:])
+ # print((timestamp, url))
+ resp = REQ_SESSION.get(
+ CDX_API_BASE,
+ params=dict(
+ url=url,
+ closest=timestamp,
+ sort="closest",
+ resolveRevisits="true",
+ matchType="exact",
+ limit=1,
+ ),
+ )
resp.raise_for_status()
- #print(resp.url)
+ # print(resp.url)
if resp.content:
- hit = resp.content.decode('utf-8').split('\n')[0]
+ hit = resp.content.decode("utf-8").split("\n")[0]
if cdx_output:
cdx_output.write(hit + "\n")
- cdx = hit.split(' ')
- cdx = [x if (x and x != '-') else None for x in cdx]
+ cdx = hit.split(" ")
+ cdx = [x if (x and x != "-") else None for x in cdx]
webcapture_cdx = WebcaptureCdxLine(
surt=cdx[0],
timestamp=parse_wbm_timestamp(cdx[1]).isoformat() + "Z",
@@ -113,9 +120,9 @@ def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None):
sha256=None,
)
if verify_hashes:
- resp = REQ_SESSION.get(GWB_URL_BASE + "/{}id_/{}".format(
- cdx[1], # raw timestamp
- webcapture_cdx.url))
+ resp = REQ_SESSION.get(
+ GWB_URL_BASE + "/{}id_/{}".format(cdx[1], webcapture_cdx.url) # raw timestamp
+ )
resp.raise_for_status()
assert webcapture_cdx.sha1 == hashlib.sha1(resp.content).digest().hex()
webcapture_cdx.sha256 = hashlib.sha256(resp.content).digest().hex()
@@ -124,47 +131,50 @@ def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None):
else:
return None
+
def wayback_url_to_relative(url):
"""
Wayback URLs can be relative or absolute in rewritten documents. This
function converts any form of rewritten URL to a relative (to
web.archive.org) one, or returns None if it isn't a rewritten URL at all.
"""
- if url.startswith('https://web.archive.org/'):
+ if url.startswith("https://web.archive.org/"):
url = url[23:]
- elif url.startswith('http://web.archive.org/'):
+ elif url.startswith("http://web.archive.org/"):
url = url[22:]
- if url.startswith('/web/'):
+ if url.startswith("/web/"):
return url
else:
return None
+
def extract_embeds(soup):
embeds = set()
# <link href="">
- for tag in soup.find_all('link', href=True):
- if tag['rel'] not in ('stylesheet',):
+ for tag in soup.find_all("link", href=True):
+ if tag["rel"] not in ("stylesheet",):
continue
- url = wayback_url_to_relative(tag['href'])
+ url = wayback_url_to_relative(tag["href"])
if url:
embeds.add(url)
# <img src="">
- for tag in soup.find_all('img', src=True):
- url = wayback_url_to_relative(tag['src'])
+ for tag in soup.find_all("img", src=True):
+ url = wayback_url_to_relative(tag["src"])
if url:
embeds.add(url)
# <script src="">
- for tag in soup.find_all('script', src=True):
- url = wayback_url_to_relative(tag['src'])
+ for tag in soup.find_all("script", src=True):
+ url = wayback_url_to_relative(tag["src"])
if url:
embeds.add(url)
return list(embeds)
+
def static_wayback_webcapture(wayback_url, cdx_output=None):
"""
Given a complete wayback machine capture URL, like:
@@ -177,36 +187,40 @@ def static_wayback_webcapture(wayback_url, cdx_output=None):
wbm_html = fetch_wbm(wayback_url)
raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url)
- #with open(rewritten_path, 'r') as fp:
+ # with open(rewritten_path, 'r') as fp:
# soup = BeautifulSoup(fp, "lxml")
soup = BeautifulSoup(wbm_html, "lxml")
embeds = extract_embeds(soup)
- cdx_obj = lookup_cdx("/web/{}/{}".format(raw_timestamp, original_url),
- cdx_output=cdx_output)
+ cdx_obj = lookup_cdx(
+ "/web/{}/{}".format(raw_timestamp, original_url), cdx_output=cdx_output
+ )
cdx_list = [cdx_obj]
for url in embeds:
cdx_obj = lookup_cdx(url, cdx_output=cdx_output)
cdx_list.append(cdx_obj)
- archive_urls = [WebcaptureUrl(
- rel="wayback",
- url="https://web.archive.org/web/",
- )]
+ archive_urls = [
+ WebcaptureUrl(
+ rel="wayback",
+ url="https://web.archive.org/web/",
+ )
+ ]
wc = WebcaptureEntity(
cdx=cdx_list,
timestamp=timestamp.isoformat() + "Z",
original_url=original_url,
archive_urls=archive_urls,
- release_ids=None)
+ release_ids=None,
+ )
return wc
+
def auto_wayback_static(api, release_id, wayback_url, editgroup_id=None):
"""
Returns a tuple: (editgroup_id, edit). If failed, both are None
"""
raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url)
- git_rev = subprocess.check_output(
- ["git", "describe", "--always"]).strip().decode('utf-8')
+ git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
release = api.get_release(release_id, expand="webcaptures")
@@ -214,37 +228,44 @@ def auto_wayback_static(api, release_id, wayback_url, editgroup_id=None):
for wc in release.webcaptures:
if wc.original_url == original_url and wc.timestamp.date() == timestamp.date():
# skipping: already existed
- print("release {} already had webcapture {} {}".format(
- release_id, raw_timestamp, original_url))
+ print(
+ "release {} already had webcapture {} {}".format(
+ release_id, raw_timestamp, original_url
+ )
+ )
return (None, None)
wc = static_wayback_webcapture(wayback_url)
assert len(wc.cdx) >= 1
wc.release_ids = [release_id]
if not editgroup_id:
- eg = api.create_editgroup(Editgroup(
- description="One-off import of static web content from wayback machine",
- extra=dict(
- git_rev=git_rev,
- agent="fatcat_tools.auto_wayback_static")))
+ eg = api.create_editgroup(
+ Editgroup(
+ description="One-off import of static web content from wayback machine",
+ extra=dict(git_rev=git_rev, agent="fatcat_tools.auto_wayback_static"),
+ )
+ )
editgroup_id = eg.editgroup_id
edit = api.create_webcapture(eg.editgroup_id, wc)
return (editgroup_id, edit)
+
def main():
parser = argparse.ArgumentParser()
- parser.add_argument('--verbose',
- action='store_true',
- help="verbose output")
- parser.add_argument('wayback_url',
- type=str,
- help="URL of wayback capture to extract from")
- parser.add_argument('--json-output',
- type=argparse.FileType('w'), default=sys.stdout,
- help="where to write out webcapture entity (as JSON)")
- parser.add_argument('--cdx-output',
- type=argparse.FileType('w'), default=None,
- help="(optional) file to write out CDX stub")
+ parser.add_argument("--verbose", action="store_true", help="verbose output")
+ parser.add_argument("wayback_url", type=str, help="URL of wayback capture to extract from")
+ parser.add_argument(
+ "--json-output",
+ type=argparse.FileType("w"),
+ default=sys.stdout,
+ help="where to write out webcapture entity (as JSON)",
+ )
+ parser.add_argument(
+ "--cdx-output",
+ type=argparse.FileType("w"),
+ default=None,
+ help="(optional) file to write out CDX stub",
+ )
args = parser.parse_args()
@@ -254,5 +275,6 @@ def main():
wc_dict = api_client.sanitize_for_serialization(wc)
print(json.dumps(wc_dict))
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
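The test functions in this hunk double as usage documentation. A short sketch that exercises the URL parsers directly (module path inferred from the file location above); the expected values mirror the asserts shown in the diff:

import datetime
from fatcat_tools.importers.wayback_static import parse_wbm_timestamp, parse_wbm_url

ts, dt, original_url = parse_wbm_url(
    "http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html"
)
assert ts == "20010712114837"
assert dt == datetime.datetime(2001, 7, 12, 11, 48, 37)
assert original_url == "http://www.dlib.org/dlib/june01/reich/06reich.html"
assert parse_wbm_timestamp("20010712114837") == dt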
diff --git a/python/fatcat_tools/kafka.py b/python/fatcat_tools/kafka.py
index 32749db2..2a4451ad 100644
--- a/python/fatcat_tools/kafka.py
+++ b/python/fatcat_tools/kafka.py
@@ -1,4 +1,3 @@
-
from confluent_kafka import KafkaException, Producer
@@ -9,14 +8,15 @@ def kafka_fail_fast(err, msg):
# TODO: should it be sys.exit(-1)?
raise KafkaException(err)
+
def simple_kafka_producer(kafka_hosts):
kafka_config = {
- 'bootstrap.servers': kafka_hosts,
- 'message.max.bytes': 20000000, # ~20 MBytes; broker-side max is ~50 MBytes
- 'delivery.report.only.error': True,
- 'default.topic.config': {
- 'request.required.acks': -1,
+ "bootstrap.servers": kafka_hosts,
+ "message.max.bytes": 20000000, # ~20 MBytes; broker-side max is ~50 MBytes
+ "delivery.report.only.error": True,
+ "default.topic.config": {
+ "request.required.acks": -1,
},
}
return Producer(kafka_config)
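A hedged usage sketch for this helper; the broker address and topic name below are placeholders, not values from the commit:

from fatcat_tools.kafka import kafka_fail_fast, simple_kafka_producer

producer = simple_kafka_producer("localhost:9092")  # assumed broker address
producer.produce(
    "fatcat-example-topic",  # hypothetical topic
    b'{"hello": "fatcat"}',
    on_delivery=kafka_fail_fast,  # raises KafkaException if delivery fails
)
producer.flush()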
diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py
index 9b65e768..12c58829 100644
--- a/python/fatcat_tools/normal.py
+++ b/python/fatcat_tools/normal.py
@@ -1,4 +1,3 @@
-
"""
A bunch of helpers to parse and normalize strings: external identifiers,
free-form input, titles, etc.
@@ -32,7 +31,7 @@ def clean_doi(raw: str) -> Optional[str]:
if not raw:
return None
raw = raw.strip().lower()
- if '\u2013' in raw:
+ if "\u2013" in raw:
# Do not attempt to normalize "en dash" and since FC does not allow
# unicode in DOI, treat this as invalid.
return None
@@ -54,7 +53,7 @@ def clean_doi(raw: str) -> Optional[str]:
# fatcatd uses same REGEX, but Rust regex rejects these characters, while
# python doesn't. DOIs are syntactically valid, but very likely to be typos;
# for now filter them out.
- for c in ('¬', ):
+ for c in ("¬",):
if c in raw:
return None
@@ -70,6 +69,7 @@ def clean_doi(raw: str) -> Optional[str]:
return None
return raw
+
def test_clean_doi():
assert clean_doi("10.1234/asdf ") == "10.1234/asdf"
assert clean_doi("10.1037//0002-9432.72.1.50") == "10.1037/0002-9432.72.1.50"
@@ -81,7 +81,9 @@ def test_clean_doi():
assert clean_doi("doi:10.1234/asdf ") == "10.1234/asdf"
assert clean_doi("doi:10.1234/ asdf ") is None
assert clean_doi("10.4149/gpb¬_2017042") is None # "logical negation" character
- assert clean_doi("10.6002/ect.2020.häyry") is None # this example via pubmed (pmid:32519616)
+ assert (
+ clean_doi("10.6002/ect.2020.häyry") is None
+ ) # this example via pubmed (pmid:32519616)
assert clean_doi("10.30466/vrf.2019.98547.2350\u200e") is None
assert clean_doi("10.12016/j.issn.2096⁃1456.2017.06.014") is None
assert clean_doi("10.4025/diálogos.v17i2.36030") is None
@@ -92,6 +94,7 @@ def test_clean_doi():
ARXIV_ID_REGEX = re.compile(r"^(\d{4}.\d{4,5}|[a-z\-]+(\.[A-Z]{2})?/\d{7})(v\d+)?$")
+
def clean_arxiv_id(raw: str) -> Optional[str]:
"""
Removes any:
@@ -113,6 +116,7 @@ def clean_arxiv_id(raw: str) -> Optional[str]:
return None
return raw
+
def test_clean_arxiv_id():
assert clean_arxiv_id("0806.2878v1") == "0806.2878v1"
assert clean_arxiv_id("0806.2878") == "0806.2878"
@@ -141,16 +145,18 @@ def test_clean_arxiv_id():
assert clean_arxiv_id("0806.v1") is None
assert clean_arxiv_id("08062878v1") is None
+
def clean_wikidata_qid(raw):
if not raw:
return None
raw = raw.strip()
if len(raw.split()) != 1 or len(raw) < 2:
return None
- if raw[0] == 'Q' and raw[1] != '0' and raw[1:].isdigit():
+ if raw[0] == "Q" and raw[1] != "0" and raw[1:].isdigit():
return raw
return None
+
def test_clean_wikidata_qid():
assert clean_wikidata_qid("Q1234") == "Q1234"
assert clean_wikidata_qid("Q1") == "Q1"
@@ -163,6 +169,7 @@ def test_clean_wikidata_qid():
assert clean_wikidata_qid("qfba3") is None
assert clean_wikidata_qid("") is None
+
def clean_pmid(raw: str) -> Optional[str]:
if not raw:
return None
@@ -173,6 +180,7 @@ def clean_pmid(raw: str) -> Optional[str]:
return raw
return None
+
def test_clean_pmid():
assert clean_pmid("1234") == "1234"
assert clean_pmid("1234 ") == "1234"
@@ -180,6 +188,7 @@ def test_clean_pmid():
assert clean_pmid("qfba3") is None
assert clean_pmid("") is None
+
def clean_pmcid(raw: str) -> Optional[str]:
if not raw:
return None
@@ -190,6 +199,7 @@ def clean_pmcid(raw: str) -> Optional[str]:
return raw
return None
+
def clean_sha1(raw: str) -> Optional[str]:
if not raw:
return None
@@ -203,13 +213,21 @@ def clean_sha1(raw: str) -> Optional[str]:
return None
return raw
+
def test_clean_sha1():
- assert clean_sha1("0fba3fba0e1937aa0297de3836b768b5dfb23d7b") == "0fba3fba0e1937aa0297de3836b768b5dfb23d7b"
- assert clean_sha1("0fba3fba0e1937aa0297de3836b768b5dfb23d7b ") == "0fba3fba0e1937aa0297de3836b768b5dfb23d7b"
+ assert (
+ clean_sha1("0fba3fba0e1937aa0297de3836b768b5dfb23d7b")
+ == "0fba3fba0e1937aa0297de3836b768b5dfb23d7b"
+ )
+ assert (
+ clean_sha1("0fba3fba0e1937aa0297de3836b768b5dfb23d7b ")
+ == "0fba3fba0e1937aa0297de3836b768b5dfb23d7b"
+ )
assert clean_sha1("fba3fba0e1937aa0297de3836b768b5dfb23d7b") is None
assert clean_sha1("qfba3fba0e1937aa0297de3836b768b5dfb23d7b") is None
assert clean_sha1("0fba3fb a0e1937aa0297de3836b768b5dfb23d7b") is None
+
def clean_sha256(raw: str) -> Optional[str]:
raw = raw.strip().lower()
if len(raw.split()) != 1:
@@ -221,12 +239,18 @@ def clean_sha256(raw: str) -> Optional[str]:
return None
return raw
+
def test_clean_sha256():
- assert clean_sha256("6cc853f2ae75696b2e45f476c76b946b0fc2df7c52bb38287cb074aceb77bc7f") == "6cc853f2ae75696b2e45f476c76b946b0fc2df7c52bb38287cb074aceb77bc7f"
+ assert (
+ clean_sha256("6cc853f2ae75696b2e45f476c76b946b0fc2df7c52bb38287cb074aceb77bc7f")
+ == "6cc853f2ae75696b2e45f476c76b946b0fc2df7c52bb38287cb074aceb77bc7f"
+ )
assert clean_sha256("0fba3fba0e1937aa0297de3836b768b5dfb23d7b") is None
+
ISSN_REGEX = re.compile(r"^\d{4}-\d{3}[0-9X]$")
+
def clean_issn(raw: str) -> Optional[str]:
if not raw:
return None
@@ -237,14 +261,17 @@ def clean_issn(raw: str) -> Optional[str]:
return None
return raw
+
def test_clean_issn():
assert clean_issn("1234-4567") == "1234-4567"
assert clean_issn("1234-456X") == "1234-456X"
assert clean_issn("134-4567") is None
assert clean_issn("123X-4567") is None
+
ISBN13_REGEX = re.compile(r"^97(?:8|9)-\d{1,5}-\d{1,7}-\d{1,6}-\d$")
+
def clean_isbn13(raw: str) -> Optional[str]:
if not raw:
return None
@@ -253,14 +280,17 @@ def clean_isbn13(raw: str) -> Optional[str]:
return None
return raw
+
def test_clean_isbn13():
assert clean_isbn13("978-1-56619-909-4") == "978-1-56619-909-4"
assert clean_isbn13("978-1-4028-9462-6") == "978-1-4028-9462-6"
assert clean_isbn13("978-1-56619-909-4 ") == "978-1-56619-909-4"
assert clean_isbn13("9781566199094") is None
+
ORCID_REGEX = re.compile(r"^\d{4}-\d{4}-\d{4}-\d{3}[\dX]$")
+
def clean_orcid(raw: str) -> Optional[str]:
if not raw:
return None
@@ -269,6 +299,7 @@ def clean_orcid(raw: str) -> Optional[str]:
return None
return raw
+
def test_clean_orcid():
assert clean_orcid("0123-4567-3456-6789") == "0123-4567-3456-6789"
assert clean_orcid("0123-4567-3456-678X") == "0123-4567-3456-678X"
@@ -279,6 +310,7 @@ def test_clean_orcid():
HDL_REGEX = re.compile(r"^\d+(\.\d+)*/\S+$")
+
def clean_hdl(raw):
if not raw:
return None
@@ -293,14 +325,17 @@ def clean_hdl(raw):
raw = raw[15:]
if not HDL_REGEX.fullmatch(raw):
return None
- if raw.startswith('10.'):
+ if raw.startswith("10."):
return None
return raw
+
def test_clean_hdl():
assert clean_hdl("20.500.23456/ABC/DUMMY") == "20.500.23456/abc/dummy"
assert clean_hdl("hdl:20.500.23456/ABC/DUMMY") == "20.500.23456/abc/dummy"
- assert clean_hdl("https://hdl.handle.net/20.500.23456/ABC/DUMMY") == "20.500.23456/abc/dummy"
+ assert (
+ clean_hdl("https://hdl.handle.net/20.500.23456/ABC/DUMMY") == "20.500.23456/abc/dummy"
+ )
assert clean_hdl("http://hdl.handle.net/20.500.23456/ABC/DUMMY") == "20.500.23456/abc/dummy"
assert clean_hdl("21.1234/aksjdfh") == "21.1234/aksjdfh"
assert clean_hdl("2381/12775") == "2381/12775"
@@ -326,7 +361,7 @@ def clean_str(thing: Optional[str], force_xml: bool = False) -> Optional[str]:
"""
if not thing:
return None
- unescape_html: Union[str, bool] = 'auto'
+ unescape_html: Union[str, bool] = "auto"
if force_xml:
unescape_html = True
fixed = ftfy.fix_text(thing, unescape_html=unescape_html).strip()
@@ -335,15 +370,17 @@ def clean_str(thing: Optional[str], force_xml: bool = False) -> Optional[str]:
return None
return fixed
+
def test_clean_str():
assert clean_str(None) is None
- assert clean_str('') is None
- assert clean_str('1') is None
- assert clean_str('123') == '123'
- assert clean_str('a&amp;b') == 'a&b'
- assert clean_str('<b>a&amp;b</b>') == '<b>a&amp;b</b>'
- assert clean_str('<b>a&amp;b</b>', force_xml=True) == '<b>a&b</b>'
+ assert clean_str("") is None
+ assert clean_str("1") is None
+ assert clean_str("123") == "123"
+ assert clean_str("a&amp;b") == "a&b"
+ assert clean_str("<b>a&amp;b</b>") == "<b>a&amp;b</b>"
+ assert clean_str("<b>a&amp;b</b>", force_xml=True) == "<b>a&b</b>"
+
def b32_hex(s):
s = s.strip().split()[0].lower()
@@ -351,7 +388,8 @@ def b32_hex(s):
s = s[5:]
if len(s) != 32:
return s
- return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
+ return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8")
+
def is_cjk(s):
if not s:
@@ -359,38 +397,53 @@ def is_cjk(s):
for c in s:
if c.isalpha():
lang_prefix = unicodedata.name(c).split()[0]
- return lang_prefix in ('CJK', 'HIRAGANA', 'KATAKANA', 'HANGUL')
+ return lang_prefix in ("CJK", "HIRAGANA", "KATAKANA", "HANGUL")
return False
+
def test_is_cjk():
assert is_cjk(None) is False
- assert is_cjk('') is False
- assert is_cjk('blah') is False
- assert is_cjk('岡, 鹿, 梨, 阜, 埼') is True
- assert is_cjk('[岡, 鹿, 梨, 阜, 埼]') is True
- assert is_cjk('菊') is True
- assert is_cjk('岡, 鹿, 梨, 阜, 埼 with eng after') is True
- assert is_cjk('水道') is True
- assert is_cjk('オウ, イク') is True # kanji
- assert is_cjk('ひヒ') is True
- assert is_cjk('き゚ゅ') is True
- assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') is True
+ assert is_cjk("") is False
+ assert is_cjk("blah") is False
+ assert is_cjk("岡, 鹿, 梨, 阜, 埼") is True
+ assert is_cjk("[岡, 鹿, 梨, 阜, 埼]") is True
+ assert is_cjk("菊") is True
+ assert is_cjk("岡, 鹿, 梨, 阜, 埼 with eng after") is True
+ assert is_cjk("水道") is True
+ assert is_cjk("オウ, イク") is True # kanji
+ assert is_cjk("ひヒ") is True
+ assert is_cjk("き゚ゅ") is True
+ assert is_cjk("ㄴ, ㄹ, ㅁ, ㅂ, ㅅ") is True
+
MONTH_MAP = {
- "jan": 1, "january": 1,
- "feb": 2, "febuary": 2,
- "mar": 3, "march": 3,
- "apr": 4, "april": 4,
- "may": 5, "may": 5,
- "jun": 6, "june": 6,
- "jul": 7, "july": 7,
- "aug": 8, "august": 8,
- "sep": 9, "september": 9,
- "oct": 10, "october": 10,
- "nov": 11, "nov": 11,
- "dec": 12, "december": 12,
+ "jan": 1,
+ "january": 1,
+ "feb": 2,
+ "febuary": 2,
+ "mar": 3,
+ "march": 3,
+ "apr": 4,
+ "april": 4,
+ "may": 5,
+ "may": 5,
+ "jun": 6,
+ "june": 6,
+ "jul": 7,
+ "july": 7,
+ "aug": 8,
+ "august": 8,
+ "sep": 9,
+ "september": 9,
+ "oct": 10,
+ "october": 10,
+ "nov": 11,
+ "nov": 11,
+ "dec": 12,
+ "december": 12,
}
+
def parse_month(raw: Optional[str]) -> Optional[int]:
"""
Parses a string into a month number (1 to 12)
@@ -408,6 +461,7 @@ def parse_month(raw: Optional[str]) -> Optional[int]:
return MONTH_MAP[raw]
return None
+
def test_parse_month() -> None:
assert parse_month(None) is None
@@ -417,6 +471,7 @@ def test_parse_month() -> None:
assert parse_month("jan") == 1
assert parse_month("September") == 9
+
def detect_text_lang(raw: str) -> Optional[str]:
"""
Tries to determine language of, eg, an abstract.
@@ -427,13 +482,14 @@ def detect_text_lang(raw: str) -> Optional[str]:
return None
try:
lang = langdetect.detect(raw)
- lang = lang.split('-')[0]
+ lang = lang.split("-")[0]
assert len(lang) == 2
return lang
except (langdetect.lang_detect_exception.LangDetectException, TypeError):
return None
return None
+
def test_detect_text_lang() -> None:
assert detect_text_lang("") is None
EN_SAMPLE = "this is a string of English text for testing"
@@ -444,6 +500,7 @@ def test_detect_text_lang() -> None:
# XXX: why does this detect as `ko` sometimes?
assert detect_text_lang(ZH_SAMPLE) in ("zh", "ko")
+
def parse_lang_name(raw: Optional[str]) -> Optional[str]:
"""
Parses a language name and returns a 2-char ISO 631 language code.
@@ -456,13 +513,14 @@ def parse_lang_name(raw: Optional[str]) -> Optional[str]:
return None
return lang.alpha_2.lower()
except LookupError:
- #print(f" unknown language: '{raw}', file=sys.stderr)
+ # print(f" unknown language: '{raw}', file=sys.stderr)
return None
except AttributeError:
- #print(f" partial language metadata: '{lang}', file=sys.stderr)
+ # print(f" partial language metadata: '{lang}', file=sys.stderr)
return None
return None
+
def test_parse_lang_name() -> None:
assert parse_lang_name(None) is None
@@ -544,86 +602,85 @@ def test_parse_country_name():
assert parse_country_name("Russia") == "ru"
assert parse_country_name("Japan") == "jp"
+
# These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of
# 2/T and 2/B?
# PubMed/MEDLINE and JSTOR use these MARC codes
# https://www.loc.gov/marc/languages/language_name.html
LANG_MAP_MARC = {
- 'afr': 'af',
- 'alb': 'sq',
- 'amh': 'am',
- 'ara': 'ar',
- 'arm': 'hy',
- 'aze': 'az',
- 'ben': 'bn',
- 'bos': 'bs',
- 'bul': 'bg',
- 'cat': 'ca',
- 'chi': 'zh',
- 'cze': 'cs',
- 'dan': 'da',
- 'dut': 'nl',
- 'eng': 'en',
- 'epo': 'eo',
- 'est': 'et',
- 'fin': 'fi',
- 'fre': 'fr',
- 'geo': 'ka',
- 'ger': 'de',
- 'gla': 'gd',
- 'gre': 'el',
- 'heb': 'he',
- 'hin': 'hi',
- 'hrv': 'hr',
- 'hun': 'hu',
- 'ice': 'is',
- 'ind': 'id',
- 'ita': 'it',
- 'jpn': 'ja',
- 'kin': 'rw',
- 'kor': 'ko',
- 'lat': 'la',
- 'lav': 'lv',
- 'lit': 'lt',
- 'mac': 'mk',
- 'mal': 'ml',
- 'mao': 'mi',
- 'may': 'ms',
- 'nor': 'no',
- 'per': 'fa',
- 'per': 'fa',
- 'pol': 'pl',
- 'por': 'pt',
- 'pus': 'ps',
- 'rum': 'ro',
- 'rus': 'ru',
- 'san': 'sa',
- 'slo': 'sk',
- 'slv': 'sl',
- 'spa': 'es',
- 'srp': 'sr',
- 'swe': 'sv',
- 'tha': 'th',
- 'tur': 'tr',
- 'ukr': 'uk',
- 'urd': 'ur',
- 'vie': 'vi',
- 'wel': 'cy',
-
-# additions
- 'gle': 'ga', # "Irish" (Gaelic)
- 'jav': 'jv', # Javanese
- 'welsh': 'cy', # Welsh
- 'oci': 'oc', # Occitan
-
-# Don't have ISO 639-1 codes
- 'grc': 'el', # Ancient Greek; map to modern greek
- 'map': None, # Austronesian (collection)
- 'syr': None, # Syriac, Modern
- 'gem': None, # Old Saxon
- 'non': None, # Old Norse
- 'emg': None, # Eastern Meohang
- 'neg': None, # Negidal
- 'mul': None, # Multiple languages
- 'und': None, # Undetermined
+ "afr": "af",
+ "alb": "sq",
+ "amh": "am",
+ "ara": "ar",
+ "arm": "hy",
+ "aze": "az",
+ "ben": "bn",
+ "bos": "bs",
+ "bul": "bg",
+ "cat": "ca",
+ "chi": "zh",
+ "cze": "cs",
+ "dan": "da",
+ "dut": "nl",
+ "eng": "en",
+ "epo": "eo",
+ "est": "et",
+ "fin": "fi",
+ "fre": "fr",
+ "geo": "ka",
+ "ger": "de",
+ "gla": "gd",
+ "gre": "el",
+ "heb": "he",
+ "hin": "hi",
+ "hrv": "hr",
+ "hun": "hu",
+ "ice": "is",
+ "ind": "id",
+ "ita": "it",
+ "jpn": "ja",
+ "kin": "rw",
+ "kor": "ko",
+ "lat": "la",
+ "lav": "lv",
+ "lit": "lt",
+ "mac": "mk",
+ "mal": "ml",
+ "mao": "mi",
+ "may": "ms",
+ "nor": "no",
+ "per": "fa",
+ "per": "fa",
+ "pol": "pl",
+ "por": "pt",
+ "pus": "ps",
+ "rum": "ro",
+ "rus": "ru",
+ "san": "sa",
+ "slo": "sk",
+ "slv": "sl",
+ "spa": "es",
+ "srp": "sr",
+ "swe": "sv",
+ "tha": "th",
+ "tur": "tr",
+ "ukr": "uk",
+ "urd": "ur",
+ "vie": "vi",
+ "wel": "cy",
+ # additions
+ "gle": "ga", # "Irish" (Gaelic)
+ "jav": "jv", # Javanese
+ "welsh": "cy", # Welsh
+ "oci": "oc", # Occitan
+ # Don't have ISO 639-1 codes
+ "grc": "el", # Ancient Greek; map to modern greek
+ "map": None, # Austronesian (collection)
+ "syr": None, # Syriac, Modern
+ "gem": None, # Old Saxon
+ "non": None, # Old Norse
+ "emg": None, # Eastern Meohang
+ "neg": None, # Negidal
+ "mul": None, # Multiple languages
+ "und": None, # Undetermined
}
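A quick sketch exercising a few of the normalization helpers reformatted above; the expected values simply mirror the test asserts in these hunks:

from fatcat_tools.normal import clean_doi, clean_isbn13, clean_pmid, parse_month

assert clean_doi("doi:10.1234/asdf ") == "10.1234/asdf"
assert clean_pmid("1234 ") == "1234"
assert clean_isbn13("978-1-56619-909-4 ") == "978-1-56619-909-4"
assert parse_month("September") == 9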
diff --git a/python/fatcat_tools/references.py b/python/fatcat_tools/references.py
index 8361b260..6fd9ca49 100644
--- a/python/fatcat_tools/references.py
+++ b/python/fatcat_tools/references.py
@@ -22,6 +22,7 @@ from fatcat_tools.transforms.entities import entity_to_dict
class BiblioRef(BaseModel):
"""bibliographic reference"""
+
# ("release", source_release_ident, ref_index)
# ("wikipedia", source_wikipedia_article, ref_index)
_key: Optional[str]
@@ -37,7 +38,7 @@ class BiblioRef(BaseModel):
# context of the reference itself
# 1-indexed, not 0-indexed
- ref_index: Optional[int] # TODO: actually optional?
+ ref_index: Optional[int] # TODO: actually optional?
# eg, "Lee86", "BIB23"
ref_key: Optional[str]
# eg, page number
@@ -74,16 +75,20 @@ class BiblioRef(BaseModel):
# work-arounds for bad/weird ref_key
if self.ref_key:
self.ref_key = self.ref_key.strip()
- if self.ref_key[0] in ['/', '_']:
+ if self.ref_key[0] in ["/", "_"]:
self.ref_key = self.ref_key[1:]
- if self.ref_key.startswith("10.") and 'SICI' in self.ref_key and '-' in self.ref_key:
- self.ref_key = self.ref_key.split('-')[-1]
- if self.ref_key.startswith("10.") and '_' in self.ref_key:
- self.ref_key = self.ref_key.split('_')[-1]
+ if (
+ self.ref_key.startswith("10.")
+ and "SICI" in self.ref_key
+ and "-" in self.ref_key
+ ):
+ self.ref_key = self.ref_key.split("-")[-1]
+ if self.ref_key.startswith("10.") and "_" in self.ref_key:
+ self.ref_key = self.ref_key.split("_")[-1]
if len(self.ref_key) > 10 and "#" in self.ref_key:
- self.ref_key = self.ref_key.split('#')[-1]
+ self.ref_key = self.ref_key.split("#")[-1]
if len(self.ref_key) > 10 and "_" in self.ref_key:
- self.ref_key = self.ref_key.split('_')[-1]
+ self.ref_key = self.ref_key.split("_")[-1]
if not self.ref_key and self.ref_index is not None:
self.ref_key = str(self.ref_index)
return self
@@ -98,7 +103,7 @@ class EnrichedBiblioRef(BaseModel):
# TODO: openlibrary work?
access: List[AccessOption]
- @validator('release')
+ @validator("release")
@classmethod
def check_release(cls, v):
if v is not None and not isinstance(v, ReleaseEntity):
@@ -119,7 +124,7 @@ class RefHits(BaseModel):
limit: int
query_time_ms: int
query_wall_time_ms: int
- result_refs: List[Union[BiblioRef,EnrichedBiblioRef]]
+ result_refs: List[Union[BiblioRef, EnrichedBiblioRef]]
class Config:
json_encoders = {
@@ -145,22 +150,22 @@ def _execute_ref_query(search: Any, limit: int, offset: Optional[int] = None) ->
except elasticsearch.exceptions.RequestError as e_raw:
# this is a "user" error
e: Any = e_raw
- #logging.warn("elasticsearch 400: " + str(e.info))
+ # logging.warn("elasticsearch 400: " + str(e.info))
if e.info.get("error", {}).get("root_cause", {}):
raise ValueError(str(e.info["error"]["root_cause"][0].get("reason"))) from e
else:
raise ValueError(str(e.info)) from e
except elasticsearch.exceptions.TransportError as e:
# all other errors
- #logging.warn(f"elasticsearch non-200 status code: {e.info}")
+ # logging.warn(f"elasticsearch non-200 status code: {e.info}")
raise IOError(str(e.info)) from e
query_delta = datetime.datetime.now() - query_start
result_refs = []
for h in resp.hits:
# might be a list because of consolidation
- if isinstance(h._d_.get('source_work_ident'), list):
- h._d_['source_work_ident'] = h._d_['source_work_ident'][0]
+ if isinstance(h._d_.get("source_work_ident"), list):
+ h._d_["source_work_ident"] = h._d_["source_work_ident"][0]
result_refs.append(BiblioRef.parse_obj(h._d_).hacks())
return RefHits(
@@ -224,7 +229,10 @@ def get_inbound_refs(
search = search.extra(
collapse={
"field": "source_work_ident",
- "inner_hits": {"name": "source_more", "size": 0,},
+ "inner_hits": {
+ "name": "source_more",
+ "size": 0,
+ },
}
)
@@ -281,61 +289,87 @@ def count_inbound_refs(
# run fatcat API fetches for each ref and return "enriched" refs
-def enrich_inbound_refs(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[EnrichedBiblioRef]:
+def enrich_inbound_refs(
+ refs: List[BiblioRef],
+ fatcat_api_client: Any,
+ hide: Optional[str] = "refs",
+ expand: Optional[str] = "container,files,webcaptures,filesets",
+) -> List[EnrichedBiblioRef]:
enriched = []
for ref in refs:
release = None
access = []
if ref.source_release_ident:
- release = fatcat_api_client.get_release(ref.source_release_ident, hide=hide, expand=expand)
+ release = fatcat_api_client.get_release(
+ ref.source_release_ident, hide=hide, expand=expand
+ )
access = release_access_options(release)
if ref.source_wikipedia_article:
- wiki_lang = ref.source_wikipedia_article.split(':')[0]
- wiki_article = ':'.join(ref.source_wikipedia_article.split(':')[1:]).replace(' ', '_')
- access.append(AccessOption(
- access_type="wikipedia",
- access_url=f"https://{wiki_lang}.wikipedia.org/wiki/{wiki_article}",
- mimetype=None,
- size_bytes=None,
- thumbnail_url=None
- ))
- enriched.append(EnrichedBiblioRef(
- ref=ref,
- access=access,
- release=release,
- ))
+ wiki_lang = ref.source_wikipedia_article.split(":")[0]
+ wiki_article = ":".join(ref.source_wikipedia_article.split(":")[1:]).replace(
+ " ", "_"
+ )
+ access.append(
+ AccessOption(
+ access_type="wikipedia",
+ access_url=f"https://{wiki_lang}.wikipedia.org/wiki/{wiki_article}",
+ mimetype=None,
+ size_bytes=None,
+ thumbnail_url=None,
+ )
+ )
+ enriched.append(
+ EnrichedBiblioRef(
+ ref=ref,
+ access=access,
+ release=release,
+ )
+ )
return enriched
-def enrich_outbound_refs(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[EnrichedBiblioRef]:
+def enrich_outbound_refs(
+ refs: List[BiblioRef],
+ fatcat_api_client: Any,
+ hide: Optional[str] = "refs",
+ expand: Optional[str] = "container,files,webcaptures,filesets",
+) -> List[EnrichedBiblioRef]:
enriched = []
for ref in refs:
release = None
access = []
if ref.target_release_ident:
- release = fatcat_api_client.get_release(ref.target_release_ident, hide=hide, expand=expand)
+ release = fatcat_api_client.get_release(
+ ref.target_release_ident, hide=hide, expand=expand
+ )
access = release_access_options(release)
if ref.target_openlibrary_work:
- access.append(AccessOption(
- access_type="openlibrary",
- access_url=f"https://openlibrary.org/works/{ref.target_openlibrary_work}",
- mimetype=None,
- size_bytes=None,
- thumbnail_url=None
- ))
- if ref.target_url and '://web.archive.org/' in ref.target_url:
- access.append(AccessOption(
- access_type="wayback",
- access_url=ref.target_url,
- mimetype=None,
- size_bytes=None,
- thumbnail_url=None
- ))
- enriched.append(EnrichedBiblioRef(
- ref=ref,
- access=access,
- release=release,
- ))
+ access.append(
+ AccessOption(
+ access_type="openlibrary",
+ access_url=f"https://openlibrary.org/works/{ref.target_openlibrary_work}",
+ mimetype=None,
+ size_bytes=None,
+ thumbnail_url=None,
+ )
+ )
+ if ref.target_url and "://web.archive.org/" in ref.target_url:
+ access.append(
+ AccessOption(
+ access_type="wayback",
+ access_url=ref.target_url,
+ mimetype=None,
+ size_bytes=None,
+ thumbnail_url=None,
+ )
+ )
+ enriched.append(
+ EnrichedBiblioRef(
+ ref=ref,
+ access=access,
+ release=release,
+ )
+ )
return enriched
@@ -346,21 +380,29 @@ def run_ref_query(args) -> None:
release_ident = None
work_ident = None
if args.ident.startswith("release_"):
- release_ident = args.ident.split('_')[1]
+ release_ident = args.ident.split("_")[1]
elif args.ident.startswith("work_"):
- work_ident = args.ident.split('_')[1]
+ work_ident = args.ident.split("_")[1]
else:
release_ident = args.ident
print("## Outbound References")
- hits = get_outbound_refs(release_ident=release_ident, work_ident=work_ident, es_client=args.es_client)
- print(f"Total: {hits.count_total} Time: {hits.query_wall_time_ms}ms; {hits.query_time_ms}ms")
+ hits = get_outbound_refs(
+ release_ident=release_ident, work_ident=work_ident, es_client=args.es_client
+ )
+ print(
+ f"Total: {hits.count_total} Time: {hits.query_wall_time_ms}ms; {hits.query_time_ms}ms"
+ )
if args.enrich == "fatcat":
- enriched = enrich_outbound_refs(hits.result_refs, hide='refs,abstracts', fatcat_api_client=args.fatcat_api_client)
+ enriched = enrich_outbound_refs(
+ hits.result_refs, hide="refs,abstracts", fatcat_api_client=args.fatcat_api_client
+ )
for ref in enriched:
if ref.release:
- print(f"{ref.ref.ref_index or '-'}\trelease_{ref.release.ident}\t{ref.ref.match_provenance}/{ref.ref.match_status}\t{ref.release.release_year or '-'}\t{ref.release.title}\t{ref.release.ext_ids.pmid or ref.release.ext_ids.doi or '-'}")
+ print(
+ f"{ref.ref.ref_index or '-'}\trelease_{ref.release.ident}\t{ref.ref.match_provenance}/{ref.ref.match_status}\t{ref.release.release_year or '-'}\t{ref.release.title}\t{ref.release.ext_ids.pmid or ref.release.ext_ids.doi or '-'}"
+ )
else:
print(f"{ref.ref.ref_index or '-'}\trelease_{ref.target_release_ident}")
else:
@@ -369,21 +411,30 @@ def run_ref_query(args) -> None:
print()
print("## Inbound References")
- hits = get_inbound_refs(release_ident=release_ident, work_ident=work_ident, es_client=args.es_client)
+ hits = get_inbound_refs(
+ release_ident=release_ident, work_ident=work_ident, es_client=args.es_client
+ )
- print(f"Total: {hits.count_total} Time: {hits.query_wall_time_ms}ms; {hits.query_time_ms}ms")
+ print(
+ f"Total: {hits.count_total} Time: {hits.query_wall_time_ms}ms; {hits.query_time_ms}ms"
+ )
if args.enrich == "fatcat":
- enriched = enrich_inbound_refs(hits.result_refs, hide='refs,abstracts', fatcat_api_client=args.fatcat_api_client)
+ enriched = enrich_inbound_refs(
+ hits.result_refs, hide="refs,abstracts", fatcat_api_client=args.fatcat_api_client
+ )
for ref in enriched:
if ref.release:
- print(f"release_{ref.release.ident}\t{ref.ref.match_provenance}/{ref.ref.match_status}\t{ref.release.release_year or '-'}\t{ref.release.title}\t{ref.release.ext_ids.pmid or ref.release.ext_ids.doi or '-'}")
+ print(
+ f"release_{ref.release.ident}\t{ref.ref.match_provenance}/{ref.ref.match_status}\t{ref.release.release_year or '-'}\t{ref.release.title}\t{ref.release.ext_ids.pmid or ref.release.ext_ids.doi or '-'}"
+ )
else:
print(f"release_{ref.target_release_ident}")
else:
for ref in hits.result_refs:
print(f"work_{ref.source_work_ident}\trelease_{ref.source_release_ident}")
+
def main() -> None:
"""
Run this utility like:
@@ -395,9 +446,7 @@ def main() -> None:
python -m fatcat_tools.references query release_pfrind3kh5hqhgqkueulk2tply
"""
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter
- )
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
subparsers = parser.add_subparsers()
parser.add_argument("--fatcat-api-base", default="https://api.fatcat.wiki/v0")
@@ -425,5 +474,6 @@ def main() -> None:
else:
raise NotImplementedError(args.func)
+
if __name__ == "__main__":
main()
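Beyond the CLI shown in the docstring, the query helpers can be called directly. A hedged sketch, where the Elasticsearch endpoint is an assumption and the release ident is the example from the docstring above:

import elasticsearch
from fatcat_tools.references import get_outbound_refs

es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki")  # assumed endpoint
hits = get_outbound_refs(
    release_ident="pfrind3kh5hqhgqkueulk2tply", work_ident=None, es_client=es_client
)
print(f"outbound refs: {hits.count_total} ({hits.query_time_ms}ms)")
for ref in hits.result_refs:
    print(ref.ref_index, ref.ref_key, ref.target_release_ident)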
diff --git a/python/fatcat_tools/reviewers/review_common.py b/python/fatcat_tools/reviewers/review_common.py
index 867d826d..59ff1c4e 100644
--- a/python/fatcat_tools/reviewers/review_common.py
+++ b/python/fatcat_tools/reviewers/review_common.py
@@ -1,4 +1,3 @@
-
import datetime
import subprocess
import time
@@ -34,8 +33,8 @@ class CheckResult:
self.status = status
self.check_type = check_type
self.description = description
- self.ident = kwargs.get('ident')
- self.rev = kwargs.get('rev')
+ self.ident = kwargs.get("ident")
+ self.rev = kwargs.get("rev")
def __repr__(self):
return str(self.__dict__)
@@ -72,17 +71,17 @@ class EditCheck:
class ReviewBot:
-
def __init__(self, api, verbose=False, **kwargs):
self.api = api
self.checks = []
self.verbose = verbose
- self.extra = kwargs.get('extra', dict())
- self.extra['git_rev'] = self.extra.get('git_rev',
- subprocess.check_output(["git", "describe", "--always"]).strip()).decode('utf-8')
- self.extra['agent'] = self.extra.get('agent', 'fatcat_tools.ReviewBot')
- self.poll_interval = kwargs.get('poll_interval', 10.0)
+ self.extra = kwargs.get("extra", dict())
+ self.extra["git_rev"] = self.extra.get(
+ "git_rev", subprocess.check_output(["git", "describe", "--always"]).strip()
+ ).decode("utf-8")
+ self.extra["agent"] = self.extra.get("agent", "fatcat_tools.ReviewBot")
+ self.poll_interval = kwargs.get("poll_interval", 10.0)
def run_single(self, editgroup_id, annotate=True):
eg = self.api.get_editgroup(editgroup_id)
@@ -96,7 +95,9 @@ class ReviewBot:
since = datetime.datetime.utcnow()
while True:
# XXX: better isoformat conversion?
- eg_list = self.api.get_editgroups_reviewable(since=since.isoformat()[:19] + "Z", limit=100)
+ eg_list = self.api.get_editgroups_reviewable(
+ since=since.isoformat()[:19] + "Z", limit=100
+ )
if not eg_list:
print("Sleeping {} seconds...".format(self.poll_interval))
time.sleep(self.poll_interval)
@@ -104,8 +105,11 @@ class ReviewBot:
for eg in eg_list:
# TODO: fetch annotations to ensure we haven't already annotated
annotation = self.review_editgroup(eg)
- print("Reviewed {} disposition:{}".format(
- eg.editgroup_id, annotation.extra['disposition']))
+ print(
+ "Reviewed {} disposition:{}".format(
+ eg.editgroup_id, annotation.extra["disposition"]
+ )
+ )
self.api.create_editgroup_annotation(eg.editgroup_id, annotation)
since = eg.submitted
# to prevent busy loops (TODO: needs review/rethink; multiple
@@ -125,10 +129,9 @@ class ReviewBot:
else:
raise ValueError
- for (status, title) in (('fail', 'Failed check'), ('warning', 'Warnings')):
+ for (status, title) in (("fail", "Failed check"), ("warning", "Warnings")):
if result_counts[status] > 0:
- comment += "\n\n### {} ({}):\n".format(
- status, result_counts[status])
+ comment += "\n\n### {} ({}):\n".format(status, result_counts[status])
for result in results:
if result.status == status and result.check_type == "editgroup":
comment += "\n- {description}".format(description=result.description)
@@ -137,15 +140,18 @@ class ReviewBot:
check_type=result.check_type,
rev=result.rev,
entity_type=result.check_type,
- description=result.description)
+ description=result.description,
+ )
extra = self.extra.copy()
- extra.update({
- "disposition": disposition,
- "submit_timestamp": editgroup.submitted.isoformat(),
- "checks": [check.name for check in self.checks],
- "result_counts": dict(result_counts),
- })
+ extra.update(
+ {
+ "disposition": disposition,
+ "submit_timestamp": editgroup.submitted.isoformat(),
+ "checks": [check.name for check in self.checks],
+ "result_counts": dict(result_counts),
+ }
+ )
annotation = fatcat_openapi_client.EditgroupAnnotation(
comment_markdown=comment,
editgroup_id=editgroup.editgroup_id,
@@ -156,7 +162,7 @@ class ReviewBot:
def result_counts(self, results):
counts = Counter()
for result in results:
- counts['total'] += 1
+ counts["total"] += 1
counts[result.status] += 1
return counts
@@ -217,13 +223,18 @@ class DummyCheck(EditCheck):
name = "DummyCheck"
def check_editgroup(self, editgroup):
- return CheckResult("pass", "editgroup",
+ return CheckResult(
+ "pass",
+ "editgroup",
"every edit is precious, thanks [editor {editor_id}](/editor/{editor_id})!".format(
- editor_id=editgroup.editor_id))
+ editor_id=editgroup.editor_id
+ ),
+ )
def check_work(self, entity, edit):
return CheckResult("pass", "work", "this work edit is beautiful")
+
class DummyReviewBot(ReviewBot):
"""
This bot reviews everything and always passes.
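Annotation: the ReviewBot/EditCheck scaffolding above is what concrete review bots build on; each check returns a CheckResult, and the review loop tallies those results into a disposition and an EditgroupAnnotation. Below is a minimal sketch of a custom check in the same style, assuming the CheckResult(status, check_type, description, ...) call pattern and EditCheck base class shown in this diff. The NonEmptyCheck name, its description heuristic, and the direct import path are illustrative and not part of this commit.

from fatcat_tools.reviewers.review_common import CheckResult, EditCheck


class NonEmptyCheck(EditCheck):
    name = "NonEmptyCheck"

    def check_editgroup(self, editgroup):
        # "warning", "fail", and "pass" are the statuses tallied by result_counts()
        if not (editgroup.description or "").strip():
            return CheckResult("warning", "editgroup", "editgroup has no description")
        return CheckResult("pass", "editgroup", "editgroup has a description")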
diff --git a/python/fatcat_tools/transforms/access.py b/python/fatcat_tools/transforms/access.py
index ae9880e7..34212a6a 100644
--- a/python/fatcat_tools/transforms/access.py
+++ b/python/fatcat_tools/transforms/access.py
@@ -1,4 +1,3 @@
-
from enum import Enum
from typing import List, Optional
@@ -16,6 +15,7 @@ class AccessType(str, Enum):
openlibrary = "openlibrary"
wikipedia = "wikipedia"
+
class AccessOption(BaseModel):
access_type: AccessType
@@ -40,27 +40,31 @@ def release_access_options(release: ReleaseEntity) -> List[AccessOption]:
option found
"""
options = []
- for f in (release.files or []):
+ for f in release.files or []:
thumbnail_url = None
- if f.mimetype == 'application/pdf' and f.sha1 and f.urls:
+ if f.mimetype == "application/pdf" and f.sha1 and f.urls:
# NOTE: scholar.archive.org does an actual database check before
# generating these URLs, but we skip that for speed
thumbnail_url = f"https://blobs.fatcat.wiki/thumbnail/pdf/{f.sha1[0:2]}/{f.sha1[2:4]}/{f.sha1}.180px.jpg"
- for u in (f.urls or []):
- if '://web.archive.org/' in u.url:
- return [AccessOption(
- access_type="wayback",
- access_url=u.url,
- mimetype=f.mimetype,
- size_bytes=f.size,
- thumbnail_url=thumbnail_url,
- )]
- elif '://archive.org/' in u.url:
- return [AccessOption(
- access_type="ia_file",
- access_url=u.url,
- mimetype=f.mimetype,
- size_bytes=f.size,
- thumbnail_url=thumbnail_url,
- )]
+ for u in f.urls or []:
+ if "://web.archive.org/" in u.url:
+ return [
+ AccessOption(
+ access_type="wayback",
+ access_url=u.url,
+ mimetype=f.mimetype,
+ size_bytes=f.size,
+ thumbnail_url=thumbnail_url,
+ )
+ ]
+ elif "://archive.org/" in u.url:
+ return [
+ AccessOption(
+ access_type="ia_file",
+ access_url=u.url,
+ mimetype=f.mimetype,
+ size_bytes=f.size,
+ thumbnail_url=thumbnail_url,
+ )
+ ]
return options
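Annotation: for context on how the reformatted release_access_options() is typically consumed, here is a hypothetical usage sketch. It assumes a ReleaseEntity fetched with its files expanded; the print_access name and the tab-separated output layout are illustrative.

from fatcat_tools.transforms.access import release_access_options


def print_access(release) -> None:
    # wayback / archive.org matches short-circuit in the function above and
    # return a single AccessOption
    for opt in release_access_options(release):
        print(f"{opt.access_type.value}\t{opt.access_url}\t{opt.mimetype or '-'}")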
diff --git a/python/fatcat_tools/transforms/csl.py b/python/fatcat_tools/transforms/csl.py
index f8b26bce..2b39068a 100644
--- a/python/fatcat_tools/transforms/csl.py
+++ b/python/fatcat_tools/transforms/csl.py
@@ -1,4 +1,3 @@
-
import json
from citeproc import (
@@ -13,10 +12,10 @@ from citeproc_styles import get_style_filepath
def contribs_by_role(contribs, role):
- ret = [c.copy() for c in contribs if c['role'] == role]
- [c.pop('role') for c in ret]
+ ret = [c.copy() for c in contribs if c["role"] == role]
+ [c.pop("role") for c in ret]
# TODO: some note to self here
- [c.pop('literal') for c in ret if 'literal' in c]
+ [c.pop("literal") for c in ret if "literal" in c]
if not ret:
return None
else:
@@ -33,26 +32,30 @@ def release_to_csl(entity):
Follows, but not enforced by: https://github.com/citation-style-language/schema/blob/master/csl-data.json
"""
contribs = []
- for contrib in (entity.contribs or []):
+ for contrib in entity.contribs or []:
if contrib.creator:
# Default to "local" (publication-specific) metadata; fall back to
# creator-level
- family = contrib.creator.surname or contrib.surname or (contrib.raw_name and contrib.raw_name.split()[-1])
+ family = (
+ contrib.creator.surname
+ or contrib.surname
+ or (contrib.raw_name and contrib.raw_name.split()[-1])
+ )
if not family:
# CSL requires some surname (family name)
continue
c = dict(
family=family,
given=contrib.creator.given_name or contrib.given_name,
- #dropping-particle
- #non-dropping-particle
- #suffix
- #comma-suffix
- #static-ordering
+ # dropping-particle
+ # non-dropping-particle
+ # suffix
+ # comma-suffix
+ # static-ordering
literal=contrib.creator.display_name or contrib.raw_name,
- #parse-names,
+ # parse-names,
# role must be defined; default to author
- role=contrib.role or 'author',
+ role=contrib.role or "author",
)
else:
family = contrib.surname or (contrib.raw_name and contrib.raw_name.split()[-1])
@@ -64,7 +67,7 @@ def release_to_csl(entity):
given=contrib.given_name,
literal=contrib.raw_name,
# role must be defined; default to author
- role=contrib.role or 'author',
+ role=contrib.role or "author",
)
for k in list(c.keys()):
if not c[k]:
@@ -78,93 +81,108 @@ def release_to_csl(entity):
issued_date = None
if entity.release_date:
- issued_date = {"date-parts": [[
- entity.release_date.year,
- entity.release_date.month,
- entity.release_date.day,
- ]]}
+ issued_date = {
+ "date-parts": [
+ [
+ entity.release_date.year,
+ entity.release_date.month,
+ entity.release_date.day,
+ ]
+ ]
+ }
elif entity.release_year:
issued_date = {"date-parts": [[entity.release_year]]}
csl = dict(
- #id,
- #categories
- type=entity.release_type or "article", # can't be blank
+ # id,
+ # categories
+ type=entity.release_type or "article", # can't be blank
language=entity.language,
- #journalAbbreviation
- #shortTitle
+ # journalAbbreviation
+ # shortTitle
## see below for all contrib roles
- #accessed
- #container
- #event-date
+ # accessed
+ # container
+ # event-date
issued=issued_date,
- #original-date
- #submitted
+ # original-date
+ # submitted
abstract=abstract,
- #annote
- #archive
- #archive_location
- #archive-place
- #authority
- #call-number
- #chapter-number
- #citation-number
- #citation-label
- #collection-number
- #collection-title
+ # annote
+ # archive
+ # archive_location
+ # archive-place
+ # authority
+ # call-number
+ # chapter-number
+ # citation-number
+ # citation-label
+ # collection-number
+ # collection-title
container_title=entity.container and entity.container.name,
- #container-title-short
- #dimensions
+ # container-title-short
+ # dimensions
DOI=entity.ext_ids.doi,
- #edition
- #event
- #event-place
- #first-reference-note-number
- #genre
+ # edition
+ # event
+ # event-place
+ # first-reference-note-number
+ # genre
ISBN=entity.ext_ids.isbn13,
ISSN=entity.container and entity.container.issnl,
issue=entity.issue,
- #jurisdiction
- #keyword
- #locator
- #medium
- #note
- #number
- #number-of-pages
- #number-of-volumes
- #original-publisher
- #original-publisher-place
- #original-title
+ # jurisdiction
+ # keyword
+ # locator
+ # medium
+ # note
+ # number
+ # number-of-pages
+ # number-of-volumes
+ # original-publisher
+ # original-publisher-place
+ # original-title
# TODO: page=entity.pages,
- page_first=entity.pages and entity.pages.split('-')[0],
+ page_first=entity.pages and entity.pages.split("-")[0],
PMCID=entity.ext_ids.pmcid,
PMID=entity.ext_ids.pmid,
publisher=(entity.container and entity.container.publisher) or entity.publisher,
- #publisher-place
- #references
- #reviewed-title
- #scale
- #section
- #source
- #status
+ # publisher-place
+ # references
+ # reviewed-title
+ # scale
+ # section
+ # source
+ # status
title=entity.title,
- #title-short
- #URL
- #version
+ # title-short
+ # URL
+ # version
volume=entity.volume,
- #year-suffix
+ # year-suffix
)
- for role in ['author', 'collection-editor', 'composer', 'container-author',
- 'director', 'editor', 'editorial-director', 'interviewer',
- 'illustrator', 'original-author', 'recipient', 'reviewed-author',
- 'translator']:
+ for role in [
+ "author",
+ "collection-editor",
+ "composer",
+ "container-author",
+ "director",
+ "editor",
+ "editorial-director",
+ "interviewer",
+ "illustrator",
+ "original-author",
+ "recipient",
+ "reviewed-author",
+ "translator",
+ ]:
cbr = contribs_by_role(contribs, role)
if cbr:
csl[role] = cbr
# underline-to-dash
- csl['container-title'] = csl.pop('container_title')
- csl['page-first'] = csl.pop('page_first')
- empty_keys = [k for k,v in csl.items() if not v]
+ csl["container-title"] = csl.pop("container_title")
+ csl["page-first"] = csl.pop("page_first")
+ empty_keys = [k for k, v in csl.items() if not v]
for k in empty_keys:
csl.pop(k)
return csl
@@ -184,10 +202,11 @@ def refs_to_csl(entity):
title=ref.title,
issued=issued_date,
)
- csl['id'] = ref.key or ref.index, # zero- or one-indexed?
+ csl["id"] = (ref.key or ref.index,) # zero- or one-indexed?
ret.append(csl)
return ret
+
def citeproc_csl(csl_json, style, html=False):
"""
Renders a release entity to a styled citation.
@@ -200,8 +219,8 @@ def citeproc_csl(csl_json, style, html=False):
Returns a string; if the html flag is set, and the style isn't 'csl-json'
or 'bibtex', it will be HTML. Otherwise plain text.
"""
- if not csl_json.get('id'):
- csl_json['id'] = "unknown"
+ if not csl_json.get("id"):
+ csl_json["id"] = "unknown"
if style == "csl-json":
return json.dumps(csl_json)
bib_src = CiteProcJSON([csl_json])
@@ -211,7 +230,7 @@ def citeproc_csl(csl_json, style, html=False):
style_path = get_style_filepath(style)
bib_style = CitationStylesStyle(style_path, validate=False)
bib = CitationStylesBibliography(bib_style, bib_src, form)
- bib.register(Citation([CitationItem(csl_json['id'])]))
+ bib.register(Citation([CitationItem(csl_json["id"])]))
lines = bib.bibliography()[0]
if style == "bibtex":
out = ""
@@ -222,6 +241,6 @@ def citeproc_csl(csl_json, style, html=False):
out += "\n " + line
else:
out += line
- return ''.join(out)
+ return "".join(out)
else:
- return ''.join(lines)
+ return "".join(lines)
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index 1826d4eb..e39e9ea4 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -1,4 +1,3 @@
-
import datetime
from typing import Any, Dict, Optional
@@ -13,13 +12,14 @@ from fatcat_openapi_client import (
def check_kbart(year: int, archive: dict) -> Optional[bool]:
- if not archive or not archive.get('year_spans'):
+ if not archive or not archive.get("year_spans"):
return None
- for span in archive['year_spans']:
+ for span in archive["year_spans"]:
if year >= span[0] and year <= span[1]:
return True
return False
+
def test_check_kbart() -> None:
assert check_kbart(1990, dict()) is None
@@ -40,87 +40,89 @@ def release_to_elasticsearch(entity: ReleaseEntity, force_bool: bool = True) ->
Raises exception on error (never returns None)
"""
- if entity.state in ('redirect', 'deleted'):
+ if entity.state in ("redirect", "deleted"):
return dict(
- ident = entity.ident,
- state = entity.state,
+ ident=entity.ident,
+ state=entity.state,
)
- elif entity.state != 'active':
+ elif entity.state != "active":
raise ValueError("Unhandled entity state: {}".format(entity.state))
# First, the easy ones (direct copy)
release = entity
t: Dict[str, Any] = dict(
- doc_index_ts=datetime.datetime.utcnow().isoformat()+"Z",
- ident = release.ident,
- state = release.state,
- revision = release.revision,
- work_id = release.work_id,
- title = release.title,
- subtitle = release.subtitle,
- original_title = release.original_title,
- release_type = release.release_type,
- release_stage = release.release_stage,
- withdrawn_status = release.withdrawn_status,
- language = release.language,
- volume = release.volume,
- issue = release.issue,
- pages = release.pages,
- number = release.number,
- license = release.license_slug,
- version = release.version,
- doi = release.ext_ids.doi,
- pmid = release.ext_ids.pmid,
- pmcid = release.ext_ids.pmcid,
- isbn13 = release.ext_ids.isbn13,
- wikidata_qid = release.ext_ids.wikidata_qid,
- core_id = release.ext_ids.core,
- arxiv_id = release.ext_ids.arxiv,
- jstor_id = release.ext_ids.jstor,
- ark_id = release.ext_ids.ark,
- mag_id = release.ext_ids.mag,
- dblp_id = release.ext_ids.dblp,
- doaj_id = release.ext_ids.doaj,
- hdl = release.ext_ids.hdl,
- tags = [],
+ doc_index_ts=datetime.datetime.utcnow().isoformat() + "Z",
+ ident=release.ident,
+ state=release.state,
+ revision=release.revision,
+ work_id=release.work_id,
+ title=release.title,
+ subtitle=release.subtitle,
+ original_title=release.original_title,
+ release_type=release.release_type,
+ release_stage=release.release_stage,
+ withdrawn_status=release.withdrawn_status,
+ language=release.language,
+ volume=release.volume,
+ issue=release.issue,
+ pages=release.pages,
+ number=release.number,
+ license=release.license_slug,
+ version=release.version,
+ doi=release.ext_ids.doi,
+ pmid=release.ext_ids.pmid,
+ pmcid=release.ext_ids.pmcid,
+ isbn13=release.ext_ids.isbn13,
+ wikidata_qid=release.ext_ids.wikidata_qid,
+ core_id=release.ext_ids.core,
+ arxiv_id=release.ext_ids.arxiv,
+ jstor_id=release.ext_ids.jstor,
+ ark_id=release.ext_ids.ark,
+ mag_id=release.ext_ids.mag,
+ dblp_id=release.ext_ids.dblp,
+ doaj_id=release.ext_ids.doaj,
+ hdl=release.ext_ids.hdl,
+ tags=[],
)
- t.update(dict(
- is_oa = None,
- is_longtail_oa = None,
- is_preserved = None,
- in_web = False,
- in_dweb = False,
- in_ia = False,
- in_ia_sim = False,
- in_kbart = None,
- in_jstor = False,
- in_doaj= bool(release.ext_ids.doaj),
- in_shadows = False,
- ))
+ t.update(
+ dict(
+ is_oa=None,
+ is_longtail_oa=None,
+ is_preserved=None,
+ in_web=False,
+ in_dweb=False,
+ in_ia=False,
+ in_ia_sim=False,
+ in_kbart=None,
+ in_jstor=False,
+ in_doaj=bool(release.ext_ids.doaj),
+ in_shadows=False,
+ )
+ )
release_year = release.release_year
if release.release_date:
# .isoformat() results in, eg, '2010-10-22' (YYYY-MM-DD)
- t['release_date'] = release.release_date.isoformat()
+ t["release_date"] = release.release_date.isoformat()
if not release_year:
release_year = release.release_date.year
if release_year:
- t['release_year'] = release_year
+ t["release_year"] = release_year
- t['any_abstract'] = len(release.abstracts or []) > 0
- t['ref_count'] = len(release.refs or [])
+ t["any_abstract"] = len(release.abstracts or []) > 0
+ t["ref_count"] = len(release.refs or [])
ref_release_ids = []
- for r in (release.refs or []):
+ for r in release.refs or []:
if r.target_release_id:
ref_release_ids.append(r.target_release_id)
- t['ref_release_ids'] = ref_release_ids
- t['ref_linked_count'] = len(ref_release_ids)
- t['contrib_count'] = len(release.contribs or [])
+ t["ref_release_ids"] = ref_release_ids
+ t["ref_linked_count"] = len(ref_release_ids)
+ t["contrib_count"] = len(release.contribs or [])
contrib_names = []
contrib_affiliations = []
creator_ids = []
- for c in (release.contribs or []):
+ for c in release.contribs or []:
if c.creator and c.creator.display_name:
contrib_names.append(c.creator.display_name)
elif c.raw_name:
@@ -132,193 +134,218 @@ def release_to_elasticsearch(entity: ReleaseEntity, force_bool: bool = True) ->
creator_ids.append(c.creator_id)
if c.raw_affiliation:
contrib_affiliations.append(c.raw_affiliation)
- t['contrib_names'] = contrib_names
- t['creator_ids'] = creator_ids
- t['affiliations'] = contrib_affiliations
+ t["contrib_names"] = contrib_names
+ t["creator_ids"] = creator_ids
+ t["affiliations"] = contrib_affiliations
# TODO: mapping... probably by lookup?
- t['affiliation_rors'] = None
+ t["affiliation_rors"] = None
if release.container:
t.update(_rte_container_helper(release.container, release_year))
# fall back to release-level container metadata if container not linked or
# missing context
- if not t.get('publisher'):
- t['publisher'] = release.publisher
- if not t.get('container_name') and release.extra:
- t['container_name'] = release.extra.get('container_name')
+ if not t.get("publisher"):
+ t["publisher"] = release.publisher
+ if not t.get("container_name") and release.extra:
+ t["container_name"] = release.extra.get("container_name")
- if release.ext_ids.jstor or (release.ext_ids.doi and release.ext_ids.doi.startswith('10.2307/')):
- t['in_jstor'] = True
+ if release.ext_ids.jstor or (
+ release.ext_ids.doi and release.ext_ids.doi.startswith("10.2307/")
+ ):
+ t["in_jstor"] = True
# transform file/fileset/webcapture related fields
t.update(_rte_content_helper(release))
if release.ext_ids.doaj:
- t['is_oa'] = True
+ t["is_oa"] = True
if release.license_slug:
# TODO: more/better checks here, particularly strict *not* OA licenses
if release.license_slug.startswith("CC-"):
- t['is_oa'] = True
+ t["is_oa"] = True
if release.license_slug.startswith("ARXIV-"):
- t['is_oa'] = True
+ t["is_oa"] = True
- t['is_work_alias'] = None
+ t["is_work_alias"] = None
extra = release.extra or dict()
if extra:
- if extra.get('is_oa'):
+ if extra.get("is_oa"):
# NOTE: not actually setting this anywhere... but could
- t['is_oa'] = True
- if extra.get('is_work_alias') is not None:
- t['is_work_alias'] = bool(extra.get('is_work_alias'))
- if extra.get('longtail_oa'):
+ t["is_oa"] = True
+ if extra.get("is_work_alias") is not None:
+ t["is_work_alias"] = bool(extra.get("is_work_alias"))
+ if extra.get("longtail_oa"):
# sometimes set by GROBID/matcher
- t['is_oa'] = True
- t['is_longtail_oa'] = True
- if not t.get('container_name'):
- t['container_name'] = extra.get('container_name')
- if extra.get('crossref'):
- if extra['crossref'].get('archive'):
+ t["is_oa"] = True
+ t["is_longtail_oa"] = True
+ if not t.get("container_name"):
+ t["container_name"] = extra.get("container_name")
+ if extra.get("crossref"):
+ if extra["crossref"].get("archive"):
# all crossref archives are KBART, I believe
- t['in_kbart'] = True
+ t["in_kbart"] = True
# backwards compatible subtitle fetching
- if not t['subtitle'] and extra.get('subtitle'):
- if type(extra['subtitle']) == list:
- t['subtitle'] = extra['subtitle'][0]
+ if not t["subtitle"] and extra.get("subtitle"):
+ if type(extra["subtitle"]) == list:
+ t["subtitle"] = extra["subtitle"][0]
else:
- t['subtitle'] = extra['subtitle']
+ t["subtitle"] = extra["subtitle"]
- t['first_page'] = None
+ t["first_page"] = None
if release.pages:
- first = release.pages.split('-')[0]
- first = first.replace('p', '')
+ first = release.pages.split("-")[0]
+ first = first.replace("p", "")
if first.isdigit():
- t['first_page'] = first
+ t["first_page"] = first
# TODO: non-numerical first pages
- t['ia_microfilm_url'] = None
- if t['in_ia_sim']:
+ t["ia_microfilm_url"] = None
+ if t["in_ia_sim"]:
# TODO: determine URL somehow? I think this is in flux. Will probably
# need extra metadata in the container extra field.
# special case as a demo for now.
- if release.container_id == "hl5g6d5msjcl7hlbyyvcsbhc2u" \
- and release.release_year in (2011, 2013) \
- and release.issue \
- and release.issue.isdigit() \
- and t['first_page']:
- t['ia_microfilm_url'] = "https://archive.org/details/sim_bjog_{}-{:02d}/page/n{}".format(
+ if (
+ release.container_id == "hl5g6d5msjcl7hlbyyvcsbhc2u"
+ and release.release_year in (2011, 2013)
+ and release.issue
+ and release.issue.isdigit()
+ and t["first_page"]
+ ):
+ t[
+ "ia_microfilm_url"
+ ] = "https://archive.org/details/sim_bjog_{}-{:02d}/page/n{}".format(
release.release_year,
int(release.issue) - 1,
- t['first_page'],
+ t["first_page"],
)
- t['doi_registrar'] = None
- if extra and t['doi']:
- for k in ('crossref', 'datacite', 'jalc'):
+ t["doi_registrar"] = None
+ if extra and t["doi"]:
+ for k in ("crossref", "datacite", "jalc"):
if k in extra:
- t['doi_registrar'] = k
- if 'doi_registrar' not in t:
- t['doi_registrar'] = 'crossref'
+ t["doi_registrar"] = k
+ if "doi_registrar" not in t:
+ t["doi_registrar"] = "crossref"
- if t['doi']:
- t['doi_prefix'] = t['doi'].split('/')[0]
+ if t["doi"]:
+ t["doi_prefix"] = t["doi"].split("/")[0]
- if t['is_longtail_oa']:
- t['is_oa'] = True
+ if t["is_longtail_oa"]:
+ t["is_oa"] = True
# optionally coerce all flags from Optional[bool] to bool
if force_bool:
- for k in ('is_oa', 'is_longtail_oa', 'in_kbart', 'in_ia_sim',
- 'in_jstor', 'in_web', 'in_dweb', 'in_shadows',
- 'is_work_alias'):
+ for k in (
+ "is_oa",
+ "is_longtail_oa",
+ "in_kbart",
+ "in_ia_sim",
+ "in_jstor",
+ "in_web",
+ "in_dweb",
+ "in_shadows",
+ "is_work_alias",
+ ):
t[k] = bool(t[k])
- t['in_ia'] = bool(t['in_ia'])
- t['is_preserved'] = bool(
- t['is_preserved']
- or t['in_ia']
- or t['in_kbart']
- or t['in_jstor']
- or t.get('pmcid')
- or t.get('arxiv_id')
+ t["in_ia"] = bool(t["in_ia"])
+ t["is_preserved"] = bool(
+ t["is_preserved"]
+ or t["in_ia"]
+ or t["in_kbart"]
+ or t["in_jstor"]
+ or t.get("pmcid")
+ or t.get("arxiv_id")
)
- if t['in_ia']:
- t['preservation'] = 'bright'
- elif t['is_preserved']:
- t['preservation'] = 'dark'
- elif t['in_shadows']:
- t['preservation'] = 'shadows_only'
+ if t["in_ia"]:
+ t["preservation"] = "bright"
+ elif t["is_preserved"]:
+ t["preservation"] = "dark"
+ elif t["in_shadows"]:
+ t["preservation"] = "shadows_only"
else:
- t['preservation'] = 'none'
+ t["preservation"] = "none"
return t
+
def _rte_container_helper(container: ContainerEntity, release_year: Optional[int]) -> dict:
"""
Container metadata sub-section of release_to_elasticsearch()
"""
this_year = datetime.date.today().year
t = dict()
- t['publisher'] = container.publisher
- t['container_name'] = container.name
+ t["publisher"] = container.publisher
+ t["container_name"] = container.name
# this is container.ident, not release.container_id, because there may
# be a redirect involved
- t['container_id'] = container.ident
- t['container_issnl'] = container.issnl
+ t["container_id"] = container.ident
+ t["container_issnl"] = container.issnl
issns = [container.issnl, container.issne, container.issnp]
issns = list(set([i for i in issns if i]))
- t['container_issns'] = issns
- t['container_type'] = container.container_type
- t['container_publication_status'] = container.publication_status
+ t["container_issns"] = issns
+ t["container_type"] = container.container_type
+ t["container_publication_status"] = container.publication_status
if container.extra:
c_extra = container.extra
- if c_extra.get('kbart') and release_year:
- if check_kbart(release_year, c_extra['kbart'].get('jstor')):
- t['in_jstor'] = True
- if t.get('in_kbart') or t.get('in_jstor'):
- t['in_kbart'] = True
- for archive in ('portico', 'lockss', 'clockss', 'pkp_pln',
- 'hathitrust', 'scholarsportal', 'cariniana'):
- t['in_kbart'] = t.get('in_kbart') or check_kbart(release_year, c_extra['kbart'].get(archive))
+ if c_extra.get("kbart") and release_year:
+ if check_kbart(release_year, c_extra["kbart"].get("jstor")):
+ t["in_jstor"] = True
+ if t.get("in_kbart") or t.get("in_jstor"):
+ t["in_kbart"] = True
+ for archive in (
+ "portico",
+ "lockss",
+ "clockss",
+ "pkp_pln",
+ "hathitrust",
+ "scholarsportal",
+ "cariniana",
+ ):
+ t["in_kbart"] = t.get("in_kbart") or check_kbart(
+ release_year, c_extra["kbart"].get(archive)
+ )
# recent KBART coverage is often not updated for the
# current year. So for current-year publications, consider
# coverage from *last* year to also be included in the
# Keeper
- if not t.get('in_kbart') and release_year == this_year:
- t['in_kbart'] = check_kbart(this_year - 1, c_extra['kbart'].get(archive))
-
- if c_extra.get('ia'):
- if c_extra['ia'].get('sim') and release_year:
- t['in_ia_sim'] = check_kbart(release_year, c_extra['ia']['sim'])
- if c_extra['ia'].get('longtail_oa'):
- t['is_longtail_oa'] = True
- if c_extra.get('sherpa_romeo'):
- if c_extra['sherpa_romeo'].get('color') == 'white':
- t['is_oa'] = False
- if c_extra.get('default_license') and c_extra.get('default_license').startswith('CC-'):
- t['is_oa'] = True
- if c_extra.get('doaj'):
- if c_extra['doaj'].get('as_of'):
- t['is_oa'] = True
- t['in_doaj'] = True
- if c_extra.get('road'):
- if c_extra['road'].get('as_of'):
- t['is_oa'] = True
- if c_extra.get('szczepanski'):
- if c_extra['szczepanski'].get('as_of'):
- t['is_oa'] = True
- if c_extra.get('country'):
- t['country_code'] = c_extra['country']
- t['country_code_upper'] = c_extra['country'].upper()
- if c_extra.get('publisher_type'):
- t['publisher_type'] = c_extra['publisher_type']
- if c_extra.get('discipline'):
- t['discipline'] = c_extra['discipline']
+ if not t.get("in_kbart") and release_year == this_year:
+ t["in_kbart"] = check_kbart(this_year - 1, c_extra["kbart"].get(archive))
+
+ if c_extra.get("ia"):
+ if c_extra["ia"].get("sim") and release_year:
+ t["in_ia_sim"] = check_kbart(release_year, c_extra["ia"]["sim"])
+ if c_extra["ia"].get("longtail_oa"):
+ t["is_longtail_oa"] = True
+ if c_extra.get("sherpa_romeo"):
+ if c_extra["sherpa_romeo"].get("color") == "white":
+ t["is_oa"] = False
+ if c_extra.get("default_license") and c_extra.get("default_license").startswith("CC-"):
+ t["is_oa"] = True
+ if c_extra.get("doaj"):
+ if c_extra["doaj"].get("as_of"):
+ t["is_oa"] = True
+ t["in_doaj"] = True
+ if c_extra.get("road"):
+ if c_extra["road"].get("as_of"):
+ t["is_oa"] = True
+ if c_extra.get("szczepanski"):
+ if c_extra["szczepanski"].get("as_of"):
+ t["is_oa"] = True
+ if c_extra.get("country"):
+ t["country_code"] = c_extra["country"]
+ t["country_code_upper"] = c_extra["country"].upper()
+ if c_extra.get("publisher_type"):
+ t["publisher_type"] = c_extra["publisher_type"]
+ if c_extra.get("discipline"):
+ t["discipline"] = c_extra["discipline"]
return t
+
def _rte_content_helper(release: ReleaseEntity) -> dict:
"""
File/FileSet/WebCapture sub-section of release_to_elasticsearch()
@@ -329,9 +356,9 @@ def _rte_content_helper(release: ReleaseEntity) -> dict:
- any other URL
"""
t = dict(
- file_count = len(release.files or []),
- fileset_count = len(release.filesets or []),
- webcapture_count = len(release.webcaptures or []),
+ file_count=len(release.files or []),
+ fileset_count=len(release.filesets or []),
+ webcapture_count=len(release.webcaptures or []),
)
any_pdf_url = None
@@ -340,38 +367,42 @@ def _rte_content_helper(release: ReleaseEntity) -> dict:
ia_pdf_url = None
for f in release.files or []:
- if f.extra and f.extra.get('shadows'):
- t['in_shadows'] = True
- is_pdf = 'pdf' in (f.mimetype or '')
- for release_url in (f.urls or []):
+ if f.extra and f.extra.get("shadows"):
+ t["in_shadows"] = True
+ is_pdf = "pdf" in (f.mimetype or "")
+ for release_url in f.urls or []:
# first generic flags
t.update(_rte_url_helper(release_url))
# then PDF specific stuff (for generating "best URL" fields)
- if not f.mimetype and 'pdf' in release_url.url.lower():
+ if not f.mimetype and "pdf" in release_url.url.lower():
is_pdf = True
if is_pdf:
any_pdf_url = release_url.url
- if release_url.rel in ('webarchive', 'repository', 'repo'):
+ if release_url.rel in ("webarchive", "repository", "repo"):
good_pdf_url = release_url.url
- if '//web.archive.org/' in release_url.url or '//archive.org/' in release_url.url:
+ if (
+ "//web.archive.org/" in release_url.url
+ or "//archive.org/" in release_url.url
+ ):
best_pdf_url = release_url.url
ia_pdf_url = release_url.url
# here is where we bake-in PDF url priority; IA-specific
- t['best_pdf_url'] = best_pdf_url or good_pdf_url or any_pdf_url
- t['ia_pdf_url'] = ia_pdf_url
+ t["best_pdf_url"] = best_pdf_url or good_pdf_url or any_pdf_url
+ t["ia_pdf_url"] = ia_pdf_url
for fs in release.filesets or []:
- for url_obj in (fs.urls or []):
+ for url_obj in fs.urls or []:
t.update(_rte_url_helper(url_obj))
for wc in release.webcaptures or []:
- for url_obj in (wc.archive_urls or []):
+ for url_obj in wc.archive_urls or []:
t.update(_rte_url_helper(url_obj))
return t
+
def _rte_url_helper(url_obj) -> dict:
"""
Takes a location URL ('url' and 'rel' keys) and returns generic preservation status.
@@ -382,17 +413,17 @@ def _rte_url_helper(url_obj) -> dict:
these will be iteratively update() into the overal object.
"""
t = dict()
- if url_obj.rel in ('webarchive', 'repository', 'archive', 'repo'):
- t['is_preserved'] = True
- if '//web.archive.org/' in url_obj.url or '//archive.org/' in url_obj.url:
- t['in_ia'] = True
- if url_obj.url.lower().startswith('http') or url_obj.url.lower().startswith('ftp'):
- t['in_web'] = True
- if url_obj.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'):
+ if url_obj.rel in ("webarchive", "repository", "archive", "repo"):
+ t["is_preserved"] = True
+ if "//web.archive.org/" in url_obj.url or "//archive.org/" in url_obj.url:
+ t["in_ia"] = True
+ if url_obj.url.lower().startswith("http") or url_obj.url.lower().startswith("ftp"):
+ t["in_web"] = True
+ if url_obj.rel in ("dweb", "p2p", "ipfs", "dat", "torrent"):
# not sure what rel will be for this stuff
- t['in_dweb'] = True
- if '//www.jstor.org/' in url_obj.url:
- t['in_jstor'] = True
+ t["in_dweb"] = True
+ if "//www.jstor.org/" in url_obj.url:
+ t["in_jstor"] = True
return t
@@ -404,50 +435,59 @@ def container_to_elasticsearch(entity, force_bool=True, stats=None):
Raises exception on error (never returns None)
"""
- if entity.state in ('redirect', 'deleted'):
+ if entity.state in ("redirect", "deleted"):
return dict(
- ident = entity.ident,
- state = entity.state,
+ ident=entity.ident,
+ state=entity.state,
)
- elif entity.state != 'active':
+ elif entity.state != "active":
raise ValueError("Unhandled entity state: {}".format(entity.state))
# First, the easy ones (direct copy)
t = dict(
- doc_index_ts=datetime.datetime.utcnow().isoformat()+"Z",
- ident = entity.ident,
- state = entity.state,
- revision = entity.revision,
-
- name = entity.name,
- publisher = entity.publisher,
- container_type = entity.container_type,
- publication_status= entity.publication_status,
- issnl = entity.issnl,
- issne = entity.issne,
- issnp = entity.issnp,
- wikidata_qid = entity.wikidata_qid,
+ doc_index_ts=datetime.datetime.utcnow().isoformat() + "Z",
+ ident=entity.ident,
+ state=entity.state,
+ revision=entity.revision,
+ name=entity.name,
+ publisher=entity.publisher,
+ container_type=entity.container_type,
+ publication_status=entity.publication_status,
+ issnl=entity.issnl,
+ issne=entity.issne,
+ issnp=entity.issnp,
+ wikidata_qid=entity.wikidata_qid,
)
if not entity.extra:
entity.extra = dict()
- for key in ('country', 'languages', 'mimetypes', 'original_name',
- 'first_year', 'last_year', 'aliases', 'abbrev', 'region',
- 'discipline', 'publisher_type'):
+ for key in (
+ "country",
+ "languages",
+ "mimetypes",
+ "original_name",
+ "first_year",
+ "last_year",
+ "aliases",
+ "abbrev",
+ "region",
+ "discipline",
+ "publisher_type",
+ ):
if entity.extra.get(key):
t[key] = entity.extra[key]
- if entity.extra.get('dblp') and entity.extra['dblp'].get('prefix'):
- t['dblp_prefix'] = entity.extra['dblp']['prefix']
+ if entity.extra.get("dblp") and entity.extra["dblp"].get("prefix"):
+ t["dblp_prefix"] = entity.extra["dblp"]["prefix"]
- if 'country' in t:
- t['country_code'] = t.pop('country')
+ if "country" in t:
+ t["country_code"] = t.pop("country")
- t['issns'] = [entity.issnl, entity.issne, entity.issnp]
- for key in ('issnp', 'issne'):
+ t["issns"] = [entity.issnl, entity.issne, entity.issnp]
+ for key in ("issnp", "issne"):
if entity.extra.get(key):
- t['issns'].append(entity.extra[key])
- t['issns'] = list(set([i for i in t['issns'] if i]))
+ t["issns"].append(entity.extra[key])
+ t["issns"] = list(set([i for i in t["issns"] if i]))
in_doaj = None
in_road = None
@@ -459,72 +499,72 @@ def container_to_elasticsearch(entity, force_bool=True, stats=None):
keepers = []
extra = entity.extra
- if extra.get('doaj'):
- if extra['doaj'].get('as_of'):
+ if extra.get("doaj"):
+ if extra["doaj"].get("as_of"):
in_doaj = True
- if extra.get('road'):
- if extra['road'].get('as_of'):
+ if extra.get("road"):
+ if extra["road"].get("as_of"):
in_road = True
- if extra.get('szczepanski'):
- if extra['szczepanski'].get('as_of'):
+ if extra.get("szczepanski"):
+ if extra["szczepanski"].get("as_of"):
is_oa = True
- if extra.get('default_license'):
- if extra['default_license'].startswith('CC-'):
+ if extra.get("default_license"):
+ if extra["default_license"].startswith("CC-"):
is_oa = True
- t['sherpa_romeo_color'] = None
- if extra.get('sherpa_romeo'):
- t['sherpa_romeo_color'] = extra['sherpa_romeo'].get('color')
- if extra['sherpa_romeo'].get('color') == 'white':
+ t["sherpa_romeo_color"] = None
+ if extra.get("sherpa_romeo"):
+ t["sherpa_romeo_color"] = extra["sherpa_romeo"].get("color")
+ if extra["sherpa_romeo"].get("color") == "white":
is_oa = False
- if extra.get('kbart'):
+ if extra.get("kbart"):
any_kbart = True
- if extra['kbart'].get('jstor'):
+ if extra["kbart"].get("jstor"):
any_jstor = True
- for k, v in extra['kbart'].items():
+ for k, v in extra["kbart"].items():
if v and isinstance(v, dict):
keepers.append(k)
- if extra.get('ia'):
- if extra['ia'].get('sim'):
+ if extra.get("ia"):
+ if extra["ia"].get("sim"):
any_ia_sim = True
- if extra['ia'].get('longtail_oa'):
+ if extra["ia"].get("longtail_oa"):
is_longtail_oa = True
- t['is_superceded'] = bool(extra.get('superceded'))
+ t["is_superceded"] = bool(extra.get("superceded"))
- t['keepers'] = keepers
- t['in_doaj'] = bool(in_doaj)
- t['in_road'] = bool(in_road)
- t['any_kbart'] = bool(any_kbart)
+ t["keepers"] = keepers
+ t["in_doaj"] = bool(in_doaj)
+ t["in_road"] = bool(in_road)
+ t["any_kbart"] = bool(any_kbart)
if force_bool:
- t['is_oa'] = bool(in_doaj or in_road or is_oa)
- t['is_longtail_oa'] = bool(is_longtail_oa)
- t['any_jstor'] = bool(any_jstor)
- t['any_ia_sim'] = bool(any_ia_sim)
+ t["is_oa"] = bool(in_doaj or in_road or is_oa)
+ t["is_longtail_oa"] = bool(is_longtail_oa)
+ t["any_jstor"] = bool(any_jstor)
+ t["any_ia_sim"] = bool(any_ia_sim)
else:
- t['is_oa'] = in_doaj or in_road or is_oa
- t['is_longtail_oa'] = is_longtail_oa
- t['any_jstor'] = any_jstor
- t['any_ia_sim'] = any_ia_sim
+ t["is_oa"] = in_doaj or in_road or is_oa
+ t["is_longtail_oa"] = is_longtail_oa
+ t["any_jstor"] = any_jstor
+ t["any_ia_sim"] = any_ia_sim
# mix in stats, if provided
if stats:
- t['releases_total'] = stats['total']
- t['preservation_bright'] = stats['preservation']['bright']
- t['preservation_dark'] = stats['preservation']['dark']
- t['preservation_shadows_only'] = stats['preservation']['shadows_only']
- t['preservation_none'] = stats['preservation']['none']
+ t["releases_total"] = stats["total"]
+ t["preservation_bright"] = stats["preservation"]["bright"]
+ t["preservation_dark"] = stats["preservation"]["dark"]
+ t["preservation_shadows_only"] = stats["preservation"]["shadows_only"]
+ t["preservation_none"] = stats["preservation"]["none"]
return t
def _type_of_edit(edit: EntityEdit) -> str:
if edit.revision is None and edit.redirect_ident is None:
- return 'delete'
+ return "delete"
elif edit.redirect_ident:
# redirect
- return 'update'
+ return "update"
elif edit.prev_revision is None and edit.redirect_ident is None and edit.revision:
- return 'create'
+ return "create"
else:
- return 'update'
+ return "update"
def changelog_to_elasticsearch(entity: ChangelogEntry) -> Dict[str, Any]:
@@ -536,7 +576,7 @@ def changelog_to_elasticsearch(entity: ChangelogEntry) -> Dict[str, Any]:
editgroup = entity.editgroup
t = dict(
- doc_index_ts=datetime.datetime.utcnow().isoformat()+"Z",
+ doc_index_ts=datetime.datetime.utcnow().isoformat() + "Z",
index=entity.index,
editgroup_id=entity.editgroup_id,
timestamp=entity.timestamp.isoformat(),
@@ -547,8 +587,8 @@ def changelog_to_elasticsearch(entity: ChangelogEntry) -> Dict[str, Any]:
)
extra = editgroup.extra or dict()
- if extra.get('agent'):
- t['agent'] = extra['agent']
+ if extra.get("agent"):
+ t["agent"] = extra["agent"]
containers = [_type_of_edit(e) for e in editgroup.edits.containers]
creators = [_type_of_edit(e) for e in editgroup.edits.creators]
@@ -558,27 +598,27 @@ def changelog_to_elasticsearch(entity: ChangelogEntry) -> Dict[str, Any]:
releases = [_type_of_edit(e) for e in editgroup.edits.releases]
works = [_type_of_edit(e) for e in editgroup.edits.works]
- t['containers'] = len(containers)
- t['new_containers'] = len([e for e in containers if e == 'create'])
- t['creators'] = len(creators)
- t['new_creators'] = len([e for e in creators if e == 'create'])
- t['files'] = len(files)
- t['new_files'] = len([e for e in files if e == 'create'])
- t['filesets'] = len(filesets)
- t['new_filesets'] = len([e for e in filesets if e == 'create'])
- t['webcaptures'] = len(webcaptures)
- t['new_webcaptures'] = len([e for e in webcaptures if e == 'create'])
- t['releases'] = len(releases)
- t['new_releases'] = len([e for e in releases if e == 'create'])
- t['works'] = len(works)
- t['new_works'] = len([e for e in works if e == 'create'])
+ t["containers"] = len(containers)
+ t["new_containers"] = len([e for e in containers if e == "create"])
+ t["creators"] = len(creators)
+ t["new_creators"] = len([e for e in creators if e == "create"])
+ t["files"] = len(files)
+ t["new_files"] = len([e for e in files if e == "create"])
+ t["filesets"] = len(filesets)
+ t["new_filesets"] = len([e for e in filesets if e == "create"])
+ t["webcaptures"] = len(webcaptures)
+ t["new_webcaptures"] = len([e for e in webcaptures if e == "create"])
+ t["releases"] = len(releases)
+ t["new_releases"] = len([e for e in releases if e == "create"])
+ t["works"] = len(works)
+ t["new_works"] = len([e for e in works if e == "create"])
all_edits = containers + creators + files + filesets + webcaptures + releases + works
- t['created'] = len([e for e in all_edits if e == 'create'])
- t['updated'] = len([e for e in all_edits if e == 'update'])
- t['deleted'] = len([e for e in all_edits if e == 'delete'])
- t['total'] = len(all_edits)
+ t["created"] = len([e for e in all_edits if e == "create"])
+ t["updated"] = len([e for e in all_edits if e == "update"])
+ t["deleted"] = len([e for e in all_edits if e == "delete"])
+ t["total"] = len(all_edits)
return t
@@ -590,47 +630,47 @@ def file_to_elasticsearch(entity: FileEntity) -> Dict[str, Any]:
Raises exception on error (never returns None)
"""
- if entity.state in ('redirect', 'deleted'):
+ if entity.state in ("redirect", "deleted"):
return dict(
- ident = entity.ident,
- state = entity.state,
+ ident=entity.ident,
+ state=entity.state,
)
- elif entity.state != 'active':
+ elif entity.state != "active":
raise ValueError("Unhandled entity state: {}".format(entity.state))
# First, the easy ones (direct copy)
t = dict(
- doc_index_ts=datetime.datetime.utcnow().isoformat()+"Z",
- ident = entity.ident,
- state = entity.state,
- revision = entity.revision,
- release_ids = entity.release_ids,
- release_count = len(entity.release_ids),
- mimetype = entity.mimetype,
- size_bytes = entity.size,
- sha1 = entity.sha1,
- sha256 = entity.sha256,
- md5 = entity.md5,
+ doc_index_ts=datetime.datetime.utcnow().isoformat() + "Z",
+ ident=entity.ident,
+ state=entity.state,
+ revision=entity.revision,
+ release_ids=entity.release_ids,
+ release_count=len(entity.release_ids),
+ mimetype=entity.mimetype,
+ size_bytes=entity.size,
+ sha1=entity.sha1,
+ sha256=entity.sha256,
+ md5=entity.md5,
)
parsed_urls = [tldextract.extract(u.url) for u in entity.urls]
- t['hosts'] = list(set(['.'.join([seg for seg in pu if seg]) for pu in parsed_urls]))
- t['domains'] = list(set([pu.registered_domain for pu in parsed_urls]))
- t['rels'] = list(set([u.rel for u in entity.urls]))
+ t["hosts"] = list(set([".".join([seg for seg in pu if seg]) for pu in parsed_urls]))
+ t["domains"] = list(set([pu.registered_domain for pu in parsed_urls]))
+ t["rels"] = list(set([u.rel for u in entity.urls]))
- t['in_ia'] = bool('archive.org' in t['domains'])
- t['in_ia_petabox'] = bool('archive.org' in t['hosts'])
+ t["in_ia"] = bool("archive.org" in t["domains"])
+ t["in_ia_petabox"] = bool("archive.org" in t["hosts"])
any_url = None
good_url = None
best_url = None
- for release_url in (entity.urls or []):
+ for release_url in entity.urls or []:
any_url = release_url.url
- if release_url.rel in ('webarchive', 'repository'):
+ if release_url.rel in ("webarchive", "repository"):
good_url = release_url.url
- if '//web.archive.org/' in release_url.url or '//archive.org/' in release_url.url:
+ if "//web.archive.org/" in release_url.url or "//archive.org/" in release_url.url:
best_url = release_url.url
# here is where we bake-in priority; IA-specific
- t['best_url'] = best_url or good_url or any_url
+ t["best_url"] = best_url or good_url or any_url
return t
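Annotation: since check_kbart() drives much of the preservation logic above (in_kbart, in_jstor, in_ia_sim), here is a small worked example of its behavior against the year-span structure it reads from container extra metadata; the span values themselves are made up.

from fatcat_tools.transforms.elasticsearch import check_kbart

archive = {"year_spans": [[1995, 2004], [2006, 2019]]}

assert check_kbart(2000, archive) is True   # inside the first span
assert check_kbart(2005, archive) is False  # in the gap between spans
assert check_kbart(2019, archive) is True   # spans are inclusive on both ends
assert check_kbart(2020, archive) is False  # after all spans
assert check_kbart(2020, {}) is None        # no coverage info at all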
diff --git a/python/fatcat_tools/transforms/ingest.py b/python/fatcat_tools/transforms/ingest.py
index 9101a4ec..30b5b190 100644
--- a/python/fatcat_tools/transforms/ingest.py
+++ b/python/fatcat_tools/transforms/ingest.py
@@ -1,4 +1,3 @@
-
INGEST_TYPE_CONTAINER_MAP = {
# Optica
"twtpsm6ytje3nhuqfu3pa7ca7u": "html",
@@ -14,7 +13,8 @@ INGEST_TYPE_CONTAINER_MAP = {
"lovwr7ladjagzkhmoaszg7efqu": "html",
}
-def release_ingest_request(release, ingest_request_source='fatcat', ingest_type=None):
+
+def release_ingest_request(release, ingest_request_source="fatcat", ingest_type=None):
"""
Takes a full release entity object and returns an ingest request (as dict),
or None if it seems like this release shouldn't be ingested.
@@ -27,27 +27,35 @@ def release_ingest_request(release, ingest_request_source='fatcat', ingest_type=
calling code should check the returned type field.
"""
- if release.state != 'active':
+ if release.state != "active":
return None
if (not ingest_type) and release.container_id:
ingest_type = INGEST_TYPE_CONTAINER_MAP.get(release.container_id)
if not ingest_type:
- if release.release_type == 'stub':
+ if release.release_type == "stub":
return None
- elif release.release_type in ['component', 'graphic']:
- ingest_type = 'component'
- elif release.release_type == 'dataset':
- ingest_type = 'dataset'
- elif release.release_type == 'software':
- ingest_type = 'software'
- elif release.release_type == 'post-weblog':
- ingest_type = 'html'
- elif release.release_type in ['article-journal', 'article', 'chapter', 'paper-conference', 'book', 'report', 'thesis']:
- ingest_type = 'pdf'
+ elif release.release_type in ["component", "graphic"]:
+ ingest_type = "component"
+ elif release.release_type == "dataset":
+ ingest_type = "dataset"
+ elif release.release_type == "software":
+ ingest_type = "software"
+ elif release.release_type == "post-weblog":
+ ingest_type = "html"
+ elif release.release_type in [
+ "article-journal",
+ "article",
+ "chapter",
+ "paper-conference",
+ "book",
+ "report",
+ "thesis",
+ ]:
+ ingest_type = "pdf"
else:
- ingest_type = 'pdf'
+ ingest_type = "pdf"
# generate a URL where we expect to find fulltext
url = None
@@ -59,8 +67,10 @@ def release_ingest_request(release, ingest_request_source='fatcat', ingest_type=
link_source_id = release.ext_ids.arxiv
elif release.ext_ids.pmcid and ingest_type == "pdf":
# TODO: how to tell if an author manuscript in PMC vs. published?
- #url = "https://www.ncbi.nlm.nih.gov/pmc/articles/{}/pdf/".format(release.ext_ids.pmcid)
- url = "http://europepmc.org/backend/ptpmcrender.fcgi?accid={}&blobtype=pdf".format(release.ext_ids.pmcid)
+ # url = "https://www.ncbi.nlm.nih.gov/pmc/articles/{}/pdf/".format(release.ext_ids.pmcid)
+ url = "http://europepmc.org/backend/ptpmcrender.fcgi?accid={}&blobtype=pdf".format(
+ release.ext_ids.pmcid
+ )
link_source = "pmc"
link_source_id = release.ext_ids.pmcid
elif release.ext_ids.doi:
@@ -75,19 +85,19 @@ def release_ingest_request(release, ingest_request_source='fatcat', ingest_type=
ext_ids = dict([(k, v) for (k, v) in ext_ids.items() if v])
ingest_request = {
- 'ingest_type': ingest_type,
- 'ingest_request_source': ingest_request_source,
- 'base_url': url,
- 'release_stage': release.release_stage,
- 'fatcat': {
- 'release_ident': release.ident,
- 'work_ident': release.work_id,
+ "ingest_type": ingest_type,
+ "ingest_request_source": ingest_request_source,
+ "base_url": url,
+ "release_stage": release.release_stage,
+ "fatcat": {
+ "release_ident": release.ident,
+ "work_ident": release.work_id,
},
- 'ext_ids': ext_ids,
+ "ext_ids": ext_ids,
}
if link_source and link_source_id:
- ingest_request['link_source'] = link_source
- ingest_request['link_source_id'] = link_source_id
+ ingest_request["link_source"] = link_source
+ ingest_request["link_source_id"] = link_source_id
return ingest_request
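Annotation: a hypothetical caller of the reformatted release_ingest_request(), using the signature shown in this diff. The "fatcat-changelog" source string mirrors the one used by the changelog worker further down; the pdf-only filter is purely illustrative.

from fatcat_tools.transforms.ingest import release_ingest_request


def maybe_ingest_request(release):
    request = release_ingest_request(release, ingest_request_source="fatcat-changelog")
    if request is None:
        return None  # inactive releases, stubs, etc. (see docstring above)
    # per the docstring, callers should check the resulting ingest type themselves
    if request["ingest_type"] != "pdf":
        return None
    return request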
diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py
index a61e364c..1e4cb41d 100644
--- a/python/fatcat_tools/workers/changelog.py
+++ b/python/fatcat_tools/workers/changelog.py
@@ -1,4 +1,3 @@
-
import json
import time
@@ -16,11 +15,9 @@ class ChangelogWorker(FatcatWorker):
"""
def __init__(self, api, kafka_hosts, produce_topic, poll_interval=10.0, offset=None):
- super().__init__(kafka_hosts=kafka_hosts,
- produce_topic=produce_topic,
- api=api)
+ super().__init__(kafka_hosts=kafka_hosts, produce_topic=produce_topic, api=api)
self.poll_interval = poll_interval
- self.offset = offset # the fatcat changelog offset, not the kafka offset
+ self.offset = offset # the fatcat changelog offset, not the kafka offset
def run(self):
@@ -31,7 +28,7 @@ class ChangelogWorker(FatcatWorker):
print("Checking for most recent changelog offset...")
msg = most_recent_message(self.produce_topic, self.kafka_config)
if msg:
- self.offset = json.loads(msg.decode('utf-8'))['index']
+ self.offset = json.loads(msg.decode("utf-8"))["index"]
else:
self.offset = 0
print("Most recent changelog index in Kafka seems to be {}".format(self.offset))
@@ -44,28 +41,29 @@ class ChangelogWorker(FatcatWorker):
raise KafkaException(err)
producer_conf = self.kafka_config.copy()
- producer_conf.update({
- 'delivery.report.only.error': True,
- 'default.topic.config': {
- 'request.required.acks': -1, # all brokers must confirm
- },
- })
+ producer_conf.update(
+ {
+ "delivery.report.only.error": True,
+ "default.topic.config": {
+ "request.required.acks": -1, # all brokers must confirm
+ },
+ }
+ )
producer = Producer(producer_conf)
while True:
latest = int(self.api.get_changelog(limit=1)[0].index)
if latest > self.offset:
- print("Fetching changelogs from {} through {}".format(
- self.offset+1, latest))
- for i in range(self.offset+1, latest+1):
+ print("Fetching changelogs from {} through {}".format(self.offset + 1, latest))
+ for i in range(self.offset + 1, latest + 1):
cle = self.api.get_changelog_entry(i)
obj = self.api.api_client.sanitize_for_serialization(cle)
producer.produce(
self.produce_topic,
- json.dumps(obj).encode('utf-8'),
+ json.dumps(obj).encode("utf-8"),
key=str(i),
on_delivery=fail_fast,
- #NOTE timestamp could be timestamp=cle.timestamp (?)
+ # NOTE timestamp could be timestamp=cle.timestamp (?)
)
self.offset = i
producer.flush()
@@ -79,12 +77,19 @@ class EntityUpdatesWorker(FatcatWorker):
from API) to update topics.
"""
- def __init__(self, api, kafka_hosts, consume_topic, release_topic,
- file_topic, container_topic, ingest_file_request_topic,
- work_ident_topic, poll_interval=5.0):
- super().__init__(kafka_hosts=kafka_hosts,
- consume_topic=consume_topic,
- api=api)
+ def __init__(
+ self,
+ api,
+ kafka_hosts,
+ consume_topic,
+ release_topic,
+ file_topic,
+ container_topic,
+ ingest_file_request_topic,
+ work_ident_topic,
+ poll_interval=5.0,
+ ):
+ super().__init__(kafka_hosts=kafka_hosts, consume_topic=consume_topic, api=api)
self.release_topic = release_topic
self.file_topic = file_topic
self.container_topic = container_topic
@@ -150,7 +155,7 @@ class EntityUpdatesWorker(FatcatWorker):
# Transactions of the Japan Society of Mechanical Engineers
"10.1299/kikai",
# protocols.io
- "10.17504/"
+ "10.17504/",
]
def want_live_ingest(self, release, ingest_request):
@@ -163,40 +168,40 @@ class EntityUpdatesWorker(FatcatWorker):
ingest crawling (via wayback SPN).
"""
- link_source = ingest_request.get('ingest_request')
- ingest_type = ingest_request.get('ingest_type')
- doi = ingest_request.get('ext_ids', {}).get('doi')
+ link_source = ingest_request.get("ingest_request")
+ ingest_type = ingest_request.get("ingest_type")
+ doi = ingest_request.get("ext_ids", {}).get("doi")
es = release_to_elasticsearch(release)
is_document = release.release_type in (
- 'article',
- 'article-journal',
- 'article-newspaper',
- 'book',
- 'chapter',
- 'editorial',
- 'interview',
- 'legal_case',
- 'legislation',
- 'letter',
- 'manuscript',
- 'paper-conference',
- 'patent',
- 'peer_review',
- 'post',
- 'report',
- 'retraction',
- 'review',
- 'review-book',
- 'thesis',
+ "article",
+ "article-journal",
+ "article-newspaper",
+ "book",
+ "chapter",
+ "editorial",
+ "interview",
+ "legal_case",
+ "legislation",
+ "letter",
+ "manuscript",
+ "paper-conference",
+ "patent",
+ "peer_review",
+ "post",
+ "report",
+ "retraction",
+ "review",
+ "review-book",
+ "thesis",
)
is_not_pdf = release.release_type in (
- 'component',
- 'dataset',
- 'figure',
- 'graphic',
- 'software',
- 'stub',
+ "component",
+ "dataset",
+ "figure",
+ "graphic",
+ "software",
+ "stub",
)
# accept list sets a default "crawl it" despite OA metadata for
@@ -207,19 +212,23 @@ class EntityUpdatesWorker(FatcatWorker):
if doi.startswith(prefix):
in_acceptlist = True
- if self.ingest_oa_only and link_source not in ('arxiv', 'pmc'):
+ if self.ingest_oa_only and link_source not in ("arxiv", "pmc"):
# most datacite documents are in IRs and should be crawled
is_datacite_doc = False
- if release.extra and ('datacite' in release.extra) and is_document:
+ if release.extra and ("datacite" in release.extra) and is_document:
is_datacite_doc = True
- if not (es['is_oa'] or in_acceptlist or is_datacite_doc):
+ if not (es["is_oa"] or in_acceptlist or is_datacite_doc):
return False
# big publishers *generally* have accurate OA metadata, use
# preservation networks, and block our crawlers. So unless OA, or
# explicitly on accept list, or not preserved, skip crawling
- if es.get('publisher_type') == 'big5' and es.get('is_preserved') and not (es['is_oa'] or in_acceptlist):
+ if (
+ es.get("publisher_type") == "big5"
+ and es.get("is_preserved")
+ and not (es["is_oa"] or in_acceptlist)
+ ):
return False
# if ingest_type is pdf but release_type is almost certainly not a PDF,
@@ -233,23 +242,24 @@ class EntityUpdatesWorker(FatcatWorker):
return False
# figshare
- if doi and (doi.startswith('10.6084/') or doi.startswith('10.25384/')):
+ if doi and (doi.startswith("10.6084/") or doi.startswith("10.25384/")):
# don't crawl "most recent version" (aka "group") DOIs
if not release.version:
return False
# zenodo
- if doi and doi.startswith('10.5281/'):
+ if doi and doi.startswith("10.5281/"):
# if this is a "grouping" DOI of multiple "version" DOIs, do not crawl (will crawl the versioned DOIs)
- if release.extra and release.extra.get('relations'):
- for rel in release.extra['relations']:
- if (rel.get('relationType') == 'HasVersion' and rel.get('relatedIdentifier', '').startswith('10.5281/')):
+ if release.extra and release.extra.get("relations"):
+ for rel in release.extra["relations"]:
+ if rel.get("relationType") == "HasVersion" and rel.get(
+ "relatedIdentifier", ""
+ ).startswith("10.5281/"):
return False
return True
def run(self):
-
def fail_fast(err, msg):
if err is not None:
print("Kafka producer delivery error: {}".format(err))
@@ -278,36 +288,40 @@ class EntityUpdatesWorker(FatcatWorker):
for p in partitions:
if p.error:
raise KafkaException(p.error)
- print("Kafka partitions rebalanced: {} / {}".format(
- consumer, partitions))
+ print("Kafka partitions rebalanced: {} / {}".format(consumer, partitions))
consumer_conf = self.kafka_config.copy()
- consumer_conf.update({
- 'group.id': self.consumer_group,
- 'on_commit': fail_fast,
- # messages don't have offset marked as stored until pushed to
- # elastic, but we do auto-commit stored offsets to broker
- 'enable.auto.commit': True,
- 'enable.auto.offset.store': False,
- # user code timeout; if no poll after this long, assume user code
- # hung and rebalance (default: 5min)
- 'max.poll.interval.ms': 180000,
- 'default.topic.config': {
- 'auto.offset.reset': 'latest',
- },
- })
+ consumer_conf.update(
+ {
+ "group.id": self.consumer_group,
+ "on_commit": fail_fast,
+ # messages don't have offset marked as stored until pushed to
+ # elastic, but we do auto-commit stored offsets to broker
+ "enable.auto.commit": True,
+ "enable.auto.offset.store": False,
+ # user code timeout; if no poll after this long, assume user code
+ # hung and rebalance (default: 5min)
+ "max.poll.interval.ms": 180000,
+ "default.topic.config": {
+ "auto.offset.reset": "latest",
+ },
+ }
+ )
consumer = Consumer(consumer_conf)
producer_conf = self.kafka_config.copy()
- producer_conf.update({
- 'delivery.report.only.error': True,
- 'default.topic.config': {
- 'request.required.acks': -1, # all brokers must confirm
- },
- })
+ producer_conf.update(
+ {
+ "delivery.report.only.error": True,
+ "default.topic.config": {
+ "request.required.acks": -1, # all brokers must confirm
+ },
+ }
+ )
producer = Producer(producer_conf)
- consumer.subscribe([self.consume_topic],
+ consumer.subscribe(
+ [self.consume_topic],
on_assign=on_rebalance,
on_revoke=on_rebalance,
)
@@ -316,14 +330,16 @@ class EntityUpdatesWorker(FatcatWorker):
while True:
msg = consumer.poll(self.poll_interval)
if not msg:
- print("nothing new from kafka (poll_interval: {} sec)".format(self.poll_interval))
+ print(
+ "nothing new from kafka (poll_interval: {} sec)".format(self.poll_interval)
+ )
continue
if msg.error():
raise KafkaException(msg.error())
- cle = json.loads(msg.value().decode('utf-8'))
- #print(cle)
- print("processing changelog index {}".format(cle['index']))
+ cle = json.loads(msg.value().decode("utf-8"))
+ # print(cle)
+ print("processing changelog index {}".format(cle["index"]))
release_ids = []
new_release_ids = []
file_ids = []
@@ -331,27 +347,27 @@ class EntityUpdatesWorker(FatcatWorker):
webcapture_ids = []
container_ids = []
work_ids = []
- release_edits = cle['editgroup']['edits']['releases']
+ release_edits = cle["editgroup"]["edits"]["releases"]
for re in release_edits:
- release_ids.append(re['ident'])
+ release_ids.append(re["ident"])
# filter to direct release edits which are not updates
- if not re.get('prev_revision') and not re.get('redirect_ident'):
- new_release_ids.append(re['ident'])
- file_edits = cle['editgroup']['edits']['files']
+ if not re.get("prev_revision") and not re.get("redirect_ident"):
+ new_release_ids.append(re["ident"])
+ file_edits = cle["editgroup"]["edits"]["files"]
for e in file_edits:
- file_ids.append(e['ident'])
- fileset_edits = cle['editgroup']['edits']['filesets']
+ file_ids.append(e["ident"])
+ fileset_edits = cle["editgroup"]["edits"]["filesets"]
for e in fileset_edits:
- fileset_ids.append(e['ident'])
- webcapture_edits = cle['editgroup']['edits']['webcaptures']
+ fileset_ids.append(e["ident"])
+ webcapture_edits = cle["editgroup"]["edits"]["webcaptures"]
for e in webcapture_edits:
- webcapture_ids.append(e['ident'])
- container_edits = cle['editgroup']['edits']['containers']
+ webcapture_ids.append(e["ident"])
+ container_edits = cle["editgroup"]["edits"]["containers"]
for e in container_edits:
- container_ids.append(e['ident'])
- work_edits = cle['editgroup']['edits']['works']
+ container_ids.append(e["ident"])
+ work_edits = cle["editgroup"]["edits"]["works"]
for e in work_edits:
- work_ids.append(e['ident'])
+ work_ids.append(e["ident"])
# TODO: do these fetches in parallel using a thread pool?
for ident in set(file_ids):
@@ -363,8 +379,8 @@ class EntityUpdatesWorker(FatcatWorker):
file_dict = self.api.api_client.sanitize_for_serialization(file_entity)
producer.produce(
self.file_topic,
- json.dumps(file_dict).encode('utf-8'),
- key=ident.encode('utf-8'),
+ json.dumps(file_dict).encode("utf-8"),
+ key=ident.encode("utf-8"),
on_delivery=fail_fast,
)
@@ -385,30 +401,34 @@ class EntityUpdatesWorker(FatcatWorker):
container_dict = self.api.api_client.sanitize_for_serialization(container)
producer.produce(
self.container_topic,
- json.dumps(container_dict).encode('utf-8'),
- key=ident.encode('utf-8'),
+ json.dumps(container_dict).encode("utf-8"),
+ key=ident.encode("utf-8"),
on_delivery=fail_fast,
)
for ident in set(release_ids):
- release = self.api.get_release(ident, expand="files,filesets,webcaptures,container")
+ release = self.api.get_release(
+ ident, expand="files,filesets,webcaptures,container"
+ )
if release.work_id:
work_ids.append(release.work_id)
release_dict = self.api.api_client.sanitize_for_serialization(release)
producer.produce(
self.release_topic,
- json.dumps(release_dict).encode('utf-8'),
- key=ident.encode('utf-8'),
+ json.dumps(release_dict).encode("utf-8"),
+ key=ident.encode("utf-8"),
on_delivery=fail_fast,
)
# for ingest requests, filter to "new" active releases with no matched files
if release.ident in new_release_ids:
- ir = release_ingest_request(release, ingest_request_source='fatcat-changelog')
+ ir = release_ingest_request(
+ release, ingest_request_source="fatcat-changelog"
+ )
if ir and not release.files and self.want_live_ingest(release, ir):
producer.produce(
self.ingest_file_request_topic,
- json.dumps(ir).encode('utf-8'),
- #key=None,
+ json.dumps(ir).encode("utf-8"),
+ # key=None,
on_delivery=fail_fast,
)
@@ -420,13 +440,13 @@ class EntityUpdatesWorker(FatcatWorker):
key=key,
type="fatcat_work",
work_ident=ident,
- updated=cle['timestamp'],
- fatcat_changelog_index=cle['index'],
+ updated=cle["timestamp"],
+ fatcat_changelog_index=cle["index"],
)
producer.produce(
self.work_ident_topic,
- json.dumps(work_ident_dict).encode('utf-8'),
- key=key.encode('utf-8'),
+ json.dumps(work_ident_dict).encode("utf-8"),
+ key=key.encode("utf-8"),
on_delivery=fail_fast,
)
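
As context for the reformatted changelog worker above, here is a minimal standalone sketch of the pattern it implements: decode a changelog entry, group the edited entity idents by type, and flag "new" releases (no previous revision and not a redirect) for possible ingest. The example entry, the helper name, and the commented produce() call are illustrative assumptions, not the actual fatcat schema or deployment values.

import json

# Hypothetical changelog entry, shaped like the `cle` dict decoded in the worker.
cle = {
    "index": 12345,
    "timestamp": "2021-11-02T18:14:59Z",
    "editgroup": {
        "edits": {
            "releases": [
                {"ident": "exampleident", "prev_revision": None, "redirect_ident": None}
            ],
            "files": [],
            "filesets": [],
            "webcaptures": [],
            "containers": [],
            "works": [],
        }
    },
}


def collect_idents(cle):
    """Group edited entity idents by type, mirroring the loops in the worker."""
    idents = {
        entity_type: [e["ident"] for e in cle["editgroup"]["edits"][entity_type]]
        for entity_type in ("releases", "files", "filesets", "webcaptures", "containers", "works")
    }
    # "new" releases are direct creations: no previous revision and not a redirect
    idents["new_releases"] = [
        e["ident"]
        for e in cle["editgroup"]["edits"]["releases"]
        if not e.get("prev_revision") and not e.get("redirect_ident")
    ]
    return idents


print(json.dumps(collect_idents(cle), indent=2))
# Each entity is then fetched via the API and re-published, UTF-8 encoded and
# keyed by ident, roughly as:
#   producer.produce(topic, json.dumps(entity_dict).encode("utf-8"),
#                    key=ident.encode("utf-8"), on_delivery=fail_fast)
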
diff --git a/python/fatcat_tools/workers/elasticsearch.py b/python/fatcat_tools/workers/elasticsearch.py
index f411073d..0d75f964 100644
--- a/python/fatcat_tools/workers/elasticsearch.py
+++ b/python/fatcat_tools/workers/elasticsearch.py
@@ -1,4 +1,3 @@
-
import json
import sys
@@ -26,12 +25,20 @@ class ElasticsearchReleaseWorker(FatcatWorker):
Uses a consumer group to manage offset.
"""
- def __init__(self, kafka_hosts, consume_topic, poll_interval=10.0, offset=None,
- elasticsearch_backend="http://localhost:9200", elasticsearch_index="fatcat",
- elasticsearch_release_index="fatcat_releases",
- batch_size=200, api_host="https://api.fatcat.wiki/v0", query_stats=False):
- super().__init__(kafka_hosts=kafka_hosts,
- consume_topic=consume_topic)
+ def __init__(
+ self,
+ kafka_hosts,
+ consume_topic,
+ poll_interval=10.0,
+ offset=None,
+ elasticsearch_backend="http://localhost:9200",
+ elasticsearch_index="fatcat",
+ elasticsearch_release_index="fatcat_releases",
+ batch_size=200,
+ api_host="https://api.fatcat.wiki/v0",
+ query_stats=False,
+ ):
+ super().__init__(kafka_hosts=kafka_hosts, consume_topic=consume_topic)
self.consumer_group = "elasticsearch-updates3"
self.batch_size = batch_size
self.poll_interval = poll_interval
@@ -63,45 +70,53 @@ class ElasticsearchReleaseWorker(FatcatWorker):
print("Bailing out...", file=sys.stderr)
# TODO: should it be sys.exit(-1)?
raise KafkaException(p.error)
- #print("Kafka consumer commit successful")
+ # print("Kafka consumer commit successful")
pass
def on_rebalance(consumer, partitions):
for p in partitions:
if p.error:
raise KafkaException(p.error)
- print("Kafka partitions rebalanced: {} / {}".format(
- consumer, partitions), file=sys.stderr)
+ print(
+ "Kafka partitions rebalanced: {} / {}".format(consumer, partitions),
+ file=sys.stderr,
+ )
consumer_conf = self.kafka_config.copy()
- consumer_conf.update({
- 'group.id': self.consumer_group,
- 'on_commit': fail_fast,
- # messages don't have offset marked as stored until pushed to
- # elastic, but we do auto-commit stored offsets to broker
- 'enable.auto.commit': True,
- 'enable.auto.offset.store': False,
- # user code timeout; if no poll after this long, assume user code
- # hung and rebalance (default: 5min)
- 'max.poll.interval.ms': 60000,
- 'default.topic.config': {
- 'auto.offset.reset': 'latest',
- },
- })
+ consumer_conf.update(
+ {
+ "group.id": self.consumer_group,
+ "on_commit": fail_fast,
+ # messages don't have offset marked as stored until pushed to
+ # elastic, but we do auto-commit stored offsets to broker
+ "enable.auto.commit": True,
+ "enable.auto.offset.store": False,
+ # user code timeout; if no poll after this long, assume user code
+ # hung and rebalance (default: 5min)
+ "max.poll.interval.ms": 60000,
+ "default.topic.config": {
+ "auto.offset.reset": "latest",
+ },
+ }
+ )
consumer = Consumer(consumer_conf)
- consumer.subscribe([self.consume_topic],
+ consumer.subscribe(
+ [self.consume_topic],
on_assign=on_rebalance,
on_revoke=on_rebalance,
)
while True:
- batch = consumer.consume(
- num_messages=self.batch_size,
- timeout=self.poll_interval)
+ batch = consumer.consume(num_messages=self.batch_size, timeout=self.poll_interval)
if not batch:
if not consumer.assignment():
print("... no Kafka consumer partitions assigned yet", file=sys.stderr)
- print("... nothing new from kafka, try again (interval: {}".format(self.poll_interval), file=sys.stderr)
+ print(
+ "... nothing new from kafka, try again (interval: {}".format(
+ self.poll_interval
+ ),
+ file=sys.stderr,
+ )
continue
print("... got {} kafka messages".format(len(batch)), file=sys.stderr)
# first check errors on entire batch...
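
The hunk above is only reformatting, but the consumer setup it touches follows a specific pattern: offsets are stored manually only after a successful Elasticsearch push, stored offsets are auto-committed in the background, and a rebalance callback surfaces partition errors. A standalone sketch of that pattern, with placeholder broker, group, and topic names, might look like:

import sys

from confluent_kafka import Consumer, KafkaException

conf = {
    "bootstrap.servers": "localhost:9092",  # placeholder broker
    "group.id": "example-es-updates",  # placeholder consumer group
    # offsets are only *stored* after successful processing, but stored
    # offsets are auto-committed to the broker in the background
    "enable.auto.commit": True,
    "enable.auto.offset.store": False,
    # assume user code hung and rebalance if no poll within this window
    "max.poll.interval.ms": 60000,
    "default.topic.config": {"auto.offset.reset": "latest"},
}


def on_rebalance(consumer, partitions):
    # raise on partition-level errors instead of silently continuing
    for p in partitions:
        if p.error:
            raise KafkaException(p.error)
    print("partitions rebalanced: {}".format(partitions), file=sys.stderr)


consumer = Consumer(conf)
consumer.subscribe(["example.topic"], on_assign=on_rebalance, on_revoke=on_rebalance)

batch = consumer.consume(num_messages=200, timeout=10.0)
for msg in batch:
    if msg.error():
        raise KafkaException(msg.error())
    # ... transform and index the message here, then mark its offset as
    # safe to commit
    consumer.store_offsets(message=msg)
consumer.close()
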
@@ -111,19 +126,24 @@ class ElasticsearchReleaseWorker(FatcatWorker):
# ... then process
bulk_actions = []
for msg in batch:
- json_str = msg.value().decode('utf-8')
+ json_str = msg.value().decode("utf-8")
entity = entity_from_json(json_str, self.entity_type, api_client=ac)
assert isinstance(entity, self.entity_type)
if self.entity_type == ChangelogEntry:
key = entity.index
# might need to fetch from API
- if not (entity.editgroup and entity.editgroup.editor): # pylint: disable=no-member # (TODO)
+ if not (
+ entity.editgroup and entity.editgroup.editor
+ ): # pylint: disable=no-member # (TODO)
entity = api.get_changelog_entry(entity.index)
else:
key = entity.ident # pylint: disable=no-member # (TODO)
- if self.entity_type != ChangelogEntry and entity.state == 'wip':
- print(f"WARNING: skipping state=wip entity: {self.entity_type.__name__} {entity.ident}", file=sys.stderr)
+ if self.entity_type != ChangelogEntry and entity.state == "wip":
+ print(
+ f"WARNING: skipping state=wip entity: {self.entity_type.__name__} {entity.ident}",
+ file=sys.stderr,
+ )
continue
if self.entity_type == ContainerEntity and self.query_stats:
@@ -138,9 +158,15 @@ class ElasticsearchReleaseWorker(FatcatWorker):
doc_dict = self.transform_func(entity)
# TODO: handle deletions from index
- bulk_actions.append(json.dumps({
- "index": { "_id": key, },
- }))
+ bulk_actions.append(
+ json.dumps(
+ {
+ "index": {
+ "_id": key,
+ },
+ }
+ )
+ )
bulk_actions.append(json.dumps(doc_dict))
# if only WIP entities, then skip
@@ -149,15 +175,22 @@ class ElasticsearchReleaseWorker(FatcatWorker):
consumer.store_offsets(message=msg)
continue
- print("Upserting, eg, {} (of {} {} in elasticsearch)".format(key, len(batch), self.entity_type.__name__), file=sys.stderr)
+ print(
+ "Upserting, eg, {} (of {} {} in elasticsearch)".format(
+ key, len(batch), self.entity_type.__name__
+ ),
+ file=sys.stderr,
+ )
elasticsearch_endpoint = "{}/{}/_bulk".format(
- self.elasticsearch_backend,
- self.elasticsearch_index)
- resp = requests.post(elasticsearch_endpoint,
+ self.elasticsearch_backend, self.elasticsearch_index
+ )
+ resp = requests.post(
+ elasticsearch_endpoint,
headers={"Content-Type": "application/x-ndjson"},
- data="\n".join(bulk_actions) + "\n")
+ data="\n".join(bulk_actions) + "\n",
+ )
resp.raise_for_status()
- if resp.json()['errors']:
+ if resp.json()["errors"]:
desc = "Elasticsearch errors from post to {}:".format(elasticsearch_endpoint)
print(desc, file=sys.stderr)
print(resp.content, file=sys.stderr)
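
For reference, the request being reformatted above is the standard Elasticsearch _bulk call: newline-delimited JSON with alternating action and document lines, a trailing newline, and a check of the top-level "errors" flag because HTTP 200 can still carry per-document failures. A self-contained sketch, assuming a local Elasticsearch and a toy document, is:

import json

import requests

docs = {"examplekey": {"title": "Example Release", "state": "active"}}  # toy documents

bulk_actions = []
for key, doc in docs.items():
    bulk_actions.append(json.dumps({"index": {"_id": key}}))
    bulk_actions.append(json.dumps(doc))

endpoint = "http://localhost:9200/fatcat_release/_bulk"  # placeholder backend and index
resp = requests.post(
    endpoint,
    headers={"Content-Type": "application/x-ndjson"},
    data="\n".join(bulk_actions) + "\n",  # the bulk API requires a trailing newline
)
resp.raise_for_status()
if resp.json()["errors"]:
    raise Exception("Elasticsearch reported per-document errors: {!r}".format(resp.content))
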
@@ -169,20 +202,29 @@ class ElasticsearchReleaseWorker(FatcatWorker):
class ElasticsearchContainerWorker(ElasticsearchReleaseWorker):
-
- def __init__(self, kafka_hosts, consume_topic, poll_interval=10.0, offset=None,
- query_stats=False, elasticsearch_release_index="fatcat_release",
- elasticsearch_backend="http://localhost:9200", elasticsearch_index="fatcat",
- batch_size=200):
- super().__init__(kafka_hosts=kafka_hosts,
- consume_topic=consume_topic,
- poll_interval=poll_interval,
- offset=offset,
- elasticsearch_backend=elasticsearch_backend,
- elasticsearch_index=elasticsearch_index,
- elasticsearch_release_index=elasticsearch_release_index,
- query_stats=query_stats,
- batch_size=batch_size)
+ def __init__(
+ self,
+ kafka_hosts,
+ consume_topic,
+ poll_interval=10.0,
+ offset=None,
+ query_stats=False,
+ elasticsearch_release_index="fatcat_release",
+ elasticsearch_backend="http://localhost:9200",
+ elasticsearch_index="fatcat",
+ batch_size=200,
+ ):
+ super().__init__(
+ kafka_hosts=kafka_hosts,
+ consume_topic=consume_topic,
+ poll_interval=poll_interval,
+ offset=offset,
+ elasticsearch_backend=elasticsearch_backend,
+ elasticsearch_index=elasticsearch_index,
+ elasticsearch_release_index=elasticsearch_release_index,
+ query_stats=query_stats,
+ batch_size=batch_size,
+ )
# previous group got corrupted (by pykafka library?)
self.consumer_group = "elasticsearch-updates3"
self.entity_type = ContainerEntity
@@ -196,11 +238,18 @@ class ElasticsearchChangelogWorker(ElasticsearchReleaseWorker):
Note: Very early versions of changelog entries did not contain details
about the editor or extra fields.
"""
- def __init__(self, kafka_hosts, consume_topic, poll_interval=10.0, offset=None,
- elasticsearch_backend="http://localhost:9200", elasticsearch_index="fatcat_changelog",
- batch_size=200):
- super().__init__(kafka_hosts=kafka_hosts,
- consume_topic=consume_topic)
+
+ def __init__(
+ self,
+ kafka_hosts,
+ consume_topic,
+ poll_interval=10.0,
+ offset=None,
+ elasticsearch_backend="http://localhost:9200",
+ elasticsearch_index="fatcat_changelog",
+ batch_size=200,
+ ):
+ super().__init__(kafka_hosts=kafka_hosts, consume_topic=consume_topic)
self.consumer_group = "elasticsearch-updates3"
self.batch_size = batch_size
self.poll_interval = poll_interval
diff --git a/python/fatcat_tools/workers/worker_common.py b/python/fatcat_tools/workers/worker_common.py
index 8c2936be..baec44f4 100644
--- a/python/fatcat_tools/workers/worker_common.py
+++ b/python/fatcat_tools/workers/worker_common.py
@@ -1,4 +1,3 @@
-
from confluent_kafka import Consumer, KafkaException, TopicPartition
@@ -13,22 +12,21 @@ def most_recent_message(topic, kafka_config):
print("Fetching most Kafka message from {}".format(topic))
conf = kafka_config.copy()
- conf.update({
- 'group.id': 'worker-init-last-msg', # should never commit
- 'delivery.report.only.error': True,
- 'enable.auto.commit': False,
- 'default.topic.config': {
- 'request.required.acks': -1,
- 'auto.offset.reset': 'latest',
- },
- })
+ conf.update(
+ {
+ "group.id": "worker-init-last-msg", # should never commit
+ "delivery.report.only.error": True,
+ "enable.auto.commit": False,
+ "default.topic.config": {
+ "request.required.acks": -1,
+ "auto.offset.reset": "latest",
+ },
+ }
+ )
consumer = Consumer(conf)
- hwm = consumer.get_watermark_offsets(
- TopicPartition(topic, 0),
- timeout=5.0,
- cached=False)
+ hwm = consumer.get_watermark_offsets(TopicPartition(topic, 0), timeout=5.0, cached=False)
if not hwm:
raise Exception("Kafka consumer timeout, or topic {} doesn't exist".format(topic))
print("High watermarks: {}".format(hwm))
@@ -37,7 +35,7 @@ def most_recent_message(topic, kafka_config):
print("topic is new; not 'most recent message'")
return None
- consumer.assign([TopicPartition(topic, 0, hwm[1]-1)])
+ consumer.assign([TopicPartition(topic, 0, hwm[1] - 1)])
msg = consumer.poll(2.0)
consumer.close()
if not msg:
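
The helper above finds the newest message on partition 0 by way of watermark offsets: look up the high watermark, assign the consumer one offset before it, and poll once. A trimmed standalone sketch of the same technique, with placeholder broker and topic, is:

from confluent_kafka import Consumer, TopicPartition

conf = {
    "bootstrap.servers": "localhost:9092",  # placeholder broker
    "group.id": "example-init-last-msg",  # should never commit
    "enable.auto.commit": False,
}
consumer = Consumer(conf)

hwm = consumer.get_watermark_offsets(
    TopicPartition("example.topic", 0), timeout=5.0, cached=False
)
if not hwm:
    raise Exception("Kafka consumer timeout, or topic does not exist")
low, high = hwm
if high <= 0:
    print("topic is new or empty; no most recent message")
else:
    # read exactly one message: the one just before the high watermark
    consumer.assign([TopicPartition("example.topic", 0, high - 1)])
    msg = consumer.poll(2.0)
    if msg and not msg.error():
        print("most recent message value: {!r}".format(msg.value()))
consumer.close()
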
@@ -56,8 +54,8 @@ class FatcatWorker:
if api:
self.api = api
self.kafka_config = {
- 'bootstrap.servers': kafka_hosts,
- 'message.max.bytes': 20000000, # ~20 MBytes; broker-side max is ~50 MBytes
+ "bootstrap.servers": kafka_hosts,
+ "message.max.bytes": 20000000, # ~20 MBytes; broker-side max is ~50 MBytes
}
self.produce_topic = produce_topic
self.consume_topic = consume_topic
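
Finally, the base kafka_config shown above (broker list plus a raised message.max.bytes) is what both the producer and consumer sides build on. A minimal sketch of handing such a config to a confluent_kafka Producer, with placeholder broker, topic, and payload, is:

import json

from confluent_kafka import Producer

kafka_config = {
    "bootstrap.servers": "localhost:9092",  # placeholder broker
    "message.max.bytes": 20000000,  # ~20 MBytes; must stay under the broker-side limit
}


def fail_fast(err, msg):
    # delivery callback: raise on the first failed delivery rather than dropping it
    if err is not None:
        raise Exception("Kafka message delivery failed: {}".format(err))


producer = Producer(kafka_config)
producer.produce(
    "example.topic",
    json.dumps({"hello": "fatcat"}).encode("utf-8"),
    key=b"example-key",
    on_delivery=fail_fast,
)
producer.flush()
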