Diffstat (limited to 'python/fatcat_tools')
-rw-r--r--  python/fatcat_tools/importers/__init__.py         |  19
-rw-r--r--  python/fatcat_tools/importers/common.py           | 390
-rw-r--r--  python/fatcat_tools/importers/crossref.py         | 263
-rw-r--r--  python/fatcat_tools/importers/grobid_metadata.py  | 123
-rw-r--r--  python/fatcat_tools/importers/issn.py             |  89
-rw-r--r--  python/fatcat_tools/importers/journal_metadata.py | 183
-rw-r--r--  python/fatcat_tools/importers/matched.py          | 150
-rw-r--r--  python/fatcat_tools/importers/orcid.py            |  51
-rw-r--r--  python/fatcat_tools/transforms.py                 | 130
-rw-r--r--  python/fatcat_tools/workers/changelog.py          |   2
10 files changed, 933 insertions, 467 deletions
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index e6f081e5..70f38f5b 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -1,7 +1,22 @@
-from .common import FatcatImporter, make_kafka_consumer
+"""
+To run an import you combine two classes; one each of:
+
+- RecordPusher: somehow iterates over a source of raw records (eg, from a
+  database, Kafka, files on disk, stdin) and pushes them into an entity
+  importer.
+- EntityImporter: class that a record pusher pushes raw (unparsed) records
+  into. The entity importer parses each record and decides what to do (ignore,
+  update, insert, etc). There is usually a primary entity type, though related
+  entities can be created along the way. Maintains API connection and
+  editgroup/batch state.
+
+"""
+
+from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, KafkaJsonPusher, make_kafka_consumer, clean
from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP
from .grobid_metadata import GrobidMetadataImporter
-from .issn import IssnImporter
+from .journal_metadata import JournalMetadataImporter
from .matched import MatchedImporter
from .orcid import OrcidImporter
+#from .kafka_source import KafkaSource
+#from .file_source import FileSource
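
How the two halves compose in practice, as a minimal sketch (the `api` client
and the dump file path are placeholders, not part of this change):

    from fatcat_tools.importers import JsonLinePusher, OrcidImporter

    # `api` is assumed to be an already-configured fatcat_client API instance;
    # the file path is illustrative
    with open('orcid_dump.json') as f:
        counts = JsonLinePusher(OrcidImporter(api), f).run()
    print(counts)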
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 06897bee..89203a4f 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -3,6 +3,7 @@ import re
import sys
import csv
import json
+import ftfy
import itertools
import subprocess
from collections import Counter
@@ -12,30 +13,66 @@ import fatcat_client
from fatcat_client.rest import ApiException
-# from: https://docs.python.org/3/library/itertools.html
-def grouper(iterable, n, fillvalue=None):
- "Collect data into fixed-length chunks or blocks"
- args = [iter(iterable)] * n
- return itertools.zip_longest(*args, fillvalue=fillvalue)
+def clean(thing, force_xml=False):
+ """
+ This function is appropriate to be called on any random, non-markup string,
+ such as author names, titles, etc.
-def make_kafka_consumer(hosts, env, topic_suffix, group):
- topic_name = "fatcat-{}.{}".format(env, topic_suffix).encode('utf-8')
- client = pykafka.KafkaClient(hosts=hosts, broker_version="1.0.0")
- consume_topic = client.topics[topic_name]
- print("Consuming from kafka topic {}, group {}".format(topic_name, group))
+    It will try to clean up common unicode mangles, HTML characters, etc.
- consumer = consume_topic.get_balanced_consumer(
- consumer_group=group.encode('utf-8'),
- managed=True,
- auto_commit_enable=True,
- auto_commit_interval_ms=30000, # 30 seconds
- compacted_topic=True,
- )
- return consumer
+ This will detect XML/HTML and "do the right thing" (aka, not remove
+    entities like '&amp;' if there are tags in the string), unless you pass the
+ 'force_xml' parameter, which might be appropriate for, eg, names and
+ titles, which generally should be projected down to plain text.
+
+ Also strips extra whitespace.
+ """
+ if not thing:
+ return thing
+ fix_entities = 'auto'
+ if force_xml:
+ fix_entities = True
+ fixed = ftfy.fix_text(thing, fix_entities=fix_entities).strip()
+ if not fixed:
+ # wasn't zero-length before, but is now; return None
+ return None
+ return fixed
+
+def test_clean():
-class FatcatImporter:
+ assert clean(None) == None
+ assert clean('') == ''
+ assert clean('123') == '123'
+ assert clean('a&b') == 'a&b'
+ assert clean('<b>a&amp;b</b>') == '<b>a&amp;b</b>'
+ assert clean('<b>a&amp;b</b>', force_xml=True) == '<b>a&b</b>'
+
+class EntityImporter:
"""
- Base class for fatcat importers
+ Base class for fatcat entity importers.
+
+    The API exposed to the record pusher is:
+
+ push_record(raw_record)
+ finish()
+
+    The methods that implementations are expected to fill in are:
+
+ want(raw_record) -> boolean
+    parse_record(raw_record) -> entity
+ try_update(entity) -> boolean
+ insert_batch([entity]) -> None
+
+ This class exposes helpers for implementations:
+
+ self.api
+ self.create_<entity>(entity) -> EntityEdit
+ for related entity types
+ self.push_entity(entity)
+ self.counts['exists'] += 1
+        (if didn't update or insert because the entity already exists)
+ self.counts['update'] += 1
+        (if an entity was updated)
"""
def __init__(self, api, **kwargs):
@@ -43,87 +80,135 @@ class FatcatImporter:
eg_extra = kwargs.get('editgroup_extra', dict())
eg_extra['git_rev'] = eg_extra.get('git_rev',
subprocess.check_output(["git", "describe", "--always"]).strip()).decode('utf-8')
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.FatcatImporter')
+ eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.EntityImporter')
self.api = api
- self._editgroup_description = kwargs.get('editgroup_description')
- self._editgroup_extra = kwargs.get('editgroup_extra')
- issn_map_file = kwargs.get('issn_map_file')
+ self.bezerk_mode = kwargs.get('bezerk_mode', False)
+ self.edit_batch_size = kwargs.get('edit_batch_size', 100)
+ self.editgroup_description = kwargs.get('editgroup_description')
+ self.editgroup_extra = kwargs.get('editgroup_extra')
+ self.reset()
self._issnl_id_map = dict()
self._orcid_id_map = dict()
- self._doi_id_map = dict()
- if issn_map_file:
- self.read_issn_map_file(issn_map_file)
self._orcid_regex = re.compile("^\\d{4}-\\d{4}-\\d{4}-\\d{3}[\\dX]$")
- self.counts = Counter({'insert': 0, 'update': 0, 'processed_lines': 0})
+ self._doi_id_map = dict()
- def _editgroup(self):
- eg = fatcat_client.Editgroup(
- description=self._editgroup_description,
- extra=self._editgroup_extra,
- )
- return self.api.create_editgroup(eg)
+ def reset(self):
+ self.counts = Counter({'skip': 0, 'insert': 0, 'update': 0, 'exists': 0})
+ self._edit_count = 0
+ self._editgroup_id = None
+ self._entity_queue = []
- def describe_run(self):
- print("Processed {} lines, inserted {}, updated {}.".format(
- self.counts['processed_lines'], self.counts['insert'], self.counts['update']))
+ def push_record(self, raw_record):
+ """
+ Returns nothing.
+ """
+ if (not raw_record) or (not self.want(raw_record)):
+ self.counts['skip'] += 1
+ return
+ entity = self.parse_record(raw_record)
+ if not entity:
+ self.counts['skip'] += 1
+ return
+ if self.bezerk_mode:
+ self.push_entity(entity)
+ return
+ if self.try_update(entity):
+ self.push_entity(entity)
+ return
- def create_row(self, row, editgroup_id=None):
- # sub-classes expected to implement this
- raise NotImplementedError
+ def finish(self):
+ if self._edit_count > 0:
+ self.api.accept_editgroup(self._editgroup_id)
+ self._editgroup_id = None
+ self._edit_count = 0
+
+ if self._entity_queue:
+ self.insert_batch(self._entity_queue)
+ self.counts['insert'] += len(self._entity_queue)
+ self._entity_queue = []
+
+ self.counts['total'] = 0
+ for key in ('skip', 'insert', 'update', 'exists'):
+ self.counts['total'] += self.counts[key]
+ return self.counts
+
+ def _get_editgroup(self, edits=1):
+ if self._edit_count >= self.edit_batch_size:
+ self.api.accept_editgroup(self._editgroup_id)
+ self._editgroup_id = None
+ self._edit_count = 0
- def create_batch(self, rows, editgroup_id=None):
- # sub-classes expected to implement this
+ if not self._editgroup_id:
+ eg = self.api.create_editgroup(
+ fatcat_client.Editgroup(
+ description=self.editgroup_description,
+ extra=self.editgroup_extra))
+ self._editgroup_id = eg.editgroup_id
+
+ self._edit_count += edits
+ return self._editgroup_id
+
+ def create_container(self, entity):
+ eg_id = self._get_editgroup()
+ self.counts['inserted.container'] += 1
+ return self.api.create_container(entity, editgroup_id=eg_id)
+
+ def create_release(self, entity):
+ eg_id = self._get_editgroup()
+ self.counts['inserted.release'] += 1
+ return self.api.create_release(entity, editgroup_id=eg_id)
+
+ def create_file(self, entity):
+ eg_id = self._get_editgroup()
+ self.counts['inserted.file'] += 1
+ return self.api.create_file(entity, editgroup_id=eg_id)
+
+ def updated(self):
+ """
+ Implementations should call this from try_update() if the update was successful
+ """
+ self.counts['update'] += 1
+
+ def push_entity(self, entity):
+ self._entity_queue.append(entity)
+ if len(self._entity_queue) >= self.edit_batch_size:
+ self.insert_batch(self._entity_queue)
+            self.counts['insert'] += len(self._entity_queue)
+            self._entity_queue = []
+
+ def want(self, raw_record):
+ """
+ Implementations can override for optional fast-path to drop a record.
+ Must have no side-effects; returns bool.
+ """
+ return True
+
+    def parse_record(self, raw_record):
+ """
+        Returns a parsed entity, or None if we should skip this one.
+
+ May have side-effects (eg, create related entities), but shouldn't
+ update/mutate the actual entity.
+ """
raise NotImplementedError
- def process_source(self, source, group_size=100):
- """Creates and auto-accepts editgroup every group_size rows"""
- eg = self._editgroup()
- i = 0
- for i, row in enumerate(source):
- self.create_row(row, editgroup_id=eg.editgroup_id)
- if i > 0 and (i % group_size) == 0:
- self.api.accept_editgroup(eg.editgroup_id)
- eg = self._editgroup()
- self.counts['processed_lines'] += 1
- if i == 0 or (i % group_size) != 0:
- self.api.accept_editgroup(eg.editgroup_id)
-
- def process_batch(self, source, size=50, decode_kafka=False):
- """Reads and processes in batches (not API-call-per-)"""
- for rows in grouper(source, size):
- if decode_kafka:
- rows = [msg.value.decode('utf-8') for msg in rows]
- self.counts['processed_lines'] += len(rows)
- #eg = self._editgroup()
- #self.create_batch(rows, editgroup_id=eg.editgroup_id)
- self.create_batch(rows)
-
- def process_csv_source(self, source, group_size=100, delimiter=','):
- reader = csv.DictReader(source, delimiter=delimiter)
- self.process_source(reader, group_size)
-
- def process_csv_batch(self, source, size=50, delimiter=','):
- reader = csv.DictReader(source, delimiter=delimiter)
- self.process_batch(reader, size)
+    def try_update(self, entity):
+ """
+ Passed the output of parse(). Should try to find an existing entity and
+ update it (PUT), decide we should do nothing (based on the existing
+ record), or create a new one.
- def is_issnl(self, issnl):
- return len(issnl) == 9 and issnl[4] == '-'
+    Implementations must update the exists/update/skip counts
+ appropriately in this method.
- def lookup_issnl(self, issnl):
- """Caches calls to the ISSN-L lookup API endpoint in a local dict"""
- if issnl in self._issnl_id_map:
- return self._issnl_id_map[issnl]
- container_id = None
- try:
- rv = self.api.lookup_container(issnl=issnl)
- container_id = rv.ident
- except ApiException as ae:
- # If anything other than a 404 (not found), something is wrong
- assert ae.status == 404
- self._issnl_id_map[issnl] = container_id # might be None
- return container_id
+ Returns boolean: True if the entity should still be inserted, False otherwise
+ """
+ raise NotImplementedError
+
+    def insert_batch(self, batch):
+ raise NotImplementedError
def is_orcid(self, orcid):
return self._orcid_regex.match(orcid) is not None
@@ -163,6 +248,23 @@ class FatcatImporter:
self._doi_id_map[doi] = release_id # might be None
return release_id
+ def is_issnl(self, issnl):
+ return len(issnl) == 9 and issnl[4] == '-'
+
+ def lookup_issnl(self, issnl):
+ """Caches calls to the ISSN-L lookup API endpoint in a local dict"""
+ if issnl in self._issnl_id_map:
+ return self._issnl_id_map[issnl]
+ container_id = None
+ try:
+ rv = self.api.lookup_container(issnl=issnl)
+ container_id = rv.ident
+ except ApiException as ae:
+ # If anything other than a 404 (not found), something is wrong
+ assert ae.status == 404
+ self._issnl_id_map[issnl] = container_id # might be None
+ return container_id
+
def read_issn_map_file(self, issn_map_file):
print("Loading ISSN map file...")
self._issn_issnl_map = dict()
@@ -179,3 +281,117 @@ class FatcatImporter:
if issn is None:
return None
return self._issn_issnl_map.get(issn)
+
+
+class RecordPusher:
+ """
+ Base class for different importer sources. Pretty trivial interface, just
+ wraps an importer and pushes records in to it.
+ """
+
+ def __init__(self, importer, **kwargs):
+ self.importer = importer
+
+ def run(self):
+ """
+ This will look something like:
+
+ for line in sys.stdin:
+ record = json.loads(line)
+ self.importer.push_record(record)
+ print(self.importer.finish())
+ """
+ raise NotImplementedError
+
+
+class JsonLinePusher(RecordPusher):
+
+ def __init__(self, importer, json_file, **kwargs):
+ self.importer = importer
+ self.json_file = json_file
+
+ def run(self):
+ for line in self.json_file:
+ if not line:
+ continue
+ record = json.loads(line)
+ self.importer.push_record(record)
+ counts = self.importer.finish()
+ print(counts)
+ return counts
+
+
+class CsvPusher(RecordPusher):
+
+ def __init__(self, importer, csv_file, **kwargs):
+ self.importer = importer
+ self.reader = csv.DictReader(csv_file, delimiter=kwargs.get('delimiter', ','))
+
+ def run(self):
+ for line in self.reader:
+ if not line:
+ continue
+ self.importer.push_record(line)
+ counts = self.importer.finish()
+ print(counts)
+ return counts
+
+
+class LinePusher(RecordPusher):
+
+ def __init__(self, importer, text_file, **kwargs):
+ self.importer = importer
+ self.text_file = text_file
+
+ def run(self):
+ for line in self.text_file:
+ if not line:
+ continue
+ self.importer.push_record(line)
+ counts = self.importer.finish()
+ print(counts)
+ return counts
+
+
+class KafkaJsonPusher(RecordPusher):
+
+ def __init__(self, importer, kafka_hosts, kafka_env, topic_suffix, group, **kwargs):
+ self.importer = importer
+ self.consumer = make_kafka_consumer(
+ kafka_hosts,
+ kafka_env,
+ topic_suffix,
+ group,
+ )
+
+ def run(self):
+ count = 0
+ for msg in self.consumer:
+ if not msg:
+ continue
+ record = json.loads(msg.value.decode('utf-8'))
+ self.importer.push_record(record)
+ count += 1
+ if count % 500 == 0:
+ print("Import counts: {}".format(self.importer.counts))
+ # TODO: should catch UNIX signals (HUP?) to shutdown cleanly, and/or
+ # commit the current batch if it has been lingering
+ counts = self.importer.finish()
+ print(counts)
+ return counts
+
+
+def make_kafka_consumer(hosts, env, topic_suffix, group):
+ topic_name = "fatcat-{}.{}".format(env, topic_suffix).encode('utf-8')
+ client = pykafka.KafkaClient(hosts=hosts, broker_version="1.0.0")
+ consume_topic = client.topics[topic_name]
+ print("Consuming from kafka topic {}, group {}".format(topic_name, group))
+
+ consumer = consume_topic.get_balanced_consumer(
+ consumer_group=group.encode('utf-8'),
+ managed=True,
+ auto_commit_enable=True,
+ auto_commit_interval_ms=30000, # 30 seconds
+ compacted_topic=True,
+ )
+ return consumer
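
Taken together, a concrete importer now only fills in the four hooks. A
schematic sketch (hypothetical record shape and entity fields, not a real
importer in this change):

    import json
    import fatcat_client
    from fatcat_tools.importers.common import EntityImporter, clean

    class SketchImporter(EntityImporter):

        def want(self, raw_record):
            # cheap, side-effect-free filter
            return bool(raw_record.get('title'))

        def parse_record(self, raw_record):
            # return None to skip; related entities could be created here
            # via self.create_container() and friends
            return fatcat_client.ReleaseEntity(
                title=clean(raw_record['title'], force_xml=True))

        def try_update(self, entity):
            # True: queue for insert_batch(); False: already handled
            # (bump self.counts['exists'] or call self.updated() here)
            return True

        def insert_batch(self, batch):
            self.api.create_release_batch(batch, autoaccept=True,
                description=self.editgroup_description,
                extra=json.dumps(self.editgroup_extra))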
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 6365e491..00c719f1 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -6,7 +6,7 @@ import datetime
import itertools
import subprocess
import fatcat_client
-from .common import FatcatImporter
+from .common import EntityImporter, clean
# The docs/guide should be the canonical home for these mappings; update there
@@ -32,7 +32,32 @@ CROSSREF_TYPE_MAP = {
'standard': 'standard',
}
-class CrossrefImporter(FatcatImporter):
+CONTAINER_TYPE_MAP = {
+ 'article-journal': 'journal',
+ 'paper-conference': 'conference',
+ 'book': 'book-series',
+}
+
+# TODO:
+LICENSE_SLUG_MAP = {
+ "http://creativecommons.org/licenses/by/3.0/": "CC-BY",
+ "http://creativecommons.org/licenses/by/4.0/": "CC-BY",
+ "http://creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA",
+ "http://creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
+ "http://creativecommons.org/licenses/by-nd/3.0/": "CC-BY-ND",
+ "http://creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND",
+ "http://creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC",
+ "http://creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
+ "http://creativecommons.org/licenses/by-nc-sa/3.0/": "CC-BY-NC-SA",
+ "http://creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA",
+ "http://creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND",
+ "http://creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-NC-ND",
+ "http://www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0",
+ # http://onlinelibrary.wiley.com/termsAndConditions doesn't seem like a license
+ # http://www.springer.com/tdm doesn't seem like a license
+}
+
+class CrossrefImporter(EntityImporter):
"""
Importer for Crossref metadata.
@@ -51,9 +76,9 @@ class CrossrefImporter(FatcatImporter):
issn_map_file=issn_map_file,
editgroup_description=eg_desc,
editgroup_extra=eg_extra)
+
+ self.create_containers = kwargs.get('create_containers')
extid_map_file = kwargs.get('extid_map_file')
- create_containers = kwargs.get('create_containers')
- check_existing = kwargs.get('check_existing')
self.extid_map_db = None
if extid_map_file:
db_uri = "file:{}?mode=ro".format(extid_map_file)
@@ -61,36 +86,46 @@ class CrossrefImporter(FatcatImporter):
self.extid_map_db = sqlite3.connect(db_uri, uri=True)
else:
print("Not using external ID map")
- self.create_containers = create_containers
- self.check_existing = check_existing
+
+ self.read_issn_map_file(issn_map_file)
def lookup_ext_ids(self, doi):
if self.extid_map_db is None:
- return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None)
+ return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1",
[doi.lower()]).fetchone()
if row is None:
- return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None)
+ return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
row = [str(cell or '') or None for cell in row]
return dict(
core_id=row[0],
pmid=row[1],
pmcid=row[2],
- wikidata_qid=row[3])
+ wikidata_qid=row[3],
+ # TODO:
+ arxiv_id=None,
+ jstor_id=None,
+ )
def map_release_type(self, crossref_type):
return CROSSREF_TYPE_MAP.get(crossref_type)
- def parse_crossref_dict(self, obj):
+ def map_container_type(self, crossref_type):
+ return CONTAINER_TYPE_MAP.get(crossref_type)
+
+ def want(self, obj):
+ if not obj.get('title'):
+ return False
+
+ # do most of these checks in-line below
+ return True
+
+ def parse_record(self, obj):
"""
obj is a python dict (parsed from json).
returns a ReleaseEntity
"""
- # Do require the 'title' keys to exsit, as release entities do
- if (not 'title' in obj) or (not obj['title']):
- return None
-
# Ways to be out of scope (provisionally)
# journal-issue and journal-volume map to None, but allowed for now
if obj.get('type') in (None, 'journal', 'proceedings',
@@ -98,20 +133,12 @@ class CrossrefImporter(FatcatImporter):
'book-track', 'proceedings-series'):
return None
- # lookup existing DOI
- existing_release = None
- if self.check_existing:
- try:
- existing_release = self.api.lookup_release(doi=obj['DOI'].lower())
- except fatcat_client.rest.ApiException as err:
- if err.status != 404:
- raise err
-
- # eventually we'll want to support "updates", but for now just skip if
- # entity already exists
- if existing_release:
+        # Do require the 'title' key to exist, as release entities do
+ if (not 'title' in obj) or (not obj['title']):
return None
+ release_type = self.map_release_type(obj['type'])
+
# contribs
def do_contribs(obj_list, ctype):
contribs = []
@@ -132,18 +159,23 @@ class CrossrefImporter(FatcatImporter):
index = i
else:
index = None
+ raw_affiliation = None
if am.get('affiliation'):
- # note: affiliation => affiliations
- extra['affiliations'] = am.get('affiliation')
+ if len(am.get('affiliation')) > 0:
+ raw_affiliation = am.get('affiliation')[0]['name']
+ if len(am.get('affiliation')) > 1:
+ # note: affiliation => more_affiliations
+ extra['more_affiliations'] = [clean(a['name']) for a in am.get('affiliation')[1:]]
if am.get('sequence') and am.get('sequence') != "additional":
- extra['sequence'] = am.get('sequence')
+ extra['seq'] = clean(am.get('sequence'))
if not extra:
extra = None
assert ctype in ("author", "editor", "translator")
contribs.append(fatcat_client.ReleaseContrib(
creator_id=creator_id,
index=index,
- raw_name=raw_name,
+ raw_name=clean(raw_name),
+ raw_affiliation=clean(raw_affiliation),
role=ctype,
extra=extra))
return contribs
@@ -159,28 +191,40 @@ class CrossrefImporter(FatcatImporter):
container_id = self.lookup_issnl(issnl)
publisher = obj.get('publisher')
- ce = None
if (container_id is None and self.create_containers and (issnl is not None)
and obj.get('container-title') and len(obj['container-title']) > 0):
ce = fatcat_client.ContainerEntity(
issnl=issnl,
- publisher=publisher,
- name=obj['container-title'][0])
+ publisher=clean(publisher),
+ container_type=self.map_container_type(release_type),
+ name=clean(obj['container-title'][0], force_xml=True))
+ ce_edit = self.create_container(ce)
+ container_id = ce_edit.ident
+
+ # license slug
+ license_slug = None
+ license_extra = []
+ for l in obj.get('license', []):
+ if l['content-version'] not in ('vor', 'unspecified'):
+ continue
+ slug = LICENSE_SLUG_MAP.get(l['URL'])
+ if slug:
+ license_slug = slug
+ if 'start' in l:
+ l['start'] = l['start']['date-time']
+ license_extra.append(l)
# references
refs = []
for i, rm in enumerate(obj.get('reference', [])):
try:
year = int(rm.get('year'))
- # NOTE: will need to update/config in the future!
+ # TODO: will need to update/config in the future!
# NOTE: are there crossref works with year < 100?
if year > 2025 or year < 100:
year = None
except:
year = None
- extra = rm.copy()
- if rm.get('DOI'):
- extra['doi'] = rm.get('DOI').lower()
key = rm.get('key')
if key and key.startswith(obj['DOI'].upper()):
key = key.replace(obj['DOI'].upper() + "-", '')
@@ -188,14 +232,18 @@ class CrossrefImporter(FatcatImporter):
container_name = rm.get('volume-title')
if not container_name:
container_name = rm.get('journal-title')
- extra.pop('DOI', None)
- extra.pop('key', None)
- extra.pop('year', None)
- extra.pop('volume-name', None)
- extra.pop('journal-title', None)
- extra.pop('title', None)
- extra.pop('first-page', None)
- extra.pop('doi-asserted-by', None)
+            extra = dict()
+            if rm.get('volume-title') and rm.get('journal-title'):
+                # journal-title is only extra metadata when volume-title won
+                # the container_name slot above
+                extra['journal-title'] = rm['journal-title']
+ if rm.get('DOI'):
+ extra['doi'] = rm.get('DOI').lower()
+ # TODO: what fields here? CSL citation stuff
+ for k in ('author', 'editor', 'edition', 'authority', 'version',
+ 'genre', 'url', 'event', 'issue', 'volume', 'date',
+ 'accessed_date', 'issued', 'page', 'medium',
+ 'collection_title', 'chapter_number'):
+ if clean(rm.get(k)):
+ extra[k] = clean(rm[k])
if extra:
extra = dict(crossref=extra)
else:
@@ -206,9 +254,9 @@ class CrossrefImporter(FatcatImporter):
target_release_id=None,
key=key,
year=year,
- container_name=container_name,
- title=rm.get('title'),
- locator=rm.get('first-page'),
+ container_name=clean(container_name),
+ title=clean(rm.get('title')),
+ locator=clean(rm.get('first-page')),
# TODO: just dump JSON somewhere here?
extra=extra))
@@ -217,25 +265,24 @@ class CrossrefImporter(FatcatImporter):
if obj.get('abstract') != None:
abstracts.append(fatcat_client.ReleaseEntityAbstracts(
mimetype="application/xml+jats",
- content=obj.get('abstract')))
+ content=clean(obj.get('abstract'))))
# extra fields
extra = dict()
- for key in ('subject', 'type', 'license', 'alternative-id',
- 'container-title', 'original-title', 'subtitle', 'archive',
- 'funder', 'group-title'):
- # TODO: unpack "container-title" array
+ for key in ('subject', 'type', 'alternative-id', 'container-title',
+ 'subtitle', 'archive', 'funder', 'group-title'):
+ # TODO: unpack "container-title" array?
val = obj.get(key)
if val:
- extra[key] = val
- if 'license' in extra and extra['license']:
- for i in range(len(extra['license'])):
- if 'start' in extra['license'][i]:
- extra['license'][i]['start'] = extra['license'][i]['start']['date-time']
+ if type(val) == str:
+ extra[key] = clean(val)
+ else:
+ extra[key] = val
+ if license_extra:
+ extra['license'] = license_extra
+
if len(obj['title']) > 1:
- extra['other-titles'] = obj['title'][1:]
- # TODO: this should be top-level
- extra['is_kept'] = len(obj.get('archive', [])) > 0
+ extra['other-titles'] = [clean(t) for t in obj['title'][1:]]
# ISBN
isbn13 = None
@@ -277,59 +324,57 @@ class CrossrefImporter(FatcatImporter):
re = fatcat_client.ReleaseEntity(
work_id=None,
- title=obj.get('title', [None])[0],
- contribs=contribs,
- refs=refs,
container_id=container_id,
- publisher=publisher,
- release_type=self.map_release_type(obj['type']),
+ title=clean(obj.get('title', [None])[0], force_xml=True),
+ original_title=clean(obj.get('original-title', [None])[0]),
+ release_type=release_type,
release_status=release_status,
+ release_date=release_date,
+ release_year=release_year,
+ publisher=clean(publisher),
doi=obj['DOI'].lower(),
- isbn13=isbn13,
- core_id=extids['core_id'],
pmid=extids['pmid'],
pmcid=extids['pmcid'],
wikidata_qid=extids['wikidata_qid'],
- release_date=release_date,
- release_year=release_year,
- issue=obj.get('issue'),
- volume=obj.get('volume'),
- pages=obj.get('page'),
+ isbn13=isbn13,
+ core_id=extids['core_id'],
+ arxiv_id=extids['arxiv_id'],
+ jstor_id=extids['jstor_id'],
+ volume=clean(obj.get('volume')),
+ issue=clean(obj.get('issue')),
+ pages=clean(obj.get('page')),
+ language=None, # crossref doesn't supply language info
+ license_slug=license_slug,
+ extra=dict(crossref=extra),
abstracts=abstracts,
- extra=dict(crossref=extra))
- return (re, ce)
+ contribs=contribs,
+ refs=refs,
+ )
+ return re
+
+ def try_update(self, re):
+
+ # lookup existing DOI (don't need to try other ext idents for crossref)
+ existing = None
+ try:
+ existing = self.api.lookup_release(doi=re.doi)
+ except fatcat_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+ # doesn't exist, need to update
+ return True
+
+ # eventually we'll want to support "updates", but for now just skip if
+ # entity already exists
+ if existing:
+ self.counts['exists'] += 1
+ return False
+
+ return True
+
+ def insert_batch(self, batch):
+ self.api.create_release_batch(batch,
+ autoaccept=True,
+ description=self.editgroup_description,
+ extra=json.dumps(self.editgroup_extra))
- def create_row(self, row, editgroup_id=None):
- if row is None:
- return
- obj = json.loads(row)
- entities = self.parse_crossref_dict(obj)
- if entities is not None:
- (re, ce) = entities
- if ce is not None:
- container = self.api.create_container(ce, editgroup_id=editgroup_id)
- re.container_id = container.ident
- self._issnl_id_map[ce.issnl] = container.ident
- self.api.create_release(re, editgroup_id=editgroup_id)
- self.counts['insert'] += 1
-
- def create_batch(self, batch):
- """Current work/release pairing disallows batch creation of releases.
- Could do batch work creation and then match against releases, but meh."""
- release_batch = []
- for row in batch:
- if row is None:
- continue
- obj = json.loads(row)
- entities = self.parse_crossref_dict(obj)
- if entities is not None:
- (re, ce) = entities
- if ce is not None:
- ce_eg = self.api.create_editgroup(fatcat_client.Editgroup())
- container = self.api.create_container(ce, editgroup_id=ce_eg.editgroup_id)
- self.api.accept_editgroup(ce_eg.editgroup_id)
- re.container_id = container.ident
- self._issnl_id_map[ce.issnl] = container.ident
- release_batch.append(re)
- self.api.create_release_batch(release_batch, autoaccept="true")
- self.counts['insert'] += len(release_batch)
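
The slug lookup above only considers 'vor' (version of record) and
'unspecified' license stanzas; an illustrative fragment of what the loop
keeps (values are fake):

    licenses = [
        {'URL': 'http://creativecommons.org/licenses/by/4.0/',
         'content-version': 'vor',
         'start': {'date-time': '2018-01-01T00:00:00Z'}},
        {'URL': 'http://www.springer.com/tdm', 'content-version': 'tdm'},
    ]
    # after the loop: license_slug == 'CC-BY'; only the first stanza lands in
    # license_extra, with 'start' flattened to its 'date-time' string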
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index 5e61a154..9d95fe0b 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -5,12 +5,22 @@ import json
import base64
import datetime
import fatcat_client
-from .common import FatcatImporter
+from .common import EntityImporter, clean
MAX_ABSTRACT_BYTES=4096
-class GrobidMetadataImporter(FatcatImporter):
+class GrobidMetadataImporter(EntityImporter):
+ """
+ This is a complex case: we need to parse and create both file and release entities.
+
+ The "primary" entity here is really File, not Release. If a matching File
+ exists, we bail in want(); if not we insert the Release during parsing, and
+ insert both.
+
+ TODO: should instead check if the File has any releases; if not, insert and update.
+ TODO: relaxing 'None' constraint on parse_record() might make this refactor-able.
+ """
def __init__(self, api, **kwargs):
@@ -22,6 +32,45 @@ class GrobidMetadataImporter(FatcatImporter):
editgroup_description=eg_desc,
editgroup_extra=eg_extra)
self.default_link_rel = kwargs.get("default_link_rel", "web")
+ self.longtail_oa = kwargs.get("longtail_oa", False)
+
+ def want(self, raw_record):
+ return True
+
+ def parse_record(self, row):
+
+ fields = row.split('\t')
+ sha1_key = fields[0]
+ cdx = json.loads(fields[1])
+ mimetype = fields[2]
+ file_size = int(fields[3])
+ grobid_meta = json.loads(fields[4])
+ fe = self.parse_file_metadata(sha1_key, cdx, mimetype, file_size)
+ re = self.parse_grobid_json(grobid_meta)
+
+ if not (fe and re):
+ return None
+
+ # lookup existing file SHA1
+ existing = None
+ try:
+ existing = self.api.lookup_file(sha1=fe.sha1)
+ except fatcat_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+
+ # if file is already in here, presumably not actually long-tail
+ # HACK: this is doing an exists check in parse_record(), which is weird
+ # TODO: this is where we should check if the file actually has
+ # release_ids and/or URLs associated with it
+ if existing and not self.bezerk_mode:
+ self.counts['exists'] += 1
+ self.counts['skip'] -= 1
+ return None
+
+ release_edit = self.create_release(re)
+ fe.release_ids.append(release_edit.ident)
+ return fe
def parse_grobid_json(self, obj):
@@ -34,7 +83,7 @@ class GrobidMetadataImporter(FatcatImporter):
abobj = dict(
mimetype="text/plain",
language=None,
- content=obj.get('abstract').strip())
+ content=clean(obj.get('abstract')))
abstracts = [abobj]
else:
abstracts = None
@@ -43,17 +92,18 @@ class GrobidMetadataImporter(FatcatImporter):
for i, a in enumerate(obj.get('authors', [])):
contribs.append(fatcat_client.ReleaseContrib(
index=i,
- raw_name=a['name'],
+ raw_name=clean(a['name']),
role="author",
extra=None))
+ # XXX: why is this a dict()? not covered by tests?
refs = []
for raw in obj.get('citations', []):
cite_extra = dict()
ref = dict()
- ref['key'] = raw.get('id')
+ ref['key'] = clean(raw.get('id'))
if raw.get('title'):
- ref['title'] = raw['title'].strip()
+ ref['title'] = clean(raw['title'])
if raw.get('date'):
try:
year = int(raw['date'].strip()[:4])
@@ -62,9 +112,9 @@ class GrobidMetadataImporter(FatcatImporter):
pass
for key in ('volume', 'url', 'issue', 'publisher'):
if raw.get(key):
- cite_extra[key] = raw[key].strip()
+ cite_extra[key] = clean(raw[key])
if raw.get('authors'):
- cite_extra['authors'] = [a['name'] for a in raw['authors']]
+ cite_extra['authors'] = [clean(a['name']) for a in raw['authors']]
if cite_extra:
cite_extra = dict(grobid=cite_extra)
else:
@@ -81,27 +131,28 @@ class GrobidMetadataImporter(FatcatImporter):
if obj.get('doi'):
extra['doi'] = obj['doi']
if obj['journal'] and obj['journal'].get('name'):
- extra['container_name'] = obj['journal']['name']
-
- extra['is_longtail_oa'] = True
+ extra['container_name'] = clean(obj['journal']['name'])
# TODO: ISSN/eISSN handling? or just journal name lookup?
+ if self.longtail_oa:
+ extra['longtail_oa'] = True
+
if extra:
extra = dict(grobid=extra)
else:
extra = None
re = fatcat_client.ReleaseEntity(
- title=obj['title'].strip(),
+ title=clean(obj['title'], force_xml=True),
release_type="article-journal",
release_date=release_date,
release_year=release_year,
contribs=contribs,
refs=refs,
- publisher=obj['journal'].get('publisher'),
- volume=obj['journal'].get('volume'),
- issue=obj['journal'].get('issue'),
+ publisher=clean(obj['journal'].get('publisher')),
+ volume=clean(obj['journal'].get('volume')),
+ issue=clean(obj['journal'].get('issue')),
abstracts=abstracts,
extra=extra)
return re
@@ -122,17 +173,6 @@ class GrobidMetadataImporter(FatcatImporter):
sha1 = base64.b16encode(base64.b32decode(sha1_key.replace('sha1:', ''))).decode('ascii').lower()
- # lookup existing SHA1, or create new entity
- try:
- existing_file = self.api.lookup_file(sha1=sha1)
- except fatcat_client.rest.ApiException as err:
- if err.status != 404:
- raise err
- existing_file = None
-
- if existing_file:
- # if file is already in here, presumably not actually long-tail
- return None
fe = fatcat_client.FileEntity(
sha1=sha1,
size=int(file_size),
@@ -143,6 +183,7 @@ class GrobidMetadataImporter(FatcatImporter):
# parse URLs and CDX
original = cdx['url']
+ assert len(cdx['dt']) >= 8
wayback = "https://web.archive.org/web/{}/{}".format(
cdx['dt'],
original)
@@ -154,23 +195,13 @@ class GrobidMetadataImporter(FatcatImporter):
return fe
- def create_row(self, row, editgroup_id=None):
- if not row:
- return
- fields = row.split('\t')
- sha1_key = fields[0]
- cdx = json.loads(fields[1])
- mimetype = fields[2]
- file_size = int(fields[3])
- grobid_meta = json.loads(fields[4])
- fe = self.parse_file_metadata(sha1_key, cdx, mimetype, file_size)
- re = self.parse_grobid_json(grobid_meta)
- if fe and re:
- release_entity = self.api.create_release(re, editgroup_id=editgroup_id)
- # release ident can't already be in release list because we just
- # created it
- fe.release_ids.append(release_entity.ident)
- file_entity = self.api.create_file(fe, editgroup_id=editgroup_id)
- self.counts['insert'] += 1
-
- # NB: batch mode not implemented
+ def try_update(self, entity):
+ # did the exists check in 'parse_record()', because we needed to create a release
+ return True
+
+ def insert_batch(self, batch):
+ self.api.create_file_batch(batch,
+ autoaccept=True,
+ description=self.editgroup_description,
+ extra=json.dumps(self.editgroup_extra))
+
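parse_record() expects five tab-separated fields per line; an illustrative
row with fake values:

    import json

    # sha1 key, CDX JSON, mimetype, size, GROBID JSON -- tab-separated
    row = '\t'.join([
        'sha1:CXUTXQ73XEGQBC7MGDFLSKVNJIPGSLN5',
        json.dumps({'url': 'http://example.com/paper.pdf',
                    'dt': '20170701010101'}),
        'application/pdf',
        '123456',
        json.dumps({'title': 'An Example', 'journal': {},
                    'authors': [], 'citations': []}),
    ])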
diff --git a/python/fatcat_tools/importers/issn.py b/python/fatcat_tools/importers/issn.py
deleted file mode 100644
index f4d525a4..00000000
--- a/python/fatcat_tools/importers/issn.py
+++ /dev/null
@@ -1,89 +0,0 @@
-
-import sys
-import json
-import itertools
-import fatcat_client
-from .common import FatcatImporter
-
-
-def or_none(s):
- if s is None:
- return None
- if len(s) == 0:
- return None
- return s
-
-def truthy(s):
- if s is None:
- return None
- s = s.lower()
-
- if s in ('true', 't', 'yes', 'y', '1'):
- return True
- elif s in ('false', 'f', 'no', 'n', '0'):
- return False
- else:
- return None
-
-class IssnImporter(FatcatImporter):
- """
- Imports journal metadata ("containers") by ISSN, currently from a custom
- (data munged) .csv file format
-
- CSV format (generated from git.archive.org/webgroup/oa-journal-analysis):
-
- ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count
- """
-
- def __init__(self, api, **kwargs):
-
- eg_desc = kwargs.get('editgroup_description',
- "Automated import of container-level metadata, by ISSN. Metadata from Internet Archive munging.")
- eg_extra = kwargs.get('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IssnImporter')
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra)
-
- def parse_issn_row(self, row):
- """
- row is a python dict (parsed from CSV).
- returns a ContainerEntity (or None if invalid or couldn't parse)
- """
- title = or_none(row['title'])
- issnl = or_none(row['ISSN-L'])
- if title is None or issnl is None:
- return None
- extra = dict(
- in_doaj=truthy(row['in_doaj']),
- in_road=truthy(row['in_road']),
- in_norwegian=truthy(row['in_norwegian']),
- language=or_none(row['lang']),
- url=or_none(row['url']),
- ISSNp=or_none(row['ISSN-print']),
- ISSNe=or_none(row['ISSN-electronic']),
- is_oa=truthy(row['is_oa']),
- is_kept=truthy(row['is_kept']),
- )
- ce = fatcat_client.ContainerEntity(
- issnl=issnl,
- name=title,
- publisher=or_none(row['publisher']),
- abbrev=None,
- coden=None,
- extra=extra)
- return ce
-
- def create_row(self, row, editgroup_id=None):
- ce = self.parse_issn_row(row)
- if ce is not None:
- self.api.create_container(ce, editgroup_id=editgroup_id)
- self.counts['insert'] += 1
-
- def create_batch(self, batch):
- """Reads and processes in batches (not API-call-per-line)"""
- objects = [self.parse_issn_row(l)
- for l in batch if (l is not None)]
- objects = [o for o in objects if (o is not None)]
- self.api.create_container_batch(objects, autoaccept="true")
- self.counts['insert'] += len(objects)
diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py
new file mode 100644
index 00000000..cf3971b5
--- /dev/null
+++ b/python/fatcat_tools/importers/journal_metadata.py
@@ -0,0 +1,183 @@
+
+import sys
+import json
+import itertools
+import fatcat_client
+from .common import EntityImporter, clean
+
+
+def or_none(s):
+ if s is None:
+ return None
+ if len(s) == 0:
+ return None
+ return s
+
+def truthy(s):
+ if s is None:
+ return None
+ s = s.lower()
+
+ if s in ('true', 't', 'yes', 'y', '1'):
+ return True
+ elif s in ('false', 'f', 'no', 'n', '0'):
+ return False
+ else:
+ return None
+
+class JournalMetadataImporter(EntityImporter):
+ """
+ Imports journal metadata ("containers") by ISSN, currently from a custom
+ (data munged) .csv file format
+
+ CSV format (generated from git.archive.org/webgroup/oa-journal-analysis):
+
+ ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count
+
+
+ 'extra' fields:
+
+ doaj
+ as_of: datetime of most recent check; if not set, not actually in DOAJ
+ seal: bool
+ work_level: bool (are work-level publications deposited with DOAJ?)
+ archiving: array, can include 'library' or 'other'
+ road
+ as_of: datetime of most recent check; if not set, not actually in ROAD
+ pubmed (TODO: delete?)
+ as_of: datetime of most recent check; if not set, not actually indexed in pubmed
+ norwegian (TODO: drop this?)
+        as_of: datetime of most recent check; if not set, not actually indexed in the Norwegian registry
+ id (integer)
+ level (integer; 0-2)
+ kbart
+ lockss
+ year_rle
+ volume_rle
+ portico
+ ...
+ clockss
+ ...
+ sherpa_romeo
+ color
+ jstor
+ year_rle
+ volume_rle
+ scopus
+ id
+ TODO: print/electronic distinction?
+ wos
+ id
+ doi
+ crossref_doi: DOI of the title in crossref (if exists)
+ prefixes: array of strings (DOI prefixes, up to the '/'; any registrar, not just Crossref)
+ ia
+ sim
+ nap_id
+ year_rle
+ volume_rle
+ longtail: boolean
+ homepage
+ as_of: datetime of last attempt
+ url
+ status: HTTP/heritrix status of homepage crawl
+
+ issnp: string
+ issne: string
+ coden: string
+ abbrev: string
+ oclc_id: string (TODO: lookup?)
+ lccn_id: string (TODO: lookup?)
+    dblp_id: string
+ default_license: slug
+ original_name: native name (if name is translated)
+ platform: hosting platform: OJS, wordpress, scielo, etc
+ mimetypes: array of strings (eg, 'application/pdf', 'text/html')
+ first_year: year (integer)
+ last_year: if publishing has stopped
+ primary_language: single ISO code, or 'mixed'
+ languages: array of ISO codes
+ region: TODO: continent/world-region
+ nation: shortcode of nation
+ discipline: TODO: highest-level subject; "life science", "humanities", etc
+ field: TODO: narrower description of field
+ subjects: TODO?
+ url: homepage
+ is_oa: boolean. If true, can assume all releases under this container are "Open Access"
+ TODO: domains, if exclusive?
+ TODO: fulltext_regex, if a known pattern?
+
+ For KBART, etc:
+ We "over-count" on the assumption that "in-progress" status works will soon actually be preserved.
+ year and volume spans are run-length-encoded arrays, using integers:
+ - if an integer, means that year is preserved
+ - if an array of length 2, means everything between the two numbers (inclusive) is preserved
+ """
+
+ def __init__(self, api, **kwargs):
+
+ eg_desc = kwargs.get('editgroup_description',
+ "Automated import of container-level metadata, by ISSN. Metadata from Internet Archive munging.")
+ eg_extra = kwargs.get('editgroup_extra', dict())
+ eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.JournalMetadataImporter')
+ super().__init__(api,
+ editgroup_description=eg_desc,
+ editgroup_extra=eg_extra)
+
+ def want(self, raw_record):
+ if raw_record.get('ISSN-L'):
+ return True
+ return False
+
+ def parse_record(self, row):
+ """
+ row is a python dict (parsed from CSV).
+ returns a ContainerEntity (or None if invalid or couldn't parse)
+ """
+ title = or_none(row['title'])
+ issnl = or_none(row['ISSN-L'])
+ if title is None or issnl is None:
+ return None
+ extra = dict(
+ in_doaj=truthy(row['in_doaj']),
+ in_road=truthy(row['in_road']),
+ in_norwegian=truthy(row['in_norwegian']),
+ language=or_none(row['lang']),
+ url=or_none(row['url']),
+ ISSNp=or_none(row['ISSN-print']),
+ ISSNe=or_none(row['ISSN-electronic']),
+ is_oa=truthy(row['is_oa']),
+ is_kept=truthy(row['is_kept']),
+ )
+ ce = fatcat_client.ContainerEntity(
+ issnl=issnl,
+ name=clean(title),
+ publisher=or_none(clean(row['publisher'])),
+ extra=extra)
+ return ce
+
+ def try_update(self, ce):
+
+ existing = None
+ try:
+ existing = self.api.lookup_container(issnl=ce.issnl)
+ except fatcat_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+ # doesn't exist, need to update
+ return True
+
+ # eventually we'll want to support "updates", but for now just skip if
+ # entity already exists
+ if existing:
+ self.counts['exists'] += 1
+ return False
+
+ return True
+
+ def insert_batch(self, batch):
+ self.api.create_container_batch(batch,
+ autoaccept=True,
+ description=self.editgroup_description,
+ extra=json.dumps(self.editgroup_extra))
+
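The CSV coercion helpers are intentionally forgiving; their behavior in brief:

    assert truthy('Yes') is True
    assert truthy('0') is False
    assert truthy('maybe') is None
    assert or_none('') is None
    assert or_none('Acta Zoologica') == 'Acta Zoologica'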
diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py
index 1e5c22f7..2ec6c95d 100644
--- a/python/fatcat_tools/importers/matched.py
+++ b/python/fatcat_tools/importers/matched.py
@@ -4,16 +4,10 @@ import json
import sqlite3
import itertools
import fatcat_client
-from .common import FatcatImporter
+from .common import EntityImporter, clean
-#row = row.split('\t')
-#assert len(row) == 2
-#sha1 = row[0].replace('sha1:')
-#sha1 = base64.b16encode(base64.b32decode(sha1)).lower()
-#print(sha1)
-#dois = [d.lower() for d in json.loads(row[1])]
-class MatchedImporter(FatcatImporter):
+class MatchedImporter(EntityImporter):
"""
Importer for "file to crossref DOI" matches.
@@ -48,7 +42,6 @@ class MatchedImporter(FatcatImporter):
editgroup_extra=eg_extra)
self.default_link_rel = kwargs.get("default_link_rel", "web")
self.default_mime = kwargs.get("default_mime", None)
- self.skip_file_updates = kwargs.get("skip_file_updates", False)
def make_url(self, raw):
rel = self.default_link_rel
@@ -59,26 +52,13 @@ class MatchedImporter(FatcatImporter):
rel = "repository"
elif "//web.archive.org/" in raw or "//archive.is/" in raw:
rel = "webarchive"
- return fatcat_client.FileEntityUrls(url=raw, rel=rel)
+ return (rel, raw)
- def parse_matched_dict(self, obj):
- sha1 = obj['sha1']
- dois = [d.lower() for d in obj.get('dois', [])]
+ def want(self, raw_record):
+ return True
- # lookup sha1, or create new entity
- fe = None
- if not self.skip_file_updates:
- try:
- fe = self.api.lookup_file(sha1=sha1)
- except fatcat_client.rest.ApiException as err:
- if err.status != 404:
- raise err
- if fe is None:
- fe = fatcat_client.FileEntity(
- sha1=sha1,
- release_ids=[],
- urls=[],
- )
+ def parse_record(self, obj):
+ dois = [d.lower() for d in obj.get('dois', [])]
# lookup dois
re_list = set()
@@ -93,67 +73,77 @@ class MatchedImporter(FatcatImporter):
print("DOI not found: {}".format(doi))
else:
re_list.add(re.ident)
- if len(re_list) == 0:
+ release_ids = list(re_list)
+ if len(release_ids) == 0:
return None
- if fe.release_ids == set(re_list):
- return None
- re_list.update(fe.release_ids)
- fe.release_ids = list(re_list)
# parse URLs and CDX
- existing_urls = [feu.url for feu in fe.urls]
+ urls = set()
for url in obj.get('url', []):
- if url not in existing_urls:
- url = self.make_url(url)
- if url != None:
- fe.urls.append(url)
+ url = self.make_url(url)
+ if url != None:
+ urls.add(url)
for cdx in obj.get('cdx', []):
original = cdx['url']
wayback = "https://web.archive.org/web/{}/{}".format(
cdx['dt'],
original)
- if wayback not in existing_urls:
- fe.urls.append(
- fatcat_client.FileEntityUrls(url=wayback, rel="webarchive"))
- if original not in existing_urls:
- url = self.make_url(original)
- if url != None:
- fe.urls.append(url)
-
- if obj.get('size') != None:
- fe.size = int(obj['size'])
- fe.sha256 = obj.get('sha256', fe.sha256)
- fe.md5 = obj.get('md5', fe.sha256)
- if obj.get('mimetype') is None:
- if fe.mimetype is None:
- fe.mimetype = self.default_mime
- else:
- fe.mimetype = obj.get('mimetype')
+ urls.add(("webarchive", wayback))
+ url = self.make_url(original)
+ if url != None:
+ urls.add(url)
+            urls = [fatcat_client.FileEntityUrls(rel=rel, url=url) for (rel, url) in urls]
+ if len(urls) == 0:
+ return None
+
+ size = obj.get('size')
+ if size:
+ size = int(size)
+
+ fe = fatcat_client.FileEntity(
+ md5=obj.get('md5'),
+ sha1=obj['sha1'],
+ sha256=obj.get('sha256'),
+ size=size,
+ mimetype=obj.get('mimetype'),
+ release_ids=release_ids,
+ urls=urls,
+ )
return fe
- def create_row(self, row, editgroup_id=None):
- obj = json.loads(row)
- fe = self.parse_matched_dict(obj)
- if fe is not None:
- if fe.ident is None:
- self.api.create_file(fe, editgroup_id=editgroup_id)
- self.counts['insert'] += 1
- else:
- self.api.update_file(fe.ident, fe, editgroup_id=editgroup_id)
- self.counts['update'] += 1
-
- def create_batch(self, batch):
- """Reads and processes in batches (not API-call-per-line)"""
- objects = [self.parse_matched_dict(json.loads(l))
- for l in batch if l != None]
- new_objects = [o for o in objects if o != None and o.ident == None]
- update_objects = [o for o in objects if o != None and o.ident != None]
- if len(update_objects):
- update_eg = self._editgroup().editgroup_id
- for obj in update_objects:
- self.api.update_file(obj.ident, obj, editgroup_id=update_eg)
- self.api.accept_editgroup(update_eg)
- if len(new_objects) > 0:
- self.api.create_file_batch(new_objects, autoaccept="true")
- self.counts['update'] += len(update_objects)
- self.counts['insert'] += len(new_objects)
+ def try_update(self, fe):
+ # lookup sha1, or create new entity
+ existing = None
+ try:
+ existing = self.api.lookup_file(sha1=fe.sha1)
+ except fatcat_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+
+ if not existing:
+ return True
+
+ fe.release_ids = list(set(fe.release_ids + existing.release_ids))
+ if set(fe.release_ids) == set(existing.release_ids) and len(existing.urls) > 0:
+ # no new release matches *and* there are already existing URLs
+ self.counts['exists'] += 1
+ return False
+
+ # merge the existing into this one and update
+ existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls]))
+ existing.urls = [fatcat_client.FileEntityUrls(rel=rel, url=url) for (rel, url) in existing.urls]
+ existing.release_ids = list(set(fe.release_ids + existing.release_ids))
+ existing.mimetype = existing.mimetype or fe.mimetype
+ existing.size = existing.size or fe.size
+ existing.md5 = existing.md5 or fe.md5
+ existing.sha256 = existing.sha256 or fe.sha256
+ self.api.update_file(existing.ident, existing, editgroup_id=self._get_editgroup())
+ self.counts['update'] += 1
+ return False
+
+ def insert_batch(self, batch):
+ self.api.create_file_batch(batch,
+ autoaccept=True,
+ description=self.editgroup_description,
+ extra=json.dumps(self.editgroup_extra))
+
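Carrying URLs as (rel, url) tuples keeps the merge in try_update() a plain
set union; roughly (fake values):

    import fatcat_client

    new = [('webarchive', 'https://web.archive.org/web/2017/http://x.com/a.pdf'),
           ('web', 'http://x.com/a.pdf')]
    old = [('web', 'http://x.com/a.pdf')]
    merged = set(new + old)          # the duplicate ('web', ...) collapses
    urls = [fatcat_client.FileEntityUrls(rel=rel, url=url)
            for (rel, url) in merged]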
diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py
index 0c8b1d62..02c9bf00 100644
--- a/python/fatcat_tools/importers/orcid.py
+++ b/python/fatcat_tools/importers/orcid.py
@@ -3,7 +3,7 @@ import sys
import json
import itertools
import fatcat_client
-from .common import FatcatImporter
+from .common import EntityImporter, clean
def value_or_none(e):
if type(e) == dict:
@@ -20,7 +20,7 @@ def value_or_none(e):
return None
return e
-class OrcidImporter(FatcatImporter):
+class OrcidImporter(EntityImporter):
def __init__(self, api, **kwargs):
@@ -32,14 +32,16 @@ class OrcidImporter(FatcatImporter):
editgroup_description=eg_desc,
editgroup_extra=eg_extra)
- def parse_orcid_dict(self, obj):
+ def want(self, raw_record):
+ return True
+
+ def parse_record(self, obj):
"""
obj is a python dict (parsed from json).
returns a CreatorEntity
"""
name = obj['person']['name']
- if name is None:
- return None
+ assert name
extra = None
given = value_or_none(name.get('given-names'))
sur = value_or_none(name.get('family-name'))
@@ -61,23 +63,30 @@ class OrcidImporter(FatcatImporter):
return None
ce = fatcat_client.CreatorEntity(
orcid=orcid,
- given_name=given,
- surname=sur,
- display_name=display,
+ given_name=clean(given),
+ surname=clean(sur),
+ display_name=clean(display),
extra=extra)
return ce
- def create_row(self, row, editgroup_id=None):
- obj = json.loads(row)
- ce = self.parse_orcid_dict(obj)
- if ce is not None:
- self.api.create_creator(ce, editgroup_id=editgroup_id)
- self.counts['insert'] += 1
+ def try_update(self, raw_record):
+ existing = None
+ try:
+ existing = self.api.lookup_creator(orcid=raw_record.orcid)
+ except fatcat_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+
+ # eventually we'll want to support "updates", but for now just skip if
+ # entity already exists
+ if existing:
+ self.counts['exists'] += 1
+ return False
+
+ return True
- def create_batch(self, batch):
- """Reads and processes in batches (not API-call-per-line)"""
- objects = [self.parse_orcid_dict(json.loads(l))
- for l in batch if l != None]
- objects = [o for o in objects if o != None]
- self.api.create_creator_batch(objects, autoaccept="true")
- self.counts['insert'] += len(objects)
+ def insert_batch(self, batch):
+ self.api.create_creator_batch(batch,
+ autoaccept=True,
+ description=self.editgroup_description,
+ extra=json.dumps(self.editgroup_extra))
diff --git a/python/fatcat_tools/transforms.py b/python/fatcat_tools/transforms.py
index 0f957f9a..2493b1ab 100644
--- a/python/fatcat_tools/transforms.py
+++ b/python/fatcat_tools/transforms.py
@@ -1,4 +1,5 @@
+
import collections
from fatcat_client import ReleaseEntity, ApiClient
@@ -26,25 +27,43 @@ def release_to_elasticsearch(release):
Raises exception on error (never returns None)
"""
- if release.state != 'active':
- raise ValueError("Entity is not 'active'")
+ if release.state in ('redirect', 'deleted'):
+ return dict(
+ ident = release.ident,
+ state = release.state,
+ )
+ elif release.state != 'active':
+ raise ValueError("Unhandled release state: {}".format(release.state))
# First, the easy ones (direct copy)
t = dict(
ident = release.ident,
+ state = release.state,
revision = release.revision,
title = release.title,
+ original_title = release.original_title,
release_type = release.release_type,
release_status = release.release_status,
language = release.language,
+ license = release.license_slug,
doi = release.doi,
pmid = release.pmid,
pmcid = release.pmcid,
isbn13 = release.isbn13,
+ wikidata_qid = release.wikidata_qid,
core_id = release.core_id,
- wikidata_qid = release.wikidata_qid
+        arxiv_id = release.arxiv_id,
+ jstor_id = release.jstor_id,
)
+ is_oa = None
+ is_longtail_oa = None
+ in_kbart = None
+ in_web = False
+ in_dweb = False
+ in_ia = False
+ in_shadow = False
+
if release.release_date:
# .isoformat() results in, eg, '2010-10-22' (YYYY-MM-DD)
t['release_date'] = release.release_date.isoformat()
@@ -53,52 +72,99 @@ def release_to_elasticsearch(release):
if release.release_year is not None:
t['release_year'] = release.release_year
+    t['any_abstract'] = len(release.abstracts or []) > 0
+ t['ref_count'] = len(release.refs or [])
+ t['contrib_count'] = len(release.contribs or [])
+ contrib_names = []
+ for c in (release.contribs or []):
+ if c.raw_name:
+ contrib_names.append(c.raw_name)
+ t['contrib_names'] = contrib_names
+
container = release.container
- container_is_kept = False
if container:
t['publisher'] = container.publisher
t['container_name'] = container.name
t['container_issnl'] = container.issnl
- container_extra = container.extra
- if container_extra:
- t['container_is_oa'] = container_extra.get('is_oa')
- container_is_kept = container_extra.get('is_kept', False)
- t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa')
+ t['container_type'] = container.container_type
+ if container.extra:
+ if container.extra.get('is_oa') or container.extra.get('in_doaj'):
+ is_oa = True
+ if container.extra.get('in_kbart'):
+ # TODO: better KBART check goes here
+ in_kbart = True
+ if container.extra.get('ia'):
+ # TODO: container longtail check goes here
+ # TODO: sim/microfilm check goes here
+ pass
+ # TODO: SHERPA/Romeo goes here
else:
t['publisher'] = release.publisher
files = release.files or []
t['file_count'] = len(files)
- in_wa = False
- in_ia = False
- t['file_pdf_url'] = None
+ t['fileset_count'] = len(release.filesets or [])
+ t['webcapture_count'] = len(release.webcaptures or [])
+ any_pdf_url = None
+ good_pdf_url = None
+ best_pdf_url = None
+ ia_pdf_url = None
for f in files:
+ if f.extra and f.extra.get('shadows'):
+ # TODO: shadow check goes here
+            in_shadow = True
is_pdf = 'pdf' in (f.mimetype or '')
for url in (f.urls or []):
- if url.rel == 'webarchive':
- in_wa = True
- if '//web.archive.org/' in (url.url or '') or '//archive.org/' in (url.url or ''):
+ if url.url.lower().startswith('http'):
+ in_web = True
+ if url.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'):
+ # TODO: not sure what rel will be
+ in_dweb = True
+ if is_pdf:
+ any_pdf_url = url.url
+                if is_pdf and url.rel in ('webarchive', 'repository'):
+ is_preserved = True
+ good_pdf_url = url.url
+ if '//web.archive.org/' in url.url or '//archive.org/' in url.url:
in_ia = True
if is_pdf:
- t['file_pdf_url'] = url.url
- if not t['file_pdf_url'] and is_pdf:
- t['file_pdf_url'] = url.url
- t['file_in_webarchive'] = in_wa
- t['file_in_ia'] = in_ia
+ best_pdf_url = url.url
+ ia_pdf_url = url.url
+ # here is where we bake-in priority; IA-specific
+ t['best_pdf_url'] = best_pdf_url or good_pdf_url or any_pdf_url
+ t['ia_pdf_url'] = ia_pdf_url
+
+ if release.license_slug:
+ # TODO: more/better checks here, particularly strict *not* OA licenses
+ if release.license_slug.startswith("CC-"):
+ is_oa = True
extra = release.extra or dict()
if extra:
- t['in_shadow'] = extra.get('in_shadow')
- if extra.get('grobid') and extra['grobid'].get('is_longtail_oa'):
- t['container_is_longtail_oa'] = True
- t['any_abstract'] = bool(release.abstracts)
- t['is_kept'] = container_is_kept or extra.get('is_kept', False)
+ # TODO: longtail OA check from GROBID here
+ if extra.get('in_kbart'):
+ # NOTE: not actually setting this anywhere
+ in_kbart = True
+ if extra.get('is_oa'):
+ # NOTE: not actually setting this anywhere
+ is_oa = True
+ if extra.get('grobid'):
+ if not t.get('container_name'):
+ t['container_name'] = extra['grobid'].get('container_name')
+ if extra['grobid'].get('longtail_oa'):
+ is_longtail_oa = True
+ if extra.get('crossref'):
+ if extra['crossref'].get('archive'):
+ # all crossref archives are KBART, I believe
+ in_kbart = True
- t['ref_count'] = len(release.refs or [])
- t['contrib_count'] = len(release.contribs or [])
- contrib_names = []
- for c in (release.contribs or []):
- if c.raw_name:
- contrib_names.append(c.raw_name)
- t['contrib_names'] = contrib_names
+ if is_longtail_oa:
+ is_oa = True
+ t['is_oa'] = is_oa
+ t['is_longtail_oa'] = is_longtail_oa
+ t['in_kbart'] = in_kbart
+ t['in_web'] = in_web
+ t['in_dweb'] = in_dweb
+ t['in_ia'] = in_ia
+ t['is_preserved'] = in_ia or in_kbart
return t
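
Redirect and deleted releases now serialize to a minimal stub instead of
raising; a sketch (assumes the swagger-generated constructor accepts these
fields; the ident is fake):

    import fatcat_client
    from fatcat_tools.transforms import release_to_elasticsearch

    release = fatcat_client.ReleaseEntity(ident='hsmo6p4smrganpb3fndaj2lon4',
                                          state='redirect')
    assert release_to_elasticsearch(release) == {
        'ident': 'hsmo6p4smrganpb3fndaj2lon4',
        'state': 'redirect',
    }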
diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py
index 8690a791..636ed304 100644
--- a/python/fatcat_tools/workers/changelog.py
+++ b/python/fatcat_tools/workers/changelog.py
@@ -93,7 +93,7 @@ class EntityUpdatesWorker(FatcatWorker):
release_edits = cle['editgroup']['edits']['releases']
for re in release_edits:
ident = re['ident']
- release = self.api.get_release(ident, expand="files,container")
+ release = self.api.get_release(ident, expand="files,filesets,webcaptures,container")
release_dict = self.api.api_client.sanitize_for_serialization(release)
producer.produce(
message=json.dumps(release_dict).encode('utf-8'),