Diffstat (limited to 'python/fatcat_tools')
-rw-r--r--   python/fatcat_tools/importers/__init__.py          |  19
-rw-r--r--   python/fatcat_tools/importers/common.py             | 390
-rw-r--r--   python/fatcat_tools/importers/crossref.py           | 263
-rw-r--r--   python/fatcat_tools/importers/grobid_metadata.py    | 123
-rw-r--r--   python/fatcat_tools/importers/issn.py               |  89
-rw-r--r--   python/fatcat_tools/importers/journal_metadata.py   | 183
-rw-r--r--   python/fatcat_tools/importers/matched.py            | 150
-rw-r--r--   python/fatcat_tools/importers/orcid.py              |  51
-rw-r--r--   python/fatcat_tools/transforms.py                   | 130
-rw-r--r--   python/fatcat_tools/workers/changelog.py            |   2
10 files changed, 933 insertions, 467 deletions
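Note: the refactor below replaces the old FatcatImporter base class with a two-part design: EntityImporter subclasses parse raw records and decide whether to insert, update, or skip, while RecordPusher subclasses iterate a raw source (JSON lines, CSV, plain lines, Kafka) and push records into an importer. As a rough orientation before the diff, a minimal sketch of how the two halves combine is shown here; the choice of the ORCID importer, the stdin source, and the API-client construction are illustrative assumptions, not part of this commit.

# Minimal sketch (assumed wiring, not from this commit): a RecordPusher
# iterates raw records and pushes them into an EntityImporter, which parses
# them, batches edits into editgroups, and reports counts when finished.
import sys

import fatcat_client
from fatcat_tools.importers import JsonLinePusher, OrcidImporter

# assumed: default client pointing at a fatcat API instance
api = fatcat_client.DefaultApi(fatcat_client.ApiClient())

importer = OrcidImporter(api, edit_batch_size=50)
pusher = JsonLinePusher(importer, sys.stdin)  # one JSON record per line

counts = pusher.run()  # calls importer.push_record() per line, then finish()
print(counts)          # Counter with 'insert', 'update', 'exists', 'skip', 'total'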
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py index e6f081e5..70f38f5b 100644 --- a/python/fatcat_tools/importers/__init__.py +++ b/python/fatcat_tools/importers/__init__.py @@ -1,7 +1,22 @@ -from .common import FatcatImporter, make_kafka_consumer +""" +To run an import you combine two classes; one each of: + +- RecordSource: somehow iterates over a source of raw records (eg, from a + database, Kafka, files on disk, stdin) and pushes into an entity importer. +- EntityImporter: class that a record iterator pushes raw (unparsed) records + into. The entity importer parses and decides what to do (ignore, update, + insert, etc). There is usually a primary entity type, though related entities + can be created along the way. Maintains API connection and editgroup/batch + state. + +""" + +from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, KafkaJsonPusher, make_kafka_consumer, clean from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP from .grobid_metadata import GrobidMetadataImporter -from .issn import IssnImporter +from .journal_metadata import JournalMetadataImporter from .matched import MatchedImporter from .orcid import OrcidImporter +#from .kafka_source import KafkaSource +#from .file_source import FileSource diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 06897bee..89203a4f 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -3,6 +3,7 @@ import re import sys import csv import json +import ftfy import itertools import subprocess from collections import Counter @@ -12,30 +13,66 @@ import fatcat_client from fatcat_client.rest import ApiException -# from: https://docs.python.org/3/library/itertools.html -def grouper(iterable, n, fillvalue=None): - "Collect data into fixed-length chunks or blocks" - args = [iter(iterable)] * n - return itertools.zip_longest(*args, fillvalue=fillvalue) +def clean(thing, force_xml=False): + """ + This function is appropriate to be called on any random, non-markup string, + such as author names, titles, etc. -def make_kafka_consumer(hosts, env, topic_suffix, group): - topic_name = "fatcat-{}.{}".format(env, topic_suffix).encode('utf-8') - client = pykafka.KafkaClient(hosts=hosts, broker_version="1.0.0") - consume_topic = client.topics[topic_name] - print("Consuming from kafka topic {}, group {}".format(topic_name, group)) + It will try to clean up commong unicode mangles, HTML characters, etc. - consumer = consume_topic.get_balanced_consumer( - consumer_group=group.encode('utf-8'), - managed=True, - auto_commit_enable=True, - auto_commit_interval_ms=30000, # 30 seconds - compacted_topic=True, - ) - return consumer + This will detect XML/HTML and "do the right thing" (aka, not remove + entities like '&' if there are tags in the string), unless you pass the + 'force_xml' parameter, which might be appropriate for, eg, names and + titles, which generally should be projected down to plain text. + + Also strips extra whitespace. 
+ """ + if not thing: + return thing + fix_entities = 'auto' + if force_xml: + fix_entities = True + fixed = ftfy.fix_text(thing, fix_entities=fix_entities).strip() + if not fixed: + # wasn't zero-length before, but is now; return None + return None + return fixed + +def test_clean(): -class FatcatImporter: + assert clean(None) == None + assert clean('') == '' + assert clean('123') == '123' + assert clean('a&b') == 'a&b' + assert clean('<b>a&b</b>') == '<b>a&b</b>' + assert clean('<b>a&b</b>', force_xml=True) == '<b>a&b</b>' + +class EntityImporter: """ - Base class for fatcat importers + Base class for fatcat entity importers. + + The API exposed to record iterator is: + + push_record(raw_record) + finish() + + The API that implementations are expected to fill in are: + + want(raw_record) -> boolean + parse(raw_record) -> entity + try_update(entity) -> boolean + insert_batch([entity]) -> None + + This class exposes helpers for implementations: + + self.api + self.create_<entity>(entity) -> EntityEdit + for related entity types + self.push_entity(entity) + self.counts['exists'] += 1 + if didn't update or insert because of existing) + self.counts['update'] += 1 + if updated an entity """ def __init__(self, api, **kwargs): @@ -43,87 +80,135 @@ class FatcatImporter: eg_extra = kwargs.get('editgroup_extra', dict()) eg_extra['git_rev'] = eg_extra.get('git_rev', subprocess.check_output(["git", "describe", "--always"]).strip()).decode('utf-8') - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.FatcatImporter') + eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.EntityImporter') self.api = api - self._editgroup_description = kwargs.get('editgroup_description') - self._editgroup_extra = kwargs.get('editgroup_extra') - issn_map_file = kwargs.get('issn_map_file') + self.bezerk_mode = kwargs.get('bezerk_mode', False) + self.edit_batch_size = kwargs.get('edit_batch_size', 100) + self.editgroup_description = kwargs.get('editgroup_description') + self.editgroup_extra = kwargs.get('editgroup_extra') + self.reset() self._issnl_id_map = dict() self._orcid_id_map = dict() - self._doi_id_map = dict() - if issn_map_file: - self.read_issn_map_file(issn_map_file) self._orcid_regex = re.compile("^\\d{4}-\\d{4}-\\d{4}-\\d{3}[\\dX]$") - self.counts = Counter({'insert': 0, 'update': 0, 'processed_lines': 0}) + self._doi_id_map = dict() - def _editgroup(self): - eg = fatcat_client.Editgroup( - description=self._editgroup_description, - extra=self._editgroup_extra, - ) - return self.api.create_editgroup(eg) + def reset(self): + self.counts = Counter({'skip': 0, 'insert': 0, 'update': 0, 'exists': 0}) + self._edit_count = 0 + self._editgroup_id = None + self._entity_queue = [] - def describe_run(self): - print("Processed {} lines, inserted {}, updated {}.".format( - self.counts['processed_lines'], self.counts['insert'], self.counts['update'])) + def push_record(self, raw_record): + """ + Returns nothing. 
+ """ + if (not raw_record) or (not self.want(raw_record)): + self.counts['skip'] += 1 + return + entity = self.parse_record(raw_record) + if not entity: + self.counts['skip'] += 1 + return + if self.bezerk_mode: + self.push_entity(entity) + return + if self.try_update(entity): + self.push_entity(entity) + return - def create_row(self, row, editgroup_id=None): - # sub-classes expected to implement this - raise NotImplementedError + def finish(self): + if self._edit_count > 0: + self.api.accept_editgroup(self._editgroup_id) + self._editgroup_id = None + self._edit_count = 0 + + if self._entity_queue: + self.insert_batch(self._entity_queue) + self.counts['insert'] += len(self._entity_queue) + self._entity_queue = [] + + self.counts['total'] = 0 + for key in ('skip', 'insert', 'update', 'exists'): + self.counts['total'] += self.counts[key] + return self.counts + + def _get_editgroup(self, edits=1): + if self._edit_count >= self.edit_batch_size: + self.api.accept_editgroup(self._editgroup_id) + self._editgroup_id = None + self._edit_count = 0 - def create_batch(self, rows, editgroup_id=None): - # sub-classes expected to implement this + if not self._editgroup_id: + eg = self.api.create_editgroup( + fatcat_client.Editgroup( + description=self.editgroup_description, + extra=self.editgroup_extra)) + self._editgroup_id = eg.editgroup_id + + self._edit_count += edits + return self._editgroup_id + + def create_container(self, entity): + eg_id = self._get_editgroup() + self.counts['inserted.container'] += 1 + return self.api.create_container(entity, editgroup_id=eg_id) + + def create_release(self, entity): + eg_id = self._get_editgroup() + self.counts['inserted.release'] += 1 + return self.api.create_release(entity, editgroup_id=eg_id) + + def create_file(self, entity): + eg_id = self._get_editgroup() + self.counts['inserted.file'] += 1 + return self.api.create_file(entity, editgroup_id=eg_id) + + def updated(self): + """ + Implementations should call this from try_update() if the update was successful + """ + self.counts['update'] += 1 + + def push_entity(self, entity): + self._entity_queue.append(entity) + if len(self._entity_queue) >= self.edit_batch_size: + self.insert_batch(self._entity_queue) + self.counts['insert'] += len(_entity_queue) + self._entity_queue = 0 + + def want(self, raw_record): + """ + Implementations can override for optional fast-path to drop a record. + Must have no side-effects; returns bool. + """ + return True + + def parse(self, raw_record): + """ + Returns an entity class type, or None if we should skip this one. + + May have side-effects (eg, create related entities), but shouldn't + update/mutate the actual entity. 
+ """ raise NotImplementedError - def process_source(self, source, group_size=100): - """Creates and auto-accepts editgroup every group_size rows""" - eg = self._editgroup() - i = 0 - for i, row in enumerate(source): - self.create_row(row, editgroup_id=eg.editgroup_id) - if i > 0 and (i % group_size) == 0: - self.api.accept_editgroup(eg.editgroup_id) - eg = self._editgroup() - self.counts['processed_lines'] += 1 - if i == 0 or (i % group_size) != 0: - self.api.accept_editgroup(eg.editgroup_id) - - def process_batch(self, source, size=50, decode_kafka=False): - """Reads and processes in batches (not API-call-per-)""" - for rows in grouper(source, size): - if decode_kafka: - rows = [msg.value.decode('utf-8') for msg in rows] - self.counts['processed_lines'] += len(rows) - #eg = self._editgroup() - #self.create_batch(rows, editgroup_id=eg.editgroup_id) - self.create_batch(rows) - - def process_csv_source(self, source, group_size=100, delimiter=','): - reader = csv.DictReader(source, delimiter=delimiter) - self.process_source(reader, group_size) - - def process_csv_batch(self, source, size=50, delimiter=','): - reader = csv.DictReader(source, delimiter=delimiter) - self.process_batch(reader, size) + def try_update(self, raw_record): + """ + Passed the output of parse(). Should try to find an existing entity and + update it (PUT), decide we should do nothing (based on the existing + record), or create a new one. - def is_issnl(self, issnl): - return len(issnl) == 9 and issnl[4] == '-' + Implementations must update the exists/updated/skip counts + appropriately in this method. - def lookup_issnl(self, issnl): - """Caches calls to the ISSN-L lookup API endpoint in a local dict""" - if issnl in self._issnl_id_map: - return self._issnl_id_map[issnl] - container_id = None - try: - rv = self.api.lookup_container(issnl=issnl) - container_id = rv.ident - except ApiException as ae: - # If anything other than a 404 (not found), something is wrong - assert ae.status == 404 - self._issnl_id_map[issnl] = container_id # might be None - return container_id + Returns boolean: True if the entity should still be inserted, False otherwise + """ + raise NotImplementedError + + def insert_batch(self, raw_record): + raise NotImplementedError def is_orcid(self, orcid): return self._orcid_regex.match(orcid) is not None @@ -163,6 +248,23 @@ class FatcatImporter: self._doi_id_map[doi] = release_id # might be None return release_id + def is_issnl(self, issnl): + return len(issnl) == 9 and issnl[4] == '-' + + def lookup_issnl(self, issnl): + """Caches calls to the ISSN-L lookup API endpoint in a local dict""" + if issnl in self._issnl_id_map: + return self._issnl_id_map[issnl] + container_id = None + try: + rv = self.api.lookup_container(issnl=issnl) + container_id = rv.ident + except ApiException as ae: + # If anything other than a 404 (not found), something is wrong + assert ae.status == 404 + self._issnl_id_map[issnl] = container_id # might be None + return container_id + def read_issn_map_file(self, issn_map_file): print("Loading ISSN map file...") self._issn_issnl_map = dict() @@ -179,3 +281,117 @@ class FatcatImporter: if issn is None: return None return self._issn_issnl_map.get(issn) + + +class RecordPusher: + """ + Base class for different importer sources. Pretty trivial interface, just + wraps an importer and pushes records in to it. 
+ """ + + def __init__(self, importer, **kwargs): + self.importer = importer + + def run(self): + """ + This will look something like: + + for line in sys.stdin: + record = json.loads(line) + self.importer.push_record(record) + print(self.importer.finish()) + """ + raise NotImplementedError + + +class JsonLinePusher(RecordPusher): + + def __init__(self, importer, json_file, **kwargs): + self.importer = importer + self.json_file = json_file + + def run(self): + for line in self.json_file: + if not line: + continue + record = json.loads(line) + self.importer.push_record(record) + counts = self.importer.finish() + print(counts) + return counts + + +class CsvPusher(RecordPusher): + + def __init__(self, importer, csv_file, **kwargs): + self.importer = importer + self.reader = csv.DictReader(csv_file, delimiter=kwargs.get('delimiter', ',')) + + def run(self): + for line in self.reader: + if not line: + continue + self.importer.push_record(line) + counts = self.importer.finish() + print(counts) + return counts + + +class LinePusher(RecordPusher): + + def __init__(self, importer, text_file, **kwargs): + self.importer = importer + self.text_file = text_file + + def run(self): + for line in self.text_file: + if not line: + continue + self.importer.push_record(line) + counts = self.importer.finish() + print(counts) + return counts + + +class KafkaJsonPusher(RecordPusher): + + def __init__(self, importer, kafka_hosts, kafka_env, topic_suffix, group, **kwargs): + self.importer = importer + self.consumer = make_kafka_consumer( + kafka_hosts, + kafka_env, + topic_suffix, + group, + ) + + def run(self): + count = 0 + for msg in self.consumer: + if not msg: + continue + record = json.loads(msg.value.decode('utf-8')) + self.importer.push_record(record) + count += 1 + if count % 500 == 0: + print("Import counts: {}".format(self.importer.counts)) + # TODO: should catch UNIX signals (HUP?) 
to shutdown cleanly, and/or + # commit the current batch if it has been lingering + counts = self.importer.finish() + print(counts) + return counts + + +def make_kafka_consumer(hosts, env, topic_suffix, group): + topic_name = "fatcat-{}.{}".format(env, topic_suffix).encode('utf-8') + client = pykafka.KafkaClient(hosts=hosts, broker_version="1.0.0") + consume_topic = client.topics[topic_name] + print("Consuming from kafka topic {}, group {}".format(topic_name, group)) + + consumer = consume_topic.get_balanced_consumer( + consumer_group=group.encode('utf-8'), + managed=True, + auto_commit_enable=True, + auto_commit_interval_ms=30000, # 30 seconds + compacted_topic=True, + ) + return consumer diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index 6365e491..00c719f1 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -6,7 +6,7 @@ import datetime import itertools import subprocess import fatcat_client -from .common import FatcatImporter +from .common import EntityImporter, clean # The docs/guide should be the cannonical home for these mappings; update there @@ -32,7 +32,32 @@ CROSSREF_TYPE_MAP = { 'standard': 'standard', } -class CrossrefImporter(FatcatImporter): +CONTAINER_TYPE_MAP = { + 'article-journal': 'journal', + 'paper-conference': 'conference', + 'book': 'book-series', +} + +# TODO: +LICENSE_SLUG_MAP = { + "http://creativecommons.org/licenses/by/3.0/": "CC-BY", + "http://creativecommons.org/licenses/by/4.0/": "CC-BY", + "http://creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA", + "http://creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA", + "http://creativecommons.org/licenses/by-nd/3.0/": "CC-BY-ND", + "http://creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND", + "http://creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC", + "http://creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC", + "http://creativecommons.org/licenses/by-nc-sa/3.0/": "CC-BY-NC-SA", + "http://creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA", + "http://creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND", + "http://creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-NC-ND", + "http://www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0", + # http://onlinelibrary.wiley.com/termsAndConditions doesn't seem like a license + # http://www.springer.com/tdm doesn't seem like a license +} + +class CrossrefImporter(EntityImporter): """ Importer for Crossref metadata. 
@@ -51,9 +76,9 @@ class CrossrefImporter(FatcatImporter): issn_map_file=issn_map_file, editgroup_description=eg_desc, editgroup_extra=eg_extra) + + self.create_containers = kwargs.get('create_containers') extid_map_file = kwargs.get('extid_map_file') - create_containers = kwargs.get('create_containers') - check_existing = kwargs.get('check_existing') self.extid_map_db = None if extid_map_file: db_uri = "file:{}?mode=ro".format(extid_map_file) @@ -61,36 +86,46 @@ class CrossrefImporter(FatcatImporter): self.extid_map_db = sqlite3.connect(db_uri, uri=True) else: print("Not using external ID map") - self.create_containers = create_containers - self.check_existing = check_existing + + self.read_issn_map_file(issn_map_file) def lookup_ext_ids(self, doi): if self.extid_map_db is None: - return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None) + return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()]).fetchone() if row is None: - return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None) + return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) row = [str(cell or '') or None for cell in row] return dict( core_id=row[0], pmid=row[1], pmcid=row[2], - wikidata_qid=row[3]) + wikidata_qid=row[3], + # TODO: + arxiv_id=None, + jstor_id=None, + ) def map_release_type(self, crossref_type): return CROSSREF_TYPE_MAP.get(crossref_type) - def parse_crossref_dict(self, obj): + def map_container_type(self, crossref_type): + return CONTAINER_TYPE_MAP.get(crossref_type) + + def want(self, obj): + if not obj.get('title'): + return False + + # do most of these checks in-line below + return True + + def parse_record(self, obj): """ obj is a python dict (parsed from json). 
returns a ReleaseEntity """ - # Do require the 'title' keys to exsit, as release entities do - if (not 'title' in obj) or (not obj['title']): - return None - # Ways to be out of scope (provisionally) # journal-issue and journal-volume map to None, but allowed for now if obj.get('type') in (None, 'journal', 'proceedings', @@ -98,20 +133,12 @@ class CrossrefImporter(FatcatImporter): 'book-track', 'proceedings-series'): return None - # lookup existing DOI - existing_release = None - if self.check_existing: - try: - existing_release = self.api.lookup_release(doi=obj['DOI'].lower()) - except fatcat_client.rest.ApiException as err: - if err.status != 404: - raise err - - # eventually we'll want to support "updates", but for now just skip if - # entity already exists - if existing_release: + # Do require the 'title' keys to exsit, as release entities do + if (not 'title' in obj) or (not obj['title']): return None + release_type = self.map_release_type(obj['type']) + # contribs def do_contribs(obj_list, ctype): contribs = [] @@ -132,18 +159,23 @@ class CrossrefImporter(FatcatImporter): index = i else: index = None + raw_affiliation = None if am.get('affiliation'): - # note: affiliation => affiliations - extra['affiliations'] = am.get('affiliation') + if len(am.get('affiliation')) > 0: + raw_affiliation = am.get('affiliation')[0]['name'] + if len(am.get('affiliation')) > 1: + # note: affiliation => more_affiliations + extra['more_affiliations'] = [clean(a['name']) for a in am.get('affiliation')[1:]] if am.get('sequence') and am.get('sequence') != "additional": - extra['sequence'] = am.get('sequence') + extra['seq'] = clean(am.get('sequence')) if not extra: extra = None assert ctype in ("author", "editor", "translator") contribs.append(fatcat_client.ReleaseContrib( creator_id=creator_id, index=index, - raw_name=raw_name, + raw_name=clean(raw_name), + raw_affiliation=clean(raw_affiliation), role=ctype, extra=extra)) return contribs @@ -159,28 +191,40 @@ class CrossrefImporter(FatcatImporter): container_id = self.lookup_issnl(issnl) publisher = obj.get('publisher') - ce = None if (container_id is None and self.create_containers and (issnl is not None) and obj.get('container-title') and len(obj['container-title']) > 0): ce = fatcat_client.ContainerEntity( issnl=issnl, - publisher=publisher, - name=obj['container-title'][0]) + publisher=clean(publisher), + container_type=self.map_container_type(release_type), + name=clean(obj['container-title'][0], force_xml=True)) + ce_edit = self.create_container(ce) + container_id = ce_edit.ident + + # license slug + license_slug = None + license_extra = [] + for l in obj.get('license', []): + if l['content-version'] not in ('vor', 'unspecified'): + continue + slug = LICENSE_SLUG_MAP.get(l['URL']) + if slug: + license_slug = slug + if 'start' in l: + l['start'] = l['start']['date-time'] + license_extra.append(l) # references refs = [] for i, rm in enumerate(obj.get('reference', [])): try: year = int(rm.get('year')) - # NOTE: will need to update/config in the future! + # TODO: will need to update/config in the future! # NOTE: are there crossref works with year < 100? 
if year > 2025 or year < 100: year = None except: year = None - extra = rm.copy() - if rm.get('DOI'): - extra['doi'] = rm.get('DOI').lower() key = rm.get('key') if key and key.startswith(obj['DOI'].upper()): key = key.replace(obj['DOI'].upper() + "-", '') @@ -188,14 +232,18 @@ class CrossrefImporter(FatcatImporter): container_name = rm.get('volume-title') if not container_name: container_name = rm.get('journal-title') - extra.pop('DOI', None) - extra.pop('key', None) - extra.pop('year', None) - extra.pop('volume-name', None) - extra.pop('journal-title', None) - extra.pop('title', None) - extra.pop('first-page', None) - extra.pop('doi-asserted-by', None) + elif rm.get('journal-title'): + extra['journal-title'] = rm['journal-title'] + extra = dict() + if rm.get('DOI'): + extra['doi'] = rm.get('DOI').lower() + # TODO: what fields here? CSL citation stuff + for k in ('author', 'editor', 'edition', 'authority', 'version', + 'genre', 'url', 'event', 'issue', 'volume', 'date', + 'accessed_date', 'issued', 'page', 'medium', + 'collection_title', 'chapter_number'): + if clean(rm.get(k)): + extra[k] = clean(rm[k]) if extra: extra = dict(crossref=extra) else: @@ -206,9 +254,9 @@ class CrossrefImporter(FatcatImporter): target_release_id=None, key=key, year=year, - container_name=container_name, - title=rm.get('title'), - locator=rm.get('first-page'), + container_name=clean(container_name), + title=clean(rm.get('title')), + locator=clean(rm.get('first-page')), # TODO: just dump JSON somewhere here? extra=extra)) @@ -217,25 +265,24 @@ class CrossrefImporter(FatcatImporter): if obj.get('abstract') != None: abstracts.append(fatcat_client.ReleaseEntityAbstracts( mimetype="application/xml+jats", - content=obj.get('abstract'))) + content=clean(obj.get('abstract')))) # extra fields extra = dict() - for key in ('subject', 'type', 'license', 'alternative-id', - 'container-title', 'original-title', 'subtitle', 'archive', - 'funder', 'group-title'): - # TODO: unpack "container-title" array + for key in ('subject', 'type', 'alternative-id', 'container-title', + 'subtitle', 'archive', 'funder', 'group-title'): + # TODO: unpack "container-title" array? 
val = obj.get(key) if val: - extra[key] = val - if 'license' in extra and extra['license']: - for i in range(len(extra['license'])): - if 'start' in extra['license'][i]: - extra['license'][i]['start'] = extra['license'][i]['start']['date-time'] + if type(val) == str: + extra[key] = clean(val) + else: + extra[key] = val + if license_extra: + extra['license'] = license_extra + if len(obj['title']) > 1: - extra['other-titles'] = obj['title'][1:] - # TODO: this should be top-level - extra['is_kept'] = len(obj.get('archive', [])) > 0 + extra['other-titles'] = [clean(t) for t in obj['title'][1:]] # ISBN isbn13 = None @@ -277,59 +324,57 @@ class CrossrefImporter(FatcatImporter): re = fatcat_client.ReleaseEntity( work_id=None, - title=obj.get('title', [None])[0], - contribs=contribs, - refs=refs, container_id=container_id, - publisher=publisher, - release_type=self.map_release_type(obj['type']), + title=clean(obj.get('title', [None])[0], force_xml=True), + original_title=clean(obj.get('original-title', [None])[0]), + release_type=release_type, release_status=release_status, + release_date=release_date, + release_year=release_year, + publisher=clean(publisher), doi=obj['DOI'].lower(), - isbn13=isbn13, - core_id=extids['core_id'], pmid=extids['pmid'], pmcid=extids['pmcid'], wikidata_qid=extids['wikidata_qid'], - release_date=release_date, - release_year=release_year, - issue=obj.get('issue'), - volume=obj.get('volume'), - pages=obj.get('page'), + isbn13=isbn13, + core_id=extids['core_id'], + arxiv_id=extids['arxiv_id'], + jstor_id=extids['jstor_id'], + volume=clean(obj.get('volume')), + issue=clean(obj.get('issue')), + pages=clean(obj.get('page')), + language=None, # crossref doesn't supply language info + license_slug=license_slug, + extra=dict(crossref=extra), abstracts=abstracts, - extra=dict(crossref=extra)) - return (re, ce) + contribs=contribs, + refs=refs, + ) + return re + + def try_update(self, re): + + # lookup existing DOI (don't need to try other ext idents for crossref) + existing = None + try: + existing = self.api.lookup_release(doi=re.doi) + except fatcat_client.rest.ApiException as err: + if err.status != 404: + raise err + # doesn't exist, need to update + return True + + # eventually we'll want to support "updates", but for now just skip if + # entity already exists + if existing: + self.counts['exists'] += 1 + return False + + return True + + def insert_batch(self, batch): + self.api.create_release_batch(batch, + autoaccept=True, + description=self.editgroup_description, + extra=json.dumps(self.editgroup_extra)) - def create_row(self, row, editgroup_id=None): - if row is None: - return - obj = json.loads(row) - entities = self.parse_crossref_dict(obj) - if entities is not None: - (re, ce) = entities - if ce is not None: - container = self.api.create_container(ce, editgroup_id=editgroup_id) - re.container_id = container.ident - self._issnl_id_map[ce.issnl] = container.ident - self.api.create_release(re, editgroup_id=editgroup_id) - self.counts['insert'] += 1 - - def create_batch(self, batch): - """Current work/release pairing disallows batch creation of releases. 
- Could do batch work creation and then match against releases, but meh.""" - release_batch = [] - for row in batch: - if row is None: - continue - obj = json.loads(row) - entities = self.parse_crossref_dict(obj) - if entities is not None: - (re, ce) = entities - if ce is not None: - ce_eg = self.api.create_editgroup(fatcat_client.Editgroup()) - container = self.api.create_container(ce, editgroup_id=ce_eg.editgroup_id) - self.api.accept_editgroup(ce_eg.editgroup_id) - re.container_id = container.ident - self._issnl_id_map[ce.issnl] = container.ident - release_batch.append(re) - self.api.create_release_batch(release_batch, autoaccept="true") - self.counts['insert'] += len(release_batch) diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py index 5e61a154..9d95fe0b 100644 --- a/python/fatcat_tools/importers/grobid_metadata.py +++ b/python/fatcat_tools/importers/grobid_metadata.py @@ -5,12 +5,22 @@ import json import base64 import datetime import fatcat_client -from .common import FatcatImporter +from .common import EntityImporter, clean MAX_ABSTRACT_BYTES=4096 -class GrobidMetadataImporter(FatcatImporter): +class GrobidMetadataImporter(EntityImporter): + """ + This is a complex case: we need to parse and create both file and release entities. + + The "primary" entity here is really File, not Release. If a matching File + exists, we bail in want(); if not we insert the Release during parsing, and + insert both. + + TODO: should instead check if the File has any releases; if not, insert and update. + TODO: relaxing 'None' constraint on parse_record() might make this refactor-able. + """ def __init__(self, api, **kwargs): @@ -22,6 +32,45 @@ class GrobidMetadataImporter(FatcatImporter): editgroup_description=eg_desc, editgroup_extra=eg_extra) self.default_link_rel = kwargs.get("default_link_rel", "web") + self.longtail_oa = kwargs.get("longtail_oa", False) + + def want(self, raw_record): + return True + + def parse_record(self, row): + + fields = row.split('\t') + sha1_key = fields[0] + cdx = json.loads(fields[1]) + mimetype = fields[2] + file_size = int(fields[3]) + grobid_meta = json.loads(fields[4]) + fe = self.parse_file_metadata(sha1_key, cdx, mimetype, file_size) + re = self.parse_grobid_json(grobid_meta) + + if not (fe and re): + return None + + # lookup existing file SHA1 + existing = None + try: + existing = self.api.lookup_file(sha1=fe.sha1) + except fatcat_client.rest.ApiException as err: + if err.status != 404: + raise err + + # if file is already in here, presumably not actually long-tail + # HACK: this is doing an exists check in parse_record(), which is weird + # TODO: this is where we should check if the file actually has + # release_ids and/or URLs associated with it + if existing and not self.bezerk_mode: + self.counts['exists'] += 1 + self.counts['skip'] -= 1 + return None + + release_edit = self.create_release(re) + fe.release_ids.append(release_edit.ident) + return fe def parse_grobid_json(self, obj): @@ -34,7 +83,7 @@ class GrobidMetadataImporter(FatcatImporter): abobj = dict( mimetype="text/plain", language=None, - content=obj.get('abstract').strip()) + content=clean(obj.get('abstract'))) abstracts = [abobj] else: abstracts = None @@ -43,17 +92,18 @@ class GrobidMetadataImporter(FatcatImporter): for i, a in enumerate(obj.get('authors', [])): contribs.append(fatcat_client.ReleaseContrib( index=i, - raw_name=a['name'], + raw_name=clean(a['name']), role="author", extra=None)) + # XXX: why is this a dict()? 
not covered by tests? refs = [] for raw in obj.get('citations', []): cite_extra = dict() ref = dict() - ref['key'] = raw.get('id') + ref['key'] = clean(raw.get('id')) if raw.get('title'): - ref['title'] = raw['title'].strip() + ref['title'] = clean(raw['title']) if raw.get('date'): try: year = int(raw['date'].strip()[:4]) @@ -62,9 +112,9 @@ class GrobidMetadataImporter(FatcatImporter): pass for key in ('volume', 'url', 'issue', 'publisher'): if raw.get(key): - cite_extra[key] = raw[key].strip() + cite_extra[key] = clean(raw[key]) if raw.get('authors'): - cite_extra['authors'] = [a['name'] for a in raw['authors']] + cite_extra['authors'] = [clean(a['name']) for a in raw['authors']] if cite_extra: cite_extra = dict(grobid=cite_extra) else: @@ -81,27 +131,28 @@ class GrobidMetadataImporter(FatcatImporter): if obj.get('doi'): extra['doi'] = obj['doi'] if obj['journal'] and obj['journal'].get('name'): - extra['container_name'] = obj['journal']['name'] - - extra['is_longtail_oa'] = True + extra['container_name'] = clean(obj['journal']['name']) # TODO: ISSN/eISSN handling? or just journal name lookup? + if self.longtail_oa: + extra['longtail_oa'] = True + if extra: extra = dict(grobid=extra) else: extra = None re = fatcat_client.ReleaseEntity( - title=obj['title'].strip(), + title=clean(obj['title'], force_xml=True), release_type="article-journal", release_date=release_date, release_year=release_year, contribs=contribs, refs=refs, - publisher=obj['journal'].get('publisher'), - volume=obj['journal'].get('volume'), - issue=obj['journal'].get('issue'), + publisher=clean(obj['journal'].get('publisher')), + volume=clean(obj['journal'].get('volume')), + issue=clean(obj['journal'].get('issue')), abstracts=abstracts, extra=extra) return re @@ -122,17 +173,6 @@ class GrobidMetadataImporter(FatcatImporter): sha1 = base64.b16encode(base64.b32decode(sha1_key.replace('sha1:', ''))).decode('ascii').lower() - # lookup existing SHA1, or create new entity - try: - existing_file = self.api.lookup_file(sha1=sha1) - except fatcat_client.rest.ApiException as err: - if err.status != 404: - raise err - existing_file = None - - if existing_file: - # if file is already in here, presumably not actually long-tail - return None fe = fatcat_client.FileEntity( sha1=sha1, size=int(file_size), @@ -143,6 +183,7 @@ class GrobidMetadataImporter(FatcatImporter): # parse URLs and CDX original = cdx['url'] + assert len(cdx['dt']) >= 8 wayback = "https://web.archive.org/web/{}/{}".format( cdx['dt'], original) @@ -154,23 +195,13 @@ class GrobidMetadataImporter(FatcatImporter): return fe - def create_row(self, row, editgroup_id=None): - if not row: - return - fields = row.split('\t') - sha1_key = fields[0] - cdx = json.loads(fields[1]) - mimetype = fields[2] - file_size = int(fields[3]) - grobid_meta = json.loads(fields[4]) - fe = self.parse_file_metadata(sha1_key, cdx, mimetype, file_size) - re = self.parse_grobid_json(grobid_meta) - if fe and re: - release_entity = self.api.create_release(re, editgroup_id=editgroup_id) - # release ident can't already be in release list because we just - # created it - fe.release_ids.append(release_entity.ident) - file_entity = self.api.create_file(fe, editgroup_id=editgroup_id) - self.counts['insert'] += 1 - - # NB: batch mode not implemented + def try_update(self, entity): + # did the exists check in 'parse_record()', because we needed to create a release + return True + + def insert_batch(self, batch): + self.api.create_file_batch(batch, + autoaccept=True, + description=self.editgroup_description, + 
extra=json.dumps(self.editgroup_extra)) + diff --git a/python/fatcat_tools/importers/issn.py b/python/fatcat_tools/importers/issn.py deleted file mode 100644 index f4d525a4..00000000 --- a/python/fatcat_tools/importers/issn.py +++ /dev/null @@ -1,89 +0,0 @@ - -import sys -import json -import itertools -import fatcat_client -from .common import FatcatImporter - - -def or_none(s): - if s is None: - return None - if len(s) == 0: - return None - return s - -def truthy(s): - if s is None: - return None - s = s.lower() - - if s in ('true', 't', 'yes', 'y', '1'): - return True - elif s in ('false', 'f', 'no', 'n', '0'): - return False - else: - return None - -class IssnImporter(FatcatImporter): - """ - Imports journal metadata ("containers") by ISSN, currently from a custom - (data munged) .csv file format - - CSV format (generated from git.archive.org/webgroup/oa-journal-analysis): - - ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count - """ - - def __init__(self, api, **kwargs): - - eg_desc = kwargs.get('editgroup_description', - "Automated import of container-level metadata, by ISSN. Metadata from Internet Archive munging.") - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IssnImporter') - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra) - - def parse_issn_row(self, row): - """ - row is a python dict (parsed from CSV). - returns a ContainerEntity (or None if invalid or couldn't parse) - """ - title = or_none(row['title']) - issnl = or_none(row['ISSN-L']) - if title is None or issnl is None: - return None - extra = dict( - in_doaj=truthy(row['in_doaj']), - in_road=truthy(row['in_road']), - in_norwegian=truthy(row['in_norwegian']), - language=or_none(row['lang']), - url=or_none(row['url']), - ISSNp=or_none(row['ISSN-print']), - ISSNe=or_none(row['ISSN-electronic']), - is_oa=truthy(row['is_oa']), - is_kept=truthy(row['is_kept']), - ) - ce = fatcat_client.ContainerEntity( - issnl=issnl, - name=title, - publisher=or_none(row['publisher']), - abbrev=None, - coden=None, - extra=extra) - return ce - - def create_row(self, row, editgroup_id=None): - ce = self.parse_issn_row(row) - if ce is not None: - self.api.create_container(ce, editgroup_id=editgroup_id) - self.counts['insert'] += 1 - - def create_batch(self, batch): - """Reads and processes in batches (not API-call-per-line)""" - objects = [self.parse_issn_row(l) - for l in batch if (l is not None)] - objects = [o for o in objects if (o is not None)] - self.api.create_container_batch(objects, autoaccept="true") - self.counts['insert'] += len(objects) diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py new file mode 100644 index 00000000..cf3971b5 --- /dev/null +++ b/python/fatcat_tools/importers/journal_metadata.py @@ -0,0 +1,183 @@ + +import sys +import json +import itertools +import fatcat_client +from .common import EntityImporter, clean + + +def or_none(s): + if s is None: + return None + if len(s) == 0: + return None + return s + +def truthy(s): + if s is None: + return None + s = s.lower() + + if s in ('true', 't', 'yes', 'y', '1'): + return True + elif s in ('false', 'f', 'no', 'n', '0'): + return False + else: + return None + +class 
JournalMetadataImporter(EntityImporter): + """ + Imports journal metadata ("containers") by ISSN, currently from a custom + (data munged) .csv file format + + CSV format (generated from git.archive.org/webgroup/oa-journal-analysis): + + ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count + + + 'extra' fields: + + doaj + as_of: datetime of most recent check; if not set, not actually in DOAJ + seal: bool + work_level: bool (are work-level publications deposited with DOAJ?) + archiving: array, can include 'library' or 'other' + road + as_of: datetime of most recent check; if not set, not actually in ROAD + pubmed (TODO: delete?) + as_of: datetime of most recent check; if not set, not actually indexed in pubmed + norwegian (TODO: drop this?) + as_of: datetime of most recent check; if not set, not actually indexed in pubmed + id (integer) + level (integer; 0-2) + kbart + lockss + year_rle + volume_rle + portico + ... + clockss + ... + sherpa_romeo + color + jstor + year_rle + volume_rle + scopus + id + TODO: print/electronic distinction? + wos + id + doi + crossref_doi: DOI of the title in crossref (if exists) + prefixes: array of strings (DOI prefixes, up to the '/'; any registrar, not just Crossref) + ia + sim + nap_id + year_rle + volume_rle + longtail: boolean + homepage + as_of: datetime of last attempt + url + status: HTTP/heritrix status of homepage crawl + + issnp: string + issne: string + coden: string + abbrev: string + oclc_id: string (TODO: lookup?) + lccn_id: string (TODO: lookup?) + dblb_id: string + default_license: slug + original_name: native name (if name is translated) + platform: hosting platform: OJS, wordpress, scielo, etc + mimetypes: array of strings (eg, 'application/pdf', 'text/html') + first_year: year (integer) + last_year: if publishing has stopped + primary_language: single ISO code, or 'mixed' + languages: array of ISO codes + region: TODO: continent/world-region + nation: shortcode of nation + discipline: TODO: highest-level subject; "life science", "humanities", etc + field: TODO: narrower description of field + subjects: TODO? + url: homepage + is_oa: boolean. If true, can assume all releases under this container are "Open Access" + TODO: domains, if exclusive? + TODO: fulltext_regex, if a known pattern? + + For KBART, etc: + We "over-count" on the assumption that "in-progress" status works will soon actually be preserved. + year and volume spans are run-length-encoded arrays, using integers: + - if an integer, means that year is preserved + - if an array of length 2, means everything between the two numbers (inclusive) is preserved + """ + + def __init__(self, api, **kwargs): + + eg_desc = kwargs.get('editgroup_description', + "Automated import of container-level metadata, by ISSN. Metadata from Internet Archive munging.") + eg_extra = kwargs.get('editgroup_extra', dict()) + eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.JournalMetadataImporter') + super().__init__(api, + editgroup_description=eg_desc, + editgroup_extra=eg_extra) + + def want(self, raw_record): + if raw_record.get('ISSN-L'): + return True + return False + + def parse_record(self, row): + """ + row is a python dict (parsed from CSV). 
+ returns a ContainerEntity (or None if invalid or couldn't parse) + """ + title = or_none(row['title']) + issnl = or_none(row['ISSN-L']) + if title is None or issnl is None: + return None + extra = dict( + in_doaj=truthy(row['in_doaj']), + in_road=truthy(row['in_road']), + in_norwegian=truthy(row['in_norwegian']), + language=or_none(row['lang']), + url=or_none(row['url']), + ISSNp=or_none(row['ISSN-print']), + ISSNe=or_none(row['ISSN-electronic']), + is_oa=truthy(row['is_oa']), + is_kept=truthy(row['is_kept']), + ) + ce = fatcat_client.ContainerEntity( + issnl=issnl, + name=clean(title), + publisher=or_none(clean(row['publisher'])), + extra=extra) + return ce + + def try_update(self, ce): + + existing = None + try: + existing = self.api.lookup_container(issnl=ce.issnl) + except fatcat_client.rest.ApiException as err: + if err.status != 404: + raise err + # doesn't exist, need to update + return True + + # eventually we'll want to support "updates", but for now just skip if + # entity already exists + if existing: + self.counts['exists'] += 1 + return False + + return True + + def insert_batch(self, batch): + self.api.create_container_batch(batch, + autoaccept=True, + description=self.editgroup_description, + extra=json.dumps(self.editgroup_extra)) + diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py index 1e5c22f7..2ec6c95d 100644 --- a/python/fatcat_tools/importers/matched.py +++ b/python/fatcat_tools/importers/matched.py @@ -4,16 +4,10 @@ import json import sqlite3 import itertools import fatcat_client -from .common import FatcatImporter +from .common import EntityImporter, clean -#row = row.split('\t') -#assert len(row) == 2 -#sha1 = row[0].replace('sha1:') -#sha1 = base64.b16encode(base64.b32decode(sha1)).lower() -#print(sha1) -#dois = [d.lower() for d in json.loads(row[1])] -class MatchedImporter(FatcatImporter): +class MatchedImporter(EntityImporter): """ Importer for "file to crossref DOI" matches. 
@@ -48,7 +42,6 @@ class MatchedImporter(FatcatImporter): editgroup_extra=eg_extra) self.default_link_rel = kwargs.get("default_link_rel", "web") self.default_mime = kwargs.get("default_mime", None) - self.skip_file_updates = kwargs.get("skip_file_updates", False) def make_url(self, raw): rel = self.default_link_rel @@ -59,26 +52,13 @@ class MatchedImporter(FatcatImporter): rel = "repository" elif "//web.archive.org/" in raw or "//archive.is/" in raw: rel = "webarchive" - return fatcat_client.FileEntityUrls(url=raw, rel=rel) + return (rel, raw) - def parse_matched_dict(self, obj): - sha1 = obj['sha1'] - dois = [d.lower() for d in obj.get('dois', [])] + def want(self, raw_record): + return True - # lookup sha1, or create new entity - fe = None - if not self.skip_file_updates: - try: - fe = self.api.lookup_file(sha1=sha1) - except fatcat_client.rest.ApiException as err: - if err.status != 404: - raise err - if fe is None: - fe = fatcat_client.FileEntity( - sha1=sha1, - release_ids=[], - urls=[], - ) + def parse_record(self, obj): + dois = [d.lower() for d in obj.get('dois', [])] # lookup dois re_list = set() @@ -93,67 +73,77 @@ class MatchedImporter(FatcatImporter): print("DOI not found: {}".format(doi)) else: re_list.add(re.ident) - if len(re_list) == 0: + release_ids = list(re_list) + if len(release_ids) == 0: return None - if fe.release_ids == set(re_list): - return None - re_list.update(fe.release_ids) - fe.release_ids = list(re_list) # parse URLs and CDX - existing_urls = [feu.url for feu in fe.urls] + urls = set() for url in obj.get('url', []): - if url not in existing_urls: - url = self.make_url(url) - if url != None: - fe.urls.append(url) + url = self.make_url(url) + if url != None: + urls.add(url) for cdx in obj.get('cdx', []): original = cdx['url'] wayback = "https://web.archive.org/web/{}/{}".format( cdx['dt'], original) - if wayback not in existing_urls: - fe.urls.append( - fatcat_client.FileEntityUrls(url=wayback, rel="webarchive")) - if original not in existing_urls: - url = self.make_url(original) - if url != None: - fe.urls.append(url) - - if obj.get('size') != None: - fe.size = int(obj['size']) - fe.sha256 = obj.get('sha256', fe.sha256) - fe.md5 = obj.get('md5', fe.sha256) - if obj.get('mimetype') is None: - if fe.mimetype is None: - fe.mimetype = self.default_mime - else: - fe.mimetype = obj.get('mimetype') + urls.add(("webarchive", wayback)) + url = self.make_url(original) + if url != None: + urls.add(url) + urls = [fatcat_client.FileEntityUrls(rel, url) for (rel, url) in urls] + if len(urls) == 0: + return None + + size = obj.get('size') + if size: + size = int(size) + + fe = fatcat_client.FileEntity( + md5=obj.get('md5'), + sha1=obj['sha1'], + sha256=obj.get('sha256'), + size=size, + mimetype=obj.get('mimetype'), + release_ids=release_ids, + urls=urls, + ) return fe - def create_row(self, row, editgroup_id=None): - obj = json.loads(row) - fe = self.parse_matched_dict(obj) - if fe is not None: - if fe.ident is None: - self.api.create_file(fe, editgroup_id=editgroup_id) - self.counts['insert'] += 1 - else: - self.api.update_file(fe.ident, fe, editgroup_id=editgroup_id) - self.counts['update'] += 1 - - def create_batch(self, batch): - """Reads and processes in batches (not API-call-per-line)""" - objects = [self.parse_matched_dict(json.loads(l)) - for l in batch if l != None] - new_objects = [o for o in objects if o != None and o.ident == None] - update_objects = [o for o in objects if o != None and o.ident != None] - if len(update_objects): - update_eg = 
self._editgroup().editgroup_id - for obj in update_objects: - self.api.update_file(obj.ident, obj, editgroup_id=update_eg) - self.api.accept_editgroup(update_eg) - if len(new_objects) > 0: - self.api.create_file_batch(new_objects, autoaccept="true") - self.counts['update'] += len(update_objects) - self.counts['insert'] += len(new_objects) + def try_update(self, fe): + # lookup sha1, or create new entity + existing = None + try: + existing = self.api.lookup_file(sha1=fe.sha1) + except fatcat_client.rest.ApiException as err: + if err.status != 404: + raise err + + if not existing: + return True + + fe.release_ids = list(set(fe.release_ids + existing.release_ids)) + if set(fe.release_ids) == set(existing.release_ids) and len(existing.urls) > 0: + # no new release matches *and* there are already existing URLs + self.counts['exists'] += 1 + return False + + # merge the existing into this one and update + existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls])) + existing.urls = [fatcat_client.FileEntityUrls(rel=rel, url=url) for (rel, url) in existing.urls] + existing.release_ids = list(set(fe.release_ids + existing.release_ids)) + existing.mimetype = existing.mimetype or fe.mimetype + existing.size = existing.size or fe.size + existing.md5 = existing.md5 or fe.md5 + existing.sha256 = existing.sha256 or fe.sha256 + self.api.update_file(existing.ident, existing, editgroup_id=self._get_editgroup()) + self.counts['update'] += 1 + return False + + def insert_batch(self, batch): + self.api.create_file_batch(batch, + autoaccept=True, + description=self.editgroup_description, + extra=json.dumps(self.editgroup_extra)) + diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py index 0c8b1d62..02c9bf00 100644 --- a/python/fatcat_tools/importers/orcid.py +++ b/python/fatcat_tools/importers/orcid.py @@ -3,7 +3,7 @@ import sys import json import itertools import fatcat_client -from .common import FatcatImporter +from .common import EntityImporter, clean def value_or_none(e): if type(e) == dict: @@ -20,7 +20,7 @@ def value_or_none(e): return None return e -class OrcidImporter(FatcatImporter): +class OrcidImporter(EntityImporter): def __init__(self, api, **kwargs): @@ -32,14 +32,16 @@ class OrcidImporter(FatcatImporter): editgroup_description=eg_desc, editgroup_extra=eg_extra) - def parse_orcid_dict(self, obj): + def want(self, raw_record): + return True + + def parse_record(self, obj): """ obj is a python dict (parsed from json). 
returns a CreatorEntity """ name = obj['person']['name'] - if name is None: - return None + assert name extra = None given = value_or_none(name.get('given-names')) sur = value_or_none(name.get('family-name')) @@ -61,23 +63,30 @@ class OrcidImporter(FatcatImporter): return None ce = fatcat_client.CreatorEntity( orcid=orcid, - given_name=given, - surname=sur, - display_name=display, + given_name=clean(given), + surname=clean(sur), + display_name=clean(display), extra=extra) return ce - def create_row(self, row, editgroup_id=None): - obj = json.loads(row) - ce = self.parse_orcid_dict(obj) - if ce is not None: - self.api.create_creator(ce, editgroup_id=editgroup_id) - self.counts['insert'] += 1 + def try_update(self, raw_record): + existing = None + try: + existing = self.api.lookup_creator(orcid=raw_record.orcid) + except fatcat_client.rest.ApiException as err: + if err.status != 404: + raise err + + # eventually we'll want to support "updates", but for now just skip if + # entity already exists + if existing: + self.counts['exists'] += 1 + return False + + return True - def create_batch(self, batch): - """Reads and processes in batches (not API-call-per-line)""" - objects = [self.parse_orcid_dict(json.loads(l)) - for l in batch if l != None] - objects = [o for o in objects if o != None] - self.api.create_creator_batch(objects, autoaccept="true") - self.counts['insert'] += len(objects) + def insert_batch(self, batch): + self.api.create_creator_batch(batch, + autoaccept=True, + description=self.editgroup_description, + extra=json.dumps(self.editgroup_extra)) diff --git a/python/fatcat_tools/transforms.py b/python/fatcat_tools/transforms.py index 0f957f9a..2493b1ab 100644 --- a/python/fatcat_tools/transforms.py +++ b/python/fatcat_tools/transforms.py @@ -1,4 +1,5 @@ + import collections from fatcat_client import ReleaseEntity, ApiClient @@ -26,25 +27,43 @@ def release_to_elasticsearch(release): Raises exception on error (never returns None) """ - if release.state != 'active': - raise ValueError("Entity is not 'active'") + if release.state in ('redirect', 'deleted'): + return dict( + ident = release.ident, + state = release.state, + ) + elif release.state != 'active': + raise ValueError("Unhandled release state: {}".format(release.state)) # First, the easy ones (direct copy) t = dict( ident = release.ident, + state = release.state, revision = release.revision, title = release.title, + original_title = release.original_title, release_type = release.release_type, release_status = release.release_status, language = release.language, + license = release.license_slug, doi = release.doi, pmid = release.pmid, pmcid = release.pmcid, isbn13 = release.isbn13, + wikidata_qid = release.wikidata_qid, core_id = release.core_id, - wikidata_qid = release.wikidata_qid + arxiv_id = release.core_id, + jstor_id = release.jstor_id, ) + is_oa = None + is_longtail_oa = None + in_kbart = None + in_web = False + in_dweb = False + in_ia = False + in_shadow = False + if release.release_date: # .isoformat() results in, eg, '2010-10-22' (YYYY-MM-DD) t['release_date'] = release.release_date.isoformat() @@ -53,52 +72,99 @@ def release_to_elasticsearch(release): if release.release_year is not None: t['release_year'] = release.release_year + t['any_abstract'] = len(release.abstracts) > 0 + t['ref_count'] = len(release.refs or []) + t['contrib_count'] = len(release.contribs or []) + contrib_names = [] + for c in (release.contribs or []): + if c.raw_name: + contrib_names.append(c.raw_name) + t['contrib_names'] = contrib_names + 
container = release.container - container_is_kept = False if container: t['publisher'] = container.publisher t['container_name'] = container.name t['container_issnl'] = container.issnl - container_extra = container.extra - if container_extra: - t['container_is_oa'] = container_extra.get('is_oa') - container_is_kept = container_extra.get('is_kept', False) - t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa') + t['container_type'] = container.container_type + if container.extra: + if container.extra.get('is_oa') or container.extra.get('in_doaj'): + is_oa = True + if container.extra.get('in_kbart'): + # TODO: better KBART check goes here + in_kbart = True + if container.extra.get('ia'): + # TODO: container longtail check goes here + # TODO: sim/microfilm check goes here + pass + # TODO: SHERPA/Romeo goes here else: t['publisher'] = release.publisher files = release.files or [] t['file_count'] = len(files) - in_wa = False - in_ia = False - t['file_pdf_url'] = None + t['fileset_count'] = len(release.filesets or []) + t['webcapture_count'] = len(release.webcaptures or []) + any_pdf_url = None + good_pdf_url = None + best_pdf_url = None + ia_pdf_url = None for f in files: + if f.extra and f.extra.get('shadows'): + # TODO: shadow check goes here + in_shadows = True is_pdf = 'pdf' in (f.mimetype or '') for url in (f.urls or []): - if url.rel == 'webarchive': - in_wa = True - if '//web.archive.org/' in (url.url or '') or '//archive.org/' in (url.url or ''): + if url.url.lower().startswith('http'): + in_web = True + if url.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'): + # TODO: not sure what rel will be + in_dweb = True + if is_pdf: + any_pdf_url = url.url + if is_pdf and url.rel in ('webarchive', 'repository') and is_pdf: + is_preserved = True + good_pdf_url = url.url + if '//web.archive.org/' in url.url or '//archive.org/' in url.url: in_ia = True if is_pdf: - t['file_pdf_url'] = url.url - if not t['file_pdf_url'] and is_pdf: - t['file_pdf_url'] = url.url - t['file_in_webarchive'] = in_wa - t['file_in_ia'] = in_ia + best_pdf_url = url.url + ia_pdf_url = url.url + # here is where we bake-in priority; IA-specific + t['best_pdf_url'] = best_pdf_url or good_pdf_url or any_pdf_url + t['ia_pdf_url'] = ia_pdf_url + + if release.license_slug: + # TODO: more/better checks here, particularly strict *not* OA licenses + if release.license_slug.startswith("CC-"): + is_oa = True extra = release.extra or dict() if extra: - t['in_shadow'] = extra.get('in_shadow') - if extra.get('grobid') and extra['grobid'].get('is_longtail_oa'): - t['container_is_longtail_oa'] = True - t['any_abstract'] = bool(release.abstracts) - t['is_kept'] = container_is_kept or extra.get('is_kept', False) + # TODO: longtail OA check from GROBID here + if extra.get('in_kbart'): + # NOTE: not actually setting this anywhere + in_kbart = True + if extra.get('is_oa'): + # NOTE: not actually setting this anywhere + is_oa = True + if extra.get('grobid'): + if not t.get('container_name'): + t['container_name'] = extra['grobid'].get('container_name') + if extra['grobid'].get('longtail_oa'): + is_longtail_oa = True + if extra.get('crossref'): + if extra['crossref'].get('archive'): + # all crossref archives are KBART, I believe + in_kbart = True - t['ref_count'] = len(release.refs or []) - t['contrib_count'] = len(release.contribs or []) - contrib_names = [] - for c in (release.contribs or []): - if c.raw_name: - contrib_names.append(c.raw_name) - t['contrib_names'] = contrib_names + if is_longtail_oa: + is_oa = True + t['is_oa'] 
= is_oa + t['is_longtail_oa'] = is_longtail_oa + t['in_kbart'] = in_kbart + t['in_web'] = in_web + t['in_dweb'] = in_dweb + t['in_ia'] = in_ia + t['is_preserved'] = in_ia or in_kbart return t diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py index 8690a791..636ed304 100644 --- a/python/fatcat_tools/workers/changelog.py +++ b/python/fatcat_tools/workers/changelog.py @@ -93,7 +93,7 @@ class EntityUpdatesWorker(FatcatWorker): release_edits = cle['editgroup']['edits']['releases'] for re in release_edits: ident = re['ident'] - release = self.api.get_release(ident, expand="files,container") + release = self.api.get_release(ident, expand="files,filesets,webcaptures,container") release_dict = self.api.api_client.sanitize_for_serialization(release) producer.produce( message=json.dumps(release_dict).encode('utf-8'),
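For context on the one-line changelog worker change at the end: releases are now fetched with filesets and webcaptures expanded, so the Elasticsearch transform above can fill in fileset_count and webcapture_count alongside the new is_oa/in_kbart/in_ia flags. A simplified sketch of that flow follows; the wrapper function is illustrative, and only the expand string and release_to_elasticsearch() come from this diff.

# Simplified sketch of the changelog -> search-index path touched by this
# commit (Kafka plumbing and error handling omitted).
from fatcat_tools.transforms import release_to_elasticsearch

def release_edit_to_es_doc(api, ident):
    # expand related entities so the transform can count files/filesets/webcaptures
    release = api.get_release(ident, expand="files,filesets,webcaptures,container")
    # returns a flat dict with flags like is_oa, in_kbart, in_ia, is_preserved
    return release_to_elasticsearch(release)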