Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r--  python/fatcat_tools/importers/__init__.py          |  19
-rw-r--r--  python/fatcat_tools/importers/common.py            | 390
-rw-r--r--  python/fatcat_tools/importers/crossref.py          | 263
-rw-r--r--  python/fatcat_tools/importers/grobid_metadata.py   | 123
-rw-r--r--  python/fatcat_tools/importers/issn.py              |  89
-rw-r--r--  python/fatcat_tools/importers/journal_metadata.py  | 183
-rw-r--r--  python/fatcat_tools/importers/matched.py           | 150
-rw-r--r--  python/fatcat_tools/importers/orcid.py              |  51
8 files changed, 834 insertions, 434 deletions
| diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py index e6f081e5..70f38f5b 100644 --- a/python/fatcat_tools/importers/__init__.py +++ b/python/fatcat_tools/importers/__init__.py @@ -1,7 +1,22 @@ -from .common import FatcatImporter, make_kafka_consumer +""" +To run an import you combine two classes; one each of: + +- RecordSource: somehow iterates over a source of raw records (eg, from a +  database, Kafka, files on disk, stdin) and pushes into an entity importer. +- EntityImporter: class that a record iterator pushes raw (unparsed) records +  into. The entity importer parses and decides what to do (ignore, update, +  insert, etc). There is usually a primary entity type, though related entities +  can be created along the way. Maintains API connection and editgroup/batch +  state. + +""" + +from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, KafkaJsonPusher, make_kafka_consumer, clean  from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP  from .grobid_metadata import GrobidMetadataImporter -from .issn import IssnImporter +from .journal_metadata import JournalMetadataImporter  from .matched import MatchedImporter  from .orcid import OrcidImporter +#from .kafka_source import KafkaSource +#from .file_source import FileSource diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 06897bee..89203a4f 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -3,6 +3,7 @@ import re  import sys  import csv  import json +import ftfy  import itertools  import subprocess  from collections import Counter @@ -12,30 +13,66 @@ import fatcat_client  from fatcat_client.rest import ApiException -# from: https://docs.python.org/3/library/itertools.html -def grouper(iterable, n, fillvalue=None): -    "Collect data into fixed-length chunks or blocks" -    args = [iter(iterable)] * n -    return itertools.zip_longest(*args, fillvalue=fillvalue) +def clean(thing, force_xml=False): +    """ +    This function is appropriate to be called on any random, non-markup string, +    such as author names, titles, etc. -def make_kafka_consumer(hosts, env, topic_suffix, group): -    topic_name = "fatcat-{}.{}".format(env, topic_suffix).encode('utf-8') -    client = pykafka.KafkaClient(hosts=hosts, broker_version="1.0.0") -    consume_topic = client.topics[topic_name] -    print("Consuming from kafka topic {}, group {}".format(topic_name, group)) +    It will try to clean up commong unicode mangles, HTML characters, etc. -    consumer = consume_topic.get_balanced_consumer( -        consumer_group=group.encode('utf-8'), -        managed=True, -        auto_commit_enable=True, -        auto_commit_interval_ms=30000, # 30 seconds -        compacted_topic=True, -    ) -    return consumer +    This will detect XML/HTML and "do the right thing" (aka, not remove +    entities like '&' if there are tags in the string), unless you pass the +    'force_xml' parameter, which might be appropriate for, eg, names and +    titles, which generally should be projected down to plain text. + +    Also strips extra whitespace. 
+    """ +    if not thing: +        return thing +    fix_entities = 'auto' +    if force_xml: +        fix_entities = True +    fixed = ftfy.fix_text(thing, fix_entities=fix_entities).strip() +    if not fixed: +        # wasn't zero-length before, but is now; return None +        return None +    return fixed + +def test_clean(): -class FatcatImporter: +    assert clean(None) == None +    assert clean('') == '' +    assert clean('123') == '123' +    assert clean('a&b') == 'a&b' +    assert clean('<b>a&b</b>') == '<b>a&b</b>' +    assert clean('<b>a&b</b>', force_xml=True) == '<b>a&b</b>' + +class EntityImporter:      """ -    Base class for fatcat importers +    Base class for fatcat entity importers. +     +    The API exposed to record iterator is: + +        push_record(raw_record) +        finish() + +    The API that implementations are expected to fill in are: + +        want(raw_record) -> boolean +        parse(raw_record) -> entity +        try_update(entity) -> boolean +        insert_batch([entity]) -> None + +    This class exposes helpers for implementations: + +        self.api +        self.create_<entity>(entity) -> EntityEdit +            for related entity types +        self.push_entity(entity) +        self.counts['exists'] += 1 +            if didn't update or insert because of existing) +        self.counts['update'] += 1 +            if updated an entity      """      def __init__(self, api, **kwargs): @@ -43,87 +80,135 @@ class FatcatImporter:          eg_extra = kwargs.get('editgroup_extra', dict())          eg_extra['git_rev'] = eg_extra.get('git_rev',              subprocess.check_output(["git", "describe", "--always"]).strip()).decode('utf-8') -        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.FatcatImporter') +        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.EntityImporter')          self.api = api -        self._editgroup_description = kwargs.get('editgroup_description') -        self._editgroup_extra = kwargs.get('editgroup_extra') -        issn_map_file = kwargs.get('issn_map_file') +        self.bezerk_mode = kwargs.get('bezerk_mode', False) +        self.edit_batch_size = kwargs.get('edit_batch_size', 100) +        self.editgroup_description = kwargs.get('editgroup_description') +        self.editgroup_extra = kwargs.get('editgroup_extra') +        self.reset()          self._issnl_id_map = dict()          self._orcid_id_map = dict() -        self._doi_id_map = dict() -        if issn_map_file: -            self.read_issn_map_file(issn_map_file)          self._orcid_regex = re.compile("^\\d{4}-\\d{4}-\\d{4}-\\d{3}[\\dX]$") -        self.counts = Counter({'insert': 0, 'update': 0, 'processed_lines': 0}) +        self._doi_id_map = dict() -    def _editgroup(self): -        eg = fatcat_client.Editgroup( -            description=self._editgroup_description, -            extra=self._editgroup_extra, -        ) -        return self.api.create_editgroup(eg) +    def reset(self): +        self.counts = Counter({'skip': 0, 'insert': 0, 'update': 0, 'exists': 0}) +        self._edit_count = 0 +        self._editgroup_id = None +        self._entity_queue = [] -    def describe_run(self): -        print("Processed {} lines, inserted {}, updated {}.".format( -            self.counts['processed_lines'], self.counts['insert'], self.counts['update'])) +    def push_record(self, raw_record): +        """ +        Returns nothing. 
+        """ +        if (not raw_record) or (not self.want(raw_record)): +            self.counts['skip'] += 1 +            return +        entity = self.parse_record(raw_record) +        if not entity: +            self.counts['skip'] += 1 +            return +        if self.bezerk_mode: +            self.push_entity(entity) +            return +        if self.try_update(entity): +            self.push_entity(entity) +        return -    def create_row(self, row, editgroup_id=None): -        # sub-classes expected to implement this -        raise NotImplementedError +    def finish(self): +        if self._edit_count > 0: +            self.api.accept_editgroup(self._editgroup_id) +            self._editgroup_id = None +            self._edit_count = 0 + +        if self._entity_queue: +            self.insert_batch(self._entity_queue) +            self.counts['insert'] += len(self._entity_queue) +            self._entity_queue =  [] + +        self.counts['total'] = 0 +        for key in ('skip', 'insert', 'update', 'exists'): +            self.counts['total'] += self.counts[key] +        return self.counts + +    def _get_editgroup(self, edits=1): +        if self._edit_count >= self.edit_batch_size: +            self.api.accept_editgroup(self._editgroup_id) +            self._editgroup_id = None +            self._edit_count = 0 -    def create_batch(self, rows, editgroup_id=None): -        # sub-classes expected to implement this +        if not self._editgroup_id: +            eg = self.api.create_editgroup( +                fatcat_client.Editgroup( +                    description=self.editgroup_description, +                    extra=self.editgroup_extra)) +            self._editgroup_id = eg.editgroup_id + +        self._edit_count += edits +        return self._editgroup_id + +    def create_container(self, entity): +        eg_id = self._get_editgroup() +        self.counts['inserted.container'] += 1 +        return self.api.create_container(entity, editgroup_id=eg_id) + +    def create_release(self, entity): +        eg_id = self._get_editgroup() +        self.counts['inserted.release'] += 1 +        return self.api.create_release(entity, editgroup_id=eg_id) + +    def create_file(self, entity): +        eg_id = self._get_editgroup() +        self.counts['inserted.file'] += 1 +        return self.api.create_file(entity, editgroup_id=eg_id) + +    def updated(self): +        """ +        Implementations should call this from try_update() if the update was successful +        """ +        self.counts['update'] += 1 + +    def push_entity(self, entity): +        self._entity_queue.append(entity) +        if len(self._entity_queue) >= self.edit_batch_size: +            self.insert_batch(self._entity_queue) +            self.counts['insert'] += len(_entity_queue) +            self._entity_queue = 0 + +    def want(self, raw_record): +        """ +        Implementations can override for optional fast-path to drop a record. +        Must have no side-effects; returns bool. +        """ +        return True + +    def parse(self, raw_record): +        """ +        Returns an entity class type, or None if we should skip this one. + +        May have side-effects (eg, create related entities), but shouldn't +        update/mutate the actual entity. 
+        """          raise NotImplementedError -    def process_source(self, source, group_size=100): -        """Creates and auto-accepts editgroup every group_size rows""" -        eg = self._editgroup() -        i = 0 -        for i, row in enumerate(source): -            self.create_row(row, editgroup_id=eg.editgroup_id) -            if i > 0 and (i % group_size) == 0: -                self.api.accept_editgroup(eg.editgroup_id) -                eg = self._editgroup() -            self.counts['processed_lines'] += 1 -        if i == 0 or (i % group_size) != 0: -            self.api.accept_editgroup(eg.editgroup_id) - -    def process_batch(self, source, size=50, decode_kafka=False): -        """Reads and processes in batches (not API-call-per-)""" -        for rows in grouper(source, size): -            if decode_kafka: -                rows = [msg.value.decode('utf-8') for msg in rows] -            self.counts['processed_lines'] += len(rows) -            #eg = self._editgroup() -            #self.create_batch(rows, editgroup_id=eg.editgroup_id) -            self.create_batch(rows) - -    def process_csv_source(self, source, group_size=100, delimiter=','): -        reader = csv.DictReader(source, delimiter=delimiter) -        self.process_source(reader, group_size) - -    def process_csv_batch(self, source, size=50, delimiter=','): -        reader = csv.DictReader(source, delimiter=delimiter) -        self.process_batch(reader, size) +    def try_update(self, raw_record): +        """ +        Passed the output of parse(). Should try to find an existing entity and +        update it (PUT), decide we should do nothing (based on the existing +        record), or create a new one. -    def is_issnl(self, issnl): -        return len(issnl) == 9 and issnl[4] == '-' +        Implementations must update the exists/updated/skip counts +        appropriately in this method. 
-    def lookup_issnl(self, issnl): -        """Caches calls to the ISSN-L lookup API endpoint in a local dict""" -        if issnl in self._issnl_id_map: -            return self._issnl_id_map[issnl] -        container_id = None -        try: -            rv = self.api.lookup_container(issnl=issnl) -            container_id = rv.ident -        except ApiException as ae: -            # If anything other than a 404 (not found), something is wrong -            assert ae.status == 404 -        self._issnl_id_map[issnl] = container_id # might be None -        return container_id +        Returns boolean: True if the entity should still be inserted, False otherwise +        """ +        raise NotImplementedError + +    def insert_batch(self, raw_record): +        raise NotImplementedError      def is_orcid(self, orcid):          return self._orcid_regex.match(orcid) is not None @@ -163,6 +248,23 @@ class FatcatImporter:          self._doi_id_map[doi] = release_id # might be None          return release_id +    def is_issnl(self, issnl): +        return len(issnl) == 9 and issnl[4] == '-' + +    def lookup_issnl(self, issnl): +        """Caches calls to the ISSN-L lookup API endpoint in a local dict""" +        if issnl in self._issnl_id_map: +            return self._issnl_id_map[issnl] +        container_id = None +        try: +            rv = self.api.lookup_container(issnl=issnl) +            container_id = rv.ident +        except ApiException as ae: +            # If anything other than a 404 (not found), something is wrong +            assert ae.status == 404 +        self._issnl_id_map[issnl] = container_id # might be None +        return container_id +      def read_issn_map_file(self, issn_map_file):          print("Loading ISSN map file...")          self._issn_issnl_map = dict() @@ -179,3 +281,117 @@ class FatcatImporter:          if issn is None:              return None          return self._issn_issnl_map.get(issn) + + +class RecordPusher: +    """ +    Base class for different importer sources. Pretty trivial interface, just +    wraps an importer and pushes records in to it. 
+    """ + +    def __init__(self, importer, **kwargs): +        self.importer = importer + +    def run(self): +        """ +        This will look something like: + +            for line in sys.stdin: +                record = json.loads(line) +                self.importer.push_record(record) +            print(self.importer.finish()) +        """ +        raise NotImplementedError + + +class JsonLinePusher(RecordPusher): + +    def __init__(self, importer, json_file, **kwargs): +        self.importer = importer +        self.json_file = json_file + +    def run(self): +        for line in self.json_file: +            if not line: +                continue +            record = json.loads(line) +            self.importer.push_record(record) +        counts = self.importer.finish() +        print(counts) +        return counts + + +class CsvPusher(RecordPusher): + +    def __init__(self, importer, csv_file, **kwargs): +        self.importer = importer +        self.reader = csv.DictReader(csv_file, delimiter=kwargs.get('delimiter', ',')) + +    def run(self): +        for line in self.reader: +            if not line: +                continue +            self.importer.push_record(line) +        counts = self.importer.finish() +        print(counts) +        return counts + + +class LinePusher(RecordPusher): + +    def __init__(self, importer, text_file, **kwargs): +        self.importer = importer +        self.text_file = text_file + +    def run(self): +        for line in self.text_file: +            if not line: +                continue +            self.importer.push_record(line) +        counts = self.importer.finish() +        print(counts) +        return counts + + +class KafkaJsonPusher(RecordPusher): + +    def __init__(self, importer, kafka_hosts, kafka_env, topic_suffix, group, **kwargs): +        self.importer = importer +        self.consumer = make_kafka_consumer( +            kafka_hosts, +            kafka_env, +            topic_suffix, +            group, +        ) + +    def run(self): +        count = 0 +        for msg in self.consumer: +            if not msg: +                continue +            record = json.loads(msg.value.decode('utf-8')) +            self.importer.push_record(record) +            count += 1 +            if count % 500 == 0: +                print("Import counts: {}".format(self.importer.counts)) +        # TODO: should catch UNIX signals (HUP?) 
to shutdown cleanly, and/or +        # commit the current batch if it has been lingering +        counts = self.importer.finish() +        print(counts) +        return counts + + +def make_kafka_consumer(hosts, env, topic_suffix, group): +    topic_name = "fatcat-{}.{}".format(env, topic_suffix).encode('utf-8') +    client = pykafka.KafkaClient(hosts=hosts, broker_version="1.0.0") +    consume_topic = client.topics[topic_name] +    print("Consuming from kafka topic {}, group {}".format(topic_name, group)) + +    consumer = consume_topic.get_balanced_consumer( +        consumer_group=group.encode('utf-8'), +        managed=True, +        auto_commit_enable=True, +        auto_commit_interval_ms=30000, # 30 seconds +        compacted_topic=True, +    ) +    return consumer diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index 6365e491..00c719f1 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -6,7 +6,7 @@ import datetime  import itertools  import subprocess  import fatcat_client -from .common import FatcatImporter +from .common import EntityImporter, clean  # The docs/guide should be the cannonical home for these mappings; update there @@ -32,7 +32,32 @@ CROSSREF_TYPE_MAP = {      'standard': 'standard',  } -class CrossrefImporter(FatcatImporter): +CONTAINER_TYPE_MAP = { +    'article-journal': 'journal', +    'paper-conference': 'conference', +    'book': 'book-series', +} + +# TODO: +LICENSE_SLUG_MAP = { +    "http://creativecommons.org/licenses/by/3.0/": "CC-BY", +    "http://creativecommons.org/licenses/by/4.0/": "CC-BY", +    "http://creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA", +    "http://creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA", +    "http://creativecommons.org/licenses/by-nd/3.0/": "CC-BY-ND", +    "http://creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND", +    "http://creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC", +    "http://creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC", +    "http://creativecommons.org/licenses/by-nc-sa/3.0/": "CC-BY-NC-SA", +    "http://creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA", +    "http://creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND", +    "http://creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-NC-ND", +    "http://www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0", +    # http://onlinelibrary.wiley.com/termsAndConditions doesn't seem like a license +    # http://www.springer.com/tdm doesn't seem like a license +} + +class CrossrefImporter(EntityImporter):      """      Importer for Crossref metadata. 
@@ -51,9 +76,9 @@ class CrossrefImporter(FatcatImporter):              issn_map_file=issn_map_file,              editgroup_description=eg_desc,              editgroup_extra=eg_extra) + +        self.create_containers = kwargs.get('create_containers')          extid_map_file = kwargs.get('extid_map_file') -        create_containers = kwargs.get('create_containers') -        check_existing = kwargs.get('check_existing')          self.extid_map_db = None          if extid_map_file:              db_uri = "file:{}?mode=ro".format(extid_map_file) @@ -61,36 +86,46 @@ class CrossrefImporter(FatcatImporter):              self.extid_map_db = sqlite3.connect(db_uri, uri=True)          else:              print("Not using external ID map") -        self.create_containers = create_containers -        self.check_existing = check_existing + +        self.read_issn_map_file(issn_map_file)      def lookup_ext_ids(self, doi):          if self.extid_map_db is None: -            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None) +            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)          row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1",              [doi.lower()]).fetchone()          if row is None: -            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None) +            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)          row = [str(cell or '') or None for cell in row]          return dict(              core_id=row[0],              pmid=row[1],              pmcid=row[2], -            wikidata_qid=row[3]) +            wikidata_qid=row[3], +            # TODO: +            arxiv_id=None, +            jstor_id=None, +        )      def map_release_type(self, crossref_type):          return CROSSREF_TYPE_MAP.get(crossref_type) -    def parse_crossref_dict(self, obj): +    def map_container_type(self, crossref_type): +        return CONTAINER_TYPE_MAP.get(crossref_type) + +    def want(self, obj): +        if not obj.get('title'): +            return False + +        # do most of these checks in-line below +        return True + +    def parse_record(self, obj):          """          obj is a python dict (parsed from json).          
returns a ReleaseEntity          """ -        # Do require the 'title' keys to exsit, as release entities do -        if (not 'title' in obj) or (not obj['title']): -            return None -          # Ways to be out of scope (provisionally)          # journal-issue and journal-volume map to None, but allowed for now          if obj.get('type') in (None, 'journal', 'proceedings', @@ -98,20 +133,12 @@ class CrossrefImporter(FatcatImporter):                  'book-track', 'proceedings-series'):              return None -        # lookup existing DOI -        existing_release = None -        if self.check_existing: -            try: -                existing_release = self.api.lookup_release(doi=obj['DOI'].lower()) -            except fatcat_client.rest.ApiException as err: -                if err.status != 404: -                    raise err - -        # eventually we'll want to support "updates", but for now just skip if -        # entity already exists -        if existing_release: +        # Do require the 'title' keys to exsit, as release entities do +        if (not 'title' in obj) or (not obj['title']):              return None +        release_type = self.map_release_type(obj['type']) +          # contribs          def do_contribs(obj_list, ctype):              contribs = [] @@ -132,18 +159,23 @@ class CrossrefImporter(FatcatImporter):                      index = i                  else:                      index = None +                raw_affiliation = None                  if am.get('affiliation'): -                    # note: affiliation => affiliations -                    extra['affiliations'] = am.get('affiliation') +                    if len(am.get('affiliation')) > 0: +                        raw_affiliation = am.get('affiliation')[0]['name'] +                    if len(am.get('affiliation')) > 1: +                        # note: affiliation => more_affiliations +                        extra['more_affiliations'] = [clean(a['name']) for a in am.get('affiliation')[1:]]                  if am.get('sequence') and am.get('sequence') != "additional": -                    extra['sequence'] = am.get('sequence') +                    extra['seq'] = clean(am.get('sequence'))                  if not extra:                      extra = None                  assert ctype in ("author", "editor", "translator")                  contribs.append(fatcat_client.ReleaseContrib(                      creator_id=creator_id,                      index=index, -                    raw_name=raw_name, +                    raw_name=clean(raw_name), +                    raw_affiliation=clean(raw_affiliation),                      role=ctype,                      extra=extra))              return contribs @@ -159,28 +191,40 @@ class CrossrefImporter(FatcatImporter):              container_id = self.lookup_issnl(issnl)          publisher = obj.get('publisher') -        ce = None          if (container_id is None and self.create_containers and (issnl is not None)              and obj.get('container-title') and len(obj['container-title']) > 0):              ce = fatcat_client.ContainerEntity(                  issnl=issnl, -                publisher=publisher, -                name=obj['container-title'][0]) +                publisher=clean(publisher), +                container_type=self.map_container_type(release_type), +                name=clean(obj['container-title'][0], force_xml=True)) +            ce_edit = self.create_container(ce) +            container_id = ce_edit.ident + +        # license slug 
+        license_slug = None +        license_extra = [] +        for l in obj.get('license', []): +            if l['content-version'] not in ('vor', 'unspecified'): +                continue +            slug = LICENSE_SLUG_MAP.get(l['URL']) +            if slug: +                license_slug = slug +            if 'start' in l: +                l['start'] = l['start']['date-time'] +            license_extra.append(l)          # references          refs = []          for i, rm in enumerate(obj.get('reference', [])):              try:                  year = int(rm.get('year')) -                # NOTE: will need to update/config in the future! +                # TODO: will need to update/config in the future!                  # NOTE: are there crossref works with year < 100?                  if year > 2025 or year < 100:                      year = None              except:                  year = None -            extra = rm.copy() -            if rm.get('DOI'): -                extra['doi'] = rm.get('DOI').lower()              key = rm.get('key')              if key and key.startswith(obj['DOI'].upper()):                  key = key.replace(obj['DOI'].upper() + "-", '') @@ -188,14 +232,18 @@ class CrossrefImporter(FatcatImporter):              container_name = rm.get('volume-title')              if not container_name:                  container_name = rm.get('journal-title') -            extra.pop('DOI', None) -            extra.pop('key', None) -            extra.pop('year', None) -            extra.pop('volume-name', None) -            extra.pop('journal-title', None) -            extra.pop('title', None) -            extra.pop('first-page', None) -            extra.pop('doi-asserted-by', None) +            elif rm.get('journal-title'): +                extra['journal-title'] = rm['journal-title'] +            extra = dict() +            if rm.get('DOI'): +                extra['doi'] = rm.get('DOI').lower() +            # TODO: what fields here? CSL citation stuff +            for k in ('author', 'editor', 'edition', 'authority', 'version', +                    'genre', 'url', 'event', 'issue', 'volume', 'date', +                    'accessed_date', 'issued', 'page', 'medium', +                    'collection_title', 'chapter_number'): +                if clean(rm.get(k)): +                    extra[k] = clean(rm[k])              if extra:                  extra = dict(crossref=extra)              else: @@ -206,9 +254,9 @@ class CrossrefImporter(FatcatImporter):                  target_release_id=None,                  key=key,                  year=year, -                container_name=container_name, -                title=rm.get('title'), -                locator=rm.get('first-page'), +                container_name=clean(container_name), +                title=clean(rm.get('title')), +                locator=clean(rm.get('first-page')),                  # TODO: just dump JSON somewhere here?                  
extra=extra)) @@ -217,25 +265,24 @@ class CrossrefImporter(FatcatImporter):          if obj.get('abstract') != None:              abstracts.append(fatcat_client.ReleaseEntityAbstracts(                  mimetype="application/xml+jats", -                content=obj.get('abstract'))) +                content=clean(obj.get('abstract'))))          # extra fields          extra = dict() -        for key in ('subject', 'type', 'license', 'alternative-id', -                'container-title', 'original-title', 'subtitle', 'archive', -                'funder', 'group-title'): -            # TODO: unpack "container-title" array +        for key in ('subject', 'type', 'alternative-id', 'container-title', +                'subtitle', 'archive', 'funder', 'group-title'): +            # TODO: unpack "container-title" array?              val = obj.get(key)              if val: -                extra[key] = val -        if 'license' in extra and extra['license']: -            for i in range(len(extra['license'])): -                if 'start' in extra['license'][i]: -                    extra['license'][i]['start'] = extra['license'][i]['start']['date-time'] +                if type(val) == str: +                    extra[key] = clean(val) +                else: +                    extra[key] = val +        if license_extra: +            extra['license'] = license_extra +          if len(obj['title']) > 1: -            extra['other-titles'] = obj['title'][1:] -        # TODO: this should be top-level -        extra['is_kept'] = len(obj.get('archive', [])) > 0 +            extra['other-titles'] = [clean(t) for t in obj['title'][1:]]          # ISBN          isbn13 = None @@ -277,59 +324,57 @@ class CrossrefImporter(FatcatImporter):          re = fatcat_client.ReleaseEntity(              work_id=None, -            title=obj.get('title', [None])[0], -            contribs=contribs, -            refs=refs,              container_id=container_id, -            publisher=publisher, -            release_type=self.map_release_type(obj['type']), +            title=clean(obj.get('title', [None])[0], force_xml=True), +            original_title=clean(obj.get('original-title', [None])[0]), +            release_type=release_type,              release_status=release_status, +            release_date=release_date, +            release_year=release_year, +            publisher=clean(publisher),              doi=obj['DOI'].lower(), -            isbn13=isbn13, -            core_id=extids['core_id'],              pmid=extids['pmid'],              pmcid=extids['pmcid'],              wikidata_qid=extids['wikidata_qid'], -            release_date=release_date, -            release_year=release_year, -            issue=obj.get('issue'), -            volume=obj.get('volume'), -            pages=obj.get('page'), +            isbn13=isbn13, +            core_id=extids['core_id'], +            arxiv_id=extids['arxiv_id'], +            jstor_id=extids['jstor_id'], +            volume=clean(obj.get('volume')), +            issue=clean(obj.get('issue')), +            pages=clean(obj.get('page')), +            language=None,  # crossref doesn't supply language info +            license_slug=license_slug, +            extra=dict(crossref=extra),              abstracts=abstracts, -            extra=dict(crossref=extra)) -        return (re, ce) +            contribs=contribs, +            refs=refs, +        ) +        return re + +    def try_update(self, re): + +        # lookup existing DOI (don't need to try other ext idents for 
crossref) +        existing = None +        try: +            existing = self.api.lookup_release(doi=re.doi) +        except fatcat_client.rest.ApiException as err: +            if err.status != 404: +                raise err +            # doesn't exist, need to update +            return True + +        # eventually we'll want to support "updates", but for now just skip if +        # entity already exists +        if existing: +            self.counts['exists'] += 1 +            return False +         +        return True + +    def insert_batch(self, batch): +        self.api.create_release_batch(batch, +            autoaccept=True, +            description=self.editgroup_description, +            extra=json.dumps(self.editgroup_extra)) -    def create_row(self, row, editgroup_id=None): -        if row is None: -            return -        obj = json.loads(row) -        entities = self.parse_crossref_dict(obj) -        if entities is not None: -            (re, ce) = entities -            if ce is not None: -                container = self.api.create_container(ce, editgroup_id=editgroup_id) -                re.container_id = container.ident -                self._issnl_id_map[ce.issnl] = container.ident -            self.api.create_release(re, editgroup_id=editgroup_id) -            self.counts['insert'] += 1 - -    def create_batch(self, batch): -        """Current work/release pairing disallows batch creation of releases. -        Could do batch work creation and then match against releases, but meh.""" -        release_batch = [] -        for row in batch: -            if row is None: -                continue -            obj = json.loads(row) -            entities = self.parse_crossref_dict(obj) -            if entities is not None: -                (re, ce) = entities -                if ce is not None: -                    ce_eg = self.api.create_editgroup(fatcat_client.Editgroup()) -                    container = self.api.create_container(ce, editgroup_id=ce_eg.editgroup_id) -                    self.api.accept_editgroup(ce_eg.editgroup_id) -                    re.container_id = container.ident -                    self._issnl_id_map[ce.issnl] = container.ident -                release_batch.append(re) -        self.api.create_release_batch(release_batch, autoaccept="true") -        self.counts['insert'] += len(release_batch) diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py index 5e61a154..9d95fe0b 100644 --- a/python/fatcat_tools/importers/grobid_metadata.py +++ b/python/fatcat_tools/importers/grobid_metadata.py @@ -5,12 +5,22 @@ import json  import base64  import datetime  import fatcat_client -from .common import FatcatImporter +from .common import EntityImporter, clean  MAX_ABSTRACT_BYTES=4096 -class GrobidMetadataImporter(FatcatImporter): +class GrobidMetadataImporter(EntityImporter): +    """ +    This is a complex case: we need to parse and create both file and release entities. + +    The "primary" entity here is really File, not Release. If a matching File +    exists, we bail in want(); if not we insert the Release during parsing, and +    insert both. + +    TODO: should instead check if the File has any releases; if not, insert and update. +    TODO: relaxing 'None' constraint on parse_record() might make this refactor-able. 
+    """      def __init__(self, api, **kwargs): @@ -22,6 +32,45 @@ class GrobidMetadataImporter(FatcatImporter):              editgroup_description=eg_desc,              editgroup_extra=eg_extra)          self.default_link_rel = kwargs.get("default_link_rel", "web") +        self.longtail_oa = kwargs.get("longtail_oa", False) + +    def want(self, raw_record): +        return True + +    def parse_record(self, row): + +        fields = row.split('\t') +        sha1_key = fields[0] +        cdx = json.loads(fields[1]) +        mimetype = fields[2] +        file_size = int(fields[3]) +        grobid_meta = json.loads(fields[4]) +        fe = self.parse_file_metadata(sha1_key, cdx, mimetype, file_size) +        re = self.parse_grobid_json(grobid_meta) + +        if not (fe and re): +            return None + +        # lookup existing file SHA1 +        existing = None +        try: +            existing = self.api.lookup_file(sha1=fe.sha1) +        except fatcat_client.rest.ApiException as err: +            if err.status != 404: +                raise err + +        # if file is already in here, presumably not actually long-tail +        # HACK: this is doing an exists check in parse_record(), which is weird +        # TODO: this is where we should check if the file actually has +        # release_ids and/or URLs associated with it +        if existing and not self.bezerk_mode: +            self.counts['exists'] += 1 +            self.counts['skip'] -= 1 +            return None + +        release_edit = self.create_release(re) +        fe.release_ids.append(release_edit.ident) +        return fe      def parse_grobid_json(self, obj): @@ -34,7 +83,7 @@ class GrobidMetadataImporter(FatcatImporter):              abobj = dict(                  mimetype="text/plain",                  language=None, -                content=obj.get('abstract').strip()) +                content=clean(obj.get('abstract')))              abstracts = [abobj]          else:              abstracts = None @@ -43,17 +92,18 @@ class GrobidMetadataImporter(FatcatImporter):          for i, a in enumerate(obj.get('authors', [])):              contribs.append(fatcat_client.ReleaseContrib(                  index=i, -                raw_name=a['name'], +                raw_name=clean(a['name']),                  role="author",                  extra=None)) +        # XXX: why is this a dict()? not covered by tests?          
refs = []          for raw in obj.get('citations', []):              cite_extra = dict()              ref = dict() -            ref['key'] = raw.get('id') +            ref['key'] = clean(raw.get('id'))              if raw.get('title'): -                ref['title'] = raw['title'].strip() +                ref['title'] = clean(raw['title'])              if raw.get('date'):                  try:                      year = int(raw['date'].strip()[:4]) @@ -62,9 +112,9 @@ class GrobidMetadataImporter(FatcatImporter):                      pass              for key in ('volume', 'url', 'issue', 'publisher'):                  if raw.get(key): -                    cite_extra[key] = raw[key].strip() +                    cite_extra[key] = clean(raw[key])              if raw.get('authors'): -                cite_extra['authors'] = [a['name'] for a in raw['authors']] +                cite_extra['authors'] = [clean(a['name']) for a in raw['authors']]              if cite_extra:                  cite_extra = dict(grobid=cite_extra)              else: @@ -81,27 +131,28 @@ class GrobidMetadataImporter(FatcatImporter):          if obj.get('doi'):              extra['doi'] = obj['doi']          if obj['journal'] and obj['journal'].get('name'): -            extra['container_name'] = obj['journal']['name'] - -        extra['is_longtail_oa'] = True +            extra['container_name'] = clean(obj['journal']['name'])          # TODO: ISSN/eISSN handling? or just journal name lookup? +        if self.longtail_oa: +            extra['longtail_oa'] = True +          if extra:              extra = dict(grobid=extra)          else:              extra = None          re = fatcat_client.ReleaseEntity( -            title=obj['title'].strip(), +            title=clean(obj['title'], force_xml=True),              release_type="article-journal",              release_date=release_date,              release_year=release_year,              contribs=contribs,              refs=refs, -            publisher=obj['journal'].get('publisher'), -            volume=obj['journal'].get('volume'), -            issue=obj['journal'].get('issue'), +            publisher=clean(obj['journal'].get('publisher')), +            volume=clean(obj['journal'].get('volume')), +            issue=clean(obj['journal'].get('issue')),              abstracts=abstracts,              extra=extra)          return re @@ -122,17 +173,6 @@ class GrobidMetadataImporter(FatcatImporter):          sha1 = base64.b16encode(base64.b32decode(sha1_key.replace('sha1:', ''))).decode('ascii').lower() -        # lookup existing SHA1, or create new entity -        try: -            existing_file = self.api.lookup_file(sha1=sha1) -        except fatcat_client.rest.ApiException as err: -            if err.status != 404: -                raise err -            existing_file = None - -        if existing_file: -            # if file is already in here, presumably not actually long-tail -            return None          fe = fatcat_client.FileEntity(              sha1=sha1,              size=int(file_size), @@ -143,6 +183,7 @@ class GrobidMetadataImporter(FatcatImporter):          # parse URLs and CDX          original = cdx['url'] +        assert len(cdx['dt']) >= 8          wayback = "https://web.archive.org/web/{}/{}".format(              cdx['dt'],              original) @@ -154,23 +195,13 @@ class GrobidMetadataImporter(FatcatImporter):          return fe -    def create_row(self, row, editgroup_id=None): -        if not row: -            return -        fields = 
row.split('\t') -        sha1_key = fields[0] -        cdx = json.loads(fields[1]) -        mimetype = fields[2] -        file_size = int(fields[3]) -        grobid_meta = json.loads(fields[4]) -        fe = self.parse_file_metadata(sha1_key, cdx, mimetype, file_size) -        re = self.parse_grobid_json(grobid_meta) -        if fe and re: -            release_entity = self.api.create_release(re, editgroup_id=editgroup_id) -            # release ident can't already be in release list because we just -            # created it -            fe.release_ids.append(release_entity.ident) -            file_entity = self.api.create_file(fe, editgroup_id=editgroup_id) -            self.counts['insert'] += 1 - -    # NB: batch mode not implemented +    def try_update(self, entity): +        # did the exists check in 'parse_record()', because we needed to create a release +        return True + +    def insert_batch(self, batch): +        self.api.create_file_batch(batch, +            autoaccept=True, +            description=self.editgroup_description, +            extra=json.dumps(self.editgroup_extra)) + diff --git a/python/fatcat_tools/importers/issn.py b/python/fatcat_tools/importers/issn.py deleted file mode 100644 index f4d525a4..00000000 --- a/python/fatcat_tools/importers/issn.py +++ /dev/null @@ -1,89 +0,0 @@ - -import sys -import json -import itertools -import fatcat_client -from .common import FatcatImporter - - -def or_none(s): -    if s is None: -        return None -    if len(s) == 0: -        return None -    return s - -def truthy(s): -    if s is None: -        return None -    s = s.lower() - -    if s in ('true', 't', 'yes', 'y', '1'): -        return True -    elif s in ('false', 'f', 'no', 'n', '0'): -        return False -    else: -        return None - -class IssnImporter(FatcatImporter): -    """ -    Imports journal metadata ("containers") by ISSN, currently from a custom -    (data munged) .csv file format - -    CSV format (generated from git.archive.org/webgroup/oa-journal-analysis): - -        ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count -    """ - -    def __init__(self, api, **kwargs): - -        eg_desc = kwargs.get('editgroup_description', -            "Automated import of container-level metadata, by ISSN. Metadata from Internet Archive munging.") -        eg_extra = kwargs.get('editgroup_extra', dict()) -        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IssnImporter') -        super().__init__(api, -            editgroup_description=eg_desc, -            editgroup_extra=eg_extra) - -    def parse_issn_row(self, row): -        """ -        row is a python dict (parsed from CSV). 
-        returns a ContainerEntity (or None if invalid or couldn't parse) -        """ -        title = or_none(row['title']) -        issnl = or_none(row['ISSN-L']) -        if title is None or issnl is None: -            return None -        extra = dict( -            in_doaj=truthy(row['in_doaj']), -            in_road=truthy(row['in_road']), -            in_norwegian=truthy(row['in_norwegian']), -            language=or_none(row['lang']), -            url=or_none(row['url']), -            ISSNp=or_none(row['ISSN-print']), -            ISSNe=or_none(row['ISSN-electronic']), -            is_oa=truthy(row['is_oa']), -            is_kept=truthy(row['is_kept']), -        ) -        ce = fatcat_client.ContainerEntity( -            issnl=issnl, -            name=title, -            publisher=or_none(row['publisher']), -            abbrev=None, -            coden=None, -            extra=extra) -        return ce - -    def create_row(self, row, editgroup_id=None): -        ce = self.parse_issn_row(row) -        if ce is not None: -            self.api.create_container(ce, editgroup_id=editgroup_id) -            self.counts['insert'] += 1 - -    def create_batch(self, batch): -        """Reads and processes in batches (not API-call-per-line)""" -        objects = [self.parse_issn_row(l) -                   for l in batch if (l is not None)] -        objects = [o for o in objects if (o is not None)] -        self.api.create_container_batch(objects, autoaccept="true") -        self.counts['insert'] += len(objects) diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py new file mode 100644 index 00000000..cf3971b5 --- /dev/null +++ b/python/fatcat_tools/importers/journal_metadata.py @@ -0,0 +1,183 @@ + +import sys +import json +import itertools +import fatcat_client +from .common import EntityImporter, clean + + +def or_none(s): +    if s is None: +        return None +    if len(s) == 0: +        return None +    return s + +def truthy(s): +    if s is None: +        return None +    s = s.lower() + +    if s in ('true', 't', 'yes', 'y', '1'): +        return True +    elif s in ('false', 'f', 'no', 'n', '0'): +        return False +    else: +        return None + +class JournalMetadataImporter(EntityImporter): +    """ +    Imports journal metadata ("containers") by ISSN, currently from a custom +    (data munged) .csv file format + +    CSV format (generated from git.archive.org/webgroup/oa-journal-analysis): + +        ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count + + +    'extra' fields: + +        doaj +            as_of: datetime of most recent check; if not set, not actually in DOAJ +            seal: bool +            work_level: bool (are work-level publications deposited with DOAJ?) +            archiving: array, can include 'library' or 'other' +        road +            as_of: datetime of most recent check; if not set, not actually in ROAD +        pubmed (TODO: delete?) +            as_of: datetime of most recent check; if not set, not actually indexed in pubmed +        norwegian (TODO: drop this?) 
+            as_of: datetime of most recent check; if not set, not actually indexed in pubmed +            id (integer) +            level (integer; 0-2) +        kbart +            lockss +                year_rle +                volume_rle +            portico +                ... +            clockss +                ... +        sherpa_romeo +            color +        jstor +            year_rle +            volume_rle +        scopus +            id +            TODO: print/electronic distinction? +        wos +            id +        doi +            crossref_doi: DOI of the title in crossref (if exists) +            prefixes: array of strings (DOI prefixes, up to the '/'; any registrar, not just Crossref) +        ia +            sim +                nap_id +                year_rle +                volume_rle +            longtail: boolean +            homepage +                as_of: datetime of last attempt +                url +                status: HTTP/heritrix status of homepage crawl + +        issnp: string +        issne: string +        coden: string +        abbrev: string +        oclc_id: string (TODO: lookup?) +        lccn_id: string (TODO: lookup?) +        dblb_id: string +        default_license: slug +        original_name: native name (if name is translated) +        platform: hosting platform: OJS, wordpress, scielo, etc +        mimetypes: array of strings (eg, 'application/pdf', 'text/html') +        first_year: year (integer) +        last_year: if publishing has stopped +        primary_language: single ISO code, or 'mixed' +        languages: array of ISO codes +        region: TODO: continent/world-region +        nation: shortcode of nation +        discipline: TODO: highest-level subject; "life science", "humanities", etc +        field: TODO: narrower description of field +        subjects: TODO? +        url: homepage +        is_oa: boolean. If true, can assume all releases under this container are "Open Access" +        TODO: domains, if exclusive? +        TODO: fulltext_regex, if a known pattern? + +    For KBART, etc: +        We "over-count" on the assumption that "in-progress" status works will soon actually be preserved. +        year and volume spans are run-length-encoded arrays, using integers: +            - if an integer, means that year is preserved +            - if an array of length 2, means everything between the two numbers (inclusive) is preserved +    """ + +    def __init__(self, api, **kwargs): + +        eg_desc = kwargs.get('editgroup_description', +            "Automated import of container-level metadata, by ISSN. Metadata from Internet Archive munging.") +        eg_extra = kwargs.get('editgroup_extra', dict()) +        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.JournalMetadataImporter') +        super().__init__(api, +            editgroup_description=eg_desc, +            editgroup_extra=eg_extra) + +    def want(self, raw_record): +        if raw_record.get('ISSN-L'): +            return True +        return False + +    def parse_record(self, row): +        """ +        row is a python dict (parsed from CSV). 
+        returns a ContainerEntity (or None if invalid or couldn't parse) +        """ +        title = or_none(row['title']) +        issnl = or_none(row['ISSN-L']) +        if title is None or issnl is None: +            return None +        extra = dict( +            in_doaj=truthy(row['in_doaj']), +            in_road=truthy(row['in_road']), +            in_norwegian=truthy(row['in_norwegian']), +            language=or_none(row['lang']), +            url=or_none(row['url']), +            ISSNp=or_none(row['ISSN-print']), +            ISSNe=or_none(row['ISSN-electronic']), +            is_oa=truthy(row['is_oa']), +            is_kept=truthy(row['is_kept']), +        ) +        ce = fatcat_client.ContainerEntity( +            issnl=issnl, +            name=clean(title), +            publisher=or_none(clean(row['publisher'])), +            extra=extra) +        return ce + +    def try_update(self, ce): + +        existing = None +        try: +            existing = self.api.lookup_container(issnl=ce.issnl) +        except fatcat_client.rest.ApiException as err: +            if err.status != 404: +                raise err +            # doesn't exist, need to update +            return True + +        # eventually we'll want to support "updates", but for now just skip if +        # entity already exists +        if existing: +            self.counts['exists'] += 1 +            return False +         +        return True + +    def insert_batch(self, batch): +        self.api.create_container_batch(batch, +            autoaccept=True, +            description=self.editgroup_description, +            extra=json.dumps(self.editgroup_extra)) + diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py index 1e5c22f7..2ec6c95d 100644 --- a/python/fatcat_tools/importers/matched.py +++ b/python/fatcat_tools/importers/matched.py @@ -4,16 +4,10 @@ import json  import sqlite3  import itertools  import fatcat_client -from .common import FatcatImporter +from .common import EntityImporter, clean -#row = row.split('\t') -#assert len(row) == 2 -#sha1 = row[0].replace('sha1:') -#sha1 = base64.b16encode(base64.b32decode(sha1)).lower() -#print(sha1) -#dois = [d.lower() for d in json.loads(row[1])] -class MatchedImporter(FatcatImporter): +class MatchedImporter(EntityImporter):      """      Importer for "file to crossref DOI" matches. 
@@ -48,7 +42,6 @@ class MatchedImporter(FatcatImporter):              editgroup_extra=eg_extra)          self.default_link_rel = kwargs.get("default_link_rel", "web")          self.default_mime = kwargs.get("default_mime", None) -        self.skip_file_updates = kwargs.get("skip_file_updates", False)      def make_url(self, raw):          rel = self.default_link_rel @@ -59,26 +52,13 @@ class MatchedImporter(FatcatImporter):              rel = "repository"          elif "//web.archive.org/" in raw or "//archive.is/" in raw:              rel = "webarchive" -        return fatcat_client.FileEntityUrls(url=raw, rel=rel) +        return (rel, raw) -    def parse_matched_dict(self, obj): -        sha1 = obj['sha1'] -        dois = [d.lower() for d in obj.get('dois', [])] +    def want(self, raw_record): +        return True -        # lookup sha1, or create new entity -        fe = None -        if not self.skip_file_updates: -            try: -                fe = self.api.lookup_file(sha1=sha1) -            except fatcat_client.rest.ApiException as err: -                if err.status != 404: -                    raise err -        if fe is None: -            fe = fatcat_client.FileEntity( -                sha1=sha1, -                release_ids=[], -                urls=[], -            ) +    def parse_record(self, obj): +        dois = [d.lower() for d in obj.get('dois', [])]          # lookup dois          re_list = set() @@ -93,67 +73,77 @@ class MatchedImporter(FatcatImporter):                  print("DOI not found: {}".format(doi))              else:                  re_list.add(re.ident) -        if len(re_list) == 0: +        release_ids = list(re_list) +        if len(release_ids) == 0:              return None -        if fe.release_ids == set(re_list): -            return None -        re_list.update(fe.release_ids) -        fe.release_ids = list(re_list)          # parse URLs and CDX -        existing_urls = [feu.url for feu in fe.urls] +        urls = set()          for url in obj.get('url', []): -            if url not in existing_urls: -                url = self.make_url(url) -                if url != None: -                    fe.urls.append(url) +            url = self.make_url(url) +            if url != None: +                urls.add(url)          for cdx in obj.get('cdx', []):              original = cdx['url']              wayback = "https://web.archive.org/web/{}/{}".format(                  cdx['dt'],                  original) -            if wayback not in existing_urls: -                fe.urls.append( -                    fatcat_client.FileEntityUrls(url=wayback, rel="webarchive")) -            if original not in existing_urls: -                url = self.make_url(original) -                if url != None: -                    fe.urls.append(url) - -        if obj.get('size') != None: -            fe.size = int(obj['size']) -        fe.sha256 = obj.get('sha256', fe.sha256) -        fe.md5 = obj.get('md5', fe.sha256) -        if obj.get('mimetype') is None: -            if fe.mimetype is None: -                fe.mimetype = self.default_mime -        else: -            fe.mimetype = obj.get('mimetype') +            urls.add(("webarchive", wayback)) +            url = self.make_url(original) +            if url != None: +                urls.add(url) +        urls = [fatcat_client.FileEntityUrls(rel, url) for (rel, url) in urls] +        if len(urls) == 0: +            return None + +        size = obj.get('size') +        if size: +            size = int(size) + +  
      fe = fatcat_client.FileEntity( +            md5=obj.get('md5'), +            sha1=obj['sha1'], +            sha256=obj.get('sha256'), +            size=size, +            mimetype=obj.get('mimetype'), +            release_ids=release_ids, +            urls=urls, +        )          return fe -    def create_row(self, row, editgroup_id=None): -        obj = json.loads(row) -        fe = self.parse_matched_dict(obj) -        if fe is not None: -            if fe.ident is None: -                self.api.create_file(fe, editgroup_id=editgroup_id) -                self.counts['insert'] += 1 -            else: -                self.api.update_file(fe.ident, fe, editgroup_id=editgroup_id) -                self.counts['update'] += 1 - -    def create_batch(self, batch): -        """Reads and processes in batches (not API-call-per-line)""" -        objects = [self.parse_matched_dict(json.loads(l)) -                   for l in batch if l != None] -        new_objects = [o for o in objects if o != None and o.ident == None] -        update_objects = [o for o in objects if o != None and o.ident != None] -        if len(update_objects): -            update_eg = self._editgroup().editgroup_id -            for obj in update_objects: -                self.api.update_file(obj.ident, obj, editgroup_id=update_eg) -            self.api.accept_editgroup(update_eg) -        if len(new_objects) > 0: -            self.api.create_file_batch(new_objects, autoaccept="true") -        self.counts['update'] += len(update_objects) -        self.counts['insert'] += len(new_objects) +    def try_update(self, fe): +        # lookup sha1, or create new entity +        existing = None +        try: +            existing = self.api.lookup_file(sha1=fe.sha1) +        except fatcat_client.rest.ApiException as err: +            if err.status != 404: +                raise err + +        if not existing: +            return True + +        fe.release_ids = list(set(fe.release_ids + existing.release_ids)) +        if set(fe.release_ids) == set(existing.release_ids) and len(existing.urls) > 0: +            # no new release matches *and* there are already existing URLs +            self.counts['exists'] += 1 +            return False + +        # merge the existing into this one and update +        existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls])) +        existing.urls = [fatcat_client.FileEntityUrls(rel=rel, url=url) for (rel, url) in existing.urls] +        existing.release_ids = list(set(fe.release_ids + existing.release_ids)) +        existing.mimetype = existing.mimetype or fe.mimetype +        existing.size = existing.size or fe.size +        existing.md5 = existing.md5 or fe.md5 +        existing.sha256 = existing.sha256 or fe.sha256 +        self.api.update_file(existing.ident, existing, editgroup_id=self._get_editgroup()) +        self.counts['update'] += 1 +        return False + +    def insert_batch(self, batch): +        self.api.create_file_batch(batch, +            autoaccept=True, +            description=self.editgroup_description, +            extra=json.dumps(self.editgroup_extra)) + diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py index 0c8b1d62..02c9bf00 100644 --- a/python/fatcat_tools/importers/orcid.py +++ b/python/fatcat_tools/importers/orcid.py @@ -3,7 +3,7 @@ import sys  import json  import itertools  import fatcat_client -from .common import FatcatImporter +from .common import EntityImporter, clean  def value_or_none(e):      if 
type(e) == dict: @@ -20,7 +20,7 @@ def value_or_none(e):              return None      return e -class OrcidImporter(FatcatImporter): +class OrcidImporter(EntityImporter):      def __init__(self, api, **kwargs): @@ -32,14 +32,16 @@ class OrcidImporter(FatcatImporter):              editgroup_description=eg_desc,              editgroup_extra=eg_extra) -    def parse_orcid_dict(self, obj): +    def want(self, raw_record): +        return True + +    def parse_record(self, obj):          """          obj is a python dict (parsed from json).          returns a CreatorEntity          """          name = obj['person']['name'] -        if name is None: -            return None +        assert name          extra = None          given = value_or_none(name.get('given-names'))          sur = value_or_none(name.get('family-name')) @@ -61,23 +63,30 @@ class OrcidImporter(FatcatImporter):              return None          ce = fatcat_client.CreatorEntity(              orcid=orcid, -            given_name=given, -            surname=sur, -            display_name=display, +            given_name=clean(given), +            surname=clean(sur), +            display_name=clean(display),              extra=extra)          return ce -    def create_row(self, row, editgroup_id=None): -        obj = json.loads(row) -        ce = self.parse_orcid_dict(obj) -        if ce is not None: -            self.api.create_creator(ce, editgroup_id=editgroup_id) -            self.counts['insert'] += 1 +    def try_update(self, raw_record): +        existing = None +        try: +            existing = self.api.lookup_creator(orcid=raw_record.orcid) +        except fatcat_client.rest.ApiException as err: +            if err.status != 404: +                raise err + +        # eventually we'll want to support "updates", but for now just skip if +        # entity already exists +        if existing: +            self.counts['exists'] += 1 +            return False +         +        return True -    def create_batch(self, batch): -        """Reads and processes in batches (not API-call-per-line)""" -        objects = [self.parse_orcid_dict(json.loads(l)) -                   for l in batch if l != None] -        objects = [o for o in objects if o != None] -        self.api.create_creator_batch(objects, autoaccept="true") -        self.counts['insert'] += len(objects) +    def insert_batch(self, batch): +        self.api.create_creator_batch(batch, +            autoaccept=True, +            description=self.editgroup_description, +            extra=json.dumps(self.editgroup_extra)) | 
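
The EntityImporter docstring above lists want(), parse(), try_update(), and insert_batch() as the methods implementations fill in; note that push_record() actually calls parse_record(), which is the name the concrete importers in this commit define. A minimal sketch of that contract, loosely modeled on the ORCID importer in this patch (the flat record shape with 'orcid' and 'name' keys is hypothetical, not something the commit defines):

    import json

    import fatcat_client
    from fatcat_tools.importers import EntityImporter, clean

    class MinimalCreatorImporter(EntityImporter):
        # Sketch only: demonstrates the subclass contract, not a real importer.

        def want(self, raw_record):
            # cheap, side-effect-free filter; a False here is counted as 'skip'
            return bool(raw_record.get('orcid'))

        def parse_record(self, raw_record):
            # return an entity, or None to skip this record
            return fatcat_client.CreatorEntity(
                orcid=raw_record['orcid'],
                display_name=clean(raw_record.get('name')))

        def try_update(self, ce):
            # True means "insert it"; False means it exists (or was handled here)
            try:
                self.api.lookup_creator(orcid=ce.orcid)
            except fatcat_client.rest.ApiException as err:
                if err.status != 404:
                    raise err
                return True
            self.counts['exists'] += 1
            return False

        def insert_batch(self, batch):
            self.api.create_creator_batch(batch,
                autoaccept=True,
                description=self.editgroup_description,
                extra=json.dumps(self.editgroup_extra))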
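
Wiring it together follows the new __init__.py module docstring: one pusher (here JsonLinePusher; CsvPusher, LinePusher, and KafkaJsonPusher work the same way) drives one importer. The API client construction below is an assumption about the generated fatcat_client package, not something shown in this patch:

    import sys

    import fatcat_client
    from fatcat_tools.importers import JsonLinePusher, OrcidImporter

    # assumed: a configured client pointing at a fatcat API instance
    api = fatcat_client.DefaultApi(fatcat_client.ApiClient())

    importer = OrcidImporter(api)
    # run() loops over JSON lines, calling importer.push_record() per record;
    # importer.finish() flushes the last batch/editgroup and returns the counts
    counts = JsonLinePusher(importer, sys.stdin).run()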