about summary refs log tree commit diff stats
path: root/python/fatcat_tools/importers
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r--python/fatcat_tools/importers/common.py137
-rw-r--r--python/fatcat_tools/importers/crossref.py272
-rw-r--r--python/fatcat_tools/importers/grobid_metadata.py168
-rw-r--r--python/fatcat_tools/importers/issn.py72
-rw-r--r--python/fatcat_tools/importers/matched.py144
-rw-r--r--python/fatcat_tools/importers/orcid.py73
6 files changed, 866 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
new file mode 100644
index 00000000..8dfee875
--- /dev/null
+++ b/python/fatcat_tools/importers/common.py
@@ -0,0 +1,137 @@
+
+import re
+import sys
+import csv
+import json
+import itertools
+import fatcat_client
+from fatcat_client.rest import ApiException
+
# from: https://docs.python.org/3/library/itertools.html
def grouper(iterable, n, fillvalue=None):
    """Yield fixed-length tuples of items from *iterable*.

    The final tuple is padded with *fillvalue* up to length ``n``.
    """
    source = iter(iterable)
    def _chunks():
        while True:
            chunk = tuple(itertools.islice(source, n))
            if not chunk:
                return
            if len(chunk) < n:
                chunk = chunk + (fillvalue,) * (n - len(chunk))
            yield chunk
    return _chunks()
+
class FatcatImporter:
    """Base class for fatcat importers.

    Handles API client setup, editgroup batching, and cached identifier
    lookups (ISSN-L, ORCID, DOI). Subclasses implement create_row() and,
    for batch mode, create_batch().
    """

    def __init__(self, host_url, issn_map_file=None):
        conf = fatcat_client.Configuration()
        conf.host = host_url
        self.api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf))
        # lookup caches; a cached value of None means a confirmed miss
        self._issnl_id_map = dict()
        self._orcid_id_map = dict()
        self._doi_id_map = dict()
        self._issn_issnl_map = None
        self._orcid_regex = re.compile("^\\d{4}-\\d{4}-\\d{4}-\\d{3}[\\dX]$")
        if issn_map_file:
            self.read_issn_map_file(issn_map_file)
        self.processed_lines = 0
        self.insert_count = 0
        self.update_count = 0

    def describe_run(self):
        """Print a one-line summary of this import run's counters."""
        print("Processed {} lines, inserted {}, updated {}.".format(
            self.processed_lines, self.insert_count, self.update_count))

    def process_source(self, source, group_size=100):
        """Creates and auto-accepts editgroup every group_size rows"""
        eg = self.api.create_editgroup(
            fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae'))
        # sentinel so an empty source doesn't leave `i` undefined below
        # (previously raised NameError on empty input)
        i = -1
        for i, row in enumerate(source):
            self.create_row(row, editgroup=eg.id)
            if i > 0 and (i % group_size) == 0:
                self.api.accept_editgroup(eg.id)
                eg = self.api.create_editgroup(
                    fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae'))
            self.processed_lines = self.processed_lines + 1
        if i == -1:
            # nothing was processed; don't accept the (empty) editgroup
            return
        if i == 0 or (i % group_size) != 0:
            self.api.accept_editgroup(eg.id)

    def process_batch(self, source, size=50):
        """Reads and processes in batches (one create_batch() call per chunk)"""
        for rows in grouper(source, size):
            # grouper() pads the final chunk with None; don't count padding
            self.processed_lines = self.processed_lines + len(
                [r for r in rows if r is not None])
            eg = self.api.create_editgroup(
                fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae'))
            self.create_batch(rows, editgroup=eg.id)

    def process_csv_source(self, source, group_size=100, delimiter=','):
        """Row-at-a-time processing of a CSV file-like object."""
        reader = csv.DictReader(source, delimiter=delimiter)
        self.process_source(reader, group_size)

    def process_csv_batch(self, source, size=50, delimiter=','):
        """Batched processing of a CSV file-like object."""
        reader = csv.DictReader(source, delimiter=delimiter)
        self.process_batch(reader, size)

    def is_issnl(self, issnl):
        """Cheap shape check (9 chars with a dash), not a checksum validation."""
        return len(issnl) == 9 and issnl[4] == '-'

    def lookup_issnl(self, issnl):
        """Caches calls to the ISSN-L lookup API endpoint in a local dict"""
        if issnl in self._issnl_id_map:
            return self._issnl_id_map[issnl]
        container_id = None
        try:
            rv = self.api.lookup_container(issnl=issnl)
            container_id = rv.ident
        except ApiException as ae:
            # If anything other than a 404 (not found), something is wrong
            if ae.status != 404:
                raise ae
        self._issnl_id_map[issnl] = container_id # might be None
        return container_id

    def is_orcid(self, orcid):
        """True if the string matches the ORCID 0000-0000-0000-000X shape."""
        return self._orcid_regex.match(orcid) is not None

    def lookup_orcid(self, orcid):
        """Caches calls to the Orcid lookup API endpoint in a local dict"""
        if not self.is_orcid(orcid):
            return None
        if orcid in self._orcid_id_map:
            return self._orcid_id_map[orcid]
        creator_id = None
        try:
            rv = self.api.lookup_creator(orcid=orcid)
            creator_id = rv.ident
        except ApiException as ae:
            # If anything other than a 404 (not found), something is wrong
            if ae.status != 404:
                raise ae
        self._orcid_id_map[orcid] = creator_id # might be None
        return creator_id

    def is_doi(self, doi):
        """Cheap shape check: starts with "10." and has a slash."""
        return doi.startswith("10.") and doi.count("/") >= 1

    def lookup_doi(self, doi):
        """Caches calls to the doi lookup API endpoint in a local dict"""
        assert self.is_doi(doi)
        doi = doi.lower()
        if doi in self._doi_id_map:
            return self._doi_id_map[doi]
        release_id = None
        try:
            rv = self.api.lookup_release(doi=doi)
            release_id = rv.ident
        except ApiException as ae:
            # If anything other than a 404 (not found), something is wrong
            if ae.status != 404:
                raise ae
        self._doi_id_map[doi] = release_id # might be None
        return release_id

    def read_issn_map_file(self, issn_map_file):
        """Load an ISSN -> ISSN-L mapping from a whitespace-delimited file.

        Lines starting with "ISSN" (header) and blank lines are skipped.
        """
        print("Loading ISSN map file...")
        self._issn_issnl_map = dict()
        for line in issn_map_file:
            # note: lines still carry their trailing newline, so test the
            # stripped length (a bare len(line) == 0 check never fires)
            if line.startswith("ISSN") or len(line.strip()) == 0:
                continue
            (issn, issnl) = line.split()[0:2]
            self._issn_issnl_map[issn] = issnl
            # double mapping makes lookups easy
            self._issn_issnl_map[issnl] = issnl
        print("Got {} ISSN-L mappings.".format(len(self._issn_issnl_map)))

    def issn2issnl(self, issn):
        """Map an ISSN to its ISSN-L via the loaded table (None if unknown)."""
        if issn is None:
            return None
        return self._issn_issnl_map.get(issn)
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
new file mode 100644
index 00000000..dddb58d1
--- /dev/null
+++ b/python/fatcat_tools/importers/crossref.py
@@ -0,0 +1,272 @@
+
+import sys
+import json
+import sqlite3
+import datetime
+import itertools
+import fatcat_client
+from fatcat_tools.importers.common import FatcatImporter
+
+
class FatcatCrossrefImporter(FatcatImporter):
    """Importer for Crossref works metadata (JSON lines) into releases.

    Optionally creates container entities on the fly (create_containers),
    and maps DOIs to external identifiers (CORE/PMID/PMCID/wikidata) via a
    read-only sqlite database.
    """

    def __init__(self, host_url, issn_map_file, extid_map_file=None, create_containers=True):
        super().__init__(host_url, issn_map_file)
        self.extid_map_db = None
        if extid_map_file:
            # open read-only; we only ever SELECT from this mapping
            db_uri = "file:{}?mode=ro".format(extid_map_file)
            print("Using external ID map: {}".format(db_uri))
            self.extid_map_db = sqlite3.connect(db_uri, uri=True)
        else:
            print("Not using external ID map")
        self.create_containers = create_containers

    def lookup_ext_ids(self, doi):
        """Look up external release identifiers for a DOI in the sqlite map.

        Always returns a dict with core_id/pmid/pmcid/wikidata_qid keys;
        values are None when unknown or when no map is configured.
        """
        if self.extid_map_db is None:
            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None)
        row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1",
            [doi.lower()]).fetchone()
        if row is None:
            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None)
        # normalize empty cells to None
        row = [str(cell or '') or None for cell in row]
        return dict(
            core_id=row[0],
            pmid=row[1],
            pmcid=row[2],
            wikidata_qid=row[3])

    def parse_crossref_dict(self, obj):
        """
        obj is a python dict (parsed from json).
        returns a (ReleaseEntity, ContainerEntity-or-None) tuple, or None if
        the work is out of scope.
        """

        # This work is out of scope if it doesn't have authors and a title
        if (not 'author' in obj) or (not 'title' in obj):
            return None

        # Other ways to be out of scope (provisionally)
        if (not 'type' in obj):
            return None

        # contribs
        def do_contribs(obj_list, ctype):
            # convert a crossref author/editor/translator list to contribs
            contribs = []
            for i, am in enumerate(obj_list):
                creator_id = None
                if 'ORCID' in am.keys():
                    creator_id = self.lookup_orcid(am['ORCID'].split('/')[-1])
                # Sorry humans :(
                if am.get('given') and am.get('family'):
                    raw_name = "{} {}".format(am['given'], am['family'])
                elif am.get('family'):
                    raw_name = am['family']
                else:
                    # TODO: defaults back to a pseudo-null value
                    raw_name = am.get('given', '<blank>')
                extra = dict()
                if ctype == "author":
                    # ordering is only tracked for authors
                    index = i
                else:
                    index = None
                if am.get('affiliation'):
                    # note: affiliation => affiliations
                    extra['affiliations'] = am.get('affiliation')
                if am.get('sequence') and am.get('sequence') != "additional":
                    extra['sequence'] = am.get('sequence')
                if not extra:
                    extra = None
                contribs.append(fatcat_client.ReleaseContrib(
                    creator_id=creator_id,
                    index=index,
                    raw_name=raw_name,
                    role=ctype,
                    extra=extra))
            return contribs
        contribs = do_contribs(obj['author'], "author")
        contribs.extend(do_contribs(obj.get('editor', []), "editor"))
        contribs.extend(do_contribs(obj.get('translator', []), "translator"))

        # container
        issn = obj.get('ISSN', [None])[0]
        issnl = self.issn2issnl(issn)
        container_id = None
        if issnl:
            container_id = self.lookup_issnl(issnl)
        publisher = obj.get('publisher')

        # only propose a new container when configured to, the lookup
        # missed, and we have both an ISSN-L and a container title
        ce = None
        if (container_id is None and self.create_containers and issnl is not None
            and obj.get('container-title') and len(obj['container-title']) > 0):
            ce = fatcat_client.ContainerEntity(
                issnl=issnl,
                publisher=publisher,
                name=obj['container-title'][0])

        # references
        refs = []
        for i, rm in enumerate(obj.get('reference', [])):
            try:
                year = int(rm.get('year'))
                # NOTE: will need to update/config in the future!
                # NOTE: are there crossref works with year < 100?
                if year > 2025 or year < 100:
                    year = None
            except (TypeError, ValueError):
                # missing or non-numeric year
                year = None
            extra = rm.copy()
            if rm.get('DOI'):
                extra['doi'] = rm.get('DOI').lower()
            key = rm.get('key')
            # strip the parent DOI prefix crossref prepends to ref keys
            if key and key.startswith(obj['DOI'].upper()):
                key = key.replace(obj['DOI'].upper() + "-", '')
                key = key.replace(obj['DOI'].upper(), '')
            container_name = rm.get('volume-title')
            if not container_name:
                container_name = rm.get('journal-title')
            # drop fields we already mapped into first-class columns
            extra.pop('DOI', None)
            extra.pop('key', None)
            extra.pop('year', None)
            extra.pop('volume-name', None)
            extra.pop('journal-title', None)
            extra.pop('title', None)
            extra.pop('first-page', None)
            extra.pop('doi-asserted-by', None)
            if extra:
                extra = dict(crossref=extra)
            else:
                extra = None
            refs.append(fatcat_client.ReleaseRef(
                index=i,
                # doing lookups would be a second import pass
                target_release_id=None,
                key=key,
                year=year,
                container_name=container_name,
                title=rm.get('title'),
                locator=rm.get('first-page'),
                # TODO: just dump JSON somewhere here?
                extra=extra))

        # abstracts
        abstracts = []
        if obj.get('abstract') is not None:
            abstracts.append(fatcat_client.ReleaseEntityAbstracts(
                mimetype="application/xml+jats",
                content=obj.get('abstract')))

        # extra fields
        extra = dict()
        for key in ('subject', 'type', 'license', 'alternative-id',
                'container-title', 'original-title', 'subtitle', 'archive',
                'funder', 'group-title'):
            # TODO: unpack "container-title" array
            val = obj.get(key)
            if val:
                extra[key] = val
        # flatten license start timestamps to the bare date-time string
        if 'license' in extra and extra['license']:
            for i in range(len(extra['license'])):
                if 'start' in extra['license'][i]:
                    extra['license'][i]['start'] = extra['license'][i]['start']['date-time']
        if len(obj['title']) > 1:
            extra['other-titles'] = obj['title'][1:]
        # TODO: this should be top-level
        extra['is_kept'] = len(obj.get('archive', [])) > 0

        # ISBN
        isbn13 = None
        for raw in obj.get('ISBN', []):
            # TODO: convert if not ISBN-13 format
            if len(raw) == 17:
                isbn13 = raw
                break

        # release status
        if obj['type'] in ('journal-article', 'conference-proceeding', 'book',
                'dissertation', 'book-chapter'):
            release_status = "published"
        else:
            # unknown
            release_status = None

        # external identifiers
        extids = self.lookup_ext_ids(doi=obj['DOI'].lower())

        # TODO: filter out huge releases; we'll get them later (and fix bug in
        # fatcatd)
        if max(len(contribs), len(refs), len(abstracts)) > 750:
            return None

        # release date parsing is amazingly complex
        release_date = obj['issued']['date-parts'][0]
        if not release_date or not release_date[0]:
            # got some NoneType, even though at least year is supposed to be set
            release_date = None
        elif len(release_date) == 3:
            release_date = datetime.datetime(year=release_date[0], month=release_date[1], day=release_date[2])
        else:
            # only the year is actually required; mangle to first day for date
            # (TODO: something better?)
            release_date = datetime.datetime(year=release_date[0], month=1, day=1)
        # convert to string ISO datetime format (if not null)
        if release_date:
            release_date = release_date.isoformat() + "Z"

        re = fatcat_client.ReleaseEntity(
            work_id=None,
            title=obj['title'][0],
            contribs=contribs,
            refs=refs,
            container_id=container_id,
            publisher=publisher,
            release_type=obj['type'],
            release_status=release_status,
            doi=obj['DOI'].lower(),
            isbn13=isbn13,
            core_id=extids['core_id'],
            pmid=extids['pmid'],
            pmcid=extids['pmcid'],
            wikidata_qid=extids['wikidata_qid'],
            release_date=release_date,
            issue=obj.get('issue'),
            volume=obj.get('volume'),
            pages=obj.get('page'),
            abstracts=abstracts,
            extra=dict(crossref=extra))
        return (re, ce)

    def create_row(self, row, editgroup=None):
        """Parse one JSON line and create container (if needed) + release."""
        if row is None:
            return
        obj = json.loads(row)
        entities = self.parse_crossref_dict(obj)
        if entities is not None:
            (re, ce) = entities
            if ce is not None:
                container = self.api.create_container(ce, editgroup=editgroup)
                re.container_id = container.ident
                # cache so later rows for the same journal reuse the ident
                self._issnl_id_map[ce.issnl] = container.ident
            self.api.create_release(re, editgroup=editgroup)
            self.insert_count = self.insert_count + 1

    def create_batch(self, batch, editgroup=None):
        """Current work/release pairing disallows batch creation of releases.
        Could do batch work creation and then match against releases, but meh."""
        release_batch = []
        for row in batch:
            if row is None:
                continue
            obj = json.loads(row)
            entities = self.parse_crossref_dict(obj)
            if entities is not None:
                (re, ce) = entities
                if ce is not None:
                    # containers get their own immediately-accepted editgroup
                    # so the releases in this batch can reference them
                    ce_eg = self.api.create_editgroup(
                        fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae'))
                    container = self.api.create_container(ce, editgroup=ce_eg.id)
                    self.api.accept_editgroup(ce_eg.id)
                    re.container_id = container.ident
                    self._issnl_id_map[ce.issnl] = container.ident
                release_batch.append(re)
        self.api.create_release_batch(release_batch, autoaccept="true", editgroup=editgroup)
        self.insert_count = self.insert_count + len(release_batch)
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
new file mode 100644
index 00000000..56b2ee02
--- /dev/null
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -0,0 +1,168 @@
+#!/usr/bin/env python3
+
+import sys
+import json
+import base64
+import datetime
+import fatcat_client
+from fatcat_tools.importers.common import FatcatImporter
+
# abstracts longer than this many characters are dropped entirely
MAX_ABSTRACT_BYTES=4096
+
+
class FatcatGrobidMetadataImporter(FatcatImporter):
    """Importer for release + file metadata extracted from PDFs by GROBID.

    Input rows are TSV lines: sha1_key, CDX JSON, mimetype, file size,
    GROBID metadata JSON (see create_row()).
    """

    def __init__(self, host_url, default_link_rel="web"):
        super().__init__(host_url)
        # rel value used for file URLs we can't classify more specifically
        self.default_link_rel = default_link_rel

    def parse_grobid_json(self, obj):
        """Convert a GROBID metadata dict into a ReleaseEntity.

        Returns None when the record has no title (out of scope).
        """

        if not obj.get('title'):
            return None

        extra = dict()

        # abstract, only when present and not suspiciously huge
        if obj.get('abstract') and len(obj.get('abstract')) < MAX_ABSTRACT_BYTES:
            abobj = dict(
                mimetype="text/plain",
                language=None,
                content=obj.get('abstract').strip())
            abstracts = [abobj]
        else:
            abstracts = None

        contribs = []
        for i, a in enumerate(obj.get('authors', [])):
            contribs.append(fatcat_client.ReleaseContrib(
                index=i,
                raw_name=a['name'],
                role="author",
                extra=None))

        refs = []
        for raw in obj.get('citations', []):
            cite_extra = dict()
            ref = dict()
            ref['key'] = raw.get('id')
            if raw.get('title'):
                ref['title'] = raw['title'].strip()
            if raw.get('date'):
                try:
                    year = int(raw['date'].strip()[:4])
                    ref['year'] = year
                except (TypeError, ValueError):
                    # unparseable year; leave unset
                    pass
            for key in ('volume', 'url', 'issue', 'publisher'):
                if raw.get(key):
                    cite_extra[key] = raw[key].strip()
            if raw.get('authors'):
                cite_extra['authors'] = [a['name'] for a in raw['authors']]
            if cite_extra:
                cite_extra = dict(grobid=cite_extra)
            else:
                cite_extra = None
            ref['extra'] = cite_extra
            refs.append(ref)

        # NOTE(review): computed but never passed to ReleaseEntity below —
        # possibly intended as release_type=release_type; confirm
        release_type = "journal-article"
        release_date = None
        if obj.get('date'):
            # TODO: only returns year, ever? how to handle?
            release_date = datetime.datetime(year=int(obj['date'][:4]), month=1, day=1)

        # GROBID may omit 'journal' entirely or emit it as null; normalize
        # to an empty dict so the .get() calls below are safe
        journal = obj.get('journal') or {}

        if obj.get('doi'):
            extra['doi'] = obj['doi']
        if journal.get('name'):
            extra['container_name'] = journal['name']

        extra['is_longtail_oa'] = True

        # TODO: ISSN/eISSN handling? or just journal name lookup?

        if extra:
            extra = dict(grobid=extra)
        else:
            extra = None

        re = fatcat_client.ReleaseEntity(
            title=obj['title'].strip(),
            contribs=contribs,
            refs=refs,
            publisher=journal.get('publisher'),
            volume=journal.get('volume'),
            issue=journal.get('issue'),
            abstracts=abstracts,
            extra=extra)
        return re

    # TODO: make this a common function somewhere
    def make_url(self, raw):
        """Wrap a raw URL in a FileEntityUrls, classifying the link rel."""
        rel = self.default_link_rel
        # TODO: this is where we could map specific domains to rel types,
        # and also filter out bad domains, invalid URLs, etc
        if "//archive.org/" in raw or "//arxiv.org/" in raw:
            # TODO: special-case the arxiv.org bulk mirror?
            rel = "repository"
        elif "//web.archive.org/" in raw or "//archive.is/" in raw:
            rel = "webarchive"
        return fatcat_client.FileEntityUrls(url=raw, rel=rel)

    def parse_file_metadata(self, sha1_key, cdx, mimetype, file_size):
        """Build a FileEntity with wayback + original URLs.

        Returns None when a file with this SHA-1 already exists.
        """

        # sha1_key looks like "sha1:<base32>"; fatcat wants lowercase hex
        sha1 = base64.b16encode(base64.b32decode(sha1_key.replace('sha1:', ''))).decode('ascii').lower()

        # lookup existing SHA1, or create new entity
        try:
            existing_file = self.api.lookup_file(sha1=sha1)
        except fatcat_client.rest.ApiException as err:
            if err.status != 404:
                raise err
            existing_file = None

        if existing_file:
            # if file is already in here, presumably not actually long-tail
            return None
        fe = fatcat_client.FileEntity(
            sha1=sha1,
            size=int(file_size),
            mimetype=mimetype,
            releases=[],
            urls=[],
        )

        # parse URLs and CDX
        original = cdx['url']
        wayback = "https://web.archive.org/web/{}/{}".format(
            cdx['dt'],
            original)
        fe.urls.append(
            fatcat_client.FileEntityUrls(url=wayback, rel="webarchive"))
        original_url = self.make_url(original)
        if original_url is not None:
            fe.urls.append(original_url)

        return fe

    def create_row(self, row, editgroup=None):
        """Process one TSV line: create the release, then the file entity."""
        if not row:
            return
        fields = row.split('\t')
        sha1_key = fields[0]
        cdx = json.loads(fields[1])
        mimetype = fields[2]
        file_size = int(fields[3])
        grobid_meta = json.loads(fields[4])
        fe = self.parse_file_metadata(sha1_key, cdx, mimetype, file_size)
        re = self.parse_grobid_json(grobid_meta)
        if fe and re:
            release_entity = self.api.create_release(re, editgroup=editgroup)
            # release ident can't already be in release list because we just
            # created it
            fe.releases.append(release_entity.ident)
            file_entity = self.api.create_file(fe, editgroup=editgroup)
            self.insert_count = self.insert_count + 1

    # NB: batch mode not implemented
diff --git a/python/fatcat_tools/importers/issn.py b/python/fatcat_tools/importers/issn.py
new file mode 100644
index 00000000..d7fb9082
--- /dev/null
+++ b/python/fatcat_tools/importers/issn.py
@@ -0,0 +1,72 @@
+
+import sys
+import json
+import itertools
+import fatcat_client
+from fatcat_tools.importers.common import FatcatImporter
+
+# CSV format (generated from git.archive.org/webgroup/oa-journal-analysis):
+# ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count
+
def or_none(s):
    """Return *s*, normalizing None and the empty string to None."""
    if s is None or len(s) == 0:
        return None
    return s
+
def truthy(s):
    """Parse a CSV boolean-ish string; returns True, False, or None."""
    if s is None:
        return None
    val = s.lower()
    if val in ('true', 't', 'yes', 'y', '1'):
        return True
    if val in ('false', 'f', 'no', 'n', '0'):
        return False
    return None
+
class FatcatIssnImporter(FatcatImporter):
    """Imports container (journal) entities from the ISSN-L analysis CSV."""

    def parse_issn_row(self, row):
        """
        row is a python dict (parsed from CSV).
        returns a ContainerEntity, or None when title or ISSN-L is missing
        """
        name = or_none(row['title'])
        issnl = or_none(row['ISSN-L'])
        if name is None or issnl is None:
            return
        extra = dict(
            in_doaj=truthy(row['in_doaj']),
            in_road=truthy(row['in_road']),
            in_norwegian=truthy(row['in_norwegian']),
            language=or_none(row['lang']),
            url=or_none(row['url']),
            ISSNp=or_none(row['ISSN-print']),
            ISSNe=or_none(row['ISSN-electronic']),
            is_oa=truthy(row['is_oa']),
            is_kept=truthy(row['is_kept']),
        )
        return fatcat_client.ContainerEntity(
            issnl=issnl,
            name=name,
            publisher=or_none(row['publisher']),
            abbrev=None,
            coden=None,
            extra=extra)

    def create_row(self, row, editgroup=None):
        """Create a single container entity from one CSV row."""
        ce = self.parse_issn_row(row)
        if ce is None:
            return
        self.api.create_container(ce, editgroup=editgroup)
        self.insert_count = self.insert_count + 1

    def create_batch(self, batch, editgroup=None):
        """Reads and processes in batches (not API-call-per-line)"""
        containers = []
        for line in batch:
            if line is None:
                continue
            ce = self.parse_issn_row(line)
            if ce is not None:
                containers.append(ce)
        self.api.create_container_batch(containers, autoaccept="true", editgroup=editgroup)
        self.insert_count = self.insert_count + len(containers)
diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py
new file mode 100644
index 00000000..6270fe88
--- /dev/null
+++ b/python/fatcat_tools/importers/matched.py
@@ -0,0 +1,144 @@
+
+import sys
+import json
+import sqlite3
+import itertools
+import fatcat_client
+from fatcat_tools.importers.common import FatcatImporter
+
+#row = row.split('\t')
+#assert len(row) == 2
+#sha1 = row[0].replace('sha1:')
+#sha1 = base64.b16encode(base64.b32decode(sha1)).lower()
+#print(sha1)
+#dois = [d.lower() for d in json.loads(row[1])]
+
class FatcatMatchedImporter(FatcatImporter):
    """
    Importer for JSON lines matching files (by hash) to releases (by DOI).

    Input format is JSON with keys:
    - dois (list)
    - sha1 (hex)
    - md5 (hex)
    - sha256 (hex)
    - size (int)
    - cdx (list of objects)
        - dt
        - url
    - mimetype
    - urls (list of strings... or objects?)

    Future handlings/extensions:
    - core_id, wikidata_id, pmcid, pmid: not as lists
    """

    def __init__(self, host_url, skip_file_update=False, default_mime=None,
            default_link_rel="web"):
        super().__init__(host_url)
        # mimetype to assume when neither input nor existing entity has one
        self.default_mime = default_mime
        self.default_link_rel = default_link_rel
        # when True, never look up existing file entities (always create)
        self.skip_file_update = skip_file_update

    def make_url(self, raw):
        """Wrap a raw URL in a FileEntityUrls, classifying the link rel."""
        rel = self.default_link_rel
        # TODO: this is where we could map specific domains to rel types,
        # and also filter out bad domains, invalid URLs, etc
        if "//archive.org/" in raw or "//arxiv.org/" in raw:
            # TODO: special-case the arxiv.org bulk mirror?
            rel = "repository"
        elif "//web.archive.org/" in raw or "//archive.is/" in raw:
            rel = "webarchive"
        return fatcat_client.FileEntityUrls(url=raw, rel=rel)

    def parse_matched_dict(self, obj):
        """Convert one matched JSON object into a FileEntity (new or updated).

        Returns None when there is nothing to do: no DOI resolved, or the
        existing file already lists exactly the resolved releases.
        """
        sha1 = obj['sha1']
        dois = [d.lower() for d in obj.get('dois', [])]

        # lookup sha1, or create new entity
        fe = None
        if not self.skip_file_update:
            try:
                fe = self.api.lookup_file(sha1=sha1)
            except fatcat_client.rest.ApiException as err:
                if err.status != 404:
                    raise err
        if fe is None:
            fe = fatcat_client.FileEntity(
                sha1=sha1,
                releases=[],
                urls=[],
            )

        # lookup dois
        re_list = set()
        for doi in dois:
            try:
                re = self.api.lookup_release(doi=doi)
            except fatcat_client.rest.ApiException as err:
                if err.status != 404:
                    raise err
                re = None
            if re is None:
                print("DOI not found: {}".format(doi))
            else:
                re_list.add(re.ident)
        if len(re_list) == 0:
            return None
        # BUGFIX: compare as sets; the old `fe.releases == set(re_list)`
        # compared a list to a set (always False), so this no-change
        # short-circuit never fired
        if set(fe.releases) == re_list:
            return None
        re_list.update(fe.releases)
        fe.releases = list(re_list)

        # parse URLs and CDX
        existing_urls = [feu.url for feu in fe.urls]
        for url in obj.get('url', []):
            if url not in existing_urls:
                url = self.make_url(url)
                if url is not None:
                    fe.urls.append(url)
        for cdx in obj.get('cdx', []):
            original = cdx['url']
            wayback = "https://web.archive.org/web/{}/{}".format(
                cdx['dt'],
                original)
            if wayback not in existing_urls:
                fe.urls.append(
                    fatcat_client.FileEntityUrls(url=wayback, rel="webarchive"))
            if original not in existing_urls:
                url = self.make_url(original)
                if url is not None:
                    fe.urls.append(url)

        if obj.get('size') is not None:
            fe.size = int(obj['size'])
        fe.sha256 = obj.get('sha256', fe.sha256)
        # BUGFIX: default was fe.sha256 (copy/paste); keep the existing md5
        fe.md5 = obj.get('md5', fe.md5)
        if obj.get('mimetype') is None:
            if fe.mimetype is None:
                fe.mimetype = self.default_mime
        else:
            fe.mimetype = obj.get('mimetype')
        return fe

    def create_row(self, row, editgroup=None):
        """Insert or update the file entity for one JSON line."""
        obj = json.loads(row)
        fe = self.parse_matched_dict(obj)
        if fe is not None:
            if fe.ident is None:
                self.api.create_file(fe, editgroup=editgroup)
                self.insert_count = self.insert_count + 1
            else:
                self.api.update_file(fe.ident, fe, editgroup=editgroup)
                self.update_count = self.update_count + 1

    def create_batch(self, batch, editgroup=None):
        """Reads and processes in batches (not API-call-per-line)"""
        objects = [self.parse_matched_dict(json.loads(l))
            for l in batch if l is not None]
        # updates must still go one call at a time; only creates batch
        new_objects = [o for o in objects if o is not None and o.ident is None]
        update_objects = [o for o in objects if o is not None and o.ident is not None]
        for obj in update_objects:
            self.api.update_file(obj.ident, obj, editgroup=editgroup)
        if len(new_objects) > 0:
            self.api.create_file_batch(new_objects, autoaccept="true", editgroup=editgroup)
        self.update_count = self.update_count + len(update_objects)
        self.insert_count = self.insert_count + len(new_objects)
diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py
new file mode 100644
index 00000000..350c4c57
--- /dev/null
+++ b/python/fatcat_tools/importers/orcid.py
@@ -0,0 +1,73 @@
+
+import sys
+import json
+import itertools
+import fatcat_client
+from fatcat_tools.importers.common import FatcatImporter
+
def value_or_none(e):
    """Normalize an ORCID JSON name field to a plain string or None.

    Accepts either a {"value": ...} dict or a bare string; empty strings
    and un-encodable (surrogate-containing) strings become None.
    """
    if isinstance(e, dict):
        e = e.get('value')
    if isinstance(e, str) and len(e) == 0:
        e = None
    # TODO: this is probably bogus; patched in desperation; remove?
    if e:
        try:
            e.encode()
        except UnicodeEncodeError:
            # Invalid JSON?
            print("BAD UNICODE")
            return None
    return e
+
class FatcatOrcidImporter(FatcatImporter):
    """Imports creator entities from ORCID public-data JSON records."""

    def parse_orcid_dict(self, obj):
        """
        obj is a python dict (parsed from json).
        returns a CreatorEntity, or None when the record is unusable
        """
        name = obj['person']['name']
        if name is None:
            return None
        given = value_or_none(name.get('given-names'))
        surname = value_or_none(name.get('family-name'))
        display = value_or_none(name.get('credit-name'))
        if display is None:
            # TODO: sorry human beings
            if given and surname:
                display = "{} {}".format(given, surname)
            else:
                display = surname or given
            if display is None:
                # must have *some* name
                return None
        orcid = obj['orcid-identifier']['path']
        if not self.is_orcid(orcid):
            sys.stderr.write("Bad ORCID: {}\n".format(orcid))
            return None
        return fatcat_client.CreatorEntity(
            orcid=orcid,
            given_name=given,
            surname=surname,
            display_name=display,
            extra=None)

    def create_row(self, row, editgroup=None):
        """Create a single creator entity from one JSON line."""
        ce = self.parse_orcid_dict(json.loads(row))
        if ce is None:
            return
        self.api.create_creator(ce, editgroup=editgroup)
        self.insert_count = self.insert_count + 1

    def create_batch(self, batch, editgroup=None):
        """Reads and processes in batches (not API-call-per-line)"""
        creators = []
        for line in batch:
            if line is None:
                continue
            ce = self.parse_orcid_dict(json.loads(line))
            if ce is not None:
                creators.append(ce)
        self.api.create_creator_batch(creators, autoaccept="true", editgroup=editgroup)
        self.insert_count = self.insert_count + len(creators)