From a1d94e6c28b080158fd65d0ec54ff6d64451df97 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Thu, 21 Jun 2018 09:53:30 -0700
Subject: manifest importer

---
 python/client.py                   | 10 +++--
 python/fatcat/importer_common.py   | 17 ++++++++
 python/fatcat/manifest_importer.py | 88 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 111 insertions(+), 4 deletions(-)
 create mode 100644 python/fatcat/manifest_importer.py

diff --git a/python/client.py b/python/client.py
index 9a2ed50d..ca6af603 100755
--- a/python/client.py
+++ b/python/client.py
@@ -3,7 +3,9 @@
 import sys
 import argparse
 from fatcat.raw_api_client import RawFatcatApiClient
+from fatcat.crossref_importer import FatcatCrossrefImporter
 from fatcat.orcid_importer import FatcatOrcidImporter
+from fatcat.manifest_importer import FatcatManifestImporter
 
 def run_import_crossref(args):
     fcc = FatcatCrossrefClient(args.host_url)
@@ -53,12 +55,12 @@ def main():
         help="size of batch to send",
         default=50,
         type=int)
-    sub_import_manifest = subparsers.add_parser('import-orcid')
-    sub_import_manifest.set_defaults(func=run_import_orcid)
+    sub_import_manifest = subparsers.add_parser('import-manifest')
+    sub_import_manifest.set_defaults(func=run_import_manifest)
     sub_import_manifest.add_argument('db_path',
         help="sqlite3 database to import from",
-        type=argparse.FileType('r'))
-    sub_import_orcid.add_argument('--batch-size',
+        type=str)
+    sub_import_manifest.add_argument('--batch-size',
         help="size of batch to send",
         default=50,
         type=int)
diff --git a/python/fatcat/importer_common.py b/python/fatcat/importer_common.py
index 5bdb61b4..f8638418 100644
--- a/python/fatcat/importer_common.py
+++ b/python/fatcat/importer_common.py
@@ -19,6 +19,7 @@ class FatcatImporter:
         self.api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf))
         self._issnl_id_map = dict()
         self._orcid_id_map = dict()
+        self._doi_id_map = dict()
         self._issn_issnl_map = None
         if issn_map_file:
             self.read_issn_map_file(issn_map_file)
@@ -71,6 +72,22 @@ class FatcatImporter:
         self._orcid_id_map[orcid] = creator_id # might be None
         return creator_id
 
+    def lookup_doi(self, doi):
+        """Caches calls to the doi lookup API endpoint in a local dict"""
+        assert doi.startswith('10.')
+        doi = doi.lower()
+        if doi in self._doi_id_map:
+            return self._doi_id_map[doi]
+        release_id = None
+        try:
+            rv = self.api.lookup_release(doi=doi)
+            release_id = rv.ident
+        except ApiException as ae:
+            # If anything other than a 404 (not found), something is wrong
+            assert ae.status == 404
+        self._doi_id_map[doi] = release_id # might be None
+        return release_id
+
     def read_issn_map_file(self, issn_map_file):
         self._issn_issnl_map = dict()
         for line in issn_map_file:
diff --git a/python/fatcat/manifest_importer.py b/python/fatcat/manifest_importer.py
new file mode 100644
index 00000000..47ebb020
--- /dev/null
+++ b/python/fatcat/manifest_importer.py
@@ -0,0 +1,88 @@
+
+import sys
+import json
+import sqlite3
+import itertools
+import fatcat_client
+from fatcat.importer_common import FatcatImporter
+
+
+QUERY = "SELECT files_metadata.sha1, files_metadata.mimetype, files_metadata.size_bytes, files_metadata.md5, files_id_doi.doi, urls.url, urls.datetime from files_metadata JOIN files_id_doi ON files_metadata.sha1 = files_id_doi.sha1 JOIN urls ON files_metadata.sha1 = urls.sha1 ORDER BY files_metadata.sha1"
+
+class FatcatManifestImporter(FatcatImporter):
+
+    def parse_manifest_row(self, row):
+        """
+        row is a 7-tuple in QUERY column order: (sha1, mimetype, size_bytes,
+        md5, doi, url, datetime). datetime is unpacked but not used.
+
+        returns a FileEntity, or None if the row has no url
+        """
+        (sha1, mimetype, size_bytes, md5, doi, url, datetime) = row
+
+        if url is None:
+            return None
+        release_ids = None
+        if doi is not None:
+            release_id = self.lookup_doi(doi.lower())
+            if release_id:
+                release_ids = [release_id,]
+        extra = None
+        fe = fatcat_client.FileEntity(
+            sha1=sha1,
+            mimetype=mimetype,
+            size=size_bytes,
+            md5=md5,
+            url=url,
+            releases=release_ids,
+            extra=extra)
+        return fe
+
+    def create_entity(self, entity, editgroup_id=None):
+        """Sets editgroup_id on the entity and creates it; no-op if entity is None"""
+        if entity is not None:
+            entity.editgroup_id = editgroup_id
+            self.api.create_file(entity)
+
+    def process_db(self, db_path, size=100):
+        """
+        Walks QUERY over the sqlite3 database at db_path and creates a file
+        entity from the first row of each sha1 (rows are ordered by sha1;
+        rows without a url are skipped). An editgroup is accepted after
+        every `size` created entities.
+        """
+        # TODO: multiple DOIs per sha1
+        # TODO: multiple URLs per sha1 (with schema change)
+        # TODO: a test!
+
+        db = sqlite3.connect(db_path)
+        last_sha1 = None
+
+        print("Counting rows...")
+        total_count = int(list(db.execute("SELECT COUNT(*) FROM files_metadata;"))[0][0])
+        print("{} rows to process".format(total_count))
+
+        eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1))
+        i = 0
+        j = -1
+        for row in db.execute(QUERY):
+            j = j+1
+            if row[0] == last_sha1:
+                continue
+            else:
+                last_sha1 = row[0]
+            fe = self.parse_manifest_row(row)
+            if fe is None:
+                continue
+            self.create_entity(fe, editgroup_id=eg.id)
+            # Count the insert before the batch check, so that a batch holds
+            # exactly `size` entities and the final check below never skips
+            # an editgroup that still has pending edits
+            i = i + 1
+            if (i % size) == 0:
+                self.api.accept_editgroup(eg.id)
+                eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1))
+                print("Finished a batch; row {} of {} ({:.2f}%).\tTotal inserted: {}".format(
+                    j, total_count, 100.0*j/total_count, i))
+        db.close()
+        if i == 0 or (i % size) != 0:
+            self.api.accept_editgroup(eg.id)
+        print("Done! Inserted {}".format(i))
-- 
cgit v1.2.3