Diffstat (limited to 'python')
-rwxr-xr-x  python/client.py                    | 10
-rw-r--r--  python/fatcat/importer_common.py    | 17
-rw-r--r--  python/fatcat/manifest_importer.py  | 77
3 files changed, 100 insertions, 4 deletions
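
In brief, this change wires up a real import-manifest subcommand in client.py (the old copy-paste had kept the 'import-orcid' parser name and argument wiring), adds a cached DOI-to-release lookup to the shared FatcatImporter base class, and introduces FatcatManifestImporter, which walks a file-manifest sqlite3 database and creates FileEntity records in batched editgroups. Assuming the importer constructor accepts the API host URL the way client.py passes it to the other importers (the constructor is not shown in this diff), the new command amounts to roughly the following sketch; "manifest.db" and the host URL are placeholder values:

    # Rough equivalent of: ./client.py import-manifest manifest.db --batch-size 50
    # Constructor signature and host URL are assumptions, not shown in this diff.
    from fatcat.manifest_importer import FatcatManifestImporter

    importer = FatcatManifestImporter("http://localhost:9411/v0")
    importer.process_db("manifest.db", size=50)  # --batch-size maps to size=
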
diff --git a/python/client.py b/python/client.py
index 9a2ed50d..ca6af603 100755
--- a/python/client.py
+++ b/python/client.py
@@ -3,7 +3,9 @@
 import sys
 import argparse
 from fatcat.raw_api_client import RawFatcatApiClient
+from fatcat.crossref_importer import FatcatCrossrefImporter
 from fatcat.orcid_importer import FatcatOrcidImporter
+from fatcat.manifest_importer import FatcatManifestImporter
 
 def run_import_crossref(args):
     fcc = FatcatCrossrefClient(args.host_url)
@@ -53,12 +55,12 @@ def main():
         help="size of batch to send",
         default=50, type=int)
 
-    sub_import_manifest = subparsers.add_parser('import-orcid')
-    sub_import_manifest.set_defaults(func=run_import_orcid)
+    sub_import_manifest = subparsers.add_parser('import-manifest')
+    sub_import_manifest.set_defaults(func=run_import_manifest)
     sub_import_manifest.add_argument('db_path',
         help="sqlite3 database to import from",
-        type=argparse.FileType('r'))
-    sub_import_orcid.add_argument('--batch-size',
+        type=str)
+    sub_import_manifest.add_argument('--batch-size',
         help="size of batch to send",
         default=50, type=int)
 
diff --git a/python/fatcat/importer_common.py b/python/fatcat/importer_common.py
index 5bdb61b4..f8638418 100644
--- a/python/fatcat/importer_common.py
+++ b/python/fatcat/importer_common.py
@@ -19,6 +19,7 @@ class FatcatImporter:
         self.api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf))
         self._issnl_id_map = dict()
         self._orcid_id_map = dict()
+        self._doi_id_map = dict()
         self._issn_issnl_map = None
         if issn_map_file:
             self.read_issn_map_file(issn_map_file)
@@ -71,6 +72,22 @@ class FatcatImporter:
         self._orcid_id_map[orcid] = creator_id # might be None
         return creator_id
 
+    def lookup_doi(self, doi):
+        """Caches calls to the DOI lookup API endpoint in a local dict"""
+        assert doi.startswith('10.')
+        doi = doi.lower()
+        if doi in self._doi_id_map:
+            return self._doi_id_map[doi]
+        release_id = None
+        try:
+            rv = self.api.lookup_release(doi=doi)
+            release_id = rv.ident
+        except ApiException as ae:
+            # If anything other than a 404 (not found), something is wrong
+            assert ae.status == 404
+        self._doi_id_map[doi] = release_id # might be None
+        return release_id
+
     def read_issn_map_file(self, issn_map_file):
         self._issn_issnl_map = dict()
         for line in issn_map_file:
diff --git a/python/fatcat/manifest_importer.py b/python/fatcat/manifest_importer.py
new file mode 100644
index 00000000..47ebb020
--- /dev/null
+++ b/python/fatcat/manifest_importer.py
@@ -0,0 +1,77 @@
+
+import sys
+import json
+import sqlite3
+import itertools
+import fatcat_client
+from fatcat.importer_common import FatcatImporter
+
+
+QUERY = "SELECT files_metadata.sha1, files_metadata.mimetype, files_metadata.size_bytes, files_metadata.md5, files_id_doi.doi, urls.url, urls.datetime FROM files_metadata JOIN files_id_doi ON files_metadata.sha1 = files_id_doi.sha1 JOIN urls ON files_metadata.sha1 = urls.sha1 ORDER BY files_metadata.sha1"
+
+class FatcatManifestImporter(FatcatImporter):
+
+    def parse_manifest_row(self, row):
+        """
+        Takes a single row (tuple) from the manifest database and returns
+        a FileEntity, or None if the row can't be imported.
+        """
+        (sha1, mimetype, size_bytes, md5, doi, url, datetime) = row
+
+        if url is None:
+            return None
+        release_ids = None
+        if doi is not None:
+            release_id = self.lookup_doi(doi.lower())
+            if release_id:
+                release_ids = [release_id,]
+        extra = None
+        fe = fatcat_client.FileEntity(
+            sha1=sha1,
+            mimetype=mimetype,
+            size=size_bytes,
+            md5=md5,
+            url=url,
+            releases=release_ids,
+            extra=extra)
+        return fe
+
+    def create_entity(self, entity, editgroup_id=None):
+        if entity is not None:
+            entity.editgroup_id = editgroup_id
+            self.api.create_file(entity)
+
+    def process_db(self, db_path, size=100):
+        # TODO: multiple DOIs per sha1
+        # TODO: multiple URLs per sha1 (with schema change)
+        # TODO: a test!
+
+        db = sqlite3.connect(db_path)
+        last_sha1 = None
+
+        print("Counting rows...")
+        total_count = int(list(db.execute("SELECT COUNT(*) FROM files_metadata;"))[0][0])
+        print("{} rows to process".format(total_count))
+
+        eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1))
+        i = 0
+        j = -1
+        for row in db.execute(QUERY):
+            j = j + 1
+            if row[0] == last_sha1:
+                continue
+            else:
+                last_sha1 = row[0]
+            fe = self.parse_manifest_row(row)
+            if fe is None:
+                continue
+            self.create_entity(fe, editgroup_id=eg.id)
+            if i > 0 and (i % size) == 0:
+                self.api.accept_editgroup(eg.id)
+                eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1))
+                print("Finished a batch; row {} of {} ({:.2f}%).\tTotal inserted: {}".format(
+                    j, total_count, 100.0*j/total_count, i))
+            i = i + 1
+        if i == 0 or (i % size) != 0:
+            self.api.accept_editgroup(eg.id)
+        print("Done! Inserted {}".format(i))
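
The QUERY string above implies a three-table manifest schema keyed on sha1. Here is a minimal runnable sketch of that schema against an in-memory database; the column types are guesses, and only the table and column names come from the query itself (QUERY can be imported directly if the fatcat package is on the path):

    # Sketch of the manifest schema implied by QUERY; column types are assumptions.
    import sqlite3
    from fatcat.manifest_importer import QUERY

    db = sqlite3.connect(":memory:")
    db.executescript("""
        CREATE TABLE files_metadata (sha1 TEXT PRIMARY KEY, mimetype TEXT,
                                     size_bytes INTEGER, md5 TEXT);
        CREATE TABLE files_id_doi (sha1 TEXT, doi TEXT);
        CREATE TABLE urls (sha1 TEXT, url TEXT, datetime TEXT);
    """)
    db.execute("INSERT INTO files_metadata VALUES (?, ?, ?, ?)",
               ("sha1hex", "application/pdf", 12345, "md5hex"))
    db.execute("INSERT INTO files_id_doi VALUES (?, ?)", ("sha1hex", "10.123/abc"))
    db.execute("INSERT INTO urls VALUES (?, ?, ?)",
               ("sha1hex", "https://example.com/paper.pdf", "2018-06-25"))

    for row in db.execute(QUERY):
        # Unpacks exactly as parse_manifest_row() expects:
        # (sha1, mimetype, size_bytes, md5, doi, url, datetime)
        print(row)
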

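One detail worth noting in the importer_common.py change: lookup_doi() caches negative results as well, storing None after a 404 so that a DOI missing from the catalog is queried only once however many manifest rows it appears on. The same pattern, reduced to a self-contained sketch (the fetch callable here is a stand-in for the API client):

    # Self-contained version of the caching pattern in lookup_doi(); fetch
    # stands in for the API client and raises KeyError where the real code
    # catches a 404 ApiException.
    _doi_id_map = {}

    def cached_lookup_doi(doi, fetch):
        assert doi.startswith('10.')
        doi = doi.lower()
        if doi in _doi_id_map:
            return _doi_id_map[doi]
        try:
            release_id = fetch(doi)
        except KeyError:
            release_id = None  # not found: cache the miss as well
        _doi_id_map[doi] = release_id  # might be None
        return release_id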