summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2018-06-21 09:53:30 -0700
committerBryan Newbold <bnewbold@robocracy.org>2018-06-21 09:53:30 -0700
commita1d94e6c28b080158fd65d0ec54ff6d64451df97 (patch)
tree2c0b272ea5fdb19d0e97235b52d28ccfe5fb99d4
parent7ebe32a290aa160c43671c21e968d1614a878f02 (diff)
downloadfatcat-a1d94e6c28b080158fd65d0ec54ff6d64451df97.tar.gz
fatcat-a1d94e6c28b080158fd65d0ec54ff6d64451df97.zip
manifest importer
-rwxr-xr-xpython/client.py10
-rw-r--r--python/fatcat/importer_common.py17
-rw-r--r--python/fatcat/manifest_importer.py77
3 files changed, 100 insertions, 4 deletions
diff --git a/python/client.py b/python/client.py
index 9a2ed50d..ca6af603 100755
--- a/python/client.py
+++ b/python/client.py
@@ -3,7 +3,9 @@
import sys
import argparse
from fatcat.raw_api_client import RawFatcatApiClient
+from fatcat.crossref_importer import FatcatCrossrefImporter
from fatcat.orcid_importer import FatcatOrcidImporter
+from fatcat.manifest_importer import FatcatManifestImporter
def run_import_crossref(args):
fcc = FatcatCrossrefClient(args.host_url)
@@ -53,12 +55,12 @@ def main():
help="size of batch to send",
default=50, type=int)
- sub_import_manifest = subparsers.add_parser('import-orcid')
- sub_import_manifest.set_defaults(func=run_import_orcid)
+ sub_import_manifest = subparsers.add_parser('import-manifest')
+ sub_import_manifest.set_defaults(func=run_import_manifest)
sub_import_manifest.add_argument('db_path',
help="sqlite3 database to import from",
- type=argparse.FileType('r'))
- sub_import_orcid.add_argument('--batch-size',
+ type=str)
+ sub_import_manifest.add_argument('--batch-size',
help="size of batch to send",
default=50, type=int)
diff --git a/python/fatcat/importer_common.py b/python/fatcat/importer_common.py
index 5bdb61b4..f8638418 100644
--- a/python/fatcat/importer_common.py
+++ b/python/fatcat/importer_common.py
@@ -19,6 +19,7 @@ class FatcatImporter:
self.api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf))
self._issnl_id_map = dict()
self._orcid_id_map = dict()
+ self._doi_id_map = dict()
self._issn_issnl_map = None
if issn_map_file:
self.read_issn_map_file(issn_map_file)
@@ -71,6 +72,22 @@ class FatcatImporter:
self._orcid_id_map[orcid] = creator_id # might be None
return creator_id
+ def lookup_doi(self, doi):
+ """Caches calls to the doi lookup API endpoint in a local dict"""
+ assert doi.startswith('10.')
+ doi = doi.lower()
+ if doi in self._doi_id_map:
+ return self._doi_id_map[doi]
+ release_id = None
+ try:
+ rv = self.api.lookup_release(doi=doi)
+ release_id = rv.ident
+ except ApiException as ae:
+ # If anything other than a 404 (not found), something is wrong
+ assert ae.status == 404
+ self._doi_id_map[doi] = release_id # might be None
+ return release_id
+
def read_issn_map_file(self, issn_map_file):
self._issn_issnl_map = dict()
for line in issn_map_file:
diff --git a/python/fatcat/manifest_importer.py b/python/fatcat/manifest_importer.py
new file mode 100644
index 00000000..47ebb020
--- /dev/null
+++ b/python/fatcat/manifest_importer.py
@@ -0,0 +1,77 @@
+
+import sys
+import json
+import sqlite3
+import itertools
+import fatcat_client
+from fatcat.importer_common import FatcatImporter
+
+
# Joined manifest query: one row per (file, DOI, URL) combination. Ordered by
# sha1 so that duplicate rows for the same file arrive consecutively and the
# importer can skip repeats by remembering the last sha1 seen.
QUERY = "SELECT files_metadata.sha1, files_metadata.mimetype, files_metadata.size_bytes, files_metadata.md5, files_id_doi.doi, urls.url, urls.datetime from files_metadata JOIN files_id_doi ON files_metadata.sha1 = files_id_doi.sha1 JOIN urls ON files_metadata.sha1 = urls.sha1 ORDER BY files_metadata.sha1"
+
class FatcatManifestImporter(FatcatImporter):
    """Imports file entities from a "manifest" sqlite3 database into fatcat.

    The manifest database maps file hashes (sha1/md5) to mimetype, size,
    DOI, and URL; see QUERY for the exact join. Entities are created through
    the fatcat API in editgroup batches.
    """

    def parse_manifest_row(self, row):
        """Convert one manifest row into a FileEntity.

        row is a 7-tuple as produced by QUERY:
        (sha1, mimetype, size_bytes, md5, doi, url, datetime)

        Returns a fatcat_client.FileEntity, or None when the row has no URL
        (a file nobody can fetch is not worth importing).
        """
        (sha1, mimetype, size_bytes, md5, doi, url, datetime) = row

        if url is None:
            return None
        release_ids = None
        if doi is not None:
            # lookup_doi caches lookups; returns None if the DOI is unknown
            release_id = self.lookup_doi(doi.lower())
            if release_id:
                release_ids = [release_id,]
        extra = None
        fe = fatcat_client.FileEntity(
            sha1=sha1,
            mimetype=mimetype,
            size=size_bytes,
            md5=md5,
            url=url,
            releases=release_ids,
            extra=extra)
        return fe

    def create_entity(self, entity, editgroup_id=None):
        """Create a single file entity under the given editgroup.

        Passing entity=None is a no-op (parse_manifest_row may return None).
        """
        if entity is not None:
            entity.editgroup_id = editgroup_id
            self.api.create_file(entity)

    def process_db(self, db_path, size=100):
        """Iterate the manifest database, inserting file entities in batches.

        db_path: path to the sqlite3 manifest database
        size: number of created entities per editgroup before acceptance
        """
        # TODO: multiple DOIs per sha1
        # TODO: multiple URLs per sha1 (with schema change)
        # TODO: a test!

        db = sqlite3.connect(db_path)
        last_sha1 = None

        print("Counting rows...")
        total_count = int(list(db.execute("SELECT COUNT(*) FROM files_metadata;"))[0][0])
        print("{} rows to process".format(total_count))

        eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1))
        inserted = 0
        # Count of entities created in the current (not-yet-accepted)
        # editgroup. Tracking this explicitly fixes two defects in the
        # original modulo-based batching: each accepted batch held size+1
        # entities, and when the total was an exact multiple of `size` the
        # final editgroup was never accepted at all.
        pending = 0
        for row_count, row in enumerate(db.execute(QUERY)):
            # QUERY is ordered by sha1, so duplicate rows for the same file
            # arrive consecutively and can be skipped here.
            if row[0] == last_sha1:
                continue
            last_sha1 = row[0]
            fe = self.parse_manifest_row(row)
            if fe is None:
                continue
            self.create_entity(fe, editgroup_id=eg.id)
            inserted += 1
            pending += 1
            if pending >= size:
                # Batch is full: accept it and start a fresh editgroup
                self.api.accept_editgroup(eg.id)
                eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1))
                pending = 0
                print("Finished a batch; row {} of {} ({:.2f}%).\tTotal inserted: {}".format(
                    row_count, total_count, 100.0*row_count/total_count, inserted))
        if pending > 0:
            # Accept whatever is left in the final, partial batch (never
            # accept an empty editgroup)
            self.api.accept_editgroup(eg.id)
        print("Done! Inserted {}".format(inserted))