summary refs log tree commit diff stats
path: root/python/fatcat/manifest_importer.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat/manifest_importer.py')
-rw-r--r-- python/fatcat/manifest_importer.py 89
1 files changed, 0 insertions, 89 deletions
diff --git a/python/fatcat/manifest_importer.py b/python/fatcat/manifest_importer.py
deleted file mode 100644
index 3b0b3815..00000000
--- a/python/fatcat/manifest_importer.py
+++ /dev/null
@@ -1,89 +0,0 @@
-
-import sys
-import json
-import sqlite3
-import itertools
-import fatcat_client
-from fatcat.importer_common import FatcatImporter
-
-
-QUERY = "SELECT files_metadata.sha1, files_metadata.mimetype, files_metadata.size_bytes, files_metadata.md5, files_id_doi.doi, urls.url, urls.datetime from files_metadata JOIN files_id_doi ON files_metadata.sha1 = files_id_doi.sha1 JOIN urls ON files_metadata.sha1 = urls.sha1 ORDER BY files_metadata.sha1"
-
-class FatcatManifestImporter(FatcatImporter):
-
- def parse_manifest_row(self, row):
- """
- obj is a python dict (parsed from json).
- returns a CreatorEntity
- """
- (sha1, mimetype, size_bytes, md5, doi, url, datetime) = row
-
- if url is None:
- return None
- release_ids = None
- if doi is not None:
- release_id = self.lookup_doi(doi.lower())
- if release_id:
- release_ids = [release_id,]
- if datetime is None:
- datetime = "1"
- urls = []
- if "//archive.org/" in url or "//arxiv.org/" in url:
- # TODO: special-case the arxiv.org bulk mirror?
- urls.append(fatcat_client.FileEntityUrls(url=url, rel="repository"))
- elif "//web.archive.org/" in url or "//archive.is/" in url:
- urls.append(fatcat_client.FileEntityUrls(url=url, rel="webarchive"))
- else:
- urls.append(fatcat_client.FileEntityUrls(url=url, rel="web"))
- urls.append(fatcat_client.FileEntityUrls(
- url="https://web.archive.org/web/{}/{}".format(datetime, url),
- rel="webarchive"))
-
- extra = None
- fe = fatcat_client.FileEntity(
- sha1=sha1,
- mimetype=mimetype,
- size=size_bytes,
- md5=md5,
- urls=urls,
- releases=release_ids,
- extra=extra)
- return fe
-
- def create_entity(self, entity, editgroup=None):
- if entity is not None:
- self.api.create_file(entity, editgroup=editgroup)
-
- def process_db(self, db_path, size=100):
- # TODO: multiple DOIs per sha1
- # TODO: multiple URLs per sha1 (with schema change)
-
- db = sqlite3.connect(db_path)
- last_sha1 = None
-
- print("Counting rows...")
- total_count = int(list(db.execute("SELECT COUNT(*) FROM files_metadata;"))[0][0])
- print("{} rows to process".format(total_count))
-
- eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id="aaaaaaaaaaaabkvkaaaaaaaaae"))
- i = 0
- j = -1
- for row in db.execute(QUERY):
- j = j+1
- if row[0] == last_sha1:
- continue
- else:
- last_sha1 = row[0]
- fe = self.parse_manifest_row(row)
- if fe is None:
- continue
- self.create_entity(fe, editgroup=eg.id)
- if i > 0 and (i % size) == 0:
- self.api.accept_editgroup(eg.id)
- eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id="aaaaaaaaaaaabkvkaaaaaaaaae"))
- print("Finished a batch; row {} of {} ({:.2f}%).\tTotal inserted: {}".format(
- j, total_count, 100.0*j/total_count, i))
- i = i + 1
- if i == 0 or (i % size) != 0:
- self.api.accept_editgroup(eg.id)
- print("Done! Inserted {}".format(i))