author    Bryan Newbold <bnewbold@robocracy.org>  2018-09-14 16:53:28 -0700
committer Bryan Newbold <bnewbold@robocracy.org>  2018-09-14 16:53:28 -0700
commit    03d7c929e1b415cbd612d612b9b1c9725f5690bb (patch)
tree      40f65416f650f062243e524ff46a06f7b79d44b4 /python
parent    32ab9f040b313ce421620a2df71332e24c425cfc (diff)
switch manifest importer to be json-based
Diffstat (limited to 'python')
-rw-r--r--  python/fatcat/manifest_importer.py       89
-rw-r--r--  python/fatcat/matched_importer.py        13
-rwxr-xr-x  python/fatcat_import.py                  18
-rw-r--r--  python/tests/files/example_matched.json   6
-rw-r--r--  python/tests/manifest.py                 34
5 files changed, 14 insertions(+), 146 deletions(-)
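
Note: after this change the importer reads newline-delimited JSON instead of a sqlite3 database. As a minimal sketch (field names taken from the tests/files/example_matched.json fixture further down; "cdx" is now a list of capture objects rather than a single object), one record parses like so:

    import json

    # one record per line, as in tests/files/example_matched.json
    line = '{"dois": ["10.456/1231123"], "sha1": "000000000000258bdfdb151943419437f440c313", "size": 123, "cdx": [{"dt": "20000000164644", "url": "http://no-plos.org/plosme"}], "mimetype": "application/txt"}'
    obj = json.loads(line)
    for cdx in obj.get("cdx", []):
        # same wayback URL construction the importer uses
        wayback = "https://web.archive.org/web/{}/{}".format(cdx["dt"], cdx["url"])
        print(wayback)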
diff --git a/python/fatcat/manifest_importer.py b/python/fatcat/manifest_importer.py
deleted file mode 100644
index 3b0b3815..00000000
--- a/python/fatcat/manifest_importer.py
+++ /dev/null
@@ -1,89 +0,0 @@
-
-import sys
-import json
-import sqlite3
-import itertools
-import fatcat_client
-from fatcat.importer_common import FatcatImporter
-
-
-QUERY = "SELECT files_metadata.sha1, files_metadata.mimetype, files_metadata.size_bytes, files_metadata.md5, files_id_doi.doi, urls.url, urls.datetime from files_metadata JOIN files_id_doi ON files_metadata.sha1 = files_id_doi.sha1 JOIN urls ON files_metadata.sha1 = urls.sha1 ORDER BY files_metadata.sha1"
-
-class FatcatManifestImporter(FatcatImporter):
-
-    def parse_manifest_row(self, row):
-        """
-        obj is a python dict (parsed from json).
-        returns a CreatorEntity
-        """
-        (sha1, mimetype, size_bytes, md5, doi, url, datetime) = row
-
-        if url is None:
-            return None
-        release_ids = None
-        if doi is not None:
-            release_id = self.lookup_doi(doi.lower())
-            if release_id:
-                release_ids = [release_id,]
-        if datetime is None:
-            datetime = "1"
-        urls = []
-        if "//archive.org/" in url or "//arxiv.org/" in url:
-            # TODO: special-case the arxiv.org bulk mirror?
-            urls.append(fatcat_client.FileEntityUrls(url=url, rel="repository"))
-        elif "//web.archive.org/" in url or "//archive.is/" in url:
-            urls.append(fatcat_client.FileEntityUrls(url=url, rel="webarchive"))
-        else:
-            urls.append(fatcat_client.FileEntityUrls(url=url, rel="web"))
-            urls.append(fatcat_client.FileEntityUrls(
-                url="https://web.archive.org/web/{}/{}".format(datetime, url),
-                rel="webarchive"))
-
-        extra = None
-        fe = fatcat_client.FileEntity(
-            sha1=sha1,
-            mimetype=mimetype,
-            size=size_bytes,
-            md5=md5,
-            urls=urls,
-            releases=release_ids,
-            extra=extra)
-        return fe
-
-    def create_entity(self, entity, editgroup=None):
-        if entity is not None:
-            self.api.create_file(entity, editgroup=editgroup)
-
-    def process_db(self, db_path, size=100):
-        # TODO: multiple DOIs per sha1
-        # TODO: multiple URLs per sha1 (with schema change)
-
-        db = sqlite3.connect(db_path)
-        last_sha1 = None
-
-        print("Counting rows...")
-        total_count = int(list(db.execute("SELECT COUNT(*) FROM files_metadata;"))[0][0])
-        print("{} rows to process".format(total_count))
-
-        eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id="aaaaaaaaaaaabkvkaaaaaaaaae"))
-        i = 0
-        j = -1
-        for row in db.execute(QUERY):
-            j = j+1
-            if row[0] == last_sha1:
-                continue
-            else:
-                last_sha1 = row[0]
-            fe = self.parse_manifest_row(row)
-            if fe is None:
-                continue
-            self.create_entity(fe, editgroup=eg.id)
-            if i > 0 and (i % size) == 0:
-                self.api.accept_editgroup(eg.id)
-                eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id="aaaaaaaaaaaabkvkaaaaaaaaae"))
-                print("Finished a batch; row {} of {} ({:.2f}%).\tTotal inserted: {}".format(
-                    j, total_count, 100.0*j/total_count, i))
-            i = i + 1
-        if i == 0 or (i % size) != 0:
-            self.api.accept_editgroup(eg.id)
-        print("Done! Inserted {}".format(i))
diff --git a/python/fatcat/matched_importer.py b/python/fatcat/matched_importer.py
index 4b82b6b2..44735d52 100644
--- a/python/fatcat/matched_importer.py
+++ b/python/fatcat/matched_importer.py
@@ -25,10 +25,10 @@ class FatcatMatchedImporter(FatcatImporter):
     - dt
     - url
     - mimetype
+    - urls (list of strings... or objects?)
 
     Future handlings/extensions:
     - core_id, wikidata_id, pmcid, pmid: not as lists
-    - urls (list of strings... or objects?)
     """
     def __init__(self, host_url, skip_file_update=False, default_mime=None,
@@ -42,6 +42,11 @@ class FatcatMatchedImporter(FatcatImporter):
         rel = self.default_link_rel
         # TODO: this is where we could map specific domains to rel types,
         # and also filter out bad domains, invalid URLs, etc
+        if "//archive.org/" in url or "//arxiv.org/" in url:
+            # TODO: special-case the arxiv.org bulk mirror?
+            rel = "repository"
+        elif "//web.archive.org/" in url or "//archive.is/" in url:
+            rel = "webarchive"
         return fatcat_client.FileEntityUrls(url=raw, rel=rel)
 
     def parse_matched_dict(self, obj):
@@ -90,10 +95,10 @@ class FatcatMatchedImporter(FatcatImporter):
         url = self.make_url(url)
         if url != None:
             fe.urls.append(url)
-        if obj.get('cdx') != None:
-            original = obj['cdx']['url']
+        for cdx in obj.get('cdx', []):
+            original = cdx['url']
             wayback = "https://web.archive.org/web/{}/{}".format(
-                obj['cdx']['dt'],
+                cdx['dt'],
                 original)
             if wayback not in existing_urls:
                 fe.urls.append(
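
The domain-to-rel mapping added to make_url() above, restated as a self-contained sketch (the substring checks and rel values are copied from this diff; guess_rel itself is a hypothetical helper, not part of the codebase):

    def guess_rel(url, default_rel="web"):
        # source mirrors get rel="repository"; archival captures get "webarchive"
        if "//archive.org/" in url or "//arxiv.org/" in url:
            # TODO (carried over from the source): special-case the arxiv.org bulk mirror?
            return "repository"
        elif "//web.archive.org/" in url or "//archive.is/" in url:
            return "webarchive"
        return default_rel

    assert guess_rel("https://arxiv.org/pdf/1234.5678") == "repository"
    assert guess_rel("https://web.archive.org/web/20170227164644/http://no-plos.org/plosme") == "webarchive"
    assert guess_rel("http://journals.plos.org/plosmedicine/article") == "web"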
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 525cf286..bf0a32ad 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -5,8 +5,8 @@ import argparse
 from fatcat.raw_api_client import RawFatcatApiClient
 from fatcat.crossref_importer import FatcatCrossrefImporter
 from fatcat.orcid_importer import FatcatOrcidImporter
-from fatcat.manifest_importer import FatcatManifestImporter
 from fatcat.issn_importer import FatcatIssnImporter
+from fatcat.matched_importer import FatcatMatchedImporter
 
 def run_import_crossref(args):
     fci = FatcatCrossrefImporter(args.host_url, args.issn_map_file,
@@ -24,15 +24,10 @@ def run_import_issn(args):
     fii.process_csv_batch(args.csv_file, size=args.batch_size)
     fii.describe_run()
 
-def run_import_manifest(args):
-    fmi = FatcatManifestImporter(args.host_url)
-    fmi.process_db(args.db_path, size=args.batch_size)
-    fmi.describe_run()
-
 def run_import_matched(args):
     fmi = FatcatMatchedImporter(args.host_url,
         skip_file_update=args.no_file_update)
-    fmi.process_db(args.db_path, size=args.batch_size)
+    fmi.process_batch(args.json_file, size=args.batch_size)
     fmi.describe_run()
 
 def health(args):
@@ -85,15 +80,6 @@ def main():
help="size of batch to send",
default=50, type=int)
- sub_import_manifest = subparsers.add_parser('import-manifest')
- sub_import_manifest.set_defaults(func=run_import_manifest)
- sub_import_manifest.add_argument('db_path',
- help="sqlite3 database to import from",
- type=str)
- sub_import_manifest.add_argument('--batch-size',
- help="size of batch to send",
- default=50, type=int)
-
sub_import_matched = subparsers.add_parser('import-matched')
sub_import_matched.set_defaults(func=run_import_matched)
sub_import_matched.add_argument('json_file',
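
For reference, the rewired import-matched path can also be driven from Python; a sketch assuming process_batch() accepts an open file object of JSON lines, which is how run_import_matched() above appears to use it (host URL taken from the test fixtures):

    from fatcat.matched_importer import FatcatMatchedImporter

    # roughly equivalent to: ./fatcat_import.py import-matched example_matched.json
    fmi = FatcatMatchedImporter("http://localhost:9411/v0", skip_file_update=False)
    with open("tests/files/example_matched.json", "r") as f:
        fmi.process_batch(f, size=50)
    fmi.describe_run()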
diff --git a/python/tests/files/example_matched.json b/python/tests/files/example_matched.json
index 79db1296..d9d71669 100644
--- a/python/tests/files/example_matched.json
+++ b/python/tests/files/example_matched.json
@@ -1,3 +1,3 @@
-{ "dois": ["10.123/abc"], "sha1": "00242a192acc258bdfdb151943419437f440c313", "md5": "f4de91152c7ab9fdc2a128f962faebff", "sha256": "ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362", "size": 255629, "cdx": { "dt": "20170227164644", "url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable" }, "mimetype": "application/pdf" }
-{ "dois": ["10.123/abc"], "sha1": "3f242a192acc258bdfdb151943419437f440c313", "md5": "f4de91152c7ab9fdc2a128f962faebff", "sha256": "ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362", "size": 255629, "cdx": { "dt": "20170227164644", "url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable" }, "mimetype": "application/pdf" }
-{ "dois": ["10.456/1231123"], "sha1": "000000000000258bdfdb151943419437f440c313", "md5": "000000000000b9fdc2a128f962faebff", "sha256": "000000000000620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362", "size": 123, "cdx": { "dt": "20000000164644", "url": "http://no-plos.org/plosme" }, "mimetype": "application/txt" }
+{ "dois": ["10.123/abc"], "sha1": "00242a192acc258bdfdb151943419437f440c313", "md5": "f4de91152c7ab9fdc2a128f962faebff", "sha256": "ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362", "size": 255629, "cdx": [{ "dt": "20170227164644", "url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable" }], "mimetype": "application/pdf" }
+{ "dois": ["10.123/abc"], "sha1": "3f242a192acc258bdfdb151943419437f440c313", "md5": "f4de91152c7ab9fdc2a128f962faebff", "sha256": "ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362", "size": 255629, "cdx": [{ "dt": "20170227164644", "url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable" }], "mimetype": "application/pdf" , "urls": ["http://other.tld/asdf"]}
+{ "dois": ["10.456/1231123"], "sha1": "000000000000258bdfdb151943419437f440c313", "md5": "000000000000b9fdc2a128f962faebff", "sha256": "000000000000620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362", "size": 123, "cdx": [{ "dt": "20000000164644", "url": "http://no-plos.org/plosme" }], "mimetype": "application/txt" }
diff --git a/python/tests/manifest.py b/python/tests/manifest.py
deleted file mode 100644
index 030d9e48..00000000
--- a/python/tests/manifest.py
+++ /dev/null
@@ -1,34 +0,0 @@
-
-import json
-import pytest
-from fatcat.manifest_importer import FatcatManifestImporter
-
-
-@pytest.fixture(scope="function")
-def manifest_importer():
-    yield FatcatManifestImporter("http://localhost:9411/v0")
-
-# TODO: use API to check that entities actually created...
-#def test_manifest_importer_batch(manifest_importer):
-#    with open('tests/files/0000-0001-8254-7103.json', 'r') as f:
-#        manifest_importer.process_batch(f)
-
-#def test_manifest_importer(manifest_importer):
-#    with open('tests/files/0000-0001-8254-7103.json', 'r') as f:
-#        manifest_importer.process_source(f)
-
-def test_manifest_row_parse(manifest_importer):
-    # (sha1, mimetype, size_bytes, md5, doi, url, datetime) = row
-
-    c = manifest_importer.parse_manifest_row(
-        (None, None, None, None, None, None, None))
-    assert c == None
-
-    c = manifest_importer.parse_manifest_row(
-        ("7d97e98f8af710c7e7fe703abc8f639e0ee507c4", "application/pdf", "12345", "8af710c7e7fe703abc8f639e0ee507c4", "10.1234/asdf", "https://example.com/thing.pdf", "200001010000"))
-    assert c.sha1 == "7d97e98f8af710c7e7fe703abc8f639e0ee507c4"
-    assert c.mimetype == "application/pdf"
-    assert c.urls[0].url == "https://example.com/thing.pdf"
-    assert c.urls[0].rel == "web"
-    assert c.urls[1].url == "https://web.archive.org/web/200001010000/https://example.com/thing.pdf"
-    assert c.urls[1].rel == "webarchive"