diff options
| -rw-r--r-- | python/fatcat/manifest_importer.py | 89 |
| -rw-r--r-- | python/fatcat/matched_importer.py | 13 |
| -rwxr-xr-x | python/fatcat_import.py | 18 |
| -rw-r--r-- | python/tests/files/example_matched.json | 6 |
| -rw-r--r-- | python/tests/manifest.py | 34 |
5 files changed, 14 insertions, 146 deletions
| diff --git a/python/fatcat/manifest_importer.py b/python/fatcat/manifest_importer.py deleted file mode 100644 index 3b0b3815..00000000 --- a/python/fatcat/manifest_importer.py +++ /dev/null @@ -1,89 +0,0 @@ - -import sys -import json -import sqlite3 -import itertools -import fatcat_client -from fatcat.importer_common import FatcatImporter - - -QUERY = "SELECT files_metadata.sha1, files_metadata.mimetype, files_metadata.size_bytes, files_metadata.md5, files_id_doi.doi, urls.url, urls.datetime from files_metadata JOIN files_id_doi ON files_metadata.sha1 = files_id_doi.sha1 JOIN urls ON files_metadata.sha1 = urls.sha1 ORDER BY files_metadata.sha1" - -class FatcatManifestImporter(FatcatImporter): - -    def parse_manifest_row(self, row): -        """ -        obj is a python dict (parsed from json). -        returns a CreatorEntity -        """ -        (sha1, mimetype, size_bytes, md5, doi, url, datetime) = row -         -        if url is None: -            return None -        release_ids = None -        if doi is not None: -            release_id = self.lookup_doi(doi.lower()) -            if release_id: -                release_ids = [release_id,] -        if datetime is None: -            datetime = "1" -        urls = [] -        if "//archive.org/" in url or "//arxiv.org/" in url: -            # TODO: special-case the arxiv.org bulk mirror? 
-            urls.append(fatcat_client.FileEntityUrls(url=url, rel="repository")) -        elif "//web.archive.org/" in url or "//archive.is/" in url: -            urls.append(fatcat_client.FileEntityUrls(url=url, rel="webarchive")) -        else: -            urls.append(fatcat_client.FileEntityUrls(url=url, rel="web")) -            urls.append(fatcat_client.FileEntityUrls( -                url="https://web.archive.org/web/{}/{}".format(datetime, url), -                rel="webarchive")) -             -        extra = None -        fe = fatcat_client.FileEntity( -            sha1=sha1, -            mimetype=mimetype, -            size=size_bytes, -            md5=md5, -            urls=urls, -            releases=release_ids, -            extra=extra) -        return fe - -    def create_entity(self, entity, editgroup=None): -        if entity is not None: -            self.api.create_file(entity, editgroup=editgroup) - -    def process_db(self, db_path, size=100): -        # TODO: multiple DOIs per sha1 -        # TODO: multiple URLs per sha1 (with schema change) -         -        db = sqlite3.connect(db_path) -        last_sha1 = None - -        print("Counting rows...") -        total_count = int(list(db.execute("SELECT COUNT(*) FROM files_metadata;"))[0][0]) -        print("{} rows to process".format(total_count)) - -        eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id="aaaaaaaaaaaabkvkaaaaaaaaae")) -        i = 0 -        j = -1 -        for row in db.execute(QUERY): -            j = j+1 -            if row[0] == last_sha1: -                continue -            else: -                last_sha1 = row[0] -            fe = self.parse_manifest_row(row) -            if fe is None: -                continue -            self.create_entity(fe, editgroup=eg.id) -            if i > 0 and (i % size) == 0: -                self.api.accept_editgroup(eg.id) -                eg = 
self.api.create_editgroup(fatcat_client.Editgroup(editor_id="aaaaaaaaaaaabkvkaaaaaaaaae")) -                print("Finished a batch; row {} of {} ({:.2f}%).\tTotal inserted: {}".format( -                    j, total_count, 100.0*j/total_count, i)) -            i = i + 1 -        if i == 0 or (i % size) != 0: -            self.api.accept_editgroup(eg.id) -        print("Done! Inserted {}".format(i)) diff --git a/python/fatcat/matched_importer.py b/python/fatcat/matched_importer.py index 4b82b6b2..44735d52 100644 --- a/python/fatcat/matched_importer.py +++ b/python/fatcat/matched_importer.py @@ -25,10 +25,10 @@ class FatcatMatchedImporter(FatcatImporter):          - dt          - url      - mimetype +    - urls (list of strings... or objects?)      Future handlings/extensions:      - core_id, wikidata_id, pmcid, pmid: not as lists -    - urls (list of strings... or objects?)      """      def __init__(self, host_url, skip_file_update=False, default_mime=None, @@ -42,6 +42,11 @@ class FatcatMatchedImporter(FatcatImporter):          rel = self.default_link_rel          # TODO: this is where we could map specific domains to rel types,          # and also filter out bad domains, invalid URLs, etc +        if "//archive.org/" in url or "//arxiv.org/" in url: +            # TODO: special-case the arxiv.org bulk mirror? 
+            rel = "repository" +        elif "//web.archive.org/" in url or "//archive.is/" in url: +            rel = "webarchive"          return fatcat_client.FileEntityUrls(url=raw, rel=rel)      def parse_matched_dict(self, obj): @@ -90,10 +95,10 @@ class FatcatMatchedImporter(FatcatImporter):                  url = self.make_url(url)                  if url != None:                      fe.urls.append(url) -        if obj.get('cdx') != None: -            original = obj['cdx']['url'] +        for cdx in obj.get('cdx', []): +            original = cdx['url']              wayback = "https://web.archive.org/web/{}/{}".format( -                obj['cdx']['dt'], +                cdx['dt'],                  original)              if wayback not in existing_urls:                  fe.urls.append( diff --git a/python/fatcat_import.py b/python/fatcat_import.py index 525cf286..bf0a32ad 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -5,8 +5,8 @@ import argparse  from fatcat.raw_api_client import RawFatcatApiClient  from fatcat.crossref_importer import FatcatCrossrefImporter  from fatcat.orcid_importer import FatcatOrcidImporter -from fatcat.manifest_importer import FatcatManifestImporter  from fatcat.issn_importer import FatcatIssnImporter +from fatcat.matched_importer import FatcatMatchedImporter  def run_import_crossref(args):      fci = FatcatCrossrefImporter(args.host_url, args.issn_map_file, @@ -24,15 +24,10 @@ def run_import_issn(args):      fii.process_csv_batch(args.csv_file, size=args.batch_size)      fii.describe_run() -def run_import_manifest(args): -    fmi = FatcatManifestImporter(args.host_url) -    fmi.process_db(args.db_path, size=args.batch_size) -    fmi.describe_run() -  def run_import_matched(args):      fmi = FatcatMatchedImporter(args.host_url,          skip_file_update=args.no_file_update) -    fmi.process_db(args.db_path, size=args.batch_size) +    fmi.process_batch(args.json_file, size=args.batch_size)      
fmi.describe_run()  def health(args): @@ -85,15 +80,6 @@ def main():          help="size of batch to send",          default=50, type=int) -    sub_import_manifest = subparsers.add_parser('import-manifest') -    sub_import_manifest.set_defaults(func=run_import_manifest) -    sub_import_manifest.add_argument('db_path', -        help="sqlite3 database to import from", -        type=str) -    sub_import_manifest.add_argument('--batch-size', -        help="size of batch to send", -        default=50, type=int) -      sub_import_matched = subparsers.add_parser('import-matched')      sub_import_matched.set_defaults(func=run_import_matched)      sub_import_matched.add_argument('json_file', diff --git a/python/tests/files/example_matched.json b/python/tests/files/example_matched.json index 79db1296..d9d71669 100644 --- a/python/tests/files/example_matched.json +++ b/python/tests/files/example_matched.json @@ -1,3 +1,3 @@ -{ "dois": ["10.123/abc"], "sha1": "00242a192acc258bdfdb151943419437f440c313", "md5": "f4de91152c7ab9fdc2a128f962faebff", "sha256": "ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362", "size": 255629, "cdx": { "dt": "20170227164644", "url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable" }, "mimetype": "application/pdf" } -{ "dois": ["10.123/abc"], "sha1": "3f242a192acc258bdfdb151943419437f440c313", "md5": "f4de91152c7ab9fdc2a128f962faebff", "sha256": "ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362", "size": 255629, "cdx": { "dt": "20170227164644", "url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable" }, "mimetype": "application/pdf" } -{ "dois": ["10.456/1231123"], "sha1": "000000000000258bdfdb151943419437f440c313", "md5": "000000000000b9fdc2a128f962faebff", "sha256": "000000000000620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362", "size": 123, "cdx": { "dt": "20000000164644", "url": "http://no-plos.org/plosme" }, 
"mimetype": "application/txt" } +{ "dois": ["10.123/abc"], "sha1": "00242a192acc258bdfdb151943419437f440c313", "md5": "f4de91152c7ab9fdc2a128f962faebff", "sha256": "ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362", "size": 255629, "cdx": [{ "dt": "20170227164644", "url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable" }], "mimetype": "application/pdf" } +{ "dois": ["10.123/abc"], "sha1": "3f242a192acc258bdfdb151943419437f440c313", "md5": "f4de91152c7ab9fdc2a128f962faebff", "sha256": "ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362", "size": 255629, "cdx": [{ "dt": "20170227164644", "url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable" }], "mimetype": "application/pdf" , "urls": ["http://other.tld/asdf"]} +{ "dois": ["10.456/1231123"], "sha1": "000000000000258bdfdb151943419437f440c313", "md5": "000000000000b9fdc2a128f962faebff", "sha256": "000000000000620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362", "size": 123, "cdx": [{ "dt": "20000000164644", "url": "http://no-plos.org/plosme" }], "mimetype": "application/txt" } diff --git a/python/tests/manifest.py b/python/tests/manifest.py deleted file mode 100644 index 030d9e48..00000000 --- a/python/tests/manifest.py +++ /dev/null @@ -1,34 +0,0 @@ - -import json -import pytest -from fatcat.manifest_importer import FatcatManifestImporter - - -@pytest.fixture(scope="function") -def manifest_importer(): -    yield FatcatManifestImporter("http://localhost:9411/v0") - -# TODO: use API to check that entities actually created... 
-#def test_manifest_importer_batch(manifest_importer): -#    with open('tests/files/0000-0001-8254-7103.json', 'r') as f: -#        manifest_importer.process_batch(f) - -#def test_manifest_importer(manifest_importer): -#    with open('tests/files/0000-0001-8254-7103.json', 'r') as f: -#        manifest_importer.process_source(f) - -def test_manifest_row_parse(manifest_importer): -    # (sha1, mimetype, size_bytes, md5, doi, url, datetime) = row - -    c = manifest_importer.parse_manifest_row( -        (None, None, None, None, None, None, None)) -    assert c == None - -    c = manifest_importer.parse_manifest_row( -        ("7d97e98f8af710c7e7fe703abc8f639e0ee507c4", "application/pdf", "12345", "8af710c7e7fe703abc8f639e0ee507c4", "10.1234/asdf", "https://example.com/thing.pdf", "200001010000")) -    assert c.sha1 == "7d97e98f8af710c7e7fe703abc8f639e0ee507c4" -    assert c.mimetype == "application/pdf" -    assert c.urls[0].url == "https://example.com/thing.pdf" -    assert c.urls[0].rel == "web" -    assert c.urls[1].url == "https://web.archive.org/web/200001010000/https://example.com/thing.pdf" -    assert c.urls[1].rel == "webarchive" | 
