 python/fatcat/matched_importer.py       | 139 +++++++++++++++++++++++++++++++
 python/tests/files/example_matched.json |   3 +
 python/tests/matched_importer.py        |  32 ++++
 3 files changed, 174 insertions(+), 0 deletions(-)
diff --git a/python/fatcat/matched_importer.py b/python/fatcat/matched_importer.py
new file mode 100644
index 00000000..4b82b6b2
--- /dev/null
+++ b/python/fatcat/matched_importer.py
@@ -0,0 +1,139 @@
+
+import sys
+import json
+import sqlite3
+import itertools
+import fatcat_client
+from fatcat.importer_common import FatcatImporter
+
+#row = row.split('\t')
+#assert len(row) == 2
+#sha1 = row[0].replace('sha1:')
+#sha1 = base64.b16encode(base64.b32decode(sha1)).lower()
+#print(sha1)
+#dois = [d.lower() for d in json.loads(row[1])]
+
+class FatcatMatchedImporter(FatcatImporter):
+    """
+    Input format is JSON with keys:
+    - dois (list)
+    - sha1 (hex)
+    - md5 (hex)
+    - sha256 (hex)
+    - size (int)
+    - cdx (object)
+        - dt
+        - url
+    - mimetype
+
+    Future handling/extensions:
+    - core_id, wikidata_id, pmcid, pmid: not as lists
+    - urls (list of strings... or objects?)
+    """
+
+    def __init__(self, host_url, skip_file_update=False, default_mime=None,
+            default_link_rel="web"):
+        super().__init__(host_url)
+        self.default_mime = default_mime
+        self.default_link_rel = default_link_rel
+        self.skip_file_update = skip_file_update
+
+    def make_url(self, raw):
+        rel = self.default_link_rel
+        # TODO: this is where we could map specific domains to rel types,
+        # and also filter out bad domains, invalid URLs, etc
+        return fatcat_client.FileEntityUrls(url=raw, rel=rel)
+
+    def parse_matched_dict(self, obj):
+        sha1 = obj['sha1']
+        dois = [d.lower() for d in obj.get('dois', [])]
+
+        # lookup sha1, or create new entity
+        fe = None
+        if not self.skip_file_update:
+            try:
+                fe = self.api.lookup_file(sha1=sha1)
+            except fatcat_client.rest.ApiException as err:
+                if err.status != 404:
+                    raise err
+        if fe is None:
+            fe = fatcat_client.FileEntity(
+                sha1=sha1,
+                releases=[],
+                urls=[],
+            )
+
+        # lookup DOIs
+        re_list = set()
+        for doi in dois:
+            try:
+                re = self.api.lookup_release(doi=doi)
+            except fatcat_client.rest.ApiException as err:
+                if err.status != 404:
+                    raise err
+                re = None
+            if re is None:
+                print("DOI not found: {}".format(doi))
+            else:
+                re_list.add(re.ident)
+        if len(re_list) == 0:
+            return None
+        if fe.releases == set(re_list):
+            return None
+        re_list.update(fe.releases)
+        fe.releases = list(re_list)
+
+        # parse URLs and CDX
+        existing_urls = [feu.url for feu in fe.urls]
+        for url in obj.get('url', []):
+            if url not in existing_urls:
+                url = self.make_url(url)
+                if url is not None:
+                    fe.urls.append(url)
+        if obj.get('cdx') is not None:
+            original = obj['cdx']['url']
+            wayback = "https://web.archive.org/web/{}/{}".format(
+                obj['cdx']['dt'],
+                original)
+            if wayback not in existing_urls:
+                fe.urls.append(
+                    fatcat_client.FileEntityUrls(url=wayback, rel="webarchive"))
+            if original not in existing_urls:
+                url = self.make_url(original)
+                if url is not None:
+                    fe.urls.append(url)
+
+        if obj.get('size') is not None:
+            fe.size = int(obj['size'])
+        fe.sha256 = obj.get('sha256', fe.sha256)
+        fe.md5 = obj.get('md5', fe.md5)
+        if obj.get('mimetype') is None:
+            if fe.mimetype is None:
+                fe.mimetype = self.default_mime
+        else:
+            fe.mimetype = obj.get('mimetype')
+        return fe
+
+    def create_row(self, row, editgroup=None):
+        obj = json.loads(row)
+        fe = self.parse_matched_dict(obj)
+        if fe is not None:
+            if fe.ident is None:
+                self.api.create_file(fe, editgroup=editgroup)
+                self.insert_count += 1
+            else:
+                self.api.update_file(fe.ident, fe, editgroup=editgroup)
+                self.update_count += 1
+
+    def create_batch(self, batch, editgroup=None):
+        """Reads and processes in batches (not API-call-per-line)"""
+        objects = [self.parse_matched_dict(json.loads(l))
+                   for l in batch if l is not None]
+        new_objects = [o for o in objects if o is not None and o.ident is None]
+        update_objects = [o for o in objects if o is not None and o.ident is not None]
+        for obj in update_objects:
+            self.api.update_file(obj.ident, obj, editgroup=editgroup)
+        if len(new_objects) > 0:
+            self.api.create_file_batch(new_objects, autoaccept="true", editgroup=editgroup)
+        self.update_count += len(update_objects)
+        self.insert_count += len(new_objects)
diff --git a/python/tests/files/example_matched.json b/python/tests/files/example_matched.json
new file mode 100644
index 00000000..79db1296
--- /dev/null
+++ b/python/tests/files/example_matched.json
@@ -0,0 +1,3 @@
+{ "dois": ["10.123/abc"], "sha1": "00242a192acc258bdfdb151943419437f440c313", "md5": "f4de91152c7ab9fdc2a128f962faebff", "sha256": "ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362", "size": 255629, "cdx": { "dt": "20170227164644", "url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable" }, "mimetype": "application/pdf" }
+{ "dois": ["10.123/abc"], "sha1": "3f242a192acc258bdfdb151943419437f440c313", "md5": "f4de91152c7ab9fdc2a128f962faebff", "sha256": "ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362", "size": 255629, "cdx": { "dt": "20170227164644", "url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable" }, "mimetype": "application/pdf" }
+{ "dois": ["10.456/1231123"], "sha1": "000000000000258bdfdb151943419437f440c313", "md5": "000000000000b9fdc2a128f962faebff", "sha256": "000000000000620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362", "size": 123, "cdx": { "dt": "20000000164644", "url": "http://no-plos.org/plosme" }, "mimetype": "application/txt" }
diff --git a/python/tests/matched_importer.py b/python/tests/matched_importer.py
new file mode 100644
index 00000000..9cc6aa32
--- /dev/null
+++ b/python/tests/matched_importer.py
@@ -0,0 +1,32 @@
+
+import json
+import pytest
+from fatcat.matched_importer import FatcatMatchedImporter
+
+
+@pytest.fixture(scope="function")
+def matched_importer():
+    yield FatcatMatchedImporter("http://localhost:9411/v0")
+
+# TODO: use API to check that entities were actually created...
+def test_matched_importer_batch(matched_importer):
+    with open('tests/files/example_matched.json', 'r') as f:
+        matched_importer.process_batch(f)
+
+def test_matched_importer(matched_importer):
+    with open('tests/files/example_matched.json', 'r') as f:
+        matched_importer.process_source(f)
+
+def test_matched_dict_parse(matched_importer):
+    with open('tests/files/example_matched.json', 'r') as f:
+        raw = json.loads(f.readline())
+        fe = matched_importer.parse_matched_dict(raw)
+        assert fe.sha1 == "00242a192acc258bdfdb151943419437f440c313"
+        assert fe.md5 == "f4de91152c7ab9fdc2a128f962faebff"
+        assert fe.mimetype == "application/pdf"
+        assert fe.size == 255629
+        assert fe.urls[1].url.startswith("http://journals.plos.org")
+        assert fe.urls[1].rel == "web"
+        assert fe.urls[0].url.startswith("https://web.archive.org/")
+        assert fe.urls[0].rel == "webarchive"
+        assert len(fe.releases) == 1
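
For reference, a minimal driver sketch in the spirit of the tests above (not part of this commit). It assumes a fatcat API server listening on localhost:9411 and an input file of one matched JSON object per line, like example_matched.json; process_batch() comes from the FatcatImporter base class, and the insert/update counters are the ones maintained by create_row()/create_batch() above:

    from fatcat.matched_importer import FatcatMatchedImporter

    # Assumptions: fatcat API at localhost:9411; the default_mime value is
    # illustrative only (it applies when a line carries no "mimetype" key).
    importer = FatcatMatchedImporter(
        "http://localhost:9411/v0",
        default_mime="application/pdf",
    )
    with open('tests/files/example_matched.json', 'r') as f:
        importer.process_batch(f)  # batched path, dispatching to create_batch()
    print("inserted: {}, updated: {}".format(
        importer.insert_count, importer.update_count))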
