diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2018-09-14 15:44:55 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2018-09-14 15:44:55 -0700 |
commit | 8b2c46c4d578ba4129316a07dd1e3981520b8921 (patch) | |
tree | b8b7d6e6c85041dca17873bf7338f2ac9587777a /python | |
parent | ba3adecfaf626b92fb5231cc445ce9e07fb5f877 (diff) | |
download | fatcat-8b2c46c4d578ba4129316a07dd1e3981520b8921.tar.gz fatcat-8b2c46c4d578ba4129316a07dd1e3981520b8921.zip |
fixes to matched importer (and a test)
Diffstat (limited to 'python')
-rw-r--r-- | python/fatcat/matched_importer.py | 181 | ||||
-rw-r--r-- | python/tests/files/example_matched.json | 3 | ||||
-rw-r--r-- | python/tests/matched_importer.py | 35 |
3 files changed, 219 insertions, 0 deletions
diff --git a/python/fatcat/matched_importer.py b/python/fatcat/matched_importer.py
new file mode 100644
index 00000000..4b82b6b2
--- /dev/null
+++ b/python/fatcat/matched_importer.py
@@ -0,0 +1,181 @@
+
+import sys
+import json
+import sqlite3
+import itertools
+import fatcat_client
+from fatcat.importer_common import FatcatImporter
+
+# NOTE: a tab-separated input format (base32 "sha1:..." key, then a JSON list
+# of DOIs) is not implemented; only the JSON format documented on the class
+# below is handled, and sha1 is expected to already be hex.
+
+class FatcatMatchedImporter(FatcatImporter):
+    """
+    Imports file entities which have been matched to releases by DOI.
+
+    Input format is JSON, one object per line, with keys:
+    - dois (list)
+    - sha1 (hex)
+    - md5 (hex)
+    - sha256 (hex)
+    - size (int)
+    - cdx (object)
+      - dt
+      - url
+    - mimetype
+    - url (string or list of strings)
+
+    Future handlings/extensions:
+    - core_id, wikidata_id, pmcid, pmid: not as lists
+    - urls (list of strings... or objects?)
+    """
+
+    def __init__(self, host_url, skip_file_update=False, default_mime=None,
+            default_link_rel="web"):
+        """
+        host_url: passed through to FatcatImporter
+        skip_file_update: if True, existing files are never looked up by
+            sha1, so every row is parsed as a new file entity
+        default_mime: mimetype used when neither the row nor an existing
+            entity supplies one
+        default_link_rel: "rel" given to URLs built by make_url()
+        """
+        super().__init__(host_url)
+        self.default_mime = default_mime
+        self.default_link_rel = default_link_rel
+        self.skip_file_update = skip_file_update
+
+    def make_url(self, raw):
+        """Wraps a raw URL string in a FileEntityUrls using the default rel."""
+        rel = self.default_link_rel
+        # TODO: this is where we could map specific domains to rel types,
+        # and also filter out bad domains, invalid URLs, etc
+        return fatcat_client.FileEntityUrls(url=raw, rel=rel)
+
+    def parse_matched_dict(self, obj):
+        """
+        Converts one parsed input object into a FileEntity.
+
+        Returns either a new entity (no ident) or the existing entity with
+        the same sha1, updated in place. Returns None if the row should be
+        skipped: no DOI resolved to a release, or the existing entity
+        already has exactly the matched set of releases.
+
+        API errors other than 404 are re-raised.
+        """
+        sha1 = obj['sha1']
+        dois = [d.lower() for d in (obj.get('dois') or [])]
+
+        # lookup sha1, or create new entity
+        fe = None
+        if not self.skip_file_update:
+            try:
+                fe = self.api.lookup_file(sha1=sha1)
+            except fatcat_client.rest.ApiException as err:
+                if err.status != 404:
+                    raise
+        if fe is None:
+            fe = fatcat_client.FileEntity(
+                sha1=sha1,
+                releases=[],
+                urls=[],
+            )
+
+        # lookup dois
+        release_ids = set()
+        for doi in dois:
+            try:
+                release = self.api.lookup_release(doi=doi)
+            except fatcat_client.rest.ApiException as err:
+                if err.status != 404:
+                    raise
+                release = None
+            if release is None:
+                print("DOI not found: {}".format(doi))
+            else:
+                release_ids.add(release.ident)
+        if not release_ids:
+            return None
+        # fe.releases is a list; convert so the comparison is set-to-set
+        existing_release_ids = set(fe.releases or [])
+        if existing_release_ids == release_ids:
+            return None
+        fe.releases = list(release_ids | existing_release_ids)
+
+        # parse URLs and CDX
+        if fe.urls is None:
+            fe.urls = []
+        existing_urls = {feu.url for feu in fe.urls}
+        raw_urls = obj.get('url') or []
+        if isinstance(raw_urls, str):
+            # a bare string would otherwise be iterated one character at a time
+            raw_urls = [raw_urls]
+        for raw in raw_urls:
+            if raw not in existing_urls:
+                existing_urls.add(raw)
+                url = self.make_url(raw)
+                if url is not None:
+                    fe.urls.append(url)
+        cdx = obj.get('cdx')
+        if cdx is not None:
+            original = cdx['url']
+            wayback = "https://web.archive.org/web/{}/{}".format(
+                cdx['dt'],
+                original)
+            if wayback not in existing_urls:
+                existing_urls.add(wayback)
+                fe.urls.append(
+                    fatcat_client.FileEntityUrls(url=wayback, rel="webarchive"))
+            if original not in existing_urls:
+                existing_urls.add(original)
+                url = self.make_url(original)
+                if url is not None:
+                    fe.urls.append(url)
+
+        if obj.get('size') is not None:
+            fe.size = int(obj['size'])
+        fe.sha256 = obj.get('sha256', fe.sha256)
+        # fall back to the entity's own md5, not its sha256
+        fe.md5 = obj.get('md5', fe.md5)
+        if obj.get('mimetype') is None:
+            if fe.mimetype is None:
+                fe.mimetype = self.default_mime
+        else:
+            fe.mimetype = obj.get('mimetype')
+        return fe
+
+    def create_row(self, row, editgroup=None):
+        """
+        Parses one JSON line, then creates or updates the file entity with
+        a single API call and bumps the matching counter. Skipped rows
+        (parse_matched_dict() returned None) are ignored.
+        """
+        obj = json.loads(row)
+        fe = self.parse_matched_dict(obj)
+        if fe is None:
+            return
+        if fe.ident is None:
+            self.api.create_file(fe, editgroup=editgroup)
+            self.insert_count += 1
+        else:
+            self.api.update_file(fe.ident, fe, editgroup=editgroup)
+            self.update_count += 1
+
+    def create_batch(self, batch, editgroup=None):
+        """
+        Reads and processes in batches (not API-call-per-line).
+
+        New entities go out in one create_file_batch() call; entities which
+        already have an ident are updated one at a time.
+        """
+        objects = [self.parse_matched_dict(json.loads(line))
+                   for line in batch if line is not None]
+        new_objects = [o for o in objects if o is not None and o.ident is None]
+        update_objects = [o for o in objects if o is not None and o.ident is not None]
+        for obj in update_objects:
+            self.api.update_file(obj.ident, obj, editgroup=editgroup)
+        if new_objects:
+            self.api.create_file_batch(new_objects, autoaccept="true", editgroup=editgroup)
+        self.update_count += len(update_objects)
+        self.insert_count += len(new_objects)
diff --git a/python/tests/files/example_matched.json b/python/tests/files/example_matched.json
new file mode 100644
index 00000000..79db1296
--- /dev/null
+++ b/python/tests/files/example_matched.json
@@ -0,0 +1,3 @@
+{ "dois": ["10.123/abc"], "sha1": "00242a192acc258bdfdb151943419437f440c313", "md5": "f4de91152c7ab9fdc2a128f962faebff", "sha256": "ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362", "size": 255629, "cdx": { "dt": "20170227164644", "url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable" }, "mimetype": "application/pdf" }
+{ "dois": ["10.123/abc"], "sha1": "3f242a192acc258bdfdb151943419437f440c313", "md5": "f4de91152c7ab9fdc2a128f962faebff", "sha256": "ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362", "size": 255629, "cdx": { "dt": "20170227164644", "url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable" }, "mimetype": "application/pdf" }
+{ "dois": ["10.456/1231123"], "sha1": "000000000000258bdfdb151943419437f440c313", "md5": "000000000000b9fdc2a128f962faebff", "sha256": "000000000000620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362", "size": 123, "cdx": { "dt": "20000000164644", "url": "http://no-plos.org/plosme" }, "mimetype": "application/txt" }
diff --git a/python/tests/matched_importer.py b/python/tests/matched_importer.py
new file mode 100644
index 00000000..9cc6aa32
--- /dev/null
+++ b/python/tests/matched_importer.py
@@ -0,0 +1,35 @@
+
+import json
+import pytest
+from fatcat.matched_importer import FatcatMatchedImporter
+
+
+@pytest.fixture(scope="function")
+def matched_importer():
+    yield FatcatMatchedImporter("http://localhost:9411/v0")
+
+# TODO: use API to check that entities actually created...
+def test_matched_importer_batch(matched_importer):
+    with open('tests/files/example_matched.json', 'r') as f:
+        matched_importer.process_batch(f)
+
+def test_matched_importer(matched_importer):
+    with open('tests/files/example_matched.json', 'r') as f:
+        matched_importer.process_source(f)
+
+def test_matched_dict_parse(matched_importer):
+    # skip the sha1 lookup so the result does not depend on files that may
+    # already have been imported into the test database
+    matched_importer.skip_file_update = True
+    with open('tests/files/example_matched.json', 'r') as f:
+        raw = json.loads(f.readline())
+    fe = matched_importer.parse_matched_dict(raw)
+    assert fe.sha1 == "00242a192acc258bdfdb151943419437f440c313"
+    assert fe.md5 == "f4de91152c7ab9fdc2a128f962faebff"
+    assert fe.mimetype == "application/pdf"
+    assert fe.size == 255629
+    assert fe.urls[1].url.startswith("http://journals.plos.org")
+    assert fe.urls[1].rel == "web"
+    assert fe.urls[0].url.startswith("https://web.archive.org/")
+    assert fe.urls[0].rel == "webarchive"
+    assert len(fe.releases) == 1