author    Bryan Newbold <bnewbold@robocracy.org>  2018-09-14 15:44:55 -0700
committer Bryan Newbold <bnewbold@robocracy.org>  2018-09-14 15:44:55 -0700
commit    8b2c46c4d578ba4129316a07dd1e3981520b8921 (patch)
tree      b8b7d6e6c85041dca17873bf7338f2ac9587777a
parent    ba3adecfaf626b92fb5231cc445ce9e07fb5f877 (diff)
download  fatcat-8b2c46c4d578ba4129316a07dd1e3981520b8921.tar.gz
          fatcat-8b2c46c4d578ba4129316a07dd1e3981520b8921.zip
fixes to matched importer (and a test)
-rw-r--r--  python/fatcat/matched_importer.py        | 139
-rw-r--r--  python/tests/files/example_matched.json  |   3
-rw-r--r--  python/tests/matched_importer.py         |  32
3 files changed, 174 insertions(+), 0 deletions(-)
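
For context: the new importer is driven through the line/batch helpers it inherits from FatcatImporter (process_source and process_batch, both exercised by the new test below), which presumably dispatch to the create_row/create_batch hooks defined in this diff. A minimal standalone sketch, assuming a fatcat API instance is running locally; the host URL matches the test fixture, and the input path is illustrative, not part of this commit:

    from fatcat.matched_importer import FatcatMatchedImporter

    # sketch only: feed a file of JSON lines (one object per line) to the importer
    importer = FatcatMatchedImporter("http://localhost:9411/v0")
    with open('example_matched.json', 'r') as f:
        importer.process_source(f)  # per-line create/update
    print(importer.insert_count, importer.update_count)
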
diff --git a/python/fatcat/matched_importer.py b/python/fatcat/matched_importer.py
new file mode 100644
index 00000000..4b82b6b2
--- /dev/null
+++ b/python/fatcat/matched_importer.py
@@ -0,0 +1,139 @@
+
+import sys
+import json
+import sqlite3
+import itertools
+import fatcat_client
+from fatcat.importer_common import FatcatImporter
+
+
+class FatcatMatchedImporter(FatcatImporter):
+    """
+    Input format is JSON, one object per line, with keys:
+    - dois (list)
+    - sha1 (hex)
+    - md5 (hex)
+    - sha256 (hex)
+    - size (int)
+    - mimetype
+    - cdx (single object with keys):
+        - dt
+        - url
+
+    Future handlings/extensions:
+    - core_id, wikidata_id, pmcid, pmid: not as lists
+    - urls (list of strings... or objects?)
+    """
+
+    def __init__(self, host_url, skip_file_update=False, default_mime=None,
+                 default_link_rel="web"):
+        super().__init__(host_url)
+        self.default_mime = default_mime
+        self.default_link_rel = default_link_rel
+        self.skip_file_update = skip_file_update
+
+    def make_url(self, raw):
+        rel = self.default_link_rel
+        # TODO: this is where we could map specific domains to rel types,
+        # and also filter out bad domains, invalid URLs, etc
+        return fatcat_client.FileEntityUrls(url=raw, rel=rel)
+
+    def parse_matched_dict(self, obj):
+        sha1 = obj['sha1']
+        dois = [d.lower() for d in obj.get('dois', [])]
+
+        # lookup sha1, or create new entity
+        fe = None
+        if not self.skip_file_update:
+            try:
+                fe = self.api.lookup_file(sha1=sha1)
+            except fatcat_client.rest.ApiException as err:
+                if err.status != 404:
+                    raise err
+        if fe is None:
+            fe = fatcat_client.FileEntity(
+                sha1=sha1,
+                releases=[],
+                urls=[],
+            )
+
+        # lookup dois
+        re_list = set()
+        for doi in dois:
+            try:
+                re = self.api.lookup_release(doi=doi)
+            except fatcat_client.rest.ApiException as err:
+                if err.status != 404:
+                    raise err
+                re = None
+            if re is None:
+                print("DOI not found: {}".format(doi))
+            else:
+                re_list.add(re.ident)
+        if len(re_list) == 0:
+            return None
+        if set(fe.releases) == re_list:
+            # already linked to exactly these releases; nothing to do
+            return None
+        re_list.update(fe.releases)
+        fe.releases = list(re_list)
+
+        # parse URLs and CDX
+        existing_urls = [feu.url for feu in fe.urls]
+        for url in obj.get('url', []):
+            if url not in existing_urls:
+                url = self.make_url(url)
+                if url is not None:
+                    fe.urls.append(url)
+        if obj.get('cdx') is not None:
+            original = obj['cdx']['url']
+            wayback = "https://web.archive.org/web/{}/{}".format(
+                obj['cdx']['dt'],
+                original)
+            if wayback not in existing_urls:
+                fe.urls.append(
+                    fatcat_client.FileEntityUrls(url=wayback, rel="webarchive"))
+            if original not in existing_urls:
+                url = self.make_url(original)
+                if url is not None:
+                    fe.urls.append(url)
+
+        if obj.get('size') is not None:
+            fe.size = int(obj['size'])
+        fe.sha256 = obj.get('sha256', fe.sha256)
+        fe.md5 = obj.get('md5', fe.md5)
+        if obj.get('mimetype') is None:
+            if fe.mimetype is None:
+                fe.mimetype = self.default_mime
+        else:
+            fe.mimetype = obj.get('mimetype')
+        return fe
+
+    def create_row(self, row, editgroup=None):
+        obj = json.loads(row)
+        fe = self.parse_matched_dict(obj)
+        if fe is not None:
+            if fe.ident is None:
+                self.api.create_file(fe, editgroup=editgroup)
+                self.insert_count = self.insert_count + 1
+            else:
+                self.api.update_file(fe.ident, fe, editgroup=editgroup)
+                self.update_count = self.update_count + 1
+
+    def create_batch(self, batch, editgroup=None):
+        """Reads and processes in batches (not API-call-per-line)"""
+        objects = [self.parse_matched_dict(json.loads(l))
+                   for l in batch if l is not None]
+        new_objects = [o for o in objects if o is not None and o.ident is None]
+        update_objects = [o for o in objects if o is not None and o.ident is not None]
+        for obj in update_objects:
+            self.api.update_file(obj.ident, obj, editgroup=editgroup)
+        if len(new_objects) > 0:
+            self.api.create_file_batch(new_objects, autoaccept="true", editgroup=editgroup)
+        self.update_count = self.update_count + len(update_objects)
+        self.insert_count = self.insert_count + len(new_objects)
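
For reference, the wayback link that parse_matched_dict() builds from a CDX entry follows the standard web.archive.org replay URL layout; a standalone sketch using the first record of the test file added below:

    # values copied from tests/files/example_matched.json
    cdx = {"dt": "20170227164644",
           "url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable"}
    wayback = "https://web.archive.org/web/{}/{}".format(cdx["dt"], cdx["url"])
    # -> https://web.archive.org/web/20170227164644/http://journals.plos.org/...
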
diff --git a/python/tests/files/example_matched.json b/python/tests/files/example_matched.json
new file mode 100644
index 00000000..79db1296
--- /dev/null
+++ b/python/tests/files/example_matched.json
@@ -0,0 +1,3 @@
+{ "dois": ["10.123/abc"], "sha1": "00242a192acc258bdfdb151943419437f440c313", "md5": "f4de91152c7ab9fdc2a128f962faebff", "sha256": "ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362", "size": 255629, "cdx": { "dt": "20170227164644", "url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable" }, "mimetype": "application/pdf" }
+{ "dois": ["10.123/abc"], "sha1": "3f242a192acc258bdfdb151943419437f440c313", "md5": "f4de91152c7ab9fdc2a128f962faebff", "sha256": "ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362", "size": 255629, "cdx": { "dt": "20170227164644", "url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable" }, "mimetype": "application/pdf" }
+{ "dois": ["10.456/1231123"], "sha1": "000000000000258bdfdb151943419437f440c313", "md5": "000000000000b9fdc2a128f962faebff", "sha256": "000000000000620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362", "size": 123, "cdx": { "dt": "20000000164644", "url": "http://no-plos.org/plosme" }, "mimetype": "application/txt" }
diff --git a/python/tests/matched_importer.py b/python/tests/matched_importer.py
new file mode 100644
index 00000000..9cc6aa32
--- /dev/null
+++ b/python/tests/matched_importer.py
@@ -0,0 +1,32 @@
+
+import json
+import pytest
+from fatcat.matched_importer import FatcatMatchedImporter
+
+
+@pytest.fixture(scope="function")
+def matched_importer():
+    yield FatcatMatchedImporter("http://localhost:9411/v0")
+
+# TODO: use API to check that entities actually created...
+def test_matched_importer_batch(matched_importer):
+    with open('tests/files/example_matched.json', 'r') as f:
+        matched_importer.process_batch(f)
+
+def test_matched_importer(matched_importer):
+    with open('tests/files/example_matched.json', 'r') as f:
+        matched_importer.process_source(f)
+
+def test_matched_dict_parse(matched_importer):
+    with open('tests/files/example_matched.json', 'r') as f:
+        raw = json.loads(f.readline())
+        fe = matched_importer.parse_matched_dict(raw)
+        assert fe.sha1 == "00242a192acc258bdfdb151943419437f440c313"
+        assert fe.md5 == "f4de91152c7ab9fdc2a128f962faebff"
+        assert fe.mimetype == "application/pdf"
+        assert fe.size == 255629
+        assert fe.urls[1].url.startswith("http://journals.plos.org")
+        assert fe.urls[1].rel == "web"
+        assert fe.urls[0].url.startswith("https://web.archive.org/")
+        assert fe.urls[0].rel == "webarchive"
+        assert len(fe.releases) == 1
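
The TODO above (using the API to confirm entities were actually created) could reuse the same API client the importer already holds; a minimal sketch, assuming the edits have been accepted so the file entity is visible to lookups:

    def test_matched_importer_lookup(matched_importer):
        # hypothetical follow-up check, not part of this commit
        fe = matched_importer.api.lookup_file(
            sha1="00242a192acc258bdfdb151943419437f440c313")
        assert fe.md5 == "f4de91152c7ab9fdc2a128f962faebff"
        assert len(fe.releases) >= 1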