basic shadow importer

author: Bryan Newbold <bnewbold@robocracy.org> 2019-12-23 17:59:10 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2020-02-13 22:24:20 -0800
commit: e59d1b617d4abd5f002d9e59b6bbaebc9ff30993 (patch)
tree: 902846f104b5679d92a85f2b6e305e6397410265 /python
parent: 07fabec32aada55a75c064e5c1e01a46da30d854 (diff)
download: fatcat-e59d1b617d4abd5f002d9e59b6bbaebc9ff30993.tar.gz
fatcat-e59d1b617d4abd5f002d9e59b6bbaebc9ff30993.zip
5 files changed, 262 insertions, 0 deletions
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index ad4de0e2..1f026edc 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -166,6 +166,11 @@ def run_grobid_metadata(args):
         bezerk_mode=args.bezerk_mode)
     LinePusher(fmi, args.tsv_file).run()
 
+def run_shadow_lib(args):
+    """Run args.json_file through a ShadowLibraryImporter (edit_batch_size=100)."""
+    importer = ShadowLibraryImporter(args.api, edit_batch_size=100)
+    JsonLinePusher(importer, args.json_file).run()
+
 def run_wayback_static(args):
     api = args.api
 
@@ -473,6 +478,16 @@ def main():
         action='store_true',
         help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)")
 
+    # 'shadow-lib': file entities for shadow library files matched to releases
+    sub_shadow_lib = subparsers.add_parser('shadow-lib',
+        help="create or update file entities for shadow library files matched to existing releases")
+    sub_shadow_lib.set_defaults(
+        func=run_shadow_lib,
+        auth_var="FATCAT_API_AUTH_TOKEN",
+    )
+    # nargs='?' is what makes the stdin default apply to a positional argument
+    sub_shadow_lib.add_argument('json_file',
+        help="JSON file to import from (or stdin)",
+        nargs='?', default=sys.stdin, type=argparse.FileType('r'))
+
     sub_wayback_static = subparsers.add_parser('wayback-static',
         help="crude crawl+ingest tool for single-page HTML docs from wayback")
     sub_wayback_static.set_defaults(
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index d936605f..10557ef8 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -28,3 +28,4 @@ from .arabesque import ArabesqueMatchImporter, ARABESQUE_MATCH_WHERE_CLAUSE
 from .wayback_static import auto_wayback_static
 from .cdl_dash_dat import auto_cdl_dash_dat
 from .ingest import IngestFileResultImporter, SavePaperNowFileImporter
+from .shadow import ShadowLibraryImporter
diff --git a/python/fatcat_tools/importers/shadow.py b/python/fatcat_tools/importers/shadow.py
new file mode 100644
index 00000000..21a18837
--- /dev/null
+++ b/python/fatcat_tools/importers/shadow.py
@@ -0,0 +1,175 @@
+
+import sys
+import json
+import sqlite3
+import itertools
+import fatcat_openapi_client
+
+from fatcat_tools.normal import *
+from .common import EntityImporter, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS
+
+
+class ShadowLibraryImporter(EntityImporter):
+    """
+    Importer for shadow library files (matched to releases)
+
+    Input format is JSON with keys:
+    - shadow
+        - shadow_corpus (string slug)
+        - shadow_id (string)
+        - doi
+        - pmid
+        - isbn13
+    - file_meta
+        - sha1hex
+        - sha256hex
+        - md5hex
+        - size_bytes
+        - mimetype
+    - cdx (may be null)
+        - url
+        - datetime
+
+    Records whose identifiers match no existing release are skipped. A file
+    whose SHA-1 is already in the catalog is updated in place (URLs, release
+    ids and extra['shadows'] merged) unless it already carries an id from
+    the same shadow corpus.
+    """
+
+    def __init__(self, api, **kwargs):
+        """
+        api: client used for release/file lookups and file updates
+        kwargs: passed on to EntityImporter. `editgroup_description` and
+            `editgroup_extra` (which gets an 'agent' key if it has none) are
+            given defaults here; `default_link_rel` (default "web") is
+            handed to make_rel_url() when building URLs from CDX rows.
+        """
+
+        eg_desc = kwargs.pop('editgroup_description', None) or "Import of 'Shadow Library' file/release matches"
+        eg_extra = kwargs.pop('editgroup_extra', None)
+        if eg_extra is None:
+            # tolerate an explicit editgroup_extra=None as well as a missing key
+            eg_extra = dict()
+        eg_extra.setdefault('agent', 'fatcat_tools.ShadowLibraryImporter')
+        super().__init__(api,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra,
+            **kwargs)
+        self.default_link_rel = kwargs.get("default_link_rel", "web")
+
+    def want(self, raw_record):
+        """Accept every record; filtering happens in parse_record()."""
+        return True
+
+    def parse_record(self, obj):
+        """
+        We do the release lookup in this method. Try DOI, then PMID, last ISBN13.
+
+        obj: one decoded input record (see class docstring for the keys)
+
+        Returns a FileEntity linked to the matched release, or None (after
+        bumping the 'skip-release-not-found' counter) when no identifier
+        resolves to a release. API errors other than 400/404 propagate.
+        """
+
+        shadow = obj['shadow']
+        shadow_corpus = shadow['shadow_corpus']
+        assert shadow_corpus == shadow_corpus.strip().lower()
+        # a missing/null shadow_id fails the assert below instead of raising
+        # AttributeError on None.strip()
+        shadow_id = (shadow.get('shadow_id') or '').strip()
+        assert shadow_id
+
+        # cleaned external identifiers, in lookup priority order; drop blanks
+        ext_ids = [
+            ('doi', clean_doi(shadow.get('doi'))),
+            ('pmid', clean_pmid(shadow.get('pmid'))),
+            ('isbn13', clean_isbn13(shadow.get('isbn13'))),
+        ]
+        ext_ids = [(ext_type, ext_id) for (ext_type, ext_id) in ext_ids if ext_id]
+
+        # NOTE: the '<corpus>_id' key must stay first; try_update() uses the
+        # first key to detect a previous import from the same corpus
+        extra = { '{}_id'.format(shadow_corpus): shadow_id }
+        for (ext_type, ext_id) in ext_ids:
+            extra['{}_{}'.format(shadow_corpus, ext_type)] = ext_id
+
+        # lookup release via several idents; first hit wins
+        release = None
+        for (ext_type, ext_id) in ext_ids:
+            try:
+                release = self.api.lookup_release(**{ext_type: ext_id})
+            except fatcat_openapi_client.rest.ApiException as err:
+                # 404/400 are treated as "no match for this identifier"
+                if err.status not in (404, 400):
+                    raise
+                release = None
+            if release:
+                break
+
+        if not release:
+            self.counts['skip-release-not-found'] += 1
+            return None
+
+        release_ids = [release.ident,]
+
+        # parse single CDX into URLs (if exists): the original URL (when
+        # make_rel_url() yields one) plus a wayback replay URL
+        urls = []
+        cdx = obj.get('cdx')
+        if cdx:
+            url = make_rel_url(cdx['url'], default_link_rel=self.default_link_rel)
+            if url is not None:
+                urls.append(url)
+            wayback = "https://web.archive.org/web/{}/{}".format(
+                cdx['datetime'],
+                cdx['url'])
+            urls.append(("webarchive", wayback))
+        urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls]
+
+        file_meta = obj['file_meta']
+        fe = fatcat_openapi_client.FileEntity(
+            md5=file_meta['md5hex'],
+            sha1=file_meta['sha1hex'],
+            sha256=file_meta['sha256hex'],
+            size=int(file_meta['size_bytes']),
+            mimetype=file_meta['mimetype'] or None,
+            release_ids=release_ids,
+            urls=urls,
+            extra=dict(shadows=extra),
+        )
+        return fe
+
+    def try_update(self, fe):
+        """
+        Look up `fe` by SHA-1 and, if a file entity already exists, merge
+        this record into it via an update edit.
+
+        Returns True when no existing file was found (presumably telling
+        EntityImporter to queue `fe` for insert_batch() -- confirm against
+        the base class), and False in every other case (already imported,
+        skipped, or updated in place). API errors other than 404 propagate.
+        """
+        # lookup sha1, or create new entity
+        existing = None
+        try:
+            existing = self.api.lookup_file(sha1=fe.sha1)
+        except fatcat_openapi_client.rest.ApiException as err:
+            if err.status != 404:
+                raise
+
+        if not existing:
+            return True
+
+        # existing entities may come back without any 'extra' at all
+        if not existing.extra:
+            existing.extra = dict()
+
+        # first key of fe.extra['shadows'] is '<corpus>_id' (see parse_record)
+        corpus_id_key = next(iter(fe.extra['shadows']))
+        if existing.extra.get('shadows') and corpus_id_key in existing.extra['shadows']:
+            # already imported from this shadow library; skip
+            self.counts['exists'] += 1
+            return False
+
+        # check for edit conflicts
+        if existing.ident in [e.ident for e in self._edits_inflight]:
+            self.counts['skip-update-inflight'] += 1
+            return False
+        if fe.sha1 in [e.sha1 for e in self._edits_inflight]:
+            raise Exception("Inflight insert; shouldn't happen")
+
+        # minimum viable "existing" URL cleanup to fix dupes and broken links:
+        # remove 'None' wayback URLs, and set archive.org rel 'archive'
+        existing.urls = [u for u in (existing.urls or [])
+                         if '://web.archive.org/web/None/' not in u.url]
+        for u in existing.urls:
+            if u.rel == 'repository' and '://archive.org/download/' in u.url:
+                u.rel = 'archive'
+
+        # merge the existing into this one and update; dict.fromkeys() drops
+        # duplicate (rel, url) pairs while keeping a deterministic order
+        merged_urls = dict.fromkeys(
+            (u.rel, u.url) for u in existing.urls + (fe.urls or []))
+        existing.urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in merged_urls]
+        if not existing.extra.get('shadows'):
+            existing.extra['shadows'] = fe.extra['shadows']
+        else:
+            existing.extra['shadows'].update(fe.extra['shadows'])
+
+        # do these "plus ones" because we really want to do these updates when possible
+        if len(existing.urls) > SANE_MAX_URLS + 1:
+            self.counts['skip-update-too-many-url'] += 1
+            return False
+        existing.release_ids = list(dict.fromkeys(
+            (existing.release_ids or []) + (fe.release_ids or [])))
+        if len(existing.release_ids) > SANE_MAX_RELEASES + 1:
+            self.counts['skip-update-too-many-releases'] += 1
+            return False
+
+        # only fill in metadata the existing entity is missing
+        existing.mimetype = existing.mimetype or fe.mimetype
+        existing.size = existing.size or fe.size
+        existing.md5 = existing.md5 or fe.md5
+        existing.sha1 = existing.sha1 or fe.sha1
+        existing.sha256 = existing.sha256 or fe.sha256
+        edit = self.api.update_file(self.get_editgroup_id(), existing.ident, existing)
+        self._edits_inflight.append(edit)
+        self.counts['update'] += 1
+        return False
+
+    def insert_batch(self, batch):
+        """
+        Create all file entities in `batch` with one auto-batch API call,
+        under a new editgroup carrying this importer's description/extra.
+        """
+        self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch(
+            editgroup=fatcat_openapi_client.Editgroup(
+                description=self.editgroup_description,
+                extra=self.editgroup_extra),
+            entity_list=batch))
+
diff --git a/python/tests/files/example_shadow.json b/python/tests/files/example_shadow.json
new file mode 100644
index 00000000..f84a61a5
--- /dev/null
+++ b/python/tests/files/example_shadow.json
@@ -0,0 +1,12 @@
+{"shadow":{"shadow_corpus":"scimag","shadow_id":"8149931","sha1hex":"000008bc38cb80636b647b38653fc1574936c03e","doi":"10.1371/journal.pmed.0020124","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"000008bc38cb80636b647b38653fc1574936c03e","sha256hex":"18b341119bbbf297a7dfa21aca86211da446617600baa153df70b4209c2c6e84","md5hex":"629e84885be85bc8d88345b98cffa0b0","size_bytes":39955,"mimetype":null},"cdx":{"url":"https://link.springer.com/content/pdf/10.1007%2Fs11626-008-9119-8.pdf","datetime":"20180729135948","sha1hex":"000008bc38cb80636b647b38653fc1574936c03e","cdx_sha1hex":null,"mimetype":"application/pdf","warc_path":"UNPAYWALL-PDF-CRAWL-2018-07-20180729132538992-15980-16048-wbgrp-svc281/UNPAYWALL-PDF-CRAWL-2018-07-20180729135708800-16009-11693~wbgrp-svc281.us.archive.org~8443.warc.gz","warc_csize":32497,"warc_offset":105265425,"row_created":"2019-08-09T23:25:44.571943+00:00"}}
+ {"shadow":{"shadow_corpus":"scimag","shadow_id":"33139096","sha1hex":"00000c4296e2c5f8f70ab265c683235fbf5e354b","doi":"10.0000/cyberleninka.ru/article/n/analiz-primeneniya-fazochastotnyh-algoritmov-proslezhivaniya-signalov-dlya-izmereniya-urovnya-zhidkosti-v-neftedobyvayuschih","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"00000c4296e2c5f8f70ab265c683235fbf5e354b","sha256hex":"99f15c58c2343f46c8cae75ff01c11b1b9e3c6d2f57189ec78df94e234b2c633","md5hex":"488681b249f6e9292bcde1fab1422550","size_bytes":182449,"mimetype":null},"cdx":{"url":"http://www.lib.tpu.ru/fulltext/v/Bulletin_TPU/2011/v319/i5/12.pdf","datetime":"20180412144307","sha1hex":"00000c4296e2c5f8f70ab265c683235fbf5e354b","cdx_sha1hex":null,"mimetype":"application/pdf","warc_path":"OA-JOURNAL-TESTCRAWL-TWO-2018-20180412133030095-00799-00808-wbgrp-svc284/OA-JOURNAL-TESTCRAWL-TWO-2018-20180412142334247-00807-23249~wbgrp-svc284.us.archive.org~8443.warc.gz","warc_csize":126165,"warc_offset":924893749,"row_created":"2019-08-09T05:16:39.785581+00:00"}}
+ {"shadow":{"shadow_corpus":"scimag","shadow_id":"33139096","sha1hex":"00000c4296e2c5f8f70ab265c683235fbf5e354b","doi":null,"pmid":"54321","isbn13":null},"file_meta":{"sha1hex":"00000c4296e2c5f8f70ab265c683235fbf5e354b","sha256hex":"99f15c58c2343f46c8cae75ff01c11b1b9e3c6d2f57189ec78df94e234b2c633","md5hex":"488681b249f6e9292bcde1fab1422550","size_bytes":182449,"mimetype":null},"cdx":{"url":"https://cyberleninka.ru/article/n/analiz-primeneniya-fazochastotnyh-algoritmov-proslezhivaniya-signalov-dlya-izmereniya-urovnya-zhidkosti-v-neftedobyvayuschih.pdf","datetime":"20180506175847","sha1hex":"00000c4296e2c5f8f70ab265c683235fbf5e354b","cdx_sha1hex":null,"mimetype":"application/pdf","warc_path":"OA-JOURNAL-TESTCRAWL-TWO-2018-20180506171133875-05766-05775-wbgrp-svc284/OA-JOURNAL-TESTCRAWL-TWO-2018-20180506174415763-05771-23249~wbgrp-svc284.us.archive.org~8443.warc.gz","warc_csize":126144,"warc_offset":532659301,"row_created":"2019-08-09T05:16:39.785581+00:00"}}
+ {"shadow":{"shadow_corpus":"scimag","shadow_id":"21389391","sha1hex":"00000d5508d7d7106560ade65c33c628c54d7c75","doi":"10.1038/nn.3419","pmid":"23727820","isbn13":null},"file_meta":{"sha1hex":"00000d5508d7d7106560ade65c33c628c54d7c75","sha256hex":"8c48dd68b974ed117f839dc88db44884e7e1df9ddef30f26c541437d7f390d96","md5hex":"c2a43160b62ef0f13256c789270ec2a9","size_bytes":1375452,"mimetype":null},"cdx":{"url":"https://www.janelia.org/sites/default/files/Library/nn.3419.pdf","datetime":"20170829032635","sha1hex":"00000d5508d7d7106560ade65c33c628c54d7c75","cdx_sha1hex":null,"mimetype":"application/pdf","warc_path":"SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170829031124939-00100-00109-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170829032404137-00107-3480~wbgrp-svc284.us.archive.org~8443.warc.gz","warc_csize":973733,"warc_offset":262621802,"row_created":"2019-08-09T05:21:39.486744+00:00"}}
+ {"shadow":{"shadow_corpus":"scimag","shadow_id":"16462885","sha1hex":"0000102db78329a149d3b6319f6ccf8cc90483e2","doi":"10.1016/j.cell.2007.04.022","pmid":"17482536","isbn13":null},"file_meta":{"sha1hex":"0000102db78329a149d3b6319f6ccf8cc90483e2","sha256hex":null,"md5hex":"995e7145d09d50eadccf322780e474d3","size_bytes":206812,"mimetype":"application/pdf"},"cdx":{"url":"http://publisher-connector.core.ac.uk/resourcesync/data/elsevier/pdf/212/aHR0cDovL2FwaS5lbHNldmllci5jb20vY29udGVudC9hcnRpY2xlL3BpaS9zMDA5Mjg2NzQwNzAwNTI4NA%3D%3D.pdf","datetime":"20170925031039","sha1hex":"0000102db78329a149d3b6319f6ccf8cc90483e2","cdx_sha1hex":null,"mimetype":"application/pdf","warc_path":"TARGETED-PDF-CRAWL-2017-08-04-20170925022437721-01046-01055-wbgrp-svc284/TARGETED-PDF-CRAWL-2017-08-04-20170925024811552-01048-15075~wbgrp-svc284.us.archive.org~8443.warc.gz","warc_csize":188232,"warc_offset":677858259,"row_created":"2019-08-10T03:02:21.656961+00:00"}}
+{"shadow":{"shadow_corpus":"scimag","shadow_id":"12703034","sha1hex":"0000002922264275f11cca7b1c3fb662070d0dd7","doi":"10.1007/s11061-011-9281-1","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"0000002922264275f11cca7b1c3fb662070d0dd7","sha256hex":"b4728210cc0f70d8a8f8c39bd97fcbbab3eaca4309ac4bdfbce5df3b66c82f79","md5hex":"debd8db178fa08a7a0aaec6e42832a8e","size_bytes":206121,"mimetype":null},"cdx":null}
+ {"shadow":{"shadow_corpus":"scimag","shadow_id":"51052483","sha1hex":"00000119fa780ce368ebd96563afdb3eebb90ad3","doi":"10.1191/0266355403gh289oa","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"00000119fa780ce368ebd96563afdb3eebb90ad3","sha256hex":"57ce460db4410b9bfaf500ed652fd29e64d46b40c17e28f1156ba03736edf91b","md5hex":"96133eec3a6c533993213e7bdf446251","size_bytes":164344,"mimetype":null},"cdx":null}
+ {"shadow":{"shadow_corpus":"scimag","shadow_id":"2476283","sha1hex":"0000017a31547caf347fab66282a40831b9ceb08","doi":"10.1016/0042-207x(62)90512-2","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"0000017a31547caf347fab66282a40831b9ceb08","sha256hex":"e8d0c607b024ff6ffd58a35f76c454844b70ad19fe3f78a573af1ae53f53ad9d","md5hex":"b53318522b9f35a42b7e53f150fe70b2","size_bytes":116735,"mimetype":null},"cdx":null}
+ {"shadow":{"shadow_corpus":"scimag","shadow_id":"8760871","sha1hex":"000001abf3dbf936d5053d14f41699722531b8c6","doi":"10.1016/s0042-207x(79)80945-8","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"000001abf3dbf936d5053d14f41699722531b8c6","sha256hex":"8a69b4a6dff98682ad43e7d4139221c1557c1bd202b615490af8a2c7dcbb71d2","md5hex":"29e1cfac8ecfbc8be57a1ec8b465c4be","size_bytes":138218,"mimetype":null},"cdx":null}
+ {"shadow":{"shadow_corpus":"scimag","shadow_id":"11473618","sha1hex":"0000022e387be46ef797f6686d36c9899cbd6856","doi":"10.1038/ng.2339","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"0000022e387be46ef797f6686d36c9899cbd6856","sha256hex":"a72517e8e72d78bc07a6ef7ff3a6d1d3e04325df986cb8f1bbb4e809f7a9dbdd","md5hex":"9cb8a6e056c9cc740d3bed0c50cd53dc","size_bytes":80992,"mimetype":null},"cdx":null}
+ {"shadow":{"shadow_corpus":"scimag","shadow_id":"47301218","sha1hex":"0000029209536bda5f22e5110e573c5bd8ceb43a","doi":"10.2307/23406551","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"0000029209536bda5f22e5110e573c5bd8ceb43a","sha256hex":"315f1d39a00ccf256fa15d92a14869dbda48d31500989aaacb11368f906a5827","md5hex":"8141b42ec3bb41fa87099633a1b61d93","size_bytes":305236,"mimetype":null},"cdx":null}
+ {"shadow":{"shadow_corpus":"scimag","shadow_id":"30603850","sha1hex":"000002c1abd521f18aa23d9e8f464e697e218ab1","doi":"10.1109/spire.1998.712983","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"000002c1abd521f18aa23d9e8f464e697e218ab1","sha256hex":"777e2c472e9d2fec3bbd26bad788562cf1e08e5850315c25cfb6e46d38e7e4af","md5hex":"3a3c92fabaf6cf437bb596d9e9255ff6","size_bytes":113768,"mimetype":null},"cdx":null}
diff --git a/python/tests/import_shadow.py b/python/tests/import_shadow.py
new file mode 100644
index 00000000..30e1724f
--- /dev/null
+++ b/python/tests/import_shadow.py
@@ -0,0 +1,59 @@
+
+import json
+import pytest
+from fatcat_tools.importers import ShadowLibraryImporter, JsonLinePusher
+from fixtures import api
+
+
+@pytest.fixture(scope="function")
+def shadow_importer(api):
+    """Yield a ShadowLibraryImporter with default options; rebuilt for every test function."""
+    yield ShadowLibraryImporter(api)
+
+# TODO: use API to check that entities actually created...
+def test_shadow_importer_basic(shadow_importer):
+    """Smoke test: push the whole fixture file through the importer; no result checks."""
+    fixture_path = 'tests/files/example_shadow.json'
+    with open(fixture_path, 'r') as handle:
+        pusher = JsonLinePusher(shadow_importer, handle)
+        pusher.run()
+
+def test_shadow_importer(shadow_importer):
+    """
+    Import the fixture file twice: once with bezerk_mode on, then again
+    after reset() with bezerk_mode off. Checks the counters of both runs
+    and the editgroup metadata recorded for the first one.
+    """
+    fixture_path = 'tests/files/example_shadow.json'
+    client = shadow_importer.api
+    prev_index = client.get_changelog(limit=1)[0].index
+
+    # first pass: two records are expected to be inserted, ten skipped
+    shadow_importer.bezerk_mode = True
+    with open(fixture_path, 'r') as handle:
+        first_counts = JsonLinePusher(shadow_importer, handle).run()
+    assert first_counts['insert'] == 2
+    assert first_counts['exists'] == 0
+    assert first_counts['skip'] == 10
+
+    # fetch most recent editgroup
+    entry = client.get_changelog_entry(index=prev_index + 1)
+    editgroup = entry.editgroup
+    assert editgroup.description
+    assert "shadow library" in editgroup.description.lower()
+    assert editgroup.extra['git_rev']
+    assert "fatcat_tools.ShadowLibraryImporter" in editgroup.extra['agent']
+
+    # re-insert; should skip
+    shadow_importer.reset()
+    shadow_importer.bezerk_mode = False
+    with open(fixture_path, 'r') as handle:
+        second_counts = JsonLinePusher(shadow_importer, handle).run()
+    assert second_counts['insert'] == 0
+    assert second_counts['exists'] == 2
+    assert second_counts['skip'] == 10
+
+def test_shadow_dict_parse(shadow_importer):
+    """Parse the first fixture record and check the fields of the resulting file entity."""
+    with open('tests/files/example_shadow.json', 'r') as f:
+        raw = json.loads(f.readline())
+
+    # separate name: don't rebind the file handle `f` to the parsed entity
+    fe = shadow_importer.parse_record(raw)
+    assert fe.sha1 == "000008bc38cb80636b647b38653fc1574936c03e"
+    assert fe.md5 == "629e84885be85bc8d88345b98cffa0b0"
+    assert fe.mimetype is None  # fixture record has "mimetype": null in file_meta
+    assert fe.size == 39955
+    assert len(fe.urls) == 2
+    # parse_record() always appends a wayback URL when the record has a cdx row;
+    # without this check the per-rel assertions below could pass vacuously
+    assert any(u.rel == "webarchive" for u in fe.urls)
+    for u in fe.urls:
+        if u.rel == "publisher":
+            assert u.url.startswith("https://link.springer.com/content/pdf/10.1007%2Fs11626-008-9119-8.pdf")
+        if u.rel == "webarchive":
+            assert u.url.startswith("https://web.archive.org/")
+            assert "20180729135948" in u.url
+    assert len(fe.release_ids) == 1
+
author	Bryan Newbold <bnewbold@robocracy.org>	2019-12-23 17:59:10 -0800
committer	Bryan Newbold <bnewbold@robocracy.org>	2020-02-13 22:24:20 -0800
commit	e59d1b617d4abd5f002d9e59b6bbaebc9ff30993 (patch)
tree	902846f104b5679d92a85f2b6e305e6397410265 /python
parent	07fabec32aada55a75c064e5c1e01a46da30d854 (diff)
download	fatcat-e59d1b617d4abd5f002d9e59b6bbaebc9ff30993.tar.gz fatcat-e59d1b617d4abd5f002d9e59b6bbaebc9ff30993.zip