summaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2020-02-19 17:32:27 -0800
committer Bryan Newbold <bnewbold@robocracy.org> 2020-02-19 17:32:27 -0800
commit f6ef597e3fb9b6f89dc054960725a2cfb7ded851 (patch)
tree 622d225509d94bf3a76c1fd731d5a874e9a23cc0 /python
parent 64d3c5475921a8024b083558ca96bb15e27f48c1 (diff)
parent 016d6d28c24f616897bdb7587205cfe2cc32ec89 (diff)
download fatcat-f6ef597e3fb9b6f89dc054960725a2cfb7ded851.tar.gz
fatcat-f6ef597e3fb9b6f89dc054960725a2cfb7ded851.zip
Merge branch 'bnewbold-shadow-import'
Diffstat (limited to 'python')
-rwxr-xr-x python/fatcat_import.py 15
-rw-r--r-- python/fatcat_tools/importers/__init__.py 1
-rw-r--r-- python/fatcat_tools/importers/shadow.py 195
-rw-r--r-- python/fatcat_web/entity_helpers.py 4
-rw-r--r-- python/fatcat_web/templates/release_view.html 5
-rw-r--r-- python/tests/files/example_shadow.json 10
-rw-r--r-- python/tests/import_shadow.py 61
7 files changed, 289 insertions, 2 deletions
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index ad4de0e2..843685aa 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -166,6 +166,11 @@ def run_grobid_metadata(args):
bezerk_mode=args.bezerk_mode)
LinePusher(fmi, args.tsv_file).run()
+def run_shadow_lib(args):
+    """Run the shadow library importer over the JSON lines in args.json_file."""
+    importer = ShadowLibraryImporter(args.api, edit_batch_size=100)
+    pusher = JsonLinePusher(importer, args.json_file)
+    pusher.run()
+
def run_wayback_static(args):
api = args.api
@@ -473,6 +478,16 @@ def main():
action='store_true',
help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)")
+    # 'shadow-lib' sub-command: dispatches to run_shadow_lib()
+    sub_shadow_lib = subparsers.add_parser('shadow-lib',
+        help="create file entities from shadow library metadata, matched to existing releases")
+    sub_shadow_lib.set_defaults(
+        func=run_shadow_lib,
+        auth_var="FATCAT_AUTH_WORKER_SHADOW",
+    )
+    # nargs='?' makes the positional optional; without it argparse always
+    # requires the argument and default=sys.stdin is never used
+    sub_shadow_lib.add_argument('json_file',
+        help="JSON file to import from (or stdin)",
+        nargs='?',
+        default=sys.stdin, type=argparse.FileType('r'))
+
sub_wayback_static = subparsers.add_parser('wayback-static',
help="crude crawl+ingest tool for single-page HTML docs from wayback")
sub_wayback_static.set_defaults(
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index d936605f..10557ef8 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -28,3 +28,4 @@ from .arabesque import ArabesqueMatchImporter, ARABESQUE_MATCH_WHERE_CLAUSE
from .wayback_static import auto_wayback_static
from .cdl_dash_dat import auto_cdl_dash_dat
from .ingest import IngestFileResultImporter, SavePaperNowFileImporter
+from .shadow import ShadowLibraryImporter
diff --git a/python/fatcat_tools/importers/shadow.py b/python/fatcat_tools/importers/shadow.py
new file mode 100644
index 00000000..4cd22775
--- /dev/null
+++ b/python/fatcat_tools/importers/shadow.py
@@ -0,0 +1,195 @@
+
+import sys
+import json
+import sqlite3
+import itertools
+import fatcat_openapi_client
+
+from fatcat_tools.normal import *
+from .common import EntityImporter, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS
+
+
+class ShadowLibraryImporter(EntityImporter):
+    """
+    Importer for shadow library files (matched to releases)
+
+    Input format is JSON with keys:
+    - shadow
+        - shadow_corpus (string slug)
+        - shadow_id (string)
+        - doi
+        - pmid
+        - isbn13
+    - file_meta
+        - sha1hex
+        - sha256hex
+        - md5hex
+        - size_bytes
+        - mimetype
+    - cdx (may be null)
+        - url
+        - datetime
+    """
+
+    def __init__(self, api, **kwargs):
+        """
+        Fill in a default editgroup description and 'agent' extra field
+        (unless the caller supplied them), then pass everything on to
+        EntityImporter.
+        """
+        eg_desc = kwargs.pop('editgroup_description', None) or "Import of 'Shadow Library' file/release matches"
+        # copy, so a dict passed in by the caller is not mutated
+        eg_extra = dict(kwargs.pop('editgroup_extra', None) or {})
+        eg_extra.setdefault('agent', 'fatcat_tools.ShadowLibraryImporter')
+        super().__init__(api,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra,
+            **kwargs)
+        self.default_link_rel = kwargs.get("default_link_rel", "web")
+
+    def want(self, raw_record):
+        """
+        Only want to import PDF records with complete file-level metadata.
+
+        Returns False (and bumps a 'skip-*' counter) when the mimetype, size
+        or any of the three hashes is missing, or when the file is not a PDF.
+        """
+        fm = raw_record.get('file_meta') or {}
+        # sha1hex is required too: try_update() looks up existing files by SHA-1
+        required = ('mimetype', 'md5hex', 'sha1hex', 'sha256hex', 'size_bytes')
+        if not all(fm.get(key) for key in required):
+            self.counts['skip-file-meta-incomplete'] += 1
+            return False
+        if fm['mimetype'] != 'application/pdf':
+            self.counts['skip-not-pdf'] += 1
+            return False
+        return True
+
+    def parse_record(self, obj):
+        """
+        We do the release lookup in this method. Try DOI, then PMID, last ISBN13.
+
+        Returns a FileEntity linked to the first release found, or None
+        (counted as 'skip-release-not-found') when no identifier matches.
+
+        Raises ValueError if shadow_corpus is not a stripped, lower-case slug
+        or if shadow_id is missing/empty. API errors other than 404/400 are
+        re-raised.
+        """
+        shadow = obj['shadow']
+        shadow_corpus = shadow['shadow_corpus']
+        if shadow_corpus != shadow_corpus.strip().lower():
+            raise ValueError("shadow_corpus is not a normalized slug: {!r}".format(shadow_corpus))
+        shadow_id = (shadow.get('shadow_id') or '').strip()
+        if not shadow_id:
+            raise ValueError("record is missing shadow_id")
+
+        # identifiers in lookup priority order; ones that clean to nothing are dropped
+        ext_ids = [
+            ('doi', clean_doi(shadow.get('doi'))),
+            ('pmid', clean_pmid(shadow.get('pmid'))),
+            ('isbn13', clean_isbn13(shadow.get('isbn13'))),
+        ]
+        ext_ids = [(ext_type, ext_id) for (ext_type, ext_id) in ext_ids if ext_id]
+
+        # stored under extra['shadows'], with every key prefixed by the corpus slug
+        extra = {'{}_id'.format(shadow_corpus): shadow_id}
+        for (ext_type, ext_id) in ext_ids:
+            extra['{}_{}'.format(shadow_corpus, ext_type)] = ext_id
+
+        # lookup release via several idents; the first hit wins
+        release = None
+        for (ext_type, ext_id) in ext_ids:
+            try:
+                release = self.api.lookup_release(**{ext_type: ext_id})
+            except fatcat_openapi_client.rest.ApiException as err:
+                # 404 and 400 are treated as "no match for this identifier"
+                if err.status not in (404, 400):
+                    raise
+                release = None
+            if release:
+                break
+
+        if not release:
+            self.counts['skip-release-not-found'] += 1
+            return None
+
+        # parse single CDX into URLs (if exists)
+        urls = []
+        cdx = obj.get('cdx')
+        if cdx:
+            url = make_rel_url(cdx['url'], default_link_rel=self.default_link_rel)
+            if url is not None:
+                urls.append(url)
+            wayback = "https://web.archive.org/web/{}/{}".format(
+                cdx['datetime'],
+                cdx['url'])
+            urls.append(("webarchive", wayback))
+        urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls]
+
+        file_meta = obj['file_meta']
+        return fatcat_openapi_client.FileEntity(
+            md5=file_meta['md5hex'],
+            sha1=file_meta['sha1hex'],
+            sha256=file_meta['sha256hex'],
+            size=int(file_meta['size_bytes']),
+            mimetype=file_meta['mimetype'] or None,
+            release_ids=[release.ident],
+            urls=urls,
+            extra=dict(shadows=extra),
+        )
+
+    def try_update(self, fe):
+        """
+        Look up an existing file by SHA-1 and merge this record into it.
+
+        Returns True only when no file with this SHA-1 exists yet (nothing is
+        written here in that case; presumably the caller then queues the
+        entity for insert_batch -- the base class is not visible in this
+        file). Returns False in every other case: the file already carries
+        metadata from this shadow corpus ('exists'), an edit to it is already
+        in flight, the merged entity would exceed the URL/release sanity
+        limits, or the existing file was updated in place ('update').
+        """
+        # lookup sha1, or create new entity
+        existing = None
+        try:
+            existing = self.api.lookup_file(sha1=fe.sha1)
+        except fatcat_openapi_client.rest.ApiException as err:
+            if err.status != 404:
+                raise
+
+        if not existing:
+            return True
+
+        if not existing.extra:
+            existing.extra = {}
+
+        # the first key of fe.extra['shadows'] is always '<corpus>_id'
+        # (parse_record() inserts it first)
+        corpus_id_key = next(iter(fe.extra['shadows']))
+        if existing.extra.get('shadows') and corpus_id_key in existing.extra['shadows']:
+            # already imported from this shadow library; skip
+            self.counts['exists'] += 1
+            return False
+
+        # check for edit conflicts
+        if existing.ident in [e.ident for e in self._edits_inflight]:
+            self.counts['skip-update-inflight'] += 1
+            return False
+        if fe.sha1 in [e.sha1 for e in self._edits_inflight]:
+            raise Exception("Inflight insert; shouldn't happen")
+
+        # minimum viable "existing" URL cleanup to fix dupes and broken links:
+        # remove 'None' wayback URLs, and set archive.org rel 'archive'
+        existing.urls = [u for u in (existing.urls or [])
+                         if '://web.archive.org/web/None/' not in u.url]
+        for u in existing.urls:
+            if u.rel == 'repository' and '://archive.org/download/' in u.url:
+                u.rel = 'archive'
+            if u.rel == 'social':
+                u.rel = 'academicsocial'
+
+        # merge the existing into this one and update; on a duplicate URL the
+        # existing entry wins because it is written last
+        merged_urls = {}
+        for u in fe.urls + existing.urls:
+            merged_urls[u.url] = u
+        existing.urls = list(merged_urls.values())
+        if not existing.extra.get('shadows'):
+            existing.extra['shadows'] = fe.extra['shadows']
+        else:
+            existing.extra['shadows'].update(fe.extra['shadows'])
+
+        # do these "plus ones" because we really want to do these updates when possible
+        if len(existing.urls) > SANE_MAX_URLS + 1:
+            self.counts['skip-update-too-many-url'] += 1
+            return False
+        existing.release_ids = list(set(fe.release_ids + (existing.release_ids or [])))
+        if len(existing.release_ids) > SANE_MAX_RELEASES + 1:
+            self.counts['skip-update-too-many-releases'] += 1
+            return False
+        # only fill in file metadata that the existing entity lacks
+        existing.mimetype = existing.mimetype or fe.mimetype
+        existing.size = existing.size or fe.size
+        existing.md5 = existing.md5 or fe.md5
+        existing.sha1 = existing.sha1 or fe.sha1
+        existing.sha256 = existing.sha256 or fe.sha256
+        edit = self.api.update_file(self.get_editgroup_id(), existing.ident, existing)
+        # add sha1 to non-entity edit row, so we can do more aggressive
+        # group-level de-dupe
+        edit.sha1 = existing.sha1
+        self._edits_inflight.append(edit)
+        self.counts['update'] += 1
+        return False
+
+    def insert_batch(self, batch):
+        """
+        Create all file entities in `batch` with a single auto-batch API
+        call, using this importer's editgroup description and extra.
+        """
+        self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch(
+            editgroup=fatcat_openapi_client.Editgroup(
+                description=self.editgroup_description,
+                extra=self.editgroup_extra),
+            entity_list=batch))
+
diff --git a/python/fatcat_web/entity_helpers.py b/python/fatcat_web/entity_helpers.py
index af0fea83..591dda80 100644
--- a/python/fatcat_web/entity_helpers.py
+++ b/python/fatcat_web/entity_helpers.py
@@ -53,6 +53,10 @@ def enrich_release_entity(entity):
entity._es = release_to_elasticsearch(entity, force_bool=False)
if entity.container and entity.container.state == "active":
entity.container._es = container_to_elasticsearch(entity.container, force_bool=False)
+ if entity.files:
+ # remove shadows-only files with no URLs
+ entity.files = [f for f in entity.files
+ if not (f.extra and f.extra.get('shadows') and not f.urls)]
if entity.filesets:
for fs in entity.filesets:
fs._total_size = sum([f.size for f in fs.manifest])
diff --git a/python/fatcat_web/templates/release_view.html b/python/fatcat_web/templates/release_view.html
index 83ecd1c8..961b4759 100644
--- a/python/fatcat_web/templates/release_view.html
+++ b/python/fatcat_web/templates/release_view.html
@@ -196,8 +196,9 @@
</tbody>
</table>
{% else %}
-<p>There are no known files associated with this release (you could try
-<a href="/work/{{ release.work_id }}">other releases for this work?</a>).
+<p>There are no accessible files associated with this release. You could check
+<a href="/work/{{ release.work_id }}">other releases for this work</a> for an
+accessible version.
{% endif %}
{% endif %}
diff --git a/python/tests/files/example_shadow.json b/python/tests/files/example_shadow.json
new file mode 100644
index 00000000..3386f481
--- /dev/null
+++ b/python/tests/files/example_shadow.json
@@ -0,0 +1,10 @@
+{"shadow":{"shadow_corpus":"scimag","shadow_id":"12703034","sha1hex":"0000002922264275f11cca7b1c3fb662070d0dd7","doi":"10.1371/journal.pmed.0020124","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"0000002922264275f11cca7b1c3fb662070d0dd7","sha256hex":"b4728210cc0f70d8a8f8c39bd97fcbbab3eaca4309ac4bdfbce5df3b66c82f79","md5hex":"debd8db178fa08a7a0aaec6e42832a8e","size_bytes":206121,"mimetype":"application/pdf"},"cdx":{"url":"https://link.springer.com/content/pdf/10.1007%2Fs11626-008-9119-8.pdf","datetime":"20180729135948","sha1hex":"0000002922264275f11cca7b1c3fb662070d0dd7","cdx_sha1hex":null,"mimetype":"application/pdf","warc_path":"UNPAYWALL-PDF-CRAWL-2018-07-20180729132538992-15980-16048-wbgrp-svc281/UNPAYWALL-PDF-CRAWL-2018-07-20180729135708800-16009-11693~wbgrp-svc281.us.archive.org~8443.warc.gz","warc_csize":32497,"warc_offset":105265425,"row_created":"2019-08-09T23:25:44.571943+00:00"}}
+{"shadow":{"shadow_corpus":"scimag","shadow_id":"51052483","sha1hex":"00000119fa780ce368ebd96563afdb3eebb90ad3","doi":"10.1191/0266355403gh289oa","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"00000119fa780ce368ebd96563afdb3eebb90ad3","sha256hex":"57ce460db4410b9bfaf500ed652fd29e64d46b40c17e28f1156ba03736edf91b","md5hex":"96133eec3a6c533993213e7bdf446251","size_bytes":164344,"mimetype":"application/pdf"},"cdx":null}
+{"shadow":{"shadow_corpus":"scimag","shadow_id":"2476283","sha1hex":"0000017a31547caf347fab66282a40831b9ceb08","doi":"10.1016/0042-207x(62)90512-2","pmid":"54321","isbn13":null},"file_meta":{"sha1hex":"0000017a31547caf347fab66282a40831b9ceb08","sha256hex":"e8d0c607b024ff6ffd58a35f76c454844b70ad19fe3f78a573af1ae53f53ad9d","md5hex":"b53318522b9f35a42b7e53f150fe70b2","size_bytes":116735,"mimetype":"application/pdf"},"cdx":null}
+{"shadow":{"shadow_corpus":"scimag","shadow_id":"8760871","sha1hex":"000001abf3dbf936d5053d14f41699722531b8c6","doi":"10.1016/s0042-207x(79)80945-8","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"000001abf3dbf936d5053d14f41699722531b8c6","sha256hex":"8a69b4a6dff98682ad43e7d4139221c1557c1bd202b615490af8a2c7dcbb71d2","md5hex":"29e1cfac8ecfbc8be57a1ec8b465c4be","size_bytes":138218,"mimetype":"application/pdf"},"cdx":null}
+{"shadow":{"shadow_corpus":"scimag","shadow_id":"11473618","sha1hex":"0000022e387be46ef797f6686d36c9899cbd6856","doi":"10.1038/ng.2339","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"0000022e387be46ef797f6686d36c9899cbd6856","sha256hex":"a72517e8e72d78bc07a6ef7ff3a6d1d3e04325df986cb8f1bbb4e809f7a9dbdd","md5hex":"9cb8a6e056c9cc740d3bed0c50cd53dc","size_bytes":80992,"mimetype":"application/pdf"},"cdx":null}
+{"shadow":{"shadow_corpus":"scimag","shadow_id":"47301218","sha1hex":"0000029209536bda5f22e5110e573c5bd8ceb43a","doi":"10.2307/23406551","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"0000029209536bda5f22e5110e573c5bd8ceb43a","sha256hex":"315f1d39a00ccf256fa15d92a14869dbda48d31500989aaacb11368f906a5827","md5hex":"8141b42ec3bb41fa87099633a1b61d93","size_bytes":305236,"mimetype":"application/pdf"},"cdx":null}
+{"shadow":{"shadow_corpus":"scimag","shadow_id":"30603850","sha1hex":"000002c1abd521f18aa23d9e8f464e697e218ab1","doi":"10.1109/spire.1998.712983","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"000002c1abd521f18aa23d9e8f464e697e218ab1","sha256hex":"777e2c472e9d2fec3bbd26bad788562cf1e08e5850315c25cfb6e46d38e7e4af","md5hex":"3a3c92fabaf6cf437bb596d9e9255ff6","size_bytes":113768,"mimetype":"application/pdf"},"cdx":{"url":"http://proteomics.bioprojects.org/pavel/papers/SST_versus_EST_in_gene_recognition..pdf","datetime":"20081121222143","sha1hex":"000002c1abd521f18aa23d9e8f464e697e218ab1","cdx_sha1hex":null,"mimetype":"application/pdf","warc_path":"1227992340180_31-c/1227992509265_9.arc.gz","warc_csize":61212,"warc_offset":62956683,"row_created":"2020-01-07T02:06:33.965383+00:00"}}
+{"shadow":{"shadow_corpus":"scimag","shadow_id":"9311918","sha1hex":"000002d4f7d4174451e4214475d5ba59f1f6a593","doi":"10.1111/j.1439-0507.2008.01572.x","pmid":"18721331","isbn13":null},"file_meta":{"sha1hex":"000002d4f7d4174451e4214475d5ba59f1f6a593","sha256hex":"713758ce0417f604c0a4b0bf5b5eea571a9b08ca4cc81a98d602c43f42abfe37","md5hex":"0df123e6305c617ffd38ebef90b1e318","size_bytes":178664,"mimetype":"application/pdf"},"cdx":null}
+{"shadow":{"shadow_corpus":"scimag","shadow_id":"7757772","sha1hex":"000002f8966a4c5547f8a47f43661fcc3edc34ea","doi":"10.1007/s10464-011-9424-3","pmid":"21287262","isbn13":null},"file_meta":{"sha1hex":"000002f8966a4c5547f8a47f43661fcc3edc34ea","sha256hex":"ee1bce27134ae55b3d67f9b31f66571e41ac496fc3fb526dec2d53513b8f6deb","md5hex":"e72c5cf3d61635821e78ca0306c98887","size_bytes":337857,"mimetype":"application/pdf"},"cdx":null}
+{"shadow":{"shadow_corpus":"scimag","shadow_id":"74272862","sha1hex":"000003a94022be58305ccc2a018a6359eeb226db","doi":"10.1002/slct.201802783","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"000003a94022be58305ccc2a018a6359eeb226db","sha256hex":"f277eefc7b1466df814a7a892ab8e2e7f08db1faae0bf73b893211e5f5b37193","md5hex":"27534b8494f54ba5de47c16fb2590b04","size_bytes":1372272,"mimetype":"application/pdf"},"cdx":null}
diff --git a/python/tests/import_shadow.py b/python/tests/import_shadow.py
new file mode 100644
index 00000000..70a918d2
--- /dev/null
+++ b/python/tests/import_shadow.py
@@ -0,0 +1,61 @@
+
+import json
+import pytest
+from fatcat_tools.importers import ShadowLibraryImporter, JsonLinePusher
+from fixtures import api
+
+
+@pytest.fixture(scope="function")
+def shadow_importer(api):
+    """Provide a ShadowLibraryImporter built on the `api` fixture, per test function."""
+    importer = ShadowLibraryImporter(api)
+    yield importer
+
+# TODO: use API to check that entities actually created...
+def test_shadow_importer_basic(shadow_importer):
+    """Smoke test: push the example file through the importer without errors."""
+    with open('tests/files/example_shadow.json', 'r') as json_file:
+        pusher = JsonLinePusher(shadow_importer, json_file)
+        pusher.run()
+
+def test_shadow_importer(shadow_importer):
+    """
+    Import the example file with bezerk_mode enabled, check the resulting
+    editgroup, then import it again with bezerk_mode disabled and expect the
+    same files to be counted as already existing.
+    """
+    example_path = 'tests/files/example_shadow.json'
+    client = shadow_importer.api
+    start_index = client.get_changelog(limit=1)[0].index
+
+    # first pass
+    with open(example_path, 'r') as json_file:
+        shadow_importer.bezerk_mode = True
+        first_counts = JsonLinePusher(shadow_importer, json_file).run()
+    assert first_counts['insert'] == 2
+    assert first_counts['exists'] == 0
+    assert first_counts['skip'] == 8
+
+    # fetch most recent editgroup
+    editgroup = client.get_changelog_entry(index=start_index+1).editgroup
+    assert editgroup.description
+    assert "shadow library" in editgroup.description.lower()
+    assert editgroup.extra['git_rev']
+    assert "fatcat_tools.ShadowLibraryImporter" in editgroup.extra['agent']
+
+    # re-insert; should skip
+    with open(example_path, 'r') as json_file:
+        shadow_importer.reset()
+        shadow_importer.bezerk_mode = False
+        second_counts = JsonLinePusher(shadow_importer, json_file).run()
+    assert second_counts['insert'] == 0
+    assert second_counts['exists'] == 2
+    assert second_counts['skip'] == 8
+
+def test_shadow_dict_parse(shadow_importer):
+    """Parse the first example record and check the resulting file entity."""
+    with open('tests/files/example_shadow.json', 'r') as json_file:
+        first_line = json_file.readline()
+        # parse while the file is still open, as the original did
+        file_entity = shadow_importer.parse_record(json.loads(first_line))
+
+    assert file_entity.sha1 == "0000002922264275f11cca7b1c3fb662070d0dd7"
+    assert file_entity.md5 == "debd8db178fa08a7a0aaec6e42832a8e"
+    assert file_entity.sha256 == "b4728210cc0f70d8a8f8c39bd97fcbbab3eaca4309ac4bdfbce5df3b66c82f79"
+    assert file_entity.mimetype == "application/pdf"
+    assert file_entity.size == 206121
+    assert len(file_entity.urls) == 2
+    for file_url in file_entity.urls:
+        if file_url.rel == "publisher":
+            assert file_url.url.startswith("https://link.springer.com/content/pdf/10.1007%2Fs11626-008-9119-8.pdf")
+        if file_url.rel == "webarchive":
+            assert file_url.url.startswith("https://web.archive.org/")
+            assert "20180729135948" in file_url.url
+    assert len(file_entity.release_ids) == 1
+