From e59d1b617d4abd5f002d9e59b6bbaebc9ff30993 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 23 Dec 2019 17:59:10 -0800 Subject: basic shadow importer --- python/fatcat_tools/importers/__init__.py | 1 + python/fatcat_tools/importers/shadow.py | 175 ++++++++++++++++++++++++++++++ 2 files changed, 176 insertions(+) create mode 100644 python/fatcat_tools/importers/shadow.py (limited to 'python/fatcat_tools') diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py index d936605f..10557ef8 100644 --- a/python/fatcat_tools/importers/__init__.py +++ b/python/fatcat_tools/importers/__init__.py @@ -28,3 +28,4 @@ from .arabesque import ArabesqueMatchImporter, ARABESQUE_MATCH_WHERE_CLAUSE from .wayback_static import auto_wayback_static from .cdl_dash_dat import auto_cdl_dash_dat from .ingest import IngestFileResultImporter, SavePaperNowFileImporter +from .shadow import ShadowLibraryImporter diff --git a/python/fatcat_tools/importers/shadow.py b/python/fatcat_tools/importers/shadow.py new file mode 100644 index 00000000..21a18837 --- /dev/null +++ b/python/fatcat_tools/importers/shadow.py @@ -0,0 +1,175 @@ + +import sys +import json +import sqlite3 +import itertools +import fatcat_openapi_client + +from fatcat_tools.normal import * +from .common import EntityImporter, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS + + +class ShadowLibraryImporter(EntityImporter): + """ + Importer for shadow library files (matched to releases) + + Input format is JSON with keys: + - shadow + - shadow_corpus (string slug) + - shadow_id (string) + - doi + - pmid + - isbn13 + - file_meta + - sha1hex + - sha256hex + - md5hex + - size_bytes + - mimetype + - cdx (may be null) + - url + - datetime + """ + + def __init__(self, api, **kwargs): + + eg_desc = kwargs.pop('editgroup_description', None) or "Import of 'Shadow Library' file/release matches" + eg_extra = kwargs.pop('editgroup_extra', dict()) + eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ShadowLibraryImporter') + super().__init__(api, + editgroup_description=eg_desc, + editgroup_extra=eg_extra, + **kwargs) + self.default_link_rel = kwargs.get("default_link_rel", "web") + + def want(self, raw_record): + return True + + def parse_record(self, obj): + """ + We do the release lookup in this method. Try DOI, then PMID, last ISBN13. + """ + + shadow_corpus = obj['shadow']['shadow_corpus'] + assert shadow_corpus == shadow_corpus.strip().lower() + doi = clean_doi(obj['shadow'].get('doi')) + pmid = clean_pmid(obj['shadow'].get('pmid')) + isbn13 = clean_isbn13(obj['shadow'].get('isbn13')) + shadow_id = obj['shadow'].get('shadow_id').strip() + assert shadow_id + + extra = { '{}_id'.format(shadow_corpus): shadow_id } + for (ext_type, ext_id) in [('doi', doi), ('pmid', pmid), ('isbn13', isbn13)]: + if not ext_id: + continue + extra['{}_{}'.format(shadow_corpus, ext_type)] = ext_id + + # lookup release via several idents + re = None + for (ext_type, ext_id) in [('doi', doi), ('pmid', pmid), ('isbn13', isbn13)]: + if not ext_id: + continue + try: + re = self.api.lookup_release(**{ext_type: ext_id}) + except fatcat_openapi_client.rest.ApiException as err: + if err.status not in (404, 400): + raise err + re = None + if re: + break + + if not re: + self.counts['skip-release-not-found'] += 1 + return None + + release_ids = [re.ident,] + + # parse single CDX into URLs (if exists) + urls = [] + if obj.get('cdx'): + url = make_rel_url(obj['cdx']['url'], default_link_rel=self.default_link_rel) + if url != None: + urls.append(url) + wayback = "https://web.archive.org/web/{}/{}".format( + obj['cdx']['datetime'], + obj['cdx']['url']) + urls.append(("webarchive", wayback)) + urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls] + + fe = fatcat_openapi_client.FileEntity( + md5=obj['file_meta']['md5hex'], + sha1=obj['file_meta']['sha1hex'], + sha256=obj['file_meta']['sha256hex'], + size=int(obj['file_meta']['size_bytes']), + mimetype=obj['file_meta']['mimetype'] or None, + release_ids=release_ids, + urls=urls, + extra=dict(shadows=extra), + ) + return fe + + def try_update(self, fe): + # lookup sha1, or create new entity + existing = None + try: + existing = self.api.lookup_file(sha1=fe.sha1) + except fatcat_openapi_client.rest.ApiException as err: + if err.status != 404: + raise err + + if not existing: + return True + + if existing.extra.get('shadows') and list(fe.extra['shadows'].keys())[0] in existing.extra['shadows']: + # already imported from this shadow library; skip + self.counts['exists'] += 1 + return False + + # check for edit conflicts + if existing.ident in [e.ident for e in self._edits_inflight]: + self.counts['skip-update-inflight'] += 1 + return False + if fe.sha1 in [e.sha1 for e in self._edits_inflight]: + raise Exception("Inflight insert; shouldn't happen") + + # minimum viable "existing" URL cleanup to fix dupes and broken links: + # remove 'None' wayback URLs, and set archive.org rel 'archive' + existing.urls = [u for u in existing.urls if not ('://web.archive.org/web/None/' in u.url)] + for i in range(len(existing.urls)): + u = existing.urls[i] + if u.rel == 'repository' and '://archive.org/download/' in u.url: + existing.urls[i].rel = 'archive' + + # merge the existing into this one and update + existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls])) + existing.urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in existing.urls] + if not existing.extra.get('shadows'): + existing.extra['shadows'] = fe.extra['shadows'] + else: + existing.extra['shadows'].update(fe.extra['shadows']) + + # do these "plus ones" because we really want to do these updates when possible + if len(existing.urls) > SANE_MAX_URLS + 1: + self.counts['skip-update-too-many-url'] += 1 + return None + existing.release_ids = list(set(fe.release_ids + existing.release_ids)) + if len(existing.release_ids) > SANE_MAX_RELEASES + 1: + self.counts['skip-update-too-many-releases'] += 1 + return None + existing.mimetype = existing.mimetype or fe.mimetype + existing.size = existing.size or fe.size + existing.md5 = existing.md5 or fe.md5 + existing.sha1 = existing.sha1 or fe.sha1 + existing.sha256 = existing.sha256 or fe.sha256 + edit = self.api.update_file(self.get_editgroup_id(), existing.ident, existing) + self._edits_inflight.append(edit) + self.counts['update'] += 1 + return False + + def insert_batch(self, batch): + self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, + extra=self.editgroup_extra), + entity_list=batch)) + -- cgit v1.2.3 From 87029cb13d244381f915fe66e40760477edb5675 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 30 Jan 2020 12:15:09 -0800 Subject: shadow import: more filtering of file_meta fields --- python/fatcat_tools/importers/shadow.py | 10 ++++++++++ python/tests/files/example_shadow.json | 22 ++++++++++------------ python/tests/import_shadow.py | 14 ++++++++------ 3 files changed, 28 insertions(+), 18 deletions(-) (limited to 'python/fatcat_tools') diff --git a/python/fatcat_tools/importers/shadow.py b/python/fatcat_tools/importers/shadow.py index 21a18837..cfe1b1cf 100644 --- a/python/fatcat_tools/importers/shadow.py +++ b/python/fatcat_tools/importers/shadow.py @@ -43,6 +43,16 @@ class ShadowLibraryImporter(EntityImporter): self.default_link_rel = kwargs.get("default_link_rel", "web") def want(self, raw_record): + """ + Only want to import records with complete file-level metadata + """ + fm = raw_record['file_meta'] + if not (fm['mimetype'] and fm['md5hex'] and fm['sha256hex'] and fm['size_bytes']): + self.counts['skip-file-meta-incomplete'] += 1 + return False + if fm['mimetype'] != 'application/pdf': + self.counts['skip-not-pdf'] += 1 + return False return True def parse_record(self, obj): diff --git a/python/tests/files/example_shadow.json b/python/tests/files/example_shadow.json index f84a61a5..3386f481 100644 --- a/python/tests/files/example_shadow.json +++ b/python/tests/files/example_shadow.json @@ -1,12 +1,10 @@ -{"shadow":{"shadow_corpus":"scimag","shadow_id":"8149931","sha1hex":"000008bc38cb80636b647b38653fc1574936c03e","doi":"10.1371/journal.pmed.0020124","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"000008bc38cb80636b647b38653fc1574936c03e","sha256hex":"18b341119bbbf297a7dfa21aca86211da446617600baa153df70b4209c2c6e84","md5hex":"629e84885be85bc8d88345b98cffa0b0","size_bytes":39955,"mimetype":null},"cdx":{"url":"https://link.springer.com/content/pdf/10.1007%2Fs11626-008-9119-8.pdf","datetime":"20180729135948","sha1hex":"000008bc38cb80636b647b38653fc1574936c03e","cdx_sha1hex":null,"mimetype":"application/pdf","warc_path":"UNPAYWALL-PDF-CRAWL-2018-07-20180729132538992-15980-16048-wbgrp-svc281/UNPAYWALL-PDF-CRAWL-2018-07-20180729135708800-16009-11693~wbgrp-svc281.us.archive.org~8443.warc.gz","warc_csize":32497,"warc_offset":105265425,"row_created":"2019-08-09T23:25:44.571943+00:00"}} - {"shadow":{"shadow_corpus":"scimag","shadow_id":"33139096","sha1hex":"00000c4296e2c5f8f70ab265c683235fbf5e354b","doi":"10.0000/cyberleninka.ru/article/n/analiz-primeneniya-fazochastotnyh-algoritmov-proslezhivaniya-signalov-dlya-izmereniya-urovnya-zhidkosti-v-neftedobyvayuschih","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"00000c4296e2c5f8f70ab265c683235fbf5e354b","sha256hex":"99f15c58c2343f46c8cae75ff01c11b1b9e3c6d2f57189ec78df94e234b2c633","md5hex":"488681b249f6e9292bcde1fab1422550","size_bytes":182449,"mimetype":null},"cdx":{"url":"http://www.lib.tpu.ru/fulltext/v/Bulletin_TPU/2011/v319/i5/12.pdf","datetime":"20180412144307","sha1hex":"00000c4296e2c5f8f70ab265c683235fbf5e354b","cdx_sha1hex":null,"mimetype":"application/pdf","warc_path":"OA-JOURNAL-TESTCRAWL-TWO-2018-20180412133030095-00799-00808-wbgrp-svc284/OA-JOURNAL-TESTCRAWL-TWO-2018-20180412142334247-00807-23249~wbgrp-svc284.us.archive.org~8443.warc.gz","warc_csize":126165,"warc_offset":924893749,"row_created":"2019-08-09T05:16:39.785581+00:00"}} - {"shadow":{"shadow_corpus":"scimag","shadow_id":"33139096","sha1hex":"00000c4296e2c5f8f70ab265c683235fbf5e354b","doi":null,"pmid":"54321","isbn13":null},"file_meta":{"sha1hex":"00000c4296e2c5f8f70ab265c683235fbf5e354b","sha256hex":"99f15c58c2343f46c8cae75ff01c11b1b9e3c6d2f57189ec78df94e234b2c633","md5hex":"488681b249f6e9292bcde1fab1422550","size_bytes":182449,"mimetype":null},"cdx":{"url":"https://cyberleninka.ru/article/n/analiz-primeneniya-fazochastotnyh-algoritmov-proslezhivaniya-signalov-dlya-izmereniya-urovnya-zhidkosti-v-neftedobyvayuschih.pdf","datetime":"20180506175847","sha1hex":"00000c4296e2c5f8f70ab265c683235fbf5e354b","cdx_sha1hex":null,"mimetype":"application/pdf","warc_path":"OA-JOURNAL-TESTCRAWL-TWO-2018-20180506171133875-05766-05775-wbgrp-svc284/OA-JOURNAL-TESTCRAWL-TWO-2018-20180506174415763-05771-23249~wbgrp-svc284.us.archive.org~8443.warc.gz","warc_csize":126144,"warc_offset":532659301,"row_created":"2019-08-09T05:16:39.785581+00:00"}} - {"shadow":{"shadow_corpus":"scimag","shadow_id":"21389391","sha1hex":"00000d5508d7d7106560ade65c33c628c54d7c75","doi":"10.1038/nn.3419","pmid":"23727820","isbn13":null},"file_meta":{"sha1hex":"00000d5508d7d7106560ade65c33c628c54d7c75","sha256hex":"8c48dd68b974ed117f839dc88db44884e7e1df9ddef30f26c541437d7f390d96","md5hex":"c2a43160b62ef0f13256c789270ec2a9","size_bytes":1375452,"mimetype":null},"cdx":{"url":"https://www.janelia.org/sites/default/files/Library/nn.3419.pdf","datetime":"20170829032635","sha1hex":"00000d5508d7d7106560ade65c33c628c54d7c75","cdx_sha1hex":null,"mimetype":"application/pdf","warc_path":"SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170829031124939-00100-00109-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170829032404137-00107-3480~wbgrp-svc284.us.archive.org~8443.warc.gz","warc_csize":973733,"warc_offset":262621802,"row_created":"2019-08-09T05:21:39.486744+00:00"}} - {"shadow":{"shadow_corpus":"scimag","shadow_id":"16462885","sha1hex":"0000102db78329a149d3b6319f6ccf8cc90483e2","doi":"10.1016/j.cell.2007.04.022","pmid":"17482536","isbn13":null},"file_meta":{"sha1hex":"0000102db78329a149d3b6319f6ccf8cc90483e2","sha256hex":null,"md5hex":"995e7145d09d50eadccf322780e474d3","size_bytes":206812,"mimetype":"application/pdf"},"cdx":{"url":"http://publisher-connector.core.ac.uk/resourcesync/data/elsevier/pdf/212/aHR0cDovL2FwaS5lbHNldmllci5jb20vY29udGVudC9hcnRpY2xlL3BpaS9zMDA5Mjg2NzQwNzAwNTI4NA%3D%3D.pdf","datetime":"20170925031039","sha1hex":"0000102db78329a149d3b6319f6ccf8cc90483e2","cdx_sha1hex":null,"mimetype":"application/pdf","warc_path":"TARGETED-PDF-CRAWL-2017-08-04-20170925022437721-01046-01055-wbgrp-svc284/TARGETED-PDF-CRAWL-2017-08-04-20170925024811552-01048-15075~wbgrp-svc284.us.archive.org~8443.warc.gz","warc_csize":188232,"warc_offset":677858259,"row_created":"2019-08-10T03:02:21.656961+00:00"}} -{"shadow":{"shadow_corpus":"scimag","shadow_id":"12703034","sha1hex":"0000002922264275f11cca7b1c3fb662070d0dd7","doi":"10.1007/s11061-011-9281-1","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"0000002922264275f11cca7b1c3fb662070d0dd7","sha256hex":"b4728210cc0f70d8a8f8c39bd97fcbbab3eaca4309ac4bdfbce5df3b66c82f79","md5hex":"debd8db178fa08a7a0aaec6e42832a8e","size_bytes":206121,"mimetype":null},"cdx":null} - {"shadow":{"shadow_corpus":"scimag","shadow_id":"51052483","sha1hex":"00000119fa780ce368ebd96563afdb3eebb90ad3","doi":"10.1191/0266355403gh289oa","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"00000119fa780ce368ebd96563afdb3eebb90ad3","sha256hex":"57ce460db4410b9bfaf500ed652fd29e64d46b40c17e28f1156ba03736edf91b","md5hex":"96133eec3a6c533993213e7bdf446251","size_bytes":164344,"mimetype":null},"cdx":null} - {"shadow":{"shadow_corpus":"scimag","shadow_id":"2476283","sha1hex":"0000017a31547caf347fab66282a40831b9ceb08","doi":"10.1016/0042-207x(62)90512-2","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"0000017a31547caf347fab66282a40831b9ceb08","sha256hex":"e8d0c607b024ff6ffd58a35f76c454844b70ad19fe3f78a573af1ae53f53ad9d","md5hex":"b53318522b9f35a42b7e53f150fe70b2","size_bytes":116735,"mimetype":null},"cdx":null} - {"shadow":{"shadow_corpus":"scimag","shadow_id":"8760871","sha1hex":"000001abf3dbf936d5053d14f41699722531b8c6","doi":"10.1016/s0042-207x(79)80945-8","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"000001abf3dbf936d5053d14f41699722531b8c6","sha256hex":"8a69b4a6dff98682ad43e7d4139221c1557c1bd202b615490af8a2c7dcbb71d2","md5hex":"29e1cfac8ecfbc8be57a1ec8b465c4be","size_bytes":138218,"mimetype":null},"cdx":null} - {"shadow":{"shadow_corpus":"scimag","shadow_id":"11473618","sha1hex":"0000022e387be46ef797f6686d36c9899cbd6856","doi":"10.1038/ng.2339","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"0000022e387be46ef797f6686d36c9899cbd6856","sha256hex":"a72517e8e72d78bc07a6ef7ff3a6d1d3e04325df986cb8f1bbb4e809f7a9dbdd","md5hex":"9cb8a6e056c9cc740d3bed0c50cd53dc","size_bytes":80992,"mimetype":null},"cdx":null} - {"shadow":{"shadow_corpus":"scimag","shadow_id":"47301218","sha1hex":"0000029209536bda5f22e5110e573c5bd8ceb43a","doi":"10.2307/23406551","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"0000029209536bda5f22e5110e573c5bd8ceb43a","sha256hex":"315f1d39a00ccf256fa15d92a14869dbda48d31500989aaacb11368f906a5827","md5hex":"8141b42ec3bb41fa87099633a1b61d93","size_bytes":305236,"mimetype":null},"cdx":null} - {"shadow":{"shadow_corpus":"scimag","shadow_id":"30603850","sha1hex":"000002c1abd521f18aa23d9e8f464e697e218ab1","doi":"10.1109/spire.1998.712983","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"000002c1abd521f18aa23d9e8f464e697e218ab1","sha256hex":"777e2c472e9d2fec3bbd26bad788562cf1e08e5850315c25cfb6e46d38e7e4af","md5hex":"3a3c92fabaf6cf437bb596d9e9255ff6","size_bytes":113768,"mimetype":null},"cdx":null} +{"shadow":{"shadow_corpus":"scimag","shadow_id":"12703034","sha1hex":"0000002922264275f11cca7b1c3fb662070d0dd7","doi":"10.1371/journal.pmed.0020124","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"0000002922264275f11cca7b1c3fb662070d0dd7","sha256hex":"b4728210cc0f70d8a8f8c39bd97fcbbab3eaca4309ac4bdfbce5df3b66c82f79","md5hex":"debd8db178fa08a7a0aaec6e42832a8e","size_bytes":206121,"mimetype":"application/pdf"},"cdx":{"url":"https://link.springer.com/content/pdf/10.1007%2Fs11626-008-9119-8.pdf","datetime":"20180729135948","sha1hex":"0000002922264275f11cca7b1c3fb662070d0dd7","cdx_sha1hex":null,"mimetype":"application/pdf","warc_path":"UNPAYWALL-PDF-CRAWL-2018-07-20180729132538992-15980-16048-wbgrp-svc281/UNPAYWALL-PDF-CRAWL-2018-07-20180729135708800-16009-11693~wbgrp-svc281.us.archive.org~8443.warc.gz","warc_csize":32497,"warc_offset":105265425,"row_created":"2019-08-09T23:25:44.571943+00:00"}} +{"shadow":{"shadow_corpus":"scimag","shadow_id":"51052483","sha1hex":"00000119fa780ce368ebd96563afdb3eebb90ad3","doi":"10.1191/0266355403gh289oa","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"00000119fa780ce368ebd96563afdb3eebb90ad3","sha256hex":"57ce460db4410b9bfaf500ed652fd29e64d46b40c17e28f1156ba03736edf91b","md5hex":"96133eec3a6c533993213e7bdf446251","size_bytes":164344,"mimetype":"application/pdf"},"cdx":null} +{"shadow":{"shadow_corpus":"scimag","shadow_id":"2476283","sha1hex":"0000017a31547caf347fab66282a40831b9ceb08","doi":"10.1016/0042-207x(62)90512-2","pmid":"54321","isbn13":null},"file_meta":{"sha1hex":"0000017a31547caf347fab66282a40831b9ceb08","sha256hex":"e8d0c607b024ff6ffd58a35f76c454844b70ad19fe3f78a573af1ae53f53ad9d","md5hex":"b53318522b9f35a42b7e53f150fe70b2","size_bytes":116735,"mimetype":"application/pdf"},"cdx":null} +{"shadow":{"shadow_corpus":"scimag","shadow_id":"8760871","sha1hex":"000001abf3dbf936d5053d14f41699722531b8c6","doi":"10.1016/s0042-207x(79)80945-8","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"000001abf3dbf936d5053d14f41699722531b8c6","sha256hex":"8a69b4a6dff98682ad43e7d4139221c1557c1bd202b615490af8a2c7dcbb71d2","md5hex":"29e1cfac8ecfbc8be57a1ec8b465c4be","size_bytes":138218,"mimetype":"application/pdf"},"cdx":null} +{"shadow":{"shadow_corpus":"scimag","shadow_id":"11473618","sha1hex":"0000022e387be46ef797f6686d36c9899cbd6856","doi":"10.1038/ng.2339","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"0000022e387be46ef797f6686d36c9899cbd6856","sha256hex":"a72517e8e72d78bc07a6ef7ff3a6d1d3e04325df986cb8f1bbb4e809f7a9dbdd","md5hex":"9cb8a6e056c9cc740d3bed0c50cd53dc","size_bytes":80992,"mimetype":"application/pdf"},"cdx":null} +{"shadow":{"shadow_corpus":"scimag","shadow_id":"47301218","sha1hex":"0000029209536bda5f22e5110e573c5bd8ceb43a","doi":"10.2307/23406551","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"0000029209536bda5f22e5110e573c5bd8ceb43a","sha256hex":"315f1d39a00ccf256fa15d92a14869dbda48d31500989aaacb11368f906a5827","md5hex":"8141b42ec3bb41fa87099633a1b61d93","size_bytes":305236,"mimetype":"application/pdf"},"cdx":null} +{"shadow":{"shadow_corpus":"scimag","shadow_id":"30603850","sha1hex":"000002c1abd521f18aa23d9e8f464e697e218ab1","doi":"10.1109/spire.1998.712983","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"000002c1abd521f18aa23d9e8f464e697e218ab1","sha256hex":"777e2c472e9d2fec3bbd26bad788562cf1e08e5850315c25cfb6e46d38e7e4af","md5hex":"3a3c92fabaf6cf437bb596d9e9255ff6","size_bytes":113768,"mimetype":"application/pdf"},"cdx":{"url":"http://proteomics.bioprojects.org/pavel/papers/SST_versus_EST_in_gene_recognition..pdf","datetime":"20081121222143","sha1hex":"000002c1abd521f18aa23d9e8f464e697e218ab1","cdx_sha1hex":null,"mimetype":"application/pdf","warc_path":"1227992340180_31-c/1227992509265_9.arc.gz","warc_csize":61212,"warc_offset":62956683,"row_created":"2020-01-07T02:06:33.965383+00:00"}} +{"shadow":{"shadow_corpus":"scimag","shadow_id":"9311918","sha1hex":"000002d4f7d4174451e4214475d5ba59f1f6a593","doi":"10.1111/j.1439-0507.2008.01572.x","pmid":"18721331","isbn13":null},"file_meta":{"sha1hex":"000002d4f7d4174451e4214475d5ba59f1f6a593","sha256hex":"713758ce0417f604c0a4b0bf5b5eea571a9b08ca4cc81a98d602c43f42abfe37","md5hex":"0df123e6305c617ffd38ebef90b1e318","size_bytes":178664,"mimetype":"application/pdf"},"cdx":null} +{"shadow":{"shadow_corpus":"scimag","shadow_id":"7757772","sha1hex":"000002f8966a4c5547f8a47f43661fcc3edc34ea","doi":"10.1007/s10464-011-9424-3","pmid":"21287262","isbn13":null},"file_meta":{"sha1hex":"000002f8966a4c5547f8a47f43661fcc3edc34ea","sha256hex":"ee1bce27134ae55b3d67f9b31f66571e41ac496fc3fb526dec2d53513b8f6deb","md5hex":"e72c5cf3d61635821e78ca0306c98887","size_bytes":337857,"mimetype":"application/pdf"},"cdx":null} +{"shadow":{"shadow_corpus":"scimag","shadow_id":"74272862","sha1hex":"000003a94022be58305ccc2a018a6359eeb226db","doi":"10.1002/slct.201802783","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"000003a94022be58305ccc2a018a6359eeb226db","sha256hex":"f277eefc7b1466df814a7a892ab8e2e7f08db1faae0bf73b893211e5f5b37193","md5hex":"27534b8494f54ba5de47c16fb2590b04","size_bytes":1372272,"mimetype":"application/pdf"},"cdx":null} diff --git a/python/tests/import_shadow.py b/python/tests/import_shadow.py index 30e1724f..70a918d2 100644 --- a/python/tests/import_shadow.py +++ b/python/tests/import_shadow.py @@ -21,7 +21,7 @@ def test_shadow_importer(shadow_importer): counts = JsonLinePusher(shadow_importer, f).run() assert counts['insert'] == 2 assert counts['exists'] == 0 - assert counts['skip'] == 10 + assert counts['skip'] == 8 # fetch most recent editgroup change = shadow_importer.api.get_changelog_entry(index=last_index+1) @@ -38,16 +38,18 @@ def test_shadow_importer(shadow_importer): counts = JsonLinePusher(shadow_importer, f).run() assert counts['insert'] == 0 assert counts['exists'] == 2 - assert counts['skip'] == 10 + assert counts['skip'] == 8 def test_shadow_dict_parse(shadow_importer): with open('tests/files/example_shadow.json', 'r') as f: raw = json.loads(f.readline()) f = shadow_importer.parse_record(raw) - assert f.sha1 == "000008bc38cb80636b647b38653fc1574936c03e" - assert f.md5 == "629e84885be85bc8d88345b98cffa0b0" - assert f.mimetype == None # "application/pdf" - assert f.size == 39955 + + assert f.sha1 == "0000002922264275f11cca7b1c3fb662070d0dd7" + assert f.md5 == "debd8db178fa08a7a0aaec6e42832a8e" + assert f.sha256 == "b4728210cc0f70d8a8f8c39bd97fcbbab3eaca4309ac4bdfbce5df3b66c82f79" + assert f.mimetype == "application/pdf" + assert f.size == 206121 assert len(f.urls) == 2 for u in f.urls: if u.rel == "publisher": -- cgit v1.2.3 From 00754db377df53af18f9c4dddacdeb2e2c559206 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 31 Jan 2020 16:31:57 -0800 Subject: shadow import fixes from QA testing --- python/fatcat_import.py | 2 +- python/fatcat_tools/importers/shadow.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'python/fatcat_tools') diff --git a/python/fatcat_import.py b/python/fatcat_import.py index 1f026edc..843685aa 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -482,7 +482,7 @@ def main(): help="create release and file entities based on GROBID PDF metadata extraction") sub_shadow_lib.set_defaults( func=run_shadow_lib, - auth_var="FATCAT_API_AUTH_TOKEN", + auth_var="FATCAT_AUTH_WORKER_SHADOW", ) sub_shadow_lib.add_argument('json_file', help="JSON file to import from (or stdin)", diff --git a/python/fatcat_tools/importers/shadow.py b/python/fatcat_tools/importers/shadow.py index cfe1b1cf..261cf888 100644 --- a/python/fatcat_tools/importers/shadow.py +++ b/python/fatcat_tools/importers/shadow.py @@ -130,6 +130,9 @@ class ShadowLibraryImporter(EntityImporter): if not existing: return True + if not existing.extra: + existing.extra = {} + if existing.extra.get('shadows') and list(fe.extra['shadows'].keys())[0] in existing.extra['shadows']: # already imported from this shadow library; skip self.counts['exists'] += 1 @@ -172,6 +175,9 @@ class ShadowLibraryImporter(EntityImporter): existing.sha1 = existing.sha1 or fe.sha1 existing.sha256 = existing.sha256 or fe.sha256 edit = self.api.update_file(self.get_editgroup_id(), existing.ident, existing) + # add sha1 to non-entity edit row, so we can do more aggressive + # group-level de-dupe + edit.sha1 = existing.sha1 self._edits_inflight.append(edit) self.counts['update'] += 1 return False -- cgit v1.2.3 From 3011c6a088498ba566672d35aeee805c762808ba Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 31 Jan 2020 17:01:01 -0800 Subject: improve shadow import file url cleanup path Should probably be refactored out in to shared cleanup code. --- python/fatcat_tools/importers/shadow.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'python/fatcat_tools') diff --git a/python/fatcat_tools/importers/shadow.py b/python/fatcat_tools/importers/shadow.py index 261cf888..1a76299e 100644 --- a/python/fatcat_tools/importers/shadow.py +++ b/python/fatcat_tools/importers/shadow.py @@ -152,10 +152,20 @@ class ShadowLibraryImporter(EntityImporter): u = existing.urls[i] if u.rel == 'repository' and '://archive.org/download/' in u.url: existing.urls[i].rel = 'archive' + if u.rel == 'social': + u.rel = 'academicsocial' + + # new wayback URLs, could replace bad old short wayback URLs (from arabesque bug) + new_wb_urls = [u.url for u in fe.urls] + new_short_wb_urls = ['https://web.archive.org/web/{}/{}'.format( + u.split('/')[4][:12], '/'.join(u.split('/')[5:])) for u in new_wb_urls] + existing.urls = [u for u in existing.urls if not u.url in new_short_wb_urls] # merge the existing into this one and update - existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls])) - existing.urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in existing.urls] + merged_urls = {} + for u in fe.urls + existing.urls: + merged_urls[u.url] = u + existing.urls = list(merged_urls.values()) if not existing.extra.get('shadows'): existing.extra['shadows'] = fe.extra['shadows'] else: -- cgit v1.2.3 From 016d6d28c24f616897bdb7587205cfe2cc32ec89 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 14 Feb 2020 00:12:23 -0800 Subject: remove arabesque short wayback URL hack --- python/fatcat_tools/importers/shadow.py | 6 ------ 1 file changed, 6 deletions(-) (limited to 'python/fatcat_tools') diff --git a/python/fatcat_tools/importers/shadow.py b/python/fatcat_tools/importers/shadow.py index 1a76299e..4cd22775 100644 --- a/python/fatcat_tools/importers/shadow.py +++ b/python/fatcat_tools/importers/shadow.py @@ -155,12 +155,6 @@ class ShadowLibraryImporter(EntityImporter): if u.rel == 'social': u.rel = 'academicsocial' - # new wayback URLs, could replace bad old short wayback URLs (from arabesque bug) - new_wb_urls = [u.url for u in fe.urls] - new_short_wb_urls = ['https://web.archive.org/web/{}/{}'.format( - u.split('/')[4][:12], '/'.join(u.split('/')[5:])) for u in new_wb_urls] - existing.urls = [u for u in existing.urls if not u.url in new_short_wb_urls] - # merge the existing into this one and update merged_urls = {} for u in fe.urls + existing.urls: -- cgit v1.2.3