From 8b6b1447cc37fb76865fd80377c55463e59db3b9 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 5 Nov 2020 17:19:37 -0800 Subject: ingest: basic checks for ingest_type --- python/fatcat_tools/importers/ingest.py | 32 +++++++++++++++++++++++++++++--- python/tests/files/example_ingest.json | 2 +- python/tests/import_ingest.py | 6 ++++++ 3 files changed, 36 insertions(+), 4 deletions(-) (limited to 'python') diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index 4b1d3702..c88ec86a 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -20,7 +20,7 @@ class IngestFileResultImporter(EntityImporter): assert self.default_link_rel self.require_grobid = require_grobid if self.require_grobid: - print("Requiring GROBID status == 200") + print("Requiring GROBID status == 200 (for PDFs)") else: print("NOT checking GROBID success") self.ingest_request_source_whitelist = [ @@ -74,8 +74,22 @@ class IngestFileResultImporter(EntityImporter): if not row.get('file_meta'): self.counts['skip-file-meta'] += 1 return False - if self.require_grobid and row.get('grobid', {}).get('status_code') != 200: - self.counts['skip-grobid'] += 1 + + # type-specific filters + if row['request'].get('ingest_type') == 'pdf': + if self.require_grobid and row.get('grobid', {}).get('status_code') != 200: + self.counts['skip-grobid'] += 1 + return False + if row['file_meta'].get('mimetype') not in ("application/pdf",): + self.counts['skip-mimetype'] += 1 + return False + elif row['request'].get('ingest_type') == 'xml': + if row['file_meta'].get('mimetype') not in ("application/xml", + "application/jats+xml", "application/tei+xml", "text/xml"): + self.counts['skip-mimetype'] += 1 + return False + else: + self.counts['skip-ingest-type'] += 1 return False return True @@ -85,6 +99,18 @@ class IngestFileResultImporter(EntityImporter): request = row['request'] fatcat = request.get('fatcat') file_meta = row['file_meta'] + + # double check that want() filtered request correctly (eg, old requests) + if request.get('ingest_type') not in ('pdf', 'xml'): + self.counts['skip-ingest-type'] += 1 + return None + assert (request['ingest_type'], file_meta['mimetype']) in [ + ("pdf", "application/pdf"), + ("xml", "application/xml"), + ("xml", "application/jats+xml"), + ("xml", "application/tei+xml"), + ("xml", "text/xml"), + ] # identify release by fatcat ident, or extid lookup, or biblio-glutton match release_ident = None diff --git a/python/tests/files/example_ingest.json b/python/tests/files/example_ingest.json index cea67fa7..a9791587 100644 --- a/python/tests/files/example_ingest.json +++ b/python/tests/files/example_ingest.json @@ -1,2 +1,2 @@ -{"file_meta": {"sha1hex": "00242a192acc258bdfdb151943419437f440c313", "md5hex": "f4de91152c7ab9fdc2a128f962faebff", "sha256hex": "ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362", "size_bytes": 255629, "mimetype": "application/pdf"}, "request": {"ingest_request_source": "fatcat-changelog", "link_source": "doi", "link_source_id":"10.123/abc","ext_ids": {"doi": "10.123/abc"}}, "terminal": {"terminal_url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable", "terminal_dt": "20170227164644", "terminal_sha1hex": "00242a192acc258bdfdb151943419437f440c313"}, "cdx": { "datetime": "20170227164644", "url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable" }, "grobid": {"status_code": 200 }, "hit": true, 
"status": "success"} +{"file_meta": {"sha1hex": "00242a192acc258bdfdb151943419437f440c313", "md5hex": "f4de91152c7ab9fdc2a128f962faebff", "sha256hex": "ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362", "size_bytes": 255629, "mimetype": "application/pdf"}, "request": {"ingest_type": "pdf", "ingest_request_source": "fatcat-changelog", "link_source": "doi", "link_source_id":"10.123/abc","ext_ids": {"doi": "10.123/abc"}}, "terminal": {"terminal_url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable", "terminal_dt": "20170227164644", "terminal_sha1hex": "00242a192acc258bdfdb151943419437f440c313"}, "cdx": { "datetime": "20170227164644", "url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable" }, "grobid": {"status_code": 200 }, "hit": true, "status": "success"} {"request":{"ingest_type":"pdf","ingest_request_source":"fatcat-changelog","base_url":"https://doi.org/10.3917/popav.748.0017","release_stage":"published","fatcat":{"release_ident":"weeqjkvsx5abze2bhithyrx6wu","work_ident":"ujatsk25yrdw5gofubw7nogzgq"},"ext_ids":{"doi":"10.3917/popav.748.0017"},"link_source":"doi","link_source_id":"10.3917/popav.748.0017"},"hit":false,"hops":["https://doi.org/10.3917/popav.748.0017"],"status":"wayback-error","error_message":"replay fetch didn't return X-Archive-Src in headers"} diff --git a/python/tests/import_ingest.py b/python/tests/import_ingest.py index 4a46232a..05287af4 100644 --- a/python/tests/import_ingest.py +++ b/python/tests/import_ingest.py @@ -94,6 +94,12 @@ def test_ingest_dict_parse(ingest_importer): def test_ingest_dict_parse_old(ingest_importer): with open('tests/files/example_ingest.old.json', 'r') as f: raw = json.loads(f.readline()) + + # ancient ingest requests had no type; skip them + f = ingest_importer.parse_record(raw) + assert f == None + raw['request']['ingest_type'] = 'pdf' + f = ingest_importer.parse_record(raw) assert f.sha1 == "00242a192acc258bdfdb151943419437f440c313" assert f.md5 == "f4de91152c7ab9fdc2a128f962faebff" -- cgit v1.2.3 From cad812bf78f7363e698139cd7b95d7434f8ae4bb Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 5 Nov 2020 17:29:44 -0800 Subject: ingest: tests for basic XML ingest --- python/tests/files/example_ingest_xml.json | 1 + python/tests/import_ingest.py | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) create mode 100644 python/tests/files/example_ingest_xml.json (limited to 'python') diff --git a/python/tests/files/example_ingest_xml.json b/python/tests/files/example_ingest_xml.json new file mode 100644 index 00000000..ba61b183 --- /dev/null +++ b/python/tests/files/example_ingest_xml.json @@ -0,0 +1 @@ +{"cdx": {"datetime": "20200710091403", "mimetype": "text/xml", "sha1b32": "PWMQ2L4RHPJ3NVWC66GIJC36L5FXPOM6", "sha1hex": "7d990d2f913bd3b6d6c2f78c848b7e5f4b77b99e", "status_code": 200, "surt": "py,una,iics,scielo)/scieloorg/php/articlexml.php?lang=en&pid=s1683-98032015000200002", "url": "http://scielo.iics.una.py/scieloOrg/php/articleXML.php?pid=S1683-98032015000200002&lang=en", "warc_csize": 12108, "warc_offset": 94730348, "warc_path": "SCIELO-CRAWL-2020-07-20200710082036515-00773-00843-wbgrp-svc206/SCIELO-CRAWL-2020-07-20200710085423121-00779-13069~wbgrp-svc206.us.archive.org~8443.warc.gz"}, "file_meta": {"md5hex": "cda133a706ce02a07fae8bd8d2694a2a", "mimetype": "application/jats+xml", "sha1hex": "7d990d2f913bd3b6d6c2f78c848b7e5f4b77b99e", "sha256hex": "be982ca211e4debb3f93f36d9f9dc1c80f99a8809eb4c41569b2b9503c27e751", 
"size_bytes": 49242}, "hit": true, "hops": ["http://scielo.iics.una.py/scielo.php?script=sci_abstract&pid=S1683-98032015000200002&lng=en&nrm=iso&tlng=en", "http://scielo.iics.una.py/scieloOrg/php/articleXML.php?pid=S1683-98032015000200002&lang=en"], "request": {"ingest_request_source": "fatcat-changelog","base_url": "http://scielo.iics.una.py/scielo.php?script=sci_abstract&pid=S1683-98032015000200002&lng=en&nrm=iso&tlng=en", "ext_ids": {"doi": "10.123/abc"}, "fatcat": {"release_ident": null}, "ingest_type": "xml"}, "status": "success", "terminal": {"terminal_dt": "20200710091403", "terminal_sha1hex": "7d990d2f913bd3b6d6c2f78c848b7e5f4b77b99e", "terminal_status_code": 200, "terminal_url": "http://scielo.iics.una.py/scieloOrg/php/articleXML.php?pid=S1683-98032015000200002&lang=en"}, "xml_meta": {"status": "success"}} diff --git a/python/tests/import_ingest.py b/python/tests/import_ingest.py index 05287af4..21552fb9 100644 --- a/python/tests/import_ingest.py +++ b/python/tests/import_ingest.py @@ -41,6 +41,23 @@ def test_ingest_importer(ingest_importer): assert counts['exists'] == 1 assert counts['skip'] == 1 +def test_ingest_importer_xml(ingest_importer): + last_index = ingest_importer.api.get_changelog(limit=1)[0].index + with open('tests/files/example_ingest_xml.json', 'r') as f: + ingest_importer.bezerk_mode = True + counts = JsonLinePusher(ingest_importer, f).run() + assert counts['insert'] == 1 + assert counts['exists'] == 0 + assert counts['skip'] == 0 + + # fetch most recent editgroup + change = ingest_importer.api.get_changelog_entry(index=last_index+1) + eg = change.editgroup + assert eg.description + assert "crawled from web" in eg.description.lower() + assert eg.extra['git_rev'] + assert "fatcat_tools.IngestFileResultImporter" in eg.extra['agent'] + def test_ingest_importer_stage(ingest_importer, api): """ Tests that ingest importer correctly handles release stage matching -- cgit v1.2.3 From 1ed31621ae384f8b5e2a7d389347b8c97bcfefe3 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 5 Nov 2020 20:27:57 -0800 Subject: ingest: whitelist -> allowlist --- python/fatcat_import.py | 6 +++--- python/fatcat_tools/importers/ingest.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'python') diff --git a/python/fatcat_import.py b/python/fatcat_import.py index e92b3106..5dc10f0e 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -126,7 +126,7 @@ def run_arabesque_match(args): def run_ingest_file(args): ifri = IngestFileResultImporter(args.api, editgroup_description=args.editgroup_description_override, - skip_source_whitelist=args.skip_source_whitelist, + skip_source_allowlist=args.skip_source_allowlist, do_updates=args.do_updates, default_link_rel=args.default_link_rel, require_grobid=(not args.no_require_grobid), @@ -442,9 +442,9 @@ def main(): sub_ingest_file.add_argument('json_file', help="ingest_file JSON file to import from", default=sys.stdin, type=argparse.FileType('r')) - sub_ingest_file.add_argument('--skip-source-whitelist', + sub_ingest_file.add_argument('--skip-source-allowlist', action='store_true', - help="don't filter import based on request source whitelist") + help="don't filter import based on request source allowlist") sub_ingest_file.add_argument('--kafka-mode', action='store_true', help="consume from kafka topic (not stdin)") diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index c88ec86a..b6851eec 100644 --- a/python/fatcat_tools/importers/ingest.py +++ 
b/python/fatcat_tools/importers/ingest.py @@ -23,7 +23,7 @@ class IngestFileResultImporter(EntityImporter): print("Requiring GROBID status == 200 (for PDFs)") else: print("NOT checking GROBID success") - self.ingest_request_source_whitelist = [ + self.ingest_request_source_allowlist = [ 'fatcat-changelog', 'fatcat-ingest-container', 'fatcat-ingest', @@ -35,8 +35,8 @@ class IngestFileResultImporter(EntityImporter): 's2-corpus', 's2', ] - if kwargs.get('skip_source_whitelist', False): - self.ingest_request_source_whitelist = [] + if kwargs.get('skip_source_allowlist', False): + self.ingest_request_source_allowlist = [] def want(self, row): """ -- cgit v1.2.3 From 0c7dd38ed09c7a0584d079335fb3d1d53434628c Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 5 Nov 2020 20:28:54 -0800 Subject: refactor: white/black -> allow/block --- python/fatcat_tools/importers/datacite.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'python') diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 86740e80..5cdc5577 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -151,7 +151,7 @@ UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union(set(( 'Unknown', ))) -# UNKNOWN_MARKERS_LOWER are lowercase version of UNKNOWN blacklist. +# UNKNOWN_MARKERS_LOWER are lowercase version of UNKNOWN blocklist. UNKNOWN_MARKERS_LOWER = set((v.lower() for v in UNKNOWN_MARKERS)) # Any "min" number of "tokens" will signal "spam", https://fatcat.wiki/release/rzcpjwukobd4pj36ipla22cnoi @@ -346,7 +346,7 @@ class DataciteImporter(EntityImporter): print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr) return False - # check for blacklisted "spam", e.g. "FULL MOVIE" + # check for blocklisted "spam", e.g. "FULL MOVIE" for rule in DATACITE_TITLE_SPAM_WORDGROUPS: seen = set() for token in rule.get("tokens", []): @@ -819,7 +819,7 @@ class DataciteImporter(EntityImporter): contribs = [] # Names, that should be ignored right away. 
- name_blacklist = set(('Occdownload Gbif.Org',)) + name_blocklist = set(('Occdownload Gbif.Org',)) i = 0 for c in creators: @@ -861,7 +861,7 @@ class DataciteImporter(EntityImporter): continue if not name: name = "{} {}".format(given_name or '', surname or '').strip() - if name in name_blacklist: + if name in name_blocklist: continue if name.lower() in UNKNOWN_MARKERS_LOWER: continue -- cgit v1.2.3 From 931d5e450c9998177fc222b3d5b41ce16a947569 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 5 Nov 2020 22:24:58 -0800 Subject: ingest: initial 'web' worker implementation --- python/fatcat_import.py | 42 ++++ python/fatcat_tools/importers/__init__.py | 2 +- python/fatcat_tools/importers/ingest.py | 324 ++++++++++++++++++++++++------ 3 files changed, 301 insertions(+), 67 deletions(-) (limited to 'python') diff --git a/python/fatcat_import.py b/python/fatcat_import.py index 5dc10f0e..19cf43ec 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -144,6 +144,26 @@ def run_ingest_file(args): else: JsonLinePusher(ifri, args.json_file).run() +def run_ingest_web(args): + iwri = IngestWebResultImporter(args.api, + editgroup_description=args.editgroup_description_override, + skip_source_allowlist=args.skip_source_allowlist, + do_updates=args.do_updates, + default_link_rel=args.default_link_rel, + edit_batch_size=args.batch_size) + if args.kafka_mode: + KafkaJsonPusher( + iwri, + args.kafka_hosts, + args.kafka_env, + "ingest-file-results", + "fatcat-{}-ingest-web-result".format(args.kafka_env), + kafka_namespace="sandcrawler", + consume_batch_size=args.batch_size, + ).run() + else: + JsonLinePusher(iwri, args.json_file).run() + def run_savepapernow_file(args): ifri = SavePaperNowFileImporter(args.api, editgroup_description=args.editgroup_description_override, @@ -458,6 +478,28 @@ def main(): default="web", help="default URL rel for matches (eg, 'publisher', 'web')") + sub_ingest_web = subparsers.add_parser('ingest-web-results', + help="add/update web entities linked to releases based on sandcrawler ingest results") + sub_ingest_web.set_defaults( + func=run_ingest_web, + auth_var="FATCAT_AUTH_WORKER_CRAWL", + ) + sub_ingest_web.add_argument('json_file', + help="ingest_web JSON file to import from", + default=sys.stdin, type=argparse.FileType('r')) + sub_ingest_web.add_argument('--skip-source-allowlist', + action='store_true', + help="don't filter import based on request source allowlist") + sub_ingest_web.add_argument('--kafka-mode', + action='store_true', + help="consume from kafka topic (not stdin)") + sub_ingest_web.add_argument('--do-updates', + action='store_true', + help="update pre-existing web entities if new match (instead of skipping)") + sub_ingest_web.add_argument('--default-link-rel', + default="web", + help="default URL rel for matches (eg, 'publisher', 'web')") + sub_savepapernow_file = subparsers.add_parser('savepapernow-file-results', help="add file entities crawled due to async Save Paper Now request") sub_savepapernow_file.set_defaults( diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py index b82eb11a..c08e04c2 100644 --- a/python/fatcat_tools/importers/__init__.py +++ b/python/fatcat_tools/importers/__init__.py @@ -27,6 +27,6 @@ from .orcid import OrcidImporter from .arabesque import ArabesqueMatchImporter, ARABESQUE_MATCH_WHERE_CLAUSE from .wayback_static import auto_wayback_static from .cdl_dash_dat import auto_cdl_dash_dat -from .ingest import IngestFileResultImporter, SavePaperNowFileImporter +from .ingest 
import IngestFileResultImporter, SavePaperNowFileImporter, IngestWebResultImporter from .shadow import ShadowLibraryImporter from .file_meta import FileMetaImporter diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index b6851eec..2042d331 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -38,39 +38,11 @@ class IngestFileResultImporter(EntityImporter): if kwargs.get('skip_source_allowlist', False): self.ingest_request_source_allowlist = [] - def want(self, row): + def want_file(self, row) -> bool: """ - Logic here probably needs work (TODO): - - - Direct ingests via DOI from fatcat-changelog should probably go - through regardless of GROBID status - - We should filter/block things like single-page PDFs here - - public/anonymous submissions could require successful biblio-glutton - match, or some other sanity check on the fatcat side (eg, fuzzy title - match) - - handle the case of release_stage not being 'published'; if pre-print, - potentially create a new release. - - The current logic is intentionally conservative as a first step. + File-specific part of want(). Generic across general ingest and save-paper-now. """ - if row.get('hit') != True: - self.counts['skip-hit'] += 1 - return False - source = row['request'].get('ingest_request_source') - if not source: - self.counts['skip-ingest_request_source'] += 1 - return False - if self.ingest_request_source_whitelist and source not in self.ingest_request_source_whitelist: - self.counts['skip-ingest_request_source'] += 1 - return False - if source.startswith('arabesque'): - if row['request'].get('link_source') not in ('arxiv', 'pmc', 'unpaywall', 'doi', 'mag', 's2'): - self.counts['skip-arabesque-source'] += 1 - return False - if source.startswith('savepapernow'): - # never process async savepapernow requests - self.counts['skip-savepapernow'] += 1 - return False + if not row.get('file_meta'): self.counts['skip-file-meta'] += 1 return False @@ -94,25 +66,60 @@ class IngestFileResultImporter(EntityImporter): return True - def parse_record(self, row): + def want_ingest(self, row) -> bool: + """ + Sandcrawler ingest-specific part of want(). Generic across file and + webcapture ingest. + """ + if row.get('hit') != True: + self.counts['skip-hit'] += 1 + return False + source = row['request'].get('ingest_request_source') + if not source: + self.counts['skip-ingest_request_source'] += 1 + return False + if self.ingest_request_source_allowlist and source not in self.ingest_request_source_allowlist: + self.counts['skip-ingest_request_source'] += 1 + return False + + if row['request'].get('link_source') not in ('arxiv', 'pmc', 'unpaywall', 'doi', 'mag', 's2'): + self.counts['skip-link-source'] += 1 + return False + + if source.startswith('savepapernow'): + # never process async savepapernow requests + self.counts['skip-savepapernow'] += 1 + return False + + return True + + def want(self, row): + """ + Overall logic here probably needs work (TODO): + + - Direct ingests via DOI from fatcat-changelog should probably go + through regardless of GROBID status + - We should filter/block things like single-page PDFs here + - public/anonymous submissions could require successful biblio-glutton + match, or some other sanity check on the fatcat side (eg, fuzzy title + match) + - handle the case of release_stage not being 'published'; if pre-print, + potentially create a new release. + + The current logic is intentionally conservative as a first step. 
+ """ + if not self.want_file(row): + return False + if not self.want_ingest(row): + return False + + return True + + def parse_ingest_release_ident(self, row): request = row['request'] fatcat = request.get('fatcat') - file_meta = row['file_meta'] - - # double check that want() filtered request correctly (eg, old requests) - if request.get('ingest_type') not in ('pdf', 'xml'): - self.counts['skip-ingest-type'] += 1 - return None - assert (request['ingest_type'], file_meta['mimetype']) in [ - ("pdf", "application/pdf"), - ("xml", "application/xml"), - ("xml", "application/jats+xml"), - ("xml", "application/tei+xml"), - ("xml", "text/xml"), - ] - # identify release by fatcat ident, or extid lookup, or biblio-glutton match release_ident = None if fatcat and fatcat.get('release_ident'): release_ident = fatcat.get('release_ident') @@ -138,16 +145,16 @@ class IngestFileResultImporter(EntityImporter): return None release_ident = release.ident break + if self.use_glutton_match and not release_ident and row.get('grobid'): # try biblio-glutton extracted hit if row['grobid'].get('fatcat_release'): release_ident = row['grobid']['fatcat_release'].split('_')[-1] self.counts['glutton-match'] += 1 - if not release_ident: - self.counts['skip-release-not-found'] += 1 - return None + return release_ident + def parse_terminal(self, row): terminal = row.get('terminal') if not terminal: # support old cdx-only ingest results @@ -170,6 +177,10 @@ class IngestFileResultImporter(EntityImporter): terminal['terminal_dt'] = terminal['dt'] assert len(terminal['terminal_dt']) == 14 + def parse_urls(self, row, terminal): + + request = row['request'] + default_rel = self.default_link_rel if request.get('link_source') == 'doi': default_rel = 'publisher' @@ -185,6 +196,51 @@ class IngestFileResultImporter(EntityImporter): urls = [url, ("webarchive", wayback)] urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls] + return urls + + def parse_edit_extra(self, row): + + request = row['request'] + edit_extra = dict() + + if request.get('edit_extra'): + edit_extra = request['edit_extra'] + + if request.get('ingest_request_source'): + edit_extra['ingest_request_source'] = request['ingest_request_source'] + if request.get('link_source') and request.get('link_source_id'): + edit_extra['link_source'] = request['link_source'] + edit_extra['link_source_id'] = request['link_source_id'] + + return edit_extra + + def parse_record(self, row): + + request = row['request'] + fatcat = request.get('fatcat') + file_meta = row['file_meta'] + + # double check that want() filtered request correctly (eg, old requests) + if request.get('ingest_type') not in ('pdf', 'xml'): + self.counts['skip-ingest-type'] += 1 + return None + assert (request['ingest_type'], file_meta['mimetype']) in [ + ("pdf", "application/pdf"), + ("xml", "application/xml"), + ("xml", "application/jats+xml"), + ("xml", "application/tei+xml"), + ("xml", "text/xml"), + ] + + # identify release by fatcat ident, or extid lookup, or biblio-glutton match + release_ident = self.parse_ingest_release_ident(row) + + if not release_ident: + self.counts['skip-release-not-found'] += 1 + return None + + terminal = self.parse_terminal(row) + urls = self.parse_urls(row, terminal) fe = fatcat_openapi_client.FileEntity( md5=file_meta['md5hex'], @@ -195,17 +251,10 @@ class IngestFileResultImporter(EntityImporter): release_ids=[release_ident], urls=urls, ) - if request.get('edit_extra'): - fe.edit_extra = request['edit_extra'] - else: - fe.edit_extra = dict() - if 
request.get('ingest_request_source'): - fe.edit_extra['ingest_request_source'] = request['ingest_request_source'] - if request.get('link_source') and request.get('link_source_id'): - fe.edit_extra['link_source'] = request['link_source'] - fe.edit_extra['link_source_id'] = request['link_source_id'] - if not fe.edit_extra: - fe.edit_extra = None + + edit_extra = self.parse_edit_extra(row) + if edit_extra: + fe.edit_extra = edit_extra return fe def try_update(self, fe): @@ -270,6 +319,9 @@ class SavePaperNowFileImporter(IngestFileResultImporter): def want(self, row): + if not self.want_file(row): + return False + source = row['request'].get('ingest_request_source') if not source: self.counts['skip-ingest_request_source'] += 1 @@ -280,12 +332,6 @@ class SavePaperNowFileImporter(IngestFileResultImporter): if row.get('hit') != True: self.counts['skip-hit'] += 1 return False - if not row.get('file_meta'): - self.counts['skip-file-meta'] += 1 - return False - if self.require_grobid and row.get('grobid', {}).get('status_code') != 200: - self.counts['skip-grobid'] += 1 - return False return True @@ -306,3 +352,149 @@ class SavePaperNowFileImporter(IngestFileResultImporter): description=self.editgroup_description, extra=self.editgroup_extra), entity_list=batch)) + +class IngestWebResultImporter(IngestFileResultImporter): + """ + Variant of IngestFileResultImporter for processing HTML ingest requests + into webcapture objects. + """ + + def __init__(self, api, **kwargs): + + eg_desc = kwargs.pop('editgroup_description', None) or "WebCaptures crawled from web using sandcrawler ingest tool" + eg_extra = kwargs.pop('editgroup_extra', dict()) + eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestWebResultImporter') + kwargs['do_updates'] = False + super().__init__(api, + editgroup_description=eg_desc, + editgroup_extra=eg_extra, + **kwargs) + + def want(self, row): + + if not self.want_ingest(row): + return False + + if not row.get('file_meta'): + self.counts['skip-file-meta'] += 1 + return False + + # webcapture-specific filters + if row['request'].get('ingest_type') != 'html': + self.counts['skip-ingest-type'] += 1 + return False + if row['file_meta'].get('mimetype') not in ("text/html", "application/html"): + self.counts['skip-mimetype'] += 1 + return False + + return True + + + def parse_record(self, row): + """ + TODO: more of this parsing could be DRY with the file version + """ + + request = row['request'] + file_meta = row['file_meta'] + + # double check that want() filtered request correctly (eg, old requests) + if request.get('ingest_type') != "html": + self.counts['skip-ingest-type'] += 1 + return None + if file_meta['mimetype'] not in ("text/html", "application/html"): + self.counts['skip-mimetype'] += 1 + return None + + # identify release by fatcat ident, or extid lookup + release_ident = self.parse_ingest_release_ident(row) + + if not release_ident: + self.counts['skip-release-not-found'] += 1 + return None + + terminal = self.parse_terminal(row) + urls = self.parse_urls(row, terminal) + archive_urls = [u for u in urls if u['rel'] == 'webarchive'] + + if terminal['terminal_status_code'] != 200: + self.counts['skip-terminal-status-code'] += 1 + return None + + terminal_cdx = row['cdx'] + if 'revisit_cdx' in row: + terminal_cdx = row['revisit_cdx'] + assert terminal_cdx['surt'] + assert terminal_cdx['url'] == terminal['terminal_url'] + + wc_cdx = [] + # primary resource first + wc_cdx.append(fatcat_openapi_client.WebcaptureCdxLine( + surt=terminal['terminal_surt'], # XXX: from CDX? 
+ timestamp=terminal['terminal_dt'], # as an ISO datetime + url=terminal['terminal_url'], + mimetype=file_meta['mimetype'], + status_code=terminal['terminal_status_code'], + sha1=file_meta['sha1hex'], + sha256=file_meta['sha256hex'], + size=file_meta['size_bytes'], + )) + + for resource in row.get('html_resources', []): + wc_cdx.append(fatcat_openapi_client.WebcaptureCdxLine( + surt=resource['surt'], + timestamp=resource['timestamp'], + url=resource['url'], + mimetype=resource.get('mimetype'), + size=resource.get('size_bytes'), + sha1=resource.get('sha1hex'), + sha256=resource.get('sha256hex'), + )) + + wc = fatcat_openapi_client.WebCaptureEntity( + cdx=wc_cdx, + archive_urls=archive_urls, + original_url=terminal['terminal_url'], + timestamp=terminal['terminal_dt'], + release_ids=[release_ident], + urls=urls, + ) + + edit_extra = self.parse_edit_extra(row) + + if edit_extra: + wc.edit_extra = edit_extra + return wc + + + def try_update(self, wc): + + # check for existing edits-in-progress with same file hash + for other in self._entity_queue: + if other.sha1 == wc.sha1: + self.counts['skip-in-queue'] += 1 + return False + + # lookup sha1, or create new entity + existing = None + # XXX: lookup *release* instead; skip if any existing web capture entities + # XXX: only one release per webcapture + try: + existing = self.api.lookup_file(sha1=wc.sha1) + except fatcat_openapi_client.rest.ApiException as err: + if err.status != 404: + raise err + + if not existing: + return True + else: + # TODO: for now, never update + self.counts['skip-update-disabled'] += 1 + return False + + def insert_batch(self, batch): + self.api.create_webcapture_auto_batch(fatcat_openapi_client.WebCaptureAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, + extra=self.editgroup_extra), + entity_list=batch)) -- cgit v1.2.3 From e16672c4c21e17c2d2c653e7d480f4ba671771fb Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 5 Nov 2020 23:04:24 -0800 Subject: ingest: progress on HTML ingest --- python/fatcat_tools/importers/ingest.py | 44 +++++++++++++++++++--------- python/tests/files/example_ingest_html.json | 1 + python/tests/import_ingest.py | 45 +++++++++++++++++++++++++++-- 3 files changed, 74 insertions(+), 16 deletions(-) create mode 100644 python/tests/files/example_ingest_html.json (limited to 'python') diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index 2042d331..2965f229 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -1,4 +1,6 @@ +import datetime + import fatcat_openapi_client from .common import EntityImporter, make_rel_url @@ -160,8 +162,6 @@ class IngestFileResultImporter(EntityImporter): # support old cdx-only ingest results cdx = row.get('cdx') if not cdx: - # TODO: support archive.org hits? 
- self.counts['skip-no-terminal'] += 1 return None else: terminal = { @@ -175,7 +175,11 @@ class IngestFileResultImporter(EntityImporter): terminal['terminal_url'] = terminal['url'] if not 'terminal_dt' in terminal: terminal['terminal_dt'] = terminal['dt'] + + # convert CDX-style digits to ISO-style timestamp assert len(terminal['terminal_dt']) == 14 + terminal['terminal_timestamp'] = datetime.datetime.strptime(terminal['terminal_dt'], "%Y%m%d%H%M%S").isoformat() + "Z" + return terminal def parse_urls(self, row, terminal): @@ -240,6 +244,11 @@ class IngestFileResultImporter(EntityImporter): return None terminal = self.parse_terminal(row) + if not terminal: + # TODO: support archive.org hits? + self.counts['skip-no-terminal'] += 1 + return None + urls = self.parse_urls(row, terminal) fe = fatcat_openapi_client.FileEntity( @@ -353,6 +362,7 @@ class SavePaperNowFileImporter(IngestFileResultImporter): extra=self.editgroup_extra), entity_list=batch)) + class IngestWebResultImporter(IngestFileResultImporter): """ Variant of IngestFileResultImporter for processing HTML ingest requests @@ -361,7 +371,7 @@ class IngestWebResultImporter(IngestFileResultImporter): def __init__(self, api, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "WebCaptures crawled from web using sandcrawler ingest tool" + eg_desc = kwargs.pop('editgroup_description', None) or "Webcaptures crawled from web using sandcrawler ingest tool" eg_extra = kwargs.pop('editgroup_extra', dict()) eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestWebResultImporter') kwargs['do_updates'] = False @@ -391,9 +401,6 @@ class IngestWebResultImporter(IngestFileResultImporter): def parse_record(self, row): - """ - TODO: more of this parsing could be DRY with the file version - """ request = row['request'] file_meta = row['file_meta'] @@ -414,8 +421,13 @@ class IngestWebResultImporter(IngestFileResultImporter): return None terminal = self.parse_terminal(row) + if not terminal: + # TODO: support archive.org hits? + self.counts['skip-no-terminal'] += 1 + return None + urls = self.parse_urls(row, terminal) - archive_urls = [u for u in urls if u['rel'] == 'webarchive'] + archive_urls = [u for u in urls if u.rel == 'webarchive'] if terminal['terminal_status_code'] != 200: self.counts['skip-terminal-status-code'] += 1 @@ -430,8 +442,10 @@ class IngestWebResultImporter(IngestFileResultImporter): wc_cdx = [] # primary resource first wc_cdx.append(fatcat_openapi_client.WebcaptureCdxLine( - surt=terminal['terminal_surt'], # XXX: from CDX? - timestamp=terminal['terminal_dt'], # as an ISO datetime + # XXX + #surt=terminal['terminal_surt'], # XXX: from CDX? 
+ surt=terminal['terminal_url'], + timestamp=terminal['terminal_timestamp'], url=terminal['terminal_url'], mimetype=file_meta['mimetype'], status_code=terminal['terminal_status_code'], @@ -441,9 +455,12 @@ class IngestWebResultImporter(IngestFileResultImporter): )) for resource in row.get('html_resources', []): + timestamp = resource['timestamp'] + if not "+" in timestamp and not "Z" in timestamp: + timestamp += "Z" wc_cdx.append(fatcat_openapi_client.WebcaptureCdxLine( surt=resource['surt'], - timestamp=resource['timestamp'], + timestamp=timestamp, url=resource['url'], mimetype=resource.get('mimetype'), size=resource.get('size_bytes'), @@ -451,13 +468,12 @@ class IngestWebResultImporter(IngestFileResultImporter): sha256=resource.get('sha256hex'), )) - wc = fatcat_openapi_client.WebCaptureEntity( + wc = fatcat_openapi_client.WebcaptureEntity( cdx=wc_cdx, archive_urls=archive_urls, original_url=terminal['terminal_url'], - timestamp=terminal['terminal_dt'], + timestamp=terminal['terminal_timestamp'], release_ids=[release_ident], - urls=urls, ) edit_extra = self.parse_edit_extra(row) @@ -493,7 +509,7 @@ class IngestWebResultImporter(IngestFileResultImporter): return False def insert_batch(self, batch): - self.api.create_webcapture_auto_batch(fatcat_openapi_client.WebCaptureAutoBatch( + self.api.create_webcapture_auto_batch(fatcat_openapi_client.WebcaptureAutoBatch( editgroup=fatcat_openapi_client.Editgroup( description=self.editgroup_description, extra=self.editgroup_extra), diff --git a/python/tests/files/example_ingest_html.json b/python/tests/files/example_ingest_html.json new file mode 100644 index 00000000..6c646814 --- /dev/null +++ b/python/tests/files/example_ingest_html.json @@ -0,0 +1 @@ +{"cdx": {"datetime": "20200708025309", "mimetype": "text/html", "sha1b32": "THJFFZJR2VYN2FAR7X7LHFGRU2X5IC2U", "sha1hex": "99d252e531d570dd1411fdfeb394d1a6afd40b54", "status_code": 200, "surt": "py,una,iics,scielo)/scielo.php?lng=en&nrm=iso&pid=s1683-98032015000200002&script=sci_arttext&tlng=es", "url": "http://scielo.iics.una.py/scielo.php?script=sci_arttext&pid=S1683-98032015000200002&lng=en&nrm=iso&tlng=es", "warc_csize": 13123, "warc_offset": 77579308, "warc_path": "SCIELO-CRAWL-2020-07-20200707211940442-00279-00347-wbgrp-svc206/SCIELO-CRAWL-2020-07-20200708024511243-00332-13069~wbgrp-svc206.us.archive.org~8443.warc.gz"}, "file_meta": {"md5hex": "515a61845a2f898438e3986e4506da8f", "mimetype": "text/html", "sha1hex": "99d252e531d570dd1411fdfeb394d1a6afd40b54", "sha256hex": "c4559d548476a325891461b71c796beee717e820d6a00cb8411176ce83a0f23f", "size_bytes": 47442}, "hit": true, "hops": ["http://scielo.iics.una.py/scielo.php?script=sci_abstract&pid=S1683-98032015000200002&lng=en&nrm=iso&tlng=en", "http://scielo.iics.una.py/scielo.php?script=sci_arttext&pid=S1683-98032015000200002&lng=en&nrm=iso&tlng=es"], "html_biblio": {"container_issn": "1683-9803", "container_name": "Pediatr\u00eda (Asunci\u00f3n)", "contrib_names": ["Ruiz Valiente, Syntia Carolina", "Ruiz Ca\u00f1ete, Manuel", "Cohene Velazquez, Bartola"], "doi": "10.18004/ped.2015.agosto.102-107", "first_page": "102", "html_fulltext_url": "http://scielo.iics.una.py/scielo.php?script=sci_arttext&pid=S1683-98032015000200002&lng=en&nrm=iso&tlng=es", "issue": "2", "last_page": "107", "pdf_fulltext_url": "http://scielo.iics.una.py/pdf/ped/v42n2/v42n2a02.pdf", "publisher": "Sociedad Paraguaya de Pediatr\u00eda", "release_date": "2015-08-06", "title": "Prevalence of malnutrition and eating habits in children under 5 years of age in indigenous 
communities in Azote'y and Yby Yau, 2011", "volume": "42", "xml_fulltext_url": "http://scielo.iics.una.py/scieloOrg/php/articleXML.php?pid=S1683-98032015000200002&lang=en"}, "html_body": {"agent": "trafilatura/0.5.1", "status": "success", "word_count": 3500}, "html_resources": [{"mimetype": "image/gif", "resource_type": "image", "sha1hex": "4991aa771874daf8cba79be38d18d534f946b5d6", "sha256hex": "5e76fad755b873a439dd5e775684696c547008d45cc901606132e9a1ed970757", "size": 220, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/alpha.gif", "timestamp": "2020-10-31T14:07:30", "url": "http://scielo.iics.una.py/img/en/alpha.gif"}, {"mimetype": "text/plain", "resource_type": "script", "sha1hex": "fd28e342fa1b40b84cc17dc66d22df3bf260170b", "sha256hex": "9cf2e81dd65d5a64200970bbd1cd9497b46b2af232e2fbfb79fef95b070f23d1", "size": 3653, "status_code": 200, "surt": "py,una,iics,scielo)/applications/scielo-org/js/toolbox.js", "timestamp": "2020-10-31T20:14:35", "url": "http://scielo.iics.una.py/applications/scielo-org/js/toolbox.js"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "c5ea6229ce6a97f2dc2b2e2c8ffac26400dfcd58", "sha256hex": "7fb3d59ea14ab060c2b6cbdd5e63d57e158d6cc9e613ceb05ab1e6ec60d64995", "size": 382, "status_code": 200, "surt": "py,una,iics,scielo)/img/common/iconpermalink.gif", "timestamp": "2020-10-31T20:14:52", "url": "http://scielo.iics.una.py/img/common/iconPermalink.gif"}, {"mimetype": "image/jpeg", "resource_type": "image", "sha1hex": "fbd3488e6b8cd241605fa2db14ba15e0f037d3a7", "sha256hex": "5492829967d521386bec4323f0d7ef951e9a0b16caa1bcd8e75576dc41bd3b55", "size": 26759, "status_code": 200, "surt": "py,una,iics,scielo)/img/revistas/ped/v42n2/2a02f1.jpg", "timestamp": "2020-07-08T02:53:11", "url": "http://scielo.iics.una.py/img/revistas/ped/v42n2/2a02f1.jpg"}, {"mimetype": "image/jpeg", "resource_type": "image", "sha1hex": "9f1833948223109dfaca2c37fbdbacb81002a346", "sha256hex": "f5b08a2022fce73ae04c3b9fe368645a084132a942bb29950bba705ed89e6d91", "size": 35440, "status_code": 200, "surt": "py,una,iics,scielo)/img/revistas/ped/v42n2/2a02t1.jpg", "timestamp": "2020-07-08T02:53:18", "url": "http://scielo.iics.una.py/img/revistas/ped/v42n2/2a02t1.jpg"}, {"mimetype": "image/png", "resource_type": "image", "sha1hex": "0d2d329000cba763e5eec45bd8ee2743393ebd62", "sha256hex": "d964eed5974264b8f107a905b74796cb3d5e60f78da1c500bb547a419538915e", "size": 3091, "status_code": 200, "surt": "py,una,iics,scielo)/img/common/icon-close.png", "timestamp": "2020-10-24T10:17:13", "url": "http://scielo.iics.una.py/img/common/icon-close.png"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "5812dc64389992d7d59d10e57449407778bbd0c0", "sha256hex": "605ce931ded871d924f31765c6bbf778eb8b5194b3396f49638a88331f53dc21", "size": 652, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/iconxmldocument.gif", "timestamp": "2020-10-24T14:58:51", "url": "http://scielo.iics.una.py/img/en/iconXMLDocument.gif"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "f20584095b9c7d06250140bf7f51f7bd91e2ba08", "sha256hex": "f8292c0c25d5eec546fe16e8a53101b4933adb2e75e58d7335158dc94b2bae91", "size": 239, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/artsrc.gif", "timestamp": "2020-10-29T17:34:52", "url": "http://scielo.iics.una.py/img/en/artsrc.gif"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "09fe461f38958a267695edf5675f668323f754ec", "sha256hex": "d0792cfc52df6414126a541e8cd32ba151d75f87225c63d38a9ddad389b913b3", "size": 229, "status_code": 
200, "surt": "py,una,iics,scielo)/img/en/subject.gif", "timestamp": "2020-10-28T12:59:36", "url": "http://scielo.iics.una.py/img/en/subject.gif"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "7b2f78593847928d8f0f8a2068b0cb366501c3e5", "sha256hex": "97dfc989c7af7a0139950696e533fe71c373539091200edba96f151efb045f8d", "size": 181, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/grp1c.gif", "timestamp": "2020-10-24T10:17:25", "url": "http://scielo.iics.una.py/img/en/grp1c.gif"}, {"mimetype": "image/jpeg", "resource_type": "image", "sha1hex": "a1e6d8818d56678a52a18859b0cf919b8663a5aa", "sha256hex": "2d34923f1bb8e417a4c244ba5be13b7fe52e0dc6dba9dbcdf512a9fb3cb84d91", "size": 27383, "status_code": 200, "surt": "py,una,iics,scielo)/img/revistas/ped/v42n2/2a02f2.jpg", "timestamp": "2020-07-08T02:53:13", "url": "http://scielo.iics.una.py/img/revistas/ped/v42n2/2a02f2.jpg"}, {"mimetype": "text/plain", "resource_type": "stylesheet", "sha1hex": "3754bfd4a8608ec125c79ccc7b62ead02c323bbc", "sha256hex": "4dc9b9edd3fc1e58d7a1c39c64551ac07530bedf0721323fc2c820a90a7b4a64", "size": 87, "status_code": 200, "surt": "py,una,iics,scielo)/css/screen.css", "timestamp": "2020-10-24T11:51:22", "url": "http://scielo.iics.una.py/css/screen.css"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "818ff217eae41fe796f21e9b56336011d8806de0", "sha256hex": "a3853400c16b0628dd226487d1ad7710f44a2e6ea8de85f2b2a6a34b7334d5b6", "size": 210, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/search.gif", "timestamp": "2020-10-28T12:59:22", "url": "http://scielo.iics.una.py/img/en/search.gif"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "a6300c0530bdc13b1bf75a7f380cef6c1be48cc7", "sha256hex": "aa7fa5a5bedea888ddbb89f20838207eb303323c98c414452f632f96acaccbfe", "size": 660, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/iconemail.gif", "timestamp": "2020-11-01T12:16:11", "url": "http://scielo.iics.una.py/img/en/iconEmail.gif"}, {"mimetype": "image/png", "resource_type": "image", "sha1hex": "100a6b57582fbf383f96c289c92fbbc9aaa63f06", "sha256hex": "f43d4d35e7ac1e815dc0c8897806e30d928ee62e1aa6ac20f49c649f8b694004", "size": 430, "status_code": 200, "surt": "net,licensebuttons)/l/by/4.0/80x15.png", "timestamp": "2020-07-08T21:51:45", "url": "https://licensebuttons.net/l/by/4.0/80x15.png"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "dbfce64d671bbb03591a297983c81ede279b051d", "sha256hex": "2ef85ef9dd7926099287dd33ab43fc6819b393446e85ddb754897ad457c56282", "size": 244, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/prev.gif", "timestamp": "2020-10-29T01:22:04", "url": "http://scielo.iics.una.py/img/en/prev.gif"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "bf5bed62fc6cc82a8a7e862fceac9ce8ffb12cb8", "sha256hex": "1a90de599f61e3191fec24d504798c372b72ccbc511c3c44d48070a0dddefe25", "size": 262, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/iconrelatedoff.gif", "timestamp": "2020-10-28T12:59:51", "url": "http://scielo.iics.una.py/img/en/iconRelatedOff.gif"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "4a59b2b1d57a210252311d563eea138afcc7a886", "sha256hex": "f96585d38fb34040d9bd81e83538a7beade916bc2d4456e75d5911181281cb6f", "size": 586, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/e-mailt.gif", "timestamp": "2020-10-31T20:53:28", "url": "http://scielo.iics.una.py/img/en/e-mailt.gif"}, {"mimetype": "image/jpeg", "resource_type": "image", "sha1hex": 
"d465c5da9dea2a6e10b5d340c5af6af6cc10f3ec", "sha256hex": "ffc1411a8185c8df5ca9c0725fbfab41380706d213ca42c8552f32323b67901d", "size": 33992, "status_code": 200, "surt": "py,una,iics,scielo)/img/revistas/ped/v42n2/2a02f3.jpg", "timestamp": "2020-07-08T02:53:19", "url": "http://scielo.iics.una.py/img/revistas/ped/v42n2/2a02f3.jpg"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "86dbd3881975bc15fef15536e5cbd54bad53271c", "sha256hex": "75cbc76c44915b46c6c44fdeeeabd1bfab774ecf692d37ed8d1b4674f5ee583d", "size": 628, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/iconpdfdocument.gif", "timestamp": "2020-10-24T11:51:12", "url": "http://scielo.iics.una.py/img/en/iconPDFDocument.gif"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "080333f92aa899e53bcd30a4e734d9b36d7ac7a4", "sha256hex": "e6834ac24d48ec9d75b178de59964eb9fb66e9cff05b439ad247f5af5d5fc1ff", "size": 374, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/iconreferences.gif", "timestamp": "2020-10-29T05:40:05", "url": "http://scielo.iics.una.py/img/en/iconReferences.gif"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "fe879762fd80c756df0af9c81e3424d651fa1b6a", "sha256hex": "4333f6c0ccd89f3240b6c8bb9b2c109792da6d0513e618c35033e2474981b55d", "size": 578, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/icontranslation.gif", "timestamp": "2020-10-28T22:59:05", "url": "http://scielo.iics.una.py/img/en/iconTranslation.gif"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "8846563b9722db2f3b832c03ad2ee9b6318c1d0e", "sha256hex": "6843f628c71f39631ec5d501f6b62506ae9f8454c0a3cd957f4dc67985c371bb", "size": 219, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/author.gif", "timestamp": "2020-10-24T10:16:50", "url": "http://scielo.iics.una.py/img/en/author.gif"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "db35a7e4171a130d25632d6e1ba9c3806eec1e87", "sha256hex": "bd6496501a92a6ed3c5e8c16ce0af4ac9b4cece3562934010d0878f6ea06ead0", "size": 288, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/iconcitedoff.gif", "timestamp": "2020-10-29T17:34:57", "url": "http://scielo.iics.una.py/img/en/iconCitedOff.gif"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "0d844cf48e3ed3849d7e2deed30fb2e7318107b0", "sha256hex": "f257802855722fed0b2b6936a9aede3a4869fc347e91860a26cc81c1ba9df3a3", "size": 164, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/toc.gif", "timestamp": "2020-10-28T22:59:18", "url": "http://scielo.iics.una.py/img/en/toc.gif"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "33e2d3699619eb6dac7c91c207c748599def84f0", "sha256hex": "d8af84c5c4c10e724a081409b0f0e50eb08b9c2cd3d3e0ee0b33cc9eaa20086c", "size": 193, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/next.gif", "timestamp": "2020-11-01T12:16:02", "url": "http://scielo.iics.una.py/img/en/next.gif"}, {"mimetype": "text/html", "resource_type": "script", "sha1hex": "731cf720a546953efe311566a2d874fae715bfc6", "sha256hex": "0d3602fb417d811e15e1a7bd6725384e5bda874dab0eb7be7ee59cd26d64dbd1", "size": 8231, "status_code": 200, "surt": "py,una,iics,scielo)/article.js", "timestamp": "2020-10-29T05:40:31", "url": "http://scielo.iics.una.py/article.js"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "756779e5ff89d107b2eb4843cc47dd4b63efc829", "sha256hex": "b52a6dc8cbbf4212790cf57af7489b9dc21c040ae7372c07bb6aa18473098759", "size": 190, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/home.gif", "timestamp": 
"2020-07-06T21:30:58", "url": "http://scielo.iics.una.py/img/en/home.gif"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "e541555caec87c313c9c1859c4b5913b31f955c5", "sha256hex": "64eeb2c0e97f96d9144aa83d027fb5b9d57d96c74681611c39af99d49b148c6e", "size": 643, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/fulltxt.gif", "timestamp": "2020-10-29T22:47:53", "url": "http://scielo.iics.una.py/img/en/fulltxt.gif"}, {"mimetype": "text/plain", "resource_type": "script", "sha1hex": "65cbff4e9d95d47a6f31d96ab4ea361c1f538a7b", "sha256hex": "e23a2a4e2d7c2b41ebcdd8ffc0679df7140eb7f52e1eebabf827a88182643c59", "size": 72174, "status_code": 200, "surt": "py,una,iics,scielo)/applications/scielo-org/js/jquery-1.4.2.min.js", "timestamp": "2020-07-06T21:17:01", "url": "http://scielo.iics.una.py/applications/scielo-org/js/jquery-1.4.2.min.js"}, {"mimetype": "image/gif", "resource_type": "image", "sha1hex": "20368dd206a00eaf8bb117f98291a30eb0cc8e73", "sha256hex": "534434f1716e29928e0376d0e5dc113808c96d9cedab8675adff7dbf22cb9fd1", "size": 1353, "status_code": 200, "surt": "py,una,iics,scielo)/img/en/fbpelogp.gif", "timestamp": "2020-07-06T21:16:38", "url": "http://scielo.iics.una.py/img/en/fbpelogp.gif"}], "request": {"link_source": "doi", "ingest_request_source": "fatcat-changelog", "base_url": "http://scielo.iics.una.py/scielo.php?script=sci_abstract&pid=S1683-98032015000200002&lng=en&nrm=iso&tlng=en", "ext_ids": {"doi": "10.123/abc"}, "fatcat": {"release_ident": null}, "ingest_type": "html"}, "scope": "article-fulltext", "status": "success", "terminal": {"terminal_dt": "20200708025309", "terminal_sha1hex": "99d252e531d570dd1411fdfeb394d1a6afd40b54", "terminal_status_code": 200, "terminal_url": "http://scielo.iics.una.py/scielo.php?script=sci_arttext&pid=S1683-98032015000200002&lng=en&nrm=iso&tlng=es"}} diff --git a/python/tests/import_ingest.py b/python/tests/import_ingest.py index 21552fb9..92539f1a 100644 --- a/python/tests/import_ingest.py +++ b/python/tests/import_ingest.py @@ -2,7 +2,7 @@ import json import pytest -from fatcat_tools.importers import IngestFileResultImporter, JsonLinePusher +from fatcat_tools.importers import IngestFileResultImporter, IngestWebResultImporter, JsonLinePusher from fixtures import * @@ -10,6 +10,10 @@ from fixtures import * def ingest_importer(api): yield IngestFileResultImporter(api) +@pytest.fixture(scope="function") +def ingest_web_importer(api): + yield IngestWebResultImporter(api) + # TODO: use API to check that entities actually created... 
def test_ingest_importer_basic(ingest_importer): with open('tests/files/example_ingest.json', 'r') as f: @@ -46,6 +50,7 @@ def test_ingest_importer_xml(ingest_importer): with open('tests/files/example_ingest_xml.json', 'r') as f: ingest_importer.bezerk_mode = True counts = JsonLinePusher(ingest_importer, f).run() + print(counts) assert counts['insert'] == 1 assert counts['exists'] == 0 assert counts['skip'] == 0 @@ -58,6 +63,42 @@ def test_ingest_importer_xml(ingest_importer): assert eg.extra['git_rev'] assert "fatcat_tools.IngestFileResultImporter" in eg.extra['agent'] + # re-import should skip + with open('tests/files/example_ingest_xml.json', 'r') as f: + ingest_importer.reset() + ingest_importer.bezerk_mode = False + counts = JsonLinePusher(ingest_importer, f).run() + assert counts['insert'] == 0 + assert counts['exists'] == 1 + assert counts['skip'] == 0 + +def test_ingest_importer_web(ingest_web_importer): + last_index = ingest_web_importer.api.get_changelog(limit=1)[0].index + with open('tests/files/example_ingest_html.json', 'r') as f: + ingest_web_importer.bezerk_mode = True + counts = JsonLinePusher(ingest_web_importer, f).run() + print(counts) + assert counts['insert'] == 1 + assert counts['exists'] == 0 + assert counts['skip'] == 0 + + # fetch most recent editgroup + change = ingest_web_importer.api.get_changelog_entry(index=last_index+1) + eg = change.editgroup + assert eg.description + assert "crawled from web" in eg.description.lower() + assert eg.extra['git_rev'] + assert "fatcat_tools.IngestWebResultImporter" in eg.extra['agent'] + + # re-import should skip + with open('tests/files/example_ingest_html.json', 'r') as f: + ingest_web_importer.reset() + ingest_web_importer.bezerk_mode = False + counts = JsonLinePusher(ingest_web_importer, f).run() + assert counts['insert'] == 0 + assert counts['exists'] == 1 + assert counts['skip'] == 0 + def test_ingest_importer_stage(ingest_importer, api): """ Tests that ingest importer correctly handles release stage matching @@ -74,7 +115,7 @@ def test_ingest_importer_stage(ingest_importer, api): with open('tests/files/example_ingest.json', 'r') as f: raw = json.loads(f.readline()) for row in test_table: - print(row) + #print(row) # set dummy record stage eg = quick_eg(api) -- cgit v1.2.3 From 013ee4d4ea51ce2c348ed051777fb2d0c18fe903 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 5 Nov 2020 23:05:27 -0800 Subject: ingest: fix XML ingest test file --- python/tests/files/example_ingest_xml.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'python') diff --git a/python/tests/files/example_ingest_xml.json b/python/tests/files/example_ingest_xml.json index ba61b183..2f525998 100644 --- a/python/tests/files/example_ingest_xml.json +++ b/python/tests/files/example_ingest_xml.json @@ -1 +1 @@ -{"cdx": {"datetime": "20200710091403", "mimetype": "text/xml", "sha1b32": "PWMQ2L4RHPJ3NVWC66GIJC36L5FXPOM6", "sha1hex": "7d990d2f913bd3b6d6c2f78c848b7e5f4b77b99e", "status_code": 200, "surt": "py,una,iics,scielo)/scieloorg/php/articlexml.php?lang=en&pid=s1683-98032015000200002", "url": "http://scielo.iics.una.py/scieloOrg/php/articleXML.php?pid=S1683-98032015000200002&lang=en", "warc_csize": 12108, "warc_offset": 94730348, "warc_path": "SCIELO-CRAWL-2020-07-20200710082036515-00773-00843-wbgrp-svc206/SCIELO-CRAWL-2020-07-20200710085423121-00779-13069~wbgrp-svc206.us.archive.org~8443.warc.gz"}, "file_meta": {"md5hex": "cda133a706ce02a07fae8bd8d2694a2a", "mimetype": "application/jats+xml", "sha1hex": 
"7d990d2f913bd3b6d6c2f78c848b7e5f4b77b99e", "sha256hex": "be982ca211e4debb3f93f36d9f9dc1c80f99a8809eb4c41569b2b9503c27e751", "size_bytes": 49242}, "hit": true, "hops": ["http://scielo.iics.una.py/scielo.php?script=sci_abstract&pid=S1683-98032015000200002&lng=en&nrm=iso&tlng=en", "http://scielo.iics.una.py/scieloOrg/php/articleXML.php?pid=S1683-98032015000200002&lang=en"], "request": {"ingest_request_source": "fatcat-changelog","base_url": "http://scielo.iics.una.py/scielo.php?script=sci_abstract&pid=S1683-98032015000200002&lng=en&nrm=iso&tlng=en", "ext_ids": {"doi": "10.123/abc"}, "fatcat": {"release_ident": null}, "ingest_type": "xml"}, "status": "success", "terminal": {"terminal_dt": "20200710091403", "terminal_sha1hex": "7d990d2f913bd3b6d6c2f78c848b7e5f4b77b99e", "terminal_status_code": 200, "terminal_url": "http://scielo.iics.una.py/scieloOrg/php/articleXML.php?pid=S1683-98032015000200002&lang=en"}, "xml_meta": {"status": "success"}} +{"cdx": {"datetime": "20200710091403", "mimetype": "text/xml", "sha1b32": "PWMQ2L4RHPJ3NVWC66GIJC36L5FXPOM6", "sha1hex": "7d990d2f913bd3b6d6c2f78c848b7e5f4b77b99e", "status_code": 200, "surt": "py,una,iics,scielo)/scieloorg/php/articlexml.php?lang=en&pid=s1683-98032015000200002", "url": "http://scielo.iics.una.py/scieloOrg/php/articleXML.php?pid=S1683-98032015000200002&lang=en", "warc_csize": 12108, "warc_offset": 94730348, "warc_path": "SCIELO-CRAWL-2020-07-20200710082036515-00773-00843-wbgrp-svc206/SCIELO-CRAWL-2020-07-20200710085423121-00779-13069~wbgrp-svc206.us.archive.org~8443.warc.gz"}, "file_meta": {"md5hex": "cda133a706ce02a07fae8bd8d2694a2a", "mimetype": "application/jats+xml", "sha1hex": "7d990d2f913bd3b6d6c2f78c848b7e5f4b77b99e", "sha256hex": "be982ca211e4debb3f93f36d9f9dc1c80f99a8809eb4c41569b2b9503c27e751", "size_bytes": 49242}, "hit": true, "hops": ["http://scielo.iics.una.py/scielo.php?script=sci_abstract&pid=S1683-98032015000200002&lng=en&nrm=iso&tlng=en", "http://scielo.iics.una.py/scieloOrg/php/articleXML.php?pid=S1683-98032015000200002&lang=en"], "request": {"link_source": "doi", "ingest_request_source": "fatcat-changelog","base_url": "http://scielo.iics.una.py/scielo.php?script=sci_abstract&pid=S1683-98032015000200002&lng=en&nrm=iso&tlng=en", "ext_ids": {"doi": "10.123/abc"}, "fatcat": {"release_ident": null}, "ingest_type": "xml"}, "status": "success", "terminal": {"terminal_dt": "20200710091403", "terminal_sha1hex": "7d990d2f913bd3b6d6c2f78c848b7e5f4b77b99e", "terminal_status_code": 200, "terminal_url": "http://scielo.iics.una.py/scieloOrg/php/articleXML.php?pid=S1683-98032015000200002&lang=en"}, "xml_meta": {"status": "success"}} -- cgit v1.2.3 From f32ff2bd5ab1dba1dc3108b75b28ce4090d9c00f Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 6 Nov 2020 18:40:33 -0800 Subject: html ingest: remaining implementation --- python/fatcat_tools/importers/ingest.py | 41 +++++++++++++++------------------ 1 file changed, 19 insertions(+), 22 deletions(-) (limited to 'python') diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index 2965f229..4dcb1ec3 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -221,7 +221,6 @@ class IngestFileResultImporter(EntityImporter): def parse_record(self, row): request = row['request'] - fatcat = request.get('fatcat') file_meta = row['file_meta'] # double check that want() filtered request correctly (eg, old requests) @@ -399,7 +398,6 @@ class IngestWebResultImporter(IngestFileResultImporter): return True - def 
parse_record(self, row): request = row['request'] @@ -442,9 +440,7 @@ class IngestWebResultImporter(IngestFileResultImporter): wc_cdx = [] # primary resource first wc_cdx.append(fatcat_openapi_client.WebcaptureCdxLine( - # XXX - #surt=terminal['terminal_surt'], # XXX: from CDX? - surt=terminal['terminal_url'], + surt=terminal_cdx['surt'], timestamp=terminal['terminal_timestamp'], url=terminal['terminal_url'], mimetype=file_meta['mimetype'], @@ -463,7 +459,7 @@ class IngestWebResultImporter(IngestFileResultImporter): timestamp=timestamp, url=resource['url'], mimetype=resource.get('mimetype'), - size=resource.get('size_bytes'), + size=resource.get('size'), sha1=resource.get('sha1hex'), sha256=resource.get('sha256hex'), )) @@ -482,7 +478,6 @@ class IngestWebResultImporter(IngestFileResultImporter): wc.edit_extra = edit_extra return wc - def try_update(self, wc): # check for existing edits-in-progress with same file hash @@ -491,23 +486,25 @@ class IngestWebResultImporter(IngestFileResultImporter): self.counts['skip-in-queue'] += 1 return False - # lookup sha1, or create new entity - existing = None - # XXX: lookup *release* instead; skip if any existing web capture entities - # XXX: only one release per webcapture - try: - existing = self.api.lookup_file(sha1=wc.sha1) - except fatcat_openapi_client.rest.ApiException as err: - if err.status != 404: - raise err - - if not existing: - return True - else: - # TODO: for now, never update - self.counts['skip-update-disabled'] += 1 + # lookup sha1, or create new entity (TODO: API doesn't support this yet) + #existing = None + + # TODO: currently only allow one release per webcapture + release = self.api.get_release(wc.release_ids[0], expand="webcaptures") + if release.webcaptures: + # check if this is an existing match, or just a similar hit + for other in release.webcaptures: + if wc.original_url == other.original_url: + # TODO: compare very similar timestamps of same time (different formats) + self.counts['exists'] += 1 + return False + self.counts['skip-release-has-webcapture'] += 1 return False + # TODO: for now, never update + self.counts['skip-update-disabled'] += 1 + return False + def insert_batch(self, batch): self.api.create_webcapture_auto_batch(fatcat_openapi_client.WebcaptureAutoBatch( editgroup=fatcat_openapi_client.Editgroup( -- cgit v1.2.3 From b1b34d44ce1a416ee70be665b71b99ba9f98d9a3 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 6 Nov 2020 19:16:31 -0800 Subject: ingest tool: support for setting ingest type --- python/fatcat_ingest.py | 4 ++++ python/fatcat_tools/transforms/ingest.py | 12 ++++++------ 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'python') diff --git a/python/fatcat_ingest.py b/python/fatcat_ingest.py index 68676ad2..b9d71a7c 100755 --- a/python/fatcat_ingest.py +++ b/python/fatcat_ingest.py @@ -87,6 +87,7 @@ def _run_search_dump(args, search): ingest_request = release_ingest_request( release, ingest_request_source="fatcat-ingest", + ingest_type=args.ingest_type, ) if not ingest_request: continue @@ -214,6 +215,9 @@ def main(): parser.add_argument('--force-recrawl', action='store_true', help="Tell ingest worker to skip GWB history lookup and do SPNv2 crawl") + parser.add_argument('--ingest-type', + default="pdf", + help="What medium to ingest (pdf, xml, html)") subparsers = parser.add_subparsers() sub_container = subparsers.add_parser('container', diff --git a/python/fatcat_tools/transforms/ingest.py b/python/fatcat_tools/transforms/ingest.py index 2f4e2271..59831017 100644 --- 
a/python/fatcat_tools/transforms/ingest.py +++ b/python/fatcat_tools/transforms/ingest.py @@ -15,15 +15,19 @@ def release_ingest_request(release, ingest_request_source='fatcat', ingest_type= if release.state != 'active': return None + # TODO: infer ingest type based on release_type or container metadata? + if not ingest_type: + ingest_type = 'pdf' + # generate a URL where we expect to find fulltext url = None link_source = None link_source_id = None - if release.ext_ids.arxiv: + if release.ext_ids.arxiv and ingest_type == "pdf": url = "https://arxiv.org/pdf/{}.pdf".format(release.ext_ids.arxiv) link_source = "arxiv" link_source_id = release.ext_ids.arxiv - elif release.ext_ids.pmcid: + elif release.ext_ids.pmcid and ingest_type == "pdf": # TODO: how to tell if an author manuscript in PMC vs. published? #url = "https://www.ncbi.nlm.nih.gov/pmc/articles/{}/pdf/".format(release.ext_ids.pmcid) url = "http://europepmc.org/backend/ptpmcrender.fcgi?accid={}&blobtype=pdf".format(release.ext_ids.pmcid) @@ -40,10 +44,6 @@ def release_ingest_request(release, ingest_request_source='fatcat', ingest_type= ext_ids = release.ext_ids.to_dict() ext_ids = dict([(k, v) for (k, v) in ext_ids.items() if v]) - # TODO: infer ingest type based on release_type or container metadata? - if not ingest_type: - ingest_type = 'pdf' - ingest_request = { 'ingest_type': ingest_type, 'ingest_request_source': ingest_request_source, -- cgit v1.2.3 From a73b73c2944b3df2a62886c4e6b69c93f5e74222 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 16 Nov 2020 17:50:06 -0800 Subject: html ingest: actual xhtml mimetype --- python/fatcat_tools/importers/ingest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'python') diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index 4dcb1ec3..4fbd19f1 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -392,7 +392,7 @@ class IngestWebResultImporter(IngestFileResultImporter): if row['request'].get('ingest_type') != 'html': self.counts['skip-ingest-type'] += 1 return False - if row['file_meta'].get('mimetype') not in ("text/html", "application/html"): + if row['file_meta'].get('mimetype') not in ("text/html", "application/xhtml+xml"): self.counts['skip-mimetype'] += 1 return False @@ -407,7 +407,7 @@ class IngestWebResultImporter(IngestFileResultImporter): if request.get('ingest_type') != "html": self.counts['skip-ingest-type'] += 1 return None - if file_meta['mimetype'] not in ("text/html", "application/html"): + if file_meta['mimetype'] not in ("text/html", "application/xhtml+xml"): self.counts['skip-mimetype'] += 1 return None -- cgit v1.2.3
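
The type-specific filters introduced in the first patch reduce to a lookup from ingest_type to an allowlist of acceptable mimetypes, matching the (type, mimetype) pairs asserted in parse_record(). A minimal standalone sketch of that check (the ACCEPTED_MIMETYPES and check_mimetype names are illustrative, not part of the fatcat codebase):

    ACCEPTED_MIMETYPES = {
        "pdf": ("application/pdf",),
        "xml": ("application/xml", "application/jats+xml",
                "application/tei+xml", "text/xml"),
    }

    def check_mimetype(ingest_type: str, mimetype: str) -> bool:
        """True if a file's mimetype is plausible for the given ingest type."""
        allowed = ACCEPTED_MIMETYPES.get(ingest_type)
        if allowed is None:
            # unknown or missing ingest_type (eg, ancient requests): skip
            return False
        return mimetype in allowed

The final patch in the series widens the equivalent HTML-side allowlist to ("text/html", "application/xhtml+xml").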
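
parse_edit_extra(), factored out in the 'web' worker patch, threads crawl provenance from the ingest request into the entity edit metadata. A condensed sketch of that merge, where request is the parsed 'request' dict from an ingest result (as in the JSON fixtures above); build_edit_extra is a hypothetical name:

    def build_edit_extra(request: dict) -> dict:
        """Collect provenance fields from an ingest request for edit_extra."""
        edit_extra = dict(request.get('edit_extra') or {})
        if request.get('ingest_request_source'):
            edit_extra['ingest_request_source'] = request['ingest_request_source']
        if request.get('link_source') and request.get('link_source_id'):
            edit_extra['link_source'] = request['link_source']
            edit_extra['link_source_id'] = request['link_source_id']
        return edit_extra

In the importer the result is attached as fe.edit_extra (or wc.edit_extra) only when non-empty.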
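
The webcapture schema wants ISO 8601 timestamps, while CDX rows carry 14-digit YYYYMMDDHHMMSS strings, so the 'progress on HTML ingest' patch converts in parse_terminal() and pads bare HTML-resource timestamps with a trailing "Z". The same two conversions as standalone helpers (UTC is assumed throughout, as in the patch):

    import datetime

    def cdx_to_iso(dt: str) -> str:
        """Convert a 14-digit CDX datetime like '20200710091403' to ISO 8601."""
        assert len(dt) == 14
        return datetime.datetime.strptime(dt, "%Y%m%d%H%M%S").isoformat() + "Z"

    def normalize_resource_timestamp(ts: str) -> str:
        """Ensure an ISO-ish resource timestamp carries a timezone marker."""
        if "+" not in ts and "Z" not in ts:
            ts += "Z"
        return ts

    # cdx_to_iso("20200710091403") == "2020-07-10T09:14:03Z"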
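
try_update() for webcaptures enforces at most one webcapture per release: the release is fetched with expand="webcaptures", an existing capture with the same original_url counts as 'exists', any other existing capture blocks the insert, and updates are never attempted. A condensed sketch of that decision, returning the counter label instead of bumping self.counts (release is the expanded release entity, wc the candidate WebcaptureEntity):

    def webcapture_decision(release, wc) -> str:
        """Classify a candidate webcapture against a release's existing ones."""
        for other in (release.webcaptures or []):
            if wc.original_url == other.original_url:
                # same capture already attached; timestamps may differ in format
                return "exists"
        if release.webcaptures:
            return "skip-release-has-webcapture"
        return "insert"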
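
On the request-generation side, the fatcat_ingest.py patch gates the extid-based URL shortcuts on ingest_type == "pdf", so XML and HTML requests fall through to the base resolver URL. A trimmed sketch of the selection order; the final DOI branch is not visible in the hunk above and is assumed from the doi.org base_url values in the fixtures:

    def fulltext_url(release, ingest_type: str):
        """Guess a fulltext URL for a release, specialized by ingest type."""
        ext_ids = release.ext_ids
        if ext_ids.arxiv and ingest_type == "pdf":
            return "https://arxiv.org/pdf/{}.pdf".format(ext_ids.arxiv)
        if ext_ids.pmcid and ingest_type == "pdf":
            return ("http://europepmc.org/backend/ptpmcrender.fcgi"
                    "?accid={}&blobtype=pdf".format(ext_ids.pmcid))
        if ext_ids.doi:
            return "https://doi.org/{}".format(ext_ids.doi)
        return None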
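
Finally, the tests above double as a usage example: either importer can be driven programmatically over a JSON-lines file of ingest results. A minimal sketch, assuming api is an already-authenticated fatcat API client instance:

    from fatcat_tools.importers import IngestWebResultImporter, JsonLinePusher

    # api: an authenticated fatcat_openapi_client API instance (assumed)
    importer = IngestWebResultImporter(api)
    with open('ingest_results.json', 'r') as f:
        counts = JsonLinePusher(importer, f).run()
    # counts is a dict of outcome tallies: 'insert', 'exists', 'skip-*', etc.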