diff options
Diffstat (limited to 'python/fatcat_tools/importers/ingest.py')
-rw-r--r-- | python/fatcat_tools/importers/ingest.py | 693 |
1 files changed, 374 insertions, 319 deletions
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index f0943c1e..e0a6c3f5 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -1,4 +1,3 @@ - import datetime import fatcat_openapi_client @@ -7,17 +6,16 @@ from .common import EntityImporter, make_rel_url class IngestFileResultImporter(EntityImporter): - def __init__(self, api, require_grobid=True, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "Files crawled from web using sandcrawler ingest tool" - eg_extra = kwargs.pop('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestFileResultImporter') - kwargs['do_updates'] = kwargs.get("do_updates", False) - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Files crawled from web using sandcrawler ingest tool" + ) + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.IngestFileResultImporter") + kwargs["do_updates"] = kwargs.get("do_updates", False) + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) self.use_glutton_match = False self.default_link_rel = kwargs.get("default_link_rel", "web") assert self.default_link_rel @@ -27,20 +25,20 @@ class IngestFileResultImporter(EntityImporter): else: print("NOT checking GROBID success") self.ingest_request_source_allowlist = [ - 'fatcat-changelog', - 'fatcat-ingest-container', - 'fatcat-ingest', - 'arabesque', + "fatcat-changelog", + "fatcat-ingest-container", + "fatcat-ingest", + "arabesque", #'mag-corpus', #'mag', - 'unpaywall-corpus', - 'unpaywall', + "unpaywall-corpus", + "unpaywall", #'s2-corpus', #'s2', - 'doaj', - 'dblp', + "doaj", + "dblp", ] - if kwargs.get('skip_source_allowlist', False): + if kwargs.get("skip_source_allowlist", False): self.ingest_request_source_allowlist = [] def want_file(self, row) -> bool: @@ -48,28 +46,32 @@ class IngestFileResultImporter(EntityImporter): File-specific part of want(). Generic across general ingest and save-paper-now. """ - if not row.get('file_meta'): - self.counts['skip-file-meta'] += 1 + if not row.get("file_meta"): + self.counts["skip-file-meta"] += 1 return False # type-specific filters - if row['request'].get('ingest_type') == 'pdf': - if self.require_grobid and row.get('grobid', {}).get('status_code') != 200: - self.counts['skip-grobid'] += 1 + if row["request"].get("ingest_type") == "pdf": + if self.require_grobid and row.get("grobid", {}).get("status_code") != 200: + self.counts["skip-grobid"] += 1 return False - if row['file_meta'].get('mimetype') not in ("application/pdf",): - self.counts['skip-mimetype'] += 1 + if row["file_meta"].get("mimetype") not in ("application/pdf",): + self.counts["skip-mimetype"] += 1 return False - elif row['request'].get('ingest_type') == 'xml': - if row['file_meta'].get('mimetype') not in ("application/xml", - "application/jats+xml", "application/tei+xml", "text/xml"): - self.counts['skip-mimetype'] += 1 + elif row["request"].get("ingest_type") == "xml": + if row["file_meta"].get("mimetype") not in ( + "application/xml", + "application/jats+xml", + "application/tei+xml", + "text/xml", + ): + self.counts["skip-mimetype"] += 1 return False - elif row['request'].get('ingest_type') in ['component', 'src', 'dataset-file']: + elif row["request"].get("ingest_type") in ["component", "src", "dataset-file"]: # we rely on sandcrawler for these checks pass else: - self.counts['skip-ingest-type'] += 1 + self.counts["skip-ingest-type"] += 1 return False return True @@ -79,24 +81,36 @@ class IngestFileResultImporter(EntityImporter): Sandcrawler ingest-specific part of want(). Generic across file and webcapture ingest. """ - if row.get('hit') is not True: - self.counts['skip-hit'] += 1 + if row.get("hit") is not True: + self.counts["skip-hit"] += 1 return False - source = row['request'].get('ingest_request_source') + source = row["request"].get("ingest_request_source") if not source: - self.counts['skip-ingest_request_source'] += 1 + self.counts["skip-ingest_request_source"] += 1 return False - if self.ingest_request_source_allowlist and source not in self.ingest_request_source_allowlist: - self.counts['skip-ingest_request_source'] += 1 + if ( + self.ingest_request_source_allowlist + and source not in self.ingest_request_source_allowlist + ): + self.counts["skip-ingest_request_source"] += 1 return False - if row['request'].get('link_source') not in ('arxiv', 'pmc', 'unpaywall', 'doi', 'mag', 's2', 'doaj', 'dblp'): - self.counts['skip-link-source'] += 1 + if row["request"].get("link_source") not in ( + "arxiv", + "pmc", + "unpaywall", + "doi", + "mag", + "s2", + "doaj", + "dblp", + ): + self.counts["skip-link-source"] += 1 return False - if source.startswith('savepapernow'): + if source.startswith("savepapernow"): # never process async savepapernow requests - self.counts['skip-savepapernow'] += 1 + self.counts["skip-savepapernow"] += 1 return False return True @@ -125,19 +139,19 @@ class IngestFileResultImporter(EntityImporter): def parse_ingest_release_ident(self, row): - request = row['request'] - fatcat = request.get('fatcat') + request = row["request"] + fatcat = request.get("fatcat") release_ident = None - if fatcat and fatcat.get('release_ident'): - release_ident = fatcat.get('release_ident') - elif request.get('ext_ids'): + if fatcat and fatcat.get("release_ident"): + release_ident = fatcat.get("release_ident") + elif request.get("ext_ids"): # if no fatcat ident, try extids - for extid_type in ('doi', 'pmid', 'pmcid', 'arxiv', 'doaj', 'dblp'): - extid = request['ext_ids'].get(extid_type) + for extid_type in ("doi", "pmid", "pmcid", "arxiv", "doaj", "dblp"): + extid = request["ext_ids"].get(extid_type) if not extid: continue - if extid_type == 'doi': + if extid_type == "doi": extid = extid.lower() try: release = self.api.lookup_release(**{extid_type: extid}) @@ -145,66 +159,69 @@ class IngestFileResultImporter(EntityImporter): if err.status == 404: continue elif err.status == 400: - self.counts['warn-extid-invalid'] += 1 + self.counts["warn-extid-invalid"] += 1 continue raise err # verify release_stage - if request.get('release_stage') and release.release_stage: - if request['release_stage'] != release.release_stage: - self.counts['skip-release-stage'] += 1 + if request.get("release_stage") and release.release_stage: + if request["release_stage"] != release.release_stage: + self.counts["skip-release-stage"] += 1 return None release_ident = release.ident break - if self.use_glutton_match and not release_ident and row.get('grobid'): + if self.use_glutton_match and not release_ident and row.get("grobid"): # try biblio-glutton extracted hit - if row['grobid'].get('fatcat_release'): - release_ident = row['grobid']['fatcat_release'].split('_')[-1] - self.counts['glutton-match'] += 1 + if row["grobid"].get("fatcat_release"): + release_ident = row["grobid"]["fatcat_release"].split("_")[-1] + self.counts["glutton-match"] += 1 return release_ident def parse_terminal(self, row): - terminal = row.get('terminal') + terminal = row.get("terminal") if not terminal: # support old cdx-only ingest results - cdx = row.get('cdx') + cdx = row.get("cdx") if not cdx: return None else: terminal = { - 'terminal_url': cdx['url'], - 'terminal_dt': cdx['datetime'], - 'terminal_status_code': cdx.get('status_code') or cdx.get('http_status'), + "terminal_url": cdx["url"], + "terminal_dt": cdx["datetime"], + "terminal_status_code": cdx.get("status_code") or cdx.get("http_status"), } # work around old schema - if 'terminal_url' not in terminal: - terminal['terminal_url'] = terminal['url'] - if 'terminal_dt' not in terminal: - terminal['terminal_dt'] = terminal['dt'] + if "terminal_url" not in terminal: + terminal["terminal_url"] = terminal["url"] + if "terminal_dt" not in terminal: + terminal["terminal_dt"] = terminal["dt"] # convert CDX-style digits to ISO-style timestamp - assert len(terminal['terminal_dt']) == 14 - terminal['terminal_timestamp'] = datetime.datetime.strptime(terminal['terminal_dt'], "%Y%m%d%H%M%S").isoformat() + "Z" + assert len(terminal["terminal_dt"]) == 14 + terminal["terminal_timestamp"] = ( + datetime.datetime.strptime(terminal["terminal_dt"], "%Y%m%d%H%M%S").isoformat() + + "Z" + ) return terminal def parse_urls(self, row, terminal): - request = row['request'] + request = row["request"] default_rel = self.default_link_rel - if request.get('link_source') == 'doi': - default_rel = 'publisher' - default_rel = request.get('rel', default_rel) - url = make_rel_url(terminal['terminal_url'], default_rel) + if request.get("link_source") == "doi": + default_rel = "publisher" + default_rel = request.get("rel", default_rel) + url = make_rel_url(terminal["terminal_url"], default_rel) if not url: - self.counts['skip-url'] += 1 + self.counts["skip-url"] += 1 return None wayback = "https://web.archive.org/web/{}/{}".format( - terminal['terminal_dt'], - terminal['terminal_url']) + terminal["terminal_dt"], terminal["terminal_url"] + ) urls = [url, ("webarchive", wayback)] urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls] @@ -212,38 +229,38 @@ class IngestFileResultImporter(EntityImporter): def parse_edit_extra(self, row): - request = row['request'] + request = row["request"] edit_extra = dict() - if request.get('edit_extra'): - edit_extra = request['edit_extra'] + if request.get("edit_extra"): + edit_extra = request["edit_extra"] - if request.get('ingest_request_source'): - edit_extra['ingest_request_source'] = request['ingest_request_source'] - if request.get('link_source') and request.get('link_source_id'): - edit_extra['link_source'] = request['link_source'] - edit_extra['link_source_id'] = request['link_source_id'] - if edit_extra['link_source'] == 'doi': - edit_extra['link_source_id'] = edit_extra['link_source_id'].lower() + if request.get("ingest_request_source"): + edit_extra["ingest_request_source"] = request["ingest_request_source"] + if request.get("link_source") and request.get("link_source_id"): + edit_extra["link_source"] = request["link_source"] + edit_extra["link_source_id"] = request["link_source_id"] + if edit_extra["link_source"] == "doi": + edit_extra["link_source_id"] = edit_extra["link_source_id"].lower() # GROBID metadata, for SPN requests (when there might not be 'success') - if request.get('ingest_type') == 'pdf': - if row.get('grobid') and row['grobid'].get('status') != 'success': - edit_extra['grobid_status_code'] = row['grobid']['status_code'] - edit_extra['grobid_version'] = row['grobid'].get('grobid_version') + if request.get("ingest_type") == "pdf": + if row.get("grobid") and row["grobid"].get("status") != "success": + edit_extra["grobid_status_code"] = row["grobid"]["status_code"] + edit_extra["grobid_version"] = row["grobid"].get("grobid_version") return edit_extra def parse_record(self, row): - request = row['request'] - file_meta = row['file_meta'] + request = row["request"] + file_meta = row["file_meta"] # double check that want() filtered request correctly (eg, old requests) - if request.get('ingest_type') not in ('pdf', 'xml'): - self.counts['skip-ingest-type'] += 1 + if request.get("ingest_type") not in ("pdf", "xml"): + self.counts["skip-ingest-type"] += 1 return None - assert (request['ingest_type'], file_meta['mimetype']) in [ + assert (request["ingest_type"], file_meta["mimetype"]) in [ ("pdf", "application/pdf"), ("xml", "application/xml"), ("xml", "application/jats+xml"), @@ -255,23 +272,23 @@ class IngestFileResultImporter(EntityImporter): release_ident = self.parse_ingest_release_ident(row) if not release_ident: - self.counts['skip-release-not-found'] += 1 + self.counts["skip-release-not-found"] += 1 return None terminal = self.parse_terminal(row) if not terminal: # TODO: support archive.org hits? - self.counts['skip-no-terminal'] += 1 + self.counts["skip-no-terminal"] += 1 return None urls = self.parse_urls(row, terminal) fe = fatcat_openapi_client.FileEntity( - md5=file_meta['md5hex'], - sha1=file_meta['sha1hex'], - sha256=file_meta['sha256hex'], - size=file_meta['size_bytes'], - mimetype=file_meta['mimetype'], + md5=file_meta["md5hex"], + sha1=file_meta["sha1hex"], + sha256=file_meta["sha256hex"], + size=file_meta["size_bytes"], + mimetype=file_meta["mimetype"], release_ids=[release_ident], urls=urls, ) @@ -293,7 +310,7 @@ class IngestFileResultImporter(EntityImporter): # check for existing edits-in-progress with same file hash for other in self._entity_queue: if other.sha1 == fe.sha1: - self.counts['skip-in-queue'] += 1 + self.counts["skip-in-queue"] += 1 return False if not existing: @@ -302,31 +319,36 @@ class IngestFileResultImporter(EntityImporter): # NOTE: the following checks all assume there is an existing item if (fe.release_ids[0] in existing.release_ids) and existing.urls: # TODO: could still, in theory update with the new URL? - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False if not self.do_updates: - self.counts['skip-update-disabled'] += 1 + self.counts["skip-update-disabled"] += 1 return False # TODO: for now, never update - self.counts['skip-update-disabled'] += 1 + self.counts["skip-update-disabled"] += 1 return False def insert_batch(self, batch): if self.submit_mode: - eg = self.api.create_editgroup(fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra)) + eg = self.api.create_editgroup( + fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ) + ) for fe in batch: self.api.create_file(eg.editgroup_id, fe) self.api.update_editgroup(eg.editgroup_id, eg, submit=True) else: - self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_file_auto_batch( + fatcat_openapi_client.FileAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) class SavePaperNowFileImporter(IngestFileResultImporter): @@ -338,29 +360,29 @@ class SavePaperNowFileImporter(IngestFileResultImporter): def __init__(self, api, submit_mode=True, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "Files crawled after a public 'Save Paper Now' request" - eg_extra = kwargs.pop('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.SavePaperNowFileImporter') - kwargs['submit_mode'] = submit_mode - kwargs['require_grobid'] = False - kwargs['do_updates'] = False - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Files crawled after a public 'Save Paper Now' request" + ) + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.SavePaperNowFileImporter") + kwargs["submit_mode"] = submit_mode + kwargs["require_grobid"] = False + kwargs["do_updates"] = False + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) def want(self, row): - source = row['request'].get('ingest_request_source') + source = row["request"].get("ingest_request_source") if not source: - self.counts['skip-ingest_request_source'] += 1 + self.counts["skip-ingest_request_source"] += 1 return False - if not source.startswith('savepapernow'): - self.counts['skip-not-savepapernow'] += 1 + if not source.startswith("savepapernow"): + self.counts["skip-not-savepapernow"] += 1 return False - if row.get('hit') is not True: - self.counts['skip-hit'] += 1 + if row.get("hit") is not True: + self.counts["skip-hit"] += 1 return False if not self.want_file(row): @@ -377,14 +399,14 @@ class IngestWebResultImporter(IngestFileResultImporter): def __init__(self, api, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "Webcaptures crawled from web using sandcrawler ingest tool" - eg_extra = kwargs.pop('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestWebResultImporter') - kwargs['do_updates'] = False - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Webcaptures crawled from web using sandcrawler ingest tool" + ) + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.IngestWebResultImporter") + kwargs["do_updates"] = False + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) def want(self, row): @@ -392,91 +414,95 @@ class IngestWebResultImporter(IngestFileResultImporter): return False # webcapture-specific filters - if row['request'].get('ingest_type') != 'html': - self.counts['skip-ingest-type'] += 1 + if row["request"].get("ingest_type") != "html": + self.counts["skip-ingest-type"] += 1 return False - if not row.get('file_meta'): - self.counts['skip-file-meta'] += 1 + if not row.get("file_meta"): + self.counts["skip-file-meta"] += 1 return False - if row['file_meta'].get('mimetype') not in ("text/html", "application/xhtml+xml"): - self.counts['skip-mimetype'] += 1 + if row["file_meta"].get("mimetype") not in ("text/html", "application/xhtml+xml"): + self.counts["skip-mimetype"] += 1 return False return True def parse_record(self, row): - request = row['request'] - file_meta = row['file_meta'] + request = row["request"] + file_meta = row["file_meta"] # double check that want() filtered request correctly (eg, old requests) - if request.get('ingest_type') != "html": - self.counts['skip-ingest-type'] += 1 + if request.get("ingest_type") != "html": + self.counts["skip-ingest-type"] += 1 return None - if file_meta['mimetype'] not in ("text/html", "application/xhtml+xml"): - self.counts['skip-mimetype'] += 1 + if file_meta["mimetype"] not in ("text/html", "application/xhtml+xml"): + self.counts["skip-mimetype"] += 1 return None # identify release by fatcat ident, or extid lookup release_ident = self.parse_ingest_release_ident(row) if not release_ident: - self.counts['skip-release-not-found'] += 1 + self.counts["skip-release-not-found"] += 1 return None terminal = self.parse_terminal(row) if not terminal: # TODO: support archive.org hits? - self.counts['skip-no-terminal'] += 1 + self.counts["skip-no-terminal"] += 1 return None urls = self.parse_urls(row, terminal) - archive_urls = [u for u in urls if u.rel == 'webarchive'] + archive_urls = [u for u in urls if u.rel == "webarchive"] - if terminal['terminal_status_code'] != 200: - self.counts['skip-terminal-status-code'] += 1 + if terminal["terminal_status_code"] != 200: + self.counts["skip-terminal-status-code"] += 1 return None - terminal_cdx = row['cdx'] - if 'revisit_cdx' in row: - terminal_cdx = row['revisit_cdx'] - assert terminal_cdx['surt'] - if terminal_cdx['url'] != terminal['terminal_url']: - self.counts['skip-terminal-url-mismatch'] += 1 + terminal_cdx = row["cdx"] + if "revisit_cdx" in row: + terminal_cdx = row["revisit_cdx"] + assert terminal_cdx["surt"] + if terminal_cdx["url"] != terminal["terminal_url"]: + self.counts["skip-terminal-url-mismatch"] += 1 return None wc_cdx = [] # primary resource first - wc_cdx.append(fatcat_openapi_client.WebcaptureCdxLine( - surt=terminal_cdx['surt'], - timestamp=terminal['terminal_timestamp'], - url=terminal['terminal_url'], - mimetype=file_meta['mimetype'], - status_code=terminal['terminal_status_code'], - sha1=file_meta['sha1hex'], - sha256=file_meta['sha256hex'], - size=file_meta['size_bytes'], - )) - - for resource in row.get('html_resources', []): - timestamp = resource['timestamp'] + wc_cdx.append( + fatcat_openapi_client.WebcaptureCdxLine( + surt=terminal_cdx["surt"], + timestamp=terminal["terminal_timestamp"], + url=terminal["terminal_url"], + mimetype=file_meta["mimetype"], + status_code=terminal["terminal_status_code"], + sha1=file_meta["sha1hex"], + sha256=file_meta["sha256hex"], + size=file_meta["size_bytes"], + ) + ) + + for resource in row.get("html_resources", []): + timestamp = resource["timestamp"] if "+" not in timestamp and "Z" not in timestamp: timestamp += "Z" - wc_cdx.append(fatcat_openapi_client.WebcaptureCdxLine( - surt=resource['surt'], - timestamp=timestamp, - url=resource['url'], - mimetype=resource.get('mimetype'), - size=resource.get('size'), - sha1=resource.get('sha1hex'), - sha256=resource.get('sha256hex'), - )) + wc_cdx.append( + fatcat_openapi_client.WebcaptureCdxLine( + surt=resource["surt"], + timestamp=timestamp, + url=resource["url"], + mimetype=resource.get("mimetype"), + size=resource.get("size"), + sha1=resource.get("sha1hex"), + sha256=resource.get("sha256hex"), + ) + ) wc = fatcat_openapi_client.WebcaptureEntity( cdx=wc_cdx, archive_urls=archive_urls, - original_url=terminal['terminal_url'], - timestamp=terminal['terminal_timestamp'], + original_url=terminal["terminal_url"], + timestamp=terminal["terminal_timestamp"], release_ids=[release_ident], ) @@ -491,11 +517,11 @@ class IngestWebResultImporter(IngestFileResultImporter): # check for existing edits-in-progress with same URL for other in self._entity_queue: if other.original_url == wc.original_url: - self.counts['skip-in-queue'] += 1 + self.counts["skip-in-queue"] += 1 return False # lookup sha1, or create new entity (TODO: API doesn't support this yet) - #existing = None + # existing = None # TODO: currently only allow one release per webcapture release = self.api.get_release(wc.release_ids[0], expand="webcaptures") @@ -504,9 +530,9 @@ class IngestWebResultImporter(IngestFileResultImporter): for other in release.webcaptures: if wc.original_url == other.original_url: # TODO: compare very similar timestamps of same time (different formats) - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False - self.counts['skip-release-has-webcapture'] += 1 + self.counts["skip-release-has-webcapture"] += 1 return False # Ok, if we got here then no existing web capture for (first) release, @@ -515,18 +541,24 @@ class IngestWebResultImporter(IngestFileResultImporter): def insert_batch(self, batch): if self.submit_mode: - eg = self.api.create_editgroup(fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra)) + eg = self.api.create_editgroup( + fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ) + ) for fe in batch: self.api.create_webcapture(eg.editgroup_id, fe) self.api.update_editgroup(eg.editgroup_id, eg, submit=True) else: - self.api.create_webcapture_auto_batch(fatcat_openapi_client.WebcaptureAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_webcapture_auto_batch( + fatcat_openapi_client.WebcaptureAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) + class SavePaperNowWebImporter(IngestWebResultImporter): """ @@ -535,15 +567,15 @@ class SavePaperNowWebImporter(IngestWebResultImporter): def __init__(self, api, submit_mode=True, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "Webcaptures crawled after a public 'Save Paper Now' request" - eg_extra = kwargs.pop('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.SavePaperNowWebImporter') - kwargs['submit_mode'] = submit_mode - kwargs['do_updates'] = False - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Webcaptures crawled after a public 'Save Paper Now' request" + ) + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.SavePaperNowWebImporter") + kwargs["submit_mode"] = submit_mode + kwargs["do_updates"] = False + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) def want(self, row): """ @@ -553,27 +585,27 @@ class SavePaperNowWebImporter(IngestWebResultImporter): path, which means allowing hit=false. """ - source = row['request'].get('ingest_request_source') + source = row["request"].get("ingest_request_source") if not source: - self.counts['skip-ingest_request_source'] += 1 + self.counts["skip-ingest_request_source"] += 1 return False - if not source.startswith('savepapernow'): - self.counts['skip-not-savepapernow'] += 1 + if not source.startswith("savepapernow"): + self.counts["skip-not-savepapernow"] += 1 return False # webcapture-specific filters - if row['request'].get('ingest_type') != 'html': - self.counts['skip-ingest-type'] += 1 + if row["request"].get("ingest_type") != "html": + self.counts["skip-ingest-type"] += 1 return False - if not row.get('file_meta'): - self.counts['skip-file-meta'] += 1 + if not row.get("file_meta"): + self.counts["skip-file-meta"] += 1 return False - if row['file_meta'].get('mimetype') not in ("text/html", "application/xhtml+xml"): - self.counts['skip-mimetype'] += 1 + if row["file_meta"].get("mimetype") not in ("text/html", "application/xhtml+xml"): + self.counts["skip-mimetype"] += 1 return False - if row.get('status') not in ['success', 'unknown-scope']: - self.counts['skip-hit'] += 1 + if row.get("status") not in ["success", "unknown-scope"]: + self.counts["skip-hit"] += 1 return False return True @@ -587,28 +619,28 @@ class IngestFilesetResultImporter(IngestFileResultImporter): def __init__(self, api, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "Filesets crawled from web using sandcrawler ingest tool" - eg_extra = kwargs.pop('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestFilesetResultImporter') - kwargs['do_updates'] = False - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Filesets crawled from web using sandcrawler ingest tool" + ) + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.IngestFilesetResultImporter") + kwargs["do_updates"] = False + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) self.max_file_count = 300 def want_fileset(self, row): - if not row.get('manifest') or len(row.get('manifest')) == 0: - self.counts['skip-empty-manifest'] += 1 + if not row.get("manifest") or len(row.get("manifest")) == 0: + self.counts["skip-empty-manifest"] += 1 return False - if len(row.get('manifest')) == 1: - self.counts['skip-single-file'] += 1 + if len(row.get("manifest")) == 1: + self.counts["skip-single-file"] += 1 return False - if len(row.get('manifest')) > self.max_file_count: - self.counts['skip-too-many-files'] += 1 + if len(row.get("manifest")) > self.max_file_count: + self.counts["skip-too-many-files"] += 1 return False return True @@ -619,8 +651,10 @@ class IngestFilesetResultImporter(IngestFileResultImporter): return False # fileset-specific filters - if row['request'].get('ingest_type') not in ['dataset',]: - self.counts['skip-ingest-type'] += 1 + if row["request"].get("ingest_type") not in [ + "dataset", + ]: + self.counts["skip-ingest-type"] += 1 return False if not self.want_fileset(row): @@ -629,102 +663,118 @@ class IngestFilesetResultImporter(IngestFileResultImporter): return True def parse_fileset_urls(self, row): - if not row.get('strategy'): + if not row.get("strategy"): return [] - strategy = row['strategy'] + strategy = row["strategy"] urls = [] - if strategy == 'archiveorg-fileset' and row.get('archiveorg_item_name'): - urls.append(fatcat_openapi_client.FilesetUrl( - url=f"https://archive.org/download/{row['archiveorg_item_name']}/", - rel="archive-base", - )) - if row['strategy'].startswith('web-') and row.get('platform_base_url'): - urls.append(fatcat_openapi_client.FilesetUrl( - url=f"https://web.archive.org/web/{row['web_base_url_dt']}/{row['web_base_url']}", - rel="webarchive-base", - )) + if strategy == "archiveorg-fileset" and row.get("archiveorg_item_name"): + urls.append( + fatcat_openapi_client.FilesetUrl( + url=f"https://archive.org/download/{row['archiveorg_item_name']}/", + rel="archive-base", + ) + ) + if row["strategy"].startswith("web-") and row.get("platform_base_url"): + urls.append( + fatcat_openapi_client.FilesetUrl( + url=f"https://web.archive.org/web/{row['web_base_url_dt']}/{row['web_base_url']}", + rel="webarchive-base", + ) + ) # TODO: repository-base # TODO: web-base - if row['strategy'] == 'archiveorg-fileset-bundle' and row.get('archiveorg_item_name'): - urls.append(fatcat_openapi_client.FilesetUrl( - url=f"https://archive.org/download/{row['archiveorg_item_name']}/{row['archiveorg_bundle_path']}", - rel="archive-bundle", - )) + if row["strategy"] == "archiveorg-fileset-bundle" and row.get("archiveorg_item_name"): + urls.append( + fatcat_openapi_client.FilesetUrl( + url=f"https://archive.org/download/{row['archiveorg_item_name']}/{row['archiveorg_bundle_path']}", + rel="archive-bundle", + ) + ) - if row['strategy'] == 'web-fileset-bundle' and row.get('platform_bundle_url'): - urls.append(fatcat_openapi_client.FilesetUrl( - url=f"https://web.archive.org/web/{row['web_bundle_url_dt']}/{row['web_bundle_url']}", - rel="webarchive-bundle", - )) + if row["strategy"] == "web-fileset-bundle" and row.get("platform_bundle_url"): + urls.append( + fatcat_openapi_client.FilesetUrl( + url=f"https://web.archive.org/web/{row['web_bundle_url_dt']}/{row['web_bundle_url']}", + rel="webarchive-bundle", + ) + ) # add any additional / platform URLs here - if row.get('platform_bundle_url'): - urls.append(fatcat_openapi_client.FilesetUrl( - url=row['platform_bundle_url'], - rel="repository-bundle", - )) - if row.get('platform_base_url'): - urls.append(fatcat_openapi_client.FilesetUrl( - url=row['platform_bundle_url'], - rel="repository-base", - )) + if row.get("platform_bundle_url"): + urls.append( + fatcat_openapi_client.FilesetUrl( + url=row["platform_bundle_url"], + rel="repository-bundle", + ) + ) + if row.get("platform_base_url"): + urls.append( + fatcat_openapi_client.FilesetUrl( + url=row["platform_bundle_url"], + rel="repository-base", + ) + ) return urls def parse_record(self, row): - request = row['request'] + request = row["request"] # double check that want() filtered request correctly - if request.get('ingest_type') not in ["dataset",]: - self.counts['skip-ingest-type'] += 1 + if request.get("ingest_type") not in [ + "dataset", + ]: + self.counts["skip-ingest-type"] += 1 return None # identify release by fatcat ident, or extid lookup release_ident = self.parse_ingest_release_ident(row) if not release_ident: - self.counts['skip-release-not-found'] += 1 + self.counts["skip-release-not-found"] += 1 return None entity_extra = dict() edit_extra = self.parse_edit_extra(row) - edit_extra['ingest_strategy'] = row['ingest_strategy'] - if row.get('platform'): - edit_extra['platform'] = row['platform'] - if row.get('platform_id'): - edit_extra['platform_id'] = row['platform_id'] + edit_extra["ingest_strategy"] = row["ingest_strategy"] + if row.get("platform"): + edit_extra["platform"] = row["platform"] + if row.get("platform_id"): + edit_extra["platform_id"] = row["platform_id"] entity_urls = self.parse_fileset_urls(row) if not entity_urls: - self.counts['skip-no-access-url'] += 1 + self.counts["skip-no-access-url"] += 1 return None - assert row['file_count'] == len(row['manifest']) - if row['file_count'] > self.max_file_count: - self.counts['skip-too-many-manifest-files'] += 1 + assert row["file_count"] == len(row["manifest"]) + if row["file_count"] > self.max_file_count: + self.counts["skip-too-many-manifest-files"] += 1 return None manifest = [] - for ingest_file in row['manifest']: + for ingest_file in row["manifest"]: fsf = fatcat_openapi_client.FilesetFile( - path=ingest_file['path'], - size=ingest_file['size'], - md5=ingest_file['md5'], - sha1=ingest_file['sha1'], - sha256=ingest_file.get('sha256'), + path=ingest_file["path"], + size=ingest_file["size"], + md5=ingest_file["md5"], + sha1=ingest_file["sha1"], + sha256=ingest_file.get("sha256"), extra=dict( - mimetype=ingest_file['mimetype'], + mimetype=ingest_file["mimetype"], ), ) if not (fsf.md5 and fsf.sha1 and fsf.path and fsf.size): - self.counts['skip-partial-file-info'] += 1 + self.counts["skip-partial-file-info"] += 1 return None - if ingest_file.get('platform_url'): + if ingest_file.get("platform_url"): # XXX: should we include this? - fsf.extra['original_url'] = ingest_file['platform_url'] - if ingest_file.get('terminal_url') and ingest_file.get('terminal_dt'): - fsf.extra['wayback_url'] = f"https://web.archive.org/web/{ingest_file['terminal_dt']}/{ingest_file['terminal_url']}" + fsf.extra["original_url"] = ingest_file["platform_url"] + if ingest_file.get("terminal_url") and ingest_file.get("terminal_dt"): + fsf.extra[ + "wayback_url" + ] = f"https://web.archive.org/web/{ingest_file['terminal_dt']}/{ingest_file['terminal_url']}" manifest.append(fsf) fe = fatcat_openapi_client.FilesetEntity( @@ -745,11 +795,11 @@ class IngestFilesetResultImporter(IngestFileResultImporter): for other in self._entity_queue: # XXX: how to duplicate check? if other.original_url == wc.original_url: - self.counts['skip-in-queue'] += 1 + self.counts["skip-in-queue"] += 1 return False # lookup sha1, or create new entity (TODO: API doesn't support this yet) - #existing = None + # existing = None # NOTE: in lieu of existing checks (by lookup), only allow one fileset per release release = self.api.get_release(wc.release_ids[0], expand="filesets") @@ -759,27 +809,32 @@ class IngestFilesetResultImporter(IngestFileResultImporter): for other in release.filesets: if wc.original_url == other.original_url: # TODO: compare very similar timestamps of same time (different formats) - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False - self.counts['skip-release-has-fileset'] += 1 + self.counts["skip-release-has-fileset"] += 1 return False return True def insert_batch(self, batch): if self.submit_mode: - eg = self.api.create_editgroup(fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra)) + eg = self.api.create_editgroup( + fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ) + ) for fe in batch: self.api.create_fileset(eg.editgroup_id, fe) self.api.update_editgroup(eg.editgroup_id, eg, submit=True) else: - self.api.create_fileset_auto_batch(fatcat_openapi_client.FilesetAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_fileset_auto_batch( + fatcat_openapi_client.FilesetAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) class SavePaperNowFilesetImporter(IngestFilesetResultImporter): @@ -789,28 +844,28 @@ class SavePaperNowFilesetImporter(IngestFilesetResultImporter): def __init__(self, api, submit_mode=True, **kwargs): - eg_desc = kwargs.pop('editgroup_description', None) or "Fileset crawled after a public 'Save Paper Now' request" - eg_extra = kwargs.pop('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.SavePaperNowFilesetImporter') - kwargs['submit_mode'] = submit_mode - kwargs['do_updates'] = False - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + eg_desc = ( + kwargs.pop("editgroup_description", None) + or "Fileset crawled after a public 'Save Paper Now' request" + ) + eg_extra = kwargs.pop("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.SavePaperNowFilesetImporter") + kwargs["submit_mode"] = submit_mode + kwargs["do_updates"] = False + super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) def want(self, row): - source = row['request'].get('ingest_request_source') + source = row["request"].get("ingest_request_source") if not source: - self.counts['skip-ingest_request_source'] += 1 + self.counts["skip-ingest_request_source"] += 1 return False - if not source.startswith('savepapernow'): - self.counts['skip-not-savepapernow'] += 1 + if not source.startswith("savepapernow"): + self.counts["skip-not-savepapernow"] += 1 return False - if row.get('hit') is not True: - self.counts['skip-hit'] += 1 + if row.get("hit") is not True: + self.counts["skip-hit"] += 1 return False if not self.want_fileset(row): |