Diffstat (limited to 'python/fatcat_tools/importers/ingest.py')
-rw-r--r--  python/fatcat_tools/importers/ingest.py  693
1 file changed, 374 insertions(+), 319 deletions(-)
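The diff below appears to be a pure formatting pass over the ingest importers (single quotes to double quotes, long calls wrapped across lines); no behavior change is intended. For orientation, a minimal usage sketch of the main importer class, assuming the generated fatcat_openapi_client constructors and the usual EntityImporter push interface (hypothetical, not part of this commit):

    import json

    import fatcat_openapi_client
    from fatcat_tools.importers.ingest import IngestFileResultImporter

    # assumption: standard generated-client constructors; a real deployment
    # would also configure the API host and an auth token
    api = fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient())
    importer = IngestFileResultImporter(api, require_grobid=True)

    # hypothetical sandcrawler ingest-result file, one JSON object per line
    with open("ingest_file_results.json") as f:
        for line in f:
            row = json.loads(line)
            # push_record() applies want() and parse_record(), then queues
            # entities for insert_batch()
            importer.push_record(row)
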
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index f0943c1e..e0a6c3f5 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -1,4 +1,3 @@
-
import datetime
import fatcat_openapi_client
@@ -7,17 +6,16 @@ from .common import EntityImporter, make_rel_url
class IngestFileResultImporter(EntityImporter):
-
def __init__(self, api, require_grobid=True, **kwargs):
- eg_desc = kwargs.pop('editgroup_description', None) or "Files crawled from web using sandcrawler ingest tool"
- eg_extra = kwargs.pop('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestFileResultImporter')
- kwargs['do_updates'] = kwargs.get("do_updates", False)
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ eg_desc = (
+ kwargs.pop("editgroup_description", None)
+ or "Files crawled from web using sandcrawler ingest tool"
+ )
+ eg_extra = kwargs.pop("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.IngestFileResultImporter")
+ kwargs["do_updates"] = kwargs.get("do_updates", False)
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
self.use_glutton_match = False
self.default_link_rel = kwargs.get("default_link_rel", "web")
assert self.default_link_rel
@@ -27,20 +25,20 @@ class IngestFileResultImporter(EntityImporter):
else:
print("NOT checking GROBID success")
self.ingest_request_source_allowlist = [
- 'fatcat-changelog',
- 'fatcat-ingest-container',
- 'fatcat-ingest',
- 'arabesque',
+ "fatcat-changelog",
+ "fatcat-ingest-container",
+ "fatcat-ingest",
+ "arabesque",
#'mag-corpus',
#'mag',
- 'unpaywall-corpus',
- 'unpaywall',
+ "unpaywall-corpus",
+ "unpaywall",
#'s2-corpus',
#'s2',
- 'doaj',
- 'dblp',
+ "doaj",
+ "dblp",
]
- if kwargs.get('skip_source_allowlist', False):
+ if kwargs.get("skip_source_allowlist", False):
self.ingest_request_source_allowlist = []
def want_file(self, row) -> bool:
@@ -48,28 +46,32 @@ class IngestFileResultImporter(EntityImporter):
File-specific part of want(). Generic across general ingest and save-paper-now.
"""
- if not row.get('file_meta'):
- self.counts['skip-file-meta'] += 1
+ if not row.get("file_meta"):
+ self.counts["skip-file-meta"] += 1
return False
# type-specific filters
- if row['request'].get('ingest_type') == 'pdf':
- if self.require_grobid and row.get('grobid', {}).get('status_code') != 200:
- self.counts['skip-grobid'] += 1
+ if row["request"].get("ingest_type") == "pdf":
+ if self.require_grobid and row.get("grobid", {}).get("status_code") != 200:
+ self.counts["skip-grobid"] += 1
return False
- if row['file_meta'].get('mimetype') not in ("application/pdf",):
- self.counts['skip-mimetype'] += 1
+ if row["file_meta"].get("mimetype") not in ("application/pdf",):
+ self.counts["skip-mimetype"] += 1
return False
- elif row['request'].get('ingest_type') == 'xml':
- if row['file_meta'].get('mimetype') not in ("application/xml",
- "application/jats+xml", "application/tei+xml", "text/xml"):
- self.counts['skip-mimetype'] += 1
+ elif row["request"].get("ingest_type") == "xml":
+ if row["file_meta"].get("mimetype") not in (
+ "application/xml",
+ "application/jats+xml",
+ "application/tei+xml",
+ "text/xml",
+ ):
+ self.counts["skip-mimetype"] += 1
return False
- elif row['request'].get('ingest_type') in ['component', 'src', 'dataset-file']:
+ elif row["request"].get("ingest_type") in ["component", "src", "dataset-file"]:
# we rely on sandcrawler for these checks
pass
else:
- self.counts['skip-ingest-type'] += 1
+ self.counts["skip-ingest-type"] += 1
return False
return True
@@ -79,24 +81,36 @@ class IngestFileResultImporter(EntityImporter):
Sandcrawler ingest-specific part of want(). Generic across file and
webcapture ingest.
"""
- if row.get('hit') is not True:
- self.counts['skip-hit'] += 1
+ if row.get("hit") is not True:
+ self.counts["skip-hit"] += 1
return False
- source = row['request'].get('ingest_request_source')
+ source = row["request"].get("ingest_request_source")
if not source:
- self.counts['skip-ingest_request_source'] += 1
+ self.counts["skip-ingest_request_source"] += 1
return False
- if self.ingest_request_source_allowlist and source not in self.ingest_request_source_allowlist:
- self.counts['skip-ingest_request_source'] += 1
+ if (
+ self.ingest_request_source_allowlist
+ and source not in self.ingest_request_source_allowlist
+ ):
+ self.counts["skip-ingest_request_source"] += 1
return False
- if row['request'].get('link_source') not in ('arxiv', 'pmc', 'unpaywall', 'doi', 'mag', 's2', 'doaj', 'dblp'):
- self.counts['skip-link-source'] += 1
+ if row["request"].get("link_source") not in (
+ "arxiv",
+ "pmc",
+ "unpaywall",
+ "doi",
+ "mag",
+ "s2",
+ "doaj",
+ "dblp",
+ ):
+ self.counts["skip-link-source"] += 1
return False
- if source.startswith('savepapernow'):
+ if source.startswith("savepapernow"):
# never process async savepapernow requests
- self.counts['skip-savepapernow'] += 1
+ self.counts["skip-savepapernow"] += 1
return False
return True
@@ -125,19 +139,19 @@ class IngestFileResultImporter(EntityImporter):
def parse_ingest_release_ident(self, row):
- request = row['request']
- fatcat = request.get('fatcat')
+ request = row["request"]
+ fatcat = request.get("fatcat")
release_ident = None
- if fatcat and fatcat.get('release_ident'):
- release_ident = fatcat.get('release_ident')
- elif request.get('ext_ids'):
+ if fatcat and fatcat.get("release_ident"):
+ release_ident = fatcat.get("release_ident")
+ elif request.get("ext_ids"):
# if no fatcat ident, try extids
- for extid_type in ('doi', 'pmid', 'pmcid', 'arxiv', 'doaj', 'dblp'):
- extid = request['ext_ids'].get(extid_type)
+ for extid_type in ("doi", "pmid", "pmcid", "arxiv", "doaj", "dblp"):
+ extid = request["ext_ids"].get(extid_type)
if not extid:
continue
- if extid_type == 'doi':
+ if extid_type == "doi":
extid = extid.lower()
try:
release = self.api.lookup_release(**{extid_type: extid})
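(In the extid branch above, each candidate identifier type is tried with a single keyword-argument lookup; illustratively, with a hypothetical DOI value:

    # e.g. when extid_type == "doi", the call expands to:
    release = self.api.lookup_release(doi="10.1000/xyz123")  # hypothetical DOI
    # a 404 from the API means "try the next identifier type"; a 400 is
    # counted as warn-extid-invalid and also skipped, as the hunk below shows
)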
@@ -145,66 +159,69 @@ class IngestFileResultImporter(EntityImporter):
if err.status == 404:
continue
elif err.status == 400:
- self.counts['warn-extid-invalid'] += 1
+ self.counts["warn-extid-invalid"] += 1
continue
raise err
# verify release_stage
- if request.get('release_stage') and release.release_stage:
- if request['release_stage'] != release.release_stage:
- self.counts['skip-release-stage'] += 1
+ if request.get("release_stage") and release.release_stage:
+ if request["release_stage"] != release.release_stage:
+ self.counts["skip-release-stage"] += 1
return None
release_ident = release.ident
break
- if self.use_glutton_match and not release_ident and row.get('grobid'):
+ if self.use_glutton_match and not release_ident and row.get("grobid"):
# try biblio-glutton extracted hit
- if row['grobid'].get('fatcat_release'):
- release_ident = row['grobid']['fatcat_release'].split('_')[-1]
- self.counts['glutton-match'] += 1
+ if row["grobid"].get("fatcat_release"):
+ release_ident = row["grobid"]["fatcat_release"].split("_")[-1]
+ self.counts["glutton-match"] += 1
return release_ident
def parse_terminal(self, row):
- terminal = row.get('terminal')
+ terminal = row.get("terminal")
if not terminal:
# support old cdx-only ingest results
- cdx = row.get('cdx')
+ cdx = row.get("cdx")
if not cdx:
return None
else:
terminal = {
- 'terminal_url': cdx['url'],
- 'terminal_dt': cdx['datetime'],
- 'terminal_status_code': cdx.get('status_code') or cdx.get('http_status'),
+ "terminal_url": cdx["url"],
+ "terminal_dt": cdx["datetime"],
+ "terminal_status_code": cdx.get("status_code") or cdx.get("http_status"),
}
# work around old schema
- if 'terminal_url' not in terminal:
- terminal['terminal_url'] = terminal['url']
- if 'terminal_dt' not in terminal:
- terminal['terminal_dt'] = terminal['dt']
+ if "terminal_url" not in terminal:
+ terminal["terminal_url"] = terminal["url"]
+ if "terminal_dt" not in terminal:
+ terminal["terminal_dt"] = terminal["dt"]
# convert CDX-style digits to ISO-style timestamp
- assert len(terminal['terminal_dt']) == 14
- terminal['terminal_timestamp'] = datetime.datetime.strptime(terminal['terminal_dt'], "%Y%m%d%H%M%S").isoformat() + "Z"
+ assert len(terminal["terminal_dt"]) == 14
+ terminal["terminal_timestamp"] = (
+ datetime.datetime.strptime(terminal["terminal_dt"], "%Y%m%d%H%M%S").isoformat()
+ + "Z"
+ )
return terminal
def parse_urls(self, row, terminal):
- request = row['request']
+ request = row["request"]
default_rel = self.default_link_rel
- if request.get('link_source') == 'doi':
- default_rel = 'publisher'
- default_rel = request.get('rel', default_rel)
- url = make_rel_url(terminal['terminal_url'], default_rel)
+ if request.get("link_source") == "doi":
+ default_rel = "publisher"
+ default_rel = request.get("rel", default_rel)
+ url = make_rel_url(terminal["terminal_url"], default_rel)
if not url:
- self.counts['skip-url'] += 1
+ self.counts["skip-url"] += 1
return None
wayback = "https://web.archive.org/web/{}/{}".format(
- terminal['terminal_dt'],
- terminal['terminal_url'])
+ terminal["terminal_dt"], terminal["terminal_url"]
+ )
urls = [url, ("webarchive", wayback)]
urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls]
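(parse_terminal() in the hunk above converts 14-digit CDX datetimes to ISO-8601, and parse_urls() reuses the raw digits in the wayback replay URL; a standalone illustration of that conversion, with a hypothetical helper name:

    import datetime

    def cdx_to_iso(terminal_dt: str) -> str:
        # CDX datetimes are exactly 14 digits: YYYYMMDDHHMMSS, interpreted as UTC
        assert len(terminal_dt) == 14
        return datetime.datetime.strptime(terminal_dt, "%Y%m%d%H%M%S").isoformat() + "Z"

    # cdx_to_iso("20200103123456") == "2020-01-03T12:34:56Z"
    # the matching wayback URL keeps the raw digits, e.g. (hypothetical URL):
    # https://web.archive.org/web/20200103123456/https://example.com/paper.pdf
)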
@@ -212,38 +229,38 @@ class IngestFileResultImporter(EntityImporter):
def parse_edit_extra(self, row):
- request = row['request']
+ request = row["request"]
edit_extra = dict()
- if request.get('edit_extra'):
- edit_extra = request['edit_extra']
+ if request.get("edit_extra"):
+ edit_extra = request["edit_extra"]
- if request.get('ingest_request_source'):
- edit_extra['ingest_request_source'] = request['ingest_request_source']
- if request.get('link_source') and request.get('link_source_id'):
- edit_extra['link_source'] = request['link_source']
- edit_extra['link_source_id'] = request['link_source_id']
- if edit_extra['link_source'] == 'doi':
- edit_extra['link_source_id'] = edit_extra['link_source_id'].lower()
+ if request.get("ingest_request_source"):
+ edit_extra["ingest_request_source"] = request["ingest_request_source"]
+ if request.get("link_source") and request.get("link_source_id"):
+ edit_extra["link_source"] = request["link_source"]
+ edit_extra["link_source_id"] = request["link_source_id"]
+ if edit_extra["link_source"] == "doi":
+ edit_extra["link_source_id"] = edit_extra["link_source_id"].lower()
# GROBID metadata, for SPN requests (when there might not be 'success')
- if request.get('ingest_type') == 'pdf':
- if row.get('grobid') and row['grobid'].get('status') != 'success':
- edit_extra['grobid_status_code'] = row['grobid']['status_code']
- edit_extra['grobid_version'] = row['grobid'].get('grobid_version')
+ if request.get("ingest_type") == "pdf":
+ if row.get("grobid") and row["grobid"].get("status") != "success":
+ edit_extra["grobid_status_code"] = row["grobid"]["status_code"]
+ edit_extra["grobid_version"] = row["grobid"].get("grobid_version")
return edit_extra
def parse_record(self, row):
- request = row['request']
- file_meta = row['file_meta']
+ request = row["request"]
+ file_meta = row["file_meta"]
# double check that want() filtered request correctly (eg, old requests)
- if request.get('ingest_type') not in ('pdf', 'xml'):
- self.counts['skip-ingest-type'] += 1
+ if request.get("ingest_type") not in ("pdf", "xml"):
+ self.counts["skip-ingest-type"] += 1
return None
- assert (request['ingest_type'], file_meta['mimetype']) in [
+ assert (request["ingest_type"], file_meta["mimetype"]) in [
("pdf", "application/pdf"),
("xml", "application/xml"),
("xml", "application/jats+xml"),
@@ -255,23 +272,23 @@ class IngestFileResultImporter(EntityImporter):
release_ident = self.parse_ingest_release_ident(row)
if not release_ident:
- self.counts['skip-release-not-found'] += 1
+ self.counts["skip-release-not-found"] += 1
return None
terminal = self.parse_terminal(row)
if not terminal:
# TODO: support archive.org hits?
- self.counts['skip-no-terminal'] += 1
+ self.counts["skip-no-terminal"] += 1
return None
urls = self.parse_urls(row, terminal)
fe = fatcat_openapi_client.FileEntity(
- md5=file_meta['md5hex'],
- sha1=file_meta['sha1hex'],
- sha256=file_meta['sha256hex'],
- size=file_meta['size_bytes'],
- mimetype=file_meta['mimetype'],
+ md5=file_meta["md5hex"],
+ sha1=file_meta["sha1hex"],
+ sha256=file_meta["sha256hex"],
+ size=file_meta["size_bytes"],
+ mimetype=file_meta["mimetype"],
release_ids=[release_ident],
urls=urls,
)
@@ -293,7 +310,7 @@ class IngestFileResultImporter(EntityImporter):
# check for existing edits-in-progress with same file hash
for other in self._entity_queue:
if other.sha1 == fe.sha1:
- self.counts['skip-in-queue'] += 1
+ self.counts["skip-in-queue"] += 1
return False
if not existing:
@@ -302,31 +319,36 @@ class IngestFileResultImporter(EntityImporter):
# NOTE: the following checks all assume there is an existing item
if (fe.release_ids[0] in existing.release_ids) and existing.urls:
# TODO: could still, in theory update with the new URL?
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
if not self.do_updates:
- self.counts['skip-update-disabled'] += 1
+ self.counts["skip-update-disabled"] += 1
return False
# TODO: for now, never update
- self.counts['skip-update-disabled'] += 1
+ self.counts["skip-update-disabled"] += 1
return False
def insert_batch(self, batch):
if self.submit_mode:
- eg = self.api.create_editgroup(fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra))
+ eg = self.api.create_editgroup(
+ fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ )
+ )
for fe in batch:
self.api.create_file(eg.editgroup_id, fe)
self.api.update_editgroup(eg.editgroup_id, eg, submit=True)
else:
- self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_file_auto_batch(
+ fatcat_openapi_client.FileAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
class SavePaperNowFileImporter(IngestFileResultImporter):
@@ -338,29 +360,29 @@ class SavePaperNowFileImporter(IngestFileResultImporter):
def __init__(self, api, submit_mode=True, **kwargs):
- eg_desc = kwargs.pop('editgroup_description', None) or "Files crawled after a public 'Save Paper Now' request"
- eg_extra = kwargs.pop('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.SavePaperNowFileImporter')
- kwargs['submit_mode'] = submit_mode
- kwargs['require_grobid'] = False
- kwargs['do_updates'] = False
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ eg_desc = (
+ kwargs.pop("editgroup_description", None)
+ or "Files crawled after a public 'Save Paper Now' request"
+ )
+ eg_extra = kwargs.pop("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.SavePaperNowFileImporter")
+ kwargs["submit_mode"] = submit_mode
+ kwargs["require_grobid"] = False
+ kwargs["do_updates"] = False
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
def want(self, row):
- source = row['request'].get('ingest_request_source')
+ source = row["request"].get("ingest_request_source")
if not source:
- self.counts['skip-ingest_request_source'] += 1
+ self.counts["skip-ingest_request_source"] += 1
return False
- if not source.startswith('savepapernow'):
- self.counts['skip-not-savepapernow'] += 1
+ if not source.startswith("savepapernow"):
+ self.counts["skip-not-savepapernow"] += 1
return False
- if row.get('hit') is not True:
- self.counts['skip-hit'] += 1
+ if row.get("hit") is not True:
+ self.counts["skip-hit"] += 1
return False
if not self.want_file(row):
@@ -377,14 +399,14 @@ class IngestWebResultImporter(IngestFileResultImporter):
def __init__(self, api, **kwargs):
- eg_desc = kwargs.pop('editgroup_description', None) or "Webcaptures crawled from web using sandcrawler ingest tool"
- eg_extra = kwargs.pop('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestWebResultImporter')
- kwargs['do_updates'] = False
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ eg_desc = (
+ kwargs.pop("editgroup_description", None)
+ or "Webcaptures crawled from web using sandcrawler ingest tool"
+ )
+ eg_extra = kwargs.pop("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.IngestWebResultImporter")
+ kwargs["do_updates"] = False
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
def want(self, row):
@@ -392,91 +414,95 @@ class IngestWebResultImporter(IngestFileResultImporter):
return False
# webcapture-specific filters
- if row['request'].get('ingest_type') != 'html':
- self.counts['skip-ingest-type'] += 1
+ if row["request"].get("ingest_type") != "html":
+ self.counts["skip-ingest-type"] += 1
return False
- if not row.get('file_meta'):
- self.counts['skip-file-meta'] += 1
+ if not row.get("file_meta"):
+ self.counts["skip-file-meta"] += 1
return False
- if row['file_meta'].get('mimetype') not in ("text/html", "application/xhtml+xml"):
- self.counts['skip-mimetype'] += 1
+ if row["file_meta"].get("mimetype") not in ("text/html", "application/xhtml+xml"):
+ self.counts["skip-mimetype"] += 1
return False
return True
def parse_record(self, row):
- request = row['request']
- file_meta = row['file_meta']
+ request = row["request"]
+ file_meta = row["file_meta"]
# double check that want() filtered request correctly (eg, old requests)
- if request.get('ingest_type') != "html":
- self.counts['skip-ingest-type'] += 1
+ if request.get("ingest_type") != "html":
+ self.counts["skip-ingest-type"] += 1
return None
- if file_meta['mimetype'] not in ("text/html", "application/xhtml+xml"):
- self.counts['skip-mimetype'] += 1
+ if file_meta["mimetype"] not in ("text/html", "application/xhtml+xml"):
+ self.counts["skip-mimetype"] += 1
return None
# identify release by fatcat ident, or extid lookup
release_ident = self.parse_ingest_release_ident(row)
if not release_ident:
- self.counts['skip-release-not-found'] += 1
+ self.counts["skip-release-not-found"] += 1
return None
terminal = self.parse_terminal(row)
if not terminal:
# TODO: support archive.org hits?
- self.counts['skip-no-terminal'] += 1
+ self.counts["skip-no-terminal"] += 1
return None
urls = self.parse_urls(row, terminal)
- archive_urls = [u for u in urls if u.rel == 'webarchive']
+ archive_urls = [u for u in urls if u.rel == "webarchive"]
- if terminal['terminal_status_code'] != 200:
- self.counts['skip-terminal-status-code'] += 1
+ if terminal["terminal_status_code"] != 200:
+ self.counts["skip-terminal-status-code"] += 1
return None
- terminal_cdx = row['cdx']
- if 'revisit_cdx' in row:
- terminal_cdx = row['revisit_cdx']
- assert terminal_cdx['surt']
- if terminal_cdx['url'] != terminal['terminal_url']:
- self.counts['skip-terminal-url-mismatch'] += 1
+ terminal_cdx = row["cdx"]
+ if "revisit_cdx" in row:
+ terminal_cdx = row["revisit_cdx"]
+ assert terminal_cdx["surt"]
+ if terminal_cdx["url"] != terminal["terminal_url"]:
+ self.counts["skip-terminal-url-mismatch"] += 1
return None
wc_cdx = []
# primary resource first
- wc_cdx.append(fatcat_openapi_client.WebcaptureCdxLine(
- surt=terminal_cdx['surt'],
- timestamp=terminal['terminal_timestamp'],
- url=terminal['terminal_url'],
- mimetype=file_meta['mimetype'],
- status_code=terminal['terminal_status_code'],
- sha1=file_meta['sha1hex'],
- sha256=file_meta['sha256hex'],
- size=file_meta['size_bytes'],
- ))
-
- for resource in row.get('html_resources', []):
- timestamp = resource['timestamp']
+ wc_cdx.append(
+ fatcat_openapi_client.WebcaptureCdxLine(
+ surt=terminal_cdx["surt"],
+ timestamp=terminal["terminal_timestamp"],
+ url=terminal["terminal_url"],
+ mimetype=file_meta["mimetype"],
+ status_code=terminal["terminal_status_code"],
+ sha1=file_meta["sha1hex"],
+ sha256=file_meta["sha256hex"],
+ size=file_meta["size_bytes"],
+ )
+ )
+
+ for resource in row.get("html_resources", []):
+ timestamp = resource["timestamp"]
if "+" not in timestamp and "Z" not in timestamp:
timestamp += "Z"
- wc_cdx.append(fatcat_openapi_client.WebcaptureCdxLine(
- surt=resource['surt'],
- timestamp=timestamp,
- url=resource['url'],
- mimetype=resource.get('mimetype'),
- size=resource.get('size'),
- sha1=resource.get('sha1hex'),
- sha256=resource.get('sha256hex'),
- ))
+ wc_cdx.append(
+ fatcat_openapi_client.WebcaptureCdxLine(
+ surt=resource["surt"],
+ timestamp=timestamp,
+ url=resource["url"],
+ mimetype=resource.get("mimetype"),
+ size=resource.get("size"),
+ sha1=resource.get("sha1hex"),
+ sha256=resource.get("sha256hex"),
+ )
+ )
wc = fatcat_openapi_client.WebcaptureEntity(
cdx=wc_cdx,
archive_urls=archive_urls,
- original_url=terminal['terminal_url'],
- timestamp=terminal['terminal_timestamp'],
+ original_url=terminal["terminal_url"],
+ timestamp=terminal["terminal_timestamp"],
release_ids=[release_ident],
)
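(The html_resources loop in the hunk above appends a trailing "Z" when a resource timestamp carries no timezone marker; a small standalone sketch of that normalization, with a hypothetical function name:

    def normalize_resource_timestamp(timestamp: str) -> str:
        # sandcrawler HTML-resource timestamps may lack a timezone; treat
        # naive values as UTC, mirroring the loop above
        if "+" not in timestamp and "Z" not in timestamp:
            timestamp += "Z"
        return timestamp

    # normalize_resource_timestamp("2021-05-01T12:00:00")       -> "2021-05-01T12:00:00Z"
    # normalize_resource_timestamp("2021-05-01T12:00:00+00:00") -> unchanged
)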
@@ -491,11 +517,11 @@ class IngestWebResultImporter(IngestFileResultImporter):
# check for existing edits-in-progress with same URL
for other in self._entity_queue:
if other.original_url == wc.original_url:
- self.counts['skip-in-queue'] += 1
+ self.counts["skip-in-queue"] += 1
return False
# lookup sha1, or create new entity (TODO: API doesn't support this yet)
- #existing = None
+ # existing = None
# TODO: currently only allow one release per webcapture
release = self.api.get_release(wc.release_ids[0], expand="webcaptures")
@@ -504,9 +530,9 @@ class IngestWebResultImporter(IngestFileResultImporter):
for other in release.webcaptures:
if wc.original_url == other.original_url:
# TODO: compare very similar timestamps of same time (different formats)
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
- self.counts['skip-release-has-webcapture'] += 1
+ self.counts["skip-release-has-webcapture"] += 1
return False
# Ok, if we got here then no existing web capture for (first) release,
@@ -515,18 +541,24 @@ class IngestWebResultImporter(IngestFileResultImporter):
def insert_batch(self, batch):
if self.submit_mode:
- eg = self.api.create_editgroup(fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra))
+ eg = self.api.create_editgroup(
+ fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ )
+ )
for fe in batch:
self.api.create_webcapture(eg.editgroup_id, fe)
self.api.update_editgroup(eg.editgroup_id, eg, submit=True)
else:
- self.api.create_webcapture_auto_batch(fatcat_openapi_client.WebcaptureAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_webcapture_auto_batch(
+ fatcat_openapi_client.WebcaptureAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
+
class SavePaperNowWebImporter(IngestWebResultImporter):
"""
@@ -535,15 +567,15 @@ class SavePaperNowWebImporter(IngestWebResultImporter):
def __init__(self, api, submit_mode=True, **kwargs):
- eg_desc = kwargs.pop('editgroup_description', None) or "Webcaptures crawled after a public 'Save Paper Now' request"
- eg_extra = kwargs.pop('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.SavePaperNowWebImporter')
- kwargs['submit_mode'] = submit_mode
- kwargs['do_updates'] = False
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ eg_desc = (
+ kwargs.pop("editgroup_description", None)
+ or "Webcaptures crawled after a public 'Save Paper Now' request"
+ )
+ eg_extra = kwargs.pop("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.SavePaperNowWebImporter")
+ kwargs["submit_mode"] = submit_mode
+ kwargs["do_updates"] = False
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
def want(self, row):
"""
@@ -553,27 +585,27 @@ class SavePaperNowWebImporter(IngestWebResultImporter):
path, which means allowing hit=false.
"""
- source = row['request'].get('ingest_request_source')
+ source = row["request"].get("ingest_request_source")
if not source:
- self.counts['skip-ingest_request_source'] += 1
+ self.counts["skip-ingest_request_source"] += 1
return False
- if not source.startswith('savepapernow'):
- self.counts['skip-not-savepapernow'] += 1
+ if not source.startswith("savepapernow"):
+ self.counts["skip-not-savepapernow"] += 1
return False
# webcapture-specific filters
- if row['request'].get('ingest_type') != 'html':
- self.counts['skip-ingest-type'] += 1
+ if row["request"].get("ingest_type") != "html":
+ self.counts["skip-ingest-type"] += 1
return False
- if not row.get('file_meta'):
- self.counts['skip-file-meta'] += 1
+ if not row.get("file_meta"):
+ self.counts["skip-file-meta"] += 1
return False
- if row['file_meta'].get('mimetype') not in ("text/html", "application/xhtml+xml"):
- self.counts['skip-mimetype'] += 1
+ if row["file_meta"].get("mimetype") not in ("text/html", "application/xhtml+xml"):
+ self.counts["skip-mimetype"] += 1
return False
- if row.get('status') not in ['success', 'unknown-scope']:
- self.counts['skip-hit'] += 1
+ if row.get("status") not in ["success", "unknown-scope"]:
+ self.counts["skip-hit"] += 1
return False
return True
@@ -587,28 +619,28 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
def __init__(self, api, **kwargs):
- eg_desc = kwargs.pop('editgroup_description', None) or "Filesets crawled from web using sandcrawler ingest tool"
- eg_extra = kwargs.pop('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestFilesetResultImporter')
- kwargs['do_updates'] = False
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ eg_desc = (
+ kwargs.pop("editgroup_description", None)
+ or "Filesets crawled from web using sandcrawler ingest tool"
+ )
+ eg_extra = kwargs.pop("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.IngestFilesetResultImporter")
+ kwargs["do_updates"] = False
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
self.max_file_count = 300
def want_fileset(self, row):
- if not row.get('manifest') or len(row.get('manifest')) == 0:
- self.counts['skip-empty-manifest'] += 1
+ if not row.get("manifest") or len(row.get("manifest")) == 0:
+ self.counts["skip-empty-manifest"] += 1
return False
- if len(row.get('manifest')) == 1:
- self.counts['skip-single-file'] += 1
+ if len(row.get("manifest")) == 1:
+ self.counts["skip-single-file"] += 1
return False
- if len(row.get('manifest')) > self.max_file_count:
- self.counts['skip-too-many-files'] += 1
+ if len(row.get("manifest")) > self.max_file_count:
+ self.counts["skip-too-many-files"] += 1
return False
return True
@@ -619,8 +651,10 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
return False
# fileset-specific filters
- if row['request'].get('ingest_type') not in ['dataset',]:
- self.counts['skip-ingest-type'] += 1
+ if row["request"].get("ingest_type") not in [
+ "dataset",
+ ]:
+ self.counts["skip-ingest-type"] += 1
return False
if not self.want_fileset(row):
@@ -629,102 +663,118 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
return True
def parse_fileset_urls(self, row):
- if not row.get('strategy'):
+ if not row.get("strategy"):
return []
- strategy = row['strategy']
+ strategy = row["strategy"]
urls = []
- if strategy == 'archiveorg-fileset' and row.get('archiveorg_item_name'):
- urls.append(fatcat_openapi_client.FilesetUrl(
- url=f"https://archive.org/download/{row['archiveorg_item_name']}/",
- rel="archive-base",
- ))
- if row['strategy'].startswith('web-') and row.get('platform_base_url'):
- urls.append(fatcat_openapi_client.FilesetUrl(
- url=f"https://web.archive.org/web/{row['web_base_url_dt']}/{row['web_base_url']}",
- rel="webarchive-base",
- ))
+ if strategy == "archiveorg-fileset" and row.get("archiveorg_item_name"):
+ urls.append(
+ fatcat_openapi_client.FilesetUrl(
+ url=f"https://archive.org/download/{row['archiveorg_item_name']}/",
+ rel="archive-base",
+ )
+ )
+ if row["strategy"].startswith("web-") and row.get("platform_base_url"):
+ urls.append(
+ fatcat_openapi_client.FilesetUrl(
+ url=f"https://web.archive.org/web/{row['web_base_url_dt']}/{row['web_base_url']}",
+ rel="webarchive-base",
+ )
+ )
# TODO: repository-base
# TODO: web-base
- if row['strategy'] == 'archiveorg-fileset-bundle' and row.get('archiveorg_item_name'):
- urls.append(fatcat_openapi_client.FilesetUrl(
- url=f"https://archive.org/download/{row['archiveorg_item_name']}/{row['archiveorg_bundle_path']}",
- rel="archive-bundle",
- ))
+ if row["strategy"] == "archiveorg-fileset-bundle" and row.get("archiveorg_item_name"):
+ urls.append(
+ fatcat_openapi_client.FilesetUrl(
+ url=f"https://archive.org/download/{row['archiveorg_item_name']}/{row['archiveorg_bundle_path']}",
+ rel="archive-bundle",
+ )
+ )
- if row['strategy'] == 'web-fileset-bundle' and row.get('platform_bundle_url'):
- urls.append(fatcat_openapi_client.FilesetUrl(
- url=f"https://web.archive.org/web/{row['web_bundle_url_dt']}/{row['web_bundle_url']}",
- rel="webarchive-bundle",
- ))
+ if row["strategy"] == "web-fileset-bundle" and row.get("platform_bundle_url"):
+ urls.append(
+ fatcat_openapi_client.FilesetUrl(
+ url=f"https://web.archive.org/web/{row['web_bundle_url_dt']}/{row['web_bundle_url']}",
+ rel="webarchive-bundle",
+ )
+ )
# add any additional / platform URLs here
- if row.get('platform_bundle_url'):
- urls.append(fatcat_openapi_client.FilesetUrl(
- url=row['platform_bundle_url'],
- rel="repository-bundle",
- ))
- if row.get('platform_base_url'):
- urls.append(fatcat_openapi_client.FilesetUrl(
- url=row['platform_bundle_url'],
- rel="repository-base",
- ))
+ if row.get("platform_bundle_url"):
+ urls.append(
+ fatcat_openapi_client.FilesetUrl(
+ url=row["platform_bundle_url"],
+ rel="repository-bundle",
+ )
+ )
+ if row.get("platform_base_url"):
+ urls.append(
+ fatcat_openapi_client.FilesetUrl(
+ url=row["platform_bundle_url"],
+ rel="repository-base",
+ )
+ )
return urls
def parse_record(self, row):
- request = row['request']
+ request = row["request"]
# double check that want() filtered request correctly
- if request.get('ingest_type') not in ["dataset",]:
- self.counts['skip-ingest-type'] += 1
+ if request.get("ingest_type") not in [
+ "dataset",
+ ]:
+ self.counts["skip-ingest-type"] += 1
return None
# identify release by fatcat ident, or extid lookup
release_ident = self.parse_ingest_release_ident(row)
if not release_ident:
- self.counts['skip-release-not-found'] += 1
+ self.counts["skip-release-not-found"] += 1
return None
entity_extra = dict()
edit_extra = self.parse_edit_extra(row)
- edit_extra['ingest_strategy'] = row['ingest_strategy']
- if row.get('platform'):
- edit_extra['platform'] = row['platform']
- if row.get('platform_id'):
- edit_extra['platform_id'] = row['platform_id']
+ edit_extra["ingest_strategy"] = row["ingest_strategy"]
+ if row.get("platform"):
+ edit_extra["platform"] = row["platform"]
+ if row.get("platform_id"):
+ edit_extra["platform_id"] = row["platform_id"]
entity_urls = self.parse_fileset_urls(row)
if not entity_urls:
- self.counts['skip-no-access-url'] += 1
+ self.counts["skip-no-access-url"] += 1
return None
- assert row['file_count'] == len(row['manifest'])
- if row['file_count'] > self.max_file_count:
- self.counts['skip-too-many-manifest-files'] += 1
+ assert row["file_count"] == len(row["manifest"])
+ if row["file_count"] > self.max_file_count:
+ self.counts["skip-too-many-manifest-files"] += 1
return None
manifest = []
- for ingest_file in row['manifest']:
+ for ingest_file in row["manifest"]:
fsf = fatcat_openapi_client.FilesetFile(
- path=ingest_file['path'],
- size=ingest_file['size'],
- md5=ingest_file['md5'],
- sha1=ingest_file['sha1'],
- sha256=ingest_file.get('sha256'),
+ path=ingest_file["path"],
+ size=ingest_file["size"],
+ md5=ingest_file["md5"],
+ sha1=ingest_file["sha1"],
+ sha256=ingest_file.get("sha256"),
extra=dict(
- mimetype=ingest_file['mimetype'],
+ mimetype=ingest_file["mimetype"],
),
)
if not (fsf.md5 and fsf.sha1 and fsf.path and fsf.size):
- self.counts['skip-partial-file-info'] += 1
+ self.counts["skip-partial-file-info"] += 1
return None
- if ingest_file.get('platform_url'):
+ if ingest_file.get("platform_url"):
# XXX: should we include this?
- fsf.extra['original_url'] = ingest_file['platform_url']
- if ingest_file.get('terminal_url') and ingest_file.get('terminal_dt'):
- fsf.extra['wayback_url'] = f"https://web.archive.org/web/{ingest_file['terminal_dt']}/{ingest_file['terminal_url']}"
+ fsf.extra["original_url"] = ingest_file["platform_url"]
+ if ingest_file.get("terminal_url") and ingest_file.get("terminal_dt"):
+ fsf.extra[
+ "wayback_url"
+ ] = f"https://web.archive.org/web/{ingest_file['terminal_dt']}/{ingest_file['terminal_url']}"
manifest.append(fsf)
fe = fatcat_openapi_client.FilesetEntity(
@@ -745,11 +795,11 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
for other in self._entity_queue:
# XXX: how to duplicate check?
if other.original_url == wc.original_url:
- self.counts['skip-in-queue'] += 1
+ self.counts["skip-in-queue"] += 1
return False
# lookup sha1, or create new entity (TODO: API doesn't support this yet)
- #existing = None
+ # existing = None
# NOTE: in lieu of existing checks (by lookup), only allow one fileset per release
release = self.api.get_release(wc.release_ids[0], expand="filesets")
@@ -759,27 +809,32 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
for other in release.filesets:
if wc.original_url == other.original_url:
# TODO: compare very similar timestamps of same time (different formats)
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
- self.counts['skip-release-has-fileset'] += 1
+ self.counts["skip-release-has-fileset"] += 1
return False
return True
def insert_batch(self, batch):
if self.submit_mode:
- eg = self.api.create_editgroup(fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra))
+ eg = self.api.create_editgroup(
+ fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ )
+ )
for fe in batch:
self.api.create_fileset(eg.editgroup_id, fe)
self.api.update_editgroup(eg.editgroup_id, eg, submit=True)
else:
- self.api.create_fileset_auto_batch(fatcat_openapi_client.FilesetAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_fileset_auto_batch(
+ fatcat_openapi_client.FilesetAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
class SavePaperNowFilesetImporter(IngestFilesetResultImporter):
@@ -789,28 +844,28 @@ class SavePaperNowFilesetImporter(IngestFilesetResultImporter):
def __init__(self, api, submit_mode=True, **kwargs):
- eg_desc = kwargs.pop('editgroup_description', None) or "Fileset crawled after a public 'Save Paper Now' request"
- eg_extra = kwargs.pop('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.SavePaperNowFilesetImporter')
- kwargs['submit_mode'] = submit_mode
- kwargs['do_updates'] = False
- super().__init__(api,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ eg_desc = (
+ kwargs.pop("editgroup_description", None)
+ or "Fileset crawled after a public 'Save Paper Now' request"
+ )
+ eg_extra = kwargs.pop("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.SavePaperNowFilesetImporter")
+ kwargs["submit_mode"] = submit_mode
+ kwargs["do_updates"] = False
+ super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)
def want(self, row):
- source = row['request'].get('ingest_request_source')
+ source = row["request"].get("ingest_request_source")
if not source:
- self.counts['skip-ingest_request_source'] += 1
+ self.counts["skip-ingest_request_source"] += 1
return False
- if not source.startswith('savepapernow'):
- self.counts['skip-not-savepapernow'] += 1
+ if not source.startswith("savepapernow"):
+ self.counts["skip-not-savepapernow"] += 1
return False
- if row.get('hit') is not True:
- self.counts['skip-hit'] += 1
+ if row.get("hit") is not True:
+ self.counts["skip-hit"] += 1
return False
if not self.want_fileset(row):