summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2020-01-15 14:04:49 -0800
committerBryan Newbold <bnewbold@robocracy.org>2020-01-15 14:04:51 -0800
commita02d51650bb5a3165ec89e822f43ff98807d01c3 (patch)
tree0764b363cf552a77127ee1b9285ecd0648cf8417
parent234dfa11d4e552d4ef784d8c7b13bfdaf42c597c (diff)
downloadfatcat-a02d51650bb5a3165ec89e822f43ff98807d01c3.tar.gz
fatcat-a02d51650bb5a3165ec89e822f43ff98807d01c3.zip
update ingest worker for schema tweaks
Should be backwards compatible with old ingest results. Fixed a bug with glutton ident detection.
-rw-r--r--python/fatcat_tools/importers/ingest.py23
1 file changed, 15 insertions, 8 deletions
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index e53dcae5..3d391bd8 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -102,27 +102,34 @@ class IngestFileResultImporter(EntityImporter):
break
if not release_ident and row.get('grobid'):
# try biblio-glutton extracted hit
- if row['grobid'].get('fatcat_ident'):
- release = row['grobid']['fatcat_ident'].split('_')[-1]
+ if row['grobid'].get('fatcat_release'):
+ release_ident = row['grobid']['fatcat_release'].split('_')[-1]
+ self.counts['glutton-match'] += 1
if not release_ident:
self.counts['skip-release-not-found'] += 1
return None
- cdx = row.get('cdx')
- if not cdx:
+ terminal = row.get('terminal')
+ if not terminal:
# TODO: support archive.org hits?
- self.counts['skip-no-cdx'] += 1
+ self.counts['skip-no-terminal'] += 1
return None
- url = make_rel_url(cdx['url'], self.default_link_rel)
+ # work around old schema
+ if not 'terminal_url' in terminal:
+ terminal['terminal_url'] = terminal['url']
+ if not 'terminal_dt' in terminal:
+ terminal['terminal_dt'] = terminal['dt']
+ assert len(terminal['terminal_dt']) == 14
+ url = make_rel_url(terminal['terminal_url'], self.default_link_rel)
if not url:
self.counts['skip-url'] += 1
return None
wayback = "https://web.archive.org/web/{}/{}".format(
- cdx['datetime'],
- cdx['url'])
+ terminal['terminal_dt'],
+ terminal['terminal_url'])
urls = [url, ("webarchive", wayback)]
urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls]