about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-01-15 19:28:18 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-01-15 19:28:24 -0800
commit f4862bd582577749c7d71979e3e56650a4a58200 (patch)
tree ab197d91416cf6d63ade16031b97a6aad64f3ed3
parent 0c4b686cf6c536087683d7982558bec3c5696c7f (diff)
download sandcrawler-f4862bd582577749c7d71979e3e56650a4a58200.tar.gz
sandcrawler-f4862bd582577749c7d71979e3e56650a4a58200.zip
persist: fix dupe field copying
In testing hit: AttributeError: 'str' object has no attribute 'get'
-rw-r--r-- python/sandcrawler/db.py 9
1 files changed, 8 insertions, 1 deletions
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index a2407b5..3ec325e 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -151,12 +151,19 @@ class SandcrawlerPostgresClient:
sql += " RETURNING xmax;"
for r in batch:
if r.get('metadata'):
+ # sometimes these are only in metadata; shouldn't pass through
+ # though (to save database space)
+ dupe_fields = ('fatcat_release', 'grobid_version')
+ for k in dupe_fields:
+ if not k in r:
+ r[k] = r['metadata'].get(k)
+ r['metadata'].pop(k, None)
r['metadata'] = json.dumps(r['metadata'], sort_keys=True)
batch = [(d['key'],
d.get('grobid_version') or None,
d['status_code'],
d['status'],
- d.get('fatcat_release') or d.get('metadata', {}).get('fatcat_release') or None,
+ d.get('fatcat_release') or None,
d.get('updated') or datetime.datetime.now(),
d.get('metadata') or None ,
)