From f4862bd582577749c7d71979e3e56650a4a58200 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 15 Jan 2020 19:28:18 -0800
Subject: persist: fix dupe field copying

In testing hit:

    AttributeError: 'str' object has no attribute 'get'
---
 python/sandcrawler/db.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index a2407b5..3ec325e 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -151,12 +151,19 @@ class SandcrawlerPostgresClient:
         sql += " RETURNING xmax;"
         for r in batch:
             if r.get('metadata'):
+                # sometimes these are only in metadata; shouldn't pass through
+                # though (to save database space)
+                dupe_fields = ('fatcat_release', 'grobid_version')
+                for k in dupe_fields:
+                    if not k in r:
+                        r[k] = r['metadata'].get(k)
+                    r['metadata'].pop(k, None)
                 r['metadata'] = json.dumps(r['metadata'], sort_keys=True)
         batch = [(d['key'],
                   d.get('grobid_version') or None,
                   d['status_code'],
                   d['status'],
-                  d.get('fatcat_release') or d.get('metadata', {}).get('fatcat_release') or None,
+                  d.get('fatcat_release') or None,
                   d.get('updated') or datetime.datetime.now(),
                   d.get('metadata') or None ,
                  )
-- 
cgit v1.2.3