diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2020-01-15 19:28:18 -0800 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-15 19:28:24 -0800 | 
| commit | f4862bd582577749c7d71979e3e56650a4a58200 (patch) | |
| tree | ab197d91416cf6d63ade16031b97a6aad64f3ed3 /python | |
| parent | 0c4b686cf6c536087683d7982558bec3c5696c7f (diff) | |
| download | sandcrawler-f4862bd582577749c7d71979e3e56650a4a58200.tar.gz sandcrawler-f4862bd582577749c7d71979e3e56650a4a58200.zip | |
persist: fix dupe field copying
In testing hit:
    AttributeError: 'str' object has no attribute 'get'
Diffstat (limited to 'python')
| -rw-r--r-- | python/sandcrawler/db.py | 9 | 
1 files changed, 8 insertions, 1 deletions
```diff
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index a2407b5..3ec325e 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -151,12 +151,19 @@ class SandcrawlerPostgresClient:
         sql += " RETURNING xmax;"
         for r in batch:
             if r.get('metadata'):
+                # sometimes these are only in metadata; shouldn't pass through
+                # though (to save database space)
+                dupe_fields = ('fatcat_release', 'grobid_version')
+                for k in dupe_fields:
+                    if not k in r:
+                        r[k] = r['metadata'].get(k)
+                    r['metadata'].pop(k, None)
                 r['metadata'] = json.dumps(r['metadata'], sort_keys=True)
         batch = [(d['key'],
                   d.get('grobid_version') or None,
                   d['status_code'],
                   d['status'],
-                  d.get('fatcat_release') or d.get('metadata', {}).get('fatcat_release') or None,
+                  d.get('fatcat_release') or None,
                   d.get('updated') or datetime.datetime.now(),
                   d.get('metadata') or None ,
                  )
```
