diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-04-03 12:43:38 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-04-03 12:43:38 -0700 |
commit | 9fc10fc243cdcb0bf2c01d36254d55baae411153 (patch) | |
tree | 21fb7c09973245e7e736acc71d3303f2a568e340 /elastic_transform.py | |
parent | 2ec0dfcdefb954d9232a0025cafbf1cc426ae7c2 (diff) | |
download | fatcat-covid19-9fc10fc243cdcb0bf2c01d36254d55baae411153.tar.gz fatcat-covid19-9fc10fc243cdcb0bf2c01d36254d55baae411153.zip |
tweak fulltext ES schema
Diffstat (limited to 'elastic_transform.py')
-rwxr-xr-x | elastic_transform.py | 93 |
1 files changed, 54 insertions, 39 deletions
diff --git a/elastic_transform.py b/elastic_transform.py index 93b0310..04fba33 100755 --- a/elastic_transform.py +++ b/elastic_transform.py @@ -28,6 +28,9 @@ def fulltext_to_elasticsearch(row, force_bool=True): release = row['fatcat_release'] + abstracts = [] + abstract_langs = [] + # first, easy fatcat metadata t = { 'fatcat_ident': release['ident'], @@ -66,51 +69,12 @@ def fulltext_to_elasticsearch(row, force_bool=True): for key in EXT_IDS: t[key] = release['ext_ids'].get(key) or None - abstracts = [] - abstract_langs = [] - - # then the fulltext stuff - t['fulltext']['status'] = row.get('fulltext_status', 'none') - if 'fulltext_file' in row: - full = row['fulltext_file'] - t['fulltext']['sha1'] = full['sha1'] - t['fulltext']['pdf_url'] = "/" + full['pdf_path'] - if full.get('pdftotext_path'): - t['fulltext']['pdftotext_url'] = "/" + full['pdftotext_path'] - if full.get('thumbnail_path'): - t['fulltext']['thumbnail_url'] = "/" + full['thumbnail_path'] - if full.get('grobid_xml_path'): - t['fulltext']['grobid_xml_url'] = "/" + full['grobid_xml_path'] - - if 'fulltext_grobid' in row: - grobid = row['fulltext_grobid'] - if grobid.get('abstract'): - abstracts.append(grobid['abstract']) - abstract_langs.append(grobid['language_code']) - t['fulltext']['abstract'] = grobid.get('abstract', None) - t['fulltext']['body'] = grobid.get('body', None) - t['fulltext']['acknowledgement'] = grobid.get('acknowledgement', None) - t['fulltext']['annex'] = grobid.get('annex', None) - t['fulltext']['lang'] = grobid.get('language_code', None) - elif 'fulltext_pdftotext' in row: - pdftotext = row['fulltext_pdftotext'] - t['fulltext']['body'] = pdftotext.get('body', None) - - if 'cord19_paper' in row: - paper = row['cord19_paper'] - t['cord19_uid'] = paper['cord_uid'] - if paper.get('abstract'): - abstracts.append(paper['abstract']) - t['contrib_count'] = len(release['contribs'] or []) if release.get('abstracts'): for a in release['abstracts']: abstracts.append(a['content']) abstract_langs.append(a['lang']) - - t['abstract'] = abstracts - t['abstract_lang'] = list(set(abstract_langs)) contrib_names = [] contrib_affiliations = [] @@ -181,6 +145,57 @@ def fulltext_to_elasticsearch(row, force_bool=True): if t['doi']: t['doi_prefix'] = t['doi'].split('/')[0] + # then the fulltext stuff + t['fulltext']['status'] = row.get('fulltext_status', 'none') + if 'fulltext_file' in row: + full = row['fulltext_file'] + t['fulltext']['sha1'] = full['sha1'] + t['fulltext']['pdf_url'] = "/" + full['pdf_path'] + if full.get('pdftotext_path'): + t['fulltext']['pdftotext_url'] = "/" + full['pdftotext_path'] + if full.get('thumbnail_path'): + t['fulltext']['thumbnail_url'] = "/" + full['thumbnail_path'] + if full.get('grobid_xml_path'): + t['fulltext']['grobid_xml_url'] = "/" + full['grobid_xml_path'] + + if 'fulltext_grobid' in row: + grobid = row['fulltext_grobid'] + if grobid.get('abstract'): + abstracts.append(grobid['abstract']) + abstract_langs.append(grobid['language_code']) + t['fulltext']['abstract'] = grobid.get('abstract', None) + t['fulltext']['body'] = grobid.get('body', None) + t['fulltext']['acknowledgement'] = grobid.get('acknowledgement', None) + t['fulltext']['annex'] = grobid.get('annex', None) + t['fulltext']['lang'] = grobid.get('language_code', None) + elif 'fulltext_pdftotext' in row: + pdftotext = row['fulltext_pdftotext'] + t['fulltext']['body'] = pdftotext.get('body', None) + + # then other metadata stuff + if row.get('source_tags'): + # will get set-uniq at the end + t['source_tags'] = row['source_tags'] + else: + t['source_tags'] = [] + + if 'cord19_paper' in row: + t['source_tags'].append('cord19') + paper = row['cord19_paper'] + t['cord19_uid'] = paper['cord_uid'] + if paper.get('who_covidence_id'): + t['who_covidence_id'] = paper['who_covidence_id'] + t['source_tags'].append('who') + if paper.get('abstract') and not abstracts: + abstracts.append(paper['abstract']) + if not t['license']: + t['license'] = paper.get('license') or None + + t['abstract'] = abstracts + t['abstract_lang'] = list(set(abstract_langs)) + + t['source_tags'] = list(set(t['source_tags'])) + return t def run(args): |