about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-04-03 12:43:38 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-04-03 12:43:38 -0700
commit 9fc10fc243cdcb0bf2c01d36254d55baae411153 (patch)
tree 21fb7c09973245e7e736acc71d3303f2a568e340
parent 2ec0dfcdefb954d9232a0025cafbf1cc426ae7c2 (diff)
download fatcat-covid19-9fc10fc243cdcb0bf2c01d36254d55baae411153.tar.gz
fatcat-covid19-9fc10fc243cdcb0bf2c01d36254d55baae411153.zip
tweak fulltext ES schema
-rwxr-xr-x elastic_transform.py 93
-rw-r--r-- schema/fulltext_schema.v00.json 10
2 files changed, 62 insertions, 41 deletions
diff --git a/elastic_transform.py b/elastic_transform.py
index 93b0310..04fba33 100755
--- a/elastic_transform.py
+++ b/elastic_transform.py
@@ -28,6 +28,9 @@ def fulltext_to_elasticsearch(row, force_bool=True):
release = row['fatcat_release']
+ abstracts = []
+ abstract_langs = []
+
# first, easy fatcat metadata
t = {
'fatcat_ident': release['ident'],
@@ -66,51 +69,12 @@ def fulltext_to_elasticsearch(row, force_bool=True):
for key in EXT_IDS:
t[key] = release['ext_ids'].get(key) or None
- abstracts = []
- abstract_langs = []
-
- # then the fulltext stuff
- t['fulltext']['status'] = row.get('fulltext_status', 'none')
- if 'fulltext_file' in row:
- full = row['fulltext_file']
- t['fulltext']['sha1'] = full['sha1']
- t['fulltext']['pdf_url'] = "/" + full['pdf_path']
- if full.get('pdftotext_path'):
- t['fulltext']['pdftotext_url'] = "/" + full['pdftotext_path']
- if full.get('thumbnail_path'):
- t['fulltext']['thumbnail_url'] = "/" + full['thumbnail_path']
- if full.get('grobid_xml_path'):
- t['fulltext']['grobid_xml_url'] = "/" + full['grobid_xml_path']
-
- if 'fulltext_grobid' in row:
- grobid = row['fulltext_grobid']
- if grobid.get('abstract'):
- abstracts.append(grobid['abstract'])
- abstract_langs.append(grobid['language_code'])
- t['fulltext']['abstract'] = grobid.get('abstract', None)
- t['fulltext']['body'] = grobid.get('body', None)
- t['fulltext']['acknowledgement'] = grobid.get('acknowledgement', None)
- t['fulltext']['annex'] = grobid.get('annex', None)
- t['fulltext']['lang'] = grobid.get('language_code', None)
- elif 'fulltext_pdftotext' in row:
- pdftotext = row['fulltext_pdftotext']
- t['fulltext']['body'] = pdftotext.get('body', None)
-
- if 'cord19_paper' in row:
- paper = row['cord19_paper']
- t['cord19_uid'] = paper['cord_uid']
- if paper.get('abstract'):
- abstracts.append(paper['abstract'])
-
t['contrib_count'] = len(release['contribs'] or [])
if release.get('abstracts'):
for a in release['abstracts']:
abstracts.append(a['content'])
abstract_langs.append(a['lang'])
-
- t['abstract'] = abstracts
- t['abstract_lang'] = list(set(abstract_langs))
contrib_names = []
contrib_affiliations = []
@@ -181,6 +145,57 @@ def fulltext_to_elasticsearch(row, force_bool=True):
if t['doi']:
t['doi_prefix'] = t['doi'].split('/')[0]
+ # then the fulltext stuff
+ t['fulltext']['status'] = row.get('fulltext_status', 'none')
+ if 'fulltext_file' in row:
+ full = row['fulltext_file']
+ t['fulltext']['sha1'] = full['sha1']
+ t['fulltext']['pdf_url'] = "/" + full['pdf_path']
+ if full.get('pdftotext_path'):
+ t['fulltext']['pdftotext_url'] = "/" + full['pdftotext_path']
+ if full.get('thumbnail_path'):
+ t['fulltext']['thumbnail_url'] = "/" + full['thumbnail_path']
+ if full.get('grobid_xml_path'):
+ t['fulltext']['grobid_xml_url'] = "/" + full['grobid_xml_path']
+
+ if 'fulltext_grobid' in row:
+ grobid = row['fulltext_grobid']
+ if grobid.get('abstract'):
+ abstracts.append(grobid['abstract'])
+ abstract_langs.append(grobid['language_code'])
+ t['fulltext']['abstract'] = grobid.get('abstract', None)
+ t['fulltext']['body'] = grobid.get('body', None)
+ t['fulltext']['acknowledgement'] = grobid.get('acknowledgement', None)
+ t['fulltext']['annex'] = grobid.get('annex', None)
+ t['fulltext']['lang'] = grobid.get('language_code', None)
+ elif 'fulltext_pdftotext' in row:
+ pdftotext = row['fulltext_pdftotext']
+ t['fulltext']['body'] = pdftotext.get('body', None)
+
+ # then other metadata stuff
+ if row.get('source_tags'):
+ # will get set-uniq at the end
+ t['source_tags'] = row['source_tags']
+ else:
+ t['source_tags'] = []
+
+ if 'cord19_paper' in row:
+ t['source_tags'].append('cord19')
+ paper = row['cord19_paper']
+ t['cord19_uid'] = paper['cord_uid']
+ if paper.get('who_covidence_id'):
+ t['who_covidence_id'] = paper['who_covidence_id']
+ t['source_tags'].append('who')
+ if paper.get('abstract') and not abstracts:
+ abstracts.append(paper['abstract'])
+ if not t['license']:
+ t['license'] = paper.get('license') or None
+
+ t['abstract'] = abstracts
+ t['abstract_lang'] = list(set(abstract_langs))
+
+ t['source_tags'] = list(set(t['source_tags']))
+
return t
def run(args):
diff --git a/schema/fulltext_schema.v00.json b/schema/fulltext_schema.v00.json
index 26bafe7..11bb49b 100644
--- a/schema/fulltext_schema.v00.json
+++ b/schema/fulltext_schema.v00.json
@@ -45,13 +45,14 @@
"fulltext.body",
"fulltext.acknowledgment",
"fulltext.annex",
+ "biblio_all",
"everything"
]
},
"properties": {
"fatcat_ident": { "type": "keyword", "normalizer": "default", "doc_values": false },
"fatcat_revision": { "type": "keyword", "normalizer": "default", "doc_values": false },
- "cord19_uid": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "source_tags": { "type": "keyword", "normalizer": "default", "doc_values": false },
"work_id": { "type": "keyword", "normalizer": "default" },
"title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] },
@@ -80,6 +81,8 @@
"jstor_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
"mag_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
"s2_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "cord19_uid": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "who_covidence_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
"license": { "type": "keyword", "normalizer": "default" },
"publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
"publisher_type": { "type": "keyword", "normalizer": "default" },
@@ -106,6 +109,7 @@
"acknowledgement": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "everything", "store": true },
"annex": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "everything", "store": true },
+ "ia_pdf_url": { "type": "keyword", "normalizer": "default", "doc_values": false },
"pdf_url": { "type": "keyword", "normalizer": "default", "doc_values": false },
"pdf_sha1": { "type": "keyword", "normalizer": "default", "doc_values": false },
"thumbnail_url": { "type": "keyword", "normalizer": "default", "doc_values": false },
@@ -115,7 +119,7 @@
},
"everything": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
- "biblio_all": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+ "biblio_all": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
"affiliation": { "type": "alias", "path": "affiliations" },
"author": { "type": "alias", "path": "contrib_names" },
@@ -126,7 +130,9 @@
"lang": { "type": "alias", "path": "language" },
"stage": { "type": "alias", "path": "release_stage" },
"type": { "type": "alias", "path": "release_type" },
+ "country": { "type": "alias", "path": "country_code" },
+ "source": { "type": "alias", "path": "source_tags" },
"body": { "type": "alias", "path": "fulltext.body" }
}