tweak fulltext ES schema

author: Bryan Newbold <bnewbold@archive.org> 2020-04-03 12:43:38 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-04-03 12:43:38 -0700
commit: 9fc10fc243cdcb0bf2c01d36254d55baae411153 (patch)
tree: 21fb7c09973245e7e736acc71d3303f2a568e340
parent: 2ec0dfcdefb954d9232a0025cafbf1cc426ae7c2 (diff)
download: fatcat-covid19-9fc10fc243cdcb0bf2c01d36254d55baae411153.tar.gz
fatcat-covid19-9fc10fc243cdcb0bf2c01d36254d55baae411153.zip
2 files changed, 62 insertions, 41 deletions
diff --git a/elastic_transform.py b/elastic_transform.py
index 93b0310..04fba33 100755
--- a/elastic_transform.py
+++ b/elastic_transform.py
@@ -28,6 +28,9 @@ def fulltext_to_elasticsearch(row, force_bool=True):
 
     release = row['fatcat_release']
 
+    abstracts = []
+    abstract_langs = []
+
     # first, easy fatcat metadata
     t = {
         'fatcat_ident': release['ident'],
@@ -66,51 +69,12 @@ def fulltext_to_elasticsearch(row, force_bool=True):
     for key in EXT_IDS:
         t[key] = release['ext_ids'].get(key) or None
 
-    abstracts = []
-    abstract_langs = []
-
-    # then the fulltext stuff
-    t['fulltext']['status'] = row.get('fulltext_status', 'none')
-    if 'fulltext_file' in row:
-        full = row['fulltext_file']
-        t['fulltext']['sha1'] = full['sha1']
-        t['fulltext']['pdf_url'] = "/" + full['pdf_path']
-        if full.get('pdftotext_path'):
-            t['fulltext']['pdftotext_url'] = "/" + full['pdftotext_path']
-        if full.get('thumbnail_path'):
-            t['fulltext']['thumbnail_url'] = "/" + full['thumbnail_path']
-        if full.get('grobid_xml_path'):
-            t['fulltext']['grobid_xml_url'] = "/" + full['grobid_xml_path']
-
-    if 'fulltext_grobid' in row:
-        grobid = row['fulltext_grobid']
-        if grobid.get('abstract'):
-            abstracts.append(grobid['abstract'])
-            abstract_langs.append(grobid['language_code'])
-        t['fulltext']['abstract'] = grobid.get('abstract', None)
-        t['fulltext']['body'] = grobid.get('body', None)
-        t['fulltext']['acknowledgement'] = grobid.get('acknowledgement', None)
-        t['fulltext']['annex'] = grobid.get('annex', None)
-        t['fulltext']['lang'] = grobid.get('language_code', None)
-    elif 'fulltext_pdftotext' in row:
-        pdftotext = row['fulltext_pdftotext']
-        t['fulltext']['body'] = pdftotext.get('body', None)
-
-    if 'cord19_paper' in row:
-        paper = row['cord19_paper']
-        t['cord19_uid'] = paper['cord_uid']
-        if paper.get('abstract'):
-            abstracts.append(paper['abstract'])
-
     t['contrib_count'] = len(release['contribs'] or [])
 
     if release.get('abstracts'):
         for a in release['abstracts']:
             abstracts.append(a['content'])
             abstract_langs.append(a['lang'])
-    
-    t['abstract'] = abstracts
-    t['abstract_lang'] = list(set(abstract_langs))
 
     contrib_names = []
     contrib_affiliations = []
@@ -181,6 +145,57 @@ def fulltext_to_elasticsearch(row, force_bool=True):
     if t['doi']:
         t['doi_prefix'] = t['doi'].split('/')[0]
 
+    # then the fulltext stuff
+    t['fulltext']['status'] = row.get('fulltext_status', 'none')
+    if 'fulltext_file' in row:
+        full = row['fulltext_file']
+        t['fulltext']['sha1'] = full['sha1']
+        t['fulltext']['pdf_url'] = "/" + full['pdf_path']
+        if full.get('pdftotext_path'):
+            t['fulltext']['pdftotext_url'] = "/" + full['pdftotext_path']
+        if full.get('thumbnail_path'):
+            t['fulltext']['thumbnail_url'] = "/" + full['thumbnail_path']
+        if full.get('grobid_xml_path'):
+            t['fulltext']['grobid_xml_url'] = "/" + full['grobid_xml_path']
+
+    if 'fulltext_grobid' in row:
+        grobid = row['fulltext_grobid']
+        if grobid.get('abstract'):
+            abstracts.append(grobid['abstract'])
+            abstract_langs.append(grobid['language_code'])
+        t['fulltext']['abstract'] = grobid.get('abstract', None)
+        t['fulltext']['body'] = grobid.get('body', None)
+        t['fulltext']['acknowledgement'] = grobid.get('acknowledgement', None)
+        t['fulltext']['annex'] = grobid.get('annex', None)
+        t['fulltext']['lang'] = grobid.get('language_code', None)
+    elif 'fulltext_pdftotext' in row:
+        pdftotext = row['fulltext_pdftotext']
+        t['fulltext']['body'] = pdftotext.get('body', None)
+
+    # then other metadata stuff
+    if row.get('source_tags'):
+        # will get set-uniq at the end
+        t['source_tags'] = row['source_tags']
+    else:
+        t['source_tags'] = []
+
+    if 'cord19_paper' in row:
+        t['source_tags'].append('cord19')
+        paper = row['cord19_paper']
+        t['cord19_uid'] = paper['cord_uid']
+        if paper.get('who_covidence_id'):
+            t['who_covidence_id'] = paper['who_covidence_id']
+            t['source_tags'].append('who')
+        if paper.get('abstract') and not abstracts:
+            abstracts.append(paper['abstract'])
+        if not t['license']:
+            t['license'] = paper.get('license') or None
+    
+    t['abstract'] = abstracts
+    t['abstract_lang'] = list(set(abstract_langs))
+
+    t['source_tags'] = list(set(t['source_tags']))
+
     return t
 
 def run(args):
diff --git a/schema/fulltext_schema.v00.json b/schema/fulltext_schema.v00.json
index 26bafe7..11bb49b 100644
--- a/schema/fulltext_schema.v00.json
+++ b/schema/fulltext_schema.v00.json
@@ -45,13 +45,14 @@
         "fulltext.body",
         "fulltext.acknowledgment",
         "fulltext.annex",
+        "biblio_all",
         "everything"
       ]
     },
     "properties": {
         "fatcat_ident":     { "type": "keyword", "normalizer": "default", "doc_values": false },
         "fatcat_revision":  { "type": "keyword", "normalizer": "default", "doc_values": false },
-        "cord19_uid":       { "type": "keyword", "normalizer": "default", "doc_values": false },
+        "source_tags":      { "type": "keyword", "normalizer": "default", "doc_values": false },
 
         "work_id":        { "type": "keyword", "normalizer": "default" },
         "title":          { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] },
@@ -80,6 +81,8 @@
         "jstor_id":       { "type": "keyword", "normalizer": "default", "doc_values": false },
         "mag_id":         { "type": "keyword", "normalizer": "default", "doc_values": false },
         "s2_id":          { "type": "keyword", "normalizer": "default", "doc_values": false },
+        "cord19_uid":     { "type": "keyword", "normalizer": "default", "doc_values": false },
+        "who_covidence_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
         "license":        { "type": "keyword", "normalizer": "default" },
         "publisher":            { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
         "publisher_type":       { "type": "keyword", "normalizer": "default" },
@@ -106,6 +109,7 @@
             "acknowledgement":  { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "everything", "store": true },
             "annex":            { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "everything", "store": true },
 
+            "ia_pdf_url":       { "type": "keyword", "normalizer": "default", "doc_values": false },
             "pdf_url":          { "type": "keyword", "normalizer": "default", "doc_values": false },
             "pdf_sha1":         { "type": "keyword", "normalizer": "default", "doc_values": false },
             "thumbnail_url":    { "type": "keyword", "normalizer": "default", "doc_values": false },
@@ -115,7 +119,7 @@
         },
 
         "everything":           { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
-        "biblio_all":               { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+        "biblio_all":           { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
 
         "affiliation":    { "type": "alias", "path": "affiliations" },
         "author":         { "type": "alias", "path": "contrib_names" },
@@ -126,7 +130,9 @@
         "lang":           { "type": "alias", "path": "language" },
         "stage":          { "type": "alias", "path": "release_stage" },
         "type":           { "type": "alias", "path": "release_type" },
+        "country":        { "type": "alias", "path": "country_code" },
 
+        "source":         { "type": "alias", "path": "source_tags" },
         "body":           { "type": "alias", "path": "fulltext.body" }
 
     }
author	Bryan Newbold <bnewbold@archive.org>	2020-04-03 12:43:38 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2020-04-03 12:43:38 -0700
commit	9fc10fc243cdcb0bf2c01d36254d55baae411153 (patch)
tree	21fb7c09973245e7e736acc71d3303f2a568e340
parent	2ec0dfcdefb954d9232a0025cafbf1cc426ae7c2 (diff)
download	fatcat-covid19-9fc10fc243cdcb0bf2c01d36254d55baae411153.tar.gz fatcat-covid19-9fc10fc243cdcb0bf2c01d36254d55baae411153.zip