From 642501efc92e31ac438fe0f70820415c825a3802 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 23 Jan 2019 21:07:01 -0800
Subject: start changes to release ES schema

---
 python/fatcat_tools/transforms.py | 120 ++++++++++++++++++++++++++++----------
 1 file changed, 90 insertions(+), 30 deletions(-)

(limited to 'python/fatcat_tools')

diff --git a/python/fatcat_tools/transforms.py b/python/fatcat_tools/transforms.py
index b1fd7e68..2493b1ab 100644
--- a/python/fatcat_tools/transforms.py
+++ b/python/fatcat_tools/transforms.py
@@ -1,4 +1,5 @@
+
 import collections
 
 from fatcat_client import ReleaseEntity, ApiClient
 
@@ -40,17 +41,29 @@ def release_to_elasticsearch(release):
         state = release.state,
         revision = release.revision,
         title = release.title,
+        original_title = release.original_title,
         release_type = release.release_type,
         release_status = release.release_status,
         language = release.language,
+        license = release.license_slug,
         doi = release.doi,
         pmid = release.pmid,
         pmcid = release.pmcid,
         isbn13 = release.isbn13,
+        wikidata_qid = release.wikidata_qid,
         core_id = release.core_id,
-        wikidata_qid = release.wikidata_qid
+        arxiv_id = release.arxiv_id,
+        jstor_id = release.jstor_id,
     )
 
+    is_oa = None
+    is_longtail_oa = None
+    in_kbart = None
+    in_web = False
+    in_dweb = False
+    in_ia = False
+    in_shadow = False
+
     if release.release_date:
         # .isoformat() results in, eg, '2010-10-22' (YYYY-MM-DD)
         t['release_date'] = release.release_date.isoformat()
@@ -59,52 +72,99 @@ def release_to_elasticsearch(release):
     if release.release_year is not None:
         t['release_year'] = release.release_year
 
+    t['any_abstract'] = len(release.abstracts) > 0
+    t['ref_count'] = len(release.refs or [])
+    t['contrib_count'] = len(release.contribs or [])
+    contrib_names = []
+    for c in (release.contribs or []):
+        if c.raw_name:
+            contrib_names.append(c.raw_name)
+    t['contrib_names'] = contrib_names
+
     container = release.container
-    container_is_kept = False
     if container:
         t['publisher'] = container.publisher
         t['container_name'] = container.name
         t['container_issnl'] = container.issnl
-        container_extra = container.extra
-        if container_extra:
-            t['container_is_oa'] = container_extra.get('is_oa')
-            container_is_kept = container_extra.get('is_kept', False)
-            t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa')
+        t['container_type'] = container.container_type
+        if container.extra:
+            if container.extra.get('is_oa') or container.extra.get('in_doaj'):
+                is_oa = True
+            if container.extra.get('in_kbart'):
+                # TODO: better KBART check goes here
+                in_kbart = True
+            if container.extra.get('ia'):
+                # TODO: container longtail check goes here
+                # TODO: sim/microfilm check goes here
+                pass
+            # TODO: SHERPA/Romeo goes here
     else:
         t['publisher'] = release.publisher
 
     files = release.files or []
     t['file_count'] = len(files)
-    in_wa = False
-    in_ia = False
-    t['file_pdf_url'] = None
+    t['fileset_count'] = len(release.filesets or [])
+    t['webcapture_count'] = len(release.webcaptures or [])
+    any_pdf_url = None
+    good_pdf_url = None
+    best_pdf_url = None
+    ia_pdf_url = None
     for f in files:
+        if f.extra and f.extra.get('shadows'):
+            # TODO: shadow check goes here
+            in_shadow = True
         is_pdf = 'pdf' in (f.mimetype or '')
         for url in (f.urls or []):
-            if url.rel == 'webarchive':
-                in_wa = True
-            if '//web.archive.org/' in (url.url or '') or '//archive.org/' in (url.url or ''):
+            if url.url.lower().startswith('http'):
+                in_web = True
+            if url.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'):
+                # TODO: not sure what rel will be
+                in_dweb = True
+            if is_pdf:
+                any_pdf_url = url.url
+            if is_pdf and url.rel in ('webarchive', 'repository'):
+                is_preserved = True
+                good_pdf_url = url.url
+            if '//web.archive.org/' in url.url or '//archive.org/' in url.url:
                 in_ia = True
                 if is_pdf:
-                    t['file_pdf_url'] = url.url
-        if not t['file_pdf_url'] and is_pdf:
-            t['file_pdf_url'] = url.url
-    t['file_in_webarchive'] = in_wa
-    t['file_in_ia'] = in_ia
+                    best_pdf_url = url.url
+                    ia_pdf_url = url.url
+    # here is where we bake-in priority; IA-specific
+    t['best_pdf_url'] = best_pdf_url or good_pdf_url or any_pdf_url
+    t['ia_pdf_url'] = ia_pdf_url
+
+    if release.license_slug:
+        # TODO: more/better checks here, particularly strict *not* OA licenses
+        if release.license_slug.startswith("CC-"):
+            is_oa = True
 
     extra = release.extra or dict()
     if extra:
-        t['in_shadow'] = extra.get('in_shadow')
-        if extra.get('grobid') and extra['grobid'].get('is_longtail_oa'):
-            t['container_is_longtail_oa'] = True
-    t['any_abstract'] = len(release.abstracts) > 0
-    t['is_kept'] = container_is_kept or extra.get('is_kept', False)
+        # TODO: longtail OA check from GROBID here
+        if extra.get('in_kbart'):
+            # NOTE: not actually setting this anywhere
+            in_kbart = True
+        if extra.get('is_oa'):
+            # NOTE: not actually setting this anywhere
+            is_oa = True
+        if extra.get('grobid'):
+            if not t.get('container_name'):
+                t['container_name'] = extra['grobid'].get('container_name')
+            if extra['grobid'].get('longtail_oa'):
+                is_longtail_oa = True
+        if extra.get('crossref'):
+            if extra['crossref'].get('archive'):
+                # all crossref archives are KBART, I believe
+                in_kbart = True
 
-    t['ref_count'] = len(release.refs or [])
-    t['contrib_count'] = len(release.contribs or [])
-    contrib_names = []
-    for c in (release.contribs or []):
-        if c.raw_name:
-            contrib_names.append(c.raw_name)
-    t['contrib_names'] = contrib_names
+    if is_longtail_oa:
+        is_oa = True
+    t['is_oa'] = is_oa
+    t['is_longtail_oa'] = is_longtail_oa
+    t['in_kbart'] = in_kbart
+    t['in_web'] = in_web
+    t['in_dweb'] = in_dweb
+    t['in_ia'] = in_ia
+    t['is_preserved'] = in_ia or in_kbart
     return t
-- 
cgit v1.2.3
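
For reference, a minimal usage sketch of the transform this patch modifies. The input file name is hypothetical, and the `entity_from_json` helper's signature is an assumption based on this module's existing deserialization code; treat this as an illustration, not part of the commit:

    # Illustrative sketch (not part of this commit): deserialize a release
    # entity from JSON, then transform it into the flat Elasticsearch
    # document shape that release_to_elasticsearch() builds above.
    import json

    from fatcat_client import ReleaseEntity
    from fatcat_tools.transforms import entity_from_json, release_to_elasticsearch

    # 'release_entity.json' is a hypothetical file holding one release entity
    with open('release_entity.json', 'r') as f:
        release = entity_from_json(f.read(), ReleaseEntity)

    es_doc = release_to_elasticsearch(release)
    # es_doc is a plain dict with flags like is_oa, in_ia, best_pdf_url,
    # ready to serialize and index
    print(json.dumps(es_doc))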