remove redundant transform_release.py ES script

author: Bryan Newbold <bnewbold@robocracy.org> 2018-12-24 16:17:15 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2018-12-24 16:17:15 -0800
commit: 8b2590b32e7b6bd2bfa518ec59c492dd48f8047f (patch)
tree: aac31c50715b25339903cee3a39aa080a2b98320 /extra/elasticsearch
parent: 6d00f774eaee1fab1a3af7e61a4f20273490f52d (diff)
download: fatcat-8b2590b32e7b6bd2bfa518ec59c492dd48f8047f.tar.gz fatcat-8b2590b32e7b6bd2bfa518ec59c492dd48f8047f.zip
2 files changed, 1 insertion, 88 deletions
diff --git a/extra/elasticsearch/README.md b/extra/elasticsearch/README.md
index 420f119b..761ad6ab 100644
--- a/extra/elasticsearch/README.md
+++ b/extra/elasticsearch/README.md
@@ -53,8 +53,7 @@ Bulk insert from a file on disk:
 Or, in a bulk production live-stream conversion:
 
     export LC_ALL=C.UTF-8
-    time zcat /srv/fatcat/snapshots/fatcat_release_dump_expanded.json.gz | ./transform_release.py | esbulk -verbose -size 20000 -id ident -w 8 -index fatcat -type release
-    # 2018/09/24 21:42:26 53028167 docs in 1h0m56.853006293s at 14501.039 docs/s with 8 workers
+    time zcat /srv/fatcat/snapshots/fatcat_release_dump_expanded.json.gz | ./fatcat_export.py transform-releases - - | esbulk -verbose -size 20000 -id ident -w 8 -index fatcat -type release
 
 ## Full-Text Querying
 
diff --git a/extra/elasticsearch/transform_release.py b/extra/elasticsearch/transform_release.py
deleted file mode 100755
index 86d6e4b7..00000000
--- a/extra/elasticsearch/transform_release.py
+++ /dev/null
@@ -1,86 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import json
-
-def transform(m):
-
-    if m['state'] != 'active':
-        return None
-
-    # First, the easy ones (direct copy)
-    t = dict(
-        ident = m['ident'],
-        revision = m['revision'],
-        title = m['title'],
-        release_date = m.get('release_date'),
-        release_type = m.get('release_type'),
-        release_status = m.get('release_status'),
-        language = m.get('language'),
-        doi = m.get('doi'),
-        pmid = m.get('pmid'),
-        pmcid = m.get('pmcid'),
-        isbn13 = m.get('isbn13'),
-        core_id = m.get('core_id'),
-        wikidata_qid = m.get('wikidata_qid')
-    )
-
-    container = m.get('container')
-    container_is_kept = False
-    if container:
-        t['publisher'] = container.get('publisher')
-        t['container_name'] = container.get('name')
-        t['container_issnl'] = container.get('issnl')
-        container_extra = container.get('extra')
-        if container_extra:
-            t['container_is_oa'] = container_extra.get('is_oa')
-            container_is_kept = container_extra.get('is_kept', False)
-            t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa')
-    else:
-        t['publisher'] = m.get('publisher')
-        t['container_name'] = m.get('container_name')
-
-    files = m.get('files', [])
-    t['file_count'] = len(files)
-    in_wa = False
-    in_ia = False
-    t['file_pdf_url'] = None
-    for f in files:
-        is_pdf = 'pdf' in f.get('mimetype', '')
-        for url in f.get('urls', []):
-            if url.get('rel', '') == 'webarchive':
-                in_wa = True
-            if '//web.archive.org/' in url['url'] or '//archive.org/' in url['url']:
-                in_ia = True
-                if is_pdf:
-                    t['file_pdf_url'] = url['url']
-            if not t['file_pdf_url'] and is_pdf:
-                t['file_pdf_url'] = url['url']
-    t['file_in_webarchive'] = in_wa
-    t['file_in_ia'] = in_ia
-
-    extra = m.get('extra', dict())
-    if extra:
-        t['in_shadow'] = extra.get('in_shadow')
-        if extra.get('grobid') and extra['grobid'].get('is_longtail_oa'):
-            t['container_is_longtail_oa'] = True
-    t['any_abstract'] = bool(t.get('abstracts'))
-    t['is_kept'] = container_is_kept or extra.get('is_kept', False)
-
-    t['ref_count'] = len(m.get('refs', []))
-    t['contrib_count'] = len(m.get('contribs', []))
-    contrib_names = []
-    for c in m.get('contribs', []):
-        if c.get('raw_name'):
-            contrib_names.append(c.get('raw_name'))
-    t['contrib_names'] = contrib_names
-    return t
-
-def run():
-    for line in sys.stdin:
-        obj = transform(json.loads(line))
-        if obj:
-            print(json.dumps(obj))
-
-if __name__=="__main__":
-    run()
author	Bryan Newbold <bnewbold@robocracy.org>	2018-12-24 16:17:15 -0800
committer	Bryan Newbold <bnewbold@robocracy.org>	2018-12-24 16:17:15 -0800
commit	8b2590b32e7b6bd2bfa518ec59c492dd48f8047f (patch)
tree	aac31c50715b25339903cee3a39aa080a2b98320 /extra/elasticsearch
parent	6d00f774eaee1fab1a3af7e61a4f20273490f52d (diff)
download	fatcat-8b2590b32e7b6bd2bfa518ec59c492dd48f8047f.tar.gz fatcat-8b2590b32e7b6bd2bfa518ec59c492dd48f8047f.zip