summary | refs | log | tree | commit | diff | stats
path: root/extra
diff options
context:
space:
mode:
Diffstat (limited to 'extra')
-rw-r--r-- extra/elasticsearch/README.md 3
-rwxr-xr-x extra/elasticsearch/transform_release.py 86
2 files changed, 1 insertion, 88 deletions
diff --git a/extra/elasticsearch/README.md b/extra/elasticsearch/README.md
index 420f119b..761ad6ab 100644
--- a/extra/elasticsearch/README.md
+++ b/extra/elasticsearch/README.md
@@ -53,8 +53,7 @@ Bulk insert from a file on disk:
Or, in a bulk production live-stream conversion:
export LC_ALL=C.UTF-8
- time zcat /srv/fatcat/snapshots/fatcat_release_dump_expanded.json.gz | ./transform_release.py | esbulk -verbose -size 20000 -id ident -w 8 -index fatcat -type release
- # 2018/09/24 21:42:26 53028167 docs in 1h0m56.853006293s at 14501.039 docs/s with 8 workers
+ time zcat /srv/fatcat/snapshots/fatcat_release_dump_expanded.json.gz | ./fatcat_export.py transform-releases - - | esbulk -verbose -size 20000 -id ident -w 8 -index fatcat -type release
## Full-Text Querying
diff --git a/extra/elasticsearch/transform_release.py b/extra/elasticsearch/transform_release.py
deleted file mode 100755
index 86d6e4b7..00000000
--- a/extra/elasticsearch/transform_release.py
+++ /dev/null
@@ -1,86 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import json
-
-def transform(m):
-
- if m['state'] != 'active':
- return None
-
- # First, the easy ones (direct copy)
- t = dict(
- ident = m['ident'],
- revision = m['revision'],
- title = m['title'],
- release_date = m.get('release_date'),
- release_type = m.get('release_type'),
- release_status = m.get('release_status'),
- language = m.get('language'),
- doi = m.get('doi'),
- pmid = m.get('pmid'),
- pmcid = m.get('pmcid'),
- isbn13 = m.get('isbn13'),
- core_id = m.get('core_id'),
- wikidata_qid = m.get('wikidata_qid')
- )
-
- container = m.get('container')
- container_is_kept = False
- if container:
- t['publisher'] = container.get('publisher')
- t['container_name'] = container.get('name')
- t['container_issnl'] = container.get('issnl')
- container_extra = container.get('extra')
- if container_extra:
- t['container_is_oa'] = container_extra.get('is_oa')
- container_is_kept = container_extra.get('is_kept', False)
- t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa')
- else:
- t['publisher'] = m.get('publisher')
- t['container_name'] = m.get('container_name')
-
- files = m.get('files', [])
- t['file_count'] = len(files)
- in_wa = False
- in_ia = False
- t['file_pdf_url'] = None
- for f in files:
- is_pdf = 'pdf' in f.get('mimetype', '')
- for url in f.get('urls', []):
- if url.get('rel', '') == 'webarchive':
- in_wa = True
- if '//web.archive.org/' in url['url'] or '//archive.org/' in url['url']:
- in_ia = True
- if is_pdf:
- t['file_pdf_url'] = url['url']
- if not t['file_pdf_url'] and is_pdf:
- t['file_pdf_url'] = url['url']
- t['file_in_webarchive'] = in_wa
- t['file_in_ia'] = in_ia
-
- extra = m.get('extra', dict())
- if extra:
- t['in_shadow'] = extra.get('in_shadow')
- if extra.get('grobid') and extra['grobid'].get('is_longtail_oa'):
- t['container_is_longtail_oa'] = True
- t['any_abstract'] = bool(t.get('abstracts'))
- t['is_kept'] = container_is_kept or extra.get('is_kept', False)
-
- t['ref_count'] = len(m.get('refs', []))
- t['contrib_count'] = len(m.get('contribs', []))
- contrib_names = []
- for c in m.get('contribs', []):
- if c.get('raw_name'):
- contrib_names.append(c.get('raw_name'))
- t['contrib_names'] = contrib_names
- return t
-
-def run():
- for line in sys.stdin:
- obj = transform(json.loads(line))
- if obj:
- print(json.dumps(obj))
-
-if __name__=="__main__":
- run()