diff options
Diffstat (limited to 'extra')
| -rw-r--r-- | extra/elasticsearch/README.md | 3 | ||||
| -rwxr-xr-x | extra/elasticsearch/transform_release.py | 86 | 
2 files changed, 1 insertions, 88 deletions
| diff --git a/extra/elasticsearch/README.md b/extra/elasticsearch/README.md index 420f119b..761ad6ab 100644 --- a/extra/elasticsearch/README.md +++ b/extra/elasticsearch/README.md @@ -53,8 +53,7 @@ Bulk insert from a file on disk:  Or, in a bulk production live-stream conversion:      export LC_ALL=C.UTF-8 -    time zcat /srv/fatcat/snapshots/fatcat_release_dump_expanded.json.gz | ./transform_release.py | esbulk -verbose -size 20000 -id ident -w 8 -index fatcat -type release -    # 2018/09/24 21:42:26 53028167 docs in 1h0m56.853006293s at 14501.039 docs/s with 8 workers +    time zcat /srv/fatcat/snapshots/fatcat_release_dump_expanded.json.gz | ./fatcat_export.py transform-releases - - | esbulk -verbose -size 20000 -id ident -w 8 -index fatcat -type release  ## Full-Text Querying diff --git a/extra/elasticsearch/transform_release.py b/extra/elasticsearch/transform_release.py deleted file mode 100755 index 86d6e4b7..00000000 --- a/extra/elasticsearch/transform_release.py +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/env python3 - -import sys -import json - -def transform(m): - -    if m['state'] != 'active': -        return None - -    # First, the easy ones (direct copy) -    t = dict( -        ident = m['ident'], -        revision = m['revision'], -        title = m['title'], -        release_date = m.get('release_date'), -        release_type = m.get('release_type'), -        release_status = m.get('release_status'), -        language = m.get('language'), -        doi = m.get('doi'), -        pmid = m.get('pmid'), -        pmcid = m.get('pmcid'), -        isbn13 = m.get('isbn13'), -        core_id = m.get('core_id'), -        wikidata_qid = m.get('wikidata_qid') -    ) - -    container = m.get('container') -    container_is_kept = False -    if container: -        t['publisher'] = container.get('publisher') -        t['container_name'] = container.get('name') -        t['container_issnl'] = container.get('issnl') -        container_extra = container.get('extra') -        if container_extra: -            t['container_is_oa'] = container_extra.get('is_oa') -            container_is_kept = container_extra.get('is_kept', False) -            t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa') -    else: -        t['publisher'] = m.get('publisher') -        t['container_name'] = m.get('container_name') - -    files = m.get('files', []) -    t['file_count'] = len(files) -    in_wa = False -    in_ia = False -    t['file_pdf_url'] = None -    for f in files: -        is_pdf = 'pdf' in f.get('mimetype', '') -        for url in f.get('urls', []): -            if url.get('rel', '') == 'webarchive': -                in_wa = True -            if '//web.archive.org/' in url['url'] or '//archive.org/' in url['url']: -                in_ia = True -                if is_pdf: -                    t['file_pdf_url'] = url['url'] -            if not t['file_pdf_url'] and is_pdf: -                t['file_pdf_url'] = url['url'] -    t['file_in_webarchive'] = in_wa -    t['file_in_ia'] = in_ia - -    extra = m.get('extra', dict()) -    if extra: -        t['in_shadow'] = extra.get('in_shadow') -        if extra.get('grobid') and extra['grobid'].get('is_longtail_oa'): -            t['container_is_longtail_oa'] = True -    t['any_abstract'] = bool(t.get('abstracts')) -    t['is_kept'] = container_is_kept or extra.get('is_kept', False) - -    t['ref_count'] = len(m.get('refs', [])) -    t['contrib_count'] = len(m.get('contribs', [])) -    contrib_names = [] -    for c in m.get('contribs', []): -        if c.get('raw_name'): -            contrib_names.append(c.get('raw_name')) -    t['contrib_names'] = contrib_names -    return t - -def run(): -    for line in sys.stdin: -        obj = transform(json.loads(line)) -        if obj: -            print(json.dumps(obj)) - -if __name__=="__main__": -    run() | 
