aboutsummaryrefslogtreecommitdiffstats
path: root/fatcat_covid19/enrich.py
blob: 245a357c09eb6e38cc52bcb7acb9a248c9209a36 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67

import sys
import json
import datetime

from fatcat_covid19.common import requests_retry_session


def enrich_fatcat_row(row, api_session):
    """
    Looks up the fatcat release for a single row and attaches it to the row.

    The identifiers found under the row's 'cord19_paper' key are tried against
    the fatcat release lookup endpoint in priority order: PMCID, then DOI,
    then PubMed ID. The first lookup answered with HTTP 200 (and a non-empty
    JSON body) wins; any other status code just moves on to the next
    identifier.

    On a match, the response JSON is stored under 'fatcat_release' and its
    'ident' field under 'release_id'.

    Arguments:
        row: dict for one paper; it is mutated in place.
        api_session: object exposing a requests-style get(url, params=...)
            whose response has .status_code and .json().

    Returns the (possibly enriched) row in every case, so the caller decides
    where it gets written. A row without 'cord19_paper' is returned untouched
    and no lookup is made.
    """

    cord19_paper = row.get('cord19_paper')
    if not cord19_paper:
        return row

    doi = cord19_paper.get('doi') or None
    # This one DOI value is missing its leading '1'; patch it before lookup.
    if doi == '0.1126/science.abb7331':
        doi = '10.1126/science.abb7331'

    # (lookup query parameter, identifier value), in priority order. Empty
    # strings are normalized to None so they are skipped below.
    lookups = (
        ('pmcid', cord19_paper.get('pmcid') or None),
        ('doi', doi),
        ('pmid', cord19_paper.get('pubmed_id') or None),
    )

    fatcat_release = None
    for param, value in lookups:
        if fatcat_release:
            break
        if not value:
            continue
        resp = api_session.get(
            'https://api.fatcat.wiki/v0/release/lookup',
            params={
                param: value,
                'expand': 'container,files,filesets,webcaptures',
                'hide': 'abstracts,references',
            },
        )
        if resp.status_code == 200:
            fatcat_release = resp.json()

    if fatcat_release:
        row['fatcat_release'] = fatcat_release
        row['release_id'] = fatcat_release['ident']
    # Return instead of printing: the caller writes the row to its own output
    # stream, and a None return would make it silently drop the row.
    return row


def enrich_fatcat_file(json_input, json_output):
    """
    Enriches a JSON-transformed CORD-19 *metadata* file with fatcat metadata.

    Every line read from `json_input` is parsed as JSON and handed to
    enrich_fatcat_row(), reusing the one session object obtained from
    requests_retry_session(). Each truthy result is written to `json_output`
    as a single line of JSON with sorted keys; falsy results are skipped.
    """
    session = requests_retry_session()
    # Lazy generator: each line is parsed only when the loop reaches it.
    records = (json.loads(raw_line) for raw_line in json_input)
    for record in records:
        enriched = enrich_fatcat_row(record, session)
        if not enriched:
            continue
        print(json.dumps(enriched, sort_keys=True), file=json_output)