aboutsummaryrefslogtreecommitdiffstats
path: root/tests/test_work_pipeline.py
blob: bc8a79a7fa8607cc4cc65768da6e7217e71d7bcd (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
from typing import Any

import responses

from fatcat_scholar.config import settings
from fatcat_scholar.issue_db import IssueDB
from fatcat_scholar.sandcrawler import (
    SandcrawlerMinioClient,
    SandcrawlerPostgrestClient,
)
from fatcat_scholar.work_pipeline import WorkPipeline


@responses.activate
def test_run_transform(mocker: Any) -> None:

    issue_db = IssueDB(settings.SCHOLAR_ISSUEDB_PATH)
    issue_db.init_db()

    responses.add(
        responses.GET,
        "http://disabled-during-tests-bogus.xyz:3333/grobid?sha1hex=eq.bca1531b0562c6d72e0c283c1ccb97eb5cb02117",
        status=200,
        json=[
            {
                "sha1hex": "bca1531b0562c6d72e0c283c1ccb97eb5cb02117",
                "updated": "2019-11-30T04:44:00+00:00",
                "grobid_version": "0.5.5-fatcat",
                "status_code": 200,
                "status": "success",
                "fatcat_release": "hsmo6p4smrganpb3fndaj2lon4",
                "metadata": {
                    "biblio": {
                        "doi": "10.7717/peerj.4375",
                        "date": "2018-02-13",
                        "title": "Distributed under Creative Commons CC-BY 4.0 The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles",
                        "authors": [],
                    },
                    "language_code": "en",
                    "grobid_timestamp": "2019-11-30T04:44+0000",
                },
            }
        ],
    )

    responses.add(
        responses.GET,
        "http://disabled-during-tests-bogus.xyz:3333/pdf_meta?sha1hex=eq.bca1531b0562c6d72e0c283c1ccb97eb5cb02117",
        status=200,
        json=[
            {
                "sha1hex": "bca1531b0562c6d72e0c283c1ccb97eb5cb02117",
                "updated": "2020-07-07T02:15:52.98309+00:00",
                "status": "success",
                "has_page0_thumbnail": True,
                "page_count": 23,
                "word_count": 10534,
                "page0_height": 792,
                "page0_width": 612,
                "permanent_id": "52f2164b9cc9e47fd150e7ee389b595a",
                "pdf_created": "2018-02-09T06:06:06+00:00",
                "pdf_version": "1.5",
                "metadata": {
                    "title": "",
                    "author": "",
                    "creator": "River Valley",
                    "subject": "Legal Issues, Science Policy, Data Science",
                    "producer": "pdfTeX-1.40.16",
                },
            }
        ],
    )

    responses.add(
        responses.GET,
        "http://disabled-during-tests-bogus.xyz:3333/crossref_with_refs?doi=eq.10.7717%2Fpeerj.4375",
        status=200,
        json=[
            {
                "doi": "10.7717/peerj.4375",
                "indexed": "2020-07-07T02:15:52.98309+00:00",
                "record": {
                    "title": "something",
                    "TODO_better_object": 3,
                },
                "refs_json": [],
            }
        ],
    )

    es_raw = mocker.patch("fatcat_scholar.work_pipeline.WorkPipeline.fetch_file_grobid")
    es_raw.side_effect = [
        {"tei_xml": "<xml>dummy", "release_ident": "asdf123", "file_ident": "xyq9876"},
    ]

    wp = WorkPipeline(
        issue_db=issue_db,
        sandcrawler_db_client=SandcrawlerPostgrestClient(
            api_url=settings.SANDCRAWLER_DB_API
        ),
        sandcrawler_s3_client=SandcrawlerMinioClient(
            host_url=settings.SANDCRAWLER_S3_API
        ),
    )

    with open("tests/files/release_hsmo6p4smrganpb3fndaj2lon4_sans.json", "r") as f:
        wp.run_releases(f.readlines())