1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
|
import responses
from fatcat_scholar.issue_db import IssueDB
from fatcat_scholar.sandcrawler import (
SandcrawlerPostgrestClient,
SandcrawlerMinioClient,
)
from fatcat_scholar.work_pipeline import *
from fatcat_scholar.config import settings
@responses.activate
def test_run_transform(mocker: Any) -> None:
    """Smoke-test ``WorkPipeline.run_releases`` against a release fixture.

    Two sandcrawler postgrest lookups (``grobid`` and ``pdf_meta``) are
    answered by canned ``responses`` mocks, and
    ``WorkPipeline.fetch_file_grobid`` is patched so it yields a dummy
    TEI-XML record. The test makes no assertions on the pipeline output;
    it passes as long as the run completes without raising.
    """
    db = IssueDB(settings.SCHOLAR_ISSUEDB_PATH)
    db.init_db()

    # Canned row for the `grobid` table lookup.
    grobid_rows = [
        {
            "sha1hex": "bca1531b0562c6d72e0c283c1ccb97eb5cb02117",
            "updated": "2019-11-30T04:44:00+00:00",
            "grobid_version": "0.5.5-fatcat",
            "status_code": 200,
            "status": "success",
            "fatcat_release": "hsmo6p4smrganpb3fndaj2lon4",
            "metadata": {
                "biblio": {
                    "doi": "10.7717/peerj.4375",
                    "date": "2018-02-13",
                    "title": "Distributed under Creative Commons CC-BY 4.0 The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles",
                    "authors": [],
                },
                "language_code": "en",
                "grobid_timestamp": "2019-11-30T04:44+0000",
            },
        }
    ]

    # Canned row for the `pdf_meta` table lookup.
    pdf_meta_rows = [
        {
            "sha1hex": "bca1531b0562c6d72e0c283c1ccb97eb5cb02117",
            "updated": "2020-07-07T02:15:52.98309+00:00",
            "status": "success",
            "has_page0_thumbnail": True,
            "page_count": 23,
            "word_count": 10534,
            "page0_height": 792,
            "page0_width": 612,
            "permanent_id": "52f2164b9cc9e47fd150e7ee389b595a",
            "pdf_created": "2018-02-09T06:06:06+00:00",
            "pdf_version": "1.5",
            "metadata": {
                "title": "",
                "author": "",
                "creator": "River Valley",
                "subject": "Legal Issues, Science Policy, Data Science",
                "producer": "pdfTeX-1.40.16",
            },
        }
    ]

    # Register the mocked GET endpoints in the same order as before:
    # grobid first, then pdf_meta.
    mocked_endpoints = (
        (
            "http://disabled-during-tests-bogus.xyz:3333/grobid?sha1hex=eq.bca1531b0562c6d72e0c283c1ccb97eb5cb02117",
            grobid_rows,
        ),
        (
            "http://disabled-during-tests-bogus.xyz:3333/pdf_meta?sha1hex=eq.bca1531b0562c6d72e0c283c1ccb97eb5cb02117",
            pdf_meta_rows,
        ),
    )
    for endpoint_url, rows in mocked_endpoints:
        responses.add(responses.GET, endpoint_url, status=200, json=rows)

    # Replace the GROBID fetch with a mock that hands back one dummy record.
    fetch_grobid_mock = mocker.patch(
        "fatcat_scholar.work_pipeline.WorkPipeline.fetch_file_grobid"
    )
    fetch_grobid_mock.side_effect = [
        {"tei_xml": "<xml>dummy", "release_ident": "asdf123", "file_ident": "xyq9876"},
    ]

    db_client = SandcrawlerPostgrestClient(api_url=settings.SANDCRAWLER_DB_API)
    s3_client = SandcrawlerMinioClient(host_url=settings.SANDCRAWLER_S3_API)
    pipeline = WorkPipeline(
        issue_db=db,
        sandcrawler_db_client=db_client,
        sandcrawler_s3_client=s3_client,
    )

    with open("tests/files/release_hsmo6p4smrganpb3fndaj2lon4_sans.json") as fixture:
        release_lines = fixture.readlines()
    pipeline.run_releases(release_lines)
|