about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rwxr-xr-x fatcat_scholar/grobid2json.py 30
-rw-r--r-- fatcat_scholar/issue_db.py 2
-rw-r--r-- fatcat_scholar/kafka.py 8
-rw-r--r-- fatcat_scholar/query_fatcat.py 4
-rw-r--r-- fatcat_scholar/query_parse.py 8
-rw-r--r-- fatcat_scholar/sandcrawler.py 2
-rw-r--r-- fatcat_scholar/search.py 4
-rw-r--r-- fatcat_scholar/web.py 8
-rw-r--r-- fatcat_scholar/work_pipeline.py 2
-rw-r--r-- tests/test_djvu_parse.py 2
-rw-r--r-- tests/test_grobid2json.py 2
-rw-r--r-- tests/test_issue_db.py 4
-rw-r--r-- tests/test_refs_transform.py 2
-rw-r--r-- tests/test_transform.py 10
-rw-r--r-- tests/test_work_pipeline.py 2
15 files changed, 45 insertions, 45 deletions
diff --git a/fatcat_scholar/grobid2json.py b/fatcat_scholar/grobid2json.py
index e94bed2..2c85047 100755
--- a/fatcat_scholar/grobid2json.py
+++ b/fatcat_scholar/grobid2json.py
@@ -77,9 +77,9 @@ def all_authors(elem: Optional[ET.Element], ns: str = ns) -> List[Dict[str, Any]
def journal_info(elem: ET.Element) -> Dict[str, Any]:
journal = dict()
- journal["name"] = elem.findtext(".//{%s}monogr/{%s}title" % (ns, ns))
+ journal["name"] = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title")
journal["publisher"] = elem.findtext(
- ".//{%s}publicationStmt/{%s}publisher" % (ns, ns)
+ f".//{{{ns}}}publicationStmt/{{{ns}}}publisher"
)
if journal["publisher"] == "":
journal["publisher"] = None
@@ -101,8 +101,8 @@ def biblio_info(elem: ET.Element, ns: str = ns) -> Dict[str, Any]:
ref["id"] = elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id")
ref["unstructured"] = elem.findtext('.//{%s}note[@type="raw_reference"]' % ns)
# Title stuff is messy in references...
- ref["title"] = elem.findtext(".//{%s}analytic/{%s}title" % (ns, ns))
- other_title = elem.findtext(".//{%s}monogr/{%s}title" % (ns, ns))
+ ref["title"] = elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title")
+ other_title = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title")
if other_title:
if ref["title"]:
ref["journal"] = other_title
@@ -110,9 +110,9 @@ def biblio_info(elem: ET.Element, ns: str = ns) -> Dict[str, Any]:
ref["journal"] = None
ref["title"] = other_title
ref["authors"] = all_authors(elem, ns=ns)
- ref["publisher"] = elem.findtext(".//{%s}publicationStmt/{%s}publisher" % (ns, ns))
+ ref["publisher"] = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher")
if not ref["publisher"]:
- ref["publisher"] = elem.findtext(".//{%s}imprint/{%s}publisher" % (ns, ns))
+ ref["publisher"] = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher")
if ref["publisher"] == "":
ref["publisher"] = None
date = elem.find('.//{%s}date[@type="published"]' % ns)
@@ -162,12 +162,12 @@ def teixml2json(content: AnyStr, encumbered: bool = True) -> Dict[str, Any]:
header = tei.find(".//{%s}teiHeader" % ns)
if header is None:
raise ValueError("XML does not look like TEI format")
- application_tag = header.findall(".//{%s}appInfo/{%s}application" % (ns, ns))[0]
+ application_tag = header.findall(f".//{{{ns}}}appInfo/{{{ns}}}application")[0]
info["grobid_version"] = application_tag.attrib["version"].strip()
info["grobid_timestamp"] = application_tag.attrib["when"].strip()
- info["title"] = header.findtext(".//{%s}analytic/{%s}title" % (ns, ns))
+ info["title"] = header.findtext(f".//{{{ns}}}analytic/{{{ns}}}title")
info["authors"] = all_authors(
- header.find(".//{%s}sourceDesc/{%s}biblStruct" % (ns, ns))
+ header.find(f".//{{{ns}}}sourceDesc/{{{ns}}}biblStruct")
)
info["journal"] = journal_info(header)
date = header.find('.//{%s}date[@type="published"]' % ns)
@@ -178,7 +178,7 @@ def teixml2json(content: AnyStr, encumbered: bool = True) -> Dict[str, Any]:
info["doi"] = info["doi"].lower()
refs = []
- for (i, bs) in enumerate(tei.findall(".//{%s}listBibl/{%s}biblStruct" % (ns, ns))):
+ for (i, bs) in enumerate(tei.findall(f".//{{{ns}}}listBibl/{{{ns}}}biblStruct")):
ref = biblio_info(bs)
ref["index"] = i
refs.append(ref)
@@ -190,13 +190,13 @@ def teixml2json(content: AnyStr, encumbered: bool = True) -> Dict[str, Any]:
info["language_code"] = text.attrib["{%s}lang" % xml_ns] # xml:lang
if encumbered:
- el = tei.find(".//{%s}profileDesc/{%s}abstract" % (ns, ns))
+ el = tei.find(f".//{{{ns}}}profileDesc/{{{ns}}}abstract")
info["abstract"] = (el or None) and " ".join(el.itertext()).strip()
- el = tei.find(".//{%s}text/{%s}body" % (ns, ns))
+ el = tei.find(f".//{{{ns}}}text/{{{ns}}}body")
info["body"] = (el or None) and " ".join(el.itertext()).strip()
- el = tei.find('.//{%s}back/{%s}div[@type="acknowledgement"]' % (ns, ns))
+ el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="acknowledgement"]')
info["acknowledgement"] = (el or None) and " ".join(el.itertext()).strip()
- el = tei.find('.//{%s}back/{%s}div[@type="annex"]' % (ns, ns))
+ el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="annex"]')
info["annex"] = (el or None) and " ".join(el.itertext()).strip()
# remove empty/null keys
@@ -223,7 +223,7 @@ def main() -> None: # pragma no cover
args = parser.parse_args()
for filename in args.teifiles:
- content = open(filename, "r").read()
+ content = open(filename).read()
print(
json.dumps(
teixml2json(content, encumbered=(not args.no_encumbered)),
diff --git a/fatcat_scholar/issue_db.py b/fatcat_scholar/issue_db.py
index 3c55b51..057869b 100644
--- a/fatcat_scholar/issue_db.py
+++ b/fatcat_scholar/issue_db.py
@@ -167,7 +167,7 @@ class IssueDB:
PRAGMA main.synchronous = OFF;
"""
)
- with open("schema/issue_db.sql", "r") as fschema:
+ with open("schema/issue_db.sql") as fschema:
self.db.executescript(fschema.read())
def insert_sim_pub(self, pub: SimPubRow, cur: Any = None) -> None:
diff --git a/fatcat_scholar/kafka.py b/fatcat_scholar/kafka.py
index e71bc3d..bee2fee 100644
--- a/fatcat_scholar/kafka.py
+++ b/fatcat_scholar/kafka.py
@@ -7,7 +7,7 @@ from typing import List, Any
from confluent_kafka import Consumer, Producer, KafkaException
-class KafkaWorker(object):
+class KafkaWorker:
"""
Base class for Scholar workers which consume from Kafka topics.
@@ -69,7 +69,7 @@ class KafkaWorker(object):
@staticmethod
def _fail_fast_produce(err: Any, msg: Any) -> None:
if err is not None:
- print("Kafka producer delivery error: {}".format(err), file=sys.stderr)
+ print(f"Kafka producer delivery error: {err}", file=sys.stderr)
raise KafkaException(err)
@staticmethod
@@ -97,13 +97,13 @@ class KafkaWorker(object):
def _fail_fast_consume(err: Any, partitions: Any) -> None:
if err is not None:
- print("Kafka consumer commit error: {}".format(err), file=sys.stderr)
+ print(f"Kafka consumer commit error: {err}", file=sys.stderr)
raise KafkaException(err)
for p in partitions:
# check for partition-specific commit errors
if p.error:
print(
- "Kafka consumer commit error: {}".format(p.error),
+ f"Kafka consumer commit error: {p.error}",
file=sys.stderr,
)
raise KafkaException(p.error)
diff --git a/fatcat_scholar/query_fatcat.py b/fatcat_scholar/query_fatcat.py
index 24bd662..edac35d 100644
--- a/fatcat_scholar/query_fatcat.py
+++ b/fatcat_scholar/query_fatcat.py
@@ -67,7 +67,7 @@ def run_query_fatcat(query: str, fulltext_only: bool, json_output: Any) -> None:
Q("query_string", query=query, default_operator="AND", fields=["biblio"])
)
- print("Expecting {} search hits".format(search.count()), file=sys.stderr)
+ print(f"Expecting {search.count()} search hits", file=sys.stderr)
search = search.params(clear_scroll=False)
search = search.params(_source=False)
@@ -76,7 +76,7 @@ def run_query_fatcat(query: str, fulltext_only: bool, json_output: Any) -> None:
for hit in results:
release_id = hit.meta.id
resp = api_session.get(
- "https://api.fatcat.wiki/v0/release/{}".format(release_id),
+ f"https://api.fatcat.wiki/v0/release/{release_id}",
params={
"expand": "container,files,filesets,webcaptures",
"hide": "references",
diff --git a/fatcat_scholar/query_parse.py b/fatcat_scholar/query_parse.py
index 0f5086b..8c49925 100644
--- a/fatcat_scholar/query_parse.py
+++ b/fatcat_scholar/query_parse.py
@@ -11,7 +11,7 @@ import shlex
def _clean_token(raw: str) -> str:
raw = raw.strip()
if not raw:
- return '"{}"'.format(raw)
+ return f'"{raw}"'
if len(raw.split()) > 1:
# has whitespace, will get quoted
return raw
@@ -19,11 +19,11 @@ def _clean_token(raw: str) -> str:
# is quoted already
return raw
if "/" in raw or raw.endswith(":") or raw.endswith("!") or raw.endswith("?"):
- return '"{}"'.format(raw)
+ return f'"{raw}"'
if raw.startswith("[") and raw.endswith("]"):
- return '"{}"'.format(raw)
+ return f'"{raw}"'
if raw.startswith("{") and raw.endswith("}"):
- return '"{}"'.format(raw)
+ return f'"{raw}"'
return raw
diff --git a/fatcat_scholar/sandcrawler.py b/fatcat_scholar/sandcrawler.py
index 416ed83..356b373 100644
--- a/fatcat_scholar/sandcrawler.py
+++ b/fatcat_scholar/sandcrawler.py
@@ -39,7 +39,7 @@ class SandcrawlerPostgrestClient:
return None
-class SandcrawlerMinioClient(object):
+class SandcrawlerMinioClient:
def __init__(
self,
host_url: str,
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py
index c75ccdb..5b45ba9 100644
--- a/fatcat_scholar/search.py
+++ b/fatcat_scholar/search.py
@@ -386,8 +386,8 @@ def do_fulltext_search(
raise ValueError(str(e.info)) from e
except elasticsearch.exceptions.TransportError as e:
# all other errors
- logging.warn("elasticsearch non-200 status code: {}".format(e.info))
- raise IOError(str(e.info)) from e
+ logging.warn(f"elasticsearch non-200 status code: {e.info}")
+ raise OSError(str(e.info)) from e
query_delta = datetime.datetime.now() - query_start
# convert from API objects to dicts
diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py
index dce732b..43a733a 100644
--- a/fatcat_scholar/web.py
+++ b/fatcat_scholar/web.py
@@ -106,7 +106,7 @@ async def search(query: FulltextQuery = Depends(FulltextQuery)) -> FulltextHits:
sentry_sdk.set_level("warning")
sentry_sdk.capture_exception(e)
raise HTTPException(status_code=400, detail=f"Query Error: {e}")
- except IOError as e:
+ except OSError as e:
sentry_sdk.capture_exception(e)
raise HTTPException(status_code=500, detail=f"Backend Error: {e}")
@@ -224,7 +224,7 @@ async def web_search(
sentry_sdk.capture_exception(e)
search_error = dict(type="query", message=str(e))
status_code = 400
- except IOError as e:
+ except OSError as e:
sentry_sdk.capture_exception(e)
search_error = dict(type="backend", message=str(e))
status_code = 500
@@ -282,8 +282,8 @@ async def favicon() -> Any:
)
-ROBOTS_ALLOW = open("fatcat_scholar/static/robots.allow.txt", "r").read()
-ROBOTS_DISALLOW = open("fatcat_scholar/static/robots.disallow.txt", "r").read()
+ROBOTS_ALLOW = open("fatcat_scholar/static/robots.allow.txt").read()
+ROBOTS_DISALLOW = open("fatcat_scholar/static/robots.disallow.txt").read()
@app.get("/robots.txt", include_in_schema=False)
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index 10b701b..4c8f1be 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -320,7 +320,7 @@ class WorkPipeline:
2. find best SIM microfilm copy available
"""
pref_idents = fulltext_pref_list(releases)
- release_dict = dict([(r.ident, r) for r in releases])
+ release_dict = {r.ident: r for r in releases}
# print(f"pref_idents={pref_idents}", file=sys.stderr)
diff --git a/tests/test_djvu_parse.py b/tests/test_djvu_parse.py
index 777f8bf..f661f33 100644
--- a/tests/test_djvu_parse.py
+++ b/tests/test_djvu_parse.py
@@ -5,7 +5,7 @@ from fatcat_scholar.djvu import djvu_extract_leaf_texts
def test_djvu_extract_leaf_texts() -> None:
# https://archive.org/details/ERIC_ED441501
- with open("tests/files/ERIC_ED441501_djvu.xml", "r") as f:
+ with open("tests/files/ERIC_ED441501_djvu.xml") as f:
blob = f.read()
leaves = djvu_extract_leaf_texts(io.StringIO(blob), [3, 6])
diff --git a/tests/test_grobid2json.py b/tests/test_grobid2json.py
index 345fd91..bd5c0e4 100644
--- a/tests/test_grobid2json.py
+++ b/tests/test_grobid2json.py
@@ -3,7 +3,7 @@ from fatcat_scholar.grobid2json import teixml2json
def test_grobid_teixml2json() -> None:
- with open("tests/files/example_grobid.tei.xml", "r") as f:
+ with open("tests/files/example_grobid.tei.xml") as f:
blob = f.read()
obj = teixml2json(blob, True)
diff --git a/tests/test_issue_db.py b/tests/test_issue_db.py
index 9fdab94..d164ef6 100644
--- a/tests/test_issue_db.py
+++ b/tests/test_issue_db.py
@@ -16,8 +16,8 @@ def test_issue_db_basics() -> None:
issue_db = IssueDB(settings.SCHOLAR_ISSUEDB_PATH)
issue_db.init_db()
- with open("tests/files/sim_collections.json", "r") as f:
+ with open("tests/files/sim_collections.json") as f:
issue_db.load_pubs(f.readlines(), api)
- with open("tests/files/sim_items.json", "r") as f:
+ with open("tests/files/sim_items.json") as f:
issue_db.load_issues(f.readlines(), es_client)
diff --git a/tests/test_refs_transform.py b/tests/test_refs_transform.py
index c26ee1e..3b18557 100644
--- a/tests/test_refs_transform.py
+++ b/tests/test_refs_transform.py
@@ -6,7 +6,7 @@ from fatcat_scholar.transform import refs_from_grobid
def test_transform_refs_grobid() -> None:
- with open("tests/files/example_grobid.tei.xml", "r") as f:
+ with open("tests/files/example_grobid.tei.xml") as f:
blob = f.read()
dummy_release = ReleaseEntity(
diff --git a/tests/test_transform.py b/tests/test_transform.py
index 927c13b..c5e189b 100644
--- a/tests/test_transform.py
+++ b/tests/test_transform.py
@@ -7,7 +7,7 @@ from fatcat_scholar.transform import *
def test_es_release_from_release() -> None:
- with open("tests/files/release_hsmo6p4smrganpb3fndaj2lon4.json", "r") as f:
+ with open("tests/files/release_hsmo6p4smrganpb3fndaj2lon4.json") as f:
release = entity_from_json(f.read(), ReleaseEntity)
obj = es_release_from_release(release)
@@ -20,7 +20,7 @@ def test_es_release_from_release() -> None:
def test_es_biblio_from_release() -> None:
- with open("tests/files/release_hsmo6p4smrganpb3fndaj2lon4.json", "r") as f:
+ with open("tests/files/release_hsmo6p4smrganpb3fndaj2lon4.json") as f:
release = entity_from_json(f.read(), ReleaseEntity)
obj = es_biblio_from_release(release)
@@ -36,16 +36,16 @@ def test_es_biblio_from_release() -> None:
def test_run_refs() -> None:
- with open("tests/files/work_iarm6swodra2bcrzhxrfaah7py_bundle.json", "r") as f:
+ with open("tests/files/work_iarm6swodra2bcrzhxrfaah7py_bundle.json") as f:
run_refs(f.readlines())
def test_run_transform() -> None:
- with open("tests/files/work_iarm6swodra2bcrzhxrfaah7py_bundle.json", "r") as f:
+ with open("tests/files/work_iarm6swodra2bcrzhxrfaah7py_bundle.json") as f:
run_transform(f.readlines())
- with open("tests/files/sim_page_bundle.json", "r") as f:
+ with open("tests/files/sim_page_bundle.json") as f:
run_transform(f.readlines())
diff --git a/tests/test_work_pipeline.py b/tests/test_work_pipeline.py
index e0e4a82..39b09dc 100644
--- a/tests/test_work_pipeline.py
+++ b/tests/test_work_pipeline.py
@@ -84,5 +84,5 @@ def test_run_transform(mocker: Any) -> None:
),
)
- with open("tests/files/release_hsmo6p4smrganpb3fndaj2lon4_sans.json", "r") as f:
+ with open("tests/files/release_hsmo6p4smrganpb3fndaj2lon4_sans.json") as f:
wp.run_releases(f.readlines())