-rwxr-xr-x | fatcat_scholar/grobid2json.py | 30
-rw-r--r-- | fatcat_scholar/issue_db.py | 2
-rw-r--r-- | fatcat_scholar/kafka.py | 8
-rw-r--r-- | fatcat_scholar/query_fatcat.py | 4
-rw-r--r-- | fatcat_scholar/query_parse.py | 8
-rw-r--r-- | fatcat_scholar/sandcrawler.py | 2
-rw-r--r-- | fatcat_scholar/search.py | 4
-rw-r--r-- | fatcat_scholar/web.py | 8
-rw-r--r-- | fatcat_scholar/work_pipeline.py | 2
-rw-r--r-- | tests/test_djvu_parse.py | 2
-rw-r--r-- | tests/test_grobid2json.py | 2
-rw-r--r-- | tests/test_issue_db.py | 4
-rw-r--r-- | tests/test_refs_transform.py | 2
-rw-r--r-- | tests/test_transform.py | 10
-rw-r--r-- | tests/test_work_pipeline.py | 2
15 files changed, 45 insertions, 45 deletions
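
Most of those 45 paired changes convert %-style ElementTree namespace paths to f-strings. ElementTree spells a namespaced tag as {uri}tag, and a literal brace inside an f-string must be doubled, which is why the old "{%s}" wrapper becomes the triple-braced "{{{ns}}}": the outer doubled braces emit the literal braces, the inner pair interpolates ns. A minimal standalone sketch of the equivalence (the XML snippet is invented for illustration; the ns value is the real TEI namespace used by grobid2json.py):

    import xml.etree.ElementTree as ET

    ns = "http://www.tei-c.org/ns/1.0"

    # Both spellings render the identical ElementTree path string.
    old_style = ".//{%s}monogr/{%s}title" % (ns, ns)
    new_style = f".//{{{ns}}}monogr/{{{ns}}}title"
    assert old_style == new_style

    # Tiny TEI-like document to query (invented for this example):
    doc = ET.fromstring(
        f'<biblStruct xmlns="{ns}"><monogr><title>Example Journal</title></monogr></biblStruct>'
    )
    assert doc.findtext(new_style) == "Example Journal"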
diff --git a/fatcat_scholar/grobid2json.py b/fatcat_scholar/grobid2json.py
index e94bed2..2c85047 100755
--- a/fatcat_scholar/grobid2json.py
+++ b/fatcat_scholar/grobid2json.py
@@ -77,9 +77,9 @@ def all_authors(elem: Optional[ET.Element], ns: str = ns) -> List[Dict[str, Any]
 
 def journal_info(elem: ET.Element) -> Dict[str, Any]:
     journal = dict()
-    journal["name"] = elem.findtext(".//{%s}monogr/{%s}title" % (ns, ns))
+    journal["name"] = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title")
     journal["publisher"] = elem.findtext(
-        ".//{%s}publicationStmt/{%s}publisher" % (ns, ns)
+        f".//{{{ns}}}publicationStmt/{{{ns}}}publisher"
     )
     if journal["publisher"] == "":
         journal["publisher"] = None
@@ -101,8 +101,8 @@ def biblio_info(elem: ET.Element, ns: str = ns) -> Dict[str, Any]:
     ref["id"] = elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id")
     ref["unstructured"] = elem.findtext('.//{%s}note[@type="raw_reference"]' % ns)
     # Title stuff is messy in references...
-    ref["title"] = elem.findtext(".//{%s}analytic/{%s}title" % (ns, ns))
-    other_title = elem.findtext(".//{%s}monogr/{%s}title" % (ns, ns))
+    ref["title"] = elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title")
+    other_title = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title")
     if other_title:
         if ref["title"]:
             ref["journal"] = other_title
@@ -110,9 +110,9 @@ def biblio_info(elem: ET.Element, ns: str = ns) -> Dict[str, Any]:
             ref["journal"] = None
             ref["title"] = other_title
     ref["authors"] = all_authors(elem, ns=ns)
-    ref["publisher"] = elem.findtext(".//{%s}publicationStmt/{%s}publisher" % (ns, ns))
+    ref["publisher"] = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher")
     if not ref["publisher"]:
-        ref["publisher"] = elem.findtext(".//{%s}imprint/{%s}publisher" % (ns, ns))
+        ref["publisher"] = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher")
     if ref["publisher"] == "":
         ref["publisher"] = None
     date = elem.find('.//{%s}date[@type="published"]' % ns)
@@ -162,12 +162,12 @@ def teixml2json(content: AnyStr, encumbered: bool = True) -> Dict[str, Any]:
     header = tei.find(".//{%s}teiHeader" % ns)
     if header is None:
         raise ValueError("XML does not look like TEI format")
-    application_tag = header.findall(".//{%s}appInfo/{%s}application" % (ns, ns))[0]
+    application_tag = header.findall(f".//{{{ns}}}appInfo/{{{ns}}}application")[0]
     info["grobid_version"] = application_tag.attrib["version"].strip()
     info["grobid_timestamp"] = application_tag.attrib["when"].strip()
-    info["title"] = header.findtext(".//{%s}analytic/{%s}title" % (ns, ns))
+    info["title"] = header.findtext(f".//{{{ns}}}analytic/{{{ns}}}title")
     info["authors"] = all_authors(
-        header.find(".//{%s}sourceDesc/{%s}biblStruct" % (ns, ns))
+        header.find(f".//{{{ns}}}sourceDesc/{{{ns}}}biblStruct")
     )
     info["journal"] = journal_info(header)
     date = header.find('.//{%s}date[@type="published"]' % ns)
@@ -178,7 +178,7 @@
         info["doi"] = info["doi"].lower()
 
     refs = []
-    for (i, bs) in enumerate(tei.findall(".//{%s}listBibl/{%s}biblStruct" % (ns, ns))):
+    for (i, bs) in enumerate(tei.findall(f".//{{{ns}}}listBibl/{{{ns}}}biblStruct")):
         ref = biblio_info(bs)
         ref["index"] = i
         refs.append(ref)
@@ -190,13 +190,13 @@
         info["language_code"] = text.attrib["{%s}lang" % xml_ns]  # xml:lang
 
     if encumbered:
-        el = tei.find(".//{%s}profileDesc/{%s}abstract" % (ns, ns))
+        el = tei.find(f".//{{{ns}}}profileDesc/{{{ns}}}abstract")
         info["abstract"] = (el or None) and " ".join(el.itertext()).strip()
-        el = tei.find(".//{%s}text/{%s}body" % (ns, ns))
+        el = tei.find(f".//{{{ns}}}text/{{{ns}}}body")
         info["body"] = (el or None) and " ".join(el.itertext()).strip()
-        el = tei.find('.//{%s}back/{%s}div[@type="acknowledgement"]' % (ns, ns))
+        el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="acknowledgement"]')
         info["acknowledgement"] = (el or None) and " ".join(el.itertext()).strip()
-        el = tei.find('.//{%s}back/{%s}div[@type="annex"]' % (ns, ns))
+        el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="annex"]')
         info["annex"] = (el or None) and " ".join(el.itertext()).strip()
 
     # remove empty/null keys
@@ -223,7 +223,7 @@ def main() -> None:  # pragma no cover
     args = parser.parse_args()
 
     for filename in args.teifiles:
-        content = open(filename, "r").read()
+        content = open(filename).read()
         print(
             json.dumps(
                 teixml2json(content, encumbered=(not args.no_encumbered)),
diff --git a/fatcat_scholar/issue_db.py b/fatcat_scholar/issue_db.py
index 3c55b51..057869b 100644
--- a/fatcat_scholar/issue_db.py
+++ b/fatcat_scholar/issue_db.py
@@ -167,7 +167,7 @@ class IssueDB:
             PRAGMA main.synchronous = OFF;
         """
         )
-        with open("schema/issue_db.sql", "r") as fschema:
+        with open("schema/issue_db.sql") as fschema:
             self.db.executescript(fschema.read())
 
     def insert_sim_pub(self, pub: SimPubRow, cur: Any = None) -> None:
diff --git a/fatcat_scholar/kafka.py b/fatcat_scholar/kafka.py
index e71bc3d..bee2fee 100644
--- a/fatcat_scholar/kafka.py
+++ b/fatcat_scholar/kafka.py
@@ -7,7 +7,7 @@
 from typing import List, Any
 
 from confluent_kafka import Consumer, Producer, KafkaException
 
-class KafkaWorker(object):
+class KafkaWorker:
     """
     Base class for Scholar workers which consume from Kafka topics.
@@ -69,7 +69,7 @@ class KafkaWorker(object):
     @staticmethod
     def _fail_fast_produce(err: Any, msg: Any) -> None:
         if err is not None:
-            print("Kafka producer delivery error: {}".format(err), file=sys.stderr)
+            print(f"Kafka producer delivery error: {err}", file=sys.stderr)
             raise KafkaException(err)
 
     @staticmethod
@@ -97,13 +97,13 @@
     def _fail_fast_consume(err: Any, partitions: Any) -> None:
         if err is not None:
-            print("Kafka consumer commit error: {}".format(err), file=sys.stderr)
+            print(f"Kafka consumer commit error: {err}", file=sys.stderr)
             raise KafkaException(err)
         for p in partitions:
             # check for partition-specific commit errors
             if p.error:
                 print(
-                    "Kafka consumer commit error: {}".format(p.error),
+                    f"Kafka consumer commit error: {p.error}",
                     file=sys.stderr,
                 )
                 raise KafkaException(p.error)
diff --git a/fatcat_scholar/query_fatcat.py b/fatcat_scholar/query_fatcat.py
index 24bd662..edac35d 100644
--- a/fatcat_scholar/query_fatcat.py
+++ b/fatcat_scholar/query_fatcat.py
@@ -67,7 +67,7 @@ def run_query_fatcat(query: str, fulltext_only: bool, json_output: Any) -> None:
         Q("query_string", query=query, default_operator="AND", fields=["biblio"])
     )
 
-    print("Expecting {} search hits".format(search.count()), file=sys.stderr)
+    print(f"Expecting {search.count()} search hits", file=sys.stderr)
 
     search = search.params(clear_scroll=False)
     search = search.params(_source=False)
@@ -76,7 +76,7 @@ def run_query_fatcat(query: str, fulltext_only: bool, json_output: Any) -> None:
     for hit in results:
         release_id = hit.meta.id
         resp = api_session.get(
-            "https://api.fatcat.wiki/v0/release/{}".format(release_id),
+            f"https://api.fatcat.wiki/v0/release/{release_id}",
             params={
                 "expand": "container,files,filesets,webcaptures",
                 "hide": "references",
diff --git a/fatcat_scholar/query_parse.py b/fatcat_scholar/query_parse.py
index 0f5086b..8c49925 100644
--- a/fatcat_scholar/query_parse.py
+++ b/fatcat_scholar/query_parse.py
@@ -11,7 +11,7 @@ import shlex
 def _clean_token(raw: str) -> str:
     raw = raw.strip()
     if not raw:
-        return '"{}"'.format(raw)
+        return f'"{raw}"'
     if len(raw.split()) > 1:
         # has whitespace, will get quoted
         return raw
@@ -19,11 +19,11 @@ def _clean_token(raw: str) -> str:
         # is quoted already
         return raw
     if "/" in raw or raw.endswith(":") or raw.endswith("!") or raw.endswith("?"):
-        return '"{}"'.format(raw)
+        return f'"{raw}"'
     if raw.startswith("[") and raw.endswith("]"):
-        return '"{}"'.format(raw)
+        return f'"{raw}"'
     if raw.startswith("{") and raw.endswith("}"):
-        return '"{}"'.format(raw)
+        return f'"{raw}"'
     return raw
 
 
diff --git a/fatcat_scholar/sandcrawler.py b/fatcat_scholar/sandcrawler.py
index 416ed83..356b373 100644
--- a/fatcat_scholar/sandcrawler.py
+++ b/fatcat_scholar/sandcrawler.py
@@ -39,7 +39,7 @@ class SandcrawlerPostgrestClient:
         return None
 
 
-class SandcrawlerMinioClient(object):
+class SandcrawlerMinioClient:
     def __init__(
         self,
         host_url: str,
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py
index c75ccdb..5b45ba9 100644
--- a/fatcat_scholar/search.py
+++ b/fatcat_scholar/search.py
@@ -386,8 +386,8 @@ def do_fulltext_search(
         raise ValueError(str(e.info)) from e
     except elasticsearch.exceptions.TransportError as e:
         # all other errors
-        logging.warn("elasticsearch non-200 status code: {}".format(e.info))
-        raise IOError(str(e.info)) from e
+        logging.warn(f"elasticsearch non-200 status code: {e.info}")
+        raise OSError(str(e.info)) from e
     query_delta = datetime.datetime.now() - query_start
 
     # convert from API objects to dicts
diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py
index dce732b..43a733a 100644
--- a/fatcat_scholar/web.py
+++ b/fatcat_scholar/web.py
@@ -106,7 +106,7 @@ async def search(query: FulltextQuery = Depends(FulltextQuery)) -> FulltextHits:
         sentry_sdk.set_level("warning")
         sentry_sdk.capture_exception(e)
         raise HTTPException(status_code=400, detail=f"Query Error: {e}")
-    except IOError as e:
+    except OSError as e:
         sentry_sdk.capture_exception(e)
         raise HTTPException(status_code=500, detail=f"Backend Error: {e}")
 
@@ -224,7 +224,7 @@ async def web_search(
         sentry_sdk.capture_exception(e)
         search_error = dict(type="query", message=str(e))
         status_code = 400
-    except IOError as e:
+    except OSError as e:
         sentry_sdk.capture_exception(e)
         search_error = dict(type="backend", message=str(e))
         status_code = 500
@@ -282,8 +282,8 @@ async def favicon() -> Any:
     )
 
 
-ROBOTS_ALLOW = open("fatcat_scholar/static/robots.allow.txt", "r").read()
-ROBOTS_DISALLOW = open("fatcat_scholar/static/robots.disallow.txt", "r").read()
+ROBOTS_ALLOW = open("fatcat_scholar/static/robots.allow.txt").read()
+ROBOTS_DISALLOW = open("fatcat_scholar/static/robots.disallow.txt").read()
 
 
 @app.get("/robots.txt", include_in_schema=False)
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index 10b701b..4c8f1be 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -320,7 +320,7 @@ class WorkPipeline:
         2. find best SIM microfilm copy available
         """
         pref_idents = fulltext_pref_list(releases)
-        release_dict = dict([(r.ident, r) for r in releases])
+        release_dict = {r.ident: r for r in releases}
 
         # print(f"pref_idents={pref_idents}", file=sys.stderr)
 
diff --git a/tests/test_djvu_parse.py b/tests/test_djvu_parse.py
index 777f8bf..f661f33 100644
--- a/tests/test_djvu_parse.py
+++ b/tests/test_djvu_parse.py
@@ -5,7 +5,7 @@ from fatcat_scholar.djvu import djvu_extract_leaf_texts
 
 def test_djvu_extract_leaf_texts() -> None:
     # https://archive.org/details/ERIC_ED441501
-    with open("tests/files/ERIC_ED441501_djvu.xml", "r") as f:
+    with open("tests/files/ERIC_ED441501_djvu.xml") as f:
         blob = f.read()
 
     leaves = djvu_extract_leaf_texts(io.StringIO(blob), [3, 6])
diff --git a/tests/test_grobid2json.py b/tests/test_grobid2json.py
index 345fd91..bd5c0e4 100644
--- a/tests/test_grobid2json.py
+++ b/tests/test_grobid2json.py
@@ -3,7 +3,7 @@ from fatcat_scholar.grobid2json import teixml2json
 
 
 def test_grobid_teixml2json() -> None:
-    with open("tests/files/example_grobid.tei.xml", "r") as f:
+    with open("tests/files/example_grobid.tei.xml") as f:
         blob = f.read()
 
     obj = teixml2json(blob, True)
diff --git a/tests/test_issue_db.py b/tests/test_issue_db.py
index 9fdab94..d164ef6 100644
--- a/tests/test_issue_db.py
+++ b/tests/test_issue_db.py
@@ -16,8 +16,8 @@ def test_issue_db_basics() -> None:
     issue_db = IssueDB(settings.SCHOLAR_ISSUEDB_PATH)
     issue_db.init_db()
 
-    with open("tests/files/sim_collections.json", "r") as f:
+    with open("tests/files/sim_collections.json") as f:
         issue_db.load_pubs(f.readlines(), api)
 
-    with open("tests/files/sim_items.json", "r") as f:
+    with open("tests/files/sim_items.json") as f:
         issue_db.load_issues(f.readlines(), es_client)
diff --git a/tests/test_refs_transform.py b/tests/test_refs_transform.py
index c26ee1e..3b18557 100644
--- a/tests/test_refs_transform.py
+++ b/tests/test_refs_transform.py
@@ -6,7 +6,7 @@ from fatcat_scholar.transform import refs_from_grobid
 
 
 def test_transform_refs_grobid() -> None:
-    with open("tests/files/example_grobid.tei.xml", "r") as f:
+    with open("tests/files/example_grobid.tei.xml") as f:
         blob = f.read()
 
     dummy_release = ReleaseEntity(
diff --git a/tests/test_transform.py b/tests/test_transform.py
index 927c13b..c5e189b 100644
--- a/tests/test_transform.py
+++ b/tests/test_transform.py
@@ -7,7 +7,7 @@ from fatcat_scholar.transform import *
 
 
 def test_es_release_from_release() -> None:
-    with open("tests/files/release_hsmo6p4smrganpb3fndaj2lon4.json", "r") as f:
+    with open("tests/files/release_hsmo6p4smrganpb3fndaj2lon4.json") as f:
         release = entity_from_json(f.read(), ReleaseEntity)
 
     obj = es_release_from_release(release)
@@ -20,7 +20,7 @@ def test_es_release_from_release() -> None:
 
 
 def test_es_biblio_from_release() -> None:
-    with open("tests/files/release_hsmo6p4smrganpb3fndaj2lon4.json", "r") as f:
+    with open("tests/files/release_hsmo6p4smrganpb3fndaj2lon4.json") as f:
         release = entity_from_json(f.read(), ReleaseEntity)
 
     obj = es_biblio_from_release(release)
@@ -36,16 +36,16 @@ def test_es_biblio_from_release() -> None:
 
 
 def test_run_refs() -> None:
-    with open("tests/files/work_iarm6swodra2bcrzhxrfaah7py_bundle.json", "r") as f:
+    with open("tests/files/work_iarm6swodra2bcrzhxrfaah7py_bundle.json") as f:
         run_refs(f.readlines())
 
 
 def test_run_transform() -> None:
-    with open("tests/files/work_iarm6swodra2bcrzhxrfaah7py_bundle.json", "r") as f:
+    with open("tests/files/work_iarm6swodra2bcrzhxrfaah7py_bundle.json") as f:
         run_transform(f.readlines())
 
-    with open("tests/files/sim_page_bundle.json", "r") as f:
+    with open("tests/files/sim_page_bundle.json") as f:
         run_transform(f.readlines())
diff --git a/tests/test_work_pipeline.py b/tests/test_work_pipeline.py
index e0e4a82..39b09dc 100644
--- a/tests/test_work_pipeline.py
+++ b/tests/test_work_pipeline.py
@@ -84,5 +84,5 @@ def test_run_transform(mocker: Any) -> None:
         ),
     )
 
-    with open("tests/files/release_hsmo6p4smrganpb3fndaj2lon4_sans.json", "r") as f:
+    with open("tests/files/release_hsmo6p4smrganpb3fndaj2lon4_sans.json") as f:
         wp.run_releases(f.readlines())