Diffstat (limited to 'fatcat_scholar')
-rw-r--r--  fatcat_scholar/config.py           |  5
-rw-r--r--  fatcat_scholar/kafka.py            | 14
-rw-r--r--  fatcat_scholar/query_citation.py   |  5
-rw-r--r--  fatcat_scholar/query_fatcat.py     |  8
-rw-r--r--  fatcat_scholar/sandcrawler.py      | 17
-rw-r--r--  fatcat_scholar/schema.py           |  1
-rw-r--r--  fatcat_scholar/search.py           | 32
-rw-r--r--  fatcat_scholar/transform.py        | 13
-rw-r--r--  fatcat_scholar/web.py              | 26
-rw-r--r--  fatcat_scholar/work_pipeline.py    | 22
-rw-r--r--  fatcat_scholar/worker.py           | 11
11 files changed, 119 insertions, 35 deletions
diff --git a/fatcat_scholar/config.py b/fatcat_scholar/config.py
index ddb2844..86870bc 100644
--- a/fatcat_scholar/config.py
+++ b/fatcat_scholar/config.py
@@ -2,7 +2,10 @@ import subprocess
from dynaconf import Dynaconf
-settings = Dynaconf(settings_file="settings.toml", environments=True,)
+settings = Dynaconf(
+ settings_file="settings.toml",
+ environments=True,
+)
GIT_REVISION = (
subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
diff --git a/fatcat_scholar/kafka.py b/fatcat_scholar/kafka.py
index 71067c1..9fd43cf 100644
--- a/fatcat_scholar/kafka.py
+++ b/fatcat_scholar/kafka.py
@@ -103,7 +103,8 @@ class KafkaWorker:
# check for partition-specific commit errors
if p.error:
print(
- f"Kafka consumer commit error: {p.error}", file=sys.stderr,
+ f"Kafka consumer commit error: {p.error}",
+ file=sys.stderr,
)
raise KafkaException(p.error)
@@ -118,12 +119,16 @@ class KafkaWorker:
# user code timeout; if no poll after this long, assume user code
# hung and rebalance (default: 6min)
"max.poll.interval.ms": 360000,
- "default.topic.config": {"auto.offset.reset": "latest",},
+ "default.topic.config": {
+ "auto.offset.reset": "latest",
+ },
}
consumer = Consumer(config)
consumer.subscribe(
- consume_topics, on_assign=_on_rebalance, on_revoke=_on_rebalance,
+ consume_topics,
+ on_assign=_on_rebalance,
+ on_revoke=_on_rebalance,
)
print(
f"Consuming from kafka topics {consume_topics}, group {consumer_group}",
@@ -161,7 +166,8 @@ class KafkaWorker:
while True:
batch = self.consumer.consume(
- num_messages=self.batch_size, timeout=self.poll_interval_sec,
+ num_messages=self.batch_size,
+ timeout=self.poll_interval_sec,
)
print(
diff --git a/fatcat_scholar/query_citation.py b/fatcat_scholar/query_citation.py
index 3f741f0..6cc9086 100644
--- a/fatcat_scholar/query_citation.py
+++ b/fatcat_scholar/query_citation.py
@@ -31,7 +31,10 @@ def grobid_process_citation(
try:
grobid_response = requests.post(
grobid_host + "/api/processCitation",
- data={"citations": raw, "consolidateCitations": 0,},
+ data={
+ "citations": raw,
+ "consolidateCitations": 0,
+ },
timeout=timeout,
)
except requests.Timeout:
diff --git a/fatcat_scholar/query_fatcat.py b/fatcat_scholar/query_fatcat.py
index 45c7e47..b63d834 100644
--- a/fatcat_scholar/query_fatcat.py
+++ b/fatcat_scholar/query_fatcat.py
@@ -84,7 +84,9 @@ def run_query_fatcat(query: str, fulltext_only: bool, json_output: Any) -> None:
)
resp.raise_for_status()
row = dict(
- fatcat_hit=hit.meta._d_, release_id=release_id, fatcat_release=resp.json(),
+ fatcat_hit=hit.meta._d_,
+ release_id=release_id,
+ fatcat_release=resp.json(),
)
print(json.dumps(row, sort_keys=True), file=json_output)
@@ -100,7 +102,9 @@ def main() -> None:
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
- "query", help="base query string to use", type=str,
+ "query",
+ help="base query string to use",
+ type=str,
)
parser.add_argument(
"--fulltext-only",
diff --git a/fatcat_scholar/sandcrawler.py b/fatcat_scholar/sandcrawler.py
index 9f9f7e4..087cdc6 100644
--- a/fatcat_scholar/sandcrawler.py
+++ b/fatcat_scholar/sandcrawler.py
@@ -69,7 +69,10 @@ class SandcrawlerMinioClient:
secret_key=os.environ['MINIO_SECRET_KEY'],
"""
self.mc = minio.Minio(
- host_url, access_key=access_key, secret_key=secret_key, secure=False,
+ host_url,
+ access_key=access_key,
+ secret_key=secret_key,
+ secure=False,
)
self.default_bucket = default_bucket
@@ -80,7 +83,12 @@ class SandcrawlerMinioClient:
prefix = ""
assert len(sha1hex) == 40
obj_path = "{}{}/{}/{}/{}{}".format(
- prefix, folder, sha1hex[0:2], sha1hex[2:4], sha1hex, extension,
+ prefix,
+ folder,
+ sha1hex[0:2],
+ sha1hex[2:4],
+ sha1hex,
+ extension,
)
return obj_path
@@ -101,6 +109,9 @@ class SandcrawlerMinioClient:
if not bucket:
bucket = self.default_bucket
assert bucket
- blob = self.mc.get_object(bucket, obj_path,)
+ blob = self.mc.get_object(
+ bucket,
+ obj_path,
+ )
# TODO: optionally verify SHA-1?
return blob.data
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index 4230b7e..633d30b 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -23,7 +23,6 @@ from fatcat_scholar.biblio_hacks import doi_link_domain
# pytype: enable=import-error
-
class DocType(str, Enum):
work = "work"
sim_page = "sim_page"
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py
index 6522fe3..7bb7424 100644
--- a/fatcat_scholar/search.py
+++ b/fatcat_scholar/search.py
@@ -156,9 +156,21 @@ def apply_filters(search: Search, query: FulltextQuery) -> Search:
"terms", type=["article-journal", "paper-conference", "chapter", "article"]
)
elif query.filter_type == "reports":
- search = search.filter("terms", type=["report", "standard",])
+ search = search.filter(
+ "terms",
+ type=[
+ "report",
+ "standard",
+ ],
+ )
elif query.filter_type == "datasets":
- search = search.filter("terms", type=["dataset", "software",])
+ search = search.filter(
+ "terms",
+ type=[
+ "dataset",
+ "software",
+ ],
+ )
elif query.filter_type == "everything":
pass
else:
@@ -291,7 +303,10 @@ def do_fulltext_search(
search = search.extra(
collapse={
"field": "collapse_key",
- "inner_hits": {"name": "more_pages", "size": 0,},
+ "inner_hits": {
+ "name": "more_pages",
+ "size": 0,
+ },
}
)
@@ -309,7 +324,11 @@ def do_fulltext_search(
allow_leading_wildcard=False,
lenient=True,
quote_field_suffix=".exact",
- fields=["title^4", "biblio_all^3", "everything",],
+ fields=[
+ "title^4",
+ "biblio_all^3",
+ "everything",
+ ],
)
has_fulltext = Q("terms", **{"access_type": ["ia_sim", "ia_file", "wayback"]})
poor_metadata = Q(
@@ -334,7 +353,10 @@ def do_fulltext_search(
search = search.sort("_doc")
else:
search = search.query(
- "boosting", positive=base_query, negative=poor_metadata, negative_boost=0.5,
+ "boosting",
+ positive=base_query,
+ negative=poor_metadata,
+ negative_boost=0.5,
)
# simplified version of basic_fulltext query, for highlighting
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index 7264540..db631cf 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -246,13 +246,18 @@ def es_fulltext_from_pdftotext(
if raw_text and len(raw_text) > MAX_BODY_CHARS:
raw_text = raw_text[:MAX_BODY_CHARS]
ret = ScholarFulltext(
- lang_code=re.language, body=raw_text, acknowledgement=None, annex=None,
+ lang_code=re.language,
+ body=raw_text,
+ acknowledgement=None,
+ annex=None,
)
return _add_file_release_meta(ret, pdf_meta, re, fe)
def es_fulltext_from_html(
- html_fulltext: Dict[str, Any], re: ReleaseEntity, wc: WebcaptureEntity,
+ html_fulltext: Dict[str, Any],
+ re: ReleaseEntity,
+ wc: WebcaptureEntity,
) -> Optional[ScholarFulltext]:
if not wc.archive_urls or not html_fulltext.get("tei_xml"):
@@ -546,7 +551,9 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
if f.ident == heavy.html_fulltext["webcapture_ident"]
][0]
html_fulltext = es_fulltext_from_html(
- heavy.html_fulltext, fulltext_release, fulltext_webcapture,
+ heavy.html_fulltext,
+ fulltext_release,
+ fulltext_webcapture,
)
if exclude_web_fulltext and html_fulltext:
fulltext = html_fulltext.remove_access()
diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py
index e835c01..4e21ecd 100644
--- a/fatcat_scholar/web.py
+++ b/fatcat_scholar/web.py
@@ -79,7 +79,8 @@ class LangPrefix:
# first try to parse a language code from header
try:
accept_code = parse_accept_lang(
- request.headers.get("accept-language", ""), I18N_LANG_OPTIONS,
+ request.headers.get("accept-language", ""),
+ I18N_LANG_OPTIONS,
)
if accept_code:
self.code = accept_code
@@ -215,14 +216,18 @@ def load_i18n_templates() -> Any:
d = dict()
for lang_opt in I18N_LANG_OPTIONS:
translations = babel.support.Translations.load(
- dirname="fatcat_scholar/translations", locales=[lang_opt],
+ dirname="fatcat_scholar/translations",
+ locales=[lang_opt],
)
templates = Jinja2Templates(
- directory="fatcat_scholar/templates", extensions=["jinja2.ext.i18n"],
+ directory="fatcat_scholar/templates",
+ extensions=["jinja2.ext.i18n"],
)
templates.env.install_gettext_translations(translations, newstyle=True) # type: ignore
templates.env.install_gettext_callables( # type: ignore
- locale_gettext(translations), locale_ngettext(translations), newstyle=True,
+ locale_gettext(translations),
+ locale_ngettext(translations),
+ newstyle=True,
)
# remove a lot of whitespace in HTML output with these configs
templates.env.trim_blocks = True
@@ -375,7 +380,8 @@ def access_redirect_fallback(
if work_entity.redirect:
work_ident = work_entity.redirect
partial_releases = api_client.get_work_releases(
- ident=work_ident, hide="abstracts,references",
+ ident=work_ident,
+ hide="abstracts,references",
)
except fatcat_openapi_client.ApiException as ae:
raise HTTPException(
@@ -446,7 +452,10 @@ def access_redirect_wayback(
raw_original_url = "/".join(str(request.url).split("/")[7:])
# the quote() call is necessary because the URL is un-encoded in the path parameter
# see also: https://github.com/encode/starlette/commit/f997938916d20e955478f60406ef9d293236a16d
- original_url = urllib.parse.quote(raw_original_url, safe=":/%#?=@[]!$&'()*+,;",)
+ original_url = urllib.parse.quote(
+ raw_original_url,
+ safe=":/%#?=@[]!$&'()*+,;",
+ )
doc_dict = get_es_scholar_doc(f"work_{work_ident}")
if not doc_dict:
return access_redirect_fallback(
@@ -580,7 +589,10 @@ async def http_exception_handler(request: Request, exc: StarletteHTTPException)
resp: Dict[str, Any] = {"status_code": exc.status_code}
if exc.detail:
resp["detail"] = exc.detail
- return JSONResponse(status_code=exc.status_code, content=resp,)
+ return JSONResponse(
+ status_code=exc.status_code,
+ content=resp,
+ )
# configure middleware
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index 54e1ed3..92b0943 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -122,7 +122,9 @@ class WorkPipeline:
print(f"seaweedfs failure: sha1hex={fe.sha1}", file=sys.stderr)
return None
return dict(
- tei_xml=grobid_xml, release_ident=release_ident, file_ident=fe.ident,
+ tei_xml=grobid_xml,
+ release_ident=release_ident,
+ file_ident=fe.ident,
)
def fetch_pdf_meta(
@@ -144,7 +146,9 @@ class WorkPipeline:
if not pdf_meta or pdf_meta["status"] != "success":
return None
return dict(
- pdf_meta=pdf_meta, release_ident=release_ident, file_ident=fe.ident,
+ pdf_meta=pdf_meta,
+ release_ident=release_ident,
+ file_ident=fe.ident,
)
def fetch_file_pdftotext(self, fe: FileEntity, release_ident: str) -> Optional[Any]:
@@ -173,11 +177,15 @@ class WorkPipeline:
print(f"seaweedfs failure: sha1hex={fe.sha1}", file=sys.stderr)
return None
return dict(
- raw_text=raw_text, release_ident=release_ident, file_ident=fe.ident,
+ raw_text=raw_text,
+ release_ident=release_ident,
+ file_ident=fe.ident,
)
def fetch_webcapture_html_fulltext(
- self, wc: WebcaptureEntity, release_ident: str,
+ self,
+ wc: WebcaptureEntity,
+ release_ident: str,
) -> Optional[Dict[str, Any]]:
primary_resources = [cdx for cdx in wc.cdx if cdx.url == wc.original_url]
@@ -240,7 +248,11 @@ class WorkPipeline:
crossref_meta = self.sandcrawler_db_client.get_crossref(doi)
if not crossref_meta or not crossref_meta.get("record"):
return None
- return dict(release_ident=re.ident, doi=doi, record=crossref_meta["record"],)
+ return dict(
+ release_ident=re.ident,
+ doi=doi,
+ record=crossref_meta["record"],
+ )
def lookup_sim(self, release: ReleaseEntity) -> Optional[SimIssueRow]:
"""
diff --git a/fatcat_scholar/worker.py b/fatcat_scholar/worker.py
index b836d7c..ef1a8c7 100644
--- a/fatcat_scholar/worker.py
+++ b/fatcat_scholar/worker.py
@@ -47,7 +47,8 @@ class FetchDocsWorker(KafkaWorker):
key = msg["key"]
if key.startswith("work_") and msg.get("work_ident"):
stubs = self.fatcat_api.get_work_releases(
- ident=msg["work_ident"], hide="abstracts,references",
+ ident=msg["work_ident"],
+ hide="abstracts,references",
)
full_releases = []
for r in stubs:
@@ -171,10 +172,14 @@ def main() -> None:
type=str,
)
- sub = subparsers.add_parser("fetch-docs-worker",)
+ sub = subparsers.add_parser(
+ "fetch-docs-worker",
+ )
sub.set_defaults(worker="fetch-docs-worker")
- sub = subparsers.add_parser("index-docs-worker",)
+ sub = subparsers.add_parser(
+ "index-docs-worker",
+ )
sub.set_defaults(worker="index-docs-worker")
args = parser.parse_args()