Diffstat (limited to 'fatcat_scholar')
-rw-r--r--  fatcat_scholar/config.py          5
-rw-r--r--  fatcat_scholar/kafka.py          14
-rw-r--r--  fatcat_scholar/query_citation.py  5
-rw-r--r--  fatcat_scholar/query_fatcat.py    8
-rw-r--r--  fatcat_scholar/sandcrawler.py    17
-rw-r--r--  fatcat_scholar/schema.py          1
-rw-r--r--  fatcat_scholar/search.py         32
-rw-r--r--  fatcat_scholar/transform.py      13
-rw-r--r--  fatcat_scholar/web.py            26
-rw-r--r--  fatcat_scholar/work_pipeline.py  22
-rw-r--r--  fatcat_scholar/worker.py         11
11 files changed, 119 insertions, 35 deletions
diff --git a/fatcat_scholar/config.py b/fatcat_scholar/config.py
index ddb2844..86870bc 100644
--- a/fatcat_scholar/config.py
+++ b/fatcat_scholar/config.py
@@ -2,7 +2,10 @@ import subprocess
 
 from dynaconf import Dynaconf
 
-settings = Dynaconf(settings_file="settings.toml", environments=True,)
+settings = Dynaconf(
+    settings_file="settings.toml",
+    environments=True,
+)
 
 GIT_REVISION = (
     subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
diff --git a/fatcat_scholar/kafka.py b/fatcat_scholar/kafka.py
index 71067c1..9fd43cf 100644
--- a/fatcat_scholar/kafka.py
+++ b/fatcat_scholar/kafka.py
@@ -103,7 +103,8 @@ class KafkaWorker:
                 # check for partition-specific commit errors
                 if p.error:
                     print(
-                        f"Kafka consumer commit error: {p.error}", file=sys.stderr,
+                        f"Kafka consumer commit error: {p.error}",
+                        file=sys.stderr,
                     )
                     raise KafkaException(p.error)
 
@@ -118,12 +119,16 @@ class KafkaWorker:
             # user code timeout; if no poll after this long, assume user code
             # hung and rebalance (default: 6min)
             "max.poll.interval.ms": 360000,
-            "default.topic.config": {"auto.offset.reset": "latest",},
+            "default.topic.config": {
+                "auto.offset.reset": "latest",
+            },
         }
 
         consumer = Consumer(config)
         consumer.subscribe(
-            consume_topics, on_assign=_on_rebalance, on_revoke=_on_rebalance,
+            consume_topics,
+            on_assign=_on_rebalance,
+            on_revoke=_on_rebalance,
         )
         print(
             f"Consuming from kafka topics {consume_topics}, group {consumer_group}",
@@ -161,7 +166,8 @@ class KafkaWorker:
 
         while True:
             batch = self.consumer.consume(
-                num_messages=self.batch_size, timeout=self.poll_interval_sec,
+                num_messages=self.batch_size,
+                timeout=self.poll_interval_sec,
             )
 
             print(
diff --git a/fatcat_scholar/query_citation.py b/fatcat_scholar/query_citation.py
index 3f741f0..6cc9086 100644
--- a/fatcat_scholar/query_citation.py
+++ b/fatcat_scholar/query_citation.py
@@ -31,7 +31,10 @@ def grobid_process_citation(
     try:
         grobid_response = requests.post(
             grobid_host + "/api/processCitation",
-            data={"citations": raw, "consolidateCitations": 0,},
+            data={
+                "citations": raw,
+                "consolidateCitations": 0,
+            },
             timeout=timeout,
         )
     except requests.Timeout:
diff --git a/fatcat_scholar/query_fatcat.py b/fatcat_scholar/query_fatcat.py
index 45c7e47..b63d834 100644
--- a/fatcat_scholar/query_fatcat.py
+++ b/fatcat_scholar/query_fatcat.py
@@ -84,7 +84,9 @@ def run_query_fatcat(query: str, fulltext_only: bool, json_output: Any) -> None:
         )
         resp.raise_for_status()
         row = dict(
-            fatcat_hit=hit.meta._d_, release_id=release_id, fatcat_release=resp.json(),
+            fatcat_hit=hit.meta._d_,
+            release_id=release_id,
+            fatcat_release=resp.json(),
         )
         print(json.dumps(row, sort_keys=True), file=json_output)
 
@@ -100,7 +102,9 @@ def main() -> None:
         formatter_class=argparse.ArgumentDefaultsHelpFormatter
     )
     parser.add_argument(
-        "query", help="base query string to use", type=str,
+        "query",
+        help="base query string to use",
+        type=str,
     )
     parser.add_argument(
         "--fulltext-only",
diff --git a/fatcat_scholar/sandcrawler.py b/fatcat_scholar/sandcrawler.py
index 9f9f7e4..087cdc6 100644
--- a/fatcat_scholar/sandcrawler.py
+++ b/fatcat_scholar/sandcrawler.py
@@ -69,7 +69,10 @@ class SandcrawlerMinioClient:
             secret_key=os.environ['MINIO_SECRET_KEY'],
         """
         self.mc = minio.Minio(
-            host_url, access_key=access_key, secret_key=secret_key, secure=False,
+            host_url,
+            access_key=access_key,
+            secret_key=secret_key,
+            secure=False,
         )
         self.default_bucket = default_bucket
 
@@ -80,7 +83,12 @@ class SandcrawlerMinioClient:
             prefix = ""
         assert len(sha1hex) == 40
         obj_path = "{}{}/{}/{}/{}{}".format(
-            prefix, folder, sha1hex[0:2], sha1hex[2:4], sha1hex, extension,
+            prefix,
+            folder,
+            sha1hex[0:2],
+            sha1hex[2:4],
+            sha1hex,
+            extension,
         )
         return obj_path
 
@@ -101,6 +109,9 @@ class SandcrawlerMinioClient:
         if not bucket:
             bucket = self.default_bucket
         assert bucket
-        blob = self.mc.get_object(bucket, obj_path,)
+        blob = self.mc.get_object(
+            bucket,
+            obj_path,
+        )
         # TODO: optionally verify SHA-1?
         return blob.data
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index 4230b7e..633d30b 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -23,7 +23,6 @@ from fatcat_scholar.biblio_hacks import doi_link_domain
 
 # pytype: enable=import-error
 
-
 class DocType(str, Enum):
     work = "work"
     sim_page = "sim_page"
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py
index 6522fe3..7bb7424 100644
--- a/fatcat_scholar/search.py
+++ b/fatcat_scholar/search.py
@@ -156,9 +156,21 @@ def apply_filters(search: Search, query: FulltextQuery) -> Search:
             "terms", type=["article-journal", "paper-conference", "chapter", "article"]
         )
     elif query.filter_type == "reports":
-        search = search.filter("terms", type=["report", "standard",])
+        search = search.filter(
+            "terms",
+            type=[
+                "report",
+                "standard",
+            ],
+        )
     elif query.filter_type == "datasets":
-        search = search.filter("terms", type=["dataset", "software",])
+        search = search.filter(
+            "terms",
+            type=[
+                "dataset",
+                "software",
+            ],
+        )
     elif query.filter_type == "everything":
         pass
     else:
@@ -291,7 +303,10 @@ def do_fulltext_search(
         search = search.extra(
             collapse={
                 "field": "collapse_key",
-                "inner_hits": {"name": "more_pages", "size": 0,},
+                "inner_hits": {
+                    "name": "more_pages",
+                    "size": 0,
+                },
             }
         )
 
@@ -309,7 +324,11 @@ def do_fulltext_search(
         allow_leading_wildcard=False,
         lenient=True,
         quote_field_suffix=".exact",
-        fields=["title^4", "biblio_all^3", "everything",],
+        fields=[
+            "title^4",
+            "biblio_all^3",
+            "everything",
+        ],
     )
     has_fulltext = Q("terms", **{"access_type": ["ia_sim", "ia_file", "wayback"]})
     poor_metadata = Q(
@@ -334,7 +353,10 @@ def do_fulltext_search(
         search = search.sort("_doc")
     else:
         search = search.query(
-            "boosting", positive=base_query, negative=poor_metadata, negative_boost=0.5,
+            "boosting",
+            positive=base_query,
+            negative=poor_metadata,
+            negative_boost=0.5,
         )
 
     # simplified version of basic_fulltext query, for highlighting
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index 7264540..db631cf 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -246,13 +246,18 @@ def es_fulltext_from_pdftotext(
     if raw_text and len(raw_text) > MAX_BODY_CHARS:
         raw_text = raw_text[:MAX_BODY_CHARS]
     ret = ScholarFulltext(
-        lang_code=re.language, body=raw_text, acknowledgement=None, annex=None,
+        lang_code=re.language,
+        body=raw_text,
+        acknowledgement=None,
+        annex=None,
     )
     return _add_file_release_meta(ret, pdf_meta, re, fe)
 
 
 def es_fulltext_from_html(
-    html_fulltext: Dict[str, Any], re: ReleaseEntity, wc: WebcaptureEntity,
+    html_fulltext: Dict[str, Any],
+    re: ReleaseEntity,
+    wc: WebcaptureEntity,
 ) -> Optional[ScholarFulltext]:
 
     if not wc.archive_urls or not html_fulltext.get("tei_xml"):
@@ -546,7 +551,9 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
                 if f.ident == heavy.html_fulltext["webcapture_ident"]
             ][0]
             html_fulltext = es_fulltext_from_html(
-                heavy.html_fulltext, fulltext_release, fulltext_webcapture,
+                heavy.html_fulltext,
+                fulltext_release,
+                fulltext_webcapture,
             )
             if exclude_web_fulltext and html_fulltext:
                 fulltext = html_fulltext.remove_access()
diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py
index e835c01..4e21ecd 100644
--- a/fatcat_scholar/web.py
+++ b/fatcat_scholar/web.py
@@ -79,7 +79,8 @@ class LangPrefix:
         # first try to parse a language code from header
        try:
            accept_code = parse_accept_lang(
-                request.headers.get("accept-language", ""), I18N_LANG_OPTIONS,
+                request.headers.get("accept-language", ""),
+                I18N_LANG_OPTIONS,
            )
            if accept_code:
                self.code = accept_code
@@ -215,14 +216,18 @@ def load_i18n_templates() -> Any:
     d = dict()
     for lang_opt in I18N_LANG_OPTIONS:
         translations = babel.support.Translations.load(
-            dirname="fatcat_scholar/translations", locales=[lang_opt],
+            dirname="fatcat_scholar/translations",
+            locales=[lang_opt],
         )
         templates = Jinja2Templates(
-            directory="fatcat_scholar/templates", extensions=["jinja2.ext.i18n"],
+            directory="fatcat_scholar/templates",
+            extensions=["jinja2.ext.i18n"],
         )
         templates.env.install_gettext_translations(translations, newstyle=True)  # type: ignore
         templates.env.install_gettext_callables(  # type: ignore
-            locale_gettext(translations), locale_ngettext(translations), newstyle=True,
+            locale_gettext(translations),
+            locale_ngettext(translations),
+            newstyle=True,
         )
         # remove a lot of whitespace in HTML output with these configs
         templates.env.trim_blocks = True
@@ -375,7 +380,8 @@ def access_redirect_fallback(
         if work_entity.redirect:
             work_ident = work_entity.redirect
         partial_releases = api_client.get_work_releases(
-            ident=work_ident, hide="abstracts,references",
+            ident=work_ident,
+            hide="abstracts,references",
         )
     except fatcat_openapi_client.ApiException as ae:
         raise HTTPException(
@@ -446,7 +452,10 @@ def access_redirect_wayback(
     raw_original_url = "/".join(str(request.url).split("/")[7:])
     # the quote() call is necessary because the URL is un-encoded in the path parameter
     # see also: https://github.com/encode/starlette/commit/f997938916d20e955478f60406ef9d293236a16d
-    original_url = urllib.parse.quote(raw_original_url, safe=":/%#?=@[]!$&'()*+,;",)
+    original_url = urllib.parse.quote(
+        raw_original_url,
+        safe=":/%#?=@[]!$&'()*+,;",
+    )
     doc_dict = get_es_scholar_doc(f"work_{work_ident}")
     if not doc_dict:
         return access_redirect_fallback(
@@ -580,7 +589,10 @@ async def http_exception_handler(request: Request, exc: StarletteHTTPException)
     resp: Dict[str, Any] = {"status_code": exc.status_code}
     if exc.detail:
         resp["detail"] = exc.detail
-    return JSONResponse(status_code=exc.status_code, content=resp,)
+    return JSONResponse(
+        status_code=exc.status_code,
+        content=resp,
+    )
 
 
 # configure middleware
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index 54e1ed3..92b0943 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -122,7 +122,9 @@ class WorkPipeline:
             print(f"seaweedfs failure: sha1hex={fe.sha1}", file=sys.stderr)
             return None
         return dict(
-            tei_xml=grobid_xml, release_ident=release_ident, file_ident=fe.ident,
+            tei_xml=grobid_xml,
+            release_ident=release_ident,
+            file_ident=fe.ident,
         )
 
     def fetch_pdf_meta(
@@ -144,7 +146,9 @@ class WorkPipeline:
         if not pdf_meta or pdf_meta["status"] != "success":
             return None
         return dict(
-            pdf_meta=pdf_meta, release_ident=release_ident, file_ident=fe.ident,
+            pdf_meta=pdf_meta,
+            release_ident=release_ident,
+            file_ident=fe.ident,
         )
 
     def fetch_file_pdftotext(self, fe: FileEntity, release_ident: str) -> Optional[Any]:
@@ -173,11 +177,15 @@ class WorkPipeline:
             print(f"seaweedfs failure: sha1hex={fe.sha1}", file=sys.stderr)
             return None
         return dict(
-            raw_text=raw_text, release_ident=release_ident, file_ident=fe.ident,
+            raw_text=raw_text,
+            release_ident=release_ident,
+            file_ident=fe.ident,
         )
 
     def fetch_webcapture_html_fulltext(
-        self, wc: WebcaptureEntity, release_ident: str,
+        self,
+        wc: WebcaptureEntity,
+        release_ident: str,
     ) -> Optional[Dict[str, Any]]:
 
         primary_resources = [cdx for cdx in wc.cdx if cdx.url == wc.original_url]
@@ -240,7 +248,11 @@ class WorkPipeline:
         crossref_meta = self.sandcrawler_db_client.get_crossref(doi)
         if not crossref_meta or not crossref_meta.get("record"):
             return None
-        return dict(release_ident=re.ident, doi=doi, record=crossref_meta["record"],)
+        return dict(
+            release_ident=re.ident,
+            doi=doi,
+            record=crossref_meta["record"],
+        )
 
     def lookup_sim(self, release: ReleaseEntity) -> Optional[SimIssueRow]:
         """
diff --git a/fatcat_scholar/worker.py b/fatcat_scholar/worker.py
index b836d7c..ef1a8c7 100644
--- a/fatcat_scholar/worker.py
+++ b/fatcat_scholar/worker.py
@@ -47,7 +47,8 @@ class FetchDocsWorker(KafkaWorker):
         key = msg["key"]
         if key.startswith("work_") and msg.get("work_ident"):
             stubs = self.fatcat_api.get_work_releases(
-                ident=msg["work_ident"], hide="abstracts,references",
+                ident=msg["work_ident"],
+                hide="abstracts,references",
             )
             full_releases = []
             for r in stubs:
@@ -171,10 +172,14 @@ def main() -> None:
         type=str,
     )
 
-    sub = subparsers.add_parser("fetch-docs-worker",)
+    sub = subparsers.add_parser(
+        "fetch-docs-worker",
+    )
     sub.set_defaults(worker="fetch-docs-worker")
 
-    sub = subparsers.add_parser("index-docs-worker",)
+    sub = subparsers.add_parser(
+        "index-docs-worker",
+    )
     sub.set_defaults(worker="index-docs-worker")
 
     args = parser.parse_args()