about | summary | refs | log | tree | commit | diff | stats
path: root/fatcat_scholar
diff options
context:
space:
mode:
Diffstat (limited to 'fatcat_scholar')
-rw-r--r-- fatcat_scholar/sandcrawler.py | 4
-rw-r--r-- fatcat_scholar/search.py | 6
-rw-r--r-- fatcat_scholar/transform.py | 21
-rw-r--r-- fatcat_scholar/work_pipeline.py | 4
4 files changed, 22 insertions, 13 deletions
diff --git a/fatcat_scholar/sandcrawler.py b/fatcat_scholar/sandcrawler.py
index 9c48cd9..25c7002 100644
--- a/fatcat_scholar/sandcrawler.py
+++ b/fatcat_scholar/sandcrawler.py
@@ -17,7 +17,9 @@ class SandcrawlerPostgrestClient:
return None
def get_pdf_meta(self, sha1: str) -> Optional[Dict[str, Any]]:
- resp = requests.get(self.api_url + "/pdf_meta", params=dict(sha1hex="eq." + sha1))
+ resp = requests.get(
+ self.api_url + "/pdf_meta", params=dict(sha1hex="eq." + sha1)
+ )
resp.raise_for_status()
resp_json = resp.json()
if resp_json:
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py
index a2e19bc..0985081 100644
--- a/fatcat_scholar/search.py
+++ b/fatcat_scholar/search.py
@@ -207,7 +207,7 @@ def do_fulltext_search(
number_of_fragments=2,
fragment_size=300,
# TODO: this will fix highlight encoding, but requires ES 7.x
- #encoder="html",
+ # encoder="html",
)
# sort order
@@ -270,8 +270,8 @@ def do_fulltext_search(
if type(h[key]) is str:
h[key] = h[key].encode("utf8", "ignore").decode("utf8")
# ensure collapse_key is a single value, not an array
- if type(h['collapse_key']) == list:
- h['collapse_key'] = h['collapse_key'][0]
+ if type(h["collapse_key"]) == list:
+ h["collapse_key"] = h["collapse_key"][0]
count_found: int = int(resp.hits.total)
count_returned = len(results)
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index c219528..ac80efc 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -104,10 +104,12 @@ def es_biblio_from_sim(sim: Dict[str, Any]) -> ScholarBiblio:
# original_title=release.original_title,
release_date=release_date,
release_year=release_year,
- release_type=SIM_RELEASE_TYPE_MAP.get(issue_meta.get("pub_type")) or SIM_RELEASE_TYPE_MAP.get(pub_meta.get("pub_type")),
+ release_type=SIM_RELEASE_TYPE_MAP.get(issue_meta.get("pub_type"))
+ or SIM_RELEASE_TYPE_MAP.get(pub_meta.get("pub_type")),
release_stage="published", # as a default
# withdrawn_status=release.withdrawn_status,
- lang_code=SIM_LANG_MAP.get(issue_meta.get("language")) or SIM_LANG_MAP.get(pub_meta.get("language")),
+ lang_code=SIM_LANG_MAP.get(issue_meta.get("language"))
+ or SIM_LANG_MAP.get(pub_meta.get("language")),
country_code=SIM_COUNTRY_MAP.get(pub_meta.get("country")),
volume=volume,
volume_int=volume_int,
@@ -133,7 +135,10 @@ def es_biblio_from_sim(sim: Dict[str, Any]) -> ScholarBiblio:
def _add_file_release_meta(
- fulltext: ScholarFulltext, pdf_meta: Optional[dict], re: ReleaseEntity, fe: FileEntity
+ fulltext: ScholarFulltext,
+ pdf_meta: Optional[dict],
+ re: ReleaseEntity,
+ fe: FileEntity,
) -> ScholarFulltext:
best_url = None
best_url_type = None
@@ -181,10 +186,7 @@ def es_fulltext_from_pdftotext(
) -> Optional[ScholarFulltext]:
ret = ScholarFulltext(
- lang_code=re.language,
- body=raw_text,
- acknowledgement=None,
- annex=None,
+ lang_code=re.language, body=raw_text, acknowledgement=None, annex=None,
)
return _add_file_release_meta(ret, pdf_meta, re, fe)
@@ -257,7 +259,10 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
if f.ident == heavy.pdftotext_fulltext["file_ident"]
][0]
fulltext = es_fulltext_from_pdftotext(
- heavy.pdftotext_fulltext["raw_text"], heavy.pdf_meta, fulltext_release, fulltext_file
+ heavy.pdftotext_fulltext["raw_text"],
+ heavy.pdf_meta,
+ fulltext_release,
+ fulltext_file,
)
# TODO: additional access list
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index e3a0d8d..17a0f7a 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -119,7 +119,9 @@ class WorkPipeline:
tei_xml=grobid_xml, release_ident=release_ident, file_ident=fe.ident,
)
- def fetch_pdf_meta(self, fe: FileEntity, release_ident: str) -> Optional[Dict[str, Any]]:
+ def fetch_pdf_meta(
+ self, fe: FileEntity, release_ident: str
+ ) -> Optional[Dict[str, Any]]:
"""
Fetches pdftext metadata from sandcrawler-db via postgrest HTTP
interface.