diff options
Diffstat (limited to 'fatcat_scholar')
-rw-r--r-- | fatcat_scholar/sandcrawler.py | 4 | ||||
-rw-r--r-- | fatcat_scholar/search.py | 6 | ||||
-rw-r--r-- | fatcat_scholar/transform.py | 21 | ||||
-rw-r--r-- | fatcat_scholar/work_pipeline.py | 4 |
4 files changed, 22 insertions, 13 deletions
diff --git a/fatcat_scholar/sandcrawler.py b/fatcat_scholar/sandcrawler.py index 9c48cd9..25c7002 100644 --- a/fatcat_scholar/sandcrawler.py +++ b/fatcat_scholar/sandcrawler.py @@ -17,7 +17,9 @@ class SandcrawlerPostgrestClient: return None def get_pdf_meta(self, sha1: str) -> Optional[Dict[str, Any]]: - resp = requests.get(self.api_url + "/pdf_meta", params=dict(sha1hex="eq." + sha1)) + resp = requests.get( + self.api_url + "/pdf_meta", params=dict(sha1hex="eq." + sha1) + ) resp.raise_for_status() resp_json = resp.json() if resp_json: diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py index a2e19bc..0985081 100644 --- a/fatcat_scholar/search.py +++ b/fatcat_scholar/search.py @@ -207,7 +207,7 @@ def do_fulltext_search( number_of_fragments=2, fragment_size=300, # TODO: this will fix highlight encoding, but requires ES 7.x - #encoder="html", + # encoder="html", ) # sort order @@ -270,8 +270,8 @@ def do_fulltext_search( if type(h[key]) is str: h[key] = h[key].encode("utf8", "ignore").decode("utf8") # ensure collapse_key is a single value, not an array - if type(h['collapse_key']) == list: - h['collapse_key'] = h['collapse_key'][0] + if type(h["collapse_key"]) == list: + h["collapse_key"] = h["collapse_key"][0] count_found: int = int(resp.hits.total) count_returned = len(results) diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py index c219528..ac80efc 100644 --- a/fatcat_scholar/transform.py +++ b/fatcat_scholar/transform.py @@ -104,10 +104,12 @@ def es_biblio_from_sim(sim: Dict[str, Any]) -> ScholarBiblio: # original_title=release.original_title, release_date=release_date, release_year=release_year, - release_type=SIM_RELEASE_TYPE_MAP.get(issue_meta.get("pub_type")) or SIM_RELEASE_TYPE_MAP.get(pub_meta.get("pub_type")), + release_type=SIM_RELEASE_TYPE_MAP.get(issue_meta.get("pub_type")) + or SIM_RELEASE_TYPE_MAP.get(pub_meta.get("pub_type")), release_stage="published", # as a default # withdrawn_status=release.withdrawn_status, - lang_code=SIM_LANG_MAP.get(issue_meta.get("language")) or SIM_LANG_MAP.get(pub_meta.get("language")), + lang_code=SIM_LANG_MAP.get(issue_meta.get("language")) + or SIM_LANG_MAP.get(pub_meta.get("language")), country_code=SIM_COUNTRY_MAP.get(pub_meta.get("country")), volume=volume, volume_int=volume_int, @@ -133,7 +135,10 @@ def es_biblio_from_sim(sim: Dict[str, Any]) -> ScholarBiblio: def _add_file_release_meta( - fulltext: ScholarFulltext, pdf_meta: Optional[dict], re: ReleaseEntity, fe: FileEntity + fulltext: ScholarFulltext, + pdf_meta: Optional[dict], + re: ReleaseEntity, + fe: FileEntity, ) -> ScholarFulltext: best_url = None best_url_type = None @@ -181,10 +186,7 @@ def es_fulltext_from_pdftotext( ) -> Optional[ScholarFulltext]: ret = ScholarFulltext( - lang_code=re.language, - body=raw_text, - acknowledgement=None, - annex=None, + lang_code=re.language, body=raw_text, acknowledgement=None, annex=None, ) return _add_file_release_meta(ret, pdf_meta, re, fe) @@ -257,7 +259,10 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]: if f.ident == heavy.pdftotext_fulltext["file_ident"] ][0] fulltext = es_fulltext_from_pdftotext( - heavy.pdftotext_fulltext["raw_text"], heavy.pdf_meta, fulltext_release, fulltext_file + heavy.pdftotext_fulltext["raw_text"], + heavy.pdf_meta, + fulltext_release, + fulltext_file, ) # TODO: additional access list diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py index e3a0d8d..17a0f7a 100644 --- a/fatcat_scholar/work_pipeline.py +++ b/fatcat_scholar/work_pipeline.py @@ -119,7 +119,9 @@ class WorkPipeline: tei_xml=grobid_xml, release_ident=release_ident, file_ident=fe.ident, ) - def fetch_pdf_meta(self, fe: FileEntity, release_ident: str) -> Optional[Dict[str, Any]]: + def fetch_pdf_meta( + self, fe: FileEntity, release_ident: str + ) -> Optional[Dict[str, Any]]: """ Fetches pdftext metadata from sandcrawler-db via postgrest HTTP interface. |