diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-06-03 22:46:16 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-06-03 22:46:16 -0700 |
commit | 62d3299471b7b4229ae63e21e9d45ef7efde4348 (patch) | |
tree | fc0d04531ce4c81224723bd3646c88f74c7c42a2 /fatcat_scholar/transform.py | |
parent | 58688676ee2d96d9d7006e2fd8a14d47f271aa8f (diff) | |
download | fatcat-scholar-62d3299471b7b4229ae63e21e9d45ef7efde4348.tar.gz fatcat-scholar-62d3299471b7b4229ae63e21e9d45ef7efde4348.zip |
reformat python code with black
Diffstat (limited to 'fatcat_scholar/transform.py')
-rw-r--r-- | fatcat_scholar/transform.py | 267 |
1 files changed, 158 insertions, 109 deletions
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py index 953ebff..b5a0223 100644 --- a/fatcat_scholar/transform.py +++ b/fatcat_scholar/transform.py @@ -1,4 +1,3 @@ - import os import io import sys @@ -10,83 +9,89 @@ import internetarchive from fatcat_scholar.api_entities import * from fatcat_scholar.djvu import djvu_extract_leaf_texts -from fatcat_scholar.sandcrawler import SandcrawlerPostgrestClient, SandcrawlerMinioClient +from fatcat_scholar.sandcrawler import ( + SandcrawlerPostgrestClient, + SandcrawlerMinioClient, +) from fatcat_scholar.issue_db import IssueDB, SimIssueRow from fatcat_scholar.schema import * from fatcat_scholar.grobid2json import teixml2json def es_fulltext_from_sim(sim: Dict[str, Any]) -> Optional[ScholarFulltext]: - if not sim['page_texts']: + if not sim["page_texts"]: return None - first_page = sim['page_texts'][0]['page_num'] - issue_item = sim['issue_item'] + first_page = sim["page_texts"][0]["page_num"] + issue_item = sim["issue_item"] return ScholarFulltext( - lang_code=None, # TODO: pub/issue metadata? or langdetect? - body="\n".join([p['raw_text'] for p in sim['page_texts']]), - #acknowledgement=None, - #annex=None, - release_ident=sim.get('release_ident'), - #file_ident=None, - #file_sha1=None, - #file_mimetype=None, + lang_code=None, # TODO: pub/issue metadata? or langdetect? + body="\n".join([p["raw_text"] for p in sim["page_texts"]]), + # acknowledgement=None, + # annex=None, + release_ident=sim.get("release_ident"), + # file_ident=None, + # file_sha1=None, + # file_mimetype=None, thumbnail_url=f"https://archive.org/serve/{issue_item}/__ia_thumb.jpg", access_url=f"https://archive.org/details/{issue_item}/page/{first_page}", access_type=AccessType.ia_sim, ) + def es_sim_from_sim(sim: Dict[str, Any]) -> ScholarSim: first_page = None - if sim['page_texts']: - first_page = sim['page_texts'][0]['page_num'] + if sim["page_texts"]: + first_page = sim["page_texts"][0]["page_num"] return ScholarSim( - issue_item=sim['issue_item'], - pub_collection=sim['pub_item_metadata']['metadata']['identifier'], - sim_pubid=sim['issue_item_metadata']['metadata']['sim_pubid'], + issue_item=sim["issue_item"], + pub_collection=sim["pub_item_metadata"]["metadata"]["identifier"], + sim_pubid=sim["issue_item_metadata"]["metadata"]["sim_pubid"], first_page=first_page, ) + SIM_RELEASE_TYPE_MAP = { - 'Scholarly Journals': 'article-journal', + "Scholarly Journals": "article-journal", # TODO: } SIM_LANG_MAP = { - 'English': 'en', + "English": "en", # TODO: } SIM_COUNTRY_MAP = { - 'Netherlands': 'nl', + "Netherlands": "nl", # TODO: } + def es_biblio_from_sim(sim: Dict[str, Any]) -> ScholarBiblio: - issue_meta = sim['issue_item_metadata']['metadata'] - pub_meta = sim['pub_item_metadata']['metadata'] + issue_meta = sim["issue_item_metadata"]["metadata"] + pub_meta = sim["pub_item_metadata"]["metadata"] first_page = None - if sim['page_texts']: - first_page = sim['page_texts'][0]['page_num'] - container_name = sim['pub_item_metadata']['metadata']['title'] + if sim["page_texts"]: + first_page = sim["page_texts"][0]["page_num"] + container_name = sim["pub_item_metadata"]["metadata"]["title"] last_word = container_name.split()[-1] - if len(last_word) == 9 and last_word[4] == '-': + if len(last_word) == 9 and last_word[4] == "-": container_name = container_name[:-10] issns = [] - raw_issn = issue_meta.get('issn') + raw_issn = issue_meta.get("issn") if raw_issn and len(raw_issn) == 9: issns.append(raw_issn) - volume = issue_meta.get('volume') + volume = issue_meta.get("volume") volume_int = None if volume and volume.isdigit(): volume_int = int(volume) - issue = issue_meta.get('issue') + issue = issue_meta.get("issue") issue_int = None if issue and issue.isdigit(): issue_int = int(issue) - date = issue_meta.get('date') + date = issue_meta.get("date") release_year = None if date and len(date) > 4 and date[:4].isdigit(): release_year = int(date[:4]) @@ -96,52 +101,52 @@ def es_biblio_from_sim(sim: Dict[str, Any]) -> ScholarBiblio: release_date = date return ScholarBiblio( - #release_ident=release.ident, + # release_ident=release.ident, title=None, - #subtitle=None, - #original_title=release.original_title, + # subtitle=None, + # original_title=release.original_title, release_date=release_date, release_year=release_year, - release_type=SIM_RELEASE_TYPE_MAP.get(pub_meta.get('pub_type')), - release_stage="published", # as a default - #withdrawn_status=release.withdrawn_status, - lang_code=SIM_LANG_MAP.get(pub_meta.get('language')), - country_code=SIM_COUNTRY_MAP.get(pub_meta.get('country')), + release_type=SIM_RELEASE_TYPE_MAP.get(pub_meta.get("pub_type")), + release_stage="published", # as a default + # withdrawn_status=release.withdrawn_status, + lang_code=SIM_LANG_MAP.get(pub_meta.get("language")), + country_code=SIM_COUNTRY_MAP.get(pub_meta.get("country")), volume=volume, volume_int=volume_int, issue=issue, issue_int=issue_int, - pages=sim.get('pages'), + pages=sim.get("pages"), first_page=first_page, first_page_int=None, - #number=None, - + # number=None, # no external identifiers - - #license_slug=release.license_slug, - publisher=issue_meta.get('publisher'), + # license_slug=release.license_slug, + publisher=issue_meta.get("publisher"), container_name=container_name, - container_original_name=None, # TODO pass-through - container_ident=None, # TODO: pass-through - container_type=None, # TODO - container_issnl=None, # TODO: pass-through + container_original_name=None, # TODO pass-through + container_ident=None, # TODO: pass-through + container_type=None, # TODO + container_issnl=None, # TODO: pass-through issns=issns, - # no contrib/affiliation info contrib_names=[], affiliations=[], ) -def _add_file_release_meta(fulltext: ScholarFulltext, re: ReleaseEntity, fe: FileEntity) -> ScholarFulltext: + +def _add_file_release_meta( + fulltext: ScholarFulltext, re: ReleaseEntity, fe: FileEntity +) -> ScholarFulltext: best_url = None best_url_type = None for url in fe.urls: best_url = url.url best_url_type = AccessType.web - if '//archive.org/' in url.url: + if "//archive.org/" in url.url: best_url_type = AccessType.ia_file break - elif '//web.archive.org/' in url.url: + elif "//web.archive.org/" in url.url: best_url_type = AccessType.wayback break if url.rel == "repository": @@ -157,30 +162,36 @@ def _add_file_release_meta(fulltext: ScholarFulltext, re: ReleaseEntity, fe: Fil return fulltext -def es_fulltext_from_grobid(tei_xml: str, re: ReleaseEntity, fe: FileEntity) -> Optional[ScholarFulltext]: +def es_fulltext_from_grobid( + tei_xml: str, re: ReleaseEntity, fe: FileEntity +) -> Optional[ScholarFulltext]: obj = teixml2json(tei_xml) - if not obj.get('body'): + if not obj.get("body"): return None ret = ScholarFulltext( - lang_code=obj.get('lang'), - body=obj.get('body'), - acknowledgement=obj.get('acknowledgement'), - annex=obj.get('annex'), - thumbnail_url=None, # TODO: sandcrawler thumbnails + lang_code=obj.get("lang"), + body=obj.get("body"), + acknowledgement=obj.get("acknowledgement"), + annex=obj.get("annex"), + thumbnail_url=None, # TODO: sandcrawler thumbnails ) return _add_file_release_meta(ret, re, fe) -def es_fulltext_from_pdftotext(pdftotext: Any, re: ReleaseEntity, fe: FileEntity) -> Optional[ScholarFulltext]: + +def es_fulltext_from_pdftotext( + pdftotext: Any, re: ReleaseEntity, fe: FileEntity +) -> Optional[ScholarFulltext]: ret = ScholarFulltext( lang_code=re.language, - body=pdftotext['raw_text'], + body=pdftotext["raw_text"], acknowledgement=None, annex=None, - thumbnail_url=None, # TODO: sandcrawler thumbnails + thumbnail_url=None, # TODO: sandcrawler thumbnails ) return _add_file_release_meta(ret, re, fe) + def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]: tags: List[str] = [] @@ -203,7 +214,9 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]: work_ident = heavy.releases[0].work_id key = f"work_{work_ident}" assert heavy.biblio_release_ident - primary_release = [r for r in heavy.releases if r.ident == heavy.biblio_release_ident][0] + primary_release = [ + r for r in heavy.releases if r.ident == heavy.biblio_release_ident + ][0] biblio = es_biblio_from_release(primary_release) # TODO: abstracts from releases also; abstracts_dict; abstracts from GROBID parse @@ -212,19 +225,44 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]: raise NotImplementedError(f"doc_type: {heavy.doc_type}") if heavy.grobid_fulltext: - fulltext_release = [r for r in heavy.releases if r.ident == heavy.grobid_fulltext['release_ident']][0] - fulltext_file = [f for f in fulltext_release.files if f.ident == heavy.grobid_fulltext['file_ident']][0] - fulltext = es_fulltext_from_grobid(heavy.grobid_fulltext['tei_xml'], fulltext_release, fulltext_file) + fulltext_release = [ + r + for r in heavy.releases + if r.ident == heavy.grobid_fulltext["release_ident"] + ][0] + fulltext_file = [ + f + for f in fulltext_release.files + if f.ident == heavy.grobid_fulltext["file_ident"] + ][0] + fulltext = es_fulltext_from_grobid( + heavy.grobid_fulltext["tei_xml"], fulltext_release, fulltext_file + ) # hack to pull through thumbnail from local pdftotext - if fulltext and fulltext.file_sha1 and not fulltext.thumbnail_url and heavy.pdftotext_fulltext: + if ( + fulltext + and fulltext.file_sha1 + and not fulltext.thumbnail_url + and heavy.pdftotext_fulltext + ): # https://covid19.fatcat.wiki/fulltext_web/thumbnail/c9/c9e87f843b3cf7dc47881fa3d3ccb4693d7d9521.png fulltext.thumbnail_url = f"https://covid19.fatcat.wiki/fulltext_web/thumbnail/{fulltext.file_sha1[:2]}/{fulltext.file_sha1}.png" if not fulltext and heavy.pdftotext_fulltext: - fulltext_release = [r for r in heavy.releases if r.ident == heavy.pdftotext_fulltext['release_ident']][0] - fulltext_file = [f for f in fulltext_release.files if f.ident == heavy.pdftotext_fulltext['file_ident']][0] - fulltext = es_fulltext_from_pdftotext(heavy.pdftotext_fulltext, fulltext_release, fulltext_file) + fulltext_release = [ + r + for r in heavy.releases + if r.ident == heavy.pdftotext_fulltext["release_ident"] + ][0] + fulltext_file = [ + f + for f in fulltext_release.files + if f.ident == heavy.pdftotext_fulltext["file_ident"] + ][0] + fulltext = es_fulltext_from_pdftotext( + heavy.pdftotext_fulltext, fulltext_release, fulltext_file + ) # TODO: additional access list access_dict = dict() @@ -246,41 +284,41 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]: # tags if biblio.license_slug and biblio.license_slug.lower().startswith("cc-"): - tags.append('oa') + tags.append("oa") if primary_release and primary_release.container: container = primary_release.container if container.extra: - if container.extra.get('doaj'): - tags.append('doaj') - tags.append('oa') - if container.extra.get('road'): - tags.append('road') - tags.append('oa') - if container.extra.get('szczepanski'): - tags.append('szczepanski') - tags.append('oa') - if container.extra.get('ia', {}).get('longtail_oa'): - tags.append('longtail') - tags.append('oa') - if container.extra.get('sherpa_romeo', {}).get('color') == 'white': - tags.append('oa') - if container.extra.get('default_license', '').lower().startswith('cc-'): - tags.append('oa') - if container.extra.get('platform'): + if container.extra.get("doaj"): + tags.append("doaj") + tags.append("oa") + if container.extra.get("road"): + tags.append("road") + tags.append("oa") + if container.extra.get("szczepanski"): + tags.append("szczepanski") + tags.append("oa") + if container.extra.get("ia", {}).get("longtail_oa"): + tags.append("longtail") + tags.append("oa") + if container.extra.get("sherpa_romeo", {}).get("color") == "white": + tags.append("oa") + if container.extra.get("default_license", "").lower().startswith("cc-"): + tags.append("oa") + if container.extra.get("platform"): # scielo, ojs, wordpress, etc - tags.append(container.extra['platform'].lower()) - if biblio.doi_prefix == '10.2307': - tags.append('jstor') + tags.append(container.extra["platform"].lower()) + if biblio.doi_prefix == "10.2307": + tags.append("jstor") # biorxiv/medrxiv hacks if not biblio.container_name and biblio.release_stage != "published": for _, acc in access_dict.items(): if "://www.medrxiv.org/" in acc.access_url: - biblio.container_name = 'medRxiv' + biblio.container_name = "medRxiv" if biblio.release_stage == None: biblio.release_stage = "submitted" elif "://www.biorxiv.org/" in acc.access_url: - biblio.container_name = 'bioRxiv' + biblio.container_name = "bioRxiv" if biblio.release_stage == None: biblio.release_stage = "submitted" tags = list(set(tags)) @@ -291,7 +329,6 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]: doc_index_ts=datetime.datetime.utcnow(), work_ident=work_ident, tags=tags, - biblio=biblio, fulltext=fulltext, ia_sim=ia_sim, @@ -300,23 +337,28 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]: access=list(access_dict.values()), ) + def run_transform(infile): for line in infile: obj = json.loads(line) heavy = IntermediateBundle( - doc_type=DocType(obj['doc_type']), - releases=[entity_from_json(json.dumps(re), ReleaseEntity) for re in obj['releases']], - biblio_release_ident=obj.get('biblio_release_ident'), - grobid_fulltext=obj.get('grobid_fulltext'), - pdftotext_fulltext=obj.get('pdftotext_fulltext'), - sim_fulltext=obj.get('sim_fulltext'), + doc_type=DocType(obj["doc_type"]), + releases=[ + entity_from_json(json.dumps(re), ReleaseEntity) + for re in obj["releases"] + ], + biblio_release_ident=obj.get("biblio_release_ident"), + grobid_fulltext=obj.get("grobid_fulltext"), + pdftotext_fulltext=obj.get("pdftotext_fulltext"), + sim_fulltext=obj.get("sim_fulltext"), ) es_doc = transform_heavy(heavy) if not es_doc: continue print(es_doc.json()) + def main(): """ Run this command like: @@ -325,25 +367,32 @@ def main(): """ parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) subparsers = parser.add_subparsers() - sub = subparsers.add_parser('run_transform', - help="iterates through 'heavy' intermediate") - sub.set_defaults(func='run_transform') - sub.add_argument("json_file", + sub = subparsers.add_parser( + "run_transform", help="iterates through 'heavy' intermediate" + ) + sub.set_defaults(func="run_transform") + sub.add_argument( + "json_file", help="intermediate globs as JSON-lines", - nargs='?', default=sys.stdin, type=argparse.FileType('r')) + nargs="?", + default=sys.stdin, + type=argparse.FileType("r"), + ) args = parser.parse_args() if not args.__dict__.get("func"): print("tell me what to do! (try --help)") sys.exit(-1) - if args.func == 'run_transform': + if args.func == "run_transform": run_transform(infile=args.json_file) else: raise NotImplementedError(args.func) -if __name__=="__main__": + +if __name__ == "__main__": main() |