Diffstat (limited to 'fatcat_scholar/sim_pipeline.py')
-rw-r--r--  fatcat_scholar/sim_pipeline.py  110
1 file changed, 65 insertions(+), 45 deletions(-)
diff --git a/fatcat_scholar/sim_pipeline.py b/fatcat_scholar/sim_pipeline.py
index 6b52535..b84ac47 100644
--- a/fatcat_scholar/sim_pipeline.py
+++ b/fatcat_scholar/sim_pipeline.py
@@ -1,4 +1,3 @@
-
 import os
 import io
 import sys
@@ -12,9 +11,17 @@ import internetarchive
 
 from fatcat_scholar.api_entities import *
 from fatcat_scholar.djvu import djvu_extract_leaf_texts
-from fatcat_scholar.sandcrawler import SandcrawlerPostgrestClient, SandcrawlerMinioClient
+from fatcat_scholar.sandcrawler import (
+    SandcrawlerPostgrestClient,
+    SandcrawlerMinioClient,
+)
 from fatcat_scholar.issue_db import IssueDB, SimIssueRow
-from fatcat_scholar.schema import es_biblio_from_release, es_release_from_release, DocType, IntermediateBundle
+from fatcat_scholar.schema import (
+    es_biblio_from_release,
+    es_release_from_release,
+    DocType,
+    IntermediateBundle,
+)
 
 
 def truncate_pub_meta(full: Dict[str, Any]) -> Dict[str, Any]:
@@ -23,26 +30,27 @@ def truncate_pub_meta(full: Dict[str, Any]) -> Dict[str, Any]:
     collection, and simplifies it by removing fields. Motivation is to make
     intermediate bundle files smaller.
     """
-    full.pop('files')
-    if 'ulrichs' in full and full['ulrichs']:
-        full['ulrichs'][0].pop('reviews_mfl')
-        full['ulrichs'][0].pop('editorial_description')
+    full.pop("files")
+    if "ulrichs" in full and full["ulrichs"]:
+        full["ulrichs"][0].pop("reviews_mfl")
+        full["ulrichs"][0].pop("editorial_description")
         # these are interesting, but just too long
-        full['ulrichs'][0].pop('online_availability_full_text')
-        full['ulrichs'][0].pop('abstracting_indexing')
-        full['ulrichs'][0].pop('publisher_and_ordering_details')
+        full["ulrichs"][0].pop("online_availability_full_text")
+        full["ulrichs"][0].pop("abstracting_indexing")
+        full["ulrichs"][0].pop("publisher_and_ordering_details")
     return full
 
+
 def truncate_issue_meta(full: Dict[str, Any]) -> Dict[str, Any]:
     """
     Same as truncate_pub_meta() but for issue item metadata
     """
-    full.pop('files')
+    full.pop("files")
     return full
 
 
-class SimPipeline():
+class SimPipeline:
     def __init__(self, issue_db: IssueDB):
         self.issue_db: IssueDB = issue_db
         self.ia_client = internetarchive.get_session()
@@ -60,44 +68,50 @@ class SimPipeline():
         issue_item_metadata
         """
         # fetch full metadata from API
-        issue_meta = self.ia_client.get_metadata(issue_db_row['issue_item'])
-        pub_meta = self.ia_client.get_metadata(issue_db_row['pub_collection'])
+        issue_meta = self.ia_client.get_metadata(issue_db_row["issue_item"])
+        pub_meta = self.ia_client.get_metadata(issue_db_row["pub_collection"])
 
         leaf_index = dict()
         leaf_list = []
-        if not 'page_numbers' in issue_meta:
+        if not "page_numbers" in issue_meta:
             # TODO: warn
             return None
-        for entry in issue_meta['page_numbers'].get('pages', []):
-            page_num = entry['pageNumber']
-            leaf_index[entry['leafNum']] = page_num
+        for entry in issue_meta["page_numbers"].get("pages", []):
+            page_num = entry["pageNumber"]
+            leaf_index[entry["leafNum"]] = page_num
             if not (page_num and page_num.isdigit()):
                 continue
             page_num = int(page_num)
-            leaf_list.append(entry['leafNum'])
+            leaf_list.append(entry["leafNum"])
 
         if not leaf_list:
             return None
 
         page_texts: List[Dict[str, Any]] = []
-        issue_item = self.ia_client.get_item(issue_db_row['issue_item'])
-        issue_item_djvu = issue_item.get_file(issue_db_row['issue_item'] + "_djvu.xml")
+        issue_item = self.ia_client.get_item(issue_db_row["issue_item"])
+        issue_item_djvu = issue_item.get_file(issue_db_row["issue_item"] + "_djvu.xml")
 
         # override 'close()' method so we can still read out contents
         djvu_bytes = io.BytesIO()
-        djvu_bytes.close = lambda: None # type: ignore
+        djvu_bytes.close = lambda: None  # type: ignore
         assert issue_item_djvu.download(fileobj=djvu_bytes) == True
         djvu_bytes.seek(0)
         djvu_xml = io.StringIO(djvu_bytes.read().decode("UTF-8"))
-        del(djvu_bytes)
+        del djvu_bytes
 
         leaf_dict = djvu_extract_leaf_texts(djvu_xml)
 
         for leaf_num, raw_text in leaf_dict.items():
-            page_texts.append(dict(page_num=leaf_index.get(leaf_num), leaf_num=leaf_num, raw_text=raw_text))
+            page_texts.append(
+                dict(
+                    page_num=leaf_index.get(leaf_num),
+                    leaf_num=leaf_num,
+                    raw_text=raw_text,
+                )
+            )
 
         return dict(
-            issue_item=issue_db_row['issue_item'],
+            issue_item=issue_db_row["issue_item"],
             pages=None,
             page_texts=page_texts,
             release_ident=None,
@@ -109,10 +123,14 @@ class SimPipeline():
         count = 0
         self.issue_db.db.row_factory = sqlite3.Row
        cur = self.issue_db.db.cursor()
-        for row in cur.execute('SELECT * FROM sim_issue LEFT JOIN sim_pub ON sim_issue.sim_pubid = sim_pub.sim_pubid WHERE sim_issue.release_count < 3'):
+        for row in cur.execute(
+            "SELECT * FROM sim_issue LEFT JOIN sim_pub ON sim_issue.sim_pubid = sim_pub.sim_pubid WHERE sim_issue.release_count < 3"
+        ):
             # filter out "contents" and "index" items
             # TODO: more filters; also redundant with IssueDB code?
-            if row['issue_item'].endswith('_contents') or row['issue_item'].endswith('_index'):
+            if row["issue_item"].endswith("_contents") or row["issue_item"].endswith(
+                "_index"
+            ):
                 continue
             try:
                 full_issue = self.fetch_sim_issue(row)
@@ -124,7 +142,7 @@ class SimPipeline():
                 continue
             if not full_issue:
                 continue
-            for leaf in full_issue['page_texts']:
+            for leaf in full_issue["page_texts"]:
                 bundle = IntermediateBundle(
                     doc_type=DocType.sim_page,
                     releases=[],
@@ -132,13 +150,13 @@ class SimPipeline():
                     grobid_fulltext=None,
                     pdftotext_fulltext=None,
                     sim_fulltext=dict(
-                        issue_item=full_issue['issue_item'],
-                        pages=str(leaf['page_num']),
+                        issue_item=full_issue["issue_item"],
+                        pages=str(leaf["page_num"]),
                         page_texts=[leaf],
                         release_ident=None,
-                        pub_item_metadata=full_issue['pub_item_metadata'],
-                        issue_item_metadata=full_issue['issue_item_metadata'],
-                    )
+                        pub_item_metadata=full_issue["pub_item_metadata"],
+                        issue_item_metadata=full_issue["issue_item_metadata"],
+                    ),
                 )
                 print(bundle.json())
                 count += 1
@@ -147,6 +165,7 @@ class SimPipeline():
         if limit is not None and count >= limit:
             break
 
+
 def main():
     """
     Run this command like:
@@ -155,20 +174,20 @@ def main():
     """
 
     parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
     subparsers = parser.add_subparsers()
 
-    parser.add_argument("--issue-db-file",
+    parser.add_argument(
+        "--issue-db-file",
         help="sqlite3 database file to open",
-        default='data/issue_db.sqlite',
-        type=str)
+        default="data/issue_db.sqlite",
+        type=str,
+    )
 
-    sub = subparsers.add_parser('run_issue_db',
-        help="iterates through entire IssueDB")
-    sub.set_defaults(func='run_issue_db')
-    sub.add_argument("--limit",
-        help="maximum number of pages to index",
-        type=int)
+    sub = subparsers.add_parser("run_issue_db", help="iterates through entire IssueDB")
+    sub.set_defaults(func="run_issue_db")
+    sub.add_argument("--limit", help="maximum number of pages to index", type=int)
 
     args = parser.parse_args()
     if not args.__dict__.get("func"):
@@ -177,11 +196,12 @@ def main():
 
     sp = SimPipeline(issue_db=IssueDB(args.issue_db_file))
 
-    if args.func == 'run_issue_db':
+    if args.func == "run_issue_db":
         sp.run_issue_db(limit=args.limit)
     else:
         func = getattr(sp, args.func)
         func()
 
-if __name__=="__main__":
+
+if __name__ == "__main__":
     main()
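
A note on what truncate_pub_meta() strips, since the diff itself only shows quote-style churn around it: the key names below come from the function above, but the record shape and values are hypothetical.

    # Hypothetical archive.org publication-collection metadata; only the
    # key names are taken from truncate_pub_meta() above, values are made up.
    pub_meta = {
        "files": ["..."],  # popped outright
        "ulrichs": [
            {
                "title": "Example Quarterly",  # kept
                "reviews_mfl": "...",  # popped
                "editorial_description": "...",  # popped
                # "interesting, but just too long":
                "online_availability_full_text": "...",  # popped
                "abstracting_indexing": "...",  # popped
                "publisher_and_ordering_details": "...",  # popped
            }
        ],
    }
    # After truncate_pub_meta(pub_meta), only "ulrichs" remains, with the
    # five long fields removed from its first entry.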
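The BytesIO close() override in fetch_sim_issue() is the subtle bit of context here: per the comment in the code, internetarchive's File.download(fileobj=...) closes the file object it is handed, which would discard an in-memory buffer before it could be read. A minimal standalone sketch of the same trick, with hypothetical item and file identifiers:

    import io

    import internetarchive

    item = internetarchive.get_item("sim_example-journal_1955-06")  # hypothetical
    djvu_file = item.get_file("sim_example-journal_1955-06_djvu.xml")  # hypothetical

    # no-op close() keeps the buffer readable after download() finishes
    buf = io.BytesIO()
    buf.close = lambda: None  # type: ignore
    assert djvu_file.download(fileobj=buf)
    buf.seek(0)
    djvu_xml = buf.getvalue().decode("UTF-8")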
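The reformatted CLI can also be exercised programmatically; this sketch mirrors what main() does for the run_issue_db subcommand, assuming an issue database has already been built at the default path from the argparse setup:

    from fatcat_scholar.issue_db import IssueDB
    from fatcat_scholar.sim_pipeline import SimPipeline

    # Prints one JSON-serialized IntermediateBundle (doc_type sim_page) per
    # SIM page to stdout; limit caps the number of pages emitted.
    sp = SimPipeline(issue_db=IssueDB("data/issue_db.sqlite"))
    sp.run_issue_db(limit=25)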