diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-10-16 18:16:56 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-10-16 18:16:56 -0700 |
commit | 701bc6b77808daba3ea15a1e49add326d9f6badd (patch) | |
tree | ed31e27d4d674b4ab66b41a85163c819a5456b79 | |
parent | 8174a8f627ebba6f1595b44fd48ceebdde97e107 (diff) | |
download | fatcat-scholar-701bc6b77808daba3ea15a1e49add326d9f6badd.tar.gz fatcat-scholar-701bc6b77808daba3ea15a1e49add326d9f6badd.zip |
SIM pipeline: refactor issue item fetching and bundle conversion
-rw-r--r-- | fatcat_scholar/sim_pipeline.py | 55 |
1 files changed, 32 insertions, 23 deletions
diff --git a/fatcat_scholar/sim_pipeline.py b/fatcat_scholar/sim_pipeline.py
index 95e5cad..47a3e22 100644
--- a/fatcat_scholar/sim_pipeline.py
+++ b/fatcat_scholar/sim_pipeline.py
@@ -46,7 +46,7 @@ class SimPipeline:
         self.issue_db: IssueDB = issue_db
         self.ia_client = internetarchive.get_session()
 
-    def fetch_sim_issue(self, issue_db_row: Any) -> Optional[Any]:
+    def fetch_sim_issue(self, issue_item: str, pub_collection: str) -> Optional[Any]:
         """
         issue_item
         pages: str
@@ -59,8 +59,8 @@ class SimPipeline:
         issue_item_metadata
         """
         # fetch full metadata from API
-        issue_meta = self.ia_client.get_metadata(issue_db_row["issue_item"])
-        pub_meta = self.ia_client.get_metadata(issue_db_row["pub_collection"])
+        issue_meta = self.ia_client.get_metadata(issue_item)
+        pub_meta = self.ia_client.get_metadata(pub_collection)
 
         leaf_index = dict()
         leaf_list = []
@@ -79,8 +79,8 @@ class SimPipeline:
             return None
 
         page_texts: List[Dict[str, Any]] = []
-        issue_item = self.ia_client.get_item(issue_db_row["issue_item"])
-        issue_item_djvu = issue_item.get_file(issue_db_row["issue_item"] + "_djvu.xml")
+        issue_item_obj = self.ia_client.get_item(issue_item)
+        issue_item_djvu = issue_item_obj.get_file(issue_item + "_djvu.xml")
 
         # override 'close()' method so we can still read out contents
         djvu_bytes = io.BytesIO()
@@ -102,7 +102,7 @@ class SimPipeline:
             )
 
         return dict(
-            issue_item=issue_db_row["issue_item"],
+            issue_item=issue_item,
             pages=None,
             page_texts=page_texts,
             release_ident=None,
@@ -110,6 +110,27 @@ class SimPipeline:
             issue_item_metadata=truncate_issue_meta(issue_meta),
         )
 
+    def full_issue_to_pages(self, full_issue: dict) -> List[IntermediateBundle]:
+        pages = []
+        for leaf in full_issue["page_texts"]:
+            bundle = IntermediateBundle(
+                doc_type=DocType.sim_page,
+                releases=[],
+                biblio_release_ident=None,
+                grobid_fulltext=None,
+                pdftotext_fulltext=None,
+                sim_fulltext=dict(
+                    issue_item=full_issue["issue_item"],
+                    pages=str(leaf["page_num"]),
+                    page_texts=[leaf],
+                    release_ident=None,
+                    pub_item_metadata=full_issue["pub_item_metadata"],
+                    issue_item_metadata=full_issue["issue_item_metadata"],
+                ),
+            )
+            pages.append(bundle)
+        return pages
+
     def run_issue_db(self, limit: int = None) -> None:
         count = 0
         self.issue_db.db.row_factory = sqlite3.Row
@@ -124,7 +145,9 @@ class SimPipeline:
             ):
                 continue
             try:
-                full_issue = self.fetch_sim_issue(row)
+                full_issue = self.fetch_sim_issue(
+                    row["issue_item"], row["pub_collection"]
+                )
             except requests.exceptions.ConnectionError as e:
                 print(str(e), file=sys.stderr)
                 continue
@@ -133,22 +156,8 @@ class SimPipeline:
                 continue
             if not full_issue:
                 continue
-            for leaf in full_issue["page_texts"]:
-                bundle = IntermediateBundle(
-                    doc_type=DocType.sim_page,
-                    releases=[],
-                    biblio_release_ident=None,
-                    grobid_fulltext=None,
-                    pdftotext_fulltext=None,
-                    sim_fulltext=dict(
-                        issue_item=full_issue["issue_item"],
-                        pages=str(leaf["page_num"]),
-                        page_texts=[leaf],
-                        release_ident=None,
-                        pub_item_metadata=full_issue["pub_item_metadata"],
-                        issue_item_metadata=full_issue["issue_item_metadata"],
-                    ),
-                )
+            pages = self.full_issue_to_pages(full_issue)
+            for bundle in pages:
                 print(bundle.json(exclude_none=True, sort_keys=True))
                 count += 1
                 if limit is not None and count >= limit: