about | summary | refs | log | tree | commit | diff | stats
path: root/fatcat_scholar/sim_pipeline.py
diff options
context:
space:
mode:
Diffstat (limited to 'fatcat_scholar/sim_pipeline.py')
-rw-r--r-- fatcat_scholar/sim_pipeline.py 110
1 files changed, 65 insertions, 45 deletions
diff --git a/fatcat_scholar/sim_pipeline.py b/fatcat_scholar/sim_pipeline.py
index 6b52535..b84ac47 100644
--- a/fatcat_scholar/sim_pipeline.py
+++ b/fatcat_scholar/sim_pipeline.py
@@ -1,4 +1,3 @@
-
import os
import io
import sys
@@ -12,9 +11,17 @@ import internetarchive
from fatcat_scholar.api_entities import *
from fatcat_scholar.djvu import djvu_extract_leaf_texts
-from fatcat_scholar.sandcrawler import SandcrawlerPostgrestClient, SandcrawlerMinioClient
+from fatcat_scholar.sandcrawler import (
+ SandcrawlerPostgrestClient,
+ SandcrawlerMinioClient,
+)
from fatcat_scholar.issue_db import IssueDB, SimIssueRow
-from fatcat_scholar.schema import es_biblio_from_release, es_release_from_release, DocType, IntermediateBundle
+from fatcat_scholar.schema import (
+ es_biblio_from_release,
+ es_release_from_release,
+ DocType,
+ IntermediateBundle,
+)
def truncate_pub_meta(full: Dict[str, Any]) -> Dict[str, Any]:
@@ -23,26 +30,27 @@ def truncate_pub_meta(full: Dict[str, Any]) -> Dict[str, Any]:
collection, and simplifies it by removing fields. Motivation is to make
intermediate bundle files smaller.
"""
- full.pop('files')
- if 'ulrichs' in full and full['ulrichs']:
- full['ulrichs'][0].pop('reviews_mfl')
- full['ulrichs'][0].pop('editorial_description')
+ full.pop("files")
+ if "ulrichs" in full and full["ulrichs"]:
+ full["ulrichs"][0].pop("reviews_mfl")
+ full["ulrichs"][0].pop("editorial_description")
# these are interesting, but just too long
- full['ulrichs'][0].pop('online_availability_full_text')
- full['ulrichs'][0].pop('abstracting_indexing')
- full['ulrichs'][0].pop('publisher_and_ordering_details')
+ full["ulrichs"][0].pop("online_availability_full_text")
+ full["ulrichs"][0].pop("abstracting_indexing")
+ full["ulrichs"][0].pop("publisher_and_ordering_details")
return full
+
def truncate_issue_meta(full: Dict[str, Any]) -> Dict[str, Any]:
"""
Same as truncate_pub_meta() but for issue item metadata
"""
- full.pop('files')
+ full.pop("files")
return full
-class SimPipeline():
+class SimPipeline:
def __init__(self, issue_db: IssueDB):
self.issue_db: IssueDB = issue_db
self.ia_client = internetarchive.get_session()
@@ -60,44 +68,50 @@ class SimPipeline():
issue_item_metadata
"""
# fetch full metadata from API
- issue_meta = self.ia_client.get_metadata(issue_db_row['issue_item'])
- pub_meta = self.ia_client.get_metadata(issue_db_row['pub_collection'])
+ issue_meta = self.ia_client.get_metadata(issue_db_row["issue_item"])
+ pub_meta = self.ia_client.get_metadata(issue_db_row["pub_collection"])
leaf_index = dict()
leaf_list = []
- if not 'page_numbers' in issue_meta:
+ if not "page_numbers" in issue_meta:
# TODO: warn
return None
- for entry in issue_meta['page_numbers'].get('pages', []):
- page_num = entry['pageNumber']
- leaf_index[entry['leafNum']] = page_num
+ for entry in issue_meta["page_numbers"].get("pages", []):
+ page_num = entry["pageNumber"]
+ leaf_index[entry["leafNum"]] = page_num
if not (page_num and page_num.isdigit()):
continue
page_num = int(page_num)
- leaf_list.append(entry['leafNum'])
+ leaf_list.append(entry["leafNum"])
if not leaf_list:
return None
page_texts: List[Dict[str, Any]] = []
- issue_item = self.ia_client.get_item(issue_db_row['issue_item'])
- issue_item_djvu = issue_item.get_file(issue_db_row['issue_item'] + "_djvu.xml")
+ issue_item = self.ia_client.get_item(issue_db_row["issue_item"])
+ issue_item_djvu = issue_item.get_file(issue_db_row["issue_item"] + "_djvu.xml")
# override 'close()' method so we can still read out contents
djvu_bytes = io.BytesIO()
- djvu_bytes.close = lambda: None # type: ignore
+ djvu_bytes.close = lambda: None # type: ignore
assert issue_item_djvu.download(fileobj=djvu_bytes) == True
djvu_bytes.seek(0)
djvu_xml = io.StringIO(djvu_bytes.read().decode("UTF-8"))
- del(djvu_bytes)
+ del djvu_bytes
leaf_dict = djvu_extract_leaf_texts(djvu_xml)
for leaf_num, raw_text in leaf_dict.items():
- page_texts.append(dict(page_num=leaf_index.get(leaf_num), leaf_num=leaf_num, raw_text=raw_text))
+ page_texts.append(
+ dict(
+ page_num=leaf_index.get(leaf_num),
+ leaf_num=leaf_num,
+ raw_text=raw_text,
+ )
+ )
return dict(
- issue_item=issue_db_row['issue_item'],
+ issue_item=issue_db_row["issue_item"],
pages=None,
page_texts=page_texts,
release_ident=None,
@@ -109,10 +123,14 @@ class SimPipeline():
count = 0
self.issue_db.db.row_factory = sqlite3.Row
cur = self.issue_db.db.cursor()
- for row in cur.execute('SELECT * FROM sim_issue LEFT JOIN sim_pub ON sim_issue.sim_pubid = sim_pub.sim_pubid WHERE sim_issue.release_count < 3'):
+ for row in cur.execute(
+ "SELECT * FROM sim_issue LEFT JOIN sim_pub ON sim_issue.sim_pubid = sim_pub.sim_pubid WHERE sim_issue.release_count < 3"
+ ):
# filter out "contents" and "index" items
# TODO: more filters; also redundant with IssueDB code?
- if row['issue_item'].endswith('_contents') or row['issue_item'].endswith('_index'):
+ if row["issue_item"].endswith("_contents") or row["issue_item"].endswith(
+ "_index"
+ ):
continue
try:
full_issue = self.fetch_sim_issue(row)
@@ -124,7 +142,7 @@ class SimPipeline():
continue
if not full_issue:
continue
- for leaf in full_issue['page_texts']:
+ for leaf in full_issue["page_texts"]:
bundle = IntermediateBundle(
doc_type=DocType.sim_page,
releases=[],
@@ -132,13 +150,13 @@ class SimPipeline():
grobid_fulltext=None,
pdftotext_fulltext=None,
sim_fulltext=dict(
- issue_item=full_issue['issue_item'],
- pages=str(leaf['page_num']),
+ issue_item=full_issue["issue_item"],
+ pages=str(leaf["page_num"]),
page_texts=[leaf],
release_ident=None,
- pub_item_metadata=full_issue['pub_item_metadata'],
- issue_item_metadata=full_issue['issue_item_metadata'],
- )
+ pub_item_metadata=full_issue["pub_item_metadata"],
+ issue_item_metadata=full_issue["issue_item_metadata"],
+ ),
)
print(bundle.json())
count += 1
@@ -147,6 +165,7 @@ class SimPipeline():
if limit is not None and count >= limit:
break
+
def main():
"""
Run this command like:
@@ -155,20 +174,20 @@ def main():
"""
parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter
+ )
subparsers = parser.add_subparsers()
- parser.add_argument("--issue-db-file",
+ parser.add_argument(
+ "--issue-db-file",
help="sqlite3 database file to open",
- default='data/issue_db.sqlite',
- type=str)
+ default="data/issue_db.sqlite",
+ type=str,
+ )
- sub = subparsers.add_parser('run_issue_db',
- help="iterates through entire IssueDB")
- sub.set_defaults(func='run_issue_db')
- sub.add_argument("--limit",
- help="maximum number of pages to index",
- type=int)
+ sub = subparsers.add_parser("run_issue_db", help="iterates through entire IssueDB")
+ sub.set_defaults(func="run_issue_db")
+ sub.add_argument("--limit", help="maximum number of pages to index", type=int)
args = parser.parse_args()
if not args.__dict__.get("func"):
@@ -177,11 +196,12 @@ def main():
sp = SimPipeline(issue_db=IssueDB(args.issue_db_file))
- if args.func == 'run_issue_db':
+ if args.func == "run_issue_db":
sp.run_issue_db(limit=args.limit)
else:
func = getattr(sp, args.func)
func()
-if __name__=="__main__":
+
+if __name__ == "__main__":
main()