about summary refs log tree commit diff stats
path: root/fatcat_scholar
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-12-06 15:51:01 -0800
committer Bryan Newbold <bnewbold@archive.org> 2021-12-06 15:51:01 -0800
commit 1389c0fdf03b002b2a555f9a69755c4798b5530e (patch)
tree 72c090883a303cad986d9043efa458e8c25da11b /fatcat_scholar
parent c038787106576dcceb680f5714b27a8e529489ab (diff)
download fatcat-scholar-1389c0fdf03b002b2a555f9a69755c4798b5530e.tar.gz
fatcat-scholar-1389c0fdf03b002b2a555f9a69755c4798b5530e.zip
SIM pipeline: improve issue skipping (based on suffix)
Diffstat (limited to 'fatcat_scholar')
-rw-r--r-- fatcat_scholar/sim_pipeline.py 32
1 files changed, 21 insertions, 11 deletions
diff --git a/fatcat_scholar/sim_pipeline.py b/fatcat_scholar/sim_pipeline.py
index 18e3beb..cceec30 100644
--- a/fatcat_scholar/sim_pipeline.py
+++ b/fatcat_scholar/sim_pipeline.py
@@ -44,6 +44,22 @@ def truncate_issue_meta(full: Dict[str, Any]) -> Dict[str, Any]:
return full
+def should_skip_item(item_name: str) -> bool:
+ for suffix in [
+ "_contents",
+ "_contents_0",
+ "_index",
+ "_index_0",
+ "_index_1",
+ "_cumulative-index",
+ "_index-contents",
+ "_table-of-contents",
+ ]:
+ if item_name.endswith(suffix):
+ return True
+ return False
+
+
class SimPipeline:
def __init__(self, issue_db: IssueDB):
self.issue_db: IssueDB = issue_db
@@ -68,7 +84,7 @@ class SimPipeline:
leaf_index = dict()
leaf_list = []
if "page_numbers" not in issue_meta:
- # TODO: warn
+ print(f"issue without page_numbers: {issue_item}", file=sys.stderr)
return None
for entry in issue_meta["page_numbers"].get("pages", []):
page_num = entry["pageNumber"]
@@ -79,6 +95,7 @@ class SimPipeline:
leaf_list.append(entry["leafNum"])
if not leaf_list:
+ print(f"issue without leaf numbers: {issue_item}", file=sys.stderr)
return None
page_texts: List[Dict[str, Any]] = []
@@ -147,12 +164,9 @@ class SimPipeline:
for row in cur.execute(
"SELECT * FROM sim_issue LEFT JOIN sim_pub ON sim_issue.sim_pubid = sim_pub.sim_pubid WHERE sim_issue.release_count < 3"
):
- # filter out "contents" and "index" items
- # TODO: more filters; also redundant with IssueDB code?
- if row["issue_item"].endswith("_contents") or row["issue_item"].endswith(
- "_index"
- ):
+ if should_skip_item(row["issue_item"]):
continue
+
try:
full_issue = self.fetch_sim_issue(
row["issue_item"], row["pub_collection"]
@@ -187,11 +201,7 @@ class SimPipeline:
for row in cur.execute(
f"SELECT * FROM sim_issue LEFT JOIN sim_pub ON sim_issue.sim_pubid = sim_pub.sim_pubid WHERE sim_issue.release_count < {max_release_count}"
):
- # filter out "contents" and "index" items
- # TODO: more filters; also redundant with IssueDB code?
- if row["issue_item"].endswith("_contents") or row["issue_item"].endswith(
- "_index"
- ):
+ if should_skip_item(row["issue_item"]):
continue
print(f"{row['issue_item']}\t{row['pub_collection']}")