Diffstat (limited to 'fatcat_scholar/work_pipeline.py')
-rw-r--r--  fatcat_scholar/work_pipeline.py | 43
1 file changed, 29 insertions(+), 14 deletions(-)
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index f17d69d..93e7aa2 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -19,11 +19,20 @@ from fatcat_scholar.sandcrawler import (
SandcrawlerMinioClient,
SandcrawlerPostgrestClient,
)
-from fatcat_scholar.schema import DocType, IntermediateBundle
+from fatcat_scholar.schema import DocType, IntermediateBundle, clean_str
from fatcat_scholar.sim_pipeline import truncate_issue_meta, truncate_pub_meta
def parse_pages(raw: str) -> Tuple[Optional[int], Optional[int]]:
+ """
+ Takes a string representing page numbers and tries to turn it into a span
+ of page numbers as integers.
+
+ Handles common syntax like "466-7" to mean "466 to 467".
+
+ If there is only a single page number, returns that page as both the first
+ and last page.
+ """
first_raw = raw.split("-")[0]
if not first_raw.isdigit():
return (None, None)
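For reference, the span expansion the new docstring describes could look roughly like the sketch below. This is an illustrative re-implementation based only on the docstring and the visible lines (the hunk truncates the real function body), and the helper name parse_pages_sketch is made up for the example:

from typing import Optional, Tuple

def parse_pages_sketch(raw: str) -> Tuple[Optional[int], Optional[int]]:
    # Sketch only: re-implements the behavior described by the new docstring.
    first_raw = raw.split("-")[0]
    if not first_raw.isdigit():
        return (None, None)
    first = int(first_raw)
    if "-" not in raw:
        # single page number: the first page is also the last page
        return (first, first)
    last_raw = raw.split("-")[-1]
    if not last_raw.isdigit():
        return (first, first)
    last = int(last_raw)
    if last < first:
        # expand abbreviated spans like "466-7" into (466, 467)
        last = int(first_raw[: len(first_raw) - len(last_raw)] + last_raw)
    return (first, last)

assert parse_pages_sketch("466-7") == (466, 467)
assert parse_pages_sketch("12") == (12, 12)
assert parse_pages_sketch("vii-x") == (None, None)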
@@ -228,11 +237,13 @@ class WorkPipeline:
Fetches (cached) crossref metadata JSON from sandcrawler-db via
postgrest HTTP interface.
- Returns a JSON object on success, or None if not found.
+ Returns a dict object on success, or None if not found.
- release_ident: Optional[str]
- doi: Optional[str]
- record: Optional[str]
+ Dict keys:
+
+ release_ident: Optional[str]
+ doi: Optional[str]
+ record: Optional[str]
"""
if not re.ext_ids.doi:
# can't do lookup without a DOI
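As a hedged illustration of the return value documented above (the key names come from the docstring; the variable name and all values below are made up for the example):

crossref_meta = {
    # hypothetical values for illustration only
    "release_ident": "aaaaaaaaaaaaarceaaaaaaaaam",
    "doi": "10.1000/example.123",
    # raw crossref metadata record, stored as a JSON string
    "record": '{"DOI": "10.1000/example.123", "title": ["Example Title"]}',
}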
@@ -280,15 +291,19 @@ class WorkPipeline:
release_ident: str,
) -> Optional[Any]:
"""
- issue_item
- pages: str
- page_texts: list
- page_num
- leaf_num
- raw_text
- release_ident: Optional[str]
- pub_item_metadata
- issue_item_metadata
+ Returns a dict with keys:
+
+ issue_item
+ pages: str
+ page_texts: list
+     page_num
+     leaf_num
+     raw_text
+ release_ident: Optional[str]
+ pub_item_metadata
+ issue_item_metadata
+
+ Or None if not found.
"""
first_page, last_page = parse_pages(pages)
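Correspondingly, a hedged sketch of the dict documented above, assuming page_num, leaf_num, and raw_text are keys of each entry in the page_texts list (the variable name and all values are illustrative, not taken from real data):

sim_fulltext = {
    "issue_item": "sim_example-journal_1970-01",
    "pages": "466-467",
    "page_texts": [
        {"page_num": 466, "leaf_num": 18, "raw_text": "OCR text of page 466 ..."},
        {"page_num": 467, "leaf_num": 19, "raw_text": "OCR text of page 467 ..."},
    ],
    "release_ident": "aaaaaaaaaaaaarceaaaaaaaaam",
    "pub_item_metadata": {},    # truncated IA item metadata (see truncate_pub_meta)
    "issue_item_metadata": {},  # truncated IA item metadata (see truncate_issue_meta)
}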