diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2020-05-20 18:17:43 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2020-05-20 18:17:43 -0700 | 
| commit | 1ccd50eca45667aaf232f3bfb6a5aafadf17fc09 (patch) | |
| tree | 13e5bcc999f76938d8de65ec0f20fb2e1b8c19ce | |
| parent | 73cde66c9ab0bcae733097e53f1b6c4f00a8ea86 (diff) | |
| download | fatcat-scholar-1ccd50eca45667aaf232f3bfb6a5aafadf17fc09.tar.gz fatcat-scholar-1ccd50eca45667aaf232f3bfb6a5aafadf17fc09.zip  | |
fixes to release+sim pipeline
| -rw-r--r-- | fatcat_scholar/issue_db.py | 22 | ||||
| -rw-r--r-- | fatcat_scholar/transform.py | 3 | ||||
| -rw-r--r-- | fatcat_scholar/work_pipeline.py | 26 | 
3 files changed, 39 insertions, 12 deletions
diff --git a/fatcat_scholar/issue_db.py b/fatcat_scholar/issue_db.py index 5278750..4f5ff53 100644 --- a/fatcat_scholar/issue_db.py +++ b/fatcat_scholar/issue_db.py @@ -25,6 +25,20 @@ class SimPubRow:      def tuple(self):          return (self.sim_pubid, self.pub_collection, self.title, self.issn, self.pub_type, self.publisher, self.container_issnl, self.container_ident, self.wikidata_qid) +    @classmethod +    def from_tuple(cls, row: Any) -> "SimPubRow": +        return SimPubRow( +            sim_pubid=row[0], +            pub_collection=row[1], +            title=row[2], +            issn=row[3], +            pub_type=row[4], +            publisher=row[5], +            container_issnl=row[6], +            container_ident=row[7], +            wikidata_qid=row[8], +        ) +  @dataclass  class SimIssueRow:      """ @@ -45,7 +59,7 @@ class SimIssueRow:          return (self.issue_item, self.sim_pubid, self.year, self.volume, self.issue, self.first_page, self.last_page, self.release_count)      @classmethod -    def from_tuple(self, row: Any): +    def from_tuple(cls, row: Any) -> "SimIssueRow":          return SimIssueRow(              issue_item=row[0],              sim_pubid=row[1], @@ -166,6 +180,12 @@ class IssueDB():              return None          return SimIssueRow.from_tuple(row[0]) +    def lookup_pub(self, sim_pubid: str) -> Optional[SimPubRow]: +        row = list(self.db.execute("SELECT * FROM sim_pub WHERE sim_pubid = ?;", [sim_pubid])) +        if not row: +            return None +        return SimPubRow.from_tuple(row[0]) +      def load_pubs(self, json_lines: Sequence[str], api: Any):          """          Reads a file (or some other iterator) of JSON lines, parses them into a diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py index a86fe15..d858a4c 100644 --- a/fatcat_scholar/transform.py +++ b/fatcat_scholar/transform.py @@ -184,11 +184,12 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:      tags: List[str] = []      work_ident: Optional[str] = None      abstracts: List[ScholarAbstract] = [] -    fulltext: Optional[ScholarFulltext] +    fulltext: Optional[ScholarFulltext] = None      ia_sim: Optional[ScholarSim] = None      if heavy.sim_fulltext is not None:          ia_sim = es_sim_from_sim(heavy.sim_fulltext) +        fulltext = es_fulltext_from_sim(heavy.sim_fulltext)      if heavy.doc_type == DocType.sim_page:          assert ia_sim is not None diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py index 081878c..ebc2923 100644 --- a/fatcat_scholar/work_pipeline.py +++ b/fatcat_scholar/work_pipeline.py @@ -11,8 +11,9 @@ import internetarchive  from fatcat_scholar.api_entities import *  from fatcat_scholar.djvu import djvu_extract_leaf_texts  from fatcat_scholar.sandcrawler import SandcrawlerPostgrestClient, SandcrawlerMinioClient -from fatcat_scholar.issue_db import IssueDB, SimIssueRow +from fatcat_scholar.issue_db import IssueDB, SimIssueRow, SimPubRow  from fatcat_scholar.schema import es_biblio_from_release, es_release_from_release, DocType, IntermediateBundle +from fatcat_scholar.sim_pipeline import truncate_pub_meta, truncate_issue_meta  def parse_pages(raw: str) -> Tuple[Optional[int], Optional[int]]: @@ -124,12 +125,13 @@ class WorkPipeline():          return self.issue_db.lookup_issue(sim_pubid=sim_pubid, volume=release.volume, issue=release.issue) -    def fetch_sim(self, issue_db_row: SimIssueRow, pages: str, release_ident: str) -> Optional[Any]: +    def fetch_sim(self, issue_db_row: SimIssueRow, issue_db_pub_row: SimPubRow, pages: str, release_ident: str) -> Optional[Any]:          """          issue_item           pages: str          page_texts: list -            page_number +            page_num +            leaf_num              raw_text          release_ident: Optional[str]          pub_item_metadata @@ -142,13 +144,14 @@ class WorkPipeline():          # fetch full metadata from API          issue_meta = self.ia_client.get_metadata(issue_db_row.issue_item) -        # XXX: pub_meta = self.ia_client.get_metadata(issue_db_row.pub_collection) -        pub_meta = None +        pub_meta = self.ia_client.get_metadata(issue_db_pub_row.pub_collection) +        leaf_index = dict()          leaf_list = []          assert 'page_numbers' in issue_meta          for entry in issue_meta['page_numbers'].get('pages', []):              page_num = entry['pageNumber'] +            leaf_index[entry['leafNum']] = page_num              if not (page_num and page_num.isdigit()):                  continue              page_num = int(page_num) @@ -172,16 +175,16 @@ class WorkPipeline():          leaf_dict = djvu_extract_leaf_texts(djvu_xml, only_leaves=leaf_list) -        for leaf, raw_text in leaf_dict.items(): -            page_texts.append(dict(page_number=leaf, raw_text=raw_text)) +        for leaf_num, raw_text in leaf_dict.items(): +            page_texts.append(dict(page_num=leaf_index.get(leaf_num), leaf_num=leaf_num, raw_text=raw_text))          return dict(              issue_item=issue_db_row.issue_item,              pages=pages,              page_texts=page_texts,              release_ident=release_ident, -            pub_item_metadata=pub_meta, -            issue_item_metadata=issue_item.metadata, +            pub_item_metadata=truncate_pub_meta(pub_meta), +            issue_item_metadata=truncate_issue_meta(issue_meta),          )      def process_release_list(self, releases: List[ReleaseEntity]) -> IntermediateBundle: @@ -226,8 +229,11 @@ class WorkPipeline():              #print(f"release_{release.ident}: sim_issue={sim_issue}", file=sys.stderr)              if not sim_issue:                  continue +            sim_pub = self.issue_db.lookup_pub(sim_issue.sim_pubid) +            if not sim_pub: +                continue              # XXX: control flow tweak? -            sim_fulltext = self.fetch_sim(sim_issue, release.pages, release.ident) +            sim_fulltext = self.fetch_sim(sim_issue, sim_pub, release.pages, release.ident)              if sim_fulltext:                  break  | 
