diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2020-11-06 19:51:00 -0800 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-18 11:58:26 -0800 | 
| commit | 4979c58ee91903148962f4d62d1a8d423349ad67 (patch) | |
| tree | 09901de13601d058eb413614a0dc626e1e30f4d2 /fatcat_scholar/work_pipeline.py | |
| parent | 7d38f46fc1970decfcfb1e3f4583b85605e5b8ee (diff) | |
| download | fatcat-scholar-4979c58ee91903148962f4d62d1a8d423349ad67.tar.gz fatcat-scholar-4979c58ee91903148962f4d62d1a8d423349ad67.zip | |
add basic html fulltext support to fetch pipeline
Diffstat (limited to 'fatcat_scholar/work_pipeline.py')
| -rw-r--r-- | fatcat_scholar/work_pipeline.py | 48 | 
1 file changed, 46 insertions, 2 deletions
| diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py index 631bda8..ea33a01 100644 --- a/fatcat_scholar/work_pipeline.py +++ b/fatcat_scholar/work_pipeline.py @@ -7,7 +7,7 @@ from typing import List, Dict, Tuple, Optional, Any, Sequence  import minio  import requests  import internetarchive -from fatcat_openapi_client import ReleaseEntity, FileEntity +from fatcat_openapi_client import ReleaseEntity, FileEntity, WebcaptureEntity  from fatcat_scholar.api_entities import *  from fatcat_scholar.config import settings @@ -169,6 +169,41 @@ class WorkPipeline:              raw_text=raw_text, release_ident=release_ident, file_ident=fe.ident,          ) +    def fetch_webcapture_html_fulltext( +        self, wc: WebcaptureEntity, release_ident: str, +    ) -> Optional[Dict[str, Any]]: + +        primary_resources = [cdx for cdx in wc.cdx if cdx.url == wc.original_url] +        if not primary_resources or primary_resources[0].mimetype != "text/html": +            return None +        html_meta = self.sandcrawler_db_client.get_html_meta(primary_resources[0].sha1) +        if not html_meta: +            return None +        sha1hex = html_meta.get("sha1hex") +        if not sha1hex: +            return None +        if html_meta.get("status") != "success" or not html_meta.get("has_teixml"): +            return None + +        try: +            tei_xml = self.sandcrawler_s3_client.get_blob( +                bucket="sandcrawler", +                prefix="", +                folder="html_body", +                sha1hex=sha1hex, +                extension=".tei.xml", +            ) +            # print(grobid_xml) +        except minio.error.NoSuchKey: +            return None + +        return dict( +            html_meta=html_meta, +            tei_xml=tei_xml, +            release_ident=release_ident, +            webcapture_ident=wc.ident, +        ) +      def lookup_sim(self, release: ReleaseEntity) -> Optional[SimIssueRow]:          """          
Checks in IssueDB to see if this release is likely to have a copy in a @@ -279,6 +314,7 @@ class WorkPipeline:          grobid_fulltext: Optional[Any] = None          pdf_meta: Optional[Any] = None          pdftotext_fulltext: Optional[Any] = None +        html_fulltext: Optional[Any] = None          for ident in pref_idents:              release = release_dict[ident]              if not release.files: @@ -295,7 +331,14 @@ class WorkPipeline:                      pdftotext_fulltext = self.fetch_file_pdftotext(fe, ident)                  if grobid_fulltext or pdftotext_fulltext:                      break -            if grobid_fulltext or pdftotext_fulltext: +                pdf_meta = None +            for wc in release.webcaptures: +                # find primary web capture object +                html_fulltext = self.fetch_webcapture_html_fulltext(wc, ident) +                if html_fulltext and html_fulltext.get("tei_xml"): +                    break +                html_fulltext = None +            if grobid_fulltext or pdftotext_fulltext or html_fulltext:                  break          # find best accessible SIM metadata and fulltext @@ -335,6 +378,7 @@ class WorkPipeline:              grobid_fulltext=grobid_fulltext,              pdftotext_fulltext=pdftotext_fulltext,              pdf_meta=pdf_meta, +            html_fulltext=html_fulltext,              sim_fulltext=sim_fulltext,          ) | 
