diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2020-11-06 20:55:44 -0800 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-18 11:58:26 -0800 | 
| commit | c14e3a9d0ef68822da347e59e77c4b2b9a7b0828 (patch) | |
| tree | 0642fce712478b5441809d99e2bdf03f4457b0f9 | |
| parent | 4979c58ee91903148962f4d62d1a8d423349ad67 (diff) | |
| download | fatcat-scholar-c14e3a9d0ef68822da347e59e77c4b2b9a7b0828.tar.gz fatcat-scholar-c14e3a9d0ef68822da347e59e77c4b2b9a7b0828.zip  | |
basic HTML transform/index support
| -rw-r--r-- | fatcat_scholar/transform.py | 48 | ||||
| -rw-r--r-- | fatcat_scholar/worker.py | 1 | 
2 files changed, 47 insertions, 2 deletions
def es_fulltext_from_html(
    html_fulltext: Dict[str, Any], re: ReleaseEntity, wc: WebcaptureEntity,
) -> Optional[ScholarFulltext]:
    """
    Builds a fulltext record for a release from the TEI-XML extracted from
    an HTML web capture.

    html_fulltext: dict expected to carry a "tei_xml" string, and optionally
        an "html_meta" dict with a "sha1hex" key.
    re: release the fulltext belongs to; supplies language and ident.
    wc: web capture the HTML came from; its first archive URL becomes the
        access URL.

    Returns None if the web capture has no archive URLs, if there is no
    TEI-XML, or if the TEI-XML has no <body> element. Otherwise returns a
    ScholarFulltext whose body is the space-joined text content of <body>.

    Raises xml.etree.ElementTree.ParseError if "tei_xml" is not well-formed.
    """

    if not wc.archive_urls or not html_fulltext.get("tei_xml"):
        return None

    ns = {"tei": "http://www.tei-c.org/ns/1.0"}
    tree = ET.fromstring(html_fulltext["tei_xml"])
    body = tree.find(".//tei:body", ns)
    # Compare against None explicitly: an Element with no child elements is
    # falsy, so a bare `if body:` would discard a <body> holding only text.
    if body is None:
        return None
    raw_text = " ".join(body.itertext())

    # "html_meta" may be absent or explicitly null; treat both as no metadata
    html_meta = html_fulltext.get("html_meta") or {}

    return ScholarFulltext(
        lang_code=re.language,
        body=raw_text,
        acknowledgement=None,
        annex=None,
        release_ident=re.ident,
        # webcapture_ident=wc.ident,
        file_sha1=html_meta.get("sha1hex"),
        file_mimetype="text/html",
        access_url=wc.archive_urls[0].url,
        access_type=AccessType.wayback,
    )
@@ -435,7 +466,20 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:              fulltext_file,          ) -    # TODO: additional access list +    if not fulltext and heavy.html_fulltext: +        fulltext_release = [ +            r for r in heavy.releases if r.ident == heavy.html_fulltext["release_ident"] +        ][0] +        fulltext_webcapture = [ +            f +            for f in fulltext_release.webcaptures +            if f.ident == heavy.html_fulltext["webcapture_ident"] +        ][0] +        fulltext = es_fulltext_from_html( +            heavy.html_fulltext, fulltext_release, fulltext_webcapture, +        ) + +    # TODO: additional access list (eg, HTML if only PDF currently)      access_dict = dict()      if fulltext and fulltext.access_type:          access_dict[fulltext.access_type] = ScholarAccess( diff --git a/fatcat_scholar/worker.py b/fatcat_scholar/worker.py index af84dd1..d2cc3cb 100644 --- a/fatcat_scholar/worker.py +++ b/fatcat_scholar/worker.py @@ -131,6 +131,7 @@ class IndexDocsWorker(KafkaWorker):                  grobid_fulltext=obj.get("grobid_fulltext"),                  pdftotext_fulltext=obj.get("pdftotext_fulltext"),                  pdf_meta=obj.get("pdf_meta"), +                html_fulltext=obj.get("html_fulltext"),                  sim_fulltext=obj.get("sim_fulltext"),              )              es_doc = transform_heavy(bundle)  | 
