basic support for excluding web content from index

Based on particular patterns in metadata, or exclusion lists in settings
author: Bryan Newbold <bnewbold@archive.org> 2021-01-22 19:48:56 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2021-01-22 19:49:09 -0800
commit: 85ba0df07009e1973f0a13e2973cd170afe6ddb6 (patch)
tree: 1a5af321e48a9014577cfef94b7bab096e09b00d
parent: 2124a68f7659826d9fec80803af45e86e0374204 (diff)
download: fatcat-scholar-85ba0df07009e1973f0a13e2973cd170afe6ddb6.tar.gz
fatcat-scholar-85ba0df07009e1973f0a13e2973cd170afe6ddb6.zip
4 files changed, 62 insertions, 7 deletions
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index 434f735..9377048 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -159,6 +159,20 @@ class ScholarFulltext(BaseModel):
     access_url: Optional[str]
     access_type: Optional[AccessType]
 
+    def remove_access(self) -> Any:  # always returns a new ScholarFulltext
+        """
+        Returns a fulltext-indexable copy of self with access options removed:
+        only the fields listed below are copied (access_url/access_type are not)
+        """
+        return ScholarFulltext(
+            lang_code=self.lang_code,
+            body=self.body,
+            acknowledgement=self.acknowledgement,
+            annex=self.annex,
+            release_ident=self.release_ident,
+            thumbnail_url=self.thumbnail_url,
+        )
+
 
 class ScholarRelease(BaseModel):
     ident: Optional[str]
diff --git a/fatcat_scholar/templates/search_macros.html b/fatcat_scholar/templates/search_macros.html
index a63d4ea..6c7cb56 100644
--- a/fatcat_scholar/templates/search_macros.html
+++ b/fatcat_scholar/templates/search_macros.html
@@ -236,7 +236,7 @@
     {% trans %}Preserved Fulltext{% endtrans %}
   </h4>
 
-  {% if paper.fulltext %}
+  {% if paper.fulltext and paper.fulltext.access_url %}
     <div class="ui items">
       <div class="item">
         <div class="image">
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index ec0ed12..539f76f 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -394,6 +394,28 @@ def generate_tags(
     return list(set(tags))
 
 
+def check_exclude_web(biblio: ScholarBiblio) -> bool:
+    """
+    Returns True if fulltext web archive access options for this work should
+    not be linked to from the web interface. Checks run in priority order.
+    """
+    if biblio.release_year and biblio.release_year <= 1925:
+        return False  # checked first: works from 1925 or earlier are never excluded
+    if biblio.container_ident and biblio.container_ident in settings.EXCLUDE_WEB_CONTAINER_IDENTS:
+        return True  # container is on the configured exclusion list
+    if biblio.publisher:
+        for pub in settings.EXCLUDE_WEB_PUBLISHERS:
+            # substring match on the lowercased publisher name, so configured
+            # entries containing uppercase letters can never match
+            if pub in biblio.publisher.lower():
+                return True
+    if biblio.license_slug and biblio.license_slug.startswith("cc-"):
+        return False  # "cc-" license: exempt from the sherpa color check below
+    if biblio.pmcid:
+        return False  # has a pmcid: exempt from the sherpa color check below
+    if biblio.container_sherpa_color and biblio.container_sherpa_color == "white":
+        return True  # only reached when none of the exemptions above applied
+    return False  # default: do not exclude
+
+
 def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
 
     tags: List[str] = []
@@ -402,6 +424,7 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
     abstracts: List[ScholarAbstract] = []
     fulltext: Optional[ScholarFulltext] = None
     primary_release: Optional[ReleaseEntity] = None
+    exclude_web_fulltext: bool = False
 
     ia_sim: Optional[ScholarSim] = None
     if heavy.sim_fulltext is not None:
@@ -427,6 +450,7 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
         ][0]
         biblio = es_biblio_from_release(primary_release)
         biblio = biblio_metadata_hacks(biblio)
+        exclude_web_fulltext = check_exclude_web(biblio)
         abstracts = es_abstracts_from_release(primary_release)
 
         # if no abstract from primary_release, try all the other releases
@@ -448,11 +472,17 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
             if f.ident == heavy.grobid_fulltext["file_ident"]
         ][0]
         tei_dict = teixml2json(heavy.grobid_fulltext["tei_xml"])
-        fulltext = es_fulltext_from_grobid(
-            tei_dict, heavy.pdf_meta, fulltext_release, fulltext_file
-        )
         if not abstracts:
             abstracts = es_abstracts_from_grobid(tei_dict)
+        grobid_fulltext = es_fulltext_from_grobid(
+            tei_dict, heavy.pdf_meta, fulltext_release, fulltext_file
+        )
+        if exclude_web_fulltext and grobid_fulltext:
+            if not fulltext:
+                # include only partial fulltext object, with no access
+                fulltext = grobid_fulltext.remove_access()
+        else:
+            fulltext = grobid_fulltext
 
     if not fulltext and heavy.pdftotext_fulltext:
         fulltext_release = [
@@ -465,12 +495,16 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
             for f in fulltext_release.files
             if f.ident == heavy.pdftotext_fulltext["file_ident"]
         ][0]
-        fulltext = es_fulltext_from_pdftotext(
+        pdftotext_fulltext = es_fulltext_from_pdftotext(
             heavy.pdftotext_fulltext["raw_text"],
             heavy.pdf_meta,
             fulltext_release,
             fulltext_file,
         )
+        if exclude_web_fulltext and pdftotext_fulltext:
+            fulltext = pdftotext_fulltext.remove_access()
+        else:
+            fulltext = pdftotext_fulltext
 
     if not fulltext and heavy.html_fulltext:
         fulltext_release = [
@@ -481,9 +515,13 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
             for f in fulltext_release.webcaptures
             if f.ident == heavy.html_fulltext["webcapture_ident"]
         ][0]
-        fulltext = es_fulltext_from_html(
+        html_fulltext = es_fulltext_from_html(
             heavy.html_fulltext, fulltext_release, fulltext_webcapture,
         )
+        if exclude_web_fulltext and html_fulltext:
+            fulltext = html_fulltext.remove_access()
+        else:
+            fulltext = html_fulltext
 
     # TODO: additional access list (eg, HTML if only PDF currently)
     access_dict = dict()
@@ -499,9 +537,10 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
         access_dict[AccessType.ia_sim] = ScholarAccess(
             access_type=AccessType.ia_sim,
             access_url=f"https://archive.org/details/{ia_sim.issue_item}/page/{ia_sim.first_page}",
+            # TODO: release_ident
         )
 
-    # TODO: additional abstracts
+    # TODO: additional abstracts (?)
 
     tags = generate_tags(biblio, primary_release)
 
diff --git a/settings.toml b/settings.toml
index 98986fd..b2faf21 100644
--- a/settings.toml
+++ b/settings.toml
@@ -21,6 +21,8 @@ GOATCOUNTER_SCRIPT_URL = "https://goatcounter.scholar.fatcat.wiki/count.js"
 ONION_DOMAIN = "scholar-qa.archivev3qli37bju4rlh27glh24lljyezwxf4pokmrdbpefjlcrp5id.onion"
 ENABLE_PROMETHEUS = false
 ENABLE_CITATION_QUERY = true
+EXCLUDE_WEB_CONTAINER_IDENTS = []
+EXCLUDE_WEB_PUBLISHERS = []
 
 [test]
 SCHOLAR_ENV = "test"
author	Bryan Newbold <bnewbold@archive.org>	2021-01-22 19:48:56 -0800
committer	Bryan Newbold <bnewbold@archive.org>	2021-01-22 19:49:09 -0800
commit	85ba0df07009e1973f0a13e2973cd170afe6ddb6 (patch)
tree	1a5af321e48a9014577cfef94b7bab096e09b00d
parent	2124a68f7659826d9fec80803af45e86e0374204 (diff)
download	fatcat-scholar-85ba0df07009e1973f0a13e2973cd170afe6ddb6.tar.gz fatcat-scholar-85ba0df07009e1973f0a13e2973cd170afe6ddb6.zip