aboutsummaryrefslogtreecommitdiffstats
path: root/fatcat_scholar
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-01-22 19:48:56 -0800
committer Bryan Newbold <bnewbold@archive.org> 2021-01-22 19:49:09 -0800
commit 85ba0df07009e1973f0a13e2973cd170afe6ddb6 (patch)
tree 1a5af321e48a9014577cfef94b7bab096e09b00d /fatcat_scholar
parent 2124a68f7659826d9fec80803af45e86e0374204 (diff)
download fatcat-scholar-85ba0df07009e1973f0a13e2973cd170afe6ddb6.tar.gz
fatcat-scholar-85ba0df07009e1973f0a13e2973cd170afe6ddb6.zip
basic support for excluding web content from index
Based on particular patterns in metadata, or exclusion lists in settings
Diffstat (limited to 'fatcat_scholar')
-rw-r--r-- fatcat_scholar/schema.py 14
-rw-r--r-- fatcat_scholar/templates/search_macros.html 2
-rw-r--r-- fatcat_scholar/transform.py 51
3 files changed, 60 insertions, 7 deletions
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index 434f735..9377048 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -159,6 +159,20 @@ class ScholarFulltext(BaseModel):
access_url: Optional[str]
access_type: Optional[AccessType]
+ def remove_access(self) -> Any:
+ """
+ Returns a fulltext-indexable copy of self, but with access options and
+ file-level details removed
+ """
+ return ScholarFulltext(
+ lang_code=self.lang_code,
+ body=self.body,
+ acknowledgement=self.acknowledgement,
+ annex=self.annex,
+ release_ident=self.release_ident,
+ thumbnail_url=self.thumbnail_url,
+ )
+
class ScholarRelease(BaseModel):
ident: Optional[str]
diff --git a/fatcat_scholar/templates/search_macros.html b/fatcat_scholar/templates/search_macros.html
index a63d4ea..6c7cb56 100644
--- a/fatcat_scholar/templates/search_macros.html
+++ b/fatcat_scholar/templates/search_macros.html
@@ -236,7 +236,7 @@
{% trans %}Preserved Fulltext{% endtrans %}
</h4>
- {% if paper.fulltext %}
+ {% if paper.fulltext and paper.fulltext.access_url %}
<div class="ui items">
<div class="item">
<div class="image">
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index ec0ed12..539f76f 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -394,6 +394,28 @@ def generate_tags(
return list(set(tags))
+def check_exclude_web(biblio: ScholarBiblio) -> bool:
+ """
+ Returns a flag that fulltext web archive options to a work should not be
+ linked to from web interface
+ """
+ if biblio.release_year and biblio.release_year <= 1925:
+ return False
+ if biblio.container_ident and biblio.container_ident in settings.EXCLUDE_WEB_CONTAINER_IDENTS:
+ return True
+ if biblio.publisher:
+ for pub in settings.EXCLUDE_WEB_PUBLISHERS:
+ if pub in biblio.publisher.lower():
+ return True
+ if biblio.license_slug and biblio.license_slug.startswith("cc-"):
+ return False
+ if biblio.pmcid:
+ return False
+ if biblio.container_sherpa_color and biblio.container_sherpa_color == "white":
+ return True
+ return False
+
+
def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
tags: List[str] = []
@@ -402,6 +424,7 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
abstracts: List[ScholarAbstract] = []
fulltext: Optional[ScholarFulltext] = None
primary_release: Optional[ReleaseEntity] = None
+ exclude_web_fulltext: bool = False
ia_sim: Optional[ScholarSim] = None
if heavy.sim_fulltext is not None:
@@ -427,6 +450,7 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
][0]
biblio = es_biblio_from_release(primary_release)
biblio = biblio_metadata_hacks(biblio)
+ exclude_web_fulltext = check_exclude_web(biblio)
abstracts = es_abstracts_from_release(primary_release)
# if no abstract from primary_release, try all the other releases
@@ -448,11 +472,17 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
if f.ident == heavy.grobid_fulltext["file_ident"]
][0]
tei_dict = teixml2json(heavy.grobid_fulltext["tei_xml"])
- fulltext = es_fulltext_from_grobid(
- tei_dict, heavy.pdf_meta, fulltext_release, fulltext_file
- )
if not abstracts:
abstracts = es_abstracts_from_grobid(tei_dict)
+ grobid_fulltext = es_fulltext_from_grobid(
+ tei_dict, heavy.pdf_meta, fulltext_release, fulltext_file
+ )
+ if exclude_web_fulltext and grobid_fulltext:
+ if not fulltext:
+ # include only partial fulltext object, with no access
+ fulltext = grobid_fulltext.remove_access()
+ else:
+ fulltext = grobid_fulltext
if not fulltext and heavy.pdftotext_fulltext:
fulltext_release = [
@@ -465,12 +495,16 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
for f in fulltext_release.files
if f.ident == heavy.pdftotext_fulltext["file_ident"]
][0]
- fulltext = es_fulltext_from_pdftotext(
+ pdftotext_fulltext = es_fulltext_from_pdftotext(
heavy.pdftotext_fulltext["raw_text"],
heavy.pdf_meta,
fulltext_release,
fulltext_file,
)
+ if exclude_web_fulltext and pdftotext_fulltext:
+ fulltext = pdftotext_fulltext.remove_access()
+ else:
+ fulltext = pdftotext_fulltext
if not fulltext and heavy.html_fulltext:
fulltext_release = [
@@ -481,9 +515,13 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
for f in fulltext_release.webcaptures
if f.ident == heavy.html_fulltext["webcapture_ident"]
][0]
- fulltext = es_fulltext_from_html(
+ html_fulltext = es_fulltext_from_html(
heavy.html_fulltext, fulltext_release, fulltext_webcapture,
)
+ if exclude_web_fulltext and html_fulltext:
+ fulltext = html_fulltext.remove_access()
+ else:
+ fulltext = html_fulltext
# TODO: additional access list (eg, HTML if only PDF currently)
access_dict = dict()
@@ -499,9 +537,10 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
access_dict[AccessType.ia_sim] = ScholarAccess(
access_type=AccessType.ia_sim,
access_url=f"https://archive.org/details/{ia_sim.issue_item}/page/{ia_sim.first_page}",
+ # TODO: release_ident
)
- # TODO: additional abstracts
+ # TODO: additional abstracts (?)
tags = generate_tags(biblio, primary_release)