html: pdf and html extract similar to XML

Note that the primary PDF URL extraction path is a separate code path.
author: Bryan Newbold <bnewbold@archive.org> 2020-11-06 18:17:49 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-11-06 18:17:52 -0800
commit: 8958b12ff12c59f1c1f7267a509a99bfaa14c7d7 (patch)
tree: 8fdc9fb84edf1ec3545384f8752156ebf2c8eecf
parent: 8f4a22d78acb6518c6546645557ad5f0d2253c66 (diff)
download: sandcrawler-8958b12ff12c59f1c1f7267a509a99bfaa14c7d7.tar.gz
sandcrawler-8958b12ff12c59f1c1f7267a509a99bfaa14c7d7.zip
2 files changed, 55 insertions, 22 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 8928978..0d14166 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -154,14 +154,6 @@ HEAD_META_PATTERNS: Any = {
         "meta[name='dc.language']",
         "meta[name='og:locale']",
     ],
-    "html_fulltext_url": [
-        "meta[name='citation_fulltext_html_url']",
-        "meta[name='bepress_citation_fulltext_html_url']",
-    ],
-    "pdf_fulltext_url": [
-        "meta[name='citation_pdf_url']",
-        "meta[name='bepress_citation_pdf_url']",
-    ],
 }
 
 HEAD_META_LIST_PATTERNS: Any = {
@@ -205,6 +197,27 @@ XML_FULLTEXT_PATTERNS: List[dict] = [
     },
 ]
 
+HTML_FULLTEXT_PATTERNS: List[dict] = [
+    {
+        "selector": "meta[name='citation_fulltext_html_url']",
+        "attr": "content",
+        "technique": "citation_fulltext_html_url",
+    },
+]
+
+PDF_FULLTEXT_PATTERNS: List[dict] = [
+    {
+        "selector": "meta[name='citation_pdf_url']",
+        "attr": "content",
+        "technique": "citation_pdf_url",
+    },
+    {
+        "selector": "meta[name='bepress_citation_pdf_url']",
+        "attr": "content",
+        "technique": "citation_pdf_url",
+    },
+]
+
 RELEASE_TYPE_MAP = {
     "research article": "article-journal",
     "text.serial.journal": "article-journal",
@@ -308,9 +321,15 @@ def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadat
                 break
 
     # (some) fulltext extractions
+    pdf_fulltext_url = html_extract_fulltext_url(doc_url, doc, PDF_FULLTEXT_PATTERNS)
+    if pdf_fulltext_url:
+        meta['pdf_fulltext_url'] = pdf_fulltext_url[0]
     xml_fulltext_url = html_extract_fulltext_url(doc_url, doc, XML_FULLTEXT_PATTERNS)
     if xml_fulltext_url:
         meta['xml_fulltext_url'] = xml_fulltext_url[0]
+    html_fulltext_url = html_extract_fulltext_url(doc_url, doc, HTML_FULLTEXT_PATTERNS)
+    if html_fulltext_url:
+        meta['html_fulltext_url'] = html_fulltext_url[0]
 
     # TODO: replace with clean_doi() et al
     if meta.get('doi') and meta.get('doi').startswith('doi:'):
@@ -340,24 +359,12 @@ def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadat
         if release_type:
             meta['release_type'] = release_type
 
-    # resolve relative URLs
-    for key in ('pdf_fulltext_url', 'html_fulltext_url'):
-        if meta.get(key):
-            meta[key] = urllib.parse.urljoin(doc_url, meta[key])
-
     return BiblioMetadata(**meta)
 
 def load_adblock_rules() -> braveblock.Adblocker:
     """
     TODO: consider blocking very generic assets:
-
-    - favicon.ico
     - ://fonts.googleapis.com/css*
-    - ://widgets.figshare.com/*
-    - ://crossmark-cdn.crossref.org/widget/*
-    - ://code.jquery.com/*
-        => hrm
-    - ://platform.twitter.com/widgets.js
     - ://journals.plos.org/plosone/resource/img/icon.*
     """
     return braveblock.Adblocker(
@@ -384,6 +391,9 @@ def load_adblock_rules() -> braveblock.Adblocker:
             #"||ajax.googleapis.com^",
             #"||cdnjs.cloudflare.com^",
 
+            # badges, "share" buttons, etc
+            "apis.google.com/js/plusone",
+
             # PLOS images
             "/resource/img/icon.*.16.png^",
         ],
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 363dfb8..f696231 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -19,8 +19,8 @@ from sandcrawler.html_ingest import fetch_html_resources, \
     quick_fetch_html_resources, html_guess_scope, html_extract_body_teixml, \
     WebResource
 from sandcrawler.html_metadata import html_extract_fulltext_url, \
-    XML_FULLTEXT_PATTERNS, BiblioMetadata, html_extract_resources, \
-    html_extract_biblio, load_adblock_rules
+    XML_FULLTEXT_PATTERNS, HTML_FULLTEXT_PATTERNS, BiblioMetadata, \
+    html_extract_resources, html_extract_biblio, load_adblock_rules
 from sandcrawler.workers import SandcrawlerWorker
 from sandcrawler.db import SandcrawlerPostgrestClient
 from sandcrawler.xml import xml_reserialize
@@ -563,6 +563,29 @@ class IngestFileWorker(SandcrawlerWorker):
                             next_url,
                         ),
                         file=sys.stderr)
+                    if next_url in hops:
+                        result['status'] = 'link-loop'
+                        result['error_message'] = "repeated: {}".format(next_url)
+                        return result
+                    hops.append(next_url)
+                    continue
+            elif ingest_type == "html" and html_ish_resource:
+                # parse with selectolax, extract HTML fulltext URL
+                html_doc = HTMLParser(resource.body)
+                extract_next_hop = html_extract_fulltext_url(resource.terminal_url, html_doc, HTML_FULLTEXT_PATTERNS)
+                if extract_next_hop:
+                    next_url = extract_next_hop[0]
+                    technique = extract_next_hop[1]
+                    if next_url in hops:
+                        # for HTML ingest, we don't count this as a link-loop
+                        break
+                    print("[PARSE  {:>6}] {}  {}".format(
+                            ingest_type,
+                            technique,
+                            next_url,
+                        ),
+                        file=sys.stderr)
+                    hops.append(next_url)
                     continue
 
             # default is to NOT keep hopping
author	Bryan Newbold <bnewbold@archive.org>	2020-11-06 18:17:49 -0800
committer	Bryan Newbold <bnewbold@archive.org>	2020-11-06 18:17:52 -0800
commit	8958b12ff12c59f1c1f7267a509a99bfaa14c7d7 (patch)
tree	8fdc9fb84edf1ec3545384f8752156ebf2c8eecf
parent	8f4a22d78acb6518c6546645557ad5f0d2253c66 (diff)
download	sandcrawler-8958b12ff12c59f1c1f7267a509a99bfaa14c7d7.tar.gz sandcrawler-8958b12ff12c59f1c1f7267a509a99bfaa14c7d7.zip