ingest: shortened scope+platform keys; use html_biblio extraction for PDFs

author: Bryan Newbold <bnewbold@archive.org> 2020-11-08 19:23:31 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-11-08 19:23:36 -0800
commit: 0850b7fe7d5266ee0c4153b3e333d93eff164857 (patch)
tree: 08eaa9cb6420a67c6375d6fb1c8eaf27cd204f79
parent: a8ff73617a16a8b8b524c454247bde2399f34bf1 (diff)
download: sandcrawler-0850b7fe7d5266ee0c4153b3e333d93eff164857.tar.gz sandcrawler-0850b7fe7d5266ee0c4153b3e333d93eff164857.zip
1 files changed, 35 insertions, 15 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index fb442d9..2c60b22 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -17,10 +17,11 @@ from sandcrawler.misc import gen_file_metadata, clean_url, parse_cdx_datetime
 from sandcrawler.html import extract_fulltext_url
 from sandcrawler.html_ingest import fetch_html_resources, \
     quick_fetch_html_resources, html_guess_scope, html_extract_body_teixml, \
-    WebResource
+    WebResource, html_guess_platform
 from sandcrawler.html_metadata import html_extract_fulltext_url, \
-    XML_FULLTEXT_PATTERNS, HTML_FULLTEXT_PATTERNS, BiblioMetadata, \
-    html_extract_resources, html_extract_biblio, load_adblock_rules
+    XML_FULLTEXT_PATTERNS, HTML_FULLTEXT_PATTERNS, PDF_FULLTEXT_PATTERNS, \
+    BiblioMetadata, html_extract_resources, html_extract_biblio, \
+    load_adblock_rules
 from sandcrawler.workers import SandcrawlerWorker
 from sandcrawler.db import SandcrawlerPostgrestClient
 from sandcrawler.xml import xml_reserialize
@@ -353,6 +354,7 @@ class IngestFileWorker(SandcrawlerWorker):
         html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
         assert html_biblio
         html_body = html_extract_body_teixml(resource.body)
+        html_platform = html_guess_platform(resource.terminal_url, html_doc, html_biblio)
         html_scope = html_guess_scope(resource.terminal_url, html_doc, html_biblio, html_body.get('word_count'))
         html_biblio_dict = json.loads(html_biblio.json(exclude_none=True))
 
@@ -360,14 +362,16 @@ class IngestFileWorker(SandcrawlerWorker):
             return dict(
                 status=html_scope,
                 html_biblio=html_biblio_dict,
-                html_scope=html_scope,
+                scope=html_scope,
+                platform=html_platform,
             )
         elif html_scope == 'unknown':
             html_body.pop("tei_xml", None)
             return dict(
                 status="unknown-scope",
                 html_biblio=html_biblio_dict,
-                html_scope=html_scope,
+                scope=html_scope,
+                platform=html_platform,
                 html_body=html_body,
             )
         elif html_scope not in ('article-fulltext',):
@@ -375,7 +379,8 @@ class IngestFileWorker(SandcrawlerWorker):
             return dict(
                 status="wrong-scope",
                 html_biblio=html_biblio_dict,
-                html_scope=html_scope,
+                scope=html_scope,
+                platform=html_platform,
                 html_body=html_body,
             )
 
@@ -385,7 +390,8 @@ class IngestFileWorker(SandcrawlerWorker):
             return dict(
                 status="too-many-resources",
                 html_biblio=html_biblio_dict,
-                html_scope=html_scope,
+                scope=html_scope,
+                platform=html_platform,
                 html_body=html_body,
             )
 
@@ -396,7 +402,8 @@ class IngestFileWorker(SandcrawlerWorker):
 
         partial_result = dict(
             html_biblio=html_biblio_dict,
-            html_scope=html_scope,
+            scope=html_scope,
+            platform=html_platform,
             html_body=html_body,
         )
 
@@ -434,6 +441,7 @@ class IngestFileWorker(SandcrawlerWorker):
             html_body=html_body,
             html_biblio=html_biblio_dict,
             scope=html_scope,
+            platform=html_platform,
             html_resources=[json.loads(r.json(exclude_none=True)) for r in full_resources],
         )
 
@@ -586,18 +594,30 @@ class IngestFileWorker(SandcrawlerWorker):
                 or "text/xml" in file_meta['mimetype']
             )
             html_biblio = None
+            html_doc = None
             if html_ish_resource and resource.body:
-                html_doc = HTMLParser(resource.body)
-                html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
-                if html_biblio and html_biblio.title:
-                    result['html_biblio'] = json.loads(html_biblio.json(exclude_none=True))
-                    #print(f"  setting html_biblio: {result['html_biblio']}", file=sys.stderr)
+                try:
+                    html_doc = HTMLParser(resource.body)
+                    html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
+                    if html_biblio:
+                        if not 'html_biblio' in result or html_biblio.title:
+                            result['html_biblio'] = json.loads(html_biblio.json(exclude_none=True))
+                            #print(f"  setting html_biblio: {result['html_biblio']}", file=sys.stderr)
+                except ValueError:
+                    pass
 
             if ingest_type == "pdf" and html_ish_resource:
-                # Got landing page or similar. Some XHTML detected as "application/xml"
+                # Got landing page or similar
                 fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)
-                result['extract_next_hop'] = fulltext_url
 
+                # this is the new style of URL extraction
+                if not fulltext_url and html_biblio and html_biblio.pdf_fulltext_url:
+                    fulltext_url = dict(
+                        pdf_url=html_biblio.pdf_fulltext_url,
+                        technique="html_biblio",
+                    )
+
+                result['extract_next_hop'] = fulltext_url
                 if not fulltext_url:
                     result['status'] = 'no-pdf-link'
                     return result
author	Bryan Newbold <bnewbold@archive.org>	2020-11-08 19:23:31 -0800
committer	Bryan Newbold <bnewbold@archive.org>	2020-11-08 19:23:36 -0800
commit	0850b7fe7d5266ee0c4153b3e333d93eff164857 (patch)
tree	08eaa9cb6420a67c6375d6fb1c8eaf27cd204f79
parent	a8ff73617a16a8b8b524c454247bde2399f34bf1 (diff)
download	sandcrawler-0850b7fe7d5266ee0c4153b3e333d93eff164857.tar.gz sandcrawler-0850b7fe7d5266ee0c4153b3e333d93eff164857.zip