refactor ingest to a loop, allowing multiple hops

author: Bryan Newbold <bnewbold@archive.org> 2020-01-09 17:31:08 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-01-09 17:31:08 -0800
commit: 24185837a47f305757a5c783b95ca25b709f66e3 (patch)
tree: e71e179e93932ad04ba14dbc6308d3e4deb3eeb7 /python
parent: 00cf33a1c230c8ce5dcda41aba5dcc6a88264d46 (diff)
download: sandcrawler-24185837a47f305757a5c783b95ca25b709f66e3.tar.gz sandcrawler-24185837a47f305757a5c783b95ca25b709f66e3.zip
2 files changed, 57 insertions, 27 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 4b6c587..591c971 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -151,47 +151,70 @@ class IngestFileWorker(SandcrawlerWorker):
 
         result = dict(request=request, hit=False)
 
-        try:
-            # first hop
-            resource = self.find_resource(base_url, best_mimetype)
+        next_url = base_url
+        hops = [base_url]
+        self.max_hops = 4
+
+
+        while len(hops) <= self.max_hops:
+
+            result['hops'] = hops
+            try:
+                resource = self.find_resource(next_url, best_mimetype)
+            except SavePageNowError as e:
+                result['status'] = 'spn-error'
+                result['error_message'] = str(e)
+                return result
+            except PetaboxError as e:
+                result['status'] = 'petabox-error'
+                result['error_message'] = str(e)
+                return result
+            except CdxApiError as e:
+                result['status'] = 'cdx-error'
+                result['error_message'] = str(e)
+                return result
+            except WaybackError as e:
+                result['status'] = 'wayback-error'
+                result['error_message'] = str(e)
+                return result
+
             if not resource.hit:
                 result['status'] = resource.status
                 return result
             file_meta = gen_file_metadata(resource.body)
 
             if "html" in file_meta['mimetype']:
-                # got landing page, try another hop
+                # got landing page or similar
                 fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)
                 
                 result['html'] = fulltext_url
                 if not fulltext_url or not 'pdf_url' in fulltext_url:
                     result['status'] = 'no-pdf-link'
+                    if resource.terminal_dt:
+                        result['terminal'] = {
+                            "terminal_url": resource.terminal_url,
+                            "terminal_dt": resource.terminal_dt,
+                            "terminal_status_code": resource.terminal_status_code,
+                        }
                     return result
                 print("\tlanding page URL extracted ({}): {}".format(
                         fulltext_url.get('technique'),
                         fulltext_url['pdf_url'],
                     ),
                     file=sys.stderr)
-                resource = self.find_resource(fulltext_url['pdf_url'], best_mimetype)
-                if not resource.hit:
-                    result['status'] = resource.status
+                next_url = fulltext_url['pdf_url']
+                if next_url in hops:
+                    result['status'] = 'link-loop'
+                    result['error_message'] = "repeated: {}".format(next_url)
                     return result
-                file_meta = gen_file_metadata(resource.body)
-        except SavePageNowError as e:
-            result['status'] = 'spn-error'
-            result['error_message'] = str(e)
-            return result
-        except PetaboxError as e:
-            result['status'] = 'petabox-error'
-            result['error_message'] = str(e)
-            return result
-        except CdxApiError as e:
-            result['status'] = 'cdx-error'
-            result['error_message'] = str(e)
-            return result
-        except WaybackError as e:
-            result['status'] = 'wayback-error'
-            result['error_message'] = str(e)
+                hops.append(next_url)
+                continue
+            
+            # default is to NOT keep hopping
+            break
+
+        if len(hops) >= self.max_hops:
+            result['status'] = "max-hops-exceeded"
             return result
 
         if resource.terminal_dt:
@@ -201,11 +224,12 @@ class IngestFileWorker(SandcrawlerWorker):
                 "terminal_status_code": resource.terminal_status_code,
             }
 
-        # must be a hit if we got this far
+        # fetch must be a hit if we got this far (though not necessarily an ingest hit!)
         assert resource.hit == True
         assert resource.terminal_status_code == 200
 
         result['file_meta'] = file_meta
+        result['cdx'] = cdx_to_dict(resource.cdx)
 
         # other failure cases
         if not resource.body or file_meta['size_bytes'] == 0:
@@ -221,7 +245,6 @@ class IngestFileWorker(SandcrawlerWorker):
 
         result['status'] = "success"
         result['hit'] = True
-        result['cdx'] = cdx_to_dict(resource.cdx)
         return result
 
 
diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py
index 8692b21..f5599e9 100644
--- a/python/tests/test_ingest.py
+++ b/python/tests/test_ingest.py
@@ -109,12 +109,19 @@ def test_ingest_landing(ingest_worker):
         headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
         body=WARC_BODY)
 
+    # this is for second time around; don't want to fetch same landing page
+    # HTML again and result in a loop
+    responses.add(responses.GET,
+        'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"),
+        status=200,
+        headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
+        body="<html></html>")
+
     resp = ingest_worker.process(request)
 
     print(resp)
     assert resp['hit'] == False
-    assert resp['status'] == "wrong-mimetype"
+    assert resp['status'] == "no-pdf-link"
     assert resp['request'] == request
     assert 'grobid' not in resp
-    assert resp['terminal']
author	Bryan Newbold <bnewbold@archive.org>	2020-01-09 17:31:08 -0800
committer	Bryan Newbold <bnewbold@archive.org>	2020-01-09 17:31:08 -0800
commit	24185837a47f305757a5c783b95ca25b709f66e3 (patch)
tree	e71e179e93932ad04ba14dbc6308d3e4deb3eeb7 /python
parent	00cf33a1c230c8ce5dcda41aba5dcc6a88264d46 (diff)
download	sandcrawler-24185837a47f305757a5c783b95ca25b709f66e3.tar.gz sandcrawler-24185837a47f305757a5c783b95ca25b709f66e3.zip