author     Bryan Newbold <bnewbold@archive.org>    2018-04-03 16:30:19 -0700
committer  Bryan Newbold <bnewbold@archive.org>    2018-04-03 16:30:19 -0700
commit     5ea0ef0fd34f09fab51f51e2f1dbe5cf6ec137cc (patch)
tree       6198a1c156d6922c02f12964d5e9a30d71677dcc /extraction/extraction.py
parent     f10e8b798c8611d316279aa0afb158ee14691236 (diff)
download   sandcrawler-5ea0ef0fd34f09fab51f51e2f1dbe5cf6ec137cc.tar.gz
           sandcrawler-5ea0ef0fd34f09fab51f51e2f1dbe5cf6ec137cc.zip
WIP on extractor-with-mrjob
Diffstat (limited to 'extraction/extraction.py')
-rw-r--r--  extraction/extraction.py  52
1 file changed, 52 insertions, 0 deletions
diff --git a/extraction/extraction.py b/extraction/extraction.py
new file mode 100644
index 0000000..cdca433
--- /dev/null
+++ b/extraction/extraction.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+
+import io
+import sys
+import requests
+#import happybase
+import mrjob
+from mrjob.job import MRJob
+from wayback.resource import Resource
+from wayback.resource import ArcResource
+from wayback.resourcestore import ResourceStore
+from gwb.loader import CDXLoaderFactory
+
+
+def process_pdf_using_grobid(content_buffer, debug_line):
+ """Query GrobId server & process response
+ """
+    GROBID_SERVER = "http://wbgrp-svc096.us.archive.org:8070"
+ content = content_buffer.read()
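+    # POST the raw PDF bytes to GROBID's fulltext extraction endpoint as a multipart file upload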
+ r = requests.post(GROBID_SERVER + "/api/processFulltextDocument",
+ files={'input': content})
+    if r.status_code != 200:
+ print("FAIL (Grobid: {}): {}".format(r.content.decode('utf8'), debug_line))
+ else:
+ print("SUCCESS: " + debug_line)
+
+class Cdx_Record_Pipeline(object):
+
+    def read_cdx_and_parse(self, parser_func, accepted_mimes=[]):
+        """Read CDX lines from stdin and run parser_func on each record with an accepted mimetype.
+        """
+ rstore = ResourceStore(loaderfactory=CDXLoaderFactory())
+ for line in sys.stdin:
+ line = line.rstrip()
+ cdx_line = line.split()
+            # input lines carry a leading line-number/offset field plus 11 CDX fields
+ if len(cdx_line) != 12:
+ continue
+ cdx_line = cdx_line[1:]
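+            # remaining CDX-11 fields: urlkey, timestamp, original URL, mimetype, status, digest, redirect, meta, length, offset, filename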
+ (src_url, timestamp, mime, record_location, record_offset, record_length) = (cdx_line[2], cdx_line[1], cdx_line[3], cdx_line[-1], cdx_line[-2], cdx_line[-3])
+ if '-' == record_length or not record_location.endswith('arc.gz') or mime not in accepted_mimes:
+ continue
+ orig_url = cdx_line[2]
+ debug_line = ' '.join(cdx_line)
+ try:
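+                # build an archive.org download URL for the (W)ARC file and load the record by offset/length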
+ record_location = 'http://archive.org/download/' + record_location
+ record_offset = int(record_offset)
+ record_length = int(record_length)
+ resource_data = rstore.load_resource(record_location, record_offset, record_length)
+ parser_func(resource_data.open_raw_content(), debug_line)
+            except Exception:
+                # skip records that fail to load or parse
+                continue
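
A minimal usage sketch (an assumption, not part of this commit): with CDX lines piped in on stdin and the GROBID endpoint above reachable, the pipeline could be driven along these lines. The module name "extraction", the driver filename, and the 'application/pdf' mime filter are illustrative choices.

    #!/usr/bin/env python3
    # hypothetical driver, assuming extraction.py is importable as "extraction"
    from extraction import Cdx_Record_Pipeline, process_pdf_using_grobid

    # only records whose CDX mimetype is listed get fetched and sent to GROBID
    pipeline = Cdx_Record_Pipeline()
    pipeline.read_cdx_and_parse(process_pdf_using_grobid, accepted_mimes=['application/pdf'])

It would then be invoked as, for example, zcat some.cdx.gz | python3 driver.py.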