diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-03-29 20:16:05 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-03-29 20:16:33 -0700 |
commit | 7c81b7bea3d670876faff1eb290c40656697dddb (patch) | |
tree | 4d3413d98089d56fa50de75f0f9c7ea310f02ce4 /mapreduce/cdx-record-pipeline/cdx-record-pipeline.py | |
parent | d2203182c9ed6e1ff13fa70fb25f049ef87c75a0 (diff) | |
download | sandcrawler-7c81b7bea3d670876faff1eb290c40656697dddb.tar.gz sandcrawler-7c81b7bea3d670876faff1eb290c40656697dddb.zip |
move to top level
Diffstat (limited to 'mapreduce/cdx-record-pipeline/cdx-record-pipeline.py')
-rwxr-xr-x | mapreduce/cdx-record-pipeline/cdx-record-pipeline.py | 67 |
1 file changed, 0 insertions, 67 deletions
#!./cdx-record-pipeline-venv/bin/python
'''
GROBID PDF Pipeline Test

Read CDX lines from stdin and query a GROBID server for each PDF resource.
TODO: Testing / HBase integration
- Bryan will update as needed
'''
import os
import re
import sys
import base64
import hashlib
import urllib
import urlparse
import string

from wayback.resource import Resource
from wayback.resource import ArcResource
from wayback.resourcestore import ResourceStore
from gwb.loader import CDXLoaderFactory
from StringIO import StringIO
import requests

# GROBID service endpoint. Hoisted to module level so it is not re-bound on
# every call; the processing function queries it once per PDF record.
GROBID_SERVER = "http://wbgrp-svc096.us.archive.org:8070"


def process_pdf_using_grobid(content_buffer, debug_line):
    """Send one PDF body to the GROBID server and print the outcome.

    Args:
        content_buffer: file-like object yielding the raw PDF bytes
            (e.g. the result of ``resource_data.open_raw_content()``).
        debug_line: the originating CDX line, echoed in the status output
            so failures can be traced back to their record.
    """
    content = content_buffer.read()
    r = requests.post(GROBID_SERVER + "/api/processFulltextDocument",
                      files={'input': content})
    # BUG FIX: the original used ``r.status_code is not 200`` — an identity
    # comparison against an int, which is implementation-dependent and not a
    # value check. Use ``!=`` for the HTTP status comparison.
    if r.status_code != 200:
        print("FAIL (Grobid: {}): {}".format(r.content.decode('utf8'), debug_line))
    else:
        print("SUCCESS: " + debug_line)


class Cdx_Record_Pipeline(object):

    def read_cdx_and_parse(self, parser_func, accepted_mimes=None):
        """Read CDX lines from stdin and run ``parser_func`` on each record.

        Only 12-field CDX lines whose mime type is in ``accepted_mimes`` and
        whose WARC/ARC location ends in ``arc.gz`` are fetched; everything
        else is skipped. Fetch/parse errors for a single record are ignored
        (best-effort batch processing) and the loop moves on.

        Args:
            parser_func: callable ``(content_buffer, debug_line)`` invoked
                with the raw record content for each accepted record.
            accepted_mimes: list of mime-type strings to process; defaults
                to an empty list (nothing accepted).
        """
        # BUG FIX: the original signature used a mutable default argument
        # (``accepted_mimes = []``); use None and normalize here instead.
        if accepted_mimes is None:
            accepted_mimes = []
        rstore = ResourceStore(loaderfactory=CDXLoaderFactory())
        for line in sys.stdin:
            cdx_line = line.rstrip().split()
            # Ignoring NLine offset: expect 12 fields, then drop the first.
            if len(cdx_line) != 12:
                continue
            cdx_line = cdx_line[1:]
            (src_url, timestamp, mime, record_location, record_offset, record_length) = (
                cdx_line[2], cdx_line[1], cdx_line[3], cdx_line[-1], cdx_line[-2], cdx_line[-3])
            if '-' == record_length or not record_location.endswith('arc.gz') or mime not in accepted_mimes:
                continue
            debug_line = ' '.join(cdx_line)
            try:
                record_location = 'http://archive.org/download/' + record_location
                record_offset = int(record_offset)
                record_length = int(record_length)
                resource_data = rstore.load_resource(record_location, record_offset, record_length)
                parser_func(resource_data.open_raw_content(), debug_line)
            # BUG FIX: the original bare ``except:`` also swallowed
            # KeyboardInterrupt/SystemExit; narrow to Exception while keeping
            # the deliberate skip-and-continue behavior for bad records.
            except Exception:
                continue


# main()
# _______________________________________________________________________________
if __name__ == '__main__':
    cdx_record_pipeline = Cdx_Record_Pipeline()
    cdx_record_pipeline.read_cdx_and_parse(
        process_pdf_using_grobid, ['application/pdf', 'application/x-pdf'])