extraction/extraction.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52

#!/usr/bin/env python3

import io
import sys
import requests
#import happybase
import mrjob
from mrjob.job import MRJob
from wayback.resource import Resource
from wayback.resource import ArcResource
from wayback.resourcestore import ResourceStore
from gwb.loader import CDXLoaderFactory


def process_pdf_using_grobid(content_buffer, debug_line):
    """Send a document to the GROBID server and report the outcome on stdout.

    Args:
        content_buffer: File-like object; its entire content is read and
            posted as the ``input`` file field of the request.
        debug_line: Text included in the printed status line so the record
            can be identified.

    Prints ``SUCCESS: <debug_line>`` when the server answers HTTP 200,
    otherwise ``FAIL (Grobid: <response body>): <debug_line>``.
    """
    GROBID_SERVER = "http://wbgrp-svc096.us.archive.org:8070"
    content = content_buffer.read()
    r = requests.post(GROBID_SERVER + "/api/processFulltextDocument",
                      files={'input': content})
    # Compare by value: `is not 200` tests object identity, which only
    # happens to work because CPython caches small ints.
    if r.status_code != 200:
        # Decode leniently so a non-UTF-8 error body still yields a FAIL line
        # instead of raising while reporting the failure.
        body = r.content.decode('utf8', errors='replace')
        print("FAIL (Grobid: {}): {}".format(body, debug_line))
    else:
        print("SUCCESS: " + debug_line)

class Cdx_Record_Pipeline(object):
    """Feed archive records listed in CDX lines on stdin to a parser."""

    def read_cdx_and_parse(self, parser_func, accepted_mimes=()):
        """Read in CDX lines and process matching records fetched over HTTP.

        Each stdin line is expected to hold 12 whitespace-separated fields:
        a leading NLine offset (discarded) followed by 11 CDX fields. Lines
        with any other field count are skipped, as are records whose length
        is ``-``, whose location does not end in ``arc.gz``, or whose mime
        type is not in ``accepted_mimes``.

        Args:
            parser_func: Callable invoked as
                ``parser_func(raw_content, debug_line)``, where
                ``raw_content`` is the result of ``open_raw_content()`` on
                the loaded resource and ``debug_line`` is the CDX line
                (without the NLine offset) joined by single spaces.
            accepted_mimes: Container of mime-type strings to process. The
                default is empty, so no record is processed unless the
                caller supplies one.

        Errors while loading or parsing a single record are reported on
        stderr and the record is skipped; processing continues with the
        next line.
        """
        rstore = ResourceStore(loaderfactory=CDXLoaderFactory())
        for line in sys.stdin:
            fields = line.split()
            # One NLine offset plus 11 CDX fields; anything else is ignored.
            if len(fields) != 12:
                continue
            # Drop the NLine offset.
            cdx_line = fields[1:]
            mime = cdx_line[3]
            record_length, record_offset, record_location = cdx_line[-3:]
            if (record_length == '-'
                    or not record_location.endswith('arc.gz')
                    or mime not in accepted_mimes):
                continue
            debug_line = ' '.join(cdx_line)
            try:
                resource_data = rstore.load_resource(
                    'http://archive.org/download/' + record_location,
                    int(record_offset),
                    int(record_length))
                parser_func(resource_data.open_raw_content(), debug_line)
            except Exception as err:
                # Best-effort: one bad record must not stop the run, but
                # KeyboardInterrupt/SystemExit are no longer swallowed and
                # the failure is made visible instead of silently dropped.
                print("SKIP ({!r}): {}".format(err, debug_line),
                      file=sys.stderr)
                continue