import sys
import json
from http.server import BaseHTTPRequestHandler, HTTPServer

from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict
from sandcrawler.grobid import GrobidClient
from sandcrawler.misc import gen_file_metadata
from sandcrawler.html import extract_fulltext_url
from sandcrawler.workers import SandcrawlerWorker


class IngestFileWorker(SandcrawlerWorker):
    """
    High-level flow is to look in history first, then go to the live web if
    the resource is not found. Following redirects is treated as "fetching a
    resource". The current version fetches a single resource; if it isn't a
    hit but is an HTML 200, it is treated as a landing page, and we try to
    extract a fulltext link and fetch that resource.

        process(request) -> response
            Does all the things!

    Check existing processing (short circuit):

        check_existing_ingest(base_url) -> ingest_file_result or none
        process_existing(result) -> response
            Try fetching all the rows we want. If any don't exist, fetch the
            resource itself and call process_hit().

    Fetch resource:

        find_resource(url) -> ResourceResult

    Process resource:

        process_hit(ResourceResult) -> response
        process_grobid(ResourceResult)
    """

    def __init__(self, sink=None, **kwargs):
        super().__init__()

        self.sink = sink
        self.wayback_client = kwargs.get('wayback_client')
        if not self.wayback_client:
            self.wayback_client = WaybackClient()
        self.spn_client = kwargs.get('spn_client')
        if not self.spn_client:
            self.spn_client = SavePageNowClient()
        self.grobid_client = kwargs.get('grobid_client')
        if not self.grobid_client:
            self.grobid_client = GrobidClient()

        self.try_existing_ingest = False
        self.try_wayback = True
        self.try_spn2 = True

    def check_existing_ingest(self, base_url):
        """
        Check in sandcrawler-db (postgres) to see if we have already ingested
        this URL (ingest file result table).

        Returns the existing row *if* found *and* we should use it, otherwise
        None.

        Looks at existing ingest results and makes a decision based on, eg,
        status and timestamp.
        """
        if not self.try_existing_ingest:
            return None
        raise NotImplementedError

        # this "return True" is just here to make pylint happy
        return True

    def find_resource(self, url, best_mimetype=None):
        """
        Looks in wayback for a resource starting at the URL, following any
        redirects. If a hit isn't found, tries crawling with SPN.
        """
        via = "none"
        resource = None
        if self.try_wayback:
            via = "wayback"
            resource = self.wayback_client.lookup_resource(url, best_mimetype)
        # fall back to Save Page Now if wayback didn't return a hit
        if self.try_spn2 and (not resource or not resource.hit):
            via = "spn2"
            resource = self.spn_client.crawl_resource(url, self.wayback_client)
        print("[FETCH {}\t] {}\turl:{}".format(
                via,
                resource.status,
                url),
            file=sys.stderr)
        return resource

    def process_existing(self, request, result_row):
        """
        If we have an existing ingest file result, do any database fetches or
        additional processing necessary to return a result.
        """
        result = {
            'hit': result_row.hit,
            'status': result_row.status,
            'request': request,
        }
        # TODO: fetch file_meta
        # TODO: fetch grobid
        return result

    def process_hit(self, resource, file_meta):
        """
        Run all the necessary processing for a new/fresh ingest hit.
        """
        return {
            'grobid': self.process_grobid(resource),
        }

    def process_grobid(self, resource):
        """
        Submits the resource body to GROBID for processing.

        TODO: By default, check sandcrawler-db for an existing row first, then
        decide if we should re-process.

        TODO: Code to push to Kafka might also go here?
        """
        result = self.grobid_client.process_fulltext(resource.body)
        if result['status'] == "success":
            metadata = self.grobid_client.metadata(result)
            if metadata:
                result.update(metadata)
        # the full TEI-XML body is too large to return inline
        result.pop('tei_xml', None)
        return result

    def process(self, request):

        # for now, only pdf ingest is implemented
        assert request.get('ingest_type') == "pdf"
        ingest_type = request.get('ingest_type')
        base_url = request['base_url']

        best_mimetype = None
        if ingest_type == "pdf":
            best_mimetype = "application/pdf"

        existing = self.check_existing_ingest(base_url)
        if existing:
            return self.process_existing(request, existing)

        result = dict(request=request, hit=False)
        try:
            # first hop
            resource = self.find_resource(base_url, best_mimetype)
            if not resource.hit:
                result['status'] = resource.status
                return result
            file_meta = gen_file_metadata(resource.body)

            if "html" in file_meta['mimetype']:
                # got a landing page, try another hop
                fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)

                result['html'] = fulltext_url
                if not fulltext_url or 'pdf_url' not in fulltext_url:
                    result['status'] = 'no-pdf-link'
                    return result
                print("\tlanding page URL extracted ({}): {}".format(
                        fulltext_url.get('technique'),
                        fulltext_url['pdf_url'],
                    ),
                    file=sys.stderr)
                resource = self.find_resource(fulltext_url['pdf_url'], best_mimetype)
                if not resource.hit:
                    result['status'] = resource.status
                    return result
                file_meta = gen_file_metadata(resource.body)
        except SavePageNowError as e:
            result['status'] = 'spn-error'
            result['error_message'] = str(e)
            return result
        except PetaboxError as e:
            result['status'] = 'petabox-error'
            result['error_message'] = str(e)
            return result
        except CdxApiError as e:
            result['status'] = 'cdx-error'
            result['error_message'] = str(e)
            return result
        except WaybackError as e:
            result['status'] = 'wayback-error'
            result['error_message'] = str(e)
            return result

        if resource.terminal_dt:
            result['terminal'] = {
                "terminal_url": resource.terminal_url,
                "terminal_dt": resource.terminal_dt,
                "terminal_status_code": resource.terminal_status_code,
            }

        # must be a hit if we got this far
        assert resource.hit
        assert resource.terminal_status_code == 200

        result['file_meta'] = file_meta

        # other failure cases
        if not resource.body or file_meta['size_bytes'] == 0:
            result['status'] = 'null-body'
            return result

        if not (resource.hit and file_meta['mimetype'] == "application/pdf"):
            result['status'] = "wrong-mimetype"  # formerly: "other-mimetype"
            return result

        info = self.process_hit(resource, file_meta)
        result.update(info)
        result['status'] = "success"
        result['hit'] = True
        result['cdx'] = cdx_to_dict(resource.cdx)
        return result


class IngestFileRequestHandler(BaseHTTPRequestHandler):

    def do_POST(self):
        if self.path != "/ingest":
            self.send_response(404)
            self.end_headers()
            # wfile expects bytes, not str
            self.wfile.write(b"404: Not Found")
            return
        length = int(self.headers.get('content-length'))
        request = json.loads(self.rfile.read(length).decode('utf-8'))
        print("Got request: {}".format(request))
        ingester = IngestFileWorker()
        result = ingester.process(request)
        self.send_response(200)
        self.end_headers()
        self.wfile.write(json.dumps(result).encode('utf-8'))