#!/usr/bin/env python3
"""
Helpers to create Web Capture entities from extracted wayback content.
Works as a stand-alone script (for debugging) or as library routines.
"""
import sys
import json
import hashlib
import requests
import datetime
import argparse
import subprocess
from bs4 import BeautifulSoup
from fatcat_openapi_client import *
from .common import b32_hex
# Endpoint that lookup_cdx() queries for CDX index rows
CDX_API_BASE = "https://web.archive.org/cdx/search/cdx"
# Prefix of wayback replay URLs; lookup_cdx() appends "/<timestamp>id_/<url>"
# to it when re-fetching a capture for hash verification
GWB_URL_BASE = "https://web.archive.org/web"
# One requests session, created at import time and shared by fetch_wbm() and
# lookup_cdx()
REQ_SESSION = requests.Session()
def parse_wbm_url(url):
    """Splits a wayback machine URL into its capture timestamp and target.

    Returns a tuple:
        (timestamp, datetime, original_url)
    where `timestamp` is the raw path segment following "/web/", `datetime`
    is that same segment passed through parse_wbm_timestamp(), and
    `original_url` is everything after the timestamp segment.

    Fails with an AssertionError unless the URL looks like
    "<scheme>://web.archive.org/web/<timestamp>/<original_url>".
    """
    parts = url.split('/')
    assert len(parts) >= 6
    # parts[0] is the scheme ("http:") and parts[1] the empty string between
    # the two slashes; the interesting pieces start at index 2
    host, prefix, raw_timestamp = parts[2:5]
    assert host == 'web.archive.org'
    assert prefix == 'web'
    # the archived URL contains slashes of its own, so glue it back together
    original_url = '/'.join(parts[5:])
    return (raw_timestamp, parse_wbm_timestamp(raw_timestamp), original_url)
def test_parse_wbm_url():
    """Checks that a full wayback URL is split into its three components."""
    wayback_url = "http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html"
    expected = (
        "20010712114837",
        datetime.datetime(2001, 7, 12, 11, 48, 37),
        "http://www.dlib.org/dlib/june01/reich/06reich.html",
    )
    assert parse_wbm_url(wayback_url) == expected
def parse_wbm_timestamp(timestamp):
    """
    Converts a complete WBM timestamp string (like "20020327115625") into a
    python datetime object. Wayback timestamps are UTC, but the returned
    object is naive: no tzinfo is attached to it.

    A trailing three-character modifier ending in "_" (like "im_" or "id_")
    is dropped first. Anything other than a 14-character timestamp fails
    with an AssertionError.
    """
    # modifiers are two letters plus an underscore, hence three characters
    if timestamp.endswith('_'):
        timestamp = timestamp[:-3]
    # inflexible; require the full second-precision timestamp
    assert len(timestamp) == 14
    # slice boundaries of the fields: YYYY MM DD hh mm ss
    bounds = (0, 4, 6, 8, 10, 12, 14)
    fields = [int(timestamp[lo:hi]) for lo, hi in zip(bounds, bounds[1:])]
    return datetime.datetime(*fields)
def test_parse_wbm_timestamp():
    """Checks that a bare 14-digit timestamp maps to the matching datetime."""
    expected = datetime.datetime(2001, 7, 12, 11, 48, 37)
    parsed = parse_wbm_timestamp("20010712114837")
    assert parsed == expected
def fetch_wbm(url):
    """
    Fetches `url` with the shared module session and returns the response
    body as bytes.

    An HTTP error status is raised via raise_for_status(); an empty body
    fails with an AssertionError.
    """
    response = REQ_SESSION.get(url)
    response.raise_for_status()
    body = response.content
    assert body
    return body
def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None):
    """
    Asks the wayback CDX API for the capture closest to a relative wayback
    URL and wraps the resulting row in a WebcaptureCdxLine.

    embed_url: relative wayback URL of the form
        "/web/<timestamp>/<original url>"; the timestamp may carry a
        three-character modifier such as "im_" or "id_", which is dropped
    verify_hashes: when true, the raw capture is fetched again and its SHA-1
        compared against the CDX row; SHA-256 and size are then filled in
    cdx_output: optional writable text file object; the raw CDX line is
        written to it (plus a newline)

    Returns a WebcaptureCdxLine, or None if the CDX API returned no row.

    Raises AssertionError if embed_url does not start with "/web/" or if the
    fetched body does not match the SHA-1 from the CDX row. HTTP error
    statuses are raised by raise_for_status().
    """
    # progress/debug trace of every URL that gets looked up
    sys.stderr.write(embed_url + "\n")
    assert embed_url.startswith('/web/')
    # segments: ['', 'web', '<timestamp>', <pieces of the original url>...]
    segments = embed_url.split('/')
    timestamp = segments[2]
    if timestamp.endswith('_'):
        timestamp = timestamp[:-3]
    url = '/'.join(segments[3:])
    resp = REQ_SESSION.get(CDX_API_BASE, params=dict(
        url=url,
        closest=timestamp,
        sort="closest",
        resolveRevisits="true",
        matchType="exact",
        limit=1,
    ))
    resp.raise_for_status()
    if not resp.content:
        return None
    # only the first row is of interest
    hit = resp.content.decode('utf-8').split('\n')[0]
    if not hit.strip():
        # body had bytes but no actual row (e.g. just a newline); treat it
        # like an empty response instead of failing on a missing field below
        return None
    if cdx_output:
        cdx_output.write(hit + "\n")
    # space-separated fields; "-" and empty fields both mean "no value"
    cdx = [x if (x and x != '-') else None for x in hit.split(' ')]
    webcapture_cdx = WebcaptureCdxLine(
        surt=cdx[0],
        timestamp=parse_wbm_timestamp(cdx[1]).isoformat() + "Z",
        url=cdx[2],
        mimetype=cdx[3],
        status_code=(cdx[4] and int(cdx[4])) or None,
        sha1=b32_hex(cdx[5]),
        sha256=None,
    )
    if verify_hashes:
        # "id_" after the timestamp requests the capture through the id_
        # replay variant, so the bytes can be hashed
        raw_resp = REQ_SESSION.get(GWB_URL_BASE + "/{}id_/{}".format(
            cdx[1], # raw timestamp
            webcapture_cdx.url))
        raw_resp.raise_for_status()
        body = raw_resp.content
        # Explicit check rather than an `assert` statement, so verification
        # the caller asked for is not skipped under `python -O`. The
        # exception type is kept for existing callers.
        if webcapture_cdx.sha1 != hashlib.sha1(body).hexdigest():
            raise AssertionError(
                "SHA-1 of fetched capture does not match CDX row: {}".format(
                    webcapture_cdx.url))
        webcapture_cdx.sha256 = hashlib.sha256(body).hexdigest()
        webcapture_cdx.size = len(body)
    return webcapture_cdx
def wayback_url_to_relative(url):
    """
    Wayback URLs can be relative or absolute in rewritten documents. This
    function converts any form of rewritten URL to a relative (to
    web.archive.org) one, or returns None if it isn't a rewritten URL at all.

    Accepted forms are "https://web.archive.org/web/...",
    "http://web.archive.org/web/...", the protocol-relative
    "//web.archive.org/web/..." and the already-relative "/web/...". The
    return value always starts with "/web/".
    """
    absolute_prefixes = (
        'https://web.archive.org/',
        'http://web.archive.org/',
        '//web.archive.org/',
    )
    for prefix in absolute_prefixes:
        if url.startswith(prefix):
            # cut off scheme and host, but keep the slash that ends the prefix
            url = url[len(prefix) - 1:]
            break
    if url.startswith('/web/'):
        return url
    return None
def extract_embeds(soup):
    embeds = set()
    # 
    for tag in soup.find_all('link', href=True):
        if tag['rel'] not in ('stylesheet',):
            continue
        url = wayback_url_to_relative(tag['href'])
        if url:
            embeds.add(url)
    # 
    for tag in soup.find_all('img', src=True):
        url = wayback_url_to_relative(tag['src'])
        if url:
            embeds.add(url)
    #