## fatcat-changelog pipeline

Overall ingest status, past 30 days:

    SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*)
    FROM ingest_file_result
    LEFT JOIN ingest_request
        ON ingest_file_result.ingest_type = ingest_request.ingest_type
        AND ingest_file_result.base_url = ingest_request.base_url
    WHERE ingest_request.created >= NOW() - '30 day'::INTERVAL
        AND ingest_request.ingest_type = 'pdf'
        AND ingest_request.ingest_request_source = 'fatcat-changelog'
    GROUP BY ingest_file_result.ingest_type, ingest_file_result.status
    ORDER BY COUNT DESC
    LIMIT 20;

Broken domains, past 30 days:

    SELECT domain, status, COUNT((domain, status))
    FROM (
        SELECT
            ingest_file_result.ingest_type,
            ingest_file_result.status,
            substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
        FROM ingest_file_result
        LEFT JOIN ingest_request
            ON ingest_file_result.ingest_type = ingest_request.ingest_type
            AND ingest_file_result.base_url = ingest_request.base_url
        WHERE
            -- ingest_request.created >= NOW() - '3 day'::INTERVAL
            ingest_file_result.updated >= NOW() - '30 day'::INTERVAL
            AND ingest_request.ingest_type = 'pdf'
            AND ingest_request.ingest_request_source = 'fatcat-changelog'
    ) t1
    WHERE t1.domain != ''
        AND t1.status != 'success'
    GROUP BY domain, status
    ORDER BY COUNT DESC
    LIMIT 25;

Summary of significant domains and status, past 7 days:

    SELECT domain, status, count
    FROM (
        SELECT domain, status, COUNT((domain, status)) as count
        FROM (
            SELECT
                ingest_file_result.ingest_type,
                ingest_file_result.status,
                substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
            FROM ingest_file_result
            LEFT JOIN ingest_request
                ON ingest_file_result.ingest_type = ingest_request.ingest_type
                AND ingest_file_result.base_url = ingest_request.base_url
            WHERE
                ingest_file_result.updated >= NOW() - '7 day'::INTERVAL
                AND ingest_request.ingest_type = 'pdf'
                AND ingest_request.ingest_request_source = 'fatcat-changelog'
        ) t1
        WHERE t1.domain != ''
        GROUP BY CUBE (domain, status)
    ) t2
    WHERE count > 200
    ORDER BY domain ASC , count DESC;

Summary of DOI prefix and status, past 7 days:

    SELECT doi_prefix, status, count
    FROM (
        SELECT doi_prefix, status, COUNT((doi_prefix, status)) as count
        FROM (
            SELECT
                ingest_file_result.ingest_type,
                ingest_file_result.status,
                substring(ingest_request.link_source_id FROM '(10\.[^/]*)/.*') AS doi_prefix
            FROM ingest_file_result
            LEFT JOIN ingest_request
                ON ingest_file_result.ingest_type = ingest_request.ingest_type
                AND ingest_file_result.base_url = ingest_request.base_url
            WHERE
                ingest_file_result.updated >= NOW() - '7 day'::INTERVAL
                AND ingest_request.ingest_type = 'pdf'
                AND ingest_request.ingest_request_source = 'fatcat-changelog'
                AND ingest_request.link_source = 'doi'
        ) t1
        WHERE t1.doi_prefix != ''
        GROUP BY CUBE (doi_prefix, status)
    ) t2
    WHERE count > 200
    ORDER BY doi_prefix ASC , count DESC;


Throughput per day, and success, for past 30 days:

    SELECT ingest_request.ingest_type,
           date(ingest_request.created),
           COUNT(*) as total,
           COUNT(CASE ingest_file_result.status WHEN 'success' THEN 1 ELSE null END) as success
    FROM ingest_file_result
    LEFT JOIN ingest_request
        ON ingest_file_result.ingest_type = ingest_request.ingest_type
        AND ingest_file_result.base_url = ingest_request.base_url
    WHERE ingest_request.created >= NOW() - '1 month'::INTERVAL
        AND ingest_request.ingest_type = 'pdf'
        AND ingest_request.ingest_request_source = 'fatcat-changelog'
    GROUP BY ingest_request.ingest_type, ingest_file_result.ingest_type, date(ingest_request.created)
    ORDER BY date(ingest_request.created) DESC;

## fatcat-ingest

Broken domains, past 7 days:

    SELECT domain, status, COUNT((domain, status))
    FROM (
        SELECT
            ingest_file_result.ingest_type,
            ingest_file_result.status,
            substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
        FROM ingest_file_result
        LEFT JOIN ingest_request
            ON ingest_file_result.ingest_type = ingest_request.ingest_type
            AND ingest_file_result.base_url = ingest_request.base_url
        WHERE
            -- ingest_request.created >= NOW() - '7 day'::INTERVAL
            ingest_file_result.updated >= NOW() - '24 hour'::INTERVAL
            AND ingest_request.ingest_type = 'pdf'
            AND ingest_request.ingest_request_source = 'fatcat-ingest'
    ) t1
    WHERE t1.domain != ''
        AND t1.status != 'success'
    GROUP BY domain, status
    ORDER BY COUNT DESC
    LIMIT 25;

Throughput per day, and success, for past 7 days:

    SELECT ingest_request.ingest_type,
           date(ingest_file_result.updated),
           COUNT(*) as total,
           COUNT(CASE ingest_file_result.status WHEN 'success' THEN 1 ELSE null END) as success
    FROM ingest_file_result
    LEFT JOIN ingest_request
        ON ingest_file_result.ingest_type = ingest_request.ingest_type
        AND ingest_file_result.base_url = ingest_request.base_url
    WHERE
        -- ingest_request.created >= NOW() - '7 day'::INTERVAL
        ingest_file_result.updated >= NOW() - '24 hour'::INTERVAL
        AND ingest_request.ingest_type = 'pdf'
        AND ingest_request.ingest_request_source = 'fatcat-ingest'
    GROUP BY ingest_request.ingest_type, ingest_file_result.ingest_type, date(ingest_file_result.updated)
    ORDER BY date(ingest_file_result.updated) DESC;

Overall status, updated requests past 3 days:

    SELECT ingest_request.ingest_type,
           ingest_file_result.status,
           COUNT(*)
    FROM ingest_file_result
    LEFT JOIN ingest_request
        ON ingest_file_result.ingest_type = ingest_request.ingest_type
        AND ingest_file_result.base_url = ingest_request.base_url
    WHERE
        -- ingest_file_result.updated >= NOW() - '3 day'::INTERVAL
        ingest_file_result.updated >= NOW() - '48 hour'::INTERVAL
        AND ingest_request.ingest_type = 'pdf'
        AND ingest_request.ingest_request_source = 'fatcat-ingest'
    GROUP BY ingest_request.ingest_type, ingest_file_result.status
    ORDER BY COUNT(*) DESC;