 notes/cleanups/case_sensitive_dois.md                  |  71 +
 notes/cleanups/file_release_ingest_bug.md              | 192 +
 notes/cleanups/scripts/fetch_full_cdx_ts.py            | 201 +
 notes/cleanups/wayback_timestamps.md                   | 291 +
 python/fatcat_tools/cleanups/file_release_bugfix.py    | 241 +
 python/fatcat_tools/cleanups/file_short_wayback_ts.py  | 344 +
 python/fatcat_tools/cleanups/release_lowercase_doi.py  | 154 +
 python/fatcat_tools/importers/common.py                |   9 +
 python/fatcat_web/types.py                             |   2 +-
 9 files changed, 1504 insertions(+), 1 deletion(-)
| diff --git a/notes/cleanups/case_sensitive_dois.md b/notes/cleanups/case_sensitive_dois.md new file mode 100644 index 00000000..1bf1901e --- /dev/null +++ b/notes/cleanups/case_sensitive_dois.md @@ -0,0 +1,71 @@ + +Relevant github issue: https://github.com/internetarchive/fatcat/issues/83 + +How many existing fatcat releases have a non-lowercase DOI? As of June 2021: + +    zcat release_extid.tsv.gz | cut -f3 | rg '[A-Z]' | pv -l | wc -l +    139964 + +## Prep + +    wget https://archive.org/download/fatcat_bulk_exports_2021-11-05/release_extid.tsv.gz + +    # scratch:bin/fcid.py is roughly the same as `fatcat_util.py uuid2fcid` + +    zcat release_extid.tsv.gz \ +        | cut -f1,3 \ +        | rg '[A-Z]' \ +        | /fast/scratch/bin/fcid.py \ +        | pv -l \ +        > nonlowercase_doi_releases.tsv +    # 140k 0:03:54 [ 599 /s] + +    wc -l nonlowercase_doi_releases.tsv +    140530 nonlowercase_doi_releases.tsv + +Uhoh, there are ~500 more than previously? Guess those are from after the fix? + +Create a sample for testing: + +    shuf -n10000 nonlowercase_doi_releases.tsv \ +        > nonlowercase_doi_releases.10k_sample.tsv + +## Test in QA + +In pipenv: + +    export FATCAT_AUTH_WORKER_CLEANUP=[...] + +    head -n100 /srv/fatcat/datasets/nonlowercase_doi_releases.10k_sample.tsv \ +        | python -m fatcat_tools.cleanups.release_lowercase_doi - +    # Counter({'total': 100, 'update': 100, 'skip': 0, 'insert': 0, 'exists': 0}) + +    head -n100 /srv/fatcat/datasets/nonlowercase_doi_releases.10k_sample.tsv \ +        | python -m fatcat_tools.cleanups.release_lowercase_doi - +    # Counter({'total': 100, 'skip-existing-doi-fine': 100, 'skip': 0, 'insert': 0, 'update': 0, 'exists': 0}) + +    head -n2000 /srv/fatcat/datasets/nonlowercase_doi_releases.10k_sample.tsv \ +        | python -m fatcat_tools.cleanups.release_lowercase_doi - +    # no such release_ident found: dcjsybvqanffhmu4dhzdnptave + +Presumably because this is being run in QA, and there are some newer prod releases in the snapshot. + +Did a quick update, and then: + +    head -n2000 /srv/fatcat/datasets/nonlowercase_doi_releases.10k_sample.tsv \ +        | python -m fatcat_tools.cleanups.release_lowercase_doi - +    # Counter({'total': 2000, 'skip-existing-doi-fine': 1100, 'update': 898, 'skip-existing-not-found': 2, 'skip': 0, 'insert': 0, 'exists': 0}) + +Did some spot checking in QA. Out of 20 DOIs checked, 15 were valid, 5 were not +valid (doi.org 404). It seems like roughly 1/3 have a dupe DOI (the lower-case +DOI exists); didn't count exact numbers. + +This cleanup is simple and looks good to go. Batch size of 50 is good for full +releases. + +Example of parallelization: + +    cat /srv/fatcat/datasets/nonlowercase_doi_releases.10k_sample.tsv \ +        | parallel -j8 --linebuffer --round-robin --pipe python -m fatcat_tools.cleanups.release_lowercase_doi - + +Ready to go! diff --git a/notes/cleanups/file_release_ingest_bug.md b/notes/cleanups/file_release_ingest_bug.md new file mode 100644 index 00000000..d818905b --- /dev/null +++ b/notes/cleanups/file_release_ingest_bug.md @@ -0,0 +1,192 @@ + +We want to find cases where the `file_edit` metadata for a file entity does not match the DOI of the release it is linked to. + +## Background + + +Seems to mostly happen when the `link_source_id` DOI is not found during a +fatcat 'lookup', eg due to upstream DOI metadata not including a 'title' field +(required by fatcat schema). Many of these seem to be 'journal-issue' or +something like that. 
Presumably upstream (unpaywall) crawled and found some PDF +for the DOI and indexes that. + +TODO: what was the specific code bug that caused this? time window of edits? +https://github.com/internetarchive/fatcat/commit/d8cf02dbdc6848be4de2710e54f4463d702b6e7c + +TODO: how many empty-title Crossref DOIs are there? perhaps we should import these after all, in some situations? eg, pull `html_biblio`? +TODO: examples of other mismatches? edits from the same timespan with mismatching identifiers + +## Queries + +    file_edit +        ident_id +        rev_id +        extra_json->>'link_source_id'::text +    file_rev_release +        file_rev +        target_release_ident_id +    release_ident +        id +        rev_id +    release_rev +        id +        doi + +Query some samples: + +    SELECT file_edit.ident_id as file_ident, release_ident.id as release_ident, file_edit.extra_json->>'link_source_id' as file_edit_doi, release_rev.doi as release_doi +    FROM file_edit +        LEFT JOIN file_rev_release ON file_edit.rev_id = file_rev_release.file_rev +        LEFT JOIN release_ident ON file_rev_release.target_release_ident_id = release_ident.id +        LEFT JOIN release_rev ON release_rev.id = release_ident.rev_id +    WHERE +        file_edit.extra_json->>'ingest_request_source' = 'unpaywall' +        AND release_rev.doi != file_edit.extra_json->>'link_source_id' +    LIMIT 20; + +                  file_ident              |            release_ident             |             file_edit_doi             |             release_doi              +    --------------------------------------+--------------------------------------+---------------------------------------+------------------------------------- +     8fe6eb2a-da3a-4e1c-b281-85fad3cae601 | 0a0cbabd-3f04-4296-94d5-b61c27fddee4 | 10.7167/2013/520930/dataset/3         | 10.7167/2013/520930 +     cbb2d4e7-a9a7-41df-be1d-faf9dd552ee1 | d7b356c7-de6b-4885-b37d-ea603849a30a | 10.1525/elementa.124.t1               | 10.1525/elementa.124 +     26242648-2a7b-42ea-b2ca-e4fba8f0a2c7 | d2647e07-fbb3-47f5-b06a-de2872bf84ec | 10.1023/a:1000301702170               | 10.1007/bfb0085374 +     f774854a-3939-4aa9-bf73-7e0573908b88 | e19ce792-422d-4feb-92ba-dd8536a60618 | 10.32835/2223-5752.2018.17.14-21      | 10.31652/2412-1142-2018-50-151-156 +     615ddf8c-2d91-4c1c-b04a-76b69f07b300 | e7af8377-9542-4711-9444-872f45521dc1 | 10.5644/sjm.10.1.00                   | 10.5644/sjm.10.1.01 +     43fa1b53-eddb-4f31-96af-bc9bd3bb31b6 | e1ec5d8a-ff88-49f4-8540-73ee4906e119 | 10.1136/bjo.81.11.934                 | 10.1038/sj.bjc.6690790 +     64b04cbc-ab3d-4cef-aff5-9d6b2532c47a | 43c387d1-e4c9-49f3-9995-552451a9a246 | 10.1023/a:1002015330987               | 10.1006/jnth.1998.2271 +     31e6e1a8-8457-4c93-82d3-12a51c0dc1bb | a2ec4305-63d2-4164-8470-02bf5c4ba74c | 10.1186/1742-4690-2-s1-s84            | 10.1038/nm0703-847 +     3461024a-1800-44a0-a7be-73f14bac99dd | 6849d39f-6bae-460b-8a9d-d6b86fbac2d3 | 10.17265/2328-2150/2014.11            | 10.1590/s0074-02762002000900009 +     c2707e43-6a4b-4f5d-8fb4-babe16b42b4d | 82127faf-d125-4cd4-88b9-9a8618392019 | 10.1055/s-005-28993                   | 10.1055/s-0034-1380711 +     0fe72304-1884-4dde-98c6-7cf3aec5bf31 | e3fe46cd-0205-42e4-8255-490e0eba38ea | 10.1787/5k4c1s0z2gs1-en               | 10.1787/9789282105931-2-en +     686a1508-b6c5-4060-9035-1dd8a4b85be1 | dcecd03d-e0f6-40ce-a376-7e64bd90bdca | 10.15406/jlprr.2.3                    | 10.15406/jlprr.2015.02.00039 +     9403bc0e-2a50-4c8a-ab79-943bce20e583 | 
3073dee2-c8f7-42ed-a727-12306d86fa35 | 10.1787/5jrtgzfl6g9w-en               | 10.1111/1468-2443.00004 +     9a080a6f-aaaf-4f4b-acab-3eda2bfacc22 | af28a167-2ddc-43db-98ca-1cc29b863f9f | 10.13040/ijpsr.0975-8232.4(6).2094-05 | 10.1002/chin.201418290 +     810f1a00-623f-4a5c-88e2-7240232ae8f9 | 639d4ddd-ef24-49dc-8d4e-ea626b0b85f8 | 10.13040/ijpsr.0975-8232.5(6).2216-24 | 10.1006/phrs.1997.0184 +                                                                                   10.13040/IJPSR.0975-8232.5(6).2216-24 +     72752c59-9532-467d-8b4f-fe4d994bcff5 | cc3db36c-27e9-45bf-96b0-56d1119b93d6 | 10.22201/facmed.2007865xe.2018.26     | 10.22201/facmed.2007865x.2018.26.01 +     a0740fc2-a1db-4bc8-ac98-177ccebde24f | 0a7136ac-86ad-4636-9c2a-b56a6d5a3a27 | 10.24966/cmph-1978/vol6iss1           | 10.1093/milmed/146.4.283 +     5b05df6a-7a0e-411b-a4e1-0081d7522673 | dc0cb585-c709-4990-a82e-c7c9b254fc74 | 10.15406/jig.3.1                      | 10.15406/jig.2016.03.00039 +     011964dd-24a6-4d25-b9dd-921077f1947e | deabab6d-7381-4567-bf56-5294ab081e20 | 10.17265/1537-1506/2013.01            | 10.1109/icsssm.2016.7538527 +     4d7efa59-30c7-4015-bea9-d7df171a97ed | a05449a0-34d7-4f45-9b92-bcf71db4918d | 10.14413/herj.2017.01.09              | 10.14413/herj.2017.01.09. +    (20 rows) + +Total counts, broken DOIs, by source: + +    SELECT file_edit.extra_json->>'ingest_request_source' as source, COUNT(*) as broken_files +    FROM file_edit +        LEFT JOIN file_rev_release ON file_edit.rev_id = file_rev_release.file_rev +        LEFT JOIN release_ident ON file_rev_release.target_release_ident_id = release_ident.id +        LEFT JOIN release_rev ON release_rev.id = release_ident.rev_id +    WHERE +        file_edit.extra_json->>'link_source_id' IS NOT NULL +        AND file_edit.extra_json->>'link_source_id' LIKE '10.%' +        AND release_rev.doi != file_edit.extra_json->>'link_source_id' +    GROUP BY file_edit.extra_json->>'ingest_request_source'; + + +          source      | broken_files +    ------------------+-------------- +     fatcat-changelog |          872 +     unpaywall        |       227954 +    (2 rows) + +## Cleanup Pseudocode + +Dump file idents from above SQL queries. Transform from UUID to fatcat ident. + +For each file ident, fetch file entity, with releases expanded. Also fetch edit +history. Filter by: + +- single release for file +- single edit in history, matching broken agent +- edit metadata indicates a DOI, which doesn't match release DOI (or release doesn't have DOI?) + +If all that is the case, do a lookup by the correct DOI. If there is a match, +update the file entity to point to that release (and only that release). +Operate in batches. + +Write cleanup bot, run in QA to test. Run SQL query again and iterate if some +files were not updated. Merge to master, run full dump and update in prod. 
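+
+A rough sketch of the per-file decision logic described above, using the same
+client calls as the cleanup bot added in this commit (names, batching, and
+error handling are simplified; this is illustrative, not the deployed code):
+
+    from fatcat_openapi_client import ApiException
+
+    BROKEN_BOT_EDITOR = "scmbogxw25evtcesfcab5qaboa"  # editor_id of the buggy importer
+
+    def fix_file(api, editgroup_id: str, file_ident: str, edit_doi: str) -> str:
+        fe = api.get_file(file_ident)
+        if len(fe.release_ids) != 1:
+            return "skip-multiple-releases"
+        history = api.get_file_history(file_ident)
+        if len(history) != 1 or history[0].editgroup.editor_id != BROKEN_BOT_EDITOR:
+            return "skip-edit-history"
+        release = api.get_release(fe.release_ids[0])
+        if release.ext_ids.doi == edit_doi:
+            return "skip-release-doi-already-matches"
+        try:
+            correct = api.lookup_release(doi=edit_doi)
+        except ApiException as err:
+            if err.status != 404:
+                raise
+            return "skip-correct-release-not-found"
+        # re-point the file at the correct release (and only that release)
+        fe.release_ids = [correct.ident]
+        api.update_file(editgroup_id, fe.ident, fe)
+        return "update"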
+ +## Export Matches Needing Fix + +    COPY ( +        SELECT row_to_json(row.*) FROM ( +            SELECT file_edit.ident_id as file_ident, release_ident.id as wrong_release_ident, file_edit.extra_json as edit_extra +            FROM file_edit +                LEFT JOIN file_rev_release ON file_edit.rev_id = file_rev_release.file_rev +                LEFT JOIN release_ident ON file_rev_release.target_release_ident_id = release_ident.id +                LEFT JOIN release_rev ON release_rev.id = release_ident.rev_id +            WHERE +                file_edit.extra_json->>'link_source_id' IS NOT NULL +                AND file_edit.extra_json->>'link_source_id' LIKE '10.%' +                AND release_rev.doi != file_edit.extra_json->>'link_source_id' +            -- LIMIT 20; +        ) as row +    ) TO '/srv/fatcat/snapshots/file_release_bugfix_20211105.unfiltered.json'; +    # COPY 228,827 + +Then need to do a `sed` pass, and a `rg` pass, to filter out bad escapes: + +    cat /srv/fatcat/snapshots/file_release_bugfix_20211105.unfiltered.json \ +        | sed 's/\\"/\"/g' \ +        | rg -v "\\\\" \ +        | rg '^\{' \ +        | pv -l \ +        > /srv/fatcat/snapshots/file_release_bugfix_20211105.json +    # 228k 0:00:00 [ 667k/s] + +    wc -l /srv/fatcat/snapshots/file_release_bugfix_20211105.json +    # 228,826 + +And create a sample file: + +    shuf -n10000 /srv/fatcat/snapshots/file_release_bugfix_20211105.json > /srv/fatcat/snapshots/file_release_bugfix_20211105.10k_sample.json + + +## Testing in QA + +    export FATCAT_AUTH_WORKER_CLEANUP=[...] + +    head -n10 /srv/fatcat/snapshots/file_release_bugfix_20211105.10k_sample.json \ +        | python -m fatcat_tools.cleanups.file_release_bugfix - +    # Counter({'total': 10, 'update': 10, 'skip': 0, 'insert': 0, 'exists': 0}) + +    file_wmjzuybuorfrjdgnr2d32vg5va: was wrong, now correct, no other files +    file_wgszcwehlffnzmoypr4l2yhvza: was correct, now correct, no other files. multiple articles in single PDF +    file_p4r5sbminzgrhn4yaiqyr7ahwi: was correct, now wrong (!!!) +        doi:10.1055/s-0036-1579844 +        PDF says: 10.4103/0028-3886.158210 +        unpaywall still has this incorrect linkage +    file_n5jtvrnodbfdbccl5d6hshhvw4: now stub, was wrong +        doi:10.19080/bboaj.2018.04.555640 release_suvtcm7hdbbr3fczcxxt4thgoi seems correct? +        doi:10.19080/bboaj.4.4 is the bad DOI, not in fatcat. is registered, as an entire issue +    file_kt5fv4d5lrbk7j3dxxgcm7hph4: was wrong, now correct +    file_jq4juugnynhsdkh3whkcjod46q: was wrong, now correct +    file_ef7w6y7k4jhjhgwmfai37yjjmm: was wrong, now correct +    file_ca2svhd6knfnff4dktuonp45du: was correct, now correct. complicated, multiple DOIs/copies/files of same work +    file_borst2aewvcwzhucnyth2vf3lm: was correct, now correct. complicated, multiple DOIs, single file of same work + +Overall, seems like this might not be as much of an obvious improvement as +hoped! But still progress and more correct. + +    head -n1000 /srv/fatcat/snapshots/file_release_bugfix_20211105.10k_sample.json \ +        | python -m fatcat_tools.cleanups.file_release_bugfix - +    # Counter({'total': 1000, 'update': 929, 'skip-existing-history-updated': 58, 'skip-existing-fixed': 10, 'skip': 3, 'skip-link-source': 3, 'insert': 0, 'exists': 0}) + +Looking at `skip-link-source`, it is cases where `link_source` is 'doi' not +'fatcat-changelog'. Will update filter behavior, 'fatcat-changelog' is a +`ingest_request_source`. + +Checking another 10 examples. 
They all seem to end up as correct matches. + +Did a small update and running the whole batch: + +    cat /srv/fatcat/snapshots/file_release_bugfix_20211105.10k_sample.json \ +        | python -m fatcat_tools.cleanups.file_release_bugfix - +    # Counter({'total': 10000, 'update': 8499, 'skip-existing-fixed': 939, 'skip-existing-history-updated': 560, 'skip': 2, 'skip-wrong-release-is-ok': 2, 'insert': 0, 'exists': 0}) + +I think this is ready to go! Example with parallel: + +    cat /srv/fatcat/snapshots/file_release_bugfix_20211105.10k_sample.json \ +        | parallel -j8 --linebuffer --round-robin --pipe python -m fatcat_tools.cleanups.file_release_bugfix - + diff --git a/notes/cleanups/scripts/fetch_full_cdx_ts.py b/notes/cleanups/scripts/fetch_full_cdx_ts.py new file mode 100644 index 00000000..ebcf0d62 --- /dev/null +++ b/notes/cleanups/scripts/fetch_full_cdx_ts.py @@ -0,0 +1,201 @@ +#!/usr/bin/env python3 + +import sys +import json +import base64 +from typing import Optional, List + +import requests +from requests.adapters import HTTPAdapter +from requests.packages.urllib3.util.retry import Retry  # pylint: disable=import-error + +def requests_retry_session( +    retries: int = 10, +    backoff_factor: int = 3, +    status_forcelist: List[int] = [500, 502, 504], +    session: requests.Session = None, +) -> requests.Session: +    """ +    From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests +    """ +    session = session or requests.Session() +    retry = Retry( +        total=retries, +        read=retries, +        connect=retries, +        backoff_factor=backoff_factor, +        status_forcelist=status_forcelist, +    ) +    adapter = HTTPAdapter(max_retries=retry) +    session.mount("http://", adapter) +    session.mount("https://", adapter) +    return session + +def b32_hex(s: str) -> str: +    """ +    Converts a base32-encoded SHA-1 checksum into hex-encoded + +    base32 checksums are used by, eg, heritrix and in wayback CDX files +    """ +    s = s.strip().split()[0].lower() +    if s.startswith("sha1:"): +        s = s[5:] +    if len(s) != 32: +        if len(s) == 40: +            return s +        raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s)) +    return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8") + + +SANDCRAWLER_POSTGREST_URL = "http://wbgrp-svc506.us.archive.org:3030" + +def get_db_cdx(url: str, http_session) -> List[dict]: +    resp = http_session.get(SANDCRAWLER_POSTGREST_URL + "/cdx", params=dict(url="eq." 
+ url)) +    resp.raise_for_status() +    rows = resp.json() +    return rows or [] + +CDX_API_URL = "https://web.archive.org/cdx/search/cdx" + +def get_api_cdx(url: str, partial_dt: str, http_session) -> Optional[dict]: + +    params = { +        "url": url, +        "from": partial_dt, +        "to": partial_dt, +        "matchType": "exact", +        "output": "json", +        "limit": 20, +        # can't filter status because might be warc/revisit +        #"filter": "statuscode:200", +    } +    resp = http_session.get(CDX_API_URL, params=params) +    resp.raise_for_status() +    rows = resp.json() + +    if not rows: +        return None +    #print(rows, file=sys.stderr) +    if len(rows) < 2: +        return None + +    for raw in rows[1:]: +        record = dict( +            surt=raw[0], +            datetime=raw[1], +            url=raw[2], +            mimetype=raw[3], +            status_code=raw[4], +            sha1b32=raw[5], +            sha1hex=b32_hex(raw[5]), +        ) +        if record['url'] != url: +            # TODO: could allow HTTP/HTTPS fuzzy match +            print("CDX API near match: URL", file=sys.stderr) +            continue +        if not record['datetime'].startswith(partial_dt): +            print(f"CDX API near match: datetime {partial_dt} {record['datetime']}", file=sys.stderr) +            continue +        if record['status_code'] == "200" or (record['status_code'] == '-' and record['mimetype'] == 'warc/revisit'): +            return record +        else: +            print(f"CDX API near match: status  {record['status_code']}", file=sys.stderr) +    return None + +def process_file(fe, session) -> dict: +    short_urls = [] +    self_urls = dict() +    full_urls = dict() +    status = "unknown" + +    for pair in fe['urls']: +        u = pair['url'] +        if not '://web.archive.org/web/' in u: +            continue +        seg = u.split('/') +        assert seg[2] == "web.archive.org" +        assert seg[3] == "web" +        if not seg[4].isdigit(): +            continue +        original_url = "/".join(seg[5:]) +        if len(seg[4]) == 12 or len(seg[4]) == 4: +            short_urls.append(u) +        elif len(seg[4]) == 14: +            self_urls[original_url] = u +        else: +            print(f"other bogus ts: {seg[4]}", file=sys.stderr) +            return dict(file_entity=fe, full_urls=full_urls, status="fail-bogus-ts") + +    if len(short_urls) == 0: +        return dict(file_entity=fe, full_urls=[], status="skip-no-shorts") + +    for short in list(set(short_urls)): +        seg = short.split('/') +        ts = seg[4] +        assert len(ts) in [12,4] and ts.isdigit() +        original_url = '/'.join(seg[5:]) + +        if short in full_urls: +            continue + +        if original_url in self_urls and ts in self_urls[original_url]: +            full_urls[short] = self_urls[original_url] +            status = "success-self" +            continue + +        cdx_row_list = get_db_cdx(original_url, http_session=session) +        for cdx_row in cdx_row_list: +            if cdx_row['sha1hex'] == fe['sha1'] and cdx_row['url'] == original_url and cdx_row['datetime'].startswith(ts): +                assert len(cdx_row['datetime']) == 14 and cdx_row['datetime'].isdigit() +                full_urls[short] = f"https://web.archive.org/web/{cdx_row['datetime']}/{original_url}" +                status = "success-db" +                break +            else: +                #print(f"cdx DB found, but no match", file=sys.stderr) +       
         pass +        cdx_row = None + +        if short in full_urls: +            continue + +        cdx_record = None +        try: +            cdx_record = get_api_cdx(original_url, partial_dt=ts, http_session=session) +        except requests.exceptions.HTTPError as e: +            if e.response.status_code == 403: +                return dict(file_entity=fe, full_urls=full_urls, status="fail-cdx-403") +            else: +                raise +        if cdx_record: +            if cdx_record['sha1hex'] == fe['sha1'] and cdx_record['url'] == original_url and cdx_record['datetime'].startswith(ts): +                assert len(cdx_record['datetime']) == 14 and cdx_record['datetime'].isdigit() +                full_urls[short] = f"https://web.archive.org/web/{cdx_record['datetime']}/{original_url}" +                status = "success-api" +                break +            else: +                print(f"cdx API found, but no match", file=sys.stderr) +        else: +            print(f"no CDX API record found: {original_url}", file=sys.stderr) + +        if short not in full_urls: +            return dict(file_entity=fe, full_urls=full_urls, status="fail-not-found") + +    return dict( +        file_entity=fe, +        full_urls=full_urls, +        status=status, +    ) + +def main(): +    session = requests_retry_session() +    session.headers.update({ +        "User-Agent": "Mozilla/5.0 fatcat.CdxFixupBot", +    }) +    for line in sys.stdin: +        if not line.strip(): +            continue +        fe = json.loads(line) +        print(json.dumps(process_file(fe, session=session))) + +if __name__=="__main__": +    main() diff --git a/notes/cleanups/wayback_timestamps.md b/notes/cleanups/wayback_timestamps.md new file mode 100644 index 00000000..38bc3f7d --- /dev/null +++ b/notes/cleanups/wayback_timestamps.md @@ -0,0 +1,291 @@ + +At some point, using the arabesque importer (from targetted crawling), we +accidentially imported a bunch of files with wayback URLs that have 12-digit +timestamps, instead of the full canonical 14-digit timestamps. + + +## Prep (2021-11-04) + +Download most recent file export: + +    wget https://archive.org/download/fatcat_bulk_exports_2021-10-07/file_export.json.gz + +Filter to files with problem of interest: + +    zcat file_export.json.gz \ +        | pv -l \ +        | rg 'web.archive.org/web/\d{12}/' \ +        | gzip \ +        > files_20211007_shortts.json.gz +    # 111M 0:12:35 + +    zcat files_20211007_shortts.json.gz | wc -l +    # 7,935,009 + +    zcat files_20211007_shortts.json.gz | shuf -n10000 > files_20211007_shortts.10k_sample.json + +Wow, this is a lot more than I thought! 
+ +There might also be some other short URL patterns, check for those: + +    zcat file_export.json.gz \ +        | pv -l \ +        | rg 'web.archive.org/web/\d{1,11}/' \ +        | gzip \ +        > files_20211007_veryshortts.json.gz +    # skipped, mergine with below + +    zcat file_export.json.gz \ +        | rg 'web.archive.org/web/None/' \ +        | pv -l \ +        > /dev/null +    # 0.00  0:10:06 [0.00 /s] +    # whew, that pattern has been fixed it seems + +    zcat file_export.json.gz | rg '/None/' | pv -l > /dev/null +    # 2.00  0:10:01 [3.33m/s] + +    zcat file_export.json.gz \ +        | rg 'web.archive.org/web/\d{13}/' \ +        | pv -l \ +        > /dev/null +    # 0.00  0:10:09 [0.00 /s] + +Yes, 4-digit is a popular pattern as well, need to handle those: + +    zcat file_export.json.gz \ +        | pv -l \ +        | rg 'web.archive.org/web/\d{4,12}/' \ +        | gzip \ +        > files_20211007_moreshortts.json.gz +    # 111M 0:13:22 [ 139k/s] + +    zcat files_20211007_moreshortts.json.gz | wc -l +    # 9,958,854 + +    zcat files_20211007_moreshortts.json.gz | shuf -n10000 > files_20211007_moreshortts.10k_sample.json + + +## Fetch Complete URL + +Want to export JSON like: + +    file_entity +        [existing file entity] +    full_urls[]: list of Dicts[str,str] +        <short_url>: <full_url> +    status: str + +Status one of: + +- 'success-self': the file already has a fixed URL internally +- 'success-db': lookup URL against sandcrawler-db succeeded, and SHA1 matched +- 'success-cdx': CDX API lookup succeeded, and SHA1 matched +- 'fail-not-found': no matching CDX record found + +Ran over a sample: + +    cat files_20211007_shortts.10k_sample.json | ./fetch_full_cdx_ts.py > sample_out.json + +    cat sample_out.json | jq .status | sort | uniq -c +          5 "fail-not-found" +        576 "success-api" +       7212 "success-db" +       2207 "success-self" + +    head -n1000  | ./fetch_full_cdx_ts.py > sample_out.json + +    zcat files_20211007_veryshortts.json.gz | head -n1000 | ./fetch_full_cdx_ts.py | jq .status | sort | uniq -c +          2 "fail-not-found" +        168 "success-api" +        208 "success-db" +        622 "success-self" + +Investigating the "fail-not-found", they look like http/https URL +not-exact-matches. Going to put off handling these for now because it is a +small fraction and more delicate. + +Again with the broader set: + +    cat files_20211007_moreshortts.10k_sample.json | ./fetch_full_cdx_ts.py > sample_out.json + +    cat sample_out.json | jq .status | sort | uniq -c +          9 "fail-not-found" +        781 "success-api" +       6175 "success-db" +       3035 "success-self" + +While running a larger batch, got a CDX API error: + +    requests.exceptions.HTTPError: 403 Client Error: Forbidden for url: https://web.archive.org/cdx/search/cdx?url=https%3A%2F%2Fwww.psychologytoday.com%2Ffiles%2Fu47%2FHenry_et_al.pdf&from=2017&to=2017&matchType=exact&output=json&limit=20 + +    org.archive.util.io.RuntimeIOException: org.archive.wayback.exception.AdministrativeAccessControlException: Blocked Site Error + +So maybe need to use credentials after all. + + +## Cleanup Process + +Other possible cleanups to run at the same time, which would not require +external requests or other context: + +- URL has ://archive.org/ link with rel=repository => rel=archive +- mimetype is bogus => clean mimetype +- bogus file => set some new extra field, like scope=stub or scope=partial (?) 
+ +It looks like the rel swap is already implemented in `generic_file_cleanups()`. +From sampling it seems like the mimetype issue is pretty small, so not going to +bite that off now. The "bogus file" issue requires thought, so also skipping. + + +## Commands (old) + +Running with 8x parallelism to not break things; expecting some errors along +the way, may need to add handlers for connection errors etc: + +    # OLD SNAPSHOT +    zcat files_20211007_moreshortts.json.gz \ +        | parallel -j8 --linebuffer --round-robin --pipe ./fetch_full_cdx_ts.py \ +        | pv -l \ +        | gzip \ +        > files_20211007_moreshortts.fetched.json.gz + +At 300 records/sec, this should take around 9-10 hours to process. + + + +## Prep Again (2021-11-09) + +After fixing "sort" issue and re-dumping file entities (2021-11-05 snapshot). + +Filter again: + +    # note: in the future use pigz instead of gzip here +    zcat file_export.json.gz \ +        | pv -l \ +        | rg 'web.archive.org/web/\d{4,12}/' \ +        | gzip \ +        > files_20211105_moreshortts.json.gz +    # 112M 0:13:27 [ 138k/s] + +    zcat files_20211105_moreshortts.json.gz | wc -l +    # 9,958,854 +    # good, exact same number as previous snapshot + +    zcat files_20211105_moreshortts.json.gz | shuf -n10000 > files_20211105_moreshortts.10k_sample.json +    # done + +    cat files_20211105_moreshortts.10k_sample.json \ +        | ./fetch_full_cdx_ts.py \ +        | pv -l \ +        > files_20211105_moreshortts.10k_sample.fetched.json +    # 10.0k 0:03:36 [46.3 /s] + +    cat files_20211105_moreshortts.10k_sample.fetched.json | jq .status | sort | uniq -c +         13 "fail-not-found" +        774 "success-api" +       6193 "success-db" +       3020 "success-self" + +After tweaking `success-self` logic: + +         13 "fail-not-found" +        859 "success-api" +       6229 "success-db" +       2899 "success-self" + + +## Testing in QA + +Copied `sample_out.json` to fatcat QA instance and renamed as `files_20211007_moreshortts.10k_sample.fetched.json` + +    # OLD ATTEMPT +    export FATCAT_API_AUTH_TOKEN=[...] +    head -n10 /srv/fatcat/datasets/files_20211007_moreshortts.10k_sample.fetched.json \ +        | python -m fatcat_tools.cleanups.file_short_wayback_ts - + +Ran in to issues, iterated above. + +Trying again with updated script and sample file: + +    export FATCAT_AUTH_WORKER_CLEANUP=[...] + +    head -n10 /srv/fatcat/datasets/files_20211105_moreshortts.10k_sample.fetched.json \ +        | python -m fatcat_tools.cleanups.file_short_wayback_ts - +    # Counter({'total': 10, 'update': 10, 'skip': 0, 'insert': 0, 'exists': 0}) + +Manually inspected and these look good. Trying some repeats and larger batched: + +    head -n10 /srv/fatcat/datasets/files_20211105_moreshortts.10k_sample.fetched.json \ +        | python -m fatcat_tools.cleanups.file_short_wayback_ts - +    # Counter({'total': 10, 'skip-revision-changed': 10, 'skip': 0, 'insert': 0, 'update': 0, 'exists': 0}) + +    head -n1000 /srv/fatcat/datasets/files_20211105_moreshortts.10k_sample.fetched.json \ +        | python -m fatcat_tools.cleanups.file_short_wayback_ts - + +    [...] 
+    bad replacement URL: partial_ts=201807271139 original=http://www.scielo.br/pdf/qn/v20n1/4918.pdf fix_url=https://web.archive.org/web/20170819080342/http://www.scielo.br/pdf/qn/v20n1/4918.pdf +    bad replacement URL: partial_ts=201904270207 original=https://www.matec-conferences.org/articles/matecconf/pdf/2018/62/matecconf_iccoee2018_03008.pdf fix_url=https://web.archive.org/web/20190501060839/https://www.matec-conferences.org/articles/matecconf/pdf/2018/62/matecconf_iccoee2018_03008.pdf +    bad replacement URL: partial_ts=201905011445 original=https://cdn.intechopen.com/pdfs/5886.pdf fix_url=https://web.archive.org/web/20190502203832/https://cdn.intechopen.com/pdfs/5886.pdf +    [...] + +    # Counter({'total': 1000, 'update': 969, 'skip': 19, 'skip-bad-replacement': 18, 'skip-revision-changed': 10, 'skip-bad-wayback-timestamp': 2, 'skip-status': 1, 'insert': 0, 'exists': 0}) + + +It looks like these "bad replacement URLs" are due to timestamp mismatches. Eg, the partial timestamp is not part of the final timestamp. + +Tweaked fetch script and re-ran: + +    # Counter({'total': 1000, 'skip-revision-changed': 979, 'update': 18, 'skip-bad-wayback-timestamp': 2, 'skip': 1, 'skip-status': 1, 'insert': 0, 'exists': 0}) + +Cool. Sort of curious what the deal is with those `skip-bad-wayback-timestamp`. + +Run the rest through: + +    cat /srv/fatcat/datasets/files_20211105_moreshortts.10k_sample.fetched.json \ +        | python -m fatcat_tools.cleanups.file_short_wayback_ts - +    # Counter({'total': 10000, 'update': 8976, 'skip-revision-changed': 997, 'skip-bad-wayback-timestamp': 14, 'skip': 13, 'skip-status': 13, 'insert': 0, 'exists': 0}) + +Should tweak batch size to 100 (vs. 50). + +How to parallelize import: + +    # from within pipenv +    cat /srv/fatcat/datasets/files_20211105_moreshortts.10k_sample.fetched.json \ +        | parallel -j8 --linebuffer --round-robin --pipe python -m fatcat_tools.cleanups.file_short_wayback_ts - + + +## Full Batch Commands + +Running in bulk again: + +    zcat files_20211105_moreshortts.json.gz \ +        | parallel -j8 --linebuffer --round-robin --pipe ./fetch_full_cdx_ts.py \ +        | pv -l \ +        | gzip \ +        > files_20211105_moreshortts.fetched.json.gz + +Ran in to one: `requests.exceptions.HTTPError: 503 Server Error: Service +Temporarily Unavailable for url: [...]`. Will try again, if there are more +failures may need to split up in smaller chunks. + +Unexpected: + +    Traceback (most recent call last): +      File "./fetch_full_cdx_ts.py", line 200, in <module> +        main() +      File "./fetch_full_cdx_ts.py", line 197, in main +        print(json.dumps(process_file(fe, session=session))) +      File "./fetch_full_cdx_ts.py", line 118, in process_file +        assert seg[4].isdigit() +    AssertionError +    3.96M 3:04:46 [ 357 /s] + +Ugh. 
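+
+The re-run below pipes the dump through `tac`, presumably so the unprocessed
+tail of the file gets covered first. If this kind of malformed wayback URL
+keeps turning up, one option would be to make the URL parsing in
+`process_file()` fully defensive, recording a per-file failure status instead
+of asserting; a sketch (hypothetical status name, not what was actually run):
+
+    for pair in fe['urls']:
+        u = pair['url']
+        if '://web.archive.org/web/' not in u:
+            continue
+        seg = u.split('/')
+        # any unexpected wayback URL shape becomes a recorded failure for this
+        # one file entity, instead of an AssertionError killing the whole batch
+        if len(seg) < 6 or seg[2] != "web.archive.org" or seg[3] != "web" or not seg[4].isdigit():
+            return dict(file_entity=fe, full_urls=full_urls, status="fail-bogus-wayback-url")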
+ +    zcat files_20211105_moreshortts.json.gz \ +        | tac \ +        | parallel -j8 --linebuffer --round-robin --pipe ./fetch_full_cdx_ts.py \ +        | pv -l \ +        | gzip \ +        > files_20211105_moreshortts.fetched.json.gz diff --git a/python/fatcat_tools/cleanups/file_release_bugfix.py b/python/fatcat_tools/cleanups/file_release_bugfix.py new file mode 100644 index 00000000..dc27f9b5 --- /dev/null +++ b/python/fatcat_tools/cleanups/file_release_bugfix.py @@ -0,0 +1,241 @@ +import argparse +import os +import sys +from typing import Any, Dict + +import fatcat_openapi_client +from fatcat_openapi_client import ApiClient, FileEntity + +from fatcat_tools import authenticated_api, public_api, uuid2fcid +from fatcat_tools.importers.common import EntityImporter, JsonLinePusher +from fatcat_tools.normal import clean_doi + + +class FileReleaseBugfix(EntityImporter): +    """ +    This is a one-off / one-time cleanup script for file entities which got +    imported with incorrect release ident mappings, due to a bug in the file +    ingest importer. + +    While this calls itself a cleanup, it is based on the import code path. It +    is not integrated into the `fatcat_import` or `fatcat_cleanup` controller; +    instead it has a __main__ function and is invoked like: + +        python -m fatcat_tools.cleans.file_release_bugfix - < blah.json +    """ + +    def __init__(self, api: ApiClient, **kwargs): + +        eg_desc = ( +            kwargs.pop("editgroup_description", None) +            or "Correct bad file/release import mappings" +        ) +        eg_extra = kwargs.pop("editgroup_extra", dict()) +        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.FileReleaseBugfix") +        super().__init__( +            api, +            do_updates=True, +            editgroup_description=eg_desc, +            editgroup_extra=eg_extra, +            **kwargs, +        ) +        self.testing_mode = False + +    def want(self, row: Dict[str, Any]) -> bool: +        if not ( +            row.get("edit_extra") +            and row["edit_extra"].get("link_source") +            and row["edit_extra"].get("link_source_id") +        ): +            self.counts["skip-partial"] += 1 +            return False +        if row["edit_extra"]["link_source"] not in ["unpaywall", "doi"]: +            self.counts["skip-link-source"] += 1 +            return False +        if row["edit_extra"].get("ingest_request_source") not in [ +            "unpaywall", +            "fatcat-changelog", +        ]: +            self.counts["skip-ingest-request-source"] += 1 +            return False +        if not row["edit_extra"]["link_source_id"].startswith("10."): +            self.counts["skip-source-id-not-doi"] += 1 +            return False +        return True + +    def parse_record(self, row: Dict[str, Any]) -> FileEntity: + +        # bezerk mode doesn't make sense for this importer +        assert self.bezerk_mode is False + +        file_ident = uuid2fcid(row["file_ident"]) +        wrong_release_ident = uuid2fcid(row["wrong_release_ident"]) +        edit_extra = row["edit_extra"] +        assert edit_extra["link_source"] in ["unpaywall", "doi"] +        file_edit_doi = clean_doi(edit_extra["link_source_id"]) + +        if not file_edit_doi: +            self.counts["skip-bad-doi"] += 1 +            return False + +        # check that the "wrong" release exists and doesn't have the DOI +        wrong_release = None +        try: +            wrong_release = 
self.api.get_release(wrong_release_ident) +        except fatcat_openapi_client.rest.ApiException as err: +            if err.status != 404: +                raise err + +        if not wrong_release: +            self.counts["skip-wrong-release-missing"] += 1 +            return None + +        if clean_doi(wrong_release.ext_ids.doi) == file_edit_doi: +            self.counts["skip-wrong-release-is-ok"] += 1 +            return None + +        # fetch the "correct" release, if any +        fixed_release_ids = [] +        correct_release = None +        try: +            correct_release = self.api.lookup_release(doi=file_edit_doi) +        except fatcat_openapi_client.rest.ApiException as err: +            if err.status != 404: +                raise err + +        if correct_release: +            fixed_release_ids.append(correct_release.ident) + +        fe = FileEntity( +            ident=file_ident, +            release_ids=fixed_release_ids, +            edit_extra=edit_extra, +        ) +        fe._wrong_release_ident = wrong_release_ident +        return fe + +    def try_update(self, fe: FileEntity) -> bool: + +        wrong_release_ident = fe._wrong_release_ident +        assert len(wrong_release_ident) == 26 + +        # should always be existing... but in QA it might not be +        existing = None +        try: +            existing = self.api.get_file(fe.ident) +        except fatcat_openapi_client.rest.ApiException as err: +            if err.status != 404: +                raise err + +        if not existing: +            self.counts["skip-existing-not-found"] += 1 +            return False + +        if existing.state != "active": +            self.counts["skip-existing-entity-state"] += 1 +            return False + +        if wrong_release_ident not in existing.release_ids: +            self.counts["skip-existing-fixed"] += 1 +            return False + +        # fetch existing history to verify mismatch +        history = self.api.get_file_history(existing.ident) + +        for entry in history: +            if entry.editgroup.editor.is_bot is not True: +                self.counts["skip-existing-edit-history-human"] += 1 +                return False + +        bad_edit = history[-1].edit +        if bad_edit.extra != fe.edit_extra: +            self.counts["skip-existing-edit-history-extra-mismatch"] += 1 +            return False + +        bad_editgroup = history[-1].editgroup +        if not bad_editgroup.extra: +            self.counts["skip-existing-editgroup-missing-extra"] += 1 +            return False + +        if ( +            bad_editgroup.editor_id != "scmbogxw25evtcesfcab5qaboa" +            or bad_editgroup.extra.get("agent") != "fatcat_tools.IngestFileResultImporter" +            or not bad_editgroup.extra.get("git_rev", "").startswith("v0.3") +            or bad_editgroup.created.year != 2020 +        ): +            self.counts["skip-existing-edit-history-mismatch"] += 1 +            return False + +        existing.release_ids = [ri for ri in existing.release_ids if ri != wrong_release_ident] + +        if len(fe.release_ids) == 1: +            if fe.release_ids[0] not in existing.release_ids: +                existing.release_ids.append(fe.release_ids[0]) + +        existing.edit_extra = fe.edit_extra + +        # not doing a check for "in current editgroup", because the source of +        # these corrections (entity dump) contains no dupes + +        if not self.testing_mode: +            self.api.update_file(self.get_editgroup_id(), 
existing.ident, existing) +        self.counts["update"] += 1 +        return False + + +def test_file_release_bugfix() -> None: +    api = public_api("http://localhost:9411/v0") +    frbc = FileReleaseBugfix(api=api) +    frbc.testing_mode = True + +    assert frbc.want({"this": "asdf"}) is False + +    example_line: Dict[str, Any] = { +        "file_ident": "00000000-0000-0000-3333-000000000002", +        "wrong_release_ident": "00000000-0000-0000-4444-000000000002", +        "edit_extra": { +            "link_source": "unpaywall", +            "link_source_id": "10.1371/journal.pmed.0020124", +            "ingest_request_source": "unpaywall", +        }, +    } + +    fe1 = frbc.parse_record(example_line) +    print(frbc.counts) +    frbc.try_update(fe1) + +    # NOTE: this test is pretty incompleted + + +def main() -> None: +    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) +    parser.add_argument( +        "--host-url", default="http://localhost:9411/v0", help="connect to this host/port" +    ) +    parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int) +    parser.set_defaults( +        auth_var="FATCAT_AUTH_WORKER_CLEANUP", +    ) +    parser.add_argument( +        "json_file", +        help="File with jsonlines with cleanup context", +        default=sys.stdin, +        type=argparse.FileType("r"), +    ) + +    args = parser.parse_args() +    api = authenticated_api( +        args.host_url, +        # token is an optional kwarg (can be empty string, None, etc) +        token=os.environ.get(args.auth_var), +    ) + +    frbc = FileReleaseBugfix( +        api, +        edit_batch_size=args.batch_size, +    ) +    JsonLinePusher(frbc, args.json_file).run() + + +if __name__ == "__main__": +    main() diff --git a/python/fatcat_tools/cleanups/file_short_wayback_ts.py b/python/fatcat_tools/cleanups/file_short_wayback_ts.py new file mode 100644 index 00000000..bdd49f9b --- /dev/null +++ b/python/fatcat_tools/cleanups/file_short_wayback_ts.py @@ -0,0 +1,344 @@ +import argparse +import copy +import os +import sys +from typing import Any, Dict + +import fatcat_openapi_client +from fatcat_openapi_client import ApiClient, FileEntity + +from fatcat_tools import authenticated_api, entity_from_dict, public_api +from fatcat_tools.importers.common import EntityImporter, JsonLinePusher + + +class FileShortWaybackTimestampCleanup(EntityImporter): +    """ +    This is a one-off / one-time cleanup script for file entities, fix short +    timestamps in wayback URLs. These timestamps are supposed to have 14 digits +    (datetime with year, hour, seconds, etc). Some legacy file imports ended up +    with only 4 or 12 digits. + +    While this calls itself a cleanup, it is based on the import code path. 
It +    is not integrated into the `fatcat_import` or `fatcat_cleanup` controller; +    instead it has a __main__ function and is invoked like: + +        python -m fatcat_tools.cleans.file_short_wayback_ts - < blah.json +    """ + +    def __init__(self, api: ApiClient, **kwargs): + +        eg_desc = ( +            kwargs.pop("editgroup_description", None) +            or "Expand trunacted timestamps in wayback URLs" +        ) +        eg_extra = kwargs.pop("editgroup_extra", dict()) +        eg_extra["agent"] = eg_extra.get( +            "agent", "fatcat_tools.FileShortWaybackTimestampCleanup" +        ) +        super().__init__( +            api, +            do_updates=True, +            editgroup_description=eg_desc, +            editgroup_extra=eg_extra, +            **kwargs, +        ) +        self.testing_mode = False + +    def want(self, row: Dict[str, Any]) -> bool: +        if row["status"].startswith("success"): +            return True +        else: +            self.counts["skip-status"] += 1 +            return False + +    def parse_record(self, row: Dict[str, Any]) -> FileEntity: + +        # bezerk mode doesn't make sense for this importer +        assert self.bezerk_mode is False + +        fe: FileEntity = entity_from_dict(row["file_entity"], FileEntity) +        status: str = row["status"] +        assert status.startswith("success") +        url_expansions: Dict[str, str] = row["full_urls"] +        assert len(url_expansions) >= 1 + +        # actual cleanup happens here +        any_fixed = False +        for fe_url in fe.urls: +            if "://web.archive.org/web/" not in fe_url.url: +                continue +            seq = fe_url.url.split("/") +            partial_ts = seq[4] +            original_url = "/".join(seq[5:]) +            if seq[2] != "web.archive.org": +                continue +            if len(partial_ts) not in [4, 12]: +                continue +            if fe_url.url in url_expansions: +                fix_url = url_expansions[fe_url.url] +                # defensive checks +                if not ( +                    f"/web/{partial_ts}" in fix_url +                    and fe_url.url.endswith(original_url) +                    and fix_url.endswith(original_url) +                ): +                    print( +                        f"bad replacement URL: partial_ts={partial_ts} original={original_url} fix_url={fix_url}", +                        file=sys.stderr, +                    ) +                    self.counts["skip-bad-replacement"] += 1 +                    return None +                assert "://" in fix_url +                fe_url.url = fix_url +                any_fixed = True + +        if not any_fixed: +            self.counts["skip-no-fixes"] += 1 +            return None + +        # do any other generic file entity cleanups +        # this includes removing duplicates +        fe = self.generic_file_cleanups(fe) + +        # verify that there are no exact duplicates +        final_urls = [u.url for u in fe.urls] +        assert len(final_urls) == len(list(set(final_urls))) + +        return fe + +    def try_update(self, fe: FileEntity) -> bool: + +        # should always be existing +        try: +            existing = self.api.get_file(fe.ident) +        except fatcat_openapi_client.rest.ApiException as err: +            if err.status != 404: +                raise err + +        if not existing: +            self.counts["skip-existing-not-found"] += 1 +            return False + +        if 
existing.state != "active": +            self.counts["skip-existing-entity-state"] += 1 +            return False + +        if existing.sha1 != fe.sha1: +            self.counts["skip-existing-mismatch"] += 1 +            return False + +        assert fe.revision and existing.revision +        if existing.revision != fe.revision: +            self.counts["skip-revision-changed"] += 1 +            return False + +        # verify that at least one URL remains +        if not fe.urls or len(fe.urls) < 1: +            self.counts["skip-no-urls"] += 1 +            return False + +        # verify that all wayback urls have 14-digit timestamps, and are generally well-formed +        for u in fe.urls: +            if "://web.archive.org/web/" not in u.url: +                continue +            if u.rel != "webarchive": +                self.counts["skip-bad-wayback-rel"] += 1 +                return False +            seg = u.url.split("/") +            if ( +                len(seg) < 6 +                or seg[0] != "https:" +                or seg[2] != "web.archive.org" +                or seg[3] != "web" +            ): +                self.counts["skip-bad-wayback"] += 1 +                return False +            if len(seg[4]) != 14 or not seg[4].isdigit(): +                self.counts["skip-bad-wayback-timestamp"] += 1 +                return False + +        if existing == fe or existing.urls == fe.urls: +            self.counts["skip-no-change"] += 1 +            return False + +        # not doing a check for "in current editgroup", because the source of +        # these corrections (entity dump) contains no dupes + +        if not self.testing_mode: +            # note: passing 'fe' instead of 'existing' here, which is not +            # usually how it goes! 
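+            # (safe because the revision check above guarantees 'fe' was built
+            # from the same revision as 'existing', so this update only applies
+            # the URL fixes plus the generic URL cleanups)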
+            self.api.update_file(self.get_editgroup_id(), fe.ident, fe) +        self.counts["update"] += 1 +        return False + + +def test_short_wayback_ts() -> None: +    api = public_api("http://localhost:9411/v0") +    fswtc = FileShortWaybackTimestampCleanup(api=api) +    fswtc.testing_mode = True + +    assert fswtc.want({"status": "fail"}) is False +    assert fswtc.want({"status": "success-self"}) is True + +    example_line: Dict[str, Any] = { +        "status": "success-db", +        "file_entity": { +            # note: doesn't match actual entity +            "release_ids": ["waldfsctnbcpdbmasgduhaaaaa"], +            "mimetype": "application/pdf", +            "urls": [ +                { +                    "url": "https://papiro.unizar.es/ojs/index.php/ais/article/download/2187/1971", +                    "rel": "web", +                }, +                { +                    "url": "https://web.archive.org/web/201904301022/https://papiro.unizar.es/ojs/index.php/ais/article/download/2187/1971", +                    "rel": "webarchive", +                }, +            ], +            "sha256": "0b9e09480ed2e1f08f3c6c72f57ce12b52ea265f580f8810e606b49d64234b29", +            "sha1": "be714299b9be21b5afdaa7affd7d710c58269433", +            "md5": "9edb542be5b3446a1905e61a8a3abebd", +            "size": 666242, +            "revision": "fe949be8-7bf9-4c17-be28-8e3e90fb85bd", +            "ident": "4ghpvs2t2rdtrdum2mkreh62me", +            "state": "active", +        }, +        "full_urls": { +            "https://web.archive.org/web/201904301022/https://papiro.unizar.es/ojs/index.php/ais/article/download/2187/1971": "https://web.archive.org/web/20190430102239/https://papiro.unizar.es/ojs/index.php/ais/article/download/2187/1971" +        }, +    } +    example_fe = entity_from_dict(example_line["file_entity"], FileEntity) + +    fe1 = copy.copy(example_fe) +    fe1.urls[ +        1 +    ].url = "https://web.archive.org/web/20190430102239/https://papiro.unizar.es/ojs/index.php/ais/article/download/2187/1971" +    assert fswtc.parse_record(example_line) == fe1 + +    # update code path; requires a known file ident and API running locally +    assert fswtc.counts["update"] == 0 +    dummy_fe = api.get_file("aaaaaaaaaaaaamztaaaaaaaaai") +    fe1.ident = dummy_fe.ident + +    assert fswtc.try_update(fe1) is False +    assert fswtc.counts["skip-existing-mismatch"] == 1 + +    fe1.sha1 = dummy_fe.sha1 +    assert fswtc.try_update(fe1) is False +    assert fswtc.counts["skip-revision-changed"] == 1 + +    fe1.revision = dummy_fe.revision +    assert fswtc.try_update(fe1) is False +    print(fswtc.counts) +    assert fswtc.counts["update"] == 1 + +    # another example, which failed with an assertion in prod due to duplicated URLs +    example_line2: Dict[str, Any] = { +        "file_entity": { +            "release_ids": ["22jt7euq4fafhblzullmnesso4"], +            "mimetype": "application/pdf", +            "urls": [ +                { +                    "url": "https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf", +                    "rel": "repository", +                }, +                { +                    "url": "https://web.archive.org/web/201811010021/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf", +                    "rel": "webarchive", +                }, +                { +                    "url": "https://web.archive.org/web/20181101002154/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf", +        
            "rel": "webarchive", +                }, +            ], +            "sha256": "51ec58e7a2325d28d1deb0a4bc6422c0e4ae7b12ffb0b6298981a7b8b7730b19", +            "sha1": "ad96a584fc6073b9a23736bc61ae0ec4a5661433", +            "md5": "3d509743359649e34a27ae70c5cd3018", +            "size": 430665, +            "extra": { +                "shadows": {"scimag_doi": "10.4259/ibk.59.1_194", "scimag_id": "69089904"} +            }, +            "revision": "f1fa11ff-d521-45cf-9db1-cb3c8bd3ea48", +            "ident": "duymhmxk3fgtzk37yp2pvthtxq", +            "state": "active", +        }, +        "full_urls": { +            "https://web.archive.org/web/201811010021/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf": "https://web.archive.org/web/20181101002154/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf" +        }, +        "status": "success-self", +    } + +    fe2 = fswtc.parse_record(example_line2) +    assert len(fe2.urls) == 2 +    assert fe2.urls[0].rel == "repository" +    assert ( +        fe2.urls[1].url +        == "https://web.archive.org/web/20181101002154/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf" +    ) + +    # ensure URL order is stable +    example_line3: Dict[str, Any] = { +        "file_entity": { +            "release_ids": ["5rin7f2cdvc5hjkqqw53z7sr3i"], +            "mimetype": "application/pdf", +            "urls": [ +                {"url": "https://pubs.usgs.gov/bul/1108/report.pdf", "rel": "web"}, +                { +                    "url": "https://web.archive.org/web/201904291643/https://pubs.usgs.gov/bul/1108/report.pdf", +                    "rel": "webarchive", +                }, +            ], +            "sha256": "714cd48c2577e9b058b8f16b4574765da685f67582cc53898a9d6933e45d6cc0", +            "sha1": "4efbdb517c0ff3f58136e4efbbec2bd9315400d3", +            "md5": "89b6e6cc4e0259317e26ddf1a9a336a0", +            "size": 41265, +            "revision": "926fcf73-e644-4446-a24b-4d0940a2cf65", +            "ident": "lvnz23nzijaapf5iti45zez6zu", +            "state": "active", +        }, +        "full_urls": { +            "https://web.archive.org/web/201904291643/https://pubs.usgs.gov/bul/1108/report.pdf": "https://web.archive.org/web/20190429164342/https://pubs.usgs.gov/bul/1108/report.pdf" +        }, +        "status": "success-db", +    } + +    fe3 = fswtc.parse_record(example_line3) +    assert len(fe3.urls) == 2 +    assert fe3.urls[0].rel == "web" +    assert fe3.urls[0].url == "https://pubs.usgs.gov/bul/1108/report.pdf" +    assert fe3.urls[1].rel == "webarchive" + + +def main() -> None: +    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) +    parser.add_argument( +        "--host-url", default="http://localhost:9411/v0", help="connect to this host/port" +    ) +    parser.add_argument("--batch-size", help="size of batch to send", default=100, type=int) +    parser.set_defaults( +        auth_var="FATCAT_AUTH_WORKER_CLEANUP", +    ) +    parser.add_argument( +        "json_file", +        help="File with jsonlines from file_meta schema to import from", +        default=sys.stdin, +        type=argparse.FileType("r"), +    ) + +    args = parser.parse_args() +    api = authenticated_api( +        args.host_url, +        # token is an optional kwarg (can be empty string, None, etc) +        token=os.environ.get(args.auth_var), +    ) + +    fswtc = FileShortWaybackTimestampCleanup( +        api, +        
edit_batch_size=args.batch_size, +    ) +    JsonLinePusher(fswtc, args.json_file).run() + + +if __name__ == "__main__": +    main() diff --git a/python/fatcat_tools/cleanups/release_lowercase_doi.py b/python/fatcat_tools/cleanups/release_lowercase_doi.py new file mode 100644 index 00000000..5e3275db --- /dev/null +++ b/python/fatcat_tools/cleanups/release_lowercase_doi.py @@ -0,0 +1,154 @@ +import argparse +import os +import sys + +from fatcat_openapi_client import ApiClient, ApiException, ReleaseEntity, ReleaseExtIds + +from fatcat_tools import authenticated_api, public_api +from fatcat_tools.importers.common import EntityImporter, LinePusher + + +class ReleaseLowercaseDoiCleanup(EntityImporter): +    """ +    This is a one-off / one-time cleanup script for release entities, to fix +    upper-case DOIs. In fatcat, all DOIs should be normalized to lowercase. + +    While this calls itself a cleanup, it is based on the import code path. It +    is not integrated into the `fatcat_import` or `fatcat_cleanup` controller; +    instead it has a __main__ function and is invoked like: + +        python -m fatcat_tools.cleans.release_lowercase_doi - < blah.tsv + +    It expects to get a simple text line on stdin, which is a release entity. +    The correction is implemented by fetching the current version of the +    entity, verifying the issue, and updating if it is still a problem. + +    This does not try to do any merging, just corrects the case in a single +    update. +    """ + +    def __init__(self, api: ApiClient, **kwargs): + +        eg_desc = ( +            kwargs.pop("editgroup_description", None) +            or "Normalize release DOIs (extid) to lower-case" +        ) +        eg_extra = kwargs.pop("editgroup_extra", dict()) +        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ReleaseLowercaseDoiCleanup") +        super().__init__( +            api, +            do_updates=True, +            editgroup_description=eg_desc, +            editgroup_extra=eg_extra, +            **kwargs, +        ) +        self.testing_mode = False + +    def want(self, row: str) -> bool: +        row = row.strip() +        if not row: +            return False +        row = row.split()[0] +        if len(row) == 26: +            return True +        else: +            return False + +    def parse_record(self, row: str) -> ReleaseEntity: + +        # bezerk mode doesn't make sense for this importer +        assert self.bezerk_mode is False + +        ident = row.strip().split()[0] +        assert len(ident) == 26 + +        return ReleaseEntity( +            ident=ident, +            ext_ids=ReleaseExtIds(), +        ) + +    def try_update(self, re: ReleaseEntity) -> bool: + +        # should always be existing, but sometimes not because of prod/QA flip +        existing = None +        try: +            existing = self.api.get_release(re.ident) +        except ApiException as err: +            if err.status != 404: +                raise err + +        if not existing: +            self.counts["skip-existing-not-found"] += 1 +            return False + +        if existing.state != "active": +            self.counts["skip-existing-entity-state"] += 1 +            return False + +        if not existing.ext_ids.doi: +            self.counts["skip-existing-no-doi"] += 1 +            return False + +        if existing.ext_ids.doi == existing.ext_ids.doi.lower(): +            self.counts["skip-existing-doi-fine"] += 1 +            return False + +        existing.ext_ids.doi = 
existing.ext_ids.doi.lower() + +        # not doing a check for "in current editgroup", because the source of +        # these corrections (entity dump) contains no dupes + +        if not self.testing_mode: +            self.api.update_release(self.get_editgroup_id(), existing.ident, existing) +        self.counts["update"] += 1 +        return False + + +def test_lowercase_doi() -> None: +    api = public_api("http://localhost:9411/v0") +    rldc = ReleaseLowercaseDoiCleanup(api=api) +    rldc.testing_mode = True + +    assert rldc.want("") is False +    assert rldc.want("aaaaaaaaaaaaarceaaaaaaaaai") is True +    assert rldc.want("aaaaaaaaaaaaarceaaaaaaaaai\t10.1234/ABCD") is True +    rldc.parse_record("aaaaaaaaaaaaarceaaaaaaaaai") + +    dummy_re = api.get_release("aaaaaaaaaaaaarceaaaaaaaaai") +    assert rldc.try_update(dummy_re) is False +    assert rldc.counts["skip-existing-doi-fine"] == 1 +    # this isn't a very complete test, doesn't get to update part + + +def main() -> None: +    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) +    parser.add_argument( +        "--host-url", default="http://localhost:9411/v0", help="connect to this host/port" +    ) +    parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int) +    parser.set_defaults( +        auth_var="FATCAT_AUTH_WORKER_CLEANUP", +    ) +    parser.add_argument( +        "idents_file", +        help="File with release identifier to try updating", +        default=sys.stdin, +        type=argparse.FileType("r"), +    ) + +    args = parser.parse_args() +    api = authenticated_api( +        args.host_url, +        # token is an optional kwarg (can be empty string, None, etc) +        token=os.environ.get(args.auth_var), +    ) + +    rldc = ReleaseLowercaseDoiCleanup( +        api, +        edit_batch_size=args.batch_size, +    ) +    LinePusher(rldc, args.idents_file).run() + + +if __name__ == "__main__": +    main() diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index fd472d11..2ec6efda 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -436,6 +436,15 @@ class EntityImporter:              if u.rel == "social":                  u.rel = "academicsocial" +        # remove exact URL duplicates, while preserving order, and removing +        # "later" copies, not "first" copies +        # this is sensitive to both url.url and url.rel combined! +        dedupe_urls = [] +        for url_pair in existing.urls: +            if url_pair not in dedupe_urls: +                dedupe_urls.append(url_pair) +        existing.urls = dedupe_urls +          # remove URLs which are near-duplicates          redundant_urls = []          all_urls = [u.url for u in existing.urls] diff --git a/python/fatcat_web/types.py b/python/fatcat_web/types.py index f5182d71..f373825f 100644 --- a/python/fatcat_web/types.py +++ b/python/fatcat_web/types.py @@ -1,4 +1,4 @@ -from typing import Any, Tuple, Union +from typing import Tuple, Union  from flask import Response as FlaskResponse  from werkzeug.wrappers import Response as WerkzeugResponse | 
