diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-23 18:58:37 -0800 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-23 18:58:37 -0800 | 
| commit | 8080eef139b5dcf6201e4f27076a879d0df20096 (patch) | |
| tree | 4a19cb90122865bc55f172b5115e9ac052cfdf30 | |
| parent | f6c4bd65c104e3a728e94561be80242cf35cbea3 (diff) | |
| download | fatcat-8080eef139b5dcf6201e4f27076a879d0df20096.tar.gz fatcat-8080eef139b5dcf6201e4f27076a879d0df20096.zip | |
file de-dupe: notes on prep and QA testing
| -rw-r--r-- | notes/cleanups/file_sha1_dedupe.md | 64 | ||||
| -rwxr-xr-x | notes/cleanups/scripts/file_dupe_to_json.py | 72 | 
2 files changed, 136 insertions, 0 deletions
| diff --git a/notes/cleanups/file_sha1_dedupe.md b/notes/cleanups/file_sha1_dedupe.md new file mode 100644 index 00000000..0829bc79 --- /dev/null +++ b/notes/cleanups/file_sha1_dedupe.md @@ -0,0 +1,64 @@ + + +## Prep + +Using `check_hashes.sh`: + +    zcat $HASH_FILE \ +        | awk '{print $3 "\t" $1}' \ +        | rg -v '^\t' \ +        | sort -S 4G \ +        | uniq -D -w 40 \ +        > sha1_ident.dupes.tsv + +    wc -l sha1_ident.dupes.tsv  +    # 6,350 + +    cut -f1 sha1_ident.dupes.tsv | uniq | wc -l +    # 2,039 + +Want to create JSON for each group, like: + +    entity_type: "file" +    primary_id: str or None +    duplicate_ids: [str] +    evidence: +        extid: str +        extid_type: "sha1" + +Run transform script: + +    cat sha1_ident.dupes.tsv | ./file_dupe_to_json.py | pv -l > file_sha1_dupes.json +    # 2.04k 0:00:00 [9.16k/s] + + +## QA Testing + +    export FATCAT_AUTH_API_TOKEN=[...] + +    head -n25 /srv/fatcat/datasets/file_sha1_dupes.json \ +        | python -m fatcat_tools.mergers.files --editgroup-description-override "Automated merging of file entities with duplicate SHA-1 hashes" --dry-run merge-files - + +Hit some small bugs running in QA; test coverage isn't great, but I think it hits +the important parts. + +    head -n25 /srv/fatcat/datasets/file_sha1_dupes.json \ +        | python -m fatcat_tools.mergers.files --editgroup-description-override "Automated merging of file entities with duplicate SHA-1 hashes" --dry-run merge-files - +    # Running in dry-run mode! +    # Counter({'updated-entities': 60, 'lines': 25, 'merged': 25, 'skip': 0, 'updated-total': 0}) + +Dry-run mode didn't actually work, and edits actually happened (!). + +Edits do look good. 
+ +Try again, not dry-run, to ensure that case is handled: + +    head -n25 /srv/fatcat/datasets/file_sha1_dupes.json | python -m fatcat_tools.mergers.files --editgroup-description-override "Automated merging of file entities with duplicate SHA-1 hashes" merge-files - +    # Counter({'lines': 25, 'skip': 25, 'skip-not-active-entity': 25, 'merged': 0, 'updated-total': 0}) + +And then run 500 through for more testing: + +    head -n500 /srv/fatcat/datasets/file_sha1_dupes.json | python -m fatcat_tools.mergers.files --editgroup-description-override "Automated merging of file entities with duplicate SHA-1 hashes" merge-files - +    # Counter({'updated-entities': 1341, 'lines': 500, 'merged': 474, 'skip': 26, 'skip-not-active-entity': 25, 'skip-entity-not-found': 1, 'updated-total': 0}) + +The majority of merges seem to be cases where there are multiple articles in the same PDF. diff --git a/notes/cleanups/scripts/file_dupe_to_json.py b/notes/cleanups/scripts/file_dupe_to_json.py new file mode 100755 index 00000000..2064dc1c --- /dev/null +++ b/notes/cleanups/scripts/file_dupe_to_json.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 + +""" +This script can be used to transform duplicate file entity hash export rows +into JSON objects which can be passed to the file entity merger. + +The input is expected to be a TSV with two columns: a hash value in the first +column, and a fatcat file entity ident (in UUID format, not "fatcat ident" +encoded) in the second column. The rows are assumed to be sorted by hash value +(the first column), and duplicate values (same hash, differing UUID) are +contiguous. + +File hashes aren't really "external identifiers" (ext_id), but we treat them as +such here. + +Script is pretty simple, should be possible to copy and reuse for release, +container, creator entity duplicates. 
+""" + +import json, sys +from typing import Optional +import base64, uuid + +EXTID_TYPE = "sha1" + +def uuid2fcid(s: str) -> str: +    """ +    Converts a uuid.UUID object to a fatcat identifier (base32 encoded string) +    """      +    raw = uuid.UUID(s).bytes +    return base64.b32encode(raw)[:26].lower().decode("utf-8") + +def print_group(extid, dupe_ids): +    if len(dupe_ids) < 2: +        return +    group = dict( +        entity_type="file", +        primary_id=None, +        duplicate_ids=dupe_ids, +        evidence=dict( +            extid=extid, +            extid_type=EXTID_TYPE, +        ), +    ) +    print(json.dumps(group, sort_keys=True)) + +def run(): +    last_extid = None +    dupe_ids = [] +    for l in sys.stdin: +        l = l.strip() +        if not l: +            continue +        (row_extid, row_uuid) = l.split("\t")[0:2] +        if EXTID_TYPE == "sha1": +            assert len(row_extid) == 40 +        else: +            raise Exception(f"extid type not supported yet: {EXTID_TYPE}") +        row_id = uuid2fcid(row_uuid) +        if row_extid == last_extid: +            dupe_ids.append(row_id) +            continue +        elif dupe_ids: +            print_group(last_extid, dupe_ids) +        last_extid = row_extid +        dupe_ids = [row_id] +    if last_extid and dupe_ids: +        print_group(last_extid, dupe_ids) + + +if __name__=="__main__": +    run() | 
