From 74ae11c45dae25ba6cf2546082d3cc3833379f10 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 4 Sep 2020 19:44:40 -0700
Subject: file_meta import notes

---
 notes/bulk_edits/2020-09-02_file_meta.md | 75 ++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)
 create mode 100644 notes/bulk_edits/2020-09-02_file_meta.md

diff --git a/notes/bulk_edits/2020-09-02_file_meta.md b/notes/bulk_edits/2020-09-02_file_meta.md
new file mode 100644
index 00000000..35c4d87f
--- /dev/null
+++ b/notes/bulk_edits/2020-09-02_file_meta.md
@@ -0,0 +1,75 @@

Approximately 18 million file entities have only partial file metadata. All
have a sha1 (hex), but many are missing file size, md5, mimetype, etc.

At least a few thousand of these are additionally *not* of `application/pdf`
mimetype, based on actually retrieving the file and sniffing the file type.
These were added to the catalog earlier, likely based on the CDX mimetype,
which is server-reported and can be incorrect.

## QA Testing

    ./fatcat_import.py --editgroup-description-override "backfill of full file-level metadata for early-imported papers" file-meta -
    => Counter({'total': 1000, 'update': 1000, 'skip': 0, 'insert': 0, 'exists': 0})

    # Identical command, verifying that we don't double-insert:
    => Counter({'total': 1000, 'skip-existing-complete': 1000, 'skip': 0, 'insert': 0, 'update': 0, 'exists': 0})

Two additional file-level cleanups should probably be done at the same time
(rough detection sketches for both are included at the end of this section).

Partial wayback URL timestamps, for cases where we have the full timestamped
URL. E.g.:

    https://web.archive.org/web/2017/http://www.geoamazonia.net/index.php/revista/article/download/51/pdf_38
    https://web.archive.org/web/20170922010835/http://www.geoamazonia.net/index.php/revista/article/download/51/pdf_38

    https://qa.fatcat.wiki/file/4udmm4zd4bgfhnaaycqoztgfgm
    https://qa.fatcat.wiki/file/k73il3k5hzemtnkqa5qyorg6ci
    https://qa.fatcat.wiki/file/7hstlrabfjb6vgyph7ntqtpkne

Live-web URLs identical except for an http/https flip or other trivial
differences (a much less frequent case):

    http://eo1.gsfc.nasa.gov/new/validationReport/Technology/JoeCD/asner_etal_PNAS_20041.pdf
    https://eo1.gsfc.nasa.gov/new/validationReport/Technology/JoeCD/asner_etal_PNAS_20041.pdf

    http://homepages.math.uic.edu/~rosendal/PapersWebsite/BanachMinimalExamples.pdf
    http://homepages.math.uic.edu:80/~rosendal/PapersWebsite/BanachMinimalExamples.pdf

    https://qa.fatcat.wiki/file/h2wx6re5fjhx7c6duifzskeo6u
    https://qa.fatcat.wiki/file/vw7divmjwveftn4djj2cp32n4i

Which bot to use? Let's do `sandcrawler-bot`.

Trying a larger batch to see what the database size increase is going to look
like, and whether single-threaded is going to be too slow:

    # before: Size: 517.87G

    time zcat /srv/fatcat/datasets/fatcat_file_partial.file_meta.json.gz | head -n500000 | pv -l | ./fatcat_import.py --editgroup-description-override "backfill of full file-level metadata for early-imported papers" file-meta -
    => 145m18.615s

    # after: 518.47G
    # delta: 600 MB

At that rate a million records would take about 5 hours, so 90-100 hours
total for all 18 million, or about 4 days. Let's use parallelism.

Total size increase estimated as 24 GBytes. It all adds up!

    time zcat /srv/fatcat/datasets/fatcat_file_partial.file_meta.json.gz | tail -n500000 | pv -l | parallel -j8 --round-robin --pipe -q ./fatcat_import.py --editgroup-description-override 'backfill of full file-level metadata for early-imported papers' file-meta -
    => real 32m53.935s
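For reference, each line of the input dump is a JSON file_meta record. A
quick way to eyeball one record (the field names below are assumed from
sandcrawler's file_meta schema, not copied from this particular dump):

    zcat /srv/fatcat/datasets/fatcat_file_partial.file_meta.json.gz | head -n1 | jq .
    # expected shape, roughly (hypothetical values):
    # {"sha1hex": "...", "md5hex": "...", "sha256hex": "...",
    #  "size_bytes": 12345, "mimetype": "application/pdf"}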
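Neither of the two cleanups above is implemented yet. As a very rough sketch
of the detection side only, assuming a hypothetical plain-text list of file
entity URLs (one per line) in `urls.txt`:

    # flag wayback URLs with partial (fewer than 14-digit) timestamps
    grep -E 'web\.archive\.org/web/[0-9]{1,13}/' urls.txt

    # find live-web URLs that collide after folding https to http and
    # stripping an explicit :80 port (crude normalization, sketch only)
    sed -E -e 's|^https://|http://|' -e 's|:80/|/|' urls.txt | sort | uniq -d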
## Production Import

Before Size: 624.63G

    export FATCAT_API_AUTH_TOKEN...
    # sandcrawler-bot

    # start small
    time zcat /srv/fatcat/datasets/fatcat_file_partial.file_meta.json.gz | pv -l | head -n1000 | parallel -j8 --round-robin --pipe -q ./fatcat_import.py --editgroup-description-override 'backfill of full file-level metadata for early-imported papers' file-meta -

    # full batch
    time zcat /srv/fatcat/datasets/fatcat_file_partial.file_meta.json.gz | pv -l | parallel -j8 --round-robin --pipe -q ./fatcat_import.py --editgroup-description-override 'backfill of full file-level metadata for early-imported papers' file-meta -

    => 18.1M 20:53:32 [ 241 /s]

    Counter({'total': 2234159, 'update': 2234111, 'skip-existing-complete': 48, 'skip': 0, 'insert': 0, 'exists': 0})
    (etc, 8x)

After Size: 653.69G (+29GB or so)
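Not run as part of this import, but a cheap follow-up sanity check would be
to re-import a small sample, which should now skip every record (the same
behavior as the QA double-import above):

    # hypothetical re-run; expect skip-existing-complete for all 1000 records
    zcat /srv/fatcat/datasets/fatcat_file_partial.file_meta.json.gz | head -n1000 | ./fatcat_import.py file-meta -
    # => Counter({'total': 1000, 'skip-existing-complete': 1000, ...})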