From 3aa70adb3380e82a0a6964baa9058a41d8a2d454 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 27 Dec 2019 12:36:01 -0800
Subject: hadoop job log rename and update

---
 notes/hadoop_job_log.md | 210 ++++++++++++++++++++++++++++++++++++++++++++++++
 notes/job_log.txt       | 185 ------------------------------------------
 2 files changed, 210 insertions(+), 185 deletions(-)
 create mode 100644 notes/hadoop_job_log.md
 delete mode 100644 notes/job_log.txt

diff --git a/notes/hadoop_job_log.md b/notes/hadoop_job_log.md
new file mode 100644
index 0000000..f812c0a

## QA matchcrossref

[D8C7F2CA7620450991838D540489948D/8B17786779BE44579C98D8A325AC5959] sandcrawler.ScoreJob/(1/1) ...-24-2102.32-matchcrossref

Submitted: Fri Aug 24 21:03:09 UTC 2018
Started: Fri Aug 24 21:03:20 UTC 2018
Finished: Sat Aug 25 09:46:55 UTC 2018
Elapsed: 12hrs, 43mins, 34sec
Diagnostics:
Average Map Time        24mins, 31sec
Average Shuffle Time    15sec
Average Merge Time      21sec
Average Reduce Time     7mins, 17sec

Map     2312    2312
Reduce  100     100

crossref-rows-filtered  73901964    0           73901964
grobid-rows-filtered    1092992     0           1092992
joined-rows             0           623837      623837

cascading.flow.StepCounters
Tuples_Read     94831255    0           94831255
Tuples_Written  0           623837      623837

Read_Duration   7108430     352241      7460671
Tuples_Read     94831255    74994956    169826211
Tuples_Written  74994956    623837      75618793
Write_Duration  7650302     21468       7671770

## QA UnGrobided

Submitted: Sat Aug 25 01:23:22 UTC 2018
Started: Sat Aug 25 05:06:36 UTC 2018
Finished: Sat Aug 25 05:13:45 UTC 2018
Elapsed: 7mins, 8sec
Diagnostics:
Average Map Time        1mins, 20sec
Average Shuffle Time    12sec
Average Merge Time      15sec
Average Reduce Time     29sec

Map     48  48
Reduce  1   1

    bnewbold@bnewbold-dev$ gohdfs du -sh sandcrawler/output-qa/2018-08-25-0122.54-dumpungrobided/part*
    56.8M   /user/bnewbold/sandcrawler/output-qa/2018-08-25-0122.54-dumpungrobided/part-00000

## Prod UnGrobided

[D76F6BF91D894E879E747C868B0DEDE7/394A1AFC44694992B71E6920AF8BA3FB] sandcrawler.DumpUnGrobidedJob/(1/1) ...26-0910.25-dumpungrobided

Map     278     278
Reduce  1       1

Submitted: Sun Aug 26 09:10:51 UTC 2018
Started: Sun Aug 26 09:18:21 UTC 2018
Finished: Sun Aug 26 10:29:28 UTC 2018
Elapsed: 1hrs, 11mins, 7sec
Diagnostics:
Average Map Time        4mins, 48sec
Average Shuffle Time    24mins, 17sec
Average Merge Time      14sec
Average Reduce Time     13mins, 54sec

cascading.flow.StepCounters
Name            Map         Reduce      Total
Tuples_Read     64510564    0           64510564
Tuples_Written  0           21618164    21618164

## Prod Crossref Match

[6C063C0809244446BA8602C3BE99CEC2/5FE5D87899154F38991A1ED58BEB34D4] sandcrawler.ScoreJob/(1/1) ...-25-1753.01-matchcrossref

Map     2427    2427
Reduce  50      50

Submitted: Sat Aug 25 17:53:50 UTC 2018
Started: Sat Aug 25 17:53:59 UTC 2018
Finished: Sun Aug 26 11:22:52 UTC 2018
Elapsed: 17hrs, 28mins, 52sec
Diagnostics:
Average Map Time        31mins, 20sec
Average Shuffle Time    1mins, 21sec
Average Merge Time      41sec
Average Reduce Time     3hrs, 14mins, 39sec

crossref-rows-filtered  73901964    0           73901964
grobid-rows-filtered    14222226    0           14222226
joined-rows             0           14115453    14115453

## "Prod" Fatcat Group Works (run 2019-08-10)

    ./please --prod groupworks-fatcat hdfs:///user/bnewbold/release_export.2019-07-07.json

    job_1559844455575_118299
    http://ia802401.us.archive.org:6988/proxy/application_1559844455575_118299
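Could also check on the job from the command line instead of the tracking proxy URL (untested sketch; assumes a configured yarn client on a cluster gateway host):

    # check application state by ID (same ID as the tracking URL above)
    yarn application -status application_1559844455575_118299

    # or list everything currently running on the cluster
    yarn application -list -appStates RUNNING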
## Re-GROBID batch (2019-11-12)

Want to re-process "old" GROBID output with the newer (0.5.5+fatcat) GROBID
version (vanilla training) plus biblio-glutton identification. Hoping to make a
couple million new fatcat matches; will probably do a later round of ML
matching over this batch as well.

    # in /grande/regrobid

    # as postgres
    psql sandcrawler < dump_regrobid_pdf.sql > dump_regrobid_pdf.txt

    # as bnewbold
    # uniq -w 40 dedupes on the first 40 chars (the SHA1 hex); cut -f2 drops that key column
    cat dump_regrobid_pdf.txt | sort -S 4G | uniq -w 40 | cut -f2 | pv -l > dump_regrobid_pdf.2019-11-12.json
    # 41.5M lines, uniq by SHA1
    # NOTE: not the full 56m+ from the GROBID table... some are in archive.org,
    # others are not application/pdf type. Will need to follow up on those later.

    # intend to have 3 worker machines, but splitting 6 ways in case we need to
    # re-balance load or get extra machines or something
    split -n l/6 -a1 -d --additional-suffix=.json dump_regrobid_pdf.2019-11-12.json regrobid_cdx.split_

    # distribute to tmp001, tmp002, tmp003:
    tmp001: 0,1
    tmp002: 2,3
    tmp003: 4,5

    # test local grobid config:
    head /srv/sandcrawler/tasks/regrobid_cdx.split_0.json | pv -l | ./grobid_tool.py --grobid-host http://localhost:8070 -j0 extract-json - > example_out.json
    # expect at least a couple fatcat matches
    cat example_out.json | jq .tei_xml -r | rg fatcat

    # test GROBID+kafka config:
    cat /srv/sandcrawler/tasks/regrobid_cdx.split_*.json | pv -l | head | parallel -j40 --linebuffer --round-robin --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -

    # full run, in a screen session
    cat /srv/sandcrawler/tasks/regrobid_cdx.split_*.json | pv -l | parallel -j40 --linebuffer --round-robin --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -

NOTE: really should get a parallel kafka worker going soon. If there is a
reboot or something in the middle of this process, the whole thing will need to
be re-run from the start.

Was getting a bunch of weird kafka INVALID_MSG errors on produce. It would be
nice to be able to retry, so doing:

    cat /srv/sandcrawler/tasks/regrobid_cdx.split_*.json | pv -l | parallel --joblog regrobid_job.log --retries 5 -j40 --linebuffer --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -

Never mind; going to split into chunks which can be retried individually:

    cd /srv/sandcrawler/tasks
    sudo chown sandcrawler:staff .
    cat regrobid_cdx.split_* | split -l 20000 -a4 -d --additional-suffix=.json - chunk_
    ls /srv/sandcrawler/tasks/chunk_*.json | parallel -j4 ./extract_chunk.sh {}

extract_chunk.sh:

    #!/bin/bash

    set -x -e -u -o pipefail

    # skip chunks that already completed on a previous run
    if [ -f "$1.SUCCESS" ]; then
        echo "Skipping: $1..."
        exit
    fi

    echo "Extracting $1..."

    date
    cat "$1" | parallel -j10 --linebuffer --round-robin --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -

    touch "$1.SUCCESS"

Seems to be working better! Tested that if there is a problem with one chunk,
the others continue.
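If a worker reboots mid-run, the .SUCCESS markers make restarts cheap: only the
chunks without a marker need to be re-queued. Untested sketch; re-running the
original ls-into-parallel line would also work, since extract_chunk.sh skips
finished chunks itself:

    # re-queue only the chunks that never finished
    cd /srv/sandcrawler/tasks
    for f in chunk_*.json; do
        [ -f "$f.SUCCESS" ] || echo "$f"
    done | parallel -j4 ./extract_chunk.sh {}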
## Pig Joins (around 2019-12-24)

Partial (as a start):

    pig -param INPUT_CDX="/user/bnewbold/pdfs/gwb-pdf-20191005172329" -param INPUT_DIGEST="/user/bnewbold/scihash/shadow.20191222.sha1b32.sorted" -param OUTPUT="/user/bnewbold/scihash/gwb-pdf-20191005172329.shadow.20191222.join.cdx" join-cdx-sha1.pig

    HadoopVersion    PigVersion       UserId    StartedAt            FinishedAt           Features
    2.6.0-cdh5.11.2  0.12.0-cdh5.0.1  bnewbold  2019-12-27 00:39:38  2019-12-27 15:32:44  HASH_JOIN,ORDER_BY,DISTINCT,FILTER

    Success!

    Job Stats (time in seconds):
    JobId                    Maps  Reduces  MaxMapTime  MinMapTIme  AvgMapTime  MedianMapTime  MaxReduceTime  MinReduceTime  AvgReduceTime  MedianReducetime  Alias             Feature    Outputs
    job_1574819148370_46540  4880  0        143         10          27          21             n/a            n/a            n/a            n/a               cdx               MAP_ONLY
    job_1574819148370_46541  19    0        59          9           25          18             n/a            n/a            n/a            n/a               digests           MAP_ONLY
    job_1574819148370_46773  24    1        17          7           10          9              6              6              6              6                 digests           SAMPLER
    job_1574819148370_46774  7306  1        55          4           7           7              25             25             25             25                cdx               SAMPLER
    job_1574819148370_46778  7306  40       127         8           18          15             4970           1936           2768           2377              cdx               ORDER_BY
    job_1574819148370_46779  24    20       80          24          60          66             90             26             38             37                digests           ORDER_BY
    job_1574819148370_46822  22    3        101         27          53          48             1501           166            735            539                                 DISTINCT
    job_1574819148370_46828  7146  959      122         7           16          14             91             21             35             32                full_join,result  HASH_JOIN  /user/bnewbold/scihash/gwb-pdf-20191005172329.shadow.20191222.join.cdx,

    Input(s):
    Successfully read 1968654006 records (654323590996 bytes) from: "/user/bnewbold/pdfs/gwb-pdf-20191005172329"
    Successfully read 74254196 records (2451575849 bytes) from: "/user/bnewbold/scihash/shadow.20191222.sha1b32.sorted"

    Output(s):
    Successfully stored 0 records in: "/user/bnewbold/scihash/gwb-pdf-20191005172329.shadow.20191222.join.cdx"

Oops! Didn't upper-case the sha1b32 output, so nothing matched and the join
stored 0 records.
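CDX digests are upper-case base32, so the shadow digest list needs the same
case before re-running the join. Possible fix (untested sketch; the local
filename and the hdfs dfs upload step are assumptions):

    # upper-case the base32 SHA-1 list, keep it sorted, and replace the copy on HDFS
    tr '[:lower:]' '[:upper:]' < shadow.20191222.sha1b32 | sort > shadow.20191222.sha1b32.upper.sorted
    hdfs dfs -rm /user/bnewbold/scihash/shadow.20191222.sha1b32.sorted
    hdfs dfs -put shadow.20191222.sha1b32.upper.sorted /user/bnewbold/scihash/shadow.20191222.sha1b32.sorted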
Full GWB:

    pig -param INPUT_CDX="/user/bnewbold/pdfs/gwb-pdf-20191005172329" -param INPUT_DIGEST="/user/bnewbold/scihash/shadow.20191222.sha1b32.sorted" -param OUTPUT="/user/bnewbold/scihash/gwb-pdf-20191005172329.shadow.20191222.join.cdx" join-cdx-sha1.pig
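Once the full run finishes, worth a quick sanity check that the join output is
non-empty this time (sketch, using the same gohdfs tool as above):

    # should be well above zero bytes now
    gohdfs du -sh /user/bnewbold/scihash/gwb-pdf-20191005172329.shadow.20191222.join.cdx/part*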
diff --git a/notes/job_log.txt b/notes/job_log.txt
deleted file mode 100644
index 67623ec..0000000