From 5cfa53140bf8638565027fa9bd8e394fc2c40c27 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 28 Feb 2019 12:33:47 -0800 Subject: include report and sqlite3 example files --- examples/output.sqlite3 | Bin 0 -> 262144 bytes examples/report.html | 896 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 896 insertions(+) create mode 100644 examples/output.sqlite3 create mode 100644 examples/report.html diff --git a/examples/output.sqlite3 b/examples/output.sqlite3 new file mode 100644 index 0000000..b86e281 Binary files /dev/null and b/examples/output.sqlite3 differ diff --git a/examples/report.html b/examples/report.html new file mode 100644 index 0000000..7d1c595 --- /dev/null +++ b/examples/report.html @@ -0,0 +1,896 @@ +

Crawl QA Report

+

This crawl report is auto-generated from a sqlite database file, which should be available/included.

+

Seedlist Stats

+
+ + + + + + + + + + +
identifiers | uris | domains
480 | 583 | 163
QUERY: SELECT COUNT(DISTINCT identifier) as identifiers, COUNT(DISTINCT initial_url) as uris, COUNT(DISTINCT initial_domain) AS domains FROM crawl_result;
+

FTP seed URLs

+
+ + + + + + +
ftp_urls
0
QUERY: SELECT COUNT(*) as ftp_urls FROM crawl_result WHERE initial_url LIKE 'ftp://%';
+

Successful Hits

+
+ + + + + + + + + + +
identifiers | uris | unique_sha1
63 | 166 | 166
QUERY: SELECT COUNT(DISTINCT identifier) as identifiers, COUNT(DISTINCT initial_url) as uris, COUNT(DISTINCT final_sha1) as unique_sha1 FROM crawl_result WHERE hit=1;
+

De-duplication percentage (i.e., the percentage of hits whose content had been crawled and identified previously):

+
+ + + + + + +
percent
47.59036144578313
QUERY: SELECT 100. * AVG(final_was_dedupe) as percent FROM crawl_result WHERE hit=1;
+

Top mimetypes for successful hits (these are usually filtered to a fixed list in post-processing):

+
+ + + + + + + + + + + + +
final_mimetypeCOUNT(*)
application/pdf161
application/octet-stream5
QUERY: SELECT final_mimetype, COUNT(*) FROM crawl_result WHERE hit=1 GROUP BY final_mimetype ORDER BY COUNT(*) DESC LIMIT 10;
+

Most popular breadcrumbs (a measure of how hard the crawler had to work):

+
+ + + + + + + + + + + + + + + + +
breadcrumbs | COUNT(*)
- | 125
R | 39
L | 2
QUERY: SELECT breadcrumbs, COUNT(*) FROM crawl_result WHERE hit=1 GROUP BY breadcrumbs ORDER BY COUNT(*) DESC LIMIT 10;
+

FTP vs. HTTP hits (200 is HTTP, 226 is FTP):

+
+ + + + + + + + +
final_status_code | COUNT(*)
200 | 166
QUERY: SELECT final_status_code, COUNT(*) FROM crawl_result WHERE hit=1 GROUP BY final_status_code LIMIT 10;
+

Domain Summary

+

Top initial domains:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
initial_domainCOUNT(*)percent
www.nature.com223.7735849056603774
www.medicaljournals.se213.6020583190394513
ajpgi.physiology.org142.4013722126929675
jn.physiology.org122.058319039451115
naukaru.ru122.058319039451115
www.physiology.org122.058319039451115
web.mit.edu111.8867924528301887
www.nada.kth.se111.8867924528301887
medicaljournals.se101.7152658662092624
www.jstage.jst.go.jp101.7152658662092624
www.site.uottawa.ca101.7152658662092624
www.tandfonline.com101.7152658662092624
academic.oup.com91.5437392795883362
iopscience.iop.org91.5437392795883362
www.amjbot.org91.5437392795883362
www.efmaefm.org91.5437392795883362
ajpcell.physiology.org81.3722126929674099
ajpheart.physiology.org81.3722126929674099
content.iospress.com81.3722126929674099
link.springer.com81.3722126929674099
QUERY: SELECT initial_domain, COUNT(*), 100. * COUNT(*) / (SELECT COUNT(*) FROM crawl_result) as percent FROM crawl_result GROUP BY initial_domain ORDER BY count(*) DESC LIMIT 20;
+

Top successful domains, where hits were found (note: the query below groups by initial_domain, not final_domain):

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
initial_domainCOUNT(*)percent
www.physiology.org127.228915662650603
www.jstage.jst.go.jp106.024096385542169
content.iospress.com84.819277108433735
digital.library.unt.edu74.216867469879518
files.eccomasproceedia.org74.216867469879518
link.springer.com74.216867469879518
www.scielo.br74.216867469879518
www.termedia.pl74.216867469879518
ijpsr.com63.6144578313253013
uvadoc.uva.es63.6144578313253013
www.jafs.com.pl63.6144578313253013
hal.archives-ouvertes.fr53.0120481927710845
iopscience.iop.org53.0120481927710845
www.cambridge.org53.0120481927710845
digitool.library.mcgill.ca42.4096385542168677
www.ejgm.co.uk42.4096385542168677
www.pnas.org42.4096385542168677
aaltodoc.aalto.fi31.8072289156626506
citeseerx.ist.psu.edu31.8072289156626506
digital.csic.es31.8072289156626506
QUERY: SELECT initial_domain, COUNT(*), 100. * COUNT(*) / (SELECT COUNT(*) FROM crawl_result WHERE hit=1) AS percent  FROM crawl_result WHERE hit=1 GROUP BY initial_domain ORDER BY COUNT(*) DESC LIMIT 20;
+

Top non-successful, final domains where crawl paths terminated before a successful hit (but crawl did run):

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
final_domainCOUNT(*)
www.medicaljournals.se21
www.nature.com21
ajpgi.physiology.org14
jn.physiology.org12
naukaru.ru12
web.mit.edu11
www.nada.kth.se11
medicaljournals.se10
www.site.uottawa.ca10
www.tandfonline.com10
academic.oup.com9
www.amjbot.org9
www.efmaefm.org9
ajpcell.physiology.org8
ajpheart.physiology.org8
pdfs.journals.lww.com8
www.osti.gov8
ajpregu.physiology.org7
pubs.rsna.org7
download.atlantis-press.com6
QUERY: SELECT final_domain, COUNT(*) FROM crawl_result WHERE hit=0 AND final_status_code IS NOT NULL GROUP BY final_domain ORDER BY count(*) DESC LIMIT 20;
+

Top uncrawled, initial domains, where the crawl didn't even attempt to run:

+
+ + + + +
initial_domainCOUNT(*)
QUERY: SELECT initial_domain, COUNT(*) FROM crawl_result WHERE hit=0 AND final_status_code IS NULL GROUP BY initial_domain ORDER BY count(*) DESC LIMIT 20;
+

Top blocked, final domains:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
final_domainCOUNT(*)
140.115.82.1911
classes.maxwell.syr.edu1
drona.csa.iisc.ernet.in1
lamar.colostate.edu1
linux46.ma.utexas.edu1
mathro.fpms.ac.be1
pdl.cmu.edu1
sammelpunkt.philo.at1
suma.ldc.usb.ve1
virtualmentor.ama-assn.org1
www.cais.ntu.edu.sg1
www.cse.ucla.edu1
www.ece.stevens-tech.edu1
www.lance.colostate.edu1
www2.asanet.org1
QUERY: SELECT final_domain, COUNT(*) FROM crawl_result WHERE hit=0 AND (final_status_code='-61' OR final_status_code='-2') GROUP BY final_domain ORDER BY count(*) DESC LIMIT 20;
+

Top rate-limited, final domains:

+
+ + + + + + + + + + + + +
final_domainCOUNT(*)
www.researchgate.net6
openknowledge.worldbank.org1
QUERY: SELECT final_domain, COUNT(*) FROM crawl_result WHERE hit=0 AND final_status_code='429' GROUP BY final_domain ORDER BY count(*) DESC LIMIT 20;
+

Status Summary

+

Top failure status codes:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
final_status_code | COUNT(*)
404 | 112
301 | 85
403 | 61
302 | 60
-6 | 36
303 | 21
-2 | 15
429 | 7
503 | 7
200 | 5
QUERY: SELECT final_status_code, COUNT(*) FROM crawl_result WHERE hit=0 GROUP BY final_status_code ORDER BY count(*) DESC LIMIT 10;
+

Example Results

+

A handful of random success lines:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
identifierinitial_urlbreadcrumbsfinal_urlfinal_sha1final_mimetype
10.1017/s0022149x00006660 +https://www.cambridge.org/core/services/aop-cambridge-core/content/view/A291CBD43AD6F7FA0F44E6592E214060/S0022149X00006660a.pdf/div-class-title-jhl-volume-54-issue-4-cover-and-back-matter-div.pdf-https://www.cambridge.org/core/services/aop-cambridge-core/content/view/A291CBD43AD6F7FA0F44E6592E214060/S0022149X00006660a.pdf/div-class-title-jhl-volume-54-issue-4-cover-and-back-matter-div.pdfW7UGJ7XAIILAEZFHH73FZ7XH5XRUENOZapplication/pdf
10.7712/100016.2380.8613 +https://files.eccomasproceedia.org/papers/eccomas-congress-2016/8613.pdf?mtime=20170308165111-https://files.eccomasproceedia.org/papers/eccomas-congress-2016/8613.pdf?mtime=20170308165111FM5ZQWTUQ2N7T7SXFNLCVA6N5RWQRTI6application/pdf
https://aaltodoc.aalto.fi/bitstream/handle/123456789/17665/A1_hakonen_pertti_j_1987.pdf;jsessionid=F5E9AAC28EEB3F2E2ECA2997AA0A194B?sequence=1Rhttps://aaltodoc.aalto.fi/bitstream/handle/123456789/17665/A1_hakonen_pertti_j_1987.pdf;jsessionid=F5E9AAC28EEB3F2E2ECA2997AA0A194B?sequence=14OUP6PQQ6CISN26ZSYSI7YK4QZG2VBCHapplication/pdf
https://hal.archives-ouvertes.fr/hal-01578692/document-https://hal.archives-ouvertes.fr/hal-01578692/document6USL3UAMYQSKX2CLZXZ3N7YA7RBE4MAZapplication/pdf
http://www.jafs.com.pl/pdf-80904-17172?filename=Effect-http://www.jafs.com.pl/pdf-80904-17172?filename=EffectWHHSO2BB3AYSYOMNWAQLFJXA6RSDK4SZapplication/pdf
10.1109/lcomm.2012.120312.121675 +http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.337.8390&rep=rep1&type=pdf-http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.337.8390&rep=rep1&type=pdfLP4ZFJ36GN6N7PKWSCLXFSQQFHTEZD3Oapplication/pdf
http://www.jafs.com.pl/pdf-77058-14511?filename=Effects-http://www.jafs.com.pl/pdf-77058-14511?filename=EffectsYCHB676GBGVZH5O5CAH7EM2USTRVH5VLapplication/pdf
https://content.iospress.com/download/information-services-and-use/isu851?id=information-services-and-use%2Fisu851-https://content.iospress.com/download/information-services-and-use/isu851?id=information-services-and-use%2Fisu851NFITUUUWEGUOI6OWWBVI45Z5JQQV4QBIapplication/pdf
10.1007/bf02907787 +https://link.springer.com/content/pdf/10.1007%2FBF02907787.pdf-https://link.springer.com/content/pdf/10.1007%2FBF02907787.pdfGF4XYUGTDKK4JL7FFLTJXMJJAZLCPQZ2application/pdf
10.2172/73948 +https://digital.library.unt.edu/ark:/67531/metadc704352/m2/1/high_res_d/73948.pdf-https://digital.library.unt.edu/ark:/67531/metadc704352/m2/1/high_res_d/73948.pdfKKSZMZOTULQNXFHQKO4VGMXWI36NIZKHapplication/pdf
QUERY: SELECT identifier, initial_url, breadcrumbs, final_url, final_sha1, final_mimetype FROM crawl_result WHERE hit=1 ORDER BY random() LIMIT 10;
+

A handful of random non-success lines:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
identifierinitial_urlbreadcrumbsfinal_urlfinal_status_codefinal_mimetype
10.1109/78.661335 +http://www-sccm.stanford.edu/Students/vanderveen/SPtrans98b.ps.gz-http://www-sccm.stanford.edu/Students/vanderveen/SPtrans98b.ps.gz-6application/octet-stream
10.1109/mobhoc.2009.5336965 +http://www.cis.umassd.edu/%7Exbai/pubs/J-DirectionalCoverage.pdf-http://www.cis.umassd.edu/%7Exbai/pubs/J-DirectionalCoverage.pdf404text/html
10.2340/00015555-1505 +https://www.medicaljournals.se/acta/content_files/download.php?doi=10.2340/00015555-1505-https://www.medicaljournals.se/acta/content_files/download.php?doi=10.2340/00015555-1505403text/html
10.1016/s0166-3542(01)00195-4 +http://dissertations.ub.rug.nl/FILES/faculties/science/2001/b.w.a.van.der.strate/c1.pdf-http://dissertations.ub.rug.nl/FILES/faculties/science/2001/b.w.a.van.der.strate/c1.pdf-6application/octet-stream
10.1145/996566.996624 +http://www2.dac.com/41st/41acceptedpapers.nsf/0c4c09c6ffa905c487256b7b007afb72/b23ec16f6e1fc42c87256e54007a1f0a/$file/13_3.pdf-http://www2.dac.com/41st/41acceptedpapers.nsf/0c4c09c6ffa905c487256b7b007afb72/b23ec16f6e1fc42c87256e54007a1f0a/$file/13_3.pdf404text/html
10.1080/07438141.2011.627625 +http://www.tandfonline.com/doi/pdf/10.1080/07438141.2011.627625?needAccess=true-http://www.tandfonline.com/doi/pdf/10.1080/07438141.2011.627625?needAccess=true302text/html
10.1152/physiolgenomics.00296.2005 +http://physiolgenomics.physiology.org/content/physiolgenomics/26/1/91.full.pdf-http://physiolgenomics.physiology.org/content/physiolgenomics/26/1/91.full.pdf301application/octet-stream
10.1111/j.1540-6261.2006.01064.x +http://www.efmaefm.org/efmsympo2005/accepted_papers/06-Neil_Brisley_paper.pdf-http://www.efmaefm.org/efmsympo2005/accepted_papers/06-Neil_Brisley_paper.pdf404text/html
10.1109/18.923725 +http://web.mit.edu/bchen/www/pubs/it01-chen.pdf-http://web.mit.edu/bchen/www/pubs/it01-chen.pdf404text/html
10.2991/iccia.2012.347 +http://download.atlantis-press.com/php/download_paper.php?id=4295-http://download.atlantis-press.com/php/download_paper.php?id=4295301text/html
10.1126/science.1164647 +https://www.orgchem.science.ru.nl/pubs/10.1126_1668.pdf-https://www.orgchem.science.ru.nl/pubs/10.1126_1668.pdf403text/html
10.1080/000155500750012298 +https://medicaljournals.se/acta/content_files/download.php?doi=10.1080/000155500750012298-https://medicaljournals.se/acta/content_files/download.php?doi=10.1080/000155500750012298403text/html
10.1109/icpr.1996.546998 +http://www.ee.ed.ac.uk/~sasg/Papers/96_papers/ICPR96_whn.ps-http://www.ee.ed.ac.uk/~sasg/Papers/96_papers/ICPR96_whn.ps-6application/octet-stream
10.1137/s106482750241565x +http://www.seas.upenn.edu/~biros/papers/lnks/paper.pdfRhttps://www.seas.upenn.edu/~biros/papers/lnks/paper.pdf404text/html
10.2340/00015555-1046 +https://www.medicaljournals.se/acta/content_files/download.php?doi=10.2340/00015555-1046-https://www.medicaljournals.se/acta/content_files/download.php?doi=10.2340/00015555-1046403text/html
10.2991/sschd-16.2016.23 +http://download.atlantis-press.com/php/download_paper.php?id=25860593Rhttps://download.atlantis-press.com/php/download_paper.php?id=25860593302application/octet-stream
10.1152/jn.2001.85.6.2613 +http://www.nada.kth.se/~anfa/smalllargeforce.pdf-http://www.nada.kth.se/~anfa/smalllargeforce.pdf403text/html
10.1152/jn.00416.2002 +http://jn.physiology.org/content/jn/89/1/12.full.pdf-http://jn.physiology.org/content/jn/89/1/12.full.pdf301application/octet-stream
10.1152/physiolgenomics.00086.2011 +http://physiolgenomics.physiology.org/content/physiolgenomics/43/21/1241.full.pdf-http://physiolgenomics.physiology.org/content/physiolgenomics/43/21/1241.full.pdf301application/octet-stream
10.3732/ajb.1300036 +http://www.amjbot.org/content/100/10/2016.full.pdf-http://www.amjbot.org/content/100/10/2016.full.pdf404text/html
10.2139/ssrn.1458963 +http://www.efmaefm.org/0EFMAMEETINGS/EFMA%20ANNUAL%20MEETINGS/2010-Aarhus/EFMA2010_0074_fullpaper.pdf-http://www.efmaefm.org/0EFMAMEETINGS/EFMA%20ANNUAL%20MEETINGS/2010-Aarhus/EFMA2010_0074_fullpaper.pdf503text/html
10.1152/ajpgi.00160.2012 +http://ajpgi.physiology.org/content/ajpgi/304/10/G897.full.pdf-http://ajpgi.physiology.org/content/ajpgi/304/10/G897.full.pdf301application/octet-stream
10.1080/09853111.2007.9736326 +https://www.tandfonline.com/doi/pdf/10.1080/09853111.2007.9736326?needAccess=trueRhttps://www.tandfonline.com/doi/pdf/10.1080/09853111.2007.9736326?needAccess=true&cookieSet=1302text/html
10.1152/japplphysiol.00624.2004 +http://jap.physiology.org/content/jap/99/2/665.full.pdf-http://jap.physiology.org/content/jap/99/2/665.full.pdf301application/octet-stream
10.4304/jnw.4.6.436-444 +http://academypublisher.net/jnw/vol04/no06/jnw0406436444.pdf-http://academypublisher.net/jnw/vol04/no06/jnw0406436444.pdf-6application/octet-stream
QUERY: SELECT identifier, initial_url, breadcrumbs, final_url, final_status_code, final_mimetype FROM crawl_result WHERE hit=0 ORDER BY random() LIMIT 25;
+
-- cgit v1.2.3