From f3a721a9dce8801b78f7bc31e88dc912b0ec1dba Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 23 Dec 2022 15:52:02 -0800 Subject: move a bunch of top-level files/directories to ./extra/ --- RUNBOOK.md | 44 -------- blobs/README.md | 86 --------------- blobs/minio/README.md | 74 ------------- blobs/minio/minio.conf | 14 --- blobs/seaweedfs/README.md | 9 -- blobs/tasks.md | 53 --------- extra/RUNBOOK.md | 44 ++++++++ extra/blobs/README.md | 86 +++++++++++++++ extra/blobs/minio/README.md | 74 +++++++++++++ extra/blobs/minio/minio.conf | 14 +++ extra/blobs/seaweedfs/README.md | 9 ++ extra/blobs/tasks.md | 53 +++++++++ extra/hbase/howto.md | 42 ++++++++ extra/hbase/notes.txt | 232 ++++++++++++++++++++++++++++++++++++++++ extra/hbase/schema_design.md | 79 ++++++++++++++ extra/nginx/README.md | 18 ++++ extra/nginx/fatcat-blobs | 51 +++++++++ extra/nginx/sandcrawler-db | 80 ++++++++++++++ extra/nginx/sandcrawler-minio | 57 ++++++++++ hbase/howto.md | 42 -------- hbase/notes.txt | 232 ---------------------------------------- hbase/schema_design.md | 79 -------------- nginx/README.md | 18 ---- nginx/fatcat-blobs | 51 --------- nginx/sandcrawler-db | 80 -------------- nginx/sandcrawler-minio | 57 ---------- 26 files changed, 839 insertions(+), 839 deletions(-) delete mode 100644 RUNBOOK.md delete mode 100644 blobs/README.md delete mode 100644 blobs/minio/README.md delete mode 100644 blobs/minio/minio.conf delete mode 100644 blobs/seaweedfs/README.md delete mode 100644 blobs/tasks.md create mode 100644 extra/RUNBOOK.md create mode 100644 extra/blobs/README.md create mode 100644 extra/blobs/minio/README.md create mode 100644 extra/blobs/minio/minio.conf create mode 100644 extra/blobs/seaweedfs/README.md create mode 100644 extra/blobs/tasks.md create mode 100644 extra/hbase/howto.md create mode 100644 extra/hbase/notes.txt create mode 100644 extra/hbase/schema_design.md create mode 100644 extra/nginx/README.md create mode 100644 extra/nginx/fatcat-blobs create mode 100644 extra/nginx/sandcrawler-db create mode 100644 extra/nginx/sandcrawler-minio delete mode 100644 hbase/howto.md delete mode 100644 hbase/notes.txt delete mode 100644 hbase/schema_design.md delete mode 100644 nginx/README.md delete mode 100644 nginx/fatcat-blobs delete mode 100644 nginx/sandcrawler-db delete mode 100644 nginx/sandcrawler-minio diff --git a/RUNBOOK.md b/RUNBOOK.md deleted file mode 100644 index 6c4165d..0000000 --- a/RUNBOOK.md +++ /dev/null @@ -1,44 +0,0 @@ - -## Process Un-GROBID-ed PDFs from Wayback - -Sometimes ingest doesn't pick up everything, or we do some heuristic CDX -import, and we want to run GROBID over all the PDFs that haven't been processed -yet. Only want one CDX line per `sha1hex`. - -A hybrid SQL/UNIX way of generating processing list: - - psql sandcrawler < /fast/sandcrawler/sql/dump_ungrobid_pdf.sql | sort -S 4G | uniq -w 40 | cut -f2 > dump_ungrobid_pdf.2020.01-27.json - -From here, there are two options: enqueue in Kafka and let workers run, or -create job files and run them using local worker and GNU/parallel. 
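For the second option, the dump can be pre-split into per-worker job files matching the `regrobid_cdx.split_*.json` pattern used in the "Local JSON" example below. A rough sketch (the chunk count of 20 and the output directory are arbitrary choices, not part of the original workflow):

    split -n l/20 -d --additional-suffix=.json \
        dump_ungrobid_pdf.2020.01-27.json /srv/sandcrawler/tasks/regrobid_cdx.split_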
- -#### Kafka - -Copy/transfer to a Kafka node; load a sample and then the whole output: - - head -n1000 dump_ungrobid_pdf.2020.01-27.json | kafkacat -P -b localhost -t sandcrawler-prod.ungrobided-pg -p -1 - cat dump_ungrobid_pdf.2020.01-27.json | kafkacat -P -b localhost -t sandcrawler-prod.ungrobided-pg -p -1 - -#### Local JSON - -Older example; if this fails, need to re-run entire thing: - - cat /srv/sandcrawler/tasks/regrobid_cdx.split_*.json | pv -l | parallel -j40 --linebuffer --round-robin --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json - - -TODO: is it possible to use job log with millions of `--pipe` inputs? That -would be more efficient in the event of failure. - -## GROBID over many .zip files - -Want to use GNU/Parallel in a mode that will do retries well: - - fd .zip /srv/sandcrawler/tasks/crossref-pre-1909-scholarly-works/ | \ - sort | \ - parallel -j16 --progress --joblog extract_tasks.log --resume-failed \ - './grobid_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --grobid-host http://localhost:8070 extract-zipfile {}' - -After starting, check that messages are actually getting pushed to kafka -(producer failures can be silent!). If anything goes wrong, run the exact same -command again. The sort is to ensure jobs are enqueued in the same order again; -could also dump `fd` output to a command file first. - diff --git a/blobs/README.md b/blobs/README.md deleted file mode 100644 index 555db92..0000000 --- a/blobs/README.md +++ /dev/null @@ -1,86 +0,0 @@ - -This document describes sandcrawler/fatcat use of "blob store" infrastructure -for storing hundreds of millions of small files. For example, GROBID XML -documents, jpeg thumbnails of PDFs. - -The basic feature requirements for this system are: - -- don't need preservation data resiliency: all this data is derived from - primary content, and is usually redundantly stored in Kafka topics (and thus - can be re-indexed to any server bounded only by throughput of the object - store service; Kafka is usually faster) -- don't require SSDs or large amounts of RAM. Ability to accelerate performance - with additional RAM or moving indexes to SSD is nice, but we will be using - spinning disks for primary data storage -- hundreds of millions or billions of objects, fetchable by a key we define -- optional transparent compression (for text and XML) -- typical object (file) size of 5-200 KBytes uncompressed, want to support up - to several MBytes -- very simple internal API for GET/PUT (S3 API compatible is good) -- ability to proxy to HTTP publicly for reads (eg, HTTP fall-back with no - authenticaiton), controllable by at least bucket granularity - -## Infrastructure - -`minio` was used initially, but did not scale well in number of files. We -currently use seaweedfs. Any S3-compatible key/value store should work in -theory. openlibrary.org has used WARCs in petabox items in the past. Actual -cloud object stores tend to be expensive for this kind of use case. - -The facebook "haystack" project (and whitepaper) are good background reading -describing one type of system that works well for this application. - - -## Bucket / Folder Structure - -Currently we run everything off a single server, with no redundancy. There is -no QA/prod distinction. 
- -Setting access control and doing bulk deletions is easiest at the bucket level, -less easy at the folder level, most difficult at the suffix (file extention) -level. - -For files that are derived from PDFs, we use the SHA-1 (in lower-case hex) of -the source PDF to contruct keys. We generate nested "directories" from the hash -to limit the number of keys per "directory" (even though in S3/seaweedfs there -are no actual directories involved). The structure looks like: - - //// - -Eg: - - sandcrawler/grobid/1a/64/1a6462a925a9767b797fe6085093b6aa9f27f523.tei.xml - -The nesting is sort of a hold-over from minio (where files were actually -on-disk), but seems worth keeping in case we end up switching storage systems -in the future. - -## Existing Content - -sandcrawler: internal/controlled access to PDF derivatives - grobid: TEI-XML documents - extension: .tei.xml - text: raw pdftotext (or other text transform) - extension: .txt - -thumbnail: public bucket for thumbnail images - pdf: thumbnails from PDF files - extension: .180px.jpg - -## Proxy and URLs - -Internal HTTP access via: - - http://wbgrp-svc169.us.archive.org:8333// - -Public access via: - - https://blobs.fatcat.wiki// - -Eg: - - http://wbgrp-svc169.us.archive.org:8333/testing/small.txt - http://wbgrp-svc169.us.archive.org:8333/sandcrawler/grobid/1a/64/1a6462a925a9767b797fe6085093b6aa9f27f523.tei.xml - https://blobs.fatcat.wiki/testing/small.txt - https://blobs.fatcat.wiki/thumbnail/pdf/1a/64/1a6462a925a9767b797fe6085093b6aa9f27f523.180px.jpg - diff --git a/blobs/minio/README.md b/blobs/minio/README.md deleted file mode 100644 index d8f1c69..0000000 --- a/blobs/minio/README.md +++ /dev/null @@ -1,74 +0,0 @@ - -minio is used as an S3-compatible blob store. Initial use case is GROBID XML -documents, addressed by the sha1 of the PDF file the XML was extracted from. - -Note that on the backend minio is just storing objects as files on disk. - -## Deploying minio Server - -It seems to be important to use a version of minio from at least December 2019 -era for on-disk compression to actually work. - -Currently install minio (and mc, the minio client) in prod by simply -downloading the binaries and calling from systemd. - -## Buckets and Directories - -Hosts and buckets: - - localhost:sandcrawler-dev - create locally for development (see below) - - cluster:sandcrawler - main sandcrawler storage bucket, for GROBID output and other derivatives. - Note it isn't "sandcrawler-prod", for backwards compatibility reasons. - - cluster:sandcrawler-qa - for, eg, testing on cluster servers - - cluster:unpaywall - subset of sandcrawler content crawled due to unpaywall URLs; - potentially made publicly accessible - -Directory structure within sandcrawler buckets: - - grobid/2c/0d/2c0daa9307887a27054d4d1f137514b0fa6c6b2d.tei.xml - SHA1 (lower-case hex) of PDF that XML was extracted from - -Create new buckets like: - - mc mb cluster/sandcrawler-qa - -## Development - -Run minio server locally, with non-persisted data: - - docker run -p 9000:9000 minio/minio server /data - -Credentials are `minioadmin:minioadmin`. Install `mc` client utility, and -configure: - - mc config host add localhost http://localhost:9000 minioadmin minioadmin - -Then create dev bucket: - - mc mb --ignore-existing localhost/sandcrawler-dev - -A common "gotcha" with `mc` command is that it will first look for a local -folder/directory with same name as the configured remote host, so make sure -there isn't a `./localhost` folder. 
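A quick round-trip against the dev bucket can confirm the setup; this is an illustrative smoke test (the `testing/small.txt` object path is just an example):

    echo 'hello blob' > small.txt
    mc cp small.txt localhost/sandcrawler-dev/testing/small.txt
    mc cat localhost/sandcrawler-dev/testing/small.txt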
- - -## Users - -Create a new readonly user like: - - mc admin user add sandcrawler unpaywall $RANDOM_SECRET_KEY readonly - -Make a prefix within a bucket world-readable like: - - mc policy set download cluster/unpaywall/grobid - -## Config - - mc admin config set aitio compression extensions=.txt,.log,.csv,.json,.tsv,.pdf,.xml mime_types=text/csv,text/plain,application/json,application/xml,application/octet-stream,application/tei+xml diff --git a/blobs/minio/minio.conf b/blobs/minio/minio.conf deleted file mode 100644 index 2e93f9a..0000000 --- a/blobs/minio/minio.conf +++ /dev/null @@ -1,14 +0,0 @@ - -# Volume to be used for MinIO server. -MINIO_VOLUMES="/sandcrawler-minio/data" -# Use if you want to run MinIO on a custom port. -MINIO_OPTS="--address :9000" -# Access Key of the server. -MINIO_ACCESS_KEY=REDACTED -# Secret key of the server. -MINIO_SECRET_KEY=REDACTED - -# may need to set these manually using `mc admin config get`, edit the JSON, then `set` -MINIO_COMPRESS="on" -MINIO_COMPRESS_EXTENSIONS=".txt,.log,.csv,.json,.tar,.xml,.bin,.pdf,.tsv" -MINIO_COMPRESS_MIME_TYPES="text/*,application/json,application/xml,application/pdf,application/octet-stream" diff --git a/blobs/seaweedfs/README.md b/blobs/seaweedfs/README.md deleted file mode 100644 index d19e9e0..0000000 --- a/blobs/seaweedfs/README.md +++ /dev/null @@ -1,9 +0,0 @@ - -## HOWTO: Create new bucket in SeaweedFS - -Log in to the seaweedfs VM. - -Run `weed shell` to start a shell, then: - - bucket.create -name - diff --git a/blobs/tasks.md b/blobs/tasks.md deleted file mode 100644 index beb765f..0000000 --- a/blobs/tasks.md +++ /dev/null @@ -1,53 +0,0 @@ - -## Backfill GROBID XML to Blob Store - -Initially ran this when spinning up new seaweedfs server to replace minio. At -this time grobid persist worker was in db-only mode, as minio was too slow to -accept uploads. Rough plan is to: - -1. run grobid persist worker from Kafka with a new temporary consumer group, - from the start of the GROBID output topic -2. when it gets to end, stop the *regular* consumer group while this one is - still running. with temporary worker still running, at that point in time - entire topic should be in S3 -3. then reconfigure regular worker to db+s3 mode. halt the temporary worker, - restart the regular one with new config, run it indefinitely - -Consumer group isn't an arg, so just edit `persist_worker.py` and set it to -`persist-grobid-seaweedfs`. Also needed to patch a bit so `--s3-only` mode -didn't try to connect to postgresql. - -Commands: - - ./sandcrawler_worker.py --kafka-hosts wbgrp-svc350.us.archive.org:9092 --env prod --s3-bucket sandcrawler --s3-url wbgrp-svc169.us.archive.org:8333 persist-grobid --s3-only - => Consuming from kafka topic sandcrawler-prod.grobid-output-pg, group persist-grobid-seaweed - => run briefly, then kill - -On kafka-broker worker: - - ./kafka-consumer-groups.sh --bootstrap-server localhost:9092 --reset-offsets --to-earliest --group persist-grobid-seaweed --topic sandcrawler-prod.grobid-output-pg --dry-run - -Then run 2x instances of worker (same command as above): - - ./sandcrawler_worker.py --kafka-hosts wbgrp-svc350.us.archive.org:9092 --env prod --s3-bucket sandcrawler --s3-url wbgrp-svc169.us.archive.org:8333 persist-grobid --s3-only - -At this point CPU-limited on this worker by the python processes (only 4 cores -on this machine). - -Check in weed shell: - - weed shell - - > > fs.meta.cat buckets/sandcrawler/grobid/00/00/000068a76ab125389506e8834483c6ba4c73338a.tei.xml - [...] 
- "isGzipped": false - [...] - "mime": "application/xml", - [...] - -An open question is if we should have separate buckets per derive type. Eg, a -GROBID XML bucket separate from thumbnails bucket. Or are prefix directories -enough. Basically this comes down to whether we want things mixed together at -the volume level. I think we should keep separate. - -Need to set the mimetype in the upload for gzip on XML? diff --git a/extra/RUNBOOK.md b/extra/RUNBOOK.md new file mode 100644 index 0000000..6c4165d --- /dev/null +++ b/extra/RUNBOOK.md @@ -0,0 +1,44 @@ + +## Process Un-GROBID-ed PDFs from Wayback + +Sometimes ingest doesn't pick up everything, or we do some heuristic CDX +import, and we want to run GROBID over all the PDFs that haven't been processed +yet. Only want one CDX line per `sha1hex`. + +A hybrid SQL/UNIX way of generating processing list: + + psql sandcrawler < /fast/sandcrawler/sql/dump_ungrobid_pdf.sql | sort -S 4G | uniq -w 40 | cut -f2 > dump_ungrobid_pdf.2020.01-27.json + +From here, there are two options: enqueue in Kafka and let workers run, or +create job files and run them using local worker and GNU/parallel. + +#### Kafka + +Copy/transfer to a Kafka node; load a sample and then the whole output: + + head -n1000 dump_ungrobid_pdf.2020.01-27.json | kafkacat -P -b localhost -t sandcrawler-prod.ungrobided-pg -p -1 + cat dump_ungrobid_pdf.2020.01-27.json | kafkacat -P -b localhost -t sandcrawler-prod.ungrobided-pg -p -1 + +#### Local JSON + +Older example; if this fails, need to re-run entire thing: + + cat /srv/sandcrawler/tasks/regrobid_cdx.split_*.json | pv -l | parallel -j40 --linebuffer --round-robin --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json - + +TODO: is it possible to use job log with millions of `--pipe` inputs? That +would be more efficient in the event of failure. + +## GROBID over many .zip files + +Want to use GNU/Parallel in a mode that will do retries well: + + fd .zip /srv/sandcrawler/tasks/crossref-pre-1909-scholarly-works/ | \ + sort | \ + parallel -j16 --progress --joblog extract_tasks.log --resume-failed \ + './grobid_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --grobid-host http://localhost:8070 extract-zipfile {}' + +After starting, check that messages are actually getting pushed to kafka +(producer failures can be silent!). If anything goes wrong, run the exact same +command again. The sort is to ensure jobs are enqueued in the same order again; +could also dump `fd` output to a command file first. + diff --git a/extra/blobs/README.md b/extra/blobs/README.md new file mode 100644 index 0000000..555db92 --- /dev/null +++ b/extra/blobs/README.md @@ -0,0 +1,86 @@ + +This document describes sandcrawler/fatcat use of "blob store" infrastructure +for storing hundreds of millions of small files. For example, GROBID XML +documents, jpeg thumbnails of PDFs. + +The basic feature requirements for this system are: + +- don't need preservation data resiliency: all this data is derived from + primary content, and is usually redundantly stored in Kafka topics (and thus + can be re-indexed to any server bounded only by throughput of the object + store service; Kafka is usually faster) +- don't require SSDs or large amounts of RAM. 
Ability to accelerate performance + with additional RAM or moving indexes to SSD is nice, but we will be using + spinning disks for primary data storage +- hundreds of millions or billions of objects, fetchable by a key we define +- optional transparent compression (for text and XML) +- typical object (file) size of 5-200 KBytes uncompressed, want to support up + to several MBytes +- very simple internal API for GET/PUT (S3 API compatible is good) +- ability to proxy to HTTP publicly for reads (eg, HTTP fall-back with no + authenticaiton), controllable by at least bucket granularity + +## Infrastructure + +`minio` was used initially, but did not scale well in number of files. We +currently use seaweedfs. Any S3-compatible key/value store should work in +theory. openlibrary.org has used WARCs in petabox items in the past. Actual +cloud object stores tend to be expensive for this kind of use case. + +The facebook "haystack" project (and whitepaper) are good background reading +describing one type of system that works well for this application. + + +## Bucket / Folder Structure + +Currently we run everything off a single server, with no redundancy. There is +no QA/prod distinction. + +Setting access control and doing bulk deletions is easiest at the bucket level, +less easy at the folder level, most difficult at the suffix (file extention) +level. + +For files that are derived from PDFs, we use the SHA-1 (in lower-case hex) of +the source PDF to contruct keys. We generate nested "directories" from the hash +to limit the number of keys per "directory" (even though in S3/seaweedfs there +are no actual directories involved). The structure looks like: + + //// + +Eg: + + sandcrawler/grobid/1a/64/1a6462a925a9767b797fe6085093b6aa9f27f523.tei.xml + +The nesting is sort of a hold-over from minio (where files were actually +on-disk), but seems worth keeping in case we end up switching storage systems +in the future. + +## Existing Content + +sandcrawler: internal/controlled access to PDF derivatives + grobid: TEI-XML documents + extension: .tei.xml + text: raw pdftotext (or other text transform) + extension: .txt + +thumbnail: public bucket for thumbnail images + pdf: thumbnails from PDF files + extension: .180px.jpg + +## Proxy and URLs + +Internal HTTP access via: + + http://wbgrp-svc169.us.archive.org:8333// + +Public access via: + + https://blobs.fatcat.wiki// + +Eg: + + http://wbgrp-svc169.us.archive.org:8333/testing/small.txt + http://wbgrp-svc169.us.archive.org:8333/sandcrawler/grobid/1a/64/1a6462a925a9767b797fe6085093b6aa9f27f523.tei.xml + https://blobs.fatcat.wiki/testing/small.txt + https://blobs.fatcat.wiki/thumbnail/pdf/1a/64/1a6462a925a9767b797fe6085093b6aa9f27f523.180px.jpg + diff --git a/extra/blobs/minio/README.md b/extra/blobs/minio/README.md new file mode 100644 index 0000000..d8f1c69 --- /dev/null +++ b/extra/blobs/minio/README.md @@ -0,0 +1,74 @@ + +minio is used as an S3-compatible blob store. Initial use case is GROBID XML +documents, addressed by the sha1 of the PDF file the XML was extracted from. + +Note that on the backend minio is just storing objects as files on disk. + +## Deploying minio Server + +It seems to be important to use a version of minio from at least December 2019 +era for on-disk compression to actually work. + +Currently install minio (and mc, the minio client) in prod by simply +downloading the binaries and calling from systemd. 
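A hedged sketch of that install flow (the download URLs are the upstream MinIO defaults; the install path and systemd unit name are assumptions, not taken from the prod config):

    # fetch current static binaries (any release from ~Dec 2019 onward supports on-disk compression)
    wget https://dl.min.io/server/minio/release/linux-amd64/minio
    wget https://dl.min.io/client/mc/release/linux-amd64/mc
    chmod +x minio mc
    sudo mv minio mc /usr/local/bin/
    # assumes a systemd unit that sources the minio.conf environment file in this directory
    sudo systemctl restart minio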
+ +## Buckets and Directories + +Hosts and buckets: + + localhost:sandcrawler-dev + create locally for development (see below) + + cluster:sandcrawler + main sandcrawler storage bucket, for GROBID output and other derivatives. + Note it isn't "sandcrawler-prod", for backwards compatibility reasons. + + cluster:sandcrawler-qa + for, eg, testing on cluster servers + + cluster:unpaywall + subset of sandcrawler content crawled due to unpaywall URLs; + potentially made publicly accessible + +Directory structure within sandcrawler buckets: + + grobid/2c/0d/2c0daa9307887a27054d4d1f137514b0fa6c6b2d.tei.xml + SHA1 (lower-case hex) of PDF that XML was extracted from + +Create new buckets like: + + mc mb cluster/sandcrawler-qa + +## Development + +Run minio server locally, with non-persisted data: + + docker run -p 9000:9000 minio/minio server /data + +Credentials are `minioadmin:minioadmin`. Install `mc` client utility, and +configure: + + mc config host add localhost http://localhost:9000 minioadmin minioadmin + +Then create dev bucket: + + mc mb --ignore-existing localhost/sandcrawler-dev + +A common "gotcha" with `mc` command is that it will first look for a local +folder/directory with same name as the configured remote host, so make sure +there isn't a `./localhost` folder. + + +## Users + +Create a new readonly user like: + + mc admin user add sandcrawler unpaywall $RANDOM_SECRET_KEY readonly + +Make a prefix within a bucket world-readable like: + + mc policy set download cluster/unpaywall/grobid + +## Config + + mc admin config set aitio compression extensions=.txt,.log,.csv,.json,.tsv,.pdf,.xml mime_types=text/csv,text/plain,application/json,application/xml,application/octet-stream,application/tei+xml diff --git a/extra/blobs/minio/minio.conf b/extra/blobs/minio/minio.conf new file mode 100644 index 0000000..2e93f9a --- /dev/null +++ b/extra/blobs/minio/minio.conf @@ -0,0 +1,14 @@ + +# Volume to be used for MinIO server. +MINIO_VOLUMES="/sandcrawler-minio/data" +# Use if you want to run MinIO on a custom port. +MINIO_OPTS="--address :9000" +# Access Key of the server. +MINIO_ACCESS_KEY=REDACTED +# Secret key of the server. +MINIO_SECRET_KEY=REDACTED + +# may need to set these manually using `mc admin config get`, edit the JSON, then `set` +MINIO_COMPRESS="on" +MINIO_COMPRESS_EXTENSIONS=".txt,.log,.csv,.json,.tar,.xml,.bin,.pdf,.tsv" +MINIO_COMPRESS_MIME_TYPES="text/*,application/json,application/xml,application/pdf,application/octet-stream" diff --git a/extra/blobs/seaweedfs/README.md b/extra/blobs/seaweedfs/README.md new file mode 100644 index 0000000..d19e9e0 --- /dev/null +++ b/extra/blobs/seaweedfs/README.md @@ -0,0 +1,9 @@ + +## HOWTO: Create new bucket in SeaweedFS + +Log in to the seaweedfs VM. + +Run `weed shell` to start a shell, then: + + bucket.create -name + diff --git a/extra/blobs/tasks.md b/extra/blobs/tasks.md new file mode 100644 index 0000000..beb765f --- /dev/null +++ b/extra/blobs/tasks.md @@ -0,0 +1,53 @@ + +## Backfill GROBID XML to Blob Store + +Initially ran this when spinning up new seaweedfs server to replace minio. At +this time grobid persist worker was in db-only mode, as minio was too slow to +accept uploads. Rough plan is to: + +1. run grobid persist worker from Kafka with a new temporary consumer group, + from the start of the GROBID output topic +2. when it gets to end, stop the *regular* consumer group while this one is + still running. with temporary worker still running, at that point in time + entire topic should be in S3 +3. 
then reconfigure regular worker to db+s3 mode. halt the temporary worker, + restart the regular one with new config, run it indefinitely + +Consumer group isn't an arg, so just edit `persist_worker.py` and set it to +`persist-grobid-seaweedfs`. Also needed to patch a bit so `--s3-only` mode +didn't try to connect to postgresql. + +Commands: + + ./sandcrawler_worker.py --kafka-hosts wbgrp-svc350.us.archive.org:9092 --env prod --s3-bucket sandcrawler --s3-url wbgrp-svc169.us.archive.org:8333 persist-grobid --s3-only + => Consuming from kafka topic sandcrawler-prod.grobid-output-pg, group persist-grobid-seaweed + => run briefly, then kill + +On kafka-broker worker: + + ./kafka-consumer-groups.sh --bootstrap-server localhost:9092 --reset-offsets --to-earliest --group persist-grobid-seaweed --topic sandcrawler-prod.grobid-output-pg --dry-run + +Then run 2x instances of worker (same command as above): + + ./sandcrawler_worker.py --kafka-hosts wbgrp-svc350.us.archive.org:9092 --env prod --s3-bucket sandcrawler --s3-url wbgrp-svc169.us.archive.org:8333 persist-grobid --s3-only + +At this point CPU-limited on this worker by the python processes (only 4 cores +on this machine). + +Check in weed shell: + + weed shell + + > > fs.meta.cat buckets/sandcrawler/grobid/00/00/000068a76ab125389506e8834483c6ba4c73338a.tei.xml + [...] + "isGzipped": false + [...] + "mime": "application/xml", + [...] + +An open question is if we should have separate buckets per derive type. Eg, a +GROBID XML bucket separate from thumbnails bucket. Or are prefix directories +enough. Basically this comes down to whether we want things mixed together at +the volume level. I think we should keep separate. + +Need to set the mimetype in the upload for gzip on XML? diff --git a/extra/hbase/howto.md b/extra/hbase/howto.md new file mode 100644 index 0000000..26d33f4 --- /dev/null +++ b/extra/hbase/howto.md @@ -0,0 +1,42 @@ + +Commands can be run from any cluster machine with hadoop environment config +set up. Most of these commands are run from the shell (start with `hbase +shell`). There is only one AIT/Webgroup HBase instance/namespace; there may be +QA/prod tables, but there are not QA/prod clusters. + +## Create Table + +Create column families (note: not all individual columns) with something like: + + create 'wbgrp-journal-extract-0-qa', 'f', 'file', {NAME => 'grobid0', COMPRESSION => 'snappy'} + +## Run Thrift Server Informally + +The Thrift server can technically be run from any old cluster machine that has +Hadoop client stuff set up, using: + + hbase thrift start -nonblocking -c + +Note that this will run version 0.96, while the actual HBase service seems to +be running 0.98. 
+ +To interact with this config, use happybase (python) config: + + conn = happybase.Connection("bnewbold-dev.us.archive.org", transport="framed", protocol="compact") + # Test connection + conn.tables() + +## Queries From Shell + +Fetch all columns for a single row: + + hbase> get 'wbgrp-journal-extract-0-qa', 'sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ' + +Fetch multiple columns for a single row, using column families: + + hbase> get 'wbgrp-journal-extract-0-qa', 'sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ', 'f', 'file' + +Scan a fixed number of rows (here 5) starting at a specific key prefix, all +columns: + + hbase> scan 'wbgrp-journal-extract-0-qa',{LIMIT=>5,STARTROW=>'sha1:A'} diff --git a/extra/hbase/notes.txt b/extra/hbase/notes.txt new file mode 100644 index 0000000..20f406f --- /dev/null +++ b/extra/hbase/notes.txt @@ -0,0 +1,232 @@ + +## Notes on HBase features + +Decent one-page introduction: +https://www.tutorialspoint.com/hbase/hbase_overview.htm + +Question: what version of hbase are we running? what on-disk format? + +=> Server: HBase 0.98.6-cdh5.3.1 +=> Client: HBase 0.96.1.1-cdh5.0.1 +=> As of 2018, 1.2 is stable and 2.0 is released. + +Question: what are our servers? how can we monitor? + +=> http://ia802402.us.archive.org:6410/master-status + +I haven't been able to find a simple table of hbase version and supported/new +features over the years (release notes are too detailed). + +Normal/online mapreduce over tables sounds like it goes through a "region +server" and is slow. Using snapshots allows direct access to underlying +tablets on disk? Or is access always direct? + +Could consider "Medium-sized Object" support for 100 KByte to 10 MByte sized +files. This seems to depend on HBase v3, which was added in HBase 0.98, so we +can't use it yet. + +Do we need to decide on on-disk format? Just stick with defaults. + +Looks like we use the `happybase` python package to write. This is packaged in +debian, but only for python2. There is also a `starbase` python library +wrapping the REST API. + +There is a "bulk load" mechanism for going directly from HDFS into HBase, by +creating HFiles that can immediately be used by HBase. + +## Specific "Queries" needed + +"Identifier" will mostly want to get "new" (unprocessed) rows to process. It +can do so by + +Question: if our columns are mostly "dense" within a column group (aka, mostly +all or none set), what is the value of splitting out columns instead of using a +single JSON blob or something like that? Not needing to store the key strings? +Being able to do scan filters? The later obviously makes sense in some +contexts. + +- is there a visible distinction between "get(table, group:col)" being + zero-length (somebody put() an empty string (like "") versus that column not + having being written to? + +## Conversation with Noah about Heritrix De-Dupe + +AIT still uses HBase for url-agnostic de-dupe, but may move away from it. Does +about 250 reads/sec (estimate based on URL hits per quarter). Used to have more +problems (region servers?) but haven't for a while. If a crawler can't reach +HBase, it will "fail safe" and retain the URL. However, Heritrix has trouble +starting up if it can't connect at start. Uses the native JVM drivers. + +Key is "sha1:-", so in theory they can control whether to +dedupe inside or outside of individual crawls (or are they account IDs?). IIRC +all columns were in one family, and had short names (single character). 
Eg: + + hbase(main):012:0> scan 'ait-prod-digest-history',{LIMIT=>5,STARTROW=>'sha1:A'} + sha1:A22222453XRJ63AC7YCSK46APWHTJKFY-2312 column=f:c, timestamp=1404151869546, value={"c":1,"u":"http://www.theroot.com/category/views-tags/black-fortune-500-ceos","d":"2012-02-23T08:27:10Z","o":8867552,"f":"ARCHIVEIT-REDACTED-20120223080317-00009-crawling201.us.archive.org-6681.warc.gz"} + +Code for url-agnostic dedupe is in: + + heritrix3/contrib/src/main/java/org/archive/modules/recrawl/hbase/HBaseContentDigestHistory.java + +Crawl config snippet: + + [...] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + [...] + +## Kenji Conversation (global wayback) + +Spoke with Kenji, who had previous experience trying to use HBase for crawl +de-dupe. Take away was that it didn't perform well for them even back then, +with 3000+ req/sec. AIT today is more like 250 req/sec. + +Apparently CDX API is just the fastest thing ever; stable slow latency on reads +(~200ms!), and it takes an hour for "writes" (bulk deltacdx or whatever). + +Sounds like HBase in particular struggled with concurrent heavy reads and +writes; frequent re-compaction caused large tail latencies, and when region +servers were loaded they would time-out of zookeeper. + +He pushed to use elasticsearch instead of hbase to store extracted fulltext, as +a persistant store, particularly if we end up using it for fulltext someday. He +thinks it operates really well as a datastore. I am not really comfortable with +this usecase, or depending on elastic as a persistant store in general, and it +doesn't work for the crawl dedupe case. + +He didn't seem beholden to the tiny column name convention. + + +## Google BigTable paper (2006) + +Hadn't read this paper in a long time, and didn't really understand it at the +time. HBase is a clone of BigTable. + +They used bigtable to store crawled HTML! Very similar use-case to our journal +stuff. Retained multiple crawls using versions; version timestamps are crawl +timestamps, aha. + +Crazy to me how the whole hadoop world is Java (garbage collected), while all +the google stuff is C++. So many issues with hadoop are performance/latency +sensitive; having garbage collection in a situation when RAM is tight and +network timeouts are problematic seems like a bad combination for operability +(median/optimistic performance is probably fine) + +"Locality" metadata important for actually separating column families. Column +name scarcity doesn't seem to be a thing/concern. Compression settings +important. Key selection to allow local compression seems important to them. + +Performance probably depends a lot on 1) relative rate of growth (slow if +re-compressing, etc), 2) + +Going to want/need table-level monitoring, probably right from the start. + +## Querying/Aggregating/Stats + +We'll probably want to be able to run simple pig-style queries over HBase. How +will that work? A couple options: + +- query hbase using pig via HBaseStorage and HBaseLoader +- hive runs on map/reduce like pig +- drill is an online/fast SQL query engine with HBase back-end support. Not + map/reduce based; can run from a single server. Supports multiple "backends". + Somewhat more like pig; "schema-free"/JSON. 
+- impala supports HBase backends +- phoenix is a SQL engine on top of HBase + +## Hive Integration + +Run `hive` from a hadoop cluster machine: + + bnewbold@ia802405$ hive --version + Hive 0.13.1-cdh5.3.1 + Subversion file:///var/lib/jenkins/workspace/generic-package-ubuntu64-12-04/CDH5.3.1-Packaging-Hive-2015-01-27_16-23-36/hive-0.13.1+cdh5.3.1+308-1.cdh5.3.1.p0.17~precise -r Unknown + Compiled by jenkins on Tue Jan 27 16:38:11 PST 2015 + From source with checksum 1bb86e4899928ce29cbcaec8cf43c9b6 + +Need to create mapping tables: + + CREATE EXTERNAL TABLE journal_extract_qa(rowkey STRING, grobid_status STRING, file_size STRING) + STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' + WITH SERDEPROPERTIES ('hbase.columns.mapping' = ':key,grobid0:status_code,file:size') + TBLPROPERTIES ('hbase.table.name' = 'wbgrp-journal-extract-0-qa'); + +Maybe: + + SET hive.aux.jars.path = file:///home/webcrawl/hadoop-2/hive/lib/hive-hbase-handler-0.13.1-cdh5.3.1.jar,file:///home/webcrawl/hadoop-2/hive/lib/hbase-client-0.96.1.1-cdh5.0.1.jar; + SELECT * from journal_extract_qa LIMIT 10; + +Or? + + ADD jar /usr/lib/hive/lib/hive-hbase-handler-0.13.1-cdh5.3.1.jar; + ADD jar /usr/lib/hive/lib/hive-shims-common-secure-0.13.1-cdh5.3.1.jar; + ADD jar /usr/lib/hadoop-hdfs/hadoop-hdfs-2.5.0-cdh5.3.1.jar; + ADD jar /usr/lib/hbase/hbase-client-0.98.6-cdh5.3.1.jar; + ADD jar /usr/lib/hbase/hbase-common-0.98.6-cdh5.3.1.jar; + +Or, from a real node? + + SET hive.aux.jars.path = file:///usr/lib/hive/lib/hive-hbase-handler-0.13.1-cdh5.3.1.jar,file:///usr/lib/hbase/lib/hbase-client-0.98.6-cdh5.3.1.jar,file:///usr/lib/hadoop-hdfs/hadoop-hdfs-2.5.0-cdh5.3.1.jar; + SELECT * from journal_extract_qa LIMIT 10; + +Getting an error: + + Exception in thread "main" java.lang.NoSuchMethodError: org.apache.hadoop.hdfs.client.HdfsAdmin.getEncryptionZoneForPath(Lorg/apache/hadoop/fs/Path;)Lorg/apache/hadoop/hdfs/protocol/EncryptionZone; + +The HdfsAdmin admin class is in hadoop-hdfs, but `getEncryptionZoneForPath` +isn't in there. See upstream commit: + + https://github.com/apache/hadoop/commit/20dcb841ce55b0d414885ceba530c30b5b528b0f + +## Debugging + +List hbase tables known to zookeeper (as opposed to `list` from `hbase shell`): + + hbase zkcli ls /hbase/table + +Look for jar files with a given symbol: + + rg HdfsAdmin -a /usr/lib/*/*.jar + +## Performance + +Should pre-allocate regions for tables that are going to be non-trivially +sized, otherwise all load hits a single node. From the shell, this seems to +involve specifying the split points (key prefixes) manually. From the docs: + + http://hbase.apache.org/book.html#precreate.regions + +There is an ImportTsv tool which might have been useful for original CDX +backfill, but :shrug:. It is nice to have only a single pipeline and have it +work well. diff --git a/extra/hbase/schema_design.md b/extra/hbase/schema_design.md new file mode 100644 index 0000000..2db8998 --- /dev/null +++ b/extra/hbase/schema_design.md @@ -0,0 +1,79 @@ + +## PDF Table + +Table name: `wbgrp-journal-extract--` + +Eg: `wbgrp-journal-extract-0-prod` + +Key is the sha1 of the file, as raw bytes (20 bytes). + +Could conceivably need to handle, eg, postscript files, JATS XML, or even HTML +in the future? If possible be filetype-agnostic, but only "fulltext" file types +will end up in here, and don't bend over backwards. + +Keep only a single version (do we need `VERSIONS => 1`, or is 1 the default?) + +IMPORTANT: column names should be unique across column families. 
Eg, should not +have both `grobid0:status` and `match0:status`. HBase and some client libraries +don't care, but some map/reduce frameworks (eg, Scalding) can have name +collisions. Differences between "orthogonal" columns *might* be OK (eg, +`grobid0:status` and `grobid1:status`). + +Column families: + +- `key`: sha1 of the file in base32 (not a column or column family) +- `f`: heritrix HBaseContentDigestHistory de-dupe + - `c`: (json string) + - `u`: original URL (required) + - `d`: original date (required; ISO 8601:1988) + - `f`: warc filename (recommend) + - `o`: warc offset (recommend) + - `c`: dupe count (optional) + - `i`: warc record ID (optional) +- `file`: crawl and file metadata + - `size` (uint64), uncompressed (not in CDX) + - `mime` (string; might do postscript in the future; normalized) + - `cdx` (json string) with all as strings + - `surt` + - `url` + - `dt` + - `warc` (item and file name) + - `offset` + - `c_size` (compressed size) + - `meta` (json string) + - `size` (int) + - `mime` (str) + - `magic` (str) + - `magic_mime` (str) + - `sha1` (hex str) + - `md5` (hex str) + - `sha256` (hex str) +- `grobid0`: processing status, version, XML and JSON fulltext, JSON metadata. timestamp. Should be compressed! `COMPRESSION => SNAPPY` + - `status_code` (signed int; HTTP status from grobid) + - `quality` (int or string; we define the meaning ("good"/"marginal"/"bad") + - `status` (json string from grobid) + - `tei_xml` (xml string from grobid) + - `tei_json` (json string with fulltext) + - `metadata` (json string with author, title, abstract, citations, etc) +- `match0`: status of identification against "the catalog" + - `mstatus` (string; did it match?) + - `doi` (string) + - `minfo` (json string) + +Can add additional groups in the future for additional processing steps. For +example, we might want to do first pass looking at files to see "is this a PDF +or not", which out output some status (and maybe certainty). + +The Heritrix schema is fixed by the existing implementation. We could +patch/extend heritrix to use the `file` schema in the future if we decide +it's worth it. There are some important pieces of metadata missing, so at +least to start I think we should keep `f` and `file` distinct. We could merge +them later. `f` info will be populated by crawlers; `file` info should be +populated when back-filling or processing CDX lines. + +If we wanted to support multiple CDX rows as part of the same row (eg, as +alternate locations), we could use HBase's versions feature, which can +automatically cap the number of versions stored. + +If we had enough RAM resources, we could store `f` (and maybe `file`) metadata +in memory for faster access. diff --git a/extra/nginx/README.md b/extra/nginx/README.md new file mode 100644 index 0000000..0369f9b --- /dev/null +++ b/extra/nginx/README.md @@ -0,0 +1,18 @@ + +This folder contains nginx configs for partner access to sandcrawler DB +(postgrest) and GROBID XML blobs (minio). + +`fatcat-blobs` is part of the fatcat.wiki ansible config, but included here to +show how it works. + +## Let's Encrypt + +As... bnewbold? 
+ + sudo certbot certonly \ + --non-interactive \ + --agree-tos \ + --email bnewbold@archive.org \ + --webroot -w /var/www/letsencrypt \ + -d sandcrawler-minio.fatcat.wiki \ + -d sandcrawler-db.fatcat.wiki diff --git a/extra/nginx/fatcat-blobs b/extra/nginx/fatcat-blobs new file mode 100644 index 0000000..5c692ef --- /dev/null +++ b/extra/nginx/fatcat-blobs @@ -0,0 +1,51 @@ + +server { + listen 80; + listen [::]:80; + listen 443 ssl http2; + listen [::]:443 ssl http2; + server_name blobs.fatcat.wiki; + + ssl_certificate /etc/letsencrypt/live/fatcat.wiki/fullchain.pem; + ssl_certificate_key /etc/letsencrypt/live/fatcat.wiki/privkey.pem; + + #add_header Content-Security-Policy "default-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline'"; + add_header X-Frame-Options "SAMEORIGIN"; # 'always' if nginx > 1.7.5 + add_header X-Content-Type-Options "nosniff"; # 'always' if nginx > 1.7.5 + add_header X-Xss-Protection "1"; + # Enable STS with one year period (breaks http; optional) + #add_header Strict-Transport-Security "max-age=31557600; includeSubDomains"; + + error_log /var/log/nginx/fatcat-errors.log; + access_log /dev/null; + + if ($scheme = http) { + return 301 https://$server_name$request_uri; + } + + location /unpaywall/ { + if ($request_method !~ "GET") { + return 403; + break; + } + + #proxy_pass http://sandcrawler-minio.fatcat.wiki:9000$uri$is_args$args; + proxy_pass http://207.241.227.141:9000$uri$is_args$args; + proxy_redirect off; + + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header Host $http_host; + } + + location / { + default_type text/plain; + return 504 'blobs.fatcat.wiki hosts many files; full URLs are required!\nyou probably want https://fatcat.wiki/ instead'; + } + + # Let's Encrypt SSL Certs + location /.well-known/acme-challenge/ { + root /var/www/letsencrypt; + autoindex off; + } +} diff --git a/extra/nginx/sandcrawler-db b/extra/nginx/sandcrawler-db new file mode 100644 index 0000000..67d1a2d --- /dev/null +++ b/extra/nginx/sandcrawler-db @@ -0,0 +1,80 @@ + +upstream postgrest { + server localhost:3030; + keepalive 64; +} + +server { + listen 80; + listen [::]:80; + listen 443 ssl http2; + listen [::]:443 ssl http2; + server_name sandcrawler-db.fatcat.wiki db.sandcrawler.org; + + ssl_certificate /etc/letsencrypt/live/sandcrawler-minio.fatcat.wiki/fullchain.pem; + ssl_certificate_key /etc/letsencrypt/live/sandcrawler-minio.fatcat.wiki/privkey.pem; + + #add_header Content-Security-Policy "default-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline'"; + add_header X-Frame-Options "SAMEORIGIN"; # 'always' if nginx > 1.7.5 + add_header X-Content-Type-Options "nosniff"; # 'always' if nginx > 1.7.5 + add_header X-Xss-Protection "1"; + # Enable STS with one year period (breaks http; optional) + #add_header Strict-Transport-Security "max-age=31557600; includeSubDomains"; + + error_log /var/log/nginx/sandcrawler-errors.log; + access_log /dev/null; + + if ($scheme = http) { + return 301 https://$server_name$request_uri; + } + + location / { + + default_type application/json; + + if ($request_method !~ "GET") { + return 403; + break; + } + + proxy_redirect off; + + proxy_http_version 1.1; + proxy_set_header Connection ""; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header Host $http_host; + + proxy_pass http://postgrest/; + } + + # support /endpoint/:id url style for sha1hex lookups + location 
~ "^/(file_meta|grobid|fatcat_file)/([a-f0-9]{40})$" { + + if ($request_method !~ "GET") { + return 403; + break; + } + + # assuming an upstream named "postgrest" + # doing this rewrite as part of the proxy_pass line itself didn't seem + # to work, so doing a formal rewrite here + rewrite "/([a-z_]+)/([a-f0-9]{40})" /$1?sha1hex=eq.$2 break; + proxy_pass http://postgrest; + + # make the response singular + #default_type application/vnd.pgrst.object+json; + proxy_set_header Accept "application/vnd.pgrst.object+json"; + + proxy_http_version 1.1; + proxy_set_header Connection ""; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + } + + # Let's Encrypt SSL Certs + location /.well-known/acme-challenge/ { + root /var/www/letsencrypt; + autoindex off; + } +} diff --git a/extra/nginx/sandcrawler-minio b/extra/nginx/sandcrawler-minio new file mode 100644 index 0000000..2e9bfe3 --- /dev/null +++ b/extra/nginx/sandcrawler-minio @@ -0,0 +1,57 @@ + +server { + listen 80; + listen [::]:80; + listen 443 ssl http2; + listen [::]:443 ssl http2; + server_name sandcrawler-minio.fatcat.wiki minio.sandcrawler.org; + + ssl_certificate /etc/letsencrypt/live/sandcrawler-minio.fatcat.wiki/fullchain.pem; + ssl_certificate_key /etc/letsencrypt/live/sandcrawler-minio.fatcat.wiki/privkey.pem; + + #add_header Content-Security-Policy "default-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline'"; + add_header X-Frame-Options "SAMEORIGIN"; # 'always' if nginx > 1.7.5 + add_header X-Content-Type-Options "nosniff"; # 'always' if nginx > 1.7.5 + add_header X-Xss-Protection "1"; + # Enable STS with one year period (breaks http; optional) + #add_header Strict-Transport-Security "max-age=31557600; includeSubDomains"; + + error_log /var/log/nginx/sandcrawler-errors.log; + access_log /dev/null; + + if ($scheme = http) { + return 301 https://$server_name$request_uri; + } + + location /minio/ { + + # allows all HTTP verbs + + proxy_pass http://localhost:9000; + proxy_redirect off; + + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header Host $http_host; + } + + location / { + if ($request_method !~ "GET") { + return 403; + break; + } + + proxy_pass http://localhost:9000; + proxy_redirect off; + + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header Host $http_host; + } + + # Let's Encrypt SSL Certs + location /.well-known/acme-challenge/ { + root /var/www/letsencrypt; + autoindex off; + } +} diff --git a/hbase/howto.md b/hbase/howto.md deleted file mode 100644 index 26d33f4..0000000 --- a/hbase/howto.md +++ /dev/null @@ -1,42 +0,0 @@ - -Commands can be run from any cluster machine with hadoop environment config -set up. Most of these commands are run from the shell (start with `hbase -shell`). There is only one AIT/Webgroup HBase instance/namespace; there may be -QA/prod tables, but there are not QA/prod clusters. - -## Create Table - -Create column families (note: not all individual columns) with something like: - - create 'wbgrp-journal-extract-0-qa', 'f', 'file', {NAME => 'grobid0', COMPRESSION => 'snappy'} - -## Run Thrift Server Informally - -The Thrift server can technically be run from any old cluster machine that has -Hadoop client stuff set up, using: - - hbase thrift start -nonblocking -c - -Note that this will run version 0.96, while the actual HBase service seems to -be running 0.98. 
- -To interact with this config, use happybase (python) config: - - conn = happybase.Connection("bnewbold-dev.us.archive.org", transport="framed", protocol="compact") - # Test connection - conn.tables() - -## Queries From Shell - -Fetch all columns for a single row: - - hbase> get 'wbgrp-journal-extract-0-qa', 'sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ' - -Fetch multiple columns for a single row, using column families: - - hbase> get 'wbgrp-journal-extract-0-qa', 'sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ', 'f', 'file' - -Scan a fixed number of rows (here 5) starting at a specific key prefix, all -columns: - - hbase> scan 'wbgrp-journal-extract-0-qa',{LIMIT=>5,STARTROW=>'sha1:A'} diff --git a/hbase/notes.txt b/hbase/notes.txt deleted file mode 100644 index 20f406f..0000000 --- a/hbase/notes.txt +++ /dev/null @@ -1,232 +0,0 @@ - -## Notes on HBase features - -Decent one-page introduction: -https://www.tutorialspoint.com/hbase/hbase_overview.htm - -Question: what version of hbase are we running? what on-disk format? - -=> Server: HBase 0.98.6-cdh5.3.1 -=> Client: HBase 0.96.1.1-cdh5.0.1 -=> As of 2018, 1.2 is stable and 2.0 is released. - -Question: what are our servers? how can we monitor? - -=> http://ia802402.us.archive.org:6410/master-status - -I haven't been able to find a simple table of hbase version and supported/new -features over the years (release notes are too detailed). - -Normal/online mapreduce over tables sounds like it goes through a "region -server" and is slow. Using snapshots allows direct access to underlying -tablets on disk? Or is access always direct? - -Could consider "Medium-sized Object" support for 100 KByte to 10 MByte sized -files. This seems to depend on HBase v3, which was added in HBase 0.98, so we -can't use it yet. - -Do we need to decide on on-disk format? Just stick with defaults. - -Looks like we use the `happybase` python package to write. This is packaged in -debian, but only for python2. There is also a `starbase` python library -wrapping the REST API. - -There is a "bulk load" mechanism for going directly from HDFS into HBase, by -creating HFiles that can immediately be used by HBase. - -## Specific "Queries" needed - -"Identifier" will mostly want to get "new" (unprocessed) rows to process. It -can do so by - -Question: if our columns are mostly "dense" within a column group (aka, mostly -all or none set), what is the value of splitting out columns instead of using a -single JSON blob or something like that? Not needing to store the key strings? -Being able to do scan filters? The later obviously makes sense in some -contexts. - -- is there a visible distinction between "get(table, group:col)" being - zero-length (somebody put() an empty string (like "") versus that column not - having being written to? - -## Conversation with Noah about Heritrix De-Dupe - -AIT still uses HBase for url-agnostic de-dupe, but may move away from it. Does -about 250 reads/sec (estimate based on URL hits per quarter). Used to have more -problems (region servers?) but haven't for a while. If a crawler can't reach -HBase, it will "fail safe" and retain the URL. However, Heritrix has trouble -starting up if it can't connect at start. Uses the native JVM drivers. - -Key is "sha1:-", so in theory they can control whether to -dedupe inside or outside of individual crawls (or are they account IDs?). IIRC -all columns were in one family, and had short names (single character). 
Eg: - - hbase(main):012:0> scan 'ait-prod-digest-history',{LIMIT=>5,STARTROW=>'sha1:A'} - sha1:A22222453XRJ63AC7YCSK46APWHTJKFY-2312 column=f:c, timestamp=1404151869546, value={"c":1,"u":"http://www.theroot.com/category/views-tags/black-fortune-500-ceos","d":"2012-02-23T08:27:10Z","o":8867552,"f":"ARCHIVEIT-REDACTED-20120223080317-00009-crawling201.us.archive.org-6681.warc.gz"} - -Code for url-agnostic dedupe is in: - - heritrix3/contrib/src/main/java/org/archive/modules/recrawl/hbase/HBaseContentDigestHistory.java - -Crawl config snippet: - - [...] - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - [...] - -## Kenji Conversation (global wayback) - -Spoke with Kenji, who had previous experience trying to use HBase for crawl -de-dupe. Take away was that it didn't perform well for them even back then, -with 3000+ req/sec. AIT today is more like 250 req/sec. - -Apparently CDX API is just the fastest thing ever; stable slow latency on reads -(~200ms!), and it takes an hour for "writes" (bulk deltacdx or whatever). - -Sounds like HBase in particular struggled with concurrent heavy reads and -writes; frequent re-compaction caused large tail latencies, and when region -servers were loaded they would time-out of zookeeper. - -He pushed to use elasticsearch instead of hbase to store extracted fulltext, as -a persistant store, particularly if we end up using it for fulltext someday. He -thinks it operates really well as a datastore. I am not really comfortable with -this usecase, or depending on elastic as a persistant store in general, and it -doesn't work for the crawl dedupe case. - -He didn't seem beholden to the tiny column name convention. - - -## Google BigTable paper (2006) - -Hadn't read this paper in a long time, and didn't really understand it at the -time. HBase is a clone of BigTable. - -They used bigtable to store crawled HTML! Very similar use-case to our journal -stuff. Retained multiple crawls using versions; version timestamps are crawl -timestamps, aha. - -Crazy to me how the whole hadoop world is Java (garbage collected), while all -the google stuff is C++. So many issues with hadoop are performance/latency -sensitive; having garbage collection in a situation when RAM is tight and -network timeouts are problematic seems like a bad combination for operability -(median/optimistic performance is probably fine) - -"Locality" metadata important for actually separating column families. Column -name scarcity doesn't seem to be a thing/concern. Compression settings -important. Key selection to allow local compression seems important to them. - -Performance probably depends a lot on 1) relative rate of growth (slow if -re-compressing, etc), 2) - -Going to want/need table-level monitoring, probably right from the start. - -## Querying/Aggregating/Stats - -We'll probably want to be able to run simple pig-style queries over HBase. How -will that work? A couple options: - -- query hbase using pig via HBaseStorage and HBaseLoader -- hive runs on map/reduce like pig -- drill is an online/fast SQL query engine with HBase back-end support. Not - map/reduce based; can run from a single server. Supports multiple "backends". - Somewhat more like pig; "schema-free"/JSON. 
-- impala supports HBase backends -- phoenix is a SQL engine on top of HBase - -## Hive Integration - -Run `hive` from a hadoop cluster machine: - - bnewbold@ia802405$ hive --version - Hive 0.13.1-cdh5.3.1 - Subversion file:///var/lib/jenkins/workspace/generic-package-ubuntu64-12-04/CDH5.3.1-Packaging-Hive-2015-01-27_16-23-36/hive-0.13.1+cdh5.3.1+308-1.cdh5.3.1.p0.17~precise -r Unknown - Compiled by jenkins on Tue Jan 27 16:38:11 PST 2015 - From source with checksum 1bb86e4899928ce29cbcaec8cf43c9b6 - -Need to create mapping tables: - - CREATE EXTERNAL TABLE journal_extract_qa(rowkey STRING, grobid_status STRING, file_size STRING) - STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' - WITH SERDEPROPERTIES ('hbase.columns.mapping' = ':key,grobid0:status_code,file:size') - TBLPROPERTIES ('hbase.table.name' = 'wbgrp-journal-extract-0-qa'); - -Maybe: - - SET hive.aux.jars.path = file:///home/webcrawl/hadoop-2/hive/lib/hive-hbase-handler-0.13.1-cdh5.3.1.jar,file:///home/webcrawl/hadoop-2/hive/lib/hbase-client-0.96.1.1-cdh5.0.1.jar; - SELECT * from journal_extract_qa LIMIT 10; - -Or? - - ADD jar /usr/lib/hive/lib/hive-hbase-handler-0.13.1-cdh5.3.1.jar; - ADD jar /usr/lib/hive/lib/hive-shims-common-secure-0.13.1-cdh5.3.1.jar; - ADD jar /usr/lib/hadoop-hdfs/hadoop-hdfs-2.5.0-cdh5.3.1.jar; - ADD jar /usr/lib/hbase/hbase-client-0.98.6-cdh5.3.1.jar; - ADD jar /usr/lib/hbase/hbase-common-0.98.6-cdh5.3.1.jar; - -Or, from a real node? - - SET hive.aux.jars.path = file:///usr/lib/hive/lib/hive-hbase-handler-0.13.1-cdh5.3.1.jar,file:///usr/lib/hbase/lib/hbase-client-0.98.6-cdh5.3.1.jar,file:///usr/lib/hadoop-hdfs/hadoop-hdfs-2.5.0-cdh5.3.1.jar; - SELECT * from journal_extract_qa LIMIT 10; - -Getting an error: - - Exception in thread "main" java.lang.NoSuchMethodError: org.apache.hadoop.hdfs.client.HdfsAdmin.getEncryptionZoneForPath(Lorg/apache/hadoop/fs/Path;)Lorg/apache/hadoop/hdfs/protocol/EncryptionZone; - -The HdfsAdmin admin class is in hadoop-hdfs, but `getEncryptionZoneForPath` -isn't in there. See upstream commit: - - https://github.com/apache/hadoop/commit/20dcb841ce55b0d414885ceba530c30b5b528b0f - -## Debugging - -List hbase tables known to zookeeper (as opposed to `list` from `hbase shell`): - - hbase zkcli ls /hbase/table - -Look for jar files with a given symbol: - - rg HdfsAdmin -a /usr/lib/*/*.jar - -## Performance - -Should pre-allocate regions for tables that are going to be non-trivially -sized, otherwise all load hits a single node. From the shell, this seems to -involve specifying the split points (key prefixes) manually. From the docs: - - http://hbase.apache.org/book.html#precreate.regions - -There is an ImportTsv tool which might have been useful for original CDX -backfill, but :shrug:. It is nice to have only a single pipeline and have it -work well. diff --git a/hbase/schema_design.md b/hbase/schema_design.md deleted file mode 100644 index 2db8998..0000000 --- a/hbase/schema_design.md +++ /dev/null @@ -1,79 +0,0 @@ - -## PDF Table - -Table name: `wbgrp-journal-extract--` - -Eg: `wbgrp-journal-extract-0-prod` - -Key is the sha1 of the file, as raw bytes (20 bytes). - -Could conceivably need to handle, eg, postscript files, JATS XML, or even HTML -in the future? If possible be filetype-agnostic, but only "fulltext" file types -will end up in here, and don't bend over backwards. - -Keep only a single version (do we need `VERSIONS => 1`, or is 1 the default?) - -IMPORTANT: column names should be unique across column families. 
diff --git a/hbase/schema_design.md b/hbase/schema_design.md
deleted file mode 100644
index 2db8998..0000000
--- a/hbase/schema_design.md
+++ /dev/null
@@ -1,79 +0,0 @@

## PDF Table

Table name: `wbgrp-journal-extract-<version>-<environment>`

Eg: `wbgrp-journal-extract-0-prod`

Key is the sha1 of the file, as raw bytes (20 bytes).

Could conceivably need to handle, eg, postscript files, JATS XML, or even HTML
in the future? If possible be filetype-agnostic, but only "fulltext" file types
will end up in here, and don't bend over backwards.

Keep only a single version (do we need `VERSIONS => 1`, or is 1 the default?)

IMPORTANT: column names should be unique across column families. Eg, should not
have both `grobid0:status` and `match0:status`. HBase and some client libraries
don't care, but some map/reduce frameworks (eg, Scalding) can have name
collisions. Differences between "orthogonal" columns *might* be OK (eg,
`grobid0:status` and `grobid1:status`).

Column families:

- `key`: sha1 of the file in base32 (not a column or column family)
- `f`: heritrix HBaseContentDigestHistory de-dupe
    - `c`: (json string)
        - `u`: original URL (required)
        - `d`: original date (required; ISO 8601:1988)
        - `f`: warc filename (recommended)
        - `o`: warc offset (recommended)
        - `c`: dupe count (optional)
        - `i`: warc record ID (optional)
- `file`: crawl and file metadata
    - `size` (uint64), uncompressed (not in CDX)
    - `mime` (string; might do postscript in the future; normalized)
    - `cdx` (json string) with all values as strings
        - `surt`
        - `url`
        - `dt`
        - `warc` (item and file name)
        - `offset`
        - `c_size` (compressed size)
    - `meta` (json string)
        - `size` (int)
        - `mime` (str)
        - `magic` (str)
        - `magic_mime` (str)
        - `sha1` (hex str)
        - `md5` (hex str)
        - `sha256` (hex str)
- `grobid0`: processing status, version, XML and JSON fulltext, JSON metadata,
  timestamp. Should be compressed! `COMPRESSION => SNAPPY`
    - `status_code` (signed int; HTTP status from grobid)
    - `quality` (int or string; we define the meaning: "good"/"marginal"/"bad")
    - `status` (json string from grobid)
    - `tei_xml` (xml string from grobid)
    - `tei_json` (json string with fulltext)
    - `metadata` (json string with author, title, abstract, citations, etc)
- `match0`: status of identification against "the catalog"
    - `mstatus` (string; did it match?)
    - `doi` (string)
    - `minfo` (json string)

Can add additional groups in the future for additional processing steps. For
example, we might want to do a first pass looking at files to see "is this a
PDF or not", which would output some status (and maybe a certainty).

The Heritrix schema is fixed by the existing implementation. We could
patch/extend heritrix to use the `file` schema in the future if we decide
it's worth it. There are some important pieces of metadata missing, so at
least to start I think we should keep `f` and `file` distinct. We could merge
them later. `f` info will be populated by crawlers; `file` info should be
populated when back-filling or processing CDX lines.

If we wanted to support multiple CDX rows as part of the same row (eg, as
alternate locations), we could use HBase's versions feature, which can
automatically cap the number of versions stored.

If we had enough RAM resources, we could store `f` (and maybe `file`) metadata
in memory for faster access.
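Pulling the above together, a hypothetical `hbase shell` creation command for
this table might look roughly like the following. This is a sketch, not the
actual provisioning command; as far as I know 1 has been the default for
`VERSIONS` since HBase 0.96, so setting it explicitly is just belt-and-suspenders:

    # sketch only: column families from the schema above, single version kept,
    # SNAPPY compression on the bulky grobid0 family
    echo "create 'wbgrp-journal-extract-0-qa', {NAME => 'f', VERSIONS => 1}, {NAME => 'file', VERSIONS => 1}, {NAME => 'grobid0', VERSIONS => 1, COMPRESSION => 'SNAPPY'}, {NAME => 'match0', VERSIONS => 1}" | hbase shell
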
diff --git a/nginx/README.md b/nginx/README.md
deleted file mode 100644
index 0369f9b..0000000
--- a/nginx/README.md
+++ /dev/null
@@ -1,18 +0,0 @@

This folder contains nginx configs for partner access to sandcrawler DB
(postgrest) and GROBID XML blobs (minio).

`fatcat-blobs` is part of the fatcat.wiki ansible config, but included here to
show how it works.

## Let's Encrypt

As... bnewbold?

    sudo certbot certonly \
        --non-interactive \
        --agree-tos \
        --email bnewbold@archive.org \
        --webroot -w /var/www/letsencrypt \
        -d sandcrawler-minio.fatcat.wiki \
        -d sandcrawler-db.fatcat.wiki

diff --git a/nginx/fatcat-blobs b/nginx/fatcat-blobs
deleted file mode 100644
index 5c692ef..0000000
--- a/nginx/fatcat-blobs
+++ /dev/null
@@ -1,51 +0,0 @@

server {
    listen 80;
    listen [::]:80;
    listen 443 ssl http2;
    listen [::]:443 ssl http2;
    server_name blobs.fatcat.wiki;

    ssl_certificate /etc/letsencrypt/live/fatcat.wiki/fullchain.pem;
    ssl_certificate_key /etc/letsencrypt/live/fatcat.wiki/privkey.pem;

    #add_header Content-Security-Policy "default-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline'";
    add_header X-Frame-Options "SAMEORIGIN";        # 'always' if nginx > 1.7.5
    add_header X-Content-Type-Options "nosniff";    # 'always' if nginx > 1.7.5
    add_header X-Xss-Protection "1";
    # Enable STS with one year period (breaks http; optional)
    #add_header Strict-Transport-Security "max-age=31557600; includeSubDomains";

    error_log /var/log/nginx/fatcat-errors.log;
    access_log /dev/null;

    if ($scheme = http) {
        return 301 https://$server_name$request_uri;
    }

    location /unpaywall/ {
        if ($request_method !~ "GET") {
            return 403;
            break;
        }

        #proxy_pass http://sandcrawler-minio.fatcat.wiki:9000$uri$is_args$args;
        proxy_pass http://207.241.227.141:9000$uri$is_args$args;
        proxy_redirect off;

        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header Host $http_host;
    }

    location / {
        default_type text/plain;
        return 504 'blobs.fatcat.wiki hosts many files; full URLs are required!\nyou probably want https://fatcat.wiki/ instead';
    }

    # Let's Encrypt SSL Certs
    location /.well-known/acme-challenge/ {
        root /var/www/letsencrypt;
        autoindex off;
    }
}

diff --git a/nginx/sandcrawler-db b/nginx/sandcrawler-db
deleted file mode 100644
index 67d1a2d..0000000
--- a/nginx/sandcrawler-db
+++ /dev/null
@@ -1,80 +0,0 @@

upstream postgrest {
    server localhost:3030;
    keepalive 64;
}

server {
    listen 80;
    listen [::]:80;
    listen 443 ssl http2;
    listen [::]:443 ssl http2;
    server_name sandcrawler-db.fatcat.wiki db.sandcrawler.org;

    ssl_certificate /etc/letsencrypt/live/sandcrawler-minio.fatcat.wiki/fullchain.pem;
    ssl_certificate_key /etc/letsencrypt/live/sandcrawler-minio.fatcat.wiki/privkey.pem;

    #add_header Content-Security-Policy "default-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline'";
    add_header X-Frame-Options "SAMEORIGIN";        # 'always' if nginx > 1.7.5
    add_header X-Content-Type-Options "nosniff";    # 'always' if nginx > 1.7.5
    add_header X-Xss-Protection "1";
    # Enable STS with one year period (breaks http; optional)
    #add_header Strict-Transport-Security "max-age=31557600; includeSubDomains";

    error_log /var/log/nginx/sandcrawler-errors.log;
    access_log /dev/null;

    if ($scheme = http) {
        return 301 https://$server_name$request_uri;
    }

    location / {

        default_type application/json;

        if ($request_method !~ "GET") {
            return 403;
            break;
        }

        proxy_redirect off;

        proxy_http_version 1.1;
        proxy_set_header Connection "";
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header Host $http_host;

        proxy_pass http://postgrest/;
    }

    # support /endpoint/:id url style for sha1hex lookups
    location ~
"^/(file_meta|grobid|fatcat_file)/([a-f0-9]{40})$" { - - if ($request_method !~ "GET") { - return 403; - break; - } - - # assuming an upstream named "postgrest" - # doing this rewrite as part of the proxy_pass line itself didn't seem - # to work, so doing a formal rewrite here - rewrite "/([a-z_]+)/([a-f0-9]{40})" /$1?sha1hex=eq.$2 break; - proxy_pass http://postgrest; - - # make the response singular - #default_type application/vnd.pgrst.object+json; - proxy_set_header Accept "application/vnd.pgrst.object+json"; - - proxy_http_version 1.1; - proxy_set_header Connection ""; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - } - - # Let's Encrypt SSL Certs - location /.well-known/acme-challenge/ { - root /var/www/letsencrypt; - autoindex off; - } -} diff --git a/nginx/sandcrawler-minio b/nginx/sandcrawler-minio deleted file mode 100644 index 2e9bfe3..0000000 --- a/nginx/sandcrawler-minio +++ /dev/null @@ -1,57 +0,0 @@ - -server { - listen 80; - listen [::]:80; - listen 443 ssl http2; - listen [::]:443 ssl http2; - server_name sandcrawler-minio.fatcat.wiki minio.sandcrawler.org; - - ssl_certificate /etc/letsencrypt/live/sandcrawler-minio.fatcat.wiki/fullchain.pem; - ssl_certificate_key /etc/letsencrypt/live/sandcrawler-minio.fatcat.wiki/privkey.pem; - - #add_header Content-Security-Policy "default-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline'"; - add_header X-Frame-Options "SAMEORIGIN"; # 'always' if nginx > 1.7.5 - add_header X-Content-Type-Options "nosniff"; # 'always' if nginx > 1.7.5 - add_header X-Xss-Protection "1"; - # Enable STS with one year period (breaks http; optional) - #add_header Strict-Transport-Security "max-age=31557600; includeSubDomains"; - - error_log /var/log/nginx/sandcrawler-errors.log; - access_log /dev/null; - - if ($scheme = http) { - return 301 https://$server_name$request_uri; - } - - location /minio/ { - - # allows all HTTP verbs - - proxy_pass http://localhost:9000; - proxy_redirect off; - - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header Host $http_host; - } - - location / { - if ($request_method !~ "GET") { - return 403; - break; - } - - proxy_pass http://localhost:9000; - proxy_redirect off; - - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header Host $http_host; - } - - # Let's Encrypt SSL Certs - location /.well-known/acme-challenge/ { - root /var/www/letsencrypt; - autoindex off; - } -} -- cgit v1.2.3