From f3a721a9dce8801b78f7bc31e88dc912b0ec1dba Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 23 Dec 2022 15:52:02 -0800 Subject: move a bunch of top-level files/directories to ./extra/ --- extra/RUNBOOK.md | 44 ++++++++ extra/blobs/README.md | 86 +++++++++++++++ extra/blobs/minio/README.md | 74 +++++++++++++ extra/blobs/minio/minio.conf | 14 +++ extra/blobs/seaweedfs/README.md | 9 ++ extra/blobs/tasks.md | 53 +++++++++ extra/hbase/howto.md | 42 ++++++++ extra/hbase/notes.txt | 232 ++++++++++++++++++++++++++++++++++++++++ extra/hbase/schema_design.md | 79 ++++++++++++++ extra/nginx/README.md | 18 ++++ extra/nginx/fatcat-blobs | 51 +++++++++ extra/nginx/sandcrawler-db | 80 ++++++++++++++ extra/nginx/sandcrawler-minio | 57 ++++++++++ 13 files changed, 839 insertions(+) create mode 100644 extra/RUNBOOK.md create mode 100644 extra/blobs/README.md create mode 100644 extra/blobs/minio/README.md create mode 100644 extra/blobs/minio/minio.conf create mode 100644 extra/blobs/seaweedfs/README.md create mode 100644 extra/blobs/tasks.md create mode 100644 extra/hbase/howto.md create mode 100644 extra/hbase/notes.txt create mode 100644 extra/hbase/schema_design.md create mode 100644 extra/nginx/README.md create mode 100644 extra/nginx/fatcat-blobs create mode 100644 extra/nginx/sandcrawler-db create mode 100644 extra/nginx/sandcrawler-minio (limited to 'extra') diff --git a/extra/RUNBOOK.md b/extra/RUNBOOK.md new file mode 100644 index 0000000..6c4165d --- /dev/null +++ b/extra/RUNBOOK.md @@ -0,0 +1,44 @@ + +## Process Un-GROBID-ed PDFs from Wayback + +Sometimes ingest doesn't pick up everything, or we do some heuristic CDX +import, and we want to run GROBID over all the PDFs that haven't been processed +yet. Only want one CDX line per `sha1hex`. + +A hybrid SQL/UNIX way of generating processing list: + + psql sandcrawler < /fast/sandcrawler/sql/dump_ungrobid_pdf.sql | sort -S 4G | uniq -w 40 | cut -f2 > dump_ungrobid_pdf.2020.01-27.json + +From here, there are two options: enqueue in Kafka and let workers run, or +create job files and run them using local worker and GNU/parallel. + +#### Kafka + +Copy/transfer to a Kafka node; load a sample and then the whole output: + + head -n1000 dump_ungrobid_pdf.2020.01-27.json | kafkacat -P -b localhost -t sandcrawler-prod.ungrobided-pg -p -1 + cat dump_ungrobid_pdf.2020.01-27.json | kafkacat -P -b localhost -t sandcrawler-prod.ungrobided-pg -p -1 + +#### Local JSON + +Older example; if this fails, need to re-run entire thing: + + cat /srv/sandcrawler/tasks/regrobid_cdx.split_*.json | pv -l | parallel -j40 --linebuffer --round-robin --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json - + +TODO: is it possible to use job log with millions of `--pipe` inputs? That +would be more efficient in the event of failure. + +## GROBID over many .zip files + +Want to use GNU/Parallel in a mode that will do retries well: + + fd .zip /srv/sandcrawler/tasks/crossref-pre-1909-scholarly-works/ | \ + sort | \ + parallel -j16 --progress --joblog extract_tasks.log --resume-failed \ + './grobid_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc350.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --grobid-host http://localhost:8070 extract-zipfile {}' + +After starting, check that messages are actually getting pushed to kafka +(producer failures can be silent!). If anything goes wrong, run the exact same +command again. The sort is to ensure jobs are enqueued in the same order again; +could also dump `fd` output to a command file first. + diff --git a/extra/blobs/README.md b/extra/blobs/README.md new file mode 100644 index 0000000..555db92 --- /dev/null +++ b/extra/blobs/README.md @@ -0,0 +1,86 @@ + +This document describes sandcrawler/fatcat use of "blob store" infrastructure +for storing hundreds of millions of small files. For example, GROBID XML +documents, jpeg thumbnails of PDFs. + +The basic feature requirements for this system are: + +- don't need preservation data resiliency: all this data is derived from + primary content, and is usually redundantly stored in Kafka topics (and thus + can be re-indexed to any server bounded only by throughput of the object + store service; Kafka is usually faster) +- don't require SSDs or large amounts of RAM. Ability to accelerate performance + with additional RAM or moving indexes to SSD is nice, but we will be using + spinning disks for primary data storage +- hundreds of millions or billions of objects, fetchable by a key we define +- optional transparent compression (for text and XML) +- typical object (file) size of 5-200 KBytes uncompressed, want to support up + to several MBytes +- very simple internal API for GET/PUT (S3 API compatible is good) +- ability to proxy to HTTP publicly for reads (eg, HTTP fall-back with no + authenticaiton), controllable by at least bucket granularity + +## Infrastructure + +`minio` was used initially, but did not scale well in number of files. We +currently use seaweedfs. Any S3-compatible key/value store should work in +theory. openlibrary.org has used WARCs in petabox items in the past. Actual +cloud object stores tend to be expensive for this kind of use case. + +The facebook "haystack" project (and whitepaper) are good background reading +describing one type of system that works well for this application. + + +## Bucket / Folder Structure + +Currently we run everything off a single server, with no redundancy. There is +no QA/prod distinction. + +Setting access control and doing bulk deletions is easiest at the bucket level, +less easy at the folder level, most difficult at the suffix (file extention) +level. + +For files that are derived from PDFs, we use the SHA-1 (in lower-case hex) of +the source PDF to contruct keys. We generate nested "directories" from the hash +to limit the number of keys per "directory" (even though in S3/seaweedfs there +are no actual directories involved). The structure looks like: + + //// + +Eg: + + sandcrawler/grobid/1a/64/1a6462a925a9767b797fe6085093b6aa9f27f523.tei.xml + +The nesting is sort of a hold-over from minio (where files were actually +on-disk), but seems worth keeping in case we end up switching storage systems +in the future. + +## Existing Content + +sandcrawler: internal/controlled access to PDF derivatives + grobid: TEI-XML documents + extension: .tei.xml + text: raw pdftotext (or other text transform) + extension: .txt + +thumbnail: public bucket for thumbnail images + pdf: thumbnails from PDF files + extension: .180px.jpg + +## Proxy and URLs + +Internal HTTP access via: + + http://wbgrp-svc169.us.archive.org:8333// + +Public access via: + + https://blobs.fatcat.wiki// + +Eg: + + http://wbgrp-svc169.us.archive.org:8333/testing/small.txt + http://wbgrp-svc169.us.archive.org:8333/sandcrawler/grobid/1a/64/1a6462a925a9767b797fe6085093b6aa9f27f523.tei.xml + https://blobs.fatcat.wiki/testing/small.txt + https://blobs.fatcat.wiki/thumbnail/pdf/1a/64/1a6462a925a9767b797fe6085093b6aa9f27f523.180px.jpg + diff --git a/extra/blobs/minio/README.md b/extra/blobs/minio/README.md new file mode 100644 index 0000000..d8f1c69 --- /dev/null +++ b/extra/blobs/minio/README.md @@ -0,0 +1,74 @@ + +minio is used as an S3-compatible blob store. Initial use case is GROBID XML +documents, addressed by the sha1 of the PDF file the XML was extracted from. + +Note that on the backend minio is just storing objects as files on disk. + +## Deploying minio Server + +It seems to be important to use a version of minio from at least December 2019 +era for on-disk compression to actually work. + +Currently install minio (and mc, the minio client) in prod by simply +downloading the binaries and calling from systemd. + +## Buckets and Directories + +Hosts and buckets: + + localhost:sandcrawler-dev + create locally for development (see below) + + cluster:sandcrawler + main sandcrawler storage bucket, for GROBID output and other derivatives. + Note it isn't "sandcrawler-prod", for backwards compatibility reasons. + + cluster:sandcrawler-qa + for, eg, testing on cluster servers + + cluster:unpaywall + subset of sandcrawler content crawled due to unpaywall URLs; + potentially made publicly accessible + +Directory structure within sandcrawler buckets: + + grobid/2c/0d/2c0daa9307887a27054d4d1f137514b0fa6c6b2d.tei.xml + SHA1 (lower-case hex) of PDF that XML was extracted from + +Create new buckets like: + + mc mb cluster/sandcrawler-qa + +## Development + +Run minio server locally, with non-persisted data: + + docker run -p 9000:9000 minio/minio server /data + +Credentials are `minioadmin:minioadmin`. Install `mc` client utility, and +configure: + + mc config host add localhost http://localhost:9000 minioadmin minioadmin + +Then create dev bucket: + + mc mb --ignore-existing localhost/sandcrawler-dev + +A common "gotcha" with `mc` command is that it will first look for a local +folder/directory with same name as the configured remote host, so make sure +there isn't a `./localhost` folder. + + +## Users + +Create a new readonly user like: + + mc admin user add sandcrawler unpaywall $RANDOM_SECRET_KEY readonly + +Make a prefix within a bucket world-readable like: + + mc policy set download cluster/unpaywall/grobid + +## Config + + mc admin config set aitio compression extensions=.txt,.log,.csv,.json,.tsv,.pdf,.xml mime_types=text/csv,text/plain,application/json,application/xml,application/octet-stream,application/tei+xml diff --git a/extra/blobs/minio/minio.conf b/extra/blobs/minio/minio.conf new file mode 100644 index 0000000..2e93f9a --- /dev/null +++ b/extra/blobs/minio/minio.conf @@ -0,0 +1,14 @@ + +# Volume to be used for MinIO server. +MINIO_VOLUMES="/sandcrawler-minio/data" +# Use if you want to run MinIO on a custom port. +MINIO_OPTS="--address :9000" +# Access Key of the server. +MINIO_ACCESS_KEY=REDACTED +# Secret key of the server. +MINIO_SECRET_KEY=REDACTED + +# may need to set these manually using `mc admin config get`, edit the JSON, then `set` +MINIO_COMPRESS="on" +MINIO_COMPRESS_EXTENSIONS=".txt,.log,.csv,.json,.tar,.xml,.bin,.pdf,.tsv" +MINIO_COMPRESS_MIME_TYPES="text/*,application/json,application/xml,application/pdf,application/octet-stream" diff --git a/extra/blobs/seaweedfs/README.md b/extra/blobs/seaweedfs/README.md new file mode 100644 index 0000000..d19e9e0 --- /dev/null +++ b/extra/blobs/seaweedfs/README.md @@ -0,0 +1,9 @@ + +## HOWTO: Create new bucket in SeaweedFS + +Log in to the seaweedfs VM. + +Run `weed shell` to start a shell, then: + + bucket.create -name + diff --git a/extra/blobs/tasks.md b/extra/blobs/tasks.md new file mode 100644 index 0000000..beb765f --- /dev/null +++ b/extra/blobs/tasks.md @@ -0,0 +1,53 @@ + +## Backfill GROBID XML to Blob Store + +Initially ran this when spinning up new seaweedfs server to replace minio. At +this time grobid persist worker was in db-only mode, as minio was too slow to +accept uploads. Rough plan is to: + +1. run grobid persist worker from Kafka with a new temporary consumer group, + from the start of the GROBID output topic +2. when it gets to end, stop the *regular* consumer group while this one is + still running. with temporary worker still running, at that point in time + entire topic should be in S3 +3. then reconfigure regular worker to db+s3 mode. halt the temporary worker, + restart the regular one with new config, run it indefinitely + +Consumer group isn't an arg, so just edit `persist_worker.py` and set it to +`persist-grobid-seaweedfs`. Also needed to patch a bit so `--s3-only` mode +didn't try to connect to postgresql. + +Commands: + + ./sandcrawler_worker.py --kafka-hosts wbgrp-svc350.us.archive.org:9092 --env prod --s3-bucket sandcrawler --s3-url wbgrp-svc169.us.archive.org:8333 persist-grobid --s3-only + => Consuming from kafka topic sandcrawler-prod.grobid-output-pg, group persist-grobid-seaweed + => run briefly, then kill + +On kafka-broker worker: + + ./kafka-consumer-groups.sh --bootstrap-server localhost:9092 --reset-offsets --to-earliest --group persist-grobid-seaweed --topic sandcrawler-prod.grobid-output-pg --dry-run + +Then run 2x instances of worker (same command as above): + + ./sandcrawler_worker.py --kafka-hosts wbgrp-svc350.us.archive.org:9092 --env prod --s3-bucket sandcrawler --s3-url wbgrp-svc169.us.archive.org:8333 persist-grobid --s3-only + +At this point CPU-limited on this worker by the python processes (only 4 cores +on this machine). + +Check in weed shell: + + weed shell + + > > fs.meta.cat buckets/sandcrawler/grobid/00/00/000068a76ab125389506e8834483c6ba4c73338a.tei.xml + [...] + "isGzipped": false + [...] + "mime": "application/xml", + [...] + +An open question is if we should have separate buckets per derive type. Eg, a +GROBID XML bucket separate from thumbnails bucket. Or are prefix directories +enough. Basically this comes down to whether we want things mixed together at +the volume level. I think we should keep separate. + +Need to set the mimetype in the upload for gzip on XML? diff --git a/extra/hbase/howto.md b/extra/hbase/howto.md new file mode 100644 index 0000000..26d33f4 --- /dev/null +++ b/extra/hbase/howto.md @@ -0,0 +1,42 @@ + +Commands can be run from any cluster machine with hadoop environment config +set up. Most of these commands are run from the shell (start with `hbase +shell`). There is only one AIT/Webgroup HBase instance/namespace; there may be +QA/prod tables, but there are not QA/prod clusters. + +## Create Table + +Create column families (note: not all individual columns) with something like: + + create 'wbgrp-journal-extract-0-qa', 'f', 'file', {NAME => 'grobid0', COMPRESSION => 'snappy'} + +## Run Thrift Server Informally + +The Thrift server can technically be run from any old cluster machine that has +Hadoop client stuff set up, using: + + hbase thrift start -nonblocking -c + +Note that this will run version 0.96, while the actual HBase service seems to +be running 0.98. + +To interact with this config, use happybase (python) config: + + conn = happybase.Connection("bnewbold-dev.us.archive.org", transport="framed", protocol="compact") + # Test connection + conn.tables() + +## Queries From Shell + +Fetch all columns for a single row: + + hbase> get 'wbgrp-journal-extract-0-qa', 'sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ' + +Fetch multiple columns for a single row, using column families: + + hbase> get 'wbgrp-journal-extract-0-qa', 'sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ', 'f', 'file' + +Scan a fixed number of rows (here 5) starting at a specific key prefix, all +columns: + + hbase> scan 'wbgrp-journal-extract-0-qa',{LIMIT=>5,STARTROW=>'sha1:A'} diff --git a/extra/hbase/notes.txt b/extra/hbase/notes.txt new file mode 100644 index 0000000..20f406f --- /dev/null +++ b/extra/hbase/notes.txt @@ -0,0 +1,232 @@ + +## Notes on HBase features + +Decent one-page introduction: +https://www.tutorialspoint.com/hbase/hbase_overview.htm + +Question: what version of hbase are we running? what on-disk format? + +=> Server: HBase 0.98.6-cdh5.3.1 +=> Client: HBase 0.96.1.1-cdh5.0.1 +=> As of 2018, 1.2 is stable and 2.0 is released. + +Question: what are our servers? how can we monitor? + +=> http://ia802402.us.archive.org:6410/master-status + +I haven't been able to find a simple table of hbase version and supported/new +features over the years (release notes are too detailed). + +Normal/online mapreduce over tables sounds like it goes through a "region +server" and is slow. Using snapshots allows direct access to underlying +tablets on disk? Or is access always direct? + +Could consider "Medium-sized Object" support for 100 KByte to 10 MByte sized +files. This seems to depend on HBase v3, which was added in HBase 0.98, so we +can't use it yet. + +Do we need to decide on on-disk format? Just stick with defaults. + +Looks like we use the `happybase` python package to write. This is packaged in +debian, but only for python2. There is also a `starbase` python library +wrapping the REST API. + +There is a "bulk load" mechanism for going directly from HDFS into HBase, by +creating HFiles that can immediately be used by HBase. + +## Specific "Queries" needed + +"Identifier" will mostly want to get "new" (unprocessed) rows to process. It +can do so by + +Question: if our columns are mostly "dense" within a column group (aka, mostly +all or none set), what is the value of splitting out columns instead of using a +single JSON blob or something like that? Not needing to store the key strings? +Being able to do scan filters? The later obviously makes sense in some +contexts. + +- is there a visible distinction between "get(table, group:col)" being + zero-length (somebody put() an empty string (like "") versus that column not + having being written to? + +## Conversation with Noah about Heritrix De-Dupe + +AIT still uses HBase for url-agnostic de-dupe, but may move away from it. Does +about 250 reads/sec (estimate based on URL hits per quarter). Used to have more +problems (region servers?) but haven't for a while. If a crawler can't reach +HBase, it will "fail safe" and retain the URL. However, Heritrix has trouble +starting up if it can't connect at start. Uses the native JVM drivers. + +Key is "sha1:-", so in theory they can control whether to +dedupe inside or outside of individual crawls (or are they account IDs?). IIRC +all columns were in one family, and had short names (single character). Eg: + + hbase(main):012:0> scan 'ait-prod-digest-history',{LIMIT=>5,STARTROW=>'sha1:A'} + sha1:A22222453XRJ63AC7YCSK46APWHTJKFY-2312 column=f:c, timestamp=1404151869546, value={"c":1,"u":"http://www.theroot.com/category/views-tags/black-fortune-500-ceos","d":"2012-02-23T08:27:10Z","o":8867552,"f":"ARCHIVEIT-REDACTED-20120223080317-00009-crawling201.us.archive.org-6681.warc.gz"} + +Code for url-agnostic dedupe is in: + + heritrix3/contrib/src/main/java/org/archive/modules/recrawl/hbase/HBaseContentDigestHistory.java + +Crawl config snippet: + + [...] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + [...] + +## Kenji Conversation (global wayback) + +Spoke with Kenji, who had previous experience trying to use HBase for crawl +de-dupe. Take away was that it didn't perform well for them even back then, +with 3000+ req/sec. AIT today is more like 250 req/sec. + +Apparently CDX API is just the fastest thing ever; stable slow latency on reads +(~200ms!), and it takes an hour for "writes" (bulk deltacdx or whatever). + +Sounds like HBase in particular struggled with concurrent heavy reads and +writes; frequent re-compaction caused large tail latencies, and when region +servers were loaded they would time-out of zookeeper. + +He pushed to use elasticsearch instead of hbase to store extracted fulltext, as +a persistant store, particularly if we end up using it for fulltext someday. He +thinks it operates really well as a datastore. I am not really comfortable with +this usecase, or depending on elastic as a persistant store in general, and it +doesn't work for the crawl dedupe case. + +He didn't seem beholden to the tiny column name convention. + + +## Google BigTable paper (2006) + +Hadn't read this paper in a long time, and didn't really understand it at the +time. HBase is a clone of BigTable. + +They used bigtable to store crawled HTML! Very similar use-case to our journal +stuff. Retained multiple crawls using versions; version timestamps are crawl +timestamps, aha. + +Crazy to me how the whole hadoop world is Java (garbage collected), while all +the google stuff is C++. So many issues with hadoop are performance/latency +sensitive; having garbage collection in a situation when RAM is tight and +network timeouts are problematic seems like a bad combination for operability +(median/optimistic performance is probably fine) + +"Locality" metadata important for actually separating column families. Column +name scarcity doesn't seem to be a thing/concern. Compression settings +important. Key selection to allow local compression seems important to them. + +Performance probably depends a lot on 1) relative rate of growth (slow if +re-compressing, etc), 2) + +Going to want/need table-level monitoring, probably right from the start. + +## Querying/Aggregating/Stats + +We'll probably want to be able to run simple pig-style queries over HBase. How +will that work? A couple options: + +- query hbase using pig via HBaseStorage and HBaseLoader +- hive runs on map/reduce like pig +- drill is an online/fast SQL query engine with HBase back-end support. Not + map/reduce based; can run from a single server. Supports multiple "backends". + Somewhat more like pig; "schema-free"/JSON. +- impala supports HBase backends +- phoenix is a SQL engine on top of HBase + +## Hive Integration + +Run `hive` from a hadoop cluster machine: + + bnewbold@ia802405$ hive --version + Hive 0.13.1-cdh5.3.1 + Subversion file:///var/lib/jenkins/workspace/generic-package-ubuntu64-12-04/CDH5.3.1-Packaging-Hive-2015-01-27_16-23-36/hive-0.13.1+cdh5.3.1+308-1.cdh5.3.1.p0.17~precise -r Unknown + Compiled by jenkins on Tue Jan 27 16:38:11 PST 2015 + From source with checksum 1bb86e4899928ce29cbcaec8cf43c9b6 + +Need to create mapping tables: + + CREATE EXTERNAL TABLE journal_extract_qa(rowkey STRING, grobid_status STRING, file_size STRING) + STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' + WITH SERDEPROPERTIES ('hbase.columns.mapping' = ':key,grobid0:status_code,file:size') + TBLPROPERTIES ('hbase.table.name' = 'wbgrp-journal-extract-0-qa'); + +Maybe: + + SET hive.aux.jars.path = file:///home/webcrawl/hadoop-2/hive/lib/hive-hbase-handler-0.13.1-cdh5.3.1.jar,file:///home/webcrawl/hadoop-2/hive/lib/hbase-client-0.96.1.1-cdh5.0.1.jar; + SELECT * from journal_extract_qa LIMIT 10; + +Or? + + ADD jar /usr/lib/hive/lib/hive-hbase-handler-0.13.1-cdh5.3.1.jar; + ADD jar /usr/lib/hive/lib/hive-shims-common-secure-0.13.1-cdh5.3.1.jar; + ADD jar /usr/lib/hadoop-hdfs/hadoop-hdfs-2.5.0-cdh5.3.1.jar; + ADD jar /usr/lib/hbase/hbase-client-0.98.6-cdh5.3.1.jar; + ADD jar /usr/lib/hbase/hbase-common-0.98.6-cdh5.3.1.jar; + +Or, from a real node? + + SET hive.aux.jars.path = file:///usr/lib/hive/lib/hive-hbase-handler-0.13.1-cdh5.3.1.jar,file:///usr/lib/hbase/lib/hbase-client-0.98.6-cdh5.3.1.jar,file:///usr/lib/hadoop-hdfs/hadoop-hdfs-2.5.0-cdh5.3.1.jar; + SELECT * from journal_extract_qa LIMIT 10; + +Getting an error: + + Exception in thread "main" java.lang.NoSuchMethodError: org.apache.hadoop.hdfs.client.HdfsAdmin.getEncryptionZoneForPath(Lorg/apache/hadoop/fs/Path;)Lorg/apache/hadoop/hdfs/protocol/EncryptionZone; + +The HdfsAdmin admin class is in hadoop-hdfs, but `getEncryptionZoneForPath` +isn't in there. See upstream commit: + + https://github.com/apache/hadoop/commit/20dcb841ce55b0d414885ceba530c30b5b528b0f + +## Debugging + +List hbase tables known to zookeeper (as opposed to `list` from `hbase shell`): + + hbase zkcli ls /hbase/table + +Look for jar files with a given symbol: + + rg HdfsAdmin -a /usr/lib/*/*.jar + +## Performance + +Should pre-allocate regions for tables that are going to be non-trivially +sized, otherwise all load hits a single node. From the shell, this seems to +involve specifying the split points (key prefixes) manually. From the docs: + + http://hbase.apache.org/book.html#precreate.regions + +There is an ImportTsv tool which might have been useful for original CDX +backfill, but :shrug:. It is nice to have only a single pipeline and have it +work well. diff --git a/extra/hbase/schema_design.md b/extra/hbase/schema_design.md new file mode 100644 index 0000000..2db8998 --- /dev/null +++ b/extra/hbase/schema_design.md @@ -0,0 +1,79 @@ + +## PDF Table + +Table name: `wbgrp-journal-extract--` + +Eg: `wbgrp-journal-extract-0-prod` + +Key is the sha1 of the file, as raw bytes (20 bytes). + +Could conceivably need to handle, eg, postscript files, JATS XML, or even HTML +in the future? If possible be filetype-agnostic, but only "fulltext" file types +will end up in here, and don't bend over backwards. + +Keep only a single version (do we need `VERSIONS => 1`, or is 1 the default?) + +IMPORTANT: column names should be unique across column families. Eg, should not +have both `grobid0:status` and `match0:status`. HBase and some client libraries +don't care, but some map/reduce frameworks (eg, Scalding) can have name +collisions. Differences between "orthogonal" columns *might* be OK (eg, +`grobid0:status` and `grobid1:status`). + +Column families: + +- `key`: sha1 of the file in base32 (not a column or column family) +- `f`: heritrix HBaseContentDigestHistory de-dupe + - `c`: (json string) + - `u`: original URL (required) + - `d`: original date (required; ISO 8601:1988) + - `f`: warc filename (recommend) + - `o`: warc offset (recommend) + - `c`: dupe count (optional) + - `i`: warc record ID (optional) +- `file`: crawl and file metadata + - `size` (uint64), uncompressed (not in CDX) + - `mime` (string; might do postscript in the future; normalized) + - `cdx` (json string) with all as strings + - `surt` + - `url` + - `dt` + - `warc` (item and file name) + - `offset` + - `c_size` (compressed size) + - `meta` (json string) + - `size` (int) + - `mime` (str) + - `magic` (str) + - `magic_mime` (str) + - `sha1` (hex str) + - `md5` (hex str) + - `sha256` (hex str) +- `grobid0`: processing status, version, XML and JSON fulltext, JSON metadata. timestamp. Should be compressed! `COMPRESSION => SNAPPY` + - `status_code` (signed int; HTTP status from grobid) + - `quality` (int or string; we define the meaning ("good"/"marginal"/"bad") + - `status` (json string from grobid) + - `tei_xml` (xml string from grobid) + - `tei_json` (json string with fulltext) + - `metadata` (json string with author, title, abstract, citations, etc) +- `match0`: status of identification against "the catalog" + - `mstatus` (string; did it match?) + - `doi` (string) + - `minfo` (json string) + +Can add additional groups in the future for additional processing steps. For +example, we might want to do first pass looking at files to see "is this a PDF +or not", which out output some status (and maybe certainty). + +The Heritrix schema is fixed by the existing implementation. We could +patch/extend heritrix to use the `file` schema in the future if we decide +it's worth it. There are some important pieces of metadata missing, so at +least to start I think we should keep `f` and `file` distinct. We could merge +them later. `f` info will be populated by crawlers; `file` info should be +populated when back-filling or processing CDX lines. + +If we wanted to support multiple CDX rows as part of the same row (eg, as +alternate locations), we could use HBase's versions feature, which can +automatically cap the number of versions stored. + +If we had enough RAM resources, we could store `f` (and maybe `file`) metadata +in memory for faster access. diff --git a/extra/nginx/README.md b/extra/nginx/README.md new file mode 100644 index 0000000..0369f9b --- /dev/null +++ b/extra/nginx/README.md @@ -0,0 +1,18 @@ + +This folder contains nginx configs for partner access to sandcrawler DB +(postgrest) and GROBID XML blobs (minio). + +`fatcat-blobs` is part of the fatcat.wiki ansible config, but included here to +show how it works. + +## Let's Encrypt + +As... bnewbold? + + sudo certbot certonly \ + --non-interactive \ + --agree-tos \ + --email bnewbold@archive.org \ + --webroot -w /var/www/letsencrypt \ + -d sandcrawler-minio.fatcat.wiki \ + -d sandcrawler-db.fatcat.wiki diff --git a/extra/nginx/fatcat-blobs b/extra/nginx/fatcat-blobs new file mode 100644 index 0000000..5c692ef --- /dev/null +++ b/extra/nginx/fatcat-blobs @@ -0,0 +1,51 @@ + +server { + listen 80; + listen [::]:80; + listen 443 ssl http2; + listen [::]:443 ssl http2; + server_name blobs.fatcat.wiki; + + ssl_certificate /etc/letsencrypt/live/fatcat.wiki/fullchain.pem; + ssl_certificate_key /etc/letsencrypt/live/fatcat.wiki/privkey.pem; + + #add_header Content-Security-Policy "default-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline'"; + add_header X-Frame-Options "SAMEORIGIN"; # 'always' if nginx > 1.7.5 + add_header X-Content-Type-Options "nosniff"; # 'always' if nginx > 1.7.5 + add_header X-Xss-Protection "1"; + # Enable STS with one year period (breaks http; optional) + #add_header Strict-Transport-Security "max-age=31557600; includeSubDomains"; + + error_log /var/log/nginx/fatcat-errors.log; + access_log /dev/null; + + if ($scheme = http) { + return 301 https://$server_name$request_uri; + } + + location /unpaywall/ { + if ($request_method !~ "GET") { + return 403; + break; + } + + #proxy_pass http://sandcrawler-minio.fatcat.wiki:9000$uri$is_args$args; + proxy_pass http://207.241.227.141:9000$uri$is_args$args; + proxy_redirect off; + + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header Host $http_host; + } + + location / { + default_type text/plain; + return 504 'blobs.fatcat.wiki hosts many files; full URLs are required!\nyou probably want https://fatcat.wiki/ instead'; + } + + # Let's Encrypt SSL Certs + location /.well-known/acme-challenge/ { + root /var/www/letsencrypt; + autoindex off; + } +} diff --git a/extra/nginx/sandcrawler-db b/extra/nginx/sandcrawler-db new file mode 100644 index 0000000..67d1a2d --- /dev/null +++ b/extra/nginx/sandcrawler-db @@ -0,0 +1,80 @@ + +upstream postgrest { + server localhost:3030; + keepalive 64; +} + +server { + listen 80; + listen [::]:80; + listen 443 ssl http2; + listen [::]:443 ssl http2; + server_name sandcrawler-db.fatcat.wiki db.sandcrawler.org; + + ssl_certificate /etc/letsencrypt/live/sandcrawler-minio.fatcat.wiki/fullchain.pem; + ssl_certificate_key /etc/letsencrypt/live/sandcrawler-minio.fatcat.wiki/privkey.pem; + + #add_header Content-Security-Policy "default-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline'"; + add_header X-Frame-Options "SAMEORIGIN"; # 'always' if nginx > 1.7.5 + add_header X-Content-Type-Options "nosniff"; # 'always' if nginx > 1.7.5 + add_header X-Xss-Protection "1"; + # Enable STS with one year period (breaks http; optional) + #add_header Strict-Transport-Security "max-age=31557600; includeSubDomains"; + + error_log /var/log/nginx/sandcrawler-errors.log; + access_log /dev/null; + + if ($scheme = http) { + return 301 https://$server_name$request_uri; + } + + location / { + + default_type application/json; + + if ($request_method !~ "GET") { + return 403; + break; + } + + proxy_redirect off; + + proxy_http_version 1.1; + proxy_set_header Connection ""; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header Host $http_host; + + proxy_pass http://postgrest/; + } + + # support /endpoint/:id url style for sha1hex lookups + location ~ "^/(file_meta|grobid|fatcat_file)/([a-f0-9]{40})$" { + + if ($request_method !~ "GET") { + return 403; + break; + } + + # assuming an upstream named "postgrest" + # doing this rewrite as part of the proxy_pass line itself didn't seem + # to work, so doing a formal rewrite here + rewrite "/([a-z_]+)/([a-f0-9]{40})" /$1?sha1hex=eq.$2 break; + proxy_pass http://postgrest; + + # make the response singular + #default_type application/vnd.pgrst.object+json; + proxy_set_header Accept "application/vnd.pgrst.object+json"; + + proxy_http_version 1.1; + proxy_set_header Connection ""; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + } + + # Let's Encrypt SSL Certs + location /.well-known/acme-challenge/ { + root /var/www/letsencrypt; + autoindex off; + } +} diff --git a/extra/nginx/sandcrawler-minio b/extra/nginx/sandcrawler-minio new file mode 100644 index 0000000..2e9bfe3 --- /dev/null +++ b/extra/nginx/sandcrawler-minio @@ -0,0 +1,57 @@ + +server { + listen 80; + listen [::]:80; + listen 443 ssl http2; + listen [::]:443 ssl http2; + server_name sandcrawler-minio.fatcat.wiki minio.sandcrawler.org; + + ssl_certificate /etc/letsencrypt/live/sandcrawler-minio.fatcat.wiki/fullchain.pem; + ssl_certificate_key /etc/letsencrypt/live/sandcrawler-minio.fatcat.wiki/privkey.pem; + + #add_header Content-Security-Policy "default-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline'"; + add_header X-Frame-Options "SAMEORIGIN"; # 'always' if nginx > 1.7.5 + add_header X-Content-Type-Options "nosniff"; # 'always' if nginx > 1.7.5 + add_header X-Xss-Protection "1"; + # Enable STS with one year period (breaks http; optional) + #add_header Strict-Transport-Security "max-age=31557600; includeSubDomains"; + + error_log /var/log/nginx/sandcrawler-errors.log; + access_log /dev/null; + + if ($scheme = http) { + return 301 https://$server_name$request_uri; + } + + location /minio/ { + + # allows all HTTP verbs + + proxy_pass http://localhost:9000; + proxy_redirect off; + + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header Host $http_host; + } + + location / { + if ($request_method !~ "GET") { + return 403; + break; + } + + proxy_pass http://localhost:9000; + proxy_redirect off; + + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header Host $http_host; + } + + # Let's Encrypt SSL Certs + location /.well-known/acme-challenge/ { + root /var/www/letsencrypt; + autoindex off; + } +} -- cgit v1.2.3