aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2019-12-26 21:17:27 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-01-02 18:12:58 -0800
commit6a5a0b090d7f303f3332759d63ffd0ac77cdd28c (patch)
tree0d3f418b78166c8411ef5c54852c01bb41eb8946 /python/sandcrawler
parent293d4b176855d400324559c814abd2e404cdf31e (diff)
downloadsandcrawler-6a5a0b090d7f303f3332759d63ffd0ac77cdd28c.tar.gz
sandcrawler-6a5a0b090d7f303f3332759d63ffd0ac77cdd28c.zip
add PersistGrobidDiskWorker
To help with making dumps directly from Kafka (eg, for partner delivery)
Diffstat (limited to 'python/sandcrawler')
-rw-r--r-- python/sandcrawler/persist.py 33
1 files changed, 33 insertions, 0 deletions
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index b017a82..9f8171c 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -245,3 +245,36 @@ class PersistGrobidWorker(SandcrawlerWorker):
self.db.commit()
return []
+
class PersistGrobidDiskWorker(SandcrawlerWorker):
    """
    Writes blobs out to disk.

    For each successful record, the 'tei_xml' value is written to a file
    below `output_dir`, in a two-level directory tree derived from the first
    four characters of the record's 'key'.

    This could be refactored into a "Sink" type with an even thinner wrapper.
    """

    def __init__(self, output_dir):
        """
        output_dir: root directory under which blob files are written.
        """
        super().__init__()
        self.output_dir = output_dir

    def _blob_path(self, sha1hex, extension=".tei.xml"):
        """
        Returns the relative path for a blob, in the form
        "<aa>/<bb>/<sha1hex><extension>", where <aa> and <bb> are the first
        and second pairs of characters of `sha1hex`.
        """
        return "{}/{}/{}{}".format(
            sha1hex[0:2],
            sha1hex[2:4],
            sha1hex,
            extension,
        )

    def process(self, record):
        """
        Writes record['tei_xml'] to disk and returns the record without it.

        Returns False (and writes nothing) when record['status_code'] is not
        200 or when 'tei_xml' is missing or empty. Otherwise the 'tei_xml'
        entry is removed from `record` (the dict is mutated in place), the
        'written' counter is incremented, and `record` is returned.

        Raises AssertionError if record['key'] is not 40 characters long.
        """
        if record['status_code'] != 200 or not record.get('tei_xml'):
            return False
        # The key is used to build the on-disk path; 40 characters matches the
        # length of a hex-encoded SHA-1 digest.
        assert len(record['key']) == 40, "expected a 40-character key"
        p = "{}/{}".format(self.output_dir, self._blob_path(record['key']))
        os.makedirs(os.path.dirname(p), exist_ok=True)
        # Explicit UTF-8 so output does not depend on the process locale; the
        # platform default can fail on, or mis-encode, non-ASCII XML content.
        with open(p, 'w', encoding='utf-8') as f:
            f.write(record.pop('tei_xml'))
        self.counts['written'] += 1
        return record
+