diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-12-26 21:17:27 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-02 18:12:58 -0800 |
commit | 6a5a0b090d7f303f3332759d63ffd0ac77cdd28c (patch) | |
tree | 0d3f418b78166c8411ef5c54852c01bb41eb8946 /python/sandcrawler | |
parent | 293d4b176855d400324559c814abd2e404cdf31e (diff) | |
download | sandcrawler-6a5a0b090d7f303f3332759d63ffd0ac77cdd28c.tar.gz sandcrawler-6a5a0b090d7f303f3332759d63ffd0ac77cdd28c.zip |
add PersistGrobidDiskWorker
To help with making dumps directly from Kafka (eg, for partner delivery)
Diffstat (limited to 'python/sandcrawler')
-rw-r--r-- | python/sandcrawler/persist.py | 33 |
1 files changed, 33 insertions, 0 deletions
# diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
# index b017a82..9f8171c 100644
# --- a/python/sandcrawler/persist.py
# +++ b/python/sandcrawler/persist.py
# @@ -245,3 +245,36 @@ class PersistGrobidWorker(SandcrawlerWorker):
#          self.db.commit()
#          return []
#
# Class added by this commit (shown without the leading "+" diff markers):

class PersistGrobidDiskWorker(SandcrawlerWorker):
    """
    Writes blobs out to disk.

    Each record's TEI-XML is written to a file under ``output_dir``, in a
    two-level directory fan-out derived from the record key (see
    ``_blob_path``).

    This could be refactored into a "Sink" type with an even thinner wrapper.
    """

    def __init__(self, output_dir):
        """
        Args:
            output_dir: base directory under which blob files are written.
        """
        super().__init__()
        self.output_dir = output_dir

    def _blob_path(self, sha1hex, extension=".tei.xml"):
        """
        Return the path of a blob relative to the output directory.

        The layout is ``<sha1hex[0:2]>/<sha1hex[2:4]>/<sha1hex><extension>``.
        The separator is always "/" regardless of platform.
        """
        return "{}/{}/{}{}".format(
            sha1hex[0:2],
            sha1hex[2:4],
            sha1hex,
            extension,
        )

    def process(self, record):
        """
        Write the record's ``tei_xml`` to disk.

        Args:
            record: dict with at least ``status_code`` and ``key``; ``tei_xml``
                is optional. NOTE: the record is mutated; ``tei_xml`` is
                popped before the record is returned.

        Returns:
            ``False`` if the record was skipped (``status_code`` is not 200,
            or ``tei_xml`` is missing/empty); otherwise the record dict with
            ``tei_xml`` removed.

        Raises:
            AssertionError: if ``record['key']`` is not 40 characters long
                (presumably a SHA-1 hex digest -- confirm against producers).
        """
        if record['status_code'] != 200 or not record.get('tei_xml'):
            return False
        assert len(record['key']) == 40
        # os.path.join avoids a doubled slash when output_dir ends with "/",
        # and avoids writing under the filesystem root when it is empty.
        p = os.path.join(self.output_dir, self._blob_path(record['key']))
        os.makedirs(os.path.dirname(p), exist_ok=True)
        # Explicit UTF-8: without it, open() uses the locale's preferred
        # encoding, which can fail or corrupt non-ASCII XML on some hosts.
        with open(p, 'w', encoding='utf-8') as f:
            f.write(record.pop('tei_xml'))
        # self.counts is not defined in this class; presumably provided by
        # SandcrawlerWorker -- confirm.
        self.counts['written'] += 1
        return record