From 91e6b33a4733fbe622ce0e09460a75cd377bee7a Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Tue, 14 Jan 2020 17:00:59 -0800
Subject: small fixups to SandcrawlerPostgrestClient

---
 python/sandcrawler/__init__.py | 1 +
 python/sandcrawler/db.py | 11 ++++++++++-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
index 699126f..2d28829 100644
--- a/python/sandcrawler/__init__.py
+++ b/python/sandcrawler/__init__.py
@@ -5,4 +5,5 @@ from .workers import KafkaSink, KafkaGrobidSink, JsonLinePusher, CdxLinePusher,
 from .ia import WaybackClient, WaybackError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, PetaboxError, ResourceResult, WarcResource, CdxPartial, CdxRow
 from .ingest import IngestFileWorker
 from .persist import PersistCdxWorker, PersistIngestFileResultWorker, PersistGrobidWorker, PersistGrobidDiskWorker
+from .db import SandcrawlerPostgrestClient, SandcrawlerPostgresClient
 
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index eb1a922..1a47b0b 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -9,7 +9,7 @@ import requests
 class SandcrawlerPostgrestClient:
 
     def __init__(self, api_url="http://aitio.us.archive.org:3030", **kwargs):
-        self.api_uri = api_url
+        self.api_url = api_url
 
     def get_cdx(self, url):
         resp = requests.get(self.api_url + "/cdx", params=dict(url='eq.'+url))
@@ -34,6 +34,15 @@ class SandcrawlerPostgrestClient:
         else:
             return None
 
+    def get_ingest_file_result(self, url):
+        resp = requests.get(self.api_url + "/ingest_file_result", params=dict(base_url='eq.'+url))
+        resp.raise_for_status()
+        resp = resp.json()
+        if resp:
+            return resp[0]
+        else:
+            return None
+
 class SandcrawlerPostgresClient:
 
     def __init__(self, db_url, **kwargs):
-- 
cgit v1.2.3