diff options
-rw-r--r-- | python/sandcrawler/__init__.py | 1 | ||||
-rw-r--r-- | python/sandcrawler/db.py | 11 |
2 files changed, 11 insertions, 1 deletion
diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py index 699126f..2d28829 100644 --- a/python/sandcrawler/__init__.py +++ b/python/sandcrawler/__init__.py @@ -5,4 +5,5 @@ from .workers import KafkaSink, KafkaGrobidSink, JsonLinePusher, CdxLinePusher, from .ia import WaybackClient, WaybackError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, PetaboxError, ResourceResult, WarcResource, CdxPartial, CdxRow from .ingest import IngestFileWorker from .persist import PersistCdxWorker, PersistIngestFileResultWorker, PersistGrobidWorker, PersistGrobidDiskWorker +from .db import SandcrawlerPostgrestClient, SandcrawlerPostgresClient diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py index eb1a922..1a47b0b 100644 --- a/python/sandcrawler/db.py +++ b/python/sandcrawler/db.py @@ -9,7 +9,7 @@ import requests class SandcrawlerPostgrestClient: def __init__(self, api_url="http://aitio.us.archive.org:3030", **kwargs): - self.api_uri = api_url + self.api_url = api_url def get_cdx(self, url): resp = requests.get(self.api_url + "/cdx", params=dict(url='eq.'+url)) @@ -34,6 +34,15 @@ class SandcrawlerPostgrestClient: else: return None + def get_ingest_file_result(self, url): + resp = requests.get(self.api_url + "/ingest_file_result", params=dict(base_url='eq.'+url)) + resp.raise_for_status() + resp = resp.json() + if resp: + return resp[0] + else: + return None + class SandcrawlerPostgresClient: def __init__(self, db_url, **kwargs): |