aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--python/sandcrawler/__init__.py1
-rw-r--r--python/sandcrawler/db.py11
2 files changed, 11 insertions, 1 deletions
diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
index 699126f..2d28829 100644
--- a/python/sandcrawler/__init__.py
+++ b/python/sandcrawler/__init__.py
@@ -5,4 +5,5 @@ from .workers import KafkaSink, KafkaGrobidSink, JsonLinePusher, CdxLinePusher,
from .ia import WaybackClient, WaybackError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, PetaboxError, ResourceResult, WarcResource, CdxPartial, CdxRow
from .ingest import IngestFileWorker
from .persist import PersistCdxWorker, PersistIngestFileResultWorker, PersistGrobidWorker, PersistGrobidDiskWorker
+from .db import SandcrawlerPostgrestClient, SandcrawlerPostgresClient
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index eb1a922..1a47b0b 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -9,7 +9,7 @@ import requests
class SandcrawlerPostgrestClient:
def __init__(self, api_url="http://aitio.us.archive.org:3030", **kwargs):
- self.api_uri = api_url
+ self.api_url = api_url
def get_cdx(self, url):
resp = requests.get(self.api_url + "/cdx", params=dict(url='eq.'+url))
@@ -34,6 +34,15 @@ class SandcrawlerPostgrestClient:
else:
return None
+ def get_ingest_file_result(self, url):
+ resp = requests.get(self.api_url + "/ingest_file_result", params=dict(base_url='eq.'+url))
+ resp.raise_for_status()
+ resp = resp.json()
+ if resp:
+ return resp[0]
+ else:
+ return None
+
class SandcrawlerPostgresClient:
def __init__(self, db_url, **kwargs):