about summary refs log tree commit diff stats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-01-14 17:00:59 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-01-14 17:00:59 -0800
commit 91e6b33a4733fbe622ce0e09460a75cd377bee7a (patch)
tree 8b9f5ee6773de2ff9bf6ece77b73425c8437531a /python
parent 2bf0095335203d200370e23922a6ff38ac98201c (diff)
download sandcrawler-91e6b33a4733fbe622ce0e09460a75cd377bee7a.tar.gz
sandcrawler-91e6b33a4733fbe622ce0e09460a75cd377bee7a.zip
small fixups to SandcrawlerPostgrestClient
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/__init__.py 1
-rw-r--r-- python/sandcrawler/db.py 11
2 files changed, 11 insertions, 1 deletions
diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
index 699126f..2d28829 100644
--- a/python/sandcrawler/__init__.py
+++ b/python/sandcrawler/__init__.py
@@ -5,4 +5,5 @@ from .workers import KafkaSink, KafkaGrobidSink, JsonLinePusher, CdxLinePusher,
from .ia import WaybackClient, WaybackError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, PetaboxError, ResourceResult, WarcResource, CdxPartial, CdxRow
from .ingest import IngestFileWorker
from .persist import PersistCdxWorker, PersistIngestFileResultWorker, PersistGrobidWorker, PersistGrobidDiskWorker
+from .db import SandcrawlerPostgrestClient, SandcrawlerPostgresClient
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index eb1a922..1a47b0b 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -9,7 +9,7 @@ import requests
class SandcrawlerPostgrestClient:
def __init__(self, api_url="http://aitio.us.archive.org:3030", **kwargs):
- self.api_uri = api_url
+ self.api_url = api_url
def get_cdx(self, url):
resp = requests.get(self.api_url + "/cdx", params=dict(url='eq.'+url))
@@ -34,6 +34,15 @@ class SandcrawlerPostgrestClient:
else:
return None
+ def get_ingest_file_result(self, url):
+ resp = requests.get(self.api_url + "/ingest_file_result", params=dict(base_url='eq.'+url))
+ resp.raise_for_status()
+ resp = resp.json()
+ if resp:
+ return resp[0]
+ else:
+ return None
+
class SandcrawlerPostgresClient:
def __init__(self, db_url, **kwargs):