summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2019-12-12 17:50:12 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2019-12-12 19:49:50 -0800
commit: 91662c063b088bb665b86c252ecd27be9d9083be (patch)
tree: 6613d45fbfc7b63c67adff8b4f6aa1d531eda12b
parent: 0827b67a3f195f151a77ca01708e6c98daf778bf (diff)
download: fatcat-91662c063b088bb665b86c252ecd27be9d9083be.tar.gz
download: fatcat-91662c063b088bb665b86c252ecd27be9d9083be.zip
ensure importer description arg isn't clobbered
-rw-r--r-- python/fatcat_tools/importers/arabesque.py | 3
-rw-r--r-- python/fatcat_tools/importers/common.py | 4
-rw-r--r-- python/fatcat_tools/importers/matched.py | 3
3 files changed, 5 insertions, 5 deletions
diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py
index 7017c56c..acfc2b87 100644
--- a/python/fatcat_tools/importers/arabesque.py
+++ b/python/fatcat_tools/importers/arabesque.py
@@ -42,8 +42,7 @@ class ArabesqueMatchImporter(EntityImporter):
def __init__(self, api, extid_type, require_grobid=True, **kwargs):
- eg_desc = kwargs.get('editgroup_description',
- "Match web crawl files to releases based on identifier/URL seedlist")
+ eg_desc = kwargs.get('editgroup_description', None) or "Match web crawl files to releases based on identifier/URL seedlist"
eg_extra = kwargs.get('editgroup_extra', dict())
eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ArabesqueMatchImporter')
if kwargs.get('crawl_id'):
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 4a3cd648..13b1e5b8 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -6,6 +6,7 @@ import json
import ftfy
import base64
import sqlite3
+import datetime
import subprocess
import unicodedata
from collections import Counter
@@ -756,10 +757,11 @@ class KafkaJsonPusher(RecordPusher):
print("... got {} kafka messages ({}sec poll interval)".format(
len(batch), self.poll_interval))
if not batch:
- if datetime.datetime.now() - last_push > datetime.timedelta(minutes=5):
+ if datetime.datetime.now() - last_push > datetime.timedelta(seconds=30): #XXX minutes=5
# it has been some time, so flush any current editgroup
self.importer.finish()
last_push = datetime.datetime.now()
+ #print("Flushed any partial import batch: {}".format(self.importer.counts))
continue
# first check errors on entire batch...
for msg in batch:
diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py
index dbb78ec9..180d7ba3 100644
--- a/python/fatcat_tools/importers/matched.py
+++ b/python/fatcat_tools/importers/matched.py
@@ -35,8 +35,7 @@ class MatchedImporter(EntityImporter):
def __init__(self, api, **kwargs):
- eg_desc = kwargs.pop('editgroup_description',
- "Import of large-scale file-to-release match results. Source of metadata varies.")
+ eg_desc = kwargs.pop('editgroup_description', None) or "Import of large-scale file-to-release match results. Source of metadata varies."
eg_extra = kwargs.pop('editgroup_extra', dict())
eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.MatchedImporter')
super().__init__(api,