aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers/common.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2019-05-15 12:02:55 -0700
committer Bryan Newbold <bnewbold@robocracy.org> 2019-05-21 11:41:29 -0700
commit 82c6c4ee8f27e5f7e0d4b3f39b2cf2a9ffcc667a (patch)
tree 930331468462a74873aebb44b88c051e8b096c4f /python/fatcat_tools/importers/common.py
parent 4cff530fa3a49e845a2c21bbc85d74a92a3e2b06 (diff)
download fatcat-82c6c4ee8f27e5f7e0d4b3f39b2cf2a9ffcc667a.tar.gz
fatcat-82c6c4ee8f27e5f7e0d4b3f39b2cf2a9ffcc667a.zip
initial flesh out of JALC parser
Diffstat (limited to 'python/fatcat_tools/importers/common.py')
-rw-r--r-- python/fatcat_tools/importers/common.py 36
1 files changed, 36 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 282f775c..7fca38cf 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -7,13 +7,16 @@ import ftfy
import sqlite3
import itertools
import subprocess
+import unicodedata
from collections import Counter
import pykafka
+from bs4 import BeautifulSoup
import fatcat_client
from fatcat_client.rest import ApiException
+DATE_FMT = "%Y-%m-%d"
SANE_MAX_RELEASES = 200
SANE_MAX_URLS = 100
@@ -52,6 +55,23 @@ def test_clean():
assert clean('<b>a&amp;b</b>') == '<b>a&amp;b</b>'
assert clean('<b>a&amp;b</b>', force_xml=True) == '<b>a&b</b>'
def is_cjk(s):
    """
    Returns True if the string contains at least one CJK character.

    "CJK" here means any character whose Unicode name starts with one of:
    CJK (Han ideographs, radicals, etc), HIRAGANA, KATAKANA, HANGUL, or the
    halfwidth katakana/hangul forms.

    Every character is inspected, not just the first, so mixed strings like
    'english with ひヒ' are detected.

    s: string to check; None and empty string are allowed

    Returns a bool. None, empty strings, and strings with no CJK characters
    all return False.
    """
    if not s:
        return False
    # str.startswith() accepts a tuple, so one call covers every script
    cjk_name_prefixes = (
        "CJK",
        "HIRAGANA",
        "KATAKANA",
        "HANGUL",
        "HALFWIDTH KATAKANA",
        "HALFWIDTH HANGUL",
    )
    # Pass a default to unicodedata.name(): unnamed code points (eg, control
    # characters like newline) would otherwise raise ValueError.
    return any(
        unicodedata.name(c, "").startswith(cjk_name_prefixes) for c in s
    )
+
def test_is_cjk():
    """
    Checks is_cjk() against empty, Latin, Han, kana, and hangul inputs.

    Uses identity comparison ("is True" / "is False") instead of "==", so a
    truthy or falsy non-bool return value (eg, 0 or 1) would fail the test.
    """
    # empty-ish and plain Latin inputs
    assert is_cjk(None) is False
    assert is_cjk('') is False
    assert is_cjk('blah') is False
    # Han ideographs, with and without ASCII separators
    assert is_cjk('岡, 鹿, 梨, 阜, 埼') is True
    assert is_cjk('菊') is True
    # hiragana + katakana, including after a Latin prefix
    assert is_cjk('ひヒ') is True
    assert is_cjk('english with ひヒ') is True
    # kana with a combining mark in the middle
    assert is_cjk('き゚ゅ') is True
    assert is_cjk('水道') is True
    # hangul letters
    assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') is True
+
DOMAIN_REL_MAP = {
"archive.org": "archive",
# LOCKSS, Portico, DuraSpace, etc would also be "archive"
@@ -456,6 +476,22 @@ class SqlitePusher(RecordPusher):
return counts
class Bs4XmlFilePusher(RecordPusher):
    """
    Pusher which parses XML with BeautifulSoup and feeds every element with a
    given tag name to an importer.
    """

    def __init__(self, importer, xml_file, record_tag, **kwargs):
        """
        importer: object which receives records via push_record() and is
            finalized via finish()
        xml_file: XML input, handed to BeautifulSoup as-is (presumably an
            open file handle or a string of markup; confirm against callers)
        record_tag: tag name of the elements to push as records

        Any extra keyword arguments are accepted and ignored.
        """
        self.record_tag = record_tag
        self.xml_file = xml_file
        self.importer = importer

    def run(self):
        """
        Parses the input with the "xml" parser, pushes each matching element
        to the importer, then finishes the importer.

        Prints and returns whatever importer.finish() returned.
        """
        parsed = BeautifulSoup(self.xml_file, "xml")
        elements = parsed.find_all(self.record_tag)
        for element in elements:
            self.importer.push_record(element)
        totals = self.importer.finish()
        print(totals)
        return totals
+
+
class KafkaJsonPusher(RecordPusher):
def __init__(self, importer, kafka_hosts, kafka_env, topic_suffix, group, **kwargs):