diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-15 12:02:55 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-21 11:41:29 -0700 |
commit | 82c6c4ee8f27e5f7e0d4b3f39b2cf2a9ffcc667a (patch) | |
tree | 930331468462a74873aebb44b88c051e8b096c4f /python/fatcat_tools/importers/common.py | |
parent | 4cff530fa3a49e845a2c21bbc85d74a92a3e2b06 (diff) | |
download | fatcat-82c6c4ee8f27e5f7e0d4b3f39b2cf2a9ffcc667a.tar.gz fatcat-82c6c4ee8f27e5f7e0d4b3f39b2cf2a9ffcc667a.zip |
initial flesh out of JALC parser
Diffstat (limited to 'python/fatcat_tools/importers/common.py')
-rw-r--r-- | python/fatcat_tools/importers/common.py | 36 |
1 file changed, 36 insertions, 0 deletions
```diff
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 282f775c..7fca38cf 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -7,13 +7,16 @@ import ftfy
 import sqlite3
 import itertools
 import subprocess
+import unicodedata
 from collections import Counter
 import pykafka
+from bs4 import BeautifulSoup
 
 import fatcat_client
 from fatcat_client.rest import ApiException
 
 
+DATE_FMT = "%Y-%m-%d"
 SANE_MAX_RELEASES = 200
 SANE_MAX_URLS = 100
 
@@ -52,6 +55,23 @@ def test_clean():
     assert clean('<b>a&b</b>') == '<b>a&b</b>'
     assert clean('<b>a&b</b>', force_xml=True) == '<b>a&b</b>'
 
+def is_cjk(s):
+    if not s:
+        return False
+    return unicodedata.name(s[0]).startswith("CJK")
+
+def test_is_cjk():
+    assert is_cjk(None) == False
+    assert is_cjk('') == False
+    assert is_cjk('blah') == False
+    assert is_cjk('岡, 鹿, 梨, 阜, 埼') == True
+    assert is_cjk('菊') == True
+    assert is_cjk('ひヒ') == True
+    assert is_cjk('english with ひヒ') == True
+    assert is_cjk('き゚ゅ') == True
+    assert is_cjk('水道') == True
+    assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') == True
+
 DOMAIN_REL_MAP = {
     "archive.org": "archive",
     # LOCKSS, Portico, DuraSpace, etc would also be "archive"
@@ -456,6 +476,22 @@ class SqlitePusher(RecordPusher):
         return counts
 
 
+class Bs4XmlFilePusher(RecordPusher):
+
+    def __init__(self, importer, xml_file, record_tag, **kwargs):
+        self.importer = importer
+        self.xml_file = xml_file
+        self.record_tag = record_tag
+
+    def run(self):
+        soup = BeautifulSoup(self.xml_file, "xml")
+        for record in soup.find_all(self.record_tag):
+            self.importer.push_record(record)
+        counts = self.importer.finish()
+        print(counts)
+        return counts
+
+
 class KafkaJsonPusher(RecordPusher):
 
     def __init__(self, importer, kafka_hosts, kafka_env, topic_suffix, group, **kwargs):
```