about | summary | refs | log | tree | commit | diff | stats
path: root/python/fatcat_tools/importers
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2019-10-08 16:11:09 -0700
committer Bryan Newbold <bnewbold@robocracy.org> 2019-10-08 16:11:09 -0700
commit b9279efacdee9bf8038203d6efe9dc105cc4dce3 (patch)
tree e1c40cebb0ea688182b5d8f1dcf5fbd02c3fa4c0 /python/fatcat_tools/importers
parent 4b7c3c7b317cf4793f5ba5ad0d96102f103b66a3 (diff)
download fatcat-b9279efacdee9bf8038203d6efe9dc105cc4dce3.tar.gz
fatcat-b9279efacdee9bf8038203d6efe9dc105cc4dce3.zip
refactor duplicated b32_hex function in importers
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r-- python/fatcat_tools/importers/arabesque.py 12
-rw-r--r-- python/fatcat_tools/importers/common.py 9
-rwxr-xr-x python/fatcat_tools/importers/wayback_static.py 11
3 files changed, 11 insertions, 21 deletions
diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py
index 65597d5c..7017c56c 100644
--- a/python/fatcat_tools/importers/arabesque.py
+++ b/python/fatcat_tools/importers/arabesque.py
@@ -1,20 +1,10 @@
import sys
import json
-import base64
import sqlite3
import itertools
import fatcat_openapi_client
-from .common import EntityImporter, clean, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS
-
-
-def b32_hex(s):
- s = s.strip().split()[0].lower()
- if s.startswith("sha1:"):
- s = s[5:]
- if len(s) != 32:
- return s
- return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
+from .common import EntityImporter, clean, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS, b32_hex
ARABESQUE_MATCH_WHERE_CLAUSE='WHERE hit = 1 AND identifier IS NOT NULL'
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index a25c3196..74595790 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -4,6 +4,7 @@ import sys
import csv
import json
import ftfy
+import base64
import sqlite3
import subprocess
import unicodedata
@@ -141,6 +142,14 @@ def test_clean():
assert clean('<b>a&amp;b</b>') == '<b>a&amp;b</b>'
assert clean('<b>a&amp;b</b>', force_xml=True) == '<b>a&b</b>'
+def b32_hex(s):
+ s = s.strip().split()[0].lower()
+ if s.startswith("sha1:"):
+ s = s[5:]
+ if len(s) != 32:
+ return s
+ return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
+
def is_cjk(s):
if not s:
return False
diff --git a/python/fatcat_tools/importers/wayback_static.py b/python/fatcat_tools/importers/wayback_static.py
index fa0bd4ef..5b23347f 100755
--- a/python/fatcat_tools/importers/wayback_static.py
+++ b/python/fatcat_tools/importers/wayback_static.py
@@ -8,7 +8,6 @@ Works as a stand-alone script (for debugging) or as library routines.
import sys
import json
-import base64
import hashlib
import requests
import datetime
@@ -17,21 +16,13 @@ import subprocess
from bs4 import BeautifulSoup
from fatcat_openapi_client import *
+from .common import b32_hex
CDX_API_BASE = "https://web.archive.org/cdx/search/cdx"
GWB_URL_BASE = "https://web.archive.org/web"
REQ_SESSION = requests.Session()
-def b32_hex(s):
- """copy/pasta from elsewhere"""
- s = s.strip().split()[0].lower()
- if s.startswith("sha1:"):
- s = s[5:]
- if len(s) != 32:
- return s
- return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
-
def parse_wbm_url(url):
"""Takes a wayback machine URL, and returns a tuple: