diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2019-10-08 16:11:09 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-10-08 16:11:09 -0700 | 
| commit | b9279efacdee9bf8038203d6efe9dc105cc4dce3 (patch) | |
| tree | e1c40cebb0ea688182b5d8f1dcf5fbd02c3fa4c0 /python/fatcat_tools/importers | |
| parent | 4b7c3c7b317cf4793f5ba5ad0d96102f103b66a3 (diff) | |
| download | fatcat-b9279efacdee9bf8038203d6efe9dc105cc4dce3.tar.gz fatcat-b9279efacdee9bf8038203d6efe9dc105cc4dce3.zip | |
refactor duplicated b32_hex function in importers
Diffstat (limited to 'python/fatcat_tools/importers')
| -rw-r--r-- | python/fatcat_tools/importers/arabesque.py | 12 | ||||
| -rw-r--r-- | python/fatcat_tools/importers/common.py | 9 | ||||
| -rwxr-xr-x | python/fatcat_tools/importers/wayback_static.py | 11 | 
3 files changed, 11 insertions, 21 deletions
| diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py index 65597d5c..7017c56c 100644 --- a/python/fatcat_tools/importers/arabesque.py +++ b/python/fatcat_tools/importers/arabesque.py @@ -1,20 +1,10 @@  import sys  import json -import base64  import sqlite3  import itertools  import fatcat_openapi_client -from .common import EntityImporter, clean, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS - - -def b32_hex(s): -    s = s.strip().split()[0].lower() -    if s.startswith("sha1:"): -        s = s[5:] -    if len(s) != 32: -        return s -    return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8') +from .common import EntityImporter, clean, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS, b32_hex  ARABESQUE_MATCH_WHERE_CLAUSE='WHERE hit = 1 AND identifier IS NOT NULL' diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index a25c3196..74595790 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -4,6 +4,7 @@ import sys  import csv  import json  import ftfy +import base64  import sqlite3  import subprocess  import unicodedata @@ -141,6 +142,14 @@ def test_clean():      assert clean('<b>a&b</b>') == '<b>a&b</b>'      assert clean('<b>a&b</b>', force_xml=True) == '<b>a&b</b>' +def b32_hex(s): +    s = s.strip().split()[0].lower() +    if s.startswith("sha1:"): +        s = s[5:] +    if len(s) != 32: +        return s +    return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8') +  def is_cjk(s):      if not s:          return False diff --git a/python/fatcat_tools/importers/wayback_static.py b/python/fatcat_tools/importers/wayback_static.py index fa0bd4ef..5b23347f 100755 --- a/python/fatcat_tools/importers/wayback_static.py +++ b/python/fatcat_tools/importers/wayback_static.py @@ -8,7 +8,6 @@ Works as a stand-alone script (for debugging) or as library routines.  import sys  import json -import base64  import hashlib  import requests  import datetime @@ -17,21 +16,13 @@ import subprocess  from bs4 import BeautifulSoup  from fatcat_openapi_client import * +from .common import b32_hex  CDX_API_BASE = "https://web.archive.org/cdx/search/cdx"  GWB_URL_BASE = "https://web.archive.org/web"  REQ_SESSION = requests.Session() -def b32_hex(s): -    """copy/pasta from elsewhere""" -    s = s.strip().split()[0].lower() -    if s.startswith("sha1:"): -        s = s[5:] -    if len(s) != 32: -        return s -    return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8') -  def parse_wbm_url(url):      """Takes a wayback machine URL, and returns a tuple: | 
