make fmt (black 21.9b0)

author: Bryan Newbold <bnewbold@archive.org> 2021-10-27 18:50:17 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-10-27 18:50:17 -0700
commit: 826c7538e091fac14d987a3cd654975da964e240 (patch)
tree: 90345b4cabb461c624ca5a218c2fc01dce3055cd /python/scripts/filter_groupworks.py
parent: 020037d4714e7ba2ab172c7278494aed0b2148ad (diff)
download: sandcrawler-826c7538e091fac14d987a3cd654975da964e240.tar.gz sandcrawler-826c7538e091fac14d987a3cd654975da964e240.zip
1 file changed, 20 insertions, 20 deletions
diff --git a/python/scripts/filter_groupworks.py b/python/scripts/filter_groupworks.py
index fda9098..87dae16 100755
--- a/python/scripts/filter_groupworks.py
+++ b/python/scripts/filter_groupworks.py
@@ -31,15 +31,15 @@ REQUIRE_AUTHORS = False
 
 def tokenize(s, remove_whitespace=False):
 
-    s.replace('&apos;', "'")
+    s.replace("&apos;", "'")
     # Remove non-alphanumeric characters
-    s = ''.join([c for c in s.lower() if c.isalnum() or c.isspace()])
+    s = "".join([c for c in s.lower() if c.isalnum() or c.isspace()])
 
     if remove_whitespace:
-        s = ''.join(s.split())
+        s = "".join(s.split())
 
     # Encode as dumb ASCII (TODO: this is horrible)
-    return s.encode('ascii', 'replace').replace(b'?', b'')
+    return s.encode("ascii", "replace").replace(b"?", b"")
 
 
 def check_authors(left, right):
@@ -53,7 +53,7 @@ def check_authors(left, right):
         return False
     right_all = tokenize(" ".join(right))
     for i in range(len(left)):
-        l = left[i].lower().replace('jr.', '').split()
+        l = left[i].lower().replace("jr.", "").split()
         if not l:
             return False
         l = tokenize(l[-1])
@@ -61,21 +61,21 @@ def check_authors(left, right):
             # weird author name (single char)
             return False
         if l not in right_all:
-            #print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))
+            # print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))
             return False
     return True
 
 
 def test_check_authors():
     assert check_authors([], []) == bool(not REQUIRE_AUTHORS)
-    assert not check_authors([], ['one'])
-    assert check_authors(['one'], ['one'])
-    assert check_authors(['one two'], ['One Two'])
-    assert check_authors(['two'], ['One Two'])
-    assert check_authors(['two'], ['two, one'])
-    assert check_authors(['mago'], ['Mr. Magoo'])
-    assert check_authors(['Mr. Magoo'], ['Mr Magoo'])
-    assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three'])
+    assert not check_authors([], ["one"])
+    assert check_authors(["one"], ["one"])
+    assert check_authors(["one two"], ["One Two"])
+    assert check_authors(["two"], ["One Two"])
+    assert check_authors(["two"], ["two, one"])
+    assert check_authors(["mago"], ["Mr. Magoo"])
+    assert check_authors(["Mr. Magoo"], ["Mr Magoo"])
+    assert check_authors(["one", "tw", "thr"], ["one", "two", "three"])
 
 
 # Rows are (score, left, right)
@@ -90,10 +90,10 @@ def process_group(rows):
         left = json.loads(row[1])
         right = json.loads(row[2])
         # authors must roughly match
-        if not check_authors(left['authors'], right['authors']):
+        if not check_authors(left["authors"], right["authors"]):
             continue
         # years must match (if defined)
-        if left['year'] and right['year'] and left['year'] != right['year']:
+        if left["year"] and right["year"] and left["year"] != right["year"]:
             continue
         filtered.append((left, right))
 
@@ -105,8 +105,8 @@ def process_group(rows):
     group_ids = set()
     for row in filtered[1:]:
         (left, right) = row
-        l_id = left['fatcat_release']
-        r_id = right['fatcat_release']
+        l_id = left["fatcat_release"]
+        r_id = right["fatcat_release"]
         releases[l_id] = left
         releases[r_id] = right
         if not group_ids:
@@ -131,7 +131,7 @@ def run():
 
     # group lines by slug, and process in batches
     for line in sys.stdin:
-        line = line.strip().split('\t')
+        line = line.strip().split("\t")
         assert len(line) == 4
         slug = line[0]
         if last_slug and slug != last_slug and lines:
@@ -146,5 +146,5 @@ def run():
         process_group(lines)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     run()
author	Bryan Newbold <bnewbold@archive.org>	2021-10-27 18:50:17 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2021-10-27 18:50:17 -0700
commit	826c7538e091fac14d987a3cd654975da964e240 (patch)
tree	90345b4cabb461c624ca5a218c2fc01dce3055cd /python/scripts/filter_groupworks.py
parent	020037d4714e7ba2ab172c7278494aed0b2148ad (diff)
download	sandcrawler-826c7538e091fac14d987a3cd654975da964e240.tar.gz sandcrawler-826c7538e091fac14d987a3cd654975da964e240.zip