aboutsummaryrefslogtreecommitdiffstats
path: root/python/scripts/filter_groupworks.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/scripts/filter_groupworks.py')
-rwxr-xr-xpython/scripts/filter_groupworks.py48
1 files changed, 27 insertions, 21 deletions
diff --git a/python/scripts/filter_groupworks.py b/python/scripts/filter_groupworks.py
index bbba770..87dae16 100755
--- a/python/scripts/filter_groupworks.py
+++ b/python/scripts/filter_groupworks.py
@@ -18,8 +18,8 @@ Note: the actual importer/merger should filter the following patterns out:
- dates differ (not just year)
"""
-import sys
import json
+import sys
# out of 1000
SCORE_THRESHOLD = 900
@@ -28,17 +28,19 @@ MAX_SLUG_LINES = 50
REQUIRE_AUTHORS = False
+
def tokenize(s, remove_whitespace=False):
- s.replace(''', "'")
+ s.replace("'", "'")
# Remove non-alphanumeric characters
- s = ''.join([c for c in s.lower() if c.isalnum() or c.isspace()])
+ s = "".join([c for c in s.lower() if c.isalnum() or c.isspace()])
if remove_whitespace:
- s = ''.join(s.split())
+ s = "".join(s.split())
# Encode as dumb ASCII (TODO: this is horrible)
- return s.encode('ascii', 'replace').replace(b'?', b'')
+ return s.encode("ascii", "replace").replace(b"?", b"")
+
def check_authors(left, right):
"""
@@ -51,7 +53,7 @@ def check_authors(left, right):
return False
right_all = tokenize(" ".join(right))
for i in range(len(left)):
- l = left[i].lower().replace('jr.', '').split()
+ l = left[i].lower().replace("jr.", "").split()
if not l:
return False
l = tokenize(l[-1])
@@ -59,20 +61,22 @@ def check_authors(left, right):
# weird author name (single char)
return False
if l not in right_all:
- #print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))
+ # print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))
return False
return True
+
def test_check_authors():
assert check_authors([], []) == bool(not REQUIRE_AUTHORS)
- assert not check_authors([], ['one'])
- assert check_authors(['one'], ['one'])
- assert check_authors(['one two'], ['One Two'])
- assert check_authors(['two'], ['One Two'])
- assert check_authors(['two'], ['two, one'])
- assert check_authors(['mago'], ['Mr. Magoo'])
- assert check_authors(['Mr. Magoo'], ['Mr Magoo'])
- assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three'])
+ assert not check_authors([], ["one"])
+ assert check_authors(["one"], ["one"])
+ assert check_authors(["one two"], ["One Two"])
+ assert check_authors(["two"], ["One Two"])
+ assert check_authors(["two"], ["two, one"])
+ assert check_authors(["mago"], ["Mr. Magoo"])
+ assert check_authors(["Mr. Magoo"], ["Mr Magoo"])
+ assert check_authors(["one", "tw", "thr"], ["one", "two", "three"])
+
# Rows are (score, left, right)
def process_group(rows):
@@ -86,10 +90,10 @@ def process_group(rows):
left = json.loads(row[1])
right = json.loads(row[2])
# authors must roughly match
- if not check_authors(left['authors'], right['authors']):
+ if not check_authors(left["authors"], right["authors"]):
continue
# years must match (if defined)
- if left['year'] and right['year'] and left['year'] != right['year']:
+ if left["year"] and right["year"] and left["year"] != right["year"]:
continue
filtered.append((left, right))
@@ -101,8 +105,8 @@ def process_group(rows):
group_ids = set()
for row in filtered[1:]:
(left, right) = row
- l_id = left['fatcat_release']
- r_id = right['fatcat_release']
+ l_id = left["fatcat_release"]
+ r_id = right["fatcat_release"]
releases[l_id] = left
releases[r_id] = right
if not group_ids:
@@ -119,6 +123,7 @@ def process_group(rows):
print(json.dumps([releases[ident] for ident in group_ids]))
+
def run():
last_slug = None
@@ -126,7 +131,7 @@ def run():
# group lines by slug, and process in batches
for line in sys.stdin:
- line = line.strip().split('\t')
+ line = line.strip().split("\t")
assert len(line) == 4
slug = line[0]
if last_slug and slug != last_slug and lines:
@@ -140,5 +145,6 @@ def run():
if lines:
process_group(lines)
-if __name__=='__main__':
+
+if __name__ == "__main__":
run()