diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-07 17:24:26 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-30 13:27:20 -0700 |
commit | 8dea77ade1c7c72c25bfb2cd9907c84840206ce6 (patch) | |
tree | 4145eb16bca106b679fb007b7f8d0acdf05534a5 | |
parent | 36bfdda186151972167c35cd0f642bc0c2ca1a54 (diff) | |
download | fatcat-8dea77ade1c7c72c25bfb2cd9907c84840206ce6.tar.gz fatcat-8dea77ade1c7c72c25bfb2cd9907c84840206ce6.zip |
initial code to handle multiple KBART spans better
-rwxr-xr-x | extra/journal_metadata/parse_merge_metadata.py | 66 |
1 file changed, 64 insertions, 2 deletions
# From extra/journal_metadata/parse_merge_metadata.py: the two helpers added
# by this commit, re-flowed from the collapsed diff text.
#
# NOTE(review): the same diff also shows the tail of test_gaps(), the head of
# class Munger, and a hunk inside a Munger method that calls
# merge_spans(old_spans, new_spans) with new_spans given as a single flat
# [start, end] pair. Those definitions are only partially visible in the diff,
# so they are not reproduced here.


def merge_spans(old, new):
    """Merge year spans into a sorted list of non-overlapping spans.

    A span is an inclusive ``[start_year, end_year]`` pair. Spans that
    overlap or sit directly next to each other (``end + 1 == start``) are
    fused into one span.

    Args:
        old: list of spans collected so far; may be empty or ``None``.
        new: spans to add. Either a single flat ``[start, end]`` pair or a
            list of such pairs; may be empty.

    Returns:
        A new list of ``[start, end]`` lists ordered by start year. If
        ``new`` is empty, ``old`` itself is returned untouched (or ``[]``
        when ``old`` is empty or ``None``).

    Spans whose start is greater than their end cover no years and are
    ignored. Neither ``old`` nor ``new`` is modified.
    """
    if not new:
        return old if old else []

    # A flat [start, end] pair is promoted to a one-element list of spans so
    # both calling conventions go through the same code path.
    if not isinstance(new[0], (list, tuple)):
        new = [new]

    # Copy each pair so the caller's inner lists are never mutated below.
    candidates = sorted(
        [span[0], span[1]]
        for span in list(old or []) + list(new)
        if span[0] <= span[1]
    )

    merged = []
    for start, end in candidates:
        # "+ 1" treats consecutive years (e.g. ...1999 and 2000...) as one
        # continuous run rather than two separate spans.
        if merged and start <= merged[-1][1] + 1:
            if end > merged[-1][1]:
                merged[-1][1] = end
        else:
            merged.append([start, end])
    return merged


def test_merge_spans():
    """Exercise merge_spans with flat pairs, span lists and empty input."""
    assert merge_spans([[5, 10]], [10, 20]) == \
        [[5, 20]]
    assert merge_spans([], []) == \
        []
    assert merge_spans([[9, 11]], []) == \
        [[9, 11]]
    assert merge_spans([[2000, 2000]], [1450, 1900]) == \
        [[1450, 1900], [2000, 2000]]
    # a list of spans is accepted as well as a single flat pair
    assert merge_spans([[5, 10]], [[11, 20], [30, 40]]) == \
        [[5, 20], [30, 40]]
    # a trailing isolated year must not be lost
    assert merge_spans([[1, 2]], [5, 5]) == \
        [[1, 2], [5, 5]]
    # the caller's list is left alone
    previous = [[5, 10]]
    merge_spans(previous, [20, 30])
    assert previous == [[5, 10]]