diff options
Diffstat (limited to 'chocula/util.py')
-rw-r--r-- | chocula/util.py | 237 |
1 file changed, 119 insertions(+), 118 deletions(-)
diff --git a/chocula/util.py b/chocula/util.py
index 2cb771d..11303b8 100644
--- a/chocula/util.py
+++ b/chocula/util.py
@@ -1,4 +1,3 @@
-
 import sys
 from dataclasses import dataclass
 from typing import Dict, Optional
@@ -11,119 +10,120 @@ import pycountry
 # NOTE: this is a partial list, focusing on non-publisher hosted platforms and
 # software frameworks
 PLATFORM_MAP = {
-    'OJS': 'ojs',
-    'OJS SEER': 'ojs',
-    'Open Journal System/OJS': 'ojs',
-    'BMC': 'bmc',
-    'SciELO Brazil': 'scielo',
-    'SciELO Argentina': 'scielo',
-    'SciELO': 'scielo',
-    'SciELO Mexico': 'scielo',
-    'SciELO Spain': 'scielo',
-    'SciELO Portugal': 'scielo',
-    'WordPress': 'wordpress',
-    'Sciendo': 'sciendo',
-    'Drupal': 'drupal',
-    'revues.org': 'openedition',
+    "OJS": "ojs",
+    "OJS SEER": "ojs",
+    "Open Journal System/OJS": "ojs",
+    "BMC": "bmc",
+    "SciELO Brazil": "scielo",
+    "SciELO Argentina": "scielo",
+    "SciELO": "scielo",
+    "SciELO Mexico": "scielo",
+    "SciELO Spain": "scielo",
+    "SciELO Portugal": "scielo",
+    "WordPress": "wordpress",
+    "Sciendo": "sciendo",
+    "Drupal": "drupal",
+    "revues.org": "openedition",
 }
 
 MIMETYPE_MAP = {
-    'PDF': 'application/pdf',
-    'HTML': 'text/html',
-    'XML': 'application/xml',
+    "PDF": "application/pdf",
+    "HTML": "text/html",
+    "XML": "application/xml",
 }
 
 BIG5_PUBLISHERS = [
-    'Elsevier',
-    'Informa UK (Taylor & Francis)',
-    'Springer-Verlag',
-    'SAGE Publications',
-    'Wiley (Blackwell Publishing)',
-    'Wiley (John Wiley & Sons)',
-    'Springer (Biomed Central Ltd.)',
-    'Springer Nature',
+    "Elsevier",
+    "Informa UK (Taylor & Francis)",
+    "Springer-Verlag",
+    "SAGE Publications",
+    "Wiley (Blackwell Publishing)",
+    "Wiley (John Wiley & Sons)",
+    "Springer (Biomed Central Ltd.)",
+    "Springer Nature",
 ]
 
 COMMERCIAL_PUBLISHERS = [
-    'Peter Lang International Academic Publishers',
-    'Walter de Gruyter GmbH',
-    'Oldenbourg Wissenschaftsverlag',
-    'Georg Thieme Verlag KG', # not springer
-    'Emerald (MCB UP )',
-    'Medknow Publications',
-    'Inderscience Enterprises Ltd',
-    'Bentham Science',
-    'Ovid Technologies (Wolters Kluwer) - Lippincott Williams & Wilkins',
-    'Scientific Research Publishing, Inc',
-    'MDPI AG',
-    'S. Karger AG',
-    'Pleiades Publishing',
-    'Science Publishing Group',
-    'IGI Global',
-    'The Economist Intelligence Unit',
-    'Maney Publishing',
-    'Diva Enterprises Private Limited',
-    'World Scientific',
-    'Mary Ann Liebert',
-    'Trans Tech Publications',
+    "Peter Lang International Academic Publishers",
+    "Walter de Gruyter GmbH",
+    "Oldenbourg Wissenschaftsverlag",
+    "Georg Thieme Verlag KG",  # not springer
+    "Emerald (MCB UP )",
+    "Medknow Publications",
+    "Inderscience Enterprises Ltd",
+    "Bentham Science",
+    "Ovid Technologies (Wolters Kluwer) - Lippincott Williams & Wilkins",
+    "Scientific Research Publishing, Inc",
+    "MDPI AG",
+    "S. Karger AG",
+    "Pleiades Publishing",
+    "Science Publishing Group",
+    "IGI Global",
+    "The Economist Intelligence Unit",
+    "Maney Publishing",
+    "Diva Enterprises Private Limited",
+    "World Scientific",
+    "Mary Ann Liebert",
+    "Trans Tech Publications",
 ]
 
 OA_PUBLISHERS = [
-    'Hindawi Limited',
-    'OMICS Publishing Group',
-    'De Gruyter Open Sp. z o.o.',
-    'OpenEdition',
-    'Hindawi (International Scholarly Research Network)',
-    'Public Library of Science',
-    'Frontiers Media SA',
-    'eLife Sciences Publications, Ltd',
-    'MDPI AG',
-    'Hindawi (International Scholarly Research Network)',
-    'Dove Medical Press',
-    'Open Access Text',
+    "Hindawi Limited",
+    "OMICS Publishing Group",
+    "De Gruyter Open Sp. z o.o.",
+    "OpenEdition",
+    "Hindawi (International Scholarly Research Network)",
+    "Public Library of Science",
+    "Frontiers Media SA",
+    "eLife Sciences Publications, Ltd",
+    "MDPI AG",
+    "Hindawi (International Scholarly Research Network)",
+    "Dove Medical Press",
+    "Open Access Text",
 ]
 
 SOCIETY_PUBLISHERS = [
-    'Institute of Electrical and Electronics Engineers',
-    'Institution of Electrical Engineers',
-    'Association for Computing Machinery',
-    'American Psychological Association',
-    'IOS Press',
-    'IOP Publishing',
-    'American Chemical Society',
-    'Royal Society of Chemistry (RSC)',
-    'American Geophysical Union',
-    'American College of Physicians',
-    'New England Journal of Medicine',
-    'BMJ',
-    'RCN Publishing',
-    'International Union of Crystallography',
-    'Portland Press',
-    'ASME International',
+    "Institute of Electrical and Electronics Engineers",
+    "Institution of Electrical Engineers",
+    "Association for Computing Machinery",
+    "American Psychological Association",
+    "IOS Press",
+    "IOP Publishing",
+    "American Chemical Society",
+    "Royal Society of Chemistry (RSC)",
+    "American Geophysical Union",
+    "American College of Physicians",
+    "New England Journal of Medicine",
+    "BMJ",
+    "RCN Publishing",
+    "International Union of Crystallography",
+    "Portland Press",
+    "ASME International",
 ]
 
 UNI_PRESS_PUBLISHERS = [
-    'Cambridge University Press',
-    'Oxford University Press',
-    'The University of Chicago Press',
-    'MIT Press',
+    "Cambridge University Press",
+    "Oxford University Press",
+    "The University of Chicago Press",
+    "MIT Press",
 ]
 
 ARCHIVE_PUBLISHERS = [
-    'JSTOR',
-    'Portico',
+    "JSTOR",
+    "Portico",
 ]
 
 REPOSITORY_PUBLISHERS = [
-    'PERSEE Program',
-    'Social Science Electronic Publishing',
-    'CAIRN',
-    'CSIRO Publishing',
+    "PERSEE Program",
+    "Social Science Electronic Publishing",
+    "CAIRN",
+    "CSIRO Publishing",
 ]
 
 OTHER_PUBLISHERS = [
-    'African Journals Online',
-    'Smithsonian Institution Biodiversity Heritage Library',
-    'Canadian Science Publishing',
-    'Philosophy Documentation Center',
-    'Project MUSE',
+    "African Journals Online",
+    "Smithsonian Institution Biodiversity Heritage Library",
+    "Canadian Science Publishing",
+    "Philosophy Documentation Center",
+    "Project MUSE",
 ]
 
+
 def parse_lang(s):
-    if not s or s in ('Not applicable', 'Multiple languages', 'Unknown'):
+    if not s or s in ("Not applicable", "Multiple languages", "Unknown"):
         return None
     try:
         if len(s) == 2:
@@ -138,8 +138,9 @@ def parse_lang(s):
     except AttributeError:
         return None
 
+
 def parse_country(s):
-    if not s or s in ('Unknown'):
+    if not s or s in ("Unknown"):
         return None
     try:
         if len(s) == 2:
@@ -153,12 +154,13 @@ def parse_country(s):
     else:
         return None
 
+
 def parse_mimetypes(val):
     # XXX: multiple mimetypes?
     if not val:
         return
     mimetype = None
-    if '/' in val:
+    if "/" in val:
         mimetype = val
     else:
         mimetype = MIMETYPE_MAP.get(val)
@@ -166,13 +168,14 @@ def parse_mimetypes(val):
         return None
     return [mimetype]
 
+
 def gaps_to_spans(first, last, gaps):
     if not gaps:
         return [[first, last]]
     if not (last >= first and max(gaps) < last and min(gaps) > first):
         # years seem mangled? will continue though
         print("mangled years: {}".format((first, last, gaps)), file=sys.stderr)
-    full = list(range(first, last+1))
+    full = list(range(first, last + 1))
     for missing in gaps:
         if missing in full:
             full.remove(missing)
@@ -184,7 +187,7 @@ def gaps_to_spans(first, last, gaps):
             low = year
             last = year
             continue
-        if year != last+1:
+        if year != last + 1:
             spans.append([low, last])
             low = year
             last = year
@@ -193,15 +196,17 @@ def gaps_to_spans(first, last, gaps):
         spans.append([low, last])
     return spans
 
+
 def test_gaps():
-    assert gaps_to_spans(1900, 1900, None) == \
-        [[1900, 1900]]
-    assert gaps_to_spans(1900, 1903, None) == \
-        [[1900, 1903]]
-    assert gaps_to_spans(1900, 1902, [1901]) == \
-        [[1900, 1900], [1902, 1902]]
-    assert gaps_to_spans(1950, 1970, [1955, 1956, 1965]) == \
-        [[1950, 1954], [1957, 1964], [1966, 1970]]
+    assert gaps_to_spans(1900, 1900, None) == [[1900, 1900]]
+    assert gaps_to_spans(1900, 1903, None) == [[1900, 1903]]
+    assert gaps_to_spans(1900, 1902, [1901]) == [[1900, 1900], [1902, 1902]]
+    assert gaps_to_spans(1950, 1970, [1955, 1956, 1965]) == [
+        [1950, 1954],
+        [1957, 1964],
+        [1966, 1970],
+    ]
+
 
 def merge_spans(old, new):
     if not new:
@@ -211,7 +216,7 @@ def merge_spans(old, new):
     old.extend(new)
     years = set()
     for span in old:
-        for y in range(span[0], span[1]+1):
+        for y in range(span[0], span[1] + 1):
             years.add(y)
     if not years:
         return []
@@ -240,19 +245,14 @@ def merge_spans(old, new):
         spans.append([start, last])
     return spans
 
+
 def test_merge_spans():
-    assert merge_spans([[5, 10]], [[10, 20]]) == \
-        [[5, 20]]
-    assert merge_spans([[5, 9]], [[10, 20]]) == \
-        [[5, 20]]
-    assert merge_spans([[5, 11]], [[10, 20]]) == \
-        [[5, 20]]
-    assert merge_spans([], []) == \
-        []
-    assert merge_spans([[9, 11]], []) == \
-        [[9,11]]
-    assert merge_spans([[2000, 2000]], [[1450, 1900]]) == \
-        [[1450, 1900], [2000, 2000]]
+    assert merge_spans([[5, 10]], [[10, 20]]) == [[5, 20]]
+    assert merge_spans([[5, 9]], [[10, 20]]) == [[5, 20]]
+    assert merge_spans([[5, 11]], [[10, 20]]) == [[5, 20]]
+    assert merge_spans([], []) == []
+    assert merge_spans([[9, 11]], []) == [[9, 11]]
+    assert merge_spans([[2000, 2000]], [[1450, 1900]]) == [[1450, 1900], [2000, 2000]]
 
 
 def unquote(s: str) -> str:
@@ -260,7 +260,7 @@ def unquote(s: str) -> str:
         s = s[1:]
     if s.endswith('"') or s.endswith("'"):
         s = s[:-1]
-    if s.endswith('.'):
+    if s.endswith("."):
         s = s[:-1]
     return s.strip()
 
@@ -283,6 +283,7 @@ def clean_str(s: Optional[str]) -> Optional[str]:
     s = unquote(ftfy.fix_text(s))
     return s or None
 
+
 def test_clean_str():
     assert clean_str("") is None
     assert clean_str(" ") is None
@@ -290,7 +291,6 @@ def test_clean_str():
     assert clean_str(" Bloody work.") == "Bloody work"
 
 
-
 def clean_issn(s: str) -> Optional[str]:
     s = s.strip().upper()
     if len(s) == 8:
@@ -299,6 +299,7 @@ def clean_issn(s: str) -> Optional[str]:
         return None
     return s
 
+
 def test_clean_issn():
     assert clean_issn("1234-5678") == "1234-5678"
     assert clean_issn(" 12345678") == "1234-5678"