aboutsummaryrefslogtreecommitdiffstats
path: root/chocula/util.py
diff options
context:
space:
mode:
Diffstat (limited to 'chocula/util.py')
-rw-r--r--chocula/util.py237
1 files changed, 119 insertions, 118 deletions
diff --git a/chocula/util.py b/chocula/util.py
index 2cb771d..11303b8 100644
--- a/chocula/util.py
+++ b/chocula/util.py
@@ -1,4 +1,3 @@
-
import sys
from dataclasses import dataclass
from typing import Dict, Optional
@@ -11,119 +10,120 @@ import pycountry
# NOTE: this is a partial list, focusing on non-publisher hosted platforms and
# software frameworks
PLATFORM_MAP = {
- 'OJS': 'ojs',
- 'OJS SEER': 'ojs',
- 'Open Journal System/OJS': 'ojs',
- 'BMC': 'bmc',
- 'SciELO Brazil': 'scielo',
- 'SciELO Argentina': 'scielo',
- 'SciELO': 'scielo',
- 'SciELO Mexico': 'scielo',
- 'SciELO Spain': 'scielo',
- 'SciELO Portugal': 'scielo',
- 'WordPress': 'wordpress',
- 'Sciendo': 'sciendo',
- 'Drupal': 'drupal',
- 'revues.org': 'openedition',
+ "OJS": "ojs",
+ "OJS SEER": "ojs",
+ "Open Journal System/OJS": "ojs",
+ "BMC": "bmc",
+ "SciELO Brazil": "scielo",
+ "SciELO Argentina": "scielo",
+ "SciELO": "scielo",
+ "SciELO Mexico": "scielo",
+ "SciELO Spain": "scielo",
+ "SciELO Portugal": "scielo",
+ "WordPress": "wordpress",
+ "Sciendo": "sciendo",
+ "Drupal": "drupal",
+ "revues.org": "openedition",
}
MIMETYPE_MAP = {
- 'PDF': 'application/pdf',
- 'HTML': 'text/html',
- 'XML': 'application/xml',
+ "PDF": "application/pdf",
+ "HTML": "text/html",
+ "XML": "application/xml",
}
BIG5_PUBLISHERS = [
- 'Elsevier',
- 'Informa UK (Taylor & Francis)',
- 'Springer-Verlag',
- 'SAGE Publications',
- 'Wiley (Blackwell Publishing)',
- 'Wiley (John Wiley & Sons)',
- 'Springer (Biomed Central Ltd.)',
- 'Springer Nature',
+ "Elsevier",
+ "Informa UK (Taylor & Francis)",
+ "Springer-Verlag",
+ "SAGE Publications",
+ "Wiley (Blackwell Publishing)",
+ "Wiley (John Wiley & Sons)",
+ "Springer (Biomed Central Ltd.)",
+ "Springer Nature",
]
COMMERCIAL_PUBLISHERS = [
- 'Peter Lang International Academic Publishers',
- 'Walter de Gruyter GmbH',
- 'Oldenbourg Wissenschaftsverlag',
- 'Georg Thieme Verlag KG', # not springer
- 'Emerald (MCB UP )',
- 'Medknow Publications',
- 'Inderscience Enterprises Ltd',
- 'Bentham Science',
- 'Ovid Technologies (Wolters Kluwer) - Lippincott Williams & Wilkins',
- 'Scientific Research Publishing, Inc',
- 'MDPI AG',
- 'S. Karger AG',
- 'Pleiades Publishing',
- 'Science Publishing Group',
- 'IGI Global',
- 'The Economist Intelligence Unit',
- 'Maney Publishing',
- 'Diva Enterprises Private Limited',
- 'World Scientific',
- 'Mary Ann Liebert',
- 'Trans Tech Publications',
+ "Peter Lang International Academic Publishers",
+ "Walter de Gruyter GmbH",
+ "Oldenbourg Wissenschaftsverlag",
+ "Georg Thieme Verlag KG", # not springer
+ "Emerald (MCB UP )",
+ "Medknow Publications",
+ "Inderscience Enterprises Ltd",
+ "Bentham Science",
+ "Ovid Technologies (Wolters Kluwer) - Lippincott Williams & Wilkins",
+ "Scientific Research Publishing, Inc",
+ "MDPI AG",
+ "S. Karger AG",
+ "Pleiades Publishing",
+ "Science Publishing Group",
+ "IGI Global",
+ "The Economist Intelligence Unit",
+ "Maney Publishing",
+ "Diva Enterprises Private Limited",
+ "World Scientific",
+ "Mary Ann Liebert",
+ "Trans Tech Publications",
]
OA_PUBLISHERS = [
- 'Hindawi Limited',
- 'OMICS Publishing Group',
- 'De Gruyter Open Sp. z o.o.',
- 'OpenEdition',
- 'Hindawi (International Scholarly Research Network)',
- 'Public Library of Science',
- 'Frontiers Media SA',
- 'eLife Sciences Publications, Ltd',
- 'MDPI AG',
- 'Hindawi (International Scholarly Research Network)',
- 'Dove Medical Press',
- 'Open Access Text',
+ "Hindawi Limited",
+ "OMICS Publishing Group",
+ "De Gruyter Open Sp. z o.o.",
+ "OpenEdition",
+ "Hindawi (International Scholarly Research Network)",
+ "Public Library of Science",
+ "Frontiers Media SA",
+ "eLife Sciences Publications, Ltd",
+ "MDPI AG",
+ "Hindawi (International Scholarly Research Network)",
+ "Dove Medical Press",
+ "Open Access Text",
]
SOCIETY_PUBLISHERS = [
- 'Institute of Electrical and Electronics Engineers',
- 'Institution of Electrical Engineers',
- 'Association for Computing Machinery',
- 'American Psychological Association',
- 'IOS Press',
- 'IOP Publishing',
- 'American Chemical Society',
- 'Royal Society of Chemistry (RSC)',
- 'American Geophysical Union',
- 'American College of Physicians',
- 'New England Journal of Medicine',
- 'BMJ',
- 'RCN Publishing',
- 'International Union of Crystallography',
- 'Portland Press',
- 'ASME International',
+ "Institute of Electrical and Electronics Engineers",
+ "Institution of Electrical Engineers",
+ "Association for Computing Machinery",
+ "American Psychological Association",
+ "IOS Press",
+ "IOP Publishing",
+ "American Chemical Society",
+ "Royal Society of Chemistry (RSC)",
+ "American Geophysical Union",
+ "American College of Physicians",
+ "New England Journal of Medicine",
+ "BMJ",
+ "RCN Publishing",
+ "International Union of Crystallography",
+ "Portland Press",
+ "ASME International",
]
UNI_PRESS_PUBLISHERS = [
- 'Cambridge University Press',
- 'Oxford University Press',
- 'The University of Chicago Press',
- 'MIT Press',
+ "Cambridge University Press",
+ "Oxford University Press",
+ "The University of Chicago Press",
+ "MIT Press",
]
ARCHIVE_PUBLISHERS = [
- 'JSTOR',
- 'Portico',
+ "JSTOR",
+ "Portico",
]
REPOSITORY_PUBLISHERS = [
- 'PERSEE Program',
- 'Social Science Electronic Publishing',
- 'CAIRN',
- 'CSIRO Publishing',
+ "PERSEE Program",
+ "Social Science Electronic Publishing",
+ "CAIRN",
+ "CSIRO Publishing",
]
OTHER_PUBLISHERS = [
- 'African Journals Online',
- 'Smithsonian Institution Biodiversity Heritage Library',
- 'Canadian Science Publishing',
- 'Philosophy Documentation Center',
- 'Project MUSE',
+ "African Journals Online",
+ "Smithsonian Institution Biodiversity Heritage Library",
+ "Canadian Science Publishing",
+ "Philosophy Documentation Center",
+ "Project MUSE",
]
+
def parse_lang(s):
- if not s or s in ('Not applicable', 'Multiple languages', 'Unknown'):
+ if not s or s in ("Not applicable", "Multiple languages", "Unknown"):
return None
try:
if len(s) == 2:
@@ -138,8 +138,9 @@ def parse_lang(s):
except AttributeError:
return None
+
def parse_country(s):
- if not s or s in ('Unknown'):
+ if not s or s in ("Unknown"):
return None
try:
if len(s) == 2:
@@ -153,12 +154,13 @@ def parse_country(s):
else:
return None
+
def parse_mimetypes(val):
# XXX: multiple mimetypes?
if not val:
return
mimetype = None
- if '/' in val:
+ if "/" in val:
mimetype = val
else:
mimetype = MIMETYPE_MAP.get(val)
@@ -166,13 +168,14 @@ def parse_mimetypes(val):
return None
return [mimetype]
+
def gaps_to_spans(first, last, gaps):
if not gaps:
return [[first, last]]
if not (last >= first and max(gaps) < last and min(gaps) > first):
# years seem mangled? will continue though
print("mangled years: {}".format((first, last, gaps)), file=sys.stderr)
- full = list(range(first, last+1))
+ full = list(range(first, last + 1))
for missing in gaps:
if missing in full:
full.remove(missing)
@@ -184,7 +187,7 @@ def gaps_to_spans(first, last, gaps):
low = year
last = year
continue
- if year != last+1:
+ if year != last + 1:
spans.append([low, last])
low = year
last = year
@@ -193,15 +196,17 @@ def gaps_to_spans(first, last, gaps):
spans.append([low, last])
return spans
+
def test_gaps():
- assert gaps_to_spans(1900, 1900, None) == \
- [[1900, 1900]]
- assert gaps_to_spans(1900, 1903, None) == \
- [[1900, 1903]]
- assert gaps_to_spans(1900, 1902, [1901]) == \
- [[1900, 1900], [1902, 1902]]
- assert gaps_to_spans(1950, 1970, [1955, 1956, 1965]) == \
- [[1950, 1954], [1957, 1964], [1966, 1970]]
+ assert gaps_to_spans(1900, 1900, None) == [[1900, 1900]]
+ assert gaps_to_spans(1900, 1903, None) == [[1900, 1903]]
+ assert gaps_to_spans(1900, 1902, [1901]) == [[1900, 1900], [1902, 1902]]
+ assert gaps_to_spans(1950, 1970, [1955, 1956, 1965]) == [
+ [1950, 1954],
+ [1957, 1964],
+ [1966, 1970],
+ ]
+
def merge_spans(old, new):
if not new:
@@ -211,7 +216,7 @@ def merge_spans(old, new):
old.extend(new)
years = set()
for span in old:
- for y in range(span[0], span[1]+1):
+ for y in range(span[0], span[1] + 1):
years.add(y)
if not years:
return []
@@ -240,19 +245,14 @@ def merge_spans(old, new):
spans.append([start, last])
return spans
+
def test_merge_spans():
- assert merge_spans([[5, 10]], [[10, 20]]) == \
- [[5, 20]]
- assert merge_spans([[5, 9]], [[10, 20]]) == \
- [[5, 20]]
- assert merge_spans([[5, 11]], [[10, 20]]) == \
- [[5, 20]]
- assert merge_spans([], []) == \
- []
- assert merge_spans([[9, 11]], []) == \
- [[9,11]]
- assert merge_spans([[2000, 2000]], [[1450, 1900]]) == \
- [[1450, 1900], [2000, 2000]]
+ assert merge_spans([[5, 10]], [[10, 20]]) == [[5, 20]]
+ assert merge_spans([[5, 9]], [[10, 20]]) == [[5, 20]]
+ assert merge_spans([[5, 11]], [[10, 20]]) == [[5, 20]]
+ assert merge_spans([], []) == []
+ assert merge_spans([[9, 11]], []) == [[9, 11]]
+ assert merge_spans([[2000, 2000]], [[1450, 1900]]) == [[1450, 1900], [2000, 2000]]
def unquote(s: str) -> str:
@@ -260,7 +260,7 @@ def unquote(s: str) -> str:
s = s[1:]
if s.endswith('"') or s.endswith("'"):
s = s[:-1]
- if s.endswith('.'):
+ if s.endswith("."):
s = s[:-1]
return s.strip()
@@ -283,6 +283,7 @@ def clean_str(s: Optional[str]) -> Optional[str]:
s = unquote(ftfy.fix_text(s))
return s or None
+
def test_clean_str():
assert clean_str("") is None
assert clean_str(" ") is None
@@ -290,7 +291,6 @@ def test_clean_str():
assert clean_str(" Bloody work.") == "Bloody work"
-
def clean_issn(s: str) -> Optional[str]:
s = s.strip().upper()
if len(s) == 8:
@@ -299,6 +299,7 @@ def clean_issn(s: str) -> Optional[str]:
return None
return s
+
def test_clean_issn():
assert clean_issn("1234-5678") == "1234-5678"
assert clean_issn(" 12345678") == "1234-5678"