 chocula/util.py | 311 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 311 insertions(+)
diff --git a/chocula/util.py b/chocula/util.py
new file mode 100644
index 0000000..533b41a
--- /dev/null
+++ b/chocula/util.py
@@ -0,0 +1,311 @@
+
+import urlcanon
+import surt
+import tldextract
+import pycountry
+
+################### Utilities
+
+# NOTE: this is a partial list, focusing on non-publisher hosted platforms and
+# software frameworks
+PLATFORM_MAP = {
+    'OJS': 'ojs',
+    'BMC': 'bmc',
+    'SciELO Brazil': 'scielo',
+    'SciELO Argentina': 'scielo',
+    'SciELO': 'scielo',
+    'SciELO Mexico': 'scielo',
+    'SciELO Spain': 'scielo',
+    'SciELO Portugal': 'scielo',
+    'WordPress': 'wordpress',
+    'Sciendo': 'sciendo',
+    'Drupal': 'drupal',
+    'revues.org': 'openedition',
+}
+
+MIMETYPE_MAP = {
+    'PDF': 'application/pdf',
+    'HTML': 'text/html',
+    'XML': 'application/xml',
+}
+
+BIG5_PUBLISHERS = [
+    'Elsevier',
+    'Informa UK (Taylor & Francis)',
+    'Springer-Verlag',
+    'SAGE Publications',
+    'Wiley (Blackwell Publishing)',
+    'Wiley (John Wiley & Sons)',
+    'Springer (Biomed Central Ltd.)',
+    'Springer Nature',
+]
+COMMERCIAL_PUBLISHERS = [
+    'Peter Lang International Academic Publishers',
+    'Walter de Gruyter GmbH',
+    'Oldenbourg Wissenschaftsverlag',
+    'Georg Thieme Verlag KG',  # not springer
+    'Emerald (MCB UP )',
+    'Medknow Publications',
+    'Inderscience Enterprises Ltd',
+    'Bentham Science',
+    'Ovid Technologies (Wolters Kluwer) - Lippincott Williams & Wilkins',
+    'Scientific Research Publishing, Inc',
+    'MDPI AG',
+    'S. Karger AG',
+    'Pleiades Publishing',
+    'Science Publishing Group',
+    'IGI Global',
+    'The Economist Intelligence Unit',
+    'Maney Publishing',
+    'Diva Enterprises Private Limited',
+    'World Scientific',
+    'Mary Ann Liebert',
+    'Trans Tech Publications',
+]
+OA_PUBLISHERS = [
+    'Hindawi Limited',
+    'OMICS Publishing Group',
+    'De Gruyter Open Sp. z o.o.',
+    'OpenEdition',
+    'Hindawi (International Scholarly Research Network)',
+    'Public Library of Science',
+    'Frontiers Media SA',
+    'eLife Sciences Publications, Ltd',
+    'MDPI AG',
+    'Dove Medical Press',
+    'Open Access Text',
+]
+SOCIETY_PUBLISHERS = [
+    'Institute of Electrical and Electronics Engineers',
+    'Institution of Electrical Engineers',
+    'Association for Computing Machinery',
+    'American Psychological Association',
+    'IOS Press',
+    'IOP Publishing',
+    'American Chemical Society',
+    'Royal Society of Chemistry (RSC)',
+    'American Geophysical Union',
+    'American College of Physicians',
+    'New England Journal of Medicine',
+    'BMJ',
+    'RCN Publishing',
+    'International Union of Crystallography',
+    'Portland Press',
+    'ASME International',
+]
+UNI_PRESS_PUBLISHERS = [
+    'Cambridge University Press',
+    'Oxford University Press',
+    'The University of Chicago Press',
+    'MIT Press',
+]
+ARCHIVE_PUBLISHERS = [
+    'JSTOR',
+    'Portico',
+]
+REPOSITORY_PUBLISHERS = [
+    'PERSEE Program',
+    'Social Science Electronic Publishing',
+    'CAIRN',
+    'CSIRO Publishing',
+]
+OTHER_PUBLISHERS = [
+    'African Journals Online',
+    'Smithsonian Institution Biodiversity Heritage Library',
+    'Canadian Science Publishing',
+    'Philosophy Documentation Center',
+    'Project MUSE',
+]
+
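+# A hypothetical illustration of how the publisher lists above might be used
+# to bucket a publisher name; publisher_type() is a sketch added for clarity,
+# not part of the original module API.
+def publisher_type(publisher):
+    buckets = [
+        ('big5', BIG5_PUBLISHERS),
+        ('commercial', COMMERCIAL_PUBLISHERS),
+        ('oa', OA_PUBLISHERS),
+        ('society', SOCIETY_PUBLISHERS),
+        ('unipress', UNI_PRESS_PUBLISHERS),
+        ('archive', ARCHIVE_PUBLISHERS),
+        ('repository', REPOSITORY_PUBLISHERS),
+        ('other', OTHER_PUBLISHERS),
+    ]
+    # first match wins (note that, eg, 'MDPI AG' appears in two lists)
+    for name, publishers in buckets:
+        if publisher in publishers:
+            return name
+    return None
+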
+def unquote(s):
+    if s.startswith('"'):
+        s = s[1:]
+    if s.endswith('"'):
+        s = s[:-1]
+    if s.endswith('.'):
+        s = s[:-1]
+    return s.strip()
+
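+# A minimal sanity-check sketch for unquote(), following the test_* style used
+# further down in this file; the example strings are made up.
+def test_unquote():
+    assert unquote('"Journal of Stuff"') == 'Journal of Stuff'
+    assert unquote('Journal of Stuff.') == 'Journal of Stuff'
+    assert unquote('  Journal of Stuff ') == 'Journal of Stuff'
+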
+def parse_lang(s):
+    """
+    Parses a language string (two- or three-letter ISO 639 code, or English
+    name) into a lower-case ISO 639-1 code, or None.
+    """
+    if not s or s in ('Not applicable', 'Multiple languages', 'Unknown'):
+        return None
+    try:
+        if len(s) == 2:
+            lang = pycountry.languages.get(alpha_2=s.lower())
+        elif len(s) == 3:
+            lang = pycountry.languages.get(alpha_3=s.lower())
+        else:
+            lang = pycountry.languages.get(name=s)
+        return lang.alpha_2.lower()
+    except (KeyError, AttributeError):
+        return None
+
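+# A hedged sketch of expected parse_lang() behavior, in the style of the other
+# test_* functions here; assumes pycountry ships standard ISO 639 data.
+def test_parse_lang():
+    assert parse_lang('en') == 'en'
+    assert parse_lang('eng') == 'en'
+    assert parse_lang('English') == 'en'
+    assert parse_lang('Not applicable') is None
+    assert parse_lang(None) is None
+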
+def parse_country(s):
+    """
+    Parses a country string (two-letter ISO 3166-1 code or English name) into
+    a lower-case alpha-2 code, or None.
+    """
+    if not s or s == 'Unknown':
+        return None
+    try:
+        if len(s) == 2:
+            # pycountry stores country alpha_2 codes upper-case
+            country = pycountry.countries.get(alpha_2=s.upper())
+        else:
+            country = pycountry.countries.get(name=s)
+    except KeyError:
+        return None
+    if country:
+        return country.alpha_2.lower()
+    else:
+        return None
+
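+# A similar sketch for parse_country(); assumes pycountry ISO 3166 data, where
+# alpha_2 codes are stored upper-case.
+def test_parse_country():
+    assert parse_country('us') == 'us'
+    assert parse_country('United States') == 'us'
+    assert parse_country('Unknown') is None
+    assert parse_country(None) is None
+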
+def parse_mimetypes(val):
+    # XXX: multiple mimetypes?
+    if not val:
+        return None
+    if '/' in val:
+        mimetype = val
+    else:
+        mimetype = MIMETYPE_MAP.get(val)
+    if not mimetype:
+        return None
+    return [mimetype]
+
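+# Quick examples of parse_mimetypes() against MIMETYPE_MAP above; a sketch,
+# not an exhaustive test.
+def test_parse_mimetypes():
+    assert parse_mimetypes('PDF') == ['application/pdf']
+    assert parse_mimetypes('application/pdf') == ['application/pdf']
+    assert parse_mimetypes('Microfilm') is None
+    assert parse_mimetypes('') is None
+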
+def gaps_to_spans(first, last, gaps):
+    """
+    Converts a (first, last, gaps) year coverage tuple into a list of
+    inclusive [start, end] spans.
+    """
+    if not gaps:
+        return [[first, last]]
+    if not (last >= first and max(gaps) < last and min(gaps) > first):
+        # mangled
+        print("mangled years: {}".format((first, last, gaps)))
+        return []
+    full = list(range(first, last+1))
+    for missing in gaps:
+        full.remove(missing)
+    spans = []
+    low = None
+    last = None
+    for year in full:
+        if not low:
+            low = year
+            last = year
+            continue
+        if year != last+1:
+            spans.append([low, last])
+            low = year
+        last = year
+    if low:
+        spans.append([low, last])
+    return spans
+
+def test_gaps():
+    assert gaps_to_spans(1900, 1900, None) == \
+        [[1900, 1900]]
+    assert gaps_to_spans(1900, 1903, None) == \
+        [[1900, 1903]]
+    assert gaps_to_spans(1900, 1902, [1901]) == \
+        [[1900, 1900], [1902, 1902]]
+    assert gaps_to_spans(1950, 1970, [1955, 1956, 1965]) == \
+        [[1950, 1954], [1957, 1964], [1966, 1970]]
+
+def merge_spans(old, new):
+    """
+    Merges two lists of inclusive [start, end] year spans into a minimal,
+    sorted list of non-overlapping spans.
+    """
+    if not new:
+        return old
+    if not old:
+        old = []
+    old.extend(new)
+    years = set()
+    for span in old:
+        for y in range(span[0], span[1]+1):
+            years.add(y)
+    if not years:
+        return []
+    spans = []
+    start = None
+    last = None
+    for y in sorted(years):
+        if start is None:
+            # very first
+            start = y
+            last = y
+            continue
+        if y == last + 1:
+            # span continues
+            last = y
+            continue
+        # a gap just happened!
+        spans.append([start, last])
+        start = y
+        last = y
+    if start is not None:
+        spans.append([start, last])
+    return spans
+
+def test_merge_spans():
+    assert merge_spans([[5, 10]], [[10, 20]]) == \
+        [[5, 20]]
+    assert merge_spans([[5, 9]], [[10, 20]]) == \
+        [[5, 20]]
+    assert merge_spans([[5, 11]], [[10, 20]]) == \
+        [[5, 20]]
+    assert merge_spans([], []) == \
+        []
+    assert merge_spans([[9, 11]], []) == \
+        [[9, 11]]
+    assert merge_spans([[2000, 2000]], [[1450, 1900]]) == \
+        [[1450, 1900], [2000, 2000]]
+
+
+def parse_url(url):
+    """
+    Parses/cleans URLs.
+
+    Returns a dict with:
+
+        url: str, cleaned/normalized URL
+        url_surt: str, "sortable url" (a web-archiving format)
+        host: str, full hostname
+        registered_domain: "primary domain", eg "google.com" or "thing.co.uk"
+        suffix: str, eg "com" or "co.uk"
+
+    Returns None if url is really bad (not a URL).
+    """
+    if not url or 'mailto:' in url.lower() or url.lower() in ('http://n/a', 'http://na/', 'http://na'):
+        return None
+    if url.startswith('www.'):
+        url = "http://" + url
+    if url.startswith('ttp://') or url.startswith('ttps://'):
+        url = "h" + url
+    url = url.replace('Http://', 'http://')
+
+    url = str(urlcanon.semantic_precise(url))
+    if url == 'http://na/':
+        # sort of redundant with above, but some only match after canonicalization
+        return None
+    url_surt = surt.surt(url)
+    tld = tldextract.extract(url)
+    # join only the non-empty parts, to avoid stray leading/trailing dots
+    host = '.'.join(part for part in (tld.subdomain, tld.domain, tld.suffix) if part)
+    return dict(url=url,
+                url_surt=url_surt or None,
+                host=host or None,
+                registered_domain=tld.registered_domain or None,
+                suffix=tld.suffix or None)
+
+def test_parse_url():
+
+    assert parse_url("http://thing.core.ac.uk")['registered_domain'] == 'core.ac.uk'
+    assert parse_url("http://thing.core.ac.uk")['host'] == 'thing.core.ac.uk'
+    assert parse_url("http://thing.core.ac.uk")['suffix'] == 'ac.uk'
+
+    assert parse_url("google.com")['suffix'] == 'com'
+    assert parse_url("google.com")['host'] == 'google.com'
+
+    assert parse_url("mailto:bnewbold@bogus.com") is None
+    assert parse_url("thing.com")['url'] == 'http://thing.com/'
+    assert parse_url("Http://thing.com///")['url'] == 'http://thing.com/'