author     Bryan Newbold <bnewbold@archive.org>  2020-05-06 14:21:39 -0700
committer  Bryan Newbold <bnewbold@archive.org>  2020-05-06 14:21:39 -0700
commit     25b4ca85cf535629a7e1cdce02999084e5bc2535 (patch)
tree       7aba40cf479c130bc0987ddb9ea5a1204971deb6
parent     3e7e3eb8684a91fd7dc97d05f259eec61c525927 (diff)
download   chocula-25b4ca85cf535629a7e1cdce02999084e5bc2535.tar.gz
           chocula-25b4ca85cf535629a7e1cdce02999084e5bc2535.zip
start refactoring files into module
-rw-r--r--  chocula/__init__.py                                               2
-rwxr-xr-x  chocula/chocula.py (renamed from chocula.py)                    462
-rw-r--r--  chocula/config.py                                                 30
-rw-r--r--  chocula/util.py                                                  311
-rwxr-xr-x  chocula_tool.py                                                  123
-rw-r--r--  extra/count_chocola.jpg (renamed from count_chocola.jpg)         bin 32753 -> 32753 bytes
-rw-r--r--  extra/wikidata/wikidata.sparql (renamed from wikidata.sparql)      0
7 files changed, 470 insertions, 458 deletions
diff --git a/chocula/__init__.py b/chocula/__init__.py
new file mode 100644
index 0000000..0b8a5e1
--- /dev/null
+++ b/chocula/__init__.py
@@ -0,0 +1,2 @@
+
+from chocula.chocula import ChoculaDatabase
diff --git a/chocula.py b/chocula/chocula.py
index f34a4e2..27d6f80 100755
--- a/chocula.py
+++ b/chocula/chocula.py
@@ -1,398 +1,14 @@
-#!/usr/bin/env python3
-"""
-Count Chocula - online serials metadata and stats
-
- "one, two, three, un-preserved web-native open-access long-tail indie
- journals, hah, hah, hah!"
-
- (yeah, I know, this name isn't very good)
- (see also: https://teamyacht.com/ernstchoukula.com/Ernst-Choukula.html)
-
-Commands:
-
- everything
- init_db
- summarize
- export
- export_fatcat
-
- index_doaj
- index_road
- index_crossref
- index_entrez
- index_norwegian
- index_szczepanski
- index_ezb
- index_wikidata
- index_openapc
-
- load_fatcat
- load_fatcat_stats
-
- export_urls
- update_url_status
-
-Future commands:
-
- index_jurn
- index_datacite
- preserve_kbart --keeper SLUG
- preserve_sim
-
-See TODO.md for more work-in-progress
-"""
-
-import sys, csv, json
+import csv
+import json
from collections import Counter
import sqlite3
-import argparse
import ftfy
-import urlcanon
-import surt
-import tldextract
-import pycountry
import stdnum.issn
-
-################### File Config
-
-ISSNL_FILE = 'data/20200323.ISSN-to-ISSN-L.txt'
-
-ENTREZ_FILE = 'data/entrez-journals.csv'
-ROAD_FILE = 'data/road-2018-01-24.tsv'
-ROAD_DATE = '2018-01-24'
-DOAJ_FILE = 'data/journalcsv__doaj_20191221_0135_utf8.csv'
-DOAJ_DATE = '2019-12-21'
-CROSSREF_FILE = 'data/doi_titles_file_2019-12-20.csv'
-SHERPA_ROMEO_JOURNAL_FILE = 'data/romeo-journals.csv'
-SHERPA_ROMEO_POLICY_FILE = 'data/romeo-policies.csv'
-NORWEGIAN_FILE = 'data/2019-12-21 Norwegian Register for Scientific Journals and Series.csv'
-NORWEGIAN_DATE = '2019-12-21'
-LOCKSS_FILE = 'data/kbart_LOCKSS.txt'
-CLOCKSS_FILE = 'data/kbart_CLOCKSS.txt'
-PORTICO_FILE = 'data/Portico_Holding_KBart.txt'
-JSTOR_FILE = 'data/JSTOR_Global_AllArchiveTitles_2019-12-21.txt'
-SIM_FILE = 'data/MASTER TITLE_METADATA_LIST_20171019.converted.csv'
-SZCZEPANSKI_DATE = '2018'
-SZCZEPANSKI_FILE = 'data/Jan-Szczepanski-Open-Access-Journals-2018_0.fixed.json'
-EZB_FILE = 'data/ezb_metadata.json'
-GOLD_OA_FILE = 'data/ISSN_Gold-OA_3.0.csv'
-WIKIDATA_SPARQL_FILE = 'data/wikidata_journals_sparql.2019-12-20.tsv'
-OPENAPC_FILE = 'data/apc_de.2019-12-20.csv'
-FATCAT_CONTAINER_FILE = 'data/container_export.2019-12-13.json'
-
-IA_CRAWL_FILE = 'data/url_status.20191223.json'
-FATCAT_STATS_FILE = 'data/container_stats.20191213.json'
-
-
-################### Utilities
-
-# NOTE: this is a partial list, focusing on non-publisher hosted platforms and
-# software frameworks
-PLATFORM_MAP = {
- 'OJS': 'ojs',
- 'BMC': 'bmc',
- 'SciELO Brazil': 'scielo',
- 'SciELO Argentina': 'scielo',
- 'SciELO': 'scielo',
- 'SciELO Mexico': 'scielo',
- 'SciELO Spain': 'scielo',
- 'SciELO Portugal': 'scielo',
- 'WordPress': 'wordpress',
- 'Sciendo': 'sciendo',
- 'Drupal': 'drupal',
- 'revues.org': 'openedition',
-}
-
-MIMETYPE_MAP = {
- 'PDF': 'application/pdf',
- 'HTML': 'text/html',
- 'XML': 'application/xml',
-}
-
-BIG5_PUBLISHERS = [
- 'Elsevier',
- 'Informa UK (Taylor & Francis)',
- 'Springer-Verlag',
- 'SAGE Publications',
- 'Wiley (Blackwell Publishing)',
- 'Wiley (John Wiley & Sons)',
- 'Springer (Biomed Central Ltd.)',
- 'Springer Nature',
-]
-COMMERCIAL_PUBLISHERS = [
- 'Peter Lang International Academic Publishers',
- 'Walter de Gruyter GmbH',
- 'Oldenbourg Wissenschaftsverlag',
- 'Georg Thieme Verlag KG', # not springer
- 'Emerald (MCB UP )',
- 'Medknow Publications',
- 'Inderscience Enterprises Ltd',
- 'Bentham Science',
- 'Ovid Technologies (Wolters Kluwer) - Lippincott Williams & Wilkins',
- 'Scientific Research Publishing, Inc',
- 'MDPI AG',
- 'S. Karger AG',
- 'Pleiades Publishing',
- 'Science Publishing Group',
- 'IGI Global',
- 'The Economist Intelligence Unit',
- 'Maney Publishing',
- 'Diva Enterprises Private Limited',
- 'World Scientific',
- 'Mary Ann Liebert',
- 'Trans Tech Publications',
-]
-OA_PUBLISHERS = [
- 'Hindawi Limited',
- 'OMICS Publishing Group',
- 'De Gruyter Open Sp. z o.o.',
- 'OpenEdition',
- 'Hindawi (International Scholarly Research Network)',
- 'Public Library of Science',
- 'Frontiers Media SA',
- 'eLife Sciences Publications, Ltd',
- 'MDPI AG',
- 'Hindawi (International Scholarly Research Network)',
- 'Dove Medical Press',
- 'Open Access Text',
-]
-SOCIETY_PUBLISHERS = [
- 'Institute of Electrical and Electronics Engineers',
- 'Institution of Electrical Engineers',
- 'Association for Computing Machinery',
- 'American Psychological Association',
- 'IOS Press',
- 'IOP Publishing',
- 'American Chemical Society',
- 'Royal Society of Chemistry (RSC)',
- 'American Geophysical Union',
- 'American College of Physicians',
- 'New England Journal of Medicine',
- 'BMJ',
- 'RCN Publishing',
- 'International Union of Crystallography',
- 'Portland Press',
- 'ASME International',
-]
-UNI_PRESS_PUBLISHERS = [
- 'Cambridge University Press',
- 'Oxford University Press',
- 'The University of Chicago Press',
- 'MIT Press',
-]
-ARCHIVE_PUBLISHERS = [
- 'JSTOR',
- 'Portico',
-]
-REPOSITORY_PUBLISHERS = [
- 'PERSEE Program',
- 'Social Science Electronic Publishing',
- 'CAIRN',
- 'CSIRO Publishing',
-]
-OTHER_PUBLISHERS = [
- 'African Journals Online',
- 'Smithsonian Institution Biodiversity Heritage Library',
- 'Canadian Science Publishing',
- 'Philosophy Documentation Center',
- 'Project MUSE',
-]
-
-def unquote(s):
- if s.startswith('"'):
- s = s[1:]
- if s.endswith('"'):
- s = s[:-1]
- if s.endswith('.'):
- s = s[:-1]
- return s.strip()
-
-def parse_lang(s):
- if not s or s in ('Not applicable', 'Multiple languages', 'Unknown'):
- return None
- try:
- if len(s) == 2:
- lang = pycountry.languages.get(alpha2=s.lower())
- elif len(s) == 3:
- lang = pycountry.languages.get(alpha3=s.lower())
- else:
- lang = pycountry.languages.get(name=s)
- return lang.alpha2.lower()
- except KeyError:
- return None
- except AttributeError:
- return None
-
-def parse_country(s):
- if not s or s in ('Unknown'):
- return None
- try:
- if len(s) == 2:
- country = pycountry.countries.get(alpha2=s.lower())
- else:
- country = pycountry.countries.get(name=s)
- except KeyError:
- return None
- if country:
- return country.alpha_2.lower()
- else:
- return None
-
-def parse_mimetypes(val):
- # XXX: multiple mimetypes?
- if not val:
- return
- mimetype = None
- if '/' in val:
- mimetype = val
- else:
- mimetype = MIMETYPE_MAP.get(val)
- if not mimetype:
- return None
- return [mimetype]
-
-def gaps_to_spans(first, last, gaps):
- if not gaps:
- return [[first, last]]
- if not (last >= first and max(gaps) < last and min(gaps) > first):
- # mangled
- print("mangled years: {}".format((first, last, gaps)))
- return []
- full = list(range(first, last+1))
- for missing in gaps:
- full.remove(missing)
- spans = []
- low = None
- last = None
- for year in full:
- if not low:
- low = year
- last = year
- continue
- if year != last+1:
- spans.append([low, last])
- low = year
- last = year
- last = year
- if low:
- spans.append([low, last])
- return spans
-
-def test_gaps():
- assert gaps_to_spans(1900, 1900, None) == \
- [[1900, 1900]]
- assert gaps_to_spans(1900, 1903, None) == \
- [[1900, 1903]]
- assert gaps_to_spans(1900, 1902, [1901]) == \
- [[1900, 1900], [1902, 1902]]
- assert gaps_to_spans(1950, 1970, [1955, 1956, 1965]) == \
- [[1950, 1954], [1957, 1964], [1966, 1970]]
-
-def merge_spans(old, new):
- if not new:
- return old
- if not old:
- old = []
- old.extend(new)
- years = set()
- for span in old:
- for y in range(span[0], span[1]+1):
- years.add(y)
- if not years:
- return []
- spans = []
- start = None
- last = None
- todo = False
- for y in sorted(list(years)):
- if start == None:
- # very first
- start = y
- last = y
- todo = True
- continue
- if y == last + 1:
- # span continues
- last = y
- todo = True
- continue
- # a gap just happened!
- spans.append([start, last])
- start = y
- last = y
- todo = True
- if todo:
- spans.append([start, last])
- return spans
-
-def test_merge_spans():
- assert merge_spans([[5, 10]], [[10, 20]]) == \
- [[5, 20]]
- assert merge_spans([[5, 9]], [[10, 20]]) == \
- [[5, 20]]
- assert merge_spans([[5, 11]], [[10, 20]]) == \
- [[5, 20]]
- assert merge_spans([], []) == \
- []
- assert merge_spans([[9, 11]], []) == \
- [[9,11]]
- assert merge_spans([[2000, 2000]], [[1450, 1900]]) == \
- [[1450, 1900], [2000, 2000]]
-
-
-def parse_url(url):
- """
- Parses/cleans URLs.
-
- Returns a dict with:
-
- url: str, cleaned/normalized URL
- url_surt: str, "sortable url" (a web-archiving format)
- host: str, full hostname
- registered_domain: "primary domain", eg "google.com" or "thing.co.uk"
- suffix: str, eg "com" or "co.uk"
-
- Returns None if url is really bad (not a URL).
- """
- if not url or 'mailto:' in url.lower() or url.lower() in ('http://n/a', 'http://na/', 'http://na'):
- return None
- if url.startswith('www.'):
- url = "http://" + url
- if url.startswith('ttp://') or url.startswith('ttps://'):
- url = "h" + url
- url.replace('Http://', 'http://')
-
- url = str(urlcanon.semantic_precise(url))
- if url == 'http://na/':
- # sort of redundant with above, but some only match after canonicalization
- return None
- url_surt = surt.surt(url)
- tld = tldextract.extract(url)
- host = '.'.join(tld)
- if host.startswith('.'):
- host = host[1:]
- return dict(url=url,
- url_surt=url_surt or None,
- host=host or None,
- registered_domain=tld.registered_domain or None,
- suffix=tld.suffix or None)
-
-def test_parse_url():
-
- assert parse_url("http://thing.core.ac.uk")['registered_domain'] == 'core.ac.uk'
- assert parse_url("http://thing.core.ac.uk")['host'] == 'thing.core.ac.uk'
- assert parse_url("http://thing.core.ac.uk")['suffix'] == 'ac.uk'
-
- assert parse_url("google.com")['suffix'] == 'com'
- assert parse_url("google.com")['host'] == 'google.com'
-
- assert parse_url("mailto:bnewbold@bogus.com") == None
- assert parse_url("thing.com")['url'] == 'http://thing.com/'
- assert parse_url("Http://thing.com///")['url'] == 'http://thing.com/'
-
+from chocula.config import *
+from chocula.util import *
################### Main Class
@@ -1397,73 +1013,3 @@ class ChoculaDatabase():
self.db.executescript(fschema.read())
print("Done!")
-def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- subparsers = parser.add_subparsers()
-
- parser.add_argument("--db-file",
- help="run in mode that considers only terminal HTML success",
- default='chocula.sqlite',
- type=str)
- parser.add_argument("--input-file",
- help="override default input file path",
- default=None,
- type=str)
-
- sub = subparsers.add_parser('everything',
- help="run all the commands")
- sub.set_defaults(func='everything')
-
- sub = subparsers.add_parser('init_db',
- help="create sqlite3 output file and tables")
- sub.set_defaults(func='init_db')
-
- sub = subparsers.add_parser('summarize',
- help="aggregate metadata from all tables into 'journals' table")
- sub.set_defaults(func='summarize')
-
- sub = subparsers.add_parser('export',
- help="dump JSON output")
- sub.set_defaults(func='export')
-
- sub = subparsers.add_parser('export_fatcat',
- help="dump JSON output in a format that can load into fatcat")
- sub.set_defaults(func='export_fatcat')
-
- # TODO: 'jurn'
- for ind in ('doaj', 'road', 'crossref', 'entrez', 'norwegian', 'szczepanski', 'ezb', 'gold_oa', 'wikidata', 'openapc'):
- sub = subparsers.add_parser('index_{}'.format(ind),
- help="load metadata from {}".format(ind))
- sub.set_defaults(func='index_{}'.format(ind))
-
- sub = subparsers.add_parser('load_fatcat',
- help="load fatcat container metadata")
- sub.set_defaults(func='load_fatcat')
-
- sub = subparsers.add_parser('load_fatcat_stats',
- help="update container-level stats from JSON file")
- sub.set_defaults(func='load_fatcat_stats')
-
- sub = subparsers.add_parser('export_urls',
- help="dump homepage URLs (eg, to crawl for status)")
- sub.set_defaults(func='export_urls')
-
- sub = subparsers.add_parser('update_url_status',
- help="import homepage URL crawl status")
- sub.set_defaults(func='update_url_status')
-
- args = parser.parse_args()
- if not args.__dict__.get("func"):
- print("tell me what to do! (try --help)")
- sys.exit(-1)
-
- cdb = ChoculaDatabase(args.db_file)
- if args.func.startswith('index_') or args.func in ('everything','summarize',):
- cdb.read_issn_map_file(ISSNL_FILE)
- func = getattr(cdb, args.func)
- func(args)
-
-if __name__ == '__main__':
- main()
-
diff --git a/chocula/config.py b/chocula/config.py
new file mode 100644
index 0000000..a32bdd1
--- /dev/null
+++ b/chocula/config.py
@@ -0,0 +1,30 @@
+
+################### File Config
+
+ISSNL_FILE = 'data/20200323.ISSN-to-ISSN-L.txt'
+
+ENTREZ_FILE = 'data/entrez-journals.csv'
+ROAD_FILE = 'data/road-2018-01-24.tsv'
+ROAD_DATE = '2018-01-24'
+DOAJ_FILE = 'data/journalcsv__doaj_20191221_0135_utf8.csv'
+DOAJ_DATE = '2019-12-21'
+CROSSREF_FILE = 'data/doi_titles_file_2019-12-20.csv'
+SHERPA_ROMEO_JOURNAL_FILE = 'data/romeo-journals.csv'
+SHERPA_ROMEO_POLICY_FILE = 'data/romeo-policies.csv'
+NORWEGIAN_FILE = 'data/2019-12-21 Norwegian Register for Scientific Journals and Series.csv'
+NORWEGIAN_DATE = '2019-12-21'
+LOCKSS_FILE = 'data/kbart_LOCKSS.txt'
+CLOCKSS_FILE = 'data/kbart_CLOCKSS.txt'
+PORTICO_FILE = 'data/Portico_Holding_KBart.txt'
+JSTOR_FILE = 'data/JSTOR_Global_AllArchiveTitles_2019-12-21.txt'
+SIM_FILE = 'data/MASTER TITLE_METADATA_LIST_20171019.converted.csv'
+SZCZEPANSKI_DATE = '2018'
+SZCZEPANSKI_FILE = 'data/Jan-Szczepanski-Open-Access-Journals-2018_0.fixed.json'
+EZB_FILE = 'data/ezb_metadata.json'
+GOLD_OA_FILE = 'data/ISSN_Gold-OA_3.0.csv'
+WIKIDATA_SPARQL_FILE = 'data/wikidata_journals_sparql.2019-12-20.tsv'
+OPENAPC_FILE = 'data/apc_de.2019-12-20.csv'
+FATCAT_CONTAINER_FILE = 'data/container_export.2019-12-13.json'
+
+IA_CRAWL_FILE = 'data/url_status.20191223.json'
+FATCAT_STATS_FILE = 'data/container_stats.20191213.json'
diff --git a/chocula/util.py b/chocula/util.py
new file mode 100644
index 0000000..533b41a
--- /dev/null
+++ b/chocula/util.py
@@ -0,0 +1,311 @@
+
+import urlcanon
+import surt
+import tldextract
+import pycountry
+
+################### Utilities
+
+# NOTE: this is a partial list, focusing on non-publisher hosted platforms and
+# software frameworks
+PLATFORM_MAP = {
+ 'OJS': 'ojs',
+ 'BMC': 'bmc',
+ 'SciELO Brazil': 'scielo',
+ 'SciELO Argentina': 'scielo',
+ 'SciELO': 'scielo',
+ 'SciELO Mexico': 'scielo',
+ 'SciELO Spain': 'scielo',
+ 'SciELO Portugal': 'scielo',
+ 'WordPress': 'wordpress',
+ 'Sciendo': 'sciendo',
+ 'Drupal': 'drupal',
+ 'revues.org': 'openedition',
+}
+
+MIMETYPE_MAP = {
+ 'PDF': 'application/pdf',
+ 'HTML': 'text/html',
+ 'XML': 'application/xml',
+}
+
+BIG5_PUBLISHERS = [
+ 'Elsevier',
+ 'Informa UK (Taylor & Francis)',
+ 'Springer-Verlag',
+ 'SAGE Publications',
+ 'Wiley (Blackwell Publishing)',
+ 'Wiley (John Wiley & Sons)',
+ 'Springer (Biomed Central Ltd.)',
+ 'Springer Nature',
+]
+COMMERCIAL_PUBLISHERS = [
+ 'Peter Lang International Academic Publishers',
+ 'Walter de Gruyter GmbH',
+ 'Oldenbourg Wissenschaftsverlag',
+ 'Georg Thieme Verlag KG', # not springer
+ 'Emerald (MCB UP )',
+ 'Medknow Publications',
+ 'Inderscience Enterprises Ltd',
+ 'Bentham Science',
+ 'Ovid Technologies (Wolters Kluwer) - Lippincott Williams & Wilkins',
+ 'Scientific Research Publishing, Inc',
+ 'MDPI AG',
+ 'S. Karger AG',
+ 'Pleiades Publishing',
+ 'Science Publishing Group',
+ 'IGI Global',
+ 'The Economist Intelligence Unit',
+ 'Maney Publishing',
+ 'Diva Enterprises Private Limited',
+ 'World Scientific',
+ 'Mary Ann Liebert',
+ 'Trans Tech Publications',
+]
+OA_PUBLISHERS = [
+ 'Hindawi Limited',
+ 'OMICS Publishing Group',
+ 'De Gruyter Open Sp. z o.o.',
+ 'OpenEdition',
+ 'Hindawi (International Scholarly Research Network)',
+ 'Public Library of Science',
+ 'Frontiers Media SA',
+ 'eLife Sciences Publications, Ltd',
+ 'MDPI AG',
+ 'Hindawi (International Scholarly Research Network)',
+ 'Dove Medical Press',
+ 'Open Access Text',
+]
+SOCIETY_PUBLISHERS = [
+ 'Institute of Electrical and Electronics Engineers',
+ 'Institution of Electrical Engineers',
+ 'Association for Computing Machinery',
+ 'American Psychological Association',
+ 'IOS Press',
+ 'IOP Publishing',
+ 'American Chemical Society',
+ 'Royal Society of Chemistry (RSC)',
+ 'American Geophysical Union',
+ 'American College of Physicians',
+ 'New England Journal of Medicine',
+ 'BMJ',
+ 'RCN Publishing',
+ 'International Union of Crystallography',
+ 'Portland Press',
+ 'ASME International',
+]
+UNI_PRESS_PUBLISHERS = [
+ 'Cambridge University Press',
+ 'Oxford University Press',
+ 'The University of Chicago Press',
+ 'MIT Press',
+]
+ARCHIVE_PUBLISHERS = [
+ 'JSTOR',
+ 'Portico',
+]
+REPOSITORY_PUBLISHERS = [
+ 'PERSEE Program',
+ 'Social Science Electronic Publishing',
+ 'CAIRN',
+ 'CSIRO Publishing',
+]
+OTHER_PUBLISHERS = [
+ 'African Journals Online',
+ 'Smithsonian Institution Biodiversity Heritage Library',
+ 'Canadian Science Publishing',
+ 'Philosophy Documentation Center',
+ 'Project MUSE',
+]
+
+def unquote(s):
+ if s.startswith('"'):
+ s = s[1:]
+ if s.endswith('"'):
+ s = s[:-1]
+ if s.endswith('.'):
+ s = s[:-1]
+ return s.strip()
+
+def parse_lang(s):
+ if not s or s in ('Not applicable', 'Multiple languages', 'Unknown'):
+ return None
+ try:
+ if len(s) == 2:
+ lang = pycountry.languages.get(alpha2=s.lower())
+ elif len(s) == 3:
+ lang = pycountry.languages.get(alpha3=s.lower())
+ else:
+ lang = pycountry.languages.get(name=s)
+ return lang.alpha2.lower()
+ except KeyError:
+ return None
+ except AttributeError:
+ return None
+
+def parse_country(s):
+ if not s or s in ('Unknown'):
+ return None
+ try:
+ if len(s) == 2:
+ country = pycountry.countries.get(alpha2=s.lower())
+ else:
+ country = pycountry.countries.get(name=s)
+ except KeyError:
+ return None
+ if country:
+ return country.alpha_2.lower()
+ else:
+ return None
+
+def parse_mimetypes(val):
+ # XXX: multiple mimetypes?
+ if not val:
+ return
+ mimetype = None
+ if '/' in val:
+ mimetype = val
+ else:
+ mimetype = MIMETYPE_MAP.get(val)
+ if not mimetype:
+ return None
+ return [mimetype]
+
+def gaps_to_spans(first, last, gaps):
+ if not gaps:
+ return [[first, last]]
+ if not (last >= first and max(gaps) < last and min(gaps) > first):
+ # mangled
+ print("mangled years: {}".format((first, last, gaps)))
+ return []
+ full = list(range(first, last+1))
+ for missing in gaps:
+ full.remove(missing)
+ spans = []
+ low = None
+ last = None
+ for year in full:
+ if not low:
+ low = year
+ last = year
+ continue
+ if year != last+1:
+ spans.append([low, last])
+ low = year
+ last = year
+ last = year
+ if low:
+ spans.append([low, last])
+ return spans
+
+def test_gaps():
+ assert gaps_to_spans(1900, 1900, None) == \
+ [[1900, 1900]]
+ assert gaps_to_spans(1900, 1903, None) == \
+ [[1900, 1903]]
+ assert gaps_to_spans(1900, 1902, [1901]) == \
+ [[1900, 1900], [1902, 1902]]
+ assert gaps_to_spans(1950, 1970, [1955, 1956, 1965]) == \
+ [[1950, 1954], [1957, 1964], [1966, 1970]]
+
+def merge_spans(old, new):
+ if not new:
+ return old
+ if not old:
+ old = []
+ old.extend(new)
+ years = set()
+ for span in old:
+ for y in range(span[0], span[1]+1):
+ years.add(y)
+ if not years:
+ return []
+ spans = []
+ start = None
+ last = None
+ todo = False
+ for y in sorted(list(years)):
+ if start == None:
+ # very first
+ start = y
+ last = y
+ todo = True
+ continue
+ if y == last + 1:
+ # span continues
+ last = y
+ todo = True
+ continue
+ # a gap just happened!
+ spans.append([start, last])
+ start = y
+ last = y
+ todo = True
+ if todo:
+ spans.append([start, last])
+ return spans
+
+def test_merge_spans():
+ assert merge_spans([[5, 10]], [[10, 20]]) == \
+ [[5, 20]]
+ assert merge_spans([[5, 9]], [[10, 20]]) == \
+ [[5, 20]]
+ assert merge_spans([[5, 11]], [[10, 20]]) == \
+ [[5, 20]]
+ assert merge_spans([], []) == \
+ []
+ assert merge_spans([[9, 11]], []) == \
+ [[9,11]]
+ assert merge_spans([[2000, 2000]], [[1450, 1900]]) == \
+ [[1450, 1900], [2000, 2000]]
+
+
+def parse_url(url):
+ """
+ Parses/cleans URLs.
+
+ Returns a dict with:
+
+ url: str, cleaned/normalized URL
+ url_surt: str, "sortable url" (a web-archiving format)
+ host: str, full hostname
+ registered_domain: "primary domain", eg "google.com" or "thing.co.uk"
+ suffix: str, eg "com" or "co.uk"
+
+ Returns None if url is really bad (not a URL).
+ """
+ if not url or 'mailto:' in url.lower() or url.lower() in ('http://n/a', 'http://na/', 'http://na'):
+ return None
+ if url.startswith('www.'):
+ url = "http://" + url
+ if url.startswith('ttp://') or url.startswith('ttps://'):
+ url = "h" + url
+ url.replace('Http://', 'http://')
+
+ url = str(urlcanon.semantic_precise(url))
+ if url == 'http://na/':
+ # sort of redundant with above, but some only match after canonicalization
+ return None
+ url_surt = surt.surt(url)
+ tld = tldextract.extract(url)
+ host = '.'.join(tld)
+ if host.startswith('.'):
+ host = host[1:]
+ return dict(url=url,
+ url_surt=url_surt or None,
+ host=host or None,
+ registered_domain=tld.registered_domain or None,
+ suffix=tld.suffix or None)
+
+def test_parse_url():
+
+ assert parse_url("http://thing.core.ac.uk")['registered_domain'] == 'core.ac.uk'
+ assert parse_url("http://thing.core.ac.uk")['host'] == 'thing.core.ac.uk'
+ assert parse_url("http://thing.core.ac.uk")['suffix'] == 'ac.uk'
+
+ assert parse_url("google.com")['suffix'] == 'com'
+ assert parse_url("google.com")['host'] == 'google.com'
+
+ assert parse_url("mailto:bnewbold@bogus.com") == None
+ assert parse_url("thing.com")['url'] == 'http://thing.com/'
+ assert parse_url("Http://thing.com///")['url'] == 'http://thing.com/'
diff --git a/chocula_tool.py b/chocula_tool.py
new file mode 100755
index 0000000..345097e
--- /dev/null
+++ b/chocula_tool.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python3
+
+"""
+Count Chocula - online serials metadata and stats
+
+ "one, two, three, un-preserved web-native open-access long-tail indie
+ journals, hah, hah, hah!"
+
+ (yeah, I know, this name isn't very good)
+ (see also: https://teamyacht.com/ernstchoukula.com/Ernst-Choukula.html)
+
+Commands:
+
+ everything
+ init_db
+ summarize
+ export
+ export_fatcat
+
+ index_doaj
+ index_road
+ index_crossref
+ index_entrez
+ index_norwegian
+ index_szczepanski
+ index_ezb
+ index_wikidata
+ index_openapc
+
+ load_fatcat
+ load_fatcat_stats
+
+ export_urls
+ update_url_status
+
+Future commands:
+
+ index_jurn
+ index_datacite
+ preserve_kbart --keeper SLUG
+ preserve_sim
+
+See TODO.md for more work-in-progress
+"""
+
+import sys
+import csv
+import argparse
+
+from chocula import ChoculaDatabase
+from chocula.config import *
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ subparsers = parser.add_subparsers()
+
+ parser.add_argument("--db-file",
+ help="run in mode that considers only terminal HTML success",
+ default='chocula.sqlite',
+ type=str)
+ parser.add_argument("--input-file",
+ help="override default input file path",
+ default=None,
+ type=str)
+
+ sub = subparsers.add_parser('everything',
+ help="run all the commands")
+ sub.set_defaults(func='everything')
+
+ sub = subparsers.add_parser('init_db',
+ help="create sqlite3 output file and tables")
+ sub.set_defaults(func='init_db')
+
+ sub = subparsers.add_parser('summarize',
+ help="aggregate metadata from all tables into 'journals' table")
+ sub.set_defaults(func='summarize')
+
+ sub = subparsers.add_parser('export',
+ help="dump JSON output")
+ sub.set_defaults(func='export')
+
+ sub = subparsers.add_parser('export_fatcat',
+ help="dump JSON output in a format that can load into fatcat")
+ sub.set_defaults(func='export_fatcat')
+
+ # TODO: 'jurn'
+ for ind in ('doaj', 'road', 'crossref', 'entrez', 'norwegian', 'szczepanski', 'ezb', 'gold_oa', 'wikidata', 'openapc'):
+ sub = subparsers.add_parser('index_{}'.format(ind),
+ help="load metadata from {}".format(ind))
+ sub.set_defaults(func='index_{}'.format(ind))
+
+ sub = subparsers.add_parser('load_fatcat',
+ help="load fatcat container metadata")
+ sub.set_defaults(func='load_fatcat')
+
+ sub = subparsers.add_parser('load_fatcat_stats',
+ help="update container-level stats from JSON file")
+ sub.set_defaults(func='load_fatcat_stats')
+
+ sub = subparsers.add_parser('export_urls',
+ help="dump homepage URLs (eg, to crawl for status)")
+ sub.set_defaults(func='export_urls')
+
+ sub = subparsers.add_parser('update_url_status',
+ help="import homepage URL crawl status")
+ sub.set_defaults(func='update_url_status')
+
+ args = parser.parse_args()
+ if not args.__dict__.get("func"):
+ print("tell me what to do! (try --help)")
+ sys.exit(-1)
+
+ cdb = ChoculaDatabase(args.db_file)
+ if args.func.startswith('index_') or args.func in ('everything','summarize',):
+ cdb.read_issn_map_file(ISSNL_FILE)
+ func = getattr(cdb, args.func)
+ func(args)
+
+if __name__ == '__main__':
+ main()
+
diff --git a/count_chocola.jpg b/extra/count_chocola.jpg
index e9da539..e9da539 100644
--- a/count_chocola.jpg
+++ b/extra/count_chocola.jpg
Binary files differ
diff --git a/wikidata.sparql b/extra/wikidata/wikidata.sparql
index 3f7e2f9..3f7e2f9 100644
--- a/wikidata.sparql
+++ b/extra/wikidata/wikidata.sparql
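
A minimal usage sketch of the layout after this refactor (an illustration, not part of the commit): ChoculaDatabase is re-exported from the chocula package via chocula/__init__.py, data file paths live in chocula.config, and the command-line entry point moves to chocula_tool.py. Library use looks roughly like:

    # sketch only: mirrors what chocula_tool.py's main() does for index_*/summarize
    from chocula import ChoculaDatabase
    from chocula.config import ISSNL_FILE

    # open (or create) the sqlite output file, then load the ISSN-to-ISSN-L map
    cdb = ChoculaDatabase('chocula.sqlite')
    cdb.read_issn_map_file(ISSNL_FILE)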