-rw-r--r-- | chocula/__init__.py | 2
-rwxr-xr-x | chocula/chocula.py (renamed from chocula.py) | 462
-rw-r--r-- | chocula/config.py | 30
-rw-r--r-- | chocula/util.py | 311
-rwxr-xr-x | chocula_tool.py | 123
-rw-r--r-- | extra/count_chocola.jpg (renamed from count_chocola.jpg) | bin 32753 -> 32753 bytes
-rw-r--r-- | extra/wikidata/wikidata.sparql (renamed from wikidata.sparql) | 0
7 files changed, 470 insertions, 458 deletions
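
For orientation, the layout this commit produces (per the diffstat above and the hunks below) is, roughly:

    chocula/
        __init__.py     # re-exports ChoculaDatabase
        chocula.py      # the ChoculaDatabase class (moved from top-level chocula.py)
        config.py       # data-file paths (ISSNL_FILE, DOAJ_FILE, ...)
        util.py         # parsing helpers (parse_url, parse_lang, merge_spans, ...)
    chocula_tool.py     # argparse CLI entry point (was the bottom half of chocula.py)
    extra/              # non-code assets (count_chocola.jpg, wikidata/wikidata.sparql)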
diff --git a/chocula/__init__.py b/chocula/__init__.py new file mode 100644 index 0000000..0b8a5e1 --- /dev/null +++ b/chocula/__init__.py @@ -0,0 +1,2 @@ + +from chocula.chocula import ChoculaDatabase diff --git a/chocula.py b/chocula/chocula.py index f34a4e2..27d6f80 100755 --- a/chocula.py +++ b/chocula/chocula.py @@ -1,398 +1,14 @@ -#!/usr/bin/env python3 -""" -Count Chocula - online serials metadata and stats - - "one, two, three, un-preserved web-native open-access long-tail indie - journals, hah, hah, hah!" - - (yeah, I know, this name isn't very good) - (see also: https://teamyacht.com/ernstchoukula.com/Ernst-Choukula.html) - -Commands: - - everything - init_db - summarize - export - export_fatcat - - index_doaj - index_road - index_crossref - index_entrez - index_norwegian - index_szczepanski - index_ezb - index_wikidata - index_openapc - - load_fatcat - load_fatcat_stats - - export_urls - update_url_status - -Future commands: - - index_jurn - index_datacite - preserve_kbart --keeper SLUG - preserve_sim - -See TODO.md for more work-in-progress -""" - -import sys, csv, json +import csv +import json from collections import Counter import sqlite3 -import argparse import ftfy -import urlcanon -import surt -import tldextract -import pycountry import stdnum.issn - -################### File Config - -ISSNL_FILE = 'data/20200323.ISSN-to-ISSN-L.txt' - -ENTREZ_FILE = 'data/entrez-journals.csv' -ROAD_FILE = 'data/road-2018-01-24.tsv' -ROAD_DATE = '2018-01-24' -DOAJ_FILE = 'data/journalcsv__doaj_20191221_0135_utf8.csv' -DOAJ_DATE = '2019-12-21' -CROSSREF_FILE = 'data/doi_titles_file_2019-12-20.csv' -SHERPA_ROMEO_JOURNAL_FILE = 'data/romeo-journals.csv' -SHERPA_ROMEO_POLICY_FILE = 'data/romeo-policies.csv' -NORWEGIAN_FILE = 'data/2019-12-21 Norwegian Register for Scientific Journals and Series.csv' -NORWEGIAN_DATE = '2019-12-21' -LOCKSS_FILE = 'data/kbart_LOCKSS.txt' -CLOCKSS_FILE = 'data/kbart_CLOCKSS.txt' -PORTICO_FILE = 'data/Portico_Holding_KBart.txt' -JSTOR_FILE = 'data/JSTOR_Global_AllArchiveTitles_2019-12-21.txt' -SIM_FILE = 'data/MASTER TITLE_METADATA_LIST_20171019.converted.csv' -SZCZEPANSKI_DATE = '2018' -SZCZEPANSKI_FILE = 'data/Jan-Szczepanski-Open-Access-Journals-2018_0.fixed.json' -EZB_FILE = 'data/ezb_metadata.json' -GOLD_OA_FILE = 'data/ISSN_Gold-OA_3.0.csv' -WIKIDATA_SPARQL_FILE = 'data/wikidata_journals_sparql.2019-12-20.tsv' -OPENAPC_FILE = 'data/apc_de.2019-12-20.csv' -FATCAT_CONTAINER_FILE = 'data/container_export.2019-12-13.json' - -IA_CRAWL_FILE = 'data/url_status.20191223.json' -FATCAT_STATS_FILE = 'data/container_stats.20191213.json' - - -################### Utilities - -# NOTE: this is a partial list, focusing on non-publisher hosted platforms and -# software frameworks -PLATFORM_MAP = { - 'OJS': 'ojs', - 'BMC': 'bmc', - 'SciELO Brazil': 'scielo', - 'SciELO Argentina': 'scielo', - 'SciELO': 'scielo', - 'SciELO Mexico': 'scielo', - 'SciELO Spain': 'scielo', - 'SciELO Portugal': 'scielo', - 'WordPress': 'wordpress', - 'Sciendo': 'sciendo', - 'Drupal': 'drupal', - 'revues.org': 'openedition', -} - -MIMETYPE_MAP = { - 'PDF': 'application/pdf', - 'HTML': 'text/html', - 'XML': 'application/xml', -} - -BIG5_PUBLISHERS = [ - 'Elsevier', - 'Informa UK (Taylor & Francis)', - 'Springer-Verlag', - 'SAGE Publications', - 'Wiley (Blackwell Publishing)', - 'Wiley (John Wiley & Sons)', - 'Springer (Biomed Central Ltd.)', - 'Springer Nature', -] -COMMERCIAL_PUBLISHERS = [ - 'Peter Lang International Academic Publishers', - 'Walter de Gruyter GmbH', - 'Oldenbourg Wissenschaftsverlag', 
- 'Georg Thieme Verlag KG', # not springer - 'Emerald (MCB UP )', - 'Medknow Publications', - 'Inderscience Enterprises Ltd', - 'Bentham Science', - 'Ovid Technologies (Wolters Kluwer) - Lippincott Williams & Wilkins', - 'Scientific Research Publishing, Inc', - 'MDPI AG', - 'S. Karger AG', - 'Pleiades Publishing', - 'Science Publishing Group', - 'IGI Global', - 'The Economist Intelligence Unit', - 'Maney Publishing', - 'Diva Enterprises Private Limited', - 'World Scientific', - 'Mary Ann Liebert', - 'Trans Tech Publications', -] -OA_PUBLISHERS = [ - 'Hindawi Limited', - 'OMICS Publishing Group', - 'De Gruyter Open Sp. z o.o.', - 'OpenEdition', - 'Hindawi (International Scholarly Research Network)', - 'Public Library of Science', - 'Frontiers Media SA', - 'eLife Sciences Publications, Ltd', - 'MDPI AG', - 'Hindawi (International Scholarly Research Network)', - 'Dove Medical Press', - 'Open Access Text', -] -SOCIETY_PUBLISHERS = [ - 'Institute of Electrical and Electronics Engineers', - 'Institution of Electrical Engineers', - 'Association for Computing Machinery', - 'American Psychological Association', - 'IOS Press', - 'IOP Publishing', - 'American Chemical Society', - 'Royal Society of Chemistry (RSC)', - 'American Geophysical Union', - 'American College of Physicians', - 'New England Journal of Medicine', - 'BMJ', - 'RCN Publishing', - 'International Union of Crystallography', - 'Portland Press', - 'ASME International', -] -UNI_PRESS_PUBLISHERS = [ - 'Cambridge University Press', - 'Oxford University Press', - 'The University of Chicago Press', - 'MIT Press', -] -ARCHIVE_PUBLISHERS = [ - 'JSTOR', - 'Portico', -] -REPOSITORY_PUBLISHERS = [ - 'PERSEE Program', - 'Social Science Electronic Publishing', - 'CAIRN', - 'CSIRO Publishing', -] -OTHER_PUBLISHERS = [ - 'African Journals Online', - 'Smithsonian Institution Biodiversity Heritage Library', - 'Canadian Science Publishing', - 'Philosophy Documentation Center', - 'Project MUSE', -] - -def unquote(s): - if s.startswith('"'): - s = s[1:] - if s.endswith('"'): - s = s[:-1] - if s.endswith('.'): - s = s[:-1] - return s.strip() - -def parse_lang(s): - if not s or s in ('Not applicable', 'Multiple languages', 'Unknown'): - return None - try: - if len(s) == 2: - lang = pycountry.languages.get(alpha2=s.lower()) - elif len(s) == 3: - lang = pycountry.languages.get(alpha3=s.lower()) - else: - lang = pycountry.languages.get(name=s) - return lang.alpha2.lower() - except KeyError: - return None - except AttributeError: - return None - -def parse_country(s): - if not s or s in ('Unknown'): - return None - try: - if len(s) == 2: - country = pycountry.countries.get(alpha2=s.lower()) - else: - country = pycountry.countries.get(name=s) - except KeyError: - return None - if country: - return country.alpha_2.lower() - else: - return None - -def parse_mimetypes(val): - # XXX: multiple mimetypes? 
- if not val: - return - mimetype = None - if '/' in val: - mimetype = val - else: - mimetype = MIMETYPE_MAP.get(val) - if not mimetype: - return None - return [mimetype] - -def gaps_to_spans(first, last, gaps): - if not gaps: - return [[first, last]] - if not (last >= first and max(gaps) < last and min(gaps) > first): - # mangled - print("mangled years: {}".format((first, last, gaps))) - return [] - full = list(range(first, last+1)) - for missing in gaps: - full.remove(missing) - spans = [] - low = None - last = None - for year in full: - if not low: - low = year - last = year - continue - if year != last+1: - spans.append([low, last]) - low = year - last = year - last = year - if low: - spans.append([low, last]) - return spans - -def test_gaps(): - assert gaps_to_spans(1900, 1900, None) == \ - [[1900, 1900]] - assert gaps_to_spans(1900, 1903, None) == \ - [[1900, 1903]] - assert gaps_to_spans(1900, 1902, [1901]) == \ - [[1900, 1900], [1902, 1902]] - assert gaps_to_spans(1950, 1970, [1955, 1956, 1965]) == \ - [[1950, 1954], [1957, 1964], [1966, 1970]] - -def merge_spans(old, new): - if not new: - return old - if not old: - old = [] - old.extend(new) - years = set() - for span in old: - for y in range(span[0], span[1]+1): - years.add(y) - if not years: - return [] - spans = [] - start = None - last = None - todo = False - for y in sorted(list(years)): - if start == None: - # very first - start = y - last = y - todo = True - continue - if y == last + 1: - # span continues - last = y - todo = True - continue - # a gap just happened! - spans.append([start, last]) - start = y - last = y - todo = True - if todo: - spans.append([start, last]) - return spans - -def test_merge_spans(): - assert merge_spans([[5, 10]], [[10, 20]]) == \ - [[5, 20]] - assert merge_spans([[5, 9]], [[10, 20]]) == \ - [[5, 20]] - assert merge_spans([[5, 11]], [[10, 20]]) == \ - [[5, 20]] - assert merge_spans([], []) == \ - [] - assert merge_spans([[9, 11]], []) == \ - [[9,11]] - assert merge_spans([[2000, 2000]], [[1450, 1900]]) == \ - [[1450, 1900], [2000, 2000]] - - -def parse_url(url): - """ - Parses/cleans URLs. - - Returns a dict with: - - url: str, cleaned/normalized URL - url_surt: str, "sortable url" (a web-archiving format) - host: str, full hostname - registered_domain: "primary domain", eg "google.com" or "thing.co.uk" - suffix: str, eg "com" or "co.uk" - - Returns None if url is really bad (not a URL). 
- """ - if not url or 'mailto:' in url.lower() or url.lower() in ('http://n/a', 'http://na/', 'http://na'): - return None - if url.startswith('www.'): - url = "http://" + url - if url.startswith('ttp://') or url.startswith('ttps://'): - url = "h" + url - url.replace('Http://', 'http://') - - url = str(urlcanon.semantic_precise(url)) - if url == 'http://na/': - # sort of redundant with above, but some only match after canonicalization - return None - url_surt = surt.surt(url) - tld = tldextract.extract(url) - host = '.'.join(tld) - if host.startswith('.'): - host = host[1:] - return dict(url=url, - url_surt=url_surt or None, - host=host or None, - registered_domain=tld.registered_domain or None, - suffix=tld.suffix or None) - -def test_parse_url(): - - assert parse_url("http://thing.core.ac.uk")['registered_domain'] == 'core.ac.uk' - assert parse_url("http://thing.core.ac.uk")['host'] == 'thing.core.ac.uk' - assert parse_url("http://thing.core.ac.uk")['suffix'] == 'ac.uk' - - assert parse_url("google.com")['suffix'] == 'com' - assert parse_url("google.com")['host'] == 'google.com' - - assert parse_url("mailto:bnewbold@bogus.com") == None - assert parse_url("thing.com")['url'] == 'http://thing.com/' - assert parse_url("Http://thing.com///")['url'] == 'http://thing.com/' - +from chocula.config import * +from chocula.util import * ################### Main Class @@ -1397,73 +1013,3 @@ class ChoculaDatabase(): self.db.executescript(fschema.read()) print("Done!") -def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - subparsers = parser.add_subparsers() - - parser.add_argument("--db-file", - help="run in mode that considers only terminal HTML success", - default='chocula.sqlite', - type=str) - parser.add_argument("--input-file", - help="override default input file path", - default=None, - type=str) - - sub = subparsers.add_parser('everything', - help="run all the commands") - sub.set_defaults(func='everything') - - sub = subparsers.add_parser('init_db', - help="create sqlite3 output file and tables") - sub.set_defaults(func='init_db') - - sub = subparsers.add_parser('summarize', - help="aggregate metadata from all tables into 'journals' table") - sub.set_defaults(func='summarize') - - sub = subparsers.add_parser('export', - help="dump JSON output") - sub.set_defaults(func='export') - - sub = subparsers.add_parser('export_fatcat', - help="dump JSON output in a format that can load into fatcat") - sub.set_defaults(func='export_fatcat') - - # TODO: 'jurn' - for ind in ('doaj', 'road', 'crossref', 'entrez', 'norwegian', 'szczepanski', 'ezb', 'gold_oa', 'wikidata', 'openapc'): - sub = subparsers.add_parser('index_{}'.format(ind), - help="load metadata from {}".format(ind)) - sub.set_defaults(func='index_{}'.format(ind)) - - sub = subparsers.add_parser('load_fatcat', - help="load fatcat container metadata") - sub.set_defaults(func='load_fatcat') - - sub = subparsers.add_parser('load_fatcat_stats', - help="update container-level stats from JSON file") - sub.set_defaults(func='load_fatcat_stats') - - sub = subparsers.add_parser('export_urls', - help="dump homepage URLs (eg, to crawl for status)") - sub.set_defaults(func='export_urls') - - sub = subparsers.add_parser('update_url_status', - help="import homepage URL crawl status") - sub.set_defaults(func='update_url_status') - - args = parser.parse_args() - if not args.__dict__.get("func"): - print("tell me what to do! 
(try --help)") - sys.exit(-1) - - cdb = ChoculaDatabase(args.db_file) - if args.func.startswith('index_') or args.func in ('everything','summarize',): - cdb.read_issn_map_file(ISSNL_FILE) - func = getattr(cdb, args.func) - func(args) - -if __name__ == '__main__': - main() - diff --git a/chocula/config.py b/chocula/config.py new file mode 100644 index 0000000..a32bdd1 --- /dev/null +++ b/chocula/config.py @@ -0,0 +1,30 @@ + +################### File Config + +ISSNL_FILE = 'data/20200323.ISSN-to-ISSN-L.txt' + +ENTREZ_FILE = 'data/entrez-journals.csv' +ROAD_FILE = 'data/road-2018-01-24.tsv' +ROAD_DATE = '2018-01-24' +DOAJ_FILE = 'data/journalcsv__doaj_20191221_0135_utf8.csv' +DOAJ_DATE = '2019-12-21' +CROSSREF_FILE = 'data/doi_titles_file_2019-12-20.csv' +SHERPA_ROMEO_JOURNAL_FILE = 'data/romeo-journals.csv' +SHERPA_ROMEO_POLICY_FILE = 'data/romeo-policies.csv' +NORWEGIAN_FILE = 'data/2019-12-21 Norwegian Register for Scientific Journals and Series.csv' +NORWEGIAN_DATE = '2019-12-21' +LOCKSS_FILE = 'data/kbart_LOCKSS.txt' +CLOCKSS_FILE = 'data/kbart_CLOCKSS.txt' +PORTICO_FILE = 'data/Portico_Holding_KBart.txt' +JSTOR_FILE = 'data/JSTOR_Global_AllArchiveTitles_2019-12-21.txt' +SIM_FILE = 'data/MASTER TITLE_METADATA_LIST_20171019.converted.csv' +SZCZEPANSKI_DATE = '2018' +SZCZEPANSKI_FILE = 'data/Jan-Szczepanski-Open-Access-Journals-2018_0.fixed.json' +EZB_FILE = 'data/ezb_metadata.json' +GOLD_OA_FILE = 'data/ISSN_Gold-OA_3.0.csv' +WIKIDATA_SPARQL_FILE = 'data/wikidata_journals_sparql.2019-12-20.tsv' +OPENAPC_FILE = 'data/apc_de.2019-12-20.csv' +FATCAT_CONTAINER_FILE = 'data/container_export.2019-12-13.json' + +IA_CRAWL_FILE = 'data/url_status.20191223.json' +FATCAT_STATS_FILE = 'data/container_stats.20191213.json' diff --git a/chocula/util.py b/chocula/util.py new file mode 100644 index 0000000..533b41a --- /dev/null +++ b/chocula/util.py @@ -0,0 +1,311 @@ + +import urlcanon +import surt +import tldextract +import pycountry + +################### Utilities + +# NOTE: this is a partial list, focusing on non-publisher hosted platforms and +# software frameworks +PLATFORM_MAP = { + 'OJS': 'ojs', + 'BMC': 'bmc', + 'SciELO Brazil': 'scielo', + 'SciELO Argentina': 'scielo', + 'SciELO': 'scielo', + 'SciELO Mexico': 'scielo', + 'SciELO Spain': 'scielo', + 'SciELO Portugal': 'scielo', + 'WordPress': 'wordpress', + 'Sciendo': 'sciendo', + 'Drupal': 'drupal', + 'revues.org': 'openedition', +} + +MIMETYPE_MAP = { + 'PDF': 'application/pdf', + 'HTML': 'text/html', + 'XML': 'application/xml', +} + +BIG5_PUBLISHERS = [ + 'Elsevier', + 'Informa UK (Taylor & Francis)', + 'Springer-Verlag', + 'SAGE Publications', + 'Wiley (Blackwell Publishing)', + 'Wiley (John Wiley & Sons)', + 'Springer (Biomed Central Ltd.)', + 'Springer Nature', +] +COMMERCIAL_PUBLISHERS = [ + 'Peter Lang International Academic Publishers', + 'Walter de Gruyter GmbH', + 'Oldenbourg Wissenschaftsverlag', + 'Georg Thieme Verlag KG', # not springer + 'Emerald (MCB UP )', + 'Medknow Publications', + 'Inderscience Enterprises Ltd', + 'Bentham Science', + 'Ovid Technologies (Wolters Kluwer) - Lippincott Williams & Wilkins', + 'Scientific Research Publishing, Inc', + 'MDPI AG', + 'S. Karger AG', + 'Pleiades Publishing', + 'Science Publishing Group', + 'IGI Global', + 'The Economist Intelligence Unit', + 'Maney Publishing', + 'Diva Enterprises Private Limited', + 'World Scientific', + 'Mary Ann Liebert', + 'Trans Tech Publications', +] +OA_PUBLISHERS = [ + 'Hindawi Limited', + 'OMICS Publishing Group', + 'De Gruyter Open Sp. 
z o.o.', + 'OpenEdition', + 'Hindawi (International Scholarly Research Network)', + 'Public Library of Science', + 'Frontiers Media SA', + 'eLife Sciences Publications, Ltd', + 'MDPI AG', + 'Hindawi (International Scholarly Research Network)', + 'Dove Medical Press', + 'Open Access Text', +] +SOCIETY_PUBLISHERS = [ + 'Institute of Electrical and Electronics Engineers', + 'Institution of Electrical Engineers', + 'Association for Computing Machinery', + 'American Psychological Association', + 'IOS Press', + 'IOP Publishing', + 'American Chemical Society', + 'Royal Society of Chemistry (RSC)', + 'American Geophysical Union', + 'American College of Physicians', + 'New England Journal of Medicine', + 'BMJ', + 'RCN Publishing', + 'International Union of Crystallography', + 'Portland Press', + 'ASME International', +] +UNI_PRESS_PUBLISHERS = [ + 'Cambridge University Press', + 'Oxford University Press', + 'The University of Chicago Press', + 'MIT Press', +] +ARCHIVE_PUBLISHERS = [ + 'JSTOR', + 'Portico', +] +REPOSITORY_PUBLISHERS = [ + 'PERSEE Program', + 'Social Science Electronic Publishing', + 'CAIRN', + 'CSIRO Publishing', +] +OTHER_PUBLISHERS = [ + 'African Journals Online', + 'Smithsonian Institution Biodiversity Heritage Library', + 'Canadian Science Publishing', + 'Philosophy Documentation Center', + 'Project MUSE', +] + +def unquote(s): + if s.startswith('"'): + s = s[1:] + if s.endswith('"'): + s = s[:-1] + if s.endswith('.'): + s = s[:-1] + return s.strip() + +def parse_lang(s): + if not s or s in ('Not applicable', 'Multiple languages', 'Unknown'): + return None + try: + if len(s) == 2: + lang = pycountry.languages.get(alpha2=s.lower()) + elif len(s) == 3: + lang = pycountry.languages.get(alpha3=s.lower()) + else: + lang = pycountry.languages.get(name=s) + return lang.alpha2.lower() + except KeyError: + return None + except AttributeError: + return None + +def parse_country(s): + if not s or s in ('Unknown'): + return None + try: + if len(s) == 2: + country = pycountry.countries.get(alpha2=s.lower()) + else: + country = pycountry.countries.get(name=s) + except KeyError: + return None + if country: + return country.alpha_2.lower() + else: + return None + +def parse_mimetypes(val): + # XXX: multiple mimetypes? 
+ if not val: + return + mimetype = None + if '/' in val: + mimetype = val + else: + mimetype = MIMETYPE_MAP.get(val) + if not mimetype: + return None + return [mimetype] + +def gaps_to_spans(first, last, gaps): + if not gaps: + return [[first, last]] + if not (last >= first and max(gaps) < last and min(gaps) > first): + # mangled + print("mangled years: {}".format((first, last, gaps))) + return [] + full = list(range(first, last+1)) + for missing in gaps: + full.remove(missing) + spans = [] + low = None + last = None + for year in full: + if not low: + low = year + last = year + continue + if year != last+1: + spans.append([low, last]) + low = year + last = year + last = year + if low: + spans.append([low, last]) + return spans + +def test_gaps(): + assert gaps_to_spans(1900, 1900, None) == \ + [[1900, 1900]] + assert gaps_to_spans(1900, 1903, None) == \ + [[1900, 1903]] + assert gaps_to_spans(1900, 1902, [1901]) == \ + [[1900, 1900], [1902, 1902]] + assert gaps_to_spans(1950, 1970, [1955, 1956, 1965]) == \ + [[1950, 1954], [1957, 1964], [1966, 1970]] + +def merge_spans(old, new): + if not new: + return old + if not old: + old = [] + old.extend(new) + years = set() + for span in old: + for y in range(span[0], span[1]+1): + years.add(y) + if not years: + return [] + spans = [] + start = None + last = None + todo = False + for y in sorted(list(years)): + if start == None: + # very first + start = y + last = y + todo = True + continue + if y == last + 1: + # span continues + last = y + todo = True + continue + # a gap just happened! + spans.append([start, last]) + start = y + last = y + todo = True + if todo: + spans.append([start, last]) + return spans + +def test_merge_spans(): + assert merge_spans([[5, 10]], [[10, 20]]) == \ + [[5, 20]] + assert merge_spans([[5, 9]], [[10, 20]]) == \ + [[5, 20]] + assert merge_spans([[5, 11]], [[10, 20]]) == \ + [[5, 20]] + assert merge_spans([], []) == \ + [] + assert merge_spans([[9, 11]], []) == \ + [[9,11]] + assert merge_spans([[2000, 2000]], [[1450, 1900]]) == \ + [[1450, 1900], [2000, 2000]] + + +def parse_url(url): + """ + Parses/cleans URLs. + + Returns a dict with: + + url: str, cleaned/normalized URL + url_surt: str, "sortable url" (a web-archiving format) + host: str, full hostname + registered_domain: "primary domain", eg "google.com" or "thing.co.uk" + suffix: str, eg "com" or "co.uk" + + Returns None if url is really bad (not a URL). 
+ """ + if not url or 'mailto:' in url.lower() or url.lower() in ('http://n/a', 'http://na/', 'http://na'): + return None + if url.startswith('www.'): + url = "http://" + url + if url.startswith('ttp://') or url.startswith('ttps://'): + url = "h" + url + url.replace('Http://', 'http://') + + url = str(urlcanon.semantic_precise(url)) + if url == 'http://na/': + # sort of redundant with above, but some only match after canonicalization + return None + url_surt = surt.surt(url) + tld = tldextract.extract(url) + host = '.'.join(tld) + if host.startswith('.'): + host = host[1:] + return dict(url=url, + url_surt=url_surt or None, + host=host or None, + registered_domain=tld.registered_domain or None, + suffix=tld.suffix or None) + +def test_parse_url(): + + assert parse_url("http://thing.core.ac.uk")['registered_domain'] == 'core.ac.uk' + assert parse_url("http://thing.core.ac.uk")['host'] == 'thing.core.ac.uk' + assert parse_url("http://thing.core.ac.uk")['suffix'] == 'ac.uk' + + assert parse_url("google.com")['suffix'] == 'com' + assert parse_url("google.com")['host'] == 'google.com' + + assert parse_url("mailto:bnewbold@bogus.com") == None + assert parse_url("thing.com")['url'] == 'http://thing.com/' + assert parse_url("Http://thing.com///")['url'] == 'http://thing.com/' diff --git a/chocula_tool.py b/chocula_tool.py new file mode 100755 index 0000000..345097e --- /dev/null +++ b/chocula_tool.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python3 + +""" +Count Chocula - online serials metadata and stats + + "one, two, three, un-preserved web-native open-access long-tail indie + journals, hah, hah, hah!" + + (yeah, I know, this name isn't very good) + (see also: https://teamyacht.com/ernstchoukula.com/Ernst-Choukula.html) + +Commands: + + everything + init_db + summarize + export + export_fatcat + + index_doaj + index_road + index_crossref + index_entrez + index_norwegian + index_szczepanski + index_ezb + index_wikidata + index_openapc + + load_fatcat + load_fatcat_stats + + export_urls + update_url_status + +Future commands: + + index_jurn + index_datacite + preserve_kbart --keeper SLUG + preserve_sim + +See TODO.md for more work-in-progress +""" + +import sys +import csv +import argparse + +from chocula import ChoculaDatabase +from chocula.config import * + + +def main(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + subparsers = parser.add_subparsers() + + parser.add_argument("--db-file", + help="run in mode that considers only terminal HTML success", + default='chocula.sqlite', + type=str) + parser.add_argument("--input-file", + help="override default input file path", + default=None, + type=str) + + sub = subparsers.add_parser('everything', + help="run all the commands") + sub.set_defaults(func='everything') + + sub = subparsers.add_parser('init_db', + help="create sqlite3 output file and tables") + sub.set_defaults(func='init_db') + + sub = subparsers.add_parser('summarize', + help="aggregate metadata from all tables into 'journals' table") + sub.set_defaults(func='summarize') + + sub = subparsers.add_parser('export', + help="dump JSON output") + sub.set_defaults(func='export') + + sub = subparsers.add_parser('export_fatcat', + help="dump JSON output in a format that can load into fatcat") + sub.set_defaults(func='export_fatcat') + + # TODO: 'jurn' + for ind in ('doaj', 'road', 'crossref', 'entrez', 'norwegian', 'szczepanski', 'ezb', 'gold_oa', 'wikidata', 'openapc'): + sub = subparsers.add_parser('index_{}'.format(ind), + help="load metadata from 
{}".format(ind)) + sub.set_defaults(func='index_{}'.format(ind)) + + sub = subparsers.add_parser('load_fatcat', + help="load fatcat container metadata") + sub.set_defaults(func='load_fatcat') + + sub = subparsers.add_parser('load_fatcat_stats', + help="update container-level stats from JSON file") + sub.set_defaults(func='load_fatcat_stats') + + sub = subparsers.add_parser('export_urls', + help="dump homepage URLs (eg, to crawl for status)") + sub.set_defaults(func='export_urls') + + sub = subparsers.add_parser('update_url_status', + help="import homepage URL crawl status") + sub.set_defaults(func='update_url_status') + + args = parser.parse_args() + if not args.__dict__.get("func"): + print("tell me what to do! (try --help)") + sys.exit(-1) + + cdb = ChoculaDatabase(args.db_file) + if args.func.startswith('index_') or args.func in ('everything','summarize',): + cdb.read_issn_map_file(ISSNL_FILE) + func = getattr(cdb, args.func) + func(args) + +if __name__ == '__main__': + main() + diff --git a/count_chocola.jpg b/extra/count_chocola.jpg Binary files differindex e9da539..e9da539 100644 --- a/count_chocola.jpg +++ b/extra/count_chocola.jpg diff --git a/wikidata.sparql b/extra/wikidata/wikidata.sparql index 3f7e2f9..3f7e2f9 100644 --- a/wikidata.sparql +++ b/extra/wikidata/wikidata.sparql |