From 57db2db336c08031324e44b2d2880fbd4b6893c9 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Mon, 1 Jun 2020 17:01:20 -0700
Subject: 'everything' at least partially working

---
 Makefile                        |  59 ++++++++-------
 TODO.md                         |  48 ++++++++++---
 chocula/__init__.py             |  12 +---
 chocula/__main__.py             | 155 ++++++++++++++++++++++++++++++++++++++++
 chocula/common.py               |  27 +++++++
 chocula/database.py             | 142 +++++++++---------------------
 chocula/directories/__init__.py |  19 +++++
 chocula/directories/sim.py      |  71 ++++++++++++++++++
 chocula_tool.py                 | 123 ------------------------------
 sources.toml                    |   8 ++-
 tests/test_directories.py       |   4 +-
 11 files changed, 386 insertions(+), 282 deletions(-)
 create mode 100755 chocula/__main__.py
 create mode 100644 chocula/directories/__init__.py
 create mode 100644 chocula/directories/sim.py
 delete mode 100755 chocula_tool.py

diff --git a/Makefile b/Makefile
index 53e0c6d..977c80e 100644
--- a/Makefile
+++ b/Makefile
@@ -4,39 +4,37 @@ SNAPSHOTITEM := $(shell grep ia_item sources.toml | cut -f2 -d'"')
 
 .PHONY: help
 help: ## Print info about all commands
-	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
+	@echo "Commands:"
+	@echo
+	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "    \033[01;32m%-20s\033[0m %s\n", $$1, $$2}'
 
 .PHONY: test
 test: ## Run all tests and lints
 	pipenv run pytest
 	pipenv run mypy *.py chocula/*.py chocula/*/*.py --ignore-missing-imports
 
-#.PHONY: database
-#database: ## Build database from sources
-#	@if [ ! -f data/ISSN-to-ISSN-L.txt ]; then echo "You must run 'make fetch-sources' first"; exit -1; fi
-#	pipenv run ./chocula_tool.py everything
-
-#data/container_stats.json:
-#	cat data/container_export.json | jq .issnl -r | sort -u > /tmp/container_issnl.tsv
-#	cat /tmp/container_issnl.tsv | parallel -j10 curl -s 'https://fatcat.wiki/container/issnl/{}/stats.json' | jq -c . > /tmp/container_stats.json
-#	mv /tmp/container_stats.json data
+data/container_stats.json:
+	mkdir -p data
+	cat data/container_export.json | jq .issnl -r | sort -u > /tmp/container_issnl.tsv
+	cat /tmp/container_issnl.tsv | parallel -j10 curl -s 'https://fatcat.wiki/container/issnl/{}/stats.json' | jq -c . > /tmp/container_stats.json
+	mv /tmp/container_stats.json data
 
-#.PHONY: container-stats
-#container-stats: data/container_stats.json
-#	wc -l data/container_stats.json
-#	@echo
-#	@echo Done
+.PHONY: container-stats
+container-stats: data/container_stats.json
+	wc -l data/container_stats.json
+	@echo
+	@echo Done
 
-#data/homepage_status.json:
-#	pipenv run ./chocula.py export_urls | shuf > /tmp/chocula_urls_to_crawl.tsv
-#	pipenv run parallel -j10 --bar --pipepart -a /tmp/chocula_urls_to_crawl.shuf.tsv ./check_issn_urls.py > /tmp/homepage_status.json
-#	cp /tmp/homepage_status.json data/
+data/homepage_status.json:
+	pipenv run python -m chocula export_urls | shuf > /tmp/chocula_urls_to_crawl.shuf.tsv
+	pipenv run parallel -j10 --bar --pipepart -a /tmp/chocula_urls_to_crawl.shuf.tsv ./check_issn_urls.py > /tmp/homepage_status.json
+	cp /tmp/homepage_status.json data/
 
-#.PHONY: homepage-status
-#homepage-status: data/homepage_status.json
-#	wc -l data/homepage-status.json
-#	@echo
-#	@echo Done
+.PHONY: homepage-status
+homepage-status: data/homepage_status.json
+	wc -l data/homepage_status.json
+	@echo
+	@echo Done
 
 .PHONY: fetch-sources
 fetch-sources: ## Download existing snapshot versions of all sources from archive.org
@@ -45,6 +43,7 @@ fetch-sources: ## Download existing snapshot versions of all sources from archiv
 
 .PHONY: update-sources
 update-sources: ## Download new versions of updatable sources
+	@# TODO: refactor to be individual targets-per-file (see fatcat-covid19 example)
 	mkdir -p data/$(TODAY)
 	wget -c "https://www.issn.org/wp-content/uploads/2014/03/issnltables.zip" -O /tmp/issnltables.$(TODAY).zip
 	unzip -p /tmp/issnltables.$(TODAY).zip "*.ISSN-to-ISSN-L.txt" > /tmp/ISSN-to-ISSN-L.$(TODAY).txt
@@ -58,11 +57,17 @@ update-sources: ## Download new versions of updatable sources
 	@echo
 	@echo "Successfully updated for date (UTC): $(TODAY)"
 
-#.PHONY: upload-sources
-#upload-sources: ## Upload an updated snapshot of sources to archive.org
-#	ia upload --checksum chocula-sources-$(TODAY) data/*.tsv data/*.csv data/*.json data/*.txt
+.PHONY: upload-sources
+upload-sources: update-sources ## Upload most recent update-able sources to a new IA item
+	ia upload --checksum chocula-sources-snapshot-$(TODAY) data/$(TODAY)/*
+	# TODO: ia upload --checksum chocula-sources-$(TODAY) data/*.tsv data/*.csv data/*.json data/*.txt
 
 #.PHONY: upload-snapshot
 #upload-snapshot: ## Upload an sqlite snapshot to archive.org
 #	ia upload --checksum --no-derive chocula-snapshot-$(TODAY) chocula.sqlite3 README.md extra/count_chocula.jpg
+
+.PHONY: database
+database: ## Build database from sources
+	@if [ ! -f data/ISSN-to-ISSN-L.txt ]; then echo "You must run 'make fetch-sources' first"; exit -1; fi
+	pipenv run python -m chocula everything
diff --git a/TODO.md b/TODO.md
index 2d0c7e3..a6814a0 100644
--- a/TODO.md
+++ b/TODO.md
@@ -1,33 +1,66 @@
+2020-05-06
+x python3.7
+x type annotations / dataclasses
+x "update-sources"
+    => makefile
+- run "everything" successfully
+- "upload-sources"
+    => to archive.org, with datetime
+- "fetch-sources"
+    => all snapshots in a single ia item, with datetime
+- scielo journal metadata
+- kbart loading
+- "platform" column in database
+- rewrite README
+
+- flag to delete old table/rows when loading (?)
+- "loaders" not directories?
+- makefile
+- black
+- refactor most code into module directory
+- tests
+    => index process
+- update upstreams
+
+refactors:
+- "directory" command with directory as arg
+- "kbart" command with directory as arg
+- "load" command with directory as arg
+
+https://isaw.nyu.edu/publications/awol-index/
+
 ## Chocula
 
+- fully automated updates, luigi/gluish style
+    => downloads/uploads source metadata files
+    => outputs config file for chocula run
+    => runs chocula everything
+
 priorities:
-x fraction/which are pointing to wayback
 - coverage stats, particularly for longtail
-x wikidata linkage (prep for wikimania)
 - "still in print" flag
 - clean out invalid ISSN-L from fatcat
 - don't list dead URLs in fatcat
 - summary report of some of above
-- update all fatcat (wikidata QID, urls, fixed ISSN-L, etc)
 - when updating fatcat:
     if title is "blah, Proceedings of the", set type to proceedings and re-write title
     if title like "Workshop on", set type
 
 source improvements:
 - entrez: "NLM Unique Id"
 - JURN: finish
 - crossref: empty string identifiers?
+- scielo: https://scielo.org/en/journals/list-by-alphabetical-order/?export=csv
+- https://www.arc.gov.au/excellence-research-australia (journal list)
 - public scopus list (?)
 - scrape/munge public clarivate dumps
 - import JURN into fatcat (one way or another)
     => try to title match and get ISSN-L
     => manual lookups for remainders?
-- dump json
 - "GOLD" importer (for scopus/WoS)
 - check that all fields actually getting imported reasonably
-- homepage crawl/status script
 - could poll portal.issn.org like:
     https://portal.issn.org/resource/ISSN/1561-7645?format=json
@@ -40,7 +73,4 @@ source improvements:
 - update_url_status (needs re-write)
 - log out index issues (duplicate ISSN-L, etc) to a file
 - validate against GOLD OA list
-- decide what to do with JURN... match? fuzzy match? create missing fatcat?
-- lots of bogus ISSN-L, like 9999-9999 or 0000-0000. should both validate
-  check digit and require an ISSN-L to actually exist.
diff --git a/chocula/__init__.py b/chocula/__init__.py
index a0947e1..440e7a5 100644
--- a/chocula/__init__.py
+++ b/chocula/__init__.py
@@ -1,15 +1,5 @@
 
 from chocula.config import ChoculaConfig
+from chocula.directories import *
 from chocula.database import ChoculaDatabase, IssnDatabase
-from chocula.directories.crossref import CrossrefLoader
-from chocula.directories.doaj import DoajLoader
-from chocula.directories.entrez import EntrezLoader
-from chocula.directories.ezb import EzbLoader
-from chocula.directories.gold_oa import GoldOALoader
-from chocula.directories.norwegian import NorwegianLoader
-from chocula.directories.openapc import OpenAPCLoader
-from chocula.directories.road import RoadLoader
-from chocula.directories.sherpa_romeo import SherpaRomeoLoader
-from chocula.directories.szczepanski import SzczepanskiLoader
-from chocula.directories.wikidata import WikidataLoader
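With chocula/__init__.py now re-exporting everything from chocula.directories, adding a new metadata source is a matter of subclassing DirectoryLoader (see chocula/common.py below) and appending the class to ALL_CHOCULA_DIR_CLASSES. A minimal sketch of such a loader, for illustration only: the "example" slug, the [example] section it assumes in sources.toml, and the CSV column names are hypothetical; the DirectoryInfo fields follow the SimLoader added in this commit.

    # Hypothetical loader sketch; not part of this commit.
    import csv
    from typing import Iterable, Optional

    from chocula.common import DirectoryLoader
    from chocula.database import DirectoryInfo


    class ExampleLoader(DirectoryLoader):

        source_slug = "example"

        def open_file(self) -> Iterable:
            # assumes an [example] section in sources.toml providing a filepath
            return csv.DictReader(open(self.config.example.filepath))

        def parse_record(self, row) -> Optional[DirectoryInfo]:
            # rows without an ISSN yield None and are simply not inserted
            if not row.get("ISSN"):
                return None
            return DirectoryInfo(
                directory_slug=self.source_slug,
                raw_issn=row["ISSN"],
                name=row.get("Title"),
            )

Registering the class in ALL_CHOCULA_DIR_CLASSES (chocula/directories/__init__.py, below) automatically gives it an index_example subcommand and includes it in the everything run.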
diff --git a/chocula/__main__.py b/chocula/__main__.py
new file mode 100755
index 0000000..21f3976
--- /dev/null
+++ b/chocula/__main__.py
@@ -0,0 +1,155 @@
+#!/usr/bin/env python3
+
+"""
+Count Chocula - online serials metadata and stats
+
+    "one, two, three, un-preserved web-native open-access long-tail indie
+    journals, hah, hah, hah!"
+
+    (yeah, I know, this name isn't very good)
+    (see also: https://teamyacht.com/ernstchoukula.com/Ernst-Choukula.html)
+
+Commands:
+
+    everything
+    init_db
+    summarize
+    export
+    export_fatcat
+
+    index_doaj
+    index_road
+    index_crossref
+    index_entrez
+    index_norwegian
+    index_szczepanski
+    index_ezb
+    index_wikidata
+    index_openapc
+    index_sim
+
+    load_fatcat_containers
+    load_fatcat_stats
+    load_homepage_status
+
+    export_urls
+
+Future commands:
+
+    index_jurn
+    index_datacite
+    preserve_kbart --keeper SLUG
+    preserve_sim
+
+See TODO.md for more work-in-progress
+"""
+
+import sys
+import csv
+import argparse
+
+from chocula import ChoculaDatabase, ChoculaConfig, IssnDatabase, ALL_CHOCULA_DIR_CLASSES
+
+
+def run_everything(config, database):
+
+    database.init_db()
+    for cls in ALL_CHOCULA_DIR_CLASSES:
+        loader = cls(config)
+        counts = loader.index_file(database)
+        print(counts)
+
+    # XXX: TODO:
+    database.load_fatcat_containers(config)
+    database.load_fatcat_stats(config)
+    # XXX: TODO:
+    #self.preserve_kbart('lockss', LOCKSS_FILE)
+    #self.preserve_kbart('clockss', CLOCKSS_FILE)
+    #self.preserve_kbart('portico', PORTICO_FILE)
+    #self.preserve_kbart('jstor', JSTOR_FILE)
+    #self.preserve_sim(args)
+    database.load_homepage_status(config)
+    database.summarize()
+    print("### Done with everything!")
+
+def run_index(config, database, cls):
+    loader = cls(config)
+    counts = loader.index_file(database)
+    return counts
+
+def main():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    subparsers = parser.add_subparsers()
+
+    parser.add_argument("--db-file",
+        help="sqlite database file",
+        default='chocula.sqlite',
+        type=str)
+
+    sub = subparsers.add_parser('everything',
+        help="run all the commands")
+    sub.set_defaults(func='everything')
+
+    sub = subparsers.add_parser('init_db',
+        help="create sqlite3 output file and tables")
+    sub.set_defaults(func='init_db')
+
+    sub = subparsers.add_parser('summarize',
+        help="aggregate metadata from all tables into 'journals' table")
+    sub.set_defaults(func='summarize')
+
+    sub = subparsers.add_parser('export',
+        help="dump JSON output")
+    sub.set_defaults(func='export')
+
+    sub = subparsers.add_parser('export_fatcat',
+        help="dump JSON output in a format that can load into fatcat")
+    sub.set_defaults(func='export_fatcat')
+
+    for cls in ALL_CHOCULA_DIR_CLASSES:
+        sub = subparsers.add_parser('index_{}'.format(cls.source_slug),
+            help="load metadata from {}".format(cls.source_slug))
+        sub.set_defaults(func='index_{}'.format(cls.source_slug), index_cls=cls)
+
+    sub = subparsers.add_parser('load_fatcat_containers',
+        help="load fatcat container metadata")
+    sub.set_defaults(func='load_fatcat_containers')
+
+    sub = subparsers.add_parser('load_fatcat_stats',
+        help="update container-level stats from JSON file")
+    sub.set_defaults(func='load_fatcat_stats')
+
+    sub = subparsers.add_parser('export_urls',
+        help="dump homepage URLs (eg, to crawl for status)")
+    sub.set_defaults(func='export_urls')
+
+    sub = subparsers.add_parser('load_homepage_status',
+        help="import homepage URL crawl status")
+    sub.set_defaults(func='load_homepage_status')
+
+    args = parser.parse_args()
+    if not args.__dict__.get("func"):
+        print("tell me what to do! (try --help)")
+        sys.exit(-1)
+
+    config = ChoculaConfig.from_file()
+    if args.func.startswith('index_') or args.func in ('everything', 'summarize',):
+        issn_db = IssnDatabase(config.issnl.filepath)
+    else:
+        issn_db = None
+    cdb = ChoculaDatabase(args.db_file, issn_db)
+    if args.func == 'everything':
+        run_everything(config, cdb)
+    elif args.func.startswith('index_'):
+        print(run_index(config, cdb, args.index_cls))
+    elif args.func.startswith('load_'):
+        func = getattr(cdb, args.func)
+        print(func(config))
+    else:
+        func = getattr(cdb, args.func)
+        print(func(), file=sys.stderr)
+
+if __name__ == '__main__':
+    main()
+
diff --git a/chocula/common.py b/chocula/common.py
index 54856c9..f515e6f 100644
--- a/chocula/common.py
+++ b/chocula/common.py
@@ -33,3 +33,30 @@ class DirectoryLoader():
         cur.close()
         db.db.commit()
         return counts
+
+class KbartLoader():
+
+    source_slug: str = "GENERIC"
+
+    def __init__(self, config: ChoculaConfig):
+        self.config = config
+
+    def open_file(self) -> Iterable:
+        raise NotImplementedError()
+
+    def parse_record(self, record) -> Optional[DirectoryInfo]:
+        raise NotImplementedError()
+
+    def index_file(self, db) -> Counter:
+        print(f"##### Loading {self.source_slug} KBART...", file=sys.stderr)
+        counts: Counter = Counter()
+        cur = db.db.cursor()
+        for record in self.open_file():
+            counts['total'] += 1
+            info = self.parse_record(record)
+            if info:
+                status = db.insert_directory(info, cur=cur)
+                counts[status] += 1
+        cur.close()
+        db.db.commit()
+        return counts
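The new KbartLoader deliberately leaves open_file() and parse_record() abstract, and run_everything() still has the preserve_kbart calls commented out, so no concrete subclass ships in this commit. As a hedged sketch of where this appears headed, here is one plausible keeper-specific subclass; the "lockss" slug, the [lockss] config entry, and the column names (taken from the standard KBART tab-separated format) are assumptions, not code from this repo.

    # Hypothetical KBART loader sketch; not part of this commit.
    import csv
    from typing import Iterable, Optional

    from chocula.common import KbartLoader
    from chocula.database import DirectoryInfo


    class LockssKbartLoader(KbartLoader):

        source_slug = "lockss"

        def open_file(self) -> Iterable:
            # KBART reports are tab-separated with a single header row
            return csv.DictReader(open(self.config.lockss.filepath), delimiter="\t")

        def parse_record(self, record) -> Optional[DirectoryInfo]:
            # prefer the electronic ISSN, fall back to print
            issn = record.get("online_identifier") or record.get("print_identifier")
            if not issn:
                return None
            return DirectoryInfo(
                directory_slug=self.source_slug,
                raw_issn=issn,
                name=record.get("publication_title"),
            )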
diff --git a/chocula/database.py b/chocula/database.py
index f6a000a..3efa725 100644
--- a/chocula/database.py
+++ b/chocula/database.py
@@ -15,6 +15,7 @@
 import tldextract
 import ftfy
 import stdnum.issn
 
+from chocula import *
 from chocula.util import *
 
@@ -271,7 +272,7 @@ class ChoculaDatabase():
 
         return "inserted"
 
-    def parse_kbart(self, name, path):
+    def parse_kbart(self, name, path) -> Counter:
         """
         Transforms a KBART file into a dict of dicts; but basically a list of
         JSON objects, one per journal. KBART files can have multiple rows per
@@ -318,69 +319,13 @@
                 else:
                     new_spans = [[start, end]]
                 d['year_spans'] = merge_spans(old_spans, new_spans)
-        print(counts)
-        return kbart_dict
+        return counts
 
-
-    def index_sim(self, args):
-        path = args.input_file or SIM_FILE
-        print("##### Loading SIM Metadata...")
-        #NA Pub Cat ID,Title,Publisher,ISSN,Impact Rank,Total Cities,Journal Impact Factor,Eigenfact or Score,First Volume,Last Volume,NA Gaps,"Scholarly / Peer-\n Reviewed","Peer-\n Reviewed",Pub Type,Pub Language,Subjects
-        reader = csv.DictReader(open(path))
-        counts = Counter()
-        cur = self.db.cursor()
-        for row in reader:
-            if not row['ISSN'] or row['ISSN'] == "NULL":
-                counts['no-issn'] += 1
-                continue
-            issnl, status = self.add_issn(
-                'ia_sim',
-                raw_issn=row['ISSN'][:9],
-                name=row['Title'],
-                publisher=row['Publisher'],
-                extra=extra,
-            )
-            counts[status] += 1
-            if not issnl:
-                continue
-            d = self.data[issnl]
-            sim = dict()
-            sim['id'] = row['NA Pub Cat ID']
-            first_year = row['First Volume']
-            if first_year:
-                first_year = int(first_year)
-                sim['first_year'] = int(row['First Volume'])
-            else:
-                first_year = None
-            last_year = row['Last Volume']
-            if last_year:
-                last_year = int(last_year)
-                sim['last_year'] = last_year
-            else:
-                last_year = None
-            gaps = [int(g) for g in row['NA Gaps'].split(';') if g.strip()]
-            if gaps:
-                sim['gaps'] = gaps
-            if first_year and last_year:
-                sim['year_spans'] = gaps_to_spans(first_year, last_year, gaps)
-            if row['Pub Language']:
-                self.add_lang(issnl, row['Pub Language'])
-            # TODO: 'Pub Type'
-            all_keys = list(sim.keys())
-            for k in all_keys:
-                if not sim[k]:
-                    sim.pop(k)
-            self.data[issnl]['sim'] = sim
-        cur.close()
-        self.db.commit()
-        print(counts)
-
-    def update_url_status(self, args):
-        path = args.input_file or IA_CRAWL_FILE
+    def load_homepage_status(self, config: ChoculaConfig) -> Counter:
         print("##### Loading IA Homepage Crawl Results...")
         counts = Counter()
         cur = self.db.cursor()
-        for row in open(path, 'r'):
+        for row in open(config.homepage_status.filepath, 'r'):
             if not row.strip():
                 continue
             row = json.loads(row)
@@ -405,13 +350,12 @@
             counts['updated'] += 1
         cur.close()
         self.db.commit()
-        print(counts)
+        return counts
 
-    def load_fatcat(self, args):
-        path = args.input_file or FATCAT_CONTAINER_FILE
+    def load_fatcat_containers(self, config: ChoculaConfig) -> Counter:
         print("##### Loading Fatcat Container Entities...")
         # JSON
-        json_file = open(path, 'r')
+        json_file = open(config.fatcat_containers.filepath, 'r')
         counts = Counter()
         cur = self.db.cursor()
         for row in json_file:
@@ -445,22 +389,25 @@
                 ))
             except sqlite3.IntegrityError as ie:
                 if str(ie).startswith("UNIQUE"):
-                    return None, "duplicate-issnl"
-                raise ie
+                    counts["existing"] += 1
+                    continue
+                else:
+                    raise ie
             counts['inserted'] += 1
             if row.get('issnl'):
                 urls = extra.get('urls', [])
                 for url in urls:
-                    self.add_url(row['issnl'], url)
+                    homepage = HomepageUrl.from_url(url)
+                    if homepage:
+                        self.insert_homepage(row.get('issnl'), homepage, cur)
         cur.close()
         self.db.commit()
-        print(counts)
+        return counts
 
-    def load_fatcat_stats(self, args):
-        path = args.input_file or FATCAT_STATS_FILE
+    def load_fatcat_stats(self, config: ChoculaConfig) -> Counter:
         print("##### Loading Fatcat Container Stats...")
         # JSON
-        json_file = open(path, 'r')
+        json_file = open(config.fatcat_stats.filepath, 'r')
         counts = Counter()
         cur = self.db.cursor()
         for row in json_file:
@@ -479,18 +426,21 @@
             counts['updated'] += 1
         cur.close()
         self.db.commit()
-        print(counts)
+        return counts
 
-    def export_urls(self, args):
+    def export_urls(self) -> Counter:
+        counts = Counter()
         cur = self.db.cursor()
         self.db.row_factory = sqlite3.Row
         cur = self.db.execute("SELECT issnl, url FROM homepage;")
         for hrow in cur:
             assert(hrow['url'])
             assert(len(hrow['url'].split()) == 1)
+            counts['total'] += 1
             print('\t'.join((hrow['issnl'], hrow['url'])))
+        return counts
 
-    def summarize(self, args):
+    def summarize(self) -> Counter:
         print("##### Summarizing Everything...")
         counts = Counter()
         cur = self.db.cursor()
@@ -506,7 +456,7 @@
             out = dict()
 
             # check if ISSN-L is good. this is here because of fatcat import
-            out['known_issnl'] = (self.issn2issnl(issnl) == issnl)
+            out['known_issnl'] = (self.issn_db.issn2issnl(issnl) == issnl)
             if not out['known_issnl']:
                 counts['unknown-issnl'] += 1
             out['valid_issnl'] = stdnum.issn.is_valid(issnl)
@@ -544,8 +494,8 @@
                     out['is_oa'] = True
                 if irow['slug'] == 'sherpa_romeo':
                     extra = json.loads(irow['extra'])
-                    out['sherpa_color'] = extra['color']
-                    if extra['color'] == 'green':
+                    out['sherpa_color'] = extra['sherpa_romeo']['color']
+                    if extra['sherpa_romeo']['color'] == 'green':
                         out['is_oa'] = True
 
             # filter out "NA" ISSNs
@@ -624,33 +574,9 @@
             ))
         cur.close()
         self.db.commit()
-        print(counts)
-
-    def everything(self, args):
-        self.init_db(args)
-        self.index_doaj(args)
-        self.index_norwegian(args)
-        self.index_crossref(args)
-        self.index_sherpa_romeo(args)
-        self.index_road(args)
-        self.index_entrez(args)
-        self.index_ezb(args)
-        self.index_szczepanski(args)
-        self.index_gold_oa(args)
-        self.index_openapc(args)
-        self.index_wikidata(args)
-        self.load_fatcat(args)
-        self.load_fatcat_stats(args)
-        #self.preserve_kbart('lockss', LOCKSS_FILE)
-        #self.preserve_kbart('clockss', CLOCKSS_FILE)
-        #self.preserve_kbart('portico', PORTICO_FILE)
-        #self.preserve_kbart('jstor', JSTOR_FILE)
-        #self.preserve_sim(args)
-        self.update_url_status(args)
-        self.summarize(args)
-        print("### Done with everything!")
-
-    def export(self, args):
+        return counts
+
+    def export(self) -> Counter:
         def dict_factory(cursor, row):
             d = {}
             for idx, col in enumerate(cursor.description):
@@ -662,8 +588,9 @@
         for row in cur.execute('SELECT * FROM journal'):
             print(json.dumps(row))
             counts['total'] += 1
+        return counts
 
-    def export_fatcat(self, args):
+    def export_fatcat(self):
         counts = Counter()
         self.db.row_factory = sqlite3.Row
         cur = self.db.cursor()
@@ -748,13 +675,14 @@
                 ezb = json.loads(drow['extra'])
                 extra['ezb'] = dict(ezb_id=drow['identifier'], color=ezb['ezb_color'])
             if drow['slug'] == 'szczepanski':
-                # XXX: pull from record
-                extra['szczepanski'] = dict(as_of=config.szczepanski.date)
+                # TODO: what to put here?
+                extra['szczepanski'] = drow['extra']
             if drow['slug'] == 'doaj':
                 extra['doaj'] = json.loads(drow['extra'])
 
             out['extra'] = extra
             print(json.dumps(out))
+        return counts
 
     def init_db(self):
         print("### Creating Database...", file=sys.stderr)
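Both parse_kbart() above and the SIM loader below boil per-volume holdings down to inclusive [start, end] year spans, via merge_spans() and gaps_to_spans() from chocula.util (not shown in this patch). The standalone sketch below illustrates the merge semantics the code above relies on; it is an assumption-laden illustration, not the actual chocula.util implementation.

    # Illustration of year-span merging; chocula.util's merge_spans() may differ.
    from typing import List

    def merge_year_spans(spans: List[List[int]]) -> List[List[int]]:
        """Merge overlapping or directly adjacent [start, end] year spans."""
        merged: List[List[int]] = []
        for start, end in sorted(spans):
            if merged and start <= merged[-1][1] + 1:
                # extends or overlaps the previous span; fold it in
                merged[-1][1] = max(merged[-1][1], end)
            else:
                merged.append([start, end])
        return merged

    # [1950, 1960] and [1961, 1970] are adjacent, so they collapse into one span
    assert merge_year_spans([[1950, 1960], [1961, 1970], [1990, 1995]]) == [
        [1950, 1970],
        [1990, 1995],
    ]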
diff --git a/chocula/directories/__init__.py b/chocula/directories/__init__.py
new file mode 100644
index 0000000..4bed696
--- /dev/null
+++ b/chocula/directories/__init__.py
@@ -0,0 +1,19 @@
+
+from chocula.directories.crossref import CrossrefLoader
+from chocula.directories.doaj import DoajLoader
+from chocula.directories.entrez import EntrezLoader
+from chocula.directories.ezb import EzbLoader
+from chocula.directories.gold_oa import GoldOALoader
+from chocula.directories.norwegian import NorwegianLoader
+from chocula.directories.openapc import OpenAPCLoader
+from chocula.directories.road import RoadLoader
+from chocula.directories.sherpa_romeo import SherpaRomeoLoader
+from chocula.directories.sim import SimLoader
+from chocula.directories.szczepanski import SzczepanskiLoader
+from chocula.directories.wikidata import WikidataLoader
+
+ALL_CHOCULA_DIR_CLASSES = [
+    CrossrefLoader, DoajLoader, EntrezLoader, EzbLoader, GoldOALoader,
+    NorwegianLoader, OpenAPCLoader, RoadLoader, SherpaRomeoLoader,
+    SzczepanskiLoader, WikidataLoader, SimLoader,
+]
diff --git a/chocula/directories/sim.py b/chocula/directories/sim.py
new file mode 100644
index 0000000..c0c02df
--- /dev/null
+++ b/chocula/directories/sim.py
@@ -0,0 +1,71 @@
+
+from typing import Iterable, Optional, Dict, Any
+import csv
+
+from chocula.util import clean_str, parse_mimetypes, parse_country, parse_lang, PLATFORM_MAP, gaps_to_spans
+from chocula.common import DirectoryLoader
+from chocula.database import DirectoryInfo, HomepageUrl
+
+
+class SimLoader(DirectoryLoader):
+
+    source_slug = "sim"
+
+    def open_file(self) -> Iterable:
+        return csv.DictReader(open(self.config.sim.filepath))
+
+    def parse_record(self, row) -> Optional[DirectoryInfo]:
+
+        """
+        NA Pub Cat ID
+        Title
+        Publisher
+        ISSN
+        Impact Rank
+        Total Cities
+        Journal Impact Factor
+        Eigenfact or Score
+        First Volume
+        Last Volume
+        NA Gaps
+        "Scholarly / Peer-\n Reviewed"
+        "Peer-\n Reviewed"
+        Pub Type
+        Pub Language
+        Subjects
+        """
+        # TODO: 'Pub Type'
+
+        extra = {}
+        first_year = row['First Volume']
+        if first_year:
+            first_year = int(first_year)
+            extra['first_year'] = int(row['First Volume'])
+        else:
+            first_year = None
+        last_year = row['Last Volume']
+        if last_year:
+            last_year = int(last_year)
+            extra['last_year'] = last_year
+        else:
+            last_year = None
+        gaps = [int(g) for g in row['NA Gaps'].split(';') if g.strip()]
+        if gaps:
+            extra['gaps'] = gaps
+        if first_year and last_year:
+            extra['year_spans'] = gaps_to_spans(first_year, last_year, gaps)
+        extra['scholarly_peer_reviewed'] = row["Scholarly / Peer-\nReviewed"]
+        extra['peer_reviewed'] = row["Peer-\nReviewed"]
+        extra['pub_type'] = clean_str(row["Pub Type"])
+
+        info = DirectoryInfo(
+            directory_slug=self.source_slug,
+            name=clean_str(row['Title']),
+            publisher=clean_str(row['Publisher']),
+            raw_issn=row['ISSN'][:9],
+            custom_id=row.get('NA Pub Cat ID').strip() or None,
+            langs=[parse_lang(row['Pub Language'])],
+            extra=extra,
+        )
+        return info
+
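To make SimLoader concrete, here is an invented sample row (every value below is fake, including the ISSN). The year_spans value is a guess at gaps_to_spans() semantics — the gap years 1960 and 1961 splitting the 1950–1995 run — since that helper's implementation is not part of this patch.

    # Hypothetical input row for SimLoader.parse_record(); values are invented.
    row = {
        "NA Pub Cat ID": "p0000",
        "Title": "Example Journal of Examples",
        "Publisher": "Example Press",
        "ISSN": "0000-0000",
        "First Volume": "1950",
        "Last Volume": "1995",
        "NA Gaps": "1960;1961",
        "Scholarly / Peer-\nReviewed": "Yes",
        "Peer-\nReviewed": "Yes",
        "Pub Type": "Scholarly Journals",
        "Pub Language": "English",
    }
    # SimLoader(config).parse_record(row) would return a DirectoryInfo with
    # raw_issn="0000-0000", langs=[parse_lang("English")], and extra roughly:
    #   {"first_year": 1950, "last_year": 1995, "gaps": [1960, 1961],
    #    "year_spans": [[1950, 1959], [1962, 1995]],   # assumed semantics
    #    "scholarly_peer_reviewed": "Yes", "peer_reviewed": "Yes",
    #    "pub_type": "Scholarly Journals"}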
diff --git a/chocula_tool.py b/chocula_tool.py
deleted file mode 100755
index 7dfe80e..0000000
--- a/chocula_tool.py
+++ /dev/null
@@ -1,123 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-Count Chocula - online serials metadata and stats
-
-    "one, two, three, un-preserved web-native open-access long-tail indie
-    journals, hah, hah, hah!"
-
-    (yeah, I know, this name isn't very good)
-    (see also: https://teamyacht.com/ernstchoukula.com/Ernst-Choukula.html)
-
-Commands:
-
-    everything
-    init_db
-    summarize
-    export
-    export_fatcat
-
-    index_doaj
-    index_road
-    index_crossref
-    index_entrez
-    index_norwegian
-    index_szczepanski
-    index_ezb
-    index_wikidata
-    index_openapc
-
-    load_fatcat
-    load_fatcat_stats
-
-    export_urls
-    update_url_status
-
-Future commands:
-
-    index_jurn
-    index_datacite
-    preserve_kbart --keeper SLUG
-    preserve_sim
-
-See TODO.md for more work-in-progress
-"""
-
-import sys
-import csv
-import argparse
-
-from chocula import ChoculaDatabase, ChoculaConfig
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    subparsers = parser.add_subparsers()
-
-    parser.add_argument("--db-file",
-        help="run in mode that considers only terminal HTML success",
-        default='chocula.sqlite',
-        type=str)
-    parser.add_argument("--input-file",
-        help="override default input file path",
-        default=None,
-        type=str)
-
-    sub = subparsers.add_parser('everything',
-        help="run all the commands")
-    sub.set_defaults(func='everything')
-
-    sub = subparsers.add_parser('init_db',
-        help="create sqlite3 output file and tables")
-    sub.set_defaults(func='init_db')
-
-    sub = subparsers.add_parser('summarize',
-        help="aggregate metadata from all tables into 'journals' table")
-    sub.set_defaults(func='summarize')
-
-    sub = subparsers.add_parser('export',
-        help="dump JSON output")
-    sub.set_defaults(func='export')
-
-    sub = subparsers.add_parser('export_fatcat',
-        help="dump JSON output in a format that can load into fatcat")
-    sub.set_defaults(func='export_fatcat')
-
-    # TODO: 'jurn'
-    for ind in ('doaj', 'road', 'crossref', 'entrez', 'norwegian', 'szczepanski', 'ezb', 'gold_oa', 'wikidata', 'openapc'):
-        sub = subparsers.add_parser('index_{}'.format(ind),
-            help="load metadata from {}".format(ind))
-        sub.set_defaults(func='index_{}'.format(ind))
-
-    sub = subparsers.add_parser('load_fatcat',
-        help="load fatcat container metadata")
-    sub.set_defaults(func='load_fatcat')
-
-    sub = subparsers.add_parser('load_fatcat_stats',
-        help="update container-level stats from JSON file")
-    sub.set_defaults(func='load_fatcat_stats')
-
-    sub = subparsers.add_parser('export_urls',
-        help="dump homepage URLs (eg, to crawl for status)")
-    sub.set_defaults(func='export_urls')
-
-    sub = subparsers.add_parser('update_url_status',
-        help="import homepage URL crawl status")
-    sub.set_defaults(func='update_url_status')
-
-    args = parser.parse_args()
-    if not args.__dict__.get("func"):
-        print("tell me what to do! (try --help)")
-        sys.exit(-1)
-
-    config = ChoculaConfig.from_file()
-    cdb = ChoculaDatabase(args.db_file)
-    if args.func.startswith('index_') or args.func in ('everything','summarize',):
-        cdb.read_issn_map_file(config.issnl.filepath)
-    func = getattr(cdb, args.func)
-    func(args)
-
-if __name__ == '__main__':
-    main()
-
diff --git a/sources.toml b/sources.toml
index e824e39..44ad219 100644
--- a/sources.toml
+++ b/sources.toml
@@ -1,6 +1,6 @@
 
 [snapshot]
-ia_item = "chocula-sources-snapshot-2020-05-08"
+ia_item = "chocula-sources-snapshot-2020-05-29"
 
 [issnl]
 date = "2019-12-20"
@@ -109,7 +109,11 @@ filename = "openapc.csv"
 original_url = "https://github.com/OpenAPC/openapc-de/blob/master/data/apc_de.csv"
 mirror_url = "https://archive.org/download/openapc-dataset"
 
-[fatcat_container]
+[sim]
+date = "2019"
+filename = "sim_master_title_metadata.csv"
+
+[fatcat_containers]
 date = "2019-12-13"
 filename = "container_export.json"
diff --git a/tests/test_directories.py b/tests/test_directories.py
index 37c6109..90856bc 100644
--- a/tests/test_directories.py
+++ b/tests/test_directories.py
@@ -19,9 +19,7 @@ def database(issn_db):
 
 def test_all(config, database):
 
-    for cls in (CrossrefLoader, DoajLoader, EntrezLoader, EzbLoader,
-            GoldOALoader, NorwegianLoader, OpenAPCLoader, RoadLoader,
-            SherpaRomeoLoader, SzczepanskiLoader, WikidataLoader):
+    for cls in ALL_CHOCULA_DIR_CLASSES:
         loader = cls(config)
         counts = loader.index_file(database)
         assert counts['total'] >= 20
-- 
cgit v1.2.3