about summary refs log tree commit diff stats
path: root/chocula
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-06-01 17:01:20 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-06-01 17:01:20 -0700
commit 57db2db336c08031324e44b2d2880fbd4b6893c9 (patch)
tree f5ad462ab6b3e7d3ac7987049e8c604bd5ee9fbe /chocula
parent 08867f9b8de576f0831e6bb9f7b88acddcc31dee (diff)
download chocula-57db2db336c08031324e44b2d2880fbd4b6893c9.tar.gz
chocula-57db2db336c08031324e44b2d2880fbd4b6893c9.zip
'everything' at least partially working
Diffstat (limited to 'chocula')
-rw-r--r-- chocula/__init__.py 12
-rwxr-xr-x chocula/__main__.py 155
-rw-r--r-- chocula/common.py 27
-rw-r--r-- chocula/database.py 142
-rw-r--r-- chocula/directories/__init__.py 19
-rw-r--r-- chocula/directories/sim.py 71
6 files changed, 308 insertions, 118 deletions
diff --git a/chocula/__init__.py b/chocula/__init__.py
index a0947e1..440e7a5 100644
--- a/chocula/__init__.py
+++ b/chocula/__init__.py
@@ -1,15 +1,5 @@
from chocula.config import ChoculaConfig
+from chocula.directories import *
from chocula.database import ChoculaDatabase, IssnDatabase
-from chocula.directories.crossref import CrossrefLoader
-from chocula.directories.doaj import DoajLoader
-from chocula.directories.entrez import EntrezLoader
-from chocula.directories.ezb import EzbLoader
-from chocula.directories.gold_oa import GoldOALoader
-from chocula.directories.norwegian import NorwegianLoader
-from chocula.directories.openapc import OpenAPCLoader
-from chocula.directories.road import RoadLoader
-from chocula.directories.sherpa_romeo import SherpaRomeoLoader
-from chocula.directories.szczepanski import SzczepanskiLoader
-from chocula.directories.wikidata import WikidataLoader
diff --git a/chocula/__main__.py b/chocula/__main__.py
new file mode 100755
index 0000000..21f3976
--- /dev/null
+++ b/chocula/__main__.py
@@ -0,0 +1,155 @@
+#!/usr/bin/env python3
+
+"""
+Count Chocula - online serials metadata and stats
+
+ "one, two, three, un-preserved web-native open-access long-tail indie
+ journals, hah, hah, hah!"
+
+ (yeah, I know, this name isn't very good)
+ (see also: https://teamyacht.com/ernstchoukula.com/Ernst-Choukula.html)
+
+Commands:
+
+ everything
+ init_db
+ summarize
+ export
+ export_fatcat
+
+ index_doaj
+ index_road
+ index_crossref
+ index_entrez
+ index_norwegian
+ index_szczepanski
+ index_ezb
+ index_wikidata
+ index_openapc
+ index_sim
+
+ load_fatcat_containers
+ load_fatcat_stats
+ load_homepage_status
+
+ export_urls
+
+Future commands:
+
+ index_jurn
+ index_datacite
+ preserve_kbart --keeper SLUG
+ preserve_sim
+
+See TODO.md for more work-in-progress
+"""
+
+import sys
+import csv
+import argparse
+
+from chocula import ChoculaDatabase, ChoculaConfig, IssnDatabase, ALL_CHOCULA_DIR_CLASSES
+
+
+def run_everything(config, database):
+
+ database.init_db()
+ for cls in ALL_CHOCULA_DIR_CLASSES:
+ loader = cls(config)
+ counts = loader.index_file(database)
+ print(counts)
+
+ # XXX: TODO:
+ database.load_fatcat_containers(config)
+ database.load_fatcat_stats(config)
+ # XXX: TODO:
+ #self.preserve_kbart('lockss', LOCKSS_FILE)
+ #self.preserve_kbart('clockss', CLOCKSS_FILE)
+ #self.preserve_kbart('portico', PORTICO_FILE)
+ #self.preserve_kbart('jstor', JSTOR_FILE)
+ #self.preserve_sim(args)
+ database.load_homepage_status(config)
+ database.summarize()
+ print("### Done with everything!")
+
+def run_index(config, database, cls):
+ loader = cls(config)
+ counts = loader.index_file(database)
+ print(counts)
+
+def main():
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ subparsers = parser.add_subparsers()
+
+ parser.add_argument("--db-file",
+ help="sqlite database file",
+ default='chocula.sqlite',
+ type=str)
+
+ sub = subparsers.add_parser('everything',
+ help="run all the commands")
+ sub.set_defaults(func='everything')
+
+ sub = subparsers.add_parser('init_db',
+ help="create sqlite3 output file and tables")
+ sub.set_defaults(func='init_db')
+
+ sub = subparsers.add_parser('summarize',
+ help="aggregate metadata from all tables into 'journals' table")
+ sub.set_defaults(func='summarize')
+
+ sub = subparsers.add_parser('export',
+ help="dump JSON output")
+ sub.set_defaults(func='export')
+
+ sub = subparsers.add_parser('export_fatcat',
+ help="dump JSON output in a format that can load into fatcat")
+ sub.set_defaults(func='export_fatcat')
+
+ for cls in ALL_CHOCULA_DIR_CLASSES:
+ sub = subparsers.add_parser('index_{}'.format(cls.source_slug),
+ help="load metadata from {}".format(cls.source_slug))
+ sub.set_defaults(func='index_{}'.format(cls.source_slug), index_cls=cls)
+
+ sub = subparsers.add_parser('load_fatcat_containers',
+ help="load fatcat container metadata")
+ sub.set_defaults(func='load_fatcat_containers')
+
+ sub = subparsers.add_parser('load_fatcat_stats',
+ help="update container-level stats from JSON file")
+ sub.set_defaults(func='load_fatcat_stats')
+
+ sub = subparsers.add_parser('export_urls',
+ help="dump homepage URLs (eg, to crawl for status)")
+ sub.set_defaults(func='export_urls')
+
+ sub = subparsers.add_parser('load_homepage_status',
+ help="import homepage URL crawl status")
+ sub.set_defaults(func='load_homepage_status')
+
+ args = parser.parse_args()
+ if not args.__dict__.get("func"):
+ print("tell me what to do! (try --help)")
+ sys.exit(-1)
+
+ config = ChoculaConfig.from_file()
+ if args.func.startswith('index_') or args.func in ('everything','summarize',):
+ issn_db = IssnDatabase(config.issnl.filepath)
+ else:
+ issn_db = None
+ cdb = ChoculaDatabase(args.db_file, issn_db)
+ if args.func == 'everything':
+ run_everything(config, cdb)
+ elif args.func.startswith('index_'):
+ print(run_index(config, cdb, args.index_cls))
+ elif args.func.startswith('load_'):
+ func = getattr(cdb, args.func)
+ print(func(config))
+ else:
+ func = getattr(cdb, args.func)
+ print(func(), file=sys.stderr)
+
+if __name__ == '__main__':
+ main()
+
diff --git a/chocula/common.py b/chocula/common.py
index 54856c9..f515e6f 100644
--- a/chocula/common.py
+++ b/chocula/common.py
@@ -33,3 +33,30 @@ class DirectoryLoader():
cur.close()
db.db.commit()
return counts
+
+class KbartLoader():
+
+ source_slug: str = "GENERIC"
+
+ def __init__(self, config: ChoculaConfig):
+ self.config = config
+
+ def open_file(self) -> Iterable:
+ raise NotImplementedError()
+
+ def parse_record(self, record) -> Optional[DirectoryInfo]:
+ raise NotImplementedError()
+
+ def index_file(self, db) -> Counter:
+ print(f"##### Loading {self.source_slug} KBART...", file=sys.stderr)
+ counts: Counter = Counter()
+ cur = db.db.cursor()
+ for record in self.open_file():
+ counts['total'] += 1
+ info = self.parse_record(record)
+ if info:
+ status = db.insert_directory(info, cur=cur)
+ counts[status] += 1
+ cur.close()
+ db.db.commit()
+ return counts
diff --git a/chocula/database.py b/chocula/database.py
index f6a000a..3efa725 100644
--- a/chocula/database.py
+++ b/chocula/database.py
@@ -15,6 +15,7 @@ import tldextract
import ftfy
import stdnum.issn
+from chocula import *
from chocula.util import *
@@ -271,7 +272,7 @@ class ChoculaDatabase():
return "inserted"
- def parse_kbart(self, name, path):
+ def parse_kbart(self, name, path) -> Counter:
"""
Transforms a KBART file into a dict of dicts; but basically a list of
JSON objects, one per journal. KBART files can have multiple rows per
@@ -318,69 +319,13 @@ class ChoculaDatabase():
else:
new_spans = [[start, end]]
d['year_spans'] = merge_spans(old_spans, new_spans)
- print(counts)
- return kbart_dict
+ return counts
-
- def index_sim(self, args):
- path = args.input_file or SIM_FILE
- print("##### Loading SIM Metadata...")
- #NA Pub Cat ID,Title,Publisher,ISSN,Impact Rank,Total Cities,Journal Impact Factor,Eigenfact or Score,First Volume,Last Volume,NA Gaps,"Scholarly / Peer-\n Reviewed","Peer-\n Reviewed",Pub Type,Pub Language,Subjects
- reader = csv.DictReader(open(path))
- counts = Counter()
- cur = self.db.cursor()
- for row in reader:
- if not row['ISSN'] or row['ISSN'] == "NULL":
- counts['no-issn'] += 1
- continue
- issnl, status = self.add_issn(
- 'ia_sim',
- raw_issn=row['ISSN'][:9],
- name=row['Title'],
- publisher=row['Publisher'],
- extra=extra,
- )
- counts[status] += 1
- if not issnl:
- continue
- d = self.data[issnl]
- sim = dict()
- sim['id'] = row['NA Pub Cat ID']
- first_year = row['First Volume']
- if first_year:
- first_year = int(first_year)
- sim['first_year'] = int(row['First Volume'])
- else:
- first_year = None
- last_year = row['Last Volume']
- if last_year:
- last_year = int(last_year)
- sim['last_year'] = last_year
- else:
- last_year = None
- gaps = [int(g) for g in row['NA Gaps'].split(';') if g.strip()]
- if gaps:
- sim['gaps'] = gaps
- if first_year and last_year:
- sim['year_spans'] = gaps_to_spans(first_year, last_year, gaps)
- if row['Pub Language']:
- self.add_lang(issnl, row['Pub Language'])
- # TODO: 'Pub Type'
- all_keys = list(sim.keys())
- for k in all_keys:
- if not sim[k]:
- sim.pop(k)
- self.data[issnl]['sim'] = sim
- cur.close()
- self.db.commit()
- print(counts)
-
- def update_url_status(self, args):
- path = args.input_file or IA_CRAWL_FILE
+ def load_homepage_status(self, config: ChoculaConfig) -> Counter:
print("##### Loading IA Homepage Crawl Results...")
counts = Counter()
cur = self.db.cursor()
- for row in open(path, 'r'):
+ for row in open(config.homepage_status.filepath, 'r'):
if not row.strip():
continue
row = json.loads(row)
@@ -405,13 +350,12 @@ class ChoculaDatabase():
counts['updated'] += 1
cur.close()
self.db.commit()
- print(counts)
+ return counts
- def load_fatcat(self, args):
- path = args.input_file or FATCAT_CONTAINER_FILE
+ def load_fatcat_containers(self, config: ChoculaConfig) -> Counter:
print("##### Loading Fatcat Container Entities...")
# JSON
- json_file = open(path, 'r')
+ json_file = open(config.fatcat_containers.filepath, 'r')
counts = Counter()
cur = self.db.cursor()
for row in json_file:
@@ -445,22 +389,25 @@ class ChoculaDatabase():
))
except sqlite3.IntegrityError as ie:
if str(ie).startswith("UNIQUE"):
- return None, "duplicate-issnl"
- raise ie
+ counts["existing"] += 1
+ continue
+ else:
+ raise ie
counts['inserted'] += 1
if row.get('issnl'):
urls = extra.get('urls', [])
for url in urls:
- self.add_url(row['issnl'], url)
+ homepage = HomepageUrl.from_url(url)
+ if homepage:
+ self.insert_homepage(row.get('issnl'), homepage, cur)
cur.close()
self.db.commit()
- print(counts)
+ return counts
- def load_fatcat_stats(self, args):
- path = args.input_file or FATCAT_STATS_FILE
+ def load_fatcat_stats(self, config: ChoculaConfig) -> Counter:
print("##### Loading Fatcat Container Stats...")
# JSON
- json_file = open(path, 'r')
+ json_file = open(config.fatcat_stats.filepath, 'r')
counts = Counter()
cur = self.db.cursor()
for row in json_file:
@@ -479,18 +426,21 @@ class ChoculaDatabase():
counts['updated'] += 1
cur.close()
self.db.commit()
- print(counts)
+ return counts
- def export_urls(self, args):
+ def export_urls(self) -> Counter:
+ counts = Counter()
cur = self.db.cursor()
self.db.row_factory = sqlite3.Row
cur = self.db.execute("SELECT issnl, url FROM homepage;")
for hrow in cur:
assert(hrow['url'])
assert(len(hrow['url'].split()) == 1)
+ counts['total'] += 1
print('\t'.join((hrow['issnl'], hrow['url'])))
+ return counts
- def summarize(self, args):
+ def summarize(self) -> Counter:
print("##### Summarizing Everything...")
counts = Counter()
cur = self.db.cursor()
@@ -506,7 +456,7 @@ class ChoculaDatabase():
out = dict()
# check if ISSN-L is good. this is here because of fatcat import
- out['known_issnl'] = (self.issn2issnl(issnl) == issnl)
+ out['known_issnl'] = (self.issn_db.issn2issnl(issnl) == issnl)
if not out['known_issnl']:
counts['unknown-issnl'] += 1
out['valid_issnl'] = stdnum.issn.is_valid(issnl)
@@ -544,8 +494,8 @@ class ChoculaDatabase():
out['is_oa'] = True
if irow['slug'] == 'sherpa_romeo':
extra = json.loads(irow['extra'])
- out['sherpa_color'] = extra['color']
- if extra['color'] == 'green':
+ out['sherpa_color'] = extra['sherpa_romeo']['color']
+ if extra['sherpa_romeo']['color'] == 'green':
out['is_oa'] = True
# filter out "NA" ISSNs
@@ -624,33 +574,9 @@ class ChoculaDatabase():
))
cur.close()
self.db.commit()
- print(counts)
-
- def everything(self, args):
- self.init_db(args)
- self.index_doaj(args)
- self.index_norwegian(args)
- self.index_crossref(args)
- self.index_sherpa_romeo(args)
- self.index_road(args)
- self.index_entrez(args)
- self.index_ezb(args)
- self.index_szczepanski(args)
- self.index_gold_oa(args)
- self.index_openapc(args)
- self.index_wikidata(args)
- self.load_fatcat(args)
- self.load_fatcat_stats(args)
- #self.preserve_kbart('lockss', LOCKSS_FILE)
- #self.preserve_kbart('clockss', CLOCKSS_FILE)
- #self.preserve_kbart('portico', PORTICO_FILE)
- #self.preserve_kbart('jstor', JSTOR_FILE)
- #self.preserve_sim(args)
- self.update_url_status(args)
- self.summarize(args)
- print("### Done with everything!")
-
- def export(self, args):
+ return counts
+
+ def export(self) -> Counter:
def dict_factory(cursor, row):
d = {}
for idx, col in enumerate(cursor.description):
@@ -662,8 +588,9 @@ class ChoculaDatabase():
for row in cur.execute('SELECT * FROM journal'):
print(json.dumps(row))
counts['total'] += 1
+ return counts
- def export_fatcat(self, args):
+ def export_fatcat(self):
counts = Counter()
self.db.row_factory = sqlite3.Row
cur = self.db.cursor()
@@ -748,13 +675,14 @@ class ChoculaDatabase():
ezb = json.loads(drow['extra'])
extra['ezb'] = dict(ezb_id=drow['identifier'], color=ezb['ezb_color'])
if drow['slug'] == 'szczepanski':
- # XXX: pull from record
- extra['szczepanski'] = dict(as_of=config.szczepanski.date)
+ # TODO: what to put here?
+ extra['szczepanski'] = drow['extra']
if drow['slug'] == 'doaj':
extra['doaj'] = json.loads(drow['extra'])
out['extra'] = extra
print(json.dumps(out))
+ return counts
def init_db(self):
print("### Creating Database...", file=sys.stderr)
diff --git a/chocula/directories/__init__.py b/chocula/directories/__init__.py
new file mode 100644
index 0000000..4bed696
--- /dev/null
+++ b/chocula/directories/__init__.py
@@ -0,0 +1,19 @@
+
+from chocula.directories.crossref import CrossrefLoader
+from chocula.directories.doaj import DoajLoader
+from chocula.directories.entrez import EntrezLoader
+from chocula.directories.ezb import EzbLoader
+from chocula.directories.gold_oa import GoldOALoader
+from chocula.directories.norwegian import NorwegianLoader
+from chocula.directories.openapc import OpenAPCLoader
+from chocula.directories.road import RoadLoader
+from chocula.directories.sherpa_romeo import SherpaRomeoLoader
+from chocula.directories.sim import SimLoader
+from chocula.directories.szczepanski import SzczepanskiLoader
+from chocula.directories.wikidata import WikidataLoader
+
+ALL_CHOCULA_DIR_CLASSES = [
+ CrossrefLoader, DoajLoader, EntrezLoader,EzbLoader, GoldOALoader,
+ NorwegianLoader, OpenAPCLoader, RoadLoader, SherpaRomeoLoader,
+ SzczepanskiLoader, WikidataLoader, SimLoader,
+]
diff --git a/chocula/directories/sim.py b/chocula/directories/sim.py
new file mode 100644
index 0000000..c0c02df
--- /dev/null
+++ b/chocula/directories/sim.py
@@ -0,0 +1,71 @@
+
+from typing import Iterable, Optional, Dict, Any
+import csv
+
+from chocula.util import clean_str, parse_mimetypes, parse_country, parse_lang, PLATFORM_MAP, gaps_to_spans
+from chocula.common import DirectoryLoader
+from chocula.database import DirectoryInfo, HomepageUrl
+
+
+class SimLoader(DirectoryLoader):
+
+ source_slug = "sim"
+
+ def open_file(self) -> Iterable:
+ return csv.DictReader(open(self.config.sim.filepath))
+
+ def parse_record(self, row) -> Optional[DirectoryInfo]:
+
+ """
+ NA Pub Cat ID
+ Title
+ Publisher
+ ISSN
+ Impact Rank
+ Total Cities
+ Journal Impact Factor
+ Eigenfact or Score
+ First Volume
+ Last Volume
+ NA Gaps
+ "Scholarly / Peer-\n Reviewed"
+ "Peer-\n Reviewed"
+ Pub Type
+ Pub Language
+ Subjects
+ """
+ # TODO: 'Pub Type'
+
+ extra = {}
+ first_year = row['First Volume']
+ if first_year:
+ first_year = int(first_year)
+ extra['first_year'] = int(row['First Volume'])
+ else:
+ first_year = None
+ last_year = row['Last Volume']
+ if last_year:
+ last_year = int(last_year)
+ extra['last_year'] = last_year
+ else:
+ last_year = None
+ gaps = [int(g) for g in row['NA Gaps'].split(';') if g.strip()]
+ if gaps:
+ extra['gaps'] = gaps
+ if first_year and last_year:
+ extra['year_spans'] = gaps_to_spans(first_year, last_year, gaps)
+ extra['scholarly_peer_reviewed'] = row["Scholarly / Peer-\nReviewed"]
+ extra['peer_reviewed'] = row["Peer-\nReviewed"]
+ extra['pub_type'] = clean_str(row["Pub Type"])
+
+ info = DirectoryInfo(
+ directory_slug=self.source_slug,
+ name=clean_str(row['Title']),
+ publisher=clean_str(row['Publisher']),
+ raw_issn=row['ISSN'][:9],
+ custom_id=row.get('NA Pub Cat ID').strip() or None,
+ langs=[parse_lang(row['Pub Language'])],
+ extra=extra,
+ )
+ return info
+