author     Bryan Newbold <bnewbold@archive.org>  2019-12-23 19:16:20 -0800
committer  Bryan Newbold <bnewbold@archive.org>  2019-12-23 19:16:20 -0800
commit     3232f9509404c75777f23d7272416d8de4a45789 (patch)
tree       6a5224d60b14cead9cf4b34ba2e8277e8712437b /old
parent     f8db4ee808b8e4db0ec413ad942f8129478041cc (diff)
move old scripts into subdirectory
Diffstat (limited to 'old')
-rwxr-xr-x  old/fix_invalid_issnl.py       75
-rw-r--r--  old/invalid_fatcat_issnl.tsv  118
-rwxr-xr-x  old/parse_merge_metadata.py   674
3 files changed, 867 insertions, 0 deletions
diff --git a/old/fix_invalid_issnl.py b/old/fix_invalid_issnl.py
new file mode 100755
index 0000000..521f334
--- /dev/null
+++ b/old/fix_invalid_issnl.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python3
+
+"""
+This is a one-off script for pushing ISSN-L fixes to fatcat via the API.
+
+It expects the fatcat python libraries to be available. Run it like:
+
+ export FATCAT_API_AUTH_TOKEN="..."
+ ./fix_invalid_issnl.py ~/code/chocula/invalid_fatcat_issnl.tsv
+
+It creates a new editgroup, which you'll need to merge/accept manually.
+
+Defaults to QA API endpoint; edit the file to switch to prod.
+"""
+
+import os, sys
+import csv
+
+from fatcat_tools import authenticated_api
+from fatcat_client import Editgroup, ContainerEntity
+from fatcat_client.rest import ApiException
+
+API_ENDPOINT = "https://api.qa.fatcat.wiki/v0"
+
+
+def run(api, row_iter):
+
+ eg = api.create_editgroup(Editgroup(description=
+ "Update or merge containers with invalid (by checksum) ISSN-L. Using the fix_invalid_issnl.py script from chocula repo."))
+ print("Editgroup ident: {}".format(eg.editgroup_id))
+ for row in row_iter:
+ #print(row)
+ fixed_issnl = row['fixed_issnl'].strip()
+ if not fixed_issnl:
+ print("SKIP")
+ continue
+ assert row['issnl'].strip() != fixed_issnl
+ invalid = api.get_container(row['fatcat_ident'])
+ assert invalid.state == "active"
+ try:
+            fixed = api.lookup_container(issnl=fixed_issnl)
+ except ApiException as ae:
+ if ae.status != 404:
+ raise ae
+ fixed = None
+
+ if fixed:
+ # merge/redirect
+ assert fixed.state == "active"
+ print("MERGE: {} -> {}".format(invalid.ident, fixed.ident))
+ invalid.redirect = fixed.ident
+ api.update_container(eg.editgroup_id, invalid.ident,
+ ContainerEntity(redirect=fixed.ident))
+ else:
+ # update in-place with fixed ISSN-L
+ print("FIX: {}: {}".format(invalid.ident, fixed_issnl))
+ invalid.issnl = fixed_issnl
+ api.update_container(eg.editgroup_id, invalid.ident, invalid)
+
+ # intentionally not merging editgroup
+ print("Editgroup ident: {}".format(eg.editgroup_id))
+ print("DONE")
+
+def main():
+ api = authenticated_api(
+ API_ENDPOINT,
+ # token is an optional kwarg (can be empty string, None, etc)
+ token=os.environ.get("FATCAT_API_AUTH_TOKEN"))
+
+ path = sys.argv[1]
+ reader = csv.DictReader(open(path), delimiter='\t')
+ run(api, reader)
+
+if __name__ == '__main__':
+ main()
diff --git a/old/invalid_fatcat_issnl.tsv b/old/invalid_fatcat_issnl.tsv
new file mode 100644
index 0000000..14a8dec
--- /dev/null
+++ b/old/invalid_fatcat_issnl.tsv
@@ -0,0 +1,118 @@
+fixed_issnl issnl fatcat_ident name
+1530-1311 1550-1311 2nklacmgkjdfjib7kqi3hdh76a International Symposium on Temporal Representation and Reasoning/TIME, Proceedings of the
+2306-0441 2223-0441 rngvdeed65ffhmgfxti7s2z6by Journal of Local and Global Health Science
+1641-6554 1641-6565 yviicehmubf4bcxo23q43sbkzu Kolposkopia
+1526-7539 1526-7639 pnyefvclqfabjlnl4suox6pdte International Symposium on Modeling, Analysis and Simulation of Computer and Telecommunication Systems, Proceedings of the
+0276-6574 0276-6547 hcrk2xeoknf7daxwurje2vg3n4 Computers in Cardiology Conference
+0018-9413 0359-4237 ngthxcdwgzhovfgimtll6owdnm IEEE Transactions on Geoscience Electronics
+ 2630-4301 drgmggxvfjbjjkakrjegaujkgy Food Modelling Journal
+1089-7771 1086-7771 sjjfcknh3zawndw6jdmvfrix7a IEEE transactions on information technology in biomedicine
+1093-1139 1039-1139 nnrvd2qmhzbebk2hsopvtchodq Academic Physician and Scientist
+0037-7996 0037-7796 klgctd3pbvbwpf6ynvd4wns53m The Social studies
+1520-5363 1520-5263 v3zwf2ujd5flzlopo2h4ggmdhu Document Analysis and Recognition (ICDAR), Proceedings of the International Conference on
+1992-4712 1992-4721 mmptdalzizepxi7uqfohnjj3uy NAFU Farmer
+0066-7374 0066-7372 s5fjnmhtkbggtcn2itw4g34bxq Proceedings of the Aristotelian Society (Hardback)
+ 1234-5678 lkoo6d6lcvba7hhwswxc6x2sly Test s Publication
+1088-7725 1080-7725 ikqqv7cnwfhpfkqy2lfugiz6ae AUTOTESTCON IEEE Systems Readiness Technology Conference, Proceedings of the
+ 0537-4737 kklzxgby4zfqvow7klnneyjj7e IEE Telecommunications Hot Topics Forum
+1527-4160 1528-1145 q7j7ro4qfnbjdb2kkkcxatesna Journal of Psychiatric Practice
+2468-4295 2648-4309 qtvq67fxfje4ro5mldpan35s34 Brill Research Perspectives in Art and Law
+ 1113-1114 qqkn2huytzfj5fw573bc5ihxyi A-to-Z Guide to Thermodynamics Heat and Mass Transfer and Fluids Engineering
+1080-241X 1082-241X c3pwhzd5hvbqzfgqdrbxnyfn3i Proceedings Annual Simulation Symposium
+1050-4729 1059-4729 b5exnobxjvd65flifftm6gy3f4 IEEE International Conference on Engineering of Complex Computer Systems, Proceedings of the
+0295-1630 0296-1630 u7tretgoaffblbvtsxbtml6pbu Pleine Marge
+1071-0485 1071-0483 6x6ijsddbva7xldh5ubqoxcf7a IEEE International Workshop on Future Trends of Distributed Computing Systems (FTDCS)
+0093-2582 0093-2592 ajsbz23tbnghzmwleirmr7or3a The St. Croix review
+1544-6301 1544-6307 tl5eyaqxbfcllmy7ip63up2sma Journal Of Cancer Integrative Medicine
+2155-5745 2155-5754 b6s7uywtdfannklqblq6tghkte International Radar Symposium
+0767-3701 3767-3701 wyh37ssfibcdtlncbi53m7ms4q Recherche et Applications en Marketing
+2255-8691 2225-8691 bl6y7mpqgfdtxoqqbcq7bikwvu Applied Computer Systems
+2561-3103 2561-3101 46uvtmw72zgpjhmbkvllrnwlau Journal of Contact Lens Research and Science
+0939-7248 0939-7148 xma7s7w65zed3g4ojkmfz7r6ca European journal of pediatric surgery
+1071-6084 1071-6048 dqthad6e3vfhxjf4px42u76dbe The Journal of technology studies
+1528-8463 1538-8463 j4cxdxvaxrb2thxtvbda47gy2y Hospital law's Regan report
+ 2352-3951 pc6ftuth7ffvreuolz5rxbaye4 Nanoelectronics and Spintronics
+2006-9820 2006-9802 wmugzbvj6rcrzg5hf5jtug6tja Journal of Toxicology and Environmental Health Sciences
+0953-9182 0955-9182 mcpenq5ytrd6zpmuyc5crmx2fi Contemporary reviews in obstetrics and gynaecology
+0259-188X 0459-188X iw7xtyujbrax7ojqqbgxjlrsem Indicator South Africa
+0894-9115 0894-9112 wa3xjixz6ve75j2ecrk23sposq American Journal of Physical Medicine & Rehabilitation
+0165-9367 0165-0937 32qxo574yjcbbbchqijyakggzm BABesch - Bulletin Antieke Beschaving
+2586-1247 2968-9853 whexebkzebh5ncyb6hvjppkyqa International Journal of Sustainable Lighting
+1811-7813 1511-7813 ipu26yclwnhw7ekbsve6wpb44a Point-Based Graphics (PBG), IEEE VGTC Symposium Proceedings
+0394-1051 2532-4681 opjfte47erhb7dke5aeap5wbse Eupalino : cultura della città e della casa
+ 0538-9989 cjkaddjcfzhmveuuffmtycjxmm Professional Development Course on Railway Electrification Infrastructure and Systems
+1081-7735 1085-7735 oam6q4knpjaq5enpfzra3jazfi Asian Test Symposium, Proceedings of the
+1064-7546 1064-7564 su2chgwy2vf6zavx6ekbyv4sem Journal of Environmental Polymer Degradation
+1817-4426 1817-4428 kwqvwiwyx5b5vbjxfetnpxt6ky The Retail and Marketing Review
+ 1432-3123 sa73qgbh25bgnd5q6o262u7inm china Frontiers
+2141-2634 2142-2634 g2z2ld65nve2pdwjk7gr3qe25e Journal of Civil Engineering and Construction Technology
+ 9999-9998 nvya624wtbbgxl6bdsrqatslum Space Resolver 15
+ 0006-0809 5gtixxmzm5aodhqpd4bqp3hqiq The Bridge
+0547-3578 7964-0977 bonch7zygnadddv2fqoecmj4yy IEEE National Aerospace and Electronics Conference
+0003-3839 0003-3829 yqozzqdrwvedvafwrokyx5puna Annales agronomiques
+2213-9885 2213-0886 hdfn46db7bgw5e7lmmlx3iubxe Current Chemical Genomics and Translational Medicine
+2378-5225 2333-5225 tgcsgbazhrfqzodbpdkjgtvv6i Pathophysiology of Cell Injury Journal
+1761-7227 1761-7727 uynmrtrspngoxdjdihrwx3x6oi International orthodontics
+1079-5545 1990-1993 goo5ob7jqzh7ro5pft7j3qdkaq Academy of Management Executive
+1536-0040 1111-1111 7au72xtbafdwpdsz6d53elvk2a SIAM E-Journal on Applied Dynamical Systems
+0163-514X 0613-514X jjup253sjjc2pkg3zz3vfz3nye The Journal of Prevention
+1020-1491 1020-1497 gtn3v33zcrg5zndvyir7l65mna Image & Text : a Journal for Design
+0972-8279 0972-8270 laktqf5ag5fpje2kfzzutnooyq Journal of maxillofacial and oral surgery
+ 0133-0189 kixvoietq5bhjeub36kqu6kwpa DCDS Supplements
+1071-9458 1071-9658 xmwrlaklfbe53ayhlzrhinb5cu International Symposium on Software Reliability Engineering, Proceedings of the
+0040-6325 0040-5325 qpbm4in22jevvfeht6mhawmllq The Thomist: A Speculative Quarterly Review
+0039-4157 0039-4156 5xp3jodttzhmrp22xl5femz52q Study encounter
+1122-5300 1125-5300 zeclu7ljprfkvauia5vljzvnfy Parolechiave
+ 1453-212X dhdssmbdbjfyfe53dpbfgcejti Research Journal of International Studies
+1530-1834 1550-1834 d57htlky2nexlmy4wa36lva6wy Brazilian Symposium on Computer Graphics and Image Processing
+ 0686-3174 ajqi4yc43feitknsozbzgpjnre Y hoc Viet Nam
+0015-8208 1521-3979 hoifudhajjgwfauzsgevzgogpa Fortschritte der Physik/Progress of Physics
+2141-6613 2141-6113 xghlebvnzzco3j6eb6ywlwal3m International Journal of Water Resources and Environmental Engineering
+0112-9120 0110-7903 6c6fql7levae5lyunnm6aiuzaq New Zealand monthly review
+ 0798-1881 xnblbncvezfuveglr5loa2rqym Construccion
+0017-3703 0017-3701 5dfgot47anfwxk65i3biooh6we Greater amusements
+0015-8577 0015-1287 xjwp4btqb5gojjcmdegglcyrli The Forumeer
+0160-8371 1060-8371 jbubc4zldfegxfkstwvg2oh47i IEEE Photovoltaic Specialists Conference, Conference Record of the
+2378-9530 2378-8953 4x4y2afasfg45iqoo47vphr6iq Veterinary Research and Case Reports
+2162-4933 2162-4923 azachjfocbf6xciiz3m3ze564u Current Dermatology Reports
+0882-5645 0882-5646 7fyviche6rbyzfsznpfrvvlyv4 Topics in Pain Management
+0033-7196 0033-7916 hsp5k2eij5dwpe4tbqf2n7gkpa RWDSU record
+1823-3902 1832-3902 sdw3bp3mk5b7vpkf22nj3dbttu Journal of tropical biology & conservation
+1066-6192 1066-6172 jlzpb6temzauliox6i7ecma3mq Euromicro Workshop on Parallel and Distributed Processing, Proceedings of the
+1818-9091 1819-9091 gom2wpnbtje6zfssyliobeup24 SA Horseman
+2504-2289 8888-0000 eyngovpvzfen5hsd55udttyk2u Big Data and Cognitive Computing
+ 8164-9547 knfumdwvafcnjnwge24v3kaip4 IEEE International Conference on Communications (2006)
+0369-8718 0368-8718 fatiuzqz7veebcr4n3t7wvn2n4 Proceedings of the Chemical Society
+2379-6227 2397-6227 37z2akit2nbtbea5qcn5shreru Computational Psychiatry
+0025-6978 0026-6978 vq6o3qyo3rezvercvuai2dciti Medical and biological illustration
+2588-4174 4174-2588 lr5kb4tnq5bpjf3otub5umkdmy Afzalipour Journal of Clinical Research
+2641-7928 1089-796X cff4m4os3bdznnc3fxbh2vv6ha International Conference on Parallel Architectures and Compilation Techniques, Proceedings of the
+1931-7611 1931-7811 gmtif5qt6vddroc7uox4hxtch4 International journal of sexual health
+0538-933X 0583-9364 xt6yxm3j7jdyzb6viwcltb6vsq International Reading Association. Proceedings of the annual convention
+ 7777-8888 3va7v7macvbalhxfujg5qfb7vi A Journal with Only 1 Volume
+2157-3689 2157-3698 qegnzb72dzhf5jdunf3ge7jmji Limnology & Oceanography Fluids & Environment
+1673-3487 1673-3508 z3jjoyzt45ecrh3c62afjmmxjm Frontiers of Physical in China
+2571-6255 0013-0013 kwk3mhqddfddrd6hjqdnkg3f3y Fire
+2141-2618 2141-2648 w5aoztaff5dg7harb4d2fyxzji Journal of Medical Laboratory and Diagnosis
+ 0804-3321 bqecated6jcwzoe56hmlxxydsq Utdanning
+ 0842-3446 wiidl3rotncxfpqoj46uegf5mq African Journal of Farm Child and Youth Development
+ 0539-9989 c7abz7liw5d53bskkcq7fcetvy Institution of Engineering and Technology International Conference on Medical Electrical Devices and Technology
+1051-9173 1057-9173 auy44witavcmnejiss5h4ndlnq IEEE Symposium on Mass Storage Systems
+0276-2234 0726-2234 6cbknzfrcfchhj73pmrlull2ky Oncology Times UK
+0029-0564 1949-1986 s4poo6asmffuhlfw4lsneh6fq4 Nineteenth-Century Fiction
+ 9999-9999 zjdb7jqlxzcmlatzcjfjlrna6m Space Resolver 16
+0026-475X 0027-475X xmgc2fswtjaghmrocercgewwnu Minerva dietologica
+ 8164-2284 jkjfyzs3ijcnfbbfwxpewlwuaq IEEE International Electron Devices Meeting
+1972-3792 2035-0167 nbo7hjouubbjba3kk5nn6oucmu Quaderni materialisti
+0307-0565 0307-9565 mfgzm4p6zzgsxja4jjy7etl35q International journal of obesity
+0738-6176 0723-6176 ynfkc2j7lzbpnchzpjic2orwlq The Psychotherapy patient
+0963-0252 0965-0252 4s5mbkeut5g5tass6gz6qfivny Plasma sources science & technology
+0144-3887 0143-3887 kdtbqfuuynd2fc64lprajnauvm Current Psychological Research
+1470-3556 0741-9738 23er6bey4rgclaudewagavnf7m Advances in Mind-Body Medicine
+0040-3261 0400-3261 arakq24urzgfhbz4abv6kz4mha Tennessee historical quarterly
+0957-4484 0957-9207 k4v6ng56q5fb5p7yjyi7fsmv5a Nanotechnology
+1087-9870 1089-9870 ubsj5pa5abbe7gqqdttkfibqle Thermal and Thermomechanical Phenomena in Electronic Systems (ITHERM), Intersociety Conference on
+1812-6731 1812-6371 lhhrrjjxynd3lcadty56vqjk4u New Voices in Psychology
+0163-0067 0613-0067 mlluqww5qnet7gqptazojr32bu Information world
+1080-9775 1080-9175 yxjwuy4kj5hqvorfbmv4widyqa Biomedical Safety & Standards
+ 9999-9997 o6xy2oixgzag5o6oe26fdmmr4e Space Resolver
diff --git a/old/parse_merge_metadata.py b/old/parse_merge_metadata.py
new file mode 100755
index 0000000..b1d038b
--- /dev/null
+++ b/old/parse_merge_metadata.py
@@ -0,0 +1,674 @@
+#!/usr/bin/env python3
+
+import sys, csv, json
+import ftfy
+import pycountry
+from collections import Counter
+
+ISSNL_FILE = 'data/20190220.ISSN-to-ISSN-L.txt'
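+# The ISSN-to-ISSN-L dump is whitespace-delimited ISSN/ISSN-L pairs under a
+# header line starting with "ISSN" (see read_issn_map_file below), e.g.:
+#   0000-0019	0000-0019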
+
+ENTREZ_FILE = 'data/entrez-journals.csv'
+ROAD_FILE = 'data/road-2018-01-24.tsv'
+ROAD_DATE = '2018-01-24'
+DOAJ_FILE = 'data/doaj_20190124.csv'
+DOAJ_DATE = '2019-01-24'
+CROSSREF_FILE = 'data/doi_titles_file_2019-01-24.csv'
+SHERPA_ROMEO_JOURNAL_FILE = 'data/romeo-journals.csv'
+SHERPA_ROMEO_POLICY_FILE = 'data/romeo-policies.csv'
+NORWEGIAN_FILE = 'data/2018-03-02 Norwegian Register for Scientific Journals and Series.csv'
+NORWEGIAN_DATE = '2018-03-02'
+LOCKSS_FILE = 'data/kbart_LOCKSS.txt'
+CLOCKSS_FILE = 'data/kbart_CLOCKSS.txt'
+PORTICO_FILE = 'data/Portico_Holding_KBart.txt'
+JSTOR_FILE = 'data/jstor_all-archive-titles.txt'
+SIM_FILE = 'data/MASTER TITLE_METADATA_LIST_20171019.converted.csv'
+IA_CRAWL_FILE = 'data/journal_homepage_results.partial.tsv'
+
+# NOTE: this is a partial list, focusing on non-publisher hosted platforms and
+# software frameworks
+PLATFORM_MAP = {
+ 'OJS': 'ojs',
+ 'BMC': 'bmc',
+ 'SciELO Brazil': 'scielo',
+ 'SciELO Argentina': 'scielo',
+ 'SciELO': 'scielo',
+ 'SciELO Mexico': 'scielo',
+ 'SciELO Spain': 'scielo',
+ 'SciELO Portugal': 'scielo',
+ 'WordPress': 'wordpress',
+ 'Sciendo': 'sciendo',
+ 'Drupal': 'drupal',
+ 'revues.org': 'openedition',
+}
+
+MIMETYPE_MAP = {
+ 'PDF': 'application/pdf',
+ 'HTML': 'text/html',
+ 'XML': 'application/xml',
+}
+
+def unquote(s):
+ if s.startswith('"'):
+ s = s[1:]
+ if s.endswith('"'):
+ s = s[:-1]
+ if s.endswith('.'):
+ s = s[:-1]
+ return s.strip()
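+# A quick illustration: unquote('"Nature."') -> 'Nature'; one leading and one
+# trailing quote plus a single trailing period are stripped.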
+
+def parse_lang(s):
+ if not s or s in ('Not applicable', 'Multiple languages', 'Unknown'):
+ return None
+ try:
+ if len(s) == 2:
+ lang = pycountry.languages.get(alpha2=s.lower())
+ elif len(s) == 3:
+ lang = pycountry.languages.get(alpha3=s.lower())
+ else:
+ lang = pycountry.languages.get(name=s)
+ return lang.alpha2.lower()
+ except KeyError:
+ return None
+ except AttributeError:
+ return None
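+# Illustrative behavior (assuming a pycountry release that accepts the
+# alpha2/alpha3/name kwargs used above): parse_lang('en') -> 'en',
+# parse_lang('English') -> 'en'; unrecognized or ambiguous values return None.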
+
+def parse_country(s):
+    if not s or s in ('Unknown',):
+ return None
+ try:
+ if len(s) == 2:
+ country = pycountry.countries.get(alpha2=s.lower())
+ else:
+ country = pycountry.countries.get(name=s)
+ except KeyError:
+ return None
+ return country.alpha2.lower()
+
+def gaps_to_spans(first, last, gaps):
+ if not gaps:
+ return [[first, last]]
+ if not (last >= first and max(gaps) < last and min(gaps) > first):
+ # mangled
+ print("mangled years: {}".format((first, last, gaps)))
+ return []
+ full = list(range(first, last+1))
+ for missing in gaps:
+ full.remove(missing)
+ spans = []
+ low = None
+ last = None
+ for year in full:
+ if not low:
+ low = year
+ last = year
+ continue
+ if year != last+1:
+ spans.append([low, last])
+ low = year
+ last = year
+ last = year
+ if low:
+ spans.append([low, last])
+ return spans
+
+def test_gaps():
+ assert gaps_to_spans(1900, 1900, None) == \
+ [[1900, 1900]]
+ assert gaps_to_spans(1900, 1903, None) == \
+ [[1900, 1903]]
+ assert gaps_to_spans(1900, 1902, [1901]) == \
+ [[1900, 1900], [1902, 1902]]
+ assert gaps_to_spans(1950, 1970, [1955, 1956, 1965]) == \
+ [[1950, 1954], [1957, 1964], [1966, 1970]]
+
+def merge_spans(old, new):
+ if not new:
+ return old
+ if not old:
+ old = []
+ old.extend(new)
+ years = set()
+ for span in old:
+ for y in range(span[0], span[1]+1):
+ years.add(y)
+ if not years:
+ return []
+ spans = []
+ start = None
+ last = None
+ todo = False
+ for y in sorted(list(years)):
+        if start is None:
+ # very first
+ start = y
+ last = y
+ todo = True
+ continue
+ if y == last + 1:
+ # span continues
+ last = y
+ todo = True
+ continue
+ # a gap just happened!
+ spans.append([start, last])
+ start = y
+ last = y
+ todo = True
+ if todo:
+ spans.append([start, last])
+ return spans
+
+def test_merge_spans():
+ assert merge_spans([[5, 10]], [[10, 20]]) == \
+ [[5, 20]]
+ assert merge_spans([[5, 9]], [[10, 20]]) == \
+ [[5, 20]]
+ assert merge_spans([[5, 11]], [[10, 20]]) == \
+ [[5, 20]]
+ assert merge_spans([], []) == \
+ []
+ assert merge_spans([[9, 11]], []) == \
+ [[9,11]]
+ assert merge_spans([[2000, 2000]], [[1450, 1900]]) == \
+ [[1450, 1900], [2000, 2000]]
+
+class Munger():
+ """
+ Top-level fields we'd like to fill in if possible:
+
+ issnp: string
+ issne: string
+ first_year: year (integer)
+ last_year: if publishing has stopped
+ languages: array of ISO codes; first is the "primary" language
+ country: ISO shortcode of country published from
+ urls: homepage links
+ abbrev: string
+ default_license: slug
+ original_name: native name (if name is translated)
+ platform: hosting platform: OJS, wordpress, scielo, etc
+ mimetypes: array of strings (eg, 'application/pdf', 'text/html')
+ aliases: array of "also known as"
+
+ Lower priority (TODO/later):
+ coden: string
+ oclc_id: string (lookup?)
+ lccn_id: string (lookup?)
+    dblp_id: string
+ region: TODO: continent/world-region
+ discipline: TODO: highest-level subject; "life science", "humanities", etc
+ field: TODO: narrower description of field
+ subjects: TODO?
+
+ TODO: so many missing ISSN/ISSN-L
+ TODO: abbrev
+ """
+
+ def __init__(self):
+ self.data = dict()
+ with open(ISSNL_FILE, 'r') as f:
+ self.read_issn_map_file(f)
+
+ def run(self, out_path):
+ self.load_doaj(DOAJ_FILE)
+ self.load_norwegian(NORWEGIAN_FILE)
+ self.load_crossref(CROSSREF_FILE)
+ self.load_sherpa_romeo(SHERPA_ROMEO_JOURNAL_FILE, SHERPA_ROMEO_POLICY_FILE)
+ self.load_road(ROAD_FILE)
+ self.load_kbart('lockss', LOCKSS_FILE)
+ self.load_kbart('clockss', CLOCKSS_FILE)
+ self.load_kbart('portico', PORTICO_FILE)
+ self.load_kbart('jstor', JSTOR_FILE)
+ self.load_entrez(ENTREZ_FILE)
+ self.load_sim(SIM_FILE)
+ self.load_homepage_crawl(IA_CRAWL_FILE)
+ self.summarize()
+ self.dump(out_path)
+ print("Done!")
+
+ def dump(self, out_path):
+ print("#### Dumping to {}".format(out_path))
+ with open(out_path, 'w') as out:
+ for issnl in self.data:
+ out.write(json.dumps(self.data[issnl]) + "\n")
+
+ def summarize(self):
+ print("##### Loaded {} unique entries".format(len(self.data)))
+
+ def read_issn_map_file(self, issn_map_file):
+ print("##### Loading ISSN map file...")
+ self._issn_issnl_map = dict()
+ for line in issn_map_file:
+            if line.startswith("ISSN") or not line.strip():
+ continue
+ (issn, issnl) = line.split()[0:2]
+ self._issn_issnl_map[issn] = issnl
+ # double mapping makes lookups easy
+ self._issn_issnl_map[issnl] = issnl
+ print("Got {} ISSN-L mappings.".format(len(self._issn_issnl_map)))
+
+ def issn2issnl(self, issn):
+ if issn is None:
+ return None
+ return self._issn_issnl_map.get(issn)
+
+ def add_issn(self, raw_issn=None, issne=None, issnp=None, name=None, publisher=None):
+ # do ISSN => ISSN-L mappings for any raw ISSNs
+ issnl = None
+ if not (raw_issn or issne or issnp):
+ return None, 'no-issn'
+ for lookup in (issnp, issne, raw_issn):
+ if not lookup:
+ continue
+ lookup = lookup.strip().upper()
+ #if not (len(lookup) == 9 and lookup[4] == '-'):
+ # print(lookup)
+ # print(len(lookup))
+ # print(lookup[4])
+ # return None, 'invalid-issn'
+ #assert len(lookup) == 9 and lookup[4] == '-'
+ issnl = self.issn2issnl(lookup)
+ if issnl:
+ break
+ if not issnl:
+ #print((raw_issn, issne, issnp))
+ # UGH.
+ issnl = issne or issnp or raw_issn
+ if not issnl:
+ return None, 'no-issnl'
+ issnl = issnl.strip().upper()
+ assert len(issnl) == 9 and issnl[4] == '-'
+ status = 'found-munge'
+ else:
+ status = 'found'
+ # lookup ISSN-Ls in data (or create one)
+ if not issnl in self.data:
+ status = 'created'
+ self.data[issnl] = dict(issnl=issnl)
+ d = self.data[issnl]
+ # if name/publisher not set, do so
+ if name and not 'name' in d:
+ name = unquote(ftfy.fix_text(name))
+ if name:
+ self.data[issnl]['name'] = name
+ if publisher and not 'publisher' in d:
+ publisher = unquote(ftfy.fix_text(publisher))
+ if publisher:
+ self.data[issnl]['publisher'] = publisher
+ if issne and not 'issne' in d:
+ self.data[issnl]['issne'] = issne
+ if issnp and not 'issnp' in d:
+ self.data[issnl]['issnp'] = issnp
+ # always return ISSN-L
+ return issnl, status
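+    # Every loader funnels through add_issn(); the returned status is one of
+    # 'no-issn', 'no-issnl', 'found', 'found-munge', or 'created', and each
+    # loader tallies these into a per-source Counter.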
+
+ def add_lang(self, issnl, lang):
+ if not (lang and issnl):
+ return
+ lang = parse_lang(lang)
+ if not lang:
+ return
+ if 'languages' not in self.data[issnl]:
+ self.data[issnl]['languages'] = [lang]
+ elif lang not in self.data[issnl]['languages']:
+ self.data[issnl]['languages'].append(lang)
+
+ def add_url(self, issnl, url):
+ if not (url and issnl) or 'mailto:' in url.lower() or url in ('http://n/a', 'http://N/A'):
+ return
+ if url.startswith('www.'):
+ url = "http://" + url
+        url = url.replace('Http://', 'http://')
+ if 'urls' not in self.data[issnl]:
+ self.data[issnl]['urls'] = [url]
+ elif url not in self.data[issnl]['urls']:
+ self.data[issnl]['urls'].append(url)
+
+ def add_country(self, issnl, country):
+ if not (country and issnl):
+ return
+ country = parse_country(country)
+ if not country:
+ return
+ if 'country' not in self.data[issnl]:
+ self.data[issnl]['country'] = country
+
+ def add_mimetype(self, issnl, val):
+ if not (val and issnl):
+ return
+ mimetype = None
+ if '/' in val:
+ mimetype = val
+ else:
+ mimetype = MIMETYPE_MAP.get(val)
+ if not mimetype:
+ return
+ if 'mimetypes' not in self.data[issnl]:
+            self.data[issnl]['mimetypes'] = [mimetype]
+        elif mimetype not in self.data[issnl]['mimetypes']:
+            self.data[issnl]['mimetypes'].append(mimetype)
+
+ def load_entrez(self, path):
+ print("##### Loading Entrez...")
+ # JrId,JournalTitle,MedAbbr,"ISSN (Print)","ISSN (Online)",IsoAbbr,NlmId
+ reader = csv.DictReader(open(path))
+ counts = Counter()
+ for row in reader:
+ if not (row.get('ISSN (Online)') or row.get('ISSN (Print)')):
+ counts['skipped'] += 1
+ continue
+ issnl, status = self.add_issn(
+ issne=row.get('ISSN (Online)'),
+ issnp=row.get('ISSN (Print)'),
+ name=row['JournalTitle'],
+ )
+ if row['IsoAbbr'] and not 'abbrev' in self.data[issnl]:
+ self.data[issnl]['abbrev'] = row['IsoAbbr'].strip()
+ counts[status] += 1
+ print(counts)
+
+ def load_road(self, path):
+ print("##### Loading ROAD...")
+ reader = csv.DictReader(open(path), delimiter='\t',
+ fieldnames=("ISSN", "ISSN-L", "Short Title", "Title", "Publisher", "URL1", "URL2", "Region", "Lang1", "Lang2")
+ )
+ counts = Counter()
+ for row in reader:
+ issnl, status = self.add_issn(
+ raw_issn=row['ISSN-L'],
+ name=row['Short Title'],
+ publisher=row['Publisher'],
+ )
+ counts[status] += 1
+ if not issnl:
+ continue
+ d = self.data[issnl]
+ if row['URL1']:
+ self.add_url(issnl, row['URL1'])
+ if row['URL2']:
+ self.add_url(issnl, row['URL2'])
+ if row['Lang1']:
+ self.add_lang(issnl, row['Lang1'])
+ if row['Lang2']:
+ self.add_lang(issnl, row['Lang2'])
+ # TODO: region mapping: "Europe and North America"
+ # TODO: lang mapping: already alpha-3
+ self.data[issnl]['road'] = dict(as_of=ROAD_DATE)
+ print(counts)
+
+ def load_doaj(self, path):
+ print("##### Loading DOAJ...")
+ #Journal title,Journal URL,Alternative title,Journal ISSN (print version),Journal EISSN (online version),Publisher,Society or institution,"Platform, host or aggregator",Country of publisher,Journal article processing charges (APCs),APC information URL,APC amount,Currency,Journal article submission fee,Submission fee URL,Submission fee amount,Submission fee currency,Number of articles publish in the last calendar year,Number of articles information URL,Journal waiver policy (for developing country authors etc),Waiver policy information URL,Digital archiving policy or program(s),Archiving: national library,Archiving: other,Archiving infomation URL,Journal full-text crawl permission,Permanent article identifiers,Journal provides download statistics,Download statistics information URL,First calendar year journal provided online Open Access content,Full text formats,Keywords,Full text language,URL for the Editorial Board page,Review process,Review process information URL,URL for journal's aims & scope,URL for journal's instructions for authors,Journal plagiarism screening policy,Plagiarism information URL,Average number of weeks between submission and publication,URL for journal's Open Access statement,Machine-readable CC licensing information embedded or displayed in articles,URL to an example page with embedded licensing information,Journal license,License attributes,URL for license terms,Does this journal allow unrestricted reuse in compliance with BOAI?,Deposit policy directory,Author holds copyright without restrictions,Copyright information URL,Author holds publishing rights without restrictions,Publishing rights information URL,DOAJ Seal,Tick: Accepted after March 2014,Added on Date,Subjects
+ reader = csv.DictReader(open(path))
+ counts = Counter()
+ for row in reader:
+ issnl, status = self.add_issn(
+ issnp=row['Journal ISSN (print version)'],
+ issne=row['Journal EISSN (online version)'],
+ name=row['Journal title'],
+ publisher=row['Publisher'],
+ )
+ counts[status] += 1
+ if not issnl:
+ continue
+ d = self.data[issnl]
+ doaj = dict(as_of=DOAJ_DATE)
+ # TODO: work_level: bool (are work-level publications deposited with DOAJ?)
+
+ if row['Digital archiving policy or program(s)']:
+ doaj['archive'] = [a.strip() for a in row['Digital archiving policy or program(s)'].split(',') if a.strip()]
+ elif row['Archiving: national library']:
+ doaj['archive'] = ['national-library']
+
+ crawl_permission = row['Journal full-text crawl permission']
+ if crawl_permission:
+ doaj['crawl-permission'] = dict(Yes=True, No=False)[crawl_permission]
+ # TODO: Permanent article identifiers
+ default_license = row['Journal license']
+ if default_license and default_license.startswith('CC'):
+ self.data[issnl]['default_license'] = default_license.replace('CC ', 'CC-').strip()
+
+ self.add_mimetype(issnl, row['Full text formats'])
+ platform = PLATFORM_MAP.get(row['Platform, host or aggregator'])
+ if platform:
+ self.data[issnl]['platform'] = platform
+ if row['DOAJ Seal']:
+ doaj['seal'] = {"no": False, "yes": True}[row['DOAJ Seal'].lower()]
+ if row['Country of publisher']:
+ self.add_country(issnl, row['Country of publisher'])
+ if row['Full text language']:
+ self.add_lang(issnl, row['Full text language'])
+ if row['Journal URL']:
+ self.add_url(issnl, row['Journal URL'])
+ # TODO: Subjects
+ self.data[issnl]['doaj'] = doaj
+ print(counts)
+
+ def load_sherpa_romeo(self, journal_path, policy_path):
+ # first load policies
+ print("##### Loading SHERPA/ROMEO policies...")
+ #RoMEO Record ID,Publisher,Policy Heading,Country,RoMEO colour,Published Permission,Published Restrictions,Published Max embargo,Accepted Prmission,Accepted Restrictions,Accepted Max embargo,Submitted Permission,Submitted Restrictions,Submitted Max embargo,Open Access Publishing,Record Status,Updated
+ policies = dict()
+ fixed_policy_file = ftfy.fix_file(open(policy_path, 'rb'))
+ policy_reader = csv.DictReader(fixed_policy_file)
+ for row in policy_reader:
+ policies[row['RoMEO Record ID']] = row
+ print("##### Loading SHERPA/ROMEO journal metadata...")
+ #Journal Title,ISSN,ESSN,URL,RoMEO Record ID,Updated
+ # super mangled :(
+ raw_file = open(journal_path, 'rb').read().decode(errors='replace')
+ fixed_file = ftfy.fix_text(raw_file)
+ reader = csv.DictReader(fixed_file.split('\n'))
+ counts = Counter()
+ for row in reader:
+ #row['Journal Title'] = row.pop('\ufeffJournal Title')
+ row.update(policies[row['RoMEO Record ID']])
+ issnl, status = self.add_issn(
+ issnp=row['ISSN'],
+ issne=row['ESSN'],
+ name=row['Journal Title'],
+ publisher=row['Publisher'],
+ )
+ counts[status] += 1
+ if not issnl:
+ continue
+ d = self.data[issnl]
+ sherpa_romeo = dict()
+ if row['RoMEO colour']:
+ sherpa_romeo['color'] = row['RoMEO colour']
+ # row['Open Access Publishing']
+ if row['Country']:
+ self.add_country(issnl, row['Country'])
+ self.data[issnl]['sherpa_romeo'] = sherpa_romeo
+ print(counts)
+
+ def load_norwegian(self, path):
+ print("##### Loading Norwegian Registry...")
+ #pandas.read_csv(NORWEGIAN_FILE, sep=';', encoding="ISO-8859-1")
+ #NSD tidsskrift_id;Original title;International title;Present Level (2018);Print ISSN;Online ISSN;Open Access;NPI Scientific Field;NPI Academic Discipline;URL;Publishing Company;Publisher;Country of publication;Language;Level 2019;Level 2018;Level 2017;Level 2016;Level 2015;Level 2014;Level 2013;Level 2012;Level 2011;Level 2010;Level 2009;Level 2008;Level 2007;Level 2006;Level 2005;Level 2004;itar_id
+ reader = csv.DictReader(open(path, encoding="ISO-8859-1"), delimiter=";")
+ counts = Counter()
+ for row in reader:
+ issnp = row['Print ISSN']
+ issne = row['Online ISSN']
+ if issne and len(issne.strip()) != 9:
+ issne = None
+ if issnp and len(issnp.strip()) != 9:
+ issnp = None
+ if not (issnp or issne):
+ counts['no-issn'] += 1
+ continue
+ issnl, status = self.add_issn(
+ issnp=issnp,
+ issne=issne,
+ name=row['International title'],
+ publisher=row['Publisher'],
+ )
+ counts[status] += 1
+ if not issnl:
+ continue
+ d = self.data[issnl]
+ norwegian = dict(as_of=NORWEGIAN_DATE)
+ norwegian['level'] = int(row['Present Level (2018)'])
+ norwegian['id'] = int(row['NSD tidsskrift_id'])
+
+ if row['Original title'] != row['International title'] and not 'original_name' in d:
+ self.data[issnl]['original_name'] = row['Original title']
+ if row['Country of publication']:
+ self.add_country(issnl, row['Country of publication'])
+ if row['Language']:
+ self.add_lang(issnl, row['Language'])
+ if row['URL']:
+ self.add_url(issnl, row['URL'])
+ self.data[issnl]['norwegian'] = norwegian
+ print(counts)
+
+ def load_kbart(self, name, path):
+ print("##### Loading KBART file for {}...".format(name))
+ #publication_title print_identifier online_identifier date_first_issue_online num_first_vol_online num_first_issue_online date_last_issue_online num_last_vol_online num_last_issue_online title_url first_author title_id embargo_info coverage_depth coverage_notes publisher_name
+ raw_file = open(path, 'rb').read().decode(errors='replace')
+ fixed_file = ftfy.fix_text(raw_file)
+ reader = csv.DictReader(fixed_file.split('\n'), delimiter='\t')
+ counts = Counter()
+ for row in reader:
+ if not row['print_identifier'] and not row['online_identifier']:
+ counts['no-issn'] += 1
+ continue
+ issnl, status = self.add_issn(
+ issnp=row['print_identifier'],
+ issne=row['online_identifier'],
+ name=row['publication_title'],
+ publisher=row['publisher_name'],
+ )
+ counts[status] += 1
+ if not issnl:
+ continue
+ d = self.data[issnl]
+ if not 'kbart' in d:
+ self.data[issnl]['kbart'] = dict()
+ d = self.data[issnl]
+ if not name in d['kbart']:
+ self.data[issnl]['kbart'][name] = dict()
+ old_spans = self.data[issnl]['kbart'].get(name, dict()).get('year_spans', [])
+ kbart = dict()
+ if row['date_first_issue_online'] and row['date_last_issue_online']:
+ start = int(row['date_first_issue_online'][:4])
+ end = int(row['date_last_issue_online'][:4])
+ if not start <= end:
+ print("{}: {} not before {}! er, mangling".format(
+ issnl,
+ row['date_first_issue_online'],
+ row['date_last_issue_online']))
+ new_spans = [[end, start]]
+ else:
+ new_spans = [[start, end]]
+ self.data[issnl]['kbart'][name]['year_spans'] = merge_spans(old_spans, new_spans)
+ print(counts)
+
+ def load_crossref(self, path):
+ print("##### Loading Crossref...")
+ #"JournalTitle","JournalID","Publisher","pissn","eissn","additionalIssns","doi","(year1)[volume1]issue1,issue2,issue3(year2)[volume2]issue4,issues5"
+ reader = csv.DictReader(open(path))
+ counts = Counter()
+ for row in reader:
+ if row['pissn'] and len(row['pissn']) == 8:
+ row['pissn'] = row['pissn'][:4] + '-' + row['pissn'][4:]
+ if row['eissn'] and len(row['eissn']) == 8:
+ row['eissn'] = row['eissn'][:4] + '-' + row['eissn'][4:]
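+            # e.g. an undashed '00158208' is normalized to '0015-8208' above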
+ if not (row['pissn'] or row['eissn']):
+ counts['no-issn'] += 1
+ continue
+ issnl, status = self.add_issn(
+ issnp=row['pissn'],
+ issne=row['eissn'],
+ name=row['JournalTitle'],
+ publisher=row['Publisher'],
+ )
+ counts[status] += 1
+ if not issnl:
+ continue
+ d = self.data[issnl]
+ crossref = dict()
+ if row['doi']:
+ crossref['doi'] = row['doi']
+ crossref['any'] = True
+ self.data[issnl]['crossref'] = crossref
+ print(counts)
+
+ def load_sim(self, path):
+ print("##### Loading SIM Metadata...")
+ #NA Pub Cat ID,Title,Publisher,ISSN,Impact Rank,Total Cities,Journal Impact Factor,Eigenfact or Score,First Volume,Last Volume,NA Gaps,"Scholarly / Peer-\n Reviewed","Peer-\n Reviewed",Pub Type,Pub Language,Subjects
+ reader = csv.DictReader(open(path))
+ counts = Counter()
+ for row in reader:
+ if not row['ISSN'] or row['ISSN'] == "NULL":
+ counts['no-issn'] += 1
+ continue
+ issnl, status = self.add_issn(
+ raw_issn=row['ISSN'][:9],
+ name=row['Title'],
+ publisher=row['Publisher'],
+ )
+ counts[status] += 1
+ if not issnl:
+ continue
+ d = self.data[issnl]
+ sim = dict()
+ sim['id'] = row['NA Pub Cat ID']
+ first_year = row['First Volume']
+ if first_year:
+ first_year = int(first_year)
+ sim['first_year'] = int(row['First Volume'])
+ else:
+ first_year = None
+ last_year = row['Last Volume']
+ if last_year:
+ last_year = int(last_year)
+ sim['last_year'] = last_year
+ else:
+ last_year = None
+ gaps = [int(g) for g in row['NA Gaps'].split(';') if g.strip()]
+ if gaps:
+ sim['gaps'] = gaps
+ if first_year and last_year:
+ sim['year_spans'] = gaps_to_spans(first_year, last_year, gaps)
+ if row['Pub Language']:
+ self.add_lang(issnl, row['Pub Language'])
+ # TODO: 'Pub Type'
+ all_keys = list(sim.keys())
+ for k in all_keys:
+ if not sim[k]:
+ sim.pop(k)
+ self.data[issnl]['sim'] = sim
+ print(counts)
+
+ def load_homepage_crawl(self, path):
+ print("##### Loading IA Homepage Crawl Results...")
+ reader = csv.DictReader(open(path), delimiter='\t',
+ fieldnames=("ISSN", "first_url", "first_status", "last_status", "last_url")
+ )
+ counts = Counter()
+ for row in reader:
+ issnl, status = self.add_issn(
+ raw_issn=row['ISSN'],
+ )
+ counts[status] += 1
+ if not issnl:
+ continue
+ d = self.data[issnl]
+ ia = d.get('ia', dict())
+ ia['homepage_status'] = int(row['last_status'])
+ if ia['homepage_status'] == 200:
+ ia['homepage_url'] = row['last_url']
+ else:
+ ia['homepage_url'] = row['first_url']
+ self.data[issnl]['ia'] = ia
+ print(counts)
+
+if __name__=='__main__':
+ if len(sys.argv) != 2 or sys.argv[1].startswith('-'):
+ print("pass me path for an output JSON lines file")
+ sys.exit(-1)
+ munger = Munger()
+ munger.run(sys.argv[1])
+
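+# Example invocation (output filename is hypothetical):
+#
+#   ./parse_merge_metadata.py journal_metadata.jsonl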