author     Bryan Newbold <bnewbold@archive.org>  2019-12-23 19:16:20 -0800
committer  Bryan Newbold <bnewbold@archive.org>  2019-12-23 19:16:20 -0800
commit     3232f9509404c75777f23d7272416d8de4a45789 (patch)
tree       6a5224d60b14cead9cf4b34ba2e8277e8712437b /old
parent     f8db4ee808b8e4db0ec413ad942f8129478041cc (diff)
move old scripts into subdirectory
Diffstat (limited to 'old')
-rwxr-xr-x  old/fix_invalid_issnl.py       75
-rw-r--r--  old/invalid_fatcat_issnl.tsv  118
-rwxr-xr-x  old/parse_merge_metadata.py   674
3 files changed, 867 insertions, 0 deletions
diff --git a/old/fix_invalid_issnl.py b/old/fix_invalid_issnl.py
new file mode 100755
index 0000000..521f334
--- /dev/null
+++ b/old/fix_invalid_issnl.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python3
+
+"""
+This is a one-off script for pushing ISSN-L fixes to fatcat via the API.
+
+It expects the fatcat python libraries to be available. Run it like:
+
+ export FATCAT_API_AUTH_TOKEN="..."
+ ./fix_invalid_issnl.py ~/code/chocula/invalid_fatcat_issnl.tsv
+
+It creates a new editgroup, which you'll need to merge/accept manually.
+
+Defaults to QA API endpoint; edit the file to switch to prod.
+"""
+
+import os, sys
+import csv
+
+from fatcat_tools import authenticated_api
+from fatcat_client import Editgroup, ContainerEntity
+from fatcat_client.rest import ApiException
+
+API_ENDPOINT = "https://api.qa.fatcat.wiki/v0"
+
+
+def run(api, row_iter):
+
+ eg = api.create_editgroup(Editgroup(description=
+ "Update or merge containers with invalid (by checksum) ISSN-L. Using the fix_invalid_issnl.py script from chocula repo."))
+ print("Editgroup ident: {}".format(eg.editgroup_id))
+ for row in row_iter:
+ #print(row)
+ fixed_issnl = row['fixed_issnl'].strip()
+ if not fixed_issnl:
+ print("SKIP")
+ continue
+ assert row['issnl'].strip() != fixed_issnl
+ invalid = api.get_container(row['fatcat_ident'])
+ assert invalid.state == "active"
+ try:
+            fixed = api.lookup_container(issnl=fixed_issnl)
+ except ApiException as ae:
+ if ae.status != 404:
+ raise ae
+ fixed = None
+
+ if fixed:
+ # merge/redirect
+ assert fixed.state == "active"
+ print("MERGE: {} -> {}".format(invalid.ident, fixed.ident))
+ invalid.redirect = fixed.ident
+ api.update_container(eg.editgroup_id, invalid.ident,
+ ContainerEntity(redirect=fixed.ident))
+ else:
+ # update in-place with fixed ISSN-L
+ print("FIX: {}: {}".format(invalid.ident, fixed_issnl))
+ invalid.issnl = fixed_issnl
+ api.update_container(eg.editgroup_id, invalid.ident, invalid)
+
+ # intentionally not merging editgroup
+ print("Editgroup ident: {}".format(eg.editgroup_id))
+ print("DONE")
+
+def main():
+ api = authenticated_api(
+ API_ENDPOINT,
+ # token is an optional kwarg (can be empty string, None, etc)
+ token=os.environ.get("FATCAT_API_AUTH_TOKEN"))
+
+ path = sys.argv[1]
+ reader = csv.DictReader(open(path), delimiter='\t')
+ run(api, reader)
+
+if __name__ == '__main__':
+ main()
diff --git a/old/invalid_fatcat_issnl.tsv b/old/invalid_fatcat_issnl.tsv
new file mode 100644
index 0000000..14a8dec
--- /dev/null
+++ b/old/invalid_fatcat_issnl.tsv
@@ -0,0 +1,118 @@
+fixed_issnl issnl fatcat_ident name
+1530-1311 1550-1311 2nklacmgkjdfjib7kqi3hdh76a International Symposium on Temporal Representation and Reasoning/TIME, Proceedings of the
+2306-0441 2223-0441 rngvdeed65ffhmgfxti7s2z6by Journal of Local and Global Health Science
+1641-6554 1641-6565 yviicehmubf4bcxo23q43sbkzu Kolposkopia
+1526-7539 1526-7639 pnyefvclqfabjlnl4suox6pdte International Symposium on Modeling, Analysis and Simulation of Computer and Telecommunication Systems, Proceedings of the
+0276-6574 0276-6547 hcrk2xeoknf7daxwurje2vg3n4 Computers in Cardiology Conference
+0018-9413 0359-4237 ngthxcdwgzhovfgimtll6owdnm IEEE Transactions on Geoscience Electronics
+ 2630-4301 drgmggxvfjbjjkakrjegaujkgy Food Modelling Journal
+1089-7771 1086-7771 sjjfcknh3zawndw6jdmvfrix7a IEEE transactions on information technology in biomedicine
+1093-1139 1039-1139 nnrvd2qmhzbebk2hsopvtchodq Academic Physician and Scientist
+0037-7996 0037-7796 klgctd3pbvbwpf6ynvd4wns53m The Social studies
+1520-5363 1520-5263 v3zwf2ujd5flzlopo2h4ggmdhu Document Analysis and Recognition (ICDAR), Proceedings of the International Conference on
+1992-4712 1992-4721 mmptdalzizepxi7uqfohnjj3uy NAFU Farmer
+0066-7374 0066-7372 s5fjnmhtkbggtcn2itw4g34bxq Proceedings of the Aristotelian Society (Hardback)
+ 1234-5678 lkoo6d6lcvba7hhwswxc6x2sly Test s Publication
+1088-7725 1080-7725 ikqqv7cnwfhpfkqy2lfugiz6ae AUTOTESTCON IEEE Systems Readiness Technology Conference, Proceedings of the
+ 0537-4737 kklzxgby4zfqvow7klnneyjj7e IEE Telecommunications Hot Topics Forum
+1527-4160 1528-1145 q7j7ro4qfnbjdb2kkkcxatesna Journal of Psychiatric Practice
+2468-4295 2648-4309 qtvq67fxfje4ro5mldpan35s34 Brill Research Perspectives in Art and Law
+ 1113-1114 qqkn2huytzfj5fw573bc5ihxyi A-to-Z Guide to Thermodynamics Heat and Mass Transfer and Fluids Engineering
+1080-241X 1082-241X c3pwhzd5hvbqzfgqdrbxnyfn3i Proceedings Annual Simulation Symposium
+1050-4729 1059-4729 b5exnobxjvd65flifftm6gy3f4 IEEE International Conference on Engineering of Complex Computer Systems, Proceedings of the
+0295-1630 0296-1630 u7tretgoaffblbvtsxbtml6pbu Pleine Marge
+1071-0485 1071-0483 6x6ijsddbva7xldh5ubqoxcf7a IEEE International Workshop on Future Trends of Distributed Computing Systems (FTDCS)
+0093-2582 0093-2592 ajsbz23tbnghzmwleirmr7or3a The St. Croix review
+1544-6301 1544-6307 tl5eyaqxbfcllmy7ip63up2sma Journal Of Cancer Integrative Medicine
+2155-5745 2155-5754 b6s7uywtdfannklqblq6tghkte International Radar Symposium
+0767-3701 3767-3701 wyh37ssfibcdtlncbi53m7ms4q Recherche et Applications en Marketing
+2255-8691 2225-8691 bl6y7mpqgfdtxoqqbcq7bikwvu Applied Computer Systems
+2561-3103 2561-3101 46uvtmw72zgpjhmbkvllrnwlau Journal of Contact Lens Research and Science
+0939-7248 0939-7148 xma7s7w65zed3g4ojkmfz7r6ca European journal of pediatric surgery
+1071-6084 1071-6048 dqthad6e3vfhxjf4px42u76dbe The Journal of technology studies
+1528-8463 1538-8463 j4cxdxvaxrb2thxtvbda47gy2y Hospital law's Regan report
+ 2352-3951 pc6ftuth7ffvreuolz5rxbaye4 Nanoelectronics and Spintronics
+2006-9820 2006-9802 wmugzbvj6rcrzg5hf5jtug6tja Journal of Toxicology and Environmental Health Sciences
+0953-9182 0955-9182 mcpenq5ytrd6zpmuyc5crmx2fi Contemporary reviews in obstetrics and gynaecology
+0259-188X 0459-188X iw7xtyujbrax7ojqqbgxjlrsem Indicator South Africa
+0894-9115 0894-9112 wa3xjixz6ve75j2ecrk23sposq American Journal of Physical Medicine & Rehabilitation
+0165-9367 0165-0937 32qxo574yjcbbbchqijyakggzm BABesch - Bulletin Antieke Beschaving
+2586-1247 2968-9853 whexebkzebh5ncyb6hvjppkyqa International Journal of Sustainable Lighting
+1811-7813 1511-7813 ipu26yclwnhw7ekbsve6wpb44a Point-Based Graphics (PBG), IEEE VGTC Symposium Proceedings
+0394-1051 2532-4681 opjfte47erhb7dke5aeap5wbse Eupalino : cultura della città e della casa
+ 0538-9989 cjkaddjcfzhmveuuffmtycjxmm Professional Development Course on Railway Electrification Infrastructure and Systems
+1081-7735 1085-7735 oam6q4knpjaq5enpfzra3jazfi Asian Test Symposium, Proceedings of the
+1064-7546 1064-7564 su2chgwy2vf6zavx6ekbyv4sem Journal of Environmental Polymer Degradation
+1817-4426 1817-4428 kwqvwiwyx5b5vbjxfetnpxt6ky The Retail and Marketing Review
+ 1432-3123 sa73qgbh25bgnd5q6o262u7inm china Frontiers
+2141-2634 2142-2634 g2z2ld65nve2pdwjk7gr3qe25e Journal of Civil Engineering and Construction Technology
+ 9999-9998 nvya624wtbbgxl6bdsrqatslum Space Resolver 15
+ 0006-0809 5gtixxmzm5aodhqpd4bqp3hqiq The Bridge
+0547-3578 7964-0977 bonch7zygnadddv2fqoecmj4yy IEEE National Aerospace and Electronics Conference
+0003-3839 0003-3829 yqozzqdrwvedvafwrokyx5puna Annales agronomiques
+2213-9885 2213-0886 hdfn46db7bgw5e7lmmlx3iubxe Current Chemical Genomics and Translational Medicine
+2378-5225 2333-5225 tgcsgbazhrfqzodbpdkjgtvv6i Pathophysiology of Cell Injury Journal
+1761-7227 1761-7727 uynmrtrspngoxdjdihrwx3x6oi International orthodontics
+1079-5545 1990-1993 goo5ob7jqzh7ro5pft7j3qdkaq Academy of Management Executive
+1536-0040 1111-1111 7au72xtbafdwpdsz6d53elvk2a SIAM E-Journal on Applied Dynamical Systems
+0163-514X 0613-514X jjup253sjjc2pkg3zz3vfz3nye The Journal of Prevention
+1020-1491 1020-1497 gtn3v33zcrg5zndvyir7l65mna Image & Text : a Journal for Design
+0972-8279 0972-8270 laktqf5ag5fpje2kfzzutnooyq Journal of maxillofacial and oral surgery
+ 0133-0189 kixvoietq5bhjeub36kqu6kwpa DCDS Supplements
+1071-9458 1071-9658 xmwrlaklfbe53ayhlzrhinb5cu International Symposium on Software Reliability Engineering, Proceedings of the
+0040-6325 0040-5325 qpbm4in22jevvfeht6mhawmllq The Thomist: A Speculative Quarterly Review
+0039-4157 0039-4156 5xp3jodttzhmrp22xl5femz52q Study encounter
+1122-5300 1125-5300 zeclu7ljprfkvauia5vljzvnfy Parolechiave
+ 1453-212X dhdssmbdbjfyfe53dpbfgcejti Research Journal of International Studies
+1530-1834 1550-1834 d57htlky2nexlmy4wa36lva6wy Brazilian Symposium on Computer Graphics and Image Processing
+ 0686-3174 ajqi4yc43feitknsozbzgpjnre Y hoc Viet Nam
+0015-8208 1521-3979 hoifudhajjgwfauzsgevzgogpa Fortschritte der Physik/Progress of Physics
+2141-6613 2141-6113 xghlebvnzzco3j6eb6ywlwal3m International Journal of Water Resources and Environmental Engineering
+0112-9120 0110-7903 6c6fql7levae5lyunnm6aiuzaq New Zealand monthly review
+ 0798-1881 xnblbncvezfuveglr5loa2rqym Construccion
+0017-3703 0017-3701 5dfgot47anfwxk65i3biooh6we Greater amusements
+0015-8577 0015-1287 xjwp4btqb5gojjcmdegglcyrli The Forumeer
+0160-8371 1060-8371 jbubc4zldfegxfkstwvg2oh47i IEEE Photovoltaic Specialists Conference, Conference Record of the
+2378-9530 2378-8953 4x4y2afasfg45iqoo47vphr6iq Veterinary Research and Case Reports
+2162-4933 2162-4923 azachjfocbf6xciiz3m3ze564u Current Dermatology Reports
+0882-5645 0882-5646 7fyviche6rbyzfsznpfrvvlyv4 Topics in Pain Management
+0033-7196 0033-7916 hsp5k2eij5dwpe4tbqf2n7gkpa RWDSU record
+1823-3902 1832-3902 sdw3bp3mk5b7vpkf22nj3dbttu Journal of tropical biology & conservation
+1066-6192 1066-6172 jlzpb6temzauliox6i7ecma3mq Euromicro Workshop on Parallel and Distributed Processing, Proceedings of the
+1818-9091 1819-9091 gom2wpnbtje6zfssyliobeup24 SA Horseman
+2504-2289 8888-0000 eyngovpvzfen5hsd55udttyk2u Big Data and Cognitive Computing
+ 8164-9547 knfumdwvafcnjnwge24v3kaip4 IEEE International Conference on Communications (2006)
+0369-8718 0368-8718 fatiuzqz7veebcr4n3t7wvn2n4 Proceedings of the Chemical Society
+2379-6227 2397-6227 37z2akit2nbtbea5qcn5shreru Computational Psychiatry
+0025-6978 0026-6978 vq6o3qyo3rezvercvuai2dciti Medical and biological illustration
+2588-4174 4174-2588 lr5kb4tnq5bpjf3otub5umkdmy Afzalipour Journal of Clinical Research
+2641-7928 1089-796X cff4m4os3bdznnc3fxbh2vv6ha International Conference on Parallel Architectures and Compilation Techniques, Proceedings of the
+1931-7611 1931-7811 gmtif5qt6vddroc7uox4hxtch4 International journal of sexual health
+0538-933X 0583-9364 xt6yxm3j7jdyzb6viwcltb6vsq International Reading Association. Proceedings of the annual convention
+ 7777-8888 3va7v7macvbalhxfujg5qfb7vi A Journal with Only 1 Volume
+2157-3689 2157-3698 qegnzb72dzhf5jdunf3ge7jmji Limnology & Oceanography Fluids & Environment
+1673-3487 1673-3508 z3jjoyzt45ecrh3c62afjmmxjm Frontiers of Physical in China
+2571-6255 0013-0013 kwk3mhqddfddrd6hjqdnkg3f3y Fire
+2141-2618 2141-2648 w5aoztaff5dg7harb4d2fyxzji Journal of Medical Laboratory and Diagnosis
+ 0804-3321 bqecated6jcwzoe56hmlxxydsq Utdanning
+ 0842-3446 wiidl3rotncxfpqoj46uegf5mq African Journal of Farm Child and Youth Development
+ 0539-9989 c7abz7liw5d53bskkcq7fcetvy Institution of Engineering and Technology International Conference on Medical Electrical Devices and Technology
+1051-9173 1057-9173 auy44witavcmnejiss5h4ndlnq IEEE Symposium on Mass Storage Systems
+0276-2234 0726-2234 6cbknzfrcfchhj73pmrlull2ky Oncology Times UK
+0029-0564 1949-1986 s4poo6asmffuhlfw4lsneh6fq4 Nineteenth-Century Fiction
+ 9999-9999 zjdb7jqlxzcmlatzcjfjlrna6m Space Resolver 16
+0026-475X 0027-475X xmgc2fswtjaghmrocercgewwnu Minerva dietologica
+ 8164-2284 jkjfyzs3ijcnfbbfwxpewlwuaq IEEE International Electron Devices Meeting
+1972-3792 2035-0167 nbo7hjouubbjba3kk5nn6oucmu Quaderni materialisti
+0307-0565 0307-9565 mfgzm4p6zzgsxja4jjy7etl35q International journal of obesity
+0738-6176 0723-6176 ynfkc2j7lzbpnchzpjic2orwlq The Psychotherapy patient
+0963-0252 0965-0252 4s5mbkeut5g5tass6gz6qfivny Plasma sources science & technology
+0144-3887 0143-3887 kdtbqfuuynd2fc64lprajnauvm Current Psychological Research
+1470-3556 0741-9738 23er6bey4rgclaudewagavnf7m Advances in Mind-Body Medicine
+0040-3261 0400-3261 arakq24urzgfhbz4abv6kz4mha Tennessee historical quarterly
+0957-4484 0957-9207 k4v6ng56q5fb5p7yjyi7fsmv5a Nanotechnology
+1087-9870 1089-9870 ubsj5pa5abbe7gqqdttkfibqle Thermal and Thermomechanical Phenomena in Electronic Systems (ITHERM), Intersociety Conference on
+1812-6731 1812-6371 lhhrrjjxynd3lcadty56vqjk4u New Voices in Psychology
+0163-0067 0613-0067 mlluqww5qnet7gqptazojr32bu Information world
+1080-9775 1080-9175 yxjwuy4kj5hqvorfbmv4widyqa Biomedical Safety & Standards
+ 9999-9997 o6xy2oixgzag5o6oe26fdmmr4e Space Resolver
diff --git a/old/parse_merge_metadata.py b/old/parse_merge_metadata.py
new file mode 100755
index 0000000..b1d038b
--- /dev/null
+++ b/old/parse_merge_metadata.py
@@ -0,0 +1,674 @@
+#!/usr/bin/env python3
+
+import sys, csv, json
+import ftfy
+import pycountry
+from collections import Counter
+
+ISSNL_FILE = 'data/20190220.ISSN-to-ISSN-L.txt'
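+# The ISSN-to-ISSN-L dump is whitespace-delimited ISSN/ISSN-L pairs under a
+# header line starting with "ISSN" (see read_issn_map_file below), e.g.:
+#   0000-0019	0000-0019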
+
+ENTREZ_FILE = 'data/entrez-journals.csv'
+ROAD_FILE = 'data/road-2018-01-24.tsv'
+ROAD_DATE = '2018-01-24'
+DOAJ_FILE = 'data/doaj_20190124.csv'
+DOAJ_DATE = '2019-01-24'
+CROSSREF_FILE = 'data/doi_titles_file_2019-01-24.csv'
+SHERPA_ROMEO_JOURNAL_FILE = 'data/romeo-journals.csv'
+SHERPA_ROMEO_POLICY_FILE = 'data/romeo-policies.csv'
+NORWEGIAN_FILE = 'data/2018-03-02 Norwegian Register for Scientific Journals and Series.csv'
+NORWEGIAN_DATE = '2018-03-02'
+LOCKSS_FILE = 'data/kbart_LOCKSS.txt'
+CLOCKSS_FILE = 'data/kbart_CLOCKSS.txt'
+PORTICO_FILE = 'data/Portico_Holding_KBart.txt'
+JSTOR_FILE = 'data/jstor_all-archive-titles.txt'
+SIM_FILE = 'data/MASTER TITLE_METADATA_LIST_20171019.converted.csv'
+IA_CRAWL_FILE = 'data/journal_homepage_results.partial.tsv'
+
+# NOTE: this is a partial list, focusing on non-publisher hosted platforms and
+# software frameworks
+PLATFORM_MAP = {
+ 'OJS': 'ojs',
+ 'BMC': 'bmc',
+ 'SciELO Brazil': 'scielo',
+ 'SciELO Argentina': 'scielo',
+ 'SciELO': 'scielo',
+ 'SciELO Mexico': 'scielo',
+ 'SciELO Spain': 'scielo',
+ 'SciELO Portugal': 'scielo',
+ 'WordPress': 'wordpress',
+ 'Sciendo': 'sciendo',
+ 'Drupal': 'drupal',
+ 'revues.org': 'openedition',
+}
+
+MIMETYPE_MAP = {
+ 'PDF': 'application/pdf',
+ 'HTML': 'text/html',
+ 'XML': 'application/xml',
+}
+
+def unquote(s):
+ if s.startswith('"'):
+ s = s[1:]
+ if s.endswith('"'):
+ s = s[:-1]
+ if s.endswith('.'):
+ s = s[:-1]
+ return s.strip()
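+# A quick illustration: unquote('"Nature."') -> 'Nature'; one leading and one
+# trailing quote plus a single trailing period are stripped.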
+
+def parse_lang(s):
+ if not s or s in ('Not applicable', 'Multiple languages', 'Unknown'):
+ return None
+ try:
+ if len(s) == 2:
+ lang = pycountry.languages.get(alpha2=s.lower())
+ elif len(s) == 3:
+ lang = pycountry.languages.get(alpha3=s.lower())
+ else:
+ lang = pycountry.languages.get(name=s)
+ return lang.alpha2.lower()
+ except KeyError:
+ return None
+ except AttributeError:
+ return None
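+# Illustrative behavior (assuming a pycountry release that accepts the
+# alpha2/alpha3/name kwargs used above): parse_lang('en') -> 'en',
+# parse_lang('English') -> 'en'; unrecognized or ambiguous values return None.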
+
+def parse_country(s):
+    if not s or s in ('Unknown',):
+ return None
+ try:
+ if len(s) == 2:
+ country = pycountry.countries.get(alpha2=s.lower())
+ else:
+ country = pycountry.countries.get(name=s)
+ except KeyError:
+ return None
+ return country.alpha2.lower()
+
+def gaps_to_spans(first, last, gaps):
+ if not gaps:
+ return [[first, last]]
+ if not (last >= first and max(gaps) < last and min(gaps) > first):
+ # mangled
+ print("mangled years: {}".format((first, last, gaps)))
+ return []
+ full = list(range(first, last+1))
+ for missing in gaps:
+ full.remove(missing)
+ spans = []
+ low = None
+ last = None
+ for year in full:
+ if not low:
+ low = year
+ last = year
+ continue
+ if year != last+1:
+ spans.append([low, last])
+ low = year
+ last = year
+ last = year
+ if low:
+ spans.append([low, last])
+ return spans
+
+def test_gaps():
+ assert gaps_to_spans(1900, 1900, None) == \
+ [[1900, 1900]]
+ assert gaps_to_spans(1900, 1903, None) == \
+ [[1900, 1903]]
+ assert gaps_to_spans(1900, 1902, [1901]) == \
+ [[1900, 1900], [1902, 1902]]
+ assert gaps_to_spans(1950, 1970, [1955, 1956, 1965]) == \
+ [[1950, 1954], [1957, 1964], [1966, 1970]]
+
+def merge_spans(old, new):
+ if not new:
+ return old
+ if not old:
+ old = []
+ old.extend(new)
+ years = set()
+ for span in old:
+ for y in range(span[0], span[1]+1):
+ years.add(y)
+ if not years:
+ return []
+ spans = []
+ start = None
+ last = None
+ todo = False
+ for y in sorted(list(years)):
+        if start is None:
+ # very first
+ start = y
+ last = y
+ todo = True
+ continue
+ if y == last + 1:
+ # span continues
+ last = y
+ todo = True
+ continue
+ # a gap just happened!
+ spans.append([start, last])
+ start = y
+ last = y
+ todo = True
+ if todo:
+ spans.append([start, last])
+ return spans
+
+def test_merge_spans():
+ assert merge_spans([[5, 10]], [[10, 20]]) == \
+ [[5, 20]]
+ assert merge_spans([[5, 9]], [[10, 20]]) == \
+ [[5, 20]]
+ assert merge_spans([[5, 11]], [[10, 20]]) == \
+ [[5, 20]]
+ assert merge_spans([], []) == \
+ []
+ assert merge_spans([[9, 11]], []) == \
+ [[9,11]]
+ assert merge_spans([[2000, 2000]], [[1450, 1900]]) == \
+ [[1450, 1900], [2000, 2000]]
+
+class Munger():
+ """
+ Top-level fields we'd like to fill in if possible:
+
+ issnp: string
+ issne: string
+ first_year: year (integer)
+ last_year: if publishing has stopped
+ languages: array of ISO codes; first is the "primary" language
+ country: ISO shortcode of country published from
+ urls: homepage links
+ abbrev: string
+ default_license: slug
+ original_name: native name (if name is translated)
+ platform: hosting platform: OJS, wordpress, scielo, etc
+ mimetypes: array of strings (eg, 'application/pdf', 'text/html')
+ aliases: array of "also known as"
+
+ Lower priority (TODO/later):
+ coden: string
+ oclc_id: string (lookup?)
+ lccn_id: string (lookup?)
+    dblp_id: string
+ region: TODO: continent/world-region
+ discipline: TODO: highest-level subject; "life science", "humanities", etc
+ field: TODO: narrower description of field
+ subjects: TODO?
+
+ TODO: so many missing ISSN/ISSN-L
+ TODO: abbrev
+ """
+
+ def __init__(self):
+ self.data = dict()
+ with open(ISSNL_FILE, 'r') as f:
+ self.read_issn_map_file(f)
+
+ def run(self, out_path):
+ self.load_doaj(DOAJ_FILE)
+ self.load_norwegian(NORWEGIAN_FILE)
+ self.load_crossref(CROSSREF_FILE)
+ self.load_sherpa_romeo(SHERPA_ROMEO_JOURNAL_FILE, SHERPA_ROMEO_POLICY_FILE)
+ self.load_road(ROAD_FILE)
+ self.load_kbart('lockss', LOCKSS_FILE)
+ self.load_kbart('clockss', CLOCKSS_FILE)
+ self.load_kbart('portico', PORTICO_FILE)
+ self.load_kbart('jstor', JSTOR_FILE)
+ self.load_entrez(ENTREZ_FILE)
+ self.load_sim(SIM_FILE)
+ self.load_homepage_crawl(IA_CRAWL_FILE)
+ self.summarize()
+ self.dump(out_path)
+ print("Done!")
+
+ def dump(self, out_path):
+ print("#### Dumping to {}".format(out_path))
+ with open(out_path, 'w') as out:
+ for issnl in self.data:
+ out.write(json.dumps(self.data[issnl]) + "\n")
+
+ def summarize(self):
+ print("##### Loaded {} unique entries".format(len(self.data)))
+
+ def read_issn_map_file(self, issn_map_file):
+ print("##### Loading ISSN map file...")
+ self._issn_issnl_map = dict()
+ for line in issn_map_file:
+            if line.startswith("ISSN") or not line.strip():
+ continue
+ (issn, issnl) = line.split()[0:2]
+ self._issn_issnl_map[issn] = issnl
+ # double mapping makes lookups easy
+ self._issn_issnl_map[issnl] = issnl
+ print("Got {} ISSN-L mappings.".format(len(self._issn_issnl_map)))
+
+ def issn2issnl(self, issn):
+ if issn is None:
+ return None
+ return self._issn_issnl_map.get(issn)
+
+ def add_issn(self, raw_issn=None, issne=None, issnp=None, name=None, publisher=None):
+ # do ISSN => ISSN-L mappings for any raw ISSNs
+ issnl = None
+ if not (raw_issn or issne or issnp):
+ return None, 'no-issn'
+ for lookup in (issnp, issne, raw_issn):
+ if not lookup:
+ continue
+ lookup = lookup.strip().upper()
+ #if not (len(lookup) == 9 and lookup[4] == '-'):
+ # print(lookup)
+ # print(len(lookup))
+ # print(lookup[4])
+ # return None, 'invalid-issn'
+ #assert len(lookup) == 9 and lookup[4] == '-'
+ issnl = self.issn2issnl(lookup)
+ if issnl:
+ break
+ if not issnl:
+ #print((raw_issn, issne, issnp))
+ # UGH.
+ issnl = issne or issnp or raw_issn
+ if not issnl:
+ return None, 'no-issnl'
+ issnl = issnl.strip().upper()
+ assert len(issnl) == 9 and issnl[4] == '-'
+ status = 'found-munge'
+ else:
+ status = 'found'
+ # lookup ISSN-Ls in data (or create one)
+ if not issnl in self.data:
+ status = 'created'
+ self.data[issnl] = dict(issnl=issnl)
+ d = self.data[issnl]
+ # if name/publisher not set, do so
+ if name and not 'name' in d:
+ name = unquote(ftfy.fix_text(name))
+ if name:
+ self.data[issnl]['name'] = name
+ if publisher and not 'publisher' in d:
+ publisher = unquote(ftfy.fix_text(publisher))
+ if publisher:
+ self.data[issnl]['publisher'] = publisher
+ if issne and not 'issne' in d:
+ self.data[issnl]['issne'] = issne
+ if issnp and not 'issnp' in d:
+ self.data[issnl]['issnp'] = issnp
+ # always return ISSN-L
+ return issnl, status
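+    # Every loader funnels through add_issn(); the returned status is one of
+    # 'no-issn', 'no-issnl', 'found', 'found-munge', or 'created', and each
+    # loader tallies these into a per-source Counter.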
+
+ def add_lang(self, issnl, lang):
+ if not (lang and issnl):
+ return
+ lang = parse_lang(lang)
+ if not lang:
+ return
+ if 'languages' not in self.data[issnl]:
+ self.data[issnl]['languages'] = [lang]
+ elif lang not in self.data[issnl]['languages']:
+ self.data[issnl]['languages'].append(lang)
+
+ def add_url(self, issnl, url):
+ if not (url and issnl) or 'mailto:' in url.lower() or url in ('http://n/a', 'http://N/A'):
+ return
+ if url.startswith('www.'):
+ url = "http://" + url
+        url = url.replace('Http://', 'http://')
+ if 'urls' not in self.data[issnl]:
+ self.data[issnl]['urls'] = [url]
+ elif url not in self.data[issnl]['urls']:
+ self.data[issnl]['urls'].append(url)
+
+ def add_country(self, issnl, country):
+ if not (country and issnl):
+ return
+ country = parse_country(country)
+ if not country:
+ return
+ if 'country' not in self.data[issnl]:
+ self.data[issnl]['country'] = country
+
+ def add_mimetype(self, issnl, val):
+ if not (val and issnl):
+ return
+ mimetype = None
+ if '/' in val:
+ mimetype = val
+ else:
+ mimetype = MIMETYPE_MAP.get(val)
+ if not mimetype:
+ return
+ if 'mimetypes' not in self.data[issnl]:
+            self.data[issnl]['mimetypes'] = [mimetype]
+        elif mimetype not in self.data[issnl]['mimetypes']:
+            self.data[issnl]['mimetypes'].append(mimetype)
+
+ def load_entrez(self, path):
+ print("##### Loading Entrez...")
+ # JrId,JournalTitle,MedAbbr,"ISSN (Print)","ISSN (Online)",IsoAbbr,NlmId
+ reader = csv.DictReader(open(path))
+ counts = Counter()
+ for row in reader:
+ if not (row.get('ISSN (Online)') or row.get('ISSN (Print)')):
+ counts['skipped'] += 1
+ continue
+ issnl, status = self.add_issn(
+ issne=row.get('ISSN (Online)'),
+ issnp=row.get('ISSN (Print)'),
+ name=row['JournalTitle'],
+ )
+ if row['IsoAbbr'] and not 'abbrev' in self.data[issnl]:
+ self.data[issnl]['abbrev'] = row['IsoAbbr'].strip()
+ counts[status] += 1
+ print(counts)
+
+ def load_road(self, path):
+ print("##### Loading ROAD...")
+ reader = csv.DictReader(open(path), delimiter='\t',
+ fieldnames=("ISSN", "ISSN-L", "Short Title", "Title", "Publisher", "URL1", "URL2", "Region", "Lang1", "Lang2")
+ )
+ counts = Counter()
+ for row in reader:
+ issnl, status = self.add_issn(
+ raw_issn=row['ISSN-L'],
+ name=row['Short Title'],
+ publisher=row['Publisher'],
+ )
+ counts[status] += 1
+ if not issnl:
+ continue
+ d = self.data[issnl]
+ if row['URL1']:
+ self.add_url(issnl, row['URL1'])
+ if row['URL2']:
+ self.add_url(issnl, row['URL2'])
+ if row['Lang1']:
+ self.add_lang(issnl, row['Lang1'])
+ if row['Lang2']:
+ self.add_lang(issnl, row['Lang2'])
+ # TODO: region mapping: "Europe and North America"
+ # TODO: lang mapping: already alpha-3
+ self.data[issnl]['road'] = dict(as_of=ROAD_DATE)
+ print(counts)
+
+ def load_doaj(self, path):
+ print("##### Loading DOAJ...")
+ #Journal title,Journal URL,Alternative title,Journal ISSN (print version),Journal EISSN (online version),Publisher,Society or institution,"Platform, host or aggregator",Country of publisher,Journal article processing charges (APCs),APC information URL,APC amount,Currency,Journal article submission fee,Submission fee URL,Submission fee amount,Submission fee currency,Number of articles publish in the last calendar year,Number of articles information URL,Journal waiver policy (for developing country authors etc),Waiver policy information URL,Digital archiving policy or program(s),Archiving: national library,Archiving: other,Archiving infomation URL,Journal full-text crawl permission,Permanent article identifiers,Journal provides download statistics,Download statistics information URL,First calendar year journal provided online Open Access content,Full text formats,Keywords,Full text language,URL for the Editorial Board page,Review process,Review process information URL,URL for journal's aims & scope,URL for journal's instructions for authors,Journal plagiarism screening policy,Plagiarism information URL,Average number of weeks between submission and publication,URL for journal's Open Access statement,Machine-readable CC licensing information embedded or displayed in articles,URL to an example page with embedded licensing information,Journal license,License attributes,URL for license terms,Does this journal allow unrestricted reuse in compliance with BOAI?,Deposit policy directory,Author holds copyright without restrictions,Copyright information URL,Author holds publishing rights without restrictions,Publishing rights information URL,DOAJ Seal,Tick: Accepted after March 2014,Added on Date,Subjects
+ reader = csv.DictReader(open(path))
+ counts = Counter()
+ for row in reader:
+ issnl, status = self.add_issn(
+ issnp=row['Journal ISSN (print version)'],
+ issne=row['Journal EISSN (online version)'],
+ name=row['Journal title'],
+ publisher=row['Publisher'],
+ )
+ counts[status] += 1
+ if not issnl:
+ continue
+ d = self.data[issnl]
+ doaj = dict(as_of=DOAJ_DATE)
+ # TODO: work_level: bool (are work-level publications deposited with DOAJ?)
+
+ if row['Digital archiving policy or program(s)']:
+ doaj['archive'] = [a.strip() for a in row['Digital archiving policy or program(s)'].split(',') if a.strip()]
+ elif row['Archiving: national library']:
+ doaj['archive'] = ['national-library']
+
+ crawl_permission = row['Journal full-text crawl permission']
+ if crawl_permission:
+ doaj['crawl-permission'] = dict(Yes=True, No=False)[crawl_permission]
+ # TODO: Permanent article identifiers
+ default_license = row['Journal license']
+ if default_license and default_license.startswith('CC'):
+ self.data[issnl]['default_license'] = default_license.replace('CC ', 'CC-').strip()
+
+ self.add_mimetype(issnl, row['Full text formats'])
+ platform = PLATFORM_MAP.get(row['Platform, host or aggregator'])
+ if platform:
+ self.data[issnl]['platform'] = platform
+ if row['DOAJ Seal']:
+ doaj['seal'] = {"no": False, "yes": True}[row['DOAJ Seal'].lower()]
+ if row['Country of publisher']:
+ self.add_country(issnl, row['Country of publisher'])
+ if row['Full text language']:
+ self.add_lang(issnl, row['Full text language'])
+ if row['Journal URL']:
+ self.add_url(issnl, row['Journal URL'])
+ # TODO: Subjects
+ self.data[issnl]['doaj'] = doaj
+ print(counts)
+
+ def load_sherpa_romeo(self, journal_path, policy_path):
+ # first load policies
+ print("##### Loading SHERPA/ROMEO policies...")
+ #RoMEO Record ID,Publisher,Policy Heading,Country,RoMEO colour,Published Permission,Published Restrictions,Published Max embargo,Accepted Prmission,Accepted Restrictions,Accepted Max embargo,Submitted Permission,Submitted Restrictions,Submitted Max embargo,Open Access Publishing,Record Status,Updated
+ policies = dict()
+ fixed_policy_file = ftfy.fix_file(open(policy_path, 'rb'))
+ policy_reader = csv.DictReader(fixed_policy_file)
+ for row in policy_reader:
+ policies[row['RoMEO Record ID']] = row
+ print("##### Loading SHERPA/ROMEO journal metadata...")
+ #Journal Title,ISSN,ESSN,URL,RoMEO Record ID,Updated
+ # super mangled :(
+ raw_file = open(journal_path, 'rb').read().decode(errors='replace')
+ fixed_file = ftfy.fix_text(raw_file)
+ reader = csv.DictReader(fixed_file.split('\n'))
+ counts = Counter()
+ for row in reader:
+ #row['Journal Title'] = row.pop('\ufeffJournal Title')
+ row.update(policies[row['RoMEO Record ID']])
+ issnl, status = self.add_issn(
+ issnp=row['ISSN'],
+ issne=row['ESSN'],
+ name=row['Journal Title'],
+ publisher=row['Publisher'],
+ )
+ counts[status] += 1
+ if not issnl:
+ continue
+ d = self.data[issnl]
+ sherpa_romeo = dict()
+ if row['RoMEO colour']:
+ sherpa_romeo['color'] = row['RoMEO colour']
+ # row['Open Access Publishing']
+ if row['Country']:
+ self.add_country(issnl, row['Country'])
+ self.data[issnl]['sherpa_romeo'] = sherpa_romeo
+ print(counts)
+
+ def load_norwegian(self, path):
+ print("##### Loading Norwegian Registry...")
+ #pandas.read_csv(NORWEGIAN_FILE, sep=';', encoding="ISO-8859-1")
+ #NSD tidsskrift_id;Original title;International title;Present Level (2018);Print ISSN;Online ISSN;Open Access;NPI Scientific Field;NPI Academic Discipline;URL;Publishing Company;Publisher;Country of publication;Language;Level 2019;Level 2018;Level 2017;Level 2016;Level 2015;Level 2014;Level 2013;Level 2012;Level 2011;Level 2010;Level 2009;Level 2008;Level 2007;Level 2006;Level 2005;Level 2004;itar_id
+ reader = csv.DictReader(open(path, encoding="ISO-8859-1"), delimiter=";")
+ counts = Counter()
+ for row in reader:
+ issnp = row['Print ISSN']
+ issne = row['Online ISSN']
+ if issne and len(issne.strip()) != 9:
+ issne = None
+ if issnp and len(issnp.strip()) != 9:
+ issnp = None
+ if not (issnp or issne):
+ counts['no-issn'] += 1
+ continue
+ issnl, status = self.add_issn(
+ issnp=issnp,
+ issne=issne,
+ name=row['International title'],
+ publisher=row['Publisher'],
+ )
+ counts[status] += 1
+ if not issnl:
+ continue
+ d = self.data[issnl]
+ norwegian = dict(as_of=NORWEGIAN_DATE)
+ norwegian['level'] = int(row['Present Level (2018)'])
+ norwegian['id'] = int(row['NSD tidsskrift_id'])
+
+ if row['Original title'] != row['International title'] and not 'original_name' in d:
+ self.data[issnl]['original_name'] = row['Original title']
+ if row['Country of publication']:
+ self.add_country(issnl, row['Country of publication'])
+ if row['Language']:
+ self.add_lang(issnl, row['Language'])
+ if row['URL']:
+ self.add_url(issnl, row['URL'])
+ self.data[issnl]['norwegian'] = norwegian
+ print(counts)
+
+ def load_kbart(self, name, path):
+ print("##### Loading KBART file for {}...".format(name))
+ #publication_title print_identifier online_identifier date_first_issue_online num_first_vol_online num_first_issue_online date_last_issue_online num_last_vol_online num_last_issue_online title_url first_author title_id embargo_info coverage_depth coverage_notes publisher_name
+ raw_file = open(path, 'rb').read().decode(errors='replace')
+ fixed_file = ftfy.fix_text(raw_file)
+ reader = csv.DictReader(fixed_file.split('\n'), delimiter='\t')
+ counts = Counter()
+ for row in reader:
+ if not row['print_identifier'] and not row['online_identifier']:
+ counts['no-issn'] += 1
+ continue
+ issnl, status = self.add_issn(
+ issnp=row['print_identifier'],
+ issne=row['online_identifier'],
+ name=row['publication_title'],
+ publisher=row['publisher_name'],
+ )
+ counts[status] += 1
+ if not issnl:
+ continue
+ d = self.data[issnl]
+ if not 'kbart' in d:
+ self.data[issnl]['kbart'] = dict()
+ d = self.data[issnl]
+ if not name in d['kbart']:
+ self.data[issnl]['kbart'][name] = dict()
+ old_spans = self.data[issnl]['kbart'].get(name, dict()).get('year_spans', [])
+ kbart = dict()
+ if row['date_first_issue_online'] and row['date_last_issue_online']:
+ start = int(row['date_first_issue_online'][:4])
+ end = int(row['date_last_issue_online'][:4])
+ if not start <= end:
+ print("{}: {} not before {}! er, mangling".format(
+ issnl,
+ row['date_first_issue_online'],
+ row['date_last_issue_online']))
+ new_spans = [[end, start]]
+ else:
+ new_spans = [[start, end]]
+ self.data[issnl]['kbart'][name]['year_spans'] = merge_spans(old_spans, new_spans)
+ print(counts)
+
+ def load_crossref(self, path):
+ print("##### Loading Crossref...")
+ #"JournalTitle","JournalID","Publisher","pissn","eissn","additionalIssns","doi","(year1)[volume1]issue1,issue2,issue3(year2)[volume2]issue4,issues5"
+ reader = csv.DictReader(open(path))
+ counts = Counter()
+ for row in reader:
+ if row['pissn'] and len(row['pissn']) == 8:
+ row['pissn'] = row['pissn'][:4] + '-' + row['pissn'][4:]
+ if row['eissn'] and len(row['eissn']) == 8:
+ row['eissn'] = row['eissn'][:4] + '-' + row['eissn'][4:]
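+            # e.g. an undashed '00158208' is normalized to '0015-8208' above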
+ if not (row['pissn'] or row['eissn']):
+ counts['no-issn'] += 1
+ continue
+ issnl, status = self.add_issn(
+ issnp=row['pissn'],
+ issne=row['eissn'],
+ name=row['JournalTitle'],
+ publisher=row['Publisher'],
+ )
+ counts[status] += 1
+ if not issnl:
+ continue
+ d = self.data[issnl]
+ crossref = dict()
+ if row['doi']:
+ crossref['doi'] = row['doi']
+ crossref['any'] = True
+ self.data[issnl]['crossref'] = crossref
+ print(counts)
+
+ def load_sim(self, path):
+ print("##### Loading SIM Metadata...")
+ #NA Pub Cat ID,Title,Publisher,ISSN,Impact Rank,Total Cities,Journal Impact Factor,Eigenfact or Score,First Volume,Last Volume,NA Gaps,"Scholarly / Peer-\n Reviewed","Peer-\n Reviewed",Pub Type,Pub Language,Subjects
+ reader = csv.DictReader(open(path))
+ counts = Counter()
+ for row in reader:
+ if not row['ISSN'] or row['ISSN'] == "NULL":
+ counts['no-issn'] += 1
+ continue
+ issnl, status = self.add_issn(
+ raw_issn=row['ISSN'][:9],
+ name=row['Title'],
+ publisher=row['Publisher'],
+ )
+ counts[status] += 1
+ if not issnl:
+ continue
+ d = self.data[issnl]
+ sim = dict()
+ sim['id'] = row['NA Pub Cat ID']
+ first_year = row['First Volume']
+ if first_year:
+ first_year = int(first_year)
+ sim['first_year'] = int(row['First Volume'])
+ else:
+ first_year = None
+ last_year = row['Last Volume']
+ if last_year:
+ last_year = int(last_year)
+ sim['last_year'] = last_year
+ else:
+ last_year = None
+ gaps = [int(g) for g in row['NA Gaps'].split(';') if g.strip()]
+ if gaps:
+ sim['gaps'] = gaps
+ if first_year and last_year:
+ sim['year_spans'] = gaps_to_spans(first_year, last_year, gaps)
+ if row['Pub Language']:
+ self.add_lang(issnl, row['Pub Language'])
+ # TODO: 'Pub Type'
+ all_keys = list(sim.keys())
+ for k in all_keys:
+ if not sim[k]:
+ sim.pop(k)
+ self.data[issnl]['sim'] = sim
+ print(counts)
+
+ def load_homepage_crawl(self, path):
+ print("##### Loading IA Homepage Crawl Results...")
+ reader = csv.DictReader(open(path), delimiter='\t',
+ fieldnames=("ISSN", "first_url", "first_status", "last_status", "last_url")
+ )
+ counts = Counter()
+ for row in reader:
+ issnl, status = self.add_issn(
+ raw_issn=row['ISSN'],
+ )
+ counts[status] += 1
+ if not issnl:
+ continue
+ d = self.data[issnl]
+ ia = d.get('ia', dict())
+ ia['homepage_status'] = int(row['last_status'])
+ if ia['homepage_status'] == 200:
+ ia['homepage_url'] = row['last_url']
+ else:
+ ia['homepage_url'] = row['first_url']
+ self.data[issnl]['ia'] = ia
+ print(counts)
+
+if __name__=='__main__':
+ if len(sys.argv) != 2 or sys.argv[1].startswith('-'):
+ print("pass me path for an output JSON lines file")
+ sys.exit(-1)
+ munger = Munger()
+ munger.run(sys.argv[1])
+
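+# Example invocation (output filename is hypothetical):
+#
+#   ./parse_merge_metadata.py journal_metadata.jsonl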