From 7bdcfe04fc4dbbe7fbe14ef6c45a80e09c78450f Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 30 Jul 2019 20:50:18 -0700 Subject: chocula: better ISSN-L handling --- extra/journal_metadata/Pipfile | 1 + extra/journal_metadata/Pipfile.lock | 28 ++++++++++++++++++---------- extra/journal_metadata/chocula.py | 27 ++++++++++++++++----------- extra/journal_metadata/chocula_schema.sql | 9 ++++++--- 4 files changed, 41 insertions(+), 24 deletions(-) (limited to 'extra') diff --git a/extra/journal_metadata/Pipfile b/extra/journal_metadata/Pipfile index 36cacf3d..0cb50f20 100644 --- a/extra/journal_metadata/Pipfile +++ b/extra/journal_metadata/Pipfile @@ -12,6 +12,7 @@ surt = "*" tldextract = "*" pycountry = "*" pytest = "*" +python-stdnum = "*" [requires] python_version = "3.5" diff --git a/extra/journal_metadata/Pipfile.lock b/extra/journal_metadata/Pipfile.lock index b0f618ff..25ab75dc 100644 --- a/extra/journal_metadata/Pipfile.lock +++ b/extra/journal_metadata/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "6ec6017f7806aac149bdda3c7816bca91a7e62ce4c7a950813db1c8e163af3e0" + "sha256": "f07c29aa5f493fc5251946f614298aa4124f5d0dfe17504589a1ad8d73f86bd8" }, "pipfile-spec": 6, "requires": { @@ -61,10 +61,10 @@ }, "importlib-metadata": { "hashes": [ - "sha256:6dfd58dfe281e8d240937776065dd3624ad5469c835248219bd16cf2e12dbeb7", - "sha256:cb6ee23b46173539939964df59d3d72c3e0c1b5d54b84f1d8a7e912fe43612db" + "sha256:23d3d873e008a513952355379d93cbcab874c58f4f034ff657c7a87422fa64e8", + "sha256:80d2de76188eabfbfcf27e6a37342c2827801e59c4cc14b0371c56fed43820e3" ], - "version": "==0.18" + "version": "==0.19" }, "more-itertools": { "hashes": [ @@ -75,10 +75,10 @@ }, "packaging": { "hashes": [ - "sha256:0c98a5d0be38ed775798ece1b9727178c4469d9c3b4ada66e8e6b7849f8732af", - "sha256:9e1cbf8c12b1f1ce0bb5344b8d7ecf66a6f8a6e91bcb0c84593ed6d3ab5c4ab3" + "sha256:a7ac867b97fdc07ee80a8058fe4435ccd274ecc3b0ed61d852d7d53055528cf9", + "sha256:c491ca87294da7cc01902edbe30a5bc6c4c28172b5138ab4e4aa1b9d7bfaeafe" ], - "version": "==19.0" + "version": "==19.1" }, "pathlib2": { "hashes": [ @@ -111,10 +111,10 @@ }, "pyparsing": { "hashes": [ - "sha256:43c5486cefefa536c9aab528881c992328f020eefe4f6d06332449c365218580", - "sha256:d6c5ffe9d0305b9b977f7a642d36b9370954d1da7ada4c62393382cbadad4265" + "sha256:6f98a7b9397e206d78cc01df10131398f1c8b8510a2f4d97d9abd82e1aacdd80", + "sha256:d9338df12903bbf5d65a0e4e87c2161968b10d2e489652bb47001d82a9b028b4" ], - "version": "==2.4.1.1" + "version": "==2.4.2" }, "pytest": { "hashes": [ @@ -124,6 +124,14 @@ "index": "pypi", "version": "==5.0.1" }, + "python-stdnum": { + "hashes": [ + "sha256:d5f0af1bee9ddd9a20b398b46ce062dbd4d41fcc9646940f2667256a44df3854", + "sha256:f445ec32bf5246c90389204cabba465f494545371c29a83fa2d30e6c872a6763" + ], + "index": "pypi", + "version": "==1.11" + }, "requests": { "hashes": [ "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4", diff --git a/extra/journal_metadata/chocula.py b/extra/journal_metadata/chocula.py index 6049bb52..ad999f14 100755 --- a/extra/journal_metadata/chocula.py +++ b/extra/journal_metadata/chocula.py @@ -52,6 +52,7 @@ import urlcanon import surt import tldextract import pycountry +import stdnum.issn ################### File Config @@ -396,7 +397,7 @@ class ChoculaDatabase(): self.c = None def read_issn_map_file(self, issn_map_path): - print("##### Loading ISSN map file...") + print("##### Loading ISSN-L map file...") with open(issn_map_path, 'r') as issn_map_file: self._issn_issnl_map = dict() for line in issn_map_file: @@ -433,7 +434,7 @@ class ChoculaDatabase(): if issnl: break if not issnl: - return None, 'no-issnl' + return None, 'unknown-issnl' #print((raw_issn, issne, issnp)) # UGH. #issnl = issne or issnp or raw_issn @@ -1004,7 +1005,7 @@ class ChoculaDatabase(): lang = languages[0] try: self.c.execute("INSERT OR REPLACE INTO fatcat_container (issnl, ident, revision, issne, issnp, wikidata_qid, name, container_type, publisher, country, lang) VALUES (?,?,?,?,?,?,?,?,?,?,?)", - (row['issnl'], + (row.get('issnl'), row['ident'], row['revision'], issne, @@ -1069,7 +1070,7 @@ class ChoculaDatabase(): self.c = self.db.cursor() self.db.row_factory = sqlite3.Row index_issnls = list(self.c.execute('SELECT DISTINCT issnl FROM directory')) - fatcat_issnls = list(self.c.execute('SELECT DISTINCT issnl FROM fatcat_container')) + fatcat_issnls = list(self.c.execute('SELECT DISTINCT issnl FROM fatcat_container WHERE issnl IS NOT null')) all_issnls = set([i[0] for i in index_issnls + fatcat_issnls]) print("{} total ISSN-Ls".format(len(all_issnls))) for issnl in list(all_issnls): @@ -1079,16 +1080,19 @@ class ChoculaDatabase(): out = dict() # check if ISSN-L is good. this is here because of fatcat import - out['bad_issnl'] = not (self.issn2issnl(issnl) == issnl) - if out['bad_issnl']: - counts['bad-issnl'] += 1 + out['known_issnl'] = (self.issn2issnl(issnl) == issnl) + if not out['known_issnl']: + counts['unknown-issnl'] += 1 + out['valid_issnl'] = stdnum.issn.is_valid(issnl) + if not out['valid_issnl']: + counts['invalid-issnl'] += 1 fatcat_row = list(self.db.execute("SELECT * FROM fatcat_container WHERE issnl = ?;", [issnl])) if fatcat_row: frow = fatcat_row[0] out['fatcat_ident'] = frow['ident'] - for k in ('name', 'publisher', 'issne', 'issnp', 'lang', 'country', 'release_count', 'ia_count', 'ia_frac', 'kbart_count', 'kbart_frac', 'preserved_count', 'preserved_frac'): - if not out.get(k) and frow[k]: + for k in ('name', 'publisher', 'issne', 'issnp', 'wikidata_qid', 'lang', 'country', 'release_count', 'ia_count', 'ia_frac', 'kbart_count', 'kbart_frac', 'preserved_count', 'preserved_frac'): + if not out.get(k) and frow[k] != None: out[k] = frow[k] cur = self.db.execute("SELECT * FROM directory WHERE issnl = ?;", [issnl]) @@ -1145,7 +1149,7 @@ class ChoculaDatabase(): out['publisher_type'] = 'longtail' out['is_longtail'] = True - self.c.execute("INSERT OR REPLACE INTO journal (issnl, issne, issnp, fatcat_ident, name, publisher, country, lang, is_oa, is_longtail, is_active, publisher_type, has_dois, any_homepage, any_live_homepage, bad_issnl, release_count, ia_count, ia_frac, kbart_count, kbart_frac, preserved_count, preserved_frac) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)", + self.c.execute("INSERT OR REPLACE INTO journal (issnl, issne, issnp, fatcat_ident, name, publisher, country, lang, is_oa, is_longtail, is_active, publisher_type, has_dois, any_homepage, any_live_homepage, known_issnl, valid_issnl, release_count, ia_count, ia_frac, kbart_count, kbart_frac, preserved_count, preserved_frac) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)", (issnl, out.get('issne'), out.get('issnp'), @@ -1161,7 +1165,8 @@ class ChoculaDatabase(): out.get('has_dois', False), out.get('any_homepage', False), out.get('any_live_homepage', False), - out.get('bad_issnl', False), + out.get('known_issnl'), + out.get('valid_issnl'), out.get('release_count'), out.get('ia_count'), diff --git a/extra/journal_metadata/chocula_schema.sql b/extra/journal_metadata/chocula_schema.sql index e7e857a3..24adb5e5 100644 --- a/extra/journal_metadata/chocula_schema.sql +++ b/extra/journal_metadata/chocula_schema.sql @@ -28,7 +28,9 @@ CREATE TABLE IF NOT EXISTS journal has_dois BOOLEAN, any_homepage BOOLEAN, any_live_homepage BOOLEAN, - bad_issnl BOOLEAN + any_gwb_homepage BOOLEAN, + known_issnl BOOLEAN, + valid_issnl BOOLEAN ); CREATE TABLE IF NOT EXISTS directory @@ -41,9 +43,9 @@ CREATE TABLE IF NOT EXISTS directory ); CREATE TABLE IF NOT EXISTS fatcat_container - (issnl TEXT NOT NULL PRIMARY KEY, - ident TEXT NOT NULL, + (ident TEXT NOT NULL PRIMARY KEY, revision TEXT NOT NULL, + issnl TEXT, issne TEXT, issnp TEXT, wikidata_qid TEXT, @@ -60,6 +62,7 @@ CREATE TABLE IF NOT EXISTS fatcat_container preserved_count INTEGER, preserved_frac FLOAT ); +CREATE INDEX IF NOT EXISTS fatcat_container_issnl_idx ON fatcat_container(issnl); CREATE TABLE IF NOT EXISTS homepage (id INTEGER PRIMARY KEY, -- cgit v1.2.3