about | summary | refs | log | tree | commit | diff | stats
path: root/extra
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2019-07-30 20:50:18 -0700
committer Bryan Newbold <bnewbold@robocracy.org> 2019-07-30 20:50:18 -0700
commit 7bdcfe04fc4dbbe7fbe14ef6c45a80e09c78450f (patch)
tree 3b2ac523faea3179008dd7d4cd054576b0abe17b /extra
parent 019c867aa46025ecce6f2fddccc16b682081f4be (diff)
download fatcat-7bdcfe04fc4dbbe7fbe14ef6c45a80e09c78450f.tar.gz
download fatcat-7bdcfe04fc4dbbe7fbe14ef6c45a80e09c78450f.zip
chocula: better ISSN-L handling
Diffstat (limited to 'extra')
-rw-r--r-- extra/journal_metadata/Pipfile 1
-rw-r--r-- extra/journal_metadata/Pipfile.lock 28
-rwxr-xr-x extra/journal_metadata/chocula.py 27
-rw-r--r-- extra/journal_metadata/chocula_schema.sql 9
4 files changed, 41 insertions, 24 deletions
diff --git a/extra/journal_metadata/Pipfile b/extra/journal_metadata/Pipfile
index 36cacf3d..0cb50f20 100644
--- a/extra/journal_metadata/Pipfile
+++ b/extra/journal_metadata/Pipfile
@@ -12,6 +12,7 @@ surt = "*"
tldextract = "*"
pycountry = "*"
pytest = "*"
+python-stdnum = "*"
[requires]
python_version = "3.5"
diff --git a/extra/journal_metadata/Pipfile.lock b/extra/journal_metadata/Pipfile.lock
index b0f618ff..25ab75dc 100644
--- a/extra/journal_metadata/Pipfile.lock
+++ b/extra/journal_metadata/Pipfile.lock
@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
- "sha256": "6ec6017f7806aac149bdda3c7816bca91a7e62ce4c7a950813db1c8e163af3e0"
+ "sha256": "f07c29aa5f493fc5251946f614298aa4124f5d0dfe17504589a1ad8d73f86bd8"
},
"pipfile-spec": 6,
"requires": {
@@ -61,10 +61,10 @@
},
"importlib-metadata": {
"hashes": [
- "sha256:6dfd58dfe281e8d240937776065dd3624ad5469c835248219bd16cf2e12dbeb7",
- "sha256:cb6ee23b46173539939964df59d3d72c3e0c1b5d54b84f1d8a7e912fe43612db"
+ "sha256:23d3d873e008a513952355379d93cbcab874c58f4f034ff657c7a87422fa64e8",
+ "sha256:80d2de76188eabfbfcf27e6a37342c2827801e59c4cc14b0371c56fed43820e3"
],
- "version": "==0.18"
+ "version": "==0.19"
},
"more-itertools": {
"hashes": [
@@ -75,10 +75,10 @@
},
"packaging": {
"hashes": [
- "sha256:0c98a5d0be38ed775798ece1b9727178c4469d9c3b4ada66e8e6b7849f8732af",
- "sha256:9e1cbf8c12b1f1ce0bb5344b8d7ecf66a6f8a6e91bcb0c84593ed6d3ab5c4ab3"
+ "sha256:a7ac867b97fdc07ee80a8058fe4435ccd274ecc3b0ed61d852d7d53055528cf9",
+ "sha256:c491ca87294da7cc01902edbe30a5bc6c4c28172b5138ab4e4aa1b9d7bfaeafe"
],
- "version": "==19.0"
+ "version": "==19.1"
},
"pathlib2": {
"hashes": [
@@ -111,10 +111,10 @@
},
"pyparsing": {
"hashes": [
- "sha256:43c5486cefefa536c9aab528881c992328f020eefe4f6d06332449c365218580",
- "sha256:d6c5ffe9d0305b9b977f7a642d36b9370954d1da7ada4c62393382cbadad4265"
+ "sha256:6f98a7b9397e206d78cc01df10131398f1c8b8510a2f4d97d9abd82e1aacdd80",
+ "sha256:d9338df12903bbf5d65a0e4e87c2161968b10d2e489652bb47001d82a9b028b4"
],
- "version": "==2.4.1.1"
+ "version": "==2.4.2"
},
"pytest": {
"hashes": [
@@ -124,6 +124,14 @@
"index": "pypi",
"version": "==5.0.1"
},
+ "python-stdnum": {
+ "hashes": [
+ "sha256:d5f0af1bee9ddd9a20b398b46ce062dbd4d41fcc9646940f2667256a44df3854",
+ "sha256:f445ec32bf5246c90389204cabba465f494545371c29a83fa2d30e6c872a6763"
+ ],
+ "index": "pypi",
+ "version": "==1.11"
+ },
"requests": {
"hashes": [
"sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4",
diff --git a/extra/journal_metadata/chocula.py b/extra/journal_metadata/chocula.py
index 6049bb52..ad999f14 100755
--- a/extra/journal_metadata/chocula.py
+++ b/extra/journal_metadata/chocula.py
@@ -52,6 +52,7 @@ import urlcanon
import surt
import tldextract
import pycountry
+import stdnum.issn
################### File Config
@@ -396,7 +397,7 @@ class ChoculaDatabase():
self.c = None
def read_issn_map_file(self, issn_map_path):
- print("##### Loading ISSN map file...")
+ print("##### Loading ISSN-L map file...")
with open(issn_map_path, 'r') as issn_map_file:
self._issn_issnl_map = dict()
for line in issn_map_file:
@@ -433,7 +434,7 @@ class ChoculaDatabase():
if issnl:
break
if not issnl:
- return None, 'no-issnl'
+ return None, 'unknown-issnl'
#print((raw_issn, issne, issnp))
# UGH.
#issnl = issne or issnp or raw_issn
@@ -1004,7 +1005,7 @@ class ChoculaDatabase():
lang = languages[0]
try:
self.c.execute("INSERT OR REPLACE INTO fatcat_container (issnl, ident, revision, issne, issnp, wikidata_qid, name, container_type, publisher, country, lang) VALUES (?,?,?,?,?,?,?,?,?,?,?)",
- (row['issnl'],
+ (row.get('issnl'),
row['ident'],
row['revision'],
issne,
@@ -1069,7 +1070,7 @@ class ChoculaDatabase():
self.c = self.db.cursor()
self.db.row_factory = sqlite3.Row
index_issnls = list(self.c.execute('SELECT DISTINCT issnl FROM directory'))
- fatcat_issnls = list(self.c.execute('SELECT DISTINCT issnl FROM fatcat_container'))
+ fatcat_issnls = list(self.c.execute('SELECT DISTINCT issnl FROM fatcat_container WHERE issnl IS NOT null'))
all_issnls = set([i[0] for i in index_issnls + fatcat_issnls])
print("{} total ISSN-Ls".format(len(all_issnls)))
for issnl in list(all_issnls):
@@ -1079,16 +1080,19 @@ class ChoculaDatabase():
out = dict()
# check if ISSN-L is good. this is here because of fatcat import
- out['bad_issnl'] = not (self.issn2issnl(issnl) == issnl)
- if out['bad_issnl']:
- counts['bad-issnl'] += 1
+ out['known_issnl'] = (self.issn2issnl(issnl) == issnl)
+ if not out['known_issnl']:
+ counts['unknown-issnl'] += 1
+ out['valid_issnl'] = stdnum.issn.is_valid(issnl)
+ if not out['valid_issnl']:
+ counts['invalid-issnl'] += 1
fatcat_row = list(self.db.execute("SELECT * FROM fatcat_container WHERE issnl = ?;", [issnl]))
if fatcat_row:
frow = fatcat_row[0]
out['fatcat_ident'] = frow['ident']
- for k in ('name', 'publisher', 'issne', 'issnp', 'lang', 'country', 'release_count', 'ia_count', 'ia_frac', 'kbart_count', 'kbart_frac', 'preserved_count', 'preserved_frac'):
- if not out.get(k) and frow[k]:
+ for k in ('name', 'publisher', 'issne', 'issnp', 'wikidata_qid', 'lang', 'country', 'release_count', 'ia_count', 'ia_frac', 'kbart_count', 'kbart_frac', 'preserved_count', 'preserved_frac'):
+ if not out.get(k) and frow[k] != None:
out[k] = frow[k]
cur = self.db.execute("SELECT * FROM directory WHERE issnl = ?;", [issnl])
@@ -1145,7 +1149,7 @@ class ChoculaDatabase():
out['publisher_type'] = 'longtail'
out['is_longtail'] = True
- self.c.execute("INSERT OR REPLACE INTO journal (issnl, issne, issnp, fatcat_ident, name, publisher, country, lang, is_oa, is_longtail, is_active, publisher_type, has_dois, any_homepage, any_live_homepage, bad_issnl, release_count, ia_count, ia_frac, kbart_count, kbart_frac, preserved_count, preserved_frac) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
+ self.c.execute("INSERT OR REPLACE INTO journal (issnl, issne, issnp, fatcat_ident, name, publisher, country, lang, is_oa, is_longtail, is_active, publisher_type, has_dois, any_homepage, any_live_homepage, known_issnl, valid_issnl, release_count, ia_count, ia_frac, kbart_count, kbart_frac, preserved_count, preserved_frac) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
(issnl,
out.get('issne'),
out.get('issnp'),
@@ -1161,7 +1165,8 @@ class ChoculaDatabase():
out.get('has_dois', False),
out.get('any_homepage', False),
out.get('any_live_homepage', False),
- out.get('bad_issnl', False),
+ out.get('known_issnl'),
+ out.get('valid_issnl'),
out.get('release_count'),
out.get('ia_count'),
diff --git a/extra/journal_metadata/chocula_schema.sql b/extra/journal_metadata/chocula_schema.sql
index e7e857a3..24adb5e5 100644
--- a/extra/journal_metadata/chocula_schema.sql
+++ b/extra/journal_metadata/chocula_schema.sql
@@ -28,7 +28,9 @@ CREATE TABLE IF NOT EXISTS journal
has_dois BOOLEAN,
any_homepage BOOLEAN,
any_live_homepage BOOLEAN,
- bad_issnl BOOLEAN
+ any_gwb_homepage BOOLEAN,
+ known_issnl BOOLEAN,
+ valid_issnl BOOLEAN
);
CREATE TABLE IF NOT EXISTS directory
@@ -41,9 +43,9 @@ CREATE TABLE IF NOT EXISTS directory
);
CREATE TABLE IF NOT EXISTS fatcat_container
- (issnl TEXT NOT NULL PRIMARY KEY,
- ident TEXT NOT NULL,
+ (ident TEXT NOT NULL PRIMARY KEY,
revision TEXT NOT NULL,
+ issnl TEXT,
issne TEXT,
issnp TEXT,
wikidata_qid TEXT,
@@ -60,6 +62,7 @@ CREATE TABLE IF NOT EXISTS fatcat_container
preserved_count INTEGER,
preserved_frac FLOAT
);
+CREATE INDEX IF NOT EXISTS fatcat_container_issnl_idx ON fatcat_container(issnl);
CREATE TABLE IF NOT EXISTS homepage
(id INTEGER PRIMARY KEY,