author    Bryan Newbold <bnewbold@robocracy.org>    2019-07-11 19:56:46 -0700
committer Bryan Newbold <bnewbold@robocracy.org>    2019-07-11 19:56:46 -0700
commit    a8df5fa53ff2621f23fc9dda2ad5403e0abeb714 (patch)
tree      41236bdc7a66c1e519ed835241288220960b4591 /extra/journal_metadata
parent    32179f8dd78655515eb46cb3d1024f242512976c (diff)
EZB and Szczepanski indexers
Diffstat (limited to 'extra/journal_metadata')
-rwxr-xr-x  extra/journal_metadata/chocula.py  191
1 file changed, 146 insertions(+), 45 deletions(-)
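With this change, the two new sources can be indexed like the existing ones; each indexer falls back to a bundled default data file (SZCZEPANSKI_FILE, EZB_FILE) when args.input_file is not supplied. A plausible invocation (the exact flag spellings depend on the script's argparse setup, which is not fully shown in this diff):

    ./chocula.py index_szczepanski
    ./chocula.py index_ezb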
diff --git a/extra/journal_metadata/chocula.py b/extra/journal_metadata/chocula.py
index 6e32a3af..9ec4f1a8 100755
--- a/extra/journal_metadata/chocula.py
+++ b/extra/journal_metadata/chocula.py
@@ -18,9 +18,10 @@ Commands:
index_doaj
index_road
index_crossref
- index_szczepanski
- index_jurn
+ index_entrez
index_norwegian
+ index_szczepanski
+ index_ezb
fatcat_load
fatcat_stats
@@ -28,14 +29,18 @@ Commands:
export_urls
update_url_status
-TODO:
+Future commands:
- index_ezb
+ index_jurn
index_wikidata
index_datacite
preserve_kbart --keeper SLUG
preserve_sim
+TODO:
+- index -> directory
+- log out index issues (duplicate ISSN-L, etc)
+
"""
import sys, csv, json
@@ -70,6 +75,8 @@ PORTICO_FILE = 'data/Portico_Holding_KBart.txt'
JSTOR_FILE = 'data/jstor_all-archive-titles.txt'
SIM_FILE = 'data/MASTER TITLE_METADATA_LIST_20171019.converted.csv'
IA_CRAWL_FILE = 'data/journal_homepage_results.partial.tsv'
+SZCZEPANSKI_FILE = 'data/Jan-Szczepanski-Open-Access-Journals-2018_0.fixed.json'
+EZB_FILE = 'data/ezb_sample.json'
################### Utilities
@@ -132,11 +139,14 @@ def parse_country(s):
country = pycountry.countries.get(name=s)
except KeyError:
return None
- return country.alpha_2.lower()
+ if country:
+ return country.alpha_2.lower()
+ else:
+ return None
def parse_mimetypes(val):
# XXX: multiple mimetypes?
- if not (val and issnl):
+ if not val:
return
mimetype = None
if '/' in val:
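Two small correctness fixes above: parse_country() now guards against pycountry returning None (depending on the installed pycountry version, a failed name lookup may raise KeyError or simply return None, and the fixed function covers both paths), and parse_mimetypes() drops an apparently stray reference to issnl, a name not defined in that scope, which would have raised NameError. Consolidated, the defensive country lookup reads as follows (a sketch of the same logic outside diff context):

    import pycountry

    def parse_country(s):
        # Failed lookups raise KeyError in some pycountry versions
        # and return None in others; handle both.
        try:
            country = pycountry.countries.get(name=s)
        except KeyError:
            return None
        return country.alpha_2.lower() if country else None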
@@ -333,7 +343,8 @@ class ChoculaDatabase():
if issnp:
extra['issnp'] = issnp
- publisher = unquote(ftfy.fix_text(publisher))
+ if publisher:
+ publisher = unquote(ftfy.fix_text(publisher))
if publisher:
extra['publisher'] = publisher
@@ -342,8 +353,14 @@ class ChoculaDatabase():
else:
extra = None
- self.c.execute("INSERT INTO journal_index VALUES (?,?,?,?,?,?)",
- (issnl, index_slug, identifier, name, None, extra))
+ try:
+ self.c.execute("INSERT INTO journal_index VALUES (?,?,?,?,?,?)",
+ (issnl, index_slug, identifier, name, None, extra))
+ status = 'inserted'
+ except sqlite3.IntegrityError as ie:
+ if str(ie).startswith("UNIQUE"):
+ return None, "duplicate-issnl"
+ raise ie
return issnl, status
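The new try/except turns a UNIQUE-constraint violation into a "duplicate-issnl" status instead of crashing the whole indexing run, while re-raising any other IntegrityError. A minimal standalone sketch of the pattern (the real table's schema isn't shown in this diff; the UNIQUE constraint here is illustrative):

    import sqlite3

    db = sqlite3.connect(':memory:')
    db.execute("CREATE TABLE journal_index (issnl TEXT, slug TEXT, UNIQUE(issnl, slug))")
    db.execute("INSERT INTO journal_index VALUES ('1234-5678', 'doaj')")
    try:
        db.execute("INSERT INTO journal_index VALUES ('1234-5678', 'doaj')")
    except sqlite3.IntegrityError as ie:
        # sqlite3 reports these as e.g. "UNIQUE constraint failed: ..."
        if str(ie).startswith('UNIQUE'):
            print('duplicate-issnl')  # the status add_issn() returns above
        else:
            raise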
@@ -355,12 +372,12 @@ class ChoculaDatabase():
url.replace('Http://', 'http://')
url = str(urlcanon.semantic_precise(url))
- surt = surt.surt(url)
+ url_surt = surt.surt(url)
tld = tldextract.extract(url)
domain = '.'.join(tld[:])
self.c.execute("INSERT OR REPLACE INTO homepage (issnl, surt, url, host, domain, suffix) VALUES (?,?,?,?,?,?)",
- (issnl, surt, url, tld.domain, tld.registered_domain, tld.suffix))
+ (issnl, url_surt, url, tld.domain, tld.registered_domain, tld.suffix))
def index_entrez(self, args):
path = args.input_file or ENTREZ_FILE
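The surt rename fixes a shadowing bug: inside the method, the assignment surt = surt.surt(url) makes surt a local name for the whole scope, so the right-hand reference to the surt module raises UnboundLocalError before the call can succeed. A minimal reproduction and the fix:

    import surt

    def broken(url):
        surt = surt.surt(url)      # UnboundLocalError: 'surt' is local here

    def fixed(url):
        url_surt = surt.surt(url)  # distinct name leaves the module visible
        return url_surt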
@@ -385,6 +402,7 @@ class ChoculaDatabase():
)
counts[status] += 1
self.c.close()
+ self.db.commit()
print(counts)
def index_road(self, args):
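The self.db.commit() added at the end of this indexer (and repeated in every other indexer below) is load-bearing: Python's sqlite3 module wraps data-modifying statements in an implicit transaction (unless the connection is opened with isolation_level=None), so uncommitted inserts are rolled back when the connection closes. In miniature:

    import sqlite3

    db = sqlite3.connect('example.sqlite')
    db.execute('CREATE TABLE IF NOT EXISTS t (v TEXT)')
    db.execute("INSERT INTO t VALUES ('x')")
    db.commit()   # without this, the INSERT is discarded on close
    db.close()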
@@ -417,6 +435,7 @@ class ChoculaDatabase():
if row['URL2']:
self.add_url(issnl, row['URL2'])
self.c.close()
+ self.db.commit()
print(counts)
def index_doaj(self, args):
@@ -436,7 +455,6 @@ class ChoculaDatabase():
if row['DOAJ Seal']:
extra['seal'] = {"no": False, "yes": True}[row['DOAJ Seal'].lower()]
if row['Country of publisher']:
- print(row['Country of publisher'])
extra['country'] = parse_country(row['Country of publisher'])
row['lang'] = parse_lang(row['Full text language'])
# TODO: work_level: bool (are work-level publications deposited with DOAJ?)
@@ -452,7 +470,7 @@ class ChoculaDatabase():
# TODO: Permanent article identifiers
default_license = row['Journal license']
if default_license and default_license.startswith('CC'):
- extra[issnl]['default_license'] = default_license.replace('CC ', 'CC-').strip()
+ extra['default_license'] = default_license.replace('CC ', 'CC-').strip()
issnl, status = self.add_issn(
'doaj',
@@ -468,6 +486,7 @@ class ChoculaDatabase():
# TODO: Subjects
self.c.close()
+ self.db.commit()
print(counts)
def index_sherpa_romeo(self, args):
@@ -510,6 +529,7 @@ class ChoculaDatabase():
if not issnl:
continue
self.c.close()
+ self.db.commit()
print(counts)
def index_norwegian(self, args):
@@ -530,10 +550,19 @@ class ChoculaDatabase():
if not (issnp or issne):
counts['no-issn'] += 1
continue
+ extra = dict(as_of=NORWEGIAN_DATE)
+ extra['level'] = int(row['Present Level (2018)'])
+ if row['Original title'] != row['International title']:
+ extra['original_name'] = row['Original title']
+ if row['Country of publication']:
+ extra['country'] = parse_country(row['Country of publication'])
+ if row['Language']:
+ extra['lang'] = parse_lang(row['Language'])
issnl, status = self.add_issn(
'norwegian',
issnp=issnp,
issne=issne,
+ identifier=row['NSD tidsskrift_id'],
name=row['International title'],
publisher=row['Publisher'],
extra=extra,
@@ -541,21 +570,88 @@ class ChoculaDatabase():
counts[status] += 1
if not issnl:
continue
- d = self.data[issnl]
- norwegian = dict(as_of=NORWEGIAN_DATE)
- norwegian['level'] = int(row['Present Level (2018)'])
- norwegian['id'] = int(row['NSD tidsskrift_id'])
-
- if row['Original title'] != row['International title'] and not 'original_name' in d:
- self.data[issnl]['original_name'] = row['Original title']
- if row['Country of publication']:
- self.add_country(issnl, row['Country of publication'])
- if row['Language']:
- self.add_lang(issnl, row['Language'])
if row['URL']:
self.add_url(issnl, row['URL'])
- self.data[issnl]['norwegian'] = norwegian
self.c.close()
+ self.db.commit()
+ print(counts)
+
+ def index_szczepanski(self, args):
+ path = args.input_file or SZCZEPANSKI_FILE
+ print("##### Loading Szczepanski...")
+ # JSON
+ json_file = open(path, 'r')
+ counts = Counter()
+ self.c = self.db.cursor()
+ for row in json_file:
+ if not row:
+ continue
+ row = json.loads(row)
+ if not (row.get('issne') or row.get('issnp') or row.get('issn')):
+ #print(row)
+ counts['no-issn'] += 1
+ continue
+ extra = dict()
+ if row.get('extra'):
+ extra['notes'] = row.get('extra')
+ for k in ('other_titles', 'year_spans', 'ed'):
+ if row.get(k):
+ extra[k] = row[k]
+ issnl, status = self.add_issn(
+ 'szczepanski',
+ issne=row.get('issne'),
+ issnp=row.get('issnp'),
+ raw_issn=row.get('issn'),
+ name=row['title'],
+ publisher=row.get('ed'),
+ extra=extra,
+ )
+ counts[status] += 1
+ if not issnl:
+ continue
+ for url in row.get('urls', []):
+ self.add_url(issnl, url['url'])
+ self.c.close()
+ self.db.commit()
+ print(counts)
+
+ def index_ezb(self, args):
+ path = args.input_file or EZB_FILE
+ print("##### Loading EZB...")
+ # JSON
+ json_file = open(path, 'r')
+ counts = Counter()
+ self.c = self.db.cursor()
+ for row in json_file:
+ if not row:
+ continue
+ row = json.loads(row)
+ if not (row.get('issne') or row.get('issnp')):
+ #print(row)
+ counts['no-issn'] += 1
+ continue
+ extra = dict()
+ for k in ('ezb_color', 'subjects', 'keywords', 'zdb_id',
+ 'first_volume', 'first_issue', 'first_year',
+ 'appearance', 'costs'):
+ if row.get(k):
+ extra[k] = row[k]
+ issnl, status = self.add_issn(
+ 'ezb',
+ issne=row.get('issne'),
+ issnp=row.get('issnp'),
+ identifier=row['ezb_id'],
+ name=row['title'],
+ publisher=row.get('publisher'),
+ extra=extra,
+ )
+ counts[status] += 1
+ if not issnl:
+ continue
+ if row.get('url'):
+ self.add_url(issnl, row['url'])
+ self.c.close()
+ self.db.commit()
print(counts)
def index_kbart(self, name, path):
@@ -601,6 +697,7 @@ class ChoculaDatabase():
new_spans = [[start, end]]
self.data[issnl]['kbart'][name]['year_spans'] = merge_spans(old_spans, new_spans)
self.c.close()
+ self.db.commit()
print(counts)
def index_crossref(self, args):
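Both new indexers consume newline-delimited JSON, one record per line. Plausible input lines, inferred from the keys each parser reads (all values here are illustrative, not taken from the real data files):

    {"title": "Example OA Journal", "issne": "1234-5678", "ed": "J. Editor", "urls": [{"url": "http://example.com/j"}]}
    {"title": "Example EZB Journal", "issnp": "8765-4321", "ezb_id": "12345", "publisher": "Example Verlag", "url": "http://example.com/ezb", "ezb_color": "green"}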
@@ -615,27 +712,26 @@ class ChoculaDatabase():
row['pissn'] = row['pissn'][:4] + '-' + row['pissn'][4:]
if row['eissn'] and len(row['eissn']) == 8:
row['eissn'] = row['eissn'][:4] + '-' + row['eissn'][4:]
- if not (row['pissn'] or row['eissn']):
+ if row['additionalIssns'] and len(row['additionalIssns']) == 8:
+ row['additionalIssns'] = row['additionalIssns'][:4] + '-' + row['additionalIssns'][4:]
+ if not (row['pissn'] or row['eissn'] or row['additionalIssns']):
+ print(row)
counts['no-issn'] += 1
continue
+ extra = dict()
issnl, status = self.add_issn(
'crossref',
issnp=row['pissn'],
issne=row['eissn'],
+ raw_issn=row['additionalIssns'],
+ identifier=row.get('doi'),
name=row['JournalTitle'],
publisher=row['Publisher'],
extra=extra,
)
counts[status] += 1
- if not issnl:
- continue
- d = self.data[issnl]
- crossref = dict()
- if row['doi']:
- crossref['doi'] = row['doi']
- crossref['any'] = True
- self.data[issnl]['crossref'] = crossref
self.c.close()
+ self.db.commit()
print(counts)
def index_sim(self, args):
@@ -688,6 +784,7 @@ class ChoculaDatabase():
sim.pop(k)
self.data[issnl]['sim'] = sim
self.c.close()
+ self.db.commit()
print(counts)
def load_homepage_crawl(self, path):
@@ -713,6 +810,7 @@ class ChoculaDatabase():
ia['homepage_url'] = row['first_url']
self.data[issnl]['ia'] = ia
self.c.close()
+ self.db.commit()
print(counts)
def everything(self, args):
@@ -768,19 +866,22 @@ def main():
sub = subparsers.add_parser('summarize')
sub.set_defaults(func='summarize')
+ # TODO: 'jurn'
+ for ind in ('doaj', 'road', 'crossref', 'entrez', 'norwegian', 'szczepanski', 'ezb'):
+ sub = subparsers.add_parser('index_{}'.format(ind))
+ sub.set_defaults(func='index_{}'.format(ind))
+
+ sub = subparsers.add_parser('fatcat_load')
+ sub.set_defaults(func='fatcat_load')
- #index_doaj
- #index_road
- #index_crossref
- #index_szczepanski
- #index_jurn
- #index_norwegian
+ sub = subparsers.add_parser('fatcat_stats')
+ sub.set_defaults(func='fatcat_stats')
- #fatcat_load
- #fatcat_stats
+ sub = subparsers.add_parser('export_urls')
+ sub.set_defaults(func='export_urls')
- #export_urls
- #update_url_status
+ sub = subparsers.add_parser('update_url_status')
+ sub.set_defaults(func='update_url_status')
args = parser.parse_args()
if not args.__dict__.get("func"):
@@ -788,7 +889,7 @@ def main():
sys.exit(-1)
cdb = ChoculaDatabase(args.db_file)
- if args.func not in ('init_db', 'summarize'):
+ if args.func.startswith('index_'):
cdb.read_issn_map_file(ISSNL_FILE)
func = getattr(cdb, args.func)
func(args)
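The main() cleanup replaces one hand-written subparser block per command with name-based dispatch: each subparser records its own name via set_defaults(func=...), the matching ChoculaDatabase method is resolved with getattr, and the ISSN-L map is now loaded only for index_* commands. The pattern in isolation (a self-contained sketch, not the script's exact argument set):

    import argparse

    class App:
        def index_doaj(self, args):
            print('indexing doaj, input:', args.input_file)

        def index_road(self, args):
            print('indexing road, input:', args.input_file)

    parser = argparse.ArgumentParser()
    parser.add_argument('--input-file', default=None)
    subparsers = parser.add_subparsers()
    for name in ('index_doaj', 'index_road'):
        sub = subparsers.add_parser(name)
        sub.set_defaults(func=name)   # store the method name, not the callable

    args = parser.parse_args(['index_doaj'])
    getattr(App(), args.func)(args)   # -> indexing doaj, input: None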