aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2019-07-30 21:50:41 -0700
committerBryan Newbold <bnewbold@robocracy.org>2019-07-30 21:50:56 -0700
commit615c81605190499db2fa98cb85610197d3ce5507 (patch)
tree5e5b039a75acb2be8069ff71e9cd48031c1b2a14
parent6b38eb42b91ed486a2dc87450417a3857662deb9 (diff)
downloadfatcat-615c81605190499db2fa98cb85610197d3ce5507.tar.gz
fatcat-615c81605190499db2fa98cb85610197d3ce5507.zip
chocula: sherpa_color in summary; cleanups
-rwxr-xr-xextra/journal_metadata/chocula.py14
-rw-r--r--extra/journal_metadata/chocula_schema.sql1
-rwxr-xr-xextra/journal_metadata/data/fetch.sh3
3 files changed, 12 insertions, 6 deletions
diff --git a/extra/journal_metadata/chocula.py b/extra/journal_metadata/chocula.py
index 72bff283..415b741f 100755
--- a/extra/journal_metadata/chocula.py
+++ b/extra/journal_metadata/chocula.py
@@ -549,6 +549,7 @@ class ChoculaDatabase():
path = args.input_file or DOAJ_FILE
print("##### Loading DOAJ...")
#Journal title,Journal URL,Alternative title,Journal ISSN (print version),Journal EISSN (online version),Publisher,Society or institution,"Platform, host or aggregator",Country of publisher,Journal article processing charges (APCs),APC information URL,APC amount,Currency,Journal article submission fee,Submission fee URL,Submission fee amount,Submission fee currency,Number of articles publish in the last calendar year,Number of articles information URL,Journal waiver policy (for developing country authors etc),Waiver policy information URL,Digital archiving policy or program(s),Archiving: national library,Archiving: other,Archiving infomation URL,Journal full-text crawl permission,Permanent article identifiers,Journal provides download statistics,Download statistics information URL,First calendar year journal provided online Open Access content,Full text formats,Keywords,Full text language,URL for the Editorial Board page,Review process,Review process information URL,URL for journal's aims & scope,URL for journal's instructions for authors,Journal plagiarism screening policy,Plagiarism information URL,Average number of weeks between submission and publication,URL for journal's Open Access statement,Machine-readable CC licensing information embedded or displayed in articles,URL to an example page with embedded licensing information,Journal license,License attributes,URL for license terms,Does this journal allow unrestricted reuse in compliance with BOAI?,Deposit policy directory,Author holds copyright without restrictions,Copyright information URL,Author holds publishing rights without restrictions,Publishing rights information URL,DOAJ Seal,Tick: Accepted after March 2014,Added on Date,Subjects
+ # TODO: Subjects, Permanent article identifiers, work_level stuff
reader = csv.DictReader(open(path))
counts = Counter()
self.c = self.db.cursor()
@@ -564,7 +565,6 @@ class ChoculaDatabase():
if row['Country of publisher']:
extra['country'] = parse_country(row['Country of publisher'])
row['lang'] = parse_lang(row['Full text language'])
- # TODO: work_level: bool (are work-level publications deposited with DOAJ?)
if row['Digital archiving policy or program(s)']:
extra['archive'] = [a.strip() for a in row['Digital archiving policy or program(s)'].split(',') if a.strip()]
@@ -574,7 +574,6 @@ class ChoculaDatabase():
crawl_permission = row['Journal full-text crawl permission']
if crawl_permission:
extra['crawl-permission'] = dict(Yes=True, No=False)[crawl_permission]
- # TODO: Permanent article identifiers
default_license = row['Journal license']
if default_license and default_license.startswith('CC'):
extra['default_license'] = default_license.replace('CC ', 'CC-').strip()
@@ -591,7 +590,6 @@ class ChoculaDatabase():
self.add_url(issnl, row['Journal URL'])
counts[status] += 1
- # TODO: Subjects
self.c.close()
self.db.commit()
print(counts)
@@ -1142,8 +1140,12 @@ class ChoculaDatabase():
if not out.get(k) and extra.get(k):
out[k] = extra[k]
if irow['slug'] in ('doaj','road','szczepanski', 'gold_oa'):
- # TODO: or if sherma/romeo color is green
out['is_oa'] = True
+ if irow['slug'] == 'sherpa_romeo':
+ extra = json.loads(irow['extra'])
+ out['sherpa_color'] = extra['color']
+ if extra['color'] == 'green':
+ out['is_oa'] = True
cur = self.db.execute("SELECT * FROM homepage WHERE issnl = ?;", [issnl])
for hrow in cur:
@@ -1182,7 +1184,7 @@ class ChoculaDatabase():
out['publisher_type'] = 'longtail'
out['is_longtail'] = True
- self.c.execute("INSERT OR REPLACE INTO journal (issnl, issne, issnp, wikidata_qid, fatcat_ident, name, publisher, country, lang, is_oa, is_longtail, is_active, publisher_type, has_dois, any_homepage, any_live_homepage, known_issnl, valid_issnl, release_count, ia_count, ia_frac, kbart_count, kbart_frac, preserved_count, preserved_frac) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
+ self.c.execute("INSERT OR REPLACE INTO journal (issnl, issne, issnp, wikidata_qid, fatcat_ident, name, publisher, country, lang, is_oa, sherpa_color, is_longtail, is_active, publisher_type, has_dois, any_homepage, any_live_homepage, known_issnl, valid_issnl, release_count, ia_count, ia_frac, kbart_count, kbart_frac, preserved_count, preserved_frac) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
(issnl,
out.get('issne'),
out.get('issnp'),
@@ -1193,6 +1195,7 @@ class ChoculaDatabase():
out.get('country'),
out.get('lang'),
out.get('is_oa', False),
+ out.get('sherpa_color'),
out.get('is_longtail', False),
out.get('is_active'),
out.get('publisher_type'),
@@ -1224,6 +1227,7 @@ class ChoculaDatabase():
self.index_entrez(args)
self.index_ezb(args)
self.index_gold_oa(args)
+ self.index_openapc(args)
self.index_wikidata(args)
self.load_fatcat(args)
self.load_fatcat_stats(args)
diff --git a/extra/journal_metadata/chocula_schema.sql b/extra/journal_metadata/chocula_schema.sql
index b3c7c2fb..99462794 100644
--- a/extra/journal_metadata/chocula_schema.sql
+++ b/extra/journal_metadata/chocula_schema.sql
@@ -22,6 +22,7 @@ CREATE TABLE IF NOT EXISTS journal
is_active BOOLEAN,
is_oa BOOLEAN default false,
is_longtail BOOLEAN default false,
+ sherpa_color TEXT,
--vor_pdf BOOLEAN,
--vor_html BOOLEAN,
--vor_jats BOOLEAN,
diff --git a/extra/journal_metadata/data/fetch.sh b/extra/journal_metadata/data/fetch.sh
index f55f934c..fe6acb90 100755
--- a/extra/journal_metadata/data/fetch.sh
+++ b/extra/journal_metadata/data/fetch.sh
@@ -6,7 +6,8 @@ set -eu
#unzip -n road-2018-01-24-export-issn.zip
wget -c https://archive.org/download/road-issn-2018/road-2018-01-24.tsv
-wget -c https://archive.org/download/doaj_bulk_metadata_2019/doaj_20190124.csv
+#wget -c https://archive.org/download/doaj_bulk_metadata_2019/doaj_20190124.csv
+wget -c https://archive.org/download/doaj_bulk_metadata_2019/journalcsv__doaj_20190731_0130_utf8.csv
#wget -c https://archive.org/download/issn_issnl_mappings/20190129.ISSN-to-ISSN-L.txt
wget -c https://archive.org/download/issn_issnl_mappings/20190730.ISSN-to-ISSN-L.txt