about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2019-07-31 18:33:19 -0700
committer Bryan Newbold <bnewbold@archive.org> 2019-07-31 18:33:19 -0700
commit 46e975e852d6492362bfa15afd7419f68d10708d (patch)
tree a091309710aace5897801209aec6ba72e1b75e3b
parent fc54908da2aad233e2943dd62fd8c0d93120011c (diff)
download chocula-46e975e852d6492362bfa15afd7419f68d10708d.tar.gz
chocula-46e975e852d6492362bfa15afd7419f68d10708d.zip
add 'export_fatcat'
-rwxr-xr-x chocula.py 52
1 files changed, 51 insertions, 1 deletions
diff --git a/chocula.py b/chocula.py
index 0125cb9..2bd111e 100755
--- a/chocula.py
+++ b/chocula.py
@@ -15,6 +15,7 @@ Commands:
init_db
summarize
export
+ export_fatcat
index_doaj
index_road
@@ -34,7 +35,6 @@ Commands:
Future commands:
- fatcat_edits
index_jurn
index_datacite
preserve_kbart --keeper SLUG
@@ -1275,6 +1275,53 @@ class ChoculaDatabase():
print(json.dumps(row))
counts['total'] += 1
+ def export_fatcat(self, args):
+ counts = Counter()
+ self.db.row_factory = sqlite3.Row
+ self.c = self.db.cursor()
+ for row in self.c.execute('SELECT * FROM journal'):
+ counts['total'] += 1
+
+ if not row['name']:
+ counts['empty-name'] += 1
+ continue
+
+ out = dict(
+ issnl=row['issnl'],
+ wikidata_qid=row['wikidata_qid'],
+ ident=row['fatcat_ident'],
+ publisher=row['publisher'],
+ name=row['name'])
+
+ extra = dict(
+ issnp=row['issnp'],
+ issne=row['issne'],
+ country=row['country'],
+ lang=row['lang'],
+ )
+ if row['sherpa_color']:
+ extra['sherpa'] = dict(color=row['sherpa_color'])
+
+ urls = []
+ cur = self.db.execute("SELECT * FROM homepage WHERE issnl = ?;", [row['issnl']])
+ for hrow in cur:
+ if not row['any_live_homepage'] and hrow['gwb_url_success_dt'] and hrow['gwb_url_success_dt'] != 'error':
+ urls.append("https://web.archive.org/web/{}/{}".format(hrow['gwb_url_success_dt'], hrow['url']))
+ continue
+ if hrow['blocked']:
+ urls.append(hrow['url'])
+ continue
+ if hrow['terminal_status_code'] == 200:
+ if hrow['terminal_url'] == hrow['url'].replace('http://', 'https://') or hrow['terminal_url'] == hrow['url'] + "/":
+ # check for trivial redirects; use post-redirect URL in those cases
+ urls.append(hrow['terminal_url'])
+ else:
+ urls.append(hrow['url'])
+ continue
+ extra['urls'] = urls
+ out['extra'] = extra
+ print(json.dumps(out))
+
def init_db(self, args):
print("### Creating Database...")
self.db.executescript("""
@@ -1312,6 +1359,9 @@ def main():
sub = subparsers.add_parser('export')
sub.set_defaults(func='export')
+ sub = subparsers.add_parser('export_fatcat')
+ sub.set_defaults(func='export_fatcat')
+
# TODO: 'jurn'
for ind in ('doaj', 'road', 'crossref', 'entrez', 'norwegian', 'szczepanski', 'ezb', 'gold_oa', 'wikidata', 'openapc'):
sub = subparsers.add_parser('index_{}'.format(ind))