author     Bryan Newbold <bnewbold@archive.org>  2020-06-22 13:46:42 -0700
committer  Bryan Newbold <bnewbold@archive.org>  2020-06-22 13:46:42 -0700
commit     db40b9e70b917dbbbfda48f6d77a2fc509366a82 (patch)
tree       e66626601ec32965c4cee7fad16982530408019b
parent     5d3ce061d24a5188fc015012b2f70a4c6f568969 (diff)
download   chocula-db40b9e70b917dbbbfda48f6d77a2fc509366a82.tar.gz
           chocula-db40b9e70b917dbbbfda48f6d77a2fc509366a82.zip
fmt (black)
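This commit reformats the codebase with black, the Python auto-formatter: single-quoted strings become double-quoted, long call argument lists are exploded one-per-line with trailing commas, and two blank lines are enforced between top-level definitions. The exact invocation is not recorded in the commit; presumably a plain `black .` at the repo root. As an illustration only, the same normalization can be reproduced through black's documented Python API; the snippet below is a hedged sketch, not code from this repository:

    # Illustrative sketch only: apply black's default style (88-char lines,
    # double quotes) to a source string, mirroring the quote normalization
    # visible throughout this diff.
    import black

    src = "domain_map = {'jstor.org/': 'jstor', 'springer.com/': 'springer'}"
    formatted = black.format_str(src, mode=black.FileMode())
    print(formatted)
    # -> domain_map = {"jstor.org/": "jstor", "springer.com/": "springer"}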
-rwxr-xr-x  check_issn_urls.py                   133
-rw-r--r--  chocula/__init__.py                    2
-rw-r--r--  chocula/__main__.py                   87
-rw-r--r--  chocula/common.py                     60
-rw-r--r--  chocula/config.py                      3
-rw-r--r--  chocula/database.py                  604
-rw-r--r--  chocula/directories/__init__.py       17
-rw-r--r--  chocula/directories/crossref.py       15
-rw-r--r--  chocula/directories/doaj.py           58
-rw-r--r--  chocula/directories/entrez.py         14
-rw-r--r--  chocula/directories/ezb.py            29
-rw-r--r--  chocula/directories/gold_oa.py        13
-rw-r--r--  chocula/directories/norwegian.py      31
-rw-r--r--  chocula/directories/openapc.py        19
-rw-r--r--  chocula/directories/road.py           29
-rw-r--r--  chocula/directories/scielo.py         32
-rw-r--r--  chocula/directories/sherpa_romeo.py   33
-rw-r--r--  chocula/directories/sim.py            41
-rw-r--r--  chocula/directories/szczepanski.py    25
-rw-r--r--  chocula/directories/wikidata.py       27
-rw-r--r--  chocula/kbart.py                       3
-rw-r--r--  chocula/util.py                      237
-rw-r--r--  tests/test_database.py                10
-rw-r--r--  tests/test_directories.py              9
24 files changed, 853 insertions, 678 deletions
diff --git a/check_issn_urls.py b/check_issn_urls.py
index 1135d6c..23169f1 100755
--- a/check_issn_urls.py
+++ b/check_issn_urls.py
@@ -45,17 +45,17 @@ def sniff_platform(resp):
"""
# these are mostly here to filter out huge platforms and stop sniffing
domain_map = {
- 'jstor.org/': 'jstor',
- 'springer.com/': 'springer',
- 'springerlink.com/': 'springer',
- 'tandfonline.com/': 't_and_f',
- 'elsevier.com/': 'elsevier',
- 'wiley.com/': 'wiley',
- 'sciencedirect.com/': 'elsevier',
- 'sagepub.com/': 'sage',
- 'hypotheses.org/': 'hypothesis',
- 'tandf.co.uk/': 't_and_f',
- 'scielo': 'scielo',
+ "jstor.org/": "jstor",
+ "springer.com/": "springer",
+ "springerlink.com/": "springer",
+ "tandfonline.com/": "t_and_f",
+ "elsevier.com/": "elsevier",
+ "wiley.com/": "wiley",
+ "sciencedirect.com/": "elsevier",
+ "sagepub.com/": "sage",
+ "hypotheses.org/": "hypothesis",
+ "tandf.co.uk/": "t_and_f",
+ "scielo": "scielo",
}
for domain, platform in domain_map.items():
if domain in resp.url:
@@ -64,6 +64,7 @@ def sniff_platform(resp):
return "ojs"
return None
+
def sniff_blocked(resp):
"""
This function would try to figure out if we got blocked: soft-block, hard
@@ -73,23 +74,33 @@ def sniff_blocked(resp):
if resp.status_code in (403, 420):
return True
# JSTOR does this
- if 'Our systems have detected unusual traffic activity from your network. Please complete this reCAPTCHA' in resp.text:
+ if (
+ "Our systems have detected unusual traffic activity from your network. Please complete this reCAPTCHA"
+ in resp.text
+ ):
return True
- if resp.status_code == 416 and 'something about your browser made us think you were a bot' in resp.text:
+ if (
+ resp.status_code == 416
+ and "something about your browser made us think you were a bot" in resp.text
+ ):
return True
return None
-def check_gwb(url, match_type='exact'):
- if '//web.archive.org/' in url:
+
+def check_gwb(url, match_type="exact"):
+ if "//web.archive.org/" in url:
return None
# crude/bad retry loop to work around CDX API throttling
for i in range(5):
- resp = requests.get('https://web.archive.org/cdx/search/cdx', params={
- 'url': url,
- 'matchType': match_type,
- 'limit': -1,
- 'filter': 'statuscode:200'
- })
+ resp = requests.get(
+ "https://web.archive.org/cdx/search/cdx",
+ params={
+ "url": url,
+ "matchType": match_type,
+ "limit": -1,
+ "filter": "statuscode:200",
+ },
+ )
if resp.status_code == 200:
break
time.sleep(5)
@@ -98,81 +109,91 @@ def check_gwb(url, match_type='exact'):
# TODO: this isn't really correct, but not sure what to return/record
# if we failed through all timeouts
return None
- line = resp.text.strip().split('\n')[0]
+ line = resp.text.strip().split("\n")[0]
if line:
dt = line.split()[1]
int(dt)
return dt
else:
return None
-
+
def check_url(issnl, url):
- #print("Fetching: %s" % url)
+ # print("Fetching: %s" % url)
info = dict(issnl=issnl, url=url)
try:
- resp = requests.get(url, timeout=30., headers={'User-Agent': 'ia_bot/0.0 (python requests) journal-live-check; contact:info@archive.org'})
+ resp = requests.get(
+ url,
+ timeout=30.0,
+ headers={
+ "User-Agent": "ia_bot/0.0 (python requests) journal-live-check; contact:info@archive.org"
+ },
+ )
except requests.exceptions.TooManyRedirects:
- info['error'] = 'TooManyRedirects'
- info['terminal_status_code'] = info['status_code'] = -1
+ info["error"] = "TooManyRedirects"
+ info["terminal_status_code"] = info["status_code"] = -1
return info
except requests.exceptions.SSLError:
- info['error'] = 'SSLError'
- info['terminal_status_code'] = info['status_code'] = -1
+ info["error"] = "SSLError"
+ info["terminal_status_code"] = info["status_code"] = -1
return info
except requests.exceptions.ReadTimeout:
- info['error'] = 'ReadTimeout'
- info['terminal_status_code'] = info['status_code'] = -1
+ info["error"] = "ReadTimeout"
+ info["terminal_status_code"] = info["status_code"] = -1
return info
except requests.exceptions.ConnectionError:
- info['error'] = 'ConnectionError'
- info['terminal_status_code'] = info['status_code'] = -1
+ info["error"] = "ConnectionError"
+ info["terminal_status_code"] = info["status_code"] = -1
return info
except requests.exceptions.ChunkedEncodingError:
- info['error'] = 'ChunkedEncodingError'
- info['terminal_status_code'] = info['status_code'] = -1
+ info["error"] = "ChunkedEncodingError"
+ info["terminal_status_code"] = info["status_code"] = -1
return info
except requests.exceptions.ContentDecodingError:
- info['error'] = 'ContentDecodingError'
- info['terminal_status_code'] = info['status_code'] = -1
+ info["error"] = "ContentDecodingError"
+ info["terminal_status_code"] = info["status_code"] = -1
return info
except requests.exceptions.InvalidSchema:
- info['error'] = 'InvalidSchema'
- info['terminal_status_code'] = info['status_code'] = -1
+ info["error"] = "InvalidSchema"
+ info["terminal_status_code"] = info["status_code"] = -1
return info
except UnicodeDecodeError:
- info['error'] = 'UnicodeDecodeError'
- info['terminal_status_code'] = info['status_code'] = -1
+ info["error"] = "UnicodeDecodeError"
+ info["terminal_status_code"] = info["status_code"] = -1
return info
if resp.history:
- info['status_code'] = resp.history[0].status_code
+ info["status_code"] = resp.history[0].status_code
else:
- info['status_code'] = resp.status_code
+ info["status_code"] = resp.status_code
- info['terminal_status_code'] = resp.status_code
- info['terminal_url'] = resp.url
- content_type = resp.headers.get('Content-Type')
+ info["terminal_status_code"] = resp.status_code
+ info["terminal_url"] = resp.url
+ content_type = resp.headers.get("Content-Type")
if content_type:
- info['terminal_content_type'] = content_type.split(';')[0]
- info['issnl_in_body'] = bool(issnl in resp.text)
- info['gwb_url_success_dt'] = check_gwb(url, match_type='exact')
- info['gwb_terminal_url_success_dt'] = check_gwb(info['terminal_url'], match_type='exact')
- info['blocked'] = sniff_blocked(resp)
- info['software_platform'] = sniff_platform(resp)
- #info['gwb_host_success_dt'] = check_gwb(url, match_type='host')
+ info["terminal_content_type"] = content_type.split(";")[0]
+ info["issnl_in_body"] = bool(issnl in resp.text)
+ info["gwb_url_success_dt"] = check_gwb(url, match_type="exact")
+ info["gwb_terminal_url_success_dt"] = check_gwb(
+ info["terminal_url"], match_type="exact"
+ )
+ info["blocked"] = sniff_blocked(resp)
+ info["software_platform"] = sniff_platform(resp)
+ # info['gwb_host_success_dt'] = check_gwb(url, match_type='host')
return info
+
def run(tsvfile):
for line in tsvfile:
- records = line.split('\t')
+ records = line.split("\t")
issnl = records[0]
url = records[1].strip()
print(json.dumps(check_url(issnl, url)))
-if __name__=="__main__":
+
+if __name__ == "__main__":
if len(sys.argv) != 2:
f = sys.stdin
else:
- f = open(sys.argv[1], 'r')
+ f = open(sys.argv[1], "r")
run(f)
diff --git a/chocula/__init__.py b/chocula/__init__.py
index 38e61c8..2191320 100644
--- a/chocula/__init__.py
+++ b/chocula/__init__.py
@@ -1,6 +1,4 @@
-
from chocula.config import ChoculaConfig
from chocula.database import ChoculaDatabase, IssnDatabase
from chocula.directories import *
from chocula.kbart import *
-
diff --git a/chocula/__main__.py b/chocula/__main__.py
index f897dd1..92f2e6f 100644
--- a/chocula/__main__.py
+++ b/chocula/__main__.py
@@ -48,8 +48,13 @@ import sys
import csv
import argparse
-from chocula import ChoculaDatabase, ChoculaConfig, IssnDatabase,\
- ALL_CHOCULA_DIR_CLASSES, ALL_CHOCULA_KBART_CLASSES
+from chocula import (
+ ChoculaDatabase,
+ ChoculaConfig,
+ IssnDatabase,
+ ALL_CHOCULA_DIR_CLASSES,
+ ALL_CHOCULA_KBART_CLASSES,
+)
def run_everything(config, database):
@@ -70,6 +75,7 @@ def run_everything(config, database):
database.summarize()
print("### Done with everything!")
+
def run_directory(config, database, source):
for cls in ALL_CHOCULA_DIR_CLASSES:
if cls.source_slug == source:
@@ -79,6 +85,7 @@ def run_directory(config, database, source):
return
raise NotImplementedError(f"unknown source: {source}")
+
def run_kbart(config, database, source):
for cls in ALL_CHOCULA_KBART_CLASSES:
if cls.source_slug == source:
@@ -88,63 +95,65 @@ def run_kbart(config, database, source):
return
raise NotImplementedError(f"unknown source: {source}")
+
def run_load(config, database, source):
- if source == 'fatcat_stats':
+ if source == "fatcat_stats":
print(database.load_fatcat_stats(config))
- elif source == 'fatcat_containers':
+ elif source == "fatcat_containers":
print(database.load_fatcat_containers(config))
- elif source == 'homepage_status':
+ elif source == "homepage_status":
print(database.load_homepage_status(config))
else:
raise NotImplementedError(f"unknown source: {source}")
+
def main():
parser = argparse.ArgumentParser(
- prog="python -m chocula",
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ prog="python -m chocula", formatter_class=argparse.ArgumentDefaultsHelpFormatter
+ )
subparsers = parser.add_subparsers()
- parser.add_argument("--db-file",
- help="sqlite database file",
- default='chocula.sqlite',
- type=str)
+ parser.add_argument(
+ "--db-file", help="sqlite database file", default="chocula.sqlite", type=str
+ )
- sub = subparsers.add_parser('everything',
- help="run all the commands")
- sub.set_defaults(func='everything')
+ sub = subparsers.add_parser("everything", help="run all the commands")
+ sub.set_defaults(func="everything")
- sub = subparsers.add_parser('init_db',
- help="create sqlite3 output file and tables")
- sub.set_defaults(func='init_db')
+ sub = subparsers.add_parser("init_db", help="create sqlite3 output file and tables")
+ sub.set_defaults(func="init_db")
- sub = subparsers.add_parser('summarize',
- help="aggregate metadata from all tables into 'journals' table")
- sub.set_defaults(func='summarize')
+ sub = subparsers.add_parser(
+ "summarize", help="aggregate metadata from all tables into 'journals' table"
+ )
+ sub.set_defaults(func="summarize")
- sub = subparsers.add_parser('export',
- help="dump JSON output")
- sub.set_defaults(func='export')
+ sub = subparsers.add_parser("export", help="dump JSON output")
+ sub.set_defaults(func="export")
- sub = subparsers.add_parser('export_fatcat',
- help="dump JSON output in a format that can load into fatcat")
- sub.set_defaults(func='export_fatcat')
+ sub = subparsers.add_parser(
+ "export_fatcat", help="dump JSON output in a format that can load into fatcat"
+ )
+ sub.set_defaults(func="export_fatcat")
- sub = subparsers.add_parser('export_urls',
- help="dump homepage URLs (eg, to crawl for status)")
- sub.set_defaults(func='export_urls')
+ sub = subparsers.add_parser(
+ "export_urls", help="dump homepage URLs (eg, to crawl for status)"
+ )
+ sub.set_defaults(func="export_urls")
- sub = subparsers.add_parser('directory',
- help="index directory metadata from a given source")
+ sub = subparsers.add_parser(
+ "directory", help="index directory metadata from a given source"
+ )
sub.add_argument("source", type=str, help="short name of source to index")
sub.set_defaults(func=run_directory)
- sub = subparsers.add_parser('load',
- help="load metadata of a given type")
+ sub = subparsers.add_parser("load", help="load metadata of a given type")
sub.add_argument("source", type=str, help="short name of source to index")
sub.set_defaults(func=run_load)
- sub = subparsers.add_parser('kbart',
- help="index KBART holding metadata for a given source")
+ sub = subparsers.add_parser(
+ "kbart", help="index KBART holding metadata for a given source"
+ )
sub.add_argument("source", type=str, help="short name of source to index")
sub.set_defaults(func=run_kbart)
@@ -155,11 +164,11 @@ def main():
config = ChoculaConfig.from_file()
issn_db: Optional[IssnDatabase] = None
- if args.func in ('everything', 'summarize', run_directory, run_kbart):
+ if args.func in ("everything", "summarize", run_directory, run_kbart):
issn_db = IssnDatabase(config.issnl.filepath)
cdb = ChoculaDatabase(args.db_file, issn_db)
- if args.func == 'everything':
+ if args.func == "everything":
run_everything(config, cdb)
elif args.func in (run_directory, run_load, run_kbart):
args.func(config, cdb, args.source)
@@ -168,6 +177,6 @@ def main():
func = getattr(cdb, args.func)
print(func(), file=sys.stderr)
-if __name__ == '__main__':
- main()
+if __name__ == "__main__":
+ main()
diff --git a/chocula/common.py b/chocula/common.py
index a5b3739..455649a 100644
--- a/chocula/common.py
+++ b/chocula/common.py
@@ -1,4 +1,3 @@
-
import sys
import csv
import datetime
@@ -17,7 +16,8 @@ from chocula.database import DirectoryInfo, IssnDatabase, HomepageUrl
csv.field_size_limit(1310720)
THIS_YEAR = datetime.date.today().year
-class DirectoryLoader():
+
+class DirectoryLoader:
source_slug: str = "GENERIC"
@@ -35,7 +35,7 @@ class DirectoryLoader():
counts: Counter = Counter()
cur = db.db.cursor()
for record in self.open_file():
- counts['total'] += 1
+ counts["total"] += 1
info = self.parse_record(record)
if info:
status = db.insert_directory(info, cur=cur)
@@ -44,6 +44,7 @@ class DirectoryLoader():
db.db.commit()
return counts
+
@dataclass
class KbartRecord:
issnl: Optional[str]
@@ -60,7 +61,7 @@ class KbartRecord:
year_spans: List[Any]
-class KbartLoader():
+class KbartLoader:
source_slug: str = "GENERIC"
@@ -68,19 +69,19 @@ class KbartLoader():
self.config = config
def file_path(self) -> str:
- #return self.config.TEMPLATE.filepath)
+ # return self.config.TEMPLATE.filepath)
raise NotImplementedError()
def open_file(self) -> Iterable:
- raw_file = open(self.file_path(), 'rb').read().decode(errors='replace')
+ raw_file = open(self.file_path(), "rb").read().decode(errors="replace")
fixed_file = ftfy.fix_text(raw_file)
- reader = csv.DictReader(fixed_file.split('\n'), delimiter='\t')
+ reader = csv.DictReader(fixed_file.split("\n"), delimiter="\t")
return reader
def parse_record(self, row: dict, issn_db: IssnDatabase) -> Optional[KbartRecord]:
- issne: Optional[str] = clean_issn(row['online_identifier'] or "")
- issnp: Optional[str] = clean_issn(row['print_identifier'] or "")
+ issne: Optional[str] = clean_issn(row["online_identifier"] or "")
+ issnp: Optional[str] = clean_issn(row["print_identifier"] or "")
issnl: Optional[str] = None
if issne:
issnl = issn_db.issn2issnl(issne)
@@ -88,31 +89,31 @@ class KbartLoader():
issnl = issn_db.issn2issnl(issnp)
start_year: Optional[int] = None
end_year: Optional[int] = None
- if row['date_first_issue_online']:
- start_year = int(row['date_first_issue_online'][:4])
- if row['date_last_issue_online']:
- end_year = int(row['date_last_issue_online'][:4])
- end_volume = row['num_last_vol_online']
+ if row["date_first_issue_online"]:
+ start_year = int(row["date_first_issue_online"][:4])
+ if row["date_last_issue_online"]:
+ end_year = int(row["date_last_issue_online"][:4])
+ end_volume = row["num_last_vol_online"]
# hack to handle open-ended preservation
- if end_year is None and end_volume and '(present)' in end_volume:
+ if end_year is None and end_volume and "(present)" in end_volume:
end_year = THIS_YEAR
record = KbartRecord(
issnl=issnl,
issnp=issnp,
issne=issne,
- title=clean_str(row['publication_title']),
- publisher=clean_str(row['publisher_name']),
- url=HomepageUrl.from_url(row['title_url']),
- embargo=clean_str(row['embargo_info']),
+ title=clean_str(row["publication_title"]),
+ publisher=clean_str(row["publisher_name"]),
+ url=HomepageUrl.from_url(row["title_url"]),
+ embargo=clean_str(row["embargo_info"]),
start_year=start_year,
end_year=end_year,
- start_volume=clean_str(row['num_first_vol_online']),
- end_volume=clean_str(row['num_last_vol_online']),
+ start_volume=clean_str(row["num_first_vol_online"]),
+ end_volume=clean_str(row["num_last_vol_online"]),
year_spans=[],
)
- if record.start_volume == 'null':
+ if record.start_volume == "null":
record.start_volume = None
- if record.end_volume == 'null':
+ if record.end_volume == "null":
record.end_volume = None
return record
@@ -126,18 +127,18 @@ class KbartLoader():
counts: Counter = Counter()
kbart_dict: Dict[str, KbartRecord] = dict()
for row in self.open_file():
- counts['total'] += 1
+ counts["total"] += 1
record = self.parse_record(row, db.issn_db)
if record is None:
- counts['skip-parse'] += 1
+ counts["skip-parse"] += 1
continue
elif not record.issnl:
- counts['skip-issnl'] += 1
+ counts["skip-issnl"] += 1
continue
elif record.start_year is None or record.end_year is None:
- counts['partial-missing-years'] += 1
- counts['parsed'] += 1
+ counts["partial-missing-years"] += 1
+ counts["parsed"] += 1
existing = kbart_dict.get(record.issnl, record)
if record.start_year and record.end_year:
@@ -149,7 +150,7 @@ class KbartLoader():
record.year_spans = merge_spans(old_spans, new_spans)
kbart_dict[record.issnl] = record
- counts['unique-issnl'] = len(kbart_dict)
+ counts["unique-issnl"] = len(kbart_dict)
cur = db.db.cursor()
for issnl, record in kbart_dict.items():
info = DirectoryInfo(
@@ -169,4 +170,3 @@ class KbartLoader():
cur.close()
db.db.commit()
return counts
-
diff --git a/chocula/config.py b/chocula/config.py
index 2237404..3bd8ade 100644
--- a/chocula/config.py
+++ b/chocula/config.py
@@ -1,9 +1,8 @@
-
from types import SimpleNamespace
import toml
-class ChoculaConfig(SimpleNamespace):
+class ChoculaConfig(SimpleNamespace):
@classmethod
def from_file(cls, file_path="sources.toml", sources_dir="data/"):
diff --git a/chocula/database.py b/chocula/database.py
index f620515..11632b9 100644
--- a/chocula/database.py
+++ b/chocula/database.py
@@ -1,4 +1,3 @@
-
from __future__ import annotations
import sys
@@ -47,41 +46,49 @@ class HomepageUrl:
"""
Returns None if url is really bad (not a URL).
"""
- if not url or 'mailto:' in url.lower() or url.lower() in ('http://n/a', 'http://na/', 'http://na'):
+ if (
+ not url
+ or "mailto:" in url.lower()
+ or url.lower() in ("http://n/a", "http://na/", "http://na")
+ ):
return None
- if url.startswith('www.'):
+ if url.startswith("www."):
url = "http://" + url
- if url.startswith('ttp://') or url.startswith('ttps://'):
+ if url.startswith("ttp://") or url.startswith("ttps://"):
url = "h" + url
- url.replace('Http://', 'http://')
+ url.replace("Http://", "http://")
url = str(urlcanon.semantic_precise(url))
- if url == 'http://na/':
+ if url == "http://na/":
# sort of redundant with above, but some only match after canonicalization
return None
url_surt = surt.surt(url)
tld = tldextract.extract(url)
- host = '.'.join(tld)
- if host.startswith('.'):
+ host = ".".join(tld)
+ if host.startswith("."):
host = host[1:]
- return HomepageUrl(url=url,
- surt=url_surt,
- host=host,
- domain=tld.registered_domain,
- suffix=tld.suffix)
+ return HomepageUrl(
+ url=url,
+ surt=url_surt,
+ host=host,
+ domain=tld.registered_domain,
+ suffix=tld.suffix,
+ )
+
def test_from_url():
-
- assert HomepageUrl.from_url("http://thing.core.ac.uk").domain == 'core.ac.uk'
- assert HomepageUrl.from_url("http://thing.core.ac.uk").host == 'thing.core.ac.uk'
- assert HomepageUrl.from_url("http://thing.core.ac.uk").suffix== 'ac.uk'
- assert HomepageUrl.from_url("google.com").suffix == 'com'
- assert HomepageUrl.from_url("google.com").host == 'google.com'
+ assert HomepageUrl.from_url("http://thing.core.ac.uk").domain == "core.ac.uk"
+ assert HomepageUrl.from_url("http://thing.core.ac.uk").host == "thing.core.ac.uk"
+ assert HomepageUrl.from_url("http://thing.core.ac.uk").suffix == "ac.uk"
+
+ assert HomepageUrl.from_url("google.com").suffix == "com"
+ assert HomepageUrl.from_url("google.com").host == "google.com"
assert HomepageUrl.from_url("mailto:bnewbold@bogus.com") == None
- assert HomepageUrl.from_url("thing.com").url == 'http://thing.com/'
- assert HomepageUrl.from_url("Http://thing.com///").url == 'http://thing.com/'
+ assert HomepageUrl.from_url("thing.com").url == "http://thing.com/"
+ assert HomepageUrl.from_url("Http://thing.com///").url == "http://thing.com/"
+
@dataclass
class UrlCrawlStatus:
@@ -95,6 +102,7 @@ class UrlCrawlStatus:
gwb_url_success_dt: Optional[str]
gwb_terminal_url_success_dt: Optional[str]
+
@dataclass
class DirectoryInfo:
directory_slug: str
@@ -127,10 +135,19 @@ class DirectoryInfo:
"""
if not self.issnl:
raise ValueError
- extra_dict = self.extra
-
- for k in ('issne', 'issnp', 'name', 'publisher', 'abbrev', 'platform',
- 'country', 'langs', 'original_name'):
+ extra_dict = self.extra
+
+ for k in (
+ "issne",
+ "issnp",
+ "name",
+ "publisher",
+ "abbrev",
+ "platform",
+ "country",
+ "langs",
+ "original_name",
+ ):
if self.__dict__[k]:
extra_dict[k] = self.__dict__[k]
@@ -151,7 +168,7 @@ class DirectoryInfo:
raise NotImplementedError()
-class IssnDatabase():
+class IssnDatabase:
"""
Holds complete ISSN/ISSN-L table and helps with lookups and munging of raw
ISSN strings
@@ -163,7 +180,7 @@ class IssnDatabase():
def read_issn_map_file(self, issn_map_path: str):
print("##### Loading ISSN-L map file...", file=sys.stderr)
- with open(issn_map_path, 'r') as issn_map_file:
+ with open(issn_map_path, "r") as issn_map_file:
for line in issn_map_file:
if line.startswith("ISSN") or len(line) == 0:
continue
@@ -209,7 +226,7 @@ class IssnDatabase():
return info
-class ChoculaDatabase():
+class ChoculaDatabase:
"""
Wraps a sqlite3 database
"""
@@ -218,7 +235,7 @@ class ChoculaDatabase():
"""
To create a temporary database, pass ":memory:" as db_file
"""
- self.db = sqlite3.connect(db_file, isolation_level='EXCLUSIVE')
+ self.db = sqlite3.connect(db_file, isolation_level="EXCLUSIVE")
self.data = dict()
self.issn_db = issn_db
@@ -247,8 +264,7 @@ class ChoculaDatabase():
cur = self.db.cursor()
try:
- cur.execute("INSERT INTO directory VALUES (?,?,?,?,?)",
- info.to_db_tuple())
+ cur.execute("INSERT INTO directory VALUES (?,?,?,?,?)", info.to_db_tuple())
except sqlite3.IntegrityError as ie:
if str(ie).startswith("UNIQUE"):
return "duplicate"
@@ -264,7 +280,8 @@ class ChoculaDatabase():
try:
cur.execute(
"INSERT OR REPLACE INTO homepage (issnl, surt, url, host, domain, suffix) VALUES (?,?,?,?,?,?)",
- homepage.to_db_tuple(issnl))
+ homepage.to_db_tuple(issnl),
+ )
except sqlite3.IntegrityError as ie:
if str(ie).startswith("UNIQUE"):
return "duplicate"
@@ -276,29 +293,33 @@ class ChoculaDatabase():
print("##### Loading IA Homepage Crawl Results...")
counts: Counter = Counter()
cur = self.db.cursor()
- for line in open(config.homepage_status.filepath, 'r'):
+ for line in open(config.homepage_status.filepath, "r"):
if not line.strip():
continue
row = json.loads(line)
- counts['total'] += 1
- url = row['url']
- assert(url)
- if row.get('gwb_url_success_dt') == 'error':
- row['gwb_url_success_dt'] = None
- if row.get('gwb_terminal_url_success_dt') == 'error':
- row['gwb_terminal_url_success_dt'] = None
- cur.execute("UPDATE homepage SET status_code=?, crawl_error=?, terminal_url=?, terminal_status_code=?, platform_software=?, issnl_in_body=?, blocked=?, gwb_url_success_dt=?, gwb_terminal_url_success_dt=? WHERE url=?",
- (row['status_code'],
- row.get('crawl_error'),
- row.get('terminal_url'),
- row.get('terminal_status_code'),
- row.get('platform_software'),
- row.get('issnl_in_body'),
- row.get('blocked'),
- row.get('gwb_url_success_dt'),
- row.get('gwb_terminal_url_success_dt'),
- url))
- counts['updated'] += 1
+ counts["total"] += 1
+ url = row["url"]
+ assert url
+ if row.get("gwb_url_success_dt") == "error":
+ row["gwb_url_success_dt"] = None
+ if row.get("gwb_terminal_url_success_dt") == "error":
+ row["gwb_terminal_url_success_dt"] = None
+ cur.execute(
+ "UPDATE homepage SET status_code=?, crawl_error=?, terminal_url=?, terminal_status_code=?, platform_software=?, issnl_in_body=?, blocked=?, gwb_url_success_dt=?, gwb_terminal_url_success_dt=? WHERE url=?",
+ (
+ row["status_code"],
+ row.get("crawl_error"),
+ row.get("terminal_url"),
+ row.get("terminal_status_code"),
+ row.get("platform_software"),
+ row.get("issnl_in_body"),
+ row.get("blocked"),
+ row.get("gwb_url_success_dt"),
+ row.get("gwb_terminal_url_success_dt"),
+ url,
+ ),
+ )
+ counts["updated"] += 1
cur.close()
self.db.commit()
return counts
@@ -306,51 +327,54 @@ class ChoculaDatabase():
def load_fatcat_containers(self, config: ChoculaConfig) -> Counter:
print("##### Loading Fatcat Container Entities...")
# JSON
- json_file = open(config.fatcat_containers.filepath, 'r')
+ json_file = open(config.fatcat_containers.filepath, "r")
counts: Counter = Counter()
cur = self.db.cursor()
for line in json_file:
if not line:
continue
row = json.loads(line)
- if row['state'] != 'active':
+ if row["state"] != "active":
continue
- counts['total'] += 1
- extra = row.get('extra', dict())
- issne = extra.get('issne')
- issnp = extra.get('issnp')
- country = extra.get('country')
- languages = extra.get('languages', [])
+ counts["total"] += 1
+ extra = row.get("extra", dict())
+ issne = extra.get("issne")
+ issnp = extra.get("issnp")
+ country = extra.get("country")
+ languages = extra.get("languages", [])
lang = None
if languages:
lang = languages[0]
try:
- cur.execute("INSERT OR REPLACE INTO fatcat_container (issnl, ident, revision, issne, issnp, wikidata_qid, name, container_type, publisher, country, lang) VALUES (?,?,?,?,?,?,?,?,?,?,?)",
- (row.get('issnl'),
- row['ident'],
- row['revision'],
- issne,
- issnp,
- row.get('wikidata_qid'),
- row['name'],
- row.get('container_type'),
- extra.get('publisher'),
- country,
- lang,
- ))
+ cur.execute(
+ "INSERT OR REPLACE INTO fatcat_container (issnl, ident, revision, issne, issnp, wikidata_qid, name, container_type, publisher, country, lang) VALUES (?,?,?,?,?,?,?,?,?,?,?)",
+ (
+ row.get("issnl"),
+ row["ident"],
+ row["revision"],
+ issne,
+ issnp,
+ row.get("wikidata_qid"),
+ row["name"],
+ row.get("container_type"),
+ extra.get("publisher"),
+ country,
+ lang,
+ ),
+ )
except sqlite3.IntegrityError as ie:
if str(ie).startswith("UNIQUE"):
counts["existing"] += 1
continue
else:
raise ie
- counts['inserted'] += 1
- if row.get('issnl'):
- urls = extra.get('urls', [])
+ counts["inserted"] += 1
+ if row.get("issnl"):
+ urls = extra.get("urls", [])
for url in urls:
homepage = HomepageUrl.from_url(url)
if homepage:
- self.insert_homepage(row.get('issnl'), homepage, cur)
+ self.insert_homepage(row.get("issnl"), homepage, cur)
cur.close()
self.db.commit()
return counts
@@ -358,22 +382,31 @@ class ChoculaDatabase():
def load_fatcat_stats(self, config: ChoculaConfig) -> Counter:
print("##### Loading Fatcat Container Stats...")
# JSON
- json_file = open(config.fatcat_stats.filepath, 'r')
+ json_file = open(config.fatcat_stats.filepath, "r")
counts: Counter = Counter()
cur = self.db.cursor()
for line in json_file:
if not line:
continue
row = json.loads(line)
- total = int(row['total'])
+ total = int(row["total"])
ia_frac: Optional[float] = None
preserved_frac: Optional[float] = None
if total > 0:
- ia_frac = float(row['in_web'])/total
- preserved_frac = float(row['is_preserved'])/total
- cur.execute("UPDATE fatcat_container SET release_count = ?, ia_count = ?, ia_frac = ?, preserved_count = ?, preserved_frac = ? WHERE issnl = ?",
- (total, row['in_web'], ia_frac, row['is_preserved'], preserved_frac, row['issnl']))
- counts['updated'] += 1
+ ia_frac = float(row["in_web"]) / total
+ preserved_frac = float(row["is_preserved"]) / total
+ cur.execute(
+ "UPDATE fatcat_container SET release_count = ?, ia_count = ?, ia_frac = ?, preserved_count = ?, preserved_frac = ? WHERE issnl = ?",
+ (
+ total,
+ row["in_web"],
+ ia_frac,
+ row["is_preserved"],
+ preserved_frac,
+ row["issnl"],
+ ),
+ )
+ counts["updated"] += 1
cur.close()
self.db.commit()
return counts
@@ -384,10 +417,10 @@ class ChoculaDatabase():
self.db.row_factory = sqlite3.Row
cur = self.db.execute("SELECT issnl, url FROM homepage;")
for hrow in cur:
- assert(hrow['url'])
- assert(len(hrow['url'].split()) == 1)
- counts['total'] += 1
- print('\t'.join((hrow['issnl'], hrow['url'])))
+ assert hrow["url"]
+ assert len(hrow["url"].split()) == 1
+ counts["total"] += 1
+ print("\t".join((hrow["issnl"], hrow["url"])))
return counts
def summarize(self) -> Counter:
@@ -395,135 +428,189 @@ class ChoculaDatabase():
counts: Counter = Counter()
cur = self.db.cursor()
self.db.row_factory = sqlite3.Row
- index_issnls = list(cur.execute('SELECT DISTINCT issnl FROM directory'))
- fatcat_issnls = list(cur.execute('SELECT DISTINCT issnl FROM fatcat_container WHERE issnl IS NOT null'))
+ index_issnls = list(cur.execute("SELECT DISTINCT issnl FROM directory"))
+ fatcat_issnls = list(
+ cur.execute(
+ "SELECT DISTINCT issnl FROM fatcat_container WHERE issnl IS NOT null"
+ )
+ )
all_issnls = set([i[0] for i in index_issnls + fatcat_issnls])
print("{} total ISSN-Ls".format(len(all_issnls)))
for issnl in all_issnls:
- #print(issnl)
- counts['total'] += 1
+ # print(issnl)
+ counts["total"] += 1
out = dict()
# check if ISSN-L is good. this is here because of fatcat import
- out['known_issnl'] = (self.issn_db.issn2issnl(issnl) == issnl)
- if not out['known_issnl']:
- counts['unknown-issnl'] += 1
- out['valid_issnl'] = stdnum.issn.is_valid(issnl)
- if not out['valid_issnl']:
- counts['invalid-issnl'] += 1
-
- fatcat_row = list(self.db.execute("SELECT * FROM fatcat_container WHERE issnl = ?;", [issnl]))
+ out["known_issnl"] = self.issn_db.issn2issnl(issnl) == issnl
+ if not out["known_issnl"]:
+ counts["unknown-issnl"] += 1
+ out["valid_issnl"] = stdnum.issn.is_valid(issnl)
+ if not out["valid_issnl"]:
+ counts["invalid-issnl"] += 1
+
+ fatcat_row = list(
+ self.db.execute(
+ "SELECT * FROM fatcat_container WHERE issnl = ?;", [issnl]
+ )
+ )
if fatcat_row:
frow = fatcat_row[0]
- out['fatcat_ident'] = frow['ident']
- for k in ('name', 'publisher', 'issne', 'issnp', 'wikidata_qid', 'lang', 'country', 'release_count', 'ia_count', 'ia_frac', 'kbart_count', 'kbart_frac', 'preserved_count', 'preserved_frac'):
+ out["fatcat_ident"] = frow["ident"]
+ for k in (
+ "name",
+ "publisher",
+ "issne",
+ "issnp",
+ "wikidata_qid",
+ "lang",
+ "country",
+ "release_count",
+ "ia_count",
+ "ia_frac",
+ "kbart_count",
+ "kbart_frac",
+ "preserved_count",
+ "preserved_frac",
+ ):
if not out.get(k) and frow[k] != None:
out[k] = frow[k]
cur = self.db.execute("SELECT * FROM directory WHERE issnl = ?;", [issnl])
for irow in cur:
- if irow['slug'] in ('crossref',):
- out['has_dois'] = True
+ if irow["slug"] in ("crossref",):
+ out["has_dois"] = True
# TODO: other DOI registrars (japan, datacite)
- if irow['slug'] == 'wikidata':
- out['wikidata_qid'] = irow['identifier']
- for k in ('name',):
+ if irow["slug"] == "wikidata":
+ out["wikidata_qid"] = irow["identifier"]
+ for k in ("name",):
if not out.get(k) and irow[k]:
out[k] = irow[k]
- if irow['extra']:
- extra = json.loads(irow['extra'])
- for k in ('country', 'lang', 'issne', 'issnp', 'publisher', 'platform'):
+ if irow["extra"]:
+ extra = json.loads(irow["extra"])
+ for k in (
+ "country",
+ "lang",
+ "issne",
+ "issnp",
+ "publisher",
+ "platform",
+ ):
if not out.get(k) and extra.get(k):
out[k] = extra[k]
- if irow['slug'] in ('doaj','road','szczepanski', 'gold_oa'):
- out['is_oa'] = True
- if irow['slug'] == 'ezb':
- ezb_extra = json.loads(irow['extra'])
- if ezb_extra['ezb_color'] == 'green':
- out['is_oa'] = True
- if irow['slug'] == 'sherpa_romeo':
- extra = json.loads(irow['extra'])
- out['sherpa_color'] = extra['sherpa_romeo']['color']
- if extra['sherpa_romeo']['color'] == 'green':
- out['is_oa'] = True
+ if irow["slug"] in ("doaj", "road", "szczepanski", "gold_oa"):
+ out["is_oa"] = True
+ if irow["slug"] == "ezb":
+ ezb_extra = json.loads(irow["extra"])
+ if ezb_extra["ezb_color"] == "green":
+ out["is_oa"] = True
+ if irow["slug"] == "sherpa_romeo":
+ extra = json.loads(irow["extra"])
+ out["sherpa_color"] = extra["sherpa_romeo"]["color"]
+ if extra["sherpa_romeo"]["color"] == "green":
+ out["is_oa"] = True
# filter out "NA" ISSNs
- for k in ('issne', 'issnp'):
- if out.get(k) and (len(out[k]) != 9 or out[k][4] != '-'):
+ for k in ("issne", "issnp"):
+ if out.get(k) and (len(out[k]) != 9 or out[k][4] != "-"):
out.pop(k)
cur = self.db.execute("SELECT * FROM homepage WHERE issnl = ?;", [issnl])
for hrow in cur:
- out['any_homepage'] = True
- if hrow['terminal_status_code'] == 200 and hrow['host'] != 'web.archive.org':
- out['any_live_homepage'] = True
- if hrow['gwb_url_success_dt'] or hrow['gwb_terminal_url_success_dt']:
- out['any_gwb_homepage'] = True
-
- if out.get('wikidata_qid'):
- assert out['wikidata_qid'].startswith('Q')
- assert out['wikidata_qid'][1].isdigit()
- assert out['wikidata_qid'][-1].isdigit()
+ out["any_homepage"] = True
+ if (
+ hrow["terminal_status_code"] == 200
+ and hrow["host"] != "web.archive.org"
+ ):
+ out["any_live_homepage"] = True
+ if hrow["gwb_url_success_dt"] or hrow["gwb_terminal_url_success_dt"]:
+ out["any_gwb_homepage"] = True
+
+ if out.get("wikidata_qid"):
+ assert out["wikidata_qid"].startswith("Q")
+ assert out["wikidata_qid"][1].isdigit()
+ assert out["wikidata_qid"][-1].isdigit()
# define publisher types
- publisher = out.get('publisher')
- pl = out.get('publisher', '').lower().strip()
- if out.get('platform') == 'scielo':
- out['publisher_type'] = 'scielo'
- elif publisher in BIG5_PUBLISHERS or 'elsevier' in pl or 'springer' in pl or 'wiley' in pl:
- out['publisher_type'] = 'big5'
+ publisher = out.get("publisher")
+ pl = out.get("publisher", "").lower().strip()
+ if out.get("platform") == "scielo":
+ out["publisher_type"] = "scielo"
+ elif (
+ publisher in BIG5_PUBLISHERS
+ or "elsevier" in pl
+ or "springer" in pl
+ or "wiley" in pl
+ ):
+ out["publisher_type"] = "big5"
elif publisher in OA_PUBLISHERS:
- out['publisher_type'] = 'oa'
- elif publisher in COMMERCIAL_PUBLISHERS or 'wolters kluwer' in pl or 'wolters-kluwer' in pl:
- out['publisher_type'] = 'commercial'
+ out["publisher_type"] = "oa"
+ elif (
+ publisher in COMMERCIAL_PUBLISHERS
+ or "wolters kluwer" in pl
+ or "wolters-kluwer" in pl
+ ):
+ out["publisher_type"] = "commercial"
elif publisher in ARCHIVE_PUBLISHERS:
- out['publisher_type'] = 'archive'
+ out["publisher_type"] = "archive"
elif publisher in REPOSITORY_PUBLISHERS:
- out['publisher_type'] = 'repository'
+ out["publisher_type"] = "repository"
elif publisher in OTHER_PUBLISHERS:
- out['publisher_type'] = 'other'
- elif publisher in SOCIETY_PUBLISHERS or 'society' in pl or 'association' in pl or 'academy of ' in pl or 'institute of' in pl:
- out['publisher_type'] = 'society'
- elif publisher in UNI_PRESS_PUBLISHERS or 'university ' in pl:
- out['publisher_type'] = 'unipress'
- elif 'scielo' in pl:
- out['publisher_type'] = 'scielo'
- elif out.get('is_oa') and (not out.get('has_dois') or out.get('lang') not in (None, 'en', 'de', 'fr', 'ja') or out.get('country') not in (None, 'us', 'gb', 'nl', 'cn', 'jp', 'de')):
+ out["publisher_type"] = "other"
+ elif (
+ publisher in SOCIETY_PUBLISHERS
+ or "society" in pl
+ or "association" in pl
+ or "academy of " in pl
+ or "institute of" in pl
+ ):
+ out["publisher_type"] = "society"
+ elif publisher in UNI_PRESS_PUBLISHERS or "university " in pl:
+ out["publisher_type"] = "unipress"
+ elif "scielo" in pl:
+ out["publisher_type"] = "scielo"
+ elif out.get("is_oa") and (
+ not out.get("has_dois")
+ or out.get("lang") not in (None, "en", "de", "fr", "ja")
+ or out.get("country") not in (None, "us", "gb", "nl", "cn", "jp", "de")
+ ):
# current informal definition of longtail
- out['publisher_type'] = 'longtail'
- out['is_longtail'] = True
-
- cur.execute("INSERT OR REPLACE INTO journal (issnl, issne, issnp, wikidata_qid, fatcat_ident, name, publisher, country, lang, is_oa, sherpa_color, is_longtail, is_active, publisher_type, has_dois, any_homepage, any_live_homepage, any_gwb_homepage, known_issnl, valid_issnl, release_count, ia_count, ia_frac, kbart_count, kbart_frac, preserved_count, preserved_frac) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
- (issnl,
- out.get('issne'),
- out.get('issnp'),
- out.get('wikidata_qid'),
- out.get('fatcat_ident'),
- out.get('name'),
- out.get('publisher'),
- out.get('country'),
- out.get('lang'),
- out.get('is_oa', False),
- out.get('sherpa_color'),
- out.get('is_longtail', False),
- out.get('is_active'),
- out.get('publisher_type'),
- out.get('has_dois', False),
- out.get('any_homepage', False),
- out.get('any_live_homepage', False),
- out.get('any_gwb_homepage', False),
- out.get('known_issnl'),
- out.get('valid_issnl'),
-
- out.get('release_count'),
- out.get('ia_count'),
- out.get('ia_frac'),
- out.get('kbart_count'),
- out.get('kbart_frac'),
- out.get('preserved_count'),
- out.get('preserved_frac'),
- ))
+ out["publisher_type"] = "longtail"
+ out["is_longtail"] = True
+
+ cur.execute(
+ "INSERT OR REPLACE INTO journal (issnl, issne, issnp, wikidata_qid, fatcat_ident, name, publisher, country, lang, is_oa, sherpa_color, is_longtail, is_active, publisher_type, has_dois, any_homepage, any_live_homepage, any_gwb_homepage, known_issnl, valid_issnl, release_count, ia_count, ia_frac, kbart_count, kbart_frac, preserved_count, preserved_frac) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
+ (
+ issnl,
+ out.get("issne"),
+ out.get("issnp"),
+ out.get("wikidata_qid"),
+ out.get("fatcat_ident"),
+ out.get("name"),
+ out.get("publisher"),
+ out.get("country"),
+ out.get("lang"),
+ out.get("is_oa", False),
+ out.get("sherpa_color"),
+ out.get("is_longtail", False),
+ out.get("is_active"),
+ out.get("publisher_type"),
+ out.get("has_dois", False),
+ out.get("any_homepage", False),
+ out.get("any_live_homepage", False),
+ out.get("any_gwb_homepage", False),
+ out.get("known_issnl"),
+ out.get("valid_issnl"),
+ out.get("release_count"),
+ out.get("ia_count"),
+ out.get("ia_frac"),
+ out.get("kbart_count"),
+ out.get("kbart_frac"),
+ out.get("preserved_count"),
+ out.get("preserved_frac"),
+ ),
+ )
cur.close()
self.db.commit()
return counts
@@ -534,125 +621,146 @@ class ChoculaDatabase():
for idx, col in enumerate(cursor.description):
d[col[0]] = row[idx]
return d
+
counts: Counter = Counter()
self.db.row_factory = dict_factory
cur = self.db.cursor()
- for row in cur.execute('SELECT * FROM journal'):
+ for row in cur.execute("SELECT * FROM journal"):
print(json.dumps(row))
- counts['total'] += 1
+ counts["total"] += 1
return counts
def export_fatcat(self):
counts: Counter = Counter()
self.db.row_factory = sqlite3.Row
cur = self.db.cursor()
- for row in cur.execute('SELECT * FROM journal WHERE valid_issnl = 1'):
- counts['total'] += 1
+ for row in cur.execute("SELECT * FROM journal WHERE valid_issnl = 1"):
+ counts["total"] += 1
- name = row['name']
+ name = row["name"]
if name:
name = name.strip()
- if not row['name']:
- counts['empty-name'] += 1
+ if not row["name"]:
+ counts["empty-name"] += 1
continue
if len(name) <= 2:
- counts['short-name'] += 1
+ counts["short-name"] += 1
continue
- publisher = row['publisher']
+ publisher = row["publisher"]
if publisher:
publisher = publisher.strip() or None
out = dict(
- issnl=row['issnl'],
- wikidata_qid=row['wikidata_qid'],
- ident=row['fatcat_ident'],
+ issnl=row["issnl"],
+ wikidata_qid=row["wikidata_qid"],
+ ident=row["fatcat_ident"],
publisher=publisher,
name=name,
- _known_issnl=row['known_issnl'])
+ _known_issnl=row["known_issnl"],
+ )
extra = dict(
- issnp=row['issnp'],
- issne=row['issne'],
- country=row['country'],
+ issnp=row["issnp"], issne=row["issne"], country=row["country"],
)
- if row['lang']:
- extra['languages'] = [row['lang'],]
- if row['sherpa_color']:
- extra['sherpa_romeo'] = dict(color=row['sherpa_color'])
+ if row["lang"]:
+ extra["languages"] = [
+ row["lang"],
+ ]
+ if row["sherpa_color"]:
+ extra["sherpa_romeo"] = dict(color=row["sherpa_color"])
urls = []
webarchive_urls = []
- cur = self.db.execute("SELECT * FROM homepage WHERE issnl = ?;", [row['issnl']])
+ cur = self.db.execute(
+ "SELECT * FROM homepage WHERE issnl = ?;", [row["issnl"]]
+ )
for hrow in cur:
- if '://doaj.org/' in hrow['url'] or '://www.doaj.org/' in hrow['url']:
+ if "://doaj.org/" in hrow["url"] or "://www.doaj.org/" in hrow["url"]:
continue
- if '://www.ncbi.nlm.nih.gov/' in hrow['url']:
+ if "://www.ncbi.nlm.nih.gov/" in hrow["url"]:
continue
- if 'web.archive.org/web' in hrow['url']:
- webarchive_urls.append(hrow['url'])
- urls.append(hrow['url'])
+ if "web.archive.org/web" in hrow["url"]:
+ webarchive_urls.append(hrow["url"])
+ urls.append(hrow["url"])
continue
- if hrow['host'] in ('www.google.com', 'books.google.com'):
+ if hrow["host"] in ("www.google.com", "books.google.com"):
# individual books or google searches, not journal/conference homepages
continue
- if '/oai/request' in hrow['url']:
+ if "/oai/request" in hrow["url"]:
# OAI-PMH endpoints, not homepages
continue
- if not row['any_live_homepage'] and hrow['gwb_url_success_dt'] and hrow['gwb_url_success_dt'] != 'error':
- webarchive_urls.append("https://web.archive.org/web/{}/{}".format(hrow['gwb_url_success_dt'], hrow['url']))
+ if (
+ not row["any_live_homepage"]
+ and hrow["gwb_url_success_dt"]
+ and hrow["gwb_url_success_dt"] != "error"
+ ):
+ webarchive_urls.append(
+ "https://web.archive.org/web/{}/{}".format(
+ hrow["gwb_url_success_dt"], hrow["url"]
+ )
+ )
continue
- if hrow['blocked']:
- urls.append(hrow['url'])
+ if hrow["blocked"]:
+ urls.append(hrow["url"])
continue
- if hrow['terminal_status_code'] == 200:
- if hrow['terminal_url'] == hrow['url'].replace('http://', 'https://') or hrow['terminal_url'] == hrow['url'] + "/":
+ if hrow["terminal_status_code"] == 200:
+ if (
+ hrow["terminal_url"]
+ == hrow["url"].replace("http://", "https://")
+ or hrow["terminal_url"] == hrow["url"] + "/"
+ ):
# check for trivial redirects; use post-redirect URL in those cases
- urls.append(hrow['terminal_url'])
+ urls.append(hrow["terminal_url"])
else:
- urls.append(hrow['url'])
+ urls.append(hrow["url"])
continue
# didn't even crawl and no match? add anyways as a pass-through
- if not hrow['status_code']:
- urls.append(hrow['url'])
+ if not hrow["status_code"]:
+ urls.append(hrow["url"])
continue
- extra['webarchive_urls'] = webarchive_urls
- extra['urls'] = urls
+ extra["webarchive_urls"] = webarchive_urls
+ extra["urls"] = urls
- cur = self.db.execute("SELECT * FROM directory WHERE issnl = ?;", [row['issnl']])
+ cur = self.db.execute(
+ "SELECT * FROM directory WHERE issnl = ?;", [row["issnl"]]
+ )
for drow in cur:
- if drow['slug'] == 'ezb':
- ezb = json.loads(drow['extra'])
- extra['ezb'] = dict(ezb_id=drow['identifier'], color=ezb['ezb_color'])
- elif drow['slug'] == 'szczepanski':
- extra['szczepanski'] = drow['extra']
- elif drow['slug'] == 'doaj':
- extra['doaj'] = json.loads(drow['extra'])
- elif drow['slug'] == 'scielo':
- extra['scielo'] = json.loads(drow['extra'])
- elif drow['slug'] == 'sim':
- extra['ia'] = extra.get('ia', {})
- extra['ia']['sim'] = json.loads(drow['extra'])
- extra['ia']['sim']['sim_pubid'] = drow['identifier']
- elif drow['slug'] in ('lockss', 'clockss', 'portico', 'jstor'):
- extra['kbart'] = extra.get('kbart', {})
- extra['kbart'][drow['slug']] = json.loads(drow['extra'])
-
- out['extra'] = extra
+ if drow["slug"] == "ezb":
+ ezb = json.loads(drow["extra"])
+ extra["ezb"] = dict(
+ ezb_id=drow["identifier"], color=ezb["ezb_color"]
+ )
+ elif drow["slug"] == "szczepanski":
+ extra["szczepanski"] = drow["extra"]
+ elif drow["slug"] == "doaj":
+ extra["doaj"] = json.loads(drow["extra"])
+ elif drow["slug"] == "scielo":
+ extra["scielo"] = json.loads(drow["extra"])
+ elif drow["slug"] == "sim":
+ extra["ia"] = extra.get("ia", {})
+ extra["ia"]["sim"] = json.loads(drow["extra"])
+ extra["ia"]["sim"]["sim_pubid"] = drow["identifier"]
+ elif drow["slug"] in ("lockss", "clockss", "portico", "jstor"):
+ extra["kbart"] = extra.get("kbart", {})
+ extra["kbart"][drow["slug"]] = json.loads(drow["extra"])
+
+ out["extra"] = extra
print(json.dumps(out))
return counts
def init_db(self):
print("### Creating Database...", file=sys.stderr)
- self.db.executescript("""
+ self.db.executescript(
+ """
PRAGMA main.page_size = 4096;
PRAGMA main.cache_size = 20000;
PRAGMA main.locking_mode = EXCLUSIVE;
PRAGMA main.synchronous = OFF;
- """)
- with open('chocula_schema.sql', 'r') as fschema:
+ """
+ )
+ with open("chocula_schema.sql", "r") as fschema:
self.db.executescript(fschema.read())
print("Done!", file=sys.stderr)
-
diff --git a/chocula/directories/__init__.py b/chocula/directories/__init__.py
index a233a26..90e6f26 100644
--- a/chocula/directories/__init__.py
+++ b/chocula/directories/__init__.py
@@ -1,4 +1,3 @@
-
from chocula.directories.crossref import CrossrefLoader
from chocula.directories.doaj import DoajLoader
from chocula.directories.entrez import EntrezLoader
@@ -14,7 +13,17 @@ from chocula.directories.szczepanski import SzczepanskiLoader
from chocula.directories.wikidata import WikidataLoader
ALL_CHOCULA_DIR_CLASSES = [
- CrossrefLoader, DoajLoader, EntrezLoader,EzbLoader, GoldOALoader,
- NorwegianLoader, OpenAPCLoader, RoadLoader, SherpaRomeoLoader,
- SzczepanskiLoader, WikidataLoader, SimLoader, ScieloLoader,
+ CrossrefLoader,
+ DoajLoader,
+ EntrezLoader,
+ EzbLoader,
+ GoldOALoader,
+ NorwegianLoader,
+ OpenAPCLoader,
+ RoadLoader,
+ SherpaRomeoLoader,
+ SzczepanskiLoader,
+ WikidataLoader,
+ SimLoader,
+ ScieloLoader,
]
diff --git a/chocula/directories/crossref.py b/chocula/directories/crossref.py
index 4208008..a494021 100644
--- a/chocula/directories/crossref.py
+++ b/chocula/directories/crossref.py
@@ -1,4 +1,3 @@
-
from typing import Iterable, Optional
import csv
@@ -23,14 +22,14 @@ class CrossrefLoader(DirectoryLoader):
def parse_record(self, record) -> Optional[DirectoryInfo]:
info = DirectoryInfo(
directory_slug=self.source_slug,
- issne=record['eissn'],
- issnp=record['pissn'],
- custom_id=record.get('doi').strip() or None,
- name=clean_str(record.get('JournalTitle')),
- publisher=clean_str(record.get('Publisher')),
+ issne=record["eissn"],
+ issnp=record["pissn"],
+ custom_id=record.get("doi").strip() or None,
+ name=clean_str(record.get("JournalTitle")),
+ publisher=clean_str(record.get("Publisher")),
)
- if record['additionalIssns']:
- info.raw_issn = record['additionalIssns'][0]
+ if record["additionalIssns"]:
+ info.raw_issn = record["additionalIssns"][0]
return info
diff --git a/chocula/directories/doaj.py b/chocula/directories/doaj.py
index 7968dc2..795ce68 100644
--- a/chocula/directories/doaj.py
+++ b/chocula/directories/doaj.py
@@ -1,8 +1,13 @@
-
from typing import Iterable, Optional, Dict, Any
import csv
-from chocula.util import clean_str, parse_mimetypes, parse_country, parse_lang, PLATFORM_MAP
+from chocula.util import (
+ clean_str,
+ parse_mimetypes,
+ parse_country,
+ parse_lang,
+ PLATFORM_MAP,
+)
from chocula.common import DirectoryLoader
from chocula.database import DirectoryInfo, HomepageUrl
@@ -81,40 +86,43 @@ class DoajLoader(DirectoryLoader):
info = DirectoryInfo(
directory_slug=self.source_slug,
- issnp=row['Journal ISSN (print version)'],
- issne=row['Journal EISSN (online version)'],
- name=clean_str(row['Journal title']),
- publisher=clean_str(row['Publisher']),
- platform=PLATFORM_MAP.get(row['Platform, host or aggregator']),
- country=parse_country(row['Country of publisher']),
+ issnp=row["Journal ISSN (print version)"],
+ issne=row["Journal EISSN (online version)"],
+ name=clean_str(row["Journal title"]),
+ publisher=clean_str(row["Publisher"]),
+ platform=PLATFORM_MAP.get(row["Platform, host or aggregator"]),
+ country=parse_country(row["Country of publisher"]),
)
- lang = parse_lang(row['Full text language'])
+ lang = parse_lang(row["Full text language"])
if lang:
info.langs.append(lang)
extra: Dict[str, Any] = dict(doaj=dict())
- extra['mimetypes'] = parse_mimetypes(row['Full text formats'])
- extra['doaj']['as_of'] = self.config.snapshot.date
- if row['DOAJ Seal']:
- extra['doaj']['seal'] = {"no": False, "yes": True}[row['DOAJ Seal'].lower()]
+ extra["mimetypes"] = parse_mimetypes(row["Full text formats"])
+ extra["doaj"]["as_of"] = self.config.snapshot.date
+ if row["DOAJ Seal"]:
+ extra["doaj"]["seal"] = {"no": False, "yes": True}[row["DOAJ Seal"].lower()]
- if row['Digital archiving policy or program(s)']:
- extra['archive'] = [a.strip() for a in row['Digital archiving policy or program(s)'].split(',') if a.strip()]
- elif row['Archiving: national library']:
- extra['archive'] = ['national-library']
+ if row["Digital archiving policy or program(s)"]:
+ extra["archive"] = [
+ a.strip()
+ for a in row["Digital archiving policy or program(s)"].split(",")
+ if a.strip()
+ ]
+ elif row["Archiving: national library"]:
+ extra["archive"] = ["national-library"]
- crawl_permission = row['Journal full-text crawl permission']
+ crawl_permission = row["Journal full-text crawl permission"]
if crawl_permission:
- extra['crawl-permission'] = dict(Yes=True, No=False)[crawl_permission]
- default_license = row['Journal license']
- if default_license and default_license.startswith('CC'):
- extra['default_license'] = default_license.replace('CC ', 'CC-').strip()
+ extra["crawl-permission"] = dict(Yes=True, No=False)[crawl_permission]
+ default_license = row["Journal license"]
+ if default_license and default_license.startswith("CC"):
+ extra["default_license"] = default_license.replace("CC ", "CC-").strip()
- url = row['Journal URL']
+ url = row["Journal URL"]
if url:
- homepage = HomepageUrl.from_url(row['Journal URL'])
+ homepage = HomepageUrl.from_url(row["Journal URL"])
if homepage:
info.homepage_urls.append(homepage)
return info
-
diff --git a/chocula/directories/entrez.py b/chocula/directories/entrez.py
index b30f04d..f9f6d23 100644
--- a/chocula/directories/entrez.py
+++ b/chocula/directories/entrez.py
@@ -1,4 +1,3 @@
-
from typing import Iterable, Optional
import csv
@@ -26,14 +25,13 @@ class EntrezLoader(DirectoryLoader):
return csv.DictReader(open(self.config.entrez_simple.filepath))
def parse_record(self, record) -> Optional[DirectoryInfo]:
- if not (record.get('ISSN (Online)') or record.get('ISSN (Print)')):
+ if not (record.get("ISSN (Online)") or record.get("ISSN (Print)")):
return None
return DirectoryInfo(
directory_slug=self.source_slug,
- issne=record.get('ISSN (Online)'),
- issnp=record.get('ISSN (Print)'),
- custom_id=record.get('NlmId').strip() or None,
- name=clean_str(record.get('JournalTitle')),
- abbrev=clean_str(record['IsoAbbr']),
+ issne=record.get("ISSN (Online)"),
+ issnp=record.get("ISSN (Print)"),
+ custom_id=record.get("NlmId").strip() or None,
+ name=clean_str(record.get("JournalTitle")),
+ abbrev=clean_str(record["IsoAbbr"]),
)
-
diff --git a/chocula/directories/ezb.py b/chocula/directories/ezb.py
index 1573048..056350d 100644
--- a/chocula/directories/ezb.py
+++ b/chocula/directories/ezb.py
@@ -1,4 +1,3 @@
-
from typing import Iterable, Optional
import json
@@ -16,7 +15,7 @@ class EzbLoader(DirectoryLoader):
source_slug = "ezb"
def open_file(self) -> Iterable:
- return open(self.config.ezb.filepath, 'r')
+ return open(self.config.ezb.filepath, "r")
def parse_record(self, row) -> Optional[DirectoryInfo]:
@@ -26,21 +25,29 @@ class EzbLoader(DirectoryLoader):
info = DirectoryInfo(
directory_slug=self.source_slug,
- issne=row.get('issne'),
- issnp=row.get('issnp'),
- custom_id=row['ezb_id'],
- name=clean_str(row['title']),
- publisher=clean_str(row.get('publisher')),
+ issne=row.get("issne"),
+ issnp=row.get("issnp"),
+ custom_id=row["ezb_id"],
+ name=clean_str(row["title"]),
+ publisher=clean_str(row.get("publisher")),
)
info.extra = dict()
- for k in ('ezb_color', 'subjects', 'keywords', 'zdb_id',
- 'first_volume', 'first_issue', 'first_year',
- 'appearance', 'costs'):
+ for k in (
+ "ezb_color",
+ "subjects",
+ "keywords",
+ "zdb_id",
+ "first_volume",
+ "first_issue",
+ "first_year",
+ "appearance",
+ "costs",
+ ):
if row.get(k):
info.extra[k] = row[k]
- url = HomepageUrl.from_url(row.get('url'))
+ url = HomepageUrl.from_url(row.get("url"))
if url:
info.homepage_urls.append(url)
diff --git a/chocula/directories/gold_oa.py b/chocula/directories/gold_oa.py
index a75944d..d0c6e8b 100644
--- a/chocula/directories/gold_oa.py
+++ b/chocula/directories/gold_oa.py
@@ -1,4 +1,3 @@
-
from typing import Iterable, Optional
import csv
@@ -21,11 +20,11 @@ class GoldOALoader(DirectoryLoader):
def parse_record(self, row) -> Optional[DirectoryInfo]:
- if not (row.get('ISSN_L') and row.get('TITLE')):
+ if not (row.get("ISSN_L") and row.get("TITLE")):
return None
# TODO: also add for other non-direct indices
- #for ind in ('WOS', 'SCOPUS'):
+ # for ind in ('WOS', 'SCOPUS'):
# issnl, status = self.add_issn(
# ind.lower(),
# raw_issn=row['ISSN_L'],
@@ -33,12 +32,12 @@ class GoldOALoader(DirectoryLoader):
# )
extra = dict()
- for ind in ('DOAJ', 'ROAD', 'PMC', 'OAPC', 'WOS', 'SCOPUS'):
- extra['in_' + ind.lower()] = bool(int(row['JOURNAL_IN_' + ind]))
+ for ind in ("DOAJ", "ROAD", "PMC", "OAPC", "WOS", "SCOPUS"):
+ extra["in_" + ind.lower()] = bool(int(row["JOURNAL_IN_" + ind]))
return DirectoryInfo(
directory_slug=self.source_slug,
- raw_issn=row['ISSN_L'],
- name=clean_str(row['TITLE']),
+ raw_issn=row["ISSN_L"],
+ name=clean_str(row["TITLE"]),
extra=extra,
)
diff --git a/chocula/directories/norwegian.py b/chocula/directories/norwegian.py
index 2b83961..2425318 100644
--- a/chocula/directories/norwegian.py
+++ b/chocula/directories/norwegian.py
@@ -1,4 +1,3 @@
-
from typing import Iterable, Optional
import csv
@@ -52,29 +51,31 @@ class NorwegianLoader(DirectoryLoader):
source_slug = "norwegian"
def open_file(self) -> Iterable:
- return csv.DictReader(open(self.config.norwegian.filepath, encoding="ISO-8859-1"), delimiter=";")
+ return csv.DictReader(
+ open(self.config.norwegian.filepath, encoding="ISO-8859-1"), delimiter=";"
+ )
def parse_record(self, row) -> Optional[DirectoryInfo]:
info = DirectoryInfo(
directory_slug=self.source_slug,
- issnp=row['Print ISSN'],
- issne=row['Online ISSN'],
- country=parse_country(row['Country of publication']),
- name=clean_str(row.get('International title')),
- langs=[l for l in [parse_lang(row['Language'])] if l],
+ issnp=row["Print ISSN"],
+ issne=row["Online ISSN"],
+ country=parse_country(row["Country of publication"]),
+ name=clean_str(row.get("International title")),
+ langs=[l for l in [parse_lang(row["Language"])] if l],
)
- info.extra['norwegian'] = dict(as_of=self.config.norwegian.date)
- if row['Level 2019']:
- info.extra['norwegian']['level'] = int(row['Level 2019'])
+ info.extra["norwegian"] = dict(as_of=self.config.norwegian.date)
+ if row["Level 2019"]:
+ info.extra["norwegian"]["level"] = int(row["Level 2019"])
- if row['Original title'] != row['International title']:
- info.original_name = clean_str(row['Original title'])
+ if row["Original title"] != row["International title"]:
+ info.original_name = clean_str(row["Original title"])
- identifier=row['NSD tidsskrift_id'],
- publisher=row['Publisher'],
+ identifier = (row["NSD tidsskrift_id"],)
+ publisher = (row["Publisher"],)
- url = HomepageUrl.from_url(row['URL'])
+ url = HomepageUrl.from_url(row["URL"])
if url:
info.homepage_urls.append(url)
diff --git a/chocula/directories/openapc.py b/chocula/directories/openapc.py
index c2acd95..99304c3 100644
--- a/chocula/directories/openapc.py
+++ b/chocula/directories/openapc.py
@@ -1,4 +1,3 @@
-
from typing import Iterable, Optional
import csv
@@ -21,24 +20,22 @@ class OpenAPCLoader(DirectoryLoader):
def parse_record(self, row) -> Optional[DirectoryInfo]:
- if not row.get('issn'):
+ if not row.get("issn"):
return None
info = DirectoryInfo(
directory_slug=self.source_slug,
- issne=row['issn_electronic'],
- issnp=row['issn_print'],
- raw_issn=row['issn_l'] or row['issn'],
- name=clean_str(row['journal_full_title']),
- publisher=clean_str(row['publisher']),
+ issne=row["issn_electronic"],
+ issnp=row["issn_print"],
+ raw_issn=row["issn_l"] or row["issn"],
+ name=clean_str(row["journal_full_title"]),
+ publisher=clean_str(row["publisher"]),
)
- info.extra['is_hybrid'] = bool(row['is_hybrid'])
+ info.extra["is_hybrid"] = bool(row["is_hybrid"])
- homepage = HomepageUrl.from_url(row['url'])
+ homepage = HomepageUrl.from_url(row["url"])
if homepage:
info.homepage_urls.append(homepage)
return info
-
-
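
Unrelated to the reformatting itself, the `is_hybrid` line in the OpenAPCLoader hunk relies on `bool()` of the raw CSV value, and any non-empty Python string is truthy, as the sketch below shows. Whether the OpenAPC dump ever carries a textual FALSE in that column is not something this diff establishes:

    assert bool("TRUE") is True
    assert bool("FALSE") is True   # non-empty string, still truthy
    assert bool("") is False
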
diff --git a/chocula/directories/road.py b/chocula/directories/road.py
index 23cca65..bc550fd 100644
--- a/chocula/directories/road.py
+++ b/chocula/directories/road.py
@@ -1,4 +1,3 @@
-
from typing import Iterable, Optional
import csv
@@ -26,27 +25,39 @@ class RoadLoader(DirectoryLoader):
source_slug = "road"
def open_file(self) -> Iterable:
- return csv.DictReader(open(self.config.road.filepath), delimiter='\t',
- fieldnames=("ISSN", "ISSN-L", "Short Title", "Title", "Publisher", "URL1", "URL2", "Region", "Lang1", "Lang2")
+ return csv.DictReader(
+ open(self.config.road.filepath),
+ delimiter="\t",
+ fieldnames=(
+ "ISSN",
+ "ISSN-L",
+ "Short Title",
+ "Title",
+ "Publisher",
+ "URL1",
+ "URL2",
+ "Region",
+ "Lang1",
+ "Lang2",
+ ),
)
def parse_record(self, row) -> Optional[DirectoryInfo]:
info = DirectoryInfo(
directory_slug=self.source_slug,
- raw_issn=row['ISSN-L'],
- name=clean_str(row['Short Title']),
- publisher=clean_str(row['Publisher']),
- langs=[l for l in (row['Lang1'], row['Lang2']) if l],
+ raw_issn=row["ISSN-L"],
+ name=clean_str(row["Short Title"]),
+ publisher=clean_str(row["Publisher"]),
+ langs=[l for l in (row["Lang1"], row["Lang2"]) if l],
)
# TODO: region mapping: "Europe and North America"
# TODO: lang mapping: already alpha-3
# homepages
- for url in [u for u in (row['URL1'], row['URL2']) if u]:
+ for url in [u for u in (row["URL1"], row["URL2"]) if u]:
homepage = HomepageUrl.from_url(url)
if homepage:
info.homepage_urls.append(homepage)
return info
-
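
The RoadLoader change spreads the explicit `fieldnames` tuple over multiple lines; behavior is unchanged. For readers unfamiliar with parsing a headerless TSV this way, a stand-alone example of the same csv.DictReader pattern, with invented data:

    import csv
    import io

    tsv = "1234-5678\t1234-5678\tExample J.\tExample Journal\tExample Press\thttp://example.org\t\tEurope\teng\t\n"
    reader = csv.DictReader(
        io.StringIO(tsv),
        delimiter="\t",
        fieldnames=("ISSN", "ISSN-L", "Short Title", "Title", "Publisher",
                    "URL1", "URL2", "Region", "Lang1", "Lang2"),
    )
    row = next(reader)
    assert row["ISSN-L"] == "1234-5678" and row["Lang1"] == "eng"
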
diff --git a/chocula/directories/scielo.py b/chocula/directories/scielo.py
index 247866b..0ed8fde 100644
--- a/chocula/directories/scielo.py
+++ b/chocula/directories/scielo.py
@@ -1,4 +1,3 @@
-
from typing import Iterable, Optional
import json
@@ -17,32 +16,31 @@ class ScieloLoader(DirectoryLoader):
def parse_record(self, line) -> Optional[DirectoryInfo]:
record = json.loads(line)
extra = dict(
- status=clean_str(record.get('current_status')),
- first_year=record.get('first_year'),
- collection=record.get('collection_acronym'),
+ status=clean_str(record.get("current_status")),
+ first_year=record.get("first_year"),
+ collection=record.get("collection_acronym"),
)
for k in list(extra.keys()):
if extra[k] is None:
extra.pop(k)
country: Optional[str] = None
- if record['publisher_country'] and len(record['publisher_country'][0]) == 2:
- country = record['publisher_country'][0].lower()
+ if record["publisher_country"] and len(record["publisher_country"][0]) == 2:
+ country = record["publisher_country"][0].lower()
info = DirectoryInfo(
directory_slug=self.source_slug,
- issne=clean_issn(record.get('electronic_issn') or ''),
- issnp=clean_issn(record.get('print_issn') or ''),
- custom_id=clean_str(record.get('scielo_issn')),
- name=clean_str(record.get('fulltitle')),
- publisher=clean_str((record.get('publisher_name') or [''])[0]),
- abbrev=clean_str(record['abbreviated_iso_title']),
- platform='scielo',
- langs=list(filter(lambda s: len(s) == 2, record['languages'])),
+ issne=clean_issn(record.get("electronic_issn") or ""),
+ issnp=clean_issn(record.get("print_issn") or ""),
+ custom_id=clean_str(record.get("scielo_issn")),
+ name=clean_str(record.get("fulltitle")),
+ publisher=clean_str((record.get("publisher_name") or [""])[0]),
+ abbrev=clean_str(record["abbreviated_iso_title"]),
+ platform="scielo",
+ langs=list(filter(lambda s: len(s) == 2, record["languages"])),
country=country,
extra=extra,
)
- if record['url']:
- homepage = HomepageUrl.from_url(record['url'])
+ if record["url"]:
+ homepage = HomepageUrl.from_url(record["url"])
if homepage:
info.homepage_urls.append(homepage)
return info
-
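
In the ScieloLoader hunk, the language and country handling is compact enough to be easy to misread. A small sketch of just those two expressions, with illustrative input:

    record = {"languages": ["en", "pt", "spa"], "publisher_country": ["BR"]}  # illustrative values
    langs = list(filter(lambda s: len(s) == 2, record["languages"]))
    assert langs == ["en", "pt"]          # three-letter codes are dropped
    country = None
    if record["publisher_country"] and len(record["publisher_country"][0]) == 2:
        country = record["publisher_country"][0].lower()
    assert country == "br"
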
diff --git a/chocula/directories/sherpa_romeo.py b/chocula/directories/sherpa_romeo.py
index e92dc69..a8ba1b0 100644
--- a/chocula/directories/sherpa_romeo.py
+++ b/chocula/directories/sherpa_romeo.py
@@ -1,4 +1,3 @@
-
import sys
from typing import Iterable, Optional, Dict, Any
import csv
@@ -27,32 +26,38 @@ class SherpaRomeoLoader(DirectoryLoader):
# first load policies
print("##### Loading SHERPA/ROMEO policies...", file=sys.stderr)
- fixed_policy_file = ftfy.fix_file(open(self.config.sherpa_romeo_policies_simple.filepath, 'rb'))
+ fixed_policy_file = ftfy.fix_file(
+ open(self.config.sherpa_romeo_policies_simple.filepath, "rb")
+ )
policy_reader = csv.DictReader(fixed_policy_file)
for row in policy_reader:
- self.sherpa_policies[row['RoMEO Record ID']] = row
+ self.sherpa_policies[row["RoMEO Record ID"]] = row
# then open regular file
- raw_file = open(self.config.sherpa_romeo_journals_simple.filepath, 'rb').read().decode(errors='replace')
+ raw_file = (
+ open(self.config.sherpa_romeo_journals_simple.filepath, "rb")
+ .read()
+ .decode(errors="replace")
+ )
fixed_file = ftfy.fix_text(raw_file)
- return csv.DictReader(fixed_file.split('\n'))
+ return csv.DictReader(fixed_file.split("\n"))
def parse_record(self, row) -> Optional[DirectoryInfo]:
# super mangled :(
- row.update(self.sherpa_policies[row['RoMEO Record ID']])
+ row.update(self.sherpa_policies[row["RoMEO Record ID"]])
info = DirectoryInfo(
directory_slug=self.source_slug,
- issnp=row['ISSN'],
- issne=row['ESSN'],
- name=clean_str(row['Journal Title']),
- publisher=clean_str(row['Publisher']),
- country=parse_country(row['Country']),
- custom_id=row['RoMEO Record ID'],
+ issnp=row["ISSN"],
+ issne=row["ESSN"],
+ name=clean_str(row["Journal Title"]),
+ publisher=clean_str(row["Publisher"]),
+ country=parse_country(row["Country"]),
+ custom_id=row["RoMEO Record ID"],
)
- if row['RoMEO colour']:
- info.extra['sherpa_romeo'] = dict(color=row['RoMEO colour'])
+ if row["RoMEO colour"]:
+ info.extra["sherpa_romeo"] = dict(color=row["RoMEO colour"])
return info
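
The SherpaRomeoLoader change is again only line-wrapping, but the underlying pattern for the journals file, decode with errors="replace", run the text through ftfy to repair mojibake, then hand the repaired lines to csv.DictReader, is worth spelling out. A rough sketch of that path, assuming a local journals.csv with mangled encoding (the filename is illustrative):

    import csv
    import ftfy

    raw = open("journals.csv", "rb").read().decode(errors="replace")
    fixed = ftfy.fix_text(raw)            # repairs common mojibake sequences
    rows = list(csv.DictReader(fixed.split("\n")))
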
diff --git a/chocula/directories/sim.py b/chocula/directories/sim.py
index ff5cce3..97f84d2 100644
--- a/chocula/directories/sim.py
+++ b/chocula/directories/sim.py
@@ -1,8 +1,14 @@
-
from typing import Iterable, Optional, Dict, Any
import csv
-from chocula.util import clean_str, parse_mimetypes, parse_country, parse_lang, PLATFORM_MAP, gaps_to_spans
+from chocula.util import (
+ clean_str,
+ parse_mimetypes,
+ parse_country,
+ parse_lang,
+ PLATFORM_MAP,
+ gaps_to_spans,
+)
from chocula.common import DirectoryLoader
from chocula.database import DirectoryInfo, HomepageUrl
@@ -37,35 +43,34 @@ class SimLoader(DirectoryLoader):
# TODO: 'Pub Type'
extra: Dict[str, Any] = {}
- first_year = row['First Volume']
+ first_year = row["First Volume"]
if first_year:
first_year = int(first_year)
- extra['first_year'] = int(row['First Volume'])
+ extra["first_year"] = int(row["First Volume"])
else:
first_year = None
- last_year = row['Last Volume']
+ last_year = row["Last Volume"]
if last_year:
last_year = int(last_year)
- extra['last_year'] = last_year
+ extra["last_year"] = last_year
else:
last_year = None
- gaps = [int(g) for g in row['NA Gaps'].split(';') if g.strip()]
+ gaps = [int(g) for g in row["NA Gaps"].split(";") if g.strip()]
if gaps:
- extra['gaps'] = gaps
+ extra["gaps"] = gaps
if first_year and last_year:
- extra['year_spans'] = gaps_to_spans(first_year, last_year, gaps)
- extra['scholarly_peer_reviewed'] = row["Scholarly / Peer-\nReviewed"]
- extra['peer_reviewed'] = row["Peer-\nReviewed"]
- extra['pub_type'] = clean_str(row["Pub Type"])
+ extra["year_spans"] = gaps_to_spans(first_year, last_year, gaps)
+ extra["scholarly_peer_reviewed"] = row["Scholarly / Peer-\nReviewed"]
+ extra["peer_reviewed"] = row["Peer-\nReviewed"]
+ extra["pub_type"] = clean_str(row["Pub Type"])
info = DirectoryInfo(
directory_slug=self.source_slug,
- name=clean_str(row['Title']),
- publisher=clean_str(row['Publisher']),
- raw_issn=row['ISSN'][:9],
- custom_id=row.get('NA Pub Cat ID').strip() or None,
- langs=[parse_lang(row['Pub Language'])],
+ name=clean_str(row["Title"]),
+ publisher=clean_str(row["Publisher"]),
+ raw_issn=row["ISSN"][:9],
+ custom_id=row.get("NA Pub Cat ID").strip() or None,
+ langs=[parse_lang(row["Pub Language"])],
extra=extra,
)
return info
-
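
For the SimLoader, the "NA Gaps" parsing shown above tolerates spaces and empty entries because int() strips surrounding whitespace and the comprehension filters blanks. A quick illustration with made-up values:

    row = {"NA Gaps": "1911; 1915; ", "First Volume": "1900", "Last Volume": "1920"}  # illustrative
    gaps = [int(g) for g in row["NA Gaps"].split(";") if g.strip()]
    assert gaps == [1911, 1915]
    first_year, last_year = int(row["First Volume"]), int(row["Last Volume"])
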
diff --git a/chocula/directories/szczepanski.py b/chocula/directories/szczepanski.py
index 0d1558a..3586acb 100644
--- a/chocula/directories/szczepanski.py
+++ b/chocula/directories/szczepanski.py
@@ -1,4 +1,3 @@
-
from typing import Iterable, Optional
import json
@@ -16,7 +15,7 @@ class SzczepanskiLoader(DirectoryLoader):
source_slug = "szczepanski"
def open_file(self) -> Iterable:
- return open(self.config.szczepanski.filepath, 'r')
+ return open(self.config.szczepanski.filepath, "r")
def parse_record(self, row) -> Optional[DirectoryInfo]:
@@ -27,21 +26,21 @@ class SzczepanskiLoader(DirectoryLoader):
info = DirectoryInfo(
directory_slug=self.source_slug,
- issne=row.get('issne'),
- issnp=row.get('issnp'),
- raw_issn=row.get('issn'),
- name=clean_str(row['title']),
- publisher=clean_str(row.get('ed')),
+ issne=row.get("issne"),
+ issnp=row.get("issnp"),
+ raw_issn=row.get("issn"),
+ name=clean_str(row["title"]),
+ publisher=clean_str(row.get("ed")),
)
- info.extra['szczepanski'] = dict(as_of=self.config.szczepanski.date)
- if row.get('extra'):
- info.extra['szczepanski']['notes'] = row.get('extra')
- for k in ('other_titles', 'year_spans', 'ed'):
+ info.extra["szczepanski"] = dict(as_of=self.config.szczepanski.date)
+ if row.get("extra"):
+ info.extra["szczepanski"]["notes"] = row.get("extra")
+ for k in ("other_titles", "year_spans", "ed"):
if row.get(k):
- info.extra['szczepanski'][k] = row[k]
+ info.extra["szczepanski"][k] = row[k]
- url = HomepageUrl.from_url(row.get('url'))
+ url = HomepageUrl.from_url(row.get("url"))
if url:
info.homepage_urls.append(url)
diff --git a/chocula/directories/wikidata.py b/chocula/directories/wikidata.py
index d16d8df..5ffe6fb 100644
--- a/chocula/directories/wikidata.py
+++ b/chocula/directories/wikidata.py
@@ -1,4 +1,3 @@
-
from typing import Iterable, Optional
import csv
@@ -16,27 +15,31 @@ class WikidataLoader(DirectoryLoader):
source_slug = "wikidata"
def open_file(self) -> Iterable:
- return csv.DictReader(open(self.config.wikidata.filepath), delimiter='\t')
+ return csv.DictReader(open(self.config.wikidata.filepath), delimiter="\t")
def parse_record(self, row) -> Optional[DirectoryInfo]:
- if not (row.get('issn') and row.get('title')):
+ if not (row.get("issn") and row.get("title")):
return None
- wikidata_qid = row['item'].strip().split('/')[-1]
- publisher = row['publisher_name']
- if (publisher.startswith('Q') and publisher[1].isdigit()) or publisher.startswith('t1') or not publisher:
+ wikidata_qid = row["item"].strip().split("/")[-1]
+ publisher = row["publisher_name"]
+ if (
+ (publisher.startswith("Q") and publisher[1].isdigit())
+ or publisher.startswith("t1")
+ or not publisher
+ ):
publisher = None
- info =DirectoryInfo(
+ info = DirectoryInfo(
directory_slug=self.source_slug,
- raw_issn=row['issn'],
+ raw_issn=row["issn"],
custom_id=wikidata_qid,
- name=clean_str(row['title']),
+ name=clean_str(row["title"]),
publisher=clean_str(publisher),
)
- if row.get('start_year'):
- info.extra['start_year'] = row['start_year']
+ if row.get("start_year"):
+ info.extra["start_year"] = row["start_year"]
- url = HomepageUrl.from_url(row.get('websiteurl'))
+ url = HomepageUrl.from_url(row.get("websiteurl"))
if url:
info.homepage_urls.append(url)
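
In WikidataLoader, the QID is recovered by splitting the entity URL, and the publisher filter drops values that look like bare QIDs or "t1" placeholders rather than names. A minimal illustration (the URL is a generic Wikidata entity URL, not taken from the input dump):

    item = "http://www.wikidata.org/entity/Q42"
    wikidata_qid = item.strip().split("/")[-1]
    assert wikidata_qid == "Q42"

    publisher = "Q42"   # looks like a QID rather than a publisher name, so it is discarded
    if (publisher.startswith("Q") and publisher[1].isdigit()) or publisher.startswith("t1") or not publisher:
        publisher = None
    assert publisher is None
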
diff --git a/chocula/kbart.py b/chocula/kbart.py
index 6c1f580..e8094e3 100644
--- a/chocula/kbart.py
+++ b/chocula/kbart.py
@@ -1,4 +1,3 @@
-
from typing import List, Any
from chocula.common import KbartLoader
@@ -25,7 +24,7 @@ class PorticoKbartLoader(KbartLoader):
def file_path(self) -> str:
return self.config.portico.filepath
-
+
class JstorKbartLoader(KbartLoader):
diff --git a/chocula/util.py b/chocula/util.py
index 2cb771d..11303b8 100644
--- a/chocula/util.py
+++ b/chocula/util.py
@@ -1,4 +1,3 @@
-
import sys
from dataclasses import dataclass
from typing import Dict, Optional
@@ -11,119 +10,120 @@ import pycountry
# NOTE: this is a partial list, focusing on non-publisher hosted platforms and
# software frameworks
PLATFORM_MAP = {
- 'OJS': 'ojs',
- 'OJS SEER': 'ojs',
- 'Open Journal System/OJS': 'ojs',
- 'BMC': 'bmc',
- 'SciELO Brazil': 'scielo',
- 'SciELO Argentina': 'scielo',
- 'SciELO': 'scielo',
- 'SciELO Mexico': 'scielo',
- 'SciELO Spain': 'scielo',
- 'SciELO Portugal': 'scielo',
- 'WordPress': 'wordpress',
- 'Sciendo': 'sciendo',
- 'Drupal': 'drupal',
- 'revues.org': 'openedition',
+ "OJS": "ojs",
+ "OJS SEER": "ojs",
+ "Open Journal System/OJS": "ojs",
+ "BMC": "bmc",
+ "SciELO Brazil": "scielo",
+ "SciELO Argentina": "scielo",
+ "SciELO": "scielo",
+ "SciELO Mexico": "scielo",
+ "SciELO Spain": "scielo",
+ "SciELO Portugal": "scielo",
+ "WordPress": "wordpress",
+ "Sciendo": "sciendo",
+ "Drupal": "drupal",
+ "revues.org": "openedition",
}
MIMETYPE_MAP = {
- 'PDF': 'application/pdf',
- 'HTML': 'text/html',
- 'XML': 'application/xml',
+ "PDF": "application/pdf",
+ "HTML": "text/html",
+ "XML": "application/xml",
}
BIG5_PUBLISHERS = [
- 'Elsevier',
- 'Informa UK (Taylor & Francis)',
- 'Springer-Verlag',
- 'SAGE Publications',
- 'Wiley (Blackwell Publishing)',
- 'Wiley (John Wiley & Sons)',
- 'Springer (Biomed Central Ltd.)',
- 'Springer Nature',
+ "Elsevier",
+ "Informa UK (Taylor & Francis)",
+ "Springer-Verlag",
+ "SAGE Publications",
+ "Wiley (Blackwell Publishing)",
+ "Wiley (John Wiley & Sons)",
+ "Springer (Biomed Central Ltd.)",
+ "Springer Nature",
]
COMMERCIAL_PUBLISHERS = [
- 'Peter Lang International Academic Publishers',
- 'Walter de Gruyter GmbH',
- 'Oldenbourg Wissenschaftsverlag',
- 'Georg Thieme Verlag KG', # not springer
- 'Emerald (MCB UP )',
- 'Medknow Publications',
- 'Inderscience Enterprises Ltd',
- 'Bentham Science',
- 'Ovid Technologies (Wolters Kluwer) - Lippincott Williams & Wilkins',
- 'Scientific Research Publishing, Inc',
- 'MDPI AG',
- 'S. Karger AG',
- 'Pleiades Publishing',
- 'Science Publishing Group',
- 'IGI Global',
- 'The Economist Intelligence Unit',
- 'Maney Publishing',
- 'Diva Enterprises Private Limited',
- 'World Scientific',
- 'Mary Ann Liebert',
- 'Trans Tech Publications',
+ "Peter Lang International Academic Publishers",
+ "Walter de Gruyter GmbH",
+ "Oldenbourg Wissenschaftsverlag",
+ "Georg Thieme Verlag KG", # not springer
+ "Emerald (MCB UP )",
+ "Medknow Publications",
+ "Inderscience Enterprises Ltd",
+ "Bentham Science",
+ "Ovid Technologies (Wolters Kluwer) - Lippincott Williams & Wilkins",
+ "Scientific Research Publishing, Inc",
+ "MDPI AG",
+ "S. Karger AG",
+ "Pleiades Publishing",
+ "Science Publishing Group",
+ "IGI Global",
+ "The Economist Intelligence Unit",
+ "Maney Publishing",
+ "Diva Enterprises Private Limited",
+ "World Scientific",
+ "Mary Ann Liebert",
+ "Trans Tech Publications",
]
OA_PUBLISHERS = [
- 'Hindawi Limited',
- 'OMICS Publishing Group',
- 'De Gruyter Open Sp. z o.o.',
- 'OpenEdition',
- 'Hindawi (International Scholarly Research Network)',
- 'Public Library of Science',
- 'Frontiers Media SA',
- 'eLife Sciences Publications, Ltd',
- 'MDPI AG',
- 'Hindawi (International Scholarly Research Network)',
- 'Dove Medical Press',
- 'Open Access Text',
+ "Hindawi Limited",
+ "OMICS Publishing Group",
+ "De Gruyter Open Sp. z o.o.",
+ "OpenEdition",
+ "Hindawi (International Scholarly Research Network)",
+ "Public Library of Science",
+ "Frontiers Media SA",
+ "eLife Sciences Publications, Ltd",
+ "MDPI AG",
+ "Hindawi (International Scholarly Research Network)",
+ "Dove Medical Press",
+ "Open Access Text",
]
SOCIETY_PUBLISHERS = [
- 'Institute of Electrical and Electronics Engineers',
- 'Institution of Electrical Engineers',
- 'Association for Computing Machinery',
- 'American Psychological Association',
- 'IOS Press',
- 'IOP Publishing',
- 'American Chemical Society',
- 'Royal Society of Chemistry (RSC)',
- 'American Geophysical Union',
- 'American College of Physicians',
- 'New England Journal of Medicine',
- 'BMJ',
- 'RCN Publishing',
- 'International Union of Crystallography',
- 'Portland Press',
- 'ASME International',
+ "Institute of Electrical and Electronics Engineers",
+ "Institution of Electrical Engineers",
+ "Association for Computing Machinery",
+ "American Psychological Association",
+ "IOS Press",
+ "IOP Publishing",
+ "American Chemical Society",
+ "Royal Society of Chemistry (RSC)",
+ "American Geophysical Union",
+ "American College of Physicians",
+ "New England Journal of Medicine",
+ "BMJ",
+ "RCN Publishing",
+ "International Union of Crystallography",
+ "Portland Press",
+ "ASME International",
]
UNI_PRESS_PUBLISHERS = [
- 'Cambridge University Press',
- 'Oxford University Press',
- 'The University of Chicago Press',
- 'MIT Press',
+ "Cambridge University Press",
+ "Oxford University Press",
+ "The University of Chicago Press",
+ "MIT Press",
]
ARCHIVE_PUBLISHERS = [
- 'JSTOR',
- 'Portico',
+ "JSTOR",
+ "Portico",
]
REPOSITORY_PUBLISHERS = [
- 'PERSEE Program',
- 'Social Science Electronic Publishing',
- 'CAIRN',
- 'CSIRO Publishing',
+ "PERSEE Program",
+ "Social Science Electronic Publishing",
+ "CAIRN",
+ "CSIRO Publishing",
]
OTHER_PUBLISHERS = [
- 'African Journals Online',
- 'Smithsonian Institution Biodiversity Heritage Library',
- 'Canadian Science Publishing',
- 'Philosophy Documentation Center',
- 'Project MUSE',
+ "African Journals Online",
+ "Smithsonian Institution Biodiversity Heritage Library",
+ "Canadian Science Publishing",
+ "Philosophy Documentation Center",
+ "Project MUSE",
]
+
def parse_lang(s):
- if not s or s in ('Not applicable', 'Multiple languages', 'Unknown'):
+ if not s or s in ("Not applicable", "Multiple languages", "Unknown"):
return None
try:
if len(s) == 2:
@@ -138,8 +138,9 @@ def parse_lang(s):
except AttributeError:
return None
+
def parse_country(s):
- if not s or s in ('Unknown'):
+ if not s or s in ("Unknown"):
return None
try:
if len(s) == 2:
@@ -153,12 +154,13 @@ def parse_country(s):
else:
return None
+
def parse_mimetypes(val):
# XXX: multiple mimetypes?
if not val:
return
mimetype = None
- if '/' in val:
+ if "/" in val:
mimetype = val
else:
mimetype = MIMETYPE_MAP.get(val)
@@ -166,13 +168,14 @@ def parse_mimetypes(val):
return None
return [mimetype]
+
def gaps_to_spans(first, last, gaps):
if not gaps:
return [[first, last]]
if not (last >= first and max(gaps) < last and min(gaps) > first):
# years seem mangled? will continue though
print("mangled years: {}".format((first, last, gaps)), file=sys.stderr)
- full = list(range(first, last+1))
+ full = list(range(first, last + 1))
for missing in gaps:
if missing in full:
full.remove(missing)
@@ -184,7 +187,7 @@ def gaps_to_spans(first, last, gaps):
low = year
last = year
continue
- if year != last+1:
+ if year != last + 1:
spans.append([low, last])
low = year
last = year
@@ -193,15 +196,17 @@ def gaps_to_spans(first, last, gaps):
spans.append([low, last])
return spans
+
def test_gaps():
- assert gaps_to_spans(1900, 1900, None) == \
- [[1900, 1900]]
- assert gaps_to_spans(1900, 1903, None) == \
- [[1900, 1903]]
- assert gaps_to_spans(1900, 1902, [1901]) == \
- [[1900, 1900], [1902, 1902]]
- assert gaps_to_spans(1950, 1970, [1955, 1956, 1965]) == \
- [[1950, 1954], [1957, 1964], [1966, 1970]]
+ assert gaps_to_spans(1900, 1900, None) == [[1900, 1900]]
+ assert gaps_to_spans(1900, 1903, None) == [[1900, 1903]]
+ assert gaps_to_spans(1900, 1902, [1901]) == [[1900, 1900], [1902, 1902]]
+ assert gaps_to_spans(1950, 1970, [1955, 1956, 1965]) == [
+ [1950, 1954],
+ [1957, 1964],
+ [1966, 1970],
+ ]
+
def merge_spans(old, new):
if not new:
@@ -211,7 +216,7 @@ def merge_spans(old, new):
old.extend(new)
years = set()
for span in old:
- for y in range(span[0], span[1]+1):
+ for y in range(span[0], span[1] + 1):
years.add(y)
if not years:
return []
@@ -240,19 +245,14 @@ def merge_spans(old, new):
spans.append([start, last])
return spans
+
def test_merge_spans():
- assert merge_spans([[5, 10]], [[10, 20]]) == \
- [[5, 20]]
- assert merge_spans([[5, 9]], [[10, 20]]) == \
- [[5, 20]]
- assert merge_spans([[5, 11]], [[10, 20]]) == \
- [[5, 20]]
- assert merge_spans([], []) == \
- []
- assert merge_spans([[9, 11]], []) == \
- [[9,11]]
- assert merge_spans([[2000, 2000]], [[1450, 1900]]) == \
- [[1450, 1900], [2000, 2000]]
+ assert merge_spans([[5, 10]], [[10, 20]]) == [[5, 20]]
+ assert merge_spans([[5, 9]], [[10, 20]]) == [[5, 20]]
+ assert merge_spans([[5, 11]], [[10, 20]]) == [[5, 20]]
+ assert merge_spans([], []) == []
+ assert merge_spans([[9, 11]], []) == [[9, 11]]
+ assert merge_spans([[2000, 2000]], [[1450, 1900]]) == [[1450, 1900], [2000, 2000]]
def unquote(s: str) -> str:
@@ -260,7 +260,7 @@ def unquote(s: str) -> str:
s = s[1:]
if s.endswith('"') or s.endswith("'"):
s = s[:-1]
- if s.endswith('.'):
+ if s.endswith("."):
s = s[:-1]
return s.strip()
@@ -283,6 +283,7 @@ def clean_str(s: Optional[str]) -> Optional[str]:
s = unquote(ftfy.fix_text(s))
return s or None
+
def test_clean_str():
assert clean_str("") is None
assert clean_str(" ") is None
@@ -290,7 +291,6 @@ def test_clean_str():
assert clean_str(" Bloody work.") == "Bloody work"
-
def clean_issn(s: str) -> Optional[str]:
s = s.strip().upper()
if len(s) == 8:
@@ -299,6 +299,7 @@ def clean_issn(s: str) -> Optional[str]:
return None
return s
+
def test_clean_issn():
assert clean_issn("1234-5678") == "1234-5678"
assert clean_issn(" 12345678") == "1234-5678"
diff --git a/tests/test_database.py b/tests/test_database.py
index 3d41e79..dc75d23 100644
--- a/tests/test_database.py
+++ b/tests/test_database.py
@@ -1,4 +1,3 @@
-
from chocula.database import IssnDatabase
@@ -6,10 +5,9 @@ def test_issn_database():
issn_db = IssnDatabase(issn_issnl_file_path="tests/files/ISSN-to-ISSN-L.txt")
- assert issn_db.issn2issnl('1234-5678') is None
- assert issn_db.issn2issnl('0000-0000') is None
+ assert issn_db.issn2issnl("1234-5678") is None
+ assert issn_db.issn2issnl("0000-0000") is None
# "The Lancet"
- assert issn_db.issn2issnl('0140-6736') == '0140-6736'
- assert issn_db.issn2issnl('1474-547X') == '0140-6736'
-
+ assert issn_db.issn2issnl("0140-6736") == "0140-6736"
+ assert issn_db.issn2issnl("1474-547X") == "0140-6736"
diff --git a/tests/test_directories.py b/tests/test_directories.py
index 90856bc..b366192 100644
--- a/tests/test_directories.py
+++ b/tests/test_directories.py
@@ -1,26 +1,29 @@
-
import pytest
from chocula import *
+
@pytest.fixture
def config():
config = ChoculaConfig.from_file(sources_dir="tests/files/")
return config
+
@pytest.fixture
def issn_db():
return IssnDatabase(issn_issnl_file_path="tests/files/ISSN-to-ISSN-L.txt")
+
@pytest.fixture
def database(issn_db):
db = ChoculaDatabase(db_file=":memory:", issn_db=issn_db)
db.init_db()
return db
+
def test_all(config, database):
for cls in ALL_CHOCULA_DIR_CLASSES:
loader = cls(config)
counts = loader.index_file(database)
- assert counts['total'] >= 20
- assert counts['inserted'] > 5
+ assert counts["total"] >= 20
+ assert counts["inserted"] > 5