about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- chocula/database.py 43
-rw-r--r-- chocula/directories/sim.py 2
-rw-r--r-- tests/files/ISSN-to-ISSN-L.txt 6
-rw-r--r-- tests/files/sim_master_title_metadata.csv 30
4 files changed, 58 insertions, 23 deletions
diff --git a/chocula/database.py b/chocula/database.py
index 3efa725..12ac824 100644
--- a/chocula/database.py
+++ b/chocula/database.py
@@ -280,16 +280,16 @@ class ChoculaDatabase():
"""
print("##### Parsing KBART file for {}...".format(name))
#publication_title print_identifier online_identifier date_first_issue_online num_first_vol_online num_first_issue_online date_last_issue_online num_last_vol_online num_last_issue_online title_url first_author title_id embargo_info coverage_depth coverage_notes publisher_name
- kbart_dict = dict()
+ kbart_dict: Dict[str, Any] = dict()
raw_file = open(path, 'rb').read().decode(errors='replace')
fixed_file = ftfy.fix_text(raw_file)
reader = csv.DictReader(fixed_file.split('\n'), delimiter='\t')
- counts = Counter()
+ counts: Counter = Counter()
for row in reader:
if not row['print_identifier'] and not row['online_identifier']:
counts['no-issn'] += 1
continue
- issnl, status = self.lookup_issnl(
+ issnl, status = self.issn_db.lookup_issnl(
issnp=row['print_identifier'],
issne=row['online_identifier'],
)
@@ -323,12 +323,12 @@ class ChoculaDatabase():
def load_homepage_status(self, config: ChoculaConfig) -> Counter:
print("##### Loading IA Homepage Crawl Results...")
- counts = Counter()
+ counts: Counter = Counter()
cur = self.db.cursor()
- for row in open(config.homepage_status.filepath, 'r'):
- if not row.strip():
+ for line in open(config.homepage_status.filepath, 'r'):
+ if not line.strip():
continue
- row = json.loads(row)
+ row = json.loads(line)
counts['total'] += 1
url = row['url']
assert(url)
@@ -356,12 +356,12 @@ class ChoculaDatabase():
print("##### Loading Fatcat Container Entities...")
# JSON
json_file = open(config.fatcat_containers.filepath, 'r')
- counts = Counter()
+ counts: Counter = Counter()
cur = self.db.cursor()
- for row in json_file:
- if not row:
+ for line in json_file:
+ if not line:
continue
- row = json.loads(row)
+ row = json.loads(line)
if row['state'] != 'active':
continue
counts['total'] += 1
@@ -408,19 +408,18 @@ class ChoculaDatabase():
print("##### Loading Fatcat Container Stats...")
# JSON
json_file = open(config.fatcat_stats.filepath, 'r')
- counts = Counter()
+ counts: Counter = Counter()
cur = self.db.cursor()
- for row in json_file:
- if not row:
+ for line in json_file:
+ if not line:
continue
- row = json.loads(row)
+ row = json.loads(line)
total = int(row['total'])
+ ia_frac: Optional[float] = None
+ preserved_frac: Optional[float] = None
if total > 0:
ia_frac = float(row['in_web'])/total
preserved_frac = float(row['is_preserved'])/total
- else:
- ia_frac = None
- preserved_frac = None
cur.execute("UPDATE fatcat_container SET release_count = ?, ia_count = ?, ia_frac = ?, preserved_count = ?, preserved_frac = ? WHERE issnl = ?",
(total, row['in_web'], ia_frac, row['is_preserved'], preserved_frac, row['issnl']))
counts['updated'] += 1
@@ -429,7 +428,7 @@ class ChoculaDatabase():
return counts
def export_urls(self) -> Counter:
- counts = Counter()
+ counts: Counter = Counter()
cur = self.db.cursor()
self.db.row_factory = sqlite3.Row
cur = self.db.execute("SELECT issnl, url FROM homepage;")
@@ -442,7 +441,7 @@ class ChoculaDatabase():
def summarize(self) -> Counter:
print("##### Summarizing Everything...")
- counts = Counter()
+ counts: Counter = Counter()
cur = self.db.cursor()
self.db.row_factory = sqlite3.Row
index_issnls = list(cur.execute('SELECT DISTINCT issnl FROM directory'))
@@ -582,7 +581,7 @@ class ChoculaDatabase():
for idx, col in enumerate(cursor.description):
d[col[0]] = row[idx]
return d
- counts = Counter()
+ counts: Counter = Counter()
self.db.row_factory = dict_factory
cur = self.db.cursor()
for row in cur.execute('SELECT * FROM journal'):
@@ -591,7 +590,7 @@ class ChoculaDatabase():
return counts
def export_fatcat(self):
- counts = Counter()
+ counts: Counter = Counter()
self.db.row_factory = sqlite3.Row
cur = self.db.cursor()
for row in cur.execute('SELECT * FROM journal WHERE valid_issnl = 1'):
diff --git a/chocula/directories/sim.py b/chocula/directories/sim.py
index c0c02df..ff5cce3 100644
--- a/chocula/directories/sim.py
+++ b/chocula/directories/sim.py
@@ -36,7 +36,7 @@ class SimLoader(DirectoryLoader):
"""
# TODO: 'Pub Type'
- extra = {}
+ extra: Dict[str, Any] = {}
first_year = row['First Volume']
if first_year:
first_year = int(first_year)
diff --git a/tests/files/ISSN-to-ISSN-L.txt b/tests/files/ISSN-to-ISSN-L.txt
index 9d79a2b..f44ea24 100644
--- a/tests/files/ISSN-to-ISSN-L.txt
+++ b/tests/files/ISSN-to-ISSN-L.txt
@@ -252,3 +252,9 @@ ISSN ISSN-L
0870-1164 0870-1164
0962-2519 0962-2519
1042-7147 1042-7147
+0047-4959 0047-4959
+0047-4959 0031-7233
+0001-0782 0001-0782
+0009-5532 0009-5532
+0888-8817 0888-8817
+0001-1452 0001-1452
diff --git a/tests/files/sim_master_title_metadata.csv b/tests/files/sim_master_title_metadata.csv
new file mode 100644
index 0000000..3eae289
--- /dev/null
+++ b/tests/files/sim_master_title_metadata.csv
@@ -0,0 +1,30 @@
+NA Pub Cat ID,Title,Publisher,ISSN,Impact Rank,Total Cities,Journal Impact Factor,Eigenfact or Score,First Volume,Last Volume,NA Gaps,"Scholarly / Peer-
+Reviewed","Peer-
+Reviewed",Pub Type,Pub Language,Subjects
+16,Publishers Weekly,"PWxyz, LLC",0000-0019,,,,,1872,2016,,N,N,Trade Journals,English,Library And Information Sciences|Publishing And Book Trade
+12688,Association for Computing Machinery. Communications of the ACM,Association for Computing Machinery,0001-0782,1280,"11,320",3.621,0.020290,1958,2007,,Y,Y,Scholarly Journals,English,Computers--Data Communications And Data Transmission Systems
+1600,American Institute of Aeronautics and Astronautics. AIAA Journal,American Institute of Aeronautics and Astronautics,0001-1452,6178,"12,214",1.207,0.013340,1963,2014,,Y,Y,Scholarly Journals,English,Aeronautics And Space Flight
+12576,American Institute of Chemical Engineers. AIChE Journal,American Institute of Chemical Engineers,0001-1541,2187,"19,462",2.748,0.018990,1955,2004,,Y,Y,Scholarly Journals,English,Engineering--Chemical Engineering
+1162,American Medical News,American Medical Association,0001-1843,,,,,1958,2013,,N,N,Trade Journals,English,Medical Sciences
+9805,Association of Operating Room Nurses. AORN Journal,Elsevier Limited,0001-2092,,,,,1963,2015,,Y,N,Scholarly Journals,English,Medical Sciences--Nurses And Nursing
+2254,Atlantic Provinces Library Association. APLA Bulletin,Atlantic Provinces Library Association,0001-2203,,,,,1936,2008,,N,N,Trade Journals,English,Education|Library And Information Sciences
+18916,International Journal of Clothing Science and Technology,Emerald Group Publishing Limited,0955-6222,10604,379,0.350,0.000250,1991,1995,,Y,Y,Scholarly Journals,English,Business And Economics--Production of Goods And Services
+7410,Modern maturity,AARP,0026-8046,,,,,1958,1994,,N,N,Scholarly Journals,English,Health & Medical Sciences
+10676,Bowhunting world.,Grandview Media Group,1043-5492,,,,,1962,1994,,N,?,Magazines,English,General Interest
+1299,Manufacturing.,Institution of Engineering and,0956-9944,,,,,1842,1842,,N,?,Trade Journals,English,Engineering & Technology
+11913,American University Law Review,American University Law Review,0003-1453,,,,,1978,2011,,Y,N,Scholarly Journals,English,Law
+5289,The Congregational magazine.,Out-of-copyright,NULL,,,,,1979,1979,,N,?,Historical Journals,English,Historical Periodical
+8027,Scholastic Art,Scholastic Inc.,1060-832X,,,,,1970,2007,,N,N,Magazines,English,Art|Education|Humanities: Comprehensive Works
+5302,Herald of freedom,Out-of-copyright,NULL,,,,,1973,2006,1974; ,N,?,Historical Journals,English,Historical Periodical
+4790,Jacobite's journal,Open Court Publishing Co,NULL,,,,,1792,1792,,N,?,Historical Journals,English,History
+1163,Quarterly cumulative index medicus.,Superintendent of Government Documents,NULL,,,,,1927,1956,,N,N,Government Documents,English,Communication & Information Sciences
+7529,Christian news from Israel,Ministry Of Religious Affairs,0009-5532,,,,,1972,1982,1978; ,N,N,Magazines,English,Philosophy & Religion
+51047,Journal of Organizational Behavior Management,Taylor & Francis Ltd.,0160-8061,9861,374,0.486,0.000260,1977,2014,1989; 1991; ,Y,Y,Scholarly Journals,English,Psychology
+19119,Health Economics,Wiley Periodicals Inc.,1057-9230,3118,"4,397",2.227,0.011260,1992,2015,,Y,Y,Scholarly Journals,English,Business And Economics--Economic Situation And Conditions|Health Facilities And Administration|Medical Sciences|Public Health And Safety
+8374,Philadelphia Magazine,"Municipal Publications, Ltd.",0031-7233,,,,,1973,2015,,N,N,Magazines,English,General Interest Periodicals--United States
+8540,Quarterly bulletin of the National Library of South Africa.,South African Library,1562-9392,,,,,1987,2015,,N,?,Trade Journals,English,Language & Literature
+49501,Forensic engineering.,ELSEVIER LTD.,0888-8817,,,,,1987,1991,1989; ,Y,Y,Scholarly Journals,English,Engineering & Technology
+8634,Locus,Locus Publications,0047-4959,,,,,1980,2003,1997; 1999; 2001; ,N,?,Magazines,English,Language & Literature
+9647,Broiler industry.,Watt Publishing Co,0007-2176,,,,,1967,1975,,N,?,Trade Journals,English,Agriculture
+4699,American Sunday-school teachers' magazine and journal of education,Open Court Publishing Co,NULL,,,,,1850,1851,,N,?,Historical Journals,English,Education
+3528,Key,Open Court Publishing Co,NULL,,,,,1801,1802,,N,?,Historical Journals,English,Historical Periodical