diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-06-01 17:13:32 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-06-01 17:13:32 -0700 |
commit | d1283d04464bc80711db6b10db22c7041adc3dad (patch) | |
tree | 024adbd5a8cb30291fe5773b5e8ffaa9e10437f5 | |
parent | 57db2db336c08031324e44b2d2880fbd4b6893c9 (diff) | |
download | chocula-d1283d04464bc80711db6b10db22c7041adc3dad.tar.gz chocula-d1283d04464bc80711db6b10db22c7041adc3dad.zip |
fix tests and type annotations
-rw-r--r-- | chocula/database.py | 43 | ||||
-rw-r--r-- | chocula/directories/sim.py | 2 | ||||
-rw-r--r-- | tests/files/ISSN-to-ISSN-L.txt | 6 | ||||
-rw-r--r-- | tests/files/sim_master_title_metadata.csv | 30 |
4 files changed, 58 insertions, 23 deletions
diff --git a/chocula/database.py b/chocula/database.py index 3efa725..12ac824 100644 --- a/chocula/database.py +++ b/chocula/database.py @@ -280,16 +280,16 @@ class ChoculaDatabase(): """ print("##### Parsing KBART file for {}...".format(name)) #publication_title print_identifier online_identifier date_first_issue_online num_first_vol_online num_first_issue_online date_last_issue_online num_last_vol_online num_last_issue_online title_url first_author title_id embargo_info coverage_depth coverage_notes publisher_name - kbart_dict = dict() + kbart_dict: Dict[str, Any] = dict() raw_file = open(path, 'rb').read().decode(errors='replace') fixed_file = ftfy.fix_text(raw_file) reader = csv.DictReader(fixed_file.split('\n'), delimiter='\t') - counts = Counter() + counts: Counter = Counter() for row in reader: if not row['print_identifier'] and not row['online_identifier']: counts['no-issn'] += 1 continue - issnl, status = self.lookup_issnl( + issnl, status = self.issn_db.lookup_issnl( issnp=row['print_identifier'], issne=row['online_identifier'], ) @@ -323,12 +323,12 @@ class ChoculaDatabase(): def load_homepage_status(self, config: ChoculaConfig) -> Counter: print("##### Loading IA Homepage Crawl Results...") - counts = Counter() + counts: Counter = Counter() cur = self.db.cursor() - for row in open(config.homepage_status.filepath, 'r'): - if not row.strip(): + for line in open(config.homepage_status.filepath, 'r'): + if not line.strip(): continue - row = json.loads(row) + row = json.loads(line) counts['total'] += 1 url = row['url'] assert(url) @@ -356,12 +356,12 @@ class ChoculaDatabase(): print("##### Loading Fatcat Container Entities...") # JSON json_file = open(config.fatcat_containers.filepath, 'r') - counts = Counter() + counts: Counter = Counter() cur = self.db.cursor() - for row in json_file: - if not row: + for line in json_file: + if not line: continue - row = json.loads(row) + row = json.loads(line) if row['state'] != 'active': continue counts['total'] += 1 @@ -408,19 +408,18 @@ class ChoculaDatabase(): print("##### Loading Fatcat Container Stats...") # JSON json_file = open(config.fatcat_stats.filepath, 'r') - counts = Counter() + counts: Counter = Counter() cur = self.db.cursor() - for row in json_file: - if not row: + for line in json_file: + if not line: continue - row = json.loads(row) + row = json.loads(line) total = int(row['total']) + ia_frac: Optional[float] = None + preserved_frac: Optional[float] = None if total > 0: ia_frac = float(row['in_web'])/total preserved_frac = float(row['is_preserved'])/total - else: - ia_frac = None - preserved_frac = None cur.execute("UPDATE fatcat_container SET release_count = ?, ia_count = ?, ia_frac = ?, preserved_count = ?, preserved_frac = ? WHERE issnl = ?", (total, row['in_web'], ia_frac, row['is_preserved'], preserved_frac, row['issnl'])) counts['updated'] += 1 @@ -429,7 +428,7 @@ class ChoculaDatabase(): return counts def export_urls(self) -> Counter: - counts = Counter() + counts: Counter = Counter() cur = self.db.cursor() self.db.row_factory = sqlite3.Row cur = self.db.execute("SELECT issnl, url FROM homepage;") @@ -442,7 +441,7 @@ class ChoculaDatabase(): def summarize(self) -> Counter: print("##### Summarizing Everything...") - counts = Counter() + counts: Counter = Counter() cur = self.db.cursor() self.db.row_factory = sqlite3.Row index_issnls = list(cur.execute('SELECT DISTINCT issnl FROM directory')) @@ -582,7 +581,7 @@ class ChoculaDatabase(): for idx, col in enumerate(cursor.description): d[col[0]] = row[idx] return d - counts = Counter() + counts: Counter = Counter() self.db.row_factory = dict_factory cur = self.db.cursor() for row in cur.execute('SELECT * FROM journal'): @@ -591,7 +590,7 @@ class ChoculaDatabase(): return counts def export_fatcat(self): - counts = Counter() + counts: Counter = Counter() self.db.row_factory = sqlite3.Row cur = self.db.cursor() for row in cur.execute('SELECT * FROM journal WHERE valid_issnl = 1'): diff --git a/chocula/directories/sim.py b/chocula/directories/sim.py index c0c02df..ff5cce3 100644 --- a/chocula/directories/sim.py +++ b/chocula/directories/sim.py @@ -36,7 +36,7 @@ class SimLoader(DirectoryLoader): """ # TODO: 'Pub Type' - extra = {} + extra: Dict[str, Any] = {} first_year = row['First Volume'] if first_year: first_year = int(first_year) diff --git a/tests/files/ISSN-to-ISSN-L.txt b/tests/files/ISSN-to-ISSN-L.txt index 9d79a2b..f44ea24 100644 --- a/tests/files/ISSN-to-ISSN-L.txt +++ b/tests/files/ISSN-to-ISSN-L.txt @@ -252,3 +252,9 @@ ISSN ISSN-L 0870-1164 0870-1164 0962-2519 0962-2519 1042-7147 1042-7147 +0047-4959 0047-4959 +0047-4959 0031-7233 +0001-0782 0001-0782 +0009-5532 0009-5532 +0888-8817 0888-8817 +0001-1452 0001-1452 diff --git a/tests/files/sim_master_title_metadata.csv b/tests/files/sim_master_title_metadata.csv new file mode 100644 index 0000000..3eae289 --- /dev/null +++ b/tests/files/sim_master_title_metadata.csv @@ -0,0 +1,30 @@ +NA Pub Cat ID,Title,Publisher,ISSN,Impact Rank,Total Cities,Journal Impact Factor,Eigenfact or Score,First Volume,Last Volume,NA Gaps,"Scholarly / Peer- +Reviewed","Peer- +Reviewed",Pub Type,Pub Language,Subjects +16,Publishers Weekly,"PWxyz, LLC",0000-0019,,,,,1872,2016,,N,N,Trade Journals,English,Library And Information Sciences|Publishing And Book Trade +12688,Association for Computing Machinery. Communications of the ACM,Association for Computing Machinery,0001-0782,1280,"11,320",3.621,0.020290,1958,2007,,Y,Y,Scholarly Journals,English,Computers--Data Communications And Data Transmission Systems +1600,American Institute of Aeronautics and Astronautics. AIAA Journal,American Institute of Aeronautics and Astronautics,0001-1452,6178,"12,214",1.207,0.013340,1963,2014,,Y,Y,Scholarly Journals,English,Aeronautics And Space Flight +12576,American Institute of Chemical Engineers. AIChE Journal,American Institute of Chemical Engineers,0001-1541,2187,"19,462",2.748,0.018990,1955,2004,,Y,Y,Scholarly Journals,English,Engineering--Chemical Engineering +1162,American Medical News,American Medical Association,0001-1843,,,,,1958,2013,,N,N,Trade Journals,English,Medical Sciences +9805,Association of Operating Room Nurses. AORN Journal,Elsevier Limited,0001-2092,,,,,1963,2015,,Y,N,Scholarly Journals,English,Medical Sciences--Nurses And Nursing +2254,Atlantic Provinces Library Association. APLA Bulletin,Atlantic Provinces Library Association,0001-2203,,,,,1936,2008,,N,N,Trade Journals,English,Education|Library And Information Sciences +18916,International Journal of Clothing Science and Technology,Emerald Group Publishing Limited,0955-6222,10604,379,0.350,0.000250,1991,1995,,Y,Y,Scholarly Journals,English,Business And Economics--Production of Goods And Services +7410,Modern maturity,AARP,0026-8046,,,,,1958,1994,,N,N,Scholarly Journals,English,Health & Medical Sciences +10676,Bowhunting world.,Grandview Media Group,1043-5492,,,,,1962,1994,,N,?,Magazines,English,General Interest +1299,Manufacturing.,Institution of Engineering and,0956-9944,,,,,1842,1842,,N,?,Trade Journals,English,Engineering & Technology +11913,American University Law Review,American University Law Review,0003-1453,,,,,1978,2011,,Y,N,Scholarly Journals,English,Law +5289,The Congregational magazine.,Out-of-copyright,NULL,,,,,1979,1979,,N,?,Historical Journals,English,Historical Periodical +8027,Scholastic Art,Scholastic Inc.,1060-832X,,,,,1970,2007,,N,N,Magazines,English,Art|Education|Humanities: Comprehensive Works +5302,Herald of freedom,Out-of-copyright,NULL,,,,,1973,2006,1974; ,N,?,Historical Journals,English,Historical Periodical +4790,Jacobite's journal,Open Court Publishing Co,NULL,,,,,1792,1792,,N,?,Historical Journals,English,History +1163,Quarterly cumulative index medicus.,Superintendent of Government Documents,NULL,,,,,1927,1956,,N,N,Government Documents,English,Communication & Information Sciences +7529,Christian news from Israel,Ministry Of Religious Affairs,0009-5532,,,,,1972,1982,1978; ,N,N,Magazines,English,Philosophy & Religion +51047,Journal of Organizational Behavior Management,Taylor & Francis Ltd.,0160-8061,9861,374,0.486,0.000260,1977,2014,1989; 1991; ,Y,Y,Scholarly Journals,English,Psychology +19119,Health Economics,Wiley Periodicals Inc.,1057-9230,3118,"4,397",2.227,0.011260,1992,2015,,Y,Y,Scholarly Journals,English,Business And Economics--Economic Situation And Conditions|Health Facilities And Administration|Medical Sciences|Public Health And Safety +8374,Philadelphia Magazine,"Municipal Publications, Ltd.",0031-7233,,,,,1973,2015,,N,N,Magazines,English,General Interest Periodicals--United States +8540,Quarterly bulletin of the National Library of South Africa.,South African Library,1562-9392,,,,,1987,2015,,N,?,Trade Journals,English,Language & Literature +49501,Forensic engineering.,ELSEVIER LTD.,0888-8817,,,,,1987,1991,1989; ,Y,Y,Scholarly Journals,English,Engineering & Technology +8634,Locus,Locus Publications,0047-4959,,,,,1980,2003,1997; 1999; 2001; ,N,?,Magazines,English,Language & Literature +9647,Broiler industry.,Watt Publishing Co,0007-2176,,,,,1967,1975,,N,?,Trade Journals,English,Agriculture +4699,American Sunday-school teachers' magazine and journal of education,Open Court Publishing Co,NULL,,,,,1850,1851,,N,?,Historical Journals,English,Education +3528,Key,Open Court Publishing Co,NULL,,,,,1801,1802,,N,?,Historical Journals,English,Historical Periodical |