From 279b22e30d9b590838268f5f5acdaa1110ee593a Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 13 Nov 2018 11:32:41 -0800 Subject: shuffle around fatcat_tools layout --- python/fatcat_tools/importers/issn.py | 72 +++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 python/fatcat_tools/importers/issn.py (limited to 'python/fatcat_tools/importers/issn.py') diff --git a/python/fatcat_tools/importers/issn.py b/python/fatcat_tools/importers/issn.py new file mode 100644 index 00000000..d7fb9082 --- /dev/null +++ b/python/fatcat_tools/importers/issn.py @@ -0,0 +1,72 @@ + +import sys +import json +import itertools +import fatcat_client +from fatcat_tools.importers.common import FatcatImporter + +# CSV format (generated from git.archive.org/webgroup/oa-journal-analysis): +# ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count + +def or_none(s): + if s is None: + return None + if len(s) == 0: + return None + return s + +def truthy(s): + if s is None: + return None + s = s.lower() + if s in ('true', 't', 'yes', 'y', '1'): + return True + elif s in ('false', 'f', 'no', 'n', '0'): + return False + else: + return None + +class FatcatIssnImporter(FatcatImporter): + + def parse_issn_row(self, row): + """ + row is a python dict (parsed from CSV). + returns a ContainerEntity + """ + title = or_none(row['title']) + issnl = or_none(row['ISSN-L']) + if title is None or issnl is None: + return + extra = dict( + in_doaj=truthy(row['in_doaj']), + in_road=truthy(row['in_road']), + in_norwegian=truthy(row['in_norwegian']), + language=or_none(row['lang']), + url=or_none(row['url']), + ISSNp=or_none(row['ISSN-print']), + ISSNe=or_none(row['ISSN-electronic']), + is_oa=truthy(row['is_oa']), + is_kept=truthy(row['is_kept']), + ) + ce = fatcat_client.ContainerEntity( + issnl=issnl, + name=title, + publisher=or_none(row['publisher']), + abbrev=None, + coden=None, + extra=extra) + return ce + + def create_row(self, row, editgroup=None): + ce = self.parse_issn_row(row) + if ce is not None: + self.api.create_container(ce, editgroup=editgroup) + self.insert_count = self.insert_count + 1 + + def create_batch(self, batch, editgroup=None): + """Reads and processes in batches (not API-call-per-line)""" + objects = [self.parse_issn_row(l) + for l in batch if l != None] + objects = [o for o in objects if o != None] + self.api.create_container_batch(objects, autoaccept="true", editgroup=editgroup) + self.insert_count = self.insert_count + len(objects) -- cgit v1.2.3