From f7c04ce03335c24da4cfe44a1bcbc1f8ea14784c Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 21 Jun 2018 10:19:50 -0700 Subject: ISSN importer --- python/client.py | 13 ++++++ python/fatcat/importer_common.py | 11 ++++- python/fatcat/issn_importer.py | 51 ++++++++++++++++++++++ python/tests/files/journal_extra_metadata.snip.csv | 10 +++++ python/tests/issn.py | 17 ++++++++ 5 files changed, 101 insertions(+), 1 deletion(-) create mode 100644 python/fatcat/issn_importer.py create mode 100644 python/tests/files/journal_extra_metadata.snip.csv create mode 100644 python/tests/issn.py diff --git a/python/client.py b/python/client.py index ca6af603..4f6d3ccc 100755 --- a/python/client.py +++ b/python/client.py @@ -18,6 +18,10 @@ def run_import_orcid(args): foi = FatcatOrcidImporter(args.host_url) foi.process_batch(args.json_file, size=args.batch_size) +def run_import_issn(args): + fii = FatcatIssnImporter(args.host_url) + fii.process_batch(args.csv_file, size=args.batch_size) + def run_import_manifest(args): fmi = FatcatManifestImporter(args.host_url) fmi.process_db(args.db_path, size=args.batch_size) @@ -55,6 +59,15 @@ def main(): help="size of batch to send", default=50, type=int) + sub_import_issn = subparsers.add_parser('import-issn') + sub_import_issn.set_defaults(func=run_import_issn) + sub_import_issn.add_argument('csv_file', + help="Journal ISSN CSV metadata file to import from (or stdin)", + default=sys.stdin, type=argparse.FileType('r')) + sub_import_issn.add_argument('--batch-size', + help="size of batch to send", + default=50, type=int) + sub_import_manifest = subparsers.add_parser('import-manifest') sub_import_manifest.set_defaults(func=run_import_manifest) sub_import_manifest.add_argument('db_path', diff --git a/python/fatcat/importer_common.py b/python/fatcat/importer_common.py index f8638418..9ec64e8f 100644 --- a/python/fatcat/importer_common.py +++ b/python/fatcat/importer_common.py @@ -1,5 +1,6 @@ import sys +import csv import json import itertools import fatcat_client @@ -25,7 +26,7 @@ class FatcatImporter: self.read_issn_map_file(issn_map_file) def process_source(self, source, group_size=100): - """Creates and auto-accepts editgropu every group_size rows""" + """Creates and auto-accepts editgroup every group_size rows""" eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1)) for i, row in enumerate(source): self.create_row(row, editgroup_id=eg.id) @@ -42,6 +43,14 @@ class FatcatImporter: self.create_batch(rows, eg.id) self.api.accept_editgroup(eg.id) + def process_csv_source(self, source, group_size=100, delimiter=','): + reader = csv.DictReader(source, delimiter=delimiter) + self.process_source(reader, group_size) + + def process_csv_batch(self, source, size=50, delimiter=','): + reader = csv.DictReader(source, delimiter=delimiter) + self.process_batch(reader, size) + def lookup_issnl(self, issnl): """Caches calls to the ISSN-L lookup API endpoint in a local dict""" assert len(issnl) == 9 and issnl[4] == '-' diff --git a/python/fatcat/issn_importer.py b/python/fatcat/issn_importer.py new file mode 100644 index 00000000..9055bdc2 --- /dev/null +++ b/python/fatcat/issn_importer.py @@ -0,0 +1,51 @@ + +import sys +import json +import itertools +import fatcat_client +from fatcat.importer_common import FatcatImporter + +# CSV format (generated from git.archive.org/webgroup/oa-journal-analysis): +# ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count + +class FatcatIssnImporter(FatcatImporter): + + def parse_issn_row(self, row): + """ + row is a python dict (parsed from CSV). + returns a ContainerEntity + """ + extra = dict( + in_doaj=row['in_doaj'], + in_road=row['in_road'], + language=row['lang'], + url=row['url'], + ISSNp=row['ISSN-print'], + ISSNe=row['ISSN-electronic'], + is_oa=row['is_oa'], + is_kept=row['is_kept'], + ) + ce = fatcat_client.ContainerEntity( + issnl=row['ISSN-L'], + name=row['title'], + publisher=row['publisher'], + abbrev=None, + coden=None, + extra=extra) + print(ce) + return ce + + def create_row(self, row, editgroup_id=None): + ce = self.parse_issn_row(row) + if ce is not None: + ce.editgroup_id = editgroup_id + self.api.create_container(ce) + + def create_batch(self, batch, editgroup_id=None): + """Reads and processes in batches (not API-call-per-line)""" + objects = [self.parse_issn_row(l) + for l in batch if l != None] + objects = [o for o in objects if o != None] + for o in objects: + o.editgroup_id = editgroup_id + self.api.create_container_batch(objects) diff --git a/python/tests/files/journal_extra_metadata.snip.csv b/python/tests/files/journal_extra_metadata.snip.csv new file mode 100644 index 00000000..8cc50ee9 --- /dev/null +++ b/python/tests/files/journal_extra_metadata.snip.csv @@ -0,0 +1,10 @@ +ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count +0000-0019,False,False,True,False,Publishers weekly,,http://www.publishersweekly.com/,,0000-0019,2150-4008,0.0,False,False,False,,http://www.publishersweekly.com/,301.0,200.0,https://www.publishersweekly.com/,3xx,200,publishersweekly.com,1055.0 +0001-0782,False,False,True,True,Communications of the ACM,Association for Computing Machinery,http://www.acm.org/pubs/cacm/,,0001-0782,1557-7317,11894.0,True,False,True,55.0,http://www.acm.org/pubs/cacm/,301.0,200.0,https://cacm.acm.org/,3xx,200,acm.org,9.0 +0001-1452,False,False,True,True,AIAA Journal,American Institute of Aeronautics and Astronautics,http://www.aiaa.org/content.cfm?pageid=322&lupubid=2,,0001-1452,1533-385X,24193.0,True,False,True,6.0,http://www.aiaa.org/content.cfm?pageid=322&lupubid=2,404.0,404.0,http://www.aiaa.org/content.cfm?pageid=322&lupubid=2,404,404,aiaa.org,16.0 +0001-1541,False,False,True,True,AIChE Journal,Wiley Blackwell (John Wiley & Sons),http://www.aiche.org/Publications/AIChEJournal/index.aspx,,0001-1541,1547-5905,15860.0,True,False,True,607.0,http://www.aiche.org/Publications/AIChEJournal/index.aspx,301.0,200.0,https://www.aiche.org/publications/journals/aiche-journal,3xx,200,aiche.org,25.0 +0001-2092,False,False,True,True,AORN Journal,Wiley Blackwell (John Wiley & Sons),http://www.aorn.org/AORNJournal/,,0001-2092,1878-0369,12413.0,True,False,True,607.0,http://www.aorn.org/AORNJournal/,301.0,200.0,https://www.aorn.org/aorn-journal,3xx,200,aorn.org,0.0 +0001-2343,False,False,True,True,Archiv fuer Rechts- und Sozialphilosphie,Franz Steiner Verlag GmbH,http://www.steiner-verlag.de/programm/zeitschriften/archiv-fuer-rechts-und-sozialphilosophie/,,0001-2343,2363-5614,14.0,True,False,False,2.0,http://www.steiner-verlag.de/programm/zeitschriften/archiv-fuer-rechts-und-sozialphilosophie/,200.0,200.0,http://www.steiner-verlag.de/programm/zeitschriften/archiv-fuer-rechts-und-sozialphilosophie/,200,200,steiner-verlag.de,226.0 +0001-2351,False,False,True,True,Transactions of the ASAE,American Society of Agricultural and Biological Engineers,,,0001-2351,2151-0059,11515.0,True,False,True,3.0,,,,,,,,0.0 +0001-2491,False,False,True,False,ASHRAE Journal,,http://www.ashrae.org/,,0001-2491,1943-6637,0.0,False,False,False,,http://www.ashrae.org/,301.0,200.0,https://www.ashrae.org/,3xx,200,ashrae.org,684.0 +0001-2505,False,False,True,False,ASHRAE Transactions,,http://www.ashrae.org/template/AssetDetail/assetid/25903,,0001-2505,,0.0,False,False,False,,http://www.ashrae.org/template/AssetDetail/assetid/25903,301.0,404.0,https://www.ashrae.org/template/AssetDetail/assetid/25903,3xx,404,ashrae.org,684.0 diff --git a/python/tests/issn.py b/python/tests/issn.py new file mode 100644 index 00000000..fff112f7 --- /dev/null +++ b/python/tests/issn.py @@ -0,0 +1,17 @@ + +import pytest +from fatcat.issn_importer import FatcatIssnImporter + + +@pytest.fixture(scope="function") +def issn_importer(): + yield FatcatIssnImporter("http://localhost:9411/v0") + +# TODO: use API to check that entities actually created... +def test_issn_importer_batch(issn_importer): + with open('tests/files/journal_extra_metadata.snip.csv', 'r') as f: + issn_importer.process_csv_batch(f) + +def test_issn_importer(issn_importer): + with open('tests/files/journal_extra_metadata.snip.csv', 'r') as f: + issn_importer.process_csv_source(f) -- cgit v1.2.3