summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2018-06-21 10:19:50 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2018-06-21 10:19:50 -0700
commit: f7c04ce03335c24da4cfe44a1bcbc1f8ea14784c (patch)
tree: 4a40303efab7fd01109adcb3e6ac3440c9f1b4fe
parent: a1d94e6c28b080158fd65d0ec54ff6d64451df97 (diff)
download: fatcat-f7c04ce03335c24da4cfe44a1bcbc1f8ea14784c.tar.gz
fatcat-f7c04ce03335c24da4cfe44a1bcbc1f8ea14784c.zip
ISSN importer
-rwxr-xr-x python/client.py 13
-rw-r--r-- python/fatcat/importer_common.py 11
-rw-r--r-- python/fatcat/issn_importer.py 51
-rw-r--r-- python/tests/files/journal_extra_metadata.snip.csv 10
-rw-r--r-- python/tests/issn.py 17
5 files changed, 101 insertions, 1 deletions
diff --git a/python/client.py b/python/client.py
index ca6af603..4f6d3ccc 100755
--- a/python/client.py
+++ b/python/client.py
@@ -18,6 +18,10 @@ def run_import_orcid(args):
foi = FatcatOrcidImporter(args.host_url)
foi.process_batch(args.json_file, size=args.batch_size)
+def run_import_issn(args):
+    """Import journal (container) metadata from an ISSN CSV file.
+
+    Uses args.host_url, args.csv_file (an open text file) and
+    args.batch_size, as set up by the 'import-issn' sub-command.
+    """
+    # Imported locally: this change does not add a module-level import of
+    # the new importer to client.py, so the name would otherwise be undefined.
+    from fatcat.issn_importer import FatcatIssnImporter
+    fii = FatcatIssnImporter(args.host_url)
+    # Use the CSV entry point so rows are parsed by csv.DictReader;
+    # parse_issn_row() looks fields up by column name, which cannot work on
+    # the raw text lines obtained by iterating the file directly.
+    fii.process_csv_batch(args.csv_file, size=args.batch_size)
+
def run_import_manifest(args):
fmi = FatcatManifestImporter(args.host_url)
fmi.process_db(args.db_path, size=args.batch_size)
@@ -55,6 +59,15 @@ def main():
help="size of batch to send",
default=50, type=int)
+ sub_import_issn = subparsers.add_parser('import-issn')
+ sub_import_issn.set_defaults(func=run_import_issn)
+ sub_import_issn.add_argument('csv_file',
+ help="Journal ISSN CSV metadata file to import from (or stdin)",
+ default=sys.stdin, type=argparse.FileType('r'))
+ sub_import_issn.add_argument('--batch-size',
+ help="size of batch to send",
+ default=50, type=int)
+
sub_import_manifest = subparsers.add_parser('import-manifest')
sub_import_manifest.set_defaults(func=run_import_manifest)
sub_import_manifest.add_argument('db_path',
diff --git a/python/fatcat/importer_common.py b/python/fatcat/importer_common.py
index f8638418..9ec64e8f 100644
--- a/python/fatcat/importer_common.py
+++ b/python/fatcat/importer_common.py
@@ -1,5 +1,6 @@
import sys
+import csv
import json
import itertools
import fatcat_client
@@ -25,7 +26,7 @@ class FatcatImporter:
self.read_issn_map_file(issn_map_file)
def process_source(self, source, group_size=100):
- """Creates and auto-accepts editgropu every group_size rows"""
+ """Creates and auto-accepts editgroup every group_size rows"""
eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1))
for i, row in enumerate(source):
self.create_row(row, editgroup_id=eg.id)
@@ -42,6 +43,14 @@ class FatcatImporter:
self.create_batch(rows, eg.id)
self.api.accept_editgroup(eg.id)
+ def process_csv_source(self, source, group_size=100, delimiter=','):
+     """Parse source as CSV and hand the resulting row dicts to process_source().
+
+     source is wrapped in csv.DictReader (the first row supplies the keys);
+     group_size is passed through to process_source() unchanged.
+     """
+     self.process_source(
+         csv.DictReader(source, delimiter=delimiter), group_size)
+
+ def process_csv_batch(self, source, size=50, delimiter=','):
+     """Parse source as CSV and hand the resulting row dicts to process_batch().
+
+     source is wrapped in csv.DictReader (the first row supplies the keys);
+     size is passed through to process_batch() unchanged.
+     """
+     self.process_batch(
+         csv.DictReader(source, delimiter=delimiter), size)
+
def lookup_issnl(self, issnl):
"""Caches calls to the ISSN-L lookup API endpoint in a local dict"""
assert len(issnl) == 9 and issnl[4] == '-'
diff --git a/python/fatcat/issn_importer.py b/python/fatcat/issn_importer.py
new file mode 100644
index 00000000..9055bdc2
--- /dev/null
+++ b/python/fatcat/issn_importer.py
@@ -0,0 +1,51 @@
+
+import sys
+import json
+import itertools
+import fatcat_client
+from fatcat.importer_common import FatcatImporter
+
+# CSV format (generated from git.archive.org/webgroup/oa-journal-analysis):
+# ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count
+
+class FatcatIssnImporter(FatcatImporter):
+    """Importer that creates container (journal) entities from ISSN CSV rows.
+
+    Each row is a dict keyed by the CSV column names listed in the comment
+    above this class.
+    """
+
+    def parse_issn_row(self, row):
+        """
+        row is a python dict (parsed from CSV).
+        returns a ContainerEntity
+
+        Field values are copied from the row unchanged. A column missing
+        from a dict row raises KeyError.
+        """
+        # Columns without a dedicated ContainerEntity argument are carried
+        # in the free-form 'extra' dict.
+        extra = dict(
+            in_doaj=row['in_doaj'],
+            in_road=row['in_road'],
+            language=row['lang'],
+            url=row['url'],
+            ISSNp=row['ISSN-print'],
+            ISSNe=row['ISSN-electronic'],
+            is_oa=row['is_oa'],
+            is_kept=row['is_kept'],
+        )
+        # No debugging print of the entity here: this runs once per input
+        # row and would flood stdout during a bulk import.
+        return fatcat_client.ContainerEntity(
+            issnl=row['ISSN-L'],
+            name=row['title'],
+            publisher=row['publisher'],
+            abbrev=None,
+            coden=None,
+            extra=extra)
+
+    def create_row(self, row, editgroup_id=None):
+        """Parse one row and create the container with a single API call."""
+        ce = self.parse_issn_row(row)
+        if ce is not None:
+            ce.editgroup_id = editgroup_id
+            self.api.create_container(ce)
+
+    def create_batch(self, batch, editgroup_id=None):
+        """Reads and processes in batches (not API-call-per-line)"""
+        # None entries in the batch are skipped (presumably padding added by
+        # the batching helper in the base class -- confirm there).
+        objects = [self.parse_issn_row(row)
+                   for row in batch if row is not None]
+        objects = [o for o in objects if o is not None]
+        for o in objects:
+            o.editgroup_id = editgroup_id
+        self.api.create_container_batch(objects)
diff --git a/python/tests/files/journal_extra_metadata.snip.csv b/python/tests/files/journal_extra_metadata.snip.csv
new file mode 100644
index 00000000..8cc50ee9
--- /dev/null
+++ b/python/tests/files/journal_extra_metadata.snip.csv
@@ -0,0 +1,10 @@
+ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count
+0000-0019,False,False,True,False,Publishers weekly,,http://www.publishersweekly.com/,,0000-0019,2150-4008,0.0,False,False,False,,http://www.publishersweekly.com/,301.0,200.0,https://www.publishersweekly.com/,3xx,200,publishersweekly.com,1055.0
+0001-0782,False,False,True,True,Communications of the ACM,Association for Computing Machinery,http://www.acm.org/pubs/cacm/,,0001-0782,1557-7317,11894.0,True,False,True,55.0,http://www.acm.org/pubs/cacm/,301.0,200.0,https://cacm.acm.org/,3xx,200,acm.org,9.0
+0001-1452,False,False,True,True,AIAA Journal,American Institute of Aeronautics and Astronautics,http://www.aiaa.org/content.cfm?pageid=322&lupubid=2,,0001-1452,1533-385X,24193.0,True,False,True,6.0,http://www.aiaa.org/content.cfm?pageid=322&lupubid=2,404.0,404.0,http://www.aiaa.org/content.cfm?pageid=322&lupubid=2,404,404,aiaa.org,16.0
+0001-1541,False,False,True,True,AIChE Journal,Wiley Blackwell (John Wiley & Sons),http://www.aiche.org/Publications/AIChEJournal/index.aspx,,0001-1541,1547-5905,15860.0,True,False,True,607.0,http://www.aiche.org/Publications/AIChEJournal/index.aspx,301.0,200.0,https://www.aiche.org/publications/journals/aiche-journal,3xx,200,aiche.org,25.0
+0001-2092,False,False,True,True,AORN Journal,Wiley Blackwell (John Wiley & Sons),http://www.aorn.org/AORNJournal/,,0001-2092,1878-0369,12413.0,True,False,True,607.0,http://www.aorn.org/AORNJournal/,301.0,200.0,https://www.aorn.org/aorn-journal,3xx,200,aorn.org,0.0
+0001-2343,False,False,True,True,Archiv fuer Rechts- und Sozialphilosphie,Franz Steiner Verlag GmbH,http://www.steiner-verlag.de/programm/zeitschriften/archiv-fuer-rechts-und-sozialphilosophie/,,0001-2343,2363-5614,14.0,True,False,False,2.0,http://www.steiner-verlag.de/programm/zeitschriften/archiv-fuer-rechts-und-sozialphilosophie/,200.0,200.0,http://www.steiner-verlag.de/programm/zeitschriften/archiv-fuer-rechts-und-sozialphilosophie/,200,200,steiner-verlag.de,226.0
+0001-2351,False,False,True,True,Transactions of the ASAE,American Society of Agricultural and Biological Engineers,,,0001-2351,2151-0059,11515.0,True,False,True,3.0,,,,,,,,0.0
+0001-2491,False,False,True,False,ASHRAE Journal,,http://www.ashrae.org/,,0001-2491,1943-6637,0.0,False,False,False,,http://www.ashrae.org/,301.0,200.0,https://www.ashrae.org/,3xx,200,ashrae.org,684.0
+0001-2505,False,False,True,False,ASHRAE Transactions,,http://www.ashrae.org/template/AssetDetail/assetid/25903,,0001-2505,,0.0,False,False,False,,http://www.ashrae.org/template/AssetDetail/assetid/25903,301.0,404.0,https://www.ashrae.org/template/AssetDetail/assetid/25903,3xx,404,ashrae.org,684.0
diff --git a/python/tests/issn.py b/python/tests/issn.py
new file mode 100644
index 00000000..fff112f7
--- /dev/null
+++ b/python/tests/issn.py
@@ -0,0 +1,17 @@
+
+import pytest
+from fatcat.issn_importer import FatcatIssnImporter
+
+
+@pytest.fixture(scope="function")
+def issn_importer():
+    """Provide a fresh FatcatIssnImporter, pointed at the local API URL, per test."""
+    importer = FatcatIssnImporter("http://localhost:9411/v0")
+    yield importer
+
+# TODO: use API to check that entities actually created...
+def test_issn_importer_batch(issn_importer):
+    """Smoke test: run the batched CSV import over the sample journal file."""
+    sample_path = 'tests/files/journal_extra_metadata.snip.csv'
+    with open(sample_path, 'r') as csv_file:
+        issn_importer.process_csv_batch(csv_file)
+
+def test_issn_importer(issn_importer):
+    """Smoke test: run the row-at-a-time CSV import over the sample journal file."""
+    sample_path = 'tests/files/journal_extra_metadata.snip.csv'
+    with open(sample_path, 'r') as csv_file:
+        issn_importer.process_csv_source(csv_file)