diff options
Diffstat (limited to 'python')
| -rwxr-xr-x | python/client.py | 13 | ||||
| -rw-r--r-- | python/fatcat/importer_common.py | 11 | ||||
| -rw-r--r-- | python/fatcat/issn_importer.py | 51 | ||||
| -rw-r--r-- | python/tests/files/journal_extra_metadata.snip.csv | 10 | ||||
| -rw-r--r-- | python/tests/issn.py | 17 | 
5 files changed, 101 insertions, 1 deletions
| diff --git a/python/client.py b/python/client.py index ca6af603..4f6d3ccc 100755 --- a/python/client.py +++ b/python/client.py @@ -18,6 +18,10 @@ def run_import_orcid(args):      foi = FatcatOrcidImporter(args.host_url)      foi.process_batch(args.json_file, size=args.batch_size) +def run_import_issn(args): +    fii = FatcatIssnImporter(args.host_url) +    fii.process_batch(args.csv_file, size=args.batch_size) +  def run_import_manifest(args):      fmi = FatcatManifestImporter(args.host_url)      fmi.process_db(args.db_path, size=args.batch_size) @@ -55,6 +59,15 @@ def main():          help="size of batch to send",          default=50, type=int) +    sub_import_issn = subparsers.add_parser('import-issn') +    sub_import_issn.set_defaults(func=run_import_issn) +    sub_import_issn.add_argument('csv_file', +        help="Journal ISSN CSV metadata file to import from (or stdin)", +        default=sys.stdin, type=argparse.FileType('r')) +    sub_import_issn.add_argument('--batch-size', +        help="size of batch to send", +        default=50, type=int) +      sub_import_manifest = subparsers.add_parser('import-manifest')      sub_import_manifest.set_defaults(func=run_import_manifest)      sub_import_manifest.add_argument('db_path', diff --git a/python/fatcat/importer_common.py b/python/fatcat/importer_common.py index f8638418..9ec64e8f 100644 --- a/python/fatcat/importer_common.py +++ b/python/fatcat/importer_common.py @@ -1,5 +1,6 @@  import sys +import csv  import json  import itertools  import fatcat_client @@ -25,7 +26,7 @@ class FatcatImporter:              self.read_issn_map_file(issn_map_file)      def process_source(self, source, group_size=100): -        """Creates and auto-accepts editgropu every group_size rows""" +        """Creates and auto-accepts editgroup every group_size rows"""          eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1))          for i, row in enumerate(source):              self.create_row(row, editgroup_id=eg.id) @@ -42,6 +43,14 @@ class FatcatImporter:              self.create_batch(rows, eg.id)              self.api.accept_editgroup(eg.id) +    def process_csv_source(self, source, group_size=100, delimiter=','): +        reader = csv.DictReader(source, delimiter=delimiter) +        self.process_source(reader, group_size) + +    def process_csv_batch(self, source, size=50, delimiter=','): +        reader = csv.DictReader(source, delimiter=delimiter) +        self.process_batch(reader, size) +      def lookup_issnl(self, issnl):          """Caches calls to the ISSN-L lookup API endpoint in a local dict"""          assert len(issnl) == 9 and issnl[4] == '-' diff --git a/python/fatcat/issn_importer.py b/python/fatcat/issn_importer.py new file mode 100644 index 00000000..9055bdc2 --- /dev/null +++ b/python/fatcat/issn_importer.py @@ -0,0 +1,51 @@ + +import sys +import json +import itertools +import fatcat_client +from fatcat.importer_common import FatcatImporter + +# CSV format (generated from git.archive.org/webgroup/oa-journal-analysis): +# ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count + +class FatcatIssnImporter(FatcatImporter): + +    def parse_issn_row(self, row): +        """ +        row is a python dict (parsed from CSV). +        returns a ContainerEntity +        """ +        extra = dict( +            in_doaj=row['in_doaj'], +            in_road=row['in_road'], +            language=row['lang'], +            url=row['url'], +            ISSNp=row['ISSN-print'], +            ISSNe=row['ISSN-electronic'], +            is_oa=row['is_oa'], +            is_kept=row['is_kept'], +        ) +        ce = fatcat_client.ContainerEntity( +            issnl=row['ISSN-L'], +            name=row['title'], +            publisher=row['publisher'], +            abbrev=None, +            coden=None, +            extra=extra) +        print(ce) +        return ce + +    def create_row(self, row, editgroup_id=None): +        ce = self.parse_issn_row(row) +        if ce is not None: +            ce.editgroup_id = editgroup_id +            self.api.create_container(ce) + +    def create_batch(self, batch, editgroup_id=None): +        """Reads and processes in batches (not API-call-per-line)""" +        objects = [self.parse_issn_row(l) +                   for l in batch if l != None] +        objects = [o for o in objects if o != None] +        for o in objects: +            o.editgroup_id = editgroup_id +        self.api.create_container_batch(objects) diff --git a/python/tests/files/journal_extra_metadata.snip.csv b/python/tests/files/journal_extra_metadata.snip.csv new file mode 100644 index 00000000..8cc50ee9 --- /dev/null +++ b/python/tests/files/journal_extra_metadata.snip.csv @@ -0,0 +1,10 @@ +ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count +0000-0019,False,False,True,False,Publishers weekly,,http://www.publishersweekly.com/,,0000-0019,2150-4008,0.0,False,False,False,,http://www.publishersweekly.com/,301.0,200.0,https://www.publishersweekly.com/,3xx,200,publishersweekly.com,1055.0 +0001-0782,False,False,True,True,Communications of the ACM,Association for Computing Machinery,http://www.acm.org/pubs/cacm/,,0001-0782,1557-7317,11894.0,True,False,True,55.0,http://www.acm.org/pubs/cacm/,301.0,200.0,https://cacm.acm.org/,3xx,200,acm.org,9.0 +0001-1452,False,False,True,True,AIAA Journal,American Institute of Aeronautics and Astronautics,http://www.aiaa.org/content.cfm?pageid=322&lupubid=2,,0001-1452,1533-385X,24193.0,True,False,True,6.0,http://www.aiaa.org/content.cfm?pageid=322&lupubid=2,404.0,404.0,http://www.aiaa.org/content.cfm?pageid=322&lupubid=2,404,404,aiaa.org,16.0 +0001-1541,False,False,True,True,AIChE Journal,Wiley Blackwell (John Wiley & Sons),http://www.aiche.org/Publications/AIChEJournal/index.aspx,,0001-1541,1547-5905,15860.0,True,False,True,607.0,http://www.aiche.org/Publications/AIChEJournal/index.aspx,301.0,200.0,https://www.aiche.org/publications/journals/aiche-journal,3xx,200,aiche.org,25.0 +0001-2092,False,False,True,True,AORN Journal,Wiley Blackwell (John Wiley & Sons),http://www.aorn.org/AORNJournal/,,0001-2092,1878-0369,12413.0,True,False,True,607.0,http://www.aorn.org/AORNJournal/,301.0,200.0,https://www.aorn.org/aorn-journal,3xx,200,aorn.org,0.0 +0001-2343,False,False,True,True,Archiv fuer Rechts- und Sozialphilosphie,Franz Steiner Verlag GmbH,http://www.steiner-verlag.de/programm/zeitschriften/archiv-fuer-rechts-und-sozialphilosophie/,,0001-2343,2363-5614,14.0,True,False,False,2.0,http://www.steiner-verlag.de/programm/zeitschriften/archiv-fuer-rechts-und-sozialphilosophie/,200.0,200.0,http://www.steiner-verlag.de/programm/zeitschriften/archiv-fuer-rechts-und-sozialphilosophie/,200,200,steiner-verlag.de,226.0 +0001-2351,False,False,True,True,Transactions of the ASAE,American Society of Agricultural and Biological Engineers,,,0001-2351,2151-0059,11515.0,True,False,True,3.0,,,,,,,,0.0 +0001-2491,False,False,True,False,ASHRAE Journal,,http://www.ashrae.org/,,0001-2491,1943-6637,0.0,False,False,False,,http://www.ashrae.org/,301.0,200.0,https://www.ashrae.org/,3xx,200,ashrae.org,684.0 +0001-2505,False,False,True,False,ASHRAE Transactions,,http://www.ashrae.org/template/AssetDetail/assetid/25903,,0001-2505,,0.0,False,False,False,,http://www.ashrae.org/template/AssetDetail/assetid/25903,301.0,404.0,https://www.ashrae.org/template/AssetDetail/assetid/25903,3xx,404,ashrae.org,684.0 diff --git a/python/tests/issn.py b/python/tests/issn.py new file mode 100644 index 00000000..fff112f7 --- /dev/null +++ b/python/tests/issn.py @@ -0,0 +1,17 @@ + +import pytest +from fatcat.issn_importer import FatcatIssnImporter + + +@pytest.fixture(scope="function") +def issn_importer(): +    yield FatcatIssnImporter("http://localhost:9411/v0") + +# TODO: use API to check that entities actually created... +def test_issn_importer_batch(issn_importer): +    with open('tests/files/journal_extra_metadata.snip.csv', 'r') as f: +        issn_importer.process_csv_batch(f) + +def test_issn_importer(issn_importer): +    with open('tests/files/journal_extra_metadata.snip.csv', 'r') as f: +        issn_importer.process_csv_source(f) | 
