From 8a36af1664f0399f29a57b639dc0c89033578878 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 17 Jan 2019 13:48:59 -0800 Subject: issn => journal_metadata in several places --- python/env.example | 2 +- python/fatcat_import.py | 18 ++--- python/fatcat_tools/importers/__init__.py | 2 +- python/fatcat_tools/importers/issn.py | 89 ----------------------- python/fatcat_tools/importers/journal_metadata.py | 89 +++++++++++++++++++++++ python/tests/import_issn.py | 26 ------- python/tests/import_journal_metadata.py | 26 +++++++ 7 files changed, 126 insertions(+), 126 deletions(-) delete mode 100644 python/fatcat_tools/importers/issn.py create mode 100644 python/fatcat_tools/importers/journal_metadata.py delete mode 100644 python/tests/import_issn.py create mode 100644 python/tests/import_journal_metadata.py diff --git a/python/env.example b/python/env.example index 9896dc86..a171ac09 100644 --- a/python/env.example +++ b/python/env.example @@ -15,6 +15,6 @@ SENTRY_DSN="" # FATCAT_API_AUTH_TOKEN FATCAT_AUTH_WORKER_CROSSREF="" FATCAT_AUTH_WORKER_ORCID="" -FATCAT_AUTH_WORKER_ISSN="" +FATCAT_AUTH_WORKER_JOURNAL_METADATA="" FATCAT_AUTH_WORKER_MATCHED="" FATCAT_AUTH_WORKER_GROBID_METADATA="" diff --git a/python/fatcat_import.py b/python/fatcat_import.py index 0e176b2c..ed12416c 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -6,7 +6,7 @@ import os, sys, argparse from fatcat_tools import authenticated_api from fatcat_tools.importers import CrossrefImporter, OrcidImporter, \ - IssnImporter, MatchedImporter, GrobidMetadataImporter, make_kafka_consumer + JournalMetadataImporter, MatchedImporter, GrobidMetadataImporter, make_kafka_consumer def run_crossref(args): @@ -27,8 +27,8 @@ def run_orcid(args): foi.process_batch(args.json_file, size=args.batch_size) foi.describe_run() -def run_issn(args): - fii = IssnImporter(args.api) +def run_journal_metadata(args): + fii = JournalMetadataImporter(args.api) fii.process_csv_batch(args.csv_file, size=args.batch_size) fii.describe_run() @@ -98,15 +98,15 @@ def main(): help="size of batch to send", default=50, type=int) - sub_issn = subparsers.add_parser('issn') - sub_issn.set_defaults( - func=run_issn, - auth_var="FATCAT_AUTH_WORKER_ISSN", + sub_journal_metadata = subparsers.add_parser('journal-metadata') + sub_journal_metadata.set_defaults( + func=run_journal_metadata, + auth_var="FATCAT_AUTH_WORKER_JOURNAL_METADATA", ) - sub_issn.add_argument('csv_file', + sub_journal_metadata.add_argument('csv_file', help="Journal ISSN CSV metadata file to import from (or stdin)", default=sys.stdin, type=argparse.FileType('r')) - sub_issn.add_argument('--batch-size', + sub_journal_metadata.add_argument('--batch-size', help="size of batch to send", default=50, type=int) diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py index e6f081e5..47fc1fd3 100644 --- a/python/fatcat_tools/importers/__init__.py +++ b/python/fatcat_tools/importers/__init__.py @@ -2,6 +2,6 @@ from .common import FatcatImporter, make_kafka_consumer from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP from .grobid_metadata import GrobidMetadataImporter -from .issn import IssnImporter +from .journal_metadata import JournalMetadataImporter from .matched import MatchedImporter from .orcid import OrcidImporter diff --git a/python/fatcat_tools/importers/issn.py b/python/fatcat_tools/importers/issn.py deleted file mode 100644 index f4d525a4..00000000 --- a/python/fatcat_tools/importers/issn.py +++ /dev/null @@ -1,89 +0,0 @@ - -import sys -import json -import itertools -import fatcat_client -from .common import FatcatImporter - - -def or_none(s): - if s is None: - return None - if len(s) == 0: - return None - return s - -def truthy(s): - if s is None: - return None - s = s.lower() - - if s in ('true', 't', 'yes', 'y', '1'): - return True - elif s in ('false', 'f', 'no', 'n', '0'): - return False - else: - return None - -class IssnImporter(FatcatImporter): - """ - Imports journal metadata ("containers") by ISSN, currently from a custom - (data munged) .csv file format - - CSV format (generated from git.archive.org/webgroup/oa-journal-analysis): - - ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count - """ - - def __init__(self, api, **kwargs): - - eg_desc = kwargs.get('editgroup_description', - "Automated import of container-level metadata, by ISSN. Metadata from Internet Archive munging.") - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IssnImporter') - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra) - - def parse_issn_row(self, row): - """ - row is a python dict (parsed from CSV). - returns a ContainerEntity (or None if invalid or couldn't parse) - """ - title = or_none(row['title']) - issnl = or_none(row['ISSN-L']) - if title is None or issnl is None: - return None - extra = dict( - in_doaj=truthy(row['in_doaj']), - in_road=truthy(row['in_road']), - in_norwegian=truthy(row['in_norwegian']), - language=or_none(row['lang']), - url=or_none(row['url']), - ISSNp=or_none(row['ISSN-print']), - ISSNe=or_none(row['ISSN-electronic']), - is_oa=truthy(row['is_oa']), - is_kept=truthy(row['is_kept']), - ) - ce = fatcat_client.ContainerEntity( - issnl=issnl, - name=title, - publisher=or_none(row['publisher']), - abbrev=None, - coden=None, - extra=extra) - return ce - - def create_row(self, row, editgroup_id=None): - ce = self.parse_issn_row(row) - if ce is not None: - self.api.create_container(ce, editgroup_id=editgroup_id) - self.counts['insert'] += 1 - - def create_batch(self, batch): - """Reads and processes in batches (not API-call-per-line)""" - objects = [self.parse_issn_row(l) - for l in batch if (l is not None)] - objects = [o for o in objects if (o is not None)] - self.api.create_container_batch(objects, autoaccept="true") - self.counts['insert'] += len(objects) diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py new file mode 100644 index 00000000..859662ae --- /dev/null +++ b/python/fatcat_tools/importers/journal_metadata.py @@ -0,0 +1,89 @@ + +import sys +import json +import itertools +import fatcat_client +from .common import FatcatImporter + + +def or_none(s): + if s is None: + return None + if len(s) == 0: + return None + return s + +def truthy(s): + if s is None: + return None + s = s.lower() + + if s in ('true', 't', 'yes', 'y', '1'): + return True + elif s in ('false', 'f', 'no', 'n', '0'): + return False + else: + return None + +class JournalMetadataImporter(FatcatImporter): + """ + Imports journal metadata ("containers") by ISSN, currently from a custom + (data munged) .csv file format + + CSV format (generated from git.archive.org/webgroup/oa-journal-analysis): + + ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count + """ + + def __init__(self, api, **kwargs): + + eg_desc = kwargs.get('editgroup_description', + "Automated import of container-level metadata, by ISSN. Metadata from Internet Archive munging.") + eg_extra = kwargs.get('editgroup_extra', dict()) + eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.JournalMetadataImporter') + super().__init__(api, + editgroup_description=eg_desc, + editgroup_extra=eg_extra) + + def parse_journal_metadata_row(self, row): + """ + row is a python dict (parsed from CSV). + returns a ContainerEntity (or None if invalid or couldn't parse) + """ + title = or_none(row['title']) + issnl = or_none(row['ISSN-L']) + if title is None or issnl is None: + return None + extra = dict( + in_doaj=truthy(row['in_doaj']), + in_road=truthy(row['in_road']), + in_norwegian=truthy(row['in_norwegian']), + language=or_none(row['lang']), + url=or_none(row['url']), + ISSNp=or_none(row['ISSN-print']), + ISSNe=or_none(row['ISSN-electronic']), + is_oa=truthy(row['is_oa']), + is_kept=truthy(row['is_kept']), + ) + ce = fatcat_client.ContainerEntity( + issnl=issnl, + name=title, + publisher=or_none(row['publisher']), + abbrev=None, + coden=None, + extra=extra) + return ce + + def create_row(self, row, editgroup_id=None): + ce = self.parse_journal_metadata_row(row) + if ce is not None: + self.api.create_container(ce, editgroup_id=editgroup_id) + self.counts['insert'] += 1 + + def create_batch(self, batch): + """Reads and processes in batches (not API-call-per-line)""" + objects = [self.parse_journal_metadata_row(l) + for l in batch if (l is not None)] + objects = [o for o in objects if (o is not None)] + self.api.create_container_batch(objects, autoaccept="true") + self.counts['insert'] += len(objects) diff --git a/python/tests/import_issn.py b/python/tests/import_issn.py deleted file mode 100644 index 6b5978d9..00000000 --- a/python/tests/import_issn.py +++ /dev/null @@ -1,26 +0,0 @@ - -import pytest -from fatcat_tools.importers import IssnImporter -from fixtures import api - - -@pytest.fixture(scope="function") -def issn_importer(api): - yield IssnImporter(api) - -# TODO: use API to check that entities actually created... -def test_issn_importer_batch(issn_importer): - with open('tests/files/journal_extra_metadata.snip.csv', 'r') as f: - issn_importer.process_csv_batch(f) - -def test_issn_importer(issn_importer): - with open('tests/files/journal_extra_metadata.snip.csv', 'r') as f: - issn_importer.process_csv_source(f) - - # fetch most recent editgroup - changes = issn_importer.api.get_changelog(limit=1) - eg = changes[0].editgroup - assert eg.description - assert "container" in eg.description.lower() - assert eg.extra['git_rev'] - assert "fatcat_tools.IssnImporter" in eg.extra['agent'] diff --git a/python/tests/import_journal_metadata.py b/python/tests/import_journal_metadata.py new file mode 100644 index 00000000..81334bc6 --- /dev/null +++ b/python/tests/import_journal_metadata.py @@ -0,0 +1,26 @@ + +import pytest +from fatcat_tools.importers import JournalMetadataImporter +from fixtures import api + + +@pytest.fixture(scope="function") +def journal_metadata_importer(api): + yield JournalMetadataImporter(api) + +# TODO: use API to check that entities actually created... +def test_journal_metadata_importer_batch(journal_metadata_importer): + with open('tests/files/journal_extra_metadata.snip.csv', 'r') as f: + journal_metadata_importer.process_csv_batch(f) + +def test_journal_metadata_importer(journal_metadata_importer): + with open('tests/files/journal_extra_metadata.snip.csv', 'r') as f: + journal_metadata_importer.process_csv_source(f) + + # fetch most recent editgroup + changes = journal_metadata_importer.api.get_changelog(limit=1) + eg = changes[0].editgroup + assert eg.description + assert "container" in eg.description.lower() + assert eg.extra['git_rev'] + assert "fatcat_tools.JournalMetadataImporter" in eg.extra['agent'] -- cgit v1.2.3