From 8c930ded07b6a668bc1721ef98312d9259c06b59 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 8 Jan 2019 14:59:38 -0800 Subject: importers and tests all use new api-passing --- python/env.example | 8 ++++++++ python/fatcat_tools/importers/common.py | 1 + python/fatcat_tools/importers/crossref.py | 2 +- python/fatcat_tools/importers/grobid_metadata.py | 13 ++++++++++--- python/fatcat_tools/importers/issn.py | 10 ++++++++++ python/fatcat_tools/importers/matched.py | 18 ++++++++++++------ python/fatcat_tools/importers/orcid.py | 10 ++++++++++ python/tests/import_crossref.py | 2 +- python/tests/import_grobid_metadata.py | 13 +++++++++++-- python/tests/import_issn.py | 13 +++++++++++-- python/tests/import_matched.py | 13 +++++++++++-- python/tests/import_orcid.py | 13 +++++++++++-- python/tests/importer.py | 9 +++++---- python/tests/transform_tests.py | 1 + 14 files changed, 103 insertions(+), 23 deletions(-) diff --git a/python/env.example b/python/env.example index c1855440..fe9036b3 100644 --- a/python/env.example +++ b/python/env.example @@ -9,3 +9,11 @@ GITLAB_CLIENT_SECRET="" IA_XAUTH_CLIENT_ID="" IA_XAUTH_CLIENT_SECRET="" SENTRY_DSN="" + +# These auth keys only for workers/importers; locally will fall back to +# FATCAT_API_AUTH_TOKEN +FATCAT_AUTH_WORKER_CROSSREF="" +FATCAT_AUTH_WORKER_ORCID="" +FATCAT_AUTH_WORKER_ISSN="" +FATCAT_AUTH_WORKER_MATCHED="" +FATCAT_AUTH_WORKER_GROBID_METADATA="" diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 5c33ebc9..e39ec6c9 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -43,6 +43,7 @@ class FatcatImporter: eg_extra = kwargs.get('editgroup_extra', dict()) eg_extra['git_rev'] = eg_extra.get('git_rev', subprocess.check_output(["git", "describe", "--always"]).strip()).decode('utf-8') + eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.FatcatImporter') self.api = api self._editgroup_description = kwargs.get('editgroup_description') diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index 4f7faf59..ed60a78c 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -46,7 +46,7 @@ class CrossrefImporter(FatcatImporter): eg_desc = kwargs.get('editgroup_description', "Automated import of Crossref DOI metadata, harvested from REST API") eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'CrossrefImporter') + eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.CrossrefImporter') super().__init__(api, issn_map_file=issn_map_file, editgroup_description=eg_desc, diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py index 2cb97b01..5e61a154 100644 --- a/python/fatcat_tools/importers/grobid_metadata.py +++ b/python/fatcat_tools/importers/grobid_metadata.py @@ -12,9 +12,16 @@ MAX_ABSTRACT_BYTES=4096 class GrobidMetadataImporter(FatcatImporter): - def __init__(self, host_url, default_link_rel="web"): - super().__init__(host_url) - self.default_link_rel = default_link_rel + def __init__(self, api, **kwargs): + + eg_desc = kwargs.get('editgroup_description', + "Import of release and file metadata, as extracted from PDFs by GROBID.") + eg_extra = kwargs.get('editgroup_extra', dict()) + eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.GrobidMetadataImporter') + super().__init__(api, + editgroup_description=eg_desc, + editgroup_extra=eg_extra) + self.default_link_rel = kwargs.get("default_link_rel", "web") def parse_grobid_json(self, obj): diff --git a/python/fatcat_tools/importers/issn.py b/python/fatcat_tools/importers/issn.py index 9b9ca63f..02a1eea0 100644 --- a/python/fatcat_tools/importers/issn.py +++ b/python/fatcat_tools/importers/issn.py @@ -35,6 +35,16 @@ class IssnImporter(FatcatImporter): ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count """ + def __init__(self, api, **kwargs): + + eg_desc = kwargs.get('editgroup_description', + "Automated import of container-level metadata, by ISSN. Metadata from Internet Archive munging.") + eg_extra = kwargs.get('editgroup_extra', dict()) + eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IssnImporter') + super().__init__(api, + editgroup_description=eg_desc, + editgroup_extra=eg_extra) + def parse_issn_row(self, row): """ row is a python dict (parsed from CSV). diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py index 5dbda27c..0b77bcf0 100644 --- a/python/fatcat_tools/importers/matched.py +++ b/python/fatcat_tools/importers/matched.py @@ -37,12 +37,18 @@ class MatchedImporter(FatcatImporter): - core_id, wikidata_id, pmcid, pmid: not as lists """ - def __init__(self, host_url, skip_file_updates=False, default_mime=None, - default_link_rel="web"): - super().__init__(host_url) - self.default_mime = default_mime - self.default_link_rel = default_link_rel - self.skip_file_updates = skip_file_updates + def __init__(self, api, **kwargs): + + eg_desc = kwargs.get('editgroup_description', + "Import of large-scale file-to-release match results. Source of metadata varies.") + eg_extra = kwargs.get('editgroup_extra', dict()) + eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.MatchedImporter') + super().__init__(api, + editgroup_description=eg_desc, + editgroup_extra=eg_extra) + self.default_link_rel = kwargs.get("default_link_rel", "web") + self.default_mime = kwargs.get("default_mime", None) + self.skip_file_updates = kwargs.get("skip_file_updates", False) def make_url(self, raw): rel = self.default_link_rel diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py index fc4562d0..0aa4ab00 100644 --- a/python/fatcat_tools/importers/orcid.py +++ b/python/fatcat_tools/importers/orcid.py @@ -22,6 +22,16 @@ def value_or_none(e): class OrcidImporter(FatcatImporter): + def __init__(self, api, **kwargs): + + eg_desc = kwargs.get('editgroup_description', + "Automated import of ORCID metadata, from official bulk releases.") + eg_extra = kwargs.get('editgroup_extra', dict()) + eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.OrcidImporter') + super().__init__(api, + editgroup_description=eg_desc, + editgroup_extra=eg_extra) + def parse_orcid_dict(self, obj): """ obj is a python dict (parsed from json). diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py index 3ef97719..e2ca6122 100644 --- a/python/tests/import_crossref.py +++ b/python/tests/import_crossref.py @@ -28,7 +28,7 @@ def test_crossref_importer(crossref_importer): assert eg.description assert "crossref" in eg.description.lower() assert eg.extra['git_rev'] - assert "CrossrefImporter" in eg.extra['agent'] + assert "fatcat_tools.CrossrefImporter" in eg.extra['agent'] def test_crossref_mappings(crossref_importer): assert crossref_importer.map_release_type('journal-article') == "article-journal" diff --git a/python/tests/import_grobid_metadata.py b/python/tests/import_grobid_metadata.py index 459b247b..97ebcaef 100644 --- a/python/tests/import_grobid_metadata.py +++ b/python/tests/import_grobid_metadata.py @@ -4,6 +4,7 @@ import json import base64 import pytest from fatcat_tools.importers import GrobidMetadataImporter +from fixtures import api """ WARNING: these tests are currently very fragile because they have database @@ -11,8 +12,8 @@ side-effects. Should probably be disabled or re-written. """ @pytest.fixture(scope="function") -def grobid_metadata_importer(): - yield GrobidMetadataImporter("http://localhost:9411/v0") +def grobid_metadata_importer(api): + yield GrobidMetadataImporter(api) # TODO: use API to check that entities actually created... #def test_grobid_metadata_importer_batch(grobid_metadata_importer): @@ -54,3 +55,11 @@ def test_file_metadata_parse(grobid_metadata_importer): def test_grobid_metadata_importer(grobid_metadata_importer): with open('tests/files/example_grobid_metadata_lines.tsv', 'r') as f: grobid_metadata_importer.process_source(f) + + # fetch most recent editgroup + changes = grobid_metadata_importer.api.get_changelog(limit=1) + eg = changes[0].editgroup + assert eg.description + assert "grobid" in eg.description.lower() + assert eg.extra['git_rev'] + assert "fatcat_tools.GrobidMetadataImporter" in eg.extra['agent'] diff --git a/python/tests/import_issn.py b/python/tests/import_issn.py index 98a9f4a7..6b5978d9 100644 --- a/python/tests/import_issn.py +++ b/python/tests/import_issn.py @@ -1,11 +1,12 @@ import pytest from fatcat_tools.importers import IssnImporter +from fixtures import api @pytest.fixture(scope="function") -def issn_importer(): - yield IssnImporter("http://localhost:9411/v0") +def issn_importer(api): + yield IssnImporter(api) # TODO: use API to check that entities actually created... def test_issn_importer_batch(issn_importer): @@ -15,3 +16,11 @@ def test_issn_importer_batch(issn_importer): def test_issn_importer(issn_importer): with open('tests/files/journal_extra_metadata.snip.csv', 'r') as f: issn_importer.process_csv_source(f) + + # fetch most recent editgroup + changes = issn_importer.api.get_changelog(limit=1) + eg = changes[0].editgroup + assert eg.description + assert "container" in eg.description.lower() + assert eg.extra['git_rev'] + assert "fatcat_tools.IssnImporter" in eg.extra['agent'] diff --git a/python/tests/import_matched.py b/python/tests/import_matched.py index 46a9ef85..080674ac 100644 --- a/python/tests/import_matched.py +++ b/python/tests/import_matched.py @@ -2,11 +2,12 @@ import json import pytest from fatcat_tools.importers import MatchedImporter +from fixtures import api @pytest.fixture(scope="function") -def matched_importer(): - yield MatchedImporter("http://localhost:9411/v0") +def matched_importer(api): + yield MatchedImporter(api) # TODO: use API to check that entities actually created... def test_matched_importer_batch(matched_importer): @@ -17,6 +18,14 @@ def test_matched_importer(matched_importer): with open('tests/files/example_matched.json', 'r') as f: matched_importer.process_source(f) + # fetch most recent editgroup + changes = matched_importer.api.get_changelog(limit=1) + eg = changes[0].editgroup + assert eg.description + assert "file-to-release" in eg.description.lower() + assert eg.extra['git_rev'] + assert "fatcat_tools.MatchedImporter" in eg.extra['agent'] + def test_matched_dict_parse(matched_importer): with open('tests/files/example_matched.json', 'r') as f: raw = json.loads(f.readline()) diff --git a/python/tests/import_orcid.py b/python/tests/import_orcid.py index 18199888..717a1328 100644 --- a/python/tests/import_orcid.py +++ b/python/tests/import_orcid.py @@ -2,11 +2,12 @@ import json import pytest from fatcat_tools.importers import OrcidImporter +from fixtures import api @pytest.fixture(scope="function") -def orcid_importer(): - yield OrcidImporter("http://localhost:9411/v0") +def orcid_importer(api): + yield OrcidImporter(api) # TODO: use API to check that entities actually created... def test_orcid_importer_batch(orcid_importer): @@ -21,6 +22,14 @@ def test_orcid_importer(orcid_importer): with open('tests/files/0000-0001-8254-7103.json', 'r') as f: orcid_importer.process_source(f) + # fetch most recent editgroup + changes = orcid_importer.api.get_changelog(limit=1) + eg = changes[0].editgroup + assert eg.description + assert "orcid" in eg.description.lower() + assert eg.extra['git_rev'] + assert "fatcat_tools.OrcidImporter" in eg.extra['agent'] + def test_orcid_importer_x(orcid_importer): with open('tests/files/0000-0003-3953-765X.json', 'r') as f: orcid_importer.process_source(f) diff --git a/python/tests/importer.py b/python/tests/importer.py index f228a9b2..34efa5d8 100644 --- a/python/tests/importer.py +++ b/python/tests/importer.py @@ -2,11 +2,12 @@ import pytest from fatcat_tools.importers import FatcatImporter +from fixtures import api -def test_issnl_mapping_lookup(): +def test_issnl_mapping_lookup(api): with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: - fi = FatcatImporter("http://localhost:9411/v0", issn_file) + fi = FatcatImporter(api, issn_map_file=issn_file) assert fi.issn2issnl('0000-0027') == '0002-0027' assert fi.issn2issnl('0002-0027') == '0002-0027' @@ -14,10 +15,10 @@ def test_issnl_mapping_lookup(): assert fi.lookup_issnl('9999-9999') == None -def test_identifiers(): +def test_identifiers(api): with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: - fi = FatcatImporter("http://localhost:9411/v0", issn_file) + fi = FatcatImporter(api, issn_map_file=issn_file) assert fi.is_issnl("1234-5678") == True assert fi.is_issnl("1234-5678.") == False diff --git a/python/tests/transform_tests.py b/python/tests/transform_tests.py index a42db244..e9d23250 100644 --- a/python/tests/transform_tests.py +++ b/python/tests/transform_tests.py @@ -3,6 +3,7 @@ import json import pytest from fatcat_tools import * from fatcat_client import * +from fixtures import api from import_crossref import crossref_importer -- cgit v1.2.3