aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--python/env.example8
-rw-r--r--python/fatcat_tools/importers/common.py1
-rw-r--r--python/fatcat_tools/importers/crossref.py2
-rw-r--r--python/fatcat_tools/importers/grobid_metadata.py13
-rw-r--r--python/fatcat_tools/importers/issn.py10
-rw-r--r--python/fatcat_tools/importers/matched.py18
-rw-r--r--python/fatcat_tools/importers/orcid.py10
-rw-r--r--python/tests/import_crossref.py2
-rw-r--r--python/tests/import_grobid_metadata.py13
-rw-r--r--python/tests/import_issn.py13
-rw-r--r--python/tests/import_matched.py13
-rw-r--r--python/tests/import_orcid.py13
-rw-r--r--python/tests/importer.py9
-rw-r--r--python/tests/transform_tests.py1
14 files changed, 103 insertions, 23 deletions
diff --git a/python/env.example b/python/env.example
index c1855440..fe9036b3 100644
--- a/python/env.example
+++ b/python/env.example
@@ -9,3 +9,11 @@ GITLAB_CLIENT_SECRET=""
IA_XAUTH_CLIENT_ID=""
IA_XAUTH_CLIENT_SECRET=""
SENTRY_DSN=""
+
+# These auth keys are only for workers/importers; local runs will fall back to
+# FATCAT_API_AUTH_TOKEN
+FATCAT_AUTH_WORKER_CROSSREF=""
+FATCAT_AUTH_WORKER_ORCID=""
+FATCAT_AUTH_WORKER_ISSN=""
+FATCAT_AUTH_WORKER_MATCHED=""
+FATCAT_AUTH_WORKER_GROBID_METADATA=""
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 5c33ebc9..e39ec6c9 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -43,6 +43,7 @@ class FatcatImporter:
eg_extra = kwargs.get('editgroup_extra', dict())
eg_extra['git_rev'] = eg_extra.get('git_rev',
subprocess.check_output(["git", "describe", "--always"]).strip()).decode('utf-8')
+ eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.FatcatImporter')
self.api = api
self._editgroup_description = kwargs.get('editgroup_description')
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 4f7faf59..ed60a78c 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -46,7 +46,7 @@ class CrossrefImporter(FatcatImporter):
eg_desc = kwargs.get('editgroup_description',
"Automated import of Crossref DOI metadata, harvested from REST API")
eg_extra = kwargs.get('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'CrossrefImporter')
+ eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.CrossrefImporter')
super().__init__(api,
issn_map_file=issn_map_file,
editgroup_description=eg_desc,
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index 2cb97b01..5e61a154 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -12,9 +12,16 @@ MAX_ABSTRACT_BYTES=4096
class GrobidMetadataImporter(FatcatImporter):
- def __init__(self, host_url, default_link_rel="web"):
- super().__init__(host_url)
- self.default_link_rel = default_link_rel
+ def __init__(self, api, **kwargs):
+
+ eg_desc = kwargs.get('editgroup_description',
+ "Import of release and file metadata, as extracted from PDFs by GROBID.")
+ eg_extra = kwargs.get('editgroup_extra', dict())
+ eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.GrobidMetadataImporter')
+ super().__init__(api,
+ editgroup_description=eg_desc,
+ editgroup_extra=eg_extra)
+ self.default_link_rel = kwargs.get("default_link_rel", "web")
def parse_grobid_json(self, obj):
diff --git a/python/fatcat_tools/importers/issn.py b/python/fatcat_tools/importers/issn.py
index 9b9ca63f..02a1eea0 100644
--- a/python/fatcat_tools/importers/issn.py
+++ b/python/fatcat_tools/importers/issn.py
@@ -35,6 +35,16 @@ class IssnImporter(FatcatImporter):
ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count
"""
+ def __init__(self, api, **kwargs):
+
+ eg_desc = kwargs.get('editgroup_description',
+ "Automated import of container-level metadata, by ISSN. Metadata from Internet Archive munging.")
+ eg_extra = kwargs.get('editgroup_extra', dict())
+ eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IssnImporter')
+ super().__init__(api,
+ editgroup_description=eg_desc,
+ editgroup_extra=eg_extra)
+
def parse_issn_row(self, row):
"""
row is a python dict (parsed from CSV).
diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py
index 5dbda27c..0b77bcf0 100644
--- a/python/fatcat_tools/importers/matched.py
+++ b/python/fatcat_tools/importers/matched.py
@@ -37,12 +37,18 @@ class MatchedImporter(FatcatImporter):
- core_id, wikidata_id, pmcid, pmid: not as lists
"""
- def __init__(self, host_url, skip_file_updates=False, default_mime=None,
- default_link_rel="web"):
- super().__init__(host_url)
- self.default_mime = default_mime
- self.default_link_rel = default_link_rel
- self.skip_file_updates = skip_file_updates
+ def __init__(self, api, **kwargs):
+
+ eg_desc = kwargs.get('editgroup_description',
+ "Import of large-scale file-to-release match results. Source of metadata varies.")
+ eg_extra = kwargs.get('editgroup_extra', dict())
+ eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.MatchedImporter')
+ super().__init__(api,
+ editgroup_description=eg_desc,
+ editgroup_extra=eg_extra)
+ self.default_link_rel = kwargs.get("default_link_rel", "web")
+ self.default_mime = kwargs.get("default_mime", None)
+ self.skip_file_updates = kwargs.get("skip_file_updates", False)
def make_url(self, raw):
rel = self.default_link_rel
diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py
index fc4562d0..0aa4ab00 100644
--- a/python/fatcat_tools/importers/orcid.py
+++ b/python/fatcat_tools/importers/orcid.py
@@ -22,6 +22,16 @@ def value_or_none(e):
class OrcidImporter(FatcatImporter):
+ def __init__(self, api, **kwargs):
+
+ eg_desc = kwargs.get('editgroup_description',
+ "Automated import of ORCID metadata, from official bulk releases.")
+ eg_extra = kwargs.get('editgroup_extra', dict())
+ eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.OrcidImporter')
+ super().__init__(api,
+ editgroup_description=eg_desc,
+ editgroup_extra=eg_extra)
+
def parse_orcid_dict(self, obj):
"""
obj is a python dict (parsed from json).
diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py
index 3ef97719..e2ca6122 100644
--- a/python/tests/import_crossref.py
+++ b/python/tests/import_crossref.py
@@ -28,7 +28,7 @@ def test_crossref_importer(crossref_importer):
assert eg.description
assert "crossref" in eg.description.lower()
assert eg.extra['git_rev']
- assert "CrossrefImporter" in eg.extra['agent']
+ assert "fatcat_tools.CrossrefImporter" in eg.extra['agent']
def test_crossref_mappings(crossref_importer):
assert crossref_importer.map_release_type('journal-article') == "article-journal"
diff --git a/python/tests/import_grobid_metadata.py b/python/tests/import_grobid_metadata.py
index 459b247b..97ebcaef 100644
--- a/python/tests/import_grobid_metadata.py
+++ b/python/tests/import_grobid_metadata.py
@@ -4,6 +4,7 @@ import json
import base64
import pytest
from fatcat_tools.importers import GrobidMetadataImporter
+from fixtures import api
"""
WARNING: these tests are currently very fragile because they have database
@@ -11,8 +12,8 @@ side-effects. Should probably be disabled or re-written.
"""
@pytest.fixture(scope="function")
-def grobid_metadata_importer():
- yield GrobidMetadataImporter("http://localhost:9411/v0")
+def grobid_metadata_importer(api):
+ yield GrobidMetadataImporter(api)
# TODO: use API to check that entities actually created...
#def test_grobid_metadata_importer_batch(grobid_metadata_importer):
@@ -54,3 +55,11 @@ def test_file_metadata_parse(grobid_metadata_importer):
def test_grobid_metadata_importer(grobid_metadata_importer):
with open('tests/files/example_grobid_metadata_lines.tsv', 'r') as f:
grobid_metadata_importer.process_source(f)
+
+ # fetch most recent editgroup
+ changes = grobid_metadata_importer.api.get_changelog(limit=1)
+ eg = changes[0].editgroup
+ assert eg.description
+ assert "grobid" in eg.description.lower()
+ assert eg.extra['git_rev']
+ assert "fatcat_tools.GrobidMetadataImporter" in eg.extra['agent']
diff --git a/python/tests/import_issn.py b/python/tests/import_issn.py
index 98a9f4a7..6b5978d9 100644
--- a/python/tests/import_issn.py
+++ b/python/tests/import_issn.py
@@ -1,11 +1,12 @@
import pytest
from fatcat_tools.importers import IssnImporter
+from fixtures import api
@pytest.fixture(scope="function")
-def issn_importer():
- yield IssnImporter("http://localhost:9411/v0")
+def issn_importer(api):
+ yield IssnImporter(api)
# TODO: use API to check that entities actually created...
def test_issn_importer_batch(issn_importer):
@@ -15,3 +16,11 @@ def test_issn_importer_batch(issn_importer):
def test_issn_importer(issn_importer):
with open('tests/files/journal_extra_metadata.snip.csv', 'r') as f:
issn_importer.process_csv_source(f)
+
+ # fetch most recent editgroup
+ changes = issn_importer.api.get_changelog(limit=1)
+ eg = changes[0].editgroup
+ assert eg.description
+ assert "container" in eg.description.lower()
+ assert eg.extra['git_rev']
+ assert "fatcat_tools.IssnImporter" in eg.extra['agent']
diff --git a/python/tests/import_matched.py b/python/tests/import_matched.py
index 46a9ef85..080674ac 100644
--- a/python/tests/import_matched.py
+++ b/python/tests/import_matched.py
@@ -2,11 +2,12 @@
import json
import pytest
from fatcat_tools.importers import MatchedImporter
+from fixtures import api
@pytest.fixture(scope="function")
-def matched_importer():
- yield MatchedImporter("http://localhost:9411/v0")
+def matched_importer(api):
+ yield MatchedImporter(api)
# TODO: use API to check that entities actually created...
def test_matched_importer_batch(matched_importer):
@@ -17,6 +18,14 @@ def test_matched_importer(matched_importer):
with open('tests/files/example_matched.json', 'r') as f:
matched_importer.process_source(f)
+ # fetch most recent editgroup
+ changes = matched_importer.api.get_changelog(limit=1)
+ eg = changes[0].editgroup
+ assert eg.description
+ assert "file-to-release" in eg.description.lower()
+ assert eg.extra['git_rev']
+ assert "fatcat_tools.MatchedImporter" in eg.extra['agent']
+
def test_matched_dict_parse(matched_importer):
with open('tests/files/example_matched.json', 'r') as f:
raw = json.loads(f.readline())
diff --git a/python/tests/import_orcid.py b/python/tests/import_orcid.py
index 18199888..717a1328 100644
--- a/python/tests/import_orcid.py
+++ b/python/tests/import_orcid.py
@@ -2,11 +2,12 @@
import json
import pytest
from fatcat_tools.importers import OrcidImporter
+from fixtures import api
@pytest.fixture(scope="function")
-def orcid_importer():
- yield OrcidImporter("http://localhost:9411/v0")
+def orcid_importer(api):
+ yield OrcidImporter(api)
# TODO: use API to check that entities actually created...
def test_orcid_importer_batch(orcid_importer):
@@ -21,6 +22,14 @@ def test_orcid_importer(orcid_importer):
with open('tests/files/0000-0001-8254-7103.json', 'r') as f:
orcid_importer.process_source(f)
+ # fetch most recent editgroup
+ changes = orcid_importer.api.get_changelog(limit=1)
+ eg = changes[0].editgroup
+ assert eg.description
+ assert "orcid" in eg.description.lower()
+ assert eg.extra['git_rev']
+ assert "fatcat_tools.OrcidImporter" in eg.extra['agent']
+
def test_orcid_importer_x(orcid_importer):
with open('tests/files/0000-0003-3953-765X.json', 'r') as f:
orcid_importer.process_source(f)
diff --git a/python/tests/importer.py b/python/tests/importer.py
index f228a9b2..34efa5d8 100644
--- a/python/tests/importer.py
+++ b/python/tests/importer.py
@@ -2,11 +2,12 @@
import pytest
from fatcat_tools.importers import FatcatImporter
+from fixtures import api
-def test_issnl_mapping_lookup():
+def test_issnl_mapping_lookup(api):
with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
- fi = FatcatImporter("http://localhost:9411/v0", issn_file)
+ fi = FatcatImporter(api, issn_map_file=issn_file)
assert fi.issn2issnl('0000-0027') == '0002-0027'
assert fi.issn2issnl('0002-0027') == '0002-0027'
@@ -14,10 +15,10 @@ def test_issnl_mapping_lookup():
assert fi.lookup_issnl('9999-9999') == None
-def test_identifiers():
+def test_identifiers(api):
with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
- fi = FatcatImporter("http://localhost:9411/v0", issn_file)
+ fi = FatcatImporter(api, issn_map_file=issn_file)
assert fi.is_issnl("1234-5678") == True
assert fi.is_issnl("1234-5678.") == False
diff --git a/python/tests/transform_tests.py b/python/tests/transform_tests.py
index a42db244..e9d23250 100644
--- a/python/tests/transform_tests.py
+++ b/python/tests/transform_tests.py
@@ -3,6 +3,7 @@ import json
import pytest
from fatcat_tools import *
from fatcat_client import *
+from fixtures import api
from import_crossref import crossref_importer