diff options
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r-- | python/fatcat_tools/importers/common.py | 36 | ||||
-rw-r--r-- | python/fatcat_tools/importers/crossref.py | 19 | ||||
-rw-r--r-- | python/fatcat_tools/importers/grobid_metadata.py | 13 | ||||
-rw-r--r-- | python/fatcat_tools/importers/issn.py | 10 | ||||
-rw-r--r-- | python/fatcat_tools/importers/matched.py | 18 | ||||
-rw-r--r-- | python/fatcat_tools/importers/orcid.py | 10 |
6 files changed, 81 insertions, 25 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index e31cabf8..e39ec6c9 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -4,6 +4,7 @@ import sys import csv import json import itertools +import subprocess from collections import Counter import pykafka @@ -37,19 +38,33 @@ class FatcatImporter: Base class for fatcat importers """ - def __init__(self, host_url, issn_map_file=None): - conf = fatcat_client.Configuration() - conf.host = host_url - self.api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf)) + def __init__(self, api, **kwargs): + + eg_extra = kwargs.get('editgroup_extra', dict()) + eg_extra['git_rev'] = eg_extra.get('git_rev', + subprocess.check_output(["git", "describe", "--always"]).strip()).decode('utf-8') + eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.FatcatImporter') + + self.api = api + self._editgroup_description = kwargs.get('editgroup_description') + self._editgroup_extra = kwargs.get('editgroup_extra') + issn_map_file = kwargs.get('issn_map_file') + self._issnl_id_map = dict() self._orcid_id_map = dict() self._doi_id_map = dict() - self._issn_issnl_map = None - self._orcid_regex = re.compile("^\\d{4}-\\d{4}-\\d{4}-\\d{3}[\\dX]$") if issn_map_file: self.read_issn_map_file(issn_map_file) + self._orcid_regex = re.compile("^\\d{4}-\\d{4}-\\d{4}-\\d{3}[\\dX]$") self.counts = Counter({'insert': 0, 'update': 0, 'processed_lines': 0}) + def _editgroup(self): + eg = fatcat_client.Editgroup( + description=self._editgroup_description, + extra=self._editgroup_extra, + ) + return self.api.create_editgroup(eg) + def describe_run(self): print("Processed {} lines, inserted {}, updated {}.".format( self.counts['processed_lines'], self.counts['insert'], self.counts['update'])) @@ -64,15 +79,13 @@ class FatcatImporter: def process_source(self, source, group_size=100): """Creates and auto-accepts editgroup every group_size rows""" - eg = self.api.create_editgroup( - fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae')) + eg = self._editgroup() i = 0 for i, row in enumerate(source): self.create_row(row, editgroup_id=eg.editgroup_id) if i > 0 and (i % group_size) == 0: self.api.accept_editgroup(eg.editgroup_id) - eg = self.api.create_editgroup( - fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae')) + eg = self._editgroup() self.counts['processed_lines'] += 1 if i == 0 or (i % group_size) != 0: self.api.accept_editgroup(eg.editgroup_id) @@ -83,8 +96,7 @@ class FatcatImporter: if decode_kafka: rows = [msg.value.decode('utf-8') for msg in rows] self.counts['processed_lines'] += len(rows) - eg = self.api.create_editgroup( - fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae')) + eg = self._editgroup() self.create_batch(rows, editgroup_id=eg.editgroup_id) def process_csv_source(self, source, group_size=100, delimiter=','): diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index d4d0de68..ed60a78c 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -4,6 +4,7 @@ import json import sqlite3 import datetime import itertools +import subprocess import fatcat_client from .common import FatcatImporter @@ -40,8 +41,19 @@ class CrossrefImporter(FatcatImporter): See https://github.com/CrossRef/rest-api-doc for JSON schema notes """ - def __init__(self, host_url, issn_map_file, extid_map_file=None, create_containers=True, check_existing=True): - super().__init__(host_url, issn_map_file) + def __init__(self, api, issn_map_file, **kwargs): + + eg_desc = kwargs.get('editgroup_description', + "Automated import of Crossref DOI metadata, harvested from REST API") + eg_extra = kwargs.get('editgroup_extra', dict()) + eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.CrossrefImporter') + super().__init__(api, + issn_map_file=issn_map_file, + editgroup_description=eg_desc, + editgroup_extra=eg_extra) + extid_map_file = kwargs.get('extid_map_file') + create_containers = kwargs.get('create_containers') + check_existing = kwargs.get('check_existing') self.extid_map_db = None if extid_map_file: db_uri = "file:{}?mode=ro".format(extid_map_file) @@ -313,8 +325,7 @@ class CrossrefImporter(FatcatImporter): if entities is not None: (re, ce) = entities if ce is not None: - ce_eg = self.api.create_editgroup( - fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae')) + ce_eg = self.api.create_editgroup(fatcat_client.Editgroup()) container = self.api.create_container(ce, editgroup_id=ce_eg.editgroup_id) self.api.accept_editgroup(ce_eg.editgroup_id) re.container_id = container.ident diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py index 2cb97b01..5e61a154 100644 --- a/python/fatcat_tools/importers/grobid_metadata.py +++ b/python/fatcat_tools/importers/grobid_metadata.py @@ -12,9 +12,16 @@ MAX_ABSTRACT_BYTES=4096 class GrobidMetadataImporter(FatcatImporter): - def __init__(self, host_url, default_link_rel="web"): - super().__init__(host_url) - self.default_link_rel = default_link_rel + def __init__(self, api, **kwargs): + + eg_desc = kwargs.get('editgroup_description', + "Import of release and file metadata, as extracted from PDFs by GROBID.") + eg_extra = kwargs.get('editgroup_extra', dict()) + eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.GrobidMetadataImporter') + super().__init__(api, + editgroup_description=eg_desc, + editgroup_extra=eg_extra) + self.default_link_rel = kwargs.get("default_link_rel", "web") def parse_grobid_json(self, obj): diff --git a/python/fatcat_tools/importers/issn.py b/python/fatcat_tools/importers/issn.py index 9b9ca63f..02a1eea0 100644 --- a/python/fatcat_tools/importers/issn.py +++ b/python/fatcat_tools/importers/issn.py @@ -35,6 +35,16 @@ class IssnImporter(FatcatImporter): ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count """ + def __init__(self, api, **kwargs): + + eg_desc = kwargs.get('editgroup_description', + "Automated import of container-level metadata, by ISSN. Metadata from Internet Archive munging.") + eg_extra = kwargs.get('editgroup_extra', dict()) + eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IssnImporter') + super().__init__(api, + editgroup_description=eg_desc, + editgroup_extra=eg_extra) + def parse_issn_row(self, row): """ row is a python dict (parsed from CSV). diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py index 5dbda27c..0b77bcf0 100644 --- a/python/fatcat_tools/importers/matched.py +++ b/python/fatcat_tools/importers/matched.py @@ -37,12 +37,18 @@ class MatchedImporter(FatcatImporter): - core_id, wikidata_id, pmcid, pmid: not as lists """ - def __init__(self, host_url, skip_file_updates=False, default_mime=None, - default_link_rel="web"): - super().__init__(host_url) - self.default_mime = default_mime - self.default_link_rel = default_link_rel - self.skip_file_updates = skip_file_updates + def __init__(self, api, **kwargs): + + eg_desc = kwargs.get('editgroup_description', + "Import of large-scale file-to-release match results. Source of metadata varies.") + eg_extra = kwargs.get('editgroup_extra', dict()) + eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.MatchedImporter') + super().__init__(api, + editgroup_description=eg_desc, + editgroup_extra=eg_extra) + self.default_link_rel = kwargs.get("default_link_rel", "web") + self.default_mime = kwargs.get("default_mime", None) + self.skip_file_updates = kwargs.get("skip_file_updates", False) def make_url(self, raw): rel = self.default_link_rel diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py index fc4562d0..0aa4ab00 100644 --- a/python/fatcat_tools/importers/orcid.py +++ b/python/fatcat_tools/importers/orcid.py @@ -22,6 +22,16 @@ def value_or_none(e): class OrcidImporter(FatcatImporter): + def __init__(self, api, **kwargs): + + eg_desc = kwargs.get('editgroup_description', + "Automated import of ORCID metadata, from official bulk releases.") + eg_extra = kwargs.get('editgroup_extra', dict()) + eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.OrcidImporter') + super().__init__(api, + editgroup_description=eg_desc, + editgroup_extra=eg_extra) + def parse_orcid_dict(self, obj): """ obj is a python dict (parsed from json). |