summary | refs | log | tree | commit | diff | stats
path: root/python/fatcat_tools/importers
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2021-09-08 17:49:57 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2021-10-14 18:11:12 -0700
commit: 70bdcb1ba7ce4aeb423fd6c5ff0ac002302fa1e9 (patch)
tree: e180f86665a672414d8ffefe003703ab74edee81 /python/fatcat_tools/importers
parent: 4be667616ae209fa0efaaa2350c1b75eacf0e344 (diff)
download: fatcat-70bdcb1ba7ce4aeb423fd6c5ff0ac002302fa1e9.tar.gz
fatcat-70bdcb1ba7ce4aeb423fd6c5ff0ac002302fa1e9.zip
generic fileset importer class, with test coverage
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r-- python/fatcat_tools/importers/__init__.py 1
-rw-r--r-- python/fatcat_tools/importers/common.py 4
-rw-r--r-- python/fatcat_tools/importers/fileset_generic.py 83
3 files changed, 88 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 9cb18506..5da669e1 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -33,3 +33,4 @@ from .file_meta import FileMetaImporter
from .doaj_article import DoajArticleImporter
from .dblp_release import DblpReleaseImporter
from .dblp_container import DblpContainerImporter
+from .fileset_generic import FilesetImporter
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index e936477c..680b4f9c 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -447,6 +447,10 @@ class EntityImporter:
existing.urls = [u for u in existing.urls if u.url not in redundant_urls]
return existing
+ @staticmethod
+ def generic_fileset_cleanups(existing):
+ return existing
+
def match_existing_release_fuzzy(self, release: ReleaseEntity) -> Optional[Tuple[str, str, ReleaseEntity]]:
"""
This helper function uses fuzzycat (and elasticsearch) to look for
diff --git a/python/fatcat_tools/importers/fileset_generic.py b/python/fatcat_tools/importers/fileset_generic.py
new file mode 100644
index 00000000..f0ad5460
--- /dev/null
+++ b/python/fatcat_tools/importers/fileset_generic.py
@@ -0,0 +1,83 @@
+
+import fatcat_openapi_client
+
+from fatcat_tools import entity_from_dict
+from .common import EntityImporter
+
+
+class FilesetImporter(EntityImporter):
+ """
+ General purpose importer for fileset entities. Simply parses fileset schema
+ JSON and inserts the resulting entities.
+
+ By default requires release_ids to be non-empty, and will check each
+ release_id to see if a fileset is already associated; if so, skips the
+ import. This behavior may change in the future, and can be disabled.
+
+ Currently only creates (insert), no updates.
+ """
+
+ def __init__(self, api, **kwargs):
+
+ eg_desc = kwargs.pop('editgroup_description', None) or "Generic Fileset entity import"
+ eg_extra = kwargs.pop('editgroup_extra', dict())
+ eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.FilesetImporter')
+ kwargs['do_updates'] = bool(kwargs.get("do_updates", False))
+ self.skip_release_fileset_check = bool(kwargs.get("skip_release_fileset_check", False))
+ super().__init__(api,
+ editgroup_description=eg_desc,
+ editgroup_extra=eg_extra,
+ **kwargs)
+
+ # bezerk mode doesn't make sense for this importer
+ assert self.bezerk_mode == False
+
+ def want(self, row):
+ if not row.get('release_ids'):
+ self.counts['skip-no-release-ids'] += 1
+ return False
+ if not row.get('urls'):
+ self.counts['skip-no-urls'] += 1
+ return False
+ if not row.get('manifest'):
+ self.counts['skip-no-files'] += 1
+ return False
+
+ for f in row.get('manifest'):
+ for k in ('sha1', 'md5'):
+ if not f.get(k):
+ self.counts['skip-missing-file-field'] += 1
+ return False
+ return True
+
+ def parse_record(self, row):
+
+ fse = entity_from_dict(
+ row,
+ fatcat_openapi_client.FilesetEntity,
+ api_client=self.api.api_client,
+ )
+ fse = self.generic_fileset_cleanups(fse)
+ return fse
+
+ def try_update(self, fse):
+
+ if not self.skip_release_fileset_check:
+ for release_id in fse.release_ids:
+ # don't catch 404, that would be an error
+ release = self.api.get_release(release_id, expand='filesets', hide='abstracts,refs')
+ assert release.state == 'active'
+ if release.filesets:
+ self.counts['exists'] += 1
+ self.counts['exists-via-release-filesets'] += 1
+ return False
+
+ # do the insert
+ return True
+
+ def insert_batch(self, batch):
+ self.api.create_fileset_auto_batch(fatcat_openapi_client.FilesetAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description,
+ extra=self.editgroup_extra),
+ entity_list=batch))