From 70e905bc21835476f6713289ad0f31e4c56cae32 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 5 Apr 2022 12:02:33 -0700 Subject: ingest importer: improved extra/edit_extra code flow --- python/fatcat_tools/importers/ingest.py | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) (limited to 'python') diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index 471d673b..20963918 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -260,6 +260,16 @@ class IngestFileResultImporter(EntityImporter): edit_extra["grobid_status_code"] = row["grobid"]["status_code"] edit_extra["grobid_version"] = row["grobid"].get("grobid_version") + # fileset/platform metadata + if row.get("ingest_strategy"): + edit_extra["ingest_strategy"] = row["ingest_strategy"] + if row.get("platform_domain"): + edit_extra["platform_domain"] = row["platform_domain"] + if row.get("platform_name"): + edit_extra["platform_name"] = row["platform_name"] + if row.get("platform_id"): + edit_extra["platform_id"] = row["platform_id"] + return edit_extra def parse_record(self, row: Dict[str, Any]) -> FileEntity: @@ -518,7 +528,6 @@ class IngestWebResultImporter(IngestFileResultImporter): ) edit_extra = self.parse_edit_extra(row) - if edit_extra: wc.edit_extra = edit_extra return wc @@ -754,12 +763,6 @@ class IngestFilesetResultImporter(IngestFileResultImporter): return None entity_extra: Dict[str, Any] = dict() - edit_extra = self.parse_edit_extra(row) - edit_extra["ingest_strategy"] = row["ingest_strategy"] - if row.get("platform"): - edit_extra["platform"] = row["platform"] - if row.get("platform_id"): - edit_extra["platform_id"] = row["platform_id"] entity_urls = self.parse_fileset_urls(row) if not entity_urls: @@ -799,10 +802,10 @@ class IngestFilesetResultImporter(IngestFileResultImporter): manifest=manifest, urls=entity_urls, release_ids=[release_ident], + extra=entity_extra or None, ) - if entity_extra: - fe.extra = entity_extra + edit_extra = self.parse_edit_extra(row) if edit_extra: fe.edit_extra = edit_extra return fe @@ -993,14 +996,6 @@ class IngestFilesetFileResultImporter(IngestFileResultImporter): self.counts["skip-release-not-found"] += 1 return None - entity_extra: Dict[str, Any] = dict() - edit_extra = self.parse_edit_extra(row) - edit_extra["ingest_strategy"] = row["ingest_strategy"] - if row.get("platform"): - edit_extra["platform"] = row["platform"] - if row.get("platform_id"): - edit_extra["platform_id"] = row["platform_id"] - assert row["file_count"] == len(row["manifest"]) == 1 file_meta = row["manifest"][0] # print(file_meta) @@ -1029,7 +1024,7 @@ class IngestFilesetFileResultImporter(IngestFileResultImporter): self.counts["skip-no-access-url"] += 1 return None - entity_extra = dict() + entity_extra: Dict[str, Any] = dict() entity_extra["path"] = file_meta["path"] # this is to work around a bug in old sandcrawler ingest code @@ -1051,8 +1046,6 @@ class IngestFilesetFileResultImporter(IngestFileResultImporter): self.counts["skip-partial-file-info"] += 1 return None - if entity_extra: - fe.extra = entity_extra edit_extra = self.parse_edit_extra(row) if edit_extra: fe.edit_extra = edit_extra -- cgit v1.2.3