diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2022-04-05 12:02:33 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2022-04-05 12:02:33 -0700 |
commit | 70e905bc21835476f6713289ad0f31e4c56cae32 (patch) | |
tree | 27268684388ac57e4a1dd5a4353d2c4f9455f929 | |
parent | de46607f0442eea31428e6946b3220a6b94f8926 (diff) | |
download | fatcat-70e905bc21835476f6713289ad0f31e4c56cae32.tar.gz fatcat-70e905bc21835476f6713289ad0f31e4c56cae32.zip |
ingest importer: improved extra/edit_extra code flow
-rw-r--r-- | python/fatcat_tools/importers/ingest.py | 33 |
1 file changed, 13 insertions, 20 deletions
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index 471d673b..20963918 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -260,6 +260,16 @@ class IngestFileResultImporter(EntityImporter): edit_extra["grobid_status_code"] = row["grobid"]["status_code"] edit_extra["grobid_version"] = row["grobid"].get("grobid_version") + # fileset/platform metadata + if row.get("ingest_strategy"): + edit_extra["ingest_strategy"] = row["ingest_strategy"] + if row.get("platform_domain"): + edit_extra["platform_domain"] = row["platform_domain"] + if row.get("platform_name"): + edit_extra["platform_name"] = row["platform_name"] + if row.get("platform_id"): + edit_extra["platform_id"] = row["platform_id"] + return edit_extra def parse_record(self, row: Dict[str, Any]) -> FileEntity: @@ -518,7 +528,6 @@ class IngestWebResultImporter(IngestFileResultImporter): ) edit_extra = self.parse_edit_extra(row) - if edit_extra: wc.edit_extra = edit_extra return wc @@ -754,12 +763,6 @@ class IngestFilesetResultImporter(IngestFileResultImporter): return None entity_extra: Dict[str, Any] = dict() - edit_extra = self.parse_edit_extra(row) - edit_extra["ingest_strategy"] = row["ingest_strategy"] - if row.get("platform"): - edit_extra["platform"] = row["platform"] - if row.get("platform_id"): - edit_extra["platform_id"] = row["platform_id"] entity_urls = self.parse_fileset_urls(row) if not entity_urls: @@ -799,10 +802,10 @@ class IngestFilesetResultImporter(IngestFileResultImporter): manifest=manifest, urls=entity_urls, release_ids=[release_ident], + extra=entity_extra or None, ) - if entity_extra: - fe.extra = entity_extra + edit_extra = self.parse_edit_extra(row) if edit_extra: fe.edit_extra = edit_extra return fe @@ -993,14 +996,6 @@ class IngestFilesetFileResultImporter(IngestFileResultImporter): self.counts["skip-release-not-found"] += 1 return None - entity_extra: Dict[str, Any] = dict() - edit_extra = 
self.parse_edit_extra(row) - edit_extra["ingest_strategy"] = row["ingest_strategy"] - if row.get("platform"): - edit_extra["platform"] = row["platform"] - if row.get("platform_id"): - edit_extra["platform_id"] = row["platform_id"] - assert row["file_count"] == len(row["manifest"]) == 1 file_meta = row["manifest"][0] # print(file_meta) @@ -1029,7 +1024,7 @@ class IngestFilesetFileResultImporter(IngestFileResultImporter): self.counts["skip-no-access-url"] += 1 return None - entity_extra = dict() + entity_extra: Dict[str, Any] = dict() entity_extra["path"] = file_meta["path"] # this is to work around a bug in old sandcrawler ingest code @@ -1051,8 +1046,6 @@ class IngestFilesetFileResultImporter(IngestFileResultImporter): self.counts["skip-partial-file-info"] += 1 return None - if entity_extra: - fe.extra = entity_extra edit_extra = self.parse_edit_extra(row) if edit_extra: fe.edit_extra = edit_extra |