about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2022-04-05 12:02:33 -0700
committer Bryan Newbold <bnewbold@robocracy.org> 2022-04-05 12:02:33 -0700
commit 70e905bc21835476f6713289ad0f31e4c56cae32 (patch)
tree 27268684388ac57e4a1dd5a4353d2c4f9455f929
parent de46607f0442eea31428e6946b3220a6b94f8926 (diff)
download fatcat-70e905bc21835476f6713289ad0f31e4c56cae32.tar.gz
fatcat-70e905bc21835476f6713289ad0f31e4c56cae32.zip
ingest importer: improved extra/edit_extra code flow
-rw-r--r-- python/fatcat_tools/importers/ingest.py 33
1 files changed, 13 insertions, 20 deletions
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index 471d673b..20963918 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -260,6 +260,16 @@ class IngestFileResultImporter(EntityImporter):
edit_extra["grobid_status_code"] = row["grobid"]["status_code"]
edit_extra["grobid_version"] = row["grobid"].get("grobid_version")
+ # fileset/platform metadata
+ if row.get("ingest_strategy"):
+ edit_extra["ingest_strategy"] = row["ingest_strategy"]
+ if row.get("platform_domain"):
+ edit_extra["platform_domain"] = row["platform_domain"]
+ if row.get("platform_name"):
+ edit_extra["platform_name"] = row["platform_name"]
+ if row.get("platform_id"):
+ edit_extra["platform_id"] = row["platform_id"]
+
return edit_extra
def parse_record(self, row: Dict[str, Any]) -> FileEntity:
@@ -518,7 +528,6 @@ class IngestWebResultImporter(IngestFileResultImporter):
)
edit_extra = self.parse_edit_extra(row)
-
if edit_extra:
wc.edit_extra = edit_extra
return wc
@@ -754,12 +763,6 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
return None
entity_extra: Dict[str, Any] = dict()
- edit_extra = self.parse_edit_extra(row)
- edit_extra["ingest_strategy"] = row["ingest_strategy"]
- if row.get("platform"):
- edit_extra["platform"] = row["platform"]
- if row.get("platform_id"):
- edit_extra["platform_id"] = row["platform_id"]
entity_urls = self.parse_fileset_urls(row)
if not entity_urls:
@@ -799,10 +802,10 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
manifest=manifest,
urls=entity_urls,
release_ids=[release_ident],
+ extra=entity_extra or None,
)
- if entity_extra:
- fe.extra = entity_extra
+ edit_extra = self.parse_edit_extra(row)
if edit_extra:
fe.edit_extra = edit_extra
return fe
@@ -993,14 +996,6 @@ class IngestFilesetFileResultImporter(IngestFileResultImporter):
self.counts["skip-release-not-found"] += 1
return None
- entity_extra: Dict[str, Any] = dict()
- edit_extra = self.parse_edit_extra(row)
- edit_extra["ingest_strategy"] = row["ingest_strategy"]
- if row.get("platform"):
- edit_extra["platform"] = row["platform"]
- if row.get("platform_id"):
- edit_extra["platform_id"] = row["platform_id"]
-
assert row["file_count"] == len(row["manifest"]) == 1
file_meta = row["manifest"][0]
# print(file_meta)
@@ -1029,7 +1024,7 @@ class IngestFilesetFileResultImporter(IngestFileResultImporter):
self.counts["skip-no-access-url"] += 1
return None
- entity_extra = dict()
+ entity_extra: Dict[str, Any] = dict()
entity_extra["path"] = file_meta["path"]
# this is to work around a bug in old sandcrawler ingest code
@@ -1051,8 +1046,6 @@ class IngestFilesetFileResultImporter(IngestFileResultImporter):
self.counts["skip-partial-file-info"] += 1
return None
- if entity_extra:
- fe.extra = entity_extra
edit_extra = self.parse_edit_extra(row)
if edit_extra:
fe.edit_extra = edit_extra